aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/fid.c4
-rw-r--r--fs/9p/vfs_addr.c4
-rw-r--r--fs/9p/vfs_dentry.c8
-rw-r--r--fs/9p/vfs_dir.c4
-rw-r--r--fs/9p/vfs_file.c8
-rw-r--r--fs/9p/vfs_inode.c32
-rw-r--r--fs/9p/vfs_inode_dotl.c8
-rw-r--r--fs/Kconfig7
-rw-r--r--fs/Makefile3
-rw-r--r--fs/adfs/adfs.h1
-rw-r--r--fs/adfs/dir.c2
-rw-r--r--fs/adfs/dir_fplus.c9
-rw-r--r--fs/affs/amigaffs.c2
-rw-r--r--fs/affs/file.c39
-rw-r--r--fs/affs/inode.c3
-rw-r--r--fs/affs/super.c6
-rw-r--r--fs/afs/dir.c5
-rw-r--r--fs/afs/vlocation.c1
-rw-r--r--fs/aio.c178
-rw-r--r--fs/autofs4/autofs_i.h69
-rw-r--r--fs/autofs4/dev-ioctl.c2
-rw-r--r--fs/autofs4/expire.c217
-rw-r--r--fs/autofs4/root.c72
-rw-r--r--fs/bad_inode.c7
-rw-r--r--fs/befs/btree.c53
-rw-r--r--fs/befs/linuxvfs.c6
-rw-r--r--fs/bfs/bfs.h1
-rw-r--r--fs/bfs/dir.c4
-rw-r--r--fs/bfs/inode.c8
-rw-r--r--fs/binfmt_aout.c25
-rw-r--r--fs/binfmt_elf.c25
-rw-r--r--fs/binfmt_elf_fdpic.c24
-rw-r--r--fs/binfmt_misc.c23
-rw-r--r--fs/block_dev.c44
-rw-r--r--fs/btrfs/async-thread.c55
-rw-r--r--fs/btrfs/async-thread.h29
-rw-r--r--fs/btrfs/backref.c137
-rw-r--r--fs/btrfs/backref.h3
-rw-r--r--fs/btrfs/btrfs_inode.h46
-rw-r--r--fs/btrfs/check-integrity.c18
-rw-r--r--fs/btrfs/compression.c21
-rw-r--r--fs/btrfs/ctree.c126
-rw-r--r--fs/btrfs/ctree.h99
-rw-r--r--fs/btrfs/delayed-inode.c12
-rw-r--r--fs/btrfs/dev-replace.c82
-rw-r--r--fs/btrfs/dir-item.c12
-rw-r--r--fs/btrfs/disk-io.c379
-rw-r--r--fs/btrfs/disk-io.h16
-rw-r--r--fs/btrfs/export.c4
-rw-r--r--fs/btrfs/extent-tree.c583
-rw-r--r--fs/btrfs/extent_io.c488
-rw-r--r--fs/btrfs/extent_io.h60
-rw-r--r--fs/btrfs/file-item.c34
-rw-r--r--fs/btrfs/file.c182
-rw-r--r--fs/btrfs/free-space-cache.c157
-rw-r--r--fs/btrfs/hash.c20
-rw-r--r--fs/btrfs/inode-item.c12
-rw-r--r--fs/btrfs/inode-map.c68
-rw-r--r--fs/btrfs/inode.c977
-rw-r--r--fs/btrfs/ioctl.c120
-rw-r--r--fs/btrfs/lzo.c3
-rw-r--r--fs/btrfs/ordered-data.c124
-rw-r--r--fs/btrfs/ordered-data.h5
-rw-r--r--fs/btrfs/orphan.c4
-rw-r--r--fs/btrfs/print-tree.c3
-rw-r--r--fs/btrfs/qgroup.c200
-rw-r--r--fs/btrfs/qgroup.h1
-rw-r--r--fs/btrfs/raid56.c17
-rw-r--r--fs/btrfs/reada.c5
-rw-r--r--fs/btrfs/relocation.c142
-rw-r--r--fs/btrfs/scrub.c92
-rw-r--r--fs/btrfs/send.c47
-rw-r--r--fs/btrfs/super.c198
-rw-r--r--fs/btrfs/sysfs.c43
-rw-r--r--fs/btrfs/sysfs.h16
-rw-r--r--fs/btrfs/tests/free-space-tests.c516
-rw-r--r--fs/btrfs/transaction.c85
-rw-r--r--fs/btrfs/transaction.h3
-rw-r--r--fs/btrfs/tree-log.c333
-rw-r--r--fs/btrfs/tree-log.h4
-rw-r--r--fs/btrfs/ulist.h15
-rw-r--r--fs/btrfs/uuid-tree.c1
-rw-r--r--fs/btrfs/volumes.c722
-rw-r--r--fs/btrfs/volumes.h166
-rw-r--r--fs/btrfs/xattr.c4
-rw-r--r--fs/btrfs/zlib.c141
-rw-r--r--fs/buffer.c129
-rw-r--r--fs/cachefiles/bind.c8
-rw-r--r--fs/cachefiles/daemon.c30
-rw-r--r--fs/cachefiles/interface.c33
-rw-r--r--fs/cachefiles/internal.h2
-rw-r--r--fs/cachefiles/main.c2
-rw-r--r--fs/cachefiles/namei.c19
-rw-r--r--fs/cachefiles/rdwr.c54
-rw-r--r--fs/cachefiles/xattr.c10
-rw-r--r--fs/ceph/acl.c113
-rw-r--r--fs/ceph/addr.c9
-rw-r--r--fs/ceph/caps.c41
-rw-r--r--fs/ceph/debugfs.c46
-rw-r--r--fs/ceph/dir.c42
-rw-r--r--fs/ceph/file.c57
-rw-r--r--fs/ceph/inode.c16
-rw-r--r--fs/ceph/ioctl.c6
-rw-r--r--fs/ceph/mds_client.c152
-rw-r--r--fs/ceph/mds_client.h6
-rw-r--r--fs/ceph/super.c2
-rw-r--r--fs/ceph/super.h27
-rw-r--r--fs/ceph/xattr.c85
-rw-r--r--fs/cifs/Kconfig35
-rw-r--r--fs/cifs/cifs_debug.c2
-rw-r--r--fs/cifs/cifs_dfs_ref.c6
-rw-r--r--fs/cifs/cifs_fs_sb.h1
-rw-r--r--fs/cifs/cifs_spnego.c1
-rw-r--r--fs/cifs/cifs_unicode.c203
-rw-r--r--fs/cifs/cifs_unicode.h31
-rw-r--r--fs/cifs/cifsacl.c1
-rw-r--r--fs/cifs/cifsencrypt.c2
-rw-r--r--fs/cifs/cifsfs.c33
-rw-r--r--fs/cifs/cifsfs.h6
-rw-r--r--fs/cifs/cifsglob.h35
-rw-r--r--fs/cifs/cifspdu.h23
-rw-r--r--fs/cifs/cifsproto.h4
-rw-r--r--fs/cifs/cifssmb.c135
-rw-r--r--fs/cifs/connect.c60
-rw-r--r--fs/cifs/dir.c44
-rw-r--r--fs/cifs/file.c894
-rw-r--r--fs/cifs/inode.c94
-rw-r--r--fs/cifs/link.c157
-rw-r--r--fs/cifs/misc.c20
-rw-r--r--fs/cifs/netmisc.c20
-rw-r--r--fs/cifs/readdir.c20
-rw-r--r--fs/cifs/sess.c1182
-rw-r--r--fs/cifs/smb1ops.c50
-rw-r--r--fs/cifs/smb2file.c2
-rw-r--r--fs/cifs/smb2inode.c4
-rw-r--r--fs/cifs/smb2maperror.c8
-rw-r--r--fs/cifs/smb2misc.c35
-rw-r--r--fs/cifs/smb2ops.c272
-rw-r--r--fs/cifs/smb2pdu.c119
-rw-r--r--fs/cifs/smb2pdu.h8
-rw-r--r--fs/cifs/smb2proto.h10
-rw-r--r--fs/cifs/smb2transport.c5
-rw-r--r--fs/cifs/smbencrypt.c1
-rw-r--r--fs/cifs/smbfsctl.h2
-rw-r--r--fs/cifs/transport.c25
-rw-r--r--fs/cifs/xattr.c32
-rw-r--r--fs/coda/cache.c2
-rw-r--r--fs/coda/coda_linux.c2
-rw-r--r--fs/coda/dir.c3
-rw-r--r--fs/coda/file.c2
-rw-r--r--fs/coda/inode.c4
-rw-r--r--fs/coda/pioctl.c2
-rw-r--r--fs/coda/psdev.c2
-rw-r--r--fs/coda/upcall.c2
-rw-r--r--fs/compat.c28
-rw-r--r--fs/coredump.c8
-rw-r--r--fs/cramfs/inode.c45
-rw-r--r--fs/cramfs/uncompress.c10
-rw-r--r--fs/dcache.c530
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/dlm/debug_fs.c15
-rw-r--r--fs/dlm/plock.c8
-rw-r--r--fs/dlm/rcom.c2
-rw-r--r--fs/ecryptfs/file.c6
-rw-r--r--fs/ecryptfs/inode.c25
-rw-r--r--fs/ecryptfs/keystore.c2
-rw-r--r--fs/ecryptfs/main.c7
-rw-r--r--fs/ecryptfs/messaging.c3
-rw-r--r--fs/efs/namei.c11
-rw-r--r--fs/eventpoll.c3
-rw-r--r--fs/exec.c21
-rw-r--r--fs/exofs/Kbuild2
-rw-r--r--fs/exofs/common.h2
-rw-r--r--fs/exofs/dir.c2
-rw-r--r--fs/exofs/exofs.h2
-rw-r--r--fs/exofs/file.c2
-rw-r--r--fs/exofs/inode.c2
-rw-r--r--fs/exofs/namei.c2
-rw-r--r--fs/exofs/ore.c4
-rw-r--r--fs/exofs/ore_raid.c4
-rw-r--r--fs/exofs/ore_raid.h2
-rw-r--r--fs/exofs/super.c2
-rw-r--r--fs/exofs/symlink.c2
-rw-r--r--fs/exofs/sys.c2
-rw-r--r--fs/ext2/super.c8
-rw-r--r--fs/ext3/ext3.h12
-rw-r--r--fs/ext3/super.c32
-rw-r--r--fs/ext4/balloc.c15
-rw-r--r--fs/ext4/bitmap.c12
-rw-r--r--fs/ext4/dir.c8
-rw-r--r--fs/ext4/ext4.h68
-rw-r--r--fs/ext4/ext4_extents.h1
-rw-r--r--fs/ext4/ext4_jbd2.c4
-rw-r--r--fs/ext4/ext4_jbd2.h6
-rw-r--r--fs/ext4/extents.c710
-rw-r--r--fs/ext4/extents_status.c200
-rw-r--r--fs/ext4/extents_status.h13
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/ialloc.c7
-rw-r--r--fs/ext4/indirect.c86
-rw-r--r--fs/ext4/inline.c7
-rw-r--r--fs/ext4/inode.c184
-rw-r--r--fs/ext4/ioctl.c13
-rw-r--r--fs/ext4/mballoc.c22
-rw-r--r--fs/ext4/migrate.c11
-rw-r--r--fs/ext4/mmp.c6
-rw-r--r--fs/ext4/move_extent.c1068
-rw-r--r--fs/ext4/namei.c402
-rw-r--r--fs/ext4/resize.c7
-rw-r--r--fs/ext4/super.c275
-rw-r--r--fs/ext4/xattr.c44
-rw-r--r--fs/f2fs/Kconfig4
-rw-r--r--fs/f2fs/checkpoint.c175
-rw-r--r--fs/f2fs/data.c86
-rw-r--r--fs/f2fs/debug.c24
-rw-r--r--fs/f2fs/dir.c25
-rw-r--r--fs/f2fs/f2fs.h187
-rw-r--r--fs/f2fs/file.c315
-rw-r--r--fs/f2fs/gc.c34
-rw-r--r--fs/f2fs/gc.h2
-rw-r--r--fs/f2fs/hash.c7
-rw-r--r--fs/f2fs/inline.c58
-rw-r--r--fs/f2fs/inode.c37
-rw-r--r--fs/f2fs/namei.c66
-rw-r--r--fs/f2fs/node.c536
-rw-r--r--fs/f2fs/node.h60
-rw-r--r--fs/f2fs/recovery.c219
-rw-r--r--fs/f2fs/segment.c573
-rw-r--r--fs/f2fs/segment.h162
-rw-r--r--fs/f2fs/super.c77
-rw-r--r--fs/f2fs/xattr.c10
-rw-r--r--fs/fat/misc.c2
-rw-r--r--fs/fcntl.c26
-rw-r--r--fs/file.c3
-rw-r--r--fs/file_table.c14
-rw-r--r--fs/fs_pin.c78
-rw-r--r--fs/fscache/main.c4
-rw-r--r--fs/fscache/object-list.c16
-rw-r--r--fs/fscache/object.c1
-rw-r--r--fs/fscache/page.c25
-rw-r--r--fs/fuse/dir.c14
-rw-r--r--fs/fuse/file.c5
-rw-r--r--fs/gfs2/bmap.c9
-rw-r--r--fs/gfs2/dentry.c3
-rw-r--r--fs/gfs2/dir.c9
-rw-r--r--fs/gfs2/dir.h1
-rw-r--r--fs/gfs2/file.c37
-rw-r--r--fs/gfs2/glock.c4
-rw-r--r--fs/gfs2/glops.c2
-rw-r--r--fs/gfs2/incore.h7
-rw-r--r--fs/gfs2/inode.c21
-rw-r--r--fs/gfs2/rgrp.c30
-rw-r--r--fs/gfs2/rgrp.h1
-rw-r--r--fs/gfs2/super.c20
-rw-r--r--fs/gfs2/trans.c2
-rw-r--r--fs/hfs/hfs_fs.h2
-rw-r--r--fs/hostfs/hostfs.h1
-rw-r--r--fs/hostfs/hostfs_kern.c30
-rw-r--r--fs/hostfs/hostfs_user.c28
-rw-r--r--fs/hpfs/dnode.c17
-rw-r--r--fs/inode.c1
-rw-r--r--fs/internal.h19
-rw-r--r--fs/isofs/compress.c4
-rw-r--r--fs/isofs/inode.c41
-rw-r--r--fs/isofs/isofs.h23
-rw-r--r--fs/isofs/namei.c22
-rw-r--r--fs/isofs/rock.c39
-rw-r--r--fs/jbd/journal.c2
-rw-r--r--fs/jbd/revoke.c7
-rw-r--r--fs/jbd2/checkpoint.c334
-rw-r--r--fs/jbd2/commit.c21
-rw-r--r--fs/jbd2/journal.c60
-rw-r--r--fs/jbd2/recovery.c34
-rw-r--r--fs/jbd2/revoke.c16
-rw-r--r--fs/jffs2/acl.c3
-rw-r--r--fs/jffs2/compr_zlib.c7
-rw-r--r--fs/jffs2/jffs2_fs_sb.h2
-rw-r--r--fs/jffs2/wbuf.c17
-rw-r--r--fs/jffs2/xattr.c3
-rw-r--r--fs/jfs/jfs_logmgr.c2
-rw-r--r--fs/jfs/jfs_txnmgr.c3
-rw-r--r--fs/jfs/super.c2
-rw-r--r--fs/kernfs/dir.c11
-rw-r--r--fs/libfs.c18
-rw-r--r--fs/lockd/Makefile3
-rw-r--r--fs/lockd/mon.c6
-rw-r--r--fs/lockd/netns.h1
-rw-r--r--fs/lockd/procfs.c92
-rw-r--r--fs/lockd/procfs.h28
-rw-r--r--fs/lockd/svc.c20
-rw-r--r--fs/lockd/svclock.c68
-rw-r--r--fs/locks.c518
-rw-r--r--fs/logfs/readwrite.c15
-rw-r--r--fs/minix/bitmap.c2
-rw-r--r--fs/minix/inode.c4
-rw-r--r--fs/mount.h27
-rw-r--r--fs/mpage.c2
-rw-r--r--fs/namei.c203
-rw-r--r--fs/namespace.c373
-rw-r--r--fs/ncpfs/dir.c9
-rw-r--r--fs/ncpfs/ncplib_kernel.h14
-rw-r--r--fs/nfs/Makefile1
-rw-r--r--fs/nfs/blocklayout/Makefile3
-rw-r--r--fs/nfs/blocklayout/blocklayout.c1435
-rw-r--r--fs/nfs/blocklayout/blocklayout.h213
-rw-r--r--fs/nfs/blocklayout/blocklayoutdev.c384
-rw-r--r--fs/nfs/blocklayout/blocklayoutdm.c108
-rw-r--r--fs/nfs/blocklayout/dev.c363
-rw-r--r--fs/nfs/blocklayout/extent_tree.c602
-rw-r--r--fs/nfs/blocklayout/extents.c908
-rw-r--r--fs/nfs/blocklayout/rpc_pipefs.c288
-rw-r--r--fs/nfs/callback.c16
-rw-r--r--fs/nfs/callback_proc.c23
-rw-r--r--fs/nfs/client.c111
-rw-r--r--fs/nfs/delegation.c59
-rw-r--r--fs/nfs/delegation.h2
-rw-r--r--fs/nfs/dir.c216
-rw-r--r--fs/nfs/direct.c60
-rw-r--r--fs/nfs/file.c69
-rw-r--r--fs/nfs/filelayout/filelayout.c334
-rw-r--r--fs/nfs/filelayout/filelayout.h7
-rw-r--r--fs/nfs/filelayout/filelayoutdev.c108
-rw-r--r--fs/nfs/fscache-index.c3
-rw-r--r--fs/nfs/getroot.c2
-rw-r--r--fs/nfs/idmap.c2
-rw-r--r--fs/nfs/inode.c20
-rw-r--r--fs/nfs/internal.h28
-rw-r--r--fs/nfs/netns.h4
-rw-r--r--fs/nfs/nfs3_fs.h34
-rw-r--r--fs/nfs/nfs3acl.c8
-rw-r--r--fs/nfs/nfs3client.c1
-rw-r--r--fs/nfs/nfs3proc.c22
-rw-r--r--fs/nfs/nfs3super.c1
-rw-r--r--fs/nfs/nfs42.h14
-rw-r--r--fs/nfs/nfs42proc.c69
-rw-r--r--fs/nfs/nfs42xdr.c98
-rw-r--r--fs/nfs/nfs4_fs.h24
-rw-r--r--fs/nfs/nfs4client.c43
-rw-r--r--fs/nfs/nfs4file.c27
-rw-r--r--fs/nfs/nfs4proc.c493
-rw-r--r--fs/nfs/nfs4renewd.c12
-rw-r--r--fs/nfs/nfs4state.c63
-rw-r--r--fs/nfs/nfs4trace.h28
-rw-r--r--fs/nfs/nfs4xdr.c188
-rw-r--r--fs/nfs/objlayout/objio_osd.c139
-rw-r--r--fs/nfs/objlayout/objlayout.c153
-rw-r--r--fs/nfs/objlayout/objlayout.h15
-rw-r--r--fs/nfs/objlayout/pnfs_osd_xdr_cli.c2
-rw-r--r--fs/nfs/pagelist.c324
-rw-r--r--fs/nfs/pnfs.c304
-rw-r--r--fs/nfs/pnfs.h92
-rw-r--r--fs/nfs/pnfs_dev.c150
-rw-r--r--fs/nfs/proc.c27
-rw-r--r--fs/nfs/read.c54
-rw-r--r--fs/nfs/super.c23
-rw-r--r--fs/nfs/write.c245
-rw-r--r--fs/nfs_common/Makefile3
-rw-r--r--fs/nfs_common/grace.c (renamed from fs/lockd/grace.c)68
-rw-r--r--fs/nfs_common/nfsacl.c5
-rw-r--r--fs/nfsd/Kconfig4
-rw-r--r--fs/nfsd/acl.h2
-rw-r--r--fs/nfsd/auth.c2
-rw-r--r--fs/nfsd/cache.h1
-rw-r--r--fs/nfsd/export.c7
-rw-r--r--fs/nfsd/export.h3
-rw-r--r--fs/nfsd/fault_inject.c138
-rw-r--r--fs/nfsd/netns.h23
-rw-r--r--fs/nfsd/nfs2acl.c8
-rw-r--r--fs/nfsd/nfs3acl.c8
-rw-r--r--fs/nfsd/nfs3proc.c22
-rw-r--r--fs/nfsd/nfs3xdr.c30
-rw-r--r--fs/nfsd/nfs4acl.c39
-rw-r--r--fs/nfsd/nfs4callback.c142
-rw-r--r--fs/nfsd/nfs4idmap.c20
-rw-r--r--fs/nfsd/nfs4proc.c109
-rw-r--r--fs/nfsd/nfs4recover.c206
-rw-r--r--fs/nfsd/nfs4state.c3239
-rw-r--r--fs/nfsd/nfs4xdr.c220
-rw-r--r--fs/nfsd/nfscache.c205
-rw-r--r--fs/nfsd/nfsctl.c96
-rw-r--r--fs/nfsd/nfsd.h2
-rw-r--r--fs/nfsd/nfsfh.c18
-rw-r--r--fs/nfsd/nfsfh.h15
-rw-r--r--fs/nfsd/nfsproc.c13
-rw-r--r--fs/nfsd/nfssvc.c21
-rw-r--r--fs/nfsd/nfsxdr.c14
-rw-r--r--fs/nfsd/state.h248
-rw-r--r--fs/nfsd/vfs.c85
-rw-r--r--fs/nfsd/vfs.h8
-rw-r--r--fs/nfsd/xdr4.h44
-rw-r--r--fs/nilfs2/Makefile2
-rw-r--r--fs/nilfs2/file.c8
-rw-r--r--fs/nilfs2/inode.c20
-rw-r--r--fs/nilfs2/ioctl.c8
-rw-r--r--fs/nilfs2/nilfs.h22
-rw-r--r--fs/nilfs2/segment.c7
-rw-r--r--fs/nilfs2/super.c17
-rw-r--r--fs/nilfs2/sysfs.c1137
-rw-r--r--fs/nilfs2/sysfs.h176
-rw-r--r--fs/nilfs2/the_nilfs.c17
-rw-r--r--fs/nilfs2/the_nilfs.h42
-rw-r--r--fs/notify/dnotify/dnotify.c8
-rw-r--r--fs/notify/fanotify/fanotify.c11
-rw-r--r--fs/notify/fanotify/fanotify_user.c16
-rw-r--r--fs/notify/fdinfo.c4
-rw-r--r--fs/notify/fsnotify.c36
-rw-r--r--fs/notify/fsnotify.h7
-rw-r--r--fs/notify/group.c2
-rw-r--r--fs/notify/inode_mark.c27
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c8
-rw-r--r--fs/notify/inotify/inotify_user.c4
-rw-r--r--fs/notify/mark.c36
-rw-r--r--fs/notify/notification.c37
-rw-r--r--fs/notify/vfsmount_mark.c10
-rw-r--r--fs/ntfs/Makefile2
-rw-r--r--fs/ntfs/aops.c163
-rw-r--r--fs/ntfs/debug.c2
-rw-r--r--fs/ntfs/file.c8
-rw-r--r--fs/ntfs/inode.c19
-rw-r--r--fs/ntfs/ntfs.h8
-rw-r--r--fs/ntfs/super.c2
-rw-r--r--fs/ocfs2/alloc.c15
-rw-r--r--fs/ocfs2/aops.c15
-rw-r--r--fs/ocfs2/cluster/heartbeat.c21
-rw-r--r--fs/ocfs2/cluster/heartbeat.h1
-rw-r--r--fs/ocfs2/cluster/masklog.c6
-rw-r--r--fs/ocfs2/cluster/netdebug.c78
-rw-r--r--fs/ocfs2/cluster/quorum.c13
-rw-r--r--fs/ocfs2/cluster/tcp.c90
-rw-r--r--fs/ocfs2/cluster/tcp.h1
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c39
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c49
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c29
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c7
-rw-r--r--fs/ocfs2/dlmglue.c23
-rw-r--r--fs/ocfs2/file.c49
-rw-r--r--fs/ocfs2/inode.h2
-rw-r--r--fs/ocfs2/ioctl.c129
-rw-r--r--fs/ocfs2/move_extents.c4
-rw-r--r--fs/ocfs2/namei.c2
-rw-r--r--fs/ocfs2/quota.h5
-rw-r--r--fs/ocfs2/quota_global.c4
-rw-r--r--fs/ocfs2/quota_local.c33
-rw-r--r--fs/ocfs2/refcounttree.c2
-rw-r--r--fs/ocfs2/slot_map.c2
-rw-r--r--fs/ocfs2/stack_user.c2
-rw-r--r--fs/ocfs2/super.c31
-rw-r--r--fs/omfs/inode.c12
-rw-r--r--fs/omfs/omfs_fs.h1
-rw-r--r--fs/open.c23
-rw-r--r--fs/overlayfs/Kconfig10
-rw-r--r--fs/overlayfs/Makefile7
-rw-r--r--fs/overlayfs/copy_up.c414
-rw-r--r--fs/overlayfs/dir.c921
-rw-r--r--fs/overlayfs/inode.c425
-rw-r--r--fs/overlayfs/overlayfs.h191
-rw-r--r--fs/overlayfs/readdir.c593
-rw-r--r--fs/overlayfs/super.c796
-rw-r--r--fs/pnode.c1
-rw-r--r--fs/proc/Makefile1
-rw-r--r--fs/proc/base.c284
-rw-r--r--fs/proc/fd.c4
-rw-r--r--fs/proc/generic.c32
-rw-r--r--fs/proc/inode.c7
-rw-r--r--fs/proc/internal.h20
-rw-r--r--fs/proc/kcore.c6
-rw-r--r--fs/proc/meminfo.c2
-rw-r--r--fs/proc/page.c3
-rw-r--r--fs/proc/proc_net.c4
-rw-r--r--fs/proc/proc_sysctl.c2
-rw-r--r--fs/proc/proc_tty.c4
-rw-r--r--fs/proc/root.c7
-rw-r--r--fs/proc/task_mmu.c385
-rw-r--r--fs/proc/task_nommu.c88
-rw-r--r--fs/proc/thread_self.c85
-rw-r--r--fs/proc/vmcore.c82
-rw-r--r--fs/proc_namespace.c8
-rw-r--r--fs/pstore/inode.c4
-rw-r--r--fs/pstore/ram_core.c2
-rw-r--r--fs/qnx6/Makefile1
-rw-r--r--fs/qnx6/dir.c26
-rw-r--r--fs/qnx6/inode.c99
-rw-r--r--fs/qnx6/namei.c6
-rw-r--r--fs/qnx6/qnx6.h12
-rw-r--r--fs/qnx6/super_mmi.c22
-rw-r--r--fs/quota/dquot.c184
-rw-r--r--fs/quota/kqid.c2
-rw-r--r--fs/quota/netlink.c3
-rw-r--r--fs/quota/quota.c6
-rw-r--r--fs/ramfs/file-nommu.c2
-rw-r--r--fs/read_write.c2
-rw-r--r--fs/reiserfs/dir.c2
-rw-r--r--fs/reiserfs/do_balan.c113
-rw-r--r--fs/reiserfs/file.c2
-rw-r--r--fs/reiserfs/ibalance.c2
-rw-r--r--fs/reiserfs/inode.c2
-rw-r--r--fs/reiserfs/ioctl.c2
-rw-r--r--fs/reiserfs/item_ops.c4
-rw-r--r--fs/reiserfs/journal.c30
-rw-r--r--fs/reiserfs/lbalance.c7
-rw-r--r--fs/reiserfs/prints.c4
-rw-r--r--fs/reiserfs/procfs.c2
-rw-r--r--fs/reiserfs/reiserfs.h14
-rw-r--r--fs/reiserfs/stree.c2
-rw-r--r--fs/reiserfs/super.c31
-rw-r--r--fs/reiserfs/xattr.c22
-rw-r--r--fs/reiserfs/xattr.h1
-rw-r--r--fs/reiserfs/xattr_acl.c2
-rw-r--r--fs/reiserfs/xattr_security.c2
-rw-r--r--fs/reiserfs/xattr_trusted.c2
-rw-r--r--fs/reiserfs/xattr_user.c2
-rw-r--r--fs/romfs/super.c23
-rw-r--r--fs/splice.c1
-rw-r--r--fs/squashfs/file_direct.c2
-rw-r--r--fs/squashfs/super.c5
-rw-r--r--fs/super.c25
-rw-r--r--fs/sync.c2
-rw-r--r--fs/timerfd.c3
-rw-r--r--fs/ubifs/commit.c10
-rw-r--r--fs/ubifs/debug.c6
-rw-r--r--fs/ubifs/io.c2
-rw-r--r--fs/ubifs/journal.c7
-rw-r--r--fs/ubifs/log.c31
-rw-r--r--fs/ubifs/lpt.c5
-rw-r--r--fs/ubifs/lpt_commit.c7
-rw-r--r--fs/ubifs/master.c7
-rw-r--r--fs/ubifs/orphan.c1
-rw-r--r--fs/ubifs/recovery.c5
-rw-r--r--fs/ubifs/sb.c4
-rw-r--r--fs/ubifs/scan.c14
-rw-r--r--fs/ubifs/super.c19
-rw-r--r--fs/ubifs/tnc.c1
-rw-r--r--fs/ubifs/tnc_commit.c1
-rw-r--r--fs/ubifs/ubifs.h4
-rw-r--r--fs/udf/file.c31
-rw-r--r--fs/udf/ialloc.c28
-rw-r--r--fs/udf/inode.c165
-rw-r--r--fs/udf/lowlevel.c2
-rw-r--r--fs/udf/namei.c156
-rw-r--r--fs/udf/super.c73
-rw-r--r--fs/udf/symlink.c2
-rw-r--r--fs/udf/udfdecl.h16
-rw-r--r--fs/udf/udftime.c2
-rw-r--r--fs/udf/unicode.c9
-rw-r--r--fs/ufs/Makefile1
-rw-r--r--fs/ufs/balloc.c3
-rw-r--r--fs/ufs/ialloc.c6
-rw-r--r--fs/ufs/inode.c39
-rw-r--r--fs/ufs/namei.c18
-rw-r--r--fs/ufs/super.c304
-rw-r--r--fs/ufs/ufs.h10
-rw-r--r--fs/xattr.c116
-rw-r--r--fs/xfs/Kconfig1
-rw-r--r--fs/xfs/Makefile71
-rw-r--r--fs/xfs/kmem.c1
-rw-r--r--fs/xfs/libxfs/xfs_ag.h (renamed from fs/xfs/xfs_ag.h)0
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c (renamed from fs/xfs/xfs_alloc.c)24
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h (renamed from fs/xfs/xfs_alloc.h)0
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c (renamed from fs/xfs/xfs_alloc_btree.c)6
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.h (renamed from fs/xfs/xfs_alloc_btree.h)0
-rw-r--r--fs/xfs/libxfs/xfs_attr.c (renamed from fs/xfs/xfs_attr.c)92
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c (renamed from fs/xfs/xfs_attr_leaf.c)78
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.h (renamed from fs/xfs/xfs_attr_leaf.h)0
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c (renamed from fs/xfs/xfs_attr_remote.c)22
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.h (renamed from fs/xfs/xfs_attr_remote.h)0
-rw-r--r--fs/xfs/libxfs/xfs_attr_sf.h (renamed from fs/xfs/xfs_attr_sf.h)0
-rw-r--r--fs/xfs/libxfs/xfs_bit.h (renamed from fs/xfs/xfs_bit.h)0
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c (renamed from fs/xfs/xfs_bmap.c)427
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h (renamed from fs/xfs/xfs_bmap.h)7
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c (renamed from fs/xfs/xfs_bmap_btree.c)99
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.h (renamed from fs/xfs/xfs_bmap_btree.h)0
-rw-r--r--fs/xfs/libxfs/xfs_btree.c (renamed from fs/xfs/xfs_btree.c)46
-rw-r--r--fs/xfs/libxfs/xfs_btree.h (renamed from fs/xfs/xfs_btree.h)2
-rw-r--r--fs/xfs/libxfs/xfs_cksum.h (renamed from fs/xfs/xfs_cksum.h)0
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c (renamed from fs/xfs/xfs_da_btree.c)115
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.h (renamed from fs/xfs/xfs_da_btree.h)0
-rw-r--r--fs/xfs/libxfs/xfs_da_format.c (renamed from fs/xfs/xfs_da_format.c)1
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h (renamed from fs/xfs/xfs_da_format.h)0
-rw-r--r--fs/xfs/libxfs/xfs_dinode.h (renamed from fs/xfs/xfs_dinode.h)0
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c (renamed from fs/xfs/xfs_dir2.c)89
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h (renamed from fs/xfs/xfs_dir2.h)2
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c (renamed from fs/xfs/xfs_dir2_block.c)18
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c (renamed from fs/xfs/xfs_dir2_data.c)10
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c (renamed from fs/xfs/xfs_dir2_leaf.c)24
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c (renamed from fs/xfs/xfs_dir2_node.c)40
-rw-r--r--fs/xfs/libxfs/xfs_dir2_priv.h (renamed from fs/xfs/xfs_dir2_priv.h)0
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c (renamed from fs/xfs/xfs_dir2_sf.c)75
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c (renamed from fs/xfs/xfs_dquot_buf.c)6
-rw-r--r--fs/xfs/libxfs/xfs_format.h (renamed from fs/xfs/xfs_format.h)14
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c (renamed from fs/xfs/xfs_ialloc.c)41
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h (renamed from fs/xfs/xfs_ialloc.h)0
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c (renamed from fs/xfs/xfs_ialloc_btree.c)6
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.h (renamed from fs/xfs/xfs_ialloc_btree.h)0
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c (renamed from fs/xfs/xfs_inode_buf.c)10
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.h (renamed from fs/xfs/xfs_inode_buf.h)0
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c (renamed from fs/xfs/xfs_inode_fork.c)36
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h (renamed from fs/xfs/xfs_inode_fork.h)0
-rw-r--r--fs/xfs/libxfs/xfs_inum.h (renamed from fs/xfs/xfs_inum.h)4
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h (renamed from fs/xfs/xfs_log_format.h)4
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h (renamed from fs/xfs/xfs_log_recover.h)0
-rw-r--r--fs/xfs/libxfs/xfs_log_rlimit.c (renamed from fs/xfs/xfs_log_rlimit.c)0
-rw-r--r--fs/xfs/libxfs/xfs_quota_defs.h (renamed from fs/xfs/xfs_quota_defs.h)2
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c (renamed from fs/xfs/xfs_rtbitmap.c)49
-rw-r--r--fs/xfs/libxfs/xfs_sb.c (renamed from fs/xfs/xfs_sb.c)63
-rw-r--r--fs/xfs/libxfs/xfs_sb.h (renamed from fs/xfs/xfs_sb.h)8
-rw-r--r--fs/xfs/libxfs/xfs_shared.h (renamed from fs/xfs/xfs_shared.h)0
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c (renamed from fs/xfs/xfs_symlink_remote.c)6
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c (renamed from fs/xfs/xfs_trans_resv.c)0
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.h (renamed from fs/xfs/xfs_trans_resv.h)0
-rw-r--r--fs/xfs/libxfs/xfs_trans_space.h (renamed from fs/xfs/xfs_trans_space.h)0
-rw-r--r--fs/xfs/time.h36
-rw-r--r--fs/xfs/xfs_acl.c8
-rw-r--r--fs/xfs/xfs_aops.c102
-rw-r--r--fs/xfs/xfs_attr_inactive.c22
-rw-r--r--fs/xfs/xfs_attr_list.c38
-rw-r--r--fs/xfs/xfs_bmap_util.c342
-rw-r--r--fs/xfs/xfs_buf.c387
-rw-r--r--fs/xfs/xfs_buf.h17
-rw-r--r--fs/xfs/xfs_buf_item.c14
-rw-r--r--fs/xfs/xfs_dir2_readdir.c4
-rw-r--r--fs/xfs/xfs_discard.c18
-rw-r--r--fs/xfs/xfs_dquot.c41
-rw-r--r--fs/xfs/xfs_dquot.h15
-rw-r--r--fs/xfs/xfs_error.c25
-rw-r--r--fs/xfs/xfs_error.h13
-rw-r--r--fs/xfs/xfs_export.c10
-rw-r--r--fs/xfs/xfs_extfree_item.c2
-rw-r--r--fs/xfs/xfs_file.c272
-rw-r--r--fs/xfs/xfs_filestream.c4
-rw-r--r--fs/xfs/xfs_fs.h7
-rw-r--r--fs/xfs/xfs_fsops.c51
-rw-r--r--fs/xfs/xfs_globals.c4
-rw-r--r--fs/xfs/xfs_icache.c147
-rw-r--r--fs/xfs/xfs_icache.h13
-rw-r--r--fs/xfs/xfs_inode.c102
-rw-r--r--fs/xfs/xfs_inode.h12
-rw-r--r--fs/xfs/xfs_inode_item.c4
-rw-r--r--fs/xfs/xfs_ioctl.c294
-rw-r--r--fs/xfs/xfs_ioctl32.c113
-rw-r--r--fs/xfs/xfs_ioctl32.h3
-rw-r--r--fs/xfs/xfs_iomap.c58
-rw-r--r--fs/xfs/xfs_iops.c102
-rw-r--r--fs/xfs/xfs_itable.c704
-rw-r--r--fs/xfs/xfs_itable.h7
-rw-r--r--fs/xfs/xfs_linux.h33
-rw-r--r--fs/xfs/xfs_log.c128
-rw-r--r--fs/xfs/xfs_log_cil.c55
-rw-r--r--fs/xfs/xfs_log_priv.h2
-rw-r--r--fs/xfs/xfs_log_recover.c959
-rw-r--r--fs/xfs/xfs_mount.c141
-rw-r--r--fs/xfs/xfs_mount.h1
-rw-r--r--fs/xfs/xfs_mru_cache.c17
-rw-r--r--fs/xfs/xfs_qm.c230
-rw-r--r--fs/xfs/xfs_qm.h1
-rw-r--r--fs/xfs/xfs_qm_bhv.c2
-rw-r--r--fs/xfs/xfs_qm_syscalls.c46
-rw-r--r--fs/xfs/xfs_quotaops.c20
-rw-r--r--fs/xfs/xfs_rtalloc.c105
-rw-r--r--fs/xfs/xfs_rtalloc.h6
-rw-r--r--fs/xfs/xfs_super.c167
-rw-r--r--fs/xfs/xfs_super.h15
-rw-r--r--fs/xfs/xfs_symlink.c38
-rw-r--r--fs/xfs/xfs_sysctl.h5
-rw-r--r--fs/xfs/xfs_sysfs.c239
-rw-r--r--fs/xfs/xfs_sysfs.h60
-rw-r--r--fs/xfs/xfs_trace.h3
-rw-r--r--fs/xfs/xfs_trans.c10
-rw-r--r--fs/xfs/xfs_trans_ail.c4
-rw-r--r--fs/xfs/xfs_trans_buf.c53
-rw-r--r--fs/xfs/xfs_trans_dquot.c4
-rw-r--r--fs/xfs/xfs_trans_inode.c2
-rw-r--r--fs/xfs/xfs_types.h29
-rw-r--r--fs/xfs/xfs_vnode.h46
-rw-r--r--fs/xfs/xfs_xattr.c6
674 files changed, 30854 insertions, 19022 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index d51ec9fafcc8..47db55aee7f2 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -65,8 +65,8 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any)
{
struct p9_fid *fid, *ret;
- p9_debug(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n",
- dentry->d_name.name, dentry, from_kuid(&init_user_ns, uid),
+ p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p) uid %d any %d\n",
+ dentry, dentry, from_kuid(&init_user_ns, uid),
any);
ret = NULL;
/* we'll recheck under lock if there's anything to look in */
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index cc1cfae726b3..eb14e055ea83 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -266,8 +266,8 @@ v9fs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
* Now that we do caching with cache mode enabled, We need
* to support direct IO
*/
- p9_debug(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) off/no(%lld/%lu) EINVAL\n",
- iocb->ki_filp->f_path.dentry->d_name.name,
+ p9_debug(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%pD) off/no(%lld/%lu) EINVAL\n",
+ iocb->ki_filp,
(long long)pos, iter->nr_segs);
return -EINVAL;
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index b03dd23feda8..a345b2d659cc 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -49,8 +49,8 @@
*/
static int v9fs_cached_dentry_delete(const struct dentry *dentry)
{
- p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",
- dentry->d_name.name, dentry);
+ p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n",
+ dentry, dentry);
/* Don't cache negative dentries */
if (!dentry->d_inode)
@@ -67,8 +67,8 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry)
static void v9fs_dentry_release(struct dentry *dentry)
{
struct hlist_node *p, *n;
- p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",
- dentry->d_name.name, dentry);
+ p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n",
+ dentry, dentry);
hlist_for_each_safe(p, n, (struct hlist_head *)&dentry->d_fsdata)
p9_client_clunk(hlist_entry(p, struct p9_fid, dlist));
dentry->d_fsdata = NULL;
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 0b3bfa303dda..4f1151088ebe 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -116,7 +116,7 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
int reclen = 0;
struct p9_rdir *rdir;
- p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "name %pD\n", file);
fid = file->private_data;
buflen = fid->clnt->msize - P9_IOHDRSZ;
@@ -172,7 +172,7 @@ static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx)
struct p9_rdir *rdir;
struct p9_dirent curdirent;
- p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "name %pD\n", file);
fid = file->private_data;
buflen = fid->clnt->msize - P9_READDIRHDRSZ;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 520c11c2dcca..5594505e6e73 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -301,8 +301,8 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
struct inode *inode = file_inode(filp);
int ret = -ENOLCK;
- p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n",
- filp, cmd, fl, filp->f_path.dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n",
+ filp, cmd, fl, filp);
/* No mandatory locks */
if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
@@ -337,8 +337,8 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd,
struct inode *inode = file_inode(filp);
int ret = -ENOLCK;
- p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n",
- filp, cmd, fl, filp->f_path.dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n",
+ filp, cmd, fl, filp);
/* No mandatory locks */
if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 7fa4f7a7653d..296482fc77a9 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -648,7 +648,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
struct p9_fid *dfid, *ofid, *fid;
struct inode *inode;
- p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry);
err = 0;
ofid = NULL;
@@ -755,7 +755,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
struct p9_fid *fid;
struct v9fs_session_info *v9ses;
- p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry);
err = 0;
v9ses = v9fs_inode2v9ses(dir);
perm = unixmode2p9mode(v9ses, mode | S_IFDIR);
@@ -791,8 +791,8 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
struct inode *inode;
char *name;
- p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p flags: %x\n",
- dir, dentry->d_name.name, dentry, flags);
+ p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%pd) %p flags: %x\n",
+ dir, dentry, dentry, flags);
if (dentry->d_name.len > NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
@@ -1239,7 +1239,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
struct p9_fid *fid;
struct p9_wstat *st;
- p9_debug(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, " %pd\n", dentry);
retval = -EPERM;
v9ses = v9fs_dentry2v9ses(dentry);
fid = v9fs_fid_lookup(dentry);
@@ -1262,8 +1262,8 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
retval = min(strlen(st->extension)+1, (size_t)buflen);
memcpy(buffer, st->extension, retval);
- p9_debug(P9_DEBUG_VFS, "%s -> %s (%.*s)\n",
- dentry->d_name.name, st->extension, buflen, buffer);
+ p9_debug(P9_DEBUG_VFS, "%pd -> %s (%.*s)\n",
+ dentry, st->extension, buflen, buffer);
done:
p9stat_free(st);
@@ -1283,7 +1283,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
int len = 0;
char *link = __getname();
- p9_debug(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
if (!link)
link = ERR_PTR(-ENOMEM);
@@ -1314,8 +1314,8 @@ v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
{
char *s = nd_get_link(nd);
- p9_debug(P9_DEBUG_VFS, " %s %s\n",
- dentry->d_name.name, IS_ERR(s) ? "<error>" : s);
+ p9_debug(P9_DEBUG_VFS, " %pd %s\n",
+ dentry, IS_ERR(s) ? "<error>" : s);
if (!IS_ERR(s))
__putname(s);
}
@@ -1364,8 +1364,8 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
static int
v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
{
- p9_debug(P9_DEBUG_VFS, " %lu,%s,%s\n",
- dir->i_ino, dentry->d_name.name, symname);
+ p9_debug(P9_DEBUG_VFS, " %lu,%pd,%s\n",
+ dir->i_ino, dentry, symname);
return v9fs_vfs_mkspecial(dir, dentry, P9_DMSYMLINK, symname);
}
@@ -1386,8 +1386,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
char *name;
struct p9_fid *oldfid;
- p9_debug(P9_DEBUG_VFS, " %lu,%s,%s\n",
- dir->i_ino, dentry->d_name.name, old_dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, " %lu,%pd,%pd\n",
+ dir->i_ino, dentry, old_dentry);
oldfid = v9fs_fid_clone(old_dentry);
if (IS_ERR(oldfid))
@@ -1428,8 +1428,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rde
char *name;
u32 perm;
- p9_debug(P9_DEBUG_VFS, " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n",
- dir->i_ino, dentry->d_name.name, mode,
+ p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n",
+ dir->i_ino, dentry, mode,
MAJOR(rdev), MINOR(rdev));
if (!new_valid_dev(rdev))
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 1fa85aae24df..02b64f4e576a 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -393,7 +393,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
struct dentry *dir_dentry;
struct posix_acl *dacl = NULL, *pacl = NULL;
- p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry);
err = 0;
v9ses = v9fs_inode2v9ses(dir);
@@ -767,8 +767,8 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
struct p9_fid *dfid, *oldfid;
struct v9fs_session_info *v9ses;
- p9_debug(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
- dir->i_ino, old_dentry->d_name.name, dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "dir ino: %lu, old_name: %pd, new_name: %pd\n",
+ dir->i_ino, old_dentry, dentry);
v9ses = v9fs_inode2v9ses(dir);
dir_dentry = dentry->d_parent;
@@ -917,7 +917,7 @@ v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
char *link = __getname();
char *target;
- p9_debug(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);
+ p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
if (!link) {
link = ERR_PTR(-ENOMEM);
diff --git a/fs/Kconfig b/fs/Kconfig
index 312393f32948..664991afe0c0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -67,6 +67,7 @@ source "fs/quota/Kconfig"
source "fs/autofs4/Kconfig"
source "fs/fuse/Kconfig"
+source "fs/overlayfs/Kconfig"
menu "Caches"
@@ -233,9 +234,13 @@ if NETWORK_FILESYSTEMS
source "fs/nfs/Kconfig"
source "fs/nfsd/Kconfig"
+config GRACE_PERIOD
+ tristate
+
config LOCKD
tristate
depends on FILE_LOCKING
+ select GRACE_PERIOD
config LOCKD_V4
bool
@@ -249,7 +254,7 @@ config NFS_ACL_SUPPORT
config NFS_COMMON
bool
- depends on NFSD || NFS_FS
+ depends on NFSD || NFS_FS || LOCKD
default y
source "net/sunrpc/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 4030cbfbc9af..34a1b9dea6dd 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o splice.o sync.o utimes.o \
- stack.o fs_struct.o statfs.o
+ stack.o fs_struct.o statfs.o fs_pin.o
ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o block_dev.o direct-io.o mpage.o
@@ -104,6 +104,7 @@ obj-$(CONFIG_QNX6FS_FS) += qnx6/
obj-$(CONFIG_AUTOFS4_FS) += autofs4/
obj-$(CONFIG_ADFS_FS) += adfs/
obj-$(CONFIG_FUSE_FS) += fuse/
+obj-$(CONFIG_OVERLAYFS_FS) += overlayfs/
obj-$(CONFIG_UDF_FS) += udf/
obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/
obj-$(CONFIG_OMFS_FS) += omfs/
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index c770337c4b45..24575d9d882d 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -153,6 +153,7 @@ extern int adfs_map_lookup(struct super_block *sb, unsigned int frag_id, unsigne
extern unsigned int adfs_map_free(struct super_block *sb);
/* Misc */
+__printf(3, 4)
void __adfs_error(struct super_block *sb, const char *function,
const char *fmt, ...);
#define adfs_error(sb, fmt...) __adfs_error(sb, __func__, fmt)
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 0d138c0de293..51c279a29845 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -138,7 +138,7 @@ adfs_dir_lookup_byname(struct inode *inode, struct qstr *name, struct object_inf
goto out;
if (ADFS_I(inode)->parent_id != dir.parent_id) {
- adfs_error(sb, "parent directory changed under me! (%lx but got %lx)\n",
+ adfs_error(sb, "parent directory changed under me! (%lx but got %x)\n",
ADFS_I(inode)->parent_id, dir.parent_id);
ret = -EIO;
goto free_out;
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c
index d9e3bee4e653..f2ba88ab4aed 100644
--- a/fs/adfs/dir_fplus.c
+++ b/fs/adfs/dir_fplus.c
@@ -55,10 +55,10 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct
}
size >>= sb->s_blocksize_bits;
- if (size > sizeof(dir->bh)/sizeof(dir->bh[0])) {
+ if (size > ARRAY_SIZE(dir->bh)) {
/* this directory is too big for fixed bh set, must allocate */
struct buffer_head **bh_fplus =
- kzalloc(size * sizeof(struct buffer_head *),
+ kcalloc(size, sizeof(struct buffer_head *),
GFP_KERNEL);
if (!bh_fplus) {
adfs_error(sb, "not enough memory for"
@@ -79,9 +79,8 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct
dir->bh_fplus[blk] = sb_bread(sb, block);
if (!dir->bh_fplus[blk]) {
- adfs_error(sb, "dir object %X failed read for"
- " offset %d, mapped block %X",
- id, blk, block);
+ adfs_error(sb, "dir object %x failed read for offset %d, mapped block %lX",
+ id, blk, block);
goto out;
}
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 406b29836b19..abc853968fed 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -10,8 +10,6 @@
#include "affs.h"
-extern struct timezone sys_tz;
-
static char ErrorBuffer[256];
/*
diff --git a/fs/affs/file.c b/fs/affs/file.c
index a7fe57d2cd9a..1ed590aafecf 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -584,11 +584,14 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize)
bh->b_state &= ~(1UL << BH_New);
mark_buffer_dirty_inode(bh, inode);
if (prev_bh) {
- u32 tmp = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next);
- if (tmp)
- affs_warning(sb, "extent_file_ofs", "next block already set for %d (%d)", bidx, tmp);
+ u32 tmp_next = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next);
+
+ if (tmp_next)
+ affs_warning(sb, "extent_file_ofs",
+ "next block already set for %d (%d)",
+ bidx, tmp_next);
AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr);
- affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp);
+ affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp_next);
mark_buffer_dirty_inode(prev_bh, inode);
affs_brelse(prev_bh);
}
@@ -727,11 +730,14 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
AFFS_DATA_HEAD(bh)->next = 0;
bh->b_state &= ~(1UL << BH_New);
if (prev_bh) {
- u32 tmp = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next);
- if (tmp)
- affs_warning(sb, "commit_write_ofs", "next block already set for %d (%d)", bidx, tmp);
+ u32 tmp_next = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next);
+
+ if (tmp_next)
+ affs_warning(sb, "commit_write_ofs",
+ "next block already set for %d (%d)",
+ bidx, tmp_next);
AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr);
- affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp);
+ affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp_next);
mark_buffer_dirty_inode(prev_bh, inode);
}
}
@@ -758,11 +764,14 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
AFFS_DATA_HEAD(bh)->next = 0;
bh->b_state &= ~(1UL << BH_New);
if (prev_bh) {
- u32 tmp = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next);
- if (tmp)
- affs_warning(sb, "commit_write_ofs", "next block already set for %d (%d)", bidx, tmp);
+ u32 tmp_next = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next);
+
+ if (tmp_next)
+ affs_warning(sb, "commit_write_ofs",
+ "next block already set for %d (%d)",
+ bidx, tmp_next);
AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr);
- affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp);
+ affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp_next);
mark_buffer_dirty_inode(prev_bh, inode);
}
} else if (be32_to_cpu(AFFS_DATA_HEAD(bh)->size) < tmp)
@@ -842,12 +851,12 @@ affs_truncate(struct inode *inode)
struct address_space *mapping = inode->i_mapping;
struct page *page;
void *fsdata;
- loff_t size = inode->i_size;
+ loff_t isize = inode->i_size;
int res;
- res = mapping->a_ops->write_begin(NULL, mapping, size, 0, 0, &page, &fsdata);
+ res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, 0, &page, &fsdata);
if (!res)
- res = mapping->a_ops->write_end(NULL, mapping, size, 0, 0, page, fsdata);
+ res = mapping->a_ops->write_end(NULL, mapping, isize, 0, 0, page, fsdata);
else
inode->i_size = AFFS_I(inode)->mmu_private;
mark_inode_dirty(inode);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index bec2d1a0c91c..e217c511459b 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -14,13 +14,11 @@
#include "affs.h"
extern const struct inode_operations affs_symlink_inode_operations;
-extern struct timezone sys_tz;
struct inode *affs_iget(struct super_block *sb, unsigned long ino)
{
struct affs_sb_info *sbi = AFFS_SB(sb);
struct buffer_head *bh;
- struct affs_head *head;
struct affs_tail *tail;
struct inode *inode;
u32 block;
@@ -49,7 +47,6 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
goto bad_inode;
}
- head = AFFS_HEAD(bh);
tail = AFFS_TAIL(sb, bh);
prot = be32_to_cpu(tail->protect);
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 51f1a95bff73..f754ab68a840 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -20,8 +20,6 @@
#include <linux/writeback.h>
#include "affs.h"
-extern struct timezone sys_tz;
-
static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
static int affs_remount (struct super_block *sb, int *flags, char *data);
@@ -308,7 +306,6 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
u32 chksum;
int num_bm;
int i, j;
- s32 key;
kuid_t uid;
kgid_t gid;
int reserved;
@@ -367,7 +364,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
i = j = blocksize;
size = size / (blocksize / 512);
}
- for (blocksize = i, key = 0; blocksize <= j; blocksize <<= 1, size >>= 1) {
+ for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) {
sbi->s_root_block = root_block;
if (root_block < 0)
sbi->s_root_block = (reserved + size - 1) / 2;
@@ -399,7 +396,6 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
be32_to_cpu(AFFS_ROOT_TAIL(sb, root_bh)->stype) == ST_ROOT) {
sbi->s_hashsize = blocksize / 4 - 56;
sbi->s_root_block += num_bm;
- key = 1;
goto got_root;
}
affs_brelse(root_bh);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 529300327f45..a1645b88fe8a 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -669,7 +669,6 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
out_valid:
dentry->d_fsdata = dir_version;
-out_skip:
dput(parent);
key_put(key);
_leave(" = 1 [valid]");
@@ -682,10 +681,6 @@ not_found:
spin_unlock(&dentry->d_lock);
out_bad:
- /* don't unhash if we have submounts */
- if (check_submounts_and_drop(dentry) != 0)
- goto out_skip;
-
_debug("dropping dentry %s/%s",
parent->d_name.name, dentry->d_name.name);
dput(parent);
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index b6df2e83809f..52976785a32c 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -130,7 +130,6 @@ static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl,
/* second+ BUSY - sleep a little bit */
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_timeout(1);
- __set_current_state(TASK_RUNNING);
}
continue;
}
diff --git a/fs/aio.c b/fs/aio.c
index bd7ec2cc2674..84a751005f5b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -141,6 +141,7 @@ struct kioctx {
struct {
unsigned tail;
+ unsigned completed_events;
spinlock_t completion_lock;
} ____cacheline_aligned_in_smp;
@@ -192,7 +193,6 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
}
file->f_flags = O_RDWR;
- file->private_data = ctx;
return file;
}
@@ -202,7 +202,7 @@ static struct dentry *aio_mount(struct file_system_type *fs_type,
static const struct dentry_operations ops = {
.d_dname = simple_dname,
};
- return mount_pseudo(fs_type, "aio:", NULL, &ops, 0xa10a10a1);
+ return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC);
}
/* aio_setup
@@ -556,8 +556,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
struct aio_ring *ring;
spin_lock(&mm->ioctx_lock);
- rcu_read_lock();
- table = rcu_dereference(mm->ioctx_table);
+ table = rcu_dereference_raw(mm->ioctx_table);
while (1) {
if (table)
@@ -565,7 +564,6 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
if (!table->table[i]) {
ctx->id = i;
table->table[i] = ctx;
- rcu_read_unlock();
spin_unlock(&mm->ioctx_lock);
/* While kioctx setup is in progress,
@@ -579,8 +577,6 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
}
new_nr = (table ? table->nr : 1) * 4;
-
- rcu_read_unlock();
spin_unlock(&mm->ioctx_lock);
table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
@@ -591,8 +587,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
table->nr = new_nr;
spin_lock(&mm->ioctx_lock);
- rcu_read_lock();
- old = rcu_dereference(mm->ioctx_table);
+ old = rcu_dereference_raw(mm->ioctx_table);
if (!old) {
rcu_assign_pointer(mm->ioctx_table, table);
@@ -666,10 +661,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
INIT_LIST_HEAD(&ctx->active_reqs);
- if (percpu_ref_init(&ctx->users, free_ioctx_users))
+ if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL))
goto err;
- if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
+ if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL))
goto err;
ctx->cpu = alloc_percpu(struct kioctx_cpu);
@@ -739,12 +734,9 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
spin_lock(&mm->ioctx_lock);
- rcu_read_lock();
- table = rcu_dereference(mm->ioctx_table);
-
+ table = rcu_dereference_raw(mm->ioctx_table);
WARN_ON(ctx != table->table[ctx->id]);
table->table[ctx->id] = NULL;
- rcu_read_unlock();
spin_unlock(&mm->ioctx_lock);
/* percpu_ref_kill() will do the necessary call_rcu() */
@@ -793,40 +785,35 @@ EXPORT_SYMBOL(wait_on_sync_kiocb);
*/
void exit_aio(struct mm_struct *mm)
{
- struct kioctx_table *table;
- struct kioctx *ctx;
- unsigned i = 0;
-
- while (1) {
- rcu_read_lock();
- table = rcu_dereference(mm->ioctx_table);
-
- do {
- if (!table || i >= table->nr) {
- rcu_read_unlock();
- rcu_assign_pointer(mm->ioctx_table, NULL);
- if (table)
- kfree(table);
- return;
- }
+ struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
+ int i;
- ctx = table->table[i++];
- } while (!ctx);
+ if (!table)
+ return;
- rcu_read_unlock();
+ for (i = 0; i < table->nr; ++i) {
+ struct kioctx *ctx = table->table[i];
+ struct completion requests_done =
+ COMPLETION_INITIALIZER_ONSTACK(requests_done);
+ if (!ctx)
+ continue;
/*
- * We don't need to bother with munmap() here -
- * exit_mmap(mm) is coming and it'll unmap everything.
- * Since aio_free_ring() uses non-zero ->mmap_size
- * as indicator that it needs to unmap the area,
- * just set it to 0; aio_free_ring() is the only
- * place that uses ->mmap_size, so it's safe.
+ * We don't need to bother with munmap() here - exit_mmap(mm)
+ * is coming and it'll unmap everything. And we simply can't,
+ * this is not necessarily our ->mm.
+ * Since kill_ioctx() uses non-zero ->mmap_size as indicator
+ * that it needs to unmap the area, just set it to 0.
*/
ctx->mmap_size = 0;
+ kill_ioctx(mm, ctx, &requests_done);
- kill_ioctx(mm, ctx, NULL);
+ /* Wait until all IO for the context are done. */
+ wait_for_completion(&requests_done);
}
+
+ RCU_INIT_POINTER(mm->ioctx_table, NULL);
+ kfree(table);
}
static void put_reqs_available(struct kioctx *ctx, unsigned nr)
@@ -834,10 +821,8 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr)
struct kioctx_cpu *kcpu;
unsigned long flags;
- preempt_disable();
- kcpu = this_cpu_ptr(ctx->cpu);
-
local_irq_save(flags);
+ kcpu = this_cpu_ptr(ctx->cpu);
kcpu->reqs_available += nr;
while (kcpu->reqs_available >= ctx->req_batch * 2) {
@@ -846,7 +831,6 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr)
}
local_irq_restore(flags);
- preempt_enable();
}
static bool get_reqs_available(struct kioctx *ctx)
@@ -855,10 +839,8 @@ static bool get_reqs_available(struct kioctx *ctx)
bool ret = false;
unsigned long flags;
- preempt_disable();
- kcpu = this_cpu_ptr(ctx->cpu);
-
local_irq_save(flags);
+ kcpu = this_cpu_ptr(ctx->cpu);
if (!kcpu->reqs_available) {
int old, avail = atomic_read(&ctx->reqs_available);
@@ -878,10 +860,71 @@ static bool get_reqs_available(struct kioctx *ctx)
kcpu->reqs_available--;
out:
local_irq_restore(flags);
- preempt_enable();
return ret;
}
+/* refill_reqs_available
+ * Updates the reqs_available reference counts used for tracking the
+ * number of free slots in the completion ring. This can be called
+ * from aio_complete() (to optimistically update reqs_available) or
+ * from aio_get_req() (the we're out of events case). It must be
+ * called holding ctx->completion_lock.
+ */
+static void refill_reqs_available(struct kioctx *ctx, unsigned head,
+ unsigned tail)
+{
+ unsigned events_in_ring, completed;
+
+ /* Clamp head since userland can write to it. */
+ head %= ctx->nr_events;
+ if (head <= tail)
+ events_in_ring = tail - head;
+ else
+ events_in_ring = ctx->nr_events - (head - tail);
+
+ completed = ctx->completed_events;
+ if (events_in_ring < completed)
+ completed -= events_in_ring;
+ else
+ completed = 0;
+
+ if (!completed)
+ return;
+
+ ctx->completed_events -= completed;
+ put_reqs_available(ctx, completed);
+}
+
+/* user_refill_reqs_available
+ * Called to refill reqs_available when aio_get_req() encounters an
+ * out of space in the completion ring.
+ */
+static void user_refill_reqs_available(struct kioctx *ctx)
+{
+ spin_lock_irq(&ctx->completion_lock);
+ if (ctx->completed_events) {
+ struct aio_ring *ring;
+ unsigned head;
+
+ /* Access of ring->head may race with aio_read_events_ring()
+ * here, but that's okay since whether we read the old version
+ * or the new version, and either will be valid. The important
+ * part is that head cannot pass tail since we prevent
+ * aio_complete() from updating tail by holding
+ * ctx->completion_lock. Even if head is invalid, the check
+ * against ctx->completed_events below will make sure we do the
+ * safe/right thing.
+ */
+ ring = kmap_atomic(ctx->ring_pages[0]);
+ head = ring->head;
+ kunmap_atomic(ring);
+
+ refill_reqs_available(ctx, head, ctx->tail);
+ }
+
+ spin_unlock_irq(&ctx->completion_lock);
+}
+
/* aio_get_req
* Allocate a slot for an aio request.
* Returns NULL if no requests are free.
@@ -890,8 +933,11 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx)
{
struct kiocb *req;
- if (!get_reqs_available(ctx))
- return NULL;
+ if (!get_reqs_available(ctx)) {
+ user_refill_reqs_available(ctx);
+ if (!get_reqs_available(ctx))
+ return NULL;
+ }
req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
if (unlikely(!req))
@@ -950,8 +996,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
struct kioctx *ctx = iocb->ki_ctx;
struct aio_ring *ring;
struct io_event *ev_page, *event;
+ unsigned tail, pos, head;
unsigned long flags;
- unsigned tail, pos;
/*
* Special case handling for sync iocbs:
@@ -1012,10 +1058,14 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
ctx->tail = tail;
ring = kmap_atomic(ctx->ring_pages[0]);
+ head = ring->head;
ring->tail = tail;
kunmap_atomic(ring);
flush_dcache_page(ctx->ring_pages[0]);
+ ctx->completed_events++;
+ if (ctx->completed_events > 1)
+ refill_reqs_available(ctx, head, tail);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
pr_debug("added to ring %p at [%u]\n", iocb, tail);
@@ -1030,7 +1080,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
/* everything turned out well, dispose of the aiocb. */
kiocb_free(iocb);
- put_reqs_available(ctx, 1);
/*
* We have to order our ring_info tail store above and test
@@ -1047,7 +1096,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
}
EXPORT_SYMBOL(aio_complete);
-/* aio_read_events
+/* aio_read_events_ring
* Pull an event off of the ioctx's event ring. Returns the number of
* events fetched
*/
@@ -1067,6 +1116,12 @@ static long aio_read_events_ring(struct kioctx *ctx,
tail = ring->tail;
kunmap_atomic(ring);
+ /*
+ * Ensure that once we've read the current tail pointer, that
+ * we also see the events that were stored up to the tail.
+ */
+ smp_rmb();
+
pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);
if (head == tail)
@@ -1270,12 +1325,12 @@ static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,
if (compat)
ret = compat_rw_copy_check_uvector(rw,
(struct compat_iovec __user *)buf,
- *nr_segs, 1, *iovec, iovec);
+ *nr_segs, UIO_FASTIOV, *iovec, iovec);
else
#endif
ret = rw_copy_check_uvector(rw,
(struct iovec __user *)buf,
- *nr_segs, 1, *iovec, iovec);
+ *nr_segs, UIO_FASTIOV, *iovec, iovec);
if (ret < 0)
return ret;
@@ -1299,9 +1354,8 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
}
/*
- * aio_setup_iocb:
- * Performs the initial checks and aio retry method
- * setup for the kiocb at the time of io submission.
+ * aio_run_iocb:
+ * Performs the initial checks and io submission.
*/
static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
char __user *buf, bool compat)
@@ -1313,7 +1367,7 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
fmode_t mode;
aio_rw_op *rw_op;
rw_iter_op *iter_op;
- struct iovec inline_vec, *iovec = &inline_vec;
+ struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct iov_iter iter;
switch (opcode) {
@@ -1348,7 +1402,7 @@ rw_common:
if (!ret)
ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
if (ret < 0) {
- if (iovec != &inline_vec)
+ if (iovec != inline_vecs)
kfree(iovec);
return ret;
}
@@ -1395,7 +1449,7 @@ rw_common:
return -EINVAL;
}
- if (iovec != &inline_vec)
+ if (iovec != inline_vecs)
kfree(iovec);
if (ret != -EIOCBQUEUED) {
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index acf32054edd8..8e98cf954bab 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -79,6 +79,10 @@ struct autofs_info {
};
#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */
+#define AUTOFS_INF_NO_RCU (1<<1) /* the dentry is being considered
+ * for expiry, so RCU_walk is
+ * not permitted
+ */
#define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */
struct autofs_wait_queue {
@@ -143,26 +147,12 @@ static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) {
return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
}
-/* Does a dentry have some pending activity? */
-static inline int autofs4_ispending(struct dentry *dentry)
-{
- struct autofs_info *inf = autofs4_dentry_ino(dentry);
-
- if (inf->flags & AUTOFS_INF_PENDING)
- return 1;
-
- if (inf->flags & AUTOFS_INF_EXPIRING)
- return 1;
-
- return 0;
-}
-
struct inode *autofs4_get_inode(struct super_block *, umode_t);
void autofs4_free_ino(struct autofs_info *);
/* Expiration */
int is_autofs4_dentry(struct dentry *);
-int autofs4_expire_wait(struct dentry *dentry);
+int autofs4_expire_wait(struct dentry *dentry, int rcu_walk);
int autofs4_expire_run(struct super_block *, struct vfsmount *,
struct autofs_sb_info *,
struct autofs_packet_expire __user *);
@@ -191,55 +181,6 @@ extern const struct file_operations autofs4_root_operations;
extern const struct dentry_operations autofs4_dentry_operations;
/* VFS automount flags management functions */
-
-static inline void __managed_dentry_set_automount(struct dentry *dentry)
-{
- dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
-}
-
-static inline void managed_dentry_set_automount(struct dentry *dentry)
-{
- spin_lock(&dentry->d_lock);
- __managed_dentry_set_automount(dentry);
- spin_unlock(&dentry->d_lock);
-}
-
-static inline void __managed_dentry_clear_automount(struct dentry *dentry)
-{
- dentry->d_flags &= ~DCACHE_NEED_AUTOMOUNT;
-}
-
-static inline void managed_dentry_clear_automount(struct dentry *dentry)
-{
- spin_lock(&dentry->d_lock);
- __managed_dentry_clear_automount(dentry);
- spin_unlock(&dentry->d_lock);
-}
-
-static inline void __managed_dentry_set_transit(struct dentry *dentry)
-{
- dentry->d_flags |= DCACHE_MANAGE_TRANSIT;
-}
-
-static inline void managed_dentry_set_transit(struct dentry *dentry)
-{
- spin_lock(&dentry->d_lock);
- __managed_dentry_set_transit(dentry);
- spin_unlock(&dentry->d_lock);
-}
-
-static inline void __managed_dentry_clear_transit(struct dentry *dentry)
-{
- dentry->d_flags &= ~DCACHE_MANAGE_TRANSIT;
-}
-
-static inline void managed_dentry_clear_transit(struct dentry *dentry)
-{
- spin_lock(&dentry->d_lock);
- __managed_dentry_clear_transit(dentry);
- spin_unlock(&dentry->d_lock);
-}
-
static inline void __managed_dentry_set_managed(struct dentry *dentry)
{
dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 5b570b6efa28..aaf96cb25452 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -450,7 +450,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
ino = autofs4_dentry_ino(path.dentry);
if (ino) {
err = 0;
- autofs4_expire_wait(path.dentry);
+ autofs4_expire_wait(path.dentry, 0);
spin_lock(&sbi->fs_lock);
param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid);
param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 394e90b02c5e..683a5b9ce22a 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -30,12 +30,6 @@ static inline int autofs4_can_expire(struct dentry *dentry,
/* Too young to die */
if (!timeout || time_after(ino->last_used + timeout, now))
return 0;
-
- /* update last_used here :-
- - obviously makes sense if it is in use now
- - less obviously, prevents rapid-fire expire
- attempts if expire fails the first time */
- ino->last_used = now;
}
return 1;
}
@@ -255,12 +249,6 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
struct autofs_info *ino = autofs4_dentry_ino(p);
unsigned int ino_count = atomic_read(&ino->count);
- /*
- * Clean stale dentries below that have not been
- * invalidated after a mount fail during lookup
- */
- d_invalidate(p);
-
/* allow for dget above and top is already dgot */
if (p == top)
ino_count += 2;
@@ -333,11 +321,19 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
if (ino->flags & AUTOFS_INF_PENDING)
goto out;
if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
- struct autofs_info *ino = autofs4_dentry_ino(root);
- ino->flags |= AUTOFS_INF_EXPIRING;
- init_completion(&ino->expire_complete);
+ ino->flags |= AUTOFS_INF_NO_RCU;
spin_unlock(&sbi->fs_lock);
- return root;
+ synchronize_rcu();
+ spin_lock(&sbi->fs_lock);
+ if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
+ ino->flags |= AUTOFS_INF_EXPIRING;
+ smp_mb();
+ ino->flags &= ~AUTOFS_INF_NO_RCU;
+ init_completion(&ino->expire_complete);
+ spin_unlock(&sbi->fs_lock);
+ return root;
+ }
+ ino->flags &= ~AUTOFS_INF_NO_RCU;
}
out:
spin_unlock(&sbi->fs_lock);
@@ -346,6 +342,89 @@ out:
return NULL;
}
+/* Check if 'dentry' should expire, or return a nearby
+ * dentry that is suitable.
+ * If returned dentry is different from arg dentry,
+ * then a dget() reference was taken, else not.
+ */
+static struct dentry *should_expire(struct dentry *dentry,
+ struct vfsmount *mnt,
+ unsigned long timeout,
+ int how)
+{
+ int do_now = how & AUTOFS_EXP_IMMEDIATE;
+ int exp_leaves = how & AUTOFS_EXP_LEAVES;
+ struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ unsigned int ino_count;
+
+ /* No point expiring a pending mount */
+ if (ino->flags & AUTOFS_INF_PENDING)
+ return NULL;
+
+ /*
+ * Case 1: (i) indirect mount or top level pseudo direct mount
+ * (autofs-4.1).
+ * (ii) indirect mount with offset mount, check the "/"
+ * offset (autofs-5.0+).
+ */
+ if (d_mountpoint(dentry)) {
+ DPRINTK("checking mountpoint %p %.*s",
+ dentry, (int)dentry->d_name.len, dentry->d_name.name);
+
+ /* Can we umount this guy */
+ if (autofs4_mount_busy(mnt, dentry))
+ return NULL;
+
+ /* Can we expire this guy */
+ if (autofs4_can_expire(dentry, timeout, do_now))
+ return dentry;
+ return NULL;
+ }
+
+ if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) {
+ DPRINTK("checking symlink %p %.*s",
+ dentry, (int)dentry->d_name.len, dentry->d_name.name);
+ /*
+ * A symlink can't be "busy" in the usual sense so
+ * just check last used for expire timeout.
+ */
+ if (autofs4_can_expire(dentry, timeout, do_now))
+ return dentry;
+ return NULL;
+ }
+
+ if (simple_empty(dentry))
+ return NULL;
+
+ /* Case 2: tree mount, expire iff entire tree is not busy */
+ if (!exp_leaves) {
+ /* Path walk currently on this dentry? */
+ ino_count = atomic_read(&ino->count) + 1;
+ if (d_count(dentry) > ino_count)
+ return NULL;
+
+ if (!autofs4_tree_busy(mnt, dentry, timeout, do_now))
+ return dentry;
+ /*
+ * Case 3: pseudo direct mount, expire individual leaves
+ * (autofs-4.1).
+ */
+ } else {
+ /* Path walk currently on this dentry? */
+ struct dentry *expired;
+ ino_count = atomic_read(&ino->count) + 1;
+ if (d_count(dentry) > ino_count)
+ return NULL;
+
+ expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
+ if (expired) {
+ if (expired == dentry)
+ dput(dentry);
+ return expired;
+ }
+ }
+ return NULL;
+}
/*
* Find an eligible tree to time-out
* A tree is eligible if :-
@@ -360,11 +439,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
unsigned long timeout;
struct dentry *root = sb->s_root;
struct dentry *dentry;
- struct dentry *expired = NULL;
- int do_now = how & AUTOFS_EXP_IMMEDIATE;
- int exp_leaves = how & AUTOFS_EXP_LEAVES;
+ struct dentry *expired;
struct autofs_info *ino;
- unsigned int ino_count;
if (!root)
return NULL;
@@ -376,77 +452,28 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
while ((dentry = get_next_positive_subdir(dentry, root))) {
spin_lock(&sbi->fs_lock);
ino = autofs4_dentry_ino(dentry);
- /* No point expiring a pending mount */
- if (ino->flags & AUTOFS_INF_PENDING)
- goto next;
-
- /*
- * Case 1: (i) indirect mount or top level pseudo direct mount
- * (autofs-4.1).
- * (ii) indirect mount with offset mount, check the "/"
- * offset (autofs-5.0+).
- */
- if (d_mountpoint(dentry)) {
- DPRINTK("checking mountpoint %p %.*s",
- dentry, (int)dentry->d_name.len, dentry->d_name.name);
-
- /* Can we umount this guy */
- if (autofs4_mount_busy(mnt, dentry))
- goto next;
-
- /* Can we expire this guy */
- if (autofs4_can_expire(dentry, timeout, do_now)) {
- expired = dentry;
- goto found;
- }
- goto next;
- }
-
- if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) {
- DPRINTK("checking symlink %p %.*s",
- dentry, (int)dentry->d_name.len, dentry->d_name.name);
- /*
- * A symlink can't be "busy" in the usual sense so
- * just check last used for expire timeout.
- */
- if (autofs4_can_expire(dentry, timeout, do_now)) {
- expired = dentry;
- goto found;
- }
- goto next;
+ if (ino->flags & AUTOFS_INF_NO_RCU)
+ expired = NULL;
+ else
+ expired = should_expire(dentry, mnt, timeout, how);
+ if (!expired) {
+ spin_unlock(&sbi->fs_lock);
+ continue;
}
-
- if (simple_empty(dentry))
- goto next;
-
- /* Case 2: tree mount, expire iff entire tree is not busy */
- if (!exp_leaves) {
- /* Path walk currently on this dentry? */
- ino_count = atomic_read(&ino->count) + 1;
- if (d_count(dentry) > ino_count)
- goto next;
-
- if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
- expired = dentry;
- goto found;
- }
- /*
- * Case 3: pseudo direct mount, expire individual leaves
- * (autofs-4.1).
- */
- } else {
- /* Path walk currently on this dentry? */
- ino_count = atomic_read(&ino->count) + 1;
- if (d_count(dentry) > ino_count)
- goto next;
-
- expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
- if (expired) {
+ ino = autofs4_dentry_ino(expired);
+ ino->flags |= AUTOFS_INF_NO_RCU;
+ spin_unlock(&sbi->fs_lock);
+ synchronize_rcu();
+ spin_lock(&sbi->fs_lock);
+ if (should_expire(expired, mnt, timeout, how)) {
+ if (expired != dentry)
dput(dentry);
- goto found;
- }
+ goto found;
}
-next:
+
+ ino->flags &= ~AUTOFS_INF_NO_RCU;
+ if (expired != dentry)
+ dput(expired);
spin_unlock(&sbi->fs_lock);
}
return NULL;
@@ -454,8 +481,9 @@ next:
found:
DPRINTK("returning %p %.*s",
expired, (int)expired->d_name.len, expired->d_name.name);
- ino = autofs4_dentry_ino(expired);
ino->flags |= AUTOFS_INF_EXPIRING;
+ smp_mb();
+ ino->flags &= ~AUTOFS_INF_NO_RCU;
init_completion(&ino->expire_complete);
spin_unlock(&sbi->fs_lock);
spin_lock(&sbi->lookup_lock);
@@ -468,13 +496,18 @@ found:
return expired;
}
-int autofs4_expire_wait(struct dentry *dentry)
+int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
int status;
/* Block on any pending expire */
+ if (!(ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU)))
+ return 0;
+ if (rcu_walk)
+ return -ECHILD;
+
spin_lock(&sbi->fs_lock);
if (ino->flags & AUTOFS_INF_EXPIRING) {
spin_unlock(&sbi->fs_lock);
@@ -526,6 +559,8 @@ int autofs4_expire_run(struct super_block *sb,
spin_lock(&sbi->fs_lock);
ino = autofs4_dentry_ino(dentry);
+ /* avoid rapid-fire expire attempts if expiry fails */
+ ino->last_used = now;
ino->flags &= ~AUTOFS_INF_EXPIRING;
complete_all(&ino->expire_complete);
spin_unlock(&sbi->fs_lock);
@@ -552,6 +587,8 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
spin_lock(&sbi->fs_lock);
+ /* avoid rapid-fire expire attempts if expiry fails */
+ ino->last_used = now;
ino->flags &= ~AUTOFS_INF_EXPIRING;
complete_all(&ino->expire_complete);
spin_unlock(&sbi->fs_lock);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index cc87c1abac97..d76d083f2f06 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -166,8 +166,10 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
const unsigned char *str = name->name;
struct list_head *p, *head;
- spin_lock(&sbi->lookup_lock);
head = &sbi->active_list;
+ if (list_empty(head))
+ return NULL;
+ spin_lock(&sbi->lookup_lock);
list_for_each(p, head) {
struct autofs_info *ino;
struct dentry *active;
@@ -208,7 +210,8 @@ next:
return NULL;
}
-static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
+static struct dentry *autofs4_lookup_expiring(struct dentry *dentry,
+ bool rcu_walk)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct dentry *parent = dentry->d_parent;
@@ -218,13 +221,20 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
const unsigned char *str = name->name;
struct list_head *p, *head;
- spin_lock(&sbi->lookup_lock);
head = &sbi->expiring_list;
+ if (list_empty(head))
+ return NULL;
+ spin_lock(&sbi->lookup_lock);
list_for_each(p, head) {
struct autofs_info *ino;
struct dentry *expiring;
struct qstr *qstr;
+ if (rcu_walk) {
+ spin_unlock(&sbi->lookup_lock);
+ return ERR_PTR(-ECHILD);
+ }
+
ino = list_entry(p, struct autofs_info, expiring);
expiring = ino->dentry;
@@ -260,13 +270,15 @@ next:
return NULL;
}
-static int autofs4_mount_wait(struct dentry *dentry)
+static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
int status = 0;
if (ino->flags & AUTOFS_INF_PENDING) {
+ if (rcu_walk)
+ return -ECHILD;
DPRINTK("waiting for mount name=%.*s",
dentry->d_name.len, dentry->d_name.name);
status = autofs4_wait(sbi, dentry, NFY_MOUNT);
@@ -276,20 +288,22 @@ static int autofs4_mount_wait(struct dentry *dentry)
return status;
}
-static int do_expire_wait(struct dentry *dentry)
+static int do_expire_wait(struct dentry *dentry, bool rcu_walk)
{
struct dentry *expiring;
- expiring = autofs4_lookup_expiring(dentry);
+ expiring = autofs4_lookup_expiring(dentry, rcu_walk);
+ if (IS_ERR(expiring))
+ return PTR_ERR(expiring);
if (!expiring)
- return autofs4_expire_wait(dentry);
+ return autofs4_expire_wait(dentry, rcu_walk);
else {
/*
* If we are racing with expire the request might not
* be quite complete, but the directory has been removed
* so it must have been successful, just wait for it.
*/
- autofs4_expire_wait(expiring);
+ autofs4_expire_wait(expiring, 0);
autofs4_del_expiring(expiring);
dput(expiring);
}
@@ -341,7 +355,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
* and the directory was removed, so just go ahead and try
* the mount.
*/
- status = do_expire_wait(dentry);
+ status = do_expire_wait(dentry, 0);
if (status && status != -EAGAIN)
return NULL;
@@ -349,7 +363,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
spin_lock(&sbi->fs_lock);
if (ino->flags & AUTOFS_INF_PENDING) {
spin_unlock(&sbi->fs_lock);
- status = autofs4_mount_wait(dentry);
+ status = autofs4_mount_wait(dentry, 0);
if (status)
return ERR_PTR(status);
goto done;
@@ -373,7 +387,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
* this because the leaves of the directory tree under the
* mount never trigger mounts themselves (they have an autofs
* trigger mount mounted on them). But v4 pseudo direct mounts
- * do need the leaves to to trigger mounts. In this case we
+ * do need the leaves to trigger mounts. In this case we
* have no choice but to use the list_empty() check and
* require user space behave.
*/
@@ -390,7 +404,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
}
ino->flags |= AUTOFS_INF_PENDING;
spin_unlock(&sbi->fs_lock);
- status = autofs4_mount_wait(dentry);
+ status = autofs4_mount_wait(dentry, 0);
spin_lock(&sbi->fs_lock);
ino->flags &= ~AUTOFS_INF_PENDING;
if (status) {
@@ -419,28 +433,46 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
/* The daemon never waits. */
if (autofs4_oz_mode(sbi)) {
- if (rcu_walk)
- return 0;
if (!d_mountpoint(dentry))
return -EISDIR;
return 0;
}
- /* We need to sleep, so we need pathwalk to be in ref-mode */
- if (rcu_walk)
- return -ECHILD;
-
/* Wait for pending expires */
- do_expire_wait(dentry);
+ if (do_expire_wait(dentry, rcu_walk) == -ECHILD)
+ return -ECHILD;
/*
* This dentry may be under construction so wait on mount
* completion.
*/
- status = autofs4_mount_wait(dentry);
+ status = autofs4_mount_wait(dentry, rcu_walk);
if (status)
return status;
+ if (rcu_walk) {
+ /* We don't need fs_lock in rcu_walk mode,
+ * just testing 'AUTOFS_INFO_NO_RCU' is enough.
+ * simple_empty() takes a spinlock, so leave it
+ * to last.
+ * We only return -EISDIR when certain this isn't
+ * a mount-trap.
+ */
+ struct inode *inode;
+ if (ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU))
+ return 0;
+ if (d_mountpoint(dentry))
+ return 0;
+ inode = ACCESS_ONCE(dentry->d_inode);
+ if (inode && S_ISLNK(inode->i_mode))
+ return -EISDIR;
+ if (list_empty(&dentry->d_subdirs))
+ return 0;
+ if (!simple_empty(dentry))
+ return -EISDIR;
+ return 0;
+ }
+
spin_lock(&sbi->fs_lock);
/*
* If the dentry has been selected for expire while we slept
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 7c93953030fb..afd2b4408adf 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -218,8 +218,9 @@ static int bad_inode_mknod (struct inode *dir, struct dentry *dentry,
return -EIO;
}
-static int bad_inode_rename (struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+static int bad_inode_rename2(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
return -EIO;
}
@@ -279,7 +280,7 @@ static const struct inode_operations bad_inode_ops =
.mkdir = bad_inode_mkdir,
.rmdir = bad_inode_rmdir,
.mknod = bad_inode_mknod,
- .rename = bad_inode_rename,
+ .rename2 = bad_inode_rename2,
.readlink = bad_inode_readlink,
/* follow_link must be no-op, otherwise unmounting this inode
won't work */
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index 9c7faa8a9288..0826e91dacda 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -78,11 +78,11 @@
/*
* In memory structure of each btree node
*/
-typedef struct {
+struct befs_btree_node {
befs_host_btree_nodehead head; /* head of node converted to cpu byteorder */
struct buffer_head *bh;
befs_btree_nodehead *od_node; /* on disk node */
-} befs_btree_node;
+};
/* local constants */
static const befs_off_t befs_bt_inval = 0xffffffffffffffffULL;
@@ -90,27 +90,30 @@ static const befs_off_t befs_bt_inval = 0xffffffffffffffffULL;
/* local functions */
static int befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
befs_btree_super * bt_super,
- befs_btree_node * this_node,
+ struct befs_btree_node *this_node,
befs_off_t * node_off);
static int befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
befs_btree_super * sup);
static int befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
- befs_btree_node * node, befs_off_t node_off);
+ struct befs_btree_node *node,
+ befs_off_t node_off);
-static int befs_leafnode(befs_btree_node * node);
+static int befs_leafnode(struct befs_btree_node *node);
-static fs16 *befs_bt_keylen_index(befs_btree_node * node);
+static fs16 *befs_bt_keylen_index(struct befs_btree_node *node);
-static fs64 *befs_bt_valarray(befs_btree_node * node);
+static fs64 *befs_bt_valarray(struct befs_btree_node *node);
-static char *befs_bt_keydata(befs_btree_node * node);
+static char *befs_bt_keydata(struct befs_btree_node *node);
-static int befs_find_key(struct super_block *sb, befs_btree_node * node,
+static int befs_find_key(struct super_block *sb,
+ struct befs_btree_node *node,
const char *findkey, befs_off_t * value);
-static char *befs_bt_get_key(struct super_block *sb, befs_btree_node * node,
+static char *befs_bt_get_key(struct super_block *sb,
+ struct befs_btree_node *node,
int index, u16 * keylen);
static int befs_compare_strings(const void *key1, int keylen1,
@@ -191,7 +194,7 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
static int
befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
- befs_btree_node * node, befs_off_t node_off)
+ struct befs_btree_node *node, befs_off_t node_off)
{
uint off = 0;
@@ -247,7 +250,7 @@ int
befs_btree_find(struct super_block *sb, befs_data_stream * ds,
const char *key, befs_off_t * value)
{
- befs_btree_node *this_node = NULL;
+ struct befs_btree_node *this_node = NULL;
befs_btree_super bt_super;
befs_off_t node_off;
int res;
@@ -260,11 +263,11 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
goto error;
}
- this_node = kmalloc(sizeof (befs_btree_node),
+ this_node = kmalloc(sizeof(struct befs_btree_node),
GFP_NOFS);
if (!this_node) {
befs_error(sb, "befs_btree_find() failed to allocate %zu "
- "bytes of memory", sizeof (befs_btree_node));
+ "bytes of memory", sizeof(struct befs_btree_node));
goto error;
}
@@ -333,7 +336,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
* Use binary search instead of a linear.
*/
static int
-befs_find_key(struct super_block *sb, befs_btree_node * node,
+befs_find_key(struct super_block *sb, struct befs_btree_node *node,
const char *findkey, befs_off_t * value)
{
int first, last, mid;
@@ -417,7 +420,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
loff_t key_no, size_t bufsize, char *keybuf, size_t * keysize,
befs_off_t * value)
{
- befs_btree_node *this_node;
+ struct befs_btree_node *this_node;
befs_btree_super bt_super;
befs_off_t node_off = 0;
int cur_key;
@@ -436,9 +439,10 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
goto error;
}
- if ((this_node = kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) {
+ this_node = kmalloc(sizeof(struct befs_btree_node), GFP_NOFS);
+ if (this_node == NULL) {
befs_error(sb, "befs_btree_read() failed to allocate %zu "
- "bytes of memory", sizeof (befs_btree_node));
+ "bytes of memory", sizeof(struct befs_btree_node));
goto error;
}
@@ -545,7 +549,8 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
*/
static int
befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
- befs_btree_super * bt_super, befs_btree_node * this_node,
+ befs_btree_super *bt_super,
+ struct befs_btree_node *this_node,
befs_off_t * node_off)
{
@@ -600,7 +605,7 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
* Return 1 if leaf, 0 if interior
*/
static int
-befs_leafnode(befs_btree_node * node)
+befs_leafnode(struct befs_btree_node *node)
{
/* all interior nodes (and only interior nodes) have an overflow node */
if (node->head.overflow == befs_bt_inval)
@@ -623,7 +628,7 @@ befs_leafnode(befs_btree_node * node)
* Except that rounding up to 8 works, and rounding up to 4 doesn't.
*/
static fs16 *
-befs_bt_keylen_index(befs_btree_node * node)
+befs_bt_keylen_index(struct befs_btree_node *node)
{
const int keylen_align = 8;
unsigned long int off =
@@ -644,7 +649,7 @@ befs_bt_keylen_index(befs_btree_node * node)
* of the node pointed to by the node header
*/
static fs64 *
-befs_bt_valarray(befs_btree_node * node)
+befs_bt_valarray(struct befs_btree_node *node)
{
void *keylen_index_start = (void *) befs_bt_keylen_index(node);
size_t keylen_index_size = node->head.all_key_count * sizeof (fs16);
@@ -660,7 +665,7 @@ befs_bt_valarray(befs_btree_node * node)
* of the node pointed to by the node header
*/
static char *
-befs_bt_keydata(befs_btree_node * node)
+befs_bt_keydata(struct befs_btree_node *node)
{
return (char *) ((void *) node->od_node + sizeof (befs_btree_nodehead));
}
@@ -676,7 +681,7 @@ befs_bt_keydata(befs_btree_node * node)
* Returns NULL on failure (bad input) and sets *@keylen = 0
*/
static char *
-befs_bt_get_key(struct super_block *sb, befs_btree_node * node,
+befs_bt_get_key(struct super_block *sb, struct befs_btree_node *node,
int index, u16 * keylen)
{
int prev_key_end;
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 0d6c07cc1149..4cf61ec6b7a8 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -832,16 +832,14 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
(befs_super_block *) ((void *) bh->b_data + x86_sb_off);
}
- if (befs_load_sb(sb, disk_sb) != BEFS_OK)
+ if ((befs_load_sb(sb, disk_sb) != BEFS_OK) ||
+ (befs_check_sb(sb) != BEFS_OK))
goto unacquire_bh;
befs_dump_super_block(sb, disk_sb);
brelse(bh);
- if (befs_check_sb(sb) != BEFS_OK)
- goto unacquire_priv_sbp;
-
if( befs_sb->num_blocks > ~((sector_t)0) ) {
befs_error(sb, "blocks count: %llu "
"is larger than the host can use",
diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h
index f7f87e233dd9..f40006db36df 100644
--- a/fs/bfs/bfs.h
+++ b/fs/bfs/bfs.h
@@ -46,6 +46,7 @@ static inline struct bfs_inode_info *BFS_I(struct inode *inode)
/* inode.c */
extern struct inode *bfs_iget(struct super_block *sb, unsigned long ino);
+extern void bfs_dump_imap(const char *, struct super_block *);
/* file.c */
extern const struct inode_operations bfs_file_inops;
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index a399e6d9dc74..08063ae0a17c 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -75,8 +75,6 @@ const struct file_operations bfs_dir_operations = {
.llseek = generic_file_llseek,
};
-extern void dump_imap(const char *, struct super_block *);
-
static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool excl)
{
@@ -110,7 +108,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
BFS_I(inode)->i_eblock = 0;
insert_inode_hash(inode);
mark_inode_dirty(inode);
- dump_imap("create", s);
+ bfs_dump_imap("create", s);
err = bfs_add_entry(dir, dentry->d_name.name, dentry->d_name.len,
inode->i_ino);
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 7041ac35ace8..90bc079d9982 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -30,8 +30,6 @@ MODULE_LICENSE("GPL");
#define dprintf(x...)
#endif
-void dump_imap(const char *prefix, struct super_block *s);
-
struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
{
struct bfs_inode *di;
@@ -194,7 +192,7 @@ static void bfs_evict_inode(struct inode *inode)
info->si_freeb += bi->i_eblock + 1 - bi->i_sblock;
info->si_freei++;
clear_bit(ino, info->si_imap);
- dump_imap("delete_inode", s);
+ bfs_dump_imap("delete_inode", s);
}
/*
@@ -297,7 +295,7 @@ static const struct super_operations bfs_sops = {
.statfs = bfs_statfs,
};
-void dump_imap(const char *prefix, struct super_block *s)
+void bfs_dump_imap(const char *prefix, struct super_block *s)
{
#ifdef DEBUG
int i;
@@ -443,7 +441,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
}
brelse(bh);
brelse(sbh);
- dump_imap("read_super", s);
+ bfs_dump_imap("read_super", s);
return 0;
out3:
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index ca0ba15a7306..929dec08c348 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -256,11 +256,8 @@ static int load_aout_binary(struct linux_binprm * bprm)
(current->mm->start_brk = N_BSSADDR(ex));
retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
- if (retval < 0) {
- /* Someone check-me: is this error path enough? */
- send_sig(SIGKILL, current, 0);
+ if (retval < 0)
return retval;
- }
install_exec_creds(bprm);
@@ -278,17 +275,13 @@ static int load_aout_binary(struct linux_binprm * bprm)
map_size = ex.a_text+ex.a_data;
#endif
error = vm_brk(text_addr & PAGE_MASK, map_size);
- if (error != (text_addr & PAGE_MASK)) {
- send_sig(SIGKILL, current, 0);
+ if (error != (text_addr & PAGE_MASK))
return error;
- }
error = read_code(bprm->file, text_addr, pos,
ex.a_text+ex.a_data);
- if ((signed long)error < 0) {
- send_sig(SIGKILL, current, 0);
+ if ((signed long)error < 0)
return error;
- }
} else {
if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
(N_MAGIC(ex) != NMAGIC) && printk_ratelimit())
@@ -315,28 +308,22 @@ static int load_aout_binary(struct linux_binprm * bprm)
MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE,
fd_offset);
- if (error != N_TXTADDR(ex)) {
- send_sig(SIGKILL, current, 0);
+ if (error != N_TXTADDR(ex))
return error;
- }
error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE,
fd_offset + ex.a_text);
- if (error != N_DATADDR(ex)) {
- send_sig(SIGKILL, current, 0);
+ if (error != N_DATADDR(ex))
return error;
- }
}
beyond_if:
set_binfmt(&aout_format);
retval = set_brk(current->mm->start_brk, current->mm->brk);
- if (retval < 0) {
- send_sig(SIGKILL, current, 0);
+ if (retval < 0)
return retval;
- }
current->mm->start_stack =
(unsigned long) create_aout_tables((char __user *) bprm->p, bprm);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 3892c1a23241..d8fc0605b9d2 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -738,10 +738,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
change some of these later */
retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
executable_stack);
- if (retval < 0) {
- send_sig(SIGKILL, current, 0);
+ if (retval < 0)
goto out_free_dentry;
- }
current->mm->start_stack = bprm->p;
@@ -763,10 +761,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
and clear the area. */
retval = set_brk(elf_bss + load_bias,
elf_brk + load_bias);
- if (retval) {
- send_sig(SIGKILL, current, 0);
+ if (retval)
goto out_free_dentry;
- }
nbyte = ELF_PAGEOFFSET(elf_bss);
if (nbyte) {
nbyte = ELF_MIN_ALIGN - nbyte;
@@ -820,7 +816,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
elf_prot, elf_flags, 0);
if (BAD_ADDR(error)) {
- send_sig(SIGKILL, current, 0);
retval = IS_ERR((void *)error) ?
PTR_ERR((void*)error) : -EINVAL;
goto out_free_dentry;
@@ -851,7 +846,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
elf_ppnt->p_memsz > TASK_SIZE ||
TASK_SIZE - elf_ppnt->p_memsz < k) {
/* set_brk can never work. Avoid overflows. */
- send_sig(SIGKILL, current, 0);
retval = -EINVAL;
goto out_free_dentry;
}
@@ -883,12 +877,9 @@ static int load_elf_binary(struct linux_binprm *bprm)
* up getting placed where the bss needs to go.
*/
retval = set_brk(elf_bss, elf_brk);
- if (retval) {
- send_sig(SIGKILL, current, 0);
+ if (retval)
goto out_free_dentry;
- }
if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) {
- send_sig(SIGSEGV, current, 0);
retval = -EFAULT; /* Nobody gets to see this, but.. */
goto out_free_dentry;
}
@@ -909,7 +900,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
elf_entry += loc->interp_elf_ex.e_entry;
}
if (BAD_ADDR(elf_entry)) {
- force_sig(SIGSEGV, current);
retval = IS_ERR((void *)elf_entry) ?
(int)elf_entry : -EINVAL;
goto out_free_dentry;
@@ -922,7 +912,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
} else {
elf_entry = loc->elf_ex.e_entry;
if (BAD_ADDR(elf_entry)) {
- force_sig(SIGSEGV, current);
retval = -EINVAL;
goto out_free_dentry;
}
@@ -934,19 +923,15 @@ static int load_elf_binary(struct linux_binprm *bprm)
#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
retval = arch_setup_additional_pages(bprm, !!elf_interpreter);
- if (retval < 0) {
- send_sig(SIGKILL, current, 0);
+ if (retval < 0)
goto out;
- }
#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
install_exec_creds(bprm);
retval = create_elf_tables(bprm, &loc->elf_ex,
load_addr, interp_load_addr);
- if (retval < 0) {
- send_sig(SIGKILL, current, 0);
+ if (retval < 0)
goto out;
- }
/* N.B. passed_fileno might not be initialized? */
current->mm->end_code = end_code;
current->mm->start_code = start_code;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index fe2a643ee005..d3634bfb7fe1 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -317,8 +317,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
goto error;
/* there's now no turning back... the old userspace image is dead,
- * defunct, deceased, etc. after this point we have to exit via
- * error_kill */
+ * defunct, deceased, etc.
+ */
set_personality(PER_LINUX_FDPIC);
if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
current->personality |= READ_IMPLIES_EXEC;
@@ -343,24 +343,22 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
retval = setup_arg_pages(bprm, current->mm->start_stack,
executable_stack);
- if (retval < 0) {
- send_sig(SIGKILL, current, 0);
- goto error_kill;
- }
+ if (retval < 0)
+ goto error;
#endif
/* load the executable and interpreter into memory */
retval = elf_fdpic_map_file(&exec_params, bprm->file, current->mm,
"executable");
if (retval < 0)
- goto error_kill;
+ goto error;
if (interpreter_name) {
retval = elf_fdpic_map_file(&interp_params, interpreter,
current->mm, "interpreter");
if (retval < 0) {
printk(KERN_ERR "Unable to load interpreter\n");
- goto error_kill;
+ goto error;
}
allow_write_access(interpreter);
@@ -397,7 +395,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
if (IS_ERR_VALUE(current->mm->start_brk)) {
retval = current->mm->start_brk;
current->mm->start_brk = 0;
- goto error_kill;
+ goto error;
}
current->mm->brk = current->mm->start_brk;
@@ -410,7 +408,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
install_exec_creds(bprm);
if (create_elf_fdpic_tables(bprm, current->mm,
&exec_params, &interp_params) < 0)
- goto error_kill;
+ goto error;
kdebug("- start_code %lx", current->mm->start_code);
kdebug("- end_code %lx", current->mm->end_code);
@@ -449,12 +447,6 @@ error:
kfree(interp_params.phdrs);
kfree(interp_params.loadmap);
return retval;
-
- /* unrecoverable error - kill the process */
-error_kill:
- send_sig(SIGSEGV, current, 0);
- goto error;
-
}
/*****************************************************************************/
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index b60500300dd7..fd8beb9657a2 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -62,7 +62,22 @@ static struct file_system_type bm_fs_type;
static struct vfsmount *bm_mnt;
static int entry_count;
-/*
+/*
+ * Max length of the register string. Determined by:
+ * - 7 delimiters
+ * - name: ~50 bytes
+ * - type: 1 byte
+ * - offset: 3 bytes (has to be smaller than BINPRM_BUF_SIZE)
+ * - magic: 128 bytes (512 in escaped form)
+ * - mask: 128 bytes (512 in escaped form)
+ * - interp: ~50 bytes
+ * - flags: 5 bytes
+ * Round that up a bit, and then back off to hold the internal data
+ * (like struct Node).
+ */
+#define MAX_REGISTER_LENGTH 1920
+
+/*
* Check if we support the binfmt
* if we do, return the node, else NULL
* locking is done in load_misc_binary
@@ -279,7 +294,7 @@ static Node *create_entry(const char __user *buffer, size_t count)
/* some sanity checks */
err = -EINVAL;
- if ((count < 11) || (count > 256))
+ if ((count < 11) || (count > MAX_REGISTER_LENGTH))
goto out;
err = -ENOMEM;
@@ -396,12 +411,12 @@ static int parse_command(const char __user *buffer, size_t count)
{
char s[4];
- if (!count)
- return 0;
if (count > 3)
return -EINVAL;
if (copy_from_user(s, buffer, count))
return -EFAULT;
+ if (!count)
+ return 0;
if (s[count-1] == '\n')
count--;
if (count == 1 && s[0] == '0')
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6d7274619bf9..1d9c9f3754f8 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -50,32 +50,22 @@ inline struct block_device *I_BDEV(struct inode *inode)
EXPORT_SYMBOL(I_BDEV);
/*
- * Move the inode from its current bdi to a new bdi. If the inode is dirty we
- * need to move it onto the dirty list of @dst so that the inode is always on
- * the right list.
+ * Move the inode from its current bdi to a new bdi. Make sure the inode
+ * is clean before moving so that it doesn't linger on the old bdi.
*/
static void bdev_inode_switch_bdi(struct inode *inode,
struct backing_dev_info *dst)
{
- struct backing_dev_info *old = inode->i_data.backing_dev_info;
- bool wakeup_bdi = false;
-
- if (unlikely(dst == old)) /* deadlock avoidance */
- return;
- bdi_lock_two(&old->wb, &dst->wb);
- spin_lock(&inode->i_lock);
- inode->i_data.backing_dev_info = dst;
- if (inode->i_state & I_DIRTY) {
- if (bdi_cap_writeback_dirty(dst) && !wb_has_dirty_io(&dst->wb))
- wakeup_bdi = true;
- list_move(&inode->i_wb_list, &dst->wb.b_dirty);
+ while (true) {
+ spin_lock(&inode->i_lock);
+ if (!(inode->i_state & I_DIRTY)) {
+ inode->i_data.backing_dev_info = dst;
+ spin_unlock(&inode->i_lock);
+ return;
+ }
+ spin_unlock(&inode->i_lock);
+ WARN_ON_ONCE(write_inode_now(inode, true));
}
- spin_unlock(&inode->i_lock);
- spin_unlock(&old->wb.list_lock);
- spin_unlock(&dst->wb.list_lock);
-
- if (wakeup_bdi)
- bdi_wakeup_thread_delayed(dst);
}
/* Kill _all_ buffers and pagecache , dirty or not.. */
@@ -304,6 +294,12 @@ static int blkdev_readpage(struct file * file, struct page * page)
return block_read_full_page(page, blkdev_get_block);
}
+static int blkdev_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block);
+}
+
static int blkdev_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -1173,8 +1169,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
if (!ret) {
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
bdi = blk_get_backing_dev_info(bdev);
- if (bdi == NULL)
- bdi = &default_backing_dev_info;
bdev_inode_switch_bdi(bdev->bd_inode, bdi);
}
@@ -1591,7 +1585,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
EXPORT_SYMBOL_GPL(blkdev_write_iter);
-static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
+ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *file = iocb->ki_filp;
struct inode *bd_inode = file->f_mapping->host;
@@ -1605,6 +1599,7 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
iov_iter_truncate(to, size);
return generic_file_read_iter(iocb, to);
}
+EXPORT_SYMBOL_GPL(blkdev_read_iter);
/*
* Try to release a page associated with block device when the system
@@ -1622,6 +1617,7 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
static const struct address_space_operations def_blk_aops = {
.readpage = blkdev_readpage,
+ .readpages = blkdev_readpages,
.writepage = blkdev_writepage,
.write_begin = blkdev_write_begin,
.write_end = blkdev_write_end,
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 5a201d81049c..4dabeb893b7c 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -22,7 +22,6 @@
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/freezer.h>
-#include <linux/workqueue.h>
#include "async-thread.h"
#include "ctree.h"
@@ -55,13 +54,45 @@ struct btrfs_workqueue {
struct __btrfs_workqueue *high;
};
-static inline struct __btrfs_workqueue
-*__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
+static void normal_work_helper(struct btrfs_work *work);
+
+#define BTRFS_WORK_HELPER(name) \
+void btrfs_##name(struct work_struct *arg) \
+{ \
+ struct btrfs_work *work = container_of(arg, struct btrfs_work, \
+ normal_work); \
+ normal_work_helper(work); \
+}
+
+BTRFS_WORK_HELPER(worker_helper);
+BTRFS_WORK_HELPER(delalloc_helper);
+BTRFS_WORK_HELPER(flush_delalloc_helper);
+BTRFS_WORK_HELPER(cache_helper);
+BTRFS_WORK_HELPER(submit_helper);
+BTRFS_WORK_HELPER(fixup_helper);
+BTRFS_WORK_HELPER(endio_helper);
+BTRFS_WORK_HELPER(endio_meta_helper);
+BTRFS_WORK_HELPER(endio_meta_write_helper);
+BTRFS_WORK_HELPER(endio_raid56_helper);
+BTRFS_WORK_HELPER(endio_repair_helper);
+BTRFS_WORK_HELPER(rmw_helper);
+BTRFS_WORK_HELPER(endio_write_helper);
+BTRFS_WORK_HELPER(freespace_write_helper);
+BTRFS_WORK_HELPER(delayed_meta_helper);
+BTRFS_WORK_HELPER(readahead_helper);
+BTRFS_WORK_HELPER(qgroup_rescan_helper);
+BTRFS_WORK_HELPER(extent_refs_helper);
+BTRFS_WORK_HELPER(scrub_helper);
+BTRFS_WORK_HELPER(scrubwrc_helper);
+BTRFS_WORK_HELPER(scrubnc_helper);
+
+static struct __btrfs_workqueue *
+__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
int thresh)
{
struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
- if (unlikely(!ret))
+ if (!ret)
return NULL;
ret->max_active = max_active;
@@ -85,7 +116,7 @@ static inline struct __btrfs_workqueue
ret->normal_wq = alloc_workqueue("%s-%s", flags,
ret->max_active, "btrfs",
name);
- if (unlikely(!ret->normal_wq)) {
+ if (!ret->normal_wq) {
kfree(ret);
return NULL;
}
@@ -107,12 +138,12 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
{
struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
- if (unlikely(!ret))
+ if (!ret)
return NULL;
ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
max_active, thresh);
- if (unlikely(!ret->normal)) {
+ if (!ret->normal) {
kfree(ret);
return NULL;
}
@@ -120,7 +151,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
if (flags & WQ_HIGHPRI) {
ret->high = __btrfs_alloc_workqueue(name, flags, max_active,
thresh);
- if (unlikely(!ret->high)) {
+ if (!ret->high) {
__btrfs_destroy_workqueue(ret->normal);
kfree(ret);
return NULL;
@@ -232,13 +263,11 @@ static void run_ordered_work(struct __btrfs_workqueue *wq)
spin_unlock_irqrestore(lock, flags);
}
-static void normal_work_helper(struct work_struct *arg)
+static void normal_work_helper(struct btrfs_work *work)
{
- struct btrfs_work *work;
struct __btrfs_workqueue *wq;
int need_order = 0;
- work = container_of(arg, struct btrfs_work, normal_work);
/*
* We should not touch things inside work in the following cases:
* 1) after work->func() if it has no ordered_free
@@ -262,7 +291,7 @@ static void normal_work_helper(struct work_struct *arg)
trace_btrfs_all_work_done(work);
}
-void btrfs_init_work(struct btrfs_work *work,
+void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func,
btrfs_func_t func,
btrfs_func_t ordered_func,
btrfs_func_t ordered_free)
@@ -270,7 +299,7 @@ void btrfs_init_work(struct btrfs_work *work,
work->func = func;
work->ordered_func = ordered_func;
work->ordered_free = ordered_free;
- INIT_WORK(&work->normal_work, normal_work_helper);
+ INIT_WORK(&work->normal_work, uniq_func);
INIT_LIST_HEAD(&work->ordered_list);
work->flags = 0;
}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 9c6b66d15fb0..e386c29ef1f6 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -19,12 +19,14 @@
#ifndef __BTRFS_ASYNC_THREAD_
#define __BTRFS_ASYNC_THREAD_
+#include <linux/workqueue.h>
struct btrfs_workqueue;
/* Internal use only */
struct __btrfs_workqueue;
struct btrfs_work;
typedef void (*btrfs_func_t)(struct btrfs_work *arg);
+typedef void (*btrfs_work_func_t)(struct work_struct *arg);
struct btrfs_work {
btrfs_func_t func;
@@ -38,11 +40,36 @@ struct btrfs_work {
unsigned long flags;
};
+#define BTRFS_WORK_HELPER_PROTO(name) \
+void btrfs_##name(struct work_struct *arg)
+
+BTRFS_WORK_HELPER_PROTO(worker_helper);
+BTRFS_WORK_HELPER_PROTO(delalloc_helper);
+BTRFS_WORK_HELPER_PROTO(flush_delalloc_helper);
+BTRFS_WORK_HELPER_PROTO(cache_helper);
+BTRFS_WORK_HELPER_PROTO(submit_helper);
+BTRFS_WORK_HELPER_PROTO(fixup_helper);
+BTRFS_WORK_HELPER_PROTO(endio_helper);
+BTRFS_WORK_HELPER_PROTO(endio_meta_helper);
+BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper);
+BTRFS_WORK_HELPER_PROTO(endio_raid56_helper);
+BTRFS_WORK_HELPER_PROTO(endio_repair_helper);
+BTRFS_WORK_HELPER_PROTO(rmw_helper);
+BTRFS_WORK_HELPER_PROTO(endio_write_helper);
+BTRFS_WORK_HELPER_PROTO(freespace_write_helper);
+BTRFS_WORK_HELPER_PROTO(delayed_meta_helper);
+BTRFS_WORK_HELPER_PROTO(readahead_helper);
+BTRFS_WORK_HELPER_PROTO(qgroup_rescan_helper);
+BTRFS_WORK_HELPER_PROTO(extent_refs_helper);
+BTRFS_WORK_HELPER_PROTO(scrub_helper);
+BTRFS_WORK_HELPER_PROTO(scrubwrc_helper);
+BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
+
struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
int flags,
int max_active,
int thresh);
-void btrfs_init_work(struct btrfs_work *work,
+void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper,
btrfs_func_t func,
btrfs_func_t ordered_func,
btrfs_func_t ordered_free);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index e25564bfcb46..2d3e32ebfd15 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -25,6 +25,9 @@
#include "delayed-ref.h"
#include "locking.h"
+/* Just an arbitrary number so we can be sure this happened */
+#define BACKREF_FOUND_SHARED 6
+
struct extent_inode_elem {
u64 inum;
u64 offset;
@@ -276,9 +279,8 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
}
if (ret > 0)
goto next;
- ret = ulist_add_merge(parents, eb->start,
- (uintptr_t)eie,
- (u64 *)&old, GFP_NOFS);
+ ret = ulist_add_merge_ptr(parents, eb->start,
+ eie, (void **)&old, GFP_NOFS);
if (ret < 0)
break;
if (!ret && extent_item_pos) {
@@ -378,7 +380,8 @@ out:
static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 time_seq,
struct list_head *head,
- const u64 *extent_item_pos, u64 total_refs)
+ const u64 *extent_item_pos, u64 total_refs,
+ u64 root_objectid)
{
int err;
int ret = 0;
@@ -403,6 +406,10 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
continue;
if (ref->count == 0)
continue;
+ if (root_objectid && ref->root_id != root_objectid) {
+ ret = BACKREF_FOUND_SHARED;
+ goto out;
+ }
err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
parents, extent_item_pos,
total_refs);
@@ -483,7 +490,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
continue;
BUG_ON(!ref->wanted_disk_byte);
eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
- fs_info->tree_root->leafsize, 0);
+ 0);
if (!eb || !extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
@@ -562,7 +569,8 @@ static void __merge_refs(struct list_head *head, int mode)
* smaller or equal that seq to the list
*/
static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
- struct list_head *prefs, u64 *total_refs)
+ struct list_head *prefs, u64 *total_refs,
+ u64 inum)
{
struct btrfs_delayed_extent_op *extent_op = head->extent_op;
struct rb_node *n = &head->node.rb_node;
@@ -626,6 +634,16 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
key.objectid = ref->objectid;
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = ref->offset;
+
+ /*
+ * Found a inum that doesn't match our known inum, we
+ * know it's shared.
+ */
+ if (inum && ref->objectid != inum) {
+ ret = BACKREF_FOUND_SHARED;
+ break;
+ }
+
ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
node->bytenr,
node->ref_mod * sgn, GFP_ATOMIC);
@@ -660,7 +678,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
static int __add_inline_refs(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 bytenr,
int *info_level, struct list_head *prefs,
- u64 *total_refs)
+ u64 *total_refs, u64 inum)
{
int ret = 0;
int slot;
@@ -745,6 +763,12 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
dref);
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+
+ if (inum && key.objectid != inum) {
+ ret = BACKREF_FOUND_SHARED;
+ break;
+ }
+
root = btrfs_extent_data_ref_root(leaf, dref);
ret = __add_prelim_ref(prefs, root, &key, 0, 0,
bytenr, count, GFP_NOFS);
@@ -766,7 +790,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
*/
static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 bytenr,
- int info_level, struct list_head *prefs)
+ int info_level, struct list_head *prefs, u64 inum)
{
struct btrfs_root *extent_root = fs_info->extent_root;
int ret;
@@ -828,6 +852,12 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
dref);
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+
+ if (inum && key.objectid != inum) {
+ ret = BACKREF_FOUND_SHARED;
+ break;
+ }
+
root = btrfs_extent_data_ref_root(leaf, dref);
ret = __add_prelim_ref(prefs, root, &key, 0, 0,
bytenr, count, GFP_NOFS);
@@ -855,7 +885,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
static int find_parent_nodes(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 time_seq, struct ulist *refs,
- struct ulist *roots, const u64 *extent_item_pos)
+ struct ulist *roots, const u64 *extent_item_pos,
+ u64 root_objectid, u64 inum)
{
struct btrfs_key key;
struct btrfs_path *path;
@@ -930,7 +961,8 @@ again:
}
spin_unlock(&delayed_refs->lock);
ret = __add_delayed_refs(head, time_seq,
- &prefs_delayed, &total_refs);
+ &prefs_delayed, &total_refs,
+ inum);
mutex_unlock(&head->mutex);
if (ret)
goto out;
@@ -952,11 +984,11 @@ again:
key.type == BTRFS_METADATA_ITEM_KEY)) {
ret = __add_inline_refs(fs_info, path, bytenr,
&info_level, &prefs,
- &total_refs);
+ &total_refs, inum);
if (ret)
goto out;
ret = __add_keyed_refs(fs_info, path, bytenr,
- info_level, &prefs);
+ info_level, &prefs, inum);
if (ret)
goto out;
}
@@ -972,7 +1004,8 @@ again:
__merge_refs(&prefs, 1);
ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
- extent_item_pos, total_refs);
+ extent_item_pos, total_refs,
+ root_objectid);
if (ret)
goto out;
@@ -982,6 +1015,11 @@ again:
ref = list_first_entry(&prefs, struct __prelim_ref, list);
WARN_ON(ref->count < 0);
if (roots && ref->count && ref->root_id && ref->parent == 0) {
+ if (root_objectid && ref->root_id != root_objectid) {
+ ret = BACKREF_FOUND_SHARED;
+ goto out;
+ }
+
/* no parent == root of tree */
ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
if (ret < 0)
@@ -990,27 +1028,28 @@ again:
if (ref->count && ref->parent) {
if (extent_item_pos && !ref->inode_list &&
ref->level == 0) {
- u32 bsz;
struct extent_buffer *eb;
- bsz = btrfs_level_size(fs_info->extent_root,
- ref->level);
+
eb = read_tree_block(fs_info->extent_root,
- ref->parent, bsz, 0);
+ ref->parent, 0);
if (!eb || !extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
ret = -EIO;
goto out;
}
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
ret = find_extent_in_eb(eb, bytenr,
*extent_item_pos, &eie);
+ btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
if (ret < 0)
goto out;
ref->inode_list = eie;
}
- ret = ulist_add_merge(refs, ref->parent,
- (uintptr_t)ref->inode_list,
- (u64 *)&eie, GFP_NOFS);
+ ret = ulist_add_merge_ptr(refs, ref->parent,
+ ref->inode_list,
+ (void **)&eie, GFP_NOFS);
if (ret < 0)
goto out;
if (!ret && extent_item_pos) {
@@ -1085,7 +1124,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
return -ENOMEM;
ret = find_parent_nodes(trans, fs_info, bytenr,
- time_seq, *leafs, NULL, extent_item_pos);
+ time_seq, *leafs, NULL, extent_item_pos, 0, 0);
if (ret < 0 && ret != -ENOENT) {
free_leaf_list(*leafs);
return ret;
@@ -1128,7 +1167,7 @@ static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans,
ULIST_ITER_INIT(&uiter);
while (1) {
ret = find_parent_nodes(trans, fs_info, bytenr,
- time_seq, tmp, *roots, NULL);
+ time_seq, tmp, *roots, NULL, 0, 0);
if (ret < 0 && ret != -ENOENT) {
ulist_free(tmp);
ulist_free(*roots);
@@ -1159,6 +1198,54 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
return ret;
}
+int btrfs_check_shared(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 root_objectid,
+ u64 inum, u64 bytenr)
+{
+ struct ulist *tmp = NULL;
+ struct ulist *roots = NULL;
+ struct ulist_iterator uiter;
+ struct ulist_node *node;
+ struct seq_list elem = {};
+ int ret = 0;
+
+ tmp = ulist_alloc(GFP_NOFS);
+ roots = ulist_alloc(GFP_NOFS);
+ if (!tmp || !roots) {
+ ulist_free(tmp);
+ ulist_free(roots);
+ return -ENOMEM;
+ }
+
+ if (trans)
+ btrfs_get_tree_mod_seq(fs_info, &elem);
+ else
+ down_read(&fs_info->commit_root_sem);
+ ULIST_ITER_INIT(&uiter);
+ while (1) {
+ ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
+ roots, NULL, root_objectid, inum);
+ if (ret == BACKREF_FOUND_SHARED) {
+ ret = 1;
+ break;
+ }
+ if (ret < 0 && ret != -ENOENT)
+ break;
+ node = ulist_next(tmp, &uiter);
+ if (!node)
+ break;
+ bytenr = node->val;
+ cond_resched();
+ }
+ if (trans)
+ btrfs_put_tree_mod_seq(fs_info, &elem);
+ else
+ up_read(&fs_info->commit_root_sem);
+ ulist_free(tmp);
+ ulist_free(roots);
+ return ret;
+}
+
/*
* this makes the path point to (inum INODE_ITEM ioff)
*/
@@ -1191,7 +1278,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
unsigned long ptr;
key.objectid = inode_objectid;
- btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+ key.type = BTRFS_INODE_EXTREF_KEY;
key.offset = start_off;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -1231,7 +1318,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
ret = -ENOENT;
if (found_key.objectid != inode_objectid)
break;
- if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY)
+ if (found_key.type != BTRFS_INODE_EXTREF_KEY)
break;
ret = 0;
@@ -1364,7 +1451,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
}
btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
if (found_key->type == BTRFS_METADATA_ITEM_KEY)
- size = fs_info->extent_root->leafsize;
+ size = fs_info->extent_root->nodesize;
else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
size = found_key->offset;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 86fc20fec282..2a1ac6bfc724 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -71,6 +71,9 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
u64 start_off, struct btrfs_path *path,
struct btrfs_inode_extref **ret_extref,
u64 *found_off);
+int btrfs_check_shared(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 root_objectid,
+ u64 inum, u64 bytenr);
int __init btrfs_prelim_ref_init(void);
void btrfs_prelim_ref_exit(void);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 4794923c410c..4aadadcfab20 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -44,6 +44,17 @@
#define BTRFS_INODE_IN_DELALLOC_LIST 9
#define BTRFS_INODE_READDIO_NEED_LOCK 10
#define BTRFS_INODE_HAS_PROPS 11
+/*
+ * The following 3 bits are meant only for the btree inode.
+ * When any of them is set, it means an error happened while writing an
+ * extent buffer belonging to:
+ * 1) a non-log btree
+ * 2) a log btree and first log sub-transaction
+ * 3) a log btree and second log sub-transaction
+ */
+#define BTRFS_INODE_BTREE_ERR 12
+#define BTRFS_INODE_BTREE_LOG1_ERR 13
+#define BTRFS_INODE_BTREE_LOG2_ERR 14
/* in memory btrfs inode */
struct btrfs_inode {
@@ -84,12 +95,6 @@ struct btrfs_inode {
*/
struct list_head delalloc_inodes;
- /*
- * list for tracking inodes that must be sent to disk before a
- * rename or truncate commit
- */
- struct list_head ordered_operations;
-
/* node for the red-black tree that links inodes in subvolume root */
struct rb_node rb_node;
@@ -127,6 +132,12 @@ struct btrfs_inode {
u64 delalloc_bytes;
/*
+ * total number of bytes pending defrag, used by stat to check whether
+ * it needs COW.
+ */
+ u64 defrag_bytes;
+
+ /*
* the size of the file stored in the metadata on disk. data=ordered
* means the in-memory i_size might be larger than the size on disk
* because not all the blocks are written yet.
@@ -240,13 +251,25 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
BTRFS_I(inode)->last_sub_trans <=
BTRFS_I(inode)->last_log_commit &&
BTRFS_I(inode)->last_sub_trans <=
- BTRFS_I(inode)->root->last_log_commit)
- return 1;
+ BTRFS_I(inode)->root->last_log_commit) {
+ /*
+ * After a ranged fsync we might have left some extent maps
+ * (that fall outside the fsync's range). So return false
+ * here if the list isn't empty, to make sure btrfs_log_inode()
+ * will be called and process those extent maps.
+ */
+ smp_mb();
+ if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents))
+ return 1;
+ }
return 0;
}
+#define BTRFS_DIO_ORIG_BIO_SUBMITTED 0x1
+
struct btrfs_dio_private {
struct inode *inode;
+ unsigned long flags;
u64 logical_offset;
u64 disk_bytenr;
u64 bytes;
@@ -263,7 +286,12 @@ struct btrfs_dio_private {
/* dio_bio came from fs/direct-io.c */
struct bio *dio_bio;
- u8 csum[0];
+
+ /*
+ * The original bio may be splited to several sub-bios, this is
+ * done during endio of sub-bios
+ */
+ int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int);
};
/*
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index ce92ae30250f..cb7f3fe9c9f6 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -807,7 +807,7 @@ static int btrfsic_process_superblock_dev_mirror(
/* super block bytenr is always the unmapped device bytenr */
dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
- if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
+ if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes)
return -1;
bh = __bread(superblock_bdev, dev_bytenr / 4096,
BTRFS_SUPER_INFO_SIZE);
@@ -820,7 +820,6 @@ static int btrfsic_process_superblock_dev_mirror(
btrfs_super_magic(super_tmp) != BTRFS_MAGIC ||
memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
btrfs_super_nodesize(super_tmp) != state->metablock_size ||
- btrfs_super_leafsize(super_tmp) != state->metablock_size ||
btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
brelse(bh);
return 0;
@@ -1252,8 +1251,7 @@ static void btrfsic_read_from_block_data(
while (len > 0) {
cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page));
- BUG_ON(i >= (block_ctx->len + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT);
+ BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_CACHE_SIZE));
kaddr = block_ctx->datav[i];
memcpy(dst, kaddr + offset_in_page, cur);
@@ -3120,24 +3118,12 @@ int btrfsic_mount(struct btrfs_root *root,
struct list_head *dev_head = &fs_devices->devices;
struct btrfs_device *device;
- if (root->nodesize != root->leafsize) {
- printk(KERN_INFO
- "btrfsic: cannot handle nodesize %d != leafsize %d!\n",
- root->nodesize, root->leafsize);
- return -1;
- }
if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) {
printk(KERN_INFO
"btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
root->nodesize, PAGE_CACHE_SIZE);
return -1;
}
- if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) {
- printk(KERN_INFO
- "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
- root->leafsize, PAGE_CACHE_SIZE);
- return -1;
- }
if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) {
printk(KERN_INFO
"btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 1daea0b47187..d3220d31d3cb 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -91,8 +91,7 @@ static inline int compressed_bio_size(struct btrfs_root *root,
u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
return sizeof(struct compressed_bio) +
- ((disk_size + root->sectorsize - 1) / root->sectorsize) *
- csum_size;
+ (DIV_ROUND_UP(disk_size, root->sectorsize)) * csum_size;
}
static struct bio *compressed_bio_alloc(struct block_device *bdev,
@@ -389,7 +388,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
* freed before we're done setting it up
*/
atomic_inc(&cb->pending_bios);
- ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+ BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
if (!skip_sum) {
@@ -420,7 +420,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
}
bio_get(bio);
- ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
if (!skip_sum) {
@@ -615,8 +615,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
cb->compress_type = extent_compress_type(bio_flags);
cb->orig_bio = bio;
- nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
- PAGE_CACHE_SIZE;
+ nr_pages = DIV_ROUND_UP(compressed_len, PAGE_CACHE_SIZE);
cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages,
GFP_NOFS);
if (!cb->compressed_pages)
@@ -670,7 +669,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
PAGE_CACHE_SIZE) {
bio_get(comp_bio);
- ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+ ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio,
+ BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
/*
@@ -686,8 +686,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
comp_bio, sums);
BUG_ON(ret); /* -ENOMEM */
}
- sums += (comp_bio->bi_iter.bi_size +
- root->sectorsize - 1) / root->sectorsize;
+ sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
+ root->sectorsize);
ret = btrfs_map_bio(root, READ, comp_bio,
mirror_num, 0);
@@ -708,7 +708,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
}
bio_get(comp_bio);
- ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+ ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio,
+ BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index aeab453b8e24..19bc6162fb8e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -258,9 +258,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
else
btrfs_node_key(buf, &disk_key, 0);
- cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
- new_root_objectid, &disk_key, level,
- buf->start, 0);
+ cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
+ &disk_key, level, buf->start, 0);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -280,9 +279,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
WARN_ON(btrfs_header_generation(buf) > trans->transid);
if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
- ret = btrfs_inc_ref(trans, root, cow, 1, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 1);
else
- ret = btrfs_inc_ref(trans, root, cow, 0, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 0);
if (ret)
return ret;
@@ -1035,14 +1034,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if ((owner == root->root_key.objectid ||
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
- ret = btrfs_inc_ref(trans, root, buf, 1, 1);
+ ret = btrfs_inc_ref(trans, root, buf, 1);
BUG_ON(ret); /* -ENOMEM */
if (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID) {
- ret = btrfs_dec_ref(trans, root, buf, 0, 1);
+ ret = btrfs_dec_ref(trans, root, buf, 0);
BUG_ON(ret); /* -ENOMEM */
- ret = btrfs_inc_ref(trans, root, cow, 1, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 1);
BUG_ON(ret); /* -ENOMEM */
}
new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -1050,9 +1049,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID)
- ret = btrfs_inc_ref(trans, root, cow, 1, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 1);
else
- ret = btrfs_inc_ref(trans, root, cow, 0, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 0);
BUG_ON(ret); /* -ENOMEM */
}
if (new_flags != 0) {
@@ -1069,11 +1068,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
if (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID)
- ret = btrfs_inc_ref(trans, root, cow, 1, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 1);
else
- ret = btrfs_inc_ref(trans, root, cow, 0, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 0);
BUG_ON(ret); /* -ENOMEM */
- ret = btrfs_dec_ref(trans, root, buf, 1, 1);
+ ret = btrfs_dec_ref(trans, root, buf, 1);
BUG_ON(ret); /* -ENOMEM */
}
clean_tree_block(trans, root, buf);
@@ -1133,9 +1132,9 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
} else
parent_start = 0;
- cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
- root->root_key.objectid, &disk_key,
- level, search_start, empty_size);
+ cow = btrfs_alloc_tree_block(trans, root, parent_start,
+ root->root_key.objectid, &disk_key, level,
+ search_start, empty_size);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -1425,7 +1424,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
struct tree_mod_root *old_root = NULL;
u64 old_generation = 0;
u64 logical;
- u32 blocksize;
eb_root = btrfs_read_lock_root_node(root);
tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
@@ -1444,8 +1442,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
- blocksize = btrfs_level_size(root, old_root->level);
- old = read_tree_block(root, logical, blocksize, 0);
+ old = read_tree_block(root, logical, 0);
if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
free_extent_buffer(old);
btrfs_warn(root->fs_info,
@@ -1506,10 +1503,9 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf)
{
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ if (btrfs_test_is_dummy_root(root))
return 0;
-#endif
+
/* ensure we can see the force_cow */
smp_rmb();
@@ -1651,7 +1647,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
WARN_ON(trans->transid != root->fs_info->generation);
parent_nritems = btrfs_header_nritems(parent);
- blocksize = btrfs_level_size(root, parent_level - 1);
+ blocksize = root->nodesize;
end_slot = parent_nritems;
if (parent_nritems == 1)
@@ -1685,15 +1681,14 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
continue;
}
- cur = btrfs_find_tree_block(root, blocknr, blocksize);
+ cur = btrfs_find_tree_block(root, blocknr);
if (cur)
uptodate = btrfs_buffer_uptodate(cur, gen, 0);
else
uptodate = 0;
if (!cur || !uptodate) {
if (!cur) {
- cur = read_tree_block(root, blocknr,
- blocksize, gen);
+ cur = read_tree_block(root, blocknr, gen);
if (!cur || !extent_buffer_uptodate(cur)) {
free_extent_buffer(cur);
return -EIO;
@@ -1872,7 +1867,6 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
BUG_ON(level == 0);
eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
- btrfs_level_size(root, level - 1),
btrfs_node_ptr_generation(parent, slot));
if (eb && !extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
@@ -2267,8 +2261,8 @@ static void reada_for_search(struct btrfs_root *root,
node = path->nodes[level];
search = btrfs_node_blockptr(node, slot);
- blocksize = btrfs_level_size(root, level - 1);
- eb = btrfs_find_tree_block(root, search, blocksize);
+ blocksize = root->nodesize;
+ eb = btrfs_find_tree_block(root, search);
if (eb) {
free_extent_buffer(eb);
return;
@@ -2298,7 +2292,7 @@ static void reada_for_search(struct btrfs_root *root,
if ((search <= target && target - search <= 65536) ||
(search > target && search - target <= 65536)) {
gen = btrfs_node_ptr_generation(node, nr);
- readahead_tree_block(root, search, blocksize, gen);
+ readahead_tree_block(root, search, blocksize);
nread += blocksize;
}
nscan++;
@@ -2325,12 +2319,12 @@ static noinline void reada_for_balance(struct btrfs_root *root,
nritems = btrfs_header_nritems(parent);
slot = path->slots[level + 1];
- blocksize = btrfs_level_size(root, level);
+ blocksize = root->nodesize;
if (slot > 0) {
block1 = btrfs_node_blockptr(parent, slot - 1);
gen = btrfs_node_ptr_generation(parent, slot - 1);
- eb = btrfs_find_tree_block(root, block1, blocksize);
+ eb = btrfs_find_tree_block(root, block1);
/*
* if we get -eagain from btrfs_buffer_uptodate, we
* don't want to return eagain here. That will loop
@@ -2343,16 +2337,16 @@ static noinline void reada_for_balance(struct btrfs_root *root,
if (slot + 1 < nritems) {
block2 = btrfs_node_blockptr(parent, slot + 1);
gen = btrfs_node_ptr_generation(parent, slot + 1);
- eb = btrfs_find_tree_block(root, block2, blocksize);
+ eb = btrfs_find_tree_block(root, block2);
if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
block2 = 0;
free_extent_buffer(eb);
}
if (block1)
- readahead_tree_block(root, block1, blocksize, 0);
+ readahead_tree_block(root, block1, blocksize);
if (block2)
- readahead_tree_block(root, block2, blocksize, 0);
+ readahead_tree_block(root, block2, blocksize);
}
@@ -2454,16 +2448,14 @@ read_block_for_search(struct btrfs_trans_handle *trans,
{
u64 blocknr;
u64 gen;
- u32 blocksize;
struct extent_buffer *b = *eb_ret;
struct extent_buffer *tmp;
int ret;
blocknr = btrfs_node_blockptr(b, slot);
gen = btrfs_node_ptr_generation(b, slot);
- blocksize = btrfs_level_size(root, level - 1);
- tmp = btrfs_find_tree_block(root, blocknr, blocksize);
+ tmp = btrfs_find_tree_block(root, blocknr);
if (tmp) {
/* first we do an atomic uptodate check */
if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
@@ -2507,7 +2499,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
btrfs_release_path(p);
ret = -EAGAIN;
- tmp = read_tree_block(root, blocknr, blocksize, 0);
+ tmp = read_tree_block(root, blocknr, 0);
if (tmp) {
/*
* If the read above didn't mark this buffer up to date,
@@ -2792,8 +2784,6 @@ again:
if (!should_cow_block(trans, root, b))
goto cow_done;
- btrfs_set_path_blocking(p);
-
/*
* must have write locks on this node and the
* parent
@@ -2807,6 +2797,7 @@ again:
goto again;
}
+ btrfs_set_path_blocking(p);
err = btrfs_cow_block(trans, root, b,
p->nodes[level + 1],
p->slots[level + 1], &b);
@@ -3362,9 +3353,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
else
btrfs_node_key(lower, &lower_key, 0);
- c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
- root->root_key.objectid, &lower_key,
- level, root->node->start, 0);
+ c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &lower_key, level, root->node->start, 0);
if (IS_ERR(c))
return PTR_ERR(c);
@@ -3502,9 +3492,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
mid = (c_nritems + 1) / 2;
btrfs_node_key(c, &disk_key, mid);
- split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
- root->root_key.objectid,
- &disk_key, level, c->start, 0);
+ split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &disk_key, level, c->start, 0);
if (IS_ERR(split))
return PTR_ERR(split);
@@ -4282,13 +4271,12 @@ again:
else
btrfs_item_key(l, &disk_key, mid);
- right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
- root->root_key.objectid,
- &disk_key, 0, l->start, 0);
+ right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &disk_key, 0, l->start, 0);
if (IS_ERR(right))
return PTR_ERR(right);
- root_add_used(root, root->leafsize);
+ root_add_used(root, root->nodesize);
memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
btrfs_set_header_bytenr(right, right->start);
@@ -4626,8 +4614,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
ptr = btrfs_item_ptr_offset(leaf, slot);
memmove_extent_buffer(leaf, ptr,
(unsigned long)fi,
- offsetof(struct btrfs_file_extent_item,
- disk_bytenr));
+ BTRFS_FILE_EXTENT_INLINE_DATA_START);
}
}
@@ -4738,6 +4725,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
int slot;
struct btrfs_map_token token;
+ if (path->slots[0] == 0) {
+ btrfs_cpu_key_to_disk(&disk_key, cpu_key);
+ fixup_low_keys(root, path, &disk_key, 1);
+ }
+ btrfs_unlock_up_safe(path, 1);
+
btrfs_init_map_token(&token);
leaf = path->nodes[0];
@@ -4798,12 +4791,6 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
}
btrfs_set_header_nritems(leaf, nritems + nr);
-
- if (slot == 0) {
- btrfs_cpu_key_to_disk(&disk_key, cpu_key);
- fixup_low_keys(root, path, &disk_key, 1);
- }
- btrfs_unlock_up_safe(path, 1);
btrfs_mark_buffer_dirty(leaf);
if (btrfs_leaf_free_space(root, leaf) < 0) {
@@ -5145,8 +5132,9 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
u32 nritems;
int level;
int ret = 1;
+ int keep_locks = path->keep_locks;
- WARN_ON(!path->keep_locks);
+ path->keep_locks = 1;
again:
cur = btrfs_read_lock_root_node(root);
level = btrfs_header_level(cur);
@@ -5210,7 +5198,6 @@ find_next_key:
path->slots[level] = slot;
if (level == path->lowest_level) {
ret = 0;
- unlock_up(path, level, 1, 0, NULL);
goto out;
}
btrfs_set_path_blocking(path);
@@ -5225,9 +5212,12 @@ find_next_key:
btrfs_clear_path_blocking(path, NULL, 0);
}
out:
- if (ret == 0)
+ path->keep_locks = keep_locks;
+ if (ret == 0) {
+ btrfs_unlock_up_safe(path, path->lowest_level + 1);
+ btrfs_set_path_blocking(path);
memcpy(min_key, &found_key, sizeof(found_key));
- btrfs_set_path_blocking(path);
+ }
return ret;
}
@@ -5375,7 +5365,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
goto out;
}
- tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS);
+ tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS);
if (!tmp_buf) {
ret = -ENOMEM;
goto out;
@@ -5520,18 +5510,18 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
goto out;
advance_right = ADVANCE;
} else {
- enum btrfs_compare_tree_result cmp;
+ enum btrfs_compare_tree_result result;
WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
ret = tree_compare_item(left_root, left_path,
right_path, tmp_buf);
if (ret)
- cmp = BTRFS_COMPARE_TREE_CHANGED;
+ result = BTRFS_COMPARE_TREE_CHANGED;
else
- cmp = BTRFS_COMPARE_TREE_SAME;
+ result = BTRFS_COMPARE_TREE_SAME;
ret = changed_cb(left_root, right_root,
left_path, right_path,
- &left_key, cmp, ctx);
+ &left_key, result, ctx);
if (ret < 0)
goto out;
advance_left = ADVANCE;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index be91397f4e92..fe69edda11fb 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -34,6 +34,7 @@
#include <linux/pagemap.h>
#include <linux/btrfs.h>
#include <linux/workqueue.h>
+#include <linux/security.h>
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
@@ -62,13 +63,6 @@ struct btrfs_ordered_sum;
#define BTRFS_COMPAT_EXTENT_TREE_V0
-/*
- * files bigger than this get some pre-flushing when they are added
- * to the ordered operations list. That way we limit the total
- * work done by the commit
- */
-#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
-
/* holds pointers to all of the tree roots */
#define BTRFS_ROOT_TREE_OBJECTID 1ULL
@@ -391,10 +385,12 @@ struct btrfs_header {
sizeof(struct btrfs_header)) / \
sizeof(struct btrfs_key_ptr))
#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
-#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize))
+#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->nodesize))
+#define BTRFS_FILE_EXTENT_INLINE_DATA_START \
+ (offsetof(struct btrfs_file_extent_item, disk_bytenr))
#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
sizeof(struct btrfs_item) - \
- sizeof(struct btrfs_file_extent_item))
+ BTRFS_FILE_EXTENT_INLINE_DATA_START)
#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
sizeof(struct btrfs_item) -\
sizeof(struct btrfs_dir_item))
@@ -474,7 +470,7 @@ struct btrfs_super_block {
__le64 num_devices;
__le32 sectorsize;
__le32 nodesize;
- __le32 leafsize;
+ __le32 __unused_leafsize;
__le32 stripesize;
__le32 sys_chunk_array_size;
__le64 chunk_root_generation;
@@ -903,6 +899,8 @@ struct btrfs_file_extent_item {
/*
* disk space consumed by the extent, checksum blocks are included
* in these numbers
+ *
+ * At this offset in the structure, the inline extent data start.
*/
__le64 disk_bytenr;
__le64 disk_num_bytes;
@@ -1305,8 +1303,8 @@ struct btrfs_block_group_cache {
*/
struct list_head cluster_list;
- /* For delayed block group creation */
- struct list_head new_bg_list;
+ /* For delayed block group creation or deletion of empty block groups */
+ struct list_head bg_list;
};
/* delayed seq elem */
@@ -1545,6 +1543,7 @@ struct btrfs_fs_info {
struct btrfs_workqueue *endio_workers;
struct btrfs_workqueue *endio_meta_workers;
struct btrfs_workqueue *endio_raid56_workers;
+ struct btrfs_workqueue *endio_repair_workers;
struct btrfs_workqueue *rmw_workers;
struct btrfs_workqueue *endio_meta_write_workers;
struct btrfs_workqueue *endio_write_workers;
@@ -1574,6 +1573,7 @@ struct btrfs_fs_info {
int do_barriers;
int closing;
int log_root_recovering;
+ int open;
u64 total_pinned;
@@ -1723,6 +1723,12 @@ struct btrfs_fs_info {
/* Used to reclaim the metadata space in the background. */
struct work_struct async_reclaim_work;
+
+ spinlock_t unused_bgs_lock;
+ struct list_head unused_bgs;
+
+ /* For btrfs to record security options */
+ struct security_mnt_opts security_opts;
};
struct btrfs_subvolume_writers {
@@ -1776,12 +1782,12 @@ struct btrfs_root {
/* free ino cache stuff */
struct btrfs_free_space_ctl *free_ino_ctl;
- enum btrfs_caching_type cached;
- spinlock_t cache_lock;
- wait_queue_head_t cache_wait;
+ enum btrfs_caching_type ino_cache_state;
+ spinlock_t ino_cache_lock;
+ wait_queue_head_t ino_cache_wait;
struct btrfs_free_space_ctl *free_ino_pinned;
- u64 cache_progress;
- struct inode *cache_inode;
+ u64 ino_cache_progress;
+ struct inode *ino_cache_inode;
struct mutex log_mutex;
wait_queue_head_t log_writer_wait;
@@ -1806,18 +1812,14 @@ struct btrfs_root {
/* node allocations are done in nodesize units */
u32 nodesize;
- /* leaf allocations are done in leafsize units */
- u32 leafsize;
-
u32 stripesize;
u32 type;
u64 highest_objectid;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ /* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */
u64 alloc_bytenr;
-#endif
u64 defrag_trans_start;
struct btrfs_key defrag_progress;
@@ -2094,6 +2096,7 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24)
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
+#define BTRFS_DEFAULT_MAX_INLINE (8192)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -2995,8 +2998,6 @@ BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
sectorsize, 32);
BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
nodesize, 32);
-BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block,
- leafsize, 32);
BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
stripesize, 32);
BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
@@ -3049,14 +3050,12 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
static inline unsigned long
btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
{
- unsigned long offset = (unsigned long)e;
- offset += offsetof(struct btrfs_file_extent_item, disk_bytenr);
- return offset;
+ return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START;
}
static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
{
- return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
+ return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize;
}
BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
@@ -3086,9 +3085,7 @@ BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
struct btrfs_item *e)
{
- unsigned long offset;
- offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
- return btrfs_item_size(eb, e) - offset;
+ return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
}
/* this returns the number of file bytes represented by the inline item.
@@ -3232,13 +3229,6 @@ static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
return sb->s_fs_info;
}
-static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
-{
- if (level == 0)
- return root->leafsize;
- return root->nodesize;
-}
-
/* helper function to cast into the data area of the leaf. */
#define btrfs_item_ptr(leaf, slot, type) \
((type *)(btrfs_leaf_data(leaf) + \
@@ -3263,7 +3253,7 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
unsigned num_items)
{
- return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
+ return (root->nodesize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
2 * num_items;
}
@@ -3274,8 +3264,7 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
unsigned num_items)
{
- return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
- num_items;
+ return root->nodesize * BTRFS_MAX_LEVEL * num_items;
}
int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
@@ -3287,7 +3276,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count);
int btrfs_async_run_delayed_refs(struct btrfs_root *root,
unsigned long count, int wait);
-int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len);
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
u64 offset, int metadata, u64 *refs, u64 *flags);
@@ -3305,9 +3294,9 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
u64 bytenr);
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int get_block_group_index(struct btrfs_block_group_cache *cache);
-struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u32 blocksize,
- u64 parent, u64 root_objectid,
+struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 parent,
+ u64 root_objectid,
struct btrfs_disk_key *key, int level,
u64 hint, u64 empty_size);
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
@@ -3326,9 +3315,9 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
u64 min_alloc_size, u64 empty_size, u64 hint_byte,
struct btrfs_key *ins, int is_data, int delalloc);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref, int no_quota);
+ struct extent_buffer *buf, int full_backref);
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref, int no_quota);
+ struct extent_buffer *buf, int full_backref);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 flags,
@@ -3363,6 +3352,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
u64 size);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start);
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
@@ -3604,6 +3594,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
kfree(fs_info->uuid_root);
kfree(fs_info->super_copy);
kfree(fs_info->super_for_commit);
+ security_free_mnt_opts(&fs_info->security_opts);
kfree(fs_info);
}
@@ -3739,8 +3730,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
struct bio *bio, u32 *dst);
int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
- struct btrfs_dio_private *dip, struct bio *bio,
- u64 logical_offset);
+ struct bio *bio, u64 logical_offset);
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 pos,
@@ -4141,8 +4131,15 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
/* Sanity test specific functions */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_destroy_inode(struct inode *inode);
-int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
- u64 rfer, u64 excl);
#endif
+static inline int btrfs_test_is_dummy_root(struct btrfs_root *root)
+{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ return 1;
+#endif
+ return 0;
+}
+
#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index da775bfdebc9..054577bddaf2 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1042,7 +1042,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
int ret;
key.objectid = node->inode_id;
- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
@@ -1099,7 +1099,7 @@ err_out:
search:
btrfs_release_path(path);
- btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+ key.type = BTRFS_INODE_EXTREF_KEY;
key.offset = -1;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
@@ -1395,8 +1395,8 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
return -ENOMEM;
async_work->delayed_root = delayed_root;
- btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root,
- NULL, NULL);
+ btrfs_init_work(&async_work->work, btrfs_delayed_meta_helper,
+ btrfs_async_run_delayed_root, NULL, NULL);
async_work->nr = nr;
btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work);
@@ -1473,7 +1473,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
}
delayed_item->key.objectid = btrfs_ino(dir);
- btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY);
+ delayed_item->key.type = BTRFS_DIR_INDEX_KEY;
delayed_item->key.offset = index;
dir_item = (struct btrfs_dir_item *)delayed_item->data;
@@ -1542,7 +1542,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
return PTR_ERR(node);
item_key.objectid = btrfs_ino(dir);
- btrfs_set_key_type(&item_key, BTRFS_DIR_INDEX_KEY);
+ item_key.type = BTRFS_DIR_INDEX_KEY;
item_key.offset = index;
ret = btrfs_delete_delayed_insertion_item(root, node, &item_key);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index eea26e1b2fda..6f662b34ba0e 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -168,8 +168,12 @@ no_valid_dev_replace_entry_found:
dev_replace->srcdev->total_bytes;
dev_replace->tgtdev->disk_total_bytes =
dev_replace->srcdev->disk_total_bytes;
+ dev_replace->tgtdev->commit_total_bytes =
+ dev_replace->srcdev->commit_total_bytes;
dev_replace->tgtdev->bytes_used =
dev_replace->srcdev->bytes_used;
+ dev_replace->tgtdev->commit_bytes_used =
+ dev_replace->srcdev->commit_bytes_used;
}
dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
@@ -329,30 +333,34 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
args->start.tgtdev_name[0] == '\0')
return -EINVAL;
- mutex_lock(&fs_info->volume_mutex);
- ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
- &tgt_device);
- if (ret) {
- btrfs_err(fs_info, "target device %s is invalid!",
- args->start.tgtdev_name);
- mutex_unlock(&fs_info->volume_mutex);
- return -EINVAL;
+ /*
+ * Here we commit the transaction to make sure commit_total_bytes
+ * of all the devices are updated.
+ */
+ trans = btrfs_attach_transaction(root);
+ if (!IS_ERR(trans)) {
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret)
+ return ret;
+ } else if (PTR_ERR(trans) != -ENOENT) {
+ return PTR_ERR(trans);
}
+ /* the disk copy procedure reuses the scrub code */
+ mutex_lock(&fs_info->volume_mutex);
ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
args->start.srcdev_name,
&src_device);
- mutex_unlock(&fs_info->volume_mutex);
if (ret) {
- ret = -EINVAL;
- goto leave_no_lock;
+ mutex_unlock(&fs_info->volume_mutex);
+ return ret;
}
- if (tgt_device->total_bytes < src_device->total_bytes) {
- btrfs_err(fs_info, "target device is smaller than source device!");
- ret = -EINVAL;
- goto leave_no_lock;
- }
+ ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
+ src_device, &tgt_device);
+ mutex_unlock(&fs_info->volume_mutex);
+ if (ret)
+ return ret;
btrfs_dev_replace_lock(dev_replace);
switch (dev_replace->replace_state) {
@@ -380,10 +388,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
src_device->devid,
rcu_str_deref(tgt_device->name));
- tgt_device->total_bytes = src_device->total_bytes;
- tgt_device->disk_total_bytes = src_device->disk_total_bytes;
- tgt_device->bytes_used = src_device->bytes_used;
-
/*
* from now on, the writes to the srcdev are all duplicated to
* go to the tgtdev as well (refer to btrfs_map_block()).
@@ -414,7 +418,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
/* the disk copy procedure reuses the scrub code */
ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
- src_device->total_bytes,
+ btrfs_device_get_total_bytes(src_device),
&dev_replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(root->fs_info, ret);
@@ -426,9 +430,7 @@ leave:
dev_replace->srcdev = NULL;
dev_replace->tgtdev = NULL;
btrfs_dev_replace_unlock(dev_replace);
-leave_no_lock:
- if (tgt_device)
- btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+ btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
return ret;
}
@@ -507,9 +509,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
ret = btrfs_commit_transaction(trans, root);
WARN_ON(ret);
+ mutex_lock(&uuid_mutex);
/* keep away write_all_supers() during the finishing procedure */
- mutex_lock(&root->fs_info->chunk_mutex);
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&root->fs_info->chunk_mutex);
btrfs_dev_replace_lock(dev_replace);
dev_replace->replace_state =
scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
@@ -532,8 +535,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
src_device->devid,
rcu_str_deref(tgt_device->name), scrub_ret);
btrfs_dev_replace_unlock(dev_replace);
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
mutex_unlock(&root->fs_info->chunk_mutex);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&uuid_mutex);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
@@ -542,7 +546,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
}
printk_in_rcu(KERN_INFO
- "BTRFS: dev_replace from %s (devid %llu) to %s) finished\n",
+ "BTRFS: dev_replace from %s (devid %llu) to %s finished\n",
src_device->missing ? "<missing disk>" :
rcu_str_deref(src_device->name),
src_device->devid,
@@ -550,23 +554,29 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
tgt_device->is_tgtdev_for_dev_replace = 0;
tgt_device->devid = src_device->devid;
src_device->devid = BTRFS_DEV_REPLACE_DEVID;
- tgt_device->bytes_used = src_device->bytes_used;
memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
- tgt_device->total_bytes = src_device->total_bytes;
- tgt_device->disk_total_bytes = src_device->disk_total_bytes;
- tgt_device->bytes_used = src_device->bytes_used;
+ btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
+ btrfs_device_set_disk_total_bytes(tgt_device,
+ src_device->disk_total_bytes);
+ btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
+ ASSERT(list_empty(&src_device->resized_list));
+ tgt_device->commit_total_bytes = src_device->commit_total_bytes;
+ tgt_device->commit_bytes_used = src_device->bytes_used;
if (fs_info->sb->s_bdev == src_device->bdev)
fs_info->sb->s_bdev = tgt_device->bdev;
if (fs_info->fs_devices->latest_bdev == src_device->bdev)
fs_info->fs_devices->latest_bdev = tgt_device->bdev;
list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
+ fs_info->fs_devices->rw_devices++;
/* replace the sysfs entry */
btrfs_kobj_rm_device(fs_info, src_device);
btrfs_kobj_add_device(fs_info, tgt_device);
+ btrfs_dev_replace_unlock(dev_replace);
+
btrfs_rm_dev_replace_blocked(fs_info);
btrfs_rm_dev_replace_srcdev(fs_info, src_device);
@@ -580,9 +590,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
* superblock is scratched out so that it is no longer marked to
* belong to this filesystem.
*/
- btrfs_dev_replace_unlock(dev_replace);
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
mutex_unlock(&root->fs_info->chunk_mutex);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&uuid_mutex);
/* write back the superblocks */
trans = btrfs_start_transaction(root, 0);
@@ -643,6 +653,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_device *srcdev;
btrfs_dev_replace_lock(dev_replace);
/* even if !dev_replace_is_valid, the values are good enough for
@@ -665,8 +676,9 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ srcdev = dev_replace->srcdev;
args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
- div64_u64(dev_replace->srcdev->total_bytes, 1000));
+ div64_u64(btrfs_device_get_total_bytes(srcdev), 1000));
break;
}
btrfs_dev_replace_unlock(dev_replace);
@@ -825,7 +837,7 @@ static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
dev_replace->committed_cursor_left,
- dev_replace->srcdev->total_bytes,
+ btrfs_device_get_total_bytes(dev_replace->srcdev),
&dev_replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(fs_info, ret);
WARN_ON(ret);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index a0691df5dcea..fc8df866e919 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -86,7 +86,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root));
key.objectid = objectid;
- btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
+ key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
data_size = sizeof(*dir_item) + name_len + data_len;
@@ -137,7 +137,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
u32 data_size;
key.objectid = btrfs_ino(dir);
- btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+ key.type = BTRFS_DIR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
path = btrfs_alloc_path();
@@ -204,7 +204,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
int cow = mod != 0;
key.objectid = dir;
- btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+ key.type = BTRFS_DIR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
@@ -234,7 +234,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
return -ENOMEM;
key.objectid = dir;
- btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+ key.type = BTRFS_DIR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -297,7 +297,7 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
int cow = mod != 0;
key.objectid = dir;
- btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+ key.type = BTRFS_DIR_INDEX_KEY;
key.offset = objectid;
ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
@@ -367,7 +367,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
int cow = mod != 0;
key.objectid = dir;
- btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
+ key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
if (ret < 0)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 08e65e9cf2aa..1bf9f897065d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -39,7 +39,6 @@
#include "btrfs_inode.h"
#include "volumes.h"
#include "print-tree.h"
-#include "async-thread.h"
#include "locking.h"
#include "tree-log.h"
#include "free-space-cache.h"
@@ -60,8 +59,6 @@ static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only);
-static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
- struct btrfs_root *root);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_root *root);
@@ -75,21 +72,41 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root);
static void btrfs_error_commit_super(struct btrfs_root *root);
/*
- * end_io_wq structs are used to do processing in task context when an IO is
- * complete. This is used during reads to verify checksums, and it is used
+ * btrfs_end_io_wq structs are used to do processing in task context when an IO
+ * is complete. This is used during reads to verify checksums, and it is used
* by writes to insert metadata for new file extents after IO is complete.
*/
-struct end_io_wq {
+struct btrfs_end_io_wq {
struct bio *bio;
bio_end_io_t *end_io;
void *private;
struct btrfs_fs_info *info;
int error;
- int metadata;
+ enum btrfs_wq_endio_type metadata;
struct list_head list;
struct btrfs_work work;
};
+static struct kmem_cache *btrfs_end_io_wq_cache;
+
+int __init btrfs_end_io_wq_init(void)
+{
+ btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
+ sizeof(struct btrfs_end_io_wq),
+ 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
+ if (!btrfs_end_io_wq_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void btrfs_end_io_wq_exit(void)
+{
+ if (btrfs_end_io_wq_cache)
+ kmem_cache_destroy(btrfs_end_io_wq_cache);
+}
+
/*
* async submit bios are used to offload expensive checksumming
* onto the worker threads. They checksum file and metadata bios
@@ -330,8 +347,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
{
struct extent_state *cached_state = NULL;
int ret;
- bool need_lock = (current->journal_info ==
- (void *)BTRFS_SEND_TRANS_STUB);
+ bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);
if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
return 0;
@@ -351,9 +367,9 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
ret = 0;
goto out;
}
- printk_ratelimited("parent transid verify failed on %llu wanted %llu "
- "found %llu\n",
- eb->start, parent_transid, btrfs_header_generation(eb));
+ printk_ratelimited(KERN_INFO "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
+ eb->fs_info->sb->s_id, eb->start,
+ parent_transid, btrfs_header_generation(eb));
ret = 1;
/*
@@ -610,22 +626,22 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
goto err;
eb->read_mirror = mirror;
- if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
+ if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
ret = -EIO;
goto err;
}
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
- printk_ratelimited(KERN_INFO "BTRFS: bad tree block start "
+ printk_ratelimited(KERN_INFO "BTRFS (device %s): bad tree block start "
"%llu %llu\n",
- found_start, eb->start);
+ eb->fs_info->sb->s_id, found_start, eb->start);
ret = -EIO;
goto err;
}
if (check_tree_block_fsid(root, eb)) {
- printk_ratelimited(KERN_INFO "BTRFS: bad fsid on block %llu\n",
- eb->start);
+ printk_ratelimited(KERN_INFO "BTRFS (device %s): bad fsid on block %llu\n",
+ eb->fs_info->sb->s_id, eb->start);
ret = -EIO;
goto err;
}
@@ -683,7 +699,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
eb = (struct extent_buffer *)page->private;
- set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = failed_mirror;
atomic_dec(&eb->io_pages);
if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
@@ -693,52 +709,55 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
static void end_workqueue_bio(struct bio *bio, int err)
{
- struct end_io_wq *end_io_wq = bio->bi_private;
+ struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
struct btrfs_fs_info *fs_info;
+ struct btrfs_workqueue *wq;
+ btrfs_work_func_t func;
fs_info = end_io_wq->info;
end_io_wq->error = err;
- btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
if (bio->bi_rw & REQ_WRITE) {
- if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
- btrfs_queue_work(fs_info->endio_meta_write_workers,
- &end_io_wq->work);
- else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
- btrfs_queue_work(fs_info->endio_freespace_worker,
- &end_io_wq->work);
- else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
- btrfs_queue_work(fs_info->endio_raid56_workers,
- &end_io_wq->work);
- else
- btrfs_queue_work(fs_info->endio_write_workers,
- &end_io_wq->work);
+ if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) {
+ wq = fs_info->endio_meta_write_workers;
+ func = btrfs_endio_meta_write_helper;
+ } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) {
+ wq = fs_info->endio_freespace_worker;
+ func = btrfs_freespace_write_helper;
+ } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
+ wq = fs_info->endio_raid56_workers;
+ func = btrfs_endio_raid56_helper;
+ } else {
+ wq = fs_info->endio_write_workers;
+ func = btrfs_endio_write_helper;
+ }
} else {
- if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
- btrfs_queue_work(fs_info->endio_raid56_workers,
- &end_io_wq->work);
- else if (end_io_wq->metadata)
- btrfs_queue_work(fs_info->endio_meta_workers,
- &end_io_wq->work);
- else
- btrfs_queue_work(fs_info->endio_workers,
- &end_io_wq->work);
+ if (unlikely(end_io_wq->metadata ==
+ BTRFS_WQ_ENDIO_DIO_REPAIR)) {
+ wq = fs_info->endio_repair_workers;
+ func = btrfs_endio_repair_helper;
+ } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
+ wq = fs_info->endio_raid56_workers;
+ func = btrfs_endio_raid56_helper;
+ } else if (end_io_wq->metadata) {
+ wq = fs_info->endio_meta_workers;
+ func = btrfs_endio_meta_helper;
+ } else {
+ wq = fs_info->endio_workers;
+ func = btrfs_endio_helper;
+ }
}
+
+ btrfs_init_work(&end_io_wq->work, func, end_workqueue_fn, NULL, NULL);
+ btrfs_queue_work(wq, &end_io_wq->work);
}
-/*
- * For the metadata arg you want
- *
- * 0 - if data
- * 1 - if normal metadta
- * 2 - if writing to the free space cache area
- * 3 - raid parity work
- */
int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
- int metadata)
+ enum btrfs_wq_endio_type metadata)
{
- struct end_io_wq *end_io_wq;
- end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
+ struct btrfs_end_io_wq *end_io_wq;
+
+ end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
if (!end_io_wq)
return -ENOMEM;
@@ -830,7 +849,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
async->submit_bio_start = submit_bio_start;
async->submit_bio_done = submit_bio_done;
- btrfs_init_work(&async->work, run_one_async_start,
+ btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start,
run_one_async_done, run_one_async_free);
async->bio_flags = bio_flags;
@@ -922,7 +941,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
* can happen in the async kernel threads
*/
ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
- bio, 1);
+ bio, BTRFS_WQ_ENDIO_METADATA);
if (ret)
goto out_w_error;
ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
@@ -1054,20 +1073,17 @@ static const struct address_space_operations btree_aops = {
.set_page_dirty = btree_set_page_dirty,
};
-int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
- u64 parent_transid)
+void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
{
struct extent_buffer *buf = NULL;
struct inode *btree_inode = root->fs_info->btree_inode;
- int ret = 0;
buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
if (!buf)
- return 0;
+ return;
read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
buf, 0, WAIT_NONE, btree_get_extent, 0);
free_extent_buffer(buf);
- return ret;
}
int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
@@ -1103,7 +1119,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
}
struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
- u64 bytenr, u32 blocksize)
+ u64 bytenr)
{
return find_extent_buffer(root->fs_info, bytenr);
}
@@ -1111,11 +1127,9 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize)
{
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ if (btrfs_test_is_dummy_root(root))
return alloc_test_extent_buffer(root->fs_info, bytenr,
blocksize);
-#endif
return alloc_extent_buffer(root->fs_info, bytenr, blocksize);
}
@@ -1133,12 +1147,12 @@ int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
}
struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
- u32 blocksize, u64 parent_transid)
+ u64 parent_transid)
{
struct extent_buffer *buf = NULL;
int ret;
- buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize);
if (!buf)
return NULL;
@@ -1180,7 +1194,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
if (!writers)
return ERR_PTR(-ENOMEM);
- ret = percpu_counter_init(&writers->counter, 0);
+ ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL);
if (ret < 0) {
kfree(writers);
return ERR_PTR(ret);
@@ -1197,16 +1211,14 @@ btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
kfree(writers);
}
-static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
- u32 stripesize, struct btrfs_root *root,
- struct btrfs_fs_info *fs_info,
+static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
+ struct btrfs_root *root, struct btrfs_fs_info *fs_info,
u64 objectid)
{
root->node = NULL;
root->commit_root = NULL;
root->sectorsize = sectorsize;
root->nodesize = nodesize;
- root->leafsize = leafsize;
root->stripesize = stripesize;
root->state = 0;
root->orphan_cleanup_state = 0;
@@ -1292,7 +1304,7 @@ struct btrfs_root *btrfs_alloc_dummy_root(void)
root = btrfs_alloc_root(NULL);
if (!root)
return ERR_PTR(-ENOMEM);
- __setup_root(4096, 4096, 4096, 4096, root, NULL, 1);
+ __setup_root(4096, 4096, 4096, root, NULL, 1);
set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state);
root->alloc_bytenr = 0;
@@ -1315,15 +1327,13 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
if (!root)
return ERR_PTR(-ENOMEM);
- __setup_root(tree_root->nodesize, tree_root->leafsize,
- tree_root->sectorsize, tree_root->stripesize,
- root, fs_info, objectid);
+ __setup_root(tree_root->nodesize, tree_root->sectorsize,
+ tree_root->stripesize, root, fs_info, objectid);
root->root_key.objectid = objectid;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = 0;
- leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
- 0, objectid, NULL, 0, 0, 0);
+ leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
leaf = NULL;
@@ -1393,9 +1403,9 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
if (!root)
return ERR_PTR(-ENOMEM);
- __setup_root(tree_root->nodesize, tree_root->leafsize,
- tree_root->sectorsize, tree_root->stripesize,
- root, fs_info, BTRFS_TREE_LOG_OBJECTID);
+ __setup_root(tree_root->nodesize, tree_root->sectorsize,
+ tree_root->stripesize, root, fs_info,
+ BTRFS_TREE_LOG_OBJECTID);
root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
@@ -1410,9 +1420,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
* updated (along with back refs to the log tree).
*/
- leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
- BTRFS_TREE_LOG_OBJECTID, NULL,
- 0, 0, 0);
+ leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
+ NULL, 0, 0, 0);
if (IS_ERR(leaf)) {
kfree(root);
return ERR_CAST(leaf);
@@ -1462,7 +1471,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
btrfs_set_stack_inode_generation(inode_item, 1);
btrfs_set_stack_inode_size(inode_item, 3);
btrfs_set_stack_inode_nlink(inode_item, 1);
- btrfs_set_stack_inode_nbytes(inode_item, root->leafsize);
+ btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
btrfs_set_root_node(&log_root->root_item, log_root->node);
@@ -1482,7 +1491,6 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
struct btrfs_fs_info *fs_info = tree_root->fs_info;
struct btrfs_path *path;
u64 generation;
- u32 blocksize;
int ret;
path = btrfs_alloc_path();
@@ -1495,9 +1503,8 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
goto alloc_fail;
}
- __setup_root(tree_root->nodesize, tree_root->leafsize,
- tree_root->sectorsize, tree_root->stripesize,
- root, fs_info, key->objectid);
+ __setup_root(tree_root->nodesize, tree_root->sectorsize,
+ tree_root->stripesize, root, fs_info, key->objectid);
ret = btrfs_find_root(tree_root, key, path,
&root->root_item, &root->root_key);
@@ -1508,9 +1515,8 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
}
generation = btrfs_root_generation(&root->root_item);
- blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
- blocksize, generation);
+ generation);
if (!root->node) {
ret = -ENOMEM;
goto find_fail;
@@ -1570,8 +1576,8 @@ int btrfs_init_fs_root(struct btrfs_root *root)
root->subv_writers = writers;
btrfs_init_free_ino_ctl(root);
- spin_lock_init(&root->cache_lock);
- init_waitqueue_head(&root->cache_wait);
+ spin_lock_init(&root->ino_cache_lock);
+ init_waitqueue_head(&root->ino_cache_wait);
ret = get_anon_bdev(&root->anon_dev);
if (ret)
@@ -1696,7 +1702,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
if (!device->bdev)
continue;
bdi = blk_get_backing_dev_info(device->bdev);
- if (bdi && bdi_congested(bdi, bdi_bits)) {
+ if (bdi_congested(bdi, bdi_bits)) {
ret = 1;
break;
}
@@ -1705,10 +1711,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
return ret;
}
-/*
- * If this fails, caller must call bdi_destroy() to get rid of the
- * bdi again.
- */
static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
{
int err;
@@ -1731,16 +1733,16 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
static void end_workqueue_fn(struct btrfs_work *work)
{
struct bio *bio;
- struct end_io_wq *end_io_wq;
+ struct btrfs_end_io_wq *end_io_wq;
int error;
- end_io_wq = container_of(work, struct end_io_wq, work);
+ end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
bio = end_io_wq->bio;
error = end_io_wq->error;
bio->bi_private = end_io_wq->private;
bio->bi_end_io = end_io_wq->end_io;
- kfree(end_io_wq);
+ kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
bio_endio_nodec(bio, error);
}
@@ -1769,6 +1771,7 @@ static int cleaner_kthread(void *arg)
}
btrfs_run_delayed_iputs(root);
+ btrfs_delete_unused_bgs(root->fs_info);
again = btrfs_clean_one_deleted_snapshot(root);
mutex_unlock(&root->fs_info->cleaner_mutex);
@@ -2060,6 +2063,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
btrfs_destroy_workqueue(fs_info->endio_workers);
btrfs_destroy_workqueue(fs_info->endio_meta_workers);
btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
+ btrfs_destroy_workqueue(fs_info->endio_repair_workers);
btrfs_destroy_workqueue(fs_info->rmw_workers);
btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
btrfs_destroy_workqueue(fs_info->endio_write_workers);
@@ -2140,8 +2144,6 @@ int open_ctree(struct super_block *sb,
{
u32 sectorsize;
u32 nodesize;
- u32 leafsize;
- u32 blocksize;
u32 stripesize;
u64 generation;
u64 features;
@@ -2185,7 +2187,7 @@ int open_ctree(struct super_block *sb,
goto fail_srcu;
}
- ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0);
+ ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
if (ret) {
err = ret;
goto fail_bdi;
@@ -2193,13 +2195,13 @@ int open_ctree(struct super_block *sb,
fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
(1 + ilog2(nr_cpu_ids));
- ret = percpu_counter_init(&fs_info->delalloc_bytes, 0);
+ ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
if (ret) {
err = ret;
goto fail_dirty_metadata_bytes;
}
- ret = percpu_counter_init(&fs_info->bio_counter, 0);
+ ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL);
if (ret) {
err = ret;
goto fail_delalloc_bytes;
@@ -2230,6 +2232,7 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->super_lock);
spin_lock_init(&fs_info->qgroup_op_lock);
spin_lock_init(&fs_info->buffer_lock);
+ spin_lock_init(&fs_info->unused_bgs_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
@@ -2239,6 +2242,7 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
+ INIT_LIST_HEAD(&fs_info->unused_bgs);
btrfs_mapping_init(&fs_info->mapping_tree);
btrfs_init_block_rsv(&fs_info->global_block_rsv,
BTRFS_BLOCK_RSV_GLOBAL);
@@ -2257,7 +2261,7 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->qgroup_op_seq, 0);
atomic64_set(&fs_info->tree_mod_seq, 0);
fs_info->sb = sb;
- fs_info->max_inline = 8192 * 1024;
+ fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
fs_info->free_chunk_space = 0;
@@ -2386,7 +2390,7 @@ int open_ctree(struct super_block *sb,
goto fail_alloc;
}
- __setup_root(4096, 4096, 4096, 4096, tree_root,
+ __setup_root(4096, 4096, 4096, tree_root,
fs_info, BTRFS_ROOT_TREE_OBJECTID);
invalidate_bdev(fs_devices->latest_bdev);
@@ -2466,19 +2470,22 @@ int open_ctree(struct super_block *sb,
goto fail_alloc;
}
- if (btrfs_super_leafsize(disk_super) !=
+ /*
+ * Leafsize and nodesize were always equal, this is only a sanity check.
+ */
+ if (le32_to_cpu(disk_super->__unused_leafsize) !=
btrfs_super_nodesize(disk_super)) {
printk(KERN_ERR "BTRFS: couldn't mount because metadata "
"blocksizes don't match. node %d leaf %d\n",
btrfs_super_nodesize(disk_super),
- btrfs_super_leafsize(disk_super));
+ le32_to_cpu(disk_super->__unused_leafsize));
err = -EINVAL;
goto fail_alloc;
}
- if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
+ if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
printk(KERN_ERR "BTRFS: couldn't mount because metadata "
"blocksize (%d) was too large\n",
- btrfs_super_leafsize(disk_super));
+ btrfs_super_nodesize(disk_super));
err = -EINVAL;
goto fail_alloc;
}
@@ -2495,17 +2502,16 @@ int open_ctree(struct super_block *sb,
* flag our filesystem as having big metadata blocks if
* they are bigger than the page size
*/
- if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) {
+ if (btrfs_super_nodesize(disk_super) > PAGE_CACHE_SIZE) {
if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n");
features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
}
nodesize = btrfs_super_nodesize(disk_super);
- leafsize = btrfs_super_leafsize(disk_super);
sectorsize = btrfs_super_sectorsize(disk_super);
stripesize = btrfs_super_stripesize(disk_super);
- fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids));
+ fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
/*
@@ -2513,7 +2519,7 @@ int open_ctree(struct super_block *sb,
* extent buffers for the same range. It leads to corruptions
*/
if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
- (sectorsize != leafsize)) {
+ (sectorsize != nodesize)) {
printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes "
"are not allowed for mixed block groups on %s\n",
sb->s_id);
@@ -2576,6 +2582,8 @@ int open_ctree(struct super_block *sb,
btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
fs_info->endio_raid56_workers =
btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
+ fs_info->endio_repair_workers =
+ btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
fs_info->rmw_workers =
btrfs_alloc_workqueue("rmw", flags, max_active, 2);
fs_info->endio_write_workers =
@@ -2597,11 +2605,12 @@ int open_ctree(struct super_block *sb,
fs_info->submit_workers && fs_info->flush_workers &&
fs_info->endio_workers && fs_info->endio_meta_workers &&
fs_info->endio_meta_write_workers &&
+ fs_info->endio_repair_workers &&
fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
fs_info->caching_workers && fs_info->readahead_workers &&
fs_info->fixup_workers && fs_info->delayed_workers &&
- fs_info->fixup_workers && fs_info->extent_workers &&
+ fs_info->extent_workers &&
fs_info->qgroup_rescan_workers)) {
err = -ENOMEM;
goto fail_sb_buffer;
@@ -2612,7 +2621,6 @@ int open_ctree(struct super_block *sb,
4 * 1024 * 1024 / PAGE_CACHE_SIZE);
tree_root->nodesize = nodesize;
- tree_root->leafsize = leafsize;
tree_root->sectorsize = sectorsize;
tree_root->stripesize = stripesize;
@@ -2639,16 +2647,14 @@ int open_ctree(struct super_block *sb,
goto fail_sb_buffer;
}
- blocksize = btrfs_level_size(tree_root,
- btrfs_super_chunk_root_level(disk_super));
generation = btrfs_super_chunk_root_generation(disk_super);
- __setup_root(nodesize, leafsize, sectorsize, stripesize,
- chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
+ __setup_root(nodesize, sectorsize, stripesize, chunk_root,
+ fs_info, BTRFS_CHUNK_TREE_OBJECTID);
chunk_root->node = read_tree_block(chunk_root,
btrfs_super_chunk_root(disk_super),
- blocksize, generation);
+ generation);
if (!chunk_root->node ||
!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n",
@@ -2681,13 +2687,11 @@ int open_ctree(struct super_block *sb,
}
retry_root_backup:
- blocksize = btrfs_level_size(tree_root,
- btrfs_super_root_level(disk_super));
generation = btrfs_super_generation(disk_super);
tree_root->node = read_tree_block(tree_root,
btrfs_super_root(disk_super),
- blocksize, generation);
+ generation);
if (!tree_root->node ||
!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
@@ -2856,9 +2860,6 @@ retry_root_backup:
err = -EIO;
goto fail_qgroup;
}
- blocksize =
- btrfs_level_size(tree_root,
- btrfs_super_log_root_level(disk_super));
log_tree_root = btrfs_alloc_root(fs_info);
if (!log_tree_root) {
@@ -2866,11 +2867,10 @@ retry_root_backup:
goto fail_qgroup;
}
- __setup_root(nodesize, leafsize, sectorsize, stripesize,
+ __setup_root(nodesize, sectorsize, stripesize,
log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
log_tree_root->node = read_tree_block(tree_root, bytenr,
- blocksize,
generation + 1);
if (!log_tree_root->node ||
!extent_buffer_uptodate(log_tree_root->node)) {
@@ -2977,6 +2977,8 @@ retry_root_backup:
fs_info->update_uuid_tree_gen = 1;
}
+ fs_info->open = 1;
+
return 0;
fail_qgroup:
@@ -3136,7 +3138,8 @@ static int write_dev_supers(struct btrfs_device *device,
for (i = 0; i < max_mirrors; i++) {
bytenr = btrfs_sb_offset(i);
- if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+ device->commit_total_bytes)
break;
if (wait) {
@@ -3452,8 +3455,10 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
btrfs_set_stack_device_generation(dev_item, 0);
btrfs_set_stack_device_type(dev_item, dev->type);
btrfs_set_stack_device_id(dev_item, dev->devid);
- btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
- btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
+ btrfs_set_stack_device_total_bytes(dev_item,
+ dev->commit_total_bytes);
+ btrfs_set_stack_device_bytes_used(dev_item,
+ dev->commit_bytes_used);
btrfs_set_stack_device_io_align(dev_item, dev->io_align);
btrfs_set_stack_device_io_width(dev_item, dev->io_width);
btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
@@ -3528,7 +3533,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
static void free_fs_root(struct btrfs_root *root)
{
- iput(root->cache_inode);
+ iput(root->ino_cache_inode);
WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
btrfs_free_block_rsv(root, root->orphan_block_rsv);
root->orphan_block_rsv = NULL;
@@ -3619,7 +3624,7 @@ int btrfs_commit_super(struct btrfs_root *root)
return btrfs_commit_transaction(trans, root);
}
-int close_ctree(struct btrfs_root *root)
+void close_ctree(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
@@ -3685,6 +3690,7 @@ int close_ctree(struct btrfs_root *root)
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
btrfs_stop_all_workers(fs_info);
+ fs_info->open = 0;
free_root_pointers(fs_info, 1);
iput(fs_info->btree_inode);
@@ -3707,8 +3713,6 @@ int close_ctree(struct btrfs_root *root)
btrfs_free_block_rsv(root, root->orphan_block_rsv);
root->orphan_block_rsv = NULL;
-
- return 0;
}
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
@@ -3810,10 +3814,74 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only)
{
+ struct btrfs_super_block *sb = fs_info->super_copy;
+ int ret = 0;
+
+ if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
+ printk(KERN_ERR "BTRFS: tree_root level too big: %d >= %d\n",
+ btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
+ ret = -EINVAL;
+ }
+ if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
+ printk(KERN_ERR "BTRFS: chunk_root level too big: %d >= %d\n",
+ btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
+ ret = -EINVAL;
+ }
+ if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
+ printk(KERN_ERR "BTRFS: log_root level too big: %d >= %d\n",
+ btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
+ ret = -EINVAL;
+ }
+
/*
- * Placeholder for checks
+ * The common minimum, we don't know if we can trust the nodesize/sectorsize
+ * items yet, they'll be verified later. Issue just a warning.
*/
- return 0;
+ if (!IS_ALIGNED(btrfs_super_root(sb), 4096))
+ printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+ sb->root);
+ if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096))
+ printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+ sb->chunk_root);
+ if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096))
+ printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+ btrfs_super_log_root(sb));
+
+ if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
+ printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n",
+ fs_info->fsid, sb->dev_item.fsid);
+ ret = -EINVAL;
+ }
+
+ /*
+ * Hint to catch really bogus numbers, bitflips or so, more exact checks are
+ * done later
+ */
+ if (btrfs_super_num_devices(sb) > (1UL << 31))
+ printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n",
+ btrfs_super_num_devices(sb));
+
+ if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) {
+ printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n",
+ btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
+ ret = -EINVAL;
+ }
+
+ /*
+ * The generation is a global counter, we'll trust it more than the others
+ * but it's still possible that it's the one that's wrong.
+ */
+ if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
+ printk(KERN_WARNING
+ "BTRFS: suspicious: generation < chunk_root_generation: %llu < %llu\n",
+ btrfs_super_generation(sb), btrfs_super_chunk_root_generation(sb));
+ if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
+ && btrfs_super_cache_generation(sb) != (u64)-1)
+ printk(KERN_WARNING
+ "BTRFS: suspicious: generation < cache_generation: %llu < %llu\n",
+ btrfs_super_generation(sb), btrfs_super_cache_generation(sb));
+
+ return ret;
}
static void btrfs_error_commit_super(struct btrfs_root *root)
@@ -3829,34 +3897,6 @@ static void btrfs_error_commit_super(struct btrfs_root *root)
btrfs_cleanup_transaction(root);
}
-static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
- struct btrfs_root *root)
-{
- struct btrfs_inode *btrfs_inode;
- struct list_head splice;
-
- INIT_LIST_HEAD(&splice);
-
- mutex_lock(&root->fs_info->ordered_operations_mutex);
- spin_lock(&root->fs_info->ordered_root_lock);
-
- list_splice_init(&t->ordered_operations, &splice);
- while (!list_empty(&splice)) {
- btrfs_inode = list_entry(splice.next, struct btrfs_inode,
- ordered_operations);
-
- list_del_init(&btrfs_inode->ordered_operations);
- spin_unlock(&root->fs_info->ordered_root_lock);
-
- btrfs_invalidate_inodes(btrfs_inode->root);
-
- spin_lock(&root->fs_info->ordered_root_lock);
- }
-
- spin_unlock(&root->fs_info->ordered_root_lock);
- mutex_unlock(&root->fs_info->ordered_operations_mutex);
-}
-
static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
{
struct btrfs_ordered_extent *ordered;
@@ -4033,9 +4073,8 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
while (start <= end) {
- eb = btrfs_find_tree_block(root, start,
- root->leafsize);
- start += root->leafsize;
+ eb = btrfs_find_tree_block(root, start);
+ start += root->nodesize;
if (!eb)
continue;
wait_on_extent_buffer_writeback(eb);
@@ -4093,8 +4132,6 @@ again:
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
struct btrfs_root *root)
{
- btrfs_destroy_ordered_operations(cur_trans, root);
-
btrfs_destroy_delayed_refs(cur_trans, root);
cur_trans->state = TRANS_STATE_COMMIT_START;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 23ce3ceba0a9..414651821fb3 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -25,11 +25,12 @@
#define BTRFS_SUPER_MIRROR_MAX 3
#define BTRFS_SUPER_MIRROR_SHIFT 12
-enum {
+enum btrfs_wq_endio_type {
BTRFS_WQ_ENDIO_DATA = 0,
BTRFS_WQ_ENDIO_METADATA = 1,
BTRFS_WQ_ENDIO_FREE_SPACE = 2,
BTRFS_WQ_ENDIO_RAID56 = 3,
+ BTRFS_WQ_ENDIO_DIO_REPAIR = 4,
};
static inline u64 btrfs_sb_offset(int mirror)
@@ -44,9 +45,8 @@ struct btrfs_device;
struct btrfs_fs_devices;
struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
- u32 blocksize, u64 parent_transid);
-int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
- u64 parent_transid);
+ u64 parent_transid);
+void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize);
int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
int mirror_num, struct extent_buffer **eb);
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
@@ -56,13 +56,13 @@ void clean_tree_block(struct btrfs_trans_handle *trans,
int open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options);
-int close_ctree(struct btrfs_root *root);
+void close_ctree(struct btrfs_root *root);
int write_ctree_super(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int max_mirrors);
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
int btrfs_commit_super(struct btrfs_root *root);
struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
- u64 bytenr, u32 blocksize);
+ u64 bytenr);
struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
struct btrfs_key *location);
int btrfs_init_fs_root(struct btrfs_root *root);
@@ -119,7 +119,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
u32 btrfs_csum_data(char *data, u32 seed, size_t len);
void btrfs_csum_final(u32 crc, char *result);
int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
- int metadata);
+ enum btrfs_wq_endio_type metadata);
int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
int rw, struct bio *bio, int mirror_num,
unsigned long bio_flags, u64 bio_offset,
@@ -141,6 +141,8 @@ int btree_lock_page_hook(struct page *page, void *data,
void (*flush_fn)(void *));
int btrfs_calc_num_tolerated_disk_barrier_failures(
struct btrfs_fs_info *fs_info);
+int __init btrfs_end_io_wq_init(void);
+void btrfs_end_io_wq_exit(void);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 41422a3de8ed..37d164540c3a 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -70,7 +70,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
return ERR_PTR(-ESTALE);
key.objectid = root_objectid;
- btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
index = srcu_read_lock(&fs_info->subvol_srcu);
@@ -82,7 +82,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
}
key.objectid = objectid;
- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
inode = btrfs_iget(sb, &key, root, NULL);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 813537f362f9..47c1ba141082 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -491,7 +491,7 @@ next:
key.objectid);
if (key.type == BTRFS_METADATA_ITEM_KEY)
last = key.objectid +
- fs_info->tree_root->leafsize;
+ fs_info->tree_root->nodesize;
else
last = key.objectid + key.offset;
@@ -552,7 +552,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
caching_ctl->block_group = cache;
caching_ctl->progress = cache->key.objectid;
atomic_set(&caching_ctl->count, 1);
- btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
+ btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
+ caching_thread, NULL, NULL);
spin_lock(&cache->lock);
/*
@@ -709,8 +710,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
rcu_read_unlock();
}
-/* simple helper to search for an existing extent at a given offset */
-int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
+/* simple helper to search for an existing data extent at a given offset */
+int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len)
{
int ret;
struct btrfs_key key;
@@ -725,12 +726,6 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
key.type = BTRFS_EXTENT_ITEM_KEY;
ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
0, 0);
- if (ret > 0) {
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid == start &&
- key.type == BTRFS_METADATA_ITEM_KEY)
- ret = 0;
- }
btrfs_free_path(path);
return ret;
}
@@ -764,7 +759,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
* different
*/
if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
- offset = root->leafsize;
+ offset = root->nodesize;
metadata = 0;
}
@@ -785,7 +780,6 @@ search_again:
else
key.type = BTRFS_EXTENT_ITEM_KEY;
-again:
ret = btrfs_search_slot(trans, root->fs_info->extent_root,
&key, path, 0, 0);
if (ret < 0)
@@ -798,16 +792,9 @@ again:
path->slots[0]);
if (key.objectid == bytenr &&
key.type == BTRFS_EXTENT_ITEM_KEY &&
- key.offset == root->leafsize)
+ key.offset == root->nodesize)
ret = 0;
}
- if (ret) {
- key.objectid = bytenr;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = root->leafsize;
- btrfs_release_path(path);
- goto again;
- }
}
if (ret == 0) {
@@ -2650,7 +2637,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
num_bytes = btrfs_calc_trans_metadata_size(root, 1);
num_heads = heads_to_leaves(root, num_heads);
if (num_heads > 1)
- num_bytes += (num_heads - 1) * root->leafsize;
+ num_bytes += (num_heads - 1) * root->nodesize;
num_bytes <<= 1;
global_rsv = &root->fs_info->global_block_rsv;
@@ -2749,8 +2736,8 @@ int btrfs_async_run_delayed_refs(struct btrfs_root *root,
async->sync = 0;
init_completion(&async->wait);
- btrfs_init_work(&async->work, delayed_ref_async_start,
- NULL, NULL);
+ btrfs_init_work(&async->work, btrfs_extent_refs_helper,
+ delayed_ref_async_start, NULL, NULL);
btrfs_queue_work(root->fs_info->extent_workers, &async->work);
@@ -3057,7 +3044,7 @@ out:
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
- int full_backref, int inc, int no_quota)
+ int full_backref, int inc)
{
u64 bytenr;
u64 num_bytes;
@@ -3072,10 +3059,10 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
u64, u64, u64, u64, u64, u64, int);
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+
+ if (btrfs_test_is_dummy_root(root))
return 0;
-#endif
+
ref_root = btrfs_header_owner(buf);
nritems = btrfs_header_nritems(buf);
level = btrfs_header_level(buf);
@@ -3096,7 +3083,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
for (i = 0; i < nritems; i++) {
if (level == 0) {
btrfs_item_key_to_cpu(buf, &key, i);
- if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
continue;
fi = btrfs_item_ptr(buf, i,
struct btrfs_file_extent_item);
@@ -3111,15 +3098,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(buf, fi);
ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, key.objectid,
- key.offset, no_quota);
+ key.offset, 1);
if (ret)
goto fail;
} else {
bytenr = btrfs_node_blockptr(buf, i);
- num_bytes = btrfs_level_size(root, level - 1);
+ num_bytes = root->nodesize;
ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, level - 1, 0,
- no_quota);
+ 1);
if (ret)
goto fail;
}
@@ -3130,15 +3117,15 @@ fail:
}
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref, int no_quota)
+ struct extent_buffer *buf, int full_backref)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
}
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref, int no_quota)
+ struct extent_buffer *buf, int full_backref)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
}
static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -3493,7 +3480,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
if (!found)
return -ENOMEM;
- ret = percpu_counter_init(&found->total_bytes_pinned, 0);
+ ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL);
if (ret) {
kfree(found);
return ret;
@@ -3586,13 +3573,7 @@ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
*/
static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
{
- /*
- * we add in the count of missing devices because we want
- * to make sure that any RAID levels on a degraded FS
- * continue to be honored.
- */
- u64 num_devices = root->fs_info->fs_devices->rw_devices +
- root->fs_info->fs_devices->missing_devices;
+ u64 num_devices = root->fs_info->fs_devices->rw_devices;
u64 target;
u64 tmp;
@@ -4348,11 +4329,21 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
}
static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
- struct btrfs_fs_info *fs_info)
+ struct btrfs_fs_info *fs_info,
+ int flush_state)
{
u64 used;
spin_lock(&space_info->lock);
+ /*
+ * We run out of space and have not got any free space via flush_space,
+ * so don't bother doing async reclaim.
+ */
+ if (flush_state > COMMIT_TRANS && space_info->full) {
+ spin_unlock(&space_info->lock);
+ return 0;
+ }
+
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_pinned + space_info->bytes_readonly +
space_info->bytes_may_use;
@@ -4385,11 +4376,12 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
flush_space(fs_info->fs_root, space_info, to_reclaim,
to_reclaim, flush_state);
flush_state++;
- if (!btrfs_need_do_async_reclaim(space_info, fs_info))
+ if (!btrfs_need_do_async_reclaim(space_info, fs_info,
+ flush_state))
return;
} while (flush_state <= COMMIT_TRANS);
- if (btrfs_need_do_async_reclaim(space_info, fs_info))
+ if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state))
queue_work(system_unbound_wq, work);
}
@@ -4507,7 +4499,13 @@ again:
space_info->flush = 1;
} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
used += orig_bytes;
- if (need_do_async_reclaim(space_info, root->fs_info, used) &&
+ /*
+ * We will do the space reservation dance during log replay,
+ * which means we won't have fs_info->fs_root set, so don't do
+ * the async reclaim as we will panic.
+ */
+ if (!root->fs_info->log_root_recovering &&
+ need_do_async_reclaim(space_info, root->fs_info, used) &&
!work_busy(&root->fs_info->async_reclaim_work))
queue_work(system_unbound_wq,
&root->fs_info->async_reclaim_work);
@@ -4844,7 +4842,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
if (num_bytes * 3 > meta_used)
num_bytes = div64_u64(meta_used, 3);
- return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
+ return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
}
static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
@@ -4993,7 +4991,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (root->fs_info->quota_enabled) {
/* One for parent inode, two for dir entries */
- num_bytes = 3 * root->leafsize;
+ num_bytes = 3 * root->nodesize;
ret = btrfs_qgroup_reserve(root, num_bytes);
if (ret)
return ret;
@@ -5181,7 +5179,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
if (root->fs_info->quota_enabled) {
ret = btrfs_qgroup_reserve(root, num_bytes +
- nr_extents * root->leafsize);
+ nr_extents * root->nodesize);
if (ret)
goto out_fail;
}
@@ -5190,7 +5188,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
if (unlikely(ret)) {
if (root->fs_info->quota_enabled)
btrfs_qgroup_free(root, num_bytes +
- nr_extents * root->leafsize);
+ nr_extents * root->nodesize);
goto out_fail;
}
@@ -5306,7 +5304,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
btrfs_ino(inode), to_free, 0);
if (root->fs_info->quota_enabled) {
btrfs_qgroup_free(root, num_bytes +
- dropped * root->leafsize);
+ dropped * root->nodesize);
}
btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
@@ -5427,6 +5425,20 @@ static int update_block_group(struct btrfs_root *root,
spin_unlock(&cache->space_info->lock);
} else {
old_val -= num_bytes;
+
+ /*
+ * No longer have used bytes in this block group, queue
+ * it for deletion.
+ */
+ if (old_val == 0) {
+ spin_lock(&info->unused_bgs_lock);
+ if (list_empty(&cache->bg_list)) {
+ btrfs_get_block_group(cache);
+ list_add_tail(&cache->bg_list,
+ &info->unused_bgs);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+ }
btrfs_set_block_group_used(&cache->item, old_val);
cache->pinned += num_bytes;
cache->space_info->bytes_pinned += num_bytes;
@@ -6238,10 +6250,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ if (btrfs_test_is_dummy_root(root))
return 0;
-#endif
+
add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
/*
@@ -6268,14 +6279,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return ret;
}
-static u64 stripe_align(struct btrfs_root *root,
- struct btrfs_block_group_cache *cache,
- u64 val, u64 num_bytes)
-{
- u64 ret = ALIGN(val, root->stripesize);
- return ret;
-}
-
/*
* when we wait for progress in the block group caching, its because
* our allocation attempt failed at least once. So, we must sleep
@@ -6469,7 +6472,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
bool have_caching_bg = false;
WARN_ON(num_bytes < root->sectorsize);
- btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
+ ins->type = BTRFS_EXTENT_ITEM_KEY;
ins->objectid = 0;
ins->offset = 0;
@@ -6756,8 +6759,7 @@ unclustered_alloc:
goto loop;
}
checks:
- search_start = stripe_align(root, block_group,
- offset, num_bytes);
+ search_start = ALIGN(offset, root->stripesize);
/* move on to the next group */
if (search_start + num_bytes >
@@ -7082,7 +7084,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path) {
btrfs_free_and_pin_reserved_extent(root, ins->objectid,
- root->leafsize);
+ root->nodesize);
return -ENOMEM;
}
@@ -7091,7 +7093,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
ins, size);
if (ret) {
btrfs_free_and_pin_reserved_extent(root, ins->objectid,
- root->leafsize);
+ root->nodesize);
btrfs_free_path(path);
return ret;
}
@@ -7106,7 +7108,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
if (skinny_metadata) {
iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
- num_bytes = root->leafsize;
+ num_bytes = root->nodesize;
} else {
block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
btrfs_set_tree_block_key(leaf, block_info, key);
@@ -7136,14 +7138,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
return ret;
}
- ret = update_block_group(root, ins->objectid, root->leafsize, 1);
+ ret = update_block_group(root, ins->objectid, root->nodesize, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
ins->objectid, ins->offset);
BUG();
}
- trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize);
+ trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize);
return ret;
}
@@ -7218,17 +7220,19 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_set_buffer_uptodate(buf);
if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ buf->log_index = root->log_transid % 2;
/*
* we allow two log transactions at a time, use different
* EXENT bit to differentiate dirty pages.
*/
- if (root->log_transid % 2 == 0)
+ if (buf->log_index == 0)
set_extent_dirty(&root->dirty_log_pages, buf->start,
buf->start + buf->len - 1, GFP_NOFS);
else
set_extent_new(&root->dirty_log_pages, buf->start,
buf->start + buf->len - 1, GFP_NOFS);
} else {
+ buf->log_index = -1;
set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
buf->start + buf->len - 1, GFP_NOFS);
}
@@ -7305,8 +7309,8 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
*
* returns the tree buffer or NULL.
*/
-struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u32 blocksize,
+struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
u64 parent, u64 root_objectid,
struct btrfs_disk_key *key, int level,
u64 hint, u64 empty_size)
@@ -7316,18 +7320,18 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
struct extent_buffer *buf;
u64 flags = 0;
int ret;
+ u32 blocksize = root->nodesize;
bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
SKINNY_METADATA);
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) {
+ if (btrfs_test_is_dummy_root(root)) {
buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
blocksize, level);
if (!IS_ERR(buf))
root->alloc_bytenr += blocksize;
return buf;
}
-#endif
+
block_rsv = use_block_rsv(trans, root, blocksize);
if (IS_ERR(block_rsv))
return ERR_CAST(block_rsv);
@@ -7422,7 +7426,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
eb = path->nodes[wc->level];
nritems = btrfs_header_nritems(eb);
- blocksize = btrfs_level_size(root, wc->level - 1);
+ blocksize = root->nodesize;
for (slot = path->slots[wc->level]; slot < nritems; slot++) {
if (nread >= wc->reada_count)
@@ -7469,15 +7473,224 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
continue;
}
reada:
- ret = readahead_tree_block(root, bytenr, blocksize,
- generation);
- if (ret)
- break;
+ readahead_tree_block(root, bytenr, blocksize);
nread++;
}
wc->reada_slot = slot;
}
+static int account_leaf_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *eb)
+{
+ int nr = btrfs_header_nritems(eb);
+ int i, extent_type, ret;
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *fi;
+ u64 bytenr, num_bytes;
+
+ for (i = 0; i < nr; i++) {
+ btrfs_item_key_to_cpu(eb, &key, i);
+
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+
+ fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+ /* filter out non qgroup-accountable extents */
+ extent_type = btrfs_file_extent_type(eb, fi);
+
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ continue;
+
+ bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
+ if (!bytenr)
+ continue;
+
+ num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
+
+ ret = btrfs_qgroup_record_ref(trans, root->fs_info,
+ root->objectid,
+ bytenr, num_bytes,
+ BTRFS_QGROUP_OPER_SUB_SUBTREE, 0);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/*
+ * Walk up the tree from the bottom, freeing leaves and any interior
+ * nodes which have had all slots visited. If a node (leaf or
+ * interior) is freed, the node above it will have it's slot
+ * incremented. The root node will never be freed.
+ *
+ * At the end of this function, we should have a path which has all
+ * slots incremented to the next position for a search. If we need to
+ * read a new node it will be NULL and the node above it will have the
+ * correct slot selected for a later read.
+ *
+ * If we increment the root nodes slot counter past the number of
+ * elements, 1 is returned to signal completion of the search.
+ */
+static int adjust_slots_upwards(struct btrfs_root *root,
+ struct btrfs_path *path, int root_level)
+{
+ int level = 0;
+ int nr, slot;
+ struct extent_buffer *eb;
+
+ if (root_level == 0)
+ return 1;
+
+ while (level <= root_level) {
+ eb = path->nodes[level];
+ nr = btrfs_header_nritems(eb);
+ path->slots[level]++;
+ slot = path->slots[level];
+ if (slot >= nr || level == 0) {
+ /*
+ * Don't free the root - we will detect this
+ * condition after our loop and return a
+ * positive value for caller to stop walking the tree.
+ */
+ if (level != root_level) {
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
+
+ free_extent_buffer(eb);
+ path->nodes[level] = NULL;
+ path->slots[level] = 0;
+ }
+ } else {
+ /*
+ * We have a valid slot to walk back down
+ * from. Stop here so caller can process these
+ * new nodes.
+ */
+ break;
+ }
+
+ level++;
+ }
+
+ eb = path->nodes[root_level];
+ if (path->slots[root_level] >= btrfs_header_nritems(eb))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * root_eb is the subtree root and is locked before this function is called.
+ */
+static int account_shared_subtree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *root_eb,
+ u64 root_gen,
+ int root_level)
+{
+ int ret = 0;
+ int level;
+ struct extent_buffer *eb = root_eb;
+ struct btrfs_path *path = NULL;
+
+ BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
+ BUG_ON(root_eb == NULL);
+
+ if (!root->fs_info->quota_enabled)
+ return 0;
+
+ if (!extent_buffer_uptodate(root_eb)) {
+ ret = btrfs_read_buffer(root_eb, root_gen);
+ if (ret)
+ goto out;
+ }
+
+ if (root_level == 0) {
+ ret = account_leaf_items(trans, root, root_eb);
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * Walk down the tree. Missing extent blocks are filled in as
+ * we go. Metadata is accounted every time we read a new
+ * extent block.
+ *
+ * When we reach a leaf, we account for file extent items in it,
+ * walk back up the tree (adjusting slot pointers as we go)
+ * and restart the search process.
+ */
+ extent_buffer_get(root_eb); /* For path */
+ path->nodes[root_level] = root_eb;
+ path->slots[root_level] = 0;
+ path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
+walk_down:
+ level = root_level;
+ while (level >= 0) {
+ if (path->nodes[level] == NULL) {
+ int parent_slot;
+ u64 child_gen;
+ u64 child_bytenr;
+
+ /* We need to get child blockptr/gen from
+ * parent before we can read it. */
+ eb = path->nodes[level + 1];
+ parent_slot = path->slots[level + 1];
+ child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+ child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+
+ eb = read_tree_block(root, child_bytenr, child_gen);
+ if (!eb || !extent_buffer_uptodate(eb)) {
+ ret = -EIO;
+ goto out;
+ }
+
+ path->nodes[level] = eb;
+ path->slots[level] = 0;
+
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+
+ ret = btrfs_qgroup_record_ref(trans, root->fs_info,
+ root->objectid,
+ child_bytenr,
+ root->nodesize,
+ BTRFS_QGROUP_OPER_SUB_SUBTREE,
+ 0);
+ if (ret)
+ goto out;
+
+ }
+
+ if (level == 0) {
+ ret = account_leaf_items(trans, root, path->nodes[level]);
+ if (ret)
+ goto out;
+
+ /* Nonzero return here means we completed our search */
+ ret = adjust_slots_upwards(root, path, root_level);
+ if (ret)
+ break;
+
+ /* Restart search with new slots */
+ goto walk_down;
+ }
+
+ level--;
+ }
+
+ ret = 0;
+out:
+ btrfs_free_path(path);
+
+ return ret;
+}
+
/*
* helper to process tree block while walking down the tree.
*
@@ -7532,9 +7745,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
/* wc->stage == UPDATE_BACKREF */
if (!(wc->flags[level] & flag)) {
BUG_ON(!path->locks[level]);
- ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
+ ret = btrfs_inc_ref(trans, root, eb, 1);
BUG_ON(ret); /* -ENOMEM */
- ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
+ ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret); /* -ENOMEM */
ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
eb->len, flag,
@@ -7581,6 +7794,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
int level = wc->level;
int reada = 0;
int ret = 0;
+ bool need_account = false;
generation = btrfs_node_ptr_generation(path->nodes[level],
path->slots[level]);
@@ -7596,9 +7810,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
}
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
- blocksize = btrfs_level_size(root, level - 1);
+ blocksize = root->nodesize;
- next = btrfs_find_tree_block(root, bytenr, blocksize);
+ next = btrfs_find_tree_block(root, bytenr);
if (!next) {
next = btrfs_find_create_tree_block(root, bytenr, blocksize);
if (!next)
@@ -7626,6 +7840,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (wc->stage == DROP_REFERENCE) {
if (wc->refs[level - 1] > 1) {
+ need_account = true;
if (level == 1 &&
(wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
goto skip;
@@ -7659,7 +7874,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (!next) {
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
- next = read_tree_block(root, bytenr, blocksize, generation);
+ next = read_tree_block(root, bytenr, generation);
if (!next || !extent_buffer_uptodate(next)) {
free_extent_buffer(next);
return -EIO;
@@ -7689,6 +7904,16 @@ skip:
parent = 0;
}
+ if (need_account) {
+ ret = account_shared_subtree(trans, root, next,
+ generation, level - 1);
+ if (ret) {
+ printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+ "%d accounting shared subtree. Quota "
+ "is out of sync, rescan required.\n",
+ root->fs_info->sb->s_id, ret);
+ }
+ }
ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
root->root_key.objectid, level - 1, 0, 0);
BUG_ON(ret); /* -ENOMEM */
@@ -7769,12 +7994,17 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (wc->refs[level] == 1) {
if (level == 0) {
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
- ret = btrfs_dec_ref(trans, root, eb, 1,
- wc->for_reloc);
+ ret = btrfs_dec_ref(trans, root, eb, 1);
else
- ret = btrfs_dec_ref(trans, root, eb, 0,
- wc->for_reloc);
+ ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret); /* -ENOMEM */
+ ret = account_leaf_items(trans, root, eb);
+ if (ret) {
+ printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+ "%d accounting leaf items. Quota "
+ "is out of sync, rescan required.\n",
+ root->fs_info->sb->s_id, ret);
+ }
}
/* make block locked assertion in clean_tree_block happy */
if (!path->locks[level] &&
@@ -7900,6 +8130,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
int level;
bool root_dropped = false;
+ btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid);
+
path = btrfs_alloc_path();
if (!path) {
err = -ENOMEM;
@@ -8025,6 +8257,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
goto out_end_trans;
}
+ /*
+ * Qgroup update accounting is run from
+ * delayed ref handling. This usually works
+ * out because delayed refs are normally the
+ * only way qgroup updates are added. However,
+ * we may have added updates during our tree
+ * walk so run qgroups here to make sure we
+ * don't lose any updates.
+ */
+ ret = btrfs_delayed_qgroup_accounting(trans,
+ root->fs_info);
+ if (ret)
+ printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
+ "running qgroup updates "
+ "during snapshot delete. "
+ "Quota is out of sync, "
+ "rescan required.\n", ret);
+
btrfs_end_transaction_throttle(trans, tree_root);
if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
pr_debug("BTRFS: drop snapshot early exit\n");
@@ -8078,6 +8328,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
root_dropped = true;
out_end_trans:
+ ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info);
+ if (ret)
+ printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
+ "running qgroup updates "
+ "during snapshot delete. "
+ "Quota is out of sync, "
+ "rescan required.\n", ret);
+
btrfs_end_transaction_throttle(trans, tree_root);
out_free:
kfree(wc);
@@ -8181,13 +8439,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
if (stripped)
return extended_to_chunk(stripped);
- /*
- * we add in the count of missing devices because we want
- * to make sure that any RAID levels on a degraded FS
- * continue to be honored.
- */
- num_devices = root->fs_info->fs_devices->rw_devices +
- root->fs_info->fs_devices->missing_devices;
+ num_devices = root->fs_info->fs_devices->rw_devices;
stripped = BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
@@ -8605,6 +8857,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
}
up_write(&info->commit_root_sem);
+ spin_lock(&info->unused_bgs_lock);
+ while (!list_empty(&info->unused_bgs)) {
+ block_group = list_first_entry(&info->unused_bgs,
+ struct btrfs_block_group_cache,
+ bg_list);
+ list_del_init(&block_group->bg_list);
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group_cache,
@@ -8739,7 +9001,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
init_rwsem(&cache->data_rwsem);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
- INIT_LIST_HEAD(&cache->new_bg_list);
+ INIT_LIST_HEAD(&cache->bg_list);
btrfs_init_free_space_ctl(cache);
return cache;
@@ -8761,7 +9023,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
root = info->extent_root;
key.objectid = 0;
key.offset = 0;
- btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
+ key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -8880,8 +9142,18 @@ int btrfs_read_block_groups(struct btrfs_root *root)
__link_block_group(space_info, cache);
set_avail_alloc_bits(root->fs_info, cache->flags);
- if (btrfs_chunk_readonly(root, cache->key.objectid))
+ if (btrfs_chunk_readonly(root, cache->key.objectid)) {
set_block_group_ro(cache, 1);
+ } else if (btrfs_block_group_used(&cache->item) == 0) {
+ spin_lock(&info->unused_bgs_lock);
+ /* Should always be true but just in case. */
+ if (list_empty(&cache->bg_list)) {
+ btrfs_get_block_group(cache);
+ list_add_tail(&cache->bg_list,
+ &info->unused_bgs);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+ }
}
list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
@@ -8922,10 +9194,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_key key;
int ret = 0;
- list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
- new_bg_list) {
- list_del_init(&block_group->new_bg_list);
-
+ list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
+ list_del_init(&block_group->bg_list);
if (ret)
continue;
@@ -9011,7 +9281,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
__link_block_group(cache->space_info, cache);
- list_add_tail(&cache->new_bg_list, &trans->new_bgs);
+ list_add_tail(&cache->bg_list, &trans->new_bgs);
set_avail_alloc_bits(extent_root->fs_info, type);
@@ -9165,8 +9435,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
memcpy(&key, &block_group->key, sizeof(key));
- btrfs_clear_space_info_full(root->fs_info);
-
btrfs_put_block_group(block_group);
btrfs_put_block_group(block_group);
@@ -9182,6 +9450,101 @@ out:
return ret;
}
+/*
+ * Process the unused_bgs list and remove any that don't have any allocated
+ * space inside of them.
+ */
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_space_info *space_info;
+ struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+
+ if (!fs_info->open)
+ return;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ while (!list_empty(&fs_info->unused_bgs)) {
+ u64 start, end;
+
+ block_group = list_first_entry(&fs_info->unused_bgs,
+ struct btrfs_block_group_cache,
+ bg_list);
+ space_info = block_group->space_info;
+ list_del_init(&block_group->bg_list);
+ if (ret || btrfs_mixed_space_info(space_info)) {
+ btrfs_put_block_group(block_group);
+ continue;
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ /* Don't want to race with allocators so take the groups_sem */
+ down_write(&space_info->groups_sem);
+ spin_lock(&block_group->lock);
+ if (block_group->reserved ||
+ btrfs_block_group_used(&block_group->item) ||
+ block_group->ro) {
+ /*
+ * We want to bail if we made new allocations or have
+ * outstanding allocations in this block group. We do
+ * the ro check in case balance is currently acting on
+ * this block group.
+ */
+ spin_unlock(&block_group->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+ spin_unlock(&block_group->lock);
+
+ /* We don't want to force the issue, only flip if it's ok. */
+ ret = set_block_group_ro(block_group, 0);
+ up_write(&space_info->groups_sem);
+ if (ret < 0) {
+ ret = 0;
+ goto next;
+ }
+
+ /*
+ * Want to do this before we do anything else so we can recover
+ * properly if we fail to join the transaction.
+ */
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ btrfs_set_block_group_rw(root, block_group);
+ ret = PTR_ERR(trans);
+ goto next;
+ }
+
+ /*
+ * We could have pending pinned extents for this block group,
+ * just delete them, we don't care about them anymore.
+ */
+ start = block_group->key.objectid;
+ end = start + block_group->key.offset - 1;
+ clear_extent_bits(&fs_info->freed_extents[0], start, end,
+ EXTENT_DIRTY, GFP_NOFS);
+ clear_extent_bits(&fs_info->freed_extents[1], start, end,
+ EXTENT_DIRTY, GFP_NOFS);
+
+ /* Reset pinned so btrfs_put_block_group doesn't complain */
+ block_group->pinned = 0;
+
+ /*
+ * Btrfs_remove_chunk will abort the transaction if things go
+ * horribly wrong.
+ */
+ ret = btrfs_remove_chunk(trans, root,
+ block_group->key.objectid);
+ btrfs_end_transaction(trans, root);
+next:
+ btrfs_put_block_group(block_group);
+ spin_lock(&fs_info->unused_bgs_lock);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
+
int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
struct btrfs_space_info *space_info;
@@ -9313,7 +9676,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root)
int btrfs_start_nocow_write(struct btrfs_root *root)
{
- if (unlikely(atomic_read(&root->will_be_snapshoted)))
+ if (atomic_read(&root->will_be_snapshoted))
return 0;
percpu_counter_inc(&root->subv_writers->counter);
@@ -9321,7 +9684,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root)
* Make sure counter is updated before we check for snapshot creation.
*/
smp_mb();
- if (unlikely(atomic_read(&root->will_be_snapshoted))) {
+ if (atomic_read(&root->will_be_snapshoted)) {
btrfs_end_nocow_write(root);
return 0;
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3e11aab9f391..bf3f424e0013 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -25,6 +25,11 @@ static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
static struct bio_set *btrfs_bioset;
+static inline bool extent_state_in_tree(const struct extent_state *state)
+{
+ return !RB_EMPTY_NODE(&state->rb_node);
+}
+
#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(buffers);
static LIST_HEAD(states);
@@ -59,9 +64,9 @@ void btrfs_leak_debug_check(void)
while (!list_empty(&states)) {
state = list_entry(states.next, struct extent_state, leak_list);
- printk(KERN_ERR "BTRFS: state leak: start %llu end %llu "
- "state %lu in tree %p refs %d\n",
- state->start, state->end, state->state, state->tree,
+ pr_err("BTRFS: state leak: start %llu end %llu state %lu in tree %d refs %d\n",
+ state->start, state->end, state->state,
+ extent_state_in_tree(state),
atomic_read(&state->refs));
list_del(&state->leak_list);
kmem_cache_free(extent_state_cache, state);
@@ -209,7 +214,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
return state;
state->state = 0;
state->private = 0;
- state->tree = NULL;
+ RB_CLEAR_NODE(&state->rb_node);
btrfs_leak_debug_add(&state->leak_list, &states);
atomic_set(&state->refs, 1);
init_waitqueue_head(&state->wq);
@@ -222,7 +227,7 @@ void free_extent_state(struct extent_state *state)
if (!state)
return;
if (atomic_dec_and_test(&state->refs)) {
- WARN_ON(state->tree);
+ WARN_ON(extent_state_in_tree(state));
btrfs_leak_debug_del(&state->leak_list);
trace_free_extent_state(state, _RET_IP_);
kmem_cache_free(extent_state_cache, state);
@@ -371,8 +376,8 @@ static void merge_state(struct extent_io_tree *tree,
other->state == state->state) {
merge_cb(tree, state, other);
state->start = other->start;
- other->tree = NULL;
rb_erase(&other->rb_node, &tree->state);
+ RB_CLEAR_NODE(&other->rb_node);
free_extent_state(other);
}
}
@@ -383,8 +388,8 @@ static void merge_state(struct extent_io_tree *tree,
other->state == state->state) {
merge_cb(tree, state, other);
state->end = other->end;
- other->tree = NULL;
rb_erase(&other->rb_node, &tree->state);
+ RB_CLEAR_NODE(&other->rb_node);
free_extent_state(other);
}
}
@@ -442,7 +447,6 @@ static int insert_state(struct extent_io_tree *tree,
found->start, found->end, start, end);
return -EEXIST;
}
- state->tree = tree;
merge_state(tree, state);
return 0;
}
@@ -486,7 +490,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
free_extent_state(prealloc);
return -EEXIST;
}
- prealloc->tree = tree;
return 0;
}
@@ -524,9 +527,9 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
wake_up(&state->wq);
if (state->state == 0) {
next = next_state(state);
- if (state->tree) {
+ if (extent_state_in_tree(state)) {
rb_erase(&state->rb_node, &tree->state);
- state->tree = NULL;
+ RB_CLEAR_NODE(&state->rb_node);
free_extent_state(state);
} else {
WARN_ON(1);
@@ -606,8 +609,8 @@ again:
cached_state = NULL;
}
- if (cached && cached->tree && cached->start <= start &&
- cached->end > start) {
+ if (cached && extent_state_in_tree(cached) &&
+ cached->start <= start && cached->end > start) {
if (clear)
atomic_dec(&cached->refs);
state = cached;
@@ -843,7 +846,7 @@ again:
if (cached_state && *cached_state) {
state = *cached_state;
if (state->start <= start && state->end > start &&
- state->tree) {
+ extent_state_in_tree(state)) {
node = &state->rb_node;
goto hit_next;
}
@@ -1069,7 +1072,7 @@ again:
if (cached_state && *cached_state) {
state = *cached_state;
if (state->start <= start && state->end > start &&
- state->tree) {
+ extent_state_in_tree(state)) {
node = &state->rb_node;
goto hit_next;
}
@@ -1459,7 +1462,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
spin_lock(&tree->lock);
if (cached_state && *cached_state) {
state = *cached_state;
- if (state->end == start - 1 && state->tree) {
+ if (state->end == start - 1 && extent_state_in_tree(state)) {
n = rb_next(&state->rb_node);
while (n) {
state = rb_entry(n, struct extent_state,
@@ -1905,7 +1908,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
int bitset = 0;
spin_lock(&tree->lock);
- if (cached && cached->tree && cached->start <= start &&
+ if (cached && extent_state_in_tree(cached) && cached->start <= start &&
cached->end > start)
node = &cached->rb_node;
else
@@ -1959,27 +1962,7 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
SetPageUptodate(page);
}
-/*
- * When IO fails, either with EIO or csum verification fails, we
- * try other mirrors that might have a good copy of the data. This
- * io_failure_record is used to record state as we go through all the
- * mirrors. If another mirror has good data, the page is set up to date
- * and things continue. If a good mirror can't be found, the original
- * bio end_io callback is called to indicate things have failed.
- */
-struct io_failure_record {
- struct page *page;
- u64 start;
- u64 len;
- u64 logical;
- unsigned long bio_flags;
- int this_mirror;
- int failed_mirror;
- int in_validation;
-};
-
-static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
- int did_repair)
+int free_io_failure(struct inode *inode, struct io_failure_record *rec)
{
int ret;
int err = 0;
@@ -2012,10 +1995,10 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
* currently, there can be no more than two copies of every data bit. thus,
* exactly one rewrite is required.
*/
-int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
- u64 length, u64 logical, struct page *page,
- int mirror_num)
+int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
+ struct page *page, unsigned int pg_offset, int mirror_num)
{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct bio *bio;
struct btrfs_device *dev;
u64 map_length = 0;
@@ -2053,7 +2036,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
return -EIO;
}
bio->bi_bdev = dev->bdev;
- bio_add_page(bio, page, length, start - page_offset(page));
+ bio_add_page(bio, page, length, pg_offset);
if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) {
/* try to remap that extent elsewhere? */
@@ -2063,10 +2046,9 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
}
printk_ratelimited_in_rcu(KERN_INFO
- "BTRFS: read error corrected: ino %lu off %llu "
- "(dev %s sector %llu)\n", page->mapping->host->i_ino,
- start, rcu_str_deref(dev->name), sector);
-
+ "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n",
+ btrfs_ino(inode), start,
+ rcu_str_deref(dev->name), sector);
bio_put(bio);
return 0;
}
@@ -2082,9 +2064,11 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
return -EROFS;
for (i = 0; i < num_pages; i++) {
- struct page *p = extent_buffer_page(eb, i);
- ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
- start, p, mirror_num);
+ struct page *p = eb->pages[i];
+
+ ret = repair_io_failure(root->fs_info->btree_inode, start,
+ PAGE_CACHE_SIZE, start, p,
+ start - page_offset(p), mirror_num);
if (ret)
break;
start += PAGE_CACHE_SIZE;
@@ -2097,16 +2081,15 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
* each time an IO finishes, we do a fast check in the IO failure tree
* to see if we need to process or clean up an io_failure_record
*/
-static int clean_io_failure(u64 start, struct page *page)
+int clean_io_failure(struct inode *inode, u64 start, struct page *page,
+ unsigned int pg_offset)
{
u64 private;
u64 private_failure;
struct io_failure_record *failrec;
- struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct extent_state *state;
int num_copies;
- int did_repair = 0;
int ret;
private = 0;
@@ -2127,7 +2110,6 @@ static int clean_io_failure(u64 start, struct page *page)
/* there was no real error, just free the record */
pr_debug("clean_io_failure: freeing dummy error at %llu\n",
failrec->start);
- did_repair = 1;
goto out;
}
if (fs_info->sb->s_flags & MS_RDONLY)
@@ -2144,55 +2126,70 @@ static int clean_io_failure(u64 start, struct page *page)
num_copies = btrfs_num_copies(fs_info, failrec->logical,
failrec->len);
if (num_copies > 1) {
- ret = repair_io_failure(fs_info, start, failrec->len,
- failrec->logical, page,
- failrec->failed_mirror);
- did_repair = !ret;
+ repair_io_failure(inode, start, failrec->len,
+ failrec->logical, page,
+ pg_offset, failrec->failed_mirror);
}
- ret = 0;
}
out:
- if (!ret)
- ret = free_io_failure(inode, failrec, did_repair);
+ free_io_failure(inode, failrec);
- return ret;
+ return 0;
}
/*
- * this is a generic handler for readpage errors (default
- * readpage_io_failed_hook). if other copies exist, read those and write back
- * good data to the failed position. does not investigate in remapping the
- * failed extent elsewhere, hoping the device will be smart enough to do this as
- * needed
+ * Can be called when
+ * - hold extent lock
+ * - under ordered extent
+ * - the inode is freeing
*/
+void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end)
+{
+ struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+ struct io_failure_record *failrec;
+ struct extent_state *state, *next;
-static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
- struct page *page, u64 start, u64 end,
- int failed_mirror)
+ if (RB_EMPTY_ROOT(&failure_tree->state))
+ return;
+
+ spin_lock(&failure_tree->lock);
+ state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
+ while (state) {
+ if (state->start > end)
+ break;
+
+ ASSERT(state->end <= end);
+
+ next = next_state(state);
+
+ failrec = (struct io_failure_record *)state->private;
+ free_extent_state(state);
+ kfree(failrec);
+
+ state = next;
+ }
+ spin_unlock(&failure_tree->lock);
+}
+
+int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
+ struct io_failure_record **failrec_ret)
{
- struct io_failure_record *failrec = NULL;
+ struct io_failure_record *failrec;
u64 private;
struct extent_map *em;
- struct inode *inode = page->mapping->host;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
- struct bio *bio;
- struct btrfs_io_bio *btrfs_failed_bio;
- struct btrfs_io_bio *btrfs_bio;
- int num_copies;
int ret;
- int read_mode;
u64 logical;
- BUG_ON(failed_bio->bi_rw & REQ_WRITE);
-
ret = get_state_private(failure_tree, start, &private);
if (ret) {
failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
if (!failrec)
return -ENOMEM;
+
failrec->start = start;
failrec->len = end - start + 1;
failrec->this_mirror = 0;
@@ -2212,11 +2209,11 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
em = NULL;
}
read_unlock(&em_tree->lock);
-
if (!em) {
kfree(failrec);
return -EIO;
}
+
logical = start - em->start;
logical = em->block_start + logical;
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
@@ -2225,8 +2222,10 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
extent_set_compress_type(&failrec->bio_flags,
em->compress_type);
}
- pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
- "len=%llu\n", logical, start, failrec->len);
+
+ pr_debug("Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu\n",
+ logical, start, failrec->len);
+
failrec->logical = logical;
free_extent_map(em);
@@ -2246,8 +2245,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
}
} else {
failrec = (struct io_failure_record *)(unsigned long)private;
- pr_debug("bio_readpage_error: (found) logical=%llu, "
- "start=%llu, len=%llu, validation=%d\n",
+ pr_debug("Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d\n",
failrec->logical, failrec->start, failrec->len,
failrec->in_validation);
/*
@@ -2256,6 +2254,17 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
* clean_io_failure() clean all those errors at once.
*/
}
+
+ *failrec_ret = failrec;
+
+ return 0;
+}
+
+int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
+ struct io_failure_record *failrec, int failed_mirror)
+{
+ int num_copies;
+
num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
failrec->logical, failrec->len);
if (num_copies == 1) {
@@ -2264,10 +2273,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
* all the retry and error correction code that follows. no
* matter what the error is, it is very likely to persist.
*/
- pr_debug("bio_readpage_error: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
+ pr_debug("Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
num_copies, failrec->this_mirror, failed_mirror);
- free_io_failure(inode, failrec, 0);
- return -EIO;
+ return 0;
}
/*
@@ -2287,7 +2295,6 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
BUG_ON(failrec->in_validation);
failrec->in_validation = 1;
failrec->this_mirror = failed_mirror;
- read_mode = READ_SYNC | REQ_FAILFAST_DEV;
} else {
/*
* we're ready to fulfill a) and b) alongside. get a good copy
@@ -2303,25 +2310,36 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
failrec->this_mirror++;
if (failrec->this_mirror == failed_mirror)
failrec->this_mirror++;
- read_mode = READ_SYNC;
}
if (failrec->this_mirror > num_copies) {
- pr_debug("bio_readpage_error: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
+ pr_debug("Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
num_copies, failrec->this_mirror, failed_mirror);
- free_io_failure(inode, failrec, 0);
- return -EIO;
+ return 0;
}
+ return 1;
+}
+
+
+struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
+ struct io_failure_record *failrec,
+ struct page *page, int pg_offset, int icsum,
+ bio_end_io_t *endio_func, void *data)
+{
+ struct bio *bio;
+ struct btrfs_io_bio *btrfs_failed_bio;
+ struct btrfs_io_bio *btrfs_bio;
+
bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
- if (!bio) {
- free_io_failure(inode, failrec, 0);
- return -EIO;
- }
- bio->bi_end_io = failed_bio->bi_end_io;
+ if (!bio)
+ return NULL;
+
+ bio->bi_end_io = endio_func;
bio->bi_iter.bi_sector = failrec->logical >> 9;
bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
bio->bi_iter.bi_size = 0;
+ bio->bi_private = data;
btrfs_failed_bio = btrfs_io_bio(failed_bio);
if (btrfs_failed_bio->csum) {
@@ -2330,21 +2348,73 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
btrfs_bio = btrfs_io_bio(bio);
btrfs_bio->csum = btrfs_bio->csum_inline;
- phy_offset >>= inode->i_sb->s_blocksize_bits;
- phy_offset *= csum_size;
- memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + phy_offset,
+ icsum *= csum_size;
+ memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum,
csum_size);
}
- bio_add_page(bio, page, failrec->len, start - page_offset(page));
+ bio_add_page(bio, page, failrec->len, pg_offset);
+
+ return bio;
+}
+
+/*
+ * this is a generic handler for readpage errors (default
+ * readpage_io_failed_hook). if other copies exist, read those and write back
+ * good data to the failed position. does not investigate in remapping the
+ * failed extent elsewhere, hoping the device will be smart enough to do this as
+ * needed
+ */
+
+static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end,
+ int failed_mirror)
+{
+ struct io_failure_record *failrec;
+ struct inode *inode = page->mapping->host;
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+ struct bio *bio;
+ int read_mode;
+ int ret;
+
+ BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+ ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
+ if (ret)
+ return ret;
+
+ ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror);
+ if (!ret) {
+ free_io_failure(inode, failrec);
+ return -EIO;
+ }
+
+ if (failed_bio->bi_vcnt > 1)
+ read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+ else
+ read_mode = READ_SYNC;
+
+ phy_offset >>= inode->i_sb->s_blocksize_bits;
+ bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
+ start - page_offset(page),
+ (int)phy_offset, failed_bio->bi_end_io,
+ NULL);
+ if (!bio) {
+ free_io_failure(inode, failrec);
+ return -EIO;
+ }
- pr_debug("bio_readpage_error: submitting new read[%#x] to "
- "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
- failrec->this_mirror, num_copies, failrec->in_validation);
+ pr_debug("Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d\n",
+ read_mode, failrec->this_mirror, failrec->in_validation);
ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
failrec->this_mirror,
failrec->bio_flags, 0);
+ if (ret) {
+ free_io_failure(inode, failrec);
+ bio_put(bio);
+ }
+
return ret;
}
@@ -2469,7 +2539,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
struct inode *inode = page->mapping->host;
pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
- "mirror=%lu\n", (u64)bio->bi_iter.bi_sector, err,
+ "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err,
io_bio->mirror_num);
tree = &BTRFS_I(inode)->io_tree;
@@ -2503,7 +2573,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
if (ret)
uptodate = 0;
else
- clean_io_failure(start, page);
+ clean_io_failure(inode, start, page, 0);
}
if (likely(uptodate))
@@ -2532,6 +2602,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
test_bit(BIO_UPTODATE, &bio->bi_flags);
if (err)
uptodate = 0;
+ offset += len;
continue;
}
}
@@ -2539,12 +2610,12 @@ readpage_ok:
if (likely(uptodate)) {
loff_t i_size = i_size_read(inode);
pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
- unsigned offset;
+ unsigned off;
/* Zero out the end if this page straddles i_size */
- offset = i_size & (PAGE_CACHE_SIZE-1);
- if (page->index == end_index && offset)
- zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+ off = i_size & (PAGE_CACHE_SIZE-1);
+ if (page->index == end_index && off)
+ zero_user_segment(page, off, PAGE_CACHE_SIZE);
SetPageUptodate(page);
} else {
ClearPageUptodate(page);
@@ -2617,9 +2688,18 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
{
- return bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
-}
+ struct btrfs_io_bio *btrfs_bio;
+ struct bio *new;
+ new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
+ if (new) {
+ btrfs_bio = btrfs_io_bio(new);
+ btrfs_bio->csum = NULL;
+ btrfs_bio->csum_allocated = NULL;
+ btrfs_bio->end_io = NULL;
+ }
+ return new;
+}
/* this also allocates from the btrfs_bioset */
struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
@@ -3500,7 +3580,7 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
- struct page *p = extent_buffer_page(eb, i);
+ struct page *p = eb->pages[i];
if (!trylock_page(p)) {
if (!flush) {
@@ -3521,6 +3601,68 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb)
wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
}
+static void set_btree_ioerr(struct page *page)
+{
+ struct extent_buffer *eb = (struct extent_buffer *)page->private;
+ struct btrfs_inode *btree_ino = BTRFS_I(eb->fs_info->btree_inode);
+
+ SetPageError(page);
+ if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
+ return;
+
+ /*
+ * If writeback for a btree extent that doesn't belong to a log tree
+ * failed, increment the counter transaction->eb_write_errors.
+ * We do this because while the transaction is running and before it's
+ * committing (when we call filemap_fdata[write|wait]_range against
+ * the btree inode), we might have
+ * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
+ * returns an error or an error happens during writeback, when we're
+ * committing the transaction we wouldn't know about it, since the pages
+ * can be no longer dirty nor marked anymore for writeback (if a
+ * subsequent modification to the extent buffer didn't happen before the
+ * transaction commit), which makes filemap_fdata[write|wait]_range not
+ * able to find the pages tagged with SetPageError at transaction
+ * commit time. So if this happens we must abort the transaction,
+ * otherwise we commit a super block with btree roots that point to
+ * btree nodes/leafs whose content on disk is invalid - either garbage
+ * or the content of some node/leaf from a past generation that got
+ * cowed or deleted and is no longer valid.
+ *
+ * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
+ * not be enough - we need to distinguish between log tree extents vs
+ * non-log tree extents, and the next filemap_fdatawait_range() call
+ * will catch and clear such errors in the mapping - and that call might
+ * be from a log sync and not from a transaction commit. Also, checking
+ * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
+ * not done and would not be reliable - the eb might have been released
+ * from memory and reading it back again means that flag would not be
+ * set (since it's a runtime flag, not persisted on disk).
+ *
+ * Using the flags below in the btree inode also makes us achieve the
+ * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
+ * writeback for all dirty pages and before filemap_fdatawait_range()
+ * is called, the writeback for all dirty pages had already finished
+ * with errors - because we were not using AS_EIO/AS_ENOSPC,
+ * filemap_fdatawait_range() would return success, as it could not know
+ * that writeback errors happened (the pages were no longer tagged for
+ * writeback).
+ */
+ switch (eb->log_index) {
+ case -1:
+ set_bit(BTRFS_INODE_BTREE_ERR, &btree_ino->runtime_flags);
+ break;
+ case 0:
+ set_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
+ break;
+ case 1:
+ set_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
+ break;
+ default:
+ BUG(); /* unexpected, logic error */
+ }
+}
+
static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
{
struct bio_vec *bvec;
@@ -3534,10 +3676,9 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
BUG_ON(!eb);
done = atomic_dec_and_test(&eb->io_pages);
- if (err || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
- set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ if (err || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
ClearPageUptodate(page);
- SetPageError(page);
+ set_btree_ioerr(page);
}
end_page_writeback(page);
@@ -3564,14 +3705,14 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META;
int ret = 0;
- clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
num_pages = num_extent_pages(eb->start, eb->len);
atomic_set(&eb->io_pages, num_pages);
if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
bio_flags = EXTENT_BIO_TREE_LOG;
for (i = 0; i < num_pages; i++) {
- struct page *p = extent_buffer_page(eb, i);
+ struct page *p = eb->pages[i];
clear_page_dirty_for_io(p);
set_page_writeback(p);
@@ -3581,8 +3722,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
0, epd->bio_flags, bio_flags);
epd->bio_flags = bio_flags;
if (ret) {
- set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
- SetPageError(p);
+ set_btree_ioerr(p);
+ end_page_writeback(p);
if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
end_extent_buffer_writeback(eb);
ret = -EIO;
@@ -3595,7 +3736,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
if (unlikely(ret)) {
for (; i < num_pages; i++) {
- struct page *p = extent_buffer_page(eb, i);
+ struct page *p = eb->pages[i];
+ clear_page_dirty_for_io(p);
unlock_page(p);
}
}
@@ -4165,19 +4307,6 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
return NULL;
}
-static noinline int count_ext_ref(u64 inum, u64 offset, u64 root_id, void *ctx)
-{
- unsigned long cnt = *((unsigned long *)ctx);
-
- cnt++;
- *((unsigned long *)ctx) = cnt;
-
- /* Now we're sure that the extent is shared. */
- if (cnt > 1)
- return 1;
- return 0;
-}
-
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len, get_extent_t *get_extent)
{
@@ -4194,6 +4323,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
struct btrfs_path *path;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
int end = 0;
u64 em_start = 0;
u64 em_len = 0;
@@ -4207,15 +4337,15 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return -ENOMEM;
path->leave_spinning = 1;
- start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
- len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
+ start = round_down(start, BTRFS_I(inode)->root->sectorsize);
+ len = round_up(max, BTRFS_I(inode)->root->sectorsize) - start;
/*
* lookup the last file extent. We're not using i_size here
* because there might be preallocation past i_size
*/
- ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
- path, btrfs_ino(inode), -1, 0);
+ ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
+ 0);
if (ret < 0) {
btrfs_free_path(path);
return ret;
@@ -4223,7 +4353,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
WARN_ON(!ret);
path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
- found_type = btrfs_key_type(&found_key);
+ found_type = found_key.type;
/* No extents, but there might be delalloc bits */
if (found_key.objectid != btrfs_ino(inode) ||
@@ -4308,25 +4438,27 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
} else if (em->block_start == EXTENT_MAP_DELALLOC) {
flags |= (FIEMAP_EXTENT_DELALLOC |
FIEMAP_EXTENT_UNKNOWN);
- } else {
- unsigned long ref_cnt = 0;
+ } else if (fieinfo->fi_extents_max) {
+ u64 bytenr = em->block_start -
+ (em->start - em->orig_start);
disko = em->block_start + offset_in_extent;
/*
* As btrfs supports shared space, this information
* can be exported to userspace tools via
- * flag FIEMAP_EXTENT_SHARED.
+ * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0
+ * then we're just getting a count and we can skip the
+ * lookup stuff.
*/
- ret = iterate_inodes_from_logical(
- em->block_start,
- BTRFS_I(inode)->root->fs_info,
- path, count_ext_ref, &ref_cnt);
- if (ret < 0 && ret != -ENOENT)
+ ret = btrfs_check_shared(NULL, root->fs_info,
+ root->objectid,
+ btrfs_ino(inode), bytenr);
+ if (ret < 0)
goto out_free;
-
- if (ref_cnt > 1)
+ if (ret)
flags |= FIEMAP_EXTENT_SHARED;
+ ret = 0;
}
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
flags |= FIEMAP_EXTENT_ENCODED;
@@ -4380,24 +4512,21 @@ int extent_buffer_under_io(struct extent_buffer *eb)
/*
* Helper for releasing extent buffer page.
*/
-static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
- unsigned long start_idx)
+static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
{
unsigned long index;
- unsigned long num_pages;
struct page *page;
int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
BUG_ON(extent_buffer_under_io(eb));
- num_pages = num_extent_pages(eb->start, eb->len);
- index = start_idx + num_pages;
- if (start_idx >= index)
+ index = num_extent_pages(eb->start, eb->len);
+ if (index == 0)
return;
do {
index--;
- page = extent_buffer_page(eb, index);
+ page = eb->pages[index];
if (page && mapped) {
spin_lock(&page->mapping->private_lock);
/*
@@ -4428,7 +4557,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
/* One for when we alloced the page */
page_cache_release(page);
}
- } while (index != start_idx);
+ } while (index != 0);
}
/*
@@ -4436,7 +4565,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
*/
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
- btrfs_release_extent_buffer_page(eb, 0);
+ btrfs_release_extent_buffer_page(eb);
__free_extent_buffer(eb);
}
@@ -4579,7 +4708,8 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb,
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
- struct page *p = extent_buffer_page(eb, i);
+ struct page *p = eb->pages[i];
+
if (p != accessed)
mark_page_accessed(p);
}
@@ -4748,7 +4878,7 @@ again:
*/
SetPageChecked(eb->pages[0]);
for (i = 1; i < num_pages; i++) {
- p = extent_buffer_page(eb, i);
+ p = eb->pages[i];
ClearPageChecked(p);
unlock_page(p);
}
@@ -4793,7 +4923,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
}
/* Should be safe to release our pages at this point */
- btrfs_release_extent_buffer_page(eb, 0);
+ btrfs_release_extent_buffer_page(eb);
call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
return 1;
}
@@ -4859,7 +4989,7 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb)
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
if (!PageDirty(page))
continue;
@@ -4895,7 +5025,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb)
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
for (i = 0; i < num_pages; i++)
- set_page_dirty(extent_buffer_page(eb, i));
+ set_page_dirty(eb->pages[i]);
return was_dirty;
}
@@ -4908,7 +5038,7 @@ int clear_extent_buffer_uptodate(struct extent_buffer *eb)
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
if (page)
ClearPageUptodate(page);
}
@@ -4924,7 +5054,7 @@ int set_extent_buffer_uptodate(struct extent_buffer *eb)
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
SetPageUptodate(page);
}
return 0;
@@ -4964,7 +5094,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
num_pages = num_extent_pages(eb->start, eb->len);
for (i = start_i; i < num_pages; i++) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
if (wait == WAIT_NONE) {
if (!trylock_page(page))
goto unlock_exit;
@@ -4983,11 +5113,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
goto unlock_exit;
}
- clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = 0;
atomic_set(&eb->io_pages, num_reads);
for (i = start_i; i < num_pages; i++) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
if (!PageUptodate(page)) {
ClearPageError(page);
err = __extent_read_full_page(tree, page,
@@ -5012,7 +5142,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
return ret;
for (i = start_i; i < num_pages; i++) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
wait_on_page_locked(page);
if (!PageUptodate(page))
ret = -EIO;
@@ -5023,7 +5153,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
unlock_exit:
i = start_i;
while (locked_pages > 0) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
i++;
unlock_page(page);
locked_pages--;
@@ -5049,7 +5179,7 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
while (len > 0) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
cur = min(len, (PAGE_CACHE_SIZE - offset));
kaddr = page_address(page);
@@ -5081,7 +5211,7 @@ int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
while (len > 0) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
cur = min(len, (PAGE_CACHE_SIZE - offset));
kaddr = page_address(page);
@@ -5130,7 +5260,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
return -EINVAL;
}
- p = extent_buffer_page(eb, i);
+ p = eb->pages[i];
kaddr = page_address(p);
*map = kaddr + offset;
*map_len = PAGE_CACHE_SIZE - offset;
@@ -5156,7 +5286,7 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
while (len > 0) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
cur = min(len, (PAGE_CACHE_SIZE - offset));
@@ -5190,7 +5320,7 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
while (len > 0) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
WARN_ON(!PageUptodate(page));
cur = min(len, PAGE_CACHE_SIZE - offset);
@@ -5220,7 +5350,7 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
while (len > 0) {
- page = extent_buffer_page(eb, i);
+ page = eb->pages[i];
WARN_ON(!PageUptodate(page));
cur = min(len, PAGE_CACHE_SIZE - offset);
@@ -5251,7 +5381,7 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
(PAGE_CACHE_SIZE - 1);
while (len > 0) {
- page = extent_buffer_page(dst, i);
+ page = dst->pages[i];
WARN_ON(!PageUptodate(page));
cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
@@ -5329,8 +5459,7 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
cur = min_t(unsigned long, cur,
(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
- copy_pages(extent_buffer_page(dst, dst_i),
- extent_buffer_page(dst, src_i),
+ copy_pages(dst->pages[dst_i], dst->pages[src_i],
dst_off_in_page, src_off_in_page, cur);
src_offset += cur;
@@ -5376,8 +5505,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
cur = min_t(unsigned long, len, src_off_in_page + 1);
cur = min(cur, dst_off_in_page + 1);
- copy_pages(extent_buffer_page(dst, dst_i),
- extent_buffer_page(dst, src_i),
+ copy_pages(dst->pages[dst_i], dst->pages[src_i],
dst_off_in_page - cur + 1,
src_off_in_page - cur + 1, cur);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index ccc264e7bde1..6d4b938be986 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -11,8 +11,6 @@
#define EXTENT_NEW (1 << 4)
#define EXTENT_DELALLOC (1 << 5)
#define EXTENT_DEFRAG (1 << 6)
-#define EXTENT_DEFRAG_DONE (1 << 7)
-#define EXTENT_BUFFER_FILLED (1 << 8)
#define EXTENT_BOUNDARY (1 << 9)
#define EXTENT_NODATASUM (1 << 10)
#define EXTENT_DO_ACCOUNTING (1 << 11)
@@ -34,16 +32,16 @@
/* these are bit numbers for test/set bit */
#define EXTENT_BUFFER_UPTODATE 0
-#define EXTENT_BUFFER_BLOCKING 1
#define EXTENT_BUFFER_DIRTY 2
#define EXTENT_BUFFER_CORRUPT 3
#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
#define EXTENT_BUFFER_TREE_REF 5
#define EXTENT_BUFFER_STALE 6
#define EXTENT_BUFFER_WRITEBACK 7
-#define EXTENT_BUFFER_IOERR 8
+#define EXTENT_BUFFER_READ_ERR 8 /* read IO error */
#define EXTENT_BUFFER_DUMMY 9
#define EXTENT_BUFFER_IN_TREE 10
+#define EXTENT_BUFFER_WRITE_ERR 11 /* write IO error */
/* these are flags for extent_clear_unlock_delalloc */
#define PAGE_UNLOCK (1 << 0)
@@ -57,7 +55,6 @@
* map has page->private set to one.
*/
#define EXTENT_PAGE_PRIVATE 1
-#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
struct extent_state;
struct btrfs_root;
@@ -108,7 +105,6 @@ struct extent_state {
struct rb_node rb_node;
/* ADD NEW ELEMENTS AFTER THIS */
- struct extent_io_tree *tree;
wait_queue_head_t wq;
atomic_t refs;
unsigned long state;
@@ -126,8 +122,6 @@ struct extent_state {
struct extent_buffer {
u64 start;
unsigned long len;
- unsigned long map_start;
- unsigned long map_len;
unsigned long bflags;
struct btrfs_fs_info *fs_info;
spinlock_t refs_lock;
@@ -144,7 +138,9 @@ struct extent_buffer {
atomic_t blocking_readers;
atomic_t spinning_readers;
atomic_t spinning_writers;
- int lock_nested;
+ short lock_nested;
+ /* >= 0 if eb belongs to a log tree, -1 otherwise */
+ short log_index;
/* protects write locks */
rwlock_t lock;
@@ -286,12 +282,6 @@ static inline unsigned long num_extent_pages(u64 start, u64 len)
(start >> PAGE_CACHE_SHIFT);
}
-static inline struct page *extent_buffer_page(struct extent_buffer *eb,
- unsigned long i)
-{
- return eb->pages[i];
-}
-
static inline void extent_buffer_get(struct extent_buffer *eb)
{
atomic_inc(&eb->refs);
@@ -341,18 +331,50 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask);
struct btrfs_fs_info;
-int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
- u64 length, u64 logical, struct page *page,
- int mirror_num);
+int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
+ struct page *page, unsigned int pg_offset,
+ int mirror_num);
+int clean_io_failure(struct inode *inode, u64 start, struct page *page,
+ unsigned int pg_offset);
int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
int mirror_num);
+
+/*
+ * When IO fails, either with EIO or csum verification fails, we
+ * try other mirrors that might have a good copy of the data. This
+ * io_failure_record is used to record state as we go through all the
+ * mirrors. If another mirror has good data, the page is set up to date
+ * and things continue. If a good mirror can't be found, the original
+ * bio end_io callback is called to indicate things have failed.
+ */
+struct io_failure_record {
+ struct page *page;
+ u64 start;
+ u64 len;
+ u64 logical;
+ unsigned long bio_flags;
+ int this_mirror;
+ int failed_mirror;
+ int in_validation;
+};
+
+void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end);
+int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
+ struct io_failure_record **failrec_ret);
+int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
+ struct io_failure_record *failrec, int fail_mirror);
+struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
+ struct io_failure_record *failrec,
+ struct page *page, int pg_offset, int icsum,
+ bio_end_io_t *endio_func, void *data);
+int free_io_failure(struct inode *inode, struct io_failure_record *rec);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
noinline u64 find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree,
struct page *locked_page, u64 *start,
u64 *end, u64 max_bytes);
+#endif
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, unsigned long len);
#endif
-#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index f46cfe45d686..84a2d1868271 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -55,7 +55,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
return -ENOMEM;
file_key.objectid = objectid;
file_key.offset = pos;
- btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
+ file_key.type = BTRFS_EXTENT_DATA_KEY;
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &file_key,
@@ -100,7 +100,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
file_key.offset = bytenr;
- btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
+ file_key.type = BTRFS_EXTENT_CSUM_KEY;
ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
if (ret < 0)
goto fail;
@@ -111,7 +111,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
goto fail;
path->slots[0]--;
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY)
+ if (found_key.type != BTRFS_EXTENT_CSUM_KEY)
goto fail;
csum_offset = (bytenr - found_key.offset) >>
@@ -148,7 +148,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
file_key.objectid = objectid;
file_key.offset = offset;
- btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
+ file_key.type = BTRFS_EXTENT_DATA_KEY;
ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
return ret;
}
@@ -299,19 +299,9 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
}
int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
- struct btrfs_dio_private *dip, struct bio *bio,
- u64 offset)
+ struct bio *bio, u64 offset)
{
- int len = (bio->bi_iter.bi_sector << 9) - dip->disk_bytenr;
- u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
- int ret;
-
- len >>= inode->i_sb->s_blocksize_bits;
- len *= csum_size;
-
- ret = __btrfs_lookup_bio_sums(root, inode, bio, offset,
- (u32 *)(dip->csum + len), 1);
- return ret;
+ return __btrfs_lookup_bio_sums(root, inode, bio, offset, NULL, 1);
}
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
@@ -329,8 +319,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
u64 csum_end;
u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
- ASSERT(start == ALIGN(start, root->sectorsize) &&
- (end + 1) == ALIGN(end + 1, root->sectorsize));
+ ASSERT(IS_ALIGNED(start, root->sectorsize) &&
+ IS_ALIGNED(end + 1, root->sectorsize));
path = btrfs_alloc_path();
if (!path)
@@ -423,7 +413,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
ret = 0;
fail:
while (ret < 0 && !list_empty(&tmplist)) {
- sums = list_entry(&tmplist, struct btrfs_ordered_sum, list);
+ sums = list_entry(tmplist.next, struct btrfs_ordered_sum, list);
list_del(&sums->list);
kfree(sums);
}
@@ -720,7 +710,7 @@ again:
bytenr = sums->bytenr + total_bytes;
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
file_key.offset = bytenr;
- btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
+ file_key.type = BTRFS_EXTENT_CSUM_KEY;
item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
if (!IS_ERR(item)) {
@@ -756,7 +746,7 @@ again:
found_next = 1;
if (ret != 0)
goto insert;
- slot = 0;
+ slot = path->slots[0];
}
btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
@@ -790,7 +780,7 @@ again:
csum_offset = (bytenr - found_key.offset) >>
root->fs_info->sb->s_blocksize_bits;
- if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY ||
+ if (found_key.type != BTRFS_EXTENT_CSUM_KEY ||
found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) {
goto insert;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1f2b99cb55ea..a18ceabd99a8 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -299,7 +299,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
/* get the inode */
key.objectid = defrag->root;
- btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
index = srcu_read_lock(&fs_info->subvol_srcu);
@@ -311,7 +311,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
}
key.objectid = defrag->ino;
- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
if (IS_ERR(inode)) {
@@ -452,7 +452,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
if (unlikely(copied == 0))
break;
- if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
+ if (copied < PAGE_CACHE_SIZE - offset) {
offset += copied;
} else {
pg++;
@@ -1481,9 +1481,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
bool force_page_uptodate = false;
bool need_unlock;
- nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
- PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
- (sizeof(struct page *)));
+ nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_CACHE_SIZE),
+ PAGE_CACHE_SIZE / (sizeof(struct page *)));
nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
nrptrs = max(nrptrs, 8);
pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
@@ -1497,8 +1496,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
size_t write_bytes = min(iov_iter_count(i),
nrptrs * (size_t)PAGE_CACHE_SIZE -
offset);
- size_t num_pages = (write_bytes + offset +
- PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ size_t num_pages = DIV_ROUND_UP(write_bytes + offset,
+ PAGE_CACHE_SIZE);
size_t reserve_bytes;
size_t dirty_pages;
size_t copied;
@@ -1526,9 +1525,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
* our prealloc extent may be smaller than
* write_bytes, so scale down.
*/
- num_pages = (write_bytes + offset +
- PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
+ num_pages = DIV_ROUND_UP(write_bytes + offset,
+ PAGE_CACHE_SIZE);
reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
ret = 0;
} else {
@@ -1590,9 +1588,8 @@ again:
dirty_pages = 0;
} else {
force_page_uptodate = false;
- dirty_pages = (copied + offset +
- PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
+ dirty_pages = DIV_ROUND_UP(copied + offset,
+ PAGE_CACHE_SIZE);
}
/*
@@ -1653,7 +1650,7 @@ again:
cond_resched();
balance_dirty_pages_ratelimited(inode->i_mapping);
- if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
+ if (dirty_pages < (root->nodesize >> PAGE_CACHE_SHIFT) + 1)
btrfs_btree_balance_dirty(root);
pos += copied;
@@ -1795,7 +1792,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
if (sync)
atomic_inc(&BTRFS_I(inode)->sync_writers);
- if (unlikely(file->f_flags & O_DIRECT)) {
+ if (file->f_flags & O_DIRECT) {
num_written = __btrfs_direct_write(iocb, from, pos);
} else {
num_written = __btrfs_buffered_write(file, from, pos);
@@ -1838,6 +1835,8 @@ out:
int btrfs_release_file(struct inode *inode, struct file *filp)
{
+ if (filp->private_data)
+ btrfs_ioctl_trans_end(filp);
/*
* ordered_data_close is set by settattr when we are about to truncate
* a file from a non-zero size to a zero size. This tries to
@@ -1845,29 +1844,25 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
* application were using truncate to replace a file in place.
*/
if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
- &BTRFS_I(inode)->runtime_flags)) {
- struct btrfs_trans_handle *trans;
- struct btrfs_root *root = BTRFS_I(inode)->root;
-
- /*
- * We need to block on a committing transaction to keep us from
- * throwing a ordered operation on to the list and causing
- * something like sync to deadlock trying to flush out this
- * inode.
- */
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
- btrfs_end_transaction(trans, root);
- if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+ &BTRFS_I(inode)->runtime_flags))
filemap_flush(inode->i_mapping);
- }
- if (filp->private_data)
- btrfs_ioctl_trans_end(filp);
return 0;
}
+static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
+{
+ int ret;
+
+ atomic_inc(&BTRFS_I(inode)->sync_writers);
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+ if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+ atomic_dec(&BTRFS_I(inode)->sync_writers);
+
+ return ret;
+}
+
/*
* fsync call for both files and directories. This logs the inode into
* the tree log instead of forcing full commits whenever possible.
@@ -1897,30 +1892,64 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* multi-task, and make the performance up. See
* btrfs_wait_ordered_range for an explanation of the ASYNC check.
*/
- atomic_inc(&BTRFS_I(inode)->sync_writers);
- ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
- if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
- atomic_dec(&BTRFS_I(inode)->sync_writers);
+ ret = start_ordered_ops(inode, start, end);
if (ret)
return ret;
mutex_lock(&inode->i_mutex);
-
- /*
- * We flush the dirty pages again to avoid some dirty pages in the
- * range being left.
- */
atomic_inc(&root->log_batch);
full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
+ /*
+ * We might have have had more pages made dirty after calling
+ * start_ordered_ops and before acquiring the inode's i_mutex.
+ */
if (full_sync) {
+ /*
+ * For a full sync, we need to make sure any ordered operations
+ * start and finish before we start logging the inode, so that
+ * all extents are persisted and the respective file extent
+ * items are in the fs/subvol btree.
+ */
ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
- if (ret) {
- mutex_unlock(&inode->i_mutex);
- goto out;
- }
+ } else {
+ /*
+ * Start any new ordered operations before starting to log the
+ * inode. We will wait for them to finish in btrfs_sync_log().
+ *
+ * Right before acquiring the inode's mutex, we might have new
+ * writes dirtying pages, which won't immediately start the
+ * respective ordered operations - that is done through the
+ * fill_delalloc callbacks invoked from the writepage and
+ * writepages address space operations. So make sure we start
+ * all ordered operations before starting to log our inode. Not
+ * doing this means that while logging the inode, writeback
+ * could start and invoke writepage/writepages, which would call
+ * the fill_delalloc callbacks (cow_file_range,
+ * submit_compressed_extents). These callbacks add first an
+ * extent map to the modified list of extents and then create
+ * the respective ordered operation, which means in
+ * tree-log.c:btrfs_log_inode() we might capture all existing
+ * ordered operations (with btrfs_get_logged_extents()) before
+ * the fill_delalloc callback adds its ordered operation, and by
+ * the time we visit the modified list of extent maps (with
+ * btrfs_log_changed_extents()), we see and process the extent
+ * map they created. We then use the extent map to construct a
+ * file extent item for logging without waiting for the
+ * respective ordered operation to finish - this file extent
+ * item points to a disk location that might not have yet been
+ * written to, containing random data - so after a crash a log
+ * replay will make our inode have file extent items that point
+ * to disk locations containing invalid data, as we returned
+ * success to userspace without waiting for the respective
+ * ordered operation to finish, because it wasn't captured by
+ * btrfs_get_logged_extents().
+ */
+ ret = start_ordered_ops(inode, start, end);
+ }
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ goto out;
}
atomic_inc(&root->log_batch);
@@ -1982,7 +2011,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
btrfs_init_log_ctx(&ctx);
- ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
+ ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
if (ret < 0) {
/* Fallthrough and commit/free transaction. */
ret = 1;
@@ -2000,6 +2029,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
mutex_unlock(&inode->i_mutex);
+ /*
+ * If any of the ordered extents had an error, just return it to user
+ * space, so that the application knows some writes didn't succeed and
+ * can take proper action (retry for e.g.). Blindly committing the
+ * transaction in this case, would fool userspace that everything was
+ * successful. And we also want to make sure our log doesn't contain
+ * file extent items pointing to extents that weren't fully written to -
+ * just like in the non fast fsync path, where we check for the ordered
+ * operation's error flag before writing to the log tree and return -EIO
+ * if any of them had this flag set (btrfs_wait_ordered_range) -
+ * therefore we need to check for errors in the ordered operations,
+ * which are indicated by ctx.io_err.
+ */
+ if (ctx.io_err) {
+ btrfs_end_transaction(trans, root);
+ ret = ctx.io_err;
+ goto out;
+ }
+
if (ret != BTRFS_NO_LOG_SYNC) {
if (!ret) {
ret = btrfs_sync_log(trans, root, &ctx);
@@ -2112,10 +2160,9 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
goto out;
}
- if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) {
+ if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
u64 num_bytes;
- path->slots[0]++;
key.offset = offset;
btrfs_set_item_key_safe(root, path, &key);
fi = btrfs_item_ptr(leaf, path->slots[0],
@@ -2240,7 +2287,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out_only_mutex;
}
- lockstart = round_up(offset , BTRFS_I(inode)->root->sectorsize);
+ lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
lockend = round_down(offset + len,
BTRFS_I(inode)->root->sectorsize) - 1;
same_page = ((offset >> PAGE_CACHE_SHIFT) ==
@@ -2301,7 +2348,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
tail_start + tail_len, 0, 1);
if (ret)
goto out_only_mutex;
- }
+ }
}
}
@@ -2638,23 +2685,28 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
- u64 lockstart = *offset;
- u64 lockend = i_size_read(inode);
- u64 start = *offset;
- u64 len = i_size_read(inode);
+ u64 lockstart;
+ u64 lockend;
+ u64 start;
+ u64 len;
int ret = 0;
- lockend = max_t(u64, root->sectorsize, lockend);
+ if (inode->i_size == 0)
+ return -ENXIO;
+
+ /*
+ * *offset can be negative, in this case we start finding DATA/HOLE from
+ * the very start of the file.
+ */
+ start = max_t(loff_t, 0, *offset);
+
+ lockstart = round_down(start, root->sectorsize);
+ lockend = round_up(i_size_read(inode), root->sectorsize);
if (lockend <= lockstart)
lockend = lockstart + root->sectorsize;
-
lockend--;
len = lockend - lockstart + 1;
- len = max_t(u64, len, root->sectorsize);
- if (inode->i_size == 0)
- return -ENXIO;
-
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
&cached_state);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 2b0a627cb5f9..33848196550e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -279,8 +279,7 @@ static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
int num_pages;
int check_crcs = 0;
- num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
+ num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
check_crcs = 1;
@@ -1998,6 +1997,128 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
return merged;
}
+static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ bool update_stat)
+{
+ struct btrfs_free_space *bitmap;
+ unsigned long i;
+ unsigned long j;
+ const u64 end = info->offset + info->bytes;
+ const u64 bitmap_offset = offset_to_bitmap(ctl, end);
+ u64 bytes;
+
+ bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
+ if (!bitmap)
+ return false;
+
+ i = offset_to_bit(bitmap->offset, ctl->unit, end);
+ j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
+ if (j == i)
+ return false;
+ bytes = (j - i) * ctl->unit;
+ info->bytes += bytes;
+
+ if (update_stat)
+ bitmap_clear_bits(ctl, bitmap, end, bytes);
+ else
+ __bitmap_clear_bits(ctl, bitmap, end, bytes);
+
+ if (!bitmap->bytes)
+ free_bitmap(ctl, bitmap);
+
+ return true;
+}
+
+static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ bool update_stat)
+{
+ struct btrfs_free_space *bitmap;
+ u64 bitmap_offset;
+ unsigned long i;
+ unsigned long j;
+ unsigned long prev_j;
+ u64 bytes;
+
+ bitmap_offset = offset_to_bitmap(ctl, info->offset);
+ /* If we're on a boundary, try the previous logical bitmap. */
+ if (bitmap_offset == info->offset) {
+ if (info->offset == 0)
+ return false;
+ bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
+ }
+
+ bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
+ if (!bitmap)
+ return false;
+
+ i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
+ j = 0;
+ prev_j = (unsigned long)-1;
+ for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
+ if (j > i)
+ break;
+ prev_j = j;
+ }
+ if (prev_j == i)
+ return false;
+
+ if (prev_j == (unsigned long)-1)
+ bytes = (i + 1) * ctl->unit;
+ else
+ bytes = (i - prev_j) * ctl->unit;
+
+ info->offset -= bytes;
+ info->bytes += bytes;
+
+ if (update_stat)
+ bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
+ else
+ __bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
+
+ if (!bitmap->bytes)
+ free_bitmap(ctl, bitmap);
+
+ return true;
+}
+
+/*
+ * We prefer always to allocate from extent entries, both for clustered and
+ * non-clustered allocation requests. So when attempting to add a new extent
+ * entry, try to see if there's adjacent free space in bitmap entries, and if
+ * there is, migrate that space from the bitmaps to the extent.
+ * Like this we get better chances of satisfying space allocation requests
+ * because we attempt to satisfy them based on a single cache entry, and never
+ * on 2 or more entries - even if the entries represent a contiguous free space
+ * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
+ * ends).
+ */
+static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ bool update_stat)
+{
+ /*
+ * Only work with disconnected entries, as we can change their offset,
+ * and must be extent entries.
+ */
+ ASSERT(!info->bitmap);
+ ASSERT(RB_EMPTY_NODE(&info->offset_index));
+
+ if (ctl->total_bitmaps > 0) {
+ bool stole_end;
+ bool stole_front = false;
+
+ stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
+ if (ctl->total_bitmaps > 0)
+ stole_front = steal_from_bitmap_to_front(ctl, info,
+ update_stat);
+
+ if (stole_end || stole_front)
+ try_merge_free_space(ctl, info, update_stat);
+ }
+}
+
int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
u64 offset, u64 bytes)
{
@@ -2010,6 +2131,7 @@ int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
info->offset = offset;
info->bytes = bytes;
+ RB_CLEAR_NODE(&info->offset_index);
spin_lock(&ctl->tree_lock);
@@ -2029,6 +2151,14 @@ int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
goto out;
}
link:
+ /*
+ * Only steal free space from adjacent bitmaps if we're sure we're not
+ * going to add the new free space to existing bitmap entries - because
+ * that would mean unnecessary work that would be reverted. Therefore
+ * attempt to steal space from bitmaps if we're adding an extent entry.
+ */
+ steal_from_bitmap(ctl, info, true);
+
ret = link_free_space(ctl, info);
if (ret)
kmem_cache_free(btrfs_free_space_cachep, info);
@@ -2205,10 +2335,13 @@ __btrfs_return_cluster_to_free_space(
entry = rb_entry(node, struct btrfs_free_space, offset_index);
node = rb_next(&entry->offset_index);
rb_erase(&entry->offset_index, &cluster->root);
+ RB_CLEAR_NODE(&entry->offset_index);
bitmap = (entry->bitmap != NULL);
- if (!bitmap)
+ if (!bitmap) {
try_merge_free_space(ctl, entry, false);
+ steal_from_bitmap(ctl, entry, false);
+ }
tree_insert_offset(&ctl->free_space_offset,
entry->offset, &entry->offset_index, bitmap);
}
@@ -3033,10 +3166,10 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root,
{
struct inode *inode = NULL;
- spin_lock(&root->cache_lock);
- if (root->cache_inode)
- inode = igrab(root->cache_inode);
- spin_unlock(&root->cache_lock);
+ spin_lock(&root->ino_cache_lock);
+ if (root->ino_cache_inode)
+ inode = igrab(root->ino_cache_inode);
+ spin_unlock(&root->ino_cache_lock);
if (inode)
return inode;
@@ -3044,10 +3177,10 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root,
if (IS_ERR(inode))
return inode;
- spin_lock(&root->cache_lock);
+ spin_lock(&root->ino_cache_lock);
if (!btrfs_fs_closing(root->fs_info))
- root->cache_inode = igrab(inode);
- spin_unlock(&root->cache_lock);
+ root->ino_cache_inode = igrab(inode);
+ spin_unlock(&root->ino_cache_lock);
return inode;
}
@@ -3176,6 +3309,7 @@ again:
map = NULL;
add_new_bitmap(ctl, info, offset);
bitmap_info = info;
+ info = NULL;
}
bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
@@ -3186,6 +3320,8 @@ again:
if (bytes)
goto again;
+ if (info)
+ kmem_cache_free(btrfs_free_space_cachep, info);
if (map)
kfree(map);
return 0;
@@ -3260,6 +3396,7 @@ have_info:
goto have_info;
}
+ ret = 0;
goto out;
}
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
index 85889aa82c62..aae520b2aee5 100644
--- a/fs/btrfs/hash.c
+++ b/fs/btrfs/hash.c
@@ -20,10 +20,8 @@ static struct crypto_shash *tfm;
int __init btrfs_hash_init(void)
{
tfm = crypto_alloc_shash("crc32c", 0, 0);
- if (IS_ERR(tfm))
- return PTR_ERR(tfm);
- return 0;
+ return PTR_ERR_OR_ZERO(tfm);
}
void btrfs_hash_exit(void)
@@ -33,18 +31,16 @@ void btrfs_hash_exit(void)
u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length)
{
- struct {
- struct shash_desc shash;
- char ctx[crypto_shash_descsize(tfm)];
- } desc;
+ SHASH_DESC_ON_STACK(shash, tfm);
+ u32 *ctx = (u32 *)shash_desc_ctx(shash);
int err;
- desc.shash.tfm = tfm;
- desc.shash.flags = 0;
- *(u32 *)desc.ctx = crc;
+ shash->tfm = tfm;
+ shash->flags = 0;
+ *ctx = crc;
- err = crypto_shash_update(&desc.shash, address, length);
+ err = crypto_shash_update(shash, address, length);
BUG_ON(err);
- return *(u32 *)desc.ctx;
+ return *ctx;
}
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 2be38df703c9..8ffa4783cbf4 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -135,7 +135,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
u32 item_size;
key.objectid = inode_objectid;
- btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+ key.type = BTRFS_INODE_EXTREF_KEY;
key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
path = btrfs_alloc_path();
@@ -209,7 +209,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
key.objectid = inode_objectid;
key.offset = ref_objectid;
- btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
+ key.type = BTRFS_INODE_REF_KEY;
path = btrfs_alloc_path();
if (!path)
@@ -337,7 +337,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
key.objectid = inode_objectid;
key.offset = ref_objectid;
- btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
+ key.type = BTRFS_INODE_REF_KEY;
path = btrfs_alloc_path();
if (!path)
@@ -400,7 +400,7 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
struct btrfs_key key;
int ret;
key.objectid = objectid;
- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
ret = btrfs_insert_empty_item(trans, root, path, &key,
@@ -420,13 +420,13 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
struct btrfs_key found_key;
ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
- if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY &&
+ if (ret > 0 && location->type == BTRFS_ROOT_ITEM_KEY &&
location->offset == (u64)-1 && path->slots[0] != 0) {
slot = path->slots[0] - 1;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.objectid == location->objectid &&
- btrfs_key_type(&found_key) == btrfs_key_type(location)) {
+ found_key.type == location->type) {
path->slots[0]--;
return 0;
}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 888fbe19079f..83d646bd2e4b 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -87,7 +87,7 @@ again:
*/
btrfs_item_key_to_cpu(leaf, &key, 0);
btrfs_release_path(path);
- root->cache_progress = last;
+ root->ino_cache_progress = last;
up_read(&fs_info->commit_root_sem);
schedule_timeout(1);
goto again;
@@ -106,7 +106,7 @@ again:
if (last != (u64)-1 && last + 1 != key.objectid) {
__btrfs_add_free_space(ctl, last + 1,
key.objectid - last - 1);
- wake_up(&root->cache_wait);
+ wake_up(&root->ino_cache_wait);
}
last = key.objectid;
@@ -119,14 +119,14 @@ next:
root->highest_objectid - last - 1);
}
- spin_lock(&root->cache_lock);
- root->cached = BTRFS_CACHE_FINISHED;
- spin_unlock(&root->cache_lock);
+ spin_lock(&root->ino_cache_lock);
+ root->ino_cache_state = BTRFS_CACHE_FINISHED;
+ spin_unlock(&root->ino_cache_lock);
- root->cache_progress = (u64)-1;
+ root->ino_cache_progress = (u64)-1;
btrfs_unpin_free_ino(root);
out:
- wake_up(&root->cache_wait);
+ wake_up(&root->ino_cache_wait);
up_read(&fs_info->commit_root_sem);
btrfs_free_path(path);
@@ -144,20 +144,20 @@ static void start_caching(struct btrfs_root *root)
if (!btrfs_test_opt(root, INODE_MAP_CACHE))
return;
- spin_lock(&root->cache_lock);
- if (root->cached != BTRFS_CACHE_NO) {
- spin_unlock(&root->cache_lock);
+ spin_lock(&root->ino_cache_lock);
+ if (root->ino_cache_state != BTRFS_CACHE_NO) {
+ spin_unlock(&root->ino_cache_lock);
return;
}
- root->cached = BTRFS_CACHE_STARTED;
- spin_unlock(&root->cache_lock);
+ root->ino_cache_state = BTRFS_CACHE_STARTED;
+ spin_unlock(&root->ino_cache_lock);
ret = load_free_ino_cache(root->fs_info, root);
if (ret == 1) {
- spin_lock(&root->cache_lock);
- root->cached = BTRFS_CACHE_FINISHED;
- spin_unlock(&root->cache_lock);
+ spin_lock(&root->ino_cache_lock);
+ root->ino_cache_state = BTRFS_CACHE_FINISHED;
+ spin_unlock(&root->ino_cache_lock);
return;
}
@@ -196,11 +196,11 @@ again:
start_caching(root);
- wait_event(root->cache_wait,
- root->cached == BTRFS_CACHE_FINISHED ||
+ wait_event(root->ino_cache_wait,
+ root->ino_cache_state == BTRFS_CACHE_FINISHED ||
root->free_ino_ctl->free_space > 0);
- if (root->cached == BTRFS_CACHE_FINISHED &&
+ if (root->ino_cache_state == BTRFS_CACHE_FINISHED &&
root->free_ino_ctl->free_space == 0)
return -ENOSPC;
else
@@ -214,17 +214,17 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
if (!btrfs_test_opt(root, INODE_MAP_CACHE))
return;
again:
- if (root->cached == BTRFS_CACHE_FINISHED) {
+ if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
__btrfs_add_free_space(pinned, objectid, 1);
} else {
down_write(&root->fs_info->commit_root_sem);
- spin_lock(&root->cache_lock);
- if (root->cached == BTRFS_CACHE_FINISHED) {
- spin_unlock(&root->cache_lock);
+ spin_lock(&root->ino_cache_lock);
+ if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
+ spin_unlock(&root->ino_cache_lock);
up_write(&root->fs_info->commit_root_sem);
goto again;
}
- spin_unlock(&root->cache_lock);
+ spin_unlock(&root->ino_cache_lock);
start_caching(root);
@@ -235,10 +235,10 @@ again:
}
/*
- * When a transaction is committed, we'll move those inode numbers which
- * are smaller than root->cache_progress from pinned tree to free_ino tree,
- * and others will just be dropped, because the commit root we were
- * searching has changed.
+ * When a transaction is committed, we'll move those inode numbers which are
+ * smaller than root->ino_cache_progress from pinned tree to free_ino tree, and
+ * others will just be dropped, because the commit root we were searching has
+ * changed.
*
* Must be called with root->fs_info->commit_root_sem held
*/
@@ -261,10 +261,10 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
info = rb_entry(n, struct btrfs_free_space, offset_index);
BUG_ON(info->bitmap); /* Logic error */
- if (info->offset > root->cache_progress)
+ if (info->offset > root->ino_cache_progress)
goto free;
- else if (info->offset + info->bytes > root->cache_progress)
- count = root->cache_progress - info->offset + 1;
+ else if (info->offset + info->bytes > root->ino_cache_progress)
+ count = root->ino_cache_progress - info->offset + 1;
else
count = info->bytes;
@@ -462,13 +462,13 @@ again:
}
}
- spin_lock(&root->cache_lock);
- if (root->cached != BTRFS_CACHE_FINISHED) {
+ spin_lock(&root->ino_cache_lock);
+ if (root->ino_cache_state != BTRFS_CACHE_FINISHED) {
ret = -1;
- spin_unlock(&root->cache_lock);
+ spin_unlock(&root->ino_cache_lock);
goto out_put;
}
- spin_unlock(&root->cache_lock);
+ spin_unlock(&root->ino_cache_lock);
spin_lock(&ctl->tree_lock);
prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3668048e16f8..d23362f4464e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -153,7 +153,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
key.objectid = btrfs_ino(inode);
key.offset = start;
- btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+ key.type = BTRFS_EXTENT_DATA_KEY;
datasize = btrfs_file_extent_calc_inline_size(cur_size);
path->leave_spinning = 1;
@@ -249,8 +249,8 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
data_len = compressed_size;
if (start > 0 ||
- actual_end >= PAGE_CACHE_SIZE ||
- data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
+ actual_end > PAGE_CACHE_SIZE ||
+ data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
(!compressed_size &&
(actual_end & (root->sectorsize - 1)) == 0) ||
end + 1 < isize ||
@@ -348,6 +348,23 @@ static noinline int add_async_extent(struct async_cow *cow,
return 0;
}
+static inline int inode_need_compress(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ /* force compress */
+ if (btrfs_test_opt(root, FORCE_COMPRESS))
+ return 1;
+ /* bad compression ratios */
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
+ return 0;
+ if (btrfs_test_opt(root, COMPRESS) ||
+ BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
+ BTRFS_I(inode)->force_compress)
+ return 1;
+ return 0;
+}
+
/*
* we create compressed extents in two phases. The first
* phase compresses a range of pages that have already been
@@ -444,10 +461,7 @@ again:
* inode has not been flagged as nocompress. This flag can
* change at any time if we discover bad compression ratios.
*/
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
- (btrfs_test_opt(root, COMPRESS) ||
- (BTRFS_I(inode)->force_compress) ||
- (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
+ if (inode_need_compress(inode)) {
WARN_ON(pages);
pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
if (!pages) {
@@ -709,6 +723,18 @@ retry:
unlock_extent(io_tree, async_extent->start,
async_extent->start +
async_extent->ram_size - 1);
+
+ /*
+ * we need to redirty the pages if we decide to
+ * fallback to uncompressed IO, otherwise we
+ * will not submit these pages down to lower
+ * layers.
+ */
+ extent_range_redirty_for_io(inode,
+ async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1);
+
goto retry;
}
goto out_free;
@@ -766,8 +792,12 @@ retry:
ins.offset,
BTRFS_ORDERED_COMPRESSED,
async_extent->compress_type);
- if (ret)
+ if (ret) {
+ btrfs_drop_extent_cache(inode, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1, 0);
goto out_free_reserve;
+ }
/*
* clear dirty, set writeback and unlock the pages.
@@ -959,14 +989,14 @@ static noinline int cow_file_range(struct inode *inode,
ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
ram_size, cur_alloc_size, 0);
if (ret)
- goto out_reserve;
+ goto out_drop_extent_cache;
if (root->root_key.objectid ==
BTRFS_DATA_RELOC_TREE_OBJECTID) {
ret = btrfs_reloc_clone_csums(inode, start,
cur_alloc_size);
if (ret)
- goto out_reserve;
+ goto out_drop_extent_cache;
}
if (disk_num_bytes < cur_alloc_size)
@@ -994,6 +1024,8 @@ static noinline int cow_file_range(struct inode *inode,
out:
return ret;
+out_drop_extent_cache:
+ btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
out_reserve:
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_unlock:
@@ -1076,7 +1108,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
async_cow->locked_page = locked_page;
async_cow->start = start;
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
+ !btrfs_test_opt(root, FORCE_COMPRESS))
cur_end = end;
else
cur_end = min(end, start + 512 * 1024 - 1);
@@ -1084,8 +1117,10 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
async_cow->end = cur_end;
INIT_LIST_HEAD(&async_cow->extents);
- btrfs_init_work(&async_cow->work, async_cow_start,
- async_cow_submit, async_cow_free);
+ btrfs_init_work(&async_cow->work,
+ btrfs_delalloc_helper,
+ async_cow_start, async_cow_submit,
+ async_cow_free);
nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
PAGE_CACHE_SHIFT;
@@ -1425,6 +1460,26 @@ error:
return ret;
}
+static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
+{
+
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
+ return 0;
+
+ /*
+ * @defrag_bytes is a hint value, no spinlock held here,
+ * if is not zero, it means the file is defragging.
+ * Force cow if given extent needs to be defragged.
+ */
+ if (BTRFS_I(inode)->defrag_bytes &&
+ test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
+ EXTENT_DEFRAG, 0, NULL))
+ return 1;
+
+ return 0;
+}
+
/*
* extent_io.c call back to do delayed allocation processing
*/
@@ -1433,17 +1488,15 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
unsigned long *nr_written)
{
int ret;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ int force_cow = need_force_cow(inode, start, end);
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 1, nr_written);
- } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {
+ } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 0, nr_written);
- } else if (!btrfs_test_opt(root, COMPRESS) &&
- !(BTRFS_I(inode)->force_compress) &&
- !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
+ } else if (!inode_need_compress(inode)) {
ret = cow_file_range(inode, locked_page, start, end,
page_started, nr_written, 1);
} else {
@@ -1535,6 +1588,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
struct extent_state *state, unsigned long *bits)
{
+ if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
+ WARN_ON(1);
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testing for the DELALLOC
@@ -1557,6 +1612,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
root->fs_info->delalloc_batch);
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->delalloc_bytes += len;
+ if (*bits & EXTENT_DEFRAG)
+ BTRFS_I(inode)->defrag_bytes += len;
if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
&BTRFS_I(inode)->runtime_flags))
btrfs_add_delalloc_inodes(root, inode);
@@ -1571,6 +1628,13 @@ static void btrfs_clear_bit_hook(struct inode *inode,
struct extent_state *state,
unsigned long *bits)
{
+ u64 len = state->end + 1 - state->start;
+
+ spin_lock(&BTRFS_I(inode)->lock);
+ if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
+ BTRFS_I(inode)->defrag_bytes -= len;
+ spin_unlock(&BTRFS_I(inode)->lock);
+
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testing for the DELALLOC
@@ -1578,7 +1642,6 @@ static void btrfs_clear_bit_hook(struct inode *inode,
*/
if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
- u64 len = state->end + 1 - state->start;
bool do_list = !btrfs_is_free_space_inode(inode);
if (*bits & EXTENT_FIRST_DELALLOC) {
@@ -1869,7 +1932,8 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
SetPageChecked(page);
page_cache_get(page);
- btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
+ btrfs_init_work(&fixup->work, btrfs_fixup_helper,
+ btrfs_writepage_fixup_worker, NULL, NULL);
fixup->page = page;
btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
return -EBUSY;
@@ -2639,6 +2703,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
+ btrfs_free_io_failure_record(inode, ordered_extent->file_offset,
+ ordered_extent->file_offset +
+ ordered_extent->len - 1);
+
if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
truncated = true;
logical_len = ordered_extent->truncated_len;
@@ -2810,7 +2878,8 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
struct inode *inode = page->mapping->host;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ordered_extent *ordered_extent = NULL;
- struct btrfs_workqueue *workers;
+ struct btrfs_workqueue *wq;
+ btrfs_work_func_t func;
trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
@@ -2819,15 +2888,53 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
end - start + 1, uptodate))
return 0;
- btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
+ if (btrfs_is_free_space_inode(inode)) {
+ wq = root->fs_info->endio_freespace_worker;
+ func = btrfs_freespace_write_helper;
+ } else {
+ wq = root->fs_info->endio_write_workers;
+ func = btrfs_endio_write_helper;
+ }
- if (btrfs_is_free_space_inode(inode))
- workers = root->fs_info->endio_freespace_worker;
- else
- workers = root->fs_info->endio_write_workers;
- btrfs_queue_work(workers, &ordered_extent->work);
+ btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
+ NULL);
+ btrfs_queue_work(wq, &ordered_extent->work);
+
+ return 0;
+}
+
+static int __readpage_endio_check(struct inode *inode,
+ struct btrfs_io_bio *io_bio,
+ int icsum, struct page *page,
+ int pgoff, u64 start, size_t len)
+{
+ char *kaddr;
+ u32 csum_expected;
+ u32 csum = ~(u32)0;
+ static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ csum_expected = *(((u32 *)io_bio->csum) + icsum);
+
+ kaddr = kmap_atomic(page);
+ csum = btrfs_csum_data(kaddr + pgoff, csum, len);
+ btrfs_csum_final(csum, (char *)&csum);
+ if (csum != csum_expected)
+ goto zeroit;
+ kunmap_atomic(kaddr);
return 0;
+zeroit:
+ if (__ratelimit(&_rs))
+ btrfs_info(BTRFS_I(inode)->root->fs_info,
+ "csum failed ino %llu off %llu csum %u expected csum %u",
+ btrfs_ino(inode), start, csum, csum_expected);
+ memset(kaddr + pgoff, 1, len);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr);
+ if (csum_expected == 0)
+ return 0;
+ return -EIO;
}
/*
@@ -2842,20 +2949,15 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
size_t offset = start - page_offset(page);
struct inode *inode = page->mapping->host;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- char *kaddr;
struct btrfs_root *root = BTRFS_I(inode)->root;
- u32 csum_expected;
- u32 csum = ~(u32)0;
- static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
if (PageChecked(page)) {
ClearPageChecked(page);
- goto good;
+ return 0;
}
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
- goto good;
+ return 0;
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
@@ -2865,28 +2967,8 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
}
phy_offset >>= inode->i_sb->s_blocksize_bits;
- csum_expected = *(((u32 *)io_bio->csum) + phy_offset);
-
- kaddr = kmap_atomic(page);
- csum = btrfs_csum_data(kaddr + offset, csum, end - start + 1);
- btrfs_csum_final(csum, (char *)&csum);
- if (csum != csum_expected)
- goto zeroit;
-
- kunmap_atomic(kaddr);
-good:
- return 0;
-
-zeroit:
- if (__ratelimit(&_rs))
- btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
- btrfs_ino(page->mapping->host), start, csum, csum_expected);
- memset(kaddr + offset, 1, end - start + 1);
- flush_dcache_page(page);
- kunmap_atomic(kaddr);
- if (csum_expected == 0)
- return 0;
- return -EIO;
+ return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
+ start, (size_t)(end - start + 1));
}
struct delayed_iput {
@@ -3133,7 +3215,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
path->reada = -1;
key.objectid = BTRFS_ORPHAN_OBJECTID;
- btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = (u64)-1;
while (1) {
@@ -3160,7 +3242,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
/* make sure the item matches what we want */
if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
break;
- if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
+ if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
break;
/* release the path since we're done with it */
@@ -3636,7 +3718,8 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
* without delay
*/
if (!btrfs_is_free_space_inode(inode)
- && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+ && !root->fs_info->log_root_recovering) {
btrfs_update_root_times(trans, root);
ret = btrfs_delayed_update_inode(trans, root, inode);
@@ -4059,7 +4142,7 @@ search_again:
fi = NULL;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- found_type = btrfs_key_type(&found_key);
+ found_type = found_key.type;
if (found_key.objectid != ino)
break;
@@ -4222,7 +4305,8 @@ out:
btrfs_abort_transaction(trans, root, ret);
}
error:
- if (last_size != (u64)-1)
+ if (last_size != (u64)-1 &&
+ root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
btrfs_ordered_update_i_size(inode, last_size, NULL);
btrfs_free_path(path);
return err;
@@ -4662,6 +4746,11 @@ static void evict_inode_truncate_pages(struct inode *inode)
clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
remove_extent_mapping(map_tree, em);
free_extent_map(em);
+ if (need_resched()) {
+ write_unlock(&map_tree->lock);
+ cond_resched();
+ write_lock(&map_tree->lock);
+ }
}
write_unlock(&map_tree->lock);
@@ -4684,6 +4773,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
&cached_state, GFP_NOFS);
free_extent_state(state);
+ cond_resched();
spin_lock(&io_tree->lock);
}
spin_unlock(&io_tree->lock);
@@ -4714,6 +4804,8 @@ void btrfs_evict_inode(struct inode *inode)
/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ btrfs_free_io_failure_record(inode, 0, (u64)-1);
+
if (root->fs_info->log_root_recovering) {
BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
&BTRFS_I(inode)->runtime_flags));
@@ -5262,7 +5354,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
btrfs_get_delayed_items(inode, &ins_list, &del_list);
}
- btrfs_set_key_type(&key, key_type);
+ key.type = key_type;
key.offset = ctx->pos;
key.objectid = btrfs_ino(inode);
@@ -5287,7 +5379,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
if (found_key.objectid != key.objectid)
break;
- if (btrfs_key_type(&found_key) != key_type)
+ if (found_key.type != key_type)
break;
if (found_key.offset < ctx->pos)
goto next;
@@ -5499,7 +5591,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
int ret;
key.objectid = btrfs_ino(inode);
- btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+ key.type = BTRFS_DIR_INDEX_KEY;
key.offset = (u64)-1;
path = btrfs_alloc_path();
@@ -5531,7 +5623,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
if (found_key.objectid != btrfs_ino(inode) ||
- btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
+ found_key.type != BTRFS_DIR_INDEX_KEY) {
BTRFS_I(inode)->index_cnt = 2;
goto out;
}
@@ -5565,6 +5657,17 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index)
return ret;
}
+static int btrfs_insert_inode_locked(struct inode *inode)
+{
+ struct btrfs_iget_args args;
+ args.location = &BTRFS_I(inode)->location;
+ args.root = BTRFS_I(inode)->root;
+
+ return insert_inode_locked4(inode,
+ btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
+ btrfs_find_actor, &args);
+}
+
static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *dir,
@@ -5594,6 +5697,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
}
/*
+ * O_TMPFILE, set link count to 0, so that after this point,
+ * we fill in an inode item with the correct link count.
+ */
+ if (!name)
+ set_nlink(inode, 0);
+
+ /*
* we have to initialize this early, so we can reclaim the inode
* number if we fail afterwards in this function.
*/
@@ -5631,7 +5741,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
key[0].objectid = objectid;
- btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
+ key[0].type = BTRFS_INODE_ITEM_KEY;
key[0].offset = 0;
sizes[0] = sizeof(struct btrfs_inode_item);
@@ -5644,16 +5754,25 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
* add more hard links than can fit in the ref item.
*/
key[1].objectid = objectid;
- btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
+ key[1].type = BTRFS_INODE_REF_KEY;
key[1].offset = ref_objectid;
sizes[1] = name_len + sizeof(*ref);
}
+ location = &BTRFS_I(inode)->location;
+ location->objectid = objectid;
+ location->offset = 0;
+ location->type = BTRFS_INODE_ITEM_KEY;
+
+ ret = btrfs_insert_inode_locked(inode);
+ if (ret < 0)
+ goto fail;
+
path->leave_spinning = 1;
ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
if (ret != 0)
- goto fail;
+ goto fail_unlock;
inode_init_owner(inode, dir, mode);
inode_set_bytes(inode, 0);
@@ -5676,11 +5795,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
- location = &BTRFS_I(inode)->location;
- location->objectid = objectid;
- location->offset = 0;
- btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
-
btrfs_inherit_iflags(inode, dir);
if (S_ISREG(mode)) {
@@ -5691,7 +5805,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
BTRFS_INODE_NODATASUM;
}
- btrfs_insert_inode_hash(inode);
inode_tree_add(inode);
trace_btrfs_inode_new(inode);
@@ -5706,6 +5819,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
btrfs_ino(inode), root->root_key.objectid, ret);
return inode;
+
+fail_unlock:
+ unlock_new_inode(inode);
fail:
if (dir && name)
BTRFS_I(dir)->index_cnt--;
@@ -5739,7 +5855,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
} else {
key.objectid = ino;
- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
}
@@ -5840,28 +5956,28 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
goto out_unlock;
}
- err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
- if (err) {
- drop_inode = 1;
- goto out_unlock;
- }
-
/*
* If the active LSM wants to access the inode during
* d_instantiate it needs these. Smack checks to see
* if the filesystem supports xattrs by looking at the
* ops vector.
*/
-
inode->i_op = &btrfs_special_inode_operations;
- err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
+ init_special_inode(inode, inode->i_mode, rdev);
+
+ err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
if (err)
- drop_inode = 1;
- else {
- init_special_inode(inode, inode->i_mode, rdev);
+ goto out_unlock_inode;
+
+ err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
+ if (err) {
+ goto out_unlock_inode;
+ } else {
btrfs_update_inode(trans, root, inode);
+ unlock_new_inode(inode);
d_instantiate(dentry, inode);
}
+
out_unlock:
btrfs_end_transaction(trans, root);
btrfs_balance_delayed_items(root);
@@ -5871,6 +5987,12 @@ out_unlock:
iput(inode);
}
return err;
+
+out_unlock_inode:
+ drop_inode = 1;
+ unlock_new_inode(inode);
+ goto out_unlock;
+
}
static int btrfs_create(struct inode *dir, struct dentry *dentry,
@@ -5905,15 +6027,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
goto out_unlock;
}
drop_inode_on_err = 1;
-
- err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
- if (err)
- goto out_unlock;
-
- err = btrfs_update_inode(trans, root, inode);
- if (err)
- goto out_unlock;
-
/*
* If the active LSM wants to access the inode during
* d_instantiate it needs these. Smack checks to see
@@ -5922,14 +6035,23 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
*/
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
+ inode->i_mapping->a_ops = &btrfs_aops;
+ inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+
+ err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
+ if (err)
+ goto out_unlock_inode;
+
+ err = btrfs_update_inode(trans, root, inode);
+ if (err)
+ goto out_unlock_inode;
err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
if (err)
- goto out_unlock;
+ goto out_unlock_inode;
- inode->i_mapping->a_ops = &btrfs_aops;
- inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+ unlock_new_inode(inode);
d_instantiate(dentry, inode);
out_unlock:
@@ -5941,6 +6063,11 @@ out_unlock:
btrfs_balance_delayed_items(root);
btrfs_btree_balance_dirty(root);
return err;
+
+out_unlock_inode:
+ unlock_new_inode(inode);
+ goto out_unlock;
+
}
static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
@@ -6048,25 +6175,30 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
}
drop_on_err = 1;
+ /* these must be set before we unlock the inode */
+ inode->i_op = &btrfs_dir_inode_operations;
+ inode->i_fop = &btrfs_dir_file_operations;
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
if (err)
- goto out_fail;
-
- inode->i_op = &btrfs_dir_inode_operations;
- inode->i_fop = &btrfs_dir_file_operations;
+ goto out_fail_inode;
btrfs_i_size_write(inode, 0);
err = btrfs_update_inode(trans, root, inode);
if (err)
- goto out_fail;
+ goto out_fail_inode;
err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
dentry->d_name.len, 0, index);
if (err)
- goto out_fail;
+ goto out_fail_inode;
d_instantiate(dentry, inode);
+ /*
+ * mkdir is special. We're unlocking after we call d_instantiate
+ * to avoid a race with nfsd calling d_instantiate.
+ */
+ unlock_new_inode(inode);
drop_on_err = 0;
out_fail:
@@ -6076,23 +6208,66 @@ out_fail:
btrfs_balance_delayed_items(root);
btrfs_btree_balance_dirty(root);
return err;
+
+out_fail_inode:
+ unlock_new_inode(inode);
+ goto out_fail;
+}
+
+/* Find next extent map of a given extent map, caller needs to ensure locks */
+static struct extent_map *next_extent_map(struct extent_map *em)
+{
+ struct rb_node *next;
+
+ next = rb_next(&em->rb_node);
+ if (!next)
+ return NULL;
+ return container_of(next, struct extent_map, rb_node);
+}
+
+static struct extent_map *prev_extent_map(struct extent_map *em)
+{
+ struct rb_node *prev;
+
+ prev = rb_prev(&em->rb_node);
+ if (!prev)
+ return NULL;
+ return container_of(prev, struct extent_map, rb_node);
}
/* helper for btfs_get_extent. Given an existing extent in the tree,
+ * the existing extent is the nearest extent to map_start,
* and an extent that you want to insert, deal with overlap and insert
- * the new extent into the tree.
+ * the best fitted new extent into the tree.
*/
static int merge_extent_mapping(struct extent_map_tree *em_tree,
struct extent_map *existing,
struct extent_map *em,
- u64 map_start, u64 map_len)
+ u64 map_start)
{
+ struct extent_map *prev;
+ struct extent_map *next;
+ u64 start;
+ u64 end;
u64 start_diff;
BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
- start_diff = map_start - em->start;
- em->start = map_start;
- em->len = map_len;
+
+ if (existing->start > map_start) {
+ next = existing;
+ prev = prev_extent_map(next);
+ } else {
+ prev = existing;
+ next = next_extent_map(prev);
+ }
+
+ start = prev ? extent_map_end(prev) : em->start;
+ start = max_t(u64, start, em->start);
+ end = next ? next->start : extent_map_end(em);
+ end = min_t(u64, end, extent_map_end(em));
+ start_diff = start - em->start;
+ em->start = start;
+ em->len = end - start;
if (em->block_start < EXTENT_MAP_LAST_BYTE &&
!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
em->block_start += start_diff;
@@ -6220,7 +6395,7 @@ again:
struct btrfs_file_extent_item);
/* are we inside the extent that was found? */
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- found_type = btrfs_key_type(&found_key);
+ found_type = found_key.type;
if (found_key.objectid != objectid ||
found_type != BTRFS_EXTENT_DATA_KEY) {
/*
@@ -6263,6 +6438,8 @@ next:
goto not_found;
if (start + len <= found_key.offset)
goto not_found;
+ if (start > found_key.offset)
+ goto next;
em->start = start;
em->orig_start = start;
em->len = found_key.offset - start;
@@ -6367,26 +6544,21 @@ insert:
ret = 0;
- existing = lookup_extent_mapping(em_tree, start, len);
- if (existing && (existing->start > start ||
- existing->start + existing->len <= start)) {
+ existing = search_extent_mapping(em_tree, start, len);
+ /*
+ * existing will always be non-NULL, since there must be
+ * extent causing the -EEXIST.
+ */
+ if (start >= extent_map_end(existing) ||
+ start <= existing->start) {
+ /*
+ * The existing extent map is the one nearest to
+ * the [start, start + len) range which overlaps
+ */
+ err = merge_extent_mapping(em_tree, existing,
+ em, start);
free_extent_map(existing);
- existing = NULL;
- }
- if (!existing) {
- existing = lookup_extent_mapping(em_tree, em->start,
- em->len);
- if (existing) {
- err = merge_extent_mapping(em_tree, existing,
- em, start,
- root->sectorsize);
- free_extent_map(existing);
- if (err) {
- free_extent_map(em);
- em = NULL;
- }
- } else {
- err = -EIO;
+ if (err) {
free_extent_map(em);
em = NULL;
}
@@ -6998,8 +7170,10 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
block_start, len,
orig_block_len,
ram_bytes, type);
- if (IS_ERR(em))
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
goto unlock_err;
+ }
}
ret = btrfs_add_ordered_extent_dio(inode, start,
@@ -7074,45 +7248,277 @@ unlock_err:
return ret;
}
-static void btrfs_endio_direct_read(struct bio *bio, int err)
+static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
+ int rw, int mirror_num)
{
- struct btrfs_dio_private *dip = bio->bi_private;
- struct bio_vec *bvec;
- struct inode *inode = dip->inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct bio *dio_bio;
- u32 *csums = (u32 *)dip->csum;
+ int ret;
+
+ BUG_ON(rw & REQ_WRITE);
+
+ bio_get(bio);
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+ BTRFS_WQ_ENDIO_DIO_REPAIR);
+ if (ret)
+ goto err;
+
+ ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
+err:
+ bio_put(bio);
+ return ret;
+}
+
+static int btrfs_check_dio_repairable(struct inode *inode,
+ struct bio *failed_bio,
+ struct io_failure_record *failrec,
+ int failed_mirror)
+{
+ int num_copies;
+
+ num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
+ failrec->logical, failrec->len);
+ if (num_copies == 1) {
+ /*
+ * we only have a single copy of the data, so don't bother with
+ * all the retry and error correction code that follows. no
+ * matter what the error is, it is very likely to persist.
+ */
+ pr_debug("Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
+ num_copies, failrec->this_mirror, failed_mirror);
+ return 0;
+ }
+
+ failrec->failed_mirror = failed_mirror;
+ failrec->this_mirror++;
+ if (failrec->this_mirror == failed_mirror)
+ failrec->this_mirror++;
+
+ if (failrec->this_mirror > num_copies) {
+ pr_debug("Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
+ num_copies, failrec->this_mirror, failed_mirror);
+ return 0;
+ }
+
+ return 1;
+}
+
+static int dio_read_error(struct inode *inode, struct bio *failed_bio,
+ struct page *page, u64 start, u64 end,
+ int failed_mirror, bio_end_io_t *repair_endio,
+ void *repair_arg)
+{
+ struct io_failure_record *failrec;
+ struct bio *bio;
+ int isector;
+ int read_mode;
+ int ret;
+
+ BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+ ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
+ if (ret)
+ return ret;
+
+ ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
+ failed_mirror);
+ if (!ret) {
+ free_io_failure(inode, failrec);
+ return -EIO;
+ }
+
+ if (failed_bio->bi_vcnt > 1)
+ read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+ else
+ read_mode = READ_SYNC;
+
+ isector = start - btrfs_io_bio(failed_bio)->logical;
+ isector >>= inode->i_sb->s_blocksize_bits;
+ bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
+ 0, isector, repair_endio, repair_arg);
+ if (!bio) {
+ free_io_failure(inode, failrec);
+ return -EIO;
+ }
+
+ btrfs_debug(BTRFS_I(inode)->root->fs_info,
+ "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n",
+ read_mode, failrec->this_mirror, failrec->in_validation);
+
+ ret = submit_dio_repair_bio(inode, bio, read_mode,
+ failrec->this_mirror);
+ if (ret) {
+ free_io_failure(inode, failrec);
+ bio_put(bio);
+ }
+
+ return ret;
+}
+
+struct btrfs_retry_complete {
+ struct completion done;
+ struct inode *inode;
u64 start;
+ int uptodate;
+};
+
+static void btrfs_retry_endio_nocsum(struct bio *bio, int err)
+{
+ struct btrfs_retry_complete *done = bio->bi_private;
+ struct bio_vec *bvec;
+ int i;
+
+ if (err)
+ goto end;
+
+ done->uptodate = 1;
+ bio_for_each_segment_all(bvec, bio, i)
+ clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
+end:
+ complete(&done->done);
+ bio_put(bio);
+}
+
+static int __btrfs_correct_data_nocsum(struct inode *inode,
+ struct btrfs_io_bio *io_bio)
+{
+ struct bio_vec *bvec;
+ struct btrfs_retry_complete done;
+ u64 start;
+ int i;
+ int ret;
+
+ start = io_bio->logical;
+ done.inode = inode;
+
+ bio_for_each_segment_all(bvec, &io_bio->bio, i) {
+try_again:
+ done.uptodate = 0;
+ done.start = start;
+ init_completion(&done.done);
+
+ ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+ start + bvec->bv_len - 1,
+ io_bio->mirror_num,
+ btrfs_retry_endio_nocsum, &done);
+ if (ret)
+ return ret;
+
+ wait_for_completion(&done.done);
+
+ if (!done.uptodate) {
+ /* We might have another mirror, so try again */
+ goto try_again;
+ }
+
+ start += bvec->bv_len;
+ }
+
+ return 0;
+}
+
+static void btrfs_retry_endio(struct bio *bio, int err)
+{
+ struct btrfs_retry_complete *done = bio->bi_private;
+ struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+ struct bio_vec *bvec;
+ int uptodate;
+ int ret;
int i;
- start = dip->logical_offset;
+ if (err)
+ goto end;
+
+ uptodate = 1;
bio_for_each_segment_all(bvec, bio, i) {
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
- struct page *page = bvec->bv_page;
- char *kaddr;
- u32 csum = ~(u32)0;
- unsigned long flags;
-
- local_irq_save(flags);
- kaddr = kmap_atomic(page);
- csum = btrfs_csum_data(kaddr + bvec->bv_offset,
- csum, bvec->bv_len);
- btrfs_csum_final(csum, (char *)&csum);
- kunmap_atomic(kaddr);
- local_irq_restore(flags);
-
- flush_dcache_page(bvec->bv_page);
- if (csum != csums[i]) {
- btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
- btrfs_ino(inode), start, csum,
- csums[i]);
- err = -EIO;
- }
+ ret = __readpage_endio_check(done->inode, io_bio, i,
+ bvec->bv_page, 0,
+ done->start, bvec->bv_len);
+ if (!ret)
+ clean_io_failure(done->inode, done->start,
+ bvec->bv_page, 0);
+ else
+ uptodate = 0;
+ }
+
+ done->uptodate = uptodate;
+end:
+ complete(&done->done);
+ bio_put(bio);
+}
+
+static int __btrfs_subio_endio_read(struct inode *inode,
+ struct btrfs_io_bio *io_bio, int err)
+{
+ struct bio_vec *bvec;
+ struct btrfs_retry_complete done;
+ u64 start;
+ u64 offset = 0;
+ int i;
+ int ret;
+
+ err = 0;
+ start = io_bio->logical;
+ done.inode = inode;
+
+ bio_for_each_segment_all(bvec, &io_bio->bio, i) {
+ ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
+ 0, start, bvec->bv_len);
+ if (likely(!ret))
+ goto next;
+try_again:
+ done.uptodate = 0;
+ done.start = start;
+ init_completion(&done.done);
+
+ ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+ start + bvec->bv_len - 1,
+ io_bio->mirror_num,
+ btrfs_retry_endio, &done);
+ if (ret) {
+ err = ret;
+ goto next;
}
+ wait_for_completion(&done.done);
+
+ if (!done.uptodate) {
+ /* We might have another mirror, so try again */
+ goto try_again;
+ }
+next:
+ offset += bvec->bv_len;
start += bvec->bv_len;
}
+ return err;
+}
+
+static int btrfs_subio_endio_read(struct inode *inode,
+ struct btrfs_io_bio *io_bio, int err)
+{
+ bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+ if (skip_csum) {
+ if (unlikely(err))
+ return __btrfs_correct_data_nocsum(inode, io_bio);
+ else
+ return 0;
+ } else {
+ return __btrfs_subio_endio_read(inode, io_bio, err);
+ }
+}
+
+static void btrfs_endio_direct_read(struct bio *bio, int err)
+{
+ struct btrfs_dio_private *dip = bio->bi_private;
+ struct inode *inode = dip->inode;
+ struct bio *dio_bio;
+ struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+
+ if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
+ err = btrfs_subio_endio_read(inode, io_bio, err);
+
unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
dip->logical_offset + dip->bytes - 1);
dio_bio = dip->dio_bio;
@@ -7123,6 +7529,9 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
if (err)
clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
dio_end_io(dio_bio, err);
+
+ if (io_bio->end_io)
+ io_bio->end_io(io_bio, err);
bio_put(bio);
}
@@ -7146,7 +7555,8 @@ again:
if (!ret)
goto out_test;
- btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
+ btrfs_init_work(&ordered->work, btrfs_endio_write_helper,
+ finish_ordered_fn, NULL, NULL);
btrfs_queue_work(root->fs_info->endio_write_workers,
&ordered->work);
out_test:
@@ -7187,12 +7597,17 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
{
struct btrfs_dio_private *dip = bio->bi_private;
+ if (err)
+ btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
+ "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
+ btrfs_ino(dip->inode), bio->bi_rw,
+ (unsigned long long)bio->bi_iter.bi_sector,
+ bio->bi_iter.bi_size, err);
+
+ if (dip->subio_endio)
+ err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
+
if (err) {
- btrfs_err(BTRFS_I(dip->inode)->root->fs_info,
- "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
- btrfs_ino(dip->inode), bio->bi_rw,
- (unsigned long long)bio->bi_iter.bi_sector,
- bio->bi_iter.bi_size, err);
dip->errors = 1;
/*
@@ -7223,6 +7638,38 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
}
+static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
+ struct inode *inode,
+ struct btrfs_dio_private *dip,
+ struct bio *bio,
+ u64 file_offset)
+{
+ struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+ struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
+ int ret;
+
+ /*
+ * We load all the csum data we need when we submit
+ * the first bio to reduce the csum tree search and
+ * contention.
+ */
+ if (dip->logical_offset == file_offset) {
+ ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio,
+ file_offset);
+ if (ret)
+ return ret;
+ }
+
+ if (bio == dip->orig_bio)
+ return 0;
+
+ file_offset -= dip->logical_offset;
+ file_offset >>= inode->i_sb->s_blocksize_bits;
+ io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
+
+ return 0;
+}
+
static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
int rw, u64 file_offset, int skip_sum,
int async_submit)
@@ -7238,7 +7685,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
bio_get(bio);
if (!write) {
- ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+ BTRFS_WQ_ENDIO_DATA);
if (ret)
goto err;
}
@@ -7261,13 +7709,12 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
if (ret)
goto err;
- } else if (!skip_sum) {
- ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio,
- file_offset);
+ } else {
+ ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio,
+ file_offset);
if (ret)
goto err;
}
-
map:
ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
err:
@@ -7288,19 +7735,18 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
u64 submit_len = 0;
u64 map_length;
int nr_pages = 0;
- int ret = 0;
+ int ret;
int async_submit = 0;
map_length = orig_bio->bi_iter.bi_size;
ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
&map_length, NULL, 0);
- if (ret) {
- bio_put(orig_bio);
+ if (ret)
return -EIO;
- }
if (map_length >= orig_bio->bi_iter.bi_size) {
bio = orig_bio;
+ dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
goto submit;
}
@@ -7314,14 +7760,16 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
if (!bio)
return -ENOMEM;
+
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
+ btrfs_io_bio(bio)->logical = file_offset;
atomic_inc(&dip->pending_bios);
while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
- if (unlikely(map_length < submit_len + bvec->bv_len ||
+ if (map_length < submit_len + bvec->bv_len ||
bio_add_page(bio, bvec->bv_page, bvec->bv_len,
- bvec->bv_offset) < bvec->bv_len)) {
+ bvec->bv_offset) < bvec->bv_len) {
/*
* inc the count before we submit the bio so
* we know the end IO handler won't happen before
@@ -7350,6 +7798,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
goto out_err;
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
+ btrfs_io_bio(bio)->logical = file_offset;
map_length = orig_bio->bi_iter.bi_size;
ret = btrfs_map_block(root->fs_info, rw,
@@ -7393,11 +7842,10 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_dio_private *dip;
struct bio *io_bio;
+ struct btrfs_io_bio *btrfs_bio;
int skip_sum;
- int sum_len;
int write = rw & REQ_WRITE;
int ret = 0;
- u16 csum_size;
skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
@@ -7407,16 +7855,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
goto free_ordered;
}
- if (!skip_sum && !write) {
- csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
- sum_len = dio_bio->bi_iter.bi_size >>
- inode->i_sb->s_blocksize_bits;
- sum_len *= csum_size;
- } else {
- sum_len = 0;
- }
-
- dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS);
+ dip = kzalloc(sizeof(*dip), GFP_NOFS);
if (!dip) {
ret = -ENOMEM;
goto free_io_bio;
@@ -7428,20 +7867,25 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
dip->bytes = dio_bio->bi_iter.bi_size;
dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
io_bio->bi_private = dip;
- dip->errors = 0;
dip->orig_bio = io_bio;
dip->dio_bio = dio_bio;
atomic_set(&dip->pending_bios, 0);
+ btrfs_bio = btrfs_io_bio(io_bio);
+ btrfs_bio->logical = file_offset;
- if (write)
+ if (write) {
io_bio->bi_end_io = btrfs_endio_direct_write;
- else
+ } else {
io_bio->bi_end_io = btrfs_endio_direct_read;
+ dip->subio_endio = btrfs_subio_endio_read;
+ }
ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
if (!ret)
return;
+ if (btrfs_bio->end_io)
+ btrfs_bio->end_io(btrfs_bio, ret);
free_io_bio:
bio_put(io_bio);
@@ -7522,7 +7966,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
count = iov_iter_count(iter);
if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
&BTRFS_I(inode)->runtime_flags))
- filemap_fdatawrite_range(inode->i_mapping, offset, count);
+ filemap_fdatawrite_range(inode->i_mapping, offset,
+ offset + count - 1);
if (rw & WRITE) {
/*
@@ -7537,8 +7982,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
ret = btrfs_delalloc_reserve_space(inode, count);
if (ret)
goto out;
- } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
- &BTRFS_I(inode)->runtime_flags))) {
+ } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+ &BTRFS_I(inode)->runtime_flags)) {
inode_dio_done(inode);
flags = DIO_LOCKING | DIO_SKIP_HOLES;
wakeup = false;
@@ -7939,27 +8384,6 @@ static int btrfs_truncate(struct inode *inode)
BUG_ON(ret);
/*
- * setattr is responsible for setting the ordered_data_close flag,
- * but that is only tested during the last file release. That
- * could happen well after the next commit, leaving a great big
- * window where new writes may get lost if someone chooses to write
- * to this file after truncating to zero
- *
- * The inode doesn't have any dirty data here, and so if we commit
- * this is a noop. If someone immediately starts writing to the inode
- * it is very likely we'll catch some of their writes in this
- * transaction, and the commit will find this file on the ordered
- * data list with good things to send down.
- *
- * This is a best effort solution, there is still a window where
- * using truncate to replace the contents of the file will
- * end up with a zero length file after a crash.
- */
- if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
- &BTRFS_I(inode)->runtime_flags))
- btrfs_add_ordered_operation(trans, root, inode);
-
- /*
* So if we truncate and then write and fsync we normally would just
* write the extents that changed, which is a problem if we need to
* first truncate that entire inode. So set this flag so we write out
@@ -8050,6 +8474,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
set_nlink(inode, 1);
btrfs_i_size_write(inode, 0);
+ unlock_new_inode(inode);
err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
if (err)
@@ -8078,6 +8503,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->last_sub_trans = 0;
ei->logged_trans = 0;
ei->delalloc_bytes = 0;
+ ei->defrag_bytes = 0;
ei->disk_i_size = 0;
ei->flags = 0;
ei->csum_bytes = 0;
@@ -8106,7 +8532,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
mutex_init(&ei->delalloc_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->delalloc_inodes);
- INIT_LIST_HEAD(&ei->ordered_operations);
RB_CLEAR_NODE(&ei->rb_node);
return inode;
@@ -8137,6 +8562,7 @@ void btrfs_destroy_inode(struct inode *inode)
WARN_ON(BTRFS_I(inode)->reserved_extents);
WARN_ON(BTRFS_I(inode)->delalloc_bytes);
WARN_ON(BTRFS_I(inode)->csum_bytes);
+ WARN_ON(BTRFS_I(inode)->defrag_bytes);
/*
* This can happen where we create an inode, but somebody else also
@@ -8146,17 +8572,6 @@ void btrfs_destroy_inode(struct inode *inode)
if (!root)
goto free;
- /*
- * Make sure we're properly removed from the ordered operation
- * lists.
- */
- smp_mb();
- if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
- spin_lock(&root->fs_info->ordered_root_lock);
- list_del_init(&BTRFS_I(inode)->ordered_operations);
- spin_unlock(&root->fs_info->ordered_root_lock);
- }
-
if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
&BTRFS_I(inode)->runtime_flags)) {
btrfs_info(root->fs_info, "inode %llu still on the orphan list",
@@ -8338,12 +8753,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
ret = 0;
/*
- * we're using rename to replace one file with another.
- * and the replacement file is large. Start IO on it now so
- * we don't add too much work to the end of the transaction
+ * we're using rename to replace one file with another. Start IO on it
+ * now so we don't add too much work to the end of the transaction
*/
- if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
- old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+ if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
filemap_flush(old_inode->i_mapping);
/* close the racy window with snapshot create/destroy ioctl */
@@ -8391,12 +8804,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
*/
btrfs_pin_log_trans(root);
}
- /*
- * make sure the inode gets flushed if it is replacing
- * something.
- */
- if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
- btrfs_add_ordered_operation(trans, root, old_inode);
inode_inc_iversion(old_dir);
inode_inc_iversion(new_dir);
@@ -8476,6 +8883,16 @@ out_notrans:
return ret;
}
+static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
+{
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
+ return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
static void btrfs_run_delalloc_work(struct btrfs_work *work)
{
struct btrfs_delalloc_work *delalloc_work;
@@ -8514,7 +8931,9 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
work->inode = inode;
work->wait = wait;
work->delay_iput = delay_iput;
- btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
+ WARN_ON_ONCE(!inode);
+ btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
+ btrfs_run_delalloc_work, NULL, NULL);
return work;
}
@@ -8559,7 +8978,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
spin_unlock(&root->delalloc_lock);
work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
- if (unlikely(!work)) {
+ if (!work) {
if (delay_iput)
btrfs_add_delayed_iput(inode);
else
@@ -8718,12 +9137,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
goto out_unlock;
}
- err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
- if (err) {
- drop_inode = 1;
- goto out_unlock;
- }
-
/*
* If the active LSM wants to access the inode during
* d_instantiate it needs these. Smack checks to see
@@ -8732,34 +9145,32 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
*/
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
+ inode->i_mapping->a_ops = &btrfs_aops;
+ inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+ BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+
+ err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
+ if (err)
+ goto out_unlock_inode;
err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
if (err)
- drop_inode = 1;
- else {
- inode->i_mapping->a_ops = &btrfs_aops;
- inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
- }
- if (drop_inode)
- goto out_unlock;
+ goto out_unlock_inode;
path = btrfs_alloc_path();
if (!path) {
err = -ENOMEM;
- drop_inode = 1;
- goto out_unlock;
+ goto out_unlock_inode;
}
key.objectid = btrfs_ino(inode);
key.offset = 0;
- btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+ key.type = BTRFS_EXTENT_DATA_KEY;
datasize = btrfs_file_extent_calc_inline_size(name_len);
err = btrfs_insert_empty_item(trans, root, path, &key,
datasize);
if (err) {
- drop_inode = 1;
btrfs_free_path(path);
- goto out_unlock;
+ goto out_unlock_inode;
}
leaf = path->nodes[0];
ei = btrfs_item_ptr(leaf, path->slots[0],
@@ -8783,12 +9194,15 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
inode_set_bytes(inode, name_len);
btrfs_i_size_write(inode, name_len);
err = btrfs_update_inode(trans, root, inode);
- if (err)
+ if (err) {
drop_inode = 1;
+ goto out_unlock_inode;
+ }
+
+ unlock_new_inode(inode);
+ d_instantiate(dentry, inode);
out_unlock:
- if (!err)
- d_instantiate(dentry, inode);
btrfs_end_transaction(trans, root);
if (drop_inode) {
inode_dec_link_count(inode);
@@ -8796,6 +9210,11 @@ out_unlock:
}
btrfs_btree_balance_dirty(root);
return err;
+
+out_unlock_inode:
+ drop_inode = 1;
+ unlock_new_inode(inode);
+ goto out_unlock;
}
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
@@ -8979,14 +9398,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
goto out;
}
- ret = btrfs_init_inode_security(trans, inode, dir, NULL);
- if (ret)
- goto out;
-
- ret = btrfs_update_inode(trans, root, inode);
- if (ret)
- goto out;
-
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
@@ -8994,10 +9405,26 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+ ret = btrfs_init_inode_security(trans, inode, dir, NULL);
+ if (ret)
+ goto out_inode;
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret)
+ goto out_inode;
ret = btrfs_orphan_add(trans, inode);
if (ret)
- goto out;
+ goto out_inode;
+ /*
+ * We set number of links to 0 in btrfs_new_inode(), and here we set
+ * it to 1 because d_tmpfile() will issue a warning if the count is 0,
+ * through:
+ *
+ * d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
+ */
+ set_nlink(inode, 1);
+ unlock_new_inode(inode);
d_tmpfile(dentry, inode);
mark_inode_dirty(inode);
@@ -9007,8 +9434,12 @@ out:
iput(inode);
btrfs_balance_delayed_items(root);
btrfs_btree_balance_dirty(root);
-
return ret;
+
+out_inode:
+ unlock_new_inode(inode);
+ goto out;
+
}
static const struct inode_operations btrfs_dir_inode_operations = {
@@ -9019,7 +9450,7 @@ static const struct inode_operations btrfs_dir_inode_operations = {
.link = btrfs_link,
.mkdir = btrfs_mkdir,
.rmdir = btrfs_rmdir,
- .rename = btrfs_rename,
+ .rename2 = btrfs_rename2,
.symlink = btrfs_symlink,
.setattr = btrfs_setattr,
.mknod = btrfs_mknod,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 47aceb494d1d..4399f0c3a4ce 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -332,6 +332,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
goto out_drop;
} else {
+ ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
+ if (ret && ret != -ENODATA)
+ goto out_drop;
ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
}
@@ -477,8 +480,7 @@ static noinline int create_subvol(struct inode *dir,
if (ret)
goto fail;
- leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
- 0, objectid, NULL, 0, 0, 0);
+ leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
goto fail;
@@ -503,7 +505,7 @@ static noinline int create_subvol(struct inode *dir,
btrfs_set_stack_inode_generation(inode_item, 1);
btrfs_set_stack_inode_size(inode_item, 3);
btrfs_set_stack_inode_nlink(inode_item, 1);
- btrfs_set_stack_inode_nbytes(inode_item, root->leafsize);
+ btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
btrfs_set_root_flags(&root_item, 0);
@@ -535,7 +537,7 @@ static noinline int create_subvol(struct inode *dir,
key.objectid = objectid;
key.offset = 0;
- btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ key.type = BTRFS_ROOT_ITEM_KEY;
ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
&root_item);
if (ret)
@@ -763,23 +765,6 @@ out:
return ret;
}
-/* copy of check_sticky in fs/namei.c()
-* It's inline, so penalty for filesystems that don't use sticky bit is
-* minimal.
-*/
-static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode)
-{
- kuid_t fsuid = current_fsuid();
-
- if (!(dir->i_mode & S_ISVTX))
- return 0;
- if (uid_eq(inode->i_uid, fsuid))
- return 0;
- if (uid_eq(dir->i_uid, fsuid))
- return 0;
- return !capable(CAP_FOWNER);
-}
-
/* copy of may_delete in fs/namei.c()
* Check whether we can remove a link victim from directory dir, check
* whether the type of victim is right.
@@ -815,8 +800,7 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
return error;
if (IS_APPEND(dir))
return -EPERM;
- if (btrfs_check_sticky(dir, victim->d_inode)||
- IS_APPEND(victim->d_inode)||
+ if (check_sticky(dir, victim->d_inode) || IS_APPEND(victim->d_inode) ||
IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
return -EPERM;
if (isdir) {
@@ -915,7 +899,7 @@ out_unlock:
* file you want to defrag, we return 0 to let you know to skip this
* part of the file
*/
-static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh)
+static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh)
{
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_map *em = NULL;
@@ -950,7 +934,7 @@ static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh)
*/
static int find_new_extents(struct btrfs_root *root,
struct inode *inode, u64 newer_than,
- u64 *off, int thresh)
+ u64 *off, u32 thresh)
{
struct btrfs_path *path;
struct btrfs_key min_key;
@@ -969,12 +953,9 @@ static int find_new_extents(struct btrfs_root *root,
min_key.offset = *off;
while (1) {
- path->keep_locks = 1;
ret = btrfs_search_forward(root, &min_key, path, newer_than);
if (ret != 0)
goto none;
- path->keep_locks = 0;
- btrfs_unlock_up_safe(path, 1);
process_slot:
if (min_key.objectid != ino)
goto none;
@@ -1052,15 +1033,17 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
return false;
next = defrag_lookup_extent(inode, em->start + em->len);
- if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE ||
- (em->block_start + em->block_len == next->block_start))
+ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
+ ret = false;
+ else if ((em->block_start + em->block_len == next->block_start) &&
+ (em->block_len > 128 * 1024 && next->block_len > 128 * 1024))
ret = false;
free_extent_map(next);
return ret;
}
-static int should_defrag_range(struct inode *inode, u64 start, int thresh,
+static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
u64 *last_len, u64 *skip, u64 *defrag_end,
int compress)
{
@@ -1088,7 +1071,6 @@ static int should_defrag_range(struct inode *inode, u64 start, int thresh,
}
next_mergeable = defrag_check_next_extent(inode, em);
-
/*
* we hit a real extent, if it is big or the next extent is not a
* real extent, don't bother defragging it
@@ -1291,7 +1273,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
int ret;
int defrag_count = 0;
int compress_type = BTRFS_COMPRESS_ZLIB;
- int extent_thresh = range->extent_thresh;
+ u32 extent_thresh = range->extent_thresh;
unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
unsigned long cluster = max_cluster;
u64 new_align = ~((u64)128 * 1024 - 1);
@@ -1367,8 +1349,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
inode->i_mapping->writeback_index = i;
while (i <= last_index && defrag_count < max_to_defrag &&
- (i < (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT)) {
+ (i < DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE))) {
/*
* make sure we stop running if someone unmounts
* the FS
@@ -1391,7 +1372,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
* the should_defrag function tells us how much to skip
* bump our counter by the suggested amount
*/
- next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ next = DIV_ROUND_UP(skip, PAGE_CACHE_SIZE);
i = max(i + 1, next);
continue;
}
@@ -1586,7 +1567,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
goto out_free;
}
- old_size = device->total_bytes;
+ old_size = btrfs_device_get_total_bytes(device);
if (mod < 0) {
if (new_size > old_size) {
@@ -1735,7 +1716,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
BTRFS_SUBVOL_QGROUP_INHERIT)) {
ret = -EOPNOTSUPP;
- goto out;
+ goto free_args;
}
if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
@@ -1745,27 +1726,31 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
if (vol_args->size > PAGE_CACHE_SIZE) {
ret = -EINVAL;
- goto out;
+ goto free_args;
}
inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
if (IS_ERR(inherit)) {
ret = PTR_ERR(inherit);
- goto out;
+ goto free_args;
}
}
ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
vol_args->fd, subvol, ptr,
readonly, inherit);
+ if (ret)
+ goto free_inherit;
- if (ret == 0 && ptr &&
- copy_to_user(arg +
- offsetof(struct btrfs_ioctl_vol_args_v2,
- transid), ptr, sizeof(*ptr)))
+ if (ptr && copy_to_user(arg +
+ offsetof(struct btrfs_ioctl_vol_args_v2,
+ transid),
+ ptr, sizeof(*ptr)))
ret = -EFAULT;
-out:
- kfree(vol_args);
+
+free_inherit:
kfree(inherit);
+free_args:
+ kfree(vol_args);
return ret;
}
@@ -2117,8 +2102,6 @@ static noinline int search_ioctl(struct inode *inode,
key.type = sk->min_type;
key.offset = sk->min_offset;
- path->keep_locks = 1;
-
while (1) {
ret = btrfs_search_forward(root, &key, path, sk->min_transid);
if (ret != 0) {
@@ -2451,9 +2434,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto out_dput;
}
- err = d_invalidate(dentry);
- if (err)
- goto out_unlock;
+ d_invalidate(dentry);
down_write(&root->fs_info->subvol_sem);
@@ -2538,7 +2519,6 @@ out_release:
btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
out_up_write:
up_write(&root->fs_info->subvol_sem);
-out_unlock:
if (err) {
spin_lock(&dest->root_item_lock);
root_flags = btrfs_root_flags(&dest->root_item);
@@ -2554,9 +2534,9 @@ out_unlock:
ASSERT(dest->send_in_progress == 0);
/* the last ref */
- if (dest->cache_inode) {
- iput(dest->cache_inode);
- dest->cache_inode = NULL;
+ if (dest->ino_cache_inode) {
+ iput(dest->ino_cache_inode);
+ dest->ino_cache_inode = NULL;
}
}
out_dput:
@@ -2662,6 +2642,9 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
ret = btrfs_init_new_device(root, vol_args->name);
+ if (!ret)
+ btrfs_info(root->fs_info, "disk added %s",vol_args->name);
+
kfree(vol_args);
out:
mutex_unlock(&root->fs_info->volume_mutex);
@@ -2685,7 +2668,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
- goto out;
+ goto err_drop;
}
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
@@ -2701,8 +2684,12 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
mutex_unlock(&root->fs_info->volume_mutex);
atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+ if (!ret)
+ btrfs_info(root->fs_info, "disk deleted %s",vol_args->name);
+
out:
kfree(vol_args);
+err_drop:
mnt_drop_write_file(file);
return ret;
}
@@ -2764,8 +2751,8 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
}
di_args->devid = dev->devid;
- di_args->bytes_used = dev->bytes_used;
- di_args->total_bytes = dev->total_bytes;
+ di_args->bytes_used = btrfs_device_get_bytes_used(dev);
+ di_args->total_bytes = btrfs_device_get_total_bytes(dev);
memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
if (dev->name) {
struct rcu_string *name;
@@ -3191,7 +3178,7 @@ static void clone_update_extent_map(struct inode *inode,
em->start + em->len - 1, 0);
}
- if (unlikely(ret))
+ if (ret)
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
}
@@ -3226,7 +3213,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
u64 last_dest_end = destoff;
ret = -ENOMEM;
- buf = vmalloc(btrfs_level_size(root, 0));
+ buf = vmalloc(root->nodesize);
if (!buf)
return ret;
@@ -3279,11 +3266,11 @@ process_slot:
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
- if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
+ if (key.type > BTRFS_EXTENT_DATA_KEY ||
key.objectid != btrfs_ino(src))
break;
- if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
+ if (key.type == BTRFS_EXTENT_DATA_KEY) {
struct btrfs_file_extent_item *extent;
int type;
u32 size;
@@ -3527,7 +3514,8 @@ process_slot:
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- last_dest_end = new_key.offset + datal;
+ last_dest_end = ALIGN(new_key.offset + datal,
+ root->sectorsize);
ret = clone_finish_inode_update(trans, inode,
last_dest_end,
destoff, olen);
@@ -5309,6 +5297,12 @@ long btrfs_ioctl(struct file *file, unsigned int
if (ret)
return ret;
ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
+ /*
+ * The transaction thread may want to do more work,
+ * namely it pokes the cleaner ktread that will start
+ * processing uncleaned subvols.
+ */
+ wake_up_process(root->fs_info->transaction_kthread);
return ret;
}
case BTRFS_IOC_START_SYNC:
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index dfad8514f0da..78285f30909e 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -266,8 +266,7 @@ static int lzo_decompress_biovec(struct list_head *ws,
char *data_in;
unsigned long page_in_index = 0;
unsigned long page_out_index = 0;
- unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
- PAGE_CACHE_SIZE;
+ unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE);
unsigned long buf_start;
unsigned long buf_offset = 0;
unsigned long bytes;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 7187b14faa6c..ac734ec4cc20 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -571,18 +571,6 @@ void btrfs_remove_ordered_extent(struct inode *inode,
trace_btrfs_ordered_extent_remove(inode, entry);
- /*
- * we have no more ordered extents for this inode and
- * no dirty pages. We can safely remove it from the
- * list of ordered extents
- */
- if (RB_EMPTY_ROOT(&tree->tree) &&
- !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
- spin_lock(&root->fs_info->ordered_root_lock);
- list_del_init(&BTRFS_I(inode)->ordered_operations);
- spin_unlock(&root->fs_info->ordered_root_lock);
- }
-
if (!root->nr_ordered_extents) {
spin_lock(&root->fs_info->ordered_root_lock);
BUG_ON(list_empty(&root->ordered_root));
@@ -627,6 +615,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
spin_unlock(&root->ordered_extent_lock);
btrfs_init_work(&ordered->flush_work,
+ btrfs_flush_delalloc_helper,
btrfs_run_ordered_extent_work, NULL, NULL);
list_add_tail(&ordered->work_list, &works);
btrfs_queue_work(root->fs_info->flush_workers,
@@ -687,81 +676,6 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
}
/*
- * this is used during transaction commit to write all the inodes
- * added to the ordered operation list. These files must be fully on
- * disk before the transaction commits.
- *
- * we have two modes here, one is to just start the IO via filemap_flush
- * and the other is to wait for all the io. When we wait, we have an
- * extra check to make sure the ordered operation list really is empty
- * before we return
- */
-int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, int wait)
-{
- struct btrfs_inode *btrfs_inode;
- struct inode *inode;
- struct btrfs_transaction *cur_trans = trans->transaction;
- struct list_head splice;
- struct list_head works;
- struct btrfs_delalloc_work *work, *next;
- int ret = 0;
-
- INIT_LIST_HEAD(&splice);
- INIT_LIST_HEAD(&works);
-
- mutex_lock(&root->fs_info->ordered_extent_flush_mutex);
- spin_lock(&root->fs_info->ordered_root_lock);
- list_splice_init(&cur_trans->ordered_operations, &splice);
- while (!list_empty(&splice)) {
- btrfs_inode = list_entry(splice.next, struct btrfs_inode,
- ordered_operations);
- inode = &btrfs_inode->vfs_inode;
-
- list_del_init(&btrfs_inode->ordered_operations);
-
- /*
- * the inode may be getting freed (in sys_unlink path).
- */
- inode = igrab(inode);
- if (!inode)
- continue;
-
- if (!wait)
- list_add_tail(&BTRFS_I(inode)->ordered_operations,
- &cur_trans->ordered_operations);
- spin_unlock(&root->fs_info->ordered_root_lock);
-
- work = btrfs_alloc_delalloc_work(inode, wait, 1);
- if (!work) {
- spin_lock(&root->fs_info->ordered_root_lock);
- if (list_empty(&BTRFS_I(inode)->ordered_operations))
- list_add_tail(&btrfs_inode->ordered_operations,
- &splice);
- list_splice_tail(&splice,
- &cur_trans->ordered_operations);
- spin_unlock(&root->fs_info->ordered_root_lock);
- ret = -ENOMEM;
- goto out;
- }
- list_add_tail(&work->list, &works);
- btrfs_queue_work(root->fs_info->flush_workers,
- &work->work);
-
- cond_resched();
- spin_lock(&root->fs_info->ordered_root_lock);
- }
- spin_unlock(&root->fs_info->ordered_root_lock);
-out:
- list_for_each_entry_safe(work, next, &works, list) {
- list_del_init(&work->list);
- btrfs_wait_and_free_delalloc_work(work);
- }
- mutex_unlock(&root->fs_info->ordered_extent_flush_mutex);
- return ret;
-}
-
-/*
* Used to start IO or wait for a given ordered extent to finish.
*
* If wait is one, this effectively waits on page writeback for all the pages
@@ -1120,42 +1034,6 @@ out:
return index;
}
-
-/*
- * add a given inode to the list of inodes that must be fully on
- * disk before a transaction commit finishes.
- *
- * This basically gives us the ext3 style data=ordered mode, and it is mostly
- * used to make sure renamed files are fully on disk.
- *
- * It is a noop if the inode is already fully on disk.
- *
- * If trans is not null, we'll do a friendly check for a transaction that
- * is already flushing things and force the IO down ourselves.
- */
-void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode)
-{
- struct btrfs_transaction *cur_trans = trans->transaction;
- u64 last_mod;
-
- last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
-
- /*
- * if this file hasn't been changed since the last transaction
- * commit, we can safely return without doing anything
- */
- if (last_mod <= root->fs_info->last_trans_committed)
- return;
-
- spin_lock(&root->fs_info->ordered_root_lock);
- if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
- list_add_tail(&BTRFS_I(inode)->ordered_operations,
- &cur_trans->ordered_operations);
- }
- spin_unlock(&root->fs_info->ordered_root_lock);
-}
-
int __init ordered_data_init(void)
{
btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 246897058efb..d81a274d621e 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -190,11 +190,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
u32 *sum, int len);
-int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, int wait);
-void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *inode);
int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
void btrfs_get_logged_extents(struct inode *inode,
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 65793edb38ca..47767d5b8f0b 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -27,7 +27,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
int ret = 0;
key.objectid = BTRFS_ORPHAN_OBJECTID;
- btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = offset;
path = btrfs_alloc_path();
@@ -48,7 +48,7 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
int ret = 0;
key.objectid = BTRFS_ORPHAN_OBJECTID;
- btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = offset;
path = btrfs_alloc_path();
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 9626b4ad3b9a..647ab12fdf5d 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -195,7 +195,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
for (i = 0 ; i < nr ; i++) {
item = btrfs_item_nr(i);
btrfs_item_key_to_cpu(l, &key, i);
- type = btrfs_key_type(&key);
+ type = key.type;
printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d "
"itemsize %d\n",
i, key.objectid, type, key.offset,
@@ -336,7 +336,6 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
for (i = 0; i < nr; i++) {
struct extent_buffer *next = read_tree_block(root,
btrfs_node_blockptr(c, i),
- btrfs_level_size(root, level - 1),
btrfs_node_ptr_generation(c, i));
if (btrfs_is_leaf(next) &&
level != 1)
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 98cb6b2630f9..48b60dbf807f 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -539,10 +539,9 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_key key;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &quota_root->state)))
+ if (btrfs_test_is_dummy_root(quota_root))
return 0;
-#endif
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -551,9 +550,15 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
key.type = BTRFS_QGROUP_INFO_KEY;
key.offset = qgroupid;
+ /*
+ * Avoid a transaction abort by catching -EEXIST here. In that
+ * case, we proceed by re-initializing the existing structure
+ * on disk.
+ */
+
ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
sizeof(*qgroup_info));
- if (ret)
+ if (ret && ret != -EEXIST)
goto out;
leaf = path->nodes[0];
@@ -572,7 +577,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
key.type = BTRFS_QGROUP_LIMIT_KEY;
ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
sizeof(*qgroup_limit));
- if (ret)
+ if (ret && ret != -EEXIST)
goto out;
leaf = path->nodes[0];
@@ -692,10 +697,9 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
int ret;
int slot;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ if (btrfs_test_is_dummy_root(root))
return 0;
-#endif
+
key.objectid = 0;
key.type = BTRFS_QGROUP_INFO_KEY;
key.offset = qgroup->qgroupid;
@@ -1201,6 +1205,50 @@ out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
}
+
+static int comp_oper_exist(struct btrfs_qgroup_operation *oper1,
+ struct btrfs_qgroup_operation *oper2)
+{
+ /*
+ * Ignore seq and type here, we're looking for any operation
+ * at all related to this extent on that root.
+ */
+ if (oper1->bytenr < oper2->bytenr)
+ return -1;
+ if (oper1->bytenr > oper2->bytenr)
+ return 1;
+ if (oper1->ref_root < oper2->ref_root)
+ return -1;
+ if (oper1->ref_root > oper2->ref_root)
+ return 1;
+ return 0;
+}
+
+static int qgroup_oper_exists(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
+{
+ struct rb_node *n;
+ struct btrfs_qgroup_operation *cur;
+ int cmp;
+
+ spin_lock(&fs_info->qgroup_op_lock);
+ n = fs_info->qgroup_op_tree.rb_node;
+ while (n) {
+ cur = rb_entry(n, struct btrfs_qgroup_operation, n);
+ cmp = comp_oper_exist(cur, oper);
+ if (cmp < 0) {
+ n = n->rb_right;
+ } else if (cmp) {
+ n = n->rb_left;
+ } else {
+ spin_unlock(&fs_info->qgroup_op_lock);
+ return -EEXIST;
+ }
+ }
+ spin_unlock(&fs_info->qgroup_op_lock);
+ return 0;
+}
+
static int comp_oper(struct btrfs_qgroup_operation *oper1,
struct btrfs_qgroup_operation *oper2)
{
@@ -1290,6 +1338,25 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
INIT_LIST_HEAD(&oper->elem.list);
oper->elem.seq = 0;
+
+ trace_btrfs_qgroup_record_ref(oper);
+
+ if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) {
+ /*
+ * If any operation for this bytenr/ref_root combo
+ * exists, then we know it's not exclusively owned and
+ * shouldn't be queued up.
+ *
+ * This also catches the case where we have a cloned
+ * extent that gets queued up multiple times during
+ * drop snapshot.
+ */
+ if (qgroup_oper_exists(fs_info, oper)) {
+ kfree(oper);
+ return 0;
+ }
+ }
+
ret = insert_qgroup_oper(fs_info, oper);
if (ret) {
/* Shouldn't happen so have an assert for developers */
@@ -1884,6 +1951,111 @@ out:
}
/*
+ * Process a reference to a shared subtree. This type of operation is
+ * queued during snapshot removal when we encounter extents which are
+ * shared between more than one root.
+ */
+static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
+{
+ struct ulist *roots = NULL;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ struct btrfs_qgroup_list *glist;
+ struct ulist *parents;
+ int ret = 0;
+ int err;
+ struct btrfs_qgroup *qg;
+ u64 root_obj = 0;
+ struct seq_list elem = {};
+
+ parents = ulist_alloc(GFP_NOFS);
+ if (!parents)
+ return -ENOMEM;
+
+ btrfs_get_tree_mod_seq(fs_info, &elem);
+ ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
+ elem.seq, &roots);
+ btrfs_put_tree_mod_seq(fs_info, &elem);
+ if (ret < 0)
+ goto out;
+
+ if (roots->nnodes != 1)
+ goto out;
+
+ ULIST_ITER_INIT(&uiter);
+ unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */
+ /*
+ * If we find our ref root then that means all refs
+ * this extent has to the root have not yet been
+ * deleted. In that case, we do nothing and let the
+ * last ref for this bytenr drive our update.
+ *
+ * This can happen for example if an extent is
+ * referenced multiple times in a snapshot (clone,
+ * etc). If we are in the middle of snapshot removal,
+ * queued updates for such an extent will find the
+ * root if we have not yet finished removing the
+ * snapshot.
+ */
+ if (unode->val == oper->ref_root)
+ goto out;
+
+ root_obj = unode->val;
+ BUG_ON(!root_obj);
+
+ spin_lock(&fs_info->qgroup_lock);
+ qg = find_qgroup_rb(fs_info, root_obj);
+ if (!qg)
+ goto out_unlock;
+
+ qg->excl += oper->num_bytes;
+ qg->excl_cmpr += oper->num_bytes;
+ qgroup_dirty(fs_info, qg);
+
+ /*
+ * Adjust counts for parent groups. First we find all
+ * parents, then in the 2nd loop we do the adjustment
+ * while adding parents of the parents to our ulist.
+ */
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ err = ulist_add(parents, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (err < 0) {
+ ret = err;
+ goto out_unlock;
+ }
+ }
+
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(parents, &uiter))) {
+ qg = u64_to_ptr(unode->aux);
+ qg->excl += oper->num_bytes;
+ qg->excl_cmpr += oper->num_bytes;
+ qgroup_dirty(fs_info, qg);
+
+ /* Add any parents of the parents */
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ err = ulist_add(parents, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (err < 0) {
+ ret = err;
+ goto out_unlock;
+ }
+ }
+ }
+
+out_unlock:
+ spin_unlock(&fs_info->qgroup_lock);
+
+out:
+ ulist_free(roots);
+ ulist_free(parents);
+ return ret;
+}
+
+/*
* btrfs_qgroup_account_ref is called for every ref that is added to or deleted
* from the fs. First, all roots referencing the extent are searched, and
* then the space is accounted accordingly to the different roots. The
@@ -1911,6 +2083,8 @@ static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
ASSERT(is_fstree(oper->ref_root));
+ trace_btrfs_qgroup_account(oper);
+
switch (oper->type) {
case BTRFS_QGROUP_OPER_ADD_EXCL:
case BTRFS_QGROUP_OPER_SUB_EXCL:
@@ -1920,6 +2094,9 @@ static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
case BTRFS_QGROUP_OPER_SUB_SHARED:
ret = qgroup_shared_accounting(trans, fs_info, oper);
break;
+ case BTRFS_QGROUP_OPER_SUB_SUBTREE:
+ ret = qgroup_subtree_accounting(trans, fs_info, oper);
+ break;
default:
ASSERT(0);
}
@@ -2068,7 +2245,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
if (srcid) {
struct btrfs_root *srcroot;
struct btrfs_key srckey;
- int srcroot_level;
srckey.objectid = srcid;
srckey.type = BTRFS_ROOT_ITEM_KEY;
@@ -2080,8 +2256,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
}
rcu_read_lock();
- srcroot_level = btrfs_header_level(srcroot->node);
- level_size = btrfs_level_size(srcroot, srcroot_level);
+ level_size = srcroot->nodesize;
rcu_read_unlock();
}
@@ -2397,7 +2572,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
found.type != BTRFS_METADATA_ITEM_KEY)
continue;
if (found.type == BTRFS_METADATA_ITEM_KEY)
- num_bytes = fs_info->extent_root->leafsize;
+ num_bytes = fs_info->extent_root->nodesize;
else
num_bytes = found.offset;
@@ -2551,6 +2726,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
memset(&fs_info->qgroup_rescan_work, 0,
sizeof(fs_info->qgroup_rescan_work));
btrfs_init_work(&fs_info->qgroup_rescan_work,
+ btrfs_qgroup_rescan_helper,
btrfs_qgroup_rescan_worker, NULL, NULL);
if (ret) {
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 5952ff1fbd7a..18cc68ca3090 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -44,6 +44,7 @@ enum btrfs_qgroup_operation_type {
BTRFS_QGROUP_OPER_ADD_SHARED,
BTRFS_QGROUP_OPER_SUB_EXCL,
BTRFS_QGROUP_OPER_SUB_SHARED,
+ BTRFS_QGROUP_OPER_SUB_SUBTREE,
};
struct btrfs_qgroup_operation {
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 4a88f073fdd7..6a41631cb959 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -912,7 +912,7 @@ static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
{
unsigned long nr = stripe_len * nr_stripes;
- return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE);
}
/*
@@ -1416,7 +1416,8 @@ cleanup:
static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
{
- btrfs_init_work(&rbio->work, rmw_work, NULL, NULL);
+ btrfs_init_work(&rbio->work, btrfs_rmw_helper,
+ rmw_work, NULL, NULL);
btrfs_queue_work(rbio->fs_info->rmw_workers,
&rbio->work);
@@ -1424,7 +1425,8 @@ static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
static void async_read_rebuild(struct btrfs_raid_bio *rbio)
{
- btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL);
+ btrfs_init_work(&rbio->work, btrfs_rmw_helper,
+ read_rebuild_work, NULL, NULL);
btrfs_queue_work(rbio->fs_info->rmw_workers,
&rbio->work);
@@ -1440,7 +1442,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
struct btrfs_bio *bbio = rbio->bbio;
struct bio_list bio_list;
int ret;
- int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
int pagenr;
int stripe;
struct bio *bio;
@@ -1665,7 +1667,8 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
plug = container_of(cb, struct btrfs_plug_cb, cb);
if (from_schedule) {
- btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
+ btrfs_init_work(&plug->work, btrfs_rmw_helper,
+ unplug_work, NULL, NULL);
btrfs_queue_work(plug->info->rmw_workers,
&plug->work);
return;
@@ -1722,7 +1725,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
int pagenr, stripe;
void **pointers;
int faila = -1, failb = -1;
- int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
struct page *page;
int err;
int i;
@@ -1937,7 +1940,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
struct btrfs_bio *bbio = rbio->bbio;
struct bio_list bio_list;
int ret;
- int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
int pagenr;
int stripe;
struct bio *bio;
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 09230cf3a244..b63ae20618fb 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -347,7 +347,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
if (!re)
return NULL;
- blocksize = btrfs_level_size(root, level);
+ blocksize = root->nodesize;
re->logical = logical;
re->blocksize = blocksize;
re->top = *top;
@@ -798,7 +798,8 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info)
/* FIXME we cannot handle this properly right now */
BUG();
}
- btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
+ btrfs_init_work(&rmw->work, btrfs_readahead_helper,
+ reada_start_machine_worker, NULL, NULL);
rmw->fs_info = fs_info;
btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 65245a07275b..74257d6436ad 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -736,7 +736,8 @@ again:
err = ret;
goto out;
}
- BUG_ON(!ret || !path1->slots[0]);
+ ASSERT(ret);
+ ASSERT(path1->slots[0]);
path1->slots[0]--;
@@ -746,10 +747,10 @@ again:
* the backref was added previously when processing
* backref of type BTRFS_TREE_BLOCK_REF_KEY
*/
- BUG_ON(!list_is_singular(&cur->upper));
+ ASSERT(list_is_singular(&cur->upper));
edge = list_entry(cur->upper.next, struct backref_edge,
list[LOWER]);
- BUG_ON(!list_empty(&edge->list[UPPER]));
+ ASSERT(list_empty(&edge->list[UPPER]));
exist = edge->node[UPPER];
/*
* add the upper level block to pending list if we need
@@ -831,7 +832,7 @@ again:
cur->cowonly = 1;
}
#else
- BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
+ ASSERT(key.type != BTRFS_EXTENT_REF_V0_KEY);
if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
#endif
if (key.objectid == key.offset) {
@@ -840,7 +841,7 @@ again:
* backref of this type.
*/
root = find_reloc_root(rc, cur->bytenr);
- BUG_ON(!root);
+ ASSERT(root);
cur->root = root;
break;
}
@@ -868,7 +869,7 @@ again:
} else {
upper = rb_entry(rb_node, struct backref_node,
rb_node);
- BUG_ON(!upper->checked);
+ ASSERT(upper->checked);
INIT_LIST_HEAD(&edge->list[UPPER]);
}
list_add_tail(&edge->list[LOWER], &cur->upper);
@@ -892,7 +893,7 @@ again:
if (btrfs_root_level(&root->root_item) == cur->level) {
/* tree root */
- BUG_ON(btrfs_root_bytenr(&root->root_item) !=
+ ASSERT(btrfs_root_bytenr(&root->root_item) ==
cur->bytenr);
if (should_ignore_root(root))
list_add(&cur->list, &useless);
@@ -927,7 +928,7 @@ again:
need_check = true;
for (; level < BTRFS_MAX_LEVEL; level++) {
if (!path2->nodes[level]) {
- BUG_ON(btrfs_root_bytenr(&root->root_item) !=
+ ASSERT(btrfs_root_bytenr(&root->root_item) ==
lower->bytenr);
if (should_ignore_root(root))
list_add(&lower->list, &useless);
@@ -977,12 +978,15 @@ again:
need_check = false;
list_add_tail(&edge->list[UPPER],
&list);
- } else
+ } else {
+ if (upper->checked)
+ need_check = true;
INIT_LIST_HEAD(&edge->list[UPPER]);
+ }
} else {
upper = rb_entry(rb_node, struct backref_node,
rb_node);
- BUG_ON(!upper->checked);
+ ASSERT(upper->checked);
INIT_LIST_HEAD(&edge->list[UPPER]);
if (!upper->owner)
upper->owner = btrfs_header_owner(eb);
@@ -1026,7 +1030,7 @@ next:
* everything goes well, connect backref nodes and insert backref nodes
* into the cache.
*/
- BUG_ON(!node->checked);
+ ASSERT(node->checked);
cowonly = node->cowonly;
if (!cowonly) {
rb_node = tree_insert(&cache->rb_root, node->bytenr,
@@ -1062,8 +1066,21 @@ next:
continue;
}
- BUG_ON(!upper->checked);
- BUG_ON(cowonly != upper->cowonly);
+ if (!upper->checked) {
+ /*
+ * Still want to blow up for developers since this is a
+ * logic bug.
+ */
+ ASSERT(0);
+ err = -EINVAL;
+ goto out;
+ }
+ if (cowonly != upper->cowonly) {
+ ASSERT(0);
+ err = -EINVAL;
+ goto out;
+ }
+
if (!cowonly) {
rb_node = tree_insert(&cache->rb_root, upper->bytenr,
&upper->rb_node);
@@ -1086,7 +1103,7 @@ next:
while (!list_empty(&useless)) {
upper = list_entry(useless.next, struct backref_node, list);
list_del_init(&upper->list);
- BUG_ON(!list_empty(&upper->upper));
+ ASSERT(list_empty(&upper->upper));
if (upper == node)
node = NULL;
if (upper->lowest) {
@@ -1119,29 +1136,45 @@ out:
if (err) {
while (!list_empty(&useless)) {
lower = list_entry(useless.next,
- struct backref_node, upper);
- list_del_init(&lower->upper);
+ struct backref_node, list);
+ list_del_init(&lower->list);
}
- upper = node;
- INIT_LIST_HEAD(&list);
- while (upper) {
- if (RB_EMPTY_NODE(&upper->rb_node)) {
- list_splice_tail(&upper->upper, &list);
- free_backref_node(cache, upper);
- }
-
- if (list_empty(&list))
- break;
-
- edge = list_entry(list.next, struct backref_edge,
- list[LOWER]);
+ while (!list_empty(&list)) {
+ edge = list_first_entry(&list, struct backref_edge,
+ list[UPPER]);
+ list_del(&edge->list[UPPER]);
list_del(&edge->list[LOWER]);
+ lower = edge->node[LOWER];
upper = edge->node[UPPER];
free_backref_edge(cache, edge);
+
+ /*
+ * Lower is no longer linked to any upper backref nodes
+ * and isn't in the cache, we can free it ourselves.
+ */
+ if (list_empty(&lower->upper) &&
+ RB_EMPTY_NODE(&lower->rb_node))
+ list_add(&lower->list, &useless);
+
+ if (!RB_EMPTY_NODE(&upper->rb_node))
+ continue;
+
+ /* Add this guy's upper edges to the list to proces */
+ list_for_each_entry(edge, &upper->upper, list[LOWER])
+ list_add_tail(&edge->list[UPPER], &list);
+ if (list_empty(&upper->upper))
+ list_add(&upper->list, &useless);
+ }
+
+ while (!list_empty(&useless)) {
+ lower = list_entry(useless.next,
+ struct backref_node, list);
+ list_del_init(&lower->list);
+ free_backref_node(cache, lower);
}
return ERR_PTR(err);
}
- BUG_ON(node && node->detached);
+ ASSERT(!node || !node->detached);
return node;
}
@@ -1787,7 +1820,7 @@ again:
btrfs_node_key_to_cpu(parent, next_key, slot + 1);
old_bytenr = btrfs_node_blockptr(parent, slot);
- blocksize = btrfs_level_size(dest, level - 1);
+ blocksize = dest->nodesize;
old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
if (level <= max_level) {
@@ -1813,8 +1846,7 @@ again:
break;
}
- eb = read_tree_block(dest, old_bytenr, blocksize,
- old_ptr_gen);
+ eb = read_tree_block(dest, old_bytenr, old_ptr_gen);
if (!eb || !extent_buffer_uptodate(eb)) {
ret = (!eb) ? -ENOMEM : -EIO;
free_extent_buffer(eb);
@@ -1944,7 +1976,6 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
u64 bytenr;
u64 ptr_gen = 0;
u64 last_snapshot;
- u32 blocksize;
u32 nritems;
last_snapshot = btrfs_root_last_snapshot(&root->root_item);
@@ -1970,8 +2001,7 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
}
bytenr = btrfs_node_blockptr(eb, path->slots[i]);
- blocksize = btrfs_level_size(root, i - 1);
- eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
+ eb = read_tree_block(root, bytenr, ptr_gen);
if (!eb || !extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
@@ -2316,7 +2346,7 @@ void free_reloc_roots(struct list_head *list)
}
static noinline_for_stack
-int merge_reloc_roots(struct reloc_control *rc)
+void merge_reloc_roots(struct reloc_control *rc)
{
struct btrfs_root *root;
struct btrfs_root *reloc_root;
@@ -2397,7 +2427,6 @@ out:
}
BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
- return ret;
}
static void free_block_list(struct rb_root *blocks)
@@ -2544,8 +2573,7 @@ u64 calcu_metadata_size(struct reloc_control *rc,
if (next->processed && (reserve || next != node))
break;
- num_bytes += btrfs_level_size(rc->extent_root,
- next->level);
+ num_bytes += rc->extent_root->nodesize;
if (list_empty(&next->upper))
break;
@@ -2679,9 +2707,9 @@ static int do_relocation(struct btrfs_trans_handle *trans,
goto next;
}
- blocksize = btrfs_level_size(root, node->level);
+ blocksize = root->nodesize;
generation = btrfs_node_ptr_generation(upper->eb, slot);
- eb = read_tree_block(root, bytenr, blocksize, generation);
+ eb = read_tree_block(root, bytenr, generation);
if (!eb || !extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
err = -EIO;
@@ -2789,7 +2817,7 @@ static void __mark_block_processed(struct reloc_control *rc,
u32 blocksize;
if (node->level == 0 ||
in_block_group(node->bytenr, rc->block_group)) {
- blocksize = btrfs_level_size(rc->extent_root, node->level);
+ blocksize = rc->extent_root->nodesize;
mark_block_processed(rc, node->bytenr, blocksize);
}
node->processed = 1;
@@ -2843,7 +2871,7 @@ static int get_tree_block_key(struct reloc_control *rc,
BUG_ON(block->key_ready);
eb = read_tree_block(rc->extent_root, block->bytenr,
- block->key.objectid, block->key.offset);
+ block->key.offset);
if (!eb || !extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
@@ -2858,20 +2886,6 @@ static int get_tree_block_key(struct reloc_control *rc,
return 0;
}
-static int reada_tree_block(struct reloc_control *rc,
- struct tree_block *block)
-{
- BUG_ON(block->key_ready);
- if (block->key.type == BTRFS_METADATA_ITEM_KEY)
- readahead_tree_block(rc->extent_root, block->bytenr,
- block->key.objectid,
- rc->extent_root->leafsize);
- else
- readahead_tree_block(rc->extent_root, block->bytenr,
- block->key.objectid, block->key.offset);
- return 0;
-}
-
/*
* helper function to relocate a tree block
*/
@@ -2951,7 +2965,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
while (rb_node) {
block = rb_entry(rb_node, struct tree_block, rb_node);
if (!block->key_ready)
- reada_tree_block(rc, block);
+ readahead_tree_block(rc->extent_root, block->bytenr,
+ block->key.objectid);
rb_node = rb_next(rb_node);
}
@@ -3313,7 +3328,7 @@ static int add_tree_block(struct reloc_control *rc,
return -ENOMEM;
block->bytenr = extent_key->objectid;
- block->key.objectid = rc->extent_root->leafsize;
+ block->key.objectid = rc->extent_root->nodesize;
block->key.offset = generation;
block->level = level;
block->key_ready = 0;
@@ -3640,7 +3655,7 @@ int add_data_references(struct reloc_control *rc,
struct btrfs_extent_inline_ref *iref;
unsigned long ptr;
unsigned long end;
- u32 blocksize = btrfs_level_size(rc->extent_root, 0);
+ u32 blocksize = rc->extent_root->nodesize;
int ret = 0;
int err = 0;
@@ -3783,7 +3798,7 @@ next:
}
if (key.type == BTRFS_METADATA_ITEM_KEY &&
- key.objectid + rc->extent_root->leafsize <=
+ key.objectid + rc->extent_root->nodesize <=
rc->search_start) {
path->slots[0]++;
goto next;
@@ -3801,7 +3816,7 @@ next:
rc->search_start = key.objectid + key.offset;
else
rc->search_start = key.objectid +
- rc->extent_root->leafsize;
+ rc->extent_root->nodesize;
memcpy(extent_key, &key, sizeof(key));
return 0;
}
@@ -4096,7 +4111,6 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
BTRFS_INODE_PREALLOC);
btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(path);
out:
btrfs_free_path(path);
return ret;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b6d198f5181e..efa083113827 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -137,7 +137,6 @@ struct scrub_ctx {
int pages_per_rd_bio;
u32 sectorsize;
u32 nodesize;
- u32 leafsize;
int is_dev_replace;
struct scrub_wr_ctx wr_ctx;
@@ -178,17 +177,12 @@ struct scrub_copy_nocow_ctx {
struct scrub_warning {
struct btrfs_path *path;
u64 extent_item_size;
- char *scratch_buf;
- char *msg_buf;
const char *errstr;
sector_t sector;
u64 logical;
struct btrfs_device *dev;
- int msg_bufsize;
- int scratch_bufsize;
};
-
static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
@@ -428,8 +422,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
sbio->index = i;
sbio->sctx = sctx;
sbio->page_count = 0;
- btrfs_init_work(&sbio->work, scrub_bio_end_io_worker,
- NULL, NULL);
+ btrfs_init_work(&sbio->work, btrfs_scrub_helper,
+ scrub_bio_end_io_worker, NULL, NULL);
if (i != SCRUB_BIOS_PER_SCTX - 1)
sctx->bios[i]->next_free = i + 1;
@@ -438,7 +432,6 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
}
sctx->first_free = 0;
sctx->nodesize = dev->dev_root->nodesize;
- sctx->leafsize = dev->dev_root->leafsize;
sctx->sectorsize = dev->dev_root->sectorsize;
atomic_set(&sctx->bios_in_flight, 0);
atomic_set(&sctx->workers_pending, 0);
@@ -553,7 +546,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
u64 ref_root;
u32 item_size;
u8 ref_level;
- const int bufsize = 4096;
int ret;
WARN_ON(sblock->page_count < 1);
@@ -561,18 +553,13 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
fs_info = sblock->sctx->dev_root->fs_info;
path = btrfs_alloc_path();
+ if (!path)
+ return;
- swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
- swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
swarn.sector = (sblock->pagev[0]->physical) >> 9;
swarn.logical = sblock->pagev[0]->logical;
swarn.errstr = errstr;
swarn.dev = NULL;
- swarn.msg_bufsize = bufsize;
- swarn.scratch_bufsize = bufsize;
-
- if (!path || !swarn.scratch_buf || !swarn.msg_buf)
- goto out;
ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
&flags);
@@ -613,8 +600,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
out:
btrfs_free_path(path);
- kfree(swarn.scratch_buf);
- kfree(swarn.msg_buf);
}
static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
@@ -681,9 +666,9 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
ret = -EIO;
goto out;
}
- fs_info = BTRFS_I(inode)->root->fs_info;
- ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
+ ret = repair_io_failure(inode, offset, PAGE_SIZE,
fixup->logical, page,
+ offset - page_offset(page),
fixup->mirror_num);
unlock_page(page);
corrected = !ret;
@@ -999,8 +984,8 @@ nodatasum_case:
fixup_nodatasum->root = fs_info->extent_root;
fixup_nodatasum->mirror_num = failed_mirror_index + 1;
scrub_pending_trans_workers_inc(sctx);
- btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum,
- NULL, NULL);
+ btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
+ scrub_fixup_nodatasum, NULL, NULL);
btrfs_queue_work(fs_info->scrub_workers,
&fixup_nodatasum->work);
goto out;
@@ -1361,6 +1346,16 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
return;
}
+static inline int scrub_check_fsid(u8 fsid[],
+ struct scrub_page *spage)
+{
+ struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
+ int ret;
+
+ ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE);
+ return !ret;
+}
+
static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock,
int is_metadata, int have_csum,
@@ -1380,7 +1375,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
h = (struct btrfs_header *)mapped_buffer;
if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
- memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
+ !scrub_check_fsid(h->fsid, sblock->pagev[0]) ||
memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
BTRFS_UUID_SIZE)) {
sblock->header_error = 1;
@@ -1616,7 +1611,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err)
sbio->err = err;
sbio->bio = bio;
- btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
+ btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
+ scrub_wr_bio_end_io_worker, NULL, NULL);
btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
}
@@ -1750,14 +1746,13 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
++fail;
- if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
+ if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
++fail;
if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
BTRFS_UUID_SIZE))
++fail;
- WARN_ON(sctx->nodesize != sctx->leafsize);
len = sctx->nodesize - BTRFS_CSUM_SIZE;
mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
@@ -1790,8 +1785,6 @@ static int scrub_checksum_super(struct scrub_block *sblock)
{
struct btrfs_super_block *s;
struct scrub_ctx *sctx = sblock->sctx;
- struct btrfs_root *root = sctx->dev_root;
- struct btrfs_fs_info *fs_info = root->fs_info;
u8 calculated_csum[BTRFS_CSUM_SIZE];
u8 on_disk_csum[BTRFS_CSUM_SIZE];
struct page *page;
@@ -1816,7 +1809,7 @@ static int scrub_checksum_super(struct scrub_block *sblock)
if (sblock->pagev[0]->generation != btrfs_super_generation(s))
++fail_gen;
- if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
+ if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
++fail_cor;
len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
@@ -2195,7 +2188,6 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
sctx->stat.data_bytes_scrubbed += len;
spin_unlock(&sctx->stat_lock);
} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- WARN_ON(sctx->nodesize != sctx->leafsize);
blocksize = sctx->nodesize;
spin_lock(&sctx->stat_lock);
sctx->stat.tree_extents_scrubbed++;
@@ -2486,7 +2478,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
btrfs_item_key_to_cpu(l, &key, slot);
if (key.type == BTRFS_METADATA_ITEM_KEY)
- bytes = root->leafsize;
+ bytes = root->nodesize;
else
bytes = key.offset;
@@ -2713,7 +2705,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (found_key.objectid != scrub_dev->devid)
break;
- if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
+ if (found_key.type != BTRFS_DEV_EXTENT_KEY)
break;
if (found_key.offset >= end)
@@ -2827,11 +2819,16 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
return -EIO;
- gen = root->fs_info->last_trans_committed;
+ /* Seed devices of a new filesystem has their own generation. */
+ if (scrub_dev->fs_devices != root->fs_info->fs_devices)
+ gen = scrub_dev->generation;
+ else
+ gen = root->fs_info->last_trans_committed;
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
bytenr = btrfs_sb_offset(i);
- if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >
+ scrub_dev->commit_total_bytes)
break;
ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
@@ -2904,21 +2901,11 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
struct scrub_ctx *sctx;
int ret;
struct btrfs_device *dev;
+ struct rcu_string *name;
if (btrfs_fs_closing(fs_info))
return -EINVAL;
- /*
- * check some assumptions
- */
- if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
- btrfs_err(fs_info,
- "scrub: size assumption nodesize == leafsize (%d == %d) fails",
- fs_info->chunk_root->nodesize,
- fs_info->chunk_root->leafsize);
- return -EINVAL;
- }
-
if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
/*
* in this case scrub is unable to calculate the checksum
@@ -2965,6 +2952,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
return -ENODEV;
}
+ if (!is_dev_replace && !readonly && !dev->writeable) {
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ rcu_read_lock();
+ name = rcu_dereference(dev->name);
+ btrfs_err(fs_info, "scrub: device %s is not writable",
+ name->str);
+ rcu_read_unlock();
+ return -EROFS;
+ }
+
mutex_lock(&fs_info->scrub_lock);
if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
mutex_unlock(&fs_info->scrub_lock);
@@ -3203,7 +3200,8 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
nocow_ctx->len = len;
nocow_ctx->mirror_num = mirror_num;
nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
- btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL);
+ btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
+ copy_nocow_pages_worker, NULL, NULL);
INIT_LIST_HEAD(&nocow_ctx->inodes);
btrfs_queue_work(fs_info->scrub_nocow_workers,
&nocow_ctx->work);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 6528aa662181..874828dd0a86 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -515,7 +515,8 @@ static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
set_fs(KERNEL_DS);
while (pos < len) {
- ret = vfs_write(filp, (char *)buf + pos, len - pos, off);
+ ret = vfs_write(filp, (__force const char __user *)buf + pos,
+ len - pos, off);
/* TODO handle that correctly */
/*if (ret == -ERESTARTSYS) {
continue;
@@ -985,11 +986,13 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
int num;
u8 type;
- if (found_key->type == BTRFS_XATTR_ITEM_KEY)
- buf_len = BTRFS_MAX_XATTR_SIZE(root);
- else
- buf_len = PATH_MAX;
-
+ /*
+ * Start with a small buffer (1 page). If later we end up needing more
+ * space, which can happen for xattrs on a fs with a leaf size greater
+ * then the page size, attempt to increase the buffer. Typically xattr
+ * values are small.
+ */
+ buf_len = PATH_MAX;
buf = kmalloc(buf_len, GFP_NOFS);
if (!buf) {
ret = -ENOMEM;
@@ -1016,7 +1019,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
ret = -ENAMETOOLONG;
goto out;
}
- if (name_len + data_len > buf_len) {
+ if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)) {
ret = -E2BIG;
goto out;
}
@@ -1024,12 +1027,34 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
/*
* Path too long
*/
- if (name_len + data_len > buf_len) {
+ if (name_len + data_len > PATH_MAX) {
ret = -ENAMETOOLONG;
goto out;
}
}
+ if (name_len + data_len > buf_len) {
+ buf_len = name_len + data_len;
+ if (is_vmalloc_addr(buf)) {
+ vfree(buf);
+ buf = NULL;
+ } else {
+ char *tmp = krealloc(buf, buf_len,
+ GFP_NOFS | __GFP_NOWARN);
+
+ if (!tmp)
+ kfree(buf);
+ buf = tmp;
+ }
+ if (!buf) {
+ buf = vmalloc(buf_len);
+ if (!buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ }
+
read_extent_buffer(eb, buf, (unsigned long)(di + 1),
name_len + data_len);
@@ -1050,7 +1075,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
}
out:
- kfree(buf);
+ kvfree(buf);
return ret;
}
@@ -3302,7 +3327,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
if (ret < 0 && ret != -ENOENT) {
goto out;
} else if (ret == -ENOENT) {
- ret = 1;
+ ret = 0;
break;
}
@@ -5703,7 +5728,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
NULL);
sort_clone_roots = 1;
- current->journal_info = (void *)BTRFS_SEND_TRANS_STUB;
+ current->journal_info = BTRFS_SEND_TRANS_STUB;
ret = send_subvol(sctx);
current->journal_info = NULL;
if (ret < 0)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8e16bca69c56..54bd91ece35b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -60,6 +60,7 @@
#include "backref.h"
#include "tests/btrfs-tests.h"
+#include "qgroup.h"
#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>
@@ -307,13 +308,7 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
static void btrfs_put_super(struct super_block *sb)
{
- (void)close_ctree(btrfs_sb(sb)->tree_root);
- /* FIXME: need to fix VFS to return error? */
- /* AV: return it _where_? ->put_super() can be triggered by any number
- * of async events, up to and including delivery of SIGKILL to the
- * last process that kept it busy. Or segfault in the aforementioned
- * process... Whom would you report that to?
- */
+ close_ctree(btrfs_sb(sb)->tree_root);
}
enum {
@@ -400,7 +395,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
int ret = 0;
char *compress_type;
bool compress_force = false;
- bool compress = false;
cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
if (cache_gen)
@@ -478,7 +472,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
/* Fallthrough */
case Opt_compress:
case Opt_compress_type:
- compress = true;
if (token == Opt_compress ||
token == Opt_compress_force ||
strcmp(args[0].from, "zlib") == 0) {
@@ -508,11 +501,18 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
btrfs_set_and_info(root, FORCE_COMPRESS,
"force %s compression",
compress_type);
- } else if (compress) {
+ } else {
if (!btrfs_test_opt(root, COMPRESS))
btrfs_info(root->fs_info,
"btrfs: use %s compression",
compress_type);
+ /*
+ * If we remount from compress-force=xxx to
+ * compress=xxx, we need clear FORCE_COMPRESS
+ * flag, otherwise, there is no way for users
+ * to disable forcible compression separately.
+ */
+ btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
}
break;
case Opt_ssd:
@@ -851,7 +851,6 @@ static struct dentry *get_default_root(struct super_block *sb,
struct btrfs_path *path;
struct btrfs_key location;
struct inode *inode;
- struct dentry *dentry;
u64 dir_id;
int new = 0;
@@ -922,13 +921,7 @@ setup_root:
return dget(sb->s_root);
}
- dentry = d_obtain_alias(inode);
- if (!IS_ERR(dentry)) {
- spin_lock(&dentry->d_lock);
- dentry->d_flags &= ~DCACHE_DISCONNECTED;
- spin_unlock(&dentry->d_lock);
- }
- return dentry;
+ return d_obtain_root(inode);
}
static int btrfs_fill_super(struct super_block *sb,
@@ -1021,7 +1014,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",nodatacow");
if (btrfs_test_opt(root, NOBARRIER))
seq_puts(seq, ",nobarrier");
- if (info->max_inline != 8192 * 1024)
+ if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
seq_printf(seq, ",max_inline=%llu", info->max_inline);
if (info->alloc_start != 0)
seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
@@ -1222,6 +1215,56 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
return root;
}
+static int parse_security_options(char *orig_opts,
+ struct security_mnt_opts *sec_opts)
+{
+ char *secdata = NULL;
+ int ret = 0;
+
+ secdata = alloc_secdata();
+ if (!secdata)
+ return -ENOMEM;
+ ret = security_sb_copy_data(orig_opts, secdata);
+ if (ret) {
+ free_secdata(secdata);
+ return ret;
+ }
+ ret = security_sb_parse_opts_str(secdata, sec_opts);
+ free_secdata(secdata);
+ return ret;
+}
+
+static int setup_security_options(struct btrfs_fs_info *fs_info,
+ struct super_block *sb,
+ struct security_mnt_opts *sec_opts)
+{
+ int ret = 0;
+
+ /*
+ * Call security_sb_set_mnt_opts() to check whether new sec_opts
+ * is valid.
+ */
+ ret = security_sb_set_mnt_opts(sb, sec_opts, 0, NULL);
+ if (ret)
+ return ret;
+
+#ifdef CONFIG_SECURITY
+ if (!fs_info->security_opts.num_mnt_opts) {
+ /* first time security setup, copy sec_opts to fs_info */
+ memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts));
+ } else {
+ /*
+ * Since SELinux(the only one supports security_mnt_opts) does
+ * NOT support changing context during remount/mount same sb,
+ * This must be the same or part of the same security options,
+ * just free it.
+ */
+ security_free_mnt_opts(sec_opts);
+ }
+#endif
+ return ret;
+}
+
/*
* Find a superblock for the given device / mount point.
*
@@ -1236,6 +1279,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
struct dentry *root;
struct btrfs_fs_devices *fs_devices = NULL;
struct btrfs_fs_info *fs_info = NULL;
+ struct security_mnt_opts new_sec_opts;
fmode_t mode = FMODE_READ;
char *subvol_name = NULL;
u64 subvol_objectid = 0;
@@ -1258,9 +1302,16 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
return root;
}
+ security_init_mnt_opts(&new_sec_opts);
+ if (data) {
+ error = parse_security_options(data, &new_sec_opts);
+ if (error)
+ return ERR_PTR(error);
+ }
+
error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
if (error)
- return ERR_PTR(error);
+ goto error_sec_opts;
/*
* Setup a dummy root and fs_info for test/set super. This is because
@@ -1269,13 +1320,16 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
* then open_ctree will properly initialize everything later.
*/
fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
- if (!fs_info)
- return ERR_PTR(-ENOMEM);
+ if (!fs_info) {
+ error = -ENOMEM;
+ goto error_sec_opts;
+ }
fs_info->fs_devices = fs_devices;
fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
+ security_init_mnt_opts(&fs_info->security_opts);
if (!fs_info->super_copy || !fs_info->super_for_commit) {
error = -ENOMEM;
goto error_fs_info;
@@ -1313,8 +1367,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
}
root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
- if (IS_ERR(root))
+ if (IS_ERR(root)) {
deactivate_locked_super(s);
+ error = PTR_ERR(root);
+ goto error_sec_opts;
+ }
+
+ fs_info = btrfs_sb(s);
+ error = setup_security_options(fs_info, s, &new_sec_opts);
+ if (error) {
+ dput(root);
+ deactivate_locked_super(s);
+ goto error_sec_opts;
+ }
return root;
@@ -1322,6 +1387,8 @@ error_close_devices:
btrfs_close_devices(fs_devices);
error_fs_info:
free_fs_info(fs_info);
+error_sec_opts:
+ security_free_mnt_opts(&new_sec_opts);
return ERR_PTR(error);
}
@@ -1403,6 +1470,21 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
sync_filesystem(sb);
btrfs_remount_prepare(fs_info);
+ if (data) {
+ struct security_mnt_opts new_sec_opts;
+
+ security_init_mnt_opts(&new_sec_opts);
+ ret = parse_security_options(data, &new_sec_opts);
+ if (ret)
+ goto restore;
+ ret = setup_security_options(fs_info, sb,
+ &new_sec_opts);
+ if (ret) {
+ security_free_mnt_opts(&new_sec_opts);
+ goto restore;
+ }
+ }
+
ret = btrfs_parse_options(root, data);
if (ret) {
ret = -EINVAL;
@@ -1672,6 +1754,21 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
return 0;
}
+/*
+ * Calculate numbers for 'df', pessimistic in case of mixed raid profiles.
+ *
+ * If there's a redundant raid level at DATA block groups, use the respective
+ * multiplier to scale the sizes.
+ *
+ * Unused device space usage is based on simulating the chunk allocator
+ * algorithm that respects the device sizes, order of allocations and the
+ * 'alloc_start' value, this is a close approximation of the actual use but
+ * there are other factors that may change the result (like a new metadata
+ * chunk).
+ *
+ * FIXME: not accurate for mixed block groups, total and free/used are ok,
+ * available appears slightly larger.
+ */
static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
@@ -1682,36 +1779,66 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
u64 total_free_data = 0;
int bits = dentry->d_sb->s_blocksize_bits;
__be32 *fsid = (__be32 *)fs_info->fsid;
+ unsigned factor = 1;
+ struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
int ret;
- /* holding chunk_muext to avoid allocating new chunks */
+ /*
+ * holding chunk_muext to avoid allocating new chunks, holding
+ * device_list_mutex to avoid the device being removed
+ */
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
mutex_lock(&fs_info->chunk_mutex);
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
+ int i;
+
total_free_data += found->disk_total - found->disk_used;
total_free_data -=
btrfs_account_ro_block_groups_free_space(found);
+
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ if (!list_empty(&found->block_groups[i])) {
+ switch (i) {
+ case BTRFS_RAID_DUP:
+ case BTRFS_RAID_RAID1:
+ case BTRFS_RAID_RAID10:
+ factor = 2;
+ }
+ }
+ }
}
total_used += found->disk_used;
}
+
rcu_read_unlock();
- buf->f_namelen = BTRFS_NAME_LEN;
- buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
- buf->f_bfree = buf->f_blocks - (total_used >> bits);
- buf->f_bsize = dentry->d_sb->s_blocksize;
- buf->f_type = BTRFS_SUPER_MAGIC;
+ buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor);
+ buf->f_blocks >>= bits;
+ buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits);
+
+ /* Account global block reserve as used, it's in logical size already */
+ spin_lock(&block_rsv->lock);
+ buf->f_bfree -= block_rsv->size >> bits;
+ spin_unlock(&block_rsv->lock);
+
buf->f_bavail = total_free_data;
ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
if (ret) {
mutex_unlock(&fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
return ret;
}
- buf->f_bavail += total_free_data;
+ buf->f_bavail += div_u64(total_free_data, factor);
buf->f_bavail = buf->f_bavail >> bits;
mutex_unlock(&fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+ buf->f_type = BTRFS_SUPER_MAGIC;
+ buf->f_bsize = dentry->d_sb->s_blocksize;
+ buf->f_namelen = BTRFS_NAME_LEN;
/* We treat it as constant endianness (it doesn't matter _which_)
because we want the fsid to come out the same whether mounted
@@ -1737,7 +1864,7 @@ static struct file_system_type btrfs_fs_type = {
.name = "btrfs",
.mount = btrfs_mount,
.kill_sb = btrfs_kill_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
};
MODULE_ALIAS_FS("btrfs");
@@ -1961,11 +2088,15 @@ static int __init init_btrfs_fs(void)
err = btrfs_prelim_ref_init();
if (err)
+ goto free_delayed_ref;
+
+ err = btrfs_end_io_wq_init();
+ if (err)
goto free_prelim_ref;
err = btrfs_interface_init();
if (err)
- goto free_delayed_ref;
+ goto free_end_io_wq;
btrfs_init_lockdep();
@@ -1983,6 +2114,8 @@ static int __init init_btrfs_fs(void)
unregister_ioctl:
btrfs_interface_exit();
+free_end_io_wq:
+ btrfs_end_io_wq_exit();
free_prelim_ref:
btrfs_prelim_ref_exit();
free_delayed_ref:
@@ -2018,6 +2151,7 @@ static void __exit exit_btrfs_fs(void)
extent_map_exit();
extent_io_exit();
btrfs_interface_exit();
+ btrfs_end_io_wq_exit();
unregister_filesystem(&btrfs_fs_type);
btrfs_exit_sysfs();
btrfs_cleanup_fs_uuids();
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 78699364f537..b2e7bb4393f6 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -242,7 +242,7 @@ static ssize_t global_rsv_size_show(struct kobject *kobj,
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf);
}
-BTRFS_ATTR(global_rsv_size, 0444, global_rsv_size_show);
+BTRFS_ATTR(global_rsv_size, global_rsv_size_show);
static ssize_t global_rsv_reserved_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
@@ -251,7 +251,7 @@ static ssize_t global_rsv_reserved_show(struct kobject *kobj,
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf);
}
-BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show);
+BTRFS_ATTR(global_rsv_reserved, global_rsv_reserved_show);
#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj)
#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj)
@@ -306,7 +306,7 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \
struct btrfs_space_info *sinfo = to_space_info(kobj); \
return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \
} \
-BTRFS_ATTR(field, 0444, btrfs_space_info_show_##field)
+BTRFS_ATTR(field, btrfs_space_info_show_##field)
static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj,
struct kobj_attribute *a,
@@ -325,7 +325,7 @@ SPACE_INFO_ATTR(bytes_reserved);
SPACE_INFO_ATTR(bytes_may_use);
SPACE_INFO_ATTR(disk_used);
SPACE_INFO_ATTR(disk_total);
-BTRFS_ATTR(total_bytes_pinned, 0444, btrfs_space_info_show_total_bytes_pinned);
+BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned);
static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(flags),
@@ -363,7 +363,8 @@ static ssize_t btrfs_label_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%s\n", fs_info->super_copy->label);
+ char *label = fs_info->super_copy->label;
+ return snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
}
static ssize_t btrfs_label_store(struct kobject *kobj,
@@ -374,8 +375,18 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
struct btrfs_trans_handle *trans;
struct btrfs_root *root = fs_info->fs_root;
int ret;
+ size_t p_len;
- if (len >= BTRFS_LABEL_SIZE)
+ if (fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ /*
+ * p_len is the len until the first occurrence of either
+ * '\n' or '\0'
+ */
+ p_len = strcspn(buf, "\n");
+
+ if (p_len >= BTRFS_LABEL_SIZE)
return -EINVAL;
trans = btrfs_start_transaction(root, 0);
@@ -383,7 +394,8 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
return PTR_ERR(trans);
spin_lock(&root->fs_info->super_lock);
- strcpy(fs_info->super_copy->label, buf);
+ memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE);
+ memcpy(fs_info->super_copy->label, buf, p_len);
spin_unlock(&root->fs_info->super_lock);
ret = btrfs_commit_transaction(trans, root);
@@ -392,14 +404,7 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
return ret;
}
-BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store);
-
-static ssize_t btrfs_no_store(struct kobject *kobj,
- struct kobj_attribute *a,
- const char *buf, size_t len)
-{
- return -EPERM;
-}
+BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store);
static ssize_t btrfs_nodesize_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
@@ -409,7 +414,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj,
return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
}
-BTRFS_ATTR_RW(nodesize, 0444, btrfs_nodesize_show, btrfs_no_store);
+BTRFS_ATTR(nodesize, btrfs_nodesize_show);
static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
@@ -419,7 +424,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
}
-BTRFS_ATTR_RW(sectorsize, 0444, btrfs_sectorsize_show, btrfs_no_store);
+BTRFS_ATTR(sectorsize, btrfs_sectorsize_show);
static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
@@ -429,7 +434,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
}
-BTRFS_ATTR_RW(clone_alignment, 0444, btrfs_clone_alignment_show, btrfs_no_store);
+BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show);
static struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(label),
@@ -614,7 +619,7 @@ int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
if (!fs_info->device_dir_kobj)
return -EINVAL;
- if (one_device) {
+ if (one_device && one_device->bdev) {
disk = one_device->bdev->bd_part;
disk_kobj = &part_to_dev(disk)->kobj;
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index ac46df37504c..f7dd298b3cf6 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -20,16 +20,20 @@ enum btrfs_feature_set {
.store = _store, \
}
-#define BTRFS_ATTR_RW(_name, _mode, _show, _store) \
-static struct kobj_attribute btrfs_attr_##_name = \
- __INIT_KOBJ_ATTR(_name, _mode, _show, _store)
-#define BTRFS_ATTR(_name, _mode, _show) \
- BTRFS_ATTR_RW(_name, _mode, _show, NULL)
+#define BTRFS_ATTR_RW(_name, _show, _store) \
+ static struct kobj_attribute btrfs_attr_##_name = \
+ __INIT_KOBJ_ATTR(_name, 0644, _show, _store)
+
+#define BTRFS_ATTR(_name, _show) \
+ static struct kobj_attribute btrfs_attr_##_name = \
+ __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
+
#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr)
#define BTRFS_RAID_ATTR(_name, _show) \
-static struct kobj_attribute btrfs_raid_attr_##_name = \
+ static struct kobj_attribute btrfs_raid_attr_##_name = \
__INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
+
#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr)
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index c8d9ddf84c69..2299bfde39ee 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -40,11 +40,12 @@ static struct btrfs_block_group_cache *init_test_block_group(void)
cache->key.offset = 1024 * 1024 * 1024;
cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
cache->sectorsize = 4096;
+ cache->full_stripe_len = 4096;
spin_lock_init(&cache->lock);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
- INIT_LIST_HEAD(&cache->new_bg_list);
+ INIT_LIST_HEAD(&cache->bg_list);
btrfs_init_free_space_ctl(cache);
@@ -364,6 +365,517 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
return 0;
}
+/* Used by test_steal_space_from_bitmap_to_extent(). */
+static bool test_use_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ return ctl->free_extents > 0;
+}
+
+/* Used by test_steal_space_from_bitmap_to_extent(). */
+static int
+check_num_extents_and_bitmaps(const struct btrfs_block_group_cache *cache,
+ const int num_extents,
+ const int num_bitmaps)
+{
+ if (cache->free_space_ctl->free_extents != num_extents) {
+ test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n",
+ cache->free_space_ctl->free_extents, num_extents);
+ return -EINVAL;
+ }
+ if (cache->free_space_ctl->total_bitmaps != num_bitmaps) {
+ test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n",
+ cache->free_space_ctl->total_bitmaps, num_bitmaps);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* Used by test_steal_space_from_bitmap_to_extent(). */
+static int check_cache_empty(struct btrfs_block_group_cache *cache)
+{
+ u64 offset;
+ u64 max_extent_size;
+
+ /*
+ * Now lets confirm that there's absolutely no free space left to
+ * allocate.
+ */
+ if (cache->free_space_ctl->free_space != 0) {
+ test_msg("Cache free space is not 0\n");
+ return -EINVAL;
+ }
+
+ /* And any allocation request, no matter how small, should fail now. */
+ offset = btrfs_find_space_for_alloc(cache, 0, 4096, 0,
+ &max_extent_size);
+ if (offset != 0) {
+ test_msg("Space allocation did not fail, returned offset: %llu",
+ offset);
+ return -EINVAL;
+ }
+
+ /* And no extent nor bitmap entries in the cache anymore. */
+ return check_num_extents_and_bitmaps(cache, 0, 0);
+}
+
+/*
+ * Before we were able to steal free space from a bitmap entry to an extent
+ * entry, we could end up with 2 entries representing a contiguous free space.
+ * One would be an extent entry and the other a bitmap entry. Since in order
+ * to allocate space to a caller we use only 1 entry, we couldn't return that
+ * whole range to the caller if it was requested. This forced the caller to
+ * either assume ENOSPC or perform several smaller space allocations, which
+ * wasn't optimal as they could be spread all over the block group while under
+ * concurrency (extra overhead and fragmentation).
+ *
+ * This stealing approach is benefical, since we always prefer to allocate from
+ * extent entries, both for clustered and non-clustered allocation requests.
+ */
+static int
+test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
+{
+ int ret;
+ u64 offset;
+ u64 max_extent_size;
+
+ bool (*use_bitmap_op)(struct btrfs_free_space_ctl *,
+ struct btrfs_free_space *);
+
+ test_msg("Running space stealing from bitmap to extent\n");
+
+ /*
+ * For this test, we want to ensure we end up with an extent entry
+ * immediately adjacent to a bitmap entry, where the bitmap starts
+ * at an offset where the extent entry ends. We keep adding and
+ * removing free space to reach into this state, but to get there
+ * we need to reach a point where marking new free space doesn't
+ * result in adding new extent entries or merging the new space
+ * with existing extent entries - the space ends up being marked
+ * in an existing bitmap that covers the new free space range.
+ *
+ * To get there, we need to reach the threshold defined set at
+ * cache->free_space_ctl->extents_thresh, which currently is
+ * 256 extents on a x86_64 system at least, and a few other
+ * conditions (check free_space_cache.c). Instead of making the
+ * test much longer and complicated, use a "use_bitmap" operation
+ * that forces use of bitmaps as soon as we have at least 1
+ * extent entry.
+ */
+ use_bitmap_op = cache->free_space_ctl->op->use_bitmap;
+ cache->free_space_ctl->op->use_bitmap = test_use_bitmap;
+
+ /*
+ * Extent entry covering free space range [128Mb - 256Kb, 128Mb - 128Kb[
+ */
+ ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 - 256 * 1024,
+ 128 * 1024, 0);
+ if (ret) {
+ test_msg("Couldn't add extent entry %d\n", ret);
+ return ret;
+ }
+
+ /* Bitmap entry covering free space range [128Mb + 512Kb, 256Mb[ */
+ ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 512 * 1024,
+ 128 * 1024 * 1024 - 512 * 1024, 1);
+ if (ret) {
+ test_msg("Couldn't add bitmap entry %d\n", ret);
+ return ret;
+ }
+
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Now make only the first 256Kb of the bitmap marked as free, so that
+ * we end up with only the following ranges marked as free space:
+ *
+ * [128Mb - 256Kb, 128Mb - 128Kb[
+ * [128Mb + 512Kb, 128Mb + 768Kb[
+ */
+ ret = btrfs_remove_free_space(cache,
+ 128 * 1024 * 1024 + 768 * 1024,
+ 128 * 1024 * 1024 - 768 * 1024);
+ if (ret) {
+ test_msg("Failed to free part of bitmap space %d\n", ret);
+ return ret;
+ }
+
+ /* Confirm that only those 2 ranges are marked as free. */
+ if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
+ 128 * 1024)) {
+ test_msg("Free space range missing\n");
+ return -ENOENT;
+ }
+ if (!test_check_exists(cache, 128 * 1024 * 1024 + 512 * 1024,
+ 256 * 1024)) {
+ test_msg("Free space range missing\n");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that the bitmap range [128Mb + 768Kb, 256Mb[ isn't marked
+ * as free anymore.
+ */
+ if (test_check_exists(cache, 128 * 1024 * 1024 + 768 * 1024,
+ 128 * 1024 * 1024 - 768 * 1024)) {
+ test_msg("Bitmap region not removed from space cache\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Confirm that the region [128Mb + 256Kb, 128Mb + 512Kb[, which is
+ * covered by the bitmap, isn't marked as free.
+ */
+ if (test_check_exists(cache, 128 * 1024 * 1024 + 256 * 1024,
+ 256 * 1024)) {
+ test_msg("Invalid bitmap region marked as free\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Confirm that the region [128Mb, 128Mb + 256Kb[, which is covered
+ * by the bitmap too, isn't marked as free either.
+ */
+ if (test_check_exists(cache, 128 * 1024 * 1024,
+ 256 * 1024)) {
+ test_msg("Invalid bitmap region marked as free\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Now lets mark the region [128Mb, 128Mb + 512Kb[ as free too. But,
+ * lets make sure the free space cache marks it as free in the bitmap,
+ * and doesn't insert a new extent entry to represent this region.
+ */
+ ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 512 * 1024);
+ if (ret) {
+ test_msg("Error adding free space: %d\n", ret);
+ return ret;
+ }
+ /* Confirm the region is marked as free. */
+ if (!test_check_exists(cache, 128 * 1024 * 1024, 512 * 1024)) {
+ test_msg("Bitmap region not marked as free\n");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that no new extent entries or bitmap entries were added to
+ * the cache after adding that free space region.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Now lets add a small free space region to the right of the previous
+ * one, which is not contiguous with it and is part of the bitmap too.
+ * The goal is to test that the bitmap entry space stealing doesn't
+ * steal this space region.
+ */
+ ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 + 16 * 1024 * 1024,
+ 4096);
+ if (ret) {
+ test_msg("Error adding free space: %d\n", ret);
+ return ret;
+ }
+
+ /*
+ * Confirm that no new extent entries or bitmap entries were added to
+ * the cache after adding that free space region.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Now mark the region [128Mb - 128Kb, 128Mb[ as free too. This will
+ * expand the range covered by the existing extent entry that represents
+ * the free space [128Mb - 256Kb, 128Mb - 128Kb[.
+ */
+ ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 128 * 1024,
+ 128 * 1024);
+ if (ret) {
+ test_msg("Error adding free space: %d\n", ret);
+ return ret;
+ }
+ /* Confirm the region is marked as free. */
+ if (!test_check_exists(cache, 128 * 1024 * 1024 - 128 * 1024,
+ 128 * 1024)) {
+ test_msg("Extent region not marked as free\n");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that our extent entry didn't stole all free space from the
+ * bitmap, because of the small 4Kb free space region.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * So now we have the range [128Mb - 256Kb, 128Mb + 768Kb[ as free
+ * space. Without stealing bitmap free space into extent entry space,
+ * we would have all this free space represented by 2 entries in the
+ * cache:
+ *
+ * extent entry covering range: [128Mb - 256Kb, 128Mb[
+ * bitmap entry covering range: [128Mb, 128Mb + 768Kb[
+ *
+ * Attempting to allocate the whole free space (1Mb) would fail, because
+ * we can't allocate from multiple entries.
+ * With the bitmap free space stealing, we get a single extent entry
+ * that represents the 1Mb free space, and therefore we're able to
+ * allocate the whole free space at once.
+ */
+ if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
+ 1 * 1024 * 1024)) {
+ test_msg("Expected region not marked as free\n");
+ return -ENOENT;
+ }
+
+ if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 4096)) {
+ test_msg("Cache free space is not 1Mb + 4Kb\n");
+ return -EINVAL;
+ }
+
+ offset = btrfs_find_space_for_alloc(cache,
+ 0, 1 * 1024 * 1024, 0,
+ &max_extent_size);
+ if (offset != (128 * 1024 * 1024 - 256 * 1024)) {
+ test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
+ offset);
+ return -EINVAL;
+ }
+
+ /* All that remains is a 4Kb free space region in a bitmap. Confirm. */
+ ret = check_num_extents_and_bitmaps(cache, 1, 1);
+ if (ret)
+ return ret;
+
+ if (cache->free_space_ctl->free_space != 4096) {
+ test_msg("Cache free space is not 4Kb\n");
+ return -EINVAL;
+ }
+
+ offset = btrfs_find_space_for_alloc(cache,
+ 0, 4096, 0,
+ &max_extent_size);
+ if (offset != (128 * 1024 * 1024 + 16 * 1024 * 1024)) {
+ test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n",
+ offset);
+ return -EINVAL;
+ }
+
+ ret = check_cache_empty(cache);
+ if (ret)
+ return ret;
+
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+ /*
+ * Now test a similar scenario, but where our extent entry is located
+ * to the right of the bitmap entry, so that we can check that stealing
+ * space from a bitmap to the front of an extent entry works.
+ */
+
+ /*
+ * Extent entry covering free space range [128Mb + 128Kb, 128Mb + 256Kb[
+ */
+ ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 128 * 1024,
+ 128 * 1024, 0);
+ if (ret) {
+ test_msg("Couldn't add extent entry %d\n", ret);
+ return ret;
+ }
+
+ /* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */
+ ret = test_add_free_space_entry(cache, 0,
+ 128 * 1024 * 1024 - 512 * 1024, 1);
+ if (ret) {
+ test_msg("Couldn't add bitmap entry %d\n", ret);
+ return ret;
+ }
+
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Now make only the last 256Kb of the bitmap marked as free, so that
+ * we end up with only the following ranges marked as free space:
+ *
+ * [128Mb + 128b, 128Mb + 256Kb[
+ * [128Mb - 768Kb, 128Mb - 512Kb[
+ */
+ ret = btrfs_remove_free_space(cache,
+ 0,
+ 128 * 1024 * 1024 - 768 * 1024);
+ if (ret) {
+ test_msg("Failed to free part of bitmap space %d\n", ret);
+ return ret;
+ }
+
+ /* Confirm that only those 2 ranges are marked as free. */
+ if (!test_check_exists(cache, 128 * 1024 * 1024 + 128 * 1024,
+ 128 * 1024)) {
+ test_msg("Free space range missing\n");
+ return -ENOENT;
+ }
+ if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
+ 256 * 1024)) {
+ test_msg("Free space range missing\n");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that the bitmap range [0, 128Mb - 768Kb[ isn't marked
+ * as free anymore.
+ */
+ if (test_check_exists(cache, 0,
+ 128 * 1024 * 1024 - 768 * 1024)) {
+ test_msg("Bitmap region not removed from space cache\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Confirm that the region [128Mb - 512Kb, 128Mb[, which is
+ * covered by the bitmap, isn't marked as free.
+ */
+ if (test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
+ 512 * 1024)) {
+ test_msg("Invalid bitmap region marked as free\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Now lets mark the region [128Mb - 512Kb, 128Mb[ as free too. But,
+ * lets make sure the free space cache marks it as free in the bitmap,
+ * and doesn't insert a new extent entry to represent this region.
+ */
+ ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 512 * 1024,
+ 512 * 1024);
+ if (ret) {
+ test_msg("Error adding free space: %d\n", ret);
+ return ret;
+ }
+ /* Confirm the region is marked as free. */
+ if (!test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
+ 512 * 1024)) {
+ test_msg("Bitmap region not marked as free\n");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that no new extent entries or bitmap entries were added to
+ * the cache after adding that free space region.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Now lets add a small free space region to the left of the previous
+ * one, which is not contiguous with it and is part of the bitmap too.
+ * The goal is to test that the bitmap entry space stealing doesn't
+ * steal this space region.
+ */
+ ret = btrfs_add_free_space(cache, 32 * 1024 * 1024, 8192);
+ if (ret) {
+ test_msg("Error adding free space: %d\n", ret);
+ return ret;
+ }
+
+ /*
+ * Now mark the region [128Mb, 128Mb + 128Kb[ as free too. This will
+ * expand the range covered by the existing extent entry that represents
+ * the free space [128Mb + 128Kb, 128Mb + 256Kb[.
+ */
+ ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 128 * 1024);
+ if (ret) {
+ test_msg("Error adding free space: %d\n", ret);
+ return ret;
+ }
+ /* Confirm the region is marked as free. */
+ if (!test_check_exists(cache, 128 * 1024 * 1024, 128 * 1024)) {
+ test_msg("Extent region not marked as free\n");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that our extent entry didn't stole all free space from the
+ * bitmap, because of the small 8Kb free space region.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * So now we have the range [128Mb - 768Kb, 128Mb + 256Kb[ as free
+ * space. Without stealing bitmap free space into extent entry space,
+ * we would have all this free space represented by 2 entries in the
+ * cache:
+ *
+ * extent entry covering range: [128Mb, 128Mb + 256Kb[
+ * bitmap entry covering range: [128Mb - 768Kb, 128Mb[
+ *
+ * Attempting to allocate the whole free space (1Mb) would fail, because
+ * we can't allocate from multiple entries.
+ * With the bitmap free space stealing, we get a single extent entry
+ * that represents the 1Mb free space, and therefore we're able to
+ * allocate the whole free space at once.
+ */
+ if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
+ 1 * 1024 * 1024)) {
+ test_msg("Expected region not marked as free\n");
+ return -ENOENT;
+ }
+
+ if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 8192)) {
+ test_msg("Cache free space is not 1Mb + 8Kb\n");
+ return -EINVAL;
+ }
+
+ offset = btrfs_find_space_for_alloc(cache,
+ 0, 1 * 1024 * 1024, 0,
+ &max_extent_size);
+ if (offset != (128 * 1024 * 1024 - 768 * 1024)) {
+ test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
+ offset);
+ return -EINVAL;
+ }
+
+ /* All that remains is a 8Kb free space region in a bitmap. Confirm. */
+ ret = check_num_extents_and_bitmaps(cache, 1, 1);
+ if (ret)
+ return ret;
+
+ if (cache->free_space_ctl->free_space != 8192) {
+ test_msg("Cache free space is not 8Kb\n");
+ return -EINVAL;
+ }
+
+ offset = btrfs_find_space_for_alloc(cache,
+ 0, 8192, 0,
+ &max_extent_size);
+ if (offset != (32 * 1024 * 1024)) {
+ test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n",
+ offset);
+ return -EINVAL;
+ }
+
+ ret = check_cache_empty(cache);
+ if (ret)
+ return ret;
+
+ cache->free_space_ctl->op->use_bitmap = use_bitmap_op;
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+ return 0;
+}
+
int btrfs_test_free_space_cache(void)
{
struct btrfs_block_group_cache *cache;
@@ -386,6 +898,8 @@ int btrfs_test_free_space_cache(void)
ret = test_bitmaps_and_extents(cache);
if (ret)
goto out;
+
+ ret = test_steal_space_from_bitmap_to_extent(cache);
out:
__btrfs_remove_free_space_cache(cache->free_space_ctl);
kfree(cache->free_space_ctl);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5f379affdf23..dcaae3616728 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -218,7 +218,6 @@ loop:
spin_lock_init(&cur_trans->delayed_refs.lock);
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
- INIT_LIST_HEAD(&cur_trans->ordered_operations);
INIT_LIST_HEAD(&cur_trans->pending_chunks);
INIT_LIST_HEAD(&cur_trans->switch_commits);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
@@ -387,7 +386,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
int ret;
/* Send isn't supposed to start transactions. */
- ASSERT(current->journal_info != (void *)BTRFS_SEND_TRANS_STUB);
+ ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB);
if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
return ERR_PTR(-EROFS);
@@ -409,7 +408,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
if (num_items > 0 && root != root->fs_info->chunk_root) {
if (root->fs_info->quota_enabled &&
is_fstree(root->root_key.objectid)) {
- qgroup_reserved = num_items * root->leafsize;
+ qgroup_reserved = num_items * root->nodesize;
ret = btrfs_qgroup_reserve(root, qgroup_reserved);
if (ret)
return ERR_PTR(ret);
@@ -419,7 +418,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
/*
* Do the reservation for the relocation root creation
*/
- if (unlikely(need_reserve_reloc_root(root))) {
+ if (need_reserve_reloc_root(root)) {
num_bytes += root->nodesize;
reloc_reserved = true;
}
@@ -610,7 +609,6 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
if (transid <= root->fs_info->last_trans_committed)
goto out;
- ret = -EINVAL;
/* find specified transaction */
spin_lock(&root->fs_info->trans_lock);
list_for_each_entry(t, &root->fs_info->trans_list, list) {
@@ -626,9 +624,16 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
}
}
spin_unlock(&root->fs_info->trans_lock);
- /* The specified transaction doesn't exist */
- if (!cur_trans)
+
+ /*
+ * The specified transaction doesn't exist, or we
+ * raced with btrfs_commit_transaction
+ */
+ if (!cur_trans) {
+ if (transid > root->fs_info->last_trans_committed)
+ ret = -EINVAL;
goto out;
+ }
} else {
/* find newest transaction that is committing | committed */
spin_lock(&root->fs_info->trans_lock);
@@ -852,6 +857,8 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
struct extent_state *cached_state = NULL;
u64 start = 0;
u64 end;
+ struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode);
+ bool errors = false;
while (!find_first_extent_bit(dirty_pages, start, &start, &end,
EXTENT_NEED_WAIT, &cached_state)) {
@@ -865,6 +872,26 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
}
if (err)
werr = err;
+
+ if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ if ((mark & EXTENT_DIRTY) &&
+ test_and_clear_bit(BTRFS_INODE_BTREE_LOG1_ERR,
+ &btree_ino->runtime_flags))
+ errors = true;
+
+ if ((mark & EXTENT_NEW) &&
+ test_and_clear_bit(BTRFS_INODE_BTREE_LOG2_ERR,
+ &btree_ino->runtime_flags))
+ errors = true;
+ } else {
+ if (test_and_clear_bit(BTRFS_INODE_BTREE_ERR,
+ &btree_ino->runtime_flags))
+ errors = true;
+ }
+
+ if (errors && !werr)
+ werr = -EIO;
+
return werr;
}
@@ -1612,27 +1639,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_trans_handle_cachep, trans);
}
-static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
-{
- int ret;
-
- ret = btrfs_run_delayed_items(trans, root);
- if (ret)
- return ret;
-
- /*
- * rename don't use btrfs_join_transaction, so, once we
- * set the transaction to blocked above, we aren't going
- * to get any new ordered operations. We can safely run
- * it here and no for sure that nothing new will be added
- * to the list
- */
- ret = btrfs_run_ordered_operations(trans, root, 1);
-
- return ret;
-}
-
static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
{
if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
@@ -1651,15 +1657,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
{
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_transaction *prev_trans = NULL;
+ struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode);
int ret;
- ret = btrfs_run_ordered_operations(trans, root, 0);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- btrfs_end_transaction(trans, root);
- return ret;
- }
-
/* Stop the commit early if ->aborted is set */
if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
ret = cur_trans->aborted;
@@ -1740,7 +1740,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
if (ret)
goto cleanup_transaction;
- ret = btrfs_flush_all_pending_stuffs(trans, root);
+ ret = btrfs_run_delayed_items(trans, root);
if (ret)
goto cleanup_transaction;
@@ -1748,7 +1748,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
extwriter_counter_read(cur_trans) == 0);
/* some pending stuffs might be added after the previous flush. */
- ret = btrfs_flush_all_pending_stuffs(trans, root);
+ ret = btrfs_run_delayed_items(trans, root);
if (ret)
goto cleanup_transaction;
@@ -1897,6 +1897,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
sizeof(*root->fs_info->super_copy));
+ btrfs_update_commit_device_size(root->fs_info);
+ btrfs_update_commit_device_bytes_used(root, cur_trans);
+
+ clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
+ clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
+
spin_lock(&root->fs_info->trans_lock);
cur_trans->state = TRANS_STATE_UNBLOCKED;
root->fs_info->running_transaction = NULL;
@@ -2010,9 +2016,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
ret = btrfs_drop_snapshot(root, NULL, 0, 0);
else
ret = btrfs_drop_snapshot(root, NULL, 1, 0);
- /*
- * If we encounter a transaction abort during snapshot cleaning, we
- * don't want to crash here
- */
+
return (ret < 0) ? 0 : 1;
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 7dd558ed0716..d8f40e1a5d2d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -55,7 +55,6 @@ struct btrfs_transaction {
wait_queue_head_t writer_wait;
wait_queue_head_t commit_wait;
struct list_head pending_snapshots;
- struct list_head ordered_operations;
struct list_head pending_chunks;
struct list_head switch_commits;
struct btrfs_delayed_ref_root delayed_refs;
@@ -80,7 +79,7 @@ struct btrfs_transaction {
#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \
__TRANS_ATTACH)
-#define BTRFS_SEND_TRANS_STUB 1
+#define BTRFS_SEND_TRANS_STUB ((void *)1)
struct btrfs_trans_handle {
u64 transid;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 50af2b96df6c..286213cec861 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -94,8 +94,11 @@
#define LOG_WALK_REPLAY_ALL 3
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
- int inode_only);
+ struct btrfs_root *root, struct inode *inode,
+ int inode_only,
+ const loff_t start,
+ const loff_t end,
+ struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid);
@@ -669,7 +672,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
* is this extent already allocated in the extent
* allocation tree? If so, just add a reference
*/
- ret = btrfs_lookup_extent(root, ins.objectid,
+ ret = btrfs_lookup_data_extent(root, ins.objectid,
ins.offset);
if (ret == 0) {
ret = btrfs_inc_extent_ref(trans, root,
@@ -1496,7 +1499,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
return -EIO;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
- btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = objectid;
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
@@ -1635,6 +1638,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
found_key.type == log_key.type &&
found_key.offset == log_key.offset &&
btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
+ update_size = false;
goto out;
}
@@ -2155,7 +2159,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
- blocksize = btrfs_level_size(root, *level - 1);
+ blocksize = root->nodesize;
parent = path->nodes[*level];
root_owner = btrfs_header_owner(parent);
@@ -2981,8 +2985,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
min_key.type = key_type;
min_key.offset = min_offset;
- path->keep_locks = 1;
-
ret = btrfs_search_forward(root, &min_key, path, trans->transid);
/*
@@ -3298,7 +3300,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
struct list_head ordered_sums;
int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
bool has_extents = false;
- bool need_find_last_extent = (*last_extent == 0);
+ bool need_find_last_extent = true;
bool done = false;
INIT_LIST_HEAD(&ordered_sums);
@@ -3352,8 +3354,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
*/
if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
has_extents = true;
- if (need_find_last_extent &&
- first_key.objectid == (u64)-1)
+ if (first_key.objectid == (u64)-1)
first_key = ins_keys[i];
} else {
need_find_last_extent = false;
@@ -3363,7 +3364,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
* or deletes of this inode don't have to relog the inode
* again
*/
- if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY &&
+ if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
!skip_csum) {
int found_type;
extent = btrfs_item_ptr(src, start_slot + i,
@@ -3427,6 +3428,16 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
if (!has_extents)
return ret;
+ if (need_find_last_extent && *last_extent == first_key.offset) {
+ /*
+ * We don't have any leafs between our current one and the one
+ * we processed before that can have file extent items for our
+ * inode (and have a generation number smaller than our current
+ * transaction id).
+ */
+ need_find_last_extent = false;
+ }
+
/*
* Because we use btrfs_search_forward we could skip leaves that were
* not modified and then assume *last_extent is valid when it really
@@ -3537,7 +3548,7 @@ fill_holes:
0, 0);
if (ret)
break;
- *last_extent = offset + len;
+ *last_extent = extent_end;
}
/*
* Need to let the callers know we dropped the path so they should
@@ -3562,107 +3573,33 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
return 0;
}
-static int log_one_extent(struct btrfs_trans_handle *trans,
- struct inode *inode, struct btrfs_root *root,
- struct extent_map *em, struct btrfs_path *path,
- struct list_head *logged_list)
+static int wait_ordered_extents(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct btrfs_root *root,
+ const struct extent_map *em,
+ const struct list_head *logged_list,
+ bool *ordered_io_error)
{
- struct btrfs_root *log = root->log_root;
- struct btrfs_file_extent_item *fi;
- struct extent_buffer *leaf;
struct btrfs_ordered_extent *ordered;
- struct list_head ordered_sums;
- struct btrfs_map_token token;
- struct btrfs_key key;
+ struct btrfs_root *log = root->log_root;
u64 mod_start = em->mod_start;
u64 mod_len = em->mod_len;
+ const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
u64 csum_offset;
u64 csum_len;
- u64 extent_offset = em->start - em->orig_start;
- u64 block_len;
- int ret;
- bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
- int extent_inserted = 0;
-
- INIT_LIST_HEAD(&ordered_sums);
- btrfs_init_map_token(&token);
-
- ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
- em->start + em->len, NULL, 0, 1,
- sizeof(*fi), &extent_inserted);
- if (ret)
- return ret;
-
- if (!extent_inserted) {
- key.objectid = btrfs_ino(inode);
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = em->start;
-
- ret = btrfs_insert_empty_item(trans, log, path, &key,
- sizeof(*fi));
- if (ret)
- return ret;
- }
- leaf = path->nodes[0];
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
-
- btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
- &token);
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
- skip_csum = true;
- btrfs_set_token_file_extent_type(leaf, fi,
- BTRFS_FILE_EXTENT_PREALLOC,
- &token);
- } else {
- btrfs_set_token_file_extent_type(leaf, fi,
- BTRFS_FILE_EXTENT_REG,
- &token);
- if (em->block_start == EXTENT_MAP_HOLE)
- skip_csum = true;
- }
-
- block_len = max(em->block_len, em->orig_block_len);
- if (em->compress_type != BTRFS_COMPRESS_NONE) {
- btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
- em->block_start,
- &token);
- btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
- &token);
- } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
- em->block_start -
- extent_offset, &token);
- btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
- &token);
- } else {
- btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
- btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
- &token);
- }
+ LIST_HEAD(ordered_sums);
+ int ret = 0;
- btrfs_set_token_file_extent_offset(leaf, fi,
- em->start - em->orig_start,
- &token);
- btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
- btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
- btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
- &token);
- btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
- btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
- btrfs_mark_buffer_dirty(leaf);
+ *ordered_io_error = false;
- btrfs_release_path(path);
- if (ret) {
- return ret;
- }
-
- if (skip_csum)
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+ em->block_start == EXTENT_MAP_HOLE)
return 0;
/*
- * First check and see if our csums are on our outstanding ordered
- * extents.
+ * Wait far any ordered extent that covers our extent map. If it
+ * finishes without an error, first check and see if our csums are on
+ * our outstanding ordered extents.
*/
list_for_each_entry(ordered, logged_list, log_list) {
struct btrfs_ordered_sum *sum;
@@ -3674,6 +3611,24 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
mod_start + mod_len <= ordered->file_offset)
continue;
+ if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
+ !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
+ !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
+ const u64 start = ordered->file_offset;
+ const u64 end = ordered->file_offset + ordered->len - 1;
+
+ WARN_ON(ordered->inode != inode);
+ filemap_fdatawrite_range(inode->i_mapping, start, end);
+ }
+
+ wait_event(ordered->wait,
+ (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
+ test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
+
+ if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
+ *ordered_io_error = true;
+ break;
+ }
/*
* We are going to copy all the csums on this ordered extent, so
* go ahead and adjust mod_start and mod_len in case this
@@ -3705,6 +3660,9 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
}
}
+ if (skip_csum)
+ continue;
+
/*
* To keep us from looping for the above case of an ordered
* extent that falls inside of the logged extent.
@@ -3722,18 +3680,16 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
list_for_each_entry(sum, &ordered->list, list) {
ret = btrfs_csum_file_blocks(trans, log, sum);
if (ret)
- goto unlocked;
+ break;
}
-
}
-unlocked:
- if (!mod_len || ret)
+ if (*ordered_io_error || !mod_len || ret || skip_csum)
return ret;
if (em->compress_type) {
csum_offset = 0;
- csum_len = block_len;
+ csum_len = max(em->block_len, em->orig_block_len);
} else {
csum_offset = mod_start - em->start;
csum_len = mod_len;
@@ -3760,11 +3716,106 @@ unlocked:
return ret;
}
+static int log_one_extent(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct btrfs_root *root,
+ const struct extent_map *em,
+ struct btrfs_path *path,
+ const struct list_head *logged_list,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *log = root->log_root;
+ struct btrfs_file_extent_item *fi;
+ struct extent_buffer *leaf;
+ struct btrfs_map_token token;
+ struct btrfs_key key;
+ u64 extent_offset = em->start - em->orig_start;
+ u64 block_len;
+ int ret;
+ int extent_inserted = 0;
+ bool ordered_io_err = false;
+
+ ret = wait_ordered_extents(trans, inode, root, em, logged_list,
+ &ordered_io_err);
+ if (ret)
+ return ret;
+
+ if (ordered_io_err) {
+ ctx->io_err = -EIO;
+ return 0;
+ }
+
+ btrfs_init_map_token(&token);
+
+ ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
+ em->start + em->len, NULL, 0, 1,
+ sizeof(*fi), &extent_inserted);
+ if (ret)
+ return ret;
+
+ if (!extent_inserted) {
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = em->start;
+
+ ret = btrfs_insert_empty_item(trans, log, path, &key,
+ sizeof(*fi));
+ if (ret)
+ return ret;
+ }
+ leaf = path->nodes[0];
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
+ &token);
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ btrfs_set_token_file_extent_type(leaf, fi,
+ BTRFS_FILE_EXTENT_PREALLOC,
+ &token);
+ else
+ btrfs_set_token_file_extent_type(leaf, fi,
+ BTRFS_FILE_EXTENT_REG,
+ &token);
+
+ block_len = max(em->block_len, em->orig_block_len);
+ if (em->compress_type != BTRFS_COMPRESS_NONE) {
+ btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
+ em->block_start,
+ &token);
+ btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
+ &token);
+ } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
+ em->block_start -
+ extent_offset, &token);
+ btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
+ &token);
+ } else {
+ btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
+ btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
+ &token);
+ }
+
+ btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
+ btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
+ btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
+ btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
+ &token);
+ btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
+ btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
+ btrfs_mark_buffer_dirty(leaf);
+
+ btrfs_release_path(path);
+
+ return ret;
+}
+
static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode,
struct btrfs_path *path,
- struct list_head *logged_list)
+ struct list_head *logged_list,
+ struct btrfs_log_ctx *ctx)
{
struct extent_map *em, *n;
struct list_head extents;
@@ -3822,7 +3873,8 @@ process:
write_unlock(&tree->lock);
- ret = log_one_extent(trans, inode, root, em, path, logged_list);
+ ret = log_one_extent(trans, inode, root, em, path, logged_list,
+ ctx);
write_lock(&tree->lock);
clear_em_logging(tree, em);
free_extent_map(em);
@@ -3849,8 +3901,11 @@ process:
* This handles both files and directories.
*/
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
- int inode_only)
+ struct btrfs_root *root, struct inode *inode,
+ int inode_only,
+ const loff_t start,
+ const loff_t end,
+ struct btrfs_log_ctx *ctx)
{
struct btrfs_path *path;
struct btrfs_path *dst_path;
@@ -3867,6 +3922,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
int ins_nr;
bool fast_search = false;
u64 ino = btrfs_ino(inode);
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
path = btrfs_alloc_path();
if (!path)
@@ -3950,7 +4006,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
err = ret;
goto out_unlock;
}
- path->keep_locks = 1;
while (1) {
ins_nr = 0;
@@ -4035,19 +4090,41 @@ log_extents:
btrfs_release_path(dst_path);
if (fast_search) {
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
- &logged_list);
+ &logged_list, ctx);
if (ret) {
err = ret;
goto out_unlock;
}
} else if (inode_only == LOG_INODE_ALL) {
- struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em, *n;
- write_lock(&tree->lock);
- list_for_each_entry_safe(em, n, &tree->modified_extents, list)
- list_del_init(&em->list);
- write_unlock(&tree->lock);
+ write_lock(&em_tree->lock);
+ /*
+ * We can't just remove every em if we're called for a ranged
+ * fsync - that is, one that doesn't cover the whole possible
+ * file range (0 to LLONG_MAX). This is because we can have
+ * em's that fall outside the range we're logging and therefore
+ * their ordered operations haven't completed yet
+ * (btrfs_finish_ordered_io() not invoked yet). This means we
+ * didn't get their respective file extent item in the fs/subvol
+ * tree yet, and need to let the next fast fsync (one which
+ * consults the list of modified extent maps) find the em so
+ * that it logs a matching file extent item and waits for the
+ * respective ordered operation to complete (if it's still
+ * running).
+ *
+ * Removing every em outside the range we're logging would make
+ * the next fast fsync not log their matching file extent items,
+ * therefore making us lose data after a log replay.
+ */
+ list_for_each_entry_safe(em, n, &em_tree->modified_extents,
+ list) {
+ const u64 mod_end = em->mod_start + em->mod_len - 1;
+
+ if (em->mod_start >= start && mod_end <= end)
+ list_del_init(&em->list);
+ }
+ write_unlock(&em_tree->lock);
}
if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
@@ -4057,6 +4134,7 @@ log_extents:
goto out_unlock;
}
}
+
BTRFS_I(inode)->logged_trans = trans->transid;
BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
out_unlock:
@@ -4153,7 +4231,10 @@ out:
*/
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
- struct dentry *parent, int exists_only,
+ struct dentry *parent,
+ const loff_t start,
+ const loff_t end,
+ int exists_only,
struct btrfs_log_ctx *ctx)
{
int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
@@ -4199,7 +4280,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (ret)
goto end_no_trans;
- ret = btrfs_log_inode(trans, root, inode, inode_only);
+ ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
if (ret)
goto end_trans;
@@ -4227,7 +4308,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (BTRFS_I(inode)->generation >
root->fs_info->last_trans_committed) {
- ret = btrfs_log_inode(trans, root, inode, inode_only);
+ ret = btrfs_log_inode(trans, root, inode, inode_only,
+ 0, LLONG_MAX, ctx);
if (ret)
goto end_trans;
}
@@ -4261,13 +4343,15 @@ end_no_trans:
*/
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct dentry *dentry,
+ const loff_t start,
+ const loff_t end,
struct btrfs_log_ctx *ctx)
{
struct dentry *parent = dget_parent(dentry);
int ret;
ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
- 0, ctx);
+ start, end, 0, ctx);
dput(parent);
return ret;
@@ -4317,7 +4401,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
again:
key.objectid = BTRFS_TREE_LOG_OBJECTID;
key.offset = (u64)-1;
- btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ key.type = BTRFS_ROOT_ITEM_KEY;
while (1) {
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
@@ -4504,6 +4588,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
root->fs_info->last_trans_committed))
return 0;
- return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL);
+ return btrfs_log_inode_parent(trans, root, inode, parent, 0,
+ LLONG_MAX, 1, NULL);
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 7f5b41bd5373..154990c26dcb 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -28,6 +28,7 @@
struct btrfs_log_ctx {
int log_ret;
int log_transid;
+ int io_err;
struct list_head list;
};
@@ -35,6 +36,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
{
ctx->log_ret = 0;
ctx->log_transid = 0;
+ ctx->io_err = 0;
INIT_LIST_HEAD(&ctx->list);
}
@@ -59,6 +61,8 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct dentry *dentry,
+ const loff_t start,
+ const loff_t end,
struct btrfs_log_ctx *ctx);
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 7f78cbf5cf41..4c29db604bbe 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -57,6 +57,21 @@ void ulist_free(struct ulist *ulist);
int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
u64 *old_aux, gfp_t gfp_mask);
+
+/* just like ulist_add_merge() but take a pointer for the aux data */
+static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux,
+ void **old_aux, gfp_t gfp_mask)
+{
+#if BITS_PER_LONG == 32
+ u64 old64 = (uintptr_t)*old_aux;
+ int ret = ulist_add_merge(ulist, val, (uintptr_t)aux, &old64, gfp_mask);
+ *old_aux = (void *)((uintptr_t)old64);
+ return ret;
+#else
+ return ulist_add_merge(ulist, val, (u64)aux, (u64 *)old_aux, gfp_mask);
+#endif
+}
+
struct ulist_node *ulist_next(struct ulist *ulist,
struct ulist_iterator *uiter);
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index f6a4c03ee7d8..778282944530 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -279,7 +279,6 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
key.offset = 0;
again_search_slot:
- path->keep_locks = 1;
ret = btrfs_search_forward(root, &key, path, 0);
if (ret) {
if (ret > 0)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6cb82f62cb7c..d47289c715c8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -50,7 +50,7 @@ static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
-static DEFINE_MUTEX(uuid_mutex);
+DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
static void lock_chunks(struct btrfs_root *root)
@@ -74,6 +74,7 @@ static struct btrfs_fs_devices *__alloc_fs_devices(void)
mutex_init(&fs_devs->device_list_mutex);
INIT_LIST_HEAD(&fs_devs->devices);
+ INIT_LIST_HEAD(&fs_devs->resized_devices);
INIT_LIST_HEAD(&fs_devs->alloc_list);
INIT_LIST_HEAD(&fs_devs->list);
@@ -154,11 +155,13 @@ static struct btrfs_device *__alloc_device(void)
INIT_LIST_HEAD(&dev->dev_list);
INIT_LIST_HEAD(&dev->dev_alloc_list);
+ INIT_LIST_HEAD(&dev->resized_list);
spin_lock_init(&dev->io_lock);
spin_lock_init(&dev->reada_lock);
atomic_set(&dev->reada_in_flight, 0);
+ atomic_set(&dev->dev_stats_ccnt, 0);
INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
@@ -474,14 +477,13 @@ static noinline int device_list_add(const char *path,
return PTR_ERR(fs_devices);
list_add(&fs_devices->list, &fs_uuids);
- fs_devices->latest_devid = devid;
- fs_devices->latest_trans = found_transid;
device = NULL;
} else {
device = __find_device(&fs_devices->devices, devid,
disk_super->dev_item.uuid);
}
+
if (!device) {
if (fs_devices->opened)
return -EBUSY;
@@ -508,6 +510,43 @@ static noinline int device_list_add(const char *path,
ret = 1;
device->fs_devices = fs_devices;
} else if (!device->name || strcmp(device->name->str, path)) {
+ /*
+ * When FS is already mounted.
+ * 1. If you are here and if the device->name is NULL that
+ * means this device was missing at time of FS mount.
+ * 2. If you are here and if the device->name is different
+ * from 'path' that means either
+ * a. The same device disappeared and reappeared with
+ * different name. or
+ * b. The missing-disk-which-was-replaced, has
+ * reappeared now.
+ *
+ * We must allow 1 and 2a above. But 2b would be a spurious
+ * and unintentional.
+ *
+ * Further in case of 1 and 2a above, the disk at 'path'
+ * would have missed some transaction when it was away and
+ * in case of 2a the stale bdev has to be updated as well.
+ * 2b must not be allowed at all time.
+ */
+
+ /*
+ * For now, we do allow update to btrfs_fs_device through the
+ * btrfs dev scan cli after FS has been mounted. We're still
+ * tracking a problem where systems fail mount by subvolume id
+ * when we reject replacement on a mounted FS.
+ */
+ if (!fs_devices->opened && found_transid < device->generation) {
+ /*
+ * That is if the FS is _not_ mounted and if you
+ * are here, that means there is more than one
+ * disk with same uuid and devid.We keep the one
+ * with larger generation number or the last-in if
+ * generation are equal.
+ */
+ return -EEXIST;
+ }
+
name = rcu_string_strdup(path, GFP_NOFS);
if (!name)
return -ENOMEM;
@@ -519,10 +558,15 @@ static noinline int device_list_add(const char *path,
}
}
- if (found_transid > fs_devices->latest_trans) {
- fs_devices->latest_devid = devid;
- fs_devices->latest_trans = found_transid;
- }
+ /*
+ * Unmount does not free the btrfs_device struct but would zero
+ * generation along with most of the other members. So just update
+ * it back. We need it to pick the disk with largest generation
+ * (as above).
+ */
+ if (!fs_devices->opened)
+ device->generation = found_transid;
+
*fs_devices_ret = fs_devices;
return ret;
@@ -538,8 +582,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
if (IS_ERR(fs_devices))
return fs_devices;
- fs_devices->latest_devid = orig->latest_devid;
- fs_devices->latest_trans = orig->latest_trans;
+ mutex_lock(&orig->device_list_mutex);
fs_devices->total_devices = orig->total_devices;
/* We have held the volume lock, it is safe to get the devices. */
@@ -568,8 +611,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
device->fs_devices = fs_devices;
fs_devices->num_devices++;
}
+ mutex_unlock(&orig->device_list_mutex);
return fs_devices;
error:
+ mutex_unlock(&orig->device_list_mutex);
free_fs_devices(fs_devices);
return ERR_PTR(-ENOMEM);
}
@@ -578,10 +623,7 @@ void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices, int step)
{
struct btrfs_device *device, *next;
-
- struct block_device *latest_bdev = NULL;
- u64 latest_devid = 0;
- u64 latest_transid = 0;
+ struct btrfs_device *latest_dev = NULL;
mutex_lock(&uuid_mutex);
again:
@@ -589,11 +631,9 @@ again:
list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
if (device->in_fs_metadata) {
if (!device->is_tgtdev_for_dev_replace &&
- (!latest_transid ||
- device->generation > latest_transid)) {
- latest_devid = device->devid;
- latest_transid = device->generation;
- latest_bdev = device->bdev;
+ (!latest_dev ||
+ device->generation > latest_dev->generation)) {
+ latest_dev = device;
}
continue;
}
@@ -635,9 +675,7 @@ again:
goto again;
}
- fs_devices->latest_bdev = latest_bdev;
- fs_devices->latest_devid = latest_devid;
- fs_devices->latest_trans = latest_transid;
+ fs_devices->latest_bdev = latest_dev->bdev;
mutex_unlock(&uuid_mutex);
}
@@ -686,8 +724,6 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
fs_devices->rw_devices--;
}
- if (device->can_discard)
- fs_devices->num_can_discard--;
if (device->missing)
fs_devices->missing_devices--;
@@ -752,11 +788,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
struct block_device *bdev;
struct list_head *head = &fs_devices->devices;
struct btrfs_device *device;
- struct block_device *latest_bdev = NULL;
+ struct btrfs_device *latest_dev = NULL;
struct buffer_head *bh;
struct btrfs_super_block *disk_super;
- u64 latest_devid = 0;
- u64 latest_transid = 0;
u64 devid;
int seeding = 1;
int ret = 0;
@@ -784,11 +818,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
goto error_brelse;
device->generation = btrfs_super_generation(disk_super);
- if (!latest_transid || device->generation > latest_transid) {
- latest_devid = devid;
- latest_transid = device->generation;
- latest_bdev = bdev;
- }
+ if (!latest_dev ||
+ device->generation > latest_dev->generation)
+ latest_dev = device;
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
device->writeable = 0;
@@ -798,10 +830,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
}
q = bdev_get_queue(bdev);
- if (blk_queue_discard(q)) {
+ if (blk_queue_discard(q))
device->can_discard = 1;
- fs_devices->num_can_discard++;
- }
device->bdev = bdev;
device->in_fs_metadata = 0;
@@ -831,9 +861,7 @@ error_brelse:
}
fs_devices->seeding = seeding;
fs_devices->opened = 1;
- fs_devices->latest_bdev = latest_bdev;
- fs_devices->latest_devid = latest_devid;
- fs_devices->latest_trans = latest_transid;
+ fs_devices->latest_bdev = latest_dev->bdev;
fs_devices->total_rw_bytes = 0;
out:
return ret;
@@ -1007,7 +1035,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
if (key.objectid > device->devid)
break;
- if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+ if (key.type != BTRFS_DEV_EXTENT_KEY)
goto next;
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
@@ -1159,7 +1187,7 @@ again:
if (key.objectid > device->devid)
break;
- if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+ if (key.type != BTRFS_DEV_EXTENT_KEY)
goto next;
if (key.offset > search_start) {
@@ -1238,7 +1266,7 @@ out:
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device,
- u64 start)
+ u64 start, u64 *dev_extent_len)
{
int ret;
struct btrfs_path *path;
@@ -1280,13 +1308,8 @@ again:
goto out;
}
- if (device->bytes_used > 0) {
- u64 len = btrfs_dev_extent_length(leaf, extent);
- device->bytes_used -= len;
- spin_lock(&root->fs_info->free_chunk_lock);
- root->fs_info->free_chunk_space += len;
- spin_unlock(&root->fs_info->free_chunk_lock);
- }
+ *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
+
ret = btrfs_del_item(trans, root, path);
if (ret) {
btrfs_error(root->fs_info, ret,
@@ -1436,8 +1459,10 @@ static int btrfs_add_device(struct btrfs_trans_handle *trans,
btrfs_set_device_io_align(leaf, dev_item, device->io_align);
btrfs_set_device_io_width(leaf, dev_item, device->io_width);
btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
- btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
- btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
+ btrfs_set_device_total_bytes(leaf, dev_item,
+ btrfs_device_get_disk_total_bytes(device));
+ btrfs_set_device_bytes_used(leaf, dev_item,
+ btrfs_device_get_bytes_used(device));
btrfs_set_device_group(leaf, dev_item, 0);
btrfs_set_device_seek_speed(leaf, dev_item, 0);
btrfs_set_device_bandwidth(leaf, dev_item, 0);
@@ -1493,7 +1518,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
- lock_chunks(root);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
@@ -1509,7 +1533,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
goto out;
out:
btrfs_free_path(path);
- unlock_chunks(root);
btrfs_commit_transaction(trans, root);
return ret;
}
@@ -1625,8 +1648,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
if (device->writeable) {
lock_chunks(root);
list_del_init(&device->dev_alloc_list);
+ device->fs_devices->rw_devices--;
unlock_chunks(root);
- root->fs_info->fs_devices->rw_devices--;
clear_super = true;
}
@@ -1645,11 +1668,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
if (ret)
goto error_undo;
- spin_lock(&root->fs_info->free_chunk_lock);
- root->fs_info->free_chunk_space = device->total_bytes -
- device->bytes_used;
- spin_unlock(&root->fs_info->free_chunk_lock);
-
device->in_fs_metadata = 0;
btrfs_scrub_cancel_dev(root->fs_info, device);
@@ -1671,7 +1689,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
device->fs_devices->total_devices--;
if (device->missing)
- root->fs_info->fs_devices->missing_devices--;
+ device->fs_devices->missing_devices--;
next_device = list_entry(root->fs_info->fs_devices->devices.next,
struct btrfs_device, dev_list);
@@ -1703,9 +1721,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
fs_devices = fs_devices->seed;
}
cur_devices->seed = NULL;
- lock_chunks(root);
__btrfs_close_devices(cur_devices);
- unlock_chunks(root);
free_fs_devices(cur_devices);
}
@@ -1778,8 +1794,8 @@ error_undo:
lock_chunks(root);
list_add(&device->dev_alloc_list,
&root->fs_info->fs_devices->alloc_list);
+ device->fs_devices->rw_devices++;
unlock_chunks(root);
- root->fs_info->fs_devices->rw_devices++;
}
goto error_brelse;
}
@@ -1787,25 +1803,57 @@ error_undo:
void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *srcdev)
{
+ struct btrfs_fs_devices *fs_devices;
+
WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
+ /*
+ * in case of fs with no seed, srcdev->fs_devices will point
+ * to fs_devices of fs_info. However when the dev being replaced is
+ * a seed dev it will point to the seed's local fs_devices. In short
+ * srcdev will have its correct fs_devices in both the cases.
+ */
+ fs_devices = srcdev->fs_devices;
+
list_del_rcu(&srcdev->dev_list);
list_del_rcu(&srcdev->dev_alloc_list);
- fs_info->fs_devices->num_devices--;
- if (srcdev->missing) {
- fs_info->fs_devices->missing_devices--;
- fs_info->fs_devices->rw_devices++;
- }
- if (srcdev->can_discard)
- fs_info->fs_devices->num_can_discard--;
- if (srcdev->bdev) {
- fs_info->fs_devices->open_devices--;
+ fs_devices->num_devices--;
+ if (srcdev->missing)
+ fs_devices->missing_devices--;
- /* zero out the old super */
+ if (srcdev->writeable) {
+ fs_devices->rw_devices--;
+ /* zero out the old super if it is writable */
btrfs_scratch_superblock(srcdev);
}
+ if (srcdev->bdev)
+ fs_devices->open_devices--;
+
call_rcu(&srcdev->rcu, free_device);
+
+ /*
+ * unless fs_devices is seed fs, num_devices shouldn't go
+ * zero
+ */
+ BUG_ON(!fs_devices->num_devices && !fs_devices->seeding);
+
+ /* if this is no devs we rather delete the fs_devices */
+ if (!fs_devices->num_devices) {
+ struct btrfs_fs_devices *tmp_fs_devices;
+
+ tmp_fs_devices = fs_info->fs_devices;
+ while (tmp_fs_devices) {
+ if (tmp_fs_devices->seed == fs_devices) {
+ tmp_fs_devices->seed = fs_devices->seed;
+ break;
+ }
+ tmp_fs_devices = tmp_fs_devices->seed;
+ }
+ fs_devices->seed = NULL;
+ __btrfs_close_devices(fs_devices);
+ free_fs_devices(fs_devices);
+ }
}
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
@@ -1813,6 +1861,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
{
struct btrfs_device *next_device;
+ mutex_lock(&uuid_mutex);
WARN_ON(!tgtdev);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
if (tgtdev->bdev) {
@@ -1820,8 +1869,6 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
fs_info->fs_devices->open_devices--;
}
fs_info->fs_devices->num_devices--;
- if (tgtdev->can_discard)
- fs_info->fs_devices->num_can_discard++;
next_device = list_entry(fs_info->fs_devices->devices.next,
struct btrfs_device, dev_list);
@@ -1834,6 +1881,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
call_rcu(&tgtdev->rcu, free_device);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&uuid_mutex);
}
static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
@@ -1932,15 +1980,18 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
synchronize_rcu);
+ list_for_each_entry(device, &seed_devices->devices, dev_list)
+ device->fs_devices = seed_devices;
+ lock_chunks(root);
list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
- list_for_each_entry(device, &seed_devices->devices, dev_list) {
- device->fs_devices = seed_devices;
- }
+ unlock_chunks(root);
fs_devices->seeding = 0;
fs_devices->num_devices = 0;
fs_devices->open_devices = 0;
+ fs_devices->missing_devices = 0;
+ fs_devices->rotating = 0;
fs_devices->seed = seed_devices;
generate_random_uuid(fs_devices->fsid);
@@ -2039,7 +2090,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
struct list_head *devices;
struct super_block *sb = root->fs_info->sb;
struct rcu_string *name;
- u64 total_bytes;
+ u64 tmp;
int seeding_dev = 0;
int ret = 0;
@@ -2095,8 +2146,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
goto error;
}
- lock_chunks(root);
-
q = bdev_get_queue(bdev);
if (blk_queue_discard(q))
device->can_discard = 1;
@@ -2107,6 +2156,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
device->sector_size = root->sectorsize;
device->total_bytes = i_size_read(bdev->bd_inode);
device->disk_total_bytes = device->total_bytes;
+ device->commit_total_bytes = device->total_bytes;
device->dev_root = root->fs_info->dev_root;
device->bdev = bdev;
device->in_fs_metadata = 1;
@@ -2124,6 +2174,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
device->fs_devices = root->fs_info->fs_devices;
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ lock_chunks(root);
list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
list_add(&device->dev_alloc_list,
&root->fs_info->fs_devices->alloc_list);
@@ -2131,8 +2182,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
root->fs_info->fs_devices->open_devices++;
root->fs_info->fs_devices->rw_devices++;
root->fs_info->fs_devices->total_devices++;
- if (device->can_discard)
- root->fs_info->fs_devices->num_can_discard++;
root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
spin_lock(&root->fs_info->free_chunk_lock);
@@ -2142,26 +2191,45 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
if (!blk_queue_nonrot(bdev_get_queue(bdev)))
root->fs_info->fs_devices->rotating = 1;
- total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
+ tmp = btrfs_super_total_bytes(root->fs_info->super_copy);
btrfs_set_super_total_bytes(root->fs_info->super_copy,
- total_bytes + device->total_bytes);
+ tmp + device->total_bytes);
- total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
+ tmp = btrfs_super_num_devices(root->fs_info->super_copy);
btrfs_set_super_num_devices(root->fs_info->super_copy,
- total_bytes + 1);
+ tmp + 1);
/* add sysfs device entry */
btrfs_kobj_add_device(root->fs_info, device);
+ /*
+ * we've got more storage, clear any full flags on the space
+ * infos
+ */
+ btrfs_clear_space_info_full(root->fs_info);
+
+ unlock_chunks(root);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
if (seeding_dev) {
- char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
+ lock_chunks(root);
ret = init_first_rw_device(trans, root, device);
+ unlock_chunks(root);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto error_trans;
}
+ }
+
+ ret = btrfs_add_device(trans, root, device);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto error_trans;
+ }
+
+ if (seeding_dev) {
+ char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
+
ret = btrfs_finish_sprout(trans, root);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
@@ -2175,21 +2243,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
root->fs_info->fsid);
if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
goto error_trans;
- } else {
- ret = btrfs_add_device(trans, root, device);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- goto error_trans;
- }
}
- /*
- * we've got more storage, clear any full flags on the space
- * infos
- */
- btrfs_clear_space_info_full(root->fs_info);
-
- unlock_chunks(root);
root->fs_info->num_tolerated_disk_barrier_failures =
btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
ret = btrfs_commit_transaction(trans, root);
@@ -2221,7 +2276,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
return ret;
error_trans:
- unlock_chunks(root);
btrfs_end_transaction(trans, root);
rcu_string_free(device->name);
btrfs_kobj_rm_device(root->fs_info, device);
@@ -2236,6 +2290,7 @@ error:
}
int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
+ struct btrfs_device *srcdev,
struct btrfs_device **device_out)
{
struct request_queue *q;
@@ -2248,24 +2303,38 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
int ret = 0;
*device_out = NULL;
- if (fs_info->fs_devices->seeding)
+ if (fs_info->fs_devices->seeding) {
+ btrfs_err(fs_info, "the filesystem is a seed filesystem!");
return -EINVAL;
+ }
bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
fs_info->bdev_holder);
- if (IS_ERR(bdev))
+ if (IS_ERR(bdev)) {
+ btrfs_err(fs_info, "target device %s is invalid!", device_path);
return PTR_ERR(bdev);
+ }
filemap_write_and_wait(bdev->bd_inode->i_mapping);
devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
if (device->bdev == bdev) {
+ btrfs_err(fs_info, "target device is in the filesystem!");
ret = -EEXIST;
goto error;
}
}
+
+ if (i_size_read(bdev->bd_inode) <
+ btrfs_device_get_total_bytes(srcdev)) {
+ btrfs_err(fs_info, "target device is smaller than source device!");
+ ret = -EINVAL;
+ goto error;
+ }
+
+
device = btrfs_alloc_device(NULL, &devid, NULL);
if (IS_ERR(device)) {
ret = PTR_ERR(device);
@@ -2289,8 +2358,12 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
device->io_width = root->sectorsize;
device->io_align = root->sectorsize;
device->sector_size = root->sectorsize;
- device->total_bytes = i_size_read(bdev->bd_inode);
- device->disk_total_bytes = device->total_bytes;
+ device->total_bytes = btrfs_device_get_total_bytes(srcdev);
+ device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
+ device->bytes_used = btrfs_device_get_bytes_used(srcdev);
+ ASSERT(list_empty(&srcdev->resized_list));
+ device->commit_total_bytes = srcdev->commit_total_bytes;
+ device->commit_bytes_used = device->bytes_used;
device->dev_root = fs_info->dev_root;
device->bdev = bdev;
device->in_fs_metadata = 1;
@@ -2302,8 +2375,6 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
list_add(&device->dev_list, &fs_info->fs_devices->devices);
fs_info->fs_devices->num_devices++;
fs_info->fs_devices->open_devices++;
- if (device->can_discard)
- fs_info->fs_devices->num_can_discard++;
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
*device_out = device;
@@ -2362,8 +2433,10 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
btrfs_set_device_io_align(leaf, dev_item, device->io_align);
btrfs_set_device_io_width(leaf, dev_item, device->io_width);
btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
- btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
- btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
+ btrfs_set_device_total_bytes(leaf, dev_item,
+ btrfs_device_get_disk_total_bytes(device));
+ btrfs_set_device_bytes_used(leaf, dev_item,
+ btrfs_device_get_bytes_used(device));
btrfs_mark_buffer_dirty(leaf);
out:
@@ -2371,40 +2444,44 @@ out:
return ret;
}
-static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size)
{
struct btrfs_super_block *super_copy =
device->dev_root->fs_info->super_copy;
- u64 old_total = btrfs_super_total_bytes(super_copy);
- u64 diff = new_size - device->total_bytes;
+ struct btrfs_fs_devices *fs_devices;
+ u64 old_total;
+ u64 diff;
if (!device->writeable)
return -EACCES;
+
+ lock_chunks(device->dev_root);
+ old_total = btrfs_super_total_bytes(super_copy);
+ diff = new_size - device->total_bytes;
+
if (new_size <= device->total_bytes ||
- device->is_tgtdev_for_dev_replace)
+ device->is_tgtdev_for_dev_replace) {
+ unlock_chunks(device->dev_root);
return -EINVAL;
+ }
+
+ fs_devices = device->dev_root->fs_info->fs_devices;
btrfs_set_super_total_bytes(super_copy, old_total + diff);
device->fs_devices->total_rw_bytes += diff;
- device->total_bytes = new_size;
- device->disk_total_bytes = new_size;
+ btrfs_device_set_total_bytes(device, new_size);
+ btrfs_device_set_disk_total_bytes(device, new_size);
btrfs_clear_space_info_full(device->dev_root->fs_info);
+ if (list_empty(&device->resized_list))
+ list_add_tail(&device->resized_list,
+ &fs_devices->resized_devices);
+ unlock_chunks(device->dev_root);
return btrfs_update_device(trans, device);
}
-int btrfs_grow_device(struct btrfs_trans_handle *trans,
- struct btrfs_device *device, u64 new_size)
-{
- int ret;
- lock_chunks(device->dev_root);
- ret = __btrfs_grow_device(trans, device, new_size);
- unlock_chunks(device->dev_root);
- return ret;
-}
-
static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 chunk_tree, u64 chunk_objectid,
@@ -2456,6 +2533,7 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
u32 cur;
struct btrfs_key key;
+ lock_chunks(root);
array_size = btrfs_super_sys_array_size(super_copy);
ptr = super_copy->sys_chunk_array;
@@ -2485,79 +2563,95 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
cur += len;
}
}
+ unlock_chunks(root);
return ret;
}
-static int btrfs_relocate_chunk(struct btrfs_root *root,
- u64 chunk_tree, u64 chunk_objectid,
- u64 chunk_offset)
+int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 chunk_offset)
{
struct extent_map_tree *em_tree;
- struct btrfs_root *extent_root;
- struct btrfs_trans_handle *trans;
struct extent_map *em;
+ struct btrfs_root *extent_root = root->fs_info->extent_root;
struct map_lookup *map;
- int ret;
- int i;
+ u64 dev_extent_len = 0;
+ u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ u64 chunk_tree = root->fs_info->chunk_root->objectid;
+ int i, ret = 0;
+ /* Just in case */
root = root->fs_info->chunk_root;
- extent_root = root->fs_info->extent_root;
em_tree = &root->fs_info->mapping_tree.map_tree;
- ret = btrfs_can_relocate(extent_root, chunk_offset);
- if (ret)
- return -ENOSPC;
-
- /* step one, relocate all the extents inside this chunk */
- ret = btrfs_relocate_block_group(extent_root, chunk_offset);
- if (ret)
- return ret;
-
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- btrfs_std_error(root->fs_info, ret);
- return ret;
- }
-
- lock_chunks(root);
-
- /*
- * step two, delete the device extents and the
- * chunk tree entries
- */
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, chunk_offset, 1);
read_unlock(&em_tree->lock);
- BUG_ON(!em || em->start > chunk_offset ||
- em->start + em->len < chunk_offset);
+ if (!em || em->start > chunk_offset ||
+ em->start + em->len < chunk_offset) {
+ /*
+ * This is a logic error, but we don't want to just rely on the
+ * user having built with ASSERT enabled, so if ASSERT doens't
+ * do anything we still error out.
+ */
+ ASSERT(0);
+ if (em)
+ free_extent_map(em);
+ return -EINVAL;
+ }
map = (struct map_lookup *)em->bdev;
for (i = 0; i < map->num_stripes; i++) {
- ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
- map->stripes[i].physical);
- BUG_ON(ret);
+ struct btrfs_device *device = map->stripes[i].dev;
+ ret = btrfs_free_dev_extent(trans, device,
+ map->stripes[i].physical,
+ &dev_extent_len);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+
+ if (device->bytes_used > 0) {
+ lock_chunks(root);
+ btrfs_device_set_bytes_used(device,
+ device->bytes_used - dev_extent_len);
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += dev_extent_len;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+ btrfs_clear_space_info_full(root->fs_info);
+ unlock_chunks(root);
+ }
if (map->stripes[i].dev) {
ret = btrfs_update_device(trans, map->stripes[i].dev);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
}
}
ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
chunk_offset);
-
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
}
ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
write_lock(&em_tree->lock);
remove_extent_mapping(em_tree, em);
@@ -2565,12 +2659,46 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
/* once for the tree */
free_extent_map(em);
+out:
/* once for us */
free_extent_map(em);
+ return ret;
+}
- unlock_chunks(root);
+static int btrfs_relocate_chunk(struct btrfs_root *root,
+ u64 chunk_tree, u64 chunk_objectid,
+ u64 chunk_offset)
+{
+ struct btrfs_root *extent_root;
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ root = root->fs_info->chunk_root;
+ extent_root = root->fs_info->extent_root;
+
+ ret = btrfs_can_relocate(extent_root, chunk_offset);
+ if (ret)
+ return -ENOSPC;
+
+ /* step one, relocate all the extents inside this chunk */
+ ret = btrfs_relocate_block_group(extent_root, chunk_offset);
+ if (ret)
+ return ret;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_std_error(root->fs_info, ret);
+ return ret;
+ }
+
+ /*
+ * step two, delete the device extents and the
+ * chunk tree entries
+ */
+ ret = btrfs_remove_chunk(trans, root, chunk_offset);
btrfs_end_transaction(trans, root);
- return 0;
+ return ret;
}
static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
@@ -2623,8 +2751,8 @@ again:
found_key.offset);
if (ret == -ENOSPC)
failed++;
- else if (ret)
- BUG();
+ else
+ BUG_ON(ret);
}
if (found_key.offset == 0)
@@ -3031,11 +3159,12 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
/* step one make some room on all the devices */
devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
- old_size = device->total_bytes;
+ old_size = btrfs_device_get_total_bytes(device);
size_to_free = div_factor(old_size, 1);
size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
if (!device->writeable ||
- device->total_bytes - device->bytes_used > size_to_free ||
+ btrfs_device_get_total_bytes(device) -
+ btrfs_device_get_bytes_used(device) > size_to_free ||
device->is_tgtdev_for_dev_replace)
continue;
@@ -3590,8 +3719,6 @@ static int btrfs_uuid_scan_kthread(void *data)
max_key.type = BTRFS_ROOT_ITEM_KEY;
max_key.offset = (u64)-1;
- path->keep_locks = 1;
-
while (1) {
ret = btrfs_search_forward(root, &key, path, 0);
if (ret) {
@@ -3843,8 +3970,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
struct btrfs_key key;
struct btrfs_super_block *super_copy = root->fs_info->super_copy;
u64 old_total = btrfs_super_total_bytes(super_copy);
- u64 old_size = device->total_bytes;
- u64 diff = device->total_bytes - new_size;
+ u64 old_size = btrfs_device_get_total_bytes(device);
+ u64 diff = old_size - new_size;
if (device->is_tgtdev_for_dev_replace)
return -EINVAL;
@@ -3857,7 +3984,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
lock_chunks(root);
- device->total_bytes = new_size;
+ btrfs_device_set_total_bytes(device, new_size);
if (device->writeable) {
device->fs_devices->total_rw_bytes -= diff;
spin_lock(&root->fs_info->free_chunk_lock);
@@ -3923,7 +4050,7 @@ again:
ret = -ENOSPC;
lock_chunks(root);
- device->total_bytes = old_size;
+ btrfs_device_set_total_bytes(device, old_size);
if (device->writeable)
device->fs_devices->total_rw_bytes += diff;
spin_lock(&root->fs_info->free_chunk_lock);
@@ -3941,18 +4068,17 @@ again:
}
lock_chunks(root);
+ btrfs_device_set_disk_total_bytes(device, new_size);
+ if (list_empty(&device->resized_list))
+ list_add_tail(&device->resized_list,
+ &root->fs_info->fs_devices->resized_devices);
- device->disk_total_bytes = new_size;
- /* Now btrfs_update_device() will change the on-disk size. */
- ret = btrfs_update_device(trans, device);
- if (ret) {
- unlock_chunks(root);
- btrfs_end_transaction(trans, root);
- goto done;
- }
WARN_ON(diff > old_total);
btrfs_set_super_total_bytes(super_copy, old_total - diff);
unlock_chunks(root);
+
+ /* Now btrfs_update_device() will change the on-disk size. */
+ ret = btrfs_update_device(trans, device);
btrfs_end_transaction(trans, root);
done:
btrfs_free_path(path);
@@ -3968,10 +4094,13 @@ static int btrfs_add_system_chunk(struct btrfs_root *root,
u32 array_size;
u8 *ptr;
+ lock_chunks(root);
array_size = btrfs_super_sys_array_size(super_copy);
if (array_size + item_size + sizeof(disk_key)
- > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
+ > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
+ unlock_chunks(root);
return -EFBIG;
+ }
ptr = super_copy->sys_chunk_array + array_size;
btrfs_cpu_key_to_disk(&disk_key, key);
@@ -3980,6 +4109,8 @@ static int btrfs_add_system_chunk(struct btrfs_root *root,
memcpy(ptr, chunk, item_size);
item_size += sizeof(disk_key);
btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
+ unlock_chunks(root);
+
return 0;
}
@@ -4349,6 +4480,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (ret)
goto error_del_extent;
+ for (i = 0; i < map->num_stripes; i++) {
+ num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
+ btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
+ }
+
+ spin_lock(&extent_root->fs_info->free_chunk_lock);
+ extent_root->fs_info->free_chunk_space -= (stripe_size *
+ map->num_stripes);
+ spin_unlock(&extent_root->fs_info->free_chunk_lock);
+
free_extent_map(em);
check_raid56_incompat_flag(extent_root->fs_info, type);
@@ -4420,7 +4561,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
device = map->stripes[i].dev;
dev_offset = map->stripes[i].physical;
- device->bytes_used += stripe_size;
ret = btrfs_update_device(trans, device);
if (ret)
goto out;
@@ -4433,11 +4573,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
goto out;
}
- spin_lock(&extent_root->fs_info->free_chunk_lock);
- extent_root->fs_info->free_chunk_space -= (stripe_size *
- map->num_stripes);
- spin_unlock(&extent_root->fs_info->free_chunk_lock);
-
stripe = &chunk->stripe;
for (i = 0; i < map->num_stripes; i++) {
device = map->stripes[i].dev;
@@ -4517,16 +4652,25 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
alloc_profile);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- goto out;
+ return ret;
+}
+
+static inline int btrfs_chunk_max_errors(struct map_lookup *map)
+{
+ int max_errors;
+
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_DUP)) {
+ max_errors = 1;
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
+ max_errors = 2;
+ } else {
+ max_errors = 0;
}
- ret = btrfs_add_device(trans, fs_info->chunk_root, device);
- if (ret)
- btrfs_abort_transaction(trans, root, ret);
-out:
- return ret;
+ return max_errors;
}
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
@@ -4535,6 +4679,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
struct map_lookup *map;
struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
int readonly = 0;
+ int miss_ndevs = 0;
int i;
read_lock(&map_tree->map_tree.lock);
@@ -4543,18 +4688,27 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
if (!em)
return 1;
- if (btrfs_test_opt(root, DEGRADED)) {
- free_extent_map(em);
- return 0;
- }
-
map = (struct map_lookup *)em->bdev;
for (i = 0; i < map->num_stripes; i++) {
+ if (map->stripes[i].dev->missing) {
+ miss_ndevs++;
+ continue;
+ }
+
if (!map->stripes[i].dev->writeable) {
readonly = 1;
- break;
+ goto end;
}
}
+
+ /*
+ * If the number of missing devices is larger than max errors,
+ * we can not write the data into that chunk successfully, so
+ * set it readonly.
+ */
+ if (miss_ndevs > btrfs_chunk_max_errors(map))
+ readonly = 1;
+end:
free_extent_map(em);
return readonly;
}
@@ -4955,6 +5109,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
num_stripes = min_t(u64, map->num_stripes,
stripe_nr_end - stripe_nr_orig);
stripe_index = do_div(stripe_nr, map->num_stripes);
+ if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)))
+ mirror_num = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
num_stripes = map->num_stripes;
@@ -5058,6 +5214,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
/* We distribute the parity blocks across stripes */
tmp = stripe_nr + stripe_index;
stripe_index = do_div(tmp, map->num_stripes);
+ if (!(rw & (REQ_WRITE | REQ_DISCARD |
+ REQ_GET_READ_MIRRORS)) && mirror_num <= 1)
+ mirror_num = 1;
}
} else {
/*
@@ -5165,16 +5324,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
}
}
- if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
- if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10 |
- BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_DUP)) {
- max_errors = 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
- max_errors = 2;
- }
- }
+ if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
+ max_errors = btrfs_chunk_max_errors(map);
if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
dev_replace->tgtdev != NULL) {
@@ -5557,8 +5708,8 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
name = rcu_dereference(dev->name);
pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
"(%s id %llu), size=%u\n", rw,
- (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
- name->str, dev->devid, bio->bi_size);
+ (u64)bio->bi_iter.bi_sector, (u_long)dev->bdev->bd_dev,
+ name->str, dev->devid, bio->bi_iter.bi_size);
rcu_read_unlock();
}
#endif
@@ -5736,10 +5887,10 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
}
static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
+ struct btrfs_fs_devices *fs_devices,
u64 devid, u8 *dev_uuid)
{
struct btrfs_device *device;
- struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
device = btrfs_alloc_device(NULL, &devid, dev_uuid);
if (IS_ERR(device))
@@ -5800,7 +5951,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
else
generate_random_uuid(dev->uuid);
- btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL);
+ btrfs_init_work(&dev->work, btrfs_submit_helper,
+ pending_bios_fn, NULL, NULL);
return dev;
}
@@ -5875,7 +6027,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
}
if (!map->stripes[i].dev) {
map->stripes[i].dev =
- add_missing_dev(root, devid, uuid);
+ add_missing_dev(root, root->fs_info->fs_devices,
+ devid, uuid);
if (!map->stripes[i].dev) {
free_extent_map(em);
return -EIO;
@@ -5902,7 +6055,9 @@ static void fill_device_from_item(struct extent_buffer *leaf,
device->devid = btrfs_device_id(leaf, dev_item);
device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
device->total_bytes = device->disk_total_bytes;
+ device->commit_total_bytes = device->disk_total_bytes;
device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
+ device->commit_bytes_used = device->bytes_used;
device->type = btrfs_device_type(leaf, dev_item);
device->io_align = btrfs_device_io_align(leaf, dev_item);
device->io_width = btrfs_device_io_width(leaf, dev_item);
@@ -5914,7 +6069,8 @@ static void fill_device_from_item(struct extent_buffer *leaf,
read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
}
-static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
+static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
+ u8 *fsid)
{
struct btrfs_fs_devices *fs_devices;
int ret;
@@ -5923,49 +6079,56 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
fs_devices = root->fs_info->fs_devices->seed;
while (fs_devices) {
- if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
- ret = 0;
- goto out;
- }
+ if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE))
+ return fs_devices;
+
fs_devices = fs_devices->seed;
}
fs_devices = find_fsid(fsid);
if (!fs_devices) {
- ret = -ENOENT;
- goto out;
+ if (!btrfs_test_opt(root, DEGRADED))
+ return ERR_PTR(-ENOENT);
+
+ fs_devices = alloc_fs_devices(fsid);
+ if (IS_ERR(fs_devices))
+ return fs_devices;
+
+ fs_devices->seeding = 1;
+ fs_devices->opened = 1;
+ return fs_devices;
}
fs_devices = clone_fs_devices(fs_devices);
- if (IS_ERR(fs_devices)) {
- ret = PTR_ERR(fs_devices);
- goto out;
- }
+ if (IS_ERR(fs_devices))
+ return fs_devices;
ret = __btrfs_open_devices(fs_devices, FMODE_READ,
root->fs_info->bdev_holder);
if (ret) {
free_fs_devices(fs_devices);
+ fs_devices = ERR_PTR(ret);
goto out;
}
if (!fs_devices->seeding) {
__btrfs_close_devices(fs_devices);
free_fs_devices(fs_devices);
- ret = -EINVAL;
+ fs_devices = ERR_PTR(-EINVAL);
goto out;
}
fs_devices->seed = root->fs_info->fs_devices->seed;
root->fs_info->fs_devices->seed = fs_devices;
out:
- return ret;
+ return fs_devices;
}
static int read_one_dev(struct btrfs_root *root,
struct extent_buffer *leaf,
struct btrfs_dev_item *dev_item)
{
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
struct btrfs_device *device;
u64 devid;
int ret;
@@ -5979,31 +6142,48 @@ static int read_one_dev(struct btrfs_root *root,
BTRFS_UUID_SIZE);
if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
- ret = open_seed_devices(root, fs_uuid);
- if (ret && !btrfs_test_opt(root, DEGRADED))
- return ret;
+ fs_devices = open_seed_devices(root, fs_uuid);
+ if (IS_ERR(fs_devices))
+ return PTR_ERR(fs_devices);
}
device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
- if (!device || !device->bdev) {
+ if (!device) {
if (!btrfs_test_opt(root, DEGRADED))
return -EIO;
- if (!device) {
- btrfs_warn(root->fs_info, "devid %llu missing", devid);
- device = add_missing_dev(root, devid, dev_uuid);
- if (!device)
- return -ENOMEM;
- } else if (!device->missing) {
+ btrfs_warn(root->fs_info, "devid %llu missing", devid);
+ device = add_missing_dev(root, fs_devices, devid, dev_uuid);
+ if (!device)
+ return -ENOMEM;
+ } else {
+ if (!device->bdev && !btrfs_test_opt(root, DEGRADED))
+ return -EIO;
+
+ if(!device->bdev && !device->missing) {
/*
* this happens when a device that was properly setup
* in the device info lists suddenly goes bad.
* device->bdev is NULL, and so we have to set
* device->missing to one here
*/
- root->fs_info->fs_devices->missing_devices++;
+ device->fs_devices->missing_devices++;
device->missing = 1;
}
+
+ /* Move the device to its own fs_devices */
+ if (device->fs_devices != fs_devices) {
+ ASSERT(device->missing);
+
+ list_move(&device->dev_list, &fs_devices->devices);
+ device->fs_devices->num_devices--;
+ fs_devices->num_devices++;
+
+ device->fs_devices->missing_devices--;
+ fs_devices->missing_devices++;
+
+ device->fs_devices = fs_devices;
+ }
}
if (device->fs_devices != root->fs_info->fs_devices) {
@@ -6319,16 +6499,18 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
struct btrfs_root *dev_root = fs_info->dev_root;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
+ int stats_cnt;
int ret = 0;
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
- if (!device->dev_stats_valid || !device->dev_stats_dirty)
+ if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device))
continue;
+ stats_cnt = atomic_read(&device->dev_stats_ccnt);
ret = update_dev_stat_item(trans, dev_root, device);
if (!ret)
- device->dev_stats_dirty = 0;
+ atomic_sub(stats_cnt, &device->dev_stats_ccnt);
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -6427,3 +6609,51 @@ int btrfs_scratch_superblock(struct btrfs_device *device)
return 0;
}
+
+/*
+ * Update the size of all devices, which is used for writing out the
+ * super blocks.
+ */
+void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *curr, *next;
+
+ if (list_empty(&fs_devices->resized_devices))
+ return;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ lock_chunks(fs_info->dev_root);
+ list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
+ resized_list) {
+ list_del_init(&curr->resized_list);
+ curr->commit_total_bytes = curr->disk_total_bytes;
+ }
+ unlock_chunks(fs_info->dev_root);
+ mutex_unlock(&fs_devices->device_list_mutex);
+}
+
+/* Must be invoked during the transaction commit */
+void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
+ struct btrfs_transaction *transaction)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct btrfs_device *dev;
+ int i;
+
+ if (list_empty(&transaction->pending_chunks))
+ return;
+
+ /* In order to kick the device replace finish process */
+ lock_chunks(root);
+ list_for_each_entry(em, &transaction->pending_chunks, list) {
+ map = (struct map_lookup *)em->bdev;
+
+ for (i = 0; i < map->num_stripes; i++) {
+ dev = map->stripes[i].dev;
+ dev->commit_bytes_used = dev->bytes_used;
+ }
+ }
+ unlock_chunks(root);
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2aaa00c47816..08980fa23039 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -24,6 +24,8 @@
#include <linux/btrfs.h>
#include "async-thread.h"
+extern struct mutex uuid_mutex;
+
#define BTRFS_STRIPE_LEN (64 * 1024)
struct buffer_head;
@@ -32,41 +34,59 @@ struct btrfs_pending_bios {
struct bio *tail;
};
+/*
+ * Use sequence counter to get consistent device stat data on
+ * 32-bit processors.
+ */
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#include <linux/seqlock.h>
+#define __BTRFS_NEED_DEVICE_DATA_ORDERED
+#define btrfs_device_data_ordered_init(device) \
+ seqcount_init(&device->data_seqcount)
+#else
+#define btrfs_device_data_ordered_init(device) do { } while (0)
+#endif
+
struct btrfs_device {
struct list_head dev_list;
struct list_head dev_alloc_list;
struct btrfs_fs_devices *fs_devices;
+
struct btrfs_root *dev_root;
+ struct rcu_string *name;
+
+ u64 generation;
+
+ spinlock_t io_lock ____cacheline_aligned;
+ int running_pending;
/* regular prio bios */
struct btrfs_pending_bios pending_bios;
/* WRITE_SYNC bios */
struct btrfs_pending_bios pending_sync_bios;
- u64 generation;
- int running_pending;
+ struct block_device *bdev;
+
+ /* the mode sent to blkdev_get */
+ fmode_t mode;
+
int writeable;
int in_fs_metadata;
int missing;
int can_discard;
int is_tgtdev_for_dev_replace;
- spinlock_t io_lock;
- /* the mode sent to blkdev_get */
- fmode_t mode;
-
- struct block_device *bdev;
-
-
- struct rcu_string *name;
+#ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED
+ seqcount_t data_seqcount;
+#endif
/* the internal btrfs device id */
u64 devid;
- /* size of the device */
+ /* size of the device in memory */
u64 total_bytes;
- /* size of the disk */
+ /* size of the device on disk */
u64 disk_total_bytes;
/* bytes used */
@@ -83,10 +103,26 @@ struct btrfs_device {
/* minimal io size for this device */
u32 sector_size;
-
/* physical drive uuid (or lvm uuid) */
u8 uuid[BTRFS_UUID_SIZE];
+ /*
+ * size of the device on the current transaction
+ *
+ * This variant is update when committing the transaction,
+ * and protected by device_list_mutex
+ */
+ u64 commit_total_bytes;
+
+ /* bytes used on the current transaction */
+ u64 commit_bytes_used;
+ /*
+ * used to manage the device which is resized
+ *
+ * It is protected by chunk_lock.
+ */
+ struct list_head resized_list;
+
/* for sending down flush barriers */
int nobarriers;
struct bio *flush_bio;
@@ -107,26 +143,90 @@ struct btrfs_device {
struct radix_tree_root reada_zones;
struct radix_tree_root reada_extents;
-
/* disk I/O failure stats. For detailed description refer to
* enum btrfs_dev_stat_values in ioctl.h */
int dev_stats_valid;
- int dev_stats_dirty; /* counters need to be written to disk */
+
+ /* Counter to record the change of device stats */
+ atomic_t dev_stats_ccnt;
atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
};
+/*
+ * If we read those variants at the context of their own lock, we needn't
+ * use the following helpers, reading them directly is safe.
+ */
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#define BTRFS_DEVICE_GETSET_FUNCS(name) \
+static inline u64 \
+btrfs_device_get_##name(const struct btrfs_device *dev) \
+{ \
+ u64 size; \
+ unsigned int seq; \
+ \
+ do { \
+ seq = read_seqcount_begin(&dev->data_seqcount); \
+ size = dev->name; \
+ } while (read_seqcount_retry(&dev->data_seqcount, seq)); \
+ return size; \
+} \
+ \
+static inline void \
+btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
+{ \
+ preempt_disable(); \
+ write_seqcount_begin(&dev->data_seqcount); \
+ dev->name = size; \
+ write_seqcount_end(&dev->data_seqcount); \
+ preempt_enable(); \
+}
+#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT)
+#define BTRFS_DEVICE_GETSET_FUNCS(name) \
+static inline u64 \
+btrfs_device_get_##name(const struct btrfs_device *dev) \
+{ \
+ u64 size; \
+ \
+ preempt_disable(); \
+ size = dev->name; \
+ preempt_enable(); \
+ return size; \
+} \
+ \
+static inline void \
+btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
+{ \
+ preempt_disable(); \
+ dev->name = size; \
+ preempt_enable(); \
+}
+#else
+#define BTRFS_DEVICE_GETSET_FUNCS(name) \
+static inline u64 \
+btrfs_device_get_##name(const struct btrfs_device *dev) \
+{ \
+ return dev->name; \
+} \
+ \
+static inline void \
+btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
+{ \
+ dev->name = size; \
+}
+#endif
+
+BTRFS_DEVICE_GETSET_FUNCS(total_bytes);
+BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes);
+BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
+
struct btrfs_fs_devices {
u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
- /* the device with this id has the most recent copy of the super */
- u64 latest_devid;
- u64 latest_trans;
u64 num_devices;
u64 open_devices;
u64 rw_devices;
u64 missing_devices;
u64 total_rw_bytes;
- u64 num_can_discard;
u64 total_devices;
struct block_device *latest_bdev;
@@ -139,6 +239,7 @@ struct btrfs_fs_devices {
struct mutex device_list_mutex;
struct list_head devices;
+ struct list_head resized_devices;
/* devices not currently being allocated */
struct list_head alloc_list;
struct list_head list;
@@ -167,8 +268,9 @@ struct btrfs_fs_devices {
*/
typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err);
struct btrfs_io_bio {
- unsigned long mirror_num;
- unsigned long stripe_index;
+ unsigned int mirror_num;
+ unsigned int stripe_index;
+ u64 logical;
u8 *csum;
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
u8 *csum_allocated;
@@ -325,6 +427,7 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_root *root, char *path);
int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
+ struct btrfs_device *srcdev,
struct btrfs_device **device_out);
int btrfs_balance(struct btrfs_balance_control *bctl,
struct btrfs_ioctl_balance_args *bargs);
@@ -360,11 +463,20 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root,
u64 chunk_offset, u64 chunk_size);
+int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 chunk_offset);
+
+static inline int btrfs_dev_stats_dirty(struct btrfs_device *dev)
+{
+ return atomic_read(&dev->dev_stats_ccnt);
+}
+
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
int index)
{
atomic_inc(dev->dev_stat_values + index);
- dev->dev_stats_dirty = 1;
+ smp_mb__before_atomic();
+ atomic_inc(&dev->dev_stats_ccnt);
}
static inline int btrfs_dev_stat_read(struct btrfs_device *dev,
@@ -379,7 +491,8 @@ static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev,
int ret;
ret = atomic_xchg(dev->dev_stat_values + index, 0);
- dev->dev_stats_dirty = 1;
+ smp_mb__before_atomic();
+ atomic_inc(&dev->dev_stats_ccnt);
return ret;
}
@@ -387,7 +500,8 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
int index, unsigned long val)
{
atomic_set(dev->dev_stat_values + index, val);
- dev->dev_stats_dirty = 1;
+ smp_mb__before_atomic();
+ atomic_inc(&dev->dev_stats_ccnt);
}
static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
@@ -395,4 +509,8 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
{
btrfs_dev_stat_set(dev, index, 0);
}
+
+void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
+void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
+ struct btrfs_transaction *transaction);
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index ad8328d797ea..dcf20131fbe4 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -237,7 +237,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
* first xattr that we find and walk forward
*/
key.objectid = btrfs_ino(inode);
- btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
+ key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = 0;
path = btrfs_alloc_path();
@@ -273,7 +273,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
/* check to make sure this item is what we want */
if (found_key.objectid != key.objectid)
break;
- if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY)
+ if (found_key.type != BTRFS_XATTR_ITEM_KEY)
break;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index b67d8fc81277..759fa4e2de8f 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -33,8 +33,7 @@
#include "compression.h"
struct workspace {
- z_stream inf_strm;
- z_stream def_strm;
+ z_stream strm;
char *buf;
struct list_head list;
};
@@ -43,8 +42,7 @@ static void zlib_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
- vfree(workspace->def_strm.workspace);
- vfree(workspace->inf_strm.workspace);
+ vfree(workspace->strm.workspace);
kfree(workspace->buf);
kfree(workspace);
}
@@ -52,17 +50,17 @@ static void zlib_free_workspace(struct list_head *ws)
static struct list_head *zlib_alloc_workspace(void)
{
struct workspace *workspace;
+ int workspacesize;
workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
if (!workspace)
return ERR_PTR(-ENOMEM);
- workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize(
- MAX_WBITS, MAX_MEM_LEVEL));
- workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
+ workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
+ zlib_inflate_workspacesize());
+ workspace->strm.workspace = vmalloc(workspacesize);
workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
- if (!workspace->def_strm.workspace ||
- !workspace->inf_strm.workspace || !workspace->buf)
+ if (!workspace->strm.workspace || !workspace->buf)
goto fail;
INIT_LIST_HEAD(&workspace->list);
@@ -96,14 +94,14 @@ static int zlib_compress_pages(struct list_head *ws,
*total_out = 0;
*total_in = 0;
- if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
+ if (Z_OK != zlib_deflateInit(&workspace->strm, 3)) {
printk(KERN_WARNING "BTRFS: deflateInit failed\n");
ret = -EIO;
goto out;
}
- workspace->def_strm.total_in = 0;
- workspace->def_strm.total_out = 0;
+ workspace->strm.total_in = 0;
+ workspace->strm.total_out = 0;
in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
data_in = kmap(in_page);
@@ -117,25 +115,25 @@ static int zlib_compress_pages(struct list_head *ws,
pages[0] = out_page;
nr_pages = 1;
- workspace->def_strm.next_in = data_in;
- workspace->def_strm.next_out = cpage_out;
- workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
- workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
+ workspace->strm.next_in = data_in;
+ workspace->strm.next_out = cpage_out;
+ workspace->strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.avail_in = min(len, PAGE_CACHE_SIZE);
- while (workspace->def_strm.total_in < len) {
- ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
+ while (workspace->strm.total_in < len) {
+ ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH);
if (ret != Z_OK) {
printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
ret);
- zlib_deflateEnd(&workspace->def_strm);
+ zlib_deflateEnd(&workspace->strm);
ret = -EIO;
goto out;
}
/* we're making it bigger, give up */
- if (workspace->def_strm.total_in > 8192 &&
- workspace->def_strm.total_in <
- workspace->def_strm.total_out) {
+ if (workspace->strm.total_in > 8192 &&
+ workspace->strm.total_in <
+ workspace->strm.total_out) {
ret = -E2BIG;
goto out;
}
@@ -143,7 +141,7 @@ static int zlib_compress_pages(struct list_head *ws,
* before the total_in so we will pull in a new page for
* the stream end if required
*/
- if (workspace->def_strm.avail_out == 0) {
+ if (workspace->strm.avail_out == 0) {
kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
@@ -158,19 +156,19 @@ static int zlib_compress_pages(struct list_head *ws,
cpage_out = kmap(out_page);
pages[nr_pages] = out_page;
nr_pages++;
- workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
- workspace->def_strm.next_out = cpage_out;
+ workspace->strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.next_out = cpage_out;
}
/* we're all done */
- if (workspace->def_strm.total_in >= len)
+ if (workspace->strm.total_in >= len)
break;
/* we've read in a full page, get a new one */
- if (workspace->def_strm.avail_in == 0) {
- if (workspace->def_strm.total_out > max_out)
+ if (workspace->strm.avail_in == 0) {
+ if (workspace->strm.total_out > max_out)
break;
- bytes_left = len - workspace->def_strm.total_in;
+ bytes_left = len - workspace->strm.total_in;
kunmap(in_page);
page_cache_release(in_page);
@@ -178,28 +176,28 @@ static int zlib_compress_pages(struct list_head *ws,
in_page = find_get_page(mapping,
start >> PAGE_CACHE_SHIFT);
data_in = kmap(in_page);
- workspace->def_strm.avail_in = min(bytes_left,
+ workspace->strm.avail_in = min(bytes_left,
PAGE_CACHE_SIZE);
- workspace->def_strm.next_in = data_in;
+ workspace->strm.next_in = data_in;
}
}
- workspace->def_strm.avail_in = 0;
- ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
- zlib_deflateEnd(&workspace->def_strm);
+ workspace->strm.avail_in = 0;
+ ret = zlib_deflate(&workspace->strm, Z_FINISH);
+ zlib_deflateEnd(&workspace->strm);
if (ret != Z_STREAM_END) {
ret = -EIO;
goto out;
}
- if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
+ if (workspace->strm.total_out >= workspace->strm.total_in) {
ret = -E2BIG;
goto out;
}
ret = 0;
- *total_out = workspace->def_strm.total_out;
- *total_in = workspace->def_strm.total_in;
+ *total_out = workspace->strm.total_out;
+ *total_in = workspace->strm.total_in;
out:
*out_pages = nr_pages;
if (out_page)
@@ -225,19 +223,18 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
size_t total_out = 0;
unsigned long page_in_index = 0;
unsigned long page_out_index = 0;
- unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
- PAGE_CACHE_SIZE;
+ unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE);
unsigned long buf_start;
unsigned long pg_offset;
data_in = kmap(pages_in[page_in_index]);
- workspace->inf_strm.next_in = data_in;
- workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
- workspace->inf_strm.total_in = 0;
+ workspace->strm.next_in = data_in;
+ workspace->strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
+ workspace->strm.total_in = 0;
- workspace->inf_strm.total_out = 0;
- workspace->inf_strm.next_out = workspace->buf;
- workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.total_out = 0;
+ workspace->strm.next_out = workspace->buf;
+ workspace->strm.avail_out = PAGE_CACHE_SIZE;
pg_offset = 0;
/* If it's deflate, and it's got no preset dictionary, then
@@ -247,21 +244,21 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
!(((data_in[0]<<8) + data_in[1]) % 31)) {
wbits = -((data_in[0] >> 4) + 8);
- workspace->inf_strm.next_in += 2;
- workspace->inf_strm.avail_in -= 2;
+ workspace->strm.next_in += 2;
+ workspace->strm.avail_in -= 2;
}
- if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
+ if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
printk(KERN_WARNING "BTRFS: inflateInit failed\n");
return -EIO;
}
- while (workspace->inf_strm.total_in < srclen) {
- ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
+ while (workspace->strm.total_in < srclen) {
+ ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
if (ret != Z_OK && ret != Z_STREAM_END)
break;
buf_start = total_out;
- total_out = workspace->inf_strm.total_out;
+ total_out = workspace->strm.total_out;
/* we didn't make progress in this inflate call, we're done */
if (buf_start == total_out)
@@ -276,10 +273,10 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
goto done;
}
- workspace->inf_strm.next_out = workspace->buf;
- workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.next_out = workspace->buf;
+ workspace->strm.avail_out = PAGE_CACHE_SIZE;
- if (workspace->inf_strm.avail_in == 0) {
+ if (workspace->strm.avail_in == 0) {
unsigned long tmp;
kunmap(pages_in[page_in_index]);
page_in_index++;
@@ -288,9 +285,9 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
break;
}
data_in = kmap(pages_in[page_in_index]);
- workspace->inf_strm.next_in = data_in;
- tmp = srclen - workspace->inf_strm.total_in;
- workspace->inf_strm.avail_in = min(tmp,
+ workspace->strm.next_in = data_in;
+ tmp = srclen - workspace->strm.total_in;
+ workspace->strm.avail_in = min(tmp,
PAGE_CACHE_SIZE);
}
}
@@ -299,7 +296,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
else
ret = 0;
done:
- zlib_inflateEnd(&workspace->inf_strm);
+ zlib_inflateEnd(&workspace->strm);
if (data_in)
kunmap(pages_in[page_in_index]);
return ret;
@@ -317,13 +314,13 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
unsigned long total_out = 0;
char *kaddr;
- workspace->inf_strm.next_in = data_in;
- workspace->inf_strm.avail_in = srclen;
- workspace->inf_strm.total_in = 0;
+ workspace->strm.next_in = data_in;
+ workspace->strm.avail_in = srclen;
+ workspace->strm.total_in = 0;
- workspace->inf_strm.next_out = workspace->buf;
- workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
- workspace->inf_strm.total_out = 0;
+ workspace->strm.next_out = workspace->buf;
+ workspace->strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.total_out = 0;
/* If it's deflate, and it's got no preset dictionary, then
we can tell zlib to skip the adler32 check. */
if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
@@ -331,11 +328,11 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
!(((data_in[0]<<8) + data_in[1]) % 31)) {
wbits = -((data_in[0] >> 4) + 8);
- workspace->inf_strm.next_in += 2;
- workspace->inf_strm.avail_in -= 2;
+ workspace->strm.next_in += 2;
+ workspace->strm.avail_in -= 2;
}
- if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
+ if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
printk(KERN_WARNING "BTRFS: inflateInit failed\n");
return -EIO;
}
@@ -346,12 +343,12 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
unsigned long bytes;
unsigned long pg_offset = 0;
- ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
+ ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
if (ret != Z_OK && ret != Z_STREAM_END)
break;
buf_start = total_out;
- total_out = workspace->inf_strm.total_out;
+ total_out = workspace->strm.total_out;
if (total_out == buf_start) {
ret = -EIO;
@@ -377,8 +374,8 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
pg_offset += bytes;
bytes_left -= bytes;
next:
- workspace->inf_strm.next_out = workspace->buf;
- workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.next_out = workspace->buf;
+ workspace->strm.avail_out = PAGE_CACHE_SIZE;
}
if (ret != Z_STREAM_END && bytes_left != 0)
@@ -386,7 +383,7 @@ next:
else
ret = 0;
- zlib_inflateEnd(&workspace->inf_strm);
+ zlib_inflateEnd(&workspace->strm);
return ret;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 8f05111bbb8b..20805db2c987 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -128,21 +128,15 @@ __clear_page_buffers(struct page *page)
page_cache_release(page);
}
-
-static int quiet_error(struct buffer_head *bh)
-{
- if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
- return 0;
- return 1;
-}
-
-
-static void buffer_io_error(struct buffer_head *bh)
+static void buffer_io_error(struct buffer_head *bh, char *msg)
{
char b[BDEVNAME_SIZE];
- printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
+
+ if (!test_bit(BH_Quiet, &bh->b_state))
+ printk_ratelimited(KERN_ERR
+ "Buffer I/O error on dev %s, logical block %llu%s\n",
bdevname(bh->b_bdev, b),
- (unsigned long long)bh->b_blocknr);
+ (unsigned long long)bh->b_blocknr, msg);
}
/*
@@ -177,17 +171,10 @@ EXPORT_SYMBOL(end_buffer_read_sync);
void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
- char b[BDEVNAME_SIZE];
-
if (uptodate) {
set_buffer_uptodate(bh);
} else {
- if (!quiet_error(bh)) {
- buffer_io_error(bh);
- printk(KERN_WARNING "lost page write due to "
- "I/O error on %s\n",
- bdevname(bh->b_bdev, b));
- }
+ buffer_io_error(bh, ", lost sync page write");
set_buffer_write_io_error(bh);
clear_buffer_uptodate(bh);
}
@@ -304,8 +291,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
set_buffer_uptodate(bh);
} else {
clear_buffer_uptodate(bh);
- if (!quiet_error(bh))
- buffer_io_error(bh);
+ buffer_io_error(bh, ", async page read");
SetPageError(page);
}
@@ -353,7 +339,6 @@ still_busy:
*/
void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
- char b[BDEVNAME_SIZE];
unsigned long flags;
struct buffer_head *first;
struct buffer_head *tmp;
@@ -365,12 +350,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
if (uptodate) {
set_buffer_uptodate(bh);
} else {
- if (!quiet_error(bh)) {
- buffer_io_error(bh);
- printk(KERN_WARNING "lost page write due to "
- "I/O error on %s\n",
- bdevname(bh->b_bdev, b));
- }
+ buffer_io_error(bh, ", lost async page write");
set_bit(AS_EIO, &page->mapping->flags);
set_buffer_write_io_error(bh);
clear_buffer_uptodate(bh);
@@ -993,7 +973,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
*/
static int
grow_dev_page(struct block_device *bdev, sector_t block,
- pgoff_t index, int size, int sizebits)
+ pgoff_t index, int size, int sizebits, gfp_t gfp)
{
struct inode *inode = bdev->bd_inode;
struct page *page;
@@ -1002,8 +982,8 @@ grow_dev_page(struct block_device *bdev, sector_t block,
int ret = 0; /* Will call free_more_memory() */
gfp_t gfp_mask;
- gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS;
- gfp_mask |= __GFP_MOVABLE;
+ gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp;
+
/*
* XXX: __getblk_slow() can not really deal with failure and
* will endlessly loop on improvised global reclaim. Prefer
@@ -1022,7 +1002,8 @@ grow_dev_page(struct block_device *bdev, sector_t block,
bh = page_buffers(page);
if (bh->b_size == size) {
end_block = init_page_buffers(page, bdev,
- index << sizebits, size);
+ (sector_t)index << sizebits,
+ size);
goto done;
}
if (!try_to_free_buffers(page))
@@ -1043,7 +1024,8 @@ grow_dev_page(struct block_device *bdev, sector_t block,
*/
spin_lock(&inode->i_mapping->private_lock);
link_dev_buffers(page, bh);
- end_block = init_page_buffers(page, bdev, index << sizebits, size);
+ end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits,
+ size);
spin_unlock(&inode->i_mapping->private_lock);
done:
ret = (block < end_block) ? 1 : -ENXIO;
@@ -1058,7 +1040,7 @@ failed:
* that page was dirty, the buffers are set dirty also.
*/
static int
-grow_buffers(struct block_device *bdev, sector_t block, int size)
+grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
{
pgoff_t index;
int sizebits;
@@ -1085,11 +1067,12 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
}
/* Create a page with the proper size buffers.. */
- return grow_dev_page(bdev, block, index, size, sizebits);
+ return grow_dev_page(bdev, block, index, size, sizebits, gfp);
}
-static struct buffer_head *
-__getblk_slow(struct block_device *bdev, sector_t block, int size)
+struct buffer_head *
+__getblk_slow(struct block_device *bdev, sector_t block,
+ unsigned size, gfp_t gfp)
{
/* Size must be multiple of hard sectorsize */
if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
@@ -1111,13 +1094,14 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
if (bh)
return bh;
- ret = grow_buffers(bdev, block, size);
+ ret = grow_buffers(bdev, block, size, gfp);
if (ret < 0)
return NULL;
if (ret == 0)
free_more_memory();
}
}
+EXPORT_SYMBOL(__getblk_slow);
/*
* The relationship between dirty buffers and dirty pages:
@@ -1251,7 +1235,7 @@ static struct buffer_head *__bread_slow(struct buffer_head *bh)
* a local interrupt disable for that.
*/
-#define BH_LRU_SIZE 8
+#define BH_LRU_SIZE 16
struct bh_lru {
struct buffer_head *bhs[BH_LRU_SIZE];
@@ -1329,8 +1313,8 @@ lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
for (i = 0; i < BH_LRU_SIZE; i++) {
struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
- if (bh && bh->b_bdev == bdev &&
- bh->b_blocknr == block && bh->b_size == size) {
+ if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
+ bh->b_size == size) {
if (i) {
while (i) {
__this_cpu_write(bh_lrus.bhs[i],
@@ -1371,24 +1355,25 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
EXPORT_SYMBOL(__find_get_block);
/*
- * __getblk will locate (and, if necessary, create) the buffer_head
+ * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
* which corresponds to the passed block_device, block and size. The
* returned buffer has its reference count incremented.
*
- * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
- * attempt is failing. FIXME, perhaps?
+ * __getblk_gfp() will lock up the machine if grow_dev_page's
+ * try_to_free_buffers() attempt is failing. FIXME, perhaps?
*/
struct buffer_head *
-__getblk(struct block_device *bdev, sector_t block, unsigned size)
+__getblk_gfp(struct block_device *bdev, sector_t block,
+ unsigned size, gfp_t gfp)
{
struct buffer_head *bh = __find_get_block(bdev, block, size);
might_sleep();
if (bh == NULL)
- bh = __getblk_slow(bdev, block, size);
+ bh = __getblk_slow(bdev, block, size, gfp);
return bh;
}
-EXPORT_SYMBOL(__getblk);
+EXPORT_SYMBOL(__getblk_gfp);
/*
* Do async read-ahead on a buffer..
@@ -1404,24 +1389,28 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
EXPORT_SYMBOL(__breadahead);
/**
- * __bread() - reads a specified block and returns the bh
+ * __bread_gfp() - reads a specified block and returns the bh
* @bdev: the block_device to read from
* @block: number of block
* @size: size (in bytes) to read
- *
+ * @gfp: page allocation flag
+ *
* Reads a specified block, and returns buffer head that contains it.
+ * The page cache can be allocated from non-movable area
+ * not to prevent page migration if you set gfp to zero.
* It returns NULL if the block was unreadable.
*/
struct buffer_head *
-__bread(struct block_device *bdev, sector_t block, unsigned size)
+__bread_gfp(struct block_device *bdev, sector_t block,
+ unsigned size, gfp_t gfp)
{
- struct buffer_head *bh = __getblk(bdev, block, size);
+ struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
if (likely(bh) && !buffer_uptodate(bh))
bh = __bread_slow(bh);
return bh;
}
-EXPORT_SYMBOL(__bread);
+EXPORT_SYMBOL(__bread_gfp);
/*
* invalidate_bh_lrus() is called rarely - but not only at unmount.
@@ -2080,6 +2069,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
+ loff_t old_size = inode->i_size;
int i_size_changed = 0;
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
@@ -2099,6 +2089,8 @@ int generic_write_end(struct file *file, struct address_space *mapping,
unlock_page(page);
page_cache_release(page);
+ if (old_size < pos)
+ pagecache_isize_extended(inode, old_size, pos);
/*
* Don't mark the inode dirty under page lock. First, it unnecessarily
* makes the holding time of page lock longer. Second, it forces lock
@@ -2316,6 +2308,11 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
err = 0;
balance_dirty_pages_ratelimited(mapping);
+
+ if (unlikely(fatal_signal_pending(current))) {
+ err = -EINTR;
+ goto out;
+ }
}
/* page covers the boundary, find the boundary offset */
@@ -2954,7 +2951,7 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
/*
* This allows us to do IO even on the odd last sectors
- * of a device, even if the bh block size is some multiple
+ * of a device, even if the block size is some multiple
* of the physical sector size.
*
* We'll just truncate the bio to the size of the device,
@@ -2964,10 +2961,11 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
* errors, this only handles the "we need to be able to
* do IO at the final sector" case.
*/
-static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
+void guard_bio_eod(int rw, struct bio *bio)
{
sector_t maxsector;
- unsigned bytes;
+ struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
+ unsigned truncated_bytes;
maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
if (!maxsector)
@@ -2982,23 +2980,20 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
return;
maxsector -= bio->bi_iter.bi_sector;
- bytes = bio->bi_iter.bi_size;
- if (likely((bytes >> 9) <= maxsector))
+ if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
return;
- /* Uhhuh. We've got a bh that straddles the device size! */
- bytes = maxsector << 9;
+ /* Uhhuh. We've got a bio that straddles the device size! */
+ truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
/* Truncate the bio.. */
- bio->bi_iter.bi_size = bytes;
- bio->bi_io_vec[0].bv_len = bytes;
+ bio->bi_iter.bi_size -= truncated_bytes;
+ bvec->bv_len -= truncated_bytes;
/* ..and clear the end of the buffer for reads */
if ((rw & RW_MASK) == READ) {
- void *kaddr = kmap_atomic(bh->b_page);
- memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes);
- kunmap_atomic(kaddr);
- flush_dcache_page(bh->b_page);
+ zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
+ truncated_bytes);
}
}
@@ -3039,7 +3034,7 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
bio->bi_flags |= bio_flags;
/* Take care of bh's that straddle the end of the device */
- guard_bh_eod(rw, bio, bh);
+ guard_bio_eod(rw, bio);
if (buffer_meta(bh))
rw |= REQ_META;
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index d749731dc0ee..fbb08e97438d 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -50,18 +50,18 @@ int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
cache->brun_percent < 100);
if (*args) {
- pr_err("'bind' command doesn't take an argument");
+ pr_err("'bind' command doesn't take an argument\n");
return -EINVAL;
}
if (!cache->rootdirname) {
- pr_err("No cache directory specified");
+ pr_err("No cache directory specified\n");
return -EINVAL;
}
/* don't permit already bound caches to be re-bound */
if (test_bit(CACHEFILES_READY, &cache->flags)) {
- pr_err("Cache already bound");
+ pr_err("Cache already bound\n");
return -EBUSY;
}
@@ -248,7 +248,7 @@ error_open_root:
kmem_cache_free(cachefiles_object_jar, fsdef);
error_root_object:
cachefiles_end_secure(cache, saved_cred);
- pr_err("Failed to register: %d", ret);
+ pr_err("Failed to register: %d\n", ret);
return ret;
}
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index b078d3081d6c..ce1b115dcc28 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -315,7 +315,7 @@ static unsigned int cachefiles_daemon_poll(struct file *file,
static int cachefiles_daemon_range_error(struct cachefiles_cache *cache,
char *args)
{
- pr_err("Free space limits must be in range 0%%<=stop<cull<run<100%%");
+ pr_err("Free space limits must be in range 0%%<=stop<cull<run<100%%\n");
return -EINVAL;
}
@@ -475,12 +475,12 @@ static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args)
_enter(",%s", args);
if (!*args) {
- pr_err("Empty directory specified");
+ pr_err("Empty directory specified\n");
return -EINVAL;
}
if (cache->rootdirname) {
- pr_err("Second cache directory specified");
+ pr_err("Second cache directory specified\n");
return -EEXIST;
}
@@ -503,12 +503,12 @@ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args)
_enter(",%s", args);
if (!*args) {
- pr_err("Empty security context specified");
+ pr_err("Empty security context specified\n");
return -EINVAL;
}
if (cache->secctx) {
- pr_err("Second security context specified");
+ pr_err("Second security context specified\n");
return -EINVAL;
}
@@ -531,7 +531,7 @@ static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
_enter(",%s", args);
if (!*args) {
- pr_err("Empty tag specified");
+ pr_err("Empty tag specified\n");
return -EINVAL;
}
@@ -562,12 +562,12 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
goto inval;
if (!test_bit(CACHEFILES_READY, &cache->flags)) {
- pr_err("cull applied to unready cache");
+ pr_err("cull applied to unready cache\n");
return -EIO;
}
if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
- pr_err("cull applied to dead cache");
+ pr_err("cull applied to dead cache\n");
return -EIO;
}
@@ -587,11 +587,11 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
notdir:
path_put(&path);
- pr_err("cull command requires dirfd to be a directory");
+ pr_err("cull command requires dirfd to be a directory\n");
return -ENOTDIR;
inval:
- pr_err("cull command requires dirfd and filename");
+ pr_err("cull command requires dirfd and filename\n");
return -EINVAL;
}
@@ -614,7 +614,7 @@ static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args)
return 0;
inval:
- pr_err("debug command requires mask");
+ pr_err("debug command requires mask\n");
return -EINVAL;
}
@@ -634,12 +634,12 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
goto inval;
if (!test_bit(CACHEFILES_READY, &cache->flags)) {
- pr_err("inuse applied to unready cache");
+ pr_err("inuse applied to unready cache\n");
return -EIO;
}
if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
- pr_err("inuse applied to dead cache");
+ pr_err("inuse applied to dead cache\n");
return -EIO;
}
@@ -659,11 +659,11 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
notdir:
path_put(&path);
- pr_err("inuse command requires dirfd to be a directory");
+ pr_err("inuse command requires dirfd to be a directory\n");
return -ENOTDIR;
inval:
- pr_err("inuse command requires dirfd and filename");
+ pr_err("inuse command requires dirfd and filename\n");
return -EINVAL;
}
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 584743d456c3..1c7293c3a93a 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -268,20 +268,27 @@ static void cachefiles_drop_object(struct fscache_object *_object)
ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
#endif
- /* delete retired objects */
- if (test_bit(FSCACHE_OBJECT_RETIRED, &object->fscache.flags) &&
- _object != cache->cache.fsdef
- ) {
- _debug("- retire object OBJ%x", object->fscache.debug_id);
- cachefiles_begin_secure(cache, &saved_cred);
- cachefiles_delete_object(cache, object);
- cachefiles_end_secure(cache, saved_cred);
- }
+ /* We need to tidy the object up if we did in fact manage to open it.
+ * It's possible for us to get here before the object is fully
+ * initialised if the parent goes away or the object gets retired
+ * before we set it up.
+ */
+ if (object->dentry) {
+ /* delete retired objects */
+ if (test_bit(FSCACHE_OBJECT_RETIRED, &object->fscache.flags) &&
+ _object != cache->cache.fsdef
+ ) {
+ _debug("- retire object OBJ%x", object->fscache.debug_id);
+ cachefiles_begin_secure(cache, &saved_cred);
+ cachefiles_delete_object(cache, object);
+ cachefiles_end_secure(cache, saved_cred);
+ }
- /* close the filesystem stuff attached to the object */
- if (object->backer != object->dentry)
- dput(object->backer);
- object->backer = NULL;
+ /* close the filesystem stuff attached to the object */
+ if (object->backer != object->dentry)
+ dput(object->backer);
+ object->backer = NULL;
+ }
/* note that the object is now inactive */
if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 3d50998abf57..8c52472d2efa 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -255,7 +255,7 @@ extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
#define cachefiles_io_error(___cache, FMT, ...) \
do { \
- pr_err("I/O Error: " FMT, ##__VA_ARGS__); \
+ pr_err("I/O Error: " FMT"\n", ##__VA_ARGS__); \
fscache_io_error(&(___cache)->cache); \
set_bit(CACHEFILES_DEAD, &(___cache)->flags); \
} while (0)
diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c
index 180edfb45f66..711f13d8c2de 100644
--- a/fs/cachefiles/main.c
+++ b/fs/cachefiles/main.c
@@ -84,7 +84,7 @@ error_proc:
error_object_jar:
misc_deregister(&cachefiles_dev);
error_dev:
- pr_err("failed to register: %d", ret);
+ pr_err("failed to register: %d\n", ret);
return ret;
}
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 5bf2b41e66d3..e12f189d539b 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -189,7 +189,7 @@ try_again:
/* an old object from a previous incarnation is hogging the slot - we
* need to wait for it to be destroyed */
wait_for_old_object:
- if (fscache_object_is_live(&object->fscache)) {
+ if (fscache_object_is_live(&xobject->fscache)) {
pr_err("\n");
pr_err("Error: Unexpected object collision\n");
cachefiles_printk_object(object, xobject);
@@ -543,7 +543,7 @@ lookup_again:
next, next->d_inode, next->d_inode->i_ino);
} else if (!S_ISDIR(next->d_inode->i_mode)) {
- pr_err("inode %lu is not a directory",
+ pr_err("inode %lu is not a directory\n",
next->d_inode->i_ino);
ret = -ENOBUFS;
goto error;
@@ -574,7 +574,7 @@ lookup_again:
} else if (!S_ISDIR(next->d_inode->i_mode) &&
!S_ISREG(next->d_inode->i_mode)
) {
- pr_err("inode %lu is not a file or directory",
+ pr_err("inode %lu is not a file or directory\n",
next->d_inode->i_ino);
ret = -ENOBUFS;
goto error;
@@ -768,7 +768,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
ASSERT(subdir->d_inode);
if (!S_ISDIR(subdir->d_inode->i_mode)) {
- pr_err("%s is not a directory", dirname);
+ pr_err("%s is not a directory\n", dirname);
ret = -EIO;
goto check_error;
}
@@ -779,7 +779,8 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
!subdir->d_inode->i_op->lookup ||
!subdir->d_inode->i_op->mkdir ||
!subdir->d_inode->i_op->create ||
- !subdir->d_inode->i_op->rename ||
+ (!subdir->d_inode->i_op->rename &&
+ !subdir->d_inode->i_op->rename2) ||
!subdir->d_inode->i_op->rmdir ||
!subdir->d_inode->i_op->unlink)
goto check_error;
@@ -795,13 +796,13 @@ check_error:
mkdir_error:
mutex_unlock(&dir->d_inode->i_mutex);
dput(subdir);
- pr_err("mkdir %s failed with error %d", dirname, ret);
+ pr_err("mkdir %s failed with error %d\n", dirname, ret);
return ERR_PTR(ret);
lookup_error:
mutex_unlock(&dir->d_inode->i_mutex);
ret = PTR_ERR(subdir);
- pr_err("Lookup %s failed with error %d", dirname, ret);
+ pr_err("Lookup %s failed with error %d\n", dirname, ret);
return ERR_PTR(ret);
nomem_d_alloc:
@@ -891,7 +892,7 @@ lookup_error:
if (ret == -EIO) {
cachefiles_io_error(cache, "Lookup failed");
} else if (ret != -ENOMEM) {
- pr_err("Internal error: %d", ret);
+ pr_err("Internal error: %d\n", ret);
ret = -EIO;
}
@@ -950,7 +951,7 @@ error:
}
if (ret != -ENOMEM) {
- pr_err("Internal error: %d", ret);
+ pr_err("Internal error: %d\n", ret);
ret = -EIO;
}
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 4b1fb5ca65b8..616db0e77b44 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -151,7 +151,6 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
struct cachefiles_one_read *monitor;
struct cachefiles_object *object;
struct fscache_retrieval *op;
- struct pagevec pagevec;
int error, max;
op = container_of(_op, struct fscache_retrieval, op);
@@ -160,8 +159,6 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
_enter("{ino=%lu}", object->backer->d_inode->i_ino);
- pagevec_init(&pagevec, 0);
-
max = 8;
spin_lock_irq(&object->work_lock);
@@ -396,7 +393,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
{
struct cachefiles_object *object;
struct cachefiles_cache *cache;
- struct pagevec pagevec;
struct inode *inode;
sector_t block0, block;
unsigned shift;
@@ -427,8 +423,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
op->op.flags |= FSCACHE_OP_ASYNC;
op->op.processor = cachefiles_read_copier;
- pagevec_init(&pagevec, 0);
-
/* we assume the absence or presence of the first block is a good
* enough indication for the page as a whole
* - TODO: don't use bmap() for this as it is _not_ actually good
@@ -886,7 +880,6 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
{
struct cachefiles_object *object;
struct cachefiles_cache *cache;
- mm_segment_t old_fs;
struct file *file;
struct path path;
loff_t pos, eof;
@@ -920,36 +913,27 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
if (IS_ERR(file)) {
ret = PTR_ERR(file);
} else {
- ret = -EIO;
- if (file->f_op->write) {
- pos = (loff_t) page->index << PAGE_SHIFT;
-
- /* we mustn't write more data than we have, so we have
- * to beware of a partial page at EOF */
- eof = object->fscache.store_limit_l;
- len = PAGE_SIZE;
- if (eof & ~PAGE_MASK) {
- ASSERTCMP(pos, <, eof);
- if (eof - pos < PAGE_SIZE) {
- _debug("cut short %llx to %llx",
- pos, eof);
- len = eof - pos;
- ASSERTCMP(pos + len, ==, eof);
- }
+ pos = (loff_t) page->index << PAGE_SHIFT;
+
+ /* we mustn't write more data than we have, so we have
+ * to beware of a partial page at EOF */
+ eof = object->fscache.store_limit_l;
+ len = PAGE_SIZE;
+ if (eof & ~PAGE_MASK) {
+ ASSERTCMP(pos, <, eof);
+ if (eof - pos < PAGE_SIZE) {
+ _debug("cut short %llx to %llx",
+ pos, eof);
+ len = eof - pos;
+ ASSERTCMP(pos + len, ==, eof);
}
-
- data = kmap(page);
- file_start_write(file);
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- ret = file->f_op->write(
- file, (const void __user *) data, len, &pos);
- set_fs(old_fs);
- kunmap(page);
- file_end_write(file);
- if (ret != len)
- ret = -EIO;
}
+
+ data = kmap(page);
+ ret = __kernel_write(file, data, len, &pos);
+ kunmap(page);
+ if (ret != len)
+ ret = -EIO;
fput(file);
}
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 1ad51ffbb275..acbc1f094fb1 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -51,7 +51,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object)
}
if (ret != -EEXIST) {
- pr_err("Can't set xattr on %*.*s [%lu] (err %d)",
+ pr_err("Can't set xattr on %*.*s [%lu] (err %d)\n",
dentry->d_name.len, dentry->d_name.len,
dentry->d_name.name, dentry->d_inode->i_ino,
-ret);
@@ -64,7 +64,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object)
if (ret == -ERANGE)
goto bad_type_length;
- pr_err("Can't read xattr on %*.*s [%lu] (err %d)",
+ pr_err("Can't read xattr on %*.*s [%lu] (err %d)\n",
dentry->d_name.len, dentry->d_name.len,
dentry->d_name.name, dentry->d_inode->i_ino,
-ret);
@@ -85,14 +85,14 @@ error:
return ret;
bad_type_length:
- pr_err("Cache object %lu type xattr length incorrect",
+ pr_err("Cache object %lu type xattr length incorrect\n",
dentry->d_inode->i_ino);
ret = -EIO;
goto error;
bad_type:
xtype[2] = 0;
- pr_err("Cache object %*.*s [%lu] type %s not %s",
+ pr_err("Cache object %*.*s [%lu] type %s not %s\n",
dentry->d_name.len, dentry->d_name.len,
dentry->d_name.name, dentry->d_inode->i_ino,
xtype, type);
@@ -293,7 +293,7 @@ error:
return ret;
bad_type_length:
- pr_err("Cache object %lu xattr length incorrect",
+ pr_err("Cache object %lu xattr length incorrect\n",
dentry->d_inode->i_ino);
ret = -EIO;
goto error;
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 469f2e8657e8..5bd853ba44ff 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -169,26 +169,109 @@ out:
return ret;
}
-int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir)
+int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
+ struct ceph_acls_info *info)
{
- struct posix_acl *default_acl, *acl;
- int error;
+ struct posix_acl *acl, *default_acl;
+ size_t val_size1 = 0, val_size2 = 0;
+ struct ceph_pagelist *pagelist = NULL;
+ void *tmp_buf = NULL;
+ int err;
- error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
- if (error)
- return error;
+ err = posix_acl_create(dir, mode, &default_acl, &acl);
+ if (err)
+ return err;
+
+ if (acl) {
+ int ret = posix_acl_equiv_mode(acl, mode);
+ if (ret < 0)
+ goto out_err;
+ if (ret == 0) {
+ posix_acl_release(acl);
+ acl = NULL;
+ }
+ }
if (!default_acl && !acl)
- cache_no_acl(inode);
+ return 0;
+
+ if (acl)
+ val_size1 = posix_acl_xattr_size(acl->a_count);
+ if (default_acl)
+ val_size2 = posix_acl_xattr_size(default_acl->a_count);
+
+ err = -ENOMEM;
+ tmp_buf = kmalloc(max(val_size1, val_size2), GFP_NOFS);
+ if (!tmp_buf)
+ goto out_err;
+ pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_NOFS);
+ if (!pagelist)
+ goto out_err;
+ ceph_pagelist_init(pagelist);
+
+ err = ceph_pagelist_reserve(pagelist, PAGE_SIZE);
+ if (err)
+ goto out_err;
+
+ ceph_pagelist_encode_32(pagelist, acl && default_acl ? 2 : 1);
- if (default_acl) {
- error = ceph_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
- posix_acl_release(default_acl);
- }
if (acl) {
- if (!error)
- error = ceph_set_acl(inode, acl, ACL_TYPE_ACCESS);
- posix_acl_release(acl);
+ size_t len = strlen(POSIX_ACL_XATTR_ACCESS);
+ err = ceph_pagelist_reserve(pagelist, len + val_size1 + 8);
+ if (err)
+ goto out_err;
+ ceph_pagelist_encode_string(pagelist, POSIX_ACL_XATTR_ACCESS,
+ len);
+ err = posix_acl_to_xattr(&init_user_ns, acl,
+ tmp_buf, val_size1);
+ if (err < 0)
+ goto out_err;
+ ceph_pagelist_encode_32(pagelist, val_size1);
+ ceph_pagelist_append(pagelist, tmp_buf, val_size1);
}
- return error;
+ if (default_acl) {
+ size_t len = strlen(POSIX_ACL_XATTR_DEFAULT);
+ err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8);
+ if (err)
+ goto out_err;
+ err = ceph_pagelist_encode_string(pagelist,
+ POSIX_ACL_XATTR_DEFAULT, len);
+ err = posix_acl_to_xattr(&init_user_ns, default_acl,
+ tmp_buf, val_size2);
+ if (err < 0)
+ goto out_err;
+ ceph_pagelist_encode_32(pagelist, val_size2);
+ ceph_pagelist_append(pagelist, tmp_buf, val_size2);
+ }
+
+ kfree(tmp_buf);
+
+ info->acl = acl;
+ info->default_acl = default_acl;
+ info->pagelist = pagelist;
+ return 0;
+
+out_err:
+ posix_acl_release(acl);
+ posix_acl_release(default_acl);
+ kfree(tmp_buf);
+ if (pagelist)
+ ceph_pagelist_release(pagelist);
+ return err;
+}
+
+void ceph_init_inode_acls(struct inode* inode, struct ceph_acls_info *info)
+{
+ if (!inode)
+ return;
+ ceph_set_cached_acl(inode, ACL_TYPE_ACCESS, info->acl);
+ ceph_set_cached_acl(inode, ACL_TYPE_DEFAULT, info->default_acl);
+}
+
+void ceph_release_acls_info(struct ceph_acls_info *info)
+{
+ posix_acl_release(info->acl);
+ posix_acl_release(info->default_acl);
+ if (info->pagelist)
+ ceph_pagelist_release(info->pagelist);
}
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 90b3954d48ed..18c06bbaf136 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1076,12 +1076,6 @@ retry_locked:
/* past end of file? */
i_size = inode->i_size; /* caller holds i_mutex */
- if (i_size + len > inode->i_sb->s_maxbytes) {
- /* file is too big */
- r = -EINVAL;
- goto fail;
- }
-
if (page_off >= i_size ||
(pos_in_page == 0 && (pos+len) >= i_size &&
end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
@@ -1099,9 +1093,6 @@ retry_locked:
if (r < 0)
goto fail_nosnap;
goto retry_locked;
-
-fail:
- up_read(&mdsc->snap_rwsem);
fail_nosnap:
unlock_page(page);
return r;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 1fde164b74b5..cefca661464b 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2397,12 +2397,12 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
u64 max_size = le64_to_cpu(grant->max_size);
struct timespec mtime, atime, ctime;
int check_caps = 0;
- bool wake = 0;
- bool writeback = 0;
- bool queue_trunc = 0;
- bool queue_invalidate = 0;
- bool queue_revalidate = 0;
- bool deleted_inode = 0;
+ bool wake = false;
+ bool writeback = false;
+ bool queue_trunc = false;
+ bool queue_invalidate = false;
+ bool queue_revalidate = false;
+ bool deleted_inode = false;
dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
inode, cap, mds, seq, ceph_cap_string(newcaps));
@@ -2437,7 +2437,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
/* there were locked pages.. invalidate later
in a separate thread. */
if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
- queue_invalidate = 1;
+ queue_invalidate = true;
ci->i_rdcache_revoking = ci->i_rdcache_gen;
}
}
@@ -2466,7 +2466,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
set_nlink(inode, le32_to_cpu(grant->nlink));
if (inode->i_nlink == 0 &&
(newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
- deleted_inode = 1;
+ deleted_inode = true;
}
if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
@@ -2487,7 +2487,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
/* Do we need to revalidate our fscache cookie. Don't bother on the
* first cache cap as we already validate at cookie creation time. */
if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1)
- queue_revalidate = 1;
+ queue_revalidate = true;
if (newcaps & CEPH_CAP_ANY_RD) {
/* ctime/mtime/atime? */
@@ -2516,7 +2516,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
ci->i_wanted_max_size = 0; /* reset */
ci->i_requested_max_size = 0;
}
- wake = 1;
+ wake = true;
}
}
@@ -2546,7 +2546,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
ceph_cap_string(newcaps),
ceph_cap_string(revoking));
if (revoking & used & CEPH_CAP_FILE_BUFFER)
- writeback = 1; /* initiate writeback; will delay ack */
+ writeback = true; /* initiate writeback; will delay ack */
else if (revoking == CEPH_CAP_FILE_CACHE &&
(newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
queue_invalidate)
@@ -2572,7 +2572,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
cap->implemented |= newcaps; /* add bits only, to
* avoid stepping on a
* pending revocation */
- wake = 1;
+ wake = true;
}
BUG_ON(cap->issued & ~cap->implemented);
@@ -2586,7 +2586,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
kick_flushing_inode_caps(mdsc, session, inode);
up_read(&mdsc->snap_rwsem);
if (newcaps & ~issued)
- wake = 1;
+ wake = true;
}
if (queue_trunc) {
@@ -2638,7 +2638,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
for (i = 0; i < CEPH_CAP_BITS; i++)
if ((dirty & (1 << i)) &&
- flush_tid == ci->i_cap_flush_tid[i])
+ (u16)flush_tid == ci->i_cap_flush_tid[i])
cleaned |= 1 << i;
dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
@@ -3045,6 +3045,12 @@ void ceph_handle_caps(struct ceph_mds_session *session,
}
}
+ /* lookup ino */
+ inode = ceph_find_inode(sb, vino);
+ ci = ceph_inode(inode);
+ dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
+ vino.snap, inode);
+
mutex_lock(&session->s_mutex);
session->s_seq++;
dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
@@ -3053,11 +3059,6 @@ void ceph_handle_caps(struct ceph_mds_session *session,
if (op == CEPH_CAP_OP_IMPORT)
ceph_add_cap_releases(mdsc, session);
- /* lookup ino */
- inode = ceph_find_inode(sb, vino);
- ci = ceph_inode(inode);
- dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
- vino.snap, inode);
if (!inode) {
dout(" i don't have ino %llx\n", vino.ino);
@@ -3277,7 +3278,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
rel->ino = cpu_to_le64(ceph_ino(inode));
rel->cap_id = cpu_to_le64(cap->cap_id);
rel->seq = cpu_to_le32(cap->seq);
- rel->issue_seq = cpu_to_le32(cap->issue_seq),
+ rel->issue_seq = cpu_to_le32(cap->issue_seq);
rel->mseq = cpu_to_le32(cap->mseq);
rel->caps = cpu_to_le32(cap->implemented);
rel->wanted = cpu_to_le32(cap->mds_wanted);
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 5a743ac141ab..5d5a4c8c8496 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -158,10 +158,47 @@ static int dentry_lru_show(struct seq_file *s, void *ptr)
return 0;
}
+static int mds_sessions_show(struct seq_file *s, void *ptr)
+{
+ struct ceph_fs_client *fsc = s->private;
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_auth_client *ac = fsc->client->monc.auth;
+ struct ceph_options *opt = fsc->client->options;
+ int mds = -1;
+
+ mutex_lock(&mdsc->mutex);
+
+ /* The 'num' portion of an 'entity name' */
+ seq_printf(s, "global_id %llu\n", ac->global_id);
+
+ /* The -o name mount argument */
+ seq_printf(s, "name \"%s\"\n", opt->name ? opt->name : "");
+
+ /* The list of MDS session rank+state */
+ for (mds = 0; mds < mdsc->max_sessions; mds++) {
+ struct ceph_mds_session *session =
+ __ceph_lookup_mds_session(mdsc, mds);
+ if (!session) {
+ continue;
+ }
+ mutex_unlock(&mdsc->mutex);
+ seq_printf(s, "mds.%d %s\n",
+ session->s_mds,
+ ceph_session_state_name(session->s_state));
+
+ ceph_put_mds_session(session);
+ mutex_lock(&mdsc->mutex);
+ }
+ mutex_unlock(&mdsc->mutex);
+
+ return 0;
+}
+
CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
CEPH_DEFINE_SHOW_FUNC(mdsc_show)
CEPH_DEFINE_SHOW_FUNC(caps_show)
CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
+CEPH_DEFINE_SHOW_FUNC(mds_sessions_show)
/*
@@ -193,6 +230,7 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
debugfs_remove(fsc->debugfs_bdi);
debugfs_remove(fsc->debugfs_congestion_kb);
debugfs_remove(fsc->debugfs_mdsmap);
+ debugfs_remove(fsc->debugfs_mds_sessions);
debugfs_remove(fsc->debugfs_caps);
debugfs_remove(fsc->debugfs_mdsc);
debugfs_remove(fsc->debugfs_dentry_lru);
@@ -231,6 +269,14 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
if (!fsc->debugfs_mdsmap)
goto out;
+ fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions",
+ 0600,
+ fsc->client->debugfs_dir,
+ fsc,
+ &mds_sessions_show_fops);
+ if (!fsc->debugfs_mds_sessions)
+ goto out;
+
fsc->debugfs_mdsc = debugfs_create_file("mdsc",
0600,
fsc->client->debugfs_dir,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index c29d6ae68874..e6d63f8f98c0 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -682,17 +682,22 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
+ struct ceph_acls_info acls = {};
int err;
if (ceph_snap(dir) != CEPH_NOSNAP)
return -EROFS;
+ err = ceph_pre_init_acls(dir, &mode, &acls);
+ if (err < 0)
+ return err;
+
dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n",
dir, dentry, mode, rdev);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
if (IS_ERR(req)) {
- d_drop(dentry);
- return PTR_ERR(req);
+ err = PTR_ERR(req);
+ goto out;
}
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
@@ -701,15 +706,20 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
req->r_args.mknod.rdev = cpu_to_le32(rdev);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ if (acls.pagelist) {
+ req->r_pagelist = acls.pagelist;
+ acls.pagelist = NULL;
+ }
err = ceph_mdsc_do_request(mdsc, dir, req);
if (!err && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
ceph_mdsc_put_request(req);
-
+out:
if (!err)
- ceph_init_acl(dentry, dentry->d_inode, dir);
+ ceph_init_inode_acls(dentry->d_inode, &acls);
else
d_drop(dentry);
+ ceph_release_acls_info(&acls);
return err;
}
@@ -733,8 +743,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
if (IS_ERR(req)) {
- d_drop(dentry);
- return PTR_ERR(req);
+ err = PTR_ERR(req);
+ goto out;
}
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
@@ -746,9 +756,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
if (!err && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
ceph_mdsc_put_request(req);
- if (!err)
- ceph_init_acl(dentry, dentry->d_inode, dir);
- else
+out:
+ if (err)
d_drop(dentry);
return err;
}
@@ -758,6 +767,7 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
+ struct ceph_acls_info acls = {};
int err = -EROFS;
int op;
@@ -772,6 +782,12 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
} else {
goto out;
}
+
+ mode |= S_IFDIR;
+ err = ceph_pre_init_acls(dir, &mode, &acls);
+ if (err < 0)
+ goto out;
+
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req)) {
err = PTR_ERR(req);
@@ -784,15 +800,20 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
req->r_args.mkdir.mode = cpu_to_le32(mode);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ if (acls.pagelist) {
+ req->r_pagelist = acls.pagelist;
+ acls.pagelist = NULL;
+ }
err = ceph_mdsc_do_request(mdsc, dir, req);
if (!err && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
ceph_mdsc_put_request(req);
out:
if (!err)
- ceph_init_acl(dentry, dentry->d_inode, dir);
+ ceph_init_inode_acls(dentry->d_inode, &acls);
else
d_drop(dentry);
+ ceph_release_acls_info(&acls);
return err;
}
@@ -1069,7 +1090,6 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
ceph_dentry_lru_touch(dentry);
} else {
ceph_dir_clear_complete(dir);
- d_drop(dentry);
}
iput(dir);
return valid;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 302085100c28..d7e0da8366e6 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -235,6 +235,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
struct dentry *dn;
+ struct ceph_acls_info acls = {};
int err;
dout("atomic_open %p dentry %p '%.*s' %s flags %d mode 0%o\n",
@@ -248,22 +249,34 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
if (err < 0)
return err;
+ if (flags & O_CREAT) {
+ err = ceph_pre_init_acls(dir, &mode, &acls);
+ if (err < 0)
+ return err;
+ }
+
/* do the open */
req = prepare_open_request(dir->i_sb, flags, mode);
- if (IS_ERR(req))
- return PTR_ERR(req);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out_acl;
+ }
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
if (flags & O_CREAT) {
req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ if (acls.pagelist) {
+ req->r_pagelist = acls.pagelist;
+ acls.pagelist = NULL;
+ }
}
req->r_locked_dir = dir; /* caller holds dir->i_mutex */
err = ceph_mdsc_do_request(mdsc,
(flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
req);
if (err)
- goto out_err;
+ goto out_req;
err = ceph_handle_snapdir(req, dentry, err);
if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
@@ -278,7 +291,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
dn = NULL;
}
if (err)
- goto out_err;
+ goto out_req;
if (dn || dentry->d_inode == NULL || S_ISLNK(dentry->d_inode->i_mode)) {
/* make vfs retry on splice, ENOENT, or symlink */
dout("atomic_open finish_no_open on dn %p\n", dn);
@@ -286,15 +299,17 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
} else {
dout("atomic_open finish_open on dn %p\n", dn);
if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
- ceph_init_acl(dentry, dentry->d_inode, dir);
+ ceph_init_inode_acls(dentry->d_inode, &acls);
*opened |= FILE_CREATED;
}
err = finish_open(file, dentry, ceph_open, opened);
}
-out_err:
+out_req:
if (!req->r_err && req->r_target_inode)
ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
ceph_mdsc_put_request(req);
+out_acl:
+ ceph_release_acls_info(&acls);
dout("atomic_open result=%d\n", err);
return err;
}
@@ -423,6 +438,9 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
dout("sync_read on file %p %llu~%u %s\n", file, off,
(unsigned)len,
(file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
+
+ if (!len)
+ return 0;
/*
* flush any page cache pages in this range. this
* will make concurrent normal and sync io slow,
@@ -470,8 +488,11 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
size_t left = ret;
while (left) {
- int copy = min_t(size_t, PAGE_SIZE, left);
- l = copy_page_to_iter(pages[k++], 0, copy, i);
+ size_t page_off = off & ~PAGE_MASK;
+ size_t copy = min_t(size_t,
+ PAGE_SIZE - page_off, left);
+ l = copy_page_to_iter(pages[k++], page_off,
+ copy, i);
off += l;
left -= l;
if (l < copy)
@@ -531,7 +552,7 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
* objects, rollback on failure, etc.)
*/
static ssize_t
-ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from)
+ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
@@ -547,7 +568,6 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from)
int check_caps = 0;
int ret;
struct timespec mtime = CURRENT_TIME;
- loff_t pos = iocb->ki_pos;
size_t count = iov_iter_count(from);
if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
@@ -646,7 +666,8 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from)
* correct atomic write, we should e.g. take write locks on all
* objects, rollback on failure, etc.)
*/
-static ssize_t ceph_sync_write(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t
+ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
@@ -663,7 +684,6 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, struct iov_iter *from)
int check_caps = 0;
int ret;
struct timespec mtime = CURRENT_TIME;
- loff_t pos = iocb->ki_pos;
size_t count = iov_iter_count(from);
if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
@@ -821,8 +841,7 @@ again:
ceph_put_cap_refs(ci, got);
if (checkeof && ret >= 0) {
- int statret = ceph_do_getattr(inode,
- CEPH_STAT_CAP_SIZE);
+ int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
/* hit EOF or hole? */
if (statret == 0 && iocb->ki_pos < inode->i_size &&
@@ -831,7 +850,6 @@ again:
", reading more\n", iocb->ki_pos,
inode->i_size);
- iov_iter_advance(to, ret);
read += ret;
len -= ret;
checkeof = 0;
@@ -918,9 +936,9 @@ retry_snap:
/* we might need to revert back to that point */
data = *from;
if (file->f_flags & O_DIRECT)
- written = ceph_sync_direct_write(iocb, &data);
+ written = ceph_sync_direct_write(iocb, &data, pos);
else
- written = ceph_sync_write(iocb, &data);
+ written = ceph_sync_write(iocb, &data, pos);
if (written == -EOLDSNAPC) {
dout("aio_write %p %llx.%llx %llu~%u"
"got EOLDSNAPC, retrying\n",
@@ -990,7 +1008,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
mutex_lock(&inode->i_mutex);
if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
- ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
+ ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
if (ret < 0) {
offset = ret;
goto out;
@@ -1177,6 +1195,9 @@ static long ceph_fallocate(struct file *file, int mode,
loff_t endoff = 0;
loff_t size;
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+
if (!S_ISREG(inode->i_mode))
return -EOPNOTSUPP;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 04c89c266cec..7b6139004401 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -766,7 +766,7 @@ static int fill_inode(struct inode *inode,
/* xattrs */
/* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
- if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
+ if ((ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
if (ci->i_xattrs.blob)
ceph_buffer_put(ci->i_xattrs.blob);
@@ -1813,10 +1813,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (ia_valid & ATTR_SIZE) {
dout("setattr %p size %lld -> %lld\n", inode,
inode->i_size, attr->ia_size);
- if (attr->ia_size > inode->i_sb->s_maxbytes) {
- err = -EINVAL;
- goto out;
- }
if ((issued & CEPH_CAP_FILE_EXCL) &&
attr->ia_size > inode->i_size) {
inode->i_size = attr->ia_size;
@@ -1896,8 +1892,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (mask & CEPH_SETATTR_SIZE)
__ceph_do_pending_vmtruncate(inode);
return err;
-out:
- spin_unlock(&ci->i_ceph_lock);
out_put:
ceph_mdsc_put_request(req);
return err;
@@ -1907,7 +1901,7 @@ out_put:
* Verify that we have a lease on the given mask. If not,
* do a getattr against an mds.
*/
-int ceph_do_getattr(struct inode *inode, int mask)
+int ceph_do_getattr(struct inode *inode, int mask, bool force)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
@@ -1920,7 +1914,7 @@ int ceph_do_getattr(struct inode *inode, int mask)
}
dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode);
- if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
+ if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
return 0;
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
@@ -1948,7 +1942,7 @@ int ceph_permission(struct inode *inode, int mask)
if (mask & MAY_NOT_BLOCK)
return -ECHILD;
- err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
+ err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false);
if (!err)
err = generic_permission(inode, mask);
@@ -1966,7 +1960,7 @@ int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct ceph_inode_info *ci = ceph_inode(inode);
int err;
- err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
+ err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL, false);
if (!err) {
generic_fillattr(inode, stat);
stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index a822a6e58290..f851d8d70158 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -19,7 +19,7 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
struct ceph_ioctl_layout l;
int err;
- err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT);
+ err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false);
if (!err) {
l.stripe_unit = ceph_file_layout_su(ci->i_layout);
l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
@@ -41,7 +41,7 @@ static long __validate_layout(struct ceph_mds_client *mdsc,
/* validate striping parameters */
if ((l->object_size & ~PAGE_MASK) ||
(l->stripe_unit & ~PAGE_MASK) ||
- (l->stripe_unit != 0 &&
+ ((unsigned)l->stripe_unit != 0 &&
((unsigned)l->object_size % (unsigned)l->stripe_unit)))
return -EINVAL;
@@ -74,7 +74,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
return -EFAULT;
/* validate changed params against current layout */
- err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT);
+ err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false);
if (err)
return err;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 92a2548278fc..a92d3f5c6c12 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -7,6 +7,7 @@
#include <linux/sched.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
+#include <linux/utsname.h>
#include "super.h"
#include "mds_client.h"
@@ -334,7 +335,7 @@ static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
/*
* sessions
*/
-static const char *session_state_name(int s)
+const char *ceph_session_state_name(int s)
{
switch (s) {
case CEPH_MDS_SESSION_NEW: return "new";
@@ -542,6 +543,8 @@ void ceph_mdsc_release_request(struct kref *kref)
}
kfree(req->r_path1);
kfree(req->r_path2);
+ if (req->r_pagelist)
+ ceph_pagelist_release(req->r_pagelist);
put_request_session(req);
ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
kfree(req);
@@ -812,6 +815,74 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
h = msg->front.iov_base;
h->op = cpu_to_le32(op);
h->seq = cpu_to_le64(seq);
+
+ return msg;
+}
+
+/*
+ * session message, specialization for CEPH_SESSION_REQUEST_OPEN
+ * to include additional client metadata fields.
+ */
+static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u64 seq)
+{
+ struct ceph_msg *msg;
+ struct ceph_mds_session_head *h;
+ int i = -1;
+ int metadata_bytes = 0;
+ int metadata_key_count = 0;
+ struct ceph_options *opt = mdsc->fsc->client->options;
+ void *p;
+
+ const char* metadata[3][2] = {
+ {"hostname", utsname()->nodename},
+ {"entity_id", opt->name ? opt->name : ""},
+ {NULL, NULL}
+ };
+
+ /* Calculate serialized length of metadata */
+ metadata_bytes = 4; /* map length */
+ for (i = 0; metadata[i][0] != NULL; ++i) {
+ metadata_bytes += 8 + strlen(metadata[i][0]) +
+ strlen(metadata[i][1]);
+ metadata_key_count++;
+ }
+
+ /* Allocate the message */
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + metadata_bytes,
+ GFP_NOFS, false);
+ if (!msg) {
+ pr_err("create_session_msg ENOMEM creating msg\n");
+ return NULL;
+ }
+ h = msg->front.iov_base;
+ h->op = cpu_to_le32(CEPH_SESSION_REQUEST_OPEN);
+ h->seq = cpu_to_le64(seq);
+
+ /*
+ * Serialize client metadata into waiting buffer space, using
+ * the format that userspace expects for map<string, string>
+ */
+ msg->hdr.version = 2; /* ClientSession messages with metadata are v2 */
+
+ /* The write pointer, following the session_head structure */
+ p = msg->front.iov_base + sizeof(*h);
+
+ /* Number of entries in the map */
+ ceph_encode_32(&p, metadata_key_count);
+
+ /* Two length-prefixed strings for each entry in the map */
+ for (i = 0; metadata[i][0] != NULL; ++i) {
+ size_t const key_len = strlen(metadata[i][0]);
+ size_t const val_len = strlen(metadata[i][1]);
+
+ ceph_encode_32(&p, key_len);
+ memcpy(p, metadata[i][0], key_len);
+ p += key_len;
+ ceph_encode_32(&p, val_len);
+ memcpy(p, metadata[i][1], val_len);
+ p += val_len;
+ }
+
return msg;
}
@@ -835,7 +906,7 @@ static int __open_session(struct ceph_mds_client *mdsc,
session->s_renew_requested = jiffies;
/* send connect message */
- msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
+ msg = create_session_open_msg(mdsc, session->s_seq);
if (!msg)
return -ENOMEM;
ceph_con_send(&session->s_con, msg);
@@ -1164,7 +1235,7 @@ static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
struct ceph_msg *msg;
dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
- session->s_mds, session_state_name(session->s_state), seq);
+ session->s_mds, ceph_session_state_name(session->s_state), seq);
msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
if (!msg)
return -ENOMEM;
@@ -1216,7 +1287,7 @@ static int request_close_session(struct ceph_mds_client *mdsc,
struct ceph_msg *msg;
dout("request_close_session mds%d state %s seq %lld\n",
- session->s_mds, session_state_name(session->s_state),
+ session->s_mds, ceph_session_state_name(session->s_state),
session->s_seq);
msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
if (!msg)
@@ -1847,13 +1918,15 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
- if (req->r_data_len) {
- /* outbound data set only by ceph_sync_setxattr() */
- BUG_ON(!req->r_pages);
- ceph_msg_data_add_pages(msg, req->r_pages, req->r_data_len, 0);
+ if (req->r_pagelist) {
+ struct ceph_pagelist *pagelist = req->r_pagelist;
+ atomic_inc(&pagelist->refcnt);
+ ceph_msg_data_add_pagelist(msg, pagelist);
+ msg->hdr.data_len = cpu_to_le32(pagelist->length);
+ } else {
+ msg->hdr.data_len = 0;
}
- msg->hdr.data_len = cpu_to_le32(req->r_data_len);
msg->hdr.data_off = cpu_to_le16(0);
out_free2:
@@ -1904,6 +1977,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
if (req->r_got_unsafe) {
+ void *p;
/*
* Replay. Do not regenerate message (and rebuild
* paths, etc.); just use the original message.
@@ -1924,8 +1998,13 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
/* remove cap/dentry releases from message */
rhead->num_releases = 0;
- msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset);
- msg->front.iov_len = req->r_request_release_offset;
+
+ /* time stamp */
+ p = msg->front.iov_base + req->r_request_release_offset;
+ ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp));
+
+ msg->front.iov_len = p - msg->front.iov_base;
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
return 0;
}
@@ -2001,7 +2080,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
req->r_session = get_session(session);
dout("do_request mds%d session %p state %s\n", mds, session,
- session_state_name(session->s_state));
+ ceph_session_state_name(session->s_state));
if (session->s_state != CEPH_MDS_SESSION_OPEN &&
session->s_state != CEPH_MDS_SESSION_HUNG) {
if (session->s_state == CEPH_MDS_SESSION_NEW ||
@@ -2061,16 +2140,18 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
static void kick_requests(struct ceph_mds_client *mdsc, int mds)
{
struct ceph_mds_request *req;
- struct rb_node *p;
+ struct rb_node *p = rb_first(&mdsc->request_tree);
dout("kick_requests mds%d\n", mds);
- for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) {
+ while (p) {
req = rb_entry(p, struct ceph_mds_request, r_node);
+ p = rb_next(p);
if (req->r_got_unsafe)
continue;
if (req->r_session &&
req->r_session->s_mds == mds) {
dout(" kicking tid %llu\n", req->r_tid);
+ list_del_init(&req->r_wait);
__do_request(mdsc, req);
}
}
@@ -2248,6 +2329,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
*/
if (result == -ESTALE) {
dout("got ESTALE on request %llu", req->r_tid);
+ req->r_resend_mds = -1;
if (req->r_direct_mode != USE_AUTH_MDS) {
dout("not using auth, setting for that now");
req->r_direct_mode = USE_AUTH_MDS;
@@ -2436,7 +2518,7 @@ static void handle_session(struct ceph_mds_session *session,
dout("handle_session mds%d %s %p state %s seq %llu\n",
mds, ceph_session_op_name(op), session,
- session_state_name(session->s_state), seq);
+ ceph_session_state_name(session->s_state), seq);
if (session->s_state == CEPH_MDS_SESSION_HUNG) {
session->s_state = CEPH_MDS_SESSION_OPEN;
@@ -2463,9 +2545,8 @@ static void handle_session(struct ceph_mds_session *session,
if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
pr_info("mds%d reconnect denied\n", session->s_mds);
remove_session_caps(session);
- wake = 1; /* for good measure */
+ wake = 2; /* for good measure */
wake_up_all(&mdsc->session_close_wq);
- kick_requests(mdsc, mds);
break;
case CEPH_SESSION_STALE:
@@ -2495,6 +2576,8 @@ static void handle_session(struct ceph_mds_session *session,
if (wake) {
mutex_lock(&mdsc->mutex);
__wake_requests(mdsc, &session->s_waiting);
+ if (wake == 2)
+ kick_requests(mdsc, mds);
mutex_unlock(&mdsc->mutex);
}
return;
@@ -2687,18 +2770,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
session->s_state = CEPH_MDS_SESSION_RECONNECTING;
session->s_seq = 0;
- ceph_con_close(&session->s_con);
- ceph_con_open(&session->s_con,
- CEPH_ENTITY_TYPE_MDS, mds,
- ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
-
- /* replay unsafe requests */
- replay_unsafe_requests(mdsc, session);
-
- down_read(&mdsc->snap_rwsem);
-
dout("session %p state %s\n", session,
- session_state_name(session->s_state));
+ ceph_session_state_name(session->s_state));
spin_lock(&session->s_gen_ttl_lock);
session->s_cap_gen++;
@@ -2715,6 +2788,19 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
discard_cap_releases(mdsc, session);
spin_unlock(&session->s_cap_lock);
+ /* trim unused caps to reduce MDS's cache rejoin time */
+ shrink_dcache_parent(mdsc->fsc->sb->s_root);
+
+ ceph_con_close(&session->s_con);
+ ceph_con_open(&session->s_con,
+ CEPH_ENTITY_TYPE_MDS, mds,
+ ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
+
+ /* replay unsafe requests */
+ replay_unsafe_requests(mdsc, session);
+
+ down_read(&mdsc->snap_rwsem);
+
/* traverse this session's caps */
s_nr_caps = session->s_nr_caps;
err = ceph_pagelist_encode_32(pagelist, s_nr_caps);
@@ -2783,7 +2869,6 @@ fail:
mutex_unlock(&session->s_mutex);
fail_nomsg:
ceph_pagelist_release(pagelist);
- kfree(pagelist);
fail_nopagelist:
pr_err("error %d preparing reconnect for mds%d\n", err, mds);
return;
@@ -2819,7 +2904,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
ceph_mds_state_name(newstate),
ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
- session_state_name(s->s_state));
+ ceph_session_state_name(s->s_state));
if (i >= newmap->m_max_mds ||
memcmp(ceph_mdsmap_get_addr(oldmap, i),
@@ -2931,14 +3016,15 @@ static void handle_lease(struct ceph_mds_client *mdsc,
if (dname.len != get_unaligned_le32(h+1))
goto bad;
- mutex_lock(&session->s_mutex);
- session->s_seq++;
-
/* lookup inode */
inode = ceph_find_inode(sb, vino);
dout("handle_lease %s, ino %llx %p %.*s\n",
ceph_lease_op_name(h->action), vino.ino, inode,
dname.len, dname.name);
+
+ mutex_lock(&session->s_mutex);
+ session->s_seq++;
+
if (inode == NULL) {
dout("handle_lease no inode %llx\n", vino.ino);
goto release;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index e00737cf523c..3288359353e9 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -202,9 +202,7 @@ struct ceph_mds_request {
bool r_direct_is_hash; /* true if r_direct_hash is valid */
/* data payload is used for xattr ops */
- struct page **r_pages;
- int r_num_pages;
- int r_data_len;
+ struct ceph_pagelist *r_pagelist;
/* what caps shall we drop? */
int r_inode_drop, r_inode_unless;
@@ -332,6 +330,8 @@ ceph_get_mds_session(struct ceph_mds_session *s)
return s;
}
+extern const char *ceph_session_state_name(int s);
+
extern void ceph_put_mds_session(struct ceph_mds_session *s);
extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 06150fd745ac..f6e12377335c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -755,7 +755,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
goto out;
}
} else {
- root = d_obtain_alias(inode);
+ root = d_obtain_root(inode);
}
ceph_init_dentry(root);
dout("open_root_inode success, root dentry is %p\n", root);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 12b20744e386..b82f507979b8 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -95,6 +95,7 @@ struct ceph_fs_client {
struct dentry *debugfs_congestion_kb;
struct dentry *debugfs_bdi;
struct dentry *debugfs_mdsc, *debugfs_mdsmap;
+ struct dentry *debugfs_mds_sessions;
#endif
#ifdef CONFIG_CEPH_FSCACHE
@@ -714,7 +715,7 @@ extern void ceph_queue_vmtruncate(struct inode *inode);
extern void ceph_queue_invalidate(struct inode *inode);
extern void ceph_queue_writeback(struct inode *inode);
-extern int ceph_do_getattr(struct inode *inode, int mask);
+extern int ceph_do_getattr(struct inode *inode, int mask, bool force);
extern int ceph_permission(struct inode *inode, int mask);
extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
@@ -733,15 +734,23 @@ extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
extern void __init ceph_xattr_init(void);
extern void ceph_xattr_exit(void);
+extern const struct xattr_handler *ceph_xattr_handlers[];
/* acl.c */
-extern const struct xattr_handler *ceph_xattr_handlers[];
+struct ceph_acls_info {
+ void *default_acl;
+ void *acl;
+ struct ceph_pagelist *pagelist;
+};
#ifdef CONFIG_CEPH_FS_POSIX_ACL
struct posix_acl *ceph_get_acl(struct inode *, int);
int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-int ceph_init_acl(struct dentry *, struct inode *, struct inode *);
+int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
+ struct ceph_acls_info *info);
+void ceph_init_inode_acls(struct inode *inode, struct ceph_acls_info *info);
+void ceph_release_acls_info(struct ceph_acls_info *info);
static inline void ceph_forget_all_cached_acls(struct inode *inode)
{
@@ -753,12 +762,18 @@ static inline void ceph_forget_all_cached_acls(struct inode *inode)
#define ceph_get_acl NULL
#define ceph_set_acl NULL
-static inline int ceph_init_acl(struct dentry *dentry, struct inode *inode,
- struct inode *dir)
+static inline int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
+ struct ceph_acls_info *info)
{
return 0;
}
-
+static inline void ceph_init_inode_acls(struct inode *inode,
+ struct ceph_acls_info *info)
+{
+}
+static inline void ceph_release_acls_info(struct ceph_acls_info *info)
+{
+}
static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode)
{
return 0;
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index c9c2b887381e..678b0d2bbbc4 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1,4 +1,5 @@
#include <linux/ceph/ceph_debug.h>
+#include <linux/ceph/pagelist.h>
#include "super.h"
#include "mds_client.h"
@@ -284,8 +285,7 @@ static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs)
return ceph_dir_vxattrs_name_size;
if (vxattrs == ceph_file_vxattrs)
return ceph_file_vxattrs_name_size;
- BUG();
-
+ BUG_ON(vxattrs);
return 0;
}
@@ -592,12 +592,12 @@ start:
xattr_version = ci->i_xattrs.version;
spin_unlock(&ci->i_ceph_lock);
- xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *),
+ xattrs = kcalloc(numattr, sizeof(struct ceph_inode_xattr *),
GFP_NOFS);
err = -ENOMEM;
if (!xattrs)
goto bad_lock;
- memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *));
+
for (i = 0; i < numattr; i++) {
xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
GFP_NOFS);
@@ -736,24 +736,20 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
ci->i_xattrs.version, ci->i_xattrs.index_version);
- if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
- (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
- goto get_xattr;
- } else {
+ if (ci->i_xattrs.version == 0 ||
+ !__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1)) {
spin_unlock(&ci->i_ceph_lock);
/* get xattrs from mds (if we don't already have them) */
- err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
+ err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true);
if (err)
return err;
+ spin_lock(&ci->i_ceph_lock);
}
- spin_lock(&ci->i_ceph_lock);
-
err = __build_xattrs(inode);
if (err < 0)
goto out;
-get_xattr:
err = -ENODATA; /* == ENOATTR */
xattr = __get_xattr(ci, name);
if (!xattr)
@@ -798,23 +794,18 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
ci->i_xattrs.version, ci->i_xattrs.index_version);
- if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
- (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
- goto list_xattr;
- } else {
+ if (ci->i_xattrs.version == 0 ||
+ !__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1)) {
spin_unlock(&ci->i_ceph_lock);
- err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
+ err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true);
if (err)
return err;
+ spin_lock(&ci->i_ceph_lock);
}
- spin_lock(&ci->i_ceph_lock);
-
err = __build_xattrs(inode);
if (err < 0)
goto out;
-
-list_xattr:
/*
* Start with virtual dir xattr names (if any) (including
* terminating '\0' characters for each).
@@ -860,35 +851,25 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_pagelist *pagelist = NULL;
int err;
- int i, nr_pages;
- struct page **pages = NULL;
- void *kaddr;
-
- /* copy value into some pages */
- nr_pages = calc_pages_for(0, size);
- if (nr_pages) {
- pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS);
- if (!pages)
+
+ if (value) {
+ /* copy value into pagelist */
+ pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
+ if (!pagelist)
return -ENOMEM;
- err = -ENOMEM;
- for (i = 0; i < nr_pages; i++) {
- pages[i] = __page_cache_alloc(GFP_NOFS);
- if (!pages[i]) {
- nr_pages = i;
- goto out;
- }
- kaddr = kmap(pages[i]);
- memcpy(kaddr, value + i*PAGE_CACHE_SIZE,
- min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE));
- }
+
+ ceph_pagelist_init(pagelist);
+ err = ceph_pagelist_append(pagelist, value, size);
+ if (err)
+ goto out;
+ } else {
+ flags |= CEPH_XATTR_REMOVE;
}
dout("setxattr value=%.*s\n", (int)size, value);
- if (!value)
- flags |= CEPH_XATTR_REMOVE;
-
/* do request */
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
USE_AUTH_MDS);
@@ -903,9 +884,8 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
req->r_args.setxattr.flags = cpu_to_le32(flags);
req->r_path2 = kstrdup(name, GFP_NOFS);
- req->r_pages = pages;
- req->r_num_pages = nr_pages;
- req->r_data_len = size;
+ req->r_pagelist = pagelist;
+ pagelist = NULL;
dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
err = ceph_mdsc_do_request(mdsc, NULL, req);
@@ -913,11 +893,8 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
out:
- if (pages) {
- for (i = 0; i < nr_pages; i++)
- __free_page(pages[i]);
- kfree(pages);
- }
+ if (pagelist)
+ ceph_pagelist_release(pagelist);
return err;
}
@@ -968,7 +945,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
retry:
issued = __ceph_caps_issued(ci, NULL);
dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
- if (!(issued & CEPH_CAP_XATTR_EXCL))
+ if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
goto do_sync;
__build_xattrs(inode);
@@ -1077,7 +1054,7 @@ retry:
issued = __ceph_caps_issued(ci, NULL);
dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
- if (!(issued & CEPH_CAP_XATTR_EXCL))
+ if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
goto do_sync;
__build_xattrs(inode);
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 603f18a65c12..a2172f3f69e3 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -22,6 +22,11 @@ config CIFS
support for OS/2 and Windows ME and similar servers is provided as
well.
+ The module also provides optional support for the followon
+ protocols for CIFS including SMB3, which enables
+ useful performance and security features (see the description
+ of CONFIG_CIFS_SMB2).
+
The cifs module provides an advanced network file system
client for mounting to CIFS compliant servers. It includes
support for DFS (hierarchical name space), secure per-user
@@ -121,7 +126,8 @@ config CIFS_ACL
depends on CIFS_XATTR && KEYS
help
Allows fetching CIFS/NTFS ACL from the server. The DACL blob
- is handed over to the application/caller.
+ is handed over to the application/caller. See the man
+ page for getcifsacl for more information.
config CIFS_DEBUG
bool "Enable CIFS debugging routines"
@@ -162,7 +168,7 @@ config CIFS_NFSD_EXPORT
Allows NFS server to export a CIFS mounted share (nfsd over cifs)
config CIFS_SMB2
- bool "SMB2 network file system support"
+ bool "SMB2 and SMB3 network file system support"
depends on CIFS && INET
select NLS
select KEYS
@@ -170,16 +176,21 @@ config CIFS_SMB2
select DNS_RESOLVER
help
- This enables experimental support for the SMB2 (Server Message Block
- version 2) protocol. The SMB2 protocol is the successor to the
- popular CIFS and SMB network file sharing protocols. SMB2 is the
- native file sharing mechanism for recent versions of Windows
- operating systems (since Vista). SMB2 enablement will eventually
- allow users better performance, security and features, than would be
- possible with cifs. Note that smb2 mount options also are simpler
- (compared to cifs) due to protocol improvements.
-
- Unless you are a developer or tester, say N.
+ This enables support for the Server Message Block version 2
+ family of protocols, including SMB3. SMB3 support is
+ enabled on mount by specifying "vers=3.0" in the mount
+ options. These protocols are the successors to the popular
+ CIFS and SMB network file sharing protocols. SMB3 is the
+ native file sharing mechanism for the more recent
+ versions of Windows (Windows 8 and Windows 2012 and
+ later) and Samba server and many others support SMB3 well.
+ In general SMB3 enables better performance, security
+ and features, than would be possible with CIFS (Note that
+ when mounting to Samba, due to the CIFS POSIX extensions,
+ CIFS mounts can provide slightly better POSIX compatibility
+ than SMB3 mounts do though). Note that SMB2/SMB3 mount
+ options are also slightly simpler (compared to CIFS) due
+ to protocol improvements.
config CIFS_FSCACHE
bool "Provide CIFS client caching support"
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index f3ac4154cbb6..44ec72684df5 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -213,7 +213,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
tcon->nativeFileSystem);
}
seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x"
- "\n\tPathComponentMax: %d Status: 0x%d",
+ "\n\tPathComponentMax: %d Status: %d",
le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics),
le32_to_cpu(tcon->fsAttrInfo.Attributes),
le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength),
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 58df174deb10..b8602f199815 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -195,15 +195,15 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
else
noff = tkn_e - (sb_mountdata + off) + 1;
- if (strnicmp(sb_mountdata + off, "unc=", 4) == 0) {
+ if (strncasecmp(sb_mountdata + off, "unc=", 4) == 0) {
off += noff;
continue;
}
- if (strnicmp(sb_mountdata + off, "ip=", 3) == 0) {
+ if (strncasecmp(sb_mountdata + off, "ip=", 3) == 0) {
off += noff;
continue;
}
- if (strnicmp(sb_mountdata + off, "prefixpath=", 11) == 0) {
+ if (strncasecmp(sb_mountdata + off, "prefixpath=", 11) == 0) {
off += noff;
continue;
}
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 9409fa10bd5c..3182273a3407 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -45,6 +45,7 @@
#define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */
#define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */
#define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */
+#define CIFS_MOUNT_MAP_SFM_CHR 0x800000 /* SFM/MAC mapping for illegal chars */
struct cifs_sb_info {
struct rb_root tlink_tree;
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index a3e932547617..f4cf200b3c76 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -62,7 +62,6 @@ cifs_spnego_key_destroy(struct key *key)
struct key_type cifs_spnego_key_type = {
.name = "cifs.spnego",
.instantiate = cifs_spnego_key_instantiate,
- .match = user_match,
.destroy = cifs_spnego_key_destroy,
.describe = user_describe,
};
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 15e9505aa35f..0303c6793d90 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -20,6 +20,7 @@
*/
#include <linux/fs.h>
#include <linux/slab.h>
+#include "cifs_fs_sb.h"
#include "cifs_unicode.h"
#include "cifs_uniupr.h"
#include "cifspdu.h"
@@ -61,26 +62,24 @@ cifs_utf16_bytes(const __le16 *from, int maxbytes,
return outlen;
}
-/*
- * cifs_mapchar - convert a host-endian char to proper char in codepage
- * @target - where converted character should be copied
- * @src_char - 2 byte host-endian source character
- * @cp - codepage to which character should be converted
- * @mapchar - should character be mapped according to mapchars mount option?
- *
- * This function handles the conversion of a single character. It is the
- * responsibility of the caller to ensure that the target buffer is large
- * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
- */
-static int
-cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
- bool mapchar)
+int cifs_remap(struct cifs_sb_info *cifs_sb)
{
- int len = 1;
+ int map_type;
+
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR)
+ map_type = SFM_MAP_UNI_RSVD;
+ else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
+ map_type = SFU_MAP_UNI_RSVD;
+ else
+ map_type = NO_MAP_UNI_RSVD;
- if (!mapchar)
- goto cp_convert;
+ return map_type;
+}
+/* Convert character using the SFU - "Services for Unix" remapping range */
+static bool
+convert_sfu_char(const __u16 src_char, char *target)
+{
/*
* BB: Cannot handle remapping UNI_SLASH until all the calls to
* build_path_from_dentry are modified, as they use slash as
@@ -106,19 +105,74 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
*target = '<';
break;
default:
- goto cp_convert;
+ return false;
}
+ return true;
+}
+
+/* Convert character using the SFM - "Services for Mac" remapping range */
+static bool
+convert_sfm_char(const __u16 src_char, char *target)
+{
+ switch (src_char) {
+ case SFM_COLON:
+ *target = ':';
+ break;
+ case SFM_ASTERISK:
+ *target = '*';
+ break;
+ case SFM_QUESTION:
+ *target = '?';
+ break;
+ case SFM_PIPE:
+ *target = '|';
+ break;
+ case SFM_GRTRTHAN:
+ *target = '>';
+ break;
+ case SFM_LESSTHAN:
+ *target = '<';
+ break;
+ case SFM_SLASH:
+ *target = '\\';
+ break;
+ default:
+ return false;
+ }
+ return true;
+}
-out:
- return len;
-cp_convert:
+/*
+ * cifs_mapchar - convert a host-endian char to proper char in codepage
+ * @target - where converted character should be copied
+ * @src_char - 2 byte host-endian source character
+ * @cp - codepage to which character should be converted
+ * @map_type - How should the 7 NTFS/SMB reserved characters be mapped to UCS2?
+ *
+ * This function handles the conversion of a single character. It is the
+ * responsibility of the caller to ensure that the target buffer is large
+ * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
+ */
+static int
+cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
+ int maptype)
+{
+ int len = 1;
+
+ if ((maptype == SFM_MAP_UNI_RSVD) && convert_sfm_char(src_char, target))
+ return len;
+ else if ((maptype == SFU_MAP_UNI_RSVD) &&
+ convert_sfu_char(src_char, target))
+ return len;
+
+ /* if character not one of seven in special remap set */
len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
if (len <= 0) {
*target = '?';
len = 1;
}
- goto out;
+ return len;
}
/*
@@ -145,7 +199,7 @@ cp_convert:
*/
int
cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
- const struct nls_table *codepage, bool mapchar)
+ const struct nls_table *codepage, int map_type)
{
int i, charlen, safelen;
int outlen = 0;
@@ -172,13 +226,13 @@ cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
* conversion bleed into the null terminator
*/
if (outlen >= safelen) {
- charlen = cifs_mapchar(tmp, ftmp, codepage, mapchar);
+ charlen = cifs_mapchar(tmp, ftmp, codepage, map_type);
if ((outlen + charlen) > (tolen - nullsize))
break;
}
/* put converted char into 'to' buffer */
- charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar);
+ charlen = cifs_mapchar(&to[outlen], ftmp, codepage, map_type);
outlen += charlen;
}
@@ -267,7 +321,7 @@ cifs_strndup_from_utf16(const char *src, const int maxlen,
if (!dst)
return NULL;
cifs_from_utf16(dst, (__le16 *) src, len, maxlen, codepage,
- false);
+ NO_MAP_UNI_RSVD);
} else {
len = strnlen(src, maxlen);
len++;
@@ -280,6 +334,66 @@ cifs_strndup_from_utf16(const char *src, const int maxlen,
return dst;
}
+static __le16 convert_to_sfu_char(char src_char)
+{
+ __le16 dest_char;
+
+ switch (src_char) {
+ case ':':
+ dest_char = cpu_to_le16(UNI_COLON);
+ break;
+ case '*':
+ dest_char = cpu_to_le16(UNI_ASTERISK);
+ break;
+ case '?':
+ dest_char = cpu_to_le16(UNI_QUESTION);
+ break;
+ case '<':
+ dest_char = cpu_to_le16(UNI_LESSTHAN);
+ break;
+ case '>':
+ dest_char = cpu_to_le16(UNI_GRTRTHAN);
+ break;
+ case '|':
+ dest_char = cpu_to_le16(UNI_PIPE);
+ break;
+ default:
+ dest_char = 0;
+ }
+
+ return dest_char;
+}
+
+static __le16 convert_to_sfm_char(char src_char)
+{
+ __le16 dest_char;
+
+ switch (src_char) {
+ case ':':
+ dest_char = cpu_to_le16(SFM_COLON);
+ break;
+ case '*':
+ dest_char = cpu_to_le16(SFM_ASTERISK);
+ break;
+ case '?':
+ dest_char = cpu_to_le16(SFM_QUESTION);
+ break;
+ case '<':
+ dest_char = cpu_to_le16(SFM_LESSTHAN);
+ break;
+ case '>':
+ dest_char = cpu_to_le16(SFM_GRTRTHAN);
+ break;
+ case '|':
+ dest_char = cpu_to_le16(SFM_PIPE);
+ break;
+ default:
+ dest_char = 0;
+ }
+
+ return dest_char;
+}
+
/*
* Convert 16 bit Unicode pathname to wire format from string in current code
* page. Conversion may involve remapping up the six characters that are
@@ -288,7 +402,7 @@ cifs_strndup_from_utf16(const char *src, const int maxlen,
*/
int
cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
- const struct nls_table *cp, int mapChars)
+ const struct nls_table *cp, int map_chars)
{
int i, charlen;
int j = 0;
@@ -296,39 +410,30 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
__le16 dst_char;
wchar_t tmp;
- if (!mapChars)
+ if (map_chars == NO_MAP_UNI_RSVD)
return cifs_strtoUTF16(target, source, PATH_MAX, cp);
for (i = 0; i < srclen; j++) {
src_char = source[i];
charlen = 1;
- switch (src_char) {
- case 0:
+
+ /* check if end of string */
+ if (src_char == 0)
goto ctoUTF16_out;
- case ':':
- dst_char = cpu_to_le16(UNI_COLON);
- break;
- case '*':
- dst_char = cpu_to_le16(UNI_ASTERISK);
- break;
- case '?':
- dst_char = cpu_to_le16(UNI_QUESTION);
- break;
- case '<':
- dst_char = cpu_to_le16(UNI_LESSTHAN);
- break;
- case '>':
- dst_char = cpu_to_le16(UNI_GRTRTHAN);
- break;
- case '|':
- dst_char = cpu_to_le16(UNI_PIPE);
- break;
+
+ /* see if we must remap this char */
+ if (map_chars == SFU_MAP_UNI_RSVD)
+ dst_char = convert_to_sfu_char(src_char);
+ else if (map_chars == SFM_MAP_UNI_RSVD)
+ dst_char = convert_to_sfm_char(src_char);
+ else
+ dst_char = 0;
/*
* FIXME: We can not handle remapping backslash (UNI_SLASH)
* until all the calls to build_path_from_dentry are modified,
* as they use backslash as separator.
*/
- default:
+ if (dst_char == 0) {
charlen = cp->char2uni(source + i, srclen - i, &tmp);
dst_char = cpu_to_le16(tmp);
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index d8eac3b6cefb..bdc52cb9a676 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -52,6 +52,34 @@
#define UNI_PIPE (__u16) ('|' + 0xF000)
#define UNI_SLASH (__u16) ('\\' + 0xF000)
+/*
+ * Macs use an older "SFM" mapping of the symbols above. Fortunately it does
+ * not conflict (although almost does) with the mapping above.
+ */
+
+#define SFM_ASTERISK ((__u16) 0xF021)
+#define SFM_QUESTION ((__u16) 0xF025)
+#define SFM_COLON ((__u16) 0xF022)
+#define SFM_GRTRTHAN ((__u16) 0xF024)
+#define SFM_LESSTHAN ((__u16) 0xF023)
+#define SFM_PIPE ((__u16) 0xF027)
+#define SFM_SLASH ((__u16) 0xF026)
+
+/*
+ * Mapping mechanism to use when one of the seven reserved characters is
+ * encountered. We can only map using one of the mechanisms at a time
+ * since otherwise readdir could return directory entries which we would
+ * not be able to open
+ *
+ * NO_MAP_UNI_RSVD = do not perform any remapping of the character
+ * SFM_MAP_UNI_RSVD = map reserved characters using SFM scheme (MAC compatible)
+ * SFU_MAP_UNI_RSVD = map reserved characters ala SFU ("mapchars" option)
+ *
+ */
+#define NO_MAP_UNI_RSVD 0
+#define SFM_MAP_UNI_RSVD 1
+#define SFU_MAP_UNI_RSVD 2
+
/* Just define what we want from uniupr.h. We don't want to define the tables
* in each source file.
*/
@@ -75,7 +103,7 @@ extern const struct UniCaseRange CifsUniLowerRange[];
#ifdef __KERNEL__
int cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
- const struct nls_table *codepage, bool mapchar);
+ const struct nls_table *cp, int map_type);
int cifs_utf16_bytes(const __le16 *from, int maxbytes,
const struct nls_table *codepage);
int cifs_strtoUTF16(__le16 *, const char *, int, const struct nls_table *);
@@ -84,6 +112,7 @@ char *cifs_strndup_from_utf16(const char *src, const int maxlen,
const struct nls_table *codepage);
extern int cifsConvertToUTF16(__le16 *target, const char *source, int maxlen,
const struct nls_table *cp, int mapChars);
+extern int cifs_remap(struct cifs_sb_info *cifs_sb);
#ifdef CONFIG_CIFS_SMB2
extern __le16 *cifs_strndup_to_utf16(const char *src, const int maxlen,
int *utf16_len, const struct nls_table *cp,
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 7ff866dbb89e..6d00c419cbae 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -84,7 +84,6 @@ static struct key_type cifs_idmap_key_type = {
.instantiate = cifs_idmap_key_instantiate,
.destroy = cifs_idmap_key_destroy,
.describe = user_describe,
- .match = user_match,
};
static char *
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 4934347321d3..4ac7445e6ec7 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -431,7 +431,7 @@ find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp)
return -ENOMEM;
cifs_from_utf16(ses->domainName,
(__le16 *)blobptr, attrsize, attrsize,
- nls_cp, false);
+ nls_cp, NO_MAP_UNI_RSVD);
break;
}
}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 888398067420..9d7996e8e793 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -207,6 +207,19 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
+static long cifs_fallocate(struct file *file, int mode, loff_t off, loff_t len)
+{
+ struct super_block *sb = file->f_path.dentry->d_sb;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+ struct TCP_Server_Info *server = tcon->ses->server;
+
+ if (server->ops->fallocate)
+ return server->ops->fallocate(file, tcon, mode, off, len);
+
+ return -EOPNOTSUPP;
+}
+
static int cifs_permission(struct inode *inode, int mask)
{
struct cifs_sb_info *cifs_sb;
@@ -800,7 +813,8 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
return generic_file_llseek(file, offset, whence);
}
-static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
+static int
+cifs_setlease(struct file *file, long arg, struct file_lock **lease, void **priv)
{
/*
* Note that this is called by vfs setlease with i_lock held to
@@ -812,10 +826,11 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
if (!(S_ISREG(inode->i_mode)))
return -EINVAL;
- /* check if file is oplocked */
- if (((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) ||
+ /* Check if file is oplocked if this is request for new lease */
+ if (arg == F_UNLCK ||
+ ((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) ||
((arg == F_WRLCK) && CIFS_CACHE_WRITE(CIFS_I(inode))))
- return generic_setlease(file, arg, lease);
+ return generic_setlease(file, arg, lease, priv);
else if (tlink_tcon(cfile->tlink)->local_lease &&
!CIFS_CACHE_READ(CIFS_I(inode)))
/*
@@ -826,7 +841,7 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
* knows that the file won't be changed on the server by anyone
* else.
*/
- return generic_setlease(file, arg, lease);
+ return generic_setlease(file, arg, lease, priv);
else
return -EAGAIN;
}
@@ -848,7 +863,7 @@ const struct inode_operations cifs_dir_inode_ops = {
.link = cifs_hardlink,
.mkdir = cifs_mkdir,
.rmdir = cifs_rmdir,
- .rename = cifs_rename,
+ .rename2 = cifs_rename2,
.permission = cifs_permission,
/* revalidate:cifs_revalidate, */
.setattr = cifs_setattr,
@@ -908,6 +923,7 @@ const struct file_operations cifs_file_ops = {
.unlocked_ioctl = cifs_ioctl,
#endif /* CONFIG_CIFS_POSIX */
.setlease = cifs_setlease,
+ .fallocate = cifs_fallocate,
};
const struct file_operations cifs_file_strict_ops = {
@@ -927,6 +943,7 @@ const struct file_operations cifs_file_strict_ops = {
.unlocked_ioctl = cifs_ioctl,
#endif /* CONFIG_CIFS_POSIX */
.setlease = cifs_setlease,
+ .fallocate = cifs_fallocate,
};
const struct file_operations cifs_file_direct_ops = {
@@ -947,6 +964,7 @@ const struct file_operations cifs_file_direct_ops = {
#endif /* CONFIG_CIFS_POSIX */
.llseek = cifs_llseek,
.setlease = cifs_setlease,
+ .fallocate = cifs_fallocate,
};
const struct file_operations cifs_file_nobrl_ops = {
@@ -965,6 +983,7 @@ const struct file_operations cifs_file_nobrl_ops = {
.unlocked_ioctl = cifs_ioctl,
#endif /* CONFIG_CIFS_POSIX */
.setlease = cifs_setlease,
+ .fallocate = cifs_fallocate,
};
const struct file_operations cifs_file_strict_nobrl_ops = {
@@ -983,6 +1002,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
.unlocked_ioctl = cifs_ioctl,
#endif /* CONFIG_CIFS_POSIX */
.setlease = cifs_setlease,
+ .fallocate = cifs_fallocate,
};
const struct file_operations cifs_file_direct_nobrl_ops = {
@@ -1002,6 +1022,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
#endif /* CONFIG_CIFS_POSIX */
.llseek = cifs_llseek,
.setlease = cifs_setlease,
+ .fallocate = cifs_fallocate,
};
const struct file_operations cifs_dir_ops = {
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 70f178a7c759..002e0c173939 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -68,8 +68,8 @@ extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *);
extern int cifs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
extern int cifs_mkdir(struct inode *, struct dentry *, umode_t);
extern int cifs_rmdir(struct inode *, struct dentry *);
-extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
- struct dentry *);
+extern int cifs_rename2(struct inode *, struct dentry *, struct inode *,
+ struct dentry *, unsigned int);
extern int cifs_revalidate_file_attr(struct file *filp);
extern int cifs_revalidate_dentry_attr(struct dentry *);
extern int cifs_revalidate_file(struct file *filp);
@@ -136,5 +136,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.03"
+#define CIFS_VERSION "2.05"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index de6aed8c78e5..02a33e529904 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -70,11 +70,6 @@
#define SERVER_NAME_LENGTH 40
#define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1)
-/* used to define string lengths for reversing unicode strings */
-/* (256+1)*2 = 514 */
-/* (max path length + 1 for null) * 2 for unicode */
-#define MAX_NAME 514
-
/* SMB echo "timeout" -- FIXME: tunable? */
#define SMB_ECHO_INTERVAL (60 * HZ)
@@ -328,11 +323,11 @@ struct smb_version_operations {
int (*async_writev)(struct cifs_writedata *,
void (*release)(struct kref *));
/* sync read from the server */
- int (*sync_read)(const unsigned int, struct cifsFileInfo *,
+ int (*sync_read)(const unsigned int, struct cifs_fid *,
struct cifs_io_parms *, unsigned int *, char **,
int *);
/* sync write to the server */
- int (*sync_write)(const unsigned int, struct cifsFileInfo *,
+ int (*sync_write)(const unsigned int, struct cifs_fid *,
struct cifs_io_parms *, unsigned int *, struct kvec *,
unsigned long);
/* open dir, start readdir */
@@ -404,6 +399,15 @@ struct smb_version_operations {
const struct cifs_fid *, u32 *);
int (*set_acl)(struct cifs_ntsd *, __u32, struct inode *, const char *,
int);
+ /* writepages retry size */
+ unsigned int (*wp_retry_size)(struct inode *);
+ /* get mtu credits */
+ int (*wait_mtu_credits)(struct TCP_Server_Info *, unsigned int,
+ unsigned int *, unsigned int *);
+ /* check if we need to issue closedir */
+ bool (*dir_needs_close)(struct cifsFileInfo *);
+ long (*fallocate)(struct file *, struct cifs_tcon *, int, loff_t,
+ loff_t);
};
struct smb_version_values {
@@ -462,6 +466,7 @@ struct smb_vol {
bool direct_io:1;
bool strict_io:1; /* strict cache behavior */
bool remap:1; /* set to remap seven reserved chars in filenames */
+ bool sfu_remap:1; /* remap seven reserved chars ala SFU */
bool posix_paths:1; /* unset to not ask for posix pathnames. */
bool no_linux_ext:1;
bool sfu_emul:1;
@@ -495,6 +500,7 @@ struct smb_vol {
#define CIFS_MOUNT_MASK (CIFS_MOUNT_NO_PERM | CIFS_MOUNT_SET_UID | \
CIFS_MOUNT_SERVER_INUM | CIFS_MOUNT_DIRECT_IO | \
CIFS_MOUNT_NO_XATTR | CIFS_MOUNT_MAP_SPECIAL_CHR | \
+ CIFS_MOUNT_MAP_SFM_CHR | \
CIFS_MOUNT_UNX_EMUL | CIFS_MOUNT_NO_BRL | \
CIFS_MOUNT_CIFS_ACL | CIFS_MOUNT_OVERR_UID | \
CIFS_MOUNT_OVERR_GID | CIFS_MOUNT_DYNPERM | \
@@ -640,6 +646,16 @@ add_credits(struct TCP_Server_Info *server, const unsigned int add,
}
static inline void
+add_credits_and_wake_if(struct TCP_Server_Info *server, const unsigned int add,
+ const int optype)
+{
+ if (add) {
+ server->ops->add_credits(server, add, optype);
+ wake_up(&server->request_q);
+ }
+}
+
+static inline void
set_credits(struct TCP_Server_Info *server, const int val)
{
server->ops->set_credits(server, val);
@@ -868,6 +884,7 @@ struct cifs_tcon {
for this mount even if server would support */
bool local_lease:1; /* check leases (only) on local system not remote */
bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
+ bool broken_sparse_sup; /* if server or share does not support sparse */
bool need_reconnect:1; /* connection reset, tid now invalid */
#ifdef CONFIG_CIFS_SMB2
bool print:1; /* set if connection to printer share */
@@ -1044,6 +1061,7 @@ struct cifs_readdata {
struct address_space *mapping;
__u64 offset;
unsigned int bytes;
+ unsigned int got_bytes;
pid_t pid;
int result;
struct work_struct work;
@@ -1053,6 +1071,7 @@ struct cifs_readdata {
struct kvec iov;
unsigned int pagesz;
unsigned int tailsz;
+ unsigned int credits;
unsigned int nr_pages;
struct page *pages[];
};
@@ -1073,6 +1092,7 @@ struct cifs_writedata {
int result;
unsigned int pagesz;
unsigned int tailsz;
+ unsigned int credits;
unsigned int nr_pages;
struct page *pages[];
};
@@ -1398,6 +1418,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
#define CIFS_OBREAK_OP 0x0100 /* oplock break request */
#define CIFS_NEG_OP 0x0200 /* negotiate request */
#define CIFS_OP_MASK 0x0380 /* mask request type */
+#define CIFS_HAS_CREDITS 0x0400 /* already has credits */
/* Security Flags: indicate type of session setup needed */
#define CIFSSEC_MAY_SIGN 0x00001
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 33df36ef9d52..5f9822ac0245 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -2253,6 +2253,29 @@ typedef struct {
/* minimum includes first three fields, and empty FS Name */
#define MIN_FS_ATTR_INFO_SIZE 12
+
+/* List of FileSystemAttributes - see 2.5.1 of MS-FSCC */
+#define FILE_SUPPORT_INTEGRITY_STREAMS 0x04000000
+#define FILE_SUPPORTS_USN_JOURNAL 0x02000000
+#define FILE_SUPPORTS_OPEN_BY_FILE_ID 0x01000000
+#define FILE_SUPPORTS_EXTENDED_ATTRIBUTES 0x00800000
+#define FILE_SUPPORTS_HARD_LINKS 0x00400000
+#define FILE_SUPPORTS_TRANSACTIONS 0x00200000
+#define FILE_SEQUENTIAL_WRITE_ONCE 0x00100000
+#define FILE_READ_ONLY_VOLUME 0x00080000
+#define FILE_NAMED_STREAMS 0x00040000
+#define FILE_SUPPORTS_ENCRYPTION 0x00020000
+#define FILE_SUPPORTS_OBJECT_IDS 0x00010000
+#define FILE_VOLUME_IS_COMPRESSED 0x00008000
+#define FILE_SUPPORTS_REMOTE_STORAGE 0x00000100
+#define FILE_SUPPORTS_REPARSE_POINTS 0x00000080
+#define FILE_SUPPORTS_SPARSE_FILES 0x00000040
+#define FILE_VOLUME_QUOTAS 0x00000020
+#define FILE_FILE_COMPRESSION 0x00000010
+#define FILE_PERSISTENT_ACLS 0x00000008
+#define FILE_UNICODE_ON_DISK 0x00000004
+#define FILE_CASE_PRESERVED_NAMES 0x00000002
+#define FILE_CASE_SENSITIVE_SEARCH 0x00000001
typedef struct {
__le32 Attributes;
__le32 MaxPathNameComponentLength;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index ca7980a1e303..c31ce98c1704 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -36,6 +36,7 @@ extern struct smb_hdr *cifs_buf_get(void);
extern void cifs_buf_release(void *);
extern struct smb_hdr *cifs_small_buf_get(void);
extern void cifs_small_buf_release(void *);
+extern void free_rsp_buf(int, void *);
extern void cifs_rqst_page_to_kvec(struct smb_rqst *rqst, unsigned int idx,
struct kvec *iov);
extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *,
@@ -89,6 +90,9 @@ extern struct mid_q_entry *cifs_setup_async_request(struct TCP_Server_Info *,
struct smb_rqst *);
extern int cifs_check_receive(struct mid_q_entry *mid,
struct TCP_Server_Info *server, bool log_error);
+extern int cifs_wait_mtu_credits(struct TCP_Server_Info *server,
+ unsigned int size, unsigned int *num,
+ unsigned int *credits);
extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *,
struct kvec *, int /* nvec to send */,
int * /* type of buf returned */ , const int flags);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 6ce4e0954b98..61d00a6e398f 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -196,10 +196,6 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
if (rc)
goto out;
- /*
- * FIXME: check if wsize needs updated due to negotiated smb buffer
- * size shrinking
- */
atomic_inc(&tconInfoReconnectCount);
/* tell server Unix caps we support */
@@ -871,7 +867,7 @@ CIFSSMBDelFile(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
int rc = 0;
int bytes_returned;
int name_len;
- int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
+ int remap = cifs_remap(cifs_sb);
DelFileRetry:
rc = smb_init(SMB_COM_DELETE, 1, tcon, (void **) &pSMB,
@@ -917,7 +913,7 @@ CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
int rc = 0;
int bytes_returned;
int name_len;
- int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
+ int remap = cifs_remap(cifs_sb);
cifs_dbg(FYI, "In CIFSSMBRmDir\n");
RmDirRetry:
@@ -962,7 +958,7 @@ CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
CREATE_DIRECTORY_RSP *pSMBr = NULL;
int bytes_returned;
int name_len;
- int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
+ int remap = cifs_remap(cifs_sb);
cifs_dbg(FYI, "In CIFSSMBMkDir\n");
MkDirRetry:
@@ -1284,7 +1280,7 @@ CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms, int *oplock,
__u16 count;
struct cifs_sb_info *cifs_sb = oparms->cifs_sb;
struct cifs_tcon *tcon = oparms->tcon;
- int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
+ int remap = cifs_remap(cifs_sb);
const struct nls_table *nls = cifs_sb->local_nls;
int create_options = oparms->create_options;
int desired_access = oparms->desired_access;
@@ -1517,7 +1513,6 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
return length;
server->total_read += length;
- rdata->bytes = length;
cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n",
server->total_read, buflen, data_len);
@@ -1560,12 +1555,18 @@ cifs_readv_callback(struct mid_q_entry *mid)
rc);
}
/* FIXME: should this be counted toward the initiating task? */
- task_io_account_read(rdata->bytes);
- cifs_stats_bytes_read(tcon, rdata->bytes);
+ task_io_account_read(rdata->got_bytes);
+ cifs_stats_bytes_read(tcon, rdata->got_bytes);
break;
case MID_REQUEST_SUBMITTED:
case MID_RETRY_NEEDED:
rdata->result = -EAGAIN;
+ if (server->sign && rdata->got_bytes)
+ /* reset bytes number since we can not check a sign */
+ rdata->got_bytes = 0;
+ /* FIXME: should this be counted toward the initiating task? */
+ task_io_account_read(rdata->got_bytes);
+ cifs_stats_bytes_read(tcon, rdata->got_bytes);
break;
default:
rdata->result = -EIO;
@@ -1734,10 +1735,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms,
/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
if (*buf) {
- if (resp_buf_type == CIFS_SMALL_BUFFER)
- cifs_small_buf_release(iov[0].iov_base);
- else if (resp_buf_type == CIFS_LARGE_BUFFER)
- cifs_buf_release(iov[0].iov_base);
+ free_rsp_buf(resp_buf_type, iov[0].iov_base);
} else if (resp_buf_type != CIFS_NO_BUFFER) {
/* return buffer to caller to free */
*buf = iov[0].iov_base;
@@ -1899,28 +1897,80 @@ cifs_writedata_release(struct kref *refcount)
static void
cifs_writev_requeue(struct cifs_writedata *wdata)
{
- int i, rc;
+ int i, rc = 0;
struct inode *inode = wdata->cfile->dentry->d_inode;
struct TCP_Server_Info *server;
+ unsigned int rest_len;
- for (i = 0; i < wdata->nr_pages; i++) {
- lock_page(wdata->pages[i]);
- clear_page_dirty_for_io(wdata->pages[i]);
- }
-
+ server = tlink_tcon(wdata->cfile->tlink)->ses->server;
+ i = 0;
+ rest_len = wdata->bytes;
do {
- server = tlink_tcon(wdata->cfile->tlink)->ses->server;
- rc = server->ops->async_writev(wdata, cifs_writedata_release);
- } while (rc == -EAGAIN);
+ struct cifs_writedata *wdata2;
+ unsigned int j, nr_pages, wsize, tailsz, cur_len;
+
+ wsize = server->ops->wp_retry_size(inode);
+ if (wsize < rest_len) {
+ nr_pages = wsize / PAGE_CACHE_SIZE;
+ if (!nr_pages) {
+ rc = -ENOTSUPP;
+ break;
+ }
+ cur_len = nr_pages * PAGE_CACHE_SIZE;
+ tailsz = PAGE_CACHE_SIZE;
+ } else {
+ nr_pages = DIV_ROUND_UP(rest_len, PAGE_CACHE_SIZE);
+ cur_len = rest_len;
+ tailsz = rest_len - (nr_pages - 1) * PAGE_CACHE_SIZE;
+ }
- for (i = 0; i < wdata->nr_pages; i++) {
- unlock_page(wdata->pages[i]);
- if (rc != 0) {
- SetPageError(wdata->pages[i]);
- end_page_writeback(wdata->pages[i]);
- page_cache_release(wdata->pages[i]);
+ wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete);
+ if (!wdata2) {
+ rc = -ENOMEM;
+ break;
}
- }
+
+ for (j = 0; j < nr_pages; j++) {
+ wdata2->pages[j] = wdata->pages[i + j];
+ lock_page(wdata2->pages[j]);
+ clear_page_dirty_for_io(wdata2->pages[j]);
+ }
+
+ wdata2->sync_mode = wdata->sync_mode;
+ wdata2->nr_pages = nr_pages;
+ wdata2->offset = page_offset(wdata2->pages[0]);
+ wdata2->pagesz = PAGE_CACHE_SIZE;
+ wdata2->tailsz = tailsz;
+ wdata2->bytes = cur_len;
+
+ wdata2->cfile = find_writable_file(CIFS_I(inode), false);
+ if (!wdata2->cfile) {
+ cifs_dbg(VFS, "No writable handles for inode\n");
+ rc = -EBADF;
+ break;
+ }
+ wdata2->pid = wdata2->cfile->pid;
+ rc = server->ops->async_writev(wdata2, cifs_writedata_release);
+
+ for (j = 0; j < nr_pages; j++) {
+ unlock_page(wdata2->pages[j]);
+ if (rc != 0 && rc != -EAGAIN) {
+ SetPageError(wdata2->pages[j]);
+ end_page_writeback(wdata2->pages[j]);
+ page_cache_release(wdata2->pages[j]);
+ }
+ }
+
+ if (rc) {
+ kref_put(&wdata2->refcount, cifs_writedata_release);
+ if (rc == -EAGAIN)
+ continue;
+ break;
+ }
+
+ rest_len -= cur_len;
+ i += nr_pages;
+ } while (i < wdata->nr_pages);
mapping_set_error(inode->i_mapping, rc);
kref_put(&wdata->refcount, cifs_writedata_release);
@@ -2203,10 +2253,7 @@ CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms,
}
/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
- if (resp_buf_type == CIFS_SMALL_BUFFER)
- cifs_small_buf_release(iov[0].iov_base);
- else if (resp_buf_type == CIFS_LARGE_BUFFER)
- cifs_buf_release(iov[0].iov_base);
+ free_rsp_buf(resp_buf_type, iov[0].iov_base);
/* Note: On -EAGAIN error only caller can retry on handle based calls
since file handle passed in no longer valid */
@@ -2451,10 +2498,7 @@ plk_err_exit:
if (pSMB)
cifs_small_buf_release(pSMB);
- if (resp_buf_type == CIFS_SMALL_BUFFER)
- cifs_small_buf_release(iov[0].iov_base);
- else if (resp_buf_type == CIFS_LARGE_BUFFER)
- cifs_buf_release(iov[0].iov_base);
+ free_rsp_buf(resp_buf_type, iov[0].iov_base);
/* Note: On -EAGAIN error only caller can retry on handle based calls
since file handle passed in no longer valid */
@@ -2528,7 +2572,7 @@ CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon,
int bytes_returned;
int name_len, name_len2;
__u16 count;
- int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
+ int remap = cifs_remap(cifs_sb);
cifs_dbg(FYI, "In CIFSSMBRename\n");
renameRetry:
@@ -2924,7 +2968,7 @@ CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon,
int bytes_returned;
int name_len, name_len2;
__u16 count;
- int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
+ int remap = cifs_remap(cifs_sb);
cifs_dbg(FYI, "In CIFSCreateHardLink\n");
winCreateHardLinkRetry:
@@ -3838,10 +3882,7 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
}
}
qsec_out:
- if (buf_type == CIFS_SMALL_BUFFER)
- cifs_small_buf_release(iov[0].iov_base);
- else if (buf_type == CIFS_LARGE_BUFFER)
- cifs_buf_release(iov[0].iov_base);
+ free_rsp_buf(buf_type, iov[0].iov_base);
/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
return rc;
}
@@ -4326,7 +4367,7 @@ findFirstRetry:
return rc;
nls_codepage = cifs_sb->local_nls;
- remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
+ remap = cifs_remap(cifs_sb);
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
@@ -5486,7 +5527,7 @@ CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon,
int name_len;
int rc = 0;
int bytes_returned = 0;
- int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
+ int remap = cifs_remap(cifs_sb);
__u16 params, byte_count, data_count, param_offset, offset;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index b98366f21f9e..24fa08d261fb 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -70,6 +70,7 @@ enum {
Opt_forcegid, Opt_noforcegid,
Opt_noblocksend, Opt_noautotune,
Opt_hard, Opt_soft, Opt_perm, Opt_noperm,
+ Opt_mapposix, Opt_nomapposix,
Opt_mapchars, Opt_nomapchars, Opt_sfu,
Opt_nosfu, Opt_nodfs, Opt_posixpaths,
Opt_noposixpaths, Opt_nounix,
@@ -124,8 +125,10 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_soft, "soft" },
{ Opt_perm, "perm" },
{ Opt_noperm, "noperm" },
- { Opt_mapchars, "mapchars" },
+ { Opt_mapchars, "mapchars" }, /* SFU style */
{ Opt_nomapchars, "nomapchars" },
+ { Opt_mapposix, "mapposix" }, /* SFM style */
+ { Opt_nomapposix, "nomapposix" },
{ Opt_sfu, "sfu" },
{ Opt_nosfu, "nosfu" },
{ Opt_nodfs, "nodfs" },
@@ -557,7 +560,7 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig,
try_to_freeze();
if (server_unresponsive(server)) {
- total_read = -EAGAIN;
+ total_read = -ECONNABORTED;
break;
}
@@ -571,7 +574,7 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig,
break;
} else if (server->tcpStatus == CifsNeedReconnect) {
cifs_reconnect(server);
- total_read = -EAGAIN;
+ total_read = -ECONNABORTED;
break;
} else if (length == -ERESTARTSYS ||
length == -EAGAIN ||
@@ -588,7 +591,7 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig,
cifs_dbg(FYI, "Received no data or error: expecting %d\n"
"got %d", to_read, length);
cifs_reconnect(server);
- total_read = -EAGAIN;
+ total_read = -ECONNABORTED;
break;
}
}
@@ -786,7 +789,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
cifs_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length);
cifs_reconnect(server);
wake_up(&server->response_q);
- return -EAGAIN;
+ return -ECONNABORTED;
}
/* switch to large buffer if too big for a small one */
@@ -1231,6 +1234,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->linux_uid = current_uid();
vol->linux_gid = current_gid();
+ /*
+ * default to SFM style remapping of seven reserved characters
+ * unless user overrides it or we negotiate CIFS POSIX where
+ * it is unnecessary. Can not simultaneously use more than one mapping
+ * since then readdir could list files that open could not open
+ */
+ vol->remap = true;
+
/* default to only allowing write access to owner of the mount */
vol->dir_mode = vol->file_mode = S_IRUGO | S_IXUGO | S_IWUSR;
@@ -1338,10 +1349,18 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->noperm = 1;
break;
case Opt_mapchars:
- vol->remap = 1;
+ vol->sfu_remap = true;
+ vol->remap = false; /* disable SFM mapping */
break;
case Opt_nomapchars:
- vol->remap = 0;
+ vol->sfu_remap = false;
+ break;
+ case Opt_mapposix:
+ vol->remap = true;
+ vol->sfu_remap = false; /* disable SFU mapping */
+ break;
+ case Opt_nomapposix:
+ vol->remap = false;
break;
case Opt_sfu:
vol->sfu_emul = 1;
@@ -1600,6 +1619,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
tmp_end++;
if (!(tmp_end < end && tmp_end[1] == delim)) {
/* No it is not. Set the password to NULL */
+ kfree(vol->password);
vol->password = NULL;
break;
}
@@ -1637,6 +1657,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
options = end;
}
+ kfree(vol->password);
/* Now build new password string */
temp_len = strlen(value);
vol->password = kzalloc(temp_len+1, GFP_KERNEL);
@@ -1716,7 +1737,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
goto cifs_parse_mount_err;
}
- if (strnicmp(string, "default", 7) != 0) {
+ if (strncasecmp(string, "default", 7) != 0) {
vol->iocharset = kstrdup(string,
GFP_KERNEL);
if (!vol->iocharset) {
@@ -1788,7 +1809,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
if (string == NULL)
goto out_nomem;
- if (strnicmp(string, "1", 1) == 0) {
+ if (strncasecmp(string, "1", 1) == 0) {
/* This is the default */
break;
}
@@ -3195,6 +3216,8 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
if (pvolume_info->server_ino)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM;
if (pvolume_info->remap)
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SFM_CHR;
+ if (pvolume_info->sfu_remap)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SPECIAL_CHR;
if (pvolume_info->no_xattr)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_XATTR;
@@ -3237,10 +3260,20 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
}
if (pvolume_info->mfsymlinks) {
if (pvolume_info->sfu_emul) {
- cifs_dbg(VFS, "mount option mfsymlinks ignored if sfu mount option is used\n");
- } else {
- cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MF_SYMLINKS;
+ /*
+ * Our SFU ("Services for Unix" emulation does not allow
+ * creating symlinks but does allow reading existing SFU
+ * symlinks (it does allow both creating and reading SFU
+ * style mknod and FIFOs though). When "mfsymlinks" and
+ * "sfu" are both enabled at the same time, it allows
+ * reading both types of symlinks, but will only create
+ * them with mfsymlinks format. This allows better
+ * Apple compatibility (probably better for Samba too)
+ * while still recognizing old Windows style symlinks.
+ */
+ cifs_dbg(VFS, "mount options mfsymlinks and sfu both enabled\n");
}
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MF_SYMLINKS;
}
if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
@@ -3328,8 +3361,7 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses,
ref_path = check_prefix ? full_path + 1 : volume_info->UNC + 1;
rc = get_dfs_path(xid, ses, ref_path, cifs_sb->local_nls,
- &num_referrals, &referrals,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ &num_referrals, &referrals, cifs_remap(cifs_sb));
if (!rc && num_referrals > 0) {
char *fake_devname = NULL;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 3db0c5fd9a11..b72bc29cba23 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -461,8 +461,8 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
xid = get_xid();
- cifs_dbg(FYI, "parent inode = 0x%p name is: %s and dentry = 0x%p\n",
- inode, direntry->d_name.name, direntry);
+ cifs_dbg(FYI, "parent inode = 0x%p name is: %pd and dentry = 0x%p\n",
+ inode, direntry, direntry);
tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb));
if (IS_ERR(tlink)) {
@@ -497,6 +497,14 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
goto out;
}
+ if (file->f_flags & O_DIRECT &&
+ CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) {
+ if (CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+ file->f_op = &cifs_file_direct_nobrl_ops;
+ else
+ file->f_op = &cifs_file_direct_ops;
+ }
+
file_info = cifs_new_fileinfo(&fid, file, tlink, oplock);
if (file_info == NULL) {
if (server->ops->close)
@@ -532,8 +540,8 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,
struct cifs_fid fid;
__u32 oplock;
- cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %s and dentry = 0x%p\n",
- inode, direntry->d_name.name, direntry);
+ cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %pd and dentry = 0x%p\n",
+ inode, direntry, direntry);
tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb));
rc = PTR_ERR(tlink);
@@ -569,12 +577,13 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
struct cifs_io_parms io_parms;
char *full_path = NULL;
struct inode *newinode = NULL;
- int oplock = 0;
+ __u32 oplock = 0;
struct cifs_fid fid;
struct cifs_open_parms oparms;
FILE_ALL_INFO *buf = NULL;
unsigned int bytes_written;
struct win_dev *pdev;
+ struct kvec iov[2];
if (!old_valid_dev(device_number))
return -EINVAL;
@@ -650,7 +659,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
oparms.fid = &fid;
oparms.reconnect = false;
- rc = CIFS_open(xid, &oparms, &oplock, buf);
+ if (tcon->ses->server->oplocks)
+ oplock = REQ_OPLOCK;
+ else
+ oplock = 0;
+ rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf);
if (rc)
goto mknod_out;
@@ -660,25 +673,26 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
*/
pdev = (struct win_dev *)buf;
- io_parms.netfid = fid.netfid;
io_parms.pid = current->tgid;
io_parms.tcon = tcon;
io_parms.offset = 0;
io_parms.length = sizeof(struct win_dev);
+ iov[1].iov_base = buf;
+ iov[1].iov_len = sizeof(struct win_dev);
if (S_ISCHR(mode)) {
memcpy(pdev->type, "IntxCHR", 8);
pdev->major = cpu_to_le64(MAJOR(device_number));
pdev->minor = cpu_to_le64(MINOR(device_number));
- rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, (char *)pdev,
- NULL, 0);
+ rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
+ &bytes_written, iov, 1);
} else if (S_ISBLK(mode)) {
memcpy(pdev->type, "IntxBLK", 8);
pdev->major = cpu_to_le64(MAJOR(device_number));
pdev->minor = cpu_to_le64(MINOR(device_number));
- rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, (char *)pdev,
- NULL, 0);
+ rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
+ &bytes_written, iov, 1);
} /* else if (S_ISFIFO) */
- CIFSSMBClose(xid, tcon, fid.netfid);
+ tcon->ses->server->ops->close(xid, tcon, &fid);
d_drop(direntry);
/* FIXME: add code here to set EAs */
@@ -705,8 +719,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
xid = get_xid();
- cifs_dbg(FYI, "parent inode = 0x%p name is: %s and dentry = 0x%p\n",
- parent_dir_inode, direntry->d_name.name, direntry);
+ cifs_dbg(FYI, "parent inode = 0x%p name is: %pd and dentry = 0x%p\n",
+ parent_dir_inode, direntry, direntry);
/* check whether path exists */
@@ -825,7 +839,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
{
int rc = 0;
- cifs_dbg(FYI, "In cifs d_delete, name = %s\n", direntry->d_name.name);
+ cifs_dbg(FYI, "In cifs d_delete, name = %pd\n", direntry);
return rc;
} */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index b88b1ade4d3d..3e4d00a06c44 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -467,6 +467,14 @@ int cifs_open(struct inode *inode, struct file *file)
cifs_dbg(FYI, "inode = 0x%p file flags are 0x%x for %s\n",
inode, file->f_flags, full_path);
+ if (file->f_flags & O_DIRECT &&
+ cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) {
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+ file->f_op = &cifs_file_direct_nobrl_ops;
+ else
+ file->f_op = &cifs_file_direct_ops;
+ }
+
if (server->oplocks)
oplock = REQ_OPLOCK;
else
@@ -762,7 +770,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
cifs_dbg(FYI, "Freeing private data in close dir\n");
spin_lock(&cifs_file_list_lock);
- if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) {
+ if (server->ops->dir_needs_close(cfile)) {
cfile->invalidHandle = true;
spin_unlock(&cifs_file_list_lock);
if (server->ops->close_dir)
@@ -1642,8 +1650,8 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data,
cifs_sb = CIFS_SB(dentry->d_sb);
- cifs_dbg(FYI, "write %zd bytes to offset %lld of %s\n",
- write_size, *offset, dentry->d_name.name);
+ cifs_dbg(FYI, "write %zd bytes to offset %lld of %pd\n",
+ write_size, *offset, dentry);
tcon = tlink_tcon(open_file->tlink);
server = tcon->ses->server;
@@ -1670,8 +1678,8 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data,
break;
}
- len = min((size_t)cifs_sb->wsize,
- write_size - total_written);
+ len = min(server->ops->wp_retry_size(dentry->d_inode),
+ (unsigned int)write_size - total_written);
/* iov[0] is reserved for smb header */
iov[1].iov_base = (char *)write_data + total_written;
iov[1].iov_len = len;
@@ -1679,8 +1687,8 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data,
io_parms.tcon = tcon;
io_parms.offset = *offset;
io_parms.length = len;
- rc = server->ops->sync_write(xid, open_file, &io_parms,
- &bytes_written, iov, 1);
+ rc = server->ops->sync_write(xid, &open_file->fid,
+ &io_parms, &bytes_written, iov, 1);
}
if (rc || (bytes_written == 0)) {
if (total_written)
@@ -1878,15 +1886,163 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
return rc;
}
+static struct cifs_writedata *
+wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping,
+ pgoff_t end, pgoff_t *index,
+ unsigned int *found_pages)
+{
+ unsigned int nr_pages;
+ struct page **pages;
+ struct cifs_writedata *wdata;
+
+ wdata = cifs_writedata_alloc((unsigned int)tofind,
+ cifs_writev_complete);
+ if (!wdata)
+ return NULL;
+
+ /*
+ * find_get_pages_tag seems to return a max of 256 on each
+ * iteration, so we must call it several times in order to
+ * fill the array or the wsize is effectively limited to
+ * 256 * PAGE_CACHE_SIZE.
+ */
+ *found_pages = 0;
+ pages = wdata->pages;
+ do {
+ nr_pages = find_get_pages_tag(mapping, index,
+ PAGECACHE_TAG_DIRTY, tofind,
+ pages);
+ *found_pages += nr_pages;
+ tofind -= nr_pages;
+ pages += nr_pages;
+ } while (nr_pages && tofind && *index <= end);
+
+ return wdata;
+}
+
+static unsigned int
+wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages,
+ struct address_space *mapping,
+ struct writeback_control *wbc,
+ pgoff_t end, pgoff_t *index, pgoff_t *next, bool *done)
+{
+ unsigned int nr_pages = 0, i;
+ struct page *page;
+
+ for (i = 0; i < found_pages; i++) {
+ page = wdata->pages[i];
+ /*
+ * At this point we hold neither mapping->tree_lock nor
+ * lock on the page itself: the page may be truncated or
+ * invalidated (changing page->mapping to NULL), or even
+ * swizzled back from swapper_space to tmpfs file
+ * mapping
+ */
+
+ if (nr_pages == 0)
+ lock_page(page);
+ else if (!trylock_page(page))
+ break;
+
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ break;
+ }
+
+ if (!wbc->range_cyclic && page->index > end) {
+ *done = true;
+ unlock_page(page);
+ break;
+ }
+
+ if (*next && (page->index != *next)) {
+ /* Not next consecutive page */
+ unlock_page(page);
+ break;
+ }
+
+ if (wbc->sync_mode != WB_SYNC_NONE)
+ wait_on_page_writeback(page);
+
+ if (PageWriteback(page) ||
+ !clear_page_dirty_for_io(page)) {
+ unlock_page(page);
+ break;
+ }
+
+ /*
+ * This actually clears the dirty bit in the radix tree.
+ * See cifs_writepage() for more commentary.
+ */
+ set_page_writeback(page);
+ if (page_offset(page) >= i_size_read(mapping->host)) {
+ *done = true;
+ unlock_page(page);
+ end_page_writeback(page);
+ break;
+ }
+
+ wdata->pages[i] = page;
+ *next = page->index + 1;
+ ++nr_pages;
+ }
+
+ /* reset index to refind any pages skipped */
+ if (nr_pages == 0)
+ *index = wdata->pages[0]->index + 1;
+
+ /* put any pages we aren't going to use */
+ for (i = nr_pages; i < found_pages; i++) {
+ page_cache_release(wdata->pages[i]);
+ wdata->pages[i] = NULL;
+ }
+
+ return nr_pages;
+}
+
+static int
+wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
+ struct address_space *mapping, struct writeback_control *wbc)
+{
+ int rc = 0;
+ struct TCP_Server_Info *server;
+ unsigned int i;
+
+ wdata->sync_mode = wbc->sync_mode;
+ wdata->nr_pages = nr_pages;
+ wdata->offset = page_offset(wdata->pages[0]);
+ wdata->pagesz = PAGE_CACHE_SIZE;
+ wdata->tailsz = min(i_size_read(mapping->host) -
+ page_offset(wdata->pages[nr_pages - 1]),
+ (loff_t)PAGE_CACHE_SIZE);
+ wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + wdata->tailsz;
+
+ if (wdata->cfile != NULL)
+ cifsFileInfo_put(wdata->cfile);
+ wdata->cfile = find_writable_file(CIFS_I(mapping->host), false);
+ if (!wdata->cfile) {
+ cifs_dbg(VFS, "No writable handles for inode\n");
+ rc = -EBADF;
+ } else {
+ wdata->pid = wdata->cfile->pid;
+ server = tlink_tcon(wdata->cfile->tlink)->ses->server;
+ rc = server->ops->async_writev(wdata, cifs_writedata_release);
+ }
+
+ for (i = 0; i < nr_pages; ++i)
+ unlock_page(wdata->pages[i]);
+
+ return rc;
+}
+
static int cifs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb);
+ struct TCP_Server_Info *server;
bool done = false, scanned = false, range_whole = false;
pgoff_t end, index;
struct cifs_writedata *wdata;
- struct TCP_Server_Info *server;
- struct page *page;
int rc = 0;
/*
@@ -1906,152 +2062,50 @@ static int cifs_writepages(struct address_space *mapping,
range_whole = true;
scanned = true;
}
+ server = cifs_sb_master_tcon(cifs_sb)->ses->server;
retry:
while (!done && index <= end) {
- unsigned int i, nr_pages, found_pages;
- pgoff_t next = 0, tofind;
- struct page **pages;
+ unsigned int i, nr_pages, found_pages, wsize, credits;
+ pgoff_t next = 0, tofind, saved_index = index;
- tofind = min((cifs_sb->wsize / PAGE_CACHE_SIZE) - 1,
- end - index) + 1;
+ rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
+ &wsize, &credits);
+ if (rc)
+ break;
+
+ tofind = min((wsize / PAGE_CACHE_SIZE) - 1, end - index) + 1;
- wdata = cifs_writedata_alloc((unsigned int)tofind,
- cifs_writev_complete);
+ wdata = wdata_alloc_and_fillpages(tofind, mapping, end, &index,
+ &found_pages);
if (!wdata) {
rc = -ENOMEM;
+ add_credits_and_wake_if(server, credits, 0);
break;
}
- /*
- * find_get_pages_tag seems to return a max of 256 on each
- * iteration, so we must call it several times in order to
- * fill the array or the wsize is effectively limited to
- * 256 * PAGE_CACHE_SIZE.
- */
- found_pages = 0;
- pages = wdata->pages;
- do {
- nr_pages = find_get_pages_tag(mapping, &index,
- PAGECACHE_TAG_DIRTY,
- tofind, pages);
- found_pages += nr_pages;
- tofind -= nr_pages;
- pages += nr_pages;
- } while (nr_pages && tofind && index <= end);
-
if (found_pages == 0) {
kref_put(&wdata->refcount, cifs_writedata_release);
+ add_credits_and_wake_if(server, credits, 0);
break;
}
- nr_pages = 0;
- for (i = 0; i < found_pages; i++) {
- page = wdata->pages[i];
- /*
- * At this point we hold neither mapping->tree_lock nor
- * lock on the page itself: the page may be truncated or
- * invalidated (changing page->mapping to NULL), or even
- * swizzled back from swapper_space to tmpfs file
- * mapping
- */
-
- if (nr_pages == 0)
- lock_page(page);
- else if (!trylock_page(page))
- break;
-
- if (unlikely(page->mapping != mapping)) {
- unlock_page(page);
- break;
- }
-
- if (!wbc->range_cyclic && page->index > end) {
- done = true;
- unlock_page(page);
- break;
- }
-
- if (next && (page->index != next)) {
- /* Not next consecutive page */
- unlock_page(page);
- break;
- }
-
- if (wbc->sync_mode != WB_SYNC_NONE)
- wait_on_page_writeback(page);
-
- if (PageWriteback(page) ||
- !clear_page_dirty_for_io(page)) {
- unlock_page(page);
- break;
- }
-
- /*
- * This actually clears the dirty bit in the radix tree.
- * See cifs_writepage() for more commentary.
- */
- set_page_writeback(page);
-
- if (page_offset(page) >= i_size_read(mapping->host)) {
- done = true;
- unlock_page(page);
- end_page_writeback(page);
- break;
- }
-
- wdata->pages[i] = page;
- next = page->index + 1;
- ++nr_pages;
- }
-
- /* reset index to refind any pages skipped */
- if (nr_pages == 0)
- index = wdata->pages[0]->index + 1;
-
- /* put any pages we aren't going to use */
- for (i = nr_pages; i < found_pages; i++) {
- page_cache_release(wdata->pages[i]);
- wdata->pages[i] = NULL;
- }
+ nr_pages = wdata_prepare_pages(wdata, found_pages, mapping, wbc,
+ end, &index, &next, &done);
/* nothing to write? */
if (nr_pages == 0) {
kref_put(&wdata->refcount, cifs_writedata_release);
+ add_credits_and_wake_if(server, credits, 0);
continue;
}
- wdata->sync_mode = wbc->sync_mode;
- wdata->nr_pages = nr_pages;
- wdata->offset = page_offset(wdata->pages[0]);
- wdata->pagesz = PAGE_CACHE_SIZE;
- wdata->tailsz =
- min(i_size_read(mapping->host) -
- page_offset(wdata->pages[nr_pages - 1]),
- (loff_t)PAGE_CACHE_SIZE);
- wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) +
- wdata->tailsz;
-
- do {
- if (wdata->cfile != NULL)
- cifsFileInfo_put(wdata->cfile);
- wdata->cfile = find_writable_file(CIFS_I(mapping->host),
- false);
- if (!wdata->cfile) {
- cifs_dbg(VFS, "No writable handles for inode\n");
- rc = -EBADF;
- break;
- }
- wdata->pid = wdata->cfile->pid;
- server = tlink_tcon(wdata->cfile->tlink)->ses->server;
- rc = server->ops->async_writev(wdata,
- cifs_writedata_release);
- } while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN);
+ wdata->credits = credits;
- for (i = 0; i < nr_pages; ++i)
- unlock_page(wdata->pages[i]);
+ rc = wdata_send_pages(wdata, nr_pages, mapping, wbc);
/* send failure -- clean up the mess */
if (rc != 0) {
+ add_credits_and_wake_if(server, wdata->credits, 0);
for (i = 0; i < nr_pages; ++i) {
if (rc == -EAGAIN)
redirty_page_for_writepage(wbc,
@@ -2066,6 +2120,11 @@ retry:
}
kref_put(&wdata->refcount, cifs_writedata_release);
+ if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN) {
+ index = saved_index;
+ continue;
+ }
+
wbc->nr_to_write -= nr_pages;
if (wbc->nr_to_write <= 0)
done = true;
@@ -2214,8 +2273,8 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
xid = get_xid();
- cifs_dbg(FYI, "Sync file - name: %s datasync: 0x%x\n",
- file->f_path.dentry->d_name.name, datasync);
+ cifs_dbg(FYI, "Sync file - name: %pD datasync: 0x%x\n",
+ file, datasync);
if (!CIFS_CACHE_READ(CIFS_I(inode))) {
rc = cifs_zap_mapping(inode);
@@ -2256,8 +2315,8 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
xid = get_xid();
- cifs_dbg(FYI, "Sync file - name: %s datasync: 0x%x\n",
- file->f_path.dentry->d_name.name, datasync);
+ cifs_dbg(FYI, "Sync file - name: %pD datasync: 0x%x\n",
+ file, datasync);
tcon = tlink_tcon(smbfile->tlink);
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) {
@@ -2362,123 +2421,109 @@ cifs_uncached_writev_complete(struct work_struct *work)
kref_put(&wdata->refcount, cifs_uncached_writedata_release);
}
-/* attempt to send write to server, retry on any -EAGAIN errors */
static int
-cifs_uncached_retry_writev(struct cifs_writedata *wdata)
+wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from,
+ size_t *len, unsigned long *num_pages)
{
- int rc;
- struct TCP_Server_Info *server;
+ size_t save_len, copied, bytes, cur_len = *len;
+ unsigned long i, nr_pages = *num_pages;
- server = tlink_tcon(wdata->cfile->tlink)->ses->server;
+ save_len = cur_len;
+ for (i = 0; i < nr_pages; i++) {
+ bytes = min_t(const size_t, cur_len, PAGE_SIZE);
+ copied = copy_page_from_iter(wdata->pages[i], 0, bytes, from);
+ cur_len -= copied;
+ /*
+ * If we didn't copy as much as we expected, then that
+ * may mean we trod into an unmapped area. Stop copying
+ * at that point. On the next pass through the big
+ * loop, we'll likely end up getting a zero-length
+ * write and bailing out of it.
+ */
+ if (copied < bytes)
+ break;
+ }
+ cur_len = save_len - cur_len;
+ *len = cur_len;
- do {
- if (wdata->cfile->invalidHandle) {
- rc = cifs_reopen_file(wdata->cfile, false);
- if (rc != 0)
- continue;
- }
- rc = server->ops->async_writev(wdata,
- cifs_uncached_writedata_release);
- } while (rc == -EAGAIN);
+ /*
+ * If we have no data to send, then that probably means that
+ * the copy above failed altogether. That's most likely because
+ * the address in the iovec was bogus. Return -EFAULT and let
+ * the caller free anything we allocated and bail out.
+ */
+ if (!cur_len)
+ return -EFAULT;
- return rc;
+ /*
+ * i + 1 now represents the number of pages we actually used in
+ * the copy phase above.
+ */
+ *num_pages = i + 1;
+ return 0;
}
-static ssize_t
-cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset)
+static int
+cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
+ struct cifsFileInfo *open_file,
+ struct cifs_sb_info *cifs_sb, struct list_head *wdata_list)
{
- unsigned long nr_pages, i;
- size_t bytes, copied, len, cur_len;
- ssize_t total_written = 0;
- loff_t offset;
- struct cifsFileInfo *open_file;
- struct cifs_tcon *tcon;
- struct cifs_sb_info *cifs_sb;
- struct cifs_writedata *wdata, *tmp;
- struct list_head wdata_list;
- int rc;
+ int rc = 0;
+ size_t cur_len;
+ unsigned long nr_pages, num_pages, i;
+ struct cifs_writedata *wdata;
+ struct iov_iter saved_from;
+ loff_t saved_offset = offset;
pid_t pid;
-
- len = iov_iter_count(from);
- rc = generic_write_checks(file, poffset, &len, 0);
- if (rc)
- return rc;
-
- if (!len)
- return 0;
-
- iov_iter_truncate(from, len);
-
- INIT_LIST_HEAD(&wdata_list);
- cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
- open_file = file->private_data;
- tcon = tlink_tcon(open_file->tlink);
-
- if (!tcon->ses->server->ops->async_writev)
- return -ENOSYS;
-
- offset = *poffset;
+ struct TCP_Server_Info *server;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
pid = open_file->pid;
else
pid = current->tgid;
+ server = tlink_tcon(open_file->tlink)->ses->server;
+ memcpy(&saved_from, from, sizeof(struct iov_iter));
+
do {
- size_t save_len;
+ unsigned int wsize, credits;
- nr_pages = get_numpages(cifs_sb->wsize, len, &cur_len);
+ rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
+ &wsize, &credits);
+ if (rc)
+ break;
+
+ nr_pages = get_numpages(wsize, len, &cur_len);
wdata = cifs_writedata_alloc(nr_pages,
cifs_uncached_writev_complete);
if (!wdata) {
rc = -ENOMEM;
+ add_credits_and_wake_if(server, credits, 0);
break;
}
rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
if (rc) {
kfree(wdata);
+ add_credits_and_wake_if(server, credits, 0);
break;
}
- save_len = cur_len;
- for (i = 0; i < nr_pages; i++) {
- bytes = min_t(size_t, cur_len, PAGE_SIZE);
- copied = copy_page_from_iter(wdata->pages[i], 0, bytes,
- from);
- cur_len -= copied;
- /*
- * If we didn't copy as much as we expected, then that
- * may mean we trod into an unmapped area. Stop copying
- * at that point. On the next pass through the big
- * loop, we'll likely end up getting a zero-length
- * write and bailing out of it.
- */
- if (copied < bytes)
- break;
- }
- cur_len = save_len - cur_len;
-
- /*
- * If we have no data to send, then that probably means that
- * the copy above failed altogether. That's most likely because
- * the address in the iovec was bogus. Set the rc to -EFAULT,
- * free anything we allocated and bail out.
- */
- if (!cur_len) {
+ num_pages = nr_pages;
+ rc = wdata_fill_from_iovec(wdata, from, &cur_len, &num_pages);
+ if (rc) {
for (i = 0; i < nr_pages; i++)
put_page(wdata->pages[i]);
kfree(wdata);
- rc = -EFAULT;
+ add_credits_and_wake_if(server, credits, 0);
break;
}
/*
- * i + 1 now represents the number of pages we actually used in
- * the copy phase above. Bring nr_pages down to that, and free
- * any pages that we didn't use.
+ * Bring nr_pages down to the number of pages we actually used,
+ * and free any pages that we didn't use.
*/
- for ( ; nr_pages > i + 1; nr_pages--)
+ for ( ; nr_pages > num_pages; nr_pages--)
put_page(wdata->pages[nr_pages - 1]);
wdata->sync_mode = WB_SYNC_ALL;
@@ -2489,18 +2534,69 @@ cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset)
wdata->bytes = cur_len;
wdata->pagesz = PAGE_SIZE;
wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
- rc = cifs_uncached_retry_writev(wdata);
+ wdata->credits = credits;
+
+ if (!wdata->cfile->invalidHandle ||
+ !cifs_reopen_file(wdata->cfile, false))
+ rc = server->ops->async_writev(wdata,
+ cifs_uncached_writedata_release);
if (rc) {
+ add_credits_and_wake_if(server, wdata->credits, 0);
kref_put(&wdata->refcount,
cifs_uncached_writedata_release);
+ if (rc == -EAGAIN) {
+ memcpy(from, &saved_from,
+ sizeof(struct iov_iter));
+ iov_iter_advance(from, offset - saved_offset);
+ continue;
+ }
break;
}
- list_add_tail(&wdata->list, &wdata_list);
+ list_add_tail(&wdata->list, wdata_list);
offset += cur_len;
len -= cur_len;
} while (len > 0);
+ return rc;
+}
+
+static ssize_t
+cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset)
+{
+ size_t len;
+ ssize_t total_written = 0;
+ struct cifsFileInfo *open_file;
+ struct cifs_tcon *tcon;
+ struct cifs_sb_info *cifs_sb;
+ struct cifs_writedata *wdata, *tmp;
+ struct list_head wdata_list;
+ struct iov_iter saved_from;
+ int rc;
+
+ len = iov_iter_count(from);
+ rc = generic_write_checks(file, poffset, &len, 0);
+ if (rc)
+ return rc;
+
+ if (!len)
+ return 0;
+
+ iov_iter_truncate(from, len);
+
+ INIT_LIST_HEAD(&wdata_list);
+ cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ open_file = file->private_data;
+ tcon = tlink_tcon(open_file->tlink);
+
+ if (!tcon->ses->server->ops->async_writev)
+ return -ENOSYS;
+
+ memcpy(&saved_from, from, sizeof(struct iov_iter));
+
+ rc = cifs_write_from_iter(*poffset, len, from, open_file, cifs_sb,
+ &wdata_list);
+
/*
* If at least one write was successfully sent, then discard any rc
* value from the later writes. If the other write succeeds, then
@@ -2529,7 +2625,25 @@ restart_loop:
/* resend call if it's a retryable error */
if (rc == -EAGAIN) {
- rc = cifs_uncached_retry_writev(wdata);
+ struct list_head tmp_list;
+ struct iov_iter tmp_from;
+
+ INIT_LIST_HEAD(&tmp_list);
+ list_del_init(&wdata->list);
+
+ memcpy(&tmp_from, &saved_from,
+ sizeof(struct iov_iter));
+ iov_iter_advance(&tmp_from,
+ wdata->offset - *poffset);
+
+ rc = cifs_write_from_iter(wdata->offset,
+ wdata->bytes, &tmp_from,
+ open_file, cifs_sb, &tmp_list);
+
+ list_splice(&tmp_list, &wdata_list);
+
+ kref_put(&wdata->refcount,
+ cifs_uncached_writedata_release);
goto restart_loop;
}
}
@@ -2722,26 +2836,6 @@ cifs_uncached_readdata_release(struct kref *refcount)
cifs_readdata_release(refcount);
}
-static int
-cifs_retry_async_readv(struct cifs_readdata *rdata)
-{
- int rc;
- struct TCP_Server_Info *server;
-
- server = tlink_tcon(rdata->cfile->tlink)->ses->server;
-
- do {
- if (rdata->cfile->invalidHandle) {
- rc = cifs_reopen_file(rdata->cfile, true);
- if (rc != 0)
- continue;
- }
- rc = server->ops->async_readv(rdata);
- } while (rc == -EAGAIN);
-
- return rc;
-}
-
/**
* cifs_readdata_to_iov - copy data from pages in response to an iovec
* @rdata: the readdata response with list of pages holding data
@@ -2754,7 +2848,7 @@ cifs_retry_async_readv(struct cifs_readdata *rdata)
static int
cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter)
{
- size_t remaining = rdata->bytes;
+ size_t remaining = rdata->got_bytes;
unsigned int i;
for (i = 0; i < rdata->nr_pages; i++) {
@@ -2782,11 +2876,12 @@ static int
cifs_uncached_read_into_pages(struct TCP_Server_Info *server,
struct cifs_readdata *rdata, unsigned int len)
{
- int total_read = 0, result = 0;
+ int result = 0;
unsigned int i;
unsigned int nr_pages = rdata->nr_pages;
struct kvec iov;
+ rdata->got_bytes = 0;
rdata->tailsz = PAGE_SIZE;
for (i = 0; i < nr_pages; i++) {
struct page *page = rdata->pages[i];
@@ -2820,55 +2915,45 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server,
if (result < 0)
break;
- total_read += result;
+ rdata->got_bytes += result;
}
- return total_read > 0 ? total_read : result;
+ return rdata->got_bytes > 0 && result != -ECONNABORTED ?
+ rdata->got_bytes : result;
}
-ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
+static int
+cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
+ struct cifs_sb_info *cifs_sb, struct list_head *rdata_list)
{
- struct file *file = iocb->ki_filp;
- ssize_t rc;
- size_t len, cur_len;
- ssize_t total_read = 0;
- loff_t offset = iocb->ki_pos;
- unsigned int npages;
- struct cifs_sb_info *cifs_sb;
- struct cifs_tcon *tcon;
- struct cifsFileInfo *open_file;
- struct cifs_readdata *rdata, *tmp;
- struct list_head rdata_list;
+ struct cifs_readdata *rdata;
+ unsigned int npages, rsize, credits;
+ size_t cur_len;
+ int rc;
pid_t pid;
+ struct TCP_Server_Info *server;
- len = iov_iter_count(to);
- if (!len)
- return 0;
-
- INIT_LIST_HEAD(&rdata_list);
- cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
- open_file = file->private_data;
- tcon = tlink_tcon(open_file->tlink);
-
- if (!tcon->ses->server->ops->async_readv)
- return -ENOSYS;
+ server = tlink_tcon(open_file->tlink)->ses->server;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
pid = open_file->pid;
else
pid = current->tgid;
- if ((file->f_flags & O_ACCMODE) == O_WRONLY)
- cifs_dbg(FYI, "attempting read on write only file instance\n");
-
do {
- cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize);
+ rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
+ &rsize, &credits);
+ if (rc)
+ break;
+
+ cur_len = min_t(const size_t, len, rsize);
npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
/* allocate a readdata struct */
rdata = cifs_readdata_alloc(npages,
cifs_uncached_readv_complete);
if (!rdata) {
+ add_credits_and_wake_if(server, credits, 0);
rc = -ENOMEM;
break;
}
@@ -2884,44 +2969,113 @@ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
rdata->pid = pid;
rdata->pagesz = PAGE_SIZE;
rdata->read_into_pages = cifs_uncached_read_into_pages;
+ rdata->credits = credits;
- rc = cifs_retry_async_readv(rdata);
+ if (!rdata->cfile->invalidHandle ||
+ !cifs_reopen_file(rdata->cfile, true))
+ rc = server->ops->async_readv(rdata);
error:
if (rc) {
+ add_credits_and_wake_if(server, rdata->credits, 0);
kref_put(&rdata->refcount,
cifs_uncached_readdata_release);
+ if (rc == -EAGAIN)
+ continue;
break;
}
- list_add_tail(&rdata->list, &rdata_list);
+ list_add_tail(&rdata->list, rdata_list);
offset += cur_len;
len -= cur_len;
} while (len > 0);
+ return rc;
+}
+
+ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t rc;
+ size_t len;
+ ssize_t total_read = 0;
+ loff_t offset = iocb->ki_pos;
+ struct cifs_sb_info *cifs_sb;
+ struct cifs_tcon *tcon;
+ struct cifsFileInfo *open_file;
+ struct cifs_readdata *rdata, *tmp;
+ struct list_head rdata_list;
+
+ len = iov_iter_count(to);
+ if (!len)
+ return 0;
+
+ INIT_LIST_HEAD(&rdata_list);
+ cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+ open_file = file->private_data;
+ tcon = tlink_tcon(open_file->tlink);
+
+ if (!tcon->ses->server->ops->async_readv)
+ return -ENOSYS;
+
+ if ((file->f_flags & O_ACCMODE) == O_WRONLY)
+ cifs_dbg(FYI, "attempting read on write only file instance\n");
+
+ rc = cifs_send_async_read(offset, len, open_file, cifs_sb, &rdata_list);
+
/* if at least one read request send succeeded, then reset rc */
if (!list_empty(&rdata_list))
rc = 0;
len = iov_iter_count(to);
/* the loop below should proceed in the order of increasing offsets */
+again:
list_for_each_entry_safe(rdata, tmp, &rdata_list, list) {
- again:
if (!rc) {
/* FIXME: freezable sleep too? */
rc = wait_for_completion_killable(&rdata->done);
if (rc)
rc = -EINTR;
- else if (rdata->result) {
- rc = rdata->result;
+ else if (rdata->result == -EAGAIN) {
/* resend call if it's a retryable error */
- if (rc == -EAGAIN) {
- rc = cifs_retry_async_readv(rdata);
- goto again;
+ struct list_head tmp_list;
+ unsigned int got_bytes = rdata->got_bytes;
+
+ list_del_init(&rdata->list);
+ INIT_LIST_HEAD(&tmp_list);
+
+ /*
+ * Got a part of data and then reconnect has
+ * happened -- fill the buffer and continue
+ * reading.
+ */
+ if (got_bytes && got_bytes < rdata->bytes) {
+ rc = cifs_readdata_to_iov(rdata, to);
+ if (rc) {
+ kref_put(&rdata->refcount,
+ cifs_uncached_readdata_release);
+ continue;
+ }
}
- } else {
+
+ rc = cifs_send_async_read(
+ rdata->offset + got_bytes,
+ rdata->bytes - got_bytes,
+ rdata->cfile, cifs_sb,
+ &tmp_list);
+
+ list_splice(&tmp_list, &rdata_list);
+
+ kref_put(&rdata->refcount,
+ cifs_uncached_readdata_release);
+ goto again;
+ } else if (rdata->result)
+ rc = rdata->result;
+ else
rc = cifs_readdata_to_iov(rdata, to);
- }
+ /* if there was a short read -- discard anything left */
+ if (rdata->got_bytes && rdata->got_bytes < rdata->bytes)
+ rc = -ENODATA;
}
list_del_init(&rdata->list);
kref_put(&rdata->refcount, cifs_uncached_readdata_release);
@@ -3030,18 +3184,19 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset)
for (total_read = 0, cur_offset = read_data; read_size > total_read;
total_read += bytes_read, cur_offset += bytes_read) {
- current_read_size = min_t(uint, read_size - total_read, rsize);
- /*
- * For windows me and 9x we do not want to request more than it
- * negotiated since it will refuse the read then.
- */
- if ((tcon->ses) && !(tcon->ses->capabilities &
+ do {
+ current_read_size = min_t(uint, read_size - total_read,
+ rsize);
+ /*
+ * For windows me and 9x we do not want to request more
+ * than it negotiated since it will refuse the read
+ * then.
+ */
+ if ((tcon->ses) && !(tcon->ses->capabilities &
tcon->ses->server->vals->cap_large_files)) {
- current_read_size = min_t(uint, current_read_size,
- CIFSMaxBufSize);
- }
- rc = -EAGAIN;
- while (rc == -EAGAIN) {
+ current_read_size = min_t(uint,
+ current_read_size, CIFSMaxBufSize);
+ }
if (open_file->invalidHandle) {
rc = cifs_reopen_file(open_file, true);
if (rc != 0)
@@ -3051,10 +3206,11 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset)
io_parms.tcon = tcon;
io_parms.offset = *offset;
io_parms.length = current_read_size;
- rc = server->ops->sync_read(xid, open_file, &io_parms,
+ rc = server->ops->sync_read(xid, &open_file->fid, &io_parms,
&bytes_read, &cur_offset,
&buf_type);
- }
+ } while (rc == -EAGAIN);
+
if (rc || (bytes_read == 0)) {
if (total_read) {
break;
@@ -3133,25 +3289,30 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
static void
cifs_readv_complete(struct work_struct *work)
{
- unsigned int i;
+ unsigned int i, got_bytes;
struct cifs_readdata *rdata = container_of(work,
struct cifs_readdata, work);
+ got_bytes = rdata->got_bytes;
for (i = 0; i < rdata->nr_pages; i++) {
struct page *page = rdata->pages[i];
lru_cache_add_file(page);
- if (rdata->result == 0) {
+ if (rdata->result == 0 ||
+ (rdata->result == -EAGAIN && got_bytes)) {
flush_dcache_page(page);
SetPageUptodate(page);
}
unlock_page(page);
- if (rdata->result == 0)
+ if (rdata->result == 0 ||
+ (rdata->result == -EAGAIN && got_bytes))
cifs_readpage_to_fscache(rdata->mapping->host, page);
+ got_bytes -= min_t(unsigned int, PAGE_CACHE_SIZE, got_bytes);
+
page_cache_release(page);
rdata->pages[i] = NULL;
}
@@ -3162,7 +3323,7 @@ static int
cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
struct cifs_readdata *rdata, unsigned int len)
{
- int total_read = 0, result = 0;
+ int result = 0;
unsigned int i;
u64 eof;
pgoff_t eof_index;
@@ -3174,6 +3335,7 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0;
cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index);
+ rdata->got_bytes = 0;
rdata->tailsz = PAGE_CACHE_SIZE;
for (i = 0; i < nr_pages; i++) {
struct page *page = rdata->pages[i];
@@ -3228,10 +3390,70 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
if (result < 0)
break;
- total_read += result;
+ rdata->got_bytes += result;
}
- return total_read > 0 ? total_read : result;
+ return rdata->got_bytes > 0 && result != -ECONNABORTED ?
+ rdata->got_bytes : result;
+}
+
+static int
+readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
+ unsigned int rsize, struct list_head *tmplist,
+ unsigned int *nr_pages, loff_t *offset, unsigned int *bytes)
+{
+ struct page *page, *tpage;
+ unsigned int expected_index;
+ int rc;
+
+ INIT_LIST_HEAD(tmplist);
+
+ page = list_entry(page_list->prev, struct page, lru);
+
+ /*
+ * Lock the page and put it in the cache. Since no one else
+ * should have access to this page, we're safe to simply set
+ * PG_locked without checking it first.
+ */
+ __set_page_locked(page);
+ rc = add_to_page_cache_locked(page, mapping,
+ page->index, GFP_KERNEL);
+
+ /* give up if we can't stick it in the cache */
+ if (rc) {
+ __clear_page_locked(page);
+ return rc;
+ }
+
+ /* move first page to the tmplist */
+ *offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
+ *bytes = PAGE_CACHE_SIZE;
+ *nr_pages = 1;
+ list_move_tail(&page->lru, tmplist);
+
+ /* now try and add more pages onto the request */
+ expected_index = page->index + 1;
+ list_for_each_entry_safe_reverse(page, tpage, page_list, lru) {
+ /* discontinuity ? */
+ if (page->index != expected_index)
+ break;
+
+ /* would this page push the read over the rsize? */
+ if (*bytes + PAGE_CACHE_SIZE > rsize)
+ break;
+
+ __set_page_locked(page);
+ if (add_to_page_cache_locked(page, mapping, page->index,
+ GFP_KERNEL)) {
+ __clear_page_locked(page);
+ break;
+ }
+ list_move_tail(&page->lru, tmplist);
+ (*bytes) += PAGE_CACHE_SIZE;
+ expected_index++;
+ (*nr_pages)++;
+ }
+ return rc;
}
static int cifs_readpages(struct file *file, struct address_space *mapping,
@@ -3241,19 +3463,10 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
struct list_head tmplist;
struct cifsFileInfo *open_file = file->private_data;
struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
- unsigned int rsize = cifs_sb->rsize;
+ struct TCP_Server_Info *server;
pid_t pid;
/*
- * Give up immediately if rsize is too small to read an entire page.
- * The VFS will fall back to readpage. We should never reach this
- * point however since we set ra_pages to 0 when the rsize is smaller
- * than a cache page.
- */
- if (unlikely(rsize < PAGE_CACHE_SIZE))
- return 0;
-
- /*
* Reads as many pages as possible from fscache. Returns -ENOBUFS
* immediately if the cookie is negative
*
@@ -3271,7 +3484,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
pid = current->tgid;
rc = 0;
- INIT_LIST_HEAD(&tmplist);
+ server = tlink_tcon(open_file->tlink)->ses->server;
cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n",
__func__, file, mapping, num_pages);
@@ -3288,58 +3501,35 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
* the rdata->pages, then we want them in increasing order.
*/
while (!list_empty(page_list)) {
- unsigned int i;
- unsigned int bytes = PAGE_CACHE_SIZE;
- unsigned int expected_index;
- unsigned int nr_pages = 1;
+ unsigned int i, nr_pages, bytes, rsize;
loff_t offset;
struct page *page, *tpage;
struct cifs_readdata *rdata;
+ unsigned credits;
- page = list_entry(page_list->prev, struct page, lru);
+ rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
+ &rsize, &credits);
+ if (rc)
+ break;
/*
- * Lock the page and put it in the cache. Since no one else
- * should have access to this page, we're safe to simply set
- * PG_locked without checking it first.
+ * Give up immediately if rsize is too small to read an entire
+ * page. The VFS will fall back to readpage. We should never
+ * reach this point however since we set ra_pages to 0 when the
+ * rsize is smaller than a cache page.
*/
- __set_page_locked(page);
- rc = add_to_page_cache_locked(page, mapping,
- page->index, GFP_KERNEL);
+ if (unlikely(rsize < PAGE_CACHE_SIZE)) {
+ add_credits_and_wake_if(server, credits, 0);
+ return 0;
+ }
- /* give up if we can't stick it in the cache */
+ rc = readpages_get_pages(mapping, page_list, rsize, &tmplist,
+ &nr_pages, &offset, &bytes);
if (rc) {
- __clear_page_locked(page);
+ add_credits_and_wake_if(server, credits, 0);
break;
}
- /* move first page to the tmplist */
- offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
- list_move_tail(&page->lru, &tmplist);
-
- /* now try and add more pages onto the request */
- expected_index = page->index + 1;
- list_for_each_entry_safe_reverse(page, tpage, page_list, lru) {
- /* discontinuity ? */
- if (page->index != expected_index)
- break;
-
- /* would this page push the read over the rsize? */
- if (bytes + PAGE_CACHE_SIZE > rsize)
- break;
-
- __set_page_locked(page);
- if (add_to_page_cache_locked(page, mapping,
- page->index, GFP_KERNEL)) {
- __clear_page_locked(page);
- break;
- }
- list_move_tail(&page->lru, &tmplist);
- bytes += PAGE_CACHE_SIZE;
- expected_index++;
- nr_pages++;
- }
-
rdata = cifs_readdata_alloc(nr_pages, cifs_readv_complete);
if (!rdata) {
/* best to give up if we're out of mem */
@@ -3350,6 +3540,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
page_cache_release(page);
}
rc = -ENOMEM;
+ add_credits_and_wake_if(server, credits, 0);
break;
}
@@ -3360,20 +3551,25 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
rdata->pid = pid;
rdata->pagesz = PAGE_CACHE_SIZE;
rdata->read_into_pages = cifs_readpages_read_into_pages;
+ rdata->credits = credits;
list_for_each_entry_safe(page, tpage, &tmplist, lru) {
list_del(&page->lru);
rdata->pages[rdata->nr_pages++] = page;
}
- rc = cifs_retry_async_readv(rdata);
- if (rc != 0) {
+ if (!rdata->cfile->invalidHandle ||
+ !cifs_reopen_file(rdata->cfile, true))
+ rc = server->ops->async_readv(rdata);
+ if (rc) {
+ add_credits_and_wake_if(server, rdata->credits, 0);
for (i = 0; i < rdata->nr_pages; i++) {
page = rdata->pages[i];
lru_cache_add_file(page);
unlock_page(page);
page_cache_release(page);
}
+ /* Fallback to the readpage in error/reconnect cases */
kref_put(&rdata->refcount, cifs_readdata_release);
break;
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 41de3935caa0..197cb503d528 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -30,6 +30,7 @@
#include "cifsproto.h"
#include "cifs_debug.h"
#include "cifs_fs_sb.h"
+#include "cifs_unicode.h"
#include "fscache.h"
@@ -412,7 +413,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
struct cifs_sb_info *cifs_sb, unsigned int xid)
{
int rc;
- int oplock = 0;
+ __u32 oplock;
struct tcon_link *tlink;
struct cifs_tcon *tcon;
struct cifs_fid fid;
@@ -451,8 +452,13 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
oparms.fid = &fid;
oparms.reconnect = false;
- rc = CIFS_open(xid, &oparms, &oplock, NULL);
+ if (tcon->ses->server->oplocks)
+ oplock = REQ_OPLOCK;
+ else
+ oplock = 0;
+ rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, NULL);
if (rc) {
+ cifs_dbg(FYI, "check sfu type of %s, open rc = %d\n", path, rc);
cifs_put_tlink(tlink);
return rc;
}
@@ -464,7 +470,8 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
io_parms.offset = 0;
io_parms.length = 24;
- rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type);
+ rc = tcon->ses->server->ops->sync_read(xid, &fid, &io_parms,
+ &bytes_read, &pbuf, &buf_type);
if ((rc == 0) && (bytes_read >= 8)) {
if (memcmp("IntxBLK", pbuf, 8) == 0) {
cifs_dbg(FYI, "Block device\n");
@@ -504,7 +511,8 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
fattr->cf_dtype = DT_REG;
rc = -EOPNOTSUPP; /* or some unknown SFU type */
}
- CIFSSMBClose(xid, tcon, fid.netfid);
+
+ tcon->ses->server->ops->close(xid, tcon, &fid);
cifs_put_tlink(tlink);
return rc;
}
@@ -539,7 +547,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
rc = tcon->ses->server->ops->query_all_EAs(xid, tcon, path,
"SETFILEBITS", ea_value, 4 /* size of buf */,
cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_remap(cifs_sb));
cifs_put_tlink(tlink);
if (rc < 0)
return (int)rc;
@@ -952,11 +960,18 @@ struct inode *cifs_root_iget(struct super_block *sb)
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
xid = get_xid();
- if (tcon->unix_ext)
+ if (tcon->unix_ext) {
rc = cifs_get_inode_info_unix(&inode, "", sb, xid);
- else
- rc = cifs_get_inode_info(&inode, "", NULL, sb, xid, NULL);
+ /* some servers mistakenly claim POSIX support */
+ if (rc != -EOPNOTSUPP)
+ goto iget_no_retry;
+ cifs_dbg(VFS, "server does not support POSIX extensions");
+ tcon->unix_ext = false;
+ }
+
+ rc = cifs_get_inode_info(&inode, "", NULL, sb, xid, NULL);
+iget_no_retry:
if (!inode) {
inode = ERR_PTR(rc);
goto out;
@@ -1117,8 +1132,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
/* rename the file */
rc = CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, NULL,
cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_remap(cifs_sb));
if (rc != 0) {
rc = -EBUSY;
goto undo_setattr;
@@ -1159,8 +1173,7 @@ out:
*/
undo_rename:
CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, dentry->d_name.name,
- cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_sb->local_nls, cifs_remap(cifs_sb));
undo_setattr:
if (dosattr != origattr) {
info_buf->Attributes = cpu_to_le32(origattr);
@@ -1226,7 +1239,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
rc = CIFSPOSIXDelFile(xid, tcon, full_path,
SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_remap(cifs_sb));
cifs_dbg(FYI, "posix del rc %d\n", rc);
if ((rc == 0) || (rc == -ENOENT))
goto psx_del_no_retry;
@@ -1349,8 +1362,7 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,
}
CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_remap(cifs_sb));
} else {
struct TCP_Server_Info *server = tcon->ses->server;
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
@@ -1392,8 +1404,7 @@ cifs_posix_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode,
mode &= ~current_umask();
rc = CIFSPOSIXCreate(xid, tcon, SMB_O_DIRECTORY | SMB_O_CREAT, mode,
NULL /* netfid */, info, &oplock, full_path,
- cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_sb->local_nls, cifs_remap(cifs_sb));
if (rc == -EOPNOTSUPP)
goto posix_mkdir_out;
else if (rc) {
@@ -1419,8 +1430,8 @@ cifs_posix_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode,
d_instantiate(dentry, newinode);
#ifdef CONFIG_CIFS_DEBUG2
- cifs_dbg(FYI, "instantiated dentry %p %s to inode %p\n",
- dentry, dentry->d_name.name, newinode);
+ cifs_dbg(FYI, "instantiated dentry %p %pd to inode %p\n",
+ dentry, dentry, newinode);
if (newinode->i_nlink != 2)
cifs_dbg(FYI, "unexpected number of links %d\n",
@@ -1617,8 +1628,7 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
if (rc == 0) {
rc = CIFSSMBRenameOpenFile(xid, tcon, fid.netfid,
(const char *) to_dentry->d_name.name,
- cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_sb->local_nls, cifs_remap(cifs_sb));
CIFSSMBClose(xid, tcon, fid.netfid);
}
do_rename_exit:
@@ -1627,8 +1637,9 @@ do_rename_exit:
}
int
-cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
- struct inode *target_dir, struct dentry *target_dentry)
+cifs_rename2(struct inode *source_dir, struct dentry *source_dentry,
+ struct inode *target_dir, struct dentry *target_dentry,
+ unsigned int flags)
{
char *from_name = NULL;
char *to_name = NULL;
@@ -1640,6 +1651,9 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
unsigned int xid;
int rc, tmprc;
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
cifs_sb = CIFS_SB(source_dir->i_sb);
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
@@ -1667,6 +1681,12 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry,
to_name);
+ /*
+ * No-replace is the natural behavior for CIFS, so skip unlink hacks.
+ */
+ if (flags & RENAME_NOREPLACE)
+ goto cifs_rename_exit;
+
if (rc == -EEXIST && tcon->unix_ext) {
/*
* Are src and dst hardlinks of same inode? We can only tell
@@ -1684,16 +1704,14 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
tmprc = CIFSSMBUnixQPathInfo(xid, tcon, from_name,
info_buf_source,
cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_remap(cifs_sb));
if (tmprc != 0)
goto unlink_target;
tmprc = CIFSSMBUnixQPathInfo(xid, tcon, to_name,
info_buf_target,
cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_remap(cifs_sb));
if (tmprc == 0 && (info_buf_source->UniqueId ==
info_buf_target->UniqueId)) {
@@ -1710,13 +1728,22 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
unlink_target:
/* Try unlinking the target dentry if it's not negative */
if (target_dentry->d_inode && (rc == -EACCES || rc == -EEXIST)) {
- tmprc = cifs_unlink(target_dir, target_dentry);
+ if (d_is_dir(target_dentry))
+ tmprc = cifs_rmdir(target_dir, target_dentry);
+ else
+ tmprc = cifs_unlink(target_dir, target_dentry);
if (tmprc)
goto cifs_rename_exit;
rc = cifs_do_rename(xid, source_dentry, from_name,
target_dentry, to_name);
}
+ /* force revalidate to go get info when needed */
+ CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0;
+
+ source_dir->i_ctime = source_dir->i_mtime = target_dir->i_ctime =
+ target_dir->i_mtime = current_fs_time(source_dir->i_sb);
+
cifs_rename_exit:
kfree(info_buf_source);
kfree(from_name);
@@ -2049,8 +2076,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
rc = SMBLegacyOpen(xid, tcon, full_path, FILE_OPEN,
GENERIC_WRITE, CREATE_NOT_DIR, &netfid,
&oplock, NULL, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_remap(cifs_sb));
if (rc == 0) {
unsigned int bytes_written;
@@ -2092,8 +2118,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
struct cifs_unix_set_info_args *args = NULL;
struct cifsFileInfo *open_file;
- cifs_dbg(FYI, "setattr_unix on file %s attrs->ia_valid=0x%x\n",
- direntry->d_name.name, attrs->ia_valid);
+ cifs_dbg(FYI, "setattr_unix on file %pd attrs->ia_valid=0x%x\n",
+ direntry, attrs->ia_valid);
xid = get_xid();
@@ -2235,8 +2261,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
xid = get_xid();
- cifs_dbg(FYI, "setattr on file %s attrs->iavalid 0x%x\n",
- direntry->d_name.name, attrs->ia_valid);
+ cifs_dbg(FYI, "setattr on file %pd attrs->iavalid 0x%x\n",
+ direntry, attrs->ia_valid);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
attrs->ia_valid |= ATTR_FORCE;
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 68559fd557fb..2ec6037f61c7 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -28,6 +28,10 @@
#include "cifsproto.h"
#include "cifs_debug.h"
#include "cifs_fs_sb.h"
+#include "cifs_unicode.h"
+#ifdef CONFIG_CIFS_SMB2
+#include "smb2proto.h"
+#endif
/*
* M-F Symlink Functions - Begin
@@ -213,8 +217,12 @@ create_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
goto out;
- rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon, cifs_sb,
- fromName, buf, &bytes_written);
+ if (tcon->ses->server->ops->create_mf_symlink)
+ rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon,
+ cifs_sb, fromName, buf, &bytes_written);
+ else
+ rc = -EOPNOTSUPP;
+
if (rc)
goto out;
@@ -339,9 +347,11 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
if (rc)
return rc;
- if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE))
+ if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
+ rc = -ENOENT;
/* it's not a symlink */
goto out;
+ }
io_parms.netfid = fid.netfid;
io_parms.pid = current->tgid;
@@ -395,6 +405,134 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
}
/*
+ * SMB 2.1/SMB3 Protocol specific functions
+ */
+#ifdef CONFIG_CIFS_SMB2
+int
+smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const unsigned char *path,
+ char *pbuf, unsigned int *pbytes_read)
+{
+ int rc;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
+ struct cifs_io_parms io_parms;
+ int buf_type = CIFS_NO_BUFFER;
+ __le16 *utf16_path;
+ __u8 oplock = SMB2_OPLOCK_LEVEL_II;
+ struct smb2_file_all_info *pfile_info = NULL;
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = GENERIC_READ;
+ oparms.create_options = CREATE_NOT_DIR;
+ if (backup_cred(cifs_sb))
+ oparms.create_options |= CREATE_OPEN_BACKUP_INTENT;
+ oparms.disposition = FILE_OPEN;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
+ if (utf16_path == NULL)
+ return -ENOMEM;
+
+ pfile_info = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
+ GFP_KERNEL);
+
+ if (pfile_info == NULL) {
+ kfree(utf16_path);
+ return -ENOMEM;
+ }
+
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, pfile_info, NULL);
+ if (rc)
+ goto qmf_out_open_fail;
+
+ if (pfile_info->EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
+ /* it's not a symlink */
+ rc = -ENOENT; /* Is there a better rc to return? */
+ goto qmf_out;
+ }
+
+ io_parms.netfid = fid.netfid;
+ io_parms.pid = current->tgid;
+ io_parms.tcon = tcon;
+ io_parms.offset = 0;
+ io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
+ io_parms.persistent_fid = fid.persistent_fid;
+ io_parms.volatile_fid = fid.volatile_fid;
+ rc = SMB2_read(xid, &io_parms, pbytes_read, &pbuf, &buf_type);
+qmf_out:
+ SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+qmf_out_open_fail:
+ kfree(utf16_path);
+ kfree(pfile_info);
+ return rc;
+}
+
+int
+smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const unsigned char *path,
+ char *pbuf, unsigned int *pbytes_written)
+{
+ int rc;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
+ struct cifs_io_parms io_parms;
+ int create_options = CREATE_NOT_DIR;
+ __le16 *utf16_path;
+ __u8 oplock = SMB2_OPLOCK_LEVEL_EXCLUSIVE;
+ struct kvec iov[2];
+
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+ cifs_dbg(FYI, "%s: path: %s\n", __func__, path);
+
+ utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
+ if (!utf16_path)
+ return -ENOMEM;
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = GENERIC_WRITE;
+ oparms.create_options = create_options;
+ oparms.disposition = FILE_CREATE;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
+ if (rc) {
+ kfree(utf16_path);
+ return rc;
+ }
+
+ io_parms.netfid = fid.netfid;
+ io_parms.pid = current->tgid;
+ io_parms.tcon = tcon;
+ io_parms.offset = 0;
+ io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
+ io_parms.persistent_fid = fid.persistent_fid;
+ io_parms.volatile_fid = fid.volatile_fid;
+
+ /* iov[0] is reserved for smb header */
+ iov[1].iov_base = pbuf;
+ iov[1].iov_len = CIFS_MF_SYMLINK_FILE_SIZE;
+
+ rc = SMB2_write(xid, &io_parms, pbytes_written, iov, 1);
+
+ /* Make sure we wrote all of the symlink data */
+ if ((rc == 0) && (*pbytes_written != CIFS_MF_SYMLINK_FILE_SIZE))
+ rc = -EIO;
+
+ SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+
+ kfree(utf16_path);
+ return rc;
+}
+#endif /* CONFIG_CIFS_SMB2 */
+
+/*
* M-F Symlink Functions - End
*/
@@ -429,8 +567,7 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
if (tcon->unix_ext)
rc = CIFSUnixCreateHardLink(xid, tcon, from_name, to_name,
cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_remap(cifs_sb));
else {
server = tcon->ses->server;
if (!server->ops->create_hardlink) {
@@ -455,11 +592,7 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
spin_lock(&old_file->d_inode->i_lock);
inc_nlink(old_file->d_inode);
spin_unlock(&old_file->d_inode->i_lock);
- /*
- * BB should we make this contingent on superblock flag
- * NOATIME?
- */
- /* old_file->d_inode->i_ctime = CURRENT_TIME; */
+
/*
* parent dir timestamps will update from srv within a
* second, would it really be worth it to set the parent
@@ -469,7 +602,9 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
}
/*
* if not oplocked will force revalidate to get info on source
- * file from srv
+ * file from srv. Note Samba server prior to 4.2 has bug -
+ * not updating src file ctime on hardlinks but Windows servers
+ * handle it properly
*/
cifsInode->time = 0;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 6bf55d0ed494..b7415d596dbd 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -226,6 +226,15 @@ cifs_small_buf_release(void *buf_to_free)
return;
}
+void
+free_rsp_buf(int resp_buftype, void *rsp)
+{
+ if (resp_buftype == CIFS_SMALL_BUFFER)
+ cifs_small_buf_release(rsp);
+ else if (resp_buftype == CIFS_LARGE_BUFFER)
+ cifs_buf_release(rsp);
+}
+
/* NB: MID can not be set if treeCon not passed in, in that
case it is responsbility of caller to set the mid */
void
@@ -414,7 +423,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
return true;
}
if (pSMBr->hdr.Status.CifsError) {
- cifs_dbg(FYI, "notify err 0x%d\n",
+ cifs_dbg(FYI, "notify err 0x%x\n",
pSMBr->hdr.Status.CifsError);
return true;
}
@@ -441,7 +450,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
if (pSMB->hdr.WordCount != 8)
return false;
- cifs_dbg(FYI, "oplock type 0x%d level 0x%d\n",
+ cifs_dbg(FYI, "oplock type 0x%x level 0x%x\n",
pSMB->LockType, pSMB->OplockLevel);
if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE))
return false;
@@ -565,13 +574,6 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
cinode->oplock = 0;
}
-static int
-cifs_oplock_break_wait(void *unused)
-{
- schedule();
- return signal_pending(current) ? -ERESTARTSYS : 0;
-}
-
/*
* We wait for oplock breaks to be processed before we attempt to perform
* writes.
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 6834b9c3bec1..b333ff60781d 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -925,11 +925,23 @@ cifs_NTtimeToUnix(__le64 ntutc)
/* BB what about the timezone? BB */
/* Subtract the NTFS time offset, then convert to 1s intervals. */
- u64 t;
+ s64 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
+
+ /*
+ * Unfortunately can not use normal 64 bit division on 32 bit arch, but
+ * the alternative, do_div, does not work with negative numbers so have
+ * to special case them
+ */
+ if (t < 0) {
+ t = -t;
+ ts.tv_nsec = (long)(do_div(t, 10000000) * 100);
+ ts.tv_nsec = -ts.tv_nsec;
+ ts.tv_sec = -t;
+ } else {
+ ts.tv_nsec = (long)do_div(t, 10000000) * 100;
+ ts.tv_sec = t;
+ }
- t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
- ts.tv_nsec = do_div(t, 10000000) * 100;
- ts.tv_sec = t;
return ts;
}
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index b15862e0f68c..8fd2a95860ba 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -87,8 +87,6 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
return;
if (dentry) {
- int err;
-
inode = dentry->d_inode;
if (inode) {
/*
@@ -105,10 +103,8 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
goto out;
}
}
- err = d_invalidate(dentry);
+ d_invalidate(dentry);
dput(dentry);
- if (err)
- return;
}
/*
@@ -243,7 +239,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ,
OPEN_REPARSE_POINT, &fid, &oplock, NULL,
cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_remap(cifs_sb);
if (!rc) {
tmpbuffer = kmalloc(maxpath);
rc = CIFSSMBQueryReparseLinkInfo(xid, ptcon, full_path,
@@ -593,11 +589,11 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
/* close and restart search */
cifs_dbg(FYI, "search backing up - close and restart search\n");
spin_lock(&cifs_file_list_lock);
- if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) {
+ if (server->ops->dir_needs_close(cfile)) {
cfile->invalidHandle = true;
spin_unlock(&cifs_file_list_lock);
- if (server->ops->close)
- server->ops->close(xid, tcon, &cfile->fid);
+ if (server->ops->close_dir)
+ server->ops->close_dir(xid, tcon, &cfile->fid);
} else
spin_unlock(&cifs_file_list_lock);
if (cfile->srch_inf.ntwrk_buf_start) {
@@ -708,15 +704,15 @@ static int cifs_filldir(char *find_entry, struct file *file,
if (file_info->srch_inf.unicode) {
struct nls_table *nlt = cifs_sb->local_nls;
+ int map_type;
+ map_type = cifs_remap(cifs_sb);
name.name = scratch_buf;
name.len =
cifs_from_utf16((char *)name.name, (__le16 *)de.name,
UNICODE_NAME_MAX,
min_t(size_t, de.namelen,
- (size_t)max_len), nlt,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ (size_t)max_len), nlt, map_type);
name.len -= nls_nullsize(nlt);
} else {
name.name = de.name;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index e87387dbf39f..57db63ff88da 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -243,10 +243,11 @@ static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
kfree(ses->serverOS);
ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
- if (ses->serverOS)
+ if (ses->serverOS) {
strncpy(ses->serverOS, bcc_ptr, len);
- if (strncmp(ses->serverOS, "OS/2", 4) == 0)
- cifs_dbg(FYI, "OS/2 server\n");
+ if (strncmp(ses->serverOS, "OS/2", 4) == 0)
+ cifs_dbg(FYI, "OS/2 server\n");
+ }
bcc_ptr += len + 1;
bleft -= len + 1;
@@ -520,382 +521,551 @@ select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
}
}
-int
-CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
- const struct nls_table *nls_cp)
+struct sess_data {
+ unsigned int xid;
+ struct cifs_ses *ses;
+ struct nls_table *nls_cp;
+ void (*func)(struct sess_data *);
+ int result;
+
+ /* we will send the SMB in three pieces:
+ * a fixed length beginning part, an optional
+ * SPNEGO blob (which can be zero length), and a
+ * last part which will include the strings
+ * and rest of bcc area. This allows us to avoid
+ * a large buffer 17K allocation
+ */
+ int buf0_type;
+ struct kvec iov[3];
+};
+
+static int
+sess_alloc_buffer(struct sess_data *sess_data, int wct)
{
- int rc = 0;
- int wct;
+ int rc;
+ struct cifs_ses *ses = sess_data->ses;
struct smb_hdr *smb_buf;
- char *bcc_ptr;
- char *str_area;
- SESSION_SETUP_ANDX *pSMB;
- __u32 capabilities;
- __u16 count;
- int resp_buf_type;
- struct kvec iov[3];
- enum securityEnum type;
- __u16 action, bytes_remaining;
- struct key *spnego_key = NULL;
- __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
- u16 blob_len;
- char *ntlmsspblob = NULL;
- if (ses == NULL) {
- WARN(1, "%s: ses == NULL!", __func__);
- return -EINVAL;
- }
+ rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses,
+ (void **)&smb_buf);
- type = select_sectype(ses->server, ses->sectype);
- cifs_dbg(FYI, "sess setup type %d\n", type);
- if (type == Unspecified) {
- cifs_dbg(VFS,
- "Unable to select appropriate authentication method!");
- return -EINVAL;
+ if (rc)
+ return rc;
+
+ sess_data->iov[0].iov_base = (char *)smb_buf;
+ sess_data->iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4;
+ /*
+ * This variable will be used to clear the buffer
+ * allocated above in case of any error in the calling function.
+ */
+ sess_data->buf0_type = CIFS_SMALL_BUFFER;
+
+ /* 2000 big enough to fit max user, domain, NOS name etc. */
+ sess_data->iov[2].iov_base = kmalloc(2000, GFP_KERNEL);
+ if (!sess_data->iov[2].iov_base) {
+ rc = -ENOMEM;
+ goto out_free_smb_buf;
}
- if (type == RawNTLMSSP) {
- /* if memory allocation is successful, caller of this function
- * frees it.
- */
- ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
- if (!ses->ntlmssp)
- return -ENOMEM;
- ses->ntlmssp->sesskey_per_smbsess = false;
+ return 0;
+out_free_smb_buf:
+ kfree(smb_buf);
+ sess_data->iov[0].iov_base = NULL;
+ sess_data->iov[0].iov_len = 0;
+ sess_data->buf0_type = CIFS_NO_BUFFER;
+ return rc;
+}
+
+static void
+sess_free_buffer(struct sess_data *sess_data)
+{
+
+ free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base);
+ sess_data->buf0_type = CIFS_NO_BUFFER;
+ kfree(sess_data->iov[2].iov_base);
+}
+
+static int
+sess_establish_session(struct sess_data *sess_data)
+{
+ struct cifs_ses *ses = sess_data->ses;
+
+ mutex_lock(&ses->server->srv_mutex);
+ if (!ses->server->session_estab) {
+ if (ses->server->sign) {
+ ses->server->session_key.response =
+ kmemdup(ses->auth_key.response,
+ ses->auth_key.len, GFP_KERNEL);
+ if (!ses->server->session_key.response) {
+ mutex_unlock(&ses->server->srv_mutex);
+ return -ENOMEM;
+ }
+ ses->server->session_key.len =
+ ses->auth_key.len;
+ }
+ ses->server->sequence_number = 0x2;
+ ses->server->session_estab = true;
}
+ mutex_unlock(&ses->server->srv_mutex);
-ssetup_ntlmssp_authenticate:
- if (phase == NtLmChallenge)
- phase = NtLmAuthenticate; /* if ntlmssp, now final phase */
+ cifs_dbg(FYI, "CIFS session established successfully\n");
+ spin_lock(&GlobalMid_Lock);
+ ses->status = CifsGood;
+ ses->need_reconnect = false;
+ spin_unlock(&GlobalMid_Lock);
- if (type == LANMAN) {
-#ifndef CONFIG_CIFS_WEAK_PW_HASH
- /* LANMAN and plaintext are less secure and off by default.
- So we make this explicitly be turned on in kconfig (in the
- build) and turned on at runtime (changed from the default)
- in proc/fs/cifs or via mount parm. Unfortunately this is
- needed for old Win (e.g. Win95), some obscure NAS and OS/2 */
- return -EOPNOTSUPP;
-#endif
- wct = 10; /* lanman 2 style sessionsetup */
- } else if ((type == NTLM) || (type == NTLMv2)) {
- /* For NTLMv2 failures eventually may need to retry NTLM */
- wct = 13; /* old style NTLM sessionsetup */
- } else /* same size: negotiate or auth, NTLMSSP or extended security */
- wct = 12;
+ return 0;
+}
- rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses,
- (void **)&smb_buf);
- if (rc)
- return rc;
+static int
+sess_sendreceive(struct sess_data *sess_data)
+{
+ int rc;
+ struct smb_hdr *smb_buf = (struct smb_hdr *) sess_data->iov[0].iov_base;
+ __u16 count;
+
+ count = sess_data->iov[1].iov_len + sess_data->iov[2].iov_len;
+ smb_buf->smb_buf_length =
+ cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count);
+ put_bcc(count, smb_buf);
+
+ rc = SendReceive2(sess_data->xid, sess_data->ses,
+ sess_data->iov, 3 /* num_iovecs */,
+ &sess_data->buf0_type,
+ CIFS_LOG_ERROR);
- pSMB = (SESSION_SETUP_ANDX *)smb_buf;
+ return rc;
+}
+/*
+ * LANMAN and plaintext are less secure and off by default.
+ * So we make this explicitly be turned on in kconfig (in the
+ * build) and turned on at runtime (changed from the default)
+ * in proc/fs/cifs or via mount parm. Unfortunately this is
+ * needed for old Win (e.g. Win95), some obscure NAS and OS/2
+ */
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+static void
+sess_auth_lanman(struct sess_data *sess_data)
+{
+ int rc = 0;
+ struct smb_hdr *smb_buf;
+ SESSION_SETUP_ANDX *pSMB;
+ char *bcc_ptr;
+ struct cifs_ses *ses = sess_data->ses;
+ char lnm_session_key[CIFS_AUTH_RESP_SIZE];
+ __u32 capabilities;
+ __u16 bytes_remaining;
+
+ /* lanman 2 style sessionsetup */
+ /* wct = 10 */
+ rc = sess_alloc_buffer(sess_data, 10);
+ if (rc)
+ goto out;
+
+ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+ bcc_ptr = sess_data->iov[2].iov_base;
capabilities = cifs_ssetup_hdr(ses, pSMB);
- /* we will send the SMB in three pieces:
- a fixed length beginning part, an optional
- SPNEGO blob (which can be zero length), and a
- last part which will include the strings
- and rest of bcc area. This allows us to avoid
- a large buffer 17K allocation */
- iov[0].iov_base = (char *)pSMB;
- iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4;
-
- /* setting this here allows the code at the end of the function
- to free the request buffer if there's an error */
- resp_buf_type = CIFS_SMALL_BUFFER;
+ pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
- /* 2000 big enough to fit max user, domain, NOS name etc. */
- str_area = kmalloc(2000, GFP_KERNEL);
- if (str_area == NULL) {
- rc = -ENOMEM;
- goto ssetup_exit;
- }
- bcc_ptr = str_area;
+ /* no capabilities flags in old lanman negotiation */
+ pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
- iov[1].iov_base = NULL;
- iov[1].iov_len = 0;
+ /* Calculate hash with password and copy into bcc_ptr.
+ * Encryption Key (stored as in cryptkey) gets used if the
+ * security mode bit in Negottiate Protocol response states
+ * to use challenge/response method (i.e. Password bit is 1).
+ */
+ rc = calc_lanman_hash(ses->password, ses->server->cryptkey,
+ ses->server->sec_mode & SECMODE_PW_ENCRYPT ?
+ true : false, lnm_session_key);
- if (type == LANMAN) {
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
- char lnm_session_key[CIFS_AUTH_RESP_SIZE];
+ memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
+ bcc_ptr += CIFS_AUTH_RESP_SIZE;
+
+ /*
+ * can not sign if LANMAN negotiated so no need
+ * to calculate signing key? but what if server
+ * changed to do higher than lanman dialect and
+ * we reconnected would we ever calc signing_key?
+ */
- pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
+ cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n");
+ /* Unicode not allowed for LANMAN dialects */
+ ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
- /* no capabilities flags in old lanman negotiation */
+ sess_data->iov[2].iov_len = (long) bcc_ptr -
+ (long) sess_data->iov[2].iov_base;
+
+ rc = sess_sendreceive(sess_data);
+ if (rc)
+ goto out;
- pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
+ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+ smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
- /* Calculate hash with password and copy into bcc_ptr.
- * Encryption Key (stored as in cryptkey) gets used if the
- * security mode bit in Negottiate Protocol response states
- * to use challenge/response method (i.e. Password bit is 1).
- */
+ /* lanman response has a word count of 3 */
+ if (smb_buf->WordCount != 3) {
+ rc = -EIO;
+ cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
+ goto out;
+ }
- rc = calc_lanman_hash(ses->password, ses->server->cryptkey,
- ses->server->sec_mode & SECMODE_PW_ENCRYPT ?
- true : false, lnm_session_key);
+ if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
+ cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
- memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
- bcc_ptr += CIFS_AUTH_RESP_SIZE;
+ ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
+ cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
- /* can not sign if LANMAN negotiated so no need
- to calculate signing key? but what if server
- changed to do higher than lanman dialect and
- we reconnected would we ever calc signing_key? */
+ bytes_remaining = get_bcc(smb_buf);
+ bcc_ptr = pByteArea(smb_buf);
+
+ /* BB check if Unicode and decode strings */
+ if (bytes_remaining == 0) {
+ /* no string area to decode, do nothing */
+ } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
+ /* unicode string area must be word-aligned */
+ if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
+ ++bcc_ptr;
+ --bytes_remaining;
+ }
+ decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
+ sess_data->nls_cp);
+ } else {
+ decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
+ sess_data->nls_cp);
+ }
+
+ rc = sess_establish_session(sess_data);
+out:
+ sess_data->result = rc;
+ sess_data->func = NULL;
+ sess_free_buffer(sess_data);
+}
- cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n");
- /* Unicode not allowed for LANMAN dialects */
- ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
#endif
- } else if (type == NTLM) {
- pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
- pSMB->req_no_secext.CaseInsensitivePasswordLength =
+
+static void
+sess_auth_ntlm(struct sess_data *sess_data)
+{
+ int rc = 0;
+ struct smb_hdr *smb_buf;
+ SESSION_SETUP_ANDX *pSMB;
+ char *bcc_ptr;
+ struct cifs_ses *ses = sess_data->ses;
+ __u32 capabilities;
+ __u16 bytes_remaining;
+
+ /* old style NTLM sessionsetup */
+ /* wct = 13 */
+ rc = sess_alloc_buffer(sess_data, 13);
+ if (rc)
+ goto out;
+
+ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+ bcc_ptr = sess_data->iov[2].iov_base;
+ capabilities = cifs_ssetup_hdr(ses, pSMB);
+
+ pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
+ pSMB->req_no_secext.CaseInsensitivePasswordLength =
cpu_to_le16(CIFS_AUTH_RESP_SIZE);
- pSMB->req_no_secext.CaseSensitivePasswordLength =
+ pSMB->req_no_secext.CaseSensitivePasswordLength =
cpu_to_le16(CIFS_AUTH_RESP_SIZE);
- /* calculate ntlm response and session key */
- rc = setup_ntlm_response(ses, nls_cp);
- if (rc) {
- cifs_dbg(VFS, "Error %d during NTLM authentication\n",
+ /* calculate ntlm response and session key */
+ rc = setup_ntlm_response(ses, sess_data->nls_cp);
+ if (rc) {
+ cifs_dbg(VFS, "Error %d during NTLM authentication\n",
rc);
- goto ssetup_exit;
- }
+ goto out;
+ }
- /* copy ntlm response */
- memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
- CIFS_AUTH_RESP_SIZE);
- bcc_ptr += CIFS_AUTH_RESP_SIZE;
- memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
- CIFS_AUTH_RESP_SIZE);
- bcc_ptr += CIFS_AUTH_RESP_SIZE;
-
- if (ses->capabilities & CAP_UNICODE) {
- /* unicode strings must be word aligned */
- if (iov[0].iov_len % 2) {
- *bcc_ptr = 0;
- bcc_ptr++;
- }
- unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
- } else
- ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
- } else if (type == NTLMv2) {
- pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
-
- /* LM2 password would be here if we supported it */
- pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
-
- /* calculate nlmv2 response and session key */
- rc = setup_ntlmv2_rsp(ses, nls_cp);
- if (rc) {
- cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n",
- rc);
- goto ssetup_exit;
+ /* copy ntlm response */
+ memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+ CIFS_AUTH_RESP_SIZE);
+ bcc_ptr += CIFS_AUTH_RESP_SIZE;
+ memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+ CIFS_AUTH_RESP_SIZE);
+ bcc_ptr += CIFS_AUTH_RESP_SIZE;
+
+ if (ses->capabilities & CAP_UNICODE) {
+ /* unicode strings must be word aligned */
+ if (sess_data->iov[0].iov_len % 2) {
+ *bcc_ptr = 0;
+ bcc_ptr++;
}
- memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
- ses->auth_key.len - CIFS_SESS_KEY_SIZE);
- bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
-
- /* set case sensitive password length after tilen may get
- * assigned, tilen is 0 otherwise.
- */
- pSMB->req_no_secext.CaseSensitivePasswordLength =
- cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+ unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
+ } else {
+ ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
+ }
- if (ses->capabilities & CAP_UNICODE) {
- if (iov[0].iov_len % 2) {
- *bcc_ptr = 0;
- bcc_ptr++;
- }
- unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
- } else
- ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
- } else if (type == Kerberos) {
-#ifdef CONFIG_CIFS_UPCALL
- struct cifs_spnego_msg *msg;
- spnego_key = cifs_get_spnego_key(ses);
- if (IS_ERR(spnego_key)) {
- rc = PTR_ERR(spnego_key);
- spnego_key = NULL;
- goto ssetup_exit;
- }
+ sess_data->iov[2].iov_len = (long) bcc_ptr -
+ (long) sess_data->iov[2].iov_base;
- msg = spnego_key->payload.data;
- /* check version field to make sure that cifs.upcall is
- sending us a response in an expected form */
- if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
- cifs_dbg(VFS, "incorrect version of cifs.upcall "
- "expected %d but got %d)",
- CIFS_SPNEGO_UPCALL_VERSION, msg->version);
- rc = -EKEYREJECTED;
- goto ssetup_exit;
- }
+ rc = sess_sendreceive(sess_data);
+ if (rc)
+ goto out;
- ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
- GFP_KERNEL);
- if (!ses->auth_key.response) {
- cifs_dbg(VFS,
- "Kerberos can't allocate (%u bytes) memory",
- msg->sesskey_len);
- rc = -ENOMEM;
- goto ssetup_exit;
- }
- ses->auth_key.len = msg->sesskey_len;
-
- pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
- capabilities |= CAP_EXTENDED_SECURITY;
- pSMB->req.Capabilities = cpu_to_le32(capabilities);
- iov[1].iov_base = msg->data + msg->sesskey_len;
- iov[1].iov_len = msg->secblob_len;
- pSMB->req.SecurityBlobLength = cpu_to_le16(iov[1].iov_len);
-
- if (ses->capabilities & CAP_UNICODE) {
- /* unicode strings must be word aligned */
- if ((iov[0].iov_len + iov[1].iov_len) % 2) {
- *bcc_ptr = 0;
- bcc_ptr++;
- }
- unicode_oslm_strings(&bcc_ptr, nls_cp);
- unicode_domain_string(&bcc_ptr, ses, nls_cp);
- } else
- /* BB: is this right? */
- ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
-#else /* ! CONFIG_CIFS_UPCALL */
- cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n");
- rc = -ENOSYS;
- goto ssetup_exit;
-#endif /* CONFIG_CIFS_UPCALL */
- } else if (type == RawNTLMSSP) {
- if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
- cifs_dbg(VFS, "NTLMSSP requires Unicode support\n");
- rc = -ENOSYS;
- goto ssetup_exit;
- }
+ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+ smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
- cifs_dbg(FYI, "ntlmssp session setup phase %d\n", phase);
- pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
- capabilities |= CAP_EXTENDED_SECURITY;
- pSMB->req.Capabilities |= cpu_to_le32(capabilities);
- switch(phase) {
- case NtLmNegotiate:
- build_ntlmssp_negotiate_blob(
- pSMB->req.SecurityBlob, ses);
- iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
- iov[1].iov_base = pSMB->req.SecurityBlob;
- pSMB->req.SecurityBlobLength =
- cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
- break;
- case NtLmAuthenticate:
- /*
- * 5 is an empirical value, large enough to hold
- * authenticate message plus max 10 of av paris,
- * domain, user, workstation names, flags, etc.
- */
- ntlmsspblob = kzalloc(
- 5*sizeof(struct _AUTHENTICATE_MESSAGE),
- GFP_KERNEL);
- if (!ntlmsspblob) {
- rc = -ENOMEM;
- goto ssetup_exit;
- }
+ if (smb_buf->WordCount != 3) {
+ rc = -EIO;
+ cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
+ goto out;
+ }
- rc = build_ntlmssp_auth_blob(ntlmsspblob,
- &blob_len, ses, nls_cp);
- if (rc)
- goto ssetup_exit;
- iov[1].iov_len = blob_len;
- iov[1].iov_base = ntlmsspblob;
- pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len);
- /*
- * Make sure that we tell the server that we are using
- * the uid that it just gave us back on the response
- * (challenge)
- */
- smb_buf->Uid = ses->Suid;
- break;
- default:
- cifs_dbg(VFS, "invalid phase %d\n", phase);
- rc = -ENOSYS;
- goto ssetup_exit;
+ if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
+ cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
+
+ ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
+ cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
+
+ bytes_remaining = get_bcc(smb_buf);
+ bcc_ptr = pByteArea(smb_buf);
+
+ /* BB check if Unicode and decode strings */
+ if (bytes_remaining == 0) {
+ /* no string area to decode, do nothing */
+ } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
+ /* unicode string area must be word-aligned */
+ if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
+ ++bcc_ptr;
+ --bytes_remaining;
}
- /* unicode strings must be word aligned */
- if ((iov[0].iov_len + iov[1].iov_len) % 2) {
+ decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
+ sess_data->nls_cp);
+ } else {
+ decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
+ sess_data->nls_cp);
+ }
+
+ rc = sess_establish_session(sess_data);
+out:
+ sess_data->result = rc;
+ sess_data->func = NULL;
+ sess_free_buffer(sess_data);
+ kfree(ses->auth_key.response);
+ ses->auth_key.response = NULL;
+}
+
+static void
+sess_auth_ntlmv2(struct sess_data *sess_data)
+{
+ int rc = 0;
+ struct smb_hdr *smb_buf;
+ SESSION_SETUP_ANDX *pSMB;
+ char *bcc_ptr;
+ struct cifs_ses *ses = sess_data->ses;
+ __u32 capabilities;
+ __u16 bytes_remaining;
+
+ /* old style NTLM sessionsetup */
+ /* wct = 13 */
+ rc = sess_alloc_buffer(sess_data, 13);
+ if (rc)
+ goto out;
+
+ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+ bcc_ptr = sess_data->iov[2].iov_base;
+ capabilities = cifs_ssetup_hdr(ses, pSMB);
+
+ pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
+
+ /* LM2 password would be here if we supported it */
+ pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
+
+ /* calculate nlmv2 response and session key */
+ rc = setup_ntlmv2_rsp(ses, sess_data->nls_cp);
+ if (rc) {
+ cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", rc);
+ goto out;
+ }
+
+ memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+ ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+ bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
+
+ /* set case sensitive password length after tilen may get
+ * assigned, tilen is 0 otherwise.
+ */
+ pSMB->req_no_secext.CaseSensitivePasswordLength =
+ cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+
+ if (ses->capabilities & CAP_UNICODE) {
+ if (sess_data->iov[0].iov_len % 2) {
*bcc_ptr = 0;
bcc_ptr++;
}
- unicode_oslm_strings(&bcc_ptr, nls_cp);
+ unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
} else {
- cifs_dbg(VFS, "secType %d not supported!\n", type);
- rc = -ENOSYS;
- goto ssetup_exit;
+ ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
}
- iov[2].iov_base = str_area;
- iov[2].iov_len = (long) bcc_ptr - (long) str_area;
- count = iov[1].iov_len + iov[2].iov_len;
- smb_buf->smb_buf_length =
- cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count);
+ sess_data->iov[2].iov_len = (long) bcc_ptr -
+ (long) sess_data->iov[2].iov_base;
- put_bcc(count, smb_buf);
+ rc = sess_sendreceive(sess_data);
+ if (rc)
+ goto out;
- rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type,
- CIFS_LOG_ERROR);
- /* SMB request buf freed in SendReceive2 */
+ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+ smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
- pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
- smb_buf = (struct smb_hdr *)iov[0].iov_base;
+ if (smb_buf->WordCount != 3) {
+ rc = -EIO;
+ cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
+ goto out;
+ }
- if ((type == RawNTLMSSP) && (resp_buf_type != CIFS_NO_BUFFER) &&
- (smb_buf->Status.CifsError ==
- cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) {
- if (phase != NtLmNegotiate) {
- cifs_dbg(VFS, "Unexpected more processing error\n");
- goto ssetup_exit;
+ if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
+ cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
+
+ ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
+ cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
+
+ bytes_remaining = get_bcc(smb_buf);
+ bcc_ptr = pByteArea(smb_buf);
+
+ /* BB check if Unicode and decode strings */
+ if (bytes_remaining == 0) {
+ /* no string area to decode, do nothing */
+ } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
+ /* unicode string area must be word-aligned */
+ if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
+ ++bcc_ptr;
+ --bytes_remaining;
+ }
+ decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
+ sess_data->nls_cp);
+ } else {
+ decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
+ sess_data->nls_cp);
+ }
+
+ rc = sess_establish_session(sess_data);
+out:
+ sess_data->result = rc;
+ sess_data->func = NULL;
+ sess_free_buffer(sess_data);
+ kfree(ses->auth_key.response);
+ ses->auth_key.response = NULL;
+}
+
+#ifdef CONFIG_CIFS_UPCALL
+static void
+sess_auth_kerberos(struct sess_data *sess_data)
+{
+ int rc = 0;
+ struct smb_hdr *smb_buf;
+ SESSION_SETUP_ANDX *pSMB;
+ char *bcc_ptr;
+ struct cifs_ses *ses = sess_data->ses;
+ __u32 capabilities;
+ __u16 bytes_remaining;
+ struct key *spnego_key = NULL;
+ struct cifs_spnego_msg *msg;
+ u16 blob_len;
+
+ /* extended security */
+ /* wct = 12 */
+ rc = sess_alloc_buffer(sess_data, 12);
+ if (rc)
+ goto out;
+
+ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+ bcc_ptr = sess_data->iov[2].iov_base;
+ capabilities = cifs_ssetup_hdr(ses, pSMB);
+
+ spnego_key = cifs_get_spnego_key(ses);
+ if (IS_ERR(spnego_key)) {
+ rc = PTR_ERR(spnego_key);
+ spnego_key = NULL;
+ goto out;
+ }
+
+ msg = spnego_key->payload.data;
+ /*
+ * check version field to make sure that cifs.upcall is
+ * sending us a response in an expected form
+ */
+ if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
+ cifs_dbg(VFS,
+ "incorrect version of cifs.upcall (expected %d but got %d)",
+ CIFS_SPNEGO_UPCALL_VERSION, msg->version);
+ rc = -EKEYREJECTED;
+ goto out_put_spnego_key;
+ }
+
+ ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
+ GFP_KERNEL);
+ if (!ses->auth_key.response) {
+ cifs_dbg(VFS, "Kerberos can't allocate (%u bytes) memory",
+ msg->sesskey_len);
+ rc = -ENOMEM;
+ goto out_put_spnego_key;
+ }
+ ses->auth_key.len = msg->sesskey_len;
+
+ pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
+ capabilities |= CAP_EXTENDED_SECURITY;
+ pSMB->req.Capabilities = cpu_to_le32(capabilities);
+ sess_data->iov[1].iov_base = msg->data + msg->sesskey_len;
+ sess_data->iov[1].iov_len = msg->secblob_len;
+ pSMB->req.SecurityBlobLength = cpu_to_le16(sess_data->iov[1].iov_len);
+
+ if (ses->capabilities & CAP_UNICODE) {
+ /* unicode strings must be word aligned */
+ if ((sess_data->iov[0].iov_len
+ + sess_data->iov[1].iov_len) % 2) {
+ *bcc_ptr = 0;
+ bcc_ptr++;
}
- /* NTLMSSP Negotiate sent now processing challenge (response) */
- phase = NtLmChallenge; /* process ntlmssp challenge */
- rc = 0; /* MORE_PROC rc is not an error here, but expected */
+ unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp);
+ unicode_domain_string(&bcc_ptr, ses, sess_data->nls_cp);
+ } else {
+ /* BB: is this right? */
+ ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
}
+
+ sess_data->iov[2].iov_len = (long) bcc_ptr -
+ (long) sess_data->iov[2].iov_base;
+
+ rc = sess_sendreceive(sess_data);
if (rc)
- goto ssetup_exit;
+ goto out_put_spnego_key;
- if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) {
+ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+ smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
+
+ if (smb_buf->WordCount != 4) {
rc = -EIO;
cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
- goto ssetup_exit;
+ goto out_put_spnego_key;
}
- action = le16_to_cpu(pSMB->resp.Action);
- if (action & GUEST_LOGIN)
+
+ if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
+
ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
- /* response can have either 3 or 4 word count - Samba sends 3 */
- /* and lanman response is 3 */
+
bytes_remaining = get_bcc(smb_buf);
bcc_ptr = pByteArea(smb_buf);
- if (smb_buf->WordCount == 4) {
- blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
- if (blob_len > bytes_remaining) {
- cifs_dbg(VFS, "bad security blob length %d\n",
- blob_len);
- rc = -EINVAL;
- goto ssetup_exit;
- }
- if (phase == NtLmChallenge) {
- rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses);
- /* now goto beginning for ntlmssp authenticate phase */
- if (rc)
- goto ssetup_exit;
- }
- bcc_ptr += blob_len;
- bytes_remaining -= blob_len;
+ blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
+ if (blob_len > bytes_remaining) {
+ cifs_dbg(VFS, "bad security blob length %d\n",
+ blob_len);
+ rc = -EINVAL;
+ goto out_put_spnego_key;
}
+ bcc_ptr += blob_len;
+ bytes_remaining -= blob_len;
/* BB check if Unicode and decode strings */
if (bytes_remaining == 0) {
@@ -906,60 +1076,362 @@ ssetup_ntlmssp_authenticate:
++bcc_ptr;
--bytes_remaining;
}
- decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp);
+ decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
+ sess_data->nls_cp);
} else {
- decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp);
+ decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
+ sess_data->nls_cp);
}
-ssetup_exit:
- if (spnego_key) {
- key_invalidate(spnego_key);
- key_put(spnego_key);
+ rc = sess_establish_session(sess_data);
+out_put_spnego_key:
+ key_invalidate(spnego_key);
+ key_put(spnego_key);
+out:
+ sess_data->result = rc;
+ sess_data->func = NULL;
+ sess_free_buffer(sess_data);
+ kfree(ses->auth_key.response);
+ ses->auth_key.response = NULL;
+}
+
+#endif /* ! CONFIG_CIFS_UPCALL */
+
+/*
+ * The required kvec buffers have to be allocated before calling this
+ * function.
+ */
+static int
+_sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data)
+{
+ struct smb_hdr *smb_buf;
+ SESSION_SETUP_ANDX *pSMB;
+ struct cifs_ses *ses = sess_data->ses;
+ __u32 capabilities;
+ char *bcc_ptr;
+
+ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+ smb_buf = (struct smb_hdr *)pSMB;
+
+ capabilities = cifs_ssetup_hdr(ses, pSMB);
+ if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
+ cifs_dbg(VFS, "NTLMSSP requires Unicode support\n");
+ return -ENOSYS;
}
- kfree(str_area);
- kfree(ntlmsspblob);
- ntlmsspblob = NULL;
- if (resp_buf_type == CIFS_SMALL_BUFFER) {
- cifs_dbg(FYI, "ssetup freeing small buf %p\n", iov[0].iov_base);
- cifs_small_buf_release(iov[0].iov_base);
- } else if (resp_buf_type == CIFS_LARGE_BUFFER)
- cifs_buf_release(iov[0].iov_base);
- /* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */
- if ((phase == NtLmChallenge) && (rc == 0))
- goto ssetup_ntlmssp_authenticate;
+ pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
+ capabilities |= CAP_EXTENDED_SECURITY;
+ pSMB->req.Capabilities |= cpu_to_le32(capabilities);
+
+ bcc_ptr = sess_data->iov[2].iov_base;
+ /* unicode strings must be word aligned */
+ if ((sess_data->iov[0].iov_len + sess_data->iov[1].iov_len) % 2) {
+ *bcc_ptr = 0;
+ bcc_ptr++;
+ }
+ unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp);
+
+ sess_data->iov[2].iov_len = (long) bcc_ptr -
+ (long) sess_data->iov[2].iov_base;
+
+ return 0;
+}
+
+static void
+sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data);
+
+static void
+sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data)
+{
+ int rc;
+ struct smb_hdr *smb_buf;
+ SESSION_SETUP_ANDX *pSMB;
+ struct cifs_ses *ses = sess_data->ses;
+ __u16 bytes_remaining;
+ char *bcc_ptr;
+ u16 blob_len;
+
+ cifs_dbg(FYI, "rawntlmssp session setup negotiate phase\n");
+
+ /*
+ * if memory allocation is successful, caller of this function
+ * frees it.
+ */
+ ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
+ if (!ses->ntlmssp) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ ses->ntlmssp->sesskey_per_smbsess = false;
+
+ /* wct = 12 */
+ rc = sess_alloc_buffer(sess_data, 12);
+ if (rc)
+ goto out;
+
+ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+
+ /* Build security blob before we assemble the request */
+ build_ntlmssp_negotiate_blob(pSMB->req.SecurityBlob, ses);
+ sess_data->iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
+ sess_data->iov[1].iov_base = pSMB->req.SecurityBlob;
+ pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
+
+ rc = _sess_auth_rawntlmssp_assemble_req(sess_data);
+ if (rc)
+ goto out;
+
+ rc = sess_sendreceive(sess_data);
+
+ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+ smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
+
+ /* If true, rc here is expected and not an error */
+ if (sess_data->buf0_type != CIFS_NO_BUFFER &&
+ smb_buf->Status.CifsError ==
+ cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))
+ rc = 0;
+
+ if (rc)
+ goto out;
+
+ cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n");
+
+ if (smb_buf->WordCount != 4) {
+ rc = -EIO;
+ cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
+ goto out;
+ }
+
+ ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
+ cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
+
+ bytes_remaining = get_bcc(smb_buf);
+ bcc_ptr = pByteArea(smb_buf);
+
+ blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
+ if (blob_len > bytes_remaining) {
+ cifs_dbg(VFS, "bad security blob length %d\n",
+ blob_len);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses);
+out:
+ sess_free_buffer(sess_data);
if (!rc) {
- mutex_lock(&ses->server->srv_mutex);
- if (!ses->server->session_estab) {
- if (ses->server->sign) {
- ses->server->session_key.response =
- kmemdup(ses->auth_key.response,
- ses->auth_key.len, GFP_KERNEL);
- if (!ses->server->session_key.response) {
- rc = -ENOMEM;
- mutex_unlock(&ses->server->srv_mutex);
- goto keycp_exit;
- }
- ses->server->session_key.len =
- ses->auth_key.len;
- }
- ses->server->sequence_number = 0x2;
- ses->server->session_estab = true;
- }
- mutex_unlock(&ses->server->srv_mutex);
+ sess_data->func = sess_auth_rawntlmssp_authenticate;
+ return;
+ }
+
+ /* Else error. Cleanup */
+ kfree(ses->auth_key.response);
+ ses->auth_key.response = NULL;
+ kfree(ses->ntlmssp);
+ ses->ntlmssp = NULL;
+
+ sess_data->func = NULL;
+ sess_data->result = rc;
+}
+
+static void
+sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data)
+{
+ int rc;
+ struct smb_hdr *smb_buf;
+ SESSION_SETUP_ANDX *pSMB;
+ struct cifs_ses *ses = sess_data->ses;
+ __u16 bytes_remaining;
+ char *bcc_ptr;
+ char *ntlmsspblob = NULL;
+ u16 blob_len;
+
+ cifs_dbg(FYI, "rawntlmssp session setup authenticate phase\n");
- cifs_dbg(FYI, "CIFS session established successfully\n");
- spin_lock(&GlobalMid_Lock);
- ses->status = CifsGood;
- ses->need_reconnect = false;
- spin_unlock(&GlobalMid_Lock);
+ /* wct = 12 */
+ rc = sess_alloc_buffer(sess_data, 12);
+ if (rc)
+ goto out;
+
+ /* Build security blob before we assemble the request */
+ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+ smb_buf = (struct smb_hdr *)pSMB;
+ /*
+ * 5 is an empirical value, large enough to hold
+ * authenticate message plus max 10 of av paris,
+ * domain, user, workstation names, flags, etc.
+ */
+ ntlmsspblob = kzalloc(5*sizeof(struct _AUTHENTICATE_MESSAGE),
+ GFP_KERNEL);
+ if (!ntlmsspblob) {
+ rc = -ENOMEM;
+ goto out;
}
-keycp_exit:
+ rc = build_ntlmssp_auth_blob(ntlmsspblob,
+ &blob_len, ses, sess_data->nls_cp);
+ if (rc)
+ goto out_free_ntlmsspblob;
+ sess_data->iov[1].iov_len = blob_len;
+ sess_data->iov[1].iov_base = ntlmsspblob;
+ pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len);
+ /*
+ * Make sure that we tell the server that we are using
+ * the uid that it just gave us back on the response
+ * (challenge)
+ */
+ smb_buf->Uid = ses->Suid;
+
+ rc = _sess_auth_rawntlmssp_assemble_req(sess_data);
+ if (rc)
+ goto out_free_ntlmsspblob;
+
+ rc = sess_sendreceive(sess_data);
+ if (rc)
+ goto out_free_ntlmsspblob;
+
+ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
+ smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
+ if (smb_buf->WordCount != 4) {
+ rc = -EIO;
+ cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
+ goto out_free_ntlmsspblob;
+ }
+
+ if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
+ cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
+
+ bytes_remaining = get_bcc(smb_buf);
+ bcc_ptr = pByteArea(smb_buf);
+ blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
+ if (blob_len > bytes_remaining) {
+ cifs_dbg(VFS, "bad security blob length %d\n",
+ blob_len);
+ rc = -EINVAL;
+ goto out_free_ntlmsspblob;
+ }
+ bcc_ptr += blob_len;
+ bytes_remaining -= blob_len;
+
+
+ /* BB check if Unicode and decode strings */
+ if (bytes_remaining == 0) {
+ /* no string area to decode, do nothing */
+ } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
+ /* unicode string area must be word-aligned */
+ if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
+ ++bcc_ptr;
+ --bytes_remaining;
+ }
+ decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
+ sess_data->nls_cp);
+ } else {
+ decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
+ sess_data->nls_cp);
+ }
+
+out_free_ntlmsspblob:
+ kfree(ntlmsspblob);
+out:
+ sess_free_buffer(sess_data);
+
+ if (!rc)
+ rc = sess_establish_session(sess_data);
+
+ /* Cleanup */
kfree(ses->auth_key.response);
ses->auth_key.response = NULL;
kfree(ses->ntlmssp);
+ ses->ntlmssp = NULL;
+
+ sess_data->func = NULL;
+ sess_data->result = rc;
+}
+
+static int select_sec(struct cifs_ses *ses, struct sess_data *sess_data)
+{
+ int type;
+
+ type = select_sectype(ses->server, ses->sectype);
+ cifs_dbg(FYI, "sess setup type %d\n", type);
+ if (type == Unspecified) {
+ cifs_dbg(VFS,
+ "Unable to select appropriate authentication method!");
+ return -EINVAL;
+ }
+
+ switch (type) {
+ case LANMAN:
+ /* LANMAN and plaintext are less secure and off by default.
+ * So we make this explicitly be turned on in kconfig (in the
+ * build) and turned on at runtime (changed from the default)
+ * in proc/fs/cifs or via mount parm. Unfortunately this is
+ * needed for old Win (e.g. Win95), some obscure NAS and OS/2 */
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+ sess_data->func = sess_auth_lanman;
+ break;
+#else
+ return -EOPNOTSUPP;
+#endif
+ case NTLM:
+ sess_data->func = sess_auth_ntlm;
+ break;
+ case NTLMv2:
+ sess_data->func = sess_auth_ntlmv2;
+ break;
+ case Kerberos:
+#ifdef CONFIG_CIFS_UPCALL
+ sess_data->func = sess_auth_kerberos;
+ break;
+#else
+ cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n");
+ return -ENOSYS;
+ break;
+#endif /* CONFIG_CIFS_UPCALL */
+ case RawNTLMSSP:
+ sess_data->func = sess_auth_rawntlmssp_negotiate;
+ break;
+ default:
+ cifs_dbg(VFS, "secType %d not supported!\n", type);
+ return -ENOSYS;
+ }
+
+ return 0;
+}
+
+int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
+ const struct nls_table *nls_cp)
+{
+ int rc = 0;
+ struct sess_data *sess_data;
+
+ if (ses == NULL) {
+ WARN(1, "%s: ses == NULL!", __func__);
+ return -EINVAL;
+ }
+
+ sess_data = kzalloc(sizeof(struct sess_data), GFP_KERNEL);
+ if (!sess_data)
+ return -ENOMEM;
+
+ rc = select_sec(ses, sess_data);
+ if (rc)
+ goto out;
+
+ sess_data->xid = xid;
+ sess_data->ses = ses;
+ sess_data->buf0_type = CIFS_NO_BUFFER;
+ sess_data->nls_cp = (struct nls_table *) nls_cp;
+
+ while (sess_data->func)
+ sess_data->func(sess_data);
+
+ /* Store result before we free sess_data */
+ rc = sess_data->result;
+out:
+ kfree(sess_data);
return rc;
}
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index d1fdfa848703..d2979036a4c7 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -23,6 +23,7 @@
#include "cifsproto.h"
#include "cifs_debug.h"
#include "cifspdu.h"
+#include "cifs_unicode.h"
/*
* An NT cancel request header looks just like the original request except:
@@ -530,13 +531,11 @@ cifs_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
rc = CIFSSMBQPathInfo(xid, tcon, full_path, file_info,
0 /* not legacy */, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_remap(cifs_sb));
if (rc == -EOPNOTSUPP || rc == -EINVAL)
rc = SMBQueryInformation(xid, tcon, full_path, file_info,
- cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_sb->local_nls, cifs_remap(cifs_sb));
kfree(file_info);
return rc;
}
@@ -552,8 +551,7 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
/* could do find first instead but this returns more info */
rc = CIFSSMBQPathInfo(xid, tcon, full_path, data, 0 /* not legacy */,
- cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_sb->local_nls, cifs_remap(cifs_sb));
/*
* BB optimize code so we do not make the above call when server claims
* no NT SMB support and the above call failed at least once - set flag
@@ -562,8 +560,7 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) {
rc = SMBQueryInformation(xid, tcon, full_path, data,
cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_remap(cifs_sb));
*adjustTZ = true;
}
@@ -586,7 +583,7 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
tmprc = CIFS_open(xid, &oparms, &oplock, NULL);
if (tmprc == -EOPNOTSUPP)
*symlink = true;
- else
+ else if (tmprc == 0)
CIFSSMBClose(xid, tcon, fid.netfid);
}
@@ -611,8 +608,7 @@ cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon,
*/
return CIFSGetSrvInodeNumber(xid, tcon, full_path, uniqueid,
cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_remap(cifs_sb));
}
static int
@@ -703,8 +699,7 @@ cifs_mkdir_setinfo(struct inode *inode, const char *full_path,
dosattrs = cifsInode->cifsAttrs|ATTR_READONLY;
info.Attributes = cpu_to_le32(dosattrs);
rc = CIFSSMBSetPathInfo(xid, tcon, full_path, &info, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_remap(cifs_sb));
if (rc == 0)
cifsInode->cifsAttrs = dosattrs;
}
@@ -720,8 +715,7 @@ cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
oparms->create_options,
&oparms->fid->netfid, oplock, buf,
oparms->cifs_sb->local_nls,
- oparms->cifs_sb->mnt_cifs_flags
- & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_remap(oparms->cifs_sb));
return CIFS_open(xid, oparms, oplock, buf);
}
@@ -749,21 +743,21 @@ cifs_flush_file(const unsigned int xid, struct cifs_tcon *tcon,
}
static int
-cifs_sync_read(const unsigned int xid, struct cifsFileInfo *cfile,
+cifs_sync_read(const unsigned int xid, struct cifs_fid *pfid,
struct cifs_io_parms *parms, unsigned int *bytes_read,
char **buf, int *buf_type)
{
- parms->netfid = cfile->fid.netfid;
+ parms->netfid = pfid->netfid;
return CIFSSMBRead(xid, parms, bytes_read, buf, buf_type);
}
static int
-cifs_sync_write(const unsigned int xid, struct cifsFileInfo *cfile,
+cifs_sync_write(const unsigned int xid, struct cifs_fid *pfid,
struct cifs_io_parms *parms, unsigned int *written,
struct kvec *iov, unsigned long nr_segs)
{
- parms->netfid = cfile->fid.netfid;
+ parms->netfid = pfid->netfid;
return CIFSSMBWrite2(xid, parms, written, iov, nr_segs);
}
@@ -800,8 +794,7 @@ smb_set_file_info(struct inode *inode, const char *full_path,
tcon = tlink_tcon(tlink);
rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_remap(cifs_sb));
if (rc == 0) {
cinode->cifsAttrs = le32_to_cpu(buf->Attributes);
goto out;
@@ -1009,6 +1002,18 @@ cifs_is_read_op(__u32 oplock)
return oplock == OPLOCK_READ;
}
+static unsigned int
+cifs_wp_retry_size(struct inode *inode)
+{
+ return CIFS_SB(inode->i_sb)->wsize;
+}
+
+static bool
+cifs_dir_needs_close(struct cifsFileInfo *cfile)
+{
+ return !cfile->srch_inf.endOfSearch && !cfile->invalidHandle;
+}
+
struct smb_version_operations smb1_operations = {
.send_cancel = send_nt_cancel,
.compare_fids = cifs_compare_fids,
@@ -1019,6 +1024,7 @@ struct smb_version_operations smb1_operations = {
.set_credits = cifs_set_credits,
.get_credits_field = cifs_get_credits_field,
.get_credits = cifs_get_credits,
+ .wait_mtu_credits = cifs_wait_mtu_credits,
.get_next_mid = cifs_get_next_mid,
.read_data_offset = cifs_read_data_offset,
.read_data_length = cifs_read_data_length,
@@ -1078,6 +1084,8 @@ struct smb_version_operations smb1_operations = {
.query_mf_symlink = cifs_query_mf_symlink,
.create_mf_symlink = cifs_create_mf_symlink,
.is_read_op = cifs_is_read_op,
+ .wp_retry_size = cifs_wp_retry_size,
+ .dir_needs_close = cifs_dir_needs_close,
#ifdef CONFIG_CIFS_XATTR
.query_all_EAs = CIFSSMBQAllEAs,
.set_EA = CIFSSMBSetEA,
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 3f17b4550831..45992944e238 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -50,7 +50,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
goto out;
}
- smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2,
+ smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
GFP_KERNEL);
if (smb2_data == NULL) {
rc = -ENOMEM;
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 84c012a6aba0..899bbc86f73e 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -91,7 +91,7 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
case SMB2_OP_SET_EOF:
tmprc = SMB2_set_eof(xid, tcon, fid.persistent_fid,
fid.volatile_fid, current->tgid,
- (__le64 *)data);
+ (__le64 *)data, false);
break;
case SMB2_OP_SET_INFO:
tmprc = SMB2_set_info(xid, tcon, fid.persistent_fid,
@@ -131,7 +131,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
*adjust_tz = false;
*symlink = false;
- smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2,
+ smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
GFP_KERNEL);
if (smb2_data == NULL)
return -ENOMEM;
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index 94bd4fbb13d3..8257a5a97cc0 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -214,7 +214,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
{STATUS_BREAKPOINT, -EIO, "STATUS_BREAKPOINT"},
{STATUS_SINGLE_STEP, -EIO, "STATUS_SINGLE_STEP"},
{STATUS_BUFFER_OVERFLOW, -EIO, "STATUS_BUFFER_OVERFLOW"},
- {STATUS_NO_MORE_FILES, -EIO, "STATUS_NO_MORE_FILES"},
+ {STATUS_NO_MORE_FILES, -ENODATA, "STATUS_NO_MORE_FILES"},
{STATUS_WAKE_SYSTEM_DEBUGGER, -EIO, "STATUS_WAKE_SYSTEM_DEBUGGER"},
{STATUS_HANDLES_CLOSED, -EIO, "STATUS_HANDLES_CLOSED"},
{STATUS_NO_INHERITANCE, -EIO, "STATUS_NO_INHERITANCE"},
@@ -256,6 +256,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
{STATUS_DLL_MIGHT_BE_INCOMPATIBLE, -EIO,
"STATUS_DLL_MIGHT_BE_INCOMPATIBLE"},
{STATUS_STOPPED_ON_SYMLINK, -EOPNOTSUPP, "STATUS_STOPPED_ON_SYMLINK"},
+ {STATUS_IO_REPARSE_TAG_NOT_HANDLED, -EOPNOTSUPP,
+ "STATUS_REPARSE_NOT_HANDLED"},
{STATUS_DEVICE_REQUIRES_CLEANING, -EIO,
"STATUS_DEVICE_REQUIRES_CLEANING"},
{STATUS_DEVICE_DOOR_OPEN, -EIO, "STATUS_DEVICE_DOOR_OPEN"},
@@ -298,7 +300,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
{STATUS_INVALID_PARAMETER, -EINVAL, "STATUS_INVALID_PARAMETER"},
{STATUS_NO_SUCH_DEVICE, -ENODEV, "STATUS_NO_SUCH_DEVICE"},
{STATUS_NO_SUCH_FILE, -ENOENT, "STATUS_NO_SUCH_FILE"},
- {STATUS_INVALID_DEVICE_REQUEST, -EIO, "STATUS_INVALID_DEVICE_REQUEST"},
+ {STATUS_INVALID_DEVICE_REQUEST, -EOPNOTSUPP, "STATUS_INVALID_DEVICE_REQUEST"},
{STATUS_END_OF_FILE, -ENODATA, "STATUS_END_OF_FILE"},
{STATUS_WRONG_VOLUME, -EIO, "STATUS_WRONG_VOLUME"},
{STATUS_NO_MEDIA_IN_DEVICE, -EIO, "STATUS_NO_MEDIA_IN_DEVICE"},
@@ -605,7 +607,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
{STATUS_MAPPED_FILE_SIZE_ZERO, -EIO, "STATUS_MAPPED_FILE_SIZE_ZERO"},
{STATUS_TOO_MANY_OPENED_FILES, -EMFILE, "STATUS_TOO_MANY_OPENED_FILES"},
{STATUS_CANCELLED, -EIO, "STATUS_CANCELLED"},
- {STATUS_CANNOT_DELETE, -EIO, "STATUS_CANNOT_DELETE"},
+ {STATUS_CANNOT_DELETE, -EACCES, "STATUS_CANNOT_DELETE"},
{STATUS_INVALID_COMPUTER_NAME, -EIO, "STATUS_INVALID_COMPUTER_NAME"},
{STATUS_FILE_DELETED, -EIO, "STATUS_FILE_DELETED"},
{STATUS_SPECIAL_ACCOUNT, -EIO, "STATUS_SPECIAL_ACCOUNT"},
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index b8021fde987d..1a08a34838fc 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -178,9 +178,24 @@ smb2_check_message(char *buf, unsigned int length)
/* Windows 7 server returns 24 bytes more */
if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE)
return 0;
- /* server can return one byte more */
+ /* server can return one byte more due to implied bcc[0] */
if (clc_len == 4 + len + 1)
return 0;
+
+ /*
+ * MacOS server pads after SMB2.1 write response with 3 bytes
+ * of junk. Other servers match RFC1001 len to actual
+ * SMB2/SMB3 frame length (header + smb2 response specific data)
+ * Log the server error (once), but allow it and continue
+ * since the frame is parseable.
+ */
+ if (clc_len < 4 /* RFC1001 header size */ + len) {
+ printk_once(KERN_WARNING
+ "SMB2 server sent bad RFC1001 len %d not %d\n",
+ len, clc_len - 4);
+ return 0;
+ }
+
return 1;
}
return 0;
@@ -364,6 +379,14 @@ cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb)
int len;
const char *start_of_path;
__le16 *to;
+ int map_type;
+
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR)
+ map_type = SFM_MAP_UNI_RSVD;
+ else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
+ map_type = SFU_MAP_UNI_RSVD;
+ else
+ map_type = NO_MAP_UNI_RSVD;
/* Windows doesn't allow paths beginning with \ */
if (from[0] == '\\')
@@ -371,9 +394,7 @@ cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb)
else
start_of_path = from;
to = cifs_strndup_to_utf16(start_of_path, PATH_MAX, &len,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_sb->local_nls, map_type);
return to;
}
@@ -437,7 +458,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
continue;
cifs_dbg(FYI, "found in the open list\n");
- cifs_dbg(FYI, "lease key match, lease break 0x%d\n",
+ cifs_dbg(FYI, "lease key match, lease break 0x%x\n",
le32_to_cpu(rsp->NewLeaseState));
server->ops->set_oplock_level(cinode, lease_state, 0, NULL);
@@ -467,7 +488,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
}
cifs_dbg(FYI, "found in the pending open list\n");
- cifs_dbg(FYI, "lease key match, lease break 0x%d\n",
+ cifs_dbg(FYI, "lease key match, lease break 0x%x\n",
le32_to_cpu(rsp->NewLeaseState));
open->oplock = lease_state;
@@ -546,7 +567,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
return false;
}
- cifs_dbg(FYI, "oplock level 0x%d\n", rsp->OplockLevel);
+ cifs_dbg(FYI, "oplock level 0x%x\n", rsp->OplockLevel);
/* look up tcon based on tid & uid */
spin_lock(&cifs_tcp_ses_lock);
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 787844bde384..c5f521bcdee2 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -19,6 +19,7 @@
#include <linux/pagemap.h>
#include <linux/vfs.h>
+#include <linux/falloc.h>
#include "cifsglob.h"
#include "smb2pdu.h"
#include "smb2proto.h"
@@ -112,6 +113,53 @@ smb2_get_credits(struct mid_q_entry *mid)
return le16_to_cpu(((struct smb2_hdr *)mid->resp_buf)->CreditRequest);
}
+static int
+smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
+ unsigned int *num, unsigned int *credits)
+{
+ int rc = 0;
+ unsigned int scredits;
+
+ spin_lock(&server->req_lock);
+ while (1) {
+ if (server->credits <= 0) {
+ spin_unlock(&server->req_lock);
+ cifs_num_waiters_inc(server);
+ rc = wait_event_killable(server->request_q,
+ has_credits(server, &server->credits));
+ cifs_num_waiters_dec(server);
+ if (rc)
+ return rc;
+ spin_lock(&server->req_lock);
+ } else {
+ if (server->tcpStatus == CifsExiting) {
+ spin_unlock(&server->req_lock);
+ return -ENOENT;
+ }
+
+ scredits = server->credits;
+ /* can deadlock with reopen */
+ if (scredits == 1) {
+ *num = SMB2_MAX_BUFFER_SIZE;
+ *credits = 0;
+ break;
+ }
+
+ /* leave one credit for a possible reopen */
+ scredits--;
+ *num = min_t(unsigned int, size,
+ scredits * SMB2_MAX_BUFFER_SIZE);
+
+ *credits = DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE);
+ server->credits -= *credits;
+ server->in_flight++;
+ break;
+ }
+ }
+ spin_unlock(&server->req_lock);
+ return rc;
+}
+
static __u64
smb2_get_next_mid(struct TCP_Server_Info *server)
{
@@ -182,8 +230,9 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
/* start with specified wsize, or default */
wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE;
wsize = min_t(unsigned int, wsize, server->max_write);
- /* set it to the maximum buffer size value we can send with 1 credit */
- wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
+
+ if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
+ wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
return wsize;
}
@@ -197,8 +246,9 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
/* start with specified rsize, or default */
rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE;
rsize = min_t(unsigned int, rsize, server->max_read);
- /* set it to the maximum buffer size value we can send with 1 credit */
- rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
+
+ if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
+ rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
return rsize;
}
@@ -215,15 +265,18 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */,
NULL /* no data input */, 0 /* no data input */,
(char **)&out_buf, &ret_data_len);
-
- if ((rc == 0) && (ret_data_len > 0)) {
+ if (rc != 0)
+ cifs_dbg(VFS, "error %d on ioctl to get interface list\n", rc);
+ else if (ret_data_len < sizeof(struct network_interface_info_ioctl_rsp)) {
+ cifs_dbg(VFS, "server returned bad net interface info buf\n");
+ rc = -EINVAL;
+ } else {
/* Dump info on first interface */
cifs_dbg(FYI, "Adapter Capability 0x%x\t",
le32_to_cpu(out_buf->Capability));
cifs_dbg(FYI, "Link Speed %lld\n",
le64_to_cpu(out_buf->LinkSpeed));
- } else
- cifs_dbg(VFS, "error %d on ioctl to get interface list\n", rc);
+ }
return rc;
}
@@ -339,7 +392,7 @@ smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
int rc;
struct smb2_file_all_info *smb2_data;
- smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2,
+ smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
GFP_KERNEL);
if (smb2_data == NULL)
return -ENOMEM;
@@ -661,33 +714,94 @@ smb2_read_data_length(char *buf)
static int
-smb2_sync_read(const unsigned int xid, struct cifsFileInfo *cfile,
+smb2_sync_read(const unsigned int xid, struct cifs_fid *pfid,
struct cifs_io_parms *parms, unsigned int *bytes_read,
char **buf, int *buf_type)
{
- parms->persistent_fid = cfile->fid.persistent_fid;
- parms->volatile_fid = cfile->fid.volatile_fid;
+ parms->persistent_fid = pfid->persistent_fid;
+ parms->volatile_fid = pfid->volatile_fid;
return SMB2_read(xid, parms, bytes_read, buf, buf_type);
}
static int
-smb2_sync_write(const unsigned int xid, struct cifsFileInfo *cfile,
+smb2_sync_write(const unsigned int xid, struct cifs_fid *pfid,
struct cifs_io_parms *parms, unsigned int *written,
struct kvec *iov, unsigned long nr_segs)
{
- parms->persistent_fid = cfile->fid.persistent_fid;
- parms->volatile_fid = cfile->fid.volatile_fid;
+ parms->persistent_fid = pfid->persistent_fid;
+ parms->volatile_fid = pfid->volatile_fid;
return SMB2_write(xid, parms, written, iov, nr_segs);
}
+/* Set or clear the SPARSE_FILE attribute based on value passed in setsparse */
+static bool smb2_set_sparse(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifsFileInfo *cfile, struct inode *inode, __u8 setsparse)
+{
+ struct cifsInodeInfo *cifsi;
+ int rc;
+
+ cifsi = CIFS_I(inode);
+
+ /* if file already sparse don't bother setting sparse again */
+ if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) && setsparse)
+ return true; /* already sparse */
+
+ if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) && !setsparse)
+ return true; /* already not sparse */
+
+ /*
+ * Can't check for sparse support on share the usual way via the
+ * FS attribute info (FILE_SUPPORTS_SPARSE_FILES) on the share
+ * since Samba server doesn't set the flag on the share, yet
+ * supports the set sparse FSCTL and returns sparse correctly
+ * in the file attributes. If we fail setting sparse though we
+ * mark that server does not support sparse files for this share
+ * to avoid repeatedly sending the unsupported fsctl to server
+ * if the file is repeatedly extended.
+ */
+ if (tcon->broken_sparse_sup)
+ return false;
+
+ rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid, FSCTL_SET_SPARSE,
+ true /* is_fctl */, &setsparse, 1, NULL, NULL);
+ if (rc) {
+ tcon->broken_sparse_sup = true;
+ cifs_dbg(FYI, "set sparse rc = %d\n", rc);
+ return false;
+ }
+
+ if (setsparse)
+ cifsi->cifsAttrs |= FILE_ATTRIBUTE_SPARSE_FILE;
+ else
+ cifsi->cifsAttrs &= (~FILE_ATTRIBUTE_SPARSE_FILE);
+
+ return true;
+}
+
static int
smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon,
struct cifsFileInfo *cfile, __u64 size, bool set_alloc)
{
__le64 eof = cpu_to_le64(size);
+ struct inode *inode;
+
+ /*
+ * If extending file more than one page make sparse. Many Linux fs
+ * make files sparse by default when extending via ftruncate
+ */
+ inode = cfile->dentry->d_inode;
+
+ if (!set_alloc && (size > inode->i_size + 8192)) {
+ __u8 set_sparse = 1;
+
+ /* whether set sparse succeeds or not, extend the file */
+ smb2_set_sparse(xid, tcon, cfile, inode, set_sparse);
+ }
+
return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
- cfile->fid.volatile_fid, cfile->pid, &eof);
+ cfile->fid.volatile_fid, cfile->pid, &eof, false);
}
static int
@@ -904,6 +1018,105 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
+static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
+ loff_t offset, loff_t len, bool keep_size)
+{
+ struct inode *inode;
+ struct cifsInodeInfo *cifsi;
+ struct cifsFileInfo *cfile = file->private_data;
+ struct file_zero_data_information fsctl_buf;
+ long rc;
+ unsigned int xid;
+
+ xid = get_xid();
+
+ inode = cfile->dentry->d_inode;
+ cifsi = CIFS_I(inode);
+
+ /* if file not oplocked can't be sure whether asking to extend size */
+ if (!CIFS_CACHE_READ(cifsi))
+ if (keep_size == false)
+ return -EOPNOTSUPP;
+
+ /*
+ * Must check if file sparse since fallocate -z (zero range) assumes
+ * non-sparse allocation
+ */
+ if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE))
+ return -EOPNOTSUPP;
+
+ /*
+ * need to make sure we are not asked to extend the file since the SMB3
+ * fsctl does not change the file size. In the future we could change
+ * this to zero the first part of the range then set the file size
+ * which for a non sparse file would zero the newly extended range
+ */
+ if (keep_size == false)
+ if (i_size_read(inode) < offset + len)
+ return -EOPNOTSUPP;
+
+ cifs_dbg(FYI, "offset %lld len %lld", offset, len);
+
+ fsctl_buf.FileOffset = cpu_to_le64(offset);
+ fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len);
+
+ rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
+ true /* is_fctl */, (char *)&fsctl_buf,
+ sizeof(struct file_zero_data_information), NULL, NULL);
+ free_xid(xid);
+ return rc;
+}
+
+static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
+ loff_t offset, loff_t len)
+{
+ struct inode *inode;
+ struct cifsInodeInfo *cifsi;
+ struct cifsFileInfo *cfile = file->private_data;
+ struct file_zero_data_information fsctl_buf;
+ long rc;
+ unsigned int xid;
+ __u8 set_sparse = 1;
+
+ xid = get_xid();
+
+ inode = cfile->dentry->d_inode;
+ cifsi = CIFS_I(inode);
+
+ /* Need to make file sparse, if not already, before freeing range. */
+ /* Consider adding equivalent for compressed since it could also work */
+ if (!smb2_set_sparse(xid, tcon, cfile, inode, set_sparse))
+ return -EOPNOTSUPP;
+
+ cifs_dbg(FYI, "offset %lld len %lld", offset, len);
+
+ fsctl_buf.FileOffset = cpu_to_le64(offset);
+ fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len);
+
+ rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
+ true /* is_fctl */, (char *)&fsctl_buf,
+ sizeof(struct file_zero_data_information), NULL, NULL);
+ free_xid(xid);
+ return rc;
+}
+
+static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode,
+ loff_t off, loff_t len)
+{
+ /* KEEP_SIZE already checked for by do_fallocate */
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ return smb3_punch_hole(file, tcon, off, len);
+ else if (mode & FALLOC_FL_ZERO_RANGE) {
+ if (mode & FALLOC_FL_KEEP_SIZE)
+ return smb3_zero_range(file, tcon, off, len, true);
+ return smb3_zero_range(file, tcon, off, len, false);
+ }
+
+ return -EOPNOTSUPP;
+}
+
static void
smb2_downgrade_oplock(struct TCP_Server_Info *server,
struct cifsInodeInfo *cinode, bool set_level2)
@@ -1104,6 +1317,19 @@ smb3_parse_lease_buf(void *buf, unsigned int *epoch)
return le32_to_cpu(lc->lcontext.LeaseState);
}
+static unsigned int
+smb2_wp_retry_size(struct inode *inode)
+{
+ return min_t(unsigned int, CIFS_SB(inode->i_sb)->wsize,
+ SMB2_MAX_BUFFER_SIZE);
+}
+
+static bool
+smb2_dir_needs_close(struct cifsFileInfo *cfile)
+{
+ return !cfile->invalidHandle;
+}
+
struct smb_version_operations smb20_operations = {
.compare_fids = smb2_compare_fids,
.setup_request = smb2_setup_request,
@@ -1113,6 +1339,7 @@ struct smb_version_operations smb20_operations = {
.set_credits = smb2_set_credits,
.get_credits_field = smb2_get_credits_field,
.get_credits = smb2_get_credits,
+ .wait_mtu_credits = cifs_wait_mtu_credits,
.get_next_mid = smb2_get_next_mid,
.read_data_offset = smb2_read_data_offset,
.read_data_length = smb2_read_data_length,
@@ -1177,6 +1404,8 @@ struct smb_version_operations smb20_operations = {
.create_lease_buf = smb2_create_lease_buf,
.parse_lease_buf = smb2_parse_lease_buf,
.clone_range = smb2_clone_range,
+ .wp_retry_size = smb2_wp_retry_size,
+ .dir_needs_close = smb2_dir_needs_close,
};
struct smb_version_operations smb21_operations = {
@@ -1188,6 +1417,7 @@ struct smb_version_operations smb21_operations = {
.set_credits = smb2_set_credits,
.get_credits_field = smb2_get_credits_field,
.get_credits = smb2_get_credits,
+ .wait_mtu_credits = smb2_wait_mtu_credits,
.get_next_mid = smb2_get_next_mid,
.read_data_offset = smb2_read_data_offset,
.read_data_length = smb2_read_data_length,
@@ -1225,6 +1455,8 @@ struct smb_version_operations smb21_operations = {
.rename = smb2_rename_path,
.create_hardlink = smb2_create_hardlink,
.query_symlink = smb2_query_symlink,
+ .query_mf_symlink = smb3_query_mf_symlink,
+ .create_mf_symlink = smb3_create_mf_symlink,
.open = smb2_open_file,
.set_fid = smb2_set_fid,
.close = smb2_close_file,
@@ -1252,6 +1484,8 @@ struct smb_version_operations smb21_operations = {
.create_lease_buf = smb2_create_lease_buf,
.parse_lease_buf = smb2_parse_lease_buf,
.clone_range = smb2_clone_range,
+ .wp_retry_size = smb2_wp_retry_size,
+ .dir_needs_close = smb2_dir_needs_close,
};
struct smb_version_operations smb30_operations = {
@@ -1263,6 +1497,7 @@ struct smb_version_operations smb30_operations = {
.set_credits = smb2_set_credits,
.get_credits_field = smb2_get_credits_field,
.get_credits = smb2_get_credits,
+ .wait_mtu_credits = smb2_wait_mtu_credits,
.get_next_mid = smb2_get_next_mid,
.read_data_offset = smb2_read_data_offset,
.read_data_length = smb2_read_data_length,
@@ -1301,6 +1536,8 @@ struct smb_version_operations smb30_operations = {
.rename = smb2_rename_path,
.create_hardlink = smb2_create_hardlink,
.query_symlink = smb2_query_symlink,
+ .query_mf_symlink = smb3_query_mf_symlink,
+ .create_mf_symlink = smb3_create_mf_symlink,
.open = smb2_open_file,
.set_fid = smb2_set_fid,
.close = smb2_close_file,
@@ -1330,6 +1567,9 @@ struct smb_version_operations smb30_operations = {
.parse_lease_buf = smb3_parse_lease_buf,
.clone_range = smb2_clone_range,
.validate_negotiate = smb3_validate_negotiate,
+ .wp_retry_size = smb2_wp_retry_size,
+ .dir_needs_close = smb2_dir_needs_close,
+ .fallocate = smb3_fallocate,
};
struct smb_version_values smb20_values = {
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index b0b260dbb19d..8f1672bb82d5 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -108,7 +108,6 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ ,
if (!tcon)
goto out;
- /* BB FIXME when we do write > 64K add +1 for every 64K in req or rsp */
/* GLOBAL_CAP_LARGE_MTU will only be set if dialect > SMB2.02 */
/* See sections 2.2.4 and 3.2.4.1.5 of MS-SMB2 */
if ((tcon->ses) &&
@@ -245,10 +244,6 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
if (rc)
goto out;
atomic_inc(&tconInfoReconnectCount);
- /*
- * BB FIXME add code to check if wsize needs update due to negotiated
- * smb buffer size shrinking.
- */
out:
/*
* Check if handle based operation so we know whether we can continue
@@ -309,16 +304,6 @@ small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon,
return rc;
}
-static void
-free_rsp_buf(int resp_buftype, void *rsp)
-{
- if (resp_buftype == CIFS_SMALL_BUFFER)
- cifs_small_buf_release(rsp);
- else if (resp_buftype == CIFS_LARGE_BUFFER)
- cifs_buf_release(rsp);
-}
-
-
/*
*
* SMB2 Worker functions follow:
@@ -545,7 +530,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
struct smb2_sess_setup_rsp *rsp = NULL;
struct kvec iov[2];
int rc = 0;
- int resp_buftype;
+ int resp_buftype = CIFS_NO_BUFFER;
__le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
struct TCP_Server_Info *server = ses->server;
u16 blob_length = 0;
@@ -922,7 +907,8 @@ tcon_exit:
tcon_error_exit:
if (rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) {
cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree);
- tcon->bad_network_name = true;
+ if (tcon)
+ tcon->bad_network_name = true;
}
goto tcon_exit;
}
@@ -1112,6 +1098,8 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
if (oparms->create_options & CREATE_OPTION_READONLY)
file_attributes |= ATTR_READONLY;
+ if (oparms->create_options & CREATE_OPTION_SPECIAL)
+ file_attributes |= ATTR_SYSTEM;
req->ImpersonationLevel = IL_IMPERSONATION;
req->DesiredAccess = cpu_to_le32(oparms->desired_access);
@@ -1239,7 +1227,9 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
cifs_dbg(FYI, "SMB2 IOCTL\n");
- *out_data = NULL;
+ if (out_data != NULL)
+ *out_data = NULL;
+
/* zero out returned data len, in case of error */
if (plen)
*plen = 0;
@@ -1415,8 +1405,7 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
rsp = (struct smb2_close_rsp *)iov[0].iov_base;
if (rc != 0) {
- if (tcon)
- cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE);
+ cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE);
goto close_exit;
}
@@ -1545,7 +1534,7 @@ SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
{
return query_info(xid, tcon, persistent_fid, volatile_fid,
FILE_ALL_INFORMATION,
- sizeof(struct smb2_file_all_info) + MAX_NAME * 2,
+ sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
sizeof(struct smb2_file_all_info), data);
}
@@ -1738,12 +1727,18 @@ smb2_readv_callback(struct mid_q_entry *mid)
rc);
}
/* FIXME: should this be counted toward the initiating task? */
- task_io_account_read(rdata->bytes);
- cifs_stats_bytes_read(tcon, rdata->bytes);
+ task_io_account_read(rdata->got_bytes);
+ cifs_stats_bytes_read(tcon, rdata->got_bytes);
break;
case MID_REQUEST_SUBMITTED:
case MID_RETRY_NEEDED:
rdata->result = -EAGAIN;
+ if (server->sign && rdata->got_bytes)
+ /* reset bytes number since we can not check a sign */
+ rdata->got_bytes = 0;
+ /* FIXME: should this be counted toward the initiating task? */
+ task_io_account_read(rdata->got_bytes);
+ cifs_stats_bytes_read(tcon, rdata->got_bytes);
break;
default:
if (rdata->result != -ENODATA)
@@ -1762,11 +1757,12 @@ smb2_readv_callback(struct mid_q_entry *mid)
int
smb2_async_readv(struct cifs_readdata *rdata)
{
- int rc;
+ int rc, flags = 0;
struct smb2_hdr *buf;
struct cifs_io_parms io_parms;
struct smb_rqst rqst = { .rq_iov = &rdata->iov,
.rq_nvec = 1 };
+ struct TCP_Server_Info *server;
cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n",
__func__, rdata->offset, rdata->bytes);
@@ -1777,18 +1773,41 @@ smb2_async_readv(struct cifs_readdata *rdata)
io_parms.persistent_fid = rdata->cfile->fid.persistent_fid;
io_parms.volatile_fid = rdata->cfile->fid.volatile_fid;
io_parms.pid = rdata->pid;
+
+ server = io_parms.tcon->ses->server;
+
rc = smb2_new_read_req(&rdata->iov, &io_parms, 0, 0);
- if (rc)
+ if (rc) {
+ if (rc == -EAGAIN && rdata->credits) {
+ /* credits was reset by reconnect */
+ rdata->credits = 0;
+ /* reduce in_flight value since we won't send the req */
+ spin_lock(&server->req_lock);
+ server->in_flight--;
+ spin_unlock(&server->req_lock);
+ }
return rc;
+ }
buf = (struct smb2_hdr *)rdata->iov.iov_base;
/* 4 for rfc1002 length field */
rdata->iov.iov_len = get_rfc1002_length(rdata->iov.iov_base) + 4;
+ if (rdata->credits) {
+ buf->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes,
+ SMB2_MAX_BUFFER_SIZE));
+ spin_lock(&server->req_lock);
+ server->credits += rdata->credits -
+ le16_to_cpu(buf->CreditCharge);
+ spin_unlock(&server->req_lock);
+ wake_up(&server->request_q);
+ flags = CIFS_HAS_CREDITS;
+ }
+
kref_get(&rdata->refcount);
rc = cifs_call_async(io_parms.tcon->ses->server, &rqst,
cifs_readv_receive, smb2_readv_callback,
- rdata, 0);
+ rdata, flags);
if (rc) {
kref_put(&rdata->refcount, cifs_readdata_release);
cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE);
@@ -1906,15 +1925,25 @@ int
smb2_async_writev(struct cifs_writedata *wdata,
void (*release)(struct kref *kref))
{
- int rc = -EACCES;
+ int rc = -EACCES, flags = 0;
struct smb2_write_req *req = NULL;
struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+ struct TCP_Server_Info *server = tcon->ses->server;
struct kvec iov;
struct smb_rqst rqst;
rc = small_smb2_init(SMB2_WRITE, tcon, (void **) &req);
- if (rc)
+ if (rc) {
+ if (rc == -EAGAIN && wdata->credits) {
+ /* credits was reset by reconnect */
+ wdata->credits = 0;
+ /* reduce in_flight value since we won't send the req */
+ spin_lock(&server->req_lock);
+ server->in_flight--;
+ spin_unlock(&server->req_lock);
+ }
goto async_writev_out;
+ }
req->hdr.ProcessId = cpu_to_le32(wdata->cfile->pid);
@@ -1947,9 +1976,20 @@ smb2_async_writev(struct cifs_writedata *wdata,
inc_rfc1001_len(&req->hdr, wdata->bytes - 1 /* Buffer */);
+ if (wdata->credits) {
+ req->hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes,
+ SMB2_MAX_BUFFER_SIZE));
+ spin_lock(&server->req_lock);
+ server->credits += wdata->credits -
+ le16_to_cpu(req->hdr.CreditCharge);
+ spin_unlock(&server->req_lock);
+ wake_up(&server->request_q);
+ flags = CIFS_HAS_CREDITS;
+ }
+
kref_get(&wdata->refcount);
- rc = cifs_call_async(tcon->ses->server, &rqst, NULL,
- smb2_writev_callback, wdata, 0);
+ rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, wdata,
+ flags);
if (rc) {
kref_put(&wdata->refcount, release);
@@ -2141,6 +2181,10 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
rsp = (struct smb2_query_directory_rsp *)iov[0].iov_base;
if (rc) {
+ if (rc == -ENODATA && rsp->hdr.Status == STATUS_NO_MORE_FILES) {
+ srch_inf->endOfSearch = true;
+ rc = 0;
+ }
cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE);
goto qdir_exit;
}
@@ -2178,11 +2222,6 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
else
cifs_dbg(VFS, "illegal search buffer type\n");
- if (rsp->hdr.Status == STATUS_NO_MORE_FILES)
- srch_inf->endOfSearch = 1;
- else
- srch_inf->endOfSearch = 0;
-
return rc;
qdir_exit:
@@ -2325,7 +2364,7 @@ SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
int
SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
- u64 volatile_fid, u32 pid, __le64 *eof)
+ u64 volatile_fid, u32 pid, __le64 *eof, bool is_falloc)
{
struct smb2_file_eof_info info;
void *data;
@@ -2336,8 +2375,12 @@ SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
data = &info;
size = sizeof(struct smb2_file_eof_info);
- return send_set_info(xid, tcon, persistent_fid, volatile_fid, pid,
- FILE_END_OF_FILE_INFORMATION, 1, &data, &size);
+ if (is_falloc)
+ return send_set_info(xid, tcon, persistent_fid, volatile_fid,
+ pid, FILE_ALLOCATION_INFORMATION, 1, &data, &size);
+ else
+ return send_set_info(xid, tcon, persistent_fid, volatile_fid,
+ pid, FILE_END_OF_FILE_INFORMATION, 1, &data, &size);
}
int
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 69f3595d3952..e3188abdafd0 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -352,6 +352,8 @@ struct smb2_tree_disconnect_rsp {
#define FILE_ATTRIBUTE_OFFLINE 0x00001000
#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED 0x00002000
#define FILE_ATTRIBUTE_ENCRYPTED 0x00004000
+#define FILE_ATTRIBUTE_INTEGRITY_STREAM 0x00008000
+#define FILE_ATTRIBUTE_NO_SCRUB_DATA 0x00020000
/* Oplock levels */
#define SMB2_OPLOCK_LEVEL_NONE 0x00
@@ -573,6 +575,12 @@ struct copychunk_ioctl {
__u32 Reserved2;
} __packed;
+/* this goes in the ioctl buffer when doing FSCTL_SET_ZERO_DATA */
+struct file_zero_data_information {
+ __le64 FileOffset;
+ __le64 BeyondFinalZero;
+} __packed;
+
struct copychunk_ioctl_rsp {
__le32 ChunksWritten;
__le32 ChunkBytesWritten;
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 0ce48db20a65..79dc650c18b2 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -82,7 +82,13 @@ extern int smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon,
extern int smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
const char *from_name, const char *to_name,
struct cifs_sb_info *cifs_sb);
-
+extern int smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const unsigned char *path,
+ char *pbuf, unsigned int *pbytes_written);
+extern int smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const unsigned char *path, char *pbuf,
+ unsigned int *pbytes_read);
extern int smb2_open_file(const unsigned int xid,
struct cifs_open_parms *oparms,
__u32 *oplock, FILE_ALL_INFO *buf);
@@ -139,7 +145,7 @@ extern int SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
__le16 *target_file);
extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, u32 pid,
- __le64 *eof);
+ __le64 *eof, bool is_fallocate);
extern int SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid,
FILE_BASIC_INFO *buf);
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 59c748ce872f..5111e7272db6 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -466,7 +466,12 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
static inline void
smb2_seq_num_into_buf(struct TCP_Server_Info *server, struct smb2_hdr *hdr)
{
+ unsigned int i, num = le16_to_cpu(hdr->CreditCharge);
+
hdr->MessageId = get_next_mid64(server);
+ /* skip message numbers according to CreditCharge field */
+ for (i = 1; i < num; i++)
+ get_next_mid(server);
}
static struct mid_q_entry *
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 43eb1367b103..6c1566366a66 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -29,6 +29,7 @@
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/random.h>
+#include "cifs_fs_sb.h"
#include "cifs_unicode.h"
#include "cifspdu.h"
#include "cifsglob.h"
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index 0e538b5c9622..83efa59535be 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -63,7 +63,7 @@
#define FSCTL_SET_OBJECT_ID_EXTENDED 0x000900BC /* BB add struct */
#define FSCTL_CREATE_OR_GET_OBJECT_ID 0x000900C0 /* BB add struct */
#define FSCTL_SET_SPARSE 0x000900C4 /* BB add struct */
-#define FSCTL_SET_ZERO_DATA 0x000900C8 /* BB add struct */
+#define FSCTL_SET_ZERO_DATA 0x000980C8
#define FSCTL_SET_ENCRYPTION 0x000900D7 /* BB add struct */
#define FSCTL_ENCRYPTION_FSCTL_IO 0x000900DB /* BB add struct */
#define FSCTL_WRITE_RAW_ENCRYPTED 0x000900DF /* BB add struct */
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 18cd5650a5fc..9d087f4e7d4e 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -448,6 +448,15 @@ wait_for_free_request(struct TCP_Server_Info *server, const int timeout,
return wait_for_free_credits(server, timeout, val);
}
+int
+cifs_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
+ unsigned int *num, unsigned int *credits)
+{
+ *num = size;
+ *credits = 0;
+ return 0;
+}
+
static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
struct mid_q_entry **ppmidQ)
{
@@ -531,20 +540,23 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
{
int rc, timeout, optype;
struct mid_q_entry *mid;
+ unsigned int credits = 0;
timeout = flags & CIFS_TIMEOUT_MASK;
optype = flags & CIFS_OP_MASK;
- rc = wait_for_free_request(server, timeout, optype);
- if (rc)
- return rc;
+ if ((flags & CIFS_HAS_CREDITS) == 0) {
+ rc = wait_for_free_request(server, timeout, optype);
+ if (rc)
+ return rc;
+ credits = 1;
+ }
mutex_lock(&server->srv_mutex);
mid = server->ops->setup_async_request(server, rqst);
if (IS_ERR(mid)) {
mutex_unlock(&server->srv_mutex);
- add_credits(server, 1, optype);
- wake_up(&server->request_q);
+ add_credits_and_wake_if(server, credits, optype);
return PTR_ERR(mid);
}
@@ -572,8 +584,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
return 0;
cifs_delete_mid(mid);
- add_credits(server, 1, optype);
- wake_up(&server->request_q);
+ add_credits_and_wake_if(server, credits, optype);
return rc;
}
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 5ac836a86b18..72a4d10653d6 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -28,6 +28,8 @@
#include "cifsglob.h"
#include "cifsproto.h"
#include "cifs_debug.h"
+#include "cifs_fs_sb.h"
+#include "cifs_unicode.h"
#define MAX_EA_VALUE_SIZE 65535
#define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib"
@@ -85,8 +87,7 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
if (pTcon->ses->server->ops->set_EA)
rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
full_path, ea_name, NULL, (__u16)0,
- cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_sb->local_nls, cifs_remap(cifs_sb));
}
remove_ea_exit:
kfree(full_path);
@@ -154,8 +155,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
if (pTcon->ses->server->ops->set_EA)
rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
full_path, ea_name, ea_value, (__u16)value_size,
- cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_sb->local_nls, cifs_remap(cifs_sb));
} else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)
== 0) {
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
@@ -165,8 +165,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
if (pTcon->ses->server->ops->set_EA)
rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
full_path, ea_name, ea_value, (__u16)value_size,
- cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_sb->local_nls, cifs_remap(cifs_sb));
} else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
#ifdef CONFIG_CIFS_ACL
@@ -199,8 +198,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
rc = CIFSSMBSetPosixACL(xid, pTcon, full_path,
ea_value, (const int)value_size,
ACL_TYPE_ACCESS, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_remap(cifs_sb));
cifs_dbg(FYI, "set POSIX ACL rc %d\n", rc);
#else
cifs_dbg(FYI, "set POSIX ACL not supported\n");
@@ -212,8 +210,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
rc = CIFSSMBSetPosixACL(xid, pTcon, full_path,
ea_value, (const int)value_size,
ACL_TYPE_DEFAULT, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_remap(cifs_sb));
cifs_dbg(FYI, "set POSIX default ACL rc %d\n", rc);
#else
cifs_dbg(FYI, "set default POSIX ACL not supported\n");
@@ -285,8 +282,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
if (pTcon->ses->server->ops->query_all_EAs)
rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
full_path, ea_name, ea_value, buf_size,
- cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_sb->local_nls, cifs_remap(cifs_sb));
} else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
goto get_ea_exit;
@@ -295,8 +291,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
if (pTcon->ses->server->ops->query_all_EAs)
rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
full_path, ea_name, ea_value, buf_size,
- cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_sb->local_nls, cifs_remap(cifs_sb));
} else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
strlen(POSIX_ACL_XATTR_ACCESS)) == 0) {
#ifdef CONFIG_CIFS_POSIX
@@ -304,8 +299,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
ea_value, buf_size, ACL_TYPE_ACCESS,
cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_remap(cifs_sb));
#else
cifs_dbg(FYI, "Query POSIX ACL not supported yet\n");
#endif /* CONFIG_CIFS_POSIX */
@@ -316,8 +310,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
ea_value, buf_size, ACL_TYPE_DEFAULT,
cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_remap(cifs_sb));
#else
cifs_dbg(FYI, "Query POSIX default ACL not supported yet\n");
#endif /* CONFIG_CIFS_POSIX */
@@ -421,8 +414,7 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
if (pTcon->ses->server->ops->query_all_EAs)
rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
full_path, NULL, data, buf_size,
- cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_sb->local_nls, cifs_remap(cifs_sb));
list_ea_exit:
kfree(full_path);
free_xid(xid);
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 1da168c61d35..278f8fdeb9ef 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -13,7 +13,7 @@
#include <linux/fs.h>
#include <linux/stat.h>
#include <linux/errno.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/sched.h>
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index 2849f41e72a2..1326d38960db 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -13,7 +13,7 @@
#include <linux/fs.h>
#include <linux/stat.h>
#include <linux/errno.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/string.h>
#include <linux/coda.h>
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index cd8a63238b11..9c3dedc000d1 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -19,8 +19,7 @@
#include <linux/string.h>
#include <linux/spinlock.h>
#include <linux/namei.h>
-
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/coda.h>
#include <linux/coda_psdev.h>
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 9e83b7790212..d244d743a232 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -18,7 +18,7 @@
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/coda.h>
#include <linux/coda_psdev.h>
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index fe3afb2de880..b945410bfcd5 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -21,9 +21,7 @@
#include <linux/vfs.h>
#include <linux/slab.h>
#include <linux/pid_namespace.h>
-
-#include <asm/uaccess.h>
-
+#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 3f5de96bbb58..4326d172fc27 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -16,7 +16,7 @@
#include <linux/string.h>
#include <linux/namei.h>
#include <linux/module.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/coda.h>
#include <linux/coda_psdev.h>
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 5c1e4242368b..822629126e89 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -40,7 +40,7 @@
#include <linux/pid_namespace.h>
#include <asm/io.h>
#include <asm/poll.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/coda.h>
#include <linux/coda_psdev.h>
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 21fcf8dcb9cd..5bb6e27298a4 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -27,7 +27,7 @@
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/mutex.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/vfs.h>
diff --git a/fs/compat.c b/fs/compat.c
index 66d3d3c6b4b2..b13df99f3534 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -794,25 +794,21 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
char *kernel_type;
unsigned long data_page;
char *kernel_dev;
- struct filename *dir;
int retval;
- retval = copy_mount_string(type, &kernel_type);
- if (retval < 0)
+ kernel_type = copy_mount_string(type);
+ retval = PTR_ERR(kernel_type);
+ if (IS_ERR(kernel_type))
goto out;
- dir = getname(dir_name);
- retval = PTR_ERR(dir);
- if (IS_ERR(dir))
+ kernel_dev = copy_mount_string(dev_name);
+ retval = PTR_ERR(kernel_dev);
+ if (IS_ERR(kernel_dev))
goto out1;
- retval = copy_mount_string(dev_name, &kernel_dev);
- if (retval < 0)
- goto out2;
-
retval = copy_mount_options(data, &data_page);
if (retval < 0)
- goto out3;
+ goto out2;
retval = -EINVAL;
@@ -821,19 +817,17 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
do_ncp_super_data_conv((void *)data_page);
} else if (!strcmp(kernel_type, NFS4_NAME)) {
if (do_nfs4_super_data_conv((void *) data_page))
- goto out4;
+ goto out3;
}
}
- retval = do_mount(kernel_dev, dir->name, kernel_type,
+ retval = do_mount(kernel_dev, dir_name, kernel_type,
flags, (void*)data_page);
- out4:
- free_page(data_page);
out3:
- kfree(kernel_dev);
+ free_page(data_page);
out2:
- putname(dir);
+ kfree(kernel_dev);
out1:
kfree(kernel_type);
out:
diff --git a/fs/coredump.c b/fs/coredump.c
index a93f7e6ea4cf..b5c86ffd5033 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -199,6 +199,14 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
err = cn_printf(cn, "%d",
task_tgid_nr(current));
break;
+ case 'i':
+ err = cn_printf(cn, "%d",
+ task_pid_vnr(current));
+ break;
+ case 'I':
+ err = cn_printf(cn, "%d",
+ task_pid_nr(current));
+ break;
/* uid */
case 'u':
err = cn_printf(cn, "%d", cred->uid);
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index ddcfe590b8a8..355c522f3585 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -11,6 +11,8 @@
* The actual compression is based on zlib, see the other files.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
@@ -21,7 +23,7 @@
#include <linux/vfs.h>
#include <linux/mutex.h>
#include <uapi/linux/cramfs_fs.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "internal.h"
@@ -153,7 +155,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
static unsigned char read_buffers[READ_BUFFERS][BUFFER_SIZE];
static unsigned buffer_blocknr[READ_BUFFERS];
-static struct super_block * buffer_dev[READ_BUFFERS];
+static struct super_block *buffer_dev[READ_BUFFERS];
static int next_buffer;
/*
@@ -205,6 +207,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
for (i = 0; i < BLKS_PER_BUF; i++) {
struct page *page = pages[i];
+
if (page) {
wait_on_page_locked(page);
if (!PageUptodate(page)) {
@@ -223,6 +226,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
data = read_buffers[buffer];
for (i = 0; i < BLKS_PER_BUF; i++) {
struct page *page = pages[i];
+
if (page) {
memcpy(data, kmap(page), PAGE_CACHE_SIZE);
kunmap(page);
@@ -237,6 +241,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
static void cramfs_kill_sb(struct super_block *sb)
{
struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
+
kill_block_super(sb);
kfree(sbi);
}
@@ -277,7 +282,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
/* check for wrong endianness */
if (super.magic == CRAMFS_MAGIC_WEND) {
if (!silent)
- printk(KERN_ERR "cramfs: wrong endianness\n");
+ pr_err("wrong endianness\n");
return -EINVAL;
}
@@ -287,22 +292,22 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
mutex_unlock(&read_mutex);
if (super.magic != CRAMFS_MAGIC) {
if (super.magic == CRAMFS_MAGIC_WEND && !silent)
- printk(KERN_ERR "cramfs: wrong endianness\n");
+ pr_err("wrong endianness\n");
else if (!silent)
- printk(KERN_ERR "cramfs: wrong magic\n");
+ pr_err("wrong magic\n");
return -EINVAL;
}
}
/* get feature flags first */
if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) {
- printk(KERN_ERR "cramfs: unsupported filesystem features\n");
+ pr_err("unsupported filesystem features\n");
return -EINVAL;
}
/* Check that the root inode is in a sane state */
if (!S_ISDIR(super.root.mode)) {
- printk(KERN_ERR "cramfs: root is not a directory\n");
+ pr_err("root is not a directory\n");
return -EINVAL;
}
/* correct strange, hard-coded permissions of mkcramfs */
@@ -310,23 +315,23 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
root_offset = super.root.offset << 2;
if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) {
- sbi->size=super.size;
- sbi->blocks=super.fsid.blocks;
- sbi->files=super.fsid.files;
+ sbi->size = super.size;
+ sbi->blocks = super.fsid.blocks;
+ sbi->files = super.fsid.files;
} else {
- sbi->size=1<<28;
- sbi->blocks=0;
- sbi->files=0;
+ sbi->size = 1<<28;
+ sbi->blocks = 0;
+ sbi->files = 0;
}
- sbi->magic=super.magic;
- sbi->flags=super.flags;
+ sbi->magic = super.magic;
+ sbi->flags = super.flags;
if (root_offset == 0)
- printk(KERN_INFO "cramfs: empty filesystem");
+ pr_info("empty filesystem");
else if (!(super.flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) &&
((root_offset != sizeof(struct cramfs_super)) &&
(root_offset != 512 + sizeof(struct cramfs_super))))
{
- printk(KERN_ERR "cramfs: bad root offset %lu\n", root_offset);
+ pr_err("bad root offset %lu\n", root_offset);
return -EINVAL;
}
@@ -425,7 +430,7 @@ static int cramfs_readdir(struct file *file, struct dir_context *ctx)
/*
* Lookup and fill in the inode data..
*/
-static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+static struct dentry *cramfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
unsigned int offset = 0;
struct inode *inode = NULL;
@@ -483,7 +488,7 @@ out:
return NULL;
}
-static int cramfs_readpage(struct file *file, struct page * page)
+static int cramfs_readpage(struct file *file, struct page *page)
{
struct inode *inode = page->mapping->host;
u32 maxblock;
@@ -511,7 +516,7 @@ static int cramfs_readpage(struct file *file, struct page * page)
if (compr_len == 0)
; /* hole */
else if (unlikely(compr_len > (PAGE_CACHE_SIZE << 1))) {
- pr_err("cramfs: bad compressed blocksize %u\n",
+ pr_err("bad compressed blocksize %u\n",
compr_len);
goto err;
} else {
diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c
index 1760c1b84d97..ec4f1d4fdad0 100644
--- a/fs/cramfs/uncompress.c
+++ b/fs/cramfs/uncompress.c
@@ -15,6 +15,8 @@
* then is used by multiple filesystems.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/vmalloc.h>
@@ -37,7 +39,7 @@ int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen)
err = zlib_inflateReset(&stream);
if (err != Z_OK) {
- printk("zlib_inflateReset error %d\n", err);
+ pr_err("zlib_inflateReset error %d\n", err);
zlib_inflateEnd(&stream);
zlib_inflateInit(&stream);
}
@@ -48,8 +50,8 @@ int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen)
return stream.total_out;
err:
- printk("Error %d while decompressing!\n", err);
- printk("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen);
+ pr_err("Error %d while decompressing!\n", err);
+ pr_err("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen);
return -EIO;
}
@@ -57,7 +59,7 @@ int cramfs_uncompress_init(void)
{
if (!initialized++) {
stream.workspace = vmalloc(zlib_inflate_workspacesize());
- if ( !stream.workspace ) {
+ if (!stream.workspace) {
initialized = 0;
return -ENOMEM;
}
diff --git a/fs/dcache.c b/fs/dcache.c
index 06f65857a855..3ffef7f4e5cd 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -106,8 +106,7 @@ static inline struct hlist_bl_head *d_hash(const struct dentry *parent,
unsigned int hash)
{
hash += (unsigned long) parent / L1_CACHE_BYTES;
- hash = hash + (hash >> d_hash_shift);
- return dentry_hashtable + (hash & d_hash_mask);
+ return dentry_hashtable + hash_32(hash, d_hash_shift);
}
/* Statistics gathering. */
@@ -236,18 +235,49 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c
return dentry_string_cmp(cs, ct, tcount);
}
+struct external_name {
+ union {
+ atomic_t count;
+ struct rcu_head head;
+ } u;
+ unsigned char name[];
+};
+
+static inline struct external_name *external_name(struct dentry *dentry)
+{
+ return container_of(dentry->d_name.name, struct external_name, name[0]);
+}
+
static void __d_free(struct rcu_head *head)
{
struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
WARN_ON(!hlist_unhashed(&dentry->d_alias));
- if (dname_external(dentry))
- kfree(dentry->d_name.name);
kmem_cache_free(dentry_cache, dentry);
}
+static void __d_free_external(struct rcu_head *head)
+{
+ struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
+ WARN_ON(!hlist_unhashed(&dentry->d_alias));
+ kfree(external_name(dentry));
+ kmem_cache_free(dentry_cache, dentry);
+}
+
+static inline int dname_external(const struct dentry *dentry)
+{
+ return dentry->d_name.name != dentry->d_iname;
+}
+
static void dentry_free(struct dentry *dentry)
{
+ if (unlikely(dname_external(dentry))) {
+ struct external_name *p = external_name(dentry);
+ if (likely(atomic_dec_and_test(&p->u.count))) {
+ call_rcu(&dentry->d_u.d_rcu, __d_free_external);
+ return;
+ }
+ }
/* if dentry was never visible to RCU, immediate free is OK */
if (!(dentry->d_flags & DCACHE_RCUACCESS))
__d_free(&dentry->d_u.d_rcu);
@@ -457,7 +487,7 @@ static void __dentry_kill(struct dentry *dentry)
* inform the fs via d_prune that this dentry is about to be
* unhashed and destroyed.
*/
- if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry))
+ if (dentry->d_flags & DCACHE_OP_PRUNE)
dentry->d_op->d_prune(dentry);
if (dentry->d_flags & DCACHE_LRU_LIST) {
@@ -620,62 +650,6 @@ kill_it:
}
EXPORT_SYMBOL(dput);
-/**
- * d_invalidate - invalidate a dentry
- * @dentry: dentry to invalidate
- *
- * Try to invalidate the dentry if it turns out to be
- * possible. If there are other dentries that can be
- * reached through this one we can't delete it and we
- * return -EBUSY. On success we return 0.
- *
- * no dcache lock.
- */
-
-int d_invalidate(struct dentry * dentry)
-{
- /*
- * If it's already been dropped, return OK.
- */
- spin_lock(&dentry->d_lock);
- if (d_unhashed(dentry)) {
- spin_unlock(&dentry->d_lock);
- return 0;
- }
- /*
- * Check whether to do a partial shrink_dcache
- * to get rid of unused child entries.
- */
- if (!list_empty(&dentry->d_subdirs)) {
- spin_unlock(&dentry->d_lock);
- shrink_dcache_parent(dentry);
- spin_lock(&dentry->d_lock);
- }
-
- /*
- * Somebody else still using it?
- *
- * If it's a directory, we can't drop it
- * for fear of somebody re-populating it
- * with children (even though dropping it
- * would make it unreachable from the root,
- * we might still populate it if it was a
- * working directory or similar).
- * We also need to leave mountpoints alone,
- * directory or not.
- */
- if (dentry->d_lockref.count > 1 && dentry->d_inode) {
- if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) {
- spin_unlock(&dentry->d_lock);
- return -EBUSY;
- }
- }
-
- __d_drop(dentry);
- spin_unlock(&dentry->d_lock);
- return 0;
-}
-EXPORT_SYMBOL(d_invalidate);
/* This must be called with d_lock held */
static inline void __dget_dlock(struct dentry *dentry)
@@ -731,20 +705,18 @@ EXPORT_SYMBOL(dget_parent);
/**
* d_find_alias - grab a hashed alias of inode
* @inode: inode in question
- * @want_discon: flag, used by d_splice_alias, to request
- * that only a DISCONNECTED alias be returned.
*
* If inode has a hashed alias, or is a directory and has any alias,
* acquire the reference to alias and return it. Otherwise return NULL.
* Notice that if inode is a directory there can be only one alias and
* it can be unhashed only if it has no children, or if it is the root
- * of a filesystem.
+ * of a filesystem, or if the directory was renamed and d_revalidate
+ * was the first vfs operation to notice.
*
* If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
- * any other hashed alias over that one unless @want_discon is set,
- * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias.
+ * any other hashed alias over that one.
*/
-static struct dentry *__d_find_alias(struct inode *inode, int want_discon)
+static struct dentry *__d_find_alias(struct inode *inode)
{
struct dentry *alias, *discon_alias;
@@ -756,7 +728,7 @@ again:
if (IS_ROOT(alias) &&
(alias->d_flags & DCACHE_DISCONNECTED)) {
discon_alias = alias;
- } else if (!want_discon) {
+ } else {
__dget_dlock(alias);
spin_unlock(&alias->d_lock);
return alias;
@@ -768,12 +740,9 @@ again:
alias = discon_alias;
spin_lock(&alias->d_lock);
if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
- if (IS_ROOT(alias) &&
- (alias->d_flags & DCACHE_DISCONNECTED)) {
- __dget_dlock(alias);
- spin_unlock(&alias->d_lock);
- return alias;
- }
+ __dget_dlock(alias);
+ spin_unlock(&alias->d_lock);
+ return alias;
}
spin_unlock(&alias->d_lock);
goto again;
@@ -787,7 +756,7 @@ struct dentry *d_find_alias(struct inode *inode)
if (!hlist_empty(&inode->i_dentry)) {
spin_lock(&inode->i_lock);
- de = __d_find_alias(inode, 0);
+ de = __d_find_alias(inode);
spin_unlock(&inode->i_lock);
}
return de;
@@ -806,20 +775,13 @@ restart:
hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
spin_lock(&dentry->d_lock);
if (!dentry->d_lockref.count) {
- /*
- * inform the fs via d_prune that this dentry
- * is about to be unhashed and destroyed.
- */
- if ((dentry->d_flags & DCACHE_OP_PRUNE) &&
- !d_unhashed(dentry))
- dentry->d_op->d_prune(dentry);
-
- __dget_dlock(dentry);
- __d_drop(dentry);
- spin_unlock(&dentry->d_lock);
- spin_unlock(&inode->i_lock);
- dput(dentry);
- goto restart;
+ struct dentry *parent = lock_parent(dentry);
+ if (likely(!dentry->d_lockref.count)) {
+ __dentry_kill(dentry);
+ goto restart;
+ }
+ if (parent)
+ spin_unlock(&parent->d_lock);
}
spin_unlock(&dentry->d_lock);
}
@@ -1200,7 +1162,7 @@ EXPORT_SYMBOL(have_submounts);
* reachable (e.g. NFS can unhash a directory dentry and then the complete
* subtree can become unreachable).
*
- * Only one of check_submounts_and_drop() and d_set_mounted() must succeed. For
+ * Only one of d_invalidate() and d_set_mounted() must succeed. For
* this reason take rename_lock and d_lock on dentry and ancestors.
*/
int d_set_mounted(struct dentry *dentry)
@@ -1209,7 +1171,7 @@ int d_set_mounted(struct dentry *dentry)
int ret = -ENOENT;
write_seqlock(&rename_lock);
for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) {
- /* Need exclusion wrt. check_submounts_and_drop() */
+ /* Need exclusion wrt. d_invalidate() */
spin_lock(&p->d_lock);
if (unlikely(d_unhashed(p))) {
spin_unlock(&p->d_lock);
@@ -1353,70 +1315,84 @@ void shrink_dcache_for_umount(struct super_block *sb)
}
}
-static enum d_walk_ret check_and_collect(void *_data, struct dentry *dentry)
+struct detach_data {
+ struct select_data select;
+ struct dentry *mountpoint;
+};
+static enum d_walk_ret detach_and_collect(void *_data, struct dentry *dentry)
{
- struct select_data *data = _data;
+ struct detach_data *data = _data;
if (d_mountpoint(dentry)) {
- data->found = -EBUSY;
+ __dget_dlock(dentry);
+ data->mountpoint = dentry;
return D_WALK_QUIT;
}
- return select_collect(_data, dentry);
+ return select_collect(&data->select, dentry);
}
static void check_and_drop(void *_data)
{
- struct select_data *data = _data;
+ struct detach_data *data = _data;
- if (d_mountpoint(data->start))
- data->found = -EBUSY;
- if (!data->found)
- __d_drop(data->start);
+ if (!data->mountpoint && !data->select.found)
+ __d_drop(data->select.start);
}
/**
- * check_submounts_and_drop - prune dcache, check for submounts and drop
+ * d_invalidate - detach submounts, prune dcache, and drop
+ * @dentry: dentry to invalidate (aka detach, prune and drop)
*
- * All done as a single atomic operation relative to has_unlinked_ancestor().
- * Returns 0 if successfully unhashed @parent. If there were submounts then
- * return -EBUSY.
+ * no dcache lock.
*
- * @dentry: dentry to prune and drop
+ * The final d_drop is done as an atomic operation relative to
+ * rename_lock ensuring there are no races with d_set_mounted. This
+ * ensures there are no unhashed dentries on the path to a mountpoint.
*/
-int check_submounts_and_drop(struct dentry *dentry)
+void d_invalidate(struct dentry *dentry)
{
- int ret = 0;
+ /*
+ * If it's already been dropped, return OK.
+ */
+ spin_lock(&dentry->d_lock);
+ if (d_unhashed(dentry)) {
+ spin_unlock(&dentry->d_lock);
+ return;
+ }
+ spin_unlock(&dentry->d_lock);
/* Negative dentries can be dropped without further checks */
if (!dentry->d_inode) {
d_drop(dentry);
- goto out;
+ return;
}
for (;;) {
- struct select_data data;
+ struct detach_data data;
- INIT_LIST_HEAD(&data.dispose);
- data.start = dentry;
- data.found = 0;
+ data.mountpoint = NULL;
+ INIT_LIST_HEAD(&data.select.dispose);
+ data.select.start = dentry;
+ data.select.found = 0;
+
+ d_walk(dentry, &data, detach_and_collect, check_and_drop);
- d_walk(dentry, &data, check_and_collect, check_and_drop);
- ret = data.found;
+ if (data.select.found)
+ shrink_dentry_list(&data.select.dispose);
- if (!list_empty(&data.dispose))
- shrink_dentry_list(&data.dispose);
+ if (data.mountpoint) {
+ detach_mounts(data.mountpoint);
+ dput(data.mountpoint);
+ }
- if (ret <= 0)
+ if (!data.mountpoint && !data.select.found)
break;
cond_resched();
}
-
-out:
- return ret;
}
-EXPORT_SYMBOL(check_submounts_and_drop);
+EXPORT_SYMBOL(d_invalidate);
/**
* __d_alloc - allocate a dcache entry
@@ -1445,11 +1421,14 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
*/
dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
if (name->len > DNAME_INLINE_LEN-1) {
- dname = kmalloc(name->len + 1, GFP_KERNEL);
- if (!dname) {
+ size_t size = offsetof(struct external_name, name[1]);
+ struct external_name *p = kmalloc(size + name->len, GFP_KERNEL);
+ if (!p) {
kmem_cache_free(dentry_cache, dentry);
return NULL;
}
+ atomic_set(&p->u.count, 1);
+ dname = p->name;
} else {
dname = dentry->d_iname;
}
@@ -1781,25 +1760,7 @@ struct dentry *d_find_any_alias(struct inode *inode)
}
EXPORT_SYMBOL(d_find_any_alias);
-/**
- * d_obtain_alias - find or allocate a dentry for a given inode
- * @inode: inode to allocate the dentry for
- *
- * Obtain a dentry for an inode resulting from NFS filehandle conversion or
- * similar open by handle operations. The returned dentry may be anonymous,
- * or may have a full name (if the inode was already in the cache).
- *
- * When called on a directory inode, we must ensure that the inode only ever
- * has one dentry. If a dentry is found, that is returned instead of
- * allocating a new one.
- *
- * On successful return, the reference to the inode has been transferred
- * to the dentry. In case of an error the reference on the inode is released.
- * To make it easier to use in export operations a %NULL or IS_ERR inode may
- * be passed in and will be the error will be propagate to the return value,
- * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
- */
-struct dentry *d_obtain_alias(struct inode *inode)
+static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected)
{
static const struct qstr anonstring = QSTR_INIT("/", 1);
struct dentry *tmp;
@@ -1830,7 +1791,10 @@ struct dentry *d_obtain_alias(struct inode *inode)
}
/* attach a disconnected dentry */
- add_flags = d_flags_for_inode(inode) | DCACHE_DISCONNECTED;
+ add_flags = d_flags_for_inode(inode);
+
+ if (disconnected)
+ add_flags |= DCACHE_DISCONNECTED;
spin_lock(&tmp->d_lock);
tmp->d_inode = inode;
@@ -1851,59 +1815,51 @@ struct dentry *d_obtain_alias(struct inode *inode)
iput(inode);
return res;
}
-EXPORT_SYMBOL(d_obtain_alias);
/**
- * d_splice_alias - splice a disconnected dentry into the tree if one exists
- * @inode: the inode which may have a disconnected dentry
- * @dentry: a negative dentry which we want to point to the inode.
- *
- * If inode is a directory and has a 'disconnected' dentry (i.e. IS_ROOT and
- * DCACHE_DISCONNECTED), then d_move that in place of the given dentry
- * and return it, else simply d_add the inode to the dentry and return NULL.
+ * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode
+ * @inode: inode to allocate the dentry for
*
- * This is needed in the lookup routine of any filesystem that is exportable
- * (via knfsd) so that we can build dcache paths to directories effectively.
+ * Obtain a dentry for an inode resulting from NFS filehandle conversion or
+ * similar open by handle operations. The returned dentry may be anonymous,
+ * or may have a full name (if the inode was already in the cache).
*
- * If a dentry was found and moved, then it is returned. Otherwise NULL
- * is returned. This matches the expected return value of ->lookup.
+ * When called on a directory inode, we must ensure that the inode only ever
+ * has one dentry. If a dentry is found, that is returned instead of
+ * allocating a new one.
*
- * Cluster filesystems may call this function with a negative, hashed dentry.
- * In that case, we know that the inode will be a regular file, and also this
- * will only occur during atomic_open. So we need to check for the dentry
- * being already hashed only in the final case.
+ * On successful return, the reference to the inode has been transferred
+ * to the dentry. In case of an error the reference on the inode is released.
+ * To make it easier to use in export operations a %NULL or IS_ERR inode may
+ * be passed in and the error will be propagated to the return value,
+ * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
*/
-struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
+struct dentry *d_obtain_alias(struct inode *inode)
{
- struct dentry *new = NULL;
-
- if (IS_ERR(inode))
- return ERR_CAST(inode);
+ return __d_obtain_alias(inode, 1);
+}
+EXPORT_SYMBOL(d_obtain_alias);
- if (inode && S_ISDIR(inode->i_mode)) {
- spin_lock(&inode->i_lock);
- new = __d_find_alias(inode, 1);
- if (new) {
- BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
- spin_unlock(&inode->i_lock);
- security_d_instantiate(new, inode);
- d_move(new, dentry);
- iput(inode);
- } else {
- /* already taking inode->i_lock, so d_add() by hand */
- __d_instantiate(dentry, inode);
- spin_unlock(&inode->i_lock);
- security_d_instantiate(dentry, inode);
- d_rehash(dentry);
- }
- } else {
- d_instantiate(dentry, inode);
- if (d_unhashed(dentry))
- d_rehash(dentry);
- }
- return new;
+/**
+ * d_obtain_root - find or allocate a dentry for a given inode
+ * @inode: inode to allocate the dentry for
+ *
+ * Obtain an IS_ROOT dentry for the root of a filesystem.
+ *
+ * We must ensure that directory inodes only ever have one dentry. If a
+ * dentry is found, that is returned instead of allocating a new one.
+ *
+ * On successful return, the reference to the inode has been transferred
+ * to the dentry. In case of an error the reference on the inode is
+ * released. A %NULL or IS_ERR inode may be passed in and will be the
+ * error will be propagate to the return value, with a %NULL @inode
+ * replaced by ERR_PTR(-ESTALE).
+ */
+struct dentry *d_obtain_root(struct inode *inode)
+{
+ return __d_obtain_alias(inode, 0);
}
-EXPORT_SYMBOL(d_splice_alias);
+EXPORT_SYMBOL(d_obtain_root);
/**
* d_add_ci - lookup or allocate new dentry with case-exact name
@@ -2142,10 +2098,10 @@ struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name)
struct dentry *dentry;
unsigned seq;
- do {
- seq = read_seqbegin(&rename_lock);
- dentry = __d_lookup(parent, name);
- if (dentry)
+ do {
+ seq = read_seqbegin(&rename_lock);
+ dentry = __d_lookup(parent, name);
+ if (dentry)
break;
} while (read_seqretry(&rename_lock, seq));
return dentry;
@@ -2402,10 +2358,10 @@ void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
}
EXPORT_SYMBOL(dentry_update_name_case);
-static void switch_names(struct dentry *dentry, struct dentry *target)
+static void swap_names(struct dentry *dentry, struct dentry *target)
{
- if (dname_external(target)) {
- if (dname_external(dentry)) {
+ if (unlikely(dname_external(target))) {
+ if (unlikely(dname_external(dentry))) {
/*
* Both external: swap the pointers
*/
@@ -2421,7 +2377,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
target->d_name.name = target->d_iname;
}
} else {
- if (dname_external(dentry)) {
+ if (unlikely(dname_external(dentry))) {
/*
* dentry:external, target:internal. Give dentry's
* storage to target and make dentry internal
@@ -2442,7 +2398,25 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
}
}
}
- swap(dentry->d_name.len, target->d_name.len);
+ swap(dentry->d_name.hash_len, target->d_name.hash_len);
+}
+
+static void copy_name(struct dentry *dentry, struct dentry *target)
+{
+ struct external_name *old_name = NULL;
+ if (unlikely(dname_external(dentry)))
+ old_name = external_name(dentry);
+ if (unlikely(dname_external(target))) {
+ atomic_inc(&external_name(target)->u.count);
+ dentry->d_name = target->d_name;
+ } else {
+ memcpy(dentry->d_iname, target->d_name.name,
+ target->d_name.len + 1);
+ dentry->d_name.name = dentry->d_iname;
+ dentry->d_name.hash_len = target->d_name.hash_len;
+ }
+ if (old_name && likely(atomic_dec_and_test(&old_name->u.count)))
+ kfree_rcu(old_name, u.head);
}
static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target)
@@ -2472,25 +2446,29 @@ static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target)
}
}
-static void dentry_unlock_parents_for_move(struct dentry *dentry,
- struct dentry *target)
+static void dentry_unlock_for_move(struct dentry *dentry, struct dentry *target)
{
if (target->d_parent != dentry->d_parent)
spin_unlock(&dentry->d_parent->d_lock);
if (target->d_parent != target)
spin_unlock(&target->d_parent->d_lock);
+ spin_unlock(&target->d_lock);
+ spin_unlock(&dentry->d_lock);
}
/*
* When switching names, the actual string doesn't strictly have to
* be preserved in the target - because we're dropping the target
* anyway. As such, we can just do a simple memcpy() to copy over
- * the new name before we switch.
- *
- * Note that we have to be a lot more careful about getting the hash
- * switched - we have to switch the hash value properly even if it
- * then no longer matches the actual (corrupted) string of the target.
- * The hash value has to match the hash queue that the dentry is on..
+ * the new name before we switch, unless we are going to rehash
+ * it. Note that if we *do* unhash the target, we are not allowed
+ * to rehash it without giving it a new name/hash key - whether
+ * we swap or overwrite the names here, resulting name won't match
+ * the reality in filesystem; it's only there for d_path() purposes.
+ * Note that all of this is happening under rename_lock, so the
+ * any hash lookup seeing it in the middle of manipulations will
+ * be discarded anyway. So we do not care what happens to the hash
+ * key in that case.
*/
/*
* __d_move - move a dentry
@@ -2536,36 +2514,33 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
d_hash(dentry->d_parent, dentry->d_name.hash));
}
- list_del(&dentry->d_u.d_child);
- list_del(&target->d_u.d_child);
-
/* Switch the names.. */
- switch_names(dentry, target);
- swap(dentry->d_name.hash, target->d_name.hash);
+ if (exchange)
+ swap_names(dentry, target);
+ else
+ copy_name(dentry, target);
- /* ... and switch the parents */
+ /* ... and switch them in the tree */
if (IS_ROOT(dentry)) {
+ /* splicing a tree */
dentry->d_parent = target->d_parent;
target->d_parent = target;
- INIT_LIST_HEAD(&target->d_u.d_child);
+ list_del_init(&target->d_u.d_child);
+ list_move(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
} else {
+ /* swapping two dentries */
swap(dentry->d_parent, target->d_parent);
-
- /* And add them back to the (new) parent lists */
- list_add(&target->d_u.d_child, &target->d_parent->d_subdirs);
+ list_move(&target->d_u.d_child, &target->d_parent->d_subdirs);
+ list_move(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
+ if (exchange)
+ fsnotify_d_move(target);
+ fsnotify_d_move(dentry);
}
- list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
-
write_seqcount_end(&target->d_seq);
write_seqcount_end(&dentry->d_seq);
- dentry_unlock_parents_for_move(dentry, target);
- if (exchange)
- fsnotify_d_move(target);
- spin_unlock(&target->d_lock);
- fsnotify_d_move(dentry);
- spin_unlock(&dentry->d_lock);
+ dentry_unlock_for_move(dentry, target);
}
/*
@@ -2650,10 +2625,8 @@ static struct dentry *__d_unalias(struct inode *inode,
goto out_err;
m2 = &alias->d_parent->d_inode->i_mutex;
out_unalias:
- if (likely(!d_mountpoint(alias))) {
- __d_move(alias, dentry, false);
- ret = alias;
- }
+ __d_move(alias, dentry, false);
+ ret = alias;
out_err:
spin_unlock(&inode->i_lock);
if (m2)
@@ -2663,38 +2636,73 @@ out_err:
return ret;
}
-/*
- * Prepare an anonymous dentry for life in the superblock's dentry tree as a
- * named dentry in place of the dentry to be replaced.
- * returns with anon->d_lock held!
+/**
+ * d_splice_alias - splice a disconnected dentry into the tree if one exists
+ * @inode: the inode which may have a disconnected dentry
+ * @dentry: a negative dentry which we want to point to the inode.
+ *
+ * If inode is a directory and has an IS_ROOT alias, then d_move that in
+ * place of the given dentry and return it, else simply d_add the inode
+ * to the dentry and return NULL.
+ *
+ * If a non-IS_ROOT directory is found, the filesystem is corrupt, and
+ * we should error out: directories can't have multiple aliases.
+ *
+ * This is needed in the lookup routine of any filesystem that is exportable
+ * (via knfsd) so that we can build dcache paths to directories effectively.
+ *
+ * If a dentry was found and moved, then it is returned. Otherwise NULL
+ * is returned. This matches the expected return value of ->lookup.
+ *
+ * Cluster filesystems may call this function with a negative, hashed dentry.
+ * In that case, we know that the inode will be a regular file, and also this
+ * will only occur during atomic_open. So we need to check for the dentry
+ * being already hashed only in the final case.
*/
-static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
+struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
{
- struct dentry *dparent;
-
- dentry_lock_for_move(anon, dentry);
-
- write_seqcount_begin(&dentry->d_seq);
- write_seqcount_begin_nested(&anon->d_seq, DENTRY_D_LOCK_NESTED);
-
- dparent = dentry->d_parent;
-
- switch_names(dentry, anon);
- swap(dentry->d_name.hash, anon->d_name.hash);
-
- dentry->d_parent = dentry;
- list_del_init(&dentry->d_u.d_child);
- anon->d_parent = dparent;
- list_move(&anon->d_u.d_child, &dparent->d_subdirs);
-
- write_seqcount_end(&dentry->d_seq);
- write_seqcount_end(&anon->d_seq);
+ struct dentry *new = NULL;
- dentry_unlock_parents_for_move(anon, dentry);
- spin_unlock(&dentry->d_lock);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
- /* anon->d_lock still locked, returns locked */
+ if (inode && S_ISDIR(inode->i_mode)) {
+ spin_lock(&inode->i_lock);
+ new = __d_find_any_alias(inode);
+ if (new) {
+ if (!IS_ROOT(new)) {
+ spin_unlock(&inode->i_lock);
+ dput(new);
+ iput(inode);
+ return ERR_PTR(-EIO);
+ }
+ if (d_ancestor(new, dentry)) {
+ spin_unlock(&inode->i_lock);
+ dput(new);
+ iput(inode);
+ return ERR_PTR(-EIO);
+ }
+ write_seqlock(&rename_lock);
+ __d_move(new, dentry, false);
+ write_sequnlock(&rename_lock);
+ spin_unlock(&inode->i_lock);
+ security_d_instantiate(new, inode);
+ iput(inode);
+ } else {
+ /* already taking inode->i_lock, so d_add() by hand */
+ __d_instantiate(dentry, inode);
+ spin_unlock(&inode->i_lock);
+ security_d_instantiate(dentry, inode);
+ d_rehash(dentry);
+ }
+ } else {
+ d_instantiate(dentry, inode);
+ if (d_unhashed(dentry))
+ d_rehash(dentry);
+ }
+ return new;
}
+EXPORT_SYMBOL(d_splice_alias);
/**
* d_materialise_unique - introduce an inode into the tree
@@ -2724,7 +2732,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
struct dentry *alias;
/* Does an aliased dentry already exist? */
- alias = __d_find_alias(inode, 0);
+ alias = __d_find_alias(inode);
if (alias) {
actual = alias;
write_seqlock(&rename_lock);
@@ -2736,9 +2744,8 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
} else if (IS_ROOT(alias)) {
/* Is this an anonymous mountpoint that we
* could splice into our tree? */
- __d_materialise_dentry(dentry, alias);
+ __d_move(alias, dentry, false);
write_sequnlock(&rename_lock);
- __d_drop(alias);
goto found;
} else {
/* Nope, but we must(!) avoid directory
@@ -2764,13 +2771,9 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
actual = __d_instantiate_unique(dentry, inode);
if (!actual)
actual = dentry;
- else
- BUG_ON(!d_unhashed(actual));
- spin_lock(&actual->d_lock);
+ d_rehash(actual);
found:
- _d_rehash(actual);
- spin_unlock(&actual->d_lock);
spin_unlock(&inode->i_lock);
out_nolock:
if (actual == dentry) {
@@ -2807,6 +2810,9 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen)
* the beginning of the name. The sequence number check at the caller will
* retry it again when a d_move() does happen. So any garbage in the buffer
* due to mismatched pointer and length will be discarded.
+ *
+ * Data dependency barrier is needed to make sure that we see that terminating
+ * NUL. Alpha strikes again, film at 11...
*/
static int prepend_name(char **buffer, int *buflen, struct qstr *name)
{
@@ -2814,6 +2820,8 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
u32 dlen = ACCESS_ONCE(name->len);
char *p;
+ smp_read_barrier_depends();
+
*buflen -= dlen + 1;
if (*buflen < 0)
return -ENAMETOOLONG;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 17e39b047de5..e181b6b2e297 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -158,7 +158,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
{
ssize_t ret;
- ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES * PAGE_SIZE,
+ ret = iov_iter_get_pages(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES,
&sdio->from);
if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 8d77ba7b1756..1323c568e362 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -718,16 +718,11 @@ static const struct file_operations waiters_fops = {
void dlm_delete_debug_file(struct dlm_ls *ls)
{
- if (ls->ls_debug_rsb_dentry)
- debugfs_remove(ls->ls_debug_rsb_dentry);
- if (ls->ls_debug_waiters_dentry)
- debugfs_remove(ls->ls_debug_waiters_dentry);
- if (ls->ls_debug_locks_dentry)
- debugfs_remove(ls->ls_debug_locks_dentry);
- if (ls->ls_debug_all_dentry)
- debugfs_remove(ls->ls_debug_all_dentry);
- if (ls->ls_debug_toss_dentry)
- debugfs_remove(ls->ls_debug_toss_dentry);
+ debugfs_remove(ls->ls_debug_rsb_dentry);
+ debugfs_remove(ls->ls_debug_waiters_dentry);
+ debugfs_remove(ls->ls_debug_locks_dentry);
+ debugfs_remove(ls->ls_debug_all_dentry);
+ debugfs_remove(ls->ls_debug_toss_dentry);
}
int dlm_create_debug_file(struct dlm_ls *ls)
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index f704458ea5f5..e0ab3a93eeff 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -30,7 +30,7 @@ struct plock_op {
struct plock_xop {
struct plock_op xop;
- void *callback;
+ int (*callback)(struct file_lock *fl, int result);
void *fl;
void *file;
struct file_lock flc;
@@ -190,7 +190,7 @@ static int dlm_plock_callback(struct plock_op *op)
struct file *file;
struct file_lock *fl;
struct file_lock *flc;
- int (*notify)(void *, void *, int) = NULL;
+ int (*notify)(struct file_lock *fl, int result) = NULL;
struct plock_xop *xop = (struct plock_xop *)op;
int rv = 0;
@@ -209,7 +209,7 @@ static int dlm_plock_callback(struct plock_op *op)
notify = xop->callback;
if (op->info.rv) {
- notify(fl, NULL, op->info.rv);
+ notify(fl, op->info.rv);
goto out;
}
@@ -228,7 +228,7 @@ static int dlm_plock_callback(struct plock_op *op)
(unsigned long long)op->info.number, file, fl);
}
- rv = notify(fl, NULL, 0);
+ rv = notify(fl, 0);
if (rv) {
/* XXX: We need to cancel the fs lock here: */
log_print("dlm_plock_callback: lock granted after lock request "
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 9d61947d473a..f3f5e72a29ba 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -206,7 +206,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
rs = (struct rcom_status *)rc_in->rc_buf;
- if (!(rs->rs_flags & DLM_RSF_NEED_SLOTS)) {
+ if (!(le32_to_cpu(rs->rs_flags) & DLM_RSF_NEED_SLOTS)) {
status = dlm_recover_status(ls);
goto do_create;
}
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index db0fad3269c0..f5bce9096555 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -229,8 +229,8 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
if (rc) {
printk(KERN_ERR "%s: Error attempting to initialize "
"the lower file for the dentry with name "
- "[%s]; rc = [%d]\n", __func__,
- ecryptfs_dentry->d_name.name, rc);
+ "[%pd]; rc = [%d]\n", __func__,
+ ecryptfs_dentry, rc);
goto out_free;
}
if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE)
@@ -327,7 +327,7 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct file *lower_file = ecryptfs_file_to_lower(file);
long rc = -ENOIOCTLCMD;
- if (lower_file->f_op && lower_file->f_op->compat_ioctl)
+ if (lower_file->f_op->compat_ioctl)
rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg);
return rc;
}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index d4a9431ec73c..1686dc2da9fd 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -53,9 +53,7 @@ static void unlock_dir(struct dentry *dir)
static int ecryptfs_inode_test(struct inode *inode, void *lower_inode)
{
- if (ecryptfs_inode_to_lower(inode) == (struct inode *)lower_inode)
- return 1;
- return 0;
+ return ecryptfs_inode_to_lower(inode) == lower_inode;
}
static int ecryptfs_inode_set(struct inode *inode, void *opaque)
@@ -192,12 +190,6 @@ ecryptfs_do_create(struct inode *directory_inode,
lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
lower_dir_dentry = lock_parent(lower_dentry);
- if (IS_ERR(lower_dir_dentry)) {
- ecryptfs_printk(KERN_ERR, "Error locking directory of "
- "dentry\n");
- inode = ERR_CAST(lower_dir_dentry);
- goto out;
- }
rc = vfs_create(lower_dir_dentry->d_inode, lower_dentry, mode, true);
if (rc) {
printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
@@ -215,7 +207,6 @@ ecryptfs_do_create(struct inode *directory_inode,
fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode);
out_lock:
unlock_dir(lower_dir_dentry);
-out:
return inode;
}
@@ -250,8 +241,8 @@ int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry,
if (rc) {
printk(KERN_ERR "%s: Error attempting to initialize "
"the lower file for the dentry with name "
- "[%s]; rc = [%d]\n", __func__,
- ecryptfs_dentry->d_name.name, rc);
+ "[%pd]; rc = [%d]\n", __func__,
+ ecryptfs_dentry, rc);
goto out;
}
rc = ecryptfs_write_metadata(ecryptfs_dentry, ecryptfs_inode);
@@ -313,8 +304,8 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
if (rc) {
printk(KERN_ERR "%s: Error attempting to initialize "
"the lower file for the dentry with name "
- "[%s]; rc = [%d]\n", __func__,
- dentry->d_name.name, rc);
+ "[%pd]; rc = [%d]\n", __func__,
+ dentry, rc);
return rc;
}
@@ -418,8 +409,8 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
if (IS_ERR(lower_dentry)) {
rc = PTR_ERR(lower_dentry);
ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
- "[%d] on lower_dentry = [%s]\n", __func__, rc,
- ecryptfs_dentry->d_name.name);
+ "[%d] on lower_dentry = [%pd]\n", __func__, rc,
+ ecryptfs_dentry);
goto out;
}
if (lower_dentry->d_inode)
@@ -1039,7 +1030,7 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
}
rc = vfs_setxattr(lower_dentry, name, value, size, flags);
- if (!rc)
+ if (!rc && dentry->d_inode)
fsstack_copy_attr_all(dentry->d_inode, lower_dentry->d_inode);
out:
return rc;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 4725a07f003c..635e8e16a5b7 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -26,7 +26,6 @@
*/
#include <linux/string.h>
-#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/key.h>
#include <linux/random.h>
@@ -1846,7 +1845,6 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
"(Tag 11 not allowed by itself)\n");
rc = -EIO;
goto out_wipe_list;
- break;
default:
ecryptfs_printk(KERN_DEBUG, "No packet at offset [%zd] "
"of the file header; hex value of "
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 1b119d3bf924..c4cd1fd86cc2 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -566,6 +566,13 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
s->s_maxbytes = path.dentry->d_sb->s_maxbytes;
s->s_blocksize = path.dentry->d_sb->s_blocksize;
s->s_magic = ECRYPTFS_SUPER_MAGIC;
+ s->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1;
+
+ rc = -EINVAL;
+ if (s->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
+ pr_err("eCryptfs: maximum fs stacking depth exceeded\n");
+ goto out_free;
+ }
inode = ecryptfs_get_inode(path.dentry->d_inode, s);
rc = PTR_ERR(inode);
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index e57380e5f6bd..286f10b0363b 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -434,8 +434,7 @@ void ecryptfs_release_messaging(void)
mutex_lock(&ecryptfs_msg_ctx_lists_mux);
for (i = 0; i < ecryptfs_message_buf_len; i++) {
mutex_lock(&ecryptfs_msg_ctx_arr[i].mux);
- if (ecryptfs_msg_ctx_arr[i].msg)
- kfree(ecryptfs_msg_ctx_arr[i].msg);
+ kfree(ecryptfs_msg_ctx_arr[i].msg);
mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux);
}
kfree(ecryptfs_msg_ctx_arr);
diff --git a/fs/efs/namei.c b/fs/efs/namei.c
index 356c044e2cd3..bbee8f063dfa 100644
--- a/fs/efs/namei.c
+++ b/fs/efs/namei.c
@@ -12,7 +12,8 @@
#include "efs.h"
-static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) {
+static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len)
+{
struct buffer_head *bh;
int slot, namelen;
@@ -40,10 +41,10 @@ static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len)
if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) {
pr_err("%s(): invalid directory block\n", __func__);
brelse(bh);
- return(0);
+ return 0;
}
- for(slot = 0; slot < dirblock->slots; slot++) {
+ for (slot = 0; slot < dirblock->slots; slot++) {
dirslot = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot));
namelen = dirslot->namelen;
@@ -52,12 +53,12 @@ static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len)
if ((namelen == len) && (!memcmp(name, nameptr, len))) {
inodenum = be32_to_cpu(dirslot->inode);
brelse(bh);
- return(inodenum);
+ return inodenum;
}
}
brelse(bh);
}
- return(0);
+ return 0;
}
struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index b10b48c2a7af..7bcfff900f05 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1852,7 +1852,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
goto error_tgt_fput;
/* Check if EPOLLWAKEUP is allowed */
- ep_take_care_of_epollwakeup(&epds);
+ if (ep_op_has_event(op))
+ ep_take_care_of_epollwakeup(&epds);
/*
* We have to check that the file structure underneath the file descriptor
diff --git a/fs/exec.c b/fs/exec.c
index ab1f1200ce5d..7302b75a9820 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -368,10 +368,6 @@ static int bprm_mm_init(struct linux_binprm *bprm)
if (!mm)
goto err;
- err = init_new_context(current, mm);
- if (err)
- goto err;
-
err = __bprm_mm_init(bprm);
if (err)
goto err;
@@ -1376,18 +1372,23 @@ int search_binary_handler(struct linux_binprm *bprm)
read_unlock(&binfmt_lock);
bprm->recursion_depth++;
retval = fmt->load_binary(bprm);
+ read_lock(&binfmt_lock);
+ put_binfmt(fmt);
bprm->recursion_depth--;
- if (retval >= 0 || retval != -ENOEXEC ||
- bprm->mm == NULL || bprm->file == NULL) {
- put_binfmt(fmt);
+ if (retval < 0 && !bprm->mm) {
+ /* we got to flush_old_exec() and failed after it */
+ read_unlock(&binfmt_lock);
+ force_sigsegv(SIGSEGV, current);
+ return retval;
+ }
+ if (retval != -ENOEXEC || !bprm->file) {
+ read_unlock(&binfmt_lock);
return retval;
}
- read_lock(&binfmt_lock);
- put_binfmt(fmt);
}
read_unlock(&binfmt_lock);
- if (need_retry && retval == -ENOEXEC) {
+ if (need_retry) {
if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&
printable(bprm->buf[2]) && printable(bprm->buf[3]))
return retval;
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index 389ba8312d5d..b47c7b8dc275 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -4,7 +4,7 @@
# Copyright (C) 2008 Panasas Inc. All rights reserved.
#
# Authors:
-# Boaz Harrosh <bharrosh@panasas.com>
+# Boaz Harrosh <ooo@electrozaur.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index 3bbd46956d77..7d88ef566213 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -4,7 +4,7 @@
* Copyright (C) 2005, 2006
* Avishay Traeger (avishay@gmail.com)
* Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
*
* Copyrights for code taken from ext2:
* Copyright (C) 1992, 1993, 1994, 1995
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 49f51ab4caac..d7defd557601 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -2,7 +2,7 @@
* Copyright (C) 2005, 2006
* Avishay Traeger (avishay@gmail.com)
* Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
*
* Copyrights for code taken from ext2:
* Copyright (C) 1992, 1993, 1994, 1995
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index fffe86fd7a42..ad9cac670a47 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -2,7 +2,7 @@
* Copyright (C) 2005, 2006
* Avishay Traeger (avishay@gmail.com)
* Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
*
* Copyrights for code taken from ext2:
* Copyright (C) 1992, 1993, 1994, 1995
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 71bf8e4fb5d4..1a376b42d305 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -2,7 +2,7 @@
* Copyright (C) 2005, 2006
* Avishay Traeger (avishay@gmail.com)
* Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
*
* Copyrights for code taken from ext2:
* Copyright (C) 1992, 1993, 1994, 1995
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 3f9cafd73931..f1d3d4eb8c4f 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -2,7 +2,7 @@
* Copyright (C) 2005, 2006
* Avishay Traeger (avishay@gmail.com)
* Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
*
* Copyrights for code taken from ext2:
* Copyright (C) 1992, 1993, 1994, 1995
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 4731fd991efe..28907460e8fa 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -2,7 +2,7 @@
* Copyright (C) 2005, 2006
* Avishay Traeger (avishay@gmail.com)
* Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
*
* Copyrights for code taken from ext2:
* Copyright (C) 1992, 1993, 1994, 1995
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index cfc0205d62c4..7bd8ac8dfb28 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -2,7 +2,7 @@
* Copyright (C) 2005, 2006
* Avishay Traeger (avishay@gmail.com)
* Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
*
* This file is part of exofs.
*
@@ -29,7 +29,7 @@
#include "ore_raid.h"
-MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
+MODULE_AUTHOR("Boaz Harrosh <ooo@electrozaur.com>");
MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
MODULE_LICENSE("GPL");
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index 7f20f25c232c..27cbdb697649 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) 2011
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
*
* This file is part of the objects raid engine (ore).
*
@@ -116,7 +116,7 @@ static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width,
num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa,
pages_in_unit - i);
- __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL);
+ __a1pa = kcalloc(num_a1pa, sizeof__a1pa, GFP_KERNEL);
if (unlikely(!__a1pa)) {
ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n",
num_a1pa);
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h
index cf6375d82129..a6e746775570 100644
--- a/fs/exofs/ore_raid.h
+++ b/fs/exofs/ore_raid.h
@@ -1,6 +1,6 @@
/*
* Copyright (C) from 2011
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
*
* This file is part of the objects raid engine (ore).
*
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index ed73ed8ebbee..95965503afcb 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -2,7 +2,7 @@
* Copyright (C) 2005, 2006
* Avishay Traeger (avishay@gmail.com)
* Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
*
* Copyrights for code taken from ext2:
* Copyright (C) 1992, 1993, 1994, 1995
diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c
index 4dd687c3e747..832e2624b80b 100644
--- a/fs/exofs/symlink.c
+++ b/fs/exofs/symlink.c
@@ -2,7 +2,7 @@
* Copyright (C) 2005, 2006
* Avishay Traeger (avishay@gmail.com)
* Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
*
* Copyrights for code taken from ext2:
* Copyright (C) 1992, 1993, 1994, 1995
diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c
index 1b4f2f95fc37..5e6a2c0a1f0b 100644
--- a/fs/exofs/sys.c
+++ b/fs/exofs/sys.c
@@ -1,7 +1,7 @@
/*
* Copyright (C) 2012
* Sachin Bhamare <sbhamare@panasas.com>
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
*
* This file is part of exofs.
*
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 3750031cfa2f..170dc41e8bf4 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -161,7 +161,7 @@ static struct kmem_cache * ext2_inode_cachep;
static struct inode *ext2_alloc_inode(struct super_block *sb)
{
struct ext2_inode_info *ei;
- ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL);
+ ei = kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
ei->i_block_alloc_info = NULL;
@@ -1067,14 +1067,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
ext2_rsv_window_add(sb, &sbi->s_rsv_window_head);
err = percpu_counter_init(&sbi->s_freeblocks_counter,
- ext2_count_free_blocks(sb));
+ ext2_count_free_blocks(sb), GFP_KERNEL);
if (!err) {
err = percpu_counter_init(&sbi->s_freeinodes_counter,
- ext2_count_free_inodes(sb));
+ ext2_count_free_inodes(sb), GFP_KERNEL);
}
if (!err) {
err = percpu_counter_init(&sbi->s_dirs_counter,
- ext2_count_dirs(sb));
+ ext2_count_dirs(sb), GFP_KERNEL);
}
if (err) {
ext2_msg(sb, KERN_ERR, "error: insufficient memory");
diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h
index e85ff15a060e..fc3cdcf24aed 100644
--- a/fs/ext3/ext3.h
+++ b/fs/ext3/ext3.h
@@ -237,6 +237,8 @@ struct ext3_new_group_data {
#define EXT3_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
#define EXT3_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
+/* Number of supported quota types */
+#define EXT3_MAXQUOTAS 2
/*
* Mount options
@@ -248,7 +250,7 @@ struct ext3_mount_options {
unsigned long s_commit_interval;
#ifdef CONFIG_QUOTA
int s_jquota_fmt;
- char *s_qf_names[MAXQUOTAS];
+ char *s_qf_names[EXT3_MAXQUOTAS];
#endif
};
@@ -669,7 +671,7 @@ struct ext3_sb_info {
unsigned long s_commit_interval;
struct block_device *journal_bdev;
#ifdef CONFIG_QUOTA
- char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
+ char *s_qf_names[EXT3_MAXQUOTAS]; /* Names of quota files with journalled quota */
int s_jquota_fmt; /* Format of quota to use */
#endif
};
@@ -1183,9 +1185,9 @@ extern const struct inode_operations ext3_fast_symlink_inode_operations;
#define EXT3_QUOTA_INIT_BLOCKS(sb) 0
#define EXT3_QUOTA_DEL_BLOCKS(sb) 0
#endif
-#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb))
-#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb))
-#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb))
+#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb))
+#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb))
+#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb))
int
ext3_mark_iloc_dirty(handle_t *handle,
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 08cdfe5461e3..eb742d0e67ff 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -441,7 +441,7 @@ static void ext3_put_super (struct super_block * sb)
percpu_counter_destroy(&sbi->s_dirs_counter);
brelse(sbi->s_sbh);
#ifdef CONFIG_QUOTA
- for (i = 0; i < MAXQUOTAS; i++)
+ for (i = 0; i < EXT3_MAXQUOTAS; i++)
kfree(sbi->s_qf_names[i]);
#endif
@@ -1354,13 +1354,6 @@ set_qf_format:
"not specified.");
return 0;
}
- } else {
- if (sbi->s_jquota_fmt) {
- ext3_msg(sb, KERN_ERR, "error: journaled quota format "
- "specified with no journaling "
- "enabled.");
- return 0;
- }
}
#endif
return 1;
@@ -1555,7 +1548,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
/* Needed for iput() to work correctly and not trash data */
sb->s_flags |= MS_ACTIVE;
/* Turn on quotas so that they are updated correctly */
- for (i = 0; i < MAXQUOTAS; i++) {
+ for (i = 0; i < EXT3_MAXQUOTAS; i++) {
if (EXT3_SB(sb)->s_qf_names[i]) {
int ret = ext3_quota_on_mount(sb, i);
if (ret < 0)
@@ -1606,7 +1599,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
PLURAL(nr_truncates));
#ifdef CONFIG_QUOTA
/* Turn quotas off */
- for (i = 0; i < MAXQUOTAS; i++) {
+ for (i = 0; i < EXT3_MAXQUOTAS; i++) {
if (sb_dqopt(sb)->files[i])
dquot_quota_off(sb, i);
}
@@ -2039,14 +2032,14 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
goto failed_mount2;
}
err = percpu_counter_init(&sbi->s_freeblocks_counter,
- ext3_count_free_blocks(sb));
+ ext3_count_free_blocks(sb), GFP_KERNEL);
if (!err) {
err = percpu_counter_init(&sbi->s_freeinodes_counter,
- ext3_count_free_inodes(sb));
+ ext3_count_free_inodes(sb), GFP_KERNEL);
}
if (!err) {
err = percpu_counter_init(&sbi->s_dirs_counter,
- ext3_count_dirs(sb));
+ ext3_count_dirs(sb), GFP_KERNEL);
}
if (err) {
ext3_msg(sb, KERN_ERR, "error: insufficient memory");
@@ -2139,7 +2132,7 @@ failed_mount2:
kfree(sbi->s_group_desc);
failed_mount:
#ifdef CONFIG_QUOTA
- for (i = 0; i < MAXQUOTAS; i++)
+ for (i = 0; i < EXT3_MAXQUOTAS; i++)
kfree(sbi->s_qf_names[i]);
#endif
ext3_blkdev_remove(sbi);
@@ -2659,7 +2652,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
old_opts.s_commit_interval = sbi->s_commit_interval;
#ifdef CONFIG_QUOTA
old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
- for (i = 0; i < MAXQUOTAS; i++)
+ for (i = 0; i < EXT3_MAXQUOTAS; i++)
if (sbi->s_qf_names[i]) {
old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
GFP_KERNEL);
@@ -2763,7 +2756,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
}
#ifdef CONFIG_QUOTA
/* Release old quota file names */
- for (i = 0; i < MAXQUOTAS; i++)
+ for (i = 0; i < EXT3_MAXQUOTAS; i++)
kfree(old_opts.s_qf_names[i]);
#endif
if (enable_quota)
@@ -2777,7 +2770,7 @@ restore_opts:
sbi->s_commit_interval = old_opts.s_commit_interval;
#ifdef CONFIG_QUOTA
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
- for (i = 0; i < MAXQUOTAS; i++) {
+ for (i = 0; i < EXT3_MAXQUOTAS; i++) {
kfree(sbi->s_qf_names[i]);
sbi->s_qf_names[i] = old_opts.s_qf_names[i];
}
@@ -2828,8 +2821,9 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
*/
overhead += ngroups * (2 + sbi->s_itb_per_group);
- /* Add the journal blocks as well */
- overhead += sbi->s_journal->j_maxlen;
+ /* Add the internal journal blocks as well */
+ if (sbi->s_journal && !sbi->journal_bdev)
+ overhead += sbi->s_journal->j_maxlen;
sbi->s_overhead_last = overhead;
smp_wmb();
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 581ef40fbe90..83a6f497c4e0 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -176,7 +176,7 @@ static unsigned int num_clusters_in_group(struct super_block *sb,
}
/* Initializes an uninitialized block bitmap */
-static void ext4_init_block_bitmap(struct super_block *sb,
+static int ext4_init_block_bitmap(struct super_block *sb,
struct buffer_head *bh,
ext4_group_t block_group,
struct ext4_group_desc *gdp)
@@ -192,7 +192,6 @@ static void ext4_init_block_bitmap(struct super_block *sb,
/* If checksum is bad mark all blocks used to prevent allocation
* essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
- ext4_error(sb, "Checksum bad for group %u", block_group);
grp = ext4_get_group_info(sb, block_group);
if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
percpu_counter_sub(&sbi->s_freeclusters_counter,
@@ -205,7 +204,7 @@ static void ext4_init_block_bitmap(struct super_block *sb,
count);
}
set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
- return;
+ return -EIO;
}
memset(bh->b_data, 0, sb->s_blocksize);
@@ -243,6 +242,7 @@ static void ext4_init_block_bitmap(struct super_block *sb,
sb->s_blocksize * 8, bh->b_data);
ext4_block_bitmap_csum_set(sb, block_group, gdp, bh);
ext4_group_desc_csum_set(sb, block_group, gdp);
+ return 0;
}
/* Return the number of free blocks in a block group. It is used when
@@ -438,11 +438,15 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
}
ext4_lock_group(sb, block_group);
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- ext4_init_block_bitmap(sb, bh, block_group, desc);
+ int err;
+
+ err = ext4_init_block_bitmap(sb, bh, block_group, desc);
set_bitmap_uptodate(bh);
set_buffer_uptodate(bh);
ext4_unlock_group(sb, block_group);
unlock_buffer(bh);
+ if (err)
+ ext4_error(sb, "Checksum bad for grp %u", block_group);
return bh;
}
ext4_unlock_group(sb, block_group);
@@ -636,8 +640,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
* Account for the allocated meta blocks. We will never
* fail EDQUOT for metdata, but we do account for it.
*/
- if (!(*errp) &&
- ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) {
+ if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) {
spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
dquot_alloc_block_nofail(inode,
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index 3285aa5a706a..b610779a958c 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -24,8 +24,7 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
__u32 provided, calculated;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(sb))
return 1;
provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
@@ -46,8 +45,7 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
__u32 csum;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(sb))
return;
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
@@ -65,8 +63,7 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
struct ext4_sb_info *sbi = EXT4_SB(sb);
int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(sb))
return 1;
provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo);
@@ -91,8 +88,7 @@ void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
__u32 csum;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(sb))
return;
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 0bb3f9ea0832..c24143ea9c08 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -151,13 +151,11 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
&file->f_ra, file,
index, 1);
file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
- bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
+ bh = ext4_bread(NULL, inode, map.m_lblk, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
}
- /*
- * We ignore I/O errors on directories so users have a chance
- * of recovering data when there's a bad sector
- */
if (!bh) {
if (!dir_has_error) {
EXT4_ERROR_FILE(file, 0,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5b19760b1de5..c55a1faaed58 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -572,15 +572,15 @@ enum {
/*
* The bit position of these flags must not overlap with any of the
- * EXT4_GET_BLOCKS_*. They are used by ext4_ext_find_extent(),
+ * EXT4_GET_BLOCKS_*. They are used by ext4_find_extent(),
* read_extent_tree_block(), ext4_split_extent_at(),
* ext4_ext_insert_extent(), and ext4_ext_create_new_leaf().
* EXT4_EX_NOCACHE is used to indicate that the we shouldn't be
* caching the extents when reading from the extent tree while a
* truncate or punch hole operation is in progress.
*/
-#define EXT4_EX_NOCACHE 0x0400
-#define EXT4_EX_FORCE_CACHE 0x0800
+#define EXT4_EX_NOCACHE 0x40000000
+#define EXT4_EX_FORCE_CACHE 0x20000000
/*
* Flags used by ext4_free_blocks
@@ -890,6 +890,7 @@ struct ext4_inode_info {
struct ext4_es_tree i_es_tree;
rwlock_t i_es_lock;
struct list_head i_es_lru;
+ unsigned int i_es_all_nr; /* protected by i_es_lock */
unsigned int i_es_lru_nr; /* protected by i_es_lock */
unsigned long i_touch_when; /* jiffies of last accessing */
@@ -1174,6 +1175,9 @@ struct ext4_super_block {
#define EXT4_MF_MNTDIR_SAMPLED 0x0001
#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
+/* Number of quota types we support */
+#define EXT4_MAXQUOTAS 2
+
/*
* fourth extended-fs super-block data in memory
*/
@@ -1237,7 +1241,7 @@ struct ext4_sb_info {
u32 s_min_batch_time;
struct block_device *journal_bdev;
#ifdef CONFIG_QUOTA
- char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
+ char *s_qf_names[EXT4_MAXQUOTAS]; /* Names of quota files with journalled quota */
int s_jquota_fmt; /* Format of quota to use */
#endif
unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
@@ -1330,8 +1334,7 @@ struct ext4_sb_info {
/* Reclaim extents from extent status tree */
struct shrinker s_es_shrinker;
struct list_head s_es_lru;
- unsigned long s_es_last_sorted;
- struct percpu_counter s_extent_cache_cnt;
+ struct ext4_es_stats s_es_stats;
struct mb_cache *s_mb_cache;
spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
@@ -1399,7 +1402,6 @@ enum {
EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
EXT4_STATE_NEWENTRY, /* File just added to dir */
- EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */
EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read
nolocking */
EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
@@ -1825,7 +1827,7 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
/*
* Special error return code only used by dx_probe() and its callers.
*/
-#define ERR_BAD_DX_DIR -75000
+#define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1))
/*
* Timeout and state flag for lazy initialization inode thread.
@@ -2086,10 +2088,8 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
/* inode.c */
-struct buffer_head *ext4_getblk(handle_t *, struct inode *,
- ext4_lblk_t, int, int *);
-struct buffer_head *ext4_bread(handle_t *, struct inode *,
- ext4_lblk_t, int, int *);
+struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
+struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
int ext4_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
@@ -2109,6 +2109,7 @@ int do_journal_get_write_access(handle_t *handle,
#define CONVERT_INLINE_DATA 2
extern struct inode *ext4_iget(struct super_block *, unsigned long);
+extern struct inode *ext4_iget_normal(struct super_block *, unsigned long);
extern int ext4_write_inode(struct inode *, struct writeback_control *);
extern int ext4_setattr(struct dentry *, struct iattr *);
extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
@@ -2332,10 +2333,18 @@ extern int ext4_register_li_request(struct super_block *sb,
static inline int ext4_has_group_desc_csum(struct super_block *sb)
{
return EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_GDT_CSUM |
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM);
+ EXT4_FEATURE_RO_COMPAT_GDT_CSUM) ||
+ (EXT4_SB(sb)->s_chksum_driver != NULL);
}
+static inline int ext4_has_metadata_csum(struct super_block *sb)
+{
+ WARN_ON_ONCE(EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
+ !EXT4_SB(sb)->s_chksum_driver);
+
+ return (EXT4_SB(sb)->s_chksum_driver != NULL);
+}
static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
{
return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
@@ -2454,6 +2463,22 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
up_write(&EXT4_I(inode)->i_data_sem);
}
+/* Update i_size, i_disksize. Requires i_mutex to avoid races with truncate */
+static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
+{
+ int changed = 0;
+
+ if (newsize > inode->i_size) {
+ i_size_write(inode, newsize);
+ changed = 1;
+ }
+ if (newsize > EXT4_I(inode)->i_disksize) {
+ ext4_update_i_disksize(inode, newsize);
+ changed |= 2;
+ }
+ return changed;
+}
+
struct ext4_group_info {
unsigned long bb_state;
struct rb_root bb_free_root;
@@ -2715,21 +2740,26 @@ extern int ext4_can_extents_be_merged(struct inode *inode,
struct ext4_extent *ex1,
struct ext4_extent *ex2);
extern int ext4_ext_insert_extent(handle_t *, struct inode *,
- struct ext4_ext_path *,
+ struct ext4_ext_path **,
struct ext4_extent *, int);
-extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
- struct ext4_ext_path *,
- int flags);
+extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t,
+ struct ext4_ext_path **,
+ int flags);
extern void ext4_ext_drop_refs(struct ext4_ext_path *);
extern int ext4_ext_check_inode(struct inode *inode);
extern int ext4_find_delalloc_range(struct inode *inode,
ext4_lblk_t lblk_start,
ext4_lblk_t lblk_end);
extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
+extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
extern int ext4_ext_precache(struct inode *inode);
extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
+extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
+ struct inode *inode2, ext4_lblk_t lblk1,
+ ext4_lblk_t lblk2, ext4_lblk_t count,
+ int mark_unwritten,int *err);
/* move_extent.c */
extern void ext4_double_down_write_data_sem(struct inode *first,
@@ -2739,8 +2769,6 @@ extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
__u64 start_orig, __u64 start_donor,
__u64 len, __u64 *moved_len);
-extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
- struct ext4_extent **extent);
/* page-io.c */
extern int __init ext4_init_pageio(void);
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index a867f5ca9991..3c9381547094 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -123,6 +123,7 @@ find_ext4_extent_tail(struct ext4_extent_header *eh)
struct ext4_ext_path {
ext4_fsblk_t p_block;
__u16 p_depth;
+ __u16 p_maxdepth;
struct ext4_extent *p_ext;
struct ext4_extent_idx *p_idx;
struct ext4_extent_header *p_hdr;
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 0074e0d23d6e..3445035c7e01 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -256,8 +256,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
set_buffer_prio(bh);
if (ext4_handle_valid(handle)) {
err = jbd2_journal_dirty_metadata(handle, bh);
- /* Errors can only happen if there is a bug */
- if (WARN_ON_ONCE(err)) {
+ /* Errors can only happen due to aborted journal or a nasty bug */
+ if (!is_handle_aborted(handle) && WARN_ON_ONCE(err)) {
ext4_journal_abort_handle(where, line, __func__, bh,
handle, err);
if (inode == NULL) {
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 17c00ff202f2..9c5b49fb281e 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -102,9 +102,9 @@
#define EXT4_QUOTA_INIT_BLOCKS(sb) 0
#define EXT4_QUOTA_DEL_BLOCKS(sb) 0
#endif
-#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
-#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
-#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
static inline int ext4_jbd2_credits_xattr(struct inode *inode)
{
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 76c2df382b7d..0b16fb4c06d3 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -73,8 +73,7 @@ static int ext4_extent_block_csum_verify(struct inode *inode,
{
struct ext4_extent_tail *et;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(inode->i_sb))
return 1;
et = find_ext4_extent_tail(eh);
@@ -88,8 +87,7 @@ static void ext4_extent_block_csum_set(struct inode *inode,
{
struct ext4_extent_tail *et;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(inode->i_sb))
return;
et = find_ext4_extent_tail(eh);
@@ -98,14 +96,14 @@ static void ext4_extent_block_csum_set(struct inode *inode,
static int ext4_split_extent(handle_t *handle,
struct inode *inode,
- struct ext4_ext_path *path,
+ struct ext4_ext_path **ppath,
struct ext4_map_blocks *map,
int split_flag,
int flags);
static int ext4_split_extent_at(handle_t *handle,
struct inode *inode,
- struct ext4_ext_path *path,
+ struct ext4_ext_path **ppath,
ext4_lblk_t split,
int split_flag,
int flags);
@@ -291,6 +289,20 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
return size;
}
+static inline int
+ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path **ppath, ext4_lblk_t lblk,
+ int nofail)
+{
+ struct ext4_ext_path *path = *ppath;
+ int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
+
+ return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
+ EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
+ EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO |
+ (nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0));
+}
+
/*
* Calculate the number of metadata blocks needed
* to allocate @blocks
@@ -695,9 +707,11 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
void ext4_ext_drop_refs(struct ext4_ext_path *path)
{
- int depth = path->p_depth;
- int i;
+ int depth, i;
+ if (!path)
+ return;
+ depth = path->p_depth;
for (i = 0; i <= depth; i++, path++)
if (path->p_bh) {
brelse(path->p_bh);
@@ -841,24 +855,32 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
}
struct ext4_ext_path *
-ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
- struct ext4_ext_path *path, int flags)
+ext4_find_extent(struct inode *inode, ext4_lblk_t block,
+ struct ext4_ext_path **orig_path, int flags)
{
struct ext4_extent_header *eh;
struct buffer_head *bh;
- short int depth, i, ppos = 0, alloc = 0;
+ struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
+ short int depth, i, ppos = 0;
int ret;
eh = ext_inode_hdr(inode);
depth = ext_depth(inode);
- /* account possible depth increase */
+ if (path) {
+ ext4_ext_drop_refs(path);
+ if (depth > path[0].p_maxdepth) {
+ kfree(path);
+ *orig_path = path = NULL;
+ }
+ }
if (!path) {
+ /* account possible depth increase */
path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
GFP_NOFS);
- if (!path)
+ if (unlikely(!path))
return ERR_PTR(-ENOMEM);
- alloc = 1;
+ path[0].p_maxdepth = depth + 1;
}
path[0].p_hdr = eh;
path[0].p_bh = NULL;
@@ -876,7 +898,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
flags);
- if (IS_ERR(bh)) {
+ if (unlikely(IS_ERR(bh))) {
ret = PTR_ERR(bh);
goto err;
}
@@ -910,8 +932,9 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
err:
ext4_ext_drop_refs(path);
- if (alloc)
- kfree(path);
+ kfree(path);
+ if (orig_path)
+ *orig_path = NULL;
return ERR_PTR(ret);
}
@@ -1238,16 +1261,24 @@ cleanup:
* just created block
*/
static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
- unsigned int flags,
- struct ext4_extent *newext)
+ unsigned int flags)
{
struct ext4_extent_header *neh;
struct buffer_head *bh;
- ext4_fsblk_t newblock;
+ ext4_fsblk_t newblock, goal = 0;
+ struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
int err = 0;
- newblock = ext4_ext_new_meta_block(handle, inode, NULL,
- newext, &err, flags);
+ /* Try to prepend new index to old one */
+ if (ext_depth(inode))
+ goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode)));
+ if (goal > le32_to_cpu(es->s_first_data_block)) {
+ flags |= EXT4_MB_HINT_TRY_GOAL;
+ goal--;
+ } else
+ goal = ext4_inode_to_goal_block(inode);
+ newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
+ NULL, &err);
if (newblock == 0)
return err;
@@ -1314,9 +1345,10 @@ out:
static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
unsigned int mb_flags,
unsigned int gb_flags,
- struct ext4_ext_path *path,
+ struct ext4_ext_path **ppath,
struct ext4_extent *newext)
{
+ struct ext4_ext_path *path = *ppath;
struct ext4_ext_path *curp;
int depth, i, err = 0;
@@ -1340,23 +1372,21 @@ repeat:
goto out;
/* refill path */
- ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode,
+ path = ext4_find_extent(inode,
(ext4_lblk_t)le32_to_cpu(newext->ee_block),
- path, gb_flags);
+ ppath, gb_flags);
if (IS_ERR(path))
err = PTR_ERR(path);
} else {
/* tree is full, time to grow in depth */
- err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext);
+ err = ext4_ext_grow_indepth(handle, inode, mb_flags);
if (err)
goto out;
/* refill path */
- ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode,
+ path = ext4_find_extent(inode,
(ext4_lblk_t)le32_to_cpu(newext->ee_block),
- path, gb_flags);
+ ppath, gb_flags);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out;
@@ -1559,7 +1589,7 @@ found_extent:
* allocated block. Thus, index entries have to be consistent
* with leaves.
*/
-static ext4_lblk_t
+ext4_lblk_t
ext4_ext_next_allocated_block(struct ext4_ext_path *path)
{
int depth;
@@ -1802,6 +1832,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle,
sizeof(struct ext4_extent_idx);
s += sizeof(struct ext4_extent_header);
+ path[1].p_maxdepth = path[0].p_maxdepth;
memcpy(path[0].p_hdr, path[1].p_hdr, s);
path[0].p_depth = 0;
path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
@@ -1896,9 +1927,10 @@ out:
* creating new leaf in the no-space case.
*/
int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path,
+ struct ext4_ext_path **ppath,
struct ext4_extent *newext, int gb_flags)
{
+ struct ext4_ext_path *path = *ppath;
struct ext4_extent_header *eh;
struct ext4_extent *ex, *fex;
struct ext4_extent *nearex; /* nearest extent */
@@ -1907,6 +1939,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
ext4_lblk_t next;
int mb_flags = 0, unwritten;
+ if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+ mb_flags |= EXT4_MB_DELALLOC_RESERVED;
if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
return -EIO;
@@ -1925,7 +1959,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
/*
* Try to see whether we should rather test the extent on
* right from ex, or from the left of ex. This is because
- * ext4_ext_find_extent() can return either extent on the
+ * ext4_find_extent() can return either extent on the
* left, or on the right from the searched position. This
* will make merging more effective.
*/
@@ -2008,7 +2042,7 @@ prepend:
if (next != EXT_MAX_BLOCKS) {
ext_debug("next leaf block - %u\n", next);
BUG_ON(npath != NULL);
- npath = ext4_ext_find_extent(inode, next, NULL, 0);
+ npath = ext4_find_extent(inode, next, NULL, 0);
if (IS_ERR(npath))
return PTR_ERR(npath);
BUG_ON(npath->p_depth != path->p_depth);
@@ -2028,9 +2062,9 @@ prepend:
* We're gonna add a new leaf in the tree.
*/
if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
- mb_flags = EXT4_MB_USE_RESERVED;
+ mb_flags |= EXT4_MB_USE_RESERVED;
err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
- path, newext);
+ ppath, newext);
if (err)
goto cleanup;
depth = ext_depth(inode);
@@ -2108,10 +2142,8 @@ merge:
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
cleanup:
- if (npath) {
- ext4_ext_drop_refs(npath);
- kfree(npath);
- }
+ ext4_ext_drop_refs(npath);
+ kfree(npath);
return err;
}
@@ -2133,13 +2165,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
/* find extent for this block */
down_read(&EXT4_I(inode)->i_data_sem);
- if (path && ext_depth(inode) != depth) {
- /* depth was changed. we have to realloc path */
- kfree(path);
- path = NULL;
- }
-
- path = ext4_ext_find_extent(inode, block, path, 0);
+ path = ext4_find_extent(inode, block, &path, 0);
if (IS_ERR(path)) {
up_read(&EXT4_I(inode)->i_data_sem);
err = PTR_ERR(path);
@@ -2156,7 +2182,6 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
}
ex = path[depth].p_ext;
next = ext4_ext_next_allocated_block(path);
- ext4_ext_drop_refs(path);
flags = 0;
exists = 0;
@@ -2266,11 +2291,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
block = es.es_lblk + es.es_len;
}
- if (path) {
- ext4_ext_drop_refs(path);
- kfree(path);
- }
-
+ ext4_ext_drop_refs(path);
+ kfree(path);
return err;
}
@@ -2826,7 +2848,7 @@ again:
ext4_lblk_t ee_block;
/* find extent for this block */
- path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
+ path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
if (IS_ERR(path)) {
ext4_journal_stop(handle);
return PTR_ERR(path);
@@ -2854,24 +2876,14 @@ again:
*/
if (end >= ee_block &&
end < ee_block + ext4_ext_get_actual_len(ex) - 1) {
- int split_flag = 0;
-
- if (ext4_ext_is_unwritten(ex))
- split_flag = EXT4_EXT_MARK_UNWRIT1 |
- EXT4_EXT_MARK_UNWRIT2;
-
/*
* Split the extent in two so that 'end' is the last
* block in the first new extent. Also we should not
* fail removing space due to ENOSPC so try to use
* reserved block if that happens.
*/
- err = ext4_split_extent_at(handle, inode, path,
- end + 1, split_flag,
- EXT4_EX_NOCACHE |
- EXT4_GET_BLOCKS_PRE_IO |
- EXT4_GET_BLOCKS_METADATA_NOFAIL);
-
+ err = ext4_force_split_extent_at(handle, inode, &path,
+ end + 1, 1);
if (err < 0)
goto out;
}
@@ -2893,7 +2905,7 @@ again:
ext4_journal_stop(handle);
return -ENOMEM;
}
- path[0].p_depth = depth;
+ path[0].p_maxdepth = path[0].p_depth = depth;
path[0].p_hdr = ext_inode_hdr(inode);
i = 0;
@@ -3013,10 +3025,9 @@ again:
out:
ext4_ext_drop_refs(path);
kfree(path);
- if (err == -EAGAIN) {
- path = NULL;
+ path = NULL;
+ if (err == -EAGAIN)
goto again;
- }
ext4_journal_stop(handle);
return err;
@@ -3130,11 +3141,12 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
*/
static int ext4_split_extent_at(handle_t *handle,
struct inode *inode,
- struct ext4_ext_path *path,
+ struct ext4_ext_path **ppath,
ext4_lblk_t split,
int split_flag,
int flags)
{
+ struct ext4_ext_path *path = *ppath;
ext4_fsblk_t newblock;
ext4_lblk_t ee_block;
struct ext4_extent *ex, newex, orig_ex, zero_ex;
@@ -3205,7 +3217,7 @@ static int ext4_split_extent_at(handle_t *handle,
if (split_flag & EXT4_EXT_MARK_UNWRIT2)
ext4_ext_mark_unwritten(ex2);
- err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+ err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
if (split_flag & EXT4_EXT_DATA_VALID1) {
@@ -3271,11 +3283,12 @@ fix_extent_len:
*/
static int ext4_split_extent(handle_t *handle,
struct inode *inode,
- struct ext4_ext_path *path,
+ struct ext4_ext_path **ppath,
struct ext4_map_blocks *map,
int split_flag,
int flags)
{
+ struct ext4_ext_path *path = *ppath;
ext4_lblk_t ee_block;
struct ext4_extent *ex;
unsigned int ee_len, depth;
@@ -3298,7 +3311,7 @@ static int ext4_split_extent(handle_t *handle,
EXT4_EXT_MARK_UNWRIT2;
if (split_flag & EXT4_EXT_DATA_VALID2)
split_flag1 |= EXT4_EXT_DATA_VALID1;
- err = ext4_split_extent_at(handle, inode, path,
+ err = ext4_split_extent_at(handle, inode, ppath,
map->m_lblk + map->m_len, split_flag1, flags1);
if (err)
goto out;
@@ -3309,8 +3322,7 @@ static int ext4_split_extent(handle_t *handle,
* Update path is required because previous ext4_split_extent_at() may
* result in split of original leaf or extent zeroout.
*/
- ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
+ path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
if (IS_ERR(path))
return PTR_ERR(path);
depth = ext_depth(inode);
@@ -3330,7 +3342,7 @@ static int ext4_split_extent(handle_t *handle,
split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
EXT4_EXT_MARK_UNWRIT2);
}
- err = ext4_split_extent_at(handle, inode, path,
+ err = ext4_split_extent_at(handle, inode, ppath,
map->m_lblk, split_flag1, flags);
if (err)
goto out;
@@ -3364,9 +3376,10 @@ out:
static int ext4_ext_convert_to_initialized(handle_t *handle,
struct inode *inode,
struct ext4_map_blocks *map,
- struct ext4_ext_path *path,
+ struct ext4_ext_path **ppath,
int flags)
{
+ struct ext4_ext_path *path = *ppath;
struct ext4_sb_info *sbi;
struct ext4_extent_header *eh;
struct ext4_map_blocks split_map;
@@ -3590,11 +3603,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
}
}
- allocated = ext4_split_extent(handle, inode, path,
- &split_map, split_flag, flags);
- if (allocated < 0)
- err = allocated;
-
+ err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
+ flags);
+ if (err > 0)
+ err = 0;
out:
/* If we have gotten a failure, don't zero out status tree */
if (!err)
@@ -3629,9 +3641,10 @@ out:
static int ext4_split_convert_extents(handle_t *handle,
struct inode *inode,
struct ext4_map_blocks *map,
- struct ext4_ext_path *path,
+ struct ext4_ext_path **ppath,
int flags)
{
+ struct ext4_ext_path *path = *ppath;
ext4_lblk_t eof_block;
ext4_lblk_t ee_block;
struct ext4_extent *ex;
@@ -3665,74 +3678,15 @@ static int ext4_split_convert_extents(handle_t *handle,
split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
}
flags |= EXT4_GET_BLOCKS_PRE_IO;
- return ext4_split_extent(handle, inode, path, map, split_flag, flags);
-}
-
-static int ext4_convert_initialized_extents(handle_t *handle,
- struct inode *inode,
- struct ext4_map_blocks *map,
- struct ext4_ext_path *path)
-{
- struct ext4_extent *ex;
- ext4_lblk_t ee_block;
- unsigned int ee_len;
- int depth;
- int err = 0;
-
- depth = ext_depth(inode);
- ex = path[depth].p_ext;
- ee_block = le32_to_cpu(ex->ee_block);
- ee_len = ext4_ext_get_actual_len(ex);
-
- ext_debug("%s: inode %lu, logical"
- "block %llu, max_blocks %u\n", __func__, inode->i_ino,
- (unsigned long long)ee_block, ee_len);
-
- if (ee_block != map->m_lblk || ee_len > map->m_len) {
- err = ext4_split_convert_extents(handle, inode, map, path,
- EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
- if (err < 0)
- goto out;
- ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
- if (IS_ERR(path)) {
- err = PTR_ERR(path);
- goto out;
- }
- depth = ext_depth(inode);
- ex = path[depth].p_ext;
- if (!ex) {
- EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
- (unsigned long) map->m_lblk);
- err = -EIO;
- goto out;
- }
- }
-
- err = ext4_ext_get_access(handle, inode, path + depth);
- if (err)
- goto out;
- /* first mark the extent as unwritten */
- ext4_ext_mark_unwritten(ex);
-
- /* note: ext4_ext_correct_indexes() isn't needed here because
- * borders are not changed
- */
- ext4_ext_try_to_merge(handle, inode, path, ex);
-
- /* Mark modified extent as dirty */
- err = ext4_ext_dirty(handle, inode, path + path->p_depth);
-out:
- ext4_ext_show_leaf(inode, path);
- return err;
+ return ext4_split_extent(handle, inode, ppath, map, split_flag, flags);
}
-
static int ext4_convert_unwritten_extents_endio(handle_t *handle,
struct inode *inode,
struct ext4_map_blocks *map,
- struct ext4_ext_path *path)
+ struct ext4_ext_path **ppath)
{
+ struct ext4_ext_path *path = *ppath;
struct ext4_extent *ex;
ext4_lblk_t ee_block;
unsigned int ee_len;
@@ -3761,16 +3715,13 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
inode->i_ino, (unsigned long long)ee_block, ee_len,
(unsigned long long)map->m_lblk, map->m_len);
#endif
- err = ext4_split_convert_extents(handle, inode, map, path,
+ err = ext4_split_convert_extents(handle, inode, map, ppath,
EXT4_GET_BLOCKS_CONVERT);
if (err < 0)
- goto out;
- ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
- if (IS_ERR(path)) {
- err = PTR_ERR(path);
- goto out;
- }
+ return err;
+ path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
depth = ext_depth(inode);
ex = path[depth].p_ext;
}
@@ -3963,12 +3914,16 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
}
static int
-ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode,
- struct ext4_map_blocks *map,
- struct ext4_ext_path *path, int flags,
- unsigned int allocated, ext4_fsblk_t newblock)
+convert_initialized_extent(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map,
+ struct ext4_ext_path **ppath, int flags,
+ unsigned int allocated, ext4_fsblk_t newblock)
{
- int ret = 0;
+ struct ext4_ext_path *path = *ppath;
+ struct ext4_extent *ex;
+ ext4_lblk_t ee_block;
+ unsigned int ee_len;
+ int depth;
int err = 0;
/*
@@ -3978,28 +3933,67 @@ ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode,
if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;
- ret = ext4_convert_initialized_extents(handle, inode, map,
- path);
- if (ret >= 0) {
- ext4_update_inode_fsync_trans(handle, inode, 1);
- err = check_eofblocks_fl(handle, inode, map->m_lblk,
- path, map->m_len);
- } else
- err = ret;
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+
+ ext_debug("%s: inode %lu, logical"
+ "block %llu, max_blocks %u\n", __func__, inode->i_ino,
+ (unsigned long long)ee_block, ee_len);
+
+ if (ee_block != map->m_lblk || ee_len > map->m_len) {
+ err = ext4_split_convert_extents(handle, inode, map, ppath,
+ EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
+ if (err < 0)
+ return err;
+ path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ if (!ex) {
+ EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+ (unsigned long) map->m_lblk);
+ return -EIO;
+ }
+ }
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ return err;
+ /* first mark the extent as unwritten */
+ ext4_ext_mark_unwritten(ex);
+
+ /* note: ext4_ext_correct_indexes() isn't needed here because
+ * borders are not changed
+ */
+ ext4_ext_try_to_merge(handle, inode, path, ex);
+
+ /* Mark modified extent as dirty */
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+ if (err)
+ return err;
+ ext4_ext_show_leaf(inode, path);
+
+ ext4_update_inode_fsync_trans(handle, inode, 1);
+ err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len);
+ if (err)
+ return err;
map->m_flags |= EXT4_MAP_UNWRITTEN;
if (allocated > map->m_len)
allocated = map->m_len;
map->m_len = allocated;
-
- return err ? err : allocated;
+ return allocated;
}
static int
ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
- struct ext4_ext_path *path, int flags,
+ struct ext4_ext_path **ppath, int flags,
unsigned int allocated, ext4_fsblk_t newblock)
{
+ struct ext4_ext_path *path = *ppath;
int ret = 0;
int err = 0;
ext4_io_end_t *io = ext4_inode_aio(inode);
@@ -4021,8 +4015,8 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
/* get_block() before submit the IO, split the extent */
if (flags & EXT4_GET_BLOCKS_PRE_IO) {
- ret = ext4_split_convert_extents(handle, inode, map,
- path, flags | EXT4_GET_BLOCKS_CONVERT);
+ ret = ext4_split_convert_extents(handle, inode, map, ppath,
+ flags | EXT4_GET_BLOCKS_CONVERT);
if (ret <= 0)
goto out;
/*
@@ -4040,7 +4034,7 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
/* IO end_io complete, convert the filled extent to written */
if (flags & EXT4_GET_BLOCKS_CONVERT) {
ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
- path);
+ ppath);
if (ret >= 0) {
ext4_update_inode_fsync_trans(handle, inode, 1);
err = check_eofblocks_fl(handle, inode, map->m_lblk,
@@ -4078,7 +4072,7 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
}
/* buffered write, writepage time, convert*/
- ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags);
+ ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags);
if (ret >= 0)
ext4_update_inode_fsync_trans(handle, inode, 1);
out:
@@ -4279,7 +4273,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
/* find extent for this block */
- path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0);
+ path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
if (IS_ERR(path)) {
err = PTR_ERR(path);
path = NULL;
@@ -4291,7 +4285,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
/*
* consistent leaf must not be empty;
* this situation is possible, though, _during_ tree modification;
- * this is why assert can't be put in ext4_ext_find_extent()
+ * this is why assert can't be put in ext4_find_extent()
*/
if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
EXT4_ERROR_INODE(inode, "bad extent address "
@@ -4331,15 +4325,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
*/
if ((!ext4_ext_is_unwritten(ex)) &&
(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
- allocated = ext4_ext_convert_initialized_extent(
- handle, inode, map, path, flags,
- allocated, newblock);
+ allocated = convert_initialized_extent(
+ handle, inode, map, &path,
+ flags, allocated, newblock);
goto out2;
} else if (!ext4_ext_is_unwritten(ex))
goto out;
ret = ext4_ext_handle_unwritten_extents(
- handle, inode, map, path, flags,
+ handle, inode, map, &path, flags,
allocated, newblock);
if (ret < 0)
err = ret;
@@ -4376,7 +4370,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
/*
* If we are doing bigalloc, check to see if the extent returned
- * by ext4_ext_find_extent() implies a cluster we can use.
+ * by ext4_find_extent() implies a cluster we can use.
*/
if (cluster_offset && ex &&
get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
@@ -4451,6 +4445,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ar.flags = 0;
if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
ar.flags |= EXT4_MB_HINT_NOPREALLOC;
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+ ar.flags |= EXT4_MB_DELALLOC_RESERVED;
newblock = ext4_mb_new_blocks(handle, &ar, &err);
if (!newblock)
goto out2;
@@ -4486,7 +4482,7 @@ got_allocated_blocks:
err = check_eofblocks_fl(handle, inode, map->m_lblk,
path, ar.len);
if (!err)
- err = ext4_ext_insert_extent(handle, inode, path,
+ err = ext4_ext_insert_extent(handle, inode, &path,
&newex, flags);
if (!err && set_unwritten) {
@@ -4619,10 +4615,8 @@ out:
map->m_pblk = newblock;
map->m_len = allocated;
out2:
- if (path) {
- ext4_ext_drop_refs(path);
- kfree(path);
- }
+ ext4_ext_drop_refs(path);
+ kfree(path);
trace_ext4_ext_map_blocks_exit(inode, flags, map,
err ? err : allocated);
@@ -4665,7 +4659,8 @@ retry:
}
static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
- ext4_lblk_t len, int flags, int mode)
+ ext4_lblk_t len, loff_t new_size,
+ int flags, int mode)
{
struct inode *inode = file_inode(file);
handle_t *handle;
@@ -4674,8 +4669,10 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
int retries = 0;
struct ext4_map_blocks map;
unsigned int credits;
+ loff_t epos;
map.m_lblk = offset;
+ map.m_len = len;
/*
* Don't normalize the request if it can fit in one extent so
* that it doesn't get unnecessarily split into multiple
@@ -4690,9 +4687,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
credits = ext4_chunk_trans_blocks(inode, len);
retry:
- while (ret >= 0 && ret < len) {
- map.m_lblk = map.m_lblk + ret;
- map.m_len = len = len - ret;
+ while (ret >= 0 && len) {
handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
credits);
if (IS_ERR(handle)) {
@@ -4709,6 +4704,21 @@ retry:
ret2 = ext4_journal_stop(handle);
break;
}
+ map.m_lblk += ret;
+ map.m_len = len = len - ret;
+ epos = (loff_t)map.m_lblk << inode->i_blkbits;
+ inode->i_ctime = ext4_current_time(inode);
+ if (new_size) {
+ if (epos > new_size)
+ epos = new_size;
+ if (ext4_update_inode_size(inode, epos) & 0x1)
+ inode->i_mtime = inode->i_ctime;
+ } else {
+ if (epos > inode->i_size)
+ ext4_set_inode_flag(inode,
+ EXT4_INODE_EOFBLOCKS);
+ }
+ ext4_mark_inode_dirty(handle, inode);
ret2 = ext4_journal_stop(handle);
if (ret2)
break;
@@ -4731,7 +4741,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
loff_t new_size = 0;
int ret = 0;
int flags;
- int partial;
+ int credits;
+ int partial_begin, partial_end;
loff_t start, end;
ext4_lblk_t lblk;
struct address_space *mapping = inode->i_mapping;
@@ -4771,7 +4782,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (start < offset || end > offset + len)
return -EINVAL;
- partial = (offset + len) & ((1 << blkbits) - 1);
+ partial_begin = offset & ((1 << blkbits) - 1);
+ partial_end = (offset + len) & ((1 << blkbits) - 1);
lblk = start >> blkbits;
max_blocks = (end >> blkbits);
@@ -4781,7 +4793,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
max_blocks -= lblk;
flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |
- EXT4_GET_BLOCKS_CONVERT_UNWRITTEN;
+ EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
+ EXT4_EX_NOCACHE;
if (mode & FALLOC_FL_KEEP_SIZE)
flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
@@ -4805,7 +4818,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
* If we have a partial block after EOF we have to allocate
* the entire block.
*/
- if (partial)
+ if (partial_end)
max_blocks += 1;
}
@@ -4813,25 +4826,41 @@ static long ext4_zero_range(struct file *file, loff_t offset,
/* Now release the pages and zero block aligned part of pages*/
truncate_pagecache_range(inode, start, end - 1);
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
/* Wait all existing dio workers, newcomers will block on i_mutex */
ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
+ ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
+ flags, mode);
+ if (ret)
+ goto out_dio;
/*
* Remove entire range from the extent status tree.
+ *
+ * ext4_es_remove_extent(inode, lblk, max_blocks) is
+ * NOT sufficient. I'm not sure why this is the case,
+ * but let's be conservative and remove the extent
+ * status tree for the entire inode. There should be
+ * no outstanding delalloc extents thanks to the
+ * filemap_write_and_wait_range() call above.
*/
- ret = ext4_es_remove_extent(inode, lblk, max_blocks);
- if (ret)
- goto out_dio;
-
- ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags,
- mode);
+ ret = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
if (ret)
goto out_dio;
}
+ if (!partial_begin && !partial_end)
+ goto out_dio;
- handle = ext4_journal_start(inode, EXT4_HT_MISC, 4);
+ /*
+ * In worst case we have to writeout two nonadjacent unwritten
+ * blocks and update the inode
+ */
+ credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
+ if (ext4_should_journal_data(inode))
+ credits += 2;
+ handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_std_error(inode->i_sb, ret);
@@ -4839,12 +4868,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
}
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
-
if (new_size) {
- if (new_size > i_size_read(inode))
- i_size_write(inode, new_size);
- if (new_size > EXT4_I(inode)->i_disksize)
- ext4_update_i_disksize(inode, new_size);
+ ext4_update_inode_size(inode, new_size);
} else {
/*
* Mark that we allocate beyond EOF so the subsequent truncate
@@ -4853,7 +4878,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if ((offset + len) > i_size_read(inode))
ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
}
-
ext4_mark_inode_dirty(handle, inode);
/* Zero out partial block at the edges of the range */
@@ -4880,13 +4904,11 @@ out_mutex:
long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
- handle_t *handle;
loff_t new_size = 0;
unsigned int max_blocks;
int ret = 0;
int flags;
ext4_lblk_t lblk;
- struct timespec tv;
unsigned int blkbits = inode->i_blkbits;
/* Return error if mode is not supported */
@@ -4937,36 +4959,15 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
goto out;
}
- ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode);
+ ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
+ flags, mode);
if (ret)
goto out;
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle))
- goto out;
-
- tv = inode->i_ctime = ext4_current_time(inode);
-
- if (new_size) {
- if (new_size > i_size_read(inode)) {
- i_size_write(inode, new_size);
- inode->i_mtime = tv;
- }
- if (new_size > EXT4_I(inode)->i_disksize)
- ext4_update_i_disksize(inode, new_size);
- } else {
- /*
- * Mark that we allocate beyond EOF so the subsequent truncate
- * can proceed even if the new size is the same as i_size.
- */
- if ((offset + len) > i_size_read(inode))
- ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+ if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
+ ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
+ EXT4_I(inode)->i_sync_tid);
}
- ext4_mark_inode_dirty(handle, inode);
- if (file->f_flags & O_SYNC)
- ext4_handle_sync(handle);
-
- ext4_journal_stop(handle);
out:
mutex_unlock(&inode->i_mutex);
trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
@@ -5304,36 +5305,31 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
struct ext4_ext_path *path;
int ret = 0, depth;
struct ext4_extent *extent;
- ext4_lblk_t stop_block, current_block;
+ ext4_lblk_t stop_block;
ext4_lblk_t ex_start, ex_end;
/* Let path point to the last extent */
- path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
+ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
if (IS_ERR(path))
return PTR_ERR(path);
depth = path->p_depth;
extent = path[depth].p_ext;
- if (!extent) {
- ext4_ext_drop_refs(path);
- kfree(path);
- return ret;
- }
+ if (!extent)
+ goto out;
stop_block = le32_to_cpu(extent->ee_block) +
ext4_ext_get_actual_len(extent);
- ext4_ext_drop_refs(path);
- kfree(path);
/* Nothing to shift, if hole is at the end of file */
if (start >= stop_block)
- return ret;
+ goto out;
/*
* Don't start shifting extents until we make sure the hole is big
* enough to accomodate the shift.
*/
- path = ext4_ext_find_extent(inode, start - 1, NULL, 0);
+ path = ext4_find_extent(inode, start - 1, &path, 0);
if (IS_ERR(path))
return PTR_ERR(path);
depth = path->p_depth;
@@ -5346,8 +5342,6 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
ex_start = 0;
ex_end = 0;
}
- ext4_ext_drop_refs(path);
- kfree(path);
if ((start == ex_start && shift > ex_start) ||
(shift > start - ex_end))
@@ -5355,7 +5349,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
/* Its safe to start updating extents */
while (start < stop_block) {
- path = ext4_ext_find_extent(inode, start, NULL, 0);
+ path = ext4_find_extent(inode, start, &path, 0);
if (IS_ERR(path))
return PTR_ERR(path);
depth = path->p_depth;
@@ -5365,27 +5359,23 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
(unsigned long) start);
return -EIO;
}
-
- current_block = le32_to_cpu(extent->ee_block);
- if (start > current_block) {
+ if (start > le32_to_cpu(extent->ee_block)) {
/* Hole, move to the next extent */
- ret = mext_next_extent(inode, path, &extent);
- if (ret != 0) {
- ext4_ext_drop_refs(path);
- kfree(path);
- if (ret == 1)
- ret = 0;
- break;
+ if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
+ path[depth].p_ext++;
+ } else {
+ start = ext4_ext_next_allocated_block(path);
+ continue;
}
}
ret = ext4_ext_shift_path_extents(path, shift, inode,
handle, &start);
- ext4_ext_drop_refs(path);
- kfree(path);
if (ret)
break;
}
-
+out:
+ ext4_ext_drop_refs(path);
+ kfree(path);
return ret;
}
@@ -5508,3 +5498,199 @@ out_mutex:
mutex_unlock(&inode->i_mutex);
return ret;
}
+
+/**
+ * ext4_swap_extents - Swap extents between two inodes
+ *
+ * @inode1: First inode
+ * @inode2: Second inode
+ * @lblk1: Start block for first inode
+ * @lblk2: Start block for second inode
+ * @count: Number of blocks to swap
+ * @mark_unwritten: Mark second inode's extents as unwritten after swap
+ * @erp: Pointer to save error value
+ *
+ * This helper routine does exactly what is promise "swap extents". All other
+ * stuff such as page-cache locking consistency, bh mapping consistency or
+ * extent's data copying must be performed by caller.
+ * Locking:
+ * i_mutex is held for both inodes
+ * i_data_sem is locked for write for both inodes
+ * Assumptions:
+ * All pages from requested range are locked for both inodes
+ */
+int
+ext4_swap_extents(handle_t *handle, struct inode *inode1,
+ struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
+ ext4_lblk_t count, int unwritten, int *erp)
+{
+ struct ext4_ext_path *path1 = NULL;
+ struct ext4_ext_path *path2 = NULL;
+ int replaced_count = 0;
+
+ BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
+ BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
+ BUG_ON(!mutex_is_locked(&inode1->i_mutex));
+ BUG_ON(!mutex_is_locked(&inode1->i_mutex));
+
+ *erp = ext4_es_remove_extent(inode1, lblk1, count);
+ if (unlikely(*erp))
+ return 0;
+ *erp = ext4_es_remove_extent(inode2, lblk2, count);
+ if (unlikely(*erp))
+ return 0;
+
+ while (count) {
+ struct ext4_extent *ex1, *ex2, tmp_ex;
+ ext4_lblk_t e1_blk, e2_blk;
+ int e1_len, e2_len, len;
+ int split = 0;
+
+ path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
+ if (unlikely(IS_ERR(path1))) {
+ *erp = PTR_ERR(path1);
+ path1 = NULL;
+ finish:
+ count = 0;
+ goto repeat;
+ }
+ path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
+ if (unlikely(IS_ERR(path2))) {
+ *erp = PTR_ERR(path2);
+ path2 = NULL;
+ goto finish;
+ }
+ ex1 = path1[path1->p_depth].p_ext;
+ ex2 = path2[path2->p_depth].p_ext;
+ /* Do we have somthing to swap ? */
+ if (unlikely(!ex2 || !ex1))
+ goto finish;
+
+ e1_blk = le32_to_cpu(ex1->ee_block);
+ e2_blk = le32_to_cpu(ex2->ee_block);
+ e1_len = ext4_ext_get_actual_len(ex1);
+ e2_len = ext4_ext_get_actual_len(ex2);
+
+ /* Hole handling */
+ if (!in_range(lblk1, e1_blk, e1_len) ||
+ !in_range(lblk2, e2_blk, e2_len)) {
+ ext4_lblk_t next1, next2;
+
+ /* if hole after extent, then go to next extent */
+ next1 = ext4_ext_next_allocated_block(path1);
+ next2 = ext4_ext_next_allocated_block(path2);
+ /* If hole before extent, then shift to that extent */
+ if (e1_blk > lblk1)
+ next1 = e1_blk;
+ if (e2_blk > lblk2)
+ next2 = e1_blk;
+ /* Do we have something to swap */
+ if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
+ goto finish;
+ /* Move to the rightest boundary */
+ len = next1 - lblk1;
+ if (len < next2 - lblk2)
+ len = next2 - lblk2;
+ if (len > count)
+ len = count;
+ lblk1 += len;
+ lblk2 += len;
+ count -= len;
+ goto repeat;
+ }
+
+ /* Prepare left boundary */
+ if (e1_blk < lblk1) {
+ split = 1;
+ *erp = ext4_force_split_extent_at(handle, inode1,
+ &path1, lblk1, 0);
+ if (unlikely(*erp))
+ goto finish;
+ }
+ if (e2_blk < lblk2) {
+ split = 1;
+ *erp = ext4_force_split_extent_at(handle, inode2,
+ &path2, lblk2, 0);
+ if (unlikely(*erp))
+ goto finish;
+ }
+ /* ext4_split_extent_at() may result in leaf extent split,
+ * path must to be revalidated. */
+ if (split)
+ goto repeat;
+
+ /* Prepare right boundary */
+ len = count;
+ if (len > e1_blk + e1_len - lblk1)
+ len = e1_blk + e1_len - lblk1;
+ if (len > e2_blk + e2_len - lblk2)
+ len = e2_blk + e2_len - lblk2;
+
+ if (len != e1_len) {
+ split = 1;
+ *erp = ext4_force_split_extent_at(handle, inode1,
+ &path1, lblk1 + len, 0);
+ if (unlikely(*erp))
+ goto finish;
+ }
+ if (len != e2_len) {
+ split = 1;
+ *erp = ext4_force_split_extent_at(handle, inode2,
+ &path2, lblk2 + len, 0);
+ if (*erp)
+ goto finish;
+ }
+ /* ext4_split_extent_at() may result in leaf extent split,
+ * path must to be revalidated. */
+ if (split)
+ goto repeat;
+
+ BUG_ON(e2_len != e1_len);
+ *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
+ if (unlikely(*erp))
+ goto finish;
+ *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
+ if (unlikely(*erp))
+ goto finish;
+
+ /* Both extents are fully inside boundaries. Swap it now */
+ tmp_ex = *ex1;
+ ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
+ ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
+ ex1->ee_len = cpu_to_le16(e2_len);
+ ex2->ee_len = cpu_to_le16(e1_len);
+ if (unwritten)
+ ext4_ext_mark_unwritten(ex2);
+ if (ext4_ext_is_unwritten(&tmp_ex))
+ ext4_ext_mark_unwritten(ex1);
+
+ ext4_ext_try_to_merge(handle, inode2, path2, ex2);
+ ext4_ext_try_to_merge(handle, inode1, path1, ex1);
+ *erp = ext4_ext_dirty(handle, inode2, path2 +
+ path2->p_depth);
+ if (unlikely(*erp))
+ goto finish;
+ *erp = ext4_ext_dirty(handle, inode1, path1 +
+ path1->p_depth);
+ /*
+ * Looks scarry ah..? second inode already points to new blocks,
+ * and it was successfully dirtied. But luckily error may happen
+ * only due to journal error, so full transaction will be
+ * aborted anyway.
+ */
+ if (unlikely(*erp))
+ goto finish;
+ lblk1 += len;
+ lblk2 += len;
+ replaced_count += len;
+ count -= len;
+
+ repeat:
+ ext4_ext_drop_refs(path1);
+ kfree(path1);
+ ext4_ext_drop_refs(path2);
+ kfree(path2);
+ path1 = path2 = NULL;
+ }
+ return replaced_count;
+}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 0b7e28e7eaa4..94e7855ae71b 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -11,6 +11,8 @@
*/
#include <linux/rbtree.h>
#include <linux/list_sort.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
#include "ext4.h"
#include "extents_status.h"
@@ -313,19 +315,27 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
*/
if (!ext4_es_is_delayed(es)) {
EXT4_I(inode)->i_es_lru_nr++;
- percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
+ percpu_counter_inc(&EXT4_SB(inode->i_sb)->
+ s_es_stats.es_stats_lru_cnt);
}
+ EXT4_I(inode)->i_es_all_nr++;
+ percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
+
return es;
}
static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
{
+ EXT4_I(inode)->i_es_all_nr--;
+ percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
+
/* Decrease the lru counter when this es is not delayed */
if (!ext4_es_is_delayed(es)) {
BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
EXT4_I(inode)->i_es_lru_nr--;
- percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
+ percpu_counter_dec(&EXT4_SB(inode->i_sb)->
+ s_es_stats.es_stats_lru_cnt);
}
kmem_cache_free(ext4_es_cachep, es);
@@ -426,7 +436,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
unsigned short ee_len;
int depth, ee_status, es_status;
- path = ext4_ext_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
+ path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
if (IS_ERR(path))
return;
@@ -499,10 +509,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
}
}
out:
- if (path) {
- ext4_ext_drop_refs(path);
- kfree(path);
- }
+ ext4_ext_drop_refs(path);
+ kfree(path);
}
static void ext4_es_insert_extent_ind_check(struct inode *inode,
@@ -731,6 +739,7 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status *es)
{
struct ext4_es_tree *tree;
+ struct ext4_es_stats *stats;
struct extent_status *es1 = NULL;
struct rb_node *node;
int found = 0;
@@ -767,11 +776,15 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
}
out:
+ stats = &EXT4_SB(inode->i_sb)->s_es_stats;
if (found) {
BUG_ON(!es1);
es->es_lblk = es1->es_lblk;
es->es_len = es1->es_len;
es->es_pblk = es1->es_pblk;
+ stats->es_stats_cache_hits++;
+ } else {
+ stats->es_stats_cache_misses++;
}
read_unlock(&EXT4_I(inode)->i_es_lock);
@@ -933,11 +946,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
struct ext4_inode_info *locked_ei)
{
struct ext4_inode_info *ei;
+ struct ext4_es_stats *es_stats;
struct list_head *cur, *tmp;
LIST_HEAD(skipped);
+ ktime_t start_time;
+ u64 scan_time;
int nr_shrunk = 0;
int retried = 0, skip_precached = 1, nr_skipped = 0;
+ es_stats = &sbi->s_es_stats;
+ start_time = ktime_get();
spin_lock(&sbi->s_es_lru_lock);
retry:
@@ -948,7 +966,8 @@ retry:
* If we have already reclaimed all extents from extent
* status tree, just stop the loop immediately.
*/
- if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
+ if (percpu_counter_read_positive(
+ &es_stats->es_stats_lru_cnt) == 0)
break;
ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
@@ -958,7 +977,7 @@ retry:
* time. Normally we try hard to avoid shrinking
* precached inodes, but we will as a last resort.
*/
- if ((sbi->s_es_last_sorted < ei->i_touch_when) ||
+ if ((es_stats->es_stats_last_sorted < ei->i_touch_when) ||
(skip_precached && ext4_test_inode_state(&ei->vfs_inode,
EXT4_STATE_EXT_PRECACHED))) {
nr_skipped++;
@@ -992,7 +1011,7 @@ retry:
if ((nr_shrunk == 0) && nr_skipped && !retried) {
retried++;
list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
- sbi->s_es_last_sorted = jiffies;
+ es_stats->es_stats_last_sorted = jiffies;
ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
i_es_lru);
/*
@@ -1010,6 +1029,22 @@ retry:
if (locked_ei && nr_shrunk == 0)
nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
+ scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+ if (likely(es_stats->es_stats_scan_time))
+ es_stats->es_stats_scan_time = (scan_time +
+ es_stats->es_stats_scan_time*3) / 4;
+ else
+ es_stats->es_stats_scan_time = scan_time;
+ if (scan_time > es_stats->es_stats_max_scan_time)
+ es_stats->es_stats_max_scan_time = scan_time;
+ if (likely(es_stats->es_stats_shrunk))
+ es_stats->es_stats_shrunk = (nr_shrunk +
+ es_stats->es_stats_shrunk*3) / 4;
+ else
+ es_stats->es_stats_shrunk = nr_shrunk;
+
+ trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, skip_precached,
+ nr_skipped, retried);
return nr_shrunk;
}
@@ -1020,8 +1055,8 @@ static unsigned long ext4_es_count(struct shrinker *shrink,
struct ext4_sb_info *sbi;
sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
- nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
- trace_ext4_es_shrink_enter(sbi->s_sb, sc->nr_to_scan, nr);
+ nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
+ trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
return nr;
}
@@ -1033,31 +1068,160 @@ static unsigned long ext4_es_scan(struct shrinker *shrink,
int nr_to_scan = sc->nr_to_scan;
int ret, nr_shrunk;
- ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
- trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
+ ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
+ trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);
if (!nr_to_scan)
return ret;
nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
- trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
+ trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
return nr_shrunk;
}
-void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
+static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos)
{
+ return *pos ? NULL : SEQ_START_TOKEN;
+}
+
+static void *
+ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ return NULL;
+}
+
+static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
+{
+ struct ext4_sb_info *sbi = seq->private;
+ struct ext4_es_stats *es_stats = &sbi->s_es_stats;
+ struct ext4_inode_info *ei, *max = NULL;
+ unsigned int inode_cnt = 0;
+
+ if (v != SEQ_START_TOKEN)
+ return 0;
+
+ /* here we just find an inode that has the max nr. of objects */
+ spin_lock(&sbi->s_es_lru_lock);
+ list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) {
+ inode_cnt++;
+ if (max && max->i_es_all_nr < ei->i_es_all_nr)
+ max = ei;
+ else if (!max)
+ max = ei;
+ }
+ spin_unlock(&sbi->s_es_lru_lock);
+
+ seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n",
+ percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
+ percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt));
+ seq_printf(seq, " %lu/%lu cache hits/misses\n",
+ es_stats->es_stats_cache_hits,
+ es_stats->es_stats_cache_misses);
+ if (es_stats->es_stats_last_sorted != 0)
+ seq_printf(seq, " %u ms last sorted interval\n",
+ jiffies_to_msecs(jiffies -
+ es_stats->es_stats_last_sorted));
+ if (inode_cnt)
+ seq_printf(seq, " %d inodes on lru list\n", inode_cnt);
+
+ seq_printf(seq, "average:\n %llu us scan time\n",
+ div_u64(es_stats->es_stats_scan_time, 1000));
+ seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk);
+ if (inode_cnt)
+ seq_printf(seq,
+ "maximum:\n %lu inode (%u objects, %u reclaimable)\n"
+ " %llu us max scan time\n",
+ max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr,
+ div_u64(es_stats->es_stats_max_scan_time, 1000));
+
+ return 0;
+}
+
+static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v)
+{
+}
+
+static const struct seq_operations ext4_es_seq_shrinker_info_ops = {
+ .start = ext4_es_seq_shrinker_info_start,
+ .next = ext4_es_seq_shrinker_info_next,
+ .stop = ext4_es_seq_shrinker_info_stop,
+ .show = ext4_es_seq_shrinker_info_show,
+};
+
+static int
+ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ ret = seq_open(file, &ext4_es_seq_shrinker_info_ops);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = PDE_DATA(inode);
+ }
+
+ return ret;
+}
+
+static int
+ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file)
+{
+ return seq_release(inode, file);
+}
+
+static const struct file_operations ext4_es_seq_shrinker_info_fops = {
+ .owner = THIS_MODULE,
+ .open = ext4_es_seq_shrinker_info_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = ext4_es_seq_shrinker_info_release,
+};
+
+int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
+{
+ int err;
+
INIT_LIST_HEAD(&sbi->s_es_lru);
spin_lock_init(&sbi->s_es_lru_lock);
- sbi->s_es_last_sorted = 0;
+ sbi->s_es_stats.es_stats_last_sorted = 0;
+ sbi->s_es_stats.es_stats_shrunk = 0;
+ sbi->s_es_stats.es_stats_cache_hits = 0;
+ sbi->s_es_stats.es_stats_cache_misses = 0;
+ sbi->s_es_stats.es_stats_scan_time = 0;
+ sbi->s_es_stats.es_stats_max_scan_time = 0;
+ err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
+ if (err)
+ return err;
+ err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, 0, GFP_KERNEL);
+ if (err)
+ goto err1;
+
sbi->s_es_shrinker.scan_objects = ext4_es_scan;
sbi->s_es_shrinker.count_objects = ext4_es_count;
sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
- register_shrinker(&sbi->s_es_shrinker);
+ err = register_shrinker(&sbi->s_es_shrinker);
+ if (err)
+ goto err2;
+
+ if (sbi->s_proc)
+ proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc,
+ &ext4_es_seq_shrinker_info_fops, sbi);
+
+ return 0;
+
+err2:
+ percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
+err1:
+ percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
+ return err;
}
void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
{
+ if (sbi->s_proc)
+ remove_proc_entry("es_shrinker_info", sbi->s_proc);
+ percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
+ percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
unregister_shrinker(&sbi->s_es_shrinker);
}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index f1b62a419920..efd5f970b501 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -64,6 +64,17 @@ struct ext4_es_tree {
struct extent_status *cache_es; /* recently accessed extent */
};
+struct ext4_es_stats {
+ unsigned long es_stats_last_sorted;
+ unsigned long es_stats_shrunk;
+ unsigned long es_stats_cache_hits;
+ unsigned long es_stats_cache_misses;
+ u64 es_stats_scan_time;
+ u64 es_stats_max_scan_time;
+ struct percpu_counter es_stats_all_cnt;
+ struct percpu_counter es_stats_lru_cnt;
+};
+
extern int __init ext4_init_es(void);
extern void ext4_exit_es(void);
extern void ext4_es_init_tree(struct ext4_es_tree *tree);
@@ -138,7 +149,7 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es,
(pb & ~ES_MASK));
}
-extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
extern void ext4_es_lru_add(struct inode *inode);
extern void ext4_es_lru_del(struct inode *inode);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index aca7b24a4432..8131be8c0af3 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -137,10 +137,10 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos);
}
+ iocb->private = &overwrite;
if (o_direct) {
blk_start_plug(&plug);
- iocb->private = &overwrite;
/* check whether we do a DIO overwrite or not */
if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 5b87fc36aab8..ac644c31ca67 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -887,6 +887,10 @@ got:
struct buffer_head *block_bitmap_bh;
block_bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (!block_bitmap_bh) {
+ err = -EIO;
+ goto out;
+ }
BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
err = ext4_journal_get_write_access(handle, block_bitmap_bh);
if (err) {
@@ -1011,8 +1015,7 @@ got:
spin_unlock(&sbi->s_next_gen_lock);
/* Precompute checksum seed for inode metadata */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ if (ext4_has_metadata_csum(sb)) {
__u32 csum;
__le32 inum = cpu_to_le32(inode->i_ino);
__le32 gen = cpu_to_le32(inode->i_generation);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index e75f840000a0..36b369697a13 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -318,34 +318,24 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
* ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
* as described above and return 0.
*/
-static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock, int indirect_blks,
- int *blks, ext4_fsblk_t goal,
- ext4_lblk_t *offsets, Indirect *branch)
+static int ext4_alloc_branch(handle_t *handle,
+ struct ext4_allocation_request *ar,
+ int indirect_blks, ext4_lblk_t *offsets,
+ Indirect *branch)
{
- struct ext4_allocation_request ar;
struct buffer_head * bh;
ext4_fsblk_t b, new_blocks[4];
__le32 *p;
int i, j, err, len = 1;
- /*
- * Set up for the direct block allocation
- */
- memset(&ar, 0, sizeof(ar));
- ar.inode = inode;
- ar.len = *blks;
- ar.logical = iblock;
- if (S_ISREG(inode->i_mode))
- ar.flags = EXT4_MB_HINT_DATA;
-
for (i = 0; i <= indirect_blks; i++) {
if (i == indirect_blks) {
- ar.goal = goal;
- new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err);
+ new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err);
} else
- goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode,
- goal, 0, NULL, &err);
+ ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle,
+ ar->inode, ar->goal,
+ ar->flags & EXT4_MB_DELALLOC_RESERVED,
+ NULL, &err);
if (err) {
i--;
goto failed;
@@ -354,7 +344,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
if (i == 0)
continue;
- bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]);
+ bh = branch[i].bh = sb_getblk(ar->inode->i_sb, new_blocks[i-1]);
if (unlikely(!bh)) {
err = -ENOMEM;
goto failed;
@@ -372,7 +362,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
b = new_blocks[i];
if (i == indirect_blks)
- len = ar.len;
+ len = ar->len;
for (j = 0; j < len; j++)
*p++ = cpu_to_le32(b++);
@@ -381,11 +371,10 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
unlock_buffer(bh);
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, inode, bh);
+ err = ext4_handle_dirty_metadata(handle, ar->inode, bh);
if (err)
goto failed;
}
- *blks = ar.len;
return 0;
failed:
for (; i >= 0; i--) {
@@ -396,10 +385,10 @@ failed:
* existing before ext4_alloc_branch() was called.
*/
if (i > 0 && i != indirect_blks && branch[i].bh)
- ext4_forget(handle, 1, inode, branch[i].bh,
+ ext4_forget(handle, 1, ar->inode, branch[i].bh,
branch[i].bh->b_blocknr);
- ext4_free_blocks(handle, inode, NULL, new_blocks[i],
- (i == indirect_blks) ? ar.len : 1, 0);
+ ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i],
+ (i == indirect_blks) ? ar->len : 1, 0);
}
return err;
}
@@ -419,9 +408,9 @@ failed:
* inode (->i_blocks, etc.). In case of success we end up with the full
* chain to new block and return 0.
*/
-static int ext4_splice_branch(handle_t *handle, struct inode *inode,
- ext4_lblk_t block, Indirect *where, int num,
- int blks)
+static int ext4_splice_branch(handle_t *handle,
+ struct ext4_allocation_request *ar,
+ Indirect *where, int num)
{
int i;
int err = 0;
@@ -446,9 +435,9 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
* Update the host buffer_head or inode to point to more just allocated
* direct blocks blocks
*/
- if (num == 0 && blks > 1) {
+ if (num == 0 && ar->len > 1) {
current_block = le32_to_cpu(where->key) + 1;
- for (i = 1; i < blks; i++)
+ for (i = 1; i < ar->len; i++)
*(where->p + i) = cpu_to_le32(current_block++);
}
@@ -465,14 +454,14 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
*/
jbd_debug(5, "splicing indirect only\n");
BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, inode, where->bh);
+ err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh);
if (err)
goto err_out;
} else {
/*
* OK, we spliced it into the inode itself on a direct block.
*/
- ext4_mark_inode_dirty(handle, inode);
+ ext4_mark_inode_dirty(handle, ar->inode);
jbd_debug(5, "splicing direct\n");
}
return err;
@@ -484,11 +473,11 @@ err_out:
* need to revoke the block, which is why we don't
* need to set EXT4_FREE_BLOCKS_METADATA.
*/
- ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
+ ext4_free_blocks(handle, ar->inode, where[i].bh, 0, 1,
EXT4_FREE_BLOCKS_FORGET);
}
- ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
- blks, 0);
+ ext4_free_blocks(handle, ar->inode, NULL, le32_to_cpu(where[num].key),
+ ar->len, 0);
return err;
}
@@ -525,11 +514,11 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
int flags)
{
+ struct ext4_allocation_request ar;
int err = -EIO;
ext4_lblk_t offsets[4];
Indirect chain[4];
Indirect *partial;
- ext4_fsblk_t goal;
int indirect_blks;
int blocks_to_boundary = 0;
int depth;
@@ -579,7 +568,16 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
return -ENOSPC;
}
- goal = ext4_find_goal(inode, map->m_lblk, partial);
+ /* Set up for the direct block allocation */
+ memset(&ar, 0, sizeof(ar));
+ ar.inode = inode;
+ ar.logical = map->m_lblk;
+ if (S_ISREG(inode->i_mode))
+ ar.flags = EXT4_MB_HINT_DATA;
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+ ar.flags |= EXT4_MB_DELALLOC_RESERVED;
+
+ ar.goal = ext4_find_goal(inode, map->m_lblk, partial);
/* the number of blocks need to allocate for [d,t]indirect blocks */
indirect_blks = (chain + depth) - partial - 1;
@@ -588,13 +586,13 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
* Next look up the indirect map to count the totoal number of
* direct blocks to allocate for this branch.
*/
- count = ext4_blks_to_allocate(partial, indirect_blks,
- map->m_len, blocks_to_boundary);
+ ar.len = ext4_blks_to_allocate(partial, indirect_blks,
+ map->m_len, blocks_to_boundary);
+
/*
* Block out ext4_truncate while we alter the tree
*/
- err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
- &count, goal,
+ err = ext4_alloc_branch(handle, &ar, indirect_blks,
offsets + (partial - chain), partial);
/*
@@ -605,14 +603,14 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
* may need to return -EAGAIN upwards in the worst case. --sct
*/
if (!err)
- err = ext4_splice_branch(handle, inode, map->m_lblk,
- partial, indirect_blks, count);
+ err = ext4_splice_branch(handle, &ar, partial, indirect_blks);
if (err)
goto cleanup;
map->m_flags |= EXT4_MAP_NEW;
ext4_update_inode_fsync_trans(handle, inode, 1);
+ count = ar.len;
got_it:
map->m_flags |= EXT4_MAP_MAPPED;
map->m_pblk = le32_to_cpu(chain[depth-1].key);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index bea662bd0ca6..3ea62695abce 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -594,6 +594,7 @@ retry:
if (ret) {
unlock_page(page);
page_cache_release(page);
+ page = NULL;
ext4_orphan_add(handle, inode);
up_write(&EXT4_I(inode)->xattr_sem);
sem_held = 0;
@@ -613,7 +614,8 @@ retry:
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
- block_commit_write(page, from, to);
+ if (page)
+ block_commit_write(page, from, to);
out:
if (page) {
unlock_page(page);
@@ -1126,8 +1128,7 @@ static int ext4_finish_convert_inline_dir(handle_t *handle,
memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE,
inline_size - EXT4_INLINE_DOTDOT_SIZE);
- if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (ext4_has_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
inode->i_size = inode->i_sb->s_blocksize;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 367a60c07cf0..3356ab5395f4 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -83,8 +83,7 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
cpu_to_le32(EXT4_OS_LINUX) ||
- !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ !ext4_has_metadata_csum(inode->i_sb))
return 1;
provided = le16_to_cpu(raw->i_checksum_lo);
@@ -105,8 +104,7 @@ static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
cpu_to_le32(EXT4_OS_LINUX) ||
- !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ !ext4_has_metadata_csum(inode->i_sb))
return;
csum = ext4_inode_csum(inode, raw, ei);
@@ -224,16 +222,15 @@ void ext4_evict_inode(struct inode *inode)
goto no_delete;
}
- if (!is_bad_inode(inode))
- dquot_initialize(inode);
+ if (is_bad_inode(inode))
+ goto no_delete;
+ dquot_initialize(inode);
if (ext4_should_order_data(inode))
ext4_begin_ordered_truncate(inode, 0);
truncate_inode_pages_final(&inode->i_data);
WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
- if (is_bad_inode(inode))
- goto no_delete;
/*
* Protect us against freezing - iput() caller didn't have to have any
@@ -590,20 +587,12 @@ found:
/*
* New blocks allocate and/or writing to unwritten extent
* will possibly result in updating i_data, so we take
- * the write lock of i_data_sem, and call get_blocks()
+ * the write lock of i_data_sem, and call get_block()
* with create == 1 flag.
*/
down_write(&EXT4_I(inode)->i_data_sem);
/*
- * if the caller is from delayed allocation writeout path
- * we have already reserved fs blocks for allocation
- * let the underlying get_block() function know to
- * avoid double accounting
- */
- if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
- ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
- /*
* We need to check for EXT4 here because migrate
* could have changed the inode type in between
*/
@@ -631,8 +620,6 @@ found:
(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
ext4_da_update_reserve_space(inode, retval, 1);
}
- if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
- ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
if (retval > 0) {
unsigned int status;
@@ -734,11 +721,11 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
* `handle' can be NULL if create is zero
*/
struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
- ext4_lblk_t block, int create, int *errp)
+ ext4_lblk_t block, int create)
{
struct ext4_map_blocks map;
struct buffer_head *bh;
- int fatal = 0, err;
+ int err;
J_ASSERT(handle != NULL || create == 0);
@@ -747,21 +734,14 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
err = ext4_map_blocks(handle, inode, &map,
create ? EXT4_GET_BLOCKS_CREATE : 0);
- /* ensure we send some value back into *errp */
- *errp = 0;
-
- if (create && err == 0)
- err = -ENOSPC; /* should never happen */
+ if (err == 0)
+ return create ? ERR_PTR(-ENOSPC) : NULL;
if (err < 0)
- *errp = err;
- if (err <= 0)
- return NULL;
+ return ERR_PTR(err);
bh = sb_getblk(inode->i_sb, map.m_pblk);
- if (unlikely(!bh)) {
- *errp = -ENOMEM;
- return NULL;
- }
+ if (unlikely(!bh))
+ return ERR_PTR(-ENOMEM);
if (map.m_flags & EXT4_MAP_NEW) {
J_ASSERT(create != 0);
J_ASSERT(handle != NULL);
@@ -775,44 +755,44 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
*/
lock_buffer(bh);
BUFFER_TRACE(bh, "call get_create_access");
- fatal = ext4_journal_get_create_access(handle, bh);
- if (!fatal && !buffer_uptodate(bh)) {
+ err = ext4_journal_get_create_access(handle, bh);
+ if (unlikely(err)) {
+ unlock_buffer(bh);
+ goto errout;
+ }
+ if (!buffer_uptodate(bh)) {
memset(bh->b_data, 0, inode->i_sb->s_blocksize);
set_buffer_uptodate(bh);
}
unlock_buffer(bh);
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle, inode, bh);
- if (!fatal)
- fatal = err;
- } else {
+ if (unlikely(err))
+ goto errout;
+ } else
BUFFER_TRACE(bh, "not a new buffer");
- }
- if (fatal) {
- *errp = fatal;
- brelse(bh);
- bh = NULL;
- }
return bh;
+errout:
+ brelse(bh);
+ return ERR_PTR(err);
}
struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
- ext4_lblk_t block, int create, int *err)
+ ext4_lblk_t block, int create)
{
struct buffer_head *bh;
- bh = ext4_getblk(handle, inode, block, create, err);
- if (!bh)
+ bh = ext4_getblk(handle, inode, block, create);
+ if (IS_ERR(bh))
return bh;
- if (buffer_uptodate(bh))
+ if (!bh || buffer_uptodate(bh))
return bh;
ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
return bh;
put_bh(bh);
- *err = -EIO;
- return NULL;
+ return ERR_PTR(-EIO);
}
int ext4_walk_page_buffers(handle_t *handle,
@@ -1055,27 +1035,11 @@ static int ext4_write_end(struct file *file,
} else
copied = block_write_end(file, mapping, pos,
len, copied, page, fsdata);
-
/*
- * No need to use i_size_read() here, the i_size
- * cannot change under us because we hole i_mutex.
- *
- * But it's important to update i_size while still holding page lock:
+ * it's important to update i_size while still holding page lock:
* page writeout could otherwise come in and zero beyond i_size.
*/
- if (pos + copied > inode->i_size) {
- i_size_write(inode, pos + copied);
- i_size_changed = 1;
- }
-
- if (pos + copied > EXT4_I(inode)->i_disksize) {
- /* We need to mark inode dirty even if
- * new_i_size is less that inode->i_size
- * but greater than i_disksize. (hint delalloc)
- */
- ext4_update_i_disksize(inode, (pos + copied));
- i_size_changed = 1;
- }
+ i_size_changed = ext4_update_inode_size(inode, pos + copied);
unlock_page(page);
page_cache_release(page);
@@ -1123,7 +1087,7 @@ static int ext4_journalled_write_end(struct file *file,
int ret = 0, ret2;
int partial = 0;
unsigned from, to;
- loff_t new_i_size;
+ int size_changed = 0;
trace_ext4_journalled_write_end(inode, pos, len, copied);
from = pos & (PAGE_CACHE_SIZE - 1);
@@ -1146,20 +1110,18 @@ static int ext4_journalled_write_end(struct file *file,
if (!partial)
SetPageUptodate(page);
}
- new_i_size = pos + copied;
- if (new_i_size > inode->i_size)
- i_size_write(inode, pos+copied);
+ size_changed = ext4_update_inode_size(inode, pos + copied);
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
- if (new_i_size > EXT4_I(inode)->i_disksize) {
- ext4_update_i_disksize(inode, new_i_size);
+ unlock_page(page);
+ page_cache_release(page);
+
+ if (size_changed) {
ret2 = ext4_mark_inode_dirty(handle, inode);
if (!ret)
ret = ret2;
}
- unlock_page(page);
- page_cache_release(page);
if (pos + len > inode->i_size && ext4_can_truncate(inode))
/* if we have allocated more blocks and copied
* less. We will have blocks allocated outside
@@ -1554,7 +1516,7 @@ out_unlock:
}
/*
- * This is a special get_blocks_t callback which is used by
+ * This is a special get_block_t callback which is used by
* ext4_da_write_begin(). It will either return mapped block or
* reserve space for a single block.
*
@@ -2029,12 +1991,10 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
* in data loss. So use reserved blocks to allocate metadata if
* possible.
*
- * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks
- * in question are delalloc blocks. This affects functions in many
- * different parts of the allocation call path. This flag exists
- * primarily because we don't want to change *many* call functions, so
- * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag
- * once the inode's allocation semaphore is taken.
+ * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if
+ * the blocks in question are delalloc blocks. This indicates
+ * that the blocks and quotas has already been checked when
+ * the data was copied into the page cache.
*/
get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
EXT4_GET_BLOCKS_METADATA_NOFAIL;
@@ -2095,6 +2055,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
struct ext4_map_blocks *map = &mpd->map;
int err;
loff_t disksize;
+ int progress = 0;
mpd->io_submit.io_end->offset =
((loff_t)map->m_lblk) << inode->i_blkbits;
@@ -2111,8 +2072,11 @@ static int mpage_map_and_submit_extent(handle_t *handle,
* is non-zero, a commit should free up blocks.
*/
if ((err == -ENOMEM) ||
- (err == -ENOSPC && ext4_count_free_clusters(sb)))
+ (err == -ENOSPC && ext4_count_free_clusters(sb))) {
+ if (progress)
+ goto update_disksize;
return err;
+ }
ext4_msg(sb, KERN_CRIT,
"Delayed block allocation failed for "
"inode %lu at logical offset %llu with"
@@ -2129,15 +2093,17 @@ static int mpage_map_and_submit_extent(handle_t *handle,
*give_up_on_write = true;
return err;
}
+ progress = 1;
/*
* Update buffer state, submit mapped pages, and get us new
* extent to map
*/
err = mpage_map_and_submit_buffers(mpd);
if (err < 0)
- return err;
+ goto update_disksize;
} while (map->m_len);
+update_disksize:
/*
* Update on-disk size after IO is submitted. Races with
* truncate are avoided by checking i_size under i_data_sem.
@@ -2527,6 +2493,20 @@ static int ext4_nonda_switch(struct super_block *sb)
return 0;
}
+/* We always reserve for an inode update; the superblock could be there too */
+static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len)
+{
+ if (likely(EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE)))
+ return 1;
+
+ if (pos + len <= 0x7fffffffULL)
+ return 1;
+
+ /* We might need to update the superblock to set LARGE_FILE */
+ return 2;
+}
+
static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -2577,7 +2557,8 @@ retry_grab:
* of file which has an already mapped buffer.
*/
retry_journal:
- handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1);
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+ ext4_da_write_credits(inode, pos, len));
if (IS_ERR(handle)) {
page_cache_release(page);
return PTR_ERR(handle);
@@ -2670,10 +2651,7 @@ static int ext4_da_write_end(struct file *file,
if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
if (ext4_has_inline_data(inode) ||
ext4_da_should_update_i_disksize(page, end)) {
- down_write(&EXT4_I(inode)->i_data_sem);
- if (new_i_size > EXT4_I(inode)->i_disksize)
- EXT4_I(inode)->i_disksize = new_i_size;
- up_write(&EXT4_I(inode)->i_data_sem);
+ ext4_update_i_disksize(inode, new_i_size);
/* We need to mark inode dirty even if
* new_i_size is less that inode->i_size
* bu greater than i_disksize.(hint delalloc)
@@ -3948,8 +3926,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_extra_isize = 0;
/* Precompute checksum seed for inode metadata */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ if (ext4_has_metadata_csum(sb)) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 csum;
__le32 inum = cpu_to_le32(inode->i_ino);
@@ -4139,6 +4116,13 @@ bad_inode:
return ERR_PTR(ret);
}
+struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino)
+{
+ if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
+ return ERR_PTR(-EIO);
+ return ext4_iget(sb, ino);
+}
+
static int ext4_inode_blocks_set(handle_t *handle,
struct ext4_inode *raw_inode,
struct ext4_inode_info *ei)
@@ -4238,7 +4222,8 @@ static int ext4_do_update_inode(handle_t *handle,
EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
- if (ext4_inode_blocks_set(handle, raw_inode, ei)) {
+ err = ext4_inode_blocks_set(handle, raw_inode, ei);
+ if (err) {
spin_unlock(&ei->i_raw_lock);
goto out_brelse;
}
@@ -4548,8 +4533,12 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
ext4_orphan_del(NULL, inode);
goto err_out;
}
- } else
+ } else {
+ loff_t oldsize = inode->i_size;
+
i_size_write(inode, attr->ia_size);
+ pagecache_isize_extended(inode, oldsize, inode->i_size);
+ }
/*
* Blocks are going to be removed from the inode. Wait
@@ -4970,7 +4959,12 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
if (val)
ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
else {
- jbd2_journal_flush(journal);
+ err = jbd2_journal_flush(journal);
+ if (err < 0) {
+ jbd2_journal_unlock_updates(journal);
+ ext4_inode_resume_unlocked_dio(inode);
+ return err;
+ }
ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
}
ext4_set_aops(inode);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 0f2252ec274d..bfda18a15592 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -331,8 +331,7 @@ flags_out:
if (!inode_owner_or_capable(inode))
return -EPERM;
- if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ if (ext4_has_metadata_csum(inode->i_sb)) {
ext4_warning(sb, "Setting inode version is not "
"supported with metadata_csum enabled.");
return -ENOTTY;
@@ -532,9 +531,17 @@ group_add_out:
}
case EXT4_IOC_SWAP_BOOT:
+ {
+ int err;
if (!(filp->f_mode & FMODE_WRITE))
return -EBADF;
- return swap_inode_boot_loader(sb, inode);
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+ err = swap_inode_boot_loader(sb, inode);
+ mnt_drop_write_file(filp);
+ return err;
+ }
case EXT4_IOC_RESIZE_FS: {
ext4_fsblk_t n_blocks_count;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 956027711faf..dbfe15c2533c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1412,6 +1412,8 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
int last = first + count - 1;
struct super_block *sb = e4b->bd_sb;
+ if (WARN_ON(count == 0))
+ return;
BUG_ON(last >= (sb->s_blocksize << 3));
assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
/* Don't bother if the block group is corrupt. */
@@ -3153,9 +3155,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
"start %lu, size %lu, fe_logical %lu",
(unsigned long) start, (unsigned long) size,
(unsigned long) ac->ac_o_ex.fe_logical);
+ BUG();
}
- BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
- start > ac->ac_o_ex.fe_logical);
BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
/* now prepare goal request */
@@ -3221,6 +3222,8 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
int err;
if (pa == NULL) {
+ if (ac->ac_f_ex.fe_len == 0)
+ return;
err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
if (err) {
/*
@@ -3235,6 +3238,7 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
ac->ac_f_ex.fe_len);
ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
+ ext4_mb_unload_buddy(&e4b);
return;
}
if (pa->pa_type == MB_INODE_PA)
@@ -4129,7 +4133,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
* per cpu locality group is to reduce the contention between block
* request from multiple CPUs.
*/
- ac->ac_lg = __this_cpu_ptr(sbi->s_locality_groups);
+ ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups);
/* we're going to use group allocation */
ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
@@ -4405,14 +4409,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
if (IS_NOQUOTA(ar->inode))
ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
- /*
- * For delayed allocation, we could skip the ENOSPC and
- * EDQUOT check, as blocks and quotas have been already
- * reserved when data being copied into pagecache.
- */
- if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED))
- ar->flags |= EXT4_MB_DELALLOC_RESERVED;
- else {
+ if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) {
/* Without delayed allocation we need to verify
* there is enough free blocks to do block allocation
* and verify allocation doesn't exceed the quota limits.
@@ -4523,8 +4520,7 @@ out:
if (inquota && ar->len < inquota)
dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
if (!ar->len) {
- if (!ext4_test_inode_state(ar->inode,
- EXT4_STATE_DELALLOC_RESERVED))
+ if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
/* release all the reserved blocks if non delalloc */
percpu_counter_sub(&sbi->s_dirtyclusters_counter,
reserv_clstrs);
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index d3567f27bae7..a432634f2e6a 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -41,8 +41,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
ext4_ext_store_pblock(&newext, lb->first_pblock);
/* Locking only for convinience since we are operating on temp inode */
down_write(&EXT4_I(inode)->i_data_sem);
- path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0);
-
+ path = ext4_find_extent(inode, lb->first_block, NULL, 0);
if (IS_ERR(path)) {
retval = PTR_ERR(path);
path = NULL;
@@ -81,13 +80,11 @@ static int finish_range(handle_t *handle, struct inode *inode,
goto err_out;
}
}
- retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
+ retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0);
err_out:
up_write((&EXT4_I(inode)->i_data_sem));
- if (path) {
- ext4_ext_drop_refs(path);
- kfree(path);
- }
+ ext4_ext_drop_refs(path);
+ kfree(path);
lb->first_pblock = 0;
return retval;
}
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 32bce844c2e1..8313ca3324ec 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -20,8 +20,7 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
{
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(sb))
return 1;
return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp);
@@ -29,8 +28,7 @@ static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
{
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(sb))
return;
mmp->mmp_checksum = ext4_mmp_csum(sb, mmp);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 671a74b14fd7..9f2311bc9c4f 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -27,120 +27,26 @@
* @lblock: logical block number to find an extent path
* @path: pointer to an extent path pointer (for output)
*
- * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value
+ * ext4_find_extent wrapper. Return 0 on success, or a negative error value
* on failure.
*/
static inline int
get_ext_path(struct inode *inode, ext4_lblk_t lblock,
- struct ext4_ext_path **orig_path)
+ struct ext4_ext_path **ppath)
{
- int ret = 0;
struct ext4_ext_path *path;
- path = ext4_ext_find_extent(inode, lblock, *orig_path, EXT4_EX_NOCACHE);
+ path = ext4_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE);
if (IS_ERR(path))
- ret = PTR_ERR(path);
- else if (path[ext_depth(inode)].p_ext == NULL)
- ret = -ENODATA;
- else
- *orig_path = path;
-
- return ret;
-}
-
-/**
- * copy_extent_status - Copy the extent's initialization status
- *
- * @src: an extent for getting initialize status
- * @dest: an extent to be set the status
- */
-static void
-copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
-{
- if (ext4_ext_is_unwritten(src))
- ext4_ext_mark_unwritten(dest);
- else
- dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest));
-}
-
-/**
- * mext_next_extent - Search for the next extent and set it to "extent"
- *
- * @inode: inode which is searched
- * @path: this will obtain data for the next extent
- * @extent: pointer to the next extent we have just gotten
- *
- * Search the next extent in the array of ext4_ext_path structure (@path)
- * and set it to ext4_extent structure (@extent). In addition, the member of
- * @path (->p_ext) also points the next extent. Return 0 on success, 1 if
- * ext4_ext_path structure refers to the last extent, or a negative error
- * value on failure.
- */
-int
-mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
- struct ext4_extent **extent)
-{
- struct ext4_extent_header *eh;
- int ppos, leaf_ppos = path->p_depth;
-
- ppos = leaf_ppos;
- if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
- /* leaf block */
- *extent = ++path[ppos].p_ext;
- path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
- return 0;
- }
-
- while (--ppos >= 0) {
- if (EXT_LAST_INDEX(path[ppos].p_hdr) >
- path[ppos].p_idx) {
- int cur_ppos = ppos;
-
- /* index block */
- path[ppos].p_idx++;
- path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
- if (path[ppos+1].p_bh)
- brelse(path[ppos+1].p_bh);
- path[ppos+1].p_bh =
- sb_bread(inode->i_sb, path[ppos].p_block);
- if (!path[ppos+1].p_bh)
- return -EIO;
- path[ppos+1].p_hdr =
- ext_block_hdr(path[ppos+1].p_bh);
-
- /* Halfway index block */
- while (++cur_ppos < leaf_ppos) {
- path[cur_ppos].p_idx =
- EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
- path[cur_ppos].p_block =
- ext4_idx_pblock(path[cur_ppos].p_idx);
- if (path[cur_ppos+1].p_bh)
- brelse(path[cur_ppos+1].p_bh);
- path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
- path[cur_ppos].p_block);
- if (!path[cur_ppos+1].p_bh)
- return -EIO;
- path[cur_ppos+1].p_hdr =
- ext_block_hdr(path[cur_ppos+1].p_bh);
- }
-
- path[leaf_ppos].p_ext = *extent = NULL;
-
- eh = path[leaf_ppos].p_hdr;
- if (le16_to_cpu(eh->eh_entries) == 0)
- /* empty leaf is found */
- return -ENODATA;
-
- /* leaf block */
- path[leaf_ppos].p_ext = *extent =
- EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
- path[leaf_ppos].p_block =
- ext4_ext_pblock(path[leaf_ppos].p_ext);
- return 0;
- }
+ return PTR_ERR(path);
+ if (path[ext_depth(inode)].p_ext == NULL) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ *ppath = NULL;
+ return -ENODATA;
}
- /* We found the last extent */
- return 1;
+ *ppath = path;
+ return 0;
}
/**
@@ -178,417 +84,6 @@ ext4_double_up_write_data_sem(struct inode *orig_inode,
}
/**
- * mext_insert_across_blocks - Insert extents across leaf block
- *
- * @handle: journal handle
- * @orig_inode: original inode
- * @o_start: first original extent to be changed
- * @o_end: last original extent to be changed
- * @start_ext: first new extent to be inserted
- * @new_ext: middle of new extent to be inserted
- * @end_ext: last new extent to be inserted
- *
- * Allocate a new leaf block and insert extents into it. Return 0 on success,
- * or a negative error value on failure.
- */
-static int
-mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
- struct ext4_extent *o_start, struct ext4_extent *o_end,
- struct ext4_extent *start_ext, struct ext4_extent *new_ext,
- struct ext4_extent *end_ext)
-{
- struct ext4_ext_path *orig_path = NULL;
- ext4_lblk_t eblock = 0;
- int new_flag = 0;
- int end_flag = 0;
- int err = 0;
-
- if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) {
- if (o_start == o_end) {
-
- /* start_ext new_ext end_ext
- * donor |---------|-----------|--------|
- * orig |------------------------------|
- */
- end_flag = 1;
- } else {
-
- /* start_ext new_ext end_ext
- * donor |---------|----------|---------|
- * orig |---------------|--------------|
- */
- o_end->ee_block = end_ext->ee_block;
- o_end->ee_len = end_ext->ee_len;
- ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
- }
-
- o_start->ee_len = start_ext->ee_len;
- eblock = le32_to_cpu(start_ext->ee_block);
- new_flag = 1;
-
- } else if (start_ext->ee_len && new_ext->ee_len &&
- !end_ext->ee_len && o_start == o_end) {
-
- /* start_ext new_ext
- * donor |--------------|---------------|
- * orig |------------------------------|
- */
- o_start->ee_len = start_ext->ee_len;
- eblock = le32_to_cpu(start_ext->ee_block);
- new_flag = 1;
-
- } else if (!start_ext->ee_len && new_ext->ee_len &&
- end_ext->ee_len && o_start == o_end) {
-
- /* new_ext end_ext
- * donor |--------------|---------------|
- * orig |------------------------------|
- */
- o_end->ee_block = end_ext->ee_block;
- o_end->ee_len = end_ext->ee_len;
- ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
-
- /*
- * Set 0 to the extent block if new_ext was
- * the first block.
- */
- if (new_ext->ee_block)
- eblock = le32_to_cpu(new_ext->ee_block);
-
- new_flag = 1;
- } else {
- ext4_debug("ext4 move extent: Unexpected insert case\n");
- return -EIO;
- }
-
- if (new_flag) {
- err = get_ext_path(orig_inode, eblock, &orig_path);
- if (err)
- goto out;
-
- if (ext4_ext_insert_extent(handle, orig_inode,
- orig_path, new_ext, 0))
- goto out;
- }
-
- if (end_flag) {
- err = get_ext_path(orig_inode,
- le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
- if (err)
- goto out;
-
- if (ext4_ext_insert_extent(handle, orig_inode,
- orig_path, end_ext, 0))
- goto out;
- }
-out:
- if (orig_path) {
- ext4_ext_drop_refs(orig_path);
- kfree(orig_path);
- }
-
- return err;
-
-}
-
-/**
- * mext_insert_inside_block - Insert new extent to the extent block
- *
- * @o_start: first original extent to be moved
- * @o_end: last original extent to be moved
- * @start_ext: first new extent to be inserted
- * @new_ext: middle of new extent to be inserted
- * @end_ext: last new extent to be inserted
- * @eh: extent header of target leaf block
- * @range_to_move: used to decide how to insert extent
- *
- * Insert extents into the leaf block. The extent (@o_start) is overwritten
- * by inserted extents.
- */
-static void
-mext_insert_inside_block(struct ext4_extent *o_start,
- struct ext4_extent *o_end,
- struct ext4_extent *start_ext,
- struct ext4_extent *new_ext,
- struct ext4_extent *end_ext,
- struct ext4_extent_header *eh,
- int range_to_move)
-{
- int i = 0;
- unsigned long len;
-
- /* Move the existing extents */
- if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) {
- len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) -
- (unsigned long)(o_end + 1);
- memmove(o_end + 1 + range_to_move, o_end + 1, len);
- }
-
- /* Insert start entry */
- if (start_ext->ee_len)
- o_start[i++].ee_len = start_ext->ee_len;
-
- /* Insert new entry */
- if (new_ext->ee_len) {
- o_start[i] = *new_ext;
- ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext));
- }
-
- /* Insert end entry */
- if (end_ext->ee_len)
- o_start[i] = *end_ext;
-
- /* Increment the total entries counter on the extent block */
- le16_add_cpu(&eh->eh_entries, range_to_move);
-}
-
-/**
- * mext_insert_extents - Insert new extent
- *
- * @handle: journal handle
- * @orig_inode: original inode
- * @orig_path: path indicates first extent to be changed
- * @o_start: first original extent to be changed
- * @o_end: last original extent to be changed
- * @start_ext: first new extent to be inserted
- * @new_ext: middle of new extent to be inserted
- * @end_ext: last new extent to be inserted
- *
- * Call the function to insert extents. If we cannot add more extents into
- * the leaf block, we call mext_insert_across_blocks() to create a
- * new leaf block. Otherwise call mext_insert_inside_block(). Return 0
- * on success, or a negative error value on failure.
- */
-static int
-mext_insert_extents(handle_t *handle, struct inode *orig_inode,
- struct ext4_ext_path *orig_path,
- struct ext4_extent *o_start,
- struct ext4_extent *o_end,
- struct ext4_extent *start_ext,
- struct ext4_extent *new_ext,
- struct ext4_extent *end_ext)
-{
- struct ext4_extent_header *eh;
- unsigned long need_slots, slots_range;
- int range_to_move, depth, ret;
-
- /*
- * The extents need to be inserted
- * start_extent + new_extent + end_extent.
- */
- need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) +
- (new_ext->ee_len ? 1 : 0);
-
- /* The number of slots between start and end */
- slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1)
- / sizeof(struct ext4_extent);
-
- /* Range to move the end of extent */
- range_to_move = need_slots - slots_range;
- depth = orig_path->p_depth;
- orig_path += depth;
- eh = orig_path->p_hdr;
-
- if (depth) {
- /* Register to journal */
- BUFFER_TRACE(orig_path->p_bh, "get_write_access");
- ret = ext4_journal_get_write_access(handle, orig_path->p_bh);
- if (ret)
- return ret;
- }
-
- /* Expansion */
- if (range_to_move > 0 &&
- (range_to_move > le16_to_cpu(eh->eh_max)
- - le16_to_cpu(eh->eh_entries))) {
-
- ret = mext_insert_across_blocks(handle, orig_inode, o_start,
- o_end, start_ext, new_ext, end_ext);
- if (ret < 0)
- return ret;
- } else
- mext_insert_inside_block(o_start, o_end, start_ext, new_ext,
- end_ext, eh, range_to_move);
-
- return ext4_ext_dirty(handle, orig_inode, orig_path);
-}
-
-/**
- * mext_leaf_block - Move one leaf extent block into the inode.
- *
- * @handle: journal handle
- * @orig_inode: original inode
- * @orig_path: path indicates first extent to be changed
- * @dext: donor extent
- * @from: start offset on the target file
- *
- * In order to insert extents into the leaf block, we must divide the extent
- * in the leaf block into three extents. The one is located to be inserted
- * extents, and the others are located around it.
- *
- * Therefore, this function creates structures to save extents of the leaf
- * block, and inserts extents by calling mext_insert_extents() with
- * created extents. Return 0 on success, or a negative error value on failure.
- */
-static int
-mext_leaf_block(handle_t *handle, struct inode *orig_inode,
- struct ext4_ext_path *orig_path, struct ext4_extent *dext,
- ext4_lblk_t *from)
-{
- struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
- struct ext4_extent new_ext, start_ext, end_ext;
- ext4_lblk_t new_ext_end;
- int oext_alen, new_ext_alen, end_ext_alen;
- int depth = ext_depth(orig_inode);
- int ret;
-
- start_ext.ee_block = end_ext.ee_block = 0;
- o_start = o_end = oext = orig_path[depth].p_ext;
- oext_alen = ext4_ext_get_actual_len(oext);
- start_ext.ee_len = end_ext.ee_len = 0;
-
- new_ext.ee_block = cpu_to_le32(*from);
- ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext));
- new_ext.ee_len = dext->ee_len;
- new_ext_alen = ext4_ext_get_actual_len(&new_ext);
- new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
-
- /*
- * Case: original extent is first
- * oext |--------|
- * new_ext |--|
- * start_ext |--|
- */
- if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) &&
- le32_to_cpu(new_ext.ee_block) <
- le32_to_cpu(oext->ee_block) + oext_alen) {
- start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
- le32_to_cpu(oext->ee_block));
- start_ext.ee_block = oext->ee_block;
- copy_extent_status(oext, &start_ext);
- } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
- prev_ext = oext - 1;
- /*
- * We can merge new_ext into previous extent,
- * if these are contiguous and same extent type.
- */
- if (ext4_can_extents_be_merged(orig_inode, prev_ext,
- &new_ext)) {
- o_start = prev_ext;
- start_ext.ee_len = cpu_to_le16(
- ext4_ext_get_actual_len(prev_ext) +
- new_ext_alen);
- start_ext.ee_block = oext->ee_block;
- copy_extent_status(prev_ext, &start_ext);
- new_ext.ee_len = 0;
- }
- }
-
- /*
- * Case: new_ext_end must be less than oext
- * oext |-----------|
- * new_ext |-------|
- */
- if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
- EXT4_ERROR_INODE(orig_inode,
- "new_ext_end(%u) should be less than or equal to "
- "oext->ee_block(%u) + oext_alen(%d) - 1",
- new_ext_end, le32_to_cpu(oext->ee_block),
- oext_alen);
- ret = -EIO;
- goto out;
- }
-
- /*
- * Case: new_ext is smaller than original extent
- * oext |---------------|
- * new_ext |-----------|
- * end_ext |---|
- */
- if (le32_to_cpu(oext->ee_block) <= new_ext_end &&
- new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) {
- end_ext.ee_len =
- cpu_to_le16(le32_to_cpu(oext->ee_block) +
- oext_alen - 1 - new_ext_end);
- copy_extent_status(oext, &end_ext);
- end_ext_alen = ext4_ext_get_actual_len(&end_ext);
- ext4_ext_store_pblock(&end_ext,
- (ext4_ext_pblock(o_end) + oext_alen - end_ext_alen));
- end_ext.ee_block =
- cpu_to_le32(le32_to_cpu(o_end->ee_block) +
- oext_alen - end_ext_alen);
- }
-
- ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
- o_end, &start_ext, &new_ext, &end_ext);
-out:
- return ret;
-}
-
-/**
- * mext_calc_swap_extents - Calculate extents for extent swapping.
- *
- * @tmp_dext: the extent that will belong to the original inode
- * @tmp_oext: the extent that will belong to the donor inode
- * @orig_off: block offset of original inode
- * @donor_off: block offset of donor inode
- * @max_count: the maximum length of extents
- *
- * Return 0 on success, or a negative error value on failure.
- */
-static int
-mext_calc_swap_extents(struct ext4_extent *tmp_dext,
- struct ext4_extent *tmp_oext,
- ext4_lblk_t orig_off, ext4_lblk_t donor_off,
- ext4_lblk_t max_count)
-{
- ext4_lblk_t diff, orig_diff;
- struct ext4_extent dext_old, oext_old;
-
- BUG_ON(orig_off != donor_off);
-
- /* original and donor extents have to cover the same block offset */
- if (orig_off < le32_to_cpu(tmp_oext->ee_block) ||
- le32_to_cpu(tmp_oext->ee_block) +
- ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off)
- return -ENODATA;
-
- if (orig_off < le32_to_cpu(tmp_dext->ee_block) ||
- le32_to_cpu(tmp_dext->ee_block) +
- ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off)
- return -ENODATA;
-
- dext_old = *tmp_dext;
- oext_old = *tmp_oext;
-
- /* When tmp_dext is too large, pick up the target range. */
- diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
-
- ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
- le32_add_cpu(&tmp_dext->ee_block, diff);
- le16_add_cpu(&tmp_dext->ee_len, -diff);
-
- if (max_count < ext4_ext_get_actual_len(tmp_dext))
- tmp_dext->ee_len = cpu_to_le16(max_count);
-
- orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
- ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff);
-
- /* Adjust extent length if donor extent is larger than orig */
- if (ext4_ext_get_actual_len(tmp_dext) >
- ext4_ext_get_actual_len(tmp_oext) - orig_diff)
- tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) -
- orig_diff);
-
- tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext));
-
- copy_extent_status(&oext_old, tmp_dext);
- copy_extent_status(&dext_old, tmp_oext);
-
- return 0;
-}
-
-/**
* mext_check_coverage - Check that all extents in range has the same type
*
* @inode: inode in question
@@ -619,171 +114,25 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
}
ret = 1;
out:
- if (path) {
- ext4_ext_drop_refs(path);
- kfree(path);
- }
+ ext4_ext_drop_refs(path);
+ kfree(path);
return ret;
}
/**
- * mext_replace_branches - Replace original extents with new extents
- *
- * @handle: journal handle
- * @orig_inode: original inode
- * @donor_inode: donor inode
- * @from: block offset of orig_inode
- * @count: block count to be replaced
- * @err: pointer to save return value
- *
- * Replace original inode extents and donor inode extents page by page.
- * We implement this replacement in the following three steps:
- * 1. Save the block information of original and donor inodes into
- * dummy extents.
- * 2. Change the block information of original inode to point at the
- * donor inode blocks.
- * 3. Change the block information of donor inode to point at the saved
- * original inode blocks in the dummy extents.
- *
- * Return replaced block count.
- */
-static int
-mext_replace_branches(handle_t *handle, struct inode *orig_inode,
- struct inode *donor_inode, ext4_lblk_t from,
- ext4_lblk_t count, int *err)
-{
- struct ext4_ext_path *orig_path = NULL;
- struct ext4_ext_path *donor_path = NULL;
- struct ext4_extent *oext, *dext;
- struct ext4_extent tmp_dext, tmp_oext;
- ext4_lblk_t orig_off = from, donor_off = from;
- int depth;
- int replaced_count = 0;
- int dext_alen;
-
- *err = ext4_es_remove_extent(orig_inode, from, count);
- if (*err)
- goto out;
-
- *err = ext4_es_remove_extent(donor_inode, from, count);
- if (*err)
- goto out;
-
- /* Get the original extent for the block "orig_off" */
- *err = get_ext_path(orig_inode, orig_off, &orig_path);
- if (*err)
- goto out;
-
- /* Get the donor extent for the head */
- *err = get_ext_path(donor_inode, donor_off, &donor_path);
- if (*err)
- goto out;
- depth = ext_depth(orig_inode);
- oext = orig_path[depth].p_ext;
- tmp_oext = *oext;
-
- depth = ext_depth(donor_inode);
- dext = donor_path[depth].p_ext;
- if (unlikely(!dext))
- goto missing_donor_extent;
- tmp_dext = *dext;
-
- *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
- donor_off, count);
- if (*err)
- goto out;
-
- /* Loop for the donor extents */
- while (1) {
- /* The extent for donor must be found. */
- if (unlikely(!dext)) {
- missing_donor_extent:
- EXT4_ERROR_INODE(donor_inode,
- "The extent for donor must be found");
- *err = -EIO;
- goto out;
- } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
- EXT4_ERROR_INODE(donor_inode,
- "Donor offset(%u) and the first block of donor "
- "extent(%u) should be equal",
- donor_off,
- le32_to_cpu(tmp_dext.ee_block));
- *err = -EIO;
- goto out;
- }
-
- /* Set donor extent to orig extent */
- *err = mext_leaf_block(handle, orig_inode,
- orig_path, &tmp_dext, &orig_off);
- if (*err)
- goto out;
-
- /* Set orig extent to donor extent */
- *err = mext_leaf_block(handle, donor_inode,
- donor_path, &tmp_oext, &donor_off);
- if (*err)
- goto out;
-
- dext_alen = ext4_ext_get_actual_len(&tmp_dext);
- replaced_count += dext_alen;
- donor_off += dext_alen;
- orig_off += dext_alen;
-
- BUG_ON(replaced_count > count);
- /* Already moved the expected blocks */
- if (replaced_count >= count)
- break;
-
- if (orig_path)
- ext4_ext_drop_refs(orig_path);
- *err = get_ext_path(orig_inode, orig_off, &orig_path);
- if (*err)
- goto out;
- depth = ext_depth(orig_inode);
- oext = orig_path[depth].p_ext;
- tmp_oext = *oext;
-
- if (donor_path)
- ext4_ext_drop_refs(donor_path);
- *err = get_ext_path(donor_inode, donor_off, &donor_path);
- if (*err)
- goto out;
- depth = ext_depth(donor_inode);
- dext = donor_path[depth].p_ext;
- tmp_dext = *dext;
-
- *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
- donor_off, count - replaced_count);
- if (*err)
- goto out;
- }
-
-out:
- if (orig_path) {
- ext4_ext_drop_refs(orig_path);
- kfree(orig_path);
- }
- if (donor_path) {
- ext4_ext_drop_refs(donor_path);
- kfree(donor_path);
- }
-
- return replaced_count;
-}
-
-/**
* mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2
*
* @inode1: the inode structure
* @inode2: the inode structure
- * @index: page index
+ * @index1: page index
+ * @index2: page index
* @page: result page vector
*
* Grab two locked pages for inode's by inode order
*/
static int
mext_page_double_lock(struct inode *inode1, struct inode *inode2,
- pgoff_t index, struct page *page[2])
+ pgoff_t index1, pgoff_t index2, struct page *page[2])
{
struct address_space *mapping[2];
unsigned fl = AOP_FLAG_NOFS;
@@ -793,15 +142,18 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
mapping[0] = inode1->i_mapping;
mapping[1] = inode2->i_mapping;
} else {
+ pgoff_t tmp = index1;
+ index1 = index2;
+ index2 = tmp;
mapping[0] = inode2->i_mapping;
mapping[1] = inode1->i_mapping;
}
- page[0] = grab_cache_page_write_begin(mapping[0], index, fl);
+ page[0] = grab_cache_page_write_begin(mapping[0], index1, fl);
if (!page[0])
return -ENOMEM;
- page[1] = grab_cache_page_write_begin(mapping[1], index, fl);
+ page[1] = grab_cache_page_write_begin(mapping[1], index2, fl);
if (!page[1]) {
unlock_page(page[0]);
page_cache_release(page[0]);
@@ -893,25 +245,27 @@ out:
* @o_filp: file structure of original file
* @donor_inode: donor inode
* @orig_page_offset: page index on original file
+ * @donor_page_offset: page index on donor file
* @data_offset_in_page: block index where data swapping starts
* @block_len_in_page: the number of blocks to be swapped
* @unwritten: orig extent is unwritten or not
* @err: pointer to save return value
*
* Save the data in original inode blocks and replace original inode extents
- * with donor inode extents by calling mext_replace_branches().
+ * with donor inode extents by calling ext4_swap_extents().
* Finally, write out the saved data in new original inode blocks. Return
* replaced block count.
*/
static int
move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
- pgoff_t orig_page_offset, int data_offset_in_page,
- int block_len_in_page, int unwritten, int *err)
+ pgoff_t orig_page_offset, pgoff_t donor_page_offset,
+ int data_offset_in_page,
+ int block_len_in_page, int unwritten, int *err)
{
struct inode *orig_inode = file_inode(o_filp);
struct page *pagep[2] = {NULL, NULL};
handle_t *handle;
- ext4_lblk_t orig_blk_offset;
+ ext4_lblk_t orig_blk_offset, donor_blk_offset;
unsigned long blocksize = orig_inode->i_sb->s_blocksize;
unsigned int w_flags = 0;
unsigned int tmp_data_size, data_size, replaced_size;
@@ -939,6 +293,9 @@ again:
orig_blk_offset = orig_page_offset * blocks_per_page +
data_offset_in_page;
+ donor_blk_offset = donor_page_offset * blocks_per_page +
+ data_offset_in_page;
+
/* Calculate data_size */
if ((orig_blk_offset + block_len_in_page - 1) ==
((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
@@ -959,7 +316,7 @@ again:
replaced_size = data_size;
*err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset,
- pagep);
+ donor_page_offset, pagep);
if (unlikely(*err < 0))
goto stop_journal;
/*
@@ -978,7 +335,7 @@ again:
if (*err)
goto drop_data_sem;
- unwritten &= mext_check_coverage(donor_inode, orig_blk_offset,
+ unwritten &= mext_check_coverage(donor_inode, donor_blk_offset,
block_len_in_page, 1, err);
if (*err)
goto drop_data_sem;
@@ -994,9 +351,10 @@ again:
*err = -EBUSY;
goto drop_data_sem;
}
- replaced_count = mext_replace_branches(handle, orig_inode,
- donor_inode, orig_blk_offset,
- block_len_in_page, err);
+ replaced_count = ext4_swap_extents(handle, orig_inode,
+ donor_inode, orig_blk_offset,
+ donor_blk_offset,
+ block_len_in_page, 1, err);
drop_data_sem:
ext4_double_up_write_data_sem(orig_inode, donor_inode);
goto unlock_pages;
@@ -1014,9 +372,9 @@ data_copy:
goto unlock_pages;
}
ext4_double_down_write_data_sem(orig_inode, donor_inode);
- replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
- orig_blk_offset,
- block_len_in_page, err);
+ replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode,
+ orig_blk_offset, donor_blk_offset,
+ block_len_in_page, 1, err);
ext4_double_up_write_data_sem(orig_inode, donor_inode);
if (*err) {
if (replaced_count) {
@@ -1061,9 +419,9 @@ repair_branches:
* Try to swap extents to it's original places
*/
ext4_double_down_write_data_sem(orig_inode, donor_inode);
- replaced_count = mext_replace_branches(handle, donor_inode, orig_inode,
- orig_blk_offset,
- block_len_in_page, &err2);
+ replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode,
+ orig_blk_offset, donor_blk_offset,
+ block_len_in_page, 0, &err2);
ext4_double_up_write_data_sem(orig_inode, donor_inode);
if (replaced_count != block_len_in_page) {
EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
@@ -1093,10 +451,14 @@ mext_check_arguments(struct inode *orig_inode,
struct inode *donor_inode, __u64 orig_start,
__u64 donor_start, __u64 *len)
{
- ext4_lblk_t orig_blocks, donor_blocks;
+ __u64 orig_eof, donor_eof;
unsigned int blkbits = orig_inode->i_blkbits;
unsigned int blocksize = 1 << blkbits;
+ orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits;
+ donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits;
+
+
if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
ext4_debug("ext4 move extent: suid or sgid is set"
" to donor file [ino:orig %lu, donor %lu]\n",
@@ -1112,7 +474,7 @@ mext_check_arguments(struct inode *orig_inode,
ext4_debug("ext4 move extent: The argument files should "
"not be swapfile [ino:orig %lu, donor %lu]\n",
orig_inode->i_ino, donor_inode->i_ino);
- return -EINVAL;
+ return -EBUSY;
}
/* Ext4 move extent supports only extent based file */
@@ -1132,67 +494,28 @@ mext_check_arguments(struct inode *orig_inode,
}
/* Start offset should be same */
- if (orig_start != donor_start) {
+ if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) !=
+ (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) {
ext4_debug("ext4 move extent: orig and donor's start "
- "offset are not same [ino:orig %lu, donor %lu]\n",
+ "offset are not alligned [ino:orig %lu, donor %lu]\n",
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
if ((orig_start >= EXT_MAX_BLOCKS) ||
+ (donor_start >= EXT_MAX_BLOCKS) ||
(*len > EXT_MAX_BLOCKS) ||
+ (donor_start + *len >= EXT_MAX_BLOCKS) ||
(orig_start + *len >= EXT_MAX_BLOCKS)) {
ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
"[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS,
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
-
- if (orig_inode->i_size > donor_inode->i_size) {
- donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits;
- /* TODO: eliminate this artificial restriction */
- if (orig_start >= donor_blocks) {
- ext4_debug("ext4 move extent: orig start offset "
- "[%llu] should be less than donor file blocks "
- "[%u] [ino:orig %lu, donor %lu]\n",
- orig_start, donor_blocks,
- orig_inode->i_ino, donor_inode->i_ino);
- return -EINVAL;
- }
-
- /* TODO: eliminate this artificial restriction */
- if (orig_start + *len > donor_blocks) {
- ext4_debug("ext4 move extent: End offset [%llu] should "
- "be less than donor file blocks [%u]."
- "So adjust length from %llu to %llu "
- "[ino:orig %lu, donor %lu]\n",
- orig_start + *len, donor_blocks,
- *len, donor_blocks - orig_start,
- orig_inode->i_ino, donor_inode->i_ino);
- *len = donor_blocks - orig_start;
- }
- } else {
- orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits;
- if (orig_start >= orig_blocks) {
- ext4_debug("ext4 move extent: start offset [%llu] "
- "should be less than original file blocks "
- "[%u] [ino:orig %lu, donor %lu]\n",
- orig_start, orig_blocks,
- orig_inode->i_ino, donor_inode->i_ino);
- return -EINVAL;
- }
-
- if (orig_start + *len > orig_blocks) {
- ext4_debug("ext4 move extent: Adjust length "
- "from %llu to %llu. Because it should be "
- "less than original file blocks "
- "[ino:orig %lu, donor %lu]\n",
- *len, orig_blocks - orig_start,
- orig_inode->i_ino, donor_inode->i_ino);
- *len = orig_blocks - orig_start;
- }
- }
-
+ if (orig_eof < orig_start + *len - 1)
+ *len = orig_eof - orig_start;
+ if (donor_eof < donor_start + *len - 1)
+ *len = donor_eof - donor_start;
if (!*len) {
ext4_debug("ext4 move extent: len should not be 0 "
"[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
@@ -1208,60 +531,26 @@ mext_check_arguments(struct inode *orig_inode,
*
* @o_filp: file structure of the original file
* @d_filp: file structure of the donor file
- * @orig_start: start offset in block for orig
- * @donor_start: start offset in block for donor
+ * @orig_blk: start offset in block for orig
+ * @donor_blk: start offset in block for donor
* @len: the number of blocks to be moved
* @moved_len: moved block length
*
* This function returns 0 and moved block length is set in moved_len
* if succeed, otherwise returns error value.
*
- * Note: ext4_move_extents() proceeds the following order.
- * 1:ext4_move_extents() calculates the last block number of moving extent
- * function by the start block number (orig_start) and the number of blocks
- * to be moved (len) specified as arguments.
- * If the {orig, donor}_start points a hole, the extent's start offset
- * pointed by ext_cur (current extent), holecheck_path, orig_path are set
- * after hole behind.
- * 2:Continue step 3 to step 5, until the holecheck_path points to last_extent
- * or the ext_cur exceeds the block_end which is last logical block number.
- * 3:To get the length of continues area, call mext_next_extent()
- * specified with the ext_cur (initial value is holecheck_path) re-cursive,
- * until find un-continuous extent, the start logical block number exceeds
- * the block_end or the extent points to the last extent.
- * 4:Exchange the original inode data with donor inode data
- * from orig_page_offset to seq_end_page.
- * The start indexes of data are specified as arguments.
- * That of the original inode is orig_page_offset,
- * and the donor inode is also orig_page_offset
- * (To easily handle blocksize != pagesize case, the offset for the
- * donor inode is block unit).
- * 5:Update holecheck_path and orig_path to points a next proceeding extent,
- * then returns to step 2.
- * 6:Release holecheck_path, orig_path and set the len to moved_len
- * which shows the number of moved blocks.
- * The moved_len is useful for the command to calculate the file offset
- * for starting next move extent ioctl.
- * 7:Return 0 on success, or a negative error value on failure.
*/
int
-ext4_move_extents(struct file *o_filp, struct file *d_filp,
- __u64 orig_start, __u64 donor_start, __u64 len,
- __u64 *moved_len)
+ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
+ __u64 donor_blk, __u64 len, __u64 *moved_len)
{
struct inode *orig_inode = file_inode(o_filp);
struct inode *donor_inode = file_inode(d_filp);
- struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL;
- struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
- ext4_lblk_t block_start = orig_start;
- ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
- ext4_lblk_t rest_blocks;
- pgoff_t orig_page_offset = 0, seq_end_page;
- int ret, depth, last_extent = 0;
+ struct ext4_ext_path *path = NULL;
int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
- int data_offset_in_page;
- int block_len_in_page;
- int unwritten;
+ ext4_lblk_t o_end, o_start = orig_blk;
+ ext4_lblk_t d_start = donor_blk;
+ int ret;
if (orig_inode->i_sb != donor_inode->i_sb) {
ext4_debug("ext4 move extent: The argument files "
@@ -1303,121 +592,58 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
/* Protect extent tree against block allocations via delalloc */
ext4_double_down_write_data_sem(orig_inode, donor_inode);
/* Check the filesystem environment whether move_extent can be done */
- ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
- donor_start, &len);
+ ret = mext_check_arguments(orig_inode, donor_inode, orig_blk,
+ donor_blk, &len);
if (ret)
goto out;
+ o_end = o_start + len;
- file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
- block_end = block_start + len - 1;
- if (file_end < block_end)
- len -= block_end - file_end;
+ while (o_start < o_end) {
+ struct ext4_extent *ex;
+ ext4_lblk_t cur_blk, next_blk;
+ pgoff_t orig_page_index, donor_page_index;
+ int offset_in_page;
+ int unwritten, cur_len;
- ret = get_ext_path(orig_inode, block_start, &orig_path);
- if (ret)
- goto out;
-
- /* Get path structure to check the hole */
- ret = get_ext_path(orig_inode, block_start, &holecheck_path);
- if (ret)
- goto out;
-
- depth = ext_depth(orig_inode);
- ext_cur = holecheck_path[depth].p_ext;
-
- /*
- * Get proper starting location of block replacement if block_start was
- * within the hole.
- */
- if (le32_to_cpu(ext_cur->ee_block) +
- ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
- /*
- * The hole exists between extents or the tail of
- * original file.
- */
- last_extent = mext_next_extent(orig_inode,
- holecheck_path, &ext_cur);
- if (last_extent < 0) {
- ret = last_extent;
- goto out;
- }
- last_extent = mext_next_extent(orig_inode, orig_path,
- &ext_dummy);
- if (last_extent < 0) {
- ret = last_extent;
+ ret = get_ext_path(orig_inode, o_start, &path);
+ if (ret)
goto out;
- }
- seq_start = le32_to_cpu(ext_cur->ee_block);
- } else if (le32_to_cpu(ext_cur->ee_block) > block_start)
- /* The hole exists at the beginning of original file. */
- seq_start = le32_to_cpu(ext_cur->ee_block);
- else
- seq_start = block_start;
-
- /* No blocks within the specified range. */
- if (le32_to_cpu(ext_cur->ee_block) > block_end) {
- ext4_debug("ext4 move extent: The specified range of file "
- "may be the hole\n");
- ret = -EINVAL;
- goto out;
- }
-
- /* Adjust start blocks */
- add_blocks = min(le32_to_cpu(ext_cur->ee_block) +
- ext4_ext_get_actual_len(ext_cur), block_end + 1) -
- max(le32_to_cpu(ext_cur->ee_block), block_start);
-
- while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
- seq_blocks += add_blocks;
-
- /* Adjust tail blocks */
- if (seq_start + seq_blocks - 1 > block_end)
- seq_blocks = block_end - seq_start + 1;
-
- ext_prev = ext_cur;
- last_extent = mext_next_extent(orig_inode, holecheck_path,
- &ext_cur);
- if (last_extent < 0) {
- ret = last_extent;
- break;
- }
- add_blocks = ext4_ext_get_actual_len(ext_cur);
-
- /*
- * Extend the length of contiguous block (seq_blocks)
- * if extents are contiguous.
- */
- if (ext4_can_extents_be_merged(orig_inode,
- ext_prev, ext_cur) &&
- block_end >= le32_to_cpu(ext_cur->ee_block) &&
- !last_extent)
+ ex = path[path->p_depth].p_ext;
+ next_blk = ext4_ext_next_allocated_block(path);
+ cur_blk = le32_to_cpu(ex->ee_block);
+ cur_len = ext4_ext_get_actual_len(ex);
+ /* Check hole before the start pos */
+ if (cur_blk + cur_len - 1 < o_start) {
+ if (next_blk == EXT_MAX_BLOCKS) {
+ o_start = o_end;
+ ret = -ENODATA;
+ goto out;
+ }
+ d_start += next_blk - o_start;
+ o_start = next_blk;
continue;
-
- /* Is original extent is unwritten */
- unwritten = ext4_ext_is_unwritten(ext_prev);
-
- data_offset_in_page = seq_start % blocks_per_page;
-
- /*
- * Calculate data blocks count that should be swapped
- * at the first page.
- */
- if (data_offset_in_page + seq_blocks > blocks_per_page) {
- /* Swapped blocks are across pages */
- block_len_in_page =
- blocks_per_page - data_offset_in_page;
- } else {
- /* Swapped blocks are in a page */
- block_len_in_page = seq_blocks;
+ /* Check hole after the start pos */
+ } else if (cur_blk > o_start) {
+ /* Skip hole */
+ d_start += cur_blk - o_start;
+ o_start = cur_blk;
+ /* Extent inside requested range ?*/
+ if (cur_blk >= o_end)
+ goto out;
+ } else { /* in_range(o_start, o_blk, o_len) */
+ cur_len += cur_blk - o_start;
}
-
- orig_page_offset = seq_start >>
- (PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
- seq_end_page = (seq_start + seq_blocks - 1) >>
- (PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
- seq_start = le32_to_cpu(ext_cur->ee_block);
- rest_blocks = seq_blocks;
-
+ unwritten = ext4_ext_is_unwritten(ex);
+ if (o_end - o_start < cur_len)
+ cur_len = o_end - o_start;
+
+ orig_page_index = o_start >> (PAGE_CACHE_SHIFT -
+ orig_inode->i_blkbits);
+ donor_page_index = d_start >> (PAGE_CACHE_SHIFT -
+ donor_inode->i_blkbits);
+ offset_in_page = o_start % blocks_per_page;
+ if (cur_len > blocks_per_page- offset_in_page)
+ cur_len = blocks_per_page - offset_in_page;
/*
* Up semaphore to avoid following problems:
* a. transaction deadlock among ext4_journal_start,
@@ -1426,77 +652,29 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
* in move_extent_per_page
*/
ext4_double_up_write_data_sem(orig_inode, donor_inode);
-
- while (orig_page_offset <= seq_end_page) {
-
- /* Swap original branches with new branches */
- block_len_in_page = move_extent_per_page(
- o_filp, donor_inode,
- orig_page_offset,
- data_offset_in_page,
- block_len_in_page,
- unwritten, &ret);
-
- /* Count how many blocks we have exchanged */
- *moved_len += block_len_in_page;
- if (ret < 0)
- break;
- if (*moved_len > len) {
- EXT4_ERROR_INODE(orig_inode,
- "We replaced blocks too much! "
- "sum of replaced: %llu requested: %llu",
- *moved_len, len);
- ret = -EIO;
- break;
- }
-
- orig_page_offset++;
- data_offset_in_page = 0;
- rest_blocks -= block_len_in_page;
- if (rest_blocks > blocks_per_page)
- block_len_in_page = blocks_per_page;
- else
- block_len_in_page = rest_blocks;
- }
-
+ /* Swap original branches with new branches */
+ move_extent_per_page(o_filp, donor_inode,
+ orig_page_index, donor_page_index,
+ offset_in_page, cur_len,
+ unwritten, &ret);
ext4_double_down_write_data_sem(orig_inode, donor_inode);
if (ret < 0)
break;
-
- /* Decrease buffer counter */
- if (holecheck_path)
- ext4_ext_drop_refs(holecheck_path);
- ret = get_ext_path(orig_inode, seq_start, &holecheck_path);
- if (ret)
- break;
- depth = holecheck_path->p_depth;
-
- /* Decrease buffer counter */
- if (orig_path)
- ext4_ext_drop_refs(orig_path);
- ret = get_ext_path(orig_inode, seq_start, &orig_path);
- if (ret)
- break;
-
- ext_cur = holecheck_path[depth].p_ext;
- add_blocks = ext4_ext_get_actual_len(ext_cur);
- seq_blocks = 0;
-
+ o_start += cur_len;
+ d_start += cur_len;
}
+ *moved_len = o_start - orig_blk;
+ if (*moved_len > len)
+ *moved_len = len;
+
out:
if (*moved_len) {
ext4_discard_preallocations(orig_inode);
ext4_discard_preallocations(donor_inode);
}
- if (orig_path) {
- ext4_ext_drop_refs(orig_path);
- kfree(orig_path);
- }
- if (holecheck_path) {
- ext4_ext_drop_refs(holecheck_path);
- kfree(holecheck_path);
- }
+ ext4_ext_drop_refs(path);
+ kfree(path);
ext4_double_up_write_data_sem(orig_inode, donor_inode);
ext4_inode_resume_unlocked_dio(orig_inode);
ext4_inode_resume_unlocked_dio(donor_inode);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 3520ab8a6639..426211882f72 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -53,7 +53,7 @@ static struct buffer_head *ext4_append(handle_t *handle,
ext4_lblk_t *block)
{
struct buffer_head *bh;
- int err = 0;
+ int err;
if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
((inode->i_size >> 10) >=
@@ -62,9 +62,9 @@ static struct buffer_head *ext4_append(handle_t *handle,
*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
- bh = ext4_bread(handle, inode, *block, 1, &err);
- if (!bh)
- return ERR_PTR(err);
+ bh = ext4_bread(handle, inode, *block, 1);
+ if (IS_ERR(bh))
+ return bh;
inode->i_size += inode->i_sb->s_blocksize;
EXT4_I(inode)->i_disksize = inode->i_size;
BUFFER_TRACE(bh, "get_write_access");
@@ -94,20 +94,20 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
{
struct buffer_head *bh;
struct ext4_dir_entry *dirent;
- int err = 0, is_dx_block = 0;
+ int is_dx_block = 0;
- bh = ext4_bread(NULL, inode, block, 0, &err);
- if (!bh) {
- if (err == 0) {
- ext4_error_inode(inode, __func__, line, block,
- "Directory hole found");
- return ERR_PTR(-EIO);
- }
+ bh = ext4_bread(NULL, inode, block, 0);
+ if (IS_ERR(bh)) {
__ext4_warning(inode->i_sb, __func__, line,
- "error reading directory block "
- "(ino %lu, block %lu)", inode->i_ino,
+ "error %ld reading directory block "
+ "(ino %lu, block %lu)", PTR_ERR(bh), inode->i_ino,
(unsigned long) block);
- return ERR_PTR(err);
+
+ return bh;
+ }
+ if (!bh) {
+ ext4_error_inode(inode, __func__, line, block, "Directory hole found");
+ return ERR_PTR(-EIO);
}
dirent = (struct ext4_dir_entry *) bh->b_data;
/* Determine whether or not we have an index block */
@@ -124,8 +124,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
"directory leaf block found instead of index block");
return ERR_PTR(-EIO);
}
- if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) ||
+ if (!ext4_has_metadata_csum(inode->i_sb) ||
buffer_verified(bh))
return bh;
@@ -253,8 +252,7 @@ static unsigned dx_node_limit(struct inode *dir);
static struct dx_frame *dx_probe(const struct qstr *d_name,
struct inode *dir,
struct dx_hash_info *hinfo,
- struct dx_frame *frame,
- int *err);
+ struct dx_frame *frame);
static void dx_release(struct dx_frame *frames);
static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
struct dx_hash_info *hinfo, struct dx_map_entry map[]);
@@ -270,8 +268,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
__u32 *start_hash);
static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
const struct qstr *d_name,
- struct ext4_dir_entry_2 **res_dir,
- int *err);
+ struct ext4_dir_entry_2 **res_dir);
static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
struct inode *inode);
@@ -340,8 +337,7 @@ int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
{
struct ext4_dir_entry_tail *t;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(inode->i_sb))
return 1;
t = get_dirent_tail(inode, dirent);
@@ -362,8 +358,7 @@ static void ext4_dirent_csum_set(struct inode *inode,
{
struct ext4_dir_entry_tail *t;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(inode->i_sb))
return;
t = get_dirent_tail(inode, dirent);
@@ -438,8 +433,7 @@ static int ext4_dx_csum_verify(struct inode *inode,
struct dx_tail *t;
int count_offset, limit, count;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(inode->i_sb))
return 1;
c = get_dx_countlimit(inode, dirent, &count_offset);
@@ -468,8 +462,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
struct dx_tail *t;
int count_offset, limit, count;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(inode->i_sb))
return;
c = get_dx_countlimit(inode, dirent, &count_offset);
@@ -557,8 +550,7 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
EXT4_DIR_REC_LEN(2) - infosize;
- if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (ext4_has_metadata_csum(dir->i_sb))
entry_space -= sizeof(struct dx_tail);
return entry_space / sizeof(struct dx_entry);
}
@@ -567,8 +559,7 @@ static inline unsigned dx_node_limit(struct inode *dir)
{
unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
- if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (ext4_has_metadata_csum(dir->i_sb))
entry_space -= sizeof(struct dx_tail);
return entry_space / sizeof(struct dx_entry);
}
@@ -641,7 +632,9 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
struct stats stats;
printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range);
- if (!(bh = ext4_bread (NULL,dir, block, 0,&err))) continue;
+ bh = ext4_bread(NULL,dir, block, 0);
+ if (!bh || IS_ERR(bh))
+ continue;
stats = levels?
dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0);
@@ -669,29 +662,25 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
*/
static struct dx_frame *
dx_probe(const struct qstr *d_name, struct inode *dir,
- struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
+ struct dx_hash_info *hinfo, struct dx_frame *frame_in)
{
unsigned count, indirect;
struct dx_entry *at, *entries, *p, *q, *m;
struct dx_root *root;
- struct buffer_head *bh;
struct dx_frame *frame = frame_in;
+ struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
u32 hash;
- frame->bh = NULL;
- bh = ext4_read_dirblock(dir, 0, INDEX);
- if (IS_ERR(bh)) {
- *err = PTR_ERR(bh);
- goto fail;
- }
- root = (struct dx_root *) bh->b_data;
+ frame->bh = ext4_read_dirblock(dir, 0, INDEX);
+ if (IS_ERR(frame->bh))
+ return (struct dx_frame *) frame->bh;
+
+ root = (struct dx_root *) frame->bh->b_data;
if (root->info.hash_version != DX_HASH_TEA &&
root->info.hash_version != DX_HASH_HALF_MD4 &&
root->info.hash_version != DX_HASH_LEGACY) {
ext4_warning(dir->i_sb, "Unrecognised inode hash code %d",
root->info.hash_version);
- brelse(bh);
- *err = ERR_BAD_DX_DIR;
goto fail;
}
hinfo->hash_version = root->info.hash_version;
@@ -705,16 +694,12 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
if (root->info.unused_flags & 1) {
ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x",
root->info.unused_flags);
- brelse(bh);
- *err = ERR_BAD_DX_DIR;
goto fail;
}
if ((indirect = root->info.indirect_levels) > 1) {
ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
root->info.indirect_levels);
- brelse(bh);
- *err = ERR_BAD_DX_DIR;
goto fail;
}
@@ -724,27 +709,21 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
if (dx_get_limit(entries) != dx_root_limit(dir,
root->info.info_length)) {
ext4_warning(dir->i_sb, "dx entry: limit != root limit");
- brelse(bh);
- *err = ERR_BAD_DX_DIR;
goto fail;
}
dxtrace(printk("Look up %x", hash));
- while (1)
- {
+ while (1) {
count = dx_get_count(entries);
if (!count || count > dx_get_limit(entries)) {
ext4_warning(dir->i_sb,
"dx entry: no count or count > limit");
- brelse(bh);
- *err = ERR_BAD_DX_DIR;
- goto fail2;
+ goto fail;
}
p = entries + 1;
q = entries + count - 1;
- while (p <= q)
- {
+ while (p <= q) {
m = p + (q - p)/2;
dxtrace(printk("."));
if (dx_get_hash(m) > hash)
@@ -753,8 +732,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
p = m + 1;
}
- if (0) // linear search cross check
- {
+ if (0) { // linear search cross check
unsigned n = count - 1;
at = entries;
while (n--)
@@ -771,38 +749,35 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
at = p - 1;
dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
- frame->bh = bh;
frame->entries = entries;
frame->at = at;
- if (!indirect--) return frame;
- bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
- if (IS_ERR(bh)) {
- *err = PTR_ERR(bh);
- goto fail2;
+ if (!indirect--)
+ return frame;
+ frame++;
+ frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
+ if (IS_ERR(frame->bh)) {
+ ret_err = (struct dx_frame *) frame->bh;
+ frame->bh = NULL;
+ goto fail;
}
- entries = ((struct dx_node *) bh->b_data)->entries;
+ entries = ((struct dx_node *) frame->bh->b_data)->entries;
if (dx_get_limit(entries) != dx_node_limit (dir)) {
ext4_warning(dir->i_sb,
"dx entry: limit != node limit");
- brelse(bh);
- *err = ERR_BAD_DX_DIR;
- goto fail2;
+ goto fail;
}
- frame++;
- frame->bh = NULL;
}
-fail2:
+fail:
while (frame >= frame_in) {
brelse(frame->bh);
frame--;
}
-fail:
- if (*err == ERR_BAD_DX_DIR)
+ if (ret_err == ERR_PTR(ERR_BAD_DX_DIR))
ext4_warning(dir->i_sb,
"Corrupt dir inode %lu, running e2fsck is "
"recommended.", dir->i_ino);
- return NULL;
+ return ret_err;
}
static void dx_release (struct dx_frame *frames)
@@ -988,9 +963,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
}
hinfo.hash = start_hash;
hinfo.minor_hash = 0;
- frame = dx_probe(NULL, dir, &hinfo, frames, &err);
- if (!frame)
- return err;
+ frame = dx_probe(NULL, dir, &hinfo, frames);
+ if (IS_ERR(frame))
+ return PTR_ERR(frame);
/* Add '.' and '..' from the htree header */
if (!start_hash && !start_minor_hash) {
@@ -1227,8 +1202,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
buffer */
int num = 0;
ext4_lblk_t nblocks;
- int i, err;
- int namelen;
+ int i, namelen;
*res_dir = NULL;
sb = dir->i_sb;
@@ -1258,13 +1232,13 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
goto restart;
}
if (is_dx(dir)) {
- bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
+ bh = ext4_dx_find_entry(dir, d_name, res_dir);
/*
* On success, or if the error was file not found,
* return. Otherwise, fall back to doing a search the
* old fashioned way.
*/
- if (bh || (err != ERR_BAD_DX_DIR))
+ if (!IS_ERR(bh) || PTR_ERR(bh) != ERR_BAD_DX_DIR)
return bh;
dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
"falling back\n"));
@@ -1294,7 +1268,12 @@ restart:
break;
}
num++;
- bh = ext4_getblk(NULL, dir, b++, 0, &err);
+ bh = ext4_getblk(NULL, dir, b++, 0);
+ if (unlikely(IS_ERR(bh))) {
+ if (ra_max == 0)
+ return bh;
+ break;
+ }
bh_use[ra_max] = bh;
if (bh)
ll_rw_block(READ | REQ_META | REQ_PRIO,
@@ -1357,7 +1336,7 @@ cleanup_and_exit:
}
static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
- struct ext4_dir_entry_2 **res_dir, int *err)
+ struct ext4_dir_entry_2 **res_dir)
{
struct super_block * sb = dir->i_sb;
struct dx_hash_info hinfo;
@@ -1366,25 +1345,23 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
ext4_lblk_t block;
int retval;
- if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
- return NULL;
+ frame = dx_probe(d_name, dir, &hinfo, frames);
+ if (IS_ERR(frame))
+ return (struct buffer_head *) frame;
do {
block = dx_get_block(frame->at);
bh = ext4_read_dirblock(dir, block, DIRENT);
- if (IS_ERR(bh)) {
- *err = PTR_ERR(bh);
+ if (IS_ERR(bh))
goto errout;
- }
+
retval = search_dirblock(bh, dir, d_name,
block << EXT4_BLOCK_SIZE_BITS(sb),
res_dir);
- if (retval == 1) { /* Success! */
- dx_release(frames);
- return bh;
- }
+ if (retval == 1)
+ goto success;
brelse(bh);
if (retval == -1) {
- *err = ERR_BAD_DX_DIR;
+ bh = ERR_PTR(ERR_BAD_DX_DIR);
goto errout;
}
@@ -1393,18 +1370,19 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
frames, NULL);
if (retval < 0) {
ext4_warning(sb,
- "error reading index page in directory #%lu",
- dir->i_ino);
- *err = retval;
+ "error %d reading index page in directory #%lu",
+ retval, dir->i_ino);
+ bh = ERR_PTR(retval);
goto errout;
}
} while (retval == 1);
- *err = -ENOENT;
+ bh = NULL;
errout:
dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name));
- dx_release (frames);
- return NULL;
+success:
+ dx_release(frames);
+ return bh;
}
static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
@@ -1417,6 +1395,8 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
return ERR_PTR(-ENAMETOOLONG);
bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
+ if (IS_ERR(bh))
+ return (struct dentry *) bh;
inode = NULL;
if (bh) {
__u32 ino = le32_to_cpu(de->inode);
@@ -1430,7 +1410,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
dentry);
return ERR_PTR(-EIO);
}
- inode = ext4_iget(dir->i_sb, ino);
+ inode = ext4_iget_normal(dir->i_sb, ino);
if (inode == ERR_PTR(-ESTALE)) {
EXT4_ERROR_INODE(dir,
"deleted inode referenced: %u",
@@ -1450,6 +1430,8 @@ struct dentry *ext4_get_parent(struct dentry *child)
struct buffer_head *bh;
bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL);
+ if (IS_ERR(bh))
+ return (struct dentry *) bh;
if (!bh)
return ERR_PTR(-ENOENT);
ino = le32_to_cpu(de->inode);
@@ -1461,7 +1443,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
return ERR_PTR(-EIO);
}
- return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino));
+ return d_obtain_alias(ext4_iget_normal(child->d_inode->i_sb, ino));
}
/*
@@ -1520,7 +1502,7 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
*/
static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
struct buffer_head **bh,struct dx_frame *frame,
- struct dx_hash_info *hinfo, int *error)
+ struct dx_hash_info *hinfo)
{
unsigned blocksize = dir->i_sb->s_blocksize;
unsigned count, continued;
@@ -1535,16 +1517,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
int csum_size = 0;
int err = 0, i;
- if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (ext4_has_metadata_csum(dir->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
bh2 = ext4_append(handle, dir, &newblock);
if (IS_ERR(bh2)) {
brelse(*bh);
*bh = NULL;
- *error = PTR_ERR(bh2);
- return NULL;
+ return (struct ext4_dir_entry_2 *) bh2;
}
BUFFER_TRACE(*bh, "get_write_access");
@@ -1604,8 +1584,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
/* Which block gets the new entry? */
- if (hinfo->hash >= hash2)
- {
+ if (hinfo->hash >= hash2) {
swap(*bh, bh2);
de = de2;
}
@@ -1625,8 +1604,7 @@ journal_error:
brelse(bh2);
*bh = NULL;
ext4_std_error(dir->i_sb, err);
- *error = err;
- return NULL;
+ return ERR_PTR(err);
}
int ext4_find_dest_de(struct inode *dir, struct inode *inode,
@@ -1705,8 +1683,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
int csum_size = 0;
int err;
- if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (ext4_has_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
if (!de) {
@@ -1773,8 +1750,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
struct fake_dirent *fde;
int csum_size = 0;
- if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (ext4_has_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
blocksize = dir->i_sb->s_blocksize;
@@ -1840,31 +1816,39 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
ext4fs_dirhash(name, namelen, &hinfo);
+ memset(frames, 0, sizeof(frames));
frame = frames;
frame->entries = entries;
frame->at = entries;
frame->bh = bh;
bh = bh2;
- ext4_handle_dirty_dx_node(handle, dir, frame->bh);
- ext4_handle_dirty_dirent_node(handle, dir, bh);
+ retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
+ if (retval)
+ goto out_frames;
+ retval = ext4_handle_dirty_dirent_node(handle, dir, bh);
+ if (retval)
+ goto out_frames;
- de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
- if (!de) {
- /*
- * Even if the block split failed, we have to properly write
- * out all the changes we did so far. Otherwise we can end up
- * with corrupted filesystem.
- */
- ext4_mark_inode_dirty(handle, dir);
- dx_release(frames);
- return retval;
+ de = do_split(handle,dir, &bh, frame, &hinfo);
+ if (IS_ERR(de)) {
+ retval = PTR_ERR(de);
+ goto out_frames;
}
dx_release(frames);
retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
brelse(bh);
return retval;
+out_frames:
+ /*
+ * Even if the block split failed, we have to properly write
+ * out all the changes we did so far. Otherwise we can end up
+ * with corrupted filesystem.
+ */
+ ext4_mark_inode_dirty(handle, dir);
+ dx_release(frames);
+ return retval;
}
/*
@@ -1891,8 +1875,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
ext4_lblk_t block, blocks;
int csum_size = 0;
- if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (ext4_has_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
sb = dir->i_sb;
@@ -1969,9 +1952,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
struct ext4_dir_entry_2 *de;
int err;
- frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
- if (!frame)
- return err;
+ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames);
+ if (IS_ERR(frame))
+ return PTR_ERR(frame);
entries = frame->entries;
at = frame->at;
bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT);
@@ -2082,9 +2065,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
goto cleanup;
}
}
- de = do_split(handle, dir, &bh, frame, &hinfo, &err);
- if (!de)
+ de = do_split(handle, dir, &bh, frame, &hinfo);
+ if (IS_ERR(de)) {
+ err = PTR_ERR(de);
goto cleanup;
+ }
err = add_dirent_to_buf(handle, dentry, inode, de, bh);
goto cleanup;
@@ -2154,8 +2139,7 @@ static int ext4_delete_entry(handle_t *handle,
return err;
}
- if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (ext4_has_metadata_csum(dir->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
BUFFER_TRACE(bh, "get_write_access");
@@ -2374,8 +2358,7 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
int csum_size = 0;
int err;
- if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (ext4_has_metadata_csum(dir->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
@@ -2390,10 +2373,6 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
dir_block = ext4_append(handle, inode, &block);
if (IS_ERR(dir_block))
return PTR_ERR(dir_block);
- BUFFER_TRACE(dir_block, "get_write_access");
- err = ext4_journal_get_write_access(handle, dir_block);
- if (err)
- goto out;
de = (struct ext4_dir_entry_2 *)dir_block->b_data;
ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0);
set_nlink(inode, 2);
@@ -2560,7 +2539,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
int err = 0, rc;
bool dirty = false;
- if (!sbi->s_journal)
+ if (!sbi->s_journal || is_bad_inode(inode))
return 0;
WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
@@ -2727,6 +2706,8 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
retval = -ENOENT;
bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
if (!bh)
goto end_rmdir;
@@ -2794,6 +2775,8 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
retval = -ENOENT;
bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
if (!bh)
goto end_unlink;
@@ -3121,6 +3104,8 @@ static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
struct ext4_dir_entry_2 *de;
bh = ext4_find_entry(dir, d_name, &de, NULL);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
if (bh) {
retval = ext4_delete_entry(handle, dir, de, bh);
brelse(bh);
@@ -3128,7 +3113,8 @@ static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
return retval;
}
-static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent)
+static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent,
+ int force_reread)
{
int retval;
/*
@@ -3140,7 +3126,8 @@ static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent)
if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino ||
ent->de->name_len != ent->dentry->d_name.len ||
strncmp(ent->de->name, ent->dentry->d_name.name,
- ent->de->name_len)) {
+ ent->de->name_len) ||
+ force_reread) {
retval = ext4_find_delete_entry(handle, ent->dir,
&ent->dentry->d_name);
} else {
@@ -3169,6 +3156,39 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent)
}
}
+static struct inode *ext4_whiteout_for_rename(struct ext4_renament *ent,
+ int credits, handle_t **h)
+{
+ struct inode *wh;
+ handle_t *handle;
+ int retries = 0;
+
+ /*
+ * for inode block, sb block, group summaries,
+ * and inode bitmap
+ */
+ credits += (EXT4_MAXQUOTAS_TRANS_BLOCKS(ent->dir->i_sb) +
+ EXT4_XATTR_TRANS_BLOCKS + 4);
+retry:
+ wh = ext4_new_inode_start_handle(ent->dir, S_IFCHR | WHITEOUT_MODE,
+ &ent->dentry->d_name, 0, NULL,
+ EXT4_HT_DIR, credits);
+
+ handle = ext4_journal_current_handle();
+ if (IS_ERR(wh)) {
+ if (handle)
+ ext4_journal_stop(handle);
+ if (PTR_ERR(wh) == -ENOSPC &&
+ ext4_should_retry_alloc(ent->dir->i_sb, &retries))
+ goto retry;
+ } else {
+ *h = handle;
+ init_special_inode(wh, wh->i_mode, WHITEOUT_DEV);
+ wh->i_op = &ext4_special_inode_operations;
+ }
+ return wh;
+}
+
/*
* Anybody can rename anything with this: the permission checks are left to the
* higher-level routines.
@@ -3178,7 +3198,8 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent)
* This comes from rename(const char *oldpath, const char *newpath)
*/
static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
handle_t *handle = NULL;
struct ext4_renament old = {
@@ -3191,7 +3212,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
.dentry = new_dentry,
.inode = new_dentry->d_inode,
};
+ int force_reread;
int retval;
+ struct inode *whiteout = NULL;
+ int credits;
+ u8 old_file_type;
dquot_initialize(old.dir);
dquot_initialize(new.dir);
@@ -3202,6 +3227,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
dquot_initialize(new.inode);
old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
+ if (IS_ERR(old.bh))
+ return PTR_ERR(old.bh);
/*
* Check for inode number is _not_ due to possible IO errors.
* We might rmdir the source, keep it as pwd of some process
@@ -3214,6 +3241,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
&new.de, &new.inlined);
+ if (IS_ERR(new.bh)) {
+ retval = PTR_ERR(new.bh);
+ new.bh = NULL;
+ goto end_rename;
+ }
if (new.bh) {
if (!new.inode) {
brelse(new.bh);
@@ -3223,11 +3255,17 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC))
ext4_alloc_da_blocks(old.inode);
- handle = ext4_journal_start(old.dir, EXT4_HT_DIR,
- (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ credits = (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
+ if (!(flags & RENAME_WHITEOUT)) {
+ handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ } else {
+ whiteout = ext4_whiteout_for_rename(&old, credits, &handle);
+ if (IS_ERR(whiteout))
+ return PTR_ERR(whiteout);
+ }
if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
ext4_handle_sync(handle);
@@ -3246,16 +3284,41 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (retval)
goto end_rename;
}
+ /*
+ * If we're renaming a file within an inline_data dir and adding or
+ * setting the new dirent causes a conversion from inline_data to
+ * extents/blockmap, we need to force the dirent delete code to
+ * re-read the directory, or else we end up trying to delete a dirent
+ * from what is now the extent tree root (or a block map).
+ */
+ force_reread = (new.dir->i_ino == old.dir->i_ino &&
+ ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA));
+
+ old_file_type = old.de->file_type;
+ if (whiteout) {
+ /*
+ * Do this before adding a new entry, so the old entry is sure
+ * to be still pointing to the valid old entry.
+ */
+ retval = ext4_setent(handle, &old, whiteout->i_ino,
+ EXT4_FT_CHRDEV);
+ if (retval)
+ goto end_rename;
+ ext4_mark_inode_dirty(handle, whiteout);
+ }
if (!new.bh) {
retval = ext4_add_entry(handle, new.dentry, old.inode);
if (retval)
goto end_rename;
} else {
retval = ext4_setent(handle, &new,
- old.inode->i_ino, old.de->file_type);
+ old.inode->i_ino, old_file_type);
if (retval)
goto end_rename;
}
+ if (force_reread)
+ force_reread = !ext4_test_inode_flag(new.dir,
+ EXT4_INODE_INLINE_DATA);
/*
* Like most other Unix systems, set the ctime for inodes on a
@@ -3264,10 +3327,12 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
old.inode->i_ctime = ext4_current_time(old.inode);
ext4_mark_inode_dirty(handle, old.inode);
- /*
- * ok, that's it
- */
- ext4_rename_delete(handle, &old);
+ if (!whiteout) {
+ /*
+ * ok, that's it
+ */
+ ext4_rename_delete(handle, &old, force_reread);
+ }
if (new.inode) {
ext4_dec_count(handle, new.inode);
@@ -3303,6 +3368,12 @@ end_rename:
brelse(old.dir_bh);
brelse(old.bh);
brelse(new.bh);
+ if (whiteout) {
+ if (retval)
+ drop_nlink(whiteout);
+ unlock_new_inode(whiteout);
+ iput(whiteout);
+ }
if (handle)
ext4_journal_stop(handle);
return retval;
@@ -3330,6 +3401,8 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
old.bh = ext4_find_entry(old.dir, &old.dentry->d_name,
&old.de, &old.inlined);
+ if (IS_ERR(old.bh))
+ return PTR_ERR(old.bh);
/*
* Check for inode number is _not_ due to possible IO errors.
* We might rmdir the source, keep it as pwd of some process
@@ -3342,6 +3415,11 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
&new.de, &new.inlined);
+ if (IS_ERR(new.bh)) {
+ retval = PTR_ERR(new.bh);
+ new.bh = NULL;
+ goto end_rename;
+ }
/* RENAME_EXCHANGE case: old *and* new must both exist */
if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino)
@@ -3428,18 +3506,15 @@ static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
- if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
if (flags & RENAME_EXCHANGE) {
return ext4_cross_rename(old_dir, old_dentry,
new_dir, new_dentry);
}
- /*
- * Existence checking was done by the VFS, otherwise "RENAME_NOREPLACE"
- * is equivalent to regular rename.
- */
- return ext4_rename(old_dir, old_dentry, new_dir, new_dentry);
+
+ return ext4_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
}
/*
@@ -3455,7 +3530,6 @@ const struct inode_operations ext4_dir_inode_operations = {
.rmdir = ext4_rmdir,
.mknod = ext4_mknod,
.tmpfile = ext4_tmpfile,
- .rename = ext4_rename,
.rename2 = ext4_rename2,
.setattr = ext4_setattr,
.setxattr = generic_setxattr,
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index bb0e80f03e2e..ca4588388fc3 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -575,6 +575,7 @@ handle_bb:
bh = bclean(handle, sb, block);
if (IS_ERR(bh)) {
err = PTR_ERR(bh);
+ bh = NULL;
goto out;
}
overhead = ext4_group_overhead_blocks(sb, group);
@@ -603,6 +604,7 @@ handle_ib:
bh = bclean(handle, sb, block);
if (IS_ERR(bh)) {
err = PTR_ERR(bh);
+ bh = NULL;
goto out;
}
@@ -1079,7 +1081,7 @@ static void update_backups(struct super_block *sb, int blk_off, char *data,
break;
if (meta_bg == 0)
- backup_block = group * bpg + blk_off;
+ backup_block = ((ext4_fsblk_t)group) * bpg + blk_off;
else
backup_block = (ext4_group_first_block_no(sb, group) +
ext4_bg_has_super(sb, group));
@@ -1210,8 +1212,7 @@ static int ext4_set_bitmap_checksums(struct super_block *sb,
{
struct buffer_head *bh;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(sb))
return 0;
bh = ext4_get_bitmap(sb, group_data->inode_bitmap);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 32b43ad154b9..2c9e6864abd9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -70,7 +70,6 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
static void ext4_clear_journal_err(struct super_block *sb,
struct ext4_super_block *es);
static int ext4_sync_fs(struct super_block *sb, int wait);
-static int ext4_sync_fs_nojournal(struct super_block *sb, int wait);
static int ext4_remount(struct super_block *sb, int *flags, char *data);
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
static int ext4_unfreeze(struct super_block *sb);
@@ -141,8 +140,7 @@ static __le32 ext4_superblock_csum(struct super_block *sb,
static int ext4_superblock_csum_verify(struct super_block *sb,
struct ext4_super_block *es)
{
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(sb))
return 1;
return es->s_checksum == ext4_superblock_csum(sb, es);
@@ -152,8 +150,7 @@ void ext4_superblock_csum_set(struct super_block *sb)
{
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(sb))
return;
es->s_checksum = ext4_superblock_csum(sb, es);
@@ -820,10 +817,9 @@ static void ext4_put_super(struct super_block *sb)
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
- percpu_counter_destroy(&sbi->s_extent_cache_cnt);
brelse(sbi->s_sbh);
#ifdef CONFIG_QUOTA
- for (i = 0; i < MAXQUOTAS; i++)
+ for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(sbi->s_qf_names[i]);
#endif
@@ -885,6 +881,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ext4_es_init_tree(&ei->i_es_tree);
rwlock_init(&ei->i_es_lock);
INIT_LIST_HEAD(&ei->i_es_lru);
+ ei->i_es_all_nr = 0;
ei->i_es_lru_nr = 0;
ei->i_touch_when = 0;
ei->i_reserved_data_blocks = 0;
@@ -1002,7 +999,7 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb,
* Currently we don't know the generation for parent directory, so
* a generation of 0 means "accept any"
*/
- inode = ext4_iget(sb, ino);
+ inode = ext4_iget_normal(sb, ino);
if (IS_ERR(inode))
return ERR_CAST(inode);
if (generation && inode->i_generation != generation) {
@@ -1124,25 +1121,6 @@ static const struct super_operations ext4_sops = {
.bdev_try_to_free_page = bdev_try_to_free_page,
};
-static const struct super_operations ext4_nojournal_sops = {
- .alloc_inode = ext4_alloc_inode,
- .destroy_inode = ext4_destroy_inode,
- .write_inode = ext4_write_inode,
- .dirty_inode = ext4_dirty_inode,
- .drop_inode = ext4_drop_inode,
- .evict_inode = ext4_evict_inode,
- .sync_fs = ext4_sync_fs_nojournal,
- .put_super = ext4_put_super,
- .statfs = ext4_statfs,
- .remount_fs = ext4_remount,
- .show_options = ext4_show_options,
-#ifdef CONFIG_QUOTA
- .quota_read = ext4_quota_read,
- .quota_write = ext4_quota_write,
-#endif
- .bdev_try_to_free_page = bdev_try_to_free_page,
-};
-
static const struct export_operations ext4_export_ops = {
.fh_to_dentry = ext4_fh_to_dentry,
.fh_to_parent = ext4_fh_to_parent,
@@ -1712,13 +1690,6 @@ static int parse_options(char *options, struct super_block *sb,
"not specified");
return 0;
}
- } else {
- if (sbi->s_jquota_fmt) {
- ext4_msg(sb, KERN_ERR, "journaled quota format "
- "specified with no journaling "
- "enabled");
- return 0;
- }
}
#endif
if (test_opt(sb, DIOREAD_NOLOCK)) {
@@ -2016,8 +1987,7 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
__u16 crc = 0;
__le32 le_group = cpu_to_le32(block_group);
- if ((sbi->s_es->s_feature_ro_compat &
- cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) {
+ if (ext4_has_metadata_csum(sbi->s_sb)) {
/* Use new metadata_csum algorithm */
__le16 save_csum;
__u32 csum32;
@@ -2035,6 +2005,10 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
}
/* old crc16 code */
+ if (!(sbi->s_es->s_feature_ro_compat &
+ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)))
+ return 0;
+
offset = offsetof(struct ext4_group_desc, bg_checksum);
crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
@@ -2191,7 +2165,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
/* don't clear list on RO mount w/ errors */
if (es->s_last_orphan && !(s_flags & MS_RDONLY)) {
- jbd_debug(1, "Errors on filesystem, "
+ ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
"clearing orphan list.\n");
es->s_last_orphan = 0;
}
@@ -2207,7 +2181,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
/* Needed for iput() to work correctly and not trash data */
sb->s_flags |= MS_ACTIVE;
/* Turn on quotas so that they are updated correctly */
- for (i = 0; i < MAXQUOTAS; i++) {
+ for (i = 0; i < EXT4_MAXQUOTAS; i++) {
if (EXT4_SB(sb)->s_qf_names[i]) {
int ret = ext4_quota_on_mount(sb, i);
if (ret < 0)
@@ -2263,7 +2237,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
PLURAL(nr_truncates));
#ifdef CONFIG_QUOTA
/* Turn quotas off */
- for (i = 0; i < MAXQUOTAS; i++) {
+ for (i = 0; i < EXT4_MAXQUOTAS; i++) {
if (sb_dqopt(sb)->files[i])
dquot_quota_off(sb, i);
}
@@ -2548,6 +2522,16 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
return count;
}
+static ssize_t es_ui_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+
+ unsigned int *ui = (unsigned int *) (((char *) sbi->s_es) +
+ a->u.offset);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
+}
+
static ssize_t reserved_clusters_show(struct ext4_attr *a,
struct ext4_sb_info *sbi, char *buf)
{
@@ -2601,14 +2585,29 @@ static struct ext4_attr ext4_attr_##_name = { \
.offset = offsetof(struct ext4_sb_info, _elname),\
}, \
}
+
+#define EXT4_ATTR_OFFSET_ES(_name,_mode,_show,_store,_elname) \
+static struct ext4_attr ext4_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .show = _show, \
+ .store = _store, \
+ .u = { \
+ .offset = offsetof(struct ext4_super_block, _elname), \
+ }, \
+}
+
#define EXT4_ATTR(name, mode, show, store) \
static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
+
+#define EXT4_RO_ATTR_ES_UI(name, elname) \
+ EXT4_ATTR_OFFSET_ES(name, 0444, es_ui_show, NULL, elname)
#define EXT4_RW_ATTR_SBI_UI(name, elname) \
EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
+
#define ATTR_LIST(name) &ext4_attr_##name.attr
#define EXT4_DEPRECATED_ATTR(_name, _val) \
static struct ext4_attr ext4_attr_##_name = { \
@@ -2641,6 +2640,9 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.int
EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
+EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
+EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time);
+EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time);
static struct attribute *ext4_attrs[] = {
ATTR_LIST(delayed_allocation_blocks),
@@ -2664,6 +2666,9 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(warning_ratelimit_burst),
ATTR_LIST(msg_ratelimit_interval_ms),
ATTR_LIST(msg_ratelimit_burst),
+ ATTR_LIST(errors_count),
+ ATTR_LIST(first_error_time),
+ ATTR_LIST(last_error_time),
NULL,
};
@@ -2723,9 +2728,25 @@ static void ext4_feat_release(struct kobject *kobj)
complete(&ext4_feat->f_kobj_unregister);
}
+static ssize_t ext4_feat_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "supported\n");
+}
+
+/*
+ * We can not use ext4_attr_show/store because it relies on the kobject
+ * being embedded in the ext4_sb_info structure which is definitely not
+ * true in this case.
+ */
+static const struct sysfs_ops ext4_feat_ops = {
+ .show = ext4_feat_show,
+ .store = NULL,
+};
+
static struct kobj_type ext4_feat_ktype = {
.default_attrs = ext4_feat_attrs,
- .sysfs_ops = &ext4_attr_ops,
+ .sysfs_ops = &ext4_feat_ops,
.release = ext4_feat_release,
};
@@ -3179,17 +3200,20 @@ static int set_journal_csum_feature_set(struct super_block *sb)
int compat, incompat;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
- /* journal checksum v2 */
+ if (ext4_has_metadata_csum(sb)) {
+ /* journal checksum v3 */
compat = 0;
- incompat = JBD2_FEATURE_INCOMPAT_CSUM_V2;
+ incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;
} else {
/* journal checksum v1 */
compat = JBD2_FEATURE_COMPAT_CHECKSUM;
incompat = 0;
}
+ jbd2_journal_clear_features(sbi->s_journal,
+ JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+ JBD2_FEATURE_INCOMPAT_CSUM_V3 |
+ JBD2_FEATURE_INCOMPAT_CSUM_V2);
if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
ret = jbd2_journal_set_features(sbi->s_journal,
compat, 0,
@@ -3202,10 +3226,8 @@ static int set_journal_csum_feature_set(struct super_block *sb)
jbd2_journal_clear_features(sbi->s_journal, 0, 0,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
} else {
- jbd2_journal_clear_features(sbi->s_journal,
- JBD2_FEATURE_COMPAT_CHECKSUM, 0,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
- JBD2_FEATURE_INCOMPAT_CSUM_V2);
+ jbd2_journal_clear_features(sbi->s_journal, 0, 0,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
}
return ret;
@@ -3435,7 +3457,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
logical_sb_block = sb_block;
}
- if (!(bh = sb_bread(sb, logical_sb_block))) {
+ if (!(bh = sb_bread_unmovable(sb, logical_sb_block))) {
ext4_msg(sb, KERN_ERR, "unable to read superblock");
goto out_fail;
}
@@ -3486,8 +3508,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
/* Precompute checksum seed for all metadata */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (ext4_has_metadata_csum(sb))
sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
sizeof(es->s_uuid));
@@ -3505,6 +3526,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_EXT4_FS_POSIX_ACL
set_opt(sb, POSIX_ACL);
#endif
+ /* don't forget to enable journal_csum when metadata_csum is enabled. */
+ if (ext4_has_metadata_csum(sb))
+ set_opt(sb, JOURNAL_CHECKSUM);
+
if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
set_opt(sb, JOURNAL_DATA);
else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
@@ -3518,8 +3543,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
set_opt(sb, ERRORS_CONT);
else
set_opt(sb, ERRORS_RO);
- if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)
- set_opt(sb, BLOCK_VALIDITY);
+ /* block_validity enabled by default; disable with noblock_validity */
+ set_opt(sb, BLOCK_VALIDITY);
if (def_mount_opts & EXT4_DEFM_DISCARD)
set_opt(sb, DISCARD);
@@ -3645,7 +3670,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
brelse(bh);
logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
offset = do_div(logical_sb_block, blocksize);
- bh = sb_bread(sb, logical_sb_block);
+ bh = sb_bread_unmovable(sb, logical_sb_block);
if (!bh) {
ext4_msg(sb, KERN_ERR,
"Can't read superblock on 2nd try");
@@ -3867,7 +3892,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
for (i = 0; i < db_count; i++) {
block = descriptor_loc(sb, logical_sb_block, i);
- sbi->s_group_desc[i] = sb_bread(sb, block);
+ sbi->s_group_desc[i] = sb_bread_unmovable(sb, block);
if (!sbi->s_group_desc[i]) {
ext4_msg(sb, KERN_ERR,
"can't read group descriptor %d", i);
@@ -3889,12 +3914,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_err_report.data = (unsigned long) sb;
/* Register extent status tree shrinker */
- ext4_es_register_shrinker(sbi);
-
- if ((err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0)) != 0) {
- ext4_msg(sb, KERN_ERR, "insufficient memory");
+ if (ext4_es_register_shrinker(sbi))
goto failed_mount3;
- }
sbi->s_stripe = ext4_get_stripe_size(sbi);
sbi->s_extent_max_zeroout_kb = 32;
@@ -3902,11 +3923,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
/*
* set up enough so that it can read an inode
*/
- if (!test_opt(sb, NOLOAD) &&
- EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
- sb->s_op = &ext4_sops;
- else
- sb->s_op = &ext4_nojournal_sops;
+ sb->s_op = &ext4_sops;
sb->s_export_op = &ext4_export_ops;
sb->s_xattr = ext4_xattr_handlers;
#ifdef CONFIG_QUOTA
@@ -3930,7 +3947,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
!(sb->s_flags & MS_RDONLY))
if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
- goto failed_mount3;
+ goto failed_mount3a;
/*
* The first inode we look at is the journal inode. Don't try
@@ -3939,7 +3956,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (!test_opt(sb, NOLOAD) &&
EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
if (ext4_load_journal(sb, es, journal_devnum))
- goto failed_mount3;
+ goto failed_mount3a;
} else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
ext4_msg(sb, KERN_ERR, "required journal recovery "
@@ -4105,17 +4122,20 @@ no_journal:
block = ext4_count_free_clusters(sb);
ext4_free_blocks_count_set(sbi->s_es,
EXT4_C2B(sbi, block));
- err = percpu_counter_init(&sbi->s_freeclusters_counter, block);
+ err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
+ GFP_KERNEL);
if (!err) {
unsigned long freei = ext4_count_free_inodes(sb);
sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
- err = percpu_counter_init(&sbi->s_freeinodes_counter, freei);
+ err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
+ GFP_KERNEL);
}
if (!err)
err = percpu_counter_init(&sbi->s_dirs_counter,
- ext4_count_dirs(sb));
+ ext4_count_dirs(sb), GFP_KERNEL);
if (!err)
- err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0);
+ err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
+ GFP_KERNEL);
if (err) {
ext4_msg(sb, KERN_ERR, "insufficient memory");
goto failed_mount6;
@@ -4224,10 +4244,10 @@ failed_mount_wq:
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
}
-failed_mount3:
+failed_mount3a:
ext4_es_unregister_shrinker(sbi);
+failed_mount3:
del_timer_sync(&sbi->s_err_report);
- percpu_counter_destroy(&sbi->s_extent_cache_cnt);
if (sbi->s_mmp_tsk)
kthread_stop(sbi->s_mmp_tsk);
failed_mount2:
@@ -4242,7 +4262,7 @@ failed_mount:
remove_proc_entry(sb->s_id, ext4_proc_root);
}
#ifdef CONFIG_QUOTA
- for (i = 0; i < MAXQUOTAS; i++)
+ for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(sbi->s_qf_names[i]);
#endif
ext4_blkdev_remove(sbi);
@@ -4370,6 +4390,15 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
goto out_bdev;
}
+ if ((le32_to_cpu(es->s_feature_ro_compat) &
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
+ es->s_checksum != ext4_superblock_csum(sb, es)) {
+ ext4_msg(sb, KERN_ERR, "external journal has "
+ "corrupt superblock");
+ brelse(bh);
+ goto out_bdev;
+ }
+
if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
ext4_msg(sb, KERN_ERR, "journal UUID does not match");
brelse(bh);
@@ -4672,15 +4701,19 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
* being sent at the end of the function. But we can skip it if
* transaction_commit will do it for us.
*/
- target = jbd2_get_latest_transaction(sbi->s_journal);
- if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
- !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
+ if (sbi->s_journal) {
+ target = jbd2_get_latest_transaction(sbi->s_journal);
+ if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
+ !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
+ needs_barrier = true;
+
+ if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
+ if (wait)
+ ret = jbd2_log_wait_commit(sbi->s_journal,
+ target);
+ }
+ } else if (wait && test_opt(sb, BARRIER))
needs_barrier = true;
-
- if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
- if (wait)
- ret = jbd2_log_wait_commit(sbi->s_journal, target);
- }
if (needs_barrier) {
int err;
err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
@@ -4691,19 +4724,6 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
return ret;
}
-static int ext4_sync_fs_nojournal(struct super_block *sb, int wait)
-{
- int ret = 0;
-
- trace_ext4_sync_fs(sb, wait);
- flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
- dquot_writeback_dquots(sb, -1);
- if (wait && test_opt(sb, BARRIER))
- ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
-
- return ret;
-}
-
/*
* LVM calls this function before a (read-only) snapshot is created. This
* gives us a chance to flush the journal completely and mark the fs clean.
@@ -4722,23 +4742,26 @@ static int ext4_freeze(struct super_block *sb)
journal = EXT4_SB(sb)->s_journal;
- /* Now we set up the journal barrier. */
- jbd2_journal_lock_updates(journal);
+ if (journal) {
+ /* Now we set up the journal barrier. */
+ jbd2_journal_lock_updates(journal);
- /*
- * Don't clear the needs_recovery flag if we failed to flush
- * the journal.
- */
- error = jbd2_journal_flush(journal);
- if (error < 0)
- goto out;
+ /*
+ * Don't clear the needs_recovery flag if we failed to
+ * flush the journal.
+ */
+ error = jbd2_journal_flush(journal);
+ if (error < 0)
+ goto out;
+ }
/* Journal blocked and flushed, clear needs_recovery flag. */
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
error = ext4_commit_super(sb, 1);
out:
- /* we rely on upper layer to stop further updates */
- jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ if (journal)
+ /* we rely on upper layer to stop further updates */
+ jbd2_journal_unlock_updates(journal);
return error;
}
@@ -4769,7 +4792,7 @@ struct ext4_mount_options {
u32 s_min_batch_time, s_max_batch_time;
#ifdef CONFIG_QUOTA
int s_jquota_fmt;
- char *s_qf_names[MAXQUOTAS];
+ char *s_qf_names[EXT4_MAXQUOTAS];
#endif
};
@@ -4799,7 +4822,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
old_opts.s_max_batch_time = sbi->s_max_batch_time;
#ifdef CONFIG_QUOTA
old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
- for (i = 0; i < MAXQUOTAS; i++)
+ for (i = 0; i < EXT4_MAXQUOTAS; i++)
if (sbi->s_qf_names[i]) {
old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
GFP_KERNEL);
@@ -4823,6 +4846,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
+ if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
+ test_opt(sb, JOURNAL_CHECKSUM)) {
+ ext4_msg(sb, KERN_ERR, "changing journal_checksum "
+ "during remount not supported");
+ err = -EINVAL;
+ goto restore_opts;
+ }
+
if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
if (test_opt2(sb, EXPLICIT_DELALLOC)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
@@ -4960,7 +4991,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
#ifdef CONFIG_QUOTA
/* Release old quota file names */
- for (i = 0; i < MAXQUOTAS; i++)
+ for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(old_opts.s_qf_names[i]);
if (enable_quota) {
if (sb_any_quota_suspended(sb))
@@ -4989,7 +5020,7 @@ restore_opts:
sbi->s_max_batch_time = old_opts.s_max_batch_time;
#ifdef CONFIG_QUOTA
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
- for (i = 0; i < MAXQUOTAS; i++) {
+ for (i = 0; i < EXT4_MAXQUOTAS; i++) {
kfree(sbi->s_qf_names[i]);
sbi->s_qf_names[i] = old_opts.s_qf_names[i];
}
@@ -5192,7 +5223,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
{
int err;
struct inode *qf_inode;
- unsigned long qf_inums[MAXQUOTAS] = {
+ unsigned long qf_inums[EXT4_MAXQUOTAS] = {
le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
};
@@ -5220,13 +5251,13 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
static int ext4_enable_quotas(struct super_block *sb)
{
int type, err = 0;
- unsigned long qf_inums[MAXQUOTAS] = {
+ unsigned long qf_inums[EXT4_MAXQUOTAS] = {
le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
};
sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
- for (type = 0; type < MAXQUOTAS; type++) {
+ for (type = 0; type < EXT4_MAXQUOTAS; type++) {
if (qf_inums[type]) {
err = ext4_quota_enable(sb, type, QFMT_VFS_V1,
DQUOT_USAGE_ENABLED);
@@ -5304,7 +5335,6 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
{
struct inode *inode = sb_dqopt(sb)->files[type];
ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
- int err = 0;
int offset = off & (sb->s_blocksize - 1);
int tocopy;
size_t toread;
@@ -5319,9 +5349,9 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
while (toread > 0) {
tocopy = sb->s_blocksize - offset < toread ?
sb->s_blocksize - offset : toread;
- bh = ext4_bread(NULL, inode, blk, 0, &err);
- if (err)
- return err;
+ bh = ext4_bread(NULL, inode, blk, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
if (!bh) /* A hole? */
memset(data, 0, tocopy);
else
@@ -5342,8 +5372,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
{
struct inode *inode = sb_dqopt(sb)->files[type];
ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
- int err = 0;
- int offset = off & (sb->s_blocksize - 1);
+ int err, offset = off & (sb->s_blocksize - 1);
struct buffer_head *bh;
handle_t *handle = journal_current_handle();
@@ -5364,14 +5393,16 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
return -EIO;
}
- bh = ext4_bread(handle, inode, blk, 1, &err);
+ bh = ext4_bread(handle, inode, blk, 1);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
if (!bh)
goto out;
BUFFER_TRACE(bh, "get write access");
err = ext4_journal_get_write_access(handle, bh);
if (err) {
brelse(bh);
- goto out;
+ return err;
}
lock_buffer(bh);
memcpy(bh->b_data+offset, data, len);
@@ -5380,8 +5411,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
err = ext4_handle_dirty_metadata(handle, NULL, bh);
brelse(bh);
out:
- if (err)
- return err;
if (inode->i_size < off + len) {
i_size_write(inode, off + len);
EXT4_I(inode)->i_disksize = inode->i_size;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index e7387337060c..1e09fc77395c 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -142,8 +142,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode,
sector_t block_nr,
struct ext4_xattr_header *hdr)
{
- if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
+ if (ext4_has_metadata_csum(inode->i_sb) &&
(hdr->h_checksum != ext4_xattr_block_csum(inode, block_nr, hdr)))
return 0;
return 1;
@@ -153,8 +152,7 @@ static void ext4_xattr_block_csum_set(struct inode *inode,
sector_t block_nr,
struct ext4_xattr_header *hdr)
{
- if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(inode->i_sb))
return;
hdr->h_checksum = ext4_xattr_block_csum(inode, block_nr, hdr);
@@ -190,14 +188,28 @@ ext4_listxattr(struct dentry *dentry, char *buffer, size_t size)
}
static int
-ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end)
+ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end,
+ void *value_start)
{
- while (!IS_LAST_ENTRY(entry)) {
- struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(entry);
+ struct ext4_xattr_entry *e = entry;
+
+ while (!IS_LAST_ENTRY(e)) {
+ struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e);
if ((void *)next >= end)
return -EIO;
- entry = next;
+ e = next;
}
+
+ while (!IS_LAST_ENTRY(entry)) {
+ if (entry->e_value_size != 0 &&
+ (value_start + le16_to_cpu(entry->e_value_offs) <
+ (void *)e + sizeof(__u32) ||
+ value_start + le16_to_cpu(entry->e_value_offs) +
+ le32_to_cpu(entry->e_value_size) > end))
+ return -EIO;
+ entry = EXT4_XATTR_NEXT(entry);
+ }
+
return 0;
}
@@ -214,7 +226,8 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh)
return -EIO;
if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh)))
return -EIO;
- error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
+ error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size,
+ bh->b_data);
if (!error)
set_buffer_verified(bh);
return error;
@@ -331,7 +344,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
header = IHDR(inode, raw_inode);
entry = IFIRST(header);
end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
- error = ext4_xattr_check_names(entry, end);
+ error = ext4_xattr_check_names(entry, end, entry);
if (error)
goto cleanup;
error = ext4_xattr_find_entry(&entry, name_index, name,
@@ -463,7 +476,7 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
raw_inode = ext4_raw_inode(&iloc);
header = IHDR(inode, raw_inode);
end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
- error = ext4_xattr_check_names(IFIRST(header), end);
+ error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header));
if (error)
goto cleanup;
error = ext4_xattr_list_entries(dentry, IFIRST(header),
@@ -899,14 +912,8 @@ inserted:
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
- /*
- * take i_data_sem because we will test
- * i_delalloc_reserved_flag in ext4_mb_new_blocks
- */
- down_read(&EXT4_I(inode)->i_data_sem);
block = ext4_new_meta_blocks(handle, inode, goal, 0,
NULL, &error);
- up_read((&EXT4_I(inode)->i_data_sem));
if (error)
goto cleanup;
@@ -986,7 +993,8 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
is->s.here = is->s.first;
is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
- error = ext4_xattr_check_names(IFIRST(header), is->s.end);
+ error = ext4_xattr_check_names(IFIRST(header), is->s.end,
+ IFIRST(header));
if (error)
return error;
/* Find the named attribute. */
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 214fe1054fce..736a348509f7 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -23,7 +23,7 @@ config F2FS_STAT_FS
mounted as f2fs. Each file shows the whole f2fs information.
/sys/kernel/debug/f2fs/status includes:
- - major file system information managed by f2fs currently
+ - major filesystem information managed by f2fs currently
- average SIT information about whole segments
- current memory footprint consumed by f2fs.
@@ -68,6 +68,6 @@ config F2FS_CHECK_FS
bool "F2FS consistency checking feature"
depends on F2FS_FS
help
- Enables BUG_ONs which check the file system consistency in runtime.
+ Enables BUG_ONs which check the filesystem consistency in runtime.
If you want to improve the performance, say N.
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 6aeed5bada52..dd10a031c052 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -72,7 +72,22 @@ out:
return page;
}
-static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
+struct page *get_meta_page_ra(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+ bool readahead = false;
+ struct page *page;
+
+ page = find_get_page(META_MAPPING(sbi), index);
+ if (!page || (page && !PageUptodate(page)))
+ readahead = true;
+ f2fs_put_page(page, 0);
+
+ if (readahead)
+ ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR);
+ return get_meta_page(sbi, index);
+}
+
+static inline block_t get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
{
switch (type) {
case META_NAT:
@@ -82,6 +97,8 @@ static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
case META_SSA:
case META_CP:
return 0;
+ case META_POR:
+ return MAX_BLKADDR(sbi);
default:
BUG();
}
@@ -90,12 +107,12 @@ static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
/*
* Readahead CP/NAT/SIT/SSA pages
*/
-int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type)
+int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type)
{
block_t prev_blk_addr = 0;
struct page *page;
- int blkno = start;
- int max_blks = get_max_meta_blks(sbi, type);
+ block_t blkno = start;
+ block_t max_blks = get_max_meta_blks(sbi, type);
struct f2fs_io_info fio = {
.type = META,
@@ -125,7 +142,11 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type)
break;
case META_SSA:
case META_CP:
- /* get ssa/cp block addr */
+ case META_POR:
+ if (unlikely(blkno >= max_blks))
+ goto out;
+ if (unlikely(blkno < SEG0_BLKADDR(sbi)))
+ goto out;
blk_addr = blkno;
break;
default:
@@ -151,8 +172,7 @@ out:
static int f2fs_write_meta_page(struct page *page,
struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_P_SB(page);
trace_f2fs_writepage(page, META);
@@ -160,14 +180,11 @@ static int f2fs_write_meta_page(struct page *page,
goto redirty_out;
if (wbc->for_reclaim)
goto redirty_out;
-
- /* Should not write any meta pages, if any IO error was occurred */
- if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
- goto no_write;
+ if (unlikely(f2fs_cp_error(sbi)))
+ goto redirty_out;
f2fs_wait_on_page_writeback(page, META);
write_meta_page(sbi, page);
-no_write:
dec_page_count(sbi, F2FS_DIRTY_META);
unlock_page(page);
return 0;
@@ -180,7 +197,7 @@ redirty_out:
static int f2fs_write_meta_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
long diff, written;
trace_f2fs_writepages(mapping->host, wbc, META);
@@ -262,15 +279,12 @@ continue_unlock:
static int f2fs_set_meta_page_dirty(struct page *page)
{
- struct address_space *mapping = page->mapping;
- struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
-
trace_f2fs_set_page_dirty(page, META);
SetPageUptodate(page);
if (!PageDirty(page)) {
__set_page_dirty_nobuffers(page);
- inc_page_count(sbi, F2FS_DIRTY_META);
+ inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
return 1;
}
return 0;
@@ -348,7 +362,7 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
return e ? true : false;
}
-static void release_dirty_inode(struct f2fs_sb_info *sbi)
+void release_dirty_inode(struct f2fs_sb_info *sbi)
{
struct ino_entry *e, *tmp;
int i;
@@ -381,7 +395,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi)
void release_orphan_inode(struct f2fs_sb_info *sbi)
{
spin_lock(&sbi->ino_lock[ORPHAN_INO]);
- f2fs_bug_on(sbi->n_orphans == 0);
+ f2fs_bug_on(sbi, sbi->n_orphans == 0);
sbi->n_orphans--;
spin_unlock(&sbi->ino_lock[ORPHAN_INO]);
}
@@ -401,7 +415,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
struct inode *inode = f2fs_iget(sbi->sb, ino);
- f2fs_bug_on(IS_ERR(inode));
+ f2fs_bug_on(sbi, IS_ERR(inode));
clear_nlink(inode);
/* truncate all the data during iput */
@@ -446,8 +460,8 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
struct f2fs_orphan_block *orphan_blk = NULL;
unsigned int nentries = 0;
unsigned short index;
- unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans +
- (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK);
+ unsigned short orphan_blocks =
+ (unsigned short)GET_ORPHAN_BLOCKS(sbi->n_orphans);
struct page *page = NULL;
struct ino_entry *orphan = NULL;
@@ -462,7 +476,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
list_for_each_entry(orphan, head, list) {
if (!page) {
page = find_get_page(META_MAPPING(sbi), start_blk++);
- f2fs_bug_on(!page);
+ f2fs_bug_on(sbi, !page);
orphan_blk =
(struct f2fs_orphan_block *)page_address(page);
memset(orphan_blk, 0, sizeof(*orphan_blk));
@@ -622,7 +636,7 @@ fail_no_cp:
static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR))
return -EEXIST;
@@ -634,32 +648,38 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
return 0;
}
-void set_dirty_dir_page(struct inode *inode, struct page *page)
+void update_dirty_page(struct inode *inode, struct page *page)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dir_inode_entry *new;
int ret = 0;
- if (!S_ISDIR(inode->i_mode))
+ if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
return;
+ if (!S_ISDIR(inode->i_mode)) {
+ inode_inc_dirty_pages(inode);
+ goto out;
+ }
+
new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
new->inode = inode;
INIT_LIST_HEAD(&new->list);
spin_lock(&sbi->dir_inode_lock);
ret = __add_dirty_inode(inode, new);
- inode_inc_dirty_dents(inode);
- SetPagePrivate(page);
+ inode_inc_dirty_pages(inode);
spin_unlock(&sbi->dir_inode_lock);
if (ret)
kmem_cache_free(inode_entry_slab, new);
+out:
+ SetPagePrivate(page);
}
void add_dirty_dir_inode(struct inode *inode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dir_inode_entry *new =
f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
int ret = 0;
@@ -677,14 +697,14 @@ void add_dirty_dir_inode(struct inode *inode)
void remove_dirty_dir_inode(struct inode *inode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dir_inode_entry *entry;
if (!S_ISDIR(inode->i_mode))
return;
spin_lock(&sbi->dir_inode_lock);
- if (get_dirty_dents(inode) ||
+ if (get_dirty_pages(inode) ||
!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) {
spin_unlock(&sbi->dir_inode_lock);
return;
@@ -737,7 +757,7 @@ retry:
/*
* Freeze all the FS-operations for checkpoint.
*/
-static void block_operations(struct f2fs_sb_info *sbi)
+static int block_operations(struct f2fs_sb_info *sbi)
{
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
@@ -745,6 +765,7 @@ static void block_operations(struct f2fs_sb_info *sbi)
.for_reclaim = 0,
};
struct blk_plug plug;
+ int err = 0;
blk_start_plug(&plug);
@@ -754,11 +775,15 @@ retry_flush_dents:
if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
f2fs_unlock_all(sbi);
sync_dirty_dir_inodes(sbi);
+ if (unlikely(f2fs_cp_error(sbi))) {
+ err = -EIO;
+ goto out;
+ }
goto retry_flush_dents;
}
/*
- * POR: we should ensure that there is no dirty node pages
+ * POR: we should ensure that there are no dirty node pages
* until finishing nat/sit flush.
*/
retry_flush_nodes:
@@ -767,9 +792,16 @@ retry_flush_nodes:
if (get_pages(sbi, F2FS_DIRTY_NODES)) {
up_write(&sbi->node_write);
sync_node_pages(sbi, 0, &wbc);
+ if (unlikely(f2fs_cp_error(sbi))) {
+ f2fs_unlock_all(sbi);
+ err = -EIO;
+ goto out;
+ }
goto retry_flush_nodes;
}
+out:
blk_finish_plug(&plug);
+ return err;
}
static void unblock_operations(struct f2fs_sb_info *sbi)
@@ -793,11 +825,12 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
finish_wait(&sbi->cp_wait, &wait);
}
-static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
+static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
- nid_t last_nid = 0;
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ nid_t last_nid = nm_i->next_scan_nid;
block_t start_blk;
struct page *cp_page;
unsigned int data_sum_blocks, orphan_blocks;
@@ -813,8 +846,11 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg));
/* Flush all the NAT/SIT pages */
- while (get_pages(sbi, F2FS_DIRTY_META))
+ while (get_pages(sbi, F2FS_DIRTY_META)) {
sync_meta_pages(sbi, META, LONG_MAX);
+ if (unlikely(f2fs_cp_error(sbi)))
+ return;
+ }
next_free_nid(sbi, &last_nid);
@@ -825,7 +861,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi));
ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
- for (i = 0; i < 3; i++) {
+ for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
ckpt->cur_node_segno[i] =
cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE));
ckpt->cur_node_blkoff[i] =
@@ -833,7 +869,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
ckpt->alloc_type[i + CURSEG_HOT_NODE] =
curseg_alloc_type(sbi, i + CURSEG_HOT_NODE);
}
- for (i = 0; i < 3; i++) {
+ for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) {
ckpt->cur_data_segno[i] =
cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA));
ckpt->cur_data_blkoff[i] =
@@ -848,24 +884,23 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
/* 2 cp + n data seg summary + orphan inode blocks */
data_sum_blocks = npages_for_summary_flush(sbi);
- if (data_sum_blocks < 3)
+ if (data_sum_blocks < NR_CURSEG_DATA_TYPE)
set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
else
clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
- orphan_blocks = (sbi->n_orphans + F2FS_ORPHANS_PER_BLOCK - 1)
- / F2FS_ORPHANS_PER_BLOCK;
+ orphan_blocks = GET_ORPHAN_BLOCKS(sbi->n_orphans);
ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
orphan_blocks);
- if (is_umount) {
+ if (cpc->reason == CP_UMOUNT) {
set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
- ckpt->cp_pack_total_block_count = cpu_to_le32(2 +
+ ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+
cp_payload_blks + data_sum_blocks +
orphan_blocks + NR_CURSEG_NODE_TYPE);
} else {
clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
- ckpt->cp_pack_total_block_count = cpu_to_le32(2 +
+ ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
cp_payload_blks + data_sum_blocks +
orphan_blocks);
}
@@ -875,6 +910,9 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
else
clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
+ if (sbi->need_fsck)
+ set_ckpt_flags(ckpt, CP_FSCK_FLAG);
+
/* update SIT/NAT bitmap */
get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
@@ -909,7 +947,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
write_data_summaries(sbi, start_blk);
start_blk += data_sum_blocks;
- if (is_umount) {
+ if (cpc->reason == CP_UMOUNT) {
write_node_summaries(sbi, start_blk);
start_blk += NR_CURSEG_NODE_TYPE;
}
@@ -924,6 +962,9 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
/* wait for previous submitted node/meta pages writeback */
wait_on_all_pages_writeback(sbi);
+ if (unlikely(f2fs_cp_error(sbi)))
+ return;
+
filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX);
filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX);
@@ -934,27 +975,35 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
/* Here, we only have one bio having CP pack */
sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
- if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) {
- clear_prefree_segments(sbi);
- release_dirty_inode(sbi);
- F2FS_RESET_SB_DIRT(sbi);
- }
+ release_dirty_inode(sbi);
+
+ if (unlikely(f2fs_cp_error(sbi)))
+ return;
+
+ clear_prefree_segments(sbi);
+ F2FS_RESET_SB_DIRT(sbi);
}
/*
- * We guarantee that this checkpoint procedure should not fail.
+ * We guarantee that this checkpoint procedure will not fail.
*/
-void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
+void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
unsigned long long ckpt_ver;
- trace_f2fs_write_checkpoint(sbi->sb, is_umount, "start block_ops");
+ trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");
mutex_lock(&sbi->cp_mutex);
- block_operations(sbi);
- trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops");
+ if (!sbi->s_dirty && cpc->reason != CP_DISCARD)
+ goto out;
+ if (unlikely(f2fs_cp_error(sbi)))
+ goto out;
+ if (block_operations(sbi))
+ goto out;
+
+ trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");
f2fs_submit_merged_bio(sbi, DATA, WRITE);
f2fs_submit_merged_bio(sbi, NODE, WRITE);
@@ -970,16 +1019,16 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
/* write cached NAT/SIT entries to NAT/SIT area */
flush_nat_entries(sbi);
- flush_sit_entries(sbi);
+ flush_sit_entries(sbi, cpc);
/* unlock all the fs_lock[] in do_checkpoint() */
- do_checkpoint(sbi, is_umount);
+ do_checkpoint(sbi, cpc);
unblock_operations(sbi);
- mutex_unlock(&sbi->cp_mutex);
-
stat_inc_cp_count(sbi->stat_info);
- trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint");
+out:
+ mutex_unlock(&sbi->cp_mutex);
+ trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
}
void init_ino_entry_info(struct f2fs_sb_info *sbi)
@@ -999,8 +1048,8 @@ void init_ino_entry_info(struct f2fs_sb_info *sbi)
* for cp pack we can have max 1020*504 orphan entries
*/
sbi->n_orphans = 0;
- sbi->max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE)
- * F2FS_ORPHANS_PER_BLOCK;
+ sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS -
+ NR_CURSEG_TYPE) * F2FS_ORPHANS_PER_BLOCK;
}
int __init create_checkpoint_caches(void)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 03313099c51c..8e58c4cc2cb9 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -53,7 +53,7 @@ static void f2fs_write_end_io(struct bio *bio, int err)
struct page *page = bvec->bv_page;
if (unlikely(err)) {
- SetPageError(page);
+ set_page_dirty(page);
set_bit(AS_EIO, &page->mapping->flags);
f2fs_stop_checkpoint(sbi);
}
@@ -85,7 +85,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
bio = bio_alloc(GFP_NOIO, npages);
bio->bi_bdev = sbi->sb->s_bdev;
- bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
+ bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
bio->bi_private = sbi;
@@ -193,7 +193,7 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
__submit_merged_bio(io);
alloc_new:
if (io->bio == NULL) {
- int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
+ int bio_blocks = MAX_BIO_BLOCKS(sbi);
io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read);
io->fio = *fio;
@@ -236,7 +236,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
int reserve_new_block(struct dnode_of_data *dn)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
return -EPERM;
@@ -258,7 +258,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
int err;
/* if inode_page exists, index should be zero */
- f2fs_bug_on(!need_put && index);
+ f2fs_bug_on(F2FS_I_SB(dn->inode), !need_put && index);
err = get_dnode_of_data(dn, index, ALLOC_NODE);
if (err)
@@ -321,7 +321,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
block_t start_blkaddr, end_blkaddr;
int need_update = true;
- f2fs_bug_on(blk_addr == NEW_ADDR);
+ f2fs_bug_on(F2FS_I_SB(dn->inode), blk_addr == NEW_ADDR);
fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
dn->ofs_in_node;
@@ -396,7 +396,6 @@ end_update:
struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
struct page *page;
@@ -429,7 +428,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
return page;
}
- err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
+ err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, dn.data_blkaddr,
sync ? READ_SYNC : READA);
if (err)
return ERR_PTR(err);
@@ -451,7 +450,6 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
*/
struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
struct page *page;
@@ -490,7 +488,8 @@ repeat:
return page;
}
- err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC);
+ err = f2fs_submit_page_bio(F2FS_I_SB(inode), page,
+ dn.data_blkaddr, READ_SYNC);
if (err)
return ERR_PTR(err);
@@ -517,7 +516,6 @@ repeat:
struct page *get_new_data_page(struct inode *inode,
struct page *ipage, pgoff_t index, bool new_i_size)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct address_space *mapping = inode->i_mapping;
struct page *page;
struct dnode_of_data dn;
@@ -541,8 +539,8 @@ repeat:
zero_user_segment(page, 0, PAGE_CACHE_SIZE);
SetPageUptodate(page);
} else {
- err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
- READ_SYNC);
+ err = f2fs_submit_page_bio(F2FS_I_SB(inode), page,
+ dn.data_blkaddr, READ_SYNC);
if (err)
goto put_err;
@@ -573,10 +571,12 @@ put_err:
static int __allocate_data_block(struct dnode_of_data *dn)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
+ struct f2fs_inode_info *fi = F2FS_I(dn->inode);
struct f2fs_summary sum;
block_t new_blkaddr;
struct node_info ni;
+ pgoff_t fofs;
int type;
if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
@@ -599,6 +599,12 @@ static int __allocate_data_block(struct dnode_of_data *dn)
update_extent_cache(new_blkaddr, dn);
clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
+ /* update i_size */
+ fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
+ dn->ofs_in_node;
+ if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT))
+ i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT));
+
dn->data_blkaddr = new_blkaddr;
return 0;
}
@@ -614,7 +620,6 @@ static int __allocate_data_block(struct dnode_of_data *dn)
static int __get_data_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create, bool fiemap)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
unsigned int blkbits = inode->i_sb->s_blocksize_bits;
unsigned maxblocks = bh_result->b_size >> blkbits;
struct dnode_of_data dn;
@@ -630,8 +635,8 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
goto out;
if (create) {
- f2fs_balance_fs(sbi);
- f2fs_lock_op(sbi);
+ f2fs_balance_fs(F2FS_I_SB(inode));
+ f2fs_lock_op(F2FS_I_SB(inode));
}
/* When reading holes, we need its node page */
@@ -691,7 +696,7 @@ get_next:
allocated = true;
blkaddr = dn.data_blkaddr;
}
- /* Give more consecutive addresses for the read ahead */
+ /* Give more consecutive addresses for the readahead */
if (blkaddr == (bh_result->b_blocknr + ofs)) {
ofs++;
dn.ofs_in_node++;
@@ -707,7 +712,7 @@ put_out:
f2fs_put_dnode(&dn);
unlock_out:
if (create)
- f2fs_unlock_op(sbi);
+ f2fs_unlock_op(F2FS_I_SB(inode));
out:
trace_f2fs_get_data_block(inode, iblock, bh_result, err);
return err;
@@ -739,7 +744,7 @@ static int f2fs_read_data_page(struct file *file, struct page *page)
trace_f2fs_readpage(page, DATA);
- /* If the file has inline data, try to read it directlly */
+ /* If the file has inline data, try to read it directly */
if (f2fs_has_inline_data(inode))
ret = f2fs_read_inline_data(inode, page);
else
@@ -804,7 +809,7 @@ static int f2fs_write_data_page(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
loff_t i_size = i_size_read(inode);
const pgoff_t end_index = ((unsigned long long) i_size)
>> PAGE_CACHE_SHIFT;
@@ -836,10 +841,19 @@ write:
/* Dentry blocks are controlled by checkpoint */
if (S_ISDIR(inode->i_mode)) {
+ if (unlikely(f2fs_cp_error(sbi)))
+ goto redirty_out;
err = do_write_data_page(page, &fio);
goto done;
}
+ /* we should bypass data pages to proceed the kworkder jobs */
+ if (unlikely(f2fs_cp_error(sbi))) {
+ SetPageError(page);
+ unlock_page(page);
+ goto out;
+ }
+
if (!wbc->for_reclaim)
need_balance_fs = true;
else if (has_not_enough_free_secs(sbi, 0))
@@ -857,7 +871,7 @@ done:
clear_cold_data(page);
out:
- inode_dec_dirty_dents(inode);
+ inode_dec_dirty_pages(inode);
unlock_page(page);
if (need_balance_fs)
f2fs_balance_fs(sbi);
@@ -883,7 +897,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
bool locked = false;
int ret;
long diff;
@@ -895,7 +909,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
return 0;
if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE &&
- get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA) &&
+ get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) &&
available_free_memory(sbi, DIRTY_DENTS))
goto skip_write;
@@ -917,7 +931,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
return ret;
skip_write:
- wbc->pages_skipped += get_dirty_dents(inode);
+ wbc->pages_skipped += get_dirty_pages(inode);
return 0;
}
@@ -927,7 +941,7 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to)
if (to > inode->i_size) {
truncate_pagecache(inode, inode->i_size);
- truncate_blocks(inode, inode->i_size);
+ truncate_blocks(inode, inode->i_size, true);
}
}
@@ -936,7 +950,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *page;
pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
struct dnode_of_data dn;
@@ -946,7 +960,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
f2fs_balance_fs(sbi);
repeat:
- err = f2fs_convert_inline_data(inode, pos + len);
+ err = f2fs_convert_inline_data(inode, pos + len, NULL);
if (err)
goto fail;
@@ -1038,7 +1052,10 @@ static int f2fs_write_end(struct file *file,
trace_f2fs_write_end(inode, pos, len, copied);
- set_page_dirty(page);
+ if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
+ register_inmem_page(inode, page);
+ else
+ set_page_dirty(page);
if (pos + copied > i_size_read(inode)) {
i_size_write(inode, pos + copied);
@@ -1083,9 +1100,6 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
if (check_direct_IO(inode, rw, iter, offset))
return 0;
- /* clear fsync mark to recover these blocks */
- fsync_mark_clear(F2FS_SB(inode->i_sb), inode->i_ino);
-
trace_f2fs_direct_IO_enter(inode, offset, count, rw);
err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block);
@@ -1101,8 +1115,12 @@ static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
unsigned int length)
{
struct inode *inode = page->mapping->host;
+
+ if (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE)
+ return;
+
if (PageDirty(page))
- inode_dec_dirty_dents(inode);
+ inode_dec_dirty_pages(inode);
ClearPagePrivate(page);
}
@@ -1124,7 +1142,7 @@ static int f2fs_set_data_page_dirty(struct page *page)
if (!PageDirty(page)) {
__set_page_dirty_nobuffers(page);
- set_dirty_dir_page(inode, page);
+ update_dirty_page(inode, page);
return 1;
}
return 0;
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index a441ba33be11..0a91ab813a9e 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -32,7 +32,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
struct f2fs_stat_info *si = F2FS_STAT(sbi);
int i;
- /* valid check of the segment numbers */
+ /* validation check of the segment numbers */
si->hit_ext = sbi->read_hit_ext;
si->total_ext = sbi->total_hit_ext;
si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
@@ -93,7 +93,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
total_vblocks = 0;
blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg);
hblks_per_sec = blks_per_sec / 2;
- for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) {
+ for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
dist = abs(vblocks - hblks_per_sec);
bimodal += dist * dist;
@@ -103,7 +103,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
ndirty++;
}
}
- dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100;
+ dist = MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100;
si->bimodal = bimodal / dist;
if (si->dirty_count)
si->avg_vblocks = total_vblocks / ndirty;
@@ -131,17 +131,17 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
/* build sit */
si->base_mem += sizeof(struct sit_info);
- si->base_mem += TOTAL_SEGS(sbi) * sizeof(struct seg_entry);
- si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
- si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * TOTAL_SEGS(sbi);
+ si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry);
+ si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
+ si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
if (sbi->segs_per_sec > 1)
- si->base_mem += TOTAL_SECS(sbi) * sizeof(struct sec_entry);
+ si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry);
si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
/* build free segmap */
si->base_mem += sizeof(struct free_segmap_info);
- si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
- si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi));
+ si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
+ si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi));
/* build curseg */
si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE;
@@ -149,10 +149,10 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
/* build dirty segmap */
si->base_mem += sizeof(struct dirty_seglist_info);
- si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi));
- si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi));
+ si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(MAIN_SEGS(sbi));
+ si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi));
- /* buld nm */
+ /* build nm */
si->base_mem += sizeof(struct f2fs_nm_info);
si->base_mem += __bitmap_size(sbi, NAT_BITMAP);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index bcf893c3d903..b54f87149c09 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -124,9 +124,9 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
/*
* For the most part, it should be a bug when name_len is zero.
- * We stop here for figuring out where the bugs are occurred.
+ * We stop here for figuring out where the bugs has occurred.
*/
- f2fs_bug_on(!de->name_len);
+ f2fs_bug_on(F2FS_P_SB(dentry_page), !de->name_len);
bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
}
@@ -151,7 +151,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
bool room = false;
int max_slots = 0;
- f2fs_bug_on(level > MAX_DIR_HASH_DEPTH);
+ f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH);
nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
nblock = bucket_blocks(level);
@@ -284,10 +284,9 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage)
int update_dent_inode(struct inode *inode, const struct qstr *name)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct page *page;
- page = get_node_page(sbi, inode->i_ino);
+ page = get_node_page(F2FS_I_SB(inode), inode->i_ino);
if (IS_ERR(page))
return PTR_ERR(page);
@@ -337,7 +336,6 @@ static int make_empty_dir(struct inode *inode,
static struct page *init_inode_metadata(struct inode *inode,
struct inode *dir, const struct qstr *name)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
struct page *page;
int err;
@@ -360,7 +358,7 @@ static struct page *init_inode_metadata(struct inode *inode,
if (err)
goto put_error;
} else {
- page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
+ page = get_node_page(F2FS_I_SB(dir), inode->i_ino);
if (IS_ERR(page))
return page;
@@ -381,7 +379,7 @@ static struct page *init_inode_metadata(struct inode *inode,
* we should remove this inode from orphan list.
*/
if (inode->i_nlink == 0)
- remove_orphan_inode(sbi, inode->i_ino);
+ remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino);
inc_nlink(inode);
}
return page;
@@ -391,7 +389,7 @@ put_error:
error:
/* once the failed inode becomes a bad inode, i_mode is S_IFREG */
truncate_inode_pages(&inode->i_data, 0);
- truncate_blocks(inode, 0);
+ truncate_blocks(inode, 0, false);
remove_dirty_dir_inode(inode);
remove_inode_page(inode);
return ERR_PTR(err);
@@ -563,7 +561,7 @@ fail:
}
/*
- * It only removes the dentry from the dentry page,corresponding name
+ * It only removes the dentry from the dentry page, corresponding name
* entry in name page does not need to be touched during deletion.
*/
void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
@@ -571,8 +569,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
{
struct f2fs_dentry_block *dentry_blk;
unsigned int bit_pos;
- struct address_space *mapping = page->mapping;
- struct inode *dir = mapping->host;
+ struct inode *dir = page->mapping->host;
int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
int i;
@@ -594,7 +591,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
if (inode) {
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
down_write(&F2FS_I(inode)->i_sem);
@@ -621,7 +618,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
truncate_hole(dir, page->index, page->index + 1);
clear_page_dirty_for_io(page);
ClearPageUptodate(page);
- inode_dec_dirty_dents(dir);
+ inode_dec_dirty_pages(dir);
}
f2fs_put_page(page, 1);
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 4dab5338a97a..8171e80b2ee9 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -21,10 +21,16 @@
#include <linux/sched.h>
#ifdef CONFIG_F2FS_CHECK_FS
-#define f2fs_bug_on(condition) BUG_ON(condition)
+#define f2fs_bug_on(sbi, condition) BUG_ON(condition)
#define f2fs_down_write(x, y) down_write_nest_lock(x, y)
#else
-#define f2fs_bug_on(condition)
+#define f2fs_bug_on(sbi, condition) \
+ do { \
+ if (unlikely(condition)) { \
+ WARN_ON(1); \
+ sbi->need_fsck = true; \
+ } \
+ } while (0)
#define f2fs_down_write(x, y) down_write(x)
#endif
@@ -90,6 +96,20 @@ enum {
SIT_BITMAP
};
+enum {
+ CP_UMOUNT,
+ CP_SYNC,
+ CP_DISCARD,
+};
+
+struct cp_control {
+ int reason;
+ __u64 trim_start;
+ __u64 trim_end;
+ __u64 trim_minlen;
+ __u64 trimmed;
+};
+
/*
* For CP/NAT/SIT/SSA readahead
*/
@@ -97,7 +117,8 @@ enum {
META_CP,
META_NAT,
META_SIT,
- META_SSA
+ META_SSA,
+ META_POR,
};
/* for the list of ino */
@@ -130,7 +151,9 @@ struct discard_entry {
struct fsync_inode_entry {
struct list_head list; /* list head */
struct inode *inode; /* vfs inode pointer */
- block_t blkaddr; /* block address locating the last inode */
+ block_t blkaddr; /* block address locating the last fsync */
+ block_t last_dentry; /* block address locating the last dentry */
+ block_t last_inode; /* block address locating the last inode */
};
#define nats_in_cursum(sum) (le16_to_cpu(sum->n_nats))
@@ -141,6 +164,9 @@ struct fsync_inode_entry {
#define sit_in_journal(sum, i) (sum->sit_j.entries[i].se)
#define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno)
+#define MAX_NAT_JENTRIES(sum) (NAT_JOURNAL_ENTRIES - nats_in_cursum(sum))
+#define MAX_SIT_JENTRIES(sum) (SIT_JOURNAL_ENTRIES - sits_in_cursum(sum))
+
static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i)
{
int before = nats_in_cursum(rs);
@@ -155,11 +181,24 @@ static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i)
return before;
}
+static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
+ int type)
+{
+ if (type == NAT_JOURNAL)
+ return size <= MAX_NAT_JENTRIES(sum);
+ return size <= MAX_SIT_JENTRIES(sum);
+}
+
/*
* ioctl commands
*/
-#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS
-#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS
+#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS
+#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS
+
+#define F2FS_IOCTL_MAGIC 0xf5
+#define F2FS_IOC_START_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 1)
+#define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2)
+#define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3)
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
@@ -222,13 +261,16 @@ struct f2fs_inode_info {
/* Use below internally in f2fs*/
unsigned long flags; /* use to pass per-file flags */
struct rw_semaphore i_sem; /* protect fi info */
- atomic_t dirty_dents; /* # of dirty dentry pages */
+ atomic_t dirty_pages; /* # of dirty pages */
f2fs_hash_t chash; /* hash value of given file name */
unsigned int clevel; /* maximum level of given file name */
nid_t i_xattr_nid; /* node id that contains xattrs */
unsigned long long xattr_ver; /* cp version of xattr modification */
struct extent_info ext; /* in-memory extent cache entry */
struct dir_inode_entry *dirty_dir; /* the pointer of dirty dir */
+
+ struct list_head inmem_pages; /* inmemory pages managed by f2fs */
+ struct mutex inmem_lock; /* lock for inmemory pages */
};
static inline void get_extent_info(struct extent_info *ext,
@@ -260,11 +302,10 @@ struct f2fs_nm_info {
/* NAT cache management */
struct radix_tree_root nat_root;/* root of the nat entry cache */
+ struct radix_tree_root nat_set_root;/* root of the nat set cache */
rwlock_t nat_tree_lock; /* protect nat_tree_lock */
- unsigned int nat_cnt; /* the # of cached nat entries */
struct list_head nat_entries; /* cached nat entry list (clean) */
- struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */
- struct list_head nat_entry_set; /* nat entry set list */
+ unsigned int nat_cnt; /* the # of cached nat entries */
unsigned int dirty_nat_cnt; /* total num of nat entries in set */
/* free node ids management */
@@ -332,18 +373,16 @@ enum {
};
struct flush_cmd {
- struct flush_cmd *next;
struct completion wait;
+ struct llist_node llnode;
int ret;
};
struct flush_cmd_control {
struct task_struct *f2fs_issue_flush; /* flush thread */
wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */
- struct flush_cmd *issue_list; /* list for command issue */
- struct flush_cmd *dispatch_list; /* list for command dispatch */
- spinlock_t issue_lock; /* for issue list lock */
- struct flush_cmd *issue_tail; /* list tail of issue list */
+ struct llist_head issue_list; /* list for command issue */
+ struct llist_node *dispatch_list; /* list for command dispatch */
};
struct f2fs_sm_info {
@@ -369,8 +408,11 @@ struct f2fs_sm_info {
int nr_discards; /* # of discards in the list */
int max_discards; /* max. discards to be issued */
+ struct list_head sit_entry_set; /* sit entry set list */
+
unsigned int ipu_policy; /* in-place-update policy */
unsigned int min_ipu_util; /* in-place-update threshold */
+ unsigned int min_fsync_blocks; /* threshold for fsync */
/* for flush command control */
struct flush_cmd_control *cmd_control_info;
@@ -395,7 +437,7 @@ enum count_type {
};
/*
- * The below are the page types of bios used in submti_bio().
+ * The below are the page types of bios used in submit_bio().
* The available types are:
* DATA User data pages. It operates as async mode.
* NODE Node pages. It operates as async mode.
@@ -434,6 +476,7 @@ struct f2fs_sb_info {
struct buffer_head *raw_super_buf; /* buffer head of raw sb */
struct f2fs_super_block *raw_super; /* raw super block pointer */
int s_dirty; /* dirty flag for checkpoint */
+ bool need_fsck; /* need fsck.f2fs to fix */
/* for node-related operations */
struct f2fs_nm_info *nm_info; /* node manager */
@@ -470,7 +513,7 @@ struct f2fs_sb_info {
struct list_head dir_inode_list; /* dir inode list */
spinlock_t dir_inode_lock; /* for dir inode list lock */
- /* basic file system units */
+ /* basic filesystem units */
unsigned int log_sectors_per_block; /* log2 sectors per block */
unsigned int log_blocksize; /* log2 block size */
unsigned int blocksize; /* block size */
@@ -539,6 +582,21 @@ static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb)
return sb->s_fs_info;
}
+static inline struct f2fs_sb_info *F2FS_I_SB(struct inode *inode)
+{
+ return F2FS_SB(inode->i_sb);
+}
+
+static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping)
+{
+ return F2FS_I_SB(mapping->host);
+}
+
+static inline struct f2fs_sb_info *F2FS_P_SB(struct page *page)
+{
+ return F2FS_M_SB(page->mapping);
+}
+
static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi)
{
return (struct f2fs_super_block *)(sbi->raw_super);
@@ -703,8 +761,8 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
blkcnt_t count)
{
spin_lock(&sbi->stat_lock);
- f2fs_bug_on(sbi->total_valid_block_count < (block_t) count);
- f2fs_bug_on(inode->i_blocks < count);
+ f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count);
+ f2fs_bug_on(sbi, inode->i_blocks < count);
inode->i_blocks -= count;
sbi->total_valid_block_count -= (block_t)count;
spin_unlock(&sbi->stat_lock);
@@ -716,10 +774,11 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
F2FS_SET_SB_DIRT(sbi);
}
-static inline void inode_inc_dirty_dents(struct inode *inode)
+static inline void inode_inc_dirty_pages(struct inode *inode)
{
- inc_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS);
- atomic_inc(&F2FS_I(inode)->dirty_dents);
+ atomic_inc(&F2FS_I(inode)->dirty_pages);
+ if (S_ISDIR(inode->i_mode))
+ inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
}
static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
@@ -727,13 +786,15 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
atomic_dec(&sbi->nr_pages[count_type]);
}
-static inline void inode_dec_dirty_dents(struct inode *inode)
+static inline void inode_dec_dirty_pages(struct inode *inode)
{
- if (!S_ISDIR(inode->i_mode))
+ if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
return;
- dec_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS);
- atomic_dec(&F2FS_I(inode)->dirty_dents);
+ atomic_dec(&F2FS_I(inode)->dirty_pages);
+
+ if (S_ISDIR(inode->i_mode))
+ dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
}
static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
@@ -741,9 +802,9 @@ static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
return atomic_read(&sbi->nr_pages[count_type]);
}
-static inline int get_dirty_dents(struct inode *inode)
+static inline int get_dirty_pages(struct inode *inode)
{
- return atomic_read(&F2FS_I(inode)->dirty_dents);
+ return atomic_read(&F2FS_I(inode)->dirty_pages);
}
static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
@@ -799,7 +860,7 @@ static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi)
/*
* odd numbered checkpoint should at cp segment 0
- * and even segent must be at cp segment 1
+ * and even segment must be at cp segment 1
*/
if (!(ckpt_version & 1))
start_addr += sbi->blocks_per_seg;
@@ -848,9 +909,9 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
{
spin_lock(&sbi->stat_lock);
- f2fs_bug_on(!sbi->total_valid_block_count);
- f2fs_bug_on(!sbi->total_valid_node_count);
- f2fs_bug_on(!inode->i_blocks);
+ f2fs_bug_on(sbi, !sbi->total_valid_block_count);
+ f2fs_bug_on(sbi, !sbi->total_valid_node_count);
+ f2fs_bug_on(sbi, !inode->i_blocks);
inode->i_blocks--;
sbi->total_valid_node_count--;
@@ -867,7 +928,7 @@ static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
{
spin_lock(&sbi->stat_lock);
- f2fs_bug_on(sbi->total_valid_inode_count == sbi->total_node_count);
+ f2fs_bug_on(sbi, sbi->total_valid_inode_count == sbi->total_node_count);
sbi->total_valid_inode_count++;
spin_unlock(&sbi->stat_lock);
}
@@ -875,7 +936,7 @@ static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi)
{
spin_lock(&sbi->stat_lock);
- f2fs_bug_on(!sbi->total_valid_inode_count);
+ f2fs_bug_on(sbi, !sbi->total_valid_inode_count);
sbi->total_valid_inode_count--;
spin_unlock(&sbi->stat_lock);
}
@@ -891,7 +952,7 @@ static inline void f2fs_put_page(struct page *page, int unlock)
return;
if (unlock) {
- f2fs_bug_on(!PageLocked(page));
+ f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page));
unlock_page(page);
}
page_cache_release(page);
@@ -998,7 +1059,9 @@ enum {
FI_INLINE_DATA, /* used for inline data*/
FI_APPEND_WRITE, /* inode has appended data */
FI_UPDATE_WRITE, /* inode has in-place-update data */
- FI_NEED_IPU, /* used fo ipu for fdatasync */
+ FI_NEED_IPU, /* used for ipu per file */
+ FI_ATOMIC_FILE, /* indicate atomic file */
+ FI_VOLATILE_FILE, /* indicate volatile file */
};
static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -1085,6 +1148,16 @@ static inline int f2fs_has_inline_data(struct inode *inode)
return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA);
}
+static inline bool f2fs_is_atomic_file(struct inode *inode)
+{
+ return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE);
+}
+
+static inline bool f2fs_is_volatile_file(struct inode *inode)
+{
+ return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE);
+}
+
static inline void *inline_data_addr(struct page *page)
{
struct f2fs_inode *ri = F2FS_INODE(page);
@@ -1096,6 +1169,11 @@ static inline int f2fs_readonly(struct super_block *sb)
return sb->s_flags & MS_RDONLY;
}
+static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi)
+{
+ return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
+}
+
static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi)
{
set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
@@ -1117,7 +1195,7 @@ static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi)
*/
int f2fs_sync_file(struct file *, loff_t, loff_t, int);
void truncate_data_blocks(struct dnode_of_data *);
-int truncate_blocks(struct inode *, u64);
+int truncate_blocks(struct inode *, u64, bool);
void f2fs_truncate(struct inode *);
int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
int f2fs_setattr(struct dentry *, struct iattr *);
@@ -1136,6 +1214,7 @@ void update_inode(struct inode *, struct page *);
void update_inode_page(struct inode *);
int f2fs_write_inode(struct inode *, struct writeback_control *);
void f2fs_evict_inode(struct inode *);
+void handle_failed_inode(struct inode *);
/*
* namei.c
@@ -1183,9 +1262,9 @@ struct dnode_of_data;
struct node_info;
bool available_free_memory(struct f2fs_sb_info *, int);
-int is_checkpointed_node(struct f2fs_sb_info *, nid_t);
-bool fsync_mark_done(struct f2fs_sb_info *, nid_t);
-void fsync_mark_clear(struct f2fs_sb_info *, nid_t);
+bool is_checkpointed_node(struct f2fs_sb_info *, nid_t);
+bool has_fsynced_inode(struct f2fs_sb_info *, nid_t);
+bool need_inode_block_update(struct f2fs_sb_info *, nid_t);
void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
int truncate_inode_blocks(struct inode *, pgoff_t);
@@ -1202,10 +1281,8 @@ int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *);
bool alloc_nid(struct f2fs_sb_info *, nid_t *);
void alloc_nid_done(struct f2fs_sb_info *, nid_t);
void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
-void recover_node_page(struct f2fs_sb_info *, struct page *,
- struct f2fs_summary *, struct node_info *, block_t);
void recover_inline_xattr(struct inode *, struct page *);
-bool recover_xattr_data(struct inode *, struct page *, block_t);
+void recover_xattr_data(struct inode *, struct page *, block_t);
int recover_inode_page(struct f2fs_sb_info *, struct page *);
int restore_node_summary(struct f2fs_sb_info *, unsigned int,
struct f2fs_summary_block *);
@@ -1218,6 +1295,8 @@ void destroy_node_manager_caches(void);
/*
* segment.c
*/
+void register_inmem_page(struct inode *, struct page *);
+void commit_inmem_pages(struct inode *, bool);
void f2fs_balance_fs(struct f2fs_sb_info *);
void f2fs_balance_fs_bg(struct f2fs_sb_info *);
int f2fs_issue_flush(struct f2fs_sb_info *);
@@ -1226,9 +1305,11 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *);
void invalidate_blocks(struct f2fs_sb_info *, block_t);
void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
void clear_prefree_segments(struct f2fs_sb_info *);
+void release_discard_addrs(struct f2fs_sb_info *);
void discard_next_dnode(struct f2fs_sb_info *, block_t);
int npages_for_summary_flush(struct f2fs_sb_info *);
void allocate_new_segments(struct f2fs_sb_info *);
+int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *);
struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
void write_meta_page(struct f2fs_sb_info *, struct page *);
void write_node_page(struct f2fs_sb_info *, struct page *,
@@ -1238,8 +1319,6 @@ void write_data_page(struct page *, struct dnode_of_data *, block_t *,
void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *);
void recover_data_page(struct f2fs_sb_info *, struct page *,
struct f2fs_summary *, block_t, block_t);
-void rewrite_node_page(struct f2fs_sb_info *, struct page *,
- struct f2fs_summary *, block_t, block_t);
void allocate_data_block(struct f2fs_sb_info *, struct page *,
block_t, block_t *, struct f2fs_summary *, int);
void f2fs_wait_on_page_writeback(struct page *, enum page_type);
@@ -1247,7 +1326,7 @@ void write_data_summaries(struct f2fs_sb_info *, block_t);
void write_node_summaries(struct f2fs_sb_info *, block_t);
int lookup_journal_in_cursum(struct f2fs_summary_block *,
int, unsigned int, int);
-void flush_sit_entries(struct f2fs_sb_info *);
+void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *);
int build_segment_manager(struct f2fs_sb_info *);
void destroy_segment_manager(struct f2fs_sb_info *);
int __init create_segment_manager_caches(void);
@@ -1258,10 +1337,12 @@ void destroy_segment_manager_caches(void);
*/
struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
-int ra_meta_pages(struct f2fs_sb_info *, int, int, int);
+struct page *get_meta_page_ra(struct f2fs_sb_info *, pgoff_t);
+int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int);
long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
+void release_dirty_inode(struct f2fs_sb_info *);
bool exist_written_data(struct f2fs_sb_info *, nid_t, int);
int acquire_orphan_inode(struct f2fs_sb_info *);
void release_orphan_inode(struct f2fs_sb_info *);
@@ -1269,11 +1350,11 @@ void add_orphan_inode(struct f2fs_sb_info *, nid_t);
void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
void recover_orphan_inodes(struct f2fs_sb_info *);
int get_valid_checkpoint(struct f2fs_sb_info *);
-void set_dirty_dir_page(struct inode *, struct page *);
+void update_dirty_page(struct inode *, struct page *);
void add_dirty_dir_inode(struct inode *);
void remove_dirty_dir_inode(struct inode *);
void sync_dirty_dir_inodes(struct f2fs_sb_info *);
-void write_checkpoint(struct f2fs_sb_info *, bool);
+void write_checkpoint(struct f2fs_sb_info *, struct cp_control *);
void init_ino_entry_info(struct f2fs_sb_info *);
int __init create_checkpoint_caches(void);
void destroy_checkpoint_caches(void);
@@ -1357,12 +1438,12 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
#define stat_inc_inline_inode(inode) \
do { \
if (f2fs_has_inline_data(inode)) \
- ((F2FS_SB(inode->i_sb))->inline_inode++); \
+ ((F2FS_I_SB(inode))->inline_inode++); \
} while (0)
#define stat_dec_inline_inode(inode) \
do { \
if (f2fs_has_inline_data(inode)) \
- ((F2FS_SB(inode->i_sb))->inline_inode--); \
+ ((F2FS_I_SB(inode))->inline_inode--); \
} while (0)
#define stat_inc_seg_type(sbi, curseg) \
@@ -1439,8 +1520,8 @@ extern const struct inode_operations f2fs_special_inode_operations;
*/
bool f2fs_may_inline(struct inode *);
int f2fs_read_inline_data(struct inode *, struct page *);
-int f2fs_convert_inline_data(struct inode *, pgoff_t);
+int f2fs_convert_inline_data(struct inode *, pgoff_t, struct page *);
int f2fs_write_inline_data(struct inode *, struct page *, unsigned int);
void truncate_inline_data(struct inode *, u64);
-int recover_inline_data(struct inode *, struct page *);
+bool recover_inline_data(struct inode *, struct page *);
#endif
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 208f1a9bd569..8e68bb64f835 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -33,7 +33,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
{
struct page *page = vmf->page;
struct inode *inode = file_inode(vma->vm_file);
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dnode_of_data dn;
int err;
@@ -41,6 +41,11 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
sb_start_pagefault(inode->i_sb);
+ /* force to convert with normal data indices */
+ err = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, page);
+ if (err)
+ goto out;
+
/* block allocation */
f2fs_lock_op(sbi);
set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -110,11 +115,31 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
return 1;
}
+static inline bool need_do_checkpoint(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ bool need_cp = false;
+
+ if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
+ need_cp = true;
+ else if (file_wrong_pino(inode))
+ need_cp = true;
+ else if (!space_for_roll_forward(sbi))
+ need_cp = true;
+ else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
+ need_cp = true;
+ else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
+ need_cp = true;
+
+ return need_cp;
+}
+
int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
struct f2fs_inode_info *fi = F2FS_I(inode);
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ nid_t ino = inode->i_ino;
int ret = 0;
bool need_cp = false;
struct writeback_control wbc = {
@@ -129,12 +154,11 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trace_f2fs_sync_file_enter(inode);
/* if fdatasync is triggered, let's do in-place-update */
- if (datasync)
+ if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
set_inode_flag(fi, FI_NEED_IPU);
-
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (datasync)
- clear_inode_flag(fi, FI_NEED_IPU);
+ clear_inode_flag(fi, FI_NEED_IPU);
+
if (ret) {
trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
return ret;
@@ -144,33 +168,31 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* if there is no written data, don't waste time to write recovery info.
*/
if (!is_inode_flag_set(fi, FI_APPEND_WRITE) &&
- !exist_written_data(sbi, inode->i_ino, APPEND_INO)) {
+ !exist_written_data(sbi, ino, APPEND_INO)) {
+ struct page *i = find_get_page(NODE_MAPPING(sbi), ino);
+
+ /* But we need to avoid that there are some inode updates */
+ if ((i && PageDirty(i)) || need_inode_block_update(sbi, ino)) {
+ f2fs_put_page(i, 0);
+ goto go_write;
+ }
+ f2fs_put_page(i, 0);
+
if (is_inode_flag_set(fi, FI_UPDATE_WRITE) ||
- exist_written_data(sbi, inode->i_ino, UPDATE_INO))
+ exist_written_data(sbi, ino, UPDATE_INO))
goto flush_out;
goto out;
}
-
+go_write:
/* guarantee free sections for fsync */
f2fs_balance_fs(sbi);
- down_read(&fi->i_sem);
-
/*
* Both of fdatasync() and fsync() are able to be recovered from
* sudden-power-off.
*/
- if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
- need_cp = true;
- else if (file_wrong_pino(inode))
- need_cp = true;
- else if (!space_for_roll_forward(sbi))
- need_cp = true;
- else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
- need_cp = true;
- else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
- need_cp = true;
-
+ down_read(&fi->i_sem);
+ need_cp = need_do_checkpoint(inode);
up_read(&fi->i_sem);
if (need_cp) {
@@ -194,26 +216,28 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
up_write(&fi->i_sem);
}
} else {
- /* if there is no written node page, write its inode page */
- while (!sync_node_pages(sbi, inode->i_ino, &wbc)) {
- if (fsync_mark_done(sbi, inode->i_ino))
- goto out;
+sync_nodes:
+ sync_node_pages(sbi, ino, &wbc);
+
+ if (need_inode_block_update(sbi, ino)) {
mark_inode_dirty_sync(inode);
ret = f2fs_write_inode(inode, NULL);
if (ret)
goto out;
+ goto sync_nodes;
}
- ret = wait_on_node_pages_writeback(sbi, inode->i_ino);
+
+ ret = wait_on_node_pages_writeback(sbi, ino);
if (ret)
goto out;
/* once recovery info is written, don't need to tack this */
- remove_dirty_inode(sbi, inode->i_ino, APPEND_INO);
+ remove_dirty_inode(sbi, ino, APPEND_INO);
clear_inode_flag(fi, FI_APPEND_WRITE);
flush_out:
- remove_dirty_inode(sbi, inode->i_ino, UPDATE_INO);
+ remove_dirty_inode(sbi, ino, UPDATE_INO);
clear_inode_flag(fi, FI_UPDATE_WRITE);
- ret = f2fs_issue_flush(F2FS_SB(inode->i_sb));
+ ret = f2fs_issue_flush(F2FS_I_SB(inode));
}
out:
trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
@@ -288,7 +312,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
if (err && err != -ENOENT) {
goto fail;
} else if (err == -ENOENT) {
- /* direct node is not exist */
+ /* direct node does not exists */
if (whence == SEEK_DATA) {
pgofs = PGOFS_OF_NEXT_DNODE(pgofs,
F2FS_I(inode));
@@ -340,6 +364,8 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence)
maxbytes, i_size_read(inode));
case SEEK_DATA:
case SEEK_HOLE:
+ if (offset < 0)
+ return -ENXIO;
return f2fs_seek_block(file, offset, whence);
}
@@ -356,7 +382,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
{
int nr_free = 0, ofs = dn->ofs_in_node;
- struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
struct f2fs_node *raw_node;
__le32 *addr;
@@ -417,9 +443,9 @@ out:
f2fs_put_page(page, 1);
}
-int truncate_blocks(struct inode *inode, u64 from)
+int truncate_blocks(struct inode *inode, u64 from, bool lock)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
unsigned int blocksize = inode->i_sb->s_blocksize;
struct dnode_of_data dn;
pgoff_t free_from;
@@ -433,14 +459,16 @@ int truncate_blocks(struct inode *inode, u64 from)
free_from = (pgoff_t)
((from + blocksize - 1) >> (sbi->log_blocksize));
- f2fs_lock_op(sbi);
+ if (lock)
+ f2fs_lock_op(sbi);
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE);
if (err) {
if (err == -ENOENT)
goto free_next;
- f2fs_unlock_op(sbi);
+ if (lock)
+ f2fs_unlock_op(sbi);
trace_f2fs_truncate_blocks_exit(inode, err);
return err;
}
@@ -448,7 +476,7 @@ int truncate_blocks(struct inode *inode, u64 from)
count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
count -= dn.ofs_in_node;
- f2fs_bug_on(count < 0);
+ f2fs_bug_on(sbi, count < 0);
if (dn.ofs_in_node || IS_INODE(dn.node_page)) {
truncate_data_blocks_range(&dn, count);
@@ -458,7 +486,8 @@ int truncate_blocks(struct inode *inode, u64 from)
f2fs_put_dnode(&dn);
free_next:
err = truncate_inode_blocks(inode, free_from);
- f2fs_unlock_op(sbi);
+ if (lock)
+ f2fs_unlock_op(sbi);
done:
/* lastly zero out the first data page */
truncate_partial_data_page(inode, from);
@@ -475,7 +504,7 @@ void f2fs_truncate(struct inode *inode)
trace_f2fs_truncate(inode);
- if (!truncate_blocks(inode, i_size_read(inode))) {
+ if (!truncate_blocks(inode, i_size_read(inode), true)) {
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(inode);
}
@@ -531,15 +560,22 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
if (err)
return err;
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode)) {
- err = f2fs_convert_inline_data(inode, attr->ia_size);
+ if (attr->ia_valid & ATTR_SIZE) {
+ err = f2fs_convert_inline_data(inode, attr->ia_size, NULL);
if (err)
return err;
- truncate_setsize(inode, attr->ia_size);
- f2fs_truncate(inode);
- f2fs_balance_fs(F2FS_SB(inode->i_sb));
+ if (attr->ia_size != i_size_read(inode)) {
+ truncate_setsize(inode, attr->ia_size);
+ f2fs_truncate(inode);
+ f2fs_balance_fs(F2FS_I_SB(inode));
+ } else {
+ /*
+ * giving a chance to truncate blocks past EOF which
+ * are fallocated with FALLOC_FL_KEEP_SIZE.
+ */
+ f2fs_truncate(inode);
+ }
}
__setattr_copy(inode, attr);
@@ -573,7 +609,7 @@ const struct inode_operations f2fs_file_inode_operations = {
static void fill_zero(struct inode *inode, pgoff_t index,
loff_t start, loff_t len)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *page;
if (!len)
@@ -622,7 +658,14 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
loff_t off_start, off_end;
int ret = 0;
- ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1);
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ /* skip punching hole beyond i_size */
+ if (offset >= inode->i_size)
+ return ret;
+
+ ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL);
if (ret)
return ret;
@@ -645,7 +688,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (pg_start < pg_end) {
struct address_space *mapping = inode->i_mapping;
loff_t blk_start, blk_end;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
f2fs_balance_fs(sbi);
@@ -666,7 +709,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
static int expand_inode_data(struct inode *inode, loff_t offset,
loff_t len, int mode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
pgoff_t index, pg_start, pg_end;
loff_t new_size = i_size_read(inode);
loff_t off_start, off_end;
@@ -678,7 +721,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
if (ret)
return ret;
- ret = f2fs_convert_inline_data(inode, offset + len);
+ ret = f2fs_convert_inline_data(inode, offset + len, NULL);
if (ret)
return ret;
@@ -762,61 +805,157 @@ static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
return flags & F2FS_OTHER_FLMASK;
}
-long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+static int f2fs_ioc_getflags(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE;
+ return put_user(flags, (int __user *)arg);
+}
+
+static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
struct f2fs_inode_info *fi = F2FS_I(inode);
- unsigned int flags;
+ unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE;
+ unsigned int oldflags;
int ret;
- switch (cmd) {
- case F2FS_IOC_GETFLAGS:
- flags = fi->i_flags & FS_FL_USER_VISIBLE;
- return put_user(flags, (int __user *) arg);
- case F2FS_IOC_SETFLAGS:
- {
- unsigned int oldflags;
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
- ret = mnt_want_write_file(filp);
- if (ret)
- return ret;
+ if (!inode_owner_or_capable(inode)) {
+ ret = -EACCES;
+ goto out;
+ }
- if (!inode_owner_or_capable(inode)) {
- ret = -EACCES;
- goto out;
- }
+ if (get_user(flags, (int __user *)arg)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ flags = f2fs_mask_flags(inode->i_mode, flags);
+
+ mutex_lock(&inode->i_mutex);
+
+ oldflags = fi->i_flags;
- if (get_user(flags, (int __user *) arg)) {
- ret = -EFAULT;
+ if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
+ if (!capable(CAP_LINUX_IMMUTABLE)) {
+ mutex_unlock(&inode->i_mutex);
+ ret = -EPERM;
goto out;
}
+ }
- flags = f2fs_mask_flags(inode->i_mode, flags);
+ flags = flags & FS_FL_USER_MODIFIABLE;
+ flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
+ fi->i_flags = flags;
+ mutex_unlock(&inode->i_mutex);
- mutex_lock(&inode->i_mutex);
+ f2fs_set_inode_flags(inode);
+ inode->i_ctime = CURRENT_TIME;
+ mark_inode_dirty(inode);
+out:
+ mnt_drop_write_file(filp);
+ return ret;
+}
- oldflags = fi->i_flags;
+static int f2fs_ioc_start_atomic_write(struct file *filp)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
- if (!capable(CAP_LINUX_IMMUTABLE)) {
- mutex_unlock(&inode->i_mutex);
- ret = -EPERM;
- goto out;
- }
- }
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
+ f2fs_balance_fs(sbi);
- flags = flags & FS_FL_USER_MODIFIABLE;
- flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
- fi->i_flags = flags;
- mutex_unlock(&inode->i_mutex);
+ set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
- f2fs_set_inode_flags(inode);
- inode->i_ctime = CURRENT_TIME;
- mark_inode_dirty(inode);
-out:
- mnt_drop_write_file(filp);
+ return f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL);
+}
+
+static int f2fs_ioc_commit_atomic_write(struct file *filp)
+{
+ struct inode *inode = file_inode(filp);
+ int ret;
+
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
+ if (f2fs_is_volatile_file(inode))
+ return 0;
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
return ret;
- }
+
+ if (f2fs_is_atomic_file(inode))
+ commit_inmem_pages(inode, false);
+
+ ret = f2fs_sync_file(filp, 0, LONG_MAX, 0);
+ mnt_drop_write_file(filp);
+ return ret;
+}
+
+static int f2fs_ioc_start_volatile_write(struct file *filp)
+{
+ struct inode *inode = file_inode(filp);
+
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
+ set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+ return 0;
+}
+
+static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct super_block *sb = inode->i_sb;
+ struct request_queue *q = bdev_get_queue(sb->s_bdev);
+ struct fstrim_range range;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&range, (struct fstrim_range __user *)arg,
+ sizeof(range)))
+ return -EFAULT;
+
+ range.minlen = max((unsigned int)range.minlen,
+ q->limits.discard_granularity);
+ ret = f2fs_trim_fs(F2FS_SB(sb), &range);
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user((struct fstrim_range __user *)arg, &range,
+ sizeof(range)))
+ return -EFAULT;
+ return 0;
+}
+
+long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case F2FS_IOC_GETFLAGS:
+ return f2fs_ioc_getflags(filp, arg);
+ case F2FS_IOC_SETFLAGS:
+ return f2fs_ioc_setflags(filp, arg);
+ case F2FS_IOC_START_ATOMIC_WRITE:
+ return f2fs_ioc_start_atomic_write(filp);
+ case F2FS_IOC_COMMIT_ATOMIC_WRITE:
+ return f2fs_ioc_commit_atomic_write(filp);
+ case F2FS_IOC_START_VOLATILE_WRITE:
+ return f2fs_ioc_start_volatile_write(filp);
+ case FITRIM:
+ return f2fs_ioc_fitrim(filp, arg);
default:
return -ENOTTY;
}
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index d7947d90ccc3..2a8f4acdb86b 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -58,7 +58,7 @@ static int gc_thread_func(void *data)
* 3. IO subsystem is idle by checking the # of requests in
* bdev's request list.
*
- * Note) We have to avoid triggering GCs too much frequently.
+ * Note) We have to avoid triggering GCs frequently.
* Because it is possible that some segments can be
* invalidated soon after by user update or deletion.
* So, I'd like to wait some time to collect dirty segments.
@@ -193,7 +193,7 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
* selected by background GC before.
* Those segments guarantee they have small valid blocks.
*/
- for_each_set_bit(secno, dirty_i->victim_secmap, TOTAL_SECS(sbi)) {
+ for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) {
if (sec_usage_check(sbi, secno))
continue;
clear_bit(secno, dirty_i->victim_secmap);
@@ -222,7 +222,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
u = (vblocks * 100) >> sbi->log_blocks_per_seg;
- /* Handle if the system time is changed by user */
+ /* Handle if the system time has changed by the user */
if (mtime < sit_i->min_mtime)
sit_i->min_mtime = mtime;
if (mtime > sit_i->max_mtime)
@@ -263,14 +263,14 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
unsigned int secno, max_cost;
int nsearched = 0;
+ mutex_lock(&dirty_i->seglist_lock);
+
p.alloc_mode = alloc_mode;
select_policy(sbi, gc_type, type, &p);
p.min_segno = NULL_SEGNO;
p.min_cost = max_cost = get_max_cost(sbi, &p);
- mutex_lock(&dirty_i->seglist_lock);
-
if (p.alloc_mode == LFS && gc_type == FG_GC) {
p.min_segno = check_bg_victims(sbi);
if (p.min_segno != NULL_SEGNO)
@@ -281,9 +281,8 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
unsigned long cost;
unsigned int segno;
- segno = find_next_bit(p.dirty_segmap,
- TOTAL_SEGS(sbi), p.offset);
- if (segno >= TOTAL_SEGS(sbi)) {
+ segno = find_next_bit(p.dirty_segmap, MAIN_SEGS(sbi), p.offset);
+ if (segno >= MAIN_SEGS(sbi)) {
if (sbi->last_victim[p.gc_mode]) {
sbi->last_victim[p.gc_mode] = 0;
p.offset = 0;
@@ -423,6 +422,12 @@ next_step:
if (IS_ERR(node_page))
continue;
+ /* block may become invalid during get_node_page */
+ if (check_valid_map(sbi, segno, off) == 0) {
+ f2fs_put_page(node_page, 1);
+ continue;
+ }
+
/* set page dirty and write it */
if (gc_type == FG_GC) {
f2fs_wait_on_page_writeback(node_page, NODE);
@@ -531,7 +536,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
f2fs_wait_on_page_writeback(page, DATA);
if (clear_page_dirty_for_io(page))
- inode_dec_dirty_dents(inode);
+ inode_dec_dirty_pages(inode);
set_cold_data(page);
do_write_data_page(page, &fio);
clear_cold_data(page);
@@ -593,7 +598,7 @@ next_step:
if (phase == 2) {
inode = f2fs_iget(sb, dni.ino);
- if (IS_ERR(inode))
+ if (IS_ERR(inode) || is_bad_inode(inode))
continue;
start_bidx = start_bidx_of_node(nofs, F2FS_I(inode));
@@ -688,17 +693,20 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
int gc_type = BG_GC;
int nfree = 0;
int ret = -1;
+ struct cp_control cpc = {
+ .reason = CP_SYNC,
+ };
INIT_LIST_HEAD(&ilist);
gc_more:
if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
goto stop;
- if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
+ if (unlikely(f2fs_cp_error(sbi)))
goto stop;
if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
gc_type = FG_GC;
- write_checkpoint(sbi, false);
+ write_checkpoint(sbi, &cpc);
}
if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE))
@@ -723,7 +731,7 @@ gc_more:
goto gc_more;
if (gc_type == FG_GC)
- write_checkpoint(sbi, false);
+ write_checkpoint(sbi, &cpc);
stop:
mutex_unlock(&sbi->gc_mutex);
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 5d5eb6047bf4..16f0b2b22999 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -91,7 +91,7 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
block_t invalid_user_blocks = sbi->user_block_count -
written_block_count(sbi);
/*
- * Background GC is triggered with the following condition.
+ * Background GC is triggered with the following conditions.
* 1. There are a number of invalid blocks.
* 2. There is not enough free space.
*/
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
index 948d17bf7281..a844fcfb9a8d 100644
--- a/fs/f2fs/hash.c
+++ b/fs/f2fs/hash.c
@@ -42,7 +42,8 @@ static void TEA_transform(unsigned int buf[4], unsigned int const in[])
buf[1] += b1;
}
-static void str2hashbuf(const char *msg, size_t len, unsigned int *buf, int num)
+static void str2hashbuf(const unsigned char *msg, size_t len,
+ unsigned int *buf, int num)
{
unsigned pad, val;
int i;
@@ -73,9 +74,9 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info)
{
__u32 hash;
f2fs_hash_t f2fs_hash;
- const char *p;
+ const unsigned char *p;
__u32 in[8], buf[4];
- const char *name = name_info->name;
+ const unsigned char *name = name_info->name;
size_t len = name_info->len;
if ((len <= 2) && (name[0] == '.') &&
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 5beeccef9ae1..88036fd75797 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -15,11 +15,13 @@
bool f2fs_may_inline(struct inode *inode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
block_t nr_blocks;
loff_t i_size;
- if (!test_opt(sbi, INLINE_DATA))
+ if (!test_opt(F2FS_I_SB(inode), INLINE_DATA))
+ return false;
+
+ if (f2fs_is_atomic_file(inode))
return false;
nr_blocks = F2FS_I(inode)->i_xattr_nid ? 3 : 2;
@@ -35,7 +37,6 @@ bool f2fs_may_inline(struct inode *inode)
int f2fs_read_inline_data(struct inode *inode, struct page *page)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct page *ipage;
void *src_addr, *dst_addr;
@@ -44,7 +45,7 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
goto out;
}
- ipage = get_node_page(sbi, inode->i_ino);
+ ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
if (IS_ERR(ipage)) {
unlock_page(page);
return PTR_ERR(ipage);
@@ -68,12 +69,12 @@ out:
static int __f2fs_convert_inline_data(struct inode *inode, struct page *page)
{
- int err;
+ int err = 0;
struct page *ipage;
struct dnode_of_data dn;
void *src_addr, *dst_addr;
block_t new_blk_addr;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_io_info fio = {
.type = DATA,
.rw = WRITE_SYNC | REQ_PRIO,
@@ -86,6 +87,10 @@ static int __f2fs_convert_inline_data(struct inode *inode, struct page *page)
goto out;
}
+ /* someone else converted inline_data already */
+ if (!f2fs_has_inline_data(inode))
+ goto out;
+
/*
* i_addr[0] is not used for inline data,
* so reserving new block will not destroy inline data
@@ -124,9 +129,10 @@ out:
return err;
}
-int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size)
+int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size,
+ struct page *page)
{
- struct page *page;
+ struct page *new_page = page;
int err;
if (!f2fs_has_inline_data(inode))
@@ -134,17 +140,20 @@ int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size)
else if (to_size <= MAX_INLINE_DATA)
return 0;
- page = grab_cache_page(inode->i_mapping, 0);
- if (!page)
- return -ENOMEM;
+ if (!page || page->index != 0) {
+ new_page = grab_cache_page(inode->i_mapping, 0);
+ if (!new_page)
+ return -ENOMEM;
+ }
- err = __f2fs_convert_inline_data(inode, page);
- f2fs_put_page(page, 1);
+ err = __f2fs_convert_inline_data(inode, new_page);
+ if (!page || page->index != 0)
+ f2fs_put_page(new_page, 1);
return err;
}
int f2fs_write_inline_data(struct inode *inode,
- struct page *page, unsigned size)
+ struct page *page, unsigned size)
{
void *src_addr, *dst_addr;
struct page *ipage;
@@ -181,13 +190,12 @@ int f2fs_write_inline_data(struct inode *inode,
void truncate_inline_data(struct inode *inode, u64 from)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct page *ipage;
if (from >= MAX_INLINE_DATA)
return;
- ipage = get_node_page(sbi, inode->i_ino);
+ ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
if (IS_ERR(ipage))
return;
@@ -199,9 +207,9 @@ void truncate_inline_data(struct inode *inode, u64 from)
f2fs_put_page(ipage, 1);
}
-int recover_inline_data(struct inode *inode, struct page *npage)
+bool recover_inline_data(struct inode *inode, struct page *npage)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode *ri = NULL;
void *src_addr, *dst_addr;
struct page *ipage;
@@ -218,10 +226,10 @@ int recover_inline_data(struct inode *inode, struct page *npage)
ri = F2FS_INODE(npage);
if (f2fs_has_inline_data(inode) &&
- ri && ri->i_inline & F2FS_INLINE_DATA) {
+ ri && (ri->i_inline & F2FS_INLINE_DATA)) {
process_inline:
ipage = get_node_page(sbi, inode->i_ino);
- f2fs_bug_on(IS_ERR(ipage));
+ f2fs_bug_on(sbi, IS_ERR(ipage));
f2fs_wait_on_page_writeback(ipage, NODE);
@@ -230,22 +238,22 @@ process_inline:
memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
update_inode(inode, ipage);
f2fs_put_page(ipage, 1);
- return -1;
+ return true;
}
if (f2fs_has_inline_data(inode)) {
ipage = get_node_page(sbi, inode->i_ino);
- f2fs_bug_on(IS_ERR(ipage));
+ f2fs_bug_on(sbi, IS_ERR(ipage));
f2fs_wait_on_page_writeback(ipage, NODE);
zero_user_segment(ipage, INLINE_DATA_OFFSET,
INLINE_DATA_OFFSET + MAX_INLINE_DATA);
clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
update_inode(inode, ipage);
f2fs_put_page(ipage, 1);
- } else if (ri && ri->i_inline & F2FS_INLINE_DATA) {
- truncate_blocks(inode, 0);
+ } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
+ truncate_blocks(inode, 0, false);
set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
goto process_inline;
}
- return 0;
+ return false;
}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 2c39999f3868..0deead4505e7 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -69,7 +69,7 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
static int do_read_inode(struct inode *inode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
struct page *node_page;
struct f2fs_inode *ri;
@@ -218,7 +218,7 @@ void update_inode(struct inode *inode, struct page *node_page)
void update_inode_page(struct inode *inode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *node_page;
retry:
node_page = get_node_page(sbi, inode->i_ino);
@@ -238,7 +238,7 @@ retry:
int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
if (inode->i_ino == F2FS_NODE_INO(sbi) ||
inode->i_ino == F2FS_META_INO(sbi))
@@ -266,9 +266,13 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
*/
void f2fs_evict_inode(struct inode *inode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
nid_t xnid = F2FS_I(inode)->i_xattr_nid;
+ /* some remained atomic pages should discarded */
+ if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
+ commit_inmem_pages(inode, true);
+
trace_f2fs_evict_inode(inode);
truncate_inode_pages_final(&inode->i_data);
@@ -276,7 +280,7 @@ void f2fs_evict_inode(struct inode *inode)
inode->i_ino == F2FS_META_INO(sbi))
goto out_clear;
- f2fs_bug_on(get_dirty_dents(inode));
+ f2fs_bug_on(sbi, get_dirty_pages(inode));
remove_dirty_dir_inode(inode);
if (inode->i_nlink || is_bad_inode(inode))
@@ -306,3 +310,26 @@ no_delete:
out_clear:
clear_inode(inode);
}
+
+/* caller should call f2fs_lock_op() */
+void handle_failed_inode(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ clear_nlink(inode);
+ make_bad_inode(inode);
+ unlock_new_inode(inode);
+
+ i_size_write(inode, 0);
+ if (F2FS_HAS_BLOCKS(inode))
+ f2fs_truncate(inode);
+
+ remove_inode_page(inode);
+ stat_dec_inline_inode(inode);
+
+ alloc_nid_failed(sbi, inode->i_ino);
+ f2fs_unlock_op(sbi);
+
+ /* iput will drop the inode object */
+ iput(inode);
+}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 27b03776ffd2..0d2526e5aa11 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -23,7 +23,7 @@
static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
nid_t ino;
struct inode *inode;
bool nid_free = false;
@@ -102,7 +102,7 @@ static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode,
static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool excl)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
nid_t ino = 0;
int err;
@@ -123,9 +123,9 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
- f2fs_unlock_op(sbi);
if (err)
goto out;
+ f2fs_unlock_op(sbi);
alloc_nid_done(sbi, ino);
@@ -133,11 +133,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
unlock_new_inode(inode);
return 0;
out:
- clear_nlink(inode);
- unlock_new_inode(inode);
- make_bad_inode(inode);
- iput(inode);
- alloc_nid_failed(sbi, ino);
+ handle_failed_inode(inode);
return err;
}
@@ -145,7 +141,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
struct inode *inode = old_dentry->d_inode;
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
int err;
f2fs_balance_fs(sbi);
@@ -156,15 +152,16 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
set_inode_flag(F2FS_I(inode), FI_INC_LINK);
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
- f2fs_unlock_op(sbi);
if (err)
goto out;
+ f2fs_unlock_op(sbi);
d_instantiate(dentry, inode);
return 0;
out:
clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
iput(inode);
+ f2fs_unlock_op(sbi);
return err;
}
@@ -205,7 +202,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode = dentry->d_inode;
struct f2fs_dir_entry *de;
struct page *page;
@@ -229,7 +226,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
f2fs_delete_entry(de, page, inode);
f2fs_unlock_op(sbi);
- /* In order to evict this inode, we set it dirty */
+ /* In order to evict this inode, we set it dirty */
mark_inode_dirty(inode);
fail:
trace_f2fs_unlink_exit(inode, err);
@@ -239,7 +236,7 @@ fail:
static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
const char *symname)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
size_t symlen = strlen(symname) + 1;
int err;
@@ -255,9 +252,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
- f2fs_unlock_op(sbi);
if (err)
goto out;
+ f2fs_unlock_op(sbi);
err = page_symlink(inode, symname, symlen);
alloc_nid_done(sbi, inode->i_ino);
@@ -266,17 +263,13 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
unlock_new_inode(inode);
return err;
out:
- clear_nlink(inode);
- unlock_new_inode(inode);
- make_bad_inode(inode);
- iput(inode);
- alloc_nid_failed(sbi, inode->i_ino);
+ handle_failed_inode(inode);
return err;
}
static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
int err;
@@ -294,9 +287,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
set_inode_flag(F2FS_I(inode), FI_INC_LINK);
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
- f2fs_unlock_op(sbi);
if (err)
goto out_fail;
+ f2fs_unlock_op(sbi);
alloc_nid_done(sbi, inode->i_ino);
@@ -307,11 +300,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
out_fail:
clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
- clear_nlink(inode);
- unlock_new_inode(inode);
- make_bad_inode(inode);
- iput(inode);
- alloc_nid_failed(sbi, inode->i_ino);
+ handle_failed_inode(inode);
return err;
}
@@ -326,7 +315,7 @@ static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t rdev)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
int err = 0;
@@ -344,27 +333,23 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
- f2fs_unlock_op(sbi);
if (err)
goto out;
+ f2fs_unlock_op(sbi);
alloc_nid_done(sbi, inode->i_ino);
d_instantiate(dentry, inode);
unlock_new_inode(inode);
return 0;
out:
- clear_nlink(inode);
- unlock_new_inode(inode);
- make_bad_inode(inode);
- iput(inode);
- alloc_nid_failed(sbi, inode->i_ino);
+ handle_failed_inode(inode);
return err;
}
static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
- struct f2fs_sb_info *sbi = F2FS_SB(old_dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
struct inode *old_inode = old_dentry->d_inode;
struct inode *new_inode = new_dentry->d_inode;
struct page *old_dir_page;
@@ -488,8 +473,7 @@ out:
static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
- struct super_block *sb = old_dir->i_sb;
- struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
struct inode *old_inode = old_dentry->d_inode;
struct inode *new_inode = new_dentry->d_inode;
struct page *old_dir_page, *new_dir_page;
@@ -650,7 +634,7 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
int err;
@@ -686,12 +670,7 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
release_out:
release_orphan_inode(sbi);
out:
- f2fs_unlock_op(sbi);
- clear_nlink(inode);
- unlock_new_inode(inode);
- make_bad_inode(inode);
- iput(inode);
- alloc_nid_failed(sbi, inode->i_ino);
+ handle_failed_inode(inode);
return err;
}
@@ -704,7 +683,6 @@ const struct inode_operations f2fs_dir_inode_operations = {
.mkdir = f2fs_mkdir,
.rmdir = f2fs_rmdir,
.mknod = f2fs_mknod,
- .rename = f2fs_rename,
.rename2 = f2fs_rename2,
.tmpfile = f2fs_tmpfile,
.getattr = f2fs_getattr,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index d3d90d284631..44b8afef43d9 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -54,7 +54,6 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
static void clear_node_page_dirty(struct page *page)
{
struct address_space *mapping = page->mapping;
- struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
unsigned int long flags;
if (PageDirty(page)) {
@@ -65,7 +64,7 @@ static void clear_node_page_dirty(struct page *page)
spin_unlock_irqrestore(&mapping->tree_lock, flags);
clear_page_dirty_for_io(page);
- dec_page_count(sbi, F2FS_DIRTY_NODES);
+ dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
}
ClearPageUptodate(page);
}
@@ -92,7 +91,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
/* get current nat block page with lock */
src_page = get_meta_page(sbi, src_off);
dst_page = grab_meta_page(sbi, dst_off);
- f2fs_bug_on(PageDirty(src_page));
+ f2fs_bug_on(sbi, PageDirty(src_page));
src_addr = page_address(src_page);
dst_addr = page_address(dst_page);
@@ -124,44 +123,99 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
kmem_cache_free(nat_entry_slab, e);
}
-int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
+static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
+ struct nat_entry *ne)
+{
+ nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);
+ struct nat_entry_set *head;
+
+ if (get_nat_flag(ne, IS_DIRTY))
+ return;
+retry:
+ head = radix_tree_lookup(&nm_i->nat_set_root, set);
+ if (!head) {
+ head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
+
+ INIT_LIST_HEAD(&head->entry_list);
+ INIT_LIST_HEAD(&head->set_list);
+ head->set = set;
+ head->entry_cnt = 0;
+
+ if (radix_tree_insert(&nm_i->nat_set_root, set, head)) {
+ cond_resched();
+ goto retry;
+ }
+ }
+ list_move_tail(&ne->list, &head->entry_list);
+ nm_i->dirty_nat_cnt++;
+ head->entry_cnt++;
+ set_nat_flag(ne, IS_DIRTY, true);
+}
+
+static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
+ struct nat_entry *ne)
+{
+ nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK;
+ struct nat_entry_set *head;
+
+ head = radix_tree_lookup(&nm_i->nat_set_root, set);
+ if (head) {
+ list_move_tail(&ne->list, &nm_i->nat_entries);
+ set_nat_flag(ne, IS_DIRTY, false);
+ head->entry_cnt--;
+ nm_i->dirty_nat_cnt--;
+ }
+}
+
+static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
+ nid_t start, unsigned int nr, struct nat_entry_set **ep)
+{
+ return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep,
+ start, nr);
+}
+
+bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
- int is_cp = 1;
+ bool is_cp = true;
read_lock(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
- if (e && !e->checkpointed)
- is_cp = 0;
+ if (e && !get_nat_flag(e, IS_CHECKPOINTED))
+ is_cp = false;
read_unlock(&nm_i->nat_tree_lock);
return is_cp;
}
-bool fsync_mark_done(struct f2fs_sb_info *sbi, nid_t nid)
+bool has_fsynced_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
- bool fsync_done = false;
+ bool fsynced = false;
read_lock(&nm_i->nat_tree_lock);
- e = __lookup_nat_cache(nm_i, nid);
- if (e)
- fsync_done = e->fsync_done;
+ e = __lookup_nat_cache(nm_i, ino);
+ if (e && get_nat_flag(e, HAS_FSYNCED_INODE))
+ fsynced = true;
read_unlock(&nm_i->nat_tree_lock);
- return fsync_done;
+ return fsynced;
}
-void fsync_mark_clear(struct f2fs_sb_info *sbi, nid_t nid)
+bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
+ bool need_update = true;
- write_lock(&nm_i->nat_tree_lock);
- e = __lookup_nat_cache(nm_i, nid);
- if (e)
- e->fsync_done = false;
- write_unlock(&nm_i->nat_tree_lock);
+ read_lock(&nm_i->nat_tree_lock);
+ e = __lookup_nat_cache(nm_i, ino);
+ if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
+ (get_nat_flag(e, IS_CHECKPOINTED) ||
+ get_nat_flag(e, HAS_FSYNCED_INODE)))
+ need_update = false;
+ read_unlock(&nm_i->nat_tree_lock);
+ return need_update;
}
static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
@@ -177,7 +231,7 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
}
memset(new, 0, sizeof(struct nat_entry));
nat_set_nid(new, nid);
- new->checkpointed = true;
+ nat_reset_flag(new);
list_add_tail(&new->list, &nm_i->nat_entries);
nm_i->nat_cnt++;
return new;
@@ -216,7 +270,7 @@ retry:
goto retry;
}
e->ni = *ni;
- f2fs_bug_on(ni->blk_addr == NEW_ADDR);
+ f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
} else if (new_blkaddr == NEW_ADDR) {
/*
* when nid is reallocated,
@@ -224,20 +278,20 @@ retry:
* So, reinitialize it with new information.
*/
e->ni = *ni;
- f2fs_bug_on(ni->blk_addr != NULL_ADDR);
+ f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR);
}
/* sanity check */
- f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr);
- f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR &&
+ f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr);
+ f2fs_bug_on(sbi, nat_get_blkaddr(e) == NULL_ADDR &&
new_blkaddr == NULL_ADDR);
- f2fs_bug_on(nat_get_blkaddr(e) == NEW_ADDR &&
+ f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR &&
new_blkaddr == NEW_ADDR);
- f2fs_bug_on(nat_get_blkaddr(e) != NEW_ADDR &&
+ f2fs_bug_on(sbi, nat_get_blkaddr(e) != NEW_ADDR &&
nat_get_blkaddr(e) != NULL_ADDR &&
new_blkaddr == NEW_ADDR);
- /* increament version no as node is removed */
+ /* increment version no as node is removed */
if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
unsigned char version = nat_get_version(e);
nat_set_version(e, inc_node_version(version));
@@ -245,12 +299,17 @@ retry:
/* change address */
nat_set_blkaddr(e, new_blkaddr);
+ if (new_blkaddr == NEW_ADDR || new_blkaddr == NULL_ADDR)
+ set_nat_flag(e, IS_CHECKPOINTED, false);
__set_nat_cache_dirty(nm_i, e);
/* update fsync_mark if its inode nat entry is still alive */
e = __lookup_nat_cache(nm_i, ni->ino);
- if (e)
- e->fsync_done = fsync_done;
+ if (e) {
+ if (fsync_done && ni->nid == ni->ino)
+ set_nat_flag(e, HAS_FSYNCED_INODE, true);
+ set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
+ }
write_unlock(&nm_i->nat_tree_lock);
}
@@ -274,7 +333,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
}
/*
- * This function returns always success
+ * This function always returns success
*/
void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
{
@@ -411,7 +470,7 @@ got:
*/
int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
struct page *npage[4];
struct page *parent;
int offset[4];
@@ -504,15 +563,15 @@ release_out:
static void truncate_node(struct dnode_of_data *dn)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
struct node_info ni;
get_node_info(sbi, dn->nid, &ni);
if (dn->inode->i_blocks == 0) {
- f2fs_bug_on(ni.blk_addr != NULL_ADDR);
+ f2fs_bug_on(sbi, ni.blk_addr != NULL_ADDR);
goto invalidate;
}
- f2fs_bug_on(ni.blk_addr == NULL_ADDR);
+ f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR);
/* Deallocate node address */
invalidate_blocks(sbi, ni.blk_addr);
@@ -540,14 +599,13 @@ invalidate:
static int truncate_dnode(struct dnode_of_data *dn)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
struct page *page;
if (dn->nid == 0)
return 1;
/* get direct node */
- page = get_node_page(sbi, dn->nid);
+ page = get_node_page(F2FS_I_SB(dn->inode), dn->nid);
if (IS_ERR(page) && PTR_ERR(page) == -ENOENT)
return 1;
else if (IS_ERR(page))
@@ -564,7 +622,6 @@ static int truncate_dnode(struct dnode_of_data *dn)
static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
int ofs, int depth)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
struct dnode_of_data rdn = *dn;
struct page *page;
struct f2fs_node *rn;
@@ -578,7 +635,7 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr);
- page = get_node_page(sbi, dn->nid);
+ page = get_node_page(F2FS_I_SB(dn->inode), dn->nid);
if (IS_ERR(page)) {
trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page));
return PTR_ERR(page);
@@ -636,7 +693,6 @@ out_err:
static int truncate_partial_nodes(struct dnode_of_data *dn,
struct f2fs_inode *ri, int *offset, int depth)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
struct page *pages[2];
nid_t nid[3];
nid_t child_nid;
@@ -650,8 +706,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
/* get indirect nodes in the path */
for (i = 0; i < idx + 1; i++) {
- /* refernece count'll be increased */
- pages[i] = get_node_page(sbi, nid[i]);
+ /* reference count'll be increased */
+ pages[i] = get_node_page(F2FS_I_SB(dn->inode), nid[i]);
if (IS_ERR(pages[i])) {
err = PTR_ERR(pages[i]);
idx = i - 1;
@@ -696,7 +752,7 @@ fail:
*/
int truncate_inode_blocks(struct inode *inode, pgoff_t from)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int err = 0, cont = 1;
int level, offset[4], noffset[4];
unsigned int nofs = 0;
@@ -792,7 +848,7 @@ fail:
int truncate_xattr_node(struct inode *inode, struct page *page)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
nid_t nid = F2FS_I(inode)->i_xattr_nid;
struct dnode_of_data dn;
struct page *npage;
@@ -823,22 +879,27 @@ int truncate_xattr_node(struct inode *inode, struct page *page)
*/
void remove_inode_page(struct inode *inode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- struct page *page;
- nid_t ino = inode->i_ino;
struct dnode_of_data dn;
- page = get_node_page(sbi, ino);
- if (IS_ERR(page))
+ set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
+ if (get_dnode_of_data(&dn, 0, LOOKUP_NODE))
return;
- if (truncate_xattr_node(inode, page)) {
- f2fs_put_page(page, 1);
+ if (truncate_xattr_node(inode, dn.inode_page)) {
+ f2fs_put_dnode(&dn);
return;
}
- /* 0 is possible, after f2fs_new_inode() is failed */
- f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1);
- set_new_dnode(&dn, inode, page, page, ino);
+
+ /* remove potential inline_data blocks */
+ if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ S_ISLNK(inode->i_mode))
+ truncate_data_blocks_range(&dn, 1);
+
+ /* 0 is possible, after f2fs_new_inode() has failed */
+ f2fs_bug_on(F2FS_I_SB(inode),
+ inode->i_blocks != 0 && inode->i_blocks != 1);
+
+ /* will put inode & node pages */
truncate_node(&dn);
}
@@ -856,7 +917,7 @@ struct page *new_inode_page(struct inode *inode)
struct page *new_node_page(struct dnode_of_data *dn,
unsigned int ofs, struct page *ipage)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
struct node_info old_ni, new_ni;
struct page *page;
int err;
@@ -876,7 +937,7 @@ struct page *new_node_page(struct dnode_of_data *dn,
get_node_info(sbi, dn->nid, &old_ni);
/* Reinitialize old_ni with new node page */
- f2fs_bug_on(old_ni.blk_addr != NULL_ADDR);
+ f2fs_bug_on(sbi, old_ni.blk_addr != NULL_ADDR);
new_ni = old_ni;
new_ni.ino = dn->inode->i_ino;
set_node_addr(sbi, &new_ni, NEW_ADDR, false);
@@ -914,7 +975,7 @@ fail:
*/
static int read_node_page(struct page *page, int rw)
{
- struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_P_SB(page);
struct node_info ni;
get_node_info(sbi, page->index, &ni);
@@ -990,7 +1051,7 @@ got_it:
*/
struct page *get_node_page_ra(struct page *parent, int start)
{
- struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
struct blk_plug plug;
struct page *page;
int err, i, end;
@@ -1120,17 +1181,24 @@ continue_unlock:
/* called by fsync() */
if (ino && IS_DNODE(page)) {
- int mark = !is_checkpointed_node(sbi, ino);
set_fsync_mark(page, 1);
- if (IS_INODE(page))
- set_dentry_mark(page, mark);
+ if (IS_INODE(page)) {
+ if (!is_checkpointed_node(sbi, ino) &&
+ !has_fsynced_inode(sbi, ino))
+ set_dentry_mark(page, 1);
+ else
+ set_dentry_mark(page, 0);
+ }
nwritten++;
} else {
set_fsync_mark(page, 0);
set_dentry_mark(page, 0);
}
- NODE_MAPPING(sbi)->a_ops->writepage(page, wbc);
- wrote++;
+
+ if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc))
+ unlock_page(page);
+ else
+ wrote++;
if (--wbc->nr_to_write == 0)
break;
@@ -1199,7 +1267,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
static int f2fs_write_node_page(struct page *page,
struct writeback_control *wbc)
{
- struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_P_SB(page);
nid_t nid;
block_t new_addr;
struct node_info ni;
@@ -1212,12 +1280,14 @@ static int f2fs_write_node_page(struct page *page,
if (unlikely(sbi->por_doing))
goto redirty_out;
+ if (unlikely(f2fs_cp_error(sbi)))
+ goto redirty_out;
f2fs_wait_on_page_writeback(page, NODE);
/* get old block addr of this node page */
nid = nid_of_node(page);
- f2fs_bug_on(page->index != nid);
+ f2fs_bug_on(sbi, page->index != nid);
get_node_info(sbi, nid, &ni);
@@ -1248,7 +1318,7 @@ redirty_out:
static int f2fs_write_node_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
long diff;
trace_f2fs_writepages(mapping->host, wbc, NODE);
@@ -1273,15 +1343,12 @@ skip_write:
static int f2fs_set_node_page_dirty(struct page *page)
{
- struct address_space *mapping = page->mapping;
- struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
-
trace_f2fs_set_page_dirty(page, NODE);
SetPageUptodate(page);
if (!PageDirty(page)) {
__set_page_dirty_nobuffers(page);
- inc_page_count(sbi, F2FS_DIRTY_NODES);
+ inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
SetPagePrivate(page);
return 1;
}
@@ -1292,9 +1359,8 @@ static void f2fs_invalidate_node_page(struct page *page, unsigned int offset,
unsigned int length)
{
struct inode *inode = page->mapping->host;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
if (PageDirty(page))
- dec_page_count(sbi, F2FS_DIRTY_NODES);
+ dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_NODES);
ClearPagePrivate(page);
}
@@ -1347,7 +1413,8 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
read_lock(&nm_i->nat_tree_lock);
ne = __lookup_nat_cache(nm_i, nid);
if (ne &&
- (!ne->checkpointed || nat_get_blkaddr(ne) != NULL_ADDR))
+ (!get_nat_flag(ne, IS_CHECKPOINTED) ||
+ nat_get_blkaddr(ne) != NULL_ADDR))
allocated = true;
read_unlock(&nm_i->nat_tree_lock);
if (allocated)
@@ -1404,7 +1471,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi,
break;
blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
- f2fs_bug_on(blk_addr == NEW_ADDR);
+ f2fs_bug_on(sbi, blk_addr == NEW_ADDR);
if (blk_addr == NULL_ADDR) {
if (add_free_nid(sbi, start_nid, true) < 0)
break;
@@ -1474,12 +1541,12 @@ retry:
/* We should not use stale free nids created by build_free_nids */
if (nm_i->fcnt && !on_build_free_nids(nm_i)) {
- f2fs_bug_on(list_empty(&nm_i->free_nid_list));
+ f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
list_for_each_entry(i, &nm_i->free_nid_list, list)
if (i->state == NID_NEW)
break;
- f2fs_bug_on(i->state != NID_NEW);
+ f2fs_bug_on(sbi, i->state != NID_NEW);
*nid = i->nid;
i->state = NID_ALLOC;
nm_i->fcnt--;
@@ -1505,7 +1572,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
spin_lock(&nm_i->free_nid_list_lock);
i = __lookup_free_nid_list(nm_i, nid);
- f2fs_bug_on(!i || i->state != NID_ALLOC);
+ f2fs_bug_on(sbi, !i || i->state != NID_ALLOC);
__del_from_free_nid_list(nm_i, i);
spin_unlock(&nm_i->free_nid_list_lock);
@@ -1526,7 +1593,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
spin_lock(&nm_i->free_nid_list_lock);
i = __lookup_free_nid_list(nm_i, nid);
- f2fs_bug_on(!i || i->state != NID_ALLOC);
+ f2fs_bug_on(sbi, !i || i->state != NID_ALLOC);
if (!available_free_memory(sbi, FREE_NIDS)) {
__del_from_free_nid_list(nm_i, i);
need_free = true;
@@ -1540,35 +1607,21 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
kmem_cache_free(free_nid_slab, i);
}
-void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
- struct f2fs_summary *sum, struct node_info *ni,
- block_t new_blkaddr)
-{
- rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr);
- set_node_addr(sbi, ni, new_blkaddr, false);
- clear_node_page_dirty(page);
-}
-
void recover_inline_xattr(struct inode *inode, struct page *page)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
void *src_addr, *dst_addr;
size_t inline_size;
struct page *ipage;
struct f2fs_inode *ri;
- if (!f2fs_has_inline_xattr(inode))
- return;
-
- if (!IS_INODE(page))
- return;
+ ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
+ f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage));
ri = F2FS_INODE(page);
- if (!(ri->i_inline & F2FS_INLINE_XATTR))
- return;
-
- ipage = get_node_page(sbi, inode->i_ino);
- f2fs_bug_on(IS_ERR(ipage));
+ if (!(ri->i_inline & F2FS_INLINE_XATTR)) {
+ clear_inode_flag(F2FS_I(inode), FI_INLINE_XATTR);
+ goto update_inode;
+ }
dst_addr = inline_xattr_addr(ipage);
src_addr = inline_xattr_addr(page);
@@ -1576,28 +1629,25 @@ void recover_inline_xattr(struct inode *inode, struct page *page)
f2fs_wait_on_page_writeback(ipage, NODE);
memcpy(dst_addr, src_addr, inline_size);
-
+update_inode:
update_inode(inode, ipage);
f2fs_put_page(ipage, 1);
}
-bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
+void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
nid_t new_xnid = nid_of_node(page);
struct node_info ni;
- if (!f2fs_has_xattr_block(ofs_of_node(page)))
- return false;
-
/* 1: invalidate the previous xattr nid */
if (!prev_xnid)
goto recover_xnid;
/* Deallocate node address */
get_node_info(sbi, prev_xnid, &ni);
- f2fs_bug_on(ni.blk_addr == NULL_ADDR);
+ f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR);
invalidate_blocks(sbi, ni.blk_addr);
dec_valid_node_count(sbi, inode);
set_node_addr(sbi, &ni, NULL_ADDR, false);
@@ -1605,7 +1655,7 @@ bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
recover_xnid:
/* 2: allocate new xattr nid */
if (unlikely(!inc_valid_node_count(sbi, inode)))
- f2fs_bug_on(1);
+ f2fs_bug_on(sbi, 1);
remove_free_nid(NM_I(sbi), new_xnid);
get_node_info(sbi, new_xnid, &ni);
@@ -1618,7 +1668,6 @@ recover_xnid:
set_node_addr(sbi, &ni, blkaddr, false);
update_inode_page(inode);
- return true;
}
int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
@@ -1637,7 +1686,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
if (!ipage)
return -ENOMEM;
- /* Should not use this inode from free nid list */
+ /* Should not use this inode from free nid list */
remove_free_nid(NM_I(sbi), ino);
SetPageUptodate(ipage);
@@ -1651,6 +1700,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
dst->i_blocks = cpu_to_le64(1);
dst->i_links = cpu_to_le32(1);
dst->i_xattr_nid = 0;
+ dst->i_inline = src->i_inline & F2FS_INLINE_XATTR;
new_ni = old_ni;
new_ni.ino = ino;
@@ -1659,13 +1709,14 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
WARN_ON(1);
set_node_addr(sbi, &new_ni, NEW_ADDR, false);
inc_valid_inode_count(sbi);
+ set_page_dirty(ipage);
f2fs_put_page(ipage, 1);
return 0;
}
/*
* ra_sum_pages() merge contiguous pages into one bio and submit.
- * these pre-readed pages are alloced in bd_inode's mapping tree.
+ * these pre-read pages are allocated in bd_inode's mapping tree.
*/
static int ra_sum_pages(struct f2fs_sb_info *sbi, struct page **pages,
int start, int nrpages)
@@ -1697,7 +1748,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
struct f2fs_summary *sum_entry;
struct inode *inode = sbi->sb->s_bdev->bd_inode;
block_t addr;
- int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
+ int bio_blocks = MAX_BIO_BLOCKS(sbi);
struct page *pages[bio_blocks];
int i, idx, last_offset, nrpages, err = 0;
@@ -1709,7 +1760,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) {
nrpages = min(last_offset - i, bio_blocks);
- /* read ahead node pages */
+ /* readahead node pages */
nrpages = ra_sum_pages(sbi, pages, addr, nrpages);
if (!nrpages)
return -ENOMEM;
@@ -1739,89 +1790,6 @@ skip:
return err;
}
-static struct nat_entry_set *grab_nat_entry_set(void)
-{
- struct nat_entry_set *nes =
- f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
-
- nes->entry_cnt = 0;
- INIT_LIST_HEAD(&nes->set_list);
- INIT_LIST_HEAD(&nes->entry_list);
- return nes;
-}
-
-static void release_nat_entry_set(struct nat_entry_set *nes,
- struct f2fs_nm_info *nm_i)
-{
- f2fs_bug_on(!list_empty(&nes->entry_list));
-
- nm_i->dirty_nat_cnt -= nes->entry_cnt;
- list_del(&nes->set_list);
- kmem_cache_free(nat_entry_set_slab, nes);
-}
-
-static void adjust_nat_entry_set(struct nat_entry_set *nes,
- struct list_head *head)
-{
- struct nat_entry_set *next = nes;
-
- if (list_is_last(&nes->set_list, head))
- return;
-
- list_for_each_entry_continue(next, head, set_list)
- if (nes->entry_cnt <= next->entry_cnt)
- break;
-
- list_move_tail(&nes->set_list, &next->set_list);
-}
-
-static void add_nat_entry(struct nat_entry *ne, struct list_head *head)
-{
- struct nat_entry_set *nes;
- nid_t start_nid = START_NID(ne->ni.nid);
-
- list_for_each_entry(nes, head, set_list) {
- if (nes->start_nid == start_nid) {
- list_move_tail(&ne->list, &nes->entry_list);
- nes->entry_cnt++;
- adjust_nat_entry_set(nes, head);
- return;
- }
- }
-
- nes = grab_nat_entry_set();
-
- nes->start_nid = start_nid;
- list_move_tail(&ne->list, &nes->entry_list);
- nes->entry_cnt++;
- list_add(&nes->set_list, head);
-}
-
-static void merge_nats_in_set(struct f2fs_sb_info *sbi)
-{
- struct f2fs_nm_info *nm_i = NM_I(sbi);
- struct list_head *dirty_list = &nm_i->dirty_nat_entries;
- struct list_head *set_list = &nm_i->nat_entry_set;
- struct nat_entry *ne, *tmp;
-
- write_lock(&nm_i->nat_tree_lock);
- list_for_each_entry_safe(ne, tmp, dirty_list, list) {
- if (nat_get_blkaddr(ne) == NEW_ADDR)
- continue;
- add_nat_entry(ne, set_list);
- nm_i->dirty_nat_cnt++;
- }
- write_unlock(&nm_i->nat_tree_lock);
-}
-
-static bool __has_cursum_space(struct f2fs_summary_block *sum, int size)
-{
- if (nats_in_cursum(sum) + size <= NAT_JOURNAL_ENTRIES)
- return true;
- else
- return false;
-}
-
static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
@@ -1856,99 +1824,130 @@ found:
mutex_unlock(&curseg->curseg_mutex);
}
-/*
- * This function is called during the checkpointing process.
- */
-void flush_nat_entries(struct f2fs_sb_info *sbi)
+static void __adjust_nat_entry_set(struct nat_entry_set *nes,
+ struct list_head *head, int max)
{
- struct f2fs_nm_info *nm_i = NM_I(sbi);
- struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
- struct nat_entry_set *nes, *tmp;
- struct list_head *head = &nm_i->nat_entry_set;
- bool to_journal = true;
+ struct nat_entry_set *cur;
- /* merge nat entries of dirty list to nat entry set temporarily */
- merge_nats_in_set(sbi);
+ if (nes->entry_cnt >= max)
+ goto add_out;
- /*
- * if there are no enough space in journal to store dirty nat
- * entries, remove all entries from journal and merge them
- * into nat entry set.
- */
- if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt)) {
- remove_nats_in_journal(sbi);
-
- /*
- * merge nat entries of dirty list to nat entry set temporarily
- */
- merge_nats_in_set(sbi);
+ list_for_each_entry(cur, head, set_list) {
+ if (cur->entry_cnt >= nes->entry_cnt) {
+ list_add(&nes->set_list, cur->set_list.prev);
+ return;
+ }
}
+add_out:
+ list_add_tail(&nes->set_list, head);
+}
- if (!nm_i->dirty_nat_cnt)
- return;
+static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
+ struct nat_entry_set *set)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+ struct f2fs_summary_block *sum = curseg->sum_blk;
+ nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;
+ bool to_journal = true;
+ struct f2fs_nat_block *nat_blk;
+ struct nat_entry *ne, *cur;
+ struct page *page = NULL;
/*
* there are two steps to flush nat entries:
* #1, flush nat entries to journal in current hot data summary block.
* #2, flush nat entries to nat page.
*/
- list_for_each_entry_safe(nes, tmp, head, set_list) {
- struct f2fs_nat_block *nat_blk;
- struct nat_entry *ne, *cur;
- struct page *page;
- nid_t start_nid = nes->start_nid;
+ if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL))
+ to_journal = false;
- if (to_journal && !__has_cursum_space(sum, nes->entry_cnt))
- to_journal = false;
+ if (to_journal) {
+ mutex_lock(&curseg->curseg_mutex);
+ } else {
+ page = get_next_nat_page(sbi, start_nid);
+ nat_blk = page_address(page);
+ f2fs_bug_on(sbi, !nat_blk);
+ }
+
+ /* flush dirty nats in nat entry set */
+ list_for_each_entry_safe(ne, cur, &set->entry_list, list) {
+ struct f2fs_nat_entry *raw_ne;
+ nid_t nid = nat_get_nid(ne);
+ int offset;
+
+ if (nat_get_blkaddr(ne) == NEW_ADDR)
+ continue;
if (to_journal) {
- mutex_lock(&curseg->curseg_mutex);
+ offset = lookup_journal_in_cursum(sum,
+ NAT_JOURNAL, nid, 1);
+ f2fs_bug_on(sbi, offset < 0);
+ raw_ne = &nat_in_journal(sum, offset);
+ nid_in_journal(sum, offset) = cpu_to_le32(nid);
} else {
- page = get_next_nat_page(sbi, start_nid);
- nat_blk = page_address(page);
- f2fs_bug_on(!nat_blk);
+ raw_ne = &nat_blk->entries[nid - start_nid];
}
+ raw_nat_from_node_info(raw_ne, &ne->ni);
- /* flush dirty nats in nat entry set */
- list_for_each_entry_safe(ne, cur, &nes->entry_list, list) {
- struct f2fs_nat_entry *raw_ne;
- nid_t nid = nat_get_nid(ne);
- int offset;
+ write_lock(&NM_I(sbi)->nat_tree_lock);
+ nat_reset_flag(ne);
+ __clear_nat_cache_dirty(NM_I(sbi), ne);
+ write_unlock(&NM_I(sbi)->nat_tree_lock);
- if (to_journal) {
- offset = lookup_journal_in_cursum(sum,
- NAT_JOURNAL, nid, 1);
- f2fs_bug_on(offset < 0);
- raw_ne = &nat_in_journal(sum, offset);
- nid_in_journal(sum, offset) = cpu_to_le32(nid);
- } else {
- raw_ne = &nat_blk->entries[nid - start_nid];
- }
- raw_nat_from_node_info(raw_ne, &ne->ni);
+ if (nat_get_blkaddr(ne) == NULL_ADDR)
+ add_free_nid(sbi, nid, false);
+ }
- if (nat_get_blkaddr(ne) == NULL_ADDR &&
- add_free_nid(sbi, nid, false) <= 0) {
- write_lock(&nm_i->nat_tree_lock);
- __del_from_nat_cache(nm_i, ne);
- write_unlock(&nm_i->nat_tree_lock);
- } else {
- write_lock(&nm_i->nat_tree_lock);
- __clear_nat_cache_dirty(nm_i, ne);
- write_unlock(&nm_i->nat_tree_lock);
- }
- }
+ if (to_journal)
+ mutex_unlock(&curseg->curseg_mutex);
+ else
+ f2fs_put_page(page, 1);
- if (to_journal)
- mutex_unlock(&curseg->curseg_mutex);
- else
- f2fs_put_page(page, 1);
+ if (!set->entry_cnt) {
+ radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
+ kmem_cache_free(nat_entry_set_slab, set);
+ }
+}
- release_nat_entry_set(nes, nm_i);
+/*
+ * This function is called during the checkpointing process.
+ */
+void flush_nat_entries(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+ struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct nat_entry_set *setvec[NATVEC_SIZE];
+ struct nat_entry_set *set, *tmp;
+ unsigned int found;
+ nid_t set_idx = 0;
+ LIST_HEAD(sets);
+
+ /*
+ * if there are no enough space in journal to store dirty nat
+ * entries, remove all entries from journal and merge them
+ * into nat entry set.
+ */
+ if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL))
+ remove_nats_in_journal(sbi);
+
+ if (!nm_i->dirty_nat_cnt)
+ return;
+
+ while ((found = __gang_lookup_nat_set(nm_i,
+ set_idx, NATVEC_SIZE, setvec))) {
+ unsigned idx;
+ set_idx = setvec[found - 1]->set + 1;
+ for (idx = 0; idx < found; idx++)
+ __adjust_nat_entry_set(setvec[idx], &sets,
+ MAX_NAT_JENTRIES(sum));
}
- f2fs_bug_on(!list_empty(head));
- f2fs_bug_on(nm_i->dirty_nat_cnt);
+ /* flush dirty nats in nat entry set */
+ list_for_each_entry_safe(set, tmp, &sets, set_list)
+ __flush_nat_entry_set(sbi, set);
+
+ f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
}
static int init_node_manager(struct f2fs_sb_info *sbi)
@@ -1967,7 +1966,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks;
/* not used nids: 0, node, meta, (and root counted as valid node) */
- nm_i->available_nids = nm_i->max_nid - 3;
+ nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM;
nm_i->fcnt = 0;
nm_i->nat_cnt = 0;
nm_i->ram_thresh = DEF_RAM_THRESHOLD;
@@ -1975,9 +1974,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
INIT_LIST_HEAD(&nm_i->free_nid_list);
INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
+ INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_ATOMIC);
INIT_LIST_HEAD(&nm_i->nat_entries);
- INIT_LIST_HEAD(&nm_i->dirty_nat_entries);
- INIT_LIST_HEAD(&nm_i->nat_entry_set);
mutex_init(&nm_i->build_lock);
spin_lock_init(&nm_i->free_nid_list_lock);
@@ -2026,14 +2024,14 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
/* destroy free nid list */
spin_lock(&nm_i->free_nid_list_lock);
list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
- f2fs_bug_on(i->state == NID_ALLOC);
+ f2fs_bug_on(sbi, i->state == NID_ALLOC);
__del_from_free_nid_list(nm_i, i);
nm_i->fcnt--;
spin_unlock(&nm_i->free_nid_list_lock);
kmem_cache_free(free_nid_slab, i);
spin_lock(&nm_i->free_nid_list_lock);
}
- f2fs_bug_on(nm_i->fcnt);
+ f2fs_bug_on(sbi, nm_i->fcnt);
spin_unlock(&nm_i->free_nid_list_lock);
/* destroy nat cache */
@@ -2045,7 +2043,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
for (idx = 0; idx < found; idx++)
__del_from_nat_cache(nm_i, natvec[idx]);
}
- f2fs_bug_on(nm_i->nat_cnt);
+ f2fs_bug_on(sbi, nm_i->nat_cnt);
write_unlock(&nm_i->nat_tree_lock);
kfree(nm_i->nat_bitmap);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 8a116a407599..8d5e6e0dd840 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -39,10 +39,16 @@ struct node_info {
unsigned char version; /* version of the node */
};
+enum {
+ IS_CHECKPOINTED, /* is it checkpointed before? */
+ HAS_FSYNCED_INODE, /* is the inode fsynced before? */
+ HAS_LAST_FSYNC, /* has the latest node fsync mark? */
+ IS_DIRTY, /* this nat entry is dirty? */
+};
+
struct nat_entry {
struct list_head list; /* for clean or dirty nat list */
- bool checkpointed; /* whether it is checkpointed or not */
- bool fsync_done; /* whether the latest node has fsync mark */
+ unsigned char flag; /* for node information bits */
struct node_info ni; /* in-memory node information */
};
@@ -55,18 +61,32 @@ struct nat_entry {
#define nat_get_version(nat) (nat->ni.version)
#define nat_set_version(nat, v) (nat->ni.version = v)
-#define __set_nat_cache_dirty(nm_i, ne) \
- do { \
- ne->checkpointed = false; \
- list_move_tail(&ne->list, &nm_i->dirty_nat_entries); \
- } while (0)
-#define __clear_nat_cache_dirty(nm_i, ne) \
- do { \
- ne->checkpointed = true; \
- list_move_tail(&ne->list, &nm_i->nat_entries); \
- } while (0)
#define inc_node_version(version) (++version)
+static inline void set_nat_flag(struct nat_entry *ne,
+ unsigned int type, bool set)
+{
+ unsigned char mask = 0x01 << type;
+ if (set)
+ ne->flag |= mask;
+ else
+ ne->flag &= ~mask;
+}
+
+static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type)
+{
+ unsigned char mask = 0x01 << type;
+ return ne->flag & mask;
+}
+
+static inline void nat_reset_flag(struct nat_entry *ne)
+{
+ /* these states can be set only after checkpoint was done */
+ set_nat_flag(ne, IS_CHECKPOINTED, true);
+ set_nat_flag(ne, HAS_FSYNCED_INODE, false);
+ set_nat_flag(ne, HAS_LAST_FSYNC, true);
+}
+
static inline void node_info_from_raw_nat(struct node_info *ni,
struct f2fs_nat_entry *raw_ne)
{
@@ -90,9 +110,9 @@ enum mem_type {
};
struct nat_entry_set {
- struct list_head set_list; /* link with all nat sets */
+ struct list_head set_list; /* link with other nat sets */
struct list_head entry_list; /* link with dirty nat entries */
- nid_t start_nid; /* start nid of nats in set */
+ nid_t set; /* set number*/
unsigned int entry_cnt; /* the # of nat entries in set */
};
@@ -110,18 +130,19 @@ struct free_nid {
int state; /* in use or not: NID_NEW or NID_ALLOC */
};
-static inline int next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
+static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *fnid;
- if (nm_i->fcnt <= 0)
- return -1;
spin_lock(&nm_i->free_nid_list_lock);
+ if (nm_i->fcnt <= 0) {
+ spin_unlock(&nm_i->free_nid_list_lock);
+ return;
+ }
fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list);
*nid = fnid->nid;
spin_unlock(&nm_i->free_nid_list_lock);
- return 0;
}
/*
@@ -197,8 +218,7 @@ static inline void copy_node_footer(struct page *dst, struct page *src)
static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr)
{
- struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
- struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page));
struct f2fs_node *rn = F2FS_NODE(page);
rn->footer.cp_ver = ckpt->checkpoint_ver;
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index fe1c6d921ba2..ebd013225788 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -14,6 +14,37 @@
#include "node.h"
#include "segment.h"
+/*
+ * Roll forward recovery scenarios.
+ *
+ * [Term] F: fsync_mark, D: dentry_mark
+ *
+ * 1. inode(x) | CP | inode(x) | dnode(F)
+ * -> Update the latest inode(x).
+ *
+ * 2. inode(x) | CP | inode(F) | dnode(F)
+ * -> No problem.
+ *
+ * 3. inode(x) | CP | dnode(F) | inode(x)
+ * -> Recover to the latest dnode(F), and drop the last inode(x)
+ *
+ * 4. inode(x) | CP | dnode(F) | inode(F)
+ * -> No problem.
+ *
+ * 5. CP | inode(x) | dnode(F)
+ * -> The inode(DF) was missing. Should drop this dnode(F).
+ *
+ * 6. CP | inode(DF) | dnode(F)
+ * -> No problem.
+ *
+ * 7. CP | dnode(F) | inode(DF)
+ * -> If f2fs_iget fails, then goto next to find inode(DF).
+ *
+ * 8. CP | dnode(F) | inode(x)
+ * -> If f2fs_iget fails, then goto next to find inode(DF).
+ * But it will fail due to no inode(DF).
+ */
+
static struct kmem_cache *fsync_entry_slab;
bool space_for_roll_forward(struct f2fs_sb_info *sbi)
@@ -36,7 +67,7 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
return NULL;
}
-static int recover_dentry(struct page *ipage, struct inode *inode)
+static int recover_dentry(struct inode *inode, struct page *ipage)
{
struct f2fs_inode *raw_inode = F2FS_INODE(ipage);
nid_t pino = le32_to_cpu(raw_inode->i_pino);
@@ -62,8 +93,10 @@ static int recover_dentry(struct page *ipage, struct inode *inode)
}
retry:
de = f2fs_find_entry(dir, &name, &page);
- if (de && inode->i_ino == le32_to_cpu(de->ino))
+ if (de && inode->i_ino == le32_to_cpu(de->ino)) {
+ clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
goto out_unmap_put;
+ }
if (de) {
einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino));
if (IS_ERR(einode)) {
@@ -73,7 +106,7 @@ retry:
err = -EEXIST;
goto out_unmap_put;
}
- err = acquire_orphan_inode(F2FS_SB(inode->i_sb));
+ err = acquire_orphan_inode(F2FS_I_SB(inode));
if (err) {
iput(einode);
goto out_unmap_put;
@@ -108,35 +141,28 @@ out:
return err;
}
-static int recover_inode(struct inode *inode, struct page *node_page)
+static void recover_inode(struct inode *inode, struct page *page)
{
- struct f2fs_inode *raw_inode = F2FS_INODE(node_page);
+ struct f2fs_inode *raw = F2FS_INODE(page);
- if (!IS_INODE(node_page))
- return 0;
-
- inode->i_mode = le16_to_cpu(raw_inode->i_mode);
- i_size_write(inode, le64_to_cpu(raw_inode->i_size));
- inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
- inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
- inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
- inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
- inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
- inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
-
- if (is_dent_dnode(node_page))
- return recover_dentry(node_page, inode);
+ inode->i_mode = le16_to_cpu(raw->i_mode);
+ i_size_write(inode, le64_to_cpu(raw->i_size));
+ inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime);
+ inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime);
+ inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime);
+ inode->i_atime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
+ inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec);
+ inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s",
- ino_of_node(node_page), raw_inode->i_name);
- return 0;
+ ino_of_node(page), F2FS_INODE(page)->i_name);
}
static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
{
unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
struct curseg_info *curseg;
- struct page *page;
+ struct page *page = NULL;
block_t blkaddr;
int err = 0;
@@ -144,20 +170,13 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
- /* read node page */
- page = alloc_page(GFP_F2FS_ZERO);
- if (!page)
- return -ENOMEM;
- lock_page(page);
-
while (1) {
struct fsync_inode_entry *entry;
- err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC);
- if (err)
- return err;
+ if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi))
+ return 0;
- lock_page(page);
+ page = get_meta_page_ra(sbi, blkaddr);
if (cp_ver != cpver_of_node(page))
break;
@@ -178,33 +197,38 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
}
/* add this fsync inode to the list */
- entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS);
+ entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO);
if (!entry) {
err = -ENOMEM;
break;
}
-
+ /*
+ * CP | dnode(F) | inode(DF)
+ * For this case, we should not give up now.
+ */
entry->inode = f2fs_iget(sbi->sb, ino_of_node(page));
if (IS_ERR(entry->inode)) {
err = PTR_ERR(entry->inode);
kmem_cache_free(fsync_entry_slab, entry);
+ if (err == -ENOENT)
+ goto next;
break;
}
list_add_tail(&entry->list, head);
}
entry->blkaddr = blkaddr;
- err = recover_inode(entry->inode, page);
- if (err && err != -ENOENT)
- break;
+ if (IS_INODE(page)) {
+ entry->last_inode = blkaddr;
+ if (is_dent_dnode(page))
+ entry->last_dentry = blkaddr;
+ }
next:
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
+ f2fs_put_page(page, 1);
}
-
- unlock_page(page);
- __free_pages(page, 0);
-
+ f2fs_put_page(page, 1);
return err;
}
@@ -277,16 +301,30 @@ got_it:
ino = ino_of_node(node_page);
f2fs_put_page(node_page, 1);
- /* Deallocate previous index in the node page */
- inode = f2fs_iget(sbi->sb, ino);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
+ if (ino != dn->inode->i_ino) {
+ /* Deallocate previous index in the node page */
+ inode = f2fs_iget(sbi->sb, ino);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ } else {
+ inode = dn->inode;
+ }
bidx = start_bidx_of_node(offset, F2FS_I(inode)) +
- le16_to_cpu(sum.ofs_in_node);
+ le16_to_cpu(sum.ofs_in_node);
- truncate_hole(inode, bidx, bidx + 1);
- iput(inode);
+ if (ino != dn->inode->i_ino) {
+ truncate_hole(inode, bidx, bidx + 1);
+ iput(inode);
+ } else {
+ struct dnode_of_data tdn;
+ set_new_dnode(&tdn, inode, dn->inode_page, NULL, 0);
+ if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE))
+ return 0;
+ if (tdn.data_blkaddr != NULL_ADDR)
+ truncate_data_blocks_range(&tdn, 1);
+ f2fs_put_page(tdn.node_page, 1);
+ }
return 0;
}
@@ -300,14 +338,19 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
struct node_info ni;
int err = 0, recovered = 0;
- recover_inline_xattr(inode, page);
-
- if (recover_inline_data(inode, page))
+ /* step 1: recover xattr */
+ if (IS_INODE(page)) {
+ recover_inline_xattr(inode, page);
+ } else if (f2fs_has_xattr_block(ofs_of_node(page))) {
+ recover_xattr_data(inode, page, blkaddr);
goto out;
+ }
- if (recover_xattr_data(inode, page, blkaddr))
+ /* step 2: recover inline data */
+ if (recover_inline_data(inode, page))
goto out;
+ /* step 3: recover data indices */
start = start_bidx_of_node(ofs_of_node(page), fi);
end = start + ADDRS_PER_PAGE(page, fi);
@@ -324,8 +367,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
f2fs_wait_on_page_writeback(dn.node_page, NODE);
get_node_info(sbi, dn.nid, &ni);
- f2fs_bug_on(ni.ino != ino_of_node(page));
- f2fs_bug_on(ofs_of_node(dn.node_page) != ofs_of_node(page));
+ f2fs_bug_on(sbi, ni.ino != ino_of_node(page));
+ f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page));
for (; start < end; start++) {
block_t src, dest;
@@ -337,7 +380,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
if (src == NULL_ADDR) {
err = reserve_new_block(&dn);
/* We should not get -ENOSPC */
- f2fs_bug_on(err);
+ f2fs_bug_on(sbi, err);
}
/* Check the previous node page having this index */
@@ -364,8 +407,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
fill_node_footer(dn.node_page, dn.nid, ni.ino,
ofs_of_node(page), false);
set_page_dirty(dn.node_page);
-
- recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr);
err:
f2fs_put_dnode(&dn);
f2fs_unlock_op(sbi);
@@ -381,7 +422,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
{
unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
struct curseg_info *curseg;
- struct page *page;
+ struct page *page = NULL;
int err = 0;
block_t blkaddr;
@@ -389,32 +430,41 @@ static int recover_data(struct f2fs_sb_info *sbi,
curseg = CURSEG_I(sbi, type);
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
- /* read node page */
- page = alloc_page(GFP_F2FS_ZERO);
- if (!page)
- return -ENOMEM;
-
- lock_page(page);
-
while (1) {
struct fsync_inode_entry *entry;
- err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC);
- if (err)
- return err;
+ if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi))
+ break;
- lock_page(page);
+ page = get_meta_page_ra(sbi, blkaddr);
- if (cp_ver != cpver_of_node(page))
+ if (cp_ver != cpver_of_node(page)) {
+ f2fs_put_page(page, 1);
break;
+ }
entry = get_fsync_inode(head, ino_of_node(page));
if (!entry)
goto next;
-
+ /*
+ * inode(x) | CP | inode(x) | dnode(F)
+ * In this case, we can lose the latest inode(x).
+ * So, call recover_inode for the inode update.
+ */
+ if (entry->last_inode == blkaddr)
+ recover_inode(entry->inode, page);
+ if (entry->last_dentry == blkaddr) {
+ err = recover_dentry(entry->inode, page);
+ if (err) {
+ f2fs_put_page(page, 1);
+ break;
+ }
+ }
err = do_recover_data(sbi, entry->inode, page, blkaddr);
- if (err)
+ if (err) {
+ f2fs_put_page(page, 1);
break;
+ }
if (entry->blkaddr == blkaddr) {
iput(entry->inode);
@@ -424,11 +474,8 @@ static int recover_data(struct f2fs_sb_info *sbi,
next:
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
+ f2fs_put_page(page, 1);
}
-
- unlock_page(page);
- __free_pages(page, 0);
-
if (!err)
allocate_new_segments(sbi);
return err;
@@ -452,6 +499,9 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
/* step #1: find fsynced inode numbers */
sbi->por_doing = true;
+ /* prevent checkpoint */
+ mutex_lock(&sbi->cp_mutex);
+
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
err = find_fsync_dnodes(sbi, &inode_list);
@@ -465,11 +515,16 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
/* step #2: recover data */
err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
- f2fs_bug_on(!list_empty(&inode_list));
+ if (!err)
+ f2fs_bug_on(sbi, !list_empty(&inode_list));
out:
destroy_fsync_dnodes(&inode_list);
kmem_cache_destroy(fsync_entry_slab);
+ /* truncate meta pages to be used by the recovery */
+ truncate_inode_pages_range(META_MAPPING(sbi),
+ MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1);
+
if (err) {
truncate_inode_pages_final(NODE_MAPPING(sbi));
truncate_inode_pages_final(META_MAPPING(sbi));
@@ -482,8 +537,16 @@ out:
/* Flush all the NAT/SIT pages */
while (get_pages(sbi, F2FS_DIRTY_META))
sync_meta_pages(sbi, META, LONG_MAX);
+ set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
+ mutex_unlock(&sbi->cp_mutex);
} else if (need_writecp) {
- write_checkpoint(sbi, false);
+ struct cp_control cpc = {
+ .reason = CP_SYNC,
+ };
+ mutex_unlock(&sbi->cp_mutex);
+ write_checkpoint(sbi, &cpc);
+ } else {
+ mutex_unlock(&sbi->cp_mutex);
}
return err;
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 0dfeebae2a50..923cb76fdc46 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -25,6 +25,8 @@
#define __reverse_ffz(x) __reverse_ffs(~(x))
static struct kmem_cache *discard_entry_slab;
+static struct kmem_cache *sit_entry_set_slab;
+static struct kmem_cache *inmem_entry_slab;
/*
* __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
@@ -62,7 +64,7 @@ static inline unsigned long __reverse_ffs(unsigned long word)
}
/*
- * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c becasue
+ * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because
* f2fs_set_bit makes MSB and LSB reversed in a byte.
* Example:
* LSB <--> MSB
@@ -172,6 +174,60 @@ found_middle:
return result + __reverse_ffz(tmp);
}
+void register_inmem_page(struct inode *inode, struct page *page)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct inmem_pages *new;
+
+ new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
+
+ /* add atomic page indices to the list */
+ new->page = page;
+ INIT_LIST_HEAD(&new->list);
+
+ /* increase reference count with clean state */
+ mutex_lock(&fi->inmem_lock);
+ get_page(page);
+ list_add_tail(&new->list, &fi->inmem_pages);
+ mutex_unlock(&fi->inmem_lock);
+}
+
+void commit_inmem_pages(struct inode *inode, bool abort)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct inmem_pages *cur, *tmp;
+ bool submit_bio = false;
+ struct f2fs_io_info fio = {
+ .type = DATA,
+ .rw = WRITE_SYNC,
+ };
+
+ f2fs_balance_fs(sbi);
+ f2fs_lock_op(sbi);
+
+ mutex_lock(&fi->inmem_lock);
+ list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
+ lock_page(cur->page);
+ if (!abort && cur->page->mapping == inode->i_mapping) {
+ f2fs_wait_on_page_writeback(cur->page, DATA);
+ if (clear_page_dirty_for_io(cur->page))
+ inode_dec_dirty_pages(inode);
+ do_write_data_page(cur->page, &fio);
+ submit_bio = true;
+ }
+ f2fs_put_page(cur->page, 1);
+ list_del(&cur->list);
+ kmem_cache_free(inmem_entry_slab, cur);
+ }
+ if (submit_bio)
+ f2fs_submit_merged_bio(sbi, DATA, WRITE);
+ mutex_unlock(&fi->inmem_lock);
+
+ filemap_fdatawait_range(inode->i_mapping, 0, LLONG_MAX);
+ f2fs_unlock_op(sbi);
+}
+
/*
* This function balances dirty node and dentry pages.
* In addition, it controls garbage collection.
@@ -205,24 +261,20 @@ repeat:
if (kthread_should_stop())
return 0;
- spin_lock(&fcc->issue_lock);
- if (fcc->issue_list) {
- fcc->dispatch_list = fcc->issue_list;
- fcc->issue_list = fcc->issue_tail = NULL;
- }
- spin_unlock(&fcc->issue_lock);
-
- if (fcc->dispatch_list) {
+ if (!llist_empty(&fcc->issue_list)) {
struct bio *bio = bio_alloc(GFP_NOIO, 0);
struct flush_cmd *cmd, *next;
int ret;
+ fcc->dispatch_list = llist_del_all(&fcc->issue_list);
+ fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
+
bio->bi_bdev = sbi->sb->s_bdev;
ret = submit_bio_wait(WRITE_FLUSH, bio);
- for (cmd = fcc->dispatch_list; cmd; cmd = next) {
+ llist_for_each_entry_safe(cmd, next,
+ fcc->dispatch_list, llnode) {
cmd->ret = ret;
- next = cmd->next;
complete(&cmd->wait);
}
bio_put(bio);
@@ -230,7 +282,7 @@ repeat:
}
wait_event_interruptible(*q,
- kthread_should_stop() || fcc->issue_list);
+ kthread_should_stop() || !llist_empty(&fcc->issue_list));
goto repeat;
}
@@ -249,15 +301,8 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL);
init_completion(&cmd.wait);
- cmd.next = NULL;
- spin_lock(&fcc->issue_lock);
- if (fcc->issue_list)
- fcc->issue_tail->next = &cmd;
- else
- fcc->issue_list = &cmd;
- fcc->issue_tail = &cmd;
- spin_unlock(&fcc->issue_lock);
+ llist_add(&cmd.llnode, &fcc->issue_list);
if (!fcc->dispatch_list)
wake_up(&fcc->flush_wait_queue);
@@ -276,8 +321,8 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi)
fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL);
if (!fcc)
return -ENOMEM;
- spin_lock_init(&fcc->issue_lock);
init_waitqueue_head(&fcc->flush_wait_queue);
+ init_llist_head(&fcc->issue_list);
SM_I(sbi)->cmd_control_info = fcc;
fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi,
"f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
@@ -317,6 +362,10 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
struct seg_entry *sentry = get_seg_entry(sbi, segno);
enum dirty_type t = sentry->type;
+ if (unlikely(t >= DIRTY)) {
+ f2fs_bug_on(sbi, 1);
+ return;
+ }
if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t]))
dirty_i->nr_dirty[t]++;
}
@@ -376,8 +425,8 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
block_t blkstart, block_t blklen)
{
- sector_t start = SECTOR_FROM_BLOCK(sbi, blkstart);
- sector_t len = SECTOR_FROM_BLOCK(sbi, blklen);
+ sector_t start = SECTOR_FROM_BLOCK(blkstart);
+ sector_t len = SECTOR_FROM_BLOCK(blklen);
trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0);
}
@@ -392,21 +441,47 @@ void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr)
}
}
-static void add_discard_addrs(struct f2fs_sb_info *sbi,
- unsigned int segno, struct seg_entry *se)
+static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct list_head *head = &SM_I(sbi)->discard_list;
struct discard_entry *new;
int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
int max_blocks = sbi->blocks_per_seg;
+ struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start);
unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
unsigned long dmap[entries];
unsigned int start = 0, end = -1;
+ bool force = (cpc->reason == CP_DISCARD);
int i;
- if (!test_opt(sbi, DISCARD))
+ if (!force && !test_opt(sbi, DISCARD))
+ return;
+
+ if (force && !se->valid_blocks) {
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ /*
+ * if this segment is registered in the prefree list, then
+ * we should skip adding a discard candidate, and let the
+ * checkpoint do that later.
+ */
+ mutex_lock(&dirty_i->seglist_lock);
+ if (test_bit(cpc->trim_start, dirty_i->dirty_segmap[PRE])) {
+ mutex_unlock(&dirty_i->seglist_lock);
+ cpc->trimmed += sbi->blocks_per_seg;
+ return;
+ }
+ mutex_unlock(&dirty_i->seglist_lock);
+
+ new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
+ INIT_LIST_HEAD(&new->list);
+ new->blkaddr = START_BLOCK(sbi, cpc->trim_start);
+ new->len = sbi->blocks_per_seg;
+ list_add_tail(&new->list, head);
+ SM_I(sbi)->nr_discards += sbi->blocks_per_seg;
+ cpc->trimmed += sbi->blocks_per_seg;
return;
+ }
/* zero block will be discarded through the prefree list */
if (!se->valid_blocks || se->valid_blocks == max_blocks)
@@ -416,23 +491,39 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi,
for (i = 0; i < entries; i++)
dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
- while (SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
+ while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
start = __find_rev_next_bit(dmap, max_blocks, end + 1);
if (start >= max_blocks)
break;
end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1);
+ if (end - start < cpc->trim_minlen)
+ continue;
+
new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
INIT_LIST_HEAD(&new->list);
- new->blkaddr = START_BLOCK(sbi, segno) + start;
+ new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start;
new->len = end - start;
+ cpc->trimmed += end - start;
list_add_tail(&new->list, head);
SM_I(sbi)->nr_discards += end - start;
}
}
+void release_discard_addrs(struct f2fs_sb_info *sbi)
+{
+ struct list_head *head = &(SM_I(sbi)->discard_list);
+ struct discard_entry *entry, *this;
+
+ /* drop caches */
+ list_for_each_entry_safe(entry, this, head, list) {
+ list_del(&entry->list);
+ kmem_cache_free(discard_entry_slab, entry);
+ }
+}
+
/*
* Should call clear_prefree_segments after checkpoint is done.
*/
@@ -440,10 +531,9 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned int segno;
- unsigned int total_segs = TOTAL_SEGS(sbi);
mutex_lock(&dirty_i->seglist_lock);
- for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], total_segs)
+ for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
__set_test_and_free(sbi, segno);
mutex_unlock(&dirty_i->seglist_lock);
}
@@ -454,17 +544,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
struct discard_entry *entry, *this;
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
- unsigned int total_segs = TOTAL_SEGS(sbi);
unsigned int start = 0, end = -1;
mutex_lock(&dirty_i->seglist_lock);
while (1) {
int i;
- start = find_next_bit(prefree_map, total_segs, end + 1);
- if (start >= total_segs)
+ start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
+ if (start >= MAIN_SEGS(sbi))
break;
- end = find_next_zero_bit(prefree_map, total_segs, start + 1);
+ end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi),
+ start + 1);
for (i = start; i < end; i++)
clear_bit(i, prefree_map);
@@ -488,11 +578,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
}
}
-static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
+static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
{
struct sit_info *sit_i = SIT_I(sbi);
- if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap))
+
+ if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) {
sit_i->dirty_sentries++;
+ return false;
+ }
+
+ return true;
}
static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type,
@@ -516,7 +611,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
new_vblocks = se->valid_blocks + del;
offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
- f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) ||
+ f2fs_bug_on(sbi, (new_vblocks >> (sizeof(unsigned short) << 3) ||
(new_vblocks > sbi->blocks_per_seg)));
se->valid_blocks = new_vblocks;
@@ -526,10 +621,10 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
/* Update valid block bitmap */
if (del > 0) {
if (f2fs_set_bit(offset, se->cur_valid_map))
- BUG();
+ f2fs_bug_on(sbi, 1);
} else {
if (!f2fs_clear_bit(offset, se->cur_valid_map))
- BUG();
+ f2fs_bug_on(sbi, 1);
}
if (!f2fs_test_bit(offset, se->ckpt_valid_map))
se->ckpt_valid_blocks += del;
@@ -558,7 +653,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
unsigned int segno = GET_SEGNO(sbi, addr);
struct sit_info *sit_i = SIT_I(sbi);
- f2fs_bug_on(addr == NULL_ADDR);
+ f2fs_bug_on(sbi, addr == NULL_ADDR);
if (addr == NEW_ADDR)
return;
@@ -634,7 +729,7 @@ static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
unsigned int segno = curseg->segno + 1;
struct free_segmap_info *free_i = FREE_I(sbi);
- if (segno < TOTAL_SEGS(sbi) && segno % sbi->segs_per_sec)
+ if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec)
return !test_bit(segno, free_i->free_segmap);
return 0;
}
@@ -648,7 +743,7 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
{
struct free_segmap_info *free_i = FREE_I(sbi);
unsigned int segno, secno, zoneno;
- unsigned int total_zones = TOTAL_SECS(sbi) / sbi->secs_per_zone;
+ unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone;
unsigned int hint = *newseg / sbi->segs_per_sec;
unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg);
unsigned int left_start = hint;
@@ -660,18 +755,18 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
segno = find_next_zero_bit(free_i->free_segmap,
- TOTAL_SEGS(sbi), *newseg + 1);
+ MAIN_SEGS(sbi), *newseg + 1);
if (segno - *newseg < sbi->segs_per_sec -
(*newseg % sbi->segs_per_sec))
goto got_it;
}
find_other_zone:
- secno = find_next_zero_bit(free_i->free_secmap, TOTAL_SECS(sbi), hint);
- if (secno >= TOTAL_SECS(sbi)) {
+ secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint);
+ if (secno >= MAIN_SECS(sbi)) {
if (dir == ALLOC_RIGHT) {
secno = find_next_zero_bit(free_i->free_secmap,
- TOTAL_SECS(sbi), 0);
- f2fs_bug_on(secno >= TOTAL_SECS(sbi));
+ MAIN_SECS(sbi), 0);
+ f2fs_bug_on(sbi, secno >= MAIN_SECS(sbi));
} else {
go_left = 1;
left_start = hint - 1;
@@ -686,8 +781,8 @@ find_other_zone:
continue;
}
left_start = find_next_zero_bit(free_i->free_secmap,
- TOTAL_SECS(sbi), 0);
- f2fs_bug_on(left_start >= TOTAL_SECS(sbi));
+ MAIN_SECS(sbi), 0);
+ f2fs_bug_on(sbi, left_start >= MAIN_SECS(sbi));
break;
}
secno = left_start;
@@ -726,7 +821,7 @@ skip_left:
}
got_it:
/* set it as dirty segment in free segmap */
- f2fs_bug_on(test_bit(segno, free_i->free_segmap));
+ f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap));
__set_inuse(sbi, segno);
*newseg = segno;
write_unlock(&free_i->segmap_lock);
@@ -808,7 +903,7 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi,
}
/*
- * This function always allocates a used segment (from dirty seglist) by SSR
+ * This function always allocates a used segment(from dirty seglist) by SSR
* manner, so it should recover the existing segment information of valid blocks
*/
static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse)
@@ -898,6 +993,37 @@ static const struct segment_allocation default_salloc_ops = {
.allocate_segment = allocate_segment_by_default,
};
+int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
+{
+ __u64 start = range->start >> sbi->log_blocksize;
+ __u64 end = start + (range->len >> sbi->log_blocksize) - 1;
+ unsigned int start_segno, end_segno;
+ struct cp_control cpc;
+
+ if (range->minlen > SEGMENT_SIZE(sbi) || start >= MAX_BLKADDR(sbi) ||
+ range->len < sbi->blocksize)
+ return -EINVAL;
+
+ if (end <= MAIN_BLKADDR(sbi))
+ goto out;
+
+ /* start/end segment number in main_area */
+ start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start);
+ end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
+ GET_SEGNO(sbi, end);
+ cpc.reason = CP_DISCARD;
+ cpc.trim_start = start_segno;
+ cpc.trim_end = end_segno;
+ cpc.trim_minlen = range->minlen >> sbi->log_blocksize;
+ cpc.trimmed = 0;
+
+ /* do checkpoint to issue discard commands safely */
+ write_checkpoint(sbi, &cpc);
+out:
+ range->len = cpc.trimmed << sbi->log_blocksize;
+ return 0;
+}
+
static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -953,15 +1079,15 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type)
static int __get_segment_type(struct page *page, enum page_type p_type)
{
- struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
- switch (sbi->active_logs) {
+ switch (F2FS_P_SB(page)->active_logs) {
case 2:
return __get_segment_type_2(page, p_type);
case 4:
return __get_segment_type_4(page, p_type);
}
/* NR_CURSEG_TYPE(6) logs by default */
- f2fs_bug_on(sbi->active_logs != NR_CURSEG_TYPE);
+ f2fs_bug_on(F2FS_P_SB(page),
+ F2FS_P_SB(page)->active_logs != NR_CURSEG_TYPE);
return __get_segment_type_6(page, p_type);
}
@@ -1041,11 +1167,11 @@ void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
void write_data_page(struct page *page, struct dnode_of_data *dn,
block_t *new_blkaddr, struct f2fs_io_info *fio)
{
- struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
struct f2fs_summary sum;
struct node_info ni;
- f2fs_bug_on(dn->data_blkaddr == NULL_ADDR);
+ f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR);
get_node_info(sbi, dn->nid, &ni);
set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
@@ -1055,9 +1181,7 @@ void write_data_page(struct page *page, struct dnode_of_data *dn,
void rewrite_data_page(struct page *page, block_t old_blkaddr,
struct f2fs_io_info *fio)
{
- struct inode *inode = page->mapping->host;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- f2fs_submit_page_mbio(sbi, page, old_blkaddr, fio);
+ f2fs_submit_page_mbio(F2FS_P_SB(page), page, old_blkaddr, fio);
}
void recover_data_page(struct f2fs_sb_info *sbi,
@@ -1103,55 +1227,6 @@ void recover_data_page(struct f2fs_sb_info *sbi,
mutex_unlock(&curseg->curseg_mutex);
}
-void rewrite_node_page(struct f2fs_sb_info *sbi,
- struct page *page, struct f2fs_summary *sum,
- block_t old_blkaddr, block_t new_blkaddr)
-{
- struct sit_info *sit_i = SIT_I(sbi);
- int type = CURSEG_WARM_NODE;
- struct curseg_info *curseg;
- unsigned int segno, old_cursegno;
- block_t next_blkaddr = next_blkaddr_of_node(page);
- unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr);
- struct f2fs_io_info fio = {
- .type = NODE,
- .rw = WRITE_SYNC,
- };
-
- curseg = CURSEG_I(sbi, type);
-
- mutex_lock(&curseg->curseg_mutex);
- mutex_lock(&sit_i->sentry_lock);
-
- segno = GET_SEGNO(sbi, new_blkaddr);
- old_cursegno = curseg->segno;
-
- /* change the current segment */
- if (segno != curseg->segno) {
- curseg->next_segno = segno;
- change_curseg(sbi, type, true);
- }
- curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
- __add_sum_entry(sbi, type, sum);
-
- /* change the current log to the next block addr in advance */
- if (next_segno != segno) {
- curseg->next_segno = next_segno;
- change_curseg(sbi, type, true);
- }
- curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, next_blkaddr);
-
- /* rewrite node page */
- set_page_writeback(page);
- f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio);
- f2fs_submit_merged_bio(sbi, NODE, WRITE);
- refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
- locate_dirty_segment(sbi, old_cursegno);
-
- mutex_unlock(&sit_i->sentry_lock);
- mutex_unlock(&curseg->curseg_mutex);
-}
-
static inline bool is_merged_page(struct f2fs_sb_info *sbi,
struct page *page, enum page_type type)
{
@@ -1179,8 +1254,9 @@ out:
void f2fs_wait_on_page_writeback(struct page *page,
enum page_type type)
{
- struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
if (PageWriteback(page)) {
+ struct f2fs_sb_info *sbi = F2FS_P_SB(page);
+
if (is_merged_page(sbi, page, type))
f2fs_submit_merged_bio(sbi, type, WRITE);
wait_on_page_writeback(page);
@@ -1449,7 +1525,7 @@ static struct page *get_current_sit_page(struct f2fs_sb_info *sbi,
unsigned int segno)
{
struct sit_info *sit_i = SIT_I(sbi);
- unsigned int offset = SIT_BLOCK_OFFSET(sit_i, segno);
+ unsigned int offset = SIT_BLOCK_OFFSET(segno);
block_t blk_addr = sit_i->sit_base_addr + offset;
check_seg_range(sbi, segno);
@@ -1475,7 +1551,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
/* get current sit block page without lock */
src_page = get_meta_page(sbi, src_off);
dst_page = grab_meta_page(sbi, dst_off);
- f2fs_bug_on(PageDirty(src_page));
+ f2fs_bug_on(sbi, PageDirty(src_page));
src_addr = page_address(src_page);
dst_addr = page_address(dst_page);
@@ -1489,101 +1565,192 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
return dst_page;
}
-static bool flush_sits_in_journal(struct f2fs_sb_info *sbi)
+static struct sit_entry_set *grab_sit_entry_set(void)
+{
+ struct sit_entry_set *ses =
+ f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_ATOMIC);
+
+ ses->entry_cnt = 0;
+ INIT_LIST_HEAD(&ses->set_list);
+ return ses;
+}
+
+static void release_sit_entry_set(struct sit_entry_set *ses)
+{
+ list_del(&ses->set_list);
+ kmem_cache_free(sit_entry_set_slab, ses);
+}
+
+static void adjust_sit_entry_set(struct sit_entry_set *ses,
+ struct list_head *head)
+{
+ struct sit_entry_set *next = ses;
+
+ if (list_is_last(&ses->set_list, head))
+ return;
+
+ list_for_each_entry_continue(next, head, set_list)
+ if (ses->entry_cnt <= next->entry_cnt)
+ break;
+
+ list_move_tail(&ses->set_list, &next->set_list);
+}
+
+static void add_sit_entry(unsigned int segno, struct list_head *head)
+{
+ struct sit_entry_set *ses;
+ unsigned int start_segno = START_SEGNO(segno);
+
+ list_for_each_entry(ses, head, set_list) {
+ if (ses->start_segno == start_segno) {
+ ses->entry_cnt++;
+ adjust_sit_entry_set(ses, head);
+ return;
+ }
+ }
+
+ ses = grab_sit_entry_set();
+
+ ses->start_segno = start_segno;
+ ses->entry_cnt++;
+ list_add(&ses->set_list, head);
+}
+
+static void add_sits_in_set(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_sm_info *sm_info = SM_I(sbi);
+ struct list_head *set_list = &sm_info->sit_entry_set;
+ unsigned long *bitmap = SIT_I(sbi)->dirty_sentries_bitmap;
+ unsigned int segno;
+
+ for_each_set_bit(segno, bitmap, MAIN_SEGS(sbi))
+ add_sit_entry(segno, set_list);
+}
+
+static void remove_sits_in_journal(struct f2fs_sb_info *sbi)
{
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
struct f2fs_summary_block *sum = curseg->sum_blk;
int i;
- /*
- * If the journal area in the current summary is full of sit entries,
- * all the sit entries will be flushed. Otherwise the sit entries
- * are not able to replace with newly hot sit entries.
- */
- if (sits_in_cursum(sum) >= SIT_JOURNAL_ENTRIES) {
- for (i = sits_in_cursum(sum) - 1; i >= 0; i--) {
- unsigned int segno;
- segno = le32_to_cpu(segno_in_journal(sum, i));
- __mark_sit_entry_dirty(sbi, segno);
- }
- update_sits_in_cursum(sum, -sits_in_cursum(sum));
- return true;
+ for (i = sits_in_cursum(sum) - 1; i >= 0; i--) {
+ unsigned int segno;
+ bool dirtied;
+
+ segno = le32_to_cpu(segno_in_journal(sum, i));
+ dirtied = __mark_sit_entry_dirty(sbi, segno);
+
+ if (!dirtied)
+ add_sit_entry(segno, &SM_I(sbi)->sit_entry_set);
}
- return false;
+ update_sits_in_cursum(sum, -sits_in_cursum(sum));
}
/*
* CP calls this function, which flushes SIT entries including sit_journal,
* and moves prefree segs to free segs.
*/
-void flush_sit_entries(struct f2fs_sb_info *sbi)
+void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct sit_info *sit_i = SIT_I(sbi);
unsigned long *bitmap = sit_i->dirty_sentries_bitmap;
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
struct f2fs_summary_block *sum = curseg->sum_blk;
- unsigned long nsegs = TOTAL_SEGS(sbi);
- struct page *page = NULL;
- struct f2fs_sit_block *raw_sit = NULL;
- unsigned int start = 0, end = 0;
- unsigned int segno;
- bool flushed;
+ struct sit_entry_set *ses, *tmp;
+ struct list_head *head = &SM_I(sbi)->sit_entry_set;
+ bool to_journal = true;
+ struct seg_entry *se;
mutex_lock(&curseg->curseg_mutex);
mutex_lock(&sit_i->sentry_lock);
/*
- * "flushed" indicates whether sit entries in journal are flushed
- * to the SIT area or not.
+ * add and account sit entries of dirty bitmap in sit entry
+ * set temporarily
*/
- flushed = flush_sits_in_journal(sbi);
+ add_sits_in_set(sbi);
- for_each_set_bit(segno, bitmap, nsegs) {
- struct seg_entry *se = get_seg_entry(sbi, segno);
- int sit_offset, offset;
+ /*
+ * if there are no enough space in journal to store dirty sit
+ * entries, remove all entries from journal and add and account
+ * them in sit entry set.
+ */
+ if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL))
+ remove_sits_in_journal(sbi);
- sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
+ if (!sit_i->dirty_sentries)
+ goto out;
- /* add discard candidates */
- if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards)
- add_discard_addrs(sbi, segno, se);
+ /*
+ * there are two steps to flush sit entries:
+ * #1, flush sit entries to journal in current cold data summary block.
+ * #2, flush sit entries to sit page.
+ */
+ list_for_each_entry_safe(ses, tmp, head, set_list) {
+ struct page *page;
+ struct f2fs_sit_block *raw_sit = NULL;
+ unsigned int start_segno = ses->start_segno;
+ unsigned int end = min(start_segno + SIT_ENTRY_PER_BLOCK,
+ (unsigned long)MAIN_SEGS(sbi));
+ unsigned int segno = start_segno;
+
+ if (to_journal &&
+ !__has_cursum_space(sum, ses->entry_cnt, SIT_JOURNAL))
+ to_journal = false;
+
+ if (!to_journal) {
+ page = get_next_sit_page(sbi, start_segno);
+ raw_sit = page_address(page);
+ }
- if (flushed)
- goto to_sit_page;
+ /* flush dirty sit entries in region of current sit set */
+ for_each_set_bit_from(segno, bitmap, end) {
+ int offset, sit_offset;
- offset = lookup_journal_in_cursum(sum, SIT_JOURNAL, segno, 1);
- if (offset >= 0) {
- segno_in_journal(sum, offset) = cpu_to_le32(segno);
- seg_info_to_raw_sit(se, &sit_in_journal(sum, offset));
- goto flush_done;
- }
-to_sit_page:
- if (!page || (start > segno) || (segno > end)) {
- if (page) {
- f2fs_put_page(page, 1);
- page = NULL;
+ se = get_seg_entry(sbi, segno);
+
+ /* add discard candidates */
+ if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) {
+ cpc->trim_start = segno;
+ add_discard_addrs(sbi, cpc);
}
- start = START_SEGNO(sit_i, segno);
- end = start + SIT_ENTRY_PER_BLOCK - 1;
+ if (to_journal) {
+ offset = lookup_journal_in_cursum(sum,
+ SIT_JOURNAL, segno, 1);
+ f2fs_bug_on(sbi, offset < 0);
+ segno_in_journal(sum, offset) =
+ cpu_to_le32(segno);
+ seg_info_to_raw_sit(se,
+ &sit_in_journal(sum, offset));
+ } else {
+ sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
+ seg_info_to_raw_sit(se,
+ &raw_sit->entries[sit_offset]);
+ }
- /* read sit block that will be updated */
- page = get_next_sit_page(sbi, start);
- raw_sit = page_address(page);
+ __clear_bit(segno, bitmap);
+ sit_i->dirty_sentries--;
+ ses->entry_cnt--;
}
- /* udpate entry in SIT block */
- seg_info_to_raw_sit(se, &raw_sit->entries[sit_offset]);
-flush_done:
- __clear_bit(segno, bitmap);
- sit_i->dirty_sentries--;
+ if (!to_journal)
+ f2fs_put_page(page, 1);
+
+ f2fs_bug_on(sbi, ses->entry_cnt);
+ release_sit_entry_set(ses);
+ }
+
+ f2fs_bug_on(sbi, !list_empty(head));
+ f2fs_bug_on(sbi, sit_i->dirty_sentries);
+out:
+ if (cpc->reason == CP_DISCARD) {
+ for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++)
+ add_discard_addrs(sbi, cpc);
}
mutex_unlock(&sit_i->sentry_lock);
mutex_unlock(&curseg->curseg_mutex);
- /* writeout last modified SIT block */
- f2fs_put_page(page, 1);
-
set_prefree_as_free_segments(sbi);
}
@@ -1603,16 +1770,16 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
SM_I(sbi)->sit_info = sit_i;
- sit_i->sentries = vzalloc(TOTAL_SEGS(sbi) * sizeof(struct seg_entry));
+ sit_i->sentries = vzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry));
if (!sit_i->sentries)
return -ENOMEM;
- bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
+ bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
if (!sit_i->dirty_sentries_bitmap)
return -ENOMEM;
- for (start = 0; start < TOTAL_SEGS(sbi); start++) {
+ for (start = 0; start < MAIN_SEGS(sbi); start++) {
sit_i->sentries[start].cur_valid_map
= kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
sit_i->sentries[start].ckpt_valid_map
@@ -1623,7 +1790,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
}
if (sbi->segs_per_sec > 1) {
- sit_i->sec_entries = vzalloc(TOTAL_SECS(sbi) *
+ sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) *
sizeof(struct sec_entry));
if (!sit_i->sec_entries)
return -ENOMEM;
@@ -1658,7 +1825,6 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
static int build_free_segmap(struct f2fs_sb_info *sbi)
{
- struct f2fs_sm_info *sm_info = SM_I(sbi);
struct free_segmap_info *free_i;
unsigned int bitmap_size, sec_bitmap_size;
@@ -1669,12 +1835,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
SM_I(sbi)->free_info = free_i;
- bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
+ bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL);
if (!free_i->free_segmap)
return -ENOMEM;
- sec_bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi));
+ sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL);
if (!free_i->free_secmap)
return -ENOMEM;
@@ -1684,8 +1850,7 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
memset(free_i->free_secmap, 0xff, sec_bitmap_size);
/* init free segmap information */
- free_i->start_segno =
- (unsigned int) GET_SEGNO_FROM_SEG0(sbi, sm_info->main_blkaddr);
+ free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi));
free_i->free_segments = 0;
free_i->free_sections = 0;
rwlock_init(&free_i->segmap_lock);
@@ -1722,7 +1887,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
int sit_blk_cnt = SIT_BLK_CNT(sbi);
unsigned int i, start, end;
unsigned int readed, start_blk = 0;
- int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
+ int nrpages = MAX_BIO_BLOCKS(sbi);
do {
readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT);
@@ -1730,7 +1895,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
start = start_blk * sit_i->sents_per_block;
end = (start_blk + readed) * sit_i->sents_per_block;
- for (; start < end && start < TOTAL_SEGS(sbi); start++) {
+ for (; start < end && start < MAIN_SEGS(sbi); start++) {
struct seg_entry *se = &sit_i->sentries[start];
struct f2fs_sit_block *sit_blk;
struct f2fs_sit_entry sit;
@@ -1768,7 +1933,7 @@ static void init_free_segmap(struct f2fs_sb_info *sbi)
unsigned int start;
int type;
- for (start = 0; start < TOTAL_SEGS(sbi); start++) {
+ for (start = 0; start < MAIN_SEGS(sbi); start++) {
struct seg_entry *sentry = get_seg_entry(sbi, start);
if (!sentry->valid_blocks)
__set_free(sbi, start);
@@ -1785,18 +1950,22 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int segno = 0, offset = 0, total_segs = TOTAL_SEGS(sbi);
+ unsigned int segno = 0, offset = 0;
unsigned short valid_blocks;
while (1) {
/* find dirty segment based on free segmap */
- segno = find_next_inuse(free_i, total_segs, offset);
- if (segno >= total_segs)
+ segno = find_next_inuse(free_i, MAIN_SEGS(sbi), offset);
+ if (segno >= MAIN_SEGS(sbi))
break;
offset = segno + 1;
valid_blocks = get_valid_blocks(sbi, segno, 0);
- if (valid_blocks >= sbi->blocks_per_seg || !valid_blocks)
+ if (valid_blocks == sbi->blocks_per_seg || !valid_blocks)
+ continue;
+ if (valid_blocks > sbi->blocks_per_seg) {
+ f2fs_bug_on(sbi, 1);
continue;
+ }
mutex_lock(&dirty_i->seglist_lock);
__locate_dirty_segment(sbi, segno, DIRTY);
mutex_unlock(&dirty_i->seglist_lock);
@@ -1806,7 +1975,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
static int init_victim_secmap(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi));
+ unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL);
if (!dirty_i->victim_secmap)
@@ -1827,7 +1996,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi)
SM_I(sbi)->dirty_info = dirty_i;
mutex_init(&dirty_i->seglist_lock);
- bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
+ bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
for (i = 0; i < NR_DIRTY_TYPE; i++) {
dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL);
@@ -1851,7 +2020,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi)
sit_i->min_mtime = LLONG_MAX;
- for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) {
+ for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
unsigned int i;
unsigned long long mtime = 0;
@@ -1889,13 +2058,16 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
sm_info->rec_prefree_segments = sm_info->main_segments *
DEF_RECLAIM_PREFREE_SEGMENTS / 100;
- sm_info->ipu_policy = F2FS_IPU_DISABLE;
+ sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC;
sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
+ sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
INIT_LIST_HEAD(&sm_info->discard_list);
sm_info->nr_discards = 0;
sm_info->max_discards = 0;
+ INIT_LIST_HEAD(&sm_info->sit_entry_set);
+
if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) {
err = create_flush_cmd_control(sbi);
if (err)
@@ -1991,7 +2163,7 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
return;
if (sit_i->sentries) {
- for (start = 0; start < TOTAL_SEGS(sbi); start++) {
+ for (start = 0; start < MAIN_SEGS(sbi); start++) {
kfree(sit_i->sentries[start].cur_valid_map);
kfree(sit_i->sentries[start].ckpt_valid_map);
}
@@ -2025,11 +2197,30 @@ int __init create_segment_manager_caches(void)
discard_entry_slab = f2fs_kmem_cache_create("discard_entry",
sizeof(struct discard_entry));
if (!discard_entry_slab)
- return -ENOMEM;
+ goto fail;
+
+ sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set",
+ sizeof(struct nat_entry_set));
+ if (!sit_entry_set_slab)
+ goto destory_discard_entry;
+
+ inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry",
+ sizeof(struct inmem_pages));
+ if (!inmem_entry_slab)
+ goto destroy_sit_entry_set;
return 0;
+
+destroy_sit_entry_set:
+ kmem_cache_destroy(sit_entry_set_slab);
+destory_discard_entry:
+ kmem_cache_destroy(discard_entry_slab);
+fail:
+ return -ENOMEM;
}
void destroy_segment_manager_caches(void)
{
+ kmem_cache_destroy(sit_entry_set_slab);
kmem_cache_destroy(discard_entry_slab);
+ kmem_cache_destroy(inmem_entry_slab);
}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 55973f7b0330..2495bec1c621 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -45,16 +45,26 @@
(secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \
sbi->segs_per_sec)) \
-#define START_BLOCK(sbi, segno) \
- (SM_I(sbi)->seg0_blkaddr + \
+#define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr)
+#define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr)
+
+#define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments)
+#define MAIN_SECS(sbi) (sbi->total_sections)
+
+#define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count)
+#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << sbi->log_blocks_per_seg)
+
+#define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi))
+#define SEGMENT_SIZE(sbi) (1ULL << (sbi->log_blocksize + \
+ sbi->log_blocks_per_seg))
+
+#define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \
(GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg))
+
#define NEXT_FREE_BLKADDR(sbi, curseg) \
(START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff)
-#define MAIN_BASE_BLOCK(sbi) (SM_I(sbi)->main_blkaddr)
-
-#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) \
- ((blk_addr) - SM_I(sbi)->seg0_blkaddr)
+#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi))
#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \
(GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg)
#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \
@@ -77,23 +87,21 @@
#define SIT_ENTRY_OFFSET(sit_i, segno) \
(segno % sit_i->sents_per_block)
-#define SIT_BLOCK_OFFSET(sit_i, segno) \
+#define SIT_BLOCK_OFFSET(segno) \
(segno / SIT_ENTRY_PER_BLOCK)
-#define START_SEGNO(sit_i, segno) \
- (SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK)
+#define START_SEGNO(segno) \
+ (SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK)
#define SIT_BLK_CNT(sbi) \
- ((TOTAL_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK)
+ ((MAIN_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK)
#define f2fs_bitmap_size(nr) \
(BITS_TO_LONGS(nr) * sizeof(unsigned long))
-#define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments)
-#define TOTAL_SECS(sbi) (sbi->total_sections)
-#define SECTOR_FROM_BLOCK(sbi, blk_addr) \
- (((sector_t)blk_addr) << (sbi)->log_sectors_per_block)
-#define SECTOR_TO_BLOCK(sbi, sectors) \
- (sectors >> (sbi)->log_sectors_per_block)
-#define MAX_BIO_BLOCKS(max_hw_blocks) \
- (min((int)max_hw_blocks, BIO_MAX_PAGES))
+#define SECTOR_FROM_BLOCK(blk_addr) \
+ (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK)
+#define SECTOR_TO_BLOCK(sectors) \
+ (sectors >> F2FS_LOG_SECTORS_PER_BLOCK)
+#define MAX_BIO_BLOCKS(sbi) \
+ ((int)min((int)max_hw_blocks(sbi), BIO_MAX_PAGES))
/*
* indicate a block allocation direction: RIGHT and LEFT.
@@ -167,6 +175,11 @@ struct segment_allocation {
void (*allocate_segment)(struct f2fs_sb_info *, int, bool);
};
+struct inmem_pages {
+ struct list_head list;
+ struct page *page;
+};
+
struct sit_info {
const struct segment_allocation *s_ops;
@@ -237,6 +250,12 @@ struct curseg_info {
unsigned int next_segno; /* preallocated segment */
};
+struct sit_entry_set {
+ struct list_head set_list; /* link with all sit sets */
+ unsigned int start_segno; /* start segno of sits in set */
+ unsigned int entry_cnt; /* the # of sit entries in set */
+};
+
/*
* inline functions
*/
@@ -316,7 +335,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
clear_bit(segno, free_i->free_segmap);
free_i->free_segments++;
- next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi), start_segno);
+ next = find_next_bit(free_i->free_segmap, MAIN_SEGS(sbi), start_segno);
if (next >= start_segno + sbi->segs_per_sec) {
clear_bit(secno, free_i->free_secmap);
free_i->free_sections++;
@@ -430,8 +449,10 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi)
static inline bool need_SSR(struct f2fs_sb_info *sbi)
{
- return (prefree_segments(sbi) / sbi->segs_per_sec)
- + free_sections(sbi) < overprovision_sections(sbi);
+ int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
+ int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
+ return free_sections(sbi) <= (node_secs + 2 * dent_secs +
+ reserved_sections(sbi) + 1);
}
static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
@@ -466,48 +487,47 @@ static inline int utilization(struct f2fs_sb_info *sbi)
* F2FS_IPU_UTIL - if FS utilization is over threashold,
* F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over
* threashold,
+ * F2FS_IPU_FSYNC - activated in fsync path only for high performance flash
+ * storages. IPU will be triggered only if the # of dirty
+ * pages over min_fsync_blocks.
* F2FS_IPUT_DISABLE - disable IPU. (=default option)
*/
#define DEF_MIN_IPU_UTIL 70
+#define DEF_MIN_FSYNC_BLOCKS 8
enum {
F2FS_IPU_FORCE,
F2FS_IPU_SSR,
F2FS_IPU_UTIL,
F2FS_IPU_SSR_UTIL,
- F2FS_IPU_DISABLE,
+ F2FS_IPU_FSYNC,
};
static inline bool need_inplace_update(struct inode *inode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ unsigned int policy = SM_I(sbi)->ipu_policy;
/* IPU can be done only for the user data */
- if (S_ISDIR(inode->i_mode))
+ if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode))
return false;
- /* this is only set during fdatasync */
- if (is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU))
+ if (policy & (0x1 << F2FS_IPU_FORCE))
+ return true;
+ if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi))
+ return true;
+ if (policy & (0x1 << F2FS_IPU_UTIL) &&
+ utilization(sbi) > SM_I(sbi)->min_ipu_util)
+ return true;
+ if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && need_SSR(sbi) &&
+ utilization(sbi) > SM_I(sbi)->min_ipu_util)
return true;
- switch (SM_I(sbi)->ipu_policy) {
- case F2FS_IPU_FORCE:
+ /* this is only set during fdatasync */
+ if (policy & (0x1 << F2FS_IPU_FSYNC) &&
+ is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU))
return true;
- case F2FS_IPU_SSR:
- if (need_SSR(sbi))
- return true;
- break;
- case F2FS_IPU_UTIL:
- if (utilization(sbi) > SM_I(sbi)->min_ipu_util)
- return true;
- break;
- case F2FS_IPU_SSR_UTIL:
- if (need_SSR(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util)
- return true;
- break;
- case F2FS_IPU_DISABLE:
- break;
- }
+
return false;
}
@@ -534,28 +554,21 @@ static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type)
#ifdef CONFIG_F2FS_CHECK_FS
static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
{
- unsigned int end_segno = SM_I(sbi)->segment_count - 1;
- BUG_ON(segno > end_segno);
+ BUG_ON(segno > TOTAL_SEGS(sbi) - 1);
}
static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
{
- struct f2fs_sm_info *sm_info = SM_I(sbi);
- block_t total_blks = sm_info->segment_count << sbi->log_blocks_per_seg;
- block_t start_addr = sm_info->seg0_blkaddr;
- block_t end_addr = start_addr + total_blks - 1;
- BUG_ON(blk_addr < start_addr);
- BUG_ON(blk_addr > end_addr);
+ BUG_ON(blk_addr < SEG0_BLKADDR(sbi));
+ BUG_ON(blk_addr >= MAX_BLKADDR(sbi));
}
/*
- * Summary block is always treated as invalid block
+ * Summary block is always treated as an invalid block
*/
static inline void check_block_count(struct f2fs_sb_info *sbi,
int segno, struct f2fs_sit_entry *raw_sit)
{
- struct f2fs_sm_info *sm_info = SM_I(sbi);
- unsigned int end_segno = sm_info->segment_count - 1;
bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false;
int valid_blocks = 0;
int cur_pos = 0, next_pos;
@@ -564,7 +577,7 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg);
/* check boundary of a given segment number */
- BUG_ON(segno > end_segno);
+ BUG_ON(segno > TOTAL_SEGS(sbi) - 1);
/* check bitmap with valid block count */
do {
@@ -583,16 +596,39 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks);
}
#else
-#define check_seg_range(sbi, segno)
-#define verify_block_addr(sbi, blk_addr)
-#define check_block_count(sbi, segno, raw_sit)
+static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+ if (segno > TOTAL_SEGS(sbi) - 1)
+ sbi->need_fsck = true;
+}
+
+static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
+{
+ if (blk_addr < SEG0_BLKADDR(sbi) || blk_addr >= MAX_BLKADDR(sbi))
+ sbi->need_fsck = true;
+}
+
+/*
+ * Summary block is always treated as an invalid block
+ */
+static inline void check_block_count(struct f2fs_sb_info *sbi,
+ int segno, struct f2fs_sit_entry *raw_sit)
+{
+ /* check segment usage */
+ if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg)
+ sbi->need_fsck = true;
+
+ /* check boundary of a given segment number */
+ if (segno > TOTAL_SEGS(sbi) - 1)
+ sbi->need_fsck = true;
+}
#endif
static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi,
unsigned int start)
{
struct sit_info *sit_i = SIT_I(sbi);
- unsigned int offset = SIT_BLOCK_OFFSET(sit_i, start);
+ unsigned int offset = SIT_BLOCK_OFFSET(start);
block_t blk_addr = sit_i->sit_base_addr + offset;
check_seg_range(sbi, start);
@@ -619,7 +655,7 @@ static inline pgoff_t next_sit_addr(struct f2fs_sb_info *sbi,
static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start)
{
- unsigned int block_off = SIT_BLOCK_OFFSET(sit_i, start);
+ unsigned int block_off = SIT_BLOCK_OFFSET(start);
if (f2fs_test_bit(block_off, sit_i->sit_bitmap))
f2fs_clear_bit(block_off, sit_i->sit_bitmap);
@@ -666,7 +702,7 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
{
struct block_device *bdev = sbi->sb->s_bdev;
struct request_queue *q = bdev_get_queue(bdev);
- return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q));
+ return SECTOR_TO_BLOCK(queue_max_sectors(q));
}
/*
@@ -683,7 +719,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
else if (type == NODE)
return 3 * sbi->blocks_per_seg;
else if (type == META)
- return MAX_BIO_BLOCKS(max_hw_blocks(sbi));
+ return MAX_BIO_BLOCKS(sbi);
else
return 0;
}
@@ -706,7 +742,7 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type,
else if (type == NODE)
desired = 3 * max_hw_blocks(sbi);
else
- desired = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
+ desired = MAX_BIO_BLOCKS(sbi);
wbc->nr_to_write = desired;
return desired - nr_to_write;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 657582fc7601..41d6f700f4ee 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -190,6 +190,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
@@ -204,6 +205,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(max_small_discards),
ATTR_LIST(ipu_policy),
ATTR_LIST(min_ipu_util),
+ ATTR_LIST(min_fsync_blocks),
ATTR_LIST(max_victim_search),
ATTR_LIST(dir_level),
ATTR_LIST(ram_thresh),
@@ -366,11 +368,13 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
/* Initialize f2fs-specific inode info */
fi->vfs_inode.i_version = 1;
- atomic_set(&fi->dirty_dents, 0);
+ atomic_set(&fi->dirty_pages, 0);
fi->i_current_depth = 1;
fi->i_advise = 0;
rwlock_init(&fi->ext.ext_lock);
init_rwsem(&fi->i_sem);
+ INIT_LIST_HEAD(&fi->inmem_pages);
+ mutex_init(&fi->inmem_lock);
set_inode_flag(fi, FI_NEW_INODE);
@@ -432,8 +436,19 @@ static void f2fs_put_super(struct super_block *sb)
stop_gc_thread(sbi);
/* We don't need to do checkpoint when it's clean */
- if (sbi->s_dirty && get_pages(sbi, F2FS_DIRTY_NODES))
- write_checkpoint(sbi, true);
+ if (sbi->s_dirty) {
+ struct cp_control cpc = {
+ .reason = CP_UMOUNT,
+ };
+ write_checkpoint(sbi, &cpc);
+ }
+
+ /*
+ * normally superblock is clean, so we need to release this.
+ * In addition, EIO will skip do checkpoint, we need this as well.
+ */
+ release_dirty_inode(sbi);
+ release_discard_addrs(sbi);
iput(sbi->node_inode);
iput(sbi->meta_inode);
@@ -457,12 +472,12 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
trace_f2fs_sync_fs(sb, sync);
- if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES))
- return 0;
-
if (sync) {
+ struct cp_control cpc = {
+ .reason = CP_SYNC,
+ };
mutex_lock(&sbi->gc_mutex);
- write_checkpoint(sbi, false);
+ write_checkpoint(sbi, &cpc);
mutex_unlock(&sbi->gc_mutex);
} else {
f2fs_balance_fs(sbi);
@@ -505,8 +520,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count;
buf->f_bavail = user_block_count - valid_user_blocks(sbi);
- buf->f_files = sbi->total_node_count;
- buf->f_ffree = sbi->total_node_count - valid_inode_count(sbi);
+ buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
+ buf->f_ffree = buf->f_files - valid_inode_count(sbi);
buf->f_namelen = F2FS_NAME_LEN;
buf->f_fsid.val[0] = (u32)id;
@@ -613,6 +628,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
org_mount_opt = sbi->mount_opt;
active_logs = sbi->active_logs;
+ sbi->mount_opt.opt = 0;
+ sbi->active_logs = NR_CURSEG_TYPE;
+
/* parse mount options */
err = parse_options(sb, data);
if (err)
@@ -663,7 +681,7 @@ restore_gc:
if (need_restart_gc) {
if (start_gc_thread(sbi))
f2fs_msg(sbi->sb, KERN_WARNING,
- "background gc thread is stop");
+ "background gc thread has stopped");
} else if (need_stop_gc) {
stop_gc_thread(sbi);
}
@@ -783,14 +801,22 @@ static int sanity_check_raw_super(struct super_block *sb,
return 1;
}
- if (le32_to_cpu(raw_super->log_sectorsize) !=
- F2FS_LOG_SECTOR_SIZE) {
- f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize");
+ /* Currently, support 512/1024/2048/4096 bytes sector size */
+ if (le32_to_cpu(raw_super->log_sectorsize) >
+ F2FS_MAX_LOG_SECTOR_SIZE ||
+ le32_to_cpu(raw_super->log_sectorsize) <
+ F2FS_MIN_LOG_SECTOR_SIZE) {
+ f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize (%u)",
+ le32_to_cpu(raw_super->log_sectorsize));
return 1;
}
- if (le32_to_cpu(raw_super->log_sectors_per_block) !=
- F2FS_LOG_SECTORS_PER_BLOCK) {
- f2fs_msg(sb, KERN_INFO, "Invalid log sectors per block");
+ if (le32_to_cpu(raw_super->log_sectors_per_block) +
+ le32_to_cpu(raw_super->log_sectorsize) !=
+ F2FS_MAX_LOG_SECTOR_SIZE) {
+ f2fs_msg(sb, KERN_INFO,
+ "Invalid log sectors per block(%u) log sectorsize(%u)",
+ le32_to_cpu(raw_super->log_sectors_per_block),
+ le32_to_cpu(raw_super->log_sectorsize));
return 1;
}
return 0;
@@ -812,7 +838,7 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi)
if (unlikely(fsmeta >= total))
return 1;
- if (unlikely(is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) {
+ if (unlikely(f2fs_cp_error(sbi))) {
f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck");
return 1;
}
@@ -846,6 +872,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
atomic_set(&sbi->nr_pages[i], 0);
sbi->dir_level = DEF_DIR_LEVEL;
+ sbi->need_fsck = false;
}
/*
@@ -899,8 +926,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
struct buffer_head *raw_super_buf;
struct inode *root;
long err = -EINVAL;
+ bool retry = true;
int i;
+try_onemore:
/* allocate memory for f2fs-specific super block info */
sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL);
if (!sbi)
@@ -1077,12 +1106,17 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
if (err)
goto free_proc;
+ if (!retry)
+ sbi->need_fsck = true;
+
/* recover fsynced data */
if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
err = recover_fsync_data(sbi);
- if (err)
+ if (err) {
f2fs_msg(sb, KERN_ERR,
"Cannot recover all fsync data errno=%ld", err);
+ goto free_kobj;
+ }
}
/*
@@ -1123,6 +1157,13 @@ free_sb_buf:
brelse(raw_super_buf);
free_sbi:
kfree(sbi);
+
+ /* give only one another chance */
+ if (retry) {
+ retry = 0;
+ shrink_dcache_sb(sb);
+ goto try_onemore;
+ }
return err;
}
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 8bea941ee309..deca8728117b 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -266,7 +266,7 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index,
static void *read_all_xattrs(struct inode *inode, struct page *ipage)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_xattr_header *header;
size_t size = PAGE_SIZE, inline_size = 0;
void *txattr_addr;
@@ -325,7 +325,7 @@ fail:
static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
void *txattr_addr, struct page *ipage)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
size_t inline_size = 0;
void *xattr_addr;
struct page *xpage;
@@ -373,7 +373,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
alloc_nid_failed(sbi, new_nid);
return PTR_ERR(xpage);
}
- f2fs_bug_on(new_nid);
+ f2fs_bug_on(sbi, new_nid);
f2fs_wait_on_page_writeback(xpage, NODE);
} else {
struct dnode_of_data dn;
@@ -528,7 +528,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
int free;
/*
* If value is NULL, it is remove operation.
- * In case of update operation, we caculate free.
+ * In case of update operation, we calculate free.
*/
free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr);
if (found)
@@ -596,7 +596,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
const void *value, size_t size,
struct page *ipage, int flags)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int err;
/* this case is only from init_inode_metadata */
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 628e22a5a543..d8da2d2e30ae 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -164,8 +164,6 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
return 0;
}
-extern struct timezone sys_tz;
-
/*
* The epoch of FAT timestamp is 1980.
* : bits : value
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 72c82f69b01b..99d440a4a6ba 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -21,6 +21,7 @@
#include <linux/rcupdate.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
+#include <linux/shmem_fs.h>
#include <asm/poll.h>
#include <asm/siginfo.h>
@@ -97,26 +98,19 @@ static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
write_unlock_irq(&filp->f_owner.lock);
}
-int __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
+void __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
int force)
{
- int err;
-
- err = security_file_set_fowner(filp);
- if (err)
- return err;
-
+ security_file_set_fowner(filp);
f_modown(filp, pid, type, force);
- return 0;
}
EXPORT_SYMBOL(__f_setown);
-int f_setown(struct file *filp, unsigned long arg, int force)
+void f_setown(struct file *filp, unsigned long arg, int force)
{
enum pid_type type;
struct pid *pid;
int who = arg;
- int result;
type = PIDTYPE_PID;
if (who < 0) {
type = PIDTYPE_PGID;
@@ -124,9 +118,8 @@ int f_setown(struct file *filp, unsigned long arg, int force)
}
rcu_read_lock();
pid = find_vpid(who);
- result = __f_setown(filp, pid, type, force);
+ __f_setown(filp, pid, type, force);
rcu_read_unlock();
- return result;
}
EXPORT_SYMBOL(f_setown);
@@ -180,7 +173,7 @@ static int f_setown_ex(struct file *filp, unsigned long arg)
if (owner.pid && !pid)
ret = -ESRCH;
else
- ret = __f_setown(filp, pid, type, 1);
+ __f_setown(filp, pid, type, 1);
rcu_read_unlock();
return ret;
@@ -301,7 +294,8 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
force_successful_syscall_return();
break;
case F_SETOWN:
- err = f_setown(filp, arg, 1);
+ f_setown(filp, arg, 1);
+ err = 0;
break;
case F_GETOWN_EX:
err = f_getown_ex(filp, arg);
@@ -336,6 +330,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
case F_GETPIPE_SZ:
err = pipe_fcntl(filp, cmd, arg);
break;
+ case F_ADD_SEALS:
+ case F_GET_SEALS:
+ err = shmem_fcntl(filp, cmd, arg);
+ break;
default:
break;
}
diff --git a/fs/file.c b/fs/file.c
index 66923fe3176e..ab3eb6a88239 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -367,7 +367,7 @@ static struct fdtable *close_files(struct files_struct * files)
struct file * file = xchg(&fdt->fd[i], NULL);
if (file) {
filp_close(file, files);
- cond_resched();
+ cond_resched_rcu_qs();
}
}
i++;
@@ -750,6 +750,7 @@ bool get_close_on_exec(unsigned int fd)
static int do_dup2(struct files_struct *files,
struct file *file, unsigned fd, unsigned flags)
+__releases(&files->file_lock)
{
struct file *tofree;
struct fdtable *fdt;
diff --git a/fs/file_table.c b/fs/file_table.c
index 385bfd31512a..3f85411b03ce 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -150,18 +150,10 @@ over:
/**
* alloc_file - allocate and initialize a 'struct file'
- * @mnt: the vfsmount on which the file will reside
- * @dentry: the dentry representing the new file
+ *
+ * @path: the (dentry, vfsmount) pair for the new file
* @mode: the mode with which the new file will be opened
* @fop: the 'struct file_operations' for the new file
- *
- * Use this instead of get_empty_filp() to get a new
- * 'struct file'. Do so because of the same initialization
- * pitfalls reasons listed for init_file(). This is a
- * preferred interface to using init_file().
- *
- * If all the callers of init_file() are eliminated, its
- * code should be moved into this function.
*/
struct file *alloc_file(struct path *path, fmode_t mode,
const struct file_operations *fop)
@@ -331,5 +323,5 @@ void __init files_init(unsigned long mempages)
n = (mempages * (PAGE_SIZE / 1024)) / 10;
files_stat.max_files = max_t(unsigned long, n, NR_FILE);
- percpu_counter_init(&nr_files, 0);
+ percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
new file mode 100644
index 000000000000..9368236ca100
--- /dev/null
+++ b/fs/fs_pin.c
@@ -0,0 +1,78 @@
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/fs_pin.h>
+#include "internal.h"
+#include "mount.h"
+
+static void pin_free_rcu(struct rcu_head *head)
+{
+ kfree(container_of(head, struct fs_pin, rcu));
+}
+
+static DEFINE_SPINLOCK(pin_lock);
+
+void pin_put(struct fs_pin *p)
+{
+ if (atomic_long_dec_and_test(&p->count))
+ call_rcu(&p->rcu, pin_free_rcu);
+}
+
+void pin_remove(struct fs_pin *pin)
+{
+ spin_lock(&pin_lock);
+ hlist_del(&pin->m_list);
+ hlist_del(&pin->s_list);
+ spin_unlock(&pin_lock);
+}
+
+void pin_insert(struct fs_pin *pin, struct vfsmount *m)
+{
+ spin_lock(&pin_lock);
+ hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins);
+ hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins);
+ spin_unlock(&pin_lock);
+}
+
+void mnt_pin_kill(struct mount *m)
+{
+ while (1) {
+ struct hlist_node *p;
+ struct fs_pin *pin;
+ rcu_read_lock();
+ p = ACCESS_ONCE(m->mnt_pins.first);
+ if (!p) {
+ rcu_read_unlock();
+ break;
+ }
+ pin = hlist_entry(p, struct fs_pin, m_list);
+ if (!atomic_long_inc_not_zero(&pin->count)) {
+ rcu_read_unlock();
+ cpu_relax();
+ continue;
+ }
+ rcu_read_unlock();
+ pin->kill(pin);
+ }
+}
+
+void sb_pin_kill(struct super_block *sb)
+{
+ while (1) {
+ struct hlist_node *p;
+ struct fs_pin *pin;
+ rcu_read_lock();
+ p = ACCESS_ONCE(sb->s_pins.first);
+ if (!p) {
+ rcu_read_unlock();
+ break;
+ }
+ pin = hlist_entry(p, struct fs_pin, s_list);
+ if (!atomic_long_inc_not_zero(&pin->count)) {
+ rcu_read_unlock();
+ cpu_relax();
+ continue;
+ }
+ rcu_read_unlock();
+ pin->kill(pin);
+ }
+}
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index a31b83c5cbd9..b39d487ccfb0 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -67,7 +67,7 @@ static int fscache_max_active_sysctl(struct ctl_table *table, int write,
return ret;
}
-struct ctl_table fscache_sysctls[] = {
+static struct ctl_table fscache_sysctls[] = {
{
.procname = "object_max_active",
.data = &fscache_object_max_active,
@@ -87,7 +87,7 @@ struct ctl_table fscache_sysctls[] = {
{}
};
-struct ctl_table fscache_sysctls_root[] = {
+static struct ctl_table fscache_sysctls_root[] = {
{
.procname = "fscache",
.mode = 0555,
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index b8179ca6bf9d..51dde817e1f2 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -380,26 +380,14 @@ no_config:
static int fscache_objlist_open(struct inode *inode, struct file *file)
{
struct fscache_objlist_data *data;
- struct seq_file *m;
- int ret;
- ret = seq_open(file, &fscache_objlist_ops);
- if (ret < 0)
- return ret;
-
- m = file->private_data;
-
- /* buffer for key extraction */
- data = kmalloc(sizeof(struct fscache_objlist_data), GFP_KERNEL);
- if (!data) {
- seq_release(inode, file);
+ data = __seq_open_private(file, &fscache_objlist_ops, sizeof(*data));
+ if (!data)
return -ENOMEM;
- }
/* get the configuration key */
fscache_objlist_config(data);
- m->private = data;
return 0;
}
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index d3b4539f1651..da032daf0e0d 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -982,6 +982,7 @@ nomem:
submit_op_failed:
clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
spin_unlock(&cookie->lock);
+ fscache_unuse_cookie(object);
kfree(op);
_leave(" [EIO]");
return transit_to(KILL_OBJECT);
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 85332b9d19d1..de33b3fccca6 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -44,6 +44,19 @@ void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *pa
EXPORT_SYMBOL(__fscache_wait_on_page_write);
/*
+ * wait for a page to finish being written to the cache. Put a timeout here
+ * since we might be called recursively via parent fs.
+ */
+static
+bool release_page_wait_timeout(struct fscache_cookie *cookie, struct page *page)
+{
+ wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0);
+
+ return wait_event_timeout(*wq, !__fscache_check_page_write(cookie, page),
+ HZ);
+}
+
+/*
* decide whether a page can be released, possibly by cancelling a store to it
* - we're allowed to sleep if __GFP_WAIT is flagged
*/
@@ -115,7 +128,10 @@ page_busy:
}
fscache_stat(&fscache_n_store_vmscan_wait);
- __fscache_wait_on_page_write(cookie, page);
+ if (!release_page_wait_timeout(cookie, page))
+ _debug("fscache writeout timeout page: %p{%lx}",
+ page, page->index);
+
gfp &= ~__GFP_WAIT;
goto try_again;
}
@@ -182,7 +198,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
{
struct fscache_operation *op;
struct fscache_object *object;
- bool wake_cookie;
+ bool wake_cookie = false;
_enter("%p", cookie);
@@ -212,15 +228,16 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
__fscache_use_cookie(cookie);
if (fscache_submit_exclusive_op(object, op) < 0)
- goto nobufs;
+ goto nobufs_dec;
spin_unlock(&cookie->lock);
fscache_stat(&fscache_n_attr_changed_ok);
fscache_put_operation(op);
_leave(" = 0");
return 0;
-nobufs:
+nobufs_dec:
wake_cookie = __fscache_unuse_cookie(cookie);
+nobufs:
spin_unlock(&cookie->lock);
kfree(op);
if (wake_cookie)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 0c6048247a34..dbab798f5caf 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -274,9 +274,6 @@ out:
invalid:
ret = 0;
-
- if (!(flags & LOOKUP_RCU) && check_submounts_and_drop(entry) != 0)
- ret = 1;
goto out;
}
@@ -845,12 +842,6 @@ static int fuse_rename2(struct inode *olddir, struct dentry *oldent,
return err;
}
-static int fuse_rename(struct inode *olddir, struct dentry *oldent,
- struct inode *newdir, struct dentry *newent)
-{
- return fuse_rename2(olddir, oldent, newdir, newent, 0);
-}
-
static int fuse_link(struct dentry *entry, struct inode *newdir,
struct dentry *newent)
{
@@ -1295,9 +1286,7 @@ static int fuse_direntplus_link(struct file *file,
d_drop(dentry);
} else if (get_node_id(inode) != o->nodeid ||
((o->attr.mode ^ inode->i_mode) & S_IFMT)) {
- err = d_invalidate(dentry);
- if (err)
- goto out;
+ d_invalidate(dentry);
} else if (is_bad_inode(inode)) {
err = -EIO;
goto out;
@@ -2024,7 +2013,6 @@ static const struct inode_operations fuse_dir_inode_operations = {
.symlink = fuse_symlink,
.unlink = fuse_unlink,
.rmdir = fuse_rmdir,
- .rename = fuse_rename,
.rename2 = fuse_rename2,
.link = fuse_link,
.setattr = fuse_setattr,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 40ac2628ddcf..caa8d95b24e8 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1303,10 +1303,11 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
unsigned npages;
size_t start;
- unsigned n = req->max_pages - req->num_pages;
ssize_t ret = iov_iter_get_pages(ii,
&req->pages[req->num_pages],
- n * PAGE_SIZE, &start);
+ *nbytesp - nbytes,
+ req->max_pages - req->num_pages,
+ &start);
if (ret < 0)
return ret;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index e6ee5b6e8d99..f0b945ab853e 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -359,7 +359,7 @@ static inline void release_metapath(struct metapath *mp)
* Returns: The length of the extent (minimum of one block)
*/
-static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, unsigned limit, int *eob)
+static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, size_t limit, int *eob)
{
const __be64 *end = (start + len);
const __be64 *first = ptr;
@@ -449,7 +449,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
struct buffer_head *bh_map, struct metapath *mp,
const unsigned int sheight,
const unsigned int height,
- const unsigned int maxlen)
+ const size_t maxlen)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -483,7 +483,8 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
} else {
/* Need to allocate indirect blocks */
ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs;
- dblks = min(maxlen, ptrs_per_blk - mp->mp_list[end_of_metadata]);
+ dblks = min(maxlen, (size_t)(ptrs_per_blk -
+ mp->mp_list[end_of_metadata]));
if (height == ip->i_height) {
/* Writing into existing tree, extend tree down */
iblks = height - sheight;
@@ -605,7 +606,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
unsigned int bsize = sdp->sd_sb.sb_bsize;
- const unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
+ const size_t maxlen = bh_map->b_size >> inode->i_blkbits;
const u64 *arr = sdp->sd_heightsize;
__be64 *ptr;
u64 size;
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index d3a5d4e29ba5..589f4ea9381c 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -93,9 +93,6 @@ invalid_gunlock:
if (!had_lock)
gfs2_glock_dq_uninit(&d_gh);
invalid:
- if (check_submounts_and_drop(dentry) != 0)
- goto valid;
-
dput(parent);
return 0;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 1a349f9a9685..5d4261ff5d23 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -2100,8 +2100,13 @@ int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name,
}
if (IS_ERR(dent))
return PTR_ERR(dent);
- da->bh = bh;
- da->dent = dent;
+
+ if (da->save_loc) {
+ da->bh = bh;
+ da->dent = dent;
+ } else {
+ brelse(bh);
+ }
return 0;
}
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 126c65dda028..e1b309c24dab 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -23,6 +23,7 @@ struct gfs2_diradd {
unsigned nr_blocks;
struct gfs2_dirent *dent;
struct buffer_head *bh;
+ int save_loc;
};
extern struct inode *gfs2_dir_search(struct inode *dir,
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 26b3f952e6b1..80dd44dca028 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -26,6 +26,7 @@
#include <linux/dlm.h>
#include <linux/dlm_plock.h>
#include <linux/aio.h>
+#include <linux/delay.h>
#include "gfs2.h"
#include "incore.h"
@@ -913,26 +914,6 @@ out_uninit:
#ifdef CONFIG_GFS2_FS_LOCKING_DLM
/**
- * gfs2_setlease - acquire/release a file lease
- * @file: the file pointer
- * @arg: lease type
- * @fl: file lock
- *
- * We don't currently have a way to enforce a lease across the whole
- * cluster; until we do, disable leases (by just returning -EINVAL),
- * unless the administrator has requested purely local locking.
- *
- * Locking: called under i_lock
- *
- * Returns: errno
- */
-
-static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl)
-{
- return -EINVAL;
-}
-
-/**
* gfs2_lock - acquire/release a posix lock on a file
* @file: the file pointer
* @cmd: either modify or retrieve lock state, possibly wait
@@ -979,9 +960,10 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
unsigned int state;
int flags;
int error = 0;
+ int sleeptime;
state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
- flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT;
+ flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY_1CB) | GL_EXACT;
mutex_lock(&fp->f_fl_mutex);
@@ -1001,7 +983,14 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
gfs2_holder_init(gl, state, flags, fl_gh);
gfs2_glock_put(gl);
}
- error = gfs2_glock_nq(fl_gh);
+ for (sleeptime = 1; sleeptime <= 4; sleeptime <<= 1) {
+ error = gfs2_glock_nq(fl_gh);
+ if (error != GLR_TRYFAILED)
+ break;
+ fl_gh->gh_flags = LM_FLAG_TRY | GL_EXACT;
+ fl_gh->gh_error = 0;
+ msleep(sleeptime);
+ }
if (error) {
gfs2_holder_uninit(fl_gh);
if (error == GLR_TRYFAILED)
@@ -1024,7 +1013,7 @@ static void do_unflock(struct file *file, struct file_lock *fl)
mutex_lock(&fp->f_fl_mutex);
flock_lock_file_wait(file, fl);
if (fl_gh->gh_gl) {
- gfs2_glock_dq_wait(fl_gh);
+ gfs2_glock_dq(fl_gh);
gfs2_holder_uninit(fl_gh);
}
mutex_unlock(&fp->f_fl_mutex);
@@ -1069,7 +1058,7 @@ const struct file_operations gfs2_file_fops = {
.flock = gfs2_flock,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
- .setlease = gfs2_setlease,
+ .setlease = simple_nosetlease,
.fallocate = gfs2_fallocate,
};
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 7f513b1ceb2c..8f0c19d1d943 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -811,7 +811,7 @@ void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
{
INIT_LIST_HEAD(&gh->gh_list);
gh->gh_gl = gl;
- gh->gh_ip = (unsigned long)__builtin_return_address(0);
+ gh->gh_ip = _RET_IP_;
gh->gh_owner_pid = get_pid(task_pid(current));
gh->gh_state = state;
gh->gh_flags = flags;
@@ -835,7 +835,7 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *
gh->gh_state = state;
gh->gh_flags = flags;
gh->gh_iflags = 0;
- gh->gh_ip = (unsigned long)__builtin_return_address(0);
+ gh->gh_ip = _RET_IP_;
if (gh->gh_owner_pid)
put_pid(gh->gh_owner_pid);
gh->gh_owner_pid = get_pid(task_pid(current));
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 2ffc67dce87f..1cc0bba6313f 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -93,7 +93,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
* tr->alloced is not set since the transaction structure is
* on the stack */
tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64));
- tr.tr_ip = (unsigned long)__builtin_return_address(0);
+ tr.tr_ip = _RET_IP_;
sb_start_intwrite(sdp->sd_vfs);
if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0) {
sb_end_intwrite(sdp->sd_vfs);
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 67d310c9ada3..39e7e9959b74 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -262,6 +262,9 @@ struct gfs2_holder {
unsigned long gh_ip;
};
+/* Number of quota types we support */
+#define GFS2_MAXQUOTAS 2
+
/* Resource group multi-block reservation, in order of appearance:
Step 1. Function prepares to write, allocates a mb, sets the size hint.
@@ -282,8 +285,8 @@ struct gfs2_blkreserv {
u64 rs_inum; /* Inode number for reservation */
/* ancillary quota stuff */
- struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
- struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
+ struct gfs2_quota_data *rs_qa_qd[2 * GFS2_MAXQUOTAS];
+ struct gfs2_holder rs_qa_qd_ghs[2 * GFS2_MAXQUOTAS];
unsigned int rs_qa_qd_num;
};
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index e62e59477884..c4ed823d150e 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -600,7 +600,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
int error, free_vfs_inode = 0;
u32 aflags = 0;
unsigned blocks = 1;
- struct gfs2_diradd da = { .bh = NULL, };
+ struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, };
if (!name->len || name->len > GFS2_FNAMESIZE)
return -ENAMETOOLONG;
@@ -626,8 +626,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (!IS_ERR(inode)) {
d = d_splice_alias(inode, dentry);
error = PTR_ERR(d);
- if (IS_ERR(d))
+ if (IS_ERR(d)) {
+ inode = ERR_CAST(d);
goto fail_gunlock;
+ }
error = 0;
if (file) {
if (S_ISREG(inode->i_mode)) {
@@ -670,6 +672,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
gfs2_set_inode_blocks(inode, 1);
munge_mode_uid_gid(dip, inode);
+ check_and_update_goal(dip);
ip->i_goal = dip->i_goal;
ip->i_diskflags = 0;
ip->i_eattr = 0;
@@ -840,8 +843,10 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
int error;
inode = gfs2_lookupi(dir, &dentry->d_name, 0);
- if (!inode)
+ if (inode == NULL) {
+ d_add(dentry, NULL);
return NULL;
+ }
if (IS_ERR(inode))
return ERR_CAST(inode);
@@ -854,7 +859,6 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
d = d_splice_alias(inode, dentry);
if (IS_ERR(d)) {
- iput(inode);
gfs2_glock_dq_uninit(&gh);
return d;
}
@@ -896,7 +900,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder ghs[2];
struct buffer_head *dibh;
- struct gfs2_diradd da = { .bh = NULL, };
+ struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, };
int error;
if (S_ISDIR(inode->i_mode))
@@ -1241,6 +1245,9 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
struct dentry *d;
bool excl = !!(flags & O_EXCL);
+ if (!d_unhashed(dentry))
+ goto skip_lookup;
+
d = __gfs2_lookup(dir, dentry, file, opened);
if (IS_ERR(d))
return PTR_ERR(d);
@@ -1257,6 +1264,8 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
}
BUG_ON(d != NULL);
+
+skip_lookup:
if (!(flags & O_CREAT))
return -ENOENT;
@@ -1334,7 +1343,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
struct gfs2_rgrpd *nrgd;
unsigned int num_gh;
int dir_rename = 0;
- struct gfs2_diradd da = { .nr_blocks = 0, };
+ struct gfs2_diradd da = { .nr_blocks = 0, .save_loc = 0, };
unsigned int x;
int error;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index f4cb9c0d6bbd..7474c413ffd1 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -577,6 +577,13 @@ struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
return rgd;
}
+void check_and_update_goal(struct gfs2_inode *ip)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL)
+ ip->i_goal = ip->i_no_addr;
+}
+
void gfs2_free_clones(struct gfs2_rgrpd *rgd)
{
int x;
@@ -1910,6 +1917,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a
} else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) {
rs->rs_rbm.rgd = begin = ip->i_rgd;
} else {
+ check_and_update_goal(ip);
rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
}
if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
@@ -2089,7 +2097,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
u32 blen, unsigned char new_state)
{
struct gfs2_rbm rbm;
- struct gfs2_bitmap *bi;
+ struct gfs2_bitmap *bi, *bi_prev = NULL;
rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1);
if (!rbm.rgd) {
@@ -2098,18 +2106,22 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
return NULL;
}
+ gfs2_rbm_from_block(&rbm, bstart);
while (blen--) {
- gfs2_rbm_from_block(&rbm, bstart);
bi = rbm_bi(&rbm);
- bstart++;
- if (!bi->bi_clone) {
- bi->bi_clone = kmalloc(bi->bi_bh->b_size,
- GFP_NOFS | __GFP_NOFAIL);
- memcpy(bi->bi_clone + bi->bi_offset,
- bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
+ if (bi != bi_prev) {
+ if (!bi->bi_clone) {
+ bi->bi_clone = kmalloc(bi->bi_bh->b_size,
+ GFP_NOFS | __GFP_NOFAIL);
+ memcpy(bi->bi_clone + bi->bi_offset,
+ bi->bi_bh->b_data + bi->bi_offset,
+ bi->bi_len);
+ }
+ gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
+ bi_prev = bi;
}
- gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
gfs2_setbit(&rbm, false, new_state);
+ gfs2_rbm_incr(&rbm);
}
return rbm.rgd;
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 463ab2e95d1c..5d8f085f7ade 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -80,4 +80,5 @@ static inline bool gfs2_rs_active(struct gfs2_blkreserv *rs)
return rs && !RB_EMPTY_NODE(&rs->rs_node);
}
+extern void check_and_update_goal(struct gfs2_inode *ip);
#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 2607ff13d486..a346f56c4c6d 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1294,7 +1294,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
int val;
if (is_ancestor(root, sdp->sd_master_dir))
- seq_printf(s, ",meta");
+ seq_puts(s, ",meta");
if (args->ar_lockproto[0])
seq_printf(s, ",lockproto=%s", args->ar_lockproto);
if (args->ar_locktable[0])
@@ -1302,13 +1302,13 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
if (args->ar_hostdata[0])
seq_printf(s, ",hostdata=%s", args->ar_hostdata);
if (args->ar_spectator)
- seq_printf(s, ",spectator");
+ seq_puts(s, ",spectator");
if (args->ar_localflocks)
- seq_printf(s, ",localflocks");
+ seq_puts(s, ",localflocks");
if (args->ar_debug)
- seq_printf(s, ",debug");
+ seq_puts(s, ",debug");
if (args->ar_posix_acl)
- seq_printf(s, ",acl");
+ seq_puts(s, ",acl");
if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
char *state;
switch (args->ar_quota) {
@@ -1328,7 +1328,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",quota=%s", state);
}
if (args->ar_suiddir)
- seq_printf(s, ",suiddir");
+ seq_puts(s, ",suiddir");
if (args->ar_data != GFS2_DATA_DEFAULT) {
char *state;
switch (args->ar_data) {
@@ -1345,7 +1345,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",data=%s", state);
}
if (args->ar_discard)
- seq_printf(s, ",discard");
+ seq_puts(s, ",discard");
val = sdp->sd_tune.gt_logd_secs;
if (val != 30)
seq_printf(s, ",commit=%d", val);
@@ -1376,11 +1376,11 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",errors=%s", state);
}
if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
- seq_printf(s, ",nobarrier");
+ seq_puts(s, ",nobarrier");
if (test_bit(SDF_DEMOTE, &sdp->sd_flags))
- seq_printf(s, ",demote_interface_used");
+ seq_puts(s, ",demote_interface_used");
if (args->ar_rgrplvb)
- seq_printf(s, ",rgrplvb");
+ seq_puts(s, ",rgrplvb");
return 0;
}
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 0546ab4e28e8..42bfd3361979 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -44,7 +44,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
if (!tr)
return -ENOMEM;
- tr->tr_ip = (unsigned long)__builtin_return_address(0);
+ tr->tr_ip = _RET_IP_;
tr->tr_blocks = blocks;
tr->tr_revokes = revokes;
tr->tr_reserved = 1;
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 0524cda47a6e..95d255219b1e 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -242,8 +242,6 @@ extern int hfs_mac2asc(struct super_block *, char *, const struct hfs_name *);
/* super.c */
extern void hfs_mark_mdb_dirty(struct super_block *sb);
-extern struct timezone sys_tz;
-
/*
* There are two time systems. Both are based on seconds since
* a particular time/date.
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 9c88da0e855a..4fcd40d6f308 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -89,6 +89,7 @@ extern int do_mknod(const char *file, int mode, unsigned int major,
extern int link_file(const char *from, const char *to);
extern int hostfs_do_readlink(char *file, char *buf, int size);
extern int rename_file(char *from, char *to);
+extern int rename2_file(char *from, char *to, unsigned int flags);
extern int do_statfs(char *root, long *bsize_out, long long *blocks_out,
long long *bfree_out, long long *bavail_out,
long long *files_out, long long *ffree_out,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index bb529f3b7f2b..fd62cae0fdcb 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -741,21 +741,31 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
return err;
}
-static int hostfs_rename(struct inode *from_ino, struct dentry *from,
- struct inode *to_ino, struct dentry *to)
+static int hostfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
- char *from_name, *to_name;
+ char *old_name, *new_name;
int err;
- if ((from_name = dentry_name(from)) == NULL)
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+ return -EINVAL;
+
+ old_name = dentry_name(old_dentry);
+ if (old_name == NULL)
return -ENOMEM;
- if ((to_name = dentry_name(to)) == NULL) {
- __putname(from_name);
+ new_name = dentry_name(new_dentry);
+ if (new_name == NULL) {
+ __putname(old_name);
return -ENOMEM;
}
- err = rename_file(from_name, to_name);
- __putname(from_name);
- __putname(to_name);
+ if (!flags)
+ err = rename_file(old_name, new_name);
+ else
+ err = rename2_file(old_name, new_name, flags);
+
+ __putname(old_name);
+ __putname(new_name);
return err;
}
@@ -867,7 +877,7 @@ static const struct inode_operations hostfs_dir_iops = {
.mkdir = hostfs_mkdir,
.rmdir = hostfs_rmdir,
.mknod = hostfs_mknod,
- .rename = hostfs_rename,
+ .rename2 = hostfs_rename2,
.permission = hostfs_permission,
.setattr = hostfs_setattr,
};
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index 67838f3aa20a..9765dab95cbd 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -14,6 +14,7 @@
#include <sys/time.h>
#include <sys/types.h>
#include <sys/vfs.h>
+#include <sys/syscall.h>
#include "hostfs.h"
#include <utime.h>
@@ -360,6 +361,33 @@ int rename_file(char *from, char *to)
return 0;
}
+int rename2_file(char *from, char *to, unsigned int flags)
+{
+ int err;
+
+#ifndef SYS_renameat2
+# ifdef __x86_64__
+# define SYS_renameat2 316
+# endif
+# ifdef __i386__
+# define SYS_renameat2 353
+# endif
+#endif
+
+#ifdef SYS_renameat2
+ err = syscall(SYS_renameat2, AT_FDCWD, from, AT_FDCWD, to, flags);
+ if (err < 0) {
+ if (errno != ENOSYS)
+ return -errno;
+ else
+ return -EINVAL;
+ }
+ return 0;
+#else
+ return -EINVAL;
+#endif
+}
+
int do_statfs(char *root, long *bsize_out, long long *blocks_out,
long long *bfree_out, long long *bavail_out,
long long *files_out, long long *ffree_out,
diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c
index f36fc010fccb..2923a7bd82ac 100644
--- a/fs/hpfs/dnode.c
+++ b/fs/hpfs/dnode.c
@@ -545,12 +545,13 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno)
struct dnode *d1;
struct quad_buffer_head qbh1;
if (hpfs_sb(i->i_sb)->sb_chk)
- if (up != i->i_ino) {
- hpfs_error(i->i_sb,
- "bad pointer to fnode, dnode %08x, pointing to %08x, should be %08lx",
- dno, up, (unsigned long)i->i_ino);
- return;
- }
+ if (up != i->i_ino) {
+ hpfs_error(i->i_sb,
+ "bad pointer to fnode, dnode %08x, pointing to %08x, should be %08lx",
+ dno, up,
+ (unsigned long)i->i_ino);
+ return;
+ }
if ((d1 = hpfs_map_dnode(i->i_sb, down, &qbh1))) {
d1->up = cpu_to_le32(up);
d1->root_dnode = 1;
@@ -1061,8 +1062,8 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno,
hpfs_brelse4(qbh);
if (hpfs_sb(s)->sb_chk)
if (hpfs_stop_cycles(s, dno, &c1, &c2, "map_fnode_dirent #1")) {
- kfree(name2);
- return NULL;
+ kfree(name2);
+ return NULL;
}
goto go_down;
}
diff --git a/fs/inode.c b/fs/inode.c
index 5938f3928944..26753ba7b6d6 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -165,6 +165,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
mapping->a_ops = &empty_aops;
mapping->host = inode;
mapping->flags = 0;
+ atomic_set(&mapping->i_mmap_writable, 0);
mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
mapping->private_data = NULL;
mapping->backing_dev_info = &default_backing_dev_info;
diff --git a/fs/internal.h b/fs/internal.h
index 465742407466..757ba2abf21e 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -35,6 +35,11 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
#endif
/*
+ * buffer.c
+ */
+extern void guard_bio_eod(int rw, struct bio *bio);
+
+/*
* char_dev.c
*/
extern void __init chrdev_init(void);
@@ -42,7 +47,6 @@ extern void __init chrdev_init(void);
/*
* namei.c
*/
-extern int __inode_permission(struct inode *, int);
extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *);
extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
const char *, unsigned int, struct path *);
@@ -51,7 +55,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
* namespace.c
*/
extern int copy_mount_options(const void __user *, unsigned long *);
-extern int copy_mount_string(const void __user *, char **);
+extern char *copy_mount_string(const void __user *);
extern struct vfsmount *lookup_mnt(struct path *);
extern int finish_automount(struct vfsmount *, struct path *);
@@ -131,16 +135,15 @@ extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
/*
* read_write.c
*/
-extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *);
extern int rw_verify_area(int, struct file *, const loff_t *, size_t);
/*
- * splice.c
+ * pipe.c
*/
-extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
- loff_t *opos, size_t len, unsigned int flags);
+extern const struct file_operations pipefifo_fops;
/*
- * pipe.c
+ * fs_pin.c
*/
-extern const struct file_operations pipefifo_fops;
+extern void sb_pin_kill(struct super_block *sb);
+extern void mnt_pin_kill(struct mount *m);
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index 592e5115a561..f311bf084015 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -158,8 +158,8 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
"zisofs: zisofs_inflate returned"
" %d, inode = %lu,"
" page idx = %d, bh idx = %d,"
- " avail_in = %d,"
- " avail_out = %d\n",
+ " avail_in = %ld,"
+ " avail_out = %ld\n",
zerr, inode->i_ino, curpage,
curbh, stream.avail_in,
stream.avail_out);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 4556ce1af5b0..fe839b915116 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -29,13 +29,9 @@
#define BEQUIET
static int isofs_hashi(const struct dentry *parent, struct qstr *qstr);
-static int isofs_hash(const struct dentry *parent, struct qstr *qstr);
static int isofs_dentry_cmpi(const struct dentry *parent,
const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name);
-static int isofs_dentry_cmp(const struct dentry *parent,
- const struct dentry *dentry,
- unsigned int len, const char *str, const struct qstr *name);
#ifdef CONFIG_JOLIET
static int isofs_hashi_ms(const struct dentry *parent, struct qstr *qstr);
@@ -61,7 +57,7 @@ static void isofs_put_super(struct super_block *sb)
return;
}
-static int isofs_read_inode(struct inode *);
+static int isofs_read_inode(struct inode *, int relocated);
static int isofs_statfs (struct dentry *, struct kstatfs *);
static struct kmem_cache *isofs_inode_cachep;
@@ -135,10 +131,6 @@ static const struct super_operations isofs_sops = {
static const struct dentry_operations isofs_dentry_ops[] = {
{
- .d_hash = isofs_hash,
- .d_compare = isofs_dentry_cmp,
- },
- {
.d_hash = isofs_hashi,
.d_compare = isofs_dentry_cmpi,
},
@@ -247,7 +239,7 @@ static int isofs_dentry_cmp_common(
}
if (alen == blen) {
if (ci) {
- if (strnicmp(name->name, str, alen) == 0)
+ if (strncasecmp(name->name, str, alen) == 0)
return 0;
} else {
if (strncmp(name->name, str, alen) == 0)
@@ -258,25 +250,12 @@ static int isofs_dentry_cmp_common(
}
static int
-isofs_hash(const struct dentry *dentry, struct qstr *qstr)
-{
- return isofs_hash_common(qstr, 0);
-}
-
-static int
isofs_hashi(const struct dentry *dentry, struct qstr *qstr)
{
return isofs_hashi_common(qstr, 0);
}
static int
-isofs_dentry_cmp(const struct dentry *parent, const struct dentry *dentry,
- unsigned int len, const char *str, const struct qstr *name)
-{
- return isofs_dentry_cmp_common(len, str, name, 0, 0);
-}
-
-static int
isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
@@ -930,7 +909,8 @@ root_found:
if (opt.check == 'r')
table++;
- s->s_d_op = &isofs_dentry_ops[table];
+ if (table)
+ s->s_d_op = &isofs_dentry_ops[table - 1];
/* get the root dentry */
s->s_root = d_make_root(inode);
@@ -1259,7 +1239,7 @@ out_toomany:
goto out;
}
-static int isofs_read_inode(struct inode *inode)
+static int isofs_read_inode(struct inode *inode, int relocated)
{
struct super_block *sb = inode->i_sb;
struct isofs_sb_info *sbi = ISOFS_SB(sb);
@@ -1404,7 +1384,7 @@ static int isofs_read_inode(struct inode *inode)
*/
if (!high_sierra) {
- parse_rock_ridge_inode(de, inode);
+ parse_rock_ridge_inode(de, inode, relocated);
/* if we want uid/gid set, override the rock ridge setting */
if (sbi->s_uid_set)
inode->i_uid = sbi->s_uid;
@@ -1483,9 +1463,10 @@ static int isofs_iget5_set(struct inode *ino, void *data)
* offset that point to the underlying meta-data for the inode. The
* code below is otherwise similar to the iget() code in
* include/linux/fs.h */
-struct inode *isofs_iget(struct super_block *sb,
- unsigned long block,
- unsigned long offset)
+struct inode *__isofs_iget(struct super_block *sb,
+ unsigned long block,
+ unsigned long offset,
+ int relocated)
{
unsigned long hashval;
struct inode *inode;
@@ -1507,7 +1488,7 @@ struct inode *isofs_iget(struct super_block *sb,
return ERR_PTR(-ENOMEM);
if (inode->i_state & I_NEW) {
- ret = isofs_read_inode(inode);
+ ret = isofs_read_inode(inode, relocated);
if (ret < 0) {
iget_failed(inode);
inode = ERR_PTR(ret);
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index 99167238518d..0ac4c1f73fbd 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -107,7 +107,7 @@ extern int iso_date(char *, int);
struct inode; /* To make gcc happy */
-extern int parse_rock_ridge_inode(struct iso_directory_record *, struct inode *);
+extern int parse_rock_ridge_inode(struct iso_directory_record *, struct inode *, int relocated);
extern int get_rock_ridge_filename(struct iso_directory_record *, char *, struct inode *);
extern int isofs_name_translate(struct iso_directory_record *, char *, struct inode *);
@@ -118,9 +118,24 @@ extern struct dentry *isofs_lookup(struct inode *, struct dentry *, unsigned int
extern struct buffer_head *isofs_bread(struct inode *, sector_t);
extern int isofs_get_blocks(struct inode *, sector_t, struct buffer_head **, unsigned long);
-extern struct inode *isofs_iget(struct super_block *sb,
- unsigned long block,
- unsigned long offset);
+struct inode *__isofs_iget(struct super_block *sb,
+ unsigned long block,
+ unsigned long offset,
+ int relocated);
+
+static inline struct inode *isofs_iget(struct super_block *sb,
+ unsigned long block,
+ unsigned long offset)
+{
+ return __isofs_iget(sb, block, offset, 0);
+}
+
+static inline struct inode *isofs_iget_reloc(struct super_block *sb,
+ unsigned long block,
+ unsigned long offset)
+{
+ return __isofs_iget(sb, block, offset, 1);
+}
/* Because the inode number is no longer relevant to finding the
* underlying meta-data for an inode, we are free to choose a more
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index 95295640d9c8..7b543e6b6526 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -18,25 +18,10 @@ static int
isofs_cmp(struct dentry *dentry, const char *compare, int dlen)
{
struct qstr qstr;
-
- if (!compare)
- return 1;
-
- /* check special "." and ".." files */
- if (dlen == 1) {
- /* "." */
- if (compare[0] == 0) {
- if (!dentry->d_name.len)
- return 0;
- compare = ".";
- } else if (compare[0] == 1) {
- compare = "..";
- dlen = 2;
- }
- }
-
qstr.name = compare;
qstr.len = dlen;
+ if (likely(!dentry->d_op))
+ return dentry->d_name.len != dlen || memcmp(dentry->d_name.name, compare, dlen);
return dentry->d_op->d_compare(NULL, NULL, dentry->d_name.len, dentry->d_name.name, &qstr);
}
@@ -146,7 +131,8 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry,
(!(de->flags[-sbi->s_high_sierra] & 1))) &&
(sbi->s_showassoc ||
(!(de->flags[-sbi->s_high_sierra] & 4)))) {
- match = (isofs_cmp(dentry, dpnt, dlen) == 0);
+ if (dpnt && (dlen > 1 || dpnt[0] > 1))
+ match = (isofs_cmp(dentry, dpnt, dlen) == 0);
}
if (match) {
isofs_normalize_block_and_offset(de,
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index c0bf42472e40..f488bbae541a 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -288,12 +288,16 @@ eio:
goto out;
}
+#define RR_REGARD_XA 1
+#define RR_RELOC_DE 2
+
static int
parse_rock_ridge_inode_internal(struct iso_directory_record *de,
- struct inode *inode, int regard_xa)
+ struct inode *inode, int flags)
{
int symlink_len = 0;
int cnt, sig;
+ unsigned int reloc_block;
struct inode *reloc;
struct rock_ridge *rr;
int rootflag;
@@ -305,7 +309,7 @@ parse_rock_ridge_inode_internal(struct iso_directory_record *de,
init_rock_state(&rs, inode);
setup_rock_ridge(de, inode, &rs);
- if (regard_xa) {
+ if (flags & RR_REGARD_XA) {
rs.chr += 14;
rs.len -= 14;
if (rs.len < 0)
@@ -485,12 +489,22 @@ repeat:
"relocated directory\n");
goto out;
case SIG('C', 'L'):
- ISOFS_I(inode)->i_first_extent =
- isonum_733(rr->u.CL.location);
- reloc =
- isofs_iget(inode->i_sb,
- ISOFS_I(inode)->i_first_extent,
- 0);
+ if (flags & RR_RELOC_DE) {
+ printk(KERN_ERR
+ "ISOFS: Recursive directory relocation "
+ "is not supported\n");
+ goto eio;
+ }
+ reloc_block = isonum_733(rr->u.CL.location);
+ if (reloc_block == ISOFS_I(inode)->i_iget5_block &&
+ ISOFS_I(inode)->i_iget5_offset == 0) {
+ printk(KERN_ERR
+ "ISOFS: Directory relocation points to "
+ "itself\n");
+ goto eio;
+ }
+ ISOFS_I(inode)->i_first_extent = reloc_block;
+ reloc = isofs_iget_reloc(inode->i_sb, reloc_block, 0);
if (IS_ERR(reloc)) {
ret = PTR_ERR(reloc);
goto out;
@@ -637,9 +651,11 @@ static char *get_symlink_chunk(char *rpnt, struct rock_ridge *rr, char *plimit)
return rpnt;
}
-int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode)
+int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode,
+ int relocated)
{
- int result = parse_rock_ridge_inode_internal(de, inode, 0);
+ int flags = relocated ? RR_RELOC_DE : 0;
+ int result = parse_rock_ridge_inode_internal(de, inode, flags);
/*
* if rockridge flag was reset and we didn't look for attributes
@@ -647,7 +663,8 @@ int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode)
*/
if ((ISOFS_SB(inode->i_sb)->s_rock_offset == -1)
&& (ISOFS_SB(inode->i_sb)->s_rock == 2)) {
- result = parse_rock_ridge_inode_internal(de, inode, 14);
+ result = parse_rock_ridge_inode_internal(de, inode,
+ flags | RR_REGARD_XA);
}
return result;
}
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 06fe11e0abfa..aab8549591e7 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -886,7 +886,7 @@ journal_t * journal_init_inode (struct inode *inode)
goto out_err;
}
- bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+ bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize);
if (!bh) {
printk(KERN_ERR
"%s: Cannot get buffer for journal superblock\n",
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index 8898bbd2b61e..dcead636c33b 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -93,6 +93,7 @@
#include <linux/bio.h>
#endif
#include <linux/log2.h>
+#include <linux/hash.h>
static struct kmem_cache *revoke_record_cache;
static struct kmem_cache *revoke_table_cache;
@@ -129,15 +130,11 @@ static void flush_descriptor(journal_t *, struct journal_head *, int, int);
/* Utility functions to maintain the revoke table */
-/* Borrowed from buffer.c: this is a tried and tested block hash function */
static inline int hash(journal_t *journal, unsigned int block)
{
struct jbd_revoke_table_s *table = journal->j_revoke;
- int hash_shift = table->hash_shift;
- return ((block << (hash_shift - 6)) ^
- (block >> 13) ^
- (block << (hash_shift - 12))) & (table->hash_size - 1);
+ return hash_32(block, table->hash_shift);
}
static int insert_revoke_hash(journal_t *journal, unsigned int blocknr,
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 7f34f4716165..988b32ed4c87 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -96,15 +96,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
if (jh->b_transaction == NULL && !buffer_locked(bh) &&
!buffer_dirty(bh) && !buffer_write_io_error(bh)) {
- /*
- * Get our reference so that bh cannot be freed before
- * we unlock it
- */
- get_bh(bh);
JBUFFER_TRACE(jh, "remove from checkpoint list");
ret = __jbd2_journal_remove_checkpoint(jh) + 1;
- BUFFER_TRACE(bh, "release");
- __brelse(bh);
}
return ret;
}
@@ -122,8 +115,6 @@ void __jbd2_log_wait_for_space(journal_t *journal)
nblocks = jbd2_space_needed(journal);
while (jbd2_log_space_left(journal) < nblocks) {
- if (journal->j_flags & JBD2_ABORT)
- return;
write_unlock(&journal->j_state_lock);
mutex_lock(&journal->j_checkpoint_mutex);
@@ -139,6 +130,10 @@ void __jbd2_log_wait_for_space(journal_t *journal)
* trace for forensic evidence.
*/
write_lock(&journal->j_state_lock);
+ if (journal->j_flags & JBD2_ABORT) {
+ mutex_unlock(&journal->j_checkpoint_mutex);
+ return;
+ }
spin_lock(&journal->j_list_lock);
nblocks = jbd2_space_needed(journal);
space_left = jbd2_log_space_left(journal);
@@ -183,58 +178,6 @@ void __jbd2_log_wait_for_space(journal_t *journal)
}
}
-/*
- * Clean up transaction's list of buffers submitted for io.
- * We wait for any pending IO to complete and remove any clean
- * buffers. Note that we take the buffers in the opposite ordering
- * from the one in which they were submitted for IO.
- *
- * Return 0 on success, and return <0 if some buffers have failed
- * to be written out.
- *
- * Called with j_list_lock held.
- */
-static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
-{
- struct journal_head *jh;
- struct buffer_head *bh;
- tid_t this_tid;
- int released = 0;
- int ret = 0;
-
- this_tid = transaction->t_tid;
-restart:
- /* Did somebody clean up the transaction in the meanwhile? */
- if (journal->j_checkpoint_transactions != transaction ||
- transaction->t_tid != this_tid)
- return ret;
- while (!released && transaction->t_checkpoint_io_list) {
- jh = transaction->t_checkpoint_io_list;
- bh = jh2bh(jh);
- get_bh(bh);
- if (buffer_locked(bh)) {
- spin_unlock(&journal->j_list_lock);
- wait_on_buffer(bh);
- /* the journal_head may have gone by now */
- BUFFER_TRACE(bh, "brelse");
- __brelse(bh);
- spin_lock(&journal->j_list_lock);
- goto restart;
- }
- if (unlikely(buffer_write_io_error(bh)))
- ret = -EIO;
-
- /*
- * Now in whatever state the buffer currently is, we know that
- * it has been written out and so we can drop it from the list
- */
- released = __jbd2_journal_remove_checkpoint(jh);
- __brelse(bh);
- }
-
- return ret;
-}
-
static void
__flush_batch(journal_t *journal, int *batch_count)
{
@@ -255,81 +198,6 @@ __flush_batch(journal_t *journal, int *batch_count)
}
/*
- * Try to flush one buffer from the checkpoint list to disk.
- *
- * Return 1 if something happened which requires us to abort the current
- * scan of the checkpoint list. Return <0 if the buffer has failed to
- * be written out.
- *
- * Called with j_list_lock held and drops it if 1 is returned
- */
-static int __process_buffer(journal_t *journal, struct journal_head *jh,
- int *batch_count, transaction_t *transaction)
-{
- struct buffer_head *bh = jh2bh(jh);
- int ret = 0;
-
- if (buffer_locked(bh)) {
- get_bh(bh);
- spin_unlock(&journal->j_list_lock);
- wait_on_buffer(bh);
- /* the journal_head may have gone by now */
- BUFFER_TRACE(bh, "brelse");
- __brelse(bh);
- ret = 1;
- } else if (jh->b_transaction != NULL) {
- transaction_t *t = jh->b_transaction;
- tid_t tid = t->t_tid;
-
- transaction->t_chp_stats.cs_forced_to_close++;
- spin_unlock(&journal->j_list_lock);
- if (unlikely(journal->j_flags & JBD2_UNMOUNT))
- /*
- * The journal thread is dead; so starting and
- * waiting for a commit to finish will cause
- * us to wait for a _very_ long time.
- */
- printk(KERN_ERR "JBD2: %s: "
- "Waiting for Godot: block %llu\n",
- journal->j_devname,
- (unsigned long long) bh->b_blocknr);
- jbd2_log_start_commit(journal, tid);
- jbd2_log_wait_commit(journal, tid);
- ret = 1;
- } else if (!buffer_dirty(bh)) {
- ret = 1;
- if (unlikely(buffer_write_io_error(bh)))
- ret = -EIO;
- get_bh(bh);
- BUFFER_TRACE(bh, "remove from checkpoint");
- __jbd2_journal_remove_checkpoint(jh);
- spin_unlock(&journal->j_list_lock);
- __brelse(bh);
- } else {
- /*
- * Important: we are about to write the buffer, and
- * possibly block, while still holding the journal lock.
- * We cannot afford to let the transaction logic start
- * messing around with this buffer before we write it to
- * disk, as that would break recoverability.
- */
- BUFFER_TRACE(bh, "queue");
- get_bh(bh);
- J_ASSERT_BH(bh, !buffer_jwrite(bh));
- journal->j_chkpt_bhs[*batch_count] = bh;
- __buffer_relink_io(jh);
- transaction->t_chp_stats.cs_written++;
- (*batch_count)++;
- if (*batch_count == JBD2_NR_BATCH) {
- spin_unlock(&journal->j_list_lock);
- __flush_batch(journal, batch_count);
- ret = 1;
- }
- }
- return ret;
-}
-
-/*
* Perform an actual checkpoint. We take the first transaction on the
* list of transactions to be checkpointed and send all its buffers
* to disk. We submit larger chunks of data at once.
@@ -339,9 +207,11 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
*/
int jbd2_log_do_checkpoint(journal_t *journal)
{
- transaction_t *transaction;
- tid_t this_tid;
- int result;
+ struct journal_head *jh;
+ struct buffer_head *bh;
+ transaction_t *transaction;
+ tid_t this_tid;
+ int result, batch_count = 0;
jbd_debug(1, "Start checkpoint\n");
@@ -374,45 +244,117 @@ restart:
* done (maybe it's a new transaction, but it fell at the same
* address).
*/
- if (journal->j_checkpoint_transactions == transaction &&
- transaction->t_tid == this_tid) {
- int batch_count = 0;
- struct journal_head *jh;
- int retry = 0, err;
-
- while (!retry && transaction->t_checkpoint_list) {
- jh = transaction->t_checkpoint_list;
- retry = __process_buffer(journal, jh, &batch_count,
- transaction);
- if (retry < 0 && !result)
- result = retry;
- if (!retry && (need_resched() ||
- spin_needbreak(&journal->j_list_lock))) {
- spin_unlock(&journal->j_list_lock);
- retry = 1;
- break;
- }
+ if (journal->j_checkpoint_transactions != transaction ||
+ transaction->t_tid != this_tid)
+ goto out;
+
+ /* checkpoint all of the transaction's buffers */
+ while (transaction->t_checkpoint_list) {
+ jh = transaction->t_checkpoint_list;
+ bh = jh2bh(jh);
+
+ if (buffer_locked(bh)) {
+ spin_unlock(&journal->j_list_lock);
+ get_bh(bh);
+ wait_on_buffer(bh);
+ /* the journal_head may have gone by now */
+ BUFFER_TRACE(bh, "brelse");
+ __brelse(bh);
+ goto retry;
}
+ if (jh->b_transaction != NULL) {
+ transaction_t *t = jh->b_transaction;
+ tid_t tid = t->t_tid;
- if (batch_count) {
- if (!retry) {
- spin_unlock(&journal->j_list_lock);
- retry = 1;
- }
- __flush_batch(journal, &batch_count);
+ transaction->t_chp_stats.cs_forced_to_close++;
+ spin_unlock(&journal->j_list_lock);
+ if (unlikely(journal->j_flags & JBD2_UNMOUNT))
+ /*
+ * The journal thread is dead; so
+ * starting and waiting for a commit
+ * to finish will cause us to wait for
+ * a _very_ long time.
+ */
+ printk(KERN_ERR
+ "JBD2: %s: Waiting for Godot: block %llu\n",
+ journal->j_devname, (unsigned long long) bh->b_blocknr);
+
+ jbd2_log_start_commit(journal, tid);
+ jbd2_log_wait_commit(journal, tid);
+ goto retry;
+ }
+ if (!buffer_dirty(bh)) {
+ if (unlikely(buffer_write_io_error(bh)) && !result)
+ result = -EIO;
+ BUFFER_TRACE(bh, "remove from checkpoint");
+ if (__jbd2_journal_remove_checkpoint(jh))
+ /* The transaction was released; we're done */
+ goto out;
+ continue;
}
+ /*
+ * Important: we are about to write the buffer, and
+ * possibly block, while still holding the journal
+ * lock. We cannot afford to let the transaction
+ * logic start messing around with this buffer before
+ * we write it to disk, as that would break
+ * recoverability.
+ */
+ BUFFER_TRACE(bh, "queue");
+ get_bh(bh);
+ J_ASSERT_BH(bh, !buffer_jwrite(bh));
+ journal->j_chkpt_bhs[batch_count++] = bh;
+ __buffer_relink_io(jh);
+ transaction->t_chp_stats.cs_written++;
+ if ((batch_count == JBD2_NR_BATCH) ||
+ need_resched() ||
+ spin_needbreak(&journal->j_list_lock))
+ goto unlock_and_flush;
+ }
- if (retry) {
+ if (batch_count) {
+ unlock_and_flush:
+ spin_unlock(&journal->j_list_lock);
+ retry:
+ if (batch_count)
+ __flush_batch(journal, &batch_count);
spin_lock(&journal->j_list_lock);
goto restart;
+ }
+
+ /*
+ * Now we issued all of the transaction's buffers, let's deal
+ * with the buffers that are out for I/O.
+ */
+restart2:
+ /* Did somebody clean up the transaction in the meanwhile? */
+ if (journal->j_checkpoint_transactions != transaction ||
+ transaction->t_tid != this_tid)
+ goto out;
+
+ while (transaction->t_checkpoint_io_list) {
+ jh = transaction->t_checkpoint_io_list;
+ bh = jh2bh(jh);
+ if (buffer_locked(bh)) {
+ spin_unlock(&journal->j_list_lock);
+ get_bh(bh);
+ wait_on_buffer(bh);
+ /* the journal_head may have gone by now */
+ BUFFER_TRACE(bh, "brelse");
+ __brelse(bh);
+ spin_lock(&journal->j_list_lock);
+ goto restart2;
}
+ if (unlikely(buffer_write_io_error(bh)) && !result)
+ result = -EIO;
+
/*
- * Now we have cleaned up the first transaction's checkpoint
- * list. Let's clean up the second one
+ * Now in whatever state the buffer currently is, we
+ * know that it has been written out and so we can
+ * drop it from the list
*/
- err = __wait_cp_io(journal, transaction);
- if (!result)
- result = err;
+ if (__jbd2_journal_remove_checkpoint(jh))
+ break;
}
out:
spin_unlock(&journal->j_list_lock);
@@ -478,18 +420,16 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
* Find all the written-back checkpoint buffers in the given list and
* release them.
*
- * Called with the journal locked.
* Called with j_list_lock held.
- * Returns number of buffers reaped (for debug)
+ * Returns 1 if we freed the transaction, 0 otherwise.
*/
-
-static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
+static int journal_clean_one_cp_list(struct journal_head *jh)
{
struct journal_head *last_jh;
struct journal_head *next_jh = jh;
- int ret, freed = 0;
+ int ret;
+ int freed = 0;
- *released = 0;
if (!jh)
return 0;
@@ -498,13 +438,11 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
jh = next_jh;
next_jh = jh->b_cpnext;
ret = __try_to_free_cp_buf(jh);
- if (ret) {
- freed++;
- if (ret == 2) {
- *released = 1;
- return freed;
- }
- }
+ if (!ret)
+ return freed;
+ if (ret == 2)
+ return 1;
+ freed = 1;
/*
* This function only frees up some memory
* if possible so we dont have an obligation
@@ -523,49 +461,49 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
*
* Find all the written-back checkpoint buffers in the journal and release them.
*
- * Called with the journal locked.
* Called with j_list_lock held.
- * Returns number of buffers reaped (for debug)
*/
-
-int __jbd2_journal_clean_checkpoint_list(journal_t *journal)
+void __jbd2_journal_clean_checkpoint_list(journal_t *journal)
{
transaction_t *transaction, *last_transaction, *next_transaction;
- int ret = 0;
- int released;
+ int ret;
transaction = journal->j_checkpoint_transactions;
if (!transaction)
- goto out;
+ return;
last_transaction = transaction->t_cpprev;
next_transaction = transaction;
do {
transaction = next_transaction;
next_transaction = transaction->t_cpnext;
- ret += journal_clean_one_cp_list(transaction->
- t_checkpoint_list, &released);
+ ret = journal_clean_one_cp_list(transaction->t_checkpoint_list);
/*
* This function only frees up some memory if possible so we
* dont have an obligation to finish processing. Bail out if
* preemption requested:
*/
if (need_resched())
- goto out;
- if (released)
+ return;
+ if (ret)
continue;
/*
* It is essential that we are as careful as in the case of
* t_checkpoint_list with removing the buffer from the list as
* we can possibly see not yet submitted buffers on io_list
*/
- ret += journal_clean_one_cp_list(transaction->
- t_checkpoint_io_list, &released);
+ ret = journal_clean_one_cp_list(transaction->
+ t_checkpoint_io_list);
if (need_resched())
- goto out;
+ return;
+ /*
+ * Stop scanning if we couldn't free the transaction. This
+ * avoids pointless scanning of transactions which still
+ * weren't checkpointed.
+ */
+ if (!ret)
+ return;
} while (transaction != last_transaction);
-out:
- return ret;
}
/*
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 6fac74349856..b73e0215baa7 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -97,7 +97,7 @@ static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
struct commit_header *h;
__u32 csum;
- if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (!jbd2_journal_has_csum_v2or3(j))
return;
h = (struct commit_header *)(bh->b_data);
@@ -313,11 +313,11 @@ static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
return checksum;
}
-static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
+static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
unsigned long long block)
{
tag->t_blocknr = cpu_to_be32(block & (u32)~0);
- if (tag_bytes > JBD2_TAG_SIZE32)
+ if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_64BIT))
tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
@@ -327,7 +327,7 @@ static void jbd2_descr_block_csum_set(journal_t *j,
struct jbd2_journal_block_tail *tail;
__u32 csum;
- if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (!jbd2_journal_has_csum_v2or3(j))
return;
tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
@@ -340,12 +340,13 @@ static void jbd2_descr_block_csum_set(journal_t *j,
static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
struct buffer_head *bh, __u32 sequence)
{
+ journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
struct page *page = bh->b_page;
__u8 *addr;
__u32 csum32;
__be32 seq;
- if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (!jbd2_journal_has_csum_v2or3(j))
return;
seq = cpu_to_be32(sequence);
@@ -355,8 +356,10 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
bh->b_size);
kunmap_atomic(addr);
- /* We only have space to store the lower 16 bits of the crc32c. */
- tag->t_checksum = cpu_to_be16(csum32);
+ if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+ tag3->t_checksum = cpu_to_be32(csum32);
+ else
+ tag->t_checksum = cpu_to_be16(csum32);
}
/*
* jbd2_journal_commit_transaction
@@ -396,7 +399,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
LIST_HEAD(io_bufs);
LIST_HEAD(log_bufs);
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (jbd2_journal_has_csum_v2or3(journal))
csum_size = sizeof(struct jbd2_journal_block_tail);
/*
@@ -690,7 +693,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
tag_flag |= JBD2_FLAG_SAME_UUID;
tag = (journal_block_tag_t *) tagp;
- write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
+ write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
tag->t_flags = cpu_to_be16(tag_flag);
jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
commit_transaction->t_tid);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 67b8e303946c..e4dc74713a43 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -124,7 +124,7 @@ EXPORT_SYMBOL(__jbd2_debug);
/* Checksumming functions */
static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
{
- if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (!jbd2_journal_has_csum_v2or3(j))
return 1;
return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
@@ -145,7 +145,7 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
{
- if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (!jbd2_journal_has_csum_v2or3(j))
return 1;
return sb->s_checksum == jbd2_superblock_csum(j, sb);
@@ -153,7 +153,7 @@ static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
{
- if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (!jbd2_journal_has_csum_v2or3(j))
return;
sb->s_checksum = jbd2_superblock_csum(j, sb);
@@ -1237,7 +1237,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
goto out_err;
}
- bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+ bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize);
if (!bh) {
printk(KERN_ERR
"%s: Cannot get buffer for journal superblock\n",
@@ -1522,10 +1522,18 @@ static int journal_get_superblock(journal_t *journal)
goto out;
}
- if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) &&
- JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) &&
+ JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
+ /* Can't have checksum v2 and v3 at the same time! */
+ printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
+ "at the same time!\n");
+ goto out;
+ }
+
+ if (jbd2_journal_has_csum_v2or3(journal) &&
+ JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) {
/* Can't have checksum v1 and v2 on at the same time! */
- printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 "
+ printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
"at the same time!\n");
goto out;
}
@@ -1536,7 +1544,7 @@ static int journal_get_superblock(journal_t *journal)
}
/* Load the checksum driver */
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
+ if (jbd2_journal_has_csum_v2or3(journal)) {
journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
if (IS_ERR(journal->j_chksum_driver)) {
printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
@@ -1553,7 +1561,7 @@ static int journal_get_superblock(journal_t *journal)
}
/* Precompute checksum seed for all metadata */
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (jbd2_journal_has_csum_v2or3(journal))
journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
sizeof(sb->s_uuid));
@@ -1813,8 +1821,14 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
if (!jbd2_journal_check_available_features(journal, compat, ro, incompat))
return 0;
- /* Asking for checksumming v2 and v1? Only give them v2. */
- if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2 &&
+ /* If enabling v2 checksums, turn on v3 instead */
+ if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2) {
+ incompat &= ~JBD2_FEATURE_INCOMPAT_CSUM_V2;
+ incompat |= JBD2_FEATURE_INCOMPAT_CSUM_V3;
+ }
+
+ /* Asking for checksumming v3 and v1? Only give them v3. */
+ if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V3 &&
compat & JBD2_FEATURE_COMPAT_CHECKSUM)
compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM;
@@ -1823,8 +1837,8 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
sb = journal->j_superblock;
- /* If enabling v2 checksums, update superblock */
- if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
+ /* If enabling v3 checksums, update superblock */
+ if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
sb->s_feature_compat &=
~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
@@ -1842,8 +1856,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
}
/* Precompute checksum seed for all metadata */
- if (JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (jbd2_journal_has_csum_v2or3(journal))
journal->j_csum_seed = jbd2_chksum(journal, ~0,
sb->s_uuid,
sizeof(sb->s_uuid));
@@ -1852,7 +1865,8 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
/* If enabling v1 checksums, downgrade superblock */
if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM))
sb->s_feature_incompat &=
- ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2);
+ ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2 |
+ JBD2_FEATURE_INCOMPAT_CSUM_V3);
sb->s_feature_compat |= cpu_to_be32(compat);
sb->s_feature_ro_compat |= cpu_to_be32(ro);
@@ -2165,16 +2179,20 @@ int jbd2_journal_blocks_per_page(struct inode *inode)
*/
size_t journal_tag_bytes(journal_t *journal)
{
- journal_block_tag_t tag;
- size_t x = 0;
+ size_t sz;
+
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+ return sizeof(journal_block_tag3_t);
+
+ sz = sizeof(journal_block_tag_t);
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
- x += sizeof(tag.t_checksum);
+ sz += sizeof(__u16);
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
- return x + JBD2_TAG_SIZE64;
+ return sz;
else
- return x + JBD2_TAG_SIZE32;
+ return sz - sizeof(__u32);
}
/*
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 3b6bb19d60b1..bcbef08a4d8f 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -181,7 +181,7 @@ static int jbd2_descr_block_csum_verify(journal_t *j,
__be32 provided;
__u32 calculated;
- if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (!jbd2_journal_has_csum_v2or3(j))
return 1;
tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize -
@@ -205,7 +205,7 @@ static int count_tags(journal_t *journal, struct buffer_head *bh)
int nr = 0, size = journal->j_blocksize;
int tag_bytes = journal_tag_bytes(journal);
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (jbd2_journal_has_csum_v2or3(journal))
size -= sizeof(struct jbd2_journal_block_tail);
tagp = &bh->b_data[sizeof(journal_header_t)];
@@ -338,10 +338,11 @@ int jbd2_journal_skip_recovery(journal_t *journal)
return err;
}
-static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag)
+static inline unsigned long long read_tag_block(journal_t *journal,
+ journal_block_tag_t *tag)
{
unsigned long long block = be32_to_cpu(tag->t_blocknr);
- if (tag_bytes > JBD2_TAG_SIZE32)
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32;
return block;
}
@@ -384,7 +385,7 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
__be32 provided;
__u32 calculated;
- if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (!jbd2_journal_has_csum_v2or3(j))
return 1;
h = buf;
@@ -399,17 +400,21 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
void *buf, __u32 sequence)
{
+ journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
__u32 csum32;
__be32 seq;
- if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (!jbd2_journal_has_csum_v2or3(j))
return 1;
seq = cpu_to_be32(sequence);
csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
- return tag->t_checksum == cpu_to_be16(csum32);
+ if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+ return tag3->t_checksum == cpu_to_be32(csum32);
+ else
+ return tag->t_checksum == cpu_to_be16(csum32);
}
static int do_one_pass(journal_t *journal,
@@ -426,6 +431,7 @@ static int do_one_pass(journal_t *journal,
int tag_bytes = journal_tag_bytes(journal);
__u32 crc32_sum = ~0; /* Transactional Checksums */
int descr_csum_size = 0;
+ int block_error = 0;
/*
* First thing is to establish what we expect to find in the log
@@ -512,14 +518,14 @@ static int do_one_pass(journal_t *journal,
switch(blocktype) {
case JBD2_DESCRIPTOR_BLOCK:
/* Verify checksum first */
- if (JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (jbd2_journal_has_csum_v2or3(journal))
descr_csum_size =
sizeof(struct jbd2_journal_block_tail);
if (descr_csum_size > 0 &&
!jbd2_descr_block_csum_verify(journal,
bh->b_data)) {
err = -EIO;
+ brelse(bh);
goto failed;
}
@@ -574,7 +580,7 @@ static int do_one_pass(journal_t *journal,
unsigned long long blocknr;
J_ASSERT(obh != NULL);
- blocknr = read_tag_block(tag_bytes,
+ blocknr = read_tag_block(journal,
tag);
/* If the block has been
@@ -598,7 +604,8 @@ static int do_one_pass(journal_t *journal,
"checksum recovering "
"block %llu in log\n",
blocknr);
- continue;
+ block_error = 1;
+ goto skip_write;
}
/* Find a buffer for the new
@@ -797,7 +804,8 @@ static int do_one_pass(journal_t *journal,
success = -EIO;
}
}
-
+ if (block_error && success == 0)
+ success = -EIO;
return success;
failed:
@@ -811,7 +819,7 @@ static int jbd2_revoke_block_csum_verify(journal_t *j,
__be32 provided;
__u32 calculated;
- if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (!jbd2_journal_has_csum_v2or3(j))
return 1;
tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize -
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 198c9c10276d..c6cbaef2bda1 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -91,8 +91,9 @@
#include <linux/list.h>
#include <linux/init.h>
#include <linux/bio.h>
-#endif
#include <linux/log2.h>
+#include <linux/hash.h>
+#endif
static struct kmem_cache *jbd2_revoke_record_cache;
static struct kmem_cache *jbd2_revoke_table_cache;
@@ -130,16 +131,9 @@ static void flush_descriptor(journal_t *, struct buffer_head *, int, int);
/* Utility functions to maintain the revoke table */
-/* Borrowed from buffer.c: this is a tried and tested block hash function */
static inline int hash(journal_t *journal, unsigned long long block)
{
- struct jbd2_revoke_table_s *table = journal->j_revoke;
- int hash_shift = table->hash_shift;
- int hash = (int)block ^ (int)((block >> 31) >> 1);
-
- return ((hash << (hash_shift - 6)) ^
- (hash >> 13) ^
- (hash << (hash_shift - 12))) & (table->hash_size - 1);
+ return hash_64(block, journal->j_revoke->hash_shift);
}
static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr,
@@ -597,7 +591,7 @@ static void write_one_revoke_record(journal_t *journal,
offset = *offsetp;
/* Do we need to leave space at the end for a checksum? */
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (jbd2_journal_has_csum_v2or3(journal))
csum_size = sizeof(struct jbd2_journal_revoke_tail);
/* Make sure we have a descriptor with space left for the record */
@@ -644,7 +638,7 @@ static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
struct jbd2_journal_revoke_tail *tail;
__u32 csum;
- if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (!jbd2_journal_has_csum_v2or3(j))
return;
tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize -
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 009ec0b5993d..2f7a3c090489 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -202,8 +202,7 @@ struct posix_acl *jffs2_get_acl(struct inode *inode, int type)
} else {
acl = ERR_PTR(rc);
}
- if (value)
- kfree(value);
+ kfree(value);
if (!IS_ERR(acl))
set_cached_acl(inode, type, acl);
return acl;
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index 0b9a1e44e833..5698dae5d92d 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -94,11 +94,12 @@ static int jffs2_zlib_compress(unsigned char *data_in,
while (def_strm.total_out < *dstlen - STREAM_END_SPACE && def_strm.total_in < *sourcelen) {
def_strm.avail_out = *dstlen - (def_strm.total_out + STREAM_END_SPACE);
- def_strm.avail_in = min((unsigned)(*sourcelen-def_strm.total_in), def_strm.avail_out);
- jffs2_dbg(1, "calling deflate with avail_in %d, avail_out %d\n",
+ def_strm.avail_in = min_t(unsigned long,
+ (*sourcelen-def_strm.total_in), def_strm.avail_out);
+ jffs2_dbg(1, "calling deflate with avail_in %ld, avail_out %ld\n",
def_strm.avail_in, def_strm.avail_out);
ret = zlib_deflate(&def_strm, Z_PARTIAL_FLUSH);
- jffs2_dbg(1, "deflate returned with avail_in %d, avail_out %d, total_in %ld, total_out %ld\n",
+ jffs2_dbg(1, "deflate returned with avail_in %ld, avail_out %ld, total_in %ld, total_out %ld\n",
def_strm.avail_in, def_strm.avail_out,
def_strm.total_in, def_strm.total_out);
if (ret != Z_OK) {
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 413ef89c2d1b..046fee8b6e9b 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -134,8 +134,6 @@ struct jffs2_sb_info {
struct rw_semaphore wbuf_sem; /* Protects the write buffer */
struct delayed_work wbuf_dwork; /* write-buffer write-out work */
- int wbuf_queued; /* non-zero delayed work is queued */
- spinlock_t wbuf_dwork_lock; /* protects wbuf_dwork and and wbuf_queued */
unsigned char *oobbuf;
int oobavail; /* How many bytes are available for JFFS2 in OOB */
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index a6597d60d76d..09ed55190ee2 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1162,10 +1162,6 @@ static void delayed_wbuf_sync(struct work_struct *work)
struct jffs2_sb_info *c = work_to_sb(work);
struct super_block *sb = OFNI_BS_2SFFJ(c);
- spin_lock(&c->wbuf_dwork_lock);
- c->wbuf_queued = 0;
- spin_unlock(&c->wbuf_dwork_lock);
-
if (!(sb->s_flags & MS_RDONLY)) {
jffs2_dbg(1, "%s()\n", __func__);
jffs2_flush_wbuf_gc(c, 0);
@@ -1180,14 +1176,9 @@ void jffs2_dirty_trigger(struct jffs2_sb_info *c)
if (sb->s_flags & MS_RDONLY)
return;
- spin_lock(&c->wbuf_dwork_lock);
- if (!c->wbuf_queued) {
+ delay = msecs_to_jiffies(dirty_writeback_interval * 10);
+ if (queue_delayed_work(system_long_wq, &c->wbuf_dwork, delay))
jffs2_dbg(1, "%s()\n", __func__);
- delay = msecs_to_jiffies(dirty_writeback_interval * 10);
- queue_delayed_work(system_long_wq, &c->wbuf_dwork, delay);
- c->wbuf_queued = 1;
- }
- spin_unlock(&c->wbuf_dwork_lock);
}
int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
@@ -1211,7 +1202,6 @@ int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
/* Initialise write buffer */
init_rwsem(&c->wbuf_sem);
- spin_lock_init(&c->wbuf_dwork_lock);
INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
c->wbuf_pagesize = c->mtd->writesize;
c->wbuf_ofs = 0xFFFFFFFF;
@@ -1251,7 +1241,6 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
/* Initialize write buffer */
init_rwsem(&c->wbuf_sem);
- spin_lock_init(&c->wbuf_dwork_lock);
INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
c->wbuf_pagesize = c->mtd->erasesize;
@@ -1311,7 +1300,6 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) {
/* Initialize write buffer */
init_rwsem(&c->wbuf_sem);
- spin_lock_init(&c->wbuf_dwork_lock);
INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
c->wbuf_pagesize = c->mtd->writesize;
@@ -1346,7 +1334,6 @@ int jffs2_ubivol_setup(struct jffs2_sb_info *c) {
return 0;
init_rwsem(&c->wbuf_sem);
- spin_lock_init(&c->wbuf_dwork_lock);
INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
c->wbuf_pagesize = c->mtd->writesize;
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index ad0f2e2a1700..d72817ac51f6 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -756,8 +756,7 @@ void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c)
for (i=0; i < XATTRINDEX_HASHSIZE; i++) {
list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) {
list_del(&xd->xindex);
- if (xd->xname)
- kfree(xd->xname);
+ kfree(xd->xname);
jffs2_free_xattr_datum(xd);
}
}
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 0acddf60af55..bc462dcd7a40 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1585,7 +1585,6 @@ void jfs_flush_journal(struct jfs_log *log, int wait)
set_current_state(TASK_UNINTERRUPTIBLE);
LOGGC_UNLOCK(log);
schedule();
- __set_current_state(TASK_RUNNING);
LOGGC_LOCK(log);
remove_wait_queue(&target->gcwait, &__wait);
}
@@ -2359,7 +2358,6 @@ int jfsIOWait(void *arg)
set_current_state(TASK_INTERRUPTIBLE);
spin_unlock_irq(&log_redrive_lock);
schedule();
- __set_current_state(TASK_RUNNING);
}
} while (!kthread_should_stop());
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 564c4f279ac6..d595856453b2 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -136,7 +136,6 @@ static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event)
set_current_state(TASK_UNINTERRUPTIBLE);
TXN_UNLOCK();
io_schedule();
- __set_current_state(TASK_RUNNING);
remove_wait_queue(event, &wait);
}
@@ -2808,7 +2807,6 @@ int jfs_lazycommit(void *arg)
set_current_state(TASK_INTERRUPTIBLE);
LAZY_UNLOCK(flags);
schedule();
- __set_current_state(TASK_RUNNING);
remove_wait_queue(&jfs_commit_thread_wait, &wq);
}
} while (!kthread_should_stop());
@@ -2996,7 +2994,6 @@ int jfs_sync(void *arg)
set_current_state(TASK_INTERRUPTIBLE);
TXN_UNLOCK();
schedule();
- __set_current_state(TASK_RUNNING);
}
} while (!kthread_should_stop());
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index adf8cb045b9e..93e897e588a8 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -550,7 +550,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
inode->i_ino = 0;
inode->i_size = sb->s_bdev->bd_inode->i_size;
inode->i_mapping->a_ops = &jfs_metapage_aops;
- insert_inode_hash(inode);
+ hlist_add_fake(&inode->i_hash);
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
sbi->direct_inode = inode;
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index a693f5b01ae6..1c771931bb60 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -463,21 +463,10 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
goto out_bad;
mutex_unlock(&kernfs_mutex);
-out_valid:
return 1;
out_bad:
mutex_unlock(&kernfs_mutex);
out_bad_unlocked:
- /*
- * @dentry doesn't match the underlying kernfs node, drop the
- * dentry and force lookup. If we have submounts we must allow the
- * vfs caches to lie about the state of the filesystem to prevent
- * leaks and other nasty things, so use check_submounts_and_drop()
- * instead of d_drop().
- */
- if (check_submounts_and_drop(dentry) != 0)
- goto out_valid;
-
return 0;
}
diff --git a/fs/libfs.c b/fs/libfs.c
index 88e3e00e2eca..171d2846f2a3 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1075,3 +1075,21 @@ struct inode *alloc_anon_inode(struct super_block *s)
return inode;
}
EXPORT_SYMBOL(alloc_anon_inode);
+
+/**
+ * simple_nosetlease - generic helper for prohibiting leases
+ * @filp: file pointer
+ * @arg: type of lease to obtain
+ * @flp: new lease supplied for insertion
+ * @priv: private data for lm_setup operation
+ *
+ * Generic helper for filesystems that do not wish to allow leases to be set.
+ * All arguments are ignored and it just returns -EINVAL.
+ */
+int
+simple_nosetlease(struct file *filp, long arg, struct file_lock **flp,
+ void **priv)
+{
+ return -EINVAL;
+}
+EXPORT_SYMBOL(simple_nosetlease);
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index ca58d64374ca..9b320cc2a8cf 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -5,6 +5,7 @@
obj-$(CONFIG_LOCKD) += lockd.o
lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \
- svcshare.o svcproc.o svcsubs.o mon.o xdr.o grace.o
+ svcshare.o svcproc.o svcsubs.o mon.o xdr.o
lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o
+lockd-objs-$(CONFIG_PROC_FS) += procfs.o
lockd-objs := $(lockd-objs-y)
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index daa8e7514eae..9106f42c472c 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -159,6 +159,12 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
msg.rpc_proc = &clnt->cl_procinfo[proc];
status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
+ if (status == -ECONNREFUSED) {
+ dprintk("lockd: NSM upcall RPC failed, status=%d, forcing rebind\n",
+ status);
+ rpc_force_rebind(clnt);
+ status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
+ }
if (status < 0)
dprintk("lockd: NSM upcall RPC failed, status=%d\n",
status);
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
index 5010b55628b4..097bfa3adb1c 100644
--- a/fs/lockd/netns.h
+++ b/fs/lockd/netns.h
@@ -11,7 +11,6 @@ struct lockd_net {
struct delayed_work grace_period_end;
struct lock_manager lockd_manager;
- struct list_head grace_list;
spinlock_t nsm_clnt_lock;
unsigned int nsm_users;
diff --git a/fs/lockd/procfs.c b/fs/lockd/procfs.c
new file mode 100644
index 000000000000..2a0a98480e39
--- /dev/null
+++ b/fs/lockd/procfs.c
@@ -0,0 +1,92 @@
+/*
+ * Procfs support for lockd
+ *
+ * Copyright (c) 2014 Jeff Layton <jlayton@primarydata.com>
+ */
+
+#include <linux/fs.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/nsproxy.h>
+#include <net/net_namespace.h>
+
+#include "netns.h"
+#include "procfs.h"
+
+/*
+ * We only allow strings that start with 'Y', 'y', or '1'.
+ */
+static ssize_t
+nlm_end_grace_write(struct file *file, const char __user *buf, size_t size,
+ loff_t *pos)
+{
+ char *data;
+ struct lockd_net *ln = net_generic(current->nsproxy->net_ns,
+ lockd_net_id);
+
+ if (size < 1)
+ return -EINVAL;
+
+ data = simple_transaction_get(file, buf, size);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+
+ switch(data[0]) {
+ case 'Y':
+ case 'y':
+ case '1':
+ locks_end_grace(&ln->lockd_manager);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return size;
+}
+
+static ssize_t
+nlm_end_grace_read(struct file *file, char __user *buf, size_t size,
+ loff_t *pos)
+{
+ struct lockd_net *ln = net_generic(current->nsproxy->net_ns,
+ lockd_net_id);
+ char resp[3];
+
+ resp[0] = list_empty(&ln->lockd_manager.list) ? 'Y' : 'N';
+ resp[1] = '\n';
+ resp[2] = '\0';
+
+ return simple_read_from_buffer(buf, size, pos, resp, sizeof(resp));
+}
+
+static const struct file_operations lockd_end_grace_operations = {
+ .write = nlm_end_grace_write,
+ .read = nlm_end_grace_read,
+ .llseek = default_llseek,
+ .release = simple_transaction_release,
+ .owner = THIS_MODULE,
+};
+
+int __init
+lockd_create_procfs(void)
+{
+ struct proc_dir_entry *entry;
+
+ entry = proc_mkdir("fs/lockd", NULL);
+ if (!entry)
+ return -ENOMEM;
+ entry = proc_create("nlm_end_grace", S_IRUGO|S_IWUSR, entry,
+ &lockd_end_grace_operations);
+ if (!entry) {
+ remove_proc_entry("fs/lockd", NULL);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+void __exit
+lockd_remove_procfs(void)
+{
+ remove_proc_entry("fs/lockd/nlm_end_grace", NULL);
+ remove_proc_entry("fs/lockd", NULL);
+}
diff --git a/fs/lockd/procfs.h b/fs/lockd/procfs.h
new file mode 100644
index 000000000000..2257a1311027
--- /dev/null
+++ b/fs/lockd/procfs.h
@@ -0,0 +1,28 @@
+/*
+ * Procfs support for lockd
+ *
+ * Copyright (c) 2014 Jeff Layton <jlayton@primarydata.com>
+ */
+#ifndef _LOCKD_PROCFS_H
+#define _LOCKD_PROCFS_H
+
+#include <linux/kconfig.h>
+
+#if IS_ENABLED(CONFIG_PROC_FS)
+int lockd_create_procfs(void);
+void lockd_remove_procfs(void);
+#else
+static inline int
+lockd_create_procfs(void)
+{
+ return 0;
+}
+
+static inline void
+lockd_remove_procfs(void)
+{
+ return;
+}
+#endif /* IS_ENABLED(CONFIG_PROC_FS) */
+
+#endif /* _LOCKD_PROCFS_H */
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 8f27c93f8d2e..d1bb7ecfd201 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -36,6 +36,7 @@
#include <linux/nfs.h>
#include "netns.h"
+#include "procfs.h"
#define NLMDBG_FACILITY NLMDBG_SVC
#define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE)
@@ -253,13 +254,11 @@ static int lockd_up_net(struct svc_serv *serv, struct net *net)
error = make_socks(serv, net);
if (error < 0)
- goto err_socks;
+ goto err_bind;
set_grace_period(net);
dprintk("lockd_up_net: per-net data created; net=%p\n", net);
return 0;
-err_socks:
- svc_rpcb_cleanup(serv, net);
err_bind:
ln->nlmsvc_users--;
return error;
@@ -306,13 +305,16 @@ static int lockd_start_svc(struct svc_serv *serv)
svc_sock_update_bufs(serv);
serv->sv_maxconn = nlm_max_connections;
- nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, "%s", serv->sv_name);
+ nlmsvc_task = kthread_create(lockd, nlmsvc_rqst, "%s", serv->sv_name);
if (IS_ERR(nlmsvc_task)) {
error = PTR_ERR(nlmsvc_task);
printk(KERN_WARNING
"lockd_up: kthread_run failed, error=%d\n", error);
goto out_task;
}
+ nlmsvc_rqst->rq_task = nlmsvc_task;
+ wake_up_process(nlmsvc_task);
+
dprintk("lockd_up: service started\n");
return 0;
@@ -583,7 +585,7 @@ static int lockd_init_net(struct net *net)
struct lockd_net *ln = net_generic(net, lockd_net_id);
INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender);
- INIT_LIST_HEAD(&ln->grace_list);
+ INIT_LIST_HEAD(&ln->lockd_manager.list);
spin_lock_init(&ln->nsm_clnt_lock);
return 0;
}
@@ -617,8 +619,15 @@ static int __init init_nlm(void)
err = register_pernet_subsys(&lockd_net_ops);
if (err)
goto err_pernet;
+
+ err = lockd_create_procfs();
+ if (err)
+ goto err_procfs;
+
return 0;
+err_procfs:
+ unregister_pernet_subsys(&lockd_net_ops);
err_pernet:
#ifdef CONFIG_SYSCTL
unregister_sysctl_table(nlm_sysctl_table);
@@ -631,6 +640,7 @@ static void __exit exit_nlm(void)
{
/* FIXME: delete all NLM clients */
nlm_shutdown_hosts();
+ lockd_remove_procfs();
unregister_pernet_subsys(&lockd_net_ops);
#ifdef CONFIG_SYSCTL
unregister_sysctl_table(nlm_sysctl_table);
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index ab798a88ec1d..13db95f54176 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -245,7 +245,6 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host,
block->b_daemon = rqstp->rq_server;
block->b_host = host;
block->b_file = file;
- block->b_fl = NULL;
file->f_count++;
/* Add to file's list of blocks */
@@ -295,7 +294,6 @@ static void nlmsvc_free_block(struct kref *kref)
nlmsvc_freegrantargs(block->b_call);
nlmsvc_release_call(block->b_call);
nlm_release_file(block->b_file);
- kfree(block->b_fl);
kfree(block);
}
@@ -508,7 +506,6 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
struct nlm_host *host, struct nlm_lock *lock,
struct nlm_lock *conflock, struct nlm_cookie *cookie)
{
- struct nlm_block *block = NULL;
int error;
__be32 ret;
@@ -519,63 +516,26 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
(long long)lock->fl.fl_start,
(long long)lock->fl.fl_end);
- /* Get existing block (in case client is busy-waiting) */
- block = nlmsvc_lookup_block(file, lock);
-
- if (block == NULL) {
- struct file_lock *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
-
- if (conf == NULL)
- return nlm_granted;
- block = nlmsvc_create_block(rqstp, host, file, lock, cookie);
- if (block == NULL) {
- kfree(conf);
- return nlm_granted;
- }
- block->b_fl = conf;
- }
- if (block->b_flags & B_QUEUED) {
- dprintk("lockd: nlmsvc_testlock deferred block %p flags %d fl %p\n",
- block, block->b_flags, block->b_fl);
- if (block->b_flags & B_TIMED_OUT) {
- nlmsvc_unlink_block(block);
- ret = nlm_lck_denied;
- goto out;
- }
- if (block->b_flags & B_GOT_CALLBACK) {
- nlmsvc_unlink_block(block);
- if (block->b_fl != NULL
- && block->b_fl->fl_type != F_UNLCK) {
- lock->fl = *block->b_fl;
- goto conf_lock;
- } else {
- ret = nlm_granted;
- goto out;
- }
- }
- ret = nlm_drop_reply;
- goto out;
- }
-
if (locks_in_grace(SVC_NET(rqstp))) {
ret = nlm_lck_denied_grace_period;
goto out;
}
+
error = vfs_test_lock(file->f_file, &lock->fl);
- if (error == FILE_LOCK_DEFERRED) {
- ret = nlmsvc_defer_lock_rqst(rqstp, block);
- goto out;
- }
if (error) {
+ /* We can't currently deal with deferred test requests */
+ if (error == FILE_LOCK_DEFERRED)
+ WARN_ON_ONCE(1);
+
ret = nlm_lck_denied_nolocks;
goto out;
}
+
if (lock->fl.fl_type == F_UNLCK) {
ret = nlm_granted;
goto out;
}
-conf_lock:
dprintk("lockd: conflicting lock(ty=%d, %Ld-%Ld)\n",
lock->fl.fl_type, (long long)lock->fl.fl_start,
(long long)lock->fl.fl_end);
@@ -586,10 +546,9 @@ conf_lock:
conflock->fl.fl_type = lock->fl.fl_type;
conflock->fl.fl_start = lock->fl.fl_start;
conflock->fl.fl_end = lock->fl.fl_end;
+ locks_release_private(&lock->fl);
ret = nlm_lck_denied;
out:
- if (block)
- nlmsvc_release_block(block);
return ret;
}
@@ -660,29 +619,22 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l
* This is a callback from the filesystem for VFS file lock requests.
* It will be used if lm_grant is defined and the filesystem can not
* respond to the request immediately.
- * For GETLK request it will copy the reply to the nlm_block.
* For SETLK or SETLKW request it will get the local posix lock.
* In all cases it will move the block to the head of nlm_blocked q where
* nlmsvc_retry_blocked() can send back a reply for SETLKW or revisit the
* deferred rpc for GETLK and SETLK.
*/
static void
-nlmsvc_update_deferred_block(struct nlm_block *block, struct file_lock *conf,
- int result)
+nlmsvc_update_deferred_block(struct nlm_block *block, int result)
{
block->b_flags |= B_GOT_CALLBACK;
if (result == 0)
block->b_granted = 1;
else
block->b_flags |= B_TIMED_OUT;
- if (conf) {
- if (block->b_fl)
- __locks_copy_lock(block->b_fl, conf);
- }
}
-static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf,
- int result)
+static int nlmsvc_grant_deferred(struct file_lock *fl, int result)
{
struct nlm_block *block;
int rc = -ENOENT;
@@ -697,7 +649,7 @@ static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf,
rc = -ENOLCK;
break;
}
- nlmsvc_update_deferred_block(block, conf, result);
+ nlmsvc_update_deferred_block(block, result);
} else if (result == 0)
block->b_granted = 1;
diff --git a/fs/locks.c b/fs/locks.c
index a6f54802d277..735b8d3fa78c 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -230,8 +230,12 @@ void locks_release_private(struct file_lock *fl)
fl->fl_ops->fl_release_private(fl);
fl->fl_ops = NULL;
}
- fl->fl_lmops = NULL;
+ if (fl->fl_lmops) {
+ if (fl->fl_lmops->lm_put_owner)
+ fl->fl_lmops->lm_put_owner(fl);
+ fl->fl_lmops = NULL;
+ }
}
EXPORT_SYMBOL_GPL(locks_release_private);
@@ -247,6 +251,18 @@ void locks_free_lock(struct file_lock *fl)
}
EXPORT_SYMBOL(locks_free_lock);
+static void
+locks_dispose_list(struct list_head *dispose)
+{
+ struct file_lock *fl;
+
+ while (!list_empty(dispose)) {
+ fl = list_first_entry(dispose, struct file_lock, fl_block);
+ list_del_init(&fl->fl_block);
+ locks_free_lock(fl);
+ }
+}
+
void locks_init_lock(struct file_lock *fl)
{
memset(fl, 0, sizeof(struct file_lock));
@@ -255,21 +271,10 @@ void locks_init_lock(struct file_lock *fl)
EXPORT_SYMBOL(locks_init_lock);
-static void locks_copy_private(struct file_lock *new, struct file_lock *fl)
-{
- if (fl->fl_ops) {
- if (fl->fl_ops->fl_copy_lock)
- fl->fl_ops->fl_copy_lock(new, fl);
- new->fl_ops = fl->fl_ops;
- }
- if (fl->fl_lmops)
- new->fl_lmops = fl->fl_lmops;
-}
-
/*
* Initialize a new lock from an existing file_lock structure.
*/
-void __locks_copy_lock(struct file_lock *new, const struct file_lock *fl)
+void locks_copy_conflock(struct file_lock *new, struct file_lock *fl)
{
new->fl_owner = fl->fl_owner;
new->fl_pid = fl->fl_pid;
@@ -278,21 +283,30 @@ void __locks_copy_lock(struct file_lock *new, const struct file_lock *fl)
new->fl_type = fl->fl_type;
new->fl_start = fl->fl_start;
new->fl_end = fl->fl_end;
+ new->fl_lmops = fl->fl_lmops;
new->fl_ops = NULL;
- new->fl_lmops = NULL;
+
+ if (fl->fl_lmops) {
+ if (fl->fl_lmops->lm_get_owner)
+ fl->fl_lmops->lm_get_owner(new, fl);
+ }
}
-EXPORT_SYMBOL(__locks_copy_lock);
+EXPORT_SYMBOL(locks_copy_conflock);
void locks_copy_lock(struct file_lock *new, struct file_lock *fl)
{
- locks_release_private(new);
+ /* "new" must be a freshly-initialized lock */
+ WARN_ON_ONCE(new->fl_ops);
+
+ locks_copy_conflock(new, fl);
- __locks_copy_lock(new, fl);
new->fl_file = fl->fl_file;
new->fl_ops = fl->fl_ops;
- new->fl_lmops = fl->fl_lmops;
- locks_copy_private(new, fl);
+ if (fl->fl_ops) {
+ if (fl->fl_ops->fl_copy_lock)
+ fl->fl_ops->fl_copy_lock(new, fl);
+ }
}
EXPORT_SYMBOL(locks_copy_lock);
@@ -312,17 +326,18 @@ static inline int flock_translate_cmd(int cmd) {
}
/* Fill in a file_lock structure with an appropriate FLOCK lock. */
-static int flock_make_lock(struct file *filp, struct file_lock **lock,
- unsigned int cmd)
+static struct file_lock *
+flock_make_lock(struct file *filp, unsigned int cmd)
{
struct file_lock *fl;
int type = flock_translate_cmd(cmd);
+
if (type < 0)
- return type;
+ return ERR_PTR(type);
fl = locks_alloc_lock();
if (fl == NULL)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
fl->fl_file = filp;
fl->fl_owner = filp;
@@ -331,8 +346,7 @@ static int flock_make_lock(struct file *filp, struct file_lock **lock,
fl->fl_type = type;
fl->fl_end = OFFSET_MAX;
- *lock = fl;
- return 0;
+ return fl;
}
static int assign_type(struct file_lock *fl, long type)
@@ -413,14 +427,34 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
}
/* default lease lock manager operations */
-static void lease_break_callback(struct file_lock *fl)
+static bool
+lease_break_callback(struct file_lock *fl)
{
kill_fasync(&fl->fl_fasync, SIGIO, POLL_MSG);
+ return false;
+}
+
+static void
+lease_setup(struct file_lock *fl, void **priv)
+{
+ struct file *filp = fl->fl_file;
+ struct fasync_struct *fa = *priv;
+
+ /*
+ * fasync_insert_entry() returns the old entry if any. If there was no
+ * old entry, then it used "priv" and inserted it into the fasync list.
+ * Clear the pointer to indicate that it shouldn't be freed.
+ */
+ if (!fasync_insert_entry(fa->fa_fd, filp, &fl->fl_fasync, fa))
+ *priv = NULL;
+
+ __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
}
static const struct lock_manager_operations lease_manager_ops = {
.lm_break = lease_break_callback,
.lm_change = lease_modify,
+ .lm_setup = lease_setup,
};
/*
@@ -431,7 +465,7 @@ static int lease_init(struct file *filp, long type, struct file_lock *fl)
if (assign_type(fl, type) != 0)
return -EINVAL;
- fl->fl_owner = current->files;
+ fl->fl_owner = filp;
fl->fl_pid = current->tgid;
fl->fl_file = filp;
@@ -650,12 +684,16 @@ static void locks_unlink_lock(struct file_lock **thisfl_p)
*
* Must be called with i_lock held!
*/
-static void locks_delete_lock(struct file_lock **thisfl_p)
+static void locks_delete_lock(struct file_lock **thisfl_p,
+ struct list_head *dispose)
{
struct file_lock *fl = *thisfl_p;
locks_unlink_lock(thisfl_p);
- locks_free_lock(fl);
+ if (dispose)
+ list_add(&fl->fl_block, dispose);
+ else
+ locks_free_lock(fl);
}
/* Determine if lock sys_fl blocks lock caller_fl. Common functionality
@@ -718,7 +756,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
break;
}
if (cfl) {
- __locks_copy_lock(fl, cfl);
+ locks_copy_conflock(fl, cfl);
if (cfl->fl_nspid)
fl->fl_pid = pid_vnr(cfl->fl_nspid);
} else
@@ -811,6 +849,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
struct inode * inode = file_inode(filp);
int error = 0;
int found = 0;
+ LIST_HEAD(dispose);
if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
new_fl = locks_alloc_lock();
@@ -833,7 +872,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
if (request->fl_type == fl->fl_type)
goto out;
found = 1;
- locks_delete_lock(before);
+ locks_delete_lock(before, &dispose);
break;
}
@@ -880,6 +919,7 @@ out:
spin_unlock(&inode->i_lock);
if (new_fl)
locks_free_lock(new_fl);
+ locks_dispose_list(&dispose);
return error;
}
@@ -893,6 +933,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
struct file_lock **before;
int error;
bool added = false;
+ LIST_HEAD(dispose);
/*
* We may need two file_lock structures for this operation,
@@ -921,7 +962,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
if (!posix_locks_conflict(request, fl))
continue;
if (conflock)
- __locks_copy_lock(conflock, fl);
+ locks_copy_conflock(conflock, fl);
error = -EAGAIN;
if (!(request->fl_flags & FL_SLEEP))
goto out;
@@ -988,7 +1029,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
else
request->fl_end = fl->fl_end;
if (added) {
- locks_delete_lock(before);
+ locks_delete_lock(before, &dispose);
continue;
}
request = fl;
@@ -1018,21 +1059,24 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
* one (This may happen several times).
*/
if (added) {
- locks_delete_lock(before);
+ locks_delete_lock(before, &dispose);
continue;
}
- /* Replace the old lock with the new one.
- * Wake up anybody waiting for the old one,
- * as the change in lock type might satisfy
- * their needs.
+ /*
+ * Replace the old lock with new_fl, and
+ * remove the old one. It's safe to do the
+ * insert here since we know that we won't be
+ * using new_fl later, and that the lock is
+ * just replacing an existing lock.
*/
- locks_wake_up_blocks(fl);
- fl->fl_start = request->fl_start;
- fl->fl_end = request->fl_end;
- fl->fl_type = request->fl_type;
- locks_release_private(fl);
- locks_copy_private(fl, request);
- request = fl;
+ error = -ENOLCK;
+ if (!new_fl)
+ goto out;
+ locks_copy_lock(new_fl, request);
+ request = new_fl;
+ new_fl = NULL;
+ locks_delete_lock(before, &dispose);
+ locks_insert_lock(before, request);
added = true;
}
}
@@ -1093,6 +1137,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
locks_free_lock(new_fl);
if (new_fl2)
locks_free_lock(new_fl2);
+ locks_dispose_list(&dispose);
return error;
}
@@ -1249,7 +1294,7 @@ static void lease_clear_pending(struct file_lock *fl, int arg)
}
/* We already had a lease on this file; just change its type */
-int lease_modify(struct file_lock **before, int arg)
+int lease_modify(struct file_lock **before, int arg, struct list_head *dispose)
{
struct file_lock *fl = *before;
int error = assign_type(fl, arg);
@@ -1268,11 +1313,10 @@ int lease_modify(struct file_lock **before, int arg)
printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync);
fl->fl_fasync = NULL;
}
- locks_delete_lock(before);
+ locks_delete_lock(before, dispose);
}
return 0;
}
-
EXPORT_SYMBOL(lease_modify);
static bool past_time(unsigned long then)
@@ -1283,18 +1327,20 @@ static bool past_time(unsigned long then)
return time_after(jiffies, then);
}
-static void time_out_leases(struct inode *inode)
+static void time_out_leases(struct inode *inode, struct list_head *dispose)
{
struct file_lock **before;
struct file_lock *fl;
+ lockdep_assert_held(&inode->i_lock);
+
before = &inode->i_flock;
while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) {
trace_time_out_leases(inode, fl);
if (past_time(fl->fl_downgrade_time))
- lease_modify(before, F_RDLCK);
+ lease_modify(before, F_RDLCK, dispose);
if (past_time(fl->fl_break_time))
- lease_modify(before, F_UNLCK);
+ lease_modify(before, F_UNLCK, dispose);
if (fl == *before) /* lease_modify may have freed fl */
before = &fl->fl_next;
}
@@ -1307,6 +1353,20 @@ static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
return locks_conflict(breaker, lease);
}
+static bool
+any_leases_conflict(struct inode *inode, struct file_lock *breaker)
+{
+ struct file_lock *fl;
+
+ lockdep_assert_held(&inode->i_lock);
+
+ for (fl = inode->i_flock ; fl && IS_LEASE(fl); fl = fl->fl_next) {
+ if (leases_conflict(fl, breaker))
+ return true;
+ }
+ return false;
+}
+
/**
* __break_lease - revoke all outstanding leases on file
* @inode: the inode of the file to return
@@ -1323,12 +1383,11 @@ static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
{
int error = 0;
- struct file_lock *new_fl, *flock;
- struct file_lock *fl;
+ struct file_lock *new_fl;
+ struct file_lock *fl, **before;
unsigned long break_time;
- int i_have_this_lease = 0;
- bool lease_conflict = false;
int want_write = (mode & O_ACCMODE) != O_RDONLY;
+ LIST_HEAD(dispose);
new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
if (IS_ERR(new_fl))
@@ -1337,20 +1396,9 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
spin_lock(&inode->i_lock);
- time_out_leases(inode);
-
- flock = inode->i_flock;
- if ((flock == NULL) || !IS_LEASE(flock))
- goto out;
+ time_out_leases(inode, &dispose);
- for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) {
- if (leases_conflict(fl, new_fl)) {
- lease_conflict = true;
- if (fl->fl_owner == current->files)
- i_have_this_lease = 1;
- }
- }
- if (!lease_conflict)
+ if (!any_leases_conflict(inode, new_fl))
goto out;
break_time = 0;
@@ -1360,7 +1408,9 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
break_time++; /* so that 0 means no break time */
}
- for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) {
+ for (before = &inode->i_flock;
+ ((fl = *before) != NULL) && IS_LEASE(fl);
+ before = &fl->fl_next) {
if (!leases_conflict(fl, new_fl))
continue;
if (want_write) {
@@ -1369,51 +1419,56 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
fl->fl_flags |= FL_UNLOCK_PENDING;
fl->fl_break_time = break_time;
} else {
- if (lease_breaking(flock))
+ if (lease_breaking(inode->i_flock))
continue;
fl->fl_flags |= FL_DOWNGRADE_PENDING;
fl->fl_downgrade_time = break_time;
}
- fl->fl_lmops->lm_break(fl);
+ if (fl->fl_lmops->lm_break(fl))
+ locks_delete_lock(before, &dispose);
}
- if (i_have_this_lease || (mode & O_NONBLOCK)) {
+ fl = inode->i_flock;
+ if (!fl || !IS_LEASE(fl))
+ goto out;
+
+ if (mode & O_NONBLOCK) {
trace_break_lease_noblock(inode, new_fl);
error = -EWOULDBLOCK;
goto out;
}
restart:
- break_time = flock->fl_break_time;
+ break_time = inode->i_flock->fl_break_time;
if (break_time != 0)
break_time -= jiffies;
if (break_time == 0)
break_time++;
- locks_insert_block(flock, new_fl);
+ locks_insert_block(inode->i_flock, new_fl);
trace_break_lease_block(inode, new_fl);
spin_unlock(&inode->i_lock);
+ locks_dispose_list(&dispose);
error = wait_event_interruptible_timeout(new_fl->fl_wait,
!new_fl->fl_next, break_time);
spin_lock(&inode->i_lock);
trace_break_lease_unblock(inode, new_fl);
locks_delete_block(new_fl);
if (error >= 0) {
- if (error == 0)
- time_out_leases(inode);
/*
* Wait for the next conflicting lease that has not been
* broken yet
*/
- for (flock = inode->i_flock; flock && IS_LEASE(flock);
- flock = flock->fl_next) {
- if (leases_conflict(new_fl, flock))
- goto restart;
- }
+ if (error == 0)
+ time_out_leases(inode, &dispose);
+ if (any_leases_conflict(inode, new_fl))
+ goto restart;
+
error = 0;
}
out:
spin_unlock(&inode->i_lock);
+ locks_dispose_list(&dispose);
locks_free_lock(new_fl);
return error;
}
@@ -1431,8 +1486,18 @@ EXPORT_SYMBOL(__break_lease);
*/
void lease_get_mtime(struct inode *inode, struct timespec *time)
{
- struct file_lock *flock = inode->i_flock;
- if (flock && IS_LEASE(flock) && (flock->fl_type == F_WRLCK))
+ bool has_lease = false;
+ struct file_lock *flock;
+
+ if (inode->i_flock) {
+ spin_lock(&inode->i_lock);
+ flock = inode->i_flock;
+ if (flock && IS_LEASE(flock) && (flock->fl_type == F_WRLCK))
+ has_lease = true;
+ spin_unlock(&inode->i_lock);
+ }
+
+ if (has_lease)
*time = current_fs_time(inode->i_sb);
else
*time = inode->i_mtime;
@@ -1468,9 +1533,10 @@ int fcntl_getlease(struct file *filp)
struct file_lock *fl;
struct inode *inode = file_inode(filp);
int type = F_UNLCK;
+ LIST_HEAD(dispose);
spin_lock(&inode->i_lock);
- time_out_leases(file_inode(filp));
+ time_out_leases(file_inode(filp), &dispose);
for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl);
fl = fl->fl_next) {
if (fl->fl_file == filp) {
@@ -1479,6 +1545,7 @@ int fcntl_getlease(struct file *filp)
}
}
spin_unlock(&inode->i_lock);
+ locks_dispose_list(&dispose);
return type;
}
@@ -1508,13 +1575,15 @@ check_conflicting_open(const struct dentry *dentry, const long arg)
return ret;
}
-static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
+static int
+generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
{
struct file_lock *fl, **before, **my_before = NULL, *lease;
struct dentry *dentry = filp->f_path.dentry;
struct inode *inode = dentry->d_inode;
bool is_deleg = (*flp)->fl_flags & FL_DELEG;
int error;
+ LIST_HEAD(dispose);
lease = *flp;
trace_generic_add_lease(inode, lease);
@@ -1537,6 +1606,8 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp
return -EINVAL;
}
+ spin_lock(&inode->i_lock);
+ time_out_leases(inode, &dispose);
error = check_conflicting_open(dentry, arg);
if (error)
goto out;
@@ -1572,10 +1643,11 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp
}
if (my_before != NULL) {
- error = lease->fl_lmops->lm_change(my_before, arg);
- if (!error)
- *flp = *my_before;
- goto out;
+ lease = *my_before;
+ error = lease->fl_lmops->lm_change(my_before, arg, &dispose);
+ if (error)
+ goto out;
+ goto out_setup;
}
error = -EINVAL;
@@ -1595,43 +1667,61 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp
smp_mb();
error = check_conflicting_open(dentry, arg);
if (error)
- locks_unlink_lock(flp);
+ goto out_unlink;
+
+out_setup:
+ if (lease->fl_lmops->lm_setup)
+ lease->fl_lmops->lm_setup(lease, priv);
out:
+ spin_unlock(&inode->i_lock);
+ locks_dispose_list(&dispose);
if (is_deleg)
mutex_unlock(&inode->i_mutex);
+ if (!error && !my_before)
+ *flp = NULL;
return error;
+out_unlink:
+ locks_unlink_lock(before);
+ goto out;
}
-static int generic_delete_lease(struct file *filp, struct file_lock **flp)
+static int generic_delete_lease(struct file *filp)
{
+ int error = -EAGAIN;
struct file_lock *fl, **before;
struct dentry *dentry = filp->f_path.dentry;
struct inode *inode = dentry->d_inode;
+ LIST_HEAD(dispose);
- trace_generic_delete_lease(inode, *flp);
-
+ spin_lock(&inode->i_lock);
+ time_out_leases(inode, &dispose);
for (before = &inode->i_flock;
((fl = *before) != NULL) && IS_LEASE(fl);
before = &fl->fl_next) {
- if (fl->fl_file != filp)
- continue;
- return (*flp)->fl_lmops->lm_change(before, F_UNLCK);
+ if (fl->fl_file == filp)
+ break;
}
- return -EAGAIN;
+ trace_generic_delete_lease(inode, fl);
+ if (fl)
+ error = fl->fl_lmops->lm_change(before, F_UNLCK, &dispose);
+ spin_unlock(&inode->i_lock);
+ locks_dispose_list(&dispose);
+ return error;
}
/**
* generic_setlease - sets a lease on an open file
- * @filp: file pointer
- * @arg: type of lease to obtain
- * @flp: input - file_lock to use, output - file_lock inserted
+ * @filp: file pointer
+ * @arg: type of lease to obtain
+ * @flp: input - file_lock to use, output - file_lock inserted
+ * @priv: private data for lm_setup (may be NULL if lm_setup
+ * doesn't require it)
*
* The (input) flp->fl_lmops->lm_break function is required
* by break_lease().
- *
- * Called with inode->i_lock held.
*/
-int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
+int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
+ void **priv)
{
struct dentry *dentry = filp->f_path.dentry;
struct inode *inode = dentry->d_inode;
@@ -1645,83 +1735,52 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
if (error)
return error;
- time_out_leases(inode);
-
- BUG_ON(!(*flp)->fl_lmops->lm_break);
-
switch (arg) {
case F_UNLCK:
- return generic_delete_lease(filp, flp);
+ return generic_delete_lease(filp);
case F_RDLCK:
case F_WRLCK:
- return generic_add_lease(filp, arg, flp);
+ if (!(*flp)->fl_lmops->lm_break) {
+ WARN_ON_ONCE(1);
+ return -ENOLCK;
+ }
+ return generic_add_lease(filp, arg, flp, priv);
default:
return -EINVAL;
}
}
EXPORT_SYMBOL(generic_setlease);
-static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
-{
- if (filp->f_op->setlease)
- return filp->f_op->setlease(filp, arg, lease);
- else
- return generic_setlease(filp, arg, lease);
-}
-
/**
- * vfs_setlease - sets a lease on an open file
- * @filp: file pointer
- * @arg: type of lease to obtain
- * @lease: file_lock to use
- *
- * Call this to establish a lease on the file.
- * The (*lease)->fl_lmops->lm_break operation must be set; if not,
- * break_lease will oops!
- *
- * This will call the filesystem's setlease file method, if
- * defined. Note that there is no getlease method; instead, the
- * filesystem setlease method should call back to setlease() to
- * add a lease to the inode's lease list, where fcntl_getlease() can
- * find it. Since fcntl_getlease() only reports whether the current
- * task holds a lease, a cluster filesystem need only do this for
- * leases held by processes on this node.
- *
- * There is also no break_lease method; filesystems that
- * handle their own leases should break leases themselves from the
- * filesystem's open, create, and (on truncate) setattr methods.
- *
- * Warning: the only current setlease methods exist only to disable
- * leases in certain cases. More vfs changes may be required to
- * allow a full filesystem lease implementation.
+ * vfs_setlease - sets a lease on an open file
+ * @filp: file pointer
+ * @arg: type of lease to obtain
+ * @lease: file_lock to use when adding a lease
+ * @priv: private info for lm_setup when adding a lease (may be
+ * NULL if lm_setup doesn't require it)
+ *
+ * Call this to establish a lease on the file. The "lease" argument is not
+ * used for F_UNLCK requests and may be NULL. For commands that set or alter
+ * an existing lease, the (*lease)->fl_lmops->lm_break operation must be set;
+ * if not, this function will return -ENOLCK (and generate a scary-looking
+ * stack trace).
+ *
+ * The "priv" pointer is passed directly to the lm_setup function as-is. It
+ * may be NULL if the lm_setup operation doesn't require it.
*/
-
-int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
+int
+vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv)
{
- struct inode *inode = file_inode(filp);
- int error;
-
- spin_lock(&inode->i_lock);
- error = __vfs_setlease(filp, arg, lease);
- spin_unlock(&inode->i_lock);
-
- return error;
+ if (filp->f_op->setlease)
+ return filp->f_op->setlease(filp, arg, lease, priv);
+ else
+ return generic_setlease(filp, arg, lease, priv);
}
EXPORT_SYMBOL_GPL(vfs_setlease);
-static int do_fcntl_delete_lease(struct file *filp)
-{
- struct file_lock fl, *flp = &fl;
-
- lease_init(filp, F_UNLCK, flp);
-
- return vfs_setlease(filp, F_UNLCK, &flp);
-}
-
static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
{
- struct file_lock *fl, *ret;
- struct inode *inode = file_inode(filp);
+ struct file_lock *fl;
struct fasync_struct *new;
int error;
@@ -1734,30 +1793,11 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
locks_free_lock(fl);
return -ENOMEM;
}
- ret = fl;
- spin_lock(&inode->i_lock);
- error = __vfs_setlease(filp, arg, &ret);
- if (error) {
- spin_unlock(&inode->i_lock);
- locks_free_lock(fl);
- goto out_free_fasync;
- }
- if (ret != fl)
- locks_free_lock(fl);
-
- /*
- * fasync_insert_entry() returns the old entry if any.
- * If there was no old entry, then it used 'new' and
- * inserted it into the fasync list. Clear new so that
- * we don't release it here.
- */
- if (!fasync_insert_entry(fd, filp, &ret->fl_fasync, new))
- new = NULL;
+ new->fa_fd = fd;
- error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
- spin_unlock(&inode->i_lock);
-
-out_free_fasync:
+ error = vfs_setlease(filp, arg, &fl, (void **)&new);
+ if (fl)
+ locks_free_lock(fl);
if (new)
fasync_free(new);
return error;
@@ -1776,7 +1816,7 @@ out_free_fasync:
int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
{
if (arg == F_UNLCK)
- return do_fcntl_delete_lease(filp);
+ return vfs_setlease(filp, F_UNLCK, NULL, NULL);
return do_fcntl_add_lease(fd, filp, arg);
}
@@ -1845,9 +1885,12 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
!(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))
goto out_putf;
- error = flock_make_lock(f.file, &lock, cmd);
- if (error)
+ lock = flock_make_lock(f.file, cmd);
+ if (IS_ERR(lock)) {
+ error = PTR_ERR(lock);
goto out_putf;
+ }
+
if (can_sleep)
lock->fl_flags |= FL_SLEEP;
@@ -1959,11 +2002,13 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l)
if (file_lock.fl_type != F_UNLCK) {
error = posix_lock_to_flock(&flock, &file_lock);
if (error)
- goto out;
+ goto rel_priv;
}
error = -EFAULT;
if (!copy_to_user(l, &flock, sizeof(flock)))
error = 0;
+rel_priv:
+ locks_release_private(&file_lock);
out:
return error;
}
@@ -2184,7 +2229,8 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l)
error = -EFAULT;
if (!copy_to_user(l, &flock, sizeof(flock)))
error = 0;
-
+
+ locks_release_private(&file_lock);
out:
return error;
}
@@ -2320,6 +2366,7 @@ void locks_remove_file(struct file *filp)
struct inode * inode = file_inode(filp);
struct file_lock *fl;
struct file_lock **before;
+ LIST_HEAD(dispose);
if (!inode->i_flock)
return;
@@ -2346,7 +2393,7 @@ void locks_remove_file(struct file *filp)
while ((fl = *before) != NULL) {
if (fl->fl_file == filp) {
if (IS_LEASE(fl)) {
- lease_modify(before, F_UNLCK);
+ lease_modify(before, F_UNLCK, &dispose);
continue;
}
@@ -2365,12 +2412,13 @@ void locks_remove_file(struct file *filp)
fl->fl_type, fl->fl_flags,
fl->fl_start, fl->fl_end);
- locks_delete_lock(before);
+ locks_delete_lock(before, &dispose);
continue;
}
before = &fl->fl_next;
}
spin_unlock(&inode->i_lock);
+ locks_dispose_list(&dispose);
}
/**
@@ -2452,7 +2500,11 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
seq_puts(f, "FLOCK ADVISORY ");
}
} else if (IS_LEASE(fl)) {
- seq_puts(f, "LEASE ");
+ if (fl->fl_flags & FL_DELEG)
+ seq_puts(f, "DELEG ");
+ else
+ seq_puts(f, "LEASE ");
+
if (lease_breaking(fl))
seq_puts(f, "BREAKING ");
else if (fl->fl_file)
@@ -2565,86 +2617,6 @@ static int __init proc_locks_init(void)
module_init(proc_locks_init);
#endif
-/**
- * lock_may_read - checks that the region is free of locks
- * @inode: the inode that is being read
- * @start: the first byte to read
- * @len: the number of bytes to read
- *
- * Emulates Windows locking requirements. Whole-file
- * mandatory locks (share modes) can prohibit a read and
- * byte-range POSIX locks can prohibit a read if they overlap.
- *
- * N.B. this function is only ever called
- * from knfsd and ownership of locks is never checked.
- */
-int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
-{
- struct file_lock *fl;
- int result = 1;
-
- spin_lock(&inode->i_lock);
- for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
- if (IS_POSIX(fl)) {
- if (fl->fl_type == F_RDLCK)
- continue;
- if ((fl->fl_end < start) || (fl->fl_start > (start + len)))
- continue;
- } else if (IS_FLOCK(fl)) {
- if (!(fl->fl_type & LOCK_MAND))
- continue;
- if (fl->fl_type & LOCK_READ)
- continue;
- } else
- continue;
- result = 0;
- break;
- }
- spin_unlock(&inode->i_lock);
- return result;
-}
-
-EXPORT_SYMBOL(lock_may_read);
-
-/**
- * lock_may_write - checks that the region is free of locks
- * @inode: the inode that is being written
- * @start: the first byte to write
- * @len: the number of bytes to write
- *
- * Emulates Windows locking requirements. Whole-file
- * mandatory locks (share modes) can prohibit a write and
- * byte-range POSIX locks can prohibit a write if they overlap.
- *
- * N.B. this function is only ever called
- * from knfsd and ownership of locks is never checked.
- */
-int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
-{
- struct file_lock *fl;
- int result = 1;
-
- spin_lock(&inode->i_lock);
- for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
- if (IS_POSIX(fl)) {
- if ((fl->fl_end < start) || (fl->fl_start > (start + len)))
- continue;
- } else if (IS_FLOCK(fl)) {
- if (!(fl->fl_type & LOCK_MAND))
- continue;
- if (fl->fl_type & LOCK_WRITE)
- continue;
- } else
- continue;
- result = 0;
- break;
- }
- spin_unlock(&inode->i_lock);
- return result;
-}
-
-EXPORT_SYMBOL(lock_may_write);
-
static int __init filelock_init(void)
{
int i;
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 48140315f627..380d86e1ab45 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1019,11 +1019,11 @@ static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs)
/**
* logfs_is_valid_block - check whether this block is still valid
*
- * @sb - superblock
- * @ofs - block physical offset
- * @ino - block inode number
- * @bix - block index
- * @level - block level
+ * @sb: superblock
+ * @ofs: block physical offset
+ * @ino: block inode number
+ * @bix: block index
+ * @gc_level: block level
*
* Returns 0 if the block is invalid, 1 if it is valid and 2 if it will
* become invalid once the journal is written.
@@ -2226,10 +2226,9 @@ void btree_write_block(struct logfs_block *block)
*
* @inode: parent inode (ifile or directory)
* @buf: object to write (inode or dentry)
- * @n: object size
- * @_pos: object number (file position in blocks/objects)
+ * @count: object size
+ * @bix: block index
* @flags: write flags
- * @lock: 0 if write lock is already taken, 1 otherwise
* @shadow_tree: shadow below this inode
*
* FIXME: All caller of this put a 200-300 byte variable on the stack,
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 4bc50dac8e97..742942a983be 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -96,7 +96,7 @@ int minix_new_block(struct inode * inode)
unsigned long minix_count_free_blocks(struct super_block *sb)
{
struct minix_sb_info *sbi = minix_sb(sb);
- u32 bits = sbi->s_nzones - (sbi->s_firstdatazone + 1);
+ u32 bits = sbi->s_nzones - sbi->s_firstdatazone + 1;
return (count_free(sbi->s_zmap, sb->s_blocksize, bits)
<< sbi->s_log_zone_size);
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index f007a3355570..3f57af196a7d 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -267,12 +267,12 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
block = minix_blocks_needed(sbi->s_ninodes, s->s_blocksize);
if (sbi->s_imap_blocks < block) {
printk("MINIX-fs: file system does not have enough "
- "imap blocks allocated. Refusing to mount\n");
+ "imap blocks allocated. Refusing to mount.\n");
goto out_no_bitmap;
}
block = minix_blocks_needed(
- (sbi->s_nzones - (sbi->s_firstdatazone + 1)),
+ (sbi->s_nzones - sbi->s_firstdatazone + 1),
s->s_blocksize);
if (sbi->s_zmap_blocks < block) {
printk("MINIX-fs: file system does not have enough "
diff --git a/fs/mount.h b/fs/mount.h
index d55297f2fa05..f82c62840905 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -21,6 +21,7 @@ struct mnt_pcp {
struct mountpoint {
struct hlist_node m_hash;
struct dentry *m_dentry;
+ struct hlist_head m_list;
int m_count;
};
@@ -29,7 +30,10 @@ struct mount {
struct mount *mnt_parent;
struct dentry *mnt_mountpoint;
struct vfsmount mnt;
- struct rcu_head mnt_rcu;
+ union {
+ struct rcu_head mnt_rcu;
+ struct llist_node mnt_llist;
+ };
#ifdef CONFIG_SMP
struct mnt_pcp __percpu *mnt_pcp;
#else
@@ -48,6 +52,7 @@ struct mount {
struct mount *mnt_master; /* slave is on master->mnt_slave_list */
struct mnt_namespace *mnt_ns; /* containing namespace */
struct mountpoint *mnt_mp; /* where is it mounted */
+ struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */
#ifdef CONFIG_FSNOTIFY
struct hlist_head mnt_fsnotify_marks;
__u32 mnt_fsnotify_mask;
@@ -55,7 +60,7 @@ struct mount {
int mnt_id; /* mount identifier */
int mnt_group_id; /* peer group identifier */
int mnt_expiry_mark; /* true if marked for expiry */
- int mnt_pinned;
+ struct hlist_head mnt_pins;
struct path mnt_ex_mountpoint;
};
@@ -82,6 +87,15 @@ extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
extern bool legitimize_mnt(struct vfsmount *, unsigned);
+extern void __detach_mounts(struct dentry *dentry);
+
+static inline void detach_mounts(struct dentry *dentry)
+{
+ if (!d_mountpoint(dentry))
+ return;
+ __detach_mounts(dentry);
+}
+
static inline void get_mnt_ns(struct mnt_namespace *ns)
{
atomic_inc(&ns->count);
@@ -112,3 +126,12 @@ struct proc_mounts {
#define proc_mounts(p) (container_of((p), struct proc_mounts, m))
extern const struct seq_operations mounts_op;
+
+extern bool __is_local_mountpoint(struct dentry *dentry);
+static inline bool is_local_mountpoint(struct dentry *dentry)
+{
+ if (!d_mountpoint(dentry))
+ return false;
+
+ return __is_local_mountpoint(dentry);
+}
diff --git a/fs/mpage.c b/fs/mpage.c
index 5f9ed622274f..3e79220babac 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -28,6 +28,7 @@
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/cleancache.h>
+#include "internal.h"
/*
* I/O completion handler for multipage BIOs.
@@ -57,6 +58,7 @@ static void mpage_end_io(struct bio *bio, int err)
static struct bio *mpage_bio_submit(int rw, struct bio *bio)
{
bio->bi_end_io = mpage_end_io;
+ guard_bio_eod(rw, bio);
submit_bio(rw, bio);
return NULL;
}
diff --git a/fs/namei.c b/fs/namei.c
index 9eb787e5c167..db5fe86319e6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -34,6 +34,7 @@
#include <linux/device_cgroup.h>
#include <linux/fs_struct.h>
#include <linux/posix_acl.h>
+#include <linux/hash.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -415,6 +416,7 @@ int __inode_permission(struct inode *inode, int mask)
return security_inode_permission(inode, mask);
}
+EXPORT_SYMBOL(__inode_permission);
/**
* sb_permission - Check superblock-level permissions
@@ -643,24 +645,22 @@ static int complete_walk(struct nameidata *nd)
static __always_inline void set_root(struct nameidata *nd)
{
- if (!nd->root.mnt)
- get_fs_root(current->fs, &nd->root);
+ get_fs_root(current->fs, &nd->root);
}
static int link_path_walk(const char *, struct nameidata *);
-static __always_inline void set_root_rcu(struct nameidata *nd)
+static __always_inline unsigned set_root_rcu(struct nameidata *nd)
{
- if (!nd->root.mnt) {
- struct fs_struct *fs = current->fs;
- unsigned seq;
+ struct fs_struct *fs = current->fs;
+ unsigned seq, res;
- do {
- seq = read_seqcount_begin(&fs->seq);
- nd->root = fs->root;
- nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
- } while (read_seqcount_retry(&fs->seq, seq));
- }
+ do {
+ seq = read_seqcount_begin(&fs->seq);
+ nd->root = fs->root;
+ res = __read_seqcount_begin(&nd->root.dentry->d_seq);
+ } while (read_seqcount_retry(&fs->seq, seq));
+ return res;
}
static void path_put_conditional(struct path *path, struct nameidata *nd)
@@ -860,7 +860,8 @@ follow_link(struct path *link, struct nameidata *nd, void **p)
return PTR_ERR(s);
}
if (*s == '/') {
- set_root(nd);
+ if (!nd->root.mnt)
+ set_root(nd);
path_put(&nd->path);
nd->path = nd->root;
path_get(&nd->root);
@@ -1091,10 +1092,10 @@ int follow_down_one(struct path *path)
}
EXPORT_SYMBOL(follow_down_one);
-static inline bool managed_dentry_might_block(struct dentry *dentry)
+static inline int managed_dentry_rcu(struct dentry *dentry)
{
- return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
- dentry->d_op->d_manage(dentry, true) < 0);
+ return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
+ dentry->d_op->d_manage(dentry, true) : 0;
}
/*
@@ -1110,11 +1111,18 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
* Don't forget we might have a non-mountpoint managed dentry
* that wants to block transit.
*/
- if (unlikely(managed_dentry_might_block(path->dentry)))
+ switch (managed_dentry_rcu(path->dentry)) {
+ case -ECHILD:
+ default:
return false;
+ case -EISDIR:
+ return true;
+ case 0:
+ break;
+ }
if (!d_mountpoint(path->dentry))
- return true;
+ return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
mounted = __lookup_mnt(path->mnt, path->dentry);
if (!mounted)
@@ -1130,12 +1138,15 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
*/
*inode = path->dentry->d_inode;
}
- return read_seqretry(&mount_lock, nd->m_seq);
+ return !read_seqretry(&mount_lock, nd->m_seq) &&
+ !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
}
static int follow_dotdot_rcu(struct nameidata *nd)
{
- set_root_rcu(nd);
+ struct inode *inode = nd->inode;
+ if (!nd->root.mnt)
+ set_root_rcu(nd);
while (1) {
if (nd->path.dentry == nd->root.dentry &&
@@ -1147,6 +1158,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
struct dentry *parent = old->d_parent;
unsigned seq;
+ inode = parent->d_inode;
seq = read_seqcount_begin(&parent->d_seq);
if (read_seqcount_retry(&old->d_seq, nd->seq))
goto failed;
@@ -1156,6 +1168,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
}
if (!follow_up_rcu(&nd->path))
break;
+ inode = nd->path.dentry->d_inode;
nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
}
while (d_mountpoint(nd->path.dentry)) {
@@ -1165,11 +1178,12 @@ static int follow_dotdot_rcu(struct nameidata *nd)
break;
nd->path.mnt = &mounted->mnt;
nd->path.dentry = mounted->mnt.mnt_root;
+ inode = nd->path.dentry->d_inode;
nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
- if (!read_seqretry(&mount_lock, nd->m_seq))
+ if (read_seqretry(&mount_lock, nd->m_seq))
goto failed;
}
- nd->inode = nd->path.dentry->d_inode;
+ nd->inode = inode;
return 0;
failed:
@@ -1248,7 +1262,8 @@ static void follow_mount(struct path *path)
static void follow_dotdot(struct nameidata *nd)
{
- set_root(nd);
+ if (!nd->root.mnt)
+ set_root(nd);
while(1) {
struct dentry *old = nd->path.dentry;
@@ -1292,7 +1307,8 @@ static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
if (error < 0) {
dput(dentry);
return ERR_PTR(error);
- } else if (!d_invalidate(dentry)) {
+ } else {
+ d_invalidate(dentry);
dput(dentry);
dentry = NULL;
}
@@ -1402,11 +1418,8 @@ static int lookup_fast(struct nameidata *nd,
}
path->mnt = mnt;
path->dentry = dentry;
- if (unlikely(!__follow_mount_rcu(nd, path, inode)))
- goto unlazy;
- if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
- goto unlazy;
- return 0;
+ if (likely(__follow_mount_rcu(nd, path, inode)))
+ return 0;
unlazy:
if (unlazy_walk(nd, dentry))
return -ECHILD;
@@ -1424,10 +1437,9 @@ unlazy:
dput(dentry);
return status;
}
- if (!d_invalidate(dentry)) {
- dput(dentry);
- goto need_lookup;
- }
+ d_invalidate(dentry);
+ dput(dentry);
+ goto need_lookup;
}
path->mnt = mnt;
@@ -1629,8 +1641,7 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd)
static inline unsigned int fold_hash(unsigned long hash)
{
- hash += hash >> (8*sizeof(int));
- return hash;
+ return hash_64(hash, 32);
}
#else /* 32-bit case */
@@ -1664,9 +1675,9 @@ EXPORT_SYMBOL(full_name_hash);
/*
* Calculate the length and hash of the path component, and
- * return the length of the component;
+ * return the "hash_len" as the result.
*/
-static inline unsigned long hash_name(const char *name, unsigned int *hashp)
+static inline u64 hash_name(const char *name)
{
unsigned long a, b, adata, bdata, mask, hash, len;
const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
@@ -1686,9 +1697,8 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp)
mask = create_zero_mask(adata | bdata);
hash += a & zero_bytemask(mask);
- *hashp = fold_hash(hash);
-
- return len + find_zero(mask);
+ len += find_zero(mask);
+ return hashlen_create(fold_hash(hash), len);
}
#else
@@ -1706,7 +1716,7 @@ EXPORT_SYMBOL(full_name_hash);
* We know there's a real path component here of at least
* one character.
*/
-static inline unsigned long hash_name(const char *name, unsigned int *hashp)
+static inline u64 hash_name(const char *name)
{
unsigned long hash = init_name_hash();
unsigned long len = 0, c;
@@ -1717,8 +1727,7 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp)
hash = partial_name_hash(c, hash);
c = (unsigned char)name[len];
} while (c && c != '/');
- *hashp = end_name_hash(hash);
- return len;
+ return hashlen_create(end_name_hash(hash), len);
}
#endif
@@ -1743,20 +1752,17 @@ static int link_path_walk(const char *name, struct nameidata *nd)
/* At this point we know we have a real path component. */
for(;;) {
- struct qstr this;
- long len;
+ u64 hash_len;
int type;
err = may_lookup(nd);
if (err)
break;
- len = hash_name(name, &this.hash);
- this.name = name;
- this.len = len;
+ hash_len = hash_name(name);
type = LAST_NORM;
- if (name[0] == '.') switch (len) {
+ if (name[0] == '.') switch (hashlen_len(hash_len)) {
case 2:
if (name[1] == '.') {
type = LAST_DOTDOT;
@@ -1770,29 +1776,32 @@ static int link_path_walk(const char *name, struct nameidata *nd)
struct dentry *parent = nd->path.dentry;
nd->flags &= ~LOOKUP_JUMPED;
if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
+ struct qstr this = { { .hash_len = hash_len }, .name = name };
err = parent->d_op->d_hash(parent, &this);
if (err < 0)
break;
+ hash_len = this.hash_len;
+ name = this.name;
}
}
- nd->last = this;
+ nd->last.hash_len = hash_len;
+ nd->last.name = name;
nd->last_type = type;
- if (!name[len])
+ name += hashlen_len(hash_len);
+ if (!*name)
return 0;
/*
* If it wasn't NUL, we know it was '/'. Skip that
* slash, and continue until no more slashes.
*/
do {
- len++;
- } while (unlikely(name[len] == '/'));
- if (!name[len])
+ name++;
+ } while (unlikely(*name == '/'));
+ if (!*name)
return 0;
- name += len;
-
err = walk_component(nd, &next, LOOKUP_FOLLOW);
if (err < 0)
return err;
@@ -1847,7 +1856,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
if (*name=='/') {
if (flags & LOOKUP_RCU) {
rcu_read_lock();
- set_root_rcu(nd);
+ nd->seq = set_root_rcu(nd);
} else {
set_root(nd);
path_get(&nd->root);
@@ -1898,7 +1907,14 @@ static int path_init(int dfd, const char *name, unsigned int flags,
}
nd->inode = nd->path.dentry->d_inode;
- return 0;
+ if (!(flags & LOOKUP_RCU))
+ return 0;
+ if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq)))
+ return 0;
+ if (!(nd->flags & LOOKUP_ROOT))
+ nd->root.mnt = NULL;
+ rcu_read_unlock();
+ return -ECHILD;
}
static inline int lookup_last(struct nameidata *nd, struct path *path)
@@ -1935,7 +1951,7 @@ static int path_lookupat(int dfd, const char *name,
err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);
if (unlikely(err))
- return err;
+ goto out;
current->total_link_count = 0;
err = link_path_walk(name, nd);
@@ -1967,6 +1983,7 @@ static int path_lookupat(int dfd, const char *name,
}
}
+out:
if (base)
fput(base);
@@ -2286,7 +2303,7 @@ path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags
err = path_init(dfd, name, flags | LOOKUP_PARENT, &nd, &base);
if (unlikely(err))
- return err;
+ goto out;
current->total_link_count = 0;
err = link_path_walk(name, &nd);
@@ -2367,22 +2384,17 @@ kern_path_mountpoint(int dfd, const char *name, struct path *path,
}
EXPORT_SYMBOL(kern_path_mountpoint);
-/*
- * It's inline, so penalty for filesystems that don't use sticky bit is
- * minimal.
- */
-static inline int check_sticky(struct inode *dir, struct inode *inode)
+int __check_sticky(struct inode *dir, struct inode *inode)
{
kuid_t fsuid = current_fsuid();
- if (!(dir->i_mode & S_ISVTX))
- return 0;
if (uid_eq(inode->i_uid, fsuid))
return 0;
if (uid_eq(dir->i_uid, fsuid))
return 0;
return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
}
+EXPORT_SYMBOL(__check_sticky);
/*
* Check whether we can remove a link victim from directory dir, check
@@ -2485,7 +2497,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
}
mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
+ mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT2);
return NULL;
}
EXPORT_SYMBOL(lock_rename);
@@ -3048,9 +3060,12 @@ finish_open_created:
error = may_open(&nd->path, acc_mode, open_flag);
if (error)
goto out;
- file->f_path.mnt = nd->path.mnt;
- error = finish_open(file, nd->path.dentry, NULL, opened);
- if (error) {
+
+ BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
+ error = vfs_open(&nd->path, file, current_cred());
+ if (!error) {
+ *opened |= FILE_OPENED;
+ } else {
if (error == -EOPENSTALE)
goto stale_open;
goto out;
@@ -3059,7 +3074,7 @@ opened:
error = open_check_o_direct(file);
if (error)
goto exit_fput;
- error = ima_file_check(file, op->acc_mode);
+ error = ima_file_check(file, op->acc_mode, *opened);
if (error)
goto exit_fput;
@@ -3139,7 +3154,8 @@ static int do_tmpfile(int dfd, struct filename *pathname,
if (error)
goto out2;
audit_inode(pathname, nd->path.dentry, 0);
- error = may_open(&nd->path, op->acc_mode, op->open_flag);
+ /* Don't check for other permissions, the inode was just created */
+ error = may_open(&nd->path, MAY_OPEN, op->open_flag);
if (error)
goto out2;
file->f_path.mnt = nd->path.mnt;
@@ -3550,7 +3566,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
mutex_lock(&dentry->d_inode->i_mutex);
error = -EBUSY;
- if (d_mountpoint(dentry))
+ if (is_local_mountpoint(dentry))
goto out;
error = security_inode_rmdir(dir, dentry);
@@ -3564,6 +3580,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
dentry->d_inode->i_flags |= S_DEAD;
dont_mount(dentry);
+ detach_mounts(dentry);
out:
mutex_unlock(&dentry->d_inode->i_mutex);
@@ -3666,7 +3683,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate
return -EPERM;
mutex_lock(&target->i_mutex);
- if (d_mountpoint(dentry))
+ if (is_local_mountpoint(dentry))
error = -EBUSY;
else {
error = security_inode_unlink(dir, dentry);
@@ -3675,8 +3692,10 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate
if (error)
goto out;
error = dir->i_op->unlink(dir, dentry);
- if (!error)
+ if (!error) {
dont_mount(dentry);
+ detach_mounts(dentry);
+ }
}
}
out:
@@ -4019,7 +4038,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
* The worst of all namespace operations - renaming directory. "Perverted"
* doesn't even start to describe it. Somebody in UCB had a heck of a trip...
* Problems:
- * a) we can get into loop creation. Check is done in is_subdir().
+ * a) we can get into loop creation.
* b) race potential - two innocent renames can create a loop together.
* That's where 4.4 screws up. Current fix: serialization on
* sb->s_vfs_rename_mutex. We might be more accurate, but that's another
@@ -4075,7 +4094,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (error)
return error;
- if (!old_dir->i_op->rename)
+ if (!old_dir->i_op->rename && !old_dir->i_op->rename2)
return -EPERM;
if (flags && !old_dir->i_op->rename2)
@@ -4111,7 +4130,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
mutex_lock(&target->i_mutex);
error = -EBUSY;
- if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
+ if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
goto out;
if (max_links && new_dir != old_dir) {
@@ -4134,10 +4153,11 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (error)
goto out;
}
- if (!flags) {
+ if (!old_dir->i_op->rename2) {
error = old_dir->i_op->rename(old_dir, old_dentry,
new_dir, new_dentry);
} else {
+ WARN_ON(old_dir->i_op->rename != NULL);
error = old_dir->i_op->rename2(old_dir, old_dentry,
new_dir, new_dentry, flags);
}
@@ -4148,6 +4168,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (is_dir)
target->i_flags |= S_DEAD;
dont_mount(new_dentry);
+ detach_mounts(new_dentry);
}
if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
if (!(flags & RENAME_EXCHANGE))
@@ -4189,12 +4210,16 @@ SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
bool should_retry = false;
int error;
- if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
- if ((flags & RENAME_NOREPLACE) && (flags & RENAME_EXCHANGE))
+ if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
+ (flags & RENAME_EXCHANGE))
return -EINVAL;
+ if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD))
+ return -EPERM;
+
retry:
from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags);
if (IS_ERR(from)) {
@@ -4326,6 +4351,20 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna
return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
}
+int vfs_whiteout(struct inode *dir, struct dentry *dentry)
+{
+ int error = may_create(dir, dentry);
+ if (error)
+ return error;
+
+ if (!dir->i_op->mknod)
+ return -EPERM;
+
+ return dir->i_op->mknod(dir, dentry,
+ S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
+}
+EXPORT_SYMBOL(vfs_whiteout);
+
int readlink_copy(char __user *buffer, int buflen, const char *link)
{
int len = PTR_ERR(link);
diff --git a/fs/namespace.c b/fs/namespace.c
index 182bc41cd887..5b66b2b3624d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -16,7 +16,6 @@
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/idr.h>
-#include <linux/acct.h> /* acct_auto_close_mnt */
#include <linux/init.h> /* init_rootfs */
#include <linux/fs_struct.h> /* get_fs_root et.al. */
#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
@@ -24,6 +23,7 @@
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/bootmem.h>
+#include <linux/task_work.h>
#include "pnode.h"
#include "internal.h"
@@ -225,6 +225,7 @@ static struct mount *alloc_vfsmnt(const char *name)
INIT_LIST_HEAD(&mnt->mnt_share);
INIT_LIST_HEAD(&mnt->mnt_slave_list);
INIT_LIST_HEAD(&mnt->mnt_slave);
+ INIT_HLIST_NODE(&mnt->mnt_mp_list);
#ifdef CONFIG_FSNOTIFY
INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
#endif
@@ -667,11 +668,45 @@ struct vfsmount *lookup_mnt(struct path *path)
return m;
}
-static struct mountpoint *new_mountpoint(struct dentry *dentry)
+/*
+ * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
+ * current mount namespace.
+ *
+ * The common case is dentries are not mountpoints at all and that
+ * test is handled inline. For the slow case when we are actually
+ * dealing with a mountpoint of some kind, walk through all of the
+ * mounts in the current mount namespace and test to see if the dentry
+ * is a mountpoint.
+ *
+ * The mount_hashtable is not usable in the context because we
+ * need to identify all mounts that may be in the current mount
+ * namespace not just a mount that happens to have some specified
+ * parent mount.
+ */
+bool __is_local_mountpoint(struct dentry *dentry)
+{
+ struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+ struct mount *mnt;
+ bool is_covered = false;
+
+ if (!d_mountpoint(dentry))
+ goto out;
+
+ down_read(&namespace_sem);
+ list_for_each_entry(mnt, &ns->list, mnt_list) {
+ is_covered = (mnt->mnt_mountpoint == dentry);
+ if (is_covered)
+ break;
+ }
+ up_read(&namespace_sem);
+out:
+ return is_covered;
+}
+
+static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
{
struct hlist_head *chain = mp_hash(dentry);
struct mountpoint *mp;
- int ret;
hlist_for_each_entry(mp, chain, m_hash) {
if (mp->m_dentry == dentry) {
@@ -682,6 +717,14 @@ static struct mountpoint *new_mountpoint(struct dentry *dentry)
return mp;
}
}
+ return NULL;
+}
+
+static struct mountpoint *new_mountpoint(struct dentry *dentry)
+{
+ struct hlist_head *chain = mp_hash(dentry);
+ struct mountpoint *mp;
+ int ret;
mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
if (!mp)
@@ -696,6 +739,7 @@ static struct mountpoint *new_mountpoint(struct dentry *dentry)
mp->m_dentry = dentry;
mp->m_count = 1;
hlist_add_head(&mp->m_hash, chain);
+ INIT_HLIST_HEAD(&mp->m_list);
return mp;
}
@@ -703,6 +747,7 @@ static void put_mountpoint(struct mountpoint *mp)
{
if (!--mp->m_count) {
struct dentry *dentry = mp->m_dentry;
+ BUG_ON(!hlist_empty(&mp->m_list));
spin_lock(&dentry->d_lock);
dentry->d_flags &= ~DCACHE_MOUNTED;
spin_unlock(&dentry->d_lock);
@@ -749,6 +794,7 @@ static void detach_mnt(struct mount *mnt, struct path *old_path)
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
list_del_init(&mnt->mnt_child);
hlist_del_init_rcu(&mnt->mnt_hash);
+ hlist_del_init(&mnt->mnt_mp_list);
put_mountpoint(mnt->mnt_mp);
mnt->mnt_mp = NULL;
}
@@ -765,6 +811,7 @@ void mnt_set_mountpoint(struct mount *mnt,
child_mnt->mnt_mountpoint = dget(mp->m_dentry);
child_mnt->mnt_parent = mnt;
child_mnt->mnt_mp = mp;
+ hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
}
/*
@@ -779,6 +826,20 @@ static void attach_mnt(struct mount *mnt,
list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
}
+static void attach_shadowed(struct mount *mnt,
+ struct mount *parent,
+ struct mount *shadows)
+{
+ if (shadows) {
+ hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
+ list_add(&mnt->mnt_child, &shadows->mnt_child);
+ } else {
+ hlist_add_head_rcu(&mnt->mnt_hash,
+ m_hash(&parent->mnt, mnt->mnt_mountpoint));
+ list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+ }
+}
+
/*
* vfsmount lock must be held for write
*/
@@ -797,12 +858,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
list_splice(&head, n->list.prev);
- if (shadows)
- hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash);
- else
- hlist_add_head_rcu(&mnt->mnt_hash,
- m_hash(&parent->mnt, mnt->mnt_mountpoint));
- list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+ attach_shadowed(mnt, parent, shadows);
touch_mnt_namespace(n);
}
@@ -890,8 +946,21 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED);
/* Don't allow unprivileged users to change mount flags */
- if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY))
- mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
+ if (flag & CL_UNPRIVILEGED) {
+ mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;
+
+ if (mnt->mnt.mnt_flags & MNT_READONLY)
+ mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
+
+ if (mnt->mnt.mnt_flags & MNT_NODEV)
+ mnt->mnt.mnt_flags |= MNT_LOCK_NODEV;
+
+ if (mnt->mnt.mnt_flags & MNT_NOSUID)
+ mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID;
+
+ if (mnt->mnt.mnt_flags & MNT_NOEXEC)
+ mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC;
+ }
/* Don't allow unprivileged users to reveal what is under a mount */
if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire))
@@ -936,9 +1005,48 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
return ERR_PTR(err);
}
+static void cleanup_mnt(struct mount *mnt)
+{
+ /*
+ * This probably indicates that somebody messed
+ * up a mnt_want/drop_write() pair. If this
+ * happens, the filesystem was probably unable
+ * to make r/w->r/o transitions.
+ */
+ /*
+ * The locking used to deal with mnt_count decrement provides barriers,
+ * so mnt_get_writers() below is safe.
+ */
+ WARN_ON(mnt_get_writers(mnt));
+ if (unlikely(mnt->mnt_pins.first))
+ mnt_pin_kill(mnt);
+ fsnotify_vfsmount_delete(&mnt->mnt);
+ dput(mnt->mnt.mnt_root);
+ deactivate_super(mnt->mnt.mnt_sb);
+ mnt_free_id(mnt);
+ call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
+}
+
+static void __cleanup_mnt(struct rcu_head *head)
+{
+ cleanup_mnt(container_of(head, struct mount, mnt_rcu));
+}
+
+static LLIST_HEAD(delayed_mntput_list);
+static void delayed_mntput(struct work_struct *unused)
+{
+ struct llist_node *node = llist_del_all(&delayed_mntput_list);
+ struct llist_node *next;
+
+ for (; node; node = next) {
+ next = llist_next(node);
+ cleanup_mnt(llist_entry(node, struct mount, mnt_llist));
+ }
+}
+static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
+
static void mntput_no_expire(struct mount *mnt)
{
-put_again:
rcu_read_lock();
mnt_add_count(mnt, -1);
if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
@@ -951,14 +1059,6 @@ put_again:
unlock_mount_hash();
return;
}
- if (unlikely(mnt->mnt_pinned)) {
- mnt_add_count(mnt, mnt->mnt_pinned + 1);
- mnt->mnt_pinned = 0;
- rcu_read_unlock();
- unlock_mount_hash();
- acct_auto_close_mnt(&mnt->mnt);
- goto put_again;
- }
if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
rcu_read_unlock();
unlock_mount_hash();
@@ -970,22 +1070,18 @@ put_again:
list_del(&mnt->mnt_instance);
unlock_mount_hash();
- /*
- * This probably indicates that somebody messed
- * up a mnt_want/drop_write() pair. If this
- * happens, the filesystem was probably unable
- * to make r/w->r/o transitions.
- */
- /*
- * The locking used to deal with mnt_count decrement provides barriers,
- * so mnt_get_writers() below is safe.
- */
- WARN_ON(mnt_get_writers(mnt));
- fsnotify_vfsmount_delete(&mnt->mnt);
- dput(mnt->mnt.mnt_root);
- deactivate_super(mnt->mnt.mnt_sb);
- mnt_free_id(mnt);
- call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
+ if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
+ struct task_struct *task = current;
+ if (likely(!(task->flags & PF_KTHREAD))) {
+ init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
+ if (!task_work_add(task, &mnt->mnt_rcu, true))
+ return;
+ }
+ if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
+ schedule_delayed_work(&delayed_mntput_work, 1);
+ return;
+ }
+ cleanup_mnt(mnt);
}
void mntput(struct vfsmount *mnt)
@@ -1008,25 +1104,15 @@ struct vfsmount *mntget(struct vfsmount *mnt)
}
EXPORT_SYMBOL(mntget);
-void mnt_pin(struct vfsmount *mnt)
-{
- lock_mount_hash();
- real_mount(mnt)->mnt_pinned++;
- unlock_mount_hash();
-}
-EXPORT_SYMBOL(mnt_pin);
-
-void mnt_unpin(struct vfsmount *m)
+struct vfsmount *mnt_clone_internal(struct path *path)
{
- struct mount *mnt = real_mount(m);
- lock_mount_hash();
- if (mnt->mnt_pinned) {
- mnt_add_count(mnt, 1);
- mnt->mnt_pinned--;
- }
- unlock_mount_hash();
+ struct mount *p;
+ p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
+ if (IS_ERR(p))
+ return ERR_CAST(p);
+ p->mnt.mnt_flags |= MNT_INTERNAL;
+ return &p->mnt;
}
-EXPORT_SYMBOL(mnt_unpin);
static inline void mangle(struct seq_file *m, const char *s)
{
@@ -1213,6 +1299,11 @@ static void namespace_unlock(void)
head.first->pprev = &head.first;
INIT_HLIST_HEAD(&unmounted);
+ /* undo decrements we'd done in umount_tree() */
+ hlist_for_each_entry(mnt, &head, mnt_hash)
+ if (mnt->mnt_ex_mountpoint.mnt)
+ mntget(mnt->mnt_ex_mountpoint.mnt);
+
up_write(&namespace_sem);
synchronize_rcu();
@@ -1249,6 +1340,9 @@ void umount_tree(struct mount *mnt, int how)
hlist_add_head(&p->mnt_hash, &tmp_list);
}
+ hlist_for_each_entry(p, &tmp_list, mnt_hash)
+ list_del_init(&p->mnt_child);
+
if (how)
propagate_umount(&tmp_list);
@@ -1259,9 +1353,10 @@ void umount_tree(struct mount *mnt, int how)
p->mnt_ns = NULL;
if (how < 2)
p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
- list_del_init(&p->mnt_child);
if (mnt_has_parent(p)) {
+ hlist_del_init(&p->mnt_mp_list);
put_mountpoint(p->mnt_mp);
+ mnt_add_count(p->mnt_parent, -1);
/* move the reference to mountpoint into ->mnt_ex_mountpoint */
p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint;
p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt;
@@ -1344,6 +1439,8 @@ static int do_umount(struct mount *mnt, int flags)
* Special case for "unmounting" root ...
* we just try to remount it readonly.
*/
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
down_write(&sb->s_umount);
if (!(sb->s_flags & MS_RDONLY))
retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
@@ -1373,6 +1470,37 @@ static int do_umount(struct mount *mnt, int flags)
return retval;
}
+/*
+ * __detach_mounts - lazily unmount all mounts on the specified dentry
+ *
+ * During unlink, rmdir, and d_drop it is possible to loose the path
+ * to an existing mountpoint, and wind up leaking the mount.
+ * detach_mounts allows lazily unmounting those mounts instead of
+ * leaking them.
+ *
+ * The caller may hold dentry->d_inode->i_mutex.
+ */
+void __detach_mounts(struct dentry *dentry)
+{
+ struct mountpoint *mp;
+ struct mount *mnt;
+
+ namespace_lock();
+ mp = lookup_mountpoint(dentry);
+ if (!mp)
+ goto out_unlock;
+
+ lock_mount_hash();
+ while (!hlist_empty(&mp->m_list)) {
+ mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
+ umount_tree(mnt, 2);
+ }
+ unlock_mount_hash();
+ put_mountpoint(mp);
+out_unlock:
+ namespace_unlock();
+}
+
/*
* Is the caller allowed to modify his namespace?
*/
@@ -1492,6 +1620,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
continue;
for (s = r; s; s = next_mnt(s, r)) {
+ struct mount *t = NULL;
if (!(flag & CL_COPY_UNBINDABLE) &&
IS_MNT_UNBINDABLE(s)) {
s = skip_mnt_tree(s);
@@ -1513,7 +1642,14 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
goto out;
lock_mount_hash();
list_add_tail(&q->mnt_list, &res->mnt_list);
- attach_mnt(q, parent, p->mnt_mp);
+ mnt_set_mountpoint(parent, p->mnt_mp, q);
+ if (!list_empty(&parent->mnt_mounts)) {
+ t = list_last_entry(&parent->mnt_mounts,
+ struct mount, mnt_child);
+ if (t->mnt_mp != p->mnt_mp)
+ t = NULL;
+ }
+ attach_shadowed(q, parent, t);
unlock_mount_hash();
}
}
@@ -1550,6 +1686,33 @@ void drop_collected_mounts(struct vfsmount *mnt)
namespace_unlock();
}
+/**
+ * clone_private_mount - create a private clone of a path
+ *
+ * This creates a new vfsmount, which will be the clone of @path. The new will
+ * not be attached anywhere in the namespace and will be private (i.e. changes
+ * to the originating mount won't be propagated into this).
+ *
+ * Release with mntput().
+ */
+struct vfsmount *clone_private_mount(struct path *path)
+{
+ struct mount *old_mnt = real_mount(path->mnt);
+ struct mount *new_mnt;
+
+ if (IS_MNT_UNBINDABLE(old_mnt))
+ return ERR_PTR(-EINVAL);
+
+ down_read(&namespace_sem);
+ new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
+ up_read(&namespace_sem);
+ if (IS_ERR(new_mnt))
+ return ERR_CAST(new_mnt);
+
+ return &new_mnt->mnt;
+}
+EXPORT_SYMBOL_GPL(clone_private_mount);
+
int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
struct vfsmount *root)
{
@@ -1722,7 +1885,9 @@ retry:
namespace_lock();
mnt = lookup_mnt(path);
if (likely(!mnt)) {
- struct mountpoint *mp = new_mountpoint(dentry);
+ struct mountpoint *mp = lookup_mountpoint(dentry);
+ if (!mp)
+ mp = new_mountpoint(dentry);
if (IS_ERR(mp)) {
namespace_unlock();
mutex_unlock(&dentry->d_inode->i_mutex);
@@ -1896,9 +2061,6 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
if (readonly_request == __mnt_is_readonly(mnt))
return 0;
- if (mnt->mnt_flags & MNT_LOCK_READONLY)
- return -EPERM;
-
if (readonly_request)
error = mnt_make_readonly(real_mount(mnt));
else
@@ -1924,6 +2086,33 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
if (path->dentry != path->mnt->mnt_root)
return -EINVAL;
+ /* Don't allow changing of locked mnt flags.
+ *
+ * No locks need to be held here while testing the various
+ * MNT_LOCK flags because those flags can never be cleared
+ * once they are set.
+ */
+ if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
+ !(mnt_flags & MNT_READONLY)) {
+ return -EPERM;
+ }
+ if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
+ !(mnt_flags & MNT_NODEV)) {
+ return -EPERM;
+ }
+ if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
+ !(mnt_flags & MNT_NOSUID)) {
+ return -EPERM;
+ }
+ if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) &&
+ !(mnt_flags & MNT_NOEXEC)) {
+ return -EPERM;
+ }
+ if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
+ ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) {
+ return -EPERM;
+ }
+
err = security_sb_remount(sb, data);
if (err)
return err;
@@ -1937,7 +2126,7 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
err = do_remount_sb(sb, flags, data, 0);
if (!err) {
lock_mount_hash();
- mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK;
+ mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
mnt->mnt.mnt_flags = mnt_flags;
touch_mnt_namespace(mnt->mnt_ns);
unlock_mount_hash();
@@ -2122,7 +2311,7 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
*/
if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
flags |= MS_NODEV;
- mnt_flags |= MNT_NODEV;
+ mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
}
}
@@ -2354,21 +2543,9 @@ int copy_mount_options(const void __user * data, unsigned long *where)
return 0;
}
-int copy_mount_string(const void __user *data, char **where)
+char *copy_mount_string(const void __user *data)
{
- char *tmp;
-
- if (!data) {
- *where = NULL;
- return 0;
- }
-
- tmp = strndup_user(data, PAGE_SIZE);
- if (IS_ERR(tmp))
- return PTR_ERR(tmp);
-
- *where = tmp;
- return 0;
+ return data ? strndup_user(data, PAGE_SIZE) : NULL;
}
/*
@@ -2385,7 +2562,7 @@ int copy_mount_string(const void __user *data, char **where)
* Therefore, if this magic number is present, it carries no information
* and must be discarded.
*/
-long do_mount(const char *dev_name, const char *dir_name,
+long do_mount(const char *dev_name, const char __user *dir_name,
const char *type_page, unsigned long flags, void *data_page)
{
struct path path;
@@ -2397,15 +2574,11 @@ long do_mount(const char *dev_name, const char *dir_name,
flags &= ~MS_MGC_MSK;
/* Basic sanity checks */
-
- if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
- return -EINVAL;
-
if (data_page)
((char *)data_page)[PAGE_SIZE - 1] = 0;
/* ... and get the mountpoint */
- retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
+ retval = user_path(dir_name, &path);
if (retval)
return retval;
@@ -2436,6 +2609,14 @@ long do_mount(const char *dev_name, const char *dir_name,
if (flags & MS_RDONLY)
mnt_flags |= MNT_READONLY;
+ /* The default atime for remount is preservation */
+ if ((flags & MS_REMOUNT) &&
+ ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
+ MS_STRICTATIME)) == 0)) {
+ mnt_flags &= ~MNT_ATIME_MASK;
+ mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK;
+ }
+
flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
MS_STRICTATIME);
@@ -2622,37 +2803,30 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
{
int ret;
char *kernel_type;
- struct filename *kernel_dir;
char *kernel_dev;
unsigned long data_page;
- ret = copy_mount_string(type, &kernel_type);
- if (ret < 0)
+ kernel_type = copy_mount_string(type);
+ ret = PTR_ERR(kernel_type);
+ if (IS_ERR(kernel_type))
goto out_type;
- kernel_dir = getname(dir_name);
- if (IS_ERR(kernel_dir)) {
- ret = PTR_ERR(kernel_dir);
- goto out_dir;
- }
-
- ret = copy_mount_string(dev_name, &kernel_dev);
- if (ret < 0)
+ kernel_dev = copy_mount_string(dev_name);
+ ret = PTR_ERR(kernel_dev);
+ if (IS_ERR(kernel_dev))
goto out_dev;
ret = copy_mount_options(data, &data_page);
if (ret < 0)
goto out_data;
- ret = do_mount(kernel_dev, kernel_dir->name, kernel_type, flags,
+ ret = do_mount(kernel_dev, dir_name, kernel_type, flags,
(void *) data_page);
free_page(data_page);
out_data:
kfree(kernel_dev);
out_dev:
- putname(kernel_dir);
-out_dir:
kfree(kernel_type);
out_type:
return ret;
@@ -2768,6 +2942,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
/* make sure we can reach put_old from new_root */
if (!is_path_reachable(old_mnt, old.dentry, &new))
goto out4;
+ /* make certain new is below the root */
+ if (!is_path_reachable(new_mnt, new.dentry, &root))
+ goto out4;
root_mp->m_count++; /* pin it so it won't go away */
lock_mount_hash();
detach_mnt(new_mnt, &parent_path);
@@ -2972,13 +3149,13 @@ static void *mntns_get(struct task_struct *task)
struct mnt_namespace *ns = NULL;
struct nsproxy *nsproxy;
- rcu_read_lock();
- nsproxy = task_nsproxy(task);
+ task_lock(task);
+ nsproxy = task->nsproxy;
if (nsproxy) {
ns = nsproxy->mnt_ns;
get_mnt_ns(ns);
}
- rcu_read_unlock();
+ task_unlock(task);
return ns;
}
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 08b8ea8c353e..7cb751dfbeef 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -388,7 +388,6 @@ static struct dentry *
ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
{
struct dentry *dent = dentry;
- struct list_head *next;
if (d_validate(dent, parent)) {
if (dent->d_name.len <= NCP_MAXPATHLEN &&
@@ -404,9 +403,7 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
/* If a pointer is invalid, we search the dentry. */
spin_lock(&parent->d_lock);
- next = parent->d_subdirs.next;
- while (next != &parent->d_subdirs) {
- dent = list_entry(next, struct dentry, d_u.d_child);
+ list_for_each_entry(dent, &parent->d_subdirs, d_u.d_child) {
if ((unsigned long)dent->d_fsdata == fpos) {
if (dent->d_inode)
dget(dent);
@@ -415,7 +412,6 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
spin_unlock(&parent->d_lock);
goto out;
}
- next = next->next;
}
spin_unlock(&parent->d_lock);
return NULL;
@@ -1182,9 +1178,6 @@ static int day_n[] =
{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, 0};
/* Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec */
-
-extern struct timezone sys_tz;
-
static int utc2local(int time)
{
return time - sys_tz.tz_minuteswest * 60;
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 32c06587351a..52cb19d66ecb 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -188,20 +188,14 @@ static inline void
ncp_renew_dentries(struct dentry *parent)
{
struct ncp_server *server = NCP_SERVER(parent->d_inode);
- struct list_head *next;
struct dentry *dentry;
spin_lock(&parent->d_lock);
- next = parent->d_subdirs.next;
- while (next != &parent->d_subdirs) {
- dentry = list_entry(next, struct dentry, d_u.d_child);
-
+ list_for_each_entry(dentry, &parent->d_subdirs, d_u.d_child) {
if (dentry->d_fsdata == NULL)
ncp_age_dentry(server, dentry);
else
ncp_new_dentry(dentry);
-
- next = next->next;
}
spin_unlock(&parent->d_lock);
}
@@ -210,16 +204,12 @@ static inline void
ncp_invalidate_dircache_entries(struct dentry *parent)
{
struct ncp_server *server = NCP_SERVER(parent->d_inode);
- struct list_head *next;
struct dentry *dentry;
spin_lock(&parent->d_lock);
- next = parent->d_subdirs.next;
- while (next != &parent->d_subdirs) {
- dentry = list_entry(next, struct dentry, d_u.d_child);
+ list_for_each_entry(dentry, &parent->d_subdirs, d_u.d_child) {
dentry->d_fsdata = NULL;
ncp_age_dentry(server, dentry);
- next = next->next;
}
spin_unlock(&parent->d_lock);
}
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 4782e0840dcc..04cb830fa09f 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -28,6 +28,7 @@ nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o
nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o
+nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o
obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/
obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
index d5815505c020..3ca14c36d08b 100644
--- a/fs/nfs/blocklayout/Makefile
+++ b/fs/nfs/blocklayout/Makefile
@@ -2,4 +2,5 @@
# Makefile for the pNFS block layout driver kernel module
#
obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
-blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o
+
+blocklayoutdriver-y += blocklayout.o dev.o extent_tree.o rpc_pipefs.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 9b431f44fad9..4f46f7a05289 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -35,7 +35,6 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/bio.h> /* struct bio */
-#include <linux/buffer_head.h> /* various write calls */
#include <linux/prefetch.h>
#include <linux/pagevec.h>
@@ -50,40 +49,16 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
-static void print_page(struct page *page)
+static bool is_hole(struct pnfs_block_extent *be)
{
- dprintk("PRINTPAGE page %p\n", page);
- dprintk(" PagePrivate %d\n", PagePrivate(page));
- dprintk(" PageUptodate %d\n", PageUptodate(page));
- dprintk(" PageError %d\n", PageError(page));
- dprintk(" PageDirty %d\n", PageDirty(page));
- dprintk(" PageReferenced %d\n", PageReferenced(page));
- dprintk(" PageLocked %d\n", PageLocked(page));
- dprintk(" PageWriteback %d\n", PageWriteback(page));
- dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page));
- dprintk("\n");
-}
-
-/* Given the be associated with isect, determine if page data needs to be
- * initialized.
- */
-static int is_hole(struct pnfs_block_extent *be, sector_t isect)
-{
- if (be->be_state == PNFS_BLOCK_NONE_DATA)
- return 1;
- else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
- return 0;
- else
- return !bl_is_sector_init(be->be_inval, isect);
-}
-
-/* Given the be associated with isect, determine if page data can be
- * written to disk.
- */
-static int is_writable(struct pnfs_block_extent *be, sector_t isect)
-{
- return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
- be->be_state == PNFS_BLOCK_INVALID_DATA);
+ switch (be->be_state) {
+ case PNFS_BLOCK_NONE_DATA:
+ return true;
+ case PNFS_BLOCK_INVALID_DATA:
+ return be->be_tag ? false : true;
+ default:
+ return false;
+ }
}
/* The data we are handed might be spread across several bios. We need
@@ -91,9 +66,8 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect)
*/
struct parallel_io {
struct kref refcnt;
- void (*pnfs_callback) (void *data, int num_se);
+ void (*pnfs_callback) (void *data);
void *data;
- int bse_count;
};
static inline struct parallel_io *alloc_parallel(void *data)
@@ -104,7 +78,6 @@ static inline struct parallel_io *alloc_parallel(void *data)
if (rv) {
rv->data = data;
kref_init(&rv->refcnt);
- rv->bse_count = 0;
}
return rv;
}
@@ -119,7 +92,7 @@ static void destroy_parallel(struct kref *kref)
struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
dprintk("%s enter\n", __func__);
- p->pnfs_callback(p->data, p->bse_count);
+ p->pnfs_callback(p->data);
kfree(p);
}
@@ -141,10 +114,9 @@ bl_submit_bio(int rw, struct bio *bio)
return NULL;
}
-static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
- struct pnfs_block_extent *be,
- void (*end_io)(struct bio *, int err),
- struct parallel_io *par)
+static struct bio *
+bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
+ void (*end_io)(struct bio *, int err), struct parallel_io *par)
{
struct bio *bio;
@@ -156,67 +128,73 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
}
if (bio) {
- bio->bi_iter.bi_sector = isect - be->be_f_offset +
- be->be_v_offset;
- bio->bi_bdev = be->be_mdev;
+ bio->bi_iter.bi_sector = disk_sector;
+ bio->bi_bdev = bdev;
bio->bi_end_io = end_io;
bio->bi_private = par;
}
return bio;
}
-static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw,
- sector_t isect, struct page *page,
- struct pnfs_block_extent *be,
- void (*end_io)(struct bio *, int err),
- struct parallel_io *par,
- unsigned int offset, int len)
+static struct bio *
+do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
+ struct page *page, struct pnfs_block_dev_map *map,
+ struct pnfs_block_extent *be,
+ void (*end_io)(struct bio *, int err),
+ struct parallel_io *par, unsigned int offset, int *len)
{
- isect = isect + (offset >> SECTOR_SHIFT);
+ struct pnfs_block_dev *dev =
+ container_of(be->be_device, struct pnfs_block_dev, node);
+ u64 disk_addr, end;
+
dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
- npg, rw, (unsigned long long)isect, offset, len);
+ npg, rw, (unsigned long long)isect, offset, *len);
+
+ /* translate to device offset */
+ isect += be->be_v_offset;
+ isect -= be->be_f_offset;
+
+ /* translate to physical disk offset */
+ disk_addr = (u64)isect << SECTOR_SHIFT;
+ if (disk_addr < map->start || disk_addr >= map->start + map->len) {
+ if (!dev->map(dev, disk_addr, map))
+ return ERR_PTR(-EIO);
+ bio = bl_submit_bio(rw, bio);
+ }
+ disk_addr += map->disk_offset;
+ disk_addr -= map->start;
+
+ /* limit length to what the device mapping allows */
+ end = disk_addr + *len;
+ if (end >= map->start + map->len)
+ *len = map->start + map->len - disk_addr;
+
retry:
if (!bio) {
- bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
+ bio = bl_alloc_init_bio(npg, map->bdev,
+ disk_addr >> SECTOR_SHIFT, end_io, par);
if (!bio)
return ERR_PTR(-ENOMEM);
}
- if (bio_add_page(bio, page, len, offset) < len) {
+ if (bio_add_page(bio, page, *len, offset) < *len) {
bio = bl_submit_bio(rw, bio);
goto retry;
}
return bio;
}
-static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
- sector_t isect, struct page *page,
- struct pnfs_block_extent *be,
- void (*end_io)(struct bio *, int err),
- struct parallel_io *par)
-{
- return do_add_page_to_bio(bio, npg, rw, isect, page, be,
- end_io, par, 0, PAGE_CACHE_SIZE);
-}
-
-/* This is basically copied from mpage_end_io_read */
static void bl_end_io_read(struct bio *bio, int err)
{
struct parallel_io *par = bio->bi_private;
- struct bio_vec *bvec;
- int i;
-
- if (!err)
- bio_for_each_segment_all(bvec, bio, i)
- SetPageUptodate(bvec->bv_page);
if (err) {
- struct nfs_pgio_data *rdata = par->data;
- struct nfs_pgio_header *header = rdata->header;
+ struct nfs_pgio_header *header = par->data;
if (!header->pnfs_error)
header->pnfs_error = -EIO;
pnfs_set_lo_fail(header->lseg);
}
+
bio_put(bio);
put_parallel(par);
}
@@ -224,104 +202,96 @@ static void bl_end_io_read(struct bio *bio, int err)
static void bl_read_cleanup(struct work_struct *work)
{
struct rpc_task *task;
- struct nfs_pgio_data *rdata;
+ struct nfs_pgio_header *hdr;
dprintk("%s enter\n", __func__);
task = container_of(work, struct rpc_task, u.tk_work);
- rdata = container_of(task, struct nfs_pgio_data, task);
- pnfs_ld_read_done(rdata);
+ hdr = container_of(task, struct nfs_pgio_header, task);
+ pnfs_ld_read_done(hdr);
}
static void
-bl_end_par_io_read(void *data, int unused)
+bl_end_par_io_read(void *data)
{
- struct nfs_pgio_data *rdata = data;
+ struct nfs_pgio_header *hdr = data;
- rdata->task.tk_status = rdata->header->pnfs_error;
- INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
- schedule_work(&rdata->task.u.tk_work);
+ hdr->task.tk_status = hdr->pnfs_error;
+ INIT_WORK(&hdr->task.u.tk_work, bl_read_cleanup);
+ schedule_work(&hdr->task.u.tk_work);
}
static enum pnfs_try_status
-bl_read_pagelist(struct nfs_pgio_data *rdata)
+bl_read_pagelist(struct nfs_pgio_header *header)
{
- struct nfs_pgio_header *header = rdata->header;
- int i, hole;
+ struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
+ struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
struct bio *bio = NULL;
- struct pnfs_block_extent *be = NULL, *cow_read = NULL;
+ struct pnfs_block_extent be;
sector_t isect, extent_length = 0;
struct parallel_io *par;
- loff_t f_offset = rdata->args.offset;
- size_t bytes_left = rdata->args.count;
+ loff_t f_offset = header->args.offset;
+ size_t bytes_left = header->args.count;
unsigned int pg_offset, pg_len;
- struct page **pages = rdata->args.pages;
- int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
+ struct page **pages = header->args.pages;
+ int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
const bool is_dio = (header->dreq != NULL);
+ struct blk_plug plug;
+ int i;
dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
- rdata->pages.npages, f_offset, (unsigned int)rdata->args.count);
+ header->page_array.npages, f_offset,
+ (unsigned int)header->args.count);
- par = alloc_parallel(rdata);
+ par = alloc_parallel(header);
if (!par)
- goto use_mds;
+ return PNFS_NOT_ATTEMPTED;
par->pnfs_callback = bl_end_par_io_read;
- /* At this point, we can no longer jump to use_mds */
+
+ blk_start_plug(&plug);
isect = (sector_t) (f_offset >> SECTOR_SHIFT);
/* Code assumes extents are page-aligned */
- for (i = pg_index; i < rdata->pages.npages; i++) {
- if (!extent_length) {
+ for (i = pg_index; i < header->page_array.npages; i++) {
+ if (extent_length <= 0) {
/* We've used up the previous extent */
- bl_put_extent(be);
- bl_put_extent(cow_read);
bio = bl_submit_bio(READ, bio);
+
/* Get the next one */
- be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
- isect, &cow_read);
- if (!be) {
+ if (!ext_tree_lookup(bl, isect, &be, false)) {
header->pnfs_error = -EIO;
goto out;
}
- extent_length = be->be_length -
- (isect - be->be_f_offset);
- if (cow_read) {
- sector_t cow_length = cow_read->be_length -
- (isect - cow_read->be_f_offset);
- extent_length = min(extent_length, cow_length);
- }
+ extent_length = be.be_length - (isect - be.be_f_offset);
}
+ pg_offset = f_offset & ~PAGE_CACHE_MASK;
if (is_dio) {
- pg_offset = f_offset & ~PAGE_CACHE_MASK;
if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
pg_len = PAGE_CACHE_SIZE - pg_offset;
else
pg_len = bytes_left;
-
- f_offset += pg_len;
- bytes_left -= pg_len;
- isect += (pg_offset >> SECTOR_SHIFT);
} else {
- pg_offset = 0;
+ BUG_ON(pg_offset != 0);
pg_len = PAGE_CACHE_SIZE;
}
- hole = is_hole(be, isect);
- if (hole && !cow_read) {
+ isect += (pg_offset >> SECTOR_SHIFT);
+ extent_length -= (pg_offset >> SECTOR_SHIFT);
+
+ if (is_hole(&be)) {
bio = bl_submit_bio(READ, bio);
/* Fill hole w/ zeroes w/o accessing device */
dprintk("%s Zeroing page for hole\n", __func__);
zero_user_segment(pages[i], pg_offset, pg_len);
- print_page(pages[i]);
- SetPageUptodate(pages[i]);
- } else {
- struct pnfs_block_extent *be_read;
- be_read = (hole && cow_read) ? cow_read : be;
- bio = do_add_page_to_bio(bio, rdata->pages.npages - i,
+ /* invalidate map */
+ map.start = NFS4_MAX_UINT64;
+ } else {
+ bio = do_add_page_to_bio(bio,
+ header->page_array.npages - i,
READ,
- isect, pages[i], be_read,
+ isect, pages[i], &map, &be,
bl_end_io_read, par,
- pg_offset, pg_len);
+ pg_offset, &pg_len);
if (IS_ERR(bio)) {
header->pnfs_error = PTR_ERR(bio);
bio = NULL;
@@ -329,84 +299,28 @@ bl_read_pagelist(struct nfs_pgio_data *rdata)
}
}
isect += (pg_len >> SECTOR_SHIFT);
- extent_length -= PAGE_CACHE_SECTORS;
+ extent_length -= (pg_len >> SECTOR_SHIFT);
+ f_offset += pg_len;
+ bytes_left -= pg_len;
}
if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
- rdata->res.eof = 1;
- rdata->res.count = header->inode->i_size - rdata->args.offset;
+ header->res.eof = 1;
+ header->res.count = header->inode->i_size - header->args.offset;
} else {
- rdata->res.count = (isect << SECTOR_SHIFT) - rdata->args.offset;
+ header->res.count = (isect << SECTOR_SHIFT) - header->args.offset;
}
out:
- bl_put_extent(be);
- bl_put_extent(cow_read);
bl_submit_bio(READ, bio);
+ blk_finish_plug(&plug);
put_parallel(par);
return PNFS_ATTEMPTED;
-
- use_mds:
- dprintk("Giving up and using normal NFS\n");
- return PNFS_NOT_ATTEMPTED;
-}
-
-static void mark_extents_written(struct pnfs_block_layout *bl,
- __u64 offset, __u32 count)
-{
- sector_t isect, end;
- struct pnfs_block_extent *be;
- struct pnfs_block_short_extent *se;
-
- dprintk("%s(%llu, %u)\n", __func__, offset, count);
- if (count == 0)
- return;
- isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT;
- end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
- end >>= SECTOR_SHIFT;
- while (isect < end) {
- sector_t len;
- be = bl_find_get_extent(bl, isect, NULL);
- BUG_ON(!be); /* FIXME */
- len = min(end, be->be_f_offset + be->be_length) - isect;
- if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
- se = bl_pop_one_short_extent(be->be_inval);
- BUG_ON(!se);
- bl_mark_for_commit(be, isect, len, se);
- }
- isect += len;
- bl_put_extent(be);
- }
-}
-
-static void bl_end_io_write_zero(struct bio *bio, int err)
-{
- struct parallel_io *par = bio->bi_private;
- struct bio_vec *bvec;
- int i;
-
- bio_for_each_segment_all(bvec, bio, i) {
- /* This is the zeroing page we added */
- end_page_writeback(bvec->bv_page);
- page_cache_release(bvec->bv_page);
- }
-
- if (unlikely(err)) {
- struct nfs_pgio_data *data = par->data;
- struct nfs_pgio_header *header = data->header;
-
- if (!header->pnfs_error)
- header->pnfs_error = -EIO;
- pnfs_set_lo_fail(header->lseg);
- }
- bio_put(bio);
- put_parallel(par);
}
static void bl_end_io_write(struct bio *bio, int err)
{
struct parallel_io *par = bio->bi_private;
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct nfs_pgio_data *data = par->data;
- struct nfs_pgio_header *header = data->header;
+ struct nfs_pgio_header *header = par->data;
if (!uptodate) {
if (!header->pnfs_error)
@@ -422,533 +336,118 @@ static void bl_end_io_write(struct bio *bio, int err)
*/
static void bl_write_cleanup(struct work_struct *work)
{
- struct rpc_task *task;
- struct nfs_pgio_data *wdata;
- dprintk("%s enter\n", __func__);
- task = container_of(work, struct rpc_task, u.tk_work);
- wdata = container_of(task, struct nfs_pgio_data, task);
- if (likely(!wdata->header->pnfs_error)) {
- /* Marks for LAYOUTCOMMIT */
- mark_extents_written(BLK_LSEG2EXT(wdata->header->lseg),
- wdata->args.offset, wdata->args.count);
- }
- pnfs_ld_write_done(wdata);
-}
-
-/* Called when last of bios associated with a bl_write_pagelist call finishes */
-static void bl_end_par_io_write(void *data, int num_se)
-{
- struct nfs_pgio_data *wdata = data;
-
- if (unlikely(wdata->header->pnfs_error)) {
- bl_free_short_extents(&BLK_LSEG2EXT(wdata->header->lseg)->bl_inval,
- num_se);
- }
-
- wdata->task.tk_status = wdata->header->pnfs_error;
- wdata->verf.committed = NFS_FILE_SYNC;
- INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
- schedule_work(&wdata->task.u.tk_work);
-}
-
-/* FIXME STUB - mark intersection of layout and page as bad, so is not
- * used again.
- */
-static void mark_bad_read(void)
-{
- return;
-}
-
-/*
- * map_block: map a requested I/0 block (isect) into an offset in the LVM
- * block_device
- */
-static void
-map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
-{
- dprintk("%s enter be=%p\n", __func__, be);
-
- set_buffer_mapped(bh);
- bh->b_bdev = be->be_mdev;
- bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
- (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);
-
- dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
- __func__, (unsigned long long)isect, (long)bh->b_blocknr,
- bh->b_size);
- return;
-}
-
-static void
-bl_read_single_end_io(struct bio *bio, int error)
-{
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
- struct page *page = bvec->bv_page;
-
- /* Only one page in bvec */
- unlock_page(page);
-}
-
-static int
-bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be,
- unsigned int offset, unsigned int len)
-{
- struct bio *bio;
- struct page *shadow_page;
- sector_t isect;
- char *kaddr, *kshadow_addr;
- int ret = 0;
+ struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
+ struct nfs_pgio_header *hdr =
+ container_of(task, struct nfs_pgio_header, task);
- dprintk("%s: offset %u len %u\n", __func__, offset, len);
-
- shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
- if (shadow_page == NULL)
- return -ENOMEM;
-
- bio = bio_alloc(GFP_NOIO, 1);
- if (bio == NULL)
- return -ENOMEM;
-
- isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) +
- (offset / SECTOR_SIZE);
-
- bio->bi_iter.bi_sector = isect - be->be_f_offset + be->be_v_offset;
- bio->bi_bdev = be->be_mdev;
- bio->bi_end_io = bl_read_single_end_io;
-
- lock_page(shadow_page);
- if (bio_add_page(bio, shadow_page,
- SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) {
- unlock_page(shadow_page);
- bio_put(bio);
- return -EIO;
- }
-
- submit_bio(READ, bio);
- wait_on_page_locked(shadow_page);
- if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) {
- ret = -EIO;
- } else {
- kaddr = kmap_atomic(page);
- kshadow_addr = kmap_atomic(shadow_page);
- memcpy(kaddr + offset, kshadow_addr + offset, len);
- kunmap_atomic(kshadow_addr);
- kunmap_atomic(kaddr);
- }
- __free_page(shadow_page);
- bio_put(bio);
-
- return ret;
-}
-
-static int
-bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be,
- unsigned int dirty_offset, unsigned int dirty_len,
- bool full_page)
-{
- int ret = 0;
- unsigned int start, end;
+ dprintk("%s enter\n", __func__);
- if (full_page) {
- start = 0;
- end = PAGE_CACHE_SIZE;
- } else {
- start = round_down(dirty_offset, SECTOR_SIZE);
- end = round_up(dirty_offset + dirty_len, SECTOR_SIZE);
- }
+ if (likely(!hdr->pnfs_error)) {
+ struct pnfs_block_layout *bl = BLK_LSEG2EXT(hdr->lseg);
+ u64 start = hdr->args.offset & (loff_t)PAGE_CACHE_MASK;
+ u64 end = (hdr->args.offset + hdr->args.count +
+ PAGE_CACHE_SIZE - 1) & (loff_t)PAGE_CACHE_MASK;
- dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len);
- if (!be) {
- zero_user_segments(page, start, dirty_offset,
- dirty_offset + dirty_len, end);
- if (start == 0 && end == PAGE_CACHE_SIZE &&
- trylock_page(page)) {
- SetPageUptodate(page);
- unlock_page(page);
- }
- return ret;
+ ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
+ (end - start) >> SECTOR_SHIFT);
}
- if (start != dirty_offset)
- ret = bl_do_readpage_sync(page, be, start, dirty_offset - start);
-
- if (!ret && (dirty_offset + dirty_len < end))
- ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len,
- end - dirty_offset - dirty_len);
-
- return ret;
+ pnfs_ld_write_done(hdr);
}
-/* Given an unmapped page, zero it or read in page for COW, page is locked
- * by caller.
- */
-static int
-init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
+/* Called when last of bios associated with a bl_write_pagelist call finishes */
+static void bl_end_par_io_write(void *data)
{
- struct buffer_head *bh = NULL;
- int ret = 0;
- sector_t isect;
-
- dprintk("%s enter, %p\n", __func__, page);
- BUG_ON(PageUptodate(page));
- if (!cow_read) {
- zero_user_segment(page, 0, PAGE_SIZE);
- SetPageUptodate(page);
- goto cleanup;
- }
-
- bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
- if (!bh) {
- ret = -ENOMEM;
- goto cleanup;
- }
+ struct nfs_pgio_header *hdr = data;
- isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
- map_block(bh, isect, cow_read);
- if (!bh_uptodate_or_lock(bh))
- ret = bh_submit_read(bh);
- if (ret)
- goto cleanup;
- SetPageUptodate(page);
-
-cleanup:
- if (bh)
- free_buffer_head(bh);
- if (ret) {
- /* Need to mark layout with bad read...should now
- * just use nfs4 for reads and writes.
- */
- mark_bad_read();
- }
- return ret;
-}
-
-/* Find or create a zeroing page marked being writeback.
- * Return ERR_PTR on error, NULL to indicate skip this page and page itself
- * to indicate write out.
- */
-static struct page *
-bl_find_get_zeroing_page(struct inode *inode, pgoff_t index,
- struct pnfs_block_extent *cow_read)
-{
- struct page *page;
- int locked = 0;
- page = find_get_page(inode->i_mapping, index);
- if (page)
- goto check_page;
-
- page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
- if (unlikely(!page)) {
- dprintk("%s oom\n", __func__);
- return ERR_PTR(-ENOMEM);
- }
- locked = 1;
-
-check_page:
- /* PageDirty: Other will write this out
- * PageWriteback: Other is writing this out
- * PageUptodate: It was read before
- */
- if (PageDirty(page) || PageWriteback(page)) {
- print_page(page);
- if (locked)
- unlock_page(page);
- page_cache_release(page);
- return NULL;
- }
-
- if (!locked) {
- lock_page(page);
- locked = 1;
- goto check_page;
- }
- if (!PageUptodate(page)) {
- /* New page, readin or zero it */
- init_page_for_write(page, cow_read);
- }
- set_page_writeback(page);
- unlock_page(page);
-
- return page;
+ hdr->task.tk_status = hdr->pnfs_error;
+ hdr->verf.committed = NFS_FILE_SYNC;
+ INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup);
+ schedule_work(&hdr->task.u.tk_work);
}
static enum pnfs_try_status
-bl_write_pagelist(struct nfs_pgio_data *wdata, int sync)
+bl_write_pagelist(struct nfs_pgio_header *header, int sync)
{
- struct nfs_pgio_header *header = wdata->header;
- int i, ret, npg_zero, pg_index, last = 0;
+ struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
+ struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
struct bio *bio = NULL;
- struct pnfs_block_extent *be = NULL, *cow_read = NULL;
- sector_t isect, last_isect = 0, extent_length = 0;
+ struct pnfs_block_extent be;
+ sector_t isect, extent_length = 0;
struct parallel_io *par = NULL;
- loff_t offset = wdata->args.offset;
- size_t count = wdata->args.count;
- unsigned int pg_offset, pg_len, saved_len;
- struct page **pages = wdata->args.pages;
- struct page *page;
- pgoff_t index;
- u64 temp;
- int npg_per_block =
- NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
+ loff_t offset = header->args.offset;
+ size_t count = header->args.count;
+ struct page **pages = header->args.pages;
+ int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
+ unsigned int pg_len;
+ struct blk_plug plug;
+ int i;
dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
- if (header->dreq != NULL &&
- (!IS_ALIGNED(offset, NFS_SERVER(header->inode)->pnfs_blksize) ||
- !IS_ALIGNED(count, NFS_SERVER(header->inode)->pnfs_blksize))) {
- dprintk("pnfsblock nonblock aligned DIO writes. Resend MDS\n");
- goto out_mds;
- }
- /* At this point, wdata->pages is a (sequential) list of nfs_pages.
+ /* At this point, header->page_aray is a (sequential) list of nfs_pages.
* We want to write each, and if there is an error set pnfs_error
* to have it redone using nfs.
*/
- par = alloc_parallel(wdata);
+ par = alloc_parallel(header);
if (!par)
- goto out_mds;
+ return PNFS_NOT_ATTEMPTED;
par->pnfs_callback = bl_end_par_io_write;
- /* At this point, have to be more careful with error handling */
- isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
- be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), isect, &cow_read);
- if (!be || !is_writable(be, isect)) {
- dprintk("%s no matching extents!\n", __func__);
- goto out_mds;
- }
+ blk_start_plug(&plug);
- /* First page inside INVALID extent */
- if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
- if (likely(!bl_push_one_short_extent(be->be_inval)))
- par->bse_count++;
- else
- goto out_mds;
- temp = offset >> PAGE_CACHE_SHIFT;
- npg_zero = do_div(temp, npg_per_block);
- isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
- (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
- extent_length = be->be_length - (isect - be->be_f_offset);
-
-fill_invalid_ext:
- dprintk("%s need to zero %d pages\n", __func__, npg_zero);
- for (;npg_zero > 0; npg_zero--) {
- if (bl_is_sector_init(be->be_inval, isect)) {
- dprintk("isect %llu already init\n",
- (unsigned long long)isect);
- goto next_page;
- }
- /* page ref released in bl_end_io_write_zero */
- index = isect >> PAGE_CACHE_SECTOR_SHIFT;
- dprintk("%s zero %dth page: index %lu isect %llu\n",
- __func__, npg_zero, index,
- (unsigned long long)isect);
- page = bl_find_get_zeroing_page(header->inode, index,
- cow_read);
- if (unlikely(IS_ERR(page))) {
- header->pnfs_error = PTR_ERR(page);
- goto out;
- } else if (page == NULL)
- goto next_page;
-
- ret = bl_mark_sectors_init(be->be_inval, isect,
- PAGE_CACHE_SECTORS);
- if (unlikely(ret)) {
- dprintk("%s bl_mark_sectors_init fail %d\n",
- __func__, ret);
- end_page_writeback(page);
- page_cache_release(page);
- header->pnfs_error = ret;
- goto out;
- }
- if (likely(!bl_push_one_short_extent(be->be_inval)))
- par->bse_count++;
- else {
- end_page_writeback(page);
- page_cache_release(page);
- header->pnfs_error = -ENOMEM;
- goto out;
- }
- /* FIXME: This should be done in bi_end_io */
- mark_extents_written(BLK_LSEG2EXT(header->lseg),
- page->index << PAGE_CACHE_SHIFT,
- PAGE_CACHE_SIZE);
-
- bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
- isect, page, be,
- bl_end_io_write_zero, par);
- if (IS_ERR(bio)) {
- header->pnfs_error = PTR_ERR(bio);
- bio = NULL;
- goto out;
- }
-next_page:
- isect += PAGE_CACHE_SECTORS;
- extent_length -= PAGE_CACHE_SECTORS;
- }
- if (last)
- goto write_done;
- }
- bio = bl_submit_bio(WRITE, bio);
+ /* we always write out the whole page */
+ offset = offset & (loff_t)PAGE_CACHE_MASK;
+ isect = offset >> SECTOR_SHIFT;
- /* Middle pages */
- pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
- for (i = pg_index; i < wdata->pages.npages; i++) {
- if (!extent_length) {
+ for (i = pg_index; i < header->page_array.npages; i++) {
+ if (extent_length <= 0) {
/* We've used up the previous extent */
- bl_put_extent(be);
- bl_put_extent(cow_read);
bio = bl_submit_bio(WRITE, bio);
/* Get the next one */
- be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
- isect, &cow_read);
- if (!be || !is_writable(be, isect)) {
+ if (!ext_tree_lookup(bl, isect, &be, true)) {
header->pnfs_error = -EINVAL;
goto out;
}
- if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
- if (likely(!bl_push_one_short_extent(
- be->be_inval)))
- par->bse_count++;
- else {
- header->pnfs_error = -ENOMEM;
- goto out;
- }
- }
- extent_length = be->be_length -
- (isect - be->be_f_offset);
- }
- dprintk("%s offset %lld count %Zu\n", __func__, offset, count);
- pg_offset = offset & ~PAGE_CACHE_MASK;
- if (pg_offset + count > PAGE_CACHE_SIZE)
- pg_len = PAGE_CACHE_SIZE - pg_offset;
- else
- pg_len = count;
-
- saved_len = pg_len;
- if (be->be_state == PNFS_BLOCK_INVALID_DATA &&
- !bl_is_sector_init(be->be_inval, isect)) {
- ret = bl_read_partial_page_sync(pages[i], cow_read,
- pg_offset, pg_len, true);
- if (ret) {
- dprintk("%s bl_read_partial_page_sync fail %d\n",
- __func__, ret);
- header->pnfs_error = ret;
- goto out;
- }
-
- ret = bl_mark_sectors_init(be->be_inval, isect,
- PAGE_CACHE_SECTORS);
- if (unlikely(ret)) {
- dprintk("%s bl_mark_sectors_init fail %d\n",
- __func__, ret);
- header->pnfs_error = ret;
- goto out;
- }
-
- /* Expand to full page write */
- pg_offset = 0;
- pg_len = PAGE_CACHE_SIZE;
- } else if ((pg_offset & (SECTOR_SIZE - 1)) ||
- (pg_len & (SECTOR_SIZE - 1))){
- /* ahh, nasty case. We have to do sync full sector
- * read-modify-write cycles.
- */
- unsigned int saved_offset = pg_offset;
- ret = bl_read_partial_page_sync(pages[i], be, pg_offset,
- pg_len, false);
- pg_offset = round_down(pg_offset, SECTOR_SIZE);
- pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE)
- - pg_offset;
+ extent_length = be.be_length - (isect - be.be_f_offset);
}
-
- bio = do_add_page_to_bio(bio, wdata->pages.npages - i, WRITE,
- isect, pages[i], be,
+ pg_len = PAGE_CACHE_SIZE;
+ bio = do_add_page_to_bio(bio, header->page_array.npages - i,
+ WRITE, isect, pages[i], &map, &be,
bl_end_io_write, par,
- pg_offset, pg_len);
+ 0, &pg_len);
if (IS_ERR(bio)) {
header->pnfs_error = PTR_ERR(bio);
bio = NULL;
goto out;
}
- offset += saved_len;
- count -= saved_len;
- isect += PAGE_CACHE_SECTORS;
- last_isect = isect;
- extent_length -= PAGE_CACHE_SECTORS;
- }
- /* Last page inside INVALID extent */
- if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
- bio = bl_submit_bio(WRITE, bio);
- temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT;
- npg_zero = npg_per_block - do_div(temp, npg_per_block);
- if (npg_zero < npg_per_block) {
- last = 1;
- goto fill_invalid_ext;
- }
+ offset += pg_len;
+ count -= pg_len;
+ isect += (pg_len >> SECTOR_SHIFT);
+ extent_length -= (pg_len >> SECTOR_SHIFT);
}
-write_done:
- wdata->res.count = wdata->args.count;
+ header->res.count = header->args.count;
out:
- bl_put_extent(be);
- bl_put_extent(cow_read);
bl_submit_bio(WRITE, bio);
+ blk_finish_plug(&plug);
put_parallel(par);
return PNFS_ATTEMPTED;
-out_mds:
- bl_put_extent(be);
- bl_put_extent(cow_read);
- kfree(par);
- return PNFS_NOT_ATTEMPTED;
-}
-
-/* FIXME - range ignored */
-static void
-release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
-{
- int i;
- struct pnfs_block_extent *be;
-
- spin_lock(&bl->bl_ext_lock);
- for (i = 0; i < EXTENT_LISTS; i++) {
- while (!list_empty(&bl->bl_extents[i])) {
- be = list_first_entry(&bl->bl_extents[i],
- struct pnfs_block_extent,
- be_node);
- list_del(&be->be_node);
- bl_put_extent(be);
- }
- }
- spin_unlock(&bl->bl_ext_lock);
-}
-
-static void
-release_inval_marks(struct pnfs_inval_markings *marks)
-{
- struct pnfs_inval_tracking *pos, *temp;
- struct pnfs_block_short_extent *se, *stemp;
-
- list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
- list_del(&pos->it_link);
- kfree(pos);
- }
-
- list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
- list_del(&se->bse_node);
- kfree(se);
- }
- return;
}
static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+ int err;
dprintk("%s enter\n", __func__);
- release_extents(bl, NULL);
- release_inval_marks(&bl->bl_inval);
+
+ err = ext_tree_remove(bl, true, 0, LLONG_MAX);
+ WARN_ON(err);
+
kfree(bl);
}
@@ -961,14 +460,11 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
bl = kzalloc(sizeof(*bl), gfp_flags);
if (!bl)
return NULL;
+
+ bl->bl_ext_rw = RB_ROOT;
+ bl->bl_ext_ro = RB_ROOT;
spin_lock_init(&bl->bl_ext_lock);
- INIT_LIST_HEAD(&bl->bl_extents[0]);
- INIT_LIST_HEAD(&bl->bl_extents[1]);
- INIT_LIST_HEAD(&bl->bl_commit);
- INIT_LIST_HEAD(&bl->bl_committing);
- bl->bl_count = 0;
- bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
- BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
+
return &bl->bl_layout;
}
@@ -978,215 +474,318 @@ static void bl_free_lseg(struct pnfs_layout_segment *lseg)
kfree(lseg);
}
-/* We pretty much ignore lseg, and store all data layout wide, so we
- * can correctly merge.
- */
-static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo,
- struct nfs4_layoutget_res *lgr,
- gfp_t gfp_flags)
-{
- struct pnfs_layout_segment *lseg;
- int status;
+/* Tracks info needed to ensure extents in layout obey constraints of spec */
+struct layout_verification {
+ u32 mode; /* R or RW */
+ u64 start; /* Expected start of next non-COW extent */
+ u64 inval; /* Start of INVAL coverage */
+ u64 cowread; /* End of COW read coverage */
+};
- dprintk("%s enter\n", __func__);
- lseg = kzalloc(sizeof(*lseg), gfp_flags);
- if (!lseg)
- return ERR_PTR(-ENOMEM);
- status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags);
- if (status) {
- /* We don't want to call the full-blown bl_free_lseg,
- * since on error extents were not touched.
- */
- kfree(lseg);
- return ERR_PTR(status);
+/* Verify the extent meets the layout requirements of the pnfs-block draft,
+ * section 2.3.1.
+ */
+static int verify_extent(struct pnfs_block_extent *be,
+ struct layout_verification *lv)
+{
+ if (lv->mode == IOMODE_READ) {
+ if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
+ be->be_state == PNFS_BLOCK_INVALID_DATA)
+ return -EIO;
+ if (be->be_f_offset != lv->start)
+ return -EIO;
+ lv->start += be->be_length;
+ return 0;
}
- return lseg;
+ /* lv->mode == IOMODE_RW */
+ if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
+ if (be->be_f_offset != lv->start)
+ return -EIO;
+ if (lv->cowread > lv->start)
+ return -EIO;
+ lv->start += be->be_length;
+ lv->inval = lv->start;
+ return 0;
+ } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+ if (be->be_f_offset != lv->start)
+ return -EIO;
+ lv->start += be->be_length;
+ return 0;
+ } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
+ if (be->be_f_offset > lv->start)
+ return -EIO;
+ if (be->be_f_offset < lv->inval)
+ return -EIO;
+ if (be->be_f_offset < lv->cowread)
+ return -EIO;
+ /* It looks like you might want to min this with lv->start,
+ * but you really don't.
+ */
+ lv->inval = lv->inval + be->be_length;
+ lv->cowread = be->be_f_offset + be->be_length;
+ return 0;
+ } else
+ return -EIO;
}
-static void
-bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
- const struct nfs4_layoutcommit_args *arg)
+static int decode_sector_number(__be32 **rp, sector_t *sp)
{
- dprintk("%s enter\n", __func__);
- encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg);
+ uint64_t s;
+
+ *rp = xdr_decode_hyper(*rp, &s);
+ if (s & 0x1ff) {
+ printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
+ return -1;
+ }
+ *sp = s >> SECTOR_SHIFT;
+ return 0;
}
-static void
-bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
+static int
+bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo,
+ struct layout_verification *lv, struct list_head *extents,
+ gfp_t gfp_mask)
{
- struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout;
+ struct pnfs_block_extent *be;
+ struct nfs4_deviceid id;
+ int error;
+ __be32 *p;
- dprintk("%s enter\n", __func__);
- clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status);
-}
+ p = xdr_inline_decode(xdr, 28 + NFS4_DEVICEID4_SIZE);
+ if (!p)
+ return -EIO;
-static void free_blk_mountid(struct block_mount_id *mid)
-{
- if (mid) {
- struct pnfs_block_dev *dev, *tmp;
+ be = kzalloc(sizeof(*be), GFP_NOFS);
+ if (!be)
+ return -ENOMEM;
- /* No need to take bm_lock as we are last user freeing bm_devlist */
- list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) {
- list_del(&dev->bm_node);
- bl_free_block_dev(dev);
- }
- kfree(mid);
+ memcpy(&id, p, NFS4_DEVICEID4_SIZE);
+ p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+
+ error = -EIO;
+ be->be_device = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
+ lo->plh_lc_cred, gfp_mask);
+ if (!be->be_device)
+ goto out_free_be;
+
+ /*
+ * The next three values are read in as bytes, but stored in the
+ * extent structure in 512-byte granularity.
+ */
+ if (decode_sector_number(&p, &be->be_f_offset) < 0)
+ goto out_put_deviceid;
+ if (decode_sector_number(&p, &be->be_length) < 0)
+ goto out_put_deviceid;
+ if (decode_sector_number(&p, &be->be_v_offset) < 0)
+ goto out_put_deviceid;
+ be->be_state = be32_to_cpup(p++);
+
+ error = verify_extent(be, lv);
+ if (error) {
+ dprintk("%s: extent verification failed\n", __func__);
+ goto out_put_deviceid;
}
+
+ list_add_tail(&be->be_list, extents);
+ return 0;
+
+out_put_deviceid:
+ nfs4_put_deviceid_node(be->be_device);
+out_free_be:
+ kfree(be);
+ return error;
}
-/* This is mostly copied from the filelayout_get_device_info function.
- * It seems much of this should be at the generic pnfs level.
- */
-static struct pnfs_block_dev *
-nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
- struct nfs4_deviceid *d_id)
+static struct pnfs_layout_segment *
+bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr,
+ gfp_t gfp_mask)
{
- struct pnfs_device *dev;
- struct pnfs_block_dev *rv;
- u32 max_resp_sz;
- int max_pages;
- struct page **pages = NULL;
- int i, rc;
+ struct layout_verification lv = {
+ .mode = lgr->range.iomode,
+ .start = lgr->range.offset >> SECTOR_SHIFT,
+ .inval = lgr->range.offset >> SECTOR_SHIFT,
+ .cowread = lgr->range.offset >> SECTOR_SHIFT,
+ };
+ struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+ struct pnfs_layout_segment *lseg;
+ struct xdr_buf buf;
+ struct xdr_stream xdr;
+ struct page *scratch;
+ int status, i;
+ uint32_t count;
+ __be32 *p;
+ LIST_HEAD(extents);
+
+ dprintk("---> %s\n", __func__);
+
+ lseg = kzalloc(sizeof(*lseg), gfp_mask);
+ if (!lseg)
+ return ERR_PTR(-ENOMEM);
+
+ status = -ENOMEM;
+ scratch = alloc_page(gfp_mask);
+ if (!scratch)
+ goto out;
+
+ xdr_init_decode_pages(&xdr, &buf,
+ lgr->layoutp->pages, lgr->layoutp->len);
+ xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
+
+ status = -EIO;
+ p = xdr_inline_decode(&xdr, 4);
+ if (unlikely(!p))
+ goto out_free_scratch;
+
+ count = be32_to_cpup(p++);
+ dprintk("%s: number of extents %d\n", __func__, count);
/*
- * Use the session max response size as the basis for setting
- * GETDEVICEINFO's maxcount
+ * Decode individual extents, putting them in temporary staging area
+ * until whole layout is decoded to make error recovery easier.
*/
- max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
- max_pages = nfs_page_array_len(0, max_resp_sz);
- dprintk("%s max_resp_sz %u max_pages %d\n",
- __func__, max_resp_sz, max_pages);
-
- dev = kmalloc(sizeof(*dev), GFP_NOFS);
- if (!dev) {
- dprintk("%s kmalloc failed\n", __func__);
- return ERR_PTR(-ENOMEM);
+ for (i = 0; i < count; i++) {
+ status = bl_alloc_extent(&xdr, lo, &lv, &extents, gfp_mask);
+ if (status)
+ goto process_extents;
}
- pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS);
- if (pages == NULL) {
- kfree(dev);
- return ERR_PTR(-ENOMEM);
+ if (lgr->range.offset + lgr->range.length !=
+ lv.start << SECTOR_SHIFT) {
+ dprintk("%s Final length mismatch\n", __func__);
+ status = -EIO;
+ goto process_extents;
}
- for (i = 0; i < max_pages; i++) {
- pages[i] = alloc_page(GFP_NOFS);
- if (!pages[i]) {
- rv = ERR_PTR(-ENOMEM);
- goto out_free;
- }
+
+ if (lv.start < lv.cowread) {
+ dprintk("%s Final uncovered COW extent\n", __func__);
+ status = -EIO;
}
- memcpy(&dev->dev_id, d_id, sizeof(*d_id));
- dev->layout_type = LAYOUT_BLOCK_VOLUME;
- dev->pages = pages;
- dev->pgbase = 0;
- dev->pglen = PAGE_SIZE * max_pages;
- dev->mincount = 0;
- dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
-
- dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
- rc = nfs4_proc_getdeviceinfo(server, dev, NULL);
- dprintk("%s getdevice info returns %d\n", __func__, rc);
- if (rc) {
- rv = ERR_PTR(rc);
- goto out_free;
+process_extents:
+ while (!list_empty(&extents)) {
+ struct pnfs_block_extent *be =
+ list_first_entry(&extents, struct pnfs_block_extent,
+ be_list);
+ list_del(&be->be_list);
+
+ if (!status)
+ status = ext_tree_insert(bl, be);
+
+ if (status) {
+ nfs4_put_deviceid_node(be->be_device);
+ kfree(be);
+ }
}
- rv = nfs4_blk_decode_device(server, dev);
- out_free:
- for (i = 0; i < max_pages; i++)
- __free_page(pages[i]);
- kfree(pages);
- kfree(dev);
- return rv;
+out_free_scratch:
+ __free_page(scratch);
+out:
+ dprintk("%s returns %d\n", __func__, status);
+ if (status) {
+ kfree(lseg);
+ return ERR_PTR(status);
+ }
+ return lseg;
}
-static int
-bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
+static void
+bl_return_range(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_range *range)
{
- struct block_mount_id *b_mt_id = NULL;
- struct pnfs_devicelist *dlist = NULL;
- struct pnfs_block_dev *bdev;
- LIST_HEAD(block_disklist);
- int status, i;
-
- dprintk("%s enter\n", __func__);
+ struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+ sector_t offset = range->offset >> SECTOR_SHIFT, end;
- if (server->pnfs_blksize == 0) {
- dprintk("%s Server did not return blksize\n", __func__);
- return -EINVAL;
- }
- b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS);
- if (!b_mt_id) {
- status = -ENOMEM;
- goto out_error;
- }
- /* Initialize nfs4 block layout mount id */
- spin_lock_init(&b_mt_id->bm_lock);
- INIT_LIST_HEAD(&b_mt_id->bm_devlist);
-
- dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
- if (!dlist) {
- status = -ENOMEM;
- goto out_error;
+ if (range->offset % 8) {
+ dprintk("%s: offset %lld not block size aligned\n",
+ __func__, range->offset);
+ return;
}
- dlist->eof = 0;
- while (!dlist->eof) {
- status = nfs4_proc_getdevicelist(server, fh, dlist);
- if (status)
- goto out_error;
- dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n",
- __func__, dlist->num_devs, dlist->eof);
- for (i = 0; i < dlist->num_devs; i++) {
- bdev = nfs4_blk_get_deviceinfo(server, fh,
- &dlist->dev_id[i]);
- if (IS_ERR(bdev)) {
- status = PTR_ERR(bdev);
- goto out_error;
- }
- spin_lock(&b_mt_id->bm_lock);
- list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
- spin_unlock(&b_mt_id->bm_lock);
+
+ if (range->length != NFS4_MAX_UINT64) {
+ if (range->length % 8) {
+ dprintk("%s: length %lld not block size aligned\n",
+ __func__, range->length);
+ return;
}
- }
- dprintk("%s SUCCESS\n", __func__);
- server->pnfs_ld_data = b_mt_id;
- out_return:
- kfree(dlist);
- return status;
+ end = offset + (range->length >> SECTOR_SHIFT);
+ } else {
+ end = round_down(NFS4_MAX_UINT64, PAGE_SIZE);
+ }
- out_error:
- free_blk_mountid(b_mt_id);
- goto out_return;
+ ext_tree_remove(bl, range->iomode & IOMODE_RW, offset, end);
}
static int
-bl_clear_layoutdriver(struct nfs_server *server)
+bl_prepare_layoutcommit(struct nfs4_layoutcommit_args *arg)
{
- struct block_mount_id *b_mt_id = server->pnfs_ld_data;
+ return ext_tree_prepare_commit(arg);
+}
+static void
+bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
+{
+ ext_tree_mark_committed(&lcdata->args, lcdata->res.status);
+}
+
+static int
+bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
+{
dprintk("%s enter\n", __func__);
- free_blk_mountid(b_mt_id);
- dprintk("%s RETURNS\n", __func__);
+
+ if (server->pnfs_blksize == 0) {
+ dprintk("%s Server did not return blksize\n", __func__);
+ return -EINVAL;
+ }
+ if (server->pnfs_blksize > PAGE_SIZE) {
+ printk(KERN_ERR "%s: pNFS blksize %d not supported.\n",
+ __func__, server->pnfs_blksize);
+ return -EINVAL;
+ }
+
return 0;
}
static bool
-is_aligned_req(struct nfs_page *req, unsigned int alignment)
+is_aligned_req(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req, unsigned int alignment)
{
- return IS_ALIGNED(req->wb_offset, alignment) &&
- IS_ALIGNED(req->wb_bytes, alignment);
+ /*
+ * Always accept buffered writes, higher layers take care of the
+ * right alignment.
+ */
+ if (pgio->pg_dreq == NULL)
+ return true;
+
+ if (!IS_ALIGNED(req->wb_offset, alignment))
+ return false;
+
+ if (IS_ALIGNED(req->wb_bytes, alignment))
+ return true;
+
+ if (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode)) {
+ /*
+ * If the write goes up to the inode size, just write
+ * the full page. Data past the inode size is
+ * guaranteed to be zeroed by the higher level client
+ * code, and this behaviour is mandated by RFC 5663
+ * section 2.3.2.
+ */
+ return true;
+ }
+
+ return false;
}
static void
bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
- if (pgio->pg_dreq != NULL &&
- !is_aligned_req(req, SECTOR_SIZE))
+ if (!is_aligned_req(pgio, req, SECTOR_SIZE)) {
nfs_pageio_reset_read_mds(pgio);
- else
- pnfs_generic_pg_init_read(pgio, req);
+ return;
+ }
+
+ pnfs_generic_pg_init_read(pgio, req);
}
/*
@@ -1197,10 +796,8 @@ static size_t
bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
- if (pgio->pg_dreq != NULL &&
- !is_aligned_req(req, SECTOR_SIZE))
+ if (!is_aligned_req(pgio, req, SECTOR_SIZE))
return 0;
-
return pnfs_generic_pg_test(pgio, prev, req);
}
@@ -1230,19 +827,20 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
static void
bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
- if (pgio->pg_dreq != NULL &&
- !is_aligned_req(req, PAGE_CACHE_SIZE)) {
+ u64 wb_size;
+
+ if (!is_aligned_req(pgio, req, PAGE_SIZE)) {
nfs_pageio_reset_write_mds(pgio);
- } else {
- u64 wb_size;
- if (pgio->pg_dreq == NULL)
- wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
- req->wb_index);
- else
- wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
-
- pnfs_generic_pg_init_write(pgio, req, wb_size);
+ return;
}
+
+ if (pgio->pg_dreq == NULL)
+ wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
+ req->wb_index);
+ else
+ wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+
+ pnfs_generic_pg_init_write(pgio, req, wb_size);
}
/*
@@ -1253,10 +851,8 @@ static size_t
bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
- if (pgio->pg_dreq != NULL &&
- !is_aligned_req(req, PAGE_CACHE_SIZE))
+ if (!is_aligned_req(pgio, req, PAGE_SIZE))
return 0;
-
return pnfs_generic_pg_test(pgio, prev, req);
}
@@ -1276,146 +872,24 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
.id = LAYOUT_BLOCK_VOLUME,
.name = "LAYOUT_BLOCK_VOLUME",
.owner = THIS_MODULE,
+ .flags = PNFS_LAYOUTRET_ON_SETATTR |
+ PNFS_READ_WHOLE_PAGE,
.read_pagelist = bl_read_pagelist,
.write_pagelist = bl_write_pagelist,
.alloc_layout_hdr = bl_alloc_layout_hdr,
.free_layout_hdr = bl_free_layout_hdr,
.alloc_lseg = bl_alloc_lseg,
.free_lseg = bl_free_lseg,
- .encode_layoutcommit = bl_encode_layoutcommit,
+ .return_range = bl_return_range,
+ .prepare_layoutcommit = bl_prepare_layoutcommit,
.cleanup_layoutcommit = bl_cleanup_layoutcommit,
.set_layoutdriver = bl_set_layoutdriver,
- .clear_layoutdriver = bl_clear_layoutdriver,
+ .alloc_deviceid_node = bl_alloc_deviceid_node,
+ .free_deviceid_node = bl_free_deviceid_node,
.pg_read_ops = &bl_pg_read_ops,
.pg_write_ops = &bl_pg_write_ops,
};
-static const struct rpc_pipe_ops bl_upcall_ops = {
- .upcall = rpc_pipe_generic_upcall,
- .downcall = bl_pipe_downcall,
- .destroy_msg = bl_pipe_destroy_msg,
-};
-
-static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
- struct rpc_pipe *pipe)
-{
- struct dentry *dir, *dentry;
-
- dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
- if (dir == NULL)
- return ERR_PTR(-ENOENT);
- dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
- dput(dir);
- return dentry;
-}
-
-static void nfs4blocklayout_unregister_sb(struct super_block *sb,
- struct rpc_pipe *pipe)
-{
- if (pipe->dentry)
- rpc_unlink(pipe->dentry);
-}
-
-static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
- void *ptr)
-{
- struct super_block *sb = ptr;
- struct net *net = sb->s_fs_info;
- struct nfs_net *nn = net_generic(net, nfs_net_id);
- struct dentry *dentry;
- int ret = 0;
-
- if (!try_module_get(THIS_MODULE))
- return 0;
-
- if (nn->bl_device_pipe == NULL) {
- module_put(THIS_MODULE);
- return 0;
- }
-
- switch (event) {
- case RPC_PIPEFS_MOUNT:
- dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
- if (IS_ERR(dentry)) {
- ret = PTR_ERR(dentry);
- break;
- }
- nn->bl_device_pipe->dentry = dentry;
- break;
- case RPC_PIPEFS_UMOUNT:
- if (nn->bl_device_pipe->dentry)
- nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
- break;
- default:
- ret = -ENOTSUPP;
- break;
- }
- module_put(THIS_MODULE);
- return ret;
-}
-
-static struct notifier_block nfs4blocklayout_block = {
- .notifier_call = rpc_pipefs_event,
-};
-
-static struct dentry *nfs4blocklayout_register_net(struct net *net,
- struct rpc_pipe *pipe)
-{
- struct super_block *pipefs_sb;
- struct dentry *dentry;
-
- pipefs_sb = rpc_get_sb_net(net);
- if (!pipefs_sb)
- return NULL;
- dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
- rpc_put_sb_net(net);
- return dentry;
-}
-
-static void nfs4blocklayout_unregister_net(struct net *net,
- struct rpc_pipe *pipe)
-{
- struct super_block *pipefs_sb;
-
- pipefs_sb = rpc_get_sb_net(net);
- if (pipefs_sb) {
- nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
- rpc_put_sb_net(net);
- }
-}
-
-static int nfs4blocklayout_net_init(struct net *net)
-{
- struct nfs_net *nn = net_generic(net, nfs_net_id);
- struct dentry *dentry;
-
- init_waitqueue_head(&nn->bl_wq);
- nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
- if (IS_ERR(nn->bl_device_pipe))
- return PTR_ERR(nn->bl_device_pipe);
- dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
- if (IS_ERR(dentry)) {
- rpc_destroy_pipe_data(nn->bl_device_pipe);
- return PTR_ERR(dentry);
- }
- nn->bl_device_pipe->dentry = dentry;
- return 0;
-}
-
-static void nfs4blocklayout_net_exit(struct net *net)
-{
- struct nfs_net *nn = net_generic(net, nfs_net_id);
-
- nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
- rpc_destroy_pipe_data(nn->bl_device_pipe);
- nn->bl_device_pipe = NULL;
-}
-
-static struct pernet_operations nfs4blocklayout_net_ops = {
- .init = nfs4blocklayout_net_init,
- .exit = nfs4blocklayout_net_exit,
-};
-
static int __init nfs4blocklayout_init(void)
{
int ret;
@@ -1425,20 +899,14 @@ static int __init nfs4blocklayout_init(void)
ret = pnfs_register_layoutdriver(&blocklayout_type);
if (ret)
goto out;
-
- ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
+ ret = bl_init_pipefs();
if (ret)
- goto out_remove;
- ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
- if (ret)
- goto out_notifier;
-out:
- return ret;
+ goto out_unregister;
+ return 0;
-out_notifier:
- rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
-out_remove:
+out_unregister:
pnfs_unregister_layoutdriver(&blocklayout_type);
+out:
return ret;
}
@@ -1447,8 +915,7 @@ static void __exit nfs4blocklayout_exit(void)
dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
__func__);
- rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
- unregister_pernet_subsys(&nfs4blocklayout_net_ops);
+ bl_cleanup_pipefs();
pnfs_unregister_layoutdriver(&blocklayout_type);
}
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 9838fb020473..92dca9e90d8d 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -44,105 +44,112 @@
#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
#define SECTOR_SIZE (1 << SECTOR_SHIFT)
-struct block_mount_id {
- spinlock_t bm_lock; /* protects list */
- struct list_head bm_devlist; /* holds pnfs_block_dev */
-};
+struct pnfs_block_dev;
-struct pnfs_block_dev {
- struct list_head bm_node;
- struct nfs4_deviceid bm_mdevid; /* associated devid */
- struct block_device *bm_mdev; /* meta device itself */
- struct net *net;
+enum pnfs_block_volume_type {
+ PNFS_BLOCK_VOLUME_SIMPLE = 0,
+ PNFS_BLOCK_VOLUME_SLICE = 1,
+ PNFS_BLOCK_VOLUME_CONCAT = 2,
+ PNFS_BLOCK_VOLUME_STRIPE = 3,
};
-enum exstate4 {
- PNFS_BLOCK_READWRITE_DATA = 0,
- PNFS_BLOCK_READ_DATA = 1,
- PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */
- PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
+#define PNFS_BLOCK_MAX_UUIDS 4
+#define PNFS_BLOCK_MAX_DEVICES 64
+
+/*
+ * Random upper cap for the uuid length to avoid unbounded allocation.
+ * Not actually limited by the protocol.
+ */
+#define PNFS_BLOCK_UUID_LEN 128
+
+
+struct pnfs_block_volume {
+ enum pnfs_block_volume_type type;
+ union {
+ struct {
+ int len;
+ int nr_sigs;
+ struct {
+ u64 offset;
+ u32 sig_len;
+ u8 sig[PNFS_BLOCK_UUID_LEN];
+ } sigs[PNFS_BLOCK_MAX_UUIDS];
+ } simple;
+ struct {
+ u64 start;
+ u64 len;
+ u32 volume;
+ } slice;
+ struct {
+ u32 volumes_count;
+ u32 volumes[PNFS_BLOCK_MAX_DEVICES];
+ } concat;
+ struct {
+ u64 chunk_size;
+ u32 volumes_count;
+ u32 volumes[PNFS_BLOCK_MAX_DEVICES];
+ } stripe;
+ };
};
-#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */
+struct pnfs_block_dev_map {
+ sector_t start;
+ sector_t len;
-struct my_tree {
- sector_t mtt_step_size; /* Internal sector alignment */
- struct list_head mtt_stub; /* Should be a radix tree */
+ sector_t disk_offset;
+ struct block_device *bdev;
};
-struct pnfs_inval_markings {
- spinlock_t im_lock;
- struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */
- sector_t im_block_size; /* Server blocksize in sectors */
- struct list_head im_extents; /* Short extents for INVAL->RW conversion */
+struct pnfs_block_dev {
+ struct nfs4_deviceid_node node;
+
+ u64 start;
+ u64 len;
+
+ u32 nr_children;
+ struct pnfs_block_dev *children;
+ u64 chunk_size;
+
+ struct block_device *bdev;
+ u64 disk_offset;
+
+ bool (*map)(struct pnfs_block_dev *dev, u64 offset,
+ struct pnfs_block_dev_map *map);
};
-struct pnfs_inval_tracking {
- struct list_head it_link;
- int it_sector;
- int it_tags;
+enum exstate4 {
+ PNFS_BLOCK_READWRITE_DATA = 0,
+ PNFS_BLOCK_READ_DATA = 1,
+ PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */
+ PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
};
/* sector_t fields are all in 512-byte sectors */
struct pnfs_block_extent {
- struct kref be_refcnt;
- struct list_head be_node; /* link into lseg list */
- struct nfs4_deviceid be_devid; /* FIXME: could use device cache instead */
- struct block_device *be_mdev;
+ union {
+ struct rb_node be_node;
+ struct list_head be_list;
+ };
+ struct nfs4_deviceid_node *be_device;
sector_t be_f_offset; /* the starting offset in the file */
sector_t be_length; /* the size of the extent */
sector_t be_v_offset; /* the starting offset in the volume */
enum exstate4 be_state; /* the state of this extent */
- struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */
+#define EXTENT_WRITTEN 1
+#define EXTENT_COMMITTING 2
+ unsigned int be_tag;
};
-/* Shortened extent used by LAYOUTCOMMIT */
-struct pnfs_block_short_extent {
- struct list_head bse_node;
- struct nfs4_deviceid bse_devid;
- struct block_device *bse_mdev;
- sector_t bse_f_offset; /* the starting offset in the file */
- sector_t bse_length; /* the size of the extent */
-};
-
-static inline void
-BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
-{
- spin_lock_init(&marks->im_lock);
- INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
- INIT_LIST_HEAD(&marks->im_extents);
- marks->im_block_size = blocksize;
- marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
- blocksize);
-}
-
-enum extentclass4 {
- RW_EXTENT = 0, /* READWRTE and INVAL */
- RO_EXTENT = 1, /* READ and NONE */
- EXTENT_LISTS = 2,
-};
-
-static inline int bl_choose_list(enum exstate4 state)
-{
- if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA)
- return RO_EXTENT;
- else
- return RW_EXTENT;
-}
+/* on the wire size of the extent */
+#define BL_EXTENT_SIZE (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
struct pnfs_block_layout {
- struct pnfs_layout_hdr bl_layout;
- struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */
+ struct pnfs_layout_hdr bl_layout;
+ struct rb_root bl_ext_rw;
+ struct rb_root bl_ext_ro;
spinlock_t bl_ext_lock; /* Protects list manipulation */
- struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */
- struct list_head bl_commit; /* Needs layout commit */
- struct list_head bl_committing; /* Layout committing */
- unsigned int bl_count; /* entries in bl_commit */
- sector_t bl_blocksize; /* Server blocksize in sectors */
};
-#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data))
-
static inline struct pnfs_block_layout *
BLK_LO2EXT(struct pnfs_layout_hdr *lo)
{
@@ -171,41 +178,27 @@ struct bl_msg_hdr {
#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */
#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */
-/* blocklayoutdev.c */
-ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
-void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
-void nfs4_blkdev_put(struct block_device *bdev);
-struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
- struct pnfs_device *dev);
-int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
- struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
-
-/* blocklayoutdm.c */
-void bl_free_block_dev(struct pnfs_block_dev *bdev);
-
-/* extents.c */
-struct pnfs_block_extent *
-bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
- struct pnfs_block_extent **cow_read);
-int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
- sector_t offset, sector_t length);
-void bl_put_extent(struct pnfs_block_extent *be);
-struct pnfs_block_extent *bl_alloc_extent(void);
-int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect);
-int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
- struct xdr_stream *xdr,
- const struct nfs4_layoutcommit_args *arg);
-void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
- const struct nfs4_layoutcommit_args *arg,
- int status);
-int bl_add_merge_extent(struct pnfs_block_layout *bl,
- struct pnfs_block_extent *new);
-int bl_mark_for_commit(struct pnfs_block_extent *be,
- sector_t offset, sector_t length,
- struct pnfs_block_short_extent *new);
-int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
-struct pnfs_block_short_extent *
-bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
-void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
+/* dev.c */
+struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
+ struct pnfs_device *pdev, gfp_t gfp_mask);
+void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
+
+/* extent_tree.c */
+int ext_tree_insert(struct pnfs_block_layout *bl,
+ struct pnfs_block_extent *new);
+int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start,
+ sector_t end);
+int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
+ sector_t len);
+bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
+ struct pnfs_block_extent *ret, bool rw);
+int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
+void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
+
+/* rpc_pipefs.c */
+dev_t bl_resolve_deviceid(struct nfs_server *server,
+ struct pnfs_block_volume *b, gfp_t gfp_mask);
+int __init bl_init_pipefs(void);
+void __exit bl_cleanup_pipefs(void);
#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
deleted file mode 100644
index 04303b5c9361..000000000000
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ /dev/null
@@ -1,384 +0,0 @@
-/*
- * linux/fs/nfs/blocklayout/blocklayoutdev.c
- *
- * Device operations for the pnfs nfs4 file layout driver.
- *
- * Copyright (c) 2006 The Regents of the University of Michigan.
- * All rights reserved.
- *
- * Andy Adamson <andros@citi.umich.edu>
- * Fred Isaman <iisaman@umich.edu>
- *
- * permission is granted to use, copy, create derivative works and
- * redistribute this software and such derivative works for any purpose,
- * so long as the name of the university of michigan is not used in
- * any advertising or publicity pertaining to the use or distribution
- * of this software without specific, written prior authorization. if
- * the above copyright notice or any other identification of the
- * university of michigan is included in any copy of any portion of
- * this software, then the disclaimer below must also be included.
- *
- * this software is provided as is, without representation from the
- * university of michigan as to its fitness for any purpose, and without
- * warranty by the university of michigan of any kind, either express
- * or implied, including without limitation the implied warranties of
- * merchantability and fitness for a particular purpose. the regents
- * of the university of michigan shall not be liable for any damages,
- * including special, indirect, incidental, or consequential damages,
- * with respect to any claim arising out or in connection with the use
- * of the software, even if it has been or is hereafter advised of the
- * possibility of such damages.
- */
-#include <linux/module.h>
-#include <linux/buffer_head.h> /* __bread */
-
-#include <linux/genhd.h>
-#include <linux/blkdev.h>
-#include <linux/hash.h>
-
-#include "blocklayout.h"
-
-#define NFSDBG_FACILITY NFSDBG_PNFS_LD
-
-static int decode_sector_number(__be32 **rp, sector_t *sp)
-{
- uint64_t s;
-
- *rp = xdr_decode_hyper(*rp, &s);
- if (s & 0x1ff) {
- printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
- return -1;
- }
- *sp = s >> SECTOR_SHIFT;
- return 0;
-}
-
-/*
- * Release the block device
- */
-void nfs4_blkdev_put(struct block_device *bdev)
-{
- dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
- MINOR(bdev->bd_dev));
- blkdev_put(bdev, FMODE_READ);
-}
-
-ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
- size_t mlen)
-{
- struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
- nfs_net_id);
-
- if (mlen != sizeof (struct bl_dev_msg))
- return -EINVAL;
-
- if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
- return -EFAULT;
-
- wake_up(&nn->bl_wq);
-
- return mlen;
-}
-
-void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
-{
- struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg);
-
- if (msg->errno >= 0)
- return;
- wake_up(bl_pipe_msg->bl_wq);
-}
-
-/*
- * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf.
- */
-struct pnfs_block_dev *
-nfs4_blk_decode_device(struct nfs_server *server,
- struct pnfs_device *dev)
-{
- struct pnfs_block_dev *rv;
- struct block_device *bd = NULL;
- struct bl_pipe_msg bl_pipe_msg;
- struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
- struct bl_msg_hdr bl_msg = {
- .type = BL_DEVICE_MOUNT,
- .totallen = dev->mincount,
- };
- uint8_t *dataptr;
- DECLARE_WAITQUEUE(wq, current);
- int offset, len, i, rc;
- struct net *net = server->nfs_client->cl_net;
- struct nfs_net *nn = net_generic(net, nfs_net_id);
- struct bl_dev_msg *reply = &nn->bl_mount_reply;
-
- dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
- dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
- dev->mincount);
-
- bl_pipe_msg.bl_wq = &nn->bl_wq;
- memset(msg, 0, sizeof(*msg));
- msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS);
- if (!msg->data) {
- rv = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- memcpy(msg->data, &bl_msg, sizeof(bl_msg));
- dataptr = (uint8_t *) msg->data;
- len = dev->mincount;
- offset = sizeof(bl_msg);
- for (i = 0; len > 0; i++) {
- memcpy(&dataptr[offset], page_address(dev->pages[i]),
- len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
- len -= PAGE_CACHE_SIZE;
- offset += PAGE_CACHE_SIZE;
- }
- msg->len = sizeof(bl_msg) + dev->mincount;
-
- dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
- add_wait_queue(&nn->bl_wq, &wq);
- rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
- if (rc < 0) {
- remove_wait_queue(&nn->bl_wq, &wq);
- rv = ERR_PTR(rc);
- goto out;
- }
-
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule();
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&nn->bl_wq, &wq);
-
- if (reply->status != BL_DEVICE_REQUEST_PROC) {
- dprintk("%s failed to open device: %d\n",
- __func__, reply->status);
- rv = ERR_PTR(-EINVAL);
- goto out;
- }
-
- bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor),
- FMODE_READ, NULL);
- if (IS_ERR(bd)) {
- dprintk("%s failed to open device : %ld\n", __func__,
- PTR_ERR(bd));
- rv = ERR_CAST(bd);
- goto out;
- }
-
- rv = kzalloc(sizeof(*rv), GFP_NOFS);
- if (!rv) {
- rv = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- rv->bm_mdev = bd;
- memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
- rv->net = net;
- dprintk("%s Created device %s with bd_block_size %u\n",
- __func__,
- bd->bd_disk->disk_name,
- bd->bd_block_size);
-
-out:
- kfree(msg->data);
- return rv;
-}
-
-/* Map deviceid returned by the server to constructed block_device */
-static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
- struct nfs4_deviceid *id)
-{
- struct block_device *rv = NULL;
- struct block_mount_id *mid;
- struct pnfs_block_dev *dev;
-
- dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
- mid = BLK_ID(lo);
- spin_lock(&mid->bm_lock);
- list_for_each_entry(dev, &mid->bm_devlist, bm_node) {
- if (memcmp(id->data, dev->bm_mdevid.data,
- NFS4_DEVICEID4_SIZE) == 0) {
- rv = dev->bm_mdev;
- goto out;
- }
- }
- out:
- spin_unlock(&mid->bm_lock);
- dprintk("%s returning %p\n", __func__, rv);
- return rv;
-}
-
-/* Tracks info needed to ensure extents in layout obey constraints of spec */
-struct layout_verification {
- u32 mode; /* R or RW */
- u64 start; /* Expected start of next non-COW extent */
- u64 inval; /* Start of INVAL coverage */
- u64 cowread; /* End of COW read coverage */
-};
-
-/* Verify the extent meets the layout requirements of the pnfs-block draft,
- * section 2.3.1.
- */
-static int verify_extent(struct pnfs_block_extent *be,
- struct layout_verification *lv)
-{
- if (lv->mode == IOMODE_READ) {
- if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
- be->be_state == PNFS_BLOCK_INVALID_DATA)
- return -EIO;
- if (be->be_f_offset != lv->start)
- return -EIO;
- lv->start += be->be_length;
- return 0;
- }
- /* lv->mode == IOMODE_RW */
- if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
- if (be->be_f_offset != lv->start)
- return -EIO;
- if (lv->cowread > lv->start)
- return -EIO;
- lv->start += be->be_length;
- lv->inval = lv->start;
- return 0;
- } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
- if (be->be_f_offset != lv->start)
- return -EIO;
- lv->start += be->be_length;
- return 0;
- } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
- if (be->be_f_offset > lv->start)
- return -EIO;
- if (be->be_f_offset < lv->inval)
- return -EIO;
- if (be->be_f_offset < lv->cowread)
- return -EIO;
- /* It looks like you might want to min this with lv->start,
- * but you really don't.
- */
- lv->inval = lv->inval + be->be_length;
- lv->cowread = be->be_f_offset + be->be_length;
- return 0;
- } else
- return -EIO;
-}
-
-/* XDR decode pnfs_block_layout4 structure */
-int
-nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
- struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
-{
- struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
- int i, status = -EIO;
- uint32_t count;
- struct pnfs_block_extent *be = NULL, *save;
- struct xdr_stream stream;
- struct xdr_buf buf;
- struct page *scratch;
- __be32 *p;
- struct layout_verification lv = {
- .mode = lgr->range.iomode,
- .start = lgr->range.offset >> SECTOR_SHIFT,
- .inval = lgr->range.offset >> SECTOR_SHIFT,
- .cowread = lgr->range.offset >> SECTOR_SHIFT,
- };
- LIST_HEAD(extents);
-
- dprintk("---> %s\n", __func__);
-
- scratch = alloc_page(gfp_flags);
- if (!scratch)
- return -ENOMEM;
-
- xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
- xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
-
- p = xdr_inline_decode(&stream, 4);
- if (unlikely(!p))
- goto out_err;
-
- count = be32_to_cpup(p++);
-
- dprintk("%s enter, number of extents %i\n", __func__, count);
- p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count);
- if (unlikely(!p))
- goto out_err;
-
- /* Decode individual extents, putting them in temporary
- * staging area until whole layout is decoded to make error
- * recovery easier.
- */
- for (i = 0; i < count; i++) {
- be = bl_alloc_extent();
- if (!be) {
- status = -ENOMEM;
- goto out_err;
- }
- memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE);
- p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
- be->be_mdev = translate_devid(lo, &be->be_devid);
- if (!be->be_mdev)
- goto out_err;
-
- /* The next three values are read in as bytes,
- * but stored as 512-byte sector lengths
- */
- if (decode_sector_number(&p, &be->be_f_offset) < 0)
- goto out_err;
- if (decode_sector_number(&p, &be->be_length) < 0)
- goto out_err;
- if (decode_sector_number(&p, &be->be_v_offset) < 0)
- goto out_err;
- be->be_state = be32_to_cpup(p++);
- if (be->be_state == PNFS_BLOCK_INVALID_DATA)
- be->be_inval = &bl->bl_inval;
- if (verify_extent(be, &lv)) {
- dprintk("%s verify failed\n", __func__);
- goto out_err;
- }
- list_add_tail(&be->be_node, &extents);
- }
- if (lgr->range.offset + lgr->range.length !=
- lv.start << SECTOR_SHIFT) {
- dprintk("%s Final length mismatch\n", __func__);
- be = NULL;
- goto out_err;
- }
- if (lv.start < lv.cowread) {
- dprintk("%s Final uncovered COW extent\n", __func__);
- be = NULL;
- goto out_err;
- }
- /* Extents decoded properly, now try to merge them in to
- * existing layout extents.
- */
- spin_lock(&bl->bl_ext_lock);
- list_for_each_entry_safe(be, save, &extents, be_node) {
- list_del(&be->be_node);
- status = bl_add_merge_extent(bl, be);
- if (status) {
- spin_unlock(&bl->bl_ext_lock);
- /* This is a fairly catastrophic error, as the
- * entire layout extent lists are now corrupted.
- * We should have some way to distinguish this.
- */
- be = NULL;
- goto out_err;
- }
- }
- spin_unlock(&bl->bl_ext_lock);
- status = 0;
- out:
- __free_page(scratch);
- dprintk("%s returns %i\n", __func__, status);
- return status;
-
- out_err:
- bl_put_extent(be);
- while (!list_empty(&extents)) {
- be = list_first_entry(&extents, struct pnfs_block_extent,
- be_node);
- list_del(&be->be_node);
- bl_put_extent(be);
- }
- goto out;
-}
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
deleted file mode 100644
index 8999cfddd866..000000000000
--- a/fs/nfs/blocklayout/blocklayoutdm.c
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * linux/fs/nfs/blocklayout/blocklayoutdm.c
- *
- * Module for the NFSv4.1 pNFS block layout driver.
- *
- * Copyright (c) 2007 The Regents of the University of Michigan.
- * All rights reserved.
- *
- * Fred Isaman <iisaman@umich.edu>
- * Andy Adamson <andros@citi.umich.edu>
- *
- * permission is granted to use, copy, create derivative works and
- * redistribute this software and such derivative works for any purpose,
- * so long as the name of the university of michigan is not used in
- * any advertising or publicity pertaining to the use or distribution
- * of this software without specific, written prior authorization. if
- * the above copyright notice or any other identification of the
- * university of michigan is included in any copy of any portion of
- * this software, then the disclaimer below must also be included.
- *
- * this software is provided as is, without representation from the
- * university of michigan as to its fitness for any purpose, and without
- * warranty by the university of michigan of any kind, either express
- * or implied, including without limitation the implied warranties of
- * merchantability and fitness for a particular purpose. the regents
- * of the university of michigan shall not be liable for any damages,
- * including special, indirect, incidental, or consequential damages,
- * with respect to any claim arising out or in connection with the use
- * of the software, even if it has been or is hereafter advised of the
- * possibility of such damages.
- */
-
-#include <linux/genhd.h> /* gendisk - used in a dprintk*/
-#include <linux/sched.h>
-#include <linux/hash.h>
-
-#include "blocklayout.h"
-
-#define NFSDBG_FACILITY NFSDBG_PNFS_LD
-
-static void dev_remove(struct net *net, dev_t dev)
-{
- struct bl_pipe_msg bl_pipe_msg;
- struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
- struct bl_dev_msg bl_umount_request;
- struct bl_msg_hdr bl_msg = {
- .type = BL_DEVICE_UMOUNT,
- .totallen = sizeof(bl_umount_request),
- };
- uint8_t *dataptr;
- DECLARE_WAITQUEUE(wq, current);
- struct nfs_net *nn = net_generic(net, nfs_net_id);
-
- dprintk("Entering %s\n", __func__);
-
- bl_pipe_msg.bl_wq = &nn->bl_wq;
- memset(msg, 0, sizeof(*msg));
- msg->len = sizeof(bl_msg) + bl_msg.totallen;
- msg->data = kzalloc(msg->len, GFP_NOFS);
- if (!msg->data)
- goto out;
-
- memset(&bl_umount_request, 0, sizeof(bl_umount_request));
- bl_umount_request.major = MAJOR(dev);
- bl_umount_request.minor = MINOR(dev);
-
- memcpy(msg->data, &bl_msg, sizeof(bl_msg));
- dataptr = (uint8_t *) msg->data;
- memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
-
- add_wait_queue(&nn->bl_wq, &wq);
- if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) {
- remove_wait_queue(&nn->bl_wq, &wq);
- goto out;
- }
-
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule();
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&nn->bl_wq, &wq);
-
-out:
- kfree(msg->data);
-}
-
-/*
- * Release meta device
- */
-static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
-{
- dprintk("%s Releasing\n", __func__);
- nfs4_blkdev_put(bdev->bm_mdev);
- dev_remove(bdev->net, bdev->bm_mdev->bd_dev);
-}
-
-void bl_free_block_dev(struct pnfs_block_dev *bdev)
-{
- if (bdev) {
- if (bdev->bm_mdev) {
- dprintk("%s Removing DM device: %d:%d\n",
- __func__,
- MAJOR(bdev->bm_mdev->bd_dev),
- MINOR(bdev->bm_mdev->bd_dev));
- nfs4_blk_metadev_release(bdev);
- }
- kfree(bdev);
- }
-}
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
new file mode 100644
index 000000000000..5aed4f98df41
--- /dev/null
+++ b/fs/nfs/blocklayout/dev.c
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include <linux/sunrpc/svc.h>
+#include <linux/blkdev.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_xdr.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+static void
+bl_free_device(struct pnfs_block_dev *dev)
+{
+ if (dev->nr_children) {
+ int i;
+
+ for (i = 0; i < dev->nr_children; i++)
+ bl_free_device(&dev->children[i]);
+ kfree(dev->children);
+ } else {
+ if (dev->bdev)
+ blkdev_put(dev->bdev, FMODE_READ);
+ }
+}
+
+void
+bl_free_deviceid_node(struct nfs4_deviceid_node *d)
+{
+ struct pnfs_block_dev *dev =
+ container_of(d, struct pnfs_block_dev, node);
+
+ bl_free_device(dev);
+ kfree(dev);
+}
+
+static int
+nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
+{
+ __be32 *p;
+ int i;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (!p)
+ return -EIO;
+ b->type = be32_to_cpup(p++);
+
+ switch (b->type) {
+ case PNFS_BLOCK_VOLUME_SIMPLE:
+ p = xdr_inline_decode(xdr, 4);
+ if (!p)
+ return -EIO;
+ b->simple.nr_sigs = be32_to_cpup(p++);
+ if (!b->simple.nr_sigs) {
+ dprintk("no signature\n");
+ return -EIO;
+ }
+
+ b->simple.len = 4 + 4;
+ for (i = 0; i < b->simple.nr_sigs; i++) {
+ p = xdr_inline_decode(xdr, 8 + 4);
+ if (!p)
+ return -EIO;
+ p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
+ b->simple.sigs[i].sig_len = be32_to_cpup(p++);
+
+ p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
+ if (!p)
+ return -EIO;
+ memcpy(&b->simple.sigs[i].sig, p,
+ b->simple.sigs[i].sig_len);
+
+ b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len;
+ }
+ break;
+ case PNFS_BLOCK_VOLUME_SLICE:
+ p = xdr_inline_decode(xdr, 8 + 8 + 4);
+ if (!p)
+ return -EIO;
+ p = xdr_decode_hyper(p, &b->slice.start);
+ p = xdr_decode_hyper(p, &b->slice.len);
+ b->slice.volume = be32_to_cpup(p++);
+ break;
+ case PNFS_BLOCK_VOLUME_CONCAT:
+ p = xdr_inline_decode(xdr, 4);
+ if (!p)
+ return -EIO;
+ b->concat.volumes_count = be32_to_cpup(p++);
+
+ p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
+ if (!p)
+ return -EIO;
+ for (i = 0; i < b->concat.volumes_count; i++)
+ b->concat.volumes[i] = be32_to_cpup(p++);
+ break;
+ case PNFS_BLOCK_VOLUME_STRIPE:
+ p = xdr_inline_decode(xdr, 8 + 4);
+ if (!p)
+ return -EIO;
+ p = xdr_decode_hyper(p, &b->stripe.chunk_size);
+ b->stripe.volumes_count = be32_to_cpup(p++);
+
+ p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
+ if (!p)
+ return -EIO;
+ for (i = 0; i < b->stripe.volumes_count; i++)
+ b->stripe.volumes[i] = be32_to_cpup(p++);
+ break;
+ default:
+ dprintk("unknown volume type!\n");
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
+ struct pnfs_block_dev_map *map)
+{
+ map->start = dev->start;
+ map->len = dev->len;
+ map->disk_offset = dev->disk_offset;
+ map->bdev = dev->bdev;
+ return true;
+}
+
+static bool bl_map_concat(struct pnfs_block_dev *dev, u64 offset,
+ struct pnfs_block_dev_map *map)
+{
+ int i;
+
+ for (i = 0; i < dev->nr_children; i++) {
+ struct pnfs_block_dev *child = &dev->children[i];
+
+ if (child->start > offset ||
+ child->start + child->len <= offset)
+ continue;
+
+ child->map(child, offset - child->start, map);
+ return true;
+ }
+
+ dprintk("%s: ran off loop!\n", __func__);
+ return false;
+}
+
+static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
+ struct pnfs_block_dev_map *map)
+{
+ struct pnfs_block_dev *child;
+ u64 chunk;
+ u32 chunk_idx;
+ u64 disk_offset;
+
+ chunk = div_u64(offset, dev->chunk_size);
+ div_u64_rem(chunk, dev->nr_children, &chunk_idx);
+
+ if (chunk_idx > dev->nr_children) {
+ dprintk("%s: invalid chunk idx %d (%lld/%lld)\n",
+ __func__, chunk_idx, offset, dev->chunk_size);
+ /* error, should not happen */
+ return false;
+ }
+
+ /* truncate offset to the beginning of the stripe */
+ offset = chunk * dev->chunk_size;
+
+ /* disk offset of the stripe */
+ disk_offset = div_u64(offset, dev->nr_children);
+
+ child = &dev->children[chunk_idx];
+ child->map(child, disk_offset, map);
+
+ map->start += offset;
+ map->disk_offset += disk_offset;
+ map->len = dev->chunk_size;
+ return true;
+}
+
+static int
+bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
+ struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);
+
+
+static int
+bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
+ struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+ struct pnfs_block_volume *v = &volumes[idx];
+ dev_t dev;
+
+ dev = bl_resolve_deviceid(server, v, gfp_mask);
+ if (!dev)
+ return -EIO;
+
+ d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+ if (IS_ERR(d->bdev)) {
+ printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
+ MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
+ return PTR_ERR(d->bdev);
+ }
+
+
+ d->len = i_size_read(d->bdev->bd_inode);
+ d->map = bl_map_simple;
+
+ printk(KERN_INFO "pNFS: using block device %s\n",
+ d->bdev->bd_disk->disk_name);
+ return 0;
+}
+
+static int
+bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
+ struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+ struct pnfs_block_volume *v = &volumes[idx];
+ int ret;
+
+ ret = bl_parse_deviceid(server, d, volumes, v->slice.volume, gfp_mask);
+ if (ret)
+ return ret;
+
+ d->disk_offset = v->slice.start;
+ d->len = v->slice.len;
+ return 0;
+}
+
+static int
+bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d,
+ struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+ struct pnfs_block_volume *v = &volumes[idx];
+ u64 len = 0;
+ int ret, i;
+
+ d->children = kcalloc(v->concat.volumes_count,
+ sizeof(struct pnfs_block_dev), GFP_KERNEL);
+ if (!d->children)
+ return -ENOMEM;
+
+ for (i = 0; i < v->concat.volumes_count; i++) {
+ ret = bl_parse_deviceid(server, &d->children[i],
+ volumes, v->concat.volumes[i], gfp_mask);
+ if (ret)
+ return ret;
+
+ d->nr_children++;
+ d->children[i].start += len;
+ len += d->children[i].len;
+ }
+
+ d->len = len;
+ d->map = bl_map_concat;
+ return 0;
+}
+
+static int
+bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d,
+ struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+ struct pnfs_block_volume *v = &volumes[idx];
+ u64 len = 0;
+ int ret, i;
+
+ d->children = kcalloc(v->stripe.volumes_count,
+ sizeof(struct pnfs_block_dev), GFP_KERNEL);
+ if (!d->children)
+ return -ENOMEM;
+
+ for (i = 0; i < v->stripe.volumes_count; i++) {
+ ret = bl_parse_deviceid(server, &d->children[i],
+ volumes, v->stripe.volumes[i], gfp_mask);
+ if (ret)
+ return ret;
+
+ d->nr_children++;
+ len += d->children[i].len;
+ }
+
+ d->len = len;
+ d->chunk_size = v->stripe.chunk_size;
+ d->map = bl_map_stripe;
+ return 0;
+}
+
+static int
+bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
+ struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+ switch (volumes[idx].type) {
+ case PNFS_BLOCK_VOLUME_SIMPLE:
+ return bl_parse_simple(server, d, volumes, idx, gfp_mask);
+ case PNFS_BLOCK_VOLUME_SLICE:
+ return bl_parse_slice(server, d, volumes, idx, gfp_mask);
+ case PNFS_BLOCK_VOLUME_CONCAT:
+ return bl_parse_concat(server, d, volumes, idx, gfp_mask);
+ case PNFS_BLOCK_VOLUME_STRIPE:
+ return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
+ default:
+ dprintk("unsupported volume type: %d\n", volumes[idx].type);
+ return -EIO;
+ }
+}
+
+struct nfs4_deviceid_node *
+bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+ gfp_t gfp_mask)
+{
+ struct nfs4_deviceid_node *node = NULL;
+ struct pnfs_block_volume *volumes;
+ struct pnfs_block_dev *top;
+ struct xdr_stream xdr;
+ struct xdr_buf buf;
+ struct page *scratch;
+ int nr_volumes, ret, i;
+ __be32 *p;
+
+ scratch = alloc_page(gfp_mask);
+ if (!scratch)
+ goto out;
+
+ xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen);
+ xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
+
+ p = xdr_inline_decode(&xdr, sizeof(__be32));
+ if (!p)
+ goto out_free_scratch;
+ nr_volumes = be32_to_cpup(p++);
+
+ volumes = kcalloc(nr_volumes, sizeof(struct pnfs_block_volume),
+ gfp_mask);
+ if (!volumes)
+ goto out_free_scratch;
+
+ for (i = 0; i < nr_volumes; i++) {
+ ret = nfs4_block_decode_volume(&xdr, &volumes[i]);
+ if (ret < 0)
+ goto out_free_volumes;
+ }
+
+ top = kzalloc(sizeof(*top), gfp_mask);
+ if (!top)
+ goto out_free_volumes;
+
+ ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask);
+ if (ret) {
+ bl_free_device(top);
+ kfree(top);
+ goto out_free_volumes;
+ }
+
+ node = &top->node;
+ nfs4_init_deviceid_node(node, server, &pdev->dev_id);
+
+out_free_volumes:
+ kfree(volumes);
+out_free_scratch:
+ __free_page(scratch);
+out:
+ return node;
+}
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
new file mode 100644
index 000000000000..31d0b5e53dfd
--- /dev/null
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -0,0 +1,602 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+
+#include <linux/vmalloc.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+static inline struct pnfs_block_extent *
+ext_node(struct rb_node *node)
+{
+ return rb_entry(node, struct pnfs_block_extent, be_node);
+}
+
+static struct pnfs_block_extent *
+ext_tree_first(struct rb_root *root)
+{
+ struct rb_node *node = rb_first(root);
+ return node ? ext_node(node) : NULL;
+}
+
+static struct pnfs_block_extent *
+ext_tree_prev(struct pnfs_block_extent *be)
+{
+ struct rb_node *node = rb_prev(&be->be_node);
+ return node ? ext_node(node) : NULL;
+}
+
+static struct pnfs_block_extent *
+ext_tree_next(struct pnfs_block_extent *be)
+{
+ struct rb_node *node = rb_next(&be->be_node);
+ return node ? ext_node(node) : NULL;
+}
+
+static inline sector_t
+ext_f_end(struct pnfs_block_extent *be)
+{
+ return be->be_f_offset + be->be_length;
+}
+
+static struct pnfs_block_extent *
+__ext_tree_search(struct rb_root *root, sector_t start)
+{
+ struct rb_node *node = root->rb_node;
+ struct pnfs_block_extent *be = NULL;
+
+ while (node) {
+ be = ext_node(node);
+ if (start < be->be_f_offset)
+ node = node->rb_left;
+ else if (start >= ext_f_end(be))
+ node = node->rb_right;
+ else
+ return be;
+ }
+
+ if (be) {
+ if (start < be->be_f_offset)
+ return be;
+
+ if (start >= ext_f_end(be))
+ return ext_tree_next(be);
+ }
+
+ return NULL;
+}
+
+static bool
+ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
+{
+ if (be1->be_state != be2->be_state)
+ return false;
+ if (be1->be_device != be2->be_device)
+ return false;
+
+ if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
+ return false;
+
+ if (be1->be_state != PNFS_BLOCK_NONE_DATA &&
+ (be1->be_v_offset + be1->be_length != be2->be_v_offset))
+ return false;
+
+ if (be1->be_state == PNFS_BLOCK_INVALID_DATA &&
+ be1->be_tag != be2->be_tag)
+ return false;
+
+ return true;
+}
+
+static struct pnfs_block_extent *
+ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
+{
+ struct pnfs_block_extent *left = ext_tree_prev(be);
+
+ if (left && ext_can_merge(left, be)) {
+ left->be_length += be->be_length;
+ rb_erase(&be->be_node, root);
+ nfs4_put_deviceid_node(be->be_device);
+ kfree(be);
+ return left;
+ }
+
+ return be;
+}
+
+static struct pnfs_block_extent *
+ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
+{
+ struct pnfs_block_extent *right = ext_tree_next(be);
+
+ if (right && ext_can_merge(be, right)) {
+ be->be_length += right->be_length;
+ rb_erase(&right->be_node, root);
+ nfs4_put_deviceid_node(right->be_device);
+ kfree(right);
+ }
+
+ return be;
+}
+
+static void
+__ext_tree_insert(struct rb_root *root,
+ struct pnfs_block_extent *new, bool merge_ok)
+{
+ struct rb_node **p = &root->rb_node, *parent = NULL;
+ struct pnfs_block_extent *be;
+
+ while (*p) {
+ parent = *p;
+ be = ext_node(parent);
+
+ if (new->be_f_offset < be->be_f_offset) {
+ if (merge_ok && ext_can_merge(new, be)) {
+ be->be_f_offset = new->be_f_offset;
+ if (be->be_state != PNFS_BLOCK_NONE_DATA)
+ be->be_v_offset = new->be_v_offset;
+ be->be_length += new->be_length;
+ be = ext_try_to_merge_left(root, be);
+ goto free_new;
+ }
+ p = &(*p)->rb_left;
+ } else if (new->be_f_offset >= ext_f_end(be)) {
+ if (merge_ok && ext_can_merge(be, new)) {
+ be->be_length += new->be_length;
+ be = ext_try_to_merge_right(root, be);
+ goto free_new;
+ }
+ p = &(*p)->rb_right;
+ } else {
+ BUG();
+ }
+ }
+
+ rb_link_node(&new->be_node, parent, p);
+ rb_insert_color(&new->be_node, root);
+ return;
+free_new:
+ nfs4_put_deviceid_node(new->be_device);
+ kfree(new);
+}
+
+static int
+__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
+{
+ struct pnfs_block_extent *be;
+ sector_t len1 = 0, len2 = 0;
+ sector_t orig_v_offset;
+ sector_t orig_len;
+
+ be = __ext_tree_search(root, start);
+ if (!be)
+ return 0;
+ if (be->be_f_offset >= end)
+ return 0;
+
+ orig_v_offset = be->be_v_offset;
+ orig_len = be->be_length;
+
+ if (start > be->be_f_offset)
+ len1 = start - be->be_f_offset;
+ if (ext_f_end(be) > end)
+ len2 = ext_f_end(be) - end;
+
+ if (len2 > 0) {
+ if (len1 > 0) {
+ struct pnfs_block_extent *new;
+
+ new = kzalloc(sizeof(*new), GFP_ATOMIC);
+ if (!new)
+ return -ENOMEM;
+
+ be->be_length = len1;
+
+ new->be_f_offset = end;
+ if (be->be_state != PNFS_BLOCK_NONE_DATA) {
+ new->be_v_offset =
+ orig_v_offset + orig_len - len2;
+ }
+ new->be_length = len2;
+ new->be_state = be->be_state;
+ new->be_tag = be->be_tag;
+ new->be_device = nfs4_get_deviceid(be->be_device);
+
+ __ext_tree_insert(root, new, true);
+ } else {
+ be->be_f_offset = end;
+ if (be->be_state != PNFS_BLOCK_NONE_DATA) {
+ be->be_v_offset =
+ orig_v_offset + orig_len - len2;
+ }
+ be->be_length = len2;
+ }
+ } else {
+ if (len1 > 0) {
+ be->be_length = len1;
+ be = ext_tree_next(be);
+ }
+
+ while (be && ext_f_end(be) <= end) {
+ struct pnfs_block_extent *next = ext_tree_next(be);
+
+ rb_erase(&be->be_node, root);
+ nfs4_put_deviceid_node(be->be_device);
+ kfree(be);
+ be = next;
+ }
+
+ if (be && be->be_f_offset < end) {
+ len1 = ext_f_end(be) - end;
+ be->be_f_offset = end;
+ if (be->be_state != PNFS_BLOCK_NONE_DATA)
+ be->be_v_offset += be->be_length - len1;
+ be->be_length = len1;
+ }
+ }
+
+ return 0;
+}
+
+int
+ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
+{
+ struct pnfs_block_extent *be;
+ struct rb_root *root;
+ int err = 0;
+
+ switch (new->be_state) {
+ case PNFS_BLOCK_READWRITE_DATA:
+ case PNFS_BLOCK_INVALID_DATA:
+ root = &bl->bl_ext_rw;
+ break;
+ case PNFS_BLOCK_READ_DATA:
+ case PNFS_BLOCK_NONE_DATA:
+ root = &bl->bl_ext_ro;
+ break;
+ default:
+ dprintk("invalid extent type\n");
+ return -EINVAL;
+ }
+
+ spin_lock(&bl->bl_ext_lock);
+retry:
+ be = __ext_tree_search(root, new->be_f_offset);
+ if (!be || be->be_f_offset >= ext_f_end(new)) {
+ __ext_tree_insert(root, new, true);
+ } else if (new->be_f_offset >= be->be_f_offset) {
+ if (ext_f_end(new) <= ext_f_end(be)) {
+ nfs4_put_deviceid_node(new->be_device);
+ kfree(new);
+ } else {
+ sector_t new_len = ext_f_end(new) - ext_f_end(be);
+ sector_t diff = new->be_length - new_len;
+
+ new->be_f_offset += diff;
+ new->be_v_offset += diff;
+ new->be_length = new_len;
+ goto retry;
+ }
+ } else if (ext_f_end(new) <= ext_f_end(be)) {
+ new->be_length = be->be_f_offset - new->be_f_offset;
+ __ext_tree_insert(root, new, true);
+ } else {
+ struct pnfs_block_extent *split;
+ sector_t new_len = ext_f_end(new) - ext_f_end(be);
+ sector_t diff = new->be_length - new_len;
+
+ split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
+ if (!split) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ split->be_length = be->be_f_offset - split->be_f_offset;
+ split->be_device = nfs4_get_deviceid(new->be_device);
+ __ext_tree_insert(root, split, true);
+
+ new->be_f_offset += diff;
+ new->be_v_offset += diff;
+ new->be_length = new_len;
+ goto retry;
+ }
+out:
+ spin_unlock(&bl->bl_ext_lock);
+ return err;
+}
+
+static bool
+__ext_tree_lookup(struct rb_root *root, sector_t isect,
+ struct pnfs_block_extent *ret)
+{
+ struct rb_node *node;
+ struct pnfs_block_extent *be;
+
+ node = root->rb_node;
+ while (node) {
+ be = ext_node(node);
+ if (isect < be->be_f_offset)
+ node = node->rb_left;
+ else if (isect >= ext_f_end(be))
+ node = node->rb_right;
+ else {
+ *ret = *be;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool
+ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
+ struct pnfs_block_extent *ret, bool rw)
+{
+ bool found = false;
+
+ spin_lock(&bl->bl_ext_lock);
+ if (!rw)
+ found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
+ if (!found)
+ found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
+ spin_unlock(&bl->bl_ext_lock);
+
+ return found;
+}
+
+int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
+ sector_t start, sector_t end)
+{
+ int err, err2;
+
+ spin_lock(&bl->bl_ext_lock);
+ err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
+ if (rw) {
+ err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end);
+ if (!err)
+ err = err2;
+ }
+ spin_unlock(&bl->bl_ext_lock);
+
+ return err;
+}
+
+static int
+ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
+ sector_t split)
+{
+ struct pnfs_block_extent *new;
+ sector_t orig_len = be->be_length;
+
+ new = kzalloc(sizeof(*new), GFP_ATOMIC);
+ if (!new)
+ return -ENOMEM;
+
+ be->be_length = split - be->be_f_offset;
+
+ new->be_f_offset = split;
+ if (be->be_state != PNFS_BLOCK_NONE_DATA)
+ new->be_v_offset = be->be_v_offset + be->be_length;
+ new->be_length = orig_len - be->be_length;
+ new->be_state = be->be_state;
+ new->be_tag = be->be_tag;
+ new->be_device = nfs4_get_deviceid(be->be_device);
+
+ __ext_tree_insert(root, new, false);
+ return 0;
+}
+
+int
+ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
+ sector_t len)
+{
+ struct rb_root *root = &bl->bl_ext_rw;
+ sector_t end = start + len;
+ struct pnfs_block_extent *be;
+ int err = 0;
+
+ spin_lock(&bl->bl_ext_lock);
+ /*
+ * First remove all COW extents or holes from written to range.
+ */
+ err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
+ if (err)
+ goto out;
+
+ /*
+ * Then mark all invalid extents in the range as written to.
+ */
+ for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
+ if (be->be_f_offset >= end)
+ break;
+
+ if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
+ continue;
+
+ if (be->be_f_offset < start) {
+ struct pnfs_block_extent *left = ext_tree_prev(be);
+
+ if (left && ext_can_merge(left, be)) {
+ sector_t diff = start - be->be_f_offset;
+
+ left->be_length += diff;
+
+ be->be_f_offset += diff;
+ be->be_v_offset += diff;
+ be->be_length -= diff;
+ } else {
+ err = ext_tree_split(root, be, start);
+ if (err)
+ goto out;
+ }
+ }
+
+ if (ext_f_end(be) > end) {
+ struct pnfs_block_extent *right = ext_tree_next(be);
+
+ if (right && ext_can_merge(be, right)) {
+ sector_t diff = end - be->be_f_offset;
+
+ be->be_length -= diff;
+
+ right->be_f_offset -= diff;
+ right->be_v_offset -= diff;
+ right->be_length += diff;
+ } else {
+ err = ext_tree_split(root, be, end);
+ if (err)
+ goto out;
+ }
+ }
+
+ if (be->be_f_offset >= start && ext_f_end(be) <= end) {
+ be->be_tag = EXTENT_WRITTEN;
+ be = ext_try_to_merge_left(root, be);
+ be = ext_try_to_merge_right(root, be);
+ }
+ }
+out:
+ spin_unlock(&bl->bl_ext_lock);
+ return err;
+}
+
+static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
+ size_t buffer_size)
+{
+ if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
+ int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i;
+
+ for (i = 0; i < nr_pages; i++)
+ put_page(arg->layoutupdate_pages[i]);
+ kfree(arg->layoutupdate_pages);
+ } else {
+ put_page(arg->layoutupdate_page);
+ }
+}
+
+static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
+ size_t buffer_size, size_t *count)
+{
+ struct pnfs_block_extent *be;
+ int ret = 0;
+
+ spin_lock(&bl->bl_ext_lock);
+ for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
+ if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
+ be->be_tag != EXTENT_WRITTEN)
+ continue;
+
+ (*count)++;
+ if (*count * BL_EXTENT_SIZE > buffer_size) {
+ /* keep counting.. */
+ ret = -ENOSPC;
+ continue;
+ }
+
+ p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
+ NFS4_DEVICEID4_SIZE);
+ p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
+ p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
+ p = xdr_encode_hyper(p, 0LL);
+ *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
+
+ be->be_tag = EXTENT_COMMITTING;
+ }
+ spin_unlock(&bl->bl_ext_lock);
+
+ return ret;
+}
+
+int
+ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
+{
+ struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
+ size_t count = 0, buffer_size = PAGE_SIZE;
+ __be32 *start_p;
+ int ret;
+
+ dprintk("%s enter\n", __func__);
+
+ arg->layoutupdate_page = alloc_page(GFP_NOFS);
+ if (!arg->layoutupdate_page)
+ return -ENOMEM;
+ start_p = page_address(arg->layoutupdate_page);
+ arg->layoutupdate_pages = &arg->layoutupdate_page;
+
+retry:
+ ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
+ if (unlikely(ret)) {
+ ext_tree_free_commitdata(arg, buffer_size);
+
+ buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count;
+ count = 0;
+
+ arg->layoutupdate_pages =
+ kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
+ sizeof(struct page *), GFP_NOFS);
+ if (!arg->layoutupdate_pages)
+ return -ENOMEM;
+
+ start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL);
+ if (!start_p) {
+ kfree(arg->layoutupdate_pages);
+ return -ENOMEM;
+ }
+
+ goto retry;
+ }
+
+ *start_p = cpu_to_be32(count);
+ arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count;
+
+ if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
+ __be32 *p = start_p;
+ int i = 0;
+
+ for (p = start_p;
+ p < start_p + arg->layoutupdate_len;
+ p += PAGE_SIZE) {
+ arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
+ }
+ }
+
+ dprintk("%s found %zu ranges\n", __func__, count);
+ return 0;
+}
+
+void
+ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status)
+{
+ struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
+ struct rb_root *root = &bl->bl_ext_rw;
+ struct pnfs_block_extent *be;
+
+ dprintk("%s status %d\n", __func__, status);
+
+ ext_tree_free_commitdata(arg, arg->layoutupdate_len);
+
+ spin_lock(&bl->bl_ext_lock);
+ for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
+ if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
+ be->be_tag != EXTENT_COMMITTING)
+ continue;
+
+ if (status) {
+ /*
+ * Mark as written and try again.
+ *
+ * XXX: some real error handling here wouldn't hurt..
+ */
+ be->be_tag = EXTENT_WRITTEN;
+ } else {
+ be->be_state = PNFS_BLOCK_READWRITE_DATA;
+ be->be_tag = 0;
+ }
+
+ be = ext_try_to_merge_left(root, be);
+ be = ext_try_to_merge_right(root, be);
+ }
+ spin_unlock(&bl->bl_ext_lock);
+}
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
deleted file mode 100644
index 4d0161442565..000000000000
--- a/fs/nfs/blocklayout/extents.c
+++ /dev/null
@@ -1,908 +0,0 @@
-/*
- * linux/fs/nfs/blocklayout/blocklayout.h
- *
- * Module for the NFSv4.1 pNFS block layout driver.
- *
- * Copyright (c) 2006 The Regents of the University of Michigan.
- * All rights reserved.
- *
- * Andy Adamson <andros@citi.umich.edu>
- * Fred Isaman <iisaman@umich.edu>
- *
- * permission is granted to use, copy, create derivative works and
- * redistribute this software and such derivative works for any purpose,
- * so long as the name of the university of michigan is not used in
- * any advertising or publicity pertaining to the use or distribution
- * of this software without specific, written prior authorization. if
- * the above copyright notice or any other identification of the
- * university of michigan is included in any copy of any portion of
- * this software, then the disclaimer below must also be included.
- *
- * this software is provided as is, without representation from the
- * university of michigan as to its fitness for any purpose, and without
- * warranty by the university of michigan of any kind, either express
- * or implied, including without limitation the implied warranties of
- * merchantability and fitness for a particular purpose. the regents
- * of the university of michigan shall not be liable for any damages,
- * including special, indirect, incidental, or consequential damages,
- * with respect to any claim arising out or in connection with the use
- * of the software, even if it has been or is hereafter advised of the
- * possibility of such damages.
- */
-
-#include "blocklayout.h"
-#define NFSDBG_FACILITY NFSDBG_PNFS_LD
-
-/* Bit numbers */
-#define EXTENT_INITIALIZED 0
-#define EXTENT_WRITTEN 1
-#define EXTENT_IN_COMMIT 2
-#define INTERNAL_EXISTS MY_MAX_TAGS
-#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1)
-
-/* Returns largest t<=s s.t. t%base==0 */
-static inline sector_t normalize(sector_t s, int base)
-{
- sector_t tmp = s; /* Since do_div modifies its argument */
- return s - sector_div(tmp, base);
-}
-
-static inline sector_t normalize_up(sector_t s, int base)
-{
- return normalize(s + base - 1, base);
-}
-
-/* Complete stub using list while determine API wanted */
-
-/* Returns tags, or negative */
-static int32_t _find_entry(struct my_tree *tree, u64 s)
-{
- struct pnfs_inval_tracking *pos;
-
- dprintk("%s(%llu) enter\n", __func__, s);
- list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
- if (pos->it_sector > s)
- continue;
- else if (pos->it_sector == s)
- return pos->it_tags & INTERNAL_MASK;
- else
- break;
- }
- return -ENOENT;
-}
-
-static inline
-int _has_tag(struct my_tree *tree, u64 s, int32_t tag)
-{
- int32_t tags;
-
- dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
- s = normalize(s, tree->mtt_step_size);
- tags = _find_entry(tree, s);
- if ((tags < 0) || !(tags & (1 << tag)))
- return 0;
- else
- return 1;
-}
-
-/* Creates entry with tag, or if entry already exists, unions tag to it.
- * If storage is not NULL, newly created entry will use it.
- * Returns number of entries added, or negative on error.
- */
-static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
- struct pnfs_inval_tracking *storage)
-{
- int found = 0;
- struct pnfs_inval_tracking *pos;
-
- dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
- list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
- if (pos->it_sector > s)
- continue;
- else if (pos->it_sector == s) {
- found = 1;
- break;
- } else
- break;
- }
- if (found) {
- pos->it_tags |= (1 << tag);
- return 0;
- } else {
- struct pnfs_inval_tracking *new;
- new = storage;
- new->it_sector = s;
- new->it_tags = (1 << tag);
- list_add(&new->it_link, &pos->it_link);
- return 1;
- }
-}
-
-/* XXXX Really want option to not create */
-/* Over range, unions tag with existing entries, else creates entry with tag */
-static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
-{
- u64 i;
-
- dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length);
- for (i = normalize(s, tree->mtt_step_size); i < s + length;
- i += tree->mtt_step_size)
- if (_add_entry(tree, i, tag, NULL))
- return -ENOMEM;
- return 0;
-}
-
-/* Ensure that future operations on given range of tree will not malloc */
-static int _preload_range(struct pnfs_inval_markings *marks,
- u64 offset, u64 length)
-{
- u64 start, end, s;
- int count, i, used = 0, status = -ENOMEM;
- struct pnfs_inval_tracking **storage;
- struct my_tree *tree = &marks->im_tree;
-
- dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
- start = normalize(offset, tree->mtt_step_size);
- end = normalize_up(offset + length, tree->mtt_step_size);
- count = (int)(end - start) / (int)tree->mtt_step_size;
-
- /* Pre-malloc what memory we might need */
- storage = kcalloc(count, sizeof(*storage), GFP_NOFS);
- if (!storage)
- return -ENOMEM;
- for (i = 0; i < count; i++) {
- storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking),
- GFP_NOFS);
- if (!storage[i])
- goto out_cleanup;
- }
-
- spin_lock_bh(&marks->im_lock);
- for (s = start; s < end; s += tree->mtt_step_size)
- used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
- spin_unlock_bh(&marks->im_lock);
-
- status = 0;
-
- out_cleanup:
- for (i = used; i < count; i++) {
- if (!storage[i])
- break;
- kfree(storage[i]);
- }
- kfree(storage);
- return status;
-}
-
-/* We are relying on page lock to serialize this */
-int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
-{
- int rv;
-
- spin_lock_bh(&marks->im_lock);
- rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
- spin_unlock_bh(&marks->im_lock);
- return rv;
-}
-
-/* Assume start, end already sector aligned */
-static int
-_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag)
-{
- struct pnfs_inval_tracking *pos;
- u64 expect = 0;
-
- dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
- list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
- if (pos->it_sector >= end)
- continue;
- if (!expect) {
- if ((pos->it_sector == end - tree->mtt_step_size) &&
- (pos->it_tags & (1 << tag))) {
- expect = pos->it_sector - tree->mtt_step_size;
- if (pos->it_sector < tree->mtt_step_size || expect < start)
- return 1;
- continue;
- } else {
- return 0;
- }
- }
- if (pos->it_sector != expect || !(pos->it_tags & (1 << tag)))
- return 0;
- expect -= tree->mtt_step_size;
- if (expect < start)
- return 1;
- }
- return 0;
-}
-
-static int is_range_written(struct pnfs_inval_markings *marks,
- sector_t start, sector_t end)
-{
- int rv;
-
- spin_lock_bh(&marks->im_lock);
- rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
- spin_unlock_bh(&marks->im_lock);
- return rv;
-}
-
-/* Marks sectors in [offest, offset_length) as having been initialized.
- * All lengths are step-aligned, where step is min(pagesize, blocksize).
- * Currently assumes offset is page-aligned
- */
-int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
- sector_t offset, sector_t length)
-{
- sector_t start, end;
-
- dprintk("%s(offset=%llu,len=%llu) enter\n",
- __func__, (u64)offset, (u64)length);
-
- start = normalize(offset, marks->im_block_size);
- end = normalize_up(offset + length, marks->im_block_size);
- if (_preload_range(marks, start, end - start))
- goto outerr;
-
- spin_lock_bh(&marks->im_lock);
- if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
- goto out_unlock;
- spin_unlock_bh(&marks->im_lock);
-
- return 0;
-
-out_unlock:
- spin_unlock_bh(&marks->im_lock);
-outerr:
- return -ENOMEM;
-}
-
-/* Marks sectors in [offest, offset+length) as having been written to disk.
- * All lengths should be block aligned.
- */
-static int mark_written_sectors(struct pnfs_inval_markings *marks,
- sector_t offset, sector_t length)
-{
- int status;
-
- dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
- (u64)offset, (u64)length);
- spin_lock_bh(&marks->im_lock);
- status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
- spin_unlock_bh(&marks->im_lock);
- return status;
-}
-
-static void print_short_extent(struct pnfs_block_short_extent *be)
-{
- dprintk("PRINT SHORT EXTENT extent %p\n", be);
- if (be) {
- dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset);
- dprintk(" be_length %llu\n", (u64)be->bse_length);
- }
-}
-
-static void print_clist(struct list_head *list, unsigned int count)
-{
- struct pnfs_block_short_extent *be;
- unsigned int i = 0;
-
- ifdebug(FACILITY) {
- printk(KERN_DEBUG "****************\n");
- printk(KERN_DEBUG "Extent list looks like:\n");
- list_for_each_entry(be, list, bse_node) {
- i++;
- print_short_extent(be);
- }
- if (i != count)
- printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count);
- printk(KERN_DEBUG "****************\n");
- }
-}
-
-/* Note: In theory, we should do more checking that devid's match between
- * old and new, but if they don't, the lists are too corrupt to salvage anyway.
- */
-/* Note this is very similar to bl_add_merge_extent */
-static void add_to_commitlist(struct pnfs_block_layout *bl,
- struct pnfs_block_short_extent *new)
-{
- struct list_head *clist = &bl->bl_commit;
- struct pnfs_block_short_extent *old, *save;
- sector_t end = new->bse_f_offset + new->bse_length;
-
- dprintk("%s enter\n", __func__);
- print_short_extent(new);
- print_clist(clist, bl->bl_count);
- bl->bl_count++;
- /* Scan for proper place to insert, extending new to the left
- * as much as possible.
- */
- list_for_each_entry_safe(old, save, clist, bse_node) {
- if (new->bse_f_offset < old->bse_f_offset)
- break;
- if (end <= old->bse_f_offset + old->bse_length) {
- /* Range is already in list */
- bl->bl_count--;
- kfree(new);
- return;
- } else if (new->bse_f_offset <=
- old->bse_f_offset + old->bse_length) {
- /* new overlaps or abuts existing be */
- if (new->bse_mdev == old->bse_mdev) {
- /* extend new to fully replace old */
- new->bse_length += new->bse_f_offset -
- old->bse_f_offset;
- new->bse_f_offset = old->bse_f_offset;
- list_del(&old->bse_node);
- bl->bl_count--;
- kfree(old);
- }
- }
- }
- /* Note that if we never hit the above break, old will not point to a
- * valid extent. However, in that case &old->bse_node==list.
- */
- list_add_tail(&new->bse_node, &old->bse_node);
- /* Scan forward for overlaps. If we find any, extend new and
- * remove the overlapped extent.
- */
- old = list_prepare_entry(new, clist, bse_node);
- list_for_each_entry_safe_continue(old, save, clist, bse_node) {
- if (end < old->bse_f_offset)
- break;
- /* new overlaps or abuts old */
- if (new->bse_mdev == old->bse_mdev) {
- if (end < old->bse_f_offset + old->bse_length) {
- /* extend new to fully cover old */
- end = old->bse_f_offset + old->bse_length;
- new->bse_length = end - new->bse_f_offset;
- }
- list_del(&old->bse_node);
- bl->bl_count--;
- kfree(old);
- }
- }
- dprintk("%s: after merging\n", __func__);
- print_clist(clist, bl->bl_count);
-}
-
-/* Note the range described by offset, length is guaranteed to be contained
- * within be.
- * new will be freed, either by this function or add_to_commitlist if they
- * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
- */
-int bl_mark_for_commit(struct pnfs_block_extent *be,
- sector_t offset, sector_t length,
- struct pnfs_block_short_extent *new)
-{
- sector_t new_end, end = offset + length;
- struct pnfs_block_layout *bl = container_of(be->be_inval,
- struct pnfs_block_layout,
- bl_inval);
-
- mark_written_sectors(be->be_inval, offset, length);
- /* We want to add the range to commit list, but it must be
- * block-normalized, and verified that the normalized range has
- * been entirely written to disk.
- */
- new->bse_f_offset = offset;
- offset = normalize(offset, bl->bl_blocksize);
- if (offset < new->bse_f_offset) {
- if (is_range_written(be->be_inval, offset, new->bse_f_offset))
- new->bse_f_offset = offset;
- else
- new->bse_f_offset = offset + bl->bl_blocksize;
- }
- new_end = normalize_up(end, bl->bl_blocksize);
- if (end < new_end) {
- if (is_range_written(be->be_inval, end, new_end))
- end = new_end;
- else
- end = new_end - bl->bl_blocksize;
- }
- if (end <= new->bse_f_offset) {
- kfree(new);
- return 0;
- }
- new->bse_length = end - new->bse_f_offset;
- new->bse_devid = be->be_devid;
- new->bse_mdev = be->be_mdev;
-
- spin_lock(&bl->bl_ext_lock);
- add_to_commitlist(bl, new);
- spin_unlock(&bl->bl_ext_lock);
- return 0;
-}
-
-static void print_bl_extent(struct pnfs_block_extent *be)
-{
- dprintk("PRINT EXTENT extent %p\n", be);
- if (be) {
- dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset);
- dprintk(" be_length %llu\n", (u64)be->be_length);
- dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset);
- dprintk(" be_state %d\n", be->be_state);
- }
-}
-
-static void
-destroy_extent(struct kref *kref)
-{
- struct pnfs_block_extent *be;
-
- be = container_of(kref, struct pnfs_block_extent, be_refcnt);
- dprintk("%s be=%p\n", __func__, be);
- kfree(be);
-}
-
-void
-bl_put_extent(struct pnfs_block_extent *be)
-{
- if (be) {
- dprintk("%s enter %p (%i)\n", __func__, be,
- atomic_read(&be->be_refcnt.refcount));
- kref_put(&be->be_refcnt, destroy_extent);
- }
-}
-
-struct pnfs_block_extent *bl_alloc_extent(void)
-{
- struct pnfs_block_extent *be;
-
- be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
- if (!be)
- return NULL;
- INIT_LIST_HEAD(&be->be_node);
- kref_init(&be->be_refcnt);
- be->be_inval = NULL;
- return be;
-}
-
-static void print_elist(struct list_head *list)
-{
- struct pnfs_block_extent *be;
- dprintk("****************\n");
- dprintk("Extent list looks like:\n");
- list_for_each_entry(be, list, be_node) {
- print_bl_extent(be);
- }
- dprintk("****************\n");
-}
-
-static inline int
-extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new)
-{
- /* Note this assumes new->be_f_offset >= old->be_f_offset */
- return (new->be_state == old->be_state) &&
- ((new->be_state == PNFS_BLOCK_NONE_DATA) ||
- ((new->be_v_offset - old->be_v_offset ==
- new->be_f_offset - old->be_f_offset) &&
- new->be_mdev == old->be_mdev));
-}
-
-/* Adds new to appropriate list in bl, modifying new and removing existing
- * extents as appropriate to deal with overlaps.
- *
- * See bl_find_get_extent for list constraints.
- *
- * Refcount on new is already set. If end up not using it, or error out,
- * need to put the reference.
- *
- * bl->bl_ext_lock is held by caller.
- */
-int
-bl_add_merge_extent(struct pnfs_block_layout *bl,
- struct pnfs_block_extent *new)
-{
- struct pnfs_block_extent *be, *tmp;
- sector_t end = new->be_f_offset + new->be_length;
- struct list_head *list;
-
- dprintk("%s enter with be=%p\n", __func__, new);
- print_bl_extent(new);
- list = &bl->bl_extents[bl_choose_list(new->be_state)];
- print_elist(list);
-
- /* Scan for proper place to insert, extending new to the left
- * as much as possible.
- */
- list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
- if (new->be_f_offset >= be->be_f_offset + be->be_length)
- break;
- if (new->be_f_offset >= be->be_f_offset) {
- if (end <= be->be_f_offset + be->be_length) {
- /* new is a subset of existing be*/
- if (extents_consistent(be, new)) {
- dprintk("%s: new is subset, ignoring\n",
- __func__);
- bl_put_extent(new);
- return 0;
- } else {
- goto out_err;
- }
- } else {
- /* |<-- be -->|
- * |<-- new -->| */
- if (extents_consistent(be, new)) {
- /* extend new to fully replace be */
- new->be_length += new->be_f_offset -
- be->be_f_offset;
- new->be_f_offset = be->be_f_offset;
- new->be_v_offset = be->be_v_offset;
- dprintk("%s: removing %p\n", __func__, be);
- list_del(&be->be_node);
- bl_put_extent(be);
- } else {
- goto out_err;
- }
- }
- } else if (end >= be->be_f_offset + be->be_length) {
- /* new extent overlap existing be */
- if (extents_consistent(be, new)) {
- /* extend new to fully replace be */
- dprintk("%s: removing %p\n", __func__, be);
- list_del(&be->be_node);
- bl_put_extent(be);
- } else {
- goto out_err;
- }
- } else if (end > be->be_f_offset) {
- /* |<-- be -->|
- *|<-- new -->| */
- if (extents_consistent(new, be)) {
- /* extend new to fully replace be */
- new->be_length += be->be_f_offset + be->be_length -
- new->be_f_offset - new->be_length;
- dprintk("%s: removing %p\n", __func__, be);
- list_del(&be->be_node);
- bl_put_extent(be);
- } else {
- goto out_err;
- }
- }
- }
- /* Note that if we never hit the above break, be will not point to a
- * valid extent. However, in that case &be->be_node==list.
- */
- list_add(&new->be_node, &be->be_node);
- dprintk("%s: inserting new\n", __func__);
- print_elist(list);
- /* FIXME - The per-list consistency checks have all been done,
- * should now check cross-list consistency.
- */
- return 0;
-
- out_err:
- bl_put_extent(new);
- return -EIO;
-}
-
-/* Returns extent, or NULL. If a second READ extent exists, it is returned
- * in cow_read, if given.
- *
- * The extents are kept in two seperate ordered lists, one for READ and NONE,
- * one for READWRITE and INVALID. Within each list, we assume:
- * 1. Extents are ordered by file offset.
- * 2. For any given isect, there is at most one extents that matches.
- */
-struct pnfs_block_extent *
-bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
- struct pnfs_block_extent **cow_read)
-{
- struct pnfs_block_extent *be, *cow, *ret;
- int i;
-
- dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
- cow = ret = NULL;
- spin_lock(&bl->bl_ext_lock);
- for (i = 0; i < EXTENT_LISTS; i++) {
- list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
- if (isect >= be->be_f_offset + be->be_length)
- break;
- if (isect >= be->be_f_offset) {
- /* We have found an extent */
- dprintk("%s Get %p (%i)\n", __func__, be,
- atomic_read(&be->be_refcnt.refcount));
- kref_get(&be->be_refcnt);
- if (!ret)
- ret = be;
- else if (be->be_state != PNFS_BLOCK_READ_DATA)
- bl_put_extent(be);
- else
- cow = be;
- break;
- }
- }
- if (ret &&
- (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA))
- break;
- }
- spin_unlock(&bl->bl_ext_lock);
- if (cow_read)
- *cow_read = cow;
- print_bl_extent(ret);
- return ret;
-}
-
-/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */
-static struct pnfs_block_extent *
-bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
-{
- struct pnfs_block_extent *be, *ret = NULL;
- int i;
-
- dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
- for (i = 0; i < EXTENT_LISTS; i++) {
- if (ret)
- break;
- list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
- if (isect >= be->be_f_offset + be->be_length)
- break;
- if (isect >= be->be_f_offset) {
- /* We have found an extent */
- dprintk("%s Get %p (%i)\n", __func__, be,
- atomic_read(&be->be_refcnt.refcount));
- kref_get(&be->be_refcnt);
- ret = be;
- break;
- }
- }
- }
- print_bl_extent(ret);
- return ret;
-}
-
-int
-encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
- struct xdr_stream *xdr,
- const struct nfs4_layoutcommit_args *arg)
-{
- struct pnfs_block_short_extent *lce, *save;
- unsigned int count = 0;
- __be32 *p, *xdr_start;
-
- dprintk("%s enter\n", __func__);
- /* BUG - creation of bl_commit is buggy - need to wait for
- * entire block to be marked WRITTEN before it can be added.
- */
- spin_lock(&bl->bl_ext_lock);
- /* Want to adjust for possible truncate */
- /* We now want to adjust argument range */
-
- /* XDR encode the ranges found */
- xdr_start = xdr_reserve_space(xdr, 8);
- if (!xdr_start)
- goto out;
- list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) {
- p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data));
- if (!p)
- break;
- p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE);
- p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT);
- p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT);
- p = xdr_encode_hyper(p, 0LL);
- *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
- list_move_tail(&lce->bse_node, &bl->bl_committing);
- bl->bl_count--;
- count++;
- }
- xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
- xdr_start[1] = cpu_to_be32(count);
-out:
- spin_unlock(&bl->bl_ext_lock);
- dprintk("%s found %i ranges\n", __func__, count);
- return 0;
-}
-
-/* Helper function to set_to_rw that initialize a new extent */
-static void
-_prep_new_extent(struct pnfs_block_extent *new,
- struct pnfs_block_extent *orig,
- sector_t offset, sector_t length, int state)
-{
- kref_init(&new->be_refcnt);
- /* don't need to INIT_LIST_HEAD(&new->be_node) */
- memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid));
- new->be_mdev = orig->be_mdev;
- new->be_f_offset = offset;
- new->be_length = length;
- new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset;
- new->be_state = state;
- new->be_inval = orig->be_inval;
-}
-
-/* Tries to merge be with extent in front of it in list.
- * Frees storage if not used.
- */
-static struct pnfs_block_extent *
-_front_merge(struct pnfs_block_extent *be, struct list_head *head,
- struct pnfs_block_extent *storage)
-{
- struct pnfs_block_extent *prev;
-
- if (!storage)
- goto no_merge;
- if (&be->be_node == head || be->be_node.prev == head)
- goto no_merge;
- prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node);
- if ((prev->be_f_offset + prev->be_length != be->be_f_offset) ||
- !extents_consistent(prev, be))
- goto no_merge;
- _prep_new_extent(storage, prev, prev->be_f_offset,
- prev->be_length + be->be_length, prev->be_state);
- list_replace(&prev->be_node, &storage->be_node);
- bl_put_extent(prev);
- list_del(&be->be_node);
- bl_put_extent(be);
- return storage;
-
- no_merge:
- kfree(storage);
- return be;
-}
-
-static u64
-set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length)
-{
- u64 rv = offset + length;
- struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old;
- struct pnfs_block_extent *children[3];
- struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL;
- int i = 0, j;
-
- dprintk("%s(%llu, %llu)\n", __func__, offset, length);
- /* Create storage for up to three new extents e1, e2, e3 */
- e1 = kmalloc(sizeof(*e1), GFP_ATOMIC);
- e2 = kmalloc(sizeof(*e2), GFP_ATOMIC);
- e3 = kmalloc(sizeof(*e3), GFP_ATOMIC);
- /* BUG - we are ignoring any failure */
- if (!e1 || !e2 || !e3)
- goto out_nosplit;
-
- spin_lock(&bl->bl_ext_lock);
- be = bl_find_get_extent_locked(bl, offset);
- rv = be->be_f_offset + be->be_length;
- if (be->be_state != PNFS_BLOCK_INVALID_DATA) {
- spin_unlock(&bl->bl_ext_lock);
- goto out_nosplit;
- }
- /* Add e* to children, bumping e*'s krefs */
- if (be->be_f_offset != offset) {
- _prep_new_extent(e1, be, be->be_f_offset,
- offset - be->be_f_offset,
- PNFS_BLOCK_INVALID_DATA);
- children[i++] = e1;
- print_bl_extent(e1);
- } else
- merge1 = e1;
- _prep_new_extent(e2, be, offset,
- min(length, be->be_f_offset + be->be_length - offset),
- PNFS_BLOCK_READWRITE_DATA);
- children[i++] = e2;
- print_bl_extent(e2);
- if (offset + length < be->be_f_offset + be->be_length) {
- _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length,
- be->be_f_offset + be->be_length -
- offset - length,
- PNFS_BLOCK_INVALID_DATA);
- children[i++] = e3;
- print_bl_extent(e3);
- } else
- merge2 = e3;
-
- /* Remove be from list, and insert the e* */
- /* We don't get refs on e*, since this list is the base reference
- * set when init'ed.
- */
- if (i < 3)
- children[i] = NULL;
- new = children[0];
- list_replace(&be->be_node, &new->be_node);
- bl_put_extent(be);
- new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1);
- for (j = 1; j < i; j++) {
- old = new;
- new = children[j];
- list_add(&new->be_node, &old->be_node);
- }
- if (merge2) {
- /* This is a HACK, should just create a _back_merge function */
- new = list_entry(new->be_node.next,
- struct pnfs_block_extent, be_node);
- new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2);
- }
- spin_unlock(&bl->bl_ext_lock);
-
- /* Since we removed the base reference above, be is now scheduled for
- * destruction.
- */
- bl_put_extent(be);
- dprintk("%s returns %llu after split\n", __func__, rv);
- return rv;
-
- out_nosplit:
- kfree(e1);
- kfree(e2);
- kfree(e3);
- dprintk("%s returns %llu without splitting\n", __func__, rv);
- return rv;
-}
-
-void
-clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
- const struct nfs4_layoutcommit_args *arg,
- int status)
-{
- struct pnfs_block_short_extent *lce, *save;
-
- dprintk("%s status %d\n", __func__, status);
- list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) {
- if (likely(!status)) {
- u64 offset = lce->bse_f_offset;
- u64 end = offset + lce->bse_length;
-
- do {
- offset = set_to_rw(bl, offset, end - offset);
- } while (offset < end);
- list_del(&lce->bse_node);
-
- kfree(lce);
- } else {
- list_del(&lce->bse_node);
- spin_lock(&bl->bl_ext_lock);
- add_to_commitlist(bl, lce);
- spin_unlock(&bl->bl_ext_lock);
- }
- }
-}
-
-int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
-{
- struct pnfs_block_short_extent *new;
-
- new = kmalloc(sizeof(*new), GFP_NOFS);
- if (unlikely(!new))
- return -ENOMEM;
-
- spin_lock_bh(&marks->im_lock);
- list_add(&new->bse_node, &marks->im_extents);
- spin_unlock_bh(&marks->im_lock);
-
- return 0;
-}
-
-struct pnfs_block_short_extent *
-bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
-{
- struct pnfs_block_short_extent *rv = NULL;
-
- spin_lock_bh(&marks->im_lock);
- if (!list_empty(&marks->im_extents)) {
- rv = list_entry((&marks->im_extents)->next,
- struct pnfs_block_short_extent, bse_node);
- list_del_init(&rv->bse_node);
- }
- spin_unlock_bh(&marks->im_lock);
-
- return rv;
-}
-
-void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
-{
- struct pnfs_block_short_extent *se = NULL, *tmp;
-
- if (num_to_free <= 0)
- return;
-
- spin_lock(&marks->im_lock);
- list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) {
- list_del(&se->bse_node);
- kfree(se);
- if (--num_to_free == 0)
- break;
- }
- spin_unlock(&marks->im_lock);
-
- BUG_ON(num_to_free > 0);
-}
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
new file mode 100644
index 000000000000..acbf9ca4018c
--- /dev/null
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2006,2007 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@citi.umich.edu>
+ * Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+
+#include <linux/module.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+static void
+nfs4_encode_simple(__be32 *p, struct pnfs_block_volume *b)
+{
+ int i;
+
+ *p++ = cpu_to_be32(1);
+ *p++ = cpu_to_be32(b->type);
+ *p++ = cpu_to_be32(b->simple.nr_sigs);
+ for (i = 0; i < b->simple.nr_sigs; i++) {
+ p = xdr_encode_hyper(p, b->simple.sigs[i].offset);
+ p = xdr_encode_opaque(p, b->simple.sigs[i].sig,
+ b->simple.sigs[i].sig_len);
+ }
+}
+
+dev_t
+bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
+ gfp_t gfp_mask)
+{
+ struct net *net = server->nfs_client->cl_net;
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ struct bl_dev_msg *reply = &nn->bl_mount_reply;
+ struct bl_pipe_msg bl_pipe_msg;
+ struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
+ struct bl_msg_hdr *bl_msg;
+ DECLARE_WAITQUEUE(wq, current);
+ dev_t dev = 0;
+ int rc;
+
+ dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
+
+ mutex_lock(&nn->bl_mutex);
+ bl_pipe_msg.bl_wq = &nn->bl_wq;
+
+ b->simple.len += 4; /* single volume */
+ if (b->simple.len > PAGE_SIZE)
+ goto out_unlock;
+
+ memset(msg, 0, sizeof(*msg));
+ msg->len = sizeof(*bl_msg) + b->simple.len;
+ msg->data = kzalloc(msg->len, gfp_mask);
+ if (!msg->data)
+ goto out_free_data;
+
+ bl_msg = msg->data;
+ bl_msg->type = BL_DEVICE_MOUNT,
+ bl_msg->totallen = b->simple.len;
+ nfs4_encode_simple(msg->data + sizeof(*bl_msg), b);
+
+ dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
+ add_wait_queue(&nn->bl_wq, &wq);
+ rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
+ if (rc < 0) {
+ remove_wait_queue(&nn->bl_wq, &wq);
+ goto out_free_data;
+ }
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule();
+ remove_wait_queue(&nn->bl_wq, &wq);
+
+ if (reply->status != BL_DEVICE_REQUEST_PROC) {
+ printk(KERN_WARNING "%s failed to decode device: %d\n",
+ __func__, reply->status);
+ goto out_free_data;
+ }
+
+ dev = MKDEV(reply->major, reply->minor);
+out_free_data:
+ kfree(msg->data);
+out_unlock:
+ mutex_unlock(&nn->bl_mutex);
+ return dev;
+}
+
+static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
+ size_t mlen)
+{
+ struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
+ nfs_net_id);
+
+ if (mlen != sizeof (struct bl_dev_msg))
+ return -EINVAL;
+
+ if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
+ return -EFAULT;
+
+ wake_up(&nn->bl_wq);
+
+ return mlen;
+}
+
+static void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+ struct bl_pipe_msg *bl_pipe_msg =
+ container_of(msg, struct bl_pipe_msg, msg);
+
+ if (msg->errno >= 0)
+ return;
+ wake_up(bl_pipe_msg->bl_wq);
+}
+
+static const struct rpc_pipe_ops bl_upcall_ops = {
+ .upcall = rpc_pipe_generic_upcall,
+ .downcall = bl_pipe_downcall,
+ .destroy_msg = bl_pipe_destroy_msg,
+};
+
+static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
+ struct rpc_pipe *pipe)
+{
+ struct dentry *dir, *dentry;
+
+ dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
+ if (dir == NULL)
+ return ERR_PTR(-ENOENT);
+ dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
+ dput(dir);
+ return dentry;
+}
+
+static void nfs4blocklayout_unregister_sb(struct super_block *sb,
+ struct rpc_pipe *pipe)
+{
+ if (pipe->dentry)
+ rpc_unlink(pipe->dentry);
+}
+
+static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
+ void *ptr)
+{
+ struct super_block *sb = ptr;
+ struct net *net = sb->s_fs_info;
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ struct dentry *dentry;
+ int ret = 0;
+
+ if (!try_module_get(THIS_MODULE))
+ return 0;
+
+ if (nn->bl_device_pipe == NULL) {
+ module_put(THIS_MODULE);
+ return 0;
+ }
+
+ switch (event) {
+ case RPC_PIPEFS_MOUNT:
+ dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
+ if (IS_ERR(dentry)) {
+ ret = PTR_ERR(dentry);
+ break;
+ }
+ nn->bl_device_pipe->dentry = dentry;
+ break;
+ case RPC_PIPEFS_UMOUNT:
+ if (nn->bl_device_pipe->dentry)
+ nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
+ break;
+ default:
+ ret = -ENOTSUPP;
+ break;
+ }
+ module_put(THIS_MODULE);
+ return ret;
+}
+
+static struct notifier_block nfs4blocklayout_block = {
+ .notifier_call = rpc_pipefs_event,
+};
+
+static struct dentry *nfs4blocklayout_register_net(struct net *net,
+ struct rpc_pipe *pipe)
+{
+ struct super_block *pipefs_sb;
+ struct dentry *dentry;
+
+ pipefs_sb = rpc_get_sb_net(net);
+ if (!pipefs_sb)
+ return NULL;
+ dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
+ rpc_put_sb_net(net);
+ return dentry;
+}
+
+static void nfs4blocklayout_unregister_net(struct net *net,
+ struct rpc_pipe *pipe)
+{
+ struct super_block *pipefs_sb;
+
+ pipefs_sb = rpc_get_sb_net(net);
+ if (pipefs_sb) {
+ nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
+ rpc_put_sb_net(net);
+ }
+}
+
+static int nfs4blocklayout_net_init(struct net *net)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ struct dentry *dentry;
+
+ mutex_init(&nn->bl_mutex);
+ init_waitqueue_head(&nn->bl_wq);
+ nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
+ if (IS_ERR(nn->bl_device_pipe))
+ return PTR_ERR(nn->bl_device_pipe);
+ dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
+ if (IS_ERR(dentry)) {
+ rpc_destroy_pipe_data(nn->bl_device_pipe);
+ return PTR_ERR(dentry);
+ }
+ nn->bl_device_pipe->dentry = dentry;
+ return 0;
+}
+
+static void nfs4blocklayout_net_exit(struct net *net)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+ nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
+ rpc_destroy_pipe_data(nn->bl_device_pipe);
+ nn->bl_device_pipe = NULL;
+}
+
+static struct pernet_operations nfs4blocklayout_net_ops = {
+ .init = nfs4blocklayout_net_init,
+ .exit = nfs4blocklayout_net_exit,
+};
+
+int __init bl_init_pipefs(void)
+{
+ int ret;
+
+ ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
+ if (ret)
+ goto out;
+ ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
+ if (ret)
+ goto out_unregister_notifier;
+ return 0;
+
+out_unregister_notifier:
+ rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
+out:
+ return ret;
+}
+
+void __exit bl_cleanup_pipefs(void)
+{
+ rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
+ unregister_pernet_subsys(&nfs4blocklayout_net_ops);
+}
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 073b4cf67ed9..b8fb3a4ef649 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -235,7 +235,7 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
cb_info->serv = serv;
cb_info->rqst = rqstp;
- cb_info->task = kthread_run(callback_svc, cb_info->rqst,
+ cb_info->task = kthread_create(callback_svc, cb_info->rqst,
"nfsv4.%u-svc", minorversion);
if (IS_ERR(cb_info->task)) {
ret = PTR_ERR(cb_info->task);
@@ -244,6 +244,8 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
cb_info->task = NULL;
return ret;
}
+ rqstp->rq_task = cb_info->task;
+ wake_up_process(cb_info->task);
dprintk("nfs_callback_up: service started\n");
return 0;
}
@@ -428,6 +430,18 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
if (p == NULL)
return 0;
+ /*
+ * Did we get the acceptor from userland during the SETCLIENID
+ * negotiation?
+ */
+ if (clp->cl_acceptor)
+ return !strcmp(p, clp->cl_acceptor);
+
+ /*
+ * Otherwise try to verify it using the cl_hostname. Note that this
+ * doesn't work if a non-canonical hostname was used in the devname.
+ */
+
/* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */
if (memcmp(p, "nfs@", 4) != 0)
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 41db5258e7a7..73466b934090 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -171,14 +171,26 @@ static u32 initiate_file_draining(struct nfs_client *clp,
goto out;
ino = lo->plh_inode;
+
+ spin_lock(&ino->i_lock);
+ pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+ spin_unlock(&ino->i_lock);
+
+ pnfs_layoutcommit_inode(ino, false);
+
spin_lock(&ino->i_lock);
if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
pnfs_mark_matching_lsegs_invalid(lo, &free_me_list,
- &args->cbl_range))
+ &args->cbl_range)) {
rv = NFS4ERR_DELAY;
- else
- rv = NFS4ERR_NOMATCHING_LAYOUT;
- pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+ goto unlock;
+ }
+
+ if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
+ NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
+ &args->cbl_range);
+ }
+unlock:
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&free_me_list);
pnfs_put_layout_hdr(lo);
@@ -277,9 +289,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
}
found:
- if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
- dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, "
- "deleting instead\n", __func__);
nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
}
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 1d09289c8f0e..f9f4845db989 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -110,8 +110,8 @@ struct nfs_subversion *get_nfs_version(unsigned int version)
mutex_unlock(&nfs_version_mutex);
}
- if (!IS_ERR(nfs))
- try_module_get(nfs->owner);
+ if (!IS_ERR(nfs) && !try_module_get(nfs->owner))
+ return ERR_PTR(-EAGAIN);
return nfs;
}
@@ -158,7 +158,8 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
goto error_0;
clp->cl_nfs_mod = cl_init->nfs_mod;
- try_module_get(clp->cl_nfs_mod->owner);
+ if (!try_module_get(clp->cl_nfs_mod->owner))
+ goto error_dealloc;
clp->rpc_ops = clp->cl_nfs_mod->rpc_ops;
@@ -190,6 +191,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
error_cleanup:
put_nfs_version(clp->cl_nfs_mod);
+error_dealloc:
kfree(clp);
error_0:
return ERR_PTR(err);
@@ -252,6 +254,7 @@ void nfs_free_client(struct nfs_client *clp)
put_net(clp->cl_net);
put_nfs_version(clp->cl_nfs_mod);
kfree(clp->cl_hostname);
+ kfree(clp->cl_acceptor);
kfree(clp);
dprintk("<-- nfs_free_client()\n");
@@ -482,8 +485,13 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id);
const struct nfs_rpc_ops *rpc_ops = cl_init->nfs_mod->rpc_ops;
+ if (cl_init->hostname == NULL) {
+ WARN_ON(1);
+ return NULL;
+ }
+
dprintk("--> nfs_get_client(%s,v%u)\n",
- cl_init->hostname ?: "", rpc_ops->version);
+ cl_init->hostname, rpc_ops->version);
/* see if the client already exists */
do {
@@ -510,7 +518,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
} while (!IS_ERR(new));
dprintk("<-- nfs_get_client() Failed to find %s (%ld)\n",
- cl_init->hostname ?: "", PTR_ERR(new));
+ cl_init->hostname, PTR_ERR(new));
return new;
}
EXPORT_SYMBOL_GPL(nfs_get_client);
@@ -1205,7 +1213,7 @@ static const struct file_operations nfs_server_list_fops = {
.open = nfs_server_list_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = seq_release_net,
.owner = THIS_MODULE,
};
@@ -1226,7 +1234,7 @@ static const struct file_operations nfs_volume_list_fops = {
.open = nfs_volume_list_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = seq_release_net,
.owner = THIS_MODULE,
};
@@ -1236,27 +1244,17 @@ static const struct file_operations nfs_volume_list_fops = {
*/
static int nfs_server_list_open(struct inode *inode, struct file *file)
{
- struct seq_file *m;
- int ret;
- struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info;
- struct net *net = pid_ns->child_reaper->nsproxy->net_ns;
-
- ret = seq_open(file, &nfs_server_list_ops);
- if (ret < 0)
- return ret;
-
- m = file->private_data;
- m->private = net;
-
- return 0;
+ return seq_open_net(inode, file, &nfs_server_list_ops,
+ sizeof(struct seq_net_private));
}
/*
* set up the iterator to start reading from the server list and return the first item
*/
static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
+ __acquires(&nn->nfs_client_lock)
{
- struct nfs_net *nn = net_generic(m->private, nfs_net_id);
+ struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
/* lock the list against modification */
spin_lock(&nn->nfs_client_lock);
@@ -1268,7 +1266,7 @@ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
*/
static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
{
- struct nfs_net *nn = net_generic(p->private, nfs_net_id);
+ struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
return seq_list_next(v, &nn->nfs_client_list, pos);
}
@@ -1277,8 +1275,9 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
* clean up after reading from the transports list
*/
static void nfs_server_list_stop(struct seq_file *p, void *v)
+ __releases(&nn->nfs_client_lock)
{
- struct nfs_net *nn = net_generic(p->private, nfs_net_id);
+ struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
spin_unlock(&nn->nfs_client_lock);
}
@@ -1289,7 +1288,7 @@ static void nfs_server_list_stop(struct seq_file *p, void *v)
static int nfs_server_list_show(struct seq_file *m, void *v)
{
struct nfs_client *clp;
- struct nfs_net *nn = net_generic(m->private, nfs_net_id);
+ struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
/* display header on line 1 */
if (v == &nn->nfs_client_list) {
@@ -1321,27 +1320,17 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
*/
static int nfs_volume_list_open(struct inode *inode, struct file *file)
{
- struct seq_file *m;
- int ret;
- struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info;
- struct net *net = pid_ns->child_reaper->nsproxy->net_ns;
-
- ret = seq_open(file, &nfs_volume_list_ops);
- if (ret < 0)
- return ret;
-
- m = file->private_data;
- m->private = net;
-
- return 0;
+ return seq_open_net(inode, file, &nfs_volume_list_ops,
+ sizeof(struct seq_net_private));
}
/*
* set up the iterator to start reading from the volume list and return the first item
*/
static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
+ __acquires(&nn->nfs_client_lock)
{
- struct nfs_net *nn = net_generic(m->private, nfs_net_id);
+ struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
/* lock the list against modification */
spin_lock(&nn->nfs_client_lock);
@@ -1353,7 +1342,7 @@ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
*/
static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
{
- struct nfs_net *nn = net_generic(p->private, nfs_net_id);
+ struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
return seq_list_next(v, &nn->nfs_volume_list, pos);
}
@@ -1362,8 +1351,9 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
* clean up after reading from the transports list
*/
static void nfs_volume_list_stop(struct seq_file *p, void *v)
+ __releases(&nn->nfs_client_lock)
{
- struct nfs_net *nn = net_generic(p->private, nfs_net_id);
+ struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
spin_unlock(&nn->nfs_client_lock);
}
@@ -1376,7 +1366,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
struct nfs_server *server;
struct nfs_client *clp;
char dev[8], fsid[17];
- struct nfs_net *nn = net_generic(m->private, nfs_net_id);
+ struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
/* display header on line 1 */
if (v == &nn->nfs_volume_list) {
@@ -1407,6 +1397,39 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
return 0;
}
+int nfs_fs_proc_net_init(struct net *net)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ struct proc_dir_entry *p;
+
+ nn->proc_nfsfs = proc_net_mkdir(net, "nfsfs", net->proc_net);
+ if (!nn->proc_nfsfs)
+ goto error_0;
+
+ /* a file of servers with which we're dealing */
+ p = proc_create("servers", S_IFREG|S_IRUGO,
+ nn->proc_nfsfs, &nfs_server_list_fops);
+ if (!p)
+ goto error_1;
+
+ /* a file of volumes that we have mounted */
+ p = proc_create("volumes", S_IFREG|S_IRUGO,
+ nn->proc_nfsfs, &nfs_volume_list_fops);
+ if (!p)
+ goto error_1;
+ return 0;
+
+error_1:
+ remove_proc_subtree("nfsfs", net->proc_net);
+error_0:
+ return -ENOMEM;
+}
+
+void nfs_fs_proc_net_exit(struct net *net)
+{
+ remove_proc_subtree("nfsfs", net->proc_net);
+}
+
/*
* initialise the /proc/fs/nfsfs/ directory
*/
@@ -1419,14 +1442,12 @@ int __init nfs_fs_proc_init(void)
goto error_0;
/* a file of servers with which we're dealing */
- p = proc_create("servers", S_IFREG|S_IRUGO,
- proc_fs_nfs, &nfs_server_list_fops);
+ p = proc_symlink("servers", proc_fs_nfs, "../../net/nfsfs/servers");
if (!p)
goto error_1;
/* a file of volumes that we have mounted */
- p = proc_create("volumes", S_IFREG|S_IRUGO,
- proc_fs_nfs, &nfs_volume_list_fops);
+ p = proc_symlink("volumes", proc_fs_nfs, "../../net/nfsfs/volumes");
if (!p)
goto error_2;
return 0;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 5d8ccecf5f5c..7f3f60641344 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -41,14 +41,8 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
}
-/**
- * nfs_have_delegation - check if inode has a delegation
- * @inode: inode to check
- * @flags: delegation types to check for
- *
- * Returns one if inode has the indicated delegation, otherwise zero.
- */
-int nfs4_have_delegation(struct inode *inode, fmode_t flags)
+static int
+nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
{
struct nfs_delegation *delegation;
int ret = 0;
@@ -58,12 +52,34 @@ int nfs4_have_delegation(struct inode *inode, fmode_t flags)
delegation = rcu_dereference(NFS_I(inode)->delegation);
if (delegation != NULL && (delegation->type & flags) == flags &&
!test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
- nfs_mark_delegation_referenced(delegation);
+ if (mark)
+ nfs_mark_delegation_referenced(delegation);
ret = 1;
}
rcu_read_unlock();
return ret;
}
+/**
+ * nfs_have_delegation - check if inode has a delegation, mark it
+ * NFS_DELEGATION_REFERENCED if there is one.
+ * @inode: inode to check
+ * @flags: delegation types to check for
+ *
+ * Returns one if inode has the indicated delegation, otherwise zero.
+ */
+int nfs4_have_delegation(struct inode *inode, fmode_t flags)
+{
+ return nfs4_do_check_delegation(inode, flags, true);
+}
+
+/*
+ * nfs4_check_delegation - check if inode has a delegation, do not mark
+ * NFS_DELEGATION_REFERENCED if it has one.
+ */
+int nfs4_check_delegation(struct inode *inode, fmode_t flags)
+{
+ return nfs4_do_check_delegation(inode, flags, false);
+}
static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid)
{
@@ -109,6 +125,8 @@ again:
continue;
if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
continue;
+ if (!nfs4_valid_open_stateid(state))
+ continue;
if (!nfs4_stateid_match(&state->stateid, stateid))
continue;
get_nfs_open_context(ctx);
@@ -177,7 +195,11 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *
{
int res = 0;
- res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid, issync);
+ if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+ res = nfs4_proc_delegreturn(inode,
+ delegation->cred,
+ &delegation->stateid,
+ issync);
nfs_free_delegation(delegation);
return res;
}
@@ -364,11 +386,13 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation
{
struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
struct nfs_inode *nfsi = NFS_I(inode);
- int err;
+ int err = 0;
if (delegation == NULL)
return 0;
do {
+ if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+ break;
err = nfs_delegation_claim_opens(inode, &delegation->stateid);
if (!issync || err != -EAGAIN)
break;
@@ -589,10 +613,23 @@ static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *cl
rcu_read_unlock();
}
+static void nfs_revoke_delegation(struct inode *inode)
+{
+ struct nfs_delegation *delegation;
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (delegation != NULL) {
+ set_bit(NFS_DELEGATION_REVOKED, &delegation->flags);
+ nfs_mark_return_delegation(NFS_SERVER(inode), delegation);
+ }
+ rcu_read_unlock();
+}
+
void nfs_remove_bad_delegation(struct inode *inode)
{
struct nfs_delegation *delegation;
+ nfs_revoke_delegation(inode);
delegation = nfs_inode_detach_delegation(inode);
if (delegation) {
nfs_inode_find_state_and_recover(inode, &delegation->stateid);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 9a79c7a99d6d..e3c20a3ccc93 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -31,6 +31,7 @@ enum {
NFS_DELEGATION_RETURN_IF_CLOSED,
NFS_DELEGATION_REFERENCED,
NFS_DELEGATION_RETURNING,
+ NFS_DELEGATION_REVOKED,
};
int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
@@ -59,6 +60,7 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_
void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
int nfs4_have_delegation(struct inode *inode, fmode_t flags);
+int nfs4_check_delegation(struct inode *inode, fmode_t flags);
#endif
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 4a3d4ef76127..6e62155abf26 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -486,8 +486,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
nfs_setsecurity(dentry->d_inode, entry->fattr, entry->label);
goto out;
} else {
- if (d_invalidate(dentry) != 0)
- goto out;
+ d_invalidate(dentry);
dput(dentry);
}
}
@@ -988,9 +987,13 @@ EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate);
* A check for whether or not the parent directory has changed.
* In the case it has, we assume that the dentries are untrustworthy
* and may need to be looked up again.
+ * If rcu_walk prevents us from performing a full check, return 0.
*/
-static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
+static int nfs_check_verifier(struct inode *dir, struct dentry *dentry,
+ int rcu_walk)
{
+ int ret;
+
if (IS_ROOT(dentry))
return 1;
if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
@@ -998,7 +1001,11 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
if (!nfs_verify_change_attribute(dir, dentry->d_time))
return 0;
/* Revalidate nfsi->cache_change_attribute before we declare a match */
- if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0)
+ if (rcu_walk)
+ ret = nfs_revalidate_inode_rcu(NFS_SERVER(dir), dir);
+ else
+ ret = nfs_revalidate_inode(NFS_SERVER(dir), dir);
+ if (ret < 0)
return 0;
if (!nfs_verify_change_attribute(dir, dentry->d_time))
return 0;
@@ -1042,6 +1049,8 @@ int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags)
out:
return (inode->i_nlink == 0) ? -ENOENT : 0;
out_force:
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
ret = __nfs_revalidate_inode(server, inode);
if (ret != 0)
return ret;
@@ -1054,6 +1063,9 @@ out_force:
*
* If parent mtime has changed, we revalidate, else we wait for a
* period corresponding to the parent's attribute cache timeout value.
+ *
+ * If LOOKUP_RCU prevents us from performing a full check, return 1
+ * suggesting a reval is needed.
*/
static inline
int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
@@ -1064,7 +1076,7 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
return 0;
if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG)
return 1;
- return !nfs_check_verifier(dir, dentry);
+ return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU);
}
/*
@@ -1088,21 +1100,30 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
struct nfs4_label *label = NULL;
int error;
- if (flags & LOOKUP_RCU)
- return -ECHILD;
-
- parent = dget_parent(dentry);
- dir = parent->d_inode;
+ if (flags & LOOKUP_RCU) {
+ parent = ACCESS_ONCE(dentry->d_parent);
+ dir = ACCESS_ONCE(parent->d_inode);
+ if (!dir)
+ return -ECHILD;
+ } else {
+ parent = dget_parent(dentry);
+ dir = parent->d_inode;
+ }
nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
inode = dentry->d_inode;
if (!inode) {
- if (nfs_neg_need_reval(dir, dentry, flags))
+ if (nfs_neg_need_reval(dir, dentry, flags)) {
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
goto out_bad;
+ }
goto out_valid_noent;
}
if (is_bad_inode(inode)) {
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n",
__func__, dentry);
goto out_bad;
@@ -1112,12 +1133,20 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
goto out_set_verifier;
/* Force a full look up iff the parent directory has changed */
- if (!nfs_is_exclusive_create(dir, flags) && nfs_check_verifier(dir, dentry)) {
- if (nfs_lookup_verify_inode(inode, flags))
+ if (!nfs_is_exclusive_create(dir, flags) &&
+ nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) {
+
+ if (nfs_lookup_verify_inode(inode, flags)) {
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
goto out_zap_parent;
+ }
goto out_valid;
}
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
if (NFS_STALE(inode))
goto out_bad;
@@ -1153,13 +1182,18 @@ out_set_verifier:
/* Success: notify readdir to use READDIRPLUS */
nfs_advise_use_readdirplus(dir);
out_valid_noent:
- dput(parent);
+ if (flags & LOOKUP_RCU) {
+ if (parent != ACCESS_ONCE(dentry->d_parent))
+ return -ECHILD;
+ } else
+ dput(parent);
dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n",
__func__, dentry);
return 1;
out_zap_parent:
nfs_zap_caches(dir);
out_bad:
+ WARN_ON(flags & LOOKUP_RCU);
nfs_free_fattr(fattr);
nfs_free_fhandle(fhandle);
nfs4_label_free(label);
@@ -1176,15 +1210,12 @@ out_zap_parent:
if (IS_ROOT(dentry))
goto out_valid;
}
- /* If we have submounts, don't unhash ! */
- if (check_submounts_and_drop(dentry) != 0)
- goto out_valid;
-
dput(parent);
dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n",
__func__, dentry);
return 0;
out_error:
+ WARN_ON(flags & LOOKUP_RCU);
nfs_free_fattr(fattr);
nfs_free_fhandle(fhandle);
nfs4_label_free(label);
@@ -1496,6 +1527,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
case -ENOENT:
d_drop(dentry);
d_add(dentry, NULL);
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
break;
case -EISDIR:
case -ENOTDIR:
@@ -1529,14 +1561,9 @@ EXPORT_SYMBOL_GPL(nfs_atomic_open);
static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
{
- struct dentry *parent = NULL;
struct inode *inode;
- struct inode *dir;
int ret = 0;
- if (flags & LOOKUP_RCU)
- return -ECHILD;
-
if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY))
goto no_open;
if (d_mountpoint(dentry))
@@ -1545,34 +1572,47 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
goto no_open;
inode = dentry->d_inode;
- parent = dget_parent(dentry);
- dir = parent->d_inode;
/* We can't create new files in nfs_open_revalidate(), so we
* optimize away revalidation of negative dentries.
*/
if (inode == NULL) {
+ struct dentry *parent;
+ struct inode *dir;
+
+ if (flags & LOOKUP_RCU) {
+ parent = ACCESS_ONCE(dentry->d_parent);
+ dir = ACCESS_ONCE(parent->d_inode);
+ if (!dir)
+ return -ECHILD;
+ } else {
+ parent = dget_parent(dentry);
+ dir = parent->d_inode;
+ }
if (!nfs_neg_need_reval(dir, dentry, flags))
ret = 1;
+ else if (flags & LOOKUP_RCU)
+ ret = -ECHILD;
+ if (!(flags & LOOKUP_RCU))
+ dput(parent);
+ else if (parent != ACCESS_ONCE(dentry->d_parent))
+ return -ECHILD;
goto out;
}
/* NFS only supports OPEN on regular files */
if (!S_ISREG(inode->i_mode))
- goto no_open_dput;
+ goto no_open;
/* We cannot do exclusive creation on a positive dentry */
if (flags & LOOKUP_EXCL)
- goto no_open_dput;
+ goto no_open;
/* Let f_op->open() actually open (and revalidate) the file */
ret = 1;
out:
- dput(parent);
return ret;
-no_open_dput:
- dput(parent);
no_open:
return nfs_lookup_revalidate(dentry, flags);
}
@@ -2028,10 +2068,14 @@ static DEFINE_SPINLOCK(nfs_access_lru_lock);
static LIST_HEAD(nfs_access_lru_list);
static atomic_long_t nfs_access_nr_entries;
+static unsigned long nfs_access_max_cachesize = ULONG_MAX;
+module_param(nfs_access_max_cachesize, ulong, 0644);
+MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS access maximum total cache length");
+
static void nfs_access_free_entry(struct nfs_access_entry *entry)
{
put_rpccred(entry->cred);
- kfree(entry);
+ kfree_rcu(entry, rcu_head);
smp_mb__before_atomic();
atomic_long_dec(&nfs_access_nr_entries);
smp_mb__after_atomic();
@@ -2048,19 +2092,14 @@ static void nfs_access_free_list(struct list_head *head)
}
}
-unsigned long
-nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long
+nfs_do_access_cache_scan(unsigned int nr_to_scan)
{
LIST_HEAD(head);
struct nfs_inode *nfsi, *next;
struct nfs_access_entry *cache;
- int nr_to_scan = sc->nr_to_scan;
- gfp_t gfp_mask = sc->gfp_mask;
long freed = 0;
- if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
- return SHRINK_STOP;
-
spin_lock(&nfs_access_lru_lock);
list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
struct inode *inode;
@@ -2094,11 +2133,39 @@ remove_lru_entry:
}
unsigned long
+nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ int nr_to_scan = sc->nr_to_scan;
+ gfp_t gfp_mask = sc->gfp_mask;
+
+ if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
+ return SHRINK_STOP;
+ return nfs_do_access_cache_scan(nr_to_scan);
+}
+
+
+unsigned long
nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc)
{
return vfs_pressure_ratio(atomic_long_read(&nfs_access_nr_entries));
}
+static void
+nfs_access_cache_enforce_limit(void)
+{
+ long nr_entries = atomic_long_read(&nfs_access_nr_entries);
+ unsigned long diff;
+ unsigned int nr_to_scan;
+
+ if (nr_entries < 0 || nr_entries <= nfs_access_max_cachesize)
+ return;
+ nr_to_scan = 100;
+ diff = nr_entries - nfs_access_max_cachesize;
+ if (diff < nr_to_scan)
+ nr_to_scan = diff;
+ nfs_do_access_cache_scan(nr_to_scan);
+}
+
static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head)
{
struct rb_root *root_node = &nfsi->access_cache;
@@ -2186,6 +2253,38 @@ out_zap:
return -ENOENT;
}
+static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
+{
+ /* Only check the most recently returned cache entry,
+ * but do it without locking.
+ */
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_access_entry *cache;
+ int err = -ECHILD;
+ struct list_head *lh;
+
+ rcu_read_lock();
+ if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
+ goto out;
+ lh = rcu_dereference(nfsi->access_cache_entry_lru.prev);
+ cache = list_entry(lh, struct nfs_access_entry, lru);
+ if (lh == &nfsi->access_cache_entry_lru ||
+ cred != cache->cred)
+ cache = NULL;
+ if (cache == NULL)
+ goto out;
+ if (!nfs_have_delegated_attributes(inode) &&
+ !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
+ goto out;
+ res->jiffies = cache->jiffies;
+ res->cred = cache->cred;
+ res->mask = cache->mask;
+ err = 0;
+out:
+ rcu_read_unlock();
+ return err;
+}
+
static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set)
{
struct nfs_inode *nfsi = NFS_I(inode);
@@ -2229,6 +2328,11 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
cache->cred = get_rpccred(set->cred);
cache->mask = set->mask;
+ /* The above field assignments must be visible
+ * before this item appears on the lru. We cannot easily
+ * use rcu_assign_pointer, so just force the memory barrier.
+ */
+ smp_wmb();
nfs_access_add_rbtree(inode, cache);
/* Update accounting */
@@ -2244,6 +2348,7 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
&nfs_access_lru_list);
spin_unlock(&nfs_access_lru_lock);
}
+ nfs_access_cache_enforce_limit();
}
EXPORT_SYMBOL_GPL(nfs_access_add_cache);
@@ -2267,10 +2372,16 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
trace_nfs_access_enter(inode);
- status = nfs_access_get_cached(inode, cred, &cache);
+ status = nfs_access_get_cached_rcu(inode, cred, &cache);
+ if (status != 0)
+ status = nfs_access_get_cached(inode, cred, &cache);
if (status == 0)
goto out_cached;
+ status = -ECHILD;
+ if (mask & MAY_NOT_BLOCK)
+ goto out;
+
/* Be clever: ask server to check for all possible rights */
cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ;
cache.cred = cred;
@@ -2321,9 +2432,6 @@ int nfs_permission(struct inode *inode, int mask)
struct rpc_cred *cred;
int res = 0;
- if (mask & MAY_NOT_BLOCK)
- return -ECHILD;
-
nfs_inc_stats(inode, NFSIOS_VFSACCESS);
if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
@@ -2350,12 +2458,23 @@ force_lookup:
if (!NFS_PROTO(inode)->access)
goto out_notsup;
- cred = rpc_lookup_cred();
- if (!IS_ERR(cred)) {
- res = nfs_do_access(inode, cred, mask);
- put_rpccred(cred);
- } else
+ /* Always try fast lookups first */
+ rcu_read_lock();
+ cred = rpc_lookup_cred_nonblock();
+ if (!IS_ERR(cred))
+ res = nfs_do_access(inode, cred, mask|MAY_NOT_BLOCK);
+ else
res = PTR_ERR(cred);
+ rcu_read_unlock();
+ if (res == -ECHILD && !(mask & MAY_NOT_BLOCK)) {
+ /* Fast lookup failed, try the slow way */
+ cred = rpc_lookup_cred();
+ if (!IS_ERR(cred)) {
+ res = nfs_do_access(inode, cred, mask);
+ put_rpccred(cred);
+ } else
+ res = PTR_ERR(cred);
+ }
out:
if (!res && (mask & MAY_EXEC) && !execute_ok(inode))
res = -EACCES;
@@ -2364,6 +2483,9 @@ out:
inode->i_sb->s_id, inode->i_ino, mask, res);
return res;
out_notsup:
+ if (mask & MAY_NOT_BLOCK)
+ return -ECHILD;
+
res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
if (res == 0)
res = generic_permission(inode, mask);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index f11b9eed0de1..10bf07280f4a 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -148,8 +148,8 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
{
struct nfs_writeverf *verfp;
- verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp,
- hdr->data->ds_idx);
+ verfp = nfs_direct_select_verf(dreq, hdr->ds_clp,
+ hdr->ds_idx);
WARN_ON_ONCE(verfp->committed >= 0);
memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
WARN_ON_ONCE(verfp->committed < 0);
@@ -169,8 +169,8 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
{
struct nfs_writeverf *verfp;
- verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp,
- hdr->data->ds_idx);
+ verfp = nfs_direct_select_verf(dreq, hdr->ds_clp,
+ hdr->ds_idx);
if (verfp->committed < 0) {
nfs_direct_set_hdr_verf(dreq, hdr);
return 0;
@@ -178,7 +178,6 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
}
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
/*
* nfs_direct_cmp_commit_data_verf - compare verifier for commit data
* @dreq - direct request possibly spanning multiple servers
@@ -197,7 +196,6 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
WARN_ON_ONCE(verfp->committed < 0);
return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
}
-#endif
/**
* nfs_direct_IO - NFS address space operation for direct I/O
@@ -222,11 +220,9 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t
#else
VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);
- if (rw == READ || rw == KERNEL_READ)
- return nfs_file_direct_read(iocb, iter, pos,
- rw == READ ? true : false);
- return nfs_file_direct_write(iocb, iter, pos,
- rw == WRITE ? true : false);
+ if (rw == READ)
+ return nfs_file_direct_read(iocb, iter, pos);
+ return nfs_file_direct_write(iocb, iter, pos);
#endif /* CONFIG_NFS_SWAP */
}
@@ -270,6 +266,7 @@ static void nfs_direct_req_free(struct kref *kref)
{
struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
+ nfs_free_pnfs_ds_cinfo(&dreq->ds_cinfo);
if (dreq->l_ctx != NULL)
nfs_put_lock_context(dreq->l_ctx);
if (dreq->ctx != NULL)
@@ -512,7 +509,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
* cache.
*/
ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
- loff_t pos, bool uio)
+ loff_t pos)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -576,7 +573,6 @@ out:
return result;
}
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
struct nfs_pageio_descriptor desc;
@@ -700,22 +696,11 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */
}
-#else
-static void nfs_direct_write_schedule_work(struct work_struct *work)
-{
-}
-
-static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
-{
- nfs_direct_complete(dreq, true);
-}
-#endif
-
static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
{
struct nfs_direct_req *dreq = hdr->dreq;
struct nfs_commit_info cinfo;
- int bit = -1;
+ bool request_commit = false;
struct nfs_page *req = nfs_list_entry(hdr->pages.next);
if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
@@ -729,27 +714,20 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
dreq->flags = 0;
dreq->error = hdr->error;
}
- if (dreq->error != 0)
- bit = NFS_IOHDR_ERROR;
- else {
+ if (dreq->error == 0) {
dreq->count += hdr->good_bytes;
- if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) {
- dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
- bit = NFS_IOHDR_NEED_RESCHED;
- } else if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
+ if (nfs_write_need_commit(hdr)) {
if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
- bit = NFS_IOHDR_NEED_RESCHED;
+ request_commit = true;
else if (dreq->flags == 0) {
nfs_direct_set_hdr_verf(dreq, hdr);
- bit = NFS_IOHDR_NEED_COMMIT;
+ request_commit = true;
dreq->flags = NFS_ODIRECT_DO_COMMIT;
} else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) {
- if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr)) {
+ request_commit = true;
+ if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr))
dreq->flags =
NFS_ODIRECT_RESCHED_WRITES;
- bit = NFS_IOHDR_NEED_RESCHED;
- } else
- bit = NFS_IOHDR_NEED_COMMIT;
}
}
}
@@ -759,9 +737,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
req = nfs_list_entry(hdr->pages.next);
nfs_list_remove_request(req);
- switch (bit) {
- case NFS_IOHDR_NEED_RESCHED:
- case NFS_IOHDR_NEED_COMMIT:
+ if (request_commit) {
kref_get(&req->wb_kref);
nfs_mark_request_commit(req, hdr->lseg, &cinfo);
}
@@ -902,7 +878,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
* is no atomic O_APPEND write facility in the NFS protocol.
*/
ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
- loff_t pos, bool uio)
+ loff_t pos)
{
ssize_t result = -EINVAL;
struct file *file = iocb->ki_filp;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 524dd80d1898..2ab6f00dba5b 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -36,6 +36,7 @@
#include "internal.h"
#include "iostat.h"
#include "fscache.h"
+#include "pnfs.h"
#include "nfstrace.h"
@@ -171,7 +172,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
ssize_t result;
if (iocb->ki_filp->f_flags & O_DIRECT)
- return nfs_file_direct_read(iocb, to, iocb->ki_pos, true);
+ return nfs_file_direct_read(iocb, to, iocb->ki_pos);
dprintk("NFS: read(%pD2, %zu@%lu)\n",
iocb->ki_filp,
@@ -327,6 +328,12 @@ static int nfs_want_read_modify_write(struct file *file, struct page *page,
unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
unsigned int end = offset + len;
+ if (pnfs_ld_read_whole_page(file->f_mapping->host)) {
+ if (!PageUptodate(page))
+ return 1;
+ return 0;
+ }
+
if ((file->f_mode & FMODE_READ) && /* open for read? */
!PageUptodate(page) && /* Uptodate? */
!PagePrivate(page) && /* i/o request already? */
@@ -468,17 +475,26 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
- /* Only do I/O if gfp is a superset of GFP_KERNEL, and we're not
- * doing this memory reclaim for a fs-related allocation.
+ /* Always try to initiate a 'commit' if relevant, but only
+ * wait for it if __GFP_WAIT is set. Even then, only wait 1
+ * second and only if the 'bdi' is not congested.
+ * Waiting indefinitely can cause deadlocks when the NFS
+ * server is on this machine, when a new TCP connection is
+ * needed and in other rare cases. There is no particular
+ * need to wait extensively here. A short wait has the
+ * benefit that someone else can worry about the freezer.
*/
- if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL &&
- !(current->flags & PF_FSTRANS)) {
- int how = FLUSH_SYNC;
-
- /* Don't let kswapd deadlock waiting for OOM RPC calls */
- if (current_is_kswapd())
- how = 0;
- nfs_commit_inode(mapping->host, how);
+ if (mapping) {
+ struct nfs_server *nfss = NFS_SERVER(mapping->host);
+ nfs_commit_inode(mapping->host, 0);
+ if ((gfp & __GFP_WAIT) &&
+ !bdi_write_congested(&nfss->backing_dev_info)) {
+ wait_on_page_bit_killable_timeout(page, PG_private,
+ HZ);
+ if (PagePrivate(page))
+ set_bdi_congested(&nfss->backing_dev_info,
+ BLK_RW_ASYNC);
+ }
}
/* If PagePrivate() is set, then the page is not freeable */
if (PagePrivate(page))
@@ -539,13 +555,25 @@ static int nfs_launder_page(struct page *page)
static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
sector_t *span)
{
+ int ret;
+ struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
+
*span = sis->pages;
- return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1);
+
+ rcu_read_lock();
+ ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1);
+ rcu_read_unlock();
+
+ return ret;
}
static void nfs_swap_deactivate(struct file *file)
{
- xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0);
+ struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
+
+ rcu_read_lock();
+ xs_swapper(rcu_dereference(clnt->cl_xprt), 0);
+ rcu_read_unlock();
}
#endif
@@ -648,7 +676,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
return result;
if (file->f_flags & O_DIRECT)
- return nfs_file_direct_write(iocb, from, pos, true);
+ return nfs_file_direct_write(iocb, from, pos);
dprintk("NFS: write(%pD2, %zu@%Ld)\n",
file, count, (long long) pos);
@@ -891,17 +919,6 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
}
EXPORT_SYMBOL_GPL(nfs_flock);
-/*
- * There is no protocol support for leases, so we have no way to implement
- * them correctly in the face of opens by other clients.
- */
-int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
-{
- dprintk("NFS: setlease(%pD2, arg=%ld)\n", file, arg);
- return -EINVAL;
-}
-EXPORT_SYMBOL_GPL(nfs_setlease);
-
const struct file_operations nfs_file_operations = {
.llseek = nfs_file_llseek,
.read = new_sync_read,
@@ -918,6 +935,6 @@ const struct file_operations nfs_file_operations = {
.splice_read = nfs_file_splice_read,
.splice_write = iter_file_splice_write,
.check_flags = nfs_check_flags,
- .setlease = nfs_setlease,
+ .setlease = simple_nosetlease,
};
EXPORT_SYMBOL_GPL(nfs_file_operations);
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index d2eba1c13b7e..7afb52f6a25a 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -84,45 +84,37 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
BUG();
}
-static void filelayout_reset_write(struct nfs_pgio_data *data)
+static void filelayout_reset_write(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
- struct rpc_task *task = &data->task;
+ struct rpc_task *task = &hdr->task;
if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
dprintk("%s Reset task %5u for i/o through MDS "
"(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
- data->task.tk_pid,
+ hdr->task.tk_pid,
hdr->inode->i_sb->s_id,
(unsigned long long)NFS_FILEID(hdr->inode),
- data->args.count,
- (unsigned long long)data->args.offset);
+ hdr->args.count,
+ (unsigned long long)hdr->args.offset);
- task->tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
- &hdr->pages,
- hdr->completion_ops,
- hdr->dreq);
+ task->tk_status = pnfs_write_done_resend_to_mds(hdr);
}
}
-static void filelayout_reset_read(struct nfs_pgio_data *data)
+static void filelayout_reset_read(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
- struct rpc_task *task = &data->task;
+ struct rpc_task *task = &hdr->task;
if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
dprintk("%s Reset task %5u for i/o through MDS "
"(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
- data->task.tk_pid,
+ hdr->task.tk_pid,
hdr->inode->i_sb->s_id,
(unsigned long long)NFS_FILEID(hdr->inode),
- data->args.count,
- (unsigned long long)data->args.offset);
+ hdr->args.count,
+ (unsigned long long)hdr->args.offset);
- task->tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
- &hdr->pages,
- hdr->completion_ops,
- hdr->dreq);
+ task->tk_status = pnfs_read_done_resend_to_mds(hdr);
}
}
@@ -153,9 +145,6 @@ static int filelayout_async_handle_error(struct rpc_task *task,
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
- if (state == NULL)
- break;
- nfs_remove_bad_delegation(state->inode);
case -NFS4ERR_OPENMODE:
if (state == NULL)
break;
@@ -243,18 +232,17 @@ wait_on_recovery:
/* NFS_PROTO call done callback routines */
static int filelayout_read_done_cb(struct rpc_task *task,
- struct nfs_pgio_data *data)
+ struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
int err;
- trace_nfs4_pnfs_read(data, task->tk_status);
- err = filelayout_async_handle_error(task, data->args.context->state,
- data->ds_clp, hdr->lseg);
+ trace_nfs4_pnfs_read(hdr, task->tk_status);
+ err = filelayout_async_handle_error(task, hdr->args.context->state,
+ hdr->ds_clp, hdr->lseg);
switch (err) {
case -NFS4ERR_RESET_TO_MDS:
- filelayout_reset_read(data);
+ filelayout_reset_read(hdr);
return task->tk_status;
case -EAGAIN:
rpc_restart_call_prepare(task);
@@ -270,15 +258,14 @@ static int filelayout_read_done_cb(struct rpc_task *task,
* rfc5661 is not clear about which credential should be used.
*/
static void
-filelayout_set_layoutcommit(struct nfs_pgio_data *wdata)
+filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = wdata->header;
if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
- wdata->res.verf->committed == NFS_FILE_SYNC)
+ hdr->res.verf->committed != NFS_DATA_SYNC)
return;
- pnfs_set_layoutcommit(wdata);
+ pnfs_set_layoutcommit(hdr);
dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
(unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
}
@@ -305,83 +292,82 @@ filelayout_reset_to_mds(struct pnfs_layout_segment *lseg)
*/
static void filelayout_read_prepare(struct rpc_task *task, void *data)
{
- struct nfs_pgio_data *rdata = data;
+ struct nfs_pgio_header *hdr = data;
- if (unlikely(test_bit(NFS_CONTEXT_BAD, &rdata->args.context->flags))) {
+ if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
rpc_exit(task, -EIO);
return;
}
- if (filelayout_reset_to_mds(rdata->header->lseg)) {
+ if (filelayout_reset_to_mds(hdr->lseg)) {
dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
- filelayout_reset_read(rdata);
+ filelayout_reset_read(hdr);
rpc_exit(task, 0);
return;
}
- rdata->pgio_done_cb = filelayout_read_done_cb;
+ hdr->pgio_done_cb = filelayout_read_done_cb;
- if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
- &rdata->args.seq_args,
- &rdata->res.seq_res,
+ if (nfs41_setup_sequence(hdr->ds_clp->cl_session,
+ &hdr->args.seq_args,
+ &hdr->res.seq_res,
task))
return;
- if (nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context,
- rdata->args.lock_context, FMODE_READ) == -EIO)
+ if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+ hdr->args.lock_context, FMODE_READ) == -EIO)
rpc_exit(task, -EIO); /* lost lock, terminate I/O */
}
static void filelayout_read_call_done(struct rpc_task *task, void *data)
{
- struct nfs_pgio_data *rdata = data;
+ struct nfs_pgio_header *hdr = data;
dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
- if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) &&
+ if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
task->tk_status == 0) {
- nfs41_sequence_done(task, &rdata->res.seq_res);
+ nfs41_sequence_done(task, &hdr->res.seq_res);
return;
}
/* Note this may cause RPC to be resent */
- rdata->header->mds_ops->rpc_call_done(task, data);
+ hdr->mds_ops->rpc_call_done(task, data);
}
static void filelayout_read_count_stats(struct rpc_task *task, void *data)
{
- struct nfs_pgio_data *rdata = data;
+ struct nfs_pgio_header *hdr = data;
- rpc_count_iostats(task, NFS_SERVER(rdata->header->inode)->client->cl_metrics);
+ rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
}
static void filelayout_read_release(void *data)
{
- struct nfs_pgio_data *rdata = data;
- struct pnfs_layout_hdr *lo = rdata->header->lseg->pls_layout;
+ struct nfs_pgio_header *hdr = data;
+ struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
filelayout_fenceme(lo->plh_inode, lo);
- nfs_put_client(rdata->ds_clp);
- rdata->header->mds_ops->rpc_release(data);
+ nfs_put_client(hdr->ds_clp);
+ hdr->mds_ops->rpc_release(data);
}
static int filelayout_write_done_cb(struct rpc_task *task,
- struct nfs_pgio_data *data)
+ struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
int err;
- trace_nfs4_pnfs_write(data, task->tk_status);
- err = filelayout_async_handle_error(task, data->args.context->state,
- data->ds_clp, hdr->lseg);
+ trace_nfs4_pnfs_write(hdr, task->tk_status);
+ err = filelayout_async_handle_error(task, hdr->args.context->state,
+ hdr->ds_clp, hdr->lseg);
switch (err) {
case -NFS4ERR_RESET_TO_MDS:
- filelayout_reset_write(data);
+ filelayout_reset_write(hdr);
return task->tk_status;
case -EAGAIN:
rpc_restart_call_prepare(task);
return -EAGAIN;
}
- filelayout_set_layoutcommit(data);
+ filelayout_set_layoutcommit(hdr);
return 0;
}
@@ -414,62 +400,65 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
return -EAGAIN;
}
+ if (data->verf.committed == NFS_UNSTABLE)
+ pnfs_commit_set_layoutcommit(data);
+
return 0;
}
static void filelayout_write_prepare(struct rpc_task *task, void *data)
{
- struct nfs_pgio_data *wdata = data;
+ struct nfs_pgio_header *hdr = data;
- if (unlikely(test_bit(NFS_CONTEXT_BAD, &wdata->args.context->flags))) {
+ if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
rpc_exit(task, -EIO);
return;
}
- if (filelayout_reset_to_mds(wdata->header->lseg)) {
+ if (filelayout_reset_to_mds(hdr->lseg)) {
dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
- filelayout_reset_write(wdata);
+ filelayout_reset_write(hdr);
rpc_exit(task, 0);
return;
}
- if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
- &wdata->args.seq_args,
- &wdata->res.seq_res,
+ if (nfs41_setup_sequence(hdr->ds_clp->cl_session,
+ &hdr->args.seq_args,
+ &hdr->res.seq_res,
task))
return;
- if (nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context,
- wdata->args.lock_context, FMODE_WRITE) == -EIO)
+ if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+ hdr->args.lock_context, FMODE_WRITE) == -EIO)
rpc_exit(task, -EIO); /* lost lock, terminate I/O */
}
static void filelayout_write_call_done(struct rpc_task *task, void *data)
{
- struct nfs_pgio_data *wdata = data;
+ struct nfs_pgio_header *hdr = data;
- if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) &&
+ if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
task->tk_status == 0) {
- nfs41_sequence_done(task, &wdata->res.seq_res);
+ nfs41_sequence_done(task, &hdr->res.seq_res);
return;
}
/* Note this may cause RPC to be resent */
- wdata->header->mds_ops->rpc_call_done(task, data);
+ hdr->mds_ops->rpc_call_done(task, data);
}
static void filelayout_write_count_stats(struct rpc_task *task, void *data)
{
- struct nfs_pgio_data *wdata = data;
+ struct nfs_pgio_header *hdr = data;
- rpc_count_iostats(task, NFS_SERVER(wdata->header->inode)->client->cl_metrics);
+ rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
}
static void filelayout_write_release(void *data)
{
- struct nfs_pgio_data *wdata = data;
- struct pnfs_layout_hdr *lo = wdata->header->lseg->pls_layout;
+ struct nfs_pgio_header *hdr = data;
+ struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
filelayout_fenceme(lo->plh_inode, lo);
- nfs_put_client(wdata->ds_clp);
- wdata->header->mds_ops->rpc_release(data);
+ nfs_put_client(hdr->ds_clp);
+ hdr->mds_ops->rpc_release(data);
}
static void filelayout_commit_prepare(struct rpc_task *task, void *data)
@@ -529,19 +518,18 @@ static const struct rpc_call_ops filelayout_commit_call_ops = {
};
static enum pnfs_try_status
-filelayout_read_pagelist(struct nfs_pgio_data *data)
+filelayout_read_pagelist(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
struct pnfs_layout_segment *lseg = hdr->lseg;
struct nfs4_pnfs_ds *ds;
struct rpc_clnt *ds_clnt;
- loff_t offset = data->args.offset;
+ loff_t offset = hdr->args.offset;
u32 j, idx;
struct nfs_fh *fh;
dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
__func__, hdr->inode->i_ino,
- data->args.pgbase, (size_t)data->args.count, offset);
+ hdr->args.pgbase, (size_t)hdr->args.count, offset);
/* Retrieve the correct rpc_client for the byte range */
j = nfs4_fl_calc_j_index(lseg, offset);
@@ -559,30 +547,29 @@ filelayout_read_pagelist(struct nfs_pgio_data *data)
/* No multipath support. Use first DS */
atomic_inc(&ds->ds_clp->cl_count);
- data->ds_clp = ds->ds_clp;
- data->ds_idx = idx;
+ hdr->ds_clp = ds->ds_clp;
+ hdr->ds_idx = idx;
fh = nfs4_fl_select_ds_fh(lseg, j);
if (fh)
- data->args.fh = fh;
+ hdr->args.fh = fh;
- data->args.offset = filelayout_get_dserver_offset(lseg, offset);
- data->mds_offset = offset;
+ hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
+ hdr->mds_offset = offset;
/* Perform an asynchronous read to ds */
- nfs_initiate_pgio(ds_clnt, data,
+ nfs_initiate_pgio(ds_clnt, hdr,
&filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN);
return PNFS_ATTEMPTED;
}
/* Perform async writes. */
static enum pnfs_try_status
-filelayout_write_pagelist(struct nfs_pgio_data *data, int sync)
+filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
{
- struct nfs_pgio_header *hdr = data->header;
struct pnfs_layout_segment *lseg = hdr->lseg;
struct nfs4_pnfs_ds *ds;
struct rpc_clnt *ds_clnt;
- loff_t offset = data->args.offset;
+ loff_t offset = hdr->args.offset;
u32 j, idx;
struct nfs_fh *fh;
@@ -598,21 +585,20 @@ filelayout_write_pagelist(struct nfs_pgio_data *data, int sync)
return PNFS_NOT_ATTEMPTED;
dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n",
- __func__, hdr->inode->i_ino, sync, (size_t) data->args.count,
+ __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count));
- data->pgio_done_cb = filelayout_write_done_cb;
+ hdr->pgio_done_cb = filelayout_write_done_cb;
atomic_inc(&ds->ds_clp->cl_count);
- data->ds_clp = ds->ds_clp;
- data->ds_idx = idx;
+ hdr->ds_clp = ds->ds_clp;
+ hdr->ds_idx = idx;
fh = nfs4_fl_select_ds_fh(lseg, j);
if (fh)
- data->args.fh = fh;
-
- data->args.offset = filelayout_get_dserver_offset(lseg, offset);
+ hdr->args.fh = fh;
+ hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
/* Perform an asynchronous write */
- nfs_initiate_pgio(ds_clnt, data,
+ nfs_initiate_pgio(ds_clnt, hdr,
&filelayout_write_call_ops, sync,
RPC_TASK_SOFTCONN);
return PNFS_ATTEMPTED;
@@ -660,18 +646,15 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
}
/* find and reference the deviceid */
- d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
- NFS_SERVER(lo->plh_inode)->nfs_client, id);
- if (d == NULL) {
- dsaddr = filelayout_get_device_info(lo->plh_inode, id,
- lo->plh_lc_cred, gfp_flags);
- if (dsaddr == NULL)
- goto out;
- } else
- dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
+ d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), id,
+ lo->plh_lc_cred, gfp_flags);
+ if (d == NULL)
+ goto out;
+
+ dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
/* Found deviceid is unavailable */
if (filelayout_test_devid_unavailable(&dsaddr->id_node))
- goto out_put;
+ goto out_put;
fl->dsaddr = dsaddr;
@@ -1023,6 +1006,7 @@ static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
/* The generic layer is about to remove the req from the commit list.
* If this will make the bucket empty, it will need to put the lseg reference.
+ * Note this is must be called holding the inode (/cinfo) lock
*/
static void
filelayout_clear_request_commit(struct nfs_page *req,
@@ -1030,7 +1014,6 @@ filelayout_clear_request_commit(struct nfs_page *req,
{
struct pnfs_layout_segment *freeme = NULL;
- spin_lock(cinfo->lock);
if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
goto out;
cinfo->ds->nwritten--;
@@ -1045,22 +1028,25 @@ filelayout_clear_request_commit(struct nfs_page *req,
}
out:
nfs_request_remove_commit_list(req, cinfo);
- spin_unlock(cinfo->lock);
- pnfs_put_lseg(freeme);
+ pnfs_put_lseg_locked(freeme);
}
-static struct list_head *
-filelayout_choose_commit_list(struct nfs_page *req,
- struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo)
+static void
+filelayout_mark_request_commit(struct nfs_page *req,
+ struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo)
+
{
struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
u32 i, j;
struct list_head *list;
struct pnfs_commit_bucket *buckets;
- if (fl->commit_through_mds)
- return &cinfo->mds->list;
+ if (fl->commit_through_mds) {
+ list = &cinfo->mds->list;
+ spin_lock(cinfo->lock);
+ goto mds_commit;
+ }
/* Note that we are calling nfs4_fl_calc_j_index on each page
* that ends up being committed to a data server. An attractive
@@ -1084,19 +1070,22 @@ filelayout_choose_commit_list(struct nfs_page *req,
}
set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
cinfo->ds->nwritten++;
- spin_unlock(cinfo->lock);
- return list;
-}
-
-static void
-filelayout_mark_request_commit(struct nfs_page *req,
- struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo)
-{
- struct list_head *list;
- list = filelayout_choose_commit_list(req, lseg, cinfo);
- nfs_request_add_commit_list(req, list, cinfo);
+mds_commit:
+ /* nfs_request_add_commit_list(). We need to add req to list without
+ * dropping cinfo lock.
+ */
+ set_bit(PG_CLEAN, &(req)->wb_flags);
+ nfs_list_add_request(req, list);
+ cinfo->mds->ncommit++;
+ spin_unlock(cinfo->lock);
+ if (!cinfo->dreq) {
+ inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+ inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
+ BDI_RECLAIMABLE);
+ __mark_inode_dirty(req->wb_context->dentry->d_inode,
+ I_DIRTY_DATASYNC);
+ }
}
static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
@@ -1244,15 +1233,64 @@ restart:
spin_unlock(cinfo->lock);
}
+/* filelayout_search_commit_reqs - Search lists in @cinfo for the head reqest
+ * for @page
+ * @cinfo - commit info for current inode
+ * @page - page to search for matching head request
+ *
+ * Returns a the head request if one is found, otherwise returns NULL.
+ */
+static struct nfs_page *
+filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
+{
+ struct nfs_page *freq, *t;
+ struct pnfs_commit_bucket *b;
+ int i;
+
+ /* Linearly search the commit lists for each bucket until a matching
+ * request is found */
+ for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
+ list_for_each_entry_safe(freq, t, &b->written, wb_list) {
+ if (freq->wb_page == page)
+ return freq->wb_head;
+ }
+ list_for_each_entry_safe(freq, t, &b->committing, wb_list) {
+ if (freq->wb_page == page)
+ return freq->wb_head;
+ }
+ }
+
+ return NULL;
+}
+
+static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx)
+{
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+ struct pnfs_commit_bucket *bucket;
+ struct pnfs_layout_segment *freeme;
+ int i;
+
+ for (i = idx; i < fl_cinfo->nbuckets; i++) {
+ bucket = &fl_cinfo->buckets[i];
+ if (list_empty(&bucket->committing))
+ continue;
+ nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
+ spin_lock(cinfo->lock);
+ freeme = bucket->clseg;
+ bucket->clseg = NULL;
+ spin_unlock(cinfo->lock);
+ pnfs_put_lseg(freeme);
+ }
+}
+
static unsigned int
alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
{
struct pnfs_ds_commit_info *fl_cinfo;
struct pnfs_commit_bucket *bucket;
struct nfs_commit_data *data;
- int i, j;
+ int i;
unsigned int nreq = 0;
- struct pnfs_layout_segment *freeme;
fl_cinfo = cinfo->ds;
bucket = fl_cinfo->buckets;
@@ -1272,16 +1310,7 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
}
/* Clean up on error */
- for (j = i; j < fl_cinfo->nbuckets; j++, bucket++) {
- if (list_empty(&bucket->committing))
- continue;
- nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
- spin_lock(cinfo->lock);
- freeme = bucket->clseg;
- bucket->clseg = NULL;
- spin_unlock(cinfo->lock);
- pnfs_put_lseg(freeme);
- }
+ filelayout_retry_commit(cinfo, i);
/* Caller will clean up entries put on list */
return nreq;
}
@@ -1301,8 +1330,12 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
data->lseg = NULL;
list_add(&data->pages, &list);
nreq++;
- } else
+ } else {
nfs_retry_commit(mds_pages, NULL, cinfo);
+ filelayout_retry_commit(cinfo, 0);
+ cinfo->completion_ops->error_cleanup(NFS_I(inode));
+ return -ENOMEM;
+ }
}
nreq += alloc_ds_commits(cinfo, &list);
@@ -1332,6 +1365,17 @@ out:
cinfo->ds->ncommitting = 0;
return PNFS_ATTEMPTED;
}
+static struct nfs4_deviceid_node *
+filelayout_alloc_deviceid_node(struct nfs_server *server,
+ struct pnfs_device *pdev, gfp_t gfp_flags)
+{
+ struct nfs4_file_layout_dsaddr *dsaddr;
+
+ dsaddr = nfs4_fl_alloc_deviceid_node(server, pdev, gfp_flags);
+ if (!dsaddr)
+ return NULL;
+ return &dsaddr->id_node;
+}
static void
filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
@@ -1380,9 +1424,11 @@ static struct pnfs_layoutdriver_type filelayout_type = {
.clear_request_commit = filelayout_clear_request_commit,
.scan_commit_lists = filelayout_scan_commit_lists,
.recover_commit_reqs = filelayout_recover_commit_reqs,
+ .search_commit_reqs = filelayout_search_commit_reqs,
.commit_pagelist = filelayout_commit_pagelist,
.read_pagelist = filelayout_read_pagelist,
.write_pagelist = filelayout_write_pagelist,
+ .alloc_deviceid_node = filelayout_alloc_deviceid_node,
.free_deviceid_node = filelayout_free_deveiceid_node,
};
diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h
index ffbddf2219ea..7c9f800c49d7 100644
--- a/fs/nfs/filelayout/filelayout.h
+++ b/fs/nfs/filelayout/filelayout.h
@@ -147,10 +147,11 @@ u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
u32 ds_idx);
+
+extern struct nfs4_file_layout_dsaddr *
+nfs4_fl_alloc_deviceid_node(struct nfs_server *server,
+ struct pnfs_device *pdev, gfp_t gfp_flags);
extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
-struct nfs4_file_layout_dsaddr *
-filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id,
- struct rpc_cred *cred, gfp_t gfp_flags);
#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index e2a0361e24c6..9bb806a76d99 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -484,8 +484,9 @@ out_err:
}
/* Decode opaque device data and return the result */
-static struct nfs4_file_layout_dsaddr*
-decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
+struct nfs4_file_layout_dsaddr *
+nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+ gfp_t gfp_flags)
{
int i;
u32 cnt, num;
@@ -570,10 +571,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
dsaddr->stripe_indices = stripe_indices;
stripe_indices = NULL;
dsaddr->ds_num = num;
- nfs4_init_deviceid_node(&dsaddr->id_node,
- NFS_SERVER(ino)->pnfs_curr_ld,
- NFS_SERVER(ino)->nfs_client,
- &pdev->dev_id);
+ nfs4_init_deviceid_node(&dsaddr->id_node, server, &pdev->dev_id);
INIT_LIST_HEAD(&dsaddrs);
@@ -587,7 +585,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
mp_count = be32_to_cpup(p); /* multipath count */
for (j = 0; j < mp_count; j++) {
- da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->cl_net,
+ da = decode_ds_addr(server->nfs_client->cl_net,
&stream, gfp_flags);
if (da)
list_add_tail(&da->da_node, &dsaddrs);
@@ -637,102 +635,6 @@ out_err:
return NULL;
}
-/*
- * Decode the opaque device specified in 'dev' and add it to the cache of
- * available devices.
- */
-static struct nfs4_file_layout_dsaddr *
-decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags)
-{
- struct nfs4_deviceid_node *d;
- struct nfs4_file_layout_dsaddr *n, *new;
-
- new = decode_device(inode, dev, gfp_flags);
- if (!new) {
- printk(KERN_WARNING "NFS: %s: Could not decode or add device\n",
- __func__);
- return NULL;
- }
-
- d = nfs4_insert_deviceid_node(&new->id_node);
- n = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
- if (n != new) {
- nfs4_fl_free_deviceid(new);
- return n;
- }
-
- return new;
-}
-
-/*
- * Retrieve the information for dev_id, add it to the list
- * of available devices, and return it.
- */
-struct nfs4_file_layout_dsaddr *
-filelayout_get_device_info(struct inode *inode,
- struct nfs4_deviceid *dev_id,
- struct rpc_cred *cred,
- gfp_t gfp_flags)
-{
- struct pnfs_device *pdev = NULL;
- u32 max_resp_sz;
- int max_pages;
- struct page **pages = NULL;
- struct nfs4_file_layout_dsaddr *dsaddr = NULL;
- int rc, i;
- struct nfs_server *server = NFS_SERVER(inode);
-
- /*
- * Use the session max response size as the basis for setting
- * GETDEVICEINFO's maxcount
- */
- max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
- max_pages = nfs_page_array_len(0, max_resp_sz);
- dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
- __func__, inode, max_resp_sz, max_pages);
-
- pdev = kzalloc(sizeof(struct pnfs_device), gfp_flags);
- if (pdev == NULL)
- return NULL;
-
- pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags);
- if (pages == NULL) {
- kfree(pdev);
- return NULL;
- }
- for (i = 0; i < max_pages; i++) {
- pages[i] = alloc_page(gfp_flags);
- if (!pages[i])
- goto out_free;
- }
-
- memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
- pdev->layout_type = LAYOUT_NFSV4_1_FILES;
- pdev->pages = pages;
- pdev->pgbase = 0;
- pdev->pglen = max_resp_sz;
- pdev->mincount = 0;
- pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
-
- rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
- dprintk("%s getdevice info returns %d\n", __func__, rc);
- if (rc)
- goto out_free;
-
- /*
- * Found new device, need to decode it and then add it to the
- * list of known devices for this mountpoint.
- */
- dsaddr = decode_and_add_device(inode, pdev, gfp_flags);
-out_free:
- for (i = 0; i < max_pages; i++)
- __free_page(pages[i]);
- kfree(pages);
- kfree(pdev);
- dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
- return dsaddr;
-}
-
void
nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
{
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
index 7cf2c4699b08..777b055063f6 100644
--- a/fs/nfs/fscache-index.c
+++ b/fs/nfs/fscache-index.c
@@ -74,11 +74,10 @@ static uint16_t nfs_server_get_key(const void *cookie_netfs_data,
struct nfs_server_key *key = buffer;
uint16_t len = sizeof(struct nfs_server_key);
+ memset(key, 0, len);
key->nfsversion = clp->rpc_ops->version;
key->family = clp->cl_addr.ss_family;
- memset(key, 0, len);
-
switch (clp->cl_addr.ss_family) {
case AF_INET:
key->port = sin->sin_port;
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b94f80420a58..880618a8b048 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -112,7 +112,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
* if the dentry tree reaches them; however if the dentry already
* exists, we'll pick it up at this point and use it as the root
*/
- ret = d_obtain_alias(inode);
+ ret = d_obtain_root(inode);
if (IS_ERR(ret)) {
dprintk("nfs_get_root: get root dentry failed\n");
goto out;
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 7dd55b745c4d..2f5db844c172 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -177,7 +177,6 @@ static struct key_type key_type_id_resolver = {
.preparse = user_preparse,
.free_preparse = user_free_preparse,
.instantiate = generic_key_instantiate,
- .match = user_match,
.revoke = user_revoke,
.destroy = user_destroy,
.describe = user_describe,
@@ -401,7 +400,6 @@ static struct key_type key_type_id_resolver_legacy = {
.preparse = user_preparse,
.free_preparse = user_free_preparse,
.instantiate = generic_key_instantiate,
- .match = user_match,
.revoke = user_revoke,
.destroy = user_destroy,
.describe = user_describe,
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index abd37a380535..00689a8a85e4 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -505,7 +505,9 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
attr->ia_valid &= ~ATTR_MODE;
if (attr->ia_valid & ATTR_SIZE) {
- if (!S_ISREG(inode->i_mode) || attr->ia_size == i_size_read(inode))
+ BUG_ON(!S_ISREG(inode->i_mode));
+
+ if (attr->ia_size == i_size_read(inode))
attr->ia_valid &= ~ATTR_SIZE;
}
@@ -624,7 +626,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
- int err;
+ int err = 0;
trace_nfs_getattr_enter(inode);
/* Flush out writes to the server in order to update c/mtime. */
@@ -716,6 +718,7 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
kfree(new);
return res;
}
+EXPORT_SYMBOL_GPL(nfs_get_lock_context);
void nfs_put_lock_context(struct nfs_lock_context *l_ctx)
{
@@ -728,6 +731,7 @@ void nfs_put_lock_context(struct nfs_lock_context *l_ctx)
spin_unlock(&inode->i_lock);
kfree(l_ctx);
}
+EXPORT_SYMBOL_GPL(nfs_put_lock_context);
/**
* nfs_close_context - Common close_context() routine NFSv2/v3
@@ -1002,6 +1006,15 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
}
EXPORT_SYMBOL_GPL(nfs_revalidate_inode);
+int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *inode)
+{
+ if (!(NFS_I(inode)->cache_validity &
+ (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
+ && !nfs_attribute_cache_expired(inode))
+ return NFS_STALE(inode) ? -ESTALE : 0;
+ return -ECHILD;
+}
+
static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
{
struct nfs_inode *nfsi = NFS_I(inode);
@@ -1840,11 +1853,12 @@ EXPORT_SYMBOL_GPL(nfs_net_id);
static int nfs_net_init(struct net *net)
{
nfs_clients_init(net);
- return 0;
+ return nfs_fs_proc_net_init(net);
}
static void nfs_net_exit(struct net *net)
{
+ nfs_fs_proc_net_exit(net);
nfs_cleanup_cb_ident_idr(net);
}
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 617f36611d4a..efaa31c70fbe 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -195,7 +195,16 @@ extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
#ifdef CONFIG_PROC_FS
extern int __init nfs_fs_proc_init(void);
extern void nfs_fs_proc_exit(void);
+extern int nfs_fs_proc_net_init(struct net *net);
+extern void nfs_fs_proc_net_exit(struct net *net);
#else
+static inline int nfs_fs_proc_net_init(struct net *net)
+{
+ return 0;
+}
+static inline void nfs_fs_proc_net_exit(struct net *net)
+{
+}
static inline int nfs_fs_proc_init(void)
{
return 0;
@@ -209,13 +218,6 @@ static inline void nfs_fs_proc_exit(void)
int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *);
#endif
-/* nfs3client.c */
-#if IS_ENABLED(CONFIG_NFS_V3)
-struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *);
-struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
- struct nfs_fattr *, rpc_authflavor_t);
-#endif
-
/* callback_xdr.c */
extern struct svc_version nfs4_callback_version1;
extern struct svc_version nfs4_callback_version4;
@@ -238,11 +240,11 @@ void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
int nfs_iocounter_wait(struct nfs_io_counter *c);
extern const struct nfs_pageio_ops nfs_pgio_rw_ops;
-struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *);
-void nfs_rw_header_free(struct nfs_pgio_header *);
-void nfs_pgio_data_release(struct nfs_pgio_data *);
+struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
+void nfs_pgio_header_free(struct nfs_pgio_header *);
+void nfs_pgio_data_destroy(struct nfs_pgio_header *);
int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
-int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_data *,
+int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_header *,
const struct rpc_call_ops *, int, int);
void nfs_free_request(struct nfs_page *req);
@@ -337,7 +339,6 @@ int nfs_file_release(struct inode *, struct file *);
int nfs_lock(struct file *, int, struct file_lock *);
int nfs_flock(struct file *, int, struct file_lock *);
int nfs_check_flags(int);
-int nfs_setlease(struct file *, long, struct file_lock **);
/* inode.c */
extern struct workqueue_struct *nfsiod_workqueue;
@@ -442,6 +443,7 @@ int nfs_scan_commit(struct inode *inode, struct list_head *dst,
void nfs_mark_request_commit(struct nfs_page *req,
struct pnfs_layout_segment *lseg,
struct nfs_commit_info *cinfo);
+int nfs_write_need_commit(struct nfs_pgio_header *);
int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
int how, struct nfs_commit_info *cinfo);
void nfs_retry_commit(struct list_head *page_list,
@@ -482,7 +484,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode)
extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
/* nfs4proc.c */
-extern void __nfs4_read_done_cb(struct nfs_pgio_data *);
+extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
const struct rpc_timeout *timeparms,
const char *ip_addr);
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index 8ee1fab83268..f0e06e4acbef 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -19,6 +19,7 @@ struct nfs_net {
struct rpc_pipe *bl_device_pipe;
struct bl_dev_msg bl_mount_reply;
wait_queue_head_t bl_wq;
+ struct mutex bl_mutex;
struct list_head nfs_client_list;
struct list_head nfs_volume_list;
#if IS_ENABLED(CONFIG_NFS_V4)
@@ -29,6 +30,9 @@ struct nfs_net {
#endif
spinlock_t nfs_client_lock;
struct timespec boot_time;
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *proc_nfsfs;
+#endif
};
extern int nfs_net_id;
diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h
new file mode 100644
index 000000000000..333ae4068506
--- /dev/null
+++ b/fs/nfs/nfs3_fs.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2014 Anna Schumaker.
+ *
+ * NFSv3-specific filesystem definitions and declarations
+ */
+#ifndef __LINUX_FS_NFS_NFS3_FS_H
+#define __LINUX_FS_NFS_NFS3_FS_H
+
+/*
+ * nfs3acl.c
+ */
+#ifdef CONFIG_NFS_V3_ACL
+extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type);
+extern int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+ struct posix_acl *dfacl);
+extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t);
+extern const struct xattr_handler *nfs3_xattr_handlers[];
+#else
+static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+ struct posix_acl *dfacl)
+{
+ return 0;
+}
+#define nfs3_listxattr NULL
+#endif /* CONFIG_NFS_V3_ACL */
+
+/* nfs3client.c */
+struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *);
+struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
+ struct nfs_fattr *, rpc_authflavor_t);
+
+
+#endif /* __LINUX_FS_NFS_NFS3_FS_H */
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 8f854dde4150..658e586ca438 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -7,6 +7,7 @@
#include <linux/nfsacl.h>
#include "internal.h"
+#include "nfs3_fs.h"
#define NFSDBG_FACILITY NFSDBG_PROC
@@ -129,7 +130,10 @@ static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
.rpc_argp = &args,
.rpc_resp = &fattr,
};
- int status;
+ int status = 0;
+
+ if (acl == NULL && (!S_ISDIR(inode->i_mode) || dfacl == NULL))
+ goto out;
status = -EOPNOTSUPP;
if (!nfs_server_capable(inode, NFS_CAP_ACLS))
@@ -256,7 +260,7 @@ nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data,
char *p = data + *result;
acl = get_acl(inode, type);
- if (!acl)
+ if (IS_ERR_OR_NULL(acl))
return 0;
posix_acl_release(acl);
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index b3fc65ef39ca..8c1b437c5403 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -1,6 +1,7 @@
#include <linux/nfs_fs.h>
#include <linux/nfs_mount.h>
#include "internal.h"
+#include "nfs3_fs.h"
#ifdef CONFIG_NFS_V3_ACL
static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program };
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index f0afa291fd58..524f9f837408 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -22,6 +22,7 @@
#include "iostat.h"
#include "internal.h"
+#include "nfs3_fs.h"
#define NFSDBG_FACILITY NFSDBG_PROC
@@ -795,41 +796,44 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
return status;
}
-static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
{
- struct inode *inode = data->header->inode;
+ struct inode *inode = hdr->inode;
if (nfs3_async_handle_jukebox(task, inode))
return -EAGAIN;
nfs_invalidate_atime(inode);
- nfs_refresh_inode(inode, &data->fattr);
+ nfs_refresh_inode(inode, &hdr->fattr);
return 0;
}
-static void nfs3_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs3_proc_read_setup(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg)
{
msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
}
-static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
{
rpc_call_start(task);
return 0;
}
-static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
{
- struct inode *inode = data->header->inode;
+ struct inode *inode = hdr->inode;
if (nfs3_async_handle_jukebox(task, inode))
return -EAGAIN;
if (task->tk_status >= 0)
- nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
+ nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr);
return 0;
}
-static void nfs3_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs3_proc_write_setup(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg)
{
msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE];
}
diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c
index d6a98949af19..6af29c2da352 100644
--- a/fs/nfs/nfs3super.c
+++ b/fs/nfs/nfs3super.c
@@ -4,6 +4,7 @@
#include <linux/module.h>
#include <linux/nfs_fs.h>
#include "internal.h"
+#include "nfs3_fs.h"
#include "nfs.h"
static struct nfs_subversion nfs_v3 = {
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
new file mode 100644
index 000000000000..d10333a197bf
--- /dev/null
+++ b/fs/nfs/nfs42.h
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com>
+ */
+
+#ifndef __LINUX_FS_NFS_NFS4_2_H
+#define __LINUX_FS_NFS_NFS4_2_H
+
+/* nfs4.2proc.c */
+loff_t nfs42_proc_llseek(struct file *, loff_t, int);
+
+/* nfs4.2xdr.h */
+extern struct rpc_procinfo nfs4_2_procedures[];
+
+#endif /* __LINUX_FS_NFS_NFS4_2_H */
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
new file mode 100644
index 000000000000..0886f1db5917
--- /dev/null
+++ b/fs/nfs/nfs42proc.c
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com>
+ */
+#include <linux/fs.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/nfs.h>
+#include <linux/nfs3.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_xdr.h>
+#include <linux/nfs_fs.h>
+#include "nfs4_fs.h"
+#include "nfs42.h"
+
+static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file,
+ fmode_t fmode)
+{
+ struct nfs_open_context *open;
+ struct nfs_lock_context *lock;
+ int ret;
+
+ open = get_nfs_open_context(nfs_file_open_context(file));
+ lock = nfs_get_lock_context(open);
+ if (IS_ERR(lock)) {
+ put_nfs_open_context(open);
+ return PTR_ERR(lock);
+ }
+
+ ret = nfs4_set_rw_stateid(dst, open, lock, fmode);
+
+ nfs_put_lock_context(lock);
+ put_nfs_open_context(open);
+ return ret;
+}
+
+loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
+{
+ struct inode *inode = file_inode(filep);
+ struct nfs42_seek_args args = {
+ .sa_fh = NFS_FH(inode),
+ .sa_offset = offset,
+ .sa_what = (whence == SEEK_HOLE) ?
+ NFS4_CONTENT_HOLE : NFS4_CONTENT_DATA,
+ };
+ struct nfs42_seek_res res;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEEK],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ struct nfs_server *server = NFS_SERVER(inode);
+ int status;
+
+ if (!(server->caps & NFS_CAP_SEEK))
+ return -ENOTSUPP;
+
+ status = nfs42_set_rw_stateid(&args.sa_stateid, filep, FMODE_READ);
+ if (status)
+ return status;
+
+ nfs_wb_all(inode);
+ status = nfs4_call_sync(server->client, server, &msg,
+ &args.seq_args, &res.seq_res, 0);
+ if (status == -ENOTSUPP)
+ server->caps &= ~NFS_CAP_SEEK;
+ if (status)
+ return status;
+
+ return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes);
+}
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
new file mode 100644
index 000000000000..c90469b604b8
--- /dev/null
+++ b/fs/nfs/nfs42xdr.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com>
+ */
+#ifndef __LINUX_FS_NFS_NFS4_2XDR_H
+#define __LINUX_FS_NFS_NFS4_2XDR_H
+
+#define encode_seek_maxsz (op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + \
+ 2 /* offset */ + \
+ 1 /* whence */)
+#define decode_seek_maxsz (op_decode_hdr_maxsz + \
+ 1 /* eof */ + \
+ 1 /* whence */ + \
+ 2 /* offset */ + \
+ 2 /* length */)
+
+#define NFS4_enc_seek_sz (compound_encode_hdr_maxsz + \
+ encode_putfh_maxsz + \
+ encode_seek_maxsz)
+#define NFS4_dec_seek_sz (compound_decode_hdr_maxsz + \
+ decode_putfh_maxsz + \
+ decode_seek_maxsz)
+
+
+static void encode_seek(struct xdr_stream *xdr,
+ struct nfs42_seek_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_SEEK, decode_seek_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->sa_stateid);
+ encode_uint64(xdr, args->sa_offset);
+ encode_uint32(xdr, args->sa_what);
+}
+
+/*
+ * Encode SEEK request
+ */
+static void nfs4_xdr_enc_seek(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct nfs42_seek_args *args)
+{
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->sa_fh, &hdr);
+ encode_seek(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res)
+{
+ int status;
+ __be32 *p;
+
+ status = decode_op_hdr(xdr, OP_SEEK);
+ if (status)
+ return status;
+
+ p = xdr_inline_decode(xdr, 4 + 8);
+ if (unlikely(!p))
+ goto out_overflow;
+
+ res->sr_eof = be32_to_cpup(p++);
+ p = xdr_decode_hyper(p, &res->sr_offset);
+ return 0;
+
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
+
+/*
+ * Decode SEEK request
+ */
+static int nfs4_xdr_dec_seek(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfs42_seek_res *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_seek(xdr, res);
+out:
+ return status;
+}
+#endif /* __LINUX_FS_NFS_NFS4_2XDR_H */
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ba2affa51941..be6cac37ea10 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -54,7 +54,7 @@ struct nfs4_minor_version_ops {
const nfs4_stateid *);
int (*find_root_sec)(struct nfs_server *, struct nfs_fh *,
struct nfs_fsinfo *);
- int (*free_lock_state)(struct nfs_server *,
+ void (*free_lock_state)(struct nfs_server *,
struct nfs4_lock_state *);
const struct rpc_call_ops *call_sync_ops;
const struct nfs4_state_recovery_ops *reboot_recovery_ops;
@@ -129,17 +129,6 @@ enum {
* LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN)
*/
-struct nfs4_lock_owner {
- unsigned int lo_type;
-#define NFS4_ANY_LOCK_TYPE (0U)
-#define NFS4_FLOCK_LOCK_TYPE (1U << 0)
-#define NFS4_POSIX_LOCK_TYPE (1U << 1)
- union {
- fl_owner_t posix_owner;
- pid_t flock_owner;
- } lo_u;
-};
-
struct nfs4_lock_state {
struct list_head ls_locks; /* Other lock stateids */
struct nfs4_state * ls_state; /* Pointer to open state */
@@ -149,7 +138,7 @@ struct nfs4_lock_state {
struct nfs_seqid_counter ls_seqid;
nfs4_stateid ls_stateid;
atomic_t ls_count;
- struct nfs4_lock_owner ls_owner;
+ fl_owner_t ls_owner;
};
/* bits for nfs4_state->flags */
@@ -237,6 +226,9 @@ int nfs4_replace_transport(struct nfs_server *server,
const struct nfs4_fs_locations *locations);
/* nfs4proc.c */
+extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *,
+ struct rpc_message *, struct nfs4_sequence_args *,
+ struct nfs4_sequence_res *, int);
extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool);
@@ -337,11 +329,11 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,
*/
static inline void
nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
- struct rpc_message *msg, struct nfs_pgio_data *wdata)
+ struct rpc_message *msg, struct nfs_pgio_header *hdr)
{
if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) &&
!test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags))
- wdata->args.stable = NFS_FILE_SYNC;
+ hdr->args.stable = NFS_FILE_SYNC;
}
#else /* CONFIG_NFS_v4_1 */
static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
@@ -369,7 +361,7 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_flags,
static inline void
nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
- struct rpc_message *msg, struct nfs_pgio_data *wdata)
+ struct rpc_message *msg, struct nfs_pgio_header *hdr)
{
}
#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index aa9ef4876046..ffdb28d86cf8 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -482,6 +482,16 @@ int nfs40_walk_client_list(struct nfs_client *new,
spin_lock(&nn->nfs_client_lock);
list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
+
+ if (pos->rpc_ops != new->rpc_ops)
+ continue;
+
+ if (pos->cl_proto != new->cl_proto)
+ continue;
+
+ if (pos->cl_minorversion != new->cl_minorversion)
+ continue;
+
/* If "pos" isn't marked ready, we can't trust the
* remaining fields in "pos" */
if (pos->cl_cons_state > NFS_CS_READY) {
@@ -501,15 +511,6 @@ int nfs40_walk_client_list(struct nfs_client *new,
if (pos->cl_cons_state != NFS_CS_READY)
continue;
- if (pos->rpc_ops != new->rpc_ops)
- continue;
-
- if (pos->cl_proto != new->cl_proto)
- continue;
-
- if (pos->cl_minorversion != new->cl_minorversion)
- continue;
-
if (pos->cl_clientid != new->cl_clientid)
continue;
@@ -622,6 +623,16 @@ int nfs41_walk_client_list(struct nfs_client *new,
spin_lock(&nn->nfs_client_lock);
list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
+
+ if (pos->rpc_ops != new->rpc_ops)
+ continue;
+
+ if (pos->cl_proto != new->cl_proto)
+ continue;
+
+ if (pos->cl_minorversion != new->cl_minorversion)
+ continue;
+
/* If "pos" isn't marked ready, we can't trust the
* remaining fields in "pos", especially the client
* ID and serverowner fields. Wait for CREATE_SESSION
@@ -647,15 +658,6 @@ int nfs41_walk_client_list(struct nfs_client *new,
if (pos->cl_cons_state != NFS_CS_READY)
continue;
- if (pos->rpc_ops != new->rpc_ops)
- continue;
-
- if (pos->cl_proto != new->cl_proto)
- continue;
-
- if (pos->cl_minorversion != new->cl_minorversion)
- continue;
-
if (!nfs4_match_clientids(pos, new))
continue;
@@ -855,6 +857,11 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
};
struct rpc_timeout ds_timeout;
struct nfs_client *clp;
+ char buf[INET6_ADDRSTRLEN + 1];
+
+ if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0)
+ return ERR_PTR(-EINVAL);
+ cl_init.hostname = buf;
/*
* Set an authflavor equual to the MDS value. Use the MDS nfs_client
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index a816f0627a6c..c51fb4db9bfe 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -8,6 +8,10 @@
#include "fscache.h"
#include "pnfs.h"
+#ifdef CONFIG_NFS_V4_2
+#include "nfs42.h"
+#endif
+
#define NFSDBG_FACILITY NFSDBG_FILE
static int
@@ -115,8 +119,29 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
return ret;
}
+#ifdef CONFIG_NFS_V4_2
+static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
+{
+ loff_t ret;
+
+ switch (whence) {
+ case SEEK_HOLE:
+ case SEEK_DATA:
+ ret = nfs42_proc_llseek(filep, offset, whence);
+ if (ret != -ENOTSUPP)
+ return ret;
+ default:
+ return nfs_file_llseek(filep, offset, whence);
+ }
+}
+#endif /* CONFIG_NFS_V4_2 */
+
const struct file_operations nfs4_file_operations = {
+#ifdef CONFIG_NFS_V4_2
+ .llseek = nfs4_file_llseek,
+#else
.llseek = nfs_file_llseek,
+#endif
.read = new_sync_read,
.write = new_sync_write,
.read_iter = nfs_file_read,
@@ -131,5 +156,5 @@ const struct file_operations nfs4_file_operations = {
.splice_read = nfs_file_splice_read,
.splice_write = iter_file_splice_write,
.check_flags = nfs_check_flags,
- .setlease = nfs_setlease,
+ .setlease = simple_nosetlease,
};
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4bf3d97cc5a0..69dc20a743f9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -77,7 +77,7 @@ struct nfs4_opendata;
static int _nfs4_proc_open(struct nfs4_opendata *data);
static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
-static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
+static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, long *);
static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label);
static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
@@ -314,20 +314,30 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
kunmap_atomic(start);
}
+static long nfs4_update_delay(long *timeout)
+{
+ long ret;
+ if (!timeout)
+ return NFS4_POLL_RETRY_MAX;
+ if (*timeout <= 0)
+ *timeout = NFS4_POLL_RETRY_MIN;
+ if (*timeout > NFS4_POLL_RETRY_MAX)
+ *timeout = NFS4_POLL_RETRY_MAX;
+ ret = *timeout;
+ *timeout <<= 1;
+ return ret;
+}
+
static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
{
int res = 0;
might_sleep();
- if (*timeout <= 0)
- *timeout = NFS4_POLL_RETRY_MIN;
- if (*timeout > NFS4_POLL_RETRY_MAX)
- *timeout = NFS4_POLL_RETRY_MAX;
- freezable_schedule_timeout_killable_unsafe(*timeout);
+ freezable_schedule_timeout_killable_unsafe(
+ nfs4_update_delay(timeout));
if (fatal_signal_pending(current))
res = -ERESTARTSYS;
- *timeout <<= 1;
return res;
}
@@ -360,11 +370,6 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
- if (inode != NULL && nfs4_have_delegation(inode, FMODE_READ)) {
- nfs_remove_bad_delegation(inode);
- exception->retry = 1;
- break;
- }
if (state == NULL)
break;
ret = nfs4_schedule_stateid_recovery(server, state);
@@ -875,7 +880,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
return ret;
}
-static
int nfs4_call_sync(struct rpc_clnt *clnt,
struct nfs_server *server,
struct rpc_message *msg,
@@ -1307,15 +1311,13 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
int ret = -EAGAIN;
for (;;) {
+ spin_lock(&state->owner->so_lock);
if (can_open_cached(state, fmode, open_mode)) {
- spin_lock(&state->owner->so_lock);
- if (can_open_cached(state, fmode, open_mode)) {
- update_open_stateflags(state, fmode);
- spin_unlock(&state->owner->so_lock);
- goto out_return_state;
- }
+ update_open_stateflags(state, fmode);
spin_unlock(&state->owner->so_lock);
+ goto out_return_state;
}
+ spin_unlock(&state->owner->so_lock);
rcu_read_lock();
delegation = rcu_dereference(nfsi->delegation);
if (!can_open_delegated(delegation, fmode)) {
@@ -1647,7 +1649,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
nfs_inode_find_state_and_recover(state->inode,
stateid);
nfs4_schedule_stateid_recovery(server, state);
- return 0;
+ return -EAGAIN;
case -NFS4ERR_DELAY:
case -NFS4ERR_GRACE:
set_bit(NFS_DELEGATED_STATE, &state->flags);
@@ -1952,6 +1954,14 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
return status;
}
+/*
+ * Additional permission checks in order to distinguish between an
+ * open for read, and an open for execute. This works around the
+ * fact that NFSv4 OPEN treats read and execute permissions as being
+ * the same.
+ * Note that in the non-execute case, we want to turn off permission
+ * checking if we just created a new file (POSIX open() semantics).
+ */
static int nfs4_opendata_access(struct rpc_cred *cred,
struct nfs4_opendata *opendata,
struct nfs4_state *state, fmode_t fmode,
@@ -1966,14 +1976,14 @@ static int nfs4_opendata_access(struct rpc_cred *cred,
return 0;
mask = 0;
- /* don't check MAY_WRITE - a newly created file may not have
- * write mode bits, but POSIX allows the creating process to write.
- * use openflags to check for exec, because fmode won't
- * always have FMODE_EXEC set when file open for exec. */
+ /*
+ * Use openflags to check for exec, because fmode won't
+ * always have FMODE_EXEC set when file open for exec.
+ */
if (openflags & __FMODE_EXEC) {
/* ONLY check for exec rights */
mask = MAY_EXEC;
- } else if (fmode & FMODE_READ)
+ } else if ((fmode & FMODE_READ) && !opendata->file_created)
mask = MAY_READ;
cache.cred = cred;
@@ -2094,46 +2104,60 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta
return ret;
}
+static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state)
+{
+ nfs_remove_bad_delegation(state->inode);
+ write_seqlock(&state->seqlock);
+ nfs4_stateid_copy(&state->stateid, &state->open_stateid);
+ write_sequnlock(&state->seqlock);
+ clear_bit(NFS_DELEGATED_STATE, &state->flags);
+}
+
+static void nfs40_clear_delegation_stateid(struct nfs4_state *state)
+{
+ if (rcu_access_pointer(NFS_I(state->inode)->delegation) != NULL)
+ nfs_finish_clear_delegation_stateid(state);
+}
+
+static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+ /* NFSv4.0 doesn't allow for delegation recovery on open expire */
+ nfs40_clear_delegation_stateid(state);
+ return nfs4_open_expired(sp, state);
+}
+
#if defined(CONFIG_NFS_V4_1)
-static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
+static void nfs41_check_delegation_stateid(struct nfs4_state *state)
{
struct nfs_server *server = NFS_SERVER(state->inode);
- nfs4_stateid *stateid = &state->stateid;
+ nfs4_stateid stateid;
struct nfs_delegation *delegation;
- struct rpc_cred *cred = NULL;
- int status = -NFS4ERR_BAD_STATEID;
-
- /* If a state reset has been done, test_stateid is unneeded */
- if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
- return;
+ struct rpc_cred *cred;
+ int status;
/* Get the delegation credential for use by test/free_stateid */
rcu_read_lock();
delegation = rcu_dereference(NFS_I(state->inode)->delegation);
- if (delegation != NULL &&
- nfs4_stateid_match(&delegation->stateid, stateid)) {
- cred = get_rpccred(delegation->cred);
- rcu_read_unlock();
- status = nfs41_test_stateid(server, stateid, cred);
- trace_nfs4_test_delegation_stateid(state, NULL, status);
- } else
+ if (delegation == NULL) {
rcu_read_unlock();
+ return;
+ }
+
+ nfs4_stateid_copy(&stateid, &delegation->stateid);
+ cred = get_rpccred(delegation->cred);
+ rcu_read_unlock();
+ status = nfs41_test_stateid(server, &stateid, cred);
+ trace_nfs4_test_delegation_stateid(state, NULL, status);
if (status != NFS_OK) {
/* Free the stateid unless the server explicitly
* informs us the stateid is unrecognized. */
if (status != -NFS4ERR_BAD_STATEID)
- nfs41_free_stateid(server, stateid, cred);
- nfs_remove_bad_delegation(state->inode);
-
- write_seqlock(&state->seqlock);
- nfs4_stateid_copy(&state->stateid, &state->open_stateid);
- write_sequnlock(&state->seqlock);
- clear_bit(NFS_DELEGATED_STATE, &state->flags);
+ nfs41_free_stateid(server, &stateid, cred);
+ nfs_finish_clear_delegation_stateid(state);
}
- if (cred != NULL)
- put_rpccred(cred);
+ put_rpccred(cred);
}
/**
@@ -2177,7 +2201,7 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
{
int status;
- nfs41_clear_delegation_stateid(state);
+ nfs41_check_delegation_stateid(state);
status = nfs41_check_open_stateid(state);
if (status != NFS_OK)
status = nfs4_open_expired(sp, state);
@@ -2545,6 +2569,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
struct nfs4_closedata *calldata = data;
struct nfs4_state *state = calldata->state;
struct nfs_server *server = NFS_SERVER(calldata->inode);
+ nfs4_stateid *res_stateid = NULL;
dprintk("%s: begin!\n", __func__);
if (!nfs4_sequence_done(task, &calldata->res.seq_res))
@@ -2555,12 +2580,12 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
*/
switch (task->tk_status) {
case 0:
- if (calldata->roc)
+ res_stateid = &calldata->res.stateid;
+ if (calldata->arg.fmode == 0 && calldata->roc)
pnfs_roc_set_barrier(state->inode,
calldata->roc_barrier);
- nfs_clear_open_stateid(state, &calldata->res.stateid, 0);
renew_lease(server, calldata->timestamp);
- goto out_release;
+ break;
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_OLD_STATEID:
@@ -2569,12 +2594,12 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
if (calldata->arg.fmode == 0)
break;
default:
- if (nfs4_async_handle_error(task, server, state) == -EAGAIN) {
+ if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) {
rpc_restart_call_prepare(task);
goto out_release;
}
}
- nfs_clear_open_stateid(state, NULL, calldata->arg.fmode);
+ nfs_clear_open_stateid(state, res_stateid, calldata->arg.fmode);
out_release:
nfs_release_seqid(calldata->arg.seqid);
nfs_refresh_inode(calldata->inode, calldata->res.fattr);
@@ -2586,6 +2611,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
struct nfs4_closedata *calldata = data;
struct nfs4_state *state = calldata->state;
struct inode *inode = calldata->inode;
+ bool is_rdonly, is_wronly, is_rdwr;
int call_close = 0;
dprintk("%s: begin!\n", __func__);
@@ -2593,21 +2619,27 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
goto out_wait;
task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
- calldata->arg.fmode = FMODE_READ|FMODE_WRITE;
spin_lock(&state->owner->so_lock);
+ is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags);
+ is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags);
+ is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags);
/* Calculate the change in open mode */
+ calldata->arg.fmode = 0;
if (state->n_rdwr == 0) {
- if (state->n_rdonly == 0) {
- call_close |= test_bit(NFS_O_RDONLY_STATE, &state->flags);
- call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags);
- calldata->arg.fmode &= ~FMODE_READ;
- }
- if (state->n_wronly == 0) {
- call_close |= test_bit(NFS_O_WRONLY_STATE, &state->flags);
- call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags);
- calldata->arg.fmode &= ~FMODE_WRITE;
- }
- }
+ if (state->n_rdonly == 0)
+ call_close |= is_rdonly;
+ else if (is_rdonly)
+ calldata->arg.fmode |= FMODE_READ;
+ if (state->n_wronly == 0)
+ call_close |= is_wronly;
+ else if (is_wronly)
+ calldata->arg.fmode |= FMODE_WRITE;
+ } else if (is_rdwr)
+ calldata->arg.fmode |= FMODE_READ|FMODE_WRITE;
+
+ if (calldata->arg.fmode == 0)
+ call_close |= is_rdwr;
+
if (!nfs4_valid_open_stateid(state))
call_close = 0;
spin_unlock(&state->owner->so_lock);
@@ -2647,6 +2679,48 @@ static const struct rpc_call_ops nfs4_close_ops = {
.rpc_release = nfs4_free_closedata,
};
+static bool nfs4_state_has_opener(struct nfs4_state *state)
+{
+ /* first check existing openers */
+ if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 &&
+ state->n_rdonly != 0)
+ return true;
+
+ if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0 &&
+ state->n_wronly != 0)
+ return true;
+
+ if (test_bit(NFS_O_RDWR_STATE, &state->flags) != 0 &&
+ state->n_rdwr != 0)
+ return true;
+
+ return false;
+}
+
+static bool nfs4_roc(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_open_context *ctx;
+ struct nfs4_state *state;
+
+ spin_lock(&inode->i_lock);
+ list_for_each_entry(ctx, &nfsi->open_files, list) {
+ state = ctx->state;
+ if (state == NULL)
+ continue;
+ if (nfs4_state_has_opener(state)) {
+ spin_unlock(&inode->i_lock);
+ return false;
+ }
+ }
+ spin_unlock(&inode->i_lock);
+
+ if (nfs4_check_delegation(inode, FMODE_READ))
+ return false;
+
+ return pnfs_roc(inode);
+}
+
/*
* It is possible for data to be read/written from a mem-mapped file
* after the sys_close call (which hits the vfs layer as a flush).
@@ -2697,7 +2771,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
calldata->res.fattr = &calldata->fattr;
calldata->res.seqid = calldata->arg.seqid;
calldata->res.server = server;
- calldata->roc = pnfs_roc(state->inode);
+ calldata->roc = nfs4_roc(state->inode);
nfs_sb_active(calldata->inode->i_sb);
msg.rpc_argp = &calldata->arg;
@@ -3148,7 +3222,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
struct nfs4_label *label = NULL;
int status;
- if (pnfs_ld_layoutret_on_setattr(inode))
+ if (pnfs_ld_layoutret_on_setattr(inode) &&
+ sattr->ia_valid & ATTR_SIZE &&
+ sattr->ia_size < i_size_read(inode))
pnfs_commit_and_return_layout(inode);
nfs_fattr_init(fattr);
@@ -3507,7 +3583,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
if (!nfs4_sequence_done(task, &res->seq_res))
return 0;
- if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
+ if (nfs4_async_handle_error(task, res->server, NULL,
+ &data->timeout) == -EAGAIN)
return 0;
update_changeattr(dir, &res->cinfo);
return 1;
@@ -3540,7 +3617,7 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
if (!nfs4_sequence_done(task, &res->seq_res))
return 0;
- if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
+ if (nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN)
return 0;
update_changeattr(old_dir, &res->old_cinfo);
@@ -4033,24 +4110,26 @@ static bool nfs4_error_stateid_expired(int err)
return false;
}
-void __nfs4_read_done_cb(struct nfs_pgio_data *data)
+void __nfs4_read_done_cb(struct nfs_pgio_header *hdr)
{
- nfs_invalidate_atime(data->header->inode);
+ nfs_invalidate_atime(hdr->inode);
}
-static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr)
{
- struct nfs_server *server = NFS_SERVER(data->header->inode);
+ struct nfs_server *server = NFS_SERVER(hdr->inode);
- trace_nfs4_read(data, task->tk_status);
- if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
+ trace_nfs4_read(hdr, task->tk_status);
+ if (nfs4_async_handle_error(task, server,
+ hdr->args.context->state,
+ NULL) == -EAGAIN) {
rpc_restart_call_prepare(task);
return -EAGAIN;
}
- __nfs4_read_done_cb(data);
+ __nfs4_read_done_cb(hdr);
if (task->tk_status > 0)
- renew_lease(server, data->timestamp);
+ renew_lease(server, hdr->timestamp);
return 0;
}
@@ -4068,54 +4147,60 @@ static bool nfs4_read_stateid_changed(struct rpc_task *task,
return true;
}
-static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
{
dprintk("--> %s\n", __func__);
- if (!nfs4_sequence_done(task, &data->res.seq_res))
+ if (!nfs4_sequence_done(task, &hdr->res.seq_res))
return -EAGAIN;
- if (nfs4_read_stateid_changed(task, &data->args))
+ if (nfs4_read_stateid_changed(task, &hdr->args))
return -EAGAIN;
- return data->pgio_done_cb ? data->pgio_done_cb(task, data) :
- nfs4_read_done_cb(task, data);
+ return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
+ nfs4_read_done_cb(task, hdr);
}
-static void nfs4_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg)
{
- data->timestamp = jiffies;
- data->pgio_done_cb = nfs4_read_done_cb;
+ hdr->timestamp = jiffies;
+ hdr->pgio_done_cb = nfs4_read_done_cb;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
- nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+ nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0);
}
-static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
{
- if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
- &data->args.seq_args,
- &data->res.seq_res,
+ if (nfs4_setup_sequence(NFS_SERVER(hdr->inode),
+ &hdr->args.seq_args,
+ &hdr->res.seq_res,
task))
return 0;
- if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
- data->args.lock_context, data->header->rw_ops->rw_mode) == -EIO)
+ if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+ hdr->args.lock_context,
+ hdr->rw_ops->rw_mode) == -EIO)
return -EIO;
- if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
+ if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags)))
return -EIO;
return 0;
}
-static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs4_write_done_cb(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
{
- struct inode *inode = data->header->inode;
-
- trace_nfs4_write(data, task->tk_status);
- if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
+ struct inode *inode = hdr->inode;
+
+ trace_nfs4_write(hdr, task->tk_status);
+ if (nfs4_async_handle_error(task, NFS_SERVER(inode),
+ hdr->args.context->state,
+ NULL) == -EAGAIN) {
rpc_restart_call_prepare(task);
return -EAGAIN;
}
if (task->tk_status >= 0) {
- renew_lease(NFS_SERVER(inode), data->timestamp);
- nfs_post_op_update_inode_force_wcc(inode, &data->fattr);
+ renew_lease(NFS_SERVER(inode), hdr->timestamp);
+ nfs_post_op_update_inode_force_wcc(inode, &hdr->fattr);
}
return 0;
}
@@ -4134,23 +4219,21 @@ static bool nfs4_write_stateid_changed(struct rpc_task *task,
return true;
}
-static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
{
- if (!nfs4_sequence_done(task, &data->res.seq_res))
+ if (!nfs4_sequence_done(task, &hdr->res.seq_res))
return -EAGAIN;
- if (nfs4_write_stateid_changed(task, &data->args))
+ if (nfs4_write_stateid_changed(task, &hdr->args))
return -EAGAIN;
- return data->pgio_done_cb ? data->pgio_done_cb(task, data) :
- nfs4_write_done_cb(task, data);
+ return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
+ nfs4_write_done_cb(task, hdr);
}
static
-bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data)
+bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr)
{
- const struct nfs_pgio_header *hdr = data->header;
-
/* Don't request attributes for pNFS or O_DIRECT writes */
- if (data->ds_clp != NULL || hdr->dreq != NULL)
+ if (hdr->ds_clp != NULL || hdr->dreq != NULL)
return false;
/* Otherwise, request attributes if and only if we don't hold
* a delegation
@@ -4158,23 +4241,24 @@ bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data)
return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0;
}
-static void nfs4_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg)
{
- struct nfs_server *server = NFS_SERVER(data->header->inode);
+ struct nfs_server *server = NFS_SERVER(hdr->inode);
- if (!nfs4_write_need_cache_consistency_data(data)) {
- data->args.bitmask = NULL;
- data->res.fattr = NULL;
+ if (!nfs4_write_need_cache_consistency_data(hdr)) {
+ hdr->args.bitmask = NULL;
+ hdr->res.fattr = NULL;
} else
- data->args.bitmask = server->cache_consistency_bitmask;
+ hdr->args.bitmask = server->cache_consistency_bitmask;
- if (!data->pgio_done_cb)
- data->pgio_done_cb = nfs4_write_done_cb;
- data->res.server = server;
- data->timestamp = jiffies;
+ if (!hdr->pgio_done_cb)
+ hdr->pgio_done_cb = nfs4_write_done_cb;
+ hdr->res.server = server;
+ hdr->timestamp = jiffies;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
- nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
+ nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 1);
}
static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
@@ -4190,7 +4274,8 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *da
struct inode *inode = data->inode;
trace_nfs4_commit(data, task->tk_status);
- if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
+ if (nfs4_async_handle_error(task, NFS_SERVER(inode),
+ NULL, NULL) == -EAGAIN) {
rpc_restart_call_prepare(task);
return -EAGAIN;
}
@@ -4743,7 +4828,8 @@ out:
static int
-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
+nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
+ struct nfs4_state *state, long *timeout)
{
struct nfs_client *clp = server->nfs_client;
@@ -4753,9 +4839,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
- if (state == NULL)
- break;
- nfs_remove_bad_delegation(state->inode);
case -NFS4ERR_OPENMODE:
if (state == NULL)
break;
@@ -4793,6 +4876,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
#endif /* CONFIG_NFS_V4_1 */
case -NFS4ERR_DELAY:
nfs_inc_server_stats(server, NFSIOS_DELAY);
+ rpc_delay(task, nfs4_update_delay(timeout));
+ goto restart_call;
case -NFS4ERR_GRACE:
rpc_delay(task, NFS4_POLL_RETRY_MAX);
case -NFS4ERR_RETRY_UNCACHED_REP:
@@ -4881,6 +4966,18 @@ nfs4_init_callback_netid(const struct nfs_client *clp, char *buf, size_t len)
return scnprintf(buf, len, "tcp");
}
+static void nfs4_setclientid_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_setclientid *sc = calldata;
+
+ if (task->tk_status == 0)
+ sc->sc_cred = get_rpccred(task->tk_rqstp->rq_cred);
+}
+
+static const struct rpc_call_ops nfs4_setclientid_ops = {
+ .rpc_call_done = nfs4_setclientid_done,
+};
+
/**
* nfs4_proc_setclientid - Negotiate client ID
* @clp: state data structure
@@ -4907,6 +5004,14 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
.rpc_resp = res,
.rpc_cred = cred,
};
+ struct rpc_task *task;
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = clp->cl_rpcclient,
+ .rpc_message = &msg,
+ .callback_ops = &nfs4_setclientid_ops,
+ .callback_data = &setclientid,
+ .flags = RPC_TASK_TIMEOUT,
+ };
int status;
/* nfs_client_id4 */
@@ -4933,7 +5038,18 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
dprintk("NFS call setclientid auth=%s, '%.*s'\n",
clp->cl_rpcclient->cl_auth->au_ops->au_name,
setclientid.sc_name_len, setclientid.sc_name);
- status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task)) {
+ status = PTR_ERR(task);
+ goto out;
+ }
+ status = task->tk_status;
+ if (setclientid.sc_cred) {
+ clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred);
+ put_rpccred(setclientid.sc_cred);
+ }
+ rpc_put_task(task);
+out:
trace_nfs4_setclientid(clp, status);
dprintk("NFS reply setclientid: %d\n", status);
return status;
@@ -4975,6 +5091,9 @@ struct nfs4_delegreturndata {
unsigned long timestamp;
struct nfs_fattr fattr;
int rpc_status;
+ struct inode *inode;
+ bool roc;
+ u32 roc_barrier;
};
static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
@@ -4988,7 +5107,6 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
switch (task->tk_status) {
case 0:
renew_lease(data->res.server, data->timestamp);
- break;
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_BAD_STATEID:
@@ -4996,10 +5114,12 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_EXPIRED:
task->tk_status = 0;
+ if (data->roc)
+ pnfs_roc_set_barrier(data->inode, data->roc_barrier);
break;
default:
- if (nfs4_async_handle_error(task, data->res.server, NULL) ==
- -EAGAIN) {
+ if (nfs4_async_handle_error(task, data->res.server,
+ NULL, NULL) == -EAGAIN) {
rpc_restart_call_prepare(task);
return;
}
@@ -5009,6 +5129,10 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
static void nfs4_delegreturn_release(void *calldata)
{
+ struct nfs4_delegreturndata *data = calldata;
+
+ if (data->roc)
+ pnfs_roc_release(data->inode);
kfree(calldata);
}
@@ -5018,6 +5142,10 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
d_data = (struct nfs4_delegreturndata *)data;
+ if (d_data->roc &&
+ pnfs_roc_drain(d_data->inode, &d_data->roc_barrier, task))
+ return;
+
nfs4_setup_sequence(d_data->res.server,
&d_data->args.seq_args,
&d_data->res.seq_res,
@@ -5061,6 +5189,9 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
nfs_fattr_init(data->res.fattr);
data->timestamp = jiffies;
data->rpc_status = 0;
+ data->inode = inode;
+ data->roc = list_empty(&NFS_I(inode)->open_files) ?
+ pnfs_roc(inode) : false;
task_setup_data.callback_data = data;
msg.rpc_argp = &data->args;
@@ -5252,7 +5383,8 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
case -NFS4ERR_EXPIRED:
break;
default:
- if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
+ if (nfs4_async_handle_error(task, calldata->server,
+ NULL, NULL) == -EAGAIN)
rpc_restart_call_prepare(task);
}
nfs_release_seqid(calldata->arg.seqid);
@@ -5834,8 +5966,10 @@ struct nfs_release_lockowner_data {
static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata)
{
struct nfs_release_lockowner_data *data = calldata;
- nfs40_setup_sequence(data->server,
- &data->args.seq_args, &data->res.seq_res, task);
+ struct nfs_server *server = data->server;
+ nfs40_setup_sequence(server, &data->args.seq_args,
+ &data->res.seq_res, task);
+ data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
data->timestamp = jiffies;
}
@@ -5852,9 +5986,12 @@ static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
break;
case -NFS4ERR_STALE_CLIENTID:
case -NFS4ERR_EXPIRED:
+ nfs4_schedule_lease_recovery(server->nfs_client);
+ break;
case -NFS4ERR_LEASE_MOVED:
case -NFS4ERR_DELAY:
- if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN)
+ if (nfs4_async_handle_error(task, server,
+ NULL, NULL) == -EAGAIN)
rpc_restart_call_prepare(task);
}
}
@@ -5872,7 +6009,8 @@ static const struct rpc_call_ops nfs4_release_lockowner_ops = {
.rpc_release = nfs4_release_lockowner_release,
};
-static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
+static void
+nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
{
struct nfs_release_lockowner_data *data;
struct rpc_message msg = {
@@ -5880,11 +6018,11 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
};
if (server->nfs_client->cl_mvops->minor_version != 0)
- return -EINVAL;
+ return;
data = kmalloc(sizeof(*data), GFP_NOFS);
if (!data)
- return -ENOMEM;
+ return;
data->lsp = lsp;
data->server = server;
data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
@@ -5895,7 +6033,6 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
msg.rpc_resp = &data->res;
nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data);
- return 0;
}
#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
@@ -7229,7 +7366,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr
int ret = 0;
if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0)
- return 0;
+ return -EAGAIN;
task = _nfs41_proc_sequence(clp, cred, false);
if (IS_ERR(task))
ret = PTR_ERR(task);
@@ -7459,14 +7596,19 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
} else {
LIST_HEAD(head);
+ /*
+ * Mark the bad layout state as invalid, then retry
+ * with the current stateid.
+ */
pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
spin_unlock(&inode->i_lock);
- /* Mark the bad layout state as invalid, then
- * retry using the open stateid. */
pnfs_free_lseg_list(&head);
+
+ task->tk_status = 0;
+ rpc_restart_call_prepare(task);
}
}
- if (nfs4_async_handle_error(task, server, state) == -EAGAIN)
+ if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN)
rpc_restart_call_prepare(task);
out:
dprintk("<-- %s\n", __func__);
@@ -7626,7 +7768,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
case 0:
break;
case -NFS4ERR_DELAY:
- if (nfs4_async_handle_error(task, server, NULL) != -EAGAIN)
+ if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN)
break;
rpc_restart_call_prepare(task);
return;
@@ -7685,54 +7827,6 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
return status;
}
-/*
- * Retrieve the list of Data Server devices from the MDS.
- */
-static int _nfs4_getdevicelist(struct nfs_server *server,
- const struct nfs_fh *fh,
- struct pnfs_devicelist *devlist)
-{
- struct nfs4_getdevicelist_args args = {
- .fh = fh,
- .layoutclass = server->pnfs_curr_ld->id,
- };
- struct nfs4_getdevicelist_res res = {
- .devlist = devlist,
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST],
- .rpc_argp = &args,
- .rpc_resp = &res,
- };
- int status;
-
- dprintk("--> %s\n", __func__);
- status = nfs4_call_sync(server->client, server, &msg, &args.seq_args,
- &res.seq_res, 0);
- dprintk("<-- %s status=%d\n", __func__, status);
- return status;
-}
-
-int nfs4_proc_getdevicelist(struct nfs_server *server,
- const struct nfs_fh *fh,
- struct pnfs_devicelist *devlist)
-{
- struct nfs4_exception exception = { };
- int err;
-
- do {
- err = nfs4_handle_exception(server,
- _nfs4_getdevicelist(server, fh, devlist),
- &exception);
- } while (exception.retry);
-
- dprintk("%s: err=%d, num_devs=%u\n", __func__,
- err, devlist->num_devs);
-
- return err;
-}
-EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
-
static int
_nfs4_proc_getdeviceinfo(struct nfs_server *server,
struct pnfs_device *pdev,
@@ -7805,7 +7899,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
case 0:
break;
default:
- if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
+ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) {
rpc_restart_call_prepare(task);
return;
}
@@ -8101,7 +8195,7 @@ static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata)
switch (task->tk_status) {
case -NFS4ERR_DELAY:
- if (nfs4_async_handle_error(task, data->server, NULL) == -EAGAIN)
+ if (nfs4_async_handle_error(task, data->server, NULL, NULL) == -EAGAIN)
rpc_restart_call_prepare(task);
}
}
@@ -8182,7 +8276,8 @@ static int nfs41_free_stateid(struct nfs_server *server,
return ret;
}
-static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
+static void
+nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
{
struct rpc_task *task;
struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
@@ -8190,9 +8285,8 @@ static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_sta
task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
nfs4_free_lock_state(server, lsp);
if (IS_ERR(task))
- return PTR_ERR(task);
+ return;
rpc_put_task(task);
- return 0;
}
static bool nfs41_match_stateid(const nfs4_stateid *s1,
@@ -8242,7 +8336,7 @@ static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
static const struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
.owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
.state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
- .recover_open = nfs4_open_expired,
+ .recover_open = nfs40_open_expired,
.recover_lock = nfs4_lock_expired,
.establish_clid = nfs4_init_clientid,
};
@@ -8331,7 +8425,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
| NFS_CAP_CHANGE_ATTR
| NFS_CAP_POSIX_LOCK
| NFS_CAP_STATEID_NFSV41
- | NFS_CAP_ATOMIC_OPEN_V1,
+ | NFS_CAP_ATOMIC_OPEN_V1
+ | NFS_CAP_SEEK,
.init_client = nfs41_init_client,
.shutdown_client = nfs41_shutdown_client,
.match_stateid = nfs41_match_stateid,
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 1720d32ffa54..e1ba58c3d1ad 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -88,10 +88,18 @@ nfs4_renew_state(struct work_struct *work)
}
nfs_expire_all_delegations(clp);
} else {
+ int ret;
+
/* Queue an asynchronous RENEW. */
- ops->sched_state_renewal(clp, cred, renew_flags);
+ ret = ops->sched_state_renewal(clp, cred, renew_flags);
put_rpccred(cred);
- goto out_exp;
+ switch (ret) {
+ default:
+ goto out_exp;
+ case -EAGAIN:
+ case -ENOMEM:
+ break;
+ }
}
} else {
dprintk("%s: failed to call renewd. Reason: lease not expired \n",
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 42f121182167..5194933ed419 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -787,21 +787,12 @@ void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode)
* that is compatible with current->files
*/
static struct nfs4_lock_state *
-__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
+__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
{
struct nfs4_lock_state *pos;
list_for_each_entry(pos, &state->lock_states, ls_locks) {
- if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type)
+ if (pos->ls_owner != fl_owner)
continue;
- switch (pos->ls_owner.lo_type) {
- case NFS4_POSIX_LOCK_TYPE:
- if (pos->ls_owner.lo_u.posix_owner != fl_owner)
- continue;
- break;
- case NFS4_FLOCK_LOCK_TYPE:
- if (pos->ls_owner.lo_u.flock_owner != fl_pid)
- continue;
- }
atomic_inc(&pos->ls_count);
return pos;
}
@@ -813,7 +804,7 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_p
* exists, return an uninitialized one.
*
*/
-static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
+static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
{
struct nfs4_lock_state *lsp;
struct nfs_server *server = state->owner->so_server;
@@ -824,17 +815,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
nfs4_init_seqid_counter(&lsp->ls_seqid);
atomic_set(&lsp->ls_count, 1);
lsp->ls_state = state;
- lsp->ls_owner.lo_type = type;
- switch (lsp->ls_owner.lo_type) {
- case NFS4_FLOCK_LOCK_TYPE:
- lsp->ls_owner.lo_u.flock_owner = fl_pid;
- break;
- case NFS4_POSIX_LOCK_TYPE:
- lsp->ls_owner.lo_u.posix_owner = fl_owner;
- break;
- default:
- goto out_free;
- }
+ lsp->ls_owner = fl_owner;
lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS);
if (lsp->ls_seqid.owner_id < 0)
goto out_free;
@@ -857,13 +838,13 @@ void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp
* exists, return an uninitialized one.
*
*/
-static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type)
+static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
{
struct nfs4_lock_state *lsp, *new = NULL;
for(;;) {
spin_lock(&state->state_lock);
- lsp = __nfs4_find_lock_state(state, owner, pid, type);
+ lsp = __nfs4_find_lock_state(state, owner);
if (lsp != NULL)
break;
if (new != NULL) {
@@ -874,7 +855,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
break;
}
spin_unlock(&state->state_lock);
- new = nfs4_alloc_lock_state(state, owner, pid, type);
+ new = nfs4_alloc_lock_state(state, owner);
if (new == NULL)
return NULL;
}
@@ -935,13 +916,7 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
if (fl->fl_ops != NULL)
return 0;
- if (fl->fl_flags & FL_POSIX)
- lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE);
- else if (fl->fl_flags & FL_FLOCK)
- lsp = nfs4_get_lock_state(state, NULL, fl->fl_pid,
- NFS4_FLOCK_LOCK_TYPE);
- else
- return -EINVAL;
+ lsp = nfs4_get_lock_state(state, fl->fl_owner);
if (lsp == NULL)
return -ENOMEM;
fl->fl_u.nfs4_fl.owner = lsp;
@@ -955,7 +930,6 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
{
struct nfs4_lock_state *lsp;
fl_owner_t fl_owner;
- pid_t fl_pid;
int ret = -ENOENT;
@@ -966,9 +940,8 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
goto out;
fl_owner = lockowner->l_owner;
- fl_pid = lockowner->l_pid;
spin_lock(&state->state_lock);
- lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
+ lsp = __nfs4_find_lock_state(state, fl_owner);
if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
ret = -EIO;
else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
@@ -1732,7 +1705,8 @@ restart:
if (status < 0) {
set_bit(ops->owner_flag_bit, &sp->so_flags);
nfs4_put_state_owner(sp);
- return nfs4_recovery_handle_error(clp, status);
+ status = nfs4_recovery_handle_error(clp, status);
+ return (status != 0) ? status : -EAGAIN;
}
nfs4_put_state_owner(sp);
@@ -1741,7 +1715,7 @@ restart:
spin_unlock(&clp->cl_lock);
}
rcu_read_unlock();
- return status;
+ return 0;
}
static int nfs4_check_lease(struct nfs_client *clp)
@@ -1788,7 +1762,6 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
break;
case -NFS4ERR_STALE_CLIENTID:
clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
- nfs4_state_clear_reclaim_reboot(clp);
nfs4_state_start_reclaim_reboot(clp);
break;
case -NFS4ERR_CLID_INUSE:
@@ -2372,6 +2345,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
status = nfs4_check_lease(clp);
if (status < 0)
goto out_error;
+ continue;
}
if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) {
@@ -2393,14 +2367,11 @@ static void nfs4_state_manager(struct nfs_client *clp)
section = "reclaim reboot";
status = nfs4_do_reclaim(clp,
clp->cl_mvops->reboot_recovery_ops);
- if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
- test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
- continue;
- nfs4_state_end_reclaim_reboot(clp);
- if (test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state))
+ if (status == -EAGAIN)
continue;
if (status < 0)
goto out_error;
+ nfs4_state_end_reclaim_reboot(clp);
}
/* Now recover expired state... */
@@ -2408,9 +2379,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
section = "reclaim nograce";
status = nfs4_do_reclaim(clp,
clp->cl_mvops->nograce_recovery_ops);
- if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
- test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) ||
- test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
+ if (status == -EAGAIN)
continue;
if (status < 0)
goto out_error;
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 0a744f3a86f6..1c32adbe728d 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -932,11 +932,11 @@ DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group);
DECLARE_EVENT_CLASS(nfs4_read_event,
TP_PROTO(
- const struct nfs_pgio_data *data,
+ const struct nfs_pgio_header *hdr,
int error
),
- TP_ARGS(data, error),
+ TP_ARGS(hdr, error),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -948,12 +948,12 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
),
TP_fast_assign(
- const struct inode *inode = data->header->inode;
+ const struct inode *inode = hdr->inode;
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
- __entry->offset = data->args.offset;
- __entry->count = data->args.count;
+ __entry->offset = hdr->args.offset;
+ __entry->count = hdr->args.count;
__entry->error = error;
),
@@ -972,10 +972,10 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
#define DEFINE_NFS4_READ_EVENT(name) \
DEFINE_EVENT(nfs4_read_event, name, \
TP_PROTO( \
- const struct nfs_pgio_data *data, \
+ const struct nfs_pgio_header *hdr, \
int error \
), \
- TP_ARGS(data, error))
+ TP_ARGS(hdr, error))
DEFINE_NFS4_READ_EVENT(nfs4_read);
#ifdef CONFIG_NFS_V4_1
DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read);
@@ -983,11 +983,11 @@ DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read);
DECLARE_EVENT_CLASS(nfs4_write_event,
TP_PROTO(
- const struct nfs_pgio_data *data,
+ const struct nfs_pgio_header *hdr,
int error
),
- TP_ARGS(data, error),
+ TP_ARGS(hdr, error),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -999,12 +999,12 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
),
TP_fast_assign(
- const struct inode *inode = data->header->inode;
+ const struct inode *inode = hdr->inode;
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
- __entry->offset = data->args.offset;
- __entry->count = data->args.count;
+ __entry->offset = hdr->args.offset;
+ __entry->count = hdr->args.count;
__entry->error = error;
),
@@ -1024,10 +1024,10 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
#define DEFINE_NFS4_WRITE_EVENT(name) \
DEFINE_EVENT(nfs4_write_event, name, \
TP_PROTO( \
- const struct nfs_pgio_data *data, \
+ const struct nfs_pgio_header *hdr, \
int error \
), \
- TP_ARGS(data, error))
+ TP_ARGS(hdr, error))
DEFINE_NFS4_WRITE_EVENT(nfs4_write);
#ifdef CONFIG_NFS_V4_1
DEFINE_NFS4_WRITE_EVENT(nfs4_pnfs_write);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 939ae606cfa4..206c08a60c7f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -362,25 +362,19 @@ static int nfs4_stat_to_errno(int);
XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4)
#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4)
-#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \
- encode_verifier_maxsz)
-#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \
- 2 /* nfs_cookie4 gdlr_cookie */ + \
- decode_verifier_maxsz \
- /* verifier4 gdlr_verifier */ + \
- 1 /* gdlr_deviceid_list count */ + \
- XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \
- NFS4_DEVICEID4_SIZE) \
- /* gdlr_deviceid_list */ + \
- 1 /* bool gdlr_eof */)
-#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
- XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
+#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + \
+ XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \
+ 1 /* layout type */ + \
+ 1 /* maxcount */ + \
+ 1 /* bitmap size */ + \
+ 1 /* notification bitmap length */ + \
+ 1 /* notification bitmap, word 0 */)
#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
1 /* layout type */ + \
1 /* opaque devaddr4 length */ + \
/* devaddr4 payload is read into page */ \
1 /* notification bitmap length */ + \
- 1 /* notification bitmap */)
+ 1 /* notification bitmap, word 0 */)
#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \
encode_stateid_maxsz)
#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \
@@ -395,7 +389,10 @@ static int nfs4_stat_to_errno(int);
2 /* last byte written */ + \
1 /* nt_timechanged (false) */ + \
1 /* layoutupdate4 layout type */ + \
- 1 /* NULL filelayout layoutupdate4 payload */)
+ 1 /* layoutupdate4 opaqueue len */)
+ /* the actual content of layoutupdate4 should
+ be allocated by drivers and spliced in
+ using xdr_write_pages */
#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
encode_stateid_maxsz + \
@@ -809,14 +806,6 @@ static int nfs4_stat_to_errno(int);
#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \
decode_sequence_maxsz + \
decode_reclaim_complete_maxsz)
-#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \
- encode_sequence_maxsz + \
- encode_putfh_maxsz + \
- encode_getdevicelist_maxsz)
-#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \
- decode_sequence_maxsz + \
- decode_putfh_maxsz + \
- decode_getdevicelist_maxsz)
#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz +\
encode_getdeviceinfo_maxsz)
@@ -1927,24 +1916,6 @@ static void encode_sequence(struct xdr_stream *xdr,
#ifdef CONFIG_NFS_V4_1
static void
-encode_getdevicelist(struct xdr_stream *xdr,
- const struct nfs4_getdevicelist_args *args,
- struct compound_hdr *hdr)
-{
- __be32 *p;
- nfs4_verifier dummy = {
- .data = "dummmmmy",
- };
-
- encode_op_hdr(xdr, OP_GETDEVICELIST, decode_getdevicelist_maxsz, hdr);
- p = reserve_space(xdr, 16);
- *p++ = cpu_to_be32(args->layoutclass);
- *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);
- xdr_encode_hyper(p, 0ULL); /* cookie */
- encode_nfs4_verifier(xdr, &dummy);
-}
-
-static void
encode_getdeviceinfo(struct xdr_stream *xdr,
const struct nfs4_getdeviceinfo_args *args,
struct compound_hdr *hdr)
@@ -1952,12 +1923,15 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
__be32 *p;
encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr);
- p = reserve_space(xdr, 12 + NFS4_DEVICEID4_SIZE);
+ p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 4 + 4);
p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
NFS4_DEVICEID4_SIZE);
*p++ = cpu_to_be32(args->pdev->layout_type);
*p++ = cpu_to_be32(args->pdev->maxcount); /* gdia_maxcount */
- *p++ = cpu_to_be32(0); /* bitmap length 0 */
+
+ p = reserve_space(xdr, 4 + 4);
+ *p++ = cpu_to_be32(1); /* bitmap length */
+ *p++ = cpu_to_be32(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE);
}
static void
@@ -1990,7 +1964,7 @@ encode_layoutget(struct xdr_stream *xdr,
static int
encode_layoutcommit(struct xdr_stream *xdr,
struct inode *inode,
- const struct nfs4_layoutcommit_args *args,
+ struct nfs4_layoutcommit_args *args,
struct compound_hdr *hdr)
{
__be32 *p;
@@ -2011,11 +1985,16 @@ encode_layoutcommit(struct xdr_stream *xdr,
*p++ = cpu_to_be32(0); /* Never send time_modify_changed */
*p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
- if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit)
+ if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) {
NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
NFS_I(inode)->layout, xdr, args);
- else
- encode_uint32(xdr, 0); /* no layout-type payload */
+ } else {
+ encode_uint32(xdr, args->layoutupdate_len);
+ if (args->layoutupdate_pages) {
+ xdr_write_pages(xdr, args->layoutupdate_pages, 0,
+ args->layoutupdate_len);
+ }
+ }
return 0;
}
@@ -2893,24 +2872,6 @@ static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
}
/*
- * Encode GETDEVICELIST request
- */
-static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req,
- struct xdr_stream *xdr,
- struct nfs4_getdevicelist_args *args)
-{
- struct compound_hdr hdr = {
- .minorversion = nfs4_xdr_minorversion(&args->seq_args),
- };
-
- encode_compound_hdr(xdr, req, &hdr);
- encode_sequence(xdr, &args->seq_args, &hdr);
- encode_putfh(xdr, args->fh, &hdr);
- encode_getdevicelist(xdr, args, &hdr);
- encode_nops(&hdr);
-}
-
-/*
* Encode GETDEVICEINFO request
*/
static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
@@ -5765,54 +5726,6 @@ out_overflow:
}
#if defined(CONFIG_NFS_V4_1)
-/*
- * TODO: Need to handle case when EOF != true;
- */
-static int decode_getdevicelist(struct xdr_stream *xdr,
- struct pnfs_devicelist *res)
-{
- __be32 *p;
- int status, i;
- nfs4_verifier verftemp;
-
- status = decode_op_hdr(xdr, OP_GETDEVICELIST);
- if (status)
- return status;
-
- p = xdr_inline_decode(xdr, 8 + 8 + 4);
- if (unlikely(!p))
- goto out_overflow;
-
- /* TODO: Skip cookie for now */
- p += 2;
-
- /* Read verifier */
- p = xdr_decode_opaque_fixed(p, verftemp.data, NFS4_VERIFIER_SIZE);
-
- res->num_devs = be32_to_cpup(p);
-
- dprintk("%s: num_dev %d\n", __func__, res->num_devs);
-
- if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) {
- printk(KERN_ERR "NFS: %s too many result dev_num %u\n",
- __func__, res->num_devs);
- return -EIO;
- }
-
- p = xdr_inline_decode(xdr,
- res->num_devs * NFS4_DEVICEID4_SIZE + 4);
- if (unlikely(!p))
- goto out_overflow;
- for (i = 0; i < res->num_devs; i++)
- p = xdr_decode_opaque_fixed(p, res->dev_id[i].data,
- NFS4_DEVICEID4_SIZE);
- res->eof = be32_to_cpup(p);
- return 0;
-out_overflow:
- print_overflow_msg(__func__, xdr);
- return -EIO;
-}
-
static int decode_getdeviceinfo(struct xdr_stream *xdr,
struct pnfs_device *pdev)
{
@@ -5862,9 +5775,16 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 4 * len);
if (unlikely(!p))
goto out_overflow;
- for (i = 0; i < len; i++, p++) {
- if (be32_to_cpup(p)) {
- dprintk("%s: notifications not supported\n",
+
+ if (be32_to_cpup(p++) &
+ ~(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE)) {
+ dprintk("%s: unsupported notification\n",
+ __func__);
+ }
+
+ for (i = 1; i < len; i++) {
+ if (be32_to_cpup(p++)) {
+ dprintk("%s: unsupported notification\n",
__func__);
return -EIO;
}
@@ -7092,33 +7012,7 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
if (!status)
status = decode_sequence(xdr, &res->seq_res, rqstp);
if (!status)
- status = decode_reclaim_complete(xdr, (void *)NULL);
- return status;
-}
-
-/*
- * Decode GETDEVICELIST response
- */
-static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp,
- struct xdr_stream *xdr,
- struct nfs4_getdevicelist_res *res)
-{
- struct compound_hdr hdr;
- int status;
-
- dprintk("encoding getdevicelist!\n");
-
- status = decode_compound_hdr(xdr, &hdr);
- if (status != 0)
- goto out;
- status = decode_sequence(xdr, &res->seq_res, rqstp);
- if (status != 0)
- goto out;
- status = decode_putfh(xdr);
- if (status != 0)
- goto out;
- status = decode_getdevicelist(xdr, res->devlist);
-out:
+ status = decode_reclaim_complete(xdr, NULL);
return status;
}
@@ -7427,6 +7321,10 @@ nfs4_stat_to_errno(int stat)
return -stat;
}
+#ifdef CONFIG_NFS_V4_2
+#include "nfs42xdr.c"
+#endif /* CONFIG_NFS_V4_2 */
+
#define PROC(proc, argtype, restype) \
[NFSPROC4_CLNT_##proc] = { \
.p_proc = NFSPROC4_COMPOUND, \
@@ -7490,11 +7388,13 @@ struct rpc_procinfo nfs4_procedures[] = {
PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name),
PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid),
PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid),
- PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist),
PROC(BIND_CONN_TO_SESSION,
enc_bind_conn_to_session, dec_bind_conn_to_session),
PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid),
#endif /* CONFIG_NFS_V4_1 */
+#ifdef CONFIG_NFS_V4_2
+ PROC(SEEK, enc_seek, dec_seek),
+#endif /* CONFIG_NFS_V4_2 */
};
const struct rpc_version nfs_version4 = {
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 611320753db2..9e5bc42180e4 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -5,7 +5,7 @@
* All rights reserved.
*
* Benny Halevy <bhalevy@panasas.com>
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2
@@ -60,52 +60,6 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d)
kfree(de);
}
-static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
- const struct nfs4_deviceid *d_id)
-{
- struct nfs4_deviceid_node *d;
- struct objio_dev_ent *de;
-
- d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id);
- if (!d)
- return NULL;
-
- de = container_of(d, struct objio_dev_ent, id_node);
- return de;
-}
-
-static struct objio_dev_ent *
-_dev_list_add(const struct nfs_server *nfss,
- const struct nfs4_deviceid *d_id, struct osd_dev *od,
- gfp_t gfp_flags)
-{
- struct nfs4_deviceid_node *d;
- struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags);
- struct objio_dev_ent *n;
-
- if (!de) {
- dprintk("%s: -ENOMEM od=%p\n", __func__, od);
- return NULL;
- }
-
- dprintk("%s: Adding od=%p\n", __func__, od);
- nfs4_init_deviceid_node(&de->id_node,
- nfss->pnfs_curr_ld,
- nfss->nfs_client,
- d_id);
- de->od.od = od;
-
- d = nfs4_insert_deviceid_node(&de->id_node);
- n = container_of(d, struct objio_dev_ent, id_node);
- if (n != de) {
- dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od);
- objio_free_deviceid_node(&de->id_node);
- de = n;
- }
-
- return de;
-}
-
struct objio_segment {
struct pnfs_layout_segment lseg;
@@ -130,29 +84,24 @@ struct objio_state {
/* Send and wait for a get_device_info of devices in the layout,
then look them up with the osd_initiator library */
-static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
- struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id,
- gfp_t gfp_flags)
+struct nfs4_deviceid_node *
+objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+ gfp_t gfp_flags)
{
struct pnfs_osd_deviceaddr *deviceaddr;
- struct objio_dev_ent *ode;
+ struct objio_dev_ent *ode = NULL;
struct osd_dev *od;
struct osd_dev_info odi;
bool retry_flag = true;
+ __be32 *p;
int err;
- ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
- if (ode) {
- objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
- return 0;
- }
+ deviceaddr = kzalloc(sizeof(*deviceaddr), gfp_flags);
+ if (!deviceaddr)
+ return NULL;
- err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
- if (unlikely(err)) {
- dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
- __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
- return err;
- }
+ p = page_address(pdev->pages[0]);
+ pnfs_osd_xdr_decode_deviceaddr(deviceaddr, p);
odi.systemid_len = deviceaddr->oda_systemid.len;
if (odi.systemid_len > sizeof(odi.systemid)) {
@@ -188,14 +137,24 @@ retry_lookup:
goto out;
}
- ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
- gfp_flags);
- objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
dprintk("Adding new dev_id(%llx:%llx)\n",
- _DEVID_LO(d_id), _DEVID_HI(d_id));
+ _DEVID_LO(&pdev->dev_id), _DEVID_HI(&pdev->dev_id));
+
+ ode = kzalloc(sizeof(*ode), gfp_flags);
+ if (!ode) {
+ dprintk("%s: -ENOMEM od=%p\n", __func__, od);
+ goto out;
+ }
+
+ nfs4_init_deviceid_node(&ode->id_node, server, &pdev->dev_id);
+ kfree(deviceaddr);
+
+ ode->od.od = od;
+ return &ode->id_node;
+
out:
- objlayout_put_deviceinfo(deviceaddr);
- return err;
+ kfree(deviceaddr);
+ return NULL;
}
static void copy_single_comp(struct ore_components *oc, unsigned c,
@@ -254,6 +213,7 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
struct xdr_stream *xdr,
gfp_t gfp_flags)
{
+ struct nfs_server *server = NFS_SERVER(pnfslay->plh_inode);
struct objio_segment *objio_seg;
struct pnfs_osd_xdr_decode_layout_iter iter;
struct pnfs_osd_layout layout;
@@ -283,13 +243,21 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
objio_seg->oc.first_dev = layout.olo_comps_index;
cur_comp = 0;
while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
+ struct nfs4_deviceid_node *d;
+ struct objio_dev_ent *ode;
+
copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
- err = objio_devices_lookup(pnfslay, objio_seg, cur_comp,
- &src_comp.oc_object_id.oid_device_id,
- gfp_flags);
- if (err)
+
+ d = nfs4_find_get_deviceid(server,
+ &src_comp.oc_object_id.oid_device_id,
+ pnfslay->plh_lc_cred, gfp_flags);
+ if (!d) {
+ err = -ENXIO;
goto err;
- ++cur_comp;
+ }
+
+ ode = container_of(d, struct objio_dev_ent, id_node);
+ objio_seg->oc.ods[cur_comp++] = &ode->od;
}
/* pnfs_osd_xdr_decode_layout_comp returns false on error */
if (unlikely(err))
@@ -439,22 +407,21 @@ static void _read_done(struct ore_io_state *ios, void *private)
objlayout_read_done(&objios->oir, status, objios->sync);
}
-int objio_read_pagelist(struct nfs_pgio_data *rdata)
+int objio_read_pagelist(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = rdata->header;
struct objio_state *objios;
int ret;
ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, true,
- hdr->lseg, rdata->args.pages, rdata->args.pgbase,
- rdata->args.offset, rdata->args.count, rdata,
+ hdr->lseg, hdr->args.pages, hdr->args.pgbase,
+ hdr->args.offset, hdr->args.count, hdr,
GFP_KERNEL, &objios);
if (unlikely(ret))
return ret;
objios->ios->done = _read_done;
dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
- rdata->args.offset, rdata->args.count);
+ hdr->args.offset, hdr->args.count);
ret = ore_read(objios->ios);
if (unlikely(ret))
objio_free_result(&objios->oir);
@@ -487,11 +454,11 @@ static void _write_done(struct ore_io_state *ios, void *private)
static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
{
struct objio_state *objios = priv;
- struct nfs_pgio_data *wdata = objios->oir.rpcdata;
- struct address_space *mapping = wdata->header->inode->i_mapping;
+ struct nfs_pgio_header *hdr = objios->oir.rpcdata;
+ struct address_space *mapping = hdr->inode->i_mapping;
pgoff_t index = offset / PAGE_SIZE;
struct page *page;
- loff_t i_size = i_size_read(wdata->header->inode);
+ loff_t i_size = i_size_read(hdr->inode);
if (offset >= i_size) {
*uptodate = true;
@@ -531,15 +498,14 @@ static const struct _ore_r4w_op _r4w_op = {
.put_page = &__r4w_put_page,
};
-int objio_write_pagelist(struct nfs_pgio_data *wdata, int how)
+int objio_write_pagelist(struct nfs_pgio_header *hdr, int how)
{
- struct nfs_pgio_header *hdr = wdata->header;
struct objio_state *objios;
int ret;
ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false,
- hdr->lseg, wdata->args.pages, wdata->args.pgbase,
- wdata->args.offset, wdata->args.count, wdata, GFP_NOFS,
+ hdr->lseg, hdr->args.pages, hdr->args.pgbase,
+ hdr->args.offset, hdr->args.count, hdr, GFP_NOFS,
&objios);
if (unlikely(ret))
return ret;
@@ -551,7 +517,7 @@ int objio_write_pagelist(struct nfs_pgio_data *wdata, int how)
objios->ios->done = _write_done;
dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
- wdata->args.offset, wdata->args.count);
+ hdr->args.offset, hdr->args.count);
ret = ore_write(objios->ios);
if (unlikely(ret)) {
objio_free_result(&objios->oir);
@@ -655,6 +621,7 @@ static struct pnfs_layoutdriver_type objlayout_type = {
.flags = PNFS_LAYOUTRET_ON_SETATTR |
PNFS_LAYOUTRET_ON_ERROR,
+ .max_deviceinfo_size = PAGE_SIZE,
.owner = THIS_MODULE,
.alloc_layout_hdr = objlayout_alloc_layout_hdr,
.free_layout_hdr = objlayout_free_layout_hdr,
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 765d3f54e986..919efd4a1a23 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -5,7 +5,7 @@
* All rights reserved.
*
* Benny Halevy <bhalevy@panasas.com>
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2
@@ -229,36 +229,36 @@ objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index,
static void _rpc_read_complete(struct work_struct *work)
{
struct rpc_task *task;
- struct nfs_pgio_data *rdata;
+ struct nfs_pgio_header *hdr;
dprintk("%s enter\n", __func__);
task = container_of(work, struct rpc_task, u.tk_work);
- rdata = container_of(task, struct nfs_pgio_data, task);
+ hdr = container_of(task, struct nfs_pgio_header, task);
- pnfs_ld_read_done(rdata);
+ pnfs_ld_read_done(hdr);
}
void
objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
{
- struct nfs_pgio_data *rdata = oir->rpcdata;
+ struct nfs_pgio_header *hdr = oir->rpcdata;
- oir->status = rdata->task.tk_status = status;
+ oir->status = hdr->task.tk_status = status;
if (status >= 0)
- rdata->res.count = status;
+ hdr->res.count = status;
else
- rdata->header->pnfs_error = status;
+ hdr->pnfs_error = status;
objlayout_iodone(oir);
/* must not use oir after this point */
dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__,
- status, rdata->res.eof, sync);
+ status, hdr->res.eof, sync);
if (sync)
- pnfs_ld_read_done(rdata);
+ pnfs_ld_read_done(hdr);
else {
- INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
- schedule_work(&rdata->task.u.tk_work);
+ INIT_WORK(&hdr->task.u.tk_work, _rpc_read_complete);
+ schedule_work(&hdr->task.u.tk_work);
}
}
@@ -266,12 +266,11 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
* Perform sync or async reads.
*/
enum pnfs_try_status
-objlayout_read_pagelist(struct nfs_pgio_data *rdata)
+objlayout_read_pagelist(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = rdata->header;
struct inode *inode = hdr->inode;
- loff_t offset = rdata->args.offset;
- size_t count = rdata->args.count;
+ loff_t offset = hdr->args.offset;
+ size_t count = hdr->args.count;
int err;
loff_t eof;
@@ -279,23 +278,23 @@ objlayout_read_pagelist(struct nfs_pgio_data *rdata)
if (unlikely(offset + count > eof)) {
if (offset >= eof) {
err = 0;
- rdata->res.count = 0;
- rdata->res.eof = 1;
+ hdr->res.count = 0;
+ hdr->res.eof = 1;
/*FIXME: do we need to call pnfs_ld_read_done() */
goto out;
}
count = eof - offset;
}
- rdata->res.eof = (offset + count) >= eof;
- _fix_verify_io_params(hdr->lseg, &rdata->args.pages,
- &rdata->args.pgbase,
- rdata->args.offset, rdata->args.count);
+ hdr->res.eof = (offset + count) >= eof;
+ _fix_verify_io_params(hdr->lseg, &hdr->args.pages,
+ &hdr->args.pgbase,
+ hdr->args.offset, hdr->args.count);
dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
- __func__, inode->i_ino, offset, count, rdata->res.eof);
+ __func__, inode->i_ino, offset, count, hdr->res.eof);
- err = objio_read_pagelist(rdata);
+ err = objio_read_pagelist(hdr);
out:
if (unlikely(err)) {
hdr->pnfs_error = err;
@@ -312,38 +311,38 @@ objlayout_read_pagelist(struct nfs_pgio_data *rdata)
static void _rpc_write_complete(struct work_struct *work)
{
struct rpc_task *task;
- struct nfs_pgio_data *wdata;
+ struct nfs_pgio_header *hdr;
dprintk("%s enter\n", __func__);
task = container_of(work, struct rpc_task, u.tk_work);
- wdata = container_of(task, struct nfs_pgio_data, task);
+ hdr = container_of(task, struct nfs_pgio_header, task);
- pnfs_ld_write_done(wdata);
+ pnfs_ld_write_done(hdr);
}
void
objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
{
- struct nfs_pgio_data *wdata = oir->rpcdata;
+ struct nfs_pgio_header *hdr = oir->rpcdata;
- oir->status = wdata->task.tk_status = status;
+ oir->status = hdr->task.tk_status = status;
if (status >= 0) {
- wdata->res.count = status;
- wdata->verf.committed = oir->committed;
+ hdr->res.count = status;
+ hdr->verf.committed = oir->committed;
} else {
- wdata->header->pnfs_error = status;
+ hdr->pnfs_error = status;
}
objlayout_iodone(oir);
/* must not use oir after this point */
dprintk("%s: Return status %zd committed %d sync=%d\n", __func__,
- status, wdata->verf.committed, sync);
+ status, hdr->verf.committed, sync);
if (sync)
- pnfs_ld_write_done(wdata);
+ pnfs_ld_write_done(hdr);
else {
- INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
- schedule_work(&wdata->task.u.tk_work);
+ INIT_WORK(&hdr->task.u.tk_work, _rpc_write_complete);
+ schedule_work(&hdr->task.u.tk_work);
}
}
@@ -351,17 +350,15 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
* Perform sync or async writes.
*/
enum pnfs_try_status
-objlayout_write_pagelist(struct nfs_pgio_data *wdata,
- int how)
+objlayout_write_pagelist(struct nfs_pgio_header *hdr, int how)
{
- struct nfs_pgio_header *hdr = wdata->header;
int err;
- _fix_verify_io_params(hdr->lseg, &wdata->args.pages,
- &wdata->args.pgbase,
- wdata->args.offset, wdata->args.count);
+ _fix_verify_io_params(hdr->lseg, &hdr->args.pages,
+ &hdr->args.pgbase,
+ hdr->args.offset, hdr->args.count);
- err = objio_write_pagelist(wdata, how);
+ err = objio_write_pagelist(hdr, how);
if (unlikely(err)) {
hdr->pnfs_error = err;
dprintk("%s: Returned Error %d\n", __func__, err);
@@ -577,76 +574,6 @@ loop_done:
dprintk("%s: Return\n", __func__);
}
-
-/*
- * Get Device Info API for io engines
- */
-struct objlayout_deviceinfo {
- struct page *page;
- struct pnfs_osd_deviceaddr da; /* This must be last */
-};
-
-/* Initialize and call nfs_getdeviceinfo, then decode and return a
- * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
- * should be called.
- */
-int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
- struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
- gfp_t gfp_flags)
-{
- struct objlayout_deviceinfo *odi;
- struct pnfs_device pd;
- struct page *page, **pages;
- u32 *p;
- int err;
-
- page = alloc_page(gfp_flags);
- if (!page)
- return -ENOMEM;
-
- pages = &page;
- pd.pages = pages;
-
- memcpy(&pd.dev_id, d_id, sizeof(*d_id));
- pd.layout_type = LAYOUT_OSD2_OBJECTS;
- pd.pages = &page;
- pd.pgbase = 0;
- pd.pglen = PAGE_SIZE;
- pd.mincount = 0;
- pd.maxcount = PAGE_SIZE;
-
- err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd,
- pnfslay->plh_lc_cred);
- dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
- if (err)
- goto err_out;
-
- p = page_address(page);
- odi = kzalloc(sizeof(*odi), gfp_flags);
- if (!odi) {
- err = -ENOMEM;
- goto err_out;
- }
- pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
- odi->page = page;
- *deviceaddr = &odi->da;
- return 0;
-
-err_out:
- __free_page(page);
- return err;
-}
-
-void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
-{
- struct objlayout_deviceinfo *odi = container_of(deviceaddr,
- struct objlayout_deviceinfo,
- da);
-
- __free_page(odi->page);
- kfree(odi);
-}
-
enum {
OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64,
OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1,
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index 01e041029a6c..2641dbad345c 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -6,7 +6,7 @@
* All rights reserved.
*
* Benny Halevy <bhalevy@panasas.com>
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2
@@ -119,8 +119,8 @@ extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
*/
extern void objio_free_result(struct objlayout_io_res *oir);
-extern int objio_read_pagelist(struct nfs_pgio_data *rdata);
-extern int objio_write_pagelist(struct nfs_pgio_data *wdata, int how);
+extern int objio_read_pagelist(struct nfs_pgio_header *rdata);
+extern int objio_write_pagelist(struct nfs_pgio_header *wdata, int how);
/*
* callback API
@@ -149,11 +149,6 @@ extern void objlayout_read_done(struct objlayout_io_res *oir,
extern void objlayout_write_done(struct objlayout_io_res *oir,
ssize_t status, bool sync);
-extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
- struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
- gfp_t gfp_flags);
-extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
-
/*
* exported generic objects function vectors
*/
@@ -168,10 +163,10 @@ extern struct pnfs_layout_segment *objlayout_alloc_lseg(
extern void objlayout_free_lseg(struct pnfs_layout_segment *);
extern enum pnfs_try_status objlayout_read_pagelist(
- struct nfs_pgio_data *);
+ struct nfs_pgio_header *);
extern enum pnfs_try_status objlayout_write_pagelist(
- struct nfs_pgio_data *,
+ struct nfs_pgio_header *,
int how);
extern void objlayout_encode_layoutcommit(
diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
index b3918f7ac34d..f093c7ec983b 100644
--- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
+++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
@@ -5,7 +5,7 @@
* All rights reserved.
*
* Benny Halevy <bhalevy@panasas.com>
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 0be5050638f7..ed0db61f8543 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -116,7 +116,7 @@ __nfs_iocounter_wait(struct nfs_io_counter *c)
if (atomic_read(&c->io_count) == 0)
break;
ret = nfs_wait_bit_killable(&q.key);
- } while (atomic_read(&c->io_count) != 0);
+ } while (atomic_read(&c->io_count) != 0 && !ret);
finish_wait(wq, &q.wait);
return ret;
}
@@ -139,18 +139,49 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
/*
* nfs_page_group_lock - lock the head of the page group
* @req - request in group that is to be locked
+ * @nonblock - if true don't block waiting for lock
*
* this lock must be held if modifying the page group list
+ *
+ * return 0 on success, < 0 on error: -EDELAY if nonblocking or the
+ * result from wait_on_bit_lock
+ *
+ * NOTE: calling with nonblock=false should always have set the
+ * lock bit (see fs/buffer.c and other uses of wait_on_bit_lock
+ * with TASK_UNINTERRUPTIBLE), so there is no need to check the result.
+ */
+int
+nfs_page_group_lock(struct nfs_page *req, bool nonblock)
+{
+ struct nfs_page *head = req->wb_head;
+
+ WARN_ON_ONCE(head != head->wb_head);
+
+ if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags))
+ return 0;
+
+ if (!nonblock)
+ return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
+ TASK_UNINTERRUPTIBLE);
+
+ return -EAGAIN;
+}
+
+/*
+ * nfs_page_group_lock_wait - wait for the lock to clear, but don't grab it
+ * @req - a request in the group
+ *
+ * This is a blocking call to wait for the group lock to be cleared.
*/
void
-nfs_page_group_lock(struct nfs_page *req)
+nfs_page_group_lock_wait(struct nfs_page *req)
{
struct nfs_page *head = req->wb_head;
WARN_ON_ONCE(head != head->wb_head);
- wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
- TASK_UNINTERRUPTIBLE);
+ wait_on_bit(&head->wb_flags, PG_HEADLOCK,
+ TASK_UNINTERRUPTIBLE);
}
/*
@@ -211,7 +242,7 @@ bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
{
bool ret;
- nfs_page_group_lock(req);
+ nfs_page_group_lock(req, false);
ret = nfs_page_group_sync_on_bit_locked(req, bit);
nfs_page_group_unlock(req);
@@ -450,127 +481,85 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
return 0;
}
+ /*
+ * Limit the request size so that we can still allocate a page array
+ * for it without upsetting the slab allocator.
+ */
+ if (((desc->pg_count + req->wb_bytes) >> PAGE_SHIFT) *
+ sizeof(struct page) > PAGE_SIZE)
+ return 0;
+
return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes);
}
EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
-static inline struct nfs_rw_header *NFS_RW_HEADER(struct nfs_pgio_header *hdr)
-{
- return container_of(hdr, struct nfs_rw_header, header);
-}
-
-/**
- * nfs_rw_header_alloc - Allocate a header for a read or write
- * @ops: Read or write function vector
- */
-struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *ops)
+struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *ops)
{
- struct nfs_rw_header *header = ops->rw_alloc_header();
-
- if (header) {
- struct nfs_pgio_header *hdr = &header->header;
+ struct nfs_pgio_header *hdr = ops->rw_alloc_header();
+ if (hdr) {
INIT_LIST_HEAD(&hdr->pages);
spin_lock_init(&hdr->lock);
- atomic_set(&hdr->refcnt, 0);
hdr->rw_ops = ops;
}
- return header;
+ return hdr;
}
-EXPORT_SYMBOL_GPL(nfs_rw_header_alloc);
+EXPORT_SYMBOL_GPL(nfs_pgio_header_alloc);
/*
- * nfs_rw_header_free - Free a read or write header
+ * nfs_pgio_header_free - Free a read or write header
* @hdr: The header to free
*/
-void nfs_rw_header_free(struct nfs_pgio_header *hdr)
-{
- hdr->rw_ops->rw_free_header(NFS_RW_HEADER(hdr));
-}
-EXPORT_SYMBOL_GPL(nfs_rw_header_free);
-
-/**
- * nfs_pgio_data_alloc - Allocate pageio data
- * @hdr: The header making a request
- * @pagecount: Number of pages to create
- */
-static struct nfs_pgio_data *nfs_pgio_data_alloc(struct nfs_pgio_header *hdr,
- unsigned int pagecount)
+void nfs_pgio_header_free(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_data *data, *prealloc;
-
- prealloc = &NFS_RW_HEADER(hdr)->rpc_data;
- if (prealloc->header == NULL)
- data = prealloc;
- else
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data)
- goto out;
-
- if (nfs_pgarray_set(&data->pages, pagecount)) {
- data->header = hdr;
- atomic_inc(&hdr->refcnt);
- } else {
- if (data != prealloc)
- kfree(data);
- data = NULL;
- }
-out:
- return data;
+ hdr->rw_ops->rw_free_header(hdr);
}
+EXPORT_SYMBOL_GPL(nfs_pgio_header_free);
/**
- * nfs_pgio_data_release - Properly free pageio data
- * @data: The data to release
+ * nfs_pgio_data_destroy - make @hdr suitable for reuse
+ *
+ * Frees memory and releases refs from nfs_generic_pgio, so that it may
+ * be called again.
+ *
+ * @hdr: A header that has had nfs_generic_pgio called
*/
-void nfs_pgio_data_release(struct nfs_pgio_data *data)
+void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
- struct nfs_rw_header *pageio_header = NFS_RW_HEADER(hdr);
-
- put_nfs_open_context(data->args.context);
- if (data->pages.pagevec != data->pages.page_array)
- kfree(data->pages.pagevec);
- if (data == &pageio_header->rpc_data) {
- data->header = NULL;
- data = NULL;
- }
- if (atomic_dec_and_test(&hdr->refcnt))
- hdr->completion_ops->completion(hdr);
- /* Note: we only free the rpc_task after callbacks are done.
- * See the comment in rpc_free_task() for why
- */
- kfree(data);
+ if (hdr->args.context)
+ put_nfs_open_context(hdr->args.context);
+ if (hdr->page_array.pagevec != hdr->page_array.page_array)
+ kfree(hdr->page_array.pagevec);
}
-EXPORT_SYMBOL_GPL(nfs_pgio_data_release);
+EXPORT_SYMBOL_GPL(nfs_pgio_data_destroy);
/**
* nfs_pgio_rpcsetup - Set up arguments for a pageio call
- * @data: The pageio data
+ * @hdr: The pageio hdr
* @count: Number of bytes to read
* @offset: Initial offset
* @how: How to commit data (writes only)
* @cinfo: Commit information for the call (writes only)
*/
-static void nfs_pgio_rpcsetup(struct nfs_pgio_data *data,
+static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr,
unsigned int count, unsigned int offset,
int how, struct nfs_commit_info *cinfo)
{
- struct nfs_page *req = data->header->req;
+ struct nfs_page *req = hdr->req;
/* Set up the RPC argument and reply structs
- * NB: take care not to mess about with data->commit et al. */
+ * NB: take care not to mess about with hdr->commit et al. */
- data->args.fh = NFS_FH(data->header->inode);
- data->args.offset = req_offset(req) + offset;
+ hdr->args.fh = NFS_FH(hdr->inode);
+ hdr->args.offset = req_offset(req) + offset;
/* pnfs_set_layoutcommit needs this */
- data->mds_offset = data->args.offset;
- data->args.pgbase = req->wb_pgbase + offset;
- data->args.pages = data->pages.pagevec;
- data->args.count = count;
- data->args.context = get_nfs_open_context(req->wb_context);
- data->args.lock_context = req->wb_lock_context;
- data->args.stable = NFS_UNSTABLE;
+ hdr->mds_offset = hdr->args.offset;
+ hdr->args.pgbase = req->wb_pgbase + offset;
+ hdr->args.pages = hdr->page_array.pagevec;
+ hdr->args.count = count;
+ hdr->args.context = get_nfs_open_context(req->wb_context);
+ hdr->args.lock_context = req->wb_lock_context;
+ hdr->args.stable = NFS_UNSTABLE;
switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
case 0:
break;
@@ -578,59 +567,59 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_data *data,
if (nfs_reqs_to_commit(cinfo))
break;
default:
- data->args.stable = NFS_FILE_SYNC;
+ hdr->args.stable = NFS_FILE_SYNC;
}
- data->res.fattr = &data->fattr;
- data->res.count = count;
- data->res.eof = 0;
- data->res.verf = &data->verf;
- nfs_fattr_init(&data->fattr);
+ hdr->res.fattr = &hdr->fattr;
+ hdr->res.count = count;
+ hdr->res.eof = 0;
+ hdr->res.verf = &hdr->verf;
+ nfs_fattr_init(&hdr->fattr);
}
/**
- * nfs_pgio_prepare - Prepare pageio data to go over the wire
+ * nfs_pgio_prepare - Prepare pageio hdr to go over the wire
* @task: The current task
- * @calldata: pageio data to prepare
+ * @calldata: pageio header to prepare
*/
static void nfs_pgio_prepare(struct rpc_task *task, void *calldata)
{
- struct nfs_pgio_data *data = calldata;
+ struct nfs_pgio_header *hdr = calldata;
int err;
- err = NFS_PROTO(data->header->inode)->pgio_rpc_prepare(task, data);
+ err = NFS_PROTO(hdr->inode)->pgio_rpc_prepare(task, hdr);
if (err)
rpc_exit(task, err);
}
-int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_data *data,
+int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
const struct rpc_call_ops *call_ops, int how, int flags)
{
struct rpc_task *task;
struct rpc_message msg = {
- .rpc_argp = &data->args,
- .rpc_resp = &data->res,
- .rpc_cred = data->header->cred,
+ .rpc_argp = &hdr->args,
+ .rpc_resp = &hdr->res,
+ .rpc_cred = hdr->cred,
};
struct rpc_task_setup task_setup_data = {
.rpc_client = clnt,
- .task = &data->task,
+ .task = &hdr->task,
.rpc_message = &msg,
.callback_ops = call_ops,
- .callback_data = data,
+ .callback_data = hdr,
.workqueue = nfsiod_workqueue,
.flags = RPC_TASK_ASYNC | flags,
};
int ret = 0;
- data->header->rw_ops->rw_initiate(data, &msg, &task_setup_data, how);
+ hdr->rw_ops->rw_initiate(hdr, &msg, &task_setup_data, how);
dprintk("NFS: %5u initiated pgio call "
"(req %s/%llu, %u bytes @ offset %llu)\n",
- data->task.tk_pid,
- data->header->inode->i_sb->s_id,
- (unsigned long long)NFS_FILEID(data->header->inode),
- data->args.count,
- (unsigned long long)data->args.offset);
+ hdr->task.tk_pid,
+ hdr->inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(hdr->inode),
+ hdr->args.count,
+ (unsigned long long)hdr->args.offset);
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task)) {
@@ -657,22 +646,23 @@ static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
struct nfs_pgio_header *hdr)
{
set_bit(NFS_IOHDR_REDO, &hdr->flags);
- nfs_pgio_data_release(hdr->data);
- hdr->data = NULL;
+ nfs_pgio_data_destroy(hdr);
+ hdr->completion_ops->completion(hdr);
desc->pg_completion_ops->error_cleanup(&desc->pg_list);
return -ENOMEM;
}
/**
* nfs_pgio_release - Release pageio data
- * @calldata: The pageio data to release
+ * @calldata: The pageio header to release
*/
static void nfs_pgio_release(void *calldata)
{
- struct nfs_pgio_data *data = calldata;
- if (data->header->rw_ops->rw_release)
- data->header->rw_ops->rw_release(data);
- nfs_pgio_data_release(data);
+ struct nfs_pgio_header *hdr = calldata;
+ if (hdr->rw_ops->rw_release)
+ hdr->rw_ops->rw_release(hdr);
+ nfs_pgio_data_destroy(hdr);
+ hdr->completion_ops->completion(hdr);
}
/**
@@ -713,22 +703,22 @@ EXPORT_SYMBOL_GPL(nfs_pageio_init);
/**
* nfs_pgio_result - Basic pageio error handling
* @task: The task that ran
- * @calldata: Pageio data to check
+ * @calldata: Pageio header to check
*/
static void nfs_pgio_result(struct rpc_task *task, void *calldata)
{
- struct nfs_pgio_data *data = calldata;
- struct inode *inode = data->header->inode;
+ struct nfs_pgio_header *hdr = calldata;
+ struct inode *inode = hdr->inode;
dprintk("NFS: %s: %5u, (status %d)\n", __func__,
task->tk_pid, task->tk_status);
- if (data->header->rw_ops->rw_done(task, data, inode) != 0)
+ if (hdr->rw_ops->rw_done(task, hdr, inode) != 0)
return;
if (task->tk_status < 0)
- nfs_set_pgio_error(data->header, task->tk_status, data->args.offset);
+ nfs_set_pgio_error(hdr, task->tk_status, hdr->args.offset);
else
- data->header->rw_ops->rw_result(task, data);
+ hdr->rw_ops->rw_result(task, hdr);
}
/*
@@ -743,32 +733,41 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
struct nfs_pgio_header *hdr)
{
struct nfs_page *req;
- struct page **pages;
- struct nfs_pgio_data *data;
+ struct page **pages,
+ *last_page;
struct list_head *head = &desc->pg_list;
struct nfs_commit_info cinfo;
+ unsigned int pagecount, pageused;
- data = nfs_pgio_data_alloc(hdr, nfs_page_array_len(desc->pg_base,
- desc->pg_count));
- if (!data)
+ pagecount = nfs_page_array_len(desc->pg_base, desc->pg_count);
+ if (!nfs_pgarray_set(&hdr->page_array, pagecount))
return nfs_pgio_error(desc, hdr);
nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
- pages = data->pages.pagevec;
+ pages = hdr->page_array.pagevec;
+ last_page = NULL;
+ pageused = 0;
while (!list_empty(head)) {
req = nfs_list_entry(head->next);
nfs_list_remove_request(req);
nfs_list_add_request(req, &hdr->pages);
- *pages++ = req->wb_page;
+
+ if (!last_page || last_page != req->wb_page) {
+ pageused++;
+ if (pageused > pagecount)
+ break;
+ *pages++ = last_page = req->wb_page;
+ }
}
+ if (WARN_ON_ONCE(pageused != pagecount))
+ return nfs_pgio_error(desc, hdr);
if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
(desc->pg_moreio || nfs_reqs_to_commit(&cinfo)))
desc->pg_ioflags &= ~FLUSH_COND_STABLE;
/* Set up the argument struct */
- nfs_pgio_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
- hdr->data = data;
+ nfs_pgio_rpcsetup(hdr, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
desc->pg_rpc_callops = &nfs_pgio_common_ops;
return 0;
}
@@ -776,25 +775,20 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio);
static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
{
- struct nfs_rw_header *rw_hdr;
struct nfs_pgio_header *hdr;
int ret;
- rw_hdr = nfs_rw_header_alloc(desc->pg_rw_ops);
- if (!rw_hdr) {
+ hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
+ if (!hdr) {
desc->pg_completion_ops->error_cleanup(&desc->pg_list);
return -ENOMEM;
}
- hdr = &rw_hdr->header;
- nfs_pgheader_init(desc, hdr, nfs_rw_header_free);
- atomic_inc(&hdr->refcnt);
+ nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
ret = nfs_generic_pgio(desc, hdr);
if (ret == 0)
ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode),
- hdr->data, desc->pg_rpc_callops,
+ hdr, desc->pg_rpc_callops,
desc->pg_ioflags, 0);
- if (atomic_dec_and_test(&hdr->refcnt))
- hdr->completion_ops->completion(hdr);
return ret;
}
@@ -837,6 +831,14 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
return false;
if (req_offset(req) != req_offset(prev) + prev->wb_bytes)
return false;
+ if (req->wb_page == prev->wb_page) {
+ if (req->wb_pgbase != prev->wb_pgbase + prev->wb_bytes)
+ return false;
+ } else {
+ if (req->wb_pgbase != 0 ||
+ prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
+ return false;
+ }
}
size = pgio->pg_ops->pg_test(pgio, prev, req);
WARN_ON_ONCE(size > req->wb_bytes);
@@ -908,7 +910,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
unsigned int bytes_left = 0;
unsigned int offset, pgbase;
- nfs_page_group_lock(req);
+ nfs_page_group_lock(req, false);
subreq = req;
bytes_left = subreq->wb_bytes;
@@ -930,7 +932,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
if (desc->pg_recoalesce)
return 0;
/* retry add_request for this subreq */
- nfs_page_group_lock(req);
+ nfs_page_group_lock(req, false);
continue;
}
@@ -1005,7 +1007,38 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
} while (ret);
return ret;
}
-EXPORT_SYMBOL_GPL(nfs_pageio_add_request);
+
+/*
+ * nfs_pageio_resend - Transfer requests to new descriptor and resend
+ * @hdr - the pgio header to move request from
+ * @desc - the pageio descriptor to add requests to
+ *
+ * Try to move each request (nfs_page) from @hdr to @desc then attempt
+ * to send them.
+ *
+ * Returns 0 on success and < 0 on error.
+ */
+int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
+ struct nfs_pgio_header *hdr)
+{
+ LIST_HEAD(failed);
+
+ desc->pg_dreq = hdr->dreq;
+ while (!list_empty(&hdr->pages)) {
+ struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+
+ nfs_list_remove_request(req);
+ if (!nfs_pageio_add_request(desc, req))
+ nfs_list_add_request(req, &failed);
+ }
+ nfs_pageio_complete(desc);
+ if (!list_empty(&failed)) {
+ list_move(&failed, &hdr->pages);
+ return -EIO;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_pageio_resend);
/**
* nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor
@@ -1021,7 +1054,6 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
break;
}
}
-EXPORT_SYMBOL_GPL(nfs_pageio_complete);
/**
* nfs_pageio_cond_complete - Conditional I/O completion
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index a8914b335617..0a5dda4d85c2 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -361,6 +361,44 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
}
EXPORT_SYMBOL_GPL(pnfs_put_lseg);
+static void pnfs_free_lseg_async_work(struct work_struct *work)
+{
+ struct pnfs_layout_segment *lseg;
+ struct pnfs_layout_hdr *lo;
+
+ lseg = container_of(work, struct pnfs_layout_segment, pls_work);
+ lo = lseg->pls_layout;
+
+ pnfs_free_lseg(lseg);
+ pnfs_put_layout_hdr(lo);
+}
+
+static void pnfs_free_lseg_async(struct pnfs_layout_segment *lseg)
+{
+ INIT_WORK(&lseg->pls_work, pnfs_free_lseg_async_work);
+ schedule_work(&lseg->pls_work);
+}
+
+void
+pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg)
+{
+ if (!lseg)
+ return;
+
+ assert_spin_locked(&lseg->pls_layout->plh_inode->i_lock);
+
+ dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
+ atomic_read(&lseg->pls_refcount),
+ test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+ if (atomic_dec_and_test(&lseg->pls_refcount)) {
+ struct pnfs_layout_hdr *lo = lseg->pls_layout;
+ pnfs_get_layout_hdr(lo);
+ pnfs_layout_remove_lseg(lo, lseg);
+ pnfs_free_lseg_async(lseg);
+ }
+}
+EXPORT_SYMBOL_GPL(pnfs_put_lseg_locked);
+
static u64
end_offset(u64 start, u64 len)
{
@@ -577,6 +615,9 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
dprintk("%s freeing layout for inode %lu\n", __func__,
lo->plh_inode->i_ino);
inode = lo->plh_inode;
+
+ pnfs_layoutcommit_inode(inode, false);
+
spin_lock(&inode->i_lock);
list_del_init(&lo->plh_bulk_destroy);
lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
@@ -665,17 +706,6 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
return (s32)(s1 - s2) > 0;
}
-static void
-pnfs_verify_layout_stateid(struct pnfs_layout_hdr *lo,
- const nfs4_stateid *new,
- struct list_head *free_me_list)
-{
- if (nfs4_stateid_match_other(&lo->plh_stateid, new))
- return;
- /* Layout is new! Kill existing layout segments */
- pnfs_mark_matching_lsegs_invalid(lo, free_me_list, NULL);
-}
-
/* update lo->plh_stateid with new if is more recent */
void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
@@ -732,7 +762,8 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
status = -EAGAIN;
} else if (!nfs4_valid_open_stateid(open_state)) {
status = -EBADF;
- } else if (list_empty(&lo->plh_segs)) {
+ } else if (list_empty(&lo->plh_segs) ||
+ test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
int seq;
do {
@@ -847,6 +878,16 @@ _pnfs_return_layout(struct inode *ino)
empty = list_empty(&lo->plh_segs);
pnfs_clear_layoutcommit(ino, &tmp_list);
pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
+
+ if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+ NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);
+ }
+
/* Don't send a LAYOUTRETURN if list was initially empty */
if (empty) {
spin_unlock(&ino->i_lock);
@@ -854,6 +895,8 @@ _pnfs_return_layout(struct inode *ino)
dprintk("NFS: %s no layout segments to return\n", __func__);
goto out;
}
+
+ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
lo->plh_block_lgets++;
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&tmp_list);
@@ -1341,25 +1384,41 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
goto out;
}
+ init_lseg(lo, lseg);
+ lseg->pls_range = res->range;
+
spin_lock(&ino->i_lock);
if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
dprintk("%s forget reply due to recall\n", __func__);
goto out_forget_reply;
}
- if (pnfs_layoutgets_blocked(lo, 1) ||
- pnfs_layout_stateid_blocked(lo, &res->stateid)) {
+ if (pnfs_layoutgets_blocked(lo, 1)) {
dprintk("%s forget reply due to state\n", __func__);
goto out_forget_reply;
}
- /* Check that the new stateid matches the old stateid */
- pnfs_verify_layout_stateid(lo, &res->stateid, &free_me);
- /* Done processing layoutget. Set the layout stateid */
- pnfs_set_layout_stateid(lo, &res->stateid, false);
+ if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
+ /* existing state ID, make sure the sequence number matches. */
+ if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
+ dprintk("%s forget reply due to sequence\n", __func__);
+ goto out_forget_reply;
+ }
+ pnfs_set_layout_stateid(lo, &res->stateid, false);
+ } else {
+ /*
+ * We got an entirely new state ID. Mark all segments for the
+ * inode invalid, and don't bother validating the stateid
+ * sequence number.
+ */
+ pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL);
+
+ nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
+ lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
+ }
+
+ clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
- init_lseg(lo, lseg);
- lseg->pls_range = res->range;
pnfs_get_lseg(lseg);
pnfs_layout_insert_lseg(lo, lseg);
@@ -1470,41 +1529,19 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
-int pnfs_write_done_resend_to_mds(struct inode *inode,
- struct list_head *head,
- const struct nfs_pgio_completion_ops *compl_ops,
- struct nfs_direct_req *dreq)
+int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
{
struct nfs_pageio_descriptor pgio;
- LIST_HEAD(failed);
/* Resend all requests through the MDS */
- nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, true, compl_ops);
- pgio.pg_dreq = dreq;
- while (!list_empty(head)) {
- struct nfs_page *req = nfs_list_entry(head->next);
-
- nfs_list_remove_request(req);
- if (!nfs_pageio_add_request(&pgio, req))
- nfs_list_add_request(req, &failed);
- }
- nfs_pageio_complete(&pgio);
-
- if (!list_empty(&failed)) {
- /* For some reason our attempt to resend pages. Mark the
- * overall send request as having failed, and let
- * nfs_writeback_release_full deal with the error.
- */
- list_move(&failed, head);
- return -EIO;
- }
- return 0;
+ nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
+ hdr->completion_ops);
+ return nfs_pageio_resend(&pgio, hdr);
}
EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
-static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data)
+static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
dprintk("pnfs write error = %d\n", hdr->pnfs_error);
if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
@@ -1512,50 +1549,42 @@ static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data)
pnfs_return_layout(hdr->inode);
}
if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
- data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
- &hdr->pages,
- hdr->completion_ops,
- hdr->dreq);
+ hdr->task.tk_status = pnfs_write_done_resend_to_mds(hdr);
}
/*
* Called by non rpc-based layout drivers
*/
-void pnfs_ld_write_done(struct nfs_pgio_data *data)
+void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
-
- trace_nfs4_pnfs_write(data, hdr->pnfs_error);
+ trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
if (!hdr->pnfs_error) {
- pnfs_set_layoutcommit(data);
- hdr->mds_ops->rpc_call_done(&data->task, data);
+ pnfs_set_layoutcommit(hdr);
+ hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
} else
- pnfs_ld_handle_write_error(data);
- hdr->mds_ops->rpc_release(data);
+ pnfs_ld_handle_write_error(hdr);
+ hdr->mds_ops->rpc_release(hdr);
}
EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
static void
pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_data *data)
+ struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
-
if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
list_splice_tail_init(&hdr->pages, &desc->pg_list);
nfs_pageio_reset_write_mds(desc);
desc->pg_recoalesce = 1;
}
- nfs_pgio_data_release(data);
+ nfs_pgio_data_destroy(hdr);
}
static enum pnfs_try_status
-pnfs_try_to_write_data(struct nfs_pgio_data *wdata,
+pnfs_try_to_write_data(struct nfs_pgio_header *hdr,
const struct rpc_call_ops *call_ops,
struct pnfs_layout_segment *lseg,
int how)
{
- struct nfs_pgio_header *hdr = wdata->header;
struct inode *inode = hdr->inode;
enum pnfs_try_status trypnfs;
struct nfs_server *nfss = NFS_SERVER(inode);
@@ -1563,8 +1592,8 @@ pnfs_try_to_write_data(struct nfs_pgio_data *wdata,
hdr->mds_ops = call_ops;
dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
- inode->i_ino, wdata->args.count, wdata->args.offset, how);
- trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how);
+ inode->i_ino, hdr->args.count, hdr->args.offset, how);
+ trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how);
if (trypnfs != PNFS_NOT_ATTEMPTED)
nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
@@ -1575,139 +1604,105 @@ static void
pnfs_do_write(struct nfs_pageio_descriptor *desc,
struct nfs_pgio_header *hdr, int how)
{
- struct nfs_pgio_data *data = hdr->data;
const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
struct pnfs_layout_segment *lseg = desc->pg_lseg;
enum pnfs_try_status trypnfs;
desc->pg_lseg = NULL;
- trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
+ trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
if (trypnfs == PNFS_NOT_ATTEMPTED)
- pnfs_write_through_mds(desc, data);
+ pnfs_write_through_mds(desc, hdr);
pnfs_put_lseg(lseg);
}
static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
{
pnfs_put_lseg(hdr->lseg);
- nfs_rw_header_free(hdr);
+ nfs_pgio_header_free(hdr);
}
EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
int
pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
{
- struct nfs_rw_header *whdr;
struct nfs_pgio_header *hdr;
int ret;
- whdr = nfs_rw_header_alloc(desc->pg_rw_ops);
- if (!whdr) {
+ hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
+ if (!hdr) {
desc->pg_completion_ops->error_cleanup(&desc->pg_list);
pnfs_put_lseg(desc->pg_lseg);
desc->pg_lseg = NULL;
return -ENOMEM;
}
- hdr = &whdr->header;
nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
- atomic_inc(&hdr->refcnt);
ret = nfs_generic_pgio(desc, hdr);
if (ret != 0) {
pnfs_put_lseg(desc->pg_lseg);
desc->pg_lseg = NULL;
} else
pnfs_do_write(desc, hdr, desc->pg_ioflags);
- if (atomic_dec_and_test(&hdr->refcnt))
- hdr->completion_ops->completion(hdr);
return ret;
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
-int pnfs_read_done_resend_to_mds(struct inode *inode,
- struct list_head *head,
- const struct nfs_pgio_completion_ops *compl_ops,
- struct nfs_direct_req *dreq)
+int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *hdr)
{
struct nfs_pageio_descriptor pgio;
- LIST_HEAD(failed);
/* Resend all requests through the MDS */
- nfs_pageio_init_read(&pgio, inode, true, compl_ops);
- pgio.pg_dreq = dreq;
- while (!list_empty(head)) {
- struct nfs_page *req = nfs_list_entry(head->next);
-
- nfs_list_remove_request(req);
- if (!nfs_pageio_add_request(&pgio, req))
- nfs_list_add_request(req, &failed);
- }
- nfs_pageio_complete(&pgio);
-
- if (!list_empty(&failed)) {
- list_move(&failed, head);
- return -EIO;
- }
- return 0;
+ nfs_pageio_init_read(&pgio, hdr->inode, true, hdr->completion_ops);
+ return nfs_pageio_resend(&pgio, hdr);
}
EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
-static void pnfs_ld_handle_read_error(struct nfs_pgio_data *data)
+static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
-
dprintk("pnfs read error = %d\n", hdr->pnfs_error);
if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
PNFS_LAYOUTRET_ON_ERROR) {
pnfs_return_layout(hdr->inode);
}
if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
- data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
- &hdr->pages,
- hdr->completion_ops,
- hdr->dreq);
+ hdr->task.tk_status = pnfs_read_done_resend_to_mds(hdr);
}
/*
* Called by non rpc-based layout drivers
*/
-void pnfs_ld_read_done(struct nfs_pgio_data *data)
+void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
-
- trace_nfs4_pnfs_read(data, hdr->pnfs_error);
+ trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
if (likely(!hdr->pnfs_error)) {
- __nfs4_read_done_cb(data);
- hdr->mds_ops->rpc_call_done(&data->task, data);
+ __nfs4_read_done_cb(hdr);
+ hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
} else
- pnfs_ld_handle_read_error(data);
- hdr->mds_ops->rpc_release(data);
+ pnfs_ld_handle_read_error(hdr);
+ hdr->mds_ops->rpc_release(hdr);
}
EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
static void
pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_data *data)
+ struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
-
if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
list_splice_tail_init(&hdr->pages, &desc->pg_list);
nfs_pageio_reset_read_mds(desc);
desc->pg_recoalesce = 1;
}
- nfs_pgio_data_release(data);
+ nfs_pgio_data_destroy(hdr);
}
/*
* Call the appropriate parallel I/O subsystem read function.
*/
static enum pnfs_try_status
-pnfs_try_to_read_data(struct nfs_pgio_data *rdata,
+pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
const struct rpc_call_ops *call_ops,
struct pnfs_layout_segment *lseg)
{
- struct nfs_pgio_header *hdr = rdata->header;
struct inode *inode = hdr->inode;
struct nfs_server *nfss = NFS_SERVER(inode);
enum pnfs_try_status trypnfs;
@@ -1715,9 +1710,9 @@ pnfs_try_to_read_data(struct nfs_pgio_data *rdata,
hdr->mds_ops = call_ops;
dprintk("%s: Reading ino:%lu %u@%llu\n",
- __func__, inode->i_ino, rdata->args.count, rdata->args.offset);
+ __func__, inode->i_ino, hdr->args.count, hdr->args.offset);
- trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata);
+ trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr);
if (trypnfs != PNFS_NOT_ATTEMPTED)
nfs_inc_stats(inode, NFSIOS_PNFS_READ);
dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
@@ -1727,52 +1722,46 @@ pnfs_try_to_read_data(struct nfs_pgio_data *rdata,
static void
pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_data *data = hdr->data;
const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
struct pnfs_layout_segment *lseg = desc->pg_lseg;
enum pnfs_try_status trypnfs;
desc->pg_lseg = NULL;
- trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
+ trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
if (trypnfs == PNFS_NOT_ATTEMPTED)
- pnfs_read_through_mds(desc, data);
+ pnfs_read_through_mds(desc, hdr);
pnfs_put_lseg(lseg);
}
static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
{
pnfs_put_lseg(hdr->lseg);
- nfs_rw_header_free(hdr);
+ nfs_pgio_header_free(hdr);
}
EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
int
pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
{
- struct nfs_rw_header *rhdr;
struct nfs_pgio_header *hdr;
int ret;
- rhdr = nfs_rw_header_alloc(desc->pg_rw_ops);
- if (!rhdr) {
+ hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
+ if (!hdr) {
desc->pg_completion_ops->error_cleanup(&desc->pg_list);
ret = -ENOMEM;
pnfs_put_lseg(desc->pg_lseg);
desc->pg_lseg = NULL;
return ret;
}
- hdr = &rhdr->header;
nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
- atomic_inc(&hdr->refcnt);
ret = nfs_generic_pgio(desc, hdr);
if (ret != 0) {
pnfs_put_lseg(desc->pg_lseg);
desc->pg_lseg = NULL;
} else
pnfs_do_read(desc, hdr);
- if (atomic_dec_and_test(&hdr->refcnt))
- hdr->completion_ops->completion(hdr);
return ret;
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
@@ -1820,12 +1809,11 @@ void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
void
-pnfs_set_layoutcommit(struct nfs_pgio_data *wdata)
+pnfs_set_layoutcommit(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = wdata->header;
struct inode *inode = hdr->inode;
struct nfs_inode *nfsi = NFS_I(inode);
- loff_t end_pos = wdata->mds_offset + wdata->res.count;
+ loff_t end_pos = hdr->mds_offset + hdr->res.count;
bool mark_as_dirty = false;
spin_lock(&inode->i_lock);
@@ -1851,6 +1839,35 @@ pnfs_set_layoutcommit(struct nfs_pgio_data *wdata)
}
EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
+void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data)
+{
+ struct inode *inode = data->inode;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ bool mark_as_dirty = false;
+
+ spin_lock(&inode->i_lock);
+ if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
+ mark_as_dirty = true;
+ dprintk("%s: Set layoutcommit for inode %lu ",
+ __func__, inode->i_ino);
+ }
+ if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &data->lseg->pls_flags)) {
+ /* references matched in nfs4_layoutcommit_release */
+ pnfs_get_lseg(data->lseg);
+ }
+ if (data->lwb > nfsi->layout->plh_lwb)
+ nfsi->layout->plh_lwb = data->lwb;
+ spin_unlock(&inode->i_lock);
+ dprintk("%s: lseg %p end_pos %llu\n",
+ __func__, data->lseg, nfsi->layout->plh_lwb);
+
+ /* if pnfs_layoutcommit_inode() runs between inode locks, the next one
+ * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
+ if (mark_as_dirty)
+ mark_inode_dirty_sync(inode);
+}
+EXPORT_SYMBOL_GPL(pnfs_commit_set_layoutcommit);
+
void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
{
struct nfs_server *nfss = NFS_SERVER(data->args.inode);
@@ -1871,6 +1888,7 @@ void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
int
pnfs_layoutcommit_inode(struct inode *inode, bool sync)
{
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
struct nfs4_layoutcommit_data *data;
struct nfs_inode *nfsi = NFS_I(inode);
loff_t end_pos;
@@ -1921,6 +1939,20 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
data->args.lastbytewritten = end_pos - 1;
data->res.server = NFS_SERVER(inode);
+ if (ld->prepare_layoutcommit) {
+ status = ld->prepare_layoutcommit(&data->args);
+ if (status) {
+ spin_lock(&inode->i_lock);
+ if (end_pos < nfsi->layout->plh_lwb)
+ nfsi->layout->plh_lwb = end_pos;
+ spin_unlock(&inode->i_lock);
+ put_rpccred(data->cred);
+ set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
+ goto clear_layoutcommitting;
+ }
+ }
+
+
status = nfs4_proc_layoutcommit(data, sync);
out:
if (status)
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 4fb309a2b4c4..9ae5b765b073 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -32,6 +32,7 @@
#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
+#include <linux/workqueue.h>
enum {
NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
@@ -46,6 +47,7 @@ struct pnfs_layout_segment {
atomic_t pls_refcount;
unsigned long pls_flags;
struct pnfs_layout_hdr *pls_layout;
+ struct work_struct pls_work;
};
enum pnfs_try_status {
@@ -63,12 +65,15 @@ enum {
NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
NFS_LAYOUT_ROC, /* some lseg had roc bit set */
NFS_LAYOUT_RETURN, /* Return this layout ASAP */
+ NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
};
enum layoutdriver_policy_flags {
- /* Should the pNFS client commit and return the layout upon a setattr */
+ /* Should the pNFS client commit and return the layout upon truncate to
+ * a smaller size */
PNFS_LAYOUTRET_ON_SETATTR = 1 << 0,
PNFS_LAYOUTRET_ON_ERROR = 1 << 1,
+ PNFS_READ_WHOLE_PAGE = 1 << 2,
};
struct nfs4_deviceid_node;
@@ -80,6 +85,7 @@ struct pnfs_layoutdriver_type {
const char *name;
struct module *owner;
unsigned flags;
+ unsigned max_deviceinfo_size;
int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
int (*clear_layoutdriver) (struct nfs_server *);
@@ -90,6 +96,9 @@ struct pnfs_layoutdriver_type {
struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
void (*free_lseg) (struct pnfs_layout_segment *lseg);
+ void (*return_range) (struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_range *range);
+
/* test for nfs page cache coalescing */
const struct nfs_pageio_ops *pg_read_ops;
const struct nfs_pageio_ops *pg_write_ops;
@@ -104,6 +113,8 @@ struct pnfs_layoutdriver_type {
int max);
void (*recover_commit_reqs) (struct list_head *list,
struct nfs_commit_info *cinfo);
+ struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo,
+ struct page *page);
int (*commit_pagelist)(struct inode *inode,
struct list_head *mds_pages,
int how,
@@ -113,18 +124,21 @@ struct pnfs_layoutdriver_type {
* Return PNFS_ATTEMPTED to indicate the layout code has attempted
* I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
*/
- enum pnfs_try_status (*read_pagelist) (struct nfs_pgio_data *nfs_data);
- enum pnfs_try_status (*write_pagelist) (struct nfs_pgio_data *nfs_data, int how);
+ enum pnfs_try_status (*read_pagelist)(struct nfs_pgio_header *);
+ enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int);
void (*free_deviceid_node) (struct nfs4_deviceid_node *);
+ struct nfs4_deviceid_node * (*alloc_deviceid_node)
+ (struct nfs_server *server, struct pnfs_device *pdev,
+ gfp_t gfp_flags);
void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
struct xdr_stream *xdr,
const struct nfs4_layoutreturn_args *args);
void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data);
-
- void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
+ int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args);
+ void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo,
struct xdr_stream *xdr,
const struct nfs4_layoutcommit_args *args);
};
@@ -167,9 +181,6 @@ extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
/* nfs4proc.c */
-extern int nfs4_proc_getdevicelist(struct nfs_server *server,
- const struct nfs_fh *fh,
- struct pnfs_devicelist *devlist);
extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
struct pnfs_device *dev,
struct rpc_cred *cred);
@@ -179,6 +190,7 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
/* pnfs.c */
void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
+void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg);
void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
void unset_pnfs_layoutdriver(struct nfs_server *);
@@ -213,13 +225,14 @@ bool pnfs_roc(struct inode *ino);
void pnfs_roc_release(struct inode *ino);
void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
-void pnfs_set_layoutcommit(struct nfs_pgio_data *wdata);
+void pnfs_set_layoutcommit(struct nfs_pgio_header *);
+void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data);
void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
int _pnfs_return_layout(struct inode *);
int pnfs_commit_and_return_layout(struct inode *);
-void pnfs_ld_write_done(struct nfs_pgio_data *);
-void pnfs_ld_read_done(struct nfs_pgio_data *);
+void pnfs_ld_write_done(struct nfs_pgio_header *);
+void pnfs_ld_read_done(struct nfs_pgio_header *);
struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
struct nfs_open_context *ctx,
loff_t pos,
@@ -228,12 +241,8 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
gfp_t gfp_flags);
void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
-int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head,
- const struct nfs_pgio_completion_ops *compl_ops,
- struct nfs_direct_req *dreq);
-int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head,
- const struct nfs_pgio_completion_ops *compl_ops,
- struct nfs_direct_req *dreq);
+int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *);
+int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
/* nfs4_deviceid_flags */
@@ -254,18 +263,25 @@ struct nfs4_deviceid_node {
atomic_t ref;
};
-struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
+struct nfs4_deviceid_node *
+nfs4_find_get_deviceid(struct nfs_server *server,
+ const struct nfs4_deviceid *id, struct rpc_cred *cred,
+ gfp_t gfp_mask);
void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
-void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
- const struct pnfs_layoutdriver_type *,
- const struct nfs_client *,
+void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *,
const struct nfs4_deviceid *);
-struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *);
bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
void nfs4_deviceid_purge_client(const struct nfs_client *);
+static inline struct nfs4_deviceid_node *
+nfs4_get_deviceid(struct nfs4_deviceid_node *d)
+{
+ atomic_inc(&d->ref);
+ return d;
+}
+
static inline struct pnfs_layout_segment *
pnfs_get_lseg(struct pnfs_layout_segment *lseg)
{
@@ -345,6 +361,17 @@ pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
}
+static inline struct nfs_page *
+pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
+ struct page *page)
+{
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+
+ if (ld == NULL || ld->search_commit_reqs == NULL)
+ return NULL;
+ return ld->search_commit_reqs(cinfo, page);
+}
+
/* Should the pNFS client commit and return the layout upon a setattr */
static inline bool
pnfs_ld_layoutret_on_setattr(struct inode *inode)
@@ -356,6 +383,14 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
}
static inline bool
+pnfs_ld_read_whole_page(struct inode *inode)
+{
+ if (!pnfs_enabled_sb(NFS_SERVER(inode)))
+ return false;
+ return NFS_SERVER(inode)->pnfs_curr_ld->flags & PNFS_READ_WHOLE_PAGE;
+}
+
+static inline bool
pnfs_layoutcommit_outstanding(struct inode *inode)
{
struct nfs_inode *nfsi = NFS_I(inode);
@@ -427,6 +462,12 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
}
static inline bool
+pnfs_ld_read_whole_page(struct inode *inode)
+{
+ return false;
+}
+
+static inline bool
pnfs_roc(struct inode *ino)
{
return false;
@@ -496,6 +537,13 @@ pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
{
}
+static inline struct nfs_page *
+pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
+ struct page *page)
+{
+ return NULL;
+}
+
static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
{
return 0;
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 6da209bd9408..aa2ec0015183 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -29,6 +29,9 @@
*/
#include <linux/export.h>
+#include <linux/nfs_fs.h>
+#include "nfs4session.h"
+#include "internal.h"
#include "pnfs.h"
#define NFSDBG_FACILITY NFSDBG_PNFS
@@ -89,6 +92,74 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
return NULL;
}
+static struct nfs4_deviceid_node *
+nfs4_get_device_info(struct nfs_server *server,
+ const struct nfs4_deviceid *dev_id,
+ struct rpc_cred *cred, gfp_t gfp_flags)
+{
+ struct nfs4_deviceid_node *d = NULL;
+ struct pnfs_device *pdev = NULL;
+ struct page **pages = NULL;
+ u32 max_resp_sz;
+ int max_pages;
+ int rc, i;
+
+ /*
+ * Use the session max response size as the basis for setting
+ * GETDEVICEINFO's maxcount
+ */
+ max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+ if (server->pnfs_curr_ld->max_deviceinfo_size &&
+ server->pnfs_curr_ld->max_deviceinfo_size < max_resp_sz)
+ max_resp_sz = server->pnfs_curr_ld->max_deviceinfo_size;
+ max_pages = nfs_page_array_len(0, max_resp_sz);
+ dprintk("%s: server %p max_resp_sz %u max_pages %d\n",
+ __func__, server, max_resp_sz, max_pages);
+
+ pdev = kzalloc(sizeof(*pdev), gfp_flags);
+ if (!pdev)
+ return NULL;
+
+ pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
+ if (!pages)
+ goto out_free_pdev;
+
+ for (i = 0; i < max_pages; i++) {
+ pages[i] = alloc_page(gfp_flags);
+ if (!pages[i])
+ goto out_free_pages;
+ }
+
+ memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
+ pdev->layout_type = server->pnfs_curr_ld->id;
+ pdev->pages = pages;
+ pdev->pgbase = 0;
+ pdev->pglen = max_resp_sz;
+ pdev->mincount = 0;
+ pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
+
+ rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
+ dprintk("%s getdevice info returns %d\n", __func__, rc);
+ if (rc)
+ goto out_free_pages;
+
+ /*
+ * Found new device, need to decode it and then add it to the
+ * list of known devices for this mountpoint.
+ */
+ d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev,
+ gfp_flags);
+
+out_free_pages:
+ for (i = 0; i < max_pages; i++)
+ __free_page(pages[i]);
+ kfree(pages);
+out_free_pdev:
+ kfree(pdev);
+ dprintk("<-- %s d %p\n", __func__, d);
+ return d;
+}
+
/*
* Lookup a deviceid in cache and get a reference count on it if found
*
@@ -96,14 +167,14 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
* @id deviceid to look up
*/
static struct nfs4_deviceid_node *
-_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
- const struct nfs_client *clp, const struct nfs4_deviceid *id,
- long hash)
+__nfs4_find_get_deviceid(struct nfs_server *server,
+ const struct nfs4_deviceid *id, long hash)
{
struct nfs4_deviceid_node *d;
rcu_read_lock();
- d = _lookup_deviceid(ld, clp, id, hash);
+ d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id,
+ hash);
if (d != NULL)
atomic_inc(&d->ref);
rcu_read_unlock();
@@ -111,10 +182,33 @@ _find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
}
struct nfs4_deviceid_node *
-nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
- const struct nfs_client *clp, const struct nfs4_deviceid *id)
+nfs4_find_get_deviceid(struct nfs_server *server,
+ const struct nfs4_deviceid *id, struct rpc_cred *cred,
+ gfp_t gfp_mask)
{
- return _find_get_deviceid(ld, clp, id, nfs4_deviceid_hash(id));
+ long hash = nfs4_deviceid_hash(id);
+ struct nfs4_deviceid_node *d, *new;
+
+ d = __nfs4_find_get_deviceid(server, id, hash);
+ if (d)
+ return d;
+
+ new = nfs4_get_device_info(server, id, cred, gfp_mask);
+ if (!new)
+ return new;
+
+ spin_lock(&nfs4_deviceid_lock);
+ d = __nfs4_find_get_deviceid(server, id, hash);
+ if (d) {
+ spin_unlock(&nfs4_deviceid_lock);
+ server->pnfs_curr_ld->free_deviceid_node(new);
+ return d;
+ }
+ hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
+ atomic_inc(&new->ref);
+ spin_unlock(&nfs4_deviceid_lock);
+
+ return new;
}
EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
@@ -151,15 +245,13 @@ nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
void
-nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
- const struct pnfs_layoutdriver_type *ld,
- const struct nfs_client *nfs_client,
+nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, struct nfs_server *server,
const struct nfs4_deviceid *id)
{
INIT_HLIST_NODE(&d->node);
INIT_HLIST_NODE(&d->tmpnode);
- d->ld = ld;
- d->nfs_client = nfs_client;
+ d->ld = server->pnfs_curr_ld;
+ d->nfs_client = server->nfs_client;
d->flags = 0;
d->deviceid = *id;
atomic_set(&d->ref, 1);
@@ -167,39 +259,6 @@ nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
/*
- * Uniquely initialize and insert a deviceid node into cache
- *
- * @new new deviceid node
- * Note that the caller must set up the following members:
- * new->ld
- * new->nfs_client
- * new->deviceid
- *
- * @ret the inserted node, if none found, otherwise, the found entry.
- */
-struct nfs4_deviceid_node *
-nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new)
-{
- struct nfs4_deviceid_node *d;
- long hash;
-
- spin_lock(&nfs4_deviceid_lock);
- hash = nfs4_deviceid_hash(&new->deviceid);
- d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash);
- if (d) {
- spin_unlock(&nfs4_deviceid_lock);
- return d;
- }
-
- hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
- spin_unlock(&nfs4_deviceid_lock);
- atomic_inc(&new->ref);
-
- return new;
-}
-EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node);
-
-/*
* Dereference a deviceid node and delete it when its reference count drops
* to zero.
*
@@ -299,4 +358,3 @@ nfs4_deviceid_mark_client_invalid(struct nfs_client *clp)
}
rcu_read_unlock();
}
-
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index c171ce1a8a30..b09cc23d6f43 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -578,46 +578,49 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
return 0;
}
-static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
{
- struct inode *inode = data->header->inode;
+ struct inode *inode = hdr->inode;
nfs_invalidate_atime(inode);
if (task->tk_status >= 0) {
- nfs_refresh_inode(inode, data->res.fattr);
+ nfs_refresh_inode(inode, hdr->res.fattr);
/* Emulate the eof flag, which isn't normally needed in NFSv2
* as it is guaranteed to always return the file attributes
*/
- if (data->args.offset + data->res.count >= data->res.fattr->size)
- data->res.eof = 1;
+ if (hdr->args.offset + hdr->res.count >= hdr->res.fattr->size)
+ hdr->res.eof = 1;
}
return 0;
}
-static void nfs_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs_proc_read_setup(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg)
{
msg->rpc_proc = &nfs_procedures[NFSPROC_READ];
}
-static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
{
rpc_call_start(task);
return 0;
}
-static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
{
- struct inode *inode = data->header->inode;
+ struct inode *inode = hdr->inode;
if (task->tk_status >= 0)
- nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
+ nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr);
return 0;
}
-static void nfs_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs_proc_write_setup(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg)
{
/* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */
- data->args.stable = NFS_FILE_SYNC;
+ hdr->args.stable = NFS_FILE_SYNC;
msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];
}
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index e818a475ca64..beff2769c5c5 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -33,12 +33,12 @@ static const struct nfs_rw_ops nfs_rw_read_ops;
static struct kmem_cache *nfs_rdata_cachep;
-static struct nfs_rw_header *nfs_readhdr_alloc(void)
+static struct nfs_pgio_header *nfs_readhdr_alloc(void)
{
return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
}
-static void nfs_readhdr_free(struct nfs_rw_header *rhdr)
+static void nfs_readhdr_free(struct nfs_pgio_header *rhdr)
{
kmem_cache_free(nfs_rdata_cachep, rhdr);
}
@@ -115,12 +115,6 @@ static void nfs_readpage_release(struct nfs_page *req)
unlock_page(req->wb_page);
}
-
- dprintk("NFS: read done (%s/%Lu %d@%Ld)\n",
- req->wb_context->dentry->d_inode->i_sb->s_id,
- (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),
- req->wb_bytes,
- (long long)req_offset(req));
nfs_release_request(req);
}
@@ -172,14 +166,15 @@ out:
hdr->release(hdr);
}
-static void nfs_initiate_read(struct nfs_pgio_data *data, struct rpc_message *msg,
+static void nfs_initiate_read(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg,
struct rpc_task_setup *task_setup_data, int how)
{
- struct inode *inode = data->header->inode;
+ struct inode *inode = hdr->inode;
int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
task_setup_data->flags |= swap_flags;
- NFS_PROTO(inode)->read_setup(data, msg);
+ NFS_PROTO(inode)->read_setup(hdr, msg);
}
static void
@@ -203,14 +198,15 @@ static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = {
* This is the callback from RPC telling us whether a reply was
* received or some error occurred (timeout or socket shutdown).
*/
-static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data,
+static int nfs_readpage_done(struct rpc_task *task,
+ struct nfs_pgio_header *hdr,
struct inode *inode)
{
- int status = NFS_PROTO(inode)->read_done(task, data);
+ int status = NFS_PROTO(inode)->read_done(task, hdr);
if (status != 0)
return status;
- nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, data->res.count);
+ nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, hdr->res.count);
if (task->tk_status == -ESTALE) {
set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
@@ -219,34 +215,34 @@ static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data,
return 0;
}
-static void nfs_readpage_retry(struct rpc_task *task, struct nfs_pgio_data *data)
+static void nfs_readpage_retry(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_args *argp = &data->args;
- struct nfs_pgio_res *resp = &data->res;
+ struct nfs_pgio_args *argp = &hdr->args;
+ struct nfs_pgio_res *resp = &hdr->res;
/* This is a short read! */
- nfs_inc_stats(data->header->inode, NFSIOS_SHORTREAD);
+ nfs_inc_stats(hdr->inode, NFSIOS_SHORTREAD);
/* Has the server at least made some progress? */
if (resp->count == 0) {
- nfs_set_pgio_error(data->header, -EIO, argp->offset);
+ nfs_set_pgio_error(hdr, -EIO, argp->offset);
return;
}
- /* Yes, so retry the read at the end of the data */
- data->mds_offset += resp->count;
+ /* Yes, so retry the read at the end of the hdr */
+ hdr->mds_offset += resp->count;
argp->offset += resp->count;
argp->pgbase += resp->count;
argp->count -= resp->count;
rpc_restart_call_prepare(task);
}
-static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *data)
+static void nfs_readpage_result(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
-
- if (data->res.eof) {
+ if (hdr->res.eof) {
loff_t bound;
- bound = data->args.offset + data->res.count;
+ bound = hdr->args.offset + hdr->res.count;
spin_lock(&hdr->lock);
if (bound < hdr->io_start + hdr->good_bytes) {
set_bit(NFS_IOHDR_EOF, &hdr->flags);
@@ -254,8 +250,8 @@ static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *dat
hdr->good_bytes = bound - hdr->io_start;
}
spin_unlock(&hdr->lock);
- } else if (data->res.count != data->args.count)
- nfs_readpage_retry(task, data);
+ } else if (hdr->res.count != hdr->args.count)
+ nfs_readpage_retry(task, hdr);
}
/*
@@ -404,7 +400,7 @@ out:
int __init nfs_init_readpagecache(void)
{
nfs_rdata_cachep = kmem_cache_create("nfs_read_data",
- sizeof(struct nfs_rw_header),
+ sizeof(struct nfs_pgio_header),
0, SLAB_HWCACHE_ALIGN,
NULL);
if (nfs_rdata_cachep == NULL)
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 084af1060d79..31a11b0e885d 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1027,8 +1027,7 @@ static bool nfs_auth_info_add(struct nfs_auth_info *auth_info,
rpc_authflavor_t flavor)
{
unsigned int i;
- unsigned int max_flavor_len = (sizeof(auth_info->flavors) /
- sizeof(auth_info->flavors[0]));
+ unsigned int max_flavor_len = ARRAY_SIZE(auth_info->flavors);
/* make sure this flavor isn't already in the list */
for (i = 0; i < auth_info->flavor_len; i++) {
@@ -2066,11 +2065,6 @@ static int nfs23_validate_mount_data(void *options,
return NFS_TEXT_DATA;
}
-#if !IS_ENABLED(CONFIG_NFS_V3)
- if (args->version == 3)
- goto out_v3_not_compiled;
-#endif /* !CONFIG_NFS_V3 */
-
return 0;
out_no_data:
@@ -2086,12 +2080,6 @@ out_no_sec:
dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n");
return -EINVAL;
-#if !IS_ENABLED(CONFIG_NFS_V3)
-out_v3_not_compiled:
- dfprintk(MOUNT, "NFS: NFSv3 is not compiled into kernel\n");
- return -EPROTONOSUPPORT;
-#endif /* !CONFIG_NFS_V3 */
-
out_nomem:
dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n");
return -ENOMEM;
@@ -2180,7 +2168,7 @@ out_no_address:
return -EINVAL;
}
-#define NFS_MOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \
+#define NFS_REMOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \
| NFS_MOUNT_SECURE \
| NFS_MOUNT_TCP \
| NFS_MOUNT_VER3 \
@@ -2188,15 +2176,16 @@ out_no_address:
| NFS_MOUNT_NONLM \
| NFS_MOUNT_BROKEN_SUID \
| NFS_MOUNT_STRICTLOCK \
- | NFS_MOUNT_UNSHARED \
- | NFS_MOUNT_NORESVPORT \
| NFS_MOUNT_LEGACY_INTERFACE)
+#define NFS_MOUNT_CMP_FLAGMASK (NFS_REMOUNT_CMP_FLAGMASK & \
+ ~(NFS_MOUNT_UNSHARED | NFS_MOUNT_NORESVPORT))
+
static int
nfs_compare_remount_data(struct nfs_server *nfss,
struct nfs_parsed_mount_data *data)
{
- if ((data->flags ^ nfss->flags) & NFS_MOUNT_CMP_FLAGMASK ||
+ if ((data->flags ^ nfss->flags) & NFS_REMOUNT_CMP_FLAGMASK ||
data->rsize != nfss->rsize ||
data->wsize != nfss->wsize ||
data->version != nfss->nfs_client->rpc_ops->version ||
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 962c9ee758be..f83b02dc9166 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -47,6 +47,11 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops;
static const struct nfs_commit_completion_ops nfs_commit_completion_ops;
static const struct nfs_rw_ops nfs_rw_write_ops;
static void nfs_clear_request_commit(struct nfs_page *req);
+static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
+ struct inode *inode);
+static struct nfs_page *
+nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
+ struct page *page);
static struct kmem_cache *nfs_wdata_cachep;
static mempool_t *nfs_wdata_mempool;
@@ -71,18 +76,18 @@ void nfs_commit_free(struct nfs_commit_data *p)
}
EXPORT_SYMBOL_GPL(nfs_commit_free);
-static struct nfs_rw_header *nfs_writehdr_alloc(void)
+static struct nfs_pgio_header *nfs_writehdr_alloc(void)
{
- struct nfs_rw_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
+ struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
if (p)
memset(p, 0, sizeof(*p));
return p;
}
-static void nfs_writehdr_free(struct nfs_rw_header *whdr)
+static void nfs_writehdr_free(struct nfs_pgio_header *hdr)
{
- mempool_free(whdr, nfs_wdata_mempool);
+ mempool_free(hdr, nfs_wdata_mempool);
}
static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
@@ -106,21 +111,12 @@ nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page)
if (PagePrivate(page))
req = (struct nfs_page *)page_private(page);
- else if (unlikely(PageSwapCache(page))) {
- struct nfs_page *freq, *t;
-
- /* Linearly search the commit list for the correct req */
- list_for_each_entry_safe(freq, t, &nfsi->commit_info.list, wb_list) {
- if (freq->wb_page == page) {
- req = freq->wb_head;
- break;
- }
- }
- }
+ else if (unlikely(PageSwapCache(page)))
+ req = nfs_page_search_commits_for_head_request_locked(nfsi,
+ page);
if (req) {
WARN_ON_ONCE(req->wb_head != req);
-
kref_get(&req->wb_kref);
}
@@ -216,7 +212,7 @@ static bool nfs_page_group_covers_page(struct nfs_page *req)
unsigned int pos = 0;
unsigned int len = nfs_page_length(req->wb_page);
- nfs_page_group_lock(req);
+ nfs_page_group_lock(req, false);
do {
tmp = nfs_page_group_search_locked(req->wb_head, pos);
@@ -246,11 +242,14 @@ static void nfs_mark_uptodate(struct nfs_page *req)
static int wb_priority(struct writeback_control *wbc)
{
+ int ret = 0;
if (wbc->for_reclaim)
return FLUSH_HIGHPRI | FLUSH_STABLE;
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ ret = FLUSH_COND_STABLE;
if (wbc->for_kupdate || wbc->for_background)
- return FLUSH_LOWPRI | FLUSH_COND_STABLE;
- return FLUSH_COND_STABLE;
+ ret |= FLUSH_LOWPRI;
+ return ret;
}
/*
@@ -379,8 +378,6 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
subreq->wb_head = subreq;
subreq->wb_this_page = subreq;
- nfs_clear_request_commit(subreq);
-
/* subreq is now totally disconnected from page group or any
* write / commit lists. last chance to wake any waiters */
nfs_unlock_request(subreq);
@@ -455,8 +452,23 @@ try_again:
return NULL;
}
+ /* holding inode lock, so always make a non-blocking call to try the
+ * page group lock */
+ ret = nfs_page_group_lock(head, true);
+ if (ret < 0) {
+ spin_unlock(&inode->i_lock);
+
+ if (!nonblock && ret == -EAGAIN) {
+ nfs_page_group_lock_wait(head);
+ nfs_release_request(head);
+ goto try_again;
+ }
+
+ nfs_release_request(head);
+ return ERR_PTR(ret);
+ }
+
/* lock each request in the page group */
- nfs_page_group_lock(head);
subreq = head;
do {
/*
@@ -488,7 +500,7 @@ try_again:
* Commit list removal accounting is done after locks are dropped */
subreq = head;
do {
- nfs_list_remove_request(subreq);
+ nfs_clear_request_commit(subreq);
subreq = subreq->wb_this_page;
} while (subreq != head);
@@ -518,15 +530,11 @@ try_again:
nfs_page_group_unlock(head);
- /* drop lock to clear_request_commit the head req and clean up
- * requests on destroy list */
+ /* drop lock to clean uprequests on destroy list */
spin_unlock(&inode->i_lock);
nfs_destroy_unlinked_subrequests(destroy_list, head);
- /* clean up commit list state */
- nfs_clear_request_commit(head);
-
/* still holds ref on head from nfs_page_find_head_request_locked
* and still has lock on head from lock loop */
return head;
@@ -697,6 +705,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
if (likely(!PageSwapCache(head->wb_page))) {
set_page_private(head->wb_page, 0);
ClearPagePrivate(head->wb_page);
+ smp_mb__after_atomic();
+ wake_up_page(head->wb_page, PG_private);
clear_bit(PG_MAPPED, &head->wb_flags);
}
nfsi->npages--;
@@ -713,7 +723,38 @@ nfs_mark_request_dirty(struct nfs_page *req)
__set_page_dirty_nobuffers(req->wb_page);
}
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
+/*
+ * nfs_page_search_commits_for_head_request_locked
+ *
+ * Search through commit lists on @inode for the head request for @page.
+ * Must be called while holding the inode (which is cinfo) lock.
+ *
+ * Returns the head request if found, or NULL if not found.
+ */
+static struct nfs_page *
+nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
+ struct page *page)
+{
+ struct nfs_page *freq, *t;
+ struct nfs_commit_info cinfo;
+ struct inode *inode = &nfsi->vfs_inode;
+
+ nfs_init_cinfo_from_inode(&cinfo, inode);
+
+ /* search through pnfs commit lists */
+ freq = pnfs_search_commit_reqs(inode, &cinfo, page);
+ if (freq)
+ return freq->wb_head;
+
+ /* Linearly search the commit list for the correct request */
+ list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
+ if (freq->wb_page == page)
+ return freq->wb_head;
+ }
+
+ return NULL;
+}
+
/**
* nfs_request_add_commit_list - add request to a commit list
* @req: pointer to a struct nfs_page
@@ -808,6 +849,7 @@ nfs_clear_page_commit(struct page *page)
dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE);
}
+/* Called holding inode (/cinfo) lock */
static void
nfs_clear_request_commit(struct nfs_page *req)
{
@@ -817,53 +859,19 @@ nfs_clear_request_commit(struct nfs_page *req)
nfs_init_cinfo_from_inode(&cinfo, inode);
if (!pnfs_clear_request_commit(req, &cinfo)) {
- spin_lock(cinfo.lock);
nfs_request_remove_commit_list(req, &cinfo);
- spin_unlock(cinfo.lock);
}
nfs_clear_page_commit(req->wb_page);
}
}
-static inline
-int nfs_write_need_commit(struct nfs_pgio_data *data)
-{
- if (data->verf.committed == NFS_DATA_SYNC)
- return data->header->lseg == NULL;
- return data->verf.committed != NFS_FILE_SYNC;
-}
-
-#else
-static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
- struct inode *inode)
-{
-}
-
-void nfs_init_cinfo(struct nfs_commit_info *cinfo,
- struct inode *inode,
- struct nfs_direct_req *dreq)
-{
-}
-
-void
-nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo)
-{
-}
-
-static void
-nfs_clear_request_commit(struct nfs_page *req)
+int nfs_write_need_commit(struct nfs_pgio_header *hdr)
{
+ if (hdr->verf.committed == NFS_DATA_SYNC)
+ return hdr->lseg == NULL;
+ return hdr->verf.committed != NFS_FILE_SYNC;
}
-static inline
-int nfs_write_need_commit(struct nfs_pgio_data *data)
-{
- return 0;
-}
-
-#endif
-
static void nfs_write_completion(struct nfs_pgio_header *hdr)
{
struct nfs_commit_info cinfo;
@@ -883,11 +891,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
nfs_context_set_write_error(req->wb_context, hdr->error);
goto remove_req;
}
- if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) {
- nfs_mark_request_dirty(req);
- goto next;
- }
- if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
+ if (nfs_write_need_commit(hdr)) {
memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));
nfs_mark_request_commit(req, hdr->lseg, &cinfo);
goto next;
@@ -903,7 +907,6 @@ out:
hdr->release(hdr);
}
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
unsigned long
nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
{
@@ -960,19 +963,6 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
return ret;
}
-#else
-unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
-{
- return 0;
-}
-
-int nfs_scan_commit(struct inode *inode, struct list_head *dst,
- struct nfs_commit_info *cinfo)
-{
- return 0;
-}
-#endif
-
/*
* Search for an existing write request, and attempt to update
* it to reflect a new dirty region on a given page.
@@ -1038,9 +1028,9 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
else
req->wb_bytes = rqend - req->wb_offset;
out_unlock:
- spin_unlock(&inode->i_lock);
if (req)
nfs_clear_request_commit(req);
+ spin_unlock(&inode->i_lock);
return req;
out_flushme:
spin_unlock(&inode->i_lock);
@@ -1241,17 +1231,18 @@ static int flush_task_priority(int how)
return RPC_PRIORITY_NORMAL;
}
-static void nfs_initiate_write(struct nfs_pgio_data *data, struct rpc_message *msg,
+static void nfs_initiate_write(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg,
struct rpc_task_setup *task_setup_data, int how)
{
- struct inode *inode = data->header->inode;
+ struct inode *inode = hdr->inode;
int priority = flush_task_priority(how);
task_setup_data->priority = priority;
- NFS_PROTO(inode)->write_setup(data, msg);
+ NFS_PROTO(inode)->write_setup(hdr, msg);
nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client,
- &task_setup_data->rpc_client, msg, data);
+ &task_setup_data->rpc_client, msg, hdr);
}
/* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -1313,21 +1304,9 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata)
NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
}
-static void nfs_writeback_release_common(struct nfs_pgio_data *data)
+static void nfs_writeback_release_common(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_header *hdr = data->header;
- int status = data->task.tk_status;
-
- if ((status >= 0) && nfs_write_need_commit(data)) {
- spin_lock(&hdr->lock);
- if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags))
- ; /* Do nothing */
- else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags))
- memcpy(&hdr->verf, &data->verf, sizeof(hdr->verf));
- else if (memcmp(&hdr->verf, &data->verf, sizeof(hdr->verf)))
- set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags);
- spin_unlock(&hdr->lock);
- }
+ /* do nothing! */
}
/*
@@ -1358,7 +1337,8 @@ static int nfs_should_remove_suid(const struct inode *inode)
/*
* This function is called when the WRITE call is complete.
*/
-static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
+static int nfs_writeback_done(struct rpc_task *task,
+ struct nfs_pgio_header *hdr,
struct inode *inode)
{
int status;
@@ -1370,13 +1350,13 @@ static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
* another writer had changed the file, but some applications
* depend on tighter cache coherency when writing.
*/
- status = NFS_PROTO(inode)->write_done(task, data);
+ status = NFS_PROTO(inode)->write_done(task, hdr);
if (status != 0)
return status;
- nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, data->res.count);
+ nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count);
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
- if (data->res.verf->committed < data->args.stable && task->tk_status >= 0) {
+ if (hdr->res.verf->committed < hdr->args.stable &&
+ task->tk_status >= 0) {
/* We tried a write call, but the server did not
* commit data to stable storage even though we
* requested it.
@@ -1392,11 +1372,10 @@ static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
dprintk("NFS: faulty NFS server %s:"
" (committed = %d) != (stable = %d)\n",
NFS_SERVER(inode)->nfs_client->cl_hostname,
- data->res.verf->committed, data->args.stable);
+ hdr->res.verf->committed, hdr->args.stable);
complain = jiffies + 300 * HZ;
}
}
-#endif
/* Deal with the suid/sgid bit corner case */
if (nfs_should_remove_suid(inode))
@@ -1407,16 +1386,17 @@ static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
/*
* This function is called when the WRITE call is complete.
*/
-static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *data)
+static void nfs_writeback_result(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_args *argp = &data->args;
- struct nfs_pgio_res *resp = &data->res;
+ struct nfs_pgio_args *argp = &hdr->args;
+ struct nfs_pgio_res *resp = &hdr->res;
if (resp->count < argp->count) {
static unsigned long complain;
/* This a short write! */
- nfs_inc_stats(data->header->inode, NFSIOS_SHORTWRITE);
+ nfs_inc_stats(hdr->inode, NFSIOS_SHORTWRITE);
/* Has the server at least made some progress? */
if (resp->count == 0) {
@@ -1426,14 +1406,14 @@ static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *da
argp->count);
complain = jiffies + 300 * HZ;
}
- nfs_set_pgio_error(data->header, -EIO, argp->offset);
+ nfs_set_pgio_error(hdr, -EIO, argp->offset);
task->tk_status = -EIO;
return;
}
/* Was this an NFSv2 write or an NFSv3 stable write? */
if (resp->verf->committed != NFS_UNSTABLE) {
/* Resend from where the server left off */
- data->mds_offset += resp->count;
+ hdr->mds_offset += resp->count;
argp->offset += resp->count;
argp->pgbase += resp->count;
argp->count -= resp->count;
@@ -1448,7 +1428,6 @@ static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *da
}
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
{
int ret;
@@ -1517,6 +1496,18 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
}
EXPORT_SYMBOL_GPL(nfs_initiate_commit);
+static loff_t nfs_get_lwb(struct list_head *head)
+{
+ loff_t lwb = 0;
+ struct nfs_page *req;
+
+ list_for_each_entry(req, head, wb_list)
+ if (lwb < (req_offset(req) + req->wb_bytes))
+ lwb = req_offset(req) + req->wb_bytes;
+
+ return lwb;
+}
+
/*
* Set up the argument/result storage required for the RPC call.
*/
@@ -1536,6 +1527,9 @@ void nfs_init_commit(struct nfs_commit_data *data,
data->inode = inode;
data->cred = first->wb_context->cred;
data->lseg = lseg; /* reference transferred */
+ /* only set lwb for pnfs commit */
+ if (lseg)
+ data->lwb = nfs_get_lwb(&data->pages);
data->mds_ops = &nfs_commit_ops;
data->completion_ops = cinfo->completion_ops;
data->dreq = cinfo->dreq;
@@ -1615,6 +1609,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
struct nfs_page *req;
int status = data->task.tk_status;
struct nfs_commit_info cinfo;
+ struct nfs_server *nfss;
while (!list_empty(&data->pages)) {
req = nfs_list_entry(data->pages.next);
@@ -1648,6 +1643,10 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
next:
nfs_unlock_and_release_request(req);
}
+ nfss = NFS_SERVER(data->inode);
+ if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
+ clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
+
nfs_init_cinfo(&cinfo, data->inode, data->dreq);
if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
nfs_commit_clear_lock(NFS_I(data->inode));
@@ -1757,12 +1756,6 @@ out_mark_dirty:
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
return ret;
}
-#else
-static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
-{
- return 0;
-}
-#endif
int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
@@ -1884,7 +1877,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
int __init nfs_init_writepagecache(void)
{
nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
- sizeof(struct nfs_rw_header),
+ sizeof(struct nfs_pgio_header),
0, SLAB_HWCACHE_ALIGN,
NULL);
if (nfs_wdata_cachep == NULL)
diff --git a/fs/nfs_common/Makefile b/fs/nfs_common/Makefile
index f689ed82af3a..d153ca3ea577 100644
--- a/fs/nfs_common/Makefile
+++ b/fs/nfs_common/Makefile
@@ -3,5 +3,6 @@
#
obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o
-
nfs_acl-objs := nfsacl.o
+
+obj-$(CONFIG_GRACE_PERIOD) += grace.o
diff --git a/fs/lockd/grace.c b/fs/nfs_common/grace.c
index 6d1ee7204c88..ae6e58ea4de5 100644
--- a/fs/lockd/grace.c
+++ b/fs/nfs_common/grace.c
@@ -1,17 +1,20 @@
/*
* Common code for control of lockd and nfsv4 grace periods.
+ *
+ * Transplanted from lockd code
*/
#include <linux/module.h>
-#include <linux/lockd/bind.h>
#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <linux/fs.h>
-#include "netns.h"
-
+static int grace_net_id;
static DEFINE_SPINLOCK(grace_lock);
/**
* locks_start_grace
+ * @net: net namespace that this lock manager belongs to
* @lm: who this grace period is for
*
* A grace period is a period during which locks should not be given
@@ -21,18 +24,20 @@ static DEFINE_SPINLOCK(grace_lock);
*
* This function is called to start a grace period.
*/
-void locks_start_grace(struct net *net, struct lock_manager *lm)
+void
+locks_start_grace(struct net *net, struct lock_manager *lm)
{
- struct lockd_net *ln = net_generic(net, lockd_net_id);
+ struct list_head *grace_list = net_generic(net, grace_net_id);
spin_lock(&grace_lock);
- list_add(&lm->list, &ln->grace_list);
+ list_add(&lm->list, grace_list);
spin_unlock(&grace_lock);
}
EXPORT_SYMBOL_GPL(locks_start_grace);
/**
* locks_end_grace
+ * @net: net namespace that this lock manager belongs to
* @lm: who this grace period is for
*
* Call this function to state that the given lock manager is ready to
@@ -41,7 +46,8 @@ EXPORT_SYMBOL_GPL(locks_start_grace);
* Note that callers count on it being safe to call this more than once,
* and the second call should be a no-op.
*/
-void locks_end_grace(struct lock_manager *lm)
+void
+locks_end_grace(struct lock_manager *lm)
{
spin_lock(&grace_lock);
list_del_init(&lm->list);
@@ -56,10 +62,52 @@ EXPORT_SYMBOL_GPL(locks_end_grace);
* to answer ordinary lock requests, and when they should accept only
* lock reclaims.
*/
-int locks_in_grace(struct net *net)
+int
+locks_in_grace(struct net *net)
{
- struct lockd_net *ln = net_generic(net, lockd_net_id);
+ struct list_head *grace_list = net_generic(net, grace_net_id);
- return !list_empty(&ln->grace_list);
+ return !list_empty(grace_list);
}
EXPORT_SYMBOL_GPL(locks_in_grace);
+
+static int __net_init
+grace_init_net(struct net *net)
+{
+ struct list_head *grace_list = net_generic(net, grace_net_id);
+
+ INIT_LIST_HEAD(grace_list);
+ return 0;
+}
+
+static void __net_exit
+grace_exit_net(struct net *net)
+{
+ struct list_head *grace_list = net_generic(net, grace_net_id);
+
+ BUG_ON(!list_empty(grace_list));
+}
+
+static struct pernet_operations grace_net_ops = {
+ .init = grace_init_net,
+ .exit = grace_exit_net,
+ .id = &grace_net_id,
+ .size = sizeof(struct list_head),
+};
+
+static int __init
+init_grace(void)
+{
+ return register_pernet_subsys(&grace_net_ops);
+}
+
+static void __exit
+exit_grace(void)
+{
+ unregister_pernet_subsys(&grace_net_ops);
+}
+
+MODULE_AUTHOR("Jeff Layton <jlayton@primarydata.com>");
+MODULE_LICENSE("GPL");
+module_init(init_grace)
+module_exit(exit_grace)
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index ed628f71274c..538f142935ea 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -30,9 +30,6 @@
MODULE_LICENSE("GPL");
-EXPORT_SYMBOL_GPL(nfsacl_encode);
-EXPORT_SYMBOL_GPL(nfsacl_decode);
-
struct nfsacl_encode_desc {
struct xdr_array2_desc desc;
unsigned int count;
@@ -136,6 +133,7 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
nfsacl_desc.desc.array_len;
return err;
}
+EXPORT_SYMBOL_GPL(nfsacl_encode);
struct nfsacl_decode_desc {
struct xdr_array2_desc desc;
@@ -295,3 +293,4 @@ int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
return 8 + nfsacl_desc.desc.elem_size *
nfsacl_desc.desc.array_len;
}
+EXPORT_SYMBOL_GPL(nfsacl_decode);
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index f994e750e0d1..73395156bdb4 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -71,6 +71,7 @@ config NFSD_V4
select FS_POSIX_ACL
select SUNRPC_GSS
select CRYPTO
+ select GRACE_PERIOD
help
This option enables support in your system's NFS server for
version 4 of the NFS protocol (RFC 3530).
@@ -94,9 +95,6 @@ config NFSD_V4_SECURITY_LABEL
If you do not wish to enable fine-grained security labels SELinux or
Smack policies on NFSv4 files, say N.
- WARNING: there is still a chance of backwards-incompatible protocol changes.
- For now we recommend "Y" only for developers and testers.
-
config NFSD_FAULT_INJECTION
bool "NFS server manual fault injection"
depends on NFSD_V4 && DEBUG_KERNEL
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
index a986ceb6fd0d..4cd7c69a6cb9 100644
--- a/fs/nfsd/acl.h
+++ b/fs/nfsd/acl.h
@@ -47,7 +47,7 @@ struct svc_rqst;
#define NFS4_ACL_MAX ((PAGE_SIZE - sizeof(struct nfs4_acl)) \
/ sizeof(struct nfs4_ace))
-struct nfs4_acl *nfs4_acl_new(int);
+int nfs4_acl_bytes(int entries);
int nfs4_acl_get_whotype(char *, u32);
__be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who);
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 72f44823adbb..9d46a0bdd9f9 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -28,7 +28,7 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
validate_process_creds();
/* discard any old override before preparing the new set */
- revert_creds(get_cred(current->real_cred));
+ revert_creds(get_cred(current_real_cred()));
new = prepare_creds();
if (!new)
return -ENOMEM;
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index b582f9ab6b2a..dd96a3830004 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -18,7 +18,6 @@
* is much larger than a sockaddr_in6.
*/
struct svc_cacherep {
- struct hlist_node c_hash;
struct list_head c_lru;
unsigned char c_state, /* unused, inprog, done */
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 13b85f94d9e2..30a739d896ff 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -698,8 +698,8 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
kref_get(&item->ex_client->ref);
new->ex_client = item->ex_client;
- new->ex_path.dentry = dget(item->ex_path.dentry);
- new->ex_path.mnt = mntget(item->ex_path.mnt);
+ new->ex_path = item->ex_path;
+ path_get(&item->ex_path);
new->ex_fslocs.locations = NULL;
new->ex_fslocs.locations_count = 0;
new->ex_fslocs.migrated = 0;
@@ -1145,6 +1145,7 @@ static struct flags {
{ NFSEXP_ALLSQUASH, {"all_squash", ""}},
{ NFSEXP_ASYNC, {"async", "sync"}},
{ NFSEXP_GATHERED_WRITES, {"wdelay", "no_wdelay"}},
+ { NFSEXP_NOREADDIRPLUS, {"nordirplus", ""}},
{ NFSEXP_NOHIDE, {"nohide", ""}},
{ NFSEXP_CROSSMOUNT, {"crossmnt", ""}},
{ NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}},
@@ -1253,7 +1254,7 @@ static int e_show(struct seq_file *m, void *p)
return 0;
}
- cache_get(&exp->h);
+ exp_get(exp);
if (cache_check(cd, &exp->h, NULL))
return 0;
exp_put(exp);
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index cfeea85c5bed..04dc8c167b0c 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -101,9 +101,10 @@ static inline void exp_put(struct svc_export *exp)
cache_put(&exp->h, exp->cd);
}
-static inline void exp_get(struct svc_export *exp)
+static inline struct svc_export *exp_get(struct svc_export *exp)
{
cache_get(&exp->h);
+ return exp;
}
struct svc_export * rqst_exp_find(struct svc_rqst *, int, u32 *);
diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c
index 2ed05c3cd43d..c16bf5af6831 100644
--- a/fs/nfsd/fault_inject.c
+++ b/fs/nfsd/fault_inject.c
@@ -17,81 +17,13 @@
struct nfsd_fault_inject_op {
char *file;
- u64 (*forget)(struct nfs4_client *, u64);
- u64 (*print)(struct nfs4_client *, u64);
+ u64 (*get)(void);
+ u64 (*set_val)(u64);
+ u64 (*set_clnt)(struct sockaddr_storage *, size_t);
};
-static struct nfsd_fault_inject_op inject_ops[] = {
- {
- .file = "forget_clients",
- .forget = nfsd_forget_client,
- .print = nfsd_print_client,
- },
- {
- .file = "forget_locks",
- .forget = nfsd_forget_client_locks,
- .print = nfsd_print_client_locks,
- },
- {
- .file = "forget_openowners",
- .forget = nfsd_forget_client_openowners,
- .print = nfsd_print_client_openowners,
- },
- {
- .file = "forget_delegations",
- .forget = nfsd_forget_client_delegations,
- .print = nfsd_print_client_delegations,
- },
- {
- .file = "recall_delegations",
- .forget = nfsd_recall_client_delegations,
- .print = nfsd_print_client_delegations,
- },
-};
-
-static long int NUM_INJECT_OPS = sizeof(inject_ops) / sizeof(struct nfsd_fault_inject_op);
static struct dentry *debug_dir;
-static void nfsd_inject_set(struct nfsd_fault_inject_op *op, u64 val)
-{
- u64 count = 0;
-
- if (val == 0)
- printk(KERN_INFO "NFSD Fault Injection: %s (all)", op->file);
- else
- printk(KERN_INFO "NFSD Fault Injection: %s (n = %llu)", op->file, val);
-
- nfs4_lock_state();
- count = nfsd_for_n_state(val, op->forget);
- nfs4_unlock_state();
- printk(KERN_INFO "NFSD: %s: found %llu", op->file, count);
-}
-
-static void nfsd_inject_set_client(struct nfsd_fault_inject_op *op,
- struct sockaddr_storage *addr,
- size_t addr_size)
-{
- char buf[INET6_ADDRSTRLEN];
- struct nfs4_client *clp;
- u64 count;
-
- nfs4_lock_state();
- clp = nfsd_find_client(addr, addr_size);
- if (clp) {
- count = op->forget(clp, 0);
- rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
- printk(KERN_INFO "NFSD [%s]: Client %s had %llu state object(s)\n", op->file, buf, count);
- }
- nfs4_unlock_state();
-}
-
-static void nfsd_inject_get(struct nfsd_fault_inject_op *op, u64 *val)
-{
- nfs4_lock_state();
- *val = nfsd_for_n_state(0, op->print);
- nfs4_unlock_state();
-}
-
static ssize_t fault_inject_read(struct file *file, char __user *buf,
size_t len, loff_t *ppos)
{
@@ -99,9 +31,10 @@ static ssize_t fault_inject_read(struct file *file, char __user *buf,
char read_buf[25];
size_t size;
loff_t pos = *ppos;
+ struct nfsd_fault_inject_op *op = file_inode(file)->i_private;
if (!pos)
- nfsd_inject_get(file_inode(file)->i_private, &val);
+ val = op->get();
size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val);
return simple_read_from_buffer(buf, len, ppos, read_buf, size);
@@ -114,18 +47,36 @@ static ssize_t fault_inject_write(struct file *file, const char __user *buf,
size_t size = min(sizeof(write_buf) - 1, len);
struct net *net = current->nsproxy->net_ns;
struct sockaddr_storage sa;
+ struct nfsd_fault_inject_op *op = file_inode(file)->i_private;
u64 val;
+ char *nl;
if (copy_from_user(write_buf, buf, size))
return -EFAULT;
write_buf[size] = '\0';
+ /* Deal with any embedded newlines in the string */
+ nl = strchr(write_buf, '\n');
+ if (nl) {
+ size = nl - write_buf;
+ *nl = '\0';
+ }
+
size = rpc_pton(net, write_buf, size, (struct sockaddr *)&sa, sizeof(sa));
- if (size > 0)
- nfsd_inject_set_client(file_inode(file)->i_private, &sa, size);
- else {
+ if (size > 0) {
+ val = op->set_clnt(&sa, size);
+ if (val)
+ pr_info("NFSD [%s]: Client %s had %llu state object(s)\n",
+ op->file, write_buf, val);
+ } else {
val = simple_strtoll(write_buf, NULL, 0);
- nfsd_inject_set(file_inode(file)->i_private, val);
+ if (val == 0)
+ pr_info("NFSD Fault Injection: %s (all)", op->file);
+ else
+ pr_info("NFSD Fault Injection: %s (n = %llu)",
+ op->file, val);
+ val = op->set_val(val);
+ pr_info("NFSD: %s: found %llu", op->file, val);
}
return len; /* on success, claim we got the whole input */
}
@@ -141,6 +92,41 @@ void nfsd_fault_inject_cleanup(void)
debugfs_remove_recursive(debug_dir);
}
+static struct nfsd_fault_inject_op inject_ops[] = {
+ {
+ .file = "forget_clients",
+ .get = nfsd_inject_print_clients,
+ .set_val = nfsd_inject_forget_clients,
+ .set_clnt = nfsd_inject_forget_client,
+ },
+ {
+ .file = "forget_locks",
+ .get = nfsd_inject_print_locks,
+ .set_val = nfsd_inject_forget_locks,
+ .set_clnt = nfsd_inject_forget_client_locks,
+ },
+ {
+ .file = "forget_openowners",
+ .get = nfsd_inject_print_openowners,
+ .set_val = nfsd_inject_forget_openowners,
+ .set_clnt = nfsd_inject_forget_client_openowners,
+ },
+ {
+ .file = "forget_delegations",
+ .get = nfsd_inject_print_delegations,
+ .set_val = nfsd_inject_forget_delegations,
+ .set_clnt = nfsd_inject_forget_client_delegations,
+ },
+ {
+ .file = "recall_delegations",
+ .get = nfsd_inject_print_delegations,
+ .set_val = nfsd_inject_recall_delegations,
+ .set_clnt = nfsd_inject_recall_client_delegations,
+ },
+};
+
+#define NUM_INJECT_OPS (sizeof(inject_ops)/sizeof(struct nfsd_fault_inject_op))
+
int nfsd_fault_inject_init(void)
{
unsigned int i;
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index d32b3aa6600d..ea6749a32760 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -29,14 +29,19 @@
#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS)
#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1)
-#define LOCKOWNER_INO_HASH_BITS 8
-#define LOCKOWNER_INO_HASH_SIZE (1 << LOCKOWNER_INO_HASH_BITS)
-
#define SESSION_HASH_SIZE 512
struct cld_net;
struct nfsd4_client_tracking_ops;
+/*
+ * Represents a nfsd "container". With respect to nfsv4 state tracking, the
+ * fields of interest are the *_id_hashtbls and the *_name_tree. These track
+ * the nfs4_client objects by either short or long form clientid.
+ *
+ * Each nfsd_net runs a nfs4_laundromat workqueue job when necessary to clean
+ * up expired clients and delegations within the container.
+ */
struct nfsd_net {
struct cld_net *cld_net;
@@ -66,8 +71,6 @@ struct nfsd_net {
struct rb_root conf_name_tree;
struct list_head *unconf_id_hashtbl;
struct rb_root unconf_name_tree;
- struct list_head *ownerstr_hashtbl;
- struct list_head *lockowner_ino_hashtbl;
struct list_head *sessionid_hashtbl;
/*
* client_lru holds client queue ordered by nfs4_client.cl_time
@@ -97,10 +100,16 @@ struct nfsd_net {
bool nfsd_net_up;
bool lockd_up;
+ /* Time of server startup */
+ struct timeval nfssvc_boot;
+
/*
- * Time of server startup
+ * Max number of connections this nfsd container will allow. Defaults
+ * to '0' which is means that it bases this on the number of threads.
*/
- struct timeval nfssvc_boot;
+ unsigned int max_connections;
+
+ u32 clientid_counter;
struct svc_serv *nfsd_serv;
};
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 12b023a7ab7d..ac54ea60b3f6 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -54,14 +54,14 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
acl = get_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl)) {
- nfserr = nfserrno(PTR_ERR(acl));
- goto fail;
- }
if (acl == NULL) {
/* Solaris returns the inode's minimum ACL. */
acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
}
+ if (IS_ERR(acl)) {
+ nfserr = nfserrno(PTR_ERR(acl));
+ goto fail;
+ }
resp->acl_access = acl;
}
if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) {
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 2a514e21dc74..34cbbab6abd7 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -47,14 +47,14 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp,
if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
acl = get_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl)) {
- nfserr = nfserrno(PTR_ERR(acl));
- goto fail;
- }
if (acl == NULL) {
/* Solaris returns the inode's minimum ACL. */
acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
}
+ if (IS_ERR(acl)) {
+ nfserr = nfserrno(PTR_ERR(acl));
+ goto fail;
+ }
resp->acl_access = acl;
}
if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) {
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 401289913130..12f2aab4f614 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -157,11 +157,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
* 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof)
* + 1 (xdr opaque byte count) = 26
*/
-
- resp->count = argp->count;
- if (max_blocksize < resp->count)
- resp->count = max_blocksize;
-
+ resp->count = min(argp->count, max_blocksize);
svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
fh_copy(&resp->fh, &argp->fh);
@@ -227,11 +223,6 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
newfhp = fh_init(&resp->fh, NFS3_FHSIZE);
attr = &argp->attrs;
- /* Get the directory inode */
- nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_CREATE);
- if (nfserr)
- RETURN_STATUS(nfserr);
-
/* Unfudge the mode bits */
attr->ia_mode &= ~S_IFMT;
if (!(attr->ia_valid & ATTR_MODE)) {
@@ -286,8 +277,7 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp, struct nfsd3_symlinkargs *argp,
fh_copy(&resp->dirfh, &argp->ffh);
fh_init(&resp->fh, NFS3_FHSIZE);
nfserr = nfsd_symlink(rqstp, &resp->dirfh, argp->fname, argp->flen,
- argp->tname, argp->tlen,
- &resp->fh, &argp->attrs);
+ argp->tname, &resp->fh);
RETURN_STATUS(nfserr);
}
@@ -476,6 +466,14 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
resp->buflen = resp->count;
resp->rqstp = rqstp;
offset = argp->cookie;
+
+ nfserr = fh_verify(rqstp, &resp->fh, S_IFDIR, NFSD_MAY_NOP);
+ if (nfserr)
+ RETURN_STATUS(nfserr);
+
+ if (resp->fh.fh_export->ex_flags & NFSEXP_NOREADDIRPLUS)
+ RETURN_STATUS(nfserr_notsupp);
+
nfserr = nfsd_readdir(rqstp, &resp->fh,
&offset,
&resp->common,
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index e6c01e80325e..39c5eb3ad33a 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -120,10 +120,7 @@ decode_sattr3(__be32 *p, struct iattr *iap)
iap->ia_valid |= ATTR_SIZE;
p = xdr_decode_hyper(p, &newsize);
- if (newsize <= NFS_OFFSET_MAX)
- iap->ia_size = newsize;
- else
- iap->ia_size = NFS_OFFSET_MAX;
+ iap->ia_size = min_t(u64, newsize, NFS_OFFSET_MAX);
}
if ((tmp = ntohl(*p++)) == 1) { /* set to server time */
iap->ia_valid |= ATTR_ATIME;
@@ -338,10 +335,8 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
return 0;
p = xdr_decode_hyper(p, &args->offset);
- len = args->count = ntohl(*p++);
-
- if (len > max_blocksize)
- len = max_blocksize;
+ args->count = ntohl(*p++);
+ len = min(args->count, max_blocksize);
/* set up the kvec */
v=0;
@@ -349,7 +344,7 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
struct page *p = *(rqstp->rq_next_page++);
rqstp->rq_vec[v].iov_base = page_address(p);
- rqstp->rq_vec[v].iov_len = len < PAGE_SIZE? len : PAGE_SIZE;
+ rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE);
len -= rqstp->rq_vec[v].iov_len;
v++;
}
@@ -484,9 +479,7 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
}
/* now copy next page if there is one */
if (len && !avail && rqstp->rq_arg.page_len) {
- avail = rqstp->rq_arg.page_len;
- if (avail > PAGE_SIZE)
- avail = PAGE_SIZE;
+ avail = min_t(unsigned int, rqstp->rq_arg.page_len, PAGE_SIZE);
old = page_address(rqstp->rq_arg.pages[0]);
}
while (len && avail && *old) {
@@ -571,10 +564,7 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
args->verf = p; p += 2;
args->dircount = ~0;
args->count = ntohl(*p++);
-
- if (args->count > PAGE_SIZE)
- args->count = PAGE_SIZE;
-
+ args->count = min_t(u32, args->count, PAGE_SIZE);
args->buffer = page_address(*(rqstp->rq_next_page++));
return xdr_argsize_check(rqstp, p);
@@ -595,10 +585,7 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
args->dircount = ntohl(*p++);
args->count = ntohl(*p++);
- len = (args->count > max_blocksize) ? max_blocksize :
- args->count;
- args->count = len;
-
+ len = args->count = min(args->count, max_blocksize);
while (len > 0) {
struct page *p = *(rqstp->rq_next_page++);
if (!args->buffer)
@@ -913,8 +900,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
*/
/* truncate filename if too long */
- if (namlen > NFS3_MAXNAMLEN)
- namlen = NFS3_MAXNAMLEN;
+ namlen = min(namlen, NFS3_MAXNAMLEN);
slen = XDR_QUADLEN(namlen);
elen = slen + NFS3_ENTRY_BAGGAGE
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index d714156a19fd..59fd76651781 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -146,35 +146,43 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry,
int size = 0;
pacl = get_acl(inode, ACL_TYPE_ACCESS);
- if (!pacl) {
+ if (!pacl)
pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
- if (IS_ERR(pacl))
- return PTR_ERR(pacl);
- }
+
+ if (IS_ERR(pacl))
+ return PTR_ERR(pacl);
+
/* allocate for worst case: one (deny, allow) pair each: */
size += 2 * pacl->a_count;
if (S_ISDIR(inode->i_mode)) {
flags = NFS4_ACL_DIR;
dpacl = get_acl(inode, ACL_TYPE_DEFAULT);
+ if (IS_ERR(dpacl)) {
+ error = PTR_ERR(dpacl);
+ goto rel_pacl;
+ }
+
if (dpacl)
size += 2 * dpacl->a_count;
}
- *acl = nfs4_acl_new(size);
+ *acl = kmalloc(nfs4_acl_bytes(size), GFP_KERNEL);
if (*acl == NULL) {
error = -ENOMEM;
goto out;
}
+ (*acl)->naces = 0;
_posix_to_nfsv4_one(pacl, *acl, flags & ~NFS4_ACL_TYPE_DEFAULT);
if (dpacl)
_posix_to_nfsv4_one(dpacl, *acl, flags | NFS4_ACL_TYPE_DEFAULT);
- out:
- posix_acl_release(pacl);
+out:
posix_acl_release(dpacl);
+rel_pacl:
+ posix_acl_release(pacl);
return error;
}
@@ -872,16 +880,13 @@ ace2type(struct nfs4_ace *ace)
return -1;
}
-struct nfs4_acl *
-nfs4_acl_new(int n)
+/*
+ * return the size of the struct nfs4_acl required to represent an acl
+ * with @entries entries.
+ */
+int nfs4_acl_bytes(int entries)
{
- struct nfs4_acl *acl;
-
- acl = kmalloc(sizeof(*acl) + n*sizeof(struct nfs4_ace), GFP_KERNEL);
- if (acl == NULL)
- return NULL;
- acl->naces = 0;
- return acl;
+ return sizeof(struct nfs4_acl) + entries * sizeof(struct nfs4_ace);
}
static struct {
@@ -935,5 +940,5 @@ __be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who)
return 0;
}
WARN_ON_ONCE(1);
- return -1;
+ return nfserr_serverfault;
}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 2c73cae9899d..ed2b1151b171 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -49,12 +49,6 @@ static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason);
/* Index of predefined Linux callback client operations */
-enum {
- NFSPROC4_CLNT_CB_NULL = 0,
- NFSPROC4_CLNT_CB_RECALL,
- NFSPROC4_CLNT_CB_SEQUENCE,
-};
-
struct nfs4_cb_compound_hdr {
/* args */
u32 ident; /* minorversion 0 only */
@@ -337,7 +331,7 @@ static void encode_cb_recall4args(struct xdr_stream *xdr,
p = xdr_reserve_space(xdr, 4);
*p++ = xdr_zero; /* truncate */
- encode_nfs_fh4(xdr, &dp->dl_fh);
+ encode_nfs_fh4(xdr, &dp->dl_stid.sc_file->fi_fhandle);
hdr->nops++;
}
@@ -494,7 +488,7 @@ static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
const struct nfsd4_callback *cb)
{
- const struct nfs4_delegation *args = cb->cb_op;
+ const struct nfs4_delegation *dp = cb_to_delegation(cb);
struct nfs4_cb_compound_hdr hdr = {
.ident = cb->cb_clp->cl_cb_ident,
.minorversion = cb->cb_minorversion,
@@ -502,7 +496,7 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_cb_compound4args(xdr, &hdr);
encode_cb_sequence4args(xdr, cb, &hdr);
- encode_cb_recall4args(xdr, args, &hdr);
+ encode_cb_recall4args(xdr, dp, &hdr);
encode_cb_nops(&hdr);
}
@@ -678,7 +672,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
(clp->cl_cred.cr_flavor >= RPC_AUTH_GSS_KRB5))
return -EINVAL;
args.client_name = clp->cl_cred.cr_principal;
- args.prognumber = conn->cb_prog,
+ args.prognumber = conn->cb_prog;
args.protocol = XPRT_TRANSPORT_TCP;
args.authflavor = clp->cl_cred.cr_flavor;
clp->cl_cb_ident = conn->cb_ident;
@@ -689,7 +683,8 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
clp->cl_cb_session = ses;
args.bc_xprt = conn->cb_xprt;
args.prognumber = clp->cl_cb_session->se_cb_prog;
- args.protocol = XPRT_TRANSPORT_BC_TCP;
+ args.protocol = conn->cb_xprt->xpt_class->xcl_ident |
+ XPRT_TRANSPORT_BC;
args.authflavor = ses->se_cb_sec.flavor;
}
/* Create RPC client */
@@ -745,27 +740,6 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = {
static struct workqueue_struct *callback_wq;
-static void run_nfsd4_cb(struct nfsd4_callback *cb)
-{
- queue_work(callback_wq, &cb->cb_work);
-}
-
-static void do_probe_callback(struct nfs4_client *clp)
-{
- struct nfsd4_callback *cb = &clp->cl_cb_null;
-
- cb->cb_op = NULL;
- cb->cb_clp = clp;
-
- cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL];
- cb->cb_msg.rpc_argp = NULL;
- cb->cb_msg.rpc_resp = NULL;
-
- cb->cb_ops = &nfsd4_cb_probe_ops;
-
- run_nfsd4_cb(cb);
-}
-
/*
* Poke the callback thread to process any updates to the callback
* parameters, and send a null probe.
@@ -774,7 +748,7 @@ void nfsd4_probe_callback(struct nfs4_client *clp)
{
clp->cl_cb_state = NFSD4_CB_UNKNOWN;
set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags);
- do_probe_callback(clp);
+ nfsd4_run_cb(&clp->cl_cb_null);
}
void nfsd4_probe_callback_sync(struct nfs4_client *clp)
@@ -846,23 +820,9 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
rpc_wake_up_next(&clp->cl_cb_waitq);
dprintk("%s: freed slot, new seqid=%d\n", __func__,
clp->cl_cb_session->se_cb_seq_nr);
-
- /* We're done looking into the sequence information */
- task->tk_msg.rpc_resp = NULL;
}
-}
-
-static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
-{
- struct nfsd4_callback *cb = calldata;
- struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
- struct nfs4_client *clp = cb->cb_clp;
- struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
-
- nfsd4_cb_done(task, calldata);
-
- if (current_rpc_client != task->tk_client) {
+ if (clp->cl_cb_client != task->tk_client) {
/* We're shutting down or changing cl_cb_client; leave
* it to nfsd4_process_cb_update to restart the call if
* necessary. */
@@ -871,47 +831,42 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
if (cb->cb_done)
return;
- switch (task->tk_status) {
+
+ switch (cb->cb_ops->done(cb, task)) {
case 0:
- cb->cb_done = true;
+ task->tk_status = 0;
+ rpc_restart_call_prepare(task);
return;
- case -EBADHANDLE:
- case -NFS4ERR_BAD_STATEID:
- /* Race: client probably got cb_recall
- * before open reply granting delegation */
+ case 1:
break;
- default:
+ case -1:
/* Network partition? */
nfsd4_mark_cb_down(clp, task->tk_status);
+ break;
+ default:
+ BUG();
}
- if (dp->dl_retries--) {
- rpc_delay(task, 2*HZ);
- task->tk_status = 0;
- rpc_restart_call_prepare(task);
- return;
- }
- nfsd4_mark_cb_down(clp, task->tk_status);
cb->cb_done = true;
}
-static void nfsd4_cb_recall_release(void *calldata)
+static void nfsd4_cb_release(void *calldata)
{
struct nfsd4_callback *cb = calldata;
struct nfs4_client *clp = cb->cb_clp;
- struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
if (cb->cb_done) {
spin_lock(&clp->cl_lock);
list_del(&cb->cb_per_client);
spin_unlock(&clp->cl_lock);
- nfs4_put_delegation(dp);
+
+ cb->cb_ops->release(cb);
}
}
-static const struct rpc_call_ops nfsd4_cb_recall_ops = {
+static const struct rpc_call_ops nfsd4_cb_ops = {
.rpc_call_prepare = nfsd4_cb_prepare,
- .rpc_call_done = nfsd4_cb_recall_done,
- .rpc_release = nfsd4_cb_recall_release,
+ .rpc_call_done = nfsd4_cb_done,
+ .rpc_release = nfsd4_cb_release,
};
int nfsd4_create_callback_queue(void)
@@ -933,19 +888,13 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp)
set_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags);
/*
* Note this won't actually result in a null callback;
- * instead, nfsd4_do_callback_rpc() will detect the killed
+ * instead, nfsd4_run_cb_null() will detect the killed
* client, destroy the rpc client, and stop:
*/
- do_probe_callback(clp);
+ nfsd4_run_cb(&clp->cl_cb_null);
flush_workqueue(callback_wq);
}
-static void nfsd4_release_cb(struct nfsd4_callback *cb)
-{
- if (cb->cb_ops->rpc_release)
- cb->cb_ops->rpc_release(cb);
-}
-
/* requires cl_lock: */
static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp)
{
@@ -1008,50 +957,49 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
}
/* Yay, the callback channel's back! Restart any callbacks: */
list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client)
- run_nfsd4_cb(cb);
+ queue_work(callback_wq, &cb->cb_work);
}
-static void nfsd4_do_callback_rpc(struct work_struct *w)
+static void
+nfsd4_run_cb_work(struct work_struct *work)
{
- struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work);
+ struct nfsd4_callback *cb =
+ container_of(work, struct nfsd4_callback, cb_work);
struct nfs4_client *clp = cb->cb_clp;
struct rpc_clnt *clnt;
+ if (cb->cb_ops && cb->cb_ops->prepare)
+ cb->cb_ops->prepare(cb);
+
if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)
nfsd4_process_cb_update(cb);
clnt = clp->cl_cb_client;
if (!clnt) {
/* Callback channel broken, or client killed; give up: */
- nfsd4_release_cb(cb);
+ if (cb->cb_ops && cb->cb_ops->release)
+ cb->cb_ops->release(cb);
return;
}
cb->cb_msg.rpc_cred = clp->cl_cb_cred;
rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
- cb->cb_ops, cb);
-}
-
-void nfsd4_init_callback(struct nfsd4_callback *cb)
-{
- INIT_WORK(&cb->cb_work, nfsd4_do_callback_rpc);
+ cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb);
}
-void nfsd4_cb_recall(struct nfs4_delegation *dp)
+void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
+ struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op)
{
- struct nfsd4_callback *cb = &dp->dl_recall;
- struct nfs4_client *clp = dp->dl_stid.sc_client;
-
- dp->dl_retries = 1;
- cb->cb_op = dp;
cb->cb_clp = clp;
- cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
+ cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op];
cb->cb_msg.rpc_argp = cb;
cb->cb_msg.rpc_resp = cb;
-
- cb->cb_ops = &nfsd4_cb_recall_ops;
-
+ cb->cb_ops = ops;
+ INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
INIT_LIST_HEAD(&cb->cb_per_client);
cb->cb_done = true;
+}
- run_nfsd4_cb(&dp->dl_recall);
+void nfsd4_run_cb(struct nfsd4_callback *cb)
+{
+ queue_work(callback_wq, &cb->cb_work);
}
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index a0ab0a847d69..e1b3d3d472da 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -215,7 +215,8 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
memset(&ent, 0, sizeof(ent));
/* Authentication name */
- if (qword_get(&buf, buf1, PAGE_SIZE) <= 0)
+ len = qword_get(&buf, buf1, PAGE_SIZE);
+ if (len <= 0 || len >= IDMAP_NAMESZ)
goto out;
memcpy(ent.authname, buf1, sizeof(ent.authname));
@@ -245,12 +246,10 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
/* Name */
error = -EINVAL;
len = qword_get(&buf, buf1, PAGE_SIZE);
- if (len < 0)
+ if (len < 0 || len >= IDMAP_NAMESZ)
goto out;
if (len == 0)
set_bit(CACHE_NEGATIVE, &ent.h.flags);
- else if (len >= IDMAP_NAMESZ)
- goto out;
else
memcpy(ent.name, buf1, sizeof(ent.name));
error = -ENOMEM;
@@ -259,15 +258,12 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
goto out;
cache_put(&res->h, cd);
-
error = 0;
out:
kfree(buf1);
-
return error;
}
-
static struct ent *
idtoname_lookup(struct cache_detail *cd, struct ent *item)
{
@@ -368,7 +364,7 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
{
struct ent ent, *res;
char *buf1;
- int error = -EINVAL;
+ int len, error = -EINVAL;
if (buf[buflen - 1] != '\n')
return (-EINVAL);
@@ -381,7 +377,8 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
memset(&ent, 0, sizeof(ent));
/* Authentication name */
- if (qword_get(&buf, buf1, PAGE_SIZE) <= 0)
+ len = qword_get(&buf, buf1, PAGE_SIZE);
+ if (len <= 0 || len >= IDMAP_NAMESZ)
goto out;
memcpy(ent.authname, buf1, sizeof(ent.authname));
@@ -392,8 +389,8 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
IDMAP_TYPE_USER : IDMAP_TYPE_GROUP;
/* Name */
- error = qword_get(&buf, buf1, PAGE_SIZE);
- if (error <= 0 || error >= IDMAP_NAMESZ)
+ len = qword_get(&buf, buf1, PAGE_SIZE);
+ if (len <= 0 || len >= IDMAP_NAMESZ)
goto out;
memcpy(ent.name, buf1, sizeof(ent.name));
@@ -421,7 +418,6 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
error = 0;
out:
kfree(buf1);
-
return (error);
}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 8f029db5d271..0beb023f25ac 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -177,7 +177,7 @@ fh_dup2(struct svc_fh *dst, struct svc_fh *src)
fh_put(dst);
dget(src->fh_dentry);
if (src->fh_export)
- cache_get(&src->fh_export->h);
+ exp_get(src->fh_export);
*dst = *src;
}
@@ -385,8 +385,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (nfsd4_has_session(cstate))
copy_clientid(&open->op_clientid, cstate->session);
- nfs4_lock_state();
-
/* check seqid for replay. set nfs4_owner */
resp = rqstp->rq_resp;
status = nfsd4_process_open1(&resp->cstate, open, nn);
@@ -431,8 +429,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
break;
case NFS4_OPEN_CLAIM_PREVIOUS:
status = nfs4_check_open_reclaim(&open->op_clientid,
- cstate->minorversion,
- nn);
+ cstate, nn);
if (status)
goto out;
open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
@@ -461,19 +458,17 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* set, (2) sets open->op_stateid, (3) sets open->op_delegation.
*/
status = nfsd4_process_open2(rqstp, resfh, open);
- WARN_ON(status && open->op_created);
+ WARN(status && open->op_created,
+ "nfsd4_process_open2 failed to open newly-created file! status=%u\n",
+ be32_to_cpu(status));
out:
if (resfh && resfh != &cstate->current_fh) {
fh_dup2(&cstate->current_fh, resfh);
fh_put(resfh);
kfree(resfh);
}
- nfsd4_cleanup_open_state(open, status);
- if (open->op_openowner && !nfsd4_has_session(cstate))
- cstate->replay_owner = &open->op_openowner->oo_owner;
+ nfsd4_cleanup_open_state(cstate, open, status);
nfsd4_bump_seqid(cstate, status);
- if (!cstate->replay_owner)
- nfs4_unlock_state();
return status;
}
@@ -581,8 +576,12 @@ static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net)
__be32 verf[2];
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- verf[0] = (__be32)nn->nfssvc_boot.tv_sec;
- verf[1] = (__be32)nn->nfssvc_boot.tv_usec;
+ /*
+ * This is opaque to client, so no need to byte-swap. Use
+ * __force to keep sparse happy
+ */
+ verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec;
+ verf[1] = (__force __be32)nn->nfssvc_boot.tv_usec;
memcpy(verifier->data, verf, sizeof(verifier->data));
}
@@ -619,8 +618,7 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
case NF4LNK:
status = nfsd_symlink(rqstp, &cstate->current_fh,
create->cr_name, create->cr_namelen,
- create->cr_linkname, create->cr_linklen,
- &resfh, &create->cr_iattr);
+ create->cr_data, &resfh);
break;
case NF4BLK:
@@ -909,8 +907,8 @@ nfsd4_secinfo_no_name(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstat
default:
return nfserr_inval;
}
- exp_get(cstate->current_fh.fh_export);
- sin->sin_exp = cstate->current_fh.fh_export;
+
+ sin->sin_exp = exp_get(cstate->current_fh.fh_export);
fh_put(&cstate->current_fh);
return nfs_ok;
}
@@ -1015,6 +1013,49 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return status;
}
+static __be32
+nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ struct nfsd4_seek *seek)
+{
+ int whence;
+ __be32 status;
+ struct file *file;
+
+ status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate,
+ &seek->seek_stateid,
+ RD_STATE, &file);
+ if (status) {
+ dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n");
+ return status;
+ }
+
+ switch (seek->seek_whence) {
+ case NFS4_CONTENT_DATA:
+ whence = SEEK_DATA;
+ break;
+ case NFS4_CONTENT_HOLE:
+ whence = SEEK_HOLE;
+ break;
+ default:
+ status = nfserr_union_notsupp;
+ goto out;
+ }
+
+ /*
+ * Note: This call does change file->f_pos, but nothing in NFSD
+ * should ever file->f_pos.
+ */
+ seek->seek_pos = vfs_llseek(file, seek->seek_offset, whence);
+ if (seek->seek_pos < 0)
+ status = nfserrno(seek->seek_pos);
+ else if (seek->seek_pos >= i_size_read(file_inode(file)))
+ seek->seek_eof = true;
+
+out:
+ fput(file);
+ return status;
+}
+
/* This routine never returns NFS_OK! If there are no other errors, it
* will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the
* attributes matched. VERIFY is implemented by mapping NFSERR_SAME
@@ -1231,7 +1272,8 @@ static bool need_wrongsec_check(struct svc_rqst *rqstp)
*/
if (argp->opcnt == resp->opcnt)
return false;
-
+ if (next->opnum == OP_ILLEGAL)
+ return false;
nextd = OPDESC(next);
/*
* Rest of 2.6.3.1.1: certain operations will return WRONGSEC
@@ -1289,7 +1331,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
* Don't use the deferral mechanism for NFSv4; compounds make it
* too hard to avoid non-idempotency problems.
*/
- rqstp->rq_usedeferral = 0;
+ rqstp->rq_usedeferral = false;
/*
* According to RFC3010, this takes precedence over all other errors.
@@ -1391,10 +1433,7 @@ encode_op:
args->ops, args->opcnt, resp->opcnt, op->opnum,
be32_to_cpu(status));
- if (cstate->replay_owner) {
- nfs4_unlock_state();
- cstate->replay_owner = NULL;
- }
+ nfsd4_cstate_clear_replay(cstate);
/* XXX Ugh, we need to get rid of this kind of special case: */
if (op->opnum == OP_READ && op->u.read.rd_filp)
fput(op->u.read.rd_filp);
@@ -1408,7 +1447,7 @@ encode_op:
BUG_ON(cstate->replay_owner);
out:
/* Reset deferral mechanism for RPC deferrals */
- rqstp->rq_usedeferral = 1;
+ rqstp->rq_usedeferral = true;
dprintk("nfsv4 compound returned %d\n", ntohl(status));
return status;
}
@@ -1520,21 +1559,17 @@ static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
u32 maxcount = 0, rlen = 0;
maxcount = svc_max_payload(rqstp);
- rlen = op->u.read.rd_length;
-
- if (rlen > maxcount)
- rlen = maxcount;
+ rlen = min(op->u.read.rd_length, maxcount);
return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32);
}
static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
- u32 maxcount = svc_max_payload(rqstp);
- u32 rlen = op->u.readdir.rd_maxcount;
+ u32 maxcount = 0, rlen = 0;
- if (rlen > maxcount)
- rlen = maxcount;
+ maxcount = svc_max_payload(rqstp);
+ rlen = min(op->u.readdir.rd_maxcount, maxcount);
return (op_encode_hdr_size + op_encode_verifier_maxsz +
XDR_QUADLEN(rlen)) * sizeof(__be32);
@@ -1555,7 +1590,8 @@ static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op
static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp,
struct nfsd4_op *op)
{
- return NFS4_MAX_SESSIONID_LEN + 20;
+ return (op_encode_hdr_size
+ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) * sizeof(__be32);
}
static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
@@ -1859,6 +1895,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_func = (nfsd4op_func)nfsd4_sequence,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
.op_name = "OP_SEQUENCE",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_sequence_rsize,
},
[OP_DESTROY_CLIENTID] = {
.op_func = (nfsd4op_func)nfsd4_destroy_clientid,
@@ -1890,6 +1927,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
+
+ /* NFSv4.2 operations */
+ [OP_SEEK] = {
+ .op_func = (nfsd4op_func)nfsd4_seek,
+ .op_name = "OP_SEEK",
+ },
};
int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 9c271f42604a..a25490ae6c62 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -58,7 +58,7 @@ struct nfsd4_client_tracking_ops {
void (*create)(struct nfs4_client *);
void (*remove)(struct nfs4_client *);
int (*check)(struct nfs4_client *);
- void (*grace_done)(struct nfsd_net *, time_t);
+ void (*grace_done)(struct nfsd_net *);
};
/* Globals */
@@ -188,7 +188,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
status = mnt_want_write_file(nn->rec_file);
if (status)
- return;
+ goto out_creds;
dir = nn->rec_file->f_path.dentry;
/* lock the parent */
@@ -228,6 +228,7 @@ out_unlock:
user_recovery_dirname);
}
mnt_drop_write_file(nn->rec_file);
+out_creds:
nfs4_reset_creds(original_cred);
}
@@ -392,7 +393,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
}
static void
-nfsd4_recdir_purge_old(struct nfsd_net *nn, time_t boot_time)
+nfsd4_recdir_purge_old(struct nfsd_net *nn)
{
int status;
@@ -479,6 +480,16 @@ nfsd4_init_recdir(struct net *net)
return status;
}
+static void
+nfsd4_shutdown_recdir(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ if (!nn->rec_file)
+ return;
+ fput(nn->rec_file);
+ nn->rec_file = NULL;
+}
static int
nfs4_legacy_state_init(struct net *net)
@@ -512,10 +523,13 @@ nfsd4_load_reboot_recovery_data(struct net *net)
int status;
status = nfsd4_init_recdir(net);
- if (!status)
- status = nfsd4_recdir_load(net);
if (status)
- printk(KERN_ERR "NFSD: Failure reading reboot recovery data\n");
+ return status;
+
+ status = nfsd4_recdir_load(net);
+ if (status)
+ nfsd4_shutdown_recdir(net);
+
return status;
}
@@ -546,21 +560,12 @@ err:
}
static void
-nfsd4_shutdown_recdir(struct nfsd_net *nn)
-{
- if (!nn->rec_file)
- return;
- fput(nn->rec_file);
- nn->rec_file = NULL;
-}
-
-static void
nfsd4_legacy_tracking_exit(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
nfs4_release_reclaim(nn);
- nfsd4_shutdown_recdir(nn);
+ nfsd4_shutdown_recdir(net);
nfs4_legacy_state_shutdown(net);
}
@@ -670,7 +675,6 @@ __cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg)
}
schedule();
- set_current_state(TASK_RUNNING);
if (msg.errno < 0)
ret = msg.errno;
@@ -1016,7 +1020,7 @@ nfsd4_cld_check(struct nfs4_client *clp)
}
static void
-nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time)
+nfsd4_cld_grace_done(struct nfsd_net *nn)
{
int ret;
struct cld_upcall *cup;
@@ -1029,7 +1033,7 @@ nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time)
}
cup->cu_msg.cm_cmd = Cld_GraceDone;
- cup->cu_msg.cm_u.cm_gracetime = (int64_t)boot_time;
+ cup->cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time;
ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
if (!ret)
ret = cup->cu_msg.cm_status;
@@ -1062,6 +1066,8 @@ MODULE_PARM_DESC(cltrack_legacy_disable,
#define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR="
#define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR="
+#define HAS_SESSION_ENV_PREFIX "NFSDCLTRACK_CLIENT_HAS_SESSION="
+#define GRACE_START_ENV_PREFIX "NFSDCLTRACK_GRACE_START="
static char *
nfsd4_cltrack_legacy_topdir(void)
@@ -1126,10 +1132,60 @@ nfsd4_cltrack_legacy_recdir(const struct xdr_netobj *name)
return result;
}
+static char *
+nfsd4_cltrack_client_has_session(struct nfs4_client *clp)
+{
+ int copied;
+ size_t len;
+ char *result;
+
+ /* prefix + Y/N character + terminating NULL */
+ len = strlen(HAS_SESSION_ENV_PREFIX) + 1 + 1;
+
+ result = kmalloc(len, GFP_KERNEL);
+ if (!result)
+ return result;
+
+ copied = snprintf(result, len, HAS_SESSION_ENV_PREFIX "%c",
+ clp->cl_minorversion ? 'Y' : 'N');
+ if (copied >= len) {
+ /* just return nothing if output was truncated */
+ kfree(result);
+ return NULL;
+ }
+
+ return result;
+}
+
+static char *
+nfsd4_cltrack_grace_start(time_t grace_start)
+{
+ int copied;
+ size_t len;
+ char *result;
+
+ /* prefix + max width of int64_t string + terminating NULL */
+ len = strlen(GRACE_START_ENV_PREFIX) + 22 + 1;
+
+ result = kmalloc(len, GFP_KERNEL);
+ if (!result)
+ return result;
+
+ copied = snprintf(result, len, GRACE_START_ENV_PREFIX "%ld",
+ grace_start);
+ if (copied >= len) {
+ /* just return nothing if output was truncated */
+ kfree(result);
+ return NULL;
+ }
+
+ return result;
+}
+
static int
-nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy)
+nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *env0, char *env1)
{
- char *envp[2];
+ char *envp[3];
char *argv[4];
int ret;
@@ -1140,10 +1196,12 @@ nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy)
dprintk("%s: cmd: %s\n", __func__, cmd);
dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)");
- dprintk("%s: legacy: %s\n", __func__, legacy ? legacy : "(null)");
+ dprintk("%s: env0: %s\n", __func__, env0 ? env0 : "(null)");
+ dprintk("%s: env1: %s\n", __func__, env1 ? env1 : "(null)");
- envp[0] = legacy;
- envp[1] = NULL;
+ envp[0] = env0;
+ envp[1] = env1;
+ envp[2] = NULL;
argv[0] = (char *)cltrack_prog;
argv[1] = cmd;
@@ -1187,28 +1245,78 @@ bin_to_hex_dup(const unsigned char *src, int srclen)
}
static int
-nfsd4_umh_cltrack_init(struct net __attribute__((unused)) *net)
+nfsd4_umh_cltrack_init(struct net *net)
{
+ int ret;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ char *grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
+
/* XXX: The usermode helper s not working in container yet. */
if (net != &init_net) {
WARN(1, KERN_ERR "NFSD: attempt to initialize umh client "
"tracking in a container!\n");
return -EINVAL;
}
- return nfsd4_umh_cltrack_upcall("init", NULL, NULL);
+
+ ret = nfsd4_umh_cltrack_upcall("init", NULL, grace_start, NULL);
+ kfree(grace_start);
+ return ret;
+}
+
+static void
+nfsd4_cltrack_upcall_lock(struct nfs4_client *clp)
+{
+ wait_on_bit_lock(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK,
+ TASK_UNINTERRUPTIBLE);
+}
+
+static void
+nfsd4_cltrack_upcall_unlock(struct nfs4_client *clp)
+{
+ smp_mb__before_atomic();
+ clear_bit(NFSD4_CLIENT_UPCALL_LOCK, &clp->cl_flags);
+ smp_mb__after_atomic();
+ wake_up_bit(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK);
}
static void
nfsd4_umh_cltrack_create(struct nfs4_client *clp)
{
- char *hexid;
+ char *hexid, *has_session, *grace_start;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+ /*
+ * With v4.0 clients, there's little difference in outcome between a
+ * create and check operation, and we can end up calling into this
+ * function multiple times per client (once for each openowner). So,
+ * for v4.0 clients skip upcalling once the client has been recorded
+ * on stable storage.
+ *
+ * For v4.1+ clients, the outcome of the two operations is different,
+ * so we must ensure that we upcall for the create operation. v4.1+
+ * clients call this on RECLAIM_COMPLETE though, so we should only end
+ * up doing a single create upcall per client.
+ */
+ if (clp->cl_minorversion == 0 &&
+ test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+ return;
hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
if (!hexid) {
dprintk("%s: can't allocate memory for upcall!\n", __func__);
return;
}
- nfsd4_umh_cltrack_upcall("create", hexid, NULL);
+
+ has_session = nfsd4_cltrack_client_has_session(clp);
+ grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
+
+ nfsd4_cltrack_upcall_lock(clp);
+ if (!nfsd4_umh_cltrack_upcall("create", hexid, has_session, grace_start))
+ set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+ nfsd4_cltrack_upcall_unlock(clp);
+
+ kfree(has_session);
+ kfree(grace_start);
kfree(hexid);
}
@@ -1217,12 +1325,21 @@ nfsd4_umh_cltrack_remove(struct nfs4_client *clp)
{
char *hexid;
+ if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+ return;
+
hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
if (!hexid) {
dprintk("%s: can't allocate memory for upcall!\n", __func__);
return;
}
- nfsd4_umh_cltrack_upcall("remove", hexid, NULL);
+
+ nfsd4_cltrack_upcall_lock(clp);
+ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags) &&
+ nfsd4_umh_cltrack_upcall("remove", hexid, NULL, NULL) == 0)
+ clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+ nfsd4_cltrack_upcall_unlock(clp);
+
kfree(hexid);
}
@@ -1230,30 +1347,45 @@ static int
nfsd4_umh_cltrack_check(struct nfs4_client *clp)
{
int ret;
- char *hexid, *legacy;
+ char *hexid, *has_session, *legacy;
+
+ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+ return 0;
hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
if (!hexid) {
dprintk("%s: can't allocate memory for upcall!\n", __func__);
return -ENOMEM;
}
+
+ has_session = nfsd4_cltrack_client_has_session(clp);
legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name);
- ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy);
+
+ nfsd4_cltrack_upcall_lock(clp);
+ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) {
+ ret = 0;
+ } else {
+ ret = nfsd4_umh_cltrack_upcall("check", hexid, has_session, legacy);
+ if (ret == 0)
+ set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+ }
+ nfsd4_cltrack_upcall_unlock(clp);
+ kfree(has_session);
kfree(legacy);
kfree(hexid);
+
return ret;
}
static void
-nfsd4_umh_cltrack_grace_done(struct nfsd_net __attribute__((unused)) *nn,
- time_t boot_time)
+nfsd4_umh_cltrack_grace_done(struct nfsd_net *nn)
{
char *legacy;
char timestr[22]; /* FIXME: better way to determine max size? */
- sprintf(timestr, "%ld", boot_time);
+ sprintf(timestr, "%ld", nn->boot_time);
legacy = nfsd4_cltrack_legacy_topdir();
- nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy);
+ nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy, NULL);
kfree(legacy);
}
@@ -1356,10 +1488,10 @@ nfsd4_client_record_check(struct nfs4_client *clp)
}
void
-nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time)
+nfsd4_record_grace_done(struct nfsd_net *nn)
{
if (nn->client_tracking_ops)
- nn->client_tracking_ops->grace_done(nn, boot_time);
+ nn->client_tracking_ops->grace_done(nn);
}
static int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2204e1fe5725..e9c3afe4b5d3 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -70,13 +70,11 @@ static u64 current_sessionid = 1;
#define CURRENT_STATEID(stateid) (!memcmp((stateid), &currentstateid, sizeof(stateid_t)))
/* forward declarations */
-static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner);
+static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner);
+static void nfs4_free_ol_stateid(struct nfs4_stid *stid);
/* Locking: */
-/* Currently used for almost all code touching nfsv4 state: */
-static DEFINE_MUTEX(client_mutex);
-
/*
* Currently used for the del_recall_lru and file hash table. In an
* effort to decrease the scope of the client_mutex, this spinlock may
@@ -84,31 +82,27 @@ static DEFINE_MUTEX(client_mutex);
*/
static DEFINE_SPINLOCK(state_lock);
+/*
+ * A waitqueue for all in-progress 4.0 CLOSE operations that are waiting for
+ * the refcount on the open stateid to drop.
+ */
+static DECLARE_WAIT_QUEUE_HEAD(close_wq);
+
static struct kmem_cache *openowner_slab;
static struct kmem_cache *lockowner_slab;
static struct kmem_cache *file_slab;
static struct kmem_cache *stateid_slab;
static struct kmem_cache *deleg_slab;
-void
-nfs4_lock_state(void)
-{
- mutex_lock(&client_mutex);
-}
-
static void free_session(struct nfsd4_session *);
+static struct nfsd4_callback_ops nfsd4_cb_recall_ops;
+
static bool is_session_dead(struct nfsd4_session *ses)
{
return ses->se_flags & NFS4_SESSION_DEAD;
}
-void nfsd4_put_session(struct nfsd4_session *ses)
-{
- if (atomic_dec_and_test(&ses->se_ref) && is_session_dead(ses))
- free_session(ses);
-}
-
static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_by_me)
{
if (atomic_read(&ses->se_ref) > ref_held_by_me)
@@ -117,46 +111,17 @@ static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_b
return nfs_ok;
}
-static __be32 nfsd4_get_session_locked(struct nfsd4_session *ses)
-{
- if (is_session_dead(ses))
- return nfserr_badsession;
- atomic_inc(&ses->se_ref);
- return nfs_ok;
-}
-
-void
-nfs4_unlock_state(void)
-{
- mutex_unlock(&client_mutex);
-}
-
static bool is_client_expired(struct nfs4_client *clp)
{
return clp->cl_time == 0;
}
-static __be32 mark_client_expired_locked(struct nfs4_client *clp)
-{
- if (atomic_read(&clp->cl_refcount))
- return nfserr_jukebox;
- clp->cl_time = 0;
- return nfs_ok;
-}
-
-static __be32 mark_client_expired(struct nfs4_client *clp)
+static __be32 get_client_locked(struct nfs4_client *clp)
{
struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
- __be32 ret;
- spin_lock(&nn->client_lock);
- ret = mark_client_expired_locked(clp);
- spin_unlock(&nn->client_lock);
- return ret;
-}
+ lockdep_assert_held(&nn->client_lock);
-static __be32 get_client_locked(struct nfs4_client *clp)
-{
if (is_client_expired(clp))
return nfserr_expired;
atomic_inc(&clp->cl_refcount);
@@ -197,13 +162,17 @@ renew_client(struct nfs4_client *clp)
static void put_client_renew_locked(struct nfs4_client *clp)
{
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+ lockdep_assert_held(&nn->client_lock);
+
if (!atomic_dec_and_test(&clp->cl_refcount))
return;
if (!is_client_expired(clp))
renew_client_locked(clp);
}
-void put_client_renew(struct nfs4_client *clp)
+static void put_client_renew(struct nfs4_client *clp)
{
struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
@@ -214,6 +183,84 @@ void put_client_renew(struct nfs4_client *clp)
spin_unlock(&nn->client_lock);
}
+static __be32 nfsd4_get_session_locked(struct nfsd4_session *ses)
+{
+ __be32 status;
+
+ if (is_session_dead(ses))
+ return nfserr_badsession;
+ status = get_client_locked(ses->se_client);
+ if (status)
+ return status;
+ atomic_inc(&ses->se_ref);
+ return nfs_ok;
+}
+
+static void nfsd4_put_session_locked(struct nfsd4_session *ses)
+{
+ struct nfs4_client *clp = ses->se_client;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+ lockdep_assert_held(&nn->client_lock);
+
+ if (atomic_dec_and_test(&ses->se_ref) && is_session_dead(ses))
+ free_session(ses);
+ put_client_renew_locked(clp);
+}
+
+static void nfsd4_put_session(struct nfsd4_session *ses)
+{
+ struct nfs4_client *clp = ses->se_client;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+ spin_lock(&nn->client_lock);
+ nfsd4_put_session_locked(ses);
+ spin_unlock(&nn->client_lock);
+}
+
+static inline struct nfs4_stateowner *
+nfs4_get_stateowner(struct nfs4_stateowner *sop)
+{
+ atomic_inc(&sop->so_count);
+ return sop;
+}
+
+static int
+same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner)
+{
+ return (sop->so_owner.len == owner->len) &&
+ 0 == memcmp(sop->so_owner.data, owner->data, owner->len);
+}
+
+static struct nfs4_openowner *
+find_openstateowner_str_locked(unsigned int hashval, struct nfsd4_open *open,
+ struct nfs4_client *clp)
+{
+ struct nfs4_stateowner *so;
+
+ lockdep_assert_held(&clp->cl_lock);
+
+ list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[hashval],
+ so_strhash) {
+ if (!so->so_is_open_owner)
+ continue;
+ if (same_owner_str(so, &open->op_owner))
+ return openowner(nfs4_get_stateowner(so));
+ }
+ return NULL;
+}
+
+static struct nfs4_openowner *
+find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open,
+ struct nfs4_client *clp)
+{
+ struct nfs4_openowner *oo;
+
+ spin_lock(&clp->cl_lock);
+ oo = find_openstateowner_str_locked(hashval, open, clp);
+ spin_unlock(&clp->cl_lock);
+ return oo;
+}
static inline u32
opaque_hashval(const void *ptr, int nbytes)
@@ -236,10 +283,11 @@ static void nfsd4_free_file(struct nfs4_file *f)
static inline void
put_nfs4_file(struct nfs4_file *fi)
{
+ might_lock(&state_lock);
+
if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) {
hlist_del(&fi->fi_hash);
spin_unlock(&state_lock);
- iput(fi->fi_inode);
nfsd4_free_file(fi);
}
}
@@ -250,7 +298,80 @@ get_nfs4_file(struct nfs4_file *fi)
atomic_inc(&fi->fi_ref);
}
-static int num_delegations;
+static struct file *
+__nfs4_get_fd(struct nfs4_file *f, int oflag)
+{
+ if (f->fi_fds[oflag])
+ return get_file(f->fi_fds[oflag]);
+ return NULL;
+}
+
+static struct file *
+find_writeable_file_locked(struct nfs4_file *f)
+{
+ struct file *ret;
+
+ lockdep_assert_held(&f->fi_lock);
+
+ ret = __nfs4_get_fd(f, O_WRONLY);
+ if (!ret)
+ ret = __nfs4_get_fd(f, O_RDWR);
+ return ret;
+}
+
+static struct file *
+find_writeable_file(struct nfs4_file *f)
+{
+ struct file *ret;
+
+ spin_lock(&f->fi_lock);
+ ret = find_writeable_file_locked(f);
+ spin_unlock(&f->fi_lock);
+
+ return ret;
+}
+
+static struct file *find_readable_file_locked(struct nfs4_file *f)
+{
+ struct file *ret;
+
+ lockdep_assert_held(&f->fi_lock);
+
+ ret = __nfs4_get_fd(f, O_RDONLY);
+ if (!ret)
+ ret = __nfs4_get_fd(f, O_RDWR);
+ return ret;
+}
+
+static struct file *
+find_readable_file(struct nfs4_file *f)
+{
+ struct file *ret;
+
+ spin_lock(&f->fi_lock);
+ ret = find_readable_file_locked(f);
+ spin_unlock(&f->fi_lock);
+
+ return ret;
+}
+
+static struct file *
+find_any_file(struct nfs4_file *f)
+{
+ struct file *ret;
+
+ spin_lock(&f->fi_lock);
+ ret = __nfs4_get_fd(f, O_RDWR);
+ if (!ret) {
+ ret = __nfs4_get_fd(f, O_WRONLY);
+ if (!ret)
+ ret = __nfs4_get_fd(f, O_RDONLY);
+ }
+ spin_unlock(&f->fi_lock);
+ return ret;
+}
+
+static atomic_long_t num_delegations;
unsigned long max_delegations;
/*
@@ -262,12 +383,11 @@ unsigned long max_delegations;
#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS)
#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1)
-static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername)
+static unsigned int ownerstr_hashval(struct xdr_netobj *ownername)
{
unsigned int ret;
ret = opaque_hashval(ownername->data, ownername->len);
- ret += clientid;
return ret & OWNER_HASH_MASK;
}
@@ -275,75 +395,124 @@ static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername)
#define FILE_HASH_BITS 8
#define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
-static unsigned int file_hashval(struct inode *ino)
+static unsigned int nfsd_fh_hashval(struct knfsd_fh *fh)
+{
+ return jhash2(fh->fh_base.fh_pad, XDR_QUADLEN(fh->fh_size), 0);
+}
+
+static unsigned int file_hashval(struct knfsd_fh *fh)
+{
+ return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1);
+}
+
+static bool nfsd_fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
{
- /* XXX: why are we hashing on inode pointer, anyway? */
- return hash_ptr(ino, FILE_HASH_BITS);
+ return fh1->fh_size == fh2->fh_size &&
+ !memcmp(fh1->fh_base.fh_pad,
+ fh2->fh_base.fh_pad,
+ fh1->fh_size);
}
static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
-static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag)
+static void
+__nfs4_file_get_access(struct nfs4_file *fp, u32 access)
{
- WARN_ON_ONCE(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR]));
- atomic_inc(&fp->fi_access[oflag]);
+ lockdep_assert_held(&fp->fi_lock);
+
+ if (access & NFS4_SHARE_ACCESS_WRITE)
+ atomic_inc(&fp->fi_access[O_WRONLY]);
+ if (access & NFS4_SHARE_ACCESS_READ)
+ atomic_inc(&fp->fi_access[O_RDONLY]);
}
-static void nfs4_file_get_access(struct nfs4_file *fp, int oflag)
+static __be32
+nfs4_file_get_access(struct nfs4_file *fp, u32 access)
{
- if (oflag == O_RDWR) {
- __nfs4_file_get_access(fp, O_RDONLY);
- __nfs4_file_get_access(fp, O_WRONLY);
- } else
- __nfs4_file_get_access(fp, oflag);
+ lockdep_assert_held(&fp->fi_lock);
+
+ /* Does this access mode make sense? */
+ if (access & ~NFS4_SHARE_ACCESS_BOTH)
+ return nfserr_inval;
+
+ /* Does it conflict with a deny mode already set? */
+ if ((access & fp->fi_share_deny) != 0)
+ return nfserr_share_denied;
+
+ __nfs4_file_get_access(fp, access);
+ return nfs_ok;
}
-static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag)
+static __be32 nfs4_file_check_deny(struct nfs4_file *fp, u32 deny)
{
- if (fp->fi_fds[oflag]) {
- fput(fp->fi_fds[oflag]);
- fp->fi_fds[oflag] = NULL;
+ /* Common case is that there is no deny mode. */
+ if (deny) {
+ /* Does this deny mode make sense? */
+ if (deny & ~NFS4_SHARE_DENY_BOTH)
+ return nfserr_inval;
+
+ if ((deny & NFS4_SHARE_DENY_READ) &&
+ atomic_read(&fp->fi_access[O_RDONLY]))
+ return nfserr_share_denied;
+
+ if ((deny & NFS4_SHARE_DENY_WRITE) &&
+ atomic_read(&fp->fi_access[O_WRONLY]))
+ return nfserr_share_denied;
}
+ return nfs_ok;
}
static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
{
- if (atomic_dec_and_test(&fp->fi_access[oflag])) {
- nfs4_file_put_fd(fp, oflag);
+ might_lock(&fp->fi_lock);
+
+ if (atomic_dec_and_lock(&fp->fi_access[oflag], &fp->fi_lock)) {
+ struct file *f1 = NULL;
+ struct file *f2 = NULL;
+
+ swap(f1, fp->fi_fds[oflag]);
if (atomic_read(&fp->fi_access[1 - oflag]) == 0)
- nfs4_file_put_fd(fp, O_RDWR);
+ swap(f2, fp->fi_fds[O_RDWR]);
+ spin_unlock(&fp->fi_lock);
+ if (f1)
+ fput(f1);
+ if (f2)
+ fput(f2);
}
}
-static void nfs4_file_put_access(struct nfs4_file *fp, int oflag)
+static void nfs4_file_put_access(struct nfs4_file *fp, u32 access)
{
- if (oflag == O_RDWR) {
- __nfs4_file_put_access(fp, O_RDONLY);
+ WARN_ON_ONCE(access & ~NFS4_SHARE_ACCESS_BOTH);
+
+ if (access & NFS4_SHARE_ACCESS_WRITE)
__nfs4_file_put_access(fp, O_WRONLY);
- } else
- __nfs4_file_put_access(fp, oflag);
+ if (access & NFS4_SHARE_ACCESS_READ)
+ __nfs4_file_put_access(fp, O_RDONLY);
}
-static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct
-kmem_cache *slab)
+static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
+ struct kmem_cache *slab)
{
- struct idr *stateids = &cl->cl_stateids;
struct nfs4_stid *stid;
int new_id;
- stid = kmem_cache_alloc(slab, GFP_KERNEL);
+ stid = kmem_cache_zalloc(slab, GFP_KERNEL);
if (!stid)
return NULL;
- new_id = idr_alloc_cyclic(stateids, stid, 0, 0, GFP_KERNEL);
+ idr_preload(GFP_KERNEL);
+ spin_lock(&cl->cl_lock);
+ new_id = idr_alloc_cyclic(&cl->cl_stateids, stid, 0, 0, GFP_NOWAIT);
+ spin_unlock(&cl->cl_lock);
+ idr_preload_end();
if (new_id < 0)
goto out_free;
stid->sc_client = cl;
- stid->sc_type = 0;
stid->sc_stateid.si_opaque.so_id = new_id;
stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid;
/* Will be incremented before return to client: */
- stid->sc_stateid.si_generation = 0;
+ atomic_set(&stid->sc_count, 1);
/*
* It shouldn't be a problem to reuse an opaque stateid value.
@@ -360,9 +529,24 @@ out_free:
return NULL;
}
-static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
+static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp)
{
- return openlockstateid(nfs4_alloc_stid(clp, stateid_slab));
+ struct nfs4_stid *stid;
+ struct nfs4_ol_stateid *stp;
+
+ stid = nfs4_alloc_stid(clp, stateid_slab);
+ if (!stid)
+ return NULL;
+
+ stp = openlockstateid(stid);
+ stp->st_stid.sc_free = nfs4_free_ol_stateid;
+ return stp;
+}
+
+static void nfs4_free_deleg(struct nfs4_stid *stid)
+{
+ kmem_cache_free(deleg_slab, stid);
+ atomic_long_dec(&num_delegations);
}
/*
@@ -379,10 +563,11 @@ static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
* Each filter is 256 bits. We hash the filehandle to 32bit and use the
* low 3 bytes as hash-table indices.
*
- * 'state_lock', which is always held when block_delegations() is called,
+ * 'blocked_delegations_lock', which is always taken in block_delegations(),
* is used to manage concurrent access. Testing does not need the lock
* except when swapping the two filters.
*/
+static DEFINE_SPINLOCK(blocked_delegations_lock);
static struct bloom_pair {
int entries, old_entries;
time_t swap_time;
@@ -398,7 +583,7 @@ static int delegation_blocked(struct knfsd_fh *fh)
if (bd->entries == 0)
return 0;
if (seconds_since_boot() - bd->swap_time > 30) {
- spin_lock(&state_lock);
+ spin_lock(&blocked_delegations_lock);
if (seconds_since_boot() - bd->swap_time > 30) {
bd->entries -= bd->old_entries;
bd->old_entries = bd->entries;
@@ -407,7 +592,7 @@ static int delegation_blocked(struct knfsd_fh *fh)
bd->new = 1-bd->new;
bd->swap_time = seconds_since_boot();
}
- spin_unlock(&state_lock);
+ spin_unlock(&blocked_delegations_lock);
}
hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
if (test_bit(hash&255, bd->set[0]) &&
@@ -430,76 +615,83 @@ static void block_delegations(struct knfsd_fh *fh)
hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
+ spin_lock(&blocked_delegations_lock);
__set_bit(hash&255, bd->set[bd->new]);
__set_bit((hash>>8)&255, bd->set[bd->new]);
__set_bit((hash>>16)&255, bd->set[bd->new]);
if (bd->entries == 0)
bd->swap_time = seconds_since_boot();
bd->entries += 1;
+ spin_unlock(&blocked_delegations_lock);
}
static struct nfs4_delegation *
-alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh)
+alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh)
{
struct nfs4_delegation *dp;
+ long n;
dprintk("NFSD alloc_init_deleg\n");
- if (num_delegations > max_delegations)
- return NULL;
+ n = atomic_long_inc_return(&num_delegations);
+ if (n < 0 || n > max_delegations)
+ goto out_dec;
if (delegation_blocked(&current_fh->fh_handle))
- return NULL;
+ goto out_dec;
dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab));
if (dp == NULL)
- return dp;
+ goto out_dec;
+
+ dp->dl_stid.sc_free = nfs4_free_deleg;
/*
* delegation seqid's are never incremented. The 4.1 special
* meaning of seqid 0 isn't meaningful, really, but let's avoid
* 0 anyway just for consistency and use 1:
*/
dp->dl_stid.sc_stateid.si_generation = 1;
- num_delegations++;
INIT_LIST_HEAD(&dp->dl_perfile);
INIT_LIST_HEAD(&dp->dl_perclnt);
INIT_LIST_HEAD(&dp->dl_recall_lru);
- dp->dl_file = NULL;
dp->dl_type = NFS4_OPEN_DELEGATE_READ;
- fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
- dp->dl_time = 0;
- atomic_set(&dp->dl_count, 1);
- nfsd4_init_callback(&dp->dl_recall);
+ dp->dl_retries = 1;
+ nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
+ &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL);
return dp;
+out_dec:
+ atomic_long_dec(&num_delegations);
+ return NULL;
}
-static void remove_stid(struct nfs4_stid *s)
+void
+nfs4_put_stid(struct nfs4_stid *s)
{
- struct idr *stateids = &s->sc_client->cl_stateids;
+ struct nfs4_file *fp = s->sc_file;
+ struct nfs4_client *clp = s->sc_client;
- idr_remove(stateids, s->sc_stateid.si_opaque.so_id);
-}
+ might_lock(&clp->cl_lock);
-static void nfs4_free_stid(struct kmem_cache *slab, struct nfs4_stid *s)
-{
- kmem_cache_free(slab, s);
-}
-
-void
-nfs4_put_delegation(struct nfs4_delegation *dp)
-{
- if (atomic_dec_and_test(&dp->dl_count)) {
- nfs4_free_stid(deleg_slab, &dp->dl_stid);
- num_delegations--;
+ if (!atomic_dec_and_lock(&s->sc_count, &clp->cl_lock)) {
+ wake_up_all(&close_wq);
+ return;
}
+ idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id);
+ spin_unlock(&clp->cl_lock);
+ s->sc_free(s);
+ if (fp)
+ put_nfs4_file(fp);
}
static void nfs4_put_deleg_lease(struct nfs4_file *fp)
{
- if (!fp->fi_lease)
- return;
- if (atomic_dec_and_test(&fp->fi_delegees)) {
- vfs_setlease(fp->fi_deleg_file, F_UNLCK, &fp->fi_lease);
- fp->fi_lease = NULL;
- fput(fp->fi_deleg_file);
- fp->fi_deleg_file = NULL;
+ struct file *filp = NULL;
+
+ spin_lock(&fp->fi_lock);
+ if (fp->fi_deleg_file && atomic_dec_and_test(&fp->fi_delegees))
+ swap(filp, fp->fi_deleg_file);
+ spin_unlock(&fp->fi_lock);
+
+ if (filp) {
+ vfs_setlease(filp, F_UNLCK, NULL, NULL);
+ fput(filp);
}
}
@@ -512,54 +704,55 @@ static void
hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
{
lockdep_assert_held(&state_lock);
+ lockdep_assert_held(&fp->fi_lock);
+ atomic_inc(&dp->dl_stid.sc_count);
dp->dl_stid.sc_type = NFS4_DELEG_STID;
list_add(&dp->dl_perfile, &fp->fi_delegations);
list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
}
-/* Called under the state lock. */
static void
-unhash_delegation(struct nfs4_delegation *dp)
+unhash_delegation_locked(struct nfs4_delegation *dp)
{
- spin_lock(&state_lock);
- list_del_init(&dp->dl_perclnt);
- list_del_init(&dp->dl_perfile);
- list_del_init(&dp->dl_recall_lru);
- spin_unlock(&state_lock);
- if (dp->dl_file) {
- nfs4_put_deleg_lease(dp->dl_file);
- put_nfs4_file(dp->dl_file);
- dp->dl_file = NULL;
- }
-}
-
+ struct nfs4_file *fp = dp->dl_stid.sc_file;
+ lockdep_assert_held(&state_lock);
-static void destroy_revoked_delegation(struct nfs4_delegation *dp)
-{
+ dp->dl_stid.sc_type = NFS4_CLOSED_DELEG_STID;
+ /* Ensure that deleg break won't try to requeue it */
+ ++dp->dl_time;
+ spin_lock(&fp->fi_lock);
+ list_del_init(&dp->dl_perclnt);
list_del_init(&dp->dl_recall_lru);
- remove_stid(&dp->dl_stid);
- nfs4_put_delegation(dp);
+ list_del_init(&dp->dl_perfile);
+ spin_unlock(&fp->fi_lock);
}
static void destroy_delegation(struct nfs4_delegation *dp)
{
- unhash_delegation(dp);
- remove_stid(&dp->dl_stid);
- nfs4_put_delegation(dp);
+ spin_lock(&state_lock);
+ unhash_delegation_locked(dp);
+ spin_unlock(&state_lock);
+ nfs4_put_deleg_lease(dp->dl_stid.sc_file);
+ nfs4_put_stid(&dp->dl_stid);
}
static void revoke_delegation(struct nfs4_delegation *dp)
{
struct nfs4_client *clp = dp->dl_stid.sc_client;
+ WARN_ON(!list_empty(&dp->dl_recall_lru));
+
+ nfs4_put_deleg_lease(dp->dl_stid.sc_file);
+
if (clp->cl_minorversion == 0)
- destroy_delegation(dp);
+ nfs4_put_stid(&dp->dl_stid);
else {
- unhash_delegation(dp);
dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID;
+ spin_lock(&clp->cl_lock);
list_add(&dp->dl_recall_lru, &clp->cl_revoked);
+ spin_unlock(&clp->cl_lock);
}
}
@@ -607,57 +800,62 @@ bmap_to_share_mode(unsigned long bmap) {
return access;
}
-static bool
-test_share(struct nfs4_ol_stateid *stp, struct nfsd4_open *open) {
- unsigned int access, deny;
-
- access = bmap_to_share_mode(stp->st_access_bmap);
- deny = bmap_to_share_mode(stp->st_deny_bmap);
- if ((access & open->op_share_deny) || (deny & open->op_share_access))
- return false;
- return true;
-}
-
/* set share access for a given stateid */
static inline void
set_access(u32 access, struct nfs4_ol_stateid *stp)
{
- __set_bit(access, &stp->st_access_bmap);
+ unsigned char mask = 1 << access;
+
+ WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH);
+ stp->st_access_bmap |= mask;
}
/* clear share access for a given stateid */
static inline void
clear_access(u32 access, struct nfs4_ol_stateid *stp)
{
- __clear_bit(access, &stp->st_access_bmap);
+ unsigned char mask = 1 << access;
+
+ WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH);
+ stp->st_access_bmap &= ~mask;
}
/* test whether a given stateid has access */
static inline bool
test_access(u32 access, struct nfs4_ol_stateid *stp)
{
- return test_bit(access, &stp->st_access_bmap);
+ unsigned char mask = 1 << access;
+
+ return (bool)(stp->st_access_bmap & mask);
}
/* set share deny for a given stateid */
static inline void
-set_deny(u32 access, struct nfs4_ol_stateid *stp)
+set_deny(u32 deny, struct nfs4_ol_stateid *stp)
{
- __set_bit(access, &stp->st_deny_bmap);
+ unsigned char mask = 1 << deny;
+
+ WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH);
+ stp->st_deny_bmap |= mask;
}
/* clear share deny for a given stateid */
static inline void
-clear_deny(u32 access, struct nfs4_ol_stateid *stp)
+clear_deny(u32 deny, struct nfs4_ol_stateid *stp)
{
- __clear_bit(access, &stp->st_deny_bmap);
+ unsigned char mask = 1 << deny;
+
+ WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH);
+ stp->st_deny_bmap &= ~mask;
}
/* test whether a given stateid is denying specific access */
static inline bool
-test_deny(u32 access, struct nfs4_ol_stateid *stp)
+test_deny(u32 deny, struct nfs4_ol_stateid *stp)
{
- return test_bit(access, &stp->st_deny_bmap);
+ unsigned char mask = 1 << deny;
+
+ return (bool)(stp->st_deny_bmap & mask);
}
static int nfs4_access_to_omode(u32 access)
@@ -674,138 +872,283 @@ static int nfs4_access_to_omode(u32 access)
return O_RDONLY;
}
+/*
+ * A stateid that had a deny mode associated with it is being released
+ * or downgraded. Recalculate the deny mode on the file.
+ */
+static void
+recalculate_deny_mode(struct nfs4_file *fp)
+{
+ struct nfs4_ol_stateid *stp;
+
+ spin_lock(&fp->fi_lock);
+ fp->fi_share_deny = 0;
+ list_for_each_entry(stp, &fp->fi_stateids, st_perfile)
+ fp->fi_share_deny |= bmap_to_share_mode(stp->st_deny_bmap);
+ spin_unlock(&fp->fi_lock);
+}
+
+static void
+reset_union_bmap_deny(u32 deny, struct nfs4_ol_stateid *stp)
+{
+ int i;
+ bool change = false;
+
+ for (i = 1; i < 4; i++) {
+ if ((i & deny) != i) {
+ change = true;
+ clear_deny(i, stp);
+ }
+ }
+
+ /* Recalculate per-file deny mode if there was a change */
+ if (change)
+ recalculate_deny_mode(stp->st_stid.sc_file);
+}
+
/* release all access and file references for a given stateid */
static void
release_all_access(struct nfs4_ol_stateid *stp)
{
int i;
+ struct nfs4_file *fp = stp->st_stid.sc_file;
+
+ if (fp && stp->st_deny_bmap != 0)
+ recalculate_deny_mode(fp);
for (i = 1; i < 4; i++) {
if (test_access(i, stp))
- nfs4_file_put_access(stp->st_file,
- nfs4_access_to_omode(i));
+ nfs4_file_put_access(stp->st_stid.sc_file, i);
clear_access(i, stp);
}
}
-static void unhash_generic_stateid(struct nfs4_ol_stateid *stp)
+static void nfs4_put_stateowner(struct nfs4_stateowner *sop)
{
+ struct nfs4_client *clp = sop->so_client;
+
+ might_lock(&clp->cl_lock);
+
+ if (!atomic_dec_and_lock(&sop->so_count, &clp->cl_lock))
+ return;
+ sop->so_ops->so_unhash(sop);
+ spin_unlock(&clp->cl_lock);
+ kfree(sop->so_owner.data);
+ sop->so_ops->so_free(sop);
+}
+
+static void unhash_ol_stateid(struct nfs4_ol_stateid *stp)
+{
+ struct nfs4_file *fp = stp->st_stid.sc_file;
+
+ lockdep_assert_held(&stp->st_stateowner->so_client->cl_lock);
+
+ spin_lock(&fp->fi_lock);
list_del(&stp->st_perfile);
+ spin_unlock(&fp->fi_lock);
list_del(&stp->st_perstateowner);
}
-static void close_generic_stateid(struct nfs4_ol_stateid *stp)
+static void nfs4_free_ol_stateid(struct nfs4_stid *stid)
{
+ struct nfs4_ol_stateid *stp = openlockstateid(stid);
+
release_all_access(stp);
- put_nfs4_file(stp->st_file);
- stp->st_file = NULL;
+ if (stp->st_stateowner)
+ nfs4_put_stateowner(stp->st_stateowner);
+ kmem_cache_free(stateid_slab, stid);
}
-static void free_generic_stateid(struct nfs4_ol_stateid *stp)
+static void nfs4_free_lock_stateid(struct nfs4_stid *stid)
{
- remove_stid(&stp->st_stid);
- nfs4_free_stid(stateid_slab, &stp->st_stid);
+ struct nfs4_ol_stateid *stp = openlockstateid(stid);
+ struct nfs4_lockowner *lo = lockowner(stp->st_stateowner);
+ struct file *file;
+
+ file = find_any_file(stp->st_stid.sc_file);
+ if (file)
+ filp_close(file, (fl_owner_t)lo);
+ nfs4_free_ol_stateid(stid);
}
-static void release_lock_stateid(struct nfs4_ol_stateid *stp)
+/*
+ * Put the persistent reference to an already unhashed generic stateid, while
+ * holding the cl_lock. If it's the last reference, then put it onto the
+ * reaplist for later destruction.
+ */
+static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp,
+ struct list_head *reaplist)
{
- struct file *file;
+ struct nfs4_stid *s = &stp->st_stid;
+ struct nfs4_client *clp = s->sc_client;
+
+ lockdep_assert_held(&clp->cl_lock);
+
+ WARN_ON_ONCE(!list_empty(&stp->st_locks));
+
+ if (!atomic_dec_and_test(&s->sc_count)) {
+ wake_up_all(&close_wq);
+ return;
+ }
+
+ idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id);
+ list_add(&stp->st_locks, reaplist);
+}
- unhash_generic_stateid(stp);
+static void unhash_lock_stateid(struct nfs4_ol_stateid *stp)
+{
+ struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner);
+
+ lockdep_assert_held(&oo->oo_owner.so_client->cl_lock);
+
+ list_del_init(&stp->st_locks);
+ unhash_ol_stateid(stp);
unhash_stid(&stp->st_stid);
- file = find_any_file(stp->st_file);
- if (file)
- locks_remove_posix(file, (fl_owner_t)lockowner(stp->st_stateowner));
- close_generic_stateid(stp);
- free_generic_stateid(stp);
}
-static void unhash_lockowner(struct nfs4_lockowner *lo)
+static void release_lock_stateid(struct nfs4_ol_stateid *stp)
{
- struct nfs4_ol_stateid *stp;
+ struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner);
- list_del(&lo->lo_owner.so_strhash);
- list_del(&lo->lo_perstateid);
- list_del(&lo->lo_owner_ino_hash);
- while (!list_empty(&lo->lo_owner.so_stateids)) {
- stp = list_first_entry(&lo->lo_owner.so_stateids,
- struct nfs4_ol_stateid, st_perstateowner);
- release_lock_stateid(stp);
- }
+ spin_lock(&oo->oo_owner.so_client->cl_lock);
+ unhash_lock_stateid(stp);
+ spin_unlock(&oo->oo_owner.so_client->cl_lock);
+ nfs4_put_stid(&stp->st_stid);
}
-static void nfs4_free_lockowner(struct nfs4_lockowner *lo)
+static void unhash_lockowner_locked(struct nfs4_lockowner *lo)
{
- kfree(lo->lo_owner.so_owner.data);
- kmem_cache_free(lockowner_slab, lo);
+ struct nfs4_client *clp = lo->lo_owner.so_client;
+
+ lockdep_assert_held(&clp->cl_lock);
+
+ list_del_init(&lo->lo_owner.so_strhash);
+}
+
+/*
+ * Free a list of generic stateids that were collected earlier after being
+ * fully unhashed.
+ */
+static void
+free_ol_stateid_reaplist(struct list_head *reaplist)
+{
+ struct nfs4_ol_stateid *stp;
+ struct nfs4_file *fp;
+
+ might_sleep();
+
+ while (!list_empty(reaplist)) {
+ stp = list_first_entry(reaplist, struct nfs4_ol_stateid,
+ st_locks);
+ list_del(&stp->st_locks);
+ fp = stp->st_stid.sc_file;
+ stp->st_stid.sc_free(&stp->st_stid);
+ if (fp)
+ put_nfs4_file(fp);
+ }
}
static void release_lockowner(struct nfs4_lockowner *lo)
{
- unhash_lockowner(lo);
- nfs4_free_lockowner(lo);
+ struct nfs4_client *clp = lo->lo_owner.so_client;
+ struct nfs4_ol_stateid *stp;
+ struct list_head reaplist;
+
+ INIT_LIST_HEAD(&reaplist);
+
+ spin_lock(&clp->cl_lock);
+ unhash_lockowner_locked(lo);
+ while (!list_empty(&lo->lo_owner.so_stateids)) {
+ stp = list_first_entry(&lo->lo_owner.so_stateids,
+ struct nfs4_ol_stateid, st_perstateowner);
+ unhash_lock_stateid(stp);
+ put_ol_stateid_locked(stp, &reaplist);
+ }
+ spin_unlock(&clp->cl_lock);
+ free_ol_stateid_reaplist(&reaplist);
+ nfs4_put_stateowner(&lo->lo_owner);
}
-static void
-release_stateid_lockowners(struct nfs4_ol_stateid *open_stp)
+static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp,
+ struct list_head *reaplist)
{
- struct nfs4_lockowner *lo;
+ struct nfs4_ol_stateid *stp;
- while (!list_empty(&open_stp->st_lockowners)) {
- lo = list_entry(open_stp->st_lockowners.next,
- struct nfs4_lockowner, lo_perstateid);
- release_lockowner(lo);
+ while (!list_empty(&open_stp->st_locks)) {
+ stp = list_entry(open_stp->st_locks.next,
+ struct nfs4_ol_stateid, st_locks);
+ unhash_lock_stateid(stp);
+ put_ol_stateid_locked(stp, reaplist);
}
}
-static void unhash_open_stateid(struct nfs4_ol_stateid *stp)
+static void unhash_open_stateid(struct nfs4_ol_stateid *stp,
+ struct list_head *reaplist)
{
- unhash_generic_stateid(stp);
- release_stateid_lockowners(stp);
- close_generic_stateid(stp);
+ lockdep_assert_held(&stp->st_stid.sc_client->cl_lock);
+
+ unhash_ol_stateid(stp);
+ release_open_stateid_locks(stp, reaplist);
}
static void release_open_stateid(struct nfs4_ol_stateid *stp)
{
- unhash_open_stateid(stp);
- free_generic_stateid(stp);
+ LIST_HEAD(reaplist);
+
+ spin_lock(&stp->st_stid.sc_client->cl_lock);
+ unhash_open_stateid(stp, &reaplist);
+ put_ol_stateid_locked(stp, &reaplist);
+ spin_unlock(&stp->st_stid.sc_client->cl_lock);
+ free_ol_stateid_reaplist(&reaplist);
}
-static void unhash_openowner(struct nfs4_openowner *oo)
+static void unhash_openowner_locked(struct nfs4_openowner *oo)
{
- struct nfs4_ol_stateid *stp;
+ struct nfs4_client *clp = oo->oo_owner.so_client;
- list_del(&oo->oo_owner.so_strhash);
- list_del(&oo->oo_perclient);
- while (!list_empty(&oo->oo_owner.so_stateids)) {
- stp = list_first_entry(&oo->oo_owner.so_stateids,
- struct nfs4_ol_stateid, st_perstateowner);
- release_open_stateid(stp);
- }
+ lockdep_assert_held(&clp->cl_lock);
+
+ list_del_init(&oo->oo_owner.so_strhash);
+ list_del_init(&oo->oo_perclient);
}
static void release_last_closed_stateid(struct nfs4_openowner *oo)
{
- struct nfs4_ol_stateid *s = oo->oo_last_closed_stid;
+ struct nfsd_net *nn = net_generic(oo->oo_owner.so_client->net,
+ nfsd_net_id);
+ struct nfs4_ol_stateid *s;
+ spin_lock(&nn->client_lock);
+ s = oo->oo_last_closed_stid;
if (s) {
- free_generic_stateid(s);
+ list_del_init(&oo->oo_close_lru);
oo->oo_last_closed_stid = NULL;
}
-}
-
-static void nfs4_free_openowner(struct nfs4_openowner *oo)
-{
- kfree(oo->oo_owner.so_owner.data);
- kmem_cache_free(openowner_slab, oo);
+ spin_unlock(&nn->client_lock);
+ if (s)
+ nfs4_put_stid(&s->st_stid);
}
static void release_openowner(struct nfs4_openowner *oo)
{
- unhash_openowner(oo);
- list_del(&oo->oo_close_lru);
+ struct nfs4_ol_stateid *stp;
+ struct nfs4_client *clp = oo->oo_owner.so_client;
+ struct list_head reaplist;
+
+ INIT_LIST_HEAD(&reaplist);
+
+ spin_lock(&clp->cl_lock);
+ unhash_openowner_locked(oo);
+ while (!list_empty(&oo->oo_owner.so_stateids)) {
+ stp = list_first_entry(&oo->oo_owner.so_stateids,
+ struct nfs4_ol_stateid, st_perstateowner);
+ unhash_open_stateid(stp, &reaplist);
+ put_ol_stateid_locked(stp, &reaplist);
+ }
+ spin_unlock(&clp->cl_lock);
+ free_ol_stateid_reaplist(&reaplist);
release_last_closed_stateid(oo);
- nfs4_free_openowner(oo);
+ nfs4_put_stateowner(&oo->oo_owner);
}
static inline int
@@ -842,7 +1185,7 @@ void nfsd4_bump_seqid(struct nfsd4_compound_state *cstate, __be32 nfserr)
return;
if (!seqid_mutating_err(ntohl(nfserr))) {
- cstate->replay_owner = NULL;
+ nfsd4_cstate_clear_replay(cstate);
return;
}
if (!so)
@@ -1030,10 +1373,8 @@ static void nfsd4_init_conn(struct svc_rqst *rqstp, struct nfsd4_conn *conn, str
if (ret)
/* oops; xprt is already down: */
nfsd4_conn_lost(&conn->cn_xpt_user);
- if (conn->cn_flags & NFS4_CDFC4_BACK) {
- /* callback channel may be back up */
- nfsd4_probe_callback(ses->se_client);
- }
+ /* We may have gained or lost a callback channel: */
+ nfsd4_probe_callback_sync(ses->se_client);
}
static struct nfsd4_conn *alloc_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_create_session *cses)
@@ -1073,9 +1414,6 @@ static void __free_session(struct nfsd4_session *ses)
static void free_session(struct nfsd4_session *ses)
{
- struct nfsd_net *nn = net_generic(ses->se_client->net, nfsd_net_id);
-
- lockdep_assert_held(&nn->client_lock);
nfsd4_del_conns(ses);
nfsd4_put_drc_mem(&ses->se_fchannel);
__free_session(ses);
@@ -1097,12 +1435,10 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
new->se_cb_sec = cses->cb_sec;
atomic_set(&new->se_ref, 0);
idx = hash_sessionid(&new->se_sessionid);
- spin_lock(&nn->client_lock);
list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
spin_lock(&clp->cl_lock);
list_add(&new->se_perclnt, &clp->cl_sessions);
spin_unlock(&clp->cl_lock);
- spin_unlock(&nn->client_lock);
if (cses->flags & SESSION4_BACK_CHAN) {
struct sockaddr *sa = svc_addr(rqstp);
@@ -1120,12 +1456,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
/* caller must hold client_lock */
static struct nfsd4_session *
-find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net)
+__find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net)
{
struct nfsd4_session *elem;
int idx;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ lockdep_assert_held(&nn->client_lock);
+
dump_sessionid(__func__, sessionid);
idx = hash_sessionid(sessionid);
/* Search in the appropriate list */
@@ -1140,10 +1478,33 @@ find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net)
return NULL;
}
+static struct nfsd4_session *
+find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net,
+ __be32 *ret)
+{
+ struct nfsd4_session *session;
+ __be32 status = nfserr_badsession;
+
+ session = __find_in_sessionid_hashtbl(sessionid, net);
+ if (!session)
+ goto out;
+ status = nfsd4_get_session_locked(session);
+ if (status)
+ session = NULL;
+out:
+ *ret = status;
+ return session;
+}
+
/* caller must hold client_lock */
static void
unhash_session(struct nfsd4_session *ses)
{
+ struct nfs4_client *clp = ses->se_client;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+ lockdep_assert_held(&nn->client_lock);
+
list_del(&ses->se_hash);
spin_lock(&ses->se_client->cl_lock);
list_del(&ses->se_perclnt);
@@ -1169,15 +1530,20 @@ STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn)
static struct nfs4_client *alloc_client(struct xdr_netobj name)
{
struct nfs4_client *clp;
+ int i;
clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL);
if (clp == NULL)
return NULL;
clp->cl_name.data = kmemdup(name.data, name.len, GFP_KERNEL);
- if (clp->cl_name.data == NULL) {
- kfree(clp);
- return NULL;
- }
+ if (clp->cl_name.data == NULL)
+ goto err_no_name;
+ clp->cl_ownerstr_hashtbl = kmalloc(sizeof(struct list_head) *
+ OWNER_HASH_SIZE, GFP_KERNEL);
+ if (!clp->cl_ownerstr_hashtbl)
+ goto err_no_hashtbl;
+ for (i = 0; i < OWNER_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&clp->cl_ownerstr_hashtbl[i]);
clp->cl_name.len = name.len;
INIT_LIST_HEAD(&clp->cl_sessions);
idr_init(&clp->cl_stateids);
@@ -1192,14 +1558,16 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
spin_lock_init(&clp->cl_lock);
rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
return clp;
+err_no_hashtbl:
+ kfree(clp->cl_name.data);
+err_no_name:
+ kfree(clp);
+ return NULL;
}
static void
free_client(struct nfs4_client *clp)
{
- struct nfsd_net __maybe_unused *nn = net_generic(clp->net, nfsd_net_id);
-
- lockdep_assert_held(&nn->client_lock);
while (!list_empty(&clp->cl_sessions)) {
struct nfsd4_session *ses;
ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
@@ -1210,18 +1578,32 @@ free_client(struct nfs4_client *clp)
}
rpc_destroy_wait_queue(&clp->cl_cb_waitq);
free_svc_cred(&clp->cl_cred);
+ kfree(clp->cl_ownerstr_hashtbl);
kfree(clp->cl_name.data);
idr_destroy(&clp->cl_stateids);
kfree(clp);
}
/* must be called under the client_lock */
-static inline void
+static void
unhash_client_locked(struct nfs4_client *clp)
{
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
struct nfsd4_session *ses;
- list_del(&clp->cl_lru);
+ lockdep_assert_held(&nn->client_lock);
+
+ /* Mark the client as expired! */
+ clp->cl_time = 0;
+ /* Make it invisible */
+ if (!list_empty(&clp->cl_idhash)) {
+ list_del_init(&clp->cl_idhash);
+ if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags))
+ rb_erase(&clp->cl_namenode, &nn->conf_name_tree);
+ else
+ rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
+ }
+ list_del_init(&clp->cl_lru);
spin_lock(&clp->cl_lock);
list_for_each_entry(ses, &clp->cl_sessions, se_perclnt)
list_del_init(&ses->se_hash);
@@ -1229,53 +1611,72 @@ unhash_client_locked(struct nfs4_client *clp)
}
static void
-destroy_client(struct nfs4_client *clp)
+unhash_client(struct nfs4_client *clp)
+{
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+ spin_lock(&nn->client_lock);
+ unhash_client_locked(clp);
+ spin_unlock(&nn->client_lock);
+}
+
+static __be32 mark_client_expired_locked(struct nfs4_client *clp)
+{
+ if (atomic_read(&clp->cl_refcount))
+ return nfserr_jukebox;
+ unhash_client_locked(clp);
+ return nfs_ok;
+}
+
+static void
+__destroy_client(struct nfs4_client *clp)
{
struct nfs4_openowner *oo;
struct nfs4_delegation *dp;
struct list_head reaplist;
- struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
INIT_LIST_HEAD(&reaplist);
spin_lock(&state_lock);
while (!list_empty(&clp->cl_delegations)) {
dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt);
- list_del_init(&dp->dl_perclnt);
- list_move(&dp->dl_recall_lru, &reaplist);
+ unhash_delegation_locked(dp);
+ list_add(&dp->dl_recall_lru, &reaplist);
}
spin_unlock(&state_lock);
while (!list_empty(&reaplist)) {
dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
- destroy_delegation(dp);
+ list_del_init(&dp->dl_recall_lru);
+ nfs4_put_deleg_lease(dp->dl_stid.sc_file);
+ nfs4_put_stid(&dp->dl_stid);
}
- list_splice_init(&clp->cl_revoked, &reaplist);
- while (!list_empty(&reaplist)) {
+ while (!list_empty(&clp->cl_revoked)) {
dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
- destroy_revoked_delegation(dp);
+ list_del_init(&dp->dl_recall_lru);
+ nfs4_put_stid(&dp->dl_stid);
}
while (!list_empty(&clp->cl_openowners)) {
oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient);
+ nfs4_get_stateowner(&oo->oo_owner);
release_openowner(oo);
}
nfsd4_shutdown_callback(clp);
if (clp->cl_cb_conn.cb_xprt)
svc_xprt_put(clp->cl_cb_conn.cb_xprt);
- list_del(&clp->cl_idhash);
- if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags))
- rb_erase(&clp->cl_namenode, &nn->conf_name_tree);
- else
- rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
- spin_lock(&nn->client_lock);
- unhash_client_locked(clp);
- WARN_ON_ONCE(atomic_read(&clp->cl_refcount));
free_client(clp);
- spin_unlock(&nn->client_lock);
+}
+
+static void
+destroy_client(struct nfs4_client *clp)
+{
+ unhash_client(clp);
+ __destroy_client(clp);
}
static void expire_client(struct nfs4_client *clp)
{
+ unhash_client(clp);
nfsd4_client_record_remove(clp);
- destroy_client(clp);
+ __destroy_client(clp);
}
static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
@@ -1408,25 +1809,28 @@ static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp)
return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal);
}
-static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn)
+static void gen_confirm(struct nfs4_client *clp, struct nfsd_net *nn)
{
- static u32 current_clientid = 1;
+ __be32 verf[2];
- clp->cl_clientid.cl_boot = nn->boot_time;
- clp->cl_clientid.cl_id = current_clientid++;
+ /*
+ * This is opaque to client, so no need to byte-swap. Use
+ * __force to keep sparse happy
+ */
+ verf[0] = (__force __be32)get_seconds();
+ verf[1] = (__force __be32)nn->clientid_counter;
+ memcpy(clp->cl_confirm.data, verf, sizeof(clp->cl_confirm.data));
}
-static void gen_confirm(struct nfs4_client *clp)
+static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn)
{
- __be32 verf[2];
- static u32 i;
-
- verf[0] = (__be32)get_seconds();
- verf[1] = (__be32)i++;
- memcpy(clp->cl_confirm.data, verf, sizeof(clp->cl_confirm.data));
+ clp->cl_clientid.cl_boot = nn->boot_time;
+ clp->cl_clientid.cl_id = nn->clientid_counter++;
+ gen_confirm(clp, nn);
}
-static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t)
+static struct nfs4_stid *
+find_stateid_locked(struct nfs4_client *cl, stateid_t *t)
{
struct nfs4_stid *ret;
@@ -1436,16 +1840,21 @@ static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t)
return ret;
}
-static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask)
+static struct nfs4_stid *
+find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask)
{
struct nfs4_stid *s;
- s = find_stateid(cl, t);
- if (!s)
- return NULL;
- if (typemask & s->sc_type)
- return s;
- return NULL;
+ spin_lock(&cl->cl_lock);
+ s = find_stateid_locked(cl, t);
+ if (s != NULL) {
+ if (typemask & s->sc_type)
+ atomic_inc(&s->sc_count);
+ else
+ s = NULL;
+ }
+ spin_unlock(&cl->cl_lock);
+ return s;
}
static struct nfs4_client *create_client(struct xdr_netobj name,
@@ -1455,7 +1864,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
struct sockaddr *sa = svc_addr(rqstp);
int ret;
struct net *net = SVC_NET(rqstp);
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
clp = alloc_client(name);
if (clp == NULL)
@@ -1463,17 +1871,14 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred);
if (ret) {
- spin_lock(&nn->client_lock);
free_client(clp);
- spin_unlock(&nn->client_lock);
return NULL;
}
- nfsd4_init_callback(&clp->cl_cb_null);
+ nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
clp->cl_time = get_seconds();
clear_bit(0, &clp->cl_cb_slot_busy);
copy_verf(clp, verf);
rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);
- gen_confirm(clp);
clp->cl_cb_session = NULL;
clp->net = net;
return clp;
@@ -1525,11 +1930,13 @@ add_to_unconfirmed(struct nfs4_client *clp)
unsigned int idhashval;
struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+ lockdep_assert_held(&nn->client_lock);
+
clear_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags);
add_clp_to_name_tree(clp, &nn->unconf_name_tree);
idhashval = clientid_hashval(clp->cl_clientid.cl_id);
list_add(&clp->cl_idhash, &nn->unconf_id_hashtbl[idhashval]);
- renew_client(clp);
+ renew_client_locked(clp);
}
static void
@@ -1538,12 +1945,14 @@ move_to_confirmed(struct nfs4_client *clp)
unsigned int idhashval = clientid_hashval(clp->cl_clientid.cl_id);
struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+ lockdep_assert_held(&nn->client_lock);
+
dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp);
list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]);
rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
add_clp_to_name_tree(clp, &nn->conf_name_tree);
set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags);
- renew_client(clp);
+ renew_client_locked(clp);
}
static struct nfs4_client *
@@ -1556,7 +1965,7 @@ find_client_in_id_table(struct list_head *tbl, clientid_t *clid, bool sessions)
if (same_clid(&clp->cl_clientid, clid)) {
if ((bool)clp->cl_minorversion != sessions)
return NULL;
- renew_client(clp);
+ renew_client_locked(clp);
return clp;
}
}
@@ -1568,6 +1977,7 @@ find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
{
struct list_head *tbl = nn->conf_id_hashtbl;
+ lockdep_assert_held(&nn->client_lock);
return find_client_in_id_table(tbl, clid, sessions);
}
@@ -1576,6 +1986,7 @@ find_unconfirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
{
struct list_head *tbl = nn->unconf_id_hashtbl;
+ lockdep_assert_held(&nn->client_lock);
return find_client_in_id_table(tbl, clid, sessions);
}
@@ -1587,12 +1998,14 @@ static bool clp_used_exchangeid(struct nfs4_client *clp)
static struct nfs4_client *
find_confirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)
{
+ lockdep_assert_held(&nn->client_lock);
return find_clp_in_name_tree(name, &nn->conf_name_tree);
}
static struct nfs4_client *
find_unconfirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)
{
+ lockdep_assert_held(&nn->client_lock);
return find_clp_in_name_tree(name, &nn->unconf_name_tree);
}
@@ -1642,7 +2055,7 @@ out_err:
/*
* Cache a reply. nfsd4_check_resp_size() has bounded the cache size.
*/
-void
+static void
nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
{
struct xdr_buf *buf = resp->xdr.buf;
@@ -1758,7 +2171,8 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate,
struct nfsd4_exchange_id *exid)
{
- struct nfs4_client *unconf, *conf, *new;
+ struct nfs4_client *conf, *new;
+ struct nfs4_client *unconf = NULL;
__be32 status;
char addr_str[INET6_ADDRSTRLEN];
nfs4_verifier verf = exid->verifier;
@@ -1787,8 +2201,12 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
return nfserr_encr_alg_unsupp;
}
+ new = create_client(exid->clname, rqstp, &verf);
+ if (new == NULL)
+ return nfserr_jukebox;
+
/* Cases below refer to rfc 5661 section 18.35.4: */
- nfs4_lock_state();
+ spin_lock(&nn->client_lock);
conf = find_confirmed_client_by_name(&exid->clname, nn);
if (conf) {
bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred);
@@ -1813,7 +2231,6 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
}
/* case 6 */
exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;
- new = conf;
goto out_copy;
}
if (!creds_match) { /* case 3 */
@@ -1821,15 +2238,14 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
status = nfserr_clid_inuse;
goto out;
}
- expire_client(conf);
goto out_new;
}
if (verfs_match) { /* case 2 */
conf->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R;
- new = conf;
goto out_copy;
}
/* case 5, client reboot */
+ conf = NULL;
goto out_new;
}
@@ -1840,33 +2256,38 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
unconf = find_unconfirmed_client_by_name(&exid->clname, nn);
if (unconf) /* case 4, possible retry or client restart */
- expire_client(unconf);
+ unhash_client_locked(unconf);
/* case 1 (normal case) */
out_new:
- new = create_client(exid->clname, rqstp, &verf);
- if (new == NULL) {
- status = nfserr_jukebox;
- goto out;
+ if (conf) {
+ status = mark_client_expired_locked(conf);
+ if (status)
+ goto out;
}
new->cl_minorversion = cstate->minorversion;
new->cl_mach_cred = (exid->spa_how == SP4_MACH_CRED);
gen_clid(new, nn);
add_to_unconfirmed(new);
+ swap(new, conf);
out_copy:
- exid->clientid.cl_boot = new->cl_clientid.cl_boot;
- exid->clientid.cl_id = new->cl_clientid.cl_id;
+ exid->clientid.cl_boot = conf->cl_clientid.cl_boot;
+ exid->clientid.cl_id = conf->cl_clientid.cl_id;
- exid->seqid = new->cl_cs_slot.sl_seqid + 1;
- nfsd4_set_ex_flags(new, exid);
+ exid->seqid = conf->cl_cs_slot.sl_seqid + 1;
+ nfsd4_set_ex_flags(conf, exid);
dprintk("nfsd4_exchange_id seqid %d flags %x\n",
- new->cl_cs_slot.sl_seqid, new->cl_exchange_flags);
+ conf->cl_cs_slot.sl_seqid, conf->cl_exchange_flags);
status = nfs_ok;
out:
- nfs4_unlock_state();
+ spin_unlock(&nn->client_lock);
+ if (new)
+ expire_client(new);
+ if (unconf)
+ expire_client(unconf);
return status;
}
@@ -2010,6 +2431,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
{
struct sockaddr *sa = svc_addr(rqstp);
struct nfs4_client *conf, *unconf;
+ struct nfs4_client *old = NULL;
struct nfsd4_session *new;
struct nfsd4_conn *conn;
struct nfsd4_clid_slot *cs_slot = NULL;
@@ -2035,7 +2457,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
if (!conn)
goto out_free_session;
- nfs4_lock_state();
+ spin_lock(&nn->client_lock);
unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn);
conf = find_confirmed_client(&cr_ses->clientid, true, nn);
WARN_ON_ONCE(conf && unconf);
@@ -2054,7 +2476,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
goto out_free_conn;
}
} else if (unconf) {
- struct nfs4_client *old;
if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
!rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
status = nfserr_clid_inuse;
@@ -2072,10 +2493,11 @@ nfsd4_create_session(struct svc_rqst *rqstp,
}
old = find_confirmed_client_by_name(&unconf->cl_name, nn);
if (old) {
- status = mark_client_expired(old);
- if (status)
+ status = mark_client_expired_locked(old);
+ if (status) {
+ old = NULL;
goto out_free_conn;
- expire_client(old);
+ }
}
move_to_confirmed(unconf);
conf = unconf;
@@ -2091,20 +2513,27 @@ nfsd4_create_session(struct svc_rqst *rqstp,
cr_ses->flags &= ~SESSION4_RDMA;
init_session(rqstp, new, conf, cr_ses);
- nfsd4_init_conn(rqstp, conn, new);
+ nfsd4_get_session_locked(new);
memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
NFS4_MAX_SESSIONID_LEN);
cs_slot->sl_seqid++;
cr_ses->seqid = cs_slot->sl_seqid;
- /* cache solo and embedded create sessions under the state lock */
+ /* cache solo and embedded create sessions under the client_lock */
nfsd4_cache_create_session(cr_ses, cs_slot, status);
- nfs4_unlock_state();
+ spin_unlock(&nn->client_lock);
+ /* init connection and backchannel */
+ nfsd4_init_conn(rqstp, conn, new);
+ nfsd4_put_session(new);
+ if (old)
+ expire_client(old);
return status;
out_free_conn:
- nfs4_unlock_state();
+ spin_unlock(&nn->client_lock);
free_conn(conn);
+ if (old)
+ expire_client(old);
out_free_session:
__free_session(new);
out_release_drc_mem:
@@ -2152,17 +2581,16 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
__be32 status;
struct nfsd4_conn *conn;
struct nfsd4_session *session;
- struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ struct net *net = SVC_NET(rqstp);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
if (!nfsd4_last_compound_op(rqstp))
return nfserr_not_only_op;
- nfs4_lock_state();
spin_lock(&nn->client_lock);
- session = find_in_sessionid_hashtbl(&bcts->sessionid, SVC_NET(rqstp));
+ session = find_in_sessionid_hashtbl(&bcts->sessionid, net, &status);
spin_unlock(&nn->client_lock);
- status = nfserr_badsession;
if (!session)
- goto out;
+ goto out_no_session;
status = nfserr_wrong_cred;
if (!mach_creds_match(session->se_client, rqstp))
goto out;
@@ -2176,7 +2604,8 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
nfsd4_init_conn(rqstp, conn, session);
status = nfs_ok;
out:
- nfs4_unlock_state();
+ nfsd4_put_session(session);
+out_no_session:
return status;
}
@@ -2195,9 +2624,9 @@ nfsd4_destroy_session(struct svc_rqst *r,
struct nfsd4_session *ses;
__be32 status;
int ref_held_by_me = 0;
- struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id);
+ struct net *net = SVC_NET(r);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- nfs4_lock_state();
status = nfserr_not_only_op;
if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) {
if (!nfsd4_last_compound_op(r))
@@ -2206,14 +2635,12 @@ nfsd4_destroy_session(struct svc_rqst *r,
}
dump_sessionid(__func__, &sessionid->sessionid);
spin_lock(&nn->client_lock);
- ses = find_in_sessionid_hashtbl(&sessionid->sessionid, SVC_NET(r));
- status = nfserr_badsession;
+ ses = find_in_sessionid_hashtbl(&sessionid->sessionid, net, &status);
if (!ses)
goto out_client_lock;
status = nfserr_wrong_cred;
if (!mach_creds_match(ses->se_client, r))
- goto out_client_lock;
- nfsd4_get_session_locked(ses);
+ goto out_put_session;
status = mark_session_dead_locked(ses, 1 + ref_held_by_me);
if (status)
goto out_put_session;
@@ -2225,11 +2652,10 @@ nfsd4_destroy_session(struct svc_rqst *r,
spin_lock(&nn->client_lock);
status = nfs_ok;
out_put_session:
- nfsd4_put_session(ses);
+ nfsd4_put_session_locked(ses);
out_client_lock:
spin_unlock(&nn->client_lock);
out:
- nfs4_unlock_state();
return status;
}
@@ -2300,7 +2726,8 @@ nfsd4_sequence(struct svc_rqst *rqstp,
struct nfsd4_conn *conn;
__be32 status;
int buflen;
- struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ struct net *net = SVC_NET(rqstp);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
if (resp->opcnt != 1)
return nfserr_sequence_pos;
@@ -2314,17 +2741,10 @@ nfsd4_sequence(struct svc_rqst *rqstp,
return nfserr_jukebox;
spin_lock(&nn->client_lock);
- status = nfserr_badsession;
- session = find_in_sessionid_hashtbl(&seq->sessionid, SVC_NET(rqstp));
+ session = find_in_sessionid_hashtbl(&seq->sessionid, net, &status);
if (!session)
goto out_no_session;
clp = session->se_client;
- status = get_client_locked(clp);
- if (status)
- goto out_no_session;
- status = nfsd4_get_session_locked(session);
- if (status)
- goto out_put_client;
status = nfserr_too_many_ops;
if (nfsd4_session_too_many_ops(rqstp, session))
@@ -2354,6 +2774,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
goto out_put_session;
cstate->slot = slot;
cstate->session = session;
+ cstate->clp = clp;
/* Return the cached reply status and set cstate->status
* for nfsd4_proc_compound processing */
status = nfsd4_replay_cache_entry(resp, seq);
@@ -2388,6 +2809,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
cstate->slot = slot;
cstate->session = session;
+ cstate->clp = clp;
out:
switch (clp->cl_cb_state) {
@@ -2408,31 +2830,48 @@ out_no_session:
spin_unlock(&nn->client_lock);
return status;
out_put_session:
- nfsd4_put_session(session);
-out_put_client:
- put_client_renew_locked(clp);
+ nfsd4_put_session_locked(session);
goto out_no_session;
}
+void
+nfsd4_sequence_done(struct nfsd4_compoundres *resp)
+{
+ struct nfsd4_compound_state *cs = &resp->cstate;
+
+ if (nfsd4_has_session(cs)) {
+ if (cs->status != nfserr_replay_cache) {
+ nfsd4_store_cache_entry(resp);
+ cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE;
+ }
+ /* Drop session reference that was taken in nfsd4_sequence() */
+ nfsd4_put_session(cs->session);
+ } else if (cs->clp)
+ put_client_renew(cs->clp);
+}
+
__be32
nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_clientid *dc)
{
- struct nfs4_client *conf, *unconf, *clp;
+ struct nfs4_client *conf, *unconf;
+ struct nfs4_client *clp = NULL;
__be32 status = 0;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
- nfs4_lock_state();
+ spin_lock(&nn->client_lock);
unconf = find_unconfirmed_client(&dc->clientid, true, nn);
conf = find_confirmed_client(&dc->clientid, true, nn);
WARN_ON_ONCE(conf && unconf);
if (conf) {
- clp = conf;
-
if (client_has_state(conf)) {
status = nfserr_clientid_busy;
goto out;
}
+ status = mark_client_expired_locked(conf);
+ if (status)
+ goto out;
+ clp = conf;
} else if (unconf)
clp = unconf;
else {
@@ -2440,12 +2879,15 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
goto out;
}
if (!mach_creds_match(clp, rqstp)) {
+ clp = NULL;
status = nfserr_wrong_cred;
goto out;
}
- expire_client(clp);
+ unhash_client_locked(clp);
out:
- nfs4_unlock_state();
+ spin_unlock(&nn->client_lock);
+ if (clp)
+ expire_client(clp);
return status;
}
@@ -2464,7 +2906,6 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
return nfs_ok;
}
- nfs4_lock_state();
status = nfserr_complete_already;
if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE,
&cstate->session->se_client->cl_flags))
@@ -2484,7 +2925,6 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
status = nfs_ok;
nfsd4_client_record_create(cstate->session->se_client);
out:
- nfs4_unlock_state();
return status;
}
@@ -2494,12 +2934,16 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
struct xdr_netobj clname = setclid->se_name;
nfs4_verifier clverifier = setclid->se_verf;
- struct nfs4_client *conf, *unconf, *new;
+ struct nfs4_client *conf, *new;
+ struct nfs4_client *unconf = NULL;
__be32 status;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ new = create_client(clname, rqstp, &clverifier);
+ if (new == NULL)
+ return nfserr_jukebox;
/* Cases below refer to rfc 3530 section 14.2.33: */
- nfs4_lock_state();
+ spin_lock(&nn->client_lock);
conf = find_confirmed_client_by_name(&clname, nn);
if (conf) {
/* case 0: */
@@ -2517,11 +2961,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
}
unconf = find_unconfirmed_client_by_name(&clname, nn);
if (unconf)
- expire_client(unconf);
- status = nfserr_jukebox;
- new = create_client(clname, rqstp, &clverifier);
- if (new == NULL)
- goto out;
+ unhash_client_locked(unconf);
if (conf && same_verf(&conf->cl_verifier, &clverifier))
/* case 1: probable callback update */
copy_clid(new, conf);
@@ -2533,9 +2973,14 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data));
+ new = NULL;
status = nfs_ok;
out:
- nfs4_unlock_state();
+ spin_unlock(&nn->client_lock);
+ if (new)
+ free_client(new);
+ if (unconf)
+ expire_client(unconf);
return status;
}
@@ -2546,6 +2991,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
struct nfsd4_setclientid_confirm *setclientid_confirm)
{
struct nfs4_client *conf, *unconf;
+ struct nfs4_client *old = NULL;
nfs4_verifier confirm = setclientid_confirm->sc_confirm;
clientid_t * clid = &setclientid_confirm->sc_clientid;
__be32 status;
@@ -2553,8 +2999,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
if (STALE_CLIENTID(clid, nn))
return nfserr_stale_clientid;
- nfs4_lock_state();
+ spin_lock(&nn->client_lock);
conf = find_confirmed_client(clid, false, nn);
unconf = find_unconfirmed_client(clid, false, nn);
/*
@@ -2578,22 +3024,30 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
}
status = nfs_ok;
if (conf) { /* case 1: callback update */
+ old = unconf;
+ unhash_client_locked(old);
nfsd4_change_callback(conf, &unconf->cl_cb_conn);
- nfsd4_probe_callback(conf);
- expire_client(unconf);
} else { /* case 3: normal case; new or rebooted client */
- conf = find_confirmed_client_by_name(&unconf->cl_name, nn);
- if (conf) {
- status = mark_client_expired(conf);
- if (status)
+ old = find_confirmed_client_by_name(&unconf->cl_name, nn);
+ if (old) {
+ status = mark_client_expired_locked(old);
+ if (status) {
+ old = NULL;
goto out;
- expire_client(conf);
+ }
}
move_to_confirmed(unconf);
- nfsd4_probe_callback(unconf);
+ conf = unconf;
}
+ get_client_locked(conf);
+ spin_unlock(&nn->client_lock);
+ nfsd4_probe_callback(conf);
+ spin_lock(&nn->client_lock);
+ put_client_renew_locked(conf);
out:
- nfs4_unlock_state();
+ spin_unlock(&nn->client_lock);
+ if (old)
+ expire_client(old);
return status;
}
@@ -2603,21 +3057,23 @@ static struct nfs4_file *nfsd4_alloc_file(void)
}
/* OPEN Share state helper functions */
-static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino)
+static void nfsd4_init_file(struct nfs4_file *fp, struct knfsd_fh *fh)
{
- unsigned int hashval = file_hashval(ino);
+ unsigned int hashval = file_hashval(fh);
+
+ lockdep_assert_held(&state_lock);
atomic_set(&fp->fi_ref, 1);
+ spin_lock_init(&fp->fi_lock);
INIT_LIST_HEAD(&fp->fi_stateids);
INIT_LIST_HEAD(&fp->fi_delegations);
- fp->fi_inode = igrab(ino);
+ fh_copy_shallow(&fp->fi_fhandle, fh);
+ fp->fi_deleg_file = NULL;
fp->fi_had_conflict = false;
- fp->fi_lease = NULL;
+ fp->fi_share_deny = 0;
memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
memset(fp->fi_access, 0, sizeof(fp->fi_access));
- spin_lock(&state_lock);
hlist_add_head(&fp->fi_hash, &file_hashtbl[hashval]);
- spin_unlock(&state_lock);
}
void
@@ -2673,6 +3129,27 @@ static void init_nfs4_replay(struct nfs4_replay *rp)
rp->rp_status = nfserr_serverfault;
rp->rp_buflen = 0;
rp->rp_buf = rp->rp_ibuf;
+ mutex_init(&rp->rp_mutex);
+}
+
+static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
+ struct nfs4_stateowner *so)
+{
+ if (!nfsd4_has_session(cstate)) {
+ mutex_lock(&so->so_replay.rp_mutex);
+ cstate->replay_owner = nfs4_get_stateowner(so);
+ }
+}
+
+void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
+{
+ struct nfs4_stateowner *so = cstate->replay_owner;
+
+ if (so != NULL) {
+ cstate->replay_owner = NULL;
+ mutex_unlock(&so->so_replay.rp_mutex);
+ nfs4_put_stateowner(so);
+ }
}
static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj *owner, struct nfs4_client *clp)
@@ -2693,111 +3170,171 @@ static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj
INIT_LIST_HEAD(&sop->so_stateids);
sop->so_client = clp;
init_nfs4_replay(&sop->so_replay);
+ atomic_set(&sop->so_count, 1);
return sop;
}
static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval)
{
- struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+ lockdep_assert_held(&clp->cl_lock);
- list_add(&oo->oo_owner.so_strhash, &nn->ownerstr_hashtbl[strhashval]);
+ list_add(&oo->oo_owner.so_strhash,
+ &clp->cl_ownerstr_hashtbl[strhashval]);
list_add(&oo->oo_perclient, &clp->cl_openowners);
}
+static void nfs4_unhash_openowner(struct nfs4_stateowner *so)
+{
+ unhash_openowner_locked(openowner(so));
+}
+
+static void nfs4_free_openowner(struct nfs4_stateowner *so)
+{
+ struct nfs4_openowner *oo = openowner(so);
+
+ kmem_cache_free(openowner_slab, oo);
+}
+
+static const struct nfs4_stateowner_operations openowner_ops = {
+ .so_unhash = nfs4_unhash_openowner,
+ .so_free = nfs4_free_openowner,
+};
+
static struct nfs4_openowner *
-alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) {
- struct nfs4_openowner *oo;
+alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
+ struct nfsd4_compound_state *cstate)
+{
+ struct nfs4_client *clp = cstate->clp;
+ struct nfs4_openowner *oo, *ret;
oo = alloc_stateowner(openowner_slab, &open->op_owner, clp);
if (!oo)
return NULL;
+ oo->oo_owner.so_ops = &openowner_ops;
oo->oo_owner.so_is_open_owner = 1;
oo->oo_owner.so_seqid = open->op_seqid;
- oo->oo_flags = NFS4_OO_NEW;
+ oo->oo_flags = 0;
+ if (nfsd4_has_session(cstate))
+ oo->oo_flags |= NFS4_OO_CONFIRMED;
oo->oo_time = 0;
oo->oo_last_closed_stid = NULL;
INIT_LIST_HEAD(&oo->oo_close_lru);
- hash_openowner(oo, clp, strhashval);
+ spin_lock(&clp->cl_lock);
+ ret = find_openstateowner_str_locked(strhashval, open, clp);
+ if (ret == NULL) {
+ hash_openowner(oo, clp, strhashval);
+ ret = oo;
+ } else
+ nfs4_free_openowner(&oo->oo_owner);
+ spin_unlock(&clp->cl_lock);
return oo;
}
static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
struct nfs4_openowner *oo = open->op_openowner;
+ atomic_inc(&stp->st_stid.sc_count);
stp->st_stid.sc_type = NFS4_OPEN_STID;
- INIT_LIST_HEAD(&stp->st_lockowners);
- list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
- list_add(&stp->st_perfile, &fp->fi_stateids);
- stp->st_stateowner = &oo->oo_owner;
+ INIT_LIST_HEAD(&stp->st_locks);
+ stp->st_stateowner = nfs4_get_stateowner(&oo->oo_owner);
get_nfs4_file(fp);
- stp->st_file = fp;
+ stp->st_stid.sc_file = fp;
stp->st_access_bmap = 0;
stp->st_deny_bmap = 0;
- set_access(open->op_share_access, stp);
- set_deny(open->op_share_deny, stp);
stp->st_openstp = NULL;
+ spin_lock(&oo->oo_owner.so_client->cl_lock);
+ list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
+ spin_lock(&fp->fi_lock);
+ list_add(&stp->st_perfile, &fp->fi_stateids);
+ spin_unlock(&fp->fi_lock);
+ spin_unlock(&oo->oo_owner.so_client->cl_lock);
}
+/*
+ * In the 4.0 case we need to keep the owners around a little while to handle
+ * CLOSE replay. We still do need to release any file access that is held by
+ * them before returning however.
+ */
static void
-move_to_close_lru(struct nfs4_openowner *oo, struct net *net)
+move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
{
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct nfs4_ol_stateid *last;
+ struct nfs4_openowner *oo = openowner(s->st_stateowner);
+ struct nfsd_net *nn = net_generic(s->st_stid.sc_client->net,
+ nfsd_net_id);
dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo);
+ /*
+ * We know that we hold one reference via nfsd4_close, and another
+ * "persistent" reference for the client. If the refcount is higher
+ * than 2, then there are still calls in progress that are using this
+ * stateid. We can't put the sc_file reference until they are finished.
+ * Wait for the refcount to drop to 2. Since it has been unhashed,
+ * there should be no danger of the refcount going back up again at
+ * this point.
+ */
+ wait_event(close_wq, atomic_read(&s->st_stid.sc_count) == 2);
+
+ release_all_access(s);
+ if (s->st_stid.sc_file) {
+ put_nfs4_file(s->st_stid.sc_file);
+ s->st_stid.sc_file = NULL;
+ }
+
+ spin_lock(&nn->client_lock);
+ last = oo->oo_last_closed_stid;
+ oo->oo_last_closed_stid = s;
list_move_tail(&oo->oo_close_lru, &nn->close_lru);
oo->oo_time = get_seconds();
+ spin_unlock(&nn->client_lock);
+ if (last)
+ nfs4_put_stid(&last->st_stid);
}
-static int
-same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner,
- clientid_t *clid)
+/* search file_hashtbl[] for file */
+static struct nfs4_file *
+find_file_locked(struct knfsd_fh *fh)
{
- return (sop->so_owner.len == owner->len) &&
- 0 == memcmp(sop->so_owner.data, owner->data, owner->len) &&
- (sop->so_client->cl_clientid.cl_id == clid->cl_id);
-}
+ unsigned int hashval = file_hashval(fh);
+ struct nfs4_file *fp;
-static struct nfs4_openowner *
-find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open,
- bool sessions, struct nfsd_net *nn)
-{
- struct nfs4_stateowner *so;
- struct nfs4_openowner *oo;
- struct nfs4_client *clp;
+ lockdep_assert_held(&state_lock);
- list_for_each_entry(so, &nn->ownerstr_hashtbl[hashval], so_strhash) {
- if (!so->so_is_open_owner)
- continue;
- if (same_owner_str(so, &open->op_owner, &open->op_clientid)) {
- oo = openowner(so);
- clp = oo->oo_owner.so_client;
- if ((bool)clp->cl_minorversion != sessions)
- return NULL;
- renew_client(oo->oo_owner.so_client);
- return oo;
+ hlist_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) {
+ if (nfsd_fh_match(&fp->fi_fhandle, fh)) {
+ get_nfs4_file(fp);
+ return fp;
}
}
return NULL;
}
-/* search file_hashtbl[] for file */
static struct nfs4_file *
-find_file(struct inode *ino)
+find_file(struct knfsd_fh *fh)
{
- unsigned int hashval = file_hashval(ino);
struct nfs4_file *fp;
spin_lock(&state_lock);
- hlist_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) {
- if (fp->fi_inode == ino) {
- get_nfs4_file(fp);
- spin_unlock(&state_lock);
- return fp;
- }
+ fp = find_file_locked(fh);
+ spin_unlock(&state_lock);
+ return fp;
+}
+
+static struct nfs4_file *
+find_or_add_file(struct nfs4_file *new, struct knfsd_fh *fh)
+{
+ struct nfs4_file *fp;
+
+ spin_lock(&state_lock);
+ fp = find_file_locked(fh);
+ if (fp == NULL) {
+ nfsd4_init_file(new, fh);
+ fp = new;
}
spin_unlock(&state_lock);
- return NULL;
+
+ return fp;
}
/*
@@ -2807,63 +3344,109 @@ find_file(struct inode *ino)
static __be32
nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
{
- struct inode *ino = current_fh->fh_dentry->d_inode;
struct nfs4_file *fp;
- struct nfs4_ol_stateid *stp;
- __be32 ret;
+ __be32 ret = nfs_ok;
- fp = find_file(ino);
+ fp = find_file(&current_fh->fh_handle);
if (!fp)
- return nfs_ok;
- ret = nfserr_locked;
- /* Search for conflicting share reservations */
- list_for_each_entry(stp, &fp->fi_stateids, st_perfile) {
- if (test_deny(deny_type, stp) ||
- test_deny(NFS4_SHARE_DENY_BOTH, stp))
- goto out;
- }
- ret = nfs_ok;
-out:
+ return ret;
+ /* Check for conflicting share reservations */
+ spin_lock(&fp->fi_lock);
+ if (fp->fi_share_deny & deny_type)
+ ret = nfserr_locked;
+ spin_unlock(&fp->fi_lock);
put_nfs4_file(fp);
return ret;
}
-static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
+static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb)
{
- struct nfs4_client *clp = dp->dl_stid.sc_client;
- struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+ struct nfs4_delegation *dp = cb_to_delegation(cb);
+ struct nfsd_net *nn = net_generic(dp->dl_stid.sc_client->net,
+ nfsd_net_id);
- lockdep_assert_held(&state_lock);
- /* We're assuming the state code never drops its reference
- * without first removing the lease. Since we're in this lease
- * callback (and since the lease code is serialized by the kernel
- * lock) we know the server hasn't removed the lease yet, we know
- * it's safe to take a reference: */
- atomic_inc(&dp->dl_count);
+ block_delegations(&dp->dl_stid.sc_file->fi_fhandle);
+
+ /*
+ * We can't do this in nfsd_break_deleg_cb because it is
+ * already holding inode->i_lock.
+ *
+ * If the dl_time != 0, then we know that it has already been
+ * queued for a lease break. Don't queue it again.
+ */
+ spin_lock(&state_lock);
+ if (dp->dl_time == 0) {
+ dp->dl_time = get_seconds();
+ list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru);
+ }
+ spin_unlock(&state_lock);
+}
+
+static int nfsd4_cb_recall_done(struct nfsd4_callback *cb,
+ struct rpc_task *task)
+{
+ struct nfs4_delegation *dp = cb_to_delegation(cb);
+
+ switch (task->tk_status) {
+ case 0:
+ return 1;
+ case -EBADHANDLE:
+ case -NFS4ERR_BAD_STATEID:
+ /*
+ * Race: client probably got cb_recall before open reply
+ * granting delegation.
+ */
+ if (dp->dl_retries--) {
+ rpc_delay(task, 2 * HZ);
+ return 0;
+ }
+ /*FALLTHRU*/
+ default:
+ return -1;
+ }
+}
- list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru);
+static void nfsd4_cb_recall_release(struct nfsd4_callback *cb)
+{
+ struct nfs4_delegation *dp = cb_to_delegation(cb);
- /* Only place dl_time is set; protected by i_lock: */
- dp->dl_time = get_seconds();
+ nfs4_put_stid(&dp->dl_stid);
+}
- block_delegations(&dp->dl_fh);
+static struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
+ .prepare = nfsd4_cb_recall_prepare,
+ .done = nfsd4_cb_recall_done,
+ .release = nfsd4_cb_recall_release,
+};
- nfsd4_cb_recall(dp);
+static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
+{
+ /*
+ * We're assuming the state code never drops its reference
+ * without first removing the lease. Since we're in this lease
+ * callback (and since the lease code is serialized by the kernel
+ * lock) we know the server hasn't removed the lease yet, we know
+ * it's safe to take a reference.
+ */
+ atomic_inc(&dp->dl_stid.sc_count);
+ nfsd4_run_cb(&dp->dl_recall);
}
/* Called from break_lease() with i_lock held. */
-static void nfsd_break_deleg_cb(struct file_lock *fl)
+static bool
+nfsd_break_deleg_cb(struct file_lock *fl)
{
+ bool ret = false;
struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner;
struct nfs4_delegation *dp;
if (!fp) {
WARN(1, "(%p)->fl_owner NULL\n", fl);
- return;
+ return ret;
}
if (fp->fi_had_conflict) {
WARN(1, "duplicate break on %p\n", fp);
- return;
+ return ret;
}
/*
* We don't want the locks code to timeout the lease for us;
@@ -2872,18 +3455,26 @@ static void nfsd_break_deleg_cb(struct file_lock *fl)
*/
fl->fl_break_time = 0;
- spin_lock(&state_lock);
+ spin_lock(&fp->fi_lock);
fp->fi_had_conflict = true;
- list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
- nfsd_break_one_deleg(dp);
- spin_unlock(&state_lock);
+ /*
+ * If there are no delegations on the list, then return true
+ * so that the lease code will go ahead and delete it.
+ */
+ if (list_empty(&fp->fi_delegations))
+ ret = true;
+ else
+ list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
+ nfsd_break_one_deleg(dp);
+ spin_unlock(&fp->fi_lock);
+ return ret;
}
-static
-int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
+static int
+nfsd_change_deleg_cb(struct file_lock **onlist, int arg, struct list_head *dispose)
{
if (arg & F_UNLCK)
- return lease_modify(onlist, arg);
+ return lease_modify(onlist, arg, dispose);
else
return -EAGAIN;
}
@@ -2904,6 +3495,42 @@ static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4
return nfserr_bad_seqid;
}
+static __be32 lookup_clientid(clientid_t *clid,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd_net *nn)
+{
+ struct nfs4_client *found;
+
+ if (cstate->clp) {
+ found = cstate->clp;
+ if (!same_clid(&found->cl_clientid, clid))
+ return nfserr_stale_clientid;
+ return nfs_ok;
+ }
+
+ if (STALE_CLIENTID(clid, nn))
+ return nfserr_stale_clientid;
+
+ /*
+ * For v4.1+ we get the client in the SEQUENCE op. If we don't have one
+ * cached already then we know this is for is for v4.0 and "sessions"
+ * will be false.
+ */
+ WARN_ON_ONCE(cstate->session);
+ spin_lock(&nn->client_lock);
+ found = find_confirmed_client(clid, false, nn);
+ if (!found) {
+ spin_unlock(&nn->client_lock);
+ return nfserr_expired;
+ }
+ atomic_inc(&found->cl_refcount);
+ spin_unlock(&nn->client_lock);
+
+ /* Cache the nfs4_client in cstate! */
+ cstate->clp = found;
+ return nfs_ok;
+}
+
__be32
nfsd4_process_open1(struct nfsd4_compound_state *cstate,
struct nfsd4_open *open, struct nfsd_net *nn)
@@ -2924,19 +3551,19 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
if (open->op_file == NULL)
return nfserr_jukebox;
- strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner);
- oo = find_openstateowner_str(strhashval, open, cstate->minorversion, nn);
+ status = lookup_clientid(clientid, cstate, nn);
+ if (status)
+ return status;
+ clp = cstate->clp;
+
+ strhashval = ownerstr_hashval(&open->op_owner);
+ oo = find_openstateowner_str(strhashval, open, clp);
open->op_openowner = oo;
if (!oo) {
- clp = find_confirmed_client(clientid, cstate->minorversion,
- nn);
- if (clp == NULL)
- return nfserr_expired;
goto new_owner;
}
if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
/* Replace unconfirmed owners without checking for replay. */
- clp = oo->oo_owner.so_client;
release_openowner(oo);
open->op_openowner = NULL;
goto new_owner;
@@ -2944,15 +3571,14 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid);
if (status)
return status;
- clp = oo->oo_owner.so_client;
goto alloc_stateid;
new_owner:
- oo = alloc_init_open_stateowner(strhashval, clp, open);
+ oo = alloc_init_open_stateowner(strhashval, open, cstate);
if (oo == NULL)
return nfserr_jukebox;
open->op_openowner = oo;
alloc_stateid:
- open->op_stp = nfs4_alloc_stateid(clp);
+ open->op_stp = nfs4_alloc_open_stateid(clp);
if (!open->op_stp)
return nfserr_jukebox;
return nfs_ok;
@@ -2994,14 +3620,18 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open,
{
int flags;
__be32 status = nfserr_bad_stateid;
+ struct nfs4_delegation *deleg;
- *dp = find_deleg_stateid(cl, &open->op_delegate_stateid);
- if (*dp == NULL)
+ deleg = find_deleg_stateid(cl, &open->op_delegate_stateid);
+ if (deleg == NULL)
goto out;
flags = share_access_to_flags(open->op_share_access);
- status = nfs4_check_delegmode(*dp, flags);
- if (status)
- *dp = NULL;
+ status = nfs4_check_delegmode(deleg, flags);
+ if (status) {
+ nfs4_put_stid(&deleg->dl_stid);
+ goto out;
+ }
+ *dp = deleg;
out:
if (!nfsd4_is_deleg_cur(open))
return nfs_ok;
@@ -3011,24 +3641,25 @@ out:
return nfs_ok;
}
-static __be32
-nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_ol_stateid **stpp)
+static struct nfs4_ol_stateid *
+nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
{
- struct nfs4_ol_stateid *local;
+ struct nfs4_ol_stateid *local, *ret = NULL;
struct nfs4_openowner *oo = open->op_openowner;
+ spin_lock(&fp->fi_lock);
list_for_each_entry(local, &fp->fi_stateids, st_perfile) {
/* ignore lock owners */
if (local->st_stateowner->so_is_open_owner == 0)
continue;
- /* remember if we have seen this open owner */
- if (local->st_stateowner == &oo->oo_owner)
- *stpp = local;
- /* check for conflicting share reservations */
- if (!test_share(local, open))
- return nfserr_share_denied;
+ if (local->st_stateowner == &oo->oo_owner) {
+ ret = local;
+ atomic_inc(&ret->st_stid.sc_count);
+ break;
+ }
}
- return nfs_ok;
+ spin_unlock(&fp->fi_lock);
+ return ret;
}
static inline int nfs4_access_to_access(u32 nfs4_access)
@@ -3042,24 +3673,6 @@ static inline int nfs4_access_to_access(u32 nfs4_access)
return flags;
}
-static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
- struct svc_fh *cur_fh, struct nfsd4_open *open)
-{
- __be32 status;
- int oflag = nfs4_access_to_omode(open->op_share_access);
- int access = nfs4_access_to_access(open->op_share_access);
-
- if (!fp->fi_fds[oflag]) {
- status = nfsd_open(rqstp, cur_fh, S_IFREG, access,
- &fp->fi_fds[oflag]);
- if (status)
- return status;
- }
- nfs4_file_get_access(fp, oflag);
-
- return nfs_ok;
-}
-
static inline __be32
nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
struct nfsd4_open *open)
@@ -3075,34 +3688,99 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
return nfsd_setattr(rqstp, fh, &iattr, 0, (time_t)0);
}
-static __be32
-nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open)
+static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
+ struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp,
+ struct nfsd4_open *open)
{
- u32 op_share_access = open->op_share_access;
- bool new_access;
+ struct file *filp = NULL;
__be32 status;
+ int oflag = nfs4_access_to_omode(open->op_share_access);
+ int access = nfs4_access_to_access(open->op_share_access);
+ unsigned char old_access_bmap, old_deny_bmap;
- new_access = !test_access(op_share_access, stp);
- if (new_access) {
- status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open);
- if (status)
- return status;
+ spin_lock(&fp->fi_lock);
+
+ /*
+ * Are we trying to set a deny mode that would conflict with
+ * current access?
+ */
+ status = nfs4_file_check_deny(fp, open->op_share_deny);
+ if (status != nfs_ok) {
+ spin_unlock(&fp->fi_lock);
+ goto out;
}
- status = nfsd4_truncate(rqstp, cur_fh, open);
- if (status) {
- if (new_access) {
- int oflag = nfs4_access_to_omode(op_share_access);
- nfs4_file_put_access(fp, oflag);
- }
- return status;
+
+ /* set access to the file */
+ status = nfs4_file_get_access(fp, open->op_share_access);
+ if (status != nfs_ok) {
+ spin_unlock(&fp->fi_lock);
+ goto out;
}
- /* remember the open */
- set_access(op_share_access, stp);
+
+ /* Set access bits in stateid */
+ old_access_bmap = stp->st_access_bmap;
+ set_access(open->op_share_access, stp);
+
+ /* Set new deny mask */
+ old_deny_bmap = stp->st_deny_bmap;
set_deny(open->op_share_deny, stp);
+ fp->fi_share_deny |= (open->op_share_deny & NFS4_SHARE_DENY_BOTH);
- return nfs_ok;
+ if (!fp->fi_fds[oflag]) {
+ spin_unlock(&fp->fi_lock);
+ status = nfsd_open(rqstp, cur_fh, S_IFREG, access, &filp);
+ if (status)
+ goto out_put_access;
+ spin_lock(&fp->fi_lock);
+ if (!fp->fi_fds[oflag]) {
+ fp->fi_fds[oflag] = filp;
+ filp = NULL;
+ }
+ }
+ spin_unlock(&fp->fi_lock);
+ if (filp)
+ fput(filp);
+
+ status = nfsd4_truncate(rqstp, cur_fh, open);
+ if (status)
+ goto out_put_access;
+out:
+ return status;
+out_put_access:
+ stp->st_access_bmap = old_access_bmap;
+ nfs4_file_put_access(fp, open->op_share_access);
+ reset_union_bmap_deny(bmap_to_share_mode(old_deny_bmap), stp);
+ goto out;
}
+static __be32
+nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open)
+{
+ __be32 status;
+ unsigned char old_deny_bmap;
+
+ if (!test_access(open->op_share_access, stp))
+ return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open);
+
+ /* test and set deny mode */
+ spin_lock(&fp->fi_lock);
+ status = nfs4_file_check_deny(fp, open->op_share_deny);
+ if (status == nfs_ok) {
+ old_deny_bmap = stp->st_deny_bmap;
+ set_deny(open->op_share_deny, stp);
+ fp->fi_share_deny |=
+ (open->op_share_deny & NFS4_SHARE_DENY_BOTH);
+ }
+ spin_unlock(&fp->fi_lock);
+
+ if (status != nfs_ok)
+ return status;
+
+ status = nfsd4_truncate(rqstp, cur_fh, open);
+ if (status != nfs_ok)
+ reset_union_bmap_deny(old_deny_bmap, stp);
+ return status;
+}
static void
nfs4_set_claim_prev(struct nfsd4_open *open, bool has_session)
@@ -3123,65 +3801,112 @@ static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN;
}
-static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int flag)
+static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag)
{
struct file_lock *fl;
fl = locks_alloc_lock();
if (!fl)
return NULL;
- locks_init_lock(fl);
fl->fl_lmops = &nfsd_lease_mng_ops;
fl->fl_flags = FL_DELEG;
fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
fl->fl_end = OFFSET_MAX;
- fl->fl_owner = (fl_owner_t)(dp->dl_file);
+ fl->fl_owner = (fl_owner_t)fp;
fl->fl_pid = current->tgid;
return fl;
}
static int nfs4_setlease(struct nfs4_delegation *dp)
{
- struct nfs4_file *fp = dp->dl_file;
- struct file_lock *fl;
- int status;
+ struct nfs4_file *fp = dp->dl_stid.sc_file;
+ struct file_lock *fl, *ret;
+ struct file *filp;
+ int status = 0;
- fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ);
+ fl = nfs4_alloc_init_lease(fp, NFS4_OPEN_DELEGATE_READ);
if (!fl)
return -ENOMEM;
- fl->fl_file = find_readable_file(fp);
- status = vfs_setlease(fl->fl_file, fl->fl_type, &fl);
+ filp = find_readable_file(fp);
+ if (!filp) {
+ /* We should always have a readable file here */
+ WARN_ON_ONCE(1);
+ return -EBADF;
+ }
+ fl->fl_file = filp;
+ ret = fl;
+ status = vfs_setlease(filp, fl->fl_type, &fl, NULL);
+ if (fl)
+ locks_free_lock(fl);
if (status)
- goto out_free;
- fp->fi_lease = fl;
- fp->fi_deleg_file = get_file(fl->fl_file);
- atomic_set(&fp->fi_delegees, 1);
+ goto out_fput;
spin_lock(&state_lock);
+ spin_lock(&fp->fi_lock);
+ /* Did the lease get broken before we took the lock? */
+ status = -EAGAIN;
+ if (fp->fi_had_conflict)
+ goto out_unlock;
+ /* Race breaker */
+ if (fp->fi_deleg_file) {
+ status = 0;
+ atomic_inc(&fp->fi_delegees);
+ hash_delegation_locked(dp, fp);
+ goto out_unlock;
+ }
+ fp->fi_deleg_file = filp;
+ atomic_set(&fp->fi_delegees, 1);
hash_delegation_locked(dp, fp);
+ spin_unlock(&fp->fi_lock);
spin_unlock(&state_lock);
return 0;
-out_free:
- locks_free_lock(fl);
+out_unlock:
+ spin_unlock(&fp->fi_lock);
+ spin_unlock(&state_lock);
+out_fput:
+ fput(filp);
return status;
}
-static int nfs4_set_delegation(struct nfs4_delegation *dp, struct nfs4_file *fp)
+static struct nfs4_delegation *
+nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
+ struct nfs4_file *fp)
{
+ int status;
+ struct nfs4_delegation *dp;
+
if (fp->fi_had_conflict)
- return -EAGAIN;
+ return ERR_PTR(-EAGAIN);
+
+ dp = alloc_init_deleg(clp, fh);
+ if (!dp)
+ return ERR_PTR(-ENOMEM);
+
get_nfs4_file(fp);
- dp->dl_file = fp;
- if (!fp->fi_lease)
- return nfs4_setlease(dp);
spin_lock(&state_lock);
+ spin_lock(&fp->fi_lock);
+ dp->dl_stid.sc_file = fp;
+ if (!fp->fi_deleg_file) {
+ spin_unlock(&fp->fi_lock);
+ spin_unlock(&state_lock);
+ status = nfs4_setlease(dp);
+ goto out;
+ }
atomic_inc(&fp->fi_delegees);
if (fp->fi_had_conflict) {
- spin_unlock(&state_lock);
- return -EAGAIN;
+ status = -EAGAIN;
+ goto out_unlock;
}
hash_delegation_locked(dp, fp);
+ status = 0;
+out_unlock:
+ spin_unlock(&fp->fi_lock);
spin_unlock(&state_lock);
- return 0;
+out:
+ if (status) {
+ nfs4_put_stid(&dp->dl_stid);
+ return ERR_PTR(status);
+ }
+ return dp;
}
static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
@@ -3212,11 +3937,12 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
* proper support for them.
*/
static void
-nfs4_open_delegation(struct net *net, struct svc_fh *fh,
- struct nfsd4_open *open, struct nfs4_ol_stateid *stp)
+nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open,
+ struct nfs4_ol_stateid *stp)
{
struct nfs4_delegation *dp;
- struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner);
+ struct nfs4_openowner *oo = openowner(stp->st_stateowner);
+ struct nfs4_client *clp = stp->st_stid.sc_client;
int cb_up;
int status = 0;
@@ -3235,7 +3961,7 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
* Let's not give out any delegations till everyone's
* had the chance to reclaim theirs....
*/
- if (locks_in_grace(net))
+ if (locks_in_grace(clp->net))
goto out_no_deleg;
if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED))
goto out_no_deleg;
@@ -3254,21 +3980,17 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
default:
goto out_no_deleg;
}
- dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh);
- if (dp == NULL)
+ dp = nfs4_set_delegation(clp, fh, stp->st_stid.sc_file);
+ if (IS_ERR(dp))
goto out_no_deleg;
- status = nfs4_set_delegation(dp, stp->st_file);
- if (status)
- goto out_free;
memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid));
dprintk("NFSD: delegation stateid=" STATEID_FMT "\n",
STATEID_VAL(&dp->dl_stid.sc_stateid));
open->op_delegate_type = NFS4_OPEN_DELEGATE_READ;
+ nfs4_put_stid(&dp->dl_stid);
return;
-out_free:
- destroy_delegation(dp);
out_no_deleg:
open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE;
if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS &&
@@ -3301,16 +4023,12 @@ static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open,
*/
}
-/*
- * called with nfs4_lock_state() held.
- */
__be32
nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
{
struct nfsd4_compoundres *resp = rqstp->rq_resp;
struct nfs4_client *cl = open->op_openowner->oo_owner.so_client;
struct nfs4_file *fp = NULL;
- struct inode *ino = current_fh->fh_dentry->d_inode;
struct nfs4_ol_stateid *stp = NULL;
struct nfs4_delegation *dp = NULL;
__be32 status;
@@ -3320,21 +4038,18 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
* and check for delegations in the process of being recalled.
* If not found, create the nfs4_file struct
*/
- fp = find_file(ino);
- if (fp) {
- if ((status = nfs4_check_open(fp, open, &stp)))
- goto out;
+ fp = find_or_add_file(open->op_file, &current_fh->fh_handle);
+ if (fp != open->op_file) {
status = nfs4_check_deleg(cl, open, &dp);
if (status)
goto out;
+ stp = nfsd4_find_existing_open(fp, open);
} else {
+ open->op_file = NULL;
status = nfserr_bad_stateid;
if (nfsd4_is_deleg_cur(open))
goto out;
status = nfserr_jukebox;
- fp = open->op_file;
- open->op_file = NULL;
- nfsd4_init_file(fp, ino);
}
/*
@@ -3347,22 +4062,19 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
if (status)
goto out;
} else {
- status = nfs4_get_vfs_file(rqstp, fp, current_fh, open);
- if (status)
- goto out;
- status = nfsd4_truncate(rqstp, current_fh, open);
- if (status)
- goto out;
stp = open->op_stp;
open->op_stp = NULL;
init_open_stateid(stp, fp, open);
+ status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
+ if (status) {
+ release_open_stateid(stp);
+ goto out;
+ }
}
update_stateid(&stp->st_stid.sc_stateid);
memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
if (nfsd4_has_session(&resp->cstate)) {
- open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
-
if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) {
open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT;
open->op_why_no_deleg = WND4_NOT_WANTED;
@@ -3374,7 +4086,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
* Attempt to hand out a delegation. No error return, because the
* OPEN succeeds even if we fail.
*/
- nfs4_open_delegation(SVC_NET(rqstp), current_fh, open, stp);
+ nfs4_open_delegation(current_fh, open, stp);
nodeleg:
status = nfs_ok;
@@ -3397,41 +4109,27 @@ out:
if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) &&
!nfsd4_has_session(&resp->cstate))
open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM;
+ if (dp)
+ nfs4_put_stid(&dp->dl_stid);
+ if (stp)
+ nfs4_put_stid(&stp->st_stid);
return status;
}
-void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status)
+void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate,
+ struct nfsd4_open *open, __be32 status)
{
if (open->op_openowner) {
- struct nfs4_openowner *oo = open->op_openowner;
+ struct nfs4_stateowner *so = &open->op_openowner->oo_owner;
- if (!list_empty(&oo->oo_owner.so_stateids))
- list_del_init(&oo->oo_close_lru);
- if (oo->oo_flags & NFS4_OO_NEW) {
- if (status) {
- release_openowner(oo);
- open->op_openowner = NULL;
- } else
- oo->oo_flags &= ~NFS4_OO_NEW;
- }
+ nfsd4_cstate_assign_replay(cstate, so);
+ nfs4_put_stateowner(so);
}
if (open->op_file)
nfsd4_free_file(open->op_file);
if (open->op_stp)
- free_generic_stateid(open->op_stp);
-}
-
-static __be32 lookup_clientid(clientid_t *clid, bool session, struct nfsd_net *nn, struct nfs4_client **clp)
-{
- struct nfs4_client *found;
-
- if (STALE_CLIENTID(clid, nn))
- return nfserr_stale_clientid;
- found = find_confirmed_client(clid, session, nn);
- if (clp)
- *clp = found;
- return found ? nfs_ok : nfserr_expired;
+ nfs4_put_stid(&open->op_stp->st_stid);
}
__be32
@@ -3442,23 +4140,22 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
__be32 status;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
- nfs4_lock_state();
dprintk("process_renew(%08x/%08x): starting\n",
clid->cl_boot, clid->cl_id);
- status = lookup_clientid(clid, cstate->minorversion, nn, &clp);
+ status = lookup_clientid(clid, cstate, nn);
if (status)
goto out;
+ clp = cstate->clp;
status = nfserr_cb_path_down;
if (!list_empty(&clp->cl_delegations)
&& clp->cl_cb_state != NFSD4_CB_UP)
goto out;
status = nfs_ok;
out:
- nfs4_unlock_state();
return status;
}
-static void
+void
nfsd4_end_grace(struct nfsd_net *nn)
{
/* do nothing if grace period already ended */
@@ -3467,14 +4164,28 @@ nfsd4_end_grace(struct nfsd_net *nn)
dprintk("NFSD: end of grace period\n");
nn->grace_ended = true;
- nfsd4_record_grace_done(nn, nn->boot_time);
+ /*
+ * If the server goes down again right now, an NFSv4
+ * client will still be allowed to reclaim after it comes back up,
+ * even if it hasn't yet had a chance to reclaim state this time.
+ *
+ */
+ nfsd4_record_grace_done(nn);
+ /*
+ * At this point, NFSv4 clients can still reclaim. But if the
+ * server crashes, any that have not yet reclaimed will be out
+ * of luck on the next boot.
+ *
+ * (NFSv4.1+ clients are considered to have reclaimed once they
+ * call RECLAIM_COMPLETE. NFSv4.0 clients are considered to
+ * have reclaimed after their first OPEN.)
+ */
locks_end_grace(&nn->nfsd4_manager);
/*
- * Now that every NFSv4 client has had the chance to recover and
- * to see the (possibly new, possibly shorter) lease time, we
- * can safely set the next grace time to the current lease time:
+ * At this point, and once lockd and/or any other containers
+ * exit their grace period, further reclaims will fail and
+ * regular locking can resume.
*/
- nn->nfsd4_grace = nn->nfsd4_lease;
}
static time_t
@@ -3483,12 +4194,11 @@ nfs4_laundromat(struct nfsd_net *nn)
struct nfs4_client *clp;
struct nfs4_openowner *oo;
struct nfs4_delegation *dp;
+ struct nfs4_ol_stateid *stp;
struct list_head *pos, *next, reaplist;
time_t cutoff = get_seconds() - nn->nfsd4_lease;
time_t t, new_timeo = nn->nfsd4_lease;
- nfs4_lock_state();
-
dprintk("NFSD: laundromat service - starting\n");
nfsd4_end_grace(nn);
INIT_LIST_HEAD(&reaplist);
@@ -3505,13 +4215,14 @@ nfs4_laundromat(struct nfsd_net *nn)
clp->cl_clientid.cl_id);
continue;
}
- list_move(&clp->cl_lru, &reaplist);
+ list_add(&clp->cl_lru, &reaplist);
}
spin_unlock(&nn->client_lock);
list_for_each_safe(pos, next, &reaplist) {
clp = list_entry(pos, struct nfs4_client, cl_lru);
dprintk("NFSD: purging unused client (clientid %08x)\n",
clp->cl_clientid.cl_id);
+ list_del_init(&clp->cl_lru);
expire_client(clp);
}
spin_lock(&state_lock);
@@ -3524,24 +4235,37 @@ nfs4_laundromat(struct nfsd_net *nn)
new_timeo = min(new_timeo, t);
break;
}
- list_move(&dp->dl_recall_lru, &reaplist);
+ unhash_delegation_locked(dp);
+ list_add(&dp->dl_recall_lru, &reaplist);
}
spin_unlock(&state_lock);
- list_for_each_safe(pos, next, &reaplist) {
- dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
+ while (!list_empty(&reaplist)) {
+ dp = list_first_entry(&reaplist, struct nfs4_delegation,
+ dl_recall_lru);
+ list_del_init(&dp->dl_recall_lru);
revoke_delegation(dp);
}
- list_for_each_safe(pos, next, &nn->close_lru) {
- oo = container_of(pos, struct nfs4_openowner, oo_close_lru);
- if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) {
+
+ spin_lock(&nn->client_lock);
+ while (!list_empty(&nn->close_lru)) {
+ oo = list_first_entry(&nn->close_lru, struct nfs4_openowner,
+ oo_close_lru);
+ if (time_after((unsigned long)oo->oo_time,
+ (unsigned long)cutoff)) {
t = oo->oo_time - cutoff;
new_timeo = min(new_timeo, t);
break;
}
- release_openowner(oo);
+ list_del_init(&oo->oo_close_lru);
+ stp = oo->oo_last_closed_stid;
+ oo->oo_last_closed_stid = NULL;
+ spin_unlock(&nn->client_lock);
+ nfs4_put_stid(&stp->st_stid);
+ spin_lock(&nn->client_lock);
}
+ spin_unlock(&nn->client_lock);
+
new_timeo = max_t(time_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT);
- nfs4_unlock_state();
return new_timeo;
}
@@ -3564,7 +4288,7 @@ laundromat_main(struct work_struct *laundry)
static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp)
{
- if (fhp->fh_dentry->d_inode != stp->st_file->fi_inode)
+ if (!nfsd_fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle))
return nfserr_bad_stateid;
return nfs_ok;
}
@@ -3666,10 +4390,10 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
{
struct nfs4_stid *s;
struct nfs4_ol_stateid *ols;
- __be32 status;
+ __be32 status = nfserr_bad_stateid;
if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
- return nfserr_bad_stateid;
+ return status;
/* Client debugging aid. */
if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) {
char addr_str[INET6_ADDRSTRLEN];
@@ -3677,53 +4401,62 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
sizeof(addr_str));
pr_warn_ratelimited("NFSD: client %s testing state ID "
"with incorrect client ID\n", addr_str);
- return nfserr_bad_stateid;
+ return status;
}
- s = find_stateid(cl, stateid);
+ spin_lock(&cl->cl_lock);
+ s = find_stateid_locked(cl, stateid);
if (!s)
- return nfserr_bad_stateid;
+ goto out_unlock;
status = check_stateid_generation(stateid, &s->sc_stateid, 1);
if (status)
- return status;
+ goto out_unlock;
switch (s->sc_type) {
case NFS4_DELEG_STID:
- return nfs_ok;
+ status = nfs_ok;
+ break;
case NFS4_REVOKED_DELEG_STID:
- return nfserr_deleg_revoked;
+ status = nfserr_deleg_revoked;
+ break;
case NFS4_OPEN_STID:
case NFS4_LOCK_STID:
ols = openlockstateid(s);
if (ols->st_stateowner->so_is_open_owner
&& !(openowner(ols->st_stateowner)->oo_flags
& NFS4_OO_CONFIRMED))
- return nfserr_bad_stateid;
- return nfs_ok;
+ status = nfserr_bad_stateid;
+ else
+ status = nfs_ok;
+ break;
default:
printk("unknown stateid type %x\n", s->sc_type);
+ /* Fallthrough */
case NFS4_CLOSED_STID:
- return nfserr_bad_stateid;
+ case NFS4_CLOSED_DELEG_STID:
+ status = nfserr_bad_stateid;
}
+out_unlock:
+ spin_unlock(&cl->cl_lock);
+ return status;
}
-static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask,
- struct nfs4_stid **s, bool sessions,
- struct nfsd_net *nn)
+static __be32
+nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
+ stateid_t *stateid, unsigned char typemask,
+ struct nfs4_stid **s, struct nfsd_net *nn)
{
- struct nfs4_client *cl;
__be32 status;
if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
return nfserr_bad_stateid;
- status = lookup_clientid(&stateid->si_opaque.so_clid, sessions,
- nn, &cl);
+ status = lookup_clientid(&stateid->si_opaque.so_clid, cstate, nn);
if (status == nfserr_stale_clientid) {
- if (sessions)
+ if (cstate->session)
return nfserr_bad_stateid;
return nfserr_stale_stateid;
}
if (status)
return status;
- *s = find_stateid_by_type(cl, stateid, typemask);
+ *s = find_stateid_by_type(cstate->clp, stateid, typemask);
if (!*s)
return nfserr_bad_stateid;
return nfs_ok;
@@ -3754,12 +4487,11 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
return check_special_stateids(net, current_fh, stateid, flags);
- nfs4_lock_state();
-
- status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID,
- &s, cstate->minorversion, nn);
+ status = nfsd4_lookup_stateid(cstate, stateid,
+ NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID,
+ &s, nn);
if (status)
- goto out;
+ return status;
status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate));
if (status)
goto out;
@@ -3770,12 +4502,13 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
if (status)
goto out;
if (filpp) {
- file = dp->dl_file->fi_deleg_file;
+ file = dp->dl_stid.sc_file->fi_deleg_file;
if (!file) {
WARN_ON_ONCE(1);
status = nfserr_serverfault;
goto out;
}
+ get_file(file);
}
break;
case NFS4_OPEN_STID:
@@ -3791,10 +4524,12 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
if (status)
goto out;
if (filpp) {
+ struct nfs4_file *fp = stp->st_stid.sc_file;
+
if (flags & RD_STATE)
- file = find_readable_file(stp->st_file);
+ file = find_readable_file(fp);
else
- file = find_writeable_file(stp->st_file);
+ file = find_writeable_file(fp);
}
break;
default:
@@ -3803,28 +4538,12 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
}
status = nfs_ok;
if (file)
- *filpp = get_file(file);
+ *filpp = file;
out:
- nfs4_unlock_state();
+ nfs4_put_stid(s);
return status;
}
-static __be32
-nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp)
-{
- struct nfs4_lockowner *lo = lockowner(stp->st_stateowner);
-
- if (check_for_locks(stp->st_file, lo))
- return nfserr_locks_held;
- /*
- * Currently there's a 1-1 lock stateid<->lockowner
- * correspondance, and we have to delete the lockowner when we
- * delete the lock stateid:
- */
- release_lockowner(lo);
- return nfs_ok;
-}
-
/*
* Test if the stateid is valid
*/
@@ -3835,11 +4554,9 @@ nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_test_stateid_id *stateid;
struct nfs4_client *cl = cstate->session->se_client;
- nfs4_lock_state();
list_for_each_entry(stateid, &test_stateid->ts_stateid_list, ts_id_list)
stateid->ts_id_status =
nfsd4_validate_stateid(cl, &stateid->ts_id_stateid);
- nfs4_unlock_state();
return nfs_ok;
}
@@ -3851,37 +4568,50 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
stateid_t *stateid = &free_stateid->fr_stateid;
struct nfs4_stid *s;
struct nfs4_delegation *dp;
+ struct nfs4_ol_stateid *stp;
struct nfs4_client *cl = cstate->session->se_client;
__be32 ret = nfserr_bad_stateid;
- nfs4_lock_state();
- s = find_stateid(cl, stateid);
+ spin_lock(&cl->cl_lock);
+ s = find_stateid_locked(cl, stateid);
if (!s)
- goto out;
+ goto out_unlock;
switch (s->sc_type) {
case NFS4_DELEG_STID:
ret = nfserr_locks_held;
- goto out;
+ break;
case NFS4_OPEN_STID:
- case NFS4_LOCK_STID:
ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
if (ret)
- goto out;
- if (s->sc_type == NFS4_LOCK_STID)
- ret = nfsd4_free_lock_stateid(openlockstateid(s));
- else
- ret = nfserr_locks_held;
+ break;
+ ret = nfserr_locks_held;
break;
+ case NFS4_LOCK_STID:
+ ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
+ if (ret)
+ break;
+ stp = openlockstateid(s);
+ ret = nfserr_locks_held;
+ if (check_for_locks(stp->st_stid.sc_file,
+ lockowner(stp->st_stateowner)))
+ break;
+ unhash_lock_stateid(stp);
+ spin_unlock(&cl->cl_lock);
+ nfs4_put_stid(s);
+ ret = nfs_ok;
+ goto out;
case NFS4_REVOKED_DELEG_STID:
dp = delegstateid(s);
- destroy_revoked_delegation(dp);
+ list_del_init(&dp->dl_recall_lru);
+ spin_unlock(&cl->cl_lock);
+ nfs4_put_stid(s);
ret = nfs_ok;
- break;
- default:
- ret = nfserr_bad_stateid;
+ goto out;
+ /* Default falls through and returns nfserr_bad_stateid */
}
+out_unlock:
+ spin_unlock(&cl->cl_lock);
out:
- nfs4_unlock_state();
return ret;
}
@@ -3926,20 +4656,24 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
{
__be32 status;
struct nfs4_stid *s;
+ struct nfs4_ol_stateid *stp = NULL;
dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__,
seqid, STATEID_VAL(stateid));
*stpp = NULL;
- status = nfsd4_lookup_stateid(stateid, typemask, &s,
- cstate->minorversion, nn);
+ status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn);
if (status)
return status;
- *stpp = openlockstateid(s);
- if (!nfsd4_has_session(cstate))
- cstate->replay_owner = (*stpp)->st_stateowner;
+ stp = openlockstateid(s);
+ nfsd4_cstate_assign_replay(cstate, stp->st_stateowner);
- return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp);
+ status = nfs4_seqid_op_checks(cstate, stateid, seqid, stp);
+ if (!status)
+ *stpp = stp;
+ else
+ nfs4_put_stid(&stp->st_stid);
+ return status;
}
static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
@@ -3947,14 +4681,18 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs
{
__be32 status;
struct nfs4_openowner *oo;
+ struct nfs4_ol_stateid *stp;
status = nfs4_preprocess_seqid_op(cstate, seqid, stateid,
- NFS4_OPEN_STID, stpp, nn);
+ NFS4_OPEN_STID, &stp, nn);
if (status)
return status;
- oo = openowner((*stpp)->st_stateowner);
- if (!(oo->oo_flags & NFS4_OO_CONFIRMED))
+ oo = openowner(stp->st_stateowner);
+ if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
+ nfs4_put_stid(&stp->st_stid);
return nfserr_bad_stateid;
+ }
+ *stpp = stp;
return nfs_ok;
}
@@ -3974,8 +4712,6 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
return status;
- nfs4_lock_state();
-
status = nfs4_preprocess_seqid_op(cstate,
oc->oc_seqid, &oc->oc_req_stateid,
NFS4_OPEN_STID, &stp, nn);
@@ -3984,7 +4720,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
oo = openowner(stp->st_stateowner);
status = nfserr_bad_stateid;
if (oo->oo_flags & NFS4_OO_CONFIRMED)
- goto out;
+ goto put_stateid;
oo->oo_flags |= NFS4_OO_CONFIRMED;
update_stateid(&stp->st_stid.sc_stateid);
memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
@@ -3993,10 +4729,10 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfsd4_client_record_create(oo->oo_owner.so_client);
status = nfs_ok;
+put_stateid:
+ nfs4_put_stid(&stp->st_stid);
out:
nfsd4_bump_seqid(cstate, status);
- if (!cstate->replay_owner)
- nfs4_unlock_state();
return status;
}
@@ -4004,7 +4740,7 @@ static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 a
{
if (!test_access(access, stp))
return;
- nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(access));
+ nfs4_file_put_access(stp->st_stid.sc_file, access);
clear_access(access, stp);
}
@@ -4026,16 +4762,6 @@ static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_ac
}
}
-static void
-reset_union_bmap_deny(unsigned long deny, struct nfs4_ol_stateid *stp)
-{
- int i;
- for (i = 0; i < 4; i++) {
- if ((i & deny) != i)
- clear_deny(i, stp);
- }
-}
-
__be32
nfsd4_open_downgrade(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate,
@@ -4053,21 +4779,20 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
dprintk("NFSD: %s: od_deleg_want=0x%x ignored\n", __func__,
od->od_deleg_want);
- nfs4_lock_state();
status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid,
&od->od_stateid, &stp, nn);
if (status)
goto out;
status = nfserr_inval;
if (!test_access(od->od_share_access, stp)) {
- dprintk("NFSD: access not a subset current bitmap: 0x%lx, input access=%08x\n",
+ dprintk("NFSD: access not a subset of current bitmap: 0x%hhx, input access=%08x\n",
stp->st_access_bmap, od->od_share_access);
- goto out;
+ goto put_stateid;
}
if (!test_deny(od->od_share_deny, stp)) {
- dprintk("NFSD:deny not a subset current bitmap: 0x%lx, input deny=%08x\n",
+ dprintk("NFSD: deny not a subset of current bitmap: 0x%hhx, input deny=%08x\n",
stp->st_deny_bmap, od->od_share_deny);
- goto out;
+ goto put_stateid;
}
nfs4_stateid_downgrade(stp, od->od_share_access);
@@ -4076,17 +4801,31 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
update_stateid(&stp->st_stid.sc_stateid);
memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
status = nfs_ok;
+put_stateid:
+ nfs4_put_stid(&stp->st_stid);
out:
nfsd4_bump_seqid(cstate, status);
- if (!cstate->replay_owner)
- nfs4_unlock_state();
return status;
}
static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
{
- unhash_open_stateid(s);
+ struct nfs4_client *clp = s->st_stid.sc_client;
+ LIST_HEAD(reaplist);
+
s->st_stid.sc_type = NFS4_CLOSED_STID;
+ spin_lock(&clp->cl_lock);
+ unhash_open_stateid(s, &reaplist);
+
+ if (clp->cl_minorversion) {
+ put_ol_stateid_locked(s, &reaplist);
+ spin_unlock(&clp->cl_lock);
+ free_ol_stateid_reaplist(&reaplist);
+ } else {
+ spin_unlock(&clp->cl_lock);
+ free_ol_stateid_reaplist(&reaplist);
+ move_to_close_lru(s, clp->net);
+ }
}
/*
@@ -4097,7 +4836,6 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_close *close)
{
__be32 status;
- struct nfs4_openowner *oo;
struct nfs4_ol_stateid *stp;
struct net *net = SVC_NET(rqstp);
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -4105,7 +4843,6 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
dprintk("NFSD: nfsd4_close on file %pd\n",
cstate->current_fh.fh_dentry);
- nfs4_lock_state();
status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid,
&close->cl_stateid,
NFS4_OPEN_STID|NFS4_CLOSED_STID,
@@ -4113,31 +4850,14 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfsd4_bump_seqid(cstate, status);
if (status)
goto out;
- oo = openowner(stp->st_stateowner);
update_stateid(&stp->st_stid.sc_stateid);
memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
nfsd4_close_open_stateid(stp);
- if (cstate->minorversion)
- free_generic_stateid(stp);
- else
- oo->oo_last_closed_stid = stp;
-
- if (list_empty(&oo->oo_owner.so_stateids)) {
- if (cstate->minorversion)
- release_openowner(oo);
- else {
- /*
- * In the 4.0 case we need to keep the owners around a
- * little while to handle CLOSE replay.
- */
- move_to_close_lru(oo, SVC_NET(rqstp));
- }
- }
+ /* put reference from nfs4_preprocess_seqid_op */
+ nfs4_put_stid(&stp->st_stid);
out:
- if (!cstate->replay_owner)
- nfs4_unlock_state();
return status;
}
@@ -4154,28 +4874,24 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
return status;
- nfs4_lock_state();
- status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s,
- cstate->minorversion, nn);
+ status = nfsd4_lookup_stateid(cstate, stateid, NFS4_DELEG_STID, &s, nn);
if (status)
goto out;
dp = delegstateid(s);
status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate));
if (status)
- goto out;
+ goto put_stateid;
destroy_delegation(dp);
+put_stateid:
+ nfs4_put_stid(&dp->dl_stid);
out:
- nfs4_unlock_state();
-
return status;
}
#define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start))
-#define LOCKOWNER_INO_HASH_MASK (LOCKOWNER_INO_HASH_SIZE - 1)
-
static inline u64
end_offset(u64 start, u64 len)
{
@@ -4196,13 +4912,6 @@ last_byte_offset(u64 start, u64 len)
return end > start ? end - 1: NFS4_MAX_UINT64;
}
-static unsigned int lockowner_ino_hashval(struct inode *inode, u32 cl_id, struct xdr_netobj *ownername)
-{
- return (file_hashval(inode) + cl_id
- + opaque_hashval(ownername->data, ownername->len))
- & LOCKOWNER_INO_HASH_MASK;
-}
-
/*
* TODO: Linux file offsets are _signed_ 64-bit quantities, which means that
* we can't properly handle lock requests that go beyond the (2^63 - 1)-th
@@ -4220,9 +4929,25 @@ nfs4_transform_lock_offset(struct file_lock *lock)
lock->fl_end = OFFSET_MAX;
}
-/* Hack!: For now, we're defining this just so we can use a pointer to it
- * as a unique cookie to identify our (NFSv4's) posix locks. */
+static void nfsd4_fl_get_owner(struct file_lock *dst, struct file_lock *src)
+{
+ struct nfs4_lockowner *lo = (struct nfs4_lockowner *)src->fl_owner;
+ dst->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lo->lo_owner));
+}
+
+static void nfsd4_fl_put_owner(struct file_lock *fl)
+{
+ struct nfs4_lockowner *lo = (struct nfs4_lockowner *)fl->fl_owner;
+
+ if (lo) {
+ nfs4_put_stateowner(&lo->lo_owner);
+ fl->fl_owner = NULL;
+ }
+}
+
static const struct lock_manager_operations nfsd_posix_mng_ops = {
+ .lm_get_owner = nfsd4_fl_get_owner,
+ .lm_put_owner = nfsd4_fl_put_owner,
};
static inline void
@@ -4255,47 +4980,54 @@ nevermind:
deny->ld_type = NFS4_WRITE_LT;
}
-static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, clientid_t *clid, struct xdr_netobj *owner)
+static struct nfs4_lockowner *
+find_lockowner_str_locked(clientid_t *clid, struct xdr_netobj *owner,
+ struct nfs4_client *clp)
{
- struct nfs4_ol_stateid *lst;
+ unsigned int strhashval = ownerstr_hashval(owner);
+ struct nfs4_stateowner *so;
- if (!same_owner_str(&lo->lo_owner, owner, clid))
- return false;
- if (list_empty(&lo->lo_owner.so_stateids)) {
- WARN_ON_ONCE(1);
- return false;
+ lockdep_assert_held(&clp->cl_lock);
+
+ list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[strhashval],
+ so_strhash) {
+ if (so->so_is_open_owner)
+ continue;
+ if (same_owner_str(so, owner))
+ return lockowner(nfs4_get_stateowner(so));
}
- lst = list_first_entry(&lo->lo_owner.so_stateids,
- struct nfs4_ol_stateid, st_perstateowner);
- return lst->st_file->fi_inode == inode;
+ return NULL;
}
static struct nfs4_lockowner *
-find_lockowner_str(struct inode *inode, clientid_t *clid,
- struct xdr_netobj *owner, struct nfsd_net *nn)
+find_lockowner_str(clientid_t *clid, struct xdr_netobj *owner,
+ struct nfs4_client *clp)
{
- unsigned int hashval = lockowner_ino_hashval(inode, clid->cl_id, owner);
struct nfs4_lockowner *lo;
- list_for_each_entry(lo, &nn->lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) {
- if (same_lockowner_ino(lo, inode, clid, owner))
- return lo;
- }
- return NULL;
+ spin_lock(&clp->cl_lock);
+ lo = find_lockowner_str_locked(clid, owner, clp);
+ spin_unlock(&clp->cl_lock);
+ return lo;
}
-static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp)
+static void nfs4_unhash_lockowner(struct nfs4_stateowner *sop)
{
- struct inode *inode = open_stp->st_file->fi_inode;
- unsigned int inohash = lockowner_ino_hashval(inode,
- clp->cl_clientid.cl_id, &lo->lo_owner.so_owner);
- struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+ unhash_lockowner_locked(lockowner(sop));
+}
- list_add(&lo->lo_owner.so_strhash, &nn->ownerstr_hashtbl[strhashval]);
- list_add(&lo->lo_owner_ino_hash, &nn->lockowner_ino_hashtbl[inohash]);
- list_add(&lo->lo_perstateid, &open_stp->st_lockowners);
+static void nfs4_free_lockowner(struct nfs4_stateowner *sop)
+{
+ struct nfs4_lockowner *lo = lockowner(sop);
+
+ kmem_cache_free(lockowner_slab, lo);
}
+static const struct nfs4_stateowner_operations lockowner_ops = {
+ .so_unhash = nfs4_unhash_lockowner,
+ .so_free = nfs4_free_lockowner,
+};
+
/*
* Alloc a lock owner structure.
* Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has
@@ -4303,42 +5035,106 @@ static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, s
*
* strhashval = ownerstr_hashval
*/
-
static struct nfs4_lockowner *
-alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp, struct nfsd4_lock *lock) {
- struct nfs4_lockowner *lo;
+alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp,
+ struct nfs4_ol_stateid *open_stp,
+ struct nfsd4_lock *lock)
+{
+ struct nfs4_lockowner *lo, *ret;
lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp);
if (!lo)
return NULL;
INIT_LIST_HEAD(&lo->lo_owner.so_stateids);
lo->lo_owner.so_is_open_owner = 0;
- /* It is the openowner seqid that will be incremented in encode in the
- * case of new lockowners; so increment the lock seqid manually: */
- lo->lo_owner.so_seqid = lock->lk_new_lock_seqid + 1;
- hash_lockowner(lo, strhashval, clp, open_stp);
+ lo->lo_owner.so_seqid = lock->lk_new_lock_seqid;
+ lo->lo_owner.so_ops = &lockowner_ops;
+ spin_lock(&clp->cl_lock);
+ ret = find_lockowner_str_locked(&clp->cl_clientid,
+ &lock->lk_new_owner, clp);
+ if (ret == NULL) {
+ list_add(&lo->lo_owner.so_strhash,
+ &clp->cl_ownerstr_hashtbl[strhashval]);
+ ret = lo;
+ } else
+ nfs4_free_lockowner(&lo->lo_owner);
+ spin_unlock(&clp->cl_lock);
return lo;
}
-static struct nfs4_ol_stateid *
-alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct nfs4_ol_stateid *open_stp)
+static void
+init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo,
+ struct nfs4_file *fp, struct inode *inode,
+ struct nfs4_ol_stateid *open_stp)
{
- struct nfs4_ol_stateid *stp;
struct nfs4_client *clp = lo->lo_owner.so_client;
- stp = nfs4_alloc_stateid(clp);
- if (stp == NULL)
- return NULL;
+ lockdep_assert_held(&clp->cl_lock);
+
+ atomic_inc(&stp->st_stid.sc_count);
stp->st_stid.sc_type = NFS4_LOCK_STID;
- list_add(&stp->st_perfile, &fp->fi_stateids);
- list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
- stp->st_stateowner = &lo->lo_owner;
+ stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner);
get_nfs4_file(fp);
- stp->st_file = fp;
+ stp->st_stid.sc_file = fp;
+ stp->st_stid.sc_free = nfs4_free_lock_stateid;
stp->st_access_bmap = 0;
stp->st_deny_bmap = open_stp->st_deny_bmap;
stp->st_openstp = open_stp;
- return stp;
+ list_add(&stp->st_locks, &open_stp->st_locks);
+ list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
+ spin_lock(&fp->fi_lock);
+ list_add(&stp->st_perfile, &fp->fi_stateids);
+ spin_unlock(&fp->fi_lock);
+}
+
+static struct nfs4_ol_stateid *
+find_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp)
+{
+ struct nfs4_ol_stateid *lst;
+ struct nfs4_client *clp = lo->lo_owner.so_client;
+
+ lockdep_assert_held(&clp->cl_lock);
+
+ list_for_each_entry(lst, &lo->lo_owner.so_stateids, st_perstateowner) {
+ if (lst->st_stid.sc_file == fp) {
+ atomic_inc(&lst->st_stid.sc_count);
+ return lst;
+ }
+ }
+ return NULL;
+}
+
+static struct nfs4_ol_stateid *
+find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi,
+ struct inode *inode, struct nfs4_ol_stateid *ost,
+ bool *new)
+{
+ struct nfs4_stid *ns = NULL;
+ struct nfs4_ol_stateid *lst;
+ struct nfs4_openowner *oo = openowner(ost->st_stateowner);
+ struct nfs4_client *clp = oo->oo_owner.so_client;
+
+ spin_lock(&clp->cl_lock);
+ lst = find_lock_stateid(lo, fi);
+ if (lst == NULL) {
+ spin_unlock(&clp->cl_lock);
+ ns = nfs4_alloc_stid(clp, stateid_slab);
+ if (ns == NULL)
+ return NULL;
+
+ spin_lock(&clp->cl_lock);
+ lst = find_lock_stateid(lo, fi);
+ if (likely(!lst)) {
+ lst = openlockstateid(ns);
+ init_lock_stateid(lst, lo, fi, inode, ost);
+ ns = NULL;
+ *new = true;
+ }
+ }
+ spin_unlock(&clp->cl_lock);
+ if (ns)
+ nfs4_put_stid(ns);
+ return lst;
}
static int
@@ -4350,46 +5146,53 @@ check_lock_length(u64 offset, u64 length)
static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access)
{
- struct nfs4_file *fp = lock_stp->st_file;
- int oflag = nfs4_access_to_omode(access);
+ struct nfs4_file *fp = lock_stp->st_stid.sc_file;
+
+ lockdep_assert_held(&fp->fi_lock);
if (test_access(access, lock_stp))
return;
- nfs4_file_get_access(fp, oflag);
+ __nfs4_file_get_access(fp, access);
set_access(access, lock_stp);
}
-static __be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, struct nfs4_ol_stateid **lst, bool *new)
+static __be32
+lookup_or_create_lock_state(struct nfsd4_compound_state *cstate,
+ struct nfs4_ol_stateid *ost,
+ struct nfsd4_lock *lock,
+ struct nfs4_ol_stateid **lst, bool *new)
{
- struct nfs4_file *fi = ost->st_file;
+ __be32 status;
+ struct nfs4_file *fi = ost->st_stid.sc_file;
struct nfs4_openowner *oo = openowner(ost->st_stateowner);
struct nfs4_client *cl = oo->oo_owner.so_client;
+ struct inode *inode = cstate->current_fh.fh_dentry->d_inode;
struct nfs4_lockowner *lo;
unsigned int strhashval;
- struct nfsd_net *nn = net_generic(cl->net, nfsd_net_id);
- lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid,
- &lock->v.new.owner, nn);
- if (lo) {
- if (!cstate->minorversion)
- return nfserr_bad_seqid;
- /* XXX: a lockowner always has exactly one stateid: */
- *lst = list_first_entry(&lo->lo_owner.so_stateids,
- struct nfs4_ol_stateid, st_perstateowner);
- return nfs_ok;
+ lo = find_lockowner_str(&cl->cl_clientid, &lock->v.new.owner, cl);
+ if (!lo) {
+ strhashval = ownerstr_hashval(&lock->v.new.owner);
+ lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock);
+ if (lo == NULL)
+ return nfserr_jukebox;
+ } else {
+ /* with an existing lockowner, seqids must be the same */
+ status = nfserr_bad_seqid;
+ if (!cstate->minorversion &&
+ lock->lk_new_lock_seqid != lo->lo_owner.so_seqid)
+ goto out;
}
- strhashval = ownerstr_hashval(cl->cl_clientid.cl_id,
- &lock->v.new.owner);
- lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock);
- if (lo == NULL)
- return nfserr_jukebox;
- *lst = alloc_init_lock_stateid(lo, fi, ost);
+
+ *lst = find_or_create_lock_stateid(lo, fi, inode, ost, new);
if (*lst == NULL) {
- release_lockowner(lo);
- return nfserr_jukebox;
+ status = nfserr_jukebox;
+ goto out;
}
- *new = true;
- return nfs_ok;
+ status = nfs_ok;
+out:
+ nfs4_put_stateowner(&lo->lo_owner);
+ return status;
}
/*
@@ -4401,14 +5204,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
struct nfs4_openowner *open_sop = NULL;
struct nfs4_lockowner *lock_sop = NULL;
- struct nfs4_ol_stateid *lock_stp;
+ struct nfs4_ol_stateid *lock_stp = NULL;
+ struct nfs4_ol_stateid *open_stp = NULL;
+ struct nfs4_file *fp;
struct file *filp = NULL;
struct file_lock *file_lock = NULL;
struct file_lock *conflock = NULL;
__be32 status = 0;
- bool new_state = false;
int lkflg;
int err;
+ bool new = false;
struct net *net = SVC_NET(rqstp);
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -4425,11 +5230,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return status;
}
- nfs4_lock_state();
-
if (lock->lk_is_new) {
- struct nfs4_ol_stateid *open_stp = NULL;
-
if (nfsd4_has_session(cstate))
/* See rfc 5661 18.10.3: given clientid is ignored: */
memcpy(&lock->v.new.clientid,
@@ -4453,12 +5254,13 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
&lock->v.new.clientid))
goto out;
status = lookup_or_create_lock_state(cstate, open_stp, lock,
- &lock_stp, &new_state);
- } else
+ &lock_stp, &new);
+ } else {
status = nfs4_preprocess_seqid_op(cstate,
lock->lk_old_lock_seqid,
&lock->lk_old_lock_stateid,
NFS4_LOCK_STID, &lock_stp, nn);
+ }
if (status)
goto out;
lock_sop = lockowner(lock_stp->st_stateowner);
@@ -4482,20 +5284,24 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
}
- locks_init_lock(file_lock);
+ fp = lock_stp->st_stid.sc_file;
switch (lock->lk_type) {
case NFS4_READ_LT:
case NFS4_READW_LT:
- filp = find_readable_file(lock_stp->st_file);
+ spin_lock(&fp->fi_lock);
+ filp = find_readable_file_locked(fp);
if (filp)
get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ);
+ spin_unlock(&fp->fi_lock);
file_lock->fl_type = F_RDLCK;
break;
case NFS4_WRITE_LT:
case NFS4_WRITEW_LT:
- filp = find_writeable_file(lock_stp->st_file);
+ spin_lock(&fp->fi_lock);
+ filp = find_writeable_file_locked(fp);
if (filp)
get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE);
+ spin_unlock(&fp->fi_lock);
file_lock->fl_type = F_WRLCK;
break;
default:
@@ -4506,7 +5312,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfserr_openmode;
goto out;
}
- file_lock->fl_owner = (fl_owner_t)lock_sop;
+
+ file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner));
file_lock->fl_pid = current->tgid;
file_lock->fl_file = filp;
file_lock->fl_flags = FL_POSIX;
@@ -4544,11 +5351,27 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
break;
}
out:
- if (status && new_state)
- release_lockowner(lock_sop);
+ if (filp)
+ fput(filp);
+ if (lock_stp) {
+ /* Bump seqid manually if the 4.0 replay owner is openowner */
+ if (cstate->replay_owner &&
+ cstate->replay_owner != &lock_sop->lo_owner &&
+ seqid_mutating_err(ntohl(status)))
+ lock_sop->lo_owner.so_seqid++;
+
+ /*
+ * If this is a new, never-before-used stateid, and we are
+ * returning an error, then just go ahead and release it.
+ */
+ if (status && new)
+ release_lock_stateid(lock_stp);
+
+ nfs4_put_stid(&lock_stp->st_stid);
+ }
+ if (open_stp)
+ nfs4_put_stid(&open_stp->st_stid);
nfsd4_bump_seqid(cstate, status);
- if (!cstate->replay_owner)
- nfs4_unlock_state();
if (file_lock)
locks_free_lock(file_lock);
if (conflock)
@@ -4580,9 +5403,8 @@ __be32
nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_lockt *lockt)
{
- struct inode *inode;
struct file_lock *file_lock = NULL;
- struct nfs4_lockowner *lo;
+ struct nfs4_lockowner *lo = NULL;
__be32 status;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
@@ -4592,10 +5414,8 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (check_lock_length(lockt->lt_offset, lockt->lt_length))
return nfserr_inval;
- nfs4_lock_state();
-
if (!nfsd4_has_session(cstate)) {
- status = lookup_clientid(&lockt->lt_clientid, false, nn, NULL);
+ status = lookup_clientid(&lockt->lt_clientid, cstate, nn);
if (status)
goto out;
}
@@ -4603,14 +5423,13 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
goto out;
- inode = cstate->current_fh.fh_dentry->d_inode;
file_lock = locks_alloc_lock();
if (!file_lock) {
dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
status = nfserr_jukebox;
goto out;
}
- locks_init_lock(file_lock);
+
switch (lockt->lt_type) {
case NFS4_READ_LT:
case NFS4_READW_LT:
@@ -4626,7 +5445,8 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
}
- lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner, nn);
+ lo = find_lockowner_str(&lockt->lt_clientid, &lockt->lt_owner,
+ cstate->clp);
if (lo)
file_lock->fl_owner = (fl_owner_t)lo;
file_lock->fl_pid = current->tgid;
@@ -4646,7 +5466,8 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfs4_set_lock_denied(file_lock, &lockt->lt_denied);
}
out:
- nfs4_unlock_state();
+ if (lo)
+ nfs4_put_stateowner(&lo->lo_owner);
if (file_lock)
locks_free_lock(file_lock);
return status;
@@ -4670,27 +5491,25 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (check_lock_length(locku->lu_offset, locku->lu_length))
return nfserr_inval;
- nfs4_lock_state();
-
status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid,
&locku->lu_stateid, NFS4_LOCK_STID,
&stp, nn);
if (status)
goto out;
- filp = find_any_file(stp->st_file);
+ filp = find_any_file(stp->st_stid.sc_file);
if (!filp) {
status = nfserr_lock_range;
- goto out;
+ goto put_stateid;
}
file_lock = locks_alloc_lock();
if (!file_lock) {
dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
status = nfserr_jukebox;
- goto out;
+ goto fput;
}
- locks_init_lock(file_lock);
+
file_lock->fl_type = F_UNLCK;
- file_lock->fl_owner = (fl_owner_t)lockowner(stp->st_stateowner);
+ file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner));
file_lock->fl_pid = current->tgid;
file_lock->fl_file = filp;
file_lock->fl_flags = FL_POSIX;
@@ -4708,41 +5527,51 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
}
update_stateid(&stp->st_stid.sc_stateid);
memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
-
+fput:
+ fput(filp);
+put_stateid:
+ nfs4_put_stid(&stp->st_stid);
out:
nfsd4_bump_seqid(cstate, status);
- if (!cstate->replay_owner)
- nfs4_unlock_state();
if (file_lock)
locks_free_lock(file_lock);
return status;
out_nfserr:
status = nfserrno(err);
- goto out;
+ goto fput;
}
/*
* returns
- * 1: locks held by lockowner
- * 0: no locks held by lockowner
+ * true: locks held by lockowner
+ * false: no locks held by lockowner
*/
-static int
-check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner)
+static bool
+check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
{
struct file_lock **flpp;
- struct inode *inode = filp->fi_inode;
- int status = 0;
+ int status = false;
+ struct file *filp = find_any_file(fp);
+ struct inode *inode;
+
+ if (!filp) {
+ /* Any valid lock stateid should have some sort of access */
+ WARN_ON_ONCE(1);
+ return status;
+ }
+
+ inode = file_inode(filp);
spin_lock(&inode->i_lock);
for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) {
if ((*flpp)->fl_owner == (fl_owner_t)lowner) {
- status = 1;
- goto out;
+ status = true;
+ break;
}
}
-out:
spin_unlock(&inode->i_lock);
+ fput(filp);
return status;
}
@@ -4753,53 +5582,46 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
{
clientid_t *clid = &rlockowner->rl_clientid;
struct nfs4_stateowner *sop;
- struct nfs4_lockowner *lo;
+ struct nfs4_lockowner *lo = NULL;
struct nfs4_ol_stateid *stp;
struct xdr_netobj *owner = &rlockowner->rl_owner;
- struct list_head matches;
- unsigned int hashval = ownerstr_hashval(clid->cl_id, owner);
+ unsigned int hashval = ownerstr_hashval(owner);
__be32 status;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ struct nfs4_client *clp;
dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
clid->cl_boot, clid->cl_id);
- nfs4_lock_state();
-
- status = lookup_clientid(clid, cstate->minorversion, nn, NULL);
+ status = lookup_clientid(clid, cstate, nn);
if (status)
- goto out;
+ return status;
- status = nfserr_locks_held;
- INIT_LIST_HEAD(&matches);
+ clp = cstate->clp;
+ /* Find the matching lock stateowner */
+ spin_lock(&clp->cl_lock);
+ list_for_each_entry(sop, &clp->cl_ownerstr_hashtbl[hashval],
+ so_strhash) {
- list_for_each_entry(sop, &nn->ownerstr_hashtbl[hashval], so_strhash) {
- if (sop->so_is_open_owner)
+ if (sop->so_is_open_owner || !same_owner_str(sop, owner))
continue;
- if (!same_owner_str(sop, owner, clid))
- continue;
- list_for_each_entry(stp, &sop->so_stateids,
- st_perstateowner) {
- lo = lockowner(sop);
- if (check_for_locks(stp->st_file, lo))
- goto out;
- list_add(&lo->lo_list, &matches);
+
+ /* see if there are still any locks associated with it */
+ lo = lockowner(sop);
+ list_for_each_entry(stp, &sop->so_stateids, st_perstateowner) {
+ if (check_for_locks(stp->st_stid.sc_file, lo)) {
+ status = nfserr_locks_held;
+ spin_unlock(&clp->cl_lock);
+ return status;
+ }
}
+
+ nfs4_get_stateowner(sop);
+ break;
}
- /* Clients probably won't expect us to return with some (but not all)
- * of the lockowner state released; so don't release any until all
- * have been checked. */
- status = nfs_ok;
- while (!list_empty(&matches)) {
- lo = list_entry(matches.next, struct nfs4_lockowner,
- lo_list);
- /* unhash_stateowner deletes so_perclient only
- * for openowners. */
- list_del(&lo->lo_list);
+ spin_unlock(&clp->cl_lock);
+ if (lo)
release_lockowner(lo);
- }
-out:
- nfs4_unlock_state();
return status;
}
@@ -4887,34 +5709,126 @@ nfsd4_find_reclaim_client(const char *recdir, struct nfsd_net *nn)
* Called from OPEN. Look for clientid in reclaim list.
*/
__be32
-nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn)
+nfs4_check_open_reclaim(clientid_t *clid,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd_net *nn)
{
- struct nfs4_client *clp;
+ __be32 status;
/* find clientid in conf_id_hashtbl */
- clp = find_confirmed_client(clid, sessions, nn);
- if (clp == NULL)
+ status = lookup_clientid(clid, cstate, nn);
+ if (status)
+ return nfserr_reclaim_bad;
+
+ if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags))
+ return nfserr_no_grace;
+
+ if (nfsd4_client_record_check(cstate->clp))
return nfserr_reclaim_bad;
- return nfsd4_client_record_check(clp) ? nfserr_reclaim_bad : nfs_ok;
+ return nfs_ok;
}
#ifdef CONFIG_NFSD_FAULT_INJECTION
+static inline void
+put_client(struct nfs4_client *clp)
+{
+ atomic_dec(&clp->cl_refcount);
+}
-u64 nfsd_forget_client(struct nfs4_client *clp, u64 max)
+static struct nfs4_client *
+nfsd_find_client(struct sockaddr_storage *addr, size_t addr_size)
{
- if (mark_client_expired(clp))
- return 0;
- expire_client(clp);
- return 1;
+ struct nfs4_client *clp;
+ struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+ nfsd_net_id);
+
+ if (!nfsd_netns_ready(nn))
+ return NULL;
+
+ list_for_each_entry(clp, &nn->client_lru, cl_lru) {
+ if (memcmp(&clp->cl_addr, addr, addr_size) == 0)
+ return clp;
+ }
+ return NULL;
}
-u64 nfsd_print_client(struct nfs4_client *clp, u64 num)
+u64
+nfsd_inject_print_clients(void)
{
+ struct nfs4_client *clp;
+ u64 count = 0;
+ struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+ nfsd_net_id);
char buf[INET6_ADDRSTRLEN];
- rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
- printk(KERN_INFO "NFS Client: %s\n", buf);
- return 1;
+
+ if (!nfsd_netns_ready(nn))
+ return 0;
+
+ spin_lock(&nn->client_lock);
+ list_for_each_entry(clp, &nn->client_lru, cl_lru) {
+ rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
+ pr_info("NFS Client: %s\n", buf);
+ ++count;
+ }
+ spin_unlock(&nn->client_lock);
+
+ return count;
+}
+
+u64
+nfsd_inject_forget_client(struct sockaddr_storage *addr, size_t addr_size)
+{
+ u64 count = 0;
+ struct nfs4_client *clp;
+ struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+ nfsd_net_id);
+
+ if (!nfsd_netns_ready(nn))
+ return count;
+
+ spin_lock(&nn->client_lock);
+ clp = nfsd_find_client(addr, addr_size);
+ if (clp) {
+ if (mark_client_expired_locked(clp) == nfs_ok)
+ ++count;
+ else
+ clp = NULL;
+ }
+ spin_unlock(&nn->client_lock);
+
+ if (clp)
+ expire_client(clp);
+
+ return count;
+}
+
+u64
+nfsd_inject_forget_clients(u64 max)
+{
+ u64 count = 0;
+ struct nfs4_client *clp, *next;
+ struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+ nfsd_net_id);
+ LIST_HEAD(reaplist);
+
+ if (!nfsd_netns_ready(nn))
+ return count;
+
+ spin_lock(&nn->client_lock);
+ list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) {
+ if (mark_client_expired_locked(clp) == nfs_ok) {
+ list_add(&clp->cl_lru, &reaplist);
+ if (max != 0 && ++count >= max)
+ break;
+ }
+ }
+ spin_unlock(&nn->client_lock);
+
+ list_for_each_entry_safe(clp, next, &reaplist, cl_lru)
+ expire_client(clp);
+
+ return count;
}
static void nfsd_print_count(struct nfs4_client *clp, unsigned int count,
@@ -4925,158 +5839,484 @@ static void nfsd_print_count(struct nfs4_client *clp, unsigned int count,
printk(KERN_INFO "NFS Client: %s has %u %s\n", buf, count, type);
}
-static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max, void (*func)(struct nfs4_lockowner *))
+static void
+nfsd_inject_add_lock_to_list(struct nfs4_ol_stateid *lst,
+ struct list_head *collect)
+{
+ struct nfs4_client *clp = lst->st_stid.sc_client;
+ struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+ nfsd_net_id);
+
+ if (!collect)
+ return;
+
+ lockdep_assert_held(&nn->client_lock);
+ atomic_inc(&clp->cl_refcount);
+ list_add(&lst->st_locks, collect);
+}
+
+static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max,
+ struct list_head *collect,
+ void (*func)(struct nfs4_ol_stateid *))
{
struct nfs4_openowner *oop;
- struct nfs4_lockowner *lop, *lo_next;
struct nfs4_ol_stateid *stp, *st_next;
+ struct nfs4_ol_stateid *lst, *lst_next;
u64 count = 0;
+ spin_lock(&clp->cl_lock);
list_for_each_entry(oop, &clp->cl_openowners, oo_perclient) {
- list_for_each_entry_safe(stp, st_next, &oop->oo_owner.so_stateids, st_perstateowner) {
- list_for_each_entry_safe(lop, lo_next, &stp->st_lockowners, lo_perstateid) {
- if (func)
- func(lop);
- if (++count == max)
- return count;
+ list_for_each_entry_safe(stp, st_next,
+ &oop->oo_owner.so_stateids, st_perstateowner) {
+ list_for_each_entry_safe(lst, lst_next,
+ &stp->st_locks, st_locks) {
+ if (func) {
+ func(lst);
+ nfsd_inject_add_lock_to_list(lst,
+ collect);
+ }
+ ++count;
+ /*
+ * Despite the fact that these functions deal
+ * with 64-bit integers for "count", we must
+ * ensure that it doesn't blow up the
+ * clp->cl_refcount. Throw a warning if we
+ * start to approach INT_MAX here.
+ */
+ WARN_ON_ONCE(count == (INT_MAX / 2));
+ if (count == max)
+ goto out;
}
}
}
+out:
+ spin_unlock(&clp->cl_lock);
return count;
}
-u64 nfsd_forget_client_locks(struct nfs4_client *clp, u64 max)
+static u64
+nfsd_collect_client_locks(struct nfs4_client *clp, struct list_head *collect,
+ u64 max)
{
- return nfsd_foreach_client_lock(clp, max, release_lockowner);
+ return nfsd_foreach_client_lock(clp, max, collect, unhash_lock_stateid);
}
-u64 nfsd_print_client_locks(struct nfs4_client *clp, u64 max)
+static u64
+nfsd_print_client_locks(struct nfs4_client *clp)
{
- u64 count = nfsd_foreach_client_lock(clp, max, NULL);
+ u64 count = nfsd_foreach_client_lock(clp, 0, NULL, NULL);
nfsd_print_count(clp, count, "locked files");
return count;
}
-static u64 nfsd_foreach_client_open(struct nfs4_client *clp, u64 max, void (*func)(struct nfs4_openowner *))
+u64
+nfsd_inject_print_locks(void)
+{
+ struct nfs4_client *clp;
+ u64 count = 0;
+ struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+ nfsd_net_id);
+
+ if (!nfsd_netns_ready(nn))
+ return 0;
+
+ spin_lock(&nn->client_lock);
+ list_for_each_entry(clp, &nn->client_lru, cl_lru)
+ count += nfsd_print_client_locks(clp);
+ spin_unlock(&nn->client_lock);
+
+ return count;
+}
+
+static void
+nfsd_reap_locks(struct list_head *reaplist)
+{
+ struct nfs4_client *clp;
+ struct nfs4_ol_stateid *stp, *next;
+
+ list_for_each_entry_safe(stp, next, reaplist, st_locks) {
+ list_del_init(&stp->st_locks);
+ clp = stp->st_stid.sc_client;
+ nfs4_put_stid(&stp->st_stid);
+ put_client(clp);
+ }
+}
+
+u64
+nfsd_inject_forget_client_locks(struct sockaddr_storage *addr, size_t addr_size)
+{
+ unsigned int count = 0;
+ struct nfs4_client *clp;
+ struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+ nfsd_net_id);
+ LIST_HEAD(reaplist);
+
+ if (!nfsd_netns_ready(nn))
+ return count;
+
+ spin_lock(&nn->client_lock);
+ clp = nfsd_find_client(addr, addr_size);
+ if (clp)
+ count = nfsd_collect_client_locks(clp, &reaplist, 0);
+ spin_unlock(&nn->client_lock);
+ nfsd_reap_locks(&reaplist);
+ return count;
+}
+
+u64
+nfsd_inject_forget_locks(u64 max)
+{
+ u64 count = 0;
+ struct nfs4_client *clp;
+ struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+ nfsd_net_id);
+ LIST_HEAD(reaplist);
+
+ if (!nfsd_netns_ready(nn))
+ return count;
+
+ spin_lock(&nn->client_lock);
+ list_for_each_entry(clp, &nn->client_lru, cl_lru) {
+ count += nfsd_collect_client_locks(clp, &reaplist, max - count);
+ if (max != 0 && count >= max)
+ break;
+ }
+ spin_unlock(&nn->client_lock);
+ nfsd_reap_locks(&reaplist);
+ return count;
+}
+
+static u64
+nfsd_foreach_client_openowner(struct nfs4_client *clp, u64 max,
+ struct list_head *collect,
+ void (*func)(struct nfs4_openowner *))
{
struct nfs4_openowner *oop, *next;
+ struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+ nfsd_net_id);
u64 count = 0;
+ lockdep_assert_held(&nn->client_lock);
+
+ spin_lock(&clp->cl_lock);
list_for_each_entry_safe(oop, next, &clp->cl_openowners, oo_perclient) {
- if (func)
+ if (func) {
func(oop);
- if (++count == max)
+ if (collect) {
+ atomic_inc(&clp->cl_refcount);
+ list_add(&oop->oo_perclient, collect);
+ }
+ }
+ ++count;
+ /*
+ * Despite the fact that these functions deal with
+ * 64-bit integers for "count", we must ensure that
+ * it doesn't blow up the clp->cl_refcount. Throw a
+ * warning if we start to approach INT_MAX here.
+ */
+ WARN_ON_ONCE(count == (INT_MAX / 2));
+ if (count == max)
break;
}
+ spin_unlock(&clp->cl_lock);
return count;
}
-u64 nfsd_forget_client_openowners(struct nfs4_client *clp, u64 max)
+static u64
+nfsd_print_client_openowners(struct nfs4_client *clp)
{
- return nfsd_foreach_client_open(clp, max, release_openowner);
+ u64 count = nfsd_foreach_client_openowner(clp, 0, NULL, NULL);
+
+ nfsd_print_count(clp, count, "openowners");
+ return count;
}
-u64 nfsd_print_client_openowners(struct nfs4_client *clp, u64 max)
+static u64
+nfsd_collect_client_openowners(struct nfs4_client *clp,
+ struct list_head *collect, u64 max)
{
- u64 count = nfsd_foreach_client_open(clp, max, NULL);
- nfsd_print_count(clp, count, "open files");
- return count;
+ return nfsd_foreach_client_openowner(clp, max, collect,
+ unhash_openowner_locked);
}
-static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max,
- struct list_head *victims)
+u64
+nfsd_inject_print_openowners(void)
{
- struct nfs4_delegation *dp, *next;
+ struct nfs4_client *clp;
u64 count = 0;
+ struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+ nfsd_net_id);
+
+ if (!nfsd_netns_ready(nn))
+ return 0;
+
+ spin_lock(&nn->client_lock);
+ list_for_each_entry(clp, &nn->client_lru, cl_lru)
+ count += nfsd_print_client_openowners(clp);
+ spin_unlock(&nn->client_lock);
- lockdep_assert_held(&state_lock);
- list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) {
- if (victims)
- list_move(&dp->dl_recall_lru, victims);
- if (++count == max)
- break;
- }
return count;
}
-u64 nfsd_forget_client_delegations(struct nfs4_client *clp, u64 max)
+static void
+nfsd_reap_openowners(struct list_head *reaplist)
{
- struct nfs4_delegation *dp, *next;
- LIST_HEAD(victims);
- u64 count;
+ struct nfs4_client *clp;
+ struct nfs4_openowner *oop, *next;
- spin_lock(&state_lock);
- count = nfsd_find_all_delegations(clp, max, &victims);
- spin_unlock(&state_lock);
+ list_for_each_entry_safe(oop, next, reaplist, oo_perclient) {
+ list_del_init(&oop->oo_perclient);
+ clp = oop->oo_owner.so_client;
+ release_openowner(oop);
+ put_client(clp);
+ }
+}
- list_for_each_entry_safe(dp, next, &victims, dl_recall_lru)
- revoke_delegation(dp);
+u64
+nfsd_inject_forget_client_openowners(struct sockaddr_storage *addr,
+ size_t addr_size)
+{
+ unsigned int count = 0;
+ struct nfs4_client *clp;
+ struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+ nfsd_net_id);
+ LIST_HEAD(reaplist);
+ if (!nfsd_netns_ready(nn))
+ return count;
+
+ spin_lock(&nn->client_lock);
+ clp = nfsd_find_client(addr, addr_size);
+ if (clp)
+ count = nfsd_collect_client_openowners(clp, &reaplist, 0);
+ spin_unlock(&nn->client_lock);
+ nfsd_reap_openowners(&reaplist);
return count;
}
-u64 nfsd_recall_client_delegations(struct nfs4_client *clp, u64 max)
+u64
+nfsd_inject_forget_openowners(u64 max)
{
- struct nfs4_delegation *dp, *next;
- LIST_HEAD(victims);
- u64 count;
+ u64 count = 0;
+ struct nfs4_client *clp;
+ struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+ nfsd_net_id);
+ LIST_HEAD(reaplist);
- spin_lock(&state_lock);
- count = nfsd_find_all_delegations(clp, max, &victims);
- list_for_each_entry_safe(dp, next, &victims, dl_recall_lru)
- nfsd_break_one_deleg(dp);
- spin_unlock(&state_lock);
+ if (!nfsd_netns_ready(nn))
+ return count;
+ spin_lock(&nn->client_lock);
+ list_for_each_entry(clp, &nn->client_lru, cl_lru) {
+ count += nfsd_collect_client_openowners(clp, &reaplist,
+ max - count);
+ if (max != 0 && count >= max)
+ break;
+ }
+ spin_unlock(&nn->client_lock);
+ nfsd_reap_openowners(&reaplist);
return count;
}
-u64 nfsd_print_client_delegations(struct nfs4_client *clp, u64 max)
+static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max,
+ struct list_head *victims)
{
+ struct nfs4_delegation *dp, *next;
+ struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+ nfsd_net_id);
u64 count = 0;
+ lockdep_assert_held(&nn->client_lock);
+
spin_lock(&state_lock);
- count = nfsd_find_all_delegations(clp, max, NULL);
+ list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) {
+ if (victims) {
+ /*
+ * It's not safe to mess with delegations that have a
+ * non-zero dl_time. They might have already been broken
+ * and could be processed by the laundromat outside of
+ * the state_lock. Just leave them be.
+ */
+ if (dp->dl_time != 0)
+ continue;
+
+ atomic_inc(&clp->cl_refcount);
+ unhash_delegation_locked(dp);
+ list_add(&dp->dl_recall_lru, victims);
+ }
+ ++count;
+ /*
+ * Despite the fact that these functions deal with
+ * 64-bit integers for "count", we must ensure that
+ * it doesn't blow up the clp->cl_refcount. Throw a
+ * warning if we start to approach INT_MAX here.
+ */
+ WARN_ON_ONCE(count == (INT_MAX / 2));
+ if (count == max)
+ break;
+ }
spin_unlock(&state_lock);
+ return count;
+}
+
+static u64
+nfsd_print_client_delegations(struct nfs4_client *clp)
+{
+ u64 count = nfsd_find_all_delegations(clp, 0, NULL);
nfsd_print_count(clp, count, "delegations");
return count;
}
-u64 nfsd_for_n_state(u64 max, u64 (*func)(struct nfs4_client *, u64))
+u64
+nfsd_inject_print_delegations(void)
{
- struct nfs4_client *clp, *next;
+ struct nfs4_client *clp;
u64 count = 0;
- struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+ nfsd_net_id);
if (!nfsd_netns_ready(nn))
return 0;
- list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) {
- count += func(clp, max - count);
- if ((max != 0) && (count >= max))
- break;
+ spin_lock(&nn->client_lock);
+ list_for_each_entry(clp, &nn->client_lru, cl_lru)
+ count += nfsd_print_client_delegations(clp);
+ spin_unlock(&nn->client_lock);
+
+ return count;
+}
+
+static void
+nfsd_forget_delegations(struct list_head *reaplist)
+{
+ struct nfs4_client *clp;
+ struct nfs4_delegation *dp, *next;
+
+ list_for_each_entry_safe(dp, next, reaplist, dl_recall_lru) {
+ list_del_init(&dp->dl_recall_lru);
+ clp = dp->dl_stid.sc_client;
+ revoke_delegation(dp);
+ put_client(clp);
}
+}
+
+u64
+nfsd_inject_forget_client_delegations(struct sockaddr_storage *addr,
+ size_t addr_size)
+{
+ u64 count = 0;
+ struct nfs4_client *clp;
+ struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+ nfsd_net_id);
+ LIST_HEAD(reaplist);
+ if (!nfsd_netns_ready(nn))
+ return count;
+
+ spin_lock(&nn->client_lock);
+ clp = nfsd_find_client(addr, addr_size);
+ if (clp)
+ count = nfsd_find_all_delegations(clp, 0, &reaplist);
+ spin_unlock(&nn->client_lock);
+
+ nfsd_forget_delegations(&reaplist);
return count;
}
-struct nfs4_client *nfsd_find_client(struct sockaddr_storage *addr, size_t addr_size)
+u64
+nfsd_inject_forget_delegations(u64 max)
{
+ u64 count = 0;
struct nfs4_client *clp;
- struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+ nfsd_net_id);
+ LIST_HEAD(reaplist);
if (!nfsd_netns_ready(nn))
- return NULL;
+ return count;
+ spin_lock(&nn->client_lock);
list_for_each_entry(clp, &nn->client_lru, cl_lru) {
- if (memcmp(&clp->cl_addr, addr, addr_size) == 0)
- return clp;
+ count += nfsd_find_all_delegations(clp, max - count, &reaplist);
+ if (max != 0 && count >= max)
+ break;
}
- return NULL;
+ spin_unlock(&nn->client_lock);
+ nfsd_forget_delegations(&reaplist);
+ return count;
+}
+
+static void
+nfsd_recall_delegations(struct list_head *reaplist)
+{
+ struct nfs4_client *clp;
+ struct nfs4_delegation *dp, *next;
+
+ list_for_each_entry_safe(dp, next, reaplist, dl_recall_lru) {
+ list_del_init(&dp->dl_recall_lru);
+ clp = dp->dl_stid.sc_client;
+ /*
+ * We skipped all entries that had a zero dl_time before,
+ * so we can now reset the dl_time back to 0. If a delegation
+ * break comes in now, then it won't make any difference since
+ * we're recalling it either way.
+ */
+ spin_lock(&state_lock);
+ dp->dl_time = 0;
+ spin_unlock(&state_lock);
+ nfsd_break_one_deleg(dp);
+ put_client(clp);
+ }
+}
+
+u64
+nfsd_inject_recall_client_delegations(struct sockaddr_storage *addr,
+ size_t addr_size)
+{
+ u64 count = 0;
+ struct nfs4_client *clp;
+ struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+ nfsd_net_id);
+ LIST_HEAD(reaplist);
+
+ if (!nfsd_netns_ready(nn))
+ return count;
+
+ spin_lock(&nn->client_lock);
+ clp = nfsd_find_client(addr, addr_size);
+ if (clp)
+ count = nfsd_find_all_delegations(clp, 0, &reaplist);
+ spin_unlock(&nn->client_lock);
+
+ nfsd_recall_delegations(&reaplist);
+ return count;
}
+u64
+nfsd_inject_recall_delegations(u64 max)
+{
+ u64 count = 0;
+ struct nfs4_client *clp, *next;
+ struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+ nfsd_net_id);
+ LIST_HEAD(reaplist);
+
+ if (!nfsd_netns_ready(nn))
+ return count;
+
+ spin_lock(&nn->client_lock);
+ list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) {
+ count += nfsd_find_all_delegations(clp, max - count, &reaplist);
+ if (max != 0 && ++count >= max)
+ break;
+ }
+ spin_unlock(&nn->client_lock);
+ nfsd_recall_delegations(&reaplist);
+ return count;
+}
#endif /* CONFIG_NFSD_FAULT_INJECTION */
/*
@@ -5113,14 +6353,6 @@ static int nfs4_state_create_net(struct net *net)
CLIENT_HASH_SIZE, GFP_KERNEL);
if (!nn->unconf_id_hashtbl)
goto err_unconf_id;
- nn->ownerstr_hashtbl = kmalloc(sizeof(struct list_head) *
- OWNER_HASH_SIZE, GFP_KERNEL);
- if (!nn->ownerstr_hashtbl)
- goto err_ownerstr;
- nn->lockowner_ino_hashtbl = kmalloc(sizeof(struct list_head) *
- LOCKOWNER_INO_HASH_SIZE, GFP_KERNEL);
- if (!nn->lockowner_ino_hashtbl)
- goto err_lockowner_ino;
nn->sessionid_hashtbl = kmalloc(sizeof(struct list_head) *
SESSION_HASH_SIZE, GFP_KERNEL);
if (!nn->sessionid_hashtbl)
@@ -5130,10 +6362,6 @@ static int nfs4_state_create_net(struct net *net)
INIT_LIST_HEAD(&nn->conf_id_hashtbl[i]);
INIT_LIST_HEAD(&nn->unconf_id_hashtbl[i]);
}
- for (i = 0; i < OWNER_HASH_SIZE; i++)
- INIT_LIST_HEAD(&nn->ownerstr_hashtbl[i]);
- for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++)
- INIT_LIST_HEAD(&nn->lockowner_ino_hashtbl[i]);
for (i = 0; i < SESSION_HASH_SIZE; i++)
INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]);
nn->conf_name_tree = RB_ROOT;
@@ -5149,10 +6377,6 @@ static int nfs4_state_create_net(struct net *net)
return 0;
err_sessionid:
- kfree(nn->lockowner_ino_hashtbl);
-err_lockowner_ino:
- kfree(nn->ownerstr_hashtbl);
-err_ownerstr:
kfree(nn->unconf_id_hashtbl);
err_unconf_id:
kfree(nn->conf_id_hashtbl);
@@ -5182,8 +6406,6 @@ nfs4_state_destroy_net(struct net *net)
}
kfree(nn->sessionid_hashtbl);
- kfree(nn->lockowner_ino_hashtbl);
- kfree(nn->ownerstr_hashtbl);
kfree(nn->unconf_id_hashtbl);
kfree(nn->conf_id_hashtbl);
put_net(net);
@@ -5198,10 +6420,10 @@ nfs4_state_start_net(struct net *net)
ret = nfs4_state_create_net(net);
if (ret)
return ret;
- nfsd4_client_tracking_init(net);
nn->boot_time = get_seconds();
- locks_start_grace(net, &nn->nfsd4_manager);
nn->grace_ended = false;
+ locks_start_grace(net, &nn->nfsd4_manager);
+ nfsd4_client_tracking_init(net);
printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n",
nn->nfsd4_grace, net);
queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ);
@@ -5247,22 +6469,23 @@ nfs4_state_shutdown_net(struct net *net)
cancel_delayed_work_sync(&nn->laundromat_work);
locks_end_grace(&nn->nfsd4_manager);
- nfs4_lock_state();
INIT_LIST_HEAD(&reaplist);
spin_lock(&state_lock);
list_for_each_safe(pos, next, &nn->del_recall_lru) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
- list_move(&dp->dl_recall_lru, &reaplist);
+ unhash_delegation_locked(dp);
+ list_add(&dp->dl_recall_lru, &reaplist);
}
spin_unlock(&state_lock);
list_for_each_safe(pos, next, &reaplist) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
- destroy_delegation(dp);
+ list_del_init(&dp->dl_recall_lru);
+ nfs4_put_deleg_lease(dp->dl_stid.sc_file);
+ nfs4_put_stid(&dp->dl_stid);
}
nfsd4_client_tracking_exit(net);
nfs4_state_destroy_net(net);
- nfs4_unlock_state();
}
void
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 944275c8f56d..eeea7a90eb87 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -31,13 +31,6 @@
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * TODO: Neil Brown made the following observation: We currently
- * initially reserve NFSD_BUFSIZE space on the transmit queue and
- * never release any of that until the request is complete.
- * It would be good to calculate a new maximum response size while
- * decoding the COMPOUND, and call svc_reserve with this number
- * at the end of nfs4svc_decode_compoundargs.
*/
#include <linux/slab.h>
@@ -181,28 +174,43 @@ static int zero_clientid(clientid_t *clid)
}
/**
- * defer_free - mark an allocation as deferred freed
- * @argp: NFSv4 compound argument structure to be freed with
- * @release: release callback to free @p, typically kfree()
- * @p: pointer to be freed
+ * svcxdr_tmpalloc - allocate memory to be freed after compound processing
+ * @argp: NFSv4 compound argument structure
+ * @p: pointer to be freed (with kfree())
*
* Marks @p to be freed when processing the compound operation
* described in @argp finishes.
*/
-static int
-defer_free(struct nfsd4_compoundargs *argp,
- void (*release)(const void *), void *p)
+static void *
+svcxdr_tmpalloc(struct nfsd4_compoundargs *argp, u32 len)
{
- struct tmpbuf *tb;
+ struct svcxdr_tmpbuf *tb;
- tb = kmalloc(sizeof(*tb), GFP_KERNEL);
+ tb = kmalloc(sizeof(*tb) + len, GFP_KERNEL);
if (!tb)
- return -ENOMEM;
- tb->buf = p;
- tb->release = release;
+ return NULL;
tb->next = argp->to_free;
argp->to_free = tb;
- return 0;
+ return tb->buf;
+}
+
+/*
+ * For xdr strings that need to be passed to other kernel api's
+ * as null-terminated strings.
+ *
+ * Note null-terminating in place usually isn't safe since the
+ * buffer might end on a page boundary.
+ */
+static char *
+svcxdr_dupstr(struct nfsd4_compoundargs *argp, void *buf, u32 len)
+{
+ char *p = svcxdr_tmpalloc(argp, len + 1);
+
+ if (!p)
+ return NULL;
+ memcpy(p, buf, len);
+ p[len] = '\0';
+ return p;
}
/**
@@ -217,19 +225,13 @@ defer_free(struct nfsd4_compoundargs *argp,
*/
static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
{
- if (p == argp->tmp) {
- p = kmemdup(argp->tmp, nbytes, GFP_KERNEL);
- if (!p)
- return NULL;
- } else {
- BUG_ON(p != argp->tmpp);
- argp->tmpp = NULL;
- }
- if (defer_free(argp, kfree, p)) {
- kfree(p);
+ void *ret;
+
+ ret = svcxdr_tmpalloc(argp, nbytes);
+ if (!ret)
return NULL;
- } else
- return (char *)p;
+ memcpy(ret, p, nbytes);
+ return ret;
}
static __be32
@@ -292,12 +294,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
if (nace > NFS4_ACL_MAX)
return nfserr_fbig;
- *acl = nfs4_acl_new(nace);
+ *acl = svcxdr_tmpalloc(argp, nfs4_acl_bytes(nace));
if (*acl == NULL)
return nfserr_jukebox;
- defer_free(argp, kfree, *acl);
-
(*acl)->naces = nace;
for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) {
READ_BUF(16); len += 16;
@@ -418,12 +418,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
return nfserr_badlabel;
len += (XDR_QUADLEN(dummy32) << 2);
READMEM(buf, dummy32);
- label->data = kzalloc(dummy32 + 1, GFP_KERNEL);
+ label->len = dummy32;
+ label->data = svcxdr_dupstr(argp, buf, dummy32);
if (!label->data)
return nfserr_jukebox;
- label->len = dummy32;
- defer_free(argp, kfree, label->data);
- memcpy(label->data, buf, dummy32);
}
#endif
@@ -598,20 +596,11 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
switch (create->cr_type) {
case NF4LNK:
READ_BUF(4);
- create->cr_linklen = be32_to_cpup(p++);
- READ_BUF(create->cr_linklen);
- /*
- * The VFS will want a null-terminated string, and
- * null-terminating in place isn't safe since this might
- * end on a page boundary:
- */
- create->cr_linkname =
- kmalloc(create->cr_linklen + 1, GFP_KERNEL);
- if (!create->cr_linkname)
+ create->cr_datalen = be32_to_cpup(p++);
+ READ_BUF(create->cr_datalen);
+ create->cr_data = svcxdr_dupstr(argp, p, create->cr_datalen);
+ if (!create->cr_data)
return nfserr_jukebox;
- memcpy(create->cr_linkname, p, create->cr_linklen);
- create->cr_linkname[create->cr_linklen] = '\0';
- defer_free(argp, kfree, create->cr_linkname);
break;
case NF4BLK:
case NF4CHR:
@@ -1481,13 +1470,12 @@ nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_sta
INIT_LIST_HEAD(&test_stateid->ts_stateid_list);
for (i = 0; i < test_stateid->ts_num_ids; i++) {
- stateid = kmalloc(sizeof(struct nfsd4_test_stateid_id), GFP_KERNEL);
+ stateid = svcxdr_tmpalloc(argp, sizeof(*stateid));
if (!stateid) {
status = nfserrno(-ENOMEM);
goto out;
}
- defer_free(argp, kfree, stateid);
INIT_LIST_HEAD(&stateid->ts_id_list);
list_add_tail(&stateid->ts_id_list, &test_stateid->ts_stateid_list);
@@ -1526,6 +1514,22 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
}
static __be32
+nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
+{
+ DECODE_HEAD;
+
+ status = nfsd4_decode_stateid(argp, &seek->seek_stateid);
+ if (status)
+ return status;
+
+ READ_BUF(8 + 4);
+ p = xdr_decode_hyper(p, &seek->seek_offset);
+ seek->seek_whence = be32_to_cpup(p);
+
+ DECODE_TAIL;
+}
+
+static __be32
nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
{
return nfs_ok;
@@ -1598,6 +1602,20 @@ static nfsd4_dec nfsd4_dec_ops[] = {
[OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid,
[OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete,
+
+ /* new operations for NFSv4.2 */
+ [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_COPY] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek,
+ [OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp,
};
static inline bool
@@ -1640,7 +1658,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
goto xdr_error;
if (argp->opcnt > ARRAY_SIZE(argp->iops)) {
- argp->ops = kmalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL);
+ argp->ops = kzalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL);
if (!argp->ops) {
argp->ops = argp->iops;
dprintk("nfsd: couldn't allocate room for COMPOUND\n");
@@ -1675,6 +1693,14 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
readbytes += nfsd4_max_reply(argp->rqstp, op);
} else
max_reply += nfsd4_max_reply(argp->rqstp, op);
+ /*
+ * OP_LOCK may return a conflicting lock. (Special case
+ * because it will just skip encoding this if it runs
+ * out of xdr buffer space, and it is the only operation
+ * that behaves this way.)
+ */
+ if (op->opnum == OP_LOCK)
+ max_reply += NFS4_OPAQUE_LIMIT;
if (op->status) {
argp->opcnt = i+1;
@@ -2662,6 +2688,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
struct xdr_stream *xdr = cd->xdr;
int start_offset = xdr->buf->len;
int cookie_offset;
+ u32 name_and_cookie;
int entry_bytes;
__be32 nfserr = nfserr_toosmall;
__be64 wire_offset;
@@ -2723,7 +2750,14 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
cd->rd_maxcount -= entry_bytes;
if (!cd->rd_dircount)
goto fail;
- cd->rd_dircount--;
+ /*
+ * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so
+ * let's always let through the first entry, at least:
+ */
+ name_and_cookie = 4 * XDR_QUADLEN(namlen) + 8;
+ if (name_and_cookie > cd->rd_dircount && cd->cookie_offset)
+ goto fail;
+ cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie);
cd->cookie_offset = cookie_offset;
skip_entry:
cd->common.err = nfs_ok;
@@ -3077,11 +3111,8 @@ static __be32 nfsd4_encode_splice_read(
__be32 nfserr;
__be32 *p = xdr->p - 2;
- /*
- * Don't inline pages unless we know there's room for eof,
- * count, and possible padding:
- */
- if (xdr->end - xdr->p < 3)
+ /* Make sure there will be room for padding if needed */
+ if (xdr->end - xdr->p < 1)
return nfserr_resource;
nfserr = nfsd_splice_read(read->rd_rqstp, file,
@@ -3104,7 +3135,8 @@ static __be32 nfsd4_encode_splice_read(
buf->page_len = maxcount;
buf->len += maxcount;
- xdr->page_ptr += (maxcount + PAGE_SIZE - 1) / PAGE_SIZE;
+ xdr->page_ptr += (buf->page_base + maxcount + PAGE_SIZE - 1)
+ / PAGE_SIZE;
/* Use rest of head for padding and remaining ops: */
buf->tail[0].iov_base = xdr->p;
@@ -3147,9 +3179,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
len = maxcount;
v = 0;
- thislen = (void *)xdr->end - (void *)xdr->p;
- if (len < thislen)
- thislen = len;
+ thislen = min_t(long, len, ((void *)xdr->end - (void *)xdr->p));
p = xdr_reserve_space(xdr, (thislen+3)&~3);
WARN_ON_ONCE(!p);
resp->rqstp->rq_vec[v].iov_base = p;
@@ -3216,10 +3246,8 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
xdr_commit_encode(xdr);
maxcount = svc_max_payload(resp->rqstp);
- if (maxcount > xdr->buf->buflen - xdr->buf->len)
- maxcount = xdr->buf->buflen - xdr->buf->len;
- if (maxcount > read->rd_length)
- maxcount = read->rd_length;
+ maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len));
+ maxcount = min_t(unsigned long, maxcount, read->rd_length);
if (!read->rd_filp) {
err = nfsd_get_tmp_read_open(resp->rqstp, read->rd_fhp,
@@ -3333,6 +3361,10 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
}
maxcount = min_t(int, maxcount-16, bytes_left);
+ /* RFC 3530 14.2.24 allows us to ignore dircount when it's 0: */
+ if (!readdir->rd_dircount)
+ readdir->rd_dircount = INT_MAX;
+
readdir->xdr = xdr;
readdir->rd_maxcount = maxcount;
readdir->common.err = 0;
@@ -3763,6 +3795,22 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
}
static __be32
+nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_seek *seek)
+{
+ __be32 *p;
+
+ if (nfserr)
+ return nfserr;
+
+ p = xdr_reserve_space(&resp->xdr, 4 + 8);
+ *p++ = cpu_to_be32(seek->seek_eof);
+ p = xdr_encode_hyper(p, seek->seek_pos);
+
+ return nfserr;
+}
+
+static __be32
nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
{
return nfserr;
@@ -3834,6 +3882,20 @@ static nfsd4_enc nfsd4_enc_ops[] = {
[OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
[OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop,
[OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop,
+
+ /* NFSv4.2 operations */
+ [OP_ALLOCATE] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_COPY] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_COPY_NOTIFY] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_DEALLOCATE] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_IO_ADVISE] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_LAYOUTERROR] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek,
+ [OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop,
};
/*
@@ -3937,8 +3999,6 @@ status:
*
* XDR note: do not encode rp->rp_buflen: the buffer contains the
* previously sent already encoded operation.
- *
- * called with nfs4_lock_state() held
*/
void
nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op)
@@ -3977,9 +4037,8 @@ int nfsd4_release_compoundargs(void *rq, __be32 *p, void *resp)
kfree(args->tmpp);
args->tmpp = NULL;
while (args->to_free) {
- struct tmpbuf *tb = args->to_free;
+ struct svcxdr_tmpbuf *tb = args->to_free;
args->to_free = tb->next;
- tb->release(tb->buf);
kfree(tb);
}
return 1;
@@ -4012,7 +4071,6 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
/*
* All that remains is to write the tag and operation count...
*/
- struct nfsd4_compound_state *cs = &resp->cstate;
struct xdr_buf *buf = resp->xdr.buf;
WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len +
@@ -4026,19 +4084,7 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
p += XDR_QUADLEN(resp->taglen);
*p++ = htonl(resp->opcnt);
- if (nfsd4_has_session(cs)) {
- struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
- struct nfs4_client *clp = cs->session->se_client;
- if (cs->status != nfserr_replay_cache) {
- nfsd4_store_cache_entry(resp);
- cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE;
- }
- /* Renew the clientid on success and on replay */
- spin_lock(&nn->client_lock);
- nfsd4_put_session(cs->session);
- spin_unlock(&nn->client_lock);
- put_client_renew(clp);
- }
+ nfsd4_sequence_done(resp);
return 1;
}
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 6040da8830ff..122f69185ef5 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -27,8 +27,12 @@
*/
#define TARGET_BUCKET_SIZE 64
-static struct hlist_head * cache_hash;
-static struct list_head lru_head;
+struct nfsd_drc_bucket {
+ struct list_head lru_head;
+ spinlock_t cache_lock;
+};
+
+static struct nfsd_drc_bucket *drc_hashtbl;
static struct kmem_cache *drc_slab;
/* max number of entries allowed in the cache */
@@ -36,6 +40,7 @@ static unsigned int max_drc_entries;
/* number of significant bits in the hash value */
static unsigned int maskbits;
+static unsigned int drc_hashsize;
/*
* Stats and other tracking of on the duplicate reply cache. All of these and
@@ -43,7 +48,7 @@ static unsigned int maskbits;
*/
/* total number of entries */
-static unsigned int num_drc_entries;
+static atomic_t num_drc_entries;
/* cache misses due only to checksum comparison failures */
static unsigned int payload_misses;
@@ -75,7 +80,6 @@ static struct shrinker nfsd_reply_cache_shrinker = {
* A cache entry is "single use" if c_state == RC_INPROG
* Otherwise, it when accessing _prev or _next, the lock must be held.
*/
-static DEFINE_SPINLOCK(cache_lock);
static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func);
/*
@@ -116,6 +120,12 @@ nfsd_hashsize(unsigned int limit)
return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE);
}
+static u32
+nfsd_cache_hash(__be32 xid)
+{
+ return hash_32(be32_to_cpu(xid), maskbits);
+}
+
static struct svc_cacherep *
nfsd_reply_cache_alloc(void)
{
@@ -126,7 +136,6 @@ nfsd_reply_cache_alloc(void)
rp->c_state = RC_UNUSED;
rp->c_type = RC_NOCACHE;
INIT_LIST_HEAD(&rp->c_lru);
- INIT_HLIST_NODE(&rp->c_hash);
}
return rp;
}
@@ -138,29 +147,27 @@ nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
drc_mem_usage -= rp->c_replvec.iov_len;
kfree(rp->c_replvec.iov_base);
}
- if (!hlist_unhashed(&rp->c_hash))
- hlist_del(&rp->c_hash);
list_del(&rp->c_lru);
- --num_drc_entries;
+ atomic_dec(&num_drc_entries);
drc_mem_usage -= sizeof(*rp);
kmem_cache_free(drc_slab, rp);
}
static void
-nfsd_reply_cache_free(struct svc_cacherep *rp)
+nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
{
- spin_lock(&cache_lock);
+ spin_lock(&b->cache_lock);
nfsd_reply_cache_free_locked(rp);
- spin_unlock(&cache_lock);
+ spin_unlock(&b->cache_lock);
}
int nfsd_reply_cache_init(void)
{
unsigned int hashsize;
+ unsigned int i;
- INIT_LIST_HEAD(&lru_head);
max_drc_entries = nfsd_cache_size_limit();
- num_drc_entries = 0;
+ atomic_set(&num_drc_entries, 0);
hashsize = nfsd_hashsize(max_drc_entries);
maskbits = ilog2(hashsize);
@@ -170,9 +177,14 @@ int nfsd_reply_cache_init(void)
if (!drc_slab)
goto out_nomem;
- cache_hash = kcalloc(hashsize, sizeof(struct hlist_head), GFP_KERNEL);
- if (!cache_hash)
+ drc_hashtbl = kcalloc(hashsize, sizeof(*drc_hashtbl), GFP_KERNEL);
+ if (!drc_hashtbl)
goto out_nomem;
+ for (i = 0; i < hashsize; i++) {
+ INIT_LIST_HEAD(&drc_hashtbl[i].lru_head);
+ spin_lock_init(&drc_hashtbl[i].cache_lock);
+ }
+ drc_hashsize = hashsize;
return 0;
out_nomem:
@@ -184,17 +196,22 @@ out_nomem:
void nfsd_reply_cache_shutdown(void)
{
struct svc_cacherep *rp;
+ unsigned int i;
unregister_shrinker(&nfsd_reply_cache_shrinker);
cancel_delayed_work_sync(&cache_cleaner);
- while (!list_empty(&lru_head)) {
- rp = list_entry(lru_head.next, struct svc_cacherep, c_lru);
- nfsd_reply_cache_free_locked(rp);
+ for (i = 0; i < drc_hashsize; i++) {
+ struct list_head *head = &drc_hashtbl[i].lru_head;
+ while (!list_empty(head)) {
+ rp = list_first_entry(head, struct svc_cacherep, c_lru);
+ nfsd_reply_cache_free_locked(rp);
+ }
}
- kfree (cache_hash);
- cache_hash = NULL;
+ kfree (drc_hashtbl);
+ drc_hashtbl = NULL;
+ drc_hashsize = 0;
if (drc_slab) {
kmem_cache_destroy(drc_slab);
@@ -207,56 +224,63 @@ void nfsd_reply_cache_shutdown(void)
* not already scheduled.
*/
static void
-lru_put_end(struct svc_cacherep *rp)
+lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
{
rp->c_timestamp = jiffies;
- list_move_tail(&rp->c_lru, &lru_head);
+ list_move_tail(&rp->c_lru, &b->lru_head);
schedule_delayed_work(&cache_cleaner, RC_EXPIRE);
}
-/*
- * Move a cache entry from one hash list to another
- */
-static void
-hash_refile(struct svc_cacherep *rp)
-{
- hlist_del_init(&rp->c_hash);
- hlist_add_head(&rp->c_hash, cache_hash + hash_32(rp->c_xid, maskbits));
-}
-
-/*
- * Walk the LRU list and prune off entries that are older than RC_EXPIRE.
- * Also prune the oldest ones when the total exceeds the max number of entries.
- */
static long
-prune_cache_entries(void)
+prune_bucket(struct nfsd_drc_bucket *b)
{
struct svc_cacherep *rp, *tmp;
long freed = 0;
- list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) {
+ list_for_each_entry_safe(rp, tmp, &b->lru_head, c_lru) {
/*
* Don't free entries attached to calls that are still
* in-progress, but do keep scanning the list.
*/
if (rp->c_state == RC_INPROG)
continue;
- if (num_drc_entries <= max_drc_entries &&
+ if (atomic_read(&num_drc_entries) <= max_drc_entries &&
time_before(jiffies, rp->c_timestamp + RC_EXPIRE))
break;
nfsd_reply_cache_free_locked(rp);
freed++;
}
+ return freed;
+}
+
+/*
+ * Walk the LRU list and prune off entries that are older than RC_EXPIRE.
+ * Also prune the oldest ones when the total exceeds the max number of entries.
+ */
+static long
+prune_cache_entries(void)
+{
+ unsigned int i;
+ long freed = 0;
+ bool cancel = true;
+
+ for (i = 0; i < drc_hashsize; i++) {
+ struct nfsd_drc_bucket *b = &drc_hashtbl[i];
+
+ if (list_empty(&b->lru_head))
+ continue;
+ spin_lock(&b->cache_lock);
+ freed += prune_bucket(b);
+ if (!list_empty(&b->lru_head))
+ cancel = false;
+ spin_unlock(&b->cache_lock);
+ }
/*
- * Conditionally rearm the job. If we cleaned out the list, then
- * cancel any pending run (since there won't be any work to do).
- * Otherwise, we rearm the job or modify the existing one to run in
- * RC_EXPIRE since we just ran the pruner.
+ * Conditionally rearm the job to run in RC_EXPIRE since we just
+ * ran the pruner.
*/
- if (list_empty(&lru_head))
- cancel_delayed_work(&cache_cleaner);
- else
+ if (!cancel)
mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE);
return freed;
}
@@ -264,32 +288,19 @@ prune_cache_entries(void)
static void
cache_cleaner_func(struct work_struct *unused)
{
- spin_lock(&cache_lock);
prune_cache_entries();
- spin_unlock(&cache_lock);
}
static unsigned long
nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
{
- unsigned long num;
-
- spin_lock(&cache_lock);
- num = num_drc_entries;
- spin_unlock(&cache_lock);
-
- return num;
+ return atomic_read(&num_drc_entries);
}
static unsigned long
nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
{
- unsigned long freed;
-
- spin_lock(&cache_lock);
- freed = prune_cache_entries();
- spin_unlock(&cache_lock);
- return freed;
+ return prune_cache_entries();
}
/*
* Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes
@@ -327,20 +338,24 @@ nfsd_cache_csum(struct svc_rqst *rqstp)
static bool
nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp)
{
- /* Check RPC header info first */
- if (rqstp->rq_xid != rp->c_xid || rqstp->rq_proc != rp->c_proc ||
- rqstp->rq_prot != rp->c_prot || rqstp->rq_vers != rp->c_vers ||
- rqstp->rq_arg.len != rp->c_len ||
- !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) ||
- rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr))
+ /* Check RPC XID first */
+ if (rqstp->rq_xid != rp->c_xid)
return false;
-
/* compare checksum of NFS data */
if (csum != rp->c_csum) {
++payload_misses;
return false;
}
+ /* Other discriminators */
+ if (rqstp->rq_proc != rp->c_proc ||
+ rqstp->rq_prot != rp->c_prot ||
+ rqstp->rq_vers != rp->c_vers ||
+ rqstp->rq_arg.len != rp->c_len ||
+ !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) ||
+ rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr))
+ return false;
+
return true;
}
@@ -350,14 +365,14 @@ nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp)
* NULL on failure.
*/
static struct svc_cacherep *
-nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum)
+nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp,
+ __wsum csum)
{
struct svc_cacherep *rp, *ret = NULL;
- struct hlist_head *rh;
+ struct list_head *rh = &b->lru_head;
unsigned int entries = 0;
- rh = &cache_hash[hash_32(rqstp->rq_xid, maskbits)];
- hlist_for_each_entry(rp, rh, c_hash) {
+ list_for_each_entry(rp, rh, c_lru) {
++entries;
if (nfsd_cache_match(rqstp, csum, rp)) {
ret = rp;
@@ -368,11 +383,12 @@ nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum)
/* tally hash chain length stats */
if (entries > longest_chain) {
longest_chain = entries;
- longest_chain_cachesize = num_drc_entries;
+ longest_chain_cachesize = atomic_read(&num_drc_entries);
} else if (entries == longest_chain) {
/* prefer to keep the smallest cachesize possible here */
- longest_chain_cachesize = min(longest_chain_cachesize,
- num_drc_entries);
+ longest_chain_cachesize = min_t(unsigned int,
+ longest_chain_cachesize,
+ atomic_read(&num_drc_entries));
}
return ret;
@@ -394,6 +410,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
vers = rqstp->rq_vers,
proc = rqstp->rq_proc;
__wsum csum;
+ u32 hash = nfsd_cache_hash(xid);
+ struct nfsd_drc_bucket *b = &drc_hashtbl[hash];
unsigned long age;
int type = rqstp->rq_cachetype;
int rtn = RC_DOIT;
@@ -411,16 +429,16 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
* preallocate an entry.
*/
rp = nfsd_reply_cache_alloc();
- spin_lock(&cache_lock);
+ spin_lock(&b->cache_lock);
if (likely(rp)) {
- ++num_drc_entries;
+ atomic_inc(&num_drc_entries);
drc_mem_usage += sizeof(*rp);
}
/* go ahead and prune the cache */
- prune_cache_entries();
+ prune_bucket(b);
- found = nfsd_cache_search(rqstp, csum);
+ found = nfsd_cache_search(b, rqstp, csum);
if (found) {
if (likely(rp))
nfsd_reply_cache_free_locked(rp);
@@ -445,8 +463,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
rp->c_len = rqstp->rq_arg.len;
rp->c_csum = csum;
- hash_refile(rp);
- lru_put_end(rp);
+ lru_put_end(b, rp);
/* release any buffer */
if (rp->c_type == RC_REPLBUFF) {
@@ -456,14 +473,14 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
}
rp->c_type = RC_NOCACHE;
out:
- spin_unlock(&cache_lock);
+ spin_unlock(&b->cache_lock);
return rtn;
found_entry:
nfsdstats.rchits++;
/* We found a matching entry which is either in progress or done. */
age = jiffies - rp->c_timestamp;
- lru_put_end(rp);
+ lru_put_end(b, rp);
rtn = RC_DROPIT;
/* Request being processed or excessive rexmits */
@@ -518,18 +535,23 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
{
struct svc_cacherep *rp = rqstp->rq_cacherep;
struct kvec *resv = &rqstp->rq_res.head[0], *cachv;
+ u32 hash;
+ struct nfsd_drc_bucket *b;
int len;
size_t bufsize = 0;
if (!rp)
return;
+ hash = nfsd_cache_hash(rp->c_xid);
+ b = &drc_hashtbl[hash];
+
len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
len >>= 2;
/* Don't cache excessive amounts of data and XDR failures */
if (!statp || len > (256 >> 2)) {
- nfsd_reply_cache_free(rp);
+ nfsd_reply_cache_free(b, rp);
return;
}
@@ -544,23 +566,23 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
bufsize = len << 2;
cachv->iov_base = kmalloc(bufsize, GFP_KERNEL);
if (!cachv->iov_base) {
- nfsd_reply_cache_free(rp);
+ nfsd_reply_cache_free(b, rp);
return;
}
cachv->iov_len = bufsize;
memcpy(cachv->iov_base, statp, bufsize);
break;
case RC_NOCACHE:
- nfsd_reply_cache_free(rp);
+ nfsd_reply_cache_free(b, rp);
return;
}
- spin_lock(&cache_lock);
+ spin_lock(&b->cache_lock);
drc_mem_usage += bufsize;
- lru_put_end(rp);
+ lru_put_end(b, rp);
rp->c_secure = rqstp->rq_secure;
rp->c_type = cachetype;
rp->c_state = RC_DONE;
- spin_unlock(&cache_lock);
+ spin_unlock(&b->cache_lock);
return;
}
@@ -591,9 +613,9 @@ nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data)
*/
static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
{
- spin_lock(&cache_lock);
seq_printf(m, "max entries: %u\n", max_drc_entries);
- seq_printf(m, "num entries: %u\n", num_drc_entries);
+ seq_printf(m, "num entries: %u\n",
+ atomic_read(&num_drc_entries));
seq_printf(m, "hash buckets: %u\n", 1 << maskbits);
seq_printf(m, "mem usage: %u\n", drc_mem_usage);
seq_printf(m, "cache hits: %u\n", nfsdstats.rchits);
@@ -602,7 +624,6 @@ static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
seq_printf(m, "payload misses: %u\n", payload_misses);
seq_printf(m, "longest chain len: %u\n", longest_chain);
seq_printf(m, "cachesize at longest: %u\n", longest_chain_cachesize);
- spin_unlock(&cache_lock);
return 0;
}
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 51844048937f..ca73ca79a0ee 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -39,6 +39,7 @@ enum {
NFSD_Versions,
NFSD_Ports,
NFSD_MaxBlkSize,
+ NFSD_MaxConnections,
NFSD_SupportedEnctypes,
/*
* The below MUST come last. Otherwise we leave a hole in nfsd_files[]
@@ -48,6 +49,7 @@ enum {
NFSD_Leasetime,
NFSD_Gracetime,
NFSD_RecoveryDir,
+ NFSD_V4EndGrace,
#endif
};
@@ -62,10 +64,12 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size);
static ssize_t write_versions(struct file *file, char *buf, size_t size);
static ssize_t write_ports(struct file *file, char *buf, size_t size);
static ssize_t write_maxblksize(struct file *file, char *buf, size_t size);
+static ssize_t write_maxconn(struct file *file, char *buf, size_t size);
#ifdef CONFIG_NFSD_V4
static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
+static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size);
#endif
static ssize_t (*write_op[])(struct file *, char *, size_t) = {
@@ -77,10 +81,12 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
[NFSD_Versions] = write_versions,
[NFSD_Ports] = write_ports,
[NFSD_MaxBlkSize] = write_maxblksize,
+ [NFSD_MaxConnections] = write_maxconn,
#ifdef CONFIG_NFSD_V4
[NFSD_Leasetime] = write_leasetime,
[NFSD_Gracetime] = write_gracetime,
[NFSD_RecoveryDir] = write_recoverydir,
+ [NFSD_V4EndGrace] = write_v4_end_grace,
#endif
};
@@ -369,8 +375,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
if (maxsize < NFS_FHSIZE)
return -EINVAL;
- if (maxsize > NFS3_FHSIZE)
- maxsize = NFS3_FHSIZE;
+ maxsize = min(maxsize, NFS3_FHSIZE);
if (qword_get(&mesg, mesg, size)>0)
return -EINVAL;
@@ -871,10 +876,8 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
/* force bsize into allowed range and
* required alignment.
*/
- if (bsize < 1024)
- bsize = 1024;
- if (bsize > NFSSVC_MAXBLKSIZE)
- bsize = NFSSVC_MAXBLKSIZE;
+ bsize = max_t(int, bsize, 1024);
+ bsize = min_t(int, bsize, NFSSVC_MAXBLKSIZE);
bsize &= ~(1024-1);
mutex_lock(&nfsd_mutex);
if (nn->nfsd_serv) {
@@ -889,6 +892,44 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
nfsd_max_blksize);
}
+/**
+ * write_maxconn - Set or report the current max number of connections
+ *
+ * Input:
+ * buf: ignored
+ * size: zero
+ * OR
+ *
+ * Input:
+ * buf: C string containing an unsigned
+ * integer value representing the new
+ * number of max connections
+ * size: non-zero length of C string in @buf
+ * Output:
+ * On success: passed-in buffer filled with '\n'-terminated C string
+ * containing numeric value of max_connections setting
+ * for this net namespace;
+ * return code is the size in bytes of the string
+ * On error: return code is zero or a negative errno value
+ */
+static ssize_t write_maxconn(struct file *file, char *buf, size_t size)
+{
+ char *mesg = buf;
+ struct net *net = file->f_dentry->d_sb->s_fs_info;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ unsigned int maxconn = nn->max_connections;
+
+ if (size > 0) {
+ int rv = get_uint(&mesg, &maxconn);
+
+ if (rv)
+ return rv;
+ nn->max_connections = maxconn;
+ }
+
+ return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%u\n", maxconn);
+}
+
#ifdef CONFIG_NFSD_V4
static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size,
time_t *time, struct nfsd_net *nn)
@@ -1039,6 +1080,47 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
return rv;
}
+/**
+ * write_v4_end_grace - release grace period for nfsd's v4.x lock manager
+ *
+ * Input:
+ * buf: ignored
+ * size: zero
+ * OR
+ *
+ * Input:
+ * buf: any value
+ * size: non-zero length of C string in @buf
+ * Output:
+ * passed-in buffer filled with "Y" or "N" with a newline
+ * and NULL-terminated C string. This indicates whether
+ * the grace period has ended in the current net
+ * namespace. Return code is the size in bytes of the
+ * string. Writing a string that starts with 'Y', 'y', or
+ * '1' to the file will end the grace period for nfsd's v4
+ * lock manager.
+ */
+static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
+{
+ struct net *net = file->f_dentry->d_sb->s_fs_info;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ if (size > 0) {
+ switch(buf[0]) {
+ case 'Y':
+ case 'y':
+ case '1':
+ nfsd4_end_grace(nn);
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%c\n",
+ nn->grace_ended ? 'Y' : 'N');
+}
+
#endif
/*----------------------------------------------------------------------------*/
@@ -1064,6 +1146,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
[NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
+ [NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO},
#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE)
[NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO},
#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
@@ -1071,6 +1154,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
+ [NFSD_V4EndGrace] = {"v4_end_grace", &transaction_ops, S_IWUSR|S_IRUGO},
#endif
/* last one */ {""}
};
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 847daf37e566..747f3b95bd11 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -251,7 +251,7 @@ void nfsd_lockd_shutdown(void);
#define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED)
#define nfserr_partner_notsupp cpu_to_be32(NFS4ERR_PARTNER_NOTSUPP)
#define nfserr_partner_no_auth cpu_to_be32(NFS4ERR_PARTNER_NO_AUTH)
-#define nfserr_metadata_notsupp cpu_to_be32(NFS4ERR_METADATA_NOTSUPP)
+#define nfserr_union_notsupp cpu_to_be32(NFS4ERR_UNION_NOTSUPP)
#define nfserr_offload_denied cpu_to_be32(NFS4ERR_OFFLOAD_DENIED)
#define nfserr_wrong_lfs cpu_to_be32(NFS4ERR_WRONG_LFS)
#define nfserr_badlabel cpu_to_be32(NFS4ERR_BADLABEL)
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index ec8393418154..88026fc6a981 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -162,7 +162,14 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
/* deprecated, convert to type 3 */
len = key_len(FSID_ENCODE_DEV)/4;
fh->fh_fsid_type = FSID_ENCODE_DEV;
- fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl(fh->fh_fsid[0]), ntohl(fh->fh_fsid[1])));
+ /*
+ * struct knfsd_fh uses host-endian fields, which are
+ * sometimes used to hold net-endian values. This
+ * confuses sparse, so we must use __force here to
+ * keep it from complaining.
+ */
+ fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fh->fh_fsid[0]),
+ ntohl((__force __be32)fh->fh_fsid[1])));
fh->fh_fsid[1] = fh->fh_fsid[2];
}
data_left -= len;
@@ -202,8 +209,10 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
* fix that case easily.
*/
struct cred *new = prepare_creds();
- if (!new)
- return nfserrno(-ENOMEM);
+ if (!new) {
+ error = nfserrno(-ENOMEM);
+ goto out;
+ }
new->cap_effective =
cap_raise_nfsd_set(new->cap_effective,
new->cap_permitted);
@@ -539,8 +548,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
dentry);
fhp->fh_dentry = dget(dentry); /* our internal copy */
- fhp->fh_export = exp;
- cache_get(&exp->h);
+ fhp->fh_export = exp_get(exp);
if (fhp->fh_handle.fh_version == 0xca) {
/* old style filehandle please */
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 2e89e70ac15c..08236d70c667 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -73,8 +73,15 @@ enum fsid_source {
extern enum fsid_source fsid_source(struct svc_fh *fhp);
-/* This might look a little large to "inline" but in all calls except
+/*
+ * This might look a little large to "inline" but in all calls except
* one, 'vers' is constant so moste of the function disappears.
+ *
+ * In some cases the values are considered to be host endian and in
+ * others, net endian. fsidv is always considered to be u32 as the
+ * callers don't know which it will be. So we must use __force to keep
+ * sparse from complaining. Since these values are opaque to the
+ * client, that shouldn't be a problem.
*/
static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino,
u32 fsid, unsigned char *uuid)
@@ -82,7 +89,7 @@ static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino,
u32 *up;
switch(vers) {
case FSID_DEV:
- fsidv[0] = htonl((MAJOR(dev)<<16) |
+ fsidv[0] = (__force __u32)htonl((MAJOR(dev)<<16) |
MINOR(dev));
fsidv[1] = ino_t_to_u32(ino);
break;
@@ -90,8 +97,8 @@ static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino,
fsidv[0] = fsid;
break;
case FSID_MAJOR_MINOR:
- fsidv[0] = htonl(MAJOR(dev));
- fsidv[1] = htonl(MINOR(dev));
+ fsidv[0] = (__force __u32)htonl(MAJOR(dev));
+ fsidv[1] = (__force __u32)htonl(MINOR(dev));
fsidv[2] = ino_t_to_u32(ino);
break;
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 54c6b3d3cc79..b8680738f588 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -403,12 +403,13 @@ nfsd_proc_symlink(struct svc_rqst *rqstp, struct nfsd_symlinkargs *argp,
fh_init(&newfh, NFS_FHSIZE);
/*
- * Create the link, look up new file and set attrs.
+ * Crazy hack: the request fits in a page, and already-decoded
+ * attributes follow argp->tname, so it's safe to just write a
+ * null to ensure it's null-terminated:
*/
+ argp->tname[argp->tlen] = '\0';
nfserr = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen,
- argp->tname, argp->tlen,
- &newfh, &argp->attrs);
-
+ argp->tname, &newfh);
fh_put(&argp->ffh);
fh_put(&newfh);
@@ -716,6 +717,7 @@ nfserrno (int errno)
{ nfserr_noent, -ENOENT },
{ nfserr_io, -EIO },
{ nfserr_nxio, -ENXIO },
+ { nfserr_fbig, -E2BIG },
{ nfserr_acces, -EACCES },
{ nfserr_exist, -EEXIST },
{ nfserr_xdev, -EXDEV },
@@ -743,6 +745,7 @@ nfserrno (int errno)
{ nfserr_notsupp, -EOPNOTSUPP },
{ nfserr_toosmall, -ETOOSMALL },
{ nfserr_serverfault, -ESERVERFAULT },
+ { nfserr_serverfault, -ENFILE },
};
int i;
@@ -750,7 +753,7 @@ nfserrno (int errno)
if (nfs_errtbl[i].syserr == errno)
return nfs_errtbl[i].nfserr;
}
- printk (KERN_INFO "nfsd: non-standard errno: %d\n", errno);
+ WARN(1, "nfsd: non-standard errno: %d\n", errno);
return nfserr_io;
}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 1879e43f2868..752d56bbe0ba 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -221,7 +221,8 @@ static int nfsd_startup_generic(int nrservs)
*/
ret = nfsd_racache_init(2*nrservs);
if (ret)
- return ret;
+ goto dec_users;
+
ret = nfs4_state_start();
if (ret)
goto out_racache;
@@ -229,6 +230,8 @@ static int nfsd_startup_generic(int nrservs)
out_racache:
nfsd_racache_shutdown();
+dec_users:
+ nfsd_users--;
return ret;
}
@@ -405,6 +408,7 @@ int nfsd_create_serv(struct net *net)
if (nn->nfsd_serv == NULL)
return -ENOMEM;
+ nn->nfsd_serv->sv_maxconn = nn->max_connections;
error = svc_bind(nn->nfsd_serv, net);
if (error < 0) {
svc_destroy(nn->nfsd_serv);
@@ -469,8 +473,7 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
/* enforce a global maximum number of threads */
tot = 0;
for (i = 0; i < n; i++) {
- if (nthreads[i] > NFSD_MAXSERVS)
- nthreads[i] = NFSD_MAXSERVS;
+ nthreads[i] = min(nthreads[i], NFSD_MAXSERVS);
tot += nthreads[i];
}
if (tot > NFSD_MAXSERVS) {
@@ -519,11 +522,11 @@ nfsd_svc(int nrservs, struct net *net)
mutex_lock(&nfsd_mutex);
dprintk("nfsd: creating service\n");
- if (nrservs <= 0)
- nrservs = 0;
- if (nrservs > NFSD_MAXSERVS)
- nrservs = NFSD_MAXSERVS;
+
+ nrservs = max(nrservs, 0);
+ nrservs = min(nrservs, NFSD_MAXSERVS);
error = 0;
+
if (nrservs == 0 && nn->nfsd_serv == NULL)
goto out;
@@ -564,6 +567,7 @@ nfsd(void *vrqstp)
struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list);
struct net *net = perm_sock->xpt_net;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
int err;
/* Lock module and set up kernel thread */
@@ -597,6 +601,9 @@ nfsd(void *vrqstp)
* The main request loop
*/
for (;;) {
+ /* Update sv_maxconn if it has changed */
+ rqstp->rq_server->sv_maxconn = nn->max_connections;
+
/*
* Find a socket with data available and call its
* recvfrom routine.
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 1ac306b769df..412d7061f9e5 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -257,8 +257,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
len = args->count = ntohl(*p++);
p++; /* totalcount - unused */
- if (len > NFSSVC_MAXBLKSIZE_V2)
- len = NFSSVC_MAXBLKSIZE_V2;
+ len = min_t(unsigned int, len, NFSSVC_MAXBLKSIZE_V2);
/* set up somewhere to store response.
* We take pages, put them on reslist and include in iovec
@@ -268,7 +267,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
struct page *p = *(rqstp->rq_next_page++);
rqstp->rq_vec[v].iov_base = page_address(p);
- rqstp->rq_vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE;
+ rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE);
len -= rqstp->rq_vec[v].iov_len;
v++;
}
@@ -400,9 +399,7 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
return 0;
args->cookie = ntohl(*p++);
args->count = ntohl(*p++);
- if (args->count > PAGE_SIZE)
- args->count = PAGE_SIZE;
-
+ args->count = min_t(u32, args->count, PAGE_SIZE);
args->buffer = page_address(*(rqstp->rq_next_page++));
return xdr_argsize_check(rqstp, p);
@@ -516,10 +513,11 @@ nfssvc_encode_entry(void *ccdv, const char *name,
}
if (cd->offset)
*cd->offset = htonl(offset);
- if (namlen > NFS2_MAXNAMLEN)
- namlen = NFS2_MAXNAMLEN;/* truncate filename */
+ /* truncate filename */
+ namlen = min(namlen, NFS2_MAXNAMLEN);
slen = XDR_QUADLEN(namlen);
+
if ((buflen = cd->buflen - slen - 4) < 0) {
cd->common.err = nfserr_toosmall;
return -EINVAL;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 374c66283ac5..2712042a66b1 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -62,17 +62,28 @@ typedef struct {
(s)->si_generation
struct nfsd4_callback {
- void *cb_op;
struct nfs4_client *cb_clp;
struct list_head cb_per_client;
u32 cb_minorversion;
struct rpc_message cb_msg;
- const struct rpc_call_ops *cb_ops;
+ struct nfsd4_callback_ops *cb_ops;
struct work_struct cb_work;
bool cb_done;
};
+struct nfsd4_callback_ops {
+ void (*prepare)(struct nfsd4_callback *);
+ int (*done)(struct nfsd4_callback *, struct rpc_task *);
+ void (*release)(struct nfsd4_callback *);
+};
+
+/*
+ * A core object that represents a "common" stateid. These are generally
+ * embedded within the different (more specific) stateid objects and contain
+ * fields that are of general use to any stateid.
+ */
struct nfs4_stid {
+ atomic_t sc_count;
#define NFS4_OPEN_STID 1
#define NFS4_LOCK_STID 2
#define NFS4_DELEG_STID 4
@@ -80,26 +91,50 @@ struct nfs4_stid {
#define NFS4_CLOSED_STID 8
/* For a deleg stateid kept around only to process free_stateid's: */
#define NFS4_REVOKED_DELEG_STID 16
+#define NFS4_CLOSED_DELEG_STID 32
unsigned char sc_type;
stateid_t sc_stateid;
struct nfs4_client *sc_client;
+ struct nfs4_file *sc_file;
+ void (*sc_free)(struct nfs4_stid *);
};
+/*
+ * Represents a delegation stateid. The nfs4_client holds references to these
+ * and they are put when it is being destroyed or when the delegation is
+ * returned by the client:
+ *
+ * o 1 reference as long as a delegation is still in force (taken when it's
+ * alloc'd, put when it's returned or revoked)
+ *
+ * o 1 reference as long as a recall rpc is in progress (taken when the lease
+ * is broken, put when the rpc exits)
+ *
+ * o 1 more ephemeral reference for each nfsd thread currently doing something
+ * with that delegation without holding the cl_lock
+ *
+ * If the server attempts to recall a delegation and the client doesn't do so
+ * before a timeout, the server may also revoke the delegation. In that case,
+ * the object will either be destroyed (v4.0) or moved to a per-client list of
+ * revoked delegations (v4.1+).
+ *
+ * This object is a superset of the nfs4_stid.
+ */
struct nfs4_delegation {
struct nfs4_stid dl_stid; /* must be first field */
struct list_head dl_perfile;
struct list_head dl_perclnt;
struct list_head dl_recall_lru; /* delegation recalled */
- atomic_t dl_count; /* ref count */
- struct nfs4_file *dl_file;
u32 dl_type;
time_t dl_time;
/* For recall: */
- struct knfsd_fh dl_fh;
int dl_retries;
struct nfsd4_callback dl_recall;
};
+#define cb_to_delegation(cb) \
+ container_of(cb, struct nfs4_delegation, dl_recall)
+
/* client delegation callback info */
struct nfs4_cb_conn {
/* SETCLIENTID info */
@@ -194,6 +229,11 @@ struct nfsd4_conn {
unsigned char cn_flags;
};
+/*
+ * Representation of a v4.1+ session. These are refcounted in a similar fashion
+ * to the nfs4_client. References are only taken when the server is actively
+ * working on the object (primarily during the processing of compounds).
+ */
struct nfsd4_session {
atomic_t se_ref;
struct list_head se_hash; /* hash by sessionid */
@@ -212,8 +252,6 @@ struct nfsd4_session {
struct nfsd4_slot *se_slots[]; /* forward channel slots */
};
-extern void nfsd4_put_session(struct nfsd4_session *ses);
-
/* formatted contents of nfs4_sessionid */
struct nfsd4_sessionid {
clientid_t clientid;
@@ -225,17 +263,35 @@ struct nfsd4_sessionid {
/*
* struct nfs4_client - one per client. Clientids live here.
- * o Each nfs4_client is hashed by clientid.
*
- * o Each nfs4_clients is also hashed by name
- * (the opaque quantity initially sent by the client to identify itself).
+ * The initial object created by an NFS client using SETCLIENTID (for NFSv4.0)
+ * or EXCHANGE_ID (for NFSv4.1+). These objects are refcounted and timestamped.
+ * Each nfsd_net_ns object contains a set of these and they are tracked via
+ * short and long form clientid. They are hashed and searched for under the
+ * per-nfsd_net client_lock spinlock.
+ *
+ * References to it are only held during the processing of compounds, and in
+ * certain other operations. In their "resting state" they have a refcount of
+ * 0. If they are not renewed within a lease period, they become eligible for
+ * destruction by the laundromat.
+ *
+ * These objects can also be destroyed prematurely by the fault injection code,
+ * or if the client sends certain forms of SETCLIENTID or EXCHANGE_ID updates.
+ * Care is taken *not* to do this however when the objects have an elevated
+ * refcount.
+ *
+ * o Each nfs4_client is hashed by clientid
+ *
+ * o Each nfs4_clients is also hashed by name (the opaque quantity initially
+ * sent by the client to identify itself).
*
- * o cl_perclient list is used to ensure no dangling stateowner references
- * when we expire the nfs4_client
+ * o cl_perclient list is used to ensure no dangling stateowner references
+ * when we expire the nfs4_client
*/
struct nfs4_client {
struct list_head cl_idhash; /* hash by cl_clientid.id */
struct rb_node cl_namenode; /* link into by-name trees */
+ struct list_head *cl_ownerstr_hashtbl;
struct list_head cl_openowners;
struct idr cl_stateids; /* stateid lookup */
struct list_head cl_delegations;
@@ -258,6 +314,7 @@ struct nfs4_client {
#define NFSD4_CLIENT_STABLE (2) /* client on stable storage */
#define NFSD4_CLIENT_RECLAIM_COMPLETE (3) /* reclaim_complete done */
#define NFSD4_CLIENT_CONFIRMED (4) /* client is confirmed */
+#define NFSD4_CLIENT_UPCALL_LOCK (5) /* upcall serialization */
#define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \
1 << NFSD4_CLIENT_CB_KILL)
unsigned long cl_flags;
@@ -329,21 +386,43 @@ struct nfs4_replay {
unsigned int rp_buflen;
char *rp_buf;
struct knfsd_fh rp_openfh;
+ struct mutex rp_mutex;
char rp_ibuf[NFSD4_REPLAY_ISIZE];
};
+struct nfs4_stateowner;
+
+struct nfs4_stateowner_operations {
+ void (*so_unhash)(struct nfs4_stateowner *);
+ void (*so_free)(struct nfs4_stateowner *);
+};
+
+/*
+ * A core object that represents either an open or lock owner. The object and
+ * lock owner objects have one of these embedded within them. Refcounts and
+ * other fields common to both owner types are contained within these
+ * structures.
+ */
struct nfs4_stateowner {
- struct list_head so_strhash; /* hash by op_name */
- struct list_head so_stateids;
- struct nfs4_client * so_client;
- /* after increment in ENCODE_SEQID_OP_TAIL, represents the next
+ struct list_head so_strhash;
+ struct list_head so_stateids;
+ struct nfs4_client *so_client;
+ const struct nfs4_stateowner_operations *so_ops;
+ /* after increment in nfsd4_bump_seqid, represents the next
* sequence id expected from the client: */
- u32 so_seqid;
- struct xdr_netobj so_owner; /* open owner name */
- struct nfs4_replay so_replay;
- bool so_is_open_owner;
+ atomic_t so_count;
+ u32 so_seqid;
+ struct xdr_netobj so_owner; /* open owner name */
+ struct nfs4_replay so_replay;
+ bool so_is_open_owner;
};
+/*
+ * When a file is opened, the client provides an open state owner opaque string
+ * that indicates the "owner" of that open. These objects are refcounted.
+ * References to it are held by each open state associated with it. This object
+ * is a superset of the nfs4_stateowner struct.
+ */
struct nfs4_openowner {
struct nfs4_stateowner oo_owner; /* must be first field */
struct list_head oo_perclient;
@@ -358,15 +437,17 @@ struct nfs4_openowner {
struct nfs4_ol_stateid *oo_last_closed_stid;
time_t oo_time; /* time of placement on so_close_lru */
#define NFS4_OO_CONFIRMED 1
-#define NFS4_OO_NEW 4
unsigned char oo_flags;
};
+/*
+ * Represents a generic "lockowner". Similar to an openowner. References to it
+ * are held by the lock stateids that are created on its behalf. This object is
+ * a superset of the nfs4_stateowner struct (or would be if it needed any extra
+ * fields).
+ */
struct nfs4_lockowner {
struct nfs4_stateowner lo_owner; /* must be first element */
- struct list_head lo_owner_ino_hash; /* hash by owner,file */
- struct list_head lo_perstateid;
- struct list_head lo_list; /* for temporary uses */
};
static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so)
@@ -379,9 +460,17 @@ static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so)
return container_of(so, struct nfs4_lockowner, lo_owner);
}
-/* nfs4_file: a file opened by some number of (open) nfs4_stateowners. */
+/*
+ * nfs4_file: a file opened by some number of (open) nfs4_stateowners.
+ *
+ * These objects are global. nfsd only keeps one instance of a nfs4_file per
+ * inode (though it may keep multiple file descriptors open per inode). These
+ * are tracked in the file_hashtbl which is protected by the state_lock
+ * spinlock.
+ */
struct nfs4_file {
atomic_t fi_ref;
+ spinlock_t fi_lock;
struct hlist_node fi_hash; /* hash by "struct inode *" */
struct list_head fi_stateids;
struct list_head fi_delegations;
@@ -395,49 +484,35 @@ struct nfs4_file {
* + 1 to both of the above if NFS4_SHARE_ACCESS_BOTH is set.
*/
atomic_t fi_access[2];
+ u32 fi_share_deny;
struct file *fi_deleg_file;
- struct file_lock *fi_lease;
atomic_t fi_delegees;
- struct inode *fi_inode;
+ struct knfsd_fh fi_fhandle;
bool fi_had_conflict;
};
-/* XXX: for first cut may fall back on returning file that doesn't work
- * at all? */
-static inline struct file *find_writeable_file(struct nfs4_file *f)
-{
- if (f->fi_fds[O_WRONLY])
- return f->fi_fds[O_WRONLY];
- return f->fi_fds[O_RDWR];
-}
-
-static inline struct file *find_readable_file(struct nfs4_file *f)
-{
- if (f->fi_fds[O_RDONLY])
- return f->fi_fds[O_RDONLY];
- return f->fi_fds[O_RDWR];
-}
-
-static inline struct file *find_any_file(struct nfs4_file *f)
-{
- if (f->fi_fds[O_RDWR])
- return f->fi_fds[O_RDWR];
- else if (f->fi_fds[O_WRONLY])
- return f->fi_fds[O_WRONLY];
- else
- return f->fi_fds[O_RDONLY];
-}
-
-/* "ol" stands for "Open or Lock". Better suggestions welcome. */
+/*
+ * A generic struct representing either a open or lock stateid. The nfs4_client
+ * holds a reference to each of these objects, and they in turn hold a
+ * reference to their respective stateowners. The client's reference is
+ * released in response to a close or unlock (depending on whether it's an open
+ * or lock stateid) or when the client is being destroyed.
+ *
+ * In the case of v4.0 open stateids, these objects are preserved for a little
+ * while after close in order to handle CLOSE replays. Those are eventually
+ * reclaimed via a LRU scheme by the laundromat.
+ *
+ * This object is a superset of the nfs4_stid. "ol" stands for "Open or Lock".
+ * Better suggestions welcome.
+ */
struct nfs4_ol_stateid {
struct nfs4_stid st_stid; /* must be first field */
struct list_head st_perfile;
struct list_head st_perstateowner;
- struct list_head st_lockowners;
+ struct list_head st_locks;
struct nfs4_stateowner * st_stateowner;
- struct nfs4_file * st_file;
- unsigned long st_access_bmap;
- unsigned long st_deny_bmap;
+ unsigned char st_access_bmap;
+ unsigned char st_deny_bmap;
struct nfs4_ol_stateid * st_openstp;
};
@@ -450,33 +525,43 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
#define RD_STATE 0x00000010
#define WR_STATE 0x00000020
+enum nfsd4_cb_op {
+ NFSPROC4_CLNT_CB_NULL = 0,
+ NFSPROC4_CLNT_CB_RECALL,
+ NFSPROC4_CLNT_CB_SEQUENCE,
+};
+
+
struct nfsd4_compound_state;
struct nfsd_net;
extern __be32 nfs4_preprocess_stateid_op(struct net *net,
struct nfsd4_compound_state *cstate,
stateid_t *stateid, int flags, struct file **filp);
-extern void nfs4_lock_state(void);
-extern void nfs4_unlock_state(void);
+void nfs4_put_stid(struct nfs4_stid *s);
void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
extern void nfs4_release_reclaim(struct nfsd_net *);
extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
struct nfsd_net *nn);
-extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn);
+extern __be32 nfs4_check_open_reclaim(clientid_t *clid,
+ struct nfsd4_compound_state *cstate, struct nfsd_net *nn);
extern int set_callback_cred(void);
-extern void nfsd4_init_callback(struct nfsd4_callback *);
extern void nfsd4_probe_callback(struct nfs4_client *clp);
extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
-extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
+extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
+ struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
+extern void nfsd4_run_cb(struct nfsd4_callback *cb);
extern int nfsd4_create_callback_queue(void);
extern void nfsd4_destroy_callback_queue(void);
extern void nfsd4_shutdown_callback(struct nfs4_client *);
-extern void nfs4_put_delegation(struct nfs4_delegation *dp);
+extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp);
extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
struct nfsd_net *nn);
extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
-extern void put_client_renew(struct nfs4_client *clp);
+
+/* grace period management */
+void nfsd4_end_grace(struct nfsd_net *nn);
/* nfs4recover operations */
extern int nfsd4_client_tracking_init(struct net *net);
@@ -484,25 +569,30 @@ extern void nfsd4_client_tracking_exit(struct net *net);
extern void nfsd4_client_record_create(struct nfs4_client *clp);
extern void nfsd4_client_record_remove(struct nfs4_client *clp);
extern int nfsd4_client_record_check(struct nfs4_client *clp);
-extern void nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time);
+extern void nfsd4_record_grace_done(struct nfsd_net *nn);
/* nfs fault injection functions */
#ifdef CONFIG_NFSD_FAULT_INJECTION
int nfsd_fault_inject_init(void);
void nfsd_fault_inject_cleanup(void);
-u64 nfsd_for_n_state(u64, u64 (*)(struct nfs4_client *, u64));
-struct nfs4_client *nfsd_find_client(struct sockaddr_storage *, size_t);
-
-u64 nfsd_forget_client(struct nfs4_client *, u64);
-u64 nfsd_forget_client_locks(struct nfs4_client*, u64);
-u64 nfsd_forget_client_openowners(struct nfs4_client *, u64);
-u64 nfsd_forget_client_delegations(struct nfs4_client *, u64);
-u64 nfsd_recall_client_delegations(struct nfs4_client *, u64);
-
-u64 nfsd_print_client(struct nfs4_client *, u64);
-u64 nfsd_print_client_locks(struct nfs4_client *, u64);
-u64 nfsd_print_client_openowners(struct nfs4_client *, u64);
-u64 nfsd_print_client_delegations(struct nfs4_client *, u64);
+
+u64 nfsd_inject_print_clients(void);
+u64 nfsd_inject_forget_client(struct sockaddr_storage *, size_t);
+u64 nfsd_inject_forget_clients(u64);
+
+u64 nfsd_inject_print_locks(void);
+u64 nfsd_inject_forget_client_locks(struct sockaddr_storage *, size_t);
+u64 nfsd_inject_forget_locks(u64);
+
+u64 nfsd_inject_print_openowners(void);
+u64 nfsd_inject_forget_client_openowners(struct sockaddr_storage *, size_t);
+u64 nfsd_inject_forget_openowners(u64);
+
+u64 nfsd_inject_print_delegations(void);
+u64 nfsd_inject_forget_client_delegations(struct sockaddr_storage *, size_t);
+u64 nfsd_inject_forget_delegations(u64);
+u64 nfsd_inject_recall_client_delegations(struct sockaddr_storage *, size_t);
+u64 nfsd_inject_recall_delegations(u64);
#else /* CONFIG_NFSD_FAULT_INJECTION */
static inline int nfsd_fault_inject_init(void) { return 0; }
static inline void nfsd_fault_inject_cleanup(void) {}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 140c496f612c..989129e2d6ea 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -189,8 +189,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name);
dparent = fhp->fh_dentry;
- exp = fhp->fh_export;
- exp_get(exp);
+ exp = exp_get(fhp->fh_export);
/* Lookup the name, but don't follow links */
if (isdotent(name, len)) {
@@ -446,6 +445,16 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
if (err)
goto out;
size_change = 1;
+
+ /*
+ * RFC5661, Section 18.30.4:
+ * Changing the size of a file with SETATTR indirectly
+ * changes the time_modify and change attributes.
+ *
+ * (and similar for the older RFCs)
+ */
+ if (iap->ia_size != i_size_read(inode))
+ iap->ia_valid |= ATTR_MTIME;
}
iap->ia_valid |= ATTR_CTIME;
@@ -464,7 +473,7 @@ out_put_write_access:
if (size_change)
put_write_access(inode);
if (!err)
- commit_metadata(fhp);
+ err = nfserrno(commit_metadata(fhp));
out:
return err;
}
@@ -650,6 +659,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
{
struct path path;
struct inode *inode;
+ struct file *file;
int flags = O_RDONLY|O_LARGEFILE;
__be32 err;
int host_err = 0;
@@ -704,19 +714,25 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
else
flags = O_WRONLY|O_LARGEFILE;
}
- *filp = dentry_open(&path, flags, current_cred());
- if (IS_ERR(*filp)) {
- host_err = PTR_ERR(*filp);
- *filp = NULL;
- } else {
- host_err = ima_file_check(*filp, may_flags);
- if (may_flags & NFSD_MAY_64BIT_COOKIE)
- (*filp)->f_mode |= FMODE_64BITHASH;
- else
- (*filp)->f_mode |= FMODE_32BITHASH;
+ file = dentry_open(&path, flags, current_cred());
+ if (IS_ERR(file)) {
+ host_err = PTR_ERR(file);
+ goto out_nfserr;
}
+ host_err = ima_file_check(file, may_flags, 0);
+ if (host_err) {
+ nfsd_close(file);
+ goto out_nfserr;
+ }
+
+ if (may_flags & NFSD_MAY_64BIT_COOKIE)
+ file->f_mode |= FMODE_64BITHASH;
+ else
+ file->f_mode |= FMODE_32BITHASH;
+
+ *filp = file;
out_nfserr:
err = nfserrno(host_err);
out:
@@ -820,7 +836,8 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
}
-__be32 nfsd_finish_read(struct file *file, unsigned long *count, int host_err)
+static __be32
+nfsd_finish_read(struct file *file, unsigned long *count, int host_err)
{
if (host_err >= 0) {
nfsdstats.io_read += host_err;
@@ -831,7 +848,7 @@ __be32 nfsd_finish_read(struct file *file, unsigned long *count, int host_err)
return nfserrno(host_err);
}
-int nfsd_splice_read(struct svc_rqst *rqstp,
+__be32 nfsd_splice_read(struct svc_rqst *rqstp,
struct file *file, loff_t offset, unsigned long *count)
{
struct splice_desc sd = {
@@ -847,7 +864,7 @@ int nfsd_splice_read(struct svc_rqst *rqstp,
return nfsd_finish_read(file, count, host_err);
}
-int nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen,
+__be32 nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen,
unsigned long *count)
{
mm_segment_t oldfs;
@@ -1121,7 +1138,8 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
iap->ia_valid &= ~(ATTR_UID|ATTR_GID);
if (iap->ia_valid)
return nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
- return 0;
+ /* Callers expect file metadata to be committed here */
+ return nfserrno(commit_metadata(resfhp));
}
/* HPUX client sometimes creates a file in mode 000, and sets size to 0.
@@ -1253,9 +1271,10 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
err = nfsd_create_setattr(rqstp, resfhp, iap);
/*
- * nfsd_setattr already committed the child. Transactional filesystems
- * had a chance to commit changes for both parent and child
- * simultaneously making the following commit_metadata a noop.
+ * nfsd_create_setattr already committed the child. Transactional
+ * filesystems had a chance to commit changes for both parent and
+ * child * simultaneously making the following commit_metadata a
+ * noop.
*/
err2 = nfserrno(commit_metadata(fhp));
if (err2)
@@ -1426,7 +1445,8 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
err = nfsd_create_setattr(rqstp, resfhp, iap);
/*
- * nfsd_setattr already committed the child (and possibly also the parent).
+ * nfsd_create_setattr already committed the child
+ * (and possibly also the parent).
*/
if (!err)
err = nfserrno(commit_metadata(fhp));
@@ -1504,16 +1524,15 @@ out_nfserr:
__be32
nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
char *fname, int flen,
- char *path, int plen,
- struct svc_fh *resfhp,
- struct iattr *iap)
+ char *path,
+ struct svc_fh *resfhp)
{
struct dentry *dentry, *dnew;
__be32 err, cerr;
int host_err;
err = nfserr_noent;
- if (!flen || !plen)
+ if (!flen || path[0] == '\0')
goto out;
err = nfserr_exist;
if (isdotent(fname, flen))
@@ -1534,18 +1553,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (IS_ERR(dnew))
goto out_nfserr;
- if (unlikely(path[plen] != 0)) {
- char *path_alloced = kmalloc(plen+1, GFP_KERNEL);
- if (path_alloced == NULL)
- host_err = -ENOMEM;
- else {
- strncpy(path_alloced, path, plen);
- path_alloced[plen] = 0;
- host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced);
- kfree(path_alloced);
- }
- } else
- host_err = vfs_symlink(dentry->d_inode, dnew, path);
+ host_err = vfs_symlink(dentry->d_inode, dnew, path);
err = nfserrno(host_err);
if (!err)
err = nfserrno(commit_metadata(fhp));
@@ -2093,8 +2101,7 @@ nfsd_racache_init(int cache_size)
if (raparm_hash[0].pb_head)
return 0;
nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE);
- if (nperbucket < 2)
- nperbucket = 2;
+ nperbucket = max(2, nperbucket);
cache_size = nperbucket * RAPARM_HASH_SIZE;
dprintk("nfsd: allocating %d readahead buffers.\n", cache_size);
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 91b6ae3f658b..c2ff3f14e5f6 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -74,9 +74,9 @@ struct raparms;
__be32 nfsd_get_tmp_read_open(struct svc_rqst *, struct svc_fh *,
struct file **, struct raparms **);
void nfsd_put_tmp_read_open(struct file *, struct raparms *);
-int nfsd_splice_read(struct svc_rqst *,
+__be32 nfsd_splice_read(struct svc_rqst *,
struct file *, loff_t, unsigned long *);
-int nfsd_readv(struct file *, loff_t, struct kvec *, int,
+__be32 nfsd_readv(struct file *, loff_t, struct kvec *, int,
unsigned long *);
__be32 nfsd_read(struct svc_rqst *, struct svc_fh *,
loff_t, struct kvec *, int, unsigned long *);
@@ -85,8 +85,8 @@ __be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *,
__be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *,
char *, int *);
__be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *,
- char *name, int len, char *path, int plen,
- struct svc_fh *res, struct iattr *);
+ char *name, int len, char *path,
+ struct svc_fh *res);
__be32 nfsd_link(struct svc_rqst *, struct svc_fh *,
char *, int, struct svc_fh *);
__be32 nfsd_rename(struct svc_rqst *,
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 18cbb6d9c8a9..5720e9457f33 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -55,6 +55,7 @@ struct nfsd4_compound_state {
struct svc_fh current_fh;
struct svc_fh save_fh;
struct nfs4_stateowner *replay_owner;
+ struct nfs4_client *clp;
/* For sessions DRC */
struct nfsd4_session *session;
struct nfsd4_slot *slot;
@@ -107,8 +108,8 @@ struct nfsd4_create {
u32 cr_type; /* request */
union { /* request */
struct {
- u32 namelen;
- char *name;
+ u32 datalen;
+ char *data;
} link; /* NF4LNK */
struct {
u32 specdata1;
@@ -121,8 +122,8 @@ struct nfsd4_create {
struct nfs4_acl *cr_acl;
struct xdr_netobj cr_label;
};
-#define cr_linklen u.link.namelen
-#define cr_linkname u.link.name
+#define cr_datalen u.link.datalen
+#define cr_data u.link.data
#define cr_specdata1 u.dev.specdata1
#define cr_specdata2 u.dev.specdata2
@@ -427,6 +428,17 @@ struct nfsd4_reclaim_complete {
u32 rca_one_fs;
};
+struct nfsd4_seek {
+ /* request */
+ stateid_t seek_stateid;
+ loff_t seek_offset;
+ u32 seek_whence;
+
+ /* response */
+ u32 seek_eof;
+ loff_t seek_pos;
+};
+
struct nfsd4_op {
int opnum;
__be32 status;
@@ -472,12 +484,23 @@ struct nfsd4_op {
struct nfsd4_reclaim_complete reclaim_complete;
struct nfsd4_test_stateid test_stateid;
struct nfsd4_free_stateid free_stateid;
+
+ /* NFSv4.2 */
+ struct nfsd4_seek seek;
} u;
struct nfs4_replay * replay;
};
bool nfsd4_cache_this_op(struct nfsd4_op *);
+/*
+ * Memory needed just for the duration of processing one compound:
+ */
+struct svcxdr_tmpbuf {
+ struct svcxdr_tmpbuf *next;
+ char buf[];
+};
+
struct nfsd4_compoundargs {
/* scratch variables for XDR decode */
__be32 * p;
@@ -486,11 +509,7 @@ struct nfsd4_compoundargs {
int pagelen;
__be32 tmp[8];
__be32 * tmpp;
- struct tmpbuf {
- struct tmpbuf *next;
- void (*release)(const void *);
- void *buf;
- } *to_free;
+ struct svcxdr_tmpbuf *to_free;
struct svc_rqst *rqstp;
@@ -574,7 +593,6 @@ extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp,
extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
struct nfsd4_compound_state *,
struct nfsd4_setclientid_confirm *setclientid_confirm);
-extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp);
extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_backchannel_ctl *);
@@ -585,6 +603,7 @@ extern __be32 nfsd4_create_session(struct svc_rqst *,
extern __be32 nfsd4_sequence(struct svc_rqst *,
struct nfsd4_compound_state *,
struct nfsd4_sequence *);
+extern void nfsd4_sequence_done(struct nfsd4_compoundres *resp);
extern __be32 nfsd4_destroy_session(struct svc_rqst *,
struct nfsd4_compound_state *,
struct nfsd4_destroy_session *);
@@ -594,7 +613,9 @@ extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *,
struct nfsd4_open *open, struct nfsd_net *nn);
extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
struct svc_fh *current_fh, struct nfsd4_open *open);
-extern void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status);
+extern void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate);
+extern void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate,
+ struct nfsd4_open *open, __be32 status);
extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp,
struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc);
extern __be32 nfsd4_close(struct svc_rqst *rqstp,
@@ -625,6 +646,7 @@ extern __be32 nfsd4_test_stateid(struct svc_rqst *rqstp,
extern __be32 nfsd4_free_stateid(struct svc_rqst *rqstp,
struct nfsd4_compound_state *, struct nfsd4_free_stateid *free_stateid);
extern void nfsd4_bump_seqid(struct nfsd4_compound_state *, __be32 nfserr);
+
#endif
/*
diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile
index 85c98737a146..fc603e0431bb 100644
--- a/fs/nilfs2/Makefile
+++ b/fs/nilfs2/Makefile
@@ -2,4 +2,4 @@ obj-$(CONFIG_NILFS2_FS) += nilfs2.o
nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \
btnode.o bmap.o btree.o direct.o dat.o recovery.o \
the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \
- ifile.o alloc.o gcinode.o ioctl.o
+ ifile.o alloc.o gcinode.o ioctl.o sysfs.o
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 24978153c0c4..e9e3325f29f3 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -56,11 +56,9 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
mutex_unlock(&inode->i_mutex);
nilfs = inode->i_sb->s_fs_info;
- if (!err && nilfs_test_opt(nilfs, BARRIER)) {
- err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
- if (err != -EIO)
- err = 0;
- }
+ if (!err)
+ err = nilfs_flush_device(nilfs);
+
return err;
}
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 6252b173a465..e1fa69b341b9 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -24,6 +24,7 @@
#include <linux/buffer_head.h>
#include <linux/gfp.h>
#include <linux/mpage.h>
+#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/aio.h>
#include "nilfs.h"
@@ -125,7 +126,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
nilfs_transaction_abort(inode->i_sb);
goto out;
}
- nilfs_mark_inode_dirty(inode);
+ nilfs_mark_inode_dirty_sync(inode);
nilfs_transaction_commit(inode->i_sb); /* never fails */
/* Error handling should be detailed */
set_buffer_new(bh_result);
@@ -219,10 +220,10 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
static int nilfs_set_page_dirty(struct page *page)
{
+ struct inode *inode = page->mapping->host;
int ret = __set_page_dirty_nobuffers(page);
if (page_has_buffers(page)) {
- struct inode *inode = page->mapping->host;
unsigned nr_dirty = 0;
struct buffer_head *bh, *head;
@@ -245,6 +246,10 @@ static int nilfs_set_page_dirty(struct page *page)
if (nr_dirty)
nilfs_set_file_dirty(inode, nr_dirty);
+ } else if (ret) {
+ unsigned nr_dirty = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+ nilfs_set_file_dirty(inode, nr_dirty);
}
return ret;
}
@@ -667,7 +672,7 @@ void nilfs_write_inode_common(struct inode *inode,
for substitutions of appended fields */
}
-void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
+void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags)
{
ino_t ino = inode->i_ino;
struct nilfs_inode_info *ii = NILFS_I(inode);
@@ -678,7 +683,8 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size);
- set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
+ if (flags & I_DIRTY_DATASYNC)
+ set_bit(NILFS_I_INODE_SYNC, &ii->i_state);
nilfs_write_inode_common(inode, raw_inode, 0);
/* XXX: call with has_bmap = 0 is a workaround to avoid
@@ -934,7 +940,7 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
return 0;
}
-int nilfs_mark_inode_dirty(struct inode *inode)
+int __nilfs_mark_inode_dirty(struct inode *inode, int flags)
{
struct buffer_head *ibh;
int err;
@@ -945,7 +951,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
"failed to reget inode block.\n");
return err;
}
- nilfs_update_inode(inode, ibh);
+ nilfs_update_inode(inode, ibh, flags);
mark_buffer_dirty(ibh);
nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile);
brelse(ibh);
@@ -978,7 +984,7 @@ void nilfs_dirty_inode(struct inode *inode, int flags)
return;
}
nilfs_transaction_begin(inode->i_sb, &ti, 0);
- nilfs_mark_inode_dirty(inode);
+ __nilfs_mark_inode_dirty(inode, flags);
nilfs_transaction_commit(inode->i_sb); /* never fails */
}
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 422fb54b7377..9a20e513d7eb 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -1022,11 +1022,9 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
return ret;
nilfs = inode->i_sb->s_fs_info;
- if (nilfs_test_opt(nilfs, BARRIER)) {
- ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
- if (ret == -EIO)
- return ret;
- }
+ ret = nilfs_flush_device(nilfs);
+ if (ret < 0)
+ return ret;
if (argp != NULL) {
down_read(&nilfs->ns_segctor_sem);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 9bc72dec3fa6..91093cd74f0d 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -104,7 +104,7 @@ enum {
constructor */
NILFS_I_COLLECTED, /* All dirty blocks are collected */
NILFS_I_UPDATED, /* The file has been written back */
- NILFS_I_INODE_DIRTY, /* write_inode is requested */
+ NILFS_I_INODE_SYNC, /* dsync is not allowed for inode */
NILFS_I_BMAP, /* has bmap and btnode_cache */
NILFS_I_GCINODE, /* inode for GC, on memory only */
};
@@ -273,7 +273,7 @@ struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
unsigned long ino);
extern struct inode *nilfs_iget_for_gc(struct super_block *sb,
unsigned long ino, __u64 cno);
-extern void nilfs_update_inode(struct inode *, struct buffer_head *);
+extern void nilfs_update_inode(struct inode *, struct buffer_head *, int);
extern void nilfs_truncate(struct inode *);
extern void nilfs_evict_inode(struct inode *);
extern int nilfs_setattr(struct dentry *, struct iattr *);
@@ -282,10 +282,18 @@ int nilfs_permission(struct inode *inode, int mask);
int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
extern int nilfs_inode_dirty(struct inode *);
int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty);
-extern int nilfs_mark_inode_dirty(struct inode *);
+extern int __nilfs_mark_inode_dirty(struct inode *, int);
extern void nilfs_dirty_inode(struct inode *, int flags);
int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
+static inline int nilfs_mark_inode_dirty(struct inode *inode)
+{
+ return __nilfs_mark_inode_dirty(inode, I_DIRTY);
+}
+static inline int nilfs_mark_inode_dirty_sync(struct inode *inode)
+{
+ return __nilfs_mark_inode_dirty(inode, I_DIRTY_SYNC);
+}
/* super.c */
extern struct inode *nilfs_alloc_inode(struct super_block *);
@@ -320,6 +328,14 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *);
int nilfs_init_gcinode(struct inode *inode);
void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs);
+/* sysfs.c */
+int __init nilfs_sysfs_init(void);
+void nilfs_sysfs_exit(void);
+int nilfs_sysfs_create_device_group(struct super_block *);
+void nilfs_sysfs_delete_device_group(struct the_nilfs *);
+int nilfs_sysfs_create_snapshot_group(struct nilfs_root *);
+void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *);
+
/*
* Inodes and files operations
*/
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index a1a191634abc..7ef18fc656c2 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -930,7 +930,7 @@ static void nilfs_drop_collected_inodes(struct list_head *head)
if (!test_and_clear_bit(NILFS_I_COLLECTED, &ii->i_state))
continue;
- clear_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
+ clear_bit(NILFS_I_INODE_SYNC, &ii->i_state);
set_bit(NILFS_I_UPDATED, &ii->i_state);
}
}
@@ -1833,6 +1833,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
nilfs_set_next_segment(nilfs, segbuf);
if (update_sr) {
+ nilfs->ns_flushed_device = 0;
nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start,
segbuf->sb_sum.seg_seq, nilfs->ns_cno++);
@@ -2194,7 +2195,7 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
nilfs_transaction_lock(sb, &ti, 0);
ii = NILFS_I(inode);
- if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) ||
+ if (test_bit(NILFS_I_INODE_SYNC, &ii->i_state) ||
nilfs_test_opt(nilfs, STRICT_ORDER) ||
test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
nilfs_discontinued(nilfs)) {
@@ -2216,6 +2217,8 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
sci->sc_dsync_end = end;
err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC);
+ if (!err)
+ nilfs->ns_flushed_device = 0;
nilfs_transaction_unlock(sb);
return err;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 8c532b2ca3ab..2e5b3ec85b8f 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -310,6 +310,9 @@ int nilfs_commit_super(struct super_block *sb, int flag)
nilfs->ns_sbsize));
}
clear_nilfs_sb_dirty(nilfs);
+ nilfs->ns_flushed_device = 1;
+ /* make sure store to ns_flushed_device cannot be reordered */
+ smp_wmb();
return nilfs_sync_super(sb, flag);
}
@@ -514,6 +517,9 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
}
up_write(&nilfs->ns_sem);
+ if (!err)
+ err = nilfs_flush_device(nilfs);
+
return err;
}
@@ -942,7 +948,7 @@ static int nilfs_get_root_dentry(struct super_block *sb,
iput(inode);
}
} else {
- dentry = d_obtain_alias(inode);
+ dentry = d_obtain_root(inode);
if (IS_ERR(dentry)) {
ret = PTR_ERR(dentry);
goto failed_dentry;
@@ -1452,13 +1458,19 @@ static int __init init_nilfs_fs(void)
if (err)
goto fail;
- err = register_filesystem(&nilfs_fs_type);
+ err = nilfs_sysfs_init();
if (err)
goto free_cachep;
+ err = register_filesystem(&nilfs_fs_type);
+ if (err)
+ goto deinit_sysfs_entry;
+
printk(KERN_INFO "NILFS version 2 loaded\n");
return 0;
+deinit_sysfs_entry:
+ nilfs_sysfs_exit();
free_cachep:
nilfs_destroy_cachep();
fail:
@@ -1468,6 +1480,7 @@ fail:
static void __exit exit_nilfs_fs(void)
{
nilfs_destroy_cachep();
+ nilfs_sysfs_exit();
unregister_filesystem(&nilfs_fs_type);
}
diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c
new file mode 100644
index 000000000000..bbb0dcc35905
--- /dev/null
+++ b/fs/nilfs2/sysfs.c
@@ -0,0 +1,1137 @@
+/*
+ * sysfs.c - sysfs support implementation.
+ *
+ * Copyright (C) 2005-2014 Nippon Telegraph and Telephone Corporation.
+ * Copyright (C) 2014 HGST, Inc., a Western Digital Company.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Written by Vyacheslav Dubeyko <Vyacheslav.Dubeyko@hgst.com>
+ */
+
+#include <linux/kobject.h>
+
+#include "nilfs.h"
+#include "mdt.h"
+#include "sufile.h"
+#include "cpfile.h"
+#include "sysfs.h"
+
+/* /sys/fs/<nilfs>/ */
+static struct kset *nilfs_kset;
+
+#define NILFS_SHOW_TIME(time_t_val, buf) ({ \
+ struct tm res; \
+ int count = 0; \
+ time_to_tm(time_t_val, 0, &res); \
+ res.tm_year += 1900; \
+ res.tm_mon += 1; \
+ count = scnprintf(buf, PAGE_SIZE, \
+ "%ld-%.2d-%.2d %.2d:%.2d:%.2d\n", \
+ res.tm_year, res.tm_mon, res.tm_mday, \
+ res.tm_hour, res.tm_min, res.tm_sec);\
+ count; \
+})
+
+#define NILFS_DEV_INT_GROUP_OPS(name, parent_name) \
+static ssize_t nilfs_##name##_attr_show(struct kobject *kobj, \
+ struct attribute *attr, char *buf) \
+{ \
+ struct the_nilfs *nilfs = container_of(kobj->parent, \
+ struct the_nilfs, \
+ ns_##parent_name##_kobj); \
+ struct nilfs_##name##_attr *a = container_of(attr, \
+ struct nilfs_##name##_attr, \
+ attr); \
+ return a->show ? a->show(a, nilfs, buf) : 0; \
+} \
+static ssize_t nilfs_##name##_attr_store(struct kobject *kobj, \
+ struct attribute *attr, \
+ const char *buf, size_t len) \
+{ \
+ struct the_nilfs *nilfs = container_of(kobj->parent, \
+ struct the_nilfs, \
+ ns_##parent_name##_kobj); \
+ struct nilfs_##name##_attr *a = container_of(attr, \
+ struct nilfs_##name##_attr, \
+ attr); \
+ return a->store ? a->store(a, nilfs, buf, len) : 0; \
+} \
+static const struct sysfs_ops nilfs_##name##_attr_ops = { \
+ .show = nilfs_##name##_attr_show, \
+ .store = nilfs_##name##_attr_store, \
+};
+
+#define NILFS_DEV_INT_GROUP_TYPE(name, parent_name) \
+static void nilfs_##name##_attr_release(struct kobject *kobj) \
+{ \
+ struct nilfs_sysfs_##parent_name##_subgroups *subgroups; \
+ struct the_nilfs *nilfs = container_of(kobj->parent, \
+ struct the_nilfs, \
+ ns_##parent_name##_kobj); \
+ subgroups = nilfs->ns_##parent_name##_subgroups; \
+ complete(&subgroups->sg_##name##_kobj_unregister); \
+} \
+static struct kobj_type nilfs_##name##_ktype = { \
+ .default_attrs = nilfs_##name##_attrs, \
+ .sysfs_ops = &nilfs_##name##_attr_ops, \
+ .release = nilfs_##name##_attr_release, \
+};
+
+#define NILFS_DEV_INT_GROUP_FNS(name, parent_name) \
+static int nilfs_sysfs_create_##name##_group(struct the_nilfs *nilfs) \
+{ \
+ struct kobject *parent; \
+ struct kobject *kobj; \
+ struct completion *kobj_unregister; \
+ struct nilfs_sysfs_##parent_name##_subgroups *subgroups; \
+ int err; \
+ subgroups = nilfs->ns_##parent_name##_subgroups; \
+ kobj = &subgroups->sg_##name##_kobj; \
+ kobj_unregister = &subgroups->sg_##name##_kobj_unregister; \
+ parent = &nilfs->ns_##parent_name##_kobj; \
+ kobj->kset = nilfs_kset; \
+ init_completion(kobj_unregister); \
+ err = kobject_init_and_add(kobj, &nilfs_##name##_ktype, parent, \
+ #name); \
+ if (err) \
+ return err; \
+ return 0; \
+} \
+static void nilfs_sysfs_delete_##name##_group(struct the_nilfs *nilfs) \
+{ \
+ kobject_del(&nilfs->ns_##parent_name##_subgroups->sg_##name##_kobj); \
+}
+
+/************************************************************************
+ * NILFS snapshot attrs *
+ ************************************************************************/
+
+static ssize_t
+nilfs_snapshot_inodes_count_show(struct nilfs_snapshot_attr *attr,
+ struct nilfs_root *root, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)atomic64_read(&root->inodes_count));
+}
+
+static ssize_t
+nilfs_snapshot_blocks_count_show(struct nilfs_snapshot_attr *attr,
+ struct nilfs_root *root, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)atomic64_read(&root->blocks_count));
+}
+
+static const char snapshot_readme_str[] =
+ "The group contains details about mounted snapshot.\n\n"
+ "(1) inodes_count\n\tshow number of inodes for snapshot.\n\n"
+ "(2) blocks_count\n\tshow number of blocks for snapshot.\n\n";
+
+static ssize_t
+nilfs_snapshot_README_show(struct nilfs_snapshot_attr *attr,
+ struct nilfs_root *root, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, snapshot_readme_str);
+}
+
+NILFS_SNAPSHOT_RO_ATTR(inodes_count);
+NILFS_SNAPSHOT_RO_ATTR(blocks_count);
+NILFS_SNAPSHOT_RO_ATTR(README);
+
+static struct attribute *nilfs_snapshot_attrs[] = {
+ NILFS_SNAPSHOT_ATTR_LIST(inodes_count),
+ NILFS_SNAPSHOT_ATTR_LIST(blocks_count),
+ NILFS_SNAPSHOT_ATTR_LIST(README),
+ NULL,
+};
+
+static ssize_t nilfs_snapshot_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct nilfs_root *root =
+ container_of(kobj, struct nilfs_root, snapshot_kobj);
+ struct nilfs_snapshot_attr *a =
+ container_of(attr, struct nilfs_snapshot_attr, attr);
+
+ return a->show ? a->show(a, root, buf) : 0;
+}
+
+static ssize_t nilfs_snapshot_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct nilfs_root *root =
+ container_of(kobj, struct nilfs_root, snapshot_kobj);
+ struct nilfs_snapshot_attr *a =
+ container_of(attr, struct nilfs_snapshot_attr, attr);
+
+ return a->store ? a->store(a, root, buf, len) : 0;
+}
+
+static void nilfs_snapshot_attr_release(struct kobject *kobj)
+{
+ struct nilfs_root *root = container_of(kobj, struct nilfs_root,
+ snapshot_kobj);
+ complete(&root->snapshot_kobj_unregister);
+}
+
+static const struct sysfs_ops nilfs_snapshot_attr_ops = {
+ .show = nilfs_snapshot_attr_show,
+ .store = nilfs_snapshot_attr_store,
+};
+
+static struct kobj_type nilfs_snapshot_ktype = {
+ .default_attrs = nilfs_snapshot_attrs,
+ .sysfs_ops = &nilfs_snapshot_attr_ops,
+ .release = nilfs_snapshot_attr_release,
+};
+
+int nilfs_sysfs_create_snapshot_group(struct nilfs_root *root)
+{
+ struct the_nilfs *nilfs;
+ struct kobject *parent;
+ int err;
+
+ nilfs = root->nilfs;
+ parent = &nilfs->ns_dev_subgroups->sg_mounted_snapshots_kobj;
+ root->snapshot_kobj.kset = nilfs_kset;
+ init_completion(&root->snapshot_kobj_unregister);
+
+ if (root->cno == NILFS_CPTREE_CURRENT_CNO) {
+ err = kobject_init_and_add(&root->snapshot_kobj,
+ &nilfs_snapshot_ktype,
+ &nilfs->ns_dev_kobj,
+ "current_checkpoint");
+ } else {
+ err = kobject_init_and_add(&root->snapshot_kobj,
+ &nilfs_snapshot_ktype,
+ parent,
+ "%llu", root->cno);
+ }
+
+ if (err)
+ return err;
+
+ return 0;
+}
+
+void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *root)
+{
+ kobject_del(&root->snapshot_kobj);
+}
+
+/************************************************************************
+ * NILFS mounted snapshots attrs *
+ ************************************************************************/
+
+static const char mounted_snapshots_readme_str[] =
+ "The mounted_snapshots group contains group for\n"
+ "every mounted snapshot.\n";
+
+static ssize_t
+nilfs_mounted_snapshots_README_show(struct nilfs_mounted_snapshots_attr *attr,
+ struct the_nilfs *nilfs, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, mounted_snapshots_readme_str);
+}
+
+NILFS_MOUNTED_SNAPSHOTS_RO_ATTR(README);
+
+static struct attribute *nilfs_mounted_snapshots_attrs[] = {
+ NILFS_MOUNTED_SNAPSHOTS_ATTR_LIST(README),
+ NULL,
+};
+
+NILFS_DEV_INT_GROUP_OPS(mounted_snapshots, dev);
+NILFS_DEV_INT_GROUP_TYPE(mounted_snapshots, dev);
+NILFS_DEV_INT_GROUP_FNS(mounted_snapshots, dev);
+
+/************************************************************************
+ * NILFS checkpoints attrs *
+ ************************************************************************/
+
+static ssize_t
+nilfs_checkpoints_checkpoints_number_show(struct nilfs_checkpoints_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ __u64 ncheckpoints;
+ struct nilfs_cpstat cpstat;
+ int err;
+
+ down_read(&nilfs->ns_segctor_sem);
+ err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
+ up_read(&nilfs->ns_segctor_sem);
+ if (err < 0) {
+ printk(KERN_ERR "NILFS: unable to get checkpoint stat: err=%d\n",
+ err);
+ return err;
+ }
+
+ ncheckpoints = cpstat.cs_ncps;
+
+ return snprintf(buf, PAGE_SIZE, "%llu\n", ncheckpoints);
+}
+
+static ssize_t
+nilfs_checkpoints_snapshots_number_show(struct nilfs_checkpoints_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ __u64 nsnapshots;
+ struct nilfs_cpstat cpstat;
+ int err;
+
+ down_read(&nilfs->ns_segctor_sem);
+ err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
+ up_read(&nilfs->ns_segctor_sem);
+ if (err < 0) {
+ printk(KERN_ERR "NILFS: unable to get checkpoint stat: err=%d\n",
+ err);
+ return err;
+ }
+
+ nsnapshots = cpstat.cs_nsss;
+
+ return snprintf(buf, PAGE_SIZE, "%llu\n", nsnapshots);
+}
+
+static ssize_t
+nilfs_checkpoints_last_seg_checkpoint_show(struct nilfs_checkpoints_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ __u64 last_cno;
+
+ spin_lock(&nilfs->ns_last_segment_lock);
+ last_cno = nilfs->ns_last_cno;
+ spin_unlock(&nilfs->ns_last_segment_lock);
+
+ return snprintf(buf, PAGE_SIZE, "%llu\n", last_cno);
+}
+
+static ssize_t
+nilfs_checkpoints_next_checkpoint_show(struct nilfs_checkpoints_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ __u64 cno;
+
+ down_read(&nilfs->ns_sem);
+ cno = nilfs->ns_cno;
+ up_read(&nilfs->ns_sem);
+
+ return snprintf(buf, PAGE_SIZE, "%llu\n", cno);
+}
+
+static const char checkpoints_readme_str[] =
+ "The checkpoints group contains attributes that describe\n"
+ "details about volume's checkpoints.\n\n"
+ "(1) checkpoints_number\n\tshow number of checkpoints on volume.\n\n"
+ "(2) snapshots_number\n\tshow number of snapshots on volume.\n\n"
+ "(3) last_seg_checkpoint\n"
+ "\tshow checkpoint number of the latest segment.\n\n"
+ "(4) next_checkpoint\n\tshow next checkpoint number.\n\n";
+
+static ssize_t
+nilfs_checkpoints_README_show(struct nilfs_checkpoints_attr *attr,
+ struct the_nilfs *nilfs, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, checkpoints_readme_str);
+}
+
+NILFS_CHECKPOINTS_RO_ATTR(checkpoints_number);
+NILFS_CHECKPOINTS_RO_ATTR(snapshots_number);
+NILFS_CHECKPOINTS_RO_ATTR(last_seg_checkpoint);
+NILFS_CHECKPOINTS_RO_ATTR(next_checkpoint);
+NILFS_CHECKPOINTS_RO_ATTR(README);
+
+static struct attribute *nilfs_checkpoints_attrs[] = {
+ NILFS_CHECKPOINTS_ATTR_LIST(checkpoints_number),
+ NILFS_CHECKPOINTS_ATTR_LIST(snapshots_number),
+ NILFS_CHECKPOINTS_ATTR_LIST(last_seg_checkpoint),
+ NILFS_CHECKPOINTS_ATTR_LIST(next_checkpoint),
+ NILFS_CHECKPOINTS_ATTR_LIST(README),
+ NULL,
+};
+
+NILFS_DEV_INT_GROUP_OPS(checkpoints, dev);
+NILFS_DEV_INT_GROUP_TYPE(checkpoints, dev);
+NILFS_DEV_INT_GROUP_FNS(checkpoints, dev);
+
+/************************************************************************
+ * NILFS segments attrs *
+ ************************************************************************/
+
+static ssize_t
+nilfs_segments_segments_number_show(struct nilfs_segments_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%lu\n", nilfs->ns_nsegments);
+}
+
+static ssize_t
+nilfs_segments_blocks_per_segment_show(struct nilfs_segments_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%lu\n", nilfs->ns_blocks_per_segment);
+}
+
+static ssize_t
+nilfs_segments_clean_segments_show(struct nilfs_segments_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ unsigned long ncleansegs;
+
+ down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
+ ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
+ up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
+
+ return snprintf(buf, PAGE_SIZE, "%lu\n", ncleansegs);
+}
+
+static ssize_t
+nilfs_segments_dirty_segments_show(struct nilfs_segments_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ struct nilfs_sustat sustat;
+ int err;
+
+ down_read(&nilfs->ns_segctor_sem);
+ err = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat);
+ up_read(&nilfs->ns_segctor_sem);
+ if (err < 0) {
+ printk(KERN_ERR "NILFS: unable to get segment stat: err=%d\n",
+ err);
+ return err;
+ }
+
+ return snprintf(buf, PAGE_SIZE, "%llu\n", sustat.ss_ndirtysegs);
+}
+
+static const char segments_readme_str[] =
+ "The segments group contains attributes that describe\n"
+ "details about volume's segments.\n\n"
+ "(1) segments_number\n\tshow number of segments on volume.\n\n"
+ "(2) blocks_per_segment\n\tshow number of blocks in segment.\n\n"
+ "(3) clean_segments\n\tshow count of clean segments.\n\n"
+ "(4) dirty_segments\n\tshow count of dirty segments.\n\n";
+
+static ssize_t
+nilfs_segments_README_show(struct nilfs_segments_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, segments_readme_str);
+}
+
+NILFS_SEGMENTS_RO_ATTR(segments_number);
+NILFS_SEGMENTS_RO_ATTR(blocks_per_segment);
+NILFS_SEGMENTS_RO_ATTR(clean_segments);
+NILFS_SEGMENTS_RO_ATTR(dirty_segments);
+NILFS_SEGMENTS_RO_ATTR(README);
+
+static struct attribute *nilfs_segments_attrs[] = {
+ NILFS_SEGMENTS_ATTR_LIST(segments_number),
+ NILFS_SEGMENTS_ATTR_LIST(blocks_per_segment),
+ NILFS_SEGMENTS_ATTR_LIST(clean_segments),
+ NILFS_SEGMENTS_ATTR_LIST(dirty_segments),
+ NILFS_SEGMENTS_ATTR_LIST(README),
+ NULL,
+};
+
+NILFS_DEV_INT_GROUP_OPS(segments, dev);
+NILFS_DEV_INT_GROUP_TYPE(segments, dev);
+NILFS_DEV_INT_GROUP_FNS(segments, dev);
+
+/************************************************************************
+ * NILFS segctor attrs *
+ ************************************************************************/
+
+static ssize_t
+nilfs_segctor_last_pseg_block_show(struct nilfs_segctor_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ sector_t last_pseg;
+
+ spin_lock(&nilfs->ns_last_segment_lock);
+ last_pseg = nilfs->ns_last_pseg;
+ spin_unlock(&nilfs->ns_last_segment_lock);
+
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)last_pseg);
+}
+
+static ssize_t
+nilfs_segctor_last_seg_sequence_show(struct nilfs_segctor_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ u64 last_seq;
+
+ spin_lock(&nilfs->ns_last_segment_lock);
+ last_seq = nilfs->ns_last_seq;
+ spin_unlock(&nilfs->ns_last_segment_lock);
+
+ return snprintf(buf, PAGE_SIZE, "%llu\n", last_seq);
+}
+
+static ssize_t
+nilfs_segctor_last_seg_checkpoint_show(struct nilfs_segctor_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ __u64 last_cno;
+
+ spin_lock(&nilfs->ns_last_segment_lock);
+ last_cno = nilfs->ns_last_cno;
+ spin_unlock(&nilfs->ns_last_segment_lock);
+
+ return snprintf(buf, PAGE_SIZE, "%llu\n", last_cno);
+}
+
+static ssize_t
+nilfs_segctor_current_seg_sequence_show(struct nilfs_segctor_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ u64 seg_seq;
+
+ down_read(&nilfs->ns_sem);
+ seg_seq = nilfs->ns_seg_seq;
+ up_read(&nilfs->ns_sem);
+
+ return snprintf(buf, PAGE_SIZE, "%llu\n", seg_seq);
+}
+
+static ssize_t
+nilfs_segctor_current_last_full_seg_show(struct nilfs_segctor_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ __u64 segnum;
+
+ down_read(&nilfs->ns_sem);
+ segnum = nilfs->ns_segnum;
+ up_read(&nilfs->ns_sem);
+
+ return snprintf(buf, PAGE_SIZE, "%llu\n", segnum);
+}
+
+static ssize_t
+nilfs_segctor_next_full_seg_show(struct nilfs_segctor_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ __u64 nextnum;
+
+ down_read(&nilfs->ns_sem);
+ nextnum = nilfs->ns_nextnum;
+ up_read(&nilfs->ns_sem);
+
+ return snprintf(buf, PAGE_SIZE, "%llu\n", nextnum);
+}
+
+static ssize_t
+nilfs_segctor_next_pseg_offset_show(struct nilfs_segctor_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ unsigned long pseg_offset;
+
+ down_read(&nilfs->ns_sem);
+ pseg_offset = nilfs->ns_pseg_offset;
+ up_read(&nilfs->ns_sem);
+
+ return snprintf(buf, PAGE_SIZE, "%lu\n", pseg_offset);
+}
+
+static ssize_t
+nilfs_segctor_next_checkpoint_show(struct nilfs_segctor_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ __u64 cno;
+
+ down_read(&nilfs->ns_sem);
+ cno = nilfs->ns_cno;
+ up_read(&nilfs->ns_sem);
+
+ return snprintf(buf, PAGE_SIZE, "%llu\n", cno);
+}
+
+static ssize_t
+nilfs_segctor_last_seg_write_time_show(struct nilfs_segctor_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ time_t ctime;
+
+ down_read(&nilfs->ns_sem);
+ ctime = nilfs->ns_ctime;
+ up_read(&nilfs->ns_sem);
+
+ return NILFS_SHOW_TIME(ctime, buf);
+}
+
+static ssize_t
+nilfs_segctor_last_seg_write_time_secs_show(struct nilfs_segctor_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ time_t ctime;
+
+ down_read(&nilfs->ns_sem);
+ ctime = nilfs->ns_ctime;
+ up_read(&nilfs->ns_sem);
+
+ return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)ctime);
+}
+
+static ssize_t
+nilfs_segctor_last_nongc_write_time_show(struct nilfs_segctor_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ time_t nongc_ctime;
+
+ down_read(&nilfs->ns_sem);
+ nongc_ctime = nilfs->ns_nongc_ctime;
+ up_read(&nilfs->ns_sem);
+
+ return NILFS_SHOW_TIME(nongc_ctime, buf);
+}
+
+static ssize_t
+nilfs_segctor_last_nongc_write_time_secs_show(struct nilfs_segctor_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ time_t nongc_ctime;
+
+ down_read(&nilfs->ns_sem);
+ nongc_ctime = nilfs->ns_nongc_ctime;
+ up_read(&nilfs->ns_sem);
+
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)nongc_ctime);
+}
+
+static ssize_t
+nilfs_segctor_dirty_data_blocks_count_show(struct nilfs_segctor_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ u32 ndirtyblks;
+
+ down_read(&nilfs->ns_sem);
+ ndirtyblks = atomic_read(&nilfs->ns_ndirtyblks);
+ up_read(&nilfs->ns_sem);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", ndirtyblks);
+}
+
+static const char segctor_readme_str[] =
+ "The segctor group contains attributes that describe\n"
+ "segctor thread activity details.\n\n"
+ "(1) last_pseg_block\n"
+ "\tshow start block number of the latest segment.\n\n"
+ "(2) last_seg_sequence\n"
+ "\tshow sequence value of the latest segment.\n\n"
+ "(3) last_seg_checkpoint\n"
+ "\tshow checkpoint number of the latest segment.\n\n"
+ "(4) current_seg_sequence\n\tshow segment sequence counter.\n\n"
+ "(5) current_last_full_seg\n"
+ "\tshow index number of the latest full segment.\n\n"
+ "(6) next_full_seg\n"
+ "\tshow index number of the full segment index to be used next.\n\n"
+ "(7) next_pseg_offset\n"
+ "\tshow offset of next partial segment in the current full segment.\n\n"
+ "(8) next_checkpoint\n\tshow next checkpoint number.\n\n"
+ "(9) last_seg_write_time\n"
+ "\tshow write time of the last segment in human-readable format.\n\n"
+ "(10) last_seg_write_time_secs\n"
+ "\tshow write time of the last segment in seconds.\n\n"
+ "(11) last_nongc_write_time\n"
+ "\tshow write time of the last segment not for cleaner operation "
+ "in human-readable format.\n\n"
+ "(12) last_nongc_write_time_secs\n"
+ "\tshow write time of the last segment not for cleaner operation "
+ "in seconds.\n\n"
+ "(13) dirty_data_blocks_count\n"
+ "\tshow number of dirty data blocks.\n\n";
+
+static ssize_t
+nilfs_segctor_README_show(struct nilfs_segctor_attr *attr,
+ struct the_nilfs *nilfs, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, segctor_readme_str);
+}
+
+NILFS_SEGCTOR_RO_ATTR(last_pseg_block);
+NILFS_SEGCTOR_RO_ATTR(last_seg_sequence);
+NILFS_SEGCTOR_RO_ATTR(last_seg_checkpoint);
+NILFS_SEGCTOR_RO_ATTR(current_seg_sequence);
+NILFS_SEGCTOR_RO_ATTR(current_last_full_seg);
+NILFS_SEGCTOR_RO_ATTR(next_full_seg);
+NILFS_SEGCTOR_RO_ATTR(next_pseg_offset);
+NILFS_SEGCTOR_RO_ATTR(next_checkpoint);
+NILFS_SEGCTOR_RO_ATTR(last_seg_write_time);
+NILFS_SEGCTOR_RO_ATTR(last_seg_write_time_secs);
+NILFS_SEGCTOR_RO_ATTR(last_nongc_write_time);
+NILFS_SEGCTOR_RO_ATTR(last_nongc_write_time_secs);
+NILFS_SEGCTOR_RO_ATTR(dirty_data_blocks_count);
+NILFS_SEGCTOR_RO_ATTR(README);
+
+static struct attribute *nilfs_segctor_attrs[] = {
+ NILFS_SEGCTOR_ATTR_LIST(last_pseg_block),
+ NILFS_SEGCTOR_ATTR_LIST(last_seg_sequence),
+ NILFS_SEGCTOR_ATTR_LIST(last_seg_checkpoint),
+ NILFS_SEGCTOR_ATTR_LIST(current_seg_sequence),
+ NILFS_SEGCTOR_ATTR_LIST(current_last_full_seg),
+ NILFS_SEGCTOR_ATTR_LIST(next_full_seg),
+ NILFS_SEGCTOR_ATTR_LIST(next_pseg_offset),
+ NILFS_SEGCTOR_ATTR_LIST(next_checkpoint),
+ NILFS_SEGCTOR_ATTR_LIST(last_seg_write_time),
+ NILFS_SEGCTOR_ATTR_LIST(last_seg_write_time_secs),
+ NILFS_SEGCTOR_ATTR_LIST(last_nongc_write_time),
+ NILFS_SEGCTOR_ATTR_LIST(last_nongc_write_time_secs),
+ NILFS_SEGCTOR_ATTR_LIST(dirty_data_blocks_count),
+ NILFS_SEGCTOR_ATTR_LIST(README),
+ NULL,
+};
+
+NILFS_DEV_INT_GROUP_OPS(segctor, dev);
+NILFS_DEV_INT_GROUP_TYPE(segctor, dev);
+NILFS_DEV_INT_GROUP_FNS(segctor, dev);
+
+/************************************************************************
+ * NILFS superblock attrs *
+ ************************************************************************/
+
+static ssize_t
+nilfs_superblock_sb_write_time_show(struct nilfs_superblock_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ time_t sbwtime;
+
+ down_read(&nilfs->ns_sem);
+ sbwtime = nilfs->ns_sbwtime;
+ up_read(&nilfs->ns_sem);
+
+ return NILFS_SHOW_TIME(sbwtime, buf);
+}
+
+static ssize_t
+nilfs_superblock_sb_write_time_secs_show(struct nilfs_superblock_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ time_t sbwtime;
+
+ down_read(&nilfs->ns_sem);
+ sbwtime = nilfs->ns_sbwtime;
+ up_read(&nilfs->ns_sem);
+
+ return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)sbwtime);
+}
+
+static ssize_t
+nilfs_superblock_sb_write_count_show(struct nilfs_superblock_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ unsigned sbwcount;
+
+ down_read(&nilfs->ns_sem);
+ sbwcount = nilfs->ns_sbwcount;
+ up_read(&nilfs->ns_sem);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", sbwcount);
+}
+
+static ssize_t
+nilfs_superblock_sb_update_frequency_show(struct nilfs_superblock_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ unsigned sb_update_freq;
+
+ down_read(&nilfs->ns_sem);
+ sb_update_freq = nilfs->ns_sb_update_freq;
+ up_read(&nilfs->ns_sem);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", sb_update_freq);
+}
+
+static ssize_t
+nilfs_superblock_sb_update_frequency_store(struct nilfs_superblock_attr *attr,
+ struct the_nilfs *nilfs,
+ const char *buf, size_t count)
+{
+ unsigned val;
+ int err;
+
+ err = kstrtouint(skip_spaces(buf), 0, &val);
+ if (err) {
+ printk(KERN_ERR "NILFS: unable to convert string: err=%d\n",
+ err);
+ return err;
+ }
+
+ if (val < NILFS_SB_FREQ) {
+ val = NILFS_SB_FREQ;
+ printk(KERN_WARNING "NILFS: superblock update frequency cannot be lesser than 10 seconds\n");
+ }
+
+ down_write(&nilfs->ns_sem);
+ nilfs->ns_sb_update_freq = val;
+ up_write(&nilfs->ns_sem);
+
+ return count;
+}
+
+static const char sb_readme_str[] =
+ "The superblock group contains attributes that describe\n"
+ "superblock's details.\n\n"
+ "(1) sb_write_time\n\tshow previous write time of super block "
+ "in human-readable format.\n\n"
+ "(2) sb_write_time_secs\n\tshow previous write time of super block "
+ "in seconds.\n\n"
+ "(3) sb_write_count\n\tshow write count of super block.\n\n"
+ "(4) sb_update_frequency\n"
+ "\tshow/set interval of periodical update of superblock (in seconds).\n\n"
+ "\tYou can set preferable frequency of superblock update by command:\n\n"
+ "\t'echo <val> > /sys/fs/<nilfs>/<dev>/superblock/sb_update_frequency'\n";
+
+static ssize_t
+nilfs_superblock_README_show(struct nilfs_superblock_attr *attr,
+ struct the_nilfs *nilfs, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, sb_readme_str);
+}
+
+NILFS_SUPERBLOCK_RO_ATTR(sb_write_time);
+NILFS_SUPERBLOCK_RO_ATTR(sb_write_time_secs);
+NILFS_SUPERBLOCK_RO_ATTR(sb_write_count);
+NILFS_SUPERBLOCK_RW_ATTR(sb_update_frequency);
+NILFS_SUPERBLOCK_RO_ATTR(README);
+
+static struct attribute *nilfs_superblock_attrs[] = {
+ NILFS_SUPERBLOCK_ATTR_LIST(sb_write_time),
+ NILFS_SUPERBLOCK_ATTR_LIST(sb_write_time_secs),
+ NILFS_SUPERBLOCK_ATTR_LIST(sb_write_count),
+ NILFS_SUPERBLOCK_ATTR_LIST(sb_update_frequency),
+ NILFS_SUPERBLOCK_ATTR_LIST(README),
+ NULL,
+};
+
+NILFS_DEV_INT_GROUP_OPS(superblock, dev);
+NILFS_DEV_INT_GROUP_TYPE(superblock, dev);
+NILFS_DEV_INT_GROUP_FNS(superblock, dev);
+
+/************************************************************************
+ * NILFS device attrs *
+ ************************************************************************/
+
+static
+ssize_t nilfs_dev_revision_show(struct nilfs_dev_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ struct nilfs_super_block **sbp = nilfs->ns_sbp;
+ u32 major = le32_to_cpu(sbp[0]->s_rev_level);
+ u16 minor = le16_to_cpu(sbp[0]->s_minor_rev_level);
+
+ return snprintf(buf, PAGE_SIZE, "%d.%d\n", major, minor);
+}
+
+static
+ssize_t nilfs_dev_blocksize_show(struct nilfs_dev_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%u\n", nilfs->ns_blocksize);
+}
+
+static
+ssize_t nilfs_dev_device_size_show(struct nilfs_dev_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ struct nilfs_super_block **sbp = nilfs->ns_sbp;
+ u64 dev_size = le64_to_cpu(sbp[0]->s_dev_size);
+
+ return snprintf(buf, PAGE_SIZE, "%llu\n", dev_size);
+}
+
+static
+ssize_t nilfs_dev_free_blocks_show(struct nilfs_dev_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ sector_t free_blocks = 0;
+
+ nilfs_count_free_blocks(nilfs, &free_blocks);
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)free_blocks);
+}
+
+static
+ssize_t nilfs_dev_uuid_show(struct nilfs_dev_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ struct nilfs_super_block **sbp = nilfs->ns_sbp;
+
+ return snprintf(buf, PAGE_SIZE, "%pUb\n", sbp[0]->s_uuid);
+}
+
+static
+ssize_t nilfs_dev_volume_name_show(struct nilfs_dev_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ struct nilfs_super_block **sbp = nilfs->ns_sbp;
+
+ return scnprintf(buf, sizeof(sbp[0]->s_volume_name), "%s\n",
+ sbp[0]->s_volume_name);
+}
+
+static const char dev_readme_str[] =
+ "The <device> group contains attributes that describe file system\n"
+ "partition's details.\n\n"
+ "(1) revision\n\tshow NILFS file system revision.\n\n"
+ "(2) blocksize\n\tshow volume block size in bytes.\n\n"
+ "(3) device_size\n\tshow volume size in bytes.\n\n"
+ "(4) free_blocks\n\tshow count of free blocks on volume.\n\n"
+ "(5) uuid\n\tshow volume's UUID.\n\n"
+ "(6) volume_name\n\tshow volume's name.\n\n";
+
+static ssize_t nilfs_dev_README_show(struct nilfs_dev_attr *attr,
+ struct the_nilfs *nilfs,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, dev_readme_str);
+}
+
+NILFS_DEV_RO_ATTR(revision);
+NILFS_DEV_RO_ATTR(blocksize);
+NILFS_DEV_RO_ATTR(device_size);
+NILFS_DEV_RO_ATTR(free_blocks);
+NILFS_DEV_RO_ATTR(uuid);
+NILFS_DEV_RO_ATTR(volume_name);
+NILFS_DEV_RO_ATTR(README);
+
+static struct attribute *nilfs_dev_attrs[] = {
+ NILFS_DEV_ATTR_LIST(revision),
+ NILFS_DEV_ATTR_LIST(blocksize),
+ NILFS_DEV_ATTR_LIST(device_size),
+ NILFS_DEV_ATTR_LIST(free_blocks),
+ NILFS_DEV_ATTR_LIST(uuid),
+ NILFS_DEV_ATTR_LIST(volume_name),
+ NILFS_DEV_ATTR_LIST(README),
+ NULL,
+};
+
+static ssize_t nilfs_dev_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs,
+ ns_dev_kobj);
+ struct nilfs_dev_attr *a = container_of(attr, struct nilfs_dev_attr,
+ attr);
+
+ return a->show ? a->show(a, nilfs, buf) : 0;
+}
+
+static ssize_t nilfs_dev_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs,
+ ns_dev_kobj);
+ struct nilfs_dev_attr *a = container_of(attr, struct nilfs_dev_attr,
+ attr);
+
+ return a->store ? a->store(a, nilfs, buf, len) : 0;
+}
+
+static void nilfs_dev_attr_release(struct kobject *kobj)
+{
+ struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs,
+ ns_dev_kobj);
+ complete(&nilfs->ns_dev_kobj_unregister);
+}
+
+static const struct sysfs_ops nilfs_dev_attr_ops = {
+ .show = nilfs_dev_attr_show,
+ .store = nilfs_dev_attr_store,
+};
+
+static struct kobj_type nilfs_dev_ktype = {
+ .default_attrs = nilfs_dev_attrs,
+ .sysfs_ops = &nilfs_dev_attr_ops,
+ .release = nilfs_dev_attr_release,
+};
+
+int nilfs_sysfs_create_device_group(struct super_block *sb)
+{
+ struct the_nilfs *nilfs = sb->s_fs_info;
+ size_t devgrp_size = sizeof(struct nilfs_sysfs_dev_subgroups);
+ int err;
+
+ nilfs->ns_dev_subgroups = kzalloc(devgrp_size, GFP_KERNEL);
+ if (unlikely(!nilfs->ns_dev_subgroups)) {
+ err = -ENOMEM;
+ printk(KERN_ERR "NILFS: unable to allocate memory for device group\n");
+ goto failed_create_device_group;
+ }
+
+ nilfs->ns_dev_kobj.kset = nilfs_kset;
+ init_completion(&nilfs->ns_dev_kobj_unregister);
+ err = kobject_init_and_add(&nilfs->ns_dev_kobj, &nilfs_dev_ktype, NULL,
+ "%s", sb->s_id);
+ if (err)
+ goto free_dev_subgroups;
+
+ err = nilfs_sysfs_create_mounted_snapshots_group(nilfs);
+ if (err)
+ goto cleanup_dev_kobject;
+
+ err = nilfs_sysfs_create_checkpoints_group(nilfs);
+ if (err)
+ goto delete_mounted_snapshots_group;
+
+ err = nilfs_sysfs_create_segments_group(nilfs);
+ if (err)
+ goto delete_checkpoints_group;
+
+ err = nilfs_sysfs_create_superblock_group(nilfs);
+ if (err)
+ goto delete_segments_group;
+
+ err = nilfs_sysfs_create_segctor_group(nilfs);
+ if (err)
+ goto delete_superblock_group;
+
+ return 0;
+
+delete_superblock_group:
+ nilfs_sysfs_delete_superblock_group(nilfs);
+
+delete_segments_group:
+ nilfs_sysfs_delete_segments_group(nilfs);
+
+delete_checkpoints_group:
+ nilfs_sysfs_delete_checkpoints_group(nilfs);
+
+delete_mounted_snapshots_group:
+ nilfs_sysfs_delete_mounted_snapshots_group(nilfs);
+
+cleanup_dev_kobject:
+ kobject_del(&nilfs->ns_dev_kobj);
+
+free_dev_subgroups:
+ kfree(nilfs->ns_dev_subgroups);
+
+failed_create_device_group:
+ return err;
+}
+
+void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs)
+{
+ nilfs_sysfs_delete_mounted_snapshots_group(nilfs);
+ nilfs_sysfs_delete_checkpoints_group(nilfs);
+ nilfs_sysfs_delete_segments_group(nilfs);
+ nilfs_sysfs_delete_superblock_group(nilfs);
+ nilfs_sysfs_delete_segctor_group(nilfs);
+ kobject_del(&nilfs->ns_dev_kobj);
+ kfree(nilfs->ns_dev_subgroups);
+}
+
+/************************************************************************
+ * NILFS feature attrs *
+ ************************************************************************/
+
+static ssize_t nilfs_feature_revision_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d.%d\n",
+ NILFS_CURRENT_REV, NILFS_MINOR_REV);
+}
+
+static const char features_readme_str[] =
+ "The features group contains attributes that describe NILFS file\n"
+ "system driver features.\n\n"
+ "(1) revision\n\tshow current revision of NILFS file system driver.\n";
+
+static ssize_t nilfs_feature_README_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, features_readme_str);
+}
+
+NILFS_FEATURE_RO_ATTR(revision);
+NILFS_FEATURE_RO_ATTR(README);
+
+static struct attribute *nilfs_feature_attrs[] = {
+ NILFS_FEATURE_ATTR_LIST(revision),
+ NILFS_FEATURE_ATTR_LIST(README),
+ NULL,
+};
+
+static const struct attribute_group nilfs_feature_attr_group = {
+ .name = "features",
+ .attrs = nilfs_feature_attrs,
+};
+
+int __init nilfs_sysfs_init(void)
+{
+ int err;
+
+ nilfs_kset = kset_create_and_add(NILFS_ROOT_GROUP_NAME, NULL, fs_kobj);
+ if (!nilfs_kset) {
+ err = -ENOMEM;
+ printk(KERN_ERR "NILFS: unable to create sysfs entry: err %d\n",
+ err);
+ goto failed_sysfs_init;
+ }
+
+ err = sysfs_create_group(&nilfs_kset->kobj, &nilfs_feature_attr_group);
+ if (unlikely(err)) {
+ printk(KERN_ERR "NILFS: unable to create feature group: err %d\n",
+ err);
+ goto cleanup_sysfs_init;
+ }
+
+ return 0;
+
+cleanup_sysfs_init:
+ kset_unregister(nilfs_kset);
+
+failed_sysfs_init:
+ return err;
+}
+
+void nilfs_sysfs_exit(void)
+{
+ sysfs_remove_group(&nilfs_kset->kobj, &nilfs_feature_attr_group);
+ kset_unregister(nilfs_kset);
+}
diff --git a/fs/nilfs2/sysfs.h b/fs/nilfs2/sysfs.h
new file mode 100644
index 000000000000..677e3a1a8370
--- /dev/null
+++ b/fs/nilfs2/sysfs.h
@@ -0,0 +1,176 @@
+/*
+ * sysfs.h - sysfs support declarations.
+ *
+ * Copyright (C) 2005-2014 Nippon Telegraph and Telephone Corporation.
+ * Copyright (C) 2014 HGST, Inc., a Western Digital Company.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Written by Vyacheslav Dubeyko <Vyacheslav.Dubeyko@hgst.com>
+ */
+
+#ifndef _NILFS_SYSFS_H
+#define _NILFS_SYSFS_H
+
+#include <linux/sysfs.h>
+
+#define NILFS_ROOT_GROUP_NAME "nilfs2"
+
+/*
+ * struct nilfs_sysfs_dev_subgroups - device subgroup kernel objects
+ * @sg_superblock_kobj: /sys/fs/<nilfs>/<device>/superblock
+ * @sg_superblock_kobj_unregister: completion state
+ * @sg_segctor_kobj: /sys/fs/<nilfs>/<device>/segctor
+ * @sg_segctor_kobj_unregister: completion state
+ * @sg_mounted_snapshots_kobj: /sys/fs/<nilfs>/<device>/mounted_snapshots
+ * @sg_mounted_snapshots_kobj_unregister: completion state
+ * @sg_checkpoints_kobj: /sys/fs/<nilfs>/<device>/checkpoints
+ * @sg_checkpoints_kobj_unregister: completion state
+ * @sg_segments_kobj: /sys/fs/<nilfs>/<device>/segments
+ * @sg_segments_kobj_unregister: completion state
+ */
+struct nilfs_sysfs_dev_subgroups {
+ /* /sys/fs/<nilfs>/<device>/superblock */
+ struct kobject sg_superblock_kobj;
+ struct completion sg_superblock_kobj_unregister;
+
+ /* /sys/fs/<nilfs>/<device>/segctor */
+ struct kobject sg_segctor_kobj;
+ struct completion sg_segctor_kobj_unregister;
+
+ /* /sys/fs/<nilfs>/<device>/mounted_snapshots */
+ struct kobject sg_mounted_snapshots_kobj;
+ struct completion sg_mounted_snapshots_kobj_unregister;
+
+ /* /sys/fs/<nilfs>/<device>/checkpoints */
+ struct kobject sg_checkpoints_kobj;
+ struct completion sg_checkpoints_kobj_unregister;
+
+ /* /sys/fs/<nilfs>/<device>/segments */
+ struct kobject sg_segments_kobj;
+ struct completion sg_segments_kobj_unregister;
+};
+
+#define NILFS_COMMON_ATTR_STRUCT(name) \
+struct nilfs_##name##_attr { \
+ struct attribute attr; \
+ ssize_t (*show)(struct kobject *, struct attribute *, \
+ char *); \
+ ssize_t (*store)(struct kobject *, struct attribute *, \
+ const char *, size_t); \
+};
+
+NILFS_COMMON_ATTR_STRUCT(feature);
+
+#define NILFS_DEV_ATTR_STRUCT(name) \
+struct nilfs_##name##_attr { \
+ struct attribute attr; \
+ ssize_t (*show)(struct nilfs_##name##_attr *, struct the_nilfs *, \
+ char *); \
+ ssize_t (*store)(struct nilfs_##name##_attr *, struct the_nilfs *, \
+ const char *, size_t); \
+};
+
+NILFS_DEV_ATTR_STRUCT(dev);
+NILFS_DEV_ATTR_STRUCT(segments);
+NILFS_DEV_ATTR_STRUCT(mounted_snapshots);
+NILFS_DEV_ATTR_STRUCT(checkpoints);
+NILFS_DEV_ATTR_STRUCT(superblock);
+NILFS_DEV_ATTR_STRUCT(segctor);
+
+#define NILFS_CP_ATTR_STRUCT(name) \
+struct nilfs_##name##_attr { \
+ struct attribute attr; \
+ ssize_t (*show)(struct nilfs_##name##_attr *, struct nilfs_root *, \
+ char *); \
+ ssize_t (*store)(struct nilfs_##name##_attr *, struct nilfs_root *, \
+ const char *, size_t); \
+};
+
+NILFS_CP_ATTR_STRUCT(snapshot);
+
+#define NILFS_ATTR(type, name, mode, show, store) \
+ static struct nilfs_##type##_attr nilfs_##type##_attr_##name = \
+ __ATTR(name, mode, show, store)
+
+#define NILFS_INFO_ATTR(type, name) \
+ NILFS_ATTR(type, name, 0444, NULL, NULL)
+#define NILFS_RO_ATTR(type, name) \
+ NILFS_ATTR(type, name, 0444, nilfs_##type##_##name##_show, NULL)
+#define NILFS_RW_ATTR(type, name) \
+ NILFS_ATTR(type, name, 0644, \
+ nilfs_##type##_##name##_show, \
+ nilfs_##type##_##name##_store)
+
+#define NILFS_FEATURE_INFO_ATTR(name) \
+ NILFS_INFO_ATTR(feature, name)
+#define NILFS_FEATURE_RO_ATTR(name) \
+ NILFS_RO_ATTR(feature, name)
+#define NILFS_FEATURE_RW_ATTR(name) \
+ NILFS_RW_ATTR(feature, name)
+
+#define NILFS_DEV_INFO_ATTR(name) \
+ NILFS_INFO_ATTR(dev, name)
+#define NILFS_DEV_RO_ATTR(name) \
+ NILFS_RO_ATTR(dev, name)
+#define NILFS_DEV_RW_ATTR(name) \
+ NILFS_RW_ATTR(dev, name)
+
+#define NILFS_SEGMENTS_RO_ATTR(name) \
+ NILFS_RO_ATTR(segments, name)
+#define NILFS_SEGMENTS_RW_ATTR(name) \
+ NILFS_RW_ATTR(segs_info, name)
+
+#define NILFS_MOUNTED_SNAPSHOTS_RO_ATTR(name) \
+ NILFS_RO_ATTR(mounted_snapshots, name)
+
+#define NILFS_CHECKPOINTS_RO_ATTR(name) \
+ NILFS_RO_ATTR(checkpoints, name)
+#define NILFS_CHECKPOINTS_RW_ATTR(name) \
+ NILFS_RW_ATTR(checkpoints, name)
+
+#define NILFS_SNAPSHOT_INFO_ATTR(name) \
+ NILFS_INFO_ATTR(snapshot, name)
+#define NILFS_SNAPSHOT_RO_ATTR(name) \
+ NILFS_RO_ATTR(snapshot, name)
+#define NILFS_SNAPSHOT_RW_ATTR(name) \
+ NILFS_RW_ATTR(snapshot, name)
+
+#define NILFS_SUPERBLOCK_RO_ATTR(name) \
+ NILFS_RO_ATTR(superblock, name)
+#define NILFS_SUPERBLOCK_RW_ATTR(name) \
+ NILFS_RW_ATTR(superblock, name)
+
+#define NILFS_SEGCTOR_INFO_ATTR(name) \
+ NILFS_INFO_ATTR(segctor, name)
+#define NILFS_SEGCTOR_RO_ATTR(name) \
+ NILFS_RO_ATTR(segctor, name)
+#define NILFS_SEGCTOR_RW_ATTR(name) \
+ NILFS_RW_ATTR(segctor, name)
+
+#define NILFS_FEATURE_ATTR_LIST(name) \
+ (&nilfs_feature_attr_##name.attr)
+#define NILFS_DEV_ATTR_LIST(name) \
+ (&nilfs_dev_attr_##name.attr)
+#define NILFS_SEGMENTS_ATTR_LIST(name) \
+ (&nilfs_segments_attr_##name.attr)
+#define NILFS_MOUNTED_SNAPSHOTS_ATTR_LIST(name) \
+ (&nilfs_mounted_snapshots_attr_##name.attr)
+#define NILFS_CHECKPOINTS_ATTR_LIST(name) \
+ (&nilfs_checkpoints_attr_##name.attr)
+#define NILFS_SNAPSHOT_ATTR_LIST(name) \
+ (&nilfs_snapshot_attr_##name.attr)
+#define NILFS_SUPERBLOCK_ATTR_LIST(name) \
+ (&nilfs_superblock_attr_##name.attr)
+#define NILFS_SEGCTOR_ATTR_LIST(name) \
+ (&nilfs_segctor_attr_##name.attr)
+
+#endif /* _NILFS_SYSFS_H */
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 8ba8229ba076..9da25fe9ea61 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -85,6 +85,7 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
nilfs->ns_cptree = RB_ROOT;
spin_lock_init(&nilfs->ns_cptree_lock);
init_rwsem(&nilfs->ns_segctor_sem);
+ nilfs->ns_sb_update_freq = NILFS_SB_FREQ;
return nilfs;
}
@@ -97,6 +98,7 @@ void destroy_nilfs(struct the_nilfs *nilfs)
{
might_sleep();
if (nilfs_init(nilfs)) {
+ nilfs_sysfs_delete_device_group(nilfs);
brelse(nilfs->ns_sbh[0]);
brelse(nilfs->ns_sbh[1]);
}
@@ -640,6 +642,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
if (err)
goto failed_sbh;
+ err = nilfs_sysfs_create_device_group(sb);
+ if (err)
+ goto failed_sbh;
+
set_nilfs_init(nilfs);
err = 0;
out:
@@ -740,12 +746,13 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
{
struct rb_node **p, *parent;
struct nilfs_root *root, *new;
+ int err;
root = nilfs_lookup_root(nilfs, cno);
if (root)
return root;
- new = kmalloc(sizeof(*root), GFP_KERNEL);
+ new = kzalloc(sizeof(*root), GFP_KERNEL);
if (!new)
return NULL;
@@ -782,6 +789,12 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
spin_unlock(&nilfs->ns_cptree_lock);
+ err = nilfs_sysfs_create_snapshot_group(new);
+ if (err) {
+ kfree(new);
+ new = NULL;
+ }
+
return new;
}
@@ -790,6 +803,8 @@ void nilfs_put_root(struct nilfs_root *root)
if (atomic_dec_and_test(&root->count)) {
struct the_nilfs *nilfs = root->nilfs;
+ nilfs_sysfs_delete_snapshot_group(root);
+
spin_lock(&nilfs->ns_cptree_lock);
rb_erase(&root->rb_node, &nilfs->ns_cptree);
spin_unlock(&nilfs->ns_cptree_lock);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index de8cc53b4a5c..23778d385836 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -33,6 +33,7 @@
#include <linux/slab.h>
struct nilfs_sc_info;
+struct nilfs_sysfs_dev_subgroups;
/* the_nilfs struct */
enum {
@@ -45,6 +46,7 @@ enum {
/**
* struct the_nilfs - struct to supervise multiple nilfs mount points
* @ns_flags: flags
+ * @ns_flushed_device: flag indicating if all volatile data was flushed
* @ns_bdev: block device
* @ns_sem: semaphore for shared states
* @ns_snapshot_mount_mutex: mutex to protect snapshot mounts
@@ -54,6 +56,7 @@ enum {
* @ns_sbwcount: write count of super block
* @ns_sbsize: size of valid data in super block
* @ns_mount_state: file system state
+ * @ns_sb_update_freq: interval of periodical update of superblocks (in seconds)
* @ns_seg_seq: segment sequence counter
* @ns_segnum: index number of the latest full segment.
* @ns_nextnum: index number of the full segment index to be used next
@@ -95,9 +98,13 @@ enum {
* @ns_inode_size: size of on-disk inode
* @ns_first_ino: first not-special inode number
* @ns_crc_seed: seed value of CRC32 calculation
+ * @ns_dev_kobj: /sys/fs/<nilfs>/<device>
+ * @ns_dev_kobj_unregister: completion state
+ * @ns_dev_subgroups: <device> subgroups pointer
*/
struct the_nilfs {
unsigned long ns_flags;
+ int ns_flushed_device;
struct block_device *ns_bdev;
struct rw_semaphore ns_sem;
@@ -114,6 +121,7 @@ struct the_nilfs {
unsigned ns_sbwcount;
unsigned ns_sbsize;
unsigned ns_mount_state;
+ unsigned ns_sb_update_freq;
/*
* Following fields are dedicated to a writable FS-instance.
@@ -188,6 +196,11 @@ struct the_nilfs {
int ns_inode_size;
int ns_first_ino;
u32 ns_crc_seed;
+
+ /* /sys/fs/<nilfs>/<device> */
+ struct kobject ns_dev_kobj;
+ struct completion ns_dev_kobj_unregister;
+ struct nilfs_sysfs_dev_subgroups *ns_dev_subgroups;
};
#define THE_NILFS_FNS(bit, name) \
@@ -232,6 +245,8 @@ THE_NILFS_FNS(SB_DIRTY, sb_dirty)
* @ifile: inode file
* @inodes_count: number of inodes
* @blocks_count: number of blocks
+ * @snapshot_kobj: /sys/fs/<nilfs>/<device>/mounted_snapshots/<snapshot>
+ * @snapshot_kobj_unregister: completion state for kernel object
*/
struct nilfs_root {
__u64 cno;
@@ -243,6 +258,10 @@ struct nilfs_root {
atomic64_t inodes_count;
atomic64_t blocks_count;
+
+ /* /sys/fs/<nilfs>/<device>/mounted_snapshots/<snapshot> */
+ struct kobject snapshot_kobj;
+ struct completion snapshot_kobj_unregister;
};
/* Special checkpoint number */
@@ -254,7 +273,8 @@ struct nilfs_root {
static inline int nilfs_sb_need_update(struct the_nilfs *nilfs)
{
u64 t = get_seconds();
- return t < nilfs->ns_sbwtime || t > nilfs->ns_sbwtime + NILFS_SB_FREQ;
+ return t < nilfs->ns_sbwtime ||
+ t > nilfs->ns_sbwtime + nilfs->ns_sb_update_freq;
}
static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
@@ -353,4 +373,24 @@ static inline int nilfs_segment_is_active(struct the_nilfs *nilfs, __u64 n)
return n == nilfs->ns_segnum || n == nilfs->ns_nextnum;
}
+static inline int nilfs_flush_device(struct the_nilfs *nilfs)
+{
+ int err;
+
+ if (!nilfs_test_opt(nilfs, BARRIER) || nilfs->ns_flushed_device)
+ return 0;
+
+ nilfs->ns_flushed_device = 1;
+ /*
+ * the store to ns_flushed_device must not be reordered after
+ * blkdev_issue_flush().
+ */
+ smp_wmb();
+
+ err = blkdev_issue_flush(nilfs->ns_bdev, GFP_KERNEL, NULL);
+ if (err != -EIO)
+ err = 0;
+ return err;
+}
+
#endif /* _THE_NILFS_H */
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index abc8cbcfe90e..caaaf9dfe353 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -346,13 +346,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
goto out;
}
- error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
- if (error) {
- /* if we added, we must shoot */
- if (dn_mark == new_dn_mark)
- destroy = 1;
- goto out;
- }
+ __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
error = attach_dn(dn, dn_mark, id, fd, filp, mask);
/* !error means that we attached the dn to the dn_mark, so don't free it */
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index ee9cb3795c2b..30d3addfad75 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -70,8 +70,15 @@ static int fanotify_get_response(struct fsnotify_group *group,
wait_event(group->fanotify_data.access_waitq, event->response ||
atomic_read(&group->fanotify_data.bypass_perm));
- if (!event->response) /* bypass_perm set */
+ if (!event->response) { /* bypass_perm set */
+ /*
+ * Event was canceled because group is being destroyed. Remove
+ * it from group's event list because we are responsible for
+ * freeing the permission event.
+ */
+ fsnotify_remove_event(group, &event->fae.fse);
return 0;
+ }
/* userspace responded, convert to something usable */
switch (event->response) {
@@ -210,7 +217,7 @@ static int fanotify_handle_event(struct fsnotify_group *group,
return -ENOMEM;
fsn_event = &event->fse;
- ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge);
+ ret = fsnotify_add_event(group, fsn_event, fanotify_merge);
if (ret) {
/* Permission events shouldn't be merged */
BUG_ON(ret == 1 && mask & FAN_ALL_PERM_EVENTS);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 3fdc8a3e1134..c991616acca9 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -66,7 +66,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
/* held the notification_mutex the whole time, so this is the
* same event we peeked above */
- return fsnotify_remove_notify_event(group);
+ return fsnotify_remove_first_event(group);
}
static int create_fd(struct fsnotify_group *group,
@@ -78,7 +78,7 @@ static int create_fd(struct fsnotify_group *group,
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
- client_fd = get_unused_fd();
+ client_fd = get_unused_fd_flags(group->fanotify_data.f_flags);
if (client_fd < 0)
return client_fd;
@@ -359,6 +359,11 @@ static int fanotify_release(struct inode *ignored, struct file *file)
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
struct fanotify_perm_event_info *event, *next;
+ /*
+ * There may be still new events arriving in the notification queue
+ * but since userspace cannot use fanotify fd anymore, no event can
+ * enter or leave access_list by now.
+ */
spin_lock(&group->fanotify_data.access_lock);
atomic_inc(&group->fanotify_data.bypass_perm);
@@ -373,6 +378,13 @@ static int fanotify_release(struct inode *ignored, struct file *file)
}
spin_unlock(&group->fanotify_data.access_lock);
+ /*
+ * Since bypass_perm is set, newly queued events will not wait for
+ * access response. Wake up the already sleeping ones now.
+ * synchronize_srcu() in fsnotify_destroy_group() will wait for all
+ * processes sleeping in fanotify_handle_event() waiting for access
+ * response and thus also for all permission events to be freed.
+ */
wake_up(&group->fanotify_data.access_waitq);
#endif
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 238a5930cb3c..9d7e2b9659cb 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -42,7 +42,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
{
struct {
struct file_handle handle;
- u8 pad[64];
+ u8 pad[MAX_HANDLE_SZ];
} f;
int size, ret, i;
@@ -50,7 +50,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
size = f.handle.handle_bytes >> 2;
ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0);
- if ((ret == 255) || (ret == -ENOSPC)) {
+ if ((ret == FILEID_INVALID) || (ret < 0)) {
WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret);
return 0;
}
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 9d3e9c50066a..89326acd4561 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -229,8 +229,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
&fsnotify_mark_srcu);
}
+ /*
+ * We need to merge inode & vfsmount mark lists so that inode mark
+ * ignore masks are properly reflected for mount mark notifications.
+ * That's why this traversal is so complicated...
+ */
while (inode_node || vfsmount_node) {
- inode_group = vfsmount_group = NULL;
+ inode_group = NULL;
+ inode_mark = NULL;
+ vfsmount_group = NULL;
+ vfsmount_mark = NULL;
if (inode_node) {
inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu),
@@ -244,21 +252,19 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
vfsmount_group = vfsmount_mark->group;
}
- if (inode_group > vfsmount_group) {
- /* handle inode */
- ret = send_to_group(to_tell, inode_mark, NULL, mask,
- data, data_is, cookie, file_name);
- /* we didn't use the vfsmount_mark */
- vfsmount_group = NULL;
- } else if (vfsmount_group > inode_group) {
- ret = send_to_group(to_tell, NULL, vfsmount_mark, mask,
- data, data_is, cookie, file_name);
- inode_group = NULL;
- } else {
- ret = send_to_group(to_tell, inode_mark, vfsmount_mark,
- mask, data, data_is, cookie,
- file_name);
+ if (inode_group && vfsmount_group) {
+ int cmp = fsnotify_compare_groups(inode_group,
+ vfsmount_group);
+ if (cmp > 0) {
+ inode_group = NULL;
+ inode_mark = NULL;
+ } else if (cmp < 0) {
+ vfsmount_group = NULL;
+ vfsmount_mark = NULL;
+ }
}
+ ret = send_to_group(to_tell, inode_mark, vfsmount_mark, mask,
+ data, data_is, cookie, file_name);
if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
goto out;
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 85e7d2b431d9..3b68b0ae0a97 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -12,6 +12,10 @@ extern void fsnotify_flush_notify(struct fsnotify_group *group);
/* protects reads of inode and vfsmount marks list */
extern struct srcu_struct fsnotify_mark_srcu;
+/* compare two groups for sorting of marks lists */
+extern int fsnotify_compare_groups(struct fsnotify_group *a,
+ struct fsnotify_group *b);
+
extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark,
__u32 mask);
/* add a mark to an inode */
@@ -23,9 +27,6 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
struct fsnotify_group *group, struct vfsmount *mnt,
int allow_dups);
-/* final kfree of a group */
-extern void fsnotify_final_destroy_group(struct fsnotify_group *group);
-
/* vfsmount specific destruction of a mark */
extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
/* inode specific destruction of a mark */
diff --git a/fs/notify/group.c b/fs/notify/group.c
index ad1995980456..d16b62cb2854 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -31,7 +31,7 @@
/*
* Final freeing of a group
*/
-void fsnotify_final_destroy_group(struct fsnotify_group *group)
+static void fsnotify_final_destroy_group(struct fsnotify_group *group)
{
if (group->ops->free_group_priv)
group->ops->free_group_priv(group);
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 74825be65b7b..dfbf5447eea4 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -194,6 +194,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
{
struct fsnotify_mark *lmark, *last = NULL;
int ret = 0;
+ int cmp;
mark->flags |= FSNOTIFY_MARK_FLAG_INODE;
@@ -219,11 +220,8 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
goto out;
}
- if (mark->group->priority < lmark->group->priority)
- continue;
-
- if ((mark->group->priority == lmark->group->priority) &&
- (mark->group < lmark->group))
+ cmp = fsnotify_compare_groups(lmark->group, mark->group);
+ if (cmp < 0)
continue;
hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list);
@@ -232,7 +230,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
BUG_ON(last == NULL);
/* mark should be the last entry. last is the current last entry */
- hlist_add_after_rcu(&last->i.i_list, &mark->i.i_list);
+ hlist_add_behind_rcu(&mark->i.i_list, &last->i.i_list);
out:
fsnotify_recalc_inode_mask_locked(inode);
spin_unlock(&inode->i_lock);
@@ -288,20 +286,25 @@ void fsnotify_unmount_inodes(struct list_head *list)
spin_unlock(&inode->i_lock);
/* In case the dropping of a reference would nuke next_i. */
- if ((&next_i->i_sb_list != list) &&
- atomic_read(&next_i->i_count)) {
+ while (&next_i->i_sb_list != list) {
spin_lock(&next_i->i_lock);
- if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
+ if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) &&
+ atomic_read(&next_i->i_count)) {
__iget(next_i);
need_iput = next_i;
+ spin_unlock(&next_i->i_lock);
+ break;
}
spin_unlock(&next_i->i_lock);
+ next_i = list_entry(next_i->i_sb_list.next,
+ struct inode, i_sb_list);
}
/*
- * We can safely drop inode_sb_list_lock here because we hold
- * references on both inode and next_i. Also no new inodes
- * will be added since the umount has begun.
+ * We can safely drop inode_sb_list_lock here because either
+ * we actually hold references on both inode and next_i or
+ * end of list. Also no new inodes will be added since the
+ * umount has begun.
*/
spin_unlock(&inode_sb_list_lock);
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 43ab1e1a07a2..7d888d77d59a 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -108,7 +108,7 @@ int inotify_handle_event(struct fsnotify_group *group,
if (len)
strcpy(event->name, file_name);
- ret = fsnotify_add_notify_event(group, fsn_event, inotify_merge);
+ ret = fsnotify_add_event(group, fsn_event, inotify_merge);
if (ret) {
/* Our event wasn't used in the end. Free it. */
fsnotify_destroy_event(group, fsn_event);
@@ -165,8 +165,10 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
/* ideally the idr is empty and we won't hit the BUG in the callback */
idr_for_each(&group->inotify_data.idr, idr_callback, group);
idr_destroy(&group->inotify_data.idr);
- atomic_dec(&group->inotify_data.user->inotify_devs);
- free_uid(group->inotify_data.user);
+ if (group->inotify_data.user) {
+ atomic_dec(&group->inotify_data.user->inotify_devs);
+ free_uid(group->inotify_data.user);
+ }
}
static void inotify_free_event(struct fsnotify_event *fsn_event)
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index cc423a30a0c8..daf76652fe58 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -149,7 +149,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
if (fsnotify_notify_queue_is_empty(group))
return NULL;
- event = fsnotify_peek_notify_event(group);
+ event = fsnotify_peek_first_event(group);
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
@@ -159,7 +159,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
/* held the notification_mutex the whole time, so this is the
* same event we peeked above */
- fsnotify_remove_notify_event(group);
+ fsnotify_remove_first_event(group);
return event;
}
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index d90deaa08e78..34c38fabf514 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -210,6 +210,42 @@ void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mas
}
/*
+ * Sorting function for lists of fsnotify marks.
+ *
+ * Fanotify supports different notification classes (reflected as priority of
+ * notification group). Events shall be passed to notification groups in
+ * decreasing priority order. To achieve this marks in notification lists for
+ * inodes and vfsmounts are sorted so that priorities of corresponding groups
+ * are descending.
+ *
+ * Furthermore correct handling of the ignore mask requires processing inode
+ * and vfsmount marks of each group together. Using the group address as
+ * further sort criterion provides a unique sorting order and thus we can
+ * merge inode and vfsmount lists of marks in linear time and find groups
+ * present in both lists.
+ *
+ * A return value of 1 signifies that b has priority over a.
+ * A return value of 0 signifies that the two marks have to be handled together.
+ * A return value of -1 signifies that a has priority over b.
+ */
+int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
+{
+ if (a == b)
+ return 0;
+ if (!a)
+ return 1;
+ if (!b)
+ return -1;
+ if (a->priority < b->priority)
+ return 1;
+ if (a->priority > b->priority)
+ return -1;
+ if (a < b)
+ return 1;
+ return -1;
+}
+
+/*
* Attach an initialized mark to a given group and fs object.
* These marks may be used for the fsnotify backend to determine which
* event types should be delivered to which group.
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 1e58402171a5..a95d8e037aeb 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -73,7 +73,8 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
/* Overflow events are per-group and we don't want to free them */
if (!event || event->mask == FS_Q_OVERFLOW)
return;
-
+ /* If the event is still queued, we have a problem... */
+ WARN_ON(!list_empty(&event->list));
group->ops->free_event(event);
}
@@ -83,10 +84,10 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
* added to the queue, 1 if the event was merged with some other queued event,
* 2 if the queue of events has overflown.
*/
-int fsnotify_add_notify_event(struct fsnotify_group *group,
- struct fsnotify_event *event,
- int (*merge)(struct list_head *,
- struct fsnotify_event *))
+int fsnotify_add_event(struct fsnotify_group *group,
+ struct fsnotify_event *event,
+ int (*merge)(struct list_head *,
+ struct fsnotify_event *))
{
int ret = 0;
struct list_head *list = &group->notification_list;
@@ -125,10 +126,25 @@ queue:
}
/*
+ * Remove @event from group's notification queue. It is the responsibility of
+ * the caller to destroy the event.
+ */
+void fsnotify_remove_event(struct fsnotify_group *group,
+ struct fsnotify_event *event)
+{
+ mutex_lock(&group->notification_mutex);
+ if (!list_empty(&event->list)) {
+ list_del_init(&event->list);
+ group->q_len--;
+ }
+ mutex_unlock(&group->notification_mutex);
+}
+
+/*
* Remove and return the first event from the notification list. It is the
* responsibility of the caller to destroy the obtained event
*/
-struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group)
+struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
{
struct fsnotify_event *event;
@@ -140,7 +156,7 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group
struct fsnotify_event, list);
/*
* We need to init list head for the case of overflow event so that
- * check in fsnotify_add_notify_events() works
+ * check in fsnotify_add_event() works
*/
list_del_init(&event->list);
group->q_len--;
@@ -149,9 +165,10 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group
}
/*
- * This will not remove the event, that must be done with fsnotify_remove_notify_event()
+ * This will not remove the event, that must be done with
+ * fsnotify_remove_first_event()
*/
-struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group)
+struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group)
{
BUG_ON(!mutex_is_locked(&group->notification_mutex));
@@ -169,7 +186,7 @@ void fsnotify_flush_notify(struct fsnotify_group *group)
mutex_lock(&group->notification_mutex);
while (!fsnotify_notify_queue_is_empty(group)) {
- event = fsnotify_remove_notify_event(group);
+ event = fsnotify_remove_first_event(group);
fsnotify_destroy_event(group, event);
}
mutex_unlock(&group->notification_mutex);
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 68ca5a8704b5..faefa72a11eb 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -153,6 +153,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
struct mount *m = real_mount(mnt);
struct fsnotify_mark *lmark, *last = NULL;
int ret = 0;
+ int cmp;
mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT;
@@ -178,11 +179,8 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
goto out;
}
- if (mark->group->priority < lmark->group->priority)
- continue;
-
- if ((mark->group->priority == lmark->group->priority) &&
- (mark->group < lmark->group))
+ cmp = fsnotify_compare_groups(lmark->group, mark->group);
+ if (cmp < 0)
continue;
hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list);
@@ -191,7 +189,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
BUG_ON(last == NULL);
/* mark should be the last entry. last is the current last entry */
- hlist_add_after_rcu(&last->m.m_list, &mark->m.m_list);
+ hlist_add_behind_rcu(&mark->m.m_list, &last->m.m_list);
out:
fsnotify_recalc_vfsmount_mask_locked(mnt);
spin_unlock(&mnt->mnt_root->d_lock);
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 30206b238433..36ae529511c4 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -8,7 +8,7 @@ ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o
-ccflags-y := -DNTFS_VERSION=\"2.1.30\"
+ccflags-y := -DNTFS_VERSION=\"2.1.31\"
ccflags-$(CONFIG_NTFS_DEBUG) += -DDEBUG
ccflags-$(CONFIG_NTFS_RW) += -DNTFS_RW
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index d267ea6aa1a0..7521e11db728 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1,8 +1,7 @@
/**
* aops.c - NTFS kernel address space operations and page cache handling.
- * Part of the Linux-NTFS project.
*
- * Copyright (c) 2001-2007 Anton Altaparmakov
+ * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
* Copyright (c) 2002 Richard Russon
*
* This program/include file is free software; you can redistribute it and/or
@@ -1539,16 +1538,157 @@ err_out:
#endif /* NTFS_RW */
/**
- * ntfs_aops - general address space operations for inodes and attributes
+ * ntfs_bmap - map logical file block to physical device block
+ * @mapping: address space mapping to which the block to be mapped belongs
+ * @block: logical block to map to its physical device block
+ *
+ * For regular, non-resident files (i.e. not compressed and not encrypted), map
+ * the logical @block belonging to the file described by the address space
+ * mapping @mapping to its physical device block.
+ *
+ * The size of the block is equal to the @s_blocksize field of the super block
+ * of the mounted file system which is guaranteed to be smaller than or equal
+ * to the cluster size thus the block is guaranteed to fit entirely inside the
+ * cluster which means we do not need to care how many contiguous bytes are
+ * available after the beginning of the block.
+ *
+ * Return the physical device block if the mapping succeeded or 0 if the block
+ * is sparse or there was an error.
+ *
+ * Note: This is a problem if someone tries to run bmap() on $Boot system file
+ * as that really is in block zero but there is nothing we can do. bmap() is
+ * just broken in that respect (just like it cannot distinguish sparse from
+ * not available or error).
*/
-const struct address_space_operations ntfs_aops = {
- .readpage = ntfs_readpage, /* Fill page with data. */
+static sector_t ntfs_bmap(struct address_space *mapping, sector_t block)
+{
+ s64 ofs, size;
+ loff_t i_size;
+ LCN lcn;
+ unsigned long blocksize, flags;
+ ntfs_inode *ni = NTFS_I(mapping->host);
+ ntfs_volume *vol = ni->vol;
+ unsigned delta;
+ unsigned char blocksize_bits, cluster_size_shift;
+
+ ntfs_debug("Entering for mft_no 0x%lx, logical block 0x%llx.",
+ ni->mft_no, (unsigned long long)block);
+ if (ni->type != AT_DATA || !NInoNonResident(ni) || NInoEncrypted(ni)) {
+ ntfs_error(vol->sb, "BMAP does not make sense for %s "
+ "attributes, returning 0.",
+ (ni->type != AT_DATA) ? "non-data" :
+ (!NInoNonResident(ni) ? "resident" :
+ "encrypted"));
+ return 0;
+ }
+ /* None of these can happen. */
+ BUG_ON(NInoCompressed(ni));
+ BUG_ON(NInoMstProtected(ni));
+ blocksize = vol->sb->s_blocksize;
+ blocksize_bits = vol->sb->s_blocksize_bits;
+ ofs = (s64)block << blocksize_bits;
+ read_lock_irqsave(&ni->size_lock, flags);
+ size = ni->initialized_size;
+ i_size = i_size_read(VFS_I(ni));
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ /*
+ * If the offset is outside the initialized size or the block straddles
+ * the initialized size then pretend it is a hole unless the
+ * initialized size equals the file size.
+ */
+ if (unlikely(ofs >= size || (ofs + blocksize > size && size < i_size)))
+ goto hole;
+ cluster_size_shift = vol->cluster_size_bits;
+ down_read(&ni->runlist.lock);
+ lcn = ntfs_attr_vcn_to_lcn_nolock(ni, ofs >> cluster_size_shift, false);
+ up_read(&ni->runlist.lock);
+ if (unlikely(lcn < LCN_HOLE)) {
+ /*
+ * Step down to an integer to avoid gcc doing a long long
+ * comparision in the switch when we know @lcn is between
+ * LCN_HOLE and LCN_EIO (i.e. -1 to -5).
+ *
+ * Otherwise older gcc (at least on some architectures) will
+ * try to use __cmpdi2() which is of course not available in
+ * the kernel.
+ */
+ switch ((int)lcn) {
+ case LCN_ENOENT:
+ /*
+ * If the offset is out of bounds then pretend it is a
+ * hole.
+ */
+ goto hole;
+ case LCN_ENOMEM:
+ ntfs_error(vol->sb, "Not enough memory to complete "
+ "mapping for inode 0x%lx. "
+ "Returning 0.", ni->mft_no);
+ break;
+ default:
+ ntfs_error(vol->sb, "Failed to complete mapping for "
+ "inode 0x%lx. Run chkdsk. "
+ "Returning 0.", ni->mft_no);
+ break;
+ }
+ return 0;
+ }
+ if (lcn < 0) {
+ /* It is a hole. */
+hole:
+ ntfs_debug("Done (returning hole).");
+ return 0;
+ }
+ /*
+ * The block is really allocated and fullfils all our criteria.
+ * Convert the cluster to units of block size and return the result.
+ */
+ delta = ofs & vol->cluster_size_mask;
+ if (unlikely(sizeof(block) < sizeof(lcn))) {
+ block = lcn = ((lcn << cluster_size_shift) + delta) >>
+ blocksize_bits;
+ /* If the block number was truncated return 0. */
+ if (unlikely(block != lcn)) {
+ ntfs_error(vol->sb, "Physical block 0x%llx is too "
+ "large to be returned, returning 0.",
+ (long long)lcn);
+ return 0;
+ }
+ } else
+ block = ((lcn << cluster_size_shift) + delta) >>
+ blocksize_bits;
+ ntfs_debug("Done (returning block 0x%llx).", (unsigned long long)lcn);
+ return block;
+}
+
+/**
+ * ntfs_normal_aops - address space operations for normal inodes and attributes
+ *
+ * Note these are not used for compressed or mst protected inodes and
+ * attributes.
+ */
+const struct address_space_operations ntfs_normal_aops = {
+ .readpage = ntfs_readpage,
#ifdef NTFS_RW
- .writepage = ntfs_writepage, /* Write dirty page to disk. */
+ .writepage = ntfs_writepage,
+ .set_page_dirty = __set_page_dirty_buffers,
+#endif /* NTFS_RW */
+ .bmap = ntfs_bmap,
+ .migratepage = buffer_migrate_page,
+ .is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
+};
+
+/**
+ * ntfs_compressed_aops - address space operations for compressed inodes
+ */
+const struct address_space_operations ntfs_compressed_aops = {
+ .readpage = ntfs_readpage,
+#ifdef NTFS_RW
+ .writepage = ntfs_writepage,
+ .set_page_dirty = __set_page_dirty_buffers,
#endif /* NTFS_RW */
- .migratepage = buffer_migrate_page, /* Move a page cache page from
- one physical page to an
- other. */
+ .migratepage = buffer_migrate_page,
+ .is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
@@ -1564,9 +1704,8 @@ const struct address_space_operations ntfs_mst_aops = {
without touching the buffers
belonging to the page. */
#endif /* NTFS_RW */
- .migratepage = buffer_migrate_page, /* Move a page cache page from
- one physical page to an
- other. */
+ .migratepage = buffer_migrate_page,
+ .is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c
index dd6103cc93c1..825a54e8f490 100644
--- a/fs/ntfs/debug.c
+++ b/fs/ntfs/debug.c
@@ -112,7 +112,7 @@ void __ntfs_error(const char *function, const struct super_block *sb,
/* If 1, output debug messages, and if 0, don't. */
int debug_msgs = 0;
-void __ntfs_debug (const char *file, int line, const char *function,
+void __ntfs_debug(const char *file, int line, const char *function,
const char *fmt, ...)
{
struct va_format vaf;
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 5c9e2c81cb11..643faa44f22b 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
/*
* file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
*
- * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
+ * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
*
* This program/include file is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published
@@ -74,8 +74,6 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
* ntfs_attr_extend_initialized - extend the initialized size of an attribute
* @ni: ntfs inode of the attribute to extend
* @new_init_size: requested new initialized size in bytes
- * @cached_page: store any allocated but unused page here
- * @lru_pvec: lru-buffering pagevec of the caller
*
* Extend the initialized size of an attribute described by the ntfs inode @ni
* to @new_init_size bytes. This involves zeroing any non-sparse space between
@@ -395,7 +393,6 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
* @nr_pages: number of page cache pages to obtain
* @pages: array of pages in which to return the obtained page cache pages
* @cached_page: allocated but as yet unused page
- * @lru_pvec: lru-buffering pagevec of caller
*
* Obtain @nr_pages locked page cache pages from the mapping @mapping and
* starting at index @index.
@@ -413,7 +410,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
BUG_ON(!nr_pages);
err = nr = 0;
do {
- pages[nr] = find_lock_page(mapping, index);
+ pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK |
+ FGP_ACCESSED);
if (!pages[nr]) {
if (!*cached_page) {
*cached_page = page_cache_alloc(mapping);
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index f47af5e6e230..898b9949d363 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1,7 +1,7 @@
/**
- * inode.c - NTFS kernel inode handling. Part of the Linux-NTFS project.
+ * inode.c - NTFS kernel inode handling.
*
- * Copyright (c) 2001-2007 Anton Altaparmakov
+ * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
*
* This program/include file is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published
@@ -1012,6 +1012,7 @@ skip_large_dir_stuff:
/* Setup the operations for this inode. */
vi->i_op = &ntfs_dir_inode_ops;
vi->i_fop = &ntfs_dir_ops;
+ vi->i_mapping->a_ops = &ntfs_mst_aops;
} else {
/* It is a file. */
ntfs_attr_reinit_search_ctx(ctx);
@@ -1160,11 +1161,12 @@ no_data_attr_special_case:
/* Setup the operations for this inode. */
vi->i_op = &ntfs_file_inode_ops;
vi->i_fop = &ntfs_file_ops;
+ vi->i_mapping->a_ops = &ntfs_normal_aops;
+ if (NInoMstProtected(ni))
+ vi->i_mapping->a_ops = &ntfs_mst_aops;
+ else if (NInoCompressed(ni))
+ vi->i_mapping->a_ops = &ntfs_compressed_aops;
}
- if (NInoMstProtected(ni))
- vi->i_mapping->a_ops = &ntfs_mst_aops;
- else
- vi->i_mapping->a_ops = &ntfs_aops;
/*
* The number of 512-byte blocks used on disk (for stat). This is in so
* far inaccurate as it doesn't account for any named streams or other
@@ -1414,10 +1416,11 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
ni->allocated_size = sle64_to_cpu(
a->data.non_resident.allocated_size);
}
+ vi->i_mapping->a_ops = &ntfs_normal_aops;
if (NInoMstProtected(ni))
vi->i_mapping->a_ops = &ntfs_mst_aops;
- else
- vi->i_mapping->a_ops = &ntfs_aops;
+ else if (NInoCompressed(ni))
+ vi->i_mapping->a_ops = &ntfs_compressed_aops;
if ((NInoCompressed(ni) || NInoSparse(ni)) && ni->type != AT_INDEX_ROOT)
vi->i_blocks = ni->itype.compressed.size >> 9;
else
diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h
index d6a340bf80fc..c581e26a350d 100644
--- a/fs/ntfs/ntfs.h
+++ b/fs/ntfs/ntfs.h
@@ -1,8 +1,7 @@
/*
- * ntfs.h - Defines for NTFS Linux kernel driver. Part of the Linux-NTFS
- * project.
+ * ntfs.h - Defines for NTFS Linux kernel driver.
*
- * Copyright (c) 2001-2005 Anton Altaparmakov
+ * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
* Copyright (C) 2002 Richard Russon
*
* This program/include file is free software; you can redistribute it and/or
@@ -57,7 +56,8 @@ extern struct kmem_cache *ntfs_attr_ctx_cache;
extern struct kmem_cache *ntfs_index_ctx_cache;
/* The various operations structs defined throughout the driver files. */
-extern const struct address_space_operations ntfs_aops;
+extern const struct address_space_operations ntfs_normal_aops;
+extern const struct address_space_operations ntfs_compressed_aops;
extern const struct address_space_operations ntfs_mst_aops;
extern const struct file_operations ntfs_file_ops;
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 6c3296e546c3..9e1e112074fb 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3208,7 +3208,7 @@ static void __exit exit_ntfs_fs(void)
}
MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>");
-MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.");
+MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.");
MODULE_VERSION(NTFS_VERSION);
MODULE_LICENSE("GPL");
#ifdef DEBUG
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 9d8fcf2f3b94..a93bf9892256 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -4961,6 +4961,15 @@ leftright:
el = path_leaf_el(path);
split_index = ocfs2_search_extent_list(el, cpos);
+ if (split_index == -1) {
+ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has an extent at cpos %u "
+ "which can no longer be found.\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ cpos);
+ ret = -EROFS;
+ goto out;
+ }
goto leftright;
}
out:
@@ -5135,7 +5144,7 @@ int ocfs2_change_extent_flag(handle_t *handle,
el = path_leaf_el(left_path);
index = ocfs2_search_extent_list(el, cpos);
- if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+ if (index == -1) {
ocfs2_error(sb,
"Owner %llu has an extent at cpos %u which can no "
"longer be found.\n",
@@ -5491,7 +5500,7 @@ int ocfs2_remove_extent(handle_t *handle,
el = path_leaf_el(path);
index = ocfs2_search_extent_list(el, cpos);
- if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+ if (index == -1) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
"Owner %llu has an extent at cpos %u which can no "
"longer be found.\n",
@@ -5557,7 +5566,7 @@ int ocfs2_remove_extent(handle_t *handle,
el = path_leaf_el(path);
index = ocfs2_search_extent_list(el, cpos);
- if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+ if (index == -1) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
"Owner %llu: split at cpos %u lost record.",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 4a231a166cf8..1ef547e49373 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1481,8 +1481,16 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
handle_t *handle;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
page = find_or_create_page(mapping, 0, GFP_NOFS);
if (!page) {
+ ocfs2_commit_trans(osb, handle);
ret = -ENOMEM;
mlog_errno(ret);
goto out;
@@ -1494,13 +1502,6 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
wc->w_pages[0] = wc->w_target_page = page;
wc->w_num_pages = 1;
- handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- mlog_errno(ret);
- goto out;
- }
-
ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 73039295d0d1..eb9d48746ab4 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -2244,7 +2244,7 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
return -EINVAL;
for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) {
- if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len))
+ if (strncasecmp(page, o2hb_heartbeat_mode_desc[i], len))
continue;
ret = o2hb_global_heartbeat_mode_set(i);
@@ -2572,6 +2572,25 @@ int o2hb_check_node_heartbeating(u8 node_num)
}
EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
+int o2hb_check_node_heartbeating_no_sem(u8 node_num)
+{
+ unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned long flags;
+
+ spin_lock_irqsave(&o2hb_live_lock, flags);
+ o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
+ spin_unlock_irqrestore(&o2hb_live_lock, flags);
+ if (!test_bit(node_num, testing_map)) {
+ mlog(ML_HEARTBEAT,
+ "node (%u) does not have heartbeating enabled.\n",
+ node_num);
+ return 0;
+ }
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_no_sem);
+
int o2hb_check_node_heartbeating_from_callback(u8 node_num)
{
unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 00ad8e8fea51..3ef5137dc362 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -80,6 +80,7 @@ void o2hb_fill_node_map(unsigned long *map,
void o2hb_exit(void);
int o2hb_init(void);
int o2hb_check_node_heartbeating(u8 node_num);
+int o2hb_check_node_heartbeating_no_sem(u8 node_num);
int o2hb_check_node_heartbeating_from_callback(u8 node_num);
int o2hb_check_local_node_heartbeating(void);
void o2hb_stop_all_regions(void);
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 07ac24fd9252..af7598bff1b5 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -49,13 +49,13 @@ static ssize_t mlog_mask_show(u64 mask, char *buf)
static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count)
{
- if (!strnicmp(buf, "allow", 5)) {
+ if (!strncasecmp(buf, "allow", 5)) {
__mlog_set_u64(mask, mlog_and_bits);
__mlog_clear_u64(mask, mlog_not_bits);
- } else if (!strnicmp(buf, "deny", 4)) {
+ } else if (!strncasecmp(buf, "deny", 4)) {
__mlog_set_u64(mask, mlog_not_bits);
__mlog_clear_u64(mask, mlog_and_bits);
- } else if (!strnicmp(buf, "off", 3)) {
+ } else if (!strncasecmp(buf, "off", 3)) {
__mlog_clear_u64(mask, mlog_not_bits);
__mlog_clear_u64(mask, mlog_and_bits);
} else
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 73ba81928bce..27d1242c8383 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -185,29 +185,13 @@ static const struct seq_operations nst_seq_ops = {
static int nst_fop_open(struct inode *inode, struct file *file)
{
struct o2net_send_tracking *dummy_nst;
- struct seq_file *seq;
- int ret;
- dummy_nst = kmalloc(sizeof(struct o2net_send_tracking), GFP_KERNEL);
- if (dummy_nst == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- dummy_nst->st_task = NULL;
-
- ret = seq_open(file, &nst_seq_ops);
- if (ret)
- goto out;
-
- seq = file->private_data;
- seq->private = dummy_nst;
+ dummy_nst = __seq_open_private(file, &nst_seq_ops, sizeof(*dummy_nst));
+ if (!dummy_nst)
+ return -ENOMEM;
o2net_debug_add_nst(dummy_nst);
- dummy_nst = NULL;
-
-out:
- kfree(dummy_nst);
- return ret;
+ return 0;
}
static int nst_fop_release(struct inode *inode, struct file *file)
@@ -412,33 +396,27 @@ static const struct seq_operations sc_seq_ops = {
.show = sc_seq_show,
};
-static int sc_common_open(struct file *file, struct o2net_sock_debug *sd)
+static int sc_common_open(struct file *file, int ctxt)
{
+ struct o2net_sock_debug *sd;
struct o2net_sock_container *dummy_sc;
- struct seq_file *seq;
- int ret;
- dummy_sc = kmalloc(sizeof(struct o2net_sock_container), GFP_KERNEL);
- if (dummy_sc == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- dummy_sc->sc_page = NULL;
+ dummy_sc = kzalloc(sizeof(*dummy_sc), GFP_KERNEL);
+ if (!dummy_sc)
+ return -ENOMEM;
- ret = seq_open(file, &sc_seq_ops);
- if (ret)
- goto out;
+ sd = __seq_open_private(file, &sc_seq_ops, sizeof(*sd));
+ if (!sd) {
+ kfree(dummy_sc);
+ return -ENOMEM;
+ }
- seq = file->private_data;
- seq->private = sd;
+ sd->dbg_ctxt = ctxt;
sd->dbg_sock = dummy_sc;
- o2net_debug_add_sc(dummy_sc);
- dummy_sc = NULL;
+ o2net_debug_add_sc(dummy_sc);
-out:
- kfree(dummy_sc);
- return ret;
+ return 0;
}
static int sc_fop_release(struct inode *inode, struct file *file)
@@ -453,16 +431,7 @@ static int sc_fop_release(struct inode *inode, struct file *file)
static int stats_fop_open(struct inode *inode, struct file *file)
{
- struct o2net_sock_debug *sd;
-
- sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
- if (sd == NULL)
- return -ENOMEM;
-
- sd->dbg_ctxt = SHOW_SOCK_STATS;
- sd->dbg_sock = NULL;
-
- return sc_common_open(file, sd);
+ return sc_common_open(file, SHOW_SOCK_STATS);
}
static const struct file_operations stats_seq_fops = {
@@ -474,16 +443,7 @@ static const struct file_operations stats_seq_fops = {
static int sc_fop_open(struct inode *inode, struct file *file)
{
- struct o2net_sock_debug *sd;
-
- sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
- if (sd == NULL)
- return -ENOMEM;
-
- sd->dbg_ctxt = SHOW_SOCK_CONTAINERS;
- sd->dbg_sock = NULL;
-
- return sc_common_open(file, sd);
+ return sc_common_open(file, SHOW_SOCK_CONTAINERS);
}
static const struct file_operations sc_seq_fops = {
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 1ec141e758d7..62e8ec619b4c 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -160,9 +160,18 @@ static void o2quo_make_decision(struct work_struct *work)
}
out:
- spin_unlock(&qs->qs_lock);
- if (fence)
+ if (fence) {
+ spin_unlock(&qs->qs_lock);
o2quo_fence_self();
+ } else {
+ mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, "
+ "connected: %d, lowest: %d (%sreachable)\n",
+ qs->qs_heartbeating, qs->qs_connected, lowest_hb,
+ lowest_reachable ? "" : "un");
+ spin_unlock(&qs->qs_lock);
+
+ }
+
}
static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 681691bc233a..a96044004064 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -536,7 +536,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
if (nn->nn_persistent_error || nn->nn_sc_valid)
wake_up(&nn->nn_sc_wq);
- if (!was_err && nn->nn_persistent_error) {
+ if (was_valid && !was_err && nn->nn_persistent_error) {
o2quo_conn_err(o2net_num_from_nn(nn));
queue_delayed_work(o2net_wq, &nn->nn_still_up,
msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
@@ -925,7 +925,7 @@ static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec,
size_t veclen, size_t total)
{
int ret;
- struct msghdr msg;
+ struct msghdr msg = {.msg_flags = 0,};
if (sock == NULL) {
ret = -EINVAL;
@@ -1480,6 +1480,14 @@ static int o2net_set_nodelay(struct socket *sock)
return ret;
}
+static int o2net_set_usertimeout(struct socket *sock)
+{
+ int user_timeout = O2NET_TCP_USER_TIMEOUT;
+
+ return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
+ (char *)&user_timeout, sizeof(user_timeout));
+}
+
static void o2net_initialize_handshake(void)
{
o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32(
@@ -1536,16 +1544,20 @@ static void o2net_idle_timer(unsigned long data)
#endif
printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been "
- "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc),
- msecs / 1000, msecs % 1000);
+ "idle for %lu.%lu secs.\n",
+ SC_NODEF_ARGS(sc), msecs / 1000, msecs % 1000);
- /*
- * Initialize the nn_timeout so that the next connection attempt
- * will continue in o2net_start_connect.
+ /* idle timerout happen, don't shutdown the connection, but
+ * make fence decision. Maybe the connection can recover before
+ * the decision is made.
*/
atomic_set(&nn->nn_timeout, 1);
+ o2quo_conn_err(o2net_num_from_nn(nn));
+ queue_delayed_work(o2net_wq, &nn->nn_still_up,
+ msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
+
+ o2net_sc_reset_idle_timer(sc);
- o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
}
static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
@@ -1560,6 +1572,15 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
{
+ struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+
+ /* clear fence decision since the connection recover from timeout*/
+ if (atomic_read(&nn->nn_timeout)) {
+ o2quo_conn_up(o2net_num_from_nn(nn));
+ cancel_delayed_work(&nn->nn_still_up);
+ atomic_set(&nn->nn_timeout, 0);
+ }
+
/* Only push out an existing timer */
if (timer_pending(&sc->sc_idle_timeout))
o2net_sc_reset_idle_timer(sc);
@@ -1580,7 +1601,15 @@ static void o2net_start_connect(struct work_struct *work)
struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
int ret = 0, stop;
unsigned int timeout;
+ unsigned int noio_flag;
+ /*
+ * sock_create allocates the sock with GFP_KERNEL. We must set
+ * per-process flag PF_MEMALLOC_NOIO so that all allocations done
+ * by this process are done as if GFP_NOIO was specified. So we
+ * are not reentering filesystem while doing memory reclaim.
+ */
+ noio_flag = memalloc_noio_save();
/* if we're greater we initiate tx, otherwise we accept */
if (o2nm_this_node() <= o2net_num_from_nn(nn))
goto out;
@@ -1650,6 +1679,12 @@ static void o2net_start_connect(struct work_struct *work)
goto out;
}
+ ret = o2net_set_usertimeout(sock);
+ if (ret) {
+ mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret);
+ goto out;
+ }
+
o2net_register_callbacks(sc->sc_sock->sk, sc);
spin_lock(&nn->nn_lock);
@@ -1683,6 +1718,7 @@ out:
if (mynode)
o2nm_node_put(mynode);
+ memalloc_noio_restore(noio_flag);
return;
}
@@ -1694,7 +1730,8 @@ static void o2net_connect_expired(struct work_struct *work)
spin_lock(&nn->nn_lock);
if (!nn->nn_sc_valid) {
printk(KERN_NOTICE "o2net: No connection established with "
- "node %u after %u.%u seconds, giving up.\n",
+ "node %u after %u.%u seconds, check network and"
+ " cluster configuration.\n",
o2net_num_from_nn(nn),
o2net_idle_timeout() / 1000,
o2net_idle_timeout() % 1000);
@@ -1808,6 +1845,15 @@ static int o2net_accept_one(struct socket *sock, int *more)
struct o2nm_node *local_node = NULL;
struct o2net_sock_container *sc = NULL;
struct o2net_node *nn;
+ unsigned int noio_flag;
+
+ /*
+ * sock_create_lite allocates the sock with GFP_KERNEL. We must set
+ * per-process flag PF_MEMALLOC_NOIO so that all allocations done
+ * by this process are done as if GFP_NOIO was specified. So we
+ * are not reentering filesystem while doing memory reclaim.
+ */
+ noio_flag = memalloc_noio_save();
BUG_ON(sock == NULL);
*more = 0;
@@ -1831,6 +1877,12 @@ static int o2net_accept_one(struct socket *sock, int *more)
goto out;
}
+ ret = o2net_set_usertimeout(new_sock);
+ if (ret) {
+ mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret);
+ goto out;
+ }
+
slen = sizeof(sin);
ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin,
&slen, 1);
@@ -1918,6 +1970,8 @@ out:
o2nm_node_put(local_node);
if (sc)
sc_put(sc);
+
+ memalloc_noio_restore(noio_flag);
return ret;
}
@@ -2113,17 +2167,13 @@ int o2net_init(void)
o2quo_init();
if (o2net_debugfs_init())
- return -ENOMEM;
+ goto out;
o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL);
o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
- if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) {
- kfree(o2net_hand);
- kfree(o2net_keep_req);
- kfree(o2net_keep_resp);
- return -ENOMEM;
- }
+ if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp)
+ goto out;
o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION);
o2net_hand->connector_id = cpu_to_be64(1);
@@ -2148,6 +2198,14 @@ int o2net_init(void)
}
return 0;
+
+out:
+ kfree(o2net_hand);
+ kfree(o2net_keep_req);
+ kfree(o2net_keep_resp);
+
+ o2quo_exit();
+ return -ENOMEM;
}
void o2net_exit(void)
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index 5bada2a69b50..c571e849fda4 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -63,6 +63,7 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data,
#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000
#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000
+#define O2NET_TCP_USER_TIMEOUT 0x7fffffff
/* TODO: figure this out.... */
static inline int o2net_link_down(int err, struct socket *sock)
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 18f13c2e4a10..149eb556b8c6 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -647,41 +647,30 @@ static const struct seq_operations debug_lockres_ops = {
static int debug_lockres_open(struct inode *inode, struct file *file)
{
struct dlm_ctxt *dlm = inode->i_private;
- int ret = -ENOMEM;
- struct seq_file *seq;
- struct debug_lockres *dl = NULL;
+ struct debug_lockres *dl;
+ void *buf;
- dl = kzalloc(sizeof(struct debug_lockres), GFP_KERNEL);
- if (!dl) {
- mlog_errno(ret);
+ buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!buf)
goto bail;
- }
- dl->dl_len = PAGE_SIZE;
- dl->dl_buf = kmalloc(dl->dl_len, GFP_KERNEL);
- if (!dl->dl_buf) {
- mlog_errno(ret);
- goto bail;
- }
+ dl = __seq_open_private(file, &debug_lockres_ops, sizeof(*dl));
+ if (!dl)
+ goto bailfree;
- ret = seq_open(file, &debug_lockres_ops);
- if (ret) {
- mlog_errno(ret);
- goto bail;
- }
-
- seq = file->private_data;
- seq->private = dl;
+ dl->dl_len = PAGE_SIZE;
+ dl->dl_buf = buf;
dlm_grab(dlm);
dl->dl_ctxt = dlm;
return 0;
+
+bailfree:
+ kfree(buf);
bail:
- if (dl)
- kfree(dl->dl_buf);
- kfree(dl);
- return ret;
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
}
static int debug_lockres_release(struct inode *inode, struct file *file)
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 39efc5057a36..02d315fef432 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -839,7 +839,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
* to back off and try again. This gives heartbeat a chance
* to catch up.
*/
- if (!o2hb_check_node_heartbeating(query->node_idx)) {
+ if (!o2hb_check_node_heartbeating_no_sem(query->node_idx)) {
mlog(0, "node %u is not in our live map yet\n",
query->node_idx);
@@ -1923,12 +1923,11 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
goto bail;
}
- if (total_backoff >
- msecs_to_jiffies(DLM_JOIN_TIMEOUT_MSECS)) {
+ if (total_backoff > DLM_JOIN_TIMEOUT_MSECS) {
status = -ERESTARTSYS;
mlog(ML_NOTICE, "Timed out joining dlm domain "
"%s after %u msecs\n", dlm->name,
- jiffies_to_msecs(total_backoff));
+ total_backoff);
goto bail;
}
@@ -1976,24 +1975,22 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
dlm = kzalloc(sizeof(*dlm), GFP_KERNEL);
if (!dlm) {
- mlog_errno(-ENOMEM);
+ ret = -ENOMEM;
+ mlog_errno(ret);
goto leave;
}
dlm->name = kstrdup(domain, GFP_KERNEL);
if (dlm->name == NULL) {
- mlog_errno(-ENOMEM);
- kfree(dlm);
- dlm = NULL;
+ ret = -ENOMEM;
+ mlog_errno(ret);
goto leave;
}
dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES);
if (!dlm->lockres_hash) {
- mlog_errno(-ENOMEM);
- kfree(dlm->name);
- kfree(dlm);
- dlm = NULL;
+ ret = -ENOMEM;
+ mlog_errno(ret);
goto leave;
}
@@ -2003,11 +2000,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
dlm->master_hash = (struct hlist_head **)
dlm_alloc_pagevec(DLM_HASH_PAGES);
if (!dlm->master_hash) {
- mlog_errno(-ENOMEM);
- dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
- kfree(dlm->name);
- kfree(dlm);
- dlm = NULL;
+ ret = -ENOMEM;
+ mlog_errno(ret);
goto leave;
}
@@ -2018,14 +2012,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
dlm->node_num = o2nm_this_node();
ret = dlm_create_debugfs_subroot(dlm);
- if (ret < 0) {
- dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
- dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
- kfree(dlm->name);
- kfree(dlm);
- dlm = NULL;
+ if (ret < 0)
goto leave;
- }
spin_lock_init(&dlm->spinlock);
spin_lock_init(&dlm->master_lock);
@@ -2086,6 +2074,19 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
atomic_read(&dlm->dlm_refs.refcount));
leave:
+ if (ret < 0 && dlm) {
+ if (dlm->master_hash)
+ dlm_free_pagevec((void **)dlm->master_hash,
+ DLM_HASH_PAGES);
+
+ if (dlm->lockres_hash)
+ dlm_free_pagevec((void **)dlm->lockres_hash,
+ DLM_HASH_PAGES);
+
+ kfree(dlm->name);
+ kfree(dlm);
+ dlm = NULL;
+ }
return dlm;
}
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 82abf0cc9a12..215e41abf101 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -625,9 +625,6 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
return res;
error:
- if (res && res->lockname.name)
- kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
-
if (res)
kmem_cache_free(dlm_lockres_cache, res);
return NULL;
@@ -655,12 +652,9 @@ void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
clear_bit(bit, res->refmap);
}
-
-void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+static void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
- assert_spin_locked(&res->spinlock);
-
res->inflight_locks++;
mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
@@ -668,6 +662,13 @@ void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
__builtin_return_address(0));
}
+void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&res->spinlock);
+ __dlm_lockres_grab_inflight_ref(dlm, res);
+}
+
void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
@@ -894,10 +895,8 @@ lookup:
/* finally add the lockres to its hash bucket */
__dlm_insert_lockres(dlm, res);
- /* Grab inflight ref to pin the resource */
- spin_lock(&res->spinlock);
- dlm_lockres_grab_inflight_ref(dlm, res);
- spin_unlock(&res->spinlock);
+ /* since this lockres is new it doesn't not require the spinlock */
+ __dlm_lockres_grab_inflight_ref(dlm, res);
/* get an extra ref on the mle in case this is a BLOCK
* if so, the creator of the BLOCK may try to put the last
@@ -2037,6 +2036,10 @@ kill:
"and killing the other node now! This node is OK and can continue.\n");
__dlm_print_one_lock_resource(res);
spin_unlock(&res->spinlock);
+ spin_lock(&dlm->master_lock);
+ if (mle)
+ __dlm_put_mle(mle);
+ spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
*ret_data = (void *)res;
dlm_put(dlm);
@@ -2405,6 +2408,10 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
if (res->state & DLM_LOCK_RES_MIGRATING)
return 0;
+ /* delay migration when the lockres is in RECOCERING state */
+ if (res->state & DLM_LOCK_RES_RECOVERING)
+ return 0;
+
if (res->owner != dlm->node_num)
return 0;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 45067faf5695..3365839d2971 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1710,9 +1710,12 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
BUG();
} else
__dlm_lockres_grab_inflight_worker(dlm, res);
- } else /* put.. incase we are not the master */
+ spin_unlock(&res->spinlock);
+ } else {
+ /* put.. incase we are not the master */
+ spin_unlock(&res->spinlock);
dlm_lockres_put(res);
- spin_unlock(&res->spinlock);
+ }
}
spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 52cfe99ae056..21262f2b1654 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2892,37 +2892,24 @@ static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
{
- int ret;
struct ocfs2_dlm_seq_priv *priv;
- struct seq_file *seq;
struct ocfs2_super *osb;
- priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL);
+ priv = __seq_open_private(file, &ocfs2_dlm_seq_ops, sizeof(*priv));
if (!priv) {
- ret = -ENOMEM;
- mlog_errno(ret);
- goto out;
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
}
+
osb = inode->i_private;
ocfs2_get_dlm_debug(osb->osb_dlm_debug);
priv->p_dlm_debug = osb->osb_dlm_debug;
INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
- ret = seq_open(file, &ocfs2_dlm_seq_ops);
- if (ret) {
- kfree(priv);
- mlog_errno(ret);
- goto out;
- }
-
- seq = file->private_data;
- seq->private = priv;
-
ocfs2_add_lockres_tracking(&priv->p_iter_res,
priv->p_dlm_debug);
-out:
- return ret;
+ return 0;
}
static const struct file_operations ocfs2_dlm_debug_fops = {
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 2930e231f3f9..324dc93ac896 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -760,7 +760,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
struct address_space *mapping = inode->i_mapping;
struct page *page;
unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
- handle_t *handle = NULL;
+ handle_t *handle;
int ret = 0;
unsigned zero_from, zero_to, block_start, block_end;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -769,11 +769,17 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
BUG_ON(abs_from & (inode->i_blkbits - 1));
+ handle = ocfs2_zero_start_ordered_transaction(inode, di_bh);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+
page = find_or_create_page(mapping, index, GFP_NOFS);
if (!page) {
ret = -ENOMEM;
mlog_errno(ret);
- goto out;
+ goto out_commit_trans;
}
/* Get the offsets within the page that we want to zero */
@@ -805,15 +811,6 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
goto out_unlock;
}
- if (!handle) {
- handle = ocfs2_zero_start_ordered_transaction(inode,
- di_bh);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- handle = NULL;
- break;
- }
- }
/* must not update i_size! */
ret = block_commit_write(page, block_start + 1,
@@ -824,27 +821,29 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
ret = 0;
}
+ /*
+ * fs-writeback will release the dirty pages without page lock
+ * whose offset are over inode size, the release happens at
+ * block_write_full_page().
+ */
+ i_size_write(inode, abs_to);
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
+ di->i_size = cpu_to_le64((u64)i_size_read(inode));
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ di->i_mtime_nsec = di->i_ctime_nsec;
if (handle) {
- /*
- * fs-writeback will release the dirty pages without page lock
- * whose offset are over inode size, the release happens at
- * block_write_full_page().
- */
- i_size_write(inode, abs_to);
- inode->i_blocks = ocfs2_inode_sector_count(inode);
- di->i_size = cpu_to_le64((u64)i_size_read(inode));
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
- di->i_mtime_nsec = di->i_ctime_nsec;
ocfs2_journal_dirty(handle, di_bh);
ocfs2_update_inode_fsync_trans(handle, inode, 1);
- ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
}
out_unlock:
unlock_page(page);
page_cache_release(page);
+out_commit_trans:
+ if (handle)
+ ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out:
return ret;
}
@@ -1253,7 +1252,7 @@ bail:
brelse(bh);
/* Release quota pointers in case we acquired them */
- for (qtype = 0; qtype < MAXQUOTAS; qtype++)
+ for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++)
dqput(transfer_to[qtype]);
if (!status && attr->ia_valid & ATTR_MODE) {
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index a6c991c0fc98..a9b76de46047 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -162,7 +162,7 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
{
int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9;
- return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
+ return (blkcnt_t)OCFS2_I(inode)->ip_clusters << c_to_s_bits;
}
/* Validate that a bh contains a valid inode */
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 6f66b3751ace..53e6c40ed4c6 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -35,9 +35,8 @@
copy_to_user((typeof(a) __user *)b, &(a), sizeof(a))
/*
- * This call is void because we are already reporting an error that may
- * be -EFAULT. The error will be returned from the ioctl(2) call. It's
- * just a best-effort to tell userspace that this request caused the error.
+ * This is just a best-effort to tell userspace that this request
+ * caused the error.
*/
static inline void o2info_set_request_error(struct ocfs2_info_request *kreq,
struct ocfs2_info_request __user *req)
@@ -146,136 +145,105 @@ bail:
static int ocfs2_info_handle_blocksize(struct inode *inode,
struct ocfs2_info_request __user *req)
{
- int status = -EFAULT;
struct ocfs2_info_blocksize oib;
if (o2info_from_user(oib, req))
- goto bail;
+ return -EFAULT;
oib.ib_blocksize = inode->i_sb->s_blocksize;
o2info_set_request_filled(&oib.ib_req);
if (o2info_to_user(oib, req))
- goto bail;
-
- status = 0;
-bail:
- if (status)
- o2info_set_request_error(&oib.ib_req, req);
+ return -EFAULT;
- return status;
+ return 0;
}
static int ocfs2_info_handle_clustersize(struct inode *inode,
struct ocfs2_info_request __user *req)
{
- int status = -EFAULT;
struct ocfs2_info_clustersize oic;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (o2info_from_user(oic, req))
- goto bail;
+ return -EFAULT;
oic.ic_clustersize = osb->s_clustersize;
o2info_set_request_filled(&oic.ic_req);
if (o2info_to_user(oic, req))
- goto bail;
-
- status = 0;
-bail:
- if (status)
- o2info_set_request_error(&oic.ic_req, req);
+ return -EFAULT;
- return status;
+ return 0;
}
static int ocfs2_info_handle_maxslots(struct inode *inode,
struct ocfs2_info_request __user *req)
{
- int status = -EFAULT;
struct ocfs2_info_maxslots oim;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (o2info_from_user(oim, req))
- goto bail;
+ return -EFAULT;
oim.im_max_slots = osb->max_slots;
o2info_set_request_filled(&oim.im_req);
if (o2info_to_user(oim, req))
- goto bail;
+ return -EFAULT;
- status = 0;
-bail:
- if (status)
- o2info_set_request_error(&oim.im_req, req);
-
- return status;
+ return 0;
}
static int ocfs2_info_handle_label(struct inode *inode,
struct ocfs2_info_request __user *req)
{
- int status = -EFAULT;
struct ocfs2_info_label oil;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (o2info_from_user(oil, req))
- goto bail;
+ return -EFAULT;
memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
o2info_set_request_filled(&oil.il_req);
if (o2info_to_user(oil, req))
- goto bail;
+ return -EFAULT;
- status = 0;
-bail:
- if (status)
- o2info_set_request_error(&oil.il_req, req);
-
- return status;
+ return 0;
}
static int ocfs2_info_handle_uuid(struct inode *inode,
struct ocfs2_info_request __user *req)
{
- int status = -EFAULT;
struct ocfs2_info_uuid oiu;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (o2info_from_user(oiu, req))
- goto bail;
+ return -EFAULT;
memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
o2info_set_request_filled(&oiu.iu_req);
if (o2info_to_user(oiu, req))
- goto bail;
-
- status = 0;
-bail:
- if (status)
- o2info_set_request_error(&oiu.iu_req, req);
+ return -EFAULT;
- return status;
+ return 0;
}
static int ocfs2_info_handle_fs_features(struct inode *inode,
struct ocfs2_info_request __user *req)
{
- int status = -EFAULT;
struct ocfs2_info_fs_features oif;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (o2info_from_user(oif, req))
- goto bail;
+ return -EFAULT;
oif.if_compat_features = osb->s_feature_compat;
oif.if_incompat_features = osb->s_feature_incompat;
@@ -284,39 +252,28 @@ static int ocfs2_info_handle_fs_features(struct inode *inode,
o2info_set_request_filled(&oif.if_req);
if (o2info_to_user(oif, req))
- goto bail;
+ return -EFAULT;
- status = 0;
-bail:
- if (status)
- o2info_set_request_error(&oif.if_req, req);
-
- return status;
+ return 0;
}
static int ocfs2_info_handle_journal_size(struct inode *inode,
struct ocfs2_info_request __user *req)
{
- int status = -EFAULT;
struct ocfs2_info_journal_size oij;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (o2info_from_user(oij, req))
- goto bail;
+ return -EFAULT;
oij.ij_journal_size = i_size_read(osb->journal->j_inode);
o2info_set_request_filled(&oij.ij_req);
if (o2info_to_user(oij, req))
- goto bail;
+ return -EFAULT;
- status = 0;
-bail:
- if (status)
- o2info_set_request_error(&oij.ij_req, req);
-
- return status;
+ return 0;
}
static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
@@ -373,7 +330,7 @@ static int ocfs2_info_handle_freeinode(struct inode *inode,
u32 i;
u64 blkno = -1;
char namebuf[40];
- int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE;
+ int status, type = INODE_ALLOC_SYSTEM_INODE;
struct ocfs2_info_freeinode *oifi = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct inode *inode_alloc = NULL;
@@ -385,8 +342,10 @@ static int ocfs2_info_handle_freeinode(struct inode *inode,
goto out_err;
}
- if (o2info_from_user(*oifi, req))
- goto bail;
+ if (o2info_from_user(*oifi, req)) {
+ status = -EFAULT;
+ goto out_free;
+ }
oifi->ifi_slotnum = osb->max_slots;
@@ -424,14 +383,16 @@ static int ocfs2_info_handle_freeinode(struct inode *inode,
o2info_set_request_filled(&oifi->ifi_req);
- if (o2info_to_user(*oifi, req))
- goto bail;
+ if (o2info_to_user(*oifi, req)) {
+ status = -EFAULT;
+ goto out_free;
+ }
status = 0;
bail:
if (status)
o2info_set_request_error(&oifi->ifi_req, req);
-
+out_free:
kfree(oifi);
out_err:
return status;
@@ -658,7 +619,7 @@ static int ocfs2_info_handle_freefrag(struct inode *inode,
{
u64 blkno = -1;
char namebuf[40];
- int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE;
+ int status, type = GLOBAL_BITMAP_SYSTEM_INODE;
struct ocfs2_info_freefrag *oiff;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -671,8 +632,10 @@ static int ocfs2_info_handle_freefrag(struct inode *inode,
goto out_err;
}
- if (o2info_from_user(*oiff, req))
- goto bail;
+ if (o2info_from_user(*oiff, req)) {
+ status = -EFAULT;
+ goto out_free;
+ }
/*
* chunksize from userspace should be power of 2.
*/
@@ -711,14 +674,14 @@ static int ocfs2_info_handle_freefrag(struct inode *inode,
if (o2info_to_user(*oiff, req)) {
status = -EFAULT;
- goto bail;
+ goto out_free;
}
status = 0;
bail:
if (status)
o2info_set_request_error(&oiff->iff_req, req);
-
+out_free:
kfree(oiff);
out_err:
return status;
@@ -727,23 +690,17 @@ out_err:
static int ocfs2_info_handle_unknown(struct inode *inode,
struct ocfs2_info_request __user *req)
{
- int status = -EFAULT;
struct ocfs2_info_request oir;
if (o2info_from_user(oir, req))
- goto bail;
+ return -EFAULT;
o2info_clear_request_filled(&oir);
if (o2info_to_user(oir, req))
- goto bail;
+ return -EFAULT;
- status = 0;
-bail:
- if (status)
- o2info_set_request_error(&oir, req);
-
- return status;
+ return 0;
}
/*
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 599eb4c4c8be..74caffeeee1d 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -98,7 +98,7 @@ static int __ocfs2_move_extent(handle_t *handle,
el = path_leaf_el(path);
index = ocfs2_search_extent_list(el, cpos);
- if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+ if (index == -1) {
ocfs2_error(inode->i_sb,
"Inode %llu has an extent at cpos %u which can no "
"longer be found.\n",
@@ -404,7 +404,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode,
* 'vict_blkno' was out of the valid range.
*/
if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
- (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
+ (vict_blkno >= ((u64)le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
bits_per_unit))) {
ret = -EINVAL;
goto out;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 8add6f1030d7..b931e04e3388 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -158,7 +158,7 @@ bail_add:
* NOTE: This dentry already has ->d_op set from
* ocfs2_get_parent() and ocfs2_get_dentry()
*/
- if (ret)
+ if (!IS_ERR_OR_NULL(ret))
dentry = ret;
status = ocfs2_dentry_attach_lock(dentry, inode,
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index f266d67df3c6..1eae330193a6 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -17,6 +17,9 @@
#include "ocfs2.h"
+/* Number of quota types we support */
+#define OCFS2_MAXQUOTAS 2
+
/*
* In-memory structures
*/
@@ -39,7 +42,7 @@ struct ocfs2_recovery_chunk {
};
struct ocfs2_quota_recovery {
- struct list_head r_list[MAXQUOTAS]; /* List of chunks to recover */
+ struct list_head r_list[OCFS2_MAXQUOTAS]; /* List of chunks to recover */
};
/* In-memory structure with quota header information */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index b990a62cff50..c93d67220887 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -336,8 +336,8 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
int ocfs2_global_read_info(struct super_block *sb, int type)
{
struct inode *gqinode = NULL;
- unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
- GROUP_QUOTA_SYSTEM_INODE };
+ unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
+ GROUP_QUOTA_SYSTEM_INODE };
struct ocfs2_global_disk_dqinfo dinfo;
struct mem_dqinfo *info = sb_dqinfo(sb, type);
struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 2001862bf2b1..10b653930ee2 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -166,12 +166,12 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
/* Check whether we understand format of quota files */
static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
{
- unsigned int lmagics[MAXQUOTAS] = OCFS2_LOCAL_QMAGICS;
- unsigned int lversions[MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS;
- unsigned int gmagics[MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS;
- unsigned int gversions[MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS;
- unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
- GROUP_QUOTA_SYSTEM_INODE };
+ unsigned int lmagics[OCFS2_MAXQUOTAS] = OCFS2_LOCAL_QMAGICS;
+ unsigned int lversions[OCFS2_MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS;
+ unsigned int gmagics[OCFS2_MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS;
+ unsigned int gversions[OCFS2_MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS;
+ unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
+ GROUP_QUOTA_SYSTEM_INODE };
struct buffer_head *bh = NULL;
struct inode *linode = sb_dqopt(sb)->files[type];
struct inode *ginode = NULL;
@@ -336,7 +336,7 @@ void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec)
{
int type;
- for (type = 0; type < MAXQUOTAS; type++)
+ for (type = 0; type < OCFS2_MAXQUOTAS; type++)
free_recovery_list(&(rec->r_list[type]));
kfree(rec);
}
@@ -382,7 +382,7 @@ static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void)
rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS);
if (!rec)
return NULL;
- for (type = 0; type < MAXQUOTAS; type++)
+ for (type = 0; type < OCFS2_MAXQUOTAS; type++)
INIT_LIST_HEAD(&(rec->r_list[type]));
return rec;
}
@@ -392,10 +392,11 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
struct ocfs2_super *osb,
int slot_num)
{
- unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
- OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
- unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
- LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+ unsigned int feature[OCFS2_MAXQUOTAS] = {
+ OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+ unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+ LOCAL_GROUP_QUOTA_SYSTEM_INODE };
struct super_block *sb = osb->sb;
struct ocfs2_local_disk_dqinfo *ldinfo;
struct inode *lqinode;
@@ -412,7 +413,7 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
return ERR_PTR(-ENOMEM);
/* First init... */
- for (type = 0; type < MAXQUOTAS; type++) {
+ for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
continue;
/* At this point, journal of the slot is already replayed so
@@ -589,8 +590,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
struct ocfs2_quota_recovery *rec,
int slot_num)
{
- unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
- LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+ unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+ LOCAL_GROUP_QUOTA_SYSTEM_INODE };
struct super_block *sb = osb->sb;
struct ocfs2_local_disk_dqinfo *ldinfo;
struct buffer_head *bh;
@@ -604,7 +605,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
"slot %u\n", osb->dev_str, slot_num);
mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
- for (type = 0; type < MAXQUOTAS; type++) {
+ for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
if (list_empty(&(rec->r_list[type])))
continue;
trace_ocfs2_finish_quota_recovery(slot_num);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 636aab69ead5..d81f6e2a97f5 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3109,7 +3109,7 @@ static int ocfs2_clear_ext_refcount(handle_t *handle,
el = path_leaf_el(path);
index = ocfs2_search_extent_list(el, cpos);
- if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+ if (index == -1) {
ocfs2_error(sb,
"Inode %llu has an extent at cpos %u which can no "
"longer be found.\n",
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 1424c151cccc..a88b2a4fcc85 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -382,7 +382,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
trace_ocfs2_map_slot_buffers(bytes, si->si_blocks);
- si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks,
+ si->si_bh = kcalloc(si->si_blocks, sizeof(struct buffer_head *),
GFP_KERNEL);
if (!si->si_bh) {
status = -ENOMEM;
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 13a8537d8e8b..720aa389e0ea 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -591,7 +591,7 @@ static int ocfs2_control_release(struct inode *inode, struct file *file)
*/
ocfs2_control_this_node = -1;
running_proto.pv_major = 0;
- running_proto.pv_major = 0;
+ running_proto.pv_minor = 0;
}
out:
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index ddb662b32447..93c85bc745e1 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -899,11 +899,12 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
{
int type;
struct super_block *sb = osb->sb;
- unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
- OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+ unsigned int feature[OCFS2_MAXQUOTAS] = {
+ OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
int status = 0;
- for (type = 0; type < MAXQUOTAS; type++) {
+ for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
continue;
if (unsuspend)
@@ -927,17 +928,19 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
static int ocfs2_enable_quotas(struct ocfs2_super *osb)
{
- struct inode *inode[MAXQUOTAS] = { NULL, NULL };
+ struct inode *inode[OCFS2_MAXQUOTAS] = { NULL, NULL };
struct super_block *sb = osb->sb;
- unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
- OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
- unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+ unsigned int feature[OCFS2_MAXQUOTAS] = {
+ OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+ unsigned int ino[OCFS2_MAXQUOTAS] = {
+ LOCAL_USER_QUOTA_SYSTEM_INODE,
LOCAL_GROUP_QUOTA_SYSTEM_INODE };
int status;
int type;
sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE;
- for (type = 0; type < MAXQUOTAS; type++) {
+ for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
continue;
inode[type] = ocfs2_get_system_file_inode(osb, ino[type],
@@ -952,12 +955,12 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb)
goto out_quota_off;
}
- for (type = 0; type < MAXQUOTAS; type++)
+ for (type = 0; type < OCFS2_MAXQUOTAS; type++)
iput(inode[type]);
return 0;
out_quota_off:
ocfs2_disable_quotas(osb);
- for (type = 0; type < MAXQUOTAS; type++)
+ for (type = 0; type < OCFS2_MAXQUOTAS; type++)
iput(inode[type]);
mlog_errno(status);
return status;
@@ -972,7 +975,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
/* We mostly ignore errors in this function because there's not much
* we can do when we see them */
- for (type = 0; type < MAXQUOTAS; type++) {
+ for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
if (!sb_has_quota_loaded(sb, type))
continue;
/* Cancel periodic syncing before we grab dqonoff_mutex */
@@ -993,8 +996,9 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
/* Handle quota on quotactl */
static int ocfs2_quota_on(struct super_block *sb, int type, int format_id)
{
- unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
- OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+ unsigned int feature[OCFS2_MAXQUOTAS] = {
+ OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
return -EINVAL;
@@ -2532,6 +2536,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
kfree(osb->journal);
kfree(osb->local_alloc_copy);
kfree(osb->uuid_str);
+ kfree(osb->vol_label);
ocfs2_put_dlm_debug(osb->osb_dlm_debug);
memset(osb, 0, sizeof(struct ocfs2_super));
}
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index ec58c7659183..138321b0c6c2 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -306,9 +306,7 @@ static const struct super_operations omfs_sops = {
*/
static int omfs_get_imap(struct super_block *sb)
{
- int bitmap_size;
- int array_size;
- int count;
+ unsigned int bitmap_size, count, array_size;
struct omfs_sb_info *sbi = OMFS_SB(sb);
struct buffer_head *bh;
unsigned long **ptr;
@@ -321,7 +319,7 @@ static int omfs_get_imap(struct super_block *sb)
goto out;
sbi->s_imap_size = array_size;
- sbi->s_imap = kzalloc(array_size * sizeof(unsigned long *), GFP_KERNEL);
+ sbi->s_imap = kcalloc(array_size, sizeof(unsigned long *), GFP_KERNEL);
if (!sbi->s_imap)
goto nomem;
@@ -473,6 +471,12 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_sys_blocksize = be32_to_cpu(omfs_sb->s_sys_blocksize);
mutex_init(&sbi->s_bitmap_lock);
+ if (sbi->s_num_blocks > OMFS_MAX_BLOCKS) {
+ printk(KERN_ERR "omfs: sysblock number (%llx) is out of range\n",
+ (unsigned long long)sbi->s_num_blocks);
+ goto out_brelse_bh;
+ }
+
if (sbi->s_sys_blocksize > PAGE_SIZE) {
printk(KERN_ERR "omfs: sysblock size (%d) is out of range\n",
sbi->s_sys_blocksize);
diff --git a/fs/omfs/omfs_fs.h b/fs/omfs/omfs_fs.h
index ee5e4327de92..83a98330ed66 100644
--- a/fs/omfs/omfs_fs.h
+++ b/fs/omfs/omfs_fs.h
@@ -18,6 +18,7 @@
#define OMFS_XOR_COUNT 19
#define OMFS_MAX_BLOCK_SIZE 8192
#define OMFS_MAX_CLUSTER_SIZE 8
+#define OMFS_MAX_BLOCKS (1ul << 31)
struct omfs_super_block {
char s_fill1[256];
diff --git a/fs/open.c b/fs/open.c
index d6fd3acde134..de92c13b58be 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -823,8 +823,7 @@ struct file *dentry_open(const struct path *path, int flags,
f = get_empty_filp();
if (!IS_ERR(f)) {
f->f_flags = flags;
- f->f_path = *path;
- error = do_dentry_open(f, NULL, cred);
+ error = vfs_open(path, f, cred);
if (!error) {
/* from now on we need fput() to dispose of f */
error = open_check_o_direct(f);
@@ -841,6 +840,26 @@ struct file *dentry_open(const struct path *path, int flags,
}
EXPORT_SYMBOL(dentry_open);
+/**
+ * vfs_open - open the file at the given path
+ * @path: path to open
+ * @filp: newly allocated file with f_flag initialized
+ * @cred: credentials to use
+ */
+int vfs_open(const struct path *path, struct file *filp,
+ const struct cred *cred)
+{
+ struct inode *inode = path->dentry->d_inode;
+
+ if (inode->i_op->dentry_open)
+ return inode->i_op->dentry_open(path->dentry, filp, cred);
+ else {
+ filp->f_path = *path;
+ return do_dentry_open(filp, NULL, cred);
+ }
+}
+EXPORT_SYMBOL(vfs_open);
+
static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
{
int lookup_flags = 0;
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
new file mode 100644
index 000000000000..e60125976873
--- /dev/null
+++ b/fs/overlayfs/Kconfig
@@ -0,0 +1,10 @@
+config OVERLAYFS_FS
+ tristate "Overlay filesystem support"
+ help
+ An overlay filesystem combines two filesystems - an 'upper' filesystem
+ and a 'lower' filesystem. When a name exists in both filesystems, the
+ object in the 'upper' filesystem is visible while the object in the
+ 'lower' filesystem is either hidden or, in the case of directories,
+ merged with the 'upper' object.
+
+ For more information see Documentation/filesystems/overlayfs.txt
diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile
new file mode 100644
index 000000000000..8f91889480d0
--- /dev/null
+++ b/fs/overlayfs/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the overlay filesystem.
+#
+
+obj-$(CONFIG_OVERLAYFS_FS) += overlayfs.o
+
+overlayfs-objs := super.o inode.o dir.o readdir.o copy_up.o
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
new file mode 100644
index 000000000000..ea10a8719107
--- /dev/null
+++ b/fs/overlayfs/copy_up.c
@@ -0,0 +1,414 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/splice.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/uaccess.h>
+#include <linux/sched.h>
+#include <linux/namei.h>
+#include "overlayfs.h"
+
+#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
+
+int ovl_copy_xattr(struct dentry *old, struct dentry *new)
+{
+ ssize_t list_size, size;
+ char *buf, *name, *value;
+ int error;
+
+ if (!old->d_inode->i_op->getxattr ||
+ !new->d_inode->i_op->getxattr)
+ return 0;
+
+ list_size = vfs_listxattr(old, NULL, 0);
+ if (list_size <= 0) {
+ if (list_size == -EOPNOTSUPP)
+ return 0;
+ return list_size;
+ }
+
+ buf = kzalloc(list_size, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ error = -ENOMEM;
+ value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
+ if (!value)
+ goto out;
+
+ list_size = vfs_listxattr(old, buf, list_size);
+ if (list_size <= 0) {
+ error = list_size;
+ goto out_free_value;
+ }
+
+ for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
+ size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
+ if (size <= 0) {
+ error = size;
+ goto out_free_value;
+ }
+ error = vfs_setxattr(new, name, value, size, 0);
+ if (error)
+ goto out_free_value;
+ }
+
+out_free_value:
+ kfree(value);
+out:
+ kfree(buf);
+ return error;
+}
+
+static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
+{
+ struct file *old_file;
+ struct file *new_file;
+ loff_t old_pos = 0;
+ loff_t new_pos = 0;
+ int error = 0;
+
+ if (len == 0)
+ return 0;
+
+ old_file = ovl_path_open(old, O_RDONLY);
+ if (IS_ERR(old_file))
+ return PTR_ERR(old_file);
+
+ new_file = ovl_path_open(new, O_WRONLY);
+ if (IS_ERR(new_file)) {
+ error = PTR_ERR(new_file);
+ goto out_fput;
+ }
+
+ /* FIXME: copy up sparse files efficiently */
+ while (len) {
+ size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
+ long bytes;
+
+ if (len < this_len)
+ this_len = len;
+
+ if (signal_pending_state(TASK_KILLABLE, current)) {
+ error = -EINTR;
+ break;
+ }
+
+ bytes = do_splice_direct(old_file, &old_pos,
+ new_file, &new_pos,
+ this_len, SPLICE_F_MOVE);
+ if (bytes <= 0) {
+ error = bytes;
+ break;
+ }
+ WARN_ON(old_pos != new_pos);
+
+ len -= bytes;
+ }
+
+ fput(new_file);
+out_fput:
+ fput(old_file);
+ return error;
+}
+
+static char *ovl_read_symlink(struct dentry *realdentry)
+{
+ int res;
+ char *buf;
+ struct inode *inode = realdentry->d_inode;
+ mm_segment_t old_fs;
+
+ res = -EINVAL;
+ if (!inode->i_op->readlink)
+ goto err;
+
+ res = -ENOMEM;
+ buf = (char *) __get_free_page(GFP_KERNEL);
+ if (!buf)
+ goto err;
+
+ old_fs = get_fs();
+ set_fs(get_ds());
+ /* The cast to a user pointer is valid due to the set_fs() */
+ res = inode->i_op->readlink(realdentry,
+ (char __user *)buf, PAGE_SIZE - 1);
+ set_fs(old_fs);
+ if (res < 0) {
+ free_page((unsigned long) buf);
+ goto err;
+ }
+ buf[res] = '\0';
+
+ return buf;
+
+err:
+ return ERR_PTR(res);
+}
+
+static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
+{
+ struct iattr attr = {
+ .ia_valid =
+ ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
+ .ia_atime = stat->atime,
+ .ia_mtime = stat->mtime,
+ };
+
+ return notify_change(upperdentry, &attr, NULL);
+}
+
+int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
+{
+ int err = 0;
+
+ if (!S_ISLNK(stat->mode)) {
+ struct iattr attr = {
+ .ia_valid = ATTR_MODE,
+ .ia_mode = stat->mode,
+ };
+ err = notify_change(upperdentry, &attr, NULL);
+ }
+ if (!err) {
+ struct iattr attr = {
+ .ia_valid = ATTR_UID | ATTR_GID,
+ .ia_uid = stat->uid,
+ .ia_gid = stat->gid,
+ };
+ err = notify_change(upperdentry, &attr, NULL);
+ }
+ if (!err)
+ ovl_set_timestamps(upperdentry, stat);
+
+ return err;
+
+}
+
+static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
+ struct dentry *dentry, struct path *lowerpath,
+ struct kstat *stat, struct iattr *attr,
+ const char *link)
+{
+ struct inode *wdir = workdir->d_inode;
+ struct inode *udir = upperdir->d_inode;
+ struct dentry *newdentry = NULL;
+ struct dentry *upper = NULL;
+ umode_t mode = stat->mode;
+ int err;
+
+ newdentry = ovl_lookup_temp(workdir, dentry);
+ err = PTR_ERR(newdentry);
+ if (IS_ERR(newdentry))
+ goto out;
+
+ upper = lookup_one_len(dentry->d_name.name, upperdir,
+ dentry->d_name.len);
+ err = PTR_ERR(upper);
+ if (IS_ERR(upper))
+ goto out1;
+
+ /* Can't properly set mode on creation because of the umask */
+ stat->mode &= S_IFMT;
+ err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
+ stat->mode = mode;
+ if (err)
+ goto out2;
+
+ if (S_ISREG(stat->mode)) {
+ struct path upperpath;
+ ovl_path_upper(dentry, &upperpath);
+ BUG_ON(upperpath.dentry != NULL);
+ upperpath.dentry = newdentry;
+
+ err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
+ if (err)
+ goto out_cleanup;
+ }
+
+ err = ovl_copy_xattr(lowerpath->dentry, newdentry);
+ if (err)
+ goto out_cleanup;
+
+ mutex_lock(&newdentry->d_inode->i_mutex);
+ err = ovl_set_attr(newdentry, stat);
+ if (!err && attr)
+ err = notify_change(newdentry, attr, NULL);
+ mutex_unlock(&newdentry->d_inode->i_mutex);
+ if (err)
+ goto out_cleanup;
+
+ err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
+ if (err)
+ goto out_cleanup;
+
+ ovl_dentry_update(dentry, newdentry);
+ newdentry = NULL;
+
+ /*
+ * Non-directores become opaque when copied up.
+ */
+ if (!S_ISDIR(stat->mode))
+ ovl_dentry_set_opaque(dentry, true);
+out2:
+ dput(upper);
+out1:
+ dput(newdentry);
+out:
+ return err;
+
+out_cleanup:
+ ovl_cleanup(wdir, newdentry);
+ goto out;
+}
+
+/*
+ * Copy up a single dentry
+ *
+ * Directory renames only allowed on "pure upper" (already created on
+ * upper filesystem, never copied up). Directories which are on lower or
+ * are merged may not be renamed. For these -EXDEV is returned and
+ * userspace has to deal with it. This means, when copying up a
+ * directory we can rely on it and ancestors being stable.
+ *
+ * Non-directory renames start with copy up of source if necessary. The
+ * actual rename will only proceed once the copy up was successful. Copy
+ * up uses upper parent i_mutex for exclusion. Since rename can change
+ * d_parent it is possible that the copy up will lock the old parent. At
+ * that point the file will have already been copied up anyway.
+ */
+int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
+ struct path *lowerpath, struct kstat *stat,
+ struct iattr *attr)
+{
+ struct dentry *workdir = ovl_workdir(dentry);
+ int err;
+ struct kstat pstat;
+ struct path parentpath;
+ struct dentry *upperdir;
+ struct dentry *upperdentry;
+ const struct cred *old_cred;
+ struct cred *override_cred;
+ char *link = NULL;
+
+ ovl_path_upper(parent, &parentpath);
+ upperdir = parentpath.dentry;
+
+ err = vfs_getattr(&parentpath, &pstat);
+ if (err)
+ return err;
+
+ if (S_ISLNK(stat->mode)) {
+ link = ovl_read_symlink(lowerpath->dentry);
+ if (IS_ERR(link))
+ return PTR_ERR(link);
+ }
+
+ err = -ENOMEM;
+ override_cred = prepare_creds();
+ if (!override_cred)
+ goto out_free_link;
+
+ override_cred->fsuid = stat->uid;
+ override_cred->fsgid = stat->gid;
+ /*
+ * CAP_SYS_ADMIN for copying up extended attributes
+ * CAP_DAC_OVERRIDE for create
+ * CAP_FOWNER for chmod, timestamp update
+ * CAP_FSETID for chmod
+ * CAP_CHOWN for chown
+ * CAP_MKNOD for mknod
+ */
+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
+ cap_raise(override_cred->cap_effective, CAP_FSETID);
+ cap_raise(override_cred->cap_effective, CAP_CHOWN);
+ cap_raise(override_cred->cap_effective, CAP_MKNOD);
+ old_cred = override_creds(override_cred);
+
+ err = -EIO;
+ if (lock_rename(workdir, upperdir) != NULL) {
+ pr_err("overlayfs: failed to lock workdir+upperdir\n");
+ goto out_unlock;
+ }
+ upperdentry = ovl_dentry_upper(dentry);
+ if (upperdentry) {
+ unlock_rename(workdir, upperdir);
+ err = 0;
+ /* Raced with another copy-up? Do the setattr here */
+ if (attr) {
+ mutex_lock(&upperdentry->d_inode->i_mutex);
+ err = notify_change(upperdentry, attr, NULL);
+ mutex_unlock(&upperdentry->d_inode->i_mutex);
+ }
+ goto out_put_cred;
+ }
+
+ err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
+ stat, attr, link);
+ if (!err) {
+ /* Restore timestamps on parent (best effort) */
+ ovl_set_timestamps(upperdir, &pstat);
+ }
+out_unlock:
+ unlock_rename(workdir, upperdir);
+out_put_cred:
+ revert_creds(old_cred);
+ put_cred(override_cred);
+
+out_free_link:
+ if (link)
+ free_page((unsigned long) link);
+
+ return err;
+}
+
+int ovl_copy_up(struct dentry *dentry)
+{
+ int err;
+
+ err = 0;
+ while (!err) {
+ struct dentry *next;
+ struct dentry *parent;
+ struct path lowerpath;
+ struct kstat stat;
+ enum ovl_path_type type = ovl_path_type(dentry);
+
+ if (type != OVL_PATH_LOWER)
+ break;
+
+ next = dget(dentry);
+ /* find the topmost dentry not yet copied up */
+ for (;;) {
+ parent = dget_parent(next);
+
+ type = ovl_path_type(parent);
+ if (type != OVL_PATH_LOWER)
+ break;
+
+ dput(next);
+ next = parent;
+ }
+
+ ovl_path_lower(next, &lowerpath);
+ err = vfs_getattr(&lowerpath, &stat);
+ if (!err)
+ err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);
+
+ dput(parent);
+ dput(next);
+ }
+
+ return err;
+}
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
new file mode 100644
index 000000000000..15cd91ad9940
--- /dev/null
+++ b/fs/overlayfs/dir.c
@@ -0,0 +1,921 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/cred.h>
+#include "overlayfs.h"
+
+void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
+{
+ int err;
+
+ dget(wdentry);
+ if (S_ISDIR(wdentry->d_inode->i_mode))
+ err = ovl_do_rmdir(wdir, wdentry);
+ else
+ err = ovl_do_unlink(wdir, wdentry);
+ dput(wdentry);
+
+ if (err) {
+ pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
+ wdentry, err);
+ }
+}
+
+struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
+{
+ struct dentry *temp;
+ char name[20];
+
+ snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);
+
+ temp = lookup_one_len(name, workdir, strlen(name));
+ if (!IS_ERR(temp) && temp->d_inode) {
+ pr_err("overlayfs: workdir/%s already exists\n", name);
+ dput(temp);
+ temp = ERR_PTR(-EIO);
+ }
+
+ return temp;
+}
+
+/* caller holds i_mutex on workdir */
+static struct dentry *ovl_whiteout(struct dentry *workdir,
+ struct dentry *dentry)
+{
+ int err;
+ struct dentry *whiteout;
+ struct inode *wdir = workdir->d_inode;
+
+ whiteout = ovl_lookup_temp(workdir, dentry);
+ if (IS_ERR(whiteout))
+ return whiteout;
+
+ err = ovl_do_whiteout(wdir, whiteout);
+ if (err) {
+ dput(whiteout);
+ whiteout = ERR_PTR(err);
+ }
+
+ return whiteout;
+}
+
+int ovl_create_real(struct inode *dir, struct dentry *newdentry,
+ struct kstat *stat, const char *link,
+ struct dentry *hardlink, bool debug)
+{
+ int err;
+
+ if (newdentry->d_inode)
+ return -ESTALE;
+
+ if (hardlink) {
+ err = ovl_do_link(hardlink, dir, newdentry, debug);
+ } else {
+ switch (stat->mode & S_IFMT) {
+ case S_IFREG:
+ err = ovl_do_create(dir, newdentry, stat->mode, debug);
+ break;
+
+ case S_IFDIR:
+ err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
+ break;
+
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFIFO:
+ case S_IFSOCK:
+ err = ovl_do_mknod(dir, newdentry,
+ stat->mode, stat->rdev, debug);
+ break;
+
+ case S_IFLNK:
+ err = ovl_do_symlink(dir, newdentry, link, debug);
+ break;
+
+ default:
+ err = -EPERM;
+ }
+ }
+ if (!err && WARN_ON(!newdentry->d_inode)) {
+ /*
+ * Not quite sure if non-instantiated dentry is legal or not.
+ * VFS doesn't seem to care so check and warn here.
+ */
+ err = -ENOENT;
+ }
+ return err;
+}
+
+static int ovl_set_opaque(struct dentry *upperdentry)
+{
+ return ovl_do_setxattr(upperdentry, ovl_opaque_xattr, "y", 1, 0);
+}
+
+static void ovl_remove_opaque(struct dentry *upperdentry)
+{
+ int err;
+
+ err = ovl_do_removexattr(upperdentry, ovl_opaque_xattr);
+ if (err) {
+ pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
+ upperdentry->d_name.name, err);
+ }
+}
+
+static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ int err;
+ enum ovl_path_type type;
+ struct path realpath;
+
+ type = ovl_path_real(dentry, &realpath);
+ err = vfs_getattr(&realpath, stat);
+ if (err)
+ return err;
+
+ stat->dev = dentry->d_sb->s_dev;
+ stat->ino = dentry->d_inode->i_ino;
+
+ /*
+ * It's probably not worth it to count subdirs to get the
+ * correct link count. nlink=1 seems to pacify 'find' and
+ * other utilities.
+ */
+ if (type == OVL_PATH_MERGE)
+ stat->nlink = 1;
+
+ return 0;
+}
+
+static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
+ struct kstat *stat, const char *link,
+ struct dentry *hardlink)
+{
+ struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+ struct inode *udir = upperdir->d_inode;
+ struct dentry *newdentry;
+ int err;
+
+ mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
+ newdentry = lookup_one_len(dentry->d_name.name, upperdir,
+ dentry->d_name.len);
+ err = PTR_ERR(newdentry);
+ if (IS_ERR(newdentry))
+ goto out_unlock;
+ err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
+ if (err)
+ goto out_dput;
+
+ ovl_dentry_version_inc(dentry->d_parent);
+ ovl_dentry_update(dentry, newdentry);
+ ovl_copyattr(newdentry->d_inode, inode);
+ d_instantiate(dentry, inode);
+ newdentry = NULL;
+out_dput:
+ dput(newdentry);
+out_unlock:
+ mutex_unlock(&udir->i_mutex);
+ return err;
+}
+
+static int ovl_lock_rename_workdir(struct dentry *workdir,
+ struct dentry *upperdir)
+{
+ /* Workdir should not be the same as upperdir */
+ if (workdir == upperdir)
+ goto err;
+
+ /* Workdir should not be subdir of upperdir and vice versa */
+ if (lock_rename(workdir, upperdir) != NULL)
+ goto err_unlock;
+
+ return 0;
+
+err_unlock:
+ unlock_rename(workdir, upperdir);
+err:
+ pr_err("overlayfs: failed to lock workdir+upperdir\n");
+ return -EIO;
+}
+
+static struct dentry *ovl_clear_empty(struct dentry *dentry,
+ struct list_head *list)
+{
+ struct dentry *workdir = ovl_workdir(dentry);
+ struct inode *wdir = workdir->d_inode;
+ struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+ struct inode *udir = upperdir->d_inode;
+ struct path upperpath;
+ struct dentry *upper;
+ struct dentry *opaquedir;
+ struct kstat stat;
+ int err;
+
+ err = ovl_lock_rename_workdir(workdir, upperdir);
+ if (err)
+ goto out;
+
+ ovl_path_upper(dentry, &upperpath);
+ err = vfs_getattr(&upperpath, &stat);
+ if (err)
+ goto out_unlock;
+
+ err = -ESTALE;
+ if (!S_ISDIR(stat.mode))
+ goto out_unlock;
+ upper = upperpath.dentry;
+ if (upper->d_parent->d_inode != udir)
+ goto out_unlock;
+
+ opaquedir = ovl_lookup_temp(workdir, dentry);
+ err = PTR_ERR(opaquedir);
+ if (IS_ERR(opaquedir))
+ goto out_unlock;
+
+ err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
+ if (err)
+ goto out_dput;
+
+ err = ovl_copy_xattr(upper, opaquedir);
+ if (err)
+ goto out_cleanup;
+
+ err = ovl_set_opaque(opaquedir);
+ if (err)
+ goto out_cleanup;
+
+ mutex_lock(&opaquedir->d_inode->i_mutex);
+ err = ovl_set_attr(opaquedir, &stat);
+ mutex_unlock(&opaquedir->d_inode->i_mutex);
+ if (err)
+ goto out_cleanup;
+
+ err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
+ if (err)
+ goto out_cleanup;
+
+ ovl_cleanup_whiteouts(upper, list);
+ ovl_cleanup(wdir, upper);
+ unlock_rename(workdir, upperdir);
+
+ /* dentry's upper doesn't match now, get rid of it */
+ d_drop(dentry);
+
+ return opaquedir;
+
+out_cleanup:
+ ovl_cleanup(wdir, opaquedir);
+out_dput:
+ dput(opaquedir);
+out_unlock:
+ unlock_rename(workdir, upperdir);
+out:
+ return ERR_PTR(err);
+}
+
+static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry,
+ enum ovl_path_type type)
+{
+ int err;
+ struct dentry *ret = NULL;
+ LIST_HEAD(list);
+
+ err = ovl_check_empty_dir(dentry, &list);
+ if (err)
+ ret = ERR_PTR(err);
+ else if (type == OVL_PATH_MERGE)
+ ret = ovl_clear_empty(dentry, &list);
+
+ ovl_cache_free(&list);
+
+ return ret;
+}
+
+static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
+ struct kstat *stat, const char *link,
+ struct dentry *hardlink)
+{
+ struct dentry *workdir = ovl_workdir(dentry);
+ struct inode *wdir = workdir->d_inode;
+ struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+ struct inode *udir = upperdir->d_inode;
+ struct dentry *upper;
+ struct dentry *newdentry;
+ int err;
+
+ err = ovl_lock_rename_workdir(workdir, upperdir);
+ if (err)
+ goto out;
+
+ newdentry = ovl_lookup_temp(workdir, dentry);
+ err = PTR_ERR(newdentry);
+ if (IS_ERR(newdentry))
+ goto out_unlock;
+
+ upper = lookup_one_len(dentry->d_name.name, upperdir,
+ dentry->d_name.len);
+ err = PTR_ERR(upper);
+ if (IS_ERR(upper))
+ goto out_dput;
+
+ err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
+ if (err)
+ goto out_dput2;
+
+ if (S_ISDIR(stat->mode)) {
+ err = ovl_set_opaque(newdentry);
+ if (err)
+ goto out_cleanup;
+
+ err = ovl_do_rename(wdir, newdentry, udir, upper,
+ RENAME_EXCHANGE);
+ if (err)
+ goto out_cleanup;
+
+ ovl_cleanup(wdir, upper);
+ } else {
+ err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
+ if (err)
+ goto out_cleanup;
+ }
+ ovl_dentry_version_inc(dentry->d_parent);
+ ovl_dentry_update(dentry, newdentry);
+ ovl_copyattr(newdentry->d_inode, inode);
+ d_instantiate(dentry, inode);
+ newdentry = NULL;
+out_dput2:
+ dput(upper);
+out_dput:
+ dput(newdentry);
+out_unlock:
+ unlock_rename(workdir, upperdir);
+out:
+ return err;
+
+out_cleanup:
+ ovl_cleanup(wdir, newdentry);
+ goto out_dput2;
+}
+
+static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
+ const char *link, struct dentry *hardlink)
+{
+ int err;
+ struct inode *inode;
+ struct kstat stat = {
+ .mode = mode,
+ .rdev = rdev,
+ };
+
+ err = -ENOMEM;
+ inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
+ if (!inode)
+ goto out;
+
+ err = ovl_copy_up(dentry->d_parent);
+ if (err)
+ goto out_iput;
+
+ if (!ovl_dentry_is_opaque(dentry)) {
+ err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
+ } else {
+ const struct cred *old_cred;
+ struct cred *override_cred;
+
+ err = -ENOMEM;
+ override_cred = prepare_creds();
+ if (!override_cred)
+ goto out_iput;
+
+ /*
+ * CAP_SYS_ADMIN for setting opaque xattr
+ * CAP_DAC_OVERRIDE for create in workdir, rename
+ * CAP_FOWNER for removing whiteout from sticky dir
+ */
+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
+ old_cred = override_creds(override_cred);
+
+ err = ovl_create_over_whiteout(dentry, inode, &stat, link,
+ hardlink);
+
+ revert_creds(old_cred);
+ put_cred(override_cred);
+ }
+
+ if (!err)
+ inode = NULL;
+out_iput:
+ iput(inode);
+out:
+ return err;
+}
+
+static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
+ const char *link)
+{
+ int err;
+
+ err = ovl_want_write(dentry);
+ if (!err) {
+ err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
+ ovl_drop_write(dentry);
+ }
+
+ return err;
+}
+
+static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ bool excl)
+{
+ return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
+}
+
+static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
+}
+
+static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+ dev_t rdev)
+{
+ /* Don't allow creation of "whiteout" on overlay */
+ if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
+ return -EPERM;
+
+ return ovl_create_object(dentry, mode, rdev, NULL);
+}
+
+static int ovl_symlink(struct inode *dir, struct dentry *dentry,
+ const char *link)
+{
+ return ovl_create_object(dentry, S_IFLNK, 0, link);
+}
+
+static int ovl_link(struct dentry *old, struct inode *newdir,
+ struct dentry *new)
+{
+ int err;
+ struct dentry *upper;
+
+ err = ovl_want_write(old);
+ if (err)
+ goto out;
+
+ err = ovl_copy_up(old);
+ if (err)
+ goto out_drop_write;
+
+ upper = ovl_dentry_upper(old);
+ err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);
+
+out_drop_write:
+ ovl_drop_write(old);
+out:
+ return err;
+}
+
+static int ovl_remove_and_whiteout(struct dentry *dentry,
+ enum ovl_path_type type, bool is_dir)
+{
+ struct dentry *workdir = ovl_workdir(dentry);
+ struct inode *wdir = workdir->d_inode;
+ struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+ struct inode *udir = upperdir->d_inode;
+ struct dentry *whiteout;
+ struct dentry *upper;
+ struct dentry *opaquedir = NULL;
+ int err;
+
+ if (is_dir) {
+ opaquedir = ovl_check_empty_and_clear(dentry, type);
+ err = PTR_ERR(opaquedir);
+ if (IS_ERR(opaquedir))
+ goto out;
+ }
+
+ err = ovl_lock_rename_workdir(workdir, upperdir);
+ if (err)
+ goto out_dput;
+
+ whiteout = ovl_whiteout(workdir, dentry);
+ err = PTR_ERR(whiteout);
+ if (IS_ERR(whiteout))
+ goto out_unlock;
+
+ if (type == OVL_PATH_LOWER) {
+ upper = lookup_one_len(dentry->d_name.name, upperdir,
+ dentry->d_name.len);
+ err = PTR_ERR(upper);
+ if (IS_ERR(upper))
+ goto kill_whiteout;
+
+ err = ovl_do_rename(wdir, whiteout, udir, upper, 0);
+ dput(upper);
+ if (err)
+ goto kill_whiteout;
+ } else {
+ int flags = 0;
+
+ upper = ovl_dentry_upper(dentry);
+ if (opaquedir)
+ upper = opaquedir;
+ err = -ESTALE;
+ if (upper->d_parent != upperdir)
+ goto kill_whiteout;
+
+ if (is_dir)
+ flags |= RENAME_EXCHANGE;
+
+ err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
+ if (err)
+ goto kill_whiteout;
+
+ if (is_dir)
+ ovl_cleanup(wdir, upper);
+ }
+ ovl_dentry_version_inc(dentry->d_parent);
+out_d_drop:
+ d_drop(dentry);
+ dput(whiteout);
+out_unlock:
+ unlock_rename(workdir, upperdir);
+out_dput:
+ dput(opaquedir);
+out:
+ return err;
+
+kill_whiteout:
+ ovl_cleanup(wdir, whiteout);
+ goto out_d_drop;
+}
+
+static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
+{
+ struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+ struct inode *dir = upperdir->d_inode;
+ struct dentry *upper = ovl_dentry_upper(dentry);
+ int err;
+
+ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ err = -ESTALE;
+ if (upper->d_parent == upperdir) {
+ /* Don't let d_delete() think it can reset d_inode */
+ dget(upper);
+ if (is_dir)
+ err = vfs_rmdir(dir, upper);
+ else
+ err = vfs_unlink(dir, upper, NULL);
+ dput(upper);
+ ovl_dentry_version_inc(dentry->d_parent);
+ }
+
+ /*
+ * Keeping this dentry hashed would mean having to release
+ * upperpath/lowerpath, which could only be done if we are the
+ * sole user of this dentry. Too tricky... Just unhash for
+ * now.
+ */
+ d_drop(dentry);
+ mutex_unlock(&dir->i_mutex);
+
+ return err;
+}
+
+static inline int ovl_check_sticky(struct dentry *dentry)
+{
+ struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
+ struct inode *inode = ovl_dentry_real(dentry)->d_inode;
+
+ if (check_sticky(dir, inode))
+ return -EPERM;
+
+ return 0;
+}
+
+static int ovl_do_remove(struct dentry *dentry, bool is_dir)
+{
+ enum ovl_path_type type;
+ int err;
+
+ err = ovl_check_sticky(dentry);
+ if (err)
+ goto out;
+
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
+ err = ovl_copy_up(dentry->d_parent);
+ if (err)
+ goto out_drop_write;
+
+ type = ovl_path_type(dentry);
+ if (type == OVL_PATH_PURE_UPPER) {
+ err = ovl_remove_upper(dentry, is_dir);
+ } else {
+ const struct cred *old_cred;
+ struct cred *override_cred;
+
+ err = -ENOMEM;
+ override_cred = prepare_creds();
+ if (!override_cred)
+ goto out_drop_write;
+
+ /*
+ * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
+ * CAP_DAC_OVERRIDE for create in workdir, rename
+ * CAP_FOWNER for removing whiteout from sticky dir
+ * CAP_FSETID for chmod of opaque dir
+ * CAP_CHOWN for chown of opaque dir
+ */
+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
+ cap_raise(override_cred->cap_effective, CAP_FSETID);
+ cap_raise(override_cred->cap_effective, CAP_CHOWN);
+ old_cred = override_creds(override_cred);
+
+ err = ovl_remove_and_whiteout(dentry, type, is_dir);
+
+ revert_creds(old_cred);
+ put_cred(override_cred);
+ }
+out_drop_write:
+ ovl_drop_write(dentry);
+out:
+ return err;
+}
+
+static int ovl_unlink(struct inode *dir, struct dentry *dentry)
+{
+ return ovl_do_remove(dentry, false);
+}
+
+static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ return ovl_do_remove(dentry, true);
+}
+
+static int ovl_rename2(struct inode *olddir, struct dentry *old,
+ struct inode *newdir, struct dentry *new,
+ unsigned int flags)
+{
+ int err;
+ enum ovl_path_type old_type;
+ enum ovl_path_type new_type;
+ struct dentry *old_upperdir;
+ struct dentry *new_upperdir;
+ struct dentry *olddentry;
+ struct dentry *newdentry;
+ struct dentry *trap;
+ bool old_opaque;
+ bool new_opaque;
+ bool new_create = false;
+ bool cleanup_whiteout = false;
+ bool overwrite = !(flags & RENAME_EXCHANGE);
+ bool is_dir = S_ISDIR(old->d_inode->i_mode);
+ bool new_is_dir = false;
+ struct dentry *opaquedir = NULL;
+ const struct cred *old_cred = NULL;
+ struct cred *override_cred = NULL;
+
+ err = -EINVAL;
+ if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
+ goto out;
+
+ flags &= ~RENAME_NOREPLACE;
+
+ err = ovl_check_sticky(old);
+ if (err)
+ goto out;
+
+ /* Don't copy up directory trees */
+ old_type = ovl_path_type(old);
+ err = -EXDEV;
+ if ((old_type == OVL_PATH_LOWER || old_type == OVL_PATH_MERGE) && is_dir)
+ goto out;
+
+ if (new->d_inode) {
+ err = ovl_check_sticky(new);
+ if (err)
+ goto out;
+
+ if (S_ISDIR(new->d_inode->i_mode))
+ new_is_dir = true;
+
+ new_type = ovl_path_type(new);
+ err = -EXDEV;
+ if (!overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir)
+ goto out;
+
+ err = 0;
+ if (new_type == OVL_PATH_LOWER && old_type == OVL_PATH_LOWER) {
+ if (ovl_dentry_lower(old)->d_inode ==
+ ovl_dentry_lower(new)->d_inode)
+ goto out;
+ }
+ if (new_type != OVL_PATH_LOWER && old_type != OVL_PATH_LOWER) {
+ if (ovl_dentry_upper(old)->d_inode ==
+ ovl_dentry_upper(new)->d_inode)
+ goto out;
+ }
+ } else {
+ if (ovl_dentry_is_opaque(new))
+ new_type = OVL_PATH_UPPER;
+ else
+ new_type = OVL_PATH_PURE_UPPER;
+ }
+
+ err = ovl_want_write(old);
+ if (err)
+ goto out;
+
+ err = ovl_copy_up(old);
+ if (err)
+ goto out_drop_write;
+
+ err = ovl_copy_up(new->d_parent);
+ if (err)
+ goto out_drop_write;
+ if (!overwrite) {
+ err = ovl_copy_up(new);
+ if (err)
+ goto out_drop_write;
+ }
+
+ old_opaque = old_type != OVL_PATH_PURE_UPPER;
+ new_opaque = new_type != OVL_PATH_PURE_UPPER;
+
+ if (old_opaque || new_opaque) {
+ err = -ENOMEM;
+ override_cred = prepare_creds();
+ if (!override_cred)
+ goto out_drop_write;
+
+ /*
+ * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
+ * CAP_DAC_OVERRIDE for create in workdir
+ * CAP_FOWNER for removing whiteout from sticky dir
+ * CAP_FSETID for chmod of opaque dir
+ * CAP_CHOWN for chown of opaque dir
+ */
+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
+ cap_raise(override_cred->cap_effective, CAP_FSETID);
+ cap_raise(override_cred->cap_effective, CAP_CHOWN);
+ old_cred = override_creds(override_cred);
+ }
+
+ if (overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir) {
+ opaquedir = ovl_check_empty_and_clear(new, new_type);
+ err = PTR_ERR(opaquedir);
+ if (IS_ERR(opaquedir)) {
+ opaquedir = NULL;
+ goto out_revert_creds;
+ }
+ }
+
+ if (overwrite) {
+ if (old_opaque) {
+ if (new->d_inode || !new_opaque) {
+ /* Whiteout source */
+ flags |= RENAME_WHITEOUT;
+ } else {
+ /* Switch whiteouts */
+ flags |= RENAME_EXCHANGE;
+ }
+ } else if (is_dir && !new->d_inode && new_opaque) {
+ flags |= RENAME_EXCHANGE;
+ cleanup_whiteout = true;
+ }
+ }
+
+ old_upperdir = ovl_dentry_upper(old->d_parent);
+ new_upperdir = ovl_dentry_upper(new->d_parent);
+
+ trap = lock_rename(new_upperdir, old_upperdir);
+
+ olddentry = ovl_dentry_upper(old);
+ newdentry = ovl_dentry_upper(new);
+ if (newdentry) {
+ if (opaquedir) {
+ newdentry = opaquedir;
+ opaquedir = NULL;
+ } else {
+ dget(newdentry);
+ }
+ } else {
+ new_create = true;
+ newdentry = lookup_one_len(new->d_name.name, new_upperdir,
+ new->d_name.len);
+ err = PTR_ERR(newdentry);
+ if (IS_ERR(newdentry))
+ goto out_unlock;
+ }
+
+ err = -ESTALE;
+ if (olddentry->d_parent != old_upperdir)
+ goto out_dput;
+ if (newdentry->d_parent != new_upperdir)
+ goto out_dput;
+ if (olddentry == trap)
+ goto out_dput;
+ if (newdentry == trap)
+ goto out_dput;
+
+ if (is_dir && !old_opaque && new_opaque) {
+ err = ovl_set_opaque(olddentry);
+ if (err)
+ goto out_dput;
+ }
+ if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
+ err = ovl_set_opaque(newdentry);
+ if (err)
+ goto out_dput;
+ }
+
+ if (old_opaque || new_opaque) {
+ err = ovl_do_rename(old_upperdir->d_inode, olddentry,
+ new_upperdir->d_inode, newdentry,
+ flags);
+ } else {
+ /* No debug for the plain case */
+ BUG_ON(flags & ~RENAME_EXCHANGE);
+ err = vfs_rename(old_upperdir->d_inode, olddentry,
+ new_upperdir->d_inode, newdentry,
+ NULL, flags);
+ }
+
+ if (err) {
+ if (is_dir && !old_opaque && new_opaque)
+ ovl_remove_opaque(olddentry);
+ if (!overwrite && new_is_dir && old_opaque && !new_opaque)
+ ovl_remove_opaque(newdentry);
+ goto out_dput;
+ }
+
+ if (is_dir && old_opaque && !new_opaque)
+ ovl_remove_opaque(olddentry);
+ if (!overwrite && new_is_dir && !old_opaque && new_opaque)
+ ovl_remove_opaque(newdentry);
+
+ if (old_opaque != new_opaque) {
+ ovl_dentry_set_opaque(old, new_opaque);
+ if (!overwrite)
+ ovl_dentry_set_opaque(new, old_opaque);
+ }
+
+ if (cleanup_whiteout)
+ ovl_cleanup(old_upperdir->d_inode, newdentry);
+
+ ovl_dentry_version_inc(old->d_parent);
+ ovl_dentry_version_inc(new->d_parent);
+
+out_dput:
+ dput(newdentry);
+out_unlock:
+ unlock_rename(new_upperdir, old_upperdir);
+out_revert_creds:
+ if (old_opaque || new_opaque) {
+ revert_creds(old_cred);
+ put_cred(override_cred);
+ }
+out_drop_write:
+ ovl_drop_write(old);
+out:
+ dput(opaquedir);
+ return err;
+}
+
+const struct inode_operations ovl_dir_inode_operations = {
+ .lookup = ovl_lookup,
+ .mkdir = ovl_mkdir,
+ .symlink = ovl_symlink,
+ .unlink = ovl_unlink,
+ .rmdir = ovl_rmdir,
+ .rename2 = ovl_rename2,
+ .link = ovl_link,
+ .setattr = ovl_setattr,
+ .create = ovl_create,
+ .mknod = ovl_mknod,
+ .permission = ovl_permission,
+ .getattr = ovl_dir_getattr,
+ .setxattr = ovl_setxattr,
+ .getxattr = ovl_getxattr,
+ .listxattr = ovl_listxattr,
+ .removexattr = ovl_removexattr,
+};
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
new file mode 100644
index 000000000000..af2d18c9fcee
--- /dev/null
+++ b/fs/overlayfs/inode.c
@@ -0,0 +1,425 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/xattr.h>
+#include "overlayfs.h"
+
+static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
+ bool no_data)
+{
+ int err;
+ struct dentry *parent;
+ struct kstat stat;
+ struct path lowerpath;
+
+ parent = dget_parent(dentry);
+ err = ovl_copy_up(parent);
+ if (err)
+ goto out_dput_parent;
+
+ ovl_path_lower(dentry, &lowerpath);
+ err = vfs_getattr(&lowerpath, &stat);
+ if (err)
+ goto out_dput_parent;
+
+ if (no_data)
+ stat.size = 0;
+
+ err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
+
+out_dput_parent:
+ dput(parent);
+ return err;
+}
+
+int ovl_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ int err;
+ struct dentry *upperdentry;
+
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
+ upperdentry = ovl_dentry_upper(dentry);
+ if (upperdentry) {
+ mutex_lock(&upperdentry->d_inode->i_mutex);
+ err = notify_change(upperdentry, attr, NULL);
+ mutex_unlock(&upperdentry->d_inode->i_mutex);
+ } else {
+ err = ovl_copy_up_last(dentry, attr, false);
+ }
+ ovl_drop_write(dentry);
+out:
+ return err;
+}
+
+static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct path realpath;
+
+ ovl_path_real(dentry, &realpath);
+ return vfs_getattr(&realpath, stat);
+}
+
+int ovl_permission(struct inode *inode, int mask)
+{
+ struct ovl_entry *oe;
+ struct dentry *alias = NULL;
+ struct inode *realinode;
+ struct dentry *realdentry;
+ bool is_upper;
+ int err;
+
+ if (S_ISDIR(inode->i_mode)) {
+ oe = inode->i_private;
+ } else if (mask & MAY_NOT_BLOCK) {
+ return -ECHILD;
+ } else {
+ /*
+ * For non-directories find an alias and get the info
+ * from there.
+ */
+ alias = d_find_any_alias(inode);
+ if (WARN_ON(!alias))
+ return -ENOENT;
+
+ oe = alias->d_fsdata;
+ }
+
+ realdentry = ovl_entry_real(oe, &is_upper);
+
+ /* Careful in RCU walk mode */
+ realinode = ACCESS_ONCE(realdentry->d_inode);
+ if (!realinode) {
+ WARN_ON(!(mask & MAY_NOT_BLOCK));
+ err = -ENOENT;
+ goto out_dput;
+ }
+
+ if (mask & MAY_WRITE) {
+ umode_t mode = realinode->i_mode;
+
+ /*
+ * Writes will always be redirected to upper layer, so
+ * ignore lower layer being read-only.
+ *
+ * If the overlay itself is read-only then proceed
+ * with the permission check, don't return EROFS.
+ * This will only happen if this is the lower layer of
+ * another overlayfs.
+ *
+ * If upper fs becomes read-only after the overlay was
+ * constructed return EROFS to prevent modification of
+ * upper layer.
+ */
+ err = -EROFS;
+ if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
+ (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
+ goto out_dput;
+ }
+
+ err = __inode_permission(realinode, mask);
+out_dput:
+ dput(alias);
+ return err;
+}
+
+
+struct ovl_link_data {
+ struct dentry *realdentry;
+ void *cookie;
+};
+
+static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ void *ret;
+ struct dentry *realdentry;
+ struct inode *realinode;
+
+ realdentry = ovl_dentry_real(dentry);
+ realinode = realdentry->d_inode;
+
+ if (WARN_ON(!realinode->i_op->follow_link))
+ return ERR_PTR(-EPERM);
+
+ ret = realinode->i_op->follow_link(realdentry, nd);
+ if (IS_ERR(ret))
+ return ret;
+
+ if (realinode->i_op->put_link) {
+ struct ovl_link_data *data;
+
+ data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
+ if (!data) {
+ realinode->i_op->put_link(realdentry, nd, ret);
+ return ERR_PTR(-ENOMEM);
+ }
+ data->realdentry = realdentry;
+ data->cookie = ret;
+
+ return data;
+ } else {
+ return NULL;
+ }
+}
+
+static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
+{
+ struct inode *realinode;
+ struct ovl_link_data *data = c;
+
+ if (!data)
+ return;
+
+ realinode = data->realdentry->d_inode;
+ realinode->i_op->put_link(data->realdentry, nd, data->cookie);
+ kfree(data);
+}
+
+static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
+{
+ struct path realpath;
+ struct inode *realinode;
+
+ ovl_path_real(dentry, &realpath);
+ realinode = realpath.dentry->d_inode;
+
+ if (!realinode->i_op->readlink)
+ return -EINVAL;
+
+ touch_atime(&realpath);
+
+ return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
+}
+
+
+static bool ovl_is_private_xattr(const char *name)
+{
+ return strncmp(name, "trusted.overlay.", 14) == 0;
+}
+
+int ovl_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
+{
+ int err;
+ struct dentry *upperdentry;
+
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
+ err = -EPERM;
+ if (ovl_is_private_xattr(name))
+ goto out_drop_write;
+
+ err = ovl_copy_up(dentry);
+ if (err)
+ goto out_drop_write;
+
+ upperdentry = ovl_dentry_upper(dentry);
+ err = vfs_setxattr(upperdentry, name, value, size, flags);
+
+out_drop_write:
+ ovl_drop_write(dentry);
+out:
+ return err;
+}
+
+ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
+ void *value, size_t size)
+{
+ if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE &&
+ ovl_is_private_xattr(name))
+ return -ENODATA;
+
+ return vfs_getxattr(ovl_dentry_real(dentry), name, value, size);
+}
+
+ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+ ssize_t res;
+ int off;
+
+ res = vfs_listxattr(ovl_dentry_real(dentry), list, size);
+ if (res <= 0 || size == 0)
+ return res;
+
+ if (ovl_path_type(dentry->d_parent) != OVL_PATH_MERGE)
+ return res;
+
+ /* filter out private xattrs */
+ for (off = 0; off < res;) {
+ char *s = list + off;
+ size_t slen = strlen(s) + 1;
+
+ BUG_ON(off + slen > res);
+
+ if (ovl_is_private_xattr(s)) {
+ res -= slen;
+ memmove(s, s + slen, res - off);
+ } else {
+ off += slen;
+ }
+ }
+
+ return res;
+}
+
+int ovl_removexattr(struct dentry *dentry, const char *name)
+{
+ int err;
+ struct path realpath;
+ enum ovl_path_type type;
+
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
+ if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE &&
+ ovl_is_private_xattr(name))
+ goto out_drop_write;
+
+ type = ovl_path_real(dentry, &realpath);
+ if (type == OVL_PATH_LOWER) {
+ err = vfs_getxattr(realpath.dentry, name, NULL, 0);
+ if (err < 0)
+ goto out_drop_write;
+
+ err = ovl_copy_up(dentry);
+ if (err)
+ goto out_drop_write;
+
+ ovl_path_upper(dentry, &realpath);
+ }
+
+ err = vfs_removexattr(realpath.dentry, name);
+out_drop_write:
+ ovl_drop_write(dentry);
+out:
+ return err;
+}
+
+static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
+ struct dentry *realdentry)
+{
+ if (type != OVL_PATH_LOWER)
+ return false;
+
+ if (special_file(realdentry->d_inode->i_mode))
+ return false;
+
+ if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
+ return false;
+
+ return true;
+}
+
+static int ovl_dentry_open(struct dentry *dentry, struct file *file,
+ const struct cred *cred)
+{
+ int err;
+ struct path realpath;
+ enum ovl_path_type type;
+ bool want_write = false;
+
+ type = ovl_path_real(dentry, &realpath);
+ if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) {
+ want_write = true;
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
+ if (file->f_flags & O_TRUNC)
+ err = ovl_copy_up_last(dentry, NULL, true);
+ else
+ err = ovl_copy_up(dentry);
+ if (err)
+ goto out_drop_write;
+
+ ovl_path_upper(dentry, &realpath);
+ }
+
+ err = vfs_open(&realpath, file, cred);
+out_drop_write:
+ if (want_write)
+ ovl_drop_write(dentry);
+out:
+ return err;
+}
+
+static const struct inode_operations ovl_file_inode_operations = {
+ .setattr = ovl_setattr,
+ .permission = ovl_permission,
+ .getattr = ovl_getattr,
+ .setxattr = ovl_setxattr,
+ .getxattr = ovl_getxattr,
+ .listxattr = ovl_listxattr,
+ .removexattr = ovl_removexattr,
+ .dentry_open = ovl_dentry_open,
+};
+
+static const struct inode_operations ovl_symlink_inode_operations = {
+ .setattr = ovl_setattr,
+ .follow_link = ovl_follow_link,
+ .put_link = ovl_put_link,
+ .readlink = ovl_readlink,
+ .getattr = ovl_getattr,
+ .setxattr = ovl_setxattr,
+ .getxattr = ovl_getxattr,
+ .listxattr = ovl_listxattr,
+ .removexattr = ovl_removexattr,
+};
+
+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
+ struct ovl_entry *oe)
+{
+ struct inode *inode;
+
+ inode = new_inode(sb);
+ if (!inode)
+ return NULL;
+
+ mode &= S_IFMT;
+
+ inode->i_ino = get_next_ino();
+ inode->i_mode = mode;
+ inode->i_flags |= S_NOATIME | S_NOCMTIME;
+
+ switch (mode) {
+ case S_IFDIR:
+ inode->i_private = oe;
+ inode->i_op = &ovl_dir_inode_operations;
+ inode->i_fop = &ovl_dir_operations;
+ break;
+
+ case S_IFLNK:
+ inode->i_op = &ovl_symlink_inode_operations;
+ break;
+
+ case S_IFREG:
+ case S_IFSOCK:
+ case S_IFBLK:
+ case S_IFCHR:
+ case S_IFIFO:
+ inode->i_op = &ovl_file_inode_operations;
+ break;
+
+ default:
+ WARN(1, "illegal file type: %i\n", mode);
+ iput(inode);
+ inode = NULL;
+ }
+
+ return inode;
+
+}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
new file mode 100644
index 000000000000..814bed33dd07
--- /dev/null
+++ b/fs/overlayfs/overlayfs.h
@@ -0,0 +1,191 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+
+struct ovl_entry;
+
+enum ovl_path_type {
+ OVL_PATH_PURE_UPPER,
+ OVL_PATH_UPPER,
+ OVL_PATH_MERGE,
+ OVL_PATH_LOWER,
+};
+
+extern const char *ovl_opaque_xattr;
+
+static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ int err = vfs_rmdir(dir, dentry);
+ pr_debug("rmdir(%pd2) = %i\n", dentry, err);
+ return err;
+}
+
+static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
+{
+ int err = vfs_unlink(dir, dentry, NULL);
+ pr_debug("unlink(%pd2) = %i\n", dentry, err);
+ return err;
+}
+
+static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *new_dentry, bool debug)
+{
+ int err = vfs_link(old_dentry, dir, new_dentry, NULL);
+ if (debug) {
+ pr_debug("link(%pd2, %pd2) = %i\n",
+ old_dentry, new_dentry, err);
+ }
+ return err;
+}
+
+static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode, bool debug)
+{
+ int err = vfs_create(dir, dentry, mode, true);
+ if (debug)
+ pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
+ return err;
+}
+
+static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
+ umode_t mode, bool debug)
+{
+ int err = vfs_mkdir(dir, dentry, mode);
+ if (debug)
+ pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
+ return err;
+}
+
+static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
+ umode_t mode, dev_t dev, bool debug)
+{
+ int err = vfs_mknod(dir, dentry, mode, dev);
+ if (debug) {
+ pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
+ dentry, mode, dev, err);
+ }
+ return err;
+}
+
+static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
+ const char *oldname, bool debug)
+{
+ int err = vfs_symlink(dir, dentry, oldname);
+ if (debug)
+ pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
+ return err;
+}
+
+static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
+{
+ int err = vfs_setxattr(dentry, name, value, size, flags);
+ pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n",
+ dentry, name, (int) size, (char *) value, flags, err);
+ return err;
+}
+
+static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
+{
+ int err = vfs_removexattr(dentry, name);
+ pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
+ return err;
+}
+
+static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
+ struct inode *newdir, struct dentry *newdentry,
+ unsigned int flags)
+{
+ int err;
+
+ pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
+ olddentry, newdentry, flags);
+
+ err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
+
+ if (err) {
+ pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
+ olddentry, newdentry, err);
+ }
+ return err;
+}
+
+static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
+{
+ int err = vfs_whiteout(dir, dentry);
+ pr_debug("whiteout(%pd2) = %i\n", dentry, err);
+ return err;
+}
+
+enum ovl_path_type ovl_path_type(struct dentry *dentry);
+u64 ovl_dentry_version_get(struct dentry *dentry);
+void ovl_dentry_version_inc(struct dentry *dentry);
+void ovl_path_upper(struct dentry *dentry, struct path *path);
+void ovl_path_lower(struct dentry *dentry, struct path *path);
+enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
+struct dentry *ovl_dentry_upper(struct dentry *dentry);
+struct dentry *ovl_dentry_lower(struct dentry *dentry);
+struct dentry *ovl_dentry_real(struct dentry *dentry);
+struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
+struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
+void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
+struct dentry *ovl_workdir(struct dentry *dentry);
+int ovl_want_write(struct dentry *dentry);
+void ovl_drop_write(struct dentry *dentry);
+bool ovl_dentry_is_opaque(struct dentry *dentry);
+void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
+bool ovl_is_whiteout(struct dentry *dentry);
+void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
+struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags);
+struct file *ovl_path_open(struct path *path, int flags);
+
+struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
+ struct kstat *stat, const char *link);
+
+/* readdir.c */
+extern const struct file_operations ovl_dir_operations;
+int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
+void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
+void ovl_cache_free(struct list_head *list);
+
+/* inode.c */
+int ovl_setattr(struct dentry *dentry, struct iattr *attr);
+int ovl_permission(struct inode *inode, int mask);
+int ovl_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags);
+ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
+ void *value, size_t size);
+ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
+int ovl_removexattr(struct dentry *dentry, const char *name);
+
+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
+ struct ovl_entry *oe);
+static inline void ovl_copyattr(struct inode *from, struct inode *to)
+{
+ to->i_uid = from->i_uid;
+ to->i_gid = from->i_gid;
+}
+
+/* dir.c */
+extern const struct inode_operations ovl_dir_inode_operations;
+struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
+int ovl_create_real(struct inode *dir, struct dentry *newdentry,
+ struct kstat *stat, const char *link,
+ struct dentry *hardlink, bool debug);
+void ovl_cleanup(struct inode *dir, struct dentry *dentry);
+
+/* copy_up.c */
+int ovl_copy_up(struct dentry *dentry);
+int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
+ struct path *lowerpath, struct kstat *stat,
+ struct iattr *attr);
+int ovl_copy_xattr(struct dentry *old, struct dentry *new);
+int ovl_set_attr(struct dentry *upper, struct kstat *stat);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
new file mode 100644
index 000000000000..2a7ef4f8e2a6
--- /dev/null
+++ b/fs/overlayfs/readdir.c
@@ -0,0 +1,593 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/namei.h>
+#include <linux/file.h>
+#include <linux/xattr.h>
+#include <linux/rbtree.h>
+#include <linux/security.h>
+#include <linux/cred.h>
+#include "overlayfs.h"
+
+struct ovl_cache_entry {
+ unsigned int len;
+ unsigned int type;
+ u64 ino;
+ struct list_head l_node;
+ struct rb_node node;
+ bool is_whiteout;
+ bool is_cursor;
+ char name[];
+};
+
+struct ovl_dir_cache {
+ long refcount;
+ u64 version;
+ struct list_head entries;
+};
+
+struct ovl_readdir_data {
+ struct dir_context ctx;
+ bool is_merge;
+ struct rb_root root;
+ struct list_head *list;
+ struct list_head middle;
+ int count;
+ int err;
+};
+
+struct ovl_dir_file {
+ bool is_real;
+ bool is_upper;
+ struct ovl_dir_cache *cache;
+ struct ovl_cache_entry cursor;
+ struct file *realfile;
+ struct file *upperfile;
+};
+
+static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
+{
+ return container_of(n, struct ovl_cache_entry, node);
+}
+
+static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
+ const char *name, int len)
+{
+ struct rb_node *node = root->rb_node;
+ int cmp;
+
+ while (node) {
+ struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
+
+ cmp = strncmp(name, p->name, len);
+ if (cmp > 0)
+ node = p->node.rb_right;
+ else if (cmp < 0 || len < p->len)
+ node = p->node.rb_left;
+ else
+ return p;
+ }
+
+ return NULL;
+}
+
+static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len,
+ u64 ino, unsigned int d_type)
+{
+ struct ovl_cache_entry *p;
+ size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
+
+ p = kmalloc(size, GFP_KERNEL);
+ if (p) {
+ memcpy(p->name, name, len);
+ p->name[len] = '\0';
+ p->len = len;
+ p->type = d_type;
+ p->ino = ino;
+ p->is_whiteout = false;
+ p->is_cursor = false;
+ }
+
+ return p;
+}
+
+static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
+ const char *name, int len, u64 ino,
+ unsigned int d_type)
+{
+ struct rb_node **newp = &rdd->root.rb_node;
+ struct rb_node *parent = NULL;
+ struct ovl_cache_entry *p;
+
+ while (*newp) {
+ int cmp;
+ struct ovl_cache_entry *tmp;
+
+ parent = *newp;
+ tmp = ovl_cache_entry_from_node(*newp);
+ cmp = strncmp(name, tmp->name, len);
+ if (cmp > 0)
+ newp = &tmp->node.rb_right;
+ else if (cmp < 0 || len < tmp->len)
+ newp = &tmp->node.rb_left;
+ else
+ return 0;
+ }
+
+ p = ovl_cache_entry_new(name, len, ino, d_type);
+ if (p == NULL)
+ return -ENOMEM;
+
+ list_add_tail(&p->l_node, rdd->list);
+ rb_link_node(&p->node, parent, newp);
+ rb_insert_color(&p->node, &rdd->root);
+
+ return 0;
+}
+
+static int ovl_fill_lower(struct ovl_readdir_data *rdd,
+ const char *name, int namelen,
+ loff_t offset, u64 ino, unsigned int d_type)
+{
+ struct ovl_cache_entry *p;
+
+ p = ovl_cache_entry_find(&rdd->root, name, namelen);
+ if (p) {
+ list_move_tail(&p->l_node, &rdd->middle);
+ } else {
+ p = ovl_cache_entry_new(name, namelen, ino, d_type);
+ if (p == NULL)
+ rdd->err = -ENOMEM;
+ else
+ list_add_tail(&p->l_node, &rdd->middle);
+ }
+
+ return rdd->err;
+}
+
+void ovl_cache_free(struct list_head *list)
+{
+ struct ovl_cache_entry *p;
+ struct ovl_cache_entry *n;
+
+ list_for_each_entry_safe(p, n, list, l_node)
+ kfree(p);
+
+ INIT_LIST_HEAD(list);
+}
+
+static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
+{
+ struct ovl_dir_cache *cache = od->cache;
+
+ list_del_init(&od->cursor.l_node);
+ WARN_ON(cache->refcount <= 0);
+ cache->refcount--;
+ if (!cache->refcount) {
+ if (ovl_dir_cache(dentry) == cache)
+ ovl_set_dir_cache(dentry, NULL);
+
+ ovl_cache_free(&cache->entries);
+ kfree(cache);
+ }
+}
+
+static int ovl_fill_merge(void *buf, const char *name, int namelen,
+ loff_t offset, u64 ino, unsigned int d_type)
+{
+ struct ovl_readdir_data *rdd = buf;
+
+ rdd->count++;
+ if (!rdd->is_merge)
+ return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
+ else
+ return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);
+}
+
+static inline int ovl_dir_read(struct path *realpath,
+ struct ovl_readdir_data *rdd)
+{
+ struct file *realfile;
+ int err;
+
+ realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
+ if (IS_ERR(realfile))
+ return PTR_ERR(realfile);
+
+ rdd->ctx.pos = 0;
+ do {
+ rdd->count = 0;
+ rdd->err = 0;
+ err = iterate_dir(realfile, &rdd->ctx);
+ if (err >= 0)
+ err = rdd->err;
+ } while (!err && rdd->count);
+ fput(realfile);
+
+ return err;
+}
+
+static void ovl_dir_reset(struct file *file)
+{
+ struct ovl_dir_file *od = file->private_data;
+ struct ovl_dir_cache *cache = od->cache;
+ struct dentry *dentry = file->f_path.dentry;
+ enum ovl_path_type type = ovl_path_type(dentry);
+
+ if (cache && ovl_dentry_version_get(dentry) != cache->version) {
+ ovl_cache_put(od, dentry);
+ od->cache = NULL;
+ }
+ WARN_ON(!od->is_real && type != OVL_PATH_MERGE);
+ if (od->is_real && type == OVL_PATH_MERGE)
+ od->is_real = false;
+}
+
+static int ovl_dir_mark_whiteouts(struct dentry *dir,
+ struct ovl_readdir_data *rdd)
+{
+ struct ovl_cache_entry *p;
+ struct dentry *dentry;
+ const struct cred *old_cred;
+ struct cred *override_cred;
+
+ override_cred = prepare_creds();
+ if (!override_cred) {
+ ovl_cache_free(rdd->list);
+ return -ENOMEM;
+ }
+
+ /*
+ * CAP_DAC_OVERRIDE for lookup
+ */
+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+ old_cred = override_creds(override_cred);
+
+ mutex_lock(&dir->d_inode->i_mutex);
+ list_for_each_entry(p, rdd->list, l_node) {
+ if (p->is_cursor)
+ continue;
+
+ if (p->type != DT_CHR)
+ continue;
+
+ dentry = lookup_one_len(p->name, dir, p->len);
+ if (IS_ERR(dentry))
+ continue;
+
+ p->is_whiteout = ovl_is_whiteout(dentry);
+ dput(dentry);
+ }
+ mutex_unlock(&dir->d_inode->i_mutex);
+
+ revert_creds(old_cred);
+ put_cred(override_cred);
+
+ return 0;
+}
+
+static inline int ovl_dir_read_merged(struct path *upperpath,
+ struct path *lowerpath,
+ struct list_head *list)
+{
+ int err;
+ struct ovl_readdir_data rdd = {
+ .ctx.actor = ovl_fill_merge,
+ .list = list,
+ .root = RB_ROOT,
+ .is_merge = false,
+ };
+
+ if (upperpath->dentry) {
+ err = ovl_dir_read(upperpath, &rdd);
+ if (err)
+ goto out;
+
+ if (lowerpath->dentry) {
+ err = ovl_dir_mark_whiteouts(upperpath->dentry, &rdd);
+ if (err)
+ goto out;
+ }
+ }
+ if (lowerpath->dentry) {
+ /*
+ * Insert lowerpath entries before upperpath ones, this allows
+ * offsets to be reasonably constant
+ */
+ list_add(&rdd.middle, rdd.list);
+ rdd.is_merge = true;
+ err = ovl_dir_read(lowerpath, &rdd);
+ list_del(&rdd.middle);
+ }
+out:
+ return err;
+}
+
+static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
+{
+ struct ovl_cache_entry *p;
+ loff_t off = 0;
+
+ list_for_each_entry(p, &od->cache->entries, l_node) {
+ if (p->is_cursor)
+ continue;
+ if (off >= pos)
+ break;
+ off++;
+ }
+ list_move_tail(&od->cursor.l_node, &p->l_node);
+}
+
+static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
+{
+ int res;
+ struct path lowerpath;
+ struct path upperpath;
+ struct ovl_dir_cache *cache;
+
+ cache = ovl_dir_cache(dentry);
+ if (cache && ovl_dentry_version_get(dentry) == cache->version) {
+ cache->refcount++;
+ return cache;
+ }
+ ovl_set_dir_cache(dentry, NULL);
+
+ cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
+ if (!cache)
+ return ERR_PTR(-ENOMEM);
+
+ cache->refcount = 1;
+ INIT_LIST_HEAD(&cache->entries);
+
+ ovl_path_lower(dentry, &lowerpath);
+ ovl_path_upper(dentry, &upperpath);
+
+ res = ovl_dir_read_merged(&upperpath, &lowerpath, &cache->entries);
+ if (res) {
+ ovl_cache_free(&cache->entries);
+ kfree(cache);
+ return ERR_PTR(res);
+ }
+
+ cache->version = ovl_dentry_version_get(dentry);
+ ovl_set_dir_cache(dentry, cache);
+
+ return cache;
+}
+
+static int ovl_iterate(struct file *file, struct dir_context *ctx)
+{
+ struct ovl_dir_file *od = file->private_data;
+ struct dentry *dentry = file->f_path.dentry;
+
+ if (!ctx->pos)
+ ovl_dir_reset(file);
+
+ if (od->is_real)
+ return iterate_dir(od->realfile, ctx);
+
+ if (!od->cache) {
+ struct ovl_dir_cache *cache;
+
+ cache = ovl_cache_get(dentry);
+ if (IS_ERR(cache))
+ return PTR_ERR(cache);
+
+ od->cache = cache;
+ ovl_seek_cursor(od, ctx->pos);
+ }
+
+ while (od->cursor.l_node.next != &od->cache->entries) {
+ struct ovl_cache_entry *p;
+
+ p = list_entry(od->cursor.l_node.next, struct ovl_cache_entry, l_node);
+ /* Skip cursors */
+ if (!p->is_cursor) {
+ if (!p->is_whiteout) {
+ if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
+ break;
+ }
+ ctx->pos++;
+ }
+ list_move(&od->cursor.l_node, &p->l_node);
+ }
+ return 0;
+}
+
+static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
+{
+ loff_t res;
+ struct ovl_dir_file *od = file->private_data;
+
+ mutex_lock(&file_inode(file)->i_mutex);
+ if (!file->f_pos)
+ ovl_dir_reset(file);
+
+ if (od->is_real) {
+ res = vfs_llseek(od->realfile, offset, origin);
+ file->f_pos = od->realfile->f_pos;
+ } else {
+ res = -EINVAL;
+
+ switch (origin) {
+ case SEEK_CUR:
+ offset += file->f_pos;
+ break;
+ case SEEK_SET:
+ break;
+ default:
+ goto out_unlock;
+ }
+ if (offset < 0)
+ goto out_unlock;
+
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ if (od->cache)
+ ovl_seek_cursor(od, offset);
+ }
+ res = offset;
+ }
+out_unlock:
+ mutex_unlock(&file_inode(file)->i_mutex);
+
+ return res;
+}
+
+static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
+ int datasync)
+{
+ struct ovl_dir_file *od = file->private_data;
+ struct dentry *dentry = file->f_path.dentry;
+ struct file *realfile = od->realfile;
+
+ /*
+ * Need to check if we started out being a lower dir, but got copied up
+ */
+ if (!od->is_upper && ovl_path_type(dentry) == OVL_PATH_MERGE) {
+ struct inode *inode = file_inode(file);
+
+ realfile =lockless_dereference(od->upperfile);
+ if (!realfile) {
+ struct path upperpath;
+
+ ovl_path_upper(dentry, &upperpath);
+ realfile = ovl_path_open(&upperpath, O_RDONLY);
+ smp_mb__before_spinlock();
+ mutex_lock(&inode->i_mutex);
+ if (!od->upperfile) {
+ if (IS_ERR(realfile)) {
+ mutex_unlock(&inode->i_mutex);
+ return PTR_ERR(realfile);
+ }
+ od->upperfile = realfile;
+ } else {
+ /* somebody has beaten us to it */
+ if (!IS_ERR(realfile))
+ fput(realfile);
+ realfile = od->upperfile;
+ }
+ mutex_unlock(&inode->i_mutex);
+ }
+ }
+
+ return vfs_fsync_range(realfile, start, end, datasync);
+}
+
+static int ovl_dir_release(struct inode *inode, struct file *file)
+{
+ struct ovl_dir_file *od = file->private_data;
+
+ if (od->cache) {
+ mutex_lock(&inode->i_mutex);
+ ovl_cache_put(od, file->f_path.dentry);
+ mutex_unlock(&inode->i_mutex);
+ }
+ fput(od->realfile);
+ if (od->upperfile)
+ fput(od->upperfile);
+ kfree(od);
+
+ return 0;
+}
+
+static int ovl_dir_open(struct inode *inode, struct file *file)
+{
+ struct path realpath;
+ struct file *realfile;
+ struct ovl_dir_file *od;
+ enum ovl_path_type type;
+
+ od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
+ if (!od)
+ return -ENOMEM;
+
+ type = ovl_path_real(file->f_path.dentry, &realpath);
+ realfile = ovl_path_open(&realpath, file->f_flags);
+ if (IS_ERR(realfile)) {
+ kfree(od);
+ return PTR_ERR(realfile);
+ }
+ INIT_LIST_HEAD(&od->cursor.l_node);
+ od->realfile = realfile;
+ od->is_real = (type != OVL_PATH_MERGE);
+ od->is_upper = (type != OVL_PATH_LOWER);
+ od->cursor.is_cursor = true;
+ file->private_data = od;
+
+ return 0;
+}
+
+const struct file_operations ovl_dir_operations = {
+ .read = generic_read_dir,
+ .open = ovl_dir_open,
+ .iterate = ovl_iterate,
+ .llseek = ovl_dir_llseek,
+ .fsync = ovl_dir_fsync,
+ .release = ovl_dir_release,
+};
+
+int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
+{
+ int err;
+ struct path lowerpath;
+ struct path upperpath;
+ struct ovl_cache_entry *p;
+
+ ovl_path_upper(dentry, &upperpath);
+ ovl_path_lower(dentry, &lowerpath);
+
+ err = ovl_dir_read_merged(&upperpath, &lowerpath, list);
+ if (err)
+ return err;
+
+ err = 0;
+
+ list_for_each_entry(p, list, l_node) {
+ if (p->is_whiteout)
+ continue;
+
+ if (p->name[0] == '.') {
+ if (p->len == 1)
+ continue;
+ if (p->len == 2 && p->name[1] == '.')
+ continue;
+ }
+ err = -ENOTEMPTY;
+ break;
+ }
+
+ return err;
+}
+
+void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
+{
+ struct ovl_cache_entry *p;
+
+ mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
+ list_for_each_entry(p, list, l_node) {
+ struct dentry *dentry;
+
+ if (!p->is_whiteout)
+ continue;
+
+ dentry = lookup_one_len(p->name, upper, p->len);
+ if (IS_ERR(dentry)) {
+ pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
+ upper->d_name.name, p->len, p->name,
+ (int) PTR_ERR(dentry));
+ continue;
+ }
+ ovl_cleanup(upper->d_inode, dentry);
+ dput(dentry);
+ }
+ mutex_unlock(&upper->d_inode->i_mutex);
+}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
new file mode 100644
index 000000000000..08b704cebfc4
--- /dev/null
+++ b/fs/overlayfs/super.c
@@ -0,0 +1,796 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/mount.h>
+#include <linux/slab.h>
+#include <linux/parser.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/statfs.h>
+#include <linux/seq_file.h>
+#include "overlayfs.h"
+
+MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
+MODULE_DESCRIPTION("Overlay filesystem");
+MODULE_LICENSE("GPL");
+
+#define OVERLAYFS_SUPER_MAGIC 0x794c764f
+
+struct ovl_config {
+ char *lowerdir;
+ char *upperdir;
+ char *workdir;
+};
+
+/* private information held for overlayfs's superblock */
+struct ovl_fs {
+ struct vfsmount *upper_mnt;
+ struct vfsmount *lower_mnt;
+ struct dentry *workdir;
+ long lower_namelen;
+ /* pathnames of lower and upper dirs, for show_options */
+ struct ovl_config config;
+};
+
+struct ovl_dir_cache;
+
+/* private information held for every overlayfs dentry */
+struct ovl_entry {
+ struct dentry *__upperdentry;
+ struct dentry *lowerdentry;
+ struct ovl_dir_cache *cache;
+ union {
+ struct {
+ u64 version;
+ bool opaque;
+ };
+ struct rcu_head rcu;
+ };
+};
+
+const char *ovl_opaque_xattr = "trusted.overlay.opaque";
+
+
+enum ovl_path_type ovl_path_type(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ if (oe->__upperdentry) {
+ if (oe->lowerdentry) {
+ if (S_ISDIR(dentry->d_inode->i_mode))
+ return OVL_PATH_MERGE;
+ else
+ return OVL_PATH_UPPER;
+ } else {
+ if (oe->opaque)
+ return OVL_PATH_UPPER;
+ else
+ return OVL_PATH_PURE_UPPER;
+ }
+ } else {
+ return OVL_PATH_LOWER;
+ }
+}
+
+static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe)
+{
+ struct dentry *upperdentry = ACCESS_ONCE(oe->__upperdentry);
+ /*
+ * Make sure to order reads to upperdentry wrt ovl_dentry_update()
+ */
+ smp_read_barrier_depends();
+ return upperdentry;
+}
+
+void ovl_path_upper(struct dentry *dentry, struct path *path)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ path->mnt = ofs->upper_mnt;
+ path->dentry = ovl_upperdentry_dereference(oe);
+}
+
+enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path)
+{
+
+ enum ovl_path_type type = ovl_path_type(dentry);
+
+ if (type == OVL_PATH_LOWER)
+ ovl_path_lower(dentry, path);
+ else
+ ovl_path_upper(dentry, path);
+
+ return type;
+}
+
+struct dentry *ovl_dentry_upper(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ return ovl_upperdentry_dereference(oe);
+}
+
+struct dentry *ovl_dentry_lower(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ return oe->lowerdentry;
+}
+
+struct dentry *ovl_dentry_real(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+ struct dentry *realdentry;
+
+ realdentry = ovl_upperdentry_dereference(oe);
+ if (!realdentry)
+ realdentry = oe->lowerdentry;
+
+ return realdentry;
+}
+
+struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper)
+{
+ struct dentry *realdentry;
+
+ realdentry = ovl_upperdentry_dereference(oe);
+ if (realdentry) {
+ *is_upper = true;
+ } else {
+ realdentry = oe->lowerdentry;
+ *is_upper = false;
+ }
+ return realdentry;
+}
+
+struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ return oe->cache;
+}
+
+void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ oe->cache = cache;
+}
+
+void ovl_path_lower(struct dentry *dentry, struct path *path)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ path->mnt = ofs->lower_mnt;
+ path->dentry = oe->lowerdentry;
+}
+
+int ovl_want_write(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ return mnt_want_write(ofs->upper_mnt);
+}
+
+void ovl_drop_write(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ mnt_drop_write(ofs->upper_mnt);
+}
+
+struct dentry *ovl_workdir(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ return ofs->workdir;
+}
+
+bool ovl_dentry_is_opaque(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+ return oe->opaque;
+}
+
+void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+ oe->opaque = opaque;
+}
+
+void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex));
+ WARN_ON(oe->__upperdentry);
+ BUG_ON(!upperdentry->d_inode);
+ /*
+ * Make sure upperdentry is consistent before making it visible to
+ * ovl_upperdentry_dereference().
+ */
+ smp_wmb();
+ oe->__upperdentry = upperdentry;
+}
+
+void ovl_dentry_version_inc(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
+ oe->version++;
+}
+
+u64 ovl_dentry_version_get(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
+ return oe->version;
+}
+
+bool ovl_is_whiteout(struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+
+ return inode && IS_WHITEOUT(inode);
+}
+
+static bool ovl_is_opaquedir(struct dentry *dentry)
+{
+ int res;
+ char val;
+ struct inode *inode = dentry->d_inode;
+
+ if (!S_ISDIR(inode->i_mode) || !inode->i_op->getxattr)
+ return false;
+
+ res = inode->i_op->getxattr(dentry, ovl_opaque_xattr, &val, 1);
+ if (res == 1 && val == 'y')
+ return true;
+
+ return false;
+}
+
+static void ovl_dentry_release(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ if (oe) {
+ dput(oe->__upperdentry);
+ dput(oe->lowerdentry);
+ kfree_rcu(oe, rcu);
+ }
+}
+
+static const struct dentry_operations ovl_dentry_operations = {
+ .d_release = ovl_dentry_release,
+};
+
+static struct ovl_entry *ovl_alloc_entry(void)
+{
+ return kzalloc(sizeof(struct ovl_entry), GFP_KERNEL);
+}
+
+static inline struct dentry *ovl_lookup_real(struct dentry *dir,
+ struct qstr *name)
+{
+ struct dentry *dentry;
+
+ mutex_lock(&dir->d_inode->i_mutex);
+ dentry = lookup_one_len(name->name, dir, name->len);
+ mutex_unlock(&dir->d_inode->i_mutex);
+
+ if (IS_ERR(dentry)) {
+ if (PTR_ERR(dentry) == -ENOENT)
+ dentry = NULL;
+ } else if (!dentry->d_inode) {
+ dput(dentry);
+ dentry = NULL;
+ }
+ return dentry;
+}
+
+struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct ovl_entry *oe;
+ struct dentry *upperdir;
+ struct dentry *lowerdir;
+ struct dentry *upperdentry = NULL;
+ struct dentry *lowerdentry = NULL;
+ struct inode *inode = NULL;
+ int err;
+
+ err = -ENOMEM;
+ oe = ovl_alloc_entry();
+ if (!oe)
+ goto out;
+
+ upperdir = ovl_dentry_upper(dentry->d_parent);
+ lowerdir = ovl_dentry_lower(dentry->d_parent);
+
+ if (upperdir) {
+ upperdentry = ovl_lookup_real(upperdir, &dentry->d_name);
+ err = PTR_ERR(upperdentry);
+ if (IS_ERR(upperdentry))
+ goto out_put_dir;
+
+ if (lowerdir && upperdentry) {
+ if (ovl_is_whiteout(upperdentry)) {
+ dput(upperdentry);
+ upperdentry = NULL;
+ oe->opaque = true;
+ } else if (ovl_is_opaquedir(upperdentry)) {
+ oe->opaque = true;
+ }
+ }
+ }
+ if (lowerdir && !oe->opaque) {
+ lowerdentry = ovl_lookup_real(lowerdir, &dentry->d_name);
+ err = PTR_ERR(lowerdentry);
+ if (IS_ERR(lowerdentry))
+ goto out_dput_upper;
+ }
+
+ if (lowerdentry && upperdentry &&
+ (!S_ISDIR(upperdentry->d_inode->i_mode) ||
+ !S_ISDIR(lowerdentry->d_inode->i_mode))) {
+ dput(lowerdentry);
+ lowerdentry = NULL;
+ oe->opaque = true;
+ }
+
+ if (lowerdentry || upperdentry) {
+ struct dentry *realdentry;
+
+ realdentry = upperdentry ? upperdentry : lowerdentry;
+ err = -ENOMEM;
+ inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode,
+ oe);
+ if (!inode)
+ goto out_dput;
+ ovl_copyattr(realdentry->d_inode, inode);
+ }
+
+ oe->__upperdentry = upperdentry;
+ oe->lowerdentry = lowerdentry;
+
+ dentry->d_fsdata = oe;
+ d_add(dentry, inode);
+
+ return NULL;
+
+out_dput:
+ dput(lowerdentry);
+out_dput_upper:
+ dput(upperdentry);
+out_put_dir:
+ kfree(oe);
+out:
+ return ERR_PTR(err);
+}
+
+struct file *ovl_path_open(struct path *path, int flags)
+{
+ return dentry_open(path, flags, current_cred());
+}
+
+static void ovl_put_super(struct super_block *sb)
+{
+ struct ovl_fs *ufs = sb->s_fs_info;
+
+ dput(ufs->workdir);
+ mntput(ufs->upper_mnt);
+ mntput(ufs->lower_mnt);
+
+ kfree(ufs->config.lowerdir);
+ kfree(ufs->config.upperdir);
+ kfree(ufs->config.workdir);
+ kfree(ufs);
+}
+
+/**
+ * ovl_statfs
+ * @sb: The overlayfs super block
+ * @buf: The struct kstatfs to fill in with stats
+ *
+ * Get the filesystem statistics. As writes always target the upper layer
+ * filesystem pass the statfs to the same filesystem.
+ */
+static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ struct dentry *root_dentry = dentry->d_sb->s_root;
+ struct path path;
+ int err;
+
+ ovl_path_upper(root_dentry, &path);
+
+ err = vfs_statfs(&path, buf);
+ if (!err) {
+ buf->f_namelen = max(buf->f_namelen, ofs->lower_namelen);
+ buf->f_type = OVERLAYFS_SUPER_MAGIC;
+ }
+
+ return err;
+}
+
+/**
+ * ovl_show_options
+ *
+ * Prints the mount options for a given superblock.
+ * Returns zero; does not fail.
+ */
+static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct ovl_fs *ufs = sb->s_fs_info;
+
+ seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir);
+ seq_printf(m, ",upperdir=%s", ufs->config.upperdir);
+ seq_printf(m, ",workdir=%s", ufs->config.workdir);
+ return 0;
+}
+
+static const struct super_operations ovl_super_operations = {
+ .put_super = ovl_put_super,
+ .statfs = ovl_statfs,
+ .show_options = ovl_show_options,
+};
+
+enum {
+ OPT_LOWERDIR,
+ OPT_UPPERDIR,
+ OPT_WORKDIR,
+ OPT_ERR,
+};
+
+static const match_table_t ovl_tokens = {
+ {OPT_LOWERDIR, "lowerdir=%s"},
+ {OPT_UPPERDIR, "upperdir=%s"},
+ {OPT_WORKDIR, "workdir=%s"},
+ {OPT_ERR, NULL}
+};
+
+static int ovl_parse_opt(char *opt, struct ovl_config *config)
+{
+ char *p;
+
+ while ((p = strsep(&opt, ",")) != NULL) {
+ int token;
+ substring_t args[MAX_OPT_ARGS];
+
+ if (!*p)
+ continue;
+
+ token = match_token(p, ovl_tokens, args);
+ switch (token) {
+ case OPT_UPPERDIR:
+ kfree(config->upperdir);
+ config->upperdir = match_strdup(&args[0]);
+ if (!config->upperdir)
+ return -ENOMEM;
+ break;
+
+ case OPT_LOWERDIR:
+ kfree(config->lowerdir);
+ config->lowerdir = match_strdup(&args[0]);
+ if (!config->lowerdir)
+ return -ENOMEM;
+ break;
+
+ case OPT_WORKDIR:
+ kfree(config->workdir);
+ config->workdir = match_strdup(&args[0]);
+ if (!config->workdir)
+ return -ENOMEM;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+#define OVL_WORKDIR_NAME "work"
+
+static struct dentry *ovl_workdir_create(struct vfsmount *mnt,
+ struct dentry *dentry)
+{
+ struct inode *dir = dentry->d_inode;
+ struct dentry *work;
+ int err;
+ bool retried = false;
+
+ err = mnt_want_write(mnt);
+ if (err)
+ return ERR_PTR(err);
+
+ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+retry:
+ work = lookup_one_len(OVL_WORKDIR_NAME, dentry,
+ strlen(OVL_WORKDIR_NAME));
+
+ if (!IS_ERR(work)) {
+ struct kstat stat = {
+ .mode = S_IFDIR | 0,
+ };
+
+ if (work->d_inode) {
+ err = -EEXIST;
+ if (retried)
+ goto out_dput;
+
+ retried = true;
+ ovl_cleanup(dir, work);
+ dput(work);
+ goto retry;
+ }
+
+ err = ovl_create_real(dir, work, &stat, NULL, NULL, true);
+ if (err)
+ goto out_dput;
+ }
+out_unlock:
+ mutex_unlock(&dir->i_mutex);
+ mnt_drop_write(mnt);
+
+ return work;
+
+out_dput:
+ dput(work);
+ work = ERR_PTR(err);
+ goto out_unlock;
+}
+
+static int ovl_mount_dir(const char *name, struct path *path)
+{
+ int err;
+
+ err = kern_path(name, LOOKUP_FOLLOW, path);
+ if (err) {
+ pr_err("overlayfs: failed to resolve '%s': %i\n", name, err);
+ err = -EINVAL;
+ }
+ return err;
+}
+
+static bool ovl_is_allowed_fs_type(struct dentry *root)
+{
+ const struct dentry_operations *dop = root->d_op;
+
+ /*
+ * We don't support:
+ * - automount filesystems
+ * - filesystems with revalidate (FIXME for lower layer)
+ * - filesystems with case insensitive names
+ */
+ if (dop &&
+ (dop->d_manage || dop->d_automount ||
+ dop->d_revalidate || dop->d_weak_revalidate ||
+ dop->d_compare || dop->d_hash)) {
+ return false;
+ }
+ return true;
+}
+
+/* Workdir should not be subdir of upperdir and vice versa */
+static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir)
+{
+ bool ok = false;
+
+ if (workdir != upperdir) {
+ ok = (lock_rename(workdir, upperdir) == NULL);
+ unlock_rename(workdir, upperdir);
+ }
+ return ok;
+}
+
+static int ovl_fill_super(struct super_block *sb, void *data, int silent)
+{
+ struct path lowerpath;
+ struct path upperpath;
+ struct path workpath;
+ struct inode *root_inode;
+ struct dentry *root_dentry;
+ struct ovl_entry *oe;
+ struct ovl_fs *ufs;
+ struct kstatfs statfs;
+ int err;
+
+ err = -ENOMEM;
+ ufs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL);
+ if (!ufs)
+ goto out;
+
+ err = ovl_parse_opt((char *) data, &ufs->config);
+ if (err)
+ goto out_free_config;
+
+ /* FIXME: workdir is not needed for a R/O mount */
+ err = -EINVAL;
+ if (!ufs->config.upperdir || !ufs->config.lowerdir ||
+ !ufs->config.workdir) {
+ pr_err("overlayfs: missing upperdir or lowerdir or workdir\n");
+ goto out_free_config;
+ }
+
+ err = -ENOMEM;
+ oe = ovl_alloc_entry();
+ if (oe == NULL)
+ goto out_free_config;
+
+ err = ovl_mount_dir(ufs->config.upperdir, &upperpath);
+ if (err)
+ goto out_free_oe;
+
+ err = ovl_mount_dir(ufs->config.lowerdir, &lowerpath);
+ if (err)
+ goto out_put_upperpath;
+
+ err = ovl_mount_dir(ufs->config.workdir, &workpath);
+ if (err)
+ goto out_put_lowerpath;
+
+ err = -EINVAL;
+ if (!S_ISDIR(upperpath.dentry->d_inode->i_mode) ||
+ !S_ISDIR(lowerpath.dentry->d_inode->i_mode) ||
+ !S_ISDIR(workpath.dentry->d_inode->i_mode)) {
+ pr_err("overlayfs: upperdir or lowerdir or workdir not a directory\n");
+ goto out_put_workpath;
+ }
+
+ if (upperpath.mnt != workpath.mnt) {
+ pr_err("overlayfs: workdir and upperdir must reside under the same mount\n");
+ goto out_put_workpath;
+ }
+ if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) {
+ pr_err("overlayfs: workdir and upperdir must be separate subtrees\n");
+ goto out_put_workpath;
+ }
+
+ if (!ovl_is_allowed_fs_type(upperpath.dentry)) {
+ pr_err("overlayfs: filesystem of upperdir is not supported\n");
+ goto out_put_workpath;
+ }
+
+ if (!ovl_is_allowed_fs_type(lowerpath.dentry)) {
+ pr_err("overlayfs: filesystem of lowerdir is not supported\n");
+ goto out_put_workpath;
+ }
+
+ err = vfs_statfs(&lowerpath, &statfs);
+ if (err) {
+ pr_err("overlayfs: statfs failed on lowerpath\n");
+ goto out_put_workpath;
+ }
+ ufs->lower_namelen = statfs.f_namelen;
+
+ sb->s_stack_depth = max(upperpath.mnt->mnt_sb->s_stack_depth,
+ lowerpath.mnt->mnt_sb->s_stack_depth) + 1;
+
+ err = -EINVAL;
+ if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
+ pr_err("overlayfs: maximum fs stacking depth exceeded\n");
+ goto out_put_workpath;
+ }
+
+ ufs->upper_mnt = clone_private_mount(&upperpath);
+ err = PTR_ERR(ufs->upper_mnt);
+ if (IS_ERR(ufs->upper_mnt)) {
+ pr_err("overlayfs: failed to clone upperpath\n");
+ goto out_put_workpath;
+ }
+
+ ufs->lower_mnt = clone_private_mount(&lowerpath);
+ err = PTR_ERR(ufs->lower_mnt);
+ if (IS_ERR(ufs->lower_mnt)) {
+ pr_err("overlayfs: failed to clone lowerpath\n");
+ goto out_put_upper_mnt;
+ }
+
+ ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry);
+ err = PTR_ERR(ufs->workdir);
+ if (IS_ERR(ufs->workdir)) {
+ pr_err("overlayfs: failed to create directory %s/%s\n",
+ ufs->config.workdir, OVL_WORKDIR_NAME);
+ goto out_put_lower_mnt;
+ }
+
+ /*
+ * Make lower_mnt R/O. That way fchmod/fchown on lower file
+ * will fail instead of modifying lower fs.
+ */
+ ufs->lower_mnt->mnt_flags |= MNT_READONLY;
+
+ /* If the upper fs is r/o, we mark overlayfs r/o too */
+ if (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY)
+ sb->s_flags |= MS_RDONLY;
+
+ sb->s_d_op = &ovl_dentry_operations;
+
+ err = -ENOMEM;
+ root_inode = ovl_new_inode(sb, S_IFDIR, oe);
+ if (!root_inode)
+ goto out_put_workdir;
+
+ root_dentry = d_make_root(root_inode);
+ if (!root_dentry)
+ goto out_put_workdir;
+
+ mntput(upperpath.mnt);
+ mntput(lowerpath.mnt);
+ path_put(&workpath);
+
+ oe->__upperdentry = upperpath.dentry;
+ oe->lowerdentry = lowerpath.dentry;
+
+ root_dentry->d_fsdata = oe;
+
+ sb->s_magic = OVERLAYFS_SUPER_MAGIC;
+ sb->s_op = &ovl_super_operations;
+ sb->s_root = root_dentry;
+ sb->s_fs_info = ufs;
+
+ return 0;
+
+out_put_workdir:
+ dput(ufs->workdir);
+out_put_lower_mnt:
+ mntput(ufs->lower_mnt);
+out_put_upper_mnt:
+ mntput(ufs->upper_mnt);
+out_put_workpath:
+ path_put(&workpath);
+out_put_lowerpath:
+ path_put(&lowerpath);
+out_put_upperpath:
+ path_put(&upperpath);
+out_free_oe:
+ kfree(oe);
+out_free_config:
+ kfree(ufs->config.lowerdir);
+ kfree(ufs->config.upperdir);
+ kfree(ufs->config.workdir);
+ kfree(ufs);
+out:
+ return err;
+}
+
+static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *raw_data)
+{
+ return mount_nodev(fs_type, flags, raw_data, ovl_fill_super);
+}
+
+static struct file_system_type ovl_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "overlayfs",
+ .mount = ovl_mount,
+ .kill_sb = kill_anon_super,
+};
+MODULE_ALIAS_FS("overlayfs");
+
+static int __init ovl_init(void)
+{
+ return register_filesystem(&ovl_fs_type);
+}
+
+static void __exit ovl_exit(void)
+{
+ unregister_filesystem(&ovl_fs_type);
+}
+
+module_init(ovl_init);
+module_exit(ovl_exit);
diff --git a/fs/pnode.c b/fs/pnode.c
index 302bf22c4a30..aae331a5d03b 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -381,6 +381,7 @@ static void __propagate_umount(struct mount *mnt)
* other children
*/
if (child && list_empty(&child->mnt_mounts)) {
+ list_del_init(&child->mnt_child);
hlist_del_init_rcu(&child->mnt_hash);
hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash);
}
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 239493ec718e..7151ea428041 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -23,6 +23,7 @@ proc-y += version.o
proc-y += softirqs.o
proc-y += namespaces.o
proc-y += self.o
+proc-y += thread_self.o
proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
proc-$(CONFIG_NET) += proc_net.o
proc-$(CONFIG_PROC_KCORE) += kcore.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 2d696b0c93bf..772efa45a452 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -105,7 +105,7 @@
*/
struct pid_entry {
- char *name;
+ const char *name;
int len;
umode_t mode;
const struct inode_operations *iop;
@@ -130,10 +130,6 @@ struct pid_entry {
{ .proc_get_link = get_link } )
#define REG(NAME, MODE, fops) \
NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
-#define INF(NAME, MODE, read) \
- NOD(NAME, (S_IFREG|(MODE)), \
- NULL, &proc_info_file_operations, \
- { .proc_read = read } )
#define ONE(NAME, MODE, show) \
NOD(NAME, (S_IFREG|(MODE)), \
NULL, &proc_single_file_operations, \
@@ -200,27 +196,32 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
return result;
}
-static int proc_pid_cmdline(struct task_struct *task, char *buffer)
+static int proc_pid_cmdline(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
{
- return get_cmdline(task, buffer, PAGE_SIZE);
+ /*
+ * Rely on struct seq_operations::show() being called once
+ * per internal buffer allocation. See single_open(), traverse().
+ */
+ BUG_ON(m->size < PAGE_SIZE);
+ m->count += get_cmdline(task, m->buf, PAGE_SIZE);
+ return 0;
}
-static int proc_pid_auxv(struct task_struct *task, char *buffer)
+static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
{
struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ);
- int res = PTR_ERR(mm);
if (mm && !IS_ERR(mm)) {
unsigned int nwords = 0;
do {
nwords += 2;
} while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
- res = nwords * sizeof(mm->saved_auxv[0]);
- if (res > PAGE_SIZE)
- res = PAGE_SIZE;
- memcpy(buffer, mm->saved_auxv, res);
+ seq_write(m, mm->saved_auxv, nwords * sizeof(mm->saved_auxv[0]));
mmput(mm);
- }
- return res;
+ return 0;
+ } else
+ return PTR_ERR(mm);
}
@@ -229,7 +230,8 @@ static int proc_pid_auxv(struct task_struct *task, char *buffer)
* Provides a wchan file via kallsyms in a proper one-value-per-file format.
* Returns the resolved symbol. If that fails, simply return the address.
*/
-static int proc_pid_wchan(struct task_struct *task, char *buffer)
+static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
{
unsigned long wchan;
char symname[KSYM_NAME_LEN];
@@ -240,9 +242,9 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
if (!ptrace_may_access(task, PTRACE_MODE_READ))
return 0;
else
- return sprintf(buffer, "%lu", wchan);
+ return seq_printf(m, "%lu", wchan);
else
- return sprintf(buffer, "%s", symname);
+ return seq_printf(m, "%s", symname);
}
#endif /* CONFIG_KALLSYMS */
@@ -304,9 +306,10 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
/*
* Provides /proc/PID/schedstat
*/
-static int proc_pid_schedstat(struct task_struct *task, char *buffer)
+static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
{
- return sprintf(buffer, "%llu %llu %lu\n",
+ return seq_printf(m, "%llu %llu %lu\n",
(unsigned long long)task->se.sum_exec_runtime,
(unsigned long long)task->sched_info.run_delay,
task->sched_info.pcount);
@@ -373,38 +376,8 @@ static const struct file_operations proc_lstats_operations = {
#endif
-#ifdef CONFIG_CGROUPS
-static int cgroup_open(struct inode *inode, struct file *file)
-{
- struct pid *pid = PROC_I(inode)->pid;
- return single_open(file, proc_cgroup_show, pid);
-}
-
-static const struct file_operations proc_cgroup_operations = {
- .open = cgroup_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-#endif
-
-#ifdef CONFIG_PROC_PID_CPUSET
-
-static int cpuset_open(struct inode *inode, struct file *file)
-{
- struct pid *pid = PROC_I(inode)->pid;
- return single_open(file, proc_cpuset_show, pid);
-}
-
-static const struct file_operations proc_cpuset_operations = {
- .open = cpuset_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-#endif
-
-static int proc_oom_score(struct task_struct *task, char *buffer)
+static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
{
unsigned long totalpages = totalram_pages + total_swap_pages;
unsigned long points = 0;
@@ -414,12 +387,12 @@ static int proc_oom_score(struct task_struct *task, char *buffer)
points = oom_badness(task, NULL, NULL, totalpages) *
1000 / totalpages;
read_unlock(&tasklist_lock);
- return sprintf(buffer, "%lu\n", points);
+ return seq_printf(m, "%lu\n", points);
}
struct limit_names {
- char *name;
- char *unit;
+ const char *name;
+ const char *unit;
};
static const struct limit_names lnames[RLIM_NLIMITS] = {
@@ -442,12 +415,11 @@ static const struct limit_names lnames[RLIM_NLIMITS] = {
};
/* Display limits for a process */
-static int proc_pid_limits(struct task_struct *task, char *buffer)
+static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
{
unsigned int i;
- int count = 0;
unsigned long flags;
- char *bufptr = buffer;
struct rlimit rlim[RLIM_NLIMITS];
@@ -459,35 +431,34 @@ static int proc_pid_limits(struct task_struct *task, char *buffer)
/*
* print the file header
*/
- count += sprintf(&bufptr[count], "%-25s %-20s %-20s %-10s\n",
+ seq_printf(m, "%-25s %-20s %-20s %-10s\n",
"Limit", "Soft Limit", "Hard Limit", "Units");
for (i = 0; i < RLIM_NLIMITS; i++) {
if (rlim[i].rlim_cur == RLIM_INFINITY)
- count += sprintf(&bufptr[count], "%-25s %-20s ",
+ seq_printf(m, "%-25s %-20s ",
lnames[i].name, "unlimited");
else
- count += sprintf(&bufptr[count], "%-25s %-20lu ",
+ seq_printf(m, "%-25s %-20lu ",
lnames[i].name, rlim[i].rlim_cur);
if (rlim[i].rlim_max == RLIM_INFINITY)
- count += sprintf(&bufptr[count], "%-20s ", "unlimited");
+ seq_printf(m, "%-20s ", "unlimited");
else
- count += sprintf(&bufptr[count], "%-20lu ",
- rlim[i].rlim_max);
+ seq_printf(m, "%-20lu ", rlim[i].rlim_max);
if (lnames[i].unit)
- count += sprintf(&bufptr[count], "%-10s\n",
- lnames[i].unit);
+ seq_printf(m, "%-10s\n", lnames[i].unit);
else
- count += sprintf(&bufptr[count], "\n");
+ seq_putc(m, '\n');
}
- return count;
+ return 0;
}
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
-static int proc_pid_syscall(struct task_struct *task, char *buffer)
+static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
{
long nr;
unsigned long args[6], sp, pc;
@@ -496,11 +467,11 @@ static int proc_pid_syscall(struct task_struct *task, char *buffer)
return res;
if (task_current_syscall(task, &nr, args, 6, &sp, &pc))
- res = sprintf(buffer, "running\n");
+ seq_puts(m, "running\n");
else if (nr < 0)
- res = sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
+ seq_printf(m, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
else
- res = sprintf(buffer,
+ seq_printf(m,
"%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n",
nr,
args[0], args[1], args[2], args[3], args[4], args[5],
@@ -598,43 +569,6 @@ static const struct inode_operations proc_def_inode_operations = {
.setattr = proc_setattr,
};
-#define PROC_BLOCK_SIZE (3*1024) /* 4K page size but our output routines use some slack for overruns */
-
-static ssize_t proc_info_read(struct file * file, char __user * buf,
- size_t count, loff_t *ppos)
-{
- struct inode * inode = file_inode(file);
- unsigned long page;
- ssize_t length;
- struct task_struct *task = get_proc_task(inode);
-
- length = -ESRCH;
- if (!task)
- goto out_no_task;
-
- if (count > PROC_BLOCK_SIZE)
- count = PROC_BLOCK_SIZE;
-
- length = -ENOMEM;
- if (!(page = __get_free_page(GFP_TEMPORARY)))
- goto out;
-
- length = PROC_I(inode)->op.proc_read(task, (char*)page);
-
- if (length >= 0)
- length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
- free_page(page);
-out:
- put_task_struct(task);
-out_no_task:
- return length;
-}
-
-static const struct file_operations proc_info_file_operations = {
- .read = proc_info_read,
- .llseek = generic_file_llseek,
-};
-
static int proc_single_show(struct seq_file *m, void *v)
{
struct inode *inode = m->private;
@@ -667,29 +601,35 @@ static const struct file_operations proc_single_file_operations = {
.release = single_release,
};
-static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
+
+struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
{
- struct task_struct *task = get_proc_task(file_inode(file));
- struct mm_struct *mm;
+ struct task_struct *task = get_proc_task(inode);
+ struct mm_struct *mm = ERR_PTR(-ESRCH);
- if (!task)
- return -ESRCH;
+ if (task) {
+ mm = mm_access(task, mode);
+ put_task_struct(task);
- mm = mm_access(task, mode);
- put_task_struct(task);
+ if (!IS_ERR_OR_NULL(mm)) {
+ /* ensure this mm_struct can't be freed */
+ atomic_inc(&mm->mm_count);
+ /* but do not pin its memory */
+ mmput(mm);
+ }
+ }
+
+ return mm;
+}
+
+static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
+{
+ struct mm_struct *mm = proc_mem_open(inode, mode);
if (IS_ERR(mm))
return PTR_ERR(mm);
- if (mm) {
- /* ensure this mm_struct can't be freed */
- atomic_inc(&mm->mm_count);
- /* but do not pin its memory */
- mmput(mm);
- }
-
file->private_data = mm;
-
return 0;
}
@@ -1625,7 +1565,6 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags)
put_task_struct(task);
return 1;
}
- d_drop(dentry);
return 0;
}
@@ -1762,9 +1701,6 @@ out:
put_task_struct(task);
out_notask:
- if (status <= 0)
- d_drop(dentry);
-
return status;
}
@@ -2056,7 +1992,7 @@ static int show_timer(struct seq_file *m, void *v)
struct k_itimer *timer;
struct timers_private *tp = m->private;
int notify;
- static char *nstr[] = {
+ static const char * const nstr[] = {
[SIGEV_SIGNAL] = "signal",
[SIGEV_NONE] = "none",
[SIGEV_THREAD] = "thread",
@@ -2392,7 +2328,7 @@ static const struct file_operations proc_coredump_filter_operations = {
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
-static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
+static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
{
struct task_io_accounting acct = task->ioac;
unsigned long flags;
@@ -2416,7 +2352,7 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
unlock_task_sighand(task, &flags);
}
- result = sprintf(buffer,
+ result = seq_printf(m,
"rchar: %llu\n"
"wchar: %llu\n"
"syscr: %llu\n"
@@ -2436,20 +2372,22 @@ out_unlock:
return result;
}
-static int proc_tid_io_accounting(struct task_struct *task, char *buffer)
+static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
{
- return do_io_accounting(task, buffer, 0);
+ return do_io_accounting(task, m, 0);
}
-static int proc_tgid_io_accounting(struct task_struct *task, char *buffer)
+static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
{
- return do_io_accounting(task, buffer, 1);
+ return do_io_accounting(task, m, 1);
}
#endif /* CONFIG_TASK_IO_ACCOUNTING */
#ifdef CONFIG_USER_NS
static int proc_id_map_open(struct inode *inode, struct file *file,
- struct seq_operations *seq_ops)
+ const struct seq_operations *seq_ops)
{
struct user_namespace *ns = NULL;
struct task_struct *task;
@@ -2557,10 +2495,10 @@ static const struct pid_entry tgid_base_stuff[] = {
DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
#endif
REG("environ", S_IRUSR, proc_environ_operations),
- INF("auxv", S_IRUSR, proc_pid_auxv),
+ ONE("auxv", S_IRUSR, proc_pid_auxv),
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
- INF("limits", S_IRUGO, proc_pid_limits),
+ ONE("limits", S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
@@ -2569,9 +2507,9 @@ static const struct pid_entry tgid_base_stuff[] = {
#endif
REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
- INF("syscall", S_IRUSR, proc_pid_syscall),
+ ONE("syscall", S_IRUSR, proc_pid_syscall),
#endif
- INF("cmdline", S_IRUGO, proc_pid_cmdline),
+ ONE("cmdline", S_IRUGO, proc_pid_cmdline),
ONE("stat", S_IRUGO, proc_tgid_stat),
ONE("statm", S_IRUGO, proc_pid_statm),
REG("maps", S_IRUGO, proc_pid_maps_operations),
@@ -2594,24 +2532,24 @@ static const struct pid_entry tgid_base_stuff[] = {
DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
#endif
#ifdef CONFIG_KALLSYMS
- INF("wchan", S_IRUGO, proc_pid_wchan),
+ ONE("wchan", S_IRUGO, proc_pid_wchan),
#endif
#ifdef CONFIG_STACKTRACE
ONE("stack", S_IRUSR, proc_pid_stack),
#endif
#ifdef CONFIG_SCHEDSTATS
- INF("schedstat", S_IRUGO, proc_pid_schedstat),
+ ONE("schedstat", S_IRUGO, proc_pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
REG("latency", S_IRUGO, proc_lstats_operations),
#endif
#ifdef CONFIG_PROC_PID_CPUSET
- REG("cpuset", S_IRUGO, proc_cpuset_operations),
+ ONE("cpuset", S_IRUGO, proc_cpuset_show),
#endif
#ifdef CONFIG_CGROUPS
- REG("cgroup", S_IRUGO, proc_cgroup_operations),
+ ONE("cgroup", S_IRUGO, proc_cgroup_show),
#endif
- INF("oom_score", S_IRUGO, proc_oom_score),
+ ONE("oom_score", S_IRUGO, proc_oom_score),
REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDITSYSCALL
@@ -2625,10 +2563,10 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
- INF("io", S_IRUSR, proc_tgid_io_accounting),
+ ONE("io", S_IRUSR, proc_tgid_io_accounting),
#endif
#ifdef CONFIG_HARDWALL
- INF("hardwall", S_IRUGO, proc_pid_hardwall),
+ ONE("hardwall", S_IRUGO, proc_pid_hardwall),
#endif
#ifdef CONFIG_USER_NS
REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
@@ -2676,8 +2614,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
/* no ->d_hash() rejects on procfs */
dentry = d_hash_and_lookup(mnt->mnt_root, &name);
if (dentry) {
- shrink_dcache_parent(dentry);
- d_drop(dentry);
+ d_invalidate(dentry);
dput(dentry);
}
@@ -2697,8 +2634,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
name.len = snprintf(buf, sizeof(buf), "%d", pid);
dentry = d_hash_and_lookup(dir, &name);
if (dentry) {
- shrink_dcache_parent(dentry);
- d_drop(dentry);
+ d_invalidate(dentry);
dput(dentry);
}
@@ -2780,12 +2716,12 @@ out:
struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
{
- int result = 0;
+ int result = -ENOENT;
struct task_struct *task;
unsigned tgid;
struct pid_namespace *ns;
- tgid = name_to_int(dentry);
+ tgid = name_to_int(&dentry->d_name);
if (tgid == ~0U)
goto out;
@@ -2847,7 +2783,7 @@ retry:
return iter;
}
-#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1)
+#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2)
/* for the /proc/ directory itself, after non-process stuff has been done */
int proc_pid_readdir(struct file *file, struct dir_context *ctx)
@@ -2859,14 +2795,19 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx)
if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
return 0;
- if (pos == TGID_OFFSET - 1) {
+ if (pos == TGID_OFFSET - 2) {
struct inode *inode = ns->proc_self->d_inode;
if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
return 0;
- iter.tgid = 0;
- } else {
- iter.tgid = pos - TGID_OFFSET;
+ ctx->pos = pos = pos + 1;
}
+ if (pos == TGID_OFFSET - 1) {
+ struct inode *inode = ns->proc_thread_self->d_inode;
+ if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK))
+ return 0;
+ ctx->pos = pos = pos + 1;
+ }
+ iter.tgid = pos - TGID_OFFSET;
iter.task = NULL;
for (iter = next_tgid(ns, iter);
iter.task;
@@ -2895,19 +2836,22 @@ static const struct pid_entry tid_base_stuff[] = {
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
+#ifdef CONFIG_NET
+ DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
+#endif
REG("environ", S_IRUSR, proc_environ_operations),
- INF("auxv", S_IRUSR, proc_pid_auxv),
+ ONE("auxv", S_IRUSR, proc_pid_auxv),
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
- INF("limits", S_IRUGO, proc_pid_limits),
+ ONE("limits", S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
- INF("syscall", S_IRUSR, proc_pid_syscall),
+ ONE("syscall", S_IRUSR, proc_pid_syscall),
#endif
- INF("cmdline", S_IRUGO, proc_pid_cmdline),
+ ONE("cmdline", S_IRUGO, proc_pid_cmdline),
ONE("stat", S_IRUGO, proc_tid_stat),
ONE("statm", S_IRUGO, proc_pid_statm),
REG("maps", S_IRUGO, proc_tid_maps_operations),
@@ -2932,24 +2876,24 @@ static const struct pid_entry tid_base_stuff[] = {
DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
#endif
#ifdef CONFIG_KALLSYMS
- INF("wchan", S_IRUGO, proc_pid_wchan),
+ ONE("wchan", S_IRUGO, proc_pid_wchan),
#endif
#ifdef CONFIG_STACKTRACE
ONE("stack", S_IRUSR, proc_pid_stack),
#endif
#ifdef CONFIG_SCHEDSTATS
- INF("schedstat", S_IRUGO, proc_pid_schedstat),
+ ONE("schedstat", S_IRUGO, proc_pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
REG("latency", S_IRUGO, proc_lstats_operations),
#endif
#ifdef CONFIG_PROC_PID_CPUSET
- REG("cpuset", S_IRUGO, proc_cpuset_operations),
+ ONE("cpuset", S_IRUGO, proc_cpuset_show),
#endif
#ifdef CONFIG_CGROUPS
- REG("cgroup", S_IRUGO, proc_cgroup_operations),
+ ONE("cgroup", S_IRUGO, proc_cgroup_show),
#endif
- INF("oom_score", S_IRUGO, proc_oom_score),
+ ONE("oom_score", S_IRUGO, proc_oom_score),
REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDITSYSCALL
@@ -2960,10 +2904,10 @@ static const struct pid_entry tid_base_stuff[] = {
REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
- INF("io", S_IRUSR, proc_tid_io_accounting),
+ ONE("io", S_IRUSR, proc_tid_io_accounting),
#endif
#ifdef CONFIG_HARDWALL
- INF("hardwall", S_IRUGO, proc_pid_hardwall),
+ ONE("hardwall", S_IRUGO, proc_pid_hardwall),
#endif
#ifdef CONFIG_USER_NS
REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
@@ -3033,7 +2977,7 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry
if (!leader)
goto out_no_task;
- tid = name_to_int(dentry);
+ tid = name_to_int(&dentry->d_name);
if (tid == ~0U)
goto out;
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 0788d093f5d8..e11d7c590bb0 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -129,8 +129,6 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
}
put_task_struct(task);
}
-
- d_drop(dentry);
return 0;
}
@@ -206,7 +204,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
{
struct task_struct *task = get_proc_task(dir);
int result = -ENOENT;
- unsigned fd = name_to_int(dentry);
+ unsigned fd = name_to_int(&dentry->d_name);
if (!task)
goto out_no_task;
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index b7f268eb5f45..317b72641ebf 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -27,7 +27,7 @@
#include "internal.h"
-DEFINE_SPINLOCK(proc_subdir_lock);
+static DEFINE_SPINLOCK(proc_subdir_lock);
static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de)
{
@@ -330,28 +330,28 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
nlink_t nlink)
{
struct proc_dir_entry *ent = NULL;
- const char *fn = name;
- unsigned int len;
-
- /* make sure name is valid */
- if (!name || !strlen(name))
- goto out;
+ const char *fn;
+ struct qstr qstr;
if (xlate_proc_name(name, parent, &fn) != 0)
goto out;
+ qstr.name = fn;
+ qstr.len = strlen(fn);
+ if (qstr.len == 0 || qstr.len >= 256) {
+ WARN(1, "name len %u\n", qstr.len);
+ return NULL;
+ }
+ if (*parent == &proc_root && name_to_int(&qstr) != ~0U) {
+ WARN(1, "create '/proc/%s' by hand\n", qstr.name);
+ return NULL;
+ }
- /* At this point there must not be any '/' characters beyond *fn */
- if (strchr(fn, '/'))
- goto out;
-
- len = strlen(fn);
-
- ent = kzalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL);
+ ent = kzalloc(sizeof(struct proc_dir_entry) + qstr.len + 1, GFP_KERNEL);
if (!ent)
goto out;
- memcpy(ent->name, fn, len + 1);
- ent->namelen = len;
+ memcpy(ent->name, fn, qstr.len + 1);
+ ent->namelen = qstr.len;
ent->mode = mode;
ent->nlink = nlink;
atomic_set(&ent->count, 1);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 0adbc02d60e3..333080d7a671 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -442,6 +442,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
int proc_fill_super(struct super_block *s)
{
struct inode *root_inode;
+ int ret;
s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
s->s_blocksize = 1024;
@@ -463,5 +464,9 @@ int proc_fill_super(struct super_block *s)
return -ENOMEM;
}
- return proc_setup_self(s);
+ ret = proc_setup_self(s);
+ if (ret) {
+ return ret;
+ }
+ return proc_setup_thread_self(s);
}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 3ab6d14e71c5..aa7a0ee182e1 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -52,7 +52,6 @@ struct proc_dir_entry {
union proc_op {
int (*proc_get_link)(struct dentry *, struct path *);
- int (*proc_read)(struct task_struct *task, char *page);
int (*proc_show)(struct seq_file *m,
struct pid_namespace *ns, struct pid *pid,
struct task_struct *task);
@@ -112,10 +111,10 @@ static inline int task_dumpable(struct task_struct *task)
return 0;
}
-static inline unsigned name_to_int(struct dentry *dentry)
+static inline unsigned name_to_int(const struct qstr *qstr)
{
- const char *name = dentry->d_name.name;
- int len = dentry->d_name.len;
+ const char *name = qstr->name;
+ int len = qstr->len;
unsigned n = 0;
if (len > 1 && *name == '0')
@@ -178,8 +177,6 @@ extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, i
/*
* generic.c
*/
-extern spinlock_t proc_subdir_lock;
-
extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *,
struct dentry *);
@@ -234,6 +231,12 @@ static inline int proc_net_init(void) { return 0; }
extern int proc_setup_self(struct super_block *);
/*
+ * proc_thread_self.c
+ */
+extern int proc_setup_thread_self(struct super_block *);
+extern void proc_thread_self_init(void);
+
+/*
* proc_sysctl.c
*/
#ifdef CONFIG_PROC_SYSCTL
@@ -265,8 +268,9 @@ extern int proc_remount(struct super_block *, int *, char *);
* task_[no]mmu.c
*/
struct proc_maps_private {
- struct pid *pid;
+ struct inode *inode;
struct task_struct *task;
+ struct mm_struct *mm;
#ifdef CONFIG_MMU
struct vm_area_struct *tail_vma;
#endif
@@ -275,6 +279,8 @@ struct proc_maps_private {
#endif
};
+struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode);
+
extern const struct file_operations proc_pid_maps_operations;
extern const struct file_operations proc_tid_maps_operations;
extern const struct file_operations proc_pid_numa_maps_operations;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 39e6ef32f0bd..91a4e6426321 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -172,7 +172,7 @@ get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
start = ((unsigned long)pfn_to_page(pfn)) & PAGE_MASK;
end = ((unsigned long)pfn_to_page(pfn + nr_pages)) - 1;
- end = ALIGN(end, PAGE_SIZE);
+ end = PAGE_ALIGN(end);
/* overlap check (because we have to align page */
list_for_each_entry(tmp, head, list) {
if (tmp->type != KCORE_VMEMMAP)
@@ -610,8 +610,10 @@ static void __init proc_kcore_text_init(void)
struct kcore_list kcore_modules;
static void __init add_modules_range(void)
{
- kclist_add(&kcore_modules, (void *)MODULES_VADDR,
+ if (MODULES_VADDR != VMALLOC_START && MODULES_END != VMALLOC_END) {
+ kclist_add(&kcore_modules, (void *)MODULES_VADDR,
MODULES_END - MODULES_VADDR, KCORE_VMALLOC);
+ }
}
#else
static void __init add_modules_range(void)
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 7445af0b1aa3..aa1eee06420f 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -168,7 +168,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
K(global_page_state(NR_WRITEBACK)),
K(global_page_state(NR_ANON_PAGES)),
K(global_page_state(NR_FILE_MAPPED)),
- K(global_page_state(NR_SHMEM)),
+ K(i.sharedram),
K(global_page_state(NR_SLAB_RECLAIMABLE) +
global_page_state(NR_SLAB_UNRECLAIMABLE)),
K(global_page_state(NR_SLAB_RECLAIMABLE)),
diff --git a/fs/proc/page.c b/fs/proc/page.c
index e647c55275d9..1e3187da1fed 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -133,6 +133,9 @@ u64 stable_page_flags(struct page *page)
if (PageBuddy(page))
u |= 1 << KPF_BUDDY;
+ if (PageBalloon(page))
+ u |= 1 << KPF_BALLOON;
+
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 4677bb7dc7c2..a63af3e0a612 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -113,9 +113,11 @@ static struct net *get_proc_task_net(struct inode *dir)
rcu_read_lock();
task = pid_task(proc_pid(dir), PIDTYPE_PID);
if (task != NULL) {
- ns = task_nsproxy(task);
+ task_lock(task);
+ ns = task->nsproxy;
if (ns != NULL)
net = get_net(ns->net_ns);
+ task_unlock(task);
}
rcu_read_unlock();
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 71290463a1d3..f92d5dd578a4 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -632,7 +632,7 @@ out:
return ret;
}
-static int scan(struct ctl_table_header *head, ctl_table *table,
+static int scan(struct ctl_table_header *head, struct ctl_table *table,
unsigned long *pos, struct file *file,
struct dir_context *ctx)
{
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index cb761f010300..15f327bed8c6 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -18,7 +18,7 @@
/*
* The /proc/tty directory inodes...
*/
-static struct proc_dir_entry *proc_tty_ldisc, *proc_tty_driver;
+static struct proc_dir_entry *proc_tty_driver;
/*
* This is the handler for /proc/tty/drivers
@@ -176,7 +176,7 @@ void __init proc_tty_init(void)
{
if (!proc_mkdir("tty", NULL))
return;
- proc_tty_ldisc = proc_mkdir("tty/ldisc", NULL);
+ proc_mkdir("tty/ldisc", NULL); /* Preserved: it's userspace visible */
/*
* /proc/tty/driver/serial reveals the exact character counts for
* serial links which is just too easy to abuse for inferring
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 5dbadecb234d..094e44d4a6be 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -149,6 +149,8 @@ static void proc_kill_sb(struct super_block *sb)
ns = (struct pid_namespace *)sb->s_fs_info;
if (ns->proc_self)
dput(ns->proc_self);
+ if (ns->proc_thread_self)
+ dput(ns->proc_thread_self);
kill_anon_super(sb);
put_pid_ns(ns);
}
@@ -170,6 +172,7 @@ void __init proc_root_init(void)
return;
proc_self_init();
+ proc_thread_self_init();
proc_symlink("mounts", NULL, "self/mounts");
proc_net_init();
@@ -199,10 +202,10 @@ static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags)
{
- if (!proc_lookup(dir, dentry, flags))
+ if (!proc_pid_lookup(dir, dentry, flags))
return NULL;
- return proc_pid_lookup(dir, dentry, flags);
+ return proc_lookup(dir, dentry, flags);
}
static int proc_root_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index cfa63ee92c96..4e0388cffe3d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -87,32 +87,14 @@ unsigned long task_statm(struct mm_struct *mm,
#ifdef CONFIG_NUMA
/*
- * These functions are for numa_maps but called in generic **maps seq_file
- * ->start(), ->stop() ops.
- *
- * numa_maps scans all vmas under mmap_sem and checks their mempolicy.
- * Each mempolicy object is controlled by reference counting. The problem here
- * is how to avoid accessing dead mempolicy object.
- *
- * Because we're holding mmap_sem while reading seq_file, it's safe to access
- * each vma's mempolicy, no vma objects will never drop refs to mempolicy.
- *
- * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy
- * is set and replaced under mmap_sem but unrefed and cleared under task_lock().
- * So, without task_lock(), we cannot trust get_vma_policy() because we cannot
- * gurantee the task never exits under us. But taking task_lock() around
- * get_vma_plicy() causes lock order problem.
- *
- * To access task->mempolicy without lock, we hold a reference count of an
- * object pointed by task->mempolicy and remember it. This will guarantee
- * that task->mempolicy points to an alive object or NULL in numa_maps accesses.
+ * Save get_task_policy() for show_numa_map().
*/
static void hold_task_mempolicy(struct proc_maps_private *priv)
{
struct task_struct *task = priv->task;
task_lock(task);
- priv->task_mempolicy = task->mempolicy;
+ priv->task_mempolicy = get_task_policy(task);
mpol_get(priv->task_mempolicy);
task_unlock(task);
}
@@ -129,124 +111,154 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
}
#endif
-static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
+static void vma_stop(struct proc_maps_private *priv)
{
- if (vma && vma != priv->tail_vma) {
- struct mm_struct *mm = vma->vm_mm;
- release_task_mempolicy(priv);
- up_read(&mm->mmap_sem);
- mmput(mm);
- }
+ struct mm_struct *mm = priv->mm;
+
+ release_task_mempolicy(priv);
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+}
+
+static struct vm_area_struct *
+m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma)
+{
+ if (vma == priv->tail_vma)
+ return NULL;
+ return vma->vm_next ?: priv->tail_vma;
}
-static void *m_start(struct seq_file *m, loff_t *pos)
+static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma)
+{
+ if (m->count < m->size) /* vma is copied successfully */
+ m->version = m_next_vma(m->private, vma) ? vma->vm_start : -1UL;
+}
+
+static void *m_start(struct seq_file *m, loff_t *ppos)
{
struct proc_maps_private *priv = m->private;
unsigned long last_addr = m->version;
struct mm_struct *mm;
- struct vm_area_struct *vma, *tail_vma = NULL;
- loff_t l = *pos;
-
- /* Clear the per syscall fields in priv */
- priv->task = NULL;
- priv->tail_vma = NULL;
-
- /*
- * We remember last_addr rather than next_addr to hit with
- * vmacache most of the time. We have zero last_addr at
- * the beginning and also after lseek. We will have -1 last_addr
- * after the end of the vmas.
- */
+ struct vm_area_struct *vma;
+ unsigned int pos = *ppos;
+ /* See m_cache_vma(). Zero at the start or after lseek. */
if (last_addr == -1UL)
return NULL;
- priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
+ priv->task = get_proc_task(priv->inode);
if (!priv->task)
return ERR_PTR(-ESRCH);
- mm = mm_access(priv->task, PTRACE_MODE_READ);
- if (!mm || IS_ERR(mm))
- return mm;
- down_read(&mm->mmap_sem);
+ mm = priv->mm;
+ if (!mm || !atomic_inc_not_zero(&mm->mm_users))
+ return NULL;
- tail_vma = get_gate_vma(priv->task->mm);
- priv->tail_vma = tail_vma;
+ down_read(&mm->mmap_sem);
hold_task_mempolicy(priv);
- /* Start with last addr hint */
- vma = find_vma(mm, last_addr);
- if (last_addr && vma) {
- vma = vma->vm_next;
- goto out;
+ priv->tail_vma = get_gate_vma(mm);
+
+ if (last_addr) {
+ vma = find_vma(mm, last_addr);
+ if (vma && (vma = m_next_vma(priv, vma)))
+ return vma;
}
- /*
- * Check the vma index is within the range and do
- * sequential scan until m_index.
- */
- vma = NULL;
- if ((unsigned long)l < mm->map_count) {
- vma = mm->mmap;
- while (l-- && vma)
+ m->version = 0;
+ if (pos < mm->map_count) {
+ for (vma = mm->mmap; pos; pos--) {
+ m->version = vma->vm_start;
vma = vma->vm_next;
- goto out;
+ }
+ return vma;
}
- if (l != mm->map_count)
- tail_vma = NULL; /* After gate vma */
-
-out:
- if (vma)
- return vma;
+ /* we do not bother to update m->version in this case */
+ if (pos == mm->map_count && priv->tail_vma)
+ return priv->tail_vma;
- release_task_mempolicy(priv);
- /* End of vmas has been reached */
- m->version = (tail_vma != NULL)? 0: -1UL;
- up_read(&mm->mmap_sem);
- mmput(mm);
- return tail_vma;
+ vma_stop(priv);
+ return NULL;
}
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
struct proc_maps_private *priv = m->private;
- struct vm_area_struct *vma = v;
- struct vm_area_struct *tail_vma = priv->tail_vma;
+ struct vm_area_struct *next;
(*pos)++;
- if (vma && (vma != tail_vma) && vma->vm_next)
- return vma->vm_next;
- vma_stop(priv, vma);
- return (vma != tail_vma)? tail_vma: NULL;
+ next = m_next_vma(priv, v);
+ if (!next)
+ vma_stop(priv);
+ return next;
}
static void m_stop(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
- struct vm_area_struct *vma = v;
- if (!IS_ERR(vma))
- vma_stop(priv, vma);
- if (priv->task)
+ if (!IS_ERR_OR_NULL(v))
+ vma_stop(priv);
+ if (priv->task) {
put_task_struct(priv->task);
+ priv->task = NULL;
+ }
+}
+
+static int proc_maps_open(struct inode *inode, struct file *file,
+ const struct seq_operations *ops, int psize)
+{
+ struct proc_maps_private *priv = __seq_open_private(file, ops, psize);
+
+ if (!priv)
+ return -ENOMEM;
+
+ priv->inode = inode;
+ priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR(priv->mm)) {
+ int err = PTR_ERR(priv->mm);
+
+ seq_release_private(inode, file);
+ return err;
+ }
+
+ return 0;
+}
+
+static int proc_map_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct proc_maps_private *priv = seq->private;
+
+ if (priv->mm)
+ mmdrop(priv->mm);
+
+ return seq_release_private(inode, file);
}
static int do_maps_open(struct inode *inode, struct file *file,
const struct seq_operations *ops)
{
- struct proc_maps_private *priv;
- int ret = -ENOMEM;
- priv = kzalloc(sizeof(*priv), GFP_KERNEL);
- if (priv) {
- priv->pid = proc_pid(inode);
- ret = seq_open(file, ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = priv;
- } else {
- kfree(priv);
- }
+ return proc_maps_open(inode, file, ops,
+ sizeof(struct proc_maps_private));
+}
+
+static pid_t pid_of_stack(struct proc_maps_private *priv,
+ struct vm_area_struct *vma, bool is_pid)
+{
+ struct inode *inode = priv->inode;
+ struct task_struct *task;
+ pid_t ret = 0;
+
+ rcu_read_lock();
+ task = pid_task(proc_pid(inode), PIDTYPE_PID);
+ if (task) {
+ task = task_of_stack(task, vma, is_pid);
+ if (task)
+ ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
}
+ rcu_read_unlock();
+
return ret;
}
@@ -256,7 +268,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
struct proc_maps_private *priv = m->private;
- struct task_struct *task = priv->task;
vm_flags_t flags = vma->vm_flags;
unsigned long ino = 0;
unsigned long long pgoff = 0;
@@ -321,8 +332,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
goto done;
}
- tid = vm_is_stack(task, vma, is_pid);
-
+ tid = pid_of_stack(priv, vma, is_pid);
if (tid != 0) {
/*
* Thread stack in /proc/PID/task/TID/maps or
@@ -349,15 +359,8 @@ done:
static int show_map(struct seq_file *m, void *v, int is_pid)
{
- struct vm_area_struct *vma = v;
- struct proc_maps_private *priv = m->private;
- struct task_struct *task = priv->task;
-
- show_map_vma(m, vma, is_pid);
-
- if (m->count < m->size) /* vma is copied successfully */
- m->version = (vma != get_gate_vma(task->mm))
- ? vma->vm_start : 0;
+ show_map_vma(m, v, is_pid);
+ m_cache_vma(m, v);
return 0;
}
@@ -399,14 +402,14 @@ const struct file_operations proc_pid_maps_operations = {
.open = pid_maps_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = proc_map_release,
};
const struct file_operations proc_tid_maps_operations = {
.open = tid_maps_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = proc_map_release,
};
/*
@@ -583,8 +586,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
static int show_smap(struct seq_file *m, void *v, int is_pid)
{
- struct proc_maps_private *priv = m->private;
- struct task_struct *task = priv->task;
struct vm_area_struct *vma = v;
struct mem_size_stats mss;
struct mm_walk smaps_walk = {
@@ -637,10 +638,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
mss.nonlinear >> 10);
show_smap_vma_flags(m, vma);
-
- if (m->count < m->size) /* vma is copied successfully */
- m->version = (vma != get_gate_vma(task->mm))
- ? vma->vm_start : 0;
+ m_cache_vma(m, vma);
return 0;
}
@@ -682,14 +680,14 @@ const struct file_operations proc_pid_smaps_operations = {
.open = pid_smaps_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = proc_map_release,
};
const struct file_operations proc_tid_smaps_operations = {
.open = tid_smaps_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = proc_map_release,
};
/*
@@ -829,8 +827,21 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
.private = &cp,
};
down_read(&mm->mmap_sem);
- if (type == CLEAR_REFS_SOFT_DIRTY)
+ if (type == CLEAR_REFS_SOFT_DIRTY) {
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (!(vma->vm_flags & VM_SOFTDIRTY))
+ continue;
+ up_read(&mm->mmap_sem);
+ down_write(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ vma->vm_flags &= ~VM_SOFTDIRTY;
+ vma_set_page_prot(vma);
+ }
+ downgrade_write(&mm->mmap_sem);
+ break;
+ }
mmu_notifier_invalidate_range_start(mm, 0, -1);
+ }
for (vma = mm->mmap; vma; vma = vma->vm_next) {
cp.vma = vma;
if (is_vm_hugetlb_page(vma))
@@ -850,10 +861,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
continue;
if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
continue;
- if (type == CLEAR_REFS_SOFT_DIRTY) {
- if (vma->vm_flags & VM_SOFTDIRTY)
- vma->vm_flags &= ~VM_SOFTDIRTY;
- }
walk_page_range(vma->vm_start, vma->vm_end,
&clear_refs_walk);
}
@@ -925,15 +932,39 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
struct pagemapread *pm = walk->private;
- unsigned long addr;
+ unsigned long addr = start;
int err = 0;
- pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
- for (addr = start; addr < end; addr += PAGE_SIZE) {
- err = add_to_pagemap(addr, &pme, pm);
- if (err)
+ while (addr < end) {
+ struct vm_area_struct *vma = find_vma(walk->mm, addr);
+ pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
+ /* End of address space hole, which we mark as non-present. */
+ unsigned long hole_end;
+
+ if (vma)
+ hole_end = min(end, vma->vm_start);
+ else
+ hole_end = end;
+
+ for (; addr < hole_end; addr += PAGE_SIZE) {
+ err = add_to_pagemap(addr, &pme, pm);
+ if (err)
+ goto out;
+ }
+
+ if (!vma)
break;
+
+ /* Addresses in the VMA. */
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY);
+ for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
+ err = add_to_pagemap(addr, &pme, pm);
+ if (err)
+ goto out;
+ }
}
+out:
return err;
}
@@ -1005,7 +1036,6 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
spinlock_t *ptl;
pte_t *pte;
int err = 0;
- pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
/* find the first VMA at or above 'addr' */
vma = find_vma(walk->mm, addr);
@@ -1019,6 +1049,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
for (; addr != end; addr += PAGE_SIZE) {
unsigned long offset;
+ pagemap_entry_t pme;
offset = (addr & ~PAGEMAP_WALK_MASK) >>
PAGE_SHIFT;
@@ -1033,32 +1064,51 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (pmd_trans_unstable(pmd))
return 0;
- for (; addr != end; addr += PAGE_SIZE) {
- int flags2;
-
- /* check to see if we've left 'vma' behind
- * and need a new, higher one */
- if (vma && (addr >= vma->vm_end)) {
- vma = find_vma(walk->mm, addr);
- if (vma && (vma->vm_flags & VM_SOFTDIRTY))
- flags2 = __PM_SOFT_DIRTY;
- else
- flags2 = 0;
- pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
+
+ while (1) {
+ /* End of address space hole, which we mark as non-present. */
+ unsigned long hole_end;
+
+ if (vma)
+ hole_end = min(end, vma->vm_start);
+ else
+ hole_end = end;
+
+ for (; addr < hole_end; addr += PAGE_SIZE) {
+ pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
+
+ err = add_to_pagemap(addr, &pme, pm);
+ if (err)
+ return err;
}
- /* check that 'vma' actually covers this address,
- * and that it isn't a huge page vma */
- if (vma && (vma->vm_start <= addr) &&
- !is_vm_hugetlb_page(vma)) {
+ if (!vma || vma->vm_start >= end)
+ break;
+ /*
+ * We can't possibly be in a hugetlb VMA. In general,
+ * for a mm_walk with a pmd_entry and a hugetlb_entry,
+ * the pmd_entry can only be called on addresses in a
+ * hugetlb if the walk starts in a non-hugetlb VMA and
+ * spans a hugepage VMA. Since pagemap_read walks are
+ * PMD-sized and PMD-aligned, this will never be true.
+ */
+ BUG_ON(is_vm_hugetlb_page(vma));
+
+ /* Addresses in the VMA. */
+ for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
+ pagemap_entry_t pme;
pte = pte_offset_map(pmd, addr);
pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
- /* unmap before userspace copy */
pte_unmap(pte);
+ err = add_to_pagemap(addr, &pme, pm);
+ if (err)
+ return err;
}
- err = add_to_pagemap(addr, &pme, pm);
- if (err)
- return err;
+
+ if (addr == end)
+ break;
+
+ vma = find_vma(walk->mm, addr);
}
cond_resched();
@@ -1391,7 +1441,6 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
struct vm_area_struct *vma = v;
struct numa_maps *md = &numa_priv->md;
struct file *file = vma->vm_file;
- struct task_struct *task = proc_priv->task;
struct mm_struct *mm = vma->vm_mm;
struct mm_walk walk = {};
struct mempolicy *pol;
@@ -1411,9 +1460,13 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
walk.private = md;
walk.mm = mm;
- pol = get_vma_policy(task, vma, vma->vm_start);
- mpol_to_str(buffer, sizeof(buffer), pol);
- mpol_cond_put(pol);
+ pol = __get_vma_policy(vma, vma->vm_start);
+ if (pol) {
+ mpol_to_str(buffer, sizeof(buffer), pol);
+ mpol_cond_put(pol);
+ } else {
+ mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy);
+ }
seq_printf(m, "%08lx %s", vma->vm_start, buffer);
@@ -1423,7 +1476,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
seq_puts(m, " heap");
} else {
- pid_t tid = vm_is_stack(task, vma, is_pid);
+ pid_t tid = pid_of_stack(proc_priv, vma, is_pid);
if (tid != 0) {
/*
* Thread stack in /proc/PID/task/TID/maps or
@@ -1471,9 +1524,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
seq_printf(m, " N%d=%lu", nid, md->node[nid]);
out:
seq_putc(m, '\n');
-
- if (m->count < m->size)
- m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0;
+ m_cache_vma(m, vma);
return 0;
}
@@ -1504,20 +1555,8 @@ static const struct seq_operations proc_tid_numa_maps_op = {
static int numa_maps_open(struct inode *inode, struct file *file,
const struct seq_operations *ops)
{
- struct numa_maps_private *priv;
- int ret = -ENOMEM;
- priv = kzalloc(sizeof(*priv), GFP_KERNEL);
- if (priv) {
- priv->proc_maps.pid = proc_pid(inode);
- ret = seq_open(file, ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = priv;
- } else {
- kfree(priv);
- }
- }
- return ret;
+ return proc_maps_open(inode, file, ops,
+ sizeof(struct numa_maps_private));
}
static int pid_numa_maps_open(struct inode *inode, struct file *file)
@@ -1534,13 +1573,13 @@ const struct file_operations proc_pid_numa_maps_operations = {
.open = pid_numa_maps_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = proc_map_release,
};
const struct file_operations proc_tid_numa_maps_operations = {
.open = tid_numa_maps_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = proc_map_release,
};
#endif /* CONFIG_NUMA */
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 678455d2d683..599ec2e20104 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -123,6 +123,25 @@ unsigned long task_statm(struct mm_struct *mm,
return size;
}
+static pid_t pid_of_stack(struct proc_maps_private *priv,
+ struct vm_area_struct *vma, bool is_pid)
+{
+ struct inode *inode = priv->inode;
+ struct task_struct *task;
+ pid_t ret = 0;
+
+ rcu_read_lock();
+ task = pid_task(proc_pid(inode), PIDTYPE_PID);
+ if (task) {
+ task = task_of_stack(task, vma, is_pid);
+ if (task)
+ ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
/*
* display a single VMA to a sequenced file
*/
@@ -163,7 +182,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
seq_pad(m, ' ');
seq_path(m, &file->f_path, "");
} else if (mm) {
- pid_t tid = vm_is_stack(priv->task, vma, is_pid);
+ pid_t tid = pid_of_stack(priv, vma, is_pid);
if (tid != 0) {
seq_pad(m, ' ');
@@ -212,22 +231,22 @@ static void *m_start(struct seq_file *m, loff_t *pos)
loff_t n = *pos;
/* pin the task and mm whilst we play with them */
- priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
+ priv->task = get_proc_task(priv->inode);
if (!priv->task)
return ERR_PTR(-ESRCH);
- mm = mm_access(priv->task, PTRACE_MODE_READ);
- if (!mm || IS_ERR(mm)) {
- put_task_struct(priv->task);
- priv->task = NULL;
- return mm;
- }
- down_read(&mm->mmap_sem);
+ mm = priv->mm;
+ if (!mm || !atomic_inc_not_zero(&mm->mm_users))
+ return NULL;
+ down_read(&mm->mmap_sem);
/* start from the Nth VMA */
for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
if (n-- == 0)
return p;
+
+ up_read(&mm->mmap_sem);
+ mmput(mm);
return NULL;
}
@@ -235,11 +254,13 @@ static void m_stop(struct seq_file *m, void *_vml)
{
struct proc_maps_private *priv = m->private;
+ if (!IS_ERR_OR_NULL(_vml)) {
+ up_read(&priv->mm->mmap_sem);
+ mmput(priv->mm);
+ }
if (priv->task) {
- struct mm_struct *mm = priv->task->mm;
- up_read(&mm->mmap_sem);
- mmput(mm);
put_task_struct(priv->task);
+ priv->task = NULL;
}
}
@@ -269,20 +290,33 @@ static int maps_open(struct inode *inode, struct file *file,
const struct seq_operations *ops)
{
struct proc_maps_private *priv;
- int ret = -ENOMEM;
-
- priv = kzalloc(sizeof(*priv), GFP_KERNEL);
- if (priv) {
- priv->pid = proc_pid(inode);
- ret = seq_open(file, ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = priv;
- } else {
- kfree(priv);
- }
+
+ priv = __seq_open_private(file, ops, sizeof(*priv));
+ if (!priv)
+ return -ENOMEM;
+
+ priv->inode = inode;
+ priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR(priv->mm)) {
+ int err = PTR_ERR(priv->mm);
+
+ seq_release_private(inode, file);
+ return err;
}
- return ret;
+
+ return 0;
+}
+
+
+static int map_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct proc_maps_private *priv = seq->private;
+
+ if (priv->mm)
+ mmdrop(priv->mm);
+
+ return seq_release_private(inode, file);
}
static int pid_maps_open(struct inode *inode, struct file *file)
@@ -299,13 +333,13 @@ const struct file_operations proc_pid_maps_operations = {
.open = pid_maps_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = map_release,
};
const struct file_operations proc_tid_maps_operations = {
.open = tid_maps_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = map_release,
};
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
new file mode 100644
index 000000000000..59075b509df3
--- /dev/null
+++ b/fs/proc/thread_self.c
@@ -0,0 +1,85 @@
+#include <linux/sched.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+#include <linux/pid_namespace.h>
+#include "internal.h"
+
+/*
+ * /proc/thread_self:
+ */
+static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer,
+ int buflen)
+{
+ struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+ pid_t tgid = task_tgid_nr_ns(current, ns);
+ pid_t pid = task_pid_nr_ns(current, ns);
+ char tmp[PROC_NUMBUF + 6 + PROC_NUMBUF];
+ if (!pid)
+ return -ENOENT;
+ sprintf(tmp, "%d/task/%d", tgid, pid);
+ return readlink_copy(buffer, buflen, tmp);
+}
+
+static void *proc_thread_self_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+ pid_t tgid = task_tgid_nr_ns(current, ns);
+ pid_t pid = task_pid_nr_ns(current, ns);
+ char *name = ERR_PTR(-ENOENT);
+ if (pid) {
+ name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL);
+ if (!name)
+ name = ERR_PTR(-ENOMEM);
+ else
+ sprintf(name, "%d/task/%d", tgid, pid);
+ }
+ nd_set_link(nd, name);
+ return NULL;
+}
+
+static const struct inode_operations proc_thread_self_inode_operations = {
+ .readlink = proc_thread_self_readlink,
+ .follow_link = proc_thread_self_follow_link,
+ .put_link = kfree_put_link,
+};
+
+static unsigned thread_self_inum;
+
+int proc_setup_thread_self(struct super_block *s)
+{
+ struct inode *root_inode = s->s_root->d_inode;
+ struct pid_namespace *ns = s->s_fs_info;
+ struct dentry *thread_self;
+
+ mutex_lock(&root_inode->i_mutex);
+ thread_self = d_alloc_name(s->s_root, "thread-self");
+ if (thread_self) {
+ struct inode *inode = new_inode_pseudo(s);
+ if (inode) {
+ inode->i_ino = thread_self_inum;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mode = S_IFLNK | S_IRWXUGO;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+ inode->i_op = &proc_thread_self_inode_operations;
+ d_add(thread_self, inode);
+ } else {
+ dput(thread_self);
+ thread_self = ERR_PTR(-ENOMEM);
+ }
+ } else {
+ thread_self = ERR_PTR(-ENOMEM);
+ }
+ mutex_unlock(&root_inode->i_mutex);
+ if (IS_ERR(thread_self)) {
+ pr_err("proc_fill_super: can't allocate /proc/thread_self\n");
+ return PTR_ERR(thread_self);
+ }
+ ns->proc_thread_self = thread_self;
+ return 0;
+}
+
+void __init proc_thread_self_init(void)
+{
+ proc_alloc_inum(&thread_self_inum);
+}
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 382aa890e228..a90d6d354199 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -328,6 +328,82 @@ static inline char *alloc_elfnotes_buf(size_t notes_sz)
* virtually contiguous user-space in ELF layout.
*/
#ifdef CONFIG_MMU
+/*
+ * remap_oldmem_pfn_checked - do remap_oldmem_pfn_range replacing all pages
+ * reported as not being ram with the zero page.
+ *
+ * @vma: vm_area_struct describing requested mapping
+ * @from: start remapping from
+ * @pfn: page frame number to start remapping to
+ * @size: remapping size
+ * @prot: protection bits
+ *
+ * Returns zero on success, -EAGAIN on failure.
+ */
+static int remap_oldmem_pfn_checked(struct vm_area_struct *vma,
+ unsigned long from, unsigned long pfn,
+ unsigned long size, pgprot_t prot)
+{
+ unsigned long map_size;
+ unsigned long pos_start, pos_end, pos;
+ unsigned long zeropage_pfn = my_zero_pfn(0);
+ size_t len = 0;
+
+ pos_start = pfn;
+ pos_end = pfn + (size >> PAGE_SHIFT);
+
+ for (pos = pos_start; pos < pos_end; ++pos) {
+ if (!pfn_is_ram(pos)) {
+ /*
+ * We hit a page which is not ram. Remap the continuous
+ * region between pos_start and pos-1 and replace
+ * the non-ram page at pos with the zero page.
+ */
+ if (pos > pos_start) {
+ /* Remap continuous region */
+ map_size = (pos - pos_start) << PAGE_SHIFT;
+ if (remap_oldmem_pfn_range(vma, from + len,
+ pos_start, map_size,
+ prot))
+ goto fail;
+ len += map_size;
+ }
+ /* Remap the zero page */
+ if (remap_oldmem_pfn_range(vma, from + len,
+ zeropage_pfn,
+ PAGE_SIZE, prot))
+ goto fail;
+ len += PAGE_SIZE;
+ pos_start = pos + 1;
+ }
+ }
+ if (pos > pos_start) {
+ /* Remap the rest */
+ map_size = (pos - pos_start) << PAGE_SHIFT;
+ if (remap_oldmem_pfn_range(vma, from + len, pos_start,
+ map_size, prot))
+ goto fail;
+ }
+ return 0;
+fail:
+ do_munmap(vma->vm_mm, from, len);
+ return -EAGAIN;
+}
+
+static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma,
+ unsigned long from, unsigned long pfn,
+ unsigned long size, pgprot_t prot)
+{
+ /*
+ * Check if oldmem_pfn_is_ram was registered to avoid
+ * looping over all pages without a reason.
+ */
+ if (oldmem_pfn_is_ram)
+ return remap_oldmem_pfn_checked(vma, from, pfn, size, prot);
+ else
+ return remap_oldmem_pfn_range(vma, from, pfn, size, prot);
+}
+
static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
{
size_t size = vma->vm_end - vma->vm_start;
@@ -387,9 +463,9 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
tsz = min_t(size_t, m->offset + m->size - start, size);
paddr = m->paddr + start - m->offset;
- if (remap_oldmem_pfn_range(vma, vma->vm_start + len,
- paddr >> PAGE_SHIFT, tsz,
- vma->vm_page_prot))
+ if (vmcore_remap_oldmem_pfn(vma, vma->vm_start + len,
+ paddr >> PAGE_SHIFT, tsz,
+ vma->vm_page_prot))
goto fail;
size -= tsz;
start += tsz;
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 1a81373947f3..73ca1740d839 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -232,17 +232,15 @@ static int mounts_open_common(struct inode *inode, struct file *file,
if (!task)
goto err;
- rcu_read_lock();
- nsp = task_nsproxy(task);
+ task_lock(task);
+ nsp = task->nsproxy;
if (!nsp || !nsp->mnt_ns) {
- rcu_read_unlock();
+ task_unlock(task);
put_task_struct(task);
goto err;
}
ns = nsp->mnt_ns;
get_mnt_ns(ns);
- rcu_read_unlock();
- task_lock(task);
if (!task->fs) {
task_unlock(task);
put_task_struct(task);
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 192297b0090d..fafb7a02a5d6 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -320,10 +320,10 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
compressed ? ".enc.z" : "");
break;
case PSTORE_TYPE_CONSOLE:
- sprintf(name, "console-%s", psname);
+ sprintf(name, "console-%s-%lld", psname, id);
break;
case PSTORE_TYPE_FTRACE:
- sprintf(name, "ftrace-%s", psname);
+ sprintf(name, "ftrace-%s-%lld", psname, id);
break;
case PSTORE_TYPE_MCE:
sprintf(name, "mce-%s-%lld", psname, id);
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 34a1e5aa848c..9d7b9a83699e 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -394,7 +394,7 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size)
prot = pgprot_noncached(PAGE_KERNEL);
- pages = kmalloc(sizeof(struct page *) * page_count, GFP_KERNEL);
+ pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL);
if (!pages) {
pr_err("%s: Failed to allocate array for %u pages\n",
__func__, page_count);
diff --git a/fs/qnx6/Makefile b/fs/qnx6/Makefile
index 9dd06199afc9..5e6bae6fae50 100644
--- a/fs/qnx6/Makefile
+++ b/fs/qnx6/Makefile
@@ -5,3 +5,4 @@
obj-$(CONFIG_QNX6FS_FS) += qnx6.o
qnx6-objs := inode.o dir.o namei.o super_mmi.o
+ccflags-$(CONFIG_QNX6FS_DEBUG) += -DDEBUG
diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c
index 15b7d92ed60d..8d64bb5366bf 100644
--- a/fs/qnx6/dir.c
+++ b/fs/qnx6/dir.c
@@ -77,21 +77,20 @@ static int qnx6_dir_longfilename(struct inode *inode,
if (de->de_size != 0xff) {
/* error - long filename entries always have size 0xff
in direntry */
- printk(KERN_ERR "qnx6: invalid direntry size (%i).\n",
- de->de_size);
+ pr_err("invalid direntry size (%i).\n", de->de_size);
return 0;
}
lf = qnx6_longname(s, de, &page);
if (IS_ERR(lf)) {
- printk(KERN_ERR "qnx6:Error reading longname\n");
+ pr_err("Error reading longname\n");
return 0;
}
lf_size = fs16_to_cpu(sbi, lf->lf_size);
if (lf_size > QNX6_LONG_NAME_MAX) {
- QNX6DEBUG((KERN_INFO "file %s\n", lf->lf_fname));
- printk(KERN_ERR "qnx6:Filename too long (%i)\n", lf_size);
+ pr_debug("file %s\n", lf->lf_fname);
+ pr_err("Filename too long (%i)\n", lf_size);
qnx6_put_page(page);
return 0;
}
@@ -100,10 +99,10 @@ static int qnx6_dir_longfilename(struct inode *inode,
mmi 3g filesystem does not have that checksum */
if (!test_opt(s, MMI_FS) && fs32_to_cpu(sbi, de->de_checksum) !=
qnx6_lfile_checksum(lf->lf_fname, lf_size))
- printk(KERN_INFO "qnx6: long filename checksum error.\n");
+ pr_info("long filename checksum error.\n");
- QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s inode:%u\n",
- lf_size, lf->lf_fname, de_inode));
+ pr_debug("qnx6_readdir:%.*s inode:%u\n",
+ lf_size, lf->lf_fname, de_inode);
if (!dir_emit(ctx, lf->lf_fname, lf_size, de_inode, DT_UNKNOWN)) {
qnx6_put_page(page);
return 0;
@@ -136,7 +135,7 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx)
int i = start;
if (IS_ERR(page)) {
- printk(KERN_ERR "qnx6_readdir: read failed\n");
+ pr_err("%s(): read failed\n", __func__);
ctx->pos = (n + 1) << PAGE_CACHE_SHIFT;
return PTR_ERR(page);
}
@@ -159,9 +158,9 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx)
break;
}
} else {
- QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s"
- " inode:%u\n", size, de->de_fname,
- no_inode));
+ pr_debug("%s():%.*s inode:%u\n",
+ __func__, size, de->de_fname,
+ no_inode);
if (!dir_emit(ctx, de->de_fname, size,
no_inode, DT_UNKNOWN)) {
done = true;
@@ -259,8 +258,7 @@ unsigned qnx6_find_entry(int len, struct inode *dir, const char *name,
if (ino)
goto found;
} else
- printk(KERN_ERR "qnx6: undefined "
- "filename size in inode.\n");
+ pr_err("undefined filename size in inode.\n");
}
qnx6_put_page(page);
}
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 65cdaab3ed49..44e73923670d 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -73,8 +73,8 @@ static int qnx6_get_block(struct inode *inode, sector_t iblock,
{
unsigned phys;
- QNX6DEBUG((KERN_INFO "qnx6: qnx6_get_block inode=[%ld] iblock=[%ld]\n",
- inode->i_ino, (unsigned long)iblock));
+ pr_debug("qnx6_get_block inode=[%ld] iblock=[%ld]\n",
+ inode->i_ino, (unsigned long)iblock);
phys = qnx6_block_map(inode, iblock);
if (phys) {
@@ -87,7 +87,7 @@ static int qnx6_get_block(struct inode *inode, sector_t iblock,
static int qnx6_check_blockptr(__fs32 ptr)
{
if (ptr == ~(__fs32)0) {
- printk(KERN_ERR "qnx6: hit unused blockpointer.\n");
+ pr_err("hit unused blockpointer.\n");
return 0;
}
return 1;
@@ -127,8 +127,7 @@ static unsigned qnx6_block_map(struct inode *inode, unsigned no)
levelptr = no >> bitdelta;
if (levelptr > QNX6_NO_DIRECT_POINTERS - 1) {
- printk(KERN_ERR "qnx6:Requested file block number (%u) too big.",
- no);
+ pr_err("Requested file block number (%u) too big.", no);
return 0;
}
@@ -137,8 +136,7 @@ static unsigned qnx6_block_map(struct inode *inode, unsigned no)
for (i = 0; i < depth; i++) {
bh = sb_bread(s, block);
if (!bh) {
- printk(KERN_ERR "qnx6:Error reading block (%u)\n",
- block);
+ pr_err("Error reading block (%u)\n", block);
return 0;
}
bitdelta -= ptrbits;
@@ -207,26 +205,16 @@ void qnx6_superblock_debug(struct qnx6_super_block *sb, struct super_block *s)
{
struct qnx6_sb_info *sbi = QNX6_SB(s);
- QNX6DEBUG((KERN_INFO "magic: %08x\n",
- fs32_to_cpu(sbi, sb->sb_magic)));
- QNX6DEBUG((KERN_INFO "checksum: %08x\n",
- fs32_to_cpu(sbi, sb->sb_checksum)));
- QNX6DEBUG((KERN_INFO "serial: %llx\n",
- fs64_to_cpu(sbi, sb->sb_serial)));
- QNX6DEBUG((KERN_INFO "flags: %08x\n",
- fs32_to_cpu(sbi, sb->sb_flags)));
- QNX6DEBUG((KERN_INFO "blocksize: %08x\n",
- fs32_to_cpu(sbi, sb->sb_blocksize)));
- QNX6DEBUG((KERN_INFO "num_inodes: %08x\n",
- fs32_to_cpu(sbi, sb->sb_num_inodes)));
- QNX6DEBUG((KERN_INFO "free_inodes: %08x\n",
- fs32_to_cpu(sbi, sb->sb_free_inodes)));
- QNX6DEBUG((KERN_INFO "num_blocks: %08x\n",
- fs32_to_cpu(sbi, sb->sb_num_blocks)));
- QNX6DEBUG((KERN_INFO "free_blocks: %08x\n",
- fs32_to_cpu(sbi, sb->sb_free_blocks)));
- QNX6DEBUG((KERN_INFO "inode_levels: %02x\n",
- sb->Inode.levels));
+ pr_debug("magic: %08x\n", fs32_to_cpu(sbi, sb->sb_magic));
+ pr_debug("checksum: %08x\n", fs32_to_cpu(sbi, sb->sb_checksum));
+ pr_debug("serial: %llx\n", fs64_to_cpu(sbi, sb->sb_serial));
+ pr_debug("flags: %08x\n", fs32_to_cpu(sbi, sb->sb_flags));
+ pr_debug("blocksize: %08x\n", fs32_to_cpu(sbi, sb->sb_blocksize));
+ pr_debug("num_inodes: %08x\n", fs32_to_cpu(sbi, sb->sb_num_inodes));
+ pr_debug("free_inodes: %08x\n", fs32_to_cpu(sbi, sb->sb_free_inodes));
+ pr_debug("num_blocks: %08x\n", fs32_to_cpu(sbi, sb->sb_num_blocks));
+ pr_debug("free_blocks: %08x\n", fs32_to_cpu(sbi, sb->sb_free_blocks));
+ pr_debug("inode_levels: %02x\n", sb->Inode.levels);
}
#endif
@@ -277,7 +265,7 @@ static struct buffer_head *qnx6_check_first_superblock(struct super_block *s,
start with the first superblock */
bh = sb_bread(s, offset);
if (!bh) {
- printk(KERN_ERR "qnx6: unable to read the first superblock\n");
+ pr_err("unable to read the first superblock\n");
return NULL;
}
sb = (struct qnx6_super_block *)bh->b_data;
@@ -285,20 +273,16 @@ static struct buffer_head *qnx6_check_first_superblock(struct super_block *s,
sbi->s_bytesex = BYTESEX_BE;
if (fs32_to_cpu(sbi, sb->sb_magic) == QNX6_SUPER_MAGIC) {
/* we got a big endian fs */
- QNX6DEBUG((KERN_INFO "qnx6: fs got different"
- " endianness.\n"));
+ pr_debug("fs got different endianness.\n");
return bh;
} else
sbi->s_bytesex = BYTESEX_LE;
if (!silent) {
if (offset == 0) {
- printk(KERN_ERR "qnx6: wrong signature (magic)"
- " in superblock #1.\n");
+ pr_err("wrong signature (magic) in superblock #1.\n");
} else {
- printk(KERN_INFO "qnx6: wrong signature (magic)"
- " at position (0x%lx) - will try"
- " alternative position (0x0000).\n",
- offset * s->s_blocksize);
+ pr_info("wrong signature (magic) at position (0x%lx) - will try alternative position (0x0000).\n",
+ offset * s->s_blocksize);
}
}
brelse(bh);
@@ -329,13 +313,13 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent)
/* Superblock always is 512 Byte long */
if (!sb_set_blocksize(s, QNX6_SUPERBLOCK_SIZE)) {
- printk(KERN_ERR "qnx6: unable to set blocksize\n");
+ pr_err("unable to set blocksize\n");
goto outnobh;
}
/* parse the mount-options */
if (!qnx6_parse_options((char *) data, s)) {
- printk(KERN_ERR "qnx6: invalid mount options.\n");
+ pr_err("invalid mount options.\n");
goto outnobh;
}
if (test_opt(s, MMI_FS)) {
@@ -355,7 +339,7 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent)
/* try again without bootblock offset */
bh1 = qnx6_check_first_superblock(s, 0, silent);
if (!bh1) {
- printk(KERN_ERR "qnx6: unable to read the first superblock\n");
+ pr_err("unable to read the first superblock\n");
goto outnobh;
}
/* seems that no bootblock at partition start */
@@ -370,13 +354,13 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent)
/* checksum check - start at byte 8 and end at byte 512 */
if (fs32_to_cpu(sbi, sb1->sb_checksum) !=
crc32_be(0, (char *)(bh1->b_data + 8), 504)) {
- printk(KERN_ERR "qnx6: superblock #1 checksum error\n");
+ pr_err("superblock #1 checksum error\n");
goto out;
}
/* set new blocksize */
if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) {
- printk(KERN_ERR "qnx6: unable to set blocksize\n");
+ pr_err("unable to set blocksize\n");
goto out;
}
/* blocksize invalidates bh - pull it back in */
@@ -398,21 +382,20 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent)
/* next the second superblock */
bh2 = sb_bread(s, offset);
if (!bh2) {
- printk(KERN_ERR "qnx6: unable to read the second superblock\n");
+ pr_err("unable to read the second superblock\n");
goto out;
}
sb2 = (struct qnx6_super_block *)bh2->b_data;
if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) {
if (!silent)
- printk(KERN_ERR "qnx6: wrong signature (magic)"
- " in superblock #2.\n");
+ pr_err("wrong signature (magic) in superblock #2.\n");
goto out;
}
/* checksum check - start at byte 8 and end at byte 512 */
if (fs32_to_cpu(sbi, sb2->sb_checksum) !=
crc32_be(0, (char *)(bh2->b_data + 8), 504)) {
- printk(KERN_ERR "qnx6: superblock #2 checksum error\n");
+ pr_err("superblock #2 checksum error\n");
goto out;
}
@@ -422,25 +405,24 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent)
sbi->sb_buf = bh1;
sbi->sb = (struct qnx6_super_block *)bh1->b_data;
brelse(bh2);
- printk(KERN_INFO "qnx6: superblock #1 active\n");
+ pr_info("superblock #1 active\n");
} else {
/* superblock #2 active */
sbi->sb_buf = bh2;
sbi->sb = (struct qnx6_super_block *)bh2->b_data;
brelse(bh1);
- printk(KERN_INFO "qnx6: superblock #2 active\n");
+ pr_info("superblock #2 active\n");
}
mmi_success:
/* sanity check - limit maximum indirect pointer levels */
if (sb1->Inode.levels > QNX6_PTR_MAX_LEVELS) {
- printk(KERN_ERR "qnx6: too many inode levels (max %i, sb %i)\n",
- QNX6_PTR_MAX_LEVELS, sb1->Inode.levels);
+ pr_err("too many inode levels (max %i, sb %i)\n",
+ QNX6_PTR_MAX_LEVELS, sb1->Inode.levels);
goto out;
}
if (sb1->Longfile.levels > QNX6_PTR_MAX_LEVELS) {
- printk(KERN_ERR "qnx6: too many longfilename levels"
- " (max %i, sb %i)\n",
- QNX6_PTR_MAX_LEVELS, sb1->Longfile.levels);
+ pr_err("too many longfilename levels (max %i, sb %i)\n",
+ QNX6_PTR_MAX_LEVELS, sb1->Longfile.levels);
goto out;
}
s->s_op = &qnx6_sops;
@@ -460,7 +442,7 @@ mmi_success:
/* prefetch root inode */
root = qnx6_iget(s, QNX6_ROOT_INO);
if (IS_ERR(root)) {
- printk(KERN_ERR "qnx6: get inode failed\n");
+ pr_err("get inode failed\n");
ret = PTR_ERR(root);
goto out2;
}
@@ -474,7 +456,7 @@ mmi_success:
errmsg = qnx6_checkroot(s);
if (errmsg != NULL) {
if (!silent)
- printk(KERN_ERR "qnx6: %s\n", errmsg);
+ pr_err("%s\n", errmsg);
goto out3;
}
return 0;
@@ -555,8 +537,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
inode->i_mode = 0;
if (ino == 0) {
- printk(KERN_ERR "qnx6: bad inode number on dev %s: %u is "
- "out of range\n",
+ pr_err("bad inode number on dev %s: %u is out of range\n",
sb->s_id, ino);
iget_failed(inode);
return ERR_PTR(-EIO);
@@ -566,8 +547,8 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
mapping = sbi->inodes->i_mapping;
page = read_mapping_page(mapping, n, NULL);
if (IS_ERR(page)) {
- printk(KERN_ERR "qnx6: major problem: unable to read inode from "
- "dev %s\n", sb->s_id);
+ pr_err("major problem: unable to read inode from dev %s\n",
+ sb->s_id);
iget_failed(inode);
return ERR_CAST(page);
}
@@ -689,7 +670,7 @@ static int __init init_qnx6_fs(void)
return err;
}
- printk(KERN_INFO "QNX6 filesystem 1.0.0 registered.\n");
+ pr_info("QNX6 filesystem 1.0.0 registered.\n");
return 0;
}
diff --git a/fs/qnx6/namei.c b/fs/qnx6/namei.c
index 0561326a94f5..6c1a323137dd 100644
--- a/fs/qnx6/namei.c
+++ b/fs/qnx6/namei.c
@@ -29,12 +29,12 @@ struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry,
foundinode = qnx6_iget(dir->i_sb, ino);
qnx6_put_page(page);
if (IS_ERR(foundinode)) {
- QNX6DEBUG((KERN_ERR "qnx6: lookup->iget -> "
- " error %ld\n", PTR_ERR(foundinode)));
+ pr_debug("lookup->iget -> error %ld\n",
+ PTR_ERR(foundinode));
return ERR_CAST(foundinode);
}
} else {
- QNX6DEBUG((KERN_INFO "qnx6_lookup: not found %s\n", name));
+ pr_debug("%s(): not found %s\n", __func__, name);
return NULL;
}
d_add(dentry, foundinode);
diff --git a/fs/qnx6/qnx6.h b/fs/qnx6/qnx6.h
index b00fcc960d37..d3fb2b698800 100644
--- a/fs/qnx6/qnx6.h
+++ b/fs/qnx6/qnx6.h
@@ -10,6 +10,12 @@
*
*/
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/fs.h>
#include <linux/pagemap.h>
@@ -19,12 +25,6 @@ typedef __u64 __bitwise __fs64;
#include <linux/qnx6_fs.h>
-#ifdef CONFIG_QNX6FS_DEBUG
-#define QNX6DEBUG(X) printk X
-#else
-#define QNX6DEBUG(X) (void) 0
-#endif
-
struct qnx6_sb_info {
struct buffer_head *sb_buf; /* superblock buffer */
struct qnx6_super_block *sb; /* our superblock */
diff --git a/fs/qnx6/super_mmi.c b/fs/qnx6/super_mmi.c
index 29c32cba62d6..62aaf3e3126a 100644
--- a/fs/qnx6/super_mmi.c
+++ b/fs/qnx6/super_mmi.c
@@ -44,15 +44,14 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent)
start with the first superblock */
bh1 = sb_bread(s, 0);
if (!bh1) {
- printk(KERN_ERR "qnx6: Unable to read first mmi superblock\n");
+ pr_err("Unable to read first mmi superblock\n");
return NULL;
}
sb1 = (struct qnx6_mmi_super_block *)bh1->b_data;
sbi = QNX6_SB(s);
if (fs32_to_cpu(sbi, sb1->sb_magic) != QNX6_SUPER_MAGIC) {
if (!silent) {
- printk(KERN_ERR "qnx6: wrong signature (magic) in"
- " superblock #1.\n");
+ pr_err("wrong signature (magic) in superblock #1.\n");
goto out;
}
}
@@ -60,7 +59,7 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent)
/* checksum check - start at byte 8 and end at byte 512 */
if (fs32_to_cpu(sbi, sb1->sb_checksum) !=
crc32_be(0, (char *)(bh1->b_data + 8), 504)) {
- printk(KERN_ERR "qnx6: superblock #1 checksum error\n");
+ pr_err("superblock #1 checksum error\n");
goto out;
}
@@ -70,7 +69,7 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent)
/* set new blocksize */
if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) {
- printk(KERN_ERR "qnx6: unable to set blocksize\n");
+ pr_err("unable to set blocksize\n");
goto out;
}
/* blocksize invalidates bh - pull it back in */
@@ -83,27 +82,26 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent)
/* read second superblock */
bh2 = sb_bread(s, offset);
if (!bh2) {
- printk(KERN_ERR "qnx6: unable to read the second superblock\n");
+ pr_err("unable to read the second superblock\n");
goto out;
}
sb2 = (struct qnx6_mmi_super_block *)bh2->b_data;
if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) {
if (!silent)
- printk(KERN_ERR "qnx6: wrong signature (magic) in"
- " superblock #2.\n");
+ pr_err("wrong signature (magic) in superblock #2.\n");
goto out;
}
/* checksum check - start at byte 8 and end at byte 512 */
if (fs32_to_cpu(sbi, sb2->sb_checksum)
!= crc32_be(0, (char *)(bh2->b_data + 8), 504)) {
- printk(KERN_ERR "qnx6: superblock #1 checksum error\n");
+ pr_err("superblock #1 checksum error\n");
goto out;
}
qsb = kmalloc(sizeof(*qsb), GFP_KERNEL);
if (!qsb) {
- printk(KERN_ERR "qnx6: unable to allocate memory.\n");
+ pr_err("unable to allocate memory.\n");
goto out;
}
@@ -119,7 +117,7 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent)
sbi->sb_buf = bh1;
sbi->sb = (struct qnx6_super_block *)bh1->b_data;
brelse(bh2);
- printk(KERN_INFO "qnx6: superblock #1 active\n");
+ pr_info("superblock #1 active\n");
} else {
/* superblock #2 active */
qnx6_mmi_copy_sb(qsb, sb2);
@@ -131,7 +129,7 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent)
sbi->sb_buf = bh2;
sbi->sb = (struct qnx6_super_block *)bh2->b_data;
brelse(bh1);
- printk(KERN_INFO "qnx6: superblock #2 active\n");
+ pr_info("superblock #2 active\n");
}
kfree(qsb);
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 7f30bdc57d13..6b4527216a7f 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -96,13 +96,16 @@
* Note that some things (eg. sb pointer, type, id) doesn't change during
* the life of the dquot structure and so needn't to be protected by a lock
*
- * Any operation working on dquots via inode pointers must hold dqptr_sem. If
- * operation is just reading pointers from inode (or not using them at all) the
- * read lock is enough. If pointers are altered function must hold write lock.
+ * Operation accessing dquots via inode pointers are protected by dquot_srcu.
+ * Operation of reading pointer needs srcu_read_lock(&dquot_srcu), and
+ * synchronize_srcu(&dquot_srcu) is called after clearing pointers from
+ * inode and before dropping dquot references to avoid use of dquots after
+ * they are freed. dq_data_lock is used to serialize the pointer setting and
+ * clearing operations.
* Special care needs to be taken about S_NOQUOTA inode flag (marking that
* inode is a quota file). Functions adding pointers from inode to dquots have
- * to check this flag under dqptr_sem and then (if S_NOQUOTA is not set) they
- * have to do all pointer modifications before dropping dqptr_sem. This makes
+ * to check this flag under dq_data_lock and then (if S_NOQUOTA is not set) they
+ * have to do all pointer modifications before dropping dq_data_lock. This makes
* sure they cannot race with quotaon which first sets S_NOQUOTA flag and
* then drops all pointers to dquots from an inode.
*
@@ -116,21 +119,15 @@
* spinlock to internal buffers before writing.
*
* Lock ordering (including related VFS locks) is the following:
- * dqonoff_mutex > i_mutex > journal_lock > dqptr_sem > dquot->dq_lock >
- * dqio_mutex
+ * dqonoff_mutex > i_mutex > journal_lock > dquot->dq_lock > dqio_mutex
* dqonoff_mutex > i_mutex comes from dquot_quota_sync, dquot_enable, etc.
- * The lock ordering of dqptr_sem imposed by quota code is only dqonoff_sem >
- * dqptr_sem. But filesystem has to count with the fact that functions such as
- * dquot_alloc_space() acquire dqptr_sem and they usually have to be called
- * from inside a transaction to keep filesystem consistency after a crash. Also
- * filesystems usually want to do some IO on dquot from ->mark_dirty which is
- * called with dqptr_sem held.
*/
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock);
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
EXPORT_SYMBOL(dq_data_lock);
+DEFINE_STATIC_SRCU(dquot_srcu);
void __quota_error(struct super_block *sb, const char *func,
const char *fmt, ...)
@@ -637,7 +634,7 @@ int dquot_writeback_dquots(struct super_block *sb, int type)
dqstats_inc(DQST_LOOKUPS);
err = sb->dq_op->write_dquot(dquot);
if (!ret && err)
- err = ret;
+ ret = err;
dqput(dquot);
spin_lock(&dq_list_lock);
}
@@ -733,7 +730,6 @@ static struct shrinker dqcache_shrinker = {
/*
* Put reference to dquot
- * NOTE: If you change this function please check whether dqput_blocks() works right...
*/
void dqput(struct dquot *dquot)
{
@@ -963,46 +959,33 @@ static void add_dquot_ref(struct super_block *sb, int type)
}
/*
- * Return 0 if dqput() won't block.
- * (note that 1 doesn't necessarily mean blocking)
- */
-static inline int dqput_blocks(struct dquot *dquot)
-{
- if (atomic_read(&dquot->dq_count) <= 1)
- return 1;
- return 0;
-}
-
-/*
* Remove references to dquots from inode and add dquot to list for freeing
* if we have the last reference to dquot
- * We can't race with anybody because we hold dqptr_sem for writing...
*/
-static int remove_inode_dquot_ref(struct inode *inode, int type,
- struct list_head *tofree_head)
+static void remove_inode_dquot_ref(struct inode *inode, int type,
+ struct list_head *tofree_head)
{
struct dquot *dquot = inode->i_dquot[type];
inode->i_dquot[type] = NULL;
- if (dquot) {
- if (dqput_blocks(dquot)) {
-#ifdef CONFIG_QUOTA_DEBUG
- if (atomic_read(&dquot->dq_count) != 1)
- quota_error(inode->i_sb, "Adding dquot with "
- "dq_count %d to dispose list",
- atomic_read(&dquot->dq_count));
-#endif
- spin_lock(&dq_list_lock);
- /* As dquot must have currently users it can't be on
- * the free list... */
- list_add(&dquot->dq_free, tofree_head);
- spin_unlock(&dq_list_lock);
- return 1;
- }
- else
- dqput(dquot); /* We have guaranteed we won't block */
+ if (!dquot)
+ return;
+
+ if (list_empty(&dquot->dq_free)) {
+ /*
+ * The inode still has reference to dquot so it can't be in the
+ * free list
+ */
+ spin_lock(&dq_list_lock);
+ list_add(&dquot->dq_free, tofree_head);
+ spin_unlock(&dq_list_lock);
+ } else {
+ /*
+ * Dquot is already in a list to put so we won't drop the last
+ * reference here.
+ */
+ dqput(dquot);
}
- return 0;
}
/*
@@ -1037,13 +1020,15 @@ static void remove_dquot_ref(struct super_block *sb, int type,
* We have to scan also I_NEW inodes because they can already
* have quota pointer initialized. Luckily, we need to touch
* only quota pointers and these have separate locking
- * (dqptr_sem).
+ * (dq_data_lock).
*/
+ spin_lock(&dq_data_lock);
if (!IS_NOQUOTA(inode)) {
if (unlikely(inode_get_rsv_space(inode) > 0))
reserved = 1;
remove_inode_dquot_ref(inode, type, tofree_head);
}
+ spin_unlock(&dq_data_lock);
}
spin_unlock(&inode_sb_list_lock);
#ifdef CONFIG_QUOTA_DEBUG
@@ -1061,9 +1046,8 @@ static void drop_dquot_ref(struct super_block *sb, int type)
LIST_HEAD(tofree_head);
if (sb->dq_op) {
- down_write(&sb_dqopt(sb)->dqptr_sem);
remove_dquot_ref(sb, type, &tofree_head);
- up_write(&sb_dqopt(sb)->dqptr_sem);
+ synchronize_srcu(&dquot_srcu);
put_dquot_list(&tofree_head);
}
}
@@ -1394,21 +1378,16 @@ static int dquot_active(const struct inode *inode)
/*
* Initialize quota pointers in inode
*
- * We do things in a bit complicated way but by that we avoid calling
- * dqget() and thus filesystem callbacks under dqptr_sem.
- *
* It is better to call this function outside of any transaction as it
* might need a lot of space in journal for dquot structure allocation.
*/
static void __dquot_initialize(struct inode *inode, int type)
{
- int cnt;
+ int cnt, init_needed = 0;
struct dquot *got[MAXQUOTAS];
struct super_block *sb = inode->i_sb;
qsize_t rsv;
- /* First test before acquiring mutex - solves deadlocks when we
- * re-enter the quota code and are already holding the mutex */
if (!dquot_active(inode))
return;
@@ -1418,6 +1397,15 @@ static void __dquot_initialize(struct inode *inode, int type)
got[cnt] = NULL;
if (type != -1 && cnt != type)
continue;
+ /*
+ * The i_dquot should have been initialized in most cases,
+ * we check it without locking here to avoid unnecessary
+ * dqget()/dqput() calls.
+ */
+ if (inode->i_dquot[cnt])
+ continue;
+ init_needed = 1;
+
switch (cnt) {
case USRQUOTA:
qid = make_kqid_uid(inode->i_uid);
@@ -1429,7 +1417,11 @@ static void __dquot_initialize(struct inode *inode, int type)
got[cnt] = dqget(sb, qid);
}
- down_write(&sb_dqopt(sb)->dqptr_sem);
+ /* All required i_dquot has been initialized */
+ if (!init_needed)
+ return;
+
+ spin_lock(&dq_data_lock);
if (IS_NOQUOTA(inode))
goto out_err;
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1449,15 +1441,12 @@ static void __dquot_initialize(struct inode *inode, int type)
* did a write before quota was turned on
*/
rsv = inode_get_rsv_space(inode);
- if (unlikely(rsv)) {
- spin_lock(&dq_data_lock);
+ if (unlikely(rsv))
dquot_resv_space(inode->i_dquot[cnt], rsv);
- spin_unlock(&dq_data_lock);
- }
}
}
out_err:
- up_write(&sb_dqopt(sb)->dqptr_sem);
+ spin_unlock(&dq_data_lock);
/* Drop unused references */
dqput_all(got);
}
@@ -1469,19 +1458,24 @@ void dquot_initialize(struct inode *inode)
EXPORT_SYMBOL(dquot_initialize);
/*
- * Release all quotas referenced by inode
+ * Release all quotas referenced by inode.
+ *
+ * This function only be called on inode free or converting
+ * a file to quota file, no other users for the i_dquot in
+ * both cases, so we needn't call synchronize_srcu() after
+ * clearing i_dquot.
*/
static void __dquot_drop(struct inode *inode)
{
int cnt;
struct dquot *put[MAXQUOTAS];
- down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+ spin_lock(&dq_data_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
put[cnt] = inode->i_dquot[cnt];
inode->i_dquot[cnt] = NULL;
}
- up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+ spin_unlock(&dq_data_lock);
dqput_all(put);
}
@@ -1599,15 +1593,11 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
*/
int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
{
- int cnt, ret = 0;
+ int cnt, ret = 0, index;
struct dquot_warn warn[MAXQUOTAS];
struct dquot **dquots = inode->i_dquot;
int reserve = flags & DQUOT_SPACE_RESERVE;
- /*
- * First test before acquiring mutex - solves deadlocks when we
- * re-enter the quota code and are already holding the mutex
- */
if (!dquot_active(inode)) {
inode_incr_space(inode, number, reserve);
goto out;
@@ -1616,7 +1606,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
warn[cnt].w_type = QUOTA_NL_NOWARN;
- down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+ index = srcu_read_lock(&dquot_srcu);
spin_lock(&dq_data_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (!dquots[cnt])
@@ -1643,7 +1633,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
goto out_flush_warn;
mark_all_dquot_dirty(dquots);
out_flush_warn:
- up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+ srcu_read_unlock(&dquot_srcu, index);
flush_warnings(warn);
out:
return ret;
@@ -1655,17 +1645,16 @@ EXPORT_SYMBOL(__dquot_alloc_space);
*/
int dquot_alloc_inode(const struct inode *inode)
{
- int cnt, ret = 0;
+ int cnt, ret = 0, index;
struct dquot_warn warn[MAXQUOTAS];
struct dquot * const *dquots = inode->i_dquot;
- /* First test before acquiring mutex - solves deadlocks when we
- * re-enter the quota code and are already holding the mutex */
if (!dquot_active(inode))
return 0;
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
warn[cnt].w_type = QUOTA_NL_NOWARN;
- down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+
+ index = srcu_read_lock(&dquot_srcu);
spin_lock(&dq_data_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (!dquots[cnt])
@@ -1685,7 +1674,7 @@ warn_put_all:
spin_unlock(&dq_data_lock);
if (ret == 0)
mark_all_dquot_dirty(dquots);
- up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+ srcu_read_unlock(&dquot_srcu, index);
flush_warnings(warn);
return ret;
}
@@ -1696,14 +1685,14 @@ EXPORT_SYMBOL(dquot_alloc_inode);
*/
int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
{
- int cnt;
+ int cnt, index;
if (!dquot_active(inode)) {
inode_claim_rsv_space(inode, number);
return 0;
}
- down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+ index = srcu_read_lock(&dquot_srcu);
spin_lock(&dq_data_lock);
/* Claim reserved quotas to allocated quotas */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1715,7 +1704,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
inode_claim_rsv_space(inode, number);
spin_unlock(&dq_data_lock);
mark_all_dquot_dirty(inode->i_dquot);
- up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+ srcu_read_unlock(&dquot_srcu, index);
return 0;
}
EXPORT_SYMBOL(dquot_claim_space_nodirty);
@@ -1725,14 +1714,14 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
*/
void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
{
- int cnt;
+ int cnt, index;
if (!dquot_active(inode)) {
inode_reclaim_rsv_space(inode, number);
return;
}
- down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+ index = srcu_read_lock(&dquot_srcu);
spin_lock(&dq_data_lock);
/* Claim reserved quotas to allocated quotas */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1744,7 +1733,7 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
inode_reclaim_rsv_space(inode, number);
spin_unlock(&dq_data_lock);
mark_all_dquot_dirty(inode->i_dquot);
- up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+ srcu_read_unlock(&dquot_srcu, index);
return;
}
EXPORT_SYMBOL(dquot_reclaim_space_nodirty);
@@ -1757,16 +1746,14 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
unsigned int cnt;
struct dquot_warn warn[MAXQUOTAS];
struct dquot **dquots = inode->i_dquot;
- int reserve = flags & DQUOT_SPACE_RESERVE;
+ int reserve = flags & DQUOT_SPACE_RESERVE, index;
- /* First test before acquiring mutex - solves deadlocks when we
- * re-enter the quota code and are already holding the mutex */
if (!dquot_active(inode)) {
inode_decr_space(inode, number, reserve);
return;
}
- down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+ index = srcu_read_lock(&dquot_srcu);
spin_lock(&dq_data_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
int wtype;
@@ -1789,7 +1776,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
goto out_unlock;
mark_all_dquot_dirty(dquots);
out_unlock:
- up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+ srcu_read_unlock(&dquot_srcu, index);
flush_warnings(warn);
}
EXPORT_SYMBOL(__dquot_free_space);
@@ -1802,13 +1789,12 @@ void dquot_free_inode(const struct inode *inode)
unsigned int cnt;
struct dquot_warn warn[MAXQUOTAS];
struct dquot * const *dquots = inode->i_dquot;
+ int index;
- /* First test before acquiring mutex - solves deadlocks when we
- * re-enter the quota code and are already holding the mutex */
if (!dquot_active(inode))
return;
- down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+ index = srcu_read_lock(&dquot_srcu);
spin_lock(&dq_data_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
int wtype;
@@ -1823,7 +1809,7 @@ void dquot_free_inode(const struct inode *inode)
}
spin_unlock(&dq_data_lock);
mark_all_dquot_dirty(dquots);
- up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+ srcu_read_unlock(&dquot_srcu, index);
flush_warnings(warn);
}
EXPORT_SYMBOL(dquot_free_inode);
@@ -1837,6 +1823,8 @@ EXPORT_SYMBOL(dquot_free_inode);
* This operation can block, but only after everything is updated
* A transaction must be started when entering this function.
*
+ * We are holding reference on transfer_from & transfer_to, no need to
+ * protect them by srcu_read_lock().
*/
int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
{
@@ -1849,8 +1837,6 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
struct dquot_warn warn_from_inodes[MAXQUOTAS];
struct dquot_warn warn_from_space[MAXQUOTAS];
- /* First test before acquiring mutex - solves deadlocks when we
- * re-enter the quota code and are already holding the mutex */
if (IS_NOQUOTA(inode))
return 0;
/* Initialize the arrays */
@@ -1859,12 +1845,12 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
warn_from_inodes[cnt].w_type = QUOTA_NL_NOWARN;
warn_from_space[cnt].w_type = QUOTA_NL_NOWARN;
}
- down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+
+ spin_lock(&dq_data_lock);
if (IS_NOQUOTA(inode)) { /* File without quota accounting? */
- up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+ spin_unlock(&dq_data_lock);
return 0;
}
- spin_lock(&dq_data_lock);
cur_space = inode_get_bytes(inode);
rsv_space = inode_get_rsv_space(inode);
space = cur_space + rsv_space;
@@ -1918,7 +1904,6 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
inode->i_dquot[cnt] = transfer_to[cnt];
}
spin_unlock(&dq_data_lock);
- up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
mark_all_dquot_dirty(transfer_from);
mark_all_dquot_dirty(transfer_to);
@@ -1932,7 +1917,6 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
return 0;
over_quota:
spin_unlock(&dq_data_lock);
- up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
flush_warnings(warn_to);
return ret;
}
@@ -2741,7 +2725,7 @@ static int __init dquot_init(void)
panic("Cannot create dquot hash table");
for (i = 0; i < _DQST_DQSTAT_LAST; i++) {
- ret = percpu_counter_init(&dqstats.counter[i], 0);
+ ret = percpu_counter_init(&dqstats.counter[i], 0, GFP_KERNEL);
if (ret)
panic("Cannot create dquot stat counters");
}
diff --git a/fs/quota/kqid.c b/fs/quota/kqid.c
index 2f97b0e2c501..ebc5e6285800 100644
--- a/fs/quota/kqid.c
+++ b/fs/quota/kqid.c
@@ -55,7 +55,7 @@ EXPORT_SYMBOL(qid_lt);
/**
* from_kqid - Create a qid from a kqid user-namespace pair.
* @targ: The user namespace we want a qid in.
- * @kuid: The kernel internal quota identifier to start with.
+ * @kqid: The kernel internal quota identifier to start with.
*
* Map @kqid into the user-namespace specified by @targ and
* return the resulting qid.
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index 72d29177998e..bb2869f5dfd8 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -32,8 +32,7 @@ static struct genl_family quota_genl_family = {
/**
* quota_send_warning - Send warning to userspace about exceeded quota
- * @type: The quota type: USRQQUOTA, GRPQUOTA,...
- * @id: The user or group id of the quota that was exceeded
+ * @qid: The kernel internal quota identifier.
* @dev: The device on which the fs is mounted (sb->s_dev)
* @warntype: The type of the warning: QUOTA_NL_...
*
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index ff3f0b3cfdb3..75621649dbd7 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -79,13 +79,13 @@ static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
{
__u32 fmt;
- down_read(&sb_dqopt(sb)->dqptr_sem);
+ mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
if (!sb_has_quota_active(sb, type)) {
- up_read(&sb_dqopt(sb)->dqptr_sem);
+ mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
return -ESRCH;
}
fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id;
- up_read(&sb_dqopt(sb)->dqptr_sem);
+ mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
if (copy_to_user(addr, &fmt, sizeof(fmt)))
return -EFAULT;
return 0;
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index dda012ad4208..bbafbde3471a 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -222,7 +222,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
/* gang-find the pages */
ret = -ENOMEM;
- pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL);
+ pages = kcalloc(lpages, sizeof(struct page *), GFP_KERNEL);
if (!pages)
goto out_free;
diff --git a/fs/read_write.c b/fs/read_write.c
index 009d8542a889..7d9318c3d43c 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -513,6 +513,8 @@ ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t
return ret;
}
+EXPORT_SYMBOL(__kernel_write);
+
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index d9f5a60dd59b..0a7dc941aaf4 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -9,7 +9,7 @@
#include <linux/stat.h>
#include <linux/buffer_head.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
extern const struct reiserfs_key MIN_KEY;
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 54fdf196bfb2..9c02d96d3a42 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -10,7 +10,7 @@
* and using buffers obtained after all above.
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/time.h>
#include "reiserfs.h"
#include <linux/buffer_head.h>
@@ -286,12 +286,14 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
return 0;
}
-static void balance_leaf_insert_left(struct tree_balance *tb,
- struct item_head *ih, const char *body)
+static unsigned int balance_leaf_insert_left(struct tree_balance *tb,
+ struct item_head *const ih,
+ const char * const body)
{
int ret;
struct buffer_info bi;
int n = B_NR_ITEMS(tb->L[0]);
+ unsigned body_shift_bytes = 0;
if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) {
/* part of new item falls into L[0] */
@@ -329,7 +331,7 @@ static void balance_leaf_insert_left(struct tree_balance *tb,
put_ih_item_len(ih, new_item_len);
if (tb->lbytes > tb->zeroes_num) {
- body += (tb->lbytes - tb->zeroes_num);
+ body_shift_bytes = tb->lbytes - tb->zeroes_num;
tb->zeroes_num = 0;
} else
tb->zeroes_num -= tb->lbytes;
@@ -349,11 +351,12 @@ static void balance_leaf_insert_left(struct tree_balance *tb,
tb->insert_size[0] = 0;
tb->zeroes_num = 0;
}
+ return body_shift_bytes;
}
static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb,
- struct item_head *ih,
- const char *body)
+ struct item_head * const ih,
+ const char * const body)
{
int n = B_NR_ITEMS(tb->L[0]);
struct buffer_info bi;
@@ -413,17 +416,18 @@ static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb,
tb->pos_in_item -= tb->lbytes;
}
-static void balance_leaf_paste_left_shift(struct tree_balance *tb,
- struct item_head *ih,
- const char *body)
+static unsigned int balance_leaf_paste_left_shift(struct tree_balance *tb,
+ struct item_head * const ih,
+ const char * const body)
{
struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
int n = B_NR_ITEMS(tb->L[0]);
struct buffer_info bi;
+ int body_shift_bytes = 0;
if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) {
balance_leaf_paste_left_shift_dirent(tb, ih, body);
- return;
+ return 0;
}
RFALSE(tb->lbytes <= 0,
@@ -497,7 +501,7 @@ static void balance_leaf_paste_left_shift(struct tree_balance *tb,
* insert_size[0]
*/
if (l_n > tb->zeroes_num) {
- body += (l_n - tb->zeroes_num);
+ body_shift_bytes = l_n - tb->zeroes_num;
tb->zeroes_num = 0;
} else
tb->zeroes_num -= l_n;
@@ -526,13 +530,14 @@ static void balance_leaf_paste_left_shift(struct tree_balance *tb,
*/
leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
}
+ return body_shift_bytes;
}
/* appended item will be in L[0] in whole */
static void balance_leaf_paste_left_whole(struct tree_balance *tb,
- struct item_head *ih,
- const char *body)
+ struct item_head * const ih,
+ const char * const body)
{
struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
int n = B_NR_ITEMS(tb->L[0]);
@@ -584,39 +589,44 @@ static void balance_leaf_paste_left_whole(struct tree_balance *tb,
tb->zeroes_num = 0;
}
-static void balance_leaf_paste_left(struct tree_balance *tb,
- struct item_head *ih, const char *body)
+static unsigned int balance_leaf_paste_left(struct tree_balance *tb,
+ struct item_head * const ih,
+ const char * const body)
{
/* we must shift the part of the appended item */
if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1)
- balance_leaf_paste_left_shift(tb, ih, body);
+ return balance_leaf_paste_left_shift(tb, ih, body);
else
balance_leaf_paste_left_whole(tb, ih, body);
+ return 0;
}
/* Shift lnum[0] items from S[0] to the left neighbor L[0] */
-static void balance_leaf_left(struct tree_balance *tb, struct item_head *ih,
- const char *body, int flag)
+static unsigned int balance_leaf_left(struct tree_balance *tb,
+ struct item_head * const ih,
+ const char * const body, int flag)
{
if (tb->lnum[0] <= 0)
- return;
+ return 0;
/* new item or it part falls to L[0], shift it too */
if (tb->item_pos < tb->lnum[0]) {
BUG_ON(flag != M_INSERT && flag != M_PASTE);
if (flag == M_INSERT)
- balance_leaf_insert_left(tb, ih, body);
+ return balance_leaf_insert_left(tb, ih, body);
else /* M_PASTE */
- balance_leaf_paste_left(tb, ih, body);
+ return balance_leaf_paste_left(tb, ih, body);
} else
/* new item doesn't fall into L[0] */
leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
+ return 0;
}
static void balance_leaf_insert_right(struct tree_balance *tb,
- struct item_head *ih, const char *body)
+ struct item_head * const ih,
+ const char * const body)
{
struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
@@ -704,7 +714,8 @@ static void balance_leaf_insert_right(struct tree_balance *tb,
static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb,
- struct item_head *ih, const char *body)
+ struct item_head * const ih,
+ const char * const body)
{
struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
struct buffer_info bi;
@@ -754,7 +765,8 @@ static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb,
}
static void balance_leaf_paste_right_shift(struct tree_balance *tb,
- struct item_head *ih, const char *body)
+ struct item_head * const ih,
+ const char * const body)
{
struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
int n_shift, n_rem, r_zeroes_number, version;
@@ -831,7 +843,8 @@ static void balance_leaf_paste_right_shift(struct tree_balance *tb,
}
static void balance_leaf_paste_right_whole(struct tree_balance *tb,
- struct item_head *ih, const char *body)
+ struct item_head * const ih,
+ const char * const body)
{
struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
int n = B_NR_ITEMS(tbS0);
@@ -874,7 +887,8 @@ static void balance_leaf_paste_right_whole(struct tree_balance *tb,
}
static void balance_leaf_paste_right(struct tree_balance *tb,
- struct item_head *ih, const char *body)
+ struct item_head * const ih,
+ const char * const body)
{
struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
int n = B_NR_ITEMS(tbS0);
@@ -896,8 +910,9 @@ static void balance_leaf_paste_right(struct tree_balance *tb,
}
/* shift rnum[0] items from S[0] to the right neighbor R[0] */
-static void balance_leaf_right(struct tree_balance *tb, struct item_head *ih,
- const char *body, int flag)
+static void balance_leaf_right(struct tree_balance *tb,
+ struct item_head * const ih,
+ const char * const body, int flag)
{
if (tb->rnum[0] <= 0)
return;
@@ -911,8 +926,8 @@ static void balance_leaf_right(struct tree_balance *tb, struct item_head *ih,
}
static void balance_leaf_new_nodes_insert(struct tree_balance *tb,
- struct item_head *ih,
- const char *body,
+ struct item_head * const ih,
+ const char * const body,
struct item_head *insert_key,
struct buffer_head **insert_ptr,
int i)
@@ -1003,8 +1018,8 @@ static void balance_leaf_new_nodes_insert(struct tree_balance *tb,
/* we append to directory item */
static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb,
- struct item_head *ih,
- const char *body,
+ struct item_head * const ih,
+ const char * const body,
struct item_head *insert_key,
struct buffer_head **insert_ptr,
int i)
@@ -1058,8 +1073,8 @@ static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb,
}
static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb,
- struct item_head *ih,
- const char *body,
+ struct item_head * const ih,
+ const char * const body,
struct item_head *insert_key,
struct buffer_head **insert_ptr,
int i)
@@ -1131,8 +1146,8 @@ static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb,
}
static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb,
- struct item_head *ih,
- const char *body,
+ struct item_head * const ih,
+ const char * const body,
struct item_head *insert_key,
struct buffer_head **insert_ptr,
int i)
@@ -1184,8 +1199,8 @@ static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb,
}
static void balance_leaf_new_nodes_paste(struct tree_balance *tb,
- struct item_head *ih,
- const char *body,
+ struct item_head * const ih,
+ const char * const body,
struct item_head *insert_key,
struct buffer_head **insert_ptr,
int i)
@@ -1214,8 +1229,8 @@ static void balance_leaf_new_nodes_paste(struct tree_balance *tb,
/* Fill new nodes that appear in place of S[0] */
static void balance_leaf_new_nodes(struct tree_balance *tb,
- struct item_head *ih,
- const char *body,
+ struct item_head * const ih,
+ const char * const body,
struct item_head *insert_key,
struct buffer_head **insert_ptr,
int flag)
@@ -1254,8 +1269,8 @@ static void balance_leaf_new_nodes(struct tree_balance *tb,
}
static void balance_leaf_finish_node_insert(struct tree_balance *tb,
- struct item_head *ih,
- const char *body)
+ struct item_head * const ih,
+ const char * const body)
{
struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
struct buffer_info bi;
@@ -1271,8 +1286,8 @@ static void balance_leaf_finish_node_insert(struct tree_balance *tb,
}
static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb,
- struct item_head *ih,
- const char *body)
+ struct item_head * const ih,
+ const char * const body)
{
struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
struct item_head *pasted = item_head(tbS0, tb->item_pos);
@@ -1305,8 +1320,8 @@ static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb,
}
static void balance_leaf_finish_node_paste(struct tree_balance *tb,
- struct item_head *ih,
- const char *body)
+ struct item_head * const ih,
+ const char * const body)
{
struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
struct buffer_info bi;
@@ -1349,8 +1364,8 @@ static void balance_leaf_finish_node_paste(struct tree_balance *tb,
* of the affected item which remains in S
*/
static void balance_leaf_finish_node(struct tree_balance *tb,
- struct item_head *ih,
- const char *body, int flag)
+ struct item_head * const ih,
+ const char * const body, int flag)
{
/* if we must insert or append into buffer S[0] */
if (0 <= tb->item_pos && tb->item_pos < tb->s0num) {
@@ -1402,7 +1417,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,
&& is_indirect_le_ih(item_head(tbS0, tb->item_pos)))
tb->pos_in_item *= UNFM_P_SIZE;
- balance_leaf_left(tb, ih, body, flag);
+ body += balance_leaf_left(tb, ih, body, flag);
/* tb->lnum[0] > 0 */
/* Calculate new item position */
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index db9e80ba53a0..751dd3f4346b 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -6,7 +6,7 @@
#include "reiserfs.h"
#include "acl.h"
#include "xattr.h"
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/writeback.h>
diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c
index 73231b1ebdbe..b751eea32e20 100644
--- a/fs/reiserfs/ibalance.c
+++ b/fs/reiserfs/ibalance.c
@@ -2,7 +2,7 @@
* Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/string.h>
#include <linux/time.h>
#include "reiserfs.h"
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 63b2b0ec49e6..a7eec9888f10 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -11,7 +11,7 @@
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/unaligned.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 501ed6811a2b..6ec8a30a0911 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -7,7 +7,7 @@
#include <linux/mount.h>
#include "reiserfs.h"
#include <linux/time.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/pagemap.h>
#include <linux/compat.h>
diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c
index cfaee912ee09..aca73dd73906 100644
--- a/fs/reiserfs/item_ops.c
+++ b/fs/reiserfs/item_ops.c
@@ -54,7 +54,7 @@ static void sd_print_item(struct item_head *ih, char *item)
} else {
struct stat_data *sd = (struct stat_data *)item;
- printk("\t0%-6o | %6Lu | %2u | %d | %s\n", sd_v2_mode(sd),
+ printk("\t0%-6o | %6llu | %2u | %d | %s\n", sd_v2_mode(sd),
(unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd),
sd_v2_rdev(sd), print_time(sd_v2_mtime(sd)));
}
@@ -408,7 +408,7 @@ static void direntry_print_item(struct item_head *ih, char *item)
namebuf[namelen + 2] = 0;
}
- printk("%d: %-15s%-15d%-15d%-15Ld%-15Ld(%s)\n",
+ printk("%d: %-15s%-15d%-15d%-15lld%-15lld(%s)\n",
i, namebuf,
deh_dir_id(deh), deh_objectid(deh),
GET_HASH_VALUE(deh_offset(deh)),
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index e8870de4627e..d571e173a990 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -699,11 +699,13 @@ static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
chunk->bh[chunk->nr++] = bh;
if (chunk->nr >= CHUNK_SIZE) {
ret = 1;
- if (lock)
+ if (lock) {
spin_unlock(lock);
- fn(chunk);
- if (lock)
+ fn(chunk);
spin_lock(lock);
+ } else {
+ fn(chunk);
+ }
}
return ret;
}
@@ -1947,8 +1949,6 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
}
}
- /* wait for all commits to finish */
- cancel_delayed_work(&SB_JOURNAL(sb)->j_work);
/*
* We must release the write lock here because
@@ -1956,8 +1956,14 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
*/
reiserfs_write_unlock(sb);
+ /*
+ * Cancel flushing of old commits. Note that neither of these works
+ * will be requeued because superblock is being shutdown and doesn't
+ * have MS_ACTIVE set.
+ */
cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work);
- flush_workqueue(REISERFS_SB(sb)->commit_wq);
+ /* wait for all commits to finish */
+ cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work);
free_journal_ram(sb);
@@ -4292,9 +4298,15 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags)
if (flush) {
flush_commit_list(sb, jl, 1);
flush_journal_list(sb, jl, 1);
- } else if (!(jl->j_state & LIST_COMMIT_PENDING))
- queue_delayed_work(REISERFS_SB(sb)->commit_wq,
- &journal->j_work, HZ / 10);
+ } else if (!(jl->j_state & LIST_COMMIT_PENDING)) {
+ /*
+ * Avoid queueing work when sb is being shut down. Transaction
+ * will be flushed on journal shutdown.
+ */
+ if (sb->s_flags & MS_ACTIVE)
+ queue_delayed_work(REISERFS_SB(sb)->commit_wq,
+ &journal->j_work, HZ / 10);
+ }
/*
* if the next transaction has any chance of wrapping, flush
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
index d6744c8b24e1..249594a821e0 100644
--- a/fs/reiserfs/lbalance.c
+++ b/fs/reiserfs/lbalance.c
@@ -2,7 +2,7 @@
* Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/string.h>
#include <linux/time.h>
#include "reiserfs.h"
@@ -899,8 +899,9 @@ void leaf_delete_items(struct buffer_info *cur_bi, int last_first,
/* insert item into the leaf node in position before */
void leaf_insert_into_buf(struct buffer_info *bi, int before,
- struct item_head *inserted_item_ih,
- const char *inserted_item_body, int zeros_number)
+ struct item_head * const inserted_item_ih,
+ const char * const inserted_item_body,
+ int zeros_number)
{
struct buffer_head *bh = bi->bi_bh;
int nr, free_space;
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index c9b47e91baf8..ae1dc841db3a 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -17,7 +17,7 @@ static char off_buf[80];
static char *reiserfs_cpu_offset(struct cpu_key *key)
{
if (cpu_key_k_type(key) == TYPE_DIRENTRY)
- sprintf(off_buf, "%Lu(%Lu)",
+ sprintf(off_buf, "%llu(%llu)",
(unsigned long long)
GET_HASH_VALUE(cpu_key_k_offset(key)),
(unsigned long long)
@@ -34,7 +34,7 @@ static char *le_offset(struct reiserfs_key *key)
version = le_key_version(key);
if (le_key_k_type(version, key) == TYPE_DIRENTRY)
- sprintf(off_buf, "%Lu(%Lu)",
+ sprintf(off_buf, "%llu(%llu)",
(unsigned long long)
GET_HASH_VALUE(le_key_k_offset(version, key)),
(unsigned long long)
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 02b0b7d0f7d5..621b9f381fe1 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -11,7 +11,7 @@
#include <linux/module.h>
#include <linux/time.h>
#include <linux/seq_file.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "reiserfs.h"
#include <linux/init.h>
#include <linux/proc_fs.h>
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index bf53888c7f59..1894d96ccb7c 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -506,6 +506,9 @@ typedef struct reiserfs_proc_info_data {
} reiserfs_proc_info_data_t;
#endif
+/* Number of quota types we support */
+#define REISERFS_MAXQUOTAS 2
+
/* reiserfs union of in-core super block data */
struct reiserfs_sb_info {
/* Buffer containing the super block */
@@ -615,7 +618,7 @@ struct reiserfs_sb_info {
spinlock_t old_work_lock; /* protects old_work and work_queued */
#ifdef CONFIG_QUOTA
- char *s_qf_names[MAXQUOTAS];
+ char *s_qf_names[REISERFS_MAXQUOTAS];
int s_jquota_fmt;
#endif
char *s_jdev; /* Stored jdev for mount option showing */
@@ -3216,11 +3219,12 @@ int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes);
void leaf_delete_items(struct buffer_info *cur_bi, int last_first, int first,
int del_num, int del_bytes);
void leaf_insert_into_buf(struct buffer_info *bi, int before,
- struct item_head *inserted_item_ih,
- const char *inserted_item_body, int zeros_number);
-void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num,
- int pos_in_item, int paste_size, const char *body,
+ struct item_head * const inserted_item_ih,
+ const char * const inserted_item_body,
int zeros_number);
+void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num,
+ int pos_in_item, int paste_size,
+ const char * const body, int zeros_number);
void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
int pos_in_item, int cut_size);
void leaf_paste_entries(struct buffer_info *bi, int item_num, int before,
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index dd44468edc2b..24cbe013240f 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -2006,7 +2006,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
&s_search_path) == POSITION_FOUND);
RFALSE(file_size > ROUND_UP(new_file_size),
- "PAP-5680: truncate did not finish: new_file_size %Ld, current %Ld, oid %d",
+ "PAP-5680: truncate did not finish: new_file_size %lld, current %lld, oid %d",
new_file_size, file_size, s_item_key.on_disk_key.k_objectid);
update_and_out:
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index a392cef6acc6..f1376c92cf74 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -15,7 +15,7 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/time.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "reiserfs.h"
#include "acl.h"
#include "xattr.h"
@@ -100,7 +100,11 @@ void reiserfs_schedule_old_flush(struct super_block *s)
struct reiserfs_sb_info *sbi = REISERFS_SB(s);
unsigned long delay;
- if (s->s_flags & MS_RDONLY)
+ /*
+ * Avoid scheduling flush when sb is being shut down. It can race
+ * with journal shutdown and free still queued delayed work.
+ */
+ if (s->s_flags & MS_RDONLY || !(s->s_flags & MS_ACTIVE))
return;
spin_lock(&sbi->old_work_lock);
@@ -202,7 +206,7 @@ static int finish_unfinished(struct super_block *s)
#ifdef CONFIG_QUOTA
int i;
int ms_active_set;
- int quota_enabled[MAXQUOTAS];
+ int quota_enabled[REISERFS_MAXQUOTAS];
#endif
/* compose key to look for "save" links */
@@ -223,7 +227,7 @@ static int finish_unfinished(struct super_block *s)
s->s_flags |= MS_ACTIVE;
}
/* Turn on quotas so that they are updated correctly */
- for (i = 0; i < MAXQUOTAS; i++) {
+ for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
quota_enabled[i] = 1;
if (REISERFS_SB(s)->s_qf_names[i]) {
int ret;
@@ -331,7 +335,7 @@ static int finish_unfinished(struct super_block *s)
* not completed truncate found. New size was
* committed together with "save" link
*/
- reiserfs_info(s, "Truncating %k to %Ld ..",
+ reiserfs_info(s, "Truncating %k to %lld ..",
INODE_PKEY(inode), inode->i_size);
/* don't update modification time */
@@ -366,7 +370,7 @@ static int finish_unfinished(struct super_block *s)
#ifdef CONFIG_QUOTA
/* Turn quotas off */
reiserfs_write_unlock(s);
- for (i = 0; i < MAXQUOTAS; i++) {
+ for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
if (sb_dqopt(s)->files[i] && quota_enabled[i])
dquot_quota_off(s, i);
}
@@ -1356,7 +1360,7 @@ static void handle_quota_files(struct super_block *s, char **qf_names,
{
int i;
- for (i = 0; i < MAXQUOTAS; i++) {
+ for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i])
kfree(REISERFS_SB(s)->s_qf_names[i]);
REISERFS_SB(s)->s_qf_names[i] = qf_names[i];
@@ -1377,7 +1381,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
struct reiserfs_journal *journal = SB_JOURNAL(s);
char *new_opts = kstrdup(arg, GFP_KERNEL);
int err;
- char *qf_names[MAXQUOTAS];
+ char *qf_names[REISERFS_MAXQUOTAS];
unsigned int qfmt = 0;
#ifdef CONFIG_QUOTA
int i;
@@ -1396,7 +1400,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
(s, arg, &mount_options, &blocks, NULL, &commit_max_age,
qf_names, &qfmt)) {
#ifdef CONFIG_QUOTA
- for (i = 0; i < MAXQUOTAS; i++)
+ for (i = 0; i < REISERFS_MAXQUOTAS; i++)
if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i])
kfree(qf_names[i]);
#endif
@@ -1577,7 +1581,7 @@ static int read_super_block(struct super_block *s, int offset)
rs = (struct reiserfs_super_block *)bh->b_data;
if (sb_blocksize(rs) != s->s_blocksize) {
reiserfs_warning(s, "sh-2011", "can't find a reiserfs "
- "filesystem on (dev %s, block %Lu, size %lu)",
+ "filesystem on (dev %s, block %llu, size %lu)",
s->s_id,
(unsigned long long)bh->b_blocknr,
s->s_blocksize);
@@ -1840,7 +1844,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
char *jdev_name;
struct reiserfs_sb_info *sbi;
int errval = -EINVAL;
- char *qf_names[MAXQUOTAS] = {};
+ char *qf_names[REISERFS_MAXQUOTAS] = {};
unsigned int qfmt = 0;
save_mount_options(s, data);
@@ -2165,7 +2169,7 @@ error_unlocked:
#ifdef CONFIG_QUOTA
{
int j;
- for (j = 0; j < MAXQUOTAS; j++)
+ for (j = 0; j < REISERFS_MAXQUOTAS; j++)
kfree(qf_names[j]);
}
#endif
@@ -2441,8 +2445,7 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
struct buffer_head tmp_bh, *bh;
if (!current->journal_info) {
- printk(KERN_WARNING "reiserfs: Quota write (off=%Lu, len=%Lu)"
- " cancelled because transaction is not started.\n",
+ printk(KERN_WARNING "reiserfs: Quota write (off=%llu, len=%llu) cancelled because transaction is not started.\n",
(unsigned long long)off, (unsigned long long)len);
return -EIO;
}
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index ca416d099e7d..7c36898af402 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -45,7 +45,7 @@
#include <linux/xattr.h>
#include "xattr.h"
#include "acl.h"
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/checksum.h>
#include <linux/stat.h>
#include <linux/quotaops.h>
@@ -84,6 +84,7 @@ static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
static int xattr_unlink(struct inode *dir, struct dentry *dentry)
{
int error;
+
BUG_ON(!mutex_is_locked(&dir->i_mutex));
mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
@@ -98,6 +99,7 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry)
static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
{
int error;
+
BUG_ON(!mutex_is_locked(&dir->i_mutex));
mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
@@ -117,6 +119,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags)
{
struct dentry *privroot = REISERFS_SB(sb)->priv_root;
struct dentry *xaroot;
+
if (!privroot->d_inode)
return ERR_PTR(-ENODATA);
@@ -127,6 +130,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags)
xaroot = ERR_PTR(-ENODATA);
else if (!xaroot->d_inode) {
int err = -ENODATA;
+
if (xattr_may_create(flags))
err = xattr_mkdir(privroot->d_inode, xaroot, 0700);
if (err) {
@@ -157,6 +161,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags)
xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf));
if (!IS_ERR(xadir) && !xadir->d_inode) {
int err = -ENODATA;
+
if (xattr_may_create(flags))
err = xattr_mkdir(xaroot->d_inode, xadir, 0700);
if (err) {
@@ -188,6 +193,7 @@ fill_with_dentries(void *buf, const char *name, int namelen, loff_t offset,
{
struct reiserfs_dentry_buf *dbuf = buf;
struct dentry *dentry;
+
WARN_ON_ONCE(!mutex_is_locked(&dbuf->xadir->d_inode->i_mutex));
if (dbuf->count == ARRAY_SIZE(dbuf->dentries))
@@ -218,6 +224,7 @@ static void
cleanup_dentry_buf(struct reiserfs_dentry_buf *buf)
{
int i;
+
for (i = 0; i < buf->count; i++)
if (buf->dentries[i])
dput(buf->dentries[i]);
@@ -283,11 +290,13 @@ static int reiserfs_for_each_xattr(struct inode *inode,
int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 +
4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
struct reiserfs_transaction_handle th;
+
reiserfs_write_lock(inode->i_sb);
err = journal_begin(&th, inode->i_sb, blocks);
reiserfs_write_unlock(inode->i_sb);
if (!err) {
int jerror;
+
mutex_lock_nested(&dir->d_parent->d_inode->i_mutex,
I_MUTEX_XATTR);
err = action(dir, data);
@@ -340,6 +349,7 @@ static int chown_one_xattr(struct dentry *dentry, void *data)
int reiserfs_delete_xattrs(struct inode *inode)
{
int err = reiserfs_for_each_xattr(inode, delete_one_xattr, NULL);
+
if (err)
reiserfs_warning(inode->i_sb, "jdm-20004",
"Couldn't delete all xattrs (%d)\n", err);
@@ -350,6 +360,7 @@ int reiserfs_delete_xattrs(struct inode *inode)
int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs)
{
int err = reiserfs_for_each_xattr(inode, chown_one_xattr, attrs);
+
if (err)
reiserfs_warning(inode->i_sb, "jdm-20007",
"Couldn't chown all xattrs (%d)\n", err);
@@ -439,6 +450,7 @@ int reiserfs_commit_write(struct file *f, struct page *page,
static void update_ctime(struct inode *inode)
{
struct timespec now = current_fs_time(inode->i_sb);
+
if (inode_unhashed(inode) || !inode->i_nlink ||
timespec_equal(&inode->i_ctime, &now))
return;
@@ -514,6 +526,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
size_t chunk;
size_t skip = 0;
size_t page_offset = (file_pos & (PAGE_CACHE_SIZE - 1));
+
if (buffer_size - buffer_pos > PAGE_CACHE_SIZE)
chunk = PAGE_CACHE_SIZE;
else
@@ -530,6 +543,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
if (file_pos == 0) {
struct reiserfs_xattr_header *rxh;
+
skip = file_pos = sizeof(struct reiserfs_xattr_header);
if (chunk + skip > PAGE_CACHE_SIZE)
chunk = PAGE_CACHE_SIZE - skip;
@@ -659,6 +673,7 @@ reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer,
size_t chunk;
char *data;
size_t skip = 0;
+
if (isize - file_pos > PAGE_CACHE_SIZE)
chunk = PAGE_CACHE_SIZE;
else
@@ -792,6 +807,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
int reiserfs_removexattr(struct dentry *dentry, const char *name)
{
const struct xattr_handler *handler;
+
handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
@@ -813,9 +829,11 @@ static int listxattr_filler(void *buf, const char *name, int namelen,
{
struct listxattr_buf *b = (struct listxattr_buf *)buf;
size_t size;
+
if (name[0] != '.' ||
(namelen != 1 && (name[1] != '.' || namelen != 2))) {
const struct xattr_handler *handler;
+
handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr,
name);
if (!handler) /* Unsupported xattr name */
@@ -885,6 +903,7 @@ static int create_privroot(struct dentry *dentry)
{
int err;
struct inode *inode = dentry->d_parent->d_inode;
+
WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
err = xattr_mkdir(inode, dentry, 0700);
@@ -1015,6 +1034,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
mutex_lock(&privroot->d_inode->i_mutex);
if (!REISERFS_SB(s)->xattr_root) {
struct dentry *dentry;
+
dentry = lookup_one_len(XAROOT_NAME, privroot,
strlen(XAROOT_NAME));
if (!IS_ERR(dentry))
diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h
index 857ec7e3016f..f620e9678dd5 100644
--- a/fs/reiserfs/xattr.h
+++ b/fs/reiserfs/xattr.h
@@ -7,7 +7,6 @@ struct inode;
struct dentry;
struct iattr;
struct super_block;
-struct nameidata;
int reiserfs_xattr_register_handlers(void) __init;
void reiserfs_xattr_unregister_handlers(void);
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 44503e293790..4b34b9dc03dd 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -9,7 +9,7 @@
#include <linux/posix_acl_xattr.h>
#include "xattr.h"
#include "acl.h"
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th,
struct inode *inode, int type,
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 800a3cef6f62..e7f8939a4cb5 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -6,7 +6,7 @@
#include <linux/slab.h>
#include "xattr.h"
#include <linux/security.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
static int
security_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index a0035719f66b..5eeb0c48ba46 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -5,7 +5,7 @@
#include <linux/pagemap.h>
#include <linux/xattr.h>
#include "xattr.h"
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
static int
trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index 8667491ae7c3..e50eab046471 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -4,7 +4,7 @@
#include <linux/pagemap.h>
#include <linux/xattr.h>
#include "xattr.h"
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
static int
user_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index ef90e8bca95a..e98dd88197d5 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -56,6 +56,8 @@
* 2 of the Licence, or (at your option) any later version.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/string.h>
#include <linux/fs.h>
@@ -380,7 +382,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
eio:
ret = -EIO;
error:
- printk(KERN_ERR "ROMFS: read error for inode 0x%lx\n", pos);
+ pr_err("read error for inode 0x%lx\n", pos);
return ERR_PTR(ret);
}
@@ -390,6 +392,7 @@ error:
static struct inode *romfs_alloc_inode(struct super_block *sb)
{
struct romfs_inode_info *inode;
+
inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
return inode ? &inode->vfs_inode : NULL;
}
@@ -400,6 +403,7 @@ static struct inode *romfs_alloc_inode(struct super_block *sb)
static void romfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
+
kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
}
@@ -507,15 +511,13 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent)
if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 ||
img_size < ROMFH_SIZE) {
if (!silent)
- printk(KERN_WARNING "VFS:"
- " Can't find a romfs filesystem on dev %s.\n",
+ pr_warn("VFS: Can't find a romfs filesystem on dev %s.\n",
sb->s_id);
goto error_rsb_inval;
}
if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) {
- printk(KERN_ERR "ROMFS: bad initial checksum on dev %s.\n",
- sb->s_id);
+ pr_err("bad initial checksum on dev %s.\n", sb->s_id);
goto error_rsb_inval;
}
@@ -523,8 +525,8 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent)
len = strnlen(rsb->name, ROMFS_MAXFN);
if (!silent)
- printk(KERN_NOTICE "ROMFS: Mounting image '%*.*s' through %s\n",
- (unsigned) len, (unsigned) len, rsb->name, storage);
+ pr_notice("Mounting image '%*.*s' through %s\n",
+ (unsigned) len, (unsigned) len, rsb->name, storage);
kfree(rsb);
rsb = NULL;
@@ -614,7 +616,7 @@ static int __init init_romfs_fs(void)
{
int ret;
- printk(KERN_INFO "ROMFS MTD (C) 2007 Red Hat, Inc.\n");
+ pr_info("ROMFS MTD (C) 2007 Red Hat, Inc.\n");
romfs_inode_cachep =
kmem_cache_create("romfs_i",
@@ -623,13 +625,12 @@ static int __init init_romfs_fs(void)
romfs_i_init_once);
if (!romfs_inode_cachep) {
- printk(KERN_ERR
- "ROMFS error: Failed to initialise inode cache\n");
+ pr_err("Failed to initialise inode cache\n");
return -ENOMEM;
}
ret = register_filesystem(&romfs_fs_type);
if (ret) {
- printk(KERN_ERR "ROMFS error: Failed to register filesystem\n");
+ pr_err("Failed to register filesystem\n");
goto error_register;
}
return 0;
diff --git a/fs/splice.c b/fs/splice.c
index f5cb9ba84510..75c6058eabf2 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1330,6 +1330,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
return ret;
}
+EXPORT_SYMBOL(do_splice_direct);
static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
struct pipe_inode_info *opipe,
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index 62a0de6632e1..43e7a7eddac0 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -44,7 +44,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
pages = end_index - start_index + 1;
- page = kmalloc(sizeof(void *) * pages, GFP_KERNEL);
+ page = kmalloc_array(pages, sizeof(void *), GFP_KERNEL);
if (page == NULL)
return res;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 031c8d67fd51..5056babe00df 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -27,6 +27,8 @@
* the filesystem.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/slab.h>
@@ -448,8 +450,7 @@ static int __init init_squashfs_fs(void)
return err;
}
- printk(KERN_INFO "squashfs: version 4.0 (2009/01/31) "
- "Phillip Lougher\n");
+ pr_info("version 4.0 (2009/01/31) Phillip Lougher\n");
return 0;
}
diff --git a/fs/super.c b/fs/super.c
index d20d5b11dedf..eae088f6aaae 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -22,7 +22,6 @@
#include <linux/export.h>
#include <linux/slab.h>
-#include <linux/acct.h>
#include <linux/blkdev.h>
#include <linux/mount.h>
#include <linux/security.h>
@@ -81,6 +80,8 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid);
dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid);
total_objects = dentries + inodes + fs_objects + 1;
+ if (!total_objects)
+ total_objects = 1;
/* proportion the scan between the caches */
dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
@@ -176,7 +177,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
goto fail;
for (i = 0; i < SB_FREEZE_LEVELS; i++) {
- if (percpu_counter_init(&s->s_writers.counter[i], 0) < 0)
+ if (percpu_counter_init(&s->s_writers.counter[i], 0,
+ GFP_KERNEL) < 0)
goto fail;
lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
&type->s_writers_key[i], 0);
@@ -218,7 +220,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
mutex_init(&s->s_dquot.dqio_mutex);
mutex_init(&s->s_dquot.dqonoff_mutex);
- init_rwsem(&s->s_dquot.dqptr_sem);
s->s_maxbytes = MAX_NON_LFS;
s->s_op = &default_op;
s->s_time_gran = 1000000000;
@@ -702,12 +703,22 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
return -EACCES;
#endif
- if (flags & MS_RDONLY)
- acct_auto_close(sb);
- shrink_dcache_sb(sb);
-
remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
+ if (remount_ro) {
+ if (sb->s_pins.first) {
+ up_write(&sb->s_umount);
+ sb_pin_kill(sb);
+ down_write(&sb->s_umount);
+ if (!sb->s_root)
+ return 0;
+ if (sb->s_writers.frozen != SB_UNFROZEN)
+ return -EBUSY;
+ remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
+ }
+ }
+ shrink_dcache_sb(sb);
+
/* If we are remounting RDONLY and current sb is read/write,
make sure there are no rw files opened */
if (remount_ro) {
diff --git a/fs/sync.c b/fs/sync.c
index b28d1dd10e8b..bdc729d80e5e 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -65,7 +65,7 @@ int sync_filesystem(struct super_block *sb)
return ret;
return __sync_filesystem(sb, 1);
}
-EXPORT_SYMBOL_GPL(sync_filesystem);
+EXPORT_SYMBOL(sync_filesystem);
static void sync_inodes_one_sb(struct super_block *sb, void *arg)
{
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 80c350216ea8..b46ffa94372a 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -333,8 +333,7 @@ static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg
spin_lock_irq(&ctx->wqh.lock);
if (!timerfd_canceled(ctx)) {
ctx->ticks = ticks;
- if (ticks)
- wake_up_locked(&ctx->wqh);
+ wake_up_locked(&ctx->wqh);
} else
ret = -ECANCELED;
spin_unlock_irq(&ctx->wqh.lock);
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index ff8229340cd5..26b69b2d4a45 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -166,15 +166,10 @@ static int do_commit(struct ubifs_info *c)
err = ubifs_orphan_end_commit(c);
if (err)
goto out;
- old_ltail_lnum = c->ltail_lnum;
- err = ubifs_log_end_commit(c, new_ltail_lnum);
- if (err)
- goto out;
err = dbg_check_old_index(c, &zroot);
if (err)
goto out;
- mutex_lock(&c->mst_mutex);
c->mst_node->cmt_no = cpu_to_le64(c->cmt_no);
c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum);
c->mst_node->root_lnum = cpu_to_le32(zroot.lnum);
@@ -203,8 +198,9 @@ static int do_commit(struct ubifs_info *c)
c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
else
c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_NO_ORPHS);
- err = ubifs_write_master(c);
- mutex_unlock(&c->mst_mutex);
+
+ old_ltail_lnum = c->ltail_lnum;
+ err = ubifs_log_end_commit(c, new_ltail_lnum);
if (err)
goto out;
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 177b0152fef4..7ed13e1e216a 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -334,9 +334,9 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node)
pr_err("\tkey_fmt %d (%s)\n",
(int)sup->key_fmt, get_key_fmt(sup->key_fmt));
pr_err("\tflags %#x\n", sup_flags);
- pr_err("\t big_lpt %u\n",
+ pr_err("\tbig_lpt %u\n",
!!(sup_flags & UBIFS_FLG_BIGLPT));
- pr_err("\t space_fixup %u\n",
+ pr_err("\tspace_fixup %u\n",
!!(sup_flags & UBIFS_FLG_SPACE_FIXUP));
pr_err("\tmin_io_size %u\n", le32_to_cpu(sup->min_io_size));
pr_err("\tleb_size %u\n", le32_to_cpu(sup->leb_size));
@@ -2462,7 +2462,7 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write)
if (chance(1, 2)) {
d->pc_delay = 1;
- /* Fail withing 1 minute */
+ /* Fail within 1 minute */
delay = prandom_u32() % 60000;
d->pc_timeout = jiffies;
d->pc_timeout += msecs_to_jiffies(delay);
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 2290d5866725..fb08b0c514b6 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -431,7 +431,7 @@ void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last)
/**
* wbuf_timer_callback - write-buffer timer callback function.
- * @data: timer data (write-buffer descriptor)
+ * @timer: timer data (write-buffer descriptor)
*
* This function is called when the write-buffer timer expires.
*/
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 0e045e75abd8..fb166e204441 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -546,15 +546,14 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
int aligned_dlen, aligned_ilen, sync = IS_DIRSYNC(dir);
int last_reference = !!(deletion && inode->i_nlink == 0);
struct ubifs_inode *ui = ubifs_inode(inode);
- struct ubifs_inode *dir_ui = ubifs_inode(dir);
+ struct ubifs_inode *host_ui = ubifs_inode(dir);
struct ubifs_dent_node *dent;
struct ubifs_ino_node *ino;
union ubifs_key dent_key, ino_key;
dbg_jnl("ino %lu, dent '%.*s', data len %d in dir ino %lu",
inode->i_ino, nm->len, nm->name, ui->data_len, dir->i_ino);
- ubifs_assert(dir_ui->data_len == 0);
- ubifs_assert(mutex_is_locked(&dir_ui->ui_mutex));
+ ubifs_assert(mutex_is_locked(&host_ui->ui_mutex));
dlen = UBIFS_DENT_NODE_SZ + nm->len + 1;
ilen = UBIFS_INO_NODE_SZ;
@@ -658,7 +657,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
ui->synced_i_size = ui->ui_size;
spin_unlock(&ui->ui_lock);
mark_inode_clean(c, ui);
- mark_inode_clean(c, dir_ui);
+ mark_inode_clean(c, host_ui);
return 0;
out_finish:
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index a902c5919e42..c14628fbeee2 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -106,10 +106,14 @@ static inline long long empty_log_bytes(const struct ubifs_info *c)
h = (long long)c->lhead_lnum * c->leb_size + c->lhead_offs;
t = (long long)c->ltail_lnum * c->leb_size;
- if (h >= t)
+ if (h > t)
return c->log_bytes - h + t;
- else
+ else if (h != t)
return t - h;
+ else if (c->lhead_lnum != c->ltail_lnum)
+ return 0;
+ else
+ return c->log_bytes;
}
/**
@@ -240,6 +244,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
if (c->lhead_offs > c->leb_size - c->ref_node_alsz) {
c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
+ ubifs_assert(c->lhead_lnum != c->ltail_lnum);
c->lhead_offs = 0;
}
@@ -404,15 +409,14 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
/* Switch to the next log LEB */
if (c->lhead_offs) {
c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
+ ubifs_assert(c->lhead_lnum != c->ltail_lnum);
c->lhead_offs = 0;
}
- if (c->lhead_offs == 0) {
- /* Must ensure next LEB has been unmapped */
- err = ubifs_leb_unmap(c, c->lhead_lnum);
- if (err)
- goto out;
- }
+ /* Must ensure next LEB has been unmapped */
+ err = ubifs_leb_unmap(c, c->lhead_lnum);
+ if (err)
+ goto out;
len = ALIGN(len, c->min_io_size);
dbg_log("writing commit start at LEB %d:0, len %d", c->lhead_lnum, len);
@@ -447,9 +451,9 @@ out:
* @ltail_lnum: new log tail LEB number
*
* This function is called on when the commit operation was finished. It
- * moves log tail to new position and unmaps LEBs which contain obsolete data.
- * Returns zero in case of success and a negative error code in case of
- * failure.
+ * moves log tail to new position and updates the master node so that it stores
+ * the new log tail LEB number. Returns zero in case of success and a negative
+ * error code in case of failure.
*/
int ubifs_log_end_commit(struct ubifs_info *c, int ltail_lnum)
{
@@ -477,7 +481,12 @@ int ubifs_log_end_commit(struct ubifs_info *c, int ltail_lnum)
spin_unlock(&c->buds_lock);
err = dbg_check_bud_bytes(c);
+ if (err)
+ goto out;
+
+ err = ubifs_write_master(c);
+out:
mutex_unlock(&c->log_mutex);
return err;
}
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index d46b19ec1815..421bd0a80424 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1464,7 +1464,6 @@ struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum)
return ERR_CAST(nnode);
}
iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
- shft -= UBIFS_LPT_FANOUT_SHIFT;
pnode = ubifs_get_pnode(c, nnode, iip);
if (IS_ERR(pnode))
return ERR_CAST(pnode);
@@ -1604,7 +1603,6 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum)
return ERR_CAST(nnode);
}
iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
- shft -= UBIFS_LPT_FANOUT_SHIFT;
pnode = ubifs_get_pnode(c, nnode, iip);
if (IS_ERR(pnode))
return ERR_CAST(pnode);
@@ -1964,7 +1962,6 @@ again:
}
}
iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
- shft -= UBIFS_LPT_FANOUT_SHIFT;
pnode = scan_get_pnode(c, path + h, nnode, iip);
if (IS_ERR(pnode)) {
err = PTR_ERR(pnode);
@@ -2198,6 +2195,7 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
lprops->dirty);
return -EINVAL;
}
+ break;
case LPROPS_FREEABLE:
case LPROPS_FRDI_IDX:
if (lprops->free + lprops->dirty != c->leb_size) {
@@ -2206,6 +2204,7 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
lprops->dirty);
return -EINVAL;
}
+ break;
}
}
return 0;
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 45d4e96a6bac..d9c02928e992 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -304,7 +304,6 @@ static int layout_cnodes(struct ubifs_info *c)
ubifs_assert(lnum >= c->lpt_first &&
lnum <= c->lpt_last);
}
- done_ltab = 1;
c->ltab_lnum = lnum;
c->ltab_offs = offs;
offs += c->ltab_sz;
@@ -514,7 +513,6 @@ static int write_cnodes(struct ubifs_info *c)
if (err)
return err;
}
- done_ltab = 1;
ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
offs += c->ltab_sz;
dbg_chk_lpt_sz(c, 1, c->ltab_sz);
@@ -1941,6 +1939,11 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
pr_err("LEB %d:%d, nnode, ",
lnum, offs);
err = ubifs_unpack_nnode(c, p, &nnode);
+ if (err) {
+ pr_err("failed to unpack_node, error %d\n",
+ err);
+ break;
+ }
for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
pr_cont("%d:%d", nnode.nbranch[i].lnum,
nnode.nbranch[i].offs);
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index ab83ace9910a..1a4bb9e8b3b8 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -352,10 +352,9 @@ int ubifs_read_master(struct ubifs_info *c)
* ubifs_write_master - write master node.
* @c: UBIFS file-system description object
*
- * This function writes the master node. The caller has to take the
- * @c->mst_mutex lock before calling this function. Returns zero in case of
- * success and a negative error code in case of failure. The master node is
- * written twice to enable recovery.
+ * This function writes the master node. Returns zero in case of success and a
+ * negative error code in case of failure. The master node is written twice to
+ * enable recovery.
*/
int ubifs_write_master(struct ubifs_info *c)
{
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index f1c3e5a1b315..4409f486ecef 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -346,7 +346,6 @@ static int write_orph_nodes(struct ubifs_info *c, int atomic)
int lnum;
/* Unmap any unused LEBs after consolidation */
- lnum = c->ohead_lnum + 1;
for (lnum = c->ohead_lnum + 1; lnum <= c->orph_last; lnum++) {
err = ubifs_leb_unmap(c, lnum);
if (err)
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index c14adb2f420c..c640938f62f0 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -596,7 +596,6 @@ static void drop_last_group(struct ubifs_scan_leb *sleb, int *offs)
* drop_last_node - drop the last node.
* @sleb: scanned LEB information
* @offs: offset of dropped nodes is returned here
- * @grouped: non-zero if whole group of nodes have to be dropped
*
* This is a helper function for 'ubifs_recover_leb()' which drops the last
* node of the scanned LEB.
@@ -629,8 +628,8 @@ static void drop_last_node(struct ubifs_scan_leb *sleb, int *offs)
*
* This function does a scan of a LEB, but caters for errors that might have
* been caused by the unclean unmount from which we are attempting to recover.
- * Returns %0 in case of success, %-EUCLEAN if an unrecoverable corruption is
- * found, and a negative error code in case of failure.
+ * Returns the scanned information on success and a negative error code on
+ * failure.
*/
struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
int offs, void *sbuf, int jhead)
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 4c37607a958e..79c6dbbc0e04 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -332,6 +332,8 @@ static int create_default_filesystem(struct ubifs_info *c)
cs->ch.node_type = UBIFS_CS_NODE;
err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, 0);
kfree(cs);
+ if (err)
+ return err;
ubifs_msg("default file-system created");
return 0;
@@ -447,7 +449,7 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup)
goto failed;
}
- if (c->default_compr < 0 || c->default_compr >= UBIFS_COMPR_TYPES_CNT) {
+ if (c->default_compr >= UBIFS_COMPR_TYPES_CNT) {
err = 13;
goto failed;
}
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 58aa05df2bb6..89adbc4d08ac 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -131,7 +131,8 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
* @offs: offset to start at (usually zero)
* @sbuf: scan buffer (must be c->leb_size)
*
- * This function returns %0 on success and a negative error code on failure.
+ * This function returns the scanned information on success and a negative error
+ * code on failure.
*/
struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
int offs, void *sbuf)
@@ -157,9 +158,10 @@ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
return ERR_PTR(err);
}
- if (err == -EBADMSG)
- sleb->ecc = 1;
-
+ /*
+ * Note, we ignore integrity errors (EBASMSG) because all the nodes are
+ * protected by CRC checksums.
+ */
return sleb;
}
@@ -169,8 +171,6 @@ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
* @sleb: scanning information
* @lnum: logical eraseblock number
* @offs: offset to start at (usually zero)
- *
- * This function returns %0 on success and a negative error code on failure.
*/
void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
int lnum, int offs)
@@ -257,7 +257,7 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
* @quiet: print no messages
*
* This function scans LEB number @lnum and returns complete information about
- * its contents. Returns the scaned information in case of success and,
+ * its contents. Returns the scanned information in case of success and,
* %-EUCLEAN if the LEB neads recovery, and other negative error codes in case
* of failure.
*
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 3904c8574ef9..106bf20629ce 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -75,7 +75,7 @@ static int validate_inode(struct ubifs_info *c, const struct inode *inode)
return 1;
}
- if (ui->compr_type < 0 || ui->compr_type >= UBIFS_COMPR_TYPES_CNT) {
+ if (ui->compr_type >= UBIFS_COMPR_TYPES_CNT) {
ubifs_err("unknown compression type %d", ui->compr_type);
return 2;
}
@@ -424,19 +424,19 @@ static int ubifs_show_options(struct seq_file *s, struct dentry *root)
struct ubifs_info *c = root->d_sb->s_fs_info;
if (c->mount_opts.unmount_mode == 2)
- seq_printf(s, ",fast_unmount");
+ seq_puts(s, ",fast_unmount");
else if (c->mount_opts.unmount_mode == 1)
- seq_printf(s, ",norm_unmount");
+ seq_puts(s, ",norm_unmount");
if (c->mount_opts.bulk_read == 2)
- seq_printf(s, ",bulk_read");
+ seq_puts(s, ",bulk_read");
else if (c->mount_opts.bulk_read == 1)
- seq_printf(s, ",no_bulk_read");
+ seq_puts(s, ",no_bulk_read");
if (c->mount_opts.chk_data_crc == 2)
- seq_printf(s, ",chk_data_crc");
+ seq_puts(s, ",chk_data_crc");
else if (c->mount_opts.chk_data_crc == 1)
- seq_printf(s, ",no_chk_data_crc");
+ seq_puts(s, ",no_chk_data_crc");
if (c->mount_opts.override_compr) {
seq_printf(s, ",compr=%s",
@@ -796,8 +796,8 @@ static int alloc_wbufs(struct ubifs_info *c)
{
int i, err;
- c->jheads = kzalloc(c->jhead_cnt * sizeof(struct ubifs_jhead),
- GFP_KERNEL);
+ c->jheads = kcalloc(c->jhead_cnt, sizeof(struct ubifs_jhead),
+ GFP_KERNEL);
if (!c->jheads)
return -ENOMEM;
@@ -1963,7 +1963,6 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi)
mutex_init(&c->lp_mutex);
mutex_init(&c->tnc_mutex);
mutex_init(&c->log_mutex);
- mutex_init(&c->mst_mutex);
mutex_init(&c->umount_mutex);
mutex_init(&c->bu_mutex);
mutex_init(&c->write_reserve_mutex);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 8a40cf9c02d7..6793db0754f6 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -3294,7 +3294,6 @@ int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
goto out_unlock;
if (err) {
- err = -EINVAL;
key = &from_key;
goto out_dump;
}
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 3600994f8411..7a205e046776 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -389,7 +389,6 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
ubifs_dump_lprops(c);
}
/* Try to commit anyway */
- err = 0;
break;
}
p++;
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index c1f71fe17cc0..c4fe900c67ab 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -314,7 +314,6 @@ struct ubifs_scan_node {
* @nodes_cnt: number of nodes scanned
* @nodes: list of struct ubifs_scan_node
* @endpt: end point (and therefore the start of empty space)
- * @ecc: read returned -EBADMSG
* @buf: buffer containing entire LEB scanned
*/
struct ubifs_scan_leb {
@@ -322,7 +321,6 @@ struct ubifs_scan_leb {
int nodes_cnt;
struct list_head nodes;
int endpt;
- int ecc;
void *buf;
};
@@ -1051,7 +1049,6 @@ struct ubifs_debug_info;
*
* @mst_node: master node
* @mst_offs: offset of valid master node
- * @mst_mutex: protects the master node area, @mst_node, and @mst_offs
*
* @max_bu_buf_len: maximum bulk-read buffer length
* @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu
@@ -1292,7 +1289,6 @@ struct ubifs_info {
struct ubifs_mst_node *mst_node;
int mst_offs;
- struct mutex mst_mutex;
int max_bu_buf_len;
struct mutex bu_mutex;
diff --git a/fs/udf/file.c b/fs/udf/file.c
index d80738fdf424..bb15771b92ae 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -27,7 +27,7 @@
#include "udfdecl.h"
#include <linux/fs.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/string.h> /* memset */
#include <linux/capability.h>
@@ -100,24 +100,6 @@ static int udf_adinicb_write_begin(struct file *file,
return 0;
}
-static int udf_adinicb_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- struct inode *inode = mapping->host;
- unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
- char *kaddr;
- struct udf_inode_info *iinfo = UDF_I(inode);
-
- kaddr = kmap_atomic(page);
- memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr + offset,
- kaddr + offset, copied);
- kunmap_atomic(kaddr);
-
- return simple_write_end(file, mapping, pos, len, copied, page, fsdata);
-}
-
static ssize_t udf_adinicb_direct_IO(int rw, struct kiocb *iocb,
struct iov_iter *iter,
loff_t offset)
@@ -130,7 +112,7 @@ const struct address_space_operations udf_adinicb_aops = {
.readpage = udf_adinicb_readpage,
.writepage = udf_adinicb_writepage,
.write_begin = udf_adinicb_write_begin,
- .write_end = udf_adinicb_write_end,
+ .write_end = simple_write_end,
.direct_IO = udf_adinicb_direct_IO,
};
@@ -241,11 +223,18 @@ out:
static int udf_release_file(struct inode *inode, struct file *filp)
{
- if (filp->f_mode & FMODE_WRITE) {
+ if (filp->f_mode & FMODE_WRITE &&
+ atomic_read(&inode->i_writecount) > 1) {
+ /*
+ * Grab i_mutex to avoid races with writes changing i_size
+ * while we are running.
+ */
+ mutex_lock(&inode->i_mutex);
down_write(&UDF_I(inode)->i_data_sem);
udf_discard_prealloc(inode);
udf_truncate_tail_extent(inode);
up_write(&UDF_I(inode)->i_data_sem);
+ mutex_unlock(&inode->i_mutex);
}
return 0;
}
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 6eaf5edf1ea1..e77db621ec89 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -45,7 +45,7 @@ void udf_free_inode(struct inode *inode)
udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1);
}
-struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
+struct inode *udf_new_inode(struct inode *dir, umode_t mode)
{
struct super_block *sb = dir->i_sb;
struct udf_sb_info *sbi = UDF_SB(sb);
@@ -55,14 +55,12 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
struct udf_inode_info *iinfo;
struct udf_inode_info *dinfo = UDF_I(dir);
struct logicalVolIntegrityDescImpUse *lvidiu;
+ int err;
inode = new_inode(sb);
- if (!inode) {
- *err = -ENOMEM;
- return NULL;
- }
- *err = -ENOSPC;
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
iinfo = UDF_I(inode);
if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) {
@@ -80,21 +78,22 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
}
if (!iinfo->i_ext.i_data) {
iput(inode);
- *err = -ENOMEM;
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
+ err = -ENOSPC;
block = udf_new_block(dir->i_sb, NULL,
dinfo->i_location.partitionReferenceNum,
- start, err);
- if (*err) {
+ start, &err);
+ if (err) {
iput(inode);
- return NULL;
+ return ERR_PTR(err);
}
lvidiu = udf_sb_lvidiu(sb);
if (lvidiu) {
iinfo->i_unique = lvid_get_unique_id(sb);
+ inode->i_generation = iinfo->i_unique;
mutex_lock(&sbi->s_alloc_mutex);
if (S_ISDIR(mode))
le32_add_cpu(&lvidiu->numDirs, 1);
@@ -123,9 +122,12 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
inode->i_mtime = inode->i_atime = inode->i_ctime =
iinfo->i_crtime = current_fs_time(inode->i_sb);
- insert_inode_hash(inode);
+ if (unlikely(insert_inode_locked(inode) < 0)) {
+ make_bad_inode(inode);
+ iput(inode);
+ return ERR_PTR(-EIO);
+ }
mark_inode_dirty(inode);
- *err = 0;
return inode;
}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 236cd48184c2..c9b4df5810d5 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -51,7 +51,6 @@ MODULE_LICENSE("GPL");
static umode_t udf_convert_permissions(struct fileEntry *);
static int udf_update_inode(struct inode *, int);
-static void udf_fill_inode(struct inode *, struct buffer_head *);
static int udf_sync_inode(struct inode *inode);
static int udf_alloc_i_data(struct inode *inode, size_t size);
static sector_t inode_getblk(struct inode *, sector_t, int *, int *);
@@ -1271,12 +1270,33 @@ update_time:
return 0;
}
-static void __udf_read_inode(struct inode *inode)
+/*
+ * Maximum length of linked list formed by ICB hierarchy. The chosen number is
+ * arbitrary - just that we hopefully don't limit any real use of rewritten
+ * inode on write-once media but avoid looping for too long on corrupted media.
+ */
+#define UDF_MAX_ICB_NESTING 1024
+
+static int udf_read_inode(struct inode *inode, bool hidden_inode)
{
struct buffer_head *bh = NULL;
struct fileEntry *fe;
+ struct extendedFileEntry *efe;
uint16_t ident;
struct udf_inode_info *iinfo = UDF_I(inode);
+ struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
+ struct kernel_lb_addr *iloc = &iinfo->i_location;
+ unsigned int link_count;
+ unsigned int indirections = 0;
+ int ret = -EIO;
+
+reread:
+ if (iloc->logicalBlockNum >=
+ sbi->s_partmaps[iloc->partitionReferenceNum].s_partition_len) {
+ udf_debug("block=%d, partition=%d out of range\n",
+ iloc->logicalBlockNum, iloc->partitionReferenceNum);
+ return -EIO;
+ }
/*
* Set defaults, but the inode is still incomplete!
@@ -1290,78 +1310,54 @@ static void __udf_read_inode(struct inode *inode)
* i_nlink = 1
* i_op = NULL;
*/
- bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident);
+ bh = udf_read_ptagged(inode->i_sb, iloc, 0, &ident);
if (!bh) {
udf_err(inode->i_sb, "(ino %ld) failed !bh\n", inode->i_ino);
- make_bad_inode(inode);
- return;
+ return -EIO;
}
if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE &&
ident != TAG_IDENT_USE) {
udf_err(inode->i_sb, "(ino %ld) failed ident=%d\n",
inode->i_ino, ident);
- brelse(bh);
- make_bad_inode(inode);
- return;
+ goto out;
}
fe = (struct fileEntry *)bh->b_data;
+ efe = (struct extendedFileEntry *)bh->b_data;
if (fe->icbTag.strategyType == cpu_to_le16(4096)) {
struct buffer_head *ibh;
- ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1,
- &ident);
+ ibh = udf_read_ptagged(inode->i_sb, iloc, 1, &ident);
if (ident == TAG_IDENT_IE && ibh) {
- struct buffer_head *nbh = NULL;
struct kernel_lb_addr loc;
struct indirectEntry *ie;
ie = (struct indirectEntry *)ibh->b_data;
loc = lelb_to_cpu(ie->indirectICB.extLocation);
- if (ie->indirectICB.extLength &&
- (nbh = udf_read_ptagged(inode->i_sb, &loc, 0,
- &ident))) {
- if (ident == TAG_IDENT_FE ||
- ident == TAG_IDENT_EFE) {
- memcpy(&iinfo->i_location,
- &loc,
- sizeof(struct kernel_lb_addr));
- brelse(bh);
- brelse(ibh);
- brelse(nbh);
- __udf_read_inode(inode);
- return;
+ if (ie->indirectICB.extLength) {
+ brelse(ibh);
+ memcpy(&iinfo->i_location, &loc,
+ sizeof(struct kernel_lb_addr));
+ if (++indirections > UDF_MAX_ICB_NESTING) {
+ udf_err(inode->i_sb,
+ "too many ICBs in ICB hierarchy"
+ " (max %d supported)\n",
+ UDF_MAX_ICB_NESTING);
+ goto out;
}
- brelse(nbh);
+ brelse(bh);
+ goto reread;
}
}
brelse(ibh);
} else if (fe->icbTag.strategyType != cpu_to_le16(4)) {
udf_err(inode->i_sb, "unsupported strategy type: %d\n",
le16_to_cpu(fe->icbTag.strategyType));
- brelse(bh);
- make_bad_inode(inode);
- return;
+ goto out;
}
- udf_fill_inode(inode, bh);
-
- brelse(bh);
-}
-
-static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
-{
- struct fileEntry *fe;
- struct extendedFileEntry *efe;
- struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
- struct udf_inode_info *iinfo = UDF_I(inode);
- unsigned int link_count;
-
- fe = (struct fileEntry *)bh->b_data;
- efe = (struct extendedFileEntry *)bh->b_data;
-
if (fe->icbTag.strategyType == cpu_to_le16(4))
iinfo->i_strat4096 = 0;
else /* if (fe->icbTag.strategyType == cpu_to_le16(4096)) */
@@ -1378,11 +1374,10 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) {
iinfo->i_efe = 1;
iinfo->i_use = 0;
- if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
- sizeof(struct extendedFileEntry))) {
- make_bad_inode(inode);
- return;
- }
+ ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+ sizeof(struct extendedFileEntry));
+ if (ret)
+ goto out;
memcpy(iinfo->i_ext.i_data,
bh->b_data + sizeof(struct extendedFileEntry),
inode->i_sb->s_blocksize -
@@ -1390,11 +1385,10 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
} else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) {
iinfo->i_efe = 0;
iinfo->i_use = 0;
- if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
- sizeof(struct fileEntry))) {
- make_bad_inode(inode);
- return;
- }
+ ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+ sizeof(struct fileEntry));
+ if (ret)
+ goto out;
memcpy(iinfo->i_ext.i_data,
bh->b_data + sizeof(struct fileEntry),
inode->i_sb->s_blocksize - sizeof(struct fileEntry));
@@ -1404,18 +1398,18 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
iinfo->i_lenAlloc = le32_to_cpu(
((struct unallocSpaceEntry *)bh->b_data)->
lengthAllocDescs);
- if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
- sizeof(struct unallocSpaceEntry))) {
- make_bad_inode(inode);
- return;
- }
+ ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+ sizeof(struct unallocSpaceEntry));
+ if (ret)
+ goto out;
memcpy(iinfo->i_ext.i_data,
bh->b_data + sizeof(struct unallocSpaceEntry),
inode->i_sb->s_blocksize -
sizeof(struct unallocSpaceEntry));
- return;
+ return 0;
}
+ ret = -EIO;
read_lock(&sbi->s_cred_lock);
i_uid_write(inode, le32_to_cpu(fe->uid));
if (!uid_valid(inode->i_uid) ||
@@ -1441,8 +1435,13 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
read_unlock(&sbi->s_cred_lock);
link_count = le16_to_cpu(fe->fileLinkCount);
- if (!link_count)
+ if (!link_count) {
+ if (!hidden_inode) {
+ ret = -ESTALE;
+ goto out;
+ }
link_count = 1;
+ }
set_nlink(inode, link_count);
inode->i_size = le64_to_cpu(fe->informationLength);
@@ -1488,6 +1487,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs);
iinfo->i_checkpoint = le32_to_cpu(efe->checkpoint);
}
+ inode->i_generation = iinfo->i_unique;
switch (fe->icbTag.fileType) {
case ICBTAG_FILE_TYPE_DIRECTORY:
@@ -1537,8 +1537,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
default:
udf_err(inode->i_sb, "(ino %ld) failed unknown file type=%d\n",
inode->i_ino, fe->icbTag.fileType);
- make_bad_inode(inode);
- return;
+ goto out;
}
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
struct deviceSpec *dsea =
@@ -1549,8 +1548,12 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
le32_to_cpu(dsea->minorDeviceIdent)));
/* Developer ID ??? */
} else
- make_bad_inode(inode);
+ goto out;
}
+ ret = 0;
+out:
+ brelse(bh);
+ return ret;
}
static int udf_alloc_i_data(struct inode *inode, size_t size)
@@ -1664,7 +1667,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
FE_PERM_U_DELETE | FE_PERM_U_CHATTR));
fe->permissions = cpu_to_le32(udfperms);
- if (S_ISDIR(inode->i_mode))
+ if (S_ISDIR(inode->i_mode) && inode->i_nlink > 0)
fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1);
else
fe->fileLinkCount = cpu_to_le16(inode->i_nlink);
@@ -1826,36 +1829,28 @@ out:
return err;
}
-struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino)
+struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino,
+ bool hidden_inode)
{
unsigned long block = udf_get_lb_pblock(sb, ino, 0);
struct inode *inode = iget_locked(sb, block);
+ int err;
if (!inode)
- return NULL;
-
- if (inode->i_state & I_NEW) {
- memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr));
- __udf_read_inode(inode);
- unlock_new_inode(inode);
- }
+ return ERR_PTR(-ENOMEM);
- if (is_bad_inode(inode))
- goto out_iput;
+ if (!(inode->i_state & I_NEW))
+ return inode;
- if (ino->logicalBlockNum >= UDF_SB(sb)->
- s_partmaps[ino->partitionReferenceNum].s_partition_len) {
- udf_debug("block=%d, partition=%d out of range\n",
- ino->logicalBlockNum, ino->partitionReferenceNum);
- make_bad_inode(inode);
- goto out_iput;
+ memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr));
+ err = udf_read_inode(inode, hidden_inode);
+ if (err < 0) {
+ iget_failed(inode);
+ return ERR_PTR(err);
}
+ unlock_new_inode(inode);
return inode;
-
- out_iput:
- iput(inode);
- return NULL;
}
int udf_add_aext(struct inode *inode, struct extent_position *epos,
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index 6583fe9b0645..6ad5a453af97 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -21,7 +21,7 @@
#include <linux/blkdev.h>
#include <linux/cdrom.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "udf_sb.h"
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 9737cba1357d..c12e260fd6c4 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -270,9 +270,8 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
NULL, 0),
};
inode = udf_iget(dir->i_sb, lb);
- if (!inode) {
- return ERR_PTR(-EACCES);
- }
+ if (IS_ERR(inode))
+ return inode;
} else
#endif /* UDF_RECOVERY */
@@ -285,9 +284,8 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
loc = lelb_to_cpu(cfi.icb.extLocation);
inode = udf_iget(dir->i_sb, &loc);
- if (!inode) {
- return ERR_PTR(-EACCES);
- }
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
}
return d_splice_alias(inode, dentry);
@@ -550,32 +548,18 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi,
return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL);
}
-static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- bool excl)
+static int udf_add_nondir(struct dentry *dentry, struct inode *inode)
{
+ struct udf_inode_info *iinfo = UDF_I(inode);
+ struct inode *dir = dentry->d_parent->d_inode;
struct udf_fileident_bh fibh;
- struct inode *inode;
struct fileIdentDesc cfi, *fi;
int err;
- struct udf_inode_info *iinfo;
-
- inode = udf_new_inode(dir, mode, &err);
- if (!inode) {
- return err;
- }
-
- iinfo = UDF_I(inode);
- if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
- inode->i_data.a_ops = &udf_adinicb_aops;
- else
- inode->i_data.a_ops = &udf_aops;
- inode->i_op = &udf_file_inode_operations;
- inode->i_fop = &udf_file_operations;
- mark_inode_dirty(inode);
fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
- if (!fi) {
+ if (unlikely(!fi)) {
inode_dec_link_count(inode);
+ unlock_new_inode(inode);
iput(inode);
return err;
}
@@ -589,23 +573,21 @@ static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
if (fibh.sbh != fibh.ebh)
brelse(fibh.ebh);
brelse(fibh.sbh);
+ unlock_new_inode(inode);
d_instantiate(dentry, inode);
return 0;
}
-static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ bool excl)
{
- struct inode *inode;
- struct udf_inode_info *iinfo;
- int err;
+ struct inode *inode = udf_new_inode(dir, mode);
- inode = udf_new_inode(dir, mode, &err);
- if (!inode)
- return err;
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
- iinfo = UDF_I(inode);
- if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+ if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
inode->i_data.a_ops = &udf_adinicb_aops;
else
inode->i_data.a_ops = &udf_aops;
@@ -613,7 +595,25 @@ static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
inode->i_fop = &udf_file_operations;
mark_inode_dirty(inode);
+ return udf_add_nondir(dentry, inode);
+}
+
+static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct inode *inode = udf_new_inode(dir, mode);
+
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+ inode->i_data.a_ops = &udf_adinicb_aops;
+ else
+ inode->i_data.a_ops = &udf_aops;
+ inode->i_op = &udf_file_inode_operations;
+ inode->i_fop = &udf_file_operations;
+ mark_inode_dirty(inode);
d_tmpfile(dentry, inode);
+ unlock_new_inode(inode);
return 0;
}
@@ -621,44 +621,16 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t rdev)
{
struct inode *inode;
- struct udf_fileident_bh fibh;
- struct fileIdentDesc cfi, *fi;
- int err;
- struct udf_inode_info *iinfo;
if (!old_valid_dev(rdev))
return -EINVAL;
- err = -EIO;
- inode = udf_new_inode(dir, mode, &err);
- if (!inode)
- goto out;
+ inode = udf_new_inode(dir, mode);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
- iinfo = UDF_I(inode);
init_special_inode(inode, mode, rdev);
- fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
- if (!fi) {
- inode_dec_link_count(inode);
- iput(inode);
- return err;
- }
- cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
- cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
- *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
- cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL);
- udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
- if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
- mark_inode_dirty(dir);
- mark_inode_dirty(inode);
-
- if (fibh.sbh != fibh.ebh)
- brelse(fibh.ebh);
- brelse(fibh.sbh);
- d_instantiate(dentry, inode);
- err = 0;
-
-out:
- return err;
+ return udf_add_nondir(dentry, inode);
}
static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
@@ -670,10 +642,9 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct udf_inode_info *dinfo = UDF_I(dir);
struct udf_inode_info *iinfo;
- err = -EIO;
- inode = udf_new_inode(dir, S_IFDIR | mode, &err);
- if (!inode)
- goto out;
+ inode = udf_new_inode(dir, S_IFDIR | mode);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
iinfo = UDF_I(inode);
inode->i_op = &udf_dir_inode_operations;
@@ -681,6 +652,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err);
if (!fi) {
inode_dec_link_count(inode);
+ unlock_new_inode(inode);
iput(inode);
goto out;
}
@@ -699,6 +671,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
if (!fi) {
clear_nlink(inode);
mark_inode_dirty(inode);
+ unlock_new_inode(inode);
iput(inode);
goto out;
}
@@ -710,6 +683,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
inc_nlink(dir);
mark_inode_dirty(dir);
+ unlock_new_inode(inode);
d_instantiate(dentry, inode);
if (fibh.sbh != fibh.ebh)
brelse(fibh.ebh);
@@ -876,14 +850,11 @@ out:
static int udf_symlink(struct inode *dir, struct dentry *dentry,
const char *symname)
{
- struct inode *inode;
+ struct inode *inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO);
struct pathComponent *pc;
const char *compstart;
- struct udf_fileident_bh fibh;
struct extent_position epos = {};
int eoffset, elen = 0;
- struct fileIdentDesc *fi;
- struct fileIdentDesc cfi;
uint8_t *ea;
int err;
int block;
@@ -892,9 +863,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
struct udf_inode_info *iinfo;
struct super_block *sb = dir->i_sb;
- inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
- if (!inode)
- goto out;
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
iinfo = UDF_I(inode);
down_write(&iinfo->i_data_sem);
@@ -1012,24 +982,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
mark_inode_dirty(inode);
up_write(&iinfo->i_data_sem);
- fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
- if (!fi)
- goto out_no_entry;
- cfi.icb.extLength = cpu_to_le32(sb->s_blocksize);
- cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
- if (UDF_SB(inode->i_sb)->s_lvid_bh) {
- *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
- cpu_to_le32(lvid_get_unique_id(sb));
- }
- udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
- if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
- mark_inode_dirty(dir);
- if (fibh.sbh != fibh.ebh)
- brelse(fibh.ebh);
- brelse(fibh.sbh);
- d_instantiate(dentry, inode);
- err = 0;
-
+ err = udf_add_nondir(dentry, inode);
out:
kfree(name);
return err;
@@ -1037,6 +990,7 @@ out:
out_no_entry:
up_write(&iinfo->i_data_sem);
inode_dec_link_count(inode);
+ unlock_new_inode(inode);
iput(inode);
goto out;
}
@@ -1221,7 +1175,7 @@ static struct dentry *udf_get_parent(struct dentry *child)
struct udf_fileident_bh fibh;
if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi))
- goto out_unlock;
+ return ERR_PTR(-EACCES);
if (fibh.sbh != fibh.ebh)
brelse(fibh.ebh);
@@ -1229,12 +1183,10 @@ static struct dentry *udf_get_parent(struct dentry *child)
tloc = lelb_to_cpu(cfi.icb.extLocation);
inode = udf_iget(child->d_inode->i_sb, &tloc);
- if (!inode)
- goto out_unlock;
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
return d_obtain_alias(inode);
-out_unlock:
- return ERR_PTR(-EACCES);
}
@@ -1251,8 +1203,8 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block,
loc.partitionReferenceNum = partref;
inode = udf_iget(sb, &loc);
- if (inode == NULL)
- return ERR_PTR(-ENOMEM);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
if (generation && inode->i_generation != generation) {
iput(inode);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 3286db047a40..e229315bbf7a 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -63,7 +63,7 @@
#include "udf_i.h"
#include <linux/init.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#define VDS_POS_PRIMARY_VOL_DESC 0
#define VDS_POS_UNALLOC_SPACE_DESC 1
@@ -959,14 +959,16 @@ struct inode *udf_find_metadata_inode_efe(struct super_block *sb,
addr.logicalBlockNum = meta_file_loc;
addr.partitionReferenceNum = partition_num;
- metadata_fe = udf_iget(sb, &addr);
+ metadata_fe = udf_iget_special(sb, &addr);
- if (metadata_fe == NULL)
+ if (IS_ERR(metadata_fe)) {
udf_warn(sb, "metadata inode efe not found\n");
- else if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) {
+ return metadata_fe;
+ }
+ if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) {
udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n");
iput(metadata_fe);
- metadata_fe = NULL;
+ return ERR_PTR(-EIO);
}
return metadata_fe;
@@ -978,6 +980,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
struct udf_part_map *map;
struct udf_meta_data *mdata;
struct kernel_lb_addr addr;
+ struct inode *fe;
map = &sbi->s_partmaps[partition];
mdata = &map->s_type_specific.s_metadata;
@@ -986,22 +989,24 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
udf_debug("Metadata file location: block = %d part = %d\n",
mdata->s_meta_file_loc, map->s_partition_num);
- mdata->s_metadata_fe = udf_find_metadata_inode_efe(sb,
- mdata->s_meta_file_loc, map->s_partition_num);
-
- if (mdata->s_metadata_fe == NULL) {
+ fe = udf_find_metadata_inode_efe(sb, mdata->s_meta_file_loc,
+ map->s_partition_num);
+ if (IS_ERR(fe)) {
/* mirror file entry */
udf_debug("Mirror metadata file location: block = %d part = %d\n",
mdata->s_mirror_file_loc, map->s_partition_num);
- mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb,
- mdata->s_mirror_file_loc, map->s_partition_num);
+ fe = udf_find_metadata_inode_efe(sb, mdata->s_mirror_file_loc,
+ map->s_partition_num);
- if (mdata->s_mirror_fe == NULL) {
+ if (IS_ERR(fe)) {
udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n");
- return -EIO;
+ return PTR_ERR(fe);
}
- }
+ mdata->s_mirror_fe = fe;
+ } else
+ mdata->s_metadata_fe = fe;
+
/*
* bitmap file entry
@@ -1015,15 +1020,16 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
udf_debug("Bitmap file location: block = %d part = %d\n",
addr.logicalBlockNum, addr.partitionReferenceNum);
- mdata->s_bitmap_fe = udf_iget(sb, &addr);
- if (mdata->s_bitmap_fe == NULL) {
+ fe = udf_iget_special(sb, &addr);
+ if (IS_ERR(fe)) {
if (sb->s_flags & MS_RDONLY)
udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n");
else {
udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n");
- return -EIO;
+ return PTR_ERR(fe);
}
- }
+ } else
+ mdata->s_bitmap_fe = fe;
}
udf_debug("udf_load_metadata_files Ok\n");
@@ -1111,13 +1117,15 @@ static int udf_fill_partdesc_info(struct super_block *sb,
phd->unallocSpaceTable.extPosition),
.partitionReferenceNum = p_index,
};
+ struct inode *inode;
- map->s_uspace.s_table = udf_iget(sb, &loc);
- if (!map->s_uspace.s_table) {
+ inode = udf_iget_special(sb, &loc);
+ if (IS_ERR(inode)) {
udf_debug("cannot load unallocSpaceTable (part %d)\n",
p_index);
- return -EIO;
+ return PTR_ERR(inode);
}
+ map->s_uspace.s_table = inode;
map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE;
udf_debug("unallocSpaceTable (part %d) @ %ld\n",
p_index, map->s_uspace.s_table->i_ino);
@@ -1144,14 +1152,15 @@ static int udf_fill_partdesc_info(struct super_block *sb,
phd->freedSpaceTable.extPosition),
.partitionReferenceNum = p_index,
};
+ struct inode *inode;
- map->s_fspace.s_table = udf_iget(sb, &loc);
- if (!map->s_fspace.s_table) {
+ inode = udf_iget_special(sb, &loc);
+ if (IS_ERR(inode)) {
udf_debug("cannot load freedSpaceTable (part %d)\n",
p_index);
- return -EIO;
+ return PTR_ERR(inode);
}
-
+ map->s_fspace.s_table = inode;
map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE;
udf_debug("freedSpaceTable (part %d) @ %ld\n",
p_index, map->s_fspace.s_table->i_ino);
@@ -1178,6 +1187,7 @@ static void udf_find_vat_block(struct super_block *sb, int p_index,
struct udf_part_map *map = &sbi->s_partmaps[p_index];
sector_t vat_block;
struct kernel_lb_addr ino;
+ struct inode *inode;
/*
* VAT file entry is in the last recorded block. Some broken disks have
@@ -1186,10 +1196,13 @@ static void udf_find_vat_block(struct super_block *sb, int p_index,
ino.partitionReferenceNum = type1_index;
for (vat_block = start_block;
vat_block >= map->s_partition_root &&
- vat_block >= start_block - 3 &&
- !sbi->s_vat_inode; vat_block--) {
+ vat_block >= start_block - 3; vat_block--) {
ino.logicalBlockNum = vat_block - map->s_partition_root;
- sbi->s_vat_inode = udf_iget(sb, &ino);
+ inode = udf_iget_special(sb, &ino);
+ if (!IS_ERR(inode)) {
+ sbi->s_vat_inode = inode;
+ break;
+ }
}
}
@@ -2205,10 +2218,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
/* assign inodes by physical block number */
/* perhaps it's not extensible enough, but for now ... */
inode = udf_iget(sb, &rootdir);
- if (!inode) {
+ if (IS_ERR(inode)) {
udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n",
rootdir.logicalBlockNum, rootdir.partitionReferenceNum);
- ret = -EIO;
+ ret = PTR_ERR(inode);
goto error_out;
}
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index d7c6dbe4194b..6fb7945c1e6e 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -20,7 +20,7 @@
*/
#include "udfdecl.h"
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/time.h>
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index be7dabbbcb49..1cc3c993ebd0 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -138,12 +138,22 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
/* file.c */
extern long udf_ioctl(struct file *, unsigned int, unsigned long);
/* inode.c */
-extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
+extern struct inode *__udf_iget(struct super_block *, struct kernel_lb_addr *,
+ bool hidden_inode);
+static inline struct inode *udf_iget_special(struct super_block *sb,
+ struct kernel_lb_addr *ino)
+{
+ return __udf_iget(sb, ino, true);
+}
+static inline struct inode *udf_iget(struct super_block *sb,
+ struct kernel_lb_addr *ino)
+{
+ return __udf_iget(sb, ino, false);
+}
extern int udf_expand_file_adinicb(struct inode *);
extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
extern struct buffer_head *udf_bread(struct inode *, int, int, int *);
extern int udf_setsize(struct inode *, loff_t);
-extern void udf_read_inode(struct inode *);
extern void udf_evict_inode(struct inode *);
extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
extern long udf_block_map(struct inode *, sector_t);
@@ -209,7 +219,7 @@ extern int udf_CS0toUTF8(struct ustr *, const struct ustr *);
/* ialloc.c */
extern void udf_free_inode(struct inode *);
-extern struct inode *udf_new_inode(struct inode *, umode_t, int *);
+extern struct inode *udf_new_inode(struct inode *, umode_t);
/* truncate.c */
extern void udf_truncate_tail_extent(struct inode *);
diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c
index 1f11483eba6a..77c331f1a770 100644
--- a/fs/udf/udftime.c
+++ b/fs/udf/udftime.c
@@ -81,8 +81,6 @@ static time_t year_seconds[MAX_YEAR_SECONDS] = {
/*2038*/ SPY(68, 17, 0)
};
-extern struct timezone sys_tz;
-
#define SECS_PER_HOUR (60 * 60)
#define SECS_PER_DAY (SECS_PER_HOUR * 24)
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 44b815e57f94..afd470e588ff 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -412,7 +412,6 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
int extIndex = 0, newExtIndex = 0, hasExt = 0;
unsigned short valueCRC;
uint8_t curr;
- const uint8_t hexChar[] = "0123456789ABCDEF";
if (udfName[0] == '.' &&
(udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) {
@@ -477,10 +476,10 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
newIndex = 250;
newName[newIndex++] = CRC_MARK;
valueCRC = crc_itu_t(0, fidName, fidNameLen);
- newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12];
- newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8];
- newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4];
- newName[newIndex++] = hexChar[(valueCRC & 0x000f)];
+ newName[newIndex++] = hex_asc_upper_hi(valueCRC >> 8);
+ newName[newIndex++] = hex_asc_upper_lo(valueCRC >> 8);
+ newName[newIndex++] = hex_asc_upper_hi(valueCRC);
+ newName[newIndex++] = hex_asc_upper_lo(valueCRC);
if (hasExt) {
newName[newIndex++] = EXT_MARK;
diff --git a/fs/ufs/Makefile b/fs/ufs/Makefile
index dd39980437fc..4d0e02b022b3 100644
--- a/fs/ufs/Makefile
+++ b/fs/ufs/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_UFS_FS) += ufs.o
ufs-objs := balloc.o cylinder.o dir.o file.o ialloc.o inode.o \
namei.o super.o symlink.o truncate.o util.o
+ccflags-$(CONFIG_UFS_DEBUG) += -DDEBUG
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 7bc20809c99e..2c1036080d52 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -784,7 +784,6 @@ static u64 ufs_bitmap_search(struct super_block *sb,
0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe
};
struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
- struct ufs_cylinder_group *ucg;
unsigned start, length, loc;
unsigned pos, want, blockmap, mask, end;
u64 result;
@@ -792,8 +791,6 @@ static u64 ufs_bitmap_search(struct super_block *sb,
UFSD("ENTER, cg %u, goal %llu, count %u\n", ucpi->c_cgx,
(unsigned long long)goal, count);
- ucg = ubh_get_ucg(UCPI_UBH(ucpi));
-
if (goal)
start = ufs_dtogd(uspi, goal) >> 3;
else
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index a9cc75ffa925..7caa01652888 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -298,7 +298,10 @@ cg_found:
ufsi->i_oeftflag = 0;
ufsi->i_dir_start_lookup = 0;
memset(&ufsi->i_u1, 0, sizeof(ufsi->i_u1));
- insert_inode_hash(inode);
+ if (insert_inode_locked(inode) < 0) {
+ err = -EIO;
+ goto failed;
+ }
mark_inode_dirty(inode);
if (uspi->fs_magic == UFS2_MAGIC) {
@@ -337,6 +340,7 @@ cg_found:
fail_remove_inode:
unlock_ufs(sb);
clear_nlink(inode);
+ unlock_new_inode(inode);
iput(inode);
UFSD("EXIT (FAILED): err %d\n", err);
return ERR_PTR(err);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 61e8a9b021dd..be7d42c7d938 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -158,16 +158,16 @@ out:
/**
* ufs_inode_getfrag() - allocate new fragment(s)
- * @inode - pointer to inode
- * @fragment - number of `fragment' which hold pointer
+ * @inode: pointer to inode
+ * @fragment: number of `fragment' which hold pointer
* to new allocated fragment(s)
- * @new_fragment - number of new allocated fragment(s)
- * @required - how many fragment(s) we require
- * @err - we set it if something wrong
- * @phys - pointer to where we save physical number of new allocated fragments,
+ * @new_fragment: number of new allocated fragment(s)
+ * @required: how many fragment(s) we require
+ * @err: we set it if something wrong
+ * @phys: pointer to where we save physical number of new allocated fragments,
* NULL if we allocate not data(indirect blocks for example).
- * @new - we set it if we allocate new block
- * @locked_page - for ufs_new_fragments()
+ * @new: we set it if we allocate new block
+ * @locked_page: for ufs_new_fragments()
*/
static struct buffer_head *
ufs_inode_getfrag(struct inode *inode, u64 fragment,
@@ -315,16 +315,16 @@ repeat2:
/**
* ufs_inode_getblock() - allocate new block
- * @inode - pointer to inode
- * @bh - pointer to block which hold "pointer" to new allocated block
- * @fragment - number of `fragment' which hold pointer
+ * @inode: pointer to inode
+ * @bh: pointer to block which hold "pointer" to new allocated block
+ * @fragment: number of `fragment' which hold pointer
* to new allocated block
- * @new_fragment - number of new allocated fragment
+ * @new_fragment: number of new allocated fragment
* (block will hold this fragment and also uspi->s_fpb-1)
- * @err - see ufs_inode_getfrag()
- * @phys - see ufs_inode_getfrag()
- * @new - see ufs_inode_getfrag()
- * @locked_page - see ufs_inode_getfrag()
+ * @err: see ufs_inode_getfrag()
+ * @phys: see ufs_inode_getfrag()
+ * @new: see ufs_inode_getfrag()
+ * @locked_page: see ufs_inode_getfrag()
*/
static struct buffer_head *
ufs_inode_getblock(struct inode *inode, struct buffer_head *bh,
@@ -902,9 +902,6 @@ void ufs_evict_inode(struct inode * inode)
invalidate_inode_buffers(inode);
clear_inode(inode);
- if (want_delete) {
- lock_ufs(inode->i_sb);
- ufs_free_inode (inode);
- unlock_ufs(inode->i_sb);
- }
+ if (want_delete)
+ ufs_free_inode(inode);
}
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 90d74b8f8eba..fd65deb4b5f0 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -38,10 +38,12 @@ static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode)
{
int err = ufs_add_link(dentry, inode);
if (!err) {
+ unlock_new_inode(inode);
d_instantiate(dentry, inode);
return 0;
}
inode_dec_link_count(inode);
+ unlock_new_inode(inode);
iput(inode);
return err;
}
@@ -126,12 +128,12 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
if (l > sb->s_blocksize)
goto out_notlocked;
- lock_ufs(dir->i_sb);
inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
err = PTR_ERR(inode);
if (IS_ERR(inode))
- goto out;
+ goto out_notlocked;
+ lock_ufs(dir->i_sb);
if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) {
/* slow symlink */
inode->i_op = &ufs_symlink_inode_operations;
@@ -155,6 +157,7 @@ out_notlocked:
out_fail:
inode_dec_link_count(inode);
+ unlock_new_inode(inode);
iput(inode);
goto out;
}
@@ -181,13 +184,9 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
struct inode * inode;
int err;
- lock_ufs(dir->i_sb);
- inode_inc_link_count(dir);
-
inode = ufs_new_inode(dir, S_IFDIR|mode);
- err = PTR_ERR(inode);
if (IS_ERR(inode))
- goto out_dir;
+ return PTR_ERR(inode);
inode->i_op = &ufs_dir_inode_operations;
inode->i_fop = &ufs_dir_operations;
@@ -195,6 +194,9 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
inode_inc_link_count(inode);
+ lock_ufs(dir->i_sb);
+ inode_inc_link_count(dir);
+
err = ufs_make_empty(inode, dir);
if (err)
goto out_fail;
@@ -211,8 +213,8 @@ out:
out_fail:
inode_dec_link_count(inode);
inode_dec_link_count(inode);
+ unlock_new_inode(inode);
iput (inode);
-out_dir:
inode_dec_link_count(dir);
unlock_ufs(dir->i_sb);
goto out;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index b879f1ba3439..da73801301d5 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -65,7 +65,6 @@
* Evgeniy Dushistov <dushistov@mail.ru>, 2007
*/
-
#include <linux/exportfs.h>
#include <linux/module.h>
#include <linux/bitops.h>
@@ -172,73 +171,73 @@ static void ufs_print_super_stuff(struct super_block *sb,
{
u32 magic = fs32_to_cpu(sb, usb3->fs_magic);
- printk("ufs_print_super_stuff\n");
- printk(" magic: 0x%x\n", magic);
+ pr_debug("ufs_print_super_stuff\n");
+ pr_debug(" magic: 0x%x\n", magic);
if (fs32_to_cpu(sb, usb3->fs_magic) == UFS2_MAGIC) {
- printk(" fs_size: %llu\n", (unsigned long long)
- fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size));
- printk(" fs_dsize: %llu\n", (unsigned long long)
- fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize));
- printk(" bsize: %u\n",
- fs32_to_cpu(sb, usb1->fs_bsize));
- printk(" fsize: %u\n",
- fs32_to_cpu(sb, usb1->fs_fsize));
- printk(" fs_volname: %s\n", usb2->fs_un.fs_u2.fs_volname);
- printk(" fs_sblockloc: %llu\n", (unsigned long long)
- fs64_to_cpu(sb, usb2->fs_un.fs_u2.fs_sblockloc));
- printk(" cs_ndir(No of dirs): %llu\n", (unsigned long long)
- fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir));
- printk(" cs_nbfree(No of free blocks): %llu\n",
- (unsigned long long)
- fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree));
- printk(KERN_INFO" cs_nifree(Num of free inodes): %llu\n",
- (unsigned long long)
- fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree));
- printk(KERN_INFO" cs_nffree(Num of free frags): %llu\n",
- (unsigned long long)
- fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree));
- printk(KERN_INFO" fs_maxsymlinklen: %u\n",
- fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen));
+ pr_debug(" fs_size: %llu\n", (unsigned long long)
+ fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size));
+ pr_debug(" fs_dsize: %llu\n", (unsigned long long)
+ fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize));
+ pr_debug(" bsize: %u\n",
+ fs32_to_cpu(sb, usb1->fs_bsize));
+ pr_debug(" fsize: %u\n",
+ fs32_to_cpu(sb, usb1->fs_fsize));
+ pr_debug(" fs_volname: %s\n", usb2->fs_un.fs_u2.fs_volname);
+ pr_debug(" fs_sblockloc: %llu\n", (unsigned long long)
+ fs64_to_cpu(sb, usb2->fs_un.fs_u2.fs_sblockloc));
+ pr_debug(" cs_ndir(No of dirs): %llu\n", (unsigned long long)
+ fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir));
+ pr_debug(" cs_nbfree(No of free blocks): %llu\n",
+ (unsigned long long)
+ fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree));
+ pr_info(" cs_nifree(Num of free inodes): %llu\n",
+ (unsigned long long)
+ fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree));
+ pr_info(" cs_nffree(Num of free frags): %llu\n",
+ (unsigned long long)
+ fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree));
+ pr_info(" fs_maxsymlinklen: %u\n",
+ fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen));
} else {
- printk(" sblkno: %u\n", fs32_to_cpu(sb, usb1->fs_sblkno));
- printk(" cblkno: %u\n", fs32_to_cpu(sb, usb1->fs_cblkno));
- printk(" iblkno: %u\n", fs32_to_cpu(sb, usb1->fs_iblkno));
- printk(" dblkno: %u\n", fs32_to_cpu(sb, usb1->fs_dblkno));
- printk(" cgoffset: %u\n",
- fs32_to_cpu(sb, usb1->fs_cgoffset));
- printk(" ~cgmask: 0x%x\n",
- ~fs32_to_cpu(sb, usb1->fs_cgmask));
- printk(" size: %u\n", fs32_to_cpu(sb, usb1->fs_size));
- printk(" dsize: %u\n", fs32_to_cpu(sb, usb1->fs_dsize));
- printk(" ncg: %u\n", fs32_to_cpu(sb, usb1->fs_ncg));
- printk(" bsize: %u\n", fs32_to_cpu(sb, usb1->fs_bsize));
- printk(" fsize: %u\n", fs32_to_cpu(sb, usb1->fs_fsize));
- printk(" frag: %u\n", fs32_to_cpu(sb, usb1->fs_frag));
- printk(" fragshift: %u\n",
- fs32_to_cpu(sb, usb1->fs_fragshift));
- printk(" ~fmask: %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask));
- printk(" fshift: %u\n", fs32_to_cpu(sb, usb1->fs_fshift));
- printk(" sbsize: %u\n", fs32_to_cpu(sb, usb1->fs_sbsize));
- printk(" spc: %u\n", fs32_to_cpu(sb, usb1->fs_spc));
- printk(" cpg: %u\n", fs32_to_cpu(sb, usb1->fs_cpg));
- printk(" ipg: %u\n", fs32_to_cpu(sb, usb1->fs_ipg));
- printk(" fpg: %u\n", fs32_to_cpu(sb, usb1->fs_fpg));
- printk(" csaddr: %u\n", fs32_to_cpu(sb, usb1->fs_csaddr));
- printk(" cssize: %u\n", fs32_to_cpu(sb, usb1->fs_cssize));
- printk(" cgsize: %u\n", fs32_to_cpu(sb, usb1->fs_cgsize));
- printk(" fstodb: %u\n",
- fs32_to_cpu(sb, usb1->fs_fsbtodb));
- printk(" nrpos: %u\n", fs32_to_cpu(sb, usb3->fs_nrpos));
- printk(" ndir %u\n",
- fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir));
- printk(" nifree %u\n",
- fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree));
- printk(" nbfree %u\n",
- fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree));
- printk(" nffree %u\n",
- fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree));
+ pr_debug(" sblkno: %u\n", fs32_to_cpu(sb, usb1->fs_sblkno));
+ pr_debug(" cblkno: %u\n", fs32_to_cpu(sb, usb1->fs_cblkno));
+ pr_debug(" iblkno: %u\n", fs32_to_cpu(sb, usb1->fs_iblkno));
+ pr_debug(" dblkno: %u\n", fs32_to_cpu(sb, usb1->fs_dblkno));
+ pr_debug(" cgoffset: %u\n",
+ fs32_to_cpu(sb, usb1->fs_cgoffset));
+ pr_debug(" ~cgmask: 0x%x\n",
+ ~fs32_to_cpu(sb, usb1->fs_cgmask));
+ pr_debug(" size: %u\n", fs32_to_cpu(sb, usb1->fs_size));
+ pr_debug(" dsize: %u\n", fs32_to_cpu(sb, usb1->fs_dsize));
+ pr_debug(" ncg: %u\n", fs32_to_cpu(sb, usb1->fs_ncg));
+ pr_debug(" bsize: %u\n", fs32_to_cpu(sb, usb1->fs_bsize));
+ pr_debug(" fsize: %u\n", fs32_to_cpu(sb, usb1->fs_fsize));
+ pr_debug(" frag: %u\n", fs32_to_cpu(sb, usb1->fs_frag));
+ pr_debug(" fragshift: %u\n",
+ fs32_to_cpu(sb, usb1->fs_fragshift));
+ pr_debug(" ~fmask: %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask));
+ pr_debug(" fshift: %u\n", fs32_to_cpu(sb, usb1->fs_fshift));
+ pr_debug(" sbsize: %u\n", fs32_to_cpu(sb, usb1->fs_sbsize));
+ pr_debug(" spc: %u\n", fs32_to_cpu(sb, usb1->fs_spc));
+ pr_debug(" cpg: %u\n", fs32_to_cpu(sb, usb1->fs_cpg));
+ pr_debug(" ipg: %u\n", fs32_to_cpu(sb, usb1->fs_ipg));
+ pr_debug(" fpg: %u\n", fs32_to_cpu(sb, usb1->fs_fpg));
+ pr_debug(" csaddr: %u\n", fs32_to_cpu(sb, usb1->fs_csaddr));
+ pr_debug(" cssize: %u\n", fs32_to_cpu(sb, usb1->fs_cssize));
+ pr_debug(" cgsize: %u\n", fs32_to_cpu(sb, usb1->fs_cgsize));
+ pr_debug(" fstodb: %u\n",
+ fs32_to_cpu(sb, usb1->fs_fsbtodb));
+ pr_debug(" nrpos: %u\n", fs32_to_cpu(sb, usb3->fs_nrpos));
+ pr_debug(" ndir %u\n",
+ fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir));
+ pr_debug(" nifree %u\n",
+ fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree));
+ pr_debug(" nbfree %u\n",
+ fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree));
+ pr_debug(" nffree %u\n",
+ fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree));
}
- printk("\n");
+ pr_debug("\n");
}
/*
@@ -247,38 +246,38 @@ static void ufs_print_super_stuff(struct super_block *sb,
static void ufs_print_cylinder_stuff(struct super_block *sb,
struct ufs_cylinder_group *cg)
{
- printk("\nufs_print_cylinder_stuff\n");
- printk("size of ucg: %zu\n", sizeof(struct ufs_cylinder_group));
- printk(" magic: %x\n", fs32_to_cpu(sb, cg->cg_magic));
- printk(" time: %u\n", fs32_to_cpu(sb, cg->cg_time));
- printk(" cgx: %u\n", fs32_to_cpu(sb, cg->cg_cgx));
- printk(" ncyl: %u\n", fs16_to_cpu(sb, cg->cg_ncyl));
- printk(" niblk: %u\n", fs16_to_cpu(sb, cg->cg_niblk));
- printk(" ndblk: %u\n", fs32_to_cpu(sb, cg->cg_ndblk));
- printk(" cs_ndir: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_ndir));
- printk(" cs_nbfree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nbfree));
- printk(" cs_nifree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nifree));
- printk(" cs_nffree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nffree));
- printk(" rotor: %u\n", fs32_to_cpu(sb, cg->cg_rotor));
- printk(" frotor: %u\n", fs32_to_cpu(sb, cg->cg_frotor));
- printk(" irotor: %u\n", fs32_to_cpu(sb, cg->cg_irotor));
- printk(" frsum: %u, %u, %u, %u, %u, %u, %u, %u\n",
+ pr_debug("\nufs_print_cylinder_stuff\n");
+ pr_debug("size of ucg: %zu\n", sizeof(struct ufs_cylinder_group));
+ pr_debug(" magic: %x\n", fs32_to_cpu(sb, cg->cg_magic));
+ pr_debug(" time: %u\n", fs32_to_cpu(sb, cg->cg_time));
+ pr_debug(" cgx: %u\n", fs32_to_cpu(sb, cg->cg_cgx));
+ pr_debug(" ncyl: %u\n", fs16_to_cpu(sb, cg->cg_ncyl));
+ pr_debug(" niblk: %u\n", fs16_to_cpu(sb, cg->cg_niblk));
+ pr_debug(" ndblk: %u\n", fs32_to_cpu(sb, cg->cg_ndblk));
+ pr_debug(" cs_ndir: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_ndir));
+ pr_debug(" cs_nbfree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nbfree));
+ pr_debug(" cs_nifree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nifree));
+ pr_debug(" cs_nffree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nffree));
+ pr_debug(" rotor: %u\n", fs32_to_cpu(sb, cg->cg_rotor));
+ pr_debug(" frotor: %u\n", fs32_to_cpu(sb, cg->cg_frotor));
+ pr_debug(" irotor: %u\n", fs32_to_cpu(sb, cg->cg_irotor));
+ pr_debug(" frsum: %u, %u, %u, %u, %u, %u, %u, %u\n",
fs32_to_cpu(sb, cg->cg_frsum[0]), fs32_to_cpu(sb, cg->cg_frsum[1]),
fs32_to_cpu(sb, cg->cg_frsum[2]), fs32_to_cpu(sb, cg->cg_frsum[3]),
fs32_to_cpu(sb, cg->cg_frsum[4]), fs32_to_cpu(sb, cg->cg_frsum[5]),
fs32_to_cpu(sb, cg->cg_frsum[6]), fs32_to_cpu(sb, cg->cg_frsum[7]));
- printk(" btotoff: %u\n", fs32_to_cpu(sb, cg->cg_btotoff));
- printk(" boff: %u\n", fs32_to_cpu(sb, cg->cg_boff));
- printk(" iuseoff: %u\n", fs32_to_cpu(sb, cg->cg_iusedoff));
- printk(" freeoff: %u\n", fs32_to_cpu(sb, cg->cg_freeoff));
- printk(" nextfreeoff: %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff));
- printk(" clustersumoff %u\n",
- fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff));
- printk(" clusteroff %u\n",
- fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff));
- printk(" nclusterblks %u\n",
- fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks));
- printk("\n");
+ pr_debug(" btotoff: %u\n", fs32_to_cpu(sb, cg->cg_btotoff));
+ pr_debug(" boff: %u\n", fs32_to_cpu(sb, cg->cg_boff));
+ pr_debug(" iuseoff: %u\n", fs32_to_cpu(sb, cg->cg_iusedoff));
+ pr_debug(" freeoff: %u\n", fs32_to_cpu(sb, cg->cg_freeoff));
+ pr_debug(" nextfreeoff: %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff));
+ pr_debug(" clustersumoff %u\n",
+ fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff));
+ pr_debug(" clusteroff %u\n",
+ fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff));
+ pr_debug(" nclusterblks %u\n",
+ fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks));
+ pr_debug("\n");
}
#else
# define ufs_print_super_stuff(sb, usb1, usb2, usb3) /**/
@@ -287,13 +286,12 @@ static void ufs_print_cylinder_stuff(struct super_block *sb,
static const struct super_operations ufs_super_ops;
-static char error_buf[1024];
-
void ufs_error (struct super_block * sb, const char * function,
const char * fmt, ...)
{
struct ufs_sb_private_info * uspi;
struct ufs_super_block_first * usb1;
+ struct va_format vaf;
va_list args;
uspi = UFS_SB(sb)->s_uspi;
@@ -305,20 +303,21 @@ void ufs_error (struct super_block * sb, const char * function,
ufs_mark_sb_dirty(sb);
sb->s_flags |= MS_RDONLY;
}
- va_start (args, fmt);
- vsnprintf (error_buf, sizeof(error_buf), fmt, args);
- va_end (args);
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
switch (UFS_SB(sb)->s_mount_opt & UFS_MOUNT_ONERROR) {
case UFS_MOUNT_ONERROR_PANIC:
- panic ("UFS-fs panic (device %s): %s: %s\n",
- sb->s_id, function, error_buf);
+ panic("panic (device %s): %s: %pV\n",
+ sb->s_id, function, &vaf);
case UFS_MOUNT_ONERROR_LOCK:
case UFS_MOUNT_ONERROR_UMOUNT:
case UFS_MOUNT_ONERROR_REPAIR:
- printk (KERN_CRIT "UFS-fs error (device %s): %s: %s\n",
- sb->s_id, function, error_buf);
- }
+ pr_crit("error (device %s): %s: %pV\n",
+ sb->s_id, function, &vaf);
+ }
+ va_end(args);
}
void ufs_panic (struct super_block * sb, const char * function,
@@ -326,6 +325,7 @@ void ufs_panic (struct super_block * sb, const char * function,
{
struct ufs_sb_private_info * uspi;
struct ufs_super_block_first * usb1;
+ struct va_format vaf;
va_list args;
uspi = UFS_SB(sb)->s_uspi;
@@ -336,24 +336,27 @@ void ufs_panic (struct super_block * sb, const char * function,
ubh_mark_buffer_dirty(USPI_UBH(uspi));
ufs_mark_sb_dirty(sb);
}
- va_start (args, fmt);
- vsnprintf (error_buf, sizeof(error_buf), fmt, args);
- va_end (args);
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
sb->s_flags |= MS_RDONLY;
- printk (KERN_CRIT "UFS-fs panic (device %s): %s: %s\n",
- sb->s_id, function, error_buf);
+ pr_crit("panic (device %s): %s: %pV\n",
+ sb->s_id, function, &vaf);
+ va_end(args);
}
void ufs_warning (struct super_block * sb, const char * function,
const char * fmt, ...)
{
+ struct va_format vaf;
va_list args;
- va_start (args, fmt);
- vsnprintf (error_buf, sizeof(error_buf), fmt, args);
- va_end (args);
- printk (KERN_WARNING "UFS-fs warning (device %s): %s: %s\n",
- sb->s_id, function, error_buf);
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ pr_warn("(device %s): %s: %pV\n",
+ sb->s_id, function, &vaf);
+ va_end(args);
}
enum {
@@ -464,14 +467,12 @@ static int ufs_parse_options (char * options, unsigned * mount_options)
ufs_set_opt (*mount_options, ONERROR_UMOUNT);
break;
case Opt_onerror_repair:
- printk("UFS-fs: Unable to do repair on error, "
- "will lock lock instead\n");
+ pr_err("Unable to do repair on error, will lock lock instead\n");
ufs_clear_opt (*mount_options, ONERROR);
ufs_set_opt (*mount_options, ONERROR_REPAIR);
break;
default:
- printk("UFS-fs: Invalid option: \"%s\" "
- "or missing value\n", p);
+ pr_err("Invalid option: \"%s\" or missing value\n", p);
return 0;
}
}
@@ -788,8 +789,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
#ifndef CONFIG_UFS_FS_WRITE
if (!(sb->s_flags & MS_RDONLY)) {
- printk("ufs was compiled with read-only support, "
- "can't be mounted as read-write\n");
+ pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n");
return -EROFS;
}
#endif
@@ -812,12 +812,12 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_mount_opt = 0;
ufs_set_opt (sbi->s_mount_opt, ONERROR_LOCK);
if (!ufs_parse_options ((char *) data, &sbi->s_mount_opt)) {
- printk("wrong mount options\n");
+ pr_err("wrong mount options\n");
goto failed;
}
if (!(sbi->s_mount_opt & UFS_MOUNT_UFSTYPE)) {
if (!silent)
- printk("You didn't specify the type of your ufs filesystem\n\n"
+ pr_err("You didn't specify the type of your ufs filesystem\n\n"
"mount -t ufs -o ufstype="
"sun|sunx86|44bsd|ufs2|5xbsd|old|hp|nextstep|nextstep-cd|openstep ...\n\n"
">>>WARNING<<< Wrong ufstype may corrupt your filesystem, "
@@ -868,7 +868,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
break;
case UFS_MOUNT_UFSTYPE_SUNOS:
- UFSD(("ufstype=sunos\n"))
+ UFSD("ufstype=sunos\n");
uspi->s_fsize = block_size = 1024;
uspi->s_fmask = ~(1024 - 1);
uspi->s_fshift = 10;
@@ -900,7 +900,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD;
if (!(sb->s_flags & MS_RDONLY)) {
if (!silent)
- printk(KERN_INFO "ufstype=old is supported read-only\n");
+ pr_info("ufstype=old is supported read-only\n");
sb->s_flags |= MS_RDONLY;
}
break;
@@ -916,7 +916,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD;
if (!(sb->s_flags & MS_RDONLY)) {
if (!silent)
- printk(KERN_INFO "ufstype=nextstep is supported read-only\n");
+ pr_info("ufstype=nextstep is supported read-only\n");
sb->s_flags |= MS_RDONLY;
}
break;
@@ -932,7 +932,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD;
if (!(sb->s_flags & MS_RDONLY)) {
if (!silent)
- printk(KERN_INFO "ufstype=nextstep-cd is supported read-only\n");
+ pr_info("ufstype=nextstep-cd is supported read-only\n");
sb->s_flags |= MS_RDONLY;
}
break;
@@ -948,7 +948,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD;
if (!(sb->s_flags & MS_RDONLY)) {
if (!silent)
- printk(KERN_INFO "ufstype=openstep is supported read-only\n");
+ pr_info("ufstype=openstep is supported read-only\n");
sb->s_flags |= MS_RDONLY;
}
break;
@@ -963,19 +963,19 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD;
if (!(sb->s_flags & MS_RDONLY)) {
if (!silent)
- printk(KERN_INFO "ufstype=hp is supported read-only\n");
+ pr_info("ufstype=hp is supported read-only\n");
sb->s_flags |= MS_RDONLY;
}
break;
default:
if (!silent)
- printk("unknown ufstype\n");
+ pr_err("unknown ufstype\n");
goto failed;
}
again:
if (!sb_set_blocksize(sb, block_size)) {
- printk(KERN_ERR "UFS: failed to set blocksize\n");
+ pr_err("failed to set blocksize\n");
goto failed;
}
@@ -1034,7 +1034,7 @@ again:
goto again;
}
if (!silent)
- printk("ufs_read_super: bad magic number\n");
+ pr_err("%s(): bad magic number\n", __func__);
goto failed;
magic_found:
@@ -1048,33 +1048,33 @@ magic_found:
uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift);
if (!is_power_of_2(uspi->s_fsize)) {
- printk(KERN_ERR "ufs_read_super: fragment size %u is not a power of 2\n",
- uspi->s_fsize);
- goto failed;
+ pr_err("%s(): fragment size %u is not a power of 2\n",
+ __func__, uspi->s_fsize);
+ goto failed;
}
if (uspi->s_fsize < 512) {
- printk(KERN_ERR "ufs_read_super: fragment size %u is too small\n",
- uspi->s_fsize);
+ pr_err("%s(): fragment size %u is too small\n",
+ __func__, uspi->s_fsize);
goto failed;
}
if (uspi->s_fsize > 4096) {
- printk(KERN_ERR "ufs_read_super: fragment size %u is too large\n",
- uspi->s_fsize);
+ pr_err("%s(): fragment size %u is too large\n",
+ __func__, uspi->s_fsize);
goto failed;
}
if (!is_power_of_2(uspi->s_bsize)) {
- printk(KERN_ERR "ufs_read_super: block size %u is not a power of 2\n",
- uspi->s_bsize);
+ pr_err("%s(): block size %u is not a power of 2\n",
+ __func__, uspi->s_bsize);
goto failed;
}
if (uspi->s_bsize < 4096) {
- printk(KERN_ERR "ufs_read_super: block size %u is too small\n",
- uspi->s_bsize);
+ pr_err("%s(): block size %u is too small\n",
+ __func__, uspi->s_bsize);
goto failed;
}
if (uspi->s_bsize / uspi->s_fsize > 8) {
- printk(KERN_ERR "ufs_read_super: too many fragments per block (%u)\n",
- uspi->s_bsize / uspi->s_fsize);
+ pr_err("%s(): too many fragments per block (%u)\n",
+ __func__, uspi->s_bsize / uspi->s_fsize);
goto failed;
}
if (uspi->s_fsize != block_size || uspi->s_sbsize != super_block_size) {
@@ -1113,20 +1113,21 @@ magic_found:
UFSD("fs is DEC OSF/1\n");
break;
case UFS_FSACTIVE:
- printk("ufs_read_super: fs is active\n");
+ pr_err("%s(): fs is active\n", __func__);
sb->s_flags |= MS_RDONLY;
break;
case UFS_FSBAD:
- printk("ufs_read_super: fs is bad\n");
+ pr_err("%s(): fs is bad\n", __func__);
sb->s_flags |= MS_RDONLY;
break;
default:
- printk("ufs_read_super: can't grok fs_clean 0x%x\n", usb1->fs_clean);
+ pr_err("%s(): can't grok fs_clean 0x%x\n",
+ __func__, usb1->fs_clean);
sb->s_flags |= MS_RDONLY;
break;
}
} else {
- printk("ufs_read_super: fs needs fsck\n");
+ pr_err("%s(): fs needs fsck\n", __func__);
sb->s_flags |= MS_RDONLY;
}
@@ -1299,7 +1300,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
new_mount_opt |= ufstype;
} else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
- printk("ufstype can't be changed during remount\n");
+ pr_err("ufstype can't be changed during remount\n");
unlock_ufs(sb);
return -EINVAL;
}
@@ -1328,8 +1329,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
* fs was mounted as ro, remounting rw
*/
#ifndef CONFIG_UFS_FS_WRITE
- printk("ufs was compiled with read-only support, "
- "can't be mounted as read-write\n");
+ pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n");
unlock_ufs(sb);
return -EINVAL;
#else
@@ -1338,12 +1338,12 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
ufstype != UFS_MOUNT_UFSTYPE_44BSD &&
ufstype != UFS_MOUNT_UFSTYPE_SUNx86 &&
ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
- printk("this ufstype is read-only supported\n");
+ pr_err("this ufstype is read-only supported\n");
unlock_ufs(sb);
return -EINVAL;
}
if (!ufs_read_cylinder_structures(sb)) {
- printk("failed during remounting\n");
+ pr_err("failed during remounting\n");
unlock_ufs(sb);
return -EPERM;
}
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 343e6fc571e5..2a07396d5f9e 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -1,6 +1,12 @@
#ifndef _UFS_UFS_H
#define _UFS_UFS_H 1
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#define UFS_MAX_GROUP_LOADED 8
#define UFS_CGNO_EMPTY ((unsigned)-1)
@@ -71,9 +77,9 @@ struct ufs_inode_info {
*/
#ifdef CONFIG_UFS_DEBUG
# define UFSD(f, a...) { \
- printk ("UFSD (%s, %d): %s:", \
+ pr_debug("UFSD (%s, %d): %s:", \
__FILE__, __LINE__, __func__); \
- printk (f, ## a); \
+ pr_debug(f, ## a); \
}
#else
# define UFSD(f, a...) /**/
diff --git a/fs/xattr.c b/fs/xattr.c
index c69e6d43a0d2..64e83efb742d 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -364,13 +364,12 @@ out:
return error;
}
-SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
- const char __user *, name, const void __user *, value,
- size_t, size, int, flags)
+static int path_setxattr(const char __user *pathname,
+ const char __user *name, const void __user *value,
+ size_t size, int flags, unsigned int lookup_flags)
{
struct path path;
int error;
- unsigned int lookup_flags = LOOKUP_FOLLOW;
retry:
error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
@@ -388,28 +387,18 @@ retry:
return error;
}
+SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
+ const char __user *, name, const void __user *, value,
+ size_t, size, int, flags)
+{
+ return path_setxattr(pathname, name, value, size, flags, LOOKUP_FOLLOW);
+}
+
SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
const char __user *, name, const void __user *, value,
size_t, size, int, flags)
{
- struct path path;
- int error;
- unsigned int lookup_flags = 0;
-retry:
- error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
- if (error)
- return error;
- error = mnt_want_write(path.mnt);
- if (!error) {
- error = setxattr(path.dentry, name, value, size, flags);
- mnt_drop_write(path.mnt);
- }
- path_put(&path);
- if (retry_estale(error, lookup_flags)) {
- lookup_flags |= LOOKUP_REVAL;
- goto retry;
- }
- return error;
+ return path_setxattr(pathname, name, value, size, flags, 0);
}
SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
@@ -481,12 +470,12 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
return error;
}
-SYSCALL_DEFINE4(getxattr, const char __user *, pathname,
- const char __user *, name, void __user *, value, size_t, size)
+static ssize_t path_getxattr(const char __user *pathname,
+ const char __user *name, void __user *value,
+ size_t size, unsigned int lookup_flags)
{
struct path path;
ssize_t error;
- unsigned int lookup_flags = LOOKUP_FOLLOW;
retry:
error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
@@ -500,23 +489,16 @@ retry:
return error;
}
+SYSCALL_DEFINE4(getxattr, const char __user *, pathname,
+ const char __user *, name, void __user *, value, size_t, size)
+{
+ return path_getxattr(pathname, name, value, size, LOOKUP_FOLLOW);
+}
+
SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
const char __user *, name, void __user *, value, size_t, size)
{
- struct path path;
- ssize_t error;
- unsigned int lookup_flags = 0;
-retry:
- error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
- if (error)
- return error;
- error = getxattr(path.dentry, name, value, size);
- path_put(&path);
- if (retry_estale(error, lookup_flags)) {
- lookup_flags |= LOOKUP_REVAL;
- goto retry;
- }
- return error;
+ return path_getxattr(pathname, name, value, size, 0);
}
SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
@@ -571,12 +553,11 @@ listxattr(struct dentry *d, char __user *list, size_t size)
return error;
}
-SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list,
- size_t, size)
+static ssize_t path_listxattr(const char __user *pathname, char __user *list,
+ size_t size, unsigned int lookup_flags)
{
struct path path;
ssize_t error;
- unsigned int lookup_flags = LOOKUP_FOLLOW;
retry:
error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
@@ -590,23 +571,16 @@ retry:
return error;
}
+SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list,
+ size_t, size)
+{
+ return path_listxattr(pathname, list, size, LOOKUP_FOLLOW);
+}
+
SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
size_t, size)
{
- struct path path;
- ssize_t error;
- unsigned int lookup_flags = 0;
-retry:
- error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
- if (error)
- return error;
- error = listxattr(path.dentry, list, size);
- path_put(&path);
- if (retry_estale(error, lookup_flags)) {
- lookup_flags |= LOOKUP_REVAL;
- goto retry;
- }
- return error;
+ return path_listxattr(pathname, list, size, 0);
}
SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
@@ -640,12 +614,11 @@ removexattr(struct dentry *d, const char __user *name)
return vfs_removexattr(d, kname);
}
-SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
- const char __user *, name)
+static int path_removexattr(const char __user *pathname,
+ const char __user *name, unsigned int lookup_flags)
{
struct path path;
int error;
- unsigned int lookup_flags = LOOKUP_FOLLOW;
retry:
error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
@@ -663,27 +636,16 @@ retry:
return error;
}
+SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
+ const char __user *, name)
+{
+ return path_removexattr(pathname, name, LOOKUP_FOLLOW);
+}
+
SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
const char __user *, name)
{
- struct path path;
- int error;
- unsigned int lookup_flags = 0;
-retry:
- error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
- if (error)
- return error;
- error = mnt_want_write(path.mnt);
- if (!error) {
- error = removexattr(path.dentry, name);
- mnt_drop_write(path.mnt);
- }
- path_put(&path);
- if (retry_estale(error, lookup_flags)) {
- lookup_flags |= LOOKUP_REVAL;
- goto retry;
- }
- return error;
+ return path_removexattr(pathname, name, 0);
}
SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 399e8cec6e60..5d47b4df61ea 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -1,6 +1,7 @@
config XFS_FS
tristate "XFS filesystem support"
depends on BLOCK
+ depends on (64BIT || LBDAF)
select EXPORTFS
select LIBCRC32C
help
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index c21f43506661..d61799949580 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -17,6 +17,7 @@
#
ccflags-y += -I$(src) # needed for trace events
+ccflags-y += -I$(src)/libxfs
ccflags-$(CONFIG_XFS_DEBUG) += -g
@@ -25,6 +26,39 @@ obj-$(CONFIG_XFS_FS) += xfs.o
# this one should be compiled first, as the tracing macros can easily blow up
xfs-y += xfs_trace.o
+# build the libxfs code first
+xfs-y += $(addprefix libxfs/, \
+ xfs_alloc.o \
+ xfs_alloc_btree.o \
+ xfs_attr.o \
+ xfs_attr_leaf.o \
+ xfs_attr_remote.o \
+ xfs_bmap.o \
+ xfs_bmap_btree.o \
+ xfs_btree.o \
+ xfs_da_btree.o \
+ xfs_da_format.o \
+ xfs_dir2.o \
+ xfs_dir2_block.o \
+ xfs_dir2_data.o \
+ xfs_dir2_leaf.o \
+ xfs_dir2_node.o \
+ xfs_dir2_sf.o \
+ xfs_dquot_buf.o \
+ xfs_ialloc.o \
+ xfs_ialloc_btree.o \
+ xfs_inode_fork.o \
+ xfs_inode_buf.o \
+ xfs_log_rlimit.o \
+ xfs_sb.o \
+ xfs_symlink_remote.o \
+ xfs_trans_resv.o \
+ )
+# xfs_rtbitmap is shared with libxfs
+xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \
+ xfs_rtbitmap.o \
+ )
+
# highlevel code
xfs-y += xfs_aops.o \
xfs_attr_inactive.o \
@@ -45,53 +79,27 @@ xfs-y += xfs_aops.o \
xfs_ioctl.o \
xfs_iomap.o \
xfs_iops.o \
+ xfs_inode.o \
xfs_itable.o \
xfs_message.o \
xfs_mount.o \
xfs_mru_cache.o \
xfs_super.o \
xfs_symlink.o \
+ xfs_sysfs.o \
xfs_trans.o \
xfs_xattr.o \
kmem.o \
uuid.o
-# code shared with libxfs
-xfs-y += xfs_alloc.o \
- xfs_alloc_btree.o \
- xfs_attr.o \
- xfs_attr_leaf.o \
- xfs_attr_remote.o \
- xfs_bmap.o \
- xfs_bmap_btree.o \
- xfs_btree.o \
- xfs_da_btree.o \
- xfs_da_format.o \
- xfs_dir2.o \
- xfs_dir2_block.o \
- xfs_dir2_data.o \
- xfs_dir2_leaf.o \
- xfs_dir2_node.o \
- xfs_dir2_sf.o \
- xfs_dquot_buf.o \
- xfs_ialloc.o \
- xfs_ialloc_btree.o \
- xfs_icreate_item.o \
- xfs_inode.o \
- xfs_inode_fork.o \
- xfs_inode_buf.o \
- xfs_log_recover.o \
- xfs_log_rlimit.o \
- xfs_sb.o \
- xfs_symlink_remote.o \
- xfs_trans_resv.o
-
# low-level transaction/log code
xfs-y += xfs_log.o \
xfs_log_cil.o \
xfs_buf_item.o \
xfs_extfree_item.o \
+ xfs_icreate_item.o \
xfs_inode_item.o \
+ xfs_log_recover.o \
xfs_trans_ail.o \
xfs_trans_buf.o \
xfs_trans_extfree.o \
@@ -107,8 +115,7 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
xfs_quotaops.o
# xfs_rtbitmap is shared with libxfs
-xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \
- xfs_rtbitmap.o
+xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
xfs-$(CONFIG_PROC_FS) += xfs_stats.o
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 844e288b9576..53e95b2a1369 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -21,7 +21,6 @@
#include <linux/swap.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
-#include "time.h"
#include "kmem.h"
#include "xfs_message.h"
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 6e247a99f5db..6e247a99f5db 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index d43813267a80..eff34218f405 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -483,9 +483,9 @@ xfs_agfl_read_verify(
return;
if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
- xfs_buf_ioerror(bp, EFSBADCRC);
+ xfs_buf_ioerror(bp, -EFSBADCRC);
else if (!xfs_agfl_verify(bp))
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
if (bp->b_error)
xfs_verifier_error(bp);
@@ -503,7 +503,7 @@ xfs_agfl_write_verify(
return;
if (!xfs_agfl_verify(bp)) {
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
xfs_verifier_error(bp);
return;
}
@@ -559,7 +559,7 @@ xfs_alloc_update_counters(
xfs_trans_agblocks_delta(tp, len);
if (unlikely(be32_to_cpu(agf->agf_freeblks) >
be32_to_cpu(agf->agf_length)))
- return EFSCORRUPTED;
+ return -EFSCORRUPTED;
xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
return 0;
@@ -2209,6 +2209,10 @@ xfs_agf_verify(
be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)))
return false;
+ if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS ||
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS)
+ return false;
+
/*
* during growfs operations, the perag is not fully initialised,
* so we can't use it for any useful checking. growfs ensures we can't
@@ -2234,11 +2238,11 @@ xfs_agf_read_verify(
if (xfs_sb_version_hascrc(&mp->m_sb) &&
!xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
- xfs_buf_ioerror(bp, EFSBADCRC);
+ xfs_buf_ioerror(bp, -EFSBADCRC);
else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp,
XFS_ERRTAG_ALLOC_READ_AGF,
XFS_RANDOM_ALLOC_READ_AGF))
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
if (bp->b_error)
xfs_verifier_error(bp);
@@ -2252,7 +2256,7 @@ xfs_agf_write_verify(
struct xfs_buf_log_item *bip = bp->b_fspriv;
if (!xfs_agf_verify(mp, bp)) {
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
xfs_verifier_error(bp);
return;
}
@@ -2601,11 +2605,11 @@ xfs_free_extent(
*/
args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
if (args.agno >= args.mp->m_sb.sb_agcount)
- return EFSCORRUPTED;
+ return -EFSCORRUPTED;
args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
if (args.agbno >= args.mp->m_sb.sb_agblocks)
- return EFSCORRUPTED;
+ return -EFSCORRUPTED;
args.pag = xfs_perag_get(args.mp, args.agno);
ASSERT(args.pag);
@@ -2617,7 +2621,7 @@ xfs_free_extent(
/* validate the extent size is legal now we have the agf locked */
if (args.agbno + len >
be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) {
- error = EFSCORRUPTED;
+ error = -EFSCORRUPTED;
goto error0;
}
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index feacb061bab7..feacb061bab7 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 8358f1ded94d..e0e83e24d3ef 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -355,9 +355,9 @@ xfs_allocbt_read_verify(
struct xfs_buf *bp)
{
if (!xfs_btree_sblock_verify_crc(bp))
- xfs_buf_ioerror(bp, EFSBADCRC);
+ xfs_buf_ioerror(bp, -EFSBADCRC);
else if (!xfs_allocbt_verify(bp))
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
if (bp->b_error) {
trace_xfs_btree_corrupt(bp, _RET_IP_);
@@ -371,7 +371,7 @@ xfs_allocbt_write_verify(
{
if (!xfs_allocbt_verify(bp)) {
trace_xfs_btree_corrupt(bp, _RET_IP_);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
xfs_verifier_error(bp);
return;
}
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
index 45e189e7e81c..45e189e7e81c 100644
--- a/fs/xfs/xfs_alloc_btree.h
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index bfe36fc2cdc2..353fb425faef 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -85,7 +85,7 @@ xfs_attr_args_init(
{
if (!name)
- return EINVAL;
+ return -EINVAL;
memset(args, 0, sizeof(*args));
args->geo = dp->i_mount->m_attr_geo;
@@ -95,7 +95,7 @@ xfs_attr_args_init(
args->name = name;
args->namelen = strlen((const char *)name);
if (args->namelen >= MAXNAMELEN)
- return EFAULT; /* match IRIX behaviour */
+ return -EFAULT; /* match IRIX behaviour */
args->hashval = xfs_da_hashname(args->name, args->namelen);
return 0;
@@ -131,10 +131,10 @@ xfs_attr_get(
XFS_STATS_INC(xs_attr_get);
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return EIO;
+ return -EIO;
if (!xfs_inode_hasattr(ip))
- return ENOATTR;
+ return -ENOATTR;
error = xfs_attr_args_init(&args, ip, name, flags);
if (error)
@@ -145,7 +145,7 @@ xfs_attr_get(
lock_mode = xfs_ilock_attr_map_shared(ip);
if (!xfs_inode_hasattr(ip))
- error = ENOATTR;
+ error = -ENOATTR;
else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
error = xfs_attr_shortform_getvalue(&args);
else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK))
@@ -155,7 +155,7 @@ xfs_attr_get(
xfs_iunlock(ip, lock_mode);
*valuelenp = args.valuelen;
- return error == EEXIST ? 0 : error;
+ return error == -EEXIST ? 0 : error;
}
/*
@@ -213,7 +213,7 @@ xfs_attr_set(
XFS_STATS_INC(xs_attr_set);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
- return EIO;
+ return -EIO;
error = xfs_attr_args_init(&args, dp, name, flags);
if (error)
@@ -304,7 +304,7 @@ xfs_attr_set(
* the inode.
*/
error = xfs_attr_shortform_addname(&args);
- if (error != ENOSPC) {
+ if (error != -ENOSPC) {
/*
* Commit the shortform mods, and we're done.
* NOTE: this is also the error path (EEXIST, etc).
@@ -419,10 +419,10 @@ xfs_attr_remove(
XFS_STATS_INC(xs_attr_remove);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
- return EIO;
+ return -EIO;
if (!xfs_inode_hasattr(dp))
- return ENOATTR;
+ return -ENOATTR;
error = xfs_attr_args_init(&args, dp, name, flags);
if (error)
@@ -477,7 +477,7 @@ xfs_attr_remove(
xfs_trans_ijoin(args.trans, dp, 0);
if (!xfs_inode_hasattr(dp)) {
- error = XFS_ERROR(ENOATTR);
+ error = -ENOATTR;
} else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
error = xfs_attr_shortform_remove(&args);
@@ -534,28 +534,28 @@ xfs_attr_shortform_addname(xfs_da_args_t *args)
trace_xfs_attr_sf_addname(args);
retval = xfs_attr_shortform_lookup(args);
- if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
- return(retval);
- } else if (retval == EEXIST) {
+ if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
+ return retval;
+ } else if (retval == -EEXIST) {
if (args->flags & ATTR_CREATE)
- return(retval);
+ return retval;
retval = xfs_attr_shortform_remove(args);
ASSERT(retval == 0);
}
if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX)
- return(XFS_ERROR(ENOSPC));
+ return -ENOSPC;
newsize = XFS_ATTR_SF_TOTSIZE(args->dp);
newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize);
if (!forkoff)
- return(XFS_ERROR(ENOSPC));
+ return -ENOSPC;
xfs_attr_shortform_add(args, forkoff);
- return(0);
+ return 0;
}
@@ -592,10 +592,10 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* the given flags produce an error or call for an atomic rename.
*/
retval = xfs_attr3_leaf_lookup_int(bp, args);
- if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
+ if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
xfs_trans_brelse(args->trans, bp);
return retval;
- } else if (retval == EEXIST) {
+ } else if (retval == -EEXIST) {
if (args->flags & ATTR_CREATE) { /* pure create op */
xfs_trans_brelse(args->trans, bp);
return retval;
@@ -626,7 +626,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* if required.
*/
retval = xfs_attr3_leaf_add(bp, args);
- if (retval == ENOSPC) {
+ if (retval == -ENOSPC) {
/*
* Promote the attribute list to the Btree format, then
* Commit that transaction so that the node_addname() call
@@ -642,7 +642,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
- return(error);
+ return error;
}
/*
@@ -658,13 +658,13 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
*/
error = xfs_trans_roll(&args->trans, dp);
if (error)
- return (error);
+ return error;
/*
* Fob the whole rest of the problem off on the Btree code.
*/
error = xfs_attr_node_addname(args);
- return(error);
+ return error;
}
/*
@@ -673,7 +673,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
*/
error = xfs_trans_roll(&args->trans, dp);
if (error)
- return (error);
+ return error;
/*
* If there was an out-of-line value, allocate the blocks we
@@ -684,7 +684,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
if (args->rmtblkno > 0) {
error = xfs_attr_rmtval_set(args);
if (error)
- return(error);
+ return error;
}
/*
@@ -700,7 +700,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
*/
error = xfs_attr3_leaf_flipflags(args);
if (error)
- return(error);
+ return error;
/*
* Dismantle the "old" attribute/value pair by removing
@@ -714,7 +714,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
if (args->rmtblkno) {
error = xfs_attr_rmtval_remove(args);
if (error)
- return(error);
+ return error;
}
/*
@@ -744,7 +744,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
- return(error);
+ return error;
}
/*
@@ -795,7 +795,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
return error;
error = xfs_attr3_leaf_lookup_int(bp, args);
- if (error == ENOATTR) {
+ if (error == -ENOATTR) {
xfs_trans_brelse(args->trans, bp);
return error;
}
@@ -850,7 +850,7 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
return error;
error = xfs_attr3_leaf_lookup_int(bp, args);
- if (error != EEXIST) {
+ if (error != -EEXIST) {
xfs_trans_brelse(args->trans, bp);
return error;
}
@@ -906,9 +906,9 @@ restart:
goto out;
blk = &state->path.blk[ state->path.active-1 ];
ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
- if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
+ if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
goto out;
- } else if (retval == EEXIST) {
+ } else if (retval == -EEXIST) {
if (args->flags & ATTR_CREATE)
goto out;
@@ -933,7 +933,7 @@ restart:
}
retval = xfs_attr3_leaf_add(blk->bp, state->args);
- if (retval == ENOSPC) {
+ if (retval == -ENOSPC) {
if (state->path.active == 1) {
/*
* Its really a single leaf node, but it had
@@ -1031,7 +1031,7 @@ restart:
if (args->rmtblkno > 0) {
error = xfs_attr_rmtval_set(args);
if (error)
- return(error);
+ return error;
}
/*
@@ -1061,7 +1061,7 @@ restart:
if (args->rmtblkno) {
error = xfs_attr_rmtval_remove(args);
if (error)
- return(error);
+ return error;
}
/*
@@ -1134,8 +1134,8 @@ out:
if (state)
xfs_da_state_free(state);
if (error)
- return(error);
- return(retval);
+ return error;
+ return retval;
}
/*
@@ -1168,7 +1168,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
* Search to see if name exists, and get back a pointer to it.
*/
error = xfs_da3_node_lookup_int(state, &retval);
- if (error || (retval != EEXIST)) {
+ if (error || (retval != -EEXIST)) {
if (error == 0)
error = retval;
goto out;
@@ -1297,7 +1297,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
out:
xfs_da_state_free(state);
- return(error);
+ return error;
}
/*
@@ -1345,7 +1345,7 @@ xfs_attr_fillstate(xfs_da_state_t *state)
}
}
- return(0);
+ return 0;
}
/*
@@ -1376,7 +1376,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
blk->blkno, blk->disk_blkno,
&blk->bp, XFS_ATTR_FORK);
if (error)
- return(error);
+ return error;
} else {
blk->bp = NULL;
}
@@ -1395,13 +1395,13 @@ xfs_attr_refillstate(xfs_da_state_t *state)
blk->blkno, blk->disk_blkno,
&blk->bp, XFS_ATTR_FORK);
if (error)
- return(error);
+ return error;
} else {
blk->bp = NULL;
}
}
- return(0);
+ return 0;
}
/*
@@ -1431,7 +1431,7 @@ xfs_attr_node_get(xfs_da_args_t *args)
error = xfs_da3_node_lookup_int(state, &retval);
if (error) {
retval = error;
- } else if (retval == EEXIST) {
+ } else if (retval == -EEXIST) {
blk = &state->path.blk[ state->path.active-1 ];
ASSERT(blk->bp != NULL);
ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
@@ -1455,5 +1455,5 @@ xfs_attr_node_get(xfs_da_args_t *args)
}
xfs_da_state_free(state);
- return(retval);
+ return retval;
}
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 28712d29e43c..b1f73dbbf3d8 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -214,7 +214,7 @@ xfs_attr3_leaf_write_verify(
struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
if (!xfs_attr3_leaf_verify(bp)) {
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
xfs_verifier_error(bp);
return;
}
@@ -242,9 +242,9 @@ xfs_attr3_leaf_read_verify(
if (xfs_sb_version_hascrc(&mp->m_sb) &&
!xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF))
- xfs_buf_ioerror(bp, EFSBADCRC);
+ xfs_buf_ioerror(bp, -EFSBADCRC);
else if (!xfs_attr3_leaf_verify(bp))
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
if (bp->b_error)
xfs_verifier_error(bp);
@@ -547,7 +547,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
break;
}
if (i == end)
- return(XFS_ERROR(ENOATTR));
+ return -ENOATTR;
/*
* Fix up the attribute fork data, covering the hole
@@ -582,7 +582,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
xfs_sbversion_add_attr2(mp, args->trans);
- return(0);
+ return 0;
}
/*
@@ -611,9 +611,9 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
continue;
if (!xfs_attr_namesp_match(args->flags, sfe->flags))
continue;
- return(XFS_ERROR(EEXIST));
+ return -EEXIST;
}
- return(XFS_ERROR(ENOATTR));
+ return -ENOATTR;
}
/*
@@ -640,18 +640,18 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args)
continue;
if (args->flags & ATTR_KERNOVAL) {
args->valuelen = sfe->valuelen;
- return(XFS_ERROR(EEXIST));
+ return -EEXIST;
}
if (args->valuelen < sfe->valuelen) {
args->valuelen = sfe->valuelen;
- return(XFS_ERROR(ERANGE));
+ return -ERANGE;
}
args->valuelen = sfe->valuelen;
memcpy(args->value, &sfe->nameval[args->namelen],
args->valuelen);
- return(XFS_ERROR(EEXIST));
+ return -EEXIST;
}
- return(XFS_ERROR(ENOATTR));
+ return -ENOATTR;
}
/*
@@ -691,7 +691,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
* If we hit an IO error middle of the transaction inside
* grow_inode(), we may have inconsistent data. Bail out.
*/
- if (error == EIO)
+ if (error == -EIO)
goto out;
xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */
memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */
@@ -730,9 +730,9 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
sfe->namelen);
nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */
- ASSERT(error == ENOATTR);
+ ASSERT(error == -ENOATTR);
error = xfs_attr3_leaf_add(bp, &nargs);
- ASSERT(error != ENOSPC);
+ ASSERT(error != -ENOSPC);
if (error)
goto out;
sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
@@ -741,7 +741,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
out:
kmem_free(tmpbuffer);
- return(error);
+ return error;
}
/*
@@ -769,12 +769,12 @@ xfs_attr_shortform_allfit(
if (entry->flags & XFS_ATTR_INCOMPLETE)
continue; /* don't copy partial entries */
if (!(entry->flags & XFS_ATTR_LOCAL))
- return(0);
+ return 0;
name_loc = xfs_attr3_leaf_name_local(leaf, i);
if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
- return(0);
+ return 0;
if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
- return(0);
+ return 0;
bytes += sizeof(struct xfs_attr_sf_entry) - 1
+ name_loc->namelen
+ be16_to_cpu(name_loc->valuelen);
@@ -809,7 +809,7 @@ xfs_attr3_leaf_to_shortform(
tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
if (!tmpbuffer)
- return ENOMEM;
+ return -ENOMEM;
memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
@@ -1017,10 +1017,10 @@ xfs_attr3_leaf_split(
ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC);
error = xfs_da_grow_inode(state->args, &blkno);
if (error)
- return(error);
+ return error;
error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp);
if (error)
- return(error);
+ return error;
newblk->blkno = blkno;
newblk->magic = XFS_ATTR_LEAF_MAGIC;
@@ -1031,7 +1031,7 @@ xfs_attr3_leaf_split(
xfs_attr3_leaf_rebalance(state, oldblk, newblk);
error = xfs_da3_blk_link(state, oldblk, newblk);
if (error)
- return(error);
+ return error;
/*
* Save info on "old" attribute for "atomic rename" ops, leaf_add()
@@ -1053,7 +1053,7 @@ xfs_attr3_leaf_split(
*/
oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL);
newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL);
- return(error);
+ return error;
}
/*
@@ -1108,7 +1108,7 @@ xfs_attr3_leaf_add(
* no good and we should just give up.
*/
if (!ichdr.holes && sum < entsize)
- return XFS_ERROR(ENOSPC);
+ return -ENOSPC;
/*
* Compact the entries to coalesce free space.
@@ -1121,7 +1121,7 @@ xfs_attr3_leaf_add(
* free region, in freemap[0]. If it is not big enough, give up.
*/
if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) {
- tmp = ENOSPC;
+ tmp = -ENOSPC;
goto out_log_hdr;
}
@@ -1692,7 +1692,7 @@ xfs_attr3_leaf_toosmall(
ichdr.usedbytes;
if (bytes > (state->args->geo->blksize >> 1)) {
*action = 0; /* blk over 50%, don't try to join */
- return(0);
+ return 0;
}
/*
@@ -1711,7 +1711,7 @@ xfs_attr3_leaf_toosmall(
error = xfs_da3_path_shift(state, &state->altpath, forward,
0, &retval);
if (error)
- return(error);
+ return error;
if (retval) {
*action = 0;
} else {
@@ -1740,7 +1740,7 @@ xfs_attr3_leaf_toosmall(
error = xfs_attr3_leaf_read(state->args->trans, state->args->dp,
blkno, -1, &bp);
if (error)
- return(error);
+ return error;
xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr);
@@ -1757,7 +1757,7 @@ xfs_attr3_leaf_toosmall(
}
if (i >= 2) {
*action = 0;
- return(0);
+ return 0;
}
/*
@@ -1773,13 +1773,13 @@ xfs_attr3_leaf_toosmall(
0, &retval);
}
if (error)
- return(error);
+ return error;
if (retval) {
*action = 0;
} else {
*action = 1;
}
- return(0);
+ return 0;
}
/*
@@ -2123,7 +2123,7 @@ xfs_attr3_leaf_lookup_int(
}
if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) {
args->index = probe;
- return XFS_ERROR(ENOATTR);
+ return -ENOATTR;
}
/*
@@ -2152,7 +2152,7 @@ xfs_attr3_leaf_lookup_int(
if (!xfs_attr_namesp_match(args->flags, entry->flags))
continue;
args->index = probe;
- return XFS_ERROR(EEXIST);
+ return -EEXIST;
} else {
name_rmt = xfs_attr3_leaf_name_remote(leaf, probe);
if (name_rmt->namelen != args->namelen)
@@ -2168,11 +2168,11 @@ xfs_attr3_leaf_lookup_int(
args->rmtblkcnt = xfs_attr3_rmt_blocks(
args->dp->i_mount,
args->rmtvaluelen);
- return XFS_ERROR(EEXIST);
+ return -EEXIST;
}
}
args->index = probe;
- return XFS_ERROR(ENOATTR);
+ return -ENOATTR;
}
/*
@@ -2208,7 +2208,7 @@ xfs_attr3_leaf_getvalue(
}
if (args->valuelen < valuelen) {
args->valuelen = valuelen;
- return XFS_ERROR(ERANGE);
+ return -ERANGE;
}
args->valuelen = valuelen;
memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
@@ -2226,7 +2226,7 @@ xfs_attr3_leaf_getvalue(
}
if (args->valuelen < args->rmtvaluelen) {
args->valuelen = args->rmtvaluelen;
- return XFS_ERROR(ERANGE);
+ return -ERANGE;
}
args->valuelen = args->rmtvaluelen;
}
@@ -2481,7 +2481,7 @@ xfs_attr3_leaf_clearflag(
*/
error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
if (error)
- return(error);
+ return error;
leaf = bp->b_addr;
entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
@@ -2548,7 +2548,7 @@ xfs_attr3_leaf_setflag(
*/
error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
if (error)
- return(error);
+ return error;
leaf = bp->b_addr;
#ifdef DEBUG
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index e2929da7c3ba..e2929da7c3ba 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index b5adfecbb8ee..7510ab8058a4 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -138,11 +138,11 @@ xfs_attr3_rmt_read_verify(
while (len > 0) {
if (!xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) {
- xfs_buf_ioerror(bp, EFSBADCRC);
+ xfs_buf_ioerror(bp, -EFSBADCRC);
break;
}
if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
break;
}
len -= blksize;
@@ -178,7 +178,7 @@ xfs_attr3_rmt_write_verify(
while (len > 0) {
if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
xfs_verifier_error(bp);
return;
}
@@ -257,7 +257,7 @@ xfs_attr_rmtval_copyout(
xfs_alert(mp,
"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)",
bno, *offset, byte_cnt, ino);
- return EFSCORRUPTED;
+ return -EFSCORRUPTED;
}
hdr_size = sizeof(struct xfs_attr3_rmt_hdr);
}
@@ -452,7 +452,7 @@ xfs_attr_rmtval_set(
ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
- return(error);
+ return error;
}
/*
@@ -473,7 +473,7 @@ xfs_attr_rmtval_set(
*/
error = xfs_trans_roll(&args->trans, dp);
if (error)
- return (error);
+ return error;
}
/*
@@ -498,7 +498,7 @@ xfs_attr_rmtval_set(
blkcnt, &map, &nmap,
XFS_BMAPI_ATTRFORK);
if (error)
- return(error);
+ return error;
ASSERT(nmap == 1);
ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
(map.br_startblock != HOLESTARTBLOCK));
@@ -508,7 +508,7 @@ xfs_attr_rmtval_set(
bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0);
if (!bp)
- return ENOMEM;
+ return -ENOMEM;
bp->b_ops = &xfs_attr3_rmt_buf_ops;
xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset,
@@ -563,7 +563,7 @@ xfs_attr_rmtval_remove(
error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK);
if (error)
- return(error);
+ return error;
ASSERT(nmap == 1);
ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
(map.br_startblock != HOLESTARTBLOCK));
@@ -622,7 +622,7 @@ xfs_attr_rmtval_remove(
*/
error = xfs_trans_roll(&args->trans, args->dp);
if (error)
- return (error);
+ return error;
}
- return(0);
+ return 0;
}
diff --git a/fs/xfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h
index 5a9acfa156d7..5a9acfa156d7 100644
--- a/fs/xfs/xfs_attr_remote.h
+++ b/fs/xfs/libxfs/xfs_attr_remote.h
diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h
index 919756e3ba53..919756e3ba53 100644
--- a/fs/xfs/xfs_attr_sf.h
+++ b/fs/xfs/libxfs/xfs_attr_sf.h
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/libxfs/xfs_bit.h
index e1649c0d3e02..e1649c0d3e02 100644
--- a/fs/xfs/xfs_bit.h
+++ b/fs/xfs/libxfs/xfs_bit.h
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 75c3fe5f3d9d..79c981984dca 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -392,7 +392,7 @@ xfs_bmap_check_leaf_extents(
pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
bno = be64_to_cpu(*pp);
- ASSERT(bno != NULLDFSBNO);
+ ASSERT(bno != NULLFSBLOCK);
ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
@@ -1033,7 +1033,7 @@ xfs_bmap_add_attrfork_btree(
goto error0;
if (stat == 0) {
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
- return XFS_ERROR(ENOSPC);
+ return -ENOSPC;
}
*firstblock = cur->bc_private.b.firstblock;
cur->bc_private.b.allocated = 0;
@@ -1115,7 +1115,7 @@ xfs_bmap_add_attrfork_local(
/* should only be called for types that support local format data */
ASSERT(0);
- return EFSCORRUPTED;
+ return -EFSCORRUPTED;
}
/*
@@ -1192,7 +1192,7 @@ xfs_bmap_add_attrfork(
break;
default:
ASSERT(0);
- error = XFS_ERROR(EINVAL);
+ error = -EINVAL;
goto trans_cancel;
}
@@ -1299,7 +1299,7 @@ xfs_bmap_read_extents(
ASSERT(level > 0);
pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
bno = be64_to_cpu(*pp);
- ASSERT(bno != NULLDFSBNO);
+ ASSERT(bno != NULLFSBLOCK);
ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
/*
@@ -1399,7 +1399,7 @@ xfs_bmap_read_extents(
return 0;
error0:
xfs_trans_brelse(tp, bp);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
@@ -1429,11 +1429,7 @@ xfs_bmap_search_multi_extents(
gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL;
gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL;
gotp->br_state = XFS_EXT_INVALID;
-#if XFS_BIG_BLKNOS
gotp->br_startblock = 0xffffa5a5a5a5a5a5LL;
-#else
- gotp->br_startblock = 0xffffa5a5;
-#endif
prevp->br_startoff = NULLFILEOFF;
ep = xfs_iext_bno_to_ext(ifp, bno, &lastx);
@@ -1576,7 +1572,7 @@ xfs_bmap_last_before(
if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
- return XFS_ERROR(EIO);
+ return -EIO;
if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
*last_block = 0;
return 0;
@@ -1690,7 +1686,7 @@ xfs_bmap_last_offset(
if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
- return XFS_ERROR(EIO);
+ return -EIO;
error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
if (error || is_empty)
@@ -3323,7 +3319,7 @@ xfs_bmap_extsize_align(
if (orig_off < align_off ||
orig_end > align_off + align_alen ||
align_alen - temp < orig_alen)
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
/*
* Try to fix it by moving the start up.
*/
@@ -3348,7 +3344,7 @@ xfs_bmap_extsize_align(
* Result doesn't cover the request, fail it.
*/
if (orig_off < align_off || orig_end > align_off + align_alen)
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
} else {
ASSERT(orig_off >= align_off);
ASSERT(orig_end <= align_off + align_alen);
@@ -4051,11 +4047,11 @@ xfs_bmapi_read(
XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
+ return -EIO;
XFS_STATS_INC(xs_blk_mapr);
@@ -4246,11 +4242,11 @@ xfs_bmapi_delay(
XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
+ return -EIO;
XFS_STATS_INC(xs_blk_mapw);
@@ -4469,7 +4465,7 @@ xfs_bmapi_convert_unwritten(
* so generate another request.
*/
if (mval->br_blockcount < len)
- return EAGAIN;
+ return -EAGAIN;
return 0;
}
@@ -4540,11 +4536,11 @@ xfs_bmapi_write(
XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
+ return -EIO;
ifp = XFS_IFORK_PTR(ip, whichfork);
@@ -4620,7 +4616,7 @@ xfs_bmapi_write(
/* Execute unwritten extent conversion if necessary */
error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags);
- if (error == EAGAIN)
+ if (error == -EAGAIN)
continue;
if (error)
goto error0;
@@ -4922,7 +4918,7 @@ xfs_bmap_del_extent(
goto done;
cur->bc_rec.b = new;
error = xfs_btree_insert(cur, &i);
- if (error && error != ENOSPC)
+ if (error && error != -ENOSPC)
goto done;
/*
* If get no-space back from btree insert,
@@ -4930,7 +4926,7 @@ xfs_bmap_del_extent(
* block reservation.
* Fix up our state and return the error.
*/
- if (error == ENOSPC) {
+ if (error == -ENOSPC) {
/*
* Reset the cursor, don't trust
* it after any insert operation.
@@ -4958,7 +4954,7 @@ xfs_bmap_del_extent(
xfs_bmbt_set_blockcount(ep,
got.br_blockcount);
flags = 0;
- error = XFS_ERROR(ENOSPC);
+ error = -ENOSPC;
goto done;
}
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
@@ -5076,11 +5072,11 @@ xfs_bunmapi(
XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW,
ip->i_mount);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
mp = ip->i_mount;
if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
+ return -EIO;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(len > 0);
@@ -5325,7 +5321,7 @@ xfs_bunmapi(
del.br_startoff > got.br_startoff &&
del.br_startoff + del.br_blockcount <
got.br_startoff + got.br_blockcount) {
- error = XFS_ERROR(ENOSPC);
+ error = -ENOSPC;
goto error0;
}
error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
@@ -5408,39 +5404,237 @@ error0:
}
/*
+ * Determine whether an extent shift can be accomplished by a merge with the
+ * extent that precedes the target hole of the shift.
+ */
+STATIC bool
+xfs_bmse_can_merge(
+ struct xfs_bmbt_irec *left, /* preceding extent */
+ struct xfs_bmbt_irec *got, /* current extent to shift */
+ xfs_fileoff_t shift) /* shift fsb */
+{
+ xfs_fileoff_t startoff;
+
+ startoff = got->br_startoff - shift;
+
+ /*
+ * The extent, once shifted, must be adjacent in-file and on-disk with
+ * the preceding extent.
+ */
+ if ((left->br_startoff + left->br_blockcount != startoff) ||
+ (left->br_startblock + left->br_blockcount != got->br_startblock) ||
+ (left->br_state != got->br_state) ||
+ (left->br_blockcount + got->br_blockcount > MAXEXTLEN))
+ return false;
+
+ return true;
+}
+
+/*
+ * A bmap extent shift adjusts the file offset of an extent to fill a preceding
+ * hole in the file. If an extent shift would result in the extent being fully
+ * adjacent to the extent that currently precedes the hole, we can merge with
+ * the preceding extent rather than do the shift.
+ *
+ * This function assumes the caller has verified a shift-by-merge is possible
+ * with the provided extents via xfs_bmse_can_merge().
+ */
+STATIC int
+xfs_bmse_merge(
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_fileoff_t shift, /* shift fsb */
+ int current_ext, /* idx of gotp */
+ struct xfs_bmbt_rec_host *gotp, /* extent to shift */
+ struct xfs_bmbt_rec_host *leftp, /* preceding extent */
+ struct xfs_btree_cur *cur,
+ int *logflags) /* output */
+{
+ struct xfs_ifork *ifp;
+ struct xfs_bmbt_irec got;
+ struct xfs_bmbt_irec left;
+ xfs_filblks_t blockcount;
+ int error, i;
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ xfs_bmbt_get_all(gotp, &got);
+ xfs_bmbt_get_all(leftp, &left);
+ blockcount = left.br_blockcount + got.br_blockcount;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(xfs_bmse_can_merge(&left, &got, shift));
+
+ /*
+ * Merge the in-core extents. Note that the host record pointers and
+ * current_ext index are invalid once the extent has been removed via
+ * xfs_iext_remove().
+ */
+ xfs_bmbt_set_blockcount(leftp, blockcount);
+ xfs_iext_remove(ip, current_ext, 1, 0);
+
+ /*
+ * Update the on-disk extent count, the btree if necessary and log the
+ * inode.
+ */
+ XFS_IFORK_NEXT_SET(ip, whichfork,
+ XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+ *logflags |= XFS_ILOG_CORE;
+ if (!cur) {
+ *logflags |= XFS_ILOG_DEXT;
+ return 0;
+ }
+
+ /* lookup and remove the extent to merge */
+ error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock,
+ got.br_blockcount, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, out_error);
+
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, out_error);
+
+ /* lookup and update size of the previous extent */
+ error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock,
+ left.br_blockcount, &i);
+ if (error)
+ goto out_error;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, out_error);
+
+ left.br_blockcount = blockcount;
+
+ error = xfs_bmbt_update(cur, left.br_startoff, left.br_startblock,
+ left.br_blockcount, left.br_state);
+ if (error)
+ goto out_error;
+
+ return 0;
+
+out_error:
+ return error;
+}
+
+/*
+ * Shift a single extent.
+ */
+STATIC int
+xfs_bmse_shift_one(
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_fileoff_t offset_shift_fsb,
+ int *current_ext,
+ struct xfs_bmbt_rec_host *gotp,
+ struct xfs_btree_cur *cur,
+ int *logflags)
+{
+ struct xfs_ifork *ifp;
+ xfs_fileoff_t startoff;
+ struct xfs_bmbt_rec_host *leftp;
+ struct xfs_bmbt_irec got;
+ struct xfs_bmbt_irec left;
+ int error;
+ int i;
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+
+ xfs_bmbt_get_all(gotp, &got);
+ startoff = got.br_startoff - offset_shift_fsb;
+
+ /* delalloc extents should be prevented by caller */
+ XFS_WANT_CORRUPTED_GOTO(!isnullstartblock(got.br_startblock),
+ out_error);
+
+ /*
+ * If this is the first extent in the file, make sure there's enough
+ * room at the start of the file and jump right to the shift as there's
+ * no left extent to merge.
+ */
+ if (*current_ext == 0) {
+ if (got.br_startoff < offset_shift_fsb)
+ return -EINVAL;
+ goto shift_extent;
+ }
+
+ /* grab the left extent and check for a large enough hole */
+ leftp = xfs_iext_get_ext(ifp, *current_ext - 1);
+ xfs_bmbt_get_all(leftp, &left);
+
+ if (startoff < left.br_startoff + left.br_blockcount)
+ return -EINVAL;
+
+ /* check whether to merge the extent or shift it down */
+ if (!xfs_bmse_can_merge(&left, &got, offset_shift_fsb))
+ goto shift_extent;
+
+ return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, *current_ext,
+ gotp, leftp, cur, logflags);
+
+shift_extent:
+ /*
+ * Increment the extent index for the next iteration, update the start
+ * offset of the in-core extent and update the btree if applicable.
+ */
+ (*current_ext)++;
+ xfs_bmbt_set_startoff(gotp, startoff);
+ *logflags |= XFS_ILOG_CORE;
+ if (!cur) {
+ *logflags |= XFS_ILOG_DEXT;
+ return 0;
+ }
+
+ error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock,
+ got.br_blockcount, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, out_error);
+
+ got.br_startoff = startoff;
+ error = xfs_bmbt_update(cur, got.br_startoff, got.br_startblock,
+ got.br_blockcount, got.br_state);
+ if (error)
+ return error;
+
+ return 0;
+
+out_error:
+ return error;
+}
+
+/*
* Shift extent records to the left to cover a hole.
*
- * The maximum number of extents to be shifted in a single operation
- * is @num_exts, and @current_ext keeps track of the current extent
- * index we have shifted. @offset_shift_fsb is the length by which each
- * extent is shifted. If there is no hole to shift the extents
- * into, this will be considered invalid operation and we abort immediately.
+ * The maximum number of extents to be shifted in a single operation is
+ * @num_exts. @start_fsb specifies the file offset to start the shift and the
+ * file offset where we've left off is returned in @next_fsb. @offset_shift_fsb
+ * is the length by which each extent is shifted. If there is no hole to shift
+ * the extents into, this will be considered invalid operation and we abort
+ * immediately.
*/
int
xfs_bmap_shift_extents(
struct xfs_trans *tp,
struct xfs_inode *ip,
- int *done,
xfs_fileoff_t start_fsb,
xfs_fileoff_t offset_shift_fsb,
- xfs_extnum_t *current_ext,
+ int *done,
+ xfs_fileoff_t *next_fsb,
xfs_fsblock_t *firstblock,
struct xfs_bmap_free *flist,
int num_exts)
{
- struct xfs_btree_cur *cur;
+ struct xfs_btree_cur *cur = NULL;
struct xfs_bmbt_rec_host *gotp;
struct xfs_bmbt_irec got;
- struct xfs_bmbt_irec left;
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp;
xfs_extnum_t nexts = 0;
- xfs_fileoff_t startoff;
+ xfs_extnum_t current_ext;
int error = 0;
- int i;
int whichfork = XFS_DATA_FORK;
- int logflags;
- xfs_filblks_t blockcount = 0;
+ int logflags = 0;
int total_extents;
if (unlikely(XFS_TEST_ERROR(
@@ -5449,13 +5643,14 @@ xfs_bmap_shift_extents(
mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
XFS_ERROR_REPORT("xfs_bmap_shift_extents",
XFS_ERRLEVEL_LOW, mp);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
+ return -EIO;
- ASSERT(current_ext != NULL);
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ifp = XFS_IFORK_PTR(ip, whichfork);
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
@@ -5465,142 +5660,62 @@ xfs_bmap_shift_extents(
return error;
}
- /*
- * If *current_ext is 0, we would need to lookup the extent
- * from where we would start shifting and store it in gotp.
- */
- if (!*current_ext) {
- gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
- /*
- * gotp can be null in 2 cases: 1) if there are no extents
- * or 2) start_fsb lies in a hole beyond which there are
- * no extents. Either way, we are done.
- */
- if (!gotp) {
- *done = 1;
- return 0;
- }
- }
-
- /* We are going to change core inode */
- logflags = XFS_ILOG_CORE;
if (ifp->if_flags & XFS_IFBROOT) {
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
cur->bc_private.b.firstblock = *firstblock;
cur->bc_private.b.flist = flist;
cur->bc_private.b.flags = 0;
- } else {
- cur = NULL;
- logflags |= XFS_ILOG_DEXT;
+ }
+
+ /*
+ * Look up the extent index for the fsb where we start shifting. We can
+ * henceforth iterate with current_ext as extent list changes are locked
+ * out via ilock.
+ *
+ * gotp can be null in 2 cases: 1) if there are no extents or 2)
+ * start_fsb lies in a hole beyond which there are no extents. Either
+ * way, we are done.
+ */
+ gotp = xfs_iext_bno_to_ext(ifp, start_fsb, &current_ext);
+ if (!gotp) {
+ *done = 1;
+ goto del_cursor;
}
/*
* There may be delalloc extents in the data fork before the range we
- * are collapsing out, so we cannot
- * use the count of real extents here. Instead we have to calculate it
- * from the incore fork.
+ * are collapsing out, so we cannot use the count of real extents here.
+ * Instead we have to calculate it from the incore fork.
*/
total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
- while (nexts++ < num_exts && *current_ext < total_extents) {
-
- gotp = xfs_iext_get_ext(ifp, *current_ext);
- xfs_bmbt_get_all(gotp, &got);
- startoff = got.br_startoff - offset_shift_fsb;
-
- /*
- * Before shifting extent into hole, make sure that the hole
- * is large enough to accomodate the shift.
- */
- if (*current_ext) {
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
- *current_ext - 1), &left);
-
- if (startoff < left.br_startoff + left.br_blockcount)
- error = XFS_ERROR(EINVAL);
- } else if (offset_shift_fsb > got.br_startoff) {
- /*
- * When first extent is shifted, offset_shift_fsb
- * should be less than the stating offset of
- * the first extent.
- */
- error = XFS_ERROR(EINVAL);
- }
-
+ while (nexts++ < num_exts && current_ext < total_extents) {
+ error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb,
+ &current_ext, gotp, cur, &logflags);
if (error)
goto del_cursor;
- if (cur) {
- error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
- got.br_startblock,
- got.br_blockcount,
- &i);
- if (error)
- goto del_cursor;
- XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
- }
-
- /* Check if we can merge 2 adjacent extents */
- if (*current_ext &&
- left.br_startoff + left.br_blockcount == startoff &&
- left.br_startblock + left.br_blockcount ==
- got.br_startblock &&
- left.br_state == got.br_state &&
- left.br_blockcount + got.br_blockcount <= MAXEXTLEN) {
- blockcount = left.br_blockcount +
- got.br_blockcount;
- xfs_iext_remove(ip, *current_ext, 1, 0);
- if (cur) {
- error = xfs_btree_delete(cur, &i);
- if (error)
- goto del_cursor;
- XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
- }
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
- gotp = xfs_iext_get_ext(ifp, --*current_ext);
- xfs_bmbt_get_all(gotp, &got);
-
- /* Make cursor point to the extent we will update */
- if (cur) {
- error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
- got.br_startblock,
- got.br_blockcount,
- &i);
- if (error)
- goto del_cursor;
- XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
- }
-
- xfs_bmbt_set_blockcount(gotp, blockcount);
- got.br_blockcount = blockcount;
- } else {
- /* We have to update the startoff */
- xfs_bmbt_set_startoff(gotp, startoff);
- got.br_startoff = startoff;
- }
-
- if (cur) {
- error = xfs_bmbt_update(cur, got.br_startoff,
- got.br_startblock,
- got.br_blockcount,
- got.br_state);
- if (error)
- goto del_cursor;
- }
-
- (*current_ext)++;
+ /* update total extent count and grab the next record */
total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+ if (current_ext >= total_extents)
+ break;
+ gotp = xfs_iext_get_ext(ifp, current_ext);
}
/* Check if we are done */
- if (*current_ext == total_extents)
+ if (current_ext == total_extents) {
*done = 1;
+ } else if (next_fsb) {
+ xfs_bmbt_get_all(gotp, &got);
+ *next_fsb = got.br_startoff;
+ }
del_cursor:
if (cur)
xfs_btree_del_cursor(cur,
error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
- xfs_trans_log_inode(tp, ip, logflags);
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
+
return error;
}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index b879ca56a64c..44db6db86402 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -178,9 +178,8 @@ int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
xfs_extnum_t num);
uint xfs_default_attroffset(struct xfs_inode *ip);
int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
- int *done, xfs_fileoff_t start_fsb,
- xfs_fileoff_t offset_shift_fsb, xfs_extnum_t *current_ext,
- xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist,
- int num_exts);
+ xfs_fileoff_t start_fsb, xfs_fileoff_t offset_shift_fsb,
+ int *done, xfs_fileoff_t *next_fsb, xfs_fsblock_t *firstblock,
+ struct xfs_bmap_free *flist, int num_exts);
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 948836c4fd90..fba753308f31 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -111,23 +111,8 @@ __xfs_bmbt_get_all(
ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN));
s->br_startoff = ((xfs_fileoff_t)l0 &
xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
-#if XFS_BIG_BLKNOS
s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) |
(((xfs_fsblock_t)l1) >> 21);
-#else
-#ifdef DEBUG
- {
- xfs_dfsbno_t b;
-
- b = (((xfs_dfsbno_t)l0 & xfs_mask64lo(9)) << 43) |
- (((xfs_dfsbno_t)l1) >> 21);
- ASSERT((b >> 32) == 0 || isnulldstartblock(b));
- s->br_startblock = (xfs_fsblock_t)b;
- }
-#else /* !DEBUG */
- s->br_startblock = (xfs_fsblock_t)(((xfs_dfsbno_t)l1) >> 21);
-#endif /* DEBUG */
-#endif /* XFS_BIG_BLKNOS */
s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21));
/* This is xfs_extent_state() in-line */
if (ext_flag) {
@@ -163,21 +148,8 @@ xfs_fsblock_t
xfs_bmbt_get_startblock(
xfs_bmbt_rec_host_t *r)
{
-#if XFS_BIG_BLKNOS
return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) |
(((xfs_fsblock_t)r->l1) >> 21);
-#else
-#ifdef DEBUG
- xfs_dfsbno_t b;
-
- b = (((xfs_dfsbno_t)r->l0 & xfs_mask64lo(9)) << 43) |
- (((xfs_dfsbno_t)r->l1) >> 21);
- ASSERT((b >> 32) == 0 || isnulldstartblock(b));
- return (xfs_fsblock_t)b;
-#else /* !DEBUG */
- return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21);
-#endif /* DEBUG */
-#endif /* XFS_BIG_BLKNOS */
}
/*
@@ -241,7 +213,6 @@ xfs_bmbt_set_allf(
ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
-#if XFS_BIG_BLKNOS
ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
@@ -250,23 +221,6 @@ xfs_bmbt_set_allf(
r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
((xfs_bmbt_rec_base_t)blockcount &
(xfs_bmbt_rec_base_t)xfs_mask64lo(21));
-#else /* !XFS_BIG_BLKNOS */
- if (isnullstartblock(startblock)) {
- r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
- ((xfs_bmbt_rec_base_t)startoff << 9) |
- (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
- r->l1 = xfs_mask64hi(11) |
- ((xfs_bmbt_rec_base_t)startblock << 21) |
- ((xfs_bmbt_rec_base_t)blockcount &
- (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
- } else {
- r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
- ((xfs_bmbt_rec_base_t)startoff << 9);
- r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
- ((xfs_bmbt_rec_base_t)blockcount &
- (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
- }
-#endif /* XFS_BIG_BLKNOS */
}
/*
@@ -298,8 +252,6 @@ xfs_bmbt_disk_set_allf(
ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
-
-#if XFS_BIG_BLKNOS
ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
r->l0 = cpu_to_be64(
@@ -310,26 +262,6 @@ xfs_bmbt_disk_set_allf(
((xfs_bmbt_rec_base_t)startblock << 21) |
((xfs_bmbt_rec_base_t)blockcount &
(xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
-#else /* !XFS_BIG_BLKNOS */
- if (isnullstartblock(startblock)) {
- r->l0 = cpu_to_be64(
- ((xfs_bmbt_rec_base_t)extent_flag << 63) |
- ((xfs_bmbt_rec_base_t)startoff << 9) |
- (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
- r->l1 = cpu_to_be64(xfs_mask64hi(11) |
- ((xfs_bmbt_rec_base_t)startblock << 21) |
- ((xfs_bmbt_rec_base_t)blockcount &
- (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
- } else {
- r->l0 = cpu_to_be64(
- ((xfs_bmbt_rec_base_t)extent_flag << 63) |
- ((xfs_bmbt_rec_base_t)startoff << 9));
- r->l1 = cpu_to_be64(
- ((xfs_bmbt_rec_base_t)startblock << 21) |
- ((xfs_bmbt_rec_base_t)blockcount &
- (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
- }
-#endif /* XFS_BIG_BLKNOS */
}
/*
@@ -365,24 +297,11 @@ xfs_bmbt_set_startblock(
xfs_bmbt_rec_host_t *r,
xfs_fsblock_t v)
{
-#if XFS_BIG_BLKNOS
ASSERT((v & xfs_mask64hi(12)) == 0);
r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) |
(xfs_bmbt_rec_base_t)(v >> 43);
r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) |
(xfs_bmbt_rec_base_t)(v << 21);
-#else /* !XFS_BIG_BLKNOS */
- if (isnullstartblock(v)) {
- r->l0 |= (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
- r->l1 = (xfs_bmbt_rec_base_t)xfs_mask64hi(11) |
- ((xfs_bmbt_rec_base_t)v << 21) |
- (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
- } else {
- r->l0 &= ~(xfs_bmbt_rec_base_t)xfs_mask64lo(9);
- r->l1 = ((xfs_bmbt_rec_base_t)v << 21) |
- (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
- }
-#endif /* XFS_BIG_BLKNOS */
}
/*
@@ -438,8 +357,8 @@ xfs_bmbt_to_bmdr(
cpu_to_be64(XFS_BUF_DADDR_NULL));
} else
ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC));
- ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO));
- ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO));
+ ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK));
+ ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK));
ASSERT(rblock->bb_level != 0);
dblock->bb_level = rblock->bb_level;
dblock->bb_numrecs = rblock->bb_numrecs;
@@ -554,7 +473,7 @@ xfs_bmbt_alloc_block(
args.minlen = args.maxlen = args.prod = 1;
args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
- error = XFS_ERROR(ENOSPC);
+ error = -ENOSPC;
goto error0;
}
error = xfs_alloc_vextent(&args);
@@ -763,11 +682,11 @@ xfs_bmbt_verify(
/* sibling pointer verification */
if (!block->bb_u.l.bb_leftsib ||
- (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLDFSBNO) &&
+ (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
!XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))))
return false;
if (!block->bb_u.l.bb_rightsib ||
- (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLDFSBNO) &&
+ (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
!XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))))
return false;
@@ -779,9 +698,9 @@ xfs_bmbt_read_verify(
struct xfs_buf *bp)
{
if (!xfs_btree_lblock_verify_crc(bp))
- xfs_buf_ioerror(bp, EFSBADCRC);
+ xfs_buf_ioerror(bp, -EFSBADCRC);
else if (!xfs_bmbt_verify(bp))
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
if (bp->b_error) {
trace_xfs_btree_corrupt(bp, _RET_IP_);
@@ -795,7 +714,7 @@ xfs_bmbt_write_verify(
{
if (!xfs_bmbt_verify(bp)) {
trace_xfs_btree_corrupt(bp, _RET_IP_);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
xfs_verifier_error(bp);
return;
}
@@ -959,7 +878,7 @@ xfs_bmbt_change_owner(
cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
if (!cur)
- return ENOMEM;
+ return -ENOMEM;
error = xfs_btree_change_owner(cur, new_owner, buffer_list);
xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 819a8a4dee95..819a8a4dee95 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index cf893bc1e373..8fe6a93ff473 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -78,11 +78,11 @@ xfs_btree_check_lblock(
be16_to_cpu(block->bb_numrecs) <=
cur->bc_ops->get_maxrecs(cur, level) &&
block->bb_u.l.bb_leftsib &&
- (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
+ (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK) ||
XFS_FSB_SANITY_CHECK(mp,
be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
block->bb_u.l.bb_rightsib &&
- (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
+ (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK) ||
XFS_FSB_SANITY_CHECK(mp,
be64_to_cpu(block->bb_u.l.bb_rightsib)));
@@ -92,7 +92,7 @@ xfs_btree_check_lblock(
if (bp)
trace_xfs_btree_corrupt(bp, _RET_IP_);
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
return 0;
}
@@ -140,7 +140,7 @@ xfs_btree_check_sblock(
if (bp)
trace_xfs_btree_corrupt(bp, _RET_IP_);
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
return 0;
}
@@ -167,12 +167,12 @@ xfs_btree_check_block(
int /* error (0 or EFSCORRUPTED) */
xfs_btree_check_lptr(
struct xfs_btree_cur *cur, /* btree cursor */
- xfs_dfsbno_t bno, /* btree block disk address */
+ xfs_fsblock_t bno, /* btree block disk address */
int level) /* btree block level */
{
XFS_WANT_CORRUPTED_RETURN(
level > 0 &&
- bno != NULLDFSBNO &&
+ bno != NULLFSBLOCK &&
XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
return 0;
}
@@ -595,7 +595,7 @@ xfs_btree_islastblock(
block = xfs_btree_get_block(cur, level, &bp);
xfs_btree_check_block(cur, block, level, bp);
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
- return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO);
+ return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
else
return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
}
@@ -771,16 +771,16 @@ xfs_btree_readahead_lblock(
struct xfs_btree_block *block)
{
int rval = 0;
- xfs_dfsbno_t left = be64_to_cpu(block->bb_u.l.bb_leftsib);
- xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
+ xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib);
+ xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
- if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
+ if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
xfs_btree_reada_bufl(cur->bc_mp, left, 1,
cur->bc_ops->buf_ops);
rval++;
}
- if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
+ if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
xfs_btree_reada_bufl(cur->bc_mp, right, 1,
cur->bc_ops->buf_ops);
rval++;
@@ -852,7 +852,7 @@ xfs_btree_ptr_to_daddr(
union xfs_btree_ptr *ptr)
{
if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
- ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
+ ASSERT(ptr->l != cpu_to_be64(NULLFSBLOCK));
return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
} else {
@@ -900,9 +900,9 @@ xfs_btree_setbuf(
b = XFS_BUF_TO_BLOCK(bp);
if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
- if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO))
+ if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
- if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO))
+ if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
} else {
if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
@@ -918,7 +918,7 @@ xfs_btree_ptr_is_null(
union xfs_btree_ptr *ptr)
{
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
- return ptr->l == cpu_to_be64(NULLDFSBNO);
+ return ptr->l == cpu_to_be64(NULLFSBLOCK);
else
return ptr->s == cpu_to_be32(NULLAGBLOCK);
}
@@ -929,7 +929,7 @@ xfs_btree_set_ptr_null(
union xfs_btree_ptr *ptr)
{
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
- ptr->l = cpu_to_be64(NULLDFSBNO);
+ ptr->l = cpu_to_be64(NULLFSBLOCK);
else
ptr->s = cpu_to_be32(NULLAGBLOCK);
}
@@ -997,8 +997,8 @@ xfs_btree_init_block_int(
buf->bb_numrecs = cpu_to_be16(numrecs);
if (flags & XFS_BTREE_LONG_PTRS) {
- buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
- buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
+ buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
+ buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
if (flags & XFS_BTREE_CRC_BLOCKS) {
buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
buf->bb_u.l.bb_owner = cpu_to_be64(owner);
@@ -1140,7 +1140,7 @@ xfs_btree_get_buf_block(
mp->m_bsize, flags);
if (!*bpp)
- return ENOMEM;
+ return -ENOMEM;
(*bpp)->b_ops = cur->bc_ops->buf_ops;
*block = XFS_BUF_TO_BLOCK(*bpp);
@@ -1498,7 +1498,7 @@ xfs_btree_increment(
if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
goto out0;
ASSERT(0);
- error = EFSCORRUPTED;
+ error = -EFSCORRUPTED;
goto error0;
}
ASSERT(lev < cur->bc_nlevels);
@@ -1597,7 +1597,7 @@ xfs_btree_decrement(
if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
goto out0;
ASSERT(0);
- error = EFSCORRUPTED;
+ error = -EFSCORRUPTED;
goto error0;
}
ASSERT(lev < cur->bc_nlevels);
@@ -4018,7 +4018,7 @@ xfs_btree_block_change_owner(
/* now read rh sibling block for next iteration */
xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
if (xfs_btree_ptr_is_null(cur, &rptr))
- return ENOENT;
+ return -ENOENT;
return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
}
@@ -4061,7 +4061,7 @@ xfs_btree_change_owner(
buffer_list);
} while (!error);
- if (error != ENOENT)
+ if (error != -ENOENT)
return error;
}
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index a04b69422f67..8f18bab73ea5 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -258,7 +258,7 @@ xfs_btree_check_block(
int /* error (0 or EFSCORRUPTED) */
xfs_btree_check_lptr(
struct xfs_btree_cur *cur, /* btree cursor */
- xfs_dfsbno_t ptr, /* btree block disk address */
+ xfs_fsblock_t ptr, /* btree block disk address */
int level); /* btree block level */
/*
diff --git a/fs/xfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h
index fad1676ad8cd..fad1676ad8cd 100644
--- a/fs/xfs/xfs_cksum.h
+++ b/fs/xfs/libxfs/xfs_cksum.h
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index a514ab616650..fd827530afec 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -185,7 +185,7 @@ xfs_da3_node_write_verify(
struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
if (!xfs_da3_node_verify(bp)) {
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
xfs_verifier_error(bp);
return;
}
@@ -214,13 +214,13 @@ xfs_da3_node_read_verify(
switch (be16_to_cpu(info->magic)) {
case XFS_DA3_NODE_MAGIC:
if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) {
- xfs_buf_ioerror(bp, EFSBADCRC);
+ xfs_buf_ioerror(bp, -EFSBADCRC);
break;
}
/* fall through */
case XFS_DA_NODE_MAGIC:
if (!xfs_da3_node_verify(bp)) {
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
break;
}
return;
@@ -315,7 +315,7 @@ xfs_da3_node_create(
error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork);
if (error)
- return(error);
+ return error;
bp->b_ops = &xfs_da3_node_buf_ops;
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
node = bp->b_addr;
@@ -337,7 +337,7 @@ xfs_da3_node_create(
XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
*bpp = bp;
- return(0);
+ return 0;
}
/*
@@ -385,8 +385,8 @@ xfs_da3_split(
switch (oldblk->magic) {
case XFS_ATTR_LEAF_MAGIC:
error = xfs_attr3_leaf_split(state, oldblk, newblk);
- if ((error != 0) && (error != ENOSPC)) {
- return(error); /* GROT: attr is inconsistent */
+ if ((error != 0) && (error != -ENOSPC)) {
+ return error; /* GROT: attr is inconsistent */
}
if (!error) {
addblk = newblk;
@@ -408,7 +408,7 @@ xfs_da3_split(
&state->extrablk);
}
if (error)
- return(error); /* GROT: attr inconsistent */
+ return error; /* GROT: attr inconsistent */
addblk = newblk;
break;
case XFS_DIR2_LEAFN_MAGIC:
@@ -422,7 +422,7 @@ xfs_da3_split(
max - i, &action);
addblk->bp = NULL;
if (error)
- return(error); /* GROT: dir is inconsistent */
+ return error; /* GROT: dir is inconsistent */
/*
* Record the newly split block for the next time thru?
*/
@@ -439,7 +439,7 @@ xfs_da3_split(
xfs_da3_fixhashpath(state, &state->path);
}
if (!addblk)
- return(0);
+ return 0;
/*
* Split the root node.
@@ -449,7 +449,7 @@ xfs_da3_split(
error = xfs_da3_root_split(state, oldblk, addblk);
if (error) {
addblk->bp = NULL;
- return(error); /* GROT: dir is inconsistent */
+ return error; /* GROT: dir is inconsistent */
}
/*
@@ -492,7 +492,7 @@ xfs_da3_split(
sizeof(node->hdr.info)));
}
addblk->bp = NULL;
- return(0);
+ return 0;
}
/*
@@ -670,18 +670,18 @@ xfs_da3_node_split(
*/
error = xfs_da_grow_inode(state->args, &blkno);
if (error)
- return(error); /* GROT: dir is inconsistent */
+ return error; /* GROT: dir is inconsistent */
error = xfs_da3_node_create(state->args, blkno, treelevel,
&newblk->bp, state->args->whichfork);
if (error)
- return(error); /* GROT: dir is inconsistent */
+ return error; /* GROT: dir is inconsistent */
newblk->blkno = blkno;
newblk->magic = XFS_DA_NODE_MAGIC;
xfs_da3_node_rebalance(state, oldblk, newblk);
error = xfs_da3_blk_link(state, oldblk, newblk);
if (error)
- return(error);
+ return error;
*result = 1;
} else {
*result = 0;
@@ -721,7 +721,7 @@ xfs_da3_node_split(
}
}
- return(0);
+ return 0;
}
/*
@@ -963,9 +963,9 @@ xfs_da3_join(
case XFS_ATTR_LEAF_MAGIC:
error = xfs_attr3_leaf_toosmall(state, &action);
if (error)
- return(error);
+ return error;
if (action == 0)
- return(0);
+ return 0;
xfs_attr3_leaf_unbalance(state, drop_blk, save_blk);
break;
case XFS_DIR2_LEAFN_MAGIC:
@@ -985,7 +985,7 @@ xfs_da3_join(
xfs_da3_fixhashpath(state, &state->path);
error = xfs_da3_node_toosmall(state, &action);
if (error)
- return(error);
+ return error;
if (action == 0)
return 0;
xfs_da3_node_unbalance(state, drop_blk, save_blk);
@@ -995,12 +995,12 @@ xfs_da3_join(
error = xfs_da3_blk_unlink(state, drop_blk, save_blk);
xfs_da_state_kill_altpath(state);
if (error)
- return(error);
+ return error;
error = xfs_da_shrink_inode(state->args, drop_blk->blkno,
drop_blk->bp);
drop_blk->bp = NULL;
if (error)
- return(error);
+ return error;
}
/*
* We joined all the way to the top. If it turns out that
@@ -1010,7 +1010,7 @@ xfs_da3_join(
xfs_da3_node_remove(state, drop_blk);
xfs_da3_fixhashpath(state, &state->path);
error = xfs_da3_root_join(state, &state->path.blk[0]);
- return(error);
+ return error;
}
#ifdef DEBUG
@@ -1099,7 +1099,7 @@ xfs_da3_root_join(
xfs_trans_log_buf(args->trans, root_blk->bp, 0,
args->geo->blksize - 1);
error = xfs_da_shrink_inode(args, child, bp);
- return(error);
+ return error;
}
/*
@@ -1142,7 +1142,7 @@ xfs_da3_node_toosmall(
dp->d_ops->node_hdr_from_disk(&nodehdr, node);
if (nodehdr.count > (state->args->geo->node_ents >> 1)) {
*action = 0; /* blk over 50%, don't try to join */
- return(0); /* blk over 50%, don't try to join */
+ return 0; /* blk over 50%, don't try to join */
}
/*
@@ -1161,13 +1161,13 @@ xfs_da3_node_toosmall(
error = xfs_da3_path_shift(state, &state->altpath, forward,
0, &retval);
if (error)
- return(error);
+ return error;
if (retval) {
*action = 0;
} else {
*action = 2;
}
- return(0);
+ return 0;
}
/*
@@ -1194,7 +1194,7 @@ xfs_da3_node_toosmall(
error = xfs_da3_node_read(state->args->trans, dp,
blkno, -1, &bp, state->args->whichfork);
if (error)
- return(error);
+ return error;
node = bp->b_addr;
dp->d_ops->node_hdr_from_disk(&thdr, node);
@@ -1486,7 +1486,7 @@ xfs_da3_node_lookup_int(
if (error) {
blk->blkno = 0;
state->path.active--;
- return(error);
+ return error;
}
curr = blk->bp->b_addr;
blk->magic = be16_to_cpu(curr->magic);
@@ -1579,25 +1579,25 @@ xfs_da3_node_lookup_int(
args->blkno = blk->blkno;
} else {
ASSERT(0);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
- if (((retval == ENOENT) || (retval == ENOATTR)) &&
+ if (((retval == -ENOENT) || (retval == -ENOATTR)) &&
(blk->hashval == args->hashval)) {
error = xfs_da3_path_shift(state, &state->path, 1, 1,
&retval);
if (error)
- return(error);
+ return error;
if (retval == 0) {
continue;
} else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
/* path_shift() gives ENOENT */
- retval = XFS_ERROR(ENOATTR);
+ retval = -ENOATTR;
}
}
break;
}
*result = retval;
- return(0);
+ return 0;
}
/*========================================================================
@@ -1692,7 +1692,7 @@ xfs_da3_blk_link(
be32_to_cpu(old_info->back),
-1, &bp, args->whichfork);
if (error)
- return(error);
+ return error;
ASSERT(bp != NULL);
tmp_info = bp->b_addr;
ASSERT(tmp_info->magic == old_info->magic);
@@ -1713,7 +1713,7 @@ xfs_da3_blk_link(
be32_to_cpu(old_info->forw),
-1, &bp, args->whichfork);
if (error)
- return(error);
+ return error;
ASSERT(bp != NULL);
tmp_info = bp->b_addr;
ASSERT(tmp_info->magic == old_info->magic);
@@ -1726,7 +1726,7 @@ xfs_da3_blk_link(
xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1);
xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1);
- return(0);
+ return 0;
}
/*
@@ -1772,7 +1772,7 @@ xfs_da3_blk_unlink(
be32_to_cpu(drop_info->back),
-1, &bp, args->whichfork);
if (error)
- return(error);
+ return error;
ASSERT(bp != NULL);
tmp_info = bp->b_addr;
ASSERT(tmp_info->magic == save_info->magic);
@@ -1789,7 +1789,7 @@ xfs_da3_blk_unlink(
be32_to_cpu(drop_info->forw),
-1, &bp, args->whichfork);
if (error)
- return(error);
+ return error;
ASSERT(bp != NULL);
tmp_info = bp->b_addr;
ASSERT(tmp_info->magic == save_info->magic);
@@ -1801,7 +1801,7 @@ xfs_da3_blk_unlink(
}
xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
- return(0);
+ return 0;
}
/*
@@ -1859,9 +1859,9 @@ xfs_da3_path_shift(
}
}
if (level < 0) {
- *result = XFS_ERROR(ENOENT); /* we're out of our tree */
+ *result = -ENOENT; /* we're out of our tree */
ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
- return(0);
+ return 0;
}
/*
@@ -1883,7 +1883,7 @@ xfs_da3_path_shift(
error = xfs_da3_node_read(args->trans, dp, blkno, -1,
&blk->bp, args->whichfork);
if (error)
- return(error);
+ return error;
info = blk->bp->b_addr;
ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
@@ -2004,7 +2004,7 @@ xfs_da_grow_inode_int(
struct xfs_trans *tp = args->trans;
struct xfs_inode *dp = args->dp;
int w = args->whichfork;
- xfs_drfsbno_t nblks = dp->i_d.di_nblocks;
+ xfs_rfsblock_t nblks = dp->i_d.di_nblocks;
struct xfs_bmbt_irec map, *mapp;
int nmap, error, got, i, mapi;
@@ -2068,7 +2068,7 @@ xfs_da_grow_inode_int(
if (got != count || mapp[0].br_startoff != *bno ||
mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
*bno + count) {
- error = XFS_ERROR(ENOSPC);
+ error = -ENOSPC;
goto out_free_map;
}
@@ -2158,7 +2158,7 @@ xfs_da3_swap_lastblock(
if (unlikely(lastoff == 0)) {
XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW,
mp);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
/*
* Read the last block in the btree space.
@@ -2209,7 +2209,7 @@ xfs_da3_swap_lastblock(
sib_info->magic != dead_info->magic)) {
XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)",
XFS_ERRLEVEL_LOW, mp);
- error = XFS_ERROR(EFSCORRUPTED);
+ error = -EFSCORRUPTED;
goto done;
}
sib_info->forw = cpu_to_be32(dead_blkno);
@@ -2231,7 +2231,7 @@ xfs_da3_swap_lastblock(
sib_info->magic != dead_info->magic)) {
XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)",
XFS_ERRLEVEL_LOW, mp);
- error = XFS_ERROR(EFSCORRUPTED);
+ error = -EFSCORRUPTED;
goto done;
}
sib_info->back = cpu_to_be32(dead_blkno);
@@ -2254,7 +2254,7 @@ xfs_da3_swap_lastblock(
if (level >= 0 && level != par_hdr.level + 1) {
XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
XFS_ERRLEVEL_LOW, mp);
- error = XFS_ERROR(EFSCORRUPTED);
+ error = -EFSCORRUPTED;
goto done;
}
level = par_hdr.level;
@@ -2267,7 +2267,7 @@ xfs_da3_swap_lastblock(
if (entno == par_hdr.count) {
XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)",
XFS_ERRLEVEL_LOW, mp);
- error = XFS_ERROR(EFSCORRUPTED);
+ error = -EFSCORRUPTED;
goto done;
}
par_blkno = be32_to_cpu(btree[entno].before);
@@ -2294,7 +2294,7 @@ xfs_da3_swap_lastblock(
if (unlikely(par_blkno == 0)) {
XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)",
XFS_ERRLEVEL_LOW, mp);
- error = XFS_ERROR(EFSCORRUPTED);
+ error = -EFSCORRUPTED;
goto done;
}
error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
@@ -2305,7 +2305,7 @@ xfs_da3_swap_lastblock(
if (par_hdr.level != level) {
XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
XFS_ERRLEVEL_LOW, mp);
- error = XFS_ERROR(EFSCORRUPTED);
+ error = -EFSCORRUPTED;
goto done;
}
btree = dp->d_ops->node_tree_p(par_node);
@@ -2359,7 +2359,7 @@ xfs_da_shrink_inode(
error = xfs_bunmapi(tp, dp, dead_blkno, count,
xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
0, args->firstblock, args->flist, &done);
- if (error == ENOSPC) {
+ if (error == -ENOSPC) {
if (w != XFS_DATA_FORK)
break;
error = xfs_da3_swap_lastblock(args, &dead_blkno,
@@ -2427,7 +2427,7 @@ xfs_buf_map_from_irec(
map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
KM_SLEEP | KM_NOFS);
if (!map)
- return ENOMEM;
+ return -ENOMEM;
*mapp = map;
}
@@ -2500,8 +2500,8 @@ xfs_dabuf_map(
}
if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) {
- error = mappedbno == -2 ? -1 : XFS_ERROR(EFSCORRUPTED);
- if (unlikely(error == EFSCORRUPTED)) {
+ error = mappedbno == -2 ? -1 : -EFSCORRUPTED;
+ if (unlikely(error == -EFSCORRUPTED)) {
if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
int i;
xfs_alert(mp, "%s: bno %lld dir: inode %lld",
@@ -2561,9 +2561,10 @@ xfs_da_get_buf(
bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp,
mapp, nmap, 0);
- error = bp ? bp->b_error : XFS_ERROR(EIO);
+ error = bp ? bp->b_error : -EIO;
if (error) {
- xfs_trans_brelse(trans, bp);
+ if (bp)
+ xfs_trans_brelse(trans, bp);
goto out_free;
}
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index 6e153e399a77..6e153e399a77 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
diff --git a/fs/xfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
index c9aee52a37e2..7e42fdfd2f1d 100644
--- a/fs/xfs/xfs_da_format.c
+++ b/fs/xfs/libxfs/xfs_da_format.c
@@ -270,7 +270,6 @@ xfs_dir3_data_get_ftype(
{
__uint8_t ftype = dep->name[dep->namelen];
- ASSERT(ftype < XFS_DIR3_FT_MAX);
if (ftype >= XFS_DIR3_FT_MAX)
return XFS_DIR3_FT_UNKNOWN;
return ftype;
diff --git a/fs/xfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 0a49b0286372..0a49b0286372 100644
--- a/fs/xfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/libxfs/xfs_dinode.h
index 623bbe8fd921..623bbe8fd921 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/libxfs/xfs_dinode.h
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 79670cda48ae..7075aaf131f4 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -108,7 +108,7 @@ xfs_da_mount(
if (!mp->m_dir_geo || !mp->m_attr_geo) {
kmem_free(mp->m_dir_geo);
kmem_free(mp->m_attr_geo);
- return ENOMEM;
+ return -ENOMEM;
}
/* set up directory geometry */
@@ -202,7 +202,7 @@ xfs_dir_ino_validate(
xfs_warn(mp, "Invalid inode number 0x%Lx",
(unsigned long long) ino);
XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
return 0;
}
@@ -226,7 +226,7 @@ xfs_dir_init(
args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
if (!args)
- return ENOMEM;
+ return -ENOMEM;
args->geo = dp->i_mount->m_dir_geo;
args->dp = dp;
@@ -237,7 +237,8 @@ xfs_dir_init(
}
/*
- Enter a name in a directory.
+ * Enter a name in a directory, or check for available space.
+ * If inum is 0, only the available space test is performed.
*/
int
xfs_dir_createname(
@@ -254,14 +255,16 @@ xfs_dir_createname(
int v; /* type-checking value */
ASSERT(S_ISDIR(dp->i_d.di_mode));
- rval = xfs_dir_ino_validate(tp->t_mountp, inum);
- if (rval)
- return rval;
- XFS_STATS_INC(xs_dir_create);
+ if (inum) {
+ rval = xfs_dir_ino_validate(tp->t_mountp, inum);
+ if (rval)
+ return rval;
+ XFS_STATS_INC(xs_dir_create);
+ }
args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
if (!args)
- return ENOMEM;
+ return -ENOMEM;
args->geo = dp->i_mount->m_dir_geo;
args->name = name->name;
@@ -276,6 +279,8 @@ xfs_dir_createname(
args->whichfork = XFS_DATA_FORK;
args->trans = tp;
args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+ if (!inum)
+ args->op_flags |= XFS_DA_OP_JUSTCHECK;
if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
rval = xfs_dir2_sf_addname(args);
@@ -314,18 +319,18 @@ xfs_dir_cilookup_result(
int len)
{
if (args->cmpresult == XFS_CMP_DIFFERENT)
- return ENOENT;
+ return -ENOENT;
if (args->cmpresult != XFS_CMP_CASE ||
!(args->op_flags & XFS_DA_OP_CILOOKUP))
- return EEXIST;
+ return -EEXIST;
args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL);
if (!args->value)
- return ENOMEM;
+ return -ENOMEM;
memcpy(args->value, name, len);
args->valuelen = len;
- return EEXIST;
+ return -EEXIST;
}
/*
@@ -392,7 +397,7 @@ xfs_dir_lookup(
rval = xfs_dir2_node_lookup(args);
out_check_rval:
- if (rval == EEXIST)
+ if (rval == -EEXIST)
rval = 0;
if (!rval) {
*inum = args->inumber;
@@ -428,7 +433,7 @@ xfs_dir_removename(
args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
if (!args)
- return ENOMEM;
+ return -ENOMEM;
args->geo = dp->i_mount->m_dir_geo;
args->name = name->name;
@@ -493,7 +498,7 @@ xfs_dir_replace(
args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
if (!args)
- return ENOMEM;
+ return -ENOMEM;
args->geo = dp->i_mount->m_dir_geo;
args->name = name->name;
@@ -535,62 +540,14 @@ out_free:
/*
* See if this entry can be added to the directory without allocating space.
- * First checks that the caller couldn't reserve enough space (resblks = 0).
*/
int
xfs_dir_canenter(
xfs_trans_t *tp,
xfs_inode_t *dp,
- struct xfs_name *name, /* name of entry to add */
- uint resblks)
+ struct xfs_name *name) /* name of entry to add */
{
- struct xfs_da_args *args;
- int rval;
- int v; /* type-checking value */
-
- if (resblks)
- return 0;
-
- ASSERT(S_ISDIR(dp->i_d.di_mode));
-
- args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
- if (!args)
- return ENOMEM;
-
- args->geo = dp->i_mount->m_dir_geo;
- args->name = name->name;
- args->namelen = name->len;
- args->filetype = name->type;
- args->hashval = dp->i_mount->m_dirnameops->hashname(name);
- args->dp = dp;
- args->whichfork = XFS_DATA_FORK;
- args->trans = tp;
- args->op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME |
- XFS_DA_OP_OKNOENT;
-
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
- rval = xfs_dir2_sf_addname(args);
- goto out_free;
- }
-
- rval = xfs_dir2_isblock(args, &v);
- if (rval)
- goto out_free;
- if (v) {
- rval = xfs_dir2_block_addname(args);
- goto out_free;
- }
-
- rval = xfs_dir2_isleaf(args, &v);
- if (rval)
- goto out_free;
- if (v)
- rval = xfs_dir2_leaf_addname(args);
- else
- rval = xfs_dir2_node_addname(args);
-out_free:
- kmem_free(args);
- return rval;
+ return xfs_dir_createname(tp, dp, name, 0, NULL, NULL, 0);
}
/*
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index c8e86b0b5e99..4dff261e6ed5 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -136,7 +136,7 @@ extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
xfs_fsblock_t *first,
struct xfs_bmap_free *flist, xfs_extlen_t tot);
extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
- struct xfs_name *name, uint resblks);
+ struct xfs_name *name);
/*
* Direct call from the bmap code, bypassing the generic directory layer.
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index c7cd3154026a..9628ceccfa02 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -91,9 +91,9 @@ xfs_dir3_block_read_verify(
if (xfs_sb_version_hascrc(&mp->m_sb) &&
!xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
- xfs_buf_ioerror(bp, EFSBADCRC);
+ xfs_buf_ioerror(bp, -EFSBADCRC);
else if (!xfs_dir3_block_verify(bp))
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
if (bp->b_error)
xfs_verifier_error(bp);
@@ -108,7 +108,7 @@ xfs_dir3_block_write_verify(
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
if (!xfs_dir3_block_verify(bp)) {
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
xfs_verifier_error(bp);
return;
}
@@ -392,7 +392,7 @@ xfs_dir2_block_addname(
if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
xfs_trans_brelse(tp, bp);
if (!dup)
- return XFS_ERROR(ENOSPC);
+ return -ENOSPC;
return 0;
}
@@ -402,7 +402,7 @@ xfs_dir2_block_addname(
if (!dup) {
/* Don't have a space reservation: return no-space. */
if (args->total == 0)
- return XFS_ERROR(ENOSPC);
+ return -ENOSPC;
/*
* Convert to the next larger format.
* Then add the new entry in that format.
@@ -647,7 +647,7 @@ xfs_dir2_block_lookup(
args->filetype = dp->d_ops->data_get_ftype(dep);
error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
xfs_trans_brelse(args->trans, bp);
- return XFS_ERROR(error);
+ return error;
}
/*
@@ -703,7 +703,7 @@ xfs_dir2_block_lookup_int(
if (low > high) {
ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
xfs_trans_brelse(tp, bp);
- return XFS_ERROR(ENOENT);
+ return -ENOENT;
}
}
/*
@@ -751,7 +751,7 @@ xfs_dir2_block_lookup_int(
* No match, release the buffer and return ENOENT.
*/
xfs_trans_brelse(tp, bp);
- return XFS_ERROR(ENOENT);
+ return -ENOENT;
}
/*
@@ -1091,7 +1091,7 @@ xfs_dir2_sf_to_block(
*/
if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
ASSERT(XFS_FORCED_SHUTDOWN(mp));
- return XFS_ERROR(EIO);
+ return -EIO;
}
oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 8c2f6422648e..fdd803fecb8e 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -100,7 +100,7 @@ __xfs_dir3_data_check(
break;
default:
XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
- return EFSCORRUPTED;
+ return -EFSCORRUPTED;
}
/*
@@ -256,7 +256,7 @@ xfs_dir3_data_reada_verify(
xfs_dir3_data_verify(bp);
return;
default:
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
xfs_verifier_error(bp);
break;
}
@@ -270,9 +270,9 @@ xfs_dir3_data_read_verify(
if (xfs_sb_version_hascrc(&mp->m_sb) &&
!xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
- xfs_buf_ioerror(bp, EFSBADCRC);
+ xfs_buf_ioerror(bp, -EFSBADCRC);
else if (!xfs_dir3_data_verify(bp))
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
if (bp->b_error)
xfs_verifier_error(bp);
@@ -287,7 +287,7 @@ xfs_dir3_data_write_verify(
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
if (!xfs_dir3_data_verify(bp)) {
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
xfs_verifier_error(bp);
return;
}
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index fb0aad4440c1..a19174eb3cb2 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -183,9 +183,9 @@ __read_verify(
if (xfs_sb_version_hascrc(&mp->m_sb) &&
!xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
- xfs_buf_ioerror(bp, EFSBADCRC);
+ xfs_buf_ioerror(bp, -EFSBADCRC);
else if (!xfs_dir3_leaf_verify(bp, magic))
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
if (bp->b_error)
xfs_verifier_error(bp);
@@ -201,7 +201,7 @@ __write_verify(
struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
if (!xfs_dir3_leaf_verify(bp, magic)) {
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
xfs_verifier_error(bp);
return;
}
@@ -731,7 +731,7 @@ xfs_dir2_leaf_addname(
if ((args->op_flags & XFS_DA_OP_JUSTCHECK) ||
args->total == 0) {
xfs_trans_brelse(tp, lbp);
- return XFS_ERROR(ENOSPC);
+ return -ENOSPC;
}
/*
* Convert to node form.
@@ -755,7 +755,7 @@ xfs_dir2_leaf_addname(
*/
if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
xfs_trans_brelse(tp, lbp);
- return use_block == -1 ? XFS_ERROR(ENOSPC) : 0;
+ return use_block == -1 ? -ENOSPC : 0;
}
/*
* If no allocations are allowed, return now before we've
@@ -763,7 +763,7 @@ xfs_dir2_leaf_addname(
*/
if (args->total == 0 && use_block == -1) {
xfs_trans_brelse(tp, lbp);
- return XFS_ERROR(ENOSPC);
+ return -ENOSPC;
}
/*
* Need to compact the leaf entries, removing stale ones.
@@ -1198,7 +1198,7 @@ xfs_dir2_leaf_lookup(
error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
xfs_trans_brelse(tp, dbp);
xfs_trans_brelse(tp, lbp);
- return XFS_ERROR(error);
+ return error;
}
/*
@@ -1327,13 +1327,13 @@ xfs_dir2_leaf_lookup_int(
return 0;
}
/*
- * No match found, return ENOENT.
+ * No match found, return -ENOENT.
*/
ASSERT(cidb == -1);
if (dbp)
xfs_trans_brelse(tp, dbp);
xfs_trans_brelse(tp, lbp);
- return XFS_ERROR(ENOENT);
+ return -ENOENT;
}
/*
@@ -1440,7 +1440,7 @@ xfs_dir2_leaf_removename(
* Just go on, returning success, leaving the
* empty block in place.
*/
- if (error == ENOSPC && args->total == 0)
+ if (error == -ENOSPC && args->total == 0)
error = 0;
xfs_dir3_leaf_check(dp, lbp);
return error;
@@ -1641,7 +1641,7 @@ xfs_dir2_leaf_trim_data(
* Get rid of the data block.
*/
if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
- ASSERT(error != ENOSPC);
+ ASSERT(error != -ENOSPC);
xfs_trans_brelse(tp, dbp);
return error;
}
@@ -1815,7 +1815,7 @@ xfs_dir2_node_to_leaf(
* punching out the middle of an extent, and this is an
* isolated block.
*/
- ASSERT(error != ENOSPC);
+ ASSERT(error != -ENOSPC);
return error;
}
fbp = NULL;
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index da43d304fca2..2ae6ac2c11ae 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -117,9 +117,9 @@ xfs_dir3_free_read_verify(
if (xfs_sb_version_hascrc(&mp->m_sb) &&
!xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF))
- xfs_buf_ioerror(bp, EFSBADCRC);
+ xfs_buf_ioerror(bp, -EFSBADCRC);
else if (!xfs_dir3_free_verify(bp))
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
if (bp->b_error)
xfs_verifier_error(bp);
@@ -134,7 +134,7 @@ xfs_dir3_free_write_verify(
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
if (!xfs_dir3_free_verify(bp)) {
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
xfs_verifier_error(bp);
return;
}
@@ -406,7 +406,7 @@ xfs_dir2_leafn_add(
* into other peoples memory
*/
if (index < 0)
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
/*
* If there are already the maximum number of leaf entries in
@@ -417,7 +417,7 @@ xfs_dir2_leafn_add(
if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) {
if (!leafhdr.stale)
- return XFS_ERROR(ENOSPC);
+ return -ENOSPC;
compact = leafhdr.stale > 1;
} else
compact = 0;
@@ -629,7 +629,7 @@ xfs_dir2_leafn_lookup_for_addname(
XFS_ERRLEVEL_LOW, mp);
if (curfdb != newfdb)
xfs_trans_brelse(tp, curbp);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
curfdb = newfdb;
if (be16_to_cpu(bests[fi]) >= length)
@@ -660,7 +660,7 @@ out:
* Return the index, that will be the insertion point.
*/
*indexp = index;
- return XFS_ERROR(ENOENT);
+ return -ENOENT;
}
/*
@@ -789,7 +789,7 @@ xfs_dir2_leafn_lookup_for_entry(
curbp->b_ops = &xfs_dir3_data_buf_ops;
xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
if (cmp == XFS_CMP_EXACT)
- return XFS_ERROR(EEXIST);
+ return -EEXIST;
}
}
ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT));
@@ -812,7 +812,7 @@ xfs_dir2_leafn_lookup_for_entry(
state->extravalid = 0;
}
*indexp = index;
- return XFS_ERROR(ENOENT);
+ return -ENOENT;
}
/*
@@ -1133,7 +1133,7 @@ xfs_dir3_data_block_free(
if (error == 0) {
fbp = NULL;
logfree = 0;
- } else if (error != ENOSPC || args->total != 0)
+ } else if (error != -ENOSPC || args->total != 0)
return error;
/*
* It's possible to get ENOSPC if there is no
@@ -1287,7 +1287,7 @@ xfs_dir2_leafn_remove(
* In this case just drop the buffer and some one else
* will eventually get rid of the empty block.
*/
- else if (!(error == ENOSPC && args->total == 0))
+ else if (!(error == -ENOSPC && args->total == 0))
return error;
}
/*
@@ -1599,7 +1599,7 @@ xfs_dir2_node_addname(
error = xfs_da3_node_lookup_int(state, &rval);
if (error)
rval = error;
- if (rval != ENOENT) {
+ if (rval != -ENOENT) {
goto done;
}
/*
@@ -1628,7 +1628,7 @@ xfs_dir2_node_addname(
* It didn't work, we need to split the leaf block.
*/
if (args->total == 0) {
- ASSERT(rval == ENOSPC);
+ ASSERT(rval == -ENOSPC);
goto done;
}
/*
@@ -1815,7 +1815,7 @@ xfs_dir2_node_addname_int(
* Not allowed to allocate, return failure.
*/
if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
- return XFS_ERROR(ENOSPC);
+ return -ENOSPC;
/*
* Allocate and initialize the new data block.
@@ -1876,7 +1876,7 @@ xfs_dir2_node_addname_int(
}
XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
XFS_ERRLEVEL_LOW, mp);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
/*
@@ -2042,8 +2042,8 @@ xfs_dir2_node_lookup(
error = xfs_da3_node_lookup_int(state, &rval);
if (error)
rval = error;
- else if (rval == ENOENT && args->cmpresult == XFS_CMP_CASE) {
- /* If a CI match, dup the actual name and return EEXIST */
+ else if (rval == -ENOENT && args->cmpresult == XFS_CMP_CASE) {
+ /* If a CI match, dup the actual name and return -EEXIST */
xfs_dir2_data_entry_t *dep;
dep = (xfs_dir2_data_entry_t *)
@@ -2096,7 +2096,7 @@ xfs_dir2_node_removename(
goto out_free;
/* Didn't find it, upper layer screwed up. */
- if (rval != EEXIST) {
+ if (rval != -EEXIST) {
error = rval;
goto out_free;
}
@@ -2169,7 +2169,7 @@ xfs_dir2_node_replace(
* It should be found, since the vnodeops layer has looked it up
* and locked it. But paranoia is good.
*/
- if (rval == EEXIST) {
+ if (rval == -EEXIST) {
struct xfs_dir2_leaf_entry *ents;
/*
* Find the leaf entry.
@@ -2272,7 +2272,7 @@ xfs_dir2_node_trim_free(
* space reservation, when breaking up an extent into two
* pieces. This is the last block of an extent.
*/
- ASSERT(error != ENOSPC);
+ ASSERT(error != -ENOSPC);
xfs_trans_brelse(tp, bp);
return error;
}
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index 27ce0794d196..27ce0794d196 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 53c3be619db5..5079e051ef08 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -51,10 +51,9 @@ static void xfs_dir2_sf_check(xfs_da_args_t *args);
#else
#define xfs_dir2_sf_check(args)
#endif /* DEBUG */
-#if XFS_BIG_INUMS
+
static void xfs_dir2_sf_toino4(xfs_da_args_t *args);
static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
-#endif /* XFS_BIG_INUMS */
/*
* Given a block directory (dp/block), calculate its size as a shortform (sf)
@@ -117,10 +116,10 @@ xfs_dir2_block_sfsize(
isdotdot =
dep->namelen == 2 &&
dep->name[0] == '.' && dep->name[1] == '.';
-#if XFS_BIG_INUMS
+
if (!isdot)
i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM;
-#endif
+
/* take into account the file type field */
if (!isdot && !isdotdot) {
count++;
@@ -251,7 +250,7 @@ xfs_dir2_block_to_sf(
logflags = XFS_ILOG_CORE;
error = xfs_dir2_shrink_inode(args, args->geo->datablk, bp);
if (error) {
- ASSERT(error != ENOSPC);
+ ASSERT(error != -ENOSPC);
goto out;
}
@@ -299,7 +298,7 @@ xfs_dir2_sf_addname(
trace_xfs_dir2_sf_addname(args);
- ASSERT(xfs_dir2_sf_lookup(args) == ENOENT);
+ ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT);
dp = args->dp;
ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
/*
@@ -307,7 +306,7 @@ xfs_dir2_sf_addname(
*/
if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
- return XFS_ERROR(EIO);
+ return -EIO;
}
ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
ASSERT(dp->i_df.if_u1.if_data != NULL);
@@ -318,7 +317,7 @@ xfs_dir2_sf_addname(
*/
incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen);
objchange = 0;
-#if XFS_BIG_INUMS
+
/*
* Do we have to change to 8 byte inodes?
*/
@@ -332,7 +331,7 @@ xfs_dir2_sf_addname(
(uint)sizeof(xfs_dir2_ino4_t));
objchange = 1;
}
-#endif
+
new_isize = (int)dp->i_d.di_size + incr_isize;
/*
* Won't fit as shortform any more (due to size),
@@ -345,7 +344,7 @@ xfs_dir2_sf_addname(
* Just checking or no space reservation, it doesn't fit.
*/
if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
- return XFS_ERROR(ENOSPC);
+ return -ENOSPC;
/*
* Convert to block form then add the name.
*/
@@ -370,10 +369,8 @@ xfs_dir2_sf_addname(
*/
else {
ASSERT(pick == 2);
-#if XFS_BIG_INUMS
if (objchange)
xfs_dir2_sf_toino8(args);
-#endif
xfs_dir2_sf_addname_hard(args, objchange, new_isize);
}
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
@@ -425,10 +422,8 @@ xfs_dir2_sf_addname_easy(
* Update the header and inode.
*/
sfp->count++;
-#if XFS_BIG_INUMS
if (args->inumber > XFS_DIR2_MAX_SHORT_INUM)
sfp->i8count++;
-#endif
dp->i_d.di_size = new_isize;
xfs_dir2_sf_check(args);
}
@@ -516,10 +511,8 @@ xfs_dir2_sf_addname_hard(
dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
dp->d_ops->sf_put_ftype(sfep, args->filetype);
sfp->count++;
-#if XFS_BIG_INUMS
if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
sfp->i8count++;
-#endif
/*
* If there's more left to copy, do that.
*/
@@ -593,13 +586,8 @@ xfs_dir2_sf_addname_pick(
/*
* If changing the inode number size, do it the hard way.
*/
-#if XFS_BIG_INUMS
- if (objchange) {
+ if (objchange)
return 2;
- }
-#else
- ASSERT(objchange == 0);
-#endif
/*
* If it won't fit at the end then do it the hard way (use the hole).
*/
@@ -650,7 +638,6 @@ xfs_dir2_sf_check(
ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX);
}
ASSERT(i8count == sfp->i8count);
- ASSERT(XFS_BIG_INUMS || i8count == 0);
ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
ASSERT(offset +
(sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
@@ -738,7 +725,7 @@ xfs_dir2_sf_lookup(
*/
if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
- return XFS_ERROR(EIO);
+ return -EIO;
}
ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
ASSERT(dp->i_df.if_u1.if_data != NULL);
@@ -751,7 +738,7 @@ xfs_dir2_sf_lookup(
args->inumber = dp->i_ino;
args->cmpresult = XFS_CMP_EXACT;
args->filetype = XFS_DIR3_FT_DIR;
- return XFS_ERROR(EEXIST);
+ return -EEXIST;
}
/*
* Special case for ..
@@ -761,7 +748,7 @@ xfs_dir2_sf_lookup(
args->inumber = dp->d_ops->sf_get_parent_ino(sfp);
args->cmpresult = XFS_CMP_EXACT;
args->filetype = XFS_DIR3_FT_DIR;
- return XFS_ERROR(EEXIST);
+ return -EEXIST;
}
/*
* Loop over all the entries trying to match ours.
@@ -781,20 +768,20 @@ xfs_dir2_sf_lookup(
args->inumber = dp->d_ops->sf_get_ino(sfp, sfep);
args->filetype = dp->d_ops->sf_get_ftype(sfep);
if (cmp == XFS_CMP_EXACT)
- return XFS_ERROR(EEXIST);
+ return -EEXIST;
ci_sfep = sfep;
}
}
ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
/*
* Here, we can only be doing a lookup (not a rename or replace).
- * If a case-insensitive match was not found, return ENOENT.
+ * If a case-insensitive match was not found, return -ENOENT.
*/
if (!ci_sfep)
- return XFS_ERROR(ENOENT);
+ return -ENOENT;
/* otherwise process the CI match as required by the caller */
error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen);
- return XFS_ERROR(error);
+ return error;
}
/*
@@ -824,7 +811,7 @@ xfs_dir2_sf_removename(
*/
if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) {
ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
- return XFS_ERROR(EIO);
+ return -EIO;
}
ASSERT(dp->i_df.if_bytes == oldsize);
ASSERT(dp->i_df.if_u1.if_data != NULL);
@@ -847,7 +834,7 @@ xfs_dir2_sf_removename(
* Didn't find it.
*/
if (i == sfp->count)
- return XFS_ERROR(ENOENT);
+ return -ENOENT;
/*
* Calculate sizes.
*/
@@ -870,7 +857,6 @@ xfs_dir2_sf_removename(
*/
xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK);
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-#if XFS_BIG_INUMS
/*
* Are we changing inode number size?
*/
@@ -880,7 +866,6 @@ xfs_dir2_sf_removename(
else
sfp->i8count--;
}
-#endif
xfs_dir2_sf_check(args);
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
return 0;
@@ -895,12 +880,8 @@ xfs_dir2_sf_replace(
{
xfs_inode_t *dp; /* incore directory inode */
int i; /* entry index */
-#if XFS_BIG_INUMS || defined(DEBUG)
xfs_ino_t ino=0; /* entry old inode number */
-#endif
-#if XFS_BIG_INUMS
int i8elevated; /* sf_toino8 set i8count=1 */
-#endif
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
@@ -914,13 +895,13 @@ xfs_dir2_sf_replace(
*/
if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
- return XFS_ERROR(EIO);
+ return -EIO;
}
ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
ASSERT(dp->i_df.if_u1.if_data != NULL);
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
-#if XFS_BIG_INUMS
+
/*
* New inode number is large, and need to convert to 8-byte inodes.
*/
@@ -951,17 +932,15 @@ xfs_dir2_sf_replace(
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
} else
i8elevated = 0;
-#endif
+
ASSERT(args->namelen != 1 || args->name[0] != '.');
/*
* Replace ..'s entry.
*/
if (args->namelen == 2 &&
args->name[0] == '.' && args->name[1] == '.') {
-#if XFS_BIG_INUMS || defined(DEBUG)
ino = dp->d_ops->sf_get_parent_ino(sfp);
ASSERT(args->inumber != ino);
-#endif
dp->d_ops->sf_put_parent_ino(sfp, args->inumber);
}
/*
@@ -972,10 +951,8 @@ xfs_dir2_sf_replace(
i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
XFS_CMP_EXACT) {
-#if XFS_BIG_INUMS || defined(DEBUG)
ino = dp->d_ops->sf_get_ino(sfp, sfep);
ASSERT(args->inumber != ino);
-#endif
dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
dp->d_ops->sf_put_ftype(sfep, args->filetype);
break;
@@ -986,14 +963,11 @@ xfs_dir2_sf_replace(
*/
if (i == sfp->count) {
ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
-#if XFS_BIG_INUMS
if (i8elevated)
xfs_dir2_sf_toino4(args);
-#endif
- return XFS_ERROR(ENOENT);
+ return -ENOENT;
}
}
-#if XFS_BIG_INUMS
/*
* See if the old number was large, the new number is small.
*/
@@ -1020,13 +994,11 @@ xfs_dir2_sf_replace(
if (!i8elevated)
sfp->i8count++;
}
-#endif
xfs_dir2_sf_check(args);
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
return 0;
}
-#if XFS_BIG_INUMS
/*
* Convert from 8-byte inode numbers to 4-byte inode numbers.
* The last 8-byte inode number is gone, but the count is still 1.
@@ -1181,4 +1153,3 @@ xfs_dir2_sf_toino8(
dp->i_d.di_size = newsize;
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
}
-#endif /* XFS_BIG_INUMS */
diff --git a/fs/xfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index c2ac0c611ad8..bb969337efc8 100644
--- a/fs/xfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -257,9 +257,9 @@ xfs_dquot_buf_read_verify(
struct xfs_mount *mp = bp->b_target->bt_mount;
if (!xfs_dquot_buf_verify_crc(mp, bp))
- xfs_buf_ioerror(bp, EFSBADCRC);
+ xfs_buf_ioerror(bp, -EFSBADCRC);
else if (!xfs_dquot_buf_verify(mp, bp))
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
if (bp->b_error)
xfs_verifier_error(bp);
@@ -277,7 +277,7 @@ xfs_dquot_buf_write_verify(
struct xfs_mount *mp = bp->b_target->bt_mount;
if (!xfs_dquot_buf_verify(mp, bp)) {
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
xfs_verifier_error(bp);
return;
}
diff --git a/fs/xfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 34d85aca3058..7e42bba9a420 100644
--- a/fs/xfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -68,11 +68,7 @@ struct xfs_ifork;
#define XFS_RTLOBIT(w) xfs_lowbit32(w)
#define XFS_RTHIBIT(w) xfs_highbit32(w)
-#if XFS_BIG_BLKNOS
#define XFS_RTBLOCKLOG(b) xfs_highbit64(b)
-#else
-#define XFS_RTBLOCKLOG(b) xfs_highbit32(b)
-#endif
/*
* Dquot and dquot block format definitions
@@ -304,23 +300,15 @@ typedef struct xfs_bmbt_rec_host {
* Values and macros for delayed-allocation startblock fields.
*/
#define STARTBLOCKVALBITS 17
-#define STARTBLOCKMASKBITS (15 + XFS_BIG_BLKNOS * 20)
-#define DSTARTBLOCKMASKBITS (15 + 20)
+#define STARTBLOCKMASKBITS (15 + 20)
#define STARTBLOCKMASK \
(((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
-#define DSTARTBLOCKMASK \
- (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
static inline int isnullstartblock(xfs_fsblock_t x)
{
return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK;
}
-static inline int isnulldstartblock(xfs_dfsbno_t x)
-{
- return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK;
-}
-
static inline xfs_fsblock_t nullstartblock(int k)
{
ASSERT(k < (1 << STARTBLOCKVALBITS));
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 5960e5593fe0..23dcb72fc5e6 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -292,7 +292,7 @@ xfs_ialloc_inode_init(
mp->m_bsize * blks_per_cluster,
XBF_UNMAPPED);
if (!fbuf)
- return ENOMEM;
+ return -ENOMEM;
/* Initialize the inode buffers and log them appropriately. */
fbuf->b_ops = &xfs_inode_buf_ops;
@@ -380,7 +380,7 @@ xfs_ialloc_ag_alloc(
newlen = args.mp->m_ialloc_inos;
if (args.mp->m_maxicount &&
args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
- return XFS_ERROR(ENOSPC);
+ return -ENOSPC;
args.minlen = args.maxlen = args.mp->m_ialloc_blks;
/*
* First try to allocate inodes contiguous with the last-allocated
@@ -1076,8 +1076,8 @@ xfs_dialloc_ag_finobt_newino(
int i;
if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
- error = xfs_inobt_lookup(cur, agi->agi_newino, XFS_LOOKUP_EQ,
- &i);
+ error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+ XFS_LOOKUP_EQ, &i);
if (error)
return error;
if (i == 1) {
@@ -1085,7 +1085,6 @@ xfs_dialloc_ag_finobt_newino(
if (error)
return error;
XFS_WANT_CORRUPTED_RETURN(i == 1);
-
return 0;
}
}
@@ -1385,7 +1384,7 @@ xfs_dialloc(
if (error) {
xfs_trans_brelse(tp, agbp);
- if (error != ENOSPC)
+ if (error != -ENOSPC)
goto out_error;
xfs_perag_put(pag);
@@ -1416,7 +1415,7 @@ nextag:
agno = 0;
if (agno == start_agno) {
*inop = NULLFSINO;
- return noroom ? ENOSPC : 0;
+ return noroom ? -ENOSPC : 0;
}
}
@@ -1425,7 +1424,7 @@ out_alloc:
return xfs_dialloc_ag(tp, agbp, parent, inop);
out_error:
xfs_perag_put(pag);
- return XFS_ERROR(error);
+ return error;
}
STATIC int
@@ -1682,7 +1681,7 @@ xfs_difree(
xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
__func__, agno, mp->m_sb.sb_agcount);
ASSERT(0);
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
agino = XFS_INO_TO_AGINO(mp, inode);
if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
@@ -1690,14 +1689,14 @@ xfs_difree(
__func__, (unsigned long long)inode,
(unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
ASSERT(0);
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
agbno = XFS_AGINO_TO_AGBNO(mp, agino);
if (agbno >= mp->m_sb.sb_agblocks) {
xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
__func__, agbno, mp->m_sb.sb_agblocks);
ASSERT(0);
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
/*
* Get the allocation group header.
@@ -1769,7 +1768,7 @@ xfs_imap_lookup(
if (i)
error = xfs_inobt_get_rec(cur, &rec, &i);
if (!error && i == 0)
- error = EINVAL;
+ error = -EINVAL;
}
xfs_trans_brelse(tp, agbp);
@@ -1780,12 +1779,12 @@ xfs_imap_lookup(
/* check that the returned record contains the required inode */
if (rec.ir_startino > agino ||
rec.ir_startino + mp->m_ialloc_inos <= agino)
- return EINVAL;
+ return -EINVAL;
/* for untrusted inodes check it is allocated first */
if ((flags & XFS_IGET_UNTRUSTED) &&
(rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
- return EINVAL;
+ return -EINVAL;
*chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
*offset_agbno = agbno - *chunk_agbno;
@@ -1829,7 +1828,7 @@ xfs_imap(
* as they can be invalid without implying corruption.
*/
if (flags & XFS_IGET_UNTRUSTED)
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
if (agno >= mp->m_sb.sb_agcount) {
xfs_alert(mp,
"%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
@@ -1849,7 +1848,7 @@ xfs_imap(
}
xfs_stack_trace();
#endif /* DEBUG */
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
blks_per_cluster = xfs_icluster_size_fsb(mp);
@@ -1922,7 +1921,7 @@ out_map:
__func__, (unsigned long long) imap->im_blkno,
(unsigned long long) imap->im_len,
XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
return 0;
}
@@ -2051,6 +2050,8 @@ xfs_agi_verify(
if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
return false;
+ if (be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS)
+ return false;
/*
* during growfs operations, the perag is not fully initialised,
* so we can't use it for any useful checking. growfs ensures we can't
@@ -2072,11 +2073,11 @@ xfs_agi_read_verify(
if (xfs_sb_version_hascrc(&mp->m_sb) &&
!xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
- xfs_buf_ioerror(bp, EFSBADCRC);
+ xfs_buf_ioerror(bp, -EFSBADCRC);
else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp,
XFS_ERRTAG_IALLOC_READ_AGI,
XFS_RANDOM_IALLOC_READ_AGI))
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
if (bp->b_error)
xfs_verifier_error(bp);
@@ -2090,7 +2091,7 @@ xfs_agi_write_verify(
struct xfs_buf_log_item *bip = bp->b_fspriv;
if (!xfs_agi_verify(bp)) {
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
xfs_verifier_error(bp);
return;
}
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 95ad1c002d60..95ad1c002d60 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 726f83a681a5..c9b06f30fe86 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -272,9 +272,9 @@ xfs_inobt_read_verify(
struct xfs_buf *bp)
{
if (!xfs_btree_sblock_verify_crc(bp))
- xfs_buf_ioerror(bp, EFSBADCRC);
+ xfs_buf_ioerror(bp, -EFSBADCRC);
else if (!xfs_inobt_verify(bp))
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
if (bp->b_error) {
trace_xfs_btree_corrupt(bp, _RET_IP_);
@@ -288,7 +288,7 @@ xfs_inobt_write_verify(
{
if (!xfs_inobt_verify(bp)) {
trace_xfs_btree_corrupt(bp, _RET_IP_);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
xfs_verifier_error(bp);
return;
}
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index d7ebea72c2d0..d7ebea72c2d0 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index cb35ae41d4a1..f18fd2da49f7 100644
--- a/fs/xfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -101,7 +101,7 @@ xfs_inode_buf_verify(
return;
}
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
xfs_verifier_error(bp);
#ifdef DEBUG
xfs_alert(mp,
@@ -174,14 +174,14 @@ xfs_imap_to_bp(
(int)imap->im_len, buf_flags, &bp,
&xfs_inode_buf_ops);
if (error) {
- if (error == EAGAIN) {
+ if (error == -EAGAIN) {
ASSERT(buf_flags & XBF_TRYLOCK);
return error;
}
- if (error == EFSCORRUPTED &&
+ if (error == -EFSCORRUPTED &&
(iget_flags & XFS_IGET_UNTRUSTED))
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
__func__, error);
@@ -390,7 +390,7 @@ xfs_iread(
__func__, ip->i_ino);
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
- error = XFS_ERROR(EFSCORRUPTED);
+ error = -EFSCORRUPTED;
goto out_brelse;
}
diff --git a/fs/xfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 9308c47f2a52..9308c47f2a52 100644
--- a/fs/xfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index b031e8d0d928..6a00f7fed69d 100644
--- a/fs/xfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -102,7 +102,7 @@ xfs_iformat_fork(
be64_to_cpu(dip->di_nblocks));
XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
ip->i_mount, dip);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
@@ -111,7 +111,7 @@ xfs_iformat_fork(
dip->di_forkoff);
XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
ip->i_mount, dip);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
@@ -121,7 +121,7 @@ xfs_iformat_fork(
ip->i_ino);
XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
XFS_ERRLEVEL_LOW, ip->i_mount, dip);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
switch (ip->i_d.di_mode & S_IFMT) {
@@ -132,7 +132,7 @@ xfs_iformat_fork(
if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
ip->i_mount, dip);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
ip->i_d.di_size = 0;
ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
@@ -153,7 +153,7 @@ xfs_iformat_fork(
XFS_CORRUPTION_ERROR("xfs_iformat(4)",
XFS_ERRLEVEL_LOW,
ip->i_mount, dip);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
di_size = be64_to_cpu(dip->di_size);
@@ -166,7 +166,7 @@ xfs_iformat_fork(
XFS_CORRUPTION_ERROR("xfs_iformat(5)",
XFS_ERRLEVEL_LOW,
ip->i_mount, dip);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
size = (int)di_size;
@@ -181,13 +181,13 @@ xfs_iformat_fork(
default:
XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
ip->i_mount);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
break;
default:
XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
if (error) {
return error;
@@ -211,7 +211,7 @@ xfs_iformat_fork(
XFS_CORRUPTION_ERROR("xfs_iformat(8)",
XFS_ERRLEVEL_LOW,
ip->i_mount, dip);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
@@ -223,7 +223,7 @@ xfs_iformat_fork(
error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
break;
default:
- error = XFS_ERROR(EFSCORRUPTED);
+ error = -EFSCORRUPTED;
break;
}
if (error) {
@@ -266,7 +266,7 @@ xfs_iformat_local(
XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
ip->i_mount, dip);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
ifp = XFS_IFORK_PTR(ip, whichfork);
real_size = 0;
@@ -322,7 +322,7 @@ xfs_iformat_extents(
(unsigned long long) ip->i_ino, nex);
XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
ip->i_mount, dip);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
ifp->if_real_bytes = 0;
@@ -350,7 +350,7 @@ xfs_iformat_extents(
XFS_ERROR_REPORT("xfs_iformat_extents(2)",
XFS_ERRLEVEL_LOW,
ip->i_mount);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
}
ifp->if_flags |= XFS_IFEXTENTS;
@@ -399,7 +399,7 @@ xfs_iformat_btree(
(unsigned long long) ip->i_ino);
XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
mp, dip);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
ifp->if_broot_bytes = size;
@@ -436,7 +436,7 @@ xfs_iread_extents(
if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
ip->i_mount);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
ifp = XFS_IFORK_PTR(ip, whichfork);
@@ -528,7 +528,7 @@ xfs_iroot_realloc(
ifp->if_broot_bytes = (int)new_size;
ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
XFS_IFORK_SIZE(ip, whichfork));
- memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
+ memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t));
return;
}
@@ -575,7 +575,7 @@ xfs_iroot_realloc(
ifp->if_broot_bytes);
np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
(int)new_size);
- memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
+ memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t));
}
kmem_free(ifp->if_broot);
ifp->if_broot = new_broot;
@@ -1692,7 +1692,7 @@ xfs_iext_idx_to_irec(
}
*idxp = page_idx;
*erp_idxp = erp_idx;
- return(erp);
+ return erp;
}
/*
diff --git a/fs/xfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 7d3b1ed6dcbe..7d3b1ed6dcbe 100644
--- a/fs/xfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/libxfs/xfs_inum.h
index 90efdaf1706f..4ff2278e147a 100644
--- a/fs/xfs/xfs_inum.h
+++ b/fs/xfs/libxfs/xfs_inum.h
@@ -54,11 +54,7 @@ struct xfs_mount;
#define XFS_OFFBNO_TO_AGINO(mp,b,o) \
((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o)))
-#if XFS_BIG_INUMS
#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL))
-#else
-#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL))
-#endif
#define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL))
#endif /* __XFS_INUM_H__ */
diff --git a/fs/xfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index f0969c77bdbe..aff12f2d4428 100644
--- a/fs/xfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -380,7 +380,7 @@ typedef struct xfs_icdinode {
xfs_ictimestamp_t di_mtime; /* time last modified */
xfs_ictimestamp_t di_ctime; /* time created/inode modified */
xfs_fsize_t di_size; /* number of bytes in file */
- xfs_drfsbno_t di_nblocks; /* # of direct & btree blocks used */
+ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */
xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
xfs_extnum_t di_nextents; /* number of extents in data fork */
xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
@@ -516,7 +516,7 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
* EFI/EFD log format definitions
*/
typedef struct xfs_extent {
- xfs_dfsbno_t ext_start;
+ xfs_fsblock_t ext_start;
xfs_extlen_t ext_len;
} xfs_extent_t;
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 1c55ccbb379d..1c55ccbb379d 100644
--- a/fs/xfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
diff --git a/fs/xfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
index ee7e0e80246b..ee7e0e80246b 100644
--- a/fs/xfs/xfs_log_rlimit.c
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
diff --git a/fs/xfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index 137e20937077..1b0a08379759 100644
--- a/fs/xfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -98,8 +98,6 @@ typedef __uint16_t xfs_qwarncnt_t;
#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \
XFS_GQUOTA_ACTIVE | \
XFS_PQUOTA_ACTIVE))
-#define XFS_IS_OQUOTA_ON(mp) ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \
- XFS_PQUOTA_ACTIVE))
#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE)
diff --git a/fs/xfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index f4dd697cac08..7c818f1e4484 100644
--- a/fs/xfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -424,20 +424,24 @@ xfs_rtfind_forw(
}
/*
- * Read and modify the summary information for a given extent size,
+ * Read and/or modify the summary information for a given extent size,
* bitmap block combination.
* Keeps track of a current summary block, so we don't keep reading
* it from the buffer cache.
+ *
+ * Summary information is returned in *sum if specified.
+ * If no delta is specified, returns summary only.
*/
int
-xfs_rtmodify_summary(
- xfs_mount_t *mp, /* file system mount point */
+xfs_rtmodify_summary_int(
+ xfs_mount_t *mp, /* file system mount structure */
xfs_trans_t *tp, /* transaction pointer */
int log, /* log2 of extent size */
xfs_rtblock_t bbno, /* bitmap block number */
int delta, /* change to make to summary info */
xfs_buf_t **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb) /* in/out: summary block number */
+ xfs_fsblock_t *rsb, /* in/out: summary block number */
+ xfs_suminfo_t *sum) /* out: summary info for this block */
{
xfs_buf_t *bp; /* buffer for the summary block */
int error; /* error value */
@@ -456,7 +460,7 @@ xfs_rtmodify_summary(
/*
* If we have an old buffer, and the block number matches, use that.
*/
- if (rbpp && *rbpp && *rsb == sb)
+ if (*rbpp && *rsb == sb)
bp = *rbpp;
/*
* Otherwise we have to get the buffer.
@@ -465,7 +469,7 @@ xfs_rtmodify_summary(
/*
* If there was an old one, get rid of it first.
*/
- if (rbpp && *rbpp)
+ if (*rbpp)
xfs_trans_brelse(tp, *rbpp);
error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
if (error) {
@@ -474,21 +478,38 @@ xfs_rtmodify_summary(
/*
* Remember this buffer and block for the next call.
*/
- if (rbpp) {
- *rbpp = bp;
- *rsb = sb;
- }
+ *rbpp = bp;
+ *rsb = sb;
}
/*
- * Point to the summary information, modify and log it.
+ * Point to the summary information, modify/log it, and/or copy it out.
*/
sp = XFS_SUMPTR(mp, bp, so);
- *sp += delta;
- xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr),
- (uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1));
+ if (delta) {
+ uint first = (uint)((char *)sp - (char *)bp->b_addr);
+
+ *sp += delta;
+ xfs_trans_log_buf(tp, bp, first, first + sizeof(*sp) - 1);
+ }
+ if (sum)
+ *sum = *sp;
return 0;
}
+int
+xfs_rtmodify_summary(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_trans_t *tp, /* transaction pointer */
+ int log, /* log2 of extent size */
+ xfs_rtblock_t bbno, /* bitmap block number */
+ int delta, /* change to make to summary info */
+ xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb) /* in/out: summary block number */
+{
+ return xfs_rtmodify_summary_int(mp, tp, log, bbno,
+ delta, rbpp, rsb, NULL);
+}
+
/*
* Set the given range of bitmap bits to the given value.
* Do whatever I/O and logging is required.
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 7703fa6770ff..5f902fa7913f 100644
--- a/fs/xfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -186,13 +186,13 @@ xfs_mount_validate_sb(
*/
if (sbp->sb_magicnum != XFS_SB_MAGIC) {
xfs_warn(mp, "bad magic number");
- return XFS_ERROR(EWRONGFS);
+ return -EWRONGFS;
}
if (!xfs_sb_good_version(sbp)) {
xfs_warn(mp, "bad version");
- return XFS_ERROR(EWRONGFS);
+ return -EWRONGFS;
}
/*
@@ -220,7 +220,7 @@ xfs_mount_validate_sb(
xfs_warn(mp,
"Attempted to mount read-only compatible filesystem read-write.\n"
"Filesystem can only be safely mounted read only.");
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
}
if (xfs_sb_has_incompat_feature(sbp,
@@ -230,7 +230,7 @@ xfs_mount_validate_sb(
"Filesystem can not be safely mounted by this kernel.",
(sbp->sb_features_incompat &
XFS_SB_FEAT_INCOMPAT_UNKNOWN));
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
}
@@ -238,13 +238,13 @@ xfs_mount_validate_sb(
if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) {
xfs_notice(mp,
"Version 5 of Super block has XFS_OQUOTA bits.");
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
} else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
xfs_notice(mp,
"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits.");
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
if (unlikely(
@@ -252,7 +252,7 @@ xfs_mount_validate_sb(
xfs_warn(mp,
"filesystem is marked as having an external log; "
"specify logdev on the mount command line.");
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
if (unlikely(
@@ -260,7 +260,7 @@ xfs_mount_validate_sb(
xfs_warn(mp,
"filesystem is marked as having an internal log; "
"do not specify logdev on the mount command line.");
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
/*
@@ -279,11 +279,13 @@ xfs_mount_validate_sb(
sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG ||
sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG ||
sbp->sb_blocksize != (1 << sbp->sb_blocklog) ||
+ sbp->sb_dirblklog > XFS_MAX_BLOCKSIZE_LOG ||
sbp->sb_inodesize < XFS_DINODE_MIN_SIZE ||
sbp->sb_inodesize > XFS_DINODE_MAX_SIZE ||
sbp->sb_inodelog < XFS_DINODE_MIN_LOG ||
sbp->sb_inodelog > XFS_DINODE_MAX_LOG ||
sbp->sb_inodesize != (1 << sbp->sb_inodelog) ||
+ sbp->sb_logsunit > XLOG_MAX_RECORD_BSIZE ||
sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) ||
(sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) ||
(sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
@@ -294,7 +296,7 @@ xfs_mount_validate_sb(
sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp) ||
sbp->sb_shared_vn != 0)) {
xfs_notice(mp, "SB sanity check failed");
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
/*
@@ -305,7 +307,7 @@ xfs_mount_validate_sb(
"File system with blocksize %d bytes. "
"Only pagesize (%ld) or less will currently work.",
sbp->sb_blocksize, PAGE_SIZE);
- return XFS_ERROR(ENOSYS);
+ return -ENOSYS;
}
/*
@@ -320,19 +322,19 @@ xfs_mount_validate_sb(
default:
xfs_warn(mp, "inode size of %d bytes not supported",
sbp->sb_inodesize);
- return XFS_ERROR(ENOSYS);
+ return -ENOSYS;
}
if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
xfs_warn(mp,
"file system too large to be mounted on this system.");
- return XFS_ERROR(EFBIG);
+ return -EFBIG;
}
if (check_inprogress && sbp->sb_inprogress) {
xfs_warn(mp, "Offline file system operation in progress!");
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
return 0;
}
@@ -386,10 +388,11 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp)
}
}
-void
-xfs_sb_from_disk(
+static void
+__xfs_sb_from_disk(
struct xfs_sb *to,
- xfs_dsb_t *from)
+ xfs_dsb_t *from,
+ bool convert_xquota)
{
to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
@@ -442,9 +445,22 @@ xfs_sb_from_disk(
to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat);
to->sb_features_log_incompat =
be32_to_cpu(from->sb_features_log_incompat);
+ /* crc is only used on disk, not in memory; just init to 0 here. */
+ to->sb_crc = 0;
to->sb_pad = 0;
to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
to->sb_lsn = be64_to_cpu(from->sb_lsn);
+ /* Convert on-disk flags to in-memory flags? */
+ if (convert_xquota)
+ xfs_sb_quota_from_disk(to);
+}
+
+void
+xfs_sb_from_disk(
+ struct xfs_sb *to,
+ xfs_dsb_t *from)
+{
+ __xfs_sb_from_disk(to, from, true);
}
static inline void
@@ -536,6 +552,9 @@ xfs_sb_to_disk(
if (!fields)
return;
+ /* We should never write the crc here, it's updated in the IO path */
+ fields &= ~XFS_SB_CRC;
+
xfs_sb_quota_to_disk(to, from, &fields);
while (fields) {
f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
@@ -577,7 +596,11 @@ xfs_sb_verify(
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_sb sb;
- xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
+ /*
+ * Use call variant which doesn't convert quota flags from disk
+ * format, because xfs_mount_validate_sb checks the on-disk flags.
+ */
+ __xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false);
/*
* Only check the in progress field for the primary superblock as
@@ -620,7 +643,7 @@ xfs_sb_read_verify(
/* Only fail bad secondaries on a known V5 filesystem */
if (bp->b_bn == XFS_SB_DADDR ||
xfs_sb_version_hascrc(&mp->m_sb)) {
- error = EFSBADCRC;
+ error = -EFSBADCRC;
goto out_error;
}
}
@@ -630,7 +653,7 @@ xfs_sb_read_verify(
out_error:
if (error) {
xfs_buf_ioerror(bp, error);
- if (error == EFSCORRUPTED || error == EFSBADCRC)
+ if (error == -EFSCORRUPTED || error == -EFSBADCRC)
xfs_verifier_error(bp);
}
}
@@ -653,7 +676,7 @@ xfs_sb_quiet_read_verify(
return;
}
/* quietly fail */
- xfs_buf_ioerror(bp, EWRONGFS);
+ xfs_buf_ioerror(bp, -EWRONGFS);
}
static void
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index c43c2d609a24..2e739708afd3 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -87,11 +87,11 @@ struct xfs_trans;
typedef struct xfs_sb {
__uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */
__uint32_t sb_blocksize; /* logical block size, bytes */
- xfs_drfsbno_t sb_dblocks; /* number of data blocks */
- xfs_drfsbno_t sb_rblocks; /* number of realtime blocks */
- xfs_drtbno_t sb_rextents; /* number of realtime extents */
+ xfs_rfsblock_t sb_dblocks; /* number of data blocks */
+ xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */
+ xfs_rtblock_t sb_rextents; /* number of realtime extents */
uuid_t sb_uuid; /* file system unique id */
- xfs_dfsbno_t sb_logstart; /* starting block of log if internal */
+ xfs_fsblock_t sb_logstart; /* starting block of log if internal */
xfs_ino_t sb_rootino; /* root inode number */
xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */
xfs_ino_t sb_rsumino; /* summary inode for rt bitmap */
diff --git a/fs/xfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 82404da2ca67..82404da2ca67 100644
--- a/fs/xfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 23c2f2577c8d..5782f037eab4 100644
--- a/fs/xfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -133,9 +133,9 @@ xfs_symlink_read_verify(
return;
if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF))
- xfs_buf_ioerror(bp, EFSBADCRC);
+ xfs_buf_ioerror(bp, -EFSBADCRC);
else if (!xfs_symlink_verify(bp))
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
if (bp->b_error)
xfs_verifier_error(bp);
@@ -153,7 +153,7 @@ xfs_symlink_write_verify(
return;
if (!xfs_symlink_verify(bp)) {
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
xfs_verifier_error(bp);
return;
}
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index f2bda7c76b8a..f2bda7c76b8a 100644
--- a/fs/xfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
diff --git a/fs/xfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 1097d14cd583..1097d14cd583 100644
--- a/fs/xfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index bf9c4579334d..bf9c4579334d 100644
--- a/fs/xfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
diff --git a/fs/xfs/time.h b/fs/xfs/time.h
deleted file mode 100644
index 387e695a184c..000000000000
--- a/fs/xfs/time.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_SUPPORT_TIME_H__
-#define __XFS_SUPPORT_TIME_H__
-
-#include <linux/sched.h>
-#include <linux/time.h>
-
-typedef struct timespec timespec_t;
-
-static inline void delay(long ticks)
-{
- schedule_timeout_uninterruptible(ticks);
-}
-
-static inline void nanotime(struct timespec *tvp)
-{
- *tvp = CURRENT_TIME;
-}
-
-#endif /* __XFS_SUPPORT_TIME_H__ */
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 6888ad886ff6..a65fa5dde6e9 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -152,7 +152,7 @@ xfs_get_acl(struct inode *inode, int type)
if (!xfs_acl)
return ERR_PTR(-ENOMEM);
- error = -xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
+ error = xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
&len, ATTR_ROOT);
if (error) {
/*
@@ -210,7 +210,7 @@ __xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
len -= sizeof(struct xfs_acl_entry) *
(XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count);
- error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
+ error = xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
len, ATTR_ROOT);
kmem_free(xfs_acl);
@@ -218,7 +218,7 @@ __xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
/*
* A NULL ACL argument means we want to remove the ACL.
*/
- error = -xfs_attr_remove(ip, ea_name, ATTR_ROOT);
+ error = xfs_attr_remove(ip, ea_name, ATTR_ROOT);
/*
* If the attribute didn't exist to start with that's fine.
@@ -244,7 +244,7 @@ xfs_set_mode(struct inode *inode, umode_t mode)
iattr.ia_mode = mode;
iattr.ia_ctime = current_fs_time(inode->i_sb);
- error = -xfs_setattr_nonsize(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
+ error = xfs_setattr_nonsize(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
}
return error;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index faaf716e2080..f5b2453a43b2 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -240,7 +240,7 @@ xfs_end_io(
done:
if (error)
- ioend->io_error = -error;
+ ioend->io_error = error;
xfs_destroy_ioend(ioend);
}
@@ -308,14 +308,14 @@ xfs_map_blocks(
int nimaps = 1;
if (XFS_FORCED_SHUTDOWN(mp))
- return -XFS_ERROR(EIO);
+ return -EIO;
if (type == XFS_IO_UNWRITTEN)
bmapi_flags |= XFS_BMAPI_IGSTATE;
if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
if (nonblocking)
- return -XFS_ERROR(EAGAIN);
+ return -EAGAIN;
xfs_ilock(ip, XFS_ILOCK_SHARED);
}
@@ -332,14 +332,14 @@ xfs_map_blocks(
xfs_iunlock(ip, XFS_ILOCK_SHARED);
if (error)
- return -XFS_ERROR(error);
+ return error;
if (type == XFS_IO_DELALLOC &&
(!nimaps || isnullstartblock(imap->br_startblock))) {
error = xfs_iomap_write_allocate(ip, offset, imap);
if (!error)
trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
- return -XFS_ERROR(error);
+ return error;
}
#ifdef DEBUG
@@ -434,10 +434,22 @@ xfs_start_page_writeback(
{
ASSERT(PageLocked(page));
ASSERT(!PageWriteback(page));
- if (clear_dirty)
+
+ /*
+ * if the page was not fully cleaned, we need to ensure that the higher
+ * layers come back to it correctly. That means we need to keep the page
+ * dirty, and for WB_SYNC_ALL writeback we need to ensure the
+ * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to
+ * write this page in this writeback sweep will be made.
+ */
+ if (clear_dirty) {
clear_page_dirty_for_io(page);
- set_page_writeback(page);
+ set_page_writeback(page);
+ } else
+ set_page_writeback_keepwrite(page);
+
unlock_page(page);
+
/* If no buffers on the page are to be written, finish it here */
if (!buffers)
end_page_writeback(page);
@@ -502,7 +514,7 @@ xfs_submit_ioend(
* time.
*/
if (fail) {
- ioend->io_error = -fail;
+ ioend->io_error = fail;
xfs_finish_ioend(ioend);
continue;
}
@@ -548,6 +560,13 @@ xfs_cancel_ioend(
do {
next_bh = bh->b_private;
clear_buffer_async_write(bh);
+ /*
+ * The unwritten flag is cleared when added to the
+ * ioend. We're not submitting for I/O so mark the
+ * buffer unwritten again for next time around.
+ */
+ if (ioend->io_type == XFS_IO_UNWRITTEN)
+ set_buffer_unwritten(bh);
unlock_buffer(bh);
} while ((bh = next_bh) != NULL);
@@ -1253,7 +1272,7 @@ __xfs_get_blocks(
int new = 0;
if (XFS_FORCED_SHUTDOWN(mp))
- return -XFS_ERROR(EIO);
+ return -EIO;
offset = (xfs_off_t)iblock << inode->i_blkbits;
ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
@@ -1302,7 +1321,7 @@ __xfs_get_blocks(
error = xfs_iomap_write_direct(ip, offset, size,
&imap, nimaps);
if (error)
- return -error;
+ return error;
new = 1;
} else {
/*
@@ -1415,7 +1434,7 @@ __xfs_get_blocks(
out_unlock:
xfs_iunlock(ip, lockmode);
- return -error;
+ return error;
}
int
@@ -1753,11 +1772,72 @@ xfs_vm_readpages(
return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
}
+/*
+ * This is basically a copy of __set_page_dirty_buffers() with one
+ * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
+ * dirty, we'll never be able to clean them because we don't write buffers
+ * beyond EOF, and that means we can't invalidate pages that span EOF
+ * that have been marked dirty. Further, the dirty state can leak into
+ * the file interior if the file is extended, resulting in all sorts of
+ * bad things happening as the state does not match the underlying data.
+ *
+ * XXX: this really indicates that bufferheads in XFS need to die. Warts like
+ * this only exist because of bufferheads and how the generic code manages them.
+ */
+STATIC int
+xfs_vm_set_page_dirty(
+ struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ loff_t end_offset;
+ loff_t offset;
+ int newly_dirty;
+
+ if (unlikely(!mapping))
+ return !TestSetPageDirty(page);
+
+ end_offset = i_size_read(inode);
+ offset = page_offset(page);
+
+ spin_lock(&mapping->private_lock);
+ if (page_has_buffers(page)) {
+ struct buffer_head *head = page_buffers(page);
+ struct buffer_head *bh = head;
+
+ do {
+ if (offset < end_offset)
+ set_buffer_dirty(bh);
+ bh = bh->b_this_page;
+ offset += 1 << inode->i_blkbits;
+ } while (bh != head);
+ }
+ newly_dirty = !TestSetPageDirty(page);
+ spin_unlock(&mapping->private_lock);
+
+ if (newly_dirty) {
+ /* sigh - __set_page_dirty() is static, so copy it here, too */
+ unsigned long flags;
+
+ spin_lock_irqsave(&mapping->tree_lock, flags);
+ if (page->mapping) { /* Race with truncate? */
+ WARN_ON_ONCE(!PageUptodate(page));
+ account_page_dirtied(page, mapping);
+ radix_tree_tag_set(&mapping->page_tree,
+ page_index(page), PAGECACHE_TAG_DIRTY);
+ }
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+ }
+ return newly_dirty;
+}
+
const struct address_space_operations xfs_address_space_operations = {
.readpage = xfs_vm_readpage,
.readpages = xfs_vm_readpages,
.writepage = xfs_vm_writepage,
.writepages = xfs_vm_writepages,
+ .set_page_dirty = xfs_vm_set_page_dirty,
.releasepage = xfs_vm_releasepage,
.invalidatepage = xfs_vm_invalidatepage,
.write_begin = xfs_vm_write_begin,
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 09480c57f069..aa2a8b1838a2 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -76,7 +76,7 @@ xfs_attr3_leaf_freextent(
error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt,
&map, &nmap, XFS_BMAPI_ATTRFORK);
if (error) {
- return(error);
+ return error;
}
ASSERT(nmap == 1);
ASSERT(map.br_startblock != DELAYSTARTBLOCK);
@@ -95,21 +95,21 @@ xfs_attr3_leaf_freextent(
dp->i_mount->m_ddev_targp,
dblkno, dblkcnt, 0);
if (!bp)
- return ENOMEM;
+ return -ENOMEM;
xfs_trans_binval(*trans, bp);
/*
* Roll to next transaction.
*/
error = xfs_trans_roll(trans, dp);
if (error)
- return (error);
+ return error;
}
tblkno += map.br_blockcount;
tblkcnt -= map.br_blockcount;
}
- return(0);
+ return 0;
}
/*
@@ -227,7 +227,7 @@ xfs_attr3_node_inactive(
*/
if (level > XFS_DA_NODE_MAXDEPTH) {
xfs_trans_brelse(*trans, bp); /* no locks for later trans */
- return XFS_ERROR(EIO);
+ return -EIO;
}
node = bp->b_addr;
@@ -256,7 +256,7 @@ xfs_attr3_node_inactive(
error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp,
XFS_ATTR_FORK);
if (error)
- return(error);
+ return error;
if (child_bp) {
/* save for re-read later */
child_blkno = XFS_BUF_ADDR(child_bp);
@@ -277,7 +277,7 @@ xfs_attr3_node_inactive(
child_bp);
break;
default:
- error = XFS_ERROR(EIO);
+ error = -EIO;
xfs_trans_brelse(*trans, child_bp);
break;
}
@@ -360,7 +360,7 @@ xfs_attr3_root_inactive(
error = xfs_attr3_leaf_inactive(trans, dp, bp);
break;
default:
- error = XFS_ERROR(EIO);
+ error = -EIO;
xfs_trans_brelse(*trans, bp);
break;
}
@@ -414,7 +414,7 @@ xfs_attr_inactive(xfs_inode_t *dp)
error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0);
if (error) {
xfs_trans_cancel(trans, 0);
- return(error);
+ return error;
}
xfs_ilock(dp, XFS_ILOCK_EXCL);
@@ -443,10 +443,10 @@ xfs_attr_inactive(xfs_inode_t *dp)
error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return(error);
+ return error;
out:
xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return(error);
+ return error;
}
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 90e2eeb21207..62db83ab6cbc 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -50,11 +50,11 @@ xfs_attr_shortform_compare(const void *a, const void *b)
sa = (xfs_attr_sf_sort_t *)a;
sb = (xfs_attr_sf_sort_t *)b;
if (sa->hash < sb->hash) {
- return(-1);
+ return -1;
} else if (sa->hash > sb->hash) {
- return(1);
+ return 1;
} else {
- return(sa->entno - sb->entno);
+ return sa->entno - sb->entno;
}
}
@@ -86,7 +86,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
ASSERT(sf != NULL);
if (!sf->hdr.count)
- return(0);
+ return 0;
cursor = context->cursor;
ASSERT(cursor != NULL);
@@ -124,7 +124,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
}
trace_xfs_attr_list_sf_all(context);
- return(0);
+ return 0;
}
/* do no more for a search callback */
@@ -150,7 +150,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
XFS_ERRLEVEL_LOW,
context->dp->i_mount, sfe);
kmem_free(sbuf);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
sbp->entno = i;
@@ -188,7 +188,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
}
if (i == nsbuf) {
kmem_free(sbuf);
- return(0);
+ return 0;
}
/*
@@ -213,7 +213,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
}
kmem_free(sbuf);
- return(0);
+ return 0;
}
STATIC int
@@ -243,8 +243,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
if (cursor->blkno > 0) {
error = xfs_da3_node_read(NULL, dp, cursor->blkno, -1,
&bp, XFS_ATTR_FORK);
- if ((error != 0) && (error != EFSCORRUPTED))
- return(error);
+ if ((error != 0) && (error != -EFSCORRUPTED))
+ return error;
if (bp) {
struct xfs_attr_leaf_entry *entries;
@@ -295,7 +295,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
cursor->blkno, -1, &bp,
XFS_ATTR_FORK);
if (error)
- return(error);
+ return error;
node = bp->b_addr;
magic = be16_to_cpu(node->hdr.info.magic);
if (magic == XFS_ATTR_LEAF_MAGIC ||
@@ -308,7 +308,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
context->dp->i_mount,
node);
xfs_trans_brelse(NULL, bp);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
dp->d_ops->node_hdr_from_disk(&nodehdr, node);
@@ -496,11 +496,11 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context)
context->cursor->blkno = 0;
error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp);
if (error)
- return XFS_ERROR(error);
+ return error;
error = xfs_attr3_leaf_list_int(bp, context);
xfs_trans_brelse(NULL, bp);
- return XFS_ERROR(error);
+ return error;
}
int
@@ -514,7 +514,7 @@ xfs_attr_list_int(
XFS_STATS_INC(xs_attr_list);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
- return EIO;
+ return -EIO;
/*
* Decide on what work routines to call based on the inode size.
@@ -616,16 +616,16 @@ xfs_attr_list(
* Validate the cursor.
*/
if (cursor->pad1 || cursor->pad2)
- return(XFS_ERROR(EINVAL));
+ return -EINVAL;
if ((cursor->initted == 0) &&
(cursor->hashval || cursor->blkno || cursor->offset))
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
/*
* Check for a properly aligned buffer.
*/
if (((long)buffer) & (sizeof(int)-1))
- return XFS_ERROR(EFAULT);
+ return -EFAULT;
if (flags & ATTR_KERNOVAL)
bufsize = 0;
@@ -648,6 +648,6 @@ xfs_attr_list(
alist->al_offset[0] = context.bufsize;
error = xfs_attr_list_int(&context);
- ASSERT(error >= 0);
+ ASSERT(error <= 0);
return error;
}
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 64731ef3324d..281002689d64 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -133,7 +133,7 @@ xfs_bmap_finish(
mp = ntp->t_mountp;
if (!XFS_FORCED_SHUTDOWN(mp))
xfs_force_shutdown(mp,
- (error == EFSCORRUPTED) ?
+ (error == -EFSCORRUPTED) ?
SHUTDOWN_CORRUPT_INCORE :
SHUTDOWN_META_IO_ERROR);
return error;
@@ -365,7 +365,7 @@ xfs_bmap_count_tree(
xfs_trans_brelse(tp, bp);
XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
XFS_ERRLEVEL_LOW, mp);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
xfs_trans_brelse(tp, bp);
} else {
@@ -425,14 +425,14 @@ xfs_bmap_count_blocks(
ASSERT(level > 0);
pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
bno = be64_to_cpu(*pp);
- ASSERT(bno != NULLDFSBNO);
+ ASSERT(bno != NULLFSBLOCK);
ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
mp);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
return 0;
@@ -524,13 +524,13 @@ xfs_getbmap(
if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
} else if (unlikely(
ip->i_d.di_aformat != 0 &&
ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
ip->i_mount);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
prealloced = 0;
@@ -539,7 +539,7 @@ xfs_getbmap(
if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
if (xfs_get_extsz_hint(ip) ||
ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
@@ -559,26 +559,26 @@ xfs_getbmap(
bmv->bmv_entries = 0;
return 0;
} else if (bmv->bmv_length < 0) {
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
nex = bmv->bmv_count - 1;
if (nex <= 0)
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
bmvend = bmv->bmv_offset + bmv->bmv_length;
if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
- return XFS_ERROR(ENOMEM);
+ return -ENOMEM;
out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
if (!out)
- return XFS_ERROR(ENOMEM);
+ return -ENOMEM;
xfs_ilock(ip, XFS_IOLOCK_SHARED);
if (whichfork == XFS_DATA_FORK) {
if (!(iflags & BMV_IF_DELALLOC) &&
(ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
- error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
+ error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
if (error)
goto out_unlock_iolock;
@@ -611,7 +611,7 @@ xfs_getbmap(
/*
* Allocate enough space to handle "subnex" maps at a time.
*/
- error = ENOMEM;
+ error = -ENOMEM;
subnex = 16;
map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
if (!map)
@@ -809,7 +809,7 @@ xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
* have speculative prealloc/delalloc blocks to remove.
*/
if (VFS_I(ip)->i_size == 0 &&
- VN_CACHED(VFS_I(ip)) == 0 &&
+ VFS_I(ip)->i_mapping->nrpages == 0 &&
ip->i_delayed_blks == 0)
return false;
@@ -882,7 +882,7 @@ xfs_free_eofblocks(
if (need_iolock) {
if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
xfs_trans_cancel(tp, 0);
- return EAGAIN;
+ return -EAGAIN;
}
}
@@ -955,14 +955,14 @@ xfs_alloc_file_space(
trace_xfs_alloc_file_space(ip);
if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
+ return -EIO;
error = xfs_qm_dqattach(ip, 0);
if (error)
return error;
if (len <= 0)
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
rt = XFS_IS_REALTIME_INODE(ip);
extsz = xfs_get_extsz_hint(ip);
@@ -1028,7 +1028,7 @@ xfs_alloc_file_space(
/*
* Free the transaction structure.
*/
- ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+ ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
xfs_trans_cancel(tp, 0);
break;
}
@@ -1065,7 +1065,7 @@ xfs_alloc_file_space(
allocated_fsb = imapp->br_blockcount;
if (nimaps == 0) {
- error = XFS_ERROR(ENOSPC);
+ error = -ENOSPC;
break;
}
@@ -1122,14 +1122,6 @@ xfs_zero_remaining_bytes(
if (endoff > XFS_ISIZE(ip))
endoff = XFS_ISIZE(ip);
- bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
- mp->m_rtdev_targp : mp->m_ddev_targp,
- BTOBB(mp->m_sb.sb_blocksize), 0);
- if (!bp)
- return XFS_ERROR(ENOMEM);
-
- xfs_buf_unlock(bp);
-
for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
uint lock_mode;
@@ -1152,42 +1144,24 @@ xfs_zero_remaining_bytes(
ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
if (imap.br_state == XFS_EXT_UNWRITTEN)
continue;
- XFS_BUF_UNDONE(bp);
- XFS_BUF_UNWRITE(bp);
- XFS_BUF_READ(bp);
- XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
- if (XFS_FORCED_SHUTDOWN(mp)) {
- error = XFS_ERROR(EIO);
- break;
- }
- xfs_buf_iorequest(bp);
- error = xfs_buf_iowait(bp);
- if (error) {
- xfs_buf_ioerror_alert(bp,
- "xfs_zero_remaining_bytes(read)");
- break;
- }
+ error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
+ mp->m_rtdev_targp : mp->m_ddev_targp,
+ xfs_fsb_to_db(ip, imap.br_startblock),
+ BTOBB(mp->m_sb.sb_blocksize),
+ 0, &bp, NULL);
+ if (error)
+ return error;
+
memset(bp->b_addr +
- (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
- 0, lastoffset - offset + 1);
- XFS_BUF_UNDONE(bp);
- XFS_BUF_UNREAD(bp);
- XFS_BUF_WRITE(bp);
-
- if (XFS_FORCED_SHUTDOWN(mp)) {
- error = XFS_ERROR(EIO);
- break;
- }
- xfs_buf_iorequest(bp);
- error = xfs_buf_iowait(bp);
- if (error) {
- xfs_buf_ioerror_alert(bp,
- "xfs_zero_remaining_bytes(write)");
- break;
- }
+ (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
+ 0, lastoffset - offset + 1);
+
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
+ if (error)
+ return error;
}
- xfs_buf_free(bp);
return error;
}
@@ -1205,6 +1179,7 @@ xfs_free_file_space(
xfs_bmap_free_t free_list;
xfs_bmbt_irec_t imap;
xfs_off_t ioffset;
+ xfs_off_t iendoffset;
xfs_extlen_t mod=0;
xfs_mount_t *mp;
int nimap;
@@ -1233,12 +1208,13 @@ xfs_free_file_space(
inode_dio_wait(VFS_I(ip));
rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
- ioffset = offset & ~(rounding - 1);
- error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
- ioffset, -1);
+ ioffset = round_down(offset, rounding);
+ iendoffset = round_up(offset + len, rounding) - 1;
+ error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset,
+ iendoffset);
if (error)
goto out;
- truncate_pagecache_range(VFS_I(ip), ioffset, -1);
+ truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset);
/*
* Need to zero the stuff we're not freeing, on disk.
@@ -1315,7 +1291,7 @@ xfs_free_file_space(
/*
* Free the transaction structure.
*/
- ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+ ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
xfs_trans_cancel(tp, 0);
break;
}
@@ -1362,7 +1338,10 @@ xfs_free_file_space(
goto out;
}
-
+/*
+ * Preallocate and zero a range of a file. This mechanism has the allocation
+ * semantics of fallocate and in addition converts data in the range to zeroes.
+ */
int
xfs_zero_file_space(
struct xfs_inode *ip,
@@ -1370,65 +1349,30 @@ xfs_zero_file_space(
xfs_off_t len)
{
struct xfs_mount *mp = ip->i_mount;
- uint granularity;
- xfs_off_t start_boundary;
- xfs_off_t end_boundary;
+ uint blksize;
int error;
trace_xfs_zero_file_space(ip);
- granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
+ blksize = 1 << mp->m_sb.sb_blocklog;
/*
- * Round the range of extents we are going to convert inwards. If the
- * offset is aligned, then it doesn't get changed so we zero from the
- * start of the block offset points to.
+ * Punch a hole and prealloc the range. We use hole punch rather than
+ * unwritten extent conversion for two reasons:
+ *
+ * 1.) Hole punch handles partial block zeroing for us.
+ *
+ * 2.) If prealloc returns ENOSPC, the file range is still zero-valued
+ * by virtue of the hole punch.
*/
- start_boundary = round_up(offset, granularity);
- end_boundary = round_down(offset + len, granularity);
-
- ASSERT(start_boundary >= offset);
- ASSERT(end_boundary <= offset + len);
-
- if (start_boundary < end_boundary - 1) {
- /*
- * punch out delayed allocation blocks and the page cache over
- * the conversion range
- */
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_bmap_punch_delalloc_range(ip,
- XFS_B_TO_FSBT(mp, start_boundary),
- XFS_B_TO_FSB(mp, end_boundary - start_boundary));
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- truncate_pagecache_range(VFS_I(ip), start_boundary,
- end_boundary - 1);
-
- /* convert the blocks */
- error = xfs_alloc_file_space(ip, start_boundary,
- end_boundary - start_boundary - 1,
- XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT);
- if (error)
- goto out;
-
- /* We've handled the interior of the range, now for the edges */
- if (start_boundary != offset) {
- error = xfs_iozero(ip, offset, start_boundary - offset);
- if (error)
- goto out;
- }
-
- if (end_boundary != offset + len)
- error = xfs_iozero(ip, end_boundary,
- offset + len - end_boundary);
-
- } else {
- /*
- * It's either a sub-granularity range or the range spanned lies
- * partially across two adjacent blocks.
- */
- error = xfs_iozero(ip, offset, len);
- }
+ error = xfs_free_file_space(ip, offset, len);
+ if (error)
+ goto out;
+ error = xfs_alloc_file_space(ip, round_down(offset, blksize),
+ round_up(offset + len, blksize) -
+ round_down(offset, blksize),
+ XFS_BMAPI_PREALLOC);
out:
return error;
@@ -1456,24 +1400,50 @@ xfs_collapse_file_space(
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
int error;
- xfs_extnum_t current_ext = 0;
struct xfs_bmap_free free_list;
xfs_fsblock_t first_block;
int committed;
xfs_fileoff_t start_fsb;
+ xfs_fileoff_t next_fsb;
xfs_fileoff_t shift_fsb;
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
trace_xfs_collapse_file_space(ip);
- start_fsb = XFS_B_TO_FSB(mp, offset + len);
+ next_fsb = XFS_B_TO_FSB(mp, offset + len);
shift_fsb = XFS_B_TO_FSB(mp, len);
error = xfs_free_file_space(ip, offset, len);
if (error)
return error;
+ /*
+ * Trim eofblocks to avoid shifting uninitialized post-eof preallocation
+ * into the accessible region of the file.
+ */
+ if (xfs_can_free_eofblocks(ip, true)) {
+ error = xfs_free_eofblocks(mp, ip, false);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Writeback and invalidate cache for the remainder of the file as we're
+ * about to shift down every extent from the collapse range to EOF. The
+ * free of the collapse range above might have already done some of
+ * this, but we shouldn't rely on it to do anything outside of the range
+ * that was freed.
+ */
+ error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+ offset + len, -1);
+ if (error)
+ return error;
+ error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
+ (offset + len) >> PAGE_CACHE_SHIFT, -1);
+ if (error)
+ return error;
+
while (!error && !done) {
tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
/*
@@ -1505,10 +1475,10 @@ xfs_collapse_file_space(
* We are using the write transaction in which max 2 bmbt
* updates are allowed
*/
- error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb,
- shift_fsb, &current_ext,
- &first_block, &free_list,
- XFS_BMAP_MAX_SHIFT_EXTENTS);
+ start_fsb = next_fsb;
+ error = xfs_bmap_shift_extents(tp, ip, start_fsb, shift_fsb,
+ &done, &next_fsb, &first_block, &free_list,
+ XFS_BMAP_MAX_SHIFT_EXTENTS);
if (error)
goto out;
@@ -1557,14 +1527,14 @@ xfs_swap_extents_check_format(
/* Should never get a local format */
if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
- return EINVAL;
+ return -EINVAL;
/*
* if the target inode has less extents that then temporary inode then
* why did userspace call us?
*/
if (ip->i_d.di_nextents < tip->i_d.di_nextents)
- return EINVAL;
+ return -EINVAL;
/*
* if the target inode is in extent form and the temp inode is in btree
@@ -1573,19 +1543,19 @@ xfs_swap_extents_check_format(
*/
if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
- return EINVAL;
+ return -EINVAL;
/* Check temp in extent form to max in target */
if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
- return EINVAL;
+ return -EINVAL;
/* Check target in extent form to max in temp */
if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
- return EINVAL;
+ return -EINVAL;
/*
* If we are in a btree format, check that the temp root block will fit
@@ -1599,25 +1569,49 @@ xfs_swap_extents_check_format(
if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
if (XFS_IFORK_BOFF(ip) &&
XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
- return EINVAL;
+ return -EINVAL;
if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
- return EINVAL;
+ return -EINVAL;
}
/* Reciprocal target->temp btree format checks */
if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
if (XFS_IFORK_BOFF(tip) &&
XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
- return EINVAL;
+ return -EINVAL;
if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
- return EINVAL;
+ return -EINVAL;
}
return 0;
}
+static int
+xfs_swap_extent_flush(
+ struct xfs_inode *ip)
+{
+ int error;
+
+ error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+ if (error)
+ return error;
+ truncate_pagecache_range(VFS_I(ip), 0, -1);
+
+ /* Verify O_DIRECT for ftmp */
+ if (VFS_I(ip)->i_mapping->nrpages)
+ return -EINVAL;
+
+ /*
+ * Don't try to swap extents on mmap()d files because we can't lock
+ * out races against page faults safely.
+ */
+ if (mapping_mapped(VFS_I(ip)->i_mapping))
+ return -EBUSY;
+ return 0;
+}
+
int
xfs_swap_extents(
xfs_inode_t *ip, /* target inode */
@@ -1633,51 +1627,57 @@ xfs_swap_extents(
int aforkblks = 0;
int taforkblks = 0;
__uint64_t tmp;
+ int lock_flags;
tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
if (!tempifp) {
- error = XFS_ERROR(ENOMEM);
+ error = -ENOMEM;
goto out;
}
/*
- * we have to do two separate lock calls here to keep lockdep
- * happy. If we try to get all the locks in one call, lock will
- * report false positives when we drop the ILOCK and regain them
- * below.
+ * Lock up the inodes against other IO and truncate to begin with.
+ * Then we can ensure the inodes are flushed and have no page cache
+ * safely. Once we have done this we can take the ilocks and do the rest
+ * of the checks.
*/
+ lock_flags = XFS_IOLOCK_EXCL;
xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
- xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
/* Verify that both files have the same format */
if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
- error = XFS_ERROR(EINVAL);
+ error = -EINVAL;
goto out_unlock;
}
/* Verify both files are either real-time or non-realtime */
if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
- error = XFS_ERROR(EINVAL);
+ error = -EINVAL;
goto out_unlock;
}
- error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);
+ error = xfs_swap_extent_flush(ip);
+ if (error)
+ goto out_unlock;
+ error = xfs_swap_extent_flush(tip);
if (error)
goto out_unlock;
- truncate_pagecache_range(VFS_I(tip), 0, -1);
- /* Verify O_DIRECT for ftmp */
- if (VN_CACHED(VFS_I(tip)) != 0) {
- error = XFS_ERROR(EINVAL);
+ tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
goto out_unlock;
}
+ xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
+ lock_flags |= XFS_ILOCK_EXCL;
/* Verify all data are being swapped */
if (sxp->sx_offset != 0 ||
sxp->sx_length != ip->i_d.di_size ||
sxp->sx_length != tip->i_d.di_size) {
- error = XFS_ERROR(EFAULT);
- goto out_unlock;
+ error = -EFAULT;
+ goto out_trans_cancel;
}
trace_xfs_swap_extent_before(ip, 0);
@@ -1689,7 +1689,7 @@ xfs_swap_extents(
xfs_notice(mp,
"%s: inode 0x%llx format is incompatible for exchanging.",
__func__, ip->i_ino);
- goto out_unlock;
+ goto out_trans_cancel;
}
/*
@@ -1703,43 +1703,9 @@ xfs_swap_extents(
(sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
(sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
(sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
- error = XFS_ERROR(EBUSY);
- goto out_unlock;
+ error = -EBUSY;
+ goto out_trans_cancel;
}
-
- /* We need to fail if the file is memory mapped. Once we have tossed
- * all existing pages, the page fault will have no option
- * but to go to the filesystem for pages. By making the page fault call
- * vop_read (or write in the case of autogrow) they block on the iolock
- * until we have switched the extents.
- */
- if (VN_MAPPED(VFS_I(ip))) {
- error = XFS_ERROR(EBUSY);
- goto out_unlock;
- }
-
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_iunlock(tip, XFS_ILOCK_EXCL);
-
- /*
- * There is a race condition here since we gave up the
- * ilock. However, the data fork will not change since
- * we have the iolock (locked for truncation too) so we
- * are safe. We don't really care if non-io related
- * fields change.
- */
- truncate_pagecache_range(VFS_I(ip), 0, -1);
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
- if (error) {
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- xfs_iunlock(tip, XFS_IOLOCK_EXCL);
- xfs_trans_cancel(tp, 0);
- goto out;
- }
- xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
-
/*
* Count the number of extended attribute blocks
*/
@@ -1757,8 +1723,8 @@ xfs_swap_extents(
goto out_trans_cancel;
}
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
- xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, lock_flags);
+ xfs_trans_ijoin(tp, tip, lock_flags);
/*
* Before we've swapped the forks, lets set the owners of the forks
@@ -1887,8 +1853,8 @@ out:
return error;
out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
- xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ xfs_iunlock(ip, lock_flags);
+ xfs_iunlock(tip, lock_flags);
goto out;
out_trans_cancel:
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 7a34a1ae6552..24b4ebea0d4d 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -130,7 +130,7 @@ xfs_buf_get_maps(
bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
KM_NOFS);
if (!bp->b_maps)
- return ENOMEM;
+ return -ENOMEM;
return 0;
}
@@ -344,7 +344,7 @@ retry:
if (unlikely(page == NULL)) {
if (flags & XBF_READ_AHEAD) {
bp->b_page_count = i;
- error = ENOMEM;
+ error = -ENOMEM;
goto out_free_pages;
}
@@ -465,7 +465,7 @@ _xfs_buf_find(
eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
if (blkno >= eofs) {
/*
- * XXX (dgc): we should really be returning EFSCORRUPTED here,
+ * XXX (dgc): we should really be returning -EFSCORRUPTED here,
* but none of the higher level infrastructure supports
* returning a specific error on buffer lookup failures.
*/
@@ -623,10 +623,11 @@ _xfs_buf_read(
bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
- xfs_buf_iorequest(bp);
- if (flags & XBF_ASYNC)
+ if (flags & XBF_ASYNC) {
+ xfs_buf_submit(bp);
return 0;
- return xfs_buf_iowait(bp);
+ }
+ return xfs_buf_submit_wait(bp);
}
xfs_buf_t *
@@ -687,34 +688,39 @@ xfs_buf_readahead_map(
* Read an uncached buffer from disk. Allocates and returns a locked
* buffer containing the disk contents or nothing.
*/
-struct xfs_buf *
+int
xfs_buf_read_uncached(
struct xfs_buftarg *target,
xfs_daddr_t daddr,
size_t numblks,
int flags,
+ struct xfs_buf **bpp,
const struct xfs_buf_ops *ops)
{
struct xfs_buf *bp;
+ *bpp = NULL;
+
bp = xfs_buf_get_uncached(target, numblks, flags);
if (!bp)
- return NULL;
+ return -ENOMEM;
/* set up the buffer for a read IO */
ASSERT(bp->b_map_count == 1);
- bp->b_bn = daddr;
+ bp->b_bn = XFS_BUF_DADDR_NULL; /* always null for uncached buffers */
bp->b_maps[0].bm_bn = daddr;
bp->b_flags |= XBF_READ;
bp->b_ops = ops;
- if (XFS_FORCED_SHUTDOWN(target->bt_mount)) {
+ xfs_buf_submit_wait(bp);
+ if (bp->b_error) {
+ int error = bp->b_error;
xfs_buf_relse(bp);
- return NULL;
+ return error;
}
- xfs_buf_iorequest(bp);
- xfs_buf_iowait(bp);
- return bp;
+
+ *bpp = bp;
+ return 0;
}
/*
@@ -998,53 +1004,56 @@ xfs_buf_wait_unpin(
* Buffer Utility Routines
*/
-STATIC void
-xfs_buf_iodone_work(
- struct work_struct *work)
+void
+xfs_buf_ioend(
+ struct xfs_buf *bp)
{
- struct xfs_buf *bp =
- container_of(work, xfs_buf_t, b_iodone_work);
- bool read = !!(bp->b_flags & XBF_READ);
+ bool read = bp->b_flags & XBF_READ;
+
+ trace_xfs_buf_iodone(bp, _RET_IP_);
bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
- /* only validate buffers that were read without errors */
- if (read && bp->b_ops && !bp->b_error && (bp->b_flags & XBF_DONE))
+ /*
+ * Pull in IO completion errors now. We are guaranteed to be running
+ * single threaded, so we don't need the lock to read b_io_error.
+ */
+ if (!bp->b_error && bp->b_io_error)
+ xfs_buf_ioerror(bp, bp->b_io_error);
+
+ /* Only validate buffers that were read without errors */
+ if (read && !bp->b_error && bp->b_ops) {
+ ASSERT(!bp->b_iodone);
bp->b_ops->verify_read(bp);
+ }
+
+ if (!bp->b_error)
+ bp->b_flags |= XBF_DONE;
if (bp->b_iodone)
(*(bp->b_iodone))(bp);
else if (bp->b_flags & XBF_ASYNC)
xfs_buf_relse(bp);
- else {
- ASSERT(read && bp->b_ops);
+ else
complete(&bp->b_iowait);
- }
}
-void
-xfs_buf_ioend(
- struct xfs_buf *bp,
- int schedule)
+static void
+xfs_buf_ioend_work(
+ struct work_struct *work)
{
- bool read = !!(bp->b_flags & XBF_READ);
-
- trace_xfs_buf_iodone(bp, _RET_IP_);
+ struct xfs_buf *bp =
+ container_of(work, xfs_buf_t, b_iodone_work);
- if (bp->b_error == 0)
- bp->b_flags |= XBF_DONE;
+ xfs_buf_ioend(bp);
+}
- if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) {
- if (schedule) {
- INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
- queue_work(xfslogd_workqueue, &bp->b_iodone_work);
- } else {
- xfs_buf_iodone_work(&bp->b_iodone_work);
- }
- } else {
- bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
- complete(&bp->b_iowait);
- }
+void
+xfs_buf_ioend_async(
+ struct xfs_buf *bp)
+{
+ INIT_WORK(&bp->b_iodone_work, xfs_buf_ioend_work);
+ queue_work(xfslogd_workqueue, &bp->b_iodone_work);
}
void
@@ -1052,8 +1061,8 @@ xfs_buf_ioerror(
xfs_buf_t *bp,
int error)
{
- ASSERT(error >= 0 && error <= 0xffff);
- bp->b_error = (unsigned short)error;
+ ASSERT(error <= 0 && error >= -1000);
+ bp->b_error = error;
trace_xfs_buf_ioerror(bp, error, _RET_IP_);
}
@@ -1064,97 +1073,7 @@ xfs_buf_ioerror_alert(
{
xfs_alert(bp->b_target->bt_mount,
"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d",
- (__uint64_t)XFS_BUF_ADDR(bp), func, bp->b_error, bp->b_length);
-}
-
-/*
- * Called when we want to stop a buffer from getting written or read.
- * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
- * so that the proper iodone callbacks get called.
- */
-STATIC int
-xfs_bioerror(
- xfs_buf_t *bp)
-{
-#ifdef XFSERRORDEBUG
- ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
-#endif
-
- /*
- * No need to wait until the buffer is unpinned, we aren't flushing it.
- */
- xfs_buf_ioerror(bp, EIO);
-
- /*
- * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
- */
- XFS_BUF_UNREAD(bp);
- XFS_BUF_UNDONE(bp);
- xfs_buf_stale(bp);
-
- xfs_buf_ioend(bp, 0);
-
- return EIO;
-}
-
-/*
- * Same as xfs_bioerror, except that we are releasing the buffer
- * here ourselves, and avoiding the xfs_buf_ioend call.
- * This is meant for userdata errors; metadata bufs come with
- * iodone functions attached, so that we can track down errors.
- */
-int
-xfs_bioerror_relse(
- struct xfs_buf *bp)
-{
- int64_t fl = bp->b_flags;
- /*
- * No need to wait until the buffer is unpinned.
- * We aren't flushing it.
- *
- * chunkhold expects B_DONE to be set, whether
- * we actually finish the I/O or not. We don't want to
- * change that interface.
- */
- XFS_BUF_UNREAD(bp);
- XFS_BUF_DONE(bp);
- xfs_buf_stale(bp);
- bp->b_iodone = NULL;
- if (!(fl & XBF_ASYNC)) {
- /*
- * Mark b_error and B_ERROR _both_.
- * Lot's of chunkcache code assumes that.
- * There's no reason to mark error for
- * ASYNC buffers.
- */
- xfs_buf_ioerror(bp, EIO);
- complete(&bp->b_iowait);
- } else {
- xfs_buf_relse(bp);
- }
-
- return EIO;
-}
-
-STATIC int
-xfs_bdstrat_cb(
- struct xfs_buf *bp)
-{
- if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
- trace_xfs_bdstrat_shut(bp, _RET_IP_);
- /*
- * Metadata write that didn't get logged but
- * written delayed anyway. These aren't associated
- * with a transaction, and can be ignored.
- */
- if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
- return xfs_bioerror_relse(bp);
- else
- return xfs_bioerror(bp);
- }
-
- xfs_buf_iorequest(bp);
- return 0;
+ (__uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length);
}
int
@@ -1166,11 +1085,10 @@ xfs_bwrite(
ASSERT(xfs_buf_islocked(bp));
bp->b_flags |= XBF_WRITE;
- bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL);
-
- xfs_bdstrat_cb(bp);
+ bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
+ XBF_WRITE_FAIL | XBF_DONE);
- error = xfs_buf_iowait(bp);
+ error = xfs_buf_submit_wait(bp);
if (error) {
xfs_force_shutdown(bp->b_target->bt_mount,
SHUTDOWN_META_IO_ERROR);
@@ -1179,15 +1097,6 @@ xfs_bwrite(
}
STATIC void
-_xfs_buf_ioend(
- xfs_buf_t *bp,
- int schedule)
-{
- if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
- xfs_buf_ioend(bp, schedule);
-}
-
-STATIC void
xfs_buf_bio_end_io(
struct bio *bio,
int error)
@@ -1198,13 +1107,18 @@ xfs_buf_bio_end_io(
* don't overwrite existing errors - otherwise we can lose errors on
* buffers that require multiple bios to complete.
*/
- if (!bp->b_error)
- xfs_buf_ioerror(bp, -error);
+ if (error) {
+ spin_lock(&bp->b_lock);
+ if (!bp->b_io_error)
+ bp->b_io_error = error;
+ spin_unlock(&bp->b_lock);
+ }
if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
- _xfs_buf_ioend(bp, 1);
+ if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
+ xfs_buf_ioend_async(bp);
bio_put(bio);
}
@@ -1283,10 +1197,10 @@ next_chunk:
} else {
/*
* This is guaranteed not to be the last io reference count
- * because the caller (xfs_buf_iorequest) holds a count itself.
+ * because the caller (xfs_buf_submit) holds a count itself.
*/
atomic_dec(&bp->b_io_remaining);
- xfs_buf_ioerror(bp, EIO);
+ xfs_buf_ioerror(bp, -EIO);
bio_put(bio);
}
@@ -1330,6 +1244,20 @@ _xfs_buf_ioapply(
SHUTDOWN_CORRUPT_INCORE);
return;
}
+ } else if (bp->b_bn != XFS_BUF_DADDR_NULL) {
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ /*
+ * non-crc filesystems don't attach verifiers during
+ * log recovery, so don't warn for such filesystems.
+ */
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ xfs_warn(mp,
+ "%s: no ops on block 0x%llx/0x%x",
+ __func__, bp->b_bn, bp->b_length);
+ xfs_hex_dump(bp->b_addr, 64);
+ dump_stack();
+ }
}
} else if (bp->b_flags & XBF_READ_AHEAD) {
rw = READA;
@@ -1359,53 +1287,131 @@ _xfs_buf_ioapply(
blk_finish_plug(&plug);
}
+/*
+ * Asynchronous IO submission path. This transfers the buffer lock ownership and
+ * the current reference to the IO. It is not safe to reference the buffer after
+ * a call to this function unless the caller holds an additional reference
+ * itself.
+ */
void
-xfs_buf_iorequest(
- xfs_buf_t *bp)
+xfs_buf_submit(
+ struct xfs_buf *bp)
{
- trace_xfs_buf_iorequest(bp, _RET_IP_);
+ trace_xfs_buf_submit(bp, _RET_IP_);
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
+ ASSERT(bp->b_flags & XBF_ASYNC);
+
+ /* on shutdown we stale and complete the buffer immediately */
+ if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+ xfs_buf_ioerror(bp, -EIO);
+ bp->b_flags &= ~XBF_DONE;
+ xfs_buf_stale(bp);
+ xfs_buf_ioend(bp);
+ return;
+ }
if (bp->b_flags & XBF_WRITE)
xfs_buf_wait_unpin(bp);
+
+ /* clear the internal error state to avoid spurious errors */
+ bp->b_io_error = 0;
+
+ /*
+ * The caller's reference is released during I/O completion.
+ * This occurs some time after the last b_io_remaining reference is
+ * released, so after we drop our Io reference we have to have some
+ * other reference to ensure the buffer doesn't go away from underneath
+ * us. Take a direct reference to ensure we have safe access to the
+ * buffer until we are finished with it.
+ */
xfs_buf_hold(bp);
/*
- * Set the count to 1 initially, this will stop an I/O
- * completion callout which happens before we have started
- * all the I/O from calling xfs_buf_ioend too early.
+ * Set the count to 1 initially, this will stop an I/O completion
+ * callout which happens before we have started all the I/O from calling
+ * xfs_buf_ioend too early.
*/
atomic_set(&bp->b_io_remaining, 1);
_xfs_buf_ioapply(bp);
+
/*
- * If _xfs_buf_ioapply failed, we'll get back here with
- * only the reference we took above. _xfs_buf_ioend will
- * drop it to zero, so we'd better not queue it for later,
- * or we'll free it before it's done.
+ * If _xfs_buf_ioapply failed, we can get back here with only the IO
+ * reference we took above. If we drop it to zero, run completion so
+ * that we don't return to the caller with completion still pending.
*/
- _xfs_buf_ioend(bp, bp->b_error ? 0 : 1);
+ if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
+ if (bp->b_error)
+ xfs_buf_ioend(bp);
+ else
+ xfs_buf_ioend_async(bp);
+ }
xfs_buf_rele(bp);
+ /* Note: it is not safe to reference bp now we've dropped our ref */
}
/*
- * Waits for I/O to complete on the buffer supplied. It returns immediately if
- * no I/O is pending or there is already a pending error on the buffer, in which
- * case nothing will ever complete. It returns the I/O error code, if any, or
- * 0 if there was no error.
+ * Synchronous buffer IO submission path, read or write.
*/
int
-xfs_buf_iowait(
- xfs_buf_t *bp)
+xfs_buf_submit_wait(
+ struct xfs_buf *bp)
{
- trace_xfs_buf_iowait(bp, _RET_IP_);
+ int error;
- if (!bp->b_error)
- wait_for_completion(&bp->b_iowait);
+ trace_xfs_buf_submit_wait(bp, _RET_IP_);
+
+ ASSERT(!(bp->b_flags & (_XBF_DELWRI_Q | XBF_ASYNC)));
+
+ if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+ xfs_buf_ioerror(bp, -EIO);
+ xfs_buf_stale(bp);
+ bp->b_flags &= ~XBF_DONE;
+ return -EIO;
+ }
+
+ if (bp->b_flags & XBF_WRITE)
+ xfs_buf_wait_unpin(bp);
+
+ /* clear the internal error state to avoid spurious errors */
+ bp->b_io_error = 0;
+
+ /*
+ * For synchronous IO, the IO does not inherit the submitters reference
+ * count, nor the buffer lock. Hence we cannot release the reference we
+ * are about to take until we've waited for all IO completion to occur,
+ * including any xfs_buf_ioend_async() work that may be pending.
+ */
+ xfs_buf_hold(bp);
+
+ /*
+ * Set the count to 1 initially, this will stop an I/O completion
+ * callout which happens before we have started all the I/O from calling
+ * xfs_buf_ioend too early.
+ */
+ atomic_set(&bp->b_io_remaining, 1);
+ _xfs_buf_ioapply(bp);
+
+ /*
+ * make sure we run completion synchronously if it raced with us and is
+ * already complete.
+ */
+ if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
+ xfs_buf_ioend(bp);
+ /* wait for completion before gathering the error from the buffer */
+ trace_xfs_buf_iowait(bp, _RET_IP_);
+ wait_for_completion(&bp->b_iowait);
trace_xfs_buf_iowait_done(bp, _RET_IP_);
- return bp->b_error;
+ error = bp->b_error;
+
+ /*
+ * all done now, we can release the hold that keeps the buffer
+ * referenced for the entire IO.
+ */
+ xfs_buf_rele(bp);
+ return error;
}
xfs_caddr_t
@@ -1628,7 +1634,7 @@ xfs_setsize_buftarg(
xfs_warn(btp->bt_mount,
"Cannot set_blocksize to %u on device %s",
sectorsize, name);
- return EINVAL;
+ return -EINVAL;
}
/* Set up device logical sector size mask */
@@ -1664,8 +1670,6 @@ xfs_alloc_buftarg(
btp->bt_dev = bdev->bd_dev;
btp->bt_bdev = bdev;
btp->bt_bdi = blk_get_backing_dev_info(bdev);
- if (!btp->bt_bdi)
- goto error;
if (xfs_setsize_buftarg_early(btp, bdev))
goto error;
@@ -1799,13 +1803,19 @@ __xfs_buf_delwri_submit(
blk_start_plug(&plug);
list_for_each_entry_safe(bp, n, io_list, b_list) {
bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
- bp->b_flags |= XBF_WRITE;
+ bp->b_flags |= XBF_WRITE | XBF_ASYNC;
- if (!wait) {
- bp->b_flags |= XBF_ASYNC;
+ /*
+ * we do all Io submission async. This means if we need to wait
+ * for IO completion we need to take an extra reference so the
+ * buffer is still valid on the other side.
+ */
+ if (wait)
+ xfs_buf_hold(bp);
+ else
list_del_init(&bp->b_list);
- }
- xfs_bdstrat_cb(bp);
+
+ xfs_buf_submit(bp);
}
blk_finish_plug(&plug);
@@ -1852,7 +1862,10 @@ xfs_buf_delwri_submit(
bp = list_first_entry(&io_list, struct xfs_buf, b_list);
list_del_init(&bp->b_list);
- error2 = xfs_buf_iowait(bp);
+
+ /* locking the buffer will wait for async IO completion. */
+ xfs_buf_lock(bp);
+ error2 = bp->b_error;
xfs_buf_relse(bp);
if (!error)
error = error2;
@@ -1870,7 +1883,7 @@ xfs_buf_init(void)
goto out;
xfslogd_workqueue = alloc_workqueue("xfslogd",
- WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
+ WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 1);
if (!xfslogd_workqueue)
goto out_free_buf_zone;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 3a7a5523d3dc..82002c00af90 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -158,6 +158,7 @@ typedef struct xfs_buf {
struct list_head b_lru; /* lru list */
spinlock_t b_lock; /* internal state lock */
unsigned int b_state; /* internal state flags */
+ int b_io_error; /* internal IO error state */
wait_queue_head_t b_waiters; /* unpin waiters */
struct list_head b_list;
struct xfs_perag *b_pag; /* contains rbtree root */
@@ -178,7 +179,7 @@ typedef struct xfs_buf {
atomic_t b_io_remaining; /* #outstanding I/O requests */
unsigned int b_page_count; /* size of page array */
unsigned int b_offset; /* page offset in first page */
- unsigned short b_error; /* error code on I/O */
+ int b_error; /* error code on I/O */
const struct xfs_buf_ops *b_ops;
#ifdef XFS_BUF_LOCK_TRACKING
@@ -268,9 +269,9 @@ int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length);
struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
int flags);
-struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target,
- xfs_daddr_t daddr, size_t numblks, int flags,
- const struct xfs_buf_ops *ops);
+int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr,
+ size_t numblks, int flags, struct xfs_buf **bpp,
+ const struct xfs_buf_ops *ops);
void xfs_buf_hold(struct xfs_buf *bp);
/* Releasing Buffers */
@@ -286,18 +287,16 @@ extern void xfs_buf_unlock(xfs_buf_t *);
/* Buffer Read and Write Routines */
extern int xfs_bwrite(struct xfs_buf *bp);
-extern void xfs_buf_ioend(xfs_buf_t *, int);
+extern void xfs_buf_ioend(struct xfs_buf *bp);
extern void xfs_buf_ioerror(xfs_buf_t *, int);
extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func);
-extern void xfs_buf_iorequest(xfs_buf_t *);
-extern int xfs_buf_iowait(xfs_buf_t *);
+extern void xfs_buf_submit(struct xfs_buf *bp);
+extern int xfs_buf_submit_wait(struct xfs_buf *bp);
extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
xfs_buf_rw_t);
#define xfs_buf_zero(bp, off, len) \
xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
-extern int xfs_bioerror_relse(struct xfs_buf *);
-
/* Buffer Utility Routines */
extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 4654338b03fc..f15969543326 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -488,10 +488,10 @@ xfs_buf_item_unpin(
xfs_buf_lock(bp);
xfs_buf_hold(bp);
bp->b_flags |= XBF_ASYNC;
- xfs_buf_ioerror(bp, EIO);
+ xfs_buf_ioerror(bp, -EIO);
XFS_BUF_UNDONE(bp);
xfs_buf_stale(bp);
- xfs_buf_ioend(bp, 0);
+ xfs_buf_ioend(bp);
}
}
@@ -501,7 +501,7 @@ xfs_buf_item_unpin(
* buffer being bad..
*/
-DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10);
+static DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10);
STATIC uint
xfs_buf_item_push(
@@ -725,7 +725,7 @@ xfs_buf_item_get_format(
bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
KM_SLEEP);
if (!bip->bli_formats)
- return ENOMEM;
+ return -ENOMEM;
return 0;
}
@@ -1081,7 +1081,7 @@ xfs_buf_iodone_callbacks(
* a way to shut the filesystem down if the writes keep failing.
*
* In practice we'll shut the filesystem down soon as non-transient
- * erorrs tend to affect the whole device and a failing log write
+ * errors tend to affect the whole device and a failing log write
* will make us give up. But we really ought to do better here.
*/
if (XFS_BUF_ISASYNC(bp)) {
@@ -1094,7 +1094,7 @@ xfs_buf_iodone_callbacks(
if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) {
bp->b_flags |= XBF_WRITE | XBF_ASYNC |
XBF_DONE | XBF_WRITE_FAIL;
- xfs_buf_iorequest(bp);
+ xfs_buf_submit(bp);
} else {
xfs_buf_relse(bp);
}
@@ -1115,7 +1115,7 @@ do_callbacks:
xfs_buf_do_callbacks(bp);
bp->b_fspriv = NULL;
bp->b_iodone = NULL;
- xfs_buf_ioend(bp, 0);
+ xfs_buf_ioend(bp);
}
/*
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 48e99afb9cb0..f1b69edcdf31 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -95,7 +95,7 @@ xfs_dir2_sf_getdents(
*/
if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
- return XFS_ERROR(EIO);
+ return -EIO;
}
ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
@@ -677,7 +677,7 @@ xfs_readdir(
trace_xfs_readdir(dp);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
- return XFS_ERROR(EIO);
+ return -EIO;
ASSERT(S_ISDIR(dp->i_d.di_mode));
XFS_STATS_INC(xs_dir_getdents);
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 4f11ef011139..13d08a1b390e 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -124,7 +124,7 @@ xfs_trim_extents(
}
trace_xfs_discard_extent(mp, agno, fbno, flen);
- error = -blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0);
+ error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0);
if (error)
goto out_del_cursor;
*blocks_trimmed += flen;
@@ -166,11 +166,11 @@ xfs_ioc_trim(
int error, last_error = 0;
if (!capable(CAP_SYS_ADMIN))
- return -XFS_ERROR(EPERM);
+ return -EPERM;
if (!blk_queue_discard(q))
- return -XFS_ERROR(EOPNOTSUPP);
+ return -EOPNOTSUPP;
if (copy_from_user(&range, urange, sizeof(range)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
/*
* Truncating down the len isn't actually quite correct, but using
@@ -182,7 +182,7 @@ xfs_ioc_trim(
if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) ||
range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) ||
range.len < mp->m_sb.sb_blocksize)
- return -XFS_ERROR(EINVAL);
+ return -EINVAL;
start = BTOBB(range.start);
end = start + BTOBBT(range.len) - 1;
@@ -195,7 +195,7 @@ xfs_ioc_trim(
end_agno = xfs_daddr_to_agno(mp, end);
for (agno = start_agno; agno <= end_agno; agno++) {
- error = -xfs_trim_extents(mp, agno, start, end, minlen,
+ error = xfs_trim_extents(mp, agno, start, end, minlen,
&blocks_trimmed);
if (error)
last_error = error;
@@ -206,7 +206,7 @@ xfs_ioc_trim(
range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
if (copy_to_user(urange, &range, sizeof(range)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
return 0;
}
@@ -222,11 +222,11 @@ xfs_discard_extents(
trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
busyp->length);
- error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
+ error = blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
XFS_FSB_TO_BB(mp, busyp->length),
GFP_NOFS, 0);
- if (error && error != EOPNOTSUPP) {
+ if (error && error != -EOPNOTSUPP) {
xfs_info(mp,
"discard failed for extent [0x%llu,%u], error %d",
(unsigned long long)busyp->bno,
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 3ee0cd43edc0..63c2de49f61d 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -327,7 +327,7 @@ xfs_qm_dqalloc(
*/
if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
xfs_iunlock(quotip, XFS_ILOCK_EXCL);
- return (ESRCH);
+ return -ESRCH;
}
xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
@@ -354,7 +354,7 @@ xfs_qm_dqalloc(
mp->m_quotainfo->qi_dqchunklen,
0);
if (!bp) {
- error = ENOMEM;
+ error = -ENOMEM;
goto error1;
}
bp->b_ops = &xfs_dquot_buf_ops;
@@ -400,7 +400,7 @@ xfs_qm_dqalloc(
error0:
xfs_iunlock(quotip, XFS_ILOCK_EXCL);
- return (error);
+ return error;
}
STATIC int
@@ -426,7 +426,7 @@ xfs_qm_dqrepair(
if (error) {
ASSERT(*bpp == NULL);
- return XFS_ERROR(error);
+ return error;
}
(*bpp)->b_ops = &xfs_dquot_buf_ops;
@@ -442,7 +442,7 @@ xfs_qm_dqrepair(
if (error) {
/* repair failed, we're screwed */
xfs_trans_brelse(tp, *bpp);
- return XFS_ERROR(EIO);
+ return -EIO;
}
}
@@ -480,7 +480,7 @@ xfs_qm_dqtobp(
* didn't have the quota inode lock.
*/
xfs_iunlock(quotip, lock_mode);
- return ESRCH;
+ return -ESRCH;
}
/*
@@ -508,7 +508,7 @@ xfs_qm_dqtobp(
* We don't allocate unless we're asked to
*/
if (!(flags & XFS_QMOPT_DQALLOC))
- return ENOENT;
+ return -ENOENT;
ASSERT(tp);
error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
@@ -530,7 +530,7 @@ xfs_qm_dqtobp(
mp->m_quotainfo->qi_dqchunklen,
0, &bp, &xfs_dquot_buf_ops);
- if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) {
+ if (error == -EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) {
xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff *
mp->m_quotainfo->qi_dqperchunk;
ASSERT(bp == NULL);
@@ -539,7 +539,7 @@ xfs_qm_dqtobp(
if (error) {
ASSERT(bp == NULL);
- return XFS_ERROR(error);
+ return error;
}
}
@@ -547,7 +547,7 @@ xfs_qm_dqtobp(
*O_bpp = bp;
*O_ddpp = bp->b_addr + dqp->q_bufoffset;
- return (0);
+ return 0;
}
@@ -715,7 +715,7 @@ xfs_qm_dqget(
if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
(! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) ||
(! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
- return (ESRCH);
+ return -ESRCH;
}
#ifdef DEBUG
@@ -723,7 +723,7 @@ xfs_qm_dqget(
if ((xfs_dqerror_target == mp->m_ddev_targp) &&
(xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
xfs_debug(mp, "Returning error in dqget");
- return (EIO);
+ return -EIO;
}
}
@@ -796,14 +796,14 @@ restart:
} else {
/* inode stays locked on return */
xfs_qm_dqdestroy(dqp);
- return XFS_ERROR(ESRCH);
+ return -ESRCH;
}
}
mutex_lock(&qi->qi_tree_lock);
- error = -radix_tree_insert(tree, id, dqp);
+ error = radix_tree_insert(tree, id, dqp);
if (unlikely(error)) {
- WARN_ON(error != EEXIST);
+ WARN_ON(error != -EEXIST);
/*
* Duplicate found. Just throw away the new dquot and start
@@ -829,7 +829,7 @@ restart:
ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
trace_xfs_dqget_miss(dqp);
*O_dqpp = dqp;
- return (0);
+ return 0;
}
/*
@@ -966,7 +966,7 @@ xfs_qm_dqflush(
SHUTDOWN_CORRUPT_INCORE);
else
spin_unlock(&mp->m_ail->xa_lock);
- error = XFS_ERROR(EIO);
+ error = -EIO;
goto out_unlock;
}
@@ -974,7 +974,8 @@ xfs_qm_dqflush(
* Get the buffer containing the on-disk dquot
*/
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL);
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp,
+ &xfs_dquot_buf_ops);
if (error)
goto out_unlock;
@@ -992,7 +993,7 @@ xfs_qm_dqflush(
xfs_buf_relse(bp);
xfs_dqfunlock(dqp);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- return XFS_ERROR(EIO);
+ return -EIO;
}
/* This is the only portion of data that needs to persist */
@@ -1045,7 +1046,7 @@ xfs_qm_dqflush(
out_unlock:
xfs_dqfunlock(dqp);
- return XFS_ERROR(EIO);
+ return -EIO;
}
/*
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 68a68f704837..c24c67e22a2a 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -139,6 +139,21 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
}
}
+/*
+ * Check whether a dquot is under low free space conditions. We assume the quota
+ * is enabled and enforced.
+ */
+static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp)
+{
+ int64_t freesp;
+
+ freesp = be64_to_cpu(dqp->q_core.d_blk_hardlimit) - dqp->q_res_bcount;
+ if (freesp < dqp->q_low_space[XFS_QLOWSP_1_PCNT])
+ return true;
+
+ return false;
+}
+
#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock)))
#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY)
#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index edac5b057d28..b92fd7bc49e3 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -27,29 +27,6 @@
#ifdef DEBUG
-int xfs_etrap[XFS_ERROR_NTRAP] = {
- 0,
-};
-
-int
-xfs_error_trap(int e)
-{
- int i;
-
- if (!e)
- return 0;
- for (i = 0; i < XFS_ERROR_NTRAP; i++) {
- if (xfs_etrap[i] == 0)
- break;
- if (e != xfs_etrap[i])
- continue;
- xfs_notice(NULL, "%s: error %d", __func__, e);
- BUG();
- break;
- }
- return e;
-}
-
int xfs_etest[XFS_NUM_INJECT_ERROR];
int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
@@ -190,7 +167,7 @@ xfs_verifier_error(
struct xfs_mount *mp = bp->b_target->bt_mount;
xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx",
- bp->b_error == EFSBADCRC ? "CRC error" : "corruption",
+ bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
__return_address, bp->b_bn);
xfs_alert(mp, "Unmount and run xfs_repair");
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index c1c57d4a4b5d..279a76e52791 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -18,15 +18,6 @@
#ifndef __XFS_ERROR_H__
#define __XFS_ERROR_H__
-#ifdef DEBUG
-#define XFS_ERROR_NTRAP 10
-extern int xfs_etrap[XFS_ERROR_NTRAP];
-extern int xfs_error_trap(int);
-#define XFS_ERROR(e) xfs_error_trap(e)
-#else
-#define XFS_ERROR(e) (e)
-#endif
-
struct xfs_mount;
extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
@@ -56,7 +47,7 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
if (unlikely(!fs_is_ok)) { \
XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \
XFS_ERRLEVEL_LOW, NULL); \
- error = XFS_ERROR(EFSCORRUPTED); \
+ error = -EFSCORRUPTED; \
goto l; \
} \
}
@@ -68,7 +59,7 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
if (unlikely(!fs_is_ok)) { \
XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \
XFS_ERRLEVEL_LOW, NULL); \
- return XFS_ERROR(EFSCORRUPTED); \
+ return -EFSCORRUPTED; \
} \
}
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 753e467aa1a5..5a6bd5d8779a 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -147,9 +147,9 @@ xfs_nfs_get_inode(
* We don't use ESTALE directly down the chain to not
* confuse applications using bulkstat that expect EINVAL.
*/
- if (error == EINVAL || error == ENOENT)
- error = ESTALE;
- return ERR_PTR(-error);
+ if (error == -EINVAL || error == -ENOENT)
+ error = -ESTALE;
+ return ERR_PTR(error);
}
if (ip->i_d.di_gen != generation) {
@@ -217,7 +217,7 @@ xfs_fs_get_parent(
error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip, NULL);
if (unlikely(error))
- return ERR_PTR(-error);
+ return ERR_PTR(error);
return d_obtain_alias(VFS_I(cip));
}
@@ -237,7 +237,7 @@ xfs_fs_nfs_commit_metadata(
if (!lsn)
return 0;
- return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+ return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
}
const struct export_operations xfs_export_operations = {
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index fb7a4c1ce1c5..c4327419dc5c 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -298,7 +298,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
}
return 0;
}
- return EFSCORRUPTED;
+ return -EFSCORRUPTED;
}
/*
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1f66779d7a46..eb596b419942 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -38,6 +38,7 @@
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_dinode.h"
+#include "xfs_icache.h"
#include <linux/aio.h>
#include <linux/dcache.h>
@@ -155,7 +156,7 @@ xfs_dir_fsync(
if (!lsn)
return 0;
- return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+ return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
}
STATIC int
@@ -179,7 +180,7 @@ xfs_file_fsync(
return error;
if (XFS_FORCED_SHUTDOWN(mp))
- return -XFS_ERROR(EIO);
+ return -EIO;
xfs_iflags_clear(ip, XFS_ITRUNCATED);
@@ -225,7 +226,7 @@ xfs_file_fsync(
!log_flushed)
xfs_blkdev_issue_flush(mp->m_ddev_targp);
- return -error;
+ return error;
}
STATIC ssize_t
@@ -246,11 +247,11 @@ xfs_file_read_iter(
XFS_STATS_INC(xs_read_calls);
if (unlikely(file->f_flags & O_DIRECT))
- ioflags |= IO_ISDIRECT;
+ ioflags |= XFS_IO_ISDIRECT;
if (file->f_mode & FMODE_NOCMTIME)
- ioflags |= IO_INVIS;
+ ioflags |= XFS_IO_INVIS;
- if (unlikely(ioflags & IO_ISDIRECT)) {
+ if (unlikely(ioflags & XFS_IO_ISDIRECT)) {
xfs_buftarg_t *target =
XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
@@ -258,7 +259,7 @@ xfs_file_read_iter(
if ((pos | size) & target->bt_logical_sectormask) {
if (pos == i_size_read(inode))
return 0;
- return -XFS_ERROR(EINVAL);
+ return -EINVAL;
}
}
@@ -283,19 +284,29 @@ xfs_file_read_iter(
* proceeed concurrently without serialisation.
*/
xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
- if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) {
+ if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
if (inode->i_mapping->nrpages) {
ret = filemap_write_and_wait_range(
VFS_I(ip)->i_mapping,
- pos, -1);
+ pos, pos + size - 1);
if (ret) {
xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
}
- truncate_pagecache_range(VFS_I(ip), pos, -1);
+
+ /*
+ * Invalidate whole pages. This can return an error if
+ * we fail to invalidate a page, but this should never
+ * happen on XFS. Warn if it does fail.
+ */
+ ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
+ pos >> PAGE_CACHE_SHIFT,
+ (pos + size - 1) >> PAGE_CACHE_SHIFT);
+ WARN_ON_ONCE(ret);
+ ret = 0;
}
xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
}
@@ -325,7 +336,7 @@ xfs_file_splice_read(
XFS_STATS_INC(xs_read_calls);
if (infilp->f_mode & FMODE_NOCMTIME)
- ioflags |= IO_INVIS;
+ ioflags |= XFS_IO_INVIS;
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
@@ -524,7 +535,7 @@ restart:
xfs_rw_ilock(ip, *iolock);
goto restart;
}
- error = -xfs_zero_eof(ip, *pos, i_size_read(inode));
+ error = xfs_zero_eof(ip, *pos, i_size_read(inode));
if (error)
return error;
}
@@ -594,7 +605,7 @@ xfs_file_dio_aio_write(
/* DIO must be aligned to device logical sector size */
if ((pos | count) & target->bt_logical_sectormask)
- return -XFS_ERROR(EINVAL);
+ return -EINVAL;
/* "unaligned" here means not aligned to a filesystem block */
if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
@@ -631,10 +642,19 @@ xfs_file_dio_aio_write(
if (mapping->nrpages) {
ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
- pos, -1);
+ pos, pos + count - 1);
if (ret)
goto out;
- truncate_pagecache_range(VFS_I(ip), pos, -1);
+ /*
+ * Invalidate whole pages. This can return an error if
+ * we fail to invalidate a page, but this should never
+ * happen on XFS. Warn if it does fail.
+ */
+ ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
+ pos >> PAGE_CACHE_SHIFT,
+ (pos + count - 1) >> PAGE_CACHE_SHIFT);
+ WARN_ON_ONCE(ret);
+ ret = 0;
}
/*
@@ -689,14 +709,28 @@ write_retry:
ret = generic_perform_write(file, from, pos);
if (likely(ret >= 0))
iocb->ki_pos = pos + ret;
+
/*
- * If we just got an ENOSPC, try to write back all dirty inodes to
- * convert delalloc space to free up some of the excess reserved
- * metadata space.
+ * If we hit a space limit, try to free up some lingering preallocated
+ * space before returning an error. In the case of ENOSPC, first try to
+ * write back all dirty inodes to free up some of the excess reserved
+ * metadata space. This reduces the chances that the eofblocks scan
+ * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
+ * also behaves as a filter to prevent too many eofblocks scans from
+ * running at the same time.
*/
- if (ret == -ENOSPC && !enospc) {
+ if (ret == -EDQUOT && !enospc) {
+ enospc = xfs_inode_free_quota_eofblocks(ip);
+ if (enospc)
+ goto write_retry;
+ } else if (ret == -ENOSPC && !enospc) {
+ struct xfs_eofblocks eofb = {0};
+
enospc = 1;
xfs_flush_inodes(ip->i_mount);
+ eofb.eof_scan_owner = ip->i_ino; /* for locking */
+ eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
+ xfs_icache_free_eofblocks(ip->i_mount, &eofb);
goto write_retry;
}
@@ -772,7 +806,7 @@ xfs_file_fallocate(
unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
if (offset & blksize_mask || len & blksize_mask) {
- error = EINVAL;
+ error = -EINVAL;
goto out_unlock;
}
@@ -781,7 +815,7 @@ xfs_file_fallocate(
* in which case it is effectively a truncate operation
*/
if (offset + len >= i_size_read(inode)) {
- error = EINVAL;
+ error = -EINVAL;
goto out_unlock;
}
@@ -794,7 +828,7 @@ xfs_file_fallocate(
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
offset + len > i_size_read(inode)) {
new_size = offset + len;
- error = -inode_newsize_ok(inode, new_size);
+ error = inode_newsize_ok(inode, new_size);
if (error)
goto out_unlock;
}
@@ -844,7 +878,7 @@ xfs_file_fallocate(
out_unlock:
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- return -error;
+ return error;
}
@@ -889,7 +923,7 @@ xfs_file_release(
struct inode *inode,
struct file *filp)
{
- return -xfs_release(XFS_I(inode));
+ return xfs_release(XFS_I(inode));
}
STATIC int
@@ -918,7 +952,7 @@ xfs_file_readdir(
error = xfs_readdir(ip, ctx, bufsize);
if (error)
- return -error;
+ return error;
return 0;
}
@@ -949,7 +983,7 @@ xfs_vm_page_mkwrite(
/*
* This type is designed to indicate the type of offset we would like
- * to search from page cache for either xfs_seek_data() or xfs_seek_hole().
+ * to search from page cache for xfs_seek_hole_data().
*/
enum {
HOLE_OFF = 0,
@@ -1006,7 +1040,7 @@ xfs_lookup_buffer_offset(
/*
* This routine is called to find out and return a data or hole offset
* from the page cache for unwritten extents according to the desired
- * type for xfs_seek_data() or xfs_seek_hole().
+ * type for xfs_seek_hole_data().
*
* The argument offset is used to tell where we start to search from the
* page cache. Map is used to figure out the end points of the range to
@@ -1166,9 +1200,10 @@ out:
}
STATIC loff_t
-xfs_seek_data(
+xfs_seek_hole_data(
struct file *file,
- loff_t start)
+ loff_t start,
+ int whence)
{
struct inode *inode = file->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
@@ -1180,11 +1215,14 @@ xfs_seek_data(
uint lock;
int error;
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
lock = xfs_ilock_data_map_shared(ip);
isize = i_size_read(inode);
if (start >= isize) {
- error = ENXIO;
+ error = -ENXIO;
goto out_unlock;
}
@@ -1194,6 +1232,7 @@ xfs_seek_data(
*/
fsbno = XFS_B_TO_FSBT(mp, start);
end = XFS_B_TO_FSB(mp, isize);
+
for (;;) {
struct xfs_bmbt_irec map[2];
int nmap = 2;
@@ -1206,7 +1245,7 @@ xfs_seek_data(
/* No extents at given offset, must be beyond EOF */
if (nmap == 0) {
- error = ENXIO;
+ error = -ENXIO;
goto out_unlock;
}
@@ -1214,30 +1253,49 @@ xfs_seek_data(
offset = max_t(loff_t, start,
XFS_FSB_TO_B(mp, map[i].br_startoff));
- /* Landed in a data extent */
- if (map[i].br_startblock == DELAYSTARTBLOCK ||
- (map[i].br_state == XFS_EXT_NORM &&
- !isnullstartblock(map[i].br_startblock)))
+ /* Landed in the hole we wanted? */
+ if (whence == SEEK_HOLE &&
+ map[i].br_startblock == HOLESTARTBLOCK)
+ goto out;
+
+ /* Landed in the data extent we wanted? */
+ if (whence == SEEK_DATA &&
+ (map[i].br_startblock == DELAYSTARTBLOCK ||
+ (map[i].br_state == XFS_EXT_NORM &&
+ !isnullstartblock(map[i].br_startblock))))
goto out;
/*
- * Landed in an unwritten extent, try to search data
- * from page cache.
+ * Landed in an unwritten extent, try to search
+ * for hole or data from page cache.
*/
if (map[i].br_state == XFS_EXT_UNWRITTEN) {
if (xfs_find_get_desired_pgoff(inode, &map[i],
- DATA_OFF, &offset))
+ whence == SEEK_HOLE ? HOLE_OFF : DATA_OFF,
+ &offset))
goto out;
}
}
/*
- * map[0] is hole or its an unwritten extent but
- * without data in page cache. Probably means that
- * we are reading after EOF if nothing in map[1].
+ * We only received one extent out of the two requested. This
+ * means we've hit EOF and didn't find what we are looking for.
*/
if (nmap == 1) {
- error = ENXIO;
+ /*
+ * If we were looking for a hole, set offset to
+ * the end of the file (i.e., there is an implicit
+ * hole at the end of any file).
+ */
+ if (whence == SEEK_HOLE) {
+ offset = isize;
+ break;
+ }
+ /*
+ * If we were looking for data, it's nowhere to be found
+ */
+ ASSERT(whence == SEEK_DATA);
+ error = -ENXIO;
goto out_unlock;
}
@@ -1245,132 +1303,37 @@ xfs_seek_data(
/*
* Nothing was found, proceed to the next round of search
- * if reading offset not beyond or hit EOF.
+ * if the next reading offset is not at or beyond EOF.
*/
fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
start = XFS_FSB_TO_B(mp, fsbno);
if (start >= isize) {
- error = ENXIO;
- goto out_unlock;
- }
- }
-
-out:
- offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
-
-out_unlock:
- xfs_iunlock(ip, lock);
-
- if (error)
- return -error;
- return offset;
-}
-
-STATIC loff_t
-xfs_seek_hole(
- struct file *file,
- loff_t start)
-{
- struct inode *inode = file->f_mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- loff_t uninitialized_var(offset);
- xfs_fsize_t isize;
- xfs_fileoff_t fsbno;
- xfs_filblks_t end;
- uint lock;
- int error;
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return -XFS_ERROR(EIO);
-
- lock = xfs_ilock_data_map_shared(ip);
-
- isize = i_size_read(inode);
- if (start >= isize) {
- error = ENXIO;
- goto out_unlock;
- }
-
- fsbno = XFS_B_TO_FSBT(mp, start);
- end = XFS_B_TO_FSB(mp, isize);
-
- for (;;) {
- struct xfs_bmbt_irec map[2];
- int nmap = 2;
- unsigned int i;
-
- error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap,
- XFS_BMAPI_ENTIRE);
- if (error)
- goto out_unlock;
-
- /* No extents at given offset, must be beyond EOF */
- if (nmap == 0) {
- error = ENXIO;
- goto out_unlock;
- }
-
- for (i = 0; i < nmap; i++) {
- offset = max_t(loff_t, start,
- XFS_FSB_TO_B(mp, map[i].br_startoff));
-
- /* Landed in a hole */
- if (map[i].br_startblock == HOLESTARTBLOCK)
- goto out;
-
- /*
- * Landed in an unwritten extent, try to search hole
- * from page cache.
- */
- if (map[i].br_state == XFS_EXT_UNWRITTEN) {
- if (xfs_find_get_desired_pgoff(inode, &map[i],
- HOLE_OFF, &offset))
- goto out;
+ if (whence == SEEK_HOLE) {
+ offset = isize;
+ break;
}
- }
-
- /*
- * map[0] contains data or its unwritten but contains
- * data in page cache, probably means that we are
- * reading after EOF. We should fix offset to point
- * to the end of the file(i.e., there is an implicit
- * hole at the end of any file).
- */
- if (nmap == 1) {
- offset = isize;
- break;
- }
-
- ASSERT(i > 1);
-
- /*
- * Both mappings contains data, proceed to the next round of
- * search if the current reading offset not beyond or hit EOF.
- */
- fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
- start = XFS_FSB_TO_B(mp, fsbno);
- if (start >= isize) {
- offset = isize;
- break;
+ ASSERT(whence == SEEK_DATA);
+ error = -ENXIO;
+ goto out_unlock;
}
}
out:
/*
- * At this point, we must have found a hole. However, the returned
+ * If at this point we have found the hole we wanted, the returned
* offset may be bigger than the file size as it may be aligned to
- * page boundary for unwritten extents, we need to deal with this
+ * page boundary for unwritten extents. We need to deal with this
* situation in particular.
*/
- offset = min_t(loff_t, offset, isize);
+ if (whence == SEEK_HOLE)
+ offset = min_t(loff_t, offset, isize);
offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out_unlock:
xfs_iunlock(ip, lock);
if (error)
- return -error;
+ return error;
return offset;
}
@@ -1378,17 +1341,16 @@ STATIC loff_t
xfs_file_llseek(
struct file *file,
loff_t offset,
- int origin)
+ int whence)
{
- switch (origin) {
+ switch (whence) {
case SEEK_END:
case SEEK_CUR:
case SEEK_SET:
- return generic_file_llseek(file, offset, origin);
- case SEEK_DATA:
- return xfs_seek_data(file, offset);
+ return generic_file_llseek(file, offset, whence);
case SEEK_HOLE:
- return xfs_seek_hole(file, offset);
+ case SEEK_DATA:
+ return xfs_seek_hole_data(file, offset, whence);
default:
return -EINVAL;
}
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 8ec81bed7992..e92730c1d3ca 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -258,7 +258,7 @@ next_ag:
if (*agp == NULLAGNUMBER)
return 0;
- err = ENOMEM;
+ err = -ENOMEM;
item = kmem_alloc(sizeof(*item), KM_MAYFAIL);
if (!item)
goto out_put_ag;
@@ -268,7 +268,7 @@ next_ag:
err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru);
if (err) {
- if (err == EEXIST)
+ if (err == -EEXIST)
err = 0;
goto out_free_item;
}
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index d34703dbcb42..18dc721ca19f 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -255,8 +255,8 @@ typedef struct xfs_fsop_resblks {
((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES)
/* Used for sanity checks on superblock */
-#define XFS_MAX_DBLOCKS(s) ((xfs_drfsbno_t)(s)->sb_agcount * (s)->sb_agblocks)
-#define XFS_MIN_DBLOCKS(s) ((xfs_drfsbno_t)((s)->sb_agcount - 1) * \
+#define XFS_MAX_DBLOCKS(s) ((xfs_rfsblock_t)(s)->sb_agcount * (s)->sb_agblocks)
+#define XFS_MIN_DBLOCKS(s) ((xfs_rfsblock_t)((s)->sb_agcount - 1) * \
(s)->sb_agblocks + XFS_MIN_AG_BLOCKS)
/*
@@ -375,6 +375,9 @@ struct xfs_fs_eofblocks {
#define XFS_EOF_FLAGS_GID (1 << 2) /* filter by gid */
#define XFS_EOF_FLAGS_PRID (1 << 3) /* filter by project id */
#define XFS_EOF_FLAGS_MINFILESIZE (1 << 4) /* filter by min file size */
+#define XFS_EOF_FLAGS_UNION (1 << 5) /* union filter algorithm;
+ * kernel only, not included in
+ * valid mask */
#define XFS_EOF_FLAGS_VALID \
(XFS_EOF_FLAGS_SYNC | \
XFS_EOF_FLAGS_UID | \
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index d2295561570a..c05ac8b70fa9 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -168,20 +168,15 @@ xfs_growfs_data_private(
nb = in->newblocks;
pct = in->imaxpct;
if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100)
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
return error;
dpct = pct - mp->m_sb.sb_imax_pct;
- bp = xfs_buf_read_uncached(mp->m_ddev_targp,
+ error = xfs_buf_read_uncached(mp->m_ddev_targp,
XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
- XFS_FSS_TO_BB(mp, 1), 0, NULL);
- if (!bp)
- return EIO;
- if (bp->b_error) {
- error = bp->b_error;
- xfs_buf_relse(bp);
+ XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
+ if (error)
return error;
- }
xfs_buf_relse(bp);
new = nb; /* use new as a temporary here */
@@ -191,7 +186,7 @@ xfs_growfs_data_private(
nagcount--;
nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
if (nb < mp->m_sb.sb_dblocks)
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
new = nb - mp->m_sb.sb_dblocks;
oagcount = mp->m_sb.sb_agcount;
@@ -229,7 +224,7 @@ xfs_growfs_data_private(
XFS_FSS_TO_BB(mp, 1), 0,
&xfs_agf_buf_ops);
if (!bp) {
- error = ENOMEM;
+ error = -ENOMEM;
goto error0;
}
@@ -270,7 +265,7 @@ xfs_growfs_data_private(
XFS_FSS_TO_BB(mp, 1), 0,
&xfs_agfl_buf_ops);
if (!bp) {
- error = ENOMEM;
+ error = -ENOMEM;
goto error0;
}
@@ -298,7 +293,7 @@ xfs_growfs_data_private(
XFS_FSS_TO_BB(mp, 1), 0,
&xfs_agi_buf_ops);
if (!bp) {
- error = ENOMEM;
+ error = -ENOMEM;
goto error0;
}
@@ -336,7 +331,7 @@ xfs_growfs_data_private(
&xfs_allocbt_buf_ops);
if (!bp) {
- error = ENOMEM;
+ error = -ENOMEM;
goto error0;
}
@@ -365,7 +360,7 @@ xfs_growfs_data_private(
BTOBB(mp->m_sb.sb_blocksize), 0,
&xfs_allocbt_buf_ops);
if (!bp) {
- error = ENOMEM;
+ error = -ENOMEM;
goto error0;
}
@@ -395,7 +390,7 @@ xfs_growfs_data_private(
BTOBB(mp->m_sb.sb_blocksize), 0,
&xfs_inobt_buf_ops);
if (!bp) {
- error = ENOMEM;
+ error = -ENOMEM;
goto error0;
}
@@ -420,7 +415,7 @@ xfs_growfs_data_private(
BTOBB(mp->m_sb.sb_blocksize), 0,
&xfs_inobt_buf_ops);
if (!bp) {
- error = ENOMEM;
+ error = -ENOMEM;
goto error0;
}
@@ -531,7 +526,7 @@ xfs_growfs_data_private(
bp->b_ops = &xfs_sb_buf_ops;
xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
} else
- error = ENOMEM;
+ error = -ENOMEM;
}
/*
@@ -576,17 +571,17 @@ xfs_growfs_log_private(
nb = in->newblocks;
if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES))
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
if (nb == mp->m_sb.sb_logblocks &&
in->isint == (mp->m_sb.sb_logstart != 0))
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
/*
* Moving the log is hard, need new interfaces to sync
* the log first, hold off all activity while moving it.
* Can have shorter or longer log in the same space,
* or transform internal to external log or vice versa.
*/
- return XFS_ERROR(ENOSYS);
+ return -ENOSYS;
}
/*
@@ -604,9 +599,9 @@ xfs_growfs_data(
int error;
if (!capable(CAP_SYS_ADMIN))
- return XFS_ERROR(EPERM);
+ return -EPERM;
if (!mutex_trylock(&mp->m_growlock))
- return XFS_ERROR(EWOULDBLOCK);
+ return -EWOULDBLOCK;
error = xfs_growfs_data_private(mp, in);
mutex_unlock(&mp->m_growlock);
return error;
@@ -620,9 +615,9 @@ xfs_growfs_log(
int error;
if (!capable(CAP_SYS_ADMIN))
- return XFS_ERROR(EPERM);
+ return -EPERM;
if (!mutex_trylock(&mp->m_growlock))
- return XFS_ERROR(EWOULDBLOCK);
+ return -EWOULDBLOCK;
error = xfs_growfs_log_private(mp, in);
mutex_unlock(&mp->m_growlock);
return error;
@@ -674,7 +669,7 @@ xfs_reserve_blocks(
/* If inval is null, report current values and return */
if (inval == (__uint64_t *)NULL) {
if (!outval)
- return EINVAL;
+ return -EINVAL;
outval->resblks = mp->m_resblks;
outval->resblks_avail = mp->m_resblks_avail;
return 0;
@@ -757,7 +752,7 @@ out:
int error;
error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
fdblks_delta, 0);
- if (error == ENOSPC)
+ if (error == -ENOSPC)
goto retry;
}
return 0;
@@ -818,7 +813,7 @@ xfs_fs_goingdown(
SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR);
break;
default:
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
return 0;
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 5399ef222dd7..4d41b241298f 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -43,3 +43,7 @@ xfs_param_t xfs_params = {
.fstrm_timer = { 1, 30*100, 3600*100},
.eofb_timer = { 1, 300, 3600*24},
};
+
+struct xfs_globals xfs_globals = {
+ .log_recovery_delay = 0, /* no delay by default */
+};
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index c48df5f25b9f..b45f7b27b5df 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -33,6 +33,8 @@
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_bmap_util.h"
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
#include <linux/kthread.h>
#include <linux/freezer.h>
@@ -158,7 +160,7 @@ xfs_iget_cache_hit(
if (ip->i_ino != ino) {
trace_xfs_iget_skip(ip);
XFS_STATS_INC(xs_ig_frecycle);
- error = EAGAIN;
+ error = -EAGAIN;
goto out_error;
}
@@ -176,7 +178,7 @@ xfs_iget_cache_hit(
if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
trace_xfs_iget_skip(ip);
XFS_STATS_INC(xs_ig_frecycle);
- error = EAGAIN;
+ error = -EAGAIN;
goto out_error;
}
@@ -184,7 +186,7 @@ xfs_iget_cache_hit(
* If lookup is racing with unlink return an error immediately.
*/
if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
- error = ENOENT;
+ error = -ENOENT;
goto out_error;
}
@@ -206,7 +208,7 @@ xfs_iget_cache_hit(
spin_unlock(&ip->i_flags_lock);
rcu_read_unlock();
- error = -inode_init_always(mp->m_super, inode);
+ error = inode_init_always(mp->m_super, inode);
if (error) {
/*
* Re-initializing the inode failed, and we are in deep
@@ -243,7 +245,7 @@ xfs_iget_cache_hit(
/* If the VFS inode is being torn down, pause and try again. */
if (!igrab(inode)) {
trace_xfs_iget_skip(ip);
- error = EAGAIN;
+ error = -EAGAIN;
goto out_error;
}
@@ -285,7 +287,7 @@ xfs_iget_cache_miss(
ip = xfs_inode_alloc(mp, ino);
if (!ip)
- return ENOMEM;
+ return -ENOMEM;
error = xfs_iread(mp, tp, ip, flags);
if (error)
@@ -294,7 +296,7 @@ xfs_iget_cache_miss(
trace_xfs_iget_miss(ip);
if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
- error = ENOENT;
+ error = -ENOENT;
goto out_destroy;
}
@@ -305,7 +307,7 @@ xfs_iget_cache_miss(
* recurse into the file system.
*/
if (radix_tree_preload(GFP_NOFS)) {
- error = EAGAIN;
+ error = -EAGAIN;
goto out_destroy;
}
@@ -341,7 +343,7 @@ xfs_iget_cache_miss(
if (unlikely(error)) {
WARN_ON(error != -EEXIST);
XFS_STATS_INC(xs_ig_dup);
- error = EAGAIN;
+ error = -EAGAIN;
goto out_preload_end;
}
spin_unlock(&pag->pag_ici_lock);
@@ -408,7 +410,7 @@ xfs_iget(
/* reject inode numbers outside existing AGs */
if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
- return EINVAL;
+ return -EINVAL;
/* get the perag structure and ensure that it's inode capable */
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
@@ -445,7 +447,7 @@ again:
return 0;
out_error_or_again:
- if (error == EAGAIN) {
+ if (error == -EAGAIN) {
delay(1);
goto again;
}
@@ -489,18 +491,18 @@ xfs_inode_ag_walk_grab(
/* nothing to sync during shutdown */
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return EFSCORRUPTED;
+ return -EFSCORRUPTED;
/* If we can't grab the inode, it must on it's way to reclaim. */
if (!igrab(inode))
- return ENOENT;
+ return -ENOENT;
/* inode is valid */
return 0;
out_unlock_noent:
spin_unlock(&ip->i_flags_lock);
- return ENOENT;
+ return -ENOENT;
}
STATIC int
@@ -583,16 +585,16 @@ restart:
continue;
error = execute(batch[i], flags, args);
IRELE(batch[i]);
- if (error == EAGAIN) {
+ if (error == -EAGAIN) {
skipped++;
continue;
}
- if (error && last_error != EFSCORRUPTED)
+ if (error && last_error != -EFSCORRUPTED)
last_error = error;
}
/* bail out if the filesystem is corrupted. */
- if (error == EFSCORRUPTED)
+ if (error == -EFSCORRUPTED)
break;
cond_resched();
@@ -652,11 +654,11 @@ xfs_inode_ag_iterator(
xfs_perag_put(pag);
if (error) {
last_error = error;
- if (error == EFSCORRUPTED)
+ if (error == -EFSCORRUPTED)
break;
}
}
- return XFS_ERROR(last_error);
+ return last_error;
}
int
@@ -680,11 +682,11 @@ xfs_inode_ag_iterator_tag(
xfs_perag_put(pag);
if (error) {
last_error = error;
- if (error == EFSCORRUPTED)
+ if (error == -EFSCORRUPTED)
break;
}
}
- return XFS_ERROR(last_error);
+ return last_error;
}
/*
@@ -944,7 +946,7 @@ restart:
* see the stale flag set on the inode.
*/
error = xfs_iflush(ip, &bp);
- if (error == EAGAIN) {
+ if (error == -EAGAIN) {
xfs_iunlock(ip, XFS_ILOCK_EXCL);
/* backoff longer than in xfs_ifree_cluster */
delay(2);
@@ -997,7 +999,7 @@ out:
xfs_iflags_clear(ip, XFS_IRECLAIM);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
/*
- * We could return EAGAIN here to make reclaim rescan the inode tree in
+ * We could return -EAGAIN here to make reclaim rescan the inode tree in
* a short while. However, this just burns CPU time scanning the tree
* waiting for IO to complete and the reclaim work never goes back to
* the idle state. Instead, return 0 to let the next scheduled
@@ -1100,7 +1102,7 @@ restart:
if (!batch[i])
continue;
error = xfs_reclaim_inode(batch[i], pag, flags);
- if (error && last_error != EFSCORRUPTED)
+ if (error && last_error != -EFSCORRUPTED)
last_error = error;
}
@@ -1129,7 +1131,7 @@ restart:
trylock = 0;
goto restart;
}
- return XFS_ERROR(last_error);
+ return last_error;
}
int
@@ -1203,6 +1205,30 @@ xfs_inode_match_id(
return 1;
}
+/*
+ * A union-based inode filtering algorithm. Process the inode if any of the
+ * criteria match. This is for global/internal scans only.
+ */
+STATIC int
+xfs_inode_match_id_union(
+ struct xfs_inode *ip,
+ struct xfs_eofblocks *eofb)
+{
+ if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
+ uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
+ return 1;
+
+ if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
+ gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
+ return 1;
+
+ if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
+ xfs_get_projid(ip) == eofb->eof_prid)
+ return 1;
+
+ return 0;
+}
+
STATIC int
xfs_inode_free_eofblocks(
struct xfs_inode *ip,
@@ -1211,6 +1237,10 @@ xfs_inode_free_eofblocks(
{
int ret;
struct xfs_eofblocks *eofb = args;
+ bool need_iolock = true;
+ int match;
+
+ ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
if (!xfs_can_free_eofblocks(ip, false)) {
/* inode could be preallocated or append-only */
@@ -1228,19 +1258,31 @@ xfs_inode_free_eofblocks(
return 0;
if (eofb) {
- if (!xfs_inode_match_id(ip, eofb))
+ if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
+ match = xfs_inode_match_id_union(ip, eofb);
+ else
+ match = xfs_inode_match_id(ip, eofb);
+ if (!match)
return 0;
/* skip the inode if the file size is too small */
if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
XFS_ISIZE(ip) < eofb->eof_min_file_size)
return 0;
+
+ /*
+ * A scan owner implies we already hold the iolock. Skip it in
+ * xfs_free_eofblocks() to avoid deadlock. This also eliminates
+ * the possibility of EAGAIN being returned.
+ */
+ if (eofb->eof_scan_owner == ip->i_ino)
+ need_iolock = false;
}
- ret = xfs_free_eofblocks(ip->i_mount, ip, true);
+ ret = xfs_free_eofblocks(ip->i_mount, ip, need_iolock);
/* don't revisit the inode if we're not waiting */
- if (ret == EAGAIN && !(flags & SYNC_WAIT))
+ if (ret == -EAGAIN && !(flags & SYNC_WAIT))
ret = 0;
return ret;
@@ -1260,6 +1302,55 @@ xfs_icache_free_eofblocks(
eofb, XFS_ICI_EOFBLOCKS_TAG);
}
+/*
+ * Run eofblocks scans on the quotas applicable to the inode. For inodes with
+ * multiple quotas, we don't know exactly which quota caused an allocation
+ * failure. We make a best effort by including each quota under low free space
+ * conditions (less than 1% free space) in the scan.
+ */
+int
+xfs_inode_free_quota_eofblocks(
+ struct xfs_inode *ip)
+{
+ int scan = 0;
+ struct xfs_eofblocks eofb = {0};
+ struct xfs_dquot *dq;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+
+ /*
+ * Set the scan owner to avoid a potential livelock. Otherwise, the scan
+ * can repeatedly trylock on the inode we're currently processing. We
+ * run a sync scan to increase effectiveness and use the union filter to
+ * cover all applicable quotas in a single scan.
+ */
+ eofb.eof_scan_owner = ip->i_ino;
+ eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;
+
+ if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
+ dq = xfs_inode_dquot(ip, XFS_DQ_USER);
+ if (dq && xfs_dquot_lowsp(dq)) {
+ eofb.eof_uid = VFS_I(ip)->i_uid;
+ eofb.eof_flags |= XFS_EOF_FLAGS_UID;
+ scan = 1;
+ }
+ }
+
+ if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) {
+ dq = xfs_inode_dquot(ip, XFS_DQ_GROUP);
+ if (dq && xfs_dquot_lowsp(dq)) {
+ eofb.eof_gid = VFS_I(ip)->i_gid;
+ eofb.eof_flags |= XFS_EOF_FLAGS_GID;
+ scan = 1;
+ }
+ }
+
+ if (scan)
+ xfs_icache_free_eofblocks(ip->i_mount, &eofb);
+
+ return scan;
+}
+
void
xfs_inode_set_eofblocks_tag(
xfs_inode_t *ip)
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 9cf017b899be..46748b86b12f 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -27,6 +27,7 @@ struct xfs_eofblocks {
kgid_t eof_gid;
prid_t eof_prid;
__u64 eof_min_file_size;
+ xfs_ino_t eof_scan_owner;
};
#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
@@ -57,6 +58,7 @@ void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
+int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip);
void xfs_eofblocks_worker(struct work_struct *);
int xfs_inode_ag_iterator(struct xfs_mount *mp,
@@ -72,31 +74,32 @@ xfs_fs_eofblocks_from_user(
struct xfs_eofblocks *dst)
{
if (src->eof_version != XFS_EOFBLOCKS_VERSION)
- return EINVAL;
+ return -EINVAL;
if (src->eof_flags & ~XFS_EOF_FLAGS_VALID)
- return EINVAL;
+ return -EINVAL;
if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) ||
memchr_inv(src->pad64, 0, sizeof(src->pad64)))
- return EINVAL;
+ return -EINVAL;
dst->eof_flags = src->eof_flags;
dst->eof_prid = src->eof_prid;
dst->eof_min_file_size = src->eof_min_file_size;
+ dst->eof_scan_owner = NULLFSINO;
dst->eof_uid = INVALID_UID;
if (src->eof_flags & XFS_EOF_FLAGS_UID) {
dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid);
if (!uid_valid(dst->eof_uid))
- return EINVAL;
+ return -EINVAL;
}
dst->eof_gid = INVALID_GID;
if (src->eof_flags & XFS_EOF_FLAGS_GID) {
dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid);
if (!gid_valid(dst->eof_gid))
- return EINVAL;
+ return -EINVAL;
}
return 0;
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index a6115fe1ac94..8ed049d1e332 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -583,7 +583,7 @@ xfs_lookup(
trace_xfs_lookup(dp, name);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
- return XFS_ERROR(EIO);
+ return -EIO;
lock_mode = xfs_ilock_data_map_shared(dp);
error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
@@ -654,7 +654,7 @@ xfs_ialloc(
xfs_inode_t *ip;
uint flags;
int error;
- timespec_t tv;
+ struct timespec tv;
/*
* Call the space management code to pick
@@ -720,7 +720,7 @@ xfs_ialloc(
ip->i_d.di_nextents = 0;
ASSERT(ip->i_d.di_nblocks == 0);
- nanotime(&tv);
+ tv = current_fs_time(mp->m_super);
ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
ip->i_d.di_atime = ip->i_d.di_mtime;
@@ -769,6 +769,8 @@ xfs_ialloc(
di_flags |= XFS_DIFLAG_EXTSZINHERIT;
ip->i_d.di_extsize = pip->i_d.di_extsize;
}
+ if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
+ di_flags |= XFS_DIFLAG_PROJINHERIT;
} else if (S_ISREG(mode)) {
if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
di_flags |= XFS_DIFLAG_REALTIME;
@@ -789,8 +791,6 @@ xfs_ialloc(
if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
xfs_inherit_nosymlinks)
di_flags |= XFS_DIFLAG_NOSYMLINKS;
- if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
- di_flags |= XFS_DIFLAG_PROJINHERIT;
if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
xfs_inherit_nodefrag)
di_flags |= XFS_DIFLAG_NODEFRAG;
@@ -893,7 +893,7 @@ xfs_dir_ialloc(
}
if (!ialloc_context && !ip) {
*ipp = NULL;
- return XFS_ERROR(ENOSPC);
+ return -ENOSPC;
}
/*
@@ -1088,7 +1088,7 @@ xfs_create(
trace_xfs_create(dp, name);
if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
+ return -EIO;
prid = xfs_get_initial_prid(dp);
@@ -1125,12 +1125,12 @@ xfs_create(
*/
tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
error = xfs_trans_reserve(tp, &tres, resblks, 0);
- if (error == ENOSPC) {
+ if (error == -ENOSPC) {
/* flush outstanding delalloc blocks and retry */
xfs_flush_inodes(mp);
error = xfs_trans_reserve(tp, &tres, resblks, 0);
}
- if (error == ENOSPC) {
+ if (error == -ENOSPC) {
/* No space at all so try a "no-allocation" reservation */
resblks = 0;
error = xfs_trans_reserve(tp, &tres, 0, 0);
@@ -1153,9 +1153,11 @@ xfs_create(
if (error)
goto out_trans_cancel;
- error = xfs_dir_canenter(tp, dp, name, resblks);
- if (error)
- goto out_trans_cancel;
+ if (!resblks) {
+ error = xfs_dir_canenter(tp, dp, name);
+ if (error)
+ goto out_trans_cancel;
+ }
/*
* A newly created regular or special file just has one directory
@@ -1165,7 +1167,7 @@ xfs_create(
error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
prid, resblks > 0, &ip, &committed);
if (error) {
- if (error == ENOSPC)
+ if (error == -ENOSPC)
goto out_trans_cancel;
goto out_trans_abort;
}
@@ -1184,7 +1186,7 @@ xfs_create(
&first_block, &free_list, resblks ?
resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
if (error) {
- ASSERT(error != ENOSPC);
+ ASSERT(error != -ENOSPC);
goto out_trans_abort;
}
xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -1274,7 +1276,7 @@ xfs_create_tmpfile(
uint resblks;
if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
+ return -EIO;
prid = xfs_get_initial_prid(dp);
@@ -1293,7 +1295,7 @@ xfs_create_tmpfile(
tres = &M_RES(mp)->tr_create_tmpfile;
error = xfs_trans_reserve(tp, tres, resblks, 0);
- if (error == ENOSPC) {
+ if (error == -ENOSPC) {
/* No space at all so try a "no-allocation" reservation */
resblks = 0;
error = xfs_trans_reserve(tp, tres, 0, 0);
@@ -1311,7 +1313,7 @@ xfs_create_tmpfile(
error = xfs_dir_ialloc(&tp, dp, mode, 1, 0,
prid, resblks > 0, &ip, NULL);
if (error) {
- if (error == ENOSPC)
+ if (error == -ENOSPC)
goto out_trans_cancel;
goto out_trans_abort;
}
@@ -1382,7 +1384,7 @@ xfs_link(
ASSERT(!S_ISDIR(sip->i_d.di_mode));
if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
+ return -EIO;
error = xfs_qm_dqattach(sip, 0);
if (error)
@@ -1396,7 +1398,7 @@ xfs_link(
cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
- if (error == ENOSPC) {
+ if (error == -ENOSPC) {
resblks = 0;
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
}
@@ -1417,13 +1419,15 @@ xfs_link(
*/
if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
(xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
- error = XFS_ERROR(EXDEV);
+ error = -EXDEV;
goto error_return;
}
- error = xfs_dir_canenter(tp, tdp, target_name, resblks);
- if (error)
- goto error_return;
+ if (!resblks) {
+ error = xfs_dir_canenter(tp, tdp, target_name);
+ if (error)
+ goto error_return;
+ }
xfs_bmap_init(&free_list, &first_block);
@@ -1635,8 +1639,8 @@ xfs_release(
truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
if (truncated) {
xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
- if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
- error = -filemap_flush(VFS_I(ip)->i_mapping);
+ if (ip->i_delayed_blks > 0) {
+ error = filemap_flush(VFS_I(ip)->i_mapping);
if (error)
return error;
}
@@ -1673,7 +1677,7 @@ xfs_release(
return 0;
error = xfs_free_eofblocks(mp, ip, true);
- if (error && error != EAGAIN)
+ if (error && error != -EAGAIN)
return error;
/* delalloc blocks after truncation means it really is dirty */
@@ -1772,7 +1776,7 @@ xfs_inactive_ifree(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree,
XFS_IFREE_SPACE_RES(mp), 0);
if (error) {
- if (error == ENOSPC) {
+ if (error == -ENOSPC) {
xfs_warn_ratelimited(mp,
"Failed to remove inode(s) from unlinked list. "
"Please free space, unmount and run xfs_repair.");
@@ -2219,7 +2223,7 @@ xfs_ifree_cluster(
XBF_UNMAPPED);
if (!bp)
- return ENOMEM;
+ return -ENOMEM;
/*
* This buffer may not have been correctly initialised as we
@@ -2491,7 +2495,7 @@ xfs_remove(
trace_xfs_remove(dp, name);
if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
+ return -EIO;
error = xfs_qm_dqattach(dp, 0);
if (error)
@@ -2521,12 +2525,12 @@ xfs_remove(
*/
resblks = XFS_REMOVE_SPACE_RES(mp);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0);
- if (error == ENOSPC) {
+ if (error == -ENOSPC) {
resblks = 0;
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0);
}
if (error) {
- ASSERT(error != ENOSPC);
+ ASSERT(error != -ENOSPC);
cancel_flags = 0;
goto out_trans_cancel;
}
@@ -2543,11 +2547,11 @@ xfs_remove(
if (is_dir) {
ASSERT(ip->i_d.di_nlink >= 2);
if (ip->i_d.di_nlink != 2) {
- error = XFS_ERROR(ENOTEMPTY);
+ error = -ENOTEMPTY;
goto out_trans_cancel;
}
if (!xfs_dir_isempty(ip)) {
- error = XFS_ERROR(ENOTEMPTY);
+ error = -ENOTEMPTY;
goto out_trans_cancel;
}
@@ -2582,7 +2586,7 @@ xfs_remove(
error = xfs_dir_removename(tp, dp, name, ip->i_ino,
&first_block, &free_list, resblks);
if (error) {
- ASSERT(error != ENOENT);
+ ASSERT(error != -ENOENT);
goto out_bmap_cancel;
}
@@ -2702,7 +2706,7 @@ xfs_rename(
cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0);
- if (error == ENOSPC) {
+ if (error == -ENOSPC) {
spaceres = 0;
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0);
}
@@ -2747,7 +2751,7 @@ xfs_rename(
*/
if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
(xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
- error = XFS_ERROR(EXDEV);
+ error = -EXDEV;
goto error_return;
}
@@ -2759,9 +2763,11 @@ xfs_rename(
* If there's no space reservation, check the entry will
* fit before actually inserting it.
*/
- error = xfs_dir_canenter(tp, target_dp, target_name, spaceres);
- if (error)
- goto error_return;
+ if (!spaceres) {
+ error = xfs_dir_canenter(tp, target_dp, target_name);
+ if (error)
+ goto error_return;
+ }
/*
* If target does not exist and the rename crosses
* directories, adjust the target directory link count
@@ -2770,7 +2776,7 @@ xfs_rename(
error = xfs_dir_createname(tp, target_dp, target_name,
src_ip->i_ino, &first_block,
&free_list, spaceres);
- if (error == ENOSPC)
+ if (error == -ENOSPC)
goto error_return;
if (error)
goto abort_return;
@@ -2795,7 +2801,7 @@ xfs_rename(
*/
if (!(xfs_dir_isempty(target_ip)) ||
(target_ip->i_d.di_nlink > 2)) {
- error = XFS_ERROR(EEXIST);
+ error = -EEXIST;
goto error_return;
}
}
@@ -2847,7 +2853,7 @@ xfs_rename(
error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
target_dp->i_ino,
&first_block, &free_list, spaceres);
- ASSERT(error != EEXIST);
+ ASSERT(error != -EEXIST);
if (error)
goto abort_return;
}
@@ -3055,8 +3061,8 @@ cluster_corrupt_out:
if (bp->b_iodone) {
XFS_BUF_UNDONE(bp);
xfs_buf_stale(bp);
- xfs_buf_ioerror(bp, EIO);
- xfs_buf_ioend(bp, 0);
+ xfs_buf_ioerror(bp, -EIO);
+ xfs_buf_ioend(bp);
} else {
xfs_buf_stale(bp);
xfs_buf_relse(bp);
@@ -3069,7 +3075,7 @@ cluster_corrupt_out:
xfs_iflush_abort(iq, false);
kmem_free(ilist);
xfs_perag_put(pag);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
/*
@@ -3124,7 +3130,7 @@ xfs_iflush(
* as we wait for an empty AIL as part of the unmount process.
*/
if (XFS_FORCED_SHUTDOWN(mp)) {
- error = XFS_ERROR(EIO);
+ error = -EIO;
goto abort_out;
}
@@ -3167,7 +3173,7 @@ corrupt_out:
xfs_buf_relse(bp);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
cluster_corrupt_out:
- error = XFS_ERROR(EFSCORRUPTED);
+ error = -EFSCORRUPTED;
abort_out:
/*
* Unlocks the flush lock
@@ -3331,5 +3337,5 @@ xfs_iflush_int(
return 0;
corrupt_out:
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index f72bffa67266..9af2882e1f4c 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -102,7 +102,7 @@ xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size)
{
xfs_fsize_t i_size = i_size_read(VFS_I(ip));
- if (new_size > i_size)
+ if (new_size > i_size || new_size < 0)
new_size = i_size;
return new_size > ip->i_d.di_size ? new_size : 0;
}
@@ -398,4 +398,14 @@ do { \
extern struct kmem_zone *xfs_inode_zone;
+/*
+ * Flags for read/write calls
+ */
+#define XFS_IO_ISDIRECT 0x00001 /* bypass page cache */
+#define XFS_IO_INVIS 0x00002 /* don't update inode timestamps */
+
+#define XFS_IO_FLAGS \
+ { XFS_IO_ISDIRECT, "DIRECT" }, \
+ { XFS_IO_INVIS, "INVIS"}
+
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index a640137b3573..63de0b0acc32 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -615,7 +615,7 @@ xfs_iflush_done(
blip = bp->b_fspriv;
prev = NULL;
while (blip != NULL) {
- if (lip->li_cb != xfs_iflush_done) {
+ if (blip->li_cb != xfs_iflush_done) {
prev = blip;
blip = blip->li_bio_list;
continue;
@@ -788,5 +788,5 @@ xfs_inode_item_format_convert(
in_f->ilf_boffset = in_f64->ilf_boffset;
return 0;
}
- return EFSCORRUPTED;
+ return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 8bc1bbce7451..24c926b6fe85 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -207,7 +207,7 @@ xfs_open_by_handle(
struct path path;
if (!capable(CAP_SYS_ADMIN))
- return -XFS_ERROR(EPERM);
+ return -EPERM;
dentry = xfs_handlereq_to_dentry(parfilp, hreq);
if (IS_ERR(dentry))
@@ -216,7 +216,7 @@ xfs_open_by_handle(
/* Restrict xfs_open_by_handle to directories & regular files. */
if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
- error = -XFS_ERROR(EPERM);
+ error = -EPERM;
goto out_dput;
}
@@ -228,18 +228,18 @@ xfs_open_by_handle(
fmode = OPEN_FMODE(permflag);
if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
(fmode & FMODE_WRITE) && IS_APPEND(inode)) {
- error = -XFS_ERROR(EPERM);
+ error = -EPERM;
goto out_dput;
}
if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
- error = -XFS_ERROR(EACCES);
+ error = -EACCES;
goto out_dput;
}
/* Can't write directories. */
if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) {
- error = -XFS_ERROR(EISDIR);
+ error = -EISDIR;
goto out_dput;
}
@@ -282,7 +282,7 @@ xfs_readlink_by_handle(
int error;
if (!capable(CAP_SYS_ADMIN))
- return -XFS_ERROR(EPERM);
+ return -EPERM;
dentry = xfs_handlereq_to_dentry(parfilp, hreq);
if (IS_ERR(dentry))
@@ -290,22 +290,22 @@ xfs_readlink_by_handle(
/* Restrict this handle operation to symlinks only. */
if (!S_ISLNK(dentry->d_inode->i_mode)) {
- error = -XFS_ERROR(EINVAL);
+ error = -EINVAL;
goto out_dput;
}
if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) {
- error = -XFS_ERROR(EFAULT);
+ error = -EFAULT;
goto out_dput;
}
link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
if (!link) {
- error = -XFS_ERROR(ENOMEM);
+ error = -ENOMEM;
goto out_dput;
}
- error = -xfs_readlink(XFS_I(dentry->d_inode), link);
+ error = xfs_readlink(XFS_I(dentry->d_inode), link);
if (error)
goto out_kfree;
error = readlink_copy(hreq->ohandle, olen, link);
@@ -330,10 +330,10 @@ xfs_set_dmattrs(
int error;
if (!capable(CAP_SYS_ADMIN))
- return XFS_ERROR(EPERM);
+ return -EPERM;
if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
+ return -EIO;
tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
@@ -364,9 +364,9 @@ xfs_fssetdm_by_handle(
struct dentry *dentry;
if (!capable(CAP_MKNOD))
- return -XFS_ERROR(EPERM);
+ return -EPERM;
if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
error = mnt_want_write_file(parfilp);
if (error)
@@ -379,16 +379,16 @@ xfs_fssetdm_by_handle(
}
if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
- error = -XFS_ERROR(EPERM);
+ error = -EPERM;
goto out;
}
if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) {
- error = -XFS_ERROR(EFAULT);
+ error = -EFAULT;
goto out;
}
- error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
+ error = xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
fsd.fsd_dmstate);
out:
@@ -409,18 +409,18 @@ xfs_attrlist_by_handle(
char *kbuf;
if (!capable(CAP_SYS_ADMIN))
- return -XFS_ERROR(EPERM);
+ return -EPERM;
if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
if (al_hreq.buflen < sizeof(struct attrlist) ||
al_hreq.buflen > XATTR_LIST_MAX)
- return -XFS_ERROR(EINVAL);
+ return -EINVAL;
/*
* Reject flags, only allow namespaces.
*/
if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
- return -XFS_ERROR(EINVAL);
+ return -EINVAL;
dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq);
if (IS_ERR(dentry))
@@ -431,7 +431,7 @@ xfs_attrlist_by_handle(
goto out_dput;
cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
- error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
+ error = xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
al_hreq.flags, cursor);
if (error)
goto out_kfree;
@@ -455,20 +455,20 @@ xfs_attrmulti_attr_get(
__uint32_t flags)
{
unsigned char *kbuf;
- int error = EFAULT;
+ int error = -EFAULT;
if (*len > XATTR_SIZE_MAX)
- return EINVAL;
+ return -EINVAL;
kbuf = kmem_zalloc_large(*len, KM_SLEEP);
if (!kbuf)
- return ENOMEM;
+ return -ENOMEM;
error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags);
if (error)
goto out_kfree;
if (copy_to_user(ubuf, kbuf, *len))
- error = EFAULT;
+ error = -EFAULT;
out_kfree:
kmem_free(kbuf);
@@ -484,20 +484,17 @@ xfs_attrmulti_attr_set(
__uint32_t flags)
{
unsigned char *kbuf;
- int error = EFAULT;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- return EPERM;
+ return -EPERM;
if (len > XATTR_SIZE_MAX)
- return EINVAL;
+ return -EINVAL;
kbuf = memdup_user(ubuf, len);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
- error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
-
- return error;
+ return xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
}
int
@@ -507,7 +504,7 @@ xfs_attrmulti_attr_remove(
__uint32_t flags)
{
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- return EPERM;
+ return -EPERM;
return xfs_attr_remove(XFS_I(inode), name, flags);
}
@@ -524,9 +521,9 @@ xfs_attrmulti_by_handle(
unsigned char *attr_name;
if (!capable(CAP_SYS_ADMIN))
- return -XFS_ERROR(EPERM);
+ return -EPERM;
if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
/* overflow check */
if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t))
@@ -536,18 +533,18 @@ xfs_attrmulti_by_handle(
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- error = E2BIG;
+ error = -E2BIG;
size = am_hreq.opcount * sizeof(xfs_attr_multiop_t);
if (!size || size > 16 * PAGE_SIZE)
goto out_dput;
ops = memdup_user(am_hreq.ops, size);
if (IS_ERR(ops)) {
- error = -PTR_ERR(ops);
+ error = PTR_ERR(ops);
goto out_dput;
}
- error = ENOMEM;
+ error = -ENOMEM;
attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
if (!attr_name)
goto out_kfree_ops;
@@ -557,7 +554,7 @@ xfs_attrmulti_by_handle(
ops[i].am_error = strncpy_from_user((char *)attr_name,
ops[i].am_attrname, MAXNAMELEN);
if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
- error = ERANGE;
+ error = -ERANGE;
if (ops[i].am_error < 0)
break;
@@ -588,19 +585,19 @@ xfs_attrmulti_by_handle(
mnt_drop_write_file(parfilp);
break;
default:
- ops[i].am_error = EINVAL;
+ ops[i].am_error = -EINVAL;
}
}
if (copy_to_user(am_hreq.ops, ops, size))
- error = XFS_ERROR(EFAULT);
+ error = -EFAULT;
kfree(attr_name);
out_kfree_ops:
kfree(ops);
out_dput:
dput(dentry);
- return -error;
+ return error;
}
int
@@ -625,16 +622,16 @@ xfs_ioc_space(
*/
if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) &&
!capable(CAP_SYS_ADMIN))
- return -XFS_ERROR(EPERM);
+ return -EPERM;
if (inode->i_flags & (S_IMMUTABLE|S_APPEND))
- return -XFS_ERROR(EPERM);
+ return -EPERM;
if (!(filp->f_mode & FMODE_WRITE))
- return -XFS_ERROR(EBADF);
+ return -EBADF;
if (!S_ISREG(inode->i_mode))
- return -XFS_ERROR(EINVAL);
+ return -EINVAL;
error = mnt_want_write_file(filp);
if (error)
@@ -652,7 +649,7 @@ xfs_ioc_space(
bf->l_start += XFS_ISIZE(ip);
break;
default:
- error = XFS_ERROR(EINVAL);
+ error = -EINVAL;
goto out_unlock;
}
@@ -669,7 +666,7 @@ xfs_ioc_space(
case XFS_IOC_UNRESVSP:
case XFS_IOC_UNRESVSP64:
if (bf->l_len <= 0) {
- error = XFS_ERROR(EINVAL);
+ error = -EINVAL;
goto out_unlock;
}
break;
@@ -682,7 +679,7 @@ xfs_ioc_space(
bf->l_start > mp->m_super->s_maxbytes ||
bf->l_start + bf->l_len < 0 ||
bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) {
- error = XFS_ERROR(EINVAL);
+ error = -EINVAL;
goto out_unlock;
}
@@ -723,7 +720,7 @@ xfs_ioc_space(
break;
default:
ASSERT(0);
- error = XFS_ERROR(EINVAL);
+ error = -EINVAL;
}
if (error)
@@ -739,7 +736,7 @@ xfs_ioc_space(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- if (!(ioflags & IO_INVIS)) {
+ if (!(ioflags & XFS_IO_INVIS)) {
ip->i_d.di_mode &= ~S_ISUID;
if (ip->i_d.di_mode & S_IXGRP)
ip->i_d.di_mode &= ~S_ISGID;
@@ -759,7 +756,7 @@ xfs_ioc_space(
out_unlock:
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
mnt_drop_write_file(filp);
- return -error;
+ return error;
}
STATIC int
@@ -781,41 +778,41 @@ xfs_ioc_bulkstat(
return -EPERM;
if (XFS_FORCED_SHUTDOWN(mp))
- return -XFS_ERROR(EIO);
+ return -EIO;
if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
if ((count = bulkreq.icount) <= 0)
- return -XFS_ERROR(EINVAL);
+ return -EINVAL;
if (bulkreq.ubuffer == NULL)
- return -XFS_ERROR(EINVAL);
+ return -EINVAL;
if (cmd == XFS_IOC_FSINUMBERS)
error = xfs_inumbers(mp, &inlast, &count,
bulkreq.ubuffer, xfs_inumbers_fmt);
else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE)
- error = xfs_bulkstat_single(mp, &inlast,
- bulkreq.ubuffer, &done);
+ error = xfs_bulkstat_one(mp, inlast, bulkreq.ubuffer,
+ sizeof(xfs_bstat_t), NULL, &done);
else /* XFS_IOC_FSBULKSTAT */
error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one,
sizeof(xfs_bstat_t), bulkreq.ubuffer,
&done);
if (error)
- return -error;
+ return error;
if (bulkreq.ocount != NULL) {
if (copy_to_user(bulkreq.lastip, &inlast,
sizeof(xfs_ino_t)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
}
return 0;
@@ -831,7 +828,7 @@ xfs_ioc_fsgeometry_v1(
error = xfs_fs_geometry(mp, &fsgeo, 3);
if (error)
- return -error;
+ return error;
/*
* Caller should have passed an argument of type
@@ -839,7 +836,7 @@ xfs_ioc_fsgeometry_v1(
* xfs_fsop_geom_t that xfs_fs_geometry() fills in.
*/
if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
return 0;
}
@@ -853,10 +850,10 @@ xfs_ioc_fsgeometry(
error = xfs_fs_geometry(mp, &fsgeo, 4);
if (error)
- return -error;
+ return error;
if (copy_to_user(arg, &fsgeo, sizeof(fsgeo)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
return 0;
}
@@ -971,8 +968,6 @@ xfs_set_diflags(
di_flags |= XFS_DIFLAG_NOATIME;
if (xflags & XFS_XFLAG_NODUMP)
di_flags |= XFS_DIFLAG_NODUMP;
- if (xflags & XFS_XFLAG_PROJINHERIT)
- di_flags |= XFS_DIFLAG_PROJINHERIT;
if (xflags & XFS_XFLAG_NODEFRAG)
di_flags |= XFS_DIFLAG_NODEFRAG;
if (xflags & XFS_XFLAG_FILESTREAM)
@@ -984,6 +979,8 @@ xfs_set_diflags(
di_flags |= XFS_DIFLAG_NOSYMLINKS;
if (xflags & XFS_XFLAG_EXTSZINHERIT)
di_flags |= XFS_DIFLAG_EXTSZINHERIT;
+ if (xflags & XFS_XFLAG_PROJINHERIT)
+ di_flags |= XFS_DIFLAG_PROJINHERIT;
} else if (S_ISREG(ip->i_d.di_mode)) {
if (xflags & XFS_XFLAG_REALTIME)
di_flags |= XFS_DIFLAG_REALTIME;
@@ -1041,16 +1038,16 @@ xfs_ioctl_setattr(
trace_xfs_ioctl_setattr(ip);
if (mp->m_flags & XFS_MOUNT_RDONLY)
- return XFS_ERROR(EROFS);
+ return -EROFS;
if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
+ return -EIO;
/*
* Disallow 32bit project ids when projid32bit feature is not enabled.
*/
if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) &&
!xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
/*
* If disk quotas is on, we make sure that the dquots do exist on disk,
@@ -1088,7 +1085,7 @@ xfs_ioctl_setattr(
* CAP_FSETID capability is applicable.
*/
if (!inode_owner_or_capable(VFS_I(ip))) {
- code = XFS_ERROR(EPERM);
+ code = -EPERM;
goto error_return;
}
@@ -1099,7 +1096,7 @@ xfs_ioctl_setattr(
*/
if (mask & FSX_PROJID) {
if (current_user_ns() != &init_user_ns) {
- code = XFS_ERROR(EINVAL);
+ code = -EINVAL;
goto error_return;
}
@@ -1122,7 +1119,7 @@ xfs_ioctl_setattr(
if (ip->i_d.di_nextents &&
((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
fa->fsx_extsize)) {
- code = XFS_ERROR(EINVAL); /* EFBIG? */
+ code = -EINVAL; /* EFBIG? */
goto error_return;
}
@@ -1141,7 +1138,7 @@ xfs_ioctl_setattr(
extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
if (extsize_fsb > MAXEXTLEN) {
- code = XFS_ERROR(EINVAL);
+ code = -EINVAL;
goto error_return;
}
@@ -1153,13 +1150,13 @@ xfs_ioctl_setattr(
} else {
size = mp->m_sb.sb_blocksize;
if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
- code = XFS_ERROR(EINVAL);
+ code = -EINVAL;
goto error_return;
}
}
if (fa->fsx_extsize % size) {
- code = XFS_ERROR(EINVAL);
+ code = -EINVAL;
goto error_return;
}
}
@@ -1173,7 +1170,7 @@ xfs_ioctl_setattr(
if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
(XFS_IS_REALTIME_INODE(ip)) !=
(fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
- code = XFS_ERROR(EINVAL); /* EFBIG? */
+ code = -EINVAL; /* EFBIG? */
goto error_return;
}
@@ -1184,7 +1181,7 @@ xfs_ioctl_setattr(
if ((mp->m_sb.sb_rblocks == 0) ||
(mp->m_sb.sb_rextsize == 0) ||
(ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
- code = XFS_ERROR(EINVAL);
+ code = -EINVAL;
goto error_return;
}
}
@@ -1198,7 +1195,7 @@ xfs_ioctl_setattr(
(fa->fsx_xflags &
(XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
!capable(CAP_LINUX_IMMUTABLE)) {
- code = XFS_ERROR(EPERM);
+ code = -EPERM;
goto error_return;
}
}
@@ -1234,13 +1231,25 @@ xfs_ioctl_setattr(
}
- if (mask & FSX_EXTSIZE)
- ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
if (mask & FSX_XFLAGS) {
xfs_set_diflags(ip, fa->fsx_xflags);
xfs_diflags_to_linux(ip);
}
+ /*
+ * Only set the extent size hint if we've already determined that the
+ * extent size hint should be set on the inode. If no extent size flags
+ * are set on the inode then unconditionally clear the extent size hint.
+ */
+ if (mask & FSX_EXTSIZE) {
+ int extsize = 0;
+
+ if (ip->i_d.di_flags &
+ (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT))
+ extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
+ ip->i_d.di_extsize = extsize;
+ }
+
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -1301,7 +1310,7 @@ xfs_ioc_fssetxattr(
return error;
error = xfs_ioctl_setattr(ip, &fa, mask);
mnt_drop_write_file(filp);
- return -error;
+ return error;
}
STATIC int
@@ -1346,17 +1355,17 @@ xfs_ioc_setxflags(
return error;
error = xfs_ioctl_setattr(ip, &fa, mask);
mnt_drop_write_file(filp);
- return -error;
+ return error;
}
STATIC int
xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
{
- struct getbmap __user *base = *ap;
+ struct getbmap __user *base = (struct getbmap __user *)*ap;
/* copy only getbmap portion (not getbmapx) */
if (copy_to_user(base, bmv, sizeof(struct getbmap)))
- return XFS_ERROR(EFAULT);
+ return -EFAULT;
*ap += sizeof(struct getbmap);
return 0;
@@ -1373,33 +1382,33 @@ xfs_ioc_getbmap(
int error;
if (copy_from_user(&bmx, arg, sizeof(struct getbmapx)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
if (bmx.bmv_count < 2)
- return -XFS_ERROR(EINVAL);
+ return -EINVAL;
bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
- if (ioflags & IO_INVIS)
+ if (ioflags & XFS_IO_INVIS)
bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
error = xfs_getbmap(ip, &bmx, xfs_getbmap_format,
- (struct getbmap *)arg+1);
+ (__force struct getbmap *)arg+1);
if (error)
- return -error;
+ return error;
/* copy back header - only size of getbmap */
if (copy_to_user(arg, &bmx, sizeof(struct getbmap)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
return 0;
}
STATIC int
xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full)
{
- struct getbmapx __user *base = *ap;
+ struct getbmapx __user *base = (struct getbmapx __user *)*ap;
if (copy_to_user(base, bmv, sizeof(struct getbmapx)))
- return XFS_ERROR(EFAULT);
+ return -EFAULT;
*ap += sizeof(struct getbmapx);
return 0;
@@ -1414,22 +1423,22 @@ xfs_ioc_getbmapx(
int error;
if (copy_from_user(&bmx, arg, sizeof(bmx)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
if (bmx.bmv_count < 2)
- return -XFS_ERROR(EINVAL);
+ return -EINVAL;
if (bmx.bmv_iflags & (~BMV_IF_VALID))
- return -XFS_ERROR(EINVAL);
+ return -EINVAL;
error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format,
- (struct getbmapx *)arg+1);
+ (__force struct getbmapx *)arg+1);
if (error)
- return -error;
+ return error;
/* copy back header */
if (copy_to_user(arg, &bmx, sizeof(struct getbmapx)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
return 0;
}
@@ -1445,33 +1454,33 @@ xfs_ioc_swapext(
/* Pull information for the target fd */
f = fdget((int)sxp->sx_fdtarget);
if (!f.file) {
- error = XFS_ERROR(EINVAL);
+ error = -EINVAL;
goto out;
}
if (!(f.file->f_mode & FMODE_WRITE) ||
!(f.file->f_mode & FMODE_READ) ||
(f.file->f_flags & O_APPEND)) {
- error = XFS_ERROR(EBADF);
+ error = -EBADF;
goto out_put_file;
}
tmp = fdget((int)sxp->sx_fdtmp);
if (!tmp.file) {
- error = XFS_ERROR(EINVAL);
+ error = -EINVAL;
goto out_put_file;
}
if (!(tmp.file->f_mode & FMODE_WRITE) ||
!(tmp.file->f_mode & FMODE_READ) ||
(tmp.file->f_flags & O_APPEND)) {
- error = XFS_ERROR(EBADF);
+ error = -EBADF;
goto out_put_tmp_file;
}
if (IS_SWAPFILE(file_inode(f.file)) ||
IS_SWAPFILE(file_inode(tmp.file))) {
- error = XFS_ERROR(EINVAL);
+ error = -EINVAL;
goto out_put_tmp_file;
}
@@ -1479,17 +1488,17 @@ xfs_ioc_swapext(
tip = XFS_I(file_inode(tmp.file));
if (ip->i_mount != tip->i_mount) {
- error = XFS_ERROR(EINVAL);
+ error = -EINVAL;
goto out_put_tmp_file;
}
if (ip->i_ino == tip->i_ino) {
- error = XFS_ERROR(EINVAL);
+ error = -EINVAL;
goto out_put_tmp_file;
}
if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- error = XFS_ERROR(EIO);
+ error = -EIO;
goto out_put_tmp_file;
}
@@ -1523,7 +1532,7 @@ xfs_file_ioctl(
int error;
if (filp->f_mode & FMODE_NOCMTIME)
- ioflags |= IO_INVIS;
+ ioflags |= XFS_IO_INVIS;
trace_xfs_file_ioctl(ip);
@@ -1542,7 +1551,7 @@ xfs_file_ioctl(
xfs_flock64_t bf;
if (copy_from_user(&bf, arg, sizeof(bf)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
}
case XFS_IOC_DIOINFO: {
@@ -1555,7 +1564,7 @@ xfs_file_ioctl(
da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
if (copy_to_user(arg, &da, sizeof(da)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
return 0;
}
@@ -1588,7 +1597,7 @@ xfs_file_ioctl(
struct fsdmidata dmi;
if (copy_from_user(&dmi, arg, sizeof(dmi)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
error = mnt_want_write_file(filp);
if (error)
@@ -1597,7 +1606,7 @@ xfs_file_ioctl(
error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask,
dmi.fsd_dmstate);
mnt_drop_write_file(filp);
- return -error;
+ return error;
}
case XFS_IOC_GETBMAP:
@@ -1613,14 +1622,14 @@ xfs_file_ioctl(
xfs_fsop_handlereq_t hreq;
if (copy_from_user(&hreq, arg, sizeof(hreq)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
return xfs_find_handle(cmd, &hreq);
}
case XFS_IOC_OPEN_BY_HANDLE: {
xfs_fsop_handlereq_t hreq;
if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
return xfs_open_by_handle(filp, &hreq);
}
case XFS_IOC_FSSETDM_BY_HANDLE:
@@ -1630,7 +1639,7 @@ xfs_file_ioctl(
xfs_fsop_handlereq_t hreq;
if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
return xfs_readlink_by_handle(filp, &hreq);
}
case XFS_IOC_ATTRLIST_BY_HANDLE:
@@ -1643,13 +1652,13 @@ xfs_file_ioctl(
struct xfs_swapext sxp;
if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
error = mnt_want_write_file(filp);
if (error)
return error;
error = xfs_ioc_swapext(&sxp);
mnt_drop_write_file(filp);
- return -error;
+ return error;
}
case XFS_IOC_FSCOUNTS: {
@@ -1657,10 +1666,10 @@ xfs_file_ioctl(
error = xfs_fs_counts(mp, &out);
if (error)
- return -error;
+ return error;
if (copy_to_user(arg, &out, sizeof(out)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
return 0;
}
@@ -1672,10 +1681,10 @@ xfs_file_ioctl(
return -EPERM;
if (mp->m_flags & XFS_MOUNT_RDONLY)
- return -XFS_ERROR(EROFS);
+ return -EROFS;
if (copy_from_user(&inout, arg, sizeof(inout)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
error = mnt_want_write_file(filp);
if (error)
@@ -1686,10 +1695,10 @@ xfs_file_ioctl(
error = xfs_reserve_blocks(mp, &in, &inout);
mnt_drop_write_file(filp);
if (error)
- return -error;
+ return error;
if (copy_to_user(arg, &inout, sizeof(inout)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
return 0;
}
@@ -1701,10 +1710,10 @@ xfs_file_ioctl(
error = xfs_reserve_blocks(mp, NULL, &out);
if (error)
- return -error;
+ return error;
if (copy_to_user(arg, &out, sizeof(out)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
return 0;
}
@@ -1713,42 +1722,42 @@ xfs_file_ioctl(
xfs_growfs_data_t in;
if (copy_from_user(&in, arg, sizeof(in)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
error = mnt_want_write_file(filp);
if (error)
return error;
error = xfs_growfs_data(mp, &in);
mnt_drop_write_file(filp);
- return -error;
+ return error;
}
case XFS_IOC_FSGROWFSLOG: {
xfs_growfs_log_t in;
if (copy_from_user(&in, arg, sizeof(in)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
error = mnt_want_write_file(filp);
if (error)
return error;
error = xfs_growfs_log(mp, &in);
mnt_drop_write_file(filp);
- return -error;
+ return error;
}
case XFS_IOC_FSGROWFSRT: {
xfs_growfs_rt_t in;
if (copy_from_user(&in, arg, sizeof(in)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
error = mnt_want_write_file(filp);
if (error)
return error;
error = xfs_growfs_rt(mp, &in);
mnt_drop_write_file(filp);
- return -error;
+ return error;
}
case XFS_IOC_GOINGDOWN: {
@@ -1758,10 +1767,9 @@ xfs_file_ioctl(
return -EPERM;
if (get_user(in, (__uint32_t __user *)arg))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
- error = xfs_fs_goingdown(mp, in);
- return -error;
+ return xfs_fs_goingdown(mp, in);
}
case XFS_IOC_ERROR_INJECTION: {
@@ -1771,18 +1779,16 @@ xfs_file_ioctl(
return -EPERM;
if (copy_from_user(&in, arg, sizeof(in)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
- error = xfs_errortag_add(in.errtag, mp);
- return -error;
+ return xfs_errortag_add(in.errtag, mp);
}
case XFS_IOC_ERROR_CLEARALL:
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- error = xfs_errortag_clearall(mp, 1);
- return -error;
+ return xfs_errortag_clearall(mp, 1);
case XFS_IOC_FREE_EOFBLOCKS: {
struct xfs_fs_eofblocks eofb;
@@ -1792,16 +1798,16 @@ xfs_file_ioctl(
return -EPERM;
if (mp->m_flags & XFS_MOUNT_RDONLY)
- return -XFS_ERROR(EROFS);
+ return -EROFS;
if (copy_from_user(&eofb, arg, sizeof(eofb)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
error = xfs_fs_eofblocks_from_user(&eofb, &keofb);
if (error)
- return -error;
+ return error;
- return -xfs_icache_free_eofblocks(mp, &keofb);
+ return xfs_icache_free_eofblocks(mp, &keofb);
}
default:
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 944d5baa710a..94ce027e28e3 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -28,7 +28,6 @@
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_vnode.h"
#include "xfs_inode.h"
#include "xfs_itable.h"
#include "xfs_error.h"
@@ -56,7 +55,7 @@ xfs_compat_flock64_copyin(
get_user(bf->l_sysid, &arg32->l_sysid) ||
get_user(bf->l_pid, &arg32->l_pid) ||
copy_from_user(bf->l_pad, &arg32->l_pad, 4*sizeof(u32)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
return 0;
}
@@ -70,10 +69,10 @@ xfs_compat_ioc_fsgeometry_v1(
error = xfs_fs_geometry(mp, &fsgeo, 3);
if (error)
- return -error;
+ return error;
/* The 32-bit variant simply has some padding at the end */
if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
return 0;
}
@@ -84,7 +83,7 @@ xfs_compat_growfs_data_copyin(
{
if (get_user(in->newblocks, &arg32->newblocks) ||
get_user(in->imaxpct, &arg32->imaxpct))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
return 0;
}
@@ -95,14 +94,14 @@ xfs_compat_growfs_rt_copyin(
{
if (get_user(in->newblocks, &arg32->newblocks) ||
get_user(in->extsize, &arg32->extsize))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
return 0;
}
STATIC int
xfs_inumbers_fmt_compat(
void __user *ubuffer,
- const xfs_inogrp_t *buffer,
+ const struct xfs_inogrp *buffer,
long count,
long *written)
{
@@ -113,7 +112,7 @@ xfs_inumbers_fmt_compat(
if (put_user(buffer[i].xi_startino, &p32[i].xi_startino) ||
put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) ||
put_user(buffer[i].xi_allocmask, &p32[i].xi_allocmask))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
}
*written = count * sizeof(*p32);
return 0;
@@ -132,7 +131,7 @@ xfs_ioctl32_bstime_copyin(
if (get_user(sec32, &bstime32->tv_sec) ||
get_user(bstime->tv_nsec, &bstime32->tv_nsec))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
bstime->tv_sec = sec32;
return 0;
}
@@ -161,10 +160,11 @@ xfs_ioctl32_bstat_copyin(
get_user(bstat->bs_gen, &bstat32->bs_gen) ||
get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) ||
get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) ||
+ get_user(bstat->bs_forkoff, &bstat32->bs_forkoff) ||
get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) ||
get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) ||
get_user(bstat->bs_aextents, &bstat32->bs_aextents))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
return 0;
}
@@ -180,7 +180,7 @@ xfs_bstime_store_compat(
sec32 = p->tv_sec;
if (put_user(sec32, &p32->tv_sec) ||
put_user(p->tv_nsec, &p32->tv_nsec))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
return 0;
}
@@ -195,7 +195,7 @@ xfs_bulkstat_one_fmt_compat(
compat_xfs_bstat_t __user *p32 = ubuffer;
if (ubsize < sizeof(*p32))
- return XFS_ERROR(ENOMEM);
+ return -ENOMEM;
if (put_user(buffer->bs_ino, &p32->bs_ino) ||
put_user(buffer->bs_mode, &p32->bs_mode) ||
@@ -215,10 +215,11 @@ xfs_bulkstat_one_fmt_compat(
put_user(buffer->bs_gen, &p32->bs_gen) ||
put_user(buffer->bs_projid, &p32->bs_projid) ||
put_user(buffer->bs_projid_hi, &p32->bs_projid_hi) ||
+ put_user(buffer->bs_forkoff, &p32->bs_forkoff) ||
put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) ||
put_user(buffer->bs_dmstate, &p32->bs_dmstate) ||
put_user(buffer->bs_aextents, &p32->bs_aextents))
- return XFS_ERROR(EFAULT);
+ return -EFAULT;
if (ubused)
*ubused = sizeof(*p32);
return 0;
@@ -256,30 +257,30 @@ xfs_compat_ioc_bulkstat(
/* should be called again (unused here, but used in dmapi) */
if (!capable(CAP_SYS_ADMIN))
- return -XFS_ERROR(EPERM);
+ return -EPERM;
if (XFS_FORCED_SHUTDOWN(mp))
- return -XFS_ERROR(EIO);
+ return -EIO;
if (get_user(addr, &p32->lastip))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
bulkreq.lastip = compat_ptr(addr);
if (get_user(bulkreq.icount, &p32->icount) ||
get_user(addr, &p32->ubuffer))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
bulkreq.ubuffer = compat_ptr(addr);
if (get_user(addr, &p32->ocount))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
bulkreq.ocount = compat_ptr(addr);
if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
if ((count = bulkreq.icount) <= 0)
- return -XFS_ERROR(EINVAL);
+ return -EINVAL;
if (bulkreq.ubuffer == NULL)
- return -XFS_ERROR(EINVAL);
+ return -EINVAL;
if (cmd == XFS_IOC_FSINUMBERS_32) {
error = xfs_inumbers(mp, &inlast, &count,
@@ -294,17 +295,17 @@ xfs_compat_ioc_bulkstat(
xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t),
bulkreq.ubuffer, &done);
} else
- error = XFS_ERROR(EINVAL);
+ error = -EINVAL;
if (error)
- return -error;
+ return error;
if (bulkreq.ocount != NULL) {
if (copy_to_user(bulkreq.lastip, &inlast,
sizeof(xfs_ino_t)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
}
return 0;
@@ -318,7 +319,7 @@ xfs_compat_handlereq_copyin(
compat_xfs_fsop_handlereq_t hreq32;
if (copy_from_user(&hreq32, arg32, sizeof(compat_xfs_fsop_handlereq_t)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
hreq->fd = hreq32.fd;
hreq->path = compat_ptr(hreq32.path);
@@ -352,19 +353,19 @@ xfs_compat_attrlist_by_handle(
char *kbuf;
if (!capable(CAP_SYS_ADMIN))
- return -XFS_ERROR(EPERM);
+ return -EPERM;
if (copy_from_user(&al_hreq, arg,
sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
if (al_hreq.buflen < sizeof(struct attrlist) ||
al_hreq.buflen > XATTR_LIST_MAX)
- return -XFS_ERROR(EINVAL);
+ return -EINVAL;
/*
* Reject flags, only allow namespaces.
*/
if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
- return -XFS_ERROR(EINVAL);
+ return -EINVAL;
dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq);
if (IS_ERR(dentry))
@@ -376,7 +377,7 @@ xfs_compat_attrlist_by_handle(
goto out_dput;
cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
- error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
+ error = xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
al_hreq.flags, cursor);
if (error)
goto out_kfree;
@@ -404,10 +405,10 @@ xfs_compat_attrmulti_by_handle(
unsigned char *attr_name;
if (!capable(CAP_SYS_ADMIN))
- return -XFS_ERROR(EPERM);
+ return -EPERM;
if (copy_from_user(&am_hreq, arg,
sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
/* overflow check */
if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t))
@@ -417,7 +418,7 @@ xfs_compat_attrmulti_by_handle(
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- error = E2BIG;
+ error = -E2BIG;
size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t);
if (!size || size > 16 * PAGE_SIZE)
goto out_dput;
@@ -428,7 +429,7 @@ xfs_compat_attrmulti_by_handle(
goto out_dput;
}
- error = ENOMEM;
+ error = -ENOMEM;
attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
if (!attr_name)
goto out_kfree_ops;
@@ -439,7 +440,7 @@ xfs_compat_attrmulti_by_handle(
compat_ptr(ops[i].am_attrname),
MAXNAMELEN);
if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
- error = ERANGE;
+ error = -ERANGE;
if (ops[i].am_error < 0)
break;
@@ -470,19 +471,19 @@ xfs_compat_attrmulti_by_handle(
mnt_drop_write_file(parfilp);
break;
default:
- ops[i].am_error = EINVAL;
+ ops[i].am_error = -EINVAL;
}
}
if (copy_to_user(compat_ptr(am_hreq.ops), ops, size))
- error = XFS_ERROR(EFAULT);
+ error = -EFAULT;
kfree(attr_name);
out_kfree_ops:
kfree(ops);
out_dput:
dput(dentry);
- return -error;
+ return error;
}
STATIC int
@@ -496,26 +497,26 @@ xfs_compat_fssetdm_by_handle(
struct dentry *dentry;
if (!capable(CAP_MKNOD))
- return -XFS_ERROR(EPERM);
+ return -EPERM;
if (copy_from_user(&dmhreq, arg,
sizeof(compat_xfs_fsop_setdm_handlereq_t)))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
- error = -XFS_ERROR(EPERM);
+ error = -EPERM;
goto out;
}
if (copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd))) {
- error = -XFS_ERROR(EFAULT);
+ error = -EFAULT;
goto out;
}
- error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
+ error = xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
fsd.fsd_dmstate);
out:
@@ -537,7 +538,7 @@ xfs_file_compat_ioctl(
int error;
if (filp->f_mode & FMODE_NOCMTIME)
- ioflags |= IO_INVIS;
+ ioflags |= XFS_IO_INVIS;
trace_xfs_file_compat_ioctl(ip);
@@ -588,7 +589,7 @@ xfs_file_compat_ioctl(
struct xfs_flock64 bf;
if (xfs_compat_flock64_copyin(&bf, arg))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
}
@@ -598,25 +599,25 @@ xfs_file_compat_ioctl(
struct xfs_growfs_data in;
if (xfs_compat_growfs_data_copyin(&in, arg))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
error = mnt_want_write_file(filp);
if (error)
return error;
error = xfs_growfs_data(mp, &in);
mnt_drop_write_file(filp);
- return -error;
+ return error;
}
case XFS_IOC_FSGROWFSRT_32: {
struct xfs_growfs_rt in;
if (xfs_compat_growfs_rt_copyin(&in, arg))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
error = mnt_want_write_file(filp);
if (error)
return error;
error = xfs_growfs_rt(mp, &in);
mnt_drop_write_file(filp);
- return -error;
+ return error;
}
#endif
/* long changes size, but xfs only copiese out 32 bits */
@@ -633,13 +634,13 @@ xfs_file_compat_ioctl(
if (copy_from_user(&sxp, sxu,
offsetof(struct xfs_swapext, sx_stat)) ||
xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
error = mnt_want_write_file(filp);
if (error)
return error;
error = xfs_ioc_swapext(&sxp);
mnt_drop_write_file(filp);
- return -error;
+ return error;
}
case XFS_IOC_FSBULKSTAT_32:
case XFS_IOC_FSBULKSTAT_SINGLE_32:
@@ -651,7 +652,7 @@ xfs_file_compat_ioctl(
struct xfs_fsop_handlereq hreq;
if (xfs_compat_handlereq_copyin(&hreq, arg))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq);
return xfs_find_handle(cmd, &hreq);
}
@@ -659,14 +660,14 @@ xfs_file_compat_ioctl(
struct xfs_fsop_handlereq hreq;
if (xfs_compat_handlereq_copyin(&hreq, arg))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
return xfs_open_by_handle(filp, &hreq);
}
case XFS_IOC_READLINK_BY_HANDLE_32: {
struct xfs_fsop_handlereq hreq;
if (xfs_compat_handlereq_copyin(&hreq, arg))
- return -XFS_ERROR(EFAULT);
+ return -EFAULT;
return xfs_readlink_by_handle(filp, &hreq);
}
case XFS_IOC_ATTRLIST_BY_HANDLE_32:
@@ -676,6 +677,6 @@ xfs_file_compat_ioctl(
case XFS_IOC_FSSETDM_BY_HANDLE_32:
return xfs_compat_fssetdm_by_handle(filp, arg);
default:
- return -XFS_ERROR(ENOIOCTLCMD);
+ return -ENOIOCTLCMD;
}
}
diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h
index 80f4060e8970..b1bb45444df8 100644
--- a/fs/xfs/xfs_ioctl32.h
+++ b/fs/xfs/xfs_ioctl32.h
@@ -67,8 +67,9 @@ typedef struct compat_xfs_bstat {
__u32 bs_gen; /* generation count */
__u16 bs_projid_lo; /* lower part of project id */
#define bs_projid bs_projid_lo /* (previously just bs_projid) */
+ __u16 bs_forkoff; /* inode fork offset in bytes */
__u16 bs_projid_hi; /* high part of project id */
- unsigned char bs_pad[12]; /* pad space, unused */
+ unsigned char bs_pad[10]; /* pad space, unused */
__u32 bs_dmevmask; /* DMIG event mask */
__u16 bs_dmstate; /* DMIG state info */
__u16 bs_aextents; /* attribute number of extents */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 6d3ec2b6ee29..afcf3c926565 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -110,7 +110,7 @@ xfs_alert_fsblock_zero(
(unsigned long long)imap->br_startoff,
(unsigned long long)imap->br_blockcount,
imap->br_state);
- return EFSCORRUPTED;
+ return -EFSCORRUPTED;
}
int
@@ -138,7 +138,7 @@ xfs_iomap_write_direct(
error = xfs_qm_dqattach(ip, 0);
if (error)
- return XFS_ERROR(error);
+ return error;
rt = XFS_IS_REALTIME_INODE(ip);
extsz = xfs_get_extsz_hint(ip);
@@ -148,7 +148,7 @@ xfs_iomap_write_direct(
if ((offset + count) > XFS_ISIZE(ip)) {
error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
if (error)
- return XFS_ERROR(error);
+ return error;
} else {
if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
last_fsb = MIN(last_fsb, (xfs_fileoff_t)
@@ -188,7 +188,7 @@ xfs_iomap_write_direct(
*/
if (error) {
xfs_trans_cancel(tp, 0);
- return XFS_ERROR(error);
+ return error;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -225,7 +225,7 @@ xfs_iomap_write_direct(
* Copy any maps to caller's array and return any error.
*/
if (nimaps == 0) {
- error = XFS_ERROR(ENOSPC);
+ error = -ENOSPC;
goto out_unlock;
}
@@ -397,15 +397,17 @@ xfs_quota_calc_throttle(
struct xfs_inode *ip,
int type,
xfs_fsblock_t *qblocks,
- int *qshift)
+ int *qshift,
+ int64_t *qfreesp)
{
int64_t freesp;
int shift = 0;
struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
- /* over hi wmark, squash the prealloc completely */
- if (dq->q_res_bcount >= dq->q_prealloc_hi_wmark) {
+ /* no dq, or over hi wmark, squash the prealloc completely */
+ if (!dq || dq->q_res_bcount >= dq->q_prealloc_hi_wmark) {
*qblocks = 0;
+ *qfreesp = 0;
return;
}
@@ -418,6 +420,9 @@ xfs_quota_calc_throttle(
shift += 2;
}
+ if (freesp < *qfreesp)
+ *qfreesp = freesp;
+
/* only overwrite the throttle values if we are more aggressive */
if ((freesp >> shift) < (*qblocks >> *qshift)) {
*qblocks = freesp;
@@ -476,15 +481,18 @@ xfs_iomap_prealloc_size(
}
/*
- * Check each quota to cap the prealloc size and provide a shift
- * value to throttle with.
+ * Check each quota to cap the prealloc size, provide a shift value to
+ * throttle with and adjust amount of available space.
*/
if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks))
- xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift);
+ xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift,
+ &freesp);
if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks))
- xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift);
+ xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift,
+ &freesp);
if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks))
- xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift);
+ xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift,
+ &freesp);
/*
* The final prealloc size is set to the minimum of free space available
@@ -552,7 +560,7 @@ xfs_iomap_write_delay(
*/
error = xfs_qm_dqattach_locked(ip, 0);
if (error)
- return XFS_ERROR(error);
+ return error;
extsz = xfs_get_extsz_hint(ip);
offset_fsb = XFS_B_TO_FSBT(mp, offset);
@@ -596,11 +604,11 @@ retry:
imap, &nimaps, XFS_BMAPI_ENTIRE);
switch (error) {
case 0:
- case ENOSPC:
- case EDQUOT:
+ case -ENOSPC:
+ case -EDQUOT:
break;
default:
- return XFS_ERROR(error);
+ return error;
}
/*
@@ -614,7 +622,7 @@ retry:
error = 0;
goto retry;
}
- return XFS_ERROR(error ? error : ENOSPC);
+ return error ? error : -ENOSPC;
}
if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
@@ -663,7 +671,7 @@ xfs_iomap_write_allocate(
*/
error = xfs_qm_dqattach(ip, 0);
if (error)
- return XFS_ERROR(error);
+ return error;
offset_fsb = XFS_B_TO_FSBT(mp, offset);
count_fsb = imap->br_blockcount;
@@ -690,7 +698,7 @@ xfs_iomap_write_allocate(
nres, 0);
if (error) {
xfs_trans_cancel(tp, 0);
- return XFS_ERROR(error);
+ return error;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
@@ -739,7 +747,7 @@ xfs_iomap_write_allocate(
if ((map_start_fsb + count_fsb) > last_block) {
count_fsb = last_block - map_start_fsb;
if (count_fsb == 0) {
- error = EAGAIN;
+ error = -EAGAIN;
goto trans_cancel;
}
}
@@ -793,7 +801,7 @@ trans_cancel:
xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
error0:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return XFS_ERROR(error);
+ return error;
}
int
@@ -853,7 +861,7 @@ xfs_iomap_write_unwritten(
resblks, 0);
if (error) {
xfs_trans_cancel(tp, 0);
- return XFS_ERROR(error);
+ return error;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -892,7 +900,7 @@ xfs_iomap_write_unwritten(
error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (error)
- return XFS_ERROR(error);
+ return error;
if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
return xfs_alert_fsblock_zero(ip, &imap);
@@ -915,5 +923,5 @@ error_on_bmapi_transaction:
xfs_bmap_cancel(&free_list);
xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return XFS_ERROR(error);
+ return error;
}
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 205613a06068..ec6dcdc181ee 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -72,7 +72,7 @@ xfs_initxattrs(
int error = 0;
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
- error = -xfs_attr_set(ip, xattr->name, xattr->value,
+ error = xfs_attr_set(ip, xattr->name, xattr->value,
xattr->value_len, ATTR_SECURE);
if (error < 0)
break;
@@ -93,7 +93,7 @@ xfs_init_security(
struct inode *dir,
const struct qstr *qstr)
{
- return -security_inode_init_security(inode, dir, qstr,
+ return security_inode_init_security(inode, dir, qstr,
&xfs_initxattrs, NULL);
}
@@ -173,12 +173,12 @@ xfs_generic_create(
#ifdef CONFIG_XFS_POSIX_ACL
if (default_acl) {
- error = -xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+ error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
if (error)
goto out_cleanup_inode;
}
if (acl) {
- error = -xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);
if (error)
goto out_cleanup_inode;
}
@@ -194,7 +194,7 @@ xfs_generic_create(
posix_acl_release(default_acl);
if (acl)
posix_acl_release(acl);
- return -error;
+ return error;
out_cleanup_inode:
if (!tmpfile)
@@ -248,8 +248,8 @@ xfs_vn_lookup(
xfs_dentry_to_name(&name, dentry, 0);
error = xfs_lookup(XFS_I(dir), &name, &cip, NULL);
if (unlikely(error)) {
- if (unlikely(error != ENOENT))
- return ERR_PTR(-error);
+ if (unlikely(error != -ENOENT))
+ return ERR_PTR(error);
d_add(dentry, NULL);
return NULL;
}
@@ -275,8 +275,8 @@ xfs_vn_ci_lookup(
xfs_dentry_to_name(&xname, dentry, 0);
error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name);
if (unlikely(error)) {
- if (unlikely(error != ENOENT))
- return ERR_PTR(-error);
+ if (unlikely(error != -ENOENT))
+ return ERR_PTR(error);
/*
* call d_add(dentry, NULL) here when d_drop_negative_children
* is called in xfs_vn_mknod (ie. allow negative dentries
@@ -311,7 +311,7 @@ xfs_vn_link(
error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
if (unlikely(error))
- return -error;
+ return error;
ihold(inode);
d_instantiate(dentry, inode);
@@ -328,7 +328,7 @@ xfs_vn_unlink(
xfs_dentry_to_name(&name, dentry, 0);
- error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode));
+ error = xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode));
if (error)
return error;
@@ -375,7 +375,7 @@ xfs_vn_symlink(
xfs_cleanup_inode(dir, inode, dentry);
iput(inode);
out:
- return -error;
+ return error;
}
STATIC int
@@ -392,8 +392,8 @@ xfs_vn_rename(
xfs_dentry_to_name(&oname, odentry, 0);
xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode);
- return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
- XFS_I(ndir), &nname, new_inode ?
+ return xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
+ XFS_I(ndir), &nname, new_inode ?
XFS_I(new_inode) : NULL);
}
@@ -414,7 +414,7 @@ xfs_vn_follow_link(
if (!link)
goto out_err;
- error = -xfs_readlink(XFS_I(dentry->d_inode), link);
+ error = xfs_readlink(XFS_I(dentry->d_inode), link);
if (unlikely(error))
goto out_kfree;
@@ -441,7 +441,7 @@ xfs_vn_getattr(
trace_xfs_getattr(ip);
if (XFS_FORCED_SHUTDOWN(mp))
- return -XFS_ERROR(EIO);
+ return -EIO;
stat->size = XFS_ISIZE(ip);
stat->dev = inode->i_sb->s_dev;
@@ -546,14 +546,14 @@ xfs_setattr_nonsize(
/* If acls are being inherited, we already have this checked */
if (!(flags & XFS_ATTR_NOACL)) {
if (mp->m_flags & XFS_MOUNT_RDONLY)
- return XFS_ERROR(EROFS);
+ return -EROFS;
if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
+ return -EIO;
- error = -inode_change_ok(inode, iattr);
+ error = inode_change_ok(inode, iattr);
if (error)
- return XFS_ERROR(error);
+ return error;
}
ASSERT((mask & ATTR_SIZE) == 0);
@@ -703,7 +703,7 @@ xfs_setattr_nonsize(
xfs_qm_dqrele(gdqp);
if (error)
- return XFS_ERROR(error);
+ return error;
/*
* XXX(hch): Updating the ACL entries is not atomic vs the i_mode
@@ -713,9 +713,9 @@ xfs_setattr_nonsize(
* Posix ACL code seems to care about this issue either.
*/
if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) {
- error = -posix_acl_chmod(inode, inode->i_mode);
+ error = posix_acl_chmod(inode, inode->i_mode);
if (error)
- return XFS_ERROR(error);
+ return error;
}
return 0;
@@ -748,14 +748,14 @@ xfs_setattr_size(
trace_xfs_setattr(ip);
if (mp->m_flags & XFS_MOUNT_RDONLY)
- return XFS_ERROR(EROFS);
+ return -EROFS;
if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
+ return -EIO;
- error = -inode_change_ok(inode, iattr);
+ error = inode_change_ok(inode, iattr);
if (error)
- return XFS_ERROR(error);
+ return error;
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
ASSERT(S_ISREG(ip->i_d.di_mode));
@@ -818,7 +818,7 @@ xfs_setattr_size(
* care about here.
*/
if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
- error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+ error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
ip->i_d.di_size, newsize);
if (error)
return error;
@@ -844,11 +844,41 @@ xfs_setattr_size(
* much we can do about this, except to hope that the caller sees ENOMEM
* and retries the truncate operation.
*/
- error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
+ error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
if (error)
return error;
truncate_setsize(inode, newsize);
+ /*
+ * The "we can't serialise against page faults" pain gets worse.
+ *
+ * If the file is mapped then we have to clean the page at the old EOF
+ * when extending the file. Extending the file can expose changes the
+ * underlying page mapping (e.g. from beyond EOF to a hole or
+ * unwritten), and so on the next attempt to write to that page we need
+ * to remap it for write. i.e. we need .page_mkwrite() to be called.
+ * Hence we need to clean the page to clean the pte and so a new write
+ * fault will be triggered appropriately.
+ *
+ * If we do it before we change the inode size, then we can race with a
+ * page fault that maps the page with exactly the same problem. If we do
+ * it after we change the file size, then a new page fault can come in
+ * and allocate space before we've run the rest of the truncate
+ * transaction. That's kinda grotesque, but it's better than have data
+ * over a hole, and so that's the lesser evil that has been chosen here.
+ *
+ * The real solution, however, is to have some mechanism for locking out
+ * page faults while a truncate is in progress.
+ */
+ if (newsize > oldsize && mapping_mapped(VFS_I(ip)->i_mapping)) {
+ error = filemap_write_and_wait_range(
+ VFS_I(ip)->i_mapping,
+ round_down(oldsize, PAGE_CACHE_SIZE),
+ round_up(oldsize, PAGE_CACHE_SIZE) - 1);
+ if (error)
+ return error;
+ }
+
tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
if (error)
@@ -950,7 +980,7 @@ xfs_vn_setattr(
error = xfs_setattr_nonsize(ip, iattr, 0);
}
- return -error;
+ return error;
}
STATIC int
@@ -970,7 +1000,7 @@ xfs_vn_update_time(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
if (error) {
xfs_trans_cancel(tp, 0);
- return -error;
+ return error;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -991,7 +1021,7 @@ xfs_vn_update_time(
}
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
- return -xfs_trans_commit(tp, 0);
+ return xfs_trans_commit(tp, 0);
}
#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
@@ -1036,7 +1066,7 @@ xfs_fiemap_format(
*full = 1; /* user array now full */
}
- return -error;
+ return error;
}
STATIC int
@@ -1055,12 +1085,12 @@ xfs_vn_fiemap(
return error;
/* Set up bmap header for xfs internal routine */
- bm.bmv_offset = BTOBB(start);
+ bm.bmv_offset = BTOBBT(start);
/* Special case for whole file */
if (length == FIEMAP_MAX_OFFSET)
bm.bmv_length = -1LL;
else
- bm.bmv_length = BTOBB(length);
+ bm.bmv_length = BTOBB(start + length) - bm.bmv_offset;
/* We add one because in getbmap world count includes the header */
bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
@@ -1075,7 +1105,7 @@ xfs_vn_fiemap(
error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo);
if (error)
- return -error;
+ return error;
return 0;
}
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index cb64f222d607..894924a5129b 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -67,19 +67,17 @@ xfs_bulkstat_one_int(
*stat = BULKSTAT_RV_NOTHING;
if (!buffer || xfs_internal_inum(mp, ino))
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL);
if (!buf)
- return XFS_ERROR(ENOMEM);
+ return -ENOMEM;
error = xfs_iget(mp, NULL, ino,
(XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED),
XFS_ILOCK_SHARED, &ip);
- if (error) {
- *stat = BULKSTAT_RV_NOTHING;
+ if (error)
goto out_free;
- }
ASSERT(ip != NULL);
ASSERT(ip->i_imap.im_blkno != 0);
@@ -136,7 +134,6 @@ xfs_bulkstat_one_int(
IRELE(ip);
error = formatter(buffer, ubsize, ubused, buf);
-
if (!error)
*stat = BULKSTAT_RV_DIDONE;
@@ -154,9 +151,9 @@ xfs_bulkstat_one_fmt(
const xfs_bstat_t *buffer)
{
if (ubsize < sizeof(*buffer))
- return XFS_ERROR(ENOMEM);
+ return -ENOMEM;
if (copy_to_user(ubuffer, buffer, sizeof(*buffer)))
- return XFS_ERROR(EFAULT);
+ return -EFAULT;
if (ubused)
*ubused = sizeof(*buffer);
return 0;
@@ -175,8 +172,172 @@ xfs_bulkstat_one(
xfs_bulkstat_one_fmt, ubused, stat);
}
+/*
+ * Loop over all clusters in a chunk for a given incore inode allocation btree
+ * record. Do a readahead if there are any allocated inodes in that cluster.
+ */
+STATIC void
+xfs_bulkstat_ichunk_ra(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ struct xfs_inobt_rec_incore *irec)
+{
+ xfs_agblock_t agbno;
+ struct blk_plug plug;
+ int blks_per_cluster;
+ int inodes_per_cluster;
+ int i; /* inode chunk index */
+
+ agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino);
+ blks_per_cluster = xfs_icluster_size_fsb(mp);
+ inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
+
+ blk_start_plug(&plug);
+ for (i = 0; i < XFS_INODES_PER_CHUNK;
+ i += inodes_per_cluster, agbno += blks_per_cluster) {
+ if (xfs_inobt_maskn(i, inodes_per_cluster) & ~irec->ir_free) {
+ xfs_btree_reada_bufs(mp, agno, agbno, blks_per_cluster,
+ &xfs_inode_buf_ops);
+ }
+ }
+ blk_finish_plug(&plug);
+}
+
+/*
+ * Lookup the inode chunk that the given inode lives in and then get the record
+ * if we found the chunk. If the inode was not the last in the chunk and there
+ * are some left allocated, update the data for the pointed-to record as well as
+ * return the count of grabbed inodes.
+ */
+STATIC int
+xfs_bulkstat_grab_ichunk(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agino_t agino, /* starting inode of chunk */
+ int *icount,/* return # of inodes grabbed */
+ struct xfs_inobt_rec_incore *irec) /* btree record */
+{
+ int idx; /* index into inode chunk */
+ int stat;
+ int error = 0;
+
+ /* Lookup the inode chunk that this inode lives in */
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &stat);
+ if (error)
+ return error;
+ if (!stat) {
+ *icount = 0;
+ return error;
+ }
+
+ /* Get the record, should always work */
+ error = xfs_inobt_get_rec(cur, irec, &stat);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(stat == 1);
+
+ /* Check if the record contains the inode in request */
+ if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) {
+ *icount = 0;
+ return 0;
+ }
+
+ idx = agino - irec->ir_startino + 1;
+ if (idx < XFS_INODES_PER_CHUNK &&
+ (xfs_inobt_maskn(idx, XFS_INODES_PER_CHUNK - idx) & ~irec->ir_free)) {
+ int i;
+
+ /* We got a right chunk with some left inodes allocated at it.
+ * Grab the chunk record. Mark all the uninteresting inodes
+ * free -- because they're before our start point.
+ */
+ for (i = 0; i < idx; i++) {
+ if (XFS_INOBT_MASK(i) & ~irec->ir_free)
+ irec->ir_freecount++;
+ }
+
+ irec->ir_free |= xfs_inobt_maskn(0, idx);
+ *icount = XFS_INODES_PER_CHUNK - irec->ir_freecount;
+ }
+
+ return 0;
+}
+
#define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size)
+struct xfs_bulkstat_agichunk {
+ char __user **ac_ubuffer;/* pointer into user's buffer */
+ int ac_ubleft; /* bytes left in user's buffer */
+ int ac_ubelem; /* spaces used in user's buffer */
+};
+
+/*
+ * Process inodes in chunk with a pointer to a formatter function
+ * that will iget the inode and fill in the appropriate structure.
+ */
+static int
+xfs_bulkstat_ag_ichunk(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ struct xfs_inobt_rec_incore *irbp,
+ bulkstat_one_pf formatter,
+ size_t statstruct_size,
+ struct xfs_bulkstat_agichunk *acp,
+ xfs_agino_t *last_agino)
+{
+ char __user **ubufp = acp->ac_ubuffer;
+ int chunkidx;
+ int error = 0;
+ xfs_agino_t agino = irbp->ir_startino;
+
+ for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK;
+ chunkidx++, agino++) {
+ int fmterror;
+ int ubused;
+
+ /* inode won't fit in buffer, we are done */
+ if (acp->ac_ubleft < statstruct_size)
+ break;
+
+ /* Skip if this inode is free */
+ if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free)
+ continue;
+
+ /* Get the inode and fill in a single buffer */
+ ubused = statstruct_size;
+ error = formatter(mp, XFS_AGINO_TO_INO(mp, agno, agino),
+ *ubufp, acp->ac_ubleft, &ubused, &fmterror);
+
+ if (fmterror == BULKSTAT_RV_GIVEUP ||
+ (error && error != -ENOENT && error != -EINVAL)) {
+ acp->ac_ubleft = 0;
+ ASSERT(error);
+ break;
+ }
+
+ /* be careful not to leak error if at end of chunk */
+ if (fmterror == BULKSTAT_RV_NOTHING || error) {
+ error = 0;
+ continue;
+ }
+
+ *ubufp += ubused;
+ acp->ac_ubleft -= ubused;
+ acp->ac_ubelem++;
+ }
+
+ /*
+ * Post-update *last_agino. At this point, agino will always point one
+ * inode past the last inode we processed successfully. Hence we
+ * substract that inode when setting the *last_agino cursor so that we
+ * return the correct cookie to userspace. On the next bulkstat call,
+ * the inode under the lastino cookie will be skipped as we have already
+ * processed it here.
+ */
+ *last_agino = agino - 1;
+
+ return error;
+}
+
/*
* Return stat information in bulk (by-inode) for the filesystem.
*/
@@ -190,63 +351,41 @@ xfs_bulkstat(
char __user *ubuffer, /* buffer with inode stats */
int *done) /* 1 if there are more stats to get */
{
- xfs_agblock_t agbno=0;/* allocation group block number */
xfs_buf_t *agbp; /* agi header buffer */
xfs_agi_t *agi; /* agi header data */
xfs_agino_t agino; /* inode # in allocation group */
xfs_agnumber_t agno; /* allocation group number */
- int chunkidx; /* current index into inode chunk */
- int clustidx; /* current index into inode cluster */
xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */
- int end_of_ag; /* set if we've seen the ag end */
- int error; /* error code */
- int fmterror;/* bulkstat formatter result */
- int i; /* loop index */
- int icount; /* count of inodes good in irbuf */
size_t irbsize; /* size of irec buffer in bytes */
- xfs_ino_t ino; /* inode number (filesystem) */
- xfs_inobt_rec_incore_t *irbp; /* current irec buffer pointer */
xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */
- xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */
- xfs_ino_t lastino; /* last inode number returned */
- int blks_per_cluster; /* # of blocks per cluster */
- int inodes_per_cluster;/* # of inodes per cluster */
int nirbuf; /* size of irbuf */
- int rval; /* return value error code */
- int tmp; /* result value from btree calls */
int ubcount; /* size of user's buffer */
- int ubleft; /* bytes left in user's buffer */
- char __user *ubufp; /* pointer into user's buffer */
- int ubelem; /* spaces used in user's buffer */
- int ubused; /* bytes used by formatter */
+ struct xfs_bulkstat_agichunk ac;
+ int error = 0;
/*
* Get the last inode value, see if there's nothing to do.
*/
- ino = (xfs_ino_t)*lastinop;
- lastino = ino;
- agno = XFS_INO_TO_AGNO(mp, ino);
- agino = XFS_INO_TO_AGINO(mp, ino);
+ agno = XFS_INO_TO_AGNO(mp, *lastinop);
+ agino = XFS_INO_TO_AGINO(mp, *lastinop);
if (agno >= mp->m_sb.sb_agcount ||
- ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+ *lastinop != XFS_AGINO_TO_INO(mp, agno, agino)) {
*done = 1;
*ubcountp = 0;
return 0;
}
- if (!ubcountp || *ubcountp <= 0) {
- return EINVAL;
- }
+
ubcount = *ubcountp; /* statstruct's */
- ubleft = ubcount * statstruct_size; /* bytes */
- *ubcountp = ubelem = 0;
+ ac.ac_ubuffer = &ubuffer;
+ ac.ac_ubleft = ubcount * statstruct_size; /* bytes */;
+ ac.ac_ubelem = 0;
+
+ *ubcountp = 0;
*done = 0;
- fmterror = 0;
- ubufp = ubuffer;
- blks_per_cluster = xfs_icluster_size_fsb(mp);
- inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
+
irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
if (!irbuf)
- return ENOMEM;
+ return -ENOMEM;
nirbuf = irbsize / sizeof(*irbuf);
@@ -254,122 +393,60 @@ xfs_bulkstat(
* Loop over the allocation groups, starting from the last
* inode returned; 0 means start of the allocation group.
*/
- rval = 0;
- while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
- cond_resched();
+ while (agno < mp->m_sb.sb_agcount) {
+ struct xfs_inobt_rec_incore *irbp = irbuf;
+ struct xfs_inobt_rec_incore *irbufend = irbuf + nirbuf;
+ bool end_of_ag = false;
+ int icount = 0;
+ int stat;
+
error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
- if (error) {
- /*
- * Skip this allocation group and go to the next one.
- */
- agno++;
- agino = 0;
- continue;
- }
+ if (error)
+ break;
agi = XFS_BUF_TO_AGI(agbp);
/*
* Allocate and initialize a btree cursor for ialloc btree.
*/
cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno,
XFS_BTNUM_INO);
- irbp = irbuf;
- irbufend = irbuf + nirbuf;
- end_of_ag = 0;
- /*
- * If we're returning in the middle of an allocation group,
- * we need to get the remainder of the chunk we're in.
- */
if (agino > 0) {
- xfs_inobt_rec_incore_t r;
-
/*
- * Lookup the inode chunk that this inode lives in.
+ * In the middle of an allocation group, we need to get
+ * the remainder of the chunk we're in.
*/
- error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE,
- &tmp);
- if (!error && /* no I/O error */
- tmp && /* lookup succeeded */
- /* got the record, should always work */
- !(error = xfs_inobt_get_rec(cur, &r, &i)) &&
- i == 1 &&
- /* this is the right chunk */
- agino < r.ir_startino + XFS_INODES_PER_CHUNK &&
- /* lastino was not last in chunk */
- (chunkidx = agino - r.ir_startino + 1) <
- XFS_INODES_PER_CHUNK &&
- /* there are some left allocated */
- xfs_inobt_maskn(chunkidx,
- XFS_INODES_PER_CHUNK - chunkidx) &
- ~r.ir_free) {
- /*
- * Grab the chunk record. Mark all the
- * uninteresting inodes (because they're
- * before our start point) free.
- */
- for (i = 0; i < chunkidx; i++) {
- if (XFS_INOBT_MASK(i) & ~r.ir_free)
- r.ir_freecount++;
- }
- r.ir_free |= xfs_inobt_maskn(0, chunkidx);
+ struct xfs_inobt_rec_incore r;
+
+ error = xfs_bulkstat_grab_ichunk(cur, agino, &icount, &r);
+ if (error)
+ goto del_cursor;
+ if (icount) {
irbp->ir_startino = r.ir_startino;
irbp->ir_freecount = r.ir_freecount;
irbp->ir_free = r.ir_free;
irbp++;
- agino = r.ir_startino + XFS_INODES_PER_CHUNK;
- icount = XFS_INODES_PER_CHUNK - r.ir_freecount;
- } else {
- /*
- * If any of those tests failed, bump the
- * inode number (just in case).
- */
- agino++;
- icount = 0;
}
- /*
- * In any case, increment to the next record.
- */
- if (!error)
- error = xfs_btree_increment(cur, 0, &tmp);
+ /* Increment to the next record */
+ error = xfs_btree_increment(cur, 0, &stat);
} else {
- /*
- * Start of ag. Lookup the first inode chunk.
- */
- error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp);
- icount = 0;
+ /* Start of ag. Lookup the first inode chunk */
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &stat);
}
+ if (error || stat == 0) {
+ end_of_ag = true;
+ goto del_cursor;
+ }
+
/*
* Loop through inode btree records in this ag,
* until we run out of inodes or space in the buffer.
*/
while (irbp < irbufend && icount < ubcount) {
- xfs_inobt_rec_incore_t r;
-
- /*
- * Loop as long as we're unable to read the
- * inode btree.
- */
- while (error) {
- agino += XFS_INODES_PER_CHUNK;
- if (XFS_AGINO_TO_AGBNO(mp, agino) >=
- be32_to_cpu(agi->agi_length))
- break;
- error = xfs_inobt_lookup(cur, agino,
- XFS_LOOKUP_GE, &tmp);
- cond_resched();
- }
- /*
- * If ran off the end of the ag either with an error,
- * or the normal way, set end and stop collecting.
- */
- if (error) {
- end_of_ag = 1;
- break;
- }
+ struct xfs_inobt_rec_incore r;
- error = xfs_inobt_get_rec(cur, &r, &i);
- if (error || i == 0) {
- end_of_ag = 1;
- break;
+ error = xfs_inobt_get_rec(cur, &r, &stat);
+ if (error || stat == 0) {
+ end_of_ag = true;
+ goto del_cursor;
}
/*
@@ -377,193 +454,92 @@ xfs_bulkstat(
* Also start read-ahead now for this chunk.
*/
if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
- struct blk_plug plug;
- /*
- * Loop over all clusters in the next chunk.
- * Do a readahead if there are any allocated
- * inodes in that cluster.
- */
- blk_start_plug(&plug);
- agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
- for (chunkidx = 0;
- chunkidx < XFS_INODES_PER_CHUNK;
- chunkidx += inodes_per_cluster,
- agbno += blks_per_cluster) {
- if (xfs_inobt_maskn(chunkidx,
- inodes_per_cluster) & ~r.ir_free)
- xfs_btree_reada_bufs(mp, agno,
- agbno, blks_per_cluster,
- &xfs_inode_buf_ops);
- }
- blk_finish_plug(&plug);
+ xfs_bulkstat_ichunk_ra(mp, agno, &r);
irbp->ir_startino = r.ir_startino;
irbp->ir_freecount = r.ir_freecount;
irbp->ir_free = r.ir_free;
irbp++;
icount += XFS_INODES_PER_CHUNK - r.ir_freecount;
}
- /*
- * Set agino to after this chunk and bump the cursor.
- */
- agino = r.ir_startino + XFS_INODES_PER_CHUNK;
- error = xfs_btree_increment(cur, 0, &tmp);
+ error = xfs_btree_increment(cur, 0, &stat);
+ if (error || stat == 0) {
+ end_of_ag = true;
+ goto del_cursor;
+ }
cond_resched();
}
+
/*
- * Drop the btree buffers and the agi buffer.
- * We can't hold any of the locks these represent
- * when calling iget.
+ * Drop the btree buffers and the agi buffer as we can't hold any
+ * of the locks these represent when calling iget. If there is a
+ * pending error, then we are done.
*/
+del_cursor:
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
xfs_buf_relse(agbp);
+ if (error)
+ break;
/*
- * Now format all the good inodes into the user's buffer.
+ * Now format all the good inodes into the user's buffer. The
+ * call to xfs_bulkstat_ag_ichunk() sets up the agino pointer
+ * for the next loop iteration.
*/
irbufend = irbp;
for (irbp = irbuf;
- irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) {
- /*
- * Now process this chunk of inodes.
- */
- for (agino = irbp->ir_startino, chunkidx = clustidx = 0;
- XFS_BULKSTAT_UBLEFT(ubleft) &&
- irbp->ir_freecount < XFS_INODES_PER_CHUNK;
- chunkidx++, clustidx++, agino++) {
- ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
-
- ino = XFS_AGINO_TO_INO(mp, agno, agino);
- /*
- * Skip if this inode is free.
- */
- if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) {
- lastino = ino;
- continue;
- }
- /*
- * Count used inodes as free so we can tell
- * when the chunk is used up.
- */
- irbp->ir_freecount++;
-
- /*
- * Get the inode and fill in a single buffer.
- */
- ubused = statstruct_size;
- error = formatter(mp, ino, ubufp, ubleft,
- &ubused, &fmterror);
- if (fmterror == BULKSTAT_RV_NOTHING) {
- if (error && error != ENOENT &&
- error != EINVAL) {
- ubleft = 0;
- rval = error;
- break;
- }
- lastino = ino;
- continue;
- }
- if (fmterror == BULKSTAT_RV_GIVEUP) {
- ubleft = 0;
- ASSERT(error);
- rval = error;
- break;
- }
- if (ubufp)
- ubufp += ubused;
- ubleft -= ubused;
- ubelem++;
- lastino = ino;
- }
+ irbp < irbufend && ac.ac_ubleft >= statstruct_size;
+ irbp++) {
+ error = xfs_bulkstat_ag_ichunk(mp, agno, irbp,
+ formatter, statstruct_size, &ac,
+ &agino);
+ if (error)
+ break;
cond_resched();
}
+
/*
- * Set up for the next loop iteration.
+ * If we've run out of space or had a formatting error, we
+ * are now done
*/
- if (XFS_BULKSTAT_UBLEFT(ubleft)) {
- if (end_of_ag) {
- agno++;
- agino = 0;
- } else
- agino = XFS_INO_TO_AGINO(mp, lastino);
- } else
+ if (ac.ac_ubleft < statstruct_size || error)
break;
+
+ if (end_of_ag) {
+ agno++;
+ agino = 0;
+ }
}
/*
* Done, we're either out of filesystem or space to put the data.
*/
kmem_free(irbuf);
- *ubcountp = ubelem;
+ *ubcountp = ac.ac_ubelem;
+
/*
- * Found some inodes, return them now and return the error next time.
+ * We found some inodes, so clear the error status and return them.
+ * The lastino pointer will point directly at the inode that triggered
+ * any error that occurred, so on the next call the error will be
+ * triggered again and propagated to userspace as there will be no
+ * formatted inodes in the buffer.
*/
- if (ubelem)
- rval = 0;
- if (agno >= mp->m_sb.sb_agcount) {
- /*
- * If we ran out of filesystem, mark lastino as off
- * the end of the filesystem, so the next call
- * will return immediately.
- */
- *lastinop = (xfs_ino_t)XFS_AGINO_TO_INO(mp, agno, 0);
- *done = 1;
- } else
- *lastinop = (xfs_ino_t)lastino;
-
- return rval;
-}
-
-/*
- * Return stat information in bulk (by-inode) for the filesystem.
- * Special case for non-sequential one inode bulkstat.
- */
-int /* error status */
-xfs_bulkstat_single(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_ino_t *lastinop, /* inode to return */
- char __user *buffer, /* buffer with inode stats */
- int *done) /* 1 if there are more stats to get */
-{
- int count; /* count value for bulkstat call */
- int error; /* return value */
- xfs_ino_t ino; /* filesystem inode number */
- int res; /* result from bs1 */
+ if (ac.ac_ubelem)
+ error = 0;
/*
- * note that requesting valid inode numbers which are not allocated
- * to inodes will most likely cause xfs_imap_to_bp to generate warning
- * messages about bad magic numbers. This is ok. The fact that
- * the inode isn't actually an inode is handled by the
- * error check below. Done this way to make the usual case faster
- * at the expense of the error case.
+ * If we ran out of filesystem, lastino will point off the end of
+ * the filesystem so the next call will return immediately.
*/
+ *lastinop = XFS_AGINO_TO_INO(mp, agno, agino);
+ if (agno >= mp->m_sb.sb_agcount)
+ *done = 1;
- ino = *lastinop;
- error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t),
- NULL, &res);
- if (error) {
- /*
- * Special case way failed, do it the "long" way
- * to see if that works.
- */
- (*lastinop)--;
- count = 1;
- if (xfs_bulkstat(mp, lastinop, &count, xfs_bulkstat_one,
- sizeof(xfs_bstat_t), buffer, done))
- return error;
- if (count == 0 || (xfs_ino_t)*lastinop != ino)
- return error == EFSCORRUPTED ?
- XFS_ERROR(EINVAL) : error;
- else
- return 0;
- }
- *done = 0;
- return 0;
+ return error;
}
int
xfs_inumbers_fmt(
void __user *ubuffer, /* buffer to write to */
- const xfs_inogrp_t *buffer, /* buffer to read from */
+ const struct xfs_inogrp *buffer, /* buffer to read from */
long count, /* # of elements to read */
long *written) /* # of bytes written */
{
@@ -578,127 +554,105 @@ xfs_inumbers_fmt(
*/
int /* error status */
xfs_inumbers(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_ino_t *lastino, /* last inode returned */
- int *count, /* size of buffer/count returned */
- void __user *ubuffer,/* buffer with inode descriptions */
- inumbers_fmt_pf formatter)
+ struct xfs_mount *mp,/* mount point for filesystem */
+ xfs_ino_t *lastino,/* last inode returned */
+ int *count,/* size of buffer/count returned */
+ void __user *ubuffer,/* buffer with inode descriptions */
+ inumbers_fmt_pf formatter)
{
- xfs_buf_t *agbp;
- xfs_agino_t agino;
- xfs_agnumber_t agno;
- int bcount;
- xfs_inogrp_t *buffer;
- int bufidx;
- xfs_btree_cur_t *cur;
- int error;
- xfs_inobt_rec_incore_t r;
- int i;
- xfs_ino_t ino;
- int left;
- int tmp;
-
- ino = (xfs_ino_t)*lastino;
- agno = XFS_INO_TO_AGNO(mp, ino);
- agino = XFS_INO_TO_AGINO(mp, ino);
- left = *count;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, *lastino);
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, *lastino);
+ struct xfs_btree_cur *cur = NULL;
+ struct xfs_buf *agbp = NULL;
+ struct xfs_inogrp *buffer;
+ int bcount;
+ int left = *count;
+ int bufidx = 0;
+ int error = 0;
+
*count = 0;
+ if (agno >= mp->m_sb.sb_agcount ||
+ *lastino != XFS_AGINO_TO_INO(mp, agno, agino))
+ return error;
+
bcount = MIN(left, (int)(PAGE_SIZE / sizeof(*buffer)));
buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP);
- error = bufidx = 0;
- cur = NULL;
- agbp = NULL;
- while (left > 0 && agno < mp->m_sb.sb_agcount) {
- if (agbp == NULL) {
+ do {
+ struct xfs_inobt_rec_incore r;
+ int stat;
+
+ if (!agbp) {
error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
- if (error) {
- /*
- * If we can't read the AGI of this ag,
- * then just skip to the next one.
- */
- ASSERT(cur == NULL);
- agbp = NULL;
- agno++;
- agino = 0;
- continue;
- }
+ if (error)
+ break;
+
cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno,
XFS_BTNUM_INO);
error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE,
- &tmp);
- if (error) {
- xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
- cur = NULL;
- xfs_buf_relse(agbp);
- agbp = NULL;
- /*
- * Move up the last inode in the current
- * chunk. The lookup_ge will always get
- * us the first inode in the next chunk.
- */
- agino += XFS_INODES_PER_CHUNK - 1;
- continue;
- }
- }
- error = xfs_inobt_get_rec(cur, &r, &i);
- if (error || i == 0) {
- xfs_buf_relse(agbp);
- agbp = NULL;
- xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
- cur = NULL;
- agno++;
- agino = 0;
- continue;
+ &stat);
+ if (error)
+ break;
+ if (!stat)
+ goto next_ag;
}
+
+ error = xfs_inobt_get_rec(cur, &r, &stat);
+ if (error)
+ break;
+ if (!stat)
+ goto next_ag;
+
agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
buffer[bufidx].xi_startino =
XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
buffer[bufidx].xi_alloccount =
XFS_INODES_PER_CHUNK - r.ir_freecount;
buffer[bufidx].xi_allocmask = ~r.ir_free;
- bufidx++;
- left--;
- if (bufidx == bcount) {
- long written;
- if (formatter(ubuffer, buffer, bufidx, &written)) {
- error = XFS_ERROR(EFAULT);
+ if (++bufidx == bcount) {
+ long written;
+
+ error = formatter(ubuffer, buffer, bufidx, &written);
+ if (error)
break;
- }
ubuffer += written;
*count += bufidx;
bufidx = 0;
}
- if (left) {
- error = xfs_btree_increment(cur, 0, &tmp);
- if (error) {
- xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
- cur = NULL;
- xfs_buf_relse(agbp);
- agbp = NULL;
- /*
- * The agino value has already been bumped.
- * Just try to skip up to it.
- */
- agino += XFS_INODES_PER_CHUNK;
- continue;
- }
- }
- }
+ if (!--left)
+ break;
+
+ error = xfs_btree_increment(cur, 0, &stat);
+ if (error)
+ break;
+ if (stat)
+ continue;
+
+next_ag:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ cur = NULL;
+ xfs_buf_relse(agbp);
+ agbp = NULL;
+ agino = 0;
+ agno++;
+ } while (agno < mp->m_sb.sb_agcount);
+
if (!error) {
if (bufidx) {
- long written;
- if (formatter(ubuffer, buffer, bufidx, &written))
- error = XFS_ERROR(EFAULT);
- else
+ long written;
+
+ error = formatter(ubuffer, buffer, bufidx, &written);
+ if (!error)
*count += bufidx;
}
*lastino = XFS_AGINO_TO_INO(mp, agno, agino);
}
+
kmem_free(buffer);
if (cur)
xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR :
XFS_BTREE_NOERROR));
if (agbp)
xfs_buf_relse(agbp);
+
return error;
}
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 97295d91d170..6ea8b3912fa4 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -50,13 +50,6 @@ xfs_bulkstat(
char __user *ubuffer,/* buffer with inode stats */
int *done); /* 1 if there are more stats to get */
-int
-xfs_bulkstat_single(
- xfs_mount_t *mp,
- xfs_ino_t *lastinop,
- char __user *buffer,
- int *done);
-
typedef int (*bulkstat_one_fmt_pf)( /* used size in bytes or negative error */
void __user *ubuffer, /* buffer to write to */
int ubsize, /* remaining user buffer sz */
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 825249d2dfc1..6a51619d8690 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -21,18 +21,6 @@
#include <linux/types.h>
/*
- * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits.
- * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set.
- */
-#if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64)
-# define XFS_BIG_BLKNOS 1
-# define XFS_BIG_INUMS 1
-#else
-# define XFS_BIG_BLKNOS 0
-# define XFS_BIG_INUMS 0
-#endif
-
-/*
* Kernel specific type declarations for XFS
*/
typedef signed char __int8_t;
@@ -68,7 +56,6 @@ typedef __uint64_t __psunsigned_t;
#include "kmem.h"
#include "mrlock.h"
-#include "time.h"
#include "uuid.h"
#include <linux/semaphore.h>
@@ -113,7 +100,7 @@ typedef __uint64_t __psunsigned_t;
#include <asm/byteorder.h>
#include <asm/unaligned.h>
-#include "xfs_vnode.h"
+#include "xfs_fs.h"
#include "xfs_stats.h"
#include "xfs_sysctl.h"
#include "xfs_iops.h"
@@ -191,6 +178,22 @@ typedef __uint64_t __psunsigned_t;
#define MAX(a,b) (max(a,b))
#define howmany(x, y) (((x)+((y)-1))/(y))
+static inline void delay(long ticks)
+{
+ schedule_timeout_uninterruptible(ticks);
+}
+
+/*
+ * XFS wrapper structure for sysfs support. It depends on external data
+ * structures and is embedded in various internal data structures to implement
+ * the XFS sysfs object heirarchy. Define it here for broad access throughout
+ * the codebase.
+ */
+struct xfs_kobj {
+ struct kobject kobject;
+ struct completion complete;
+};
+
/* Kernel uid/gid conversion. These are used to convert to/from the on disk
* uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally.
* The conversion here is type only, the value will remain the same since we
@@ -331,7 +334,7 @@ static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y)
{
x += y - 1;
do_div(x, y);
- return(x * y);
+ return x * y;
}
static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 292308dede6d..fe88ef67f93a 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -34,6 +34,7 @@
#include "xfs_trace.h"
#include "xfs_fsops.h"
#include "xfs_cksum.h"
+#include "xfs_sysfs.h"
kmem_zone_t *xfs_log_ticket_zone;
@@ -283,7 +284,7 @@ xlog_grant_head_wait(
return 0;
shutdown:
list_del_init(&tic->t_queue);
- return XFS_ERROR(EIO);
+ return -EIO;
}
/*
@@ -377,7 +378,7 @@ xfs_log_regrant(
int error = 0;
if (XLOG_FORCED_SHUTDOWN(log))
- return XFS_ERROR(EIO);
+ return -EIO;
XFS_STATS_INC(xs_try_logspace);
@@ -446,7 +447,7 @@ xfs_log_reserve(
ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
if (XLOG_FORCED_SHUTDOWN(log))
- return XFS_ERROR(EIO);
+ return -EIO;
XFS_STATS_INC(xs_try_logspace);
@@ -454,7 +455,7 @@ xfs_log_reserve(
tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent,
KM_SLEEP | KM_MAYFAIL);
if (!tic)
- return XFS_ERROR(ENOMEM);
+ return -ENOMEM;
tic->t_trans_type = t_type;
*ticp = tic;
@@ -590,7 +591,7 @@ xfs_log_release_iclog(
{
if (xlog_state_release_iclog(mp->m_log, iclog)) {
xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
- return EIO;
+ return -EIO;
}
return 0;
@@ -628,7 +629,7 @@ xfs_log_mount(
mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
if (IS_ERR(mp->m_log)) {
- error = -PTR_ERR(mp->m_log);
+ error = PTR_ERR(mp->m_log);
goto out;
}
@@ -652,18 +653,18 @@ xfs_log_mount(
xfs_warn(mp,
"Log size %d blocks too small, minimum size is %d blocks",
mp->m_sb.sb_logblocks, min_logfsbs);
- error = EINVAL;
+ error = -EINVAL;
} else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) {
xfs_warn(mp,
"Log size %d blocks too large, maximum size is %lld blocks",
mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS);
- error = EINVAL;
+ error = -EINVAL;
} else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) {
xfs_warn(mp,
"log size %lld bytes too large, maximum size is %lld bytes",
XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks),
XFS_MAX_LOG_BYTES);
- error = EINVAL;
+ error = -EINVAL;
}
if (error) {
if (xfs_sb_version_hascrc(&mp->m_sb)) {
@@ -707,6 +708,11 @@ xfs_log_mount(
}
}
+ error = xfs_sysfs_init(&mp->m_log->l_kobj, &xfs_log_ktype, &mp->m_kobj,
+ "log");
+ if (error)
+ goto out_destroy_ail;
+
/* Normal transactions can now occur */
mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
@@ -947,6 +953,9 @@ xfs_log_unmount(
xfs_log_quiesce(mp);
xfs_trans_ail_destroy(mp);
+
+ xfs_sysfs_del(&mp->m_log->l_kobj);
+
xlog_dealloc_log(mp->m_log);
}
@@ -1313,7 +1322,7 @@ xlog_alloc_log(
xlog_in_core_t *iclog, *prev_iclog=NULL;
xfs_buf_t *bp;
int i;
- int error = ENOMEM;
+ int error = -ENOMEM;
uint log2_size = 0;
log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL);
@@ -1340,7 +1349,7 @@ xlog_alloc_log(
xlog_grant_head_init(&log->l_reserve_head);
xlog_grant_head_init(&log->l_write_head);
- error = EFSCORRUPTED;
+ error = -EFSCORRUPTED;
if (xfs_sb_version_hassector(&mp->m_sb)) {
log2_size = mp->m_sb.sb_logsectlog;
if (log2_size < BBSHIFT) {
@@ -1369,8 +1378,14 @@ xlog_alloc_log(
xlog_get_iclog_buffer_size(mp, log);
- error = ENOMEM;
- bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0);
+ /*
+ * Use a NULL block for the extra log buffer used during splits so that
+ * it will trigger errors if we ever try to do IO on it without first
+ * having set it up properly.
+ */
+ error = -ENOMEM;
+ bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL,
+ BTOBB(log->l_iclog_size), 0);
if (!bp)
goto out_free_log;
@@ -1463,7 +1478,7 @@ out_free_iclog:
out_free_log:
kmem_free(log);
out:
- return ERR_PTR(-error);
+ return ERR_PTR(error);
} /* xlog_alloc_log */
@@ -1661,9 +1676,9 @@ xlog_bdstrat(
xfs_buf_lock(bp);
if (iclog->ic_state & XLOG_STATE_IOERROR) {
- xfs_buf_ioerror(bp, EIO);
+ xfs_buf_ioerror(bp, -EIO);
xfs_buf_stale(bp);
- xfs_buf_ioend(bp, 0);
+ xfs_buf_ioend(bp);
/*
* It would seem logical to return EIO here, but we rely on
* the log state machine to propagate I/O errors instead of
@@ -1673,7 +1688,7 @@ xlog_bdstrat(
return 0;
}
- xfs_buf_iorequest(bp);
+ xfs_buf_submit(bp);
return 0;
}
@@ -2360,7 +2375,7 @@ xlog_write(
ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
if (!ophdr)
- return XFS_ERROR(EIO);
+ return -EIO;
xlog_write_adv_cnt(&ptr, &len, &log_offset,
sizeof(struct xlog_op_header));
@@ -2859,7 +2874,7 @@ restart:
spin_lock(&log->l_icloglock);
if (XLOG_FORCED_SHUTDOWN(log)) {
spin_unlock(&log->l_icloglock);
- return XFS_ERROR(EIO);
+ return -EIO;
}
iclog = log->l_iclog;
@@ -3047,7 +3062,7 @@ xlog_state_release_iclog(
int sync = 0; /* do we sync? */
if (iclog->ic_state & XLOG_STATE_IOERROR)
- return XFS_ERROR(EIO);
+ return -EIO;
ASSERT(atomic_read(&iclog->ic_refcnt) > 0);
if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock))
@@ -3055,7 +3070,7 @@ xlog_state_release_iclog(
if (iclog->ic_state & XLOG_STATE_IOERROR) {
spin_unlock(&log->l_icloglock);
- return XFS_ERROR(EIO);
+ return -EIO;
}
ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE ||
iclog->ic_state == XLOG_STATE_WANT_SYNC);
@@ -3172,7 +3187,7 @@ _xfs_log_force(
iclog = log->l_iclog;
if (iclog->ic_state & XLOG_STATE_IOERROR) {
spin_unlock(&log->l_icloglock);
- return XFS_ERROR(EIO);
+ return -EIO;
}
/* If the head iclog is not active nor dirty, we just attach
@@ -3210,7 +3225,7 @@ _xfs_log_force(
spin_unlock(&log->l_icloglock);
if (xlog_state_release_iclog(log, iclog))
- return XFS_ERROR(EIO);
+ return -EIO;
if (log_flushed)
*log_flushed = 1;
@@ -3246,7 +3261,7 @@ maybe_sleep:
*/
if (iclog->ic_state & XLOG_STATE_IOERROR) {
spin_unlock(&log->l_icloglock);
- return XFS_ERROR(EIO);
+ return -EIO;
}
XFS_STATS_INC(xs_log_force_sleep);
xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
@@ -3256,7 +3271,7 @@ maybe_sleep:
* and the memory read should be atomic.
*/
if (iclog->ic_state & XLOG_STATE_IOERROR)
- return XFS_ERROR(EIO);
+ return -EIO;
if (log_flushed)
*log_flushed = 1;
} else {
@@ -3324,7 +3339,7 @@ try_again:
iclog = log->l_iclog;
if (iclog->ic_state & XLOG_STATE_IOERROR) {
spin_unlock(&log->l_icloglock);
- return XFS_ERROR(EIO);
+ return -EIO;
}
do {
@@ -3375,7 +3390,7 @@ try_again:
xlog_state_switch_iclogs(log, iclog, 0);
spin_unlock(&log->l_icloglock);
if (xlog_state_release_iclog(log, iclog))
- return XFS_ERROR(EIO);
+ return -EIO;
if (log_flushed)
*log_flushed = 1;
spin_lock(&log->l_icloglock);
@@ -3390,7 +3405,7 @@ try_again:
*/
if (iclog->ic_state & XLOG_STATE_IOERROR) {
spin_unlock(&log->l_icloglock);
- return XFS_ERROR(EIO);
+ return -EIO;
}
XFS_STATS_INC(xs_log_force_sleep);
xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
@@ -3400,7 +3415,7 @@ try_again:
* and the memory read should be atomic.
*/
if (iclog->ic_state & XLOG_STATE_IOERROR)
- return XFS_ERROR(EIO);
+ return -EIO;
if (log_flushed)
*log_flushed = 1;
@@ -3852,18 +3867,17 @@ xlog_state_ioerror(
* This is called from xfs_force_shutdown, when we're forcibly
* shutting down the filesystem, typically because of an IO error.
* Our main objectives here are to make sure that:
- * a. the filesystem gets marked 'SHUTDOWN' for all interested
+ * a. if !logerror, flush the logs to disk. Anything modified
+ * after this is ignored.
+ * b. the filesystem gets marked 'SHUTDOWN' for all interested
* parties to find out, 'atomically'.
- * b. those who're sleeping on log reservations, pinned objects and
+ * c. those who're sleeping on log reservations, pinned objects and
* other resources get woken up, and be told the bad news.
- * c. nothing new gets queued up after (a) and (b) are done.
- * d. if !logerror, flush the iclogs to disk, then seal them off
- * for business.
+ * d. nothing new gets queued up after (b) and (c) are done.
*
- * Note: for delayed logging the !logerror case needs to flush the regions
- * held in memory out to the iclogs before flushing them to disk. This needs
- * to be done before the log is marked as shutdown, otherwise the flush to the
- * iclogs will fail.
+ * Note: for the !logerror case we need to flush the regions held in memory out
+ * to disk first. This needs to be done before the log is marked as shutdown,
+ * otherwise the iclog writes will fail.
*/
int
xfs_log_force_umount(
@@ -3895,16 +3909,16 @@ xfs_log_force_umount(
ASSERT(XLOG_FORCED_SHUTDOWN(log));
return 1;
}
- retval = 0;
/*
- * Flush the in memory commit item list before marking the log as
- * being shut down. We need to do it in this order to ensure all the
- * completed transactions are flushed to disk with the xfs_log_force()
- * call below.
+ * Flush all the completed transactions to disk before marking the log
+ * being shut down. We need to do it in this order to ensure that
+ * completed operations are safely on disk before we shut down, and that
+ * we don't have to issue any buffer IO after the shutdown flags are set
+ * to guarantee this.
*/
if (!logerror)
- xlog_cil_force(log);
+ _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
/*
* mark the filesystem and the as in a shutdown state and wake
@@ -3916,18 +3930,11 @@ xfs_log_force_umount(
XFS_BUF_DONE(mp->m_sb_bp);
/*
- * This flag is sort of redundant because of the mount flag, but
- * it's good to maintain the separation between the log and the rest
- * of XFS.
+ * Mark the log and the iclogs with IO error flags to prevent any
+ * further log IO from being issued or completed.
*/
log->l_flags |= XLOG_IO_ERROR;
-
- /*
- * If we hit a log error, we want to mark all the iclogs IOERROR
- * while we're still holding the loglock.
- */
- if (logerror)
- retval = xlog_state_ioerror(log);
+ retval = xlog_state_ioerror(log);
spin_unlock(&log->l_icloglock);
/*
@@ -3940,19 +3947,6 @@ xfs_log_force_umount(
xlog_grant_head_wake_all(&log->l_reserve_head);
xlog_grant_head_wake_all(&log->l_write_head);
- if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
- ASSERT(!logerror);
- /*
- * Force the incore logs to disk before shutting the
- * log down completely.
- */
- _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
-
- spin_lock(&log->l_icloglock);
- retval = xlog_state_ioerror(log);
- spin_unlock(&log->l_icloglock);
- }
-
/*
* Wake up everybody waiting on xfs_log_force. Wake the CIL push first
* as if the log writes were completed. The abort handling in the log
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index b3425b34e3d5..f506c457011e 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -78,8 +78,6 @@ xlog_cil_init_post_recovery(
{
log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
log->l_cilp->xc_ctx->sequence = 1;
- log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
- log->l_curr_block);
}
/*
@@ -465,12 +463,40 @@ xlog_cil_push(
spin_unlock(&cil->xc_push_lock);
goto out_skip;
}
- spin_unlock(&cil->xc_push_lock);
/* check for a previously pushed seqeunce */
- if (push_seq < cil->xc_ctx->sequence)
+ if (push_seq < cil->xc_ctx->sequence) {
+ spin_unlock(&cil->xc_push_lock);
goto out_skip;
+ }
+
+ /*
+ * We are now going to push this context, so add it to the committing
+ * list before we do anything else. This ensures that anyone waiting on
+ * this push can easily detect the difference between a "push in
+ * progress" and "CIL is empty, nothing to do".
+ *
+ * IOWs, a wait loop can now check for:
+ * the current sequence not being found on the committing list;
+ * an empty CIL; and
+ * an unchanged sequence number
+ * to detect a push that had nothing to do and therefore does not need
+ * waiting on. If the CIL is not empty, we get put on the committing
+ * list before emptying the CIL and bumping the sequence number. Hence
+ * an empty CIL and an unchanged sequence number means we jumped out
+ * above after doing nothing.
+ *
+ * Hence the waiter will either find the commit sequence on the
+ * committing list or the sequence number will be unchanged and the CIL
+ * still dirty. In that latter case, the push has not yet started, and
+ * so the waiter will have to continue trying to check the CIL
+ * committing list until it is found. In extreme cases of delay, the
+ * sequence may fully commit between the attempts the wait makes to wait
+ * on the commit sequence.
+ */
+ list_add(&ctx->committing, &cil->xc_committing);
+ spin_unlock(&cil->xc_push_lock);
/*
* pull all the log vectors off the items in the CIL, and
@@ -534,7 +560,6 @@ xlog_cil_push(
*/
spin_lock(&cil->xc_push_lock);
cil->xc_current_sequence = new_ctx->sequence;
- list_add(&ctx->committing, &cil->xc_committing);
spin_unlock(&cil->xc_push_lock);
up_write(&cil->xc_ctx_lock);
@@ -634,7 +659,7 @@ out_abort_free_ticket:
xfs_log_ticket_put(tic);
out_abort:
xlog_cil_committed(ctx, XFS_LI_ABORTED);
- return XFS_ERROR(EIO);
+ return -EIO;
}
static void
@@ -857,13 +882,15 @@ restart:
* Hence by the time we have got here it our sequence may not have been
* pushed yet. This is true if the current sequence still matches the
* push sequence after the above wait loop and the CIL still contains
- * dirty objects.
+ * dirty objects. This is guaranteed by the push code first adding the
+ * context to the committing list before emptying the CIL.
*
- * When the push occurs, it will empty the CIL and atomically increment
- * the currect sequence past the push sequence and move it into the
- * committing list. Of course, if the CIL is clean at the time of the
- * push, it won't have pushed the CIL at all, so in that case we should
- * try the push for this sequence again from the start just in case.
+ * Hence if we don't find the context in the committing list and the
+ * current sequence number is unchanged then the CIL contents are
+ * significant. If the CIL is empty, if means there was nothing to push
+ * and that means there is nothing to wait for. If the CIL is not empty,
+ * it means we haven't yet started the push, because if it had started
+ * we would have found the context on the committing list.
*/
if (sequence == cil->xc_current_sequence &&
!list_empty(&cil->xc_cil)) {
@@ -928,12 +955,12 @@ xlog_cil_init(
cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
if (!cil)
- return ENOMEM;
+ return -ENOMEM;
ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
if (!ctx) {
kmem_free(cil);
- return ENOMEM;
+ return -ENOMEM;
}
INIT_WORK(&cil->xc_push_work, xlog_cil_push_work);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 9bc403a9e54f..db7cbdeb2b42 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -405,6 +405,8 @@ struct xlog {
struct xlog_grant_head l_reserve_head;
struct xlog_grant_head l_write_head;
+ struct xfs_kobj l_kobj;
+
/* The following field are used for debugging; need to hold icloglock */
#ifdef DEBUG
char *l_iclog_bak[XLOG_MAX_ICLOGS];
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 981af0f6504b..00cd7f3a8f59 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -179,7 +179,7 @@ xlog_bread_noalign(
xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
nbblks);
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
- return EFSCORRUPTED;
+ return -EFSCORRUPTED;
}
blk_no = round_down(blk_no, log->l_sectBBsize);
@@ -193,12 +193,8 @@ xlog_bread_noalign(
bp->b_io_length = nbblks;
bp->b_error = 0;
- if (XFS_FORCED_SHUTDOWN(log->l_mp))
- return XFS_ERROR(EIO);
-
- xfs_buf_iorequest(bp);
- error = xfs_buf_iowait(bp);
- if (error)
+ error = xfs_buf_submit_wait(bp);
+ if (error && !XFS_FORCED_SHUTDOWN(log->l_mp))
xfs_buf_ioerror_alert(bp, __func__);
return error;
}
@@ -268,7 +264,7 @@ xlog_bwrite(
xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
nbblks);
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
- return EFSCORRUPTED;
+ return -EFSCORRUPTED;
}
blk_no = round_down(blk_no, log->l_sectBBsize);
@@ -330,14 +326,14 @@ xlog_header_check_recover(
xlog_header_check_dump(mp, head);
XFS_ERROR_REPORT("xlog_header_check_recover(1)",
XFS_ERRLEVEL_HIGH, mp);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
} else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
xfs_warn(mp,
"dirty log entry has mismatched uuid - can't recover");
xlog_header_check_dump(mp, head);
XFS_ERROR_REPORT("xlog_header_check_recover(2)",
XFS_ERRLEVEL_HIGH, mp);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
return 0;
}
@@ -364,7 +360,7 @@ xlog_header_check_mount(
xlog_header_check_dump(mp, head);
XFS_ERROR_REPORT("xlog_header_check_mount",
XFS_ERRLEVEL_HIGH, mp);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
return 0;
}
@@ -378,12 +374,14 @@ xlog_recover_iodone(
* We're not going to bother about retrying
* this during recovery. One strike!
*/
- xfs_buf_ioerror_alert(bp, __func__);
- xfs_force_shutdown(bp->b_target->bt_mount,
- SHUTDOWN_META_IO_ERROR);
+ if (!XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+ xfs_buf_ioerror_alert(bp, __func__);
+ xfs_force_shutdown(bp->b_target->bt_mount,
+ SHUTDOWN_META_IO_ERROR);
+ }
}
bp->b_iodone = NULL;
- xfs_buf_ioend(bp, 0);
+ xfs_buf_ioend(bp);
}
/*
@@ -462,7 +460,7 @@ xlog_find_verify_cycle(
while (!(bp = xlog_get_bp(log, bufblks))) {
bufblks >>= 1;
if (bufblks < log->l_sectBBsize)
- return ENOMEM;
+ return -ENOMEM;
}
for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
@@ -524,7 +522,7 @@ xlog_find_verify_log_record(
if (!(bp = xlog_get_bp(log, num_blks))) {
if (!(bp = xlog_get_bp(log, 1)))
- return ENOMEM;
+ return -ENOMEM;
smallmem = 1;
} else {
error = xlog_bread(log, start_blk, num_blks, bp, &offset);
@@ -539,7 +537,7 @@ xlog_find_verify_log_record(
xfs_warn(log->l_mp,
"Log inconsistent (didn't find previous header)");
ASSERT(0);
- error = XFS_ERROR(EIO);
+ error = -EIO;
goto out;
}
@@ -564,7 +562,7 @@ xlog_find_verify_log_record(
* will be called again for the end of the physical log.
*/
if (i == -1) {
- error = -1;
+ error = 1;
goto out;
}
@@ -628,7 +626,12 @@ xlog_find_head(
int error, log_bbnum = log->l_logBBsize;
/* Is the end of the log device zeroed? */
- if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
+ error = xlog_find_zeroed(log, &first_blk);
+ if (error < 0) {
+ xfs_warn(log->l_mp, "empty log check failed");
+ return error;
+ }
+ if (error == 1) {
*return_head_blk = first_blk;
/* Is the whole lot zeroed? */
@@ -641,15 +644,12 @@ xlog_find_head(
}
return 0;
- } else if (error) {
- xfs_warn(log->l_mp, "empty log check failed");
- return error;
}
first_blk = 0; /* get cycle # of 1st block */
bp = xlog_get_bp(log, 1);
if (!bp)
- return ENOMEM;
+ return -ENOMEM;
error = xlog_bread(log, 0, 1, bp, &offset);
if (error)
@@ -818,29 +818,29 @@ validate_head:
start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
/* start ptr at last block ptr before head_blk */
- if ((error = xlog_find_verify_log_record(log, start_blk,
- &head_blk, 0)) == -1) {
- error = XFS_ERROR(EIO);
- goto bp_err;
- } else if (error)
+ error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
+ if (error == 1)
+ error = -EIO;
+ if (error)
goto bp_err;
} else {
start_blk = 0;
ASSERT(head_blk <= INT_MAX);
- if ((error = xlog_find_verify_log_record(log, start_blk,
- &head_blk, 0)) == -1) {
+ error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
+ if (error < 0)
+ goto bp_err;
+ if (error == 1) {
/* We hit the beginning of the log during our search */
start_blk = log_bbnum - (num_scan_bblks - head_blk);
new_blk = log_bbnum;
ASSERT(start_blk <= INT_MAX &&
(xfs_daddr_t) log_bbnum-start_blk >= 0);
ASSERT(head_blk <= INT_MAX);
- if ((error = xlog_find_verify_log_record(log,
- start_blk, &new_blk,
- (int)head_blk)) == -1) {
- error = XFS_ERROR(EIO);
- goto bp_err;
- } else if (error)
+ error = xlog_find_verify_log_record(log, start_blk,
+ &new_blk, (int)head_blk);
+ if (error == 1)
+ error = -EIO;
+ if (error)
goto bp_err;
if (new_blk != log_bbnum)
head_blk = new_blk;
@@ -911,7 +911,7 @@ xlog_find_tail(
bp = xlog_get_bp(log, 1);
if (!bp)
- return ENOMEM;
+ return -ENOMEM;
if (*head_blk == 0) { /* special case */
error = xlog_bread(log, 0, 1, bp, &offset);
if (error)
@@ -961,7 +961,7 @@ xlog_find_tail(
xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
xlog_put_bp(bp);
ASSERT(0);
- return XFS_ERROR(EIO);
+ return -EIO;
}
/* find blk_no of tail of log */
@@ -1092,8 +1092,8 @@ done:
*
* Return:
* 0 => the log is completely written to
- * -1 => use *blk_no as the first block of the log
- * >0 => error has occurred
+ * 1 => use *blk_no as the first block of the log
+ * <0 => error has occurred
*/
STATIC int
xlog_find_zeroed(
@@ -1112,7 +1112,7 @@ xlog_find_zeroed(
/* check totally zeroed log */
bp = xlog_get_bp(log, 1);
if (!bp)
- return ENOMEM;
+ return -ENOMEM;
error = xlog_bread(log, 0, 1, bp, &offset);
if (error)
goto bp_err;
@@ -1121,7 +1121,7 @@ xlog_find_zeroed(
if (first_cycle == 0) { /* completely zeroed log */
*blk_no = 0;
xlog_put_bp(bp);
- return -1;
+ return 1;
}
/* check partially zeroed log */
@@ -1141,7 +1141,7 @@ xlog_find_zeroed(
*/
xfs_warn(log->l_mp,
"Log inconsistent or not a log (last==0, first!=1)");
- error = XFS_ERROR(EINVAL);
+ error = -EINVAL;
goto bp_err;
}
@@ -1179,19 +1179,18 @@ xlog_find_zeroed(
* Potentially backup over partial log record write. We don't need
* to search the end of the log because we know it is zero.
*/
- if ((error = xlog_find_verify_log_record(log, start_blk,
- &last_blk, 0)) == -1) {
- error = XFS_ERROR(EIO);
- goto bp_err;
- } else if (error)
- goto bp_err;
+ error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0);
+ if (error == 1)
+ error = -EIO;
+ if (error)
+ goto bp_err;
*blk_no = last_blk;
bp_err:
xlog_put_bp(bp);
if (error)
return error;
- return -1;
+ return 1;
}
/*
@@ -1251,7 +1250,7 @@ xlog_write_log_records(
while (!(bp = xlog_get_bp(log, bufblks))) {
bufblks >>= 1;
if (bufblks < sectbb)
- return ENOMEM;
+ return -ENOMEM;
}
/* We may need to do a read at the start to fill in part of
@@ -1354,7 +1353,7 @@ xlog_clear_stale_blocks(
if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
XFS_ERRLEVEL_LOW, log->l_mp);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
tail_distance = tail_block + (log->l_logBBsize - head_block);
} else {
@@ -1366,7 +1365,7 @@ xlog_clear_stale_blocks(
if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
XFS_ERRLEVEL_LOW, log->l_mp);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
tail_distance = tail_block - head_block;
}
@@ -1444,160 +1443,6 @@ xlog_clear_stale_blocks(
******************************************************************************
*/
-STATIC xlog_recover_t *
-xlog_recover_find_tid(
- struct hlist_head *head,
- xlog_tid_t tid)
-{
- xlog_recover_t *trans;
-
- hlist_for_each_entry(trans, head, r_list) {
- if (trans->r_log_tid == tid)
- return trans;
- }
- return NULL;
-}
-
-STATIC void
-xlog_recover_new_tid(
- struct hlist_head *head,
- xlog_tid_t tid,
- xfs_lsn_t lsn)
-{
- xlog_recover_t *trans;
-
- trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
- trans->r_log_tid = tid;
- trans->r_lsn = lsn;
- INIT_LIST_HEAD(&trans->r_itemq);
-
- INIT_HLIST_NODE(&trans->r_list);
- hlist_add_head(&trans->r_list, head);
-}
-
-STATIC void
-xlog_recover_add_item(
- struct list_head *head)
-{
- xlog_recover_item_t *item;
-
- item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
- INIT_LIST_HEAD(&item->ri_list);
- list_add_tail(&item->ri_list, head);
-}
-
-STATIC int
-xlog_recover_add_to_cont_trans(
- struct xlog *log,
- struct xlog_recover *trans,
- xfs_caddr_t dp,
- int len)
-{
- xlog_recover_item_t *item;
- xfs_caddr_t ptr, old_ptr;
- int old_len;
-
- if (list_empty(&trans->r_itemq)) {
- /* finish copying rest of trans header */
- xlog_recover_add_item(&trans->r_itemq);
- ptr = (xfs_caddr_t) &trans->r_theader +
- sizeof(xfs_trans_header_t) - len;
- memcpy(ptr, dp, len); /* d, s, l */
- return 0;
- }
- /* take the tail entry */
- item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
-
- old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
- old_len = item->ri_buf[item->ri_cnt-1].i_len;
-
- ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
- memcpy(&ptr[old_len], dp, len); /* d, s, l */
- item->ri_buf[item->ri_cnt-1].i_len += len;
- item->ri_buf[item->ri_cnt-1].i_addr = ptr;
- trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
- return 0;
-}
-
-/*
- * The next region to add is the start of a new region. It could be
- * a whole region or it could be the first part of a new region. Because
- * of this, the assumption here is that the type and size fields of all
- * format structures fit into the first 32 bits of the structure.
- *
- * This works because all regions must be 32 bit aligned. Therefore, we
- * either have both fields or we have neither field. In the case we have
- * neither field, the data part of the region is zero length. We only have
- * a log_op_header and can throw away the header since a new one will appear
- * later. If we have at least 4 bytes, then we can determine how many regions
- * will appear in the current log item.
- */
-STATIC int
-xlog_recover_add_to_trans(
- struct xlog *log,
- struct xlog_recover *trans,
- xfs_caddr_t dp,
- int len)
-{
- xfs_inode_log_format_t *in_f; /* any will do */
- xlog_recover_item_t *item;
- xfs_caddr_t ptr;
-
- if (!len)
- return 0;
- if (list_empty(&trans->r_itemq)) {
- /* we need to catch log corruptions here */
- if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
- xfs_warn(log->l_mp, "%s: bad header magic number",
- __func__);
- ASSERT(0);
- return XFS_ERROR(EIO);
- }
- if (len == sizeof(xfs_trans_header_t))
- xlog_recover_add_item(&trans->r_itemq);
- memcpy(&trans->r_theader, dp, len); /* d, s, l */
- return 0;
- }
-
- ptr = kmem_alloc(len, KM_SLEEP);
- memcpy(ptr, dp, len);
- in_f = (xfs_inode_log_format_t *)ptr;
-
- /* take the tail entry */
- item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
- if (item->ri_total != 0 &&
- item->ri_total == item->ri_cnt) {
- /* tail item is in use, get a new one */
- xlog_recover_add_item(&trans->r_itemq);
- item = list_entry(trans->r_itemq.prev,
- xlog_recover_item_t, ri_list);
- }
-
- if (item->ri_total == 0) { /* first region to be added */
- if (in_f->ilf_size == 0 ||
- in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
- xfs_warn(log->l_mp,
- "bad number of regions (%d) in inode log format",
- in_f->ilf_size);
- ASSERT(0);
- kmem_free(ptr);
- return XFS_ERROR(EIO);
- }
-
- item->ri_total = in_f->ilf_size;
- item->ri_buf =
- kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
- KM_SLEEP);
- }
- ASSERT(item->ri_total > item->ri_cnt);
- /* Description region is ri_buf[0] */
- item->ri_buf[item->ri_cnt].i_addr = ptr;
- item->ri_buf[item->ri_cnt].i_len = len;
- item->ri_cnt++;
- trace_xfs_log_recover_item_add(log, trans, item, 0);
- return 0;
-}
-
/*
* Sort the log items in the transaction.
*
@@ -1702,7 +1547,7 @@ xlog_recover_reorder_trans(
*/
if (!list_empty(&sort_list))
list_splice_init(&sort_list, &trans->r_itemq);
- error = XFS_ERROR(EIO);
+ error = -EIO;
goto out;
}
}
@@ -1943,7 +1788,7 @@ xlog_recover_do_inode_buffer(
item, bp);
XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
XFS_ERRLEVEL_LOW, mp);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
@@ -2125,6 +1970,17 @@ xlog_recover_validate_buf_type(
__uint16_t magic16;
__uint16_t magicda;
+ /*
+ * We can only do post recovery validation on items on CRC enabled
+ * fielsystems as we need to know when the buffer was written to be able
+ * to determine if we should have replayed the item. If we replay old
+ * metadata over a newer buffer, then it will enter a temporarily
+ * inconsistent state resulting in verification failures. Hence for now
+ * just avoid the verification stage for non-crc filesystems
+ */
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
magicda = be16_to_cpu(info->magic);
@@ -2162,8 +2018,6 @@ xlog_recover_validate_buf_type(
bp->b_ops = &xfs_agf_buf_ops;
break;
case XFS_BLFT_AGFL_BUF:
- if (!xfs_sb_version_hascrc(&mp->m_sb))
- break;
if (magic32 != XFS_AGFL_MAGIC) {
xfs_warn(mp, "Bad AGFL block magic!");
ASSERT(0);
@@ -2196,10 +2050,6 @@ xlog_recover_validate_buf_type(
#endif
break;
case XFS_BLFT_DINO_BUF:
- /*
- * we get here with inode allocation buffers, not buffers that
- * track unlinked list changes.
- */
if (magic16 != XFS_DINODE_MAGIC) {
xfs_warn(mp, "Bad INODE block magic!");
ASSERT(0);
@@ -2279,8 +2129,6 @@ xlog_recover_validate_buf_type(
bp->b_ops = &xfs_attr3_leaf_buf_ops;
break;
case XFS_BLFT_ATTR_RMT_BUF:
- if (!xfs_sb_version_hascrc(&mp->m_sb))
- break;
if (magic32 != XFS_ATTR3_RMT_MAGIC) {
xfs_warn(mp, "Bad attr remote magic!");
ASSERT(0);
@@ -2387,16 +2235,7 @@ xlog_recover_do_reg_buffer(
/* Shouldn't be any more regions */
ASSERT(i == item->ri_total);
- /*
- * We can only do post recovery validation on items on CRC enabled
- * fielsystems as we need to know when the buffer was written to be able
- * to determine if we should have replayed the item. If we replay old
- * metadata over a newer buffer, then it will enter a temporarily
- * inconsistent state resulting in verification failures. Hence for now
- * just avoid the verification stage for non-crc filesystems
- */
- if (xfs_sb_version_hascrc(&mp->m_sb))
- xlog_recover_validate_buf_type(mp, bp, buf_f);
+ xlog_recover_validate_buf_type(mp, bp, buf_f);
}
/*
@@ -2404,8 +2243,11 @@ xlog_recover_do_reg_buffer(
* Simple algorithm: if we have found a QUOTAOFF log item of the same type
* (ie. USR or GRP), then just toss this buffer away; don't recover it.
* Else, treat it as a regular buffer and do recovery.
+ *
+ * Return false if the buffer was tossed and true if we recovered the buffer to
+ * indicate to the caller if the buffer needs writing.
*/
-STATIC void
+STATIC bool
xlog_recover_do_dquot_buffer(
struct xfs_mount *mp,
struct xlog *log,
@@ -2420,9 +2262,8 @@ xlog_recover_do_dquot_buffer(
/*
* Filesystems are required to send in quota flags at mount time.
*/
- if (mp->m_qflags == 0) {
- return;
- }
+ if (!mp->m_qflags)
+ return false;
type = 0;
if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
@@ -2435,9 +2276,10 @@ xlog_recover_do_dquot_buffer(
* This type of quotas was turned off, so ignore this buffer
*/
if (log->l_quotaoffs_flag & type)
- return;
+ return false;
xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
+ return true;
}
/*
@@ -2496,7 +2338,7 @@ xlog_recover_buffer_pass2(
bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
buf_flags, NULL);
if (!bp)
- return XFS_ERROR(ENOMEM);
+ return -ENOMEM;
error = bp->b_error;
if (error) {
xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
@@ -2504,23 +2346,44 @@ xlog_recover_buffer_pass2(
}
/*
- * recover the buffer only if we get an LSN from it and it's less than
+ * Recover the buffer only if we get an LSN from it and it's less than
* the lsn of the transaction we are replaying.
+ *
+ * Note that we have to be extremely careful of readahead here.
+ * Readahead does not attach verfiers to the buffers so if we don't
+ * actually do any replay after readahead because of the LSN we found
+ * in the buffer if more recent than that current transaction then we
+ * need to attach the verifier directly. Failure to do so can lead to
+ * future recovery actions (e.g. EFI and unlinked list recovery) can
+ * operate on the buffers and they won't get the verifier attached. This
+ * can lead to blocks on disk having the correct content but a stale
+ * CRC.
+ *
+ * It is safe to assume these clean buffers are currently up to date.
+ * If the buffer is dirtied by a later transaction being replayed, then
+ * the verifier will be reset to match whatever recover turns that
+ * buffer into.
*/
lsn = xlog_recover_get_buf_lsn(mp, bp);
- if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0)
+ if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+ xlog_recover_validate_buf_type(mp, bp, buf_f);
goto out_release;
+ }
if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
+ if (error)
+ goto out_release;
} else if (buf_f->blf_flags &
(XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
- xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
+ bool dirty;
+
+ dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
+ if (!dirty)
+ goto out_release;
} else {
xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
}
- if (error)
- goto out_release;
/*
* Perform delayed write on the buffer. Asynchronous writes will be
@@ -2598,7 +2461,7 @@ xfs_recover_inode_owner_change(
ip = xfs_inode_alloc(mp, in_f->ilf_ino);
if (!ip)
- return ENOMEM;
+ return -ENOMEM;
/* instantiate the inode */
xfs_dinode_from_disk(&ip->i_d, dip);
@@ -2676,7 +2539,7 @@ xlog_recover_inode_pass2(
bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
&xfs_inode_buf_ops);
if (!bp) {
- error = ENOMEM;
+ error = -ENOMEM;
goto error;
}
error = bp->b_error;
@@ -2697,7 +2560,7 @@ xlog_recover_inode_pass2(
__func__, dip, bp, in_f->ilf_ino);
XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
XFS_ERRLEVEL_LOW, mp);
- error = EFSCORRUPTED;
+ error = -EFSCORRUPTED;
goto out_release;
}
dicp = item->ri_buf[1].i_addr;
@@ -2707,7 +2570,7 @@ xlog_recover_inode_pass2(
__func__, item, in_f->ilf_ino);
XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
XFS_ERRLEVEL_LOW, mp);
- error = EFSCORRUPTED;
+ error = -EFSCORRUPTED;
goto out_release;
}
@@ -2764,7 +2627,7 @@ xlog_recover_inode_pass2(
"%s: Bad regular inode log record, rec ptr 0x%p, "
"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
__func__, item, dip, bp, in_f->ilf_ino);
- error = EFSCORRUPTED;
+ error = -EFSCORRUPTED;
goto out_release;
}
} else if (unlikely(S_ISDIR(dicp->di_mode))) {
@@ -2777,7 +2640,7 @@ xlog_recover_inode_pass2(
"%s: Bad dir inode log record, rec ptr 0x%p, "
"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
__func__, item, dip, bp, in_f->ilf_ino);
- error = EFSCORRUPTED;
+ error = -EFSCORRUPTED;
goto out_release;
}
}
@@ -2790,7 +2653,7 @@ xlog_recover_inode_pass2(
__func__, item, dip, bp, in_f->ilf_ino,
dicp->di_nextents + dicp->di_anextents,
dicp->di_nblocks);
- error = EFSCORRUPTED;
+ error = -EFSCORRUPTED;
goto out_release;
}
if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
@@ -2800,7 +2663,7 @@ xlog_recover_inode_pass2(
"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
"dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
- error = EFSCORRUPTED;
+ error = -EFSCORRUPTED;
goto out_release;
}
isize = xfs_icdinode_size(dicp->di_version);
@@ -2810,7 +2673,7 @@ xlog_recover_inode_pass2(
xfs_alert(mp,
"%s: Bad inode log record length %d, rec ptr 0x%p",
__func__, item->ri_buf[1].i_len, item);
- error = EFSCORRUPTED;
+ error = -EFSCORRUPTED;
goto out_release;
}
@@ -2898,7 +2761,7 @@ xlog_recover_inode_pass2(
default:
xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
ASSERT(0);
- error = EIO;
+ error = -EIO;
goto out_release;
}
}
@@ -2919,7 +2782,7 @@ out_release:
error:
if (need_free)
kmem_free(in_f);
- return XFS_ERROR(error);
+ return error;
}
/*
@@ -2946,7 +2809,7 @@ xlog_recover_quotaoff_pass1(
if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
log->l_quotaoffs_flag |= XFS_DQ_GROUP;
- return (0);
+ return 0;
}
/*
@@ -2971,17 +2834,17 @@ xlog_recover_dquot_pass2(
* Filesystems are required to send in quota flags at mount time.
*/
if (mp->m_qflags == 0)
- return (0);
+ return 0;
recddq = item->ri_buf[1].i_addr;
if (recddq == NULL) {
xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
- return XFS_ERROR(EIO);
+ return -EIO;
}
if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
item->ri_buf[1].i_len, __func__);
- return XFS_ERROR(EIO);
+ return -EIO;
}
/*
@@ -2990,7 +2853,7 @@ xlog_recover_dquot_pass2(
type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
ASSERT(type);
if (log->l_quotaoffs_flag & type)
- return (0);
+ return 0;
/*
* At this point we know that quota was _not_ turned off.
@@ -3007,12 +2870,19 @@ xlog_recover_dquot_pass2(
error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
"xlog_recover_dquot_pass2 (log copy)");
if (error)
- return XFS_ERROR(EIO);
+ return -EIO;
ASSERT(dq_f->qlf_len == 1);
+ /*
+ * At this point we are assuming that the dquots have been allocated
+ * and hence the buffer has valid dquots stamped in it. It should,
+ * therefore, pass verifier validation. If the dquot is bad, then the
+ * we'll return an error here, so we don't need to specifically check
+ * the dquot in the buffer after the verifier has run.
+ */
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
- NULL);
+ &xfs_dquot_buf_ops);
if (error)
return error;
@@ -3020,18 +2890,6 @@ xlog_recover_dquot_pass2(
ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
/*
- * At least the magic num portion should be on disk because this
- * was among a chunk of dquots created earlier, and we did some
- * minimal initialization then.
- */
- error = xfs_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
- "xlog_recover_dquot_pass2");
- if (error) {
- xfs_buf_relse(bp);
- return XFS_ERROR(EIO);
- }
-
- /*
* If the dquot has an LSN in it, recover the dquot only if it's less
* than the lsn of the transaction we are replaying.
*/
@@ -3178,38 +3036,38 @@ xlog_recover_do_icreate_pass2(
icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
if (icl->icl_type != XFS_LI_ICREATE) {
xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
- return EINVAL;
+ return -EINVAL;
}
if (icl->icl_size != 1) {
xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
- return EINVAL;
+ return -EINVAL;
}
agno = be32_to_cpu(icl->icl_ag);
if (agno >= mp->m_sb.sb_agcount) {
xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
- return EINVAL;
+ return -EINVAL;
}
agbno = be32_to_cpu(icl->icl_agbno);
if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
- return EINVAL;
+ return -EINVAL;
}
isize = be32_to_cpu(icl->icl_isize);
if (isize != mp->m_sb.sb_inodesize) {
xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
- return EINVAL;
+ return -EINVAL;
}
count = be32_to_cpu(icl->icl_count);
if (!count) {
xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
- return EINVAL;
+ return -EINVAL;
}
length = be32_to_cpu(icl->icl_length);
if (!length || length >= mp->m_sb.sb_agblocks) {
xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
- return EINVAL;
+ return -EINVAL;
}
/* existing allocation is fixed value */
@@ -3218,7 +3076,7 @@ xlog_recover_do_icreate_pass2(
if (count != mp->m_ialloc_inos ||
length != mp->m_ialloc_blks) {
xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
- return EINVAL;
+ return -EINVAL;
}
/*
@@ -3240,31 +3098,6 @@ xlog_recover_do_icreate_pass2(
return 0;
}
-/*
- * Free up any resources allocated by the transaction
- *
- * Remember that EFIs, EFDs, and IUNLINKs are handled later.
- */
-STATIC void
-xlog_recover_free_trans(
- struct xlog_recover *trans)
-{
- xlog_recover_item_t *item, *n;
- int i;
-
- list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
- /* Free the regions in the item. */
- list_del(&item->ri_list);
- for (i = 0; i < item->ri_cnt; i++)
- kmem_free(item->ri_buf[i].i_addr);
- /* Free the item itself */
- kmem_free(item->ri_buf);
- kmem_free(item);
- }
- /* Free the transaction recover structure */
- kmem_free(trans);
-}
-
STATIC void
xlog_recover_buffer_ra_pass2(
struct xlog *log,
@@ -3389,7 +3222,7 @@ xlog_recover_commit_pass1(
xfs_warn(log->l_mp, "%s: invalid item type (%d)",
__func__, ITEM_TYPE(item));
ASSERT(0);
- return XFS_ERROR(EIO);
+ return -EIO;
}
}
@@ -3425,7 +3258,7 @@ xlog_recover_commit_pass2(
xfs_warn(log->l_mp, "%s: invalid item type (%d)",
__func__, ITEM_TYPE(item));
ASSERT(0);
- return XFS_ERROR(EIO);
+ return -EIO;
}
}
@@ -3514,22 +3347,309 @@ out:
if (!list_empty(&done_list))
list_splice_init(&done_list, &trans->r_itemq);
- xlog_recover_free_trans(trans);
-
error2 = xfs_buf_delwri_submit(&buffer_list);
return error ? error : error2;
}
+STATIC void
+xlog_recover_add_item(
+ struct list_head *head)
+{
+ xlog_recover_item_t *item;
+
+ item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
+ INIT_LIST_HEAD(&item->ri_list);
+ list_add_tail(&item->ri_list, head);
+}
+
STATIC int
-xlog_recover_unmount_trans(
- struct xlog *log)
+xlog_recover_add_to_cont_trans(
+ struct xlog *log,
+ struct xlog_recover *trans,
+ xfs_caddr_t dp,
+ int len)
{
- /* Do nothing now */
- xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
+ xlog_recover_item_t *item;
+ xfs_caddr_t ptr, old_ptr;
+ int old_len;
+
+ if (list_empty(&trans->r_itemq)) {
+ /* finish copying rest of trans header */
+ xlog_recover_add_item(&trans->r_itemq);
+ ptr = (xfs_caddr_t) &trans->r_theader +
+ sizeof(xfs_trans_header_t) - len;
+ memcpy(ptr, dp, len);
+ return 0;
+ }
+ /* take the tail entry */
+ item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
+
+ old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
+ old_len = item->ri_buf[item->ri_cnt-1].i_len;
+
+ ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
+ memcpy(&ptr[old_len], dp, len);
+ item->ri_buf[item->ri_cnt-1].i_len += len;
+ item->ri_buf[item->ri_cnt-1].i_addr = ptr;
+ trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
+ return 0;
+}
+
+/*
+ * The next region to add is the start of a new region. It could be
+ * a whole region or it could be the first part of a new region. Because
+ * of this, the assumption here is that the type and size fields of all
+ * format structures fit into the first 32 bits of the structure.
+ *
+ * This works because all regions must be 32 bit aligned. Therefore, we
+ * either have both fields or we have neither field. In the case we have
+ * neither field, the data part of the region is zero length. We only have
+ * a log_op_header and can throw away the header since a new one will appear
+ * later. If we have at least 4 bytes, then we can determine how many regions
+ * will appear in the current log item.
+ */
+STATIC int
+xlog_recover_add_to_trans(
+ struct xlog *log,
+ struct xlog_recover *trans,
+ xfs_caddr_t dp,
+ int len)
+{
+ xfs_inode_log_format_t *in_f; /* any will do */
+ xlog_recover_item_t *item;
+ xfs_caddr_t ptr;
+
+ if (!len)
+ return 0;
+ if (list_empty(&trans->r_itemq)) {
+ /* we need to catch log corruptions here */
+ if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
+ xfs_warn(log->l_mp, "%s: bad header magic number",
+ __func__);
+ ASSERT(0);
+ return -EIO;
+ }
+ if (len == sizeof(xfs_trans_header_t))
+ xlog_recover_add_item(&trans->r_itemq);
+ memcpy(&trans->r_theader, dp, len);
+ return 0;
+ }
+
+ ptr = kmem_alloc(len, KM_SLEEP);
+ memcpy(ptr, dp, len);
+ in_f = (xfs_inode_log_format_t *)ptr;
+
+ /* take the tail entry */
+ item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
+ if (item->ri_total != 0 &&
+ item->ri_total == item->ri_cnt) {
+ /* tail item is in use, get a new one */
+ xlog_recover_add_item(&trans->r_itemq);
+ item = list_entry(trans->r_itemq.prev,
+ xlog_recover_item_t, ri_list);
+ }
+
+ if (item->ri_total == 0) { /* first region to be added */
+ if (in_f->ilf_size == 0 ||
+ in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
+ xfs_warn(log->l_mp,
+ "bad number of regions (%d) in inode log format",
+ in_f->ilf_size);
+ ASSERT(0);
+ kmem_free(ptr);
+ return -EIO;
+ }
+
+ item->ri_total = in_f->ilf_size;
+ item->ri_buf =
+ kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
+ KM_SLEEP);
+ }
+ ASSERT(item->ri_total > item->ri_cnt);
+ /* Description region is ri_buf[0] */
+ item->ri_buf[item->ri_cnt].i_addr = ptr;
+ item->ri_buf[item->ri_cnt].i_len = len;
+ item->ri_cnt++;
+ trace_xfs_log_recover_item_add(log, trans, item, 0);
return 0;
}
/*
+ * Free up any resources allocated by the transaction
+ *
+ * Remember that EFIs, EFDs, and IUNLINKs are handled later.
+ */
+STATIC void
+xlog_recover_free_trans(
+ struct xlog_recover *trans)
+{
+ xlog_recover_item_t *item, *n;
+ int i;
+
+ list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
+ /* Free the regions in the item. */
+ list_del(&item->ri_list);
+ for (i = 0; i < item->ri_cnt; i++)
+ kmem_free(item->ri_buf[i].i_addr);
+ /* Free the item itself */
+ kmem_free(item->ri_buf);
+ kmem_free(item);
+ }
+ /* Free the transaction recover structure */
+ kmem_free(trans);
+}
+
+/*
+ * On error or completion, trans is freed.
+ */
+STATIC int
+xlog_recovery_process_trans(
+ struct xlog *log,
+ struct xlog_recover *trans,
+ xfs_caddr_t dp,
+ unsigned int len,
+ unsigned int flags,
+ int pass)
+{
+ int error = 0;
+ bool freeit = false;
+
+ /* mask off ophdr transaction container flags */
+ flags &= ~XLOG_END_TRANS;
+ if (flags & XLOG_WAS_CONT_TRANS)
+ flags &= ~XLOG_CONTINUE_TRANS;
+
+ /*
+ * Callees must not free the trans structure. We'll decide if we need to
+ * free it or not based on the operation being done and it's result.
+ */
+ switch (flags) {
+ /* expected flag values */
+ case 0:
+ case XLOG_CONTINUE_TRANS:
+ error = xlog_recover_add_to_trans(log, trans, dp, len);
+ break;
+ case XLOG_WAS_CONT_TRANS:
+ error = xlog_recover_add_to_cont_trans(log, trans, dp, len);
+ break;
+ case XLOG_COMMIT_TRANS:
+ error = xlog_recover_commit_trans(log, trans, pass);
+ /* success or fail, we are now done with this transaction. */
+ freeit = true;
+ break;
+
+ /* unexpected flag values */
+ case XLOG_UNMOUNT_TRANS:
+ /* just skip trans */
+ xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
+ freeit = true;
+ break;
+ case XLOG_START_TRANS:
+ default:
+ xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags);
+ ASSERT(0);
+ error = -EIO;
+ break;
+ }
+ if (error || freeit)
+ xlog_recover_free_trans(trans);
+ return error;
+}
+
+/*
+ * Lookup the transaction recovery structure associated with the ID in the
+ * current ophdr. If the transaction doesn't exist and the start flag is set in
+ * the ophdr, then allocate a new transaction for future ID matches to find.
+ * Either way, return what we found during the lookup - an existing transaction
+ * or nothing.
+ */
+STATIC struct xlog_recover *
+xlog_recover_ophdr_to_trans(
+ struct hlist_head rhash[],
+ struct xlog_rec_header *rhead,
+ struct xlog_op_header *ohead)
+{
+ struct xlog_recover *trans;
+ xlog_tid_t tid;
+ struct hlist_head *rhp;
+
+ tid = be32_to_cpu(ohead->oh_tid);
+ rhp = &rhash[XLOG_RHASH(tid)];
+ hlist_for_each_entry(trans, rhp, r_list) {
+ if (trans->r_log_tid == tid)
+ return trans;
+ }
+
+ /*
+ * skip over non-start transaction headers - we could be
+ * processing slack space before the next transaction starts
+ */
+ if (!(ohead->oh_flags & XLOG_START_TRANS))
+ return NULL;
+
+ ASSERT(be32_to_cpu(ohead->oh_len) == 0);
+
+ /*
+ * This is a new transaction so allocate a new recovery container to
+ * hold the recovery ops that will follow.
+ */
+ trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP);
+ trans->r_log_tid = tid;
+ trans->r_lsn = be64_to_cpu(rhead->h_lsn);
+ INIT_LIST_HEAD(&trans->r_itemq);
+ INIT_HLIST_NODE(&trans->r_list);
+ hlist_add_head(&trans->r_list, rhp);
+
+ /*
+ * Nothing more to do for this ophdr. Items to be added to this new
+ * transaction will be in subsequent ophdr containers.
+ */
+ return NULL;
+}
+
+STATIC int
+xlog_recover_process_ophdr(
+ struct xlog *log,
+ struct hlist_head rhash[],
+ struct xlog_rec_header *rhead,
+ struct xlog_op_header *ohead,
+ xfs_caddr_t dp,
+ xfs_caddr_t end,
+ int pass)
+{
+ struct xlog_recover *trans;
+ unsigned int len;
+
+ /* Do we understand who wrote this op? */
+ if (ohead->oh_clientid != XFS_TRANSACTION &&
+ ohead->oh_clientid != XFS_LOG) {
+ xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
+ __func__, ohead->oh_clientid);
+ ASSERT(0);
+ return -EIO;
+ }
+
+ /*
+ * Check the ophdr contains all the data it is supposed to contain.
+ */
+ len = be32_to_cpu(ohead->oh_len);
+ if (dp + len > end) {
+ xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len);
+ WARN_ON(1);
+ return -EIO;
+ }
+
+ trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead);
+ if (!trans) {
+ /* nothing to do, so skip over this ophdr */
+ return 0;
+ }
+
+ return xlog_recovery_process_trans(log, trans, dp, len,
+ ohead->oh_flags, pass);
+}
+
+/*
* There are two valid states of the r_state field. 0 indicates that the
* transaction structure is in a normal state. We have either seen the
* start of the transaction or the last operation we added was not a partial
@@ -3546,86 +3666,30 @@ xlog_recover_process_data(
xfs_caddr_t dp,
int pass)
{
- xfs_caddr_t lp;
+ struct xlog_op_header *ohead;
+ xfs_caddr_t end;
int num_logops;
- xlog_op_header_t *ohead;
- xlog_recover_t *trans;
- xlog_tid_t tid;
int error;
- unsigned long hash;
- uint flags;
- lp = dp + be32_to_cpu(rhead->h_len);
+ end = dp + be32_to_cpu(rhead->h_len);
num_logops = be32_to_cpu(rhead->h_num_logops);
/* check the log format matches our own - else we can't recover */
if (xlog_header_check_recover(log->l_mp, rhead))
- return (XFS_ERROR(EIO));
-
- while ((dp < lp) && num_logops) {
- ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
- ohead = (xlog_op_header_t *)dp;
- dp += sizeof(xlog_op_header_t);
- if (ohead->oh_clientid != XFS_TRANSACTION &&
- ohead->oh_clientid != XFS_LOG) {
- xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
- __func__, ohead->oh_clientid);
- ASSERT(0);
- return (XFS_ERROR(EIO));
- }
- tid = be32_to_cpu(ohead->oh_tid);
- hash = XLOG_RHASH(tid);
- trans = xlog_recover_find_tid(&rhash[hash], tid);
- if (trans == NULL) { /* not found; add new tid */
- if (ohead->oh_flags & XLOG_START_TRANS)
- xlog_recover_new_tid(&rhash[hash], tid,
- be64_to_cpu(rhead->h_lsn));
- } else {
- if (dp + be32_to_cpu(ohead->oh_len) > lp) {
- xfs_warn(log->l_mp, "%s: bad length 0x%x",
- __func__, be32_to_cpu(ohead->oh_len));
- WARN_ON(1);
- return (XFS_ERROR(EIO));
- }
- flags = ohead->oh_flags & ~XLOG_END_TRANS;
- if (flags & XLOG_WAS_CONT_TRANS)
- flags &= ~XLOG_CONTINUE_TRANS;
- switch (flags) {
- case XLOG_COMMIT_TRANS:
- error = xlog_recover_commit_trans(log,
- trans, pass);
- break;
- case XLOG_UNMOUNT_TRANS:
- error = xlog_recover_unmount_trans(log);
- break;
- case XLOG_WAS_CONT_TRANS:
- error = xlog_recover_add_to_cont_trans(log,
- trans, dp,
- be32_to_cpu(ohead->oh_len));
- break;
- case XLOG_START_TRANS:
- xfs_warn(log->l_mp, "%s: bad transaction",
- __func__);
- ASSERT(0);
- error = XFS_ERROR(EIO);
- break;
- case 0:
- case XLOG_CONTINUE_TRANS:
- error = xlog_recover_add_to_trans(log, trans,
- dp, be32_to_cpu(ohead->oh_len));
- break;
- default:
- xfs_warn(log->l_mp, "%s: bad flag 0x%x",
- __func__, flags);
- ASSERT(0);
- error = XFS_ERROR(EIO);
- break;
- }
- if (error) {
- xlog_recover_free_trans(trans);
- return error;
- }
- }
+ return -EIO;
+
+ while ((dp < end) && num_logops) {
+
+ ohead = (struct xlog_op_header *)dp;
+ dp += sizeof(*ohead);
+ ASSERT(dp <= end);
+
+ /* errors will abort recovery */
+ error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
+ dp, end, pass);
+ if (error)
+ return error;
+
dp += be32_to_cpu(ohead->oh_len);
num_logops--;
}
@@ -3669,7 +3733,7 @@ xlog_recover_process_efi(
*/
set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
xfs_efi_release(efip, efip->efi_format.efi_nextents);
- return XFS_ERROR(EIO);
+ return -EIO;
}
}
@@ -3969,7 +4033,7 @@ xlog_unpack_data_crc(
* CRC protection by punting an error back up the stack.
*/
if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
- return EFSCORRUPTED;
+ return -EFSCORRUPTED;
}
return 0;
@@ -4018,14 +4082,14 @@ xlog_valid_rec_header(
if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
XFS_ERRLEVEL_LOW, log->l_mp);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
if (unlikely(
(!rhead->h_version ||
(be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
__func__, be32_to_cpu(rhead->h_version));
- return XFS_ERROR(EIO);
+ return -EIO;
}
/* LR body must have data or it wouldn't have been written */
@@ -4033,12 +4097,12 @@ xlog_valid_rec_header(
if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
XFS_ERRLEVEL_LOW, log->l_mp);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
XFS_ERRLEVEL_LOW, log->l_mp);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
return 0;
}
@@ -4081,7 +4145,7 @@ xlog_do_recovery_pass(
*/
hbp = xlog_get_bp(log, 1);
if (!hbp)
- return ENOMEM;
+ return -ENOMEM;
error = xlog_bread(log, tail_blk, 1, hbp, &offset);
if (error)
@@ -4110,49 +4174,21 @@ xlog_do_recovery_pass(
}
if (!hbp)
- return ENOMEM;
+ return -ENOMEM;
dbp = xlog_get_bp(log, BTOBB(h_size));
if (!dbp) {
xlog_put_bp(hbp);
- return ENOMEM;
+ return -ENOMEM;
}
memset(rhash, 0, sizeof(rhash));
- if (tail_blk <= head_blk) {
- for (blk_no = tail_blk; blk_no < head_blk; ) {
- error = xlog_bread(log, blk_no, hblks, hbp, &offset);
- if (error)
- goto bread_err2;
-
- rhead = (xlog_rec_header_t *)offset;
- error = xlog_valid_rec_header(log, rhead, blk_no);
- if (error)
- goto bread_err2;
-
- /* blocks in data section */
- bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
- error = xlog_bread(log, blk_no + hblks, bblks, dbp,
- &offset);
- if (error)
- goto bread_err2;
-
- error = xlog_unpack_data(rhead, offset, log);
- if (error)
- goto bread_err2;
-
- error = xlog_recover_process_data(log,
- rhash, rhead, offset, pass);
- if (error)
- goto bread_err2;
- blk_no += bblks + hblks;
- }
- } else {
+ blk_no = tail_blk;
+ if (tail_blk > head_blk) {
/*
* Perform recovery around the end of the physical log.
* When the head is not on the same cycle number as the tail,
- * we can't do a sequential recovery as above.
+ * we can't do a sequential recovery.
*/
- blk_no = tail_blk;
while (blk_no < log->l_logBBsize) {
/*
* Check for header wrapping around physical end-of-log
@@ -4266,34 +4302,35 @@ xlog_do_recovery_pass(
ASSERT(blk_no >= log->l_logBBsize);
blk_no -= log->l_logBBsize;
+ }
- /* read first part of physical log */
- while (blk_no < head_blk) {
- error = xlog_bread(log, blk_no, hblks, hbp, &offset);
- if (error)
- goto bread_err2;
+ /* read first part of physical log */
+ while (blk_no < head_blk) {
+ error = xlog_bread(log, blk_no, hblks, hbp, &offset);
+ if (error)
+ goto bread_err2;
- rhead = (xlog_rec_header_t *)offset;
- error = xlog_valid_rec_header(log, rhead, blk_no);
- if (error)
- goto bread_err2;
+ rhead = (xlog_rec_header_t *)offset;
+ error = xlog_valid_rec_header(log, rhead, blk_no);
+ if (error)
+ goto bread_err2;
- bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
- error = xlog_bread(log, blk_no+hblks, bblks, dbp,
- &offset);
- if (error)
- goto bread_err2;
+ /* blocks in data section */
+ bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
+ error = xlog_bread(log, blk_no+hblks, bblks, dbp,
+ &offset);
+ if (error)
+ goto bread_err2;
- error = xlog_unpack_data(rhead, offset, log);
- if (error)
- goto bread_err2;
+ error = xlog_unpack_data(rhead, offset, log);
+ if (error)
+ goto bread_err2;
- error = xlog_recover_process_data(log, rhash,
- rhead, offset, pass);
- if (error)
- goto bread_err2;
- blk_no += bblks + hblks;
- }
+ error = xlog_recover_process_data(log, rhash,
+ rhead, offset, pass);
+ if (error)
+ goto bread_err2;
+ blk_no += bblks + hblks;
}
bread_err2:
@@ -4388,7 +4425,7 @@ xlog_do_recover(
* If IO errors happened during recovery, bail out.
*/
if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
- return (EIO);
+ return -EIO;
}
/*
@@ -4413,16 +4450,12 @@ xlog_do_recover(
XFS_BUF_UNASYNC(bp);
bp->b_ops = &xfs_sb_buf_ops;
- if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
- xfs_buf_relse(bp);
- return XFS_ERROR(EIO);
- }
-
- xfs_buf_iorequest(bp);
- error = xfs_buf_iowait(bp);
+ error = xfs_buf_submit_wait(bp);
if (error) {
- xfs_buf_ioerror_alert(bp, __func__);
- ASSERT(0);
+ if (!XFS_FORCED_SHUTDOWN(log->l_mp)) {
+ xfs_buf_ioerror_alert(bp, __func__);
+ ASSERT(0);
+ }
xfs_buf_relse(bp);
return error;
}
@@ -4492,7 +4525,19 @@ xlog_recover(
"Please recover the log on a kernel that supports the unknown features.",
(log->l_mp->m_sb.sb_features_log_incompat &
XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
- return EINVAL;
+ return -EINVAL;
+ }
+
+ /*
+ * Delay log recovery if the debug hook is set. This is debug
+ * instrumention to coordinate simulation of I/O failures with
+ * log recovery.
+ */
+ if (xfs_globals.log_recovery_delay) {
+ xfs_notice(log->l_mp,
+ "Delaying log recovery for %d seconds.",
+ xfs_globals.log_recovery_delay);
+ msleep(xfs_globals.log_recovery_delay * 1000);
}
xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 3507cd0ec400..51435dbce9c4 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -42,6 +42,7 @@
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_dinode.h"
+#include "xfs_sysfs.h"
#ifdef HAVE_PERCPU_SB
@@ -76,7 +77,7 @@ xfs_uuid_mount(
if (uuid_is_nil(uuid)) {
xfs_warn(mp, "Filesystem has nil UUID - can't mount");
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
mutex_lock(&xfs_uuid_table_mutex);
@@ -104,7 +105,7 @@ xfs_uuid_mount(
out_duplicate:
mutex_unlock(&xfs_uuid_table_mutex);
xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid);
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
STATIC void
@@ -173,13 +174,9 @@ xfs_sb_validate_fsb_count(
ASSERT(PAGE_SHIFT >= sbp->sb_blocklog);
ASSERT(sbp->sb_blocklog >= BBSHIFT);
-#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */
+ /* Limited by ULONG_MAX of page cache index */
if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
- return EFBIG;
-#else /* Limited by UINT_MAX of sectors */
- if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX)
- return EFBIG;
-#endif
+ return -EFBIG;
return 0;
}
@@ -250,9 +247,9 @@ xfs_initialize_perag(
mp->m_flags &= ~XFS_MOUNT_32BITINODES;
if (mp->m_flags & XFS_MOUNT_32BITINODES)
- index = xfs_set_inode32(mp);
+ index = xfs_set_inode32(mp, agcount);
else
- index = xfs_set_inode64(mp);
+ index = xfs_set_inode64(mp, agcount);
if (maxagi)
*maxagi = index;
@@ -303,28 +300,21 @@ xfs_readsb(
* access to the superblock.
*/
reread:
- bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
- BTOBB(sector_size), 0, buf_ops);
- if (!bp) {
- if (loud)
- xfs_warn(mp, "SB buffer read failed");
- return EIO;
- }
- if (bp->b_error) {
- error = bp->b_error;
+ error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
+ BTOBB(sector_size), 0, &bp, buf_ops);
+ if (error) {
if (loud)
xfs_warn(mp, "SB validate failed with error %d.", error);
/* bad CRC means corrupted metadata */
- if (error == EFSBADCRC)
- error = EFSCORRUPTED;
- goto release_buf;
+ if (error == -EFSBADCRC)
+ error = -EFSCORRUPTED;
+ return error;
}
/*
* Initialize the mount structure from the superblock.
*/
xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
- xfs_sb_quota_from_disk(sbp);
/*
* If we haven't validated the superblock, do so now before we try
@@ -333,7 +323,7 @@ reread:
if (sbp->sb_magicnum != XFS_SB_MAGIC) {
if (loud)
xfs_warn(mp, "Invalid superblock magic number");
- error = EINVAL;
+ error = -EINVAL;
goto release_buf;
}
@@ -344,7 +334,7 @@ reread:
if (loud)
xfs_warn(mp, "device supports %u byte sectors (not %u)",
sector_size, sbp->sb_sectsize);
- error = ENOSYS;
+ error = -ENOSYS;
goto release_buf;
}
@@ -392,7 +382,7 @@ xfs_update_alignment(xfs_mount_t *mp)
xfs_warn(mp,
"alignment check failed: sunit/swidth vs. blocksize(%d)",
sbp->sb_blocksize);
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
} else {
/*
* Convert the stripe unit and width to FSBs.
@@ -402,14 +392,14 @@ xfs_update_alignment(xfs_mount_t *mp)
xfs_warn(mp,
"alignment check failed: sunit/swidth vs. agsize(%d)",
sbp->sb_agblocks);
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
} else if (mp->m_dalign) {
mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
} else {
xfs_warn(mp,
"alignment check failed: sunit(%d) less than bsize(%d)",
mp->m_dalign, sbp->sb_blocksize);
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
}
@@ -429,7 +419,7 @@ xfs_update_alignment(xfs_mount_t *mp)
} else {
xfs_warn(mp,
"cannot change alignment: superblock does not support data alignment");
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
} else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
xfs_sb_version_hasdalign(&mp->m_sb)) {
@@ -548,40 +538,43 @@ xfs_set_inoalignment(xfs_mount_t *mp)
* Check that the data (and log if separate) is an ok size.
*/
STATIC int
-xfs_check_sizes(xfs_mount_t *mp)
+xfs_check_sizes(
+ struct xfs_mount *mp)
{
- xfs_buf_t *bp;
+ struct xfs_buf *bp;
xfs_daddr_t d;
+ int error;
d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
xfs_warn(mp, "filesystem size mismatch detected");
- return XFS_ERROR(EFBIG);
+ return -EFBIG;
}
- bp = xfs_buf_read_uncached(mp->m_ddev_targp,
+ error = xfs_buf_read_uncached(mp->m_ddev_targp,
d - XFS_FSS_TO_BB(mp, 1),
- XFS_FSS_TO_BB(mp, 1), 0, NULL);
- if (!bp) {
+ XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
+ if (error) {
xfs_warn(mp, "last sector read failed");
- return EIO;
+ return error;
}
xfs_buf_relse(bp);
- if (mp->m_logdev_targp != mp->m_ddev_targp) {
- d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
- if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
- xfs_warn(mp, "log size mismatch detected");
- return XFS_ERROR(EFBIG);
- }
- bp = xfs_buf_read_uncached(mp->m_logdev_targp,
+ if (mp->m_logdev_targp == mp->m_ddev_targp)
+ return 0;
+
+ d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
+ if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
+ xfs_warn(mp, "log size mismatch detected");
+ return -EFBIG;
+ }
+ error = xfs_buf_read_uncached(mp->m_logdev_targp,
d - XFS_FSB_TO_BB(mp, 1),
- XFS_FSB_TO_BB(mp, 1), 0, NULL);
- if (!bp) {
- xfs_warn(mp, "log device read failed");
- return EIO;
- }
- xfs_buf_relse(bp);
+ XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
+ if (error) {
+ xfs_warn(mp, "log device read failed");
+ return error;
}
+ xfs_buf_relse(bp);
return 0;
}
@@ -731,10 +724,14 @@ xfs_mountfs(
xfs_set_maxicount(mp);
- error = xfs_uuid_mount(mp);
+ error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname);
if (error)
goto out;
+ error = xfs_uuid_mount(mp);
+ if (error)
+ goto out_remove_sysfs;
+
/*
* Set the minimum read and write sizes
*/
@@ -816,7 +813,7 @@ xfs_mountfs(
if (!sbp->sb_logblocks) {
xfs_warn(mp, "no log defined");
XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
- error = XFS_ERROR(EFSCORRUPTED);
+ error = -EFSCORRUPTED;
goto out_free_perag;
}
@@ -855,7 +852,7 @@ xfs_mountfs(
!mp->m_sb.sb_inprogress) {
error = xfs_initialize_perag_data(mp, sbp->sb_agcount);
if (error)
- goto out_fail_wait;
+ goto out_log_dealloc;
}
/*
@@ -876,7 +873,7 @@ xfs_mountfs(
xfs_iunlock(rip, XFS_ILOCK_EXCL);
XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
mp);
- error = XFS_ERROR(EFSCORRUPTED);
+ error = -EFSCORRUPTED;
goto out_rele_rip;
}
mp->m_rootip = rip; /* save it */
@@ -927,7 +924,7 @@ xfs_mountfs(
xfs_notice(mp, "resetting quota flags");
error = xfs_mount_reset_sbqflags(mp);
if (error)
- return error;
+ goto out_rtunmount;
}
}
@@ -989,6 +986,8 @@ xfs_mountfs(
xfs_da_unmount(mp);
out_remove_uuid:
xfs_uuid_unmount(mp);
+ out_remove_sysfs:
+ xfs_sysfs_del(&mp->m_kobj);
out:
return error;
}
@@ -1071,6 +1070,8 @@ xfs_unmountfs(
xfs_errortag_clearall(mp, 0);
#endif
xfs_free_perag(mp);
+
+ xfs_sysfs_del(&mp->m_kobj);
}
int
@@ -1152,7 +1153,7 @@ xfs_mod_incore_sb_unlocked(
lcounter += delta;
if (lcounter < 0) {
ASSERT(0);
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
mp->m_sb.sb_icount = lcounter;
return 0;
@@ -1161,7 +1162,7 @@ xfs_mod_incore_sb_unlocked(
lcounter += delta;
if (lcounter < 0) {
ASSERT(0);
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
mp->m_sb.sb_ifree = lcounter;
return 0;
@@ -1191,7 +1192,7 @@ xfs_mod_incore_sb_unlocked(
* blocks if were allowed to.
*/
if (!rsvd)
- return XFS_ERROR(ENOSPC);
+ return -ENOSPC;
lcounter = (long long)mp->m_resblks_avail + delta;
if (lcounter >= 0) {
@@ -1202,7 +1203,7 @@ xfs_mod_incore_sb_unlocked(
"Filesystem \"%s\": reserve blocks depleted! "
"Consider increasing reserve pool size.",
mp->m_fsname);
- return XFS_ERROR(ENOSPC);
+ return -ENOSPC;
}
mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
@@ -1211,7 +1212,7 @@ xfs_mod_incore_sb_unlocked(
lcounter = (long long)mp->m_sb.sb_frextents;
lcounter += delta;
if (lcounter < 0) {
- return XFS_ERROR(ENOSPC);
+ return -ENOSPC;
}
mp->m_sb.sb_frextents = lcounter;
return 0;
@@ -1220,7 +1221,7 @@ xfs_mod_incore_sb_unlocked(
lcounter += delta;
if (lcounter < 0) {
ASSERT(0);
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
mp->m_sb.sb_dblocks = lcounter;
return 0;
@@ -1229,7 +1230,7 @@ xfs_mod_incore_sb_unlocked(
scounter += delta;
if (scounter < 0) {
ASSERT(0);
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
mp->m_sb.sb_agcount = scounter;
return 0;
@@ -1238,7 +1239,7 @@ xfs_mod_incore_sb_unlocked(
scounter += delta;
if (scounter < 0) {
ASSERT(0);
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
mp->m_sb.sb_imax_pct = scounter;
return 0;
@@ -1247,7 +1248,7 @@ xfs_mod_incore_sb_unlocked(
scounter += delta;
if (scounter < 0) {
ASSERT(0);
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
mp->m_sb.sb_rextsize = scounter;
return 0;
@@ -1256,7 +1257,7 @@ xfs_mod_incore_sb_unlocked(
scounter += delta;
if (scounter < 0) {
ASSERT(0);
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
mp->m_sb.sb_rbmblocks = scounter;
return 0;
@@ -1265,7 +1266,7 @@ xfs_mod_incore_sb_unlocked(
lcounter += delta;
if (lcounter < 0) {
ASSERT(0);
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
mp->m_sb.sb_rblocks = lcounter;
return 0;
@@ -1274,7 +1275,7 @@ xfs_mod_incore_sb_unlocked(
lcounter += delta;
if (lcounter < 0) {
ASSERT(0);
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
mp->m_sb.sb_rextents = lcounter;
return 0;
@@ -1283,13 +1284,13 @@ xfs_mod_incore_sb_unlocked(
scounter += delta;
if (scounter < 0) {
ASSERT(0);
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
mp->m_sb.sb_rextslog = scounter;
return 0;
default:
ASSERT(0);
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
}
@@ -1452,7 +1453,7 @@ xfs_dev_is_read_only(
(mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
xfs_notice(mp, "%s required on read-only device.", message);
xfs_notice(mp, "write access unavailable, cannot proceed.");
- return EROFS;
+ return -EROFS;
}
return 0;
}
@@ -1995,7 +1996,7 @@ slow_path:
* (e.g. lots of space just got freed). After that
* we are done.
*/
- if (ret != ENOSPC)
+ if (ret != -ENOSPC)
xfs_icsb_balance_counter(mp, field, 0);
xfs_icsb_unlock(mp);
return ret;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7295a0b7c343..b0447c86e7e2 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -166,6 +166,7 @@ typedef struct xfs_mount {
on the next remount,rw */
int64_t m_low_space[XFS_LOWSP_MAX];
/* low free space thresholds */
+ struct xfs_kobj m_kobj;
struct workqueue_struct *m_data_workqueue;
struct workqueue_struct *m_unwritten_workqueue;
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index f99b4933dc22..30ecca3037e3 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -304,7 +304,8 @@ _xfs_mru_cache_reap(
int
xfs_mru_cache_init(void)
{
- xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1);
+ xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache",
+ WQ_MEM_RECLAIM|WQ_FREEZABLE, 1);
if (!xfs_mru_reap_wq)
return -ENOMEM;
return 0;
@@ -337,20 +338,20 @@ xfs_mru_cache_create(
*mrup = NULL;
if (!mrup || !grp_count || !lifetime_ms || !free_func)
- return EINVAL;
+ return -EINVAL;
if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count))
- return EINVAL;
+ return -EINVAL;
if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP)))
- return ENOMEM;
+ return -ENOMEM;
/* An extra list is needed to avoid reaping up to a grp_time early. */
mru->grp_count = grp_count + 1;
mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP);
if (!mru->lists) {
- err = ENOMEM;
+ err = -ENOMEM;
goto exit;
}
@@ -434,16 +435,16 @@ xfs_mru_cache_insert(
ASSERT(mru && mru->lists);
if (!mru || !mru->lists)
- return EINVAL;
+ return -EINVAL;
if (radix_tree_preload(GFP_KERNEL))
- return ENOMEM;
+ return -ENOMEM;
INIT_LIST_HEAD(&elem->list_node);
elem->key = key;
spin_lock(&mru->lock);
- error = -radix_tree_insert(&mru->store, key, elem);
+ error = radix_tree_insert(&mru->store, key, elem);
radix_tree_preload_end();
if (!error)
_xfs_mru_cache_list_insert(mru, elem);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 6d26759c779a..d68f23021af3 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -98,18 +98,18 @@ restart:
next_index = be32_to_cpu(dqp->q_core.d_id) + 1;
error = execute(batch[i], data);
- if (error == EAGAIN) {
+ if (error == -EAGAIN) {
skipped++;
continue;
}
- if (error && last_error != EFSCORRUPTED)
+ if (error && last_error != -EFSCORRUPTED)
last_error = error;
}
mutex_unlock(&qi->qi_tree_lock);
/* bail out if the filesystem is corrupted. */
- if (last_error == EFSCORRUPTED) {
+ if (last_error == -EFSCORRUPTED) {
skipped = 0;
break;
}
@@ -138,7 +138,7 @@ xfs_qm_dqpurge(
xfs_dqlock(dqp);
if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
xfs_dqunlock(dqp);
- return EAGAIN;
+ return -EAGAIN;
}
dqp->dq_flags |= XFS_DQ_FREEING;
@@ -221,100 +221,6 @@ xfs_qm_unmount(
}
}
-
-/*
- * This is called from xfs_mountfs to start quotas and initialize all
- * necessary data structures like quotainfo. This is also responsible for
- * running a quotacheck as necessary. We are guaranteed that the superblock
- * is consistently read in at this point.
- *
- * If we fail here, the mount will continue with quota turned off. We don't
- * need to inidicate success or failure at all.
- */
-void
-xfs_qm_mount_quotas(
- xfs_mount_t *mp)
-{
- int error = 0;
- uint sbf;
-
- /*
- * If quotas on realtime volumes is not supported, we disable
- * quotas immediately.
- */
- if (mp->m_sb.sb_rextents) {
- xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
- mp->m_qflags = 0;
- goto write_changes;
- }
-
- ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-
- /*
- * Allocate the quotainfo structure inside the mount struct, and
- * create quotainode(s), and change/rev superblock if necessary.
- */
- error = xfs_qm_init_quotainfo(mp);
- if (error) {
- /*
- * We must turn off quotas.
- */
- ASSERT(mp->m_quotainfo == NULL);
- mp->m_qflags = 0;
- goto write_changes;
- }
- /*
- * If any of the quotas are not consistent, do a quotacheck.
- */
- if (XFS_QM_NEED_QUOTACHECK(mp)) {
- error = xfs_qm_quotacheck(mp);
- if (error) {
- /* Quotacheck failed and disabled quotas. */
- return;
- }
- }
- /*
- * If one type of quotas is off, then it will lose its
- * quotachecked status, since we won't be doing accounting for
- * that type anymore.
- */
- if (!XFS_IS_UQUOTA_ON(mp))
- mp->m_qflags &= ~XFS_UQUOTA_CHKD;
- if (!XFS_IS_GQUOTA_ON(mp))
- mp->m_qflags &= ~XFS_GQUOTA_CHKD;
- if (!XFS_IS_PQUOTA_ON(mp))
- mp->m_qflags &= ~XFS_PQUOTA_CHKD;
-
- write_changes:
- /*
- * We actually don't have to acquire the m_sb_lock at all.
- * This can only be called from mount, and that's single threaded. XXX
- */
- spin_lock(&mp->m_sb_lock);
- sbf = mp->m_sb.sb_qflags;
- mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL;
- spin_unlock(&mp->m_sb_lock);
-
- if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
- if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) {
- /*
- * We could only have been turning quotas off.
- * We aren't in very good shape actually because
- * the incore structures are convinced that quotas are
- * off, but the on disk superblock doesn't know that !
- */
- ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
- xfs_alert(mp, "%s: Superblock update failed!",
- __func__);
- }
- }
-
- if (error) {
- xfs_warn(mp, "Failed to initialize disk quotas.");
- return;
- }
-}
-
/*
* Called from the vfsops layer.
*/
@@ -528,6 +434,7 @@ xfs_qm_dquot_isolate(
struct list_head *item,
spinlock_t *lru_lock,
void *arg)
+ __releases(lru_lock) __acquires(lru_lock)
{
struct xfs_dquot *dqp = container_of(item,
struct xfs_dquot, q_lru);
@@ -671,7 +578,7 @@ xfs_qm_init_quotainfo(
qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
- error = -list_lru_init(&qinf->qi_lru);
+ error = list_lru_init(&qinf->qi_lru);
if (error)
goto out_free_qinf;
@@ -995,7 +902,7 @@ xfs_qm_dqiter_bufs(
* will leave a trace in the log indicating corruption has
* been detected.
*/
- if (error == EFSCORRUPTED) {
+ if (error == -EFSCORRUPTED) {
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, bno),
mp->m_quotainfo->qi_dqchunklen, 0, &bp,
@@ -1005,6 +912,12 @@ xfs_qm_dqiter_bufs(
if (error)
break;
+ /*
+ * A corrupt buffer might not have a verifier attached, so
+ * make sure we have the correct one attached before writeback
+ * occurs.
+ */
+ bp->b_ops = &xfs_dquot_buf_ops;
xfs_qm_reset_dqcounts(mp, bp, firstid, type);
xfs_buf_delwri_queue(bp, buffer_list);
xfs_buf_relse(bp);
@@ -1090,7 +1003,7 @@ xfs_qm_dqiterate(
xfs_buf_readahead(mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, rablkno),
mp->m_quotainfo->qi_dqchunklen,
- NULL);
+ &xfs_dquot_buf_ops);
rablkno++;
}
}
@@ -1138,8 +1051,8 @@ xfs_qm_quotacheck_dqadjust(
/*
* Shouldn't be able to turn off quotas here.
*/
- ASSERT(error != ESRCH);
- ASSERT(error != ENOENT);
+ ASSERT(error != -ESRCH);
+ ASSERT(error != -ENOENT);
return error;
}
@@ -1226,7 +1139,7 @@ xfs_qm_dqusage_adjust(
*/
if (xfs_is_quota_inode(&mp->m_sb, ino)) {
*res = BULKSTAT_RV_NOTHING;
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
/*
@@ -1330,7 +1243,7 @@ out_unlock:
* Walk thru all the filesystem inodes and construct a consistent view
* of the disk quota world. If the quotacheck fails, disable quotas.
*/
-int
+STATIC int
xfs_qm_quotacheck(
xfs_mount_t *mp)
{
@@ -1463,7 +1376,100 @@ xfs_qm_quotacheck(
}
} else
xfs_notice(mp, "Quotacheck: Done.");
- return (error);
+ return error;
+}
+
+/*
+ * This is called from xfs_mountfs to start quotas and initialize all
+ * necessary data structures like quotainfo. This is also responsible for
+ * running a quotacheck as necessary. We are guaranteed that the superblock
+ * is consistently read in at this point.
+ *
+ * If we fail here, the mount will continue with quota turned off. We don't
+ * need to inidicate success or failure at all.
+ */
+void
+xfs_qm_mount_quotas(
+ struct xfs_mount *mp)
+{
+ int error = 0;
+ uint sbf;
+
+ /*
+ * If quotas on realtime volumes is not supported, we disable
+ * quotas immediately.
+ */
+ if (mp->m_sb.sb_rextents) {
+ xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
+ mp->m_qflags = 0;
+ goto write_changes;
+ }
+
+ ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+ /*
+ * Allocate the quotainfo structure inside the mount struct, and
+ * create quotainode(s), and change/rev superblock if necessary.
+ */
+ error = xfs_qm_init_quotainfo(mp);
+ if (error) {
+ /*
+ * We must turn off quotas.
+ */
+ ASSERT(mp->m_quotainfo == NULL);
+ mp->m_qflags = 0;
+ goto write_changes;
+ }
+ /*
+ * If any of the quotas are not consistent, do a quotacheck.
+ */
+ if (XFS_QM_NEED_QUOTACHECK(mp)) {
+ error = xfs_qm_quotacheck(mp);
+ if (error) {
+ /* Quotacheck failed and disabled quotas. */
+ return;
+ }
+ }
+ /*
+ * If one type of quotas is off, then it will lose its
+ * quotachecked status, since we won't be doing accounting for
+ * that type anymore.
+ */
+ if (!XFS_IS_UQUOTA_ON(mp))
+ mp->m_qflags &= ~XFS_UQUOTA_CHKD;
+ if (!XFS_IS_GQUOTA_ON(mp))
+ mp->m_qflags &= ~XFS_GQUOTA_CHKD;
+ if (!XFS_IS_PQUOTA_ON(mp))
+ mp->m_qflags &= ~XFS_PQUOTA_CHKD;
+
+ write_changes:
+ /*
+ * We actually don't have to acquire the m_sb_lock at all.
+ * This can only be called from mount, and that's single threaded. XXX
+ */
+ spin_lock(&mp->m_sb_lock);
+ sbf = mp->m_sb.sb_qflags;
+ mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL;
+ spin_unlock(&mp->m_sb_lock);
+
+ if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
+ if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) {
+ /*
+ * We could only have been turning quotas off.
+ * We aren't in very good shape actually because
+ * the incore structures are convinced that quotas are
+ * off, but the on disk superblock doesn't know that !
+ */
+ ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
+ xfs_alert(mp, "%s: Superblock update failed!",
+ __func__);
+ }
+ }
+
+ if (error) {
+ xfs_warn(mp, "Failed to initialize disk quotas.");
+ return;
+ }
}
/*
@@ -1493,7 +1499,7 @@ xfs_qm_init_quotainos(
error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
0, 0, &uip);
if (error)
- return XFS_ERROR(error);
+ return error;
}
if (XFS_IS_GQUOTA_ON(mp) &&
mp->m_sb.sb_gquotino != NULLFSINO) {
@@ -1563,7 +1569,7 @@ error_rele:
IRELE(gip);
if (pip)
IRELE(pip);
- return XFS_ERROR(error);
+ return error;
}
STATIC void
@@ -1679,7 +1685,7 @@ xfs_qm_vop_dqalloc(
XFS_QMOPT_DOWARN,
&uq);
if (error) {
- ASSERT(error != ENOENT);
+ ASSERT(error != -ENOENT);
return error;
}
/*
@@ -1706,7 +1712,7 @@ xfs_qm_vop_dqalloc(
XFS_QMOPT_DOWARN,
&gq);
if (error) {
- ASSERT(error != ENOENT);
+ ASSERT(error != -ENOENT);
goto error_rele;
}
xfs_dqunlock(gq);
@@ -1726,7 +1732,7 @@ xfs_qm_vop_dqalloc(
XFS_QMOPT_DOWARN,
&pq);
if (error) {
- ASSERT(error != ENOENT);
+ ASSERT(error != -ENOENT);
goto error_rele;
}
xfs_dqunlock(pq);
@@ -1895,7 +1901,7 @@ xfs_qm_vop_chown_reserve(
-((xfs_qcnt_t)delblks), 0, blkflags);
}
- return (0);
+ return 0;
}
int
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 797fd4636273..3a07a937e232 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -157,7 +157,6 @@ struct xfs_dquot_acct {
#define XFS_QM_RTBWARNLIMIT 5
extern void xfs_qm_destroy_quotainfo(struct xfs_mount *);
-extern int xfs_qm_quotacheck(struct xfs_mount *);
extern int xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t);
/* dquot stuff */
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index e9be63abd8d2..2c61e61b0205 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -117,7 +117,7 @@ xfs_qm_newmount(
(uquotaondisk ? " usrquota" : ""),
(gquotaondisk ? " grpquota" : ""),
(pquotaondisk ? " prjquota" : ""));
- return XFS_ERROR(EPERM);
+ return -EPERM;
}
if (XFS_IS_QUOTA_ON(mp) || quotaondisk) {
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index bbc813caba4c..80f2d77d929a 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -64,10 +64,10 @@ xfs_qm_scall_quotaoff(
/*
* No file system can have quotas enabled on disk but not in core.
* Note that quota utilities (like quotaoff) _expect_
- * errno == EEXIST here.
+ * errno == -EEXIST here.
*/
if ((mp->m_qflags & flags) == 0)
- return XFS_ERROR(EEXIST);
+ return -EEXIST;
error = 0;
flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
@@ -94,7 +94,7 @@ xfs_qm_scall_quotaoff(
/* XXX what to do if error ? Revert back to old vals incore ? */
error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
- return (error);
+ return error;
}
dqtype = 0;
@@ -198,7 +198,7 @@ xfs_qm_scall_quotaoff(
if (mp->m_qflags == 0) {
mutex_unlock(&q->qi_quotaofflock);
xfs_qm_destroy_quotainfo(mp);
- return (0);
+ return 0;
}
/*
@@ -278,13 +278,13 @@ xfs_qm_scall_trunc_qfiles(
xfs_mount_t *mp,
uint flags)
{
- int error = EINVAL;
+ int error = -EINVAL;
if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 ||
(flags & ~XFS_DQ_ALLTYPES)) {
xfs_debug(mp, "%s: flags=%x m_qflags=%x",
__func__, flags, mp->m_qflags);
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
if (flags & XFS_DQ_USER) {
@@ -328,7 +328,7 @@ xfs_qm_scall_quotaon(
if (flags == 0) {
xfs_debug(mp, "%s: zero flags, m_qflags=%x",
__func__, mp->m_qflags);
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
/* No fs can turn on quotas with a delayed effect */
@@ -351,13 +351,13 @@ xfs_qm_scall_quotaon(
xfs_debug(mp,
"%s: Can't enforce without acct, flags=%x sbflags=%x",
__func__, flags, mp->m_sb.sb_qflags);
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
/*
* If everything's up to-date incore, then don't waste time.
*/
if ((mp->m_qflags & flags) == flags)
- return XFS_ERROR(EEXIST);
+ return -EEXIST;
/*
* Change sb_qflags on disk but not incore mp->qflags
@@ -372,11 +372,11 @@ xfs_qm_scall_quotaon(
* There's nothing to change if it's the same.
*/
if ((qf & flags) == flags && sbflags == 0)
- return XFS_ERROR(EEXIST);
+ return -EEXIST;
sbflags |= XFS_SB_QFLAGS;
if ((error = xfs_qm_write_sb_changes(mp, sbflags)))
- return (error);
+ return error;
/*
* If we aren't trying to switch on quota enforcement, we are done.
*/
@@ -387,10 +387,10 @@ xfs_qm_scall_quotaon(
((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) !=
(mp->m_qflags & XFS_GQUOTA_ACCT)) ||
(flags & XFS_ALL_QUOTA_ENFD) == 0)
- return (0);
+ return 0;
if (! XFS_IS_QUOTA_RUNNING(mp))
- return XFS_ERROR(ESRCH);
+ return -ESRCH;
/*
* Switch on quota enforcement in core.
@@ -399,7 +399,7 @@ xfs_qm_scall_quotaon(
mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
- return (0);
+ return 0;
}
@@ -426,7 +426,7 @@ xfs_qm_scall_getqstat(
if (!xfs_sb_version_hasquota(&mp->m_sb)) {
out->qs_uquota.qfs_ino = NULLFSINO;
out->qs_gquota.qfs_ino = NULLFSINO;
- return (0);
+ return 0;
}
out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
@@ -514,7 +514,7 @@ xfs_qm_scall_getqstatv(
out->qs_uquota.qfs_ino = NULLFSINO;
out->qs_gquota.qfs_ino = NULLFSINO;
out->qs_pquota.qfs_ino = NULLFSINO;
- return (0);
+ return 0;
}
out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
@@ -595,7 +595,7 @@ xfs_qm_scall_setqlim(
xfs_qcnt_t hard, soft;
if (newlim->d_fieldmask & ~XFS_DQ_MASK)
- return EINVAL;
+ return -EINVAL;
if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0)
return 0;
@@ -615,7 +615,7 @@ xfs_qm_scall_setqlim(
*/
error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp);
if (error) {
- ASSERT(error != ENOENT);
+ ASSERT(error != -ENOENT);
goto out_unlock;
}
xfs_dqunlock(dqp);
@@ -758,7 +758,7 @@ xfs_qm_log_quotaoff_end(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0);
if (error) {
xfs_trans_cancel(tp, 0);
- return (error);
+ return error;
}
qoffi = xfs_trans_get_qoff_item(tp, startqoff,
@@ -772,7 +772,7 @@ xfs_qm_log_quotaoff_end(
*/
xfs_trans_set_sync(tp);
error = xfs_trans_commit(tp, 0);
- return (error);
+ return error;
}
@@ -822,7 +822,7 @@ error0:
spin_unlock(&mp->m_sb_lock);
}
*qoffstartp = qoffi;
- return (error);
+ return error;
}
@@ -850,7 +850,7 @@ xfs_qm_scall_getquota(
* our utility programs are concerned.
*/
if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
- error = XFS_ERROR(ENOENT);
+ error = -ENOENT;
goto out_put;
}
@@ -953,7 +953,7 @@ xfs_qm_export_flags(
uflags |= FS_QUOTA_GDQ_ENFD;
if (flags & XFS_PQUOTA_ENFD)
uflags |= FS_QUOTA_PDQ_ENFD;
- return (uflags);
+ return uflags;
}
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 2ad1b9822e92..b238027df987 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -51,7 +51,7 @@ xfs_fs_get_xstate(
if (!XFS_IS_QUOTA_RUNNING(mp))
return -ENOSYS;
- return -xfs_qm_scall_getqstat(mp, fqs);
+ return xfs_qm_scall_getqstat(mp, fqs);
}
STATIC int
@@ -63,7 +63,7 @@ xfs_fs_get_xstatev(
if (!XFS_IS_QUOTA_RUNNING(mp))
return -ENOSYS;
- return -xfs_qm_scall_getqstatv(mp, fqs);
+ return xfs_qm_scall_getqstatv(mp, fqs);
}
STATIC int
@@ -95,11 +95,11 @@ xfs_fs_set_xstate(
switch (op) {
case Q_XQUOTAON:
- return -xfs_qm_scall_quotaon(mp, flags);
+ return xfs_qm_scall_quotaon(mp, flags);
case Q_XQUOTAOFF:
if (!XFS_IS_QUOTA_ON(mp))
return -EINVAL;
- return -xfs_qm_scall_quotaoff(mp, flags);
+ return xfs_qm_scall_quotaoff(mp, flags);
}
return -EINVAL;
@@ -112,7 +112,7 @@ xfs_fs_rm_xquota(
{
struct xfs_mount *mp = XFS_M(sb);
unsigned int flags = 0;
-
+
if (sb->s_flags & MS_RDONLY)
return -EROFS;
@@ -123,11 +123,11 @@ xfs_fs_rm_xquota(
flags |= XFS_DQ_USER;
if (uflags & FS_GROUP_QUOTA)
flags |= XFS_DQ_GROUP;
- if (uflags & FS_USER_QUOTA)
+ if (uflags & FS_PROJ_QUOTA)
flags |= XFS_DQ_PROJ;
- return -xfs_qm_scall_trunc_qfiles(mp, flags);
-}
+ return xfs_qm_scall_trunc_qfiles(mp, flags);
+}
STATIC int
xfs_fs_get_dqblk(
@@ -142,7 +142,7 @@ xfs_fs_get_dqblk(
if (!XFS_IS_QUOTA_ON(mp))
return -ESRCH;
- return -xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid),
+ return xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid),
xfs_quota_type(qid.type), fdq);
}
@@ -161,7 +161,7 @@ xfs_fs_set_dqblk(
if (!XFS_IS_QUOTA_ON(mp))
return -ESRCH;
- return -xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid),
+ return xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid),
xfs_quota_type(qid.type), fdq);
}
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index ec5ca65c6211..e1175ea9b551 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -46,7 +46,7 @@
* Keeps track of a current summary block, so we don't keep reading
* it from the buffer cache.
*/
-STATIC int /* error */
+static int
xfs_rtget_summary(
xfs_mount_t *mp, /* file system mount structure */
xfs_trans_t *tp, /* transaction pointer */
@@ -56,60 +56,9 @@ xfs_rtget_summary(
xfs_fsblock_t *rsb, /* in/out: summary block number */
xfs_suminfo_t *sum) /* out: summary info for this block */
{
- xfs_buf_t *bp; /* buffer for summary block */
- int error; /* error value */
- xfs_fsblock_t sb; /* summary fsblock */
- int so; /* index into the summary file */
- xfs_suminfo_t *sp; /* pointer to returned data */
-
- /*
- * Compute entry number in the summary file.
- */
- so = XFS_SUMOFFS(mp, log, bbno);
- /*
- * Compute the block number in the summary file.
- */
- sb = XFS_SUMOFFSTOBLOCK(mp, so);
- /*
- * If we have an old buffer, and the block number matches, use that.
- */
- if (rbpp && *rbpp && *rsb == sb)
- bp = *rbpp;
- /*
- * Otherwise we have to get the buffer.
- */
- else {
- /*
- * If there was an old one, get rid of it first.
- */
- if (rbpp && *rbpp)
- xfs_trans_brelse(tp, *rbpp);
- error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
- if (error) {
- return error;
- }
- /*
- * Remember this buffer and block for the next call.
- */
- if (rbpp) {
- *rbpp = bp;
- *rsb = sb;
- }
- }
- /*
- * Point to the summary information & copy it out.
- */
- sp = XFS_SUMPTR(mp, bp, so);
- *sum = *sp;
- /*
- * Drop the buffer if we're not asked to remember it.
- */
- if (!rbpp)
- xfs_trans_brelse(tp, bp);
- return 0;
+ return xfs_rtmodify_summary_int(mp, tp, log, bbno, 0, rbpp, rsb, sum);
}
-
/*
* Return whether there are any free extents in the size range given
* by low and high, for the bitmap block bbno.
@@ -863,7 +812,7 @@ xfs_growfs_rt_alloc(
XFS_BMAPI_METADATA, &firstblock,
resblks, &map, &nmap, &flist);
if (!error && nmap < 1)
- error = XFS_ERROR(ENOSPC);
+ error = -ENOSPC;
if (error)
goto error_cancel;
/*
@@ -903,7 +852,7 @@ xfs_growfs_rt_alloc(
bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
mp->m_bsize, 0);
if (bp == NULL) {
- error = XFS_ERROR(EIO);
+ error = -EIO;
error_cancel:
xfs_trans_cancel(tp, cancelflags);
goto error;
@@ -944,9 +893,9 @@ xfs_growfs_rt(
xfs_buf_t *bp; /* temporary buffer */
int error; /* error return value */
xfs_mount_t *nmp; /* new (fake) mount structure */
- xfs_drfsbno_t nrblocks; /* new number of realtime blocks */
+ xfs_rfsblock_t nrblocks; /* new number of realtime blocks */
xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */
- xfs_drtbno_t nrextents; /* new number of realtime extents */
+ xfs_rtblock_t nrextents; /* new number of realtime extents */
uint8_t nrextslog; /* new log2 of sb_rextents */
xfs_extlen_t nrsumblocks; /* new number of summary blocks */
uint nrsumlevels; /* new rt summary levels */
@@ -962,26 +911,21 @@ xfs_growfs_rt(
* Initial error checking.
*/
if (!capable(CAP_SYS_ADMIN))
- return XFS_ERROR(EPERM);
+ return -EPERM;
if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL ||
(nrblocks = in->newblocks) <= sbp->sb_rblocks ||
(sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize)))
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
if ((error = xfs_sb_validate_fsb_count(sbp, nrblocks)))
return error;
/*
* Read in the last block of the device, make sure it exists.
*/
- bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
+ error = xfs_buf_read_uncached(mp->m_rtdev_targp,
XFS_FSB_TO_BB(mp, nrblocks - 1),
- XFS_FSB_TO_BB(mp, 1), 0, NULL);
- if (!bp)
- return EIO;
- if (bp->b_error) {
- error = bp->b_error;
- xfs_buf_relse(bp);
+ XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
+ if (error)
return error;
- }
xfs_buf_relse(bp);
/*
@@ -1001,7 +945,7 @@ xfs_growfs_rt(
* since we'll log basically the whole summary file at once.
*/
if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1))
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
/*
* Get the old block counts for bitmap and summary inodes.
* These can't change since other growfs callers are locked out.
@@ -1208,7 +1152,7 @@ xfs_rtallocate_extent(
len, &sumbp, &sb, prod, &r);
break;
default:
- error = EIO;
+ error = -EIO;
ASSERT(0);
}
if (error)
@@ -1235,11 +1179,12 @@ xfs_rtallocate_extent(
*/
int /* error */
xfs_rtmount_init(
- xfs_mount_t *mp) /* file system mount structure */
+ struct xfs_mount *mp) /* file system mount structure */
{
- xfs_buf_t *bp; /* buffer for last block of subvolume */
- xfs_daddr_t d; /* address of last block of subvolume */
- xfs_sb_t *sbp; /* filesystem superblock copy in mount */
+ struct xfs_buf *bp; /* buffer for last block of subvolume */
+ struct xfs_sb *sbp; /* filesystem superblock copy in mount */
+ xfs_daddr_t d; /* address of last block of subvolume */
+ int error;
sbp = &mp->m_sb;
if (sbp->sb_rblocks == 0)
@@ -1247,7 +1192,7 @@ xfs_rtmount_init(
if (mp->m_rtdev_targp == NULL) {
xfs_warn(mp,
"Filesystem has a realtime volume, use rtdev=device option");
- return XFS_ERROR(ENODEV);
+ return -ENODEV;
}
mp->m_rsumlevels = sbp->sb_rextslog + 1;
mp->m_rsumsize =
@@ -1263,16 +1208,14 @@ xfs_rtmount_init(
xfs_warn(mp, "realtime mount -- %llu != %llu",
(unsigned long long) XFS_BB_TO_FSB(mp, d),
(unsigned long long) mp->m_sb.sb_rblocks);
- return XFS_ERROR(EFBIG);
+ return -EFBIG;
}
- bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
+ error = xfs_buf_read_uncached(mp->m_rtdev_targp,
d - XFS_FSB_TO_BB(mp, 1),
- XFS_FSB_TO_BB(mp, 1), 0, NULL);
- if (!bp || bp->b_error) {
+ XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
+ if (error) {
xfs_warn(mp, "realtime device size check failed");
- if (bp)
- xfs_buf_relse(bp);
- return EIO;
+ return error;
}
xfs_buf_relse(bp);
return 0;
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 752b63d10300..76c0a4a9bb17 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -111,6 +111,10 @@ int xfs_rtfind_forw(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtblock_t *rtblock);
int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtblock_t start, xfs_extlen_t len, int val);
+int xfs_rtmodify_summary_int(struct xfs_mount *mp, struct xfs_trans *tp,
+ int log, xfs_rtblock_t bbno, int delta,
+ xfs_buf_t **rbpp, xfs_fsblock_t *rsb,
+ xfs_suminfo_t *sum);
int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log,
xfs_rtblock_t bbno, int delta, xfs_buf_t **rbpp,
xfs_fsblock_t *rsb);
@@ -132,7 +136,7 @@ xfs_rtmount_init(
return 0;
xfs_warn(mp, "Not built with CONFIG_XFS_RT");
- return ENOSYS;
+ return -ENOSYS;
}
# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
# define xfs_rtunmount_inodes(m)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 8f0333b3f7a0..9f622feda6a4 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -47,6 +47,7 @@
#include "xfs_dinode.h"
#include "xfs_filestream.h"
#include "xfs_quota.h"
+#include "xfs_sysfs.h"
#include <linux/namei.h>
#include <linux/init.h>
@@ -62,6 +63,11 @@ static const struct super_operations xfs_super_operations;
static kmem_zone_t *xfs_ioend_zone;
mempool_t *xfs_ioend_pool;
+static struct kset *xfs_kset; /* top-level xfs sysfs dir */
+#ifdef DEBUG
+static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */
+#endif
+
#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */
#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */
#define MNTOPT_LOGDEV "logdev" /* log device */
@@ -185,7 +191,7 @@ xfs_parseargs(
*/
mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
if (!mp->m_fsname)
- return ENOMEM;
+ return -ENOMEM;
mp->m_fsname_len = strlen(mp->m_fsname) + 1;
/*
@@ -204,9 +210,6 @@ xfs_parseargs(
*/
mp->m_flags |= XFS_MOUNT_BARRIER;
mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
-#if !XFS_BIG_INUMS
- mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
-#endif
/*
* These can be overridden by the mount option parsing.
@@ -227,57 +230,57 @@ xfs_parseargs(
if (!value || !*value) {
xfs_warn(mp, "%s option requires an argument",
this_char);
- return EINVAL;
+ return -EINVAL;
}
if (kstrtoint(value, 10, &mp->m_logbufs))
- return EINVAL;
+ return -EINVAL;
} else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
if (!value || !*value) {
xfs_warn(mp, "%s option requires an argument",
this_char);
- return EINVAL;
+ return -EINVAL;
}
if (suffix_kstrtoint(value, 10, &mp->m_logbsize))
- return EINVAL;
+ return -EINVAL;
} else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
if (!value || !*value) {
xfs_warn(mp, "%s option requires an argument",
this_char);
- return EINVAL;
+ return -EINVAL;
}
mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
if (!mp->m_logname)
- return ENOMEM;
+ return -ENOMEM;
} else if (!strcmp(this_char, MNTOPT_MTPT)) {
xfs_warn(mp, "%s option not allowed on this system",
this_char);
- return EINVAL;
+ return -EINVAL;
} else if (!strcmp(this_char, MNTOPT_RTDEV)) {
if (!value || !*value) {
xfs_warn(mp, "%s option requires an argument",
this_char);
- return EINVAL;
+ return -EINVAL;
}
mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
if (!mp->m_rtname)
- return ENOMEM;
+ return -ENOMEM;
} else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
if (!value || !*value) {
xfs_warn(mp, "%s option requires an argument",
this_char);
- return EINVAL;
+ return -EINVAL;
}
if (kstrtoint(value, 10, &iosize))
- return EINVAL;
+ return -EINVAL;
iosizelog = ffs(iosize) - 1;
} else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
if (!value || !*value) {
xfs_warn(mp, "%s option requires an argument",
this_char);
- return EINVAL;
+ return -EINVAL;
}
if (suffix_kstrtoint(value, 10, &iosize))
- return EINVAL;
+ return -EINVAL;
iosizelog = ffs(iosize) - 1;
} else if (!strcmp(this_char, MNTOPT_GRPID) ||
!strcmp(this_char, MNTOPT_BSDGROUPS)) {
@@ -297,27 +300,22 @@ xfs_parseargs(
if (!value || !*value) {
xfs_warn(mp, "%s option requires an argument",
this_char);
- return EINVAL;
+ return -EINVAL;
}
if (kstrtoint(value, 10, &dsunit))
- return EINVAL;
+ return -EINVAL;
} else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
if (!value || !*value) {
xfs_warn(mp, "%s option requires an argument",
this_char);
- return EINVAL;
+ return -EINVAL;
}
if (kstrtoint(value, 10, &dswidth))
- return EINVAL;
+ return -EINVAL;
} else if (!strcmp(this_char, MNTOPT_32BITINODE)) {
mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
} else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
-#if !XFS_BIG_INUMS
- xfs_warn(mp, "%s option not allowed on this system",
- this_char);
- return EINVAL;
-#endif
} else if (!strcmp(this_char, MNTOPT_NOUUID)) {
mp->m_flags |= XFS_MOUNT_NOUUID;
} else if (!strcmp(this_char, MNTOPT_BARRIER)) {
@@ -390,7 +388,7 @@ xfs_parseargs(
"irixsgid is now a sysctl(2) variable, option is deprecated.");
} else {
xfs_warn(mp, "unknown mount option [%s].", this_char);
- return EINVAL;
+ return -EINVAL;
}
}
@@ -400,32 +398,32 @@ xfs_parseargs(
if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
!(mp->m_flags & XFS_MOUNT_RDONLY)) {
xfs_warn(mp, "no-recovery mounts must be read-only.");
- return EINVAL;
+ return -EINVAL;
}
if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
xfs_warn(mp,
"sunit and swidth options incompatible with the noalign option");
- return EINVAL;
+ return -EINVAL;
}
#ifndef CONFIG_XFS_QUOTA
if (XFS_IS_QUOTA_RUNNING(mp)) {
xfs_warn(mp, "quota support not available in this kernel.");
- return EINVAL;
+ return -EINVAL;
}
#endif
if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
xfs_warn(mp, "sunit and swidth must be specified together");
- return EINVAL;
+ return -EINVAL;
}
if (dsunit && (dswidth % dsunit != 0)) {
xfs_warn(mp,
"stripe width (%d) must be a multiple of the stripe unit (%d)",
dswidth, dsunit);
- return EINVAL;
+ return -EINVAL;
}
done:
@@ -446,7 +444,7 @@ done:
mp->m_logbufs > XLOG_MAX_ICLOGS)) {
xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]",
mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
if (mp->m_logbsize != -1 &&
mp->m_logbsize != 0 &&
@@ -456,7 +454,7 @@ done:
xfs_warn(mp,
"invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
mp->m_logbsize);
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
if (iosizelog) {
@@ -465,7 +463,7 @@ done:
xfs_warn(mp, "invalid log iosize: %d [not %d-%d]",
iosizelog, XFS_MIN_IO_LOG,
XFS_MAX_IO_LOG);
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
@@ -597,15 +595,20 @@ xfs_max_file_offset(
return (((__uint64_t)pagefactor) << bitshift) - 1;
}
+/*
+ * xfs_set_inode32() and xfs_set_inode64() are passed an agcount
+ * because in the growfs case, mp->m_sb.sb_agcount is not updated
+ * yet to the potentially higher ag count.
+ */
xfs_agnumber_t
-xfs_set_inode32(struct xfs_mount *mp)
+xfs_set_inode32(struct xfs_mount *mp, xfs_agnumber_t agcount)
{
xfs_agnumber_t index = 0;
xfs_agnumber_t maxagi = 0;
xfs_sb_t *sbp = &mp->m_sb;
xfs_agnumber_t max_metadata;
- xfs_agino_t agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks -1, 0);
- xfs_ino_t ino = XFS_AGINO_TO_INO(mp, sbp->sb_agcount -1, agino);
+ xfs_agino_t agino;
+ xfs_ino_t ino;
xfs_perag_t *pag;
/* Calculate how much should be reserved for inodes to meet
@@ -620,10 +623,12 @@ xfs_set_inode32(struct xfs_mount *mp)
do_div(icount, sbp->sb_agblocks);
max_metadata = icount;
} else {
- max_metadata = sbp->sb_agcount;
+ max_metadata = agcount;
}
- for (index = 0; index < sbp->sb_agcount; index++) {
+ agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
+
+ for (index = 0; index < agcount; index++) {
ino = XFS_AGINO_TO_INO(mp, index, agino);
if (ino > XFS_MAXINUMBER_32) {
@@ -648,11 +653,11 @@ xfs_set_inode32(struct xfs_mount *mp)
}
xfs_agnumber_t
-xfs_set_inode64(struct xfs_mount *mp)
+xfs_set_inode64(struct xfs_mount *mp, xfs_agnumber_t agcount)
{
xfs_agnumber_t index = 0;
- for (index = 0; index < mp->m_sb.sb_agcount; index++) {
+ for (index = 0; index < agcount; index++) {
struct xfs_perag *pag;
pag = xfs_perag_get(mp, index);
@@ -686,7 +691,7 @@ xfs_blkdev_get(
xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error);
}
- return -error;
+ return error;
}
STATIC void
@@ -756,7 +761,7 @@ xfs_open_devices(
if (rtdev == ddev || rtdev == logdev) {
xfs_warn(mp,
"Cannot mount filesystem with identical rtdev and ddev/logdev.");
- error = EINVAL;
+ error = -EINVAL;
goto out_close_rtdev;
}
}
@@ -764,7 +769,7 @@ xfs_open_devices(
/*
* Setup xfs_mount buffer target pointers
*/
- error = ENOMEM;
+ error = -ENOMEM;
mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev);
if (!mp->m_ddev_targp)
goto out_close_rtdev;
@@ -838,32 +843,32 @@ xfs_init_mount_workqueues(
struct xfs_mount *mp)
{
mp->m_data_workqueue = alloc_workqueue("xfs-data/%s",
- WQ_MEM_RECLAIM, 0, mp->m_fsname);
+ WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
if (!mp->m_data_workqueue)
goto out;
mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
- WQ_MEM_RECLAIM, 0, mp->m_fsname);
+ WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
if (!mp->m_unwritten_workqueue)
goto out_destroy_data_iodone_queue;
mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
- WQ_MEM_RECLAIM, 0, mp->m_fsname);
+ WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
if (!mp->m_cil_workqueue)
goto out_destroy_unwritten;
mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
- 0, 0, mp->m_fsname);
+ WQ_FREEZABLE, 0, mp->m_fsname);
if (!mp->m_reclaim_workqueue)
goto out_destroy_cil;
mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
- 0, 0, mp->m_fsname);
+ WQ_FREEZABLE, 0, mp->m_fsname);
if (!mp->m_log_workqueue)
goto out_destroy_reclaim;
mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
- 0, 0, mp->m_fsname);
+ WQ_FREEZABLE, 0, mp->m_fsname);
if (!mp->m_eofblocks_workqueue)
goto out_destroy_log;
@@ -1188,6 +1193,7 @@ xfs_fs_remount(
char *options)
{
struct xfs_mount *mp = XFS_M(sb);
+ xfs_sb_t *sbp = &mp->m_sb;
substring_t args[MAX_OPT_ARGS];
char *p;
int error;
@@ -1208,10 +1214,10 @@ xfs_fs_remount(
mp->m_flags &= ~XFS_MOUNT_BARRIER;
break;
case Opt_inode64:
- mp->m_maxagi = xfs_set_inode64(mp);
+ mp->m_maxagi = xfs_set_inode64(mp, sbp->sb_agcount);
break;
case Opt_inode32:
- mp->m_maxagi = xfs_set_inode32(mp);
+ mp->m_maxagi = xfs_set_inode32(mp, sbp->sb_agcount);
break;
default:
/*
@@ -1295,7 +1301,7 @@ xfs_fs_freeze(
xfs_save_resvblks(mp);
xfs_quiesce_attr(mp);
- return -xfs_fs_log_dummy(mp);
+ return xfs_fs_log_dummy(mp);
}
STATIC int
@@ -1314,7 +1320,7 @@ xfs_fs_show_options(
struct seq_file *m,
struct dentry *root)
{
- return -xfs_showargs(XFS_M(root->d_sb), m);
+ return xfs_showargs(XFS_M(root->d_sb), m);
}
/*
@@ -1336,14 +1342,14 @@ xfs_finish_flags(
mp->m_logbsize < mp->m_sb.sb_logsunit) {
xfs_warn(mp,
"logbuf size must be greater than or equal to log stripe size");
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
} else {
/* Fail a mount if the logbuf is larger than 32K */
if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) {
xfs_warn(mp,
"logbuf size for version 1 logs must be 16K or 32K");
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
}
@@ -1355,7 +1361,7 @@ xfs_finish_flags(
xfs_warn(mp,
"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.",
MNTOPT_NOATTR2, MNTOPT_ATTR2);
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
/*
@@ -1372,7 +1378,7 @@ xfs_finish_flags(
if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) {
xfs_warn(mp,
"cannot mount a read-only filesystem as read-write");
- return XFS_ERROR(EROFS);
+ return -EROFS;
}
if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
@@ -1380,7 +1386,7 @@ xfs_finish_flags(
!xfs_sb_version_has_pquotino(&mp->m_sb)) {
xfs_warn(mp,
"Super block does not support project and group quota together");
- return XFS_ERROR(EINVAL);
+ return -EINVAL;
}
return 0;
@@ -1394,7 +1400,7 @@ xfs_fs_fill_super(
{
struct inode *root;
struct xfs_mount *mp = NULL;
- int flags = 0, error = ENOMEM;
+ int flags = 0, error = -ENOMEM;
mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
if (!mp)
@@ -1405,6 +1411,7 @@ xfs_fs_fill_super(
atomic_set(&mp->m_active_trans, 0);
INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
+ mp->m_kobj.kobject.kset = xfs_kset;
mp->m_super = sb;
sb->s_fs_info = mp;
@@ -1428,11 +1435,11 @@ xfs_fs_fill_super(
if (error)
goto out_free_fsname;
- error = -xfs_init_mount_workqueues(mp);
+ error = xfs_init_mount_workqueues(mp);
if (error)
goto out_close_devices;
- error = -xfs_icsb_init_counters(mp);
+ error = xfs_icsb_init_counters(mp);
if (error)
goto out_destroy_workqueues;
@@ -1474,12 +1481,12 @@ xfs_fs_fill_super(
root = igrab(VFS_I(mp->m_rootip));
if (!root) {
- error = ENOENT;
+ error = -ENOENT;
goto out_unmount;
}
sb->s_root = d_make_root(root);
if (!sb->s_root) {
- error = ENOMEM;
+ error = -ENOMEM;
goto out_unmount;
}
@@ -1499,7 +1506,7 @@ out_destroy_workqueues:
xfs_free_fsname(mp);
kfree(mp);
out:
- return -error;
+ return error;
out_unmount:
xfs_filestream_unmount(mp);
@@ -1714,7 +1721,8 @@ xfs_init_workqueues(void)
* AGs in all the filesystems mounted. Hence use the default large
* max_active value for this workqueue.
*/
- xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0);
+ xfs_alloc_wq = alloc_workqueue("xfsalloc",
+ WQ_MEM_RECLAIM|WQ_FREEZABLE, 0);
if (!xfs_alloc_wq)
return -ENOMEM;
@@ -1761,9 +1769,22 @@ init_xfs_fs(void)
if (error)
goto out_cleanup_procfs;
+ xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj);
+ if (!xfs_kset) {
+ error = -ENOMEM;
+ goto out_sysctl_unregister;;
+ }
+
+#ifdef DEBUG
+ xfs_dbg_kobj.kobject.kset = xfs_kset;
+ error = xfs_sysfs_init(&xfs_dbg_kobj, &xfs_dbg_ktype, NULL, "debug");
+ if (error)
+ goto out_kset_unregister;
+#endif
+
error = xfs_qm_init();
if (error)
- goto out_sysctl_unregister;
+ goto out_remove_kobj;
error = register_filesystem(&xfs_fs_type);
if (error)
@@ -1772,6 +1793,12 @@ init_xfs_fs(void)
out_qm_exit:
xfs_qm_exit();
+ out_remove_kobj:
+#ifdef DEBUG
+ xfs_sysfs_del(&xfs_dbg_kobj);
+ out_kset_unregister:
+#endif
+ kset_unregister(xfs_kset);
out_sysctl_unregister:
xfs_sysctl_unregister();
out_cleanup_procfs:
@@ -1793,6 +1820,10 @@ exit_xfs_fs(void)
{
xfs_qm_exit();
unregister_filesystem(&xfs_fs_type);
+#ifdef DEBUG
+ xfs_sysfs_del(&xfs_dbg_kobj);
+#endif
+ kset_unregister(xfs_kset);
xfs_sysctl_unregister();
xfs_cleanup_procfs();
xfs_buf_terminate();
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index bbe3d15a7904..2b830c2f322e 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -44,16 +44,6 @@ extern void xfs_qm_exit(void);
# define XFS_REALTIME_STRING
#endif
-#if XFS_BIG_BLKNOS
-# if XFS_BIG_INUMS
-# define XFS_BIGFS_STRING "large block/inode numbers, "
-# else
-# define XFS_BIGFS_STRING "large block numbers, "
-# endif
-#else
-# define XFS_BIGFS_STRING
-#endif
-
#ifdef DEBUG
# define XFS_DBG_STRING "debug"
#else
@@ -64,7 +54,6 @@ extern void xfs_qm_exit(void);
#define XFS_BUILD_OPTIONS XFS_ACL_STRING \
XFS_SECURITY_STRING \
XFS_REALTIME_STRING \
- XFS_BIGFS_STRING \
XFS_DBG_STRING /* DBG must be last */
struct xfs_inode;
@@ -76,8 +65,8 @@ extern __uint64_t xfs_max_file_offset(unsigned int);
extern void xfs_flush_inodes(struct xfs_mount *mp);
extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
-extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *);
-extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *);
+extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *, xfs_agnumber_t agcount);
+extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *, xfs_agnumber_t agcount);
extern const struct export_operations xfs_export_operations;
extern const struct xattr_handler *xfs_xattr_handlers[];
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index d69363c833e1..02ae62a998e0 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -76,15 +76,15 @@ xfs_readlink_bmap(
bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0,
&xfs_symlink_buf_ops);
if (!bp)
- return XFS_ERROR(ENOMEM);
+ return -ENOMEM;
error = bp->b_error;
if (error) {
xfs_buf_ioerror_alert(bp, __func__);
xfs_buf_relse(bp);
/* bad CRC means corrupted metadata */
- if (error == EFSBADCRC)
- error = EFSCORRUPTED;
+ if (error == -EFSBADCRC)
+ error = -EFSCORRUPTED;
goto out;
}
byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
@@ -95,7 +95,7 @@ xfs_readlink_bmap(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
if (!xfs_symlink_hdr_ok(ip->i_ino, offset,
byte_cnt, bp)) {
- error = EFSCORRUPTED;
+ error = -EFSCORRUPTED;
xfs_alert(mp,
"symlink header does not match required off/len/owner (0x%x/Ox%x,0x%llx)",
offset, byte_cnt, ip->i_ino);
@@ -135,7 +135,7 @@ xfs_readlink(
trace_xfs_readlink(ip);
if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
+ return -EIO;
xfs_ilock(ip, XFS_ILOCK_SHARED);
@@ -148,7 +148,7 @@ xfs_readlink(
__func__, (unsigned long long) ip->i_ino,
(long long) pathlen);
ASSERT(0);
- error = XFS_ERROR(EFSCORRUPTED);
+ error = -EFSCORRUPTED;
goto out;
}
@@ -203,14 +203,14 @@ xfs_symlink(
trace_xfs_symlink(dp, link_name);
if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
+ return -EIO;
/*
* Check component lengths of the target path name.
*/
pathlen = strlen(target_path);
if (pathlen >= MAXPATHLEN) /* total string too long */
- return XFS_ERROR(ENAMETOOLONG);
+ return -ENAMETOOLONG;
udqp = gdqp = NULL;
prid = xfs_get_initial_prid(dp);
@@ -238,7 +238,7 @@ xfs_symlink(
fs_blocks = xfs_symlink_blocks(mp, pathlen);
resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0);
- if (error == ENOSPC && fs_blocks == 0) {
+ if (error == -ENOSPC && fs_blocks == 0) {
resblks = 0;
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0);
}
@@ -254,7 +254,7 @@ xfs_symlink(
* Check whether the directory allows new symlinks or not.
*/
if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
- error = XFS_ERROR(EPERM);
+ error = -EPERM;
goto error_return;
}
@@ -269,9 +269,11 @@ xfs_symlink(
/*
* Check for ability to enter directory entry, if no space reserved.
*/
- error = xfs_dir_canenter(tp, dp, link_name, resblks);
- if (error)
- goto error_return;
+ if (!resblks) {
+ error = xfs_dir_canenter(tp, dp, link_name);
+ if (error)
+ goto error_return;
+ }
/*
* Initialize the bmap freelist prior to calling either
* bmapi or the directory create code.
@@ -284,7 +286,7 @@ xfs_symlink(
error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
prid, resblks > 0, &ip, NULL);
if (error) {
- if (error == ENOSPC)
+ if (error == -ENOSPC)
goto error_return;
goto error1;
}
@@ -348,7 +350,7 @@ xfs_symlink(
bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
BTOBB(byte_cnt), 0);
if (!bp) {
- error = ENOMEM;
+ error = -ENOMEM;
goto error2;
}
bp->b_ops = &xfs_symlink_buf_ops;
@@ -489,7 +491,7 @@ xfs_inactive_symlink_rmt(
XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
if (!bp) {
- error = ENOMEM;
+ error = -ENOMEM;
goto error_bmap_cancel;
}
xfs_trans_binval(tp, bp);
@@ -562,7 +564,7 @@ xfs_inactive_symlink(
trace_xfs_inactive_symlink(ip);
if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
+ return -EIO;
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -580,7 +582,7 @@ xfs_inactive_symlink(
__func__, (unsigned long long)ip->i_ino, pathlen);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
ASSERT(0);
- return XFS_ERROR(EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
if (ip->i_df.if_flags & XFS_IFINLINE) {
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index bd8e157c20ef..ffef45375754 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -92,6 +92,11 @@ enum {
extern xfs_param_t xfs_params;
+struct xfs_globals {
+ int log_recovery_delay; /* log recovery delay (secs) */
+};
+extern struct xfs_globals xfs_globals;
+
#ifdef CONFIG_SYSCTL
extern int xfs_sysctl_register(void);
extern void xfs_sysctl_unregister(void);
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
new file mode 100644
index 000000000000..aa03670851d8
--- /dev/null
+++ b/fs/xfs/xfs_sysfs.c
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2014 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "xfs.h"
+#include "xfs_sysfs.h"
+#include "xfs_log_format.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+
+struct xfs_sysfs_attr {
+ struct attribute attr;
+ ssize_t (*show)(char *buf, void *data);
+ ssize_t (*store)(const char *buf, size_t count, void *data);
+};
+
+static inline struct xfs_sysfs_attr *
+to_attr(struct attribute *attr)
+{
+ return container_of(attr, struct xfs_sysfs_attr, attr);
+}
+
+#define XFS_SYSFS_ATTR_RW(name) \
+ static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RW(name)
+#define XFS_SYSFS_ATTR_RO(name) \
+ static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RO(name)
+
+#define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr
+
+/*
+ * xfs_mount kobject. This currently has no attributes and thus no need for show
+ * and store helpers. The mp kobject serves as the per-mount parent object that
+ * is identified by the fsname under sysfs.
+ */
+
+struct kobj_type xfs_mp_ktype = {
+ .release = xfs_sysfs_release,
+};
+
+#ifdef DEBUG
+/* debug */
+
+STATIC ssize_t
+log_recovery_delay_store(
+ const char *buf,
+ size_t count,
+ void *data)
+{
+ int ret;
+ int val;
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val < 0 || val > 60)
+ return -EINVAL;
+
+ xfs_globals.log_recovery_delay = val;
+
+ return count;
+}
+
+STATIC ssize_t
+log_recovery_delay_show(
+ char *buf,
+ void *data)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.log_recovery_delay);
+}
+XFS_SYSFS_ATTR_RW(log_recovery_delay);
+
+static struct attribute *xfs_dbg_attrs[] = {
+ ATTR_LIST(log_recovery_delay),
+ NULL,
+};
+
+STATIC ssize_t
+xfs_dbg_show(
+ struct kobject *kobject,
+ struct attribute *attr,
+ char *buf)
+{
+ struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+ return xfs_attr->show ? xfs_attr->show(buf, NULL) : 0;
+}
+
+STATIC ssize_t
+xfs_dbg_store(
+ struct kobject *kobject,
+ struct attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+ return xfs_attr->store ? xfs_attr->store(buf, count, NULL) : 0;
+}
+
+static struct sysfs_ops xfs_dbg_ops = {
+ .show = xfs_dbg_show,
+ .store = xfs_dbg_store,
+};
+
+struct kobj_type xfs_dbg_ktype = {
+ .release = xfs_sysfs_release,
+ .sysfs_ops = &xfs_dbg_ops,
+ .default_attrs = xfs_dbg_attrs,
+};
+
+#endif /* DEBUG */
+
+/* xlog */
+
+STATIC ssize_t
+log_head_lsn_show(
+ char *buf,
+ void *data)
+{
+ struct xlog *log = data;
+ int cycle;
+ int block;
+
+ spin_lock(&log->l_icloglock);
+ cycle = log->l_curr_cycle;
+ block = log->l_curr_block;
+ spin_unlock(&log->l_icloglock);
+
+ return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
+}
+XFS_SYSFS_ATTR_RO(log_head_lsn);
+
+STATIC ssize_t
+log_tail_lsn_show(
+ char *buf,
+ void *data)
+{
+ struct xlog *log = data;
+ int cycle;
+ int block;
+
+ xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block);
+ return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
+}
+XFS_SYSFS_ATTR_RO(log_tail_lsn);
+
+STATIC ssize_t
+reserve_grant_head_show(
+ char *buf,
+ void *data)
+{
+ struct xlog *log = data;
+ int cycle;
+ int bytes;
+
+ xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes);
+ return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
+}
+XFS_SYSFS_ATTR_RO(reserve_grant_head);
+
+STATIC ssize_t
+write_grant_head_show(
+ char *buf,
+ void *data)
+{
+ struct xlog *log = data;
+ int cycle;
+ int bytes;
+
+ xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes);
+ return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
+}
+XFS_SYSFS_ATTR_RO(write_grant_head);
+
+static struct attribute *xfs_log_attrs[] = {
+ ATTR_LIST(log_head_lsn),
+ ATTR_LIST(log_tail_lsn),
+ ATTR_LIST(reserve_grant_head),
+ ATTR_LIST(write_grant_head),
+ NULL,
+};
+
+static inline struct xlog *
+to_xlog(struct kobject *kobject)
+{
+ struct xfs_kobj *kobj = to_kobj(kobject);
+ return container_of(kobj, struct xlog, l_kobj);
+}
+
+STATIC ssize_t
+xfs_log_show(
+ struct kobject *kobject,
+ struct attribute *attr,
+ char *buf)
+{
+ struct xlog *log = to_xlog(kobject);
+ struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+ return xfs_attr->show ? xfs_attr->show(buf, log) : 0;
+}
+
+STATIC ssize_t
+xfs_log_store(
+ struct kobject *kobject,
+ struct attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct xlog *log = to_xlog(kobject);
+ struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+ return xfs_attr->store ? xfs_attr->store(buf, count, log) : 0;
+}
+
+static struct sysfs_ops xfs_log_ops = {
+ .show = xfs_log_show,
+ .store = xfs_log_store,
+};
+
+struct kobj_type xfs_log_ktype = {
+ .release = xfs_sysfs_release,
+ .sysfs_ops = &xfs_log_ops,
+ .default_attrs = xfs_log_attrs,
+};
diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h
new file mode 100644
index 000000000000..240eee35f342
--- /dev/null
+++ b/fs/xfs/xfs_sysfs.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2014 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef __XFS_SYSFS_H__
+#define __XFS_SYSFS_H__
+
+extern struct kobj_type xfs_mp_ktype; /* xfs_mount */
+extern struct kobj_type xfs_dbg_ktype; /* debug */
+extern struct kobj_type xfs_log_ktype; /* xlog */
+
+static inline struct xfs_kobj *
+to_kobj(struct kobject *kobject)
+{
+ return container_of(kobject, struct xfs_kobj, kobject);
+}
+
+static inline void
+xfs_sysfs_release(struct kobject *kobject)
+{
+ struct xfs_kobj *kobj = to_kobj(kobject);
+ complete(&kobj->complete);
+}
+
+static inline int
+xfs_sysfs_init(
+ struct xfs_kobj *kobj,
+ struct kobj_type *ktype,
+ struct xfs_kobj *parent_kobj,
+ const char *name)
+{
+ init_completion(&kobj->complete);
+ return kobject_init_and_add(&kobj->kobject, ktype,
+ &parent_kobj->kobject, "%s", name);
+}
+
+static inline void
+xfs_sysfs_del(
+ struct xfs_kobj *kobj)
+{
+ kobject_del(&kobj->kobject);
+ kobject_put(&kobj->kobject);
+ wait_for_completion(&kobj->complete);
+}
+
+#endif /* __XFS_SYSFS_H__ */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 152f82782630..51372e34d988 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -349,7 +349,8 @@ DEFINE_BUF_EVENT(xfs_buf_free);
DEFINE_BUF_EVENT(xfs_buf_hold);
DEFINE_BUF_EVENT(xfs_buf_rele);
DEFINE_BUF_EVENT(xfs_buf_iodone);
-DEFINE_BUF_EVENT(xfs_buf_iorequest);
+DEFINE_BUF_EVENT(xfs_buf_submit);
+DEFINE_BUF_EVENT(xfs_buf_submit_wait);
DEFINE_BUF_EVENT(xfs_buf_bawrite);
DEFINE_BUF_EVENT(xfs_buf_lock);
DEFINE_BUF_EVENT(xfs_buf_lock_done);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index d03932564ccb..30e8e3410955 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -190,7 +190,7 @@ xfs_trans_reserve(
-((int64_t)blocks), rsvd);
if (error != 0) {
current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
- return (XFS_ERROR(ENOSPC));
+ return -ENOSPC;
}
tp->t_blk_res += blocks;
}
@@ -241,7 +241,7 @@ xfs_trans_reserve(
error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS,
-((int64_t)rtextents), rsvd);
if (error) {
- error = XFS_ERROR(ENOSPC);
+ error = -ENOSPC;
goto undo_log;
}
tp->t_rtx_res += rtextents;
@@ -874,7 +874,7 @@ xfs_trans_commit(
goto out_unreserve;
if (XFS_FORCED_SHUTDOWN(mp)) {
- error = XFS_ERROR(EIO);
+ error = -EIO;
goto out_unreserve;
}
@@ -917,7 +917,7 @@ out_unreserve:
if (tp->t_ticket) {
commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
if (commit_lsn == -1 && !error)
- error = XFS_ERROR(EIO);
+ error = -EIO;
}
current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
@@ -1024,7 +1024,7 @@ xfs_trans_roll(
*/
error = xfs_trans_commit(trans, 0);
if (error)
- return (error);
+ return error;
trans = *tpp;
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index cb0f3a84cc68..859482f53b5a 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -762,7 +762,7 @@ xfs_trans_ail_init(
ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL);
if (!ailp)
- return ENOMEM;
+ return -ENOMEM;
ailp->xa_mount = mp;
INIT_LIST_HEAD(&ailp->xa_ail);
@@ -781,7 +781,7 @@ xfs_trans_ail_init(
out_free_ailp:
kmem_free(ailp);
- return ENOMEM;
+ return -ENOMEM;
}
void
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index b8eef0549f3f..e2b2216b1635 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -166,7 +166,7 @@ xfs_trans_get_buf_map(
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bip->bli_recur++;
trace_xfs_trans_get_buf_recur(bip);
- return (bp);
+ return bp;
}
bp = xfs_buf_get_map(target, map, nmaps, flags);
@@ -178,7 +178,7 @@ xfs_trans_get_buf_map(
_xfs_trans_bjoin(tp, bp, 1);
trace_xfs_trans_get_buf(bp->b_fspriv);
- return (bp);
+ return bp;
}
/*
@@ -201,9 +201,8 @@ xfs_trans_getsb(xfs_trans_t *tp,
* Default to just trying to lock the superblock buffer
* if tp is NULL.
*/
- if (tp == NULL) {
- return (xfs_getsb(mp, flags));
- }
+ if (tp == NULL)
+ return xfs_getsb(mp, flags);
/*
* If the superblock buffer already has this transaction
@@ -218,7 +217,7 @@ xfs_trans_getsb(xfs_trans_t *tp,
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bip->bli_recur++;
trace_xfs_trans_getsb_recur(bip);
- return (bp);
+ return bp;
}
bp = xfs_getsb(mp, flags);
@@ -227,7 +226,7 @@ xfs_trans_getsb(xfs_trans_t *tp,
_xfs_trans_bjoin(tp, bp, 1);
trace_xfs_trans_getsb(bp->b_fspriv);
- return (bp);
+ return bp;
}
#ifdef DEBUG
@@ -267,7 +266,7 @@ xfs_trans_read_buf_map(
bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
if (!bp)
return (flags & XBF_TRYLOCK) ?
- EAGAIN : XFS_ERROR(ENOMEM);
+ -EAGAIN : -ENOMEM;
if (bp->b_error) {
error = bp->b_error;
@@ -277,8 +276,8 @@ xfs_trans_read_buf_map(
xfs_buf_relse(bp);
/* bad CRC means corrupted metadata */
- if (error == EFSBADCRC)
- error = EFSCORRUPTED;
+ if (error == -EFSBADCRC)
+ error = -EFSCORRUPTED;
return error;
}
#ifdef DEBUG
@@ -287,7 +286,7 @@ xfs_trans_read_buf_map(
if (((xfs_req_num++) % xfs_error_mod) == 0) {
xfs_buf_relse(bp);
xfs_debug(mp, "Returning error!");
- return XFS_ERROR(EIO);
+ return -EIO;
}
}
}
@@ -319,20 +318,10 @@ xfs_trans_read_buf_map(
XFS_BUF_READ(bp);
bp->b_ops = ops;
- /*
- * XXX(hch): clean up the error handling here to be less
- * of a mess..
- */
- if (XFS_FORCED_SHUTDOWN(mp)) {
- trace_xfs_bdstrat_shut(bp, _RET_IP_);
- xfs_bioerror_relse(bp);
- } else {
- xfs_buf_iorequest(bp);
- }
-
- error = xfs_buf_iowait(bp);
+ error = xfs_buf_submit_wait(bp);
if (error) {
- xfs_buf_ioerror_alert(bp, __func__);
+ if (!XFS_FORCED_SHUTDOWN(mp))
+ xfs_buf_ioerror_alert(bp, __func__);
xfs_buf_relse(bp);
/*
* We can gracefully recover from most read
@@ -343,8 +332,8 @@ xfs_trans_read_buf_map(
xfs_force_shutdown(tp->t_mountp,
SHUTDOWN_META_IO_ERROR);
/* bad CRC means corrupted metadata */
- if (error == EFSBADCRC)
- error = EFSCORRUPTED;
+ if (error == -EFSBADCRC)
+ error = -EFSCORRUPTED;
return error;
}
}
@@ -355,7 +344,7 @@ xfs_trans_read_buf_map(
if (XFS_FORCED_SHUTDOWN(mp)) {
trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
*bpp = NULL;
- return XFS_ERROR(EIO);
+ return -EIO;
}
@@ -372,7 +361,7 @@ xfs_trans_read_buf_map(
if (bp == NULL) {
*bpp = NULL;
return (flags & XBF_TRYLOCK) ?
- 0 : XFS_ERROR(ENOMEM);
+ 0 : -ENOMEM;
}
if (bp->b_error) {
error = bp->b_error;
@@ -384,8 +373,8 @@ xfs_trans_read_buf_map(
xfs_buf_relse(bp);
/* bad CRC means corrupted metadata */
- if (error == EFSBADCRC)
- error = EFSCORRUPTED;
+ if (error == -EFSBADCRC)
+ error = -EFSCORRUPTED;
return error;
}
#ifdef DEBUG
@@ -396,7 +385,7 @@ xfs_trans_read_buf_map(
SHUTDOWN_META_IO_ERROR);
xfs_buf_relse(bp);
xfs_debug(mp, "Returning trans error!");
- return XFS_ERROR(EIO);
+ return -EIO;
}
}
}
@@ -414,7 +403,7 @@ shutdown_abort:
trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
xfs_buf_relse(bp);
*bpp = NULL;
- return XFS_ERROR(EIO);
+ return -EIO;
}
/*
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 41172861e857..846e061c2e98 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -722,8 +722,8 @@ xfs_trans_dqresv(
error_return:
xfs_dqunlock(dqp);
if (flags & XFS_QMOPT_ENOSPC)
- return ENOSPC;
- return EDQUOT;
+ return -ENOSPC;
+ return -EDQUOT;
}
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 50c3f5614288..cdb4d86520e1 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -70,7 +70,7 @@ xfs_trans_ichgtime(
int flags)
{
struct inode *inode = VFS_I(ip);
- timespec_t tv;
+ struct timespec tv;
ASSERT(tp);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 65c6e6650b1a..b79dc66b2ecd 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -38,43 +38,18 @@ typedef __int32_t xfs_tid_t; /* transaction identifier */
typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */
typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */
-/*
- * These types are 64 bits on disk but are either 32 or 64 bits in memory.
- * Disk based types:
- */
-typedef __uint64_t xfs_dfsbno_t; /* blockno in filesystem (agno|agbno) */
-typedef __uint64_t xfs_drfsbno_t; /* blockno in filesystem (raw) */
-typedef __uint64_t xfs_drtbno_t; /* extent (block) in realtime area */
-typedef __uint64_t xfs_dfiloff_t; /* block number in a file */
-typedef __uint64_t xfs_dfilblks_t; /* number of blocks in a file */
-
-/*
- * Memory based types are conditional.
- */
-#if XFS_BIG_BLKNOS
typedef __uint64_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */
typedef __uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */
typedef __uint64_t xfs_rtblock_t; /* extent (block) in realtime area */
-typedef __int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
-#else
-typedef __uint32_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */
-typedef __uint32_t xfs_rfsblock_t; /* blockno in filesystem (raw) */
-typedef __uint32_t xfs_rtblock_t; /* extent (block) in realtime area */
-typedef __int32_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
-#endif
typedef __uint64_t xfs_fileoff_t; /* block number in a file */
-typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */
typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */
+typedef __int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
+typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */
/*
* Null values for the types.
*/
-#define NULLDFSBNO ((xfs_dfsbno_t)-1)
-#define NULLDRFSBNO ((xfs_drfsbno_t)-1)
-#define NULLDRTBNO ((xfs_drtbno_t)-1)
-#define NULLDFILOFF ((xfs_dfiloff_t)-1)
-
#define NULLFSBLOCK ((xfs_fsblock_t)-1)
#define NULLRFSBLOCK ((xfs_rfsblock_t)-1)
#define NULLRTBLOCK ((xfs_rtblock_t)-1)
diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h
deleted file mode 100644
index e8a77383c0d5..000000000000
--- a/fs/xfs/xfs_vnode.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_VNODE_H__
-#define __XFS_VNODE_H__
-
-#include "xfs_fs.h"
-
-struct file;
-struct xfs_inode;
-struct attrlist_cursor_kern;
-
-/*
- * Flags for read/write calls - same values as IRIX
- */
-#define IO_ISDIRECT 0x00004 /* bypass page cache */
-#define IO_INVIS 0x00020 /* don't update inode timestamps */
-
-#define XFS_IO_FLAGS \
- { IO_ISDIRECT, "DIRECT" }, \
- { IO_INVIS, "INVIS"}
-
-/*
- * Some useful predicates.
- */
-#define VN_MAPPED(vp) mapping_mapped(vp->i_mapping)
-#define VN_CACHED(vp) (vp->i_mapping->nrpages)
-#define VN_DIRTY(vp) mapping_tagged(vp->i_mapping, \
- PAGECACHE_TAG_DIRTY)
-
-
-#endif /* __XFS_VNODE_H__ */
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 78ed92a46fdd..93455b998041 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -49,7 +49,7 @@ xfs_xattr_get(struct dentry *dentry, const char *name,
value = NULL;
}
- error = -xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
+ error = xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
if (error)
return error;
return asize;
@@ -71,8 +71,8 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
xflags |= ATTR_REPLACE;
if (!value)
- return -xfs_attr_remove(ip, (unsigned char *)name, xflags);
- return -xfs_attr_set(ip, (unsigned char *)name,
+ return xfs_attr_remove(ip, (unsigned char *)name, xflags);
+ return xfs_attr_set(ip, (unsigned char *)name,
(void *)value, size, xflags);
}