Diffstat (limited to 'fs')
-rw-r--r--fs/9p/acl.c5
-rw-r--r--fs/9p/acl.h2
-rw-r--r--fs/9p/vfs_file.c20
-rw-r--r--fs/Kconfig18
-rw-r--r--fs/Makefile2
-rw-r--r--fs/afs/flock.c4
-rw-r--r--fs/aio.c2
-rw-r--r--fs/bad_inode.c2
-rw-r--r--fs/befs/debug.c2
-rw-r--r--fs/block_dev.c263
-rw-r--r--fs/btrfs/Makefile1
-rw-r--r--fs/btrfs/acl.c16
-rw-r--r--fs/btrfs/backref.c6
-rw-r--r--fs/btrfs/backref.h2
-rw-r--r--fs/btrfs/block-group.c114
-rw-r--r--fs/btrfs/btrfs_inode.h27
-rw-r--r--fs/btrfs/check-integrity.c60
-rw-r--r--fs/btrfs/compression.c169
-rw-r--r--fs/btrfs/compression.h5
-rw-r--r--fs/btrfs/ctree.c62
-rw-r--r--fs/btrfs/ctree.h96
-rw-r--r--fs/btrfs/delayed-inode.c227
-rw-r--r--fs/btrfs/dir-item.c76
-rw-r--r--fs/btrfs/disk-io.c13
-rw-r--r--fs/btrfs/extent-tree.c12
-rw-r--r--fs/btrfs/extent_io.c318
-rw-r--r--fs/btrfs/extent_io.h2
-rw-r--r--fs/btrfs/file-item.c5
-rw-r--r--fs/btrfs/file.c23
-rw-r--r--fs/btrfs/free-space-cache.c26
-rw-r--r--fs/btrfs/inode.c302
-rw-r--r--fs/btrfs/ioctl.c188
-rw-r--r--fs/btrfs/lzo.c236
-rw-r--r--fs/btrfs/ordered-data.c5
-rw-r--r--fs/btrfs/ordered-data.h2
-rw-r--r--fs/btrfs/qgroup.c8
-rw-r--r--fs/btrfs/raid56.c18
-rw-r--r--fs/btrfs/ref-verify.c10
-rw-r--r--fs/btrfs/relocation.c306
-rw-r--r--fs/btrfs/send.c35
-rw-r--r--fs/btrfs/space-info.c98
-rw-r--r--fs/btrfs/struct-funcs.c8
-rw-r--r--fs/btrfs/subpage.c24
-rw-r--r--fs/btrfs/subpage.h3
-rw-r--r--fs/btrfs/super.c56
-rw-r--r--fs/btrfs/sysfs.c108
-rw-r--r--fs/btrfs/tests/qgroup-tests.c30
-rw-r--r--fs/btrfs/tree-checker.c38
-rw-r--r--fs/btrfs/tree-log.c102
-rw-r--r--fs/btrfs/verity.c811
-rw-r--r--fs/btrfs/volumes.c234
-rw-r--r--fs/btrfs/volumes.h29
-rw-r--r--fs/btrfs/zlib.c54
-rw-r--r--fs/btrfs/zoned.c22
-rw-r--r--fs/btrfs/zoned.h1
-rw-r--r--fs/btrfs/zstd.c39
-rw-r--r--fs/buffer.c31
-rw-r--r--fs/cachefiles/Kconfig19
-rw-r--r--fs/cachefiles/Makefile2
-rw-r--r--fs/cachefiles/bind.c2
-rw-r--r--fs/cachefiles/interface.c6
-rw-r--r--fs/cachefiles/internal.h25
-rw-r--r--fs/cachefiles/io.c6
-rw-r--r--fs/cachefiles/key.c2
-rw-r--r--fs/cachefiles/main.c7
-rw-r--r--fs/cachefiles/namei.c61
-rw-r--r--fs/cachefiles/proc.c114
-rw-r--r--fs/cachefiles/xattr.c4
-rw-r--r--fs/ceph/acl.c5
-rw-r--r--fs/ceph/addr.c9
-rw-r--r--fs/ceph/caps.c21
-rw-r--r--fs/ceph/file.c2
-rw-r--r--fs/ceph/locks.c3
-rw-r--r--fs/ceph/mds_client.c7
-rw-r--r--fs/ceph/mdsmap.c8
-rw-r--r--fs/ceph/snap.c3
-rw-r--r--fs/ceph/super.h5
-rw-r--r--fs/cifs/Kconfig30
-rw-r--r--fs/cifs/cifs_debug.c11
-rw-r--r--fs/cifs/cifs_swn.c2
-rw-r--r--fs/cifs/cifs_unicode.c9
-rw-r--r--fs/cifs/cifsencrypt.c89
-rw-r--r--fs/cifs/cifsfs.c8
-rw-r--r--fs/cifs/cifsglob.h32
-rw-r--r--fs/cifs/cifspdu.h28
-rw-r--r--fs/cifs/cifsproto.h10
-rw-r--r--fs/cifs/cifssmb.c107
-rw-r--r--fs/cifs/connect.c32
-rw-r--r--fs/cifs/file.c15
-rw-r--r--fs/cifs/fs_context.c25
-rw-r--r--fs/cifs/fs_context.h3
-rw-r--r--fs/cifs/fscache.c41
-rw-r--r--fs/cifs/fscache.h23
-rw-r--r--fs/cifs/inode.c6
-rw-r--r--fs/cifs/readdir.c23
-rw-r--r--fs/cifs/sess.c257
-rw-r--r--fs/cifs/smb2maperror.c1
-rw-r--r--fs/cifs/smb2ops.c2
-rw-r--r--fs/cifs/smbencrypt.c139
-rw-r--r--fs/cifs_common/Makefile7
-rw-r--r--fs/cifs_common/arc4.h23
-rw-r--r--fs/cifs_common/cifs_arc4.c87
-rw-r--r--fs/cifs_common/cifs_md4.c197
-rw-r--r--fs/cifs_common/md4.h27
-rw-r--r--fs/configfs/dir.c87
-rw-r--r--fs/crypto/fname.c106
-rw-r--r--fs/crypto/hooks.c44
-rw-r--r--fs/d_path.c93
-rw-r--r--fs/dax.c578
-rw-r--r--fs/debugfs/file.c8
-rw-r--r--fs/dlm/dir.c4
-rw-r--r--fs/dlm/dlm_internal.h2
-rw-r--r--fs/dlm/lockspace.c3
-rw-r--r--fs/dlm/lowcomms.c770
-rw-r--r--fs/dlm/lowcomms.h1
-rw-r--r--fs/dlm/member.c4
-rw-r--r--fs/dlm/midcomms.c56
-rw-r--r--fs/dlm/rcom.c29
-rw-r--r--fs/dlm/recoverd.c4
-rw-r--r--fs/drop_caches.c3
-rw-r--r--fs/erofs/Kconfig1
-rw-r--r--fs/erofs/data.c415
-rw-r--r--fs/erofs/erofs_fs.h47
-rw-r--r--fs/erofs/inode.c29
-rw-r--r--fs/erofs/internal.h22
-rw-r--r--fs/erofs/namei.c1
-rw-r--r--fs/erofs/super.c61
-rw-r--r--fs/erofs/xattr.c5
-rw-r--r--fs/erofs/xattr.h2
-rw-r--r--fs/erofs/zdata.c6
-rw-r--r--fs/erofs/zmap.c133
-rw-r--r--fs/eventfd.c12
-rw-r--r--fs/exec.c16
-rw-r--r--fs/ext2/Kconfig1
-rw-r--r--fs/ext2/acl.c5
-rw-r--r--fs/ext2/acl.h2
-rw-r--r--fs/ext2/ext2.h11
-rw-r--r--fs/ext2/file.c7
-rw-r--r--fs/ext2/inode.c27
-rw-r--r--fs/ext2/super.c3
-rw-r--r--fs/ext4/Makefile2
-rw-r--r--fs/ext4/acl.c5
-rw-r--r--fs/ext4/acl.h2
-rw-r--r--fs/ext4/balloc.c8
-rw-r--r--fs/ext4/ext4.h118
-rw-r--r--fs/ext4/ext4_extents.h5
-rw-r--r--fs/ext4/ext4_jbd2.c43
-rw-r--r--fs/ext4/ext4_jbd2.h18
-rw-r--r--fs/ext4/extents.c42
-rw-r--r--fs/ext4/fast_commit.c27
-rw-r--r--fs/ext4/file.c16
-rw-r--r--fs/ext4/ialloc.c19
-rw-r--r--fs/ext4/indirect.c15
-rw-r--r--fs/ext4/inline.c32
-rw-r--r--fs/ext4/inode.c348
-rw-r--r--fs/ext4/ioctl.c8
-rw-r--r--fs/ext4/mballoc.c259
-rw-r--r--fs/ext4/namei.c214
-rw-r--r--fs/ext4/orphan.c652
-rw-r--r--fs/ext4/resize.c38
-rw-r--r--fs/ext4/super.c251
-rw-r--r--fs/ext4/symlink.c12
-rw-r--r--fs/ext4/truncate.h8
-rw-r--r--fs/ext4/xattr.c26
-rw-r--r--fs/f2fs/Kconfig19
-rw-r--r--fs/f2fs/Makefile1
-rw-r--r--fs/f2fs/acl.c5
-rw-r--r--fs/f2fs/acl.h2
-rw-r--r--fs/f2fs/checkpoint.c57
-rw-r--r--fs/f2fs/compress.c97
-rw-r--r--fs/f2fs/data.c206
-rw-r--r--fs/f2fs/debug.c39
-rw-r--r--fs/f2fs/dir.c18
-rw-r--r--fs/f2fs/extent_cache.c46
-rw-r--r--fs/f2fs/f2fs.h178
-rw-r--r--fs/f2fs/file.c124
-rw-r--r--fs/f2fs/gc.c14
-rw-r--r--fs/f2fs/iostat.c287
-rw-r--r--fs/f2fs/iostat.h84
-rw-r--r--fs/f2fs/namei.c12
-rw-r--r--fs/f2fs/node.c165
-rw-r--r--fs/f2fs/recovery.c3
-rw-r--r--fs/f2fs/segment.c137
-rw-r--r--fs/f2fs/segment.h4
-rw-r--r--fs/f2fs/super.c243
-rw-r--r--fs/f2fs/sysfs.c158
-rw-r--r--fs/f2fs/xattr.c3
-rw-r--r--fs/fat/.kunitconfig5
-rw-r--r--fs/fat/Kconfig14
-rw-r--r--fs/fat/Makefile2
-rw-r--r--fs/fat/fat_test.c196
-rw-r--r--fs/fat/fatent.c1
-rw-r--r--fs/fat/misc.c3
-rw-r--r--fs/fcntl.c21
-rw-r--r--fs/file.c64
-rw-r--r--fs/fs-writeback.c47
-rw-r--r--fs/fs_context.c4
-rw-r--r--fs/fscache/Kconfig25
-rw-r--r--fs/fscache/Makefile2
-rw-r--r--fs/fscache/cache.c11
-rw-r--r--fs/fscache/cookie.c201
-rw-r--r--fs/fscache/fsdef.c3
-rw-r--r--fs/fscache/histogram.c87
-rw-r--r--fs/fscache/internal.h57
-rw-r--r--fs/fscache/main.c39
-rw-r--r--fs/fscache/netfs.c2
-rw-r--r--fs/fscache/object-list.c414
-rw-r--r--fs/fscache/object.c8
-rw-r--r--fs/fscache/operation.c3
-rw-r--r--fs/fscache/page.c6
-rw-r--r--fs/fscache/proc.c20
-rw-r--r--fs/fuse/acl.c5
-rw-r--r--fs/fuse/dax.c50
-rw-r--r--fs/fuse/dir.c11
-rw-r--r--fs/fuse/file.c10
-rw-r--r--fs/fuse/fuse_i.h9
-rw-r--r--fs/fuse/inode.c1
-rw-r--r--fs/gfs2/acl.c5
-rw-r--r--fs/gfs2/acl.h2
-rw-r--r--fs/gfs2/aops.c9
-rw-r--r--fs/gfs2/bmap.c5
-rw-r--r--fs/gfs2/file.c3
-rw-r--r--fs/gfs2/glock.c13
-rw-r--r--fs/gfs2/glops.c27
-rw-r--r--fs/gfs2/incore.h2
-rw-r--r--fs/gfs2/lock_dlm.c5
-rw-r--r--fs/gfs2/log.c2
-rw-r--r--fs/gfs2/lops.c44
-rw-r--r--fs/gfs2/meta_io.c7
-rw-r--r--fs/gfs2/ops_fstype.c53
-rw-r--r--fs/gfs2/super.c107
-rw-r--r--fs/gfs2/super.h3
-rw-r--r--fs/gfs2/util.c3
-rw-r--r--fs/gfs2/util.h5
-rw-r--r--fs/hpfs/Kconfig1
-rw-r--r--fs/hpfs/file.c51
-rw-r--r--fs/inode.c4
-rw-r--r--fs/internal.h12
-rw-r--r--fs/io-wq.c208
-rw-r--r--fs/io-wq.h3
-rw-r--r--fs/io_uring.c1945
-rw-r--r--fs/ioctl.c211
-rw-r--r--fs/iomap/Makefile2
-rw-r--r--fs/iomap/apply.c99
-rw-r--r--fs/iomap/buffered-io.c512
-rw-r--r--fs/iomap/direct-io.c172
-rw-r--r--fs/iomap/fiemap.c101
-rw-r--r--fs/iomap/iter.c80
-rw-r--r--fs/iomap/seek.c98
-rw-r--r--fs/iomap/swapfile.c44
-rw-r--r--fs/iomap/trace.h61
-rw-r--r--fs/isofs/inode.c27
-rw-r--r--fs/isofs/isofs.h1
-rw-r--r--fs/isofs/joliet.c4
-rw-r--r--fs/jbd2/recovery.c29
-rw-r--r--fs/jbd2/transaction.c21
-rw-r--r--fs/jffs2/acl.c5
-rw-r--r--fs/jffs2/acl.h2
-rw-r--r--fs/jfs/acl.c5
-rw-r--r--fs/jfs/jfs_acl.h2
-rw-r--r--fs/kernfs/dir.c153
-rw-r--r--fs/kernfs/file.c4
-rw-r--r--fs/kernfs/inode.c26
-rw-r--r--fs/kernfs/kernfs-internal.h24
-rw-r--r--fs/kernfs/mount.c12
-rw-r--r--fs/kernfs/symlink.c4
-rw-r--r--fs/ksmbd/Kconfig68
-rw-r--r--fs/ksmbd/Makefile20
-rw-r--r--fs/ksmbd/asn1.c343
-rw-r--r--fs/ksmbd/asn1.h21
-rw-r--r--fs/ksmbd/auth.c1364
-rw-r--r--fs/ksmbd/auth.h67
-rw-r--r--fs/ksmbd/connection.c413
-rw-r--r--fs/ksmbd/connection.h213
-rw-r--r--fs/ksmbd/crypto_ctx.c282
-rw-r--r--fs/ksmbd/crypto_ctx.h74
-rw-r--r--fs/ksmbd/glob.h49
-rw-r--r--fs/ksmbd/ksmbd_netlink.h395
-rw-r--r--fs/ksmbd/ksmbd_spnego_negtokeninit.asn131
-rw-r--r--fs/ksmbd/ksmbd_spnego_negtokentarg.asn119
-rw-r--r--fs/ksmbd/ksmbd_work.c80
-rw-r--r--fs/ksmbd/ksmbd_work.h117
-rw-r--r--fs/ksmbd/mgmt/ksmbd_ida.c46
-rw-r--r--fs/ksmbd/mgmt/ksmbd_ida.h34
-rw-r--r--fs/ksmbd/mgmt/share_config.c238
-rw-r--r--fs/ksmbd/mgmt/share_config.h81
-rw-r--r--fs/ksmbd/mgmt/tree_connect.c121
-rw-r--r--fs/ksmbd/mgmt/tree_connect.h56
-rw-r--r--fs/ksmbd/mgmt/user_config.c69
-rw-r--r--fs/ksmbd/mgmt/user_config.h66
-rw-r--r--fs/ksmbd/mgmt/user_session.c369
-rw-r--r--fs/ksmbd/mgmt/user_session.h106
-rw-r--r--fs/ksmbd/misc.c338
-rw-r--r--fs/ksmbd/misc.h35
-rw-r--r--fs/ksmbd/ndr.c345
-rw-r--r--fs/ksmbd/ndr.h22
-rw-r--r--fs/ksmbd/nterr.h543
-rw-r--r--fs/ksmbd/ntlmssp.h169
-rw-r--r--fs/ksmbd/oplock.c1709
-rw-r--r--fs/ksmbd/oplock.h131
-rw-r--r--fs/ksmbd/server.c633
-rw-r--r--fs/ksmbd/server.h70
-rw-r--r--fs/ksmbd/smb2misc.c438
-rw-r--r--fs/ksmbd/smb2ops.c312
-rw-r--r--fs/ksmbd/smb2pdu.c8373
-rw-r--r--fs/ksmbd/smb2pdu.h1698
-rw-r--r--fs/ksmbd/smb_common.c674
-rw-r--r--fs/ksmbd/smb_common.h542
-rw-r--r--fs/ksmbd/smbacl.c1366
-rw-r--r--fs/ksmbd/smbacl.h212
-rw-r--r--fs/ksmbd/smbfsctl.h91
-rw-r--r--fs/ksmbd/smbstatus.h1822
-rw-r--r--fs/ksmbd/transport_ipc.c874
-rw-r--r--fs/ksmbd/transport_ipc.h47
-rw-r--r--fs/ksmbd/transport_rdma.c2058
-rw-r--r--fs/ksmbd/transport_rdma.h63
-rw-r--r--fs/ksmbd/transport_tcp.c618
-rw-r--r--fs/ksmbd/transport_tcp.h13
-rw-r--r--fs/ksmbd/unicode.c384
-rw-r--r--fs/ksmbd/unicode.h357
-rw-r--r--fs/ksmbd/uniupr.h268
-rw-r--r--fs/ksmbd/vfs.c1895
-rw-r--r--fs/ksmbd/vfs.h197
-rw-r--r--fs/ksmbd/vfs_cache.c725
-rw-r--r--fs/ksmbd/vfs_cache.h178
-rw-r--r--fs/ksmbd/xattr.h122
-rw-r--r--fs/lockd/svc.c4
-rw-r--r--fs/lockd/svc4proc.c7
-rw-r--r--fs/lockd/svclock.c82
-rw-r--r--fs/lockd/svcproc.c6
-rw-r--r--fs/lockd/svcsubs.c114
-rw-r--r--fs/locks.c123
-rw-r--r--fs/namei.c294
-rw-r--r--fs/namespace.c113
-rw-r--r--fs/nfs/callback.c4
-rw-r--r--fs/nfs/callback_xdr.c61
-rw-r--r--fs/nfs/client.c2
-rw-r--r--fs/nfs/export.c2
-rw-r--r--fs/nfs/file.c5
-rw-r--r--fs/nfs/fs_context.c7
-rw-r--r--fs/nfs/internal.h2
-rw-r--r--fs/nfs/nfs3_fs.h2
-rw-r--r--fs/nfs/nfs3acl.c5
-rw-r--r--fs/nfs/nfs3proc.c3
-rw-r--r--fs/nfs/nfs4client.c41
-rw-r--r--fs/nfs/nfs4file.c10
-rw-r--r--fs/nfs/pnfs.c20
-rw-r--r--fs/nfs/read.c8
-rw-r--r--fs/nfs/super.c2
-rw-r--r--fs/nfsd/Kconfig2
-rw-r--r--fs/nfsd/lockd.c8
-rw-r--r--fs/nfsd/nfs4state.c34
-rw-r--r--fs/nfsd/nfsproc.c1
-rw-r--r--fs/nfsd/trace.h17
-rw-r--r--fs/nfsd/vfs.c44
-rw-r--r--fs/nilfs2/super.c2
-rw-r--r--fs/notify/fanotify/fanotify_user.c251
-rw-r--r--fs/notify/fsnotify.c6
-rw-r--r--fs/notify/fsnotify.h15
-rw-r--r--fs/notify/mark.c52
-rw-r--r--fs/ntfs3/ntfs_fs.h2
-rw-r--r--fs/ntfs3/xattr.c7
-rw-r--r--fs/ocfs2/acl.c5
-rw-r--r--fs/ocfs2/acl.h2
-rw-r--r--fs/ocfs2/dlmglue.c14
-rw-r--r--fs/ocfs2/locks.c4
-rw-r--r--fs/ocfs2/quota_global.c1
-rw-r--r--fs/ocfs2/quota_local.c2
-rw-r--r--fs/open.c8
-rw-r--r--fs/orangefs/acl.c5
-rw-r--r--fs/orangefs/inode.c7
-rw-r--r--fs/orangefs/orangefs-kernel.h2
-rw-r--r--fs/overlayfs/copy_up.c83
-rw-r--r--fs/overlayfs/dir.c16
-rw-r--r--fs/overlayfs/inode.c105
-rw-r--r--fs/overlayfs/namei.c4
-rw-r--r--fs/overlayfs/overlayfs.h44
-rw-r--r--fs/overlayfs/super.c4
-rw-r--r--fs/overlayfs/util.c92
-rw-r--r--fs/pipe.c22
-rw-r--r--fs/posix_acl.c15
-rw-r--r--fs/read_write.c17
-rw-r--r--fs/reiserfs/acl.h2
-rw-r--r--fs/reiserfs/prints.c2
-rw-r--r--fs/reiserfs/xattr_acl.c5
-rw-r--r--fs/remap_range.c12
-rw-r--r--fs/select.c4
-rw-r--r--fs/signalfd.c4
-rw-r--r--fs/squashfs/block.c7
-rw-r--r--fs/squashfs/lz4_wrapper.c2
-rw-r--r--fs/squashfs/lzo_wrapper.c2
-rw-r--r--fs/squashfs/xz_wrapper.c2
-rw-r--r--fs/squashfs/zlib_wrapper.c2
-rw-r--r--fs/squashfs/zstd_wrapper.c2
-rw-r--r--fs/stat.c18
-rw-r--r--fs/super.c2
-rw-r--r--fs/sysfs/file.c4
-rw-r--r--fs/sysfs/group.c2
-rw-r--r--fs/timerfd.c16
-rw-r--r--fs/ubifs/file.c13
-rw-r--r--fs/udf/dir.c5
-rw-r--r--fs/udf/ecma_167.h44
-rw-r--r--fs/udf/inode.c3
-rw-r--r--fs/udf/misc.c13
-rw-r--r--fs/udf/namei.c13
-rw-r--r--fs/udf/osta_udf.h22
-rw-r--r--fs/udf/super.c75
-rw-r--r--fs/udf/udf_sb.h2
-rw-r--r--fs/udf/udfdecl.h4
-rw-r--r--fs/udf/unicode.c4
-rw-r--r--fs/ufs/super.c2
-rw-r--r--fs/userfaultfd.c116
-rw-r--r--fs/xfs/kmem.c64
-rw-r--r--fs/xfs/kmem.h2
-rw-r--r--fs/xfs/libxfs/xfs_ag.c25
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c56
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h12
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c100
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.h2
-rw-r--r--fs/xfs/libxfs/xfs_attr.c56
-rw-r--r--fs/xfs/libxfs/xfs_attr.h1
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c57
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c21
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.h2
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c38
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c56
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.h9
-rw-r--r--fs/xfs/libxfs/xfs_btree.c141
-rw-r--r--fs/xfs/libxfs/xfs_btree.h56
-rw-r--r--fs/xfs/libxfs/xfs_btree_staging.c14
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c18
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h2
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c6
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c14
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c20
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c14
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c20
-rw-r--r--fs/xfs/libxfs/xfs_dir2_priv.h2
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c12
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c8
-rw-r--r--fs/xfs/libxfs/xfs_format.h226
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c69
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h3
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c88
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.h2
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c22
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.h11
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h6
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h2
-rw-r--r--fs/xfs/libxfs/xfs_log_rlimit.c2
-rw-r--r--fs/xfs/libxfs/xfs_quota_defs.h30
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c12
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h2
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c54
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c34
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h11
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c72
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.h2
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c14
-rw-r--r--fs/xfs/libxfs/xfs_sb.c263
-rw-r--r--fs/xfs/libxfs/xfs_sb.h4
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c14
-rw-r--r--fs/xfs/libxfs/xfs_trans_inode.c2
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c48
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.h2
-rw-r--r--fs/xfs/libxfs/xfs_trans_space.h6
-rw-r--r--fs/xfs/libxfs/xfs_types.c2
-rw-r--r--fs/xfs/libxfs/xfs_types.h5
-rw-r--r--fs/xfs/scrub/agheader.c47
-rw-r--r--fs/xfs/scrub/agheader_repair.c66
-rw-r--r--fs/xfs/scrub/alloc.c2
-rw-r--r--fs/xfs/scrub/attr.c16
-rw-r--r--fs/xfs/scrub/attr.h3
-rw-r--r--fs/xfs/scrub/bitmap.c4
-rw-r--r--fs/xfs/scrub/bmap.c41
-rw-r--r--fs/xfs/scrub/btree.c9
-rw-r--r--fs/xfs/scrub/btree.h4
-rw-r--r--fs/xfs/scrub/common.c77
-rw-r--r--fs/xfs/scrub/common.h18
-rw-r--r--fs/xfs/scrub/dabtree.c4
-rw-r--r--fs/xfs/scrub/dir.c10
-rw-r--r--fs/xfs/scrub/fscounters.c6
-rw-r--r--fs/xfs/scrub/ialloc.c4
-rw-r--r--fs/xfs/scrub/inode.c14
-rw-r--r--fs/xfs/scrub/quota.c4
-rw-r--r--fs/xfs/scrub/refcount.c4
-rw-r--r--fs/xfs/scrub/repair.c32
-rw-r--r--fs/xfs/scrub/rmap.c2
-rw-r--r--fs/xfs/scrub/rtbitmap.c2
-rw-r--r--fs/xfs/scrub/scrub.c23
-rw-r--r--fs/xfs/scrub/scrub.h3
-rw-r--r--fs/xfs/scrub/trace.c8
-rw-r--r--fs/xfs/scrub/trace.h78
-rw-r--r--fs/xfs/xfs_acl.c7
-rw-r--r--fs/xfs/xfs_acl.h4
-rw-r--r--fs/xfs/xfs_aops.c25
-rw-r--r--fs/xfs/xfs_attr_inactive.c6
-rw-r--r--fs/xfs/xfs_attr_list.c2
-rw-r--r--fs/xfs/xfs_bmap_item.c14
-rw-r--r--fs/xfs/xfs_bmap_util.c35
-rw-r--r--fs/xfs/xfs_buf.c42
-rw-r--r--fs/xfs/xfs_buf.h25
-rw-r--r--fs/xfs/xfs_buf_item.c6
-rw-r--r--fs/xfs/xfs_buf_item_recover.c10
-rw-r--r--fs/xfs/xfs_dir2_readdir.c4
-rw-r--r--fs/xfs/xfs_discard.c2
-rw-r--r--fs/xfs/xfs_dquot.c13
-rw-r--r--fs/xfs/xfs_dquot.h10
-rw-r--r--fs/xfs/xfs_dquot_item.c134
-rw-r--r--fs/xfs/xfs_dquot_item.h17
-rw-r--r--fs/xfs/xfs_dquot_item_recover.c4
-rw-r--r--fs/xfs/xfs_error.c4
-rw-r--r--fs/xfs/xfs_error.h12
-rw-r--r--fs/xfs/xfs_export.c4
-rw-r--r--fs/xfs/xfs_extfree_item.c3
-rw-r--r--fs/xfs/xfs_file.c31
-rw-r--r--fs/xfs/xfs_filestream.c2
-rw-r--r--fs/xfs/xfs_filestream.h2
-rw-r--r--fs/xfs/xfs_fsmap.c68
-rw-r--r--fs/xfs/xfs_fsops.c67
-rw-r--r--fs/xfs/xfs_health.c2
-rw-r--r--fs/xfs/xfs_icache.c754
-rw-r--r--fs/xfs/xfs_icache.h14
-rw-r--r--fs/xfs/xfs_icreate_item.c4
-rw-r--r--fs/xfs/xfs_inode.c223
-rw-r--r--fs/xfs/xfs_inode.h28
-rw-r--r--fs/xfs/xfs_inode_item.c2
-rw-r--r--fs/xfs/xfs_inode_item_recover.c2
-rw-r--r--fs/xfs/xfs_ioctl.c33
-rw-r--r--fs/xfs/xfs_ioctl32.c4
-rw-r--r--fs/xfs/xfs_iomap.c24
-rw-r--r--fs/xfs/xfs_iops.c32
-rw-r--r--fs/xfs/xfs_itable.c44
-rw-r--r--fs/xfs/xfs_iwalk.c33
-rw-r--r--fs/xfs/xfs_log.c721
-rw-r--r--fs/xfs/xfs_log.h7
-rw-r--r--fs/xfs/xfs_log_cil.c450
-rw-r--r--fs/xfs/xfs_log_priv.h66
-rw-r--r--fs/xfs/xfs_log_recover.c161
-rw-r--r--fs/xfs/xfs_mount.c233
-rw-r--r--fs/xfs/xfs_mount.h248
-rw-r--r--fs/xfs/xfs_pnfs.c2
-rw-r--r--fs/xfs/xfs_qm.c96
-rw-r--r--fs/xfs/xfs_qm.h3
-rw-r--r--fs/xfs/xfs_qm_bhv.c2
-rw-r--r--fs/xfs/xfs_qm_syscalls.c253
-rw-r--r--fs/xfs/xfs_quota.h2
-rw-r--r--fs/xfs/xfs_quotaops.c30
-rw-r--r--fs/xfs/xfs_refcount_item.c5
-rw-r--r--fs/xfs/xfs_reflink.c4
-rw-r--r--fs/xfs/xfs_reflink.h3
-rw-r--r--fs/xfs/xfs_rmap_item.c5
-rw-r--r--fs/xfs/xfs_rtalloc.c6
-rw-r--r--fs/xfs/xfs_rtalloc.h13
-rw-r--r--fs/xfs/xfs_super.c538
-rw-r--r--fs/xfs/xfs_symlink.c13
-rw-r--r--fs/xfs/xfs_sysfs.c1
-rw-r--r--fs/xfs/xfs_trace.c2
-rw-r--r--fs/xfs/xfs_trace.h386
-rw-r--r--fs/xfs/xfs_trans.c33
-rw-r--r--fs/xfs/xfs_trans_ail.c19
-rw-r--r--fs/xfs/xfs_trans_buf.c8
-rw-r--r--fs/xfs/xfs_trans_dquot.c51
-rw-r--r--fs/zonefs/super.c23
-rw-r--r--fs/zonefs/zonefs.h7
565 files changed, 47994 insertions, 11874 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index bb1b286c49ae..c381499f5416 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -97,10 +97,13 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type)
return acl;
}
-struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type)
+struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, bool rcu)
{
struct v9fs_session_info *v9ses;
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
v9ses = v9fs_inode2v9ses(inode);
if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) ||
((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) {
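(Aside, not part of the patch: the hunk above is one instance of a tree-wide change in this series where ->get_acl() gains a "bool rcu" argument so it can be called during RCU-walk lookups. A minimal, kernel-style sketch of the pattern the converted filesystems follow; the blocking helper name below is made up purely for illustration:

	struct posix_acl *example_get_acl(struct inode *inode, int type, bool rcu)
	{
		/* Cannot sleep during RCU walk; ask the VFS to retry under ref-walk. */
		if (rcu)
			return ERR_PTR(-ECHILD);

		/* Blocking lookup (xattr read, network round trip, ...) goes here. */
		return example_read_acl_xattr(inode, type);	/* hypothetical helper */
	}
)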
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
index e4f7e882272b..d43c8949e807 100644
--- a/fs/9p/acl.h
+++ b/fs/9p/acl.h
@@ -16,7 +16,7 @@
#ifdef CONFIG_9P_FS_POSIX_ACL
extern int v9fs_get_acl(struct inode *, struct p9_fid *);
-extern struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type);
+extern struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, bool rcu);
extern int v9fs_acl_chmod(struct inode *, struct p9_fid *);
extern int v9fs_set_create_acl(struct inode *, struct p9_fid *,
struct posix_acl *, struct posix_acl *);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 59c32c9b799f..aab5e6538660 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -121,10 +121,6 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
- /* No mandatory locks */
- if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
- return -ENOLCK;
-
if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
filemap_write_and_wait(inode->i_mapping);
invalidate_mapping_pages(&inode->i_data, 0, -1);
@@ -312,10 +308,6 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n",
filp, cmd, fl, filp);
- /* No mandatory locks */
- if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
- goto out_err;
-
if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
filemap_write_and_wait(inode->i_mapping);
invalidate_mapping_pages(&inode->i_data, 0, -1);
@@ -327,7 +319,6 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
ret = v9fs_file_getlock(filp, fl);
else
ret = -EINVAL;
-out_err:
return ret;
}
@@ -348,10 +339,6 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd,
p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n",
filp, cmd, fl, filp);
- /* No mandatory locks */
- if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
- goto out_err;
-
if (!(fl->fl_flags & FL_FLOCK))
goto out_err;
@@ -625,12 +612,7 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma);
inode = file_inode(vma->vm_file);
-
- if (!mapping_can_writeback(inode->i_mapping))
- wbc.nr_to_write = 0;
-
- might_sleep();
- sync_inode(inode, &wbc);
+ filemap_fdatawrite_wbc(inode->i_mapping, &wbc);
}
diff --git a/fs/Kconfig b/fs/Kconfig
index 6fd766308521..d8207a1b8c44 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -101,16 +101,6 @@ config FILE_LOCKING
for filesystems like NFS and for the flock() system
call. Disabling this option saves about 11k.
-config MANDATORY_FILE_LOCKING
- bool "Enable Mandatory file locking"
- depends on FILE_LOCKING
- default y
- help
- This option enables files appropriately marked files on appropriely
- mounted filesystems to support mandatory locking.
-
- To the best of my knowledge this is dead code that no one cares about.
-
source "fs/crypto/Kconfig"
source "fs/verity/Kconfig"
@@ -359,7 +349,15 @@ config NFS_V4_2_SSC_HELPER
source "net/sunrpc/Kconfig"
source "fs/ceph/Kconfig"
+
source "fs/cifs/Kconfig"
+source "fs/ksmbd/Kconfig"
+
+config CIFS_COMMON
+ tristate
+ default y if CIFS=y
+ default m if CIFS=m
+
source "fs/coda/Kconfig"
source "fs/afs/Kconfig"
source "fs/9p/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 25daacebb5bc..2f21300851ae 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -96,7 +96,9 @@ obj-$(CONFIG_LOCKD) += lockd/
obj-$(CONFIG_NLS) += nls/
obj-$(CONFIG_UNICODE) += unicode/
obj-$(CONFIG_SYSV_FS) += sysv/
+obj-$(CONFIG_CIFS_COMMON) += cifs_common/
obj-$(CONFIG_CIFS) += cifs/
+obj-$(CONFIG_SMB_SERVER) += ksmbd/
obj-$(CONFIG_HPFS_FS) += hpfs/
obj-$(CONFIG_NTFS_FS) += ntfs/
obj-$(CONFIG_NTFS3_FS) += ntfs3/
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index cb3054c7843e..c4210a3964d8 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -772,10 +772,6 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl)
fl->fl_type, fl->fl_flags,
(long long) fl->fl_start, (long long) fl->fl_end);
- /* AFS doesn't support mandatory locks */
- if (__mandatory_lock(&vnode->vfs_inode) && fl->fl_type != F_UNLCK)
- return -ENOLCK;
-
if (IS_GETLK(cmd))
return afs_do_getlk(file, fl);
diff --git a/fs/aio.c b/fs/aio.c
index 76ce0cc3ee4e..51b08ab01dff 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1695,7 +1695,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
list_del(&iocb->ki_list);
iocb->ki_res.res = mangle_poll(mask);
req->done = true;
- if (iocb->ki_eventfd && eventfd_signal_count()) {
+ if (iocb->ki_eventfd && eventfd_signal_allowed()) {
iocb = NULL;
INIT_WORK(&req->work, aio_poll_put_work);
schedule_work(&req->work);
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 48e16144c1f7..12b8fdcc445b 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -121,7 +121,7 @@ static const char *bad_inode_get_link(struct dentry *dentry,
return ERR_PTR(-EIO);
}
-static struct posix_acl *bad_inode_get_acl(struct inode *inode, int type)
+static struct posix_acl *bad_inode_get_acl(struct inode *inode, int type, bool rcu)
{
return ERR_PTR(-EIO);
}
diff --git a/fs/befs/debug.c b/fs/befs/debug.c
index eb7bd6c692c7..02fa66fb82c2 100644
--- a/fs/befs/debug.c
+++ b/fs/befs/debug.c
@@ -14,7 +14,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#ifdef __KERNEL__
-#include <stdarg.h>
+#include <linux/stdarg.h>
#include <linux/string.h>
#include <linux/spinlock.h>
#include <linux/kernel.h>
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9ef4f1fc2cb0..45df6cbccf12 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -35,6 +35,7 @@
#include <linux/uaccess.h>
#include <linux/suspend.h>
#include "internal.h"
+#include "../block/blk.h"
struct bdev_inode {
struct block_device bdev;
@@ -385,7 +386,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
(bdev_logical_block_size(bdev) - 1))
return -EINVAL;
- bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
+ bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
dio = container_of(bio, struct blkdev_dio, bio);
dio->is_sync = is_sync = is_sync_kiocb(iocb);
@@ -513,7 +514,9 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
static __init int blkdev_init(void)
{
- return bioset_init(&blkdev_dio_pool, 4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS);
+ return bioset_init(&blkdev_dio_pool, 4,
+ offsetof(struct blkdev_dio, bio),
+ BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE);
}
module_init(blkdev_init);
@@ -686,7 +689,8 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence)
return retval;
}
-int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
+static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
+ int datasync)
{
struct inode *bd_inode = bdev_file_inode(filp);
struct block_device *bdev = I_BDEV(bd_inode);
@@ -707,7 +711,6 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
return error;
}
-EXPORT_SYMBOL(blkdev_fsync);
/**
* bdev_read_page() - Start reading a page from a block device
@@ -801,7 +804,6 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
if (!ei)
return NULL;
memset(&ei->bdev, 0, sizeof(ei->bdev));
- ei->bdev.bd_bdi = &noop_backing_dev_info;
return &ei->vfs_inode;
}
@@ -812,8 +814,15 @@ static void bdev_free_inode(struct inode *inode)
free_percpu(bdev->bd_stats);
kfree(bdev->bd_meta_info);
- if (!bdev_is_partition(bdev))
+ if (!bdev_is_partition(bdev)) {
+ if (bdev->bd_disk && bdev->bd_disk->bdi)
+ bdi_put(bdev->bd_disk->bdi);
kfree(bdev->bd_disk);
+ }
+
+ if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR)
+ blk_free_ext_minor(MINOR(bdev->bd_dev));
+
kmem_cache_free(bdev_cachep, BDEV_I(inode));
}
@@ -826,16 +835,9 @@ static void init_once(void *data)
static void bdev_evict_inode(struct inode *inode)
{
- struct block_device *bdev = &BDEV_I(inode)->bdev;
truncate_inode_pages_final(&inode->i_data);
invalidate_inode_buffers(inode); /* is it needed here? */
clear_inode(inode);
- /* Detach inode from wb early as bdi_put() may free bdi->wb */
- inode_detach_wb(inode);
- if (bdev->bd_bdi != &noop_backing_dev_info) {
- bdi_put(bdev->bd_bdi);
- bdev->bd_bdi = &noop_backing_dev_info;
- }
}
static const struct super_operations bdev_sops = {
@@ -902,9 +904,6 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
bdev->bd_disk = disk;
bdev->bd_partno = partno;
bdev->bd_inode = inode;
-#ifdef CONFIG_SYSFS
- INIT_LIST_HEAD(&bdev->bd_holder_disks);
-#endif
bdev->bd_stats = alloc_percpu(struct disk_stats);
if (!bdev->bd_stats) {
iput(inode);
@@ -921,31 +920,6 @@ void bdev_add(struct block_device *bdev, dev_t dev)
insert_inode_hash(bdev->bd_inode);
}
-static struct block_device *bdget(dev_t dev)
-{
- struct inode *inode;
-
- inode = ilookup(blockdev_superblock, dev);
- if (!inode)
- return NULL;
- return &BDEV_I(inode)->bdev;
-}
-
-/**
- * bdgrab -- Grab a reference to an already referenced block device
- * @bdev: Block device to grab a reference to.
- *
- * Returns the block_device with an additional reference when successful,
- * or NULL if the inode is already beeing freed.
- */
-struct block_device *bdgrab(struct block_device *bdev)
-{
- if (!igrab(bdev->bd_inode))
- return NULL;
- return bdev;
-}
-EXPORT_SYMBOL(bdgrab);
-
long nr_blockdev_pages(void)
{
struct inode *inode;
@@ -959,12 +933,6 @@ long nr_blockdev_pages(void)
return ret;
}
-void bdput(struct block_device *bdev)
-{
- iput(bdev->bd_inode);
-}
-EXPORT_SYMBOL(bdput);
-
/**
* bd_may_claim - test whether a block device can be claimed
* @bdev: block device of interest
@@ -1094,148 +1062,6 @@ void bd_abort_claiming(struct block_device *bdev, void *holder)
}
EXPORT_SYMBOL(bd_abort_claiming);
-#ifdef CONFIG_SYSFS
-struct bd_holder_disk {
- struct list_head list;
- struct gendisk *disk;
- int refcnt;
-};
-
-static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
- struct gendisk *disk)
-{
- struct bd_holder_disk *holder;
-
- list_for_each_entry(holder, &bdev->bd_holder_disks, list)
- if (holder->disk == disk)
- return holder;
- return NULL;
-}
-
-static int add_symlink(struct kobject *from, struct kobject *to)
-{
- return sysfs_create_link(from, to, kobject_name(to));
-}
-
-static void del_symlink(struct kobject *from, struct kobject *to)
-{
- sysfs_remove_link(from, kobject_name(to));
-}
-
-/**
- * bd_link_disk_holder - create symlinks between holding disk and slave bdev
- * @bdev: the claimed slave bdev
- * @disk: the holding disk
- *
- * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
- *
- * This functions creates the following sysfs symlinks.
- *
- * - from "slaves" directory of the holder @disk to the claimed @bdev
- * - from "holders" directory of the @bdev to the holder @disk
- *
- * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
- * passed to bd_link_disk_holder(), then:
- *
- * /sys/block/dm-0/slaves/sda --> /sys/block/sda
- * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
- *
- * The caller must have claimed @bdev before calling this function and
- * ensure that both @bdev and @disk are valid during the creation and
- * lifetime of these symlinks.
- *
- * CONTEXT:
- * Might sleep.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
-{
- struct bd_holder_disk *holder;
- int ret = 0;
-
- mutex_lock(&bdev->bd_disk->open_mutex);
-
- WARN_ON_ONCE(!bdev->bd_holder);
-
- /* FIXME: remove the following once add_disk() handles errors */
- if (WARN_ON(!disk->slave_dir || !bdev->bd_holder_dir))
- goto out_unlock;
-
- holder = bd_find_holder_disk(bdev, disk);
- if (holder) {
- holder->refcnt++;
- goto out_unlock;
- }
-
- holder = kzalloc(sizeof(*holder), GFP_KERNEL);
- if (!holder) {
- ret = -ENOMEM;
- goto out_unlock;
- }
-
- INIT_LIST_HEAD(&holder->list);
- holder->disk = disk;
- holder->refcnt = 1;
-
- ret = add_symlink(disk->slave_dir, bdev_kobj(bdev));
- if (ret)
- goto out_free;
-
- ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
- if (ret)
- goto out_del;
- /*
- * bdev could be deleted beneath us which would implicitly destroy
- * the holder directory. Hold on to it.
- */
- kobject_get(bdev->bd_holder_dir);
-
- list_add(&holder->list, &bdev->bd_holder_disks);
- goto out_unlock;
-
-out_del:
- del_symlink(disk->slave_dir, bdev_kobj(bdev));
-out_free:
- kfree(holder);
-out_unlock:
- mutex_unlock(&bdev->bd_disk->open_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(bd_link_disk_holder);
-
-/**
- * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
- * @bdev: the calimed slave bdev
- * @disk: the holding disk
- *
- * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
- *
- * CONTEXT:
- * Might sleep.
- */
-void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
-{
- struct bd_holder_disk *holder;
-
- mutex_lock(&bdev->bd_disk->open_mutex);
-
- holder = bd_find_holder_disk(bdev, disk);
-
- if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
- del_symlink(disk->slave_dir, bdev_kobj(bdev));
- del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
- kobject_put(bdev->bd_holder_dir);
- list_del_init(&holder->list);
- kfree(holder);
- }
-
- mutex_unlock(&bdev->bd_disk->open_mutex);
-}
-EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
-#endif
-
static void blkdev_flush_mapping(struct block_device *bdev)
{
WARN_ON_ONCE(bdev->bd_holders);
@@ -1260,11 +1086,8 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode)
}
}
- if (!bdev->bd_openers) {
+ if (!bdev->bd_openers)
set_init_blocksize(bdev);
- if (bdev->bd_bdi == &noop_backing_dev_info)
- bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
- }
if (test_bit(GD_NEED_PART_SCAN, &disk->state))
bdev_disk_changed(disk, false);
bdev->bd_openers++;
@@ -1282,16 +1105,14 @@ static void blkdev_put_whole(struct block_device *bdev, fmode_t mode)
static int blkdev_get_part(struct block_device *part, fmode_t mode)
{
struct gendisk *disk = part->bd_disk;
- struct block_device *whole;
int ret;
if (part->bd_openers)
goto done;
- whole = bdgrab(disk->part0);
- ret = blkdev_get_whole(whole, mode);
+ ret = blkdev_get_whole(bdev_whole(part), mode);
if (ret)
- goto out_put_whole;
+ return ret;
ret = -ENXIO;
if (!bdev_nr_sectors(part))
@@ -1299,16 +1120,12 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode)
disk->open_partitions++;
set_init_blocksize(part);
- if (part->bd_bdi == &noop_backing_dev_info)
- part->bd_bdi = bdi_get(disk->queue->backing_dev_info);
done:
part->bd_openers++;
return 0;
out_blkdev_put:
- blkdev_put_whole(whole, mode);
-out_put_whole:
- bdput(whole);
+ blkdev_put_whole(bdev_whole(part), mode);
return ret;
}
@@ -1321,42 +1138,42 @@ static void blkdev_put_part(struct block_device *part, fmode_t mode)
blkdev_flush_mapping(part);
whole->bd_disk->open_partitions--;
blkdev_put_whole(whole, mode);
- bdput(whole);
}
struct block_device *blkdev_get_no_open(dev_t dev)
{
struct block_device *bdev;
- struct gendisk *disk;
+ struct inode *inode;
- bdev = bdget(dev);
- if (!bdev) {
+ inode = ilookup(blockdev_superblock, dev);
+ if (!inode) {
blk_request_module(dev);
- bdev = bdget(dev);
- if (!bdev)
+ inode = ilookup(blockdev_superblock, dev);
+ if (!inode)
return NULL;
}
- disk = bdev->bd_disk;
- if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj))
- goto bdput;
- if ((disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP)
- goto put_disk;
- if (!try_module_get(bdev->bd_disk->fops->owner))
- goto put_disk;
+ /* switch from the inode reference to a device mode one: */
+ bdev = &BDEV_I(inode)->bdev;
+ if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
+ bdev = NULL;
+ iput(inode);
+
+ if (!bdev)
+ return NULL;
+ if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN) ||
+ !try_module_get(bdev->bd_disk->fops->owner)) {
+ put_device(&bdev->bd_device);
+ return NULL;
+ }
+
return bdev;
-put_disk:
- put_disk(disk);
-bdput:
- bdput(bdev);
- return NULL;
}
void blkdev_put_no_open(struct block_device *bdev)
{
module_put(bdev->bd_disk->fops->owner);
- put_disk(bdev->bd_disk);
- bdput(bdev);
+ put_device(&bdev->bd_device);
}
/**
@@ -1409,7 +1226,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
mutex_lock(&disk->open_mutex);
ret = -ENXIO;
- if (!(disk->flags & GENHD_FL_UP))
+ if (!disk_live(disk))
goto abort_claiming;
if (bdev_is_partition(bdev))
ret = blkdev_get_part(bdev, mode);
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index cec88a66bd6c..3dcf9bcc2326 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -36,6 +36,7 @@ btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o
+btrfs-$(CONFIG_FS_VERITY) += verity.o
btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
tests/extent-buffer-tests.o tests/btrfs-tests.o \
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index d95eb5c8cb37..0a0d0eccee4e 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -16,13 +16,16 @@
#include "btrfs_inode.h"
#include "xattr.h"
-struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
+struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu)
{
int size;
const char *name;
char *value = NULL;
struct posix_acl *acl;
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
switch (type) {
case ACL_TYPE_ACCESS:
name = XATTR_NAME_POSIX_ACL_ACCESS;
@@ -53,7 +56,8 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
}
static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
- struct inode *inode, struct posix_acl *acl, int type)
+ struct user_namespace *mnt_userns,
+ struct inode *inode, struct posix_acl *acl, int type)
{
int ret, size = 0;
const char *name;
@@ -114,12 +118,12 @@ int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
umode_t old_mode = inode->i_mode;
if (type == ACL_TYPE_ACCESS && acl) {
- ret = posix_acl_update_mode(&init_user_ns, inode,
+ ret = posix_acl_update_mode(mnt_userns, inode,
&inode->i_mode, &acl);
if (ret)
return ret;
}
- ret = __btrfs_set_acl(NULL, inode, acl, type);
+ ret = __btrfs_set_acl(NULL, mnt_userns, inode, acl, type);
if (ret)
inode->i_mode = old_mode;
return ret;
@@ -140,14 +144,14 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans,
return ret;
if (default_acl) {
- ret = __btrfs_set_acl(trans, inode, default_acl,
+ ret = __btrfs_set_acl(trans, &init_user_ns, inode, default_acl,
ACL_TYPE_DEFAULT);
posix_acl_release(default_acl);
}
if (acl) {
if (!ret)
- ret = __btrfs_set_acl(trans, inode, acl,
+ ret = __btrfs_set_acl(trans, &init_user_ns, inode, acl,
ACL_TYPE_ACCESS);
posix_acl_release(acl);
}
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 78b202d198b8..f735b8798ba1 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1211,7 +1211,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
again:
head = NULL;
- ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
if (ret < 0)
goto out;
BUG_ON(ret == 0);
@@ -1488,14 +1488,14 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 time_seq, struct ulist **roots,
- bool ignore_offset, bool skip_commit_root_sem)
+ bool skip_commit_root_sem)
{
int ret;
if (!trans && !skip_commit_root_sem)
down_read(&fs_info->commit_root_sem);
ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr,
- time_seq, roots, ignore_offset);
+ time_seq, roots, false);
if (!trans && !skip_commit_root_sem)
up_read(&fs_info->commit_root_sem);
return ret;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index ff5f07f9940b..ba454032dbe2 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -47,7 +47,7 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
const u64 *extent_item_pos, bool ignore_offset);
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **roots, bool ignore_offset,
+ u64 time_seq, struct ulist **roots,
bool skip_commit_root_sem);
char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
u32 name_len, unsigned long name_off,
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 9e7d9d0c763d..a3b830b8410a 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1561,7 +1561,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
div64_u64(zone_unusable * 100, bg->length));
trace_btrfs_reclaim_block_group(bg);
ret = btrfs_relocate_chunk(fs_info, bg->start);
- if (ret)
+ if (ret && ret != -EAGAIN)
btrfs_err(fs_info, "error relocating chunk %llu",
bg->start);
@@ -2105,11 +2105,22 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
bg->used = em->len;
bg->flags = map->type;
ret = btrfs_add_block_group_cache(fs_info, bg);
+ /*
+ * We may have some valid block group cache added already, in
+ * that case we skip to the next one.
+ */
+ if (ret == -EEXIST) {
+ ret = 0;
+ btrfs_put_block_group(bg);
+ continue;
+ }
+
if (ret) {
btrfs_remove_free_space_cache(bg);
btrfs_put_block_group(bg);
break;
}
+
btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
0, 0, &space_info);
bg->space_info = space_info;
@@ -2212,6 +2223,14 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
ret = check_chunk_block_group_mappings(info);
error:
btrfs_free_path(path);
+ /*
+ * We've hit some error while reading the extent tree, and have
+ * rescue=ibadroots mount option.
+ * Try to fill the tree using dummy block groups so that the user can
+ * continue to mount and grab their data.
+ */
+ if (ret && btrfs_test_opt(info, IGNOREBADROOTS))
+ ret = fill_dummy_bgs(info);
return ret;
}
@@ -2244,6 +2263,95 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
}
+static int insert_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 chunk_offset,
+ u64 start, u64 num_bytes)
+{
+ struct btrfs_fs_info *fs_info = device->fs_info;
+ struct btrfs_root *root = fs_info->dev_root;
+ struct btrfs_path *path;
+ struct btrfs_dev_extent *extent;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ int ret;
+
+ WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
+ WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = device->devid;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = start;
+ ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
+ btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID);
+ btrfs_set_dev_extent_chunk_objectid(leaf, extent,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
+
+ btrfs_set_dev_extent_length(leaf, extent, num_bytes);
+ btrfs_mark_buffer_dirty(leaf);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * This function belongs to phase 2.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
+ */
+static int insert_dev_extents(struct btrfs_trans_handle *trans,
+ u64 chunk_offset, u64 chunk_size)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_device *device;
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 dev_offset;
+ u64 stripe_size;
+ int i;
+ int ret = 0;
+
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ map = em->map_lookup;
+ stripe_size = em->orig_block_len;
+
+ /*
+ * Take the device list mutex to prevent races with the final phase of
+ * a device replace operation that replaces the device object associated
+ * with the map's stripes, because the device object's id can change
+ * at any time during that final phase of the device replace operation
+ * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
+ * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
+ * resulting in persisting a device extent item with such ID.
+ */
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ dev_offset = map->stripes[i].physical;
+
+ ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
+ stripe_size);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+ free_extent_map(em);
+ return ret;
+}
+
/*
* This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
* chunk allocation.
@@ -2278,8 +2386,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
if (ret)
btrfs_abort_transaction(trans, ret);
}
- ret = btrfs_finish_chunk_alloc(trans, block_group->start,
- block_group->length);
+ ret = insert_dev_extents(trans, block_group->start,
+ block_group->length);
if (ret)
btrfs_abort_transaction(trans, ret);
add_block_group_free_space(trans, block_group);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index c652e19ad74e..76ee1452c57b 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -51,6 +51,13 @@ enum {
* the file range, inode's io_tree).
*/
BTRFS_INODE_NO_DELALLOC_FLUSH,
+ /*
+ * Set when we are working on enabling verity for a file. Computing and
+ * writing the whole Merkle tree can take a while so we want to prevent
+ * races where two separate tasks attempt to simultaneously start verity
+ * on the same file.
+ */
+ BTRFS_INODE_VERITY_IN_PROGRESS,
};
/* in memory btrfs inode */
@@ -189,8 +196,10 @@ struct btrfs_inode {
*/
u64 csum_bytes;
- /* flags field from the on disk inode */
+ /* Backwards incompatible flags, lower half of inode_item::flags */
u32 flags;
+ /* Read-only compatibility flags, upper half of inode_item::flags */
+ u32 ro_flags;
/*
* Counters to keep track of the number of extent item's we may use due
@@ -348,6 +357,22 @@ struct btrfs_dio_private {
u8 csums[];
};
+/*
+ * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two
+ * separate u32s. These two functions convert between the two representations.
+ */
+static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags)
+{
+ return (flags | ((u64)ro_flags << 32));
+}
+
+static inline void btrfs_inode_split_flags(u64 inode_item_flags,
+ u32 *flags, u32 *ro_flags)
+{
+ *flags = (u32)inode_item_flags;
+ *ro_flags = (u32)(inode_item_flags >> 32);
+}
+
/* Array of bytes with variable length, hexadecimal format 0x1234 */
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
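(A quick, non-authoritative illustration of the two helpers added above, with arbitrary example bits: btrfs_inode_combine_flags() packs the incompat flags into the low 32 bits and the new read-only compat flags into the high 32 bits of the on-disk u64, and btrfs_inode_split_flags() reverses that:

	u32 flags = 0x2, ro_flags = 0x1;			/* arbitrary example values */
	u64 packed = btrfs_inode_combine_flags(flags, ro_flags);	/* == 0x0000000100000002 */
	u32 f, ro;

	btrfs_inode_split_flags(packed, &f, &ro);		/* f == 0x2, ro == 0x1 again */
)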
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 169508609324..86816088927f 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -243,47 +243,6 @@ struct btrfsic_state {
u32 datablock_size;
};
-static void btrfsic_block_init(struct btrfsic_block *b);
-static struct btrfsic_block *btrfsic_block_alloc(void);
-static void btrfsic_block_free(struct btrfsic_block *b);
-static void btrfsic_block_link_init(struct btrfsic_block_link *n);
-static struct btrfsic_block_link *btrfsic_block_link_alloc(void);
-static void btrfsic_block_link_free(struct btrfsic_block_link *n);
-static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds);
-static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void);
-static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds);
-static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h);
-static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
- struct btrfsic_block_hashtable *h);
-static void btrfsic_block_hashtable_remove(struct btrfsic_block *b);
-static struct btrfsic_block *btrfsic_block_hashtable_lookup(
- struct block_device *bdev,
- u64 dev_bytenr,
- struct btrfsic_block_hashtable *h);
-static void btrfsic_block_link_hashtable_init(
- struct btrfsic_block_link_hashtable *h);
-static void btrfsic_block_link_hashtable_add(
- struct btrfsic_block_link *l,
- struct btrfsic_block_link_hashtable *h);
-static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l);
-static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
- struct block_device *bdev_ref_to,
- u64 dev_bytenr_ref_to,
- struct block_device *bdev_ref_from,
- u64 dev_bytenr_ref_from,
- struct btrfsic_block_link_hashtable *h);
-static void btrfsic_dev_state_hashtable_init(
- struct btrfsic_dev_state_hashtable *h);
-static void btrfsic_dev_state_hashtable_add(
- struct btrfsic_dev_state *ds,
- struct btrfsic_dev_state_hashtable *h);
-static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds);
-static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev,
- struct btrfsic_dev_state_hashtable *h);
-static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void);
-static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf);
-static int btrfsic_process_superblock(struct btrfsic_state *state,
- struct btrfs_fs_devices *fs_devices);
static int btrfsic_process_metablock(struct btrfsic_state *state,
struct btrfsic_block *block,
struct btrfsic_block_data_ctx *block_ctx,
@@ -313,14 +272,6 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
static int btrfsic_read_block(struct btrfsic_state *state,
struct btrfsic_block_data_ctx *block_ctx);
-static void btrfsic_dump_database(struct btrfsic_state *state);
-static int btrfsic_test_for_metadata(struct btrfsic_state *state,
- char **datav, unsigned int num_pages);
-static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
- u64 dev_bytenr, char **mapped_datav,
- unsigned int num_pages,
- struct bio *bio, int *bio_is_patched,
- int submit_bio_bh_rw);
static int btrfsic_process_written_superblock(
struct btrfsic_state *state,
struct btrfsic_block *const block,
@@ -1558,10 +1509,8 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
/* Pages must be unmapped in reverse order */
while (num_pages > 0) {
num_pages--;
- if (block_ctx->datav[num_pages]) {
- kunmap_local(block_ctx->datav[num_pages]);
+ if (block_ctx->datav[num_pages])
block_ctx->datav[num_pages] = NULL;
- }
if (block_ctx->pagev[num_pages]) {
__free_page(block_ctx->pagev[num_pages]);
block_ctx->pagev[num_pages] = NULL;
@@ -1638,7 +1587,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
i = j;
}
for (i = 0; i < num_pages; i++)
- block_ctx->datav[i] = kmap_local_page(block_ctx->pagev[i]);
+ block_ctx->datav[i] = page_address(block_ctx->pagev[i]);
return block_ctx->len;
}
@@ -2703,7 +2652,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
bio_for_each_segment(bvec, bio, iter) {
BUG_ON(bvec.bv_len != PAGE_SIZE);
- mapped_datav[i] = kmap_local_page(bvec.bv_page);
+ mapped_datav[i] = page_address(bvec.bv_page);
i++;
if (dev_state->state->print_mask &
@@ -2716,9 +2665,6 @@ static void __btrfsic_submit_bio(struct bio *bio)
mapped_datav, segs,
bio, &bio_is_patched,
bio->bi_opf);
- /* Unmap in reverse order */
- for (--i; i >= 0; i--)
- kunmap_local(mapped_datav[i]);
kfree(mapped_datav);
} else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
if (dev_state->state->print_mask &
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 30d82cdf128c..7869ad12bc6e 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -172,10 +172,9 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
/* Hash through the page sector by sector */
for (pg_offset = 0; pg_offset < bytes_left;
pg_offset += sectorsize) {
- kaddr = kmap_atomic(page);
+ kaddr = page_address(page);
crypto_shash_digest(shash, kaddr + pg_offset,
sectorsize, csum);
- kunmap_atomic(kaddr);
if (memcmp(&csum, cb_sum, csum_size) != 0) {
btrfs_print_data_csum_error(inode, disk_start,
@@ -565,6 +564,16 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (isize == 0)
return 0;
+ /*
+ * For current subpage support, we only support 64K page size,
+ * which means maximum compressed extent size (128K) is just 2x page
+ * size.
+ * This makes readahead less effective, so here disable readahead for
+ * subpage for now, until full compressed write is supported.
+ */
+ if (btrfs_sb(inode->i_sb)->sectorsize < PAGE_SIZE)
+ return 0;
+
end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
while (last_offset < compressed_end) {
@@ -673,6 +682,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
struct page *page;
struct bio *comp_bio;
u64 cur_disk_byte = bio->bi_iter.bi_sector << 9;
+ u64 file_offset;
u64 em_len;
u64 em_start;
struct extent_map *em;
@@ -682,15 +692,17 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
em_tree = &BTRFS_I(inode)->extent_tree;
+ file_offset = bio_first_bvec_all(bio)->bv_offset +
+ page_offset(bio_first_page_all(bio));
+
/* we need the actual starting offset of this extent in the file */
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree,
- page_offset(bio_first_page_all(bio)),
- fs_info->sectorsize);
+ em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
read_unlock(&em_tree->lock);
if (!em)
return BLK_STS_IOERR;
+ ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
compressed_len = em->block_len;
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
if (!cb)
@@ -721,8 +733,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
goto fail1;
for (pg_index = 0; pg_index < nr_pages; pg_index++) {
- cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
- __GFP_HIGHMEM);
+ cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS);
if (!cb->compressed_pages[pg_index]) {
faili = pg_index - 1;
ret = BLK_STS_RESOURCE;
@@ -1261,96 +1272,82 @@ void __cold btrfs_exit_compress(void)
}
/*
- * Copy uncompressed data from working buffer to pages.
+ * Copy decompressed data from working buffer to pages.
+ *
+ * @buf: The decompressed data buffer
+ * @buf_len: The decompressed data length
+ * @decompressed: Number of bytes that are already decompressed inside the
+ * compressed extent
+ * @cb: The compressed extent descriptor
+ * @orig_bio: The original bio that the caller wants to read for
+ *
+ * An easier to understand graph is like below:
+ *
+ * |<- orig_bio ->| |<- orig_bio->|
+ * |<------- full decompressed extent ----->|
+ * |<----------- @cb range ---->|
+ * | |<-- @buf_len -->|
+ * |<--- @decompressed --->|
+ *
+ * Note that, @cb can be a subpage of the full decompressed extent, but
+ * @cb->start always has the same as the orig_file_offset value of the full
+ * decompressed extent.
*
- * buf_start is the byte offset we're of the start of our workspace buffer.
+ * When reading compressed extent, we have to read the full compressed extent,
+ * while @orig_bio may only want part of the range.
+ * Thus this function will ensure only data covered by @orig_bio will be copied
+ * to.
*
- * total_out is the last byte of the buffer
+ * Return 0 if we have copied all needed contents for @orig_bio.
+ * Return >0 if we need continue decompress.
*/
-int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
- unsigned long total_out, u64 disk_start,
- struct bio *bio)
+int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
+ struct compressed_bio *cb, u32 decompressed)
{
- unsigned long buf_offset;
- unsigned long current_buf_start;
- unsigned long start_byte;
- unsigned long prev_start_byte;
- unsigned long working_bytes = total_out - buf_start;
- unsigned long bytes;
- struct bio_vec bvec = bio_iter_iovec(bio, bio->bi_iter);
-
- /*
- * start byte is the first byte of the page we're currently
- * copying into relative to the start of the compressed data.
- */
- start_byte = page_offset(bvec.bv_page) - disk_start;
-
- /* we haven't yet hit data corresponding to this page */
- if (total_out <= start_byte)
- return 1;
-
- /*
- * the start of the data we care about is offset into
- * the middle of our working buffer
- */
- if (total_out > start_byte && buf_start < start_byte) {
- buf_offset = start_byte - buf_start;
- working_bytes -= buf_offset;
- } else {
- buf_offset = 0;
- }
- current_buf_start = buf_start;
-
- /* copy bytes from the working buffer into the pages */
- while (working_bytes > 0) {
- bytes = min_t(unsigned long, bvec.bv_len,
- PAGE_SIZE - (buf_offset % PAGE_SIZE));
- bytes = min(bytes, working_bytes);
-
- memcpy_to_page(bvec.bv_page, bvec.bv_offset, buf + buf_offset,
- bytes);
- flush_dcache_page(bvec.bv_page);
+ struct bio *orig_bio = cb->orig_bio;
+ /* Offset inside the full decompressed extent */
+ u32 cur_offset;
+
+ cur_offset = decompressed;
+ /* The main loop to do the copy */
+ while (cur_offset < decompressed + buf_len) {
+ struct bio_vec bvec;
+ size_t copy_len;
+ u32 copy_start;
+ /* Offset inside the full decompressed extent */
+ u32 bvec_offset;
+
+ bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter);
+ /*
+ * cb->start may underflow, but subtracting that value can still
+ * give us correct offset inside the full decompressed extent.
+ */
+ bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start;
- buf_offset += bytes;
- working_bytes -= bytes;
- current_buf_start += bytes;
+ /* Haven't reached the bvec range, exit */
+ if (decompressed + buf_len <= bvec_offset)
+ return 1;
- /* check if we need to pick another page */
- bio_advance(bio, bytes);
- if (!bio->bi_iter.bi_size)
- return 0;
- bvec = bio_iter_iovec(bio, bio->bi_iter);
- prev_start_byte = start_byte;
- start_byte = page_offset(bvec.bv_page) - disk_start;
+ copy_start = max(cur_offset, bvec_offset);
+ copy_len = min(bvec_offset + bvec.bv_len,
+ decompressed + buf_len) - copy_start;
+ ASSERT(copy_len);
/*
- * We need to make sure we're only adjusting
- * our offset into compression working buffer when
- * we're switching pages. Otherwise we can incorrectly
- * keep copying when we were actually done.
+ * Extra range check to ensure we didn't go beyond
+ * @buf + @buf_len.
*/
- if (start_byte != prev_start_byte) {
- /*
- * make sure our new page is covered by this
- * working buffer
- */
- if (total_out <= start_byte)
- return 1;
+ ASSERT(copy_start - decompressed < buf_len);
+ memcpy_to_page(bvec.bv_page, bvec.bv_offset,
+ buf + copy_start - decompressed, copy_len);
+ flush_dcache_page(bvec.bv_page);
+ cur_offset += copy_len;
- /*
- * the next page in the biovec might not be adjacent
- * to the last page, but it might still be found
- * inside this working buffer. bump our offset pointer
- */
- if (total_out > start_byte &&
- current_buf_start < start_byte) {
- buf_offset = start_byte - buf_start;
- working_bytes = total_out - start_byte;
- current_buf_start = buf_start + buf_offset;
- }
- }
+ bio_advance(orig_bio, copy_len);
+ /* Finished the bio */
+ if (!orig_bio->bi_iter.bi_size)
+ return 0;
}
-
return 1;
}
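For reference, the copy window computed in btrfs_decompress_buf2page() above is simply the intersection of the decompressed range [@decompressed, @decompressed + @buf_len) with the current bvec. A minimal standalone sketch of that arithmetic, assuming the kernel's min()/max() helpers; copy_window() is an invented name, illustrative only and not part of the patch:

static inline u32 copy_window(u32 cur_offset, u32 decompressed, u32 buf_len,
			      u32 bvec_offset, u32 bvec_len, u32 *copy_start)
{
	/* Copying starts at whichever of the two ranges begins later */
	*copy_start = max(cur_offset, bvec_offset);
	/* ... and stops at whichever of the two ranges ends first */
	return min(bvec_offset + bvec_len, decompressed + buf_len) - *copy_start;
}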
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index c359f20920d0..399be0b435bf 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -86,9 +86,8 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
unsigned long *total_out);
int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
unsigned long start_byte, size_t srclen, size_t destlen);
-int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
- unsigned long total_out, u64 disk_start,
- struct bio *bio);
+int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
+ struct compressed_bio *cb, u32 decompressed);
blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
unsigned int len, u64 disk_start,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c5c08c87e130..84627cbd5b5b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -726,21 +726,21 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
/*
* search for key in the extent_buffer. The items start at offset p,
- * and they are item_size apart. There are 'max' items in p.
+ * and they are item_size apart.
*
* the slot in the array is returned via slot, and it points to
* the place where you would insert key if it is not found in
* the array.
*
- * slot may point to max if the key is bigger than all of the keys
+ * Slot may point to the total number of items if the key is bigger than
+ * all of the keys.
*/
static noinline int generic_bin_search(struct extent_buffer *eb,
unsigned long p, int item_size,
- const struct btrfs_key *key,
- int max, int *slot)
+ const struct btrfs_key *key, int *slot)
{
int low = 0;
- int high = max;
+ int high = btrfs_header_nritems(eb);
int ret;
const int key_size = sizeof(struct btrfs_disk_key);
@@ -799,15 +799,11 @@ int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
if (btrfs_header_level(eb) == 0)
return generic_bin_search(eb,
offsetof(struct btrfs_leaf, items),
- sizeof(struct btrfs_item),
- key, btrfs_header_nritems(eb),
- slot);
+ sizeof(struct btrfs_item), key, slot);
else
return generic_bin_search(eb,
offsetof(struct btrfs_node, ptrs),
- sizeof(struct btrfs_key_ptr),
- key, btrfs_header_nritems(eb),
- slot);
+ sizeof(struct btrfs_key_ptr), key, slot);
}
static void root_add_used(struct btrfs_root *root, u32 size)
@@ -1237,7 +1233,6 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
u64 target;
u64 nread = 0;
u64 nread_max;
- struct extent_buffer *eb;
u32 nr;
u32 blocksize;
u32 nscan = 0;
@@ -1266,10 +1261,14 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
search = btrfs_node_blockptr(node, slot);
blocksize = fs_info->nodesize;
- eb = find_extent_buffer(fs_info, search);
- if (eb) {
- free_extent_buffer(eb);
- return;
+ if (path->reada != READA_FORWARD_ALWAYS) {
+ struct extent_buffer *eb;
+
+ eb = find_extent_buffer(fs_info, search);
+ if (eb) {
+ free_extent_buffer(eb);
+ return;
+ }
}
target = search;
@@ -2103,6 +2102,27 @@ again:
}
/*
+ * Execute search and call btrfs_previous_item to traverse backwards if the item
+ * was not found.
+ *
+ * Return 0 if found, 1 if not found and < 0 on error.
+ */
+int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *path)
+{
+ int ret;
+
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ if (ret > 0)
+ ret = btrfs_previous_item(root, path, key->objectid, key->type);
+
+ if (ret == 0)
+ btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
+
+ return ret;
+}
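A usage sketch for the helper above (hypothetical, not taken from this patch; lookup_at_or_before() is an invented wrapper): the caller fills @key with the search position and gets 0 with @key updated to the item found at or before that position (same objectid and type), 1 if no such item exists, or a negative errno:

static int lookup_at_or_before(struct btrfs_root *root, struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_backwards(root, key, path);

	/* The caller only needs the key, so the path can be released here */
	btrfs_free_path(path);
	return ret;
}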
+
+/*
* adjust the pointers going up the tree, starting at level
 * making sure the right key of each node points to 'key'.
* This is used after shifting pointers to the left, so it stops
@@ -4358,16 +4378,6 @@ next:
return 1;
}
-/*
- * search the tree again to find a leaf with greater keys
- * returns 0 if it found something or 1 if there are no greater leaves.
- * returns < 0 on io errors.
- */
-int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
-{
- return btrfs_next_old_leaf(root, path, 0);
-}
-
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
u64 time_seq)
{
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e5e53e592d4f..dff2c8a3e059 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -281,7 +281,8 @@ struct btrfs_super_block {
#define BTRFS_FEATURE_COMPAT_RO_SUPP \
(BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \
- BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID)
+ BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \
+ BTRFS_FEATURE_COMPAT_RO_VERITY)
#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
@@ -1012,8 +1013,6 @@ struct btrfs_fs_info {
u64 zoned;
};
- /* Max size to emit ZONE_APPEND write command */
- u64 max_zone_append_size;
struct mutex zoned_meta_io_lock;
spinlock_t treelog_bg_lock;
u64 treelog_bg;
@@ -1484,20 +1483,20 @@ do { \
/*
* Inode flags
*/
-#define BTRFS_INODE_NODATASUM (1 << 0)
-#define BTRFS_INODE_NODATACOW (1 << 1)
-#define BTRFS_INODE_READONLY (1 << 2)
-#define BTRFS_INODE_NOCOMPRESS (1 << 3)
-#define BTRFS_INODE_PREALLOC (1 << 4)
-#define BTRFS_INODE_SYNC (1 << 5)
-#define BTRFS_INODE_IMMUTABLE (1 << 6)
-#define BTRFS_INODE_APPEND (1 << 7)
-#define BTRFS_INODE_NODUMP (1 << 8)
-#define BTRFS_INODE_NOATIME (1 << 9)
-#define BTRFS_INODE_DIRSYNC (1 << 10)
-#define BTRFS_INODE_COMPRESS (1 << 11)
-
-#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31)
+#define BTRFS_INODE_NODATASUM (1U << 0)
+#define BTRFS_INODE_NODATACOW (1U << 1)
+#define BTRFS_INODE_READONLY (1U << 2)
+#define BTRFS_INODE_NOCOMPRESS (1U << 3)
+#define BTRFS_INODE_PREALLOC (1U << 4)
+#define BTRFS_INODE_SYNC (1U << 5)
+#define BTRFS_INODE_IMMUTABLE (1U << 6)
+#define BTRFS_INODE_APPEND (1U << 7)
+#define BTRFS_INODE_NODUMP (1U << 8)
+#define BTRFS_INODE_NOATIME (1U << 9)
+#define BTRFS_INODE_DIRSYNC (1U << 10)
+#define BTRFS_INODE_COMPRESS (1U << 11)
+
+#define BTRFS_INODE_ROOT_ITEM_INIT (1U << 31)
#define BTRFS_INODE_FLAG_MASK \
(BTRFS_INODE_NODATASUM | \
@@ -1514,6 +1513,10 @@ do { \
BTRFS_INODE_COMPRESS | \
BTRFS_INODE_ROOT_ITEM_INIT)
+#define BTRFS_INODE_RO_VERITY (1U << 0)
+
+#define BTRFS_INODE_RO_FLAG_MASK (BTRFS_INODE_RO_VERITY)
+
struct btrfs_map_token {
struct extent_buffer *eb;
char *kaddr;
@@ -2781,10 +2784,11 @@ enum btrfs_flush_state {
FLUSH_DELAYED_REFS = 4,
FLUSH_DELALLOC = 5,
FLUSH_DELALLOC_WAIT = 6,
- ALLOC_CHUNK = 7,
- ALLOC_CHUNK_FORCE = 8,
- RUN_DELAYED_IPUTS = 9,
- COMMIT_TRANS = 10,
+ FLUSH_DELALLOC_FULL = 7,
+ ALLOC_CHUNK = 8,
+ ALLOC_CHUNK_FORCE = 9,
+ RUN_DELAYED_IPUTS = 10,
+ COMMIT_TRANS = 11,
};
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
@@ -2901,10 +2905,13 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
}
-int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
u64 time_seq);
+
+int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *path);
+
static inline int btrfs_next_old_item(struct btrfs_root *root,
struct btrfs_path *p, u64 time_seq)
{
@@ -2913,6 +2920,18 @@ static inline int btrfs_next_old_item(struct btrfs_root *root,
return btrfs_next_old_leaf(root, p, time_seq);
return 0;
}
+
+/*
+ * Search the tree again to find a leaf with greater keys.
+ *
+ * Returns 0 if it found something or 1 if there are no greater leaves.
+ * Returns < 0 on error.
+ */
+static inline int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
+{
+ return btrfs_next_old_leaf(root, path, 0);
+}
+
static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
{
return btrfs_next_old_item(root, p, 0);
@@ -3145,7 +3164,8 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct extent_state **cached_state);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root,
- struct btrfs_root *parent_root);
+ struct btrfs_root *parent_root,
+ struct user_namespace *mnt_userns);
void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
unsigned *bits);
void btrfs_clear_delalloc_extent(struct inode *inode,
@@ -3194,10 +3214,10 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started, unsigned long *nr_written,
struct writeback_control *wbc);
-int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
+int btrfs_writepage_cow_fixup(struct page *page);
void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
struct page *page, u64 start,
- u64 end, int uptodate);
+ u64 end, bool uptodate);
extern const struct dentry_operations btrfs_dentry_operations;
extern const struct iomap_ops btrfs_dio_iomap_ops;
extern const struct iomap_dio_ops btrfs_dio_ops;
@@ -3686,7 +3706,7 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
/* acl.c */
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
-struct posix_acl *btrfs_get_acl(struct inode *inode, int type);
+struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu);
int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct posix_acl *acl, int type);
int btrfs_init_acl(struct btrfs_trans_handle *trans,
@@ -3779,6 +3799,30 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
return signal_pending(current);
}
+/* verity.c */
+#ifdef CONFIG_FS_VERITY
+
+extern const struct fsverity_operations btrfs_verityops;
+int btrfs_drop_verity_items(struct btrfs_inode *inode);
+
+BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item,
+ encryption, 8);
+BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item,
+ size, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption,
+ struct btrfs_verity_descriptor_item, encryption, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size,
+ struct btrfs_verity_descriptor_item, size, 64);
+
+#else
+
+static inline int btrfs_drop_verity_items(struct btrfs_inode *inode)
+{
+ return 0;
+}
+
+#endif
+
/* Sanity test specific functions */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_destroy_inode(struct inode *inode);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 257c1e18abd4..1e08eb2b27f0 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -6,7 +6,6 @@
#include <linux/slab.h>
#include <linux/iversion.h>
-#include <linux/sched/mm.h>
#include "misc.h"
#include "delayed-inode.h"
#include "disk-io.h"
@@ -672,176 +671,119 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
}
/*
- * This helper will insert some continuous items into the same leaf according
- * to the free space of the leaf.
+ * Insert a single delayed item or a batch of delayed items that have consecutive
+ * keys if they exist.
*/
-static int btrfs_batch_insert_items(struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_delayed_item *item)
+static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_delayed_item *first_item)
{
- struct btrfs_delayed_item *curr, *next;
- int free_space;
- int total_size = 0;
- struct extent_buffer *leaf;
- char *data_ptr;
- struct btrfs_key *keys;
- u32 *data_size;
- struct list_head head;
- int slot;
+ LIST_HEAD(batch);
+ struct btrfs_delayed_item *curr;
+ struct btrfs_delayed_item *next;
+ const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info);
+ int total_size;
int nitems;
- int i;
- int ret = 0;
-
- BUG_ON(!path->nodes[0]);
+ char *ins_data = NULL;
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
+ int ret;
- leaf = path->nodes[0];
- free_space = btrfs_leaf_free_space(leaf);
- INIT_LIST_HEAD(&head);
+ list_add_tail(&first_item->tree_list, &batch);
+ nitems = 1;
+ total_size = first_item->data_len + sizeof(struct btrfs_item);
+ curr = first_item;
- next = item;
- nitems = 0;
+ while (true) {
+ int next_size;
- /*
- * count the number of the continuous items that we can insert in batch
- */
- while (total_size + next->data_len + sizeof(struct btrfs_item) <=
- free_space) {
- total_size += next->data_len + sizeof(struct btrfs_item);
- list_add_tail(&next->tree_list, &head);
- nitems++;
-
- curr = next;
next = __btrfs_next_delayed_item(curr);
- if (!next)
+ if (!next || !btrfs_is_continuous_delayed_item(curr, next))
break;
- if (!btrfs_is_continuous_delayed_item(curr, next))
+ next_size = next->data_len + sizeof(struct btrfs_item);
+ if (total_size + next_size > max_size)
break;
- }
- if (!nitems) {
- ret = 0;
- goto out;
+ list_add_tail(&next->tree_list, &batch);
+ nitems++;
+ total_size += next_size;
+ curr = next;
}
- keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS);
- if (!keys) {
- ret = -ENOMEM;
- goto out;
- }
+ if (nitems == 1) {
+ ins_keys = &first_item->key;
+ ins_sizes = &first_item->data_len;
+ } else {
+ int i = 0;
- data_size = kmalloc_array(nitems, sizeof(u32), GFP_NOFS);
- if (!data_size) {
- ret = -ENOMEM;
- goto error;
+ ins_data = kmalloc(nitems * sizeof(u32) +
+ nitems * sizeof(struct btrfs_key), GFP_NOFS);
+ if (!ins_data) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ins_sizes = (u32 *)ins_data;
+ ins_keys = (struct btrfs_key *)(ins_data + nitems * sizeof(u32));
+ list_for_each_entry(curr, &batch, tree_list) {
+ ins_keys[i] = curr->key;
+ ins_sizes[i] = curr->data_len;
+ i++;
+ }
}
- /* get keys of all the delayed items */
- i = 0;
- list_for_each_entry(next, &head, tree_list) {
- keys[i] = next->key;
- data_size[i] = next->data_len;
- i++;
- }
+ ret = btrfs_insert_empty_items(trans, root, path, ins_keys, ins_sizes,
+ nitems);
+ if (ret)
+ goto out;
- /* insert the keys of the items */
- setup_items_for_insert(root, path, keys, data_size, nitems);
+ list_for_each_entry(curr, &batch, tree_list) {
+ char *data_ptr;
- /* insert the dir index items */
- slot = path->slots[0];
- list_for_each_entry_safe(curr, next, &head, tree_list) {
- data_ptr = btrfs_item_ptr(leaf, slot, char);
- write_extent_buffer(leaf, &curr->data,
- (unsigned long)data_ptr,
- curr->data_len);
- slot++;
+ data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
+ write_extent_buffer(path->nodes[0], &curr->data,
+ (unsigned long)data_ptr, curr->data_len);
+ path->slots[0]++;
+ }
- btrfs_delayed_item_release_metadata(root, curr);
+ /*
+ * Now release our path before releasing the delayed items and their
+ * metadata reservations, so that we don't block other tasks for more
+ * time than needed.
+ */
+ btrfs_release_path(path);
+ list_for_each_entry_safe(curr, next, &batch, tree_list) {
list_del(&curr->tree_list);
+ btrfs_delayed_item_release_metadata(root, curr);
btrfs_release_delayed_item(curr);
}
-
-error:
- kfree(data_size);
- kfree(keys);
out:
+ kfree(ins_data);
return ret;
}
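For clarity, the single ins_data allocation used above packs both arrays back to back, the u32 sizes first and the keys second (an illustrative sketch of the layout, not part of the patch):

	ins_data
	  |
	  v
	  +------------------------+------------------------------------+
	  | u32 ins_sizes[nitems]  | struct btrfs_key ins_keys[nitems]  |
	  +------------------------+------------------------------------+
	                           ^
	                           ins_data + nitems * sizeof(u32)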
-/*
- * This helper can just do simple insertion that needn't extend item for new
- * data, such as directory name index insertion, inode insertion.
- */
-static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_delayed_item *delayed_item)
-{
- struct extent_buffer *leaf;
- unsigned int nofs_flag;
- char *ptr;
- int ret;
-
- nofs_flag = memalloc_nofs_save();
- ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key,
- delayed_item->data_len);
- memalloc_nofs_restore(nofs_flag);
- if (ret < 0 && ret != -EEXIST)
- return ret;
-
- leaf = path->nodes[0];
-
- ptr = btrfs_item_ptr(leaf, path->slots[0], char);
-
- write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
- delayed_item->data_len);
- btrfs_mark_buffer_dirty(leaf);
-
- btrfs_delayed_item_release_metadata(root, delayed_item);
- return 0;
-}
-
-/*
- * we insert an item first, then if there are some continuous items, we try
- * to insert those items into the same leaf.
- */
static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_root *root,
struct btrfs_delayed_node *node)
{
- struct btrfs_delayed_item *curr, *prev;
int ret = 0;
-do_again:
- mutex_lock(&node->mutex);
- curr = __btrfs_first_delayed_insertion_item(node);
- if (!curr)
- goto insert_end;
-
- ret = btrfs_insert_delayed_item(trans, root, path, curr);
- if (ret < 0) {
- btrfs_release_path(path);
- goto insert_end;
- }
+ while (ret == 0) {
+ struct btrfs_delayed_item *curr;
- prev = curr;
- curr = __btrfs_next_delayed_item(prev);
- if (curr && btrfs_is_continuous_delayed_item(prev, curr)) {
- /* insert the continuous items into the same leaf */
- path->slots[0]++;
- btrfs_batch_insert_items(root, path, curr);
+ mutex_lock(&node->mutex);
+ curr = __btrfs_first_delayed_insertion_item(node);
+ if (!curr) {
+ mutex_unlock(&node->mutex);
+ break;
+ }
+ ret = btrfs_insert_delayed_item(trans, root, path, curr);
+ mutex_unlock(&node->mutex);
}
- btrfs_release_delayed_item(prev);
- btrfs_mark_buffer_dirty(path->nodes[0]);
- btrfs_release_path(path);
- mutex_unlock(&node->mutex);
- goto do_again;
-
-insert_end:
- mutex_unlock(&node->mutex);
return ret;
}
@@ -914,7 +856,6 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_delayed_node *node)
{
struct btrfs_delayed_item *curr, *prev;
- unsigned int nofs_flag;
int ret = 0;
do_again:
@@ -923,9 +864,7 @@ do_again:
if (!curr)
goto delete_fail;
- nofs_flag = memalloc_nofs_save();
ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1);
- memalloc_nofs_restore(nofs_flag);
if (ret < 0)
goto delete_fail;
else if (ret > 0) {
@@ -994,7 +933,6 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
struct btrfs_key key;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
- unsigned int nofs_flag;
int mod;
int ret;
@@ -1007,9 +945,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
else
mod = 1;
- nofs_flag = memalloc_nofs_save();
ret = btrfs_lookup_inode(trans, root, path, &key, mod);
- memalloc_nofs_restore(nofs_flag);
if (ret > 0)
ret = -ENOENT;
if (ret < 0)
@@ -1066,9 +1002,7 @@ search:
key.type = BTRFS_INODE_EXTREF_KEY;
key.offset = -1;
- nofs_flag = memalloc_nofs_save();
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- memalloc_nofs_restore(nofs_flag);
if (ret < 0)
goto err_out;
ASSERT(ret);
@@ -1711,6 +1645,8 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item *inode_item,
struct inode *inode)
{
+ u64 flags;
+
btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode));
btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode));
btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
@@ -1723,7 +1659,9 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
inode_peek_iversion(inode));
btrfs_set_stack_inode_transid(inode_item, trans->transid);
btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
- btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
+ flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
+ BTRFS_I(inode)->ro_flags);
+ btrfs_set_stack_inode_flags(inode_item, flags);
btrfs_set_stack_inode_block_group(inode_item, 0);
btrfs_set_stack_timespec_sec(&inode_item->atime,
@@ -1781,7 +1719,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
btrfs_stack_inode_sequence(inode_item));
inode->i_rdev = 0;
*rdev = btrfs_stack_inode_rdev(inode_item);
- BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
+ btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item),
+ &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 98b63ebed539..f1274d5c3805 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -170,6 +170,25 @@ out_free:
return 0;
}
+static struct btrfs_dir_item *btrfs_lookup_match_dir(
+ struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *key, const char *name,
+ int name_len, int mod)
+{
+ const int ins_len = (mod < 0 ? -1 : 0);
+ const int cow = (mod != 0);
+ int ret;
+
+ ret = btrfs_search_slot(trans, root, key, path, ins_len, cow);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ return ERR_PTR(-ENOENT);
+
+ return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+}
+
/*
* lookup a directory item based on name. 'dir' is the objectid
* we're searching in, and 'mod' tells us if you plan on deleting the
@@ -181,23 +200,18 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
const char *name, int name_len,
int mod)
{
- int ret;
struct btrfs_key key;
- int ins_len = mod < 0 ? -1 : 0;
- int cow = mod != 0;
+ struct btrfs_dir_item *di;
key.objectid = dir;
key.type = BTRFS_DIR_ITEM_KEY;
-
key.offset = btrfs_name_hash(name, name_len);
- ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret > 0)
+ di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+ if (IS_ERR(di) && PTR_ERR(di) == -ENOENT)
return NULL;
- return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+ return di;
}
int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
@@ -211,7 +225,6 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
int slot;
struct btrfs_path *path;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -220,20 +233,20 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
key.type = BTRFS_DIR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-
- /* return back any errors */
- if (ret < 0)
- goto out;
+ di = btrfs_lookup_match_dir(NULL, root, path, &key, name, name_len, 0);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ /* Nothing found, we're safe */
+ if (ret == -ENOENT) {
+ ret = 0;
+ goto out;
+ }
- /* nothing found, we're safe */
- if (ret > 0) {
- ret = 0;
- goto out;
+ if (ret < 0)
+ goto out;
}
/* we found an item, look for our name in the item */
- di = btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
if (di) {
/* our exact name was found */
ret = -EEXIST;
@@ -274,21 +287,13 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
u64 objectid, const char *name, int name_len,
int mod)
{
- int ret;
struct btrfs_key key;
- int ins_len = mod < 0 ? -1 : 0;
- int cow = mod != 0;
key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = objectid;
- ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret > 0)
- return ERR_PTR(-ENOENT);
- return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+ return btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
}
struct btrfs_dir_item *
@@ -345,21 +350,18 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
const char *name, u16 name_len,
int mod)
{
- int ret;
struct btrfs_key key;
- int ins_len = mod < 0 ? -1 : 0;
- int cow = mod != 0;
+ struct btrfs_dir_item *di;
key.objectid = dir;
key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
- ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret > 0)
+
+ di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+ if (IS_ERR(di) && PTR_ERR(di) == -ENOENT)
return NULL;
- return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+ return di;
}
/*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a59ab7b9aea0..2f9515dccce0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3392,11 +3392,16 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_alloc;
}
- /* For 4K sector size support, it's only read-only */
- if (PAGE_SIZE == SZ_64K && sectorsize == SZ_4K) {
- if (!sb_rdonly(sb) || btrfs_super_log_root(disk_super)) {
+ if (sectorsize != PAGE_SIZE) {
+ btrfs_warn(fs_info,
+ "read-write for sector size %u with page size %lu is experimental",
+ sectorsize, PAGE_SIZE);
+ }
+ if (sectorsize != PAGE_SIZE) {
+ if (btrfs_super_incompat_flags(fs_info->super_copy) &
+ BTRFS_FEATURE_INCOMPAT_RAID56) {
btrfs_err(fs_info,
- "subpage sectorsize %u only supported read-only for page size %lu",
+ "RAID56 is not yet supported for sector size %u with page size %lu",
sectorsize, PAGE_SIZE);
err = -EINVAL;
goto fail_alloc;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 268ce58d4569..fc3da7585fb7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -153,7 +153,7 @@ search_again:
else
key.type = BTRFS_EXTENT_ITEM_KEY;
- ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
if (ret < 0)
goto out_free;
@@ -5950,9 +5950,9 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
*/
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_block_group *cache = NULL;
struct btrfs_device *device;
- struct list_head *devices;
u64 group_trimmed;
u64 range_end = U64_MAX;
u64 start;
@@ -6016,9 +6016,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
btrfs_warn(fs_info,
"failed to trim %llu block group(s), last error %d",
bg_failed, bg_ret);
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
- devices = &fs_info->fs_devices->devices;
- list_for_each_entry(device, devices, dev_list) {
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
continue;
@@ -6031,7 +6031,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
trimmed += group_trimmed;
}
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_devices->device_list_mutex);
if (dev_failed)
btrfs_warn(fs_info,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 9e81d25dea70..aaddd7225348 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -13,6 +13,7 @@
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/cleancache.h>
+#include <linux/fsverity.h>
#include "misc.h"
#include "extent_io.h"
#include "extent-io-tree.h"
@@ -172,6 +173,8 @@ int __must_check submit_one_bio(struct bio *bio, int mirror_num,
bio->bi_private = NULL;
+ /* Caller should ensure the bio has at least some range added */
+ ASSERT(bio->bi_iter.bi_size);
if (is_data_inode(tree->private_data))
ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
bio_flags);
@@ -2245,18 +2248,6 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
return bitset;
}
-/*
- * helper function to set a given page up to date if all the
- * extents in the tree for that page are up to date
- */
-static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
-{
- u64 start = page_offset(page);
- u64 end = start + PAGE_SIZE - 1;
- if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
- SetPageUptodate(page);
-}
-
int free_io_failure(struct extent_io_tree *failure_tree,
struct extent_io_tree *io_tree,
struct io_failure_record *rec)
@@ -2688,7 +2679,15 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
start + len <= page_offset(page) + PAGE_SIZE);
if (uptodate) {
- btrfs_page_set_uptodate(fs_info, page, start, len);
+ if (fsverity_active(page->mapping->host) &&
+ !PageError(page) &&
+ !PageUptodate(page) &&
+ start < i_size_read(page->mapping->host) &&
+ !fsverity_verify_page(page)) {
+ btrfs_page_set_error(fs_info, page, start, len);
+ } else {
+ btrfs_page_set_uptodate(fs_info, page, start, len);
+ }
} else {
btrfs_page_clear_uptodate(fs_info, page, start, len);
btrfs_page_set_error(fs_info, page, start, len);
@@ -2779,7 +2778,7 @@ next:
void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
{
struct btrfs_inode *inode;
- int uptodate = (err == 0);
+ const bool uptodate = (err == 0);
int ret = 0;
ASSERT(page && page->mapping);
@@ -2787,8 +2786,14 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate);
if (!uptodate) {
- ClearPageUptodate(page);
- SetPageError(page);
+ const struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ u32 len;
+
+ ASSERT(end + 1 - start <= U32_MAX);
+ len = end + 1 - start;
+
+ btrfs_page_clear_uptodate(fs_info, page, start, len);
+ btrfs_page_set_error(fs_info, page, start, len);
ret = err < 0 ? err : -EIO;
mapping_set_error(page->mapping, ret);
}
@@ -3097,7 +3102,7 @@ readpage_ok:
/* Update page status and unlock */
end_page_read(page, uptodate, start, len);
endio_readpage_release_extent(&processed, BTRFS_I(inode),
- start, end, uptodate);
+ start, end, PageUptodate(page));
}
/* Release the last extent */
endio_readpage_release_extent(&processed, NULL, 0, 0, false);
@@ -3153,11 +3158,13 @@ struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
return bio;
}
-struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
+struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
{
struct bio *bio;
struct btrfs_io_bio *btrfs_bio;
+ ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
+
/* this will never fail when it's backed by a bioset */
bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
ASSERT(bio);
@@ -3181,20 +3188,22 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
* @size: portion of page that we want to write
* @prev_bio_flags: flags of previous bio to see if we can merge the current one
* @bio_flags: flags of the current bio to see if we can merge them
- * @return: true if page was added, false otherwise
*
* Attempt to add a page to bio considering stripe alignment etc.
*
- * Return true if successfully page added. Otherwise, return false.
+ * Return >= 0 for the number of bytes added to the bio.
+ * Can return 0 if the current bio is already at stripe/zone boundary.
+ * Return <0 for error.
*/
-static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
- struct page *page,
- u64 disk_bytenr, unsigned int size,
- unsigned int pg_offset,
- unsigned long bio_flags)
+static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
+ struct page *page,
+ u64 disk_bytenr, unsigned int size,
+ unsigned int pg_offset,
+ unsigned long bio_flags)
{
struct bio *bio = bio_ctrl->bio;
u32 bio_size = bio->bi_iter.bi_size;
+ u32 real_size;
const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
bool contig;
int ret;
@@ -3203,29 +3212,36 @@ static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
/* The limit should be calculated when bio_ctrl->bio is allocated */
ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary);
if (bio_ctrl->bio_flags != bio_flags)
- return false;
+ return 0;
if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED)
contig = bio->bi_iter.bi_sector == sector;
else
contig = bio_end_sector(bio) == sector;
if (!contig)
- return false;
+ return 0;
- if (bio_size + size > bio_ctrl->len_to_oe_boundary ||
- bio_size + size > bio_ctrl->len_to_stripe_boundary)
- return false;
+ real_size = min(bio_ctrl->len_to_oe_boundary,
+ bio_ctrl->len_to_stripe_boundary) - bio_size;
+ real_size = min(real_size, size);
+
+ /*
+	 * If real_size is 0, never call bio_add_*_page(), as even with a size
+	 * of 0 the bio would still execute its endio function on the page!
+ */
+ if (real_size == 0)
+ return 0;
if (bio_op(bio) == REQ_OP_ZONE_APPEND)
- ret = bio_add_zone_append_page(bio, page, size, pg_offset);
+ ret = bio_add_zone_append_page(bio, page, real_size, pg_offset);
else
- ret = bio_add_page(bio, page, size, pg_offset);
+ ret = bio_add_page(bio, page, real_size, pg_offset);
- return ret == size;
+ return ret;
}
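A worked example of the clamping above, with illustrative numbers: if len_to_stripe_boundary is 64K, len_to_oe_boundary is 48K, the bio already contains 44K and the caller asks to add 16K, then real_size = min(48K, 64K) - 44K = 4K, clamped again to min(4K, 16K) = 4K, so only 4K is added; the caller sees the partial add, submits the bio and retries the remaining 12K with a new bio.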
static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
- struct btrfs_inode *inode)
+ struct btrfs_inode *inode, u64 file_offset)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_io_geometry geom;
@@ -3266,9 +3282,8 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
return 0;
}
- ASSERT(fs_info->max_zone_append_size > 0);
/* Ordered extent not yet created, so we're good */
- ordered = btrfs_lookup_ordered_extent(inode, logical);
+ ordered = btrfs_lookup_ordered_extent(inode, file_offset);
if (!ordered) {
bio_ctrl->len_to_oe_boundary = U32_MAX;
return 0;
@@ -3280,6 +3295,62 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
return 0;
}
+static int alloc_new_bio(struct btrfs_inode *inode,
+ struct btrfs_bio_ctrl *bio_ctrl,
+ struct writeback_control *wbc,
+ unsigned int opf,
+ bio_end_io_t end_io_func,
+ u64 disk_bytenr, u32 offset, u64 file_offset,
+ unsigned long bio_flags)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct bio *bio;
+ int ret;
+
+ /*
+	 * For a compressed page range, its disk_bytenr is always the @disk_bytenr
+	 * passed in, no matter whether we have added any range to a previous bio.
+ */
+ if (bio_flags & EXTENT_BIO_COMPRESSED)
+ bio = btrfs_bio_alloc(disk_bytenr);
+ else
+ bio = btrfs_bio_alloc(disk_bytenr + offset);
+ bio_ctrl->bio = bio;
+ bio_ctrl->bio_flags = bio_flags;
+ bio->bi_end_io = end_io_func;
+ bio->bi_private = &inode->io_tree;
+ bio->bi_write_hint = inode->vfs_inode.i_write_hint;
+ bio->bi_opf = opf;
+ ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
+ if (ret < 0)
+ goto error;
+ if (wbc) {
+ struct block_device *bdev;
+
+ bdev = fs_info->fs_devices->latest_bdev;
+ bio_set_dev(bio, bdev);
+ wbc_init_bio(wbc, bio);
+ }
+ if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ struct btrfs_device *device;
+
+ device = btrfs_zoned_get_device(fs_info, disk_bytenr,
+ fs_info->sectorsize);
+ if (IS_ERR(device)) {
+ ret = PTR_ERR(device);
+ goto error;
+ }
+
+ btrfs_io_bio(bio)->device = device;
+ }
+ return 0;
+error:
+ bio_ctrl->bio = NULL;
+ bio->bi_status = errno_to_blk_status(ret);
+ bio_endio(bio);
+ return ret;
+}
+
/*
* @opf: bio REQ_OP_* and REQ_* flags as one value
* @wbc: optional writeback control for io accounting
@@ -3305,61 +3376,67 @@ static int submit_extent_page(unsigned int opf,
bool force_bio_submit)
{
int ret = 0;
- struct bio *bio;
- size_t io_size = min_t(size_t, size, PAGE_SIZE);
struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
- struct extent_io_tree *tree = &inode->io_tree;
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ unsigned int cur = pg_offset;
ASSERT(bio_ctrl);
ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
pg_offset + size <= PAGE_SIZE);
- if (bio_ctrl->bio) {
- bio = bio_ctrl->bio;
- if (force_bio_submit ||
- !btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, io_size,
- pg_offset, bio_flags)) {
- ret = submit_one_bio(bio, mirror_num, bio_ctrl->bio_flags);
+ if (force_bio_submit && bio_ctrl->bio) {
+ ret = submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->bio_flags);
+ bio_ctrl->bio = NULL;
+ if (ret < 0)
+ return ret;
+ }
+
+ while (cur < pg_offset + size) {
+ u32 offset = cur - pg_offset;
+ int added;
+
+ /* Allocate new bio if needed */
+ if (!bio_ctrl->bio) {
+ ret = alloc_new_bio(inode, bio_ctrl, wbc, opf,
+ end_io_func, disk_bytenr, offset,
+ page_offset(page) + cur,
+ bio_flags);
+ if (ret < 0)
+ return ret;
+ }
+ /*
+		 * We must go through btrfs_bio_add_page() to ensure each
+		 * page range won't cross stripe or ordered extent boundaries.
+ */
+ if (bio_flags & EXTENT_BIO_COMPRESSED)
+ added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr,
+ size - offset, pg_offset + offset,
+ bio_flags);
+ else
+ added = btrfs_bio_add_page(bio_ctrl, page,
+ disk_bytenr + offset, size - offset,
+ pg_offset + offset, bio_flags);
+
+ /* Metadata page range should never be split */
+ if (!is_data_inode(&inode->vfs_inode))
+ ASSERT(added == 0 || added == size - offset);
+
+		/* We added at least part of the page, update the accounting */
+ if (wbc && added)
+ wbc_account_cgroup_owner(wbc, page, added);
+
+		/* We have reached a boundary, submit right now */
+ if (added < size - offset) {
+ /* The bio should contain some page(s) */
+ ASSERT(bio_ctrl->bio->bi_iter.bi_size);
+ ret = submit_one_bio(bio_ctrl->bio, mirror_num,
+ bio_ctrl->bio_flags);
bio_ctrl->bio = NULL;
if (ret < 0)
return ret;
- } else {
- if (wbc)
- wbc_account_cgroup_owner(wbc, page, io_size);
- return 0;
}
+ cur += added;
}
-
- bio = btrfs_bio_alloc(disk_bytenr);
- bio_add_page(bio, page, io_size, pg_offset);
- bio->bi_end_io = end_io_func;
- bio->bi_private = tree;
- bio->bi_write_hint = page->mapping->host->i_write_hint;
- bio->bi_opf = opf;
- if (wbc) {
- struct block_device *bdev;
-
- bdev = fs_info->fs_devices->latest_bdev;
- bio_set_dev(bio, bdev);
- wbc_init_bio(wbc, bio);
- wbc_account_cgroup_owner(wbc, page, io_size);
- }
- if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) {
- struct btrfs_device *device;
-
- device = btrfs_zoned_get_device(fs_info, disk_bytenr, io_size);
- if (IS_ERR(device))
- return PTR_ERR(device);
-
- btrfs_io_bio(bio)->device = device;
- }
-
- bio_ctrl->bio = bio;
- bio_ctrl->bio_flags = bio_flags;
- ret = calc_bio_boundaries(bio_ctrl, inode);
-
- return ret;
+ return 0;
}
static int attach_extent_buffer_page(struct extent_buffer *eb,
@@ -3488,7 +3565,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
size_t pg_offset = 0;
size_t iosize;
size_t blocksize = inode->i_sb->s_blocksize;
- unsigned long this_bio_flag = 0;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
ret = set_page_extent_mapped(page);
@@ -3519,6 +3595,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
}
begin_page_read(fs_info, page);
while (cur <= end) {
+ unsigned long this_bio_flag = 0;
bool force_bio_submit = false;
u64 disk_bytenr;
@@ -3627,7 +3704,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
/* the get_extent function already copied into the page */
if (test_range_bit(tree, cur, cur_end,
EXTENT_UPTODATE, 1, NULL)) {
- check_page_uptodate(tree, page);
unlock_extent(tree, cur, cur + iosize - 1);
end_page_read(page, true, cur, iosize);
cur = cur + iosize;
@@ -3722,14 +3798,9 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
delalloc_end, &page_started, nr_written, wbc);
if (ret) {
- SetPageError(page);
- /*
- * btrfs_run_delalloc_range should return < 0 for error
- * but just in case, we use > 0 here meaning the IO is
- * started, so we don't want to return > 0 unless
- * things are going well.
- */
- return ret < 0 ? ret : -EIO;
+ btrfs_page_set_error(inode->root->fs_info, page,
+ page_offset(page), PAGE_SIZE);
+ return ret;
}
/*
* delalloc_end is already one less than the total length, so
@@ -3829,9 +3900,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
int *nr_ret)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- u64 start = page_offset(page);
- u64 end = start + PAGE_SIZE - 1;
- u64 cur = start;
+ u64 cur = page_offset(page);
+ u64 end = cur + PAGE_SIZE - 1;
u64 extent_offset;
u64 block_start;
struct extent_map *em;
@@ -3841,7 +3911,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
const unsigned int write_flags = wbc_to_write_flags(wbc);
bool compressed;
- ret = btrfs_writepage_cow_fixup(page, start, end);
+ ret = btrfs_writepage_cow_fixup(page);
if (ret) {
/* Fixup worker will requeue */
redirty_page_for_writepage(wbc, page);
@@ -3865,7 +3935,16 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
if (cur >= i_size) {
btrfs_writepage_endio_finish_ordered(inode, page, cur,
- end, 1);
+ end, true);
+ /*
+ * This range is beyond i_size, thus we don't need to
+ * bother writing back.
+ * But we still need to clear the dirty subpage bit, or
+			 * the next time the page gets dirtied, we will try to
+			 * write back the sectors with subpage dirty bits,
+			 * causing writeback without an ordered extent.
+ */
+ btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur);
break;
}
@@ -3915,7 +3994,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
nr++;
else
btrfs_writepage_endio_finish_ordered(inode,
- page, cur, cur + iosize - 1, 1);
+ page, cur, cur + iosize - 1, true);
+ btrfs_page_clear_dirty(fs_info, page, cur, iosize);
cur += iosize;
continue;
}
@@ -3951,6 +4031,12 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
cur += iosize;
nr++;
}
+ /*
+	 * If we finish without a problem, we should not only clear the page
+	 * dirty flag, but also empty the subpage dirty bits.
+ */
+ if (!ret)
+ btrfs_page_assert_not_dirty(fs_info, page);
*nr_ret = nr;
return ret;
}
@@ -3981,7 +4067,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
WARN_ON(!PageLocked(page));
- ClearPageError(page);
+ btrfs_page_clear_error(btrfs_sb(inode->i_sb), page,
+ page_offset(page), PAGE_SIZE);
pg_offset = offset_in_page(i_size);
if (page->index > end_index ||
@@ -4022,10 +4109,39 @@ done:
set_page_writeback(page);
end_page_writeback(page);
}
- if (PageError(page)) {
- ret = ret < 0 ? ret : -EIO;
+ /*
+ * Here we used to have a check for PageError() and then set @ret and
+ * call end_extent_writepage().
+ *
+ * But in fact setting @ret here will cause different error paths
+ * between subpage and regular sectorsize.
+ *
+	 * For regular page size, we never submit the current page, we only add
+	 * it to the current bio.
+	 * The bio submission can only happen while processing the next page.
+	 * Thus if we hit the PageError() branch, @ret is already set to a
+	 * non-zero value and will not get updated for regular sectorsize.
+	 *
+	 * But for the subpage case, it's possible we submit part of the current
+	 * page, and thus can get PageError() set by a submitted bio of the same
+	 * page, while our @ret is still 0.
+	 *
+	 * So here we unify the behavior and don't set @ret.
+	 * The error can still be properly passed to the higher layer, as the
+	 * page error bit will be set; here we just don't handle the IO failure.
+ *
+ * NOTE: This is just a hotfix for subpage.
+	 * The proper fix is to end the ordered extent when we hit an error
+	 * during writeback.
+ *
+ * But that needs a bigger refactoring, as we not only need to grab the
+ * submitted OE, but also need to know exactly at which bytenr we hit
+ * the error.
+ * Currently the full page based __extent_writepage_io() is not
+ * capable of that.
+ */
+ if (PageError(page))
end_extent_writepage(page, ret, start, page_end);
- }
unlock_page(page);
ASSERT(ret <= 0);
return ret;
@@ -4984,7 +5100,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
ret = __extent_writepage(page, &wbc_writepages, &epd);
else {
btrfs_writepage_endio_finish_ordered(BTRFS_I(inode),
- page, start, start + PAGE_SIZE - 1, 1);
+ page, start, start + PAGE_SIZE - 1, true);
unlock_page(page);
}
put_page(page);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 62027f551b44..53abdc280451 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -280,7 +280,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct bio *btrfs_bio_alloc(u64 first_byte);
struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
struct bio *btrfs_bio_clone(struct bio *bio);
-struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size);
+struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);
int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
u64 length, u64 logical, struct page *page,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index df6631eefc65..2673c6ba7a4e 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -233,7 +233,6 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_path *path, u64 objectid,
u64 offset, int mod)
{
- int ret;
struct btrfs_key file_key;
int ins_len = mod < 0 ? -1 : 0;
int cow = mod != 0;
@@ -241,8 +240,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
file_key.objectid = objectid;
file_key.offset = offset;
file_key.type = BTRFS_EXTENT_DATA_KEY;
- ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
- return ret;
+
+ return btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
}
/*
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ee34497500e1..7ff577005d0f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -16,6 +16,7 @@
#include <linux/btrfs.h>
#include <linux/uio.h>
#include <linux/iversion.h>
+#include <linux/fsverity.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -1340,7 +1341,18 @@ static int prepare_uptodate_page(struct inode *inode,
unlock_page(page);
return -EIO;
}
- if (page->mapping != inode->i_mapping) {
+
+ /*
+ * Since btrfs_readpage() will unlock the page before it
+ * returns, there is a window where btrfs_releasepage() can be
+ * called to release the page. Here we check both inode
+ * mapping and PagePrivate() to make sure the page was not
+ * released.
+ *
+	 * The private flag check is essential for subpage as we need
+	 * to store an extra bitmap in page->private.
+ */
+ if (page->mapping != inode->i_mapping || !PagePrivate(page)) {
unlock_page(page);
return -EAGAIN;
}
@@ -3604,7 +3616,13 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
static int btrfs_file_open(struct inode *inode, struct file *filp)
{
+ int ret;
+
filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
+
+ ret = fsverity_file_open(inode, filp);
+ if (ret)
+ return ret;
return generic_file_open(inode, filp);
}
@@ -3633,6 +3651,9 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
+ if (fsverity_active(inode))
+ return 0;
+
if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
return 0;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 2131ae5b9ed7..da0eee7c9e5f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -344,19 +344,13 @@ fail:
static void readahead_cache(struct inode *inode)
{
- struct file_ra_state *ra;
+ struct file_ra_state ra;
unsigned long last_index;
- ra = kzalloc(sizeof(*ra), GFP_NOFS);
- if (!ra)
- return;
-
- file_ra_state_init(ra, inode->i_mapping);
+ file_ra_state_init(&ra, inode->i_mapping);
last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
- page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
-
- kfree(ra);
+ page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index);
}
static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
@@ -2544,6 +2538,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
u64 offset = bytenr - block_group->start;
u64 to_free, to_unusable;
+ const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold);
spin_lock(&ctl->tree_lock);
if (!used)
@@ -2573,9 +2568,9 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
/* All the region is now unusable. Mark it as unused and reclaim */
if (block_group->zone_unusable == block_group->length) {
btrfs_mark_bg_unused(block_group);
- } else if (block_group->zone_unusable >=
- div_factor_fine(block_group->length,
- fs_info->bg_reclaim_threshold)) {
+ } else if (bg_reclaim_threshold &&
+ block_group->zone_unusable >=
+ div_factor_fine(block_group->length, bg_reclaim_threshold)) {
btrfs_mark_bg_to_reclaim(block_group);
}
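A worked example of the threshold above, with illustrative numbers: with bg_reclaim_threshold set to 75 and a 256M zoned block group, div_factor_fine(256M, 75) is 192M, so the group is queued for reclaim once at least 192M of it has become unusable; since the value is now snapshotted with READ_ONCE() and checked for zero, setting the threshold to 0 disables this reclaim path entirely.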
@@ -2652,8 +2647,11 @@ int btrfs_remove_free_space(struct btrfs_block_group *block_group,
* btrfs_pin_extent_for_log_replay() when replaying the log.
* Advance the pointer not to overwrite the tree-log nodes.
*/
- if (block_group->alloc_offset < offset + bytes)
- block_group->alloc_offset = offset + bytes;
+ if (block_group->start + block_group->alloc_offset <
+ offset + bytes) {
+ block_group->alloc_offset =
+ offset + bytes - block_group->start;
+ }
return 0;
}
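A worked example of the comparison fixed above, with illustrative numbers: for a block group starting at bytenr 1G and a tree-log extent pinned at offset 1G + 64K for 16K, @offset is an absolute bytenr while alloc_offset is relative to the block group start, so the correct update is alloc_offset = 1G + 64K + 16K - 1G = 80K; the old code compared the relative alloc_offset directly against the absolute offset + bytes and would have stored an absolute value.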
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 06f9f167222b..487533c35ddb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -32,6 +32,7 @@
#include <linux/sched/mm.h>
#include <linux/iomap.h>
#include <asm/unaligned.h>
+#include <linux/fsverity.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
@@ -286,9 +287,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
cur_size = min_t(unsigned long, compressed_size,
PAGE_SIZE);
- kaddr = kmap_atomic(cpage);
+ kaddr = page_address(cpage);
write_extent_buffer(leaf, kaddr, ptr, cur_size);
- kunmap_atomic(kaddr);
i++;
ptr += cur_size;
@@ -490,6 +490,9 @@ static noinline int add_async_extent(struct async_chunk *cow,
*/
static inline bool inode_can_compress(struct btrfs_inode *inode)
{
+ /* Subpage doesn't support compression yet */
+ if (inode->root->fs_info->sectorsize < PAGE_SIZE)
+ return false;
if (inode->flags & BTRFS_INODE_NODATACOW ||
inode->flags & BTRFS_INODE_NODATASUM)
return false;
@@ -629,7 +632,7 @@ again:
* inode has not been flagged as nocompress. This flag can
* change at any time if we discover bad compression ratios.
*/
- if (nr_pages > 1 && inode_need_compress(BTRFS_I(inode), start, end)) {
+ if (inode_need_compress(BTRFS_I(inode), start, end)) {
WARN_ON(pages);
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
@@ -682,7 +685,11 @@ again:
}
}
cont:
- if (start == 0) {
+ /*
+	 * Check cow_file_range() for why we don't even try to create an inline
+	 * extent for the subpage case.
+ */
+ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
/* lets try to make an inline extent */
if (ret || total_in < actual_end) {
/* we didn't compress the entire range, try
@@ -973,7 +980,7 @@ retry:
p->mapping = inode->vfs_inode.i_mapping;
btrfs_writepage_endio_finish_ordered(inode, p, start,
- end, 0);
+ end, false);
p->mapping = NULL;
extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
@@ -1080,7 +1087,17 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
- if (start == 0) {
+ /*
+	 * Due to the page size limit, for subpage we can only trigger
+	 * writeback for the dirty sectors of the page, which means data writeback
+ * is doing more writeback than what we want.
+ *
+ * This is especially unexpected for some call sites like fallocate,
+ * where we only increase i_size after everything is done.
+ * This means we can trigger inline extent even if we didn't want to.
+ * So here we skip inline extent creation completely.
+ */
+ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
/* lets try to make an inline extent */
ret = cow_file_range_inline(inode, start, end, 0,
BTRFS_COMPRESS_NONE, NULL);
@@ -1290,11 +1307,6 @@ static noinline void async_cow_submit(struct btrfs_work *work)
nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
PAGE_SHIFT;
- /* atomic_sub_return implies a barrier */
- if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
- 5 * SZ_1M)
- cond_wake_up_nomb(&fs_info->async_submit_wait);
-
/*
* ->inode could be NULL if async_chunk_start has failed to compress,
* in which case we don't have anything to submit, yet we need to
@@ -1303,6 +1315,11 @@ static noinline void async_cow_submit(struct btrfs_work *work)
*/
if (async_chunk->inode)
submit_compressed_extents(async_chunk);
+
+ /* atomic_sub_return implies a barrier */
+ if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
+ 5 * SZ_1M)
+ cond_wake_up_nomb(&fs_info->async_submit_wait);
}
static noinline void async_cow_free(struct btrfs_work *work)
@@ -1946,6 +1963,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
ret = cow_file_range_async(inode, wbc, locked_page, start, end,
page_started, nr_written);
}
+ ASSERT(ret <= 0);
if (ret)
btrfs_cleanup_ordered_extents(inode, locked_page, start,
end - start + 1);
@@ -2285,7 +2303,6 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
struct extent_map *split_mid = NULL;
struct extent_map *split_post = NULL;
int ret = 0;
- int modified;
unsigned long flags;
/* Sanity check */
@@ -2315,11 +2332,12 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
ASSERT(em->len == len);
ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
+ ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
+ ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
+ ASSERT(!list_empty(&em->list));
flags = em->flags;
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
- clear_bit(EXTENT_FLAG_LOGGING, &flags);
- modified = !list_empty(&em->list);
/* First, replace the em with a new extent_map starting from * em->start */
split_pre->start = em->start;
@@ -2333,7 +2351,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
split_pre->compress_type = em->compress_type;
split_pre->generation = em->generation;
- replace_extent_mapping(em_tree, em, split_pre, modified);
+ replace_extent_mapping(em_tree, em, split_pre, 1);
/*
* Now we only have an extent_map at:
@@ -2353,7 +2371,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
split_mid->flags = flags;
split_mid->compress_type = em->compress_type;
split_mid->generation = em->generation;
- add_extent_mapping(em_tree, split_mid, modified);
+ add_extent_mapping(em_tree, split_mid, 1);
}
if (post) {
@@ -2367,7 +2385,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
split_post->flags = flags;
split_post->compress_type = em->compress_type;
split_post->generation = em->generation;
- add_extent_mapping(em_tree, split_post, modified);
+ add_extent_mapping(em_tree, split_post, 1);
}
/* Once for us */
@@ -2770,7 +2788,7 @@ out_page:
* to fix it up. The async helper will wait for ordered extents, set
* the delalloc bit and make it safe to write the page.
*/
-int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
+int btrfs_writepage_cow_fixup(struct page *page)
{
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -3171,7 +3189,7 @@ static void finish_ordered_fn(struct btrfs_work *work)
void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
struct page *page, u64 start,
- u64 end, int uptodate)
+ u64 end, bool uptodate)
{
trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
@@ -3257,25 +3275,44 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
return 0;
}
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+ /*
+	 * For the subpage case, the above PageChecked flag is not safe as it is
+	 * not subpage compatible.
+	 * But for now only the cow fixup and compressed read paths use the
+	 * PageChecked flag, while in this context we can easily use
+	 * io_bio->csum to determine if we really need to do csum verification.
+	 *
+	 * So for now, just exit if io_bio->csum is NULL, as it means this is a
+	 * compressed read and its compressed data csum has already been
+	 * verified.
+ */
+ if (io_bio->csum == NULL)
return 0;
- if (!root->fs_info->csum_root)
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
return 0;
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
- test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
- clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
+ if (!root->fs_info->csum_root)
return 0;
- }
ASSERT(page_offset(page) <= start &&
end <= page_offset(page) + PAGE_SIZE - 1);
for (pg_off = offset_in_page(start);
pg_off < offset_in_page(end);
pg_off += sectorsize, bio_offset += sectorsize) {
+ u64 file_offset = pg_off + page_offset(page);
int ret;
+ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
+ test_range_bit(io_tree, file_offset,
+ file_offset + sectorsize - 1,
+ EXTENT_NODATASUM, 1, NULL)) {
+ /* Skip the range without csum for data reloc inode */
+ clear_extent_bits(io_tree, file_offset,
+ file_offset + sectorsize - 1,
+ EXTENT_NODATASUM);
+ continue;
+ }
ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
page_offset(page) + pg_off);
if (ret < 0) {
@@ -3520,7 +3557,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
/*
* If we have an inode with links, there are a couple of
- * possibilities. Old kernels (before v3.12) used to create an
+ * possibilities:
+ *
+ * 1. We were halfway through creating fsverity metadata for the
+ * file. In that case, the orphan item represents incomplete
+ * fsverity metadata which must be cleaned up with
+ * btrfs_drop_verity_items and deleting the orphan item.
+ *
+ * 2. Old kernels (before v3.12) used to create an
* orphan item for truncate indicating that there were possibly
* extent items past i_size that needed to be deleted. In v3.12,
* truncate was changed to update i_size in sync with the extent
@@ -3538,8 +3582,12 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
* but either way, we can delete the orphan item.
*/
if (ret == -ENOENT || inode->i_nlink) {
- if (!ret)
+ if (!ret) {
+ ret = btrfs_drop_verity_items(BTRFS_I(inode));
iput(inode);
+ if (ret)
+ goto out;
+ }
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -3728,7 +3776,8 @@ static int btrfs_read_locked_inode(struct inode *inode,
rdev = btrfs_inode_rdev(leaf, inode_item);
BTRFS_I(inode)->index_cnt = (u64)-1;
- BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
+ btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
+ &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
cache_index:
/*
@@ -3859,6 +3908,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
struct inode *inode)
{
struct btrfs_map_token token;
+ u64 flags;
btrfs_init_map_token(&token, leaf);
@@ -3894,7 +3944,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
btrfs_set_token_inode_transid(&token, item, trans->transid);
btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
- btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
+ flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
+ BTRFS_I(inode)->ro_flags);
+ btrfs_set_token_inode_flags(&token, item, flags);
btrfs_set_token_inode_block_group(&token, item, 0);
}
@@ -5088,15 +5140,13 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
int ret;
/*
- * Still need to make sure the inode looks like it's been updated so
- * that any holes get logged if we fsync.
+ * If NO_HOLES is enabled, we don't need to do anything.
+ * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
+ * or btrfs_update_inode() will be called, which guarantee that the next
+ * fsync will know this inode was changed and needs to be logged.
*/
- if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
- inode->last_trans = fs_info->generation;
- inode->last_sub_trans = root->log_transid;
- inode->last_log_commit = root->last_log_commit;
+ if (btrfs_fs_incompat(fs_info, NO_HOLES))
return 0;
- }
/*
* 1 - for the one we're dropping
@@ -5342,7 +5392,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
if (btrfs_root_readonly(root))
return -EROFS;
- err = setattr_prepare(&init_user_ns, dentry, attr);
+ err = setattr_prepare(mnt_userns, dentry, attr);
if (err)
return err;
@@ -5353,13 +5403,12 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
}
if (attr->ia_valid) {
- setattr_copy(&init_user_ns, inode, attr);
+ setattr_copy(mnt_userns, inode, attr);
inode_inc_iversion(inode);
err = btrfs_dirty_inode(inode);
if (!err && attr->ia_valid & ATTR_MODE)
- err = posix_acl_chmod(&init_user_ns, inode,
- inode->i_mode);
+ err = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
}
return err;
@@ -5522,6 +5571,7 @@ void btrfs_evict_inode(struct inode *inode)
trace_btrfs_inode_evict(inode);
if (!root) {
+ fsverity_cleanup_inode(inode);
clear_inode(inode);
return;
}
@@ -5604,6 +5654,7 @@ no_delete:
* to retry these periodically in the future.
*/
btrfs_remove_delayed_node(BTRFS_I(inode));
+ fsverity_cleanup_inode(inode);
clear_inode(inode);
}
@@ -6370,6 +6421,7 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
+ struct user_namespace *mnt_userns,
struct inode *dir,
const char *name, int name_len,
u64 ref_objectid, u64 objectid,
@@ -6479,7 +6531,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
if (ret != 0)
goto fail_unlock;
- inode_init_owner(&init_user_ns, inode, dir, mode);
+ inode_init_owner(mnt_userns, inode, dir, mode);
inode_set_bytes(inode, 0);
inode->i_mtime = current_time(inode);
@@ -6664,9 +6716,9 @@ static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
goto out_unlock;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
- mode, &index);
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+ dentry->d_name.name, dentry->d_name.len,
+ btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
inode = NULL;
@@ -6728,9 +6780,9 @@ static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
goto out_unlock;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
- mode, &index);
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+ dentry->d_name.name, dentry->d_name.len,
+ btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
inode = NULL;
@@ -6873,8 +6925,9 @@ static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
goto out_fail;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+ dentry->d_name.name, dentry->d_name.len,
+ btrfs_ino(BTRFS_I(dir)), objectid,
S_IFDIR | mode, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
@@ -8194,9 +8247,10 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
return dip;
}
-static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
+static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
struct bio *dio_bio, loff_t file_offset)
{
+ struct inode *inode = iter->inode;
const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
@@ -8206,13 +8260,13 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
u64 start_sector;
int async_submit = 0;
u64 submit_len;
- int clone_offset = 0;
- int clone_len;
+ u64 clone_offset = 0;
+ u64 clone_len;
u64 logical;
int ret;
blk_status_t status;
struct btrfs_io_geometry geom;
- struct btrfs_dio_data *dio_data = iomap->private;
+ struct btrfs_dio_data *dio_data = iter->iomap.private;
struct extent_map *em = NULL;
dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
@@ -8255,9 +8309,9 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
status = errno_to_blk_status(ret);
goto out_err_em;
}
- ASSERT(geom.len <= INT_MAX);
- clone_len = min_t(int, submit_len, geom.len);
+ clone_len = min(submit_len, geom.len);
+ ASSERT(clone_len <= UINT_MAX);
/*
* This will never fail as it's passing GPF_NOFS and
@@ -8401,11 +8455,47 @@ static void btrfs_readahead(struct readahead_control *rac)
extent_readahead(rac);
}
+/*
+ * For releasepage() and invalidatepage() we have a race window where
+ * end_page_writeback() is called but the subpage spinlock is not yet released.
+ * If we continue to release/invalidate the page, we could cause a
+ * use-after-free on the subpage spinlock. So this function spins and
+ * waits until the subpage spinlock is released.
+ */
+static void wait_subpage_spinlock(struct page *page)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct btrfs_subpage *subpage;
+
+ if (fs_info->sectorsize == PAGE_SIZE)
+ return;
+
+ ASSERT(PagePrivate(page) && page->private);
+ subpage = (struct btrfs_subpage *)page->private;
+
+ /*
+ * This may look insane as we just acquire the spinlock and release it,
+ * without doing anything. But we just want to make sure no one is
+ * still holding the subpage spinlock.
+ * Since the page is neither dirty nor under writeback, and we hold the
+ * page lock, the only remaining holder of the spinlock is the endio
+ * function clearing page writeback.
+ *
+ * Here we just acquire the spinlock so that all existing callers
+ * should exit and we're safe to release/invalidate the page.
+ */
+ spin_lock_irq(&subpage->lock);
+ spin_unlock_irq(&subpage->lock);
+}
+
static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
int ret = try_release_extent_mapping(page, gfp_flags);
- if (ret == 1)
+
+ if (ret == 1) {
+ wait_subpage_spinlock(page);
clear_page_extent_mapped(page);
+ }
return ret;
}
@@ -8469,6 +8559,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
* do double ordered extent accounting on the same page.
*/
wait_on_page_writeback(page);
+ wait_subpage_spinlock(page);
/*
* For subpage case, we have call sites like
@@ -8557,7 +8648,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
spin_unlock_irq(&inode->ordered_tree.lock);
if (btrfs_dec_test_ordered_pending(inode, &ordered,
- cur, range_end + 1 - cur, 1)) {
+ cur, range_end + 1 - cur)) {
btrfs_finish_ordered_io(ordered);
/*
* The ordered extent has finished, now we're again
@@ -8938,7 +9029,8 @@ out:
*/
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root,
- struct btrfs_root *parent_root)
+ struct btrfs_root *parent_root,
+ struct user_namespace *mnt_userns)
{
struct inode *inode;
int err;
@@ -8949,7 +9041,8 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
if (err < 0)
return err;
- inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, ino, ino,
+ inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2,
+ ino, ino,
S_IFDIR | (~current_umask() & S_IRWXUGO),
&index);
if (IS_ERR(inode))
@@ -8993,6 +9086,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->defrag_bytes = 0;
ei->disk_i_size = 0;
ei->flags = 0;
+ ei->ro_flags = 0;
ei->csum_bytes = 0;
ei->index_cnt = (u64)-1;
ei->dir_index = 0;
@@ -9174,6 +9268,7 @@ static int btrfs_getattr(struct user_namespace *mnt_userns,
struct inode *inode = d_inode(path->dentry);
u32 blocksize = inode->i_sb->s_blocksize;
u32 bi_flags = BTRFS_I(inode)->flags;
+ u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
stat->result_mask |= STATX_BTIME;
stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
@@ -9186,13 +9281,15 @@ static int btrfs_getattr(struct user_namespace *mnt_userns,
stat->attributes |= STATX_ATTR_IMMUTABLE;
if (bi_flags & BTRFS_INODE_NODUMP)
stat->attributes |= STATX_ATTR_NODUMP;
+ if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
+ stat->attributes |= STATX_ATTR_VERITY;
stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_COMPRESSED |
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
- generic_fillattr(&init_user_ns, inode, stat);
+ generic_fillattr(mnt_userns, inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
spin_lock(&BTRFS_I(inode)->lock);
@@ -9280,8 +9377,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(root);
- root_log_pinned = true;
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
new_dentry->d_name.len,
@@ -9298,8 +9393,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(dest);
- dest_log_pinned = true;
ret = btrfs_insert_inode_ref(trans, root,
old_dentry->d_name.name,
old_dentry->d_name.len,
@@ -9330,6 +9423,29 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_inode), 1);
}
+ /*
+ * Now pin the logs of the roots. We do it to ensure that no other task
+ * can sync the logs while we are in progress with the rename, because
+ * that could result in an inconsistency in case any of the inodes that
+ * are part of this rename operation were logged before.
+ *
+ * We pin the logs even if at this precise moment none of the inodes was
+ * logged before. This is because right after we checked for that, some
+ * other task fsyncing some other inode not involved with this rename
+ * operation could log that one of our inodes exists.
+ *
+ * We don't need to pin the logs before the above calls to
+ * btrfs_insert_inode_ref(), since those don't ever need to change a log.
+ */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_pin_log_trans(root);
+ root_log_pinned = true;
+ }
+ if (new_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_pin_log_trans(dest);
+ dest_log_pinned = true;
+ }
+
/* src is a subvolume */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
@@ -9411,8 +9527,7 @@ out_fail:
if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
- (new_inode &&
- btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
+ btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))
btrfs_set_log_full_commit(trans);
if (root_log_pinned) {
@@ -9436,6 +9551,7 @@ out_notrans:
static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
+ struct user_namespace *mnt_userns,
struct inode *dir,
struct dentry *dentry)
{
@@ -9448,7 +9564,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- inode = btrfs_new_inode(trans, root, dir,
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
dentry->d_name.name,
dentry->d_name.len,
btrfs_ino(BTRFS_I(dir)),
@@ -9485,9 +9601,10 @@ out:
return ret;
}
-static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
+static int btrfs_rename(struct user_namespace *mnt_userns,
+ struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
struct btrfs_trans_handle *trans;
@@ -9582,8 +9699,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(root);
- log_pinned = true;
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
new_dentry->d_name.len,
@@ -9607,6 +9722,25 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else {
+ /*
+ * Now pin the log. We do it to ensure that no other task can
+ * sync the log while we are in progress with the rename, as
+ * that could result in an inconsistency in case any of the
+ * inodes that are part of this rename operation were logged
+ * before.
+ *
+ * We pin the log even if at this precise moment none of the
+ * inodes was logged before. This is because right after we
+ * checked for that, some other task fsyncing some other inode
+ * not involved with this rename operation could log that one of
+ * our inodes exists.
+ *
+ * We don't need to pin the logs before the above call to
+ * btrfs_insert_inode_ref(), since that does not need to change
+ * a log.
+ */
+ btrfs_pin_log_trans(root);
+ log_pinned = true;
ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
old_dentry->d_name.name,
@@ -9660,8 +9794,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
if (flags & RENAME_WHITEOUT) {
- ret = btrfs_whiteout_for_rename(trans, root, old_dir,
- old_dentry);
+ ret = btrfs_whiteout_for_rename(trans, root, mnt_userns,
+ old_dir, old_dentry);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -9711,7 +9845,8 @@ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_di
return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
new_dentry);
- return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
+ return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
+ new_dentry, flags);
}
struct btrfs_delalloc_work {
@@ -9808,11 +9943,7 @@ static int start_delalloc_inodes(struct btrfs_root *root,
btrfs_queue_work(root->fs_info->flush_workers,
&work->work);
} else {
- ret = sync_inode(inode, wbc);
- if (!ret &&
- test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- ret = sync_inode(inode, wbc);
+ ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
btrfs_add_delayed_iput(inode);
if (ret || wbc->nr_to_write <= 0)
goto out;
@@ -9947,9 +10078,10 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
goto out_unlock;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)),
- objectid, S_IFLNK|S_IRWXUGO, &index);
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+ dentry->d_name.name, dentry->d_name.len,
+ btrfs_ino(BTRFS_I(dir)), objectid,
+ S_IFLNK | S_IRWXUGO, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
inode = NULL;
@@ -10273,7 +10405,7 @@ static int btrfs_permission(struct user_namespace *mnt_userns,
if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
return -EACCES;
}
- return generic_permission(&init_user_ns, inode, mask);
+ return generic_permission(mnt_userns, inode, mask);
}
static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
@@ -10298,7 +10430,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
if (ret)
goto out;
- inode = btrfs_new_inode(trans, root, dir, NULL, 0,
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0,
btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
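Note on the inode.c changes above: btrfs_read_locked_inode() and fill_inode_item() now go through btrfs_inode_split_flags() and btrfs_inode_combine_flags(), two small helpers added elsewhere in this series and not shown in this diff. From the call sites they pack the user-settable runtime flags into the low 32 bits of the on-disk inode flags field and the read-only flags (such as BTRFS_INODE_RO_VERITY) into the high 32 bits; a sketch of the expected shape, inferred from how they are used here rather than copied from the actual helpers:

	static inline void btrfs_inode_split_flags(u64 inode_item_flags,
						   u32 *flags, u32 *ro_flags)
	{
		/* Low 32 bits: regular, user-settable inode flags */
		*flags = (u32)inode_item_flags;
		/* High 32 bits: read-only flags such as BTRFS_INODE_RO_VERITY */
		*ro_flags = (u32)(inode_item_flags >> 32);
	}

	static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags)
	{
		return (u64)ro_flags << 32 | flags;
	}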
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0ba98e08a029..41524f9aeac3 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -27,6 +27,7 @@
#include <linux/uaccess.h>
#include <linux/iversion.h>
#include <linux/fileattr.h>
+#include <linux/fsverity.h>
#include "ctree.h"
#include "disk-io.h"
#include "export.h"
@@ -103,9 +104,11 @@ static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
* Export internal inode flags to the format expected by the FS_IOC_GETFLAGS
* ioctl.
*/
-static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags)
+static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode)
{
unsigned int iflags = 0;
+ u32 flags = binode->flags;
+ u32 ro_flags = binode->ro_flags;
if (flags & BTRFS_INODE_SYNC)
iflags |= FS_SYNC_FL;
@@ -121,6 +124,8 @@ static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags)
iflags |= FS_DIRSYNC_FL;
if (flags & BTRFS_INODE_NODATACOW)
iflags |= FS_NOCOW_FL;
+ if (ro_flags & BTRFS_INODE_RO_VERITY)
+ iflags |= FS_VERITY_FL;
if (flags & BTRFS_INODE_NOCOMPRESS)
iflags |= FS_NOCOMP_FL;
@@ -148,10 +153,12 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode)
new_fl |= S_NOATIME;
if (binode->flags & BTRFS_INODE_DIRSYNC)
new_fl |= S_DIRSYNC;
+ if (binode->ro_flags & BTRFS_INODE_RO_VERITY)
+ new_fl |= S_VERITY;
set_mask_bits(&inode->i_flags,
- S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC,
- new_fl);
+ S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC |
+ S_VERITY, new_fl);
}
/*
@@ -200,7 +207,7 @@ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
{
struct btrfs_inode *binode = BTRFS_I(d_inode(dentry));
- fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode->flags));
+ fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode));
return 0;
}
@@ -224,7 +231,7 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns,
return -EOPNOTSUPP;
fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags);
- old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags);
+ old_fsflags = btrfs_inode_flags_to_fsflags(binode);
ret = check_fsflags(old_fsflags, fsflags);
if (ret)
return ret;
@@ -492,8 +499,8 @@ int __pure btrfs_is_empty_uuid(u8 *uuid)
return 1;
}
-static noinline int create_subvol(struct inode *dir,
- struct dentry *dentry,
+static noinline int create_subvol(struct user_namespace *mnt_userns,
+ struct inode *dir, struct dentry *dentry,
const char *name, int namelen,
struct btrfs_qgroup_inherit *inherit)
{
@@ -638,7 +645,7 @@ static noinline int create_subvol(struct inode *dir,
goto fail;
}
- ret = btrfs_create_subvol_root(trans, new_root, root);
+ ret = btrfs_create_subvol_root(trans, new_root, root, mnt_userns);
btrfs_put_root(new_root);
if (ret) {
/* We potentially lose an unused inode item here */
@@ -830,7 +837,8 @@ free_pending:
* nfs_async_unlink().
*/
-static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
+static int btrfs_may_delete(struct user_namespace *mnt_userns,
+ struct inode *dir, struct dentry *victim, int isdir)
{
int error;
@@ -840,12 +848,12 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
BUG_ON(d_inode(victim->d_parent) != dir);
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
- error = inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
+ error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
if (error)
return error;
if (IS_APPEND(dir))
return -EPERM;
- if (check_sticky(&init_user_ns, dir, d_inode(victim)) ||
+ if (check_sticky(mnt_userns, dir, d_inode(victim)) ||
IS_APPEND(d_inode(victim)) || IS_IMMUTABLE(d_inode(victim)) ||
IS_SWAPFILE(d_inode(victim)))
return -EPERM;
@@ -864,13 +872,16 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
}
/* copy of may_create in fs/namei.c() */
-static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
+static inline int btrfs_may_create(struct user_namespace *mnt_userns,
+ struct inode *dir, struct dentry *child)
{
if (d_really_is_positive(child))
return -EEXIST;
if (IS_DEADDIR(dir))
return -ENOENT;
- return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
+ if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns))
+ return -EOVERFLOW;
+ return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
}
/*
@@ -879,6 +890,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
* inside this filesystem so it's quite a bit simpler.
*/
static noinline int btrfs_mksubvol(const struct path *parent,
+ struct user_namespace *mnt_userns,
const char *name, int namelen,
struct btrfs_root *snap_src,
bool readonly,
@@ -893,12 +905,12 @@ static noinline int btrfs_mksubvol(const struct path *parent,
if (error == -EINTR)
return error;
- dentry = lookup_one_len(name, parent->dentry, namelen);
+ dentry = lookup_one(mnt_userns, name, parent->dentry, namelen);
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out_unlock;
- error = btrfs_may_create(dir, dentry);
+ error = btrfs_may_create(mnt_userns, dir, dentry);
if (error)
goto out_dput;
@@ -920,7 +932,7 @@ static noinline int btrfs_mksubvol(const struct path *parent,
if (snap_src)
error = create_snapshot(snap_src, dir, dentry, readonly, inherit);
else
- error = create_subvol(dir, dentry, name, namelen, inherit);
+ error = create_subvol(mnt_userns, dir, dentry, name, namelen, inherit);
if (!error)
fsnotify_mkdir(dir, dentry);
@@ -934,6 +946,7 @@ out_unlock:
}
static noinline int btrfs_mksnapshot(const struct path *parent,
+ struct user_namespace *mnt_userns,
const char *name, int namelen,
struct btrfs_root *root,
bool readonly,
@@ -963,7 +976,7 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
- ret = btrfs_mksubvol(parent, name, namelen,
+ ret = btrfs_mksubvol(parent, mnt_userns, name, namelen,
root, readonly, inherit);
out:
if (snapshot_force_cow)
@@ -1792,6 +1805,7 @@ out_drop:
}
static noinline int __btrfs_ioctl_snap_create(struct file *file,
+ struct user_namespace *mnt_userns,
const char *name, unsigned long fd, int subvol,
bool readonly,
struct btrfs_qgroup_inherit *inherit)
@@ -1819,8 +1833,8 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
}
if (subvol) {
- ret = btrfs_mksubvol(&file->f_path, name, namelen,
- NULL, readonly, inherit);
+ ret = btrfs_mksubvol(&file->f_path, mnt_userns, name,
+ namelen, NULL, readonly, inherit);
} else {
struct fd src = fdget(fd);
struct inode *src_inode;
@@ -1834,16 +1848,17 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
"Snapshot src from another FS");
ret = -EXDEV;
- } else if (!inode_owner_or_capable(&init_user_ns, src_inode)) {
+ } else if (!inode_owner_or_capable(mnt_userns, src_inode)) {
/*
* Subvolume creation is not restricted, but snapshots
* are limited to own subvolumes only
*/
ret = -EPERM;
} else {
- ret = btrfs_mksnapshot(&file->f_path, name, namelen,
- BTRFS_I(src_inode)->root,
- readonly, inherit);
+ ret = btrfs_mksnapshot(&file->f_path, mnt_userns,
+ name, namelen,
+ BTRFS_I(src_inode)->root,
+ readonly, inherit);
}
fdput(src);
}
@@ -1867,8 +1882,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
return PTR_ERR(vol_args);
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd,
- subvol, false, NULL);
+ ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file),
+ vol_args->name, vol_args->fd, subvol,
+ false, NULL);
kfree(vol_args);
return ret;
@@ -1926,8 +1942,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
}
}
- ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd,
- subvol, readonly, inherit);
+ ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file),
+ vol_args->name, vol_args->fd, subvol,
+ readonly, inherit);
if (ret)
goto free_inherit;
free_inherit:
@@ -1971,7 +1988,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
u64 flags;
int ret = 0;
- if (!inode_owner_or_capable(&init_user_ns, inode))
+ if (!inode_owner_or_capable(file_mnt_user_ns(file), inode))
return -EPERM;
ret = mnt_want_write_file(file);
@@ -2382,23 +2399,16 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
key.offset = (u64)-1;
while (1) {
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ret = btrfs_search_backwards(root, &key, path);
if (ret < 0)
goto out;
else if (ret > 0) {
- ret = btrfs_previous_item(root, path, dirid,
- BTRFS_INODE_REF_KEY);
- if (ret < 0)
- goto out;
- else if (ret > 0) {
- ret = -ENOENT;
- goto out;
- }
+ ret = -ENOENT;
+ goto out;
}
l = path->nodes[0];
slot = path->slots[0];
- btrfs_item_key_to_cpu(l, &key, slot);
iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
len = btrfs_inode_ref_name_len(l, iref);
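This hunk and the matching one in btrfs_search_path_in_tree_user() below replace the open-coded btrfs_search_slot() plus btrfs_previous_item() fallback with btrfs_search_backwards(). The helper itself is introduced elsewhere in this series and is not part of this diff; judging from the call sites (note the dropped btrfs_item_key_to_cpu() calls), it is expected to look roughly like this sketch:

	int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
				   struct btrfs_path *path)
	{
		int ret;

		/* Exact match, or the slot where the key would be inserted */
		ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
		if (ret < 0)
			return ret;
		/* No exact match: step back to the previous item of the same type */
		if (ret > 0)
			ret = btrfs_previous_item(root, path, key->objectid, key->type);
		/* Hand the key that was actually found back to the caller */
		if (ret == 0)
			btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);

		return ret;
	}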
@@ -2429,7 +2439,8 @@ out:
return ret;
}
-static int btrfs_search_path_in_tree_user(struct inode *inode,
+static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns,
+ struct inode *inode,
struct btrfs_ioctl_ino_lookup_user_args *args)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
@@ -2473,23 +2484,16 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
while (1) {
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0) {
+ ret = btrfs_search_backwards(root, &key, path);
+ if (ret < 0)
+ goto out_put;
+ else if (ret > 0) {
+ ret = -ENOENT;
goto out_put;
- } else if (ret > 0) {
- ret = btrfs_previous_item(root, path, dirid,
- BTRFS_INODE_REF_KEY);
- if (ret < 0) {
- goto out_put;
- } else if (ret > 0) {
- ret = -ENOENT;
- goto out_put;
- }
}
leaf = path->nodes[0];
slot = path->slots[0];
- btrfs_item_key_to_cpu(leaf, &key, slot);
iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref);
len = btrfs_inode_ref_name_len(leaf, iref);
@@ -2527,7 +2531,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
ret = PTR_ERR(temp_inode);
goto out_put;
}
- ret = inode_permission(&init_user_ns, temp_inode,
+ ret = inode_permission(mnt_userns, temp_inode,
MAY_READ | MAY_EXEC);
iput(temp_inode);
if (ret) {
@@ -2669,7 +2673,7 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
return -EACCES;
}
- ret = btrfs_search_path_in_tree_user(inode, args);
+ ret = btrfs_search_path_in_tree_user(file_mnt_user_ns(file), inode, args);
if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
ret = -EFAULT;
@@ -2905,6 +2909,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
struct btrfs_root *dest = NULL;
struct btrfs_ioctl_vol_args *vol_args = NULL;
struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL;
+ struct user_namespace *mnt_userns = file_mnt_user_ns(file);
char *subvol_name, *subvol_name_ptr = NULL;
int subvol_namelen;
int err = 0;
@@ -2932,6 +2937,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (err)
goto out;
} else {
+ struct inode *old_dir;
+
if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) {
err = -EINVAL;
goto out;
@@ -2968,6 +2975,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
err = PTR_ERR(parent);
goto out_drop_write;
}
+ old_dir = dir;
dir = d_inode(parent);
/*
@@ -2978,6 +2986,20 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
*/
destroy_parent = true;
+ /*
+ * On idmapped mounts, deletion via subvolid is
+ * restricted to subvolumes that are immediate
+ * ancestors of the inode referenced by the file
+ * descriptor in the ioctl. Otherwise the idmapping
+ * could potentially be abused to delete subvolumes
+ * anywhere in the filesystem the user wouldn't be able
+ * to delete without an idmapped mount.
+ */
+ if (old_dir != dir && mnt_userns != &init_user_ns) {
+ err = -EOPNOTSUPP;
+ goto free_parent;
+ }
+
subvol_name_ptr = btrfs_get_subvol_name_from_objectid(
fs_info, vol_args2->subvolid);
if (IS_ERR(subvol_name_ptr)) {
@@ -3016,7 +3038,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
if (err == -EINTR)
goto free_subvol_name;
- dentry = lookup_one_len(subvol_name, parent, subvol_namelen);
+ dentry = lookup_one(mnt_userns, subvol_name, parent, subvol_namelen);
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
goto out_unlock_dir;
@@ -3058,14 +3080,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (root == dest)
goto out_dput;
- err = inode_permission(&init_user_ns, inode,
- MAY_WRITE | MAY_EXEC);
+ err = inode_permission(mnt_userns, inode, MAY_WRITE | MAY_EXEC);
if (err)
goto out_dput;
}
/* check if subvolume may be deleted by a user */
- err = btrfs_may_delete(dir, dentry, 1);
+ err = btrfs_may_delete(mnt_userns, dir, dentry, 1);
if (err)
goto out_dput;
@@ -3103,7 +3124,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
{
struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_ioctl_defrag_range_args *range;
+ struct btrfs_ioctl_defrag_range_args range = {0};
int ret;
ret = mnt_want_write_file(file);
@@ -3115,6 +3136,12 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
goto out;
}
+ /* Subpage defrag will be supported in later commits */
+ if (root->fs_info->sectorsize < PAGE_SIZE) {
+ ret = -ENOTTY;
+ goto out;
+ }
+
switch (inode->i_mode & S_IFMT) {
case S_IFDIR:
if (!capable(CAP_SYS_ADMIN)) {
@@ -3135,33 +3162,24 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
goto out;
}
- range = kzalloc(sizeof(*range), GFP_KERNEL);
- if (!range) {
- ret = -ENOMEM;
- goto out;
- }
-
if (argp) {
- if (copy_from_user(range, argp,
- sizeof(*range))) {
+ if (copy_from_user(&range, argp, sizeof(range))) {
ret = -EFAULT;
- kfree(range);
goto out;
}
/* compression requires us to start the IO */
- if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
- range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
- range->extent_thresh = (u32)-1;
+ if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
+ range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
+ range.extent_thresh = (u32)-1;
}
} else {
/* the rest are all set to zero by kzalloc */
- range->len = (u64)-1;
+ range.len = (u64)-1;
}
ret = btrfs_defrag_file(file_inode(file), file,
- range, BTRFS_OLDEST_GENERATION, 0);
+ &range, BTRFS_OLDEST_GENERATION, 0);
if (ret > 0)
ret = 0;
- kfree(range);
break;
default:
ret = -EINVAL;
@@ -4404,25 +4422,20 @@ drop_write:
static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
void __user *arg)
{
- struct btrfs_ioctl_quota_rescan_args *qsa;
+ struct btrfs_ioctl_quota_rescan_args qsa = {0};
int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- qsa = kzalloc(sizeof(*qsa), GFP_KERNEL);
- if (!qsa)
- return -ENOMEM;
-
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
- qsa->flags = 1;
- qsa->progress = fs_info->qgroup_rescan_progress.objectid;
+ qsa.flags = 1;
+ qsa.progress = fs_info->qgroup_rescan_progress.objectid;
}
- if (copy_to_user(arg, qsa, sizeof(*qsa)))
+ if (copy_to_user(arg, &qsa, sizeof(qsa)))
ret = -EFAULT;
- kfree(qsa);
return ret;
}
@@ -4436,6 +4449,7 @@ static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
}
static long _btrfs_ioctl_set_received_subvol(struct file *file,
+ struct user_namespace *mnt_userns,
struct btrfs_ioctl_received_subvol_args *sa)
{
struct inode *inode = file_inode(file);
@@ -4447,7 +4461,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
int ret = 0;
int received_uuid_changed;
- if (!inode_owner_or_capable(&init_user_ns, inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EPERM;
ret = mnt_want_write_file(file);
@@ -4552,7 +4566,7 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file,
args64->rtime.nsec = args32->rtime.nsec;
args64->flags = args32->flags;
- ret = _btrfs_ioctl_set_received_subvol(file, args64);
+ ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), args64);
if (ret)
goto out;
@@ -4586,7 +4600,7 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
if (IS_ERR(sa))
return PTR_ERR(sa);
- ret = _btrfs_ioctl_set_received_subvol(file, sa);
+ ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), sa);
if (ret)
goto out;
@@ -5013,6 +5027,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_get_subvol_rootref(file, argp);
case BTRFS_IOC_INO_LOOKUP_USER:
return btrfs_ioctl_ino_lookup_user(file, argp);
+ case FS_IOC_ENABLE_VERITY:
+ return fsverity_ioctl_enable(file, (const void __user *)argp);
+ case FS_IOC_MEASURE_VERITY:
+ return fsverity_ioctl_measure(file, argp);
}
return -ENOTTY;
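The new FS_IOC_ENABLE_VERITY and FS_IOC_MEASURE_VERITY cases simply forward to the generic fsverity ioctl helpers, so btrfs files can now be sealed and measured through the standard fsverity UAPI. A minimal user-space sketch (error handling trimmed; SHA-256 and the 4K Merkle tree block size are illustrative choices, not mandated by this patch):

	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <linux/fsverity.h>

	int main(int argc, char **argv)
	{
		struct fsverity_enable_arg enable = {
			.version = 1,
			.hash_algorithm = FS_VERITY_HASH_ALG_SHA256,
			.block_size = 4096,
		};
		struct fsverity_digest *digest;
		int fd;

		if (argc < 2)
			return 1;
		fd = open(argv[1], O_RDONLY);

		/* Build the Merkle tree and make the file verity (read-only) */
		if (ioctl(fd, FS_IOC_ENABLE_VERITY, &enable))
			perror("FS_IOC_ENABLE_VERITY");

		/* Read back the file measurement (Merkle tree root digest) */
		digest = calloc(1, sizeof(*digest) + FS_VERITY_MAX_DIGEST_SIZE);
		digest->digest_size = FS_VERITY_MAX_DIGEST_SIZE;
		if (ioctl(fd, FS_IOC_MEASURE_VERITY, digest))
			perror("FS_IOC_MEASURE_VERITY");
		return 0;
	}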
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index cd042c7567a4..c25dfd1a8a54 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -14,6 +14,7 @@
#include <linux/lzo.h>
#include <linux/refcount.h>
#include "compression.h"
+#include "ctree.h"
#define LZO_LEN 4
@@ -140,18 +141,18 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_in = 0;
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- data_in = kmap(in_page);
+ data_in = page_address(in_page);
/*
* store the size of all chunks of compressed data in
* the first 4 bytes
*/
- out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ out_page = alloc_page(GFP_NOFS);
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
}
- cpage_out = kmap(out_page);
+ cpage_out = page_address(out_page);
out_offset = LZO_LEN;
tot_out = LZO_LEN;
pages[0] = out_page;
@@ -209,19 +210,18 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
if (out_len == 0 && tot_in >= len)
break;
- kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ out_page = alloc_page(GFP_NOFS);
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
}
- cpage_out = kmap(out_page);
+ cpage_out = page_address(out_page);
pages[nr_pages++] = out_page;
pg_bytes_left = PAGE_SIZE;
@@ -243,12 +243,11 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
break;
bytes_left = len - tot_in;
- kunmap(in_page);
put_page(in_page);
start += PAGE_SIZE;
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- data_in = kmap(in_page);
+ data_in = page_address(in_page);
in_len = min(bytes_left, PAGE_SIZE);
}
@@ -258,164 +257,130 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
}
/* store the size of all chunks of compressed data */
- sizes_ptr = kmap_local_page(pages[0]);
+ sizes_ptr = page_address(pages[0]);
write_compress_length(sizes_ptr, tot_out);
- kunmap_local(sizes_ptr);
ret = 0;
*total_out = tot_out;
*total_in = tot_in;
out:
*out_pages = nr_pages;
- if (out_page)
- kunmap(out_page);
- if (in_page) {
- kunmap(in_page);
+ if (in_page)
put_page(in_page);
- }
return ret;
}
+/*
+ * Copy the compressed segment payload into @dest.
+ *
+ * The payload itself has no padding, we only need to handle page switching.
+ */
+static void copy_compressed_segment(struct compressed_bio *cb,
+ char *dest, u32 len, u32 *cur_in)
+{
+ u32 orig_in = *cur_in;
+
+ while (*cur_in < orig_in + len) {
+ struct page *cur_page;
+ u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in),
+ orig_in + len - *cur_in);
+
+ ASSERT(copy_len);
+ cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE];
+
+ memcpy(dest + *cur_in - orig_in,
+ page_address(cur_page) + offset_in_page(*cur_in),
+ copy_len);
+
+ *cur_in += copy_len;
+ }
+}
+
int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
- int ret = 0, ret2;
- char *data_in;
- unsigned long page_in_index = 0;
- size_t srclen = cb->compressed_len;
- unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
- unsigned long buf_start;
- unsigned long buf_offset = 0;
- unsigned long bytes;
- unsigned long working_bytes;
- size_t in_len;
- size_t out_len;
- const size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE);
- unsigned long in_offset;
- unsigned long in_page_bytes_left;
- unsigned long tot_in;
- unsigned long tot_out;
- unsigned long tot_len;
- char *buf;
- bool may_late_unmap, need_unmap;
- struct page **pages_in = cb->compressed_pages;
- u64 disk_start = cb->start;
- struct bio *orig_bio = cb->orig_bio;
+ const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+ const u32 sectorsize = fs_info->sectorsize;
+ int ret;
+ /* Compressed data length, can be unaligned */
+ u32 len_in;
+ /* Offset inside the compressed data */
+ u32 cur_in = 0;
+ /* Bytes decompressed so far */
+ u32 cur_out = 0;
+
+ len_in = read_compress_length(page_address(cb->compressed_pages[0]));
+ cur_in += LZO_LEN;
- data_in = kmap(pages_in[0]);
- tot_len = read_compress_length(data_in);
/*
- * Compressed data header check.
+ * LZO header length check
*
- * The real compressed size can't exceed the maximum extent length, and
- * all pages should be used (whole unused page with just the segment
- * header is not possible). If this happens it means the compressed
- * extent is corrupted.
+ * The total length must not exceed the maximum extent length, and all
+ * sectors must be used.
+ * If either check fails, the compressed extent is corrupted.
*/
- if (tot_len > min_t(size_t, BTRFS_MAX_COMPRESSED, srclen) ||
- tot_len < srclen - PAGE_SIZE) {
- ret = -EUCLEAN;
- goto done;
+ if (len_in > min_t(size_t, BTRFS_MAX_COMPRESSED, cb->compressed_len) ||
+ round_up(len_in, sectorsize) < cb->compressed_len) {
+ btrfs_err(fs_info,
+ "invalid lzo header, lzo len %u compressed len %u",
+ len_in, cb->compressed_len);
+ return -EUCLEAN;
}
- tot_in = LZO_LEN;
- in_offset = LZO_LEN;
- in_page_bytes_left = PAGE_SIZE - LZO_LEN;
-
- tot_out = 0;
-
- while (tot_in < tot_len) {
- in_len = read_compress_length(data_in + in_offset);
- in_page_bytes_left -= LZO_LEN;
- in_offset += LZO_LEN;
- tot_in += LZO_LEN;
+ /* Go through each lzo segment */
+ while (cur_in < len_in) {
+ struct page *cur_page;
+ /* Length of the compressed segment */
+ u32 seg_len;
+ u32 sector_bytes_left;
+ size_t out_len = lzo1x_worst_compress(sectorsize);
/*
- * Segment header check.
- *
- * The segment length must not exceed the maximum LZO
- * compression size, nor the total compressed size.
+ * We should always have enough space for one segment header
+ * inside current sector.
*/
- if (in_len > max_segment_len || tot_in + in_len > tot_len) {
- ret = -EUCLEAN;
- goto done;
- }
-
- tot_in += in_len;
- working_bytes = in_len;
- may_late_unmap = need_unmap = false;
-
- /* fast path: avoid using the working buffer */
- if (in_page_bytes_left >= in_len) {
- buf = data_in + in_offset;
- bytes = in_len;
- may_late_unmap = true;
- goto cont;
- }
-
- /* copy bytes from the pages into the working buffer */
- buf = workspace->cbuf;
- buf_offset = 0;
- while (working_bytes) {
- bytes = min(working_bytes, in_page_bytes_left);
-
- memcpy(buf + buf_offset, data_in + in_offset, bytes);
- buf_offset += bytes;
-cont:
- working_bytes -= bytes;
- in_page_bytes_left -= bytes;
- in_offset += bytes;
-
- /* check if we need to pick another page */
- if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN)
- || in_page_bytes_left == 0) {
- tot_in += in_page_bytes_left;
-
- if (working_bytes == 0 && tot_in >= tot_len)
- break;
-
- if (page_in_index + 1 >= total_pages_in) {
- ret = -EIO;
- goto done;
- }
-
- if (may_late_unmap)
- need_unmap = true;
- else
- kunmap(pages_in[page_in_index]);
-
- data_in = kmap(pages_in[++page_in_index]);
-
- in_page_bytes_left = PAGE_SIZE;
- in_offset = 0;
- }
- }
-
- out_len = max_segment_len;
- ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
- &out_len);
- if (need_unmap)
- kunmap(pages_in[page_in_index - 1]);
+ ASSERT(cur_in / sectorsize ==
+ (cur_in + LZO_LEN - 1) / sectorsize);
+ cur_page = cb->compressed_pages[cur_in / PAGE_SIZE];
+ ASSERT(cur_page);
+ seg_len = read_compress_length(page_address(cur_page) +
+ offset_in_page(cur_in));
+ cur_in += LZO_LEN;
+
+ /* Copy the compressed segment payload into workspace */
+ copy_compressed_segment(cb, workspace->cbuf, seg_len, &cur_in);
+
+ /* Decompress the data */
+ ret = lzo1x_decompress_safe(workspace->cbuf, seg_len,
+ workspace->buf, &out_len);
if (ret != LZO_E_OK) {
- pr_warn("BTRFS: decompress failed\n");
+ btrfs_err(fs_info, "failed to decompress");
ret = -EIO;
- break;
+ goto out;
}
- buf_start = tot_out;
- tot_out += out_len;
+ /* Copy the data into inode pages */
+ ret = btrfs_decompress_buf2page(workspace->buf, out_len, cb, cur_out);
+ cur_out += out_len;
- ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
- tot_out, disk_start, orig_bio);
- if (ret2 == 0)
- break;
+ /* All data read, exit */
+ if (ret == 0)
+ goto out;
+ ret = 0;
+
+ /* Check if the sector has enough space for a segment header */
+ sector_bytes_left = sectorsize - (cur_in % sectorsize);
+ if (sector_bytes_left >= LZO_LEN)
+ continue;
+
+ /* Skip the padding zeros */
+ cur_in += sector_bytes_left;
}
-done:
- kunmap(pages_in[page_in_index]);
+out:
if (!ret)
- zero_fill_bio(orig_bio);
+ zero_fill_bio(cb->orig_bio);
return ret;
}
@@ -466,7 +431,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
destlen = min_t(unsigned long, destlen, PAGE_SIZE);
bytes = min_t(unsigned long, destlen, out_len - start_byte);
- kaddr = kmap_local_page(dest_page);
+ kaddr = page_address(dest_page);
memcpy(kaddr, workspace->buf + start_byte, bytes);
/*
@@ -476,7 +441,6 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
*/
if (bytes < destlen)
memset(kaddr+bytes, 0, destlen-bytes);
- kunmap_local(kaddr);
out:
return ret;
}
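The rewritten lzo_decompress_bio() above parses a sector-aligned layout: a 4-byte little-endian total length, then a sequence of segments, each a 4-byte little-endian length followed by that many compressed bytes, with a segment header never crossing a sector boundary (the tail of a sector is zero padding when fewer than 4 bytes remain). A self-contained sketch of walking that layout in a flat buffer (names and the 4K sector size are illustrative, not taken from the btrfs code):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define SECTORSIZE 4096u
	#define LZO_LEN    4u

	static uint32_t read_le32(const uint8_t *p)
	{
		uint32_t v;

		memcpy(&v, p, sizeof(v));	/* assumes a little-endian host for brevity */
		return v;
	}

	/* Walk every compressed segment in @buf and report its offset and length. */
	void walk_lzo_segments(const uint8_t *buf)
	{
		uint32_t len_in = read_le32(buf);	/* total compressed length */
		uint32_t cur = LZO_LEN;

		while (cur < len_in) {
			uint32_t seg_len = read_le32(buf + cur);

			cur += LZO_LEN;
			printf("segment at %u, %u bytes\n", cur, seg_len);
			cur += seg_len;

			/*
			 * Headers never cross sectors: skip the zero padding if
			 * the rest of this sector cannot hold another header.
			 */
			if (SECTORSIZE - (cur % SECTORSIZE) < LZO_LEN)
				cur += SECTORSIZE - (cur % SECTORSIZE);
		}
	}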
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 5c0f8481e25e..edb65abf0393 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -446,7 +446,6 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
* Will be also used to store the finished ordered extent.
* @file_offset: File offset for the finished IO
* @io_size: Length of the finish IO range
- * @uptodate: If the IO finishes without problem
*
* Return true if the ordered extent is finished in the range, and update
* @cached.
@@ -457,7 +456,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
*/
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
- u64 file_offset, u64 io_size, int uptodate)
+ u64 file_offset, u64 io_size)
{
struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
@@ -486,8 +485,6 @@ have_entry:
entry->bytes_left, io_size);
entry->bytes_left -= io_size;
- if (!uptodate)
- set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
if (entry->bytes_left == 0) {
/*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index b2d88aba8420..4194e960ff61 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -177,7 +177,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
bool uptodate);
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
- u64 file_offset, u64 io_size, int uptodate);
+ u64 file_offset, u64 io_size);
int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
int type);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 0fa121171ca1..db680f5be745 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1733,7 +1733,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
ASSERT(trans != NULL);
ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root,
- false, true);
+ true);
if (ret < 0) {
trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
btrfs_warn(trans->fs_info,
@@ -2651,7 +2651,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
/* Search commit root to find old_roots */
ret = btrfs_find_all_roots(NULL, fs_info,
record->bytenr, 0,
- &record->old_roots, false, false);
+ &record->old_roots, false);
if (ret < 0)
goto cleanup;
}
@@ -2667,7 +2667,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
* current root. It's safe inside commit_transaction().
*/
ret = btrfs_find_all_roots(trans, fs_info,
- record->bytenr, BTRFS_SEQ_LAST, &new_roots, false, false);
+ record->bytenr, BTRFS_SEQ_LAST, &new_roots, false);
if (ret < 0)
goto cleanup;
if (qgroup_to_skip) {
@@ -3201,7 +3201,7 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
num_bytes = found.offset;
ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
- &roots, false, false);
+ &roots, false);
if (ret < 0)
goto out;
/* For rescan, just pass old_roots as NULL */
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 244d499ebc72..d8d268ca8aa7 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1035,7 +1035,7 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
for (i = 0; i < rbio->nr_pages; i++) {
if (rbio->stripe_pages[i])
continue;
- page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ page = alloc_page(GFP_NOFS);
if (!page)
return -ENOMEM;
rbio->stripe_pages[i] = page;
@@ -1054,7 +1054,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
for (; i < rbio->nr_pages; i++) {
if (rbio->stripe_pages[i])
continue;
- page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ page = alloc_page(GFP_NOFS);
if (!page)
return -ENOMEM;
rbio->stripe_pages[i] = page;
@@ -1636,10 +1636,10 @@ struct btrfs_plug_cb {
static int plug_cmp(void *priv, const struct list_head *a,
const struct list_head *b)
{
- struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
- plug_list);
- struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
- plug_list);
+ const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
+ plug_list);
+ const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
+ plug_list);
u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
@@ -2300,7 +2300,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
if (rbio->stripe_pages[index])
continue;
- page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ page = alloc_page(GFP_NOFS);
if (!page)
return -ENOMEM;
rbio->stripe_pages[index] = page;
@@ -2350,14 +2350,14 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
if (!need_check)
goto writeback;
- p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ p_page = alloc_page(GFP_NOFS);
if (!p_page)
goto cleanup;
SetPageUptodate(p_page);
if (has_qstripe) {
/* RAID6, allocate and map temp space for the Q stripe */
- q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ q_page = alloc_page(GFP_NOFS);
if (!q_page) {
__free_page(p_page);
goto cleanup;
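The raid56.c hunks above, like the lzo.c ones, drop __GFP_HIGHMEM from alloc_page() so the stripe pages always come from lowmem; such pages have a permanent kernel mapping and can be accessed through page_address() without a kmap()/kunmap() pair. A minimal in-kernel sketch of the pattern (illustrative only, not code from this series):

	#include <linux/gfp.h>
	#include <linux/mm.h>
	#include <linux/string.h>

	static int zero_one_lowmem_page(void)
	{
		struct page *page = alloc_page(GFP_NOFS);	/* never highmem */
		void *kaddr;

		if (!page)
			return -ENOMEM;

		kaddr = page_address(page);	/* valid without kmap() for lowmem pages */
		memset(kaddr, 0, PAGE_SIZE);
		__free_page(page);
		return 0;
	}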
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 8e026de74c44..d2062d5f71dd 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -264,8 +264,8 @@ static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info,
struct block_entry *be = NULL, *exist;
struct root_entry *re = NULL;
- re = kzalloc(sizeof(struct root_entry), GFP_KERNEL);
- be = kzalloc(sizeof(struct block_entry), GFP_KERNEL);
+ re = kzalloc(sizeof(struct root_entry), GFP_NOFS);
+ be = kzalloc(sizeof(struct block_entry), GFP_NOFS);
if (!be || !re) {
kfree(re);
kfree(be);
@@ -313,7 +313,7 @@ static int add_tree_block(struct btrfs_fs_info *fs_info, u64 ref_root,
struct root_entry *re;
struct ref_entry *ref = NULL, *exist;
- ref = kmalloc(sizeof(struct ref_entry), GFP_KERNEL);
+ ref = kmalloc(sizeof(struct ref_entry), GFP_NOFS);
if (!ref)
return -ENOMEM;
@@ -358,7 +358,7 @@ static int add_shared_data_ref(struct btrfs_fs_info *fs_info,
struct block_entry *be;
struct ref_entry *ref;
- ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL);
+ ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS);
if (!ref)
return -ENOMEM;
be = add_block_entry(fs_info, bytenr, num_bytes, 0);
@@ -393,7 +393,7 @@ static int add_extent_data_ref(struct btrfs_fs_info *fs_info,
u64 offset = btrfs_extent_data_ref_offset(leaf, dref);
u32 num_refs = btrfs_extent_data_ref_count(leaf, dref);
- ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL);
+ ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS);
if (!ref)
return -ENOMEM;
be = add_block_entry(fs_info, bytenr, num_bytes, ref_root);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index fc831597cb22..914d403b4415 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -24,6 +24,7 @@
#include "block-group.h"
#include "backref.h"
#include "misc.h"
+#include "subpage.h"
/*
* Relocation overview
@@ -2781,10 +2782,70 @@ static noinline_for_stack int prealloc_file_extent_cluster(
u64 num_bytes;
int nr;
int ret = 0;
+ u64 i_size = i_size_read(&inode->vfs_inode);
u64 prealloc_start = cluster->start - offset;
u64 prealloc_end = cluster->end - offset;
u64 cur_offset = prealloc_start;
+ /*
+ * In the subpage case, the previous i_size may not be aligned to PAGE_SIZE.
+ * This means the range [i_size, PAGE_END + 1) was filled with zeros by the
+ * btrfs_do_readpage() call for the previously relocated file cluster.
+ *
+ * If the current cluster starts in the above range, btrfs_do_readpage()
+ * will skip the read, and relocate_one_page() will later writeback
+ * the padding zeros as new data, causing data corruption.
+ *
+ * Here we have to manually invalidate the range (i_size, PAGE_END + 1).
+ */
+ if (!IS_ALIGNED(i_size, PAGE_SIZE)) {
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const u32 sectorsize = fs_info->sectorsize;
+ struct page *page;
+
+ ASSERT(sectorsize < PAGE_SIZE);
+ ASSERT(IS_ALIGNED(i_size, sectorsize));
+
+ /*
+ * Subpage can't handle page with DIRTY but without UPTODATE
+ * bit as it can lead to the following deadlock:
+ *
+ * btrfs_readpage()
+ * | Page already *locked*
+ * |- btrfs_lock_and_flush_ordered_range()
+ * |- btrfs_start_ordered_extent()
+ * |- extent_write_cache_pages()
+ * |- lock_page()
+ * We try to lock the page we already hold.
+ *
+ * Here we just writeback the whole data reloc inode, so that
+ * we will be ensured to have no dirty range in the page, and
+ * are safe to clear the uptodate bits.
+ *
+ * This shouldn't cause too much overhead, as we need to write
+ * the data back anyway.
+ */
+ ret = filemap_write_and_wait(mapping);
+ if (ret < 0)
+ return ret;
+
+ clear_extent_bits(&inode->io_tree, i_size,
+ round_up(i_size, PAGE_SIZE) - 1,
+ EXTENT_UPTODATE);
+ page = find_lock_page(mapping, i_size >> PAGE_SHIFT);
+ /*
+ * If page is freed we don't need to do anything then, as we
+ * will re-read the whole page anyway.
+ */
+ if (page) {
+ btrfs_subpage_clear_uptodate(fs_info, page, i_size,
+ round_up(i_size, PAGE_SIZE) - i_size);
+ unlock_page(page);
+ put_page(page);
+ }
+ }
+
BUG_ON(cluster->start != cluster->boundary[0]);
ret = btrfs_alloc_data_chunk_ondemand(inode,
prealloc_end + 1 - prealloc_start);
@@ -2886,19 +2947,149 @@ noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
}
ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
-static int relocate_file_extent_cluster(struct inode *inode,
- struct file_extent_cluster *cluster)
+static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster,
+ int cluster_nr)
+{
+ /* Last extent, use cluster end directly */
+ if (cluster_nr >= cluster->nr - 1)
+ return cluster->end;
+
+ /* Use next boundary start */
+ return cluster->boundary[cluster_nr + 1] - 1;
+}
+
+static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
+ struct file_extent_cluster *cluster,
+ int *cluster_nr, unsigned long page_index)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ u64 offset = BTRFS_I(inode)->index_cnt;
+ const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT;
+ gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+ struct page *page;
u64 page_start;
u64 page_end;
+ u64 cur;
+ int ret;
+
+ ASSERT(page_index <= last_index);
+ page = find_lock_page(inode->i_mapping, page_index);
+ if (!page) {
+ page_cache_sync_readahead(inode->i_mapping, ra, NULL,
+ page_index, last_index + 1 - page_index);
+ page = find_or_create_page(inode->i_mapping, page_index, mask);
+ if (!page)
+ return -ENOMEM;
+ }
+ ret = set_page_extent_mapped(page);
+ if (ret < 0)
+ goto release_page;
+
+ if (PageReadahead(page))
+ page_cache_async_readahead(inode->i_mapping, ra, NULL, page,
+ page_index, last_index + 1 - page_index);
+
+ if (!PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ ret = -EIO;
+ goto release_page;
+ }
+ }
+
+ page_start = page_offset(page);
+ page_end = page_start + PAGE_SIZE - 1;
+
+ /*
+ * Start from the cluster boundary, as in the subpage case the cluster
+ * can start inside the page.
+ */
+ cur = max(page_start, cluster->boundary[*cluster_nr] - offset);
+ while (cur <= page_end) {
+ u64 extent_start = cluster->boundary[*cluster_nr] - offset;
+ u64 extent_end = get_cluster_boundary_end(cluster,
+ *cluster_nr) - offset;
+ u64 clamped_start = max(page_start, extent_start);
+ u64 clamped_end = min(page_end, extent_end);
+ u32 clamped_len = clamped_end + 1 - clamped_start;
+
+ /* Reserve metadata for this range */
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
+ clamped_len);
+ if (ret)
+ goto release_page;
+
+ /* Mark the range delalloc and dirty for later writeback */
+ lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end);
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start,
+ clamped_end, 0, NULL);
+ if (ret) {
+ clear_extent_bits(&BTRFS_I(inode)->io_tree,
+ clamped_start, clamped_end,
+ EXTENT_LOCKED | EXTENT_BOUNDARY);
+ btrfs_delalloc_release_metadata(BTRFS_I(inode),
+ clamped_len, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode),
+ clamped_len);
+ goto release_page;
+ }
+ btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len);
+
+ /*
+ * Set the boundary if it's inside the page.
+ * Data relocation requires the destination extents to have the
+ * same size as the source.
+ * EXTENT_BOUNDARY bit prevents current extent from being merged
+ * with previous extent.
+ */
+ if (in_range(cluster->boundary[*cluster_nr] - offset,
+ page_start, PAGE_SIZE)) {
+ u64 boundary_start = cluster->boundary[*cluster_nr] -
+ offset;
+ u64 boundary_end = boundary_start +
+ fs_info->sectorsize - 1;
+
+ set_extent_bits(&BTRFS_I(inode)->io_tree,
+ boundary_start, boundary_end,
+ EXTENT_BOUNDARY);
+ }
+ unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len);
+ cur += clamped_len;
+
+ /* Crossed extent end, go to next extent */
+ if (cur >= extent_end) {
+ (*cluster_nr)++;
+ /* Just finished the last extent of the cluster, exit. */
+ if (*cluster_nr >= cluster->nr)
+ break;
+ }
+ }
+ unlock_page(page);
+ put_page(page);
+
+ balance_dirty_pages_ratelimited(inode->i_mapping);
+ btrfs_throttle(fs_info);
+ if (btrfs_should_cancel_balance(fs_info))
+ ret = -ECANCELED;
+ return ret;
+
+release_page:
+ unlock_page(page);
+ put_page(page);
+ return ret;
+}
+
+static int relocate_file_extent_cluster(struct inode *inode,
+ struct file_extent_cluster *cluster)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 offset = BTRFS_I(inode)->index_cnt;
unsigned long index;
unsigned long last_index;
- struct page *page;
struct file_ra_state *ra;
- gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
- int nr = 0;
+ int cluster_nr = 0;
int ret = 0;
if (!cluster->nr)
@@ -2919,109 +3110,14 @@ static int relocate_file_extent_cluster(struct inode *inode,
if (ret)
goto out;
- index = (cluster->start - offset) >> PAGE_SHIFT;
last_index = (cluster->end - offset) >> PAGE_SHIFT;
- while (index <= last_index) {
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
- PAGE_SIZE);
- if (ret)
- goto out;
-
- page = find_lock_page(inode->i_mapping, index);
- if (!page) {
- page_cache_sync_readahead(inode->i_mapping,
- ra, NULL, index,
- last_index + 1 - index);
- page = find_or_create_page(inode->i_mapping, index,
- mask);
- if (!page) {
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- PAGE_SIZE, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode),
- PAGE_SIZE);
- ret = -ENOMEM;
- goto out;
- }
- }
- ret = set_page_extent_mapped(page);
- if (ret < 0) {
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- PAGE_SIZE, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
- unlock_page(page);
- put_page(page);
- goto out;
- }
-
- if (PageReadahead(page)) {
- page_cache_async_readahead(inode->i_mapping,
- ra, NULL, page, index,
- last_index + 1 - index);
- }
-
- if (!PageUptodate(page)) {
- btrfs_readpage(NULL, page);
- lock_page(page);
- if (!PageUptodate(page)) {
- unlock_page(page);
- put_page(page);
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- PAGE_SIZE, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode),
- PAGE_SIZE);
- ret = -EIO;
- goto out;
- }
- }
-
- page_start = page_offset(page);
- page_end = page_start + PAGE_SIZE - 1;
-
- lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
-
- if (nr < cluster->nr &&
- page_start + offset == cluster->boundary[nr]) {
- set_extent_bits(&BTRFS_I(inode)->io_tree,
- page_start, page_end,
- EXTENT_BOUNDARY);
- nr++;
- }
-
- ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start,
- page_end, 0, NULL);
- if (ret) {
- unlock_page(page);
- put_page(page);
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- PAGE_SIZE, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode),
- PAGE_SIZE);
-
- clear_extent_bits(&BTRFS_I(inode)->io_tree,
- page_start, page_end,
- EXTENT_LOCKED | EXTENT_BOUNDARY);
- goto out;
-
- }
- set_page_dirty(page);
-
- unlock_extent(&BTRFS_I(inode)->io_tree,
- page_start, page_end);
- unlock_page(page);
- put_page(page);
-
- index++;
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
- balance_dirty_pages_ratelimited(inode->i_mapping);
- btrfs_throttle(fs_info);
- if (btrfs_should_cancel_balance(fs_info)) {
- ret = -ECANCELED;
- goto out;
- }
- }
- WARN_ON(nr != cluster->nr);
+ for (index = (cluster->start - offset) >> PAGE_SHIFT;
+ index <= last_index && !ret; index++)
+ ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index);
if (btrfs_is_zoned(fs_info) && !ret)
ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ if (ret == 0)
+ WARN_ON(cluster_nr != cluster->nr);
out:
kfree(ra);
return ret;
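
As a side note on the arithmetic above: each cluster extent is clamped to the current page before metadata is reserved and the range is marked delalloc. A minimal userspace sketch of that clamping (illustrative only, not kernel code; the values are invented):

#include <stdint.h>
#include <stdio.h>

/* Clamp one cluster extent (inclusive end) to one page (inclusive end). */
static void clamp_to_page(uint64_t page_start, uint64_t page_end,
			  uint64_t extent_start, uint64_t extent_end)
{
	uint64_t clamped_start = extent_start > page_start ? extent_start : page_start;
	uint64_t clamped_end = extent_end < page_end ? extent_end : page_end;
	uint32_t clamped_len = (uint32_t)(clamped_end + 1 - clamped_start);

	printf("delalloc range [%llu, %llu], %u bytes\n",
	       (unsigned long long)clamped_start,
	       (unsigned long long)clamped_end, clamped_len);
}

int main(void)
{
	/* A 64K page (subpage case) covering the tail of one extent and the
	 * head of the next, so two ranges are reserved for the same page. */
	clamp_to_page(0, 65535, 0, 16383);
	clamp_to_page(0, 65535, 16384, 131071);
	return 0;
}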
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 6ac37ae6c811..72f9b865e847 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1198,7 +1198,7 @@ struct backref_ctx {
static int __clone_root_cmp_bsearch(const void *key, const void *elt)
{
u64 root = (u64)(uintptr_t)key;
- struct clone_root *cr = (struct clone_root *)elt;
+ const struct clone_root *cr = elt;
if (root < cr->root->root_key.objectid)
return -1;
@@ -1209,8 +1209,8 @@ static int __clone_root_cmp_bsearch(const void *key, const void *elt)
static int __clone_root_cmp_sort(const void *e1, const void *e2)
{
- struct clone_root *cr1 = (struct clone_root *)e1;
- struct clone_root *cr2 = (struct clone_root *)e2;
+ const struct clone_root *cr1 = e1;
+ const struct clone_root *cr2 = e2;
if (cr1->root->root_key.objectid < cr2->root->root_key.objectid)
return -1;
@@ -1307,7 +1307,7 @@ static int find_extent_clone(struct send_ctx *sctx,
u64 flags = 0;
struct btrfs_file_extent_item *fi;
struct extent_buffer *eb = path->nodes[0];
- struct backref_ctx *backref_ctx = NULL;
+ struct backref_ctx backref_ctx = {0};
struct clone_root *cur_clone_root;
struct btrfs_key found_key;
struct btrfs_path *tmp_path;
@@ -1322,12 +1322,6 @@ static int find_extent_clone(struct send_ctx *sctx,
/* We only use this path under the commit sem */
tmp_path->need_commit_sem = 0;
- backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_KERNEL);
- if (!backref_ctx) {
- ret = -ENOMEM;
- goto out;
- }
-
if (data_offset >= ino_size) {
/*
* There may be extents that lie behind the file's size.
@@ -1392,12 +1386,12 @@ static int find_extent_clone(struct send_ctx *sctx,
cur_clone_root->found_refs = 0;
}
- backref_ctx->sctx = sctx;
- backref_ctx->found = 0;
- backref_ctx->cur_objectid = ino;
- backref_ctx->cur_offset = data_offset;
- backref_ctx->found_itself = 0;
- backref_ctx->extent_len = num_bytes;
+ backref_ctx.sctx = sctx;
+ backref_ctx.found = 0;
+ backref_ctx.cur_objectid = ino;
+ backref_ctx.cur_offset = data_offset;
+ backref_ctx.found_itself = 0;
+ backref_ctx.extent_len = num_bytes;
/*
* The last extent of a file may be too large due to page alignment.
@@ -1405,7 +1399,7 @@ static int find_extent_clone(struct send_ctx *sctx,
* __iterate_backrefs work.
*/
if (data_offset + num_bytes >= ino_size)
- backref_ctx->extent_len = ino_size - data_offset;
+ backref_ctx.extent_len = ino_size - data_offset;
/*
* Now collect all backrefs.
@@ -1416,12 +1410,12 @@ static int find_extent_clone(struct send_ctx *sctx,
extent_item_pos = 0;
ret = iterate_extent_inodes(fs_info, found_key.objectid,
extent_item_pos, 1, __iterate_backrefs,
- backref_ctx, false);
+ &backref_ctx, false);
if (ret < 0)
goto out;
- if (!backref_ctx->found_itself) {
+ if (!backref_ctx.found_itself) {
/* found a bug in backref code? */
ret = -EIO;
btrfs_err(fs_info,
@@ -1434,7 +1428,7 @@ static int find_extent_clone(struct send_ctx *sctx,
"find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu",
data_offset, ino, num_bytes, logical);
- if (!backref_ctx->found)
+ if (!backref_ctx.found)
btrfs_debug(fs_info, "no clones found");
cur_clone_root = NULL;
@@ -1458,7 +1452,6 @@ static int find_extent_clone(struct send_ctx *sctx,
out:
btrfs_free_path(tmp_path);
- kfree(backref_ctx);
return ret;
}
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index f79bf85f2439..5ada02e0e629 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -493,6 +493,11 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
long time_left;
int loops;
+ delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
+ ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
+ if (delalloc_bytes == 0 && ordered_bytes == 0)
+ return;
+
/* Calc the number of the pages we need flush for space reservation */
if (to_reclaim == U64_MAX) {
items = U64_MAX;
@@ -500,22 +505,21 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
/*
* to_reclaim is set to however much metadata we need to
* reclaim, but reclaiming that much data doesn't really track
- * exactly, so increase the amount to reclaim by 2x in order to
- * make sure we're flushing enough delalloc to hopefully reclaim
- * some metadata reservations.
+		 * exactly. What we really want to do is reclaim a full inode's
+		 * worth of reservations; however, that's not available to us
+		 * here. We will take a fraction of the delalloc bytes for our
+		 * flushing loops and hope for the best. Delalloc will expand
+		 * the amount we write to cover an entire dirty extent, which
+		 * will reclaim the metadata reservation for that range. If
+		 * it's not enough, subsequent flush stages will be more
+		 * aggressive.
*/
+ to_reclaim = max(to_reclaim, delalloc_bytes >> 3);
items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
- to_reclaim = items * EXTENT_SIZE_PER_ITEM;
}
trans = (struct btrfs_trans_handle *)current->journal_info;
- delalloc_bytes = percpu_counter_sum_positive(
- &fs_info->delalloc_bytes);
- ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
- if (delalloc_bytes == 0 && ordered_bytes == 0)
- return;
-
/*
* If we are doing more ordered than delalloc we need to just wait on
* ordered extents, otherwise we'll waste time trying to flush delalloc
@@ -528,9 +532,49 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
while ((delalloc_bytes || ordered_bytes) && loops < 3) {
u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
long nr_pages = min_t(u64, temp, LONG_MAX);
+ int async_pages;
btrfs_start_delalloc_roots(fs_info, nr_pages, true);
+ /*
+ * We need to make sure any outstanding async pages are now
+ * processed before we continue. This is because things like
+ * sync_inode() try to be smart and skip writing if the inode is
+		 * marked clean. We don't use the generic filemap write-back helpers
+		 * for flushing because we want to control how many pages we write out
+		 * at a time, so this is the only safe way to make sure we've
+ * waited for outstanding compressed workers to have started
+ * their jobs and thus have ordered extents set up properly.
+ *
+ * This exists because we do not want to wait for each
+ * individual inode to finish its async work, we simply want to
+ * start the IO on everybody, and then come back here and wait
+ * for all of the async work to catch up. Once we're done with
+ * that we know we'll have ordered extents for everything and we
+ * can decide if we wait for that or not.
+ *
+ * If we choose to replace this in the future, make absolutely
+ * sure that the proper waiting is being done in the async case,
+ * as there have been bugs in that area before.
+ */
+ async_pages = atomic_read(&fs_info->async_delalloc_pages);
+ if (!async_pages)
+ goto skip_async;
+
+ /*
+		 * We don't want to wait forever. If we wrote fewer pages in this
+		 * loop than we have outstanding, only wait for that number of
+		 * pages; otherwise we can wait for all async pages to finish
+		 * before continuing.
+ */
+ if (async_pages > nr_pages)
+ async_pages -= nr_pages;
+ else
+ async_pages = 0;
+ wait_event(fs_info->async_submit_wait,
+ atomic_read(&fs_info->async_delalloc_pages) <=
+ async_pages);
+skip_async:
loops++;
if (wait_ordered && !trans) {
btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
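
The wait target derived from async_delalloc_pages above is a subtraction with a floor at zero. A small standalone model of that accounting (userspace sketch, not kernel code):

#include <stdio.h>

/*
 * Model: if more async pages are outstanding than we just queued, only
 * wait until the backlog drops by the amount we queued; otherwise wait
 * for the whole backlog (target 0).
 */
static long wait_target(long outstanding, long queued_this_loop)
{
	if (outstanding > queued_this_loop)
		return outstanding - queued_this_loop;
	return 0;
}

int main(void)
{
	printf("target=%ld\n", wait_target(1000, 256)); /* wait until <= 744 remain */
	printf("target=%ld\n", wait_target(100, 256));  /* wait for everything (0)  */
	return 0;
}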
@@ -595,8 +639,11 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
case FLUSH_DELALLOC:
case FLUSH_DELALLOC_WAIT:
+ case FLUSH_DELALLOC_FULL:
+ if (state == FLUSH_DELALLOC_FULL)
+ num_bytes = U64_MAX;
shrink_delalloc(fs_info, space_info, num_bytes,
- state == FLUSH_DELALLOC_WAIT, for_preempt);
+ state != FLUSH_DELALLOC, for_preempt);
break;
case FLUSH_DELAYED_REFS_NR:
case FLUSH_DELAYED_REFS:
@@ -686,7 +733,7 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
{
u64 global_rsv_size = fs_info->global_block_rsv.reserved;
u64 ordered, delalloc;
- u64 thresh = div_factor_fine(space_info->total_bytes, 98);
+ u64 thresh = div_factor_fine(space_info->total_bytes, 90);
u64 used;
/* If we're just plain full then async reclaim just slows us down. */
@@ -694,6 +741,20 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
global_rsv_size) >= thresh)
return false;
+ used = space_info->bytes_may_use + space_info->bytes_pinned;
+
+ /* The total flushable belongs to the global rsv, don't flush. */
+ if (global_rsv_size >= used)
+ return false;
+
+ /*
+ * 128MiB is 1/4 of the maximum global rsv size. If we have less than
+	 * that devoted to other reservations then there's no sense in flushing;
+	 * we don't have a lot of things that need flushing.
+ */
+ if (used - global_rsv_size <= SZ_128M)
+ return false;
+
/*
* We have tickets queued, bail so we don't compete with the async
* flushers.
@@ -824,6 +885,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
struct reserve_ticket *ticket;
u64 tickets_id = space_info->tickets_id;
+ trace_btrfs_fail_all_tickets(fs_info, space_info);
+
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
__btrfs_dump_space_info(fs_info, space_info);
@@ -905,6 +968,14 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
}
/*
+ * We do not want to empty the system of delalloc unless we're
+ * under heavy pressure, so allow one trip through the flushing
+ * logic before we start doing a FLUSH_DELALLOC_FULL.
+ */
+ if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles)
+ flush_state++;
+
+ /*
* We don't want to force a chunk allocation until we've tried
* pretty hard to reclaim space. Think of the case where we
* freed up a bunch of space and so have a lot of pinned space
@@ -1067,7 +1138,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
* so if we now have space to allocate do the force chunk allocation.
*/
static const enum btrfs_flush_state data_flush_states[] = {
- FLUSH_DELALLOC_WAIT,
+ FLUSH_DELALLOC_FULL,
RUN_DELAYED_IPUTS,
COMMIT_TRANS,
ALLOC_CHUNK_FORCE,
@@ -1156,6 +1227,7 @@ static const enum btrfs_flush_state evict_flush_states[] = {
FLUSH_DELAYED_REFS,
FLUSH_DELALLOC,
FLUSH_DELALLOC_WAIT,
+ FLUSH_DELALLOC_FULL,
ALLOC_CHUNK,
COMMIT_TRANS,
};
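
For reference, the new early exits in need_preemptive_reclaim() amount to three cheap checks before any flushing work is queued. A hedged userspace model (parameters and numbers invented for illustration; the 90% and 128MiB constants come from the hunks above):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SZ_128M (128ULL << 20)

/* Model of the preemptive-reclaim gates added above. */
static bool should_preempt_flush(uint64_t total_bytes, uint64_t used_plus_rsv,
				 uint64_t bytes_may_use, uint64_t bytes_pinned,
				 uint64_t global_rsv)
{
	uint64_t thresh = total_bytes / 100 * 90;  /* was 98% before the patch */
	uint64_t used = bytes_may_use + bytes_pinned;

	if (used_plus_rsv >= thresh)       /* nearly full: flushing only slows us down */
		return false;
	if (global_rsv >= used)            /* all flushable space belongs to the global rsv */
		return false;
	if (used - global_rsv <= SZ_128M)  /* too little reclaimable to be worth it */
		return false;
	return true;
}

int main(void)
{
	printf("%d\n", should_preempt_flush(10ULL << 30, 2ULL << 30,
					    1ULL << 30, 0, 256ULL << 20));
	return 0;
}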
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index 8260f8bb3ff0..f429256f56db 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -73,7 +73,7 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
} \
token->kaddr = page_address(token->eb->pages[idx]); \
token->offset = idx << PAGE_SHIFT; \
- if (oip + size <= PAGE_SIZE) \
+	if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \
return get_unaligned_le##bits(token->kaddr + oip); \
\
memcpy(lebytes, token->kaddr + oip, part); \
@@ -94,7 +94,7 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
u8 lebytes[sizeof(u##bits)]; \
\
ASSERT(check_setget_bounds(eb, ptr, off, size)); \
- if (oip + size <= PAGE_SIZE) \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \
return get_unaligned_le##bits(kaddr + oip); \
\
memcpy(lebytes, kaddr + oip, part); \
@@ -124,7 +124,7 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token, \
} \
token->kaddr = page_address(token->eb->pages[idx]); \
token->offset = idx << PAGE_SHIFT; \
- if (oip + size <= PAGE_SIZE) { \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
put_unaligned_le##bits(val, token->kaddr + oip); \
return; \
} \
@@ -146,7 +146,7 @@ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
u8 lebytes[sizeof(u##bits)]; \
\
ASSERT(check_setget_bounds(eb, ptr, off, size)); \
- if (oip + size <= PAGE_SIZE) { \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
put_unaligned_le##bits(val, kaddr + oip); \
return; \
} \
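
The INLINE_EXTENT_BUFFER_PAGES == 1 condition is a compile-time constant on configurations where a whole metadata block fits in a single page (for example 4K nodesize on a 64K page), so the cross-page copy branch can be optimized out entirely. A rough userspace analogue of the idea (names and layout invented, not the kernel macros):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096UL
#define NODESIZE  16384UL
/* Pages backing one metadata block; 1 when the block fits in a single page,
 * in which case the cross-page branch below becomes compile-time dead code. */
#define EB_PAGES  ((NODESIZE + PAGE_SIZE - 1) / PAGE_SIZE)

/* Read a little-endian u32 at byte offset 'off' of a paged metadata block. */
static uint32_t get_le32(uint8_t *pages[], unsigned long off)
{
	unsigned long idx = off / PAGE_SIZE;       /* backing page index     */
	unsigned long oip = off & (PAGE_SIZE - 1); /* offset inside the page */
	uint8_t b[4];

	if (EB_PAGES == 1 || oip + sizeof(b) <= PAGE_SIZE) {
		/* Fast path: the field lives entirely in one page. */
		memcpy(b, pages[idx] + oip, sizeof(b));
	} else {
		/* Slow path: stitch the value together from two pages. */
		unsigned long part = PAGE_SIZE - oip;

		memcpy(b, pages[idx] + oip, part);
		memcpy(b + part, pages[idx + 1], sizeof(b) - part);
	}
	return (uint32_t)b[0] | (uint32_t)b[1] << 8 |
	       (uint32_t)b[2] << 16 | (uint32_t)b[3] << 24;
}

int main(void)
{
	static uint8_t backing[EB_PAGES][PAGE_SIZE];
	uint8_t *pages[EB_PAGES];
	unsigned long i;

	for (i = 0; i < EB_PAGES; i++)
		pages[i] = backing[i];

	backing[0][100] = 0x2a;              /* value contained in page 0    */
	backing[0][PAGE_SIZE - 1] = 0x01;    /* value straddling pages 0 and */
	backing[1][0] = 0x07;                /* 1, exercising the slow path  */

	printf("%u\n", get_le32(pages, 100));            /* fast path: 42   */
	printf("%u\n", get_le32(pages, PAGE_SIZE - 1));  /* slow path: 1793 */
	return 0;
}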
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 640bcd21bf28..cb10e56ee31e 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -435,8 +435,10 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
spin_lock_irqsave(&subpage->lock, flags);
subpage->writeback_bitmap &= ~tmp;
- if (subpage->writeback_bitmap == 0)
+ if (subpage->writeback_bitmap == 0) {
+ ASSERT(PageWriteback(page));
end_page_writeback(page);
+ }
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -559,3 +561,23 @@ IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
PageWriteback);
IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
PageOrdered);
+
+/*
+ * Make sure not only the page dirty bit is cleared, but also the subpage
+ * dirty bit is cleared.
+ */
+void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
+ struct page *page)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+
+ if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
+ return;
+
+ ASSERT(!PageDirty(page));
+ if (fs_info->sectorsize == PAGE_SIZE)
+ return;
+
+ ASSERT(PagePrivate(page) && page->private);
+ ASSERT(subpage->dirty_bitmap == 0);
+}
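
The assertion above checks two levels of dirty state: the page flag and the per-sector bitmap. As a reminder of how a subpage bitmap maps sectors to bits, here is a tiny illustrative model (not the kernel structure; sizes chosen for the 4K-sector-on-64K-page case):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE  65536u
#define SECTORSIZE 4096u
/* One bit per sector inside the page: 16 bits for 4K sectors in a 64K page. */

struct subpage_model {
	uint16_t dirty_bitmap;
};

static void set_dirty(struct subpage_model *sp, unsigned int start, unsigned int len)
{
	unsigned int first = start / SECTORSIZE;
	unsigned int nbits = len / SECTORSIZE;

	sp->dirty_bitmap |= ((1u << nbits) - 1) << first;
}

int main(void)
{
	struct subpage_model sp = { 0 };

	set_dirty(&sp, 8192, 12288);        /* sectors 2..4 */
	printf("bitmap=0x%04x\n", sp.dirty_bitmap);
	assert(sp.dirty_bitmap != 0);       /* the page cannot be considered clean */
	return 0;
}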
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 4d7aca85d915..0120948f37a1 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -126,4 +126,7 @@ DECLARE_BTRFS_SUBPAGE_OPS(ordered);
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len);
+void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
+ struct page *page);
+
#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d07b18b2b250..537d90bf5d84 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1201,21 +1201,14 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = (u64)-1;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ret = btrfs_search_backwards(root, &key, path);
if (ret < 0) {
goto err;
} else if (ret > 0) {
- ret = btrfs_previous_item(root, path, subvol_objectid,
- BTRFS_ROOT_BACKREF_KEY);
- if (ret < 0) {
- goto err;
- } else if (ret > 0) {
- ret = -ENOENT;
- goto err;
- }
+ ret = -ENOENT;
+ goto err;
}
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
subvol_objectid = key.offset;
root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -1248,21 +1241,14 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
- ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
+ ret = btrfs_search_backwards(fs_root, &key, path);
if (ret < 0) {
goto err;
} else if (ret > 0) {
- ret = btrfs_previous_item(fs_root, path, dirid,
- BTRFS_INODE_REF_KEY);
- if (ret < 0) {
- goto err;
- } else if (ret > 0) {
- ret = -ENOENT;
- goto err;
- }
+ ret = -ENOENT;
+ goto err;
}
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
dirid = key.offset;
inode_ref = btrfs_item_ptr(path->nodes[0],
@@ -1353,6 +1339,9 @@ static int btrfs_fill_super(struct super_block *sb,
sb->s_op = &btrfs_super_ops;
sb->s_d_op = &btrfs_dentry_operations;
sb->s_export_op = &btrfs_export_ops;
+#ifdef CONFIG_FS_VERITY
+ sb->s_vop = &btrfs_verityops;
+#endif
sb->s_xattr = btrfs_xattr_handlers;
sb->s_time_gran = 1;
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
@@ -2041,13 +2030,6 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
ret = -EINVAL;
goto restore;
}
- if (fs_info->sectorsize < PAGE_SIZE) {
- btrfs_warn(fs_info,
- "read-write mount is not yet allowed for sectorsize %u page size %lu",
- fs_info->sectorsize, PAGE_SIZE);
- ret = -EINVAL;
- goto restore;
- }
/*
* NOTE: when remounting with a change that does writes, don't
@@ -2096,16 +2078,15 @@ restore:
}
/* Used to sort the devices by max_avail(descending sort) */
-static inline int btrfs_cmp_device_free_bytes(const void *dev_info1,
- const void *dev_info2)
+static int btrfs_cmp_device_free_bytes(const void *a, const void *b)
{
- if (((struct btrfs_device_info *)dev_info1)->max_avail >
- ((struct btrfs_device_info *)dev_info2)->max_avail)
+ const struct btrfs_device_info *dev_info1 = a;
+ const struct btrfs_device_info *dev_info2 = b;
+
+ if (dev_info1->max_avail > dev_info2->max_avail)
return -1;
- else if (((struct btrfs_device_info *)dev_info1)->max_avail <
- ((struct btrfs_device_info *)dev_info2)->max_avail)
+ else if (dev_info1->max_avail < dev_info2->max_avail)
return 1;
- else
return 0;
}
@@ -2381,7 +2362,7 @@ static struct file_system_type btrfs_root_fs_type = {
.name = "btrfs",
.mount = btrfs_mount_root,
.kill_sb = btrfs_kill_super,
- .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
+ .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("btrfs");
@@ -2572,6 +2553,11 @@ static void __init btrfs_print_mod_info(void)
#else
", zoned=no"
#endif
+#ifdef CONFIG_FS_VERITY
+ ", fsverity=yes"
+#else
+ ", fsverity=no"
+#endif
;
pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options);
}
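
btrfs_search_backwards() (used above) folds the old search-then-previous-item dance into one helper: as I read the change, it positions the path at the largest key not greater than the requested one and returns that key. A hedged sketch of the same idea over a plain sorted array (purely illustrative, not the btree implementation):

#include <stdio.h>

/*
 * Return the index of the largest element <= target in a sorted array,
 * or -1 if every element is greater. This mirrors the "search, and step
 * back one slot on a miss" pattern replaced in the hunks above.
 */
static int search_backwards(const int *keys, int nr, int target)
{
	int lo = 0, hi = nr; /* invariant: keys[hi..] > target */

	while (lo < hi) {
		int mid = lo + (hi - lo) / 2;

		if (keys[mid] <= target)
			lo = mid + 1;
		else
			hi = mid;
	}
	return lo - 1;
}

int main(void)
{
	int keys[] = { 3, 7, 9, 42 };

	printf("%d\n", search_backwards(keys, 4, 10)); /* index 2 (key 9)      */
	printf("%d\n", search_backwards(keys, 4, 1));  /* -1, nothing <= 1     */
	return 0;
}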
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 9d1d140118ff..25a6f587852b 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -22,6 +22,26 @@
#include "block-group.h"
#include "qgroup.h"
+/*
+ * Structure name Path
+ * --------------------------------------------------------------------------
+ * btrfs_supported_static_feature_attrs /sys/fs/btrfs/features
+ * btrfs_supported_feature_attrs /sys/fs/btrfs/features and
+ * /sys/fs/btrfs/<uuid>/features
+ * btrfs_attrs /sys/fs/btrfs/<uuid>
+ * devid_attrs /sys/fs/btrfs/<uuid>/devinfo/<devid>
+ * allocation_attrs /sys/fs/btrfs/<uuid>/allocation
+ * qgroup_attrs /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid>
+ * space_info_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type>
+ * raid_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile>
+ *
+ * When built with CONFIG_BTRFS_DEBUG:
+ *
+ * btrfs_debug_feature_attrs /sys/fs/btrfs/debug
+ * btrfs_debug_mount_attrs /sys/fs/btrfs/<uuid>/debug
+ * discard_debug_attrs /sys/fs/btrfs/<uuid>/debug/discard
+ */
+
struct btrfs_feature_attr {
struct kobj_attribute kobj_attr;
enum btrfs_feature_set feature_set;
@@ -267,7 +287,17 @@ BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
#ifdef CONFIG_BTRFS_DEBUG
BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
#endif
+#ifdef CONFIG_FS_VERITY
+BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY);
+#endif
+/*
+ * Features which depend on feature bits and may differ between each fs.
+ *
+ * /sys/fs/btrfs/features - all available features implemented by this version
+ * /sys/fs/btrfs/UUID/features - features of the fs which are enabled or
+ * can be changed on a mounted filesystem.
+ */
static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(mixed_backref),
BTRFS_FEAT_ATTR_PTR(default_subvol),
@@ -285,16 +315,12 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
#ifdef CONFIG_BTRFS_DEBUG
BTRFS_FEAT_ATTR_PTR(zoned),
#endif
+#ifdef CONFIG_FS_VERITY
+ BTRFS_FEAT_ATTR_PTR(verity),
+#endif
NULL
};
-/*
- * Features which depend on feature bits and may differ between each fs.
- *
- * /sys/fs/btrfs/features lists all available features of this kernel while
- * /sys/fs/btrfs/UUID/features shows features of the fs which are enabled or
- * can be changed online.
- */
static const struct attribute_group btrfs_feature_attr_group = {
.name = "features",
.is_visible = btrfs_feature_visible,
@@ -366,6 +392,10 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
{
ssize_t ret = 0;
+ /* 4K sector size is also supported with 64K page size */
+ if (PAGE_SIZE == SZ_64K)
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%u ", SZ_4K);
+
/* Only sectorsize == PAGE_SIZE is now supported */
ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%lu\n", PAGE_SIZE);
@@ -374,6 +404,12 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
BTRFS_ATTR(static_feature, supported_sectorsizes,
supported_sectorsizes_show);
+/*
+ * Features which only depend on kernel version.
+ *
+ * These are listed in /sys/fs/btrfs/features along with
+ * btrfs_supported_feature_attrs.
+ */
static struct attribute *btrfs_supported_static_feature_attrs[] = {
BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
BTRFS_ATTR_PTR(static_feature, supported_checksums),
@@ -383,12 +419,6 @@ static struct attribute *btrfs_supported_static_feature_attrs[] = {
NULL
};
-/*
- * Features which only depend on kernel version.
- *
- * These are listed in /sys/fs/btrfs/features along with
- * btrfs_feature_attr_group
- */
static const struct attribute_group btrfs_static_feature_attr_group = {
.name = "features",
.attrs = btrfs_supported_static_feature_attrs,
@@ -547,6 +577,11 @@ static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj,
BTRFS_ATTR_RW(discard, max_discard_size, btrfs_discard_max_discard_size_show,
btrfs_discard_max_discard_size_store);
+/*
+ * Per-filesystem debugging of discard (when mounted with discard=async).
+ *
+ * Path: /sys/fs/btrfs/<uuid>/debug/discard/
+ */
static const struct attribute *discard_debug_attrs[] = {
BTRFS_ATTR_PTR(discard, discardable_bytes),
BTRFS_ATTR_PTR(discard, discardable_extents),
@@ -560,15 +595,19 @@ static const struct attribute *discard_debug_attrs[] = {
};
/*
- * Runtime debugging exported via sysfs
+ * Per-filesystem runtime debugging exported via sysfs.
*
- * /sys/fs/btrfs/debug - applies to module or all filesystems
- * /sys/fs/btrfs/UUID - applies only to the given filesystem
+ * Path: /sys/fs/btrfs/UUID/debug/
*/
static const struct attribute *btrfs_debug_mount_attrs[] = {
NULL,
};
+/*
+ * Runtime debugging exported via sysfs, applies to all mounted filesystems.
+ *
+ * Path: /sys/fs/btrfs/debug
+ */
static struct attribute *btrfs_debug_feature_attrs[] = {
NULL
};
@@ -637,6 +676,11 @@ static ssize_t raid_bytes_show(struct kobject *kobj,
return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
}
+/*
+ * Allocation information about block group profiles.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile>/
+ */
static struct attribute *raid_attrs[] = {
BTRFS_ATTR_PTR(raid, total_bytes),
BTRFS_ATTR_PTR(raid, used_bytes),
@@ -676,6 +720,11 @@ SPACE_INFO_ATTR(bytes_zone_unusable);
SPACE_INFO_ATTR(disk_used);
SPACE_INFO_ATTR(disk_total);
+/*
+ * Allocation information about block group types.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/allocation/<bg-type>/
+ */
static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(space_info, flags),
BTRFS_ATTR_PTR(space_info, total_bytes),
@@ -703,6 +752,11 @@ static struct kobj_type space_info_ktype = {
.default_groups = space_info_groups,
};
+/*
+ * Allocation information about block groups.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/allocation/
+ */
static const struct attribute *allocation_attrs[] = {
BTRFS_ATTR_PTR(allocation, global_rsv_reserved),
BTRFS_ATTR_PTR(allocation, global_rsv_size),
@@ -974,7 +1028,8 @@ static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj,
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
ssize_t ret;
- ret = scnprintf(buf, PAGE_SIZE, "%d\n", fs_info->bg_reclaim_threshold);
+ ret = scnprintf(buf, PAGE_SIZE, "%d\n",
+ READ_ONCE(fs_info->bg_reclaim_threshold));
return ret;
}
@@ -991,16 +1046,21 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
if (ret)
return ret;
- if (thresh <= 50 || thresh > 100)
+ if (thresh != 0 && (thresh <= 50 || thresh > 100))
return -EINVAL;
- fs_info->bg_reclaim_threshold = thresh;
+ WRITE_ONCE(fs_info->bg_reclaim_threshold, thresh);
return len;
}
BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show,
btrfs_bg_reclaim_threshold_store);
+/*
+ * Per-filesystem information and stats.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/
+ */
static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, label),
BTRFS_ATTR_PTR(, nodesize),
@@ -1510,6 +1570,11 @@ static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj,
}
BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show);
+/*
+ * Information about one device.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/devinfo/<devid>/
+ */
static struct attribute *devid_attrs[] = {
BTRFS_ATTR_PTR(devid, error_stats),
BTRFS_ATTR_PTR(devid, in_fs_metadata),
@@ -1799,6 +1864,11 @@ QGROUP_RSV_ATTR(data, BTRFS_QGROUP_RSV_DATA);
QGROUP_RSV_ATTR(meta_pertrans, BTRFS_QGROUP_RSV_META_PERTRANS);
QGROUP_RSV_ATTR(meta_prealloc, BTRFS_QGROUP_RSV_META_PREALLOC);
+/*
+ * Qgroup information.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid>/
+ */
static struct attribute *qgroup_attrs[] = {
BTRFS_ATTR_PTR(qgroup, referenced),
BTRFS_ATTR_PTR(qgroup, exclusive),
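
Related to the bg_reclaim_threshold change above: the store handler now also accepts 0 (as I read it, to turn automatic reclaim off) while still rejecting values in (0, 50] and above 100. A compact model of that validation (illustrative, not the sysfs handler itself):

#include <stdio.h>

/* Accept 0 (feature disabled) or a percentage in (50, 100]. */
static int validate_bg_reclaim_threshold(int thresh)
{
	if (thresh != 0 && (thresh <= 50 || thresh > 100))
		return -1; /* -EINVAL in the kernel */
	return 0;
}

int main(void)
{
	printf("%d %d %d %d\n",
	       validate_bg_reclaim_threshold(0),    /*  0: disable      */
	       validate_bg_reclaim_threshold(75),   /*  0: valid        */
	       validate_bg_reclaim_threshold(30),   /* -1: too low      */
	       validate_bg_reclaim_threshold(101)); /* -1: out of range */
	return 0;
}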
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 98b5aaba46f1..19ba7d5b7d8f 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -223,8 +223,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
* we can only call btrfs_qgroup_account_extent() directly to test
* quota.
*/
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
- false, false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
@@ -236,8 +235,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
if (ret)
return ret;
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
- false, false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -260,8 +258,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
old_roots = NULL;
new_roots = NULL;
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
- false, false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
@@ -272,8 +269,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
if (ret)
return -EINVAL;
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
- false, false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -324,8 +320,7 @@ static int test_multiple_refs(struct btrfs_root *root,
return ret;
}
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
- false, false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
@@ -337,8 +332,7 @@ static int test_multiple_refs(struct btrfs_root *root,
if (ret)
return ret;
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
- false, false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -359,8 +353,7 @@ static int test_multiple_refs(struct btrfs_root *root,
return -EINVAL;
}
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
- false, false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
@@ -372,8 +365,7 @@ static int test_multiple_refs(struct btrfs_root *root,
if (ret)
return ret;
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
- false, false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -400,8 +392,7 @@ static int test_multiple_refs(struct btrfs_root *root,
return -EINVAL;
}
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
- false, false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
@@ -413,8 +404,7 @@ static int test_multiple_refs(struct btrfs_root *root,
if (ret)
return ret;
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
- false, false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index a8b2e0d2c025..7733e8ac0a69 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -24,6 +24,7 @@
#include "compression.h"
#include "volumes.h"
#include "misc.h"
+#include "btrfs_inode.h"
/*
* Error message should follow the following format:
@@ -873,13 +874,22 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
}
}
- if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
- (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) ||
- (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
- (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
- (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) ||
+ if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 &&
+ sub_stripes != btrfs_raid_array[BTRFS_RAID_RAID10].sub_stripes) ||
+ (type & BTRFS_BLOCK_GROUP_RAID1 &&
+ num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1].devs_min) ||
+ (type & BTRFS_BLOCK_GROUP_RAID1C3 &&
+ num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C3].devs_min) ||
+ (type & BTRFS_BLOCK_GROUP_RAID1C4 &&
+ num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C4].devs_min) ||
+ (type & BTRFS_BLOCK_GROUP_RAID5 &&
+ num_stripes < btrfs_raid_array[BTRFS_RAID_RAID5].devs_min) ||
+ (type & BTRFS_BLOCK_GROUP_RAID6 &&
+ num_stripes < btrfs_raid_array[BTRFS_RAID_RAID6].devs_min) ||
+ (type & BTRFS_BLOCK_GROUP_DUP &&
+ num_stripes != btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes) ||
((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
- num_stripes != 1))) {
+ num_stripes != btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes))) {
chunk_err(leaf, chunk, logical,
"invalid num_stripes:sub_stripes %u:%u for profile %llu",
num_stripes, sub_stripes,
@@ -999,6 +1009,8 @@ static int check_inode_item(struct extent_buffer *leaf,
u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777);
u32 mode;
int ret;
+ u32 flags;
+ u32 ro_flags;
ret = check_inode_key(leaf, key, slot);
if (unlikely(ret < 0))
@@ -1054,11 +1066,17 @@ static int check_inode_item(struct extent_buffer *leaf,
btrfs_inode_nlink(leaf, iitem));
return -EUCLEAN;
}
- if (unlikely(btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK)) {
+ btrfs_inode_split_flags(btrfs_inode_flags(leaf, iitem), &flags, &ro_flags);
+ if (unlikely(flags & ~BTRFS_INODE_FLAG_MASK)) {
inode_item_err(leaf, slot,
- "unknown flags detected: 0x%llx",
- btrfs_inode_flags(leaf, iitem) &
- ~BTRFS_INODE_FLAG_MASK);
+ "unknown incompat flags detected: 0x%x", flags);
+ return -EUCLEAN;
+ }
+ if (unlikely(!sb_rdonly(fs_info->sb) &&
+ (ro_flags & ~BTRFS_INODE_RO_FLAG_MASK))) {
+ inode_item_err(leaf, slot,
+ "unknown ro-compat flags detected on writeable mount: 0x%x",
+ ro_flags);
return -EUCLEAN;
}
return 0;
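
The split helper used above unpacks two 32-bit flag sets from the single 64-bit on-disk inode flags field; as I read the change, the regular inode flags live in the low 32 bits and the ro-compat flags in the high 32 bits. A hedged sketch of that packing (the real helpers live in btrfs_inode.h, not in this hunk, so treat the layout as an assumption):

#include <stdint.h>
#include <stdio.h>

/* Assumed packing: low 32 bits = inode flags, high 32 bits = ro flags. */
static void split_flags(uint64_t on_disk, uint32_t *flags, uint32_t *ro_flags)
{
	*flags = (uint32_t)on_disk;
	*ro_flags = (uint32_t)(on_disk >> 32);
}

static uint64_t combine_flags(uint32_t flags, uint32_t ro_flags)
{
	return (uint64_t)ro_flags << 32 | flags;
}

int main(void)
{
	uint32_t flags, ro_flags;

	split_flags(combine_flags(0x10, 0x1), &flags, &ro_flags);
	printf("flags=0x%x ro_flags=0x%x\n", flags, ro_flags);
	return 0;
}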
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index e6430ac9bbe8..f7efc26aa82a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -753,7 +753,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
*/
ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
ins.offset);
- if (ret == 0) {
+ if (ret < 0) {
+ goto out;
+ } else if (ret == 0) {
btrfs_init_generic_ref(&ref,
BTRFS_ADD_DELAYED_REF,
ins.objectid, ins.offset, 0);
@@ -3039,8 +3041,6 @@ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
list_del_init(&ctx->list);
ctx->log_ret = error;
}
-
- INIT_LIST_HEAD(&root->log_ctxs[index]);
}
/*
@@ -3328,10 +3328,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out_wake_log_root;
}
- mutex_lock(&root->log_mutex);
- if (root->last_log_commit < log_transid)
- root->last_log_commit = log_transid;
- mutex_unlock(&root->log_mutex);
+ /*
+ * We know there can only be one task here, since we have not yet set
+ * root->log_commit[index1] to 0 and any task attempting to sync the
+ * log must wait for the previous log transaction to commit if it's
+ * still in progress or wait for the current log transaction commit if
+ * someone else already started it. We use <= and not < because the
+ * first log transaction has an ID of 0.
+ */
+ ASSERT(root->last_log_commit <= log_transid);
+ root->last_log_commit = log_transid;
out_wake_log_root:
mutex_lock(&log_root_tree->log_mutex);
@@ -3417,14 +3423,10 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
}
/*
- * Check if an inode was logged in the current transaction. We can't always rely
- * on an inode's logged_trans value, because it's an in-memory only field and
- * therefore not persisted. This means that its value is lost if the inode gets
- * evicted and loaded again from disk (in which case it has a value of 0, and
- * certainly it is smaller then any possible transaction ID), when that happens
- * the full_sync flag is set in the inode's runtime flags, so on that case we
- * assume eviction happened and ignore the logged_trans value, assuming the
- * worst case, that the inode was logged before in the current transaction.
+ * Check if an inode was logged in the current transaction. This may often
+ * return some false positives, because logged_trans is an in-memory-only field,
+ * not persisted anywhere. This is meant to be used in contexts where a false
+ * positive has no functional consequences.
*/
static bool inode_logged(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode)
@@ -3432,8 +3434,17 @@ static bool inode_logged(struct btrfs_trans_handle *trans,
if (inode->logged_trans == trans->transid)
return true;
- if (inode->last_trans == trans->transid &&
- test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
+ /*
+ * The inode's logged_trans is always 0 when we load it (because it is
+	 * not persisted in the inode item or elsewhere). So if it is 0 and the
+	 * inode was last modified in the current transaction, the inode may
+	 * have been logged before in the current transaction and then evicted
+	 * and loaded again - or it may never have been logged in the current
+	 * transaction. Since we cannot be sure, we have to assume it was;
+	 * otherwise our callers can leave an inconsistent log.
+ */
+ if (inode->logged_trans == 0 &&
+ inode->last_trans == trans->transid &&
!test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
return true;
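
Restating the decision in the comment above: an inode is treated as possibly logged when its logged_trans matches the running transaction, or when logged_trans is 0 (freshly loaded) but the inode was modified in this transaction and we are not replaying a log. A compact userspace model (field names invented, not the kernel structures):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct inode_model {
	uint64_t logged_trans; /* 0 right after the inode is loaded from disk   */
	uint64_t last_trans;   /* transaction that last modified the inode      */
};

static bool inode_maybe_logged(const struct inode_model *ino, uint64_t transid,
			       bool log_recovering)
{
	if (ino->logged_trans == transid)
		return true;
	/* Freshly loaded inode modified in this transaction: assume the worst. */
	if (ino->logged_trans == 0 && ino->last_trans == transid && !log_recovering)
		return true;
	return false;
}

int main(void)
{
	struct inode_model evicted = { .logged_trans = 0, .last_trans = 100 };

	printf("%d\n", inode_maybe_logged(&evicted, 100, false)); /* 1: assume logged */
	printf("%d\n", inode_maybe_logged(&evicted, 101, false)); /* 0: older trans   */
	return 0;
}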
@@ -3913,6 +3924,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
u64 logged_isize)
{
struct btrfs_map_token token;
+ u64 flags;
btrfs_init_map_token(&token, leaf);
@@ -3962,20 +3974,49 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
btrfs_set_token_inode_transid(&token, item, trans->transid);
btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
- btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
+ flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
+ BTRFS_I(inode)->ro_flags);
+ btrfs_set_token_inode_flags(&token, item, flags);
btrfs_set_token_inode_block_group(&token, item, 0);
}
static int log_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_root *log, struct btrfs_path *path,
- struct btrfs_inode *inode)
+ struct btrfs_inode *inode, bool inode_item_dropped)
{
struct btrfs_inode_item *inode_item;
int ret;
- ret = btrfs_insert_empty_item(trans, log, path,
- &inode->location, sizeof(*inode_item));
- if (ret && ret != -EEXIST)
+ /*
+ * If we are doing a fast fsync and the inode was logged before in the
+ * current transaction, then we know the inode was previously logged and
+ * it exists in the log tree. For performance reasons, in this case use
+ * btrfs_search_slot() directly with ins_len set to 0 so that we never
+ * attempt a write lock on the leaf's parent, which adds unnecessary lock
+ * contention in case there are concurrent fsyncs for other inodes of the
+ * same subvolume. Using btrfs_insert_empty_item() when the inode item
+ * already exists can also result in unnecessarily splitting a leaf.
+ */
+ if (!inode_item_dropped && inode->logged_trans == trans->transid) {
+ ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1);
+ ASSERT(ret <= 0);
+ if (ret > 0)
+ ret = -ENOENT;
+ } else {
+ /*
+ * This means it is the first fsync in the current transaction,
+ * so the inode item is not in the log and we need to insert it.
+	 * We can never get -EEXIST because we are only called for a fast
+	 * fsync, and if an inode eviction happens after the inode was
+	 * logged earlier in the current transaction, then when we load the
+	 * inode again we set BTRFS_INODE_NEEDS_FULL_SYNC in its runtime
+	 * flags and set ->logged_trans to 0.
+ */
+ ret = btrfs_insert_empty_item(trans, log, path, &inode->location,
+ sizeof(*inode_item));
+ ASSERT(ret != -EEXIST);
+ }
+ if (ret)
return ret;
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
@@ -4160,7 +4201,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
static int extent_cmp(void *priv, const struct list_head *a,
const struct list_head *b)
{
- struct extent_map *em1, *em2;
+ const struct extent_map *em1, *em2;
em1 = list_entry(a, struct extent_map, list);
em2 = list_entry(b, struct extent_map, list);
@@ -5053,8 +5094,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
/*
* Check the inode's logged_trans only instead of
* btrfs_inode_in_log(). This is because the last_log_commit of
- * the inode is not updated when we only log that it exists and
- * it has the full sync bit set (see btrfs_log_inode()).
+ * the inode is not updated when we only log that it exists (see
+ * btrfs_log_inode()).
*/
if (BTRFS_I(inode)->logged_trans == trans->transid) {
spin_unlock(&BTRFS_I(inode)->lock);
@@ -5299,6 +5340,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
bool need_log_inode_item = true;
bool xattrs_logged = false;
bool recursive_logging = false;
+ bool inode_item_dropped = true;
path = btrfs_alloc_path();
if (!path)
@@ -5433,6 +5475,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
} else {
if (inode_only == LOG_INODE_ALL)
fast_search = true;
+ inode_item_dropped = false;
goto log_extents;
}
@@ -5466,7 +5509,7 @@ log_extents:
btrfs_release_path(path);
btrfs_release_path(dst_path);
if (need_log_inode_item) {
- err = log_inode_item(trans, log, dst_path, inode);
+ err = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
if (err)
goto out_unlock;
/*
@@ -5573,6 +5616,13 @@ static bool need_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode)
{
/*
+	 * If a directory was not modified (no dentries added or removed), we can
+	 * and should avoid logging it.
+ */
+ if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
+ return false;
+
+ /*
* If this inode does not have new/updated/deleted xattrs since the last
* time it was logged and is flagged as logged in the current transaction,
* we can skip logging it. As for new/deleted names, those are updated in
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
new file mode 100644
index 000000000000..28d443d3ef93
--- /dev/null
+++ b/fs/btrfs/verity.c
@@ -0,0 +1,811 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/iversion.h>
+#include <linux/fsverity.h>
+#include <linux/sched/mm.h>
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "locking.h"
+
+/*
+ * Implementation of the interface defined in struct fsverity_operations.
+ *
+ * The main question is how and where to store the verity descriptor and the
+ * Merkle tree. We store both in dedicated btree items in the filesystem tree,
+ * together with the rest of the inode metadata. This means we'll need to do
+ * extra work to encrypt them once encryption is supported in btrfs, but btrfs
+ * has a lot of careful code around i_size and it seems better to make a new key
+ * type than try and adjust all of our expectations for i_size.
+ *
+ * Note that this differs from the implementation in ext4 and f2fs, where
+ * this data is stored as if it were in the file, but past EOF. However, btrfs
+ * does not have a widespread mechanism for caching opaque metadata pages, so we
+ * do pretend that the Merkle tree pages themselves are past EOF for the
+ * purposes of caching them (as opposed to creating a virtual inode).
+ *
+ * fs verity items are stored under two different key types on disk.
+ * The descriptor items:
+ * [ inode objectid, BTRFS_VERITY_DESC_ITEM_KEY, offset ]
+ *
+ * At offset 0, we store a btrfs_verity_descriptor_item which tracks the
+ * size of the descriptor item and some extra data for encryption.
+ * Starting at offset 1, these hold the generic fs verity descriptor.
+ * The latter are opaque to btrfs, we just read and write them as a blob for
+ * the higher level verity code. The most common descriptor size is 256 bytes.
+ *
+ * The merkle tree items:
+ * [ inode objectid, BTRFS_VERITY_MERKLE_ITEM_KEY, offset ]
+ *
+ * These also start at offset 0, and correspond to the merkle tree bytes.
+ * So when fsverity asks for page 0 of the merkle tree, we pull up one page
+ * starting at offset 0 for this key type. These are also opaque to btrfs,
+ * we're blindly storing whatever fsverity sends down.
+ *
+ * Another important consideration is the fact that the Merkle tree data scales
+ * linearly with the size of the file (with 4K pages/blocks and SHA-256, it's
+ * ~1/127th the size) so for large files, writing the tree can be a lengthy
+ * operation. For that reason, we guard the whole enable verity operation
+ * (between begin_enable_verity and end_enable_verity) with an orphan item.
+ * Again, because the data can be pretty large, it's quite possible that we
+ * could run out of space writing it, so we try our best to handle errors by
+ * stopping and rolling back rather than aborting the victim transaction.
+ */
+
+#define MERKLE_START_ALIGN 65536
+
+/*
+ * Compute the logical file offset where we cache the Merkle tree.
+ *
+ * @inode: inode of the verity file
+ *
+ * For the purposes of caching the Merkle tree pages, as required by
+ * fs-verity, it is convenient to do size computations in terms of a file
+ * offset, rather than in terms of page indices.
+ *
+ * Use 64K to be sure it's past the last page in the file, even with 64K pages.
+ * That rounding operation itself can overflow loff_t, so we do it in u64 and
+ * check.
+ *
+ * Returns the file offset on success, negative error code on failure.
+ */
+static loff_t merkle_file_pos(const struct inode *inode)
+{
+ u64 sz = inode->i_size;
+ u64 rounded = round_up(sz, MERKLE_START_ALIGN);
+
+ if (rounded > inode->i_sb->s_maxbytes)
+ return -EFBIG;
+
+ return rounded;
+}
+
+/*
+ * Drop all the items for this inode with this key_type.
+ *
+ * @inode: inode to drop items for
+ * @key_type: type of items to drop (BTRFS_VERITY_DESC_ITEM or
+ * BTRFS_VERITY_MERKLE_ITEM)
+ *
+ * Before doing a verity enable we cleanup any existing verity items.
+ * This is also used to clean up if a verity enable failed half way through.
+ *
+ * Returns number of dropped items on success, negative error code on failure.
+ */
+static int drop_verity_items(struct btrfs_inode *inode, u8 key_type)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int count = 0;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ while (1) {
+ /* 1 for the item being dropped */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ /*
+ * Walk backwards through all the items until we find one that
+ * isn't from our key type or objectid
+ */
+ key.objectid = btrfs_ino(inode);
+ key.type = key_type;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0) {
+ ret = 0;
+ /* No more keys of this type, we're done */
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ } else if (ret < 0) {
+ btrfs_end_transaction(trans);
+ goto out;
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ /* No more keys of this type, we're done */
+ if (key.objectid != btrfs_ino(inode) || key.type != key_type)
+ break;
+
+ /*
+ * This shouldn't be a performance sensitive function because
+ * it's not used as part of truncate. If it ever becomes
+ * perf sensitive, change this to walk forward and bulk delete
+ * items
+ */
+ ret = btrfs_del_items(trans, root, path, path->slots[0], 1);
+ if (ret) {
+ btrfs_end_transaction(trans);
+ goto out;
+ }
+ count++;
+ btrfs_release_path(path);
+ btrfs_end_transaction(trans);
+ }
+ ret = count;
+ btrfs_end_transaction(trans);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Drop all verity items
+ *
+ * @inode: inode to drop verity items for
+ *
+ * In most contexts where we are dropping verity items, we want to do it for all
+ * the types of verity items, not a particular one.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+int btrfs_drop_verity_items(struct btrfs_inode *inode)
+{
+ int ret;
+
+ ret = drop_verity_items(inode, BTRFS_VERITY_DESC_ITEM_KEY);
+ if (ret < 0)
+ return ret;
+ ret = drop_verity_items(inode, BTRFS_VERITY_MERKLE_ITEM_KEY);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+/*
+ * Insert and write inode items with a given key type and offset.
+ *
+ * @inode: inode to insert for
+ * @key_type: key type to insert
+ * @offset: item offset to insert at
+ * @src: source data to write
+ * @len: length of source data to write
+ *
+ * Write len bytes from src into items of up to 2K length.
+ * The inserted items will have key (ino, key_type, offset + off) where off is
+ * consecutively increasing from 0 up to the last item ending at offset + len.
+ *
+ * Returns 0 on success and a negative error code on failure.
+ */
+static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
+ const char *src, u64 len)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_path *path;
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ unsigned long copy_bytes;
+ unsigned long src_offset = 0;
+ void *data;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ while (len > 0) {
+ /* 1 for the new item being inserted */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+
+ key.objectid = btrfs_ino(inode);
+ key.type = key_type;
+ key.offset = offset;
+
+ /*
+ * Insert 2K at a time mostly to be friendly for smaller leaf
+ * size filesystems
+ */
+ copy_bytes = min_t(u64, len, 2048);
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, copy_bytes);
+ if (ret) {
+ btrfs_end_transaction(trans);
+ break;
+ }
+
+ leaf = path->nodes[0];
+
+ data = btrfs_item_ptr(leaf, path->slots[0], void);
+ write_extent_buffer(leaf, src + src_offset,
+ (unsigned long)data, copy_bytes);
+ offset += copy_bytes;
+ src_offset += copy_bytes;
+ len -= copy_bytes;
+
+ btrfs_release_path(path);
+ btrfs_end_transaction(trans);
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Read inode items of the given key type and offset from the btree.
+ *
+ * @inode: inode to read items of
+ * @key_type: key type to read
+ * @offset: item offset to read from
+ * @dest: Buffer to read into. This parameter has slightly tricky
+ * semantics. If it is NULL, the function will not do any copying
+ * and will just return the size of all the items up to len bytes.
+ * If dest_page is passed, then the function will kmap_local the
+ * page and ignore dest, but dest must still be non-NULL to avoid the
+ * counting-only behavior.
+ * @len: length in bytes to read
+ * @dest_page: copy into this page instead of the dest buffer
+ *
+ * Helper function to read items from the btree. This returns the number of
+ * bytes read or < 0 for errors. We can return short reads if the items don't
+ * exist on disk or aren't big enough to fill the desired length. Supports
+ * reading into a provided buffer (dest) or into the page cache
+ *
+ * Returns number of bytes read or a negative error code on failure.
+ */
+static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
+ char *dest, u64 len, struct page *dest_page)
+{
+ struct btrfs_path *path;
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ u64 item_end;
+ u64 copy_end;
+ int copied = 0;
+ u32 copy_offset;
+ unsigned long copy_bytes;
+ unsigned long dest_offset = 0;
+ void *data;
+ char *kaddr = dest;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ if (dest_page)
+ path->reada = READA_FORWARD;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = key_type;
+ key.offset = offset;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = 0;
+ if (path->slots[0] == 0)
+ goto out;
+ path->slots[0]--;
+ }
+
+ while (len > 0) {
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ if (key.objectid != btrfs_ino(inode) || key.type != key_type)
+ break;
+
+ item_end = btrfs_item_size_nr(leaf, path->slots[0]) + key.offset;
+
+ if (copied > 0) {
+ /*
+ * Once we've copied something, we want all of the items
+ * to be sequential
+ */
+ if (key.offset != offset)
+ break;
+ } else {
+ /*
+ * Our initial offset might be in the middle of an
+ * item. Make sure it all makes sense.
+ */
+ if (key.offset > offset)
+ break;
+ if (item_end <= offset)
+ break;
+ }
+
+		/* dest = NULL to just sum all the item lengths */
+ if (!dest)
+ copy_end = item_end;
+ else
+ copy_end = min(offset + len, item_end);
+
+ /* Number of bytes in this item we want to copy */
+ copy_bytes = copy_end - offset;
+
+ /* Offset from the start of item for copying */
+ copy_offset = offset - key.offset;
+
+ if (dest) {
+ if (dest_page)
+ kaddr = kmap_local_page(dest_page);
+
+ data = btrfs_item_ptr(leaf, path->slots[0], void);
+ read_extent_buffer(leaf, kaddr + dest_offset,
+ (unsigned long)data + copy_offset,
+ copy_bytes);
+
+ if (dest_page)
+ kunmap_local(kaddr);
+ }
+
+ offset += copy_bytes;
+ dest_offset += copy_bytes;
+ len -= copy_bytes;
+ copied += copy_bytes;
+
+ path->slots[0]++;
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ /*
+ * We've reached the last slot in this leaf and we need
+ * to go to the next leaf.
+ */
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ break;
+ } else if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ }
+ }
+out:
+ btrfs_free_path(path);
+ if (!ret)
+ ret = copied;
+ return ret;
+}
+
+/*
+ * Delete an fsverity orphan
+ *
+ * @trans: transaction to do the delete in
+ * @inode: inode to orphan
+ *
+ * Capture verity orphan specific logic that is repeated in the couple places
+ * we delete verity orphans. Specifically, handling ENOENT and ignoring inodes
+ * with 0 links.
+ *
+ * Returns zero on success or a negative error code on failure.
+ */
+static int del_orphan(struct btrfs_trans_handle *trans, struct btrfs_inode *inode)
+{
+ struct btrfs_root *root = inode->root;
+ int ret;
+
+ /*
+ * If the inode has no links, it is either already unlinked, or was
+ * created with O_TMPFILE. In either case, it should have an orphan from
+ * that other operation. Rather than reference count the orphans, we
+ * simply ignore them here, because we only invoke the verity path in
+ * the orphan logic when i_nlink is 1.
+ */
+ if (!inode->vfs_inode.i_nlink)
+ return 0;
+
+ ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
+ if (ret == -ENOENT)
+ ret = 0;
+ return ret;
+}
+
+/*
+ * Rollback in-progress verity if we encounter an error.
+ *
+ * @inode: inode verity had an error for
+ *
+ * We try to handle recoverable errors while enabling verity by rolling it back
+ * and just failing the operation, rather than having an fs level error no
+ * matter what. However, any error in rollback is unrecoverable.
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int rollback_verity(struct btrfs_inode *inode)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = inode->root;
+ int ret;
+
+ ASSERT(inode_is_locked(&inode->vfs_inode));
+ truncate_inode_pages(inode->vfs_inode.i_mapping, inode->vfs_inode.i_size);
+ clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
+ ret = btrfs_drop_verity_items(inode);
+ if (ret) {
+ btrfs_handle_fs_error(root->fs_info, ret,
+ "failed to drop verity items in rollback %llu",
+ (u64)inode->vfs_inode.i_ino);
+ goto out;
+ }
+
+ /*
+ * 1 for updating the inode flag
+ * 1 for deleting the orphan
+ */
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_handle_fs_error(root->fs_info, ret,
+ "failed to start transaction in verity rollback %llu",
+ (u64)inode->vfs_inode.i_ino);
+ goto out;
+ }
+ inode->ro_flags &= ~BTRFS_INODE_RO_VERITY;
+ btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ ret = del_orphan(trans, inode);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ btrfs_end_transaction(trans);
+out:
+ return ret;
+}
+
+/*
+ * Finalize making the file a valid verity file
+ *
+ * @inode: inode to be marked as verity
+ * @desc: contents of the verity descriptor to write (not NULL)
+ * @desc_size: size of the verity descriptor
+ *
+ * Do the actual work of finalizing verity after successfully writing the Merkle
+ * tree:
+ *
+ * - write out the descriptor items
+ * - mark the inode with the verity flag
+ * - delete the orphan item
+ * - mark the ro compat bit
+ * - clear the in progress bit
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int finish_verity(struct btrfs_inode *inode, const void *desc,
+ size_t desc_size)
+{
+ struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_verity_descriptor_item item;
+ int ret;
+
+ /* Write out the descriptor item */
+ memset(&item, 0, sizeof(item));
+ btrfs_set_stack_verity_descriptor_size(&item, desc_size);
+ ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 0,
+ (const char *)&item, sizeof(item));
+ if (ret)
+ goto out;
+
+ /* Write out the descriptor itself */
+ ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 1,
+ desc, desc_size);
+ if (ret)
+ goto out;
+
+ /*
+ * 1 for updating the inode flag
+ * 1 for deleting the orphan
+ */
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ inode->ro_flags |= BTRFS_INODE_RO_VERITY;
+ btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret)
+ goto end_trans;
+ ret = del_orphan(trans, inode);
+ if (ret)
+ goto end_trans;
+ clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
+ btrfs_set_fs_compat_ro(root->fs_info, VERITY);
+end_trans:
+ btrfs_end_transaction(trans);
+out:
+ return ret;
+}
+
+/*
+ * fsverity op that begins enabling verity.
+ *
+ * @filp: file to enable verity on
+ *
+ * Begin enabling fsverity for the file. We drop any existing verity items, add
+ * an orphan and set the in progress bit.
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int btrfs_begin_enable_verity(struct file *filp)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(filp));
+ struct btrfs_root *root = inode->root;
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ ASSERT(inode_is_locked(file_inode(filp)));
+
+ if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags))
+ return -EBUSY;
+
+ /*
+ * This should almost never do anything, but theoretically, it's
+ * possible that we failed to enable verity on a file, then were
+ * interrupted or failed while rolling back, failed to clean up the
+ * orphan, and are now attempting to enable verity again.
+ */
+ ret = btrfs_drop_verity_items(inode);
+ if (ret)
+ return ret;
+
+ /* 1 for the orphan item */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_orphan_add(trans, inode);
+ if (!ret)
+ set_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
+ btrfs_end_transaction(trans);
+
+ return ret;
+}
+
+/*
+ * fsverity op that ends enabling verity.
+ *
+ * @filp: file we are finishing enabling verity on
+ * @desc: verity descriptor to write out (NULL in error conditions)
+ * @desc_size: size of the verity descriptor (variable with signatures)
+ * @merkle_tree_size: size of the merkle tree in bytes
+ *
+ * If desc is NULL, then the VFS is signaling that an error occurred during
+ * verity enable, and we should try to roll back. Otherwise, attempt to finish
+ * verity.
+ *
+ * Returns 0 on success, negative error code on error.
+ */
+static int btrfs_end_enable_verity(struct file *filp, const void *desc,
+ size_t desc_size, u64 merkle_tree_size)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(filp));
+ int ret = 0;
+ int rollback_ret;
+
+ ASSERT(inode_is_locked(file_inode(filp)));
+
+ if (desc == NULL)
+ goto rollback;
+
+ ret = finish_verity(inode, desc, desc_size);
+ if (ret)
+ goto rollback;
+ return ret;
+
+rollback:
+ rollback_ret = rollback_verity(inode);
+ if (rollback_ret)
+ btrfs_err(inode->root->fs_info,
+ "failed to rollback verity items: %d", rollback_ret);
+ return ret;
+}
+
+/*
+ * fsverity op that gets the struct fsverity_descriptor.
+ *
+ * @inode: inode to get the descriptor of
+ * @buf: output buffer for the descriptor contents
+ * @buf_size: size of the output buffer. 0 to query the size
+ *
+ * fsverity does a two-pass setup for reading the descriptor: in the first pass
+ * it calls with buf_size = 0 to query the size of the descriptor, and in the
+ * second pass it actually reads the descriptor off disk.
+ *
+ * Returns the size on success or a negative error code on failure.
+ */
+static int btrfs_get_verity_descriptor(struct inode *inode, void *buf,
+ size_t buf_size)
+{
+ u64 true_size;
+ int ret = 0;
+ struct btrfs_verity_descriptor_item item;
+
+ memset(&item, 0, sizeof(item));
+ ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_DESC_ITEM_KEY, 0,
+ (char *)&item, sizeof(item), NULL);
+ if (ret < 0)
+ return ret;
+
+ if (item.reserved[0] != 0 || item.reserved[1] != 0)
+ return -EUCLEAN;
+
+ true_size = btrfs_stack_verity_descriptor_size(&item);
+ if (true_size > INT_MAX)
+ return -EUCLEAN;
+
+ if (buf_size == 0)
+ return true_size;
+ if (buf_size < true_size)
+ return -ERANGE;
+
+ ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_DESC_ITEM_KEY, 1,
+ buf, buf_size, NULL);
+ if (ret < 0)
+ return ret;
+ if (ret != true_size)
+ return -EIO;
+
+ return true_size;
+}
+
+/*
+ * fsverity op that reads and caches a merkle tree page.
+ *
+ * @inode: inode to read a merkle tree page for
+ * @index: page index relative to the start of the merkle tree
+ * @num_ra_pages: number of pages to readahead. Optional, we ignore it
+ *
+ * The Merkle tree is stored in the filesystem btree, but its pages are cached
+ * with a logical position past EOF in the inode's mapping.
+ *
+ * Returns the page we read, or an ERR_PTR on error.
+ */
+static struct page *btrfs_read_merkle_tree_page(struct inode *inode,
+ pgoff_t index,
+ unsigned long num_ra_pages)
+{
+ struct page *page;
+ u64 off = (u64)index << PAGE_SHIFT;
+ loff_t merkle_pos = merkle_file_pos(inode);
+ int ret;
+
+ if (merkle_pos < 0)
+ return ERR_PTR(merkle_pos);
+ if (merkle_pos > inode->i_sb->s_maxbytes - off - PAGE_SIZE)
+ return ERR_PTR(-EFBIG);
+ index += merkle_pos >> PAGE_SHIFT;
+again:
+ page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
+ if (page) {
+ if (PageUptodate(page))
+ return page;
+
+ lock_page(page);
+ /*
+ * We only insert uptodate pages, so !Uptodate has to be
+ * an error
+ */
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
+ unlock_page(page);
+ return page;
+ }
+
+ page = __page_cache_alloc(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS));
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Merkle item keys are indexed from byte 0 in the merkle tree.
+ * They have the form:
+ *
+ * [ inode objectid, BTRFS_MERKLE_ITEM_KEY, offset in bytes ]
+ */
+ ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off,
+ page_address(page), PAGE_SIZE, page);
+ if (ret < 0) {
+ put_page(page);
+ return ERR_PTR(ret);
+ }
+ if (ret < PAGE_SIZE)
+ memzero_page(page, ret, PAGE_SIZE - ret);
+
+ SetPageUptodate(page);
+ ret = add_to_page_cache_lru(page, inode->i_mapping, index, GFP_NOFS);
+
+ if (!ret) {
+ /* Inserted and ready for fsverity */
+ unlock_page(page);
+ } else {
+ put_page(page);
+ /* Did someone race us into inserting this page? */
+ if (ret == -EEXIST)
+ goto again;
+ page = ERR_PTR(ret);
+ }
+ return page;
+}
+
+/*
+ * fsverity op that writes a Merkle tree block into the btree.
+ *
+ * @inode: inode to write a Merkle tree block for
+ * @buf: Merkle tree data block to write
+ * @index: index of the block in the Merkle tree
+ * @log_blocksize: log base 2 of the Merkle tree block size
+ *
+ * Note that the block size could be different from the page size, so it is not
+ * safe to assume that index is a page index.
+ *
+ * Returns 0 on success or negative error code on failure
+ */
+static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf,
+ u64 index, int log_blocksize)
+{
+ u64 off = index << log_blocksize;
+ u64 len = 1ULL << log_blocksize;
+ loff_t merkle_pos = merkle_file_pos(inode);
+
+ if (merkle_pos < 0)
+ return merkle_pos;
+ if (merkle_pos > inode->i_sb->s_maxbytes - off - len)
+ return -EFBIG;
+
+ return write_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY,
+ off, buf, len);
+}
+
+const struct fsverity_operations btrfs_verityops = {
+ .begin_enable_verity = btrfs_begin_enable_verity,
+ .end_enable_verity = btrfs_end_enable_verity,
+ .get_verity_descriptor = btrfs_get_verity_descriptor,
+ .read_merkle_tree_page = btrfs_read_merkle_tree_page,
+ .write_merkle_tree_block = btrfs_write_merkle_tree_block,
+};
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 70f94b75f25a..ec3a874165de 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -38,7 +38,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.sub_stripes = 2,
.dev_stripes = 1,
.devs_max = 0, /* 0 == as many as possible */
- .devs_min = 4,
+ .devs_min = 2,
.tolerated_failures = 1,
.devs_increment = 2,
.ncopies = 2,
@@ -103,7 +103,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.sub_stripes = 1,
.dev_stripes = 1,
.devs_max = 0,
- .devs_min = 2,
+ .devs_min = 1,
.tolerated_failures = 0,
.devs_increment = 1,
.ncopies = 1,
@@ -153,6 +153,32 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
},
};
+/*
+ * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
+ * can be used as index to access btrfs_raid_array[].
+ */
+enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
+{
+ if (flags & BTRFS_BLOCK_GROUP_RAID10)
+ return BTRFS_RAID_RAID10;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID1)
+ return BTRFS_RAID_RAID1;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
+ return BTRFS_RAID_RAID1C3;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
+ return BTRFS_RAID_RAID1C4;
+ else if (flags & BTRFS_BLOCK_GROUP_DUP)
+ return BTRFS_RAID_DUP;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID0)
+ return BTRFS_RAID_RAID0;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID5)
+ return BTRFS_RAID_RAID5;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID6)
+ return BTRFS_RAID_RAID6;
+
+ return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
+}
+
const char *btrfs_bg_type_to_raid_name(u64 flags)
{
const int index = btrfs_bg_flags_to_raid_index(flags);
@@ -404,44 +430,6 @@ void __exit btrfs_cleanup_fs_uuids(void)
}
}
-/*
- * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
- * Returned struct is not linked onto any lists and must be destroyed using
- * btrfs_free_device.
- */
-static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_device *dev;
-
- dev = kzalloc(sizeof(*dev), GFP_KERNEL);
- if (!dev)
- return ERR_PTR(-ENOMEM);
-
- /*
- * Preallocate a bio that's always going to be used for flushing device
- * barriers and matches the device lifespan
- */
- dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
- if (!dev->flush_bio) {
- kfree(dev);
- return ERR_PTR(-ENOMEM);
- }
-
- INIT_LIST_HEAD(&dev->dev_list);
- INIT_LIST_HEAD(&dev->dev_alloc_list);
- INIT_LIST_HEAD(&dev->post_commit_list);
-
- atomic_set(&dev->reada_in_flight, 0);
- atomic_set(&dev->dev_stats_ccnt, 0);
- btrfs_device_data_ordered_init(dev);
- INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
- INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
- extent_io_tree_init(fs_info, &dev->alloc_state,
- IO_TREE_DEVICE_ALLOC_STATE, NULL);
-
- return dev;
-}
-
static noinline struct btrfs_fs_devices *find_fsid(
const u8 *fsid, const u8 *metadata_fsid)
{
@@ -1130,6 +1118,9 @@ static void btrfs_close_one_device(struct btrfs_device *device)
fs_devices->rw_devices--;
}
+ if (device->devid == BTRFS_DEV_REPLACE_DEVID)
+ clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
+
if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
fs_devices->missing_devices--;
@@ -1228,7 +1219,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
static int devid_cmp(void *priv, const struct list_head *a,
const struct list_head *b)
{
- struct btrfs_device *dev1, *dev2;
+ const struct btrfs_device *dev1, *dev2;
dev1 = list_entry(a, struct btrfs_device, dev_list);
dev2 = list_entry(b, struct btrfs_device, dev_list);
@@ -1598,14 +1589,9 @@ again:
key.offset = search_start;
key.type = BTRFS_DEV_EXTENT_KEY;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ret = btrfs_search_backwards(root, &key, path);
if (ret < 0)
goto out;
- if (ret > 0) {
- ret = btrfs_previous_item(root, path, key.objectid, key.type);
- if (ret < 0)
- goto out;
- }
while (1) {
l = path->nodes[0];
@@ -1759,48 +1745,6 @@ out:
return ret;
}
-static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
- struct btrfs_device *device,
- u64 chunk_offset, u64 start, u64 num_bytes)
-{
- int ret;
- struct btrfs_path *path;
- struct btrfs_fs_info *fs_info = device->fs_info;
- struct btrfs_root *root = fs_info->dev_root;
- struct btrfs_dev_extent *extent;
- struct extent_buffer *leaf;
- struct btrfs_key key;
-
- WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
- WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- key.objectid = device->devid;
- key.offset = start;
- key.type = BTRFS_DEV_EXTENT_KEY;
- ret = btrfs_insert_empty_item(trans, root, path, &key,
- sizeof(*extent));
- if (ret)
- goto out;
-
- leaf = path->nodes[0];
- extent = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_dev_extent);
- btrfs_set_dev_extent_chunk_tree(leaf, extent,
- BTRFS_CHUNK_TREE_OBJECTID);
- btrfs_set_dev_extent_chunk_objectid(leaf, extent,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID);
- btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
-
- btrfs_set_dev_extent_length(leaf, extent, num_bytes);
- btrfs_mark_buffer_dirty(leaf);
-out:
- btrfs_free_path(path);
- return ret;
-}
-
static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
struct extent_map_tree *em_tree;
@@ -2003,12 +1947,8 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
if (!(all_avail & btrfs_raid_array[i].bg_flag))
continue;
- if (num_devices < btrfs_raid_array[i].devs_min) {
- int ret = btrfs_raid_array[i].mindev_error;
-
- if (ret)
- return ret;
- }
+ if (num_devices < btrfs_raid_array[i].devs_min)
+ return btrfs_raid_array[i].mindev_error;
}
return 0;
@@ -2137,7 +2077,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
if (IS_ERR(device)) {
if (PTR_ERR(device) == -ENOENT &&
- strcmp(device_path, "missing") == 0)
+ device_path && strcmp(device_path, "missing") == 0)
ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
else
ret = PTR_ERR(device);
@@ -3622,10 +3562,7 @@ static u64 calc_data_stripes(u64 type, int num_stripes)
const int ncopies = btrfs_raid_array[index].ncopies;
const int nparity = btrfs_raid_array[index].nparity;
- if (nparity)
- return num_stripes - nparity;
- else
- return num_stripes / ncopies;
+ return (num_stripes - nparity) / ncopies;
}
/* [pstart, pend) */
@@ -4025,6 +3962,13 @@ static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
return true;
+ if (fs_info->sectorsize < PAGE_SIZE &&
+ bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ btrfs_err(fs_info,
+ "RAID56 is not yet supported for sectorsize %u with page size %lu",
+ fs_info->sectorsize, PAGE_SIZE);
+ return false;
+ }
/* Profile is valid and does not have bits outside of the allowed set */
if (alloc_profile_is_valid(bargs->target, 1) &&
(bargs->target & ~allowed) == 0)
@@ -5464,56 +5408,6 @@ out:
}
/*
- * This function, btrfs_finish_chunk_alloc(), belongs to phase 2.
- *
- * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
- * phases.
- */
-int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
- u64 chunk_offset, u64 chunk_size)
-{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_device *device;
- struct extent_map *em;
- struct map_lookup *map;
- u64 dev_offset;
- u64 stripe_size;
- int i;
- int ret = 0;
-
- em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
- if (IS_ERR(em))
- return PTR_ERR(em);
-
- map = em->map_lookup;
- stripe_size = em->orig_block_len;
-
- /*
- * Take the device list mutex to prevent races with the final phase of
- * a device replace operation that replaces the device object associated
- * with the map's stripes, because the device object's id can change
- * at any time during that final phase of the device replace operation
- * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
- * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
- * resulting in persisting a device extent item with such ID.
- */
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
- for (i = 0; i < map->num_stripes; i++) {
- device = map->stripes[i].dev;
- dev_offset = map->stripes[i].physical;
-
- ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
- dev_offset, stripe_size);
- if (ret)
- break;
- }
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-
- free_extent_map(em);
- return ret;
-}
-
-/*
* This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the
* phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
* chunks.
@@ -6923,9 +6817,31 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
if (WARN_ON(!devid && !fs_info))
return ERR_PTR(-EINVAL);
- dev = __alloc_device(fs_info);
- if (IS_ERR(dev))
- return dev;
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Preallocate a bio that's always going to be used for flushing device
+ * barriers and matches the device lifespan
+ */
+ dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
+ if (!dev->flush_bio) {
+ kfree(dev);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ INIT_LIST_HEAD(&dev->dev_list);
+ INIT_LIST_HEAD(&dev->dev_alloc_list);
+ INIT_LIST_HEAD(&dev->post_commit_list);
+
+ atomic_set(&dev->reada_in_flight, 0);
+ atomic_set(&dev->dev_stats_ccnt, 0);
+ btrfs_device_data_ordered_init(dev);
+ INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+ INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+ extent_io_tree_init(fs_info, &dev->alloc_state,
+ IO_TREE_DEVICE_ALLOC_STATE, NULL);
if (devid)
tmp = *devid;
@@ -6961,15 +6877,7 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
{
- int index = btrfs_bg_flags_to_raid_index(type);
- int ncopies = btrfs_raid_array[index].ncopies;
- const int nparity = btrfs_raid_array[index].nparity;
- int data_stripes;
-
- if (nparity)
- data_stripes = num_stripes - nparity;
- else
- data_stripes = num_stripes / ncopies;
+ const int data_stripes = calc_data_stripes(type, num_stripes);
return div_u64(chunk_len, data_stripes);
}
@@ -8144,7 +8052,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
goto out;
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_item(root, path);
+ ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto out;
/* No dev extents at all? Not good */
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 55a8ba244716..b082250b42e0 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -508,8 +508,6 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
u64 logical, u64 len);
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
u64 logical);
-int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
- u64 chunk_offset, u64 chunk_size);
int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
struct btrfs_block_group *bg);
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
@@ -568,32 +566,6 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
atomic_inc(&dev->dev_stats_ccnt);
}
-/*
- * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
- * can be used as index to access btrfs_raid_array[].
- */
-static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags)
-{
- if (flags & BTRFS_BLOCK_GROUP_RAID10)
- return BTRFS_RAID_RAID10;
- else if (flags & BTRFS_BLOCK_GROUP_RAID1)
- return BTRFS_RAID_RAID1;
- else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
- return BTRFS_RAID_RAID1C3;
- else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
- return BTRFS_RAID_RAID1C4;
- else if (flags & BTRFS_BLOCK_GROUP_DUP)
- return BTRFS_RAID_DUP;
- else if (flags & BTRFS_BLOCK_GROUP_RAID0)
- return BTRFS_RAID_RAID0;
- else if (flags & BTRFS_BLOCK_GROUP_RAID5)
- return BTRFS_RAID_RAID5;
- else if (flags & BTRFS_BLOCK_GROUP_RAID6)
- return BTRFS_RAID_RAID6;
-
- return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
-}
-
void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void);
@@ -603,6 +575,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
struct block_device *bdev,
const char *device_path);
+enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags);
int btrfs_bg_type_to_factor(u64 flags);
const char *btrfs_bg_type_to_raid_name(u64 flags);
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index c3fa7d3fa770..8afa90074891 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -121,12 +121,12 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
workspace->strm.total_in = 0;
workspace->strm.total_out = 0;
- out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ out_page = alloc_page(GFP_NOFS);
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
}
- cpage_out = kmap(out_page);
+ cpage_out = page_address(out_page);
pages[0] = out_page;
nr_pages = 1;
@@ -148,26 +148,22 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
int i;
for (i = 0; i < in_buf_pages; i++) {
- if (in_page) {
- kunmap(in_page);
+ if (in_page)
put_page(in_page);
- }
in_page = find_get_page(mapping,
start >> PAGE_SHIFT);
- data_in = kmap(in_page);
+ data_in = page_address(in_page);
memcpy(workspace->buf + i * PAGE_SIZE,
data_in, PAGE_SIZE);
start += PAGE_SIZE;
}
workspace->strm.next_in = workspace->buf;
} else {
- if (in_page) {
- kunmap(in_page);
+ if (in_page)
put_page(in_page);
- }
in_page = find_get_page(mapping,
start >> PAGE_SHIFT);
- data_in = kmap(in_page);
+ data_in = page_address(in_page);
start += PAGE_SIZE;
workspace->strm.next_in = data_in;
}
@@ -196,18 +192,17 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
* the stream end if required
*/
if (workspace->strm.avail_out == 0) {
- kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ out_page = alloc_page(GFP_NOFS);
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
}
- cpage_out = kmap(out_page);
+ cpage_out = page_address(out_page);
pages[nr_pages] = out_page;
nr_pages++;
workspace->strm.avail_out = PAGE_SIZE;
@@ -234,18 +229,17 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
} else if (workspace->strm.avail_out == 0) {
/* get another page for the stream end */
- kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ out_page = alloc_page(GFP_NOFS);
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
}
- cpage_out = kmap(out_page);
+ cpage_out = page_address(out_page);
pages[nr_pages] = out_page;
nr_pages++;
workspace->strm.avail_out = PAGE_SIZE;
@@ -264,13 +258,8 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_in = workspace->strm.total_in;
out:
*out_pages = nr_pages;
- if (out_page)
- kunmap(out_page);
-
- if (in_page) {
- kunmap(in_page);
+ if (in_page)
put_page(in_page);
- }
return ret;
}
@@ -286,10 +275,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
unsigned long buf_start;
struct page **pages_in = cb->compressed_pages;
- u64 disk_start = cb->start;
- struct bio *orig_bio = cb->orig_bio;
- data_in = kmap(pages_in[page_in_index]);
+ data_in = page_address(pages_in[page_in_index]);
workspace->strm.next_in = data_in;
workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE);
workspace->strm.total_in = 0;
@@ -311,7 +298,6 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
pr_warn("BTRFS: inflateInit failed\n");
- kunmap(pages_in[page_in_index]);
return -EIO;
}
while (workspace->strm.total_in < srclen) {
@@ -326,9 +312,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (buf_start == total_out)
break;
- ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
- total_out, disk_start,
- orig_bio);
+ ret2 = btrfs_decompress_buf2page(workspace->buf,
+ total_out - buf_start, cb, buf_start);
if (ret2 == 0) {
ret = 0;
goto done;
@@ -339,17 +324,16 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (workspace->strm.avail_in == 0) {
unsigned long tmp;
- kunmap(pages_in[page_in_index]);
+
page_in_index++;
if (page_in_index >= total_pages_in) {
data_in = NULL;
break;
}
- data_in = kmap(pages_in[page_in_index]);
+ data_in = page_address(pages_in[page_in_index]);
workspace->strm.next_in = data_in;
tmp = srclen - workspace->strm.total_in;
- workspace->strm.avail_in = min(tmp,
- PAGE_SIZE);
+ workspace->strm.avail_in = min(tmp, PAGE_SIZE);
}
}
if (ret != Z_STREAM_END)
@@ -358,10 +342,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
ret = 0;
done:
zlib_inflateEnd(&workspace->strm);
- if (data_in)
- kunmap(pages_in[page_in_index]);
if (!ret)
- zero_fill_bio(orig_bio);
+ zero_fill_bio(cb->orig_bio);
return ret;
}
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 907c2cc45c9c..47af1ab3bf12 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -245,7 +245,7 @@ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
goto out;
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_item(root, path);
+ ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto out;
/* No dev extents at all? Not good */
@@ -296,7 +296,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_zoned_device_info *zone_info = NULL;
struct block_device *bdev = device->bdev;
- struct request_queue *queue = bdev_get_queue(bdev);
sector_t nr_sectors;
sector_t sector = 0;
struct blk_zone *zones = NULL;
@@ -348,19 +347,10 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
nr_sectors = bdev_nr_sectors(bdev);
zone_info->zone_size_shift = ilog2(zone_info->zone_size);
- zone_info->max_zone_append_size =
- (u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT;
zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
if (!IS_ALIGNED(nr_sectors, zone_sectors))
zone_info->nr_zones++;
- if (bdev_is_zoned(bdev) && zone_info->max_zone_append_size == 0) {
- btrfs_err(fs_info, "zoned: device %pg does not support zone append",
- bdev);
- ret = -EINVAL;
- goto out;
- }
-
zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
if (!zone_info->seq_zones) {
ret = -ENOMEM;
@@ -529,7 +519,6 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
u64 zoned_devices = 0;
u64 nr_devices = 0;
u64 zone_size = 0;
- u64 max_zone_append_size = 0;
const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
int ret = 0;
@@ -565,11 +554,6 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
ret = -EINVAL;
goto out;
}
- if (!max_zone_append_size ||
- (zone_info->max_zone_append_size &&
- zone_info->max_zone_append_size < max_zone_append_size))
- max_zone_append_size =
- zone_info->max_zone_append_size;
}
nr_devices++;
}
@@ -619,7 +603,6 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
}
fs_info->zone_size = zone_size;
- fs_info->max_zone_append_size = max_zone_append_size;
fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
/*
@@ -1318,9 +1301,6 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
if (!btrfs_is_zoned(fs_info))
return false;
- if (!fs_info->max_zone_append_size)
- return false;
-
if (!is_data_inode(&inode->vfs_inode))
return false;
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index b0ae2608cb6b..4b299705bb12 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -22,7 +22,6 @@ struct btrfs_zoned_device_info {
*/
u64 zone_size;
u8 zone_size_shift;
- u64 max_zone_append_size;
u32 nr_zones;
unsigned long *seq_zones;
unsigned long *empty_zones;
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 3e26b466476a..56dce9f00988 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -399,19 +399,19 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
/* map in the first page of input data */
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- workspace->in_buf.src = kmap(in_page);
+ workspace->in_buf.src = page_address(in_page);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
/* Allocate and map in the output buffer */
- out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ out_page = alloc_page(GFP_NOFS);
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = kmap(out_page);
+ workspace->out_buf.dst = page_address(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
@@ -446,19 +446,18 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
if (workspace->out_buf.pos == workspace->out_buf.size) {
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
- kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ out_page = alloc_page(GFP_NOFS);
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = kmap(out_page);
+ workspace->out_buf.dst = page_address(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out,
PAGE_SIZE);
@@ -473,13 +472,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
/* Check if we need more input */
if (workspace->in_buf.pos == workspace->in_buf.size) {
tot_in += PAGE_SIZE;
- kunmap(in_page);
put_page(in_page);
start += PAGE_SIZE;
len -= PAGE_SIZE;
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- workspace->in_buf.src = kmap(in_page);
+ workspace->in_buf.src = page_address(in_page);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
}
@@ -506,19 +504,18 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
- kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ out_page = alloc_page(GFP_NOFS);
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = kmap(out_page);
+ workspace->out_buf.dst = page_address(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
}
@@ -534,12 +531,8 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
out:
*out_pages = nr_pages;
/* Cleanup */
- if (in_page) {
- kunmap(in_page);
+ if (in_page)
put_page(in_page);
- }
- if (out_page)
- kunmap(out_page);
return ret;
}
@@ -547,8 +540,6 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
struct page **pages_in = cb->compressed_pages;
- u64 disk_start = cb->start;
- struct bio *orig_bio = cb->orig_bio;
size_t srclen = cb->compressed_len;
ZSTD_DStream *stream;
int ret = 0;
@@ -565,7 +556,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
goto done;
}
- workspace->in_buf.src = kmap(pages_in[page_in_index]);
+ workspace->in_buf.src = page_address(pages_in[page_in_index]);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
@@ -589,7 +580,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
workspace->out_buf.pos = 0;
ret = btrfs_decompress_buf2page(workspace->out_buf.dst,
- buf_start, total_out, disk_start, orig_bio);
+ total_out - buf_start, cb, buf_start);
if (ret == 0)
break;
@@ -601,23 +592,21 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
break;
if (workspace->in_buf.pos == workspace->in_buf.size) {
- kunmap(pages_in[page_in_index++]);
+ page_in_index++;
if (page_in_index >= total_pages_in) {
workspace->in_buf.src = NULL;
ret = -EIO;
goto done;
}
srclen -= PAGE_SIZE;
- workspace->in_buf.src = kmap(pages_in[page_in_index]);
+ workspace->in_buf.src = page_address(pages_in[page_in_index]);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
}
}
ret = 0;
- zero_fill_bio(orig_bio);
+ zero_fill_bio(cb->orig_bio);
done:
- if (workspace->in_buf.src)
- kunmap(pages_in[page_in_index]);
return ret;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 6290c3afdba4..ab7573d72dd7 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1912,7 +1912,7 @@ EXPORT_SYMBOL(page_zero_new_buffers);
static void
iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
- struct iomap *iomap)
+ const struct iomap *iomap)
{
loff_t offset = block << inode->i_blkbits;
@@ -1966,7 +1966,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
}
int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
- get_block_t *get_block, struct iomap *iomap)
+ get_block_t *get_block, const struct iomap *iomap)
{
unsigned from = pos & (PAGE_SIZE - 1);
unsigned to = from + len;
@@ -3268,33 +3268,6 @@ out:
EXPORT_SYMBOL(try_to_free_buffers);
/*
- * There are no bdflush tunables left. But distributions are
- * still running obsolete flush daemons, so we terminate them here.
- *
- * Use of bdflush() is deprecated and will be removed in a future kernel.
- * The `flush-X' kernel threads fully replace bdflush daemons and this call.
- */
-SYSCALL_DEFINE2(bdflush, int, func, long, data)
-{
- static int msg_count;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (msg_count < 5) {
- msg_count++;
- printk(KERN_INFO
- "warning: process `%s' used the obsolete bdflush"
- " system call\n", current->comm);
- printk(KERN_INFO "Fix your initscripts?\n");
- }
-
- if (func == 1)
- do_exit(0);
- return 0;
-}
-
-/*
* Buffer-head allocation
*/
static struct kmem_cache *bh_cachep __read_mostly;
diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig
index ff9ca55a9ae9..6827b40f7ddc 100644
--- a/fs/cachefiles/Kconfig
+++ b/fs/cachefiles/Kconfig
@@ -19,22 +19,3 @@ config CACHEFILES_DEBUG
caching on files module. If this is set, the debugging output may be
enabled by setting bits in /sys/modules/cachefiles/parameter/debug or
by including a debugging specifier in /etc/cachefilesd.conf.
-
-config CACHEFILES_HISTOGRAM
- bool "Gather latency information on CacheFiles"
- depends on CACHEFILES && PROC_FS
- help
-
- This option causes latency information to be gathered on CacheFiles
- operation and exported through file:
-
- /proc/fs/cachefiles/histogram
-
- The generation of this histogram adds a certain amount of overhead to
- execution as there are a number of points at which data is gathered,
- and on a multi-CPU system these may be on cachelines that keep
- bouncing between CPUs. On the other hand, the histogram may be
- useful for debugging purposes. Saying 'N' here is recommended.
-
- See Documentation/filesystems/caching/cachefiles.rst for more
- information.
diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile
index 2227dc2d5498..02fd17731769 100644
--- a/fs/cachefiles/Makefile
+++ b/fs/cachefiles/Makefile
@@ -15,6 +15,4 @@ cachefiles-y := \
security.o \
xattr.o
-cachefiles-$(CONFIG_CACHEFILES_HISTOGRAM) += proc.o
-
obj-$(CONFIG_CACHEFILES) := cachefiles.o
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index 38bb7764b454..d463d89f5db8 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -108,8 +108,6 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
atomic_set(&fsdef->usage, 1);
fsdef->type = FSCACHE_COOKIE_TYPE_INDEX;
- _debug("- fsdef %p", fsdef);
-
/* look up the directory at the root of the cache */
ret = kern_path(cache->rootdirname, LOOKUP_DIRECTORY, &path);
if (ret < 0)
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index da3948fdb615..da28ac1fa225 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -33,7 +33,7 @@ static struct fscache_object *cachefiles_alloc_object(
cache = container_of(_cache, struct cachefiles_cache, cache);
- _enter("{%s},%p,", cache->cache.identifier, cookie);
+ _enter("{%s},%x,", cache->cache.identifier, cookie->debug_id);
lookup_data = kmalloc(sizeof(*lookup_data), cachefiles_gfp);
if (!lookup_data)
@@ -96,7 +96,7 @@ static struct fscache_object *cachefiles_alloc_object(
lookup_data->key = key;
object->lookup_data = lookup_data;
- _leave(" = %p [%p]", &object->fscache, lookup_data);
+ _leave(" = %x [%p]", object->fscache.debug_id, lookup_data);
return &object->fscache;
nomem_key:
@@ -379,7 +379,7 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache)
const struct cred *saved_cred;
int ret;
- _enter("%p", _cache);
+ _enter("%s", _cache->tag->name);
cache = container_of(_cache, struct cachefiles_cache, cache);
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 4ed83aa5253b..0a511c36dab8 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -181,31 +181,6 @@ extern int cachefiles_check_in_use(struct cachefiles_cache *cache,
struct dentry *dir, char *filename);
/*
- * proc.c
- */
-#ifdef CONFIG_CACHEFILES_HISTOGRAM
-extern atomic_t cachefiles_lookup_histogram[HZ];
-extern atomic_t cachefiles_mkdir_histogram[HZ];
-extern atomic_t cachefiles_create_histogram[HZ];
-
-extern int __init cachefiles_proc_init(void);
-extern void cachefiles_proc_cleanup(void);
-static inline
-void cachefiles_hist(atomic_t histogram[], unsigned long start_jif)
-{
- unsigned long jif = jiffies - start_jif;
- if (jif >= HZ)
- jif = HZ - 1;
- atomic_inc(&histogram[jif]);
-}
-
-#else
-#define cachefiles_proc_init() (0)
-#define cachefiles_proc_cleanup() do {} while (0)
-#define cachefiles_hist(hist, start_jif) do {} while (0)
-#endif
-
-/*
* rdwr.c
*/
extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *,
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index b13fb45fc3f3..fac2e8e7b533 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -70,7 +70,7 @@ static int cachefiles_read(struct netfs_cache_resources *cres,
_enter("%pD,%li,%llx,%zx/%llx",
file, file_inode(file)->i_ino, start_pos, len,
- i_size_read(file->f_inode));
+ i_size_read(file_inode(file)));
/* If the caller asked us to seek for data before doing the read, then
* we should do that now. If we find a gap, we fill it with zeros.
@@ -194,7 +194,7 @@ static int cachefiles_write(struct netfs_cache_resources *cres,
_enter("%pD,%li,%llx,%zx/%llx",
file, file_inode(file)->i_ino, start_pos, len,
- i_size_read(file->f_inode));
+ i_size_read(file_inode(file)));
ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL);
if (!ki)
@@ -410,7 +410,7 @@ int cachefiles_begin_read_operation(struct netfs_read_request *rreq,
rreq->cache_resources.cache_priv = op;
rreq->cache_resources.cache_priv2 = file;
rreq->cache_resources.ops = &cachefiles_netfs_cache_ops;
- rreq->cookie_debug_id = object->fscache.debug_id;
+ rreq->cache_resources.debug_id = object->fscache.debug_id;
_leave("");
return 0;
diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c
index be96f5fc5cac..7f94efc97e23 100644
--- a/fs/cachefiles/key.c
+++ b/fs/cachefiles/key.c
@@ -150,6 +150,6 @@ char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type)
key[len++] = 0;
key[len] = 0;
- _leave(" = %p %d", key, len);
+ _leave(" = %s %d", key, len);
return key;
}
diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c
index ddf0cd58d60c..9c8d34c49b12 100644
--- a/fs/cachefiles/main.c
+++ b/fs/cachefiles/main.c
@@ -69,15 +69,9 @@ static int __init cachefiles_init(void)
goto error_object_jar;
}
- ret = cachefiles_proc_init();
- if (ret < 0)
- goto error_proc;
-
pr_info("Loaded\n");
return 0;
-error_proc:
- kmem_cache_destroy(cachefiles_object_jar);
error_object_jar:
misc_deregister(&cachefiles_dev);
error_dev:
@@ -94,7 +88,6 @@ static void __exit cachefiles_exit(void)
{
pr_info("Unloading\n");
- cachefiles_proc_cleanup();
kmem_cache_destroy(cachefiles_object_jar);
misc_deregister(&cachefiles_dev);
}
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 7bf0732ae25c..a9aca5ab5970 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -39,18 +39,18 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
pr_err("%sops=%u inp=%u exc=%u\n",
prefix, object->fscache.n_ops, object->fscache.n_in_progress,
object->fscache.n_exclusive);
- pr_err("%sparent=%p\n",
- prefix, object->fscache.parent);
+ pr_err("%sparent=%x\n",
+ prefix, object->fscache.parent ? object->fscache.parent->debug_id : 0);
spin_lock(&object->fscache.lock);
cookie = object->fscache.cookie;
if (cookie) {
- pr_err("%scookie=%p [pr=%p nd=%p fl=%lx]\n",
+ pr_err("%scookie=%x [pr=%x nd=%p fl=%lx]\n",
prefix,
- object->fscache.cookie,
- object->fscache.cookie->parent,
- object->fscache.cookie->netfs_data,
- object->fscache.cookie->flags);
+ cookie->debug_id,
+ cookie->parent ? cookie->parent->debug_id : 0,
+ cookie->netfs_data,
+ cookie->flags);
pr_err("%skey=[%u] '", prefix, cookie->key_len);
k = (cookie->key_len <= sizeof(cookie->inline_key)) ?
cookie->inline_key : cookie->key;
@@ -110,7 +110,7 @@ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache,
/* found the dentry for */
found_dentry:
- kdebug("preemptive burial: OBJ%x [%s] %p",
+ kdebug("preemptive burial: OBJ%x [%s] %pd",
object->fscache.debug_id,
object->fscache.state->name,
dentry);
@@ -140,7 +140,7 @@ static int cachefiles_mark_object_active(struct cachefiles_cache *cache,
struct rb_node **_p, *_parent = NULL;
struct dentry *dentry;
- _enter(",%p", object);
+ _enter(",%x", object->fscache.debug_id);
try_again:
write_lock(&cache->active_lock);
@@ -298,8 +298,6 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
_enter(",'%pd','%pd'", dir, rep);
- _debug("remove %p from %p", rep, dir);
-
/* non-directories can just be unlinked */
if (!d_is_dir(rep)) {
_debug("unlink stale object");
@@ -446,7 +444,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
struct dentry *dir;
int ret;
- _enter(",OBJ%x{%p}", object->fscache.debug_id, object->dentry);
+ _enter(",OBJ%x{%pd}", object->fscache.debug_id, object->dentry);
ASSERT(object->dentry);
ASSERT(d_backing_inode(object->dentry));
@@ -496,11 +494,10 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
struct dentry *dir, *next = NULL;
struct inode *inode;
struct path path;
- unsigned long start;
const char *name;
int ret, nlen;
- _enter("OBJ%x{%p},OBJ%x,%s,",
+ _enter("OBJ%x{%pd},OBJ%x,%s,",
parent->fscache.debug_id, parent->dentry,
object->fscache.debug_id, key);
@@ -535,9 +532,7 @@ lookup_again:
inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
- start = jiffies;
next = lookup_one_len(name, dir, nlen);
- cachefiles_hist(cachefiles_lookup_histogram, start);
if (IS_ERR(next)) {
trace_cachefiles_lookup(object, next, NULL);
goto lookup_error;
@@ -545,7 +540,7 @@ lookup_again:
inode = d_backing_inode(next);
trace_cachefiles_lookup(object, next, inode);
- _debug("next -> %p %s", next, inode ? "positive" : "negative");
+ _debug("next -> %pd %s", next, inode ? "positive" : "negative");
if (!key)
object->new = !inode;
@@ -568,9 +563,7 @@ lookup_again:
ret = security_path_mkdir(&path, next, 0);
if (ret < 0)
goto create_error;
- start = jiffies;
ret = vfs_mkdir(&init_user_ns, d_inode(dir), next, 0);
- cachefiles_hist(cachefiles_mkdir_histogram, start);
if (!key)
trace_cachefiles_mkdir(object, next, ret);
if (ret < 0)
@@ -583,8 +576,8 @@ lookup_again:
}
ASSERT(d_backing_inode(next));
- _debug("mkdir -> %p{%p{ino=%lu}}",
- next, d_backing_inode(next), d_backing_inode(next)->i_ino);
+ _debug("mkdir -> %pd{ino=%lu}",
+ next, d_backing_inode(next)->i_ino);
} else if (!d_can_lookup(next)) {
pr_err("inode %lu is not a directory\n",
@@ -604,18 +597,16 @@ lookup_again:
ret = security_path_mknod(&path, next, S_IFREG, 0);
if (ret < 0)
goto create_error;
- start = jiffies;
ret = vfs_create(&init_user_ns, d_inode(dir), next,
S_IFREG, true);
- cachefiles_hist(cachefiles_create_histogram, start);
trace_cachefiles_create(object, next, ret);
if (ret < 0)
goto create_error;
ASSERT(d_backing_inode(next));
- _debug("create -> %p{%p{ino=%lu}}",
- next, d_backing_inode(next), d_backing_inode(next)->i_ino);
+ _debug("create -> %pd{ino=%lu}",
+ next, d_backing_inode(next)->i_ino);
} else if (!d_can_lookup(next) &&
!d_is_reg(next)
@@ -765,7 +756,6 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
const char *dirname)
{
struct dentry *subdir;
- unsigned long start;
struct path path;
int ret;
@@ -775,16 +765,14 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
inode_lock(d_inode(dir));
retry:
- start = jiffies;
subdir = lookup_one_len(dirname, dir, strlen(dirname));
- cachefiles_hist(cachefiles_lookup_histogram, start);
if (IS_ERR(subdir)) {
if (PTR_ERR(subdir) == -ENOMEM)
goto nomem_d_alloc;
goto lookup_error;
}
- _debug("subdir -> %p %s",
+ _debug("subdir -> %pd %s",
subdir, d_backing_inode(subdir) ? "positive" : "negative");
/* we need to create the subdir if it doesn't exist yet */
@@ -810,10 +798,8 @@ retry:
}
ASSERT(d_backing_inode(subdir));
- _debug("mkdir -> %p{%p{ino=%lu}}",
- subdir,
- d_backing_inode(subdir),
- d_backing_inode(subdir)->i_ino);
+ _debug("mkdir -> %pd{ino=%lu}",
+ subdir, d_backing_inode(subdir)->i_ino);
}
inode_unlock(d_inode(dir));
@@ -876,7 +862,6 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
struct cachefiles_object *object;
struct rb_node *_n;
struct dentry *victim;
- unsigned long start;
int ret;
//_enter(",%pd/,%s",
@@ -885,13 +870,11 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
/* look up the victim */
inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
- start = jiffies;
victim = lookup_one_len(filename, dir, strlen(filename));
- cachefiles_hist(cachefiles_lookup_histogram, start);
if (IS_ERR(victim))
goto lookup_error;
- //_debug("victim -> %p %s",
+ //_debug("victim -> %pd %s",
// victim, d_backing_inode(victim) ? "positive" : "negative");
/* if the object is no longer there then we probably retired the object
@@ -922,7 +905,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
read_unlock(&cache->active_lock);
- //_leave(" = %p", victim);
+ //_leave(" = %pd", victim);
return victim;
object_in_use:
@@ -968,7 +951,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
if (IS_ERR(victim))
return PTR_ERR(victim);
- _debug("victim -> %p %s",
+ _debug("victim -> %pd %s",
victim, d_backing_inode(victim) ? "positive" : "negative");
/* okay... the victim is not being used so we can cull it
diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c
deleted file mode 100644
index 6e67aea0f24e..000000000000
--- a/fs/cachefiles/proc.c
+++ /dev/null
@@ -1,114 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/* CacheFiles statistics
- *
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#include <linux/module.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include "internal.h"
-
-atomic_t cachefiles_lookup_histogram[HZ];
-atomic_t cachefiles_mkdir_histogram[HZ];
-atomic_t cachefiles_create_histogram[HZ];
-
-/*
- * display the latency histogram
- */
-static int cachefiles_histogram_show(struct seq_file *m, void *v)
-{
- unsigned long index;
- unsigned x, y, z, t;
-
- switch ((unsigned long) v) {
- case 1:
- seq_puts(m, "JIFS SECS LOOKUPS MKDIRS CREATES\n");
- return 0;
- case 2:
- seq_puts(m, "===== ===== ========= ========= =========\n");
- return 0;
- default:
- index = (unsigned long) v - 3;
- x = atomic_read(&cachefiles_lookup_histogram[index]);
- y = atomic_read(&cachefiles_mkdir_histogram[index]);
- z = atomic_read(&cachefiles_create_histogram[index]);
- if (x == 0 && y == 0 && z == 0)
- return 0;
-
- t = (index * 1000) / HZ;
-
- seq_printf(m, "%4lu 0.%03u %9u %9u %9u\n", index, t, x, y, z);
- return 0;
- }
-}
-
-/*
- * set up the iterator to start reading from the first line
- */
-static void *cachefiles_histogram_start(struct seq_file *m, loff_t *_pos)
-{
- if ((unsigned long long)*_pos >= HZ + 2)
- return NULL;
- if (*_pos == 0)
- *_pos = 1;
- return (void *)(unsigned long) *_pos;
-}
-
-/*
- * move to the next line
- */
-static void *cachefiles_histogram_next(struct seq_file *m, void *v, loff_t *pos)
-{
- (*pos)++;
- return (unsigned long long)*pos > HZ + 2 ?
- NULL : (void *)(unsigned long) *pos;
-}
-
-/*
- * clean up after reading
- */
-static void cachefiles_histogram_stop(struct seq_file *m, void *v)
-{
-}
-
-static const struct seq_operations cachefiles_histogram_ops = {
- .start = cachefiles_histogram_start,
- .stop = cachefiles_histogram_stop,
- .next = cachefiles_histogram_next,
- .show = cachefiles_histogram_show,
-};
-
-/*
- * initialise the /proc/fs/cachefiles/ directory
- */
-int __init cachefiles_proc_init(void)
-{
- _enter("");
-
- if (!proc_mkdir("fs/cachefiles", NULL))
- goto error_dir;
-
- if (!proc_create_seq("fs/cachefiles/histogram", S_IFREG | 0444, NULL,
- &cachefiles_histogram_ops))
- goto error_histogram;
-
- _leave(" = 0");
- return 0;
-
-error_histogram:
- remove_proc_entry("fs/cachefiles", NULL);
-error_dir:
- _leave(" = -ENOMEM");
- return -ENOMEM;
-}
-
-/*
- * clean up the /proc/fs/cachefiles/ directory
- */
-void cachefiles_proc_cleanup(void)
-{
- remove_proc_entry("fs/cachefiles/histogram", NULL);
- remove_proc_entry("fs/cachefiles", NULL);
-}
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index a591b5e09637..9e82de668595 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -36,7 +36,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object)
else
snprintf(type, 3, "%02x", object->fscache.cookie->def->type);
- _enter("%p{%s}", object, type);
+ _enter("%x{%s}", object->fscache.debug_id, type);
/* attempt to install a type label directly */
ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, type,
@@ -134,7 +134,7 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object,
if (!dentry)
return -ESTALE;
- _enter("%p,#%d", object, auxdata->len);
+ _enter("%x,#%d", object->fscache.debug_id, auxdata->len);
/* attempt to install the cache metadata directly */
_debug("SET #%u", auxdata->len);
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 529af59d9fd3..f4fc8e0b847c 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -29,7 +29,7 @@ static inline void ceph_set_cached_acl(struct inode *inode,
spin_unlock(&ci->i_ceph_lock);
}
-struct posix_acl *ceph_get_acl(struct inode *inode, int type)
+struct posix_acl *ceph_get_acl(struct inode *inode, int type, bool rcu)
{
int size;
unsigned int retry_cnt = 0;
@@ -37,6 +37,9 @@ struct posix_acl *ceph_get_acl(struct inode *inode, int type)
char *value = NULL;
struct posix_acl *acl;
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
switch (type) {
case ACL_TYPE_ACCESS:
name = XATTR_NAME_POSIX_ACL_ACCESS;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a1e2813731d1..7e7a897ae0d3 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1395,9 +1395,11 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
ret = VM_FAULT_SIGBUS;
} else {
struct address_space *mapping = inode->i_mapping;
- struct page *page = find_or_create_page(mapping, 0,
- mapping_gfp_constraint(mapping,
- ~__GFP_FS));
+ struct page *page;
+
+ filemap_invalidate_lock_shared(mapping);
+ page = find_or_create_page(mapping, 0,
+ mapping_gfp_constraint(mapping, ~__GFP_FS));
if (!page) {
ret = VM_FAULT_OOM;
goto out_inline;
@@ -1418,6 +1420,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
vmf->page = page;
ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
out_inline:
+ filemap_invalidate_unlock_shared(mapping);
dout("filemap_fault %p %llu read inline data ret %x\n",
inode, off, ret);
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 2a2900903f8c..39db97f149b9 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1743,7 +1743,11 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
struct ceph_cap_flush *ceph_alloc_cap_flush(void)
{
- return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
+ struct ceph_cap_flush *cf;
+
+ cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
+ cf->is_capsnap = false;
+ return cf;
}
void ceph_free_cap_flush(struct ceph_cap_flush *cf)
@@ -1778,7 +1782,7 @@ static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc,
prev->wake = true;
wake = false;
}
- list_del(&cf->g_list);
+ list_del_init(&cf->g_list);
return wake;
}
@@ -1793,7 +1797,7 @@ static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci,
prev->wake = true;
wake = false;
}
- list_del(&cf->i_list);
+ list_del_init(&cf->i_list);
return wake;
}
@@ -2352,7 +2356,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) {
- if (!cf->caps) {
+ if (cf->is_capsnap) {
last_snap_flush = cf->tid;
break;
}
@@ -2371,7 +2375,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
first_tid = cf->tid + 1;
- if (cf->caps) {
+ if (!cf->is_capsnap) {
struct cap_msg_args arg;
dout("kick_flushing_caps %p cap %p tid %llu %s\n",
@@ -3516,7 +3520,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
cleaned = cf->caps;
/* Is this a capsnap? */
- if (cf->caps == 0)
+ if (cf->is_capsnap)
continue;
if (cf->tid <= flush_tid) {
@@ -3589,8 +3593,9 @@ out:
while (!list_empty(&to_remove)) {
cf = list_first_entry(&to_remove,
struct ceph_cap_flush, i_list);
- list_del(&cf->i_list);
- ceph_free_cap_flush(cf);
+ list_del_init(&cf->i_list);
+ if (!cf->is_capsnap)
+ ceph_free_cap_flush(cf);
}
if (wake_ci)
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d1755ac1d964..e1d605a02d4a 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -2088,6 +2088,7 @@ static long ceph_fallocate(struct file *file, int mode,
if (ret < 0)
goto unlock;
+ filemap_invalidate_lock(inode->i_mapping);
ceph_zero_pagecache_range(inode, offset, length);
ret = ceph_zero_objects(inode, offset, length);
@@ -2100,6 +2101,7 @@ static long ceph_fallocate(struct file *file, int mode,
if (dirty)
__mark_inode_dirty(inode, dirty);
}
+ filemap_invalidate_unlock(inode->i_mapping);
ceph_put_cap_refs(ci, got);
unlock:
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index fa8a847743d0..bdeb271f47d9 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -240,9 +240,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_POSIX))
return -ENOLCK;
- /* No mandatory locks */
- if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
- return -ENOLCK;
dout("ceph_lock, fl_owner: %p\n", fl->fl_owner);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index afdc20213876..0b69aec23e5c 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1616,7 +1616,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
spin_lock(&mdsc->cap_dirty_lock);
list_for_each_entry(cf, &to_remove, i_list)
- list_del(&cf->g_list);
+ list_del_init(&cf->g_list);
if (!list_empty(&ci->i_dirty_item)) {
pr_warn_ratelimited(
@@ -1668,8 +1668,9 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
struct ceph_cap_flush *cf;
cf = list_first_entry(&to_remove,
struct ceph_cap_flush, i_list);
- list_del(&cf->i_list);
- ceph_free_cap_flush(cf);
+ list_del_init(&cf->i_list);
+ if (!cf->is_capsnap)
+ ceph_free_cap_flush(cf);
}
wake_up_all(&ci->i_cap_wq);
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index abd9af7727ad..3c444b9cb17b 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -394,9 +394,11 @@ void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
{
int i;
- for (i = 0; i < m->possible_max_rank; i++)
- kfree(m->m_info[i].export_targets);
- kfree(m->m_info);
+ if (m->m_info) {
+ for (i = 0; i < m->possible_max_rank; i++)
+ kfree(m->m_info[i].export_targets);
+ kfree(m->m_info);
+ }
kfree(m->m_data_pg_pools);
kfree(m);
}
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 4c6bd1042c94..15105f9da3fd 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -487,6 +487,9 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci)
pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
return;
}
+ capsnap->cap_flush.is_capsnap = true;
+ INIT_LIST_HEAD(&capsnap->cap_flush.i_list);
+ INIT_LIST_HEAD(&capsnap->cap_flush.g_list);
spin_lock(&ci->i_ceph_lock);
used = __ceph_caps_used(ci);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 9215a2f4535c..c30258f95e37 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -182,8 +182,9 @@ struct ceph_cap {
struct ceph_cap_flush {
u64 tid;
- int caps; /* 0 means capsnap */
+ int caps;
bool wake; /* wake up flush waiters when finish ? */
+ bool is_capsnap; /* true means capsnap */
struct list_head g_list; // global
struct list_head i_list; // per inode
};
@@ -1087,7 +1088,7 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx);
/* acl.c */
#ifdef CONFIG_CEPH_FS_POSIX_ACL
-struct posix_acl *ceph_get_acl(struct inode *, int);
+struct posix_acl *ceph_get_acl(struct inode *, int, bool);
int ceph_set_acl(struct user_namespace *mnt_userns,
struct inode *inode, struct posix_acl *acl, int type);
int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 7364950a9ef4..3b7e3b9e4fd2 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -4,19 +4,16 @@ config CIFS
depends on INET
select NLS
select CRYPTO
- select CRYPTO_MD4
select CRYPTO_MD5
select CRYPTO_SHA256
select CRYPTO_SHA512
select CRYPTO_CMAC
select CRYPTO_HMAC
- select CRYPTO_LIB_ARC4
select CRYPTO_AEAD2
select CRYPTO_CCM
select CRYPTO_GCM
select CRYPTO_ECB
select CRYPTO_AES
- select CRYPTO_LIB_DES
select KEYS
select DNS_RESOLVER
select ASN1
@@ -85,33 +82,6 @@ config CIFS_ALLOW_INSECURE_LEGACY
If unsure, say Y.
-config CIFS_WEAK_PW_HASH
- bool "Support legacy servers which use weaker LANMAN security"
- depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY
- help
- Modern CIFS servers including Samba and most Windows versions
- (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos)
- security mechanisms. These hash the password more securely
- than the mechanisms used in the older LANMAN version of the
- SMB protocol but LANMAN based authentication is needed to
- establish sessions with some old SMB servers.
-
- Enabling this option allows the cifs module to mount to older
- LANMAN based servers such as OS/2 and Windows 95, but such
- mounts may be less secure than mounts using NTLM or more recent
- security mechanisms if you are on a public network. Unless you
- have a need to access old SMB servers (and are on a private
- network) you probably want to say N. Even if this support
- is enabled in the kernel build, LANMAN authentication will not be
- used automatically. At runtime LANMAN mounts are disabled but
- can be set to required (or optional) either in
- /proc/fs/cifs (see Documentation/admin-guide/cifs/usage.rst for
- more detail) or via an option on the mount command. This support
- is disabled by default in order to reduce the possibility of a
- downgrade attack.
-
- If unsure, say N.
-
config CIFS_UPCALL
bool "Kerberos/SPNEGO advanced session setup"
depends on CIFS
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 8857ac7e7a14..51a824fc926a 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -250,9 +250,6 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
seq_printf(m, ",ALLOW_INSECURE_LEGACY");
#endif
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
- seq_printf(m, ",WEAK_PW_HASH");
-#endif
#ifdef CONFIG_CIFS_POSIX
seq_printf(m, ",CIFS_POSIX");
#endif
@@ -929,14 +926,6 @@ cifs_security_flags_handle_must_flags(unsigned int *flags)
*flags = CIFSSEC_MUST_NTLMSSP;
else if ((*flags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
*flags = CIFSSEC_MUST_NTLMV2;
- else if ((*flags & CIFSSEC_MUST_NTLM) == CIFSSEC_MUST_NTLM)
- *flags = CIFSSEC_MUST_NTLM;
- else if (CIFSSEC_MUST_LANMAN &&
- (*flags & CIFSSEC_MUST_LANMAN) == CIFSSEC_MUST_LANMAN)
- *flags = CIFSSEC_MUST_LANMAN;
- else if (CIFSSEC_MUST_PLNTXT &&
- (*flags & CIFSSEC_MUST_PLNTXT) == CIFSSEC_MUST_PLNTXT)
- *flags = CIFSSEC_MUST_PLNTXT;
*flags |= signflags;
}
diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c
index 93b47818c6c2..12bde7bfda86 100644
--- a/fs/cifs/cifs_swn.c
+++ b/fs/cifs/cifs_swn.c
@@ -147,8 +147,6 @@ static int cifs_swn_send_register_message(struct cifs_swn_reg *swnreg)
goto nlmsg_fail;
}
break;
- case LANMAN:
- case NTLM:
case NTLMv2:
case RawNTLMSSP:
ret = cifs_swn_auth_info_ntlm(swnreg->tcon, skb);
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 9bd03a231032..171ad8b42107 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -358,14 +358,9 @@ cifs_strndup_from_utf16(const char *src, const int maxlen,
if (!dst)
return NULL;
cifs_from_utf16(dst, (__le16 *) src, len, maxlen, codepage,
- NO_MAP_UNI_RSVD);
+ NO_MAP_UNI_RSVD);
} else {
- len = strnlen(src, maxlen);
- len++;
- dst = kmalloc(len, GFP_KERNEL);
- if (!dst)
- return NULL;
- strlcpy(dst, src, len);
+ dst = kstrndup(src, maxlen, GFP_KERNEL);
}
return dst;
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index ecf15d845dbd..6679e07e533e 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -22,7 +22,7 @@
#include <linux/random.h>
#include <linux/highmem.h>
#include <linux/fips.h>
-#include <crypto/arc4.h>
+#include "../cifs_common/arc4.h"
#include <crypto/aead.h>
int __cifs_calc_signature(struct smb_rqst *rqst,
@@ -250,87 +250,6 @@ int cifs_verify_signature(struct smb_rqst *rqst,
}
-/* first calculate 24 bytes ntlm response and then 16 byte session key */
-int setup_ntlm_response(struct cifs_ses *ses, const struct nls_table *nls_cp)
-{
- int rc = 0;
- unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE;
- char temp_key[CIFS_SESS_KEY_SIZE];
-
- if (!ses)
- return -EINVAL;
-
- ses->auth_key.response = kmalloc(temp_len, GFP_KERNEL);
- if (!ses->auth_key.response)
- return -ENOMEM;
-
- ses->auth_key.len = temp_len;
-
- rc = SMBNTencrypt(ses->password, ses->server->cryptkey,
- ses->auth_key.response + CIFS_SESS_KEY_SIZE, nls_cp);
- if (rc) {
- cifs_dbg(FYI, "%s Can't generate NTLM response, error: %d\n",
- __func__, rc);
- return rc;
- }
-
- rc = E_md4hash(ses->password, temp_key, nls_cp);
- if (rc) {
- cifs_dbg(FYI, "%s Can't generate NT hash, error: %d\n",
- __func__, rc);
- return rc;
- }
-
- rc = mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE);
- if (rc)
- cifs_dbg(FYI, "%s Can't generate NTLM session key, error: %d\n",
- __func__, rc);
-
- return rc;
-}
-
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
-int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
- char *lnm_session_key)
-{
- int i, len;
- int rc;
- char password_with_pad[CIFS_ENCPWD_SIZE] = {0};
-
- if (password) {
- for (len = 0; len < CIFS_ENCPWD_SIZE; len++)
- if (!password[len])
- break;
-
- memcpy(password_with_pad, password, len);
- }
-
- if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) {
- memcpy(lnm_session_key, password_with_pad,
- CIFS_ENCPWD_SIZE);
- return 0;
- }
-
- /* calculate old style session key */
- /* calling toupper is less broken than repeatedly
- calling nls_toupper would be since that will never
- work for UTF8, but neither handles multibyte code pages
- but the only alternative would be converting to UCS-16 (Unicode)
- (using a routine something like UniStrupr) then
- uppercasing and then converting back from Unicode - which
- would only worth doing it if we knew it were utf8. Basically
- utf8 and other multibyte codepages each need their own strupper
- function since a byte at a time will ont work. */
-
- for (i = 0; i < CIFS_ENCPWD_SIZE; i++)
- password_with_pad[i] = toupper(password_with_pad[i]);
-
- rc = SMBencrypt(password_with_pad, cryptkey, lnm_session_key);
-
- return rc;
-}
-#endif /* CIFS_WEAK_PW_HASH */
-
/* Build a proper attribute value/target info pairs blob.
* Fill in netbios and dns domain name and workstation name
* and client time (total five av pairs and + one end of fields indicator.
@@ -780,9 +699,9 @@ calc_seckey(struct cifs_ses *ses)
return -ENOMEM;
}
- arc4_setkey(ctx_arc4, ses->auth_key.response, CIFS_SESS_KEY_SIZE);
- arc4_crypt(ctx_arc4, ses->ntlmssp->ciphertext, sec_key,
- CIFS_CPHTXT_SIZE);
+ cifs_arc4_setkey(ctx_arc4, ses->auth_key.response, CIFS_SESS_KEY_SIZE);
+ cifs_arc4_crypt(ctx_arc4, ses->ntlmssp->ciphertext, sec_key,
+ CIFS_CPHTXT_SIZE);
/* make secondary_key/nonce as session key */
memcpy(ses->auth_key.response, sec_key, CIFS_SESS_KEY_SIZE);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 64b71c4e2a9d..8c20bfa187ac 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -399,7 +399,6 @@ cifs_evict_inode(struct inode *inode)
{
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
- cifs_fscache_release_inode_cookie(inode);
}
static void
@@ -438,15 +437,9 @@ cifs_show_security(struct seq_file *s, struct cifs_ses *ses)
seq_puts(s, ",sec=");
switch (ses->sectype) {
- case LANMAN:
- seq_puts(s, "lanman");
- break;
case NTLMv2:
seq_puts(s, "ntlmv2");
break;
- case NTLM:
- seq_puts(s, "ntlm");
- break;
case Kerberos:
seq_puts(s, "krb5");
break;
@@ -1755,7 +1748,6 @@ MODULE_DESCRIPTION
MODULE_VERSION(CIFS_VERSION);
MODULE_SOFTDEP("ecb");
MODULE_SOFTDEP("hmac");
-MODULE_SOFTDEP("md4");
MODULE_SOFTDEP("md5");
MODULE_SOFTDEP("nls");
MODULE_SOFTDEP("aes");
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index c6a9542ca281..c068f7d8d879 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -114,8 +114,6 @@ enum statusEnum {
enum securityEnum {
Unspecified = 0, /* not specified */
- LANMAN, /* Legacy LANMAN auth */
- NTLM, /* Legacy NTLM012 auth with NTLM hash */
NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */
RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */
Kerberos, /* Kerberos via SPNEGO */
@@ -634,7 +632,6 @@ struct TCP_Server_Info {
struct session_key session_key;
unsigned long lstrp; /* when we got last response from this server */
struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
-#define CIFS_NEGFLAVOR_LANMAN 0 /* wct == 13, LANMAN */
#define CIFS_NEGFLAVOR_UNENCAP 1 /* wct == 17, but no ext_sec */
#define CIFS_NEGFLAVOR_EXTENDED 2 /* wct == 17, ext_sec bit set */
char negflavor; /* NEGOTIATE response flavor */
@@ -1734,16 +1731,8 @@ static inline bool is_retryable_error(int error)
/* Security Flags: indicate type of session setup needed */
#define CIFSSEC_MAY_SIGN 0x00001
-#define CIFSSEC_MAY_NTLM 0x00002
#define CIFSSEC_MAY_NTLMV2 0x00004
#define CIFSSEC_MAY_KRB5 0x00008
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
-#define CIFSSEC_MAY_LANMAN 0x00010
-#define CIFSSEC_MAY_PLNTXT 0x00020
-#else
-#define CIFSSEC_MAY_LANMAN 0
-#define CIFSSEC_MAY_PLNTXT 0
-#endif /* weak passwords */
#define CIFSSEC_MAY_SEAL 0x00040 /* not supported yet */
#define CIFSSEC_MAY_NTLMSSP 0x00080 /* raw ntlmssp with ntlmv2 */
@@ -1751,32 +1740,19 @@ static inline bool is_retryable_error(int error)
/* note that only one of the following can be set so the
result of setting MUST flags more than once will be to
require use of the stronger protocol */
-#define CIFSSEC_MUST_NTLM 0x02002
#define CIFSSEC_MUST_NTLMV2 0x04004
#define CIFSSEC_MUST_KRB5 0x08008
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
-#define CIFSSEC_MUST_LANMAN 0x10010
-#define CIFSSEC_MUST_PLNTXT 0x20020
-#ifdef CONFIG_CIFS_UPCALL
-#define CIFSSEC_MASK 0xBF0BF /* allows weak security but also krb5 */
-#else
-#define CIFSSEC_MASK 0xB70B7 /* current flags supported if weak */
-#endif /* UPCALL */
-#else /* do not allow weak pw hash */
-#define CIFSSEC_MUST_LANMAN 0
-#define CIFSSEC_MUST_PLNTXT 0
#ifdef CONFIG_CIFS_UPCALL
#define CIFSSEC_MASK 0x8F08F /* flags supported if no weak allowed */
#else
#define CIFSSEC_MASK 0x87087 /* flags supported if no weak allowed */
#endif /* UPCALL */
-#endif /* WEAK_PW_HASH */
#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */
#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */
#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP)
-#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2)
-#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)
+#define CIFSSEC_MAX (CIFSSEC_MUST_NTLMV2)
+#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)
/*
*****************************************************************
* All constants go here
@@ -1940,10 +1916,6 @@ static inline char *get_security_type_str(enum securityEnum sectype)
return "Kerberos";
case NTLMv2:
return "NTLMv2";
- case NTLM:
- return "NTLM";
- case LANMAN:
- return "LANMAN";
default:
return "Unknown";
}
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index f6e235001358..dc920e206336 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -14,13 +14,7 @@
#include <asm/unaligned.h>
#include "smbfsctl.h"
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
-#define LANMAN_PROT 0
-#define LANMAN2_PROT 1
-#define CIFS_PROT 2
-#else
#define CIFS_PROT 0
-#endif
#define POSIX_PROT (CIFS_PROT+1)
#define BAD_PROT 0xFFFF
@@ -505,30 +499,8 @@ typedef struct negotiate_req {
unsigned char DialectsArray[1];
} __attribute__((packed)) NEGOTIATE_REQ;
-/* Dialect index is 13 for LANMAN */
-
#define MIN_TZ_ADJ (15 * 60) /* minimum grid for timezones in seconds */
-typedef struct lanman_neg_rsp {
- struct smb_hdr hdr; /* wct = 13 */
- __le16 DialectIndex;
- __le16 SecurityMode;
- __le16 MaxBufSize;
- __le16 MaxMpxCount;
- __le16 MaxNumberVcs;
- __le16 RawMode;
- __le32 SessionKey;
- struct {
- __le16 Time;
- __le16 Date;
- } __attribute__((packed)) SrvTime;
- __le16 ServerTimeZone;
- __le16 EncryptionKeyLength;
- __le16 Reserved;
- __u16 ByteCount;
- unsigned char EncryptionKey[1];
-} __attribute__((packed)) LANMAN_NEG_RSP;
-
#define READ_RAW_ENABLE 1
#define WRITE_RAW_ENABLE 2
#define RAW_ENABLE (READ_RAW_ENABLE | WRITE_RAW_ENABLE)
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index e0def0f0714b..f9740c21ca3d 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -498,19 +498,12 @@ extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *);
extern int cifs_verify_signature(struct smb_rqst *rqst,
struct TCP_Server_Info *server,
__u32 expected_sequence_number);
-extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *,
- const struct nls_table *);
-extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *);
extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *);
extern void cifs_crypto_secmech_release(struct TCP_Server_Info *server);
extern int calc_seckey(struct cifs_ses *);
extern int generate_smb30signingkey(struct cifs_ses *);
extern int generate_smb311signingkey(struct cifs_ses *);
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
-extern int calc_lanman_hash(const char *password, const char *cryptkey,
- bool encrypt, char *lnm_session_key);
-#endif /* CIFS_WEAK_PW_HASH */
extern int CIFSSMBCopy(unsigned int xid,
struct cifs_tcon *source_tcon,
const char *fromName,
@@ -547,11 +540,8 @@ extern int check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
struct cifs_fattr *fattr,
const unsigned char *path);
-extern int mdfour(unsigned char *, unsigned char *, int);
extern int E_md4hash(const unsigned char *passwd, unsigned char *p16,
const struct nls_table *codepage);
-extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8,
- unsigned char *p24);
extern int
cifs_setup_volume_info(struct smb3_fs_context *ctx, const char *mntopts, const char *devname);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 65d1a65bfc37..a8e41c1e80ca 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -42,10 +42,6 @@ static struct {
int index;
char *name;
} protocols[] = {
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
- {LANMAN_PROT, "\2LM1.2X002"},
- {LANMAN2_PROT, "\2LANMAN2.1"},
-#endif /* weak password hashing for legacy clients */
{CIFS_PROT, "\2NT LM 0.12"},
{POSIX_PROT, "\2POSIX 2"},
{BAD_PROT, "\2"}
@@ -55,10 +51,6 @@ static struct {
int index;
char *name;
} protocols[] = {
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
- {LANMAN_PROT, "\2LM1.2X002"},
- {LANMAN2_PROT, "\2LANMAN2.1"},
-#endif /* weak password hashing for legacy clients */
{CIFS_PROT, "\2NT LM 0.12"},
{BAD_PROT, "\2"}
};
@@ -66,17 +58,9 @@ static struct {
/* define the number of elements in the cifs dialect array */
#ifdef CONFIG_CIFS_POSIX
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
-#define CIFS_NUM_PROT 4
-#else
#define CIFS_NUM_PROT 2
-#endif /* CIFS_WEAK_PW_HASH */
#else /* not posix */
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
-#define CIFS_NUM_PROT 3
-#else
#define CIFS_NUM_PROT 1
-#endif /* CONFIG_CIFS_WEAK_PW_HASH */
#endif /* CIFS_POSIX */
/*
@@ -475,89 +459,6 @@ cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required)
return 0;
}
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
-static int
-decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
-{
- __s16 tmp;
- struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr;
-
- if (server->dialect != LANMAN_PROT && server->dialect != LANMAN2_PROT)
- return -EOPNOTSUPP;
-
- server->sec_mode = le16_to_cpu(rsp->SecurityMode);
- server->maxReq = min_t(unsigned int,
- le16_to_cpu(rsp->MaxMpxCount),
- cifs_max_pending);
- set_credits(server, server->maxReq);
- server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
- /* set up max_read for readpages check */
- server->max_read = server->maxBuf;
- /* even though we do not use raw we might as well set this
- accurately, in case we ever find a need for it */
- if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
- server->max_rw = 0xFF00;
- server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE;
- } else {
- server->max_rw = 0;/* do not need to use raw anyway */
- server->capabilities = CAP_MPX_MODE;
- }
- tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
- if (tmp == -1) {
- /* OS/2 often does not set timezone therefore
- * we must use server time to calc time zone.
- * Could deviate slightly from the right zone.
- * Smallest defined timezone difference is 15 minutes
- * (i.e. Nepal). Rounding up/down is done to match
- * this requirement.
- */
- int val, seconds, remain, result;
- struct timespec64 ts;
- time64_t utc = ktime_get_real_seconds();
- ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
- rsp->SrvTime.Time, 0);
- cifs_dbg(FYI, "SrvTime %lld sec since 1970 (utc: %lld) diff: %lld\n",
- ts.tv_sec, utc,
- utc - ts.tv_sec);
- val = (int)(utc - ts.tv_sec);
- seconds = abs(val);
- result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
- remain = seconds % MIN_TZ_ADJ;
- if (remain >= (MIN_TZ_ADJ / 2))
- result += MIN_TZ_ADJ;
- if (val < 0)
- result = -result;
- server->timeAdj = result;
- } else {
- server->timeAdj = (int)tmp;
- server->timeAdj *= 60; /* also in seconds */
- }
- cifs_dbg(FYI, "server->timeAdj: %d seconds\n", server->timeAdj);
-
-
- /* BB get server time for time conversions and add
- code to use it and timezone since this is not UTC */
-
- if (rsp->EncryptionKeyLength ==
- cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
- memcpy(server->cryptkey, rsp->EncryptionKey,
- CIFS_CRYPTO_KEY_SIZE);
- } else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
- return -EIO; /* need cryptkey unless plain text */
- }
-
- cifs_dbg(FYI, "LANMAN negotiated\n");
- return 0;
-}
-#else
-static inline int
-decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
-{
- cifs_dbg(VFS, "mount failed, cifs module not built with CIFS_WEAK_PW_HASH support\n");
- return -EOPNOTSUPP;
-}
-#endif
-
static bool
should_set_ext_sec_flag(enum securityEnum sectype)
{
@@ -626,16 +527,12 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
server->dialect = le16_to_cpu(pSMBr->DialectIndex);
cifs_dbg(FYI, "Dialect: %d\n", server->dialect);
/* Check wct = 1 error case */
- if ((pSMBr->hdr.WordCount < 13) || (server->dialect == BAD_PROT)) {
+ if ((pSMBr->hdr.WordCount <= 13) || (server->dialect == BAD_PROT)) {
/* core returns wct = 1, but we do not ask for core - otherwise
small wct just comes when dialect index is -1 indicating we
could not negotiate a common dialect */
rc = -EOPNOTSUPP;
goto neg_err_exit;
- } else if (pSMBr->hdr.WordCount == 13) {
- server->negflavor = CIFS_NEGFLAVOR_LANMAN;
- rc = decode_lanman_negprot_rsp(server, pSMBr);
- goto signing_check;
} else if (pSMBr->hdr.WordCount != 17) {
/* unknown wct */
rc = -EOPNOTSUPP;
@@ -677,7 +574,6 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
server->capabilities &= ~CAP_EXTENDED_SECURITY;
}
-signing_check:
if (!rc)
rc = cifs_enable_signing(server, ses->sign);
neg_err_exit:
@@ -2101,6 +1997,7 @@ cifs_writev_complete(struct work_struct *work)
else if (wdata->result < 0)
SetPageError(page);
end_page_writeback(page);
+ cifs_readpage_to_fscache(inode, page);
put_page(page);
}
if (wdata->result != -EAGAIN)
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 3781eee9360a..0db344807ef1 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3684,38 +3684,6 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
*bcc_ptr = 0; /* password is null byte */
bcc_ptr++; /* skip password */
/* already aligned so no need to do it below */
- } else {
- pSMB->PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
- /* BB FIXME add code to fail this if NTLMv2 or Kerberos
- specified as required (when that support is added to
- the vfs in the future) as only NTLM or the much
- weaker LANMAN (which we do not send by default) is accepted
- by Samba (not sure whether other servers allow
- NTLMv2 password here) */
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
- if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
- (ses->sectype == LANMAN))
- calc_lanman_hash(tcon->password, ses->server->cryptkey,
- ses->server->sec_mode &
- SECMODE_PW_ENCRYPT ? true : false,
- bcc_ptr);
- else
-#endif /* CIFS_WEAK_PW_HASH */
- rc = SMBNTencrypt(tcon->password, ses->server->cryptkey,
- bcc_ptr, nls_codepage);
- if (rc) {
- cifs_dbg(FYI, "%s Can't generate NTLM rsp. Error: %d\n",
- __func__, rc);
- cifs_buf_release(smb_buffer);
- return rc;
- }
-
- bcc_ptr += CIFS_AUTH_RESP_SIZE;
- if (ses->capabilities & CAP_UNICODE) {
- /* must align unicode strings */
- *bcc_ptr = 0; /* null byte password */
- bcc_ptr++;
- }
}
if (ses->server->sign)
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index bb98fbdd22a9..d0216472f1c6 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -377,6 +377,8 @@ static void cifsFileInfo_put_final(struct cifsFileInfo *cifs_file)
struct cifsLockInfo *li, *tmp;
struct super_block *sb = inode->i_sb;
+ cifs_fscache_release_inode_cookie(inode);
+
/*
* Delete any outstanding lock records. We'll lose them when the file
* is closed anyway.
@@ -882,8 +884,10 @@ int cifs_close(struct inode *inode, struct file *file)
if ((cinode->oplock == CIFS_CACHE_RHW_FLG) &&
cinode->lease_granted &&
dclose) {
- if (test_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags))
+ if (test_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) {
inode->i_ctime = inode->i_mtime = current_time(inode);
+ cifs_fscache_update_inode_cookie(inode);
+ }
spin_lock(&cinode->deferred_lock);
cifs_add_deferred_close(cfile, dclose);
if (cfile->deferred_close_scheduled &&
@@ -4170,6 +4174,10 @@ static vm_fault_t
cifs_page_mkwrite(struct vm_fault *vmf)
{
struct page *page = vmf->page;
+ struct file *file = vmf->vma->vm_file;
+ struct inode *inode = file_inode(file);
+
+ cifs_fscache_wait_on_page_write(inode, page);
lock_page(page);
return VM_FAULT_LOCKED;
@@ -4235,13 +4243,16 @@ cifs_readv_complete(struct work_struct *work)
(rdata->result == -EAGAIN && got_bytes)) {
flush_dcache_page(page);
SetPageUptodate(page);
- }
+ } else
+ SetPageError(page);
unlock_page(page);
if (rdata->result == 0 ||
(rdata->result == -EAGAIN && got_bytes))
cifs_readpage_to_fscache(rdata->mapping->host, page);
+ else
+ cifs_fscache_uncache_page(rdata->mapping->host, page);
got_bytes -= min_t(unsigned int, PAGE_SIZE, got_bytes);
diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c
index eed59bc1d913..3109def8e199 100644
--- a/fs/cifs/fs_context.c
+++ b/fs/cifs/fs_context.c
@@ -57,12 +57,9 @@ static const match_table_t cifs_secflavor_tokens = {
{ Opt_sec_krb5p, "krb5p" },
{ Opt_sec_ntlmsspi, "ntlmsspi" },
{ Opt_sec_ntlmssp, "ntlmssp" },
- { Opt_ntlm, "ntlm" },
- { Opt_sec_ntlmi, "ntlmi" },
{ Opt_sec_ntlmv2, "nontlm" },
{ Opt_sec_ntlmv2, "ntlmv2" },
{ Opt_sec_ntlmv2i, "ntlmv2i" },
- { Opt_sec_lanman, "lanman" },
{ Opt_sec_none, "none" },
{ Opt_sec_err, NULL }
@@ -221,23 +218,12 @@ cifs_parse_security_flavors(struct fs_context *fc, char *value, struct smb3_fs_c
case Opt_sec_ntlmssp:
ctx->sectype = RawNTLMSSP;
break;
- case Opt_sec_ntlmi:
- ctx->sign = true;
- fallthrough;
- case Opt_ntlm:
- ctx->sectype = NTLM;
- break;
case Opt_sec_ntlmv2i:
ctx->sign = true;
fallthrough;
case Opt_sec_ntlmv2:
ctx->sectype = NTLMv2;
break;
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
- case Opt_sec_lanman:
- ctx->sectype = LANMAN;
- break;
-#endif
case Opt_sec_none:
ctx->nullauth = 1;
break;
@@ -1266,10 +1252,17 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
ctx->posix_paths = 1;
break;
case Opt_unix:
- if (result.negated)
+ if (result.negated) {
+ if (ctx->linux_ext == 1)
+ pr_warn_once("conflicting posix mount options specified\n");
ctx->linux_ext = 0;
- else
ctx->no_linux_ext = 1;
+ } else {
+ if (ctx->no_linux_ext == 1)
+ pr_warn_once("conflicting posix mount options specified\n");
+ ctx->linux_ext = 1;
+ ctx->no_linux_ext = 0;
+ }
break;
case Opt_nocase:
ctx->nocase = 1;
diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h
index b6243972edf3..a42ba71d7a81 100644
--- a/fs/cifs/fs_context.h
+++ b/fs/cifs/fs_context.h
@@ -47,11 +47,8 @@ enum cifs_sec_param {
Opt_sec_krb5p,
Opt_sec_ntlmsspi,
Opt_sec_ntlmssp,
- Opt_ntlm,
- Opt_sec_ntlmi,
Opt_sec_ntlmv2,
Opt_sec_ntlmv2i,
- Opt_sec_lanman,
Opt_sec_none,
Opt_sec_err
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index dd625033cd6b..fab47fa7df74 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -176,29 +176,34 @@ void cifs_fscache_release_inode_cookie(struct inode *inode)
auxdata.last_change_time_nsec = cifsi->vfs_inode.i_ctime.tv_nsec;
cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache);
+ /* fscache_relinquish_cookie does not seem to update auxdata */
+ fscache_update_cookie(cifsi->fscache, &auxdata);
fscache_relinquish_cookie(cifsi->fscache, &auxdata, false);
cifsi->fscache = NULL;
}
}
-static void cifs_fscache_disable_inode_cookie(struct inode *inode)
+void cifs_fscache_update_inode_cookie(struct inode *inode)
{
+ struct cifs_fscache_inode_auxdata auxdata;
struct cifsInodeInfo *cifsi = CIFS_I(inode);
if (cifsi->fscache) {
+ memset(&auxdata, 0, sizeof(auxdata));
+ auxdata.eof = cifsi->server_eof;
+ auxdata.last_write_time_sec = cifsi->vfs_inode.i_mtime.tv_sec;
+ auxdata.last_change_time_sec = cifsi->vfs_inode.i_ctime.tv_sec;
+ auxdata.last_write_time_nsec = cifsi->vfs_inode.i_mtime.tv_nsec;
+ auxdata.last_change_time_nsec = cifsi->vfs_inode.i_ctime.tv_nsec;
+
cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache);
- fscache_uncache_all_inode_pages(cifsi->fscache, inode);
- fscache_relinquish_cookie(cifsi->fscache, NULL, true);
- cifsi->fscache = NULL;
+ fscache_update_cookie(cifsi->fscache, &auxdata);
}
}
void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
{
- if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
- cifs_fscache_disable_inode_cookie(inode);
- else
- cifs_fscache_enable_inode_cookie(inode);
+ cifs_fscache_enable_inode_cookie(inode);
}
void cifs_fscache_reset_inode_cookie(struct inode *inode)
@@ -310,6 +315,8 @@ void __cifs_readpage_to_fscache(struct inode *inode, struct page *page)
struct cifsInodeInfo *cifsi = CIFS_I(inode);
int ret;
+ WARN_ON(!cifsi->fscache);
+
cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n",
__func__, cifsi->fscache, page, inode);
ret = fscache_write_page(cifsi->fscache, page,
@@ -334,3 +341,21 @@ void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode)
fscache_wait_on_page_write(cookie, page);
fscache_uncache_page(cookie, page);
}
+
+void __cifs_fscache_wait_on_page_write(struct inode *inode, struct page *page)
+{
+ struct cifsInodeInfo *cifsi = CIFS_I(inode);
+ struct fscache_cookie *cookie = cifsi->fscache;
+
+ cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, page, cookie);
+ fscache_wait_on_page_write(cookie, page);
+}
+
+void __cifs_fscache_uncache_page(struct inode *inode, struct page *page)
+{
+ struct cifsInodeInfo *cifsi = CIFS_I(inode);
+ struct fscache_cookie *cookie = cifsi->fscache;
+
+ cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, page, cookie);
+ fscache_uncache_page(cookie, page);
+}
diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h
index 3d55cb2ef055..82e856b9cf89 100644
--- a/fs/cifs/fscache.h
+++ b/fs/cifs/fscache.h
@@ -55,10 +55,13 @@ extern void cifs_fscache_get_super_cookie(struct cifs_tcon *);
extern void cifs_fscache_release_super_cookie(struct cifs_tcon *);
extern void cifs_fscache_release_inode_cookie(struct inode *);
+extern void cifs_fscache_update_inode_cookie(struct inode *inode);
extern void cifs_fscache_set_inode_cookie(struct inode *, struct file *);
extern void cifs_fscache_reset_inode_cookie(struct inode *);
extern void __cifs_fscache_invalidate_page(struct page *, struct inode *);
+extern void __cifs_fscache_wait_on_page_write(struct inode *inode, struct page *page);
+extern void __cifs_fscache_uncache_page(struct inode *inode, struct page *page);
extern int cifs_fscache_release_page(struct page *page, gfp_t gfp);
extern int __cifs_readpage_from_fscache(struct inode *, struct page *);
extern int __cifs_readpages_from_fscache(struct inode *,
@@ -76,6 +79,20 @@ static inline void cifs_fscache_invalidate_page(struct page *page,
__cifs_fscache_invalidate_page(page, inode);
}
+static inline void cifs_fscache_wait_on_page_write(struct inode *inode,
+ struct page *page)
+{
+ if (PageFsCache(page))
+ __cifs_fscache_wait_on_page_write(inode, page);
+}
+
+static inline void cifs_fscache_uncache_page(struct inode *inode,
+ struct page *page)
+{
+ if (PageFsCache(page))
+ __cifs_fscache_uncache_page(inode, page);
+}
+
static inline int cifs_readpage_from_fscache(struct inode *inode,
struct page *page)
{
@@ -123,6 +140,7 @@ static inline void
cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) {}
static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {}
+static inline void cifs_fscache_update_inode_cookie(struct inode *inode) {}
static inline void cifs_fscache_set_inode_cookie(struct inode *inode,
struct file *filp) {}
static inline void cifs_fscache_reset_inode_cookie(struct inode *inode) {}
@@ -133,6 +151,11 @@ static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp)
static inline void cifs_fscache_invalidate_page(struct page *page,
struct inode *inode) {}
+static inline void cifs_fscache_wait_on_page_write(struct inode *inode,
+ struct page *page) {}
+static inline void cifs_fscache_uncache_page(struct inode *inode,
+ struct page *page) {}
+
static inline int
cifs_readpage_from_fscache(struct inode *inode, struct page *page)
{
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 65f8a70cece3..50c01cff4c84 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -2297,6 +2297,7 @@ cifs_revalidate_mapping(struct inode *inode)
{
int rc;
unsigned long *flags = &CIFS_I(inode)->flags;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
/* swapfiles are not supposed to be shared */
if (IS_SWAPFILE(inode))
@@ -2308,11 +2309,16 @@ cifs_revalidate_mapping(struct inode *inode)
return rc;
if (test_and_clear_bit(CIFS_INO_INVALID_MAPPING, flags)) {
+ /* for cache=singleclient, do not invalidate */
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE)
+ goto skip_invalidate;
+
rc = cifs_invalidate_mapping(inode);
if (rc)
set_bit(CIFS_INO_INVALID_MAPPING, flags);
}
+skip_invalidate:
clear_bit_unlock(CIFS_INO_LOCK, flags);
smp_mb__after_atomic();
wake_up_bit(flags, CIFS_INO_LOCK);
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index bfee176b901d..54d77c99e21c 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -369,7 +369,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
*/
static int
-initiate_cifs_search(const unsigned int xid, struct file *file,
+_initiate_cifs_search(const unsigned int xid, struct file *file,
const char *full_path)
{
__u16 search_flags;
@@ -451,6 +451,27 @@ error_exit:
return rc;
}
+static int
+initiate_cifs_search(const unsigned int xid, struct file *file,
+ const char *full_path)
+{
+ int rc, retry_count = 0;
+
+ do {
+ rc = _initiate_cifs_search(xid, file, full_path);
+ /*
+ * If we don't have enough credits to start reading the
+ * directory, just try again after a short wait.
+ */
+ if (rc != -EDEADLK)
+ break;
+
+ usleep_range(512, 2048);
+ } while (retry_count++ < 5);
+
+ return rc;
+}
+
/* return length of unicode string in bytes */
static int cifs_unicode_bytelen(const char *str)
{
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index c5785fd3f52e..118403fbeda2 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -799,30 +799,16 @@ cifs_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
}
case CIFS_NEGFLAVOR_UNENCAP:
switch (requested) {
- case NTLM:
case NTLMv2:
return requested;
case Unspecified:
if (global_secflags & CIFSSEC_MAY_NTLMV2)
return NTLMv2;
- if (global_secflags & CIFSSEC_MAY_NTLM)
- return NTLM;
break;
default:
break;
}
- fallthrough; /* to attempt LANMAN authentication next */
- case CIFS_NEGFLAVOR_LANMAN:
- switch (requested) {
- case LANMAN:
- return requested;
- case Unspecified:
- if (global_secflags & CIFSSEC_MAY_LANMAN)
- return LANMAN;
- fallthrough;
- default:
- return Unspecified;
- }
+ fallthrough;
default:
return Unspecified;
}
@@ -877,7 +863,7 @@ sess_alloc_buffer(struct sess_data *sess_data, int wct)
return 0;
out_free_smb_buf:
- kfree(smb_buf);
+ cifs_small_buf_release(smb_buf);
sess_data->iov[0].iov_base = NULL;
sess_data->iov[0].iov_len = 0;
sess_data->buf0_type = CIFS_NO_BUFFER;
@@ -947,230 +933,6 @@ sess_sendreceive(struct sess_data *sess_data)
return rc;
}
-/*
- * LANMAN and plaintext are less secure and off by default.
- * So we make this explicitly be turned on in kconfig (in the
- * build) and turned on at runtime (changed from the default)
- * in proc/fs/cifs or via mount parm. Unfortunately this is
- * needed for old Win (e.g. Win95), some obscure NAS and OS/2
- */
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
-static void
-sess_auth_lanman(struct sess_data *sess_data)
-{
- int rc = 0;
- struct smb_hdr *smb_buf;
- SESSION_SETUP_ANDX *pSMB;
- char *bcc_ptr;
- struct cifs_ses *ses = sess_data->ses;
- char lnm_session_key[CIFS_AUTH_RESP_SIZE];
- __u16 bytes_remaining;
-
- /* lanman 2 style sessionsetup */
- /* wct = 10 */
- rc = sess_alloc_buffer(sess_data, 10);
- if (rc)
- goto out;
-
- pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
- bcc_ptr = sess_data->iov[2].iov_base;
- (void)cifs_ssetup_hdr(ses, pSMB);
-
- pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
-
- if (ses->user_name != NULL) {
- /* no capabilities flags in old lanman negotiation */
- pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
-
- /* Calculate hash with password and copy into bcc_ptr.
- * Encryption Key (stored as in cryptkey) gets used if the
- * security mode bit in Negotiate Protocol response states
- * to use challenge/response method (i.e. Password bit is 1).
- */
- rc = calc_lanman_hash(ses->password, ses->server->cryptkey,
- ses->server->sec_mode & SECMODE_PW_ENCRYPT ?
- true : false, lnm_session_key);
- if (rc)
- goto out;
-
- memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
- bcc_ptr += CIFS_AUTH_RESP_SIZE;
- } else {
- pSMB->old_req.PasswordLength = 0;
- }
-
- /*
- * can not sign if LANMAN negotiated so no need
- * to calculate signing key? but what if server
- * changed to do higher than lanman dialect and
- * we reconnected would we ever calc signing_key?
- */
-
- cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n");
- /* Unicode not allowed for LANMAN dialects */
- ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
-
- sess_data->iov[2].iov_len = (long) bcc_ptr -
- (long) sess_data->iov[2].iov_base;
-
- rc = sess_sendreceive(sess_data);
- if (rc)
- goto out;
-
- pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
- smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
-
- /* lanman response has a word count of 3 */
- if (smb_buf->WordCount != 3) {
- rc = -EIO;
- cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
- goto out;
- }
-
- if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
- cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
-
- ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
- cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
-
- bytes_remaining = get_bcc(smb_buf);
- bcc_ptr = pByteArea(smb_buf);
-
- /* BB check if Unicode and decode strings */
- if (bytes_remaining == 0) {
- /* no string area to decode, do nothing */
- } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
- /* unicode string area must be word-aligned */
- if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
- ++bcc_ptr;
- --bytes_remaining;
- }
- decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
- sess_data->nls_cp);
- } else {
- decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
- sess_data->nls_cp);
- }
-
- rc = sess_establish_session(sess_data);
-out:
- sess_data->result = rc;
- sess_data->func = NULL;
- sess_free_buffer(sess_data);
-}
-
-#endif
-
-static void
-sess_auth_ntlm(struct sess_data *sess_data)
-{
- int rc = 0;
- struct smb_hdr *smb_buf;
- SESSION_SETUP_ANDX *pSMB;
- char *bcc_ptr;
- struct cifs_ses *ses = sess_data->ses;
- __u32 capabilities;
- __u16 bytes_remaining;
-
- /* old style NTLM sessionsetup */
- /* wct = 13 */
- rc = sess_alloc_buffer(sess_data, 13);
- if (rc)
- goto out;
-
- pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
- bcc_ptr = sess_data->iov[2].iov_base;
- capabilities = cifs_ssetup_hdr(ses, pSMB);
-
- pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
- if (ses->user_name != NULL) {
- pSMB->req_no_secext.CaseInsensitivePasswordLength =
- cpu_to_le16(CIFS_AUTH_RESP_SIZE);
- pSMB->req_no_secext.CaseSensitivePasswordLength =
- cpu_to_le16(CIFS_AUTH_RESP_SIZE);
-
- /* calculate ntlm response and session key */
- rc = setup_ntlm_response(ses, sess_data->nls_cp);
- if (rc) {
- cifs_dbg(VFS, "Error %d during NTLM authentication\n",
- rc);
- goto out;
- }
-
- /* copy ntlm response */
- memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
- CIFS_AUTH_RESP_SIZE);
- bcc_ptr += CIFS_AUTH_RESP_SIZE;
- memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
- CIFS_AUTH_RESP_SIZE);
- bcc_ptr += CIFS_AUTH_RESP_SIZE;
- } else {
- pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
- pSMB->req_no_secext.CaseSensitivePasswordLength = 0;
- }
-
- if (ses->capabilities & CAP_UNICODE) {
- /* unicode strings must be word aligned */
- if (sess_data->iov[0].iov_len % 2) {
- *bcc_ptr = 0;
- bcc_ptr++;
- }
- unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
- } else {
- ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
- }
-
-
- sess_data->iov[2].iov_len = (long) bcc_ptr -
- (long) sess_data->iov[2].iov_base;
-
- rc = sess_sendreceive(sess_data);
- if (rc)
- goto out;
-
- pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
- smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
-
- if (smb_buf->WordCount != 3) {
- rc = -EIO;
- cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
- goto out;
- }
-
- if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
- cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
-
- ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
- cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
-
- bytes_remaining = get_bcc(smb_buf);
- bcc_ptr = pByteArea(smb_buf);
-
- /* BB check if Unicode and decode strings */
- if (bytes_remaining == 0) {
- /* no string area to decode, do nothing */
- } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
- /* unicode string area must be word-aligned */
- if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
- ++bcc_ptr;
- --bytes_remaining;
- }
- decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
- sess_data->nls_cp);
- } else {
- decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
- sess_data->nls_cp);
- }
-
- rc = sess_establish_session(sess_data);
-out:
- sess_data->result = rc;
- sess_data->func = NULL;
- sess_free_buffer(sess_data);
- kfree(ses->auth_key.response);
- ses->auth_key.response = NULL;
-}
-
static void
sess_auth_ntlmv2(struct sess_data *sess_data)
{
@@ -1675,21 +1437,6 @@ static int select_sec(struct cifs_ses *ses, struct sess_data *sess_data)
}
switch (type) {
- case LANMAN:
- /* LANMAN and plaintext are less secure and off by default.
- * So we make this explicitly be turned on in kconfig (in the
- * build) and turned on at runtime (changed from the default)
- * in proc/fs/cifs or via mount parm. Unfortunately this is
- * needed for old Win (e.g. Win95), some obscure NAS and OS/2 */
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
- sess_data->func = sess_auth_lanman;
- break;
-#else
- return -EOPNOTSUPP;
-#endif
- case NTLM:
- sess_data->func = sess_auth_ntlm;
- break;
case NTLMv2:
sess_data->func = sess_auth_ntlmv2;
break;
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index cea39bcecbab..181514b8770d 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: LGPL-2.1
/*
- * fs/smb2/smb2maperror.c
*
* Functions which do error mapping of SMB2 status codes to POSIX errors
*
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 2dfd0d8297eb..ddc0e8f97872 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -3590,6 +3590,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
return rc;
}
+ filemap_invalidate_lock(inode->i_mapping);
/*
* We implement the punch hole through ioctl, so we need remove the page
* caches first, otherwise the data may be inconsistent with the server.
@@ -3607,6 +3608,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
sizeof(struct file_zero_data_information),
CIFSMaxBufSize, NULL, NULL);
free_xid(xid);
+ filemap_invalidate_unlock(inode->i_mapping);
return rc;
}
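
Both the ceph_fallocate and smb3_punch_hole hunks take the mapping's invalidate lock around the page-cache purge so concurrent faults or readahead cannot repopulate the range while the server-side operation is in flight. A minimal kernel-context sketch of the pattern; demo_send_punch_to_server() is hypothetical:

#include <linux/fs.h>
#include <linux/mm.h>

static int demo_send_punch_to_server(struct inode *inode, loff_t offset,
				     loff_t len);	/* hypothetical */

static int demo_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	int rc;

	/* Block page-cache fills (faults, readahead) for this mapping. */
	filemap_invalidate_lock(inode->i_mapping);

	/* Drop cached pages so they cannot go stale behind the server. */
	truncate_pagecache_range(inode, offset, offset + len - 1);

	rc = demo_send_punch_to_server(inode, offset, len);

	filemap_invalidate_unlock(inode->i_mapping);
	return rc;
}
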
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 39a938443e3e..10047cc55286 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -18,13 +18,13 @@
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/random.h>
-#include <crypto/des.h>
#include "cifs_fs_sb.h"
#include "cifs_unicode.h"
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifs_debug.h"
#include "cifsproto.h"
+#include "../cifs_common/md4.h"
#ifndef false
#define false 0
@@ -38,126 +38,29 @@
#define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8)
#define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val)))
-static void
-str_to_key(unsigned char *str, unsigned char *key)
-{
- int i;
-
- key[0] = str[0] >> 1;
- key[1] = ((str[0] & 0x01) << 6) | (str[1] >> 2);
- key[2] = ((str[1] & 0x03) << 5) | (str[2] >> 3);
- key[3] = ((str[2] & 0x07) << 4) | (str[3] >> 4);
- key[4] = ((str[3] & 0x0F) << 3) | (str[4] >> 5);
- key[5] = ((str[4] & 0x1F) << 2) | (str[5] >> 6);
- key[6] = ((str[5] & 0x3F) << 1) | (str[6] >> 7);
- key[7] = str[6] & 0x7F;
- for (i = 0; i < 8; i++)
- key[i] = (key[i] << 1);
-}
-
-static int
-smbhash(unsigned char *out, const unsigned char *in, unsigned char *key)
-{
- unsigned char key2[8];
- struct des_ctx ctx;
-
- str_to_key(key, key2);
-
- if (fips_enabled) {
- cifs_dbg(VFS, "FIPS compliance enabled: DES not permitted\n");
- return -ENOENT;
- }
-
- des_expand_key(&ctx, key2, DES_KEY_SIZE);
- des_encrypt(&ctx, out, in);
- memzero_explicit(&ctx, sizeof(ctx));
-
- return 0;
-}
-
-static int
-E_P16(unsigned char *p14, unsigned char *p16)
-{
- int rc;
- unsigned char sp8[8] =
- { 0x4b, 0x47, 0x53, 0x21, 0x40, 0x23, 0x24, 0x25 };
-
- rc = smbhash(p16, sp8, p14);
- if (rc)
- return rc;
- rc = smbhash(p16 + 8, sp8, p14 + 7);
- return rc;
-}
-
-static int
-E_P24(unsigned char *p21, const unsigned char *c8, unsigned char *p24)
-{
- int rc;
-
- rc = smbhash(p24, c8, p21);
- if (rc)
- return rc;
- rc = smbhash(p24 + 8, c8, p21 + 7);
- if (rc)
- return rc;
- rc = smbhash(p24 + 16, c8, p21 + 14);
- return rc;
-}
-
/* produce a md4 message digest from data of length n bytes */
-int
+static int
mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
{
int rc;
- struct crypto_shash *md4 = NULL;
- struct sdesc *sdescmd4 = NULL;
-
- rc = cifs_alloc_hash("md4", &md4, &sdescmd4);
- if (rc)
- goto mdfour_err;
+ struct md4_ctx mctx;
- rc = crypto_shash_init(&sdescmd4->shash);
+ rc = cifs_md4_init(&mctx);
if (rc) {
- cifs_dbg(VFS, "%s: Could not init md4 shash\n", __func__);
+ cifs_dbg(VFS, "%s: Could not init MD4\n", __func__);
goto mdfour_err;
}
- rc = crypto_shash_update(&sdescmd4->shash, link_str, link_len);
+ rc = cifs_md4_update(&mctx, link_str, link_len);
if (rc) {
- cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__);
+ cifs_dbg(VFS, "%s: Could not update MD4\n", __func__);
goto mdfour_err;
}
- rc = crypto_shash_final(&sdescmd4->shash, md4_hash);
+ rc = cifs_md4_final(&mctx, md4_hash);
if (rc)
- cifs_dbg(VFS, "%s: Could not generate md4 hash\n", __func__);
-
-mdfour_err:
- cifs_free_hash(&md4, &sdescmd4);
- return rc;
-}
-
-/*
- This implements the X/Open SMB password encryption
- It takes a password, a 8 byte "crypt key" and puts 24 bytes of
- encrypted password into p24 */
-/* Note that password must be uppercased and null terminated */
-int
-SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24)
-{
- int rc;
- unsigned char p14[14], p16[16], p21[21];
+ cifs_dbg(VFS, "%s: Could not finalize MD4\n", __func__);
- memset(p14, '\0', 14);
- memset(p16, '\0', 16);
- memset(p21, '\0', 21);
-
- memcpy(p14, passwd, 14);
- rc = E_P16(p14, p16);
- if (rc)
- return rc;
-
- memcpy(p21, p16, 16);
- rc = E_P24(p21, c8, p24);
+mdfour_err:
return rc;
}
@@ -186,25 +89,3 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16,
return rc;
}
-
-/* Does the NT MD4 hash then des encryption. */
-int
-SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24,
- const struct nls_table *codepage)
-{
- int rc;
- unsigned char p16[16], p21[21];
-
- memset(p16, '\0', 16);
- memset(p21, '\0', 21);
-
- rc = E_md4hash(passwd, p16, codepage);
- if (rc) {
- cifs_dbg(FYI, "%s Can't generate NT hash, error: %d\n",
- __func__, rc);
- return rc;
- }
- memcpy(p21, p16, 16);
- rc = E_P24(p21, c8, p24);
- return rc;
-}
diff --git a/fs/cifs_common/Makefile b/fs/cifs_common/Makefile
new file mode 100644
index 000000000000..6fedd2f88a25
--- /dev/null
+++ b/fs/cifs_common/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for Linux filesystem routines that are shared by client and server.
+#
+
+obj-$(CONFIG_CIFS_COMMON) += cifs_arc4.o
+obj-$(CONFIG_CIFS_COMMON) += cifs_md4.o
diff --git a/fs/cifs_common/arc4.h b/fs/cifs_common/arc4.h
new file mode 100644
index 000000000000..12e71ec033a1
--- /dev/null
+++ b/fs/cifs_common/arc4.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Common values for ARC4 Cipher Algorithm
+ */
+
+#ifndef _CIFS_ARC4_H
+#define _CIFS_ARC4_H
+
+#include <linux/types.h>
+
+#define ARC4_MIN_KEY_SIZE 1
+#define ARC4_MAX_KEY_SIZE 256
+#define ARC4_BLOCK_SIZE 1
+
+struct arc4_ctx {
+ u32 S[256];
+ u32 x, y;
+};
+
+int cifs_arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len);
+void cifs_arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len);
+
+#endif /* _CIFS_ARC4_H */
diff --git a/fs/cifs_common/cifs_arc4.c b/fs/cifs_common/cifs_arc4.c
new file mode 100644
index 000000000000..b964cc682944
--- /dev/null
+++ b/fs/cifs_common/cifs_arc4.c
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Cryptographic API
+ *
+ * ARC4 Cipher Algorithm
+ *
+ * Jon Oberheide <jon@oberheide.org>
+ */
+
+#include <linux/module.h>
+#include "arc4.h"
+
+MODULE_LICENSE("GPL");
+
+int cifs_arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len)
+{
+ int i, j = 0, k = 0;
+
+ ctx->x = 1;
+ ctx->y = 0;
+
+ for (i = 0; i < 256; i++)
+ ctx->S[i] = i;
+
+ for (i = 0; i < 256; i++) {
+ u32 a = ctx->S[i];
+
+ j = (j + in_key[k] + a) & 0xff;
+ ctx->S[i] = ctx->S[j];
+ ctx->S[j] = a;
+ if (++k >= key_len)
+ k = 0;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(cifs_arc4_setkey);
+
+void cifs_arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len)
+{
+ u32 *const S = ctx->S;
+ u32 x, y, a, b;
+ u32 ty, ta, tb;
+
+ if (len == 0)
+ return;
+
+ x = ctx->x;
+ y = ctx->y;
+
+ a = S[x];
+ y = (y + a) & 0xff;
+ b = S[y];
+
+ do {
+ S[y] = a;
+ a = (a + b) & 0xff;
+ S[x] = b;
+ x = (x + 1) & 0xff;
+ ta = S[x];
+ ty = (y + ta) & 0xff;
+ tb = S[ty];
+ *out++ = *in++ ^ S[a];
+ if (--len == 0)
+ break;
+ y = ty;
+ a = ta;
+ b = tb;
+ } while (true);
+
+ ctx->x = x;
+ ctx->y = y;
+}
+EXPORT_SYMBOL_GPL(cifs_arc4_crypt);
+
+static int __init
+init_cifs_common(void)
+{
+ return 0;
+}
+static void __exit
+exit_cifs_common(void)
+{
+}
+
+module_init(init_cifs_common)
+module_exit(exit_cifs_common)
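
calc_seckey() in the cifsencrypt.c hunk earlier in this diff is the in-tree consumer of these exports. A minimal kernel-context usage sketch, assuming the caller already has a key and a buffer to encrypt; the function and variable names below are illustrative:

#include <linux/slab.h>
#include <linux/errno.h>
#include "arc4.h"

/* Encrypt @len bytes of @data in place with @key; illustrative only. */
static int demo_arc4_encrypt(const u8 *key, unsigned int key_len,
			     u8 *data, unsigned int len)
{
	struct arc4_ctx *ctx;

	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	cifs_arc4_setkey(ctx, key, key_len);
	cifs_arc4_crypt(ctx, data, data, len);	/* in == out is fine */

	kfree_sensitive(ctx);	/* the key schedule is sensitive material */
	return 0;
}
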
diff --git a/fs/cifs_common/cifs_md4.c b/fs/cifs_common/cifs_md4.c
new file mode 100644
index 000000000000..50f78cfc6ce9
--- /dev/null
+++ b/fs/cifs_common/cifs_md4.c
@@ -0,0 +1,197 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Cryptographic API.
+ *
+ * MD4 Message Digest Algorithm (RFC1320).
+ *
+ * Implementation derived from Andrew Tridgell and Steve French's
+ * CIFS MD4 implementation, and the cryptoapi implementation
+ * originally based on the public domain implementation written
+ * by Colin Plumb in 1993.
+ *
+ * Copyright (c) Andrew Tridgell 1997-1998.
+ * Modified by Steve French (sfrench@us.ibm.com) 2002
+ * Copyright (c) Cryptoapi developers.
+ * Copyright (c) 2002 David S. Miller (davem@redhat.com)
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ *
+ */
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <asm/byteorder.h>
+#include "md4.h"
+
+MODULE_LICENSE("GPL");
+
+static inline u32 lshift(u32 x, unsigned int s)
+{
+ x &= 0xFFFFFFFF;
+ return ((x << s) & 0xFFFFFFFF) | (x >> (32 - s));
+}
+
+static inline u32 F(u32 x, u32 y, u32 z)
+{
+ return (x & y) | ((~x) & z);
+}
+
+static inline u32 G(u32 x, u32 y, u32 z)
+{
+ return (x & y) | (x & z) | (y & z);
+}
+
+static inline u32 H(u32 x, u32 y, u32 z)
+{
+ return x ^ y ^ z;
+}
+
+#define ROUND1(a,b,c,d,k,s) (a = lshift(a + F(b,c,d) + k, s))
+#define ROUND2(a,b,c,d,k,s) (a = lshift(a + G(b,c,d) + k + (u32)0x5A827999,s))
+#define ROUND3(a,b,c,d,k,s) (a = lshift(a + H(b,c,d) + k + (u32)0x6ED9EBA1,s))
+
+static void md4_transform(u32 *hash, u32 const *in)
+{
+ u32 a, b, c, d;
+
+ a = hash[0];
+ b = hash[1];
+ c = hash[2];
+ d = hash[3];
+
+ ROUND1(a, b, c, d, in[0], 3);
+ ROUND1(d, a, b, c, in[1], 7);
+ ROUND1(c, d, a, b, in[2], 11);
+ ROUND1(b, c, d, a, in[3], 19);
+ ROUND1(a, b, c, d, in[4], 3);
+ ROUND1(d, a, b, c, in[5], 7);
+ ROUND1(c, d, a, b, in[6], 11);
+ ROUND1(b, c, d, a, in[7], 19);
+ ROUND1(a, b, c, d, in[8], 3);
+ ROUND1(d, a, b, c, in[9], 7);
+ ROUND1(c, d, a, b, in[10], 11);
+ ROUND1(b, c, d, a, in[11], 19);
+ ROUND1(a, b, c, d, in[12], 3);
+ ROUND1(d, a, b, c, in[13], 7);
+ ROUND1(c, d, a, b, in[14], 11);
+ ROUND1(b, c, d, a, in[15], 19);
+
+ ROUND2(a, b, c, d, in[0], 3);
+ ROUND2(d, a, b, c, in[4], 5);
+ ROUND2(c, d, a, b, in[8], 9);
+ ROUND2(b, c, d, a, in[12], 13);
+ ROUND2(a, b, c, d, in[1], 3);
+ ROUND2(d, a, b, c, in[5], 5);
+ ROUND2(c, d, a, b, in[9], 9);
+ ROUND2(b, c, d, a, in[13], 13);
+ ROUND2(a, b, c, d, in[2], 3);
+ ROUND2(d, a, b, c, in[6], 5);
+ ROUND2(c, d, a, b, in[10], 9);
+ ROUND2(b, c, d, a, in[14], 13);
+ ROUND2(a, b, c, d, in[3], 3);
+ ROUND2(d, a, b, c, in[7], 5);
+ ROUND2(c, d, a, b, in[11], 9);
+ ROUND2(b, c, d, a, in[15], 13);
+
+ ROUND3(a, b, c, d, in[0], 3);
+ ROUND3(d, a, b, c, in[8], 9);
+ ROUND3(c, d, a, b, in[4], 11);
+ ROUND3(b, c, d, a, in[12], 15);
+ ROUND3(a, b, c, d, in[2], 3);
+ ROUND3(d, a, b, c, in[10], 9);
+ ROUND3(c, d, a, b, in[6], 11);
+ ROUND3(b, c, d, a, in[14], 15);
+ ROUND3(a, b, c, d, in[1], 3);
+ ROUND3(d, a, b, c, in[9], 9);
+ ROUND3(c, d, a, b, in[5], 11);
+ ROUND3(b, c, d, a, in[13], 15);
+ ROUND3(a, b, c, d, in[3], 3);
+ ROUND3(d, a, b, c, in[11], 9);
+ ROUND3(c, d, a, b, in[7], 11);
+ ROUND3(b, c, d, a, in[15], 15);
+
+ hash[0] += a;
+ hash[1] += b;
+ hash[2] += c;
+ hash[3] += d;
+}
+
+static inline void md4_transform_helper(struct md4_ctx *ctx)
+{
+ le32_to_cpu_array(ctx->block, ARRAY_SIZE(ctx->block));
+ md4_transform(ctx->hash, ctx->block);
+}
+
+int cifs_md4_init(struct md4_ctx *mctx)
+{
+ memset(mctx, 0, sizeof(struct md4_ctx));
+ mctx->hash[0] = 0x67452301;
+ mctx->hash[1] = 0xefcdab89;
+ mctx->hash[2] = 0x98badcfe;
+ mctx->hash[3] = 0x10325476;
+ mctx->byte_count = 0;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(cifs_md4_init);
+
+int cifs_md4_update(struct md4_ctx *mctx, const u8 *data, unsigned int len)
+{
+ const u32 avail = sizeof(mctx->block) - (mctx->byte_count & 0x3f);
+
+ mctx->byte_count += len;
+
+ if (avail > len) {
+ memcpy((char *)mctx->block + (sizeof(mctx->block) - avail),
+ data, len);
+ return 0;
+ }
+
+ memcpy((char *)mctx->block + (sizeof(mctx->block) - avail),
+ data, avail);
+
+ md4_transform_helper(mctx);
+ data += avail;
+ len -= avail;
+
+ while (len >= sizeof(mctx->block)) {
+ memcpy(mctx->block, data, sizeof(mctx->block));
+ md4_transform_helper(mctx);
+ data += sizeof(mctx->block);
+ len -= sizeof(mctx->block);
+ }
+
+ memcpy(mctx->block, data, len);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(cifs_md4_update);
+
+int cifs_md4_final(struct md4_ctx *mctx, u8 *out)
+{
+ const unsigned int offset = mctx->byte_count & 0x3f;
+ char *p = (char *)mctx->block + offset;
+ int padding = 56 - (offset + 1);
+
+ *p++ = 0x80;
+ if (padding < 0) {
+ memset(p, 0x00, padding + sizeof(u64));
+ md4_transform_helper(mctx);
+ p = (char *)mctx->block;
+ padding = 56;
+ }
+
+ memset(p, 0, padding);
+ mctx->block[14] = mctx->byte_count << 3;
+ mctx->block[15] = mctx->byte_count >> 29;
+ le32_to_cpu_array(mctx->block, (sizeof(mctx->block) -
+ sizeof(u64)) / sizeof(u32));
+ md4_transform(mctx->hash, mctx->block);
+ cpu_to_le32_array(mctx->hash, ARRAY_SIZE(mctx->hash));
+ memcpy(out, mctx->hash, sizeof(mctx->hash));
+ memset(mctx, 0, sizeof(*mctx));
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(cifs_md4_final);
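
The smbencrypt.c hunk above rewrites mdfour() on top of these exports. A minimal kernel-context sketch of the init/update/final sequence; the function and buffer names below are illustrative:

#include "md4.h"

/* Hash @len bytes of @data into @out (MD4_DIGEST_SIZE bytes). */
static int demo_md4(const u8 *data, unsigned int len, u8 *out)
{
	struct md4_ctx mctx;
	int rc;

	rc = cifs_md4_init(&mctx);
	if (rc)
		return rc;
	rc = cifs_md4_update(&mctx, data, len);
	if (rc)
		return rc;
	return cifs_md4_final(&mctx, out);	/* also wipes the context */
}
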
diff --git a/fs/cifs_common/md4.h b/fs/cifs_common/md4.h
new file mode 100644
index 000000000000..5337becc699a
--- /dev/null
+++ b/fs/cifs_common/md4.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Common values for MD4 Message Digest Algorithm (RFC 1320)
+ */
+
+#ifndef _CIFS_MD4_H
+#define _CIFS_MD4_H
+
+#include <linux/types.h>
+
+#define MD4_DIGEST_SIZE 16
+#define MD4_HMAC_BLOCK_SIZE 64
+#define MD4_BLOCK_WORDS 16
+#define MD4_HASH_WORDS 4
+
+struct md4_ctx {
+ u32 hash[MD4_HASH_WORDS];
+ u32 block[MD4_BLOCK_WORDS];
+ u64 byte_count;
+};
+
+
+int cifs_md4_init(struct md4_ctx *mctx);
+int cifs_md4_update(struct md4_ctx *mctx, const u8 *data, unsigned int len);
+int cifs_md4_final(struct md4_ctx *mctx, u8 *out);
+
+#endif /* _CIFS_MD4_H */
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index ac5e0c0e9181..1466b5d01cbb 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -45,7 +45,7 @@ static void configfs_d_iput(struct dentry * dentry,
/*
* Set sd->s_dentry to null only when this dentry is the one
* that is going to be killed. Otherwise configfs_d_iput may
- * run just after configfs_attach_attr and set sd->s_dentry to
+ * run just after configfs_lookup and set sd->s_dentry to
* NULL even it's still in use.
*/
if (sd->s_dentry == dentry)
@@ -417,44 +417,16 @@ static void configfs_remove_dir(struct config_item * item)
dput(dentry);
}
-
-/* attaches attribute's configfs_dirent to the dentry corresponding to the
- * attribute file
- */
-static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * dentry)
-{
- struct configfs_attribute * attr = sd->s_element;
- struct inode *inode;
-
- spin_lock(&configfs_dirent_lock);
- dentry->d_fsdata = configfs_get(sd);
- sd->s_dentry = dentry;
- spin_unlock(&configfs_dirent_lock);
-
- inode = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG);
- if (IS_ERR(inode)) {
- configfs_put(sd);
- return PTR_ERR(inode);
- }
- if (sd->s_type & CONFIGFS_ITEM_BIN_ATTR) {
- inode->i_size = 0;
- inode->i_fop = &configfs_bin_file_operations;
- } else {
- inode->i_size = PAGE_SIZE;
- inode->i_fop = &configfs_file_operations;
- }
- d_add(dentry, inode);
- return 0;
-}
-
static struct dentry * configfs_lookup(struct inode *dir,
struct dentry *dentry,
unsigned int flags)
{
struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata;
struct configfs_dirent * sd;
- int found = 0;
- int err;
+ struct inode *inode = NULL;
+
+ if (dentry->d_name.len > NAME_MAX)
+ return ERR_PTR(-ENAMETOOLONG);
/*
* Fake invisibility if dir belongs to a group/default groups hierarchy
@@ -464,36 +436,39 @@ static struct dentry * configfs_lookup(struct inode *dir,
* not complete their initialization, since the dentries of the
* attributes won't be instantiated.
*/
- err = -ENOENT;
if (!configfs_dirent_is_ready(parent_sd))
- goto out;
+ return ERR_PTR(-ENOENT);
+ spin_lock(&configfs_dirent_lock);
list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
- if (sd->s_type & CONFIGFS_NOT_PINNED) {
- const unsigned char * name = configfs_get_name(sd);
+ if ((sd->s_type & CONFIGFS_NOT_PINNED) &&
+ !strcmp(configfs_get_name(sd), dentry->d_name.name)) {
+ struct configfs_attribute *attr = sd->s_element;
+ umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
- if (strcmp(name, dentry->d_name.name))
- continue;
+ dentry->d_fsdata = configfs_get(sd);
+ sd->s_dentry = dentry;
+ spin_unlock(&configfs_dirent_lock);
- found = 1;
- err = configfs_attach_attr(sd, dentry);
- break;
+ inode = configfs_create(dentry, mode);
+ if (IS_ERR(inode)) {
+ configfs_put(sd);
+ return ERR_CAST(inode);
+ }
+ if (sd->s_type & CONFIGFS_ITEM_BIN_ATTR) {
+ inode->i_size = 0;
+ inode->i_fop = &configfs_bin_file_operations;
+ } else {
+ inode->i_size = PAGE_SIZE;
+ inode->i_fop = &configfs_file_operations;
+ }
+ goto done;
}
}
-
- if (!found) {
- /*
- * If it doesn't exist and it isn't a NOT_PINNED item,
- * it must be negative.
- */
- if (dentry->d_name.len > NAME_MAX)
- return ERR_PTR(-ENAMETOOLONG);
- d_add(dentry, NULL);
- return NULL;
- }
-
-out:
- return ERR_PTR(err);
+ spin_unlock(&configfs_dirent_lock);
+done:
+ d_add(dentry, inode);
+ return NULL;
}
/*
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index d00455440d08..eb538c28df94 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -26,7 +26,7 @@
* it to find the directory entry again if requested. Naively, that would just
* mean using the ciphertext filenames. However, since the ciphertext filenames
* can contain illegal characters ('\0' and '/'), they must be encoded in some
- * way. We use base64. But that can cause names to exceed NAME_MAX (255
+ * way. We use base64url. But that can cause names to exceed NAME_MAX (255
* bytes), so we also need to use a strong hash to abbreviate long names.
*
* The filesystem may also need another kind of hash, the "dirhash", to quickly
@@ -38,7 +38,7 @@
* casefolded directories use this type of dirhash. At least in these cases,
* each no-key name must include the name's dirhash too.
*
- * To meet all these requirements, we base64-encode the following
+ * To meet all these requirements, we base64url-encode the following
* variable-length structure. It contains the dirhash, or 0's if the filesystem
* didn't provide one; up to 149 bytes of the ciphertext name; and for
* ciphertexts longer than 149 bytes, also the SHA-256 of the remaining bytes.
@@ -52,15 +52,19 @@ struct fscrypt_nokey_name {
u32 dirhash[2];
u8 bytes[149];
u8 sha256[SHA256_DIGEST_SIZE];
-}; /* 189 bytes => 252 bytes base64-encoded, which is <= NAME_MAX (255) */
+}; /* 189 bytes => 252 bytes base64url-encoded, which is <= NAME_MAX (255) */
/*
- * Decoded size of max-size nokey name, i.e. a name that was abbreviated using
+ * Decoded size of max-size no-key name, i.e. a name that was abbreviated using
* the strong hash and thus includes the 'sha256' field. This isn't simply
* sizeof(struct fscrypt_nokey_name), as the padding at the end isn't included.
*/
#define FSCRYPT_NOKEY_NAME_MAX offsetofend(struct fscrypt_nokey_name, sha256)
+/* Encoded size of max-size no-key name */
+#define FSCRYPT_NOKEY_NAME_MAX_ENCODED \
+ FSCRYPT_BASE64URL_CHARS(FSCRYPT_NOKEY_NAME_MAX)
+
static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
{
if (str->len == 1 && str->name[0] == '.')
@@ -175,62 +179,82 @@ static int fname_decrypt(const struct inode *inode,
return 0;
}
-static const char lookup_table[65] =
- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
+static const char base64url_table[65] =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
-#define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3)
+#define FSCRYPT_BASE64URL_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3)
/**
- * base64_encode() - base64-encode some bytes
- * @src: the bytes to encode
- * @len: number of bytes to encode
- * @dst: (output) the base64-encoded string. Not NUL-terminated.
+ * fscrypt_base64url_encode() - base64url-encode some binary data
+ * @src: the binary data to encode
+ * @srclen: the length of @src in bytes
+ * @dst: (output) the base64url-encoded string. Not NUL-terminated.
*
- * Encodes the input string using characters from the set [A-Za-z0-9+,].
- * The encoded string is roughly 4/3 times the size of the input string.
+ * Encodes data using base64url encoding, i.e. the "Base 64 Encoding with URL
+ * and Filename Safe Alphabet" specified by RFC 4648. '='-padding isn't used,
+ * as it's unneeded and not required by the RFC. base64url is used instead of
+ * base64 to avoid the '/' character, which isn't allowed in filenames.
*
- * Return: length of the encoded string
+ * Return: the length of the resulting base64url-encoded string in bytes.
+ * This will be equal to FSCRYPT_BASE64URL_CHARS(srclen).
*/
-static int base64_encode(const u8 *src, int len, char *dst)
+static int fscrypt_base64url_encode(const u8 *src, int srclen, char *dst)
{
- int i, bits = 0, ac = 0;
+ u32 ac = 0;
+ int bits = 0;
+ int i;
char *cp = dst;
- for (i = 0; i < len; i++) {
- ac += src[i] << bits;
+ for (i = 0; i < srclen; i++) {
+ ac = (ac << 8) | src[i];
bits += 8;
do {
- *cp++ = lookup_table[ac & 0x3f];
- ac >>= 6;
bits -= 6;
+ *cp++ = base64url_table[(ac >> bits) & 0x3f];
} while (bits >= 6);
}
if (bits)
- *cp++ = lookup_table[ac & 0x3f];
+ *cp++ = base64url_table[(ac << (6 - bits)) & 0x3f];
return cp - dst;
}
-static int base64_decode(const char *src, int len, u8 *dst)
+/**
+ * fscrypt_base64url_decode() - base64url-decode a string
+ * @src: the string to decode. Doesn't need to be NUL-terminated.
+ * @srclen: the length of @src in bytes
+ * @dst: (output) the decoded binary data
+ *
+ * Decodes a string using base64url encoding, i.e. the "Base 64 Encoding with
+ * URL and Filename Safe Alphabet" specified by RFC 4648. '='-padding isn't
+ * accepted, nor are non-encoding characters such as whitespace.
+ *
+ * This implementation hasn't been optimized for performance.
+ *
+ * Return: the length of the resulting decoded binary data in bytes,
+ * or -1 if the string isn't a valid base64url string.
+ */
+static int fscrypt_base64url_decode(const char *src, int srclen, u8 *dst)
{
- int i, bits = 0, ac = 0;
- const char *p;
- u8 *cp = dst;
+ u32 ac = 0;
+ int bits = 0;
+ int i;
+ u8 *bp = dst;
+
+ for (i = 0; i < srclen; i++) {
+ const char *p = strchr(base64url_table, src[i]);
- for (i = 0; i < len; i++) {
- p = strchr(lookup_table, src[i]);
if (p == NULL || src[i] == 0)
- return -2;
- ac += (p - lookup_table) << bits;
+ return -1;
+ ac = (ac << 6) | (p - base64url_table);
bits += 6;
if (bits >= 8) {
- *cp++ = ac & 0xff;
- ac >>= 8;
bits -= 8;
+ *bp++ = (u8)(ac >> bits);
}
}
- if (ac)
+ if (ac & ((1 << bits) - 1))
return -1;
- return cp - dst;
+ return bp - dst;
}
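As a sanity check on the arithmetic above, here is a hedged, self-contained userspace sketch (not the kernel code) of the same big-endian bit-packing. It shows why FSCRYPT_BASE64URL_CHARS(nbytes) is DIV_ROUND_UP(nbytes * 4, 3): every 3 input bytes become exactly 4 output characters, and a 1- or 2-byte tail becomes 2 or 3 characters.

	#include <stdint.h>
	#include <stdio.h>

	static const char tbl[65] =
		"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";

	/* Same accumulator scheme as fscrypt_base64url_encode() above. */
	static int demo_encode(const uint8_t *src, int srclen, char *dst)
	{
		uint32_t ac = 0;
		int bits = 0, i;
		char *cp = dst;

		for (i = 0; i < srclen; i++) {
			ac = (ac << 8) | src[i];
			bits += 8;
			do {
				bits -= 6;
				*cp++ = tbl[(ac >> bits) & 0x3f];
			} while (bits >= 6);
		}
		if (bits)
			*cp++ = tbl[(ac << (6 - bits)) & 0x3f];
		return cp - dst;
	}

	int main(void)
	{
		char out[16];
		int n = demo_encode((const uint8_t *)"abc", 3, out);

		printf("%.*s (%d chars)\n", n, out, n);	/* prints "YWJj (4 chars)" */
		return 0;
	}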
bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
@@ -263,10 +287,8 @@ bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
int fscrypt_fname_alloc_buffer(u32 max_encrypted_len,
struct fscrypt_str *crypto_str)
{
- const u32 max_encoded_len = BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX);
- u32 max_presented_len;
-
- max_presented_len = max(max_encoded_len, max_encrypted_len);
+ u32 max_presented_len = max_t(u32, FSCRYPT_NOKEY_NAME_MAX_ENCODED,
+ max_encrypted_len);
crypto_str->name = kmalloc(max_presented_len + 1, GFP_NOFS);
if (!crypto_str->name)
@@ -342,7 +364,7 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode,
offsetof(struct fscrypt_nokey_name, bytes));
BUILD_BUG_ON(offsetofend(struct fscrypt_nokey_name, bytes) !=
offsetof(struct fscrypt_nokey_name, sha256));
- BUILD_BUG_ON(BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX) > NAME_MAX);
+ BUILD_BUG_ON(FSCRYPT_NOKEY_NAME_MAX_ENCODED > NAME_MAX);
nokey_name.dirhash[0] = hash;
nokey_name.dirhash[1] = minor_hash;
@@ -358,7 +380,8 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode,
nokey_name.sha256);
size = FSCRYPT_NOKEY_NAME_MAX;
}
- oname->len = base64_encode((const u8 *)&nokey_name, size, oname->name);
+ oname->len = fscrypt_base64url_encode((const u8 *)&nokey_name, size,
+ oname->name);
return 0;
}
EXPORT_SYMBOL(fscrypt_fname_disk_to_usr);
@@ -432,14 +455,15 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
* user-supplied name
*/
- if (iname->len > BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX))
+ if (iname->len > FSCRYPT_NOKEY_NAME_MAX_ENCODED)
return -ENOENT;
fname->crypto_buf.name = kmalloc(FSCRYPT_NOKEY_NAME_MAX, GFP_KERNEL);
if (fname->crypto_buf.name == NULL)
return -ENOMEM;
- ret = base64_decode(iname->name, iname->len, fname->crypto_buf.name);
+ ret = fscrypt_base64url_decode(iname->name, iname->len,
+ fname->crypto_buf.name);
if (ret < (int)offsetof(struct fscrypt_nokey_name, bytes[1]) ||
(ret > offsetof(struct fscrypt_nokey_name, sha256) &&
ret != FSCRYPT_NOKEY_NAME_MAX)) {
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index a73b0376e6f3..af74599ae1cf 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -384,3 +384,47 @@ err_kfree:
return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(fscrypt_get_symlink);
+
+/**
+ * fscrypt_symlink_getattr() - set the correct st_size for encrypted symlinks
+ * @path: the path for the encrypted symlink being queried
+ * @stat: the struct being filled with the symlink's attributes
+ *
+ * Override st_size of encrypted symlinks to be the length of the decrypted
+ * symlink target (or the no-key encoded symlink target, if the key is
+ * unavailable) rather than the length of the encrypted symlink target. This is
+ * necessary for st_size to match the symlink target that userspace actually
+ * sees. POSIX requires this, and some userspace programs depend on it.
+ *
+ * This requires reading the symlink target from disk if needed, setting up the
+ * inode's encryption key if possible, and then decrypting or encoding the
+ * symlink target. This makes lstat() more heavyweight than is normally the
+ * case. However, decrypted symlink targets will be cached in ->i_link, so
+ * usually the symlink won't have to be read and decrypted again later if/when
+ * it is actually followed, readlink() is called, or lstat() is called again.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat)
+{
+ struct dentry *dentry = path->dentry;
+ struct inode *inode = d_inode(dentry);
+ const char *link;
+ DEFINE_DELAYED_CALL(done);
+
+ /*
+ * To get the symlink target that userspace will see (whether it's the
+ * decrypted target or the no-key encoded target), we can just get it in
+ * the same way the VFS does during path resolution and readlink().
+ */
+ link = READ_ONCE(inode->i_link);
+ if (!link) {
+ link = inode->i_op->get_link(dentry, inode, &done);
+ if (IS_ERR(link))
+ return PTR_ERR(link);
+ }
+ stat->size = strlen(link);
+ do_delayed_call(&done);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fscrypt_symlink_getattr);
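For context, a hedged sketch of how a filesystem's symlink ->getattr might chain into this helper. The "myfs" names are illustrative placeholders rather than real symbols, and the ->getattr prototype assumed here is the user-namespace variant in use around this series.

	/* Illustrative only: wire fscrypt_symlink_getattr() into a symlink getattr. */
	static int myfs_symlink_getattr(struct user_namespace *mnt_userns,
					const struct path *path, struct kstat *stat,
					u32 request_mask, unsigned int query_flags)
	{
		/* fill stat as usual (hypothetical helper standing in for the fs) */
		myfs_getattr(mnt_userns, path, stat, request_mask, query_flags);

		/* then let fscrypt fix up st_size for the presented target */
		return fscrypt_symlink_getattr(path, stat);
	}

	static const struct inode_operations myfs_encrypted_symlink_iops = {
		.get_link	= myfs_get_link,	/* hypothetical, would call fscrypt_get_symlink() */
		.getattr	= myfs_symlink_getattr,
	};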
diff --git a/fs/d_path.c b/fs/d_path.c
index 23a53f7b5c71..cd60c7535181 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -22,13 +22,57 @@ static char *extract_string(struct prepend_buffer *p)
return ERR_PTR(-ENAMETOOLONG);
}
-static void prepend(struct prepend_buffer *p, const char *str, int namelen)
+static bool prepend_char(struct prepend_buffer *p, unsigned char c)
{
- p->len -= namelen;
- if (likely(p->len >= 0)) {
- p->buf -= namelen;
- memcpy(p->buf, str, namelen);
+ if (likely(p->len > 0)) {
+ p->len--;
+ *--p->buf = c;
+ return true;
+ }
+ p->len = -1;
+ return false;
+}
+
+/*
+ * The source of the prepend data can be an optimistic load
+ * of a dentry name and length. And because we don't hold any
+ * locks, the length and the pointer to the name may not be
+ * in sync if a concurrent rename happens, and the kernel
+ * copy might fault as a result.
+ *
+ * The end result will correct itself when we check the
+ * rename sequence count, but we need to be able to handle
+ * the fault gracefully.
+ */
+static bool prepend_copy(void *dst, const void *src, int len)
+{
+ if (unlikely(copy_from_kernel_nofault(dst, src, len))) {
+ memset(dst, 'x', len);
+ return false;
}
+ return true;
+}
+
+static bool prepend(struct prepend_buffer *p, const char *str, int namelen)
+{
+ // Already overflowed?
+ if (p->len < 0)
+ return false;
+
+ // Will overflow?
+ if (p->len < namelen) {
+ // Fill as much as possible from the end of the name
+ str += namelen - p->len;
+ p->buf -= p->len;
+ prepend_copy(p->buf, str, p->len);
+ p->len = -1;
+ return false;
+ }
+
+ // Fits fully
+ p->len -= namelen;
+ p->buf -= namelen;
+ return prepend_copy(p->buf, str, namelen);
}
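To make the buffer discipline concrete, here is a hedged, self-contained userspace analogue (simplified, with no fault handling) of the fill-from-the-end scheme these helpers implement: components are prepended right-to-left and the finished string ends up at p->buf.

	#include <stdbool.h>
	#include <stdio.h>
	#include <string.h>

	struct prepend_buffer { char *buf; int len; };

	static bool demo_prepend(struct prepend_buffer *p, const char *s, int n)
	{
		if (p->len < n) { p->len = -1; return false; }	/* would overflow */
		p->len -= n;
		p->buf -= n;
		memcpy(p->buf, s, n);
		return true;
	}

	int main(void)
	{
		char page[64];
		struct prepend_buffer b = { page + sizeof(page), sizeof(page) };

		demo_prepend(&b, "", 1);		/* terminating NUL goes in first */
		demo_prepend(&b, "passwd", 6);
		demo_prepend(&b, "/", 1);
		demo_prepend(&b, "etc", 3);
		demo_prepend(&b, "/", 1);
		printf("%s\n", b.buf);			/* prints /etc/passwd */
		return 0;
	}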
/**
@@ -40,32 +84,21 @@ static void prepend(struct prepend_buffer *p, const char *str, int namelen)
* With RCU path tracing, it may race with d_move(). Use READ_ONCE() to
* make sure that either the old or the new name pointer and length are
* fetched. However, there may be mismatch between length and pointer.
- * The length cannot be trusted, we need to copy it byte-by-byte until
- * the length is reached or a null byte is found. It also prepends "/" at
+ * But since the length cannot be trusted, we need to copy the name very
+ * carefully when doing the prepend_copy(). It also prepends "/" at
* the beginning of the name. The sequence number check at the caller will
* retry it again when a d_move() does happen. So any garbage in the buffer
* due to mismatched pointer and length will be discarded.
*
- * Load acquire is needed to make sure that we see that terminating NUL.
+ * Load acquire is needed to make sure that we see the new name data even
+ * if we might get the length wrong.
*/
static bool prepend_name(struct prepend_buffer *p, const struct qstr *name)
{
const char *dname = smp_load_acquire(&name->name); /* ^^^ */
u32 dlen = READ_ONCE(name->len);
- char *s;
- p->len -= dlen + 1;
- if (unlikely(p->len < 0))
- return false;
- s = p->buf -= dlen + 1;
- *s++ = '/';
- while (dlen--) {
- char c = *dname++;
- if (!c)
- break;
- *s++ = c;
- }
- return true;
+ return prepend(p, dname, dlen) && prepend_char(p, '/');
}
static int __prepend_path(const struct dentry *dentry, const struct mount *mnt,
@@ -158,7 +191,7 @@ restart:
b = *p;
if (b.len == p->len)
- prepend(&b, "/", 1);
+ prepend_char(&b, '/');
*p = b;
return error;
@@ -186,7 +219,7 @@ char *__d_path(const struct path *path,
{
DECLARE_BUFFER(b, buf, buflen);
- prepend(&b, "", 1);
+ prepend_char(&b, 0);
if (unlikely(prepend_path(path, root, &b) > 0))
return NULL;
return extract_string(&b);
@@ -198,7 +231,7 @@ char *d_absolute_path(const struct path *path,
struct path root = {};
DECLARE_BUFFER(b, buf, buflen);
- prepend(&b, "", 1);
+ prepend_char(&b, 0);
if (unlikely(prepend_path(path, &root, &b) > 1))
return ERR_PTR(-EINVAL);
return extract_string(&b);
@@ -255,7 +288,7 @@ char *d_path(const struct path *path, char *buf, int buflen)
if (unlikely(d_unlinked(path->dentry)))
prepend(&b, " (deleted)", 11);
else
- prepend(&b, "", 1);
+ prepend_char(&b, 0);
prepend_path(path, &root, &b);
rcu_read_unlock();
@@ -290,7 +323,7 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
/* these dentries are never renamed, so d_lock is not needed */
prepend(&b, " (deleted)", 11);
prepend(&b, dentry->d_name.name, dentry->d_name.len);
- prepend(&b, "/", 1);
+ prepend_char(&b, '/');
return extract_string(&b);
}
@@ -324,7 +357,7 @@ restart:
}
done_seqretry(&rename_lock, seq);
if (b.len == p->len)
- prepend(&b, "/", 1);
+ prepend_char(&b, '/');
return extract_string(&b);
}
@@ -332,7 +365,7 @@ char *dentry_path_raw(const struct dentry *dentry, char *buf, int buflen)
{
DECLARE_BUFFER(b, buf, buflen);
- prepend(&b, "", 1);
+ prepend_char(&b, 0);
return __dentry_path(dentry, &b);
}
EXPORT_SYMBOL(dentry_path_raw);
@@ -344,7 +377,7 @@ char *dentry_path(const struct dentry *dentry, char *buf, int buflen)
if (unlikely(d_unlinked(dentry)))
prepend(&b, "//deleted", 10);
else
- prepend(&b, "", 1);
+ prepend_char(&b, 0);
return __dentry_path(dentry, &b);
}
@@ -397,7 +430,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
unsigned len;
DECLARE_BUFFER(b, page, PATH_MAX);
- prepend(&b, "", 1);
+ prepend_char(&b, 0);
if (unlikely(prepend_path(&pwd, &root, &b) > 0))
prepend(&b, "(unreachable)", 13);
rcu_read_unlock();
diff --git a/fs/dax.c b/fs/dax.c
index 99b4e78d888f..4e3e5a283a91 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1005,12 +1005,12 @@ int dax_writeback_mapping_range(struct address_space *mapping,
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
-static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
+static sector_t dax_iomap_sector(const struct iomap *iomap, loff_t pos)
{
return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
}
-static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
+static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size,
pfn_t *pfnp)
{
const sector_t sector = dax_iomap_sector(iomap, pos);
@@ -1066,6 +1066,66 @@ static vm_fault_t dax_load_hole(struct xa_state *xas,
return ret;
}
+#ifdef CONFIG_FS_DAX_PMD
+static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
+ const struct iomap *iomap, void **entry)
+{
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+ unsigned long pmd_addr = vmf->address & PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
+ struct inode *inode = mapping->host;
+ pgtable_t pgtable = NULL;
+ struct page *zero_page;
+ spinlock_t *ptl;
+ pmd_t pmd_entry;
+ pfn_t pfn;
+
+ zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
+
+ if (unlikely(!zero_page))
+ goto fallback;
+
+ pfn = page_to_pfn_t(zero_page);
+ *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
+ DAX_PMD | DAX_ZERO_PAGE, false);
+
+ if (arch_needs_pgtable_deposit()) {
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (!pgtable)
+ return VM_FAULT_OOM;
+ }
+
+ ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
+ if (!pmd_none(*(vmf->pmd))) {
+ spin_unlock(ptl);
+ goto fallback;
+ }
+
+ if (pgtable) {
+ pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
+ mm_inc_nr_ptes(vma->vm_mm);
+ }
+ pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
+ pmd_entry = pmd_mkhuge(pmd_entry);
+ set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
+ spin_unlock(ptl);
+ trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
+ return VM_FAULT_NOPAGE;
+
+fallback:
+ if (pgtable)
+ pte_free(vma->vm_mm, pgtable);
+ trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
+ return VM_FAULT_FALLBACK;
+}
+#else
+static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
+ const struct iomap *iomap, void **entry)
+{
+ return VM_FAULT_FALLBACK;
+}
+#endif /* CONFIG_FS_DAX_PMD */
+
s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
{
sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
@@ -1103,20 +1163,21 @@ s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
return size;
}
-static loff_t
-dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
- struct iomap *iomap, struct iomap *srcmap)
+static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
+ struct iov_iter *iter)
{
+ const struct iomap *iomap = &iomi->iomap;
+ loff_t length = iomap_length(iomi);
+ loff_t pos = iomi->pos;
struct block_device *bdev = iomap->bdev;
struct dax_device *dax_dev = iomap->dax_dev;
- struct iov_iter *iter = data;
loff_t end = pos + length, done = 0;
ssize_t ret = 0;
size_t xfer;
int id;
if (iov_iter_rw(iter) == READ) {
- end = min(end, i_size_read(inode));
+ end = min(end, i_size_read(iomi->inode));
if (pos >= end)
return 0;
@@ -1133,7 +1194,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
* written by write(2) is visible in mmap.
*/
if (iomap->flags & IOMAP_F_NEW) {
- invalidate_inode_pages2_range(inode->i_mapping,
+ invalidate_inode_pages2_range(iomi->inode->i_mapping,
pos >> PAGE_SHIFT,
(end - 1) >> PAGE_SHIFT);
}
@@ -1209,31 +1270,29 @@ ssize_t
dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops)
{
- struct address_space *mapping = iocb->ki_filp->f_mapping;
- struct inode *inode = mapping->host;
- loff_t pos = iocb->ki_pos, ret = 0, done = 0;
- unsigned flags = 0;
+ struct iomap_iter iomi = {
+ .inode = iocb->ki_filp->f_mapping->host,
+ .pos = iocb->ki_pos,
+ .len = iov_iter_count(iter),
+ };
+ loff_t done = 0;
+ int ret;
if (iov_iter_rw(iter) == WRITE) {
- lockdep_assert_held_write(&inode->i_rwsem);
- flags |= IOMAP_WRITE;
+ lockdep_assert_held_write(&iomi.inode->i_rwsem);
+ iomi.flags |= IOMAP_WRITE;
} else {
- lockdep_assert_held(&inode->i_rwsem);
+ lockdep_assert_held(&iomi.inode->i_rwsem);
}
if (iocb->ki_flags & IOCB_NOWAIT)
- flags |= IOMAP_NOWAIT;
+ iomi.flags |= IOMAP_NOWAIT;
- while (iov_iter_count(iter)) {
- ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
- iter, dax_iomap_actor);
- if (ret <= 0)
- break;
- pos += ret;
- done += ret;
- }
+ while ((ret = iomap_iter(&iomi, ops)) > 0)
+ iomi.processed = dax_iomap_iter(&iomi, iter);
- iocb->ki_pos += done;
+ done = iomi.pos - iocb->ki_pos;
+ iocb->ki_pos = iomi.pos;
return done ? done : ret;
}
EXPORT_SYMBOL_GPL(dax_iomap_rw);
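The loop above relies on the iomap_iter() calling convention. As a hedged reminder (a sketch of the pattern, not the exact iomap internals): the iterator returns a positive value while there is another mapping to walk, and the loop body reports how much of the current mapping it consumed via ->processed before iterating again.

	struct iomap_iter iter = {
		.inode = inode,
		.pos   = pos,
		.len   = count,
		.flags = IOMAP_WRITE,		/* or 0 for a read */
	};
	int ret;

	while ((ret = iomap_iter(&iter, ops)) > 0)
		/* handle_one_mapping() is a hypothetical stand-in for the
		 * per-extent work, e.g. dax_iomap_iter() above; it returns
		 * bytes processed, 0 to stop, or a negative errno. */
		iter.processed = handle_one_mapping(&iter);

	/* on exit, iter.pos has advanced by the total bytes processed */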
@@ -1250,44 +1309,146 @@ static vm_fault_t dax_fault_return(int error)
* flushed on write-faults (non-cow), but not read-faults.
*/
static bool dax_fault_is_synchronous(unsigned long flags,
- struct vm_area_struct *vma, struct iomap *iomap)
+ struct vm_area_struct *vma, const struct iomap *iomap)
{
return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
&& (iomap->flags & IOMAP_F_DIRTY);
}
+/*
+ * When handling a synchronous page fault and the inode needs an fsync, we can
+ * insert the PTE/PMD into the page tables only after that fsync has happened.
+ * Skip the insertion for now and return the pfn so that the caller can insert
+ * it after the fsync is done.
+ */
+static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
+{
+ if (WARN_ON_ONCE(!pfnp))
+ return VM_FAULT_SIGBUS;
+ *pfnp = pfn;
+ return VM_FAULT_NEEDDSYNC;
+}
+
+static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
+ const struct iomap_iter *iter)
+{
+ sector_t sector = dax_iomap_sector(&iter->iomap, iter->pos);
+ unsigned long vaddr = vmf->address;
+ vm_fault_t ret;
+ int error = 0;
+
+ switch (iter->iomap.type) {
+ case IOMAP_HOLE:
+ case IOMAP_UNWRITTEN:
+ clear_user_highpage(vmf->cow_page, vaddr);
+ break;
+ case IOMAP_MAPPED:
+ error = copy_cow_page_dax(iter->iomap.bdev, iter->iomap.dax_dev,
+ sector, vmf->cow_page, vaddr);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ error = -EIO;
+ break;
+ }
+
+ if (error)
+ return dax_fault_return(error);
+
+ __SetPageUptodate(vmf->cow_page);
+ ret = finish_fault(vmf);
+ if (!ret)
+ return VM_FAULT_DONE_COW;
+ return ret;
+}
+
+/**
+ * dax_fault_iter - Common actor to handle pfn insertion in PTE/PMD fault.
+ * @vmf: vm fault instance
+ * @iter: iomap iter
+ * @pfnp: pfn to be returned
+ * @xas: the dax mapping tree of a file
+ * @entry: an unlocked dax entry to be inserted
+ * @pmd: distinguish whether it is a pmd fault
+ */
+static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
+ const struct iomap_iter *iter, pfn_t *pfnp,
+ struct xa_state *xas, void **entry, bool pmd)
+{
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+ const struct iomap *iomap = &iter->iomap;
+ size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
+ loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+ bool sync = dax_fault_is_synchronous(iter->flags, vmf->vma, iomap);
+ unsigned long entry_flags = pmd ? DAX_PMD : 0;
+ int err = 0;
+ pfn_t pfn;
+
+ if (!pmd && vmf->cow_page)
+ return dax_fault_cow_page(vmf, iter);
+
+ /* if we are reading an UNWRITTEN or HOLE extent, return a hole. */
+ if (!write &&
+ (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) {
+ if (!pmd)
+ return dax_load_hole(xas, mapping, entry, vmf);
+ return dax_pmd_load_hole(xas, vmf, iomap, entry);
+ }
+
+ if (iomap->type != IOMAP_MAPPED) {
+ WARN_ON_ONCE(1);
+ return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
+ }
+
+ err = dax_iomap_pfn(&iter->iomap, pos, size, &pfn);
+ if (err)
+ return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);
+
+ *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, entry_flags,
+ write && !sync);
+
+ if (sync)
+ return dax_fault_synchronous_pfnp(pfnp, pfn);
+
+ /* insert PMD pfn */
+ if (pmd)
+ return vmf_insert_pfn_pmd(vmf, pfn, write);
+
+ /* insert PTE pfn */
+ if (write)
+ return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
+ return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
+}
+
static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
int *iomap_errp, const struct iomap_ops *ops)
{
- struct vm_area_struct *vma = vmf->vma;
- struct address_space *mapping = vma->vm_file->f_mapping;
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
- struct inode *inode = mapping->host;
- unsigned long vaddr = vmf->address;
- loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
- struct iomap iomap = { .type = IOMAP_HOLE };
- struct iomap srcmap = { .type = IOMAP_HOLE };
- unsigned flags = IOMAP_FAULT;
- int error, major = 0;
- bool write = vmf->flags & FAULT_FLAG_WRITE;
- bool sync;
+ struct iomap_iter iter = {
+ .inode = mapping->host,
+ .pos = (loff_t)vmf->pgoff << PAGE_SHIFT,
+ .len = PAGE_SIZE,
+ .flags = IOMAP_FAULT,
+ };
vm_fault_t ret = 0;
void *entry;
- pfn_t pfn;
+ int error;
- trace_dax_pte_fault(inode, vmf, ret);
+ trace_dax_pte_fault(iter.inode, vmf, ret);
/*
* Check whether offset isn't beyond end of file now. Caller is supposed
* to hold locks serializing us with truncate / punch hole so this is
* a reliable test.
*/
- if (pos >= i_size_read(inode)) {
+ if (iter.pos >= i_size_read(iter.inode)) {
ret = VM_FAULT_SIGBUS;
goto out;
}
- if (write && !vmf->cow_page)
- flags |= IOMAP_WRITE;
+ if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
+ iter.flags |= IOMAP_WRITE;
entry = grab_mapping_entry(&xas, mapping, 0);
if (xa_is_internal(entry)) {
@@ -1306,234 +1467,103 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
goto unlock_entry;
}
- /*
- * Note that we don't bother to use iomap_apply here: DAX required
- * the file system block size to be equal the page size, which means
- * that we never have to deal with more than a single extent here.
- */
- error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap);
- if (iomap_errp)
- *iomap_errp = error;
- if (error) {
- ret = dax_fault_return(error);
- goto unlock_entry;
- }
- if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
- error = -EIO; /* fs corruption? */
- goto error_finish_iomap;
- }
-
- if (vmf->cow_page) {
- sector_t sector = dax_iomap_sector(&iomap, pos);
-
- switch (iomap.type) {
- case IOMAP_HOLE:
- case IOMAP_UNWRITTEN:
- clear_user_highpage(vmf->cow_page, vaddr);
- break;
- case IOMAP_MAPPED:
- error = copy_cow_page_dax(iomap.bdev, iomap.dax_dev,
- sector, vmf->cow_page, vaddr);
- break;
- default:
- WARN_ON_ONCE(1);
- error = -EIO;
- break;
+ while ((error = iomap_iter(&iter, ops)) > 0) {
+ if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) {
+ iter.processed = -EIO; /* fs corruption? */
+ continue;
}
- if (error)
- goto error_finish_iomap;
-
- __SetPageUptodate(vmf->cow_page);
- ret = finish_fault(vmf);
- if (!ret)
- ret = VM_FAULT_DONE_COW;
- goto finish_iomap;
- }
-
- sync = dax_fault_is_synchronous(flags, vma, &iomap);
-
- switch (iomap.type) {
- case IOMAP_MAPPED:
- if (iomap.flags & IOMAP_F_NEW) {
+ ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, false);
+ if (ret != VM_FAULT_SIGBUS &&
+ (iter.iomap.flags & IOMAP_F_NEW)) {
count_vm_event(PGMAJFAULT);
- count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
- major = VM_FAULT_MAJOR;
+ count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
+ ret |= VM_FAULT_MAJOR;
}
- error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
- if (error < 0)
- goto error_finish_iomap;
-
- entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
- 0, write && !sync);
- /*
- * If we are doing synchronous page fault and inode needs fsync,
- * we can insert PTE into page tables only after that happens.
- * Skip insertion for now and return the pfn so that caller can
- * insert it after fsync is done.
- */
- if (sync) {
- if (WARN_ON_ONCE(!pfnp)) {
- error = -EIO;
- goto error_finish_iomap;
- }
- *pfnp = pfn;
- ret = VM_FAULT_NEEDDSYNC | major;
- goto finish_iomap;
- }
- trace_dax_insert_mapping(inode, vmf, entry);
- if (write)
- ret = vmf_insert_mixed_mkwrite(vma, vaddr, pfn);
- else
- ret = vmf_insert_mixed(vma, vaddr, pfn);
-
- goto finish_iomap;
- case IOMAP_UNWRITTEN:
- case IOMAP_HOLE:
- if (!write) {
- ret = dax_load_hole(&xas, mapping, &entry, vmf);
- goto finish_iomap;
- }
- fallthrough;
- default:
- WARN_ON_ONCE(1);
- error = -EIO;
- break;
+ if (!(ret & VM_FAULT_ERROR))
+ iter.processed = PAGE_SIZE;
}
- error_finish_iomap:
- ret = dax_fault_return(error);
- finish_iomap:
- if (ops->iomap_end) {
- int copied = PAGE_SIZE;
+ if (iomap_errp)
+ *iomap_errp = error;
+ if (!ret && error)
+ ret = dax_fault_return(error);
- if (ret & VM_FAULT_ERROR)
- copied = 0;
- /*
- * The fault is done by now and there's no way back (other
- * thread may be already happily using PTE we have installed).
- * Just ignore error from ->iomap_end since we cannot do much
- * with it.
- */
- ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
- }
- unlock_entry:
+unlock_entry:
dax_unlock_entry(&xas, entry);
- out:
- trace_dax_pte_fault_done(inode, vmf, ret);
- return ret | major;
+out:
+ trace_dax_pte_fault_done(iter.inode, vmf, ret);
+ return ret;
}
#ifdef CONFIG_FS_DAX_PMD
-static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
- struct iomap *iomap, void **entry)
+static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas,
+ pgoff_t max_pgoff)
{
- struct address_space *mapping = vmf->vma->vm_file->f_mapping;
unsigned long pmd_addr = vmf->address & PMD_MASK;
- struct vm_area_struct *vma = vmf->vma;
- struct inode *inode = mapping->host;
- pgtable_t pgtable = NULL;
- struct page *zero_page;
- spinlock_t *ptl;
- pmd_t pmd_entry;
- pfn_t pfn;
-
- zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
-
- if (unlikely(!zero_page))
- goto fallback;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
- pfn = page_to_pfn_t(zero_page);
- *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
- DAX_PMD | DAX_ZERO_PAGE, false);
+ /*
+ * Make sure that the faulting address's PMD offset (color) matches
+ * the PMD offset from the start of the file. This is necessary so
+ * that a PMD range in the page table overlaps exactly with a PMD
+ * range in the page cache.
+ */
+ if ((vmf->pgoff & PG_PMD_COLOUR) !=
+ ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
+ return true;
- if (arch_needs_pgtable_deposit()) {
- pgtable = pte_alloc_one(vma->vm_mm);
- if (!pgtable)
- return VM_FAULT_OOM;
- }
+ /* Fall back to PTEs if we're going to COW */
+ if (write && !(vmf->vma->vm_flags & VM_SHARED))
+ return true;
- ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
- if (!pmd_none(*(vmf->pmd))) {
- spin_unlock(ptl);
- goto fallback;
- }
+ /* If the PMD would extend outside the VMA */
+ if (pmd_addr < vmf->vma->vm_start)
+ return true;
+ if ((pmd_addr + PMD_SIZE) > vmf->vma->vm_end)
+ return true;
- if (pgtable) {
- pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
- mm_inc_nr_ptes(vma->vm_mm);
- }
- pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
- pmd_entry = pmd_mkhuge(pmd_entry);
- set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
- spin_unlock(ptl);
- trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
- return VM_FAULT_NOPAGE;
+ /* If the PMD would extend beyond the file size */
+ if ((xas->xa_index | PG_PMD_COLOUR) >= max_pgoff)
+ return true;
-fallback:
- if (pgtable)
- pte_free(vma->vm_mm, pgtable);
- trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
- return VM_FAULT_FALLBACK;
+ return false;
}
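A hedged worked example of the colour check above, assuming x86-64 geometry (4 KiB pages, 2 MiB PMDs, so PG_PMD_COLOUR is 511):

	vmf->pgoff   = 0x203;			/* file page index          */
	vmf->address = 0x7f0000203000;		/* faulting virtual address */

	pgoff colour:    0x203 & 511                  = 3
	address colour: (0x7f0000203000 >> 12) & 511  = 3

	/* the colours match, so a PMD-sized mapping can line up with the
	 * page cache; a mismatch would force the PTE fallback */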
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops)
{
- struct vm_area_struct *vma = vmf->vma;
- struct address_space *mapping = vma->vm_file->f_mapping;
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
- unsigned long pmd_addr = vmf->address & PMD_MASK;
- bool write = vmf->flags & FAULT_FLAG_WRITE;
- bool sync;
- unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
- struct inode *inode = mapping->host;
- vm_fault_t result = VM_FAULT_FALLBACK;
- struct iomap iomap = { .type = IOMAP_HOLE };
- struct iomap srcmap = { .type = IOMAP_HOLE };
+ struct iomap_iter iter = {
+ .inode = mapping->host,
+ .len = PMD_SIZE,
+ .flags = IOMAP_FAULT,
+ };
+ vm_fault_t ret = VM_FAULT_FALLBACK;
pgoff_t max_pgoff;
void *entry;
- loff_t pos;
int error;
- pfn_t pfn;
+
+ if (vmf->flags & FAULT_FLAG_WRITE)
+ iter.flags |= IOMAP_WRITE;
/*
* Check whether offset isn't beyond end of file now. Caller is
* supposed to hold locks serializing us with truncate / punch hole so
* this is a reliable test.
*/
- max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
-
- trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
-
- /*
- * Make sure that the faulting address's PMD offset (color) matches
- * the PMD offset from the start of the file. This is necessary so
- * that a PMD range in the page table overlaps exactly with a PMD
- * range in the page cache.
- */
- if ((vmf->pgoff & PG_PMD_COLOUR) !=
- ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
- goto fallback;
+ max_pgoff = DIV_ROUND_UP(i_size_read(iter.inode), PAGE_SIZE);
- /* Fall back to PTEs if we're going to COW */
- if (write && !(vma->vm_flags & VM_SHARED))
- goto fallback;
-
- /* If the PMD would extend outside the VMA */
- if (pmd_addr < vma->vm_start)
- goto fallback;
- if ((pmd_addr + PMD_SIZE) > vma->vm_end)
- goto fallback;
+ trace_dax_pmd_fault(iter.inode, vmf, max_pgoff, 0);
if (xas.xa_index >= max_pgoff) {
- result = VM_FAULT_SIGBUS;
+ ret = VM_FAULT_SIGBUS;
goto out;
}
- /* If the PMD would extend beyond the file size */
- if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff)
+ if (dax_fault_check_fallback(vmf, &xas, max_pgoff))
goto fallback;
/*
@@ -1544,7 +1574,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
*/
entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
if (xa_is_internal(entry)) {
- result = xa_to_internal(entry);
+ ret = xa_to_internal(entry);
goto fallback;
}
@@ -1556,88 +1586,30 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
*/
if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
!pmd_devmap(*vmf->pmd)) {
- result = 0;
+ ret = 0;
goto unlock_entry;
}
- /*
- * Note that we don't use iomap_apply here. We aren't doing I/O, only
- * setting up a mapping, so really we're using iomap_begin() as a way
- * to look up our filesystem block.
- */
- pos = (loff_t)xas.xa_index << PAGE_SHIFT;
- error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap,
- &srcmap);
- if (error)
- goto unlock_entry;
-
- if (iomap.offset + iomap.length < pos + PMD_SIZE)
- goto finish_iomap;
-
- sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);
-
- switch (iomap.type) {
- case IOMAP_MAPPED:
- error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
- if (error < 0)
- goto finish_iomap;
-
- entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
- DAX_PMD, write && !sync);
-
- /*
- * If we are doing synchronous page fault and inode needs fsync,
- * we can insert PMD into page tables only after that happens.
- * Skip insertion for now and return the pfn so that caller can
- * insert it after fsync is done.
- */
- if (sync) {
- if (WARN_ON_ONCE(!pfnp))
- goto finish_iomap;
- *pfnp = pfn;
- result = VM_FAULT_NEEDDSYNC;
- goto finish_iomap;
- }
+ iter.pos = (loff_t)xas.xa_index << PAGE_SHIFT;
+ while ((error = iomap_iter(&iter, ops)) > 0) {
+ if (iomap_length(&iter) < PMD_SIZE)
+ continue; /* actually breaks out of the loop */
- trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
- result = vmf_insert_pfn_pmd(vmf, pfn, write);
- break;
- case IOMAP_UNWRITTEN:
- case IOMAP_HOLE:
- if (WARN_ON_ONCE(write))
- break;
- result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry);
- break;
- default:
- WARN_ON_ONCE(1);
- break;
+ ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
+ if (ret != VM_FAULT_FALLBACK)
+ iter.processed = PMD_SIZE;
}
- finish_iomap:
- if (ops->iomap_end) {
- int copied = PMD_SIZE;
-
- if (result == VM_FAULT_FALLBACK)
- copied = 0;
- /*
- * The fault is done by now and there's no way back (other
- * thread may be already happily using PMD we have installed).
- * Just ignore error from ->iomap_end since we cannot do much
- * with it.
- */
- ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
- &iomap);
- }
- unlock_entry:
+unlock_entry:
dax_unlock_entry(&xas, entry);
- fallback:
- if (result == VM_FAULT_FALLBACK) {
- split_huge_pmd(vma, vmf->pmd, vmf->address);
+fallback:
+ if (ret == VM_FAULT_FALLBACK) {
+ split_huge_pmd(vmf->vma, vmf->pmd, vmf->address);
count_vm_event(THP_FAULT_FALLBACK);
}
out:
- trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
- return result;
+ trace_dax_pmd_fault_done(iter.inode, vmf, max_pgoff, ret);
+ return ret;
}
#else
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index df00231d3ecc..7d162b0efbf0 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -179,8 +179,10 @@ static int open_proxy_open(struct inode *inode, struct file *filp)
if (!fops_get(real_fops)) {
#ifdef CONFIG_MODULES
if (real_fops->owner &&
- real_fops->owner->state == MODULE_STATE_GOING)
+ real_fops->owner->state == MODULE_STATE_GOING) {
+ r = -ENXIO;
goto out;
+ }
#endif
/* Huh? Module did not clean up after itself at exit? */
@@ -314,8 +316,10 @@ static int full_proxy_open(struct inode *inode, struct file *filp)
if (!fops_get(real_fops)) {
#ifdef CONFIG_MODULES
if (real_fops->owner &&
- real_fops->owner->state == MODULE_STATE_GOING)
+ real_fops->owner->state == MODULE_STATE_GOING) {
+ r = -ENXIO;
goto out;
+ }
#endif
/* Huh? Module did not cleanup after itself at exit? */
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 10c36ae1a8f9..45ebbe602bbf 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -85,8 +85,10 @@ int dlm_recover_directory(struct dlm_ls *ls)
for (;;) {
int left;
error = dlm_recovery_stopped(ls);
- if (error)
+ if (error) {
+ error = -EINTR;
goto out_free;
+ }
error = dlm_rcom_names(ls, memb->nodeid,
last_name, last_len);
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 91d1ca3a121a..5f57538b5d45 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -468,7 +468,7 @@ struct dlm_rcom {
struct dlm_opt_header {
uint16_t t_type;
uint16_t t_length;
- uint32_t o_pad;
+ uint32_t t_pad;
/* need to be 8 byte aligned */
char t_value[];
};
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index d71aba8c3e64..10eddfa6c3d7 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -498,7 +498,7 @@ static int new_lockspace(const char *name, const char *cluster,
ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS |
DLM_LSFL_NEWEXCL));
- size = dlm_config.ci_rsbtbl_size;
+ size = READ_ONCE(dlm_config.ci_rsbtbl_size);
ls->ls_rsbtbl_size = size;
ls->ls_rsbtbl = vmalloc(array_size(size, sizeof(struct dlm_rsbtable)));
@@ -793,6 +793,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
if (ls_count == 1) {
dlm_scand_stop();
+ dlm_clear_members(ls);
dlm_midcomms_shutdown();
}
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 0ea9ae35da0b..8f715c620e1f 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -84,9 +84,7 @@ struct connection {
struct list_head writequeue; /* List of outgoing writequeue_entries */
spinlock_t writequeue_lock;
atomic_t writequeue_cnt;
- void (*connect_action) (struct connection *); /* What to do to connect */
- void (*shutdown_action)(struct connection *con); /* What to do to shutdown */
- bool (*eof_condition)(struct connection *con); /* What to do to eof check */
+ struct mutex wq_alloc;
int retries;
#define MAX_CONNECT_RETRIES 3
struct hlist_node list;
@@ -145,6 +143,24 @@ struct dlm_node_addr {
struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
};
+struct dlm_proto_ops {
+ bool try_new_addr;
+ const char *name;
+ int proto;
+
+ int (*connect)(struct connection *con, struct socket *sock,
+ struct sockaddr *addr, int addr_len);
+ void (*sockopts)(struct socket *sock);
+ int (*bind)(struct socket *sock);
+ int (*listen_validate)(void);
+ void (*listen_sockopts)(struct socket *sock);
+ int (*listen_bind)(struct socket *sock);
+ /* What to do to shutdown */
+ void (*shutdown_action)(struct connection *con);
+ /* How to check for EOF */
+ bool (*eof_condition)(struct connection *con);
+};
+
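A hedged example of how a transport could populate this ops table; the callback names below are placeholders, not the actual dlm lowcomms symbols, and the real series registers its TCP and SCTP instances later in this file.

	static const struct dlm_proto_ops example_tcp_ops = {
		.name			= "TCP",
		.proto			= IPPROTO_TCP,
		.try_new_addr		= false,
		.connect		= example_tcp_connect,		/* hypothetical */
		.sockopts		= example_tcp_sockopts,		/* hypothetical */
		.bind			= example_tcp_bind,		/* hypothetical */
		.listen_validate	= example_tcp_listen_validate,	/* hypothetical */
		.listen_sockopts	= example_tcp_listen_sockopts,	/* hypothetical */
		.listen_bind		= example_tcp_listen_bind,	/* hypothetical */
		.shutdown_action	= example_tcp_shutdown,		/* hypothetical */
		.eof_condition		= example_tcp_eof_condition,	/* hypothetical */
	};

	/* selected once at startup, e.g. based on dlm_config.ci_protocol:
	 *	dlm_proto_ops = &example_tcp_ops;
	 */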
static struct listen_sock_callbacks {
void (*sk_error_report)(struct sock *);
void (*sk_data_ready)(struct sock *);
@@ -168,12 +184,26 @@ static struct hlist_head connection_hash[CONN_HASH_SIZE];
static DEFINE_SPINLOCK(connections_lock);
DEFINE_STATIC_SRCU(connections_srcu);
+static const struct dlm_proto_ops *dlm_proto_ops;
+
static void process_recv_sockets(struct work_struct *work);
static void process_send_sockets(struct work_struct *work);
-static void sctp_connect_to_sock(struct connection *con);
-static void tcp_connect_to_sock(struct connection *con);
-static void dlm_tcp_shutdown(struct connection *con);
+/* must be called with writequeue_lock held */
+static struct writequeue_entry *con_next_wq(struct connection *con)
+{
+ struct writequeue_entry *e;
+
+ if (list_empty(&con->writequeue))
+ return NULL;
+
+ e = list_first_entry(&con->writequeue, struct writequeue_entry,
+ list);
+ if (e->len == 0)
+ return NULL;
+
+ return e;
+}
static struct connection *__find_con(int nodeid, int r)
{
@@ -208,20 +238,6 @@ static int dlm_con_init(struct connection *con, int nodeid)
INIT_WORK(&con->rwork, process_recv_sockets);
init_waitqueue_head(&con->shutdown_wait);
- switch (dlm_config.ci_protocol) {
- case DLM_PROTO_TCP:
- con->connect_action = tcp_connect_to_sock;
- con->shutdown_action = dlm_tcp_shutdown;
- con->eof_condition = tcp_eof_condition;
- break;
- case DLM_PROTO_SCTP:
- con->connect_action = sctp_connect_to_sock;
- break;
- default:
- kfree(con->rx_buf);
- return -EINVAL;
- }
-
return 0;
}
@@ -249,6 +265,8 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc)
return NULL;
}
+ mutex_init(&con->wq_alloc);
+
spin_lock(&connections_lock);
/* Because multiple workqueues/threads calls this function it can
* race on multiple cpu's. Instead of locking hot path __find_con()
@@ -583,8 +601,7 @@ static void lowcomms_error_report(struct sock *sk)
goto out;
orig_report = listen_sock.sk_error_report;
- if (con->sock == NULL ||
- kernel_getpeername(con->sock, (struct sockaddr *)&saddr) < 0) {
+ if (kernel_getpeername(sk->sk_socket, (struct sockaddr *)&saddr) < 0) {
printk_ratelimited(KERN_ERR "dlm: node %d: socket error "
"sending to node %d, port %d, "
"sk_err=%d/%d\n", dlm_our_nodeid(),
@@ -801,6 +818,7 @@ static void close_connection(struct connection *con, bool and_other,
con->rx_leftover = 0;
con->retries = 0;
+ clear_bit(CF_APP_LIMITED, &con->flags);
clear_bit(CF_CONNECTED, &con->flags);
clear_bit(CF_DELAY_CONNECT, &con->flags);
clear_bit(CF_RECONNECT, &con->flags);
@@ -877,7 +895,6 @@ static int con_realloc_receive_buf(struct connection *con, int newlen)
/* Data received from remote end */
static int receive_from_sock(struct connection *con)
{
- int call_again_soon = 0;
struct msghdr msg;
struct kvec iov;
int ret, buflen;
@@ -897,41 +914,40 @@ static int receive_from_sock(struct connection *con)
goto out_resched;
}
- /* calculate new buffer parameter regarding last receive and
- * possible leftover bytes
- */
- iov.iov_base = con->rx_buf + con->rx_leftover;
- iov.iov_len = con->rx_buflen - con->rx_leftover;
-
- memset(&msg, 0, sizeof(msg));
- msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
- ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len,
- msg.msg_flags);
- if (ret <= 0)
- goto out_close;
- else if (ret == iov.iov_len)
- call_again_soon = 1;
-
- /* new buflen according readed bytes and leftover from last receive */
- buflen = ret + con->rx_leftover;
- ret = dlm_process_incoming_buffer(con->nodeid, con->rx_buf, buflen);
- if (ret < 0)
- goto out_close;
+ for (;;) {
+ /* calculate new buffer parameters based on the last receive and
+ * any possible leftover bytes
+ */
+ iov.iov_base = con->rx_buf + con->rx_leftover;
+ iov.iov_len = con->rx_buflen - con->rx_leftover;
+
+ memset(&msg, 0, sizeof(msg));
+ msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
+ ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len,
+ msg.msg_flags);
+ if (ret == -EAGAIN)
+ break;
+ else if (ret <= 0)
+ goto out_close;
- /* calculate leftover bytes from process and put it into begin of
- * the receive buffer, so next receive we have the full message
- * at the start address of the receive buffer.
- */
- con->rx_leftover = buflen - ret;
- if (con->rx_leftover) {
- memmove(con->rx_buf, con->rx_buf + ret,
- con->rx_leftover);
- call_again_soon = true;
+ /* new buflen according to the bytes read plus leftover from the last receive */
+ buflen = ret + con->rx_leftover;
+ ret = dlm_process_incoming_buffer(con->nodeid, con->rx_buf, buflen);
+ if (ret < 0)
+ goto out_close;
+
+ /* calculate the leftover bytes from processing and move them to the
+ * beginning of the receive buffer, so the next receive has the full
+ * message at the start address of the receive buffer.
+ */
+ con->rx_leftover = buflen - ret;
+ if (con->rx_leftover) {
+ memmove(con->rx_buf, con->rx_buf + ret,
+ con->rx_leftover);
+ }
}
- if (call_again_soon)
- goto out_resched;
-
+ dlm_midcomms_receive_done(con->nodeid);
mutex_unlock(&con->sock_mutex);
return 0;
@@ -946,7 +962,8 @@ out_close:
log_print("connection %p got EOF from %d",
con, con->nodeid);
- if (con->eof_condition && con->eof_condition(con)) {
+ if (dlm_proto_ops->eof_condition &&
+ dlm_proto_ops->eof_condition(con)) {
set_bit(CF_EOF, &con->flags);
mutex_unlock(&con->sock_mutex);
} else {
@@ -1134,242 +1151,6 @@ static int sctp_bind_addrs(struct socket *sock, uint16_t port)
return result;
}
-/* Initiate an SCTP association.
- This is a special case of send_to_sock() in that we don't yet have a
- peeled-off socket for this association, so we use the listening socket
- and add the primary IP address of the remote node.
- */
-static void sctp_connect_to_sock(struct connection *con)
-{
- struct sockaddr_storage daddr;
- int result;
- int addr_len;
- struct socket *sock;
- unsigned int mark;
-
- mutex_lock(&con->sock_mutex);
-
- /* Some odd races can cause double-connects, ignore them */
- if (con->retries++ > MAX_CONNECT_RETRIES)
- goto out;
-
- if (con->sock) {
- log_print("node %d already connected.", con->nodeid);
- goto out;
- }
-
- memset(&daddr, 0, sizeof(daddr));
- result = nodeid_to_addr(con->nodeid, &daddr, NULL, true, &mark);
- if (result < 0) {
- log_print("no address for nodeid %d", con->nodeid);
- goto out;
- }
-
- /* Create a socket to communicate with */
- result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
- SOCK_STREAM, IPPROTO_SCTP, &sock);
- if (result < 0)
- goto socket_err;
-
- sock_set_mark(sock->sk, mark);
-
- add_sock(sock, con);
-
- /* Bind to all addresses. */
- if (sctp_bind_addrs(con->sock, 0))
- goto bind_err;
-
- make_sockaddr(&daddr, dlm_config.ci_tcp_port, &addr_len);
-
- log_print_ratelimited("connecting to %d", con->nodeid);
-
- /* Turn off Nagle's algorithm */
- sctp_sock_set_nodelay(sock->sk);
-
- /*
- * Make sock->ops->connect() function return in specified time,
- * since O_NONBLOCK argument in connect() function does not work here,
- * then, we should restore the default value of this attribute.
- */
- sock_set_sndtimeo(sock->sk, 5);
- result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len,
- 0);
- sock_set_sndtimeo(sock->sk, 0);
-
- if (result == -EINPROGRESS)
- result = 0;
- if (result == 0) {
- if (!test_and_set_bit(CF_CONNECTED, &con->flags))
- log_print("successful connected to node %d", con->nodeid);
- goto out;
- }
-
-bind_err:
- con->sock = NULL;
- sock_release(sock);
-
-socket_err:
- /*
- * Some errors are fatal and this list might need adjusting. For other
- * errors we try again until the max number of retries is reached.
- */
- if (result != -EHOSTUNREACH &&
- result != -ENETUNREACH &&
- result != -ENETDOWN &&
- result != -EINVAL &&
- result != -EPROTONOSUPPORT) {
- log_print("connect %d try %d error %d", con->nodeid,
- con->retries, result);
- mutex_unlock(&con->sock_mutex);
- msleep(1000);
- lowcomms_connect_sock(con);
- return;
- }
-
-out:
- mutex_unlock(&con->sock_mutex);
-}
-
-/* Connect a new socket to its peer */
-static void tcp_connect_to_sock(struct connection *con)
-{
- struct sockaddr_storage saddr, src_addr;
- unsigned int mark;
- int addr_len;
- struct socket *sock = NULL;
- int result;
-
- mutex_lock(&con->sock_mutex);
- if (con->retries++ > MAX_CONNECT_RETRIES)
- goto out;
-
- /* Some odd races can cause double-connects, ignore them */
- if (con->sock)
- goto out;
-
- /* Create a socket to communicate with */
- result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
- SOCK_STREAM, IPPROTO_TCP, &sock);
- if (result < 0)
- goto out_err;
-
- memset(&saddr, 0, sizeof(saddr));
- result = nodeid_to_addr(con->nodeid, &saddr, NULL, false, &mark);
- if (result < 0) {
- log_print("no address for nodeid %d", con->nodeid);
- goto out_err;
- }
-
- sock_set_mark(sock->sk, mark);
-
- add_sock(sock, con);
-
- /* Bind to our cluster-known address connecting to avoid
- routing problems */
- memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr));
- make_sockaddr(&src_addr, 0, &addr_len);
- result = sock->ops->bind(sock, (struct sockaddr *) &src_addr,
- addr_len);
- if (result < 0) {
- log_print("could not bind for connect: %d", result);
- /* This *may* not indicate a critical error */
- }
-
- make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
-
- log_print_ratelimited("connecting to %d", con->nodeid);
-
- /* Turn off Nagle's algorithm */
- tcp_sock_set_nodelay(sock->sk);
-
- result = sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len,
- O_NONBLOCK);
- if (result == -EINPROGRESS)
- result = 0;
- if (result == 0)
- goto out;
-
-out_err:
- if (con->sock) {
- sock_release(con->sock);
- con->sock = NULL;
- } else if (sock) {
- sock_release(sock);
- }
- /*
- * Some errors are fatal and this list might need adjusting. For other
- * errors we try again until the max number of retries is reached.
- */
- if (result != -EHOSTUNREACH &&
- result != -ENETUNREACH &&
- result != -ENETDOWN &&
- result != -EINVAL &&
- result != -EPROTONOSUPPORT) {
- log_print("connect %d try %d error %d", con->nodeid,
- con->retries, result);
- mutex_unlock(&con->sock_mutex);
- msleep(1000);
- lowcomms_connect_sock(con);
- return;
- }
-out:
- mutex_unlock(&con->sock_mutex);
- return;
-}
-
-/* On error caller must run dlm_close_sock() for the
- * listen connection socket.
- */
-static int tcp_create_listen_sock(struct listen_connection *con,
- struct sockaddr_storage *saddr)
-{
- struct socket *sock = NULL;
- int result = 0;
- int addr_len;
-
- if (dlm_local_addr[0]->ss_family == AF_INET)
- addr_len = sizeof(struct sockaddr_in);
- else
- addr_len = sizeof(struct sockaddr_in6);
-
- /* Create a socket to communicate with */
- result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
- SOCK_STREAM, IPPROTO_TCP, &sock);
- if (result < 0) {
- log_print("Can't create listening comms socket");
- goto create_out;
- }
-
- sock_set_mark(sock->sk, dlm_config.ci_mark);
-
- /* Turn off Nagle's algorithm */
- tcp_sock_set_nodelay(sock->sk);
-
- sock_set_reuseaddr(sock->sk);
-
- add_listen_sock(sock, con);
-
- /* Bind to our port */
- make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len);
- result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len);
- if (result < 0) {
- log_print("Can't bind to port %d", dlm_config.ci_tcp_port);
- goto create_out;
- }
- sock_set_keepalive(sock->sk);
-
- result = sock->ops->listen(sock, 5);
- if (result < 0) {
- log_print("Can't listen on port %d", dlm_config.ci_tcp_port);
- goto create_out;
- }
-
- return 0;
-
-create_out:
- return result;
-}
-
/* Get local addresses */
static void init_local(void)
{
@@ -1396,63 +1177,6 @@ static void deinit_local(void)
kfree(dlm_local_addr[i]);
}
-/* Initialise SCTP socket and bind to all interfaces
- * On error caller must run dlm_close_sock() for the
- * listen connection socket.
- */
-static int sctp_listen_for_all(struct listen_connection *con)
-{
- struct socket *sock = NULL;
- int result = -EINVAL;
-
- log_print("Using SCTP for communications");
-
- result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
- SOCK_STREAM, IPPROTO_SCTP, &sock);
- if (result < 0) {
- log_print("Can't create comms socket, check SCTP is loaded");
- goto out;
- }
-
- sock_set_rcvbuf(sock->sk, NEEDED_RMEM);
- sock_set_mark(sock->sk, dlm_config.ci_mark);
- sctp_sock_set_nodelay(sock->sk);
-
- add_listen_sock(sock, con);
-
- /* Bind to all addresses. */
- result = sctp_bind_addrs(con->sock, dlm_config.ci_tcp_port);
- if (result < 0)
- goto out;
-
- result = sock->ops->listen(sock, 5);
- if (result < 0) {
- log_print("Can't set socket listening");
- goto out;
- }
-
- return 0;
-
-out:
- return result;
-}
-
-static int tcp_listen_for_all(void)
-{
- /* We don't support multi-homed hosts */
- if (dlm_local_count > 1) {
- log_print("TCP protocol can't handle multi-homed hosts, "
- "try SCTP");
- return -EINVAL;
- }
-
- log_print("Using TCP for communications");
-
- return tcp_create_listen_sock(&listen_con, dlm_local_addr[0]);
-}
-
-
-
static struct writequeue_entry *new_writequeue_entry(struct connection *con,
gfp_t allocation)
{
@@ -1528,19 +1252,37 @@ static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len,
{
struct writequeue_entry *e;
struct dlm_msg *msg;
+ bool sleepable;
msg = kzalloc(sizeof(*msg), allocation);
if (!msg)
return NULL;
+ /* This mutex is used as a wait to avoid multiple "fast"
+ * new writequeue page list entry allocations in new_wq_entry()
+ * during normal operation, which is a sleepable context. Without
+ * it we could end up with multiple writequeue entries, each
+ * holding only one dlm message, because multiple callers were
+ * waiting on writequeue_lock in new_wq_entry().
+ */
+ sleepable = gfpflags_normal_context(allocation);
+ if (sleepable)
+ mutex_lock(&con->wq_alloc);
+
kref_init(&msg->ref);
e = new_wq_entry(con, len, allocation, ppc, cb, mh);
if (!e) {
+ if (sleepable)
+ mutex_unlock(&con->wq_alloc);
+
kfree(msg);
return NULL;
}
+ if (sleepable)
+ mutex_unlock(&con->wq_alloc);
+
msg->ppc = *ppc;
msg->len = len;
msg->entry = e;
@@ -1646,10 +1388,9 @@ int dlm_lowcomms_resend_msg(struct dlm_msg *msg)
/* Send a message */
static void send_to_sock(struct connection *con)
{
- int ret = 0;
const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
struct writequeue_entry *e;
- int len, offset;
+ int len, offset, ret;
int count = 0;
mutex_lock(&con->sock_mutex);
@@ -1658,7 +1399,8 @@ static void send_to_sock(struct connection *con)
spin_lock(&con->writequeue_lock);
for (;;) {
- if (list_empty(&con->writequeue))
+ e = con_next_wq(con);
+ if (!e)
break;
e = list_first_entry(&con->writequeue, struct writequeue_entry, list);
@@ -1667,25 +1409,22 @@ static void send_to_sock(struct connection *con)
BUG_ON(len == 0 && e->users == 0);
spin_unlock(&con->writequeue_lock);
- ret = 0;
- if (len) {
- ret = kernel_sendpage(con->sock, e->page, offset, len,
- msg_flags);
- if (ret == -EAGAIN || ret == 0) {
- if (ret == -EAGAIN &&
- test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) &&
- !test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
- /* Notify TCP that we're limited by the
- * application window size.
- */
- set_bit(SOCK_NOSPACE, &con->sock->flags);
- con->sock->sk->sk_write_pending++;
- }
- cond_resched();
- goto out;
- } else if (ret < 0)
- goto out;
- }
+ ret = kernel_sendpage(con->sock, e->page, offset, len,
+ msg_flags);
+ if (ret == -EAGAIN || ret == 0) {
+ if (ret == -EAGAIN &&
+ test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) &&
+ !test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
+ /* Notify TCP that we're limited by the
+ * application window size.
+ */
+ set_bit(SOCK_NOSPACE, &con->sock->flags);
+ con->sock->sk->sk_write_pending++;
+ }
+ cond_resched();
+ goto out;
+ } else if (ret < 0)
+ goto out;
/* Don't starve people filling buffers */
if (++count >= MAX_SEND_MSG_COUNT) {
@@ -1770,12 +1509,9 @@ int dlm_lowcomms_close(int nodeid)
static void process_recv_sockets(struct work_struct *work)
{
struct connection *con = container_of(work, struct connection, rwork);
- int err;
clear_bit(CF_READ_PENDING, &con->flags);
- do {
- err = receive_from_sock(con);
- } while (!err);
+ receive_from_sock(con);
}
static void process_listen_recv_socket(struct work_struct *work)
@@ -1783,6 +1519,74 @@ static void process_listen_recv_socket(struct work_struct *work)
accept_from_sock(&listen_con);
}
+static void dlm_connect(struct connection *con)
+{
+ struct sockaddr_storage addr;
+ int result, addr_len;
+ struct socket *sock;
+ unsigned int mark;
+
+ /* Some odd races can cause double-connects, ignore them */
+ if (con->retries++ > MAX_CONNECT_RETRIES)
+ return;
+
+ if (con->sock) {
+ log_print("node %d already connected.", con->nodeid);
+ return;
+ }
+
+ memset(&addr, 0, sizeof(addr));
+ result = nodeid_to_addr(con->nodeid, &addr, NULL,
+ dlm_proto_ops->try_new_addr, &mark);
+ if (result < 0) {
+ log_print("no address for nodeid %d", con->nodeid);
+ return;
+ }
+
+ /* Create a socket to communicate with */
+ result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
+ SOCK_STREAM, dlm_proto_ops->proto, &sock);
+ if (result < 0)
+ goto socket_err;
+
+ sock_set_mark(sock->sk, mark);
+ dlm_proto_ops->sockopts(sock);
+
+ add_sock(sock, con);
+
+ result = dlm_proto_ops->bind(sock);
+ if (result < 0)
+ goto add_sock_err;
+
+ log_print_ratelimited("connecting to %d", con->nodeid);
+ make_sockaddr(&addr, dlm_config.ci_tcp_port, &addr_len);
+ result = dlm_proto_ops->connect(con, sock, (struct sockaddr *)&addr,
+ addr_len);
+ if (result < 0)
+ goto add_sock_err;
+
+ return;
+
+add_sock_err:
+ dlm_close_sock(&con->sock);
+
+socket_err:
+ /*
+ * Some errors are fatal and this list might need adjusting. For other
+ * errors we try again until the max number of retries is reached.
+ */
+ if (result != -EHOSTUNREACH &&
+ result != -ENETUNREACH &&
+ result != -ENETDOWN &&
+ result != -EINVAL &&
+ result != -EPROTONOSUPPORT) {
+ log_print("connect %d try %d error %d", con->nodeid,
+ con->retries, result);
+ msleep(1000);
+ lowcomms_connect_sock(con);
+ }
+}
+
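
For reference, dlm_connect() above only gives up on the errno values listed in the hunk and retries everything else after a one-second delay. A small, self-contained restatement of that classification (the list mirrors the hunk and, as the kernel comment says, may need adjusting):

	#include <errno.h>
	#include <stdbool.h>
	#include <stdio.h>

	/* Errors dlm_connect() treats as fatal; anything else is retried later. */
	static bool connect_error_is_fatal(int err)
	{
		switch (err) {
		case EHOSTUNREACH:
		case ENETUNREACH:
		case ENETDOWN:
		case EINVAL:
		case EPROTONOSUPPORT:
			return true;
		default:
			return false;
		}
	}

	int main(void)
	{
		printf("ECONNREFUSED fatal? %d\n", connect_error_is_fatal(ECONNREFUSED));
		printf("ENETDOWN fatal?     %d\n", connect_error_is_fatal(ENETDOWN));
		return 0;
	}
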
/* Send workqueue function */
static void process_send_sockets(struct work_struct *work)
{
@@ -1797,11 +1601,15 @@ static void process_send_sockets(struct work_struct *work)
dlm_midcomms_unack_msg_resend(con->nodeid);
}
- if (con->sock == NULL) { /* not mutex protected so check it inside too */
+ if (con->sock == NULL) {
if (test_and_clear_bit(CF_DELAY_CONNECT, &con->flags))
msleep(1000);
- con->connect_action(con);
+
+ mutex_lock(&con->sock_mutex);
+ dlm_connect(con);
+ mutex_unlock(&con->sock_mutex);
}
+
if (!list_empty(&con->writequeue))
send_to_sock(con);
}
@@ -1840,8 +1648,8 @@ static int work_start(void)
static void shutdown_conn(struct connection *con)
{
- if (con->shutdown_action)
- con->shutdown_action(con);
+ if (dlm_proto_ops->shutdown_action)
+ dlm_proto_ops->shutdown_action(con);
}
void dlm_lowcomms_shutdown(void)
@@ -1948,8 +1756,198 @@ void dlm_lowcomms_stop(void)
srcu_read_unlock(&connections_srcu, idx);
work_stop();
deinit_local();
+
+ dlm_proto_ops = NULL;
}
+static int dlm_listen_for_all(void)
+{
+ struct socket *sock;
+ int result;
+
+ log_print("Using %s for communications",
+ dlm_proto_ops->name);
+
+ result = dlm_proto_ops->listen_validate();
+ if (result < 0)
+ return result;
+
+ result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
+ SOCK_STREAM, dlm_proto_ops->proto, &sock);
+ if (result < 0) {
+ log_print("Can't create comms socket, check SCTP is loaded");
+ goto out;
+ }
+
+ sock_set_mark(sock->sk, dlm_config.ci_mark);
+ dlm_proto_ops->listen_sockopts(sock);
+
+ result = dlm_proto_ops->listen_bind(sock);
+ if (result < 0)
+ goto out;
+
+ save_listen_callbacks(sock);
+ add_listen_sock(sock, &listen_con);
+
+ INIT_WORK(&listen_con.rwork, process_listen_recv_socket);
+ result = sock->ops->listen(sock, 5);
+ if (result < 0) {
+ dlm_close_sock(&listen_con.sock);
+ goto out;
+ }
+
+ return 0;
+
+out:
+ sock_release(sock);
+ return result;
+}
+
+static int dlm_tcp_bind(struct socket *sock)
+{
+ struct sockaddr_storage src_addr;
+ int result, addr_len;
+
+	/* Bind to our cluster-known address when connecting to avoid
+ * routing problems.
+ */
+ memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr));
+ make_sockaddr(&src_addr, 0, &addr_len);
+
+ result = sock->ops->bind(sock, (struct sockaddr *)&src_addr,
+ addr_len);
+ if (result < 0) {
+ /* This *may* not indicate a critical error */
+ log_print("could not bind for connect: %d", result);
+ }
+
+ return 0;
+}
+
+static int dlm_tcp_connect(struct connection *con, struct socket *sock,
+ struct sockaddr *addr, int addr_len)
+{
+ int ret;
+
+ ret = sock->ops->connect(sock, addr, addr_len, O_NONBLOCK);
+ switch (ret) {
+ case -EINPROGRESS:
+ fallthrough;
+ case 0:
+ return 0;
+ }
+
+ return ret;
+}
+
+static int dlm_tcp_listen_validate(void)
+{
+ /* We don't support multi-homed hosts */
+ if (dlm_local_count > 1) {
+ log_print("TCP protocol can't handle multi-homed hosts, try SCTP");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void dlm_tcp_sockopts(struct socket *sock)
+{
+ /* Turn off Nagle's algorithm */
+ tcp_sock_set_nodelay(sock->sk);
+}
+
+static void dlm_tcp_listen_sockopts(struct socket *sock)
+{
+ dlm_tcp_sockopts(sock);
+ sock_set_reuseaddr(sock->sk);
+}
+
+static int dlm_tcp_listen_bind(struct socket *sock)
+{
+ int addr_len;
+
+ /* Bind to our port */
+ make_sockaddr(dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len);
+ return sock->ops->bind(sock, (struct sockaddr *)dlm_local_addr[0],
+ addr_len);
+}
+
+static const struct dlm_proto_ops dlm_tcp_ops = {
+ .name = "TCP",
+ .proto = IPPROTO_TCP,
+ .connect = dlm_tcp_connect,
+ .sockopts = dlm_tcp_sockopts,
+ .bind = dlm_tcp_bind,
+ .listen_validate = dlm_tcp_listen_validate,
+ .listen_sockopts = dlm_tcp_listen_sockopts,
+ .listen_bind = dlm_tcp_listen_bind,
+ .shutdown_action = dlm_tcp_shutdown,
+ .eof_condition = tcp_eof_condition,
+};
+
+static int dlm_sctp_bind(struct socket *sock)
+{
+ return sctp_bind_addrs(sock, 0);
+}
+
+static int dlm_sctp_connect(struct connection *con, struct socket *sock,
+ struct sockaddr *addr, int addr_len)
+{
+ int ret;
+
+ /*
+	 * Make the sock->ops->connect() call return within the specified time.
+	 * Since the O_NONBLOCK argument to connect() does not work here,
+	 * we set a send timeout and restore the default value afterwards.
+ */
+ sock_set_sndtimeo(sock->sk, 5);
+ ret = sock->ops->connect(sock, addr, addr_len, 0);
+ sock_set_sndtimeo(sock->sk, 0);
+ if (ret < 0)
+ return ret;
+
+ if (!test_and_set_bit(CF_CONNECTED, &con->flags))
+ log_print("successful connected to node %d", con->nodeid);
+
+ return 0;
+}
+
+static int dlm_sctp_listen_validate(void)
+{
+ if (!IS_ENABLED(CONFIG_IP_SCTP)) {
+ log_print("SCTP is not enabled by this kernel");
+ return -EOPNOTSUPP;
+ }
+
+ request_module("sctp");
+ return 0;
+}
+
+static int dlm_sctp_bind_listen(struct socket *sock)
+{
+ return sctp_bind_addrs(sock, dlm_config.ci_tcp_port);
+}
+
+static void dlm_sctp_sockopts(struct socket *sock)
+{
+ /* Turn off Nagle's algorithm */
+ sctp_sock_set_nodelay(sock->sk);
+ sock_set_rcvbuf(sock->sk, NEEDED_RMEM);
+}
+
+static const struct dlm_proto_ops dlm_sctp_ops = {
+ .name = "SCTP",
+ .proto = IPPROTO_SCTP,
+ .try_new_addr = true,
+ .connect = dlm_sctp_connect,
+ .sockopts = dlm_sctp_sockopts,
+ .bind = dlm_sctp_bind,
+ .listen_validate = dlm_sctp_listen_validate,
+ .listen_sockopts = dlm_sctp_sockopts,
+ .listen_bind = dlm_sctp_bind_listen,
+};
+
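
The dlm_tcp_ops/dlm_sctp_ops tables let the rest of lowcomms call through one protocol-neutral pointer, dlm_proto_ops, chosen once in dlm_lowcomms_start(). A minimal userspace sketch of the same dispatch idea (invented names, not the DLM structures):

	#include <stdio.h>

	struct proto_ops {
		const char *name;
		int (*listen_validate)(void);
		int (*bind)(int fd);
	};

	static int tcp_validate(void) { return 0; }
	static int tcp_bind(int fd) { (void)fd; return 0; }

	static const struct proto_ops tcp_ops = {
		.name = "TCP",
		.listen_validate = tcp_validate,
		.bind = tcp_bind,
	};

	static const struct proto_ops *proto;	/* chosen once at start-up */

	int main(void)
	{
		proto = &tcp_ops;
		if (proto->listen_validate() == 0)
			printf("using %s\n", proto->name);
		return 0;
	}
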
int dlm_lowcomms_start(void)
{
int error = -EINVAL;
@@ -1976,23 +1974,27 @@ int dlm_lowcomms_start(void)
/* Start listening */
switch (dlm_config.ci_protocol) {
case DLM_PROTO_TCP:
- error = tcp_listen_for_all();
+ dlm_proto_ops = &dlm_tcp_ops;
break;
case DLM_PROTO_SCTP:
- error = sctp_listen_for_all(&listen_con);
+ dlm_proto_ops = &dlm_sctp_ops;
break;
default:
log_print("Invalid protocol identifier %d set",
dlm_config.ci_protocol);
error = -EINVAL;
- break;
+ goto fail_proto_ops;
}
+
+ error = dlm_listen_for_all();
if (error)
- goto fail_unlisten;
+ goto fail_listen;
return 0;
-fail_unlisten:
+fail_listen:
+ dlm_proto_ops = NULL;
+fail_proto_ops:
dlm_allow_conn = 0;
dlm_close_sock(&listen_con.sock);
work_stop();
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
index aaae7115c00d..4ccae07cf005 100644
--- a/fs/dlm/lowcomms.h
+++ b/fs/dlm/lowcomms.h
@@ -46,6 +46,7 @@ int dlm_lowcomms_resend_msg(struct dlm_msg *msg);
int dlm_lowcomms_connect_node(int nodeid);
int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark);
int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len);
+void dlm_midcomms_receive_done(int nodeid);
#endif /* __LOWCOMMS_DOT_H__ */
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index d9e1e4170eb1..731d489aa323 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -443,8 +443,10 @@ static int ping_members(struct dlm_ls *ls)
list_for_each_entry(memb, &ls->ls_nodes, list) {
error = dlm_recovery_stopped(ls);
- if (error)
+ if (error) {
+ error = -EINTR;
break;
+ }
error = dlm_rcom_status(ls, memb->nodeid, 0);
if (error)
break;
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index e3de268898ed..7ae39ec8d9b0 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -109,12 +109,6 @@
 * compatibility. There exist better ways to handle this.
* However this should be changed in the next major version bump of dlm.
*
- * Ack handling:
- *
- * Currently we send an ack message for every dlm message. However we
- * can ack multiple dlm messages with one ack by just delaying the ack
- * message. Will reduce some traffic but makes the drop detection slower.
- *
* Tail Size checking:
*
* There exists a message tail payload in e.g. DLM_MSG however we don't
@@ -169,6 +163,7 @@ struct midcomms_node {
#define DLM_NODE_FLAG_CLOSE 1
#define DLM_NODE_FLAG_STOP_TX 2
#define DLM_NODE_FLAG_STOP_RX 3
+#define DLM_NODE_ULP_DELIVERED 4
unsigned long flags;
wait_queue_head_t shutdown_wait;
@@ -480,11 +475,12 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p,
{
if (seq == node->seq_next) {
node->seq_next++;
- /* send ack before fin */
- dlm_send_ack(node->nodeid, node->seq_next);
switch (p->header.h_cmd) {
case DLM_FIN:
+ /* send ack before fin */
+ dlm_send_ack(node->nodeid, node->seq_next);
+
spin_lock(&node->state_lock);
pr_debug("receive fin msg from node %d with state %s\n",
node->nodeid, dlm_state_str(node->state));
@@ -534,6 +530,7 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p,
default:
WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags));
dlm_receive_buffer(p, node->nodeid);
+ set_bit(DLM_NODE_ULP_DELIVERED, &node->flags);
break;
}
} else {
@@ -933,6 +930,49 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
return ret;
}
+void dlm_midcomms_receive_done(int nodeid)
+{
+ struct midcomms_node *node;
+ int idx;
+
+ idx = srcu_read_lock(&nodes_srcu);
+ node = nodeid2node(nodeid, 0);
+ if (!node) {
+ srcu_read_unlock(&nodes_srcu, idx);
+ return;
+ }
+
+ /* old protocol, we do nothing */
+ switch (node->version) {
+ case DLM_VERSION_3_2:
+ break;
+ default:
+ srcu_read_unlock(&nodes_srcu, idx);
+ return;
+ }
+
+	/* do nothing if we didn't deliver anything stateful to the ulp */
+ if (!test_and_clear_bit(DLM_NODE_ULP_DELIVERED,
+ &node->flags)) {
+ srcu_read_unlock(&nodes_srcu, idx);
+ return;
+ }
+
+ spin_lock(&node->state_lock);
+ /* we only ack if state is ESTABLISHED */
+ switch (node->state) {
+ case DLM_ESTABLISHED:
+ spin_unlock(&node->state_lock);
+ dlm_send_ack(node->nodeid, node->seq_next);
+ break;
+ default:
+ spin_unlock(&node->state_lock);
+		/* do nothing, FIN sends its own ack */
+ break;
+ };
+ srcu_read_unlock(&nodes_srcu, idx);
+}
+
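
dlm_midcomms_receive_done() turns the old per-message ack into a delayed ack: it only fires once something has actually been handed to the upper layer and the node is still ESTABLISHED. A rough sketch of that "ack only after delivery" pattern, using invented names:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct node {
		atomic_bool ulp_delivered;	/* set once a message reached the ULP */
		unsigned int seq_next;
		bool established;
	};

	static void send_ack(struct node *n)
	{
		printf("ack seq %u\n", n->seq_next);
	}

	/* Called after a receive pass; one ack may cover many delivered messages. */
	static void receive_done(struct node *n)
	{
		if (!atomic_exchange(&n->ulp_delivered, false))
			return;			/* nothing new was delivered */
		if (n->established)
			send_ack(n);		/* the FIN path sends its own ack */
	}

	int main(void)
	{
		struct node n = { .seq_next = 42, .established = true };

		atomic_store(&n.ulp_delivered, true);
		receive_done(&n);		/* acks */
		receive_done(&n);		/* no-op: nothing delivered since */
		return 0;
	}
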
void dlm_midcomms_unack_msg_resend(int nodeid)
{
struct midcomms_node *node;
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 5651933f54a4..6cba86470278 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -89,22 +89,15 @@ static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type,
return 0;
}
-static void _send_rcom(struct dlm_ls *ls, struct dlm_rcom *rc)
+static void send_rcom(struct dlm_mhandle *mh, struct dlm_rcom *rc)
{
dlm_rcom_out(rc);
-}
-
-static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
- struct dlm_rcom *rc)
-{
- _send_rcom(ls, rc);
dlm_midcomms_commit_mhandle(mh);
}
-static void send_rcom_stateless(struct dlm_ls *ls, struct dlm_msg *msg,
- struct dlm_rcom *rc)
+static void send_rcom_stateless(struct dlm_msg *msg, struct dlm_rcom *rc)
{
- _send_rcom(ls, rc);
+ dlm_rcom_out(rc);
dlm_lowcomms_commit_msg(msg);
dlm_lowcomms_put_msg(msg);
}
@@ -204,7 +197,7 @@ retry:
allow_sync_reply(ls, &rc->rc_id);
memset(ls->ls_recover_buf, 0, DLM_MAX_SOCKET_BUFSIZE);
- send_rcom_stateless(ls, msg, rc);
+ send_rcom_stateless(msg, rc);
error = dlm_wait_function(ls, &rcom_response);
disallow_sync_reply(ls);
@@ -287,7 +280,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
spin_unlock(&ls->ls_recover_lock);
do_send:
- send_rcom_stateless(ls, msg, rc);
+ send_rcom_stateless(msg, rc);
}
static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
@@ -327,7 +320,7 @@ retry:
allow_sync_reply(ls, &rc->rc_id);
memset(ls->ls_recover_buf, 0, DLM_MAX_SOCKET_BUFSIZE);
- send_rcom_stateless(ls, msg, rc);
+ send_rcom_stateless(msg, rc);
error = dlm_wait_function(ls, &rcom_response);
disallow_sync_reply(ls);
@@ -356,7 +349,7 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
nodeid);
- send_rcom_stateless(ls, msg, rc);
+ send_rcom_stateless(msg, rc);
}
int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
@@ -373,7 +366,7 @@ int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
memcpy(rc->rc_buf, r->res_name, r->res_length);
rc->rc_id = (unsigned long) r->res_id;
- send_rcom(ls, mh, rc);
+ send_rcom(mh, rc);
out:
return error;
}
@@ -404,7 +397,7 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
rc->rc_id = rc_in->rc_id;
rc->rc_seq_reply = rc_in->rc_seq;
- send_rcom(ls, mh, rc);
+ send_rcom(mh, rc);
}
static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
@@ -461,7 +454,7 @@ int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
pack_rcom_lock(r, lkb, rl);
rc->rc_id = (unsigned long) r;
- send_rcom(ls, mh, rc);
+ send_rcom(mh, rc);
out:
return error;
}
@@ -487,7 +480,7 @@ static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
rc->rc_id = rc_in->rc_id;
rc->rc_seq_reply = rc_in->rc_seq;
- send_rcom(ls, mh, rc);
+ send_rcom(mh, rc);
}
/* If the lockspace doesn't exist then still send a status message
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 85e245392715..97d052cea5a9 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -125,8 +125,10 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
dlm_recover_waiters_pre(ls);
error = dlm_recovery_stopped(ls);
- if (error)
+ if (error) {
+ error = -EINTR;
goto fail;
+ }
if (neg || dlm_no_directory(ls)) {
/*
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index f00fcc4a4f72..e619c31b6bd9 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -3,6 +3,7 @@
* Implement the manual drop-all-pagecache function
*/
+#include <linux/pagemap.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/fs.h>
@@ -27,7 +28,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
* we need to reschedule to avoid softlockups.
*/
if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
- (inode->i_mapping->nrpages == 0 && !need_resched())) {
+ (mapping_empty(inode->i_mapping) && !need_resched())) {
spin_unlock(&inode->i_lock);
continue;
}
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 906af0c1998c..14b747026742 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -3,6 +3,7 @@
config EROFS_FS
tristate "EROFS filesystem support"
depends on BLOCK
+ select FS_IOMAP
select LIBCRC32C
help
EROFS (Enhanced Read-Only File System) is a lightweight
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 3787a5fb0a42..9db829715652 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -2,35 +2,13 @@
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
+ * Copyright (C) 2021, Alibaba Cloud
*/
#include "internal.h"
#include <linux/prefetch.h>
-
+#include <linux/dax.h>
#include <trace/events/erofs.h>
-static void erofs_readendio(struct bio *bio)
-{
- struct bio_vec *bvec;
- blk_status_t err = bio->bi_status;
- struct bvec_iter_all iter_all;
-
- bio_for_each_segment_all(bvec, bio, iter_all) {
- struct page *page = bvec->bv_page;
-
- /* page is already locked */
- DBG_BUGON(PageUptodate(page));
-
- if (err)
- SetPageError(page);
- else
- SetPageUptodate(page);
-
- unlock_page(page);
- /* page could be reclaimed now */
- }
- bio_put(bio);
-}
-
struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr)
{
struct address_space *const mapping = sb->s_bdev->bd_inode->i_mapping;
@@ -59,13 +37,6 @@ static int erofs_map_blocks_flatmode(struct inode *inode,
nblocks = DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
lastblk = nblocks - tailendpacking;
- if (offset >= inode->i_size) {
- /* leave out-of-bound access unmapped */
- map->m_flags = 0;
- map->m_plen = 0;
- goto out;
- }
-
/* there is no hole in flatmode */
map->m_flags = EROFS_MAP_MAPPED;
@@ -100,217 +71,273 @@ static int erofs_map_blocks_flatmode(struct inode *inode,
goto err_out;
}
-out:
map->m_llen = map->m_plen;
-
err_out:
trace_erofs_map_blocks_flatmode_exit(inode, map, flags, 0);
return err;
}
-static inline struct bio *erofs_read_raw_page(struct bio *bio,
- struct address_space *mapping,
- struct page *page,
- erofs_off_t *last_block,
- unsigned int nblocks,
- unsigned int *eblks,
- bool ra)
+static int erofs_map_blocks(struct inode *inode,
+ struct erofs_map_blocks *map, int flags)
{
- struct inode *const inode = mapping->host;
- struct super_block *const sb = inode->i_sb;
- erofs_off_t current_block = (erofs_off_t)page->index;
- int err;
-
- DBG_BUGON(!nblocks);
-
- if (PageUptodate(page)) {
- err = 0;
- goto has_updated;
- }
+ struct super_block *sb = inode->i_sb;
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct erofs_inode_chunk_index *idx;
+ struct page *page;
+ u64 chunknr;
+ unsigned int unit;
+ erofs_off_t pos;
+ int err = 0;
- /* note that for readpage case, bio also equals to NULL */
- if (bio &&
- (*last_block + 1 != current_block || !*eblks)) {
-submit_bio_retry:
- submit_bio(bio);
- bio = NULL;
+ if (map->m_la >= inode->i_size) {
+ /* leave out-of-bound access unmapped */
+ map->m_flags = 0;
+ map->m_plen = 0;
+ goto out;
}
- if (!bio) {
- struct erofs_map_blocks map = {
- .m_la = blknr_to_addr(current_block),
- };
- erofs_blk_t blknr;
- unsigned int blkoff;
-
- err = erofs_map_blocks_flatmode(inode, &map, EROFS_GET_BLOCKS_RAW);
- if (err)
- goto err_out;
-
- /* zero out the holed page */
- if (!(map.m_flags & EROFS_MAP_MAPPED)) {
- zero_user_segment(page, 0, PAGE_SIZE);
- SetPageUptodate(page);
-
- /* imply err = 0, see erofs_map_blocks */
- goto has_updated;
- }
-
- /* for RAW access mode, m_plen must be equal to m_llen */
- DBG_BUGON(map.m_plen != map.m_llen);
-
- blknr = erofs_blknr(map.m_pa);
- blkoff = erofs_blkoff(map.m_pa);
-
- /* deal with inline page */
- if (map.m_flags & EROFS_MAP_META) {
- void *vsrc, *vto;
- struct page *ipage;
+ if (vi->datalayout != EROFS_INODE_CHUNK_BASED)
+ return erofs_map_blocks_flatmode(inode, map, flags);
- DBG_BUGON(map.m_plen > PAGE_SIZE);
+ if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)
+ unit = sizeof(*idx); /* chunk index */
+ else
+ unit = EROFS_BLOCK_MAP_ENTRY_SIZE; /* block map */
- ipage = erofs_get_meta_page(inode->i_sb, blknr);
+ chunknr = map->m_la >> vi->chunkbits;
+ pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize +
+ vi->xattr_isize, unit) + unit * chunknr;
- if (IS_ERR(ipage)) {
- err = PTR_ERR(ipage);
- goto err_out;
- }
+ page = erofs_get_meta_page(inode->i_sb, erofs_blknr(pos));
+ if (IS_ERR(page))
+ return PTR_ERR(page);
- vsrc = kmap_atomic(ipage);
- vto = kmap_atomic(page);
- memcpy(vto, vsrc + blkoff, map.m_plen);
- memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen);
- kunmap_atomic(vto);
- kunmap_atomic(vsrc);
- flush_dcache_page(page);
+ map->m_la = chunknr << vi->chunkbits;
+ map->m_plen = min_t(erofs_off_t, 1UL << vi->chunkbits,
+ roundup(inode->i_size - map->m_la, EROFS_BLKSIZ));
- SetPageUptodate(page);
- /* TODO: could we unlock the page earlier? */
- unlock_page(ipage);
- put_page(ipage);
+ /* handle block map */
+ if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) {
+ __le32 *blkaddr = page_address(page) + erofs_blkoff(pos);
- /* imply err = 0, see erofs_map_blocks */
- goto has_updated;
+ if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) {
+ map->m_flags = 0;
+ } else {
+ map->m_pa = blknr_to_addr(le32_to_cpu(*blkaddr));
+ map->m_flags = EROFS_MAP_MAPPED;
}
+ goto out_unlock;
+ }
+ /* parse chunk indexes */
+ idx = page_address(page) + erofs_blkoff(pos);
+ switch (le32_to_cpu(idx->blkaddr)) {
+ case EROFS_NULL_ADDR:
+ map->m_flags = 0;
+ break;
+ default:
+ /* only one device is supported for now */
+ if (idx->device_id) {
+ erofs_err(sb, "invalid device id %u @ %llu for nid %llu",
+ le16_to_cpu(idx->device_id),
+ chunknr, vi->nid);
+ err = -EFSCORRUPTED;
+ goto out_unlock;
+ }
+ map->m_pa = blknr_to_addr(le32_to_cpu(idx->blkaddr));
+ map->m_flags = EROFS_MAP_MAPPED;
+ break;
+ }
+out_unlock:
+ unlock_page(page);
+ put_page(page);
+out:
+ map->m_llen = map->m_plen;
+ return err;
+}
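
In the chunk-based path above, the on-disk entry for a chunk sits at the inode's metadata end, rounded up to the entry size, plus chunknr entries. A standalone arithmetic sketch (made-up inode offsets, 4-byte block-map entries, 1 MiB chunks):

	#include <stdint.h>
	#include <stdio.h>

	#define ALIGN_UP(x, a)	(((x) + (a) - 1) / (a) * (a))

	int main(void)
	{
		uint64_t iloc = 4096;		/* hypothetical inode location */
		unsigned int inode_isize = 64, xattr_isize = 12;
		unsigned int unit = 4;		/* 4-byte block map entry */
		unsigned int chunkbits = 20;	/* 4 KiB blocks + 8 extra bits = 1 MiB */
		uint64_t m_la = 5u << 20;	/* logical offset 5 MiB */

		uint64_t chunknr = m_la >> chunkbits;
		uint64_t pos = ALIGN_UP(iloc + inode_isize + xattr_isize, unit)
			       + unit * chunknr;

		printf("chunk %llu entry at byte %llu\n",
		       (unsigned long long)chunknr, (unsigned long long)pos);
		return 0;
	}
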
- /* pa must be block-aligned for raw reading */
- DBG_BUGON(erofs_blkoff(map.m_pa));
+static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
+{
+ int ret;
+ struct erofs_map_blocks map;
+
+ map.m_la = offset;
+ map.m_llen = length;
+
+ ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
+ if (ret < 0)
+ return ret;
+
+ iomap->bdev = inode->i_sb->s_bdev;
+ iomap->dax_dev = EROFS_I_SB(inode)->dax_dev;
+ iomap->offset = map.m_la;
+ iomap->length = map.m_llen;
+ iomap->flags = 0;
+ iomap->private = NULL;
+
+ if (!(map.m_flags & EROFS_MAP_MAPPED)) {
+ iomap->type = IOMAP_HOLE;
+ iomap->addr = IOMAP_NULL_ADDR;
+ if (!iomap->length)
+ iomap->length = length;
+ return 0;
+ }
- /* max # of continuous pages */
- if (nblocks > DIV_ROUND_UP(map.m_plen, PAGE_SIZE))
- nblocks = DIV_ROUND_UP(map.m_plen, PAGE_SIZE);
+ if (map.m_flags & EROFS_MAP_META) {
+ struct page *ipage;
+
+ iomap->type = IOMAP_INLINE;
+ ipage = erofs_get_meta_page(inode->i_sb,
+ erofs_blknr(map.m_pa));
+ if (IS_ERR(ipage))
+ return PTR_ERR(ipage);
+ iomap->inline_data = page_address(ipage) +
+ erofs_blkoff(map.m_pa);
+ iomap->private = ipage;
+ } else {
+ iomap->type = IOMAP_MAPPED;
+ iomap->addr = map.m_pa;
+ }
+ return 0;
+}
- *eblks = bio_max_segs(nblocks);
- bio = bio_alloc(GFP_NOIO, *eblks);
+static int erofs_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+ ssize_t written, unsigned int flags, struct iomap *iomap)
+{
+ struct page *ipage = iomap->private;
- bio->bi_end_io = erofs_readendio;
- bio_set_dev(bio, sb->s_bdev);
- bio->bi_iter.bi_sector = (sector_t)blknr <<
- LOG_SECTORS_PER_BLOCK;
- bio->bi_opf = REQ_OP_READ | (ra ? REQ_RAHEAD : 0);
+ if (ipage) {
+ DBG_BUGON(iomap->type != IOMAP_INLINE);
+ unlock_page(ipage);
+ put_page(ipage);
+ } else {
+ DBG_BUGON(iomap->type == IOMAP_INLINE);
}
+ return written;
+}
- err = bio_add_page(bio, page, PAGE_SIZE, 0);
- /* out of the extent or bio is full */
- if (err < PAGE_SIZE)
- goto submit_bio_retry;
- --*eblks;
- *last_block = current_block;
- return bio;
+static const struct iomap_ops erofs_iomap_ops = {
+ .iomap_begin = erofs_iomap_begin,
+ .iomap_end = erofs_iomap_end,
+};
-err_out:
- /* for sync reading, set page error immediately */
- if (!ra) {
- SetPageError(page);
- ClearPageUptodate(page);
+int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
+{
+ if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) {
+#ifdef CONFIG_EROFS_FS_ZIP
+ return iomap_fiemap(inode, fieinfo, start, len,
+ &z_erofs_iomap_report_ops);
+#else
+ return -EOPNOTSUPP;
+#endif
}
-has_updated:
- unlock_page(page);
-
- /* if updated manually, continuous pages has a gap */
- if (bio)
- submit_bio(bio);
- return err ? ERR_PTR(err) : NULL;
+ return iomap_fiemap(inode, fieinfo, start, len, &erofs_iomap_ops);
}
/*
 * since we don't have write or truncate flows, no inode
* locking needs to be held at the moment.
*/
-static int erofs_raw_access_readpage(struct file *file, struct page *page)
+static int erofs_readpage(struct file *file, struct page *page)
{
- erofs_off_t last_block;
- unsigned int eblks;
- struct bio *bio;
-
- trace_erofs_readpage(page, true);
+ return iomap_readpage(page, &erofs_iomap_ops);
+}
- bio = erofs_read_raw_page(NULL, page->mapping,
- page, &last_block, 1, &eblks, false);
+static void erofs_readahead(struct readahead_control *rac)
+{
+ return iomap_readahead(rac, &erofs_iomap_ops);
+}
- if (IS_ERR(bio))
- return PTR_ERR(bio);
+static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
+{
+ return iomap_bmap(mapping, block, &erofs_iomap_ops);
+}
- if (bio)
- submit_bio(bio);
+static int erofs_prepare_dio(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ loff_t align = iocb->ki_pos | iov_iter_count(to) |
+ iov_iter_alignment(to);
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ unsigned int blksize_mask;
+
+ if (bdev)
+ blksize_mask = (1 << ilog2(bdev_logical_block_size(bdev))) - 1;
+ else
+ blksize_mask = (1 << inode->i_blkbits) - 1;
+
+ if (align & blksize_mask)
+ return -EINVAL;
return 0;
}
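
erofs_prepare_dio() ORs the file position, the byte count and the user-buffer alignment together so one mask test covers all three. A tiny standalone version of that check (blksize assumed to be a power of two):

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Returns true when pos, count and buffer address are all block-aligned. */
	static bool dio_aligned(uint64_t pos, uint64_t count, uint64_t bufaddr,
				unsigned int blksize)
	{
		uint64_t mask = blksize - 1;

		return ((pos | count | bufaddr) & mask) == 0;
	}

	int main(void)
	{
		printf("%d\n", dio_aligned(4096, 8192, 0x1000, 512));	/* 1: aligned */
		printf("%d\n", dio_aligned(4095, 8192, 0x1000, 512));	/* 0: pos unaligned */
		return 0;
	}
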
-static void erofs_raw_access_readahead(struct readahead_control *rac)
+static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
- erofs_off_t last_block;
- unsigned int eblks;
- struct bio *bio = NULL;
- struct page *page;
-
- trace_erofs_readpages(rac->mapping->host, readahead_index(rac),
- readahead_count(rac), true);
-
- while ((page = readahead_page(rac))) {
- prefetchw(&page->flags);
-
- bio = erofs_read_raw_page(bio, rac->mapping, page, &last_block,
- readahead_count(rac), &eblks, true);
-
- /* all the page errors are ignored when readahead */
- if (IS_ERR(bio)) {
- pr_err("%s, readahead error at page %lu of nid %llu\n",
- __func__, page->index,
- EROFS_I(rac->mapping->host)->nid);
-
- bio = NULL;
- }
-
- put_page(page);
+	/* no need to take the (shared) inode lock since it's a ro filesystem */
+ if (!iov_iter_count(to))
+ return 0;
+
+#ifdef CONFIG_FS_DAX
+ if (IS_DAX(iocb->ki_filp->f_mapping->host))
+ return dax_iomap_rw(iocb, to, &erofs_iomap_ops);
+#endif
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ int err = erofs_prepare_dio(iocb, to);
+
+ if (!err)
+ return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
+ NULL, 0);
+ if (err < 0)
+ return err;
}
+ return filemap_read(iocb, to, 0);
+}
+
+/* for uncompressed (aligned) files and raw access for other files */
+const struct address_space_operations erofs_raw_access_aops = {
+ .readpage = erofs_readpage,
+ .readahead = erofs_readahead,
+ .bmap = erofs_bmap,
+ .direct_IO = noop_direct_IO,
+};
- if (bio)
- submit_bio(bio);
+#ifdef CONFIG_FS_DAX
+static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf,
+ enum page_entry_size pe_size)
+{
+ return dax_iomap_fault(vmf, pe_size, NULL, NULL, &erofs_iomap_ops);
}
-static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
+static vm_fault_t erofs_dax_fault(struct vm_fault *vmf)
{
- struct inode *inode = mapping->host;
- struct erofs_map_blocks map = {
- .m_la = blknr_to_addr(block),
- };
+ return erofs_dax_huge_fault(vmf, PE_SIZE_PTE);
+}
- if (EROFS_I(inode)->datalayout == EROFS_INODE_FLAT_INLINE) {
- erofs_blk_t blks = i_size_read(inode) >> LOG_BLOCK_SIZE;
+static const struct vm_operations_struct erofs_dax_vm_ops = {
+ .fault = erofs_dax_fault,
+ .huge_fault = erofs_dax_huge_fault,
+};
- if (block >> LOG_SECTORS_PER_BLOCK >= blks)
- return 0;
- }
+static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ if (!IS_DAX(file_inode(file)))
+ return generic_file_readonly_mmap(file, vma);
- if (!erofs_map_blocks_flatmode(inode, &map, EROFS_GET_BLOCKS_RAW))
- return erofs_blknr(map.m_pa);
+ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+ return -EINVAL;
+ vma->vm_ops = &erofs_dax_vm_ops;
+ vma->vm_flags |= VM_HUGEPAGE;
return 0;
}
-
-/* for uncompressed (aligned) files and raw access for other files */
-const struct address_space_operations erofs_raw_access_aops = {
- .readpage = erofs_raw_access_readpage,
- .readahead = erofs_raw_access_readahead,
- .bmap = erofs_bmap,
+#else
+#define erofs_file_mmap generic_file_readonly_mmap
+#endif
+
+const struct file_operations erofs_file_fops = {
+ .llseek = generic_file_llseek,
+ .read_iter = erofs_file_read_iter,
+ .mmap = erofs_file_mmap,
+ .splice_read = generic_file_splice_read,
};
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 0f8da74570b4..b0b23f41abc3 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -4,6 +4,7 @@
*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
+ * Copyright (C) 2021, Alibaba Cloud
*/
#ifndef __EROFS_FS_H
#define __EROFS_FS_H
@@ -19,10 +20,12 @@
#define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING 0x00000001
#define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002
#define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002
+#define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004
#define EROFS_ALL_FEATURE_INCOMPAT \
(EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \
EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
- EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER)
+ EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \
+ EROFS_FEATURE_INCOMPAT_CHUNKED_FILE)
#define EROFS_SB_EXTSLOT_SIZE 16
@@ -64,13 +67,16 @@ struct erofs_super_block {
* inode, [xattrs], last_inline_data, ... | ... | no-holed data
* 3 - inode compression D:
* inode, [xattrs], map_header, extents ... | ...
- * 4~7 - reserved
+ * 4 - inode chunk-based E:
+ * inode, [xattrs], chunk indexes ... | ...
+ * 5~7 - reserved
*/
enum {
EROFS_INODE_FLAT_PLAIN = 0,
EROFS_INODE_FLAT_COMPRESSION_LEGACY = 1,
EROFS_INODE_FLAT_INLINE = 2,
EROFS_INODE_FLAT_COMPRESSION = 3,
+ EROFS_INODE_CHUNK_BASED = 4,
EROFS_INODE_DATALAYOUT_MAX
};
@@ -90,6 +96,19 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode)
#define EROFS_I_ALL \
((1 << (EROFS_I_DATALAYOUT_BIT + EROFS_I_DATALAYOUT_BITS)) - 1)
+/* indicate chunk blkbits, thus 'chunksize = blocksize << chunk blkbits' */
+#define EROFS_CHUNK_FORMAT_BLKBITS_MASK 0x001F
+/* with chunk indexes or just a 4-byte blkaddr array */
+#define EROFS_CHUNK_FORMAT_INDEXES 0x0020
+
+#define EROFS_CHUNK_FORMAT_ALL \
+ (EROFS_CHUNK_FORMAT_BLKBITS_MASK | EROFS_CHUNK_FORMAT_INDEXES)
+
+struct erofs_inode_chunk_info {
+ __le16 format; /* chunk blkbits, etc. */
+ __le16 reserved;
+};
+
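
As the comments above note, the low bits of the chunk format are a shift over the block size and bit 5 selects chunk indexes versus a plain block map. A tiny decoding example with an illustrative value:

	#include <stdint.h>
	#include <stdio.h>

	#define CHUNK_FORMAT_BLKBITS_MASK	0x001F
	#define CHUNK_FORMAT_INDEXES		0x0020

	int main(void)
	{
		uint16_t format = 0x0028;	/* hypothetical: indexes + 8 extra bits */
		unsigned int chunkbits = 12 + (format & CHUNK_FORMAT_BLKBITS_MASK);

		printf("chunk size: %u bytes, %s\n", 1u << chunkbits,
		       (format & CHUNK_FORMAT_INDEXES) ? "chunk indexes" : "block map");
		return 0;
	}
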
/* 32-byte reduced form of an ondisk inode */
struct erofs_inode_compact {
__le16 i_format; /* inode format hints */
@@ -107,6 +126,9 @@ struct erofs_inode_compact {
/* for device files, used to indicate old/new device # */
__le32 rdev;
+
+ /* for chunk-based files, it contains the summary info */
+ struct erofs_inode_chunk_info c;
} i_u;
__le32 i_ino; /* only used for 32-bit stat compatibility */
__le16 i_uid;
@@ -135,6 +157,9 @@ struct erofs_inode_extended {
/* for device files, used to indicate old/new device # */
__le32 rdev;
+
+ /* for chunk-based files, it contains the summary info */
+ struct erofs_inode_chunk_info c;
} i_u;
/* only used for 32-bit stat compatibility */
@@ -204,6 +229,19 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e)
e->e_name_len + le16_to_cpu(e->e_value_size));
}
+/* represent a zeroed chunk (hole) */
+#define EROFS_NULL_ADDR -1
+
+/* 4-byte block address array */
+#define EROFS_BLOCK_MAP_ENTRY_SIZE sizeof(__le32)
+
+/* 8-byte inode chunk indexes */
+struct erofs_inode_chunk_index {
+ __le16 advise; /* always 0, don't care for now */
+ __le16 device_id; /* back-end storage id, always 0 for now */
+ __le32 blkaddr; /* start block address of this inode chunk */
+};
+
/* maximum supported size of a physical compression cluster */
#define Z_EROFS_PCLUSTER_MAX_SIZE (1024 * 1024)
@@ -338,9 +376,14 @@ static inline void erofs_check_ondisk_layout_definitions(void)
BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64);
BUILD_BUG_ON(sizeof(struct erofs_xattr_ibody_header) != 12);
BUILD_BUG_ON(sizeof(struct erofs_xattr_entry) != 4);
+ BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_info) != 4);
+ BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) != 8);
BUILD_BUG_ON(sizeof(struct z_erofs_map_header) != 8);
BUILD_BUG_ON(sizeof(struct z_erofs_vle_decompressed_index) != 8);
BUILD_BUG_ON(sizeof(struct erofs_dirent) != 12);
+	/* keep the two index structures in sync for better extensibility */
+ BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) !=
+ sizeof(struct z_erofs_vle_decompressed_index));
BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) <
Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1);
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index aa8a0d770ba3..31ac3a73b390 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
+ * Copyright (C) 2021, Alibaba Cloud
*/
#include "xattr.h"
@@ -122,8 +123,11 @@ static struct page *erofs_read_inode(struct inode *inode,
/* total blocks for compressed files */
if (erofs_inode_is_data_compressed(vi->datalayout))
nblks = le32_to_cpu(die->i_u.compressed_blocks);
-
+ else if (vi->datalayout == EROFS_INODE_CHUNK_BASED)
+ /* fill chunked inode summary info */
+ vi->chunkformat = le16_to_cpu(die->i_u.c.format);
kfree(copied);
+ copied = NULL;
break;
case EROFS_INODE_LAYOUT_COMPACT:
vi->inode_isize = sizeof(struct erofs_inode_compact);
@@ -160,6 +164,8 @@ static struct page *erofs_read_inode(struct inode *inode,
inode->i_size = le32_to_cpu(dic->i_size);
if (erofs_inode_is_data_compressed(vi->datalayout))
nblks = le32_to_cpu(dic->i_u.compressed_blocks);
+ else if (vi->datalayout == EROFS_INODE_CHUNK_BASED)
+ vi->chunkformat = le16_to_cpu(dic->i_u.c.format);
break;
default:
erofs_err(inode->i_sb,
@@ -169,11 +175,26 @@ static struct page *erofs_read_inode(struct inode *inode,
goto err_out;
}
+ if (vi->datalayout == EROFS_INODE_CHUNK_BASED) {
+ if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_ALL)) {
+ erofs_err(inode->i_sb,
+ "unsupported chunk format %x of nid %llu",
+ vi->chunkformat, vi->nid);
+ err = -EOPNOTSUPP;
+ goto err_out;
+ }
+ vi->chunkbits = LOG_BLOCK_SIZE +
+ (vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK);
+ }
inode->i_mtime.tv_sec = inode->i_ctime.tv_sec;
inode->i_atime.tv_sec = inode->i_ctime.tv_sec;
inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec;
inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec;
+ inode->i_flags &= ~S_DAX;
+ if (test_opt(&sbi->ctx, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
+ vi->datalayout == EROFS_INODE_FLAT_PLAIN)
+ inode->i_flags |= S_DAX;
if (!nblks)
/* measure inode.i_blocks as generic filesystems */
inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9;
@@ -247,7 +268,10 @@ static int erofs_fill_inode(struct inode *inode, int isdir)
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_op = &erofs_generic_iops;
- inode->i_fop = &generic_ro_fops;
+ if (erofs_inode_is_data_compressed(vi->datalayout))
+ inode->i_fop = &generic_ro_fops;
+ else
+ inode->i_fop = &erofs_file_fops;
break;
case S_IFDIR:
inode->i_op = &erofs_dir_iops;
@@ -358,6 +382,7 @@ const struct inode_operations erofs_generic_iops = {
.getattr = erofs_getattr,
.listxattr = erofs_listxattr,
.get_acl = erofs_get_acl,
+ .fiemap = erofs_fiemap,
};
const struct inode_operations erofs_symlink_iops = {
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 543c2ff97d30..9524e155b38f 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
+ * Copyright (C) 2021, Alibaba Cloud
*/
#ifndef __EROFS_INTERNAL_H
#define __EROFS_INTERNAL_H
@@ -15,6 +16,7 @@
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
+#include <linux/iomap.h>
#include "erofs_fs.h"
/* redefine pr_fmt "erofs: " */
@@ -83,6 +85,7 @@ struct erofs_sb_info {
struct erofs_sb_lz4_info lz4;
#endif /* CONFIG_EROFS_FS_ZIP */
+ struct dax_device *dax_dev;
u32 blocks;
u32 meta_blkaddr;
#ifdef CONFIG_EROFS_FS_XATTR
@@ -115,6 +118,8 @@ struct erofs_sb_info {
/* Mount flags set via mount options or defaults */
#define EROFS_MOUNT_XATTR_USER 0x00000010
#define EROFS_MOUNT_POSIX_ACL 0x00000020
+#define EROFS_MOUNT_DAX_ALWAYS 0x00000040
+#define EROFS_MOUNT_DAX_NEVER 0x00000080
#define clear_opt(ctx, option) ((ctx)->mount_opt &= ~EROFS_MOUNT_##option)
#define set_opt(ctx, option) ((ctx)->mount_opt |= EROFS_MOUNT_##option)
@@ -257,6 +262,10 @@ struct erofs_inode {
union {
erofs_blk_t raw_blkaddr;
+ struct {
+ unsigned short chunkformat;
+ unsigned char chunkbits;
+ };
#ifdef CONFIG_EROFS_FS_ZIP
struct {
unsigned short z_advise;
@@ -353,8 +362,15 @@ struct erofs_map_blocks {
/* Flags used by erofs_map_blocks_flatmode() */
#define EROFS_GET_BLOCKS_RAW 0x0001
+/*
+ * Used to get the exact decompressed length, e.g. fiemap (consider the
+ * lookback approach instead if possible, since it is lighter on metadata.)
+ */
+#define EROFS_GET_BLOCKS_FIEMAP 0x0002
/* zmap.c */
+extern const struct iomap_ops z_erofs_iomap_report_ops;
+
#ifdef CONFIG_EROFS_FS_ZIP
int z_erofs_fill_inode(struct inode *inode);
int z_erofs_map_blocks_iter(struct inode *inode,
@@ -371,7 +387,10 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode,
#endif /* !CONFIG_EROFS_FS_ZIP */
/* data.c */
+extern const struct file_operations erofs_file_fops;
struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr);
+int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len);
/* inode.c */
static inline unsigned long erofs_inode_hash(erofs_nid_t nid)
@@ -441,8 +460,7 @@ int __init z_erofs_init_zip_subsystem(void);
void z_erofs_exit_zip_subsystem(void);
int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
struct erofs_workgroup *egrp);
-int erofs_try_to_free_cached_page(struct address_space *mapping,
- struct page *page);
+int erofs_try_to_free_cached_page(struct page *page);
int z_erofs_load_lz4_config(struct super_block *sb,
struct erofs_super_block *dsb,
struct z_erofs_lz4_cfgs *lz4, int len);
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index a8271ce5e13f..8629e616028c 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -245,4 +245,5 @@ const struct inode_operations erofs_dir_iops = {
.getattr = erofs_getattr,
.listxattr = erofs_listxattr,
.get_acl = erofs_get_acl,
+ .fiemap = erofs_fiemap,
};
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 8fc6c04b54f4..a8d49e8fc83a 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -11,6 +11,7 @@
#include <linux/crc32c.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
+#include <linux/dax.h>
#include "xattr.h"
#define CREATE_TRACE_POINTS
@@ -355,6 +356,8 @@ enum {
Opt_user_xattr,
Opt_acl,
Opt_cache_strategy,
+ Opt_dax,
+ Opt_dax_enum,
Opt_err
};
@@ -365,14 +368,47 @@ static const struct constant_table erofs_param_cache_strategy[] = {
{}
};
+static const struct constant_table erofs_dax_param_enums[] = {
+ {"always", EROFS_MOUNT_DAX_ALWAYS},
+ {"never", EROFS_MOUNT_DAX_NEVER},
+ {}
+};
+
static const struct fs_parameter_spec erofs_fs_parameters[] = {
fsparam_flag_no("user_xattr", Opt_user_xattr),
fsparam_flag_no("acl", Opt_acl),
fsparam_enum("cache_strategy", Opt_cache_strategy,
erofs_param_cache_strategy),
+ fsparam_flag("dax", Opt_dax),
+ fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums),
{}
};
+static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
+{
+#ifdef CONFIG_FS_DAX
+ struct erofs_fs_context *ctx = fc->fs_private;
+
+ switch (mode) {
+ case EROFS_MOUNT_DAX_ALWAYS:
+ warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+ set_opt(ctx, DAX_ALWAYS);
+ clear_opt(ctx, DAX_NEVER);
+ return true;
+ case EROFS_MOUNT_DAX_NEVER:
+ set_opt(ctx, DAX_NEVER);
+ clear_opt(ctx, DAX_ALWAYS);
+ return true;
+ default:
+ DBG_BUGON(1);
+ return false;
+ }
+#else
+ errorfc(fc, "dax options not supported");
+ return false;
+#endif
+}
+
static int erofs_fc_parse_param(struct fs_context *fc,
struct fs_parameter *param)
{
@@ -412,6 +448,14 @@ static int erofs_fc_parse_param(struct fs_context *fc,
errorfc(fc, "compression not supported, cache_strategy ignored");
#endif
break;
+ case Opt_dax:
+ if (!erofs_fc_set_dax_mode(fc, EROFS_MOUNT_DAX_ALWAYS))
+ return -EINVAL;
+ break;
+ case Opt_dax_enum:
+ if (!erofs_fc_set_dax_mode(fc, result.uint_32))
+ return -EINVAL;
+ break;
default:
return -ENOPARAM;
}
@@ -430,7 +474,7 @@ static int erofs_managed_cache_releasepage(struct page *page, gfp_t gfp_mask)
DBG_BUGON(mapping->a_ops != &managed_cache_aops);
if (PagePrivate(page))
- ret = erofs_try_to_free_cached_page(mapping, page);
+ ret = erofs_try_to_free_cached_page(page);
return ret;
}
@@ -496,10 +540,16 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
return -ENOMEM;
sb->s_fs_info = sbi;
+ sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
err = erofs_read_superblock(sb);
if (err)
return err;
+ if (test_opt(ctx, DAX_ALWAYS) &&
+ !bdev_dax_supported(sb->s_bdev, EROFS_BLKSIZ)) {
+ errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
+ clear_opt(ctx, DAX_ALWAYS);
+ }
sb->s_flags |= SB_RDONLY | SB_NOATIME;
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_time_gran = 1;
@@ -609,6 +659,7 @@ static void erofs_kill_sb(struct super_block *sb)
sbi = EROFS_SB(sb);
if (!sbi)
return;
+ fs_put_dax(sbi->dax_dev);
kfree(sbi);
sb->s_fs_info = NULL;
}
@@ -711,8 +762,8 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
static int erofs_show_options(struct seq_file *seq, struct dentry *root)
{
- struct erofs_sb_info *sbi __maybe_unused = EROFS_SB(root->d_sb);
- struct erofs_fs_context *ctx __maybe_unused = &sbi->ctx;
+ struct erofs_sb_info *sbi = EROFS_SB(root->d_sb);
+ struct erofs_fs_context *ctx = &sbi->ctx;
#ifdef CONFIG_EROFS_FS_XATTR
if (test_opt(ctx, XATTR_USER))
@@ -734,6 +785,10 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
seq_puts(seq, ",cache_strategy=readaround");
#endif
+ if (test_opt(ctx, DAX_ALWAYS))
+ seq_puts(seq, ",dax=always");
+ if (test_opt(ctx, DAX_NEVER))
+ seq_puts(seq, ",dax=never");
return 0;
}
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index 8dd54b420a1d..778f2c52295d 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -673,12 +673,15 @@ ssize_t erofs_listxattr(struct dentry *dentry,
}
#ifdef CONFIG_EROFS_FS_POSIX_ACL
-struct posix_acl *erofs_get_acl(struct inode *inode, int type)
+struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu)
{
struct posix_acl *acl;
int prefix, rc;
char *value = NULL;
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
switch (type) {
case ACL_TYPE_ACCESS:
prefix = EROFS_XATTR_INDEX_POSIX_ACL_ACCESS;
diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h
index 366dcb400525..94090c74b3f7 100644
--- a/fs/erofs/xattr.h
+++ b/fs/erofs/xattr.h
@@ -80,7 +80,7 @@ static inline int erofs_getxattr(struct inode *inode, int index,
#endif /* !CONFIG_EROFS_FS_XATTR */
#ifdef CONFIG_EROFS_FS_POSIX_ACL
-struct posix_acl *erofs_get_acl(struct inode *inode, int type);
+struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu);
#else
#define erofs_get_acl (NULL)
#endif
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index cb4d0889eca9..11c7a1aaebad 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -309,7 +309,6 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
{
struct z_erofs_pcluster *const pcl =
container_of(grp, struct z_erofs_pcluster, obj);
- struct address_space *const mapping = MNGD_MAPPING(sbi);
int i;
/*
@@ -326,7 +325,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
if (!trylock_page(page))
return -EBUSY;
- if (page->mapping != mapping)
+ if (!erofs_page_is_managed(sbi, page))
continue;
/* barrier is implied in the following 'unlock_page' */
@@ -337,8 +336,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
return 0;
}
-int erofs_try_to_free_cached_page(struct address_space *mapping,
- struct page *page)
+int erofs_try_to_free_cached_page(struct page *page)
{
struct z_erofs_pcluster *const pcl = (void *)page_private(page);
int ret = 0; /* 0 - busy */
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index f68aea4baed7..9fb98d85a3ce 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -212,9 +212,34 @@ static unsigned int decode_compactedbits(unsigned int lobits,
return lo;
}
+static int get_compacted_la_distance(unsigned int lclusterbits,
+ unsigned int encodebits,
+ unsigned int vcnt, u8 *in, int i)
+{
+ const unsigned int lomask = (1 << lclusterbits) - 1;
+ unsigned int lo, d1 = 0;
+ u8 type;
+
+ DBG_BUGON(i >= vcnt);
+
+ do {
+ lo = decode_compactedbits(lclusterbits, lomask,
+ in, encodebits * i, &type);
+
+ if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
+ return d1;
+ ++d1;
+ } while (++i < vcnt);
+
+ /* vcnt - 1 (Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) item */
+ if (!(lo & Z_EROFS_VLE_DI_D0_CBLKCNT))
+ d1 += lo - 1;
+ return d1;
+}
+
static int unpack_compacted_index(struct z_erofs_maprecorder *m,
unsigned int amortizedshift,
- unsigned int eofs)
+ unsigned int eofs, bool lookahead)
{
struct erofs_inode *const vi = EROFS_I(m->inode);
const unsigned int lclusterbits = vi->z_logical_clusterbits;
@@ -243,6 +268,11 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
m->type = type;
if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) {
m->clusterofs = 1 << lclusterbits;
+
+ /* figure out lookahead_distance: delta[1] if needed */
+ if (lookahead)
+ m->delta[1] = get_compacted_la_distance(lclusterbits,
+ encodebits, vcnt, in, i);
if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) {
if (!big_pcluster) {
DBG_BUGON(1);
@@ -313,7 +343,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
}
static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
- unsigned long lcn)
+ unsigned long lcn, bool lookahead)
{
struct inode *const inode = m->inode;
struct erofs_inode *const vi = EROFS_I(inode);
@@ -364,11 +394,12 @@ out:
err = z_erofs_reload_indexes(m, erofs_blknr(pos));
if (err)
return err;
- return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos));
+ return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos),
+ lookahead);
}
static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m,
- unsigned int lcn)
+ unsigned int lcn, bool lookahead)
{
const unsigned int datamode = EROFS_I(m->inode)->datalayout;
@@ -376,7 +407,7 @@ static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m,
return legacy_load_cluster_from_disk(m, lcn);
if (datamode == EROFS_INODE_FLAT_COMPRESSION)
- return compacted_load_cluster_from_disk(m, lcn);
+ return compacted_load_cluster_from_disk(m, lcn, lookahead);
return -EINVAL;
}
@@ -399,7 +430,7 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
/* load extent head logical cluster if needed */
lcn -= lookback_distance;
- err = z_erofs_load_cluster_from_disk(m, lcn);
+ err = z_erofs_load_cluster_from_disk(m, lcn, false);
if (err)
return err;
@@ -450,7 +481,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
if (m->compressedlcs)
goto out;
- err = z_erofs_load_cluster_from_disk(m, lcn);
+ err = z_erofs_load_cluster_from_disk(m, lcn, false);
if (err)
return err;
@@ -498,6 +529,48 @@ err_bonus_cblkcnt:
return -EFSCORRUPTED;
}
+static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
+{
+ struct inode *inode = m->inode;
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct erofs_map_blocks *map = m->map;
+ unsigned int lclusterbits = vi->z_logical_clusterbits;
+ u64 lcn = m->lcn, headlcn = map->m_la >> lclusterbits;
+ int err;
+
+ do {
+ /* handle the last EOF pcluster (no next HEAD lcluster) */
+ if ((lcn << lclusterbits) >= inode->i_size) {
+ map->m_llen = inode->i_size - map->m_la;
+ return 0;
+ }
+
+ err = z_erofs_load_cluster_from_disk(m, lcn, true);
+ if (err)
+ return err;
+
+ if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) {
+ DBG_BUGON(!m->delta[1] &&
+ m->clusterofs != 1 << lclusterbits);
+ } else if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN ||
+ m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD) {
+ /* go on until the next HEAD lcluster */
+ if (lcn != headlcn)
+ break;
+ m->delta[1] = 1;
+ } else {
+ erofs_err(inode->i_sb, "unknown type %u @ lcn %llu of nid %llu",
+ m->type, lcn, vi->nid);
+ DBG_BUGON(1);
+ return -EOPNOTSUPP;
+ }
+ lcn += m->delta[1];
+ } while (m->delta[1]);
+
+ map->m_llen = (lcn << lclusterbits) + m->clusterofs - map->m_la;
+ return 0;
+}
+
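
z_erofs_get_extent_decompressedlen() above hops forward over NONHEAD lclusters via delta[1] until it reaches the next HEAD, then computes m_llen from that lcluster's offset. A much-simplified toy of the same walk over an invented lcluster array:

	#include <stdio.h>

	struct lcl { int nonhead; unsigned int delta1; unsigned int clusterofs; };

	int main(void)
	{
		/* lcn 0 is the starting HEAD; lcn 1..2 are NONHEAD hops. */
		struct lcl map[] = {
			{ 0, 0, 0 },	/* HEAD, extent starts here */
			{ 1, 2, 0 },	/* NONHEAD, next HEAD is 2 lclusters away */
			{ 1, 1, 0 },
			{ 0, 0, 300 },	/* next HEAD, extent ends at clusterofs 300 */
		};
		unsigned int lclusterbits = 12;		/* 4 KiB lclusters */
		unsigned long long m_la = 0, lcn = 1;

		while (map[lcn].nonhead)
			lcn += map[lcn].delta1;

		unsigned long long m_llen = (lcn << lclusterbits)
					    + map[lcn].clusterofs - m_la;
		printf("decompressed length: %llu bytes\n", m_llen);
		return 0;
	}
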
int z_erofs_map_blocks_iter(struct inode *inode,
struct erofs_map_blocks *map,
int flags)
@@ -531,7 +604,7 @@ int z_erofs_map_blocks_iter(struct inode *inode,
initial_lcn = ofs >> lclusterbits;
endoff = ofs & ((1 << lclusterbits) - 1);
- err = z_erofs_load_cluster_from_disk(&m, initial_lcn);
+ err = z_erofs_load_cluster_from_disk(&m, initial_lcn, false);
if (err)
goto unmap_out;
@@ -581,6 +654,12 @@ int z_erofs_map_blocks_iter(struct inode *inode,
err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
if (err)
goto out;
+
+ if (flags & EROFS_GET_BLOCKS_FIEMAP) {
+ err = z_erofs_get_extent_decompressedlen(&m);
+ if (!err)
+ map->m_flags |= EROFS_MAP_FULL_MAPPED;
+ }
unmap_out:
if (m.kaddr)
kunmap_atomic(m.kaddr);
@@ -596,3 +675,41 @@ out:
DBG_BUGON(err < 0 && err != -ENOMEM);
return err;
}
+
+static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
+{
+ int ret;
+ struct erofs_map_blocks map = { .m_la = offset };
+
+ ret = z_erofs_map_blocks_iter(inode, &map, EROFS_GET_BLOCKS_FIEMAP);
+ if (map.mpage)
+ put_page(map.mpage);
+ if (ret < 0)
+ return ret;
+
+ iomap->bdev = inode->i_sb->s_bdev;
+ iomap->offset = map.m_la;
+ iomap->length = map.m_llen;
+ if (map.m_flags & EROFS_MAP_MAPPED) {
+ iomap->type = IOMAP_MAPPED;
+ iomap->addr = map.m_pa;
+ } else {
+ iomap->type = IOMAP_HOLE;
+ iomap->addr = IOMAP_NULL_ADDR;
+ /*
+		 * There is no strict rule for how to describe extents past EOF,
+		 * yet we need to do it as below. Otherwise, iomap itself will
+		 * get into an endless loop on post-EOF extents.
+ */
+ if (iomap->offset >= inode->i_size)
+ iomap->length = length + map.m_la - offset;
+ }
+ iomap->flags = 0;
+ return 0;
+}
+
+const struct iomap_ops z_erofs_iomap_report_ops = {
+ .iomap_begin = z_erofs_iomap_begin_report,
+};
diff --git a/fs/eventfd.c b/fs/eventfd.c
index e265b6dd4f34..3627dd7d25db 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -25,8 +25,6 @@
#include <linux/idr.h>
#include <linux/uio.h>
-DEFINE_PER_CPU(int, eventfd_wake_count);
-
static DEFINE_IDA(eventfd_ida);
struct eventfd_ctx {
@@ -67,21 +65,21 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
* Deadlock or stack overflow issues can happen if we recurse here
 * through waitqueue wakeup handlers. If the caller uses potentially
* nested waitqueues with custom wakeup handlers, then it should
- * check eventfd_signal_count() before calling this function. If
- * it returns true, the eventfd_signal() call should be deferred to a
+ * check eventfd_signal_allowed() before calling this function. If
+ * it returns false, the eventfd_signal() call should be deferred to a
* safe context.
*/
- if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
+ if (WARN_ON_ONCE(current->in_eventfd_signal))
return 0;
spin_lock_irqsave(&ctx->wqh.lock, flags);
- this_cpu_inc(eventfd_wake_count);
+ current->in_eventfd_signal = 1;
if (ULLONG_MAX - ctx->count < n)
n = ULLONG_MAX - ctx->count;
ctx->count += n;
if (waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
- this_cpu_dec(eventfd_wake_count);
+ current->in_eventfd_signal = 0;
spin_unlock_irqrestore(&ctx->wqh.lock, flags);
return n;
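
The per-CPU eventfd_wake_count is replaced here by a per-task flag, so the recursion check survives preemption and migration. A minimal single-threaded sketch of the guard itself (invented names; current->in_eventfd_signal is the real field):

	#include <stdio.h>

	static _Thread_local int in_signal;	/* per-task recursion marker */

	static int signal_event(void)
	{
		if (in_signal)
			return 0;	/* would recurse; caller must defer */

		in_signal = 1;
		/* ... wake waiters, which may call back into signal_event() ... */
		in_signal = 0;
		return 1;
	}

	int main(void)
	{
		printf("%d\n", signal_event());
		return 0;
	}
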
diff --git a/fs/exec.c b/fs/exec.c
index 38f63451b928..2dc489c164fe 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -217,8 +217,10 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
* We are doing an exec(). 'current' is the process
* doing the exec and bprm->mm is the new process's mm.
*/
+ mmap_read_lock(bprm->mm);
ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags,
&page, NULL, NULL);
+ mmap_read_unlock(bprm->mm);
if (ret <= 0)
return NULL;
@@ -574,7 +576,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv,
}
if (kmapped_page) {
- flush_kernel_dcache_page(kmapped_page);
+ flush_dcache_page(kmapped_page);
kunmap(kmapped_page);
put_arg_page(kmapped_page);
}
@@ -592,7 +594,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv,
ret = 0;
out:
if (kmapped_page) {
- flush_kernel_dcache_page(kmapped_page);
+ flush_dcache_page(kmapped_page);
kunmap(kmapped_page);
put_arg_page(kmapped_page);
}
@@ -634,7 +636,7 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm)
kaddr = kmap_atomic(page);
flush_arg_page(bprm, pos & PAGE_MASK, page);
memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy);
- flush_kernel_dcache_page(page);
+ flush_dcache_page(page);
kunmap_atomic(kaddr);
put_arg_page(page);
}
@@ -2070,10 +2072,8 @@ SYSCALL_DEFINE5(execveat,
const char __user *const __user *, envp,
int, flags)
{
- int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
-
return do_execveat(fd,
- getname_flags(filename, lookup_flags, NULL),
+ getname_uflags(filename, flags),
argv, envp, flags);
}
@@ -2091,10 +2091,8 @@ COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
const compat_uptr_t __user *, envp,
int, flags)
{
- int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
-
return compat_do_execveat(fd,
- getname_flags(filename, lookup_flags, NULL),
+ getname_uflags(filename, flags),
argv, envp, flags);
}
#endif
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index 54eec9185627..1248ff4ef562 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config EXT2_FS
tristate "Second extended fs support"
+ select FS_IOMAP
help
Ext2 is a standard Linux file system for hard disks.
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index b9a9db98e94b..bf298967c5b8 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -141,13 +141,16 @@ fail:
* inode->i_mutex: don't care
*/
struct posix_acl *
-ext2_get_acl(struct inode *inode, int type)
+ext2_get_acl(struct inode *inode, int type, bool rcu)
{
int name_index;
char *value = NULL;
struct posix_acl *acl;
int retval;
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
switch (type) {
case ACL_TYPE_ACCESS:
name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index 917db5f6630a..925ab6287d35 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -55,7 +55,7 @@ static inline int ext2_acl_count(size_t size)
#ifdef CONFIG_EXT2_FS_POSIX_ACL
/* acl.c */
-extern struct posix_acl *ext2_get_acl(struct inode *inode, int type);
+extern struct posix_acl *ext2_get_acl(struct inode *inode, int type, bool rcu);
extern int ext2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct posix_acl *acl, int type);
extern int ext2_init_acl (struct inode *, struct inode *);
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index e512630cb63e..3be9dd6412b7 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -667,9 +667,6 @@ struct ext2_inode_info {
struct rw_semaphore xattr_sem;
#endif
rwlock_t i_meta_lock;
-#ifdef CONFIG_FS_DAX
- struct rw_semaphore dax_sem;
-#endif
/*
* truncate_mutex is for serialising ext2_truncate() against
@@ -685,14 +682,6 @@ struct ext2_inode_info {
#endif
};
-#ifdef CONFIG_FS_DAX
-#define dax_sem_down_write(ext2_inode) down_write(&(ext2_inode)->dax_sem)
-#define dax_sem_up_write(ext2_inode) up_write(&(ext2_inode)->dax_sem)
-#else
-#define dax_sem_down_write(ext2_inode)
-#define dax_sem_up_write(ext2_inode)
-#endif
-
/*
* Inode dynamic state flags
*/
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index f98466acc672..eb97aa3d700e 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -81,7 +81,7 @@ out_unlock:
*
* mmap_lock (MM)
* sb_start_pagefault (vfs, freeze)
- * ext2_inode_info->dax_sem
+ * address_space->invalidate_lock
* address_space->i_mmap_rwsem or page_lock (mutually exclusive in DAX)
* ext2_inode_info->truncate_mutex
*
@@ -91,7 +91,6 @@ out_unlock:
static vm_fault_t ext2_dax_fault(struct vm_fault *vmf)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
- struct ext2_inode_info *ei = EXT2_I(inode);
vm_fault_t ret;
bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
(vmf->vma->vm_flags & VM_SHARED);
@@ -100,11 +99,11 @@ static vm_fault_t ext2_dax_fault(struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
file_update_time(vmf->vma->vm_file);
}
- down_read(&ei->dax_sem);
+ filemap_invalidate_lock_shared(inode->i_mapping);
ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, NULL, &ext2_iomap_ops);
- up_read(&ei->dax_sem);
+ filemap_invalidate_unlock_shared(inode->i_mapping);
if (write)
sb_end_pagefault(inode->i_sb);
return ret;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index dadb121beb22..333fa62661d5 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -799,7 +799,6 @@ int ext2_get_block(struct inode *inode, sector_t iblock,
}
-#ifdef CONFIG_FS_DAX
static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned flags, struct iomap *iomap, struct iomap *srcmap)
{
@@ -852,16 +851,18 @@ const struct iomap_ops ext2_iomap_ops = {
.iomap_begin = ext2_iomap_begin,
.iomap_end = ext2_iomap_end,
};
-#else
-/* Define empty ops for !CONFIG_FS_DAX case to avoid ugly ifdefs */
-const struct iomap_ops ext2_iomap_ops;
-#endif /* CONFIG_FS_DAX */
int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
- return generic_block_fiemap(inode, fieinfo, start, len,
- ext2_get_block);
+ int ret;
+
+ inode_lock(inode);
+ len = min_t(u64, len, i_size_read(inode));
+ ret = iomap_fiemap(inode, fieinfo, start, len, &ext2_iomap_ops);
+ inode_unlock(inode);
+
+ return ret;
}
static int ext2_writepage(struct page *page, struct writeback_control *wbc)
@@ -1177,7 +1178,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de
ext2_free_data(inode, p, q);
}
-/* dax_sem must be held when calling this function */
+/* mapping->invalidate_lock must be held when calling this function */
static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
{
__le32 *i_data = EXT2_I(inode)->i_data;
@@ -1194,7 +1195,7 @@ static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
#ifdef CONFIG_FS_DAX
- WARN_ON(!rwsem_is_locked(&ei->dax_sem));
+ WARN_ON(!rwsem_is_locked(&inode->i_mapping->invalidate_lock));
#endif
n = ext2_block_to_path(inode, iblock, offsets, NULL);
@@ -1276,9 +1277,9 @@ static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
if (ext2_inode_is_fast_symlink(inode))
return;
- dax_sem_down_write(EXT2_I(inode));
+ filemap_invalidate_lock(inode->i_mapping);
__ext2_truncate_blocks(inode, offset);
- dax_sem_up_write(EXT2_I(inode));
+ filemap_invalidate_unlock(inode->i_mapping);
}
static int ext2_setsize(struct inode *inode, loff_t newsize)
@@ -1308,10 +1309,10 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
if (error)
return error;
- dax_sem_down_write(EXT2_I(inode));
+ filemap_invalidate_lock(inode->i_mapping);
truncate_setsize(inode, newsize);
__ext2_truncate_blocks(inode, newsize);
- dax_sem_up_write(EXT2_I(inode));
+ filemap_invalidate_unlock(inode->i_mapping);
inode->i_mtime = inode->i_ctime = current_time(inode);
if (inode_needs_sync(inode)) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 21e09fbaa46f..987bcf32ed46 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -206,9 +206,6 @@ static void init_once(void *foo)
init_rwsem(&ei->xattr_sem);
#endif
mutex_init(&ei->truncate_mutex);
-#ifdef CONFIG_FS_DAX
- init_rwsem(&ei->dax_sem);
-#endif
inode_init_once(&ei->vfs_inode);
}
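The ext2 hunks above replace the filesystem-private dax_sem with the generic mapping->invalidate_lock: page faults take it shared, truncation takes it exclusive, and __ext2_truncate_blocks() now documents that the lock must be held. A minimal sketch of the resulting writer-side pattern — the function name is illustrative; the real callers are ext2_truncate_blocks() and ext2_setsize() shown above:

static void ext2_truncate_locked_sketch(struct inode *inode, loff_t offset)
{
	/* excludes ext2_dax_fault(), which holds the lock shared */
	filemap_invalidate_lock(inode->i_mapping);
	__ext2_truncate_blocks(inode, offset);	/* must run with the lock held */
	filemap_invalidate_unlock(inode->i_mapping);
}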
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 49e7af6cc93f..7d89142e1421 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -10,7 +10,7 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \
mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \
super.o symlink.o sysfs.o xattr.o xattr_hurd.o xattr_trusted.o \
- xattr_user.o fast_commit.o
+ xattr_user.o fast_commit.o orphan.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index c5eaffccecc3..0613dfcbfd4a 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -142,13 +142,16 @@ fail:
* inode->i_mutex: don't care
*/
struct posix_acl *
-ext4_get_acl(struct inode *inode, int type)
+ext4_get_acl(struct inode *inode, int type, bool rcu)
{
int name_index;
char *value = NULL;
struct posix_acl *acl;
int retval;
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
switch (type) {
case ACL_TYPE_ACCESS:
name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 84b8942a57f2..3219669732bf 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -55,7 +55,7 @@ static inline int ext4_acl_count(size_t size)
#ifdef CONFIG_EXT4_FS_POSIX_ACL
/* acl.c */
-struct posix_acl *ext4_get_acl(struct inode *inode, int type);
+struct posix_acl *ext4_get_acl(struct inode *inode, int type, bool rcu);
int ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct posix_acl *acl, int type);
extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
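Both the ext2 and ext4 conversions above follow the same shape: ->get_acl() gains a bool rcu argument, and a filesystem whose ACL lookup may block simply refuses the RCU-walk case so the VFS retries in ref-walk mode. A hedged sketch of that shape — the function and the "no ACL" return value are illustrative, not taken from the patch:

struct posix_acl *sample_get_acl(struct inode *inode, int type, bool rcu)
{
	if (rcu)
		return ERR_PTR(-ECHILD);	/* cannot sleep under rcu-walk; retry ref-walk */

	/* slow path: read the ACL from an xattr, which may do I/O */
	return NULL;				/* hypothetical: no ACL stored */
}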
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 9dc6e74b265c..a0fb0c4bdc7c 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -652,8 +652,14 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
* possible we just missed a transaction commit that did so
*/
smp_mb();
- if (sbi->s_mb_free_pending == 0)
+ if (sbi->s_mb_free_pending == 0) {
+ if (test_opt(sb, DISCARD)) {
+ atomic_inc(&sbi->s_retry_alloc_pending);
+ flush_work(&sbi->s_discard_work);
+ atomic_dec(&sbi->s_retry_alloc_pending);
+ }
return ext4_has_free_clusters(sbi, 1, 0);
+ }
/*
* it's possible we've just missed a transaction commit here,
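The retry path added above waits on the discard worker only when the filesystem is mounted with -o discard, since in that configuration freed blocks are handed back for reuse only after they have been discarded; s_retry_alloc_pending (added to ext4_sb_info further down) lets the worker see that an allocator is waiting. A sketch of the handshake using just the symbols this series introduces — the worker internals live in mballoc.c and are not part of this diff:

	if (test_opt(sb, DISCARD)) {
		atomic_inc(&sbi->s_retry_alloc_pending);	/* a waiter exists */
		flush_work(&sbi->s_discard_work);		/* push pending discards through */
		atomic_dec(&sbi->s_retry_alloc_pending);
	}
	return ext4_has_free_clusters(sbi, 1, 0);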
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3c51e243450d..90ff5acaf11f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1034,7 +1034,14 @@ struct ext4_inode_info {
*/
struct rw_semaphore xattr_sem;
- struct list_head i_orphan; /* unlinked but open inodes */
+ /*
+ * Inodes with EXT4_STATE_ORPHAN_FILE use i_orphan_idx. Otherwise
+ * i_orphan is used.
+ */
+ union {
+ struct list_head i_orphan; /* unlinked but open inodes */
+ unsigned int i_orphan_idx; /* Index in orphan file */
+ };
/* Fast commit related info */
@@ -1086,15 +1093,6 @@ struct ext4_inode_info {
* by other means, so we have i_data_sem.
*/
struct rw_semaphore i_data_sem;
- /*
- * i_mmap_sem is for serializing page faults with truncate / punch hole
- * operations. We have to make sure that new page cannot be faulted in
- * a section of the inode that is being punched. We cannot easily use
- * i_data_sem for this since we need protection for the whole punch
- * operation and i_data_sem ranks below transaction start so we have
- * to occasionally drop it.
- */
- struct rw_semaphore i_mmap_sem;
struct inode vfs_inode;
struct jbd2_inode *jinode;
@@ -1428,7 +1426,8 @@ struct ext4_super_block {
__u8 s_last_error_errcode;
__le16 s_encoding; /* Filename charset encoding */
__le16 s_encoding_flags; /* Filename charset encoding flags */
- __le32 s_reserved[95]; /* Padding to the end of the block */
+ __le32 s_orphan_file_inum; /* Inode for tracking orphan inodes */
+ __le32 s_reserved[94]; /* Padding to the end of the block */
__le32 s_checksum; /* crc32c(superblock) */
};
@@ -1447,6 +1446,54 @@ struct ext4_super_block {
#define EXT4_ENC_UTF8_12_1 1
+/* Types of ext4 journal triggers */
+enum ext4_journal_trigger_type {
+ EXT4_JTR_ORPHAN_FILE,
+ EXT4_JTR_NONE /* This must be the last entry for indexing to work! */
+};
+
+#define EXT4_JOURNAL_TRIGGER_COUNT EXT4_JTR_NONE
+
+struct ext4_journal_trigger {
+ struct jbd2_buffer_trigger_type tr_triggers;
+ struct super_block *sb;
+};
+
+static inline struct ext4_journal_trigger *EXT4_TRIGGER(
+ struct jbd2_buffer_trigger_type *trigger)
+{
+ return container_of(trigger, struct ext4_journal_trigger, tr_triggers);
+}
+
+#define EXT4_ORPHAN_BLOCK_MAGIC 0x0b10ca04
+
+/* Structure at the tail of orphan block */
+struct ext4_orphan_block_tail {
+ __le32 ob_magic;
+ __le32 ob_checksum;
+};
+
+static inline int ext4_inodes_per_orphan_block(struct super_block *sb)
+{
+ return (sb->s_blocksize - sizeof(struct ext4_orphan_block_tail)) /
+ sizeof(u32);
+}
+
+struct ext4_orphan_block {
+ atomic_t ob_free_entries; /* Number of free orphan entries in block */
+ struct buffer_head *ob_bh; /* Buffer for orphan block */
+};
+
+/*
+ * Info about orphan file.
+ */
+struct ext4_orphan_info {
+ int of_blocks; /* Number of orphan blocks in a file */
+ __u32 of_csum_seed; /* Checksum seed for orphan file */
+ struct ext4_orphan_block *of_binfo; /* Array with info about orphan
+ * file blocks */
+};
+
/*
* fourth extended-fs super-block data in memory
*/
@@ -1501,9 +1548,11 @@ struct ext4_sb_info {
/* Journaling */
struct journal_s *s_journal;
- struct list_head s_orphan;
- struct mutex s_orphan_lock;
unsigned long s_ext4_flags; /* Ext4 superblock flags */
+ struct mutex s_orphan_lock; /* Protects on disk list changes */
+ struct list_head s_orphan; /* List of orphaned inodes in on disk
+ list */
+ struct ext4_orphan_info s_orphan_info;
unsigned long s_commit_interval;
u32 s_max_batch_time;
u32 s_min_batch_time;
@@ -1536,6 +1585,9 @@ struct ext4_sb_info {
unsigned int s_mb_free_pending;
struct list_head s_freed_data_list; /* List of blocks to be freed
after commit completed */
+ struct list_head s_discard_list;
+ struct work_struct s_discard_work;
+ atomic_t s_retry_alloc_pending;
struct rb_root s_mb_avg_fragment_size_root;
rwlock_t s_mb_rb_lock;
struct list_head *s_mb_largest_free_orders;
@@ -1625,6 +1677,9 @@ struct ext4_sb_info {
struct mb_cache *s_ea_inode_cache;
spinlock_t s_es_lock ____cacheline_aligned_in_smp;
+ /* Journal triggers for checksum computation */
+ struct ext4_journal_trigger s_journal_triggers[EXT4_JOURNAL_TRIGGER_COUNT];
+
/* Ratelimit ext4 messages. */
struct ratelimit_state s_err_ratelimit_state;
struct ratelimit_state s_warning_ratelimit_state;
@@ -1835,6 +1890,7 @@ enum {
EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */
EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */
+ EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1936,6 +1992,7 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
*/
#define EXT4_FEATURE_COMPAT_FAST_COMMIT 0x0400
#define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800
+#define EXT4_FEATURE_COMPAT_ORPHAN_FILE 0x1000 /* Orphan file exists */
#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001
#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002
@@ -1956,6 +2013,8 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
#define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000
#define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000
#define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000
+#define EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT 0x10000 /* Orphan file may be
+ non-empty */
#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -2039,6 +2098,7 @@ EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX)
EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2)
EXT4_FEATURE_COMPAT_FUNCS(fast_commit, FAST_COMMIT)
EXT4_FEATURE_COMPAT_FUNCS(stable_inodes, STABLE_INODES)
+EXT4_FEATURE_COMPAT_FUNCS(orphan_file, ORPHAN_FILE)
EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER)
EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE)
@@ -2053,6 +2113,7 @@ EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM)
EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY)
EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT)
EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY)
+EXT4_FEATURE_RO_COMPAT_FUNCS(orphan_present, ORPHAN_PRESENT)
EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION)
EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE)
@@ -2086,7 +2147,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD)
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
-#define EXT4_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT4_FEATURE_COMPAT_SUPP (EXT4_FEATURE_COMPAT_EXT_ATTR| \
+ EXT4_FEATURE_COMPAT_ORPHAN_FILE)
#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
EXT4_FEATURE_INCOMPAT_RECOVER| \
EXT4_FEATURE_INCOMPAT_META_BG| \
@@ -2111,7 +2173,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD)
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
EXT4_FEATURE_RO_COMPAT_QUOTA |\
EXT4_FEATURE_RO_COMPAT_PROJECT |\
- EXT4_FEATURE_RO_COMPAT_VERITY)
+ EXT4_FEATURE_RO_COMPAT_VERITY |\
+ EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT)
#define EXTN_FEATURE_FUNCS(ver) \
static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \
@@ -2147,6 +2210,8 @@ static inline bool ext4_has_incompat_features(struct super_block *sb)
return (EXT4_SB(sb)->s_es->s_feature_incompat != 0);
}
+extern int ext4_feature_set_ok(struct super_block *sb, int readonly);
+
/*
* Superblock flags
*/
@@ -2159,7 +2224,6 @@ static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi)
return test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
}
-
/*
* Default values for user and/or group using reserved blocks
*/
@@ -2920,13 +2984,14 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create);
int ext4_walk_page_buffers(handle_t *handle,
+ struct inode *inode,
struct buffer_head *head,
unsigned from,
unsigned to,
int *partial,
- int (*fn)(handle_t *handle,
+ int (*fn)(handle_t *handle, struct inode *inode,
struct buffer_head *bh));
-int do_journal_get_write_access(handle_t *handle,
+int do_journal_get_write_access(handle_t *handle, struct inode *inode,
struct buffer_head *bh);
#define FALL_BACK_TO_NONDELALLOC 1
#define CONVERT_INLINE_DATA 2
@@ -2972,7 +3037,6 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
-extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
extern void ext4_da_release_space(struct inode *inode, int to_free);
@@ -3006,8 +3070,6 @@ extern int ext4_init_new_dir(handle_t *handle, struct inode *dir,
struct inode *inode);
extern int ext4_dirblock_csum_verify(struct inode *inode,
struct buffer_head *bh);
-extern int ext4_orphan_add(handle_t *, struct inode *);
-extern int ext4_orphan_del(handle_t *, struct inode *);
extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
__u32 start_minor_hash, __u32 *next_hash);
extern int ext4_search_dir(struct buffer_head *bh,
@@ -3476,6 +3538,7 @@ static inline bool ext4_is_quota_journalled(struct super_block *sb)
return (ext4_has_feature_quota(sb) ||
sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]);
}
+int ext4_enable_quotas(struct super_block *sb);
#endif
/*
@@ -3737,6 +3800,19 @@ extern void ext4_stop_mmpd(struct ext4_sb_info *sbi);
/* verity.c */
extern const struct fsverity_operations ext4_verityops;
+/* orphan.c */
+extern int ext4_orphan_add(handle_t *, struct inode *);
+extern int ext4_orphan_del(handle_t *, struct inode *);
+extern void ext4_orphan_cleanup(struct super_block *sb,
+ struct ext4_super_block *es);
+extern void ext4_release_orphan_info(struct super_block *sb);
+extern int ext4_init_orphan_info(struct super_block *sb);
+extern int ext4_orphan_file_empty(struct super_block *sb);
+extern void ext4_orphan_file_block_trigger(
+ struct jbd2_buffer_trigger_type *triggers,
+ struct buffer_head *bh,
+ void *data, size_t size);
+
/*
* Add new method to test whether block and inode bitmaps are properly
* initialized. With uninit_bg reading the block from disk is not enough
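The orphan-file structures added to ext4.h above fix the per-block capacity: a block holds an array of 32-bit inode numbers followed by struct ext4_orphan_block_tail (magic plus checksum). A standalone userspace illustration of that arithmetic — not kernel code — assuming the on-disk layout exactly as declared above:

#include <stdio.h>
#include <stdint.h>

/* mirrors ext4_inodes_per_orphan_block(): entries are __le32 inode numbers */
static int inodes_per_orphan_block(unsigned int blocksize)
{
	const unsigned int tail = 2 * sizeof(uint32_t);	/* ob_magic + ob_checksum */

	return (blocksize - tail) / sizeof(uint32_t);
}

int main(void)
{
	/* 1 KiB blocks hold 254 orphan entries, 4 KiB blocks hold 1022 */
	printf("%d %d\n", inodes_per_orphan_block(1024),
	       inodes_per_orphan_block(4096));
	return 0;
}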
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 44e59881a1f0..26435f3a3094 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -173,10 +173,11 @@ struct partial_cluster {
#define EXT_MAX_EXTENT(__hdr__) \
((le16_to_cpu((__hdr__)->eh_max)) ? \
((EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \
- : 0)
+ : NULL)
#define EXT_MAX_INDEX(__hdr__) \
((le16_to_cpu((__hdr__)->eh_max)) ? \
- ((EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) : 0)
+ ((EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \
+ : NULL)
static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode)
{
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index b60f0152ea57..6def7339056d 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -218,9 +218,11 @@ static void ext4_check_bdev_write_error(struct super_block *sb)
}
int __ext4_journal_get_write_access(const char *where, unsigned int line,
- handle_t *handle, struct buffer_head *bh)
+ handle_t *handle, struct super_block *sb,
+ struct buffer_head *bh,
+ enum ext4_journal_trigger_type trigger_type)
{
- int err = 0;
+ int err;
might_sleep();
@@ -229,11 +231,18 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
if (ext4_handle_valid(handle)) {
err = jbd2_journal_get_write_access(handle, bh);
- if (err)
+ if (err) {
ext4_journal_abort_handle(where, line, __func__, bh,
handle, err);
+ return err;
+ }
}
- return err;
+ if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb))
+ return 0;
+ BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT);
+ jbd2_journal_set_triggers(bh,
+ &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers);
+ return 0;
}
/*
@@ -301,17 +310,27 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
}
int __ext4_journal_get_create_access(const char *where, unsigned int line,
- handle_t *handle, struct buffer_head *bh)
+ handle_t *handle, struct super_block *sb,
+ struct buffer_head *bh,
+ enum ext4_journal_trigger_type trigger_type)
{
- int err = 0;
+ int err;
- if (ext4_handle_valid(handle)) {
- err = jbd2_journal_get_create_access(handle, bh);
- if (err)
- ext4_journal_abort_handle(where, line, __func__,
- bh, handle, err);
+ if (!ext4_handle_valid(handle))
+ return 0;
+
+ err = jbd2_journal_get_create_access(handle, bh);
+ if (err) {
+ ext4_journal_abort_handle(where, line, __func__, bh, handle,
+ err);
+ return err;
}
- return err;
+ if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb))
+ return 0;
+ BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT);
+ jbd2_journal_set_triggers(bh,
+ &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers);
+ return 0;
}
int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
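With the hunks above, asking for write access on a buffer can also attach a jbd2 trigger chosen by the new trigger_type argument: EXT4_JTR_NONE keeps the old behaviour, while EXT4_JTR_ORPHAN_FILE wires the buffer to ext4_orphan_file_block_trigger() so the orphan block's tail checksum is recomputed before the block goes to the journal. A hedged sketch of the intended caller — the real one lives in the new fs/ext4/orphan.c, which is not part of this diff, and ob_bh is an illustrative name for an orphan-file block buffer:

	err = ext4_journal_get_write_access(handle, sb, ob_bh,
					    EXT4_JTR_ORPHAN_FILE);
	if (err)
		return err;
	/* jbd2 now invokes the orphan-file trigger on ob_bh at commit time,
	 * so callers never recompute the ext4_orphan_block_tail checksum
	 * by hand. */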
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 0d2fa423b7ad..0e4fa644df01 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -231,26 +231,32 @@ int ext4_expand_extra_isize(struct inode *inode,
* Wrapper functions with which ext4 calls into JBD.
*/
int __ext4_journal_get_write_access(const char *where, unsigned int line,
- handle_t *handle, struct buffer_head *bh);
+ handle_t *handle, struct super_block *sb,
+ struct buffer_head *bh,
+ enum ext4_journal_trigger_type trigger_type);
int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
int is_metadata, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t blocknr);
int __ext4_journal_get_create_access(const char *where, unsigned int line,
- handle_t *handle, struct buffer_head *bh);
+ handle_t *handle, struct super_block *sb,
+ struct buffer_head *bh,
+ enum ext4_journal_trigger_type trigger_type);
int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
handle_t *handle, struct inode *inode,
struct buffer_head *bh);
-#define ext4_journal_get_write_access(handle, bh) \
- __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
+#define ext4_journal_get_write_access(handle, sb, bh, trigger_type) \
+ __ext4_journal_get_write_access(__func__, __LINE__, (handle), (sb), \
+ (bh), (trigger_type))
#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
__ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \
(bh), (block_nr))
-#define ext4_journal_get_create_access(handle, bh) \
- __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh))
+#define ext4_journal_get_create_access(handle, sb, bh, trigger_type) \
+ __ext4_journal_get_create_access(__func__, __LINE__, (handle), (sb), \
+ (bh), (trigger_type))
#define ext4_handle_dirty_metadata(handle, inode, bh) \
__ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
(bh))
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 92ad64b89d9b..c0de30f25185 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -139,7 +139,8 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
if (path->p_bh) {
/* path points to block */
BUFFER_TRACE(path->p_bh, "get_write_access");
- return ext4_journal_get_write_access(handle, path->p_bh);
+ return ext4_journal_get_write_access(handle, inode->i_sb,
+ path->p_bh, EXT4_JTR_NONE);
}
/* path points to leaf/index in inode body */
/* we use in-core data, no need to protect them */
@@ -1082,7 +1083,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
}
lock_buffer(bh);
- err = ext4_journal_get_create_access(handle, bh);
+ err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (err)
goto cleanup;
@@ -1160,7 +1162,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
}
lock_buffer(bh);
- err = ext4_journal_get_create_access(handle, bh);
+ err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (err)
goto cleanup;
@@ -1286,7 +1289,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
return -ENOMEM;
lock_buffer(bh);
- err = ext4_journal_get_create_access(handle, bh);
+ err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (err) {
unlock_buffer(bh);
goto out;
@@ -3569,7 +3573,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
split_map.m_len - ee_block);
err = ext4_ext_zeroout(inode, &zero_ex1);
if (err)
- goto out;
+ goto fallback;
split_map.m_len = allocated;
}
if (split_map.m_lblk - ee_block + split_map.m_len <
@@ -3583,7 +3587,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ext4_ext_pblock(ex));
err = ext4_ext_zeroout(inode, &zero_ex2);
if (err)
- goto out;
+ goto fallback;
}
split_map.m_len += split_map.m_lblk - ee_block;
@@ -3592,6 +3596,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
}
}
+fallback:
err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
flags);
if (err > 0)
@@ -4474,6 +4479,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
loff_t len, int mode)
{
struct inode *inode = file_inode(file);
+ struct address_space *mapping = file->f_mapping;
handle_t *handle = NULL;
unsigned int max_blocks;
loff_t new_size = 0;
@@ -4560,17 +4566,17 @@ static long ext4_zero_range(struct file *file, loff_t offset,
* Prevent page faults from reinstantiating pages we have
* released from page cache.
*/
- down_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(mapping);
ret = ext4_break_layouts(inode);
if (ret) {
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
goto out_mutex;
}
ret = ext4_update_disksize_before_punch(inode, offset, len);
if (ret) {
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
goto out_mutex;
}
/* Now release the pages and zero block aligned part of pages */
@@ -4579,7 +4585,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
flags);
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
if (ret)
goto out_mutex;
}
@@ -5221,6 +5227,7 @@ out:
static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
{
struct super_block *sb = inode->i_sb;
+ struct address_space *mapping = inode->i_mapping;
ext4_lblk_t punch_start, punch_stop;
handle_t *handle;
unsigned int credits;
@@ -5274,7 +5281,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
* Prevent page faults from reinstantiating pages we have released from
* page cache.
*/
- down_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(mapping);
ret = ext4_break_layouts(inode);
if (ret)
@@ -5289,15 +5296,15 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
* Write tail of the last page before removed range since it will get
* removed from the page cache below.
*/
- ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
+ ret = filemap_write_and_wait_range(mapping, ioffset, offset);
if (ret)
goto out_mmap;
/*
* Write data that will be shifted to preserve them when discarding
* page cache below. We are also protected from pages becoming dirty
- * by i_mmap_sem.
+ * by i_rwsem and invalidate_lock.
*/
- ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
+ ret = filemap_write_and_wait_range(mapping, offset + len,
LLONG_MAX);
if (ret)
goto out_mmap;
@@ -5350,7 +5357,7 @@ out_stop:
ext4_journal_stop(handle);
ext4_fc_stop_ineligible(sb);
out_mmap:
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
out_mutex:
inode_unlock(inode);
return ret;
@@ -5367,6 +5374,7 @@ out_mutex:
static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
{
struct super_block *sb = inode->i_sb;
+ struct address_space *mapping = inode->i_mapping;
handle_t *handle;
struct ext4_ext_path *path;
struct ext4_extent *extent;
@@ -5425,7 +5433,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
* Prevent page faults from reinstantiating pages we have released from
* page cache.
*/
- down_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(mapping);
ret = ext4_break_layouts(inode);
if (ret)
@@ -5526,7 +5534,7 @@ out_stop:
ext4_journal_stop(handle);
ext4_fc_stop_ineligible(sb);
out_mmap:
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
out_mutex:
inode_unlock(inode);
return ret;
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index e8195229c252..8e610a381862 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -775,28 +775,27 @@ static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
}
/* Same as above, but adds dentry tlv. */
-static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
- int parent_ino, int ino, int dlen,
- const unsigned char *dname,
- u32 *crc)
+static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
+ struct ext4_fc_dentry_update *fc_dentry)
{
struct ext4_fc_dentry_info fcd;
struct ext4_fc_tl tl;
+ int dlen = fc_dentry->fcd_name.len;
u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
crc);
if (!dst)
return false;
- fcd.fc_parent_ino = cpu_to_le32(parent_ino);
- fcd.fc_ino = cpu_to_le32(ino);
- tl.fc_tag = cpu_to_le16(tag);
+ fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
+ fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
+ tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
dst += sizeof(tl);
ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
dst += sizeof(fcd);
- ext4_fc_memcpy(sb, dst, dname, dlen, crc);
+ ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);
dst += dlen;
return true;
@@ -992,11 +991,7 @@ __releases(&sbi->s_fc_lock)
&sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
spin_unlock(&sbi->s_fc_lock);
- if (!ext4_fc_add_dentry_tlv(
- sb, fc_dentry->fcd_op,
- fc_dentry->fcd_parent, fc_dentry->fcd_ino,
- fc_dentry->fcd_name.len,
- fc_dentry->fcd_name.name, crc)) {
+ if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
ret = -ENOSPC;
goto lock_and_exit;
}
@@ -1035,11 +1030,7 @@ __releases(&sbi->s_fc_lock)
if (ret)
goto lock_and_exit;
- if (!ext4_fc_add_dentry_tlv(
- sb, fc_dentry->fcd_op,
- fc_dentry->fcd_parent, fc_dentry->fcd_ino,
- fc_dentry->fcd_name.len,
- fc_dentry->fcd_name.name, crc)) {
+ if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
ret = -ENOSPC;
goto lock_and_exit;
}
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 816dedcbd541..ac0e11bbb445 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -704,22 +704,23 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
*/
bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
(vmf->vma->vm_flags & VM_SHARED);
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
pfn_t pfn;
if (write) {
sb_start_pagefault(sb);
file_update_time(vmf->vma->vm_file);
- down_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock_shared(mapping);
retry:
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
EXT4_DATA_TRANS_BLOCKS(sb));
if (IS_ERR(handle)) {
- up_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock_shared(mapping);
sb_end_pagefault(sb);
return VM_FAULT_SIGBUS;
}
} else {
- down_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock_shared(mapping);
}
result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops);
if (write) {
@@ -731,10 +732,10 @@ retry:
/* Handling synchronous page fault? */
if (result & VM_FAULT_NEEDDSYNC)
result = dax_finish_sync_fault(vmf, pe_size, pfn);
- up_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock_shared(mapping);
sb_end_pagefault(sb);
} else {
- up_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock_shared(mapping);
}
return result;
@@ -756,7 +757,7 @@ static const struct vm_operations_struct ext4_dax_vm_ops = {
#endif
static const struct vm_operations_struct ext4_file_vm_ops = {
- .fault = ext4_filemap_fault,
+ .fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = ext4_page_mkwrite,
};
@@ -822,7 +823,8 @@ static int ext4_sample_last_mounted(struct super_block *sb,
if (IS_ERR(handle))
goto out;
BUFFER_TRACE(sbi->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+ err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
+ EXT4_JTR_NONE);
if (err)
goto out_journal;
lock_buffer(sbi->s_sbh);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index e89fc0f770b0..f73e5eb43eae 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -300,7 +300,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
}
BUFFER_TRACE(bitmap_bh, "get_write_access");
- fatal = ext4_journal_get_write_access(handle, bitmap_bh);
+ fatal = ext4_journal_get_write_access(handle, sb, bitmap_bh,
+ EXT4_JTR_NONE);
if (fatal)
goto error_return;
@@ -308,7 +309,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
gdp = ext4_get_group_desc(sb, block_group, &bh2);
if (gdp) {
BUFFER_TRACE(bh2, "get_write_access");
- fatal = ext4_journal_get_write_access(handle, bh2);
+ fatal = ext4_journal_get_write_access(handle, sb, bh2,
+ EXT4_JTR_NONE);
}
ext4_lock_group(sb, block_group);
cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data);
@@ -1085,7 +1087,8 @@ repeat_in_this_group:
}
}
BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
+ err = ext4_journal_get_write_access(handle, sb, inode_bitmap_bh,
+ EXT4_JTR_NONE);
if (err) {
ext4_std_error(sb, err);
goto out;
@@ -1127,7 +1130,8 @@ got:
}
BUFFER_TRACE(group_desc_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, group_desc_bh);
+ err = ext4_journal_get_write_access(handle, sb, group_desc_bh,
+ EXT4_JTR_NONE);
if (err) {
ext4_std_error(sb, err);
goto out;
@@ -1144,7 +1148,8 @@ got:
goto out;
}
BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
- err = ext4_journal_get_write_access(handle, block_bitmap_bh);
+ err = ext4_journal_get_write_access(handle, sb, block_bitmap_bh,
+ EXT4_JTR_NONE);
if (err) {
brelse(block_bitmap_bh);
ext4_std_error(sb, err);
@@ -1583,8 +1588,8 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
num = sbi->s_itb_per_group - used_blks;
BUFFER_TRACE(group_desc_bh, "get_write_access");
- ret = ext4_journal_get_write_access(handle,
- group_desc_bh);
+ ret = ext4_journal_get_write_access(handle, sb, group_desc_bh,
+ EXT4_JTR_NONE);
if (ret)
goto err_out;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index a7bc6ad656a9..89efa78ed4b2 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -354,7 +354,8 @@ static int ext4_alloc_branch(handle_t *handle,
}
lock_buffer(bh);
BUFFER_TRACE(bh, "call get_create_access");
- err = ext4_journal_get_create_access(handle, bh);
+ err = ext4_journal_get_create_access(handle, ar->inode->i_sb,
+ bh, EXT4_JTR_NONE);
if (err) {
unlock_buffer(bh);
goto failed;
@@ -429,7 +430,8 @@ static int ext4_splice_branch(handle_t *handle,
*/
if (where->bh) {
BUFFER_TRACE(where->bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, where->bh);
+ err = ext4_journal_get_write_access(handle, ar->inode->i_sb,
+ where->bh, EXT4_JTR_NONE);
if (err)
goto err_out;
}
@@ -728,7 +730,8 @@ static int ext4_ind_truncate_ensure_credits(handle_t *handle,
return ret;
if (bh) {
BUFFER_TRACE(bh, "retaking write access");
- ret = ext4_journal_get_write_access(handle, bh);
+ ret = ext4_journal_get_write_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (unlikely(ret))
return ret;
}
@@ -916,7 +919,8 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
if (this_bh) { /* For indirect block */
BUFFER_TRACE(this_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, this_bh);
+ err = ext4_journal_get_write_access(handle, inode->i_sb,
+ this_bh, EXT4_JTR_NONE);
/* Important: if we can't update the indirect pointers
* to the blocks, we can't free them. */
if (err)
@@ -1079,7 +1083,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
*/
BUFFER_TRACE(parent_bh, "get_write_access");
if (!ext4_journal_get_write_access(handle,
- parent_bh)){
+ inode->i_sb, parent_bh,
+ EXT4_JTR_NONE)) {
*p = 0;
BUFFER_TRACE(parent_bh,
"call ext4_handle_dirty_metadata");
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 70cb64db33f7..82bf4ff6be28 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -264,7 +264,8 @@ static int ext4_create_inline_data(handle_t *handle,
return error;
BUFFER_TRACE(is.iloc.bh, "get_write_access");
- error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh,
+ EXT4_JTR_NONE);
if (error)
goto out;
@@ -350,7 +351,8 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
goto out;
BUFFER_TRACE(is.iloc.bh, "get_write_access");
- error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh,
+ EXT4_JTR_NONE);
if (error)
goto out;
@@ -427,7 +429,8 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle,
goto out;
BUFFER_TRACE(is.iloc.bh, "get_write_access");
- error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh,
+ EXT4_JTR_NONE);
if (error)
goto out;
@@ -593,7 +596,7 @@ retry:
ret = __block_write_begin(page, from, to, ext4_get_block);
if (!ret && ext4_should_journal_data(inode)) {
- ret = ext4_walk_page_buffers(handle, page_buffers(page),
+ ret = ext4_walk_page_buffers(handle, inode, page_buffers(page),
from, to, NULL,
do_journal_get_write_access);
}
@@ -682,7 +685,8 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
goto convert;
}
- ret = ext4_journal_get_write_access(handle, iloc.bh);
+ ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh,
+ EXT4_JTR_NONE);
if (ret)
goto out;
@@ -750,6 +754,12 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
ext4_write_lock_xattr(inode, &no_expand);
BUG_ON(!ext4_has_inline_data(inode));
+ /*
+ * ei->i_inline_off may have changed since ext4_write_begin()
+ * called ext4_try_to_write_inline_data()
+ */
+ (void) ext4_find_inline_data_nolock(inode);
+
kaddr = kmap_atomic(page);
ext4_write_inline_data(inode, &iloc, kaddr, pos, len);
kunmap_atomic(kaddr);
@@ -923,7 +933,8 @@ retry_journal:
if (ret < 0)
goto out_release_page;
}
- ret = ext4_journal_get_write_access(handle, iloc.bh);
+ ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh,
+ EXT4_JTR_NONE);
if (ret)
goto out_release_page;
@@ -1028,7 +1039,8 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
return err;
BUFFER_TRACE(iloc->bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, iloc->bh);
+ err = ext4_journal_get_write_access(handle, dir->i_sb, iloc->bh,
+ EXT4_JTR_NONE);
if (err)
return err;
ext4_insert_dentry(dir, inode, de, inline_size, fname);
@@ -1223,7 +1235,8 @@ static int ext4_convert_inline_data_nolock(handle_t *handle,
}
lock_buffer(data_bh);
- error = ext4_journal_get_create_access(handle, data_bh);
+ error = ext4_journal_get_create_access(handle, inode->i_sb, data_bh,
+ EXT4_JTR_NONE);
if (error) {
unlock_buffer(data_bh);
error = -EIO;
@@ -1707,7 +1720,8 @@ int ext4_delete_inline_entry(handle_t *handle,
}
BUFFER_TRACE(bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh);
+ err = ext4_journal_get_write_access(handle, dir->i_sb, bh,
+ EXT4_JTR_NONE);
if (err)
goto out;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d8de607849df..d18852d6029c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -139,7 +139,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
static void ext4_invalidatepage(struct page *page, unsigned int offset,
unsigned int length);
static int __ext4_journalled_writepage(struct page *page, unsigned int len);
-static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
int pextents);
@@ -869,7 +868,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
*/
lock_buffer(bh);
BUFFER_TRACE(bh, "call get_create_access");
- err = ext4_journal_get_create_access(handle, bh);
+ err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (unlikely(err)) {
unlock_buffer(bh);
goto errout;
@@ -954,12 +954,12 @@ out_brelse:
return err;
}
-int ext4_walk_page_buffers(handle_t *handle,
+int ext4_walk_page_buffers(handle_t *handle, struct inode *inode,
struct buffer_head *head,
unsigned from,
unsigned to,
int *partial,
- int (*fn)(handle_t *handle,
+ int (*fn)(handle_t *handle, struct inode *inode,
struct buffer_head *bh))
{
struct buffer_head *bh;
@@ -978,7 +978,7 @@ int ext4_walk_page_buffers(handle_t *handle,
*partial = 1;
continue;
}
- err = (*fn)(handle, bh);
+ err = (*fn)(handle, inode, bh);
if (!ret)
ret = err;
}
@@ -1009,7 +1009,7 @@ int ext4_walk_page_buffers(handle_t *handle,
* is elevated. We'll still have enough credits for the tiny quotafile
* write.
*/
-int do_journal_get_write_access(handle_t *handle,
+int do_journal_get_write_access(handle_t *handle, struct inode *inode,
struct buffer_head *bh)
{
int dirty = buffer_dirty(bh);
@@ -1028,7 +1028,8 @@ int do_journal_get_write_access(handle_t *handle,
if (dirty)
clear_buffer_dirty(bh);
BUFFER_TRACE(bh, "get write access");
- ret = ext4_journal_get_write_access(handle, bh);
+ ret = ext4_journal_get_write_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (!ret && dirty)
ret = ext4_handle_dirty_metadata(handle, NULL, bh);
return ret;
@@ -1208,8 +1209,8 @@ retry_journal:
ret = __block_write_begin(page, pos, len, ext4_get_block);
#endif
if (!ret && ext4_should_journal_data(inode)) {
- ret = ext4_walk_page_buffers(handle, page_buffers(page),
- from, to, NULL,
+ ret = ext4_walk_page_buffers(handle, inode,
+ page_buffers(page), from, to, NULL,
do_journal_get_write_access);
}
@@ -1253,7 +1254,8 @@ retry_journal:
}
/* For write_end() in data=journal mode */
-static int write_end_fn(handle_t *handle, struct buffer_head *bh)
+static int write_end_fn(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh)
{
int ret;
if (!buffer_mapped(bh) || buffer_freed(bh))
@@ -1352,6 +1354,7 @@ errout:
* to call ext4_handle_dirty_metadata() instead.
*/
static void ext4_journalled_zero_new_buffers(handle_t *handle,
+ struct inode *inode,
struct page *page,
unsigned from, unsigned to)
{
@@ -1370,7 +1373,7 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle,
size = min(to, block_end) - start;
zero_user(page, start, size);
- write_end_fn(handle, bh);
+ write_end_fn(handle, inode, bh);
}
clear_buffer_new(bh);
}
@@ -1412,13 +1415,13 @@ static int ext4_journalled_write_end(struct file *file,
copied = ret;
} else if (unlikely(copied < len) && !PageUptodate(page)) {
copied = 0;
- ext4_journalled_zero_new_buffers(handle, page, from, to);
+ ext4_journalled_zero_new_buffers(handle, inode, page, from, to);
} else {
if (unlikely(copied < len))
- ext4_journalled_zero_new_buffers(handle, page,
+ ext4_journalled_zero_new_buffers(handle, inode, page,
from + copied, to);
- ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
- from + copied, &partial,
+ ret = ext4_walk_page_buffers(handle, inode, page_buffers(page),
+ from, from + copied, &partial,
write_end_fn);
if (!partial)
SetPageUptodate(page);
@@ -1619,7 +1622,8 @@ static void ext4_print_free_blocks(struct inode *inode)
return;
}
-static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh)
{
return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
}
@@ -1851,13 +1855,15 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
return 0;
}
-static int bget_one(handle_t *handle, struct buffer_head *bh)
+static int bget_one(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh)
{
get_bh(bh);
return 0;
}
-static int bput_one(handle_t *handle, struct buffer_head *bh)
+static int bput_one(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh)
{
put_bh(bh);
return 0;
@@ -1888,7 +1894,7 @@ static int __ext4_journalled_writepage(struct page *page,
BUG();
goto out;
}
- ext4_walk_page_buffers(handle, page_bufs, 0, len,
+ ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
NULL, bget_one);
}
/*
@@ -1920,11 +1926,11 @@ static int __ext4_journalled_writepage(struct page *page,
if (inline_data) {
ret = ext4_mark_inode_dirty(handle, inode);
} else {
- ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
- do_journal_get_write_access);
+ ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
+ NULL, do_journal_get_write_access);
- err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
- write_end_fn);
+ err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
+ NULL, write_end_fn);
}
if (ret == 0)
ret = err;
@@ -1941,7 +1947,7 @@ out:
unlock_page(page);
out_no_pagelock:
if (!inline_data && page_bufs)
- ext4_walk_page_buffers(NULL, page_bufs, 0, len,
+ ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len,
NULL, bput_one);
brelse(inode_bh);
return ret;
@@ -2031,7 +2037,7 @@ static int ext4_writepage(struct page *page,
* for the extremely common case, this is an optimization that
* skips a useless round trip through ext4_bio_write_page().
*/
- if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ if (ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len, NULL,
ext4_bh_delay_or_unwritten)) {
redirty_page_for_writepage(wbc, page);
if ((current->flags & PF_MEMALLOC) ||
@@ -3794,7 +3800,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,
}
if (ext4_should_journal_data(inode)) {
BUFFER_TRACE(bh, "get write access");
- err = ext4_journal_get_write_access(handle, bh);
+ err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (err)
goto unlock;
}
@@ -3950,20 +3957,19 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
return ret;
}
-static void ext4_wait_dax_page(struct ext4_inode_info *ei)
+static void ext4_wait_dax_page(struct inode *inode)
{
- up_write(&ei->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
schedule();
- down_write(&ei->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
}
int ext4_break_layouts(struct inode *inode)
{
- struct ext4_inode_info *ei = EXT4_I(inode);
struct page *page;
int error;
- if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem)))
+ if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)))
return -EINVAL;
do {
@@ -3974,7 +3980,7 @@ int ext4_break_layouts(struct inode *inode)
error = ___wait_var_event(&page->_refcount,
atomic_read(&page->_refcount) == 1,
TASK_INTERRUPTIBLE, 0, 0,
- ext4_wait_dax_page(ei));
+ ext4_wait_dax_page(inode));
} while (error == 0);
return error;
@@ -4005,9 +4011,9 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
if (ext4_has_inline_data(inode)) {
- down_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(mapping);
ret = ext4_convert_inline_data(inode);
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
if (ret)
return ret;
}
@@ -4058,7 +4064,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
* Prevent page faults from reinstantiating pages we have released from
* page cache.
*/
- down_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(mapping);
ret = ext4_break_layouts(inode);
if (ret)
@@ -4131,7 +4137,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
out_stop:
ext4_journal_stop(handle);
out_dio:
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
out_mutex:
inode_unlock(inode);
return ret;
@@ -4330,101 +4336,93 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
bh = sb_getblk(sb, block);
if (unlikely(!bh))
return -ENOMEM;
- if (ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO))
- goto simulate_eio;
- if (!buffer_uptodate(bh)) {
- lock_buffer(bh);
+ if (ext4_buffer_uptodate(bh))
+ goto has_buffer;
- if (ext4_buffer_uptodate(bh)) {
- /* someone brought it uptodate while we waited */
- unlock_buffer(bh);
- goto has_buffer;
- }
-
- /*
- * If we have all information of the inode in memory and this
- * is the only valid inode in the block, we need not read the
- * block.
- */
- if (in_mem) {
- struct buffer_head *bitmap_bh;
- int i, start;
+ lock_buffer(bh);
+ /*
+ * If we have all information of the inode in memory and this
+ * is the only valid inode in the block, we need not read the
+ * block.
+ */
+ if (in_mem) {
+ struct buffer_head *bitmap_bh;
+ int i, start;
- start = inode_offset & ~(inodes_per_block - 1);
+ start = inode_offset & ~(inodes_per_block - 1);
- /* Is the inode bitmap in cache? */
- bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
- if (unlikely(!bitmap_bh))
- goto make_io;
+ /* Is the inode bitmap in cache? */
+ bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
+ if (unlikely(!bitmap_bh))
+ goto make_io;
- /*
- * If the inode bitmap isn't in cache then the
- * optimisation may end up performing two reads instead
- * of one, so skip it.
- */
- if (!buffer_uptodate(bitmap_bh)) {
- brelse(bitmap_bh);
- goto make_io;
- }
- for (i = start; i < start + inodes_per_block; i++) {
- if (i == inode_offset)
- continue;
- if (ext4_test_bit(i, bitmap_bh->b_data))
- break;
- }
+ /*
+ * If the inode bitmap isn't in cache then the
+ * optimisation may end up performing two reads instead
+ * of one, so skip it.
+ */
+ if (!buffer_uptodate(bitmap_bh)) {
brelse(bitmap_bh);
- if (i == start + inodes_per_block) {
- /* all other inodes are free, so skip I/O */
- memset(bh->b_data, 0, bh->b_size);
- set_buffer_uptodate(bh);
- unlock_buffer(bh);
- goto has_buffer;
- }
+ goto make_io;
+ }
+ for (i = start; i < start + inodes_per_block; i++) {
+ if (i == inode_offset)
+ continue;
+ if (ext4_test_bit(i, bitmap_bh->b_data))
+ break;
}
+ brelse(bitmap_bh);
+ if (i == start + inodes_per_block) {
+ /* all other inodes are free, so skip I/O */
+ memset(bh->b_data, 0, bh->b_size);
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+ goto has_buffer;
+ }
+ }
make_io:
- /*
- * If we need to do any I/O, try to pre-readahead extra
- * blocks from the inode table.
- */
- blk_start_plug(&plug);
- if (EXT4_SB(sb)->s_inode_readahead_blks) {
- ext4_fsblk_t b, end, table;
- unsigned num;
- __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;
-
- table = ext4_inode_table(sb, gdp);
- /* s_inode_readahead_blks is always a power of 2 */
- b = block & ~((ext4_fsblk_t) ra_blks - 1);
- if (table > b)
- b = table;
- end = b + ra_blks;
- num = EXT4_INODES_PER_GROUP(sb);
- if (ext4_has_group_desc_csum(sb))
- num -= ext4_itable_unused_count(sb, gdp);
- table += num / inodes_per_block;
- if (end > table)
- end = table;
- while (b <= end)
- ext4_sb_breadahead_unmovable(sb, b++);
- }
+ /*
+ * If we need to do any I/O, try to pre-readahead extra
+ * blocks from the inode table.
+ */
+ blk_start_plug(&plug);
+ if (EXT4_SB(sb)->s_inode_readahead_blks) {
+ ext4_fsblk_t b, end, table;
+ unsigned num;
+ __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;
+
+ table = ext4_inode_table(sb, gdp);
+ /* s_inode_readahead_blks is always a power of 2 */
+ b = block & ~((ext4_fsblk_t) ra_blks - 1);
+ if (table > b)
+ b = table;
+ end = b + ra_blks;
+ num = EXT4_INODES_PER_GROUP(sb);
+ if (ext4_has_group_desc_csum(sb))
+ num -= ext4_itable_unused_count(sb, gdp);
+ table += num / inodes_per_block;
+ if (end > table)
+ end = table;
+ while (b <= end)
+ ext4_sb_breadahead_unmovable(sb, b++);
+ }
- /*
- * There are other valid inodes in the buffer, this inode
- * has in-inode xattrs, or we don't have this inode in memory.
- * Read the block from disk.
- */
- trace_ext4_load_inode(sb, ino);
- ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL);
- blk_finish_plug(&plug);
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh)) {
- simulate_eio:
- if (ret_block)
- *ret_block = block;
- brelse(bh);
- return -EIO;
- }
+ /*
+ * There are other valid inodes in the buffer, this inode
+ * has in-inode xattrs, or we don't have this inode in memory.
+ * Read the block from disk.
+ */
+ trace_ext4_load_inode(sb, ino);
+ ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL);
+ blk_finish_plug(&plug);
+ wait_on_buffer(bh);
+ ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO);
+ if (!buffer_uptodate(bh)) {
+ if (ret_block)
+ *ret_block = block;
+ brelse(bh);
+ return -EIO;
}
has_buffer:
iloc->bh = bh;
@@ -4603,6 +4601,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
struct ext4_iloc iloc;
struct ext4_inode *raw_inode;
struct ext4_inode_info *ei;
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
struct inode *inode;
journal_t *journal = EXT4_SB(sb)->s_journal;
long ret;
@@ -4613,9 +4612,13 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
projid_t i_projid;
if ((!(flags & EXT4_IGET_SPECIAL) &&
- (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)) ||
+ ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) ||
+ ino == le32_to_cpu(es->s_usr_quota_inum) ||
+ ino == le32_to_cpu(es->s_grp_quota_inum) ||
+ ino == le32_to_cpu(es->s_prj_quota_inum) ||
+ ino == le32_to_cpu(es->s_orphan_file_inum))) ||
(ino < EXT4_ROOT_INO) ||
- (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
+ (ino > le32_to_cpu(es->s_inodes_count))) {
if (flags & EXT4_IGET_HANDLE)
return ERR_PTR(-ESTALE);
__ext4_error(sb, function, line, false, EFSCORRUPTED, 0,
@@ -4928,8 +4931,14 @@ static int ext4_inode_blocks_set(handle_t *handle,
ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
return 0;
}
+
+ /*
+ * This should never happen since sb->s_maxbytes should not have
+ * allowed this, sb->s_maxbytes was set according to the huge_file
+ * feature in ext4_fill_super().
+ */
if (!ext4_has_feature_huge_file(sb))
- return -EFBIG;
+ return -EFSCORRUPTED;
if (i_blocks <= 0xffffffffffffULL) {
/*
@@ -5032,16 +5041,14 @@ static int ext4_do_update_inode(handle_t *handle,
spin_lock(&ei->i_raw_lock);
- /* For fields not tracked in the in-memory inode,
- * initialise them to zero for new inodes. */
+ /*
+ * For fields not tracked in the in-memory inode, initialise them
+ * to zero for new inodes.
+ */
if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
err = ext4_inode_blocks_set(handle, raw_inode, ei);
- if (err) {
- spin_unlock(&ei->i_raw_lock);
- goto out_brelse;
- }
raw_inode->i_mode = cpu_to_le16(inode->i_mode);
i_uid = i_uid_read(inode);
@@ -5050,10 +5057,11 @@ static int ext4_do_update_inode(handle_t *handle,
if (!(test_opt(inode->i_sb, NO_UID32))) {
raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
-/*
- * Fix up interoperability with old kernels. Otherwise, old inodes get
- * re-used with the upper 16 bits of the uid/gid intact
- */
+ /*
+ * Fix up interoperability with old kernels. Otherwise,
+ * old inodes get re-used with the upper 16 bits of the
+ * uid/gid intact.
+ */
if (ei->i_dtime && list_empty(&ei->i_orphan)) {
raw_inode->i_uid_high = 0;
raw_inode->i_gid_high = 0;
@@ -5122,8 +5130,9 @@ static int ext4_do_update_inode(handle_t *handle,
}
}
- BUG_ON(!ext4_has_feature_project(inode->i_sb) &&
- i_projid != EXT4_DEF_PROJID);
+ if (i_projid != EXT4_DEF_PROJID &&
+ !ext4_has_feature_project(inode->i_sb))
+ err = err ?: -EFSCORRUPTED;
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
@@ -5131,6 +5140,11 @@ static int ext4_do_update_inode(handle_t *handle,
ext4_inode_csum_set(inode, raw_inode, ei);
spin_unlock(&ei->i_raw_lock);
+ if (err) {
+ EXT4_ERROR_INODE(inode, "corrupted inode contents");
+ goto out_brelse;
+ }
+
if (inode->i_sb->s_flags & SB_LAZYTIME)
ext4_update_other_inodes_time(inode->i_sb, inode->i_ino,
bh->b_data);
@@ -5138,13 +5152,15 @@ static int ext4_do_update_inode(handle_t *handle,
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle, NULL, bh);
if (err)
- goto out_brelse;
+ goto out_error;
ext4_clear_inode_state(inode, EXT4_STATE_NEW);
if (set_large_file) {
BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access");
- err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ err = ext4_journal_get_write_access(handle, sb,
+ EXT4_SB(sb)->s_sbh,
+ EXT4_JTR_NONE);
if (err)
- goto out_brelse;
+ goto out_error;
lock_buffer(EXT4_SB(sb)->s_sbh);
ext4_set_feature_large_file(sb);
ext4_superblock_csum_set(sb);
@@ -5154,9 +5170,10 @@ static int ext4_do_update_inode(handle_t *handle,
EXT4_SB(sb)->s_sbh);
}
ext4_update_inode_fsync_trans(handle, inode, need_datasync);
+out_error:
+ ext4_std_error(inode->i_sb, err);
out_brelse:
brelse(bh);
- ext4_std_error(inode->i_sb, err);
return err;
}
@@ -5426,11 +5443,11 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
inode_dio_wait(inode);
}
- down_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
rc = ext4_break_layouts(inode);
if (rc) {
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
goto err_out;
}
@@ -5506,7 +5523,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
error = rc;
}
out_mmap_sem:
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
}
if (!error) {
@@ -5743,7 +5760,8 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
err = ext4_get_inode_loc(inode, iloc);
if (!err) {
BUFFER_TRACE(iloc->bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, iloc->bh);
+ err = ext4_journal_get_write_access(handle, inode->i_sb,
+ iloc->bh, EXT4_JTR_NONE);
if (err) {
brelse(iloc->bh);
iloc->bh = NULL;
@@ -5866,7 +5884,8 @@ int ext4_expand_extra_isize(struct inode *inode,
ext4_write_lock_xattr(inode, &no_expand);
BUFFER_TRACE(iloc->bh, "get_write_access");
- error = ext4_journal_get_write_access(handle, iloc->bh);
+ error = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh,
+ EXT4_JTR_NONE);
if (error) {
brelse(iloc->bh);
goto out_unlock;
@@ -5983,10 +6002,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
* data (and journalled aops don't know how to handle these cases).
*/
if (val) {
- down_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
err = filemap_write_and_wait(inode->i_mapping);
if (err < 0) {
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
return err;
}
}
@@ -6019,7 +6038,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
percpu_up_write(&sbi->s_writepages_rwsem);
if (val)
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
/* Finally we can mark the inode as dirty. */
@@ -6037,7 +6056,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return err;
}
-static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
+static int ext4_bh_unmapped(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh)
{
return !buffer_mapped(bh);
}
@@ -6063,7 +6083,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
file_update_time(vma->vm_file);
- down_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock_shared(mapping);
err = ext4_convert_inline_data(inode);
if (err)
@@ -6110,7 +6130,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
* inode to the transaction's list to writeprotect pages on commit.
*/
if (page_has_buffers(page)) {
- if (!ext4_walk_page_buffers(NULL, page_buffers(page),
+ if (!ext4_walk_page_buffers(NULL, inode, page_buffers(page),
0, len, NULL,
ext4_bh_unmapped)) {
/* Wait so that we don't change page under IO */
@@ -6156,11 +6176,13 @@ retry_alloc:
err = __block_write_begin(page, 0, len, ext4_get_block);
if (!err) {
ret = VM_FAULT_SIGBUS;
- if (ext4_walk_page_buffers(handle, page_buffers(page),
- 0, len, NULL, do_journal_get_write_access))
+ if (ext4_walk_page_buffers(handle, inode,
+ page_buffers(page), 0, len, NULL,
+ do_journal_get_write_access))
goto out_error;
- if (ext4_walk_page_buffers(handle, page_buffers(page),
- 0, len, NULL, write_end_fn))
+ if (ext4_walk_page_buffers(handle, inode,
+ page_buffers(page), 0, len, NULL,
+ write_end_fn))
goto out_error;
if (ext4_jbd2_inode_add_write(handle, inode,
page_offset(page), len))
@@ -6176,7 +6198,7 @@ retry_alloc:
out_ret:
ret = block_page_mkwrite_return(err);
out:
- up_read(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock_shared(mapping);
sb_end_pagefault(inode->i_sb);
return ret;
out_error:
@@ -6184,15 +6206,3 @@ out_error:
ext4_journal_stop(handle);
goto out;
}
-
-vm_fault_t ext4_filemap_fault(struct vm_fault *vmf)
-{
- struct inode *inode = file_inode(vmf->vma->vm_file);
- vm_fault_t ret;
-
- down_read(&EXT4_I(inode)->i_mmap_sem);
- ret = filemap_fault(vmf);
- up_read(&EXT4_I(inode)->i_mmap_sem);
-
- return ret;
-}
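
Note on the hunks above: ext4's private i_mmap_sem is replaced by the generic per-mapping invalidate_lock, taken via filemap_invalidate_lock()/filemap_invalidate_lock_shared(); the generic filemap_fault() now handles that lock where needed, which is why the ext4_filemap_fault() wrapper can be deleted. The standalone sketch below models the locking discipline with a pthread rwlock (truncate-like paths take it exclusively, fault-like paths take it shared); it is illustrative user-space code, not the kernel API.

/* Minimal user-space model of the invalidate_lock discipline:
 * fault-like readers take the lock shared, truncate-like writers
 * take it exclusively while they shrink the "file". Illustrative only. */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t invalidate_lock = PTHREAD_RWLOCK_INITIALIZER;
static long file_size = 4096;

static void fault_path(long offset)
{
	pthread_rwlock_rdlock(&invalidate_lock);     /* models filemap_invalidate_lock_shared() */
	if (offset < file_size)
		printf("fault ok at %ld\n", offset); /* size cannot change under us */
	else
		printf("fault beyond EOF at %ld\n", offset);
	pthread_rwlock_unlock(&invalidate_lock);     /* models filemap_invalidate_unlock_shared() */
}

static void trunc_path(long new_size)
{
	pthread_rwlock_wrlock(&invalidate_lock);     /* models filemap_invalidate_lock() */
	file_size = new_size;                        /* invalidate cached pages past new_size here */
	pthread_rwlock_unlock(&invalidate_lock);     /* models filemap_invalidate_unlock() */
}

int main(void)
{
	fault_path(1024);
	trunc_path(512);
	fault_path(1024);
	return 0;
}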
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 6eed6170aded..606dee9e08a3 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -148,7 +148,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
goto journal_err_out;
}
- down_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
err = filemap_write_and_wait(inode->i_mapping);
if (err)
goto err_out;
@@ -256,7 +256,7 @@ err_out1:
ext4_double_up_write_data_sem(inode, inode_bl);
err_out:
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
journal_err_out:
unlock_two_nondirectories(inode, inode_bl);
iput(inode_bl);
@@ -1154,7 +1154,9 @@ resizefs_out:
err = PTR_ERR(handle);
goto pwsalt_err_exit;
}
- err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+ err = ext4_journal_get_write_access(handle, sb,
+ sbi->s_sbh,
+ EXT4_JTR_NONE);
if (err)
goto pwsalt_err_journal;
lock_buffer(sbi->s_sbh);
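
The recurring change in these files is the new ext4_journal_get_write_access() signature: callers now pass the superblock and an ext4_journal_trigger_type (EXT4_JTR_NONE here, EXT4_JTR_ORPHAN_FILE for orphan-file blocks) so that a per-type jbd2 trigger can be attached to the buffer before it is written out. A minimal standalone sketch of that dispatch-by-type idea follows; the names and table are illustrative, not the ext4/jbd2 implementation.

/* Standalone model of attaching a per-type trigger when a buffer is
 * handed to the journal. Names are illustrative, not kernel APIs. */
#include <stdio.h>

enum trigger_type { JTR_NONE, JTR_ORPHAN_FILE, JTR_MAX };

struct buffer { const char *name; void (*pre_write)(struct buffer *); };

static void orphan_block_trigger(struct buffer *bh)
{
	printf("recomputing checksum for %s before write-out\n", bh->name);
}

/* One slot per trigger type; JTR_NONE deliberately stays NULL. */
static void (*const triggers[JTR_MAX])(struct buffer *) = {
	[JTR_ORPHAN_FILE] = orphan_block_trigger,
};

static int get_write_access(struct buffer *bh, enum trigger_type type)
{
	bh->pre_write = triggers[type];   /* attach (or clear) the callback */
	return 0;
}

int main(void)
{
	struct buffer sb_bh = { "superblock", NULL };
	struct buffer ob_bh = { "orphan block 0", NULL };

	get_write_access(&sb_bh, JTR_NONE);
	get_write_access(&ob_bh, JTR_ORPHAN_FILE);
	if (ob_bh.pre_write)
		ob_bh.pre_write(&ob_bh);  /* what the journal would do before writing */
	return 0;
}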
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 089c958aa2c3..72bfac2d6dce 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -408,6 +408,10 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
ext4_group_t group, int cr);
+static int ext4_try_to_trim_range(struct super_block *sb,
+ struct ext4_buddy *e4b, ext4_grpblk_t start,
+ ext4_grpblk_t max, ext4_grpblk_t minblocks);
+
/*
* The algorithm using this percpu seq counter goes below:
* 1. We sample the percpu discard_pa_seq counter before trying for block
@@ -2474,6 +2478,12 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
* This could return negative error code if something goes wrong
* during ext4_mb_init_group(). This should not be called with
* ext4_lock_group() held.
+ *
+ * Note: because we are conditionally operating with the group lock in
+ * the EXT4_MB_STRICT_CHECK case, we need to fake out sparse in this
+ * function using __acquire and __release. This means we need to be
+ * super careful before messing with the error path handling via "goto
+ * out"!
*/
static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
ext4_group_t group, int cr)
@@ -2487,8 +2497,10 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
if (sbi->s_mb_stats)
atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);
- if (should_lock)
+ if (should_lock) {
ext4_lock_group(sb, group);
+ __release(ext4_group_lock_ptr(sb, group));
+ }
free = grp->bb_free;
if (free == 0)
goto out;
@@ -2496,8 +2508,10 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
goto out;
if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
goto out;
- if (should_lock)
+ if (should_lock) {
+ __acquire(ext4_group_lock_ptr(sb, group));
ext4_unlock_group(sb, group);
+ }
/* We only do this if the grp has never been initialized */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
@@ -2524,12 +2538,16 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
return ret;
}
- if (should_lock)
+ if (should_lock) {
ext4_lock_group(sb, group);
+ __release(ext4_group_lock_ptr(sb, group));
+ }
ret = ext4_mb_good_group(ac, group, cr);
out:
- if (should_lock)
+ if (should_lock) {
+ __acquire(ext4_group_lock_ptr(sb, group));
ext4_unlock_group(sb, group);
+ }
return ret;
}
@@ -2965,6 +2983,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
}
static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos)
+__acquires(&EXT4_SB(sb)->s_mb_rb_lock)
{
struct super_block *sb = PDE_DATA(file_inode(seq->file));
unsigned long position;
@@ -3037,6 +3056,7 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
}
static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v)
+__releases(&EXT4_SB(sb)->s_mb_rb_lock)
{
struct super_block *sb = PDE_DATA(file_inode(seq->file));
@@ -3308,6 +3328,57 @@ static int ext4_groupinfo_create_slab(size_t size)
return 0;
}
+static void ext4_discard_work(struct work_struct *work)
+{
+ struct ext4_sb_info *sbi = container_of(work,
+ struct ext4_sb_info, s_discard_work);
+ struct super_block *sb = sbi->s_sb;
+ struct ext4_free_data *fd, *nfd;
+ struct ext4_buddy e4b;
+ struct list_head discard_list;
+ ext4_group_t grp, load_grp;
+ int err = 0;
+
+ INIT_LIST_HEAD(&discard_list);
+ spin_lock(&sbi->s_md_lock);
+ list_splice_init(&sbi->s_discard_list, &discard_list);
+ spin_unlock(&sbi->s_md_lock);
+
+ load_grp = UINT_MAX;
+ list_for_each_entry_safe(fd, nfd, &discard_list, efd_list) {
+ /*
+		 * If the filesystem is unmounting, we are out of memory, or we are
+		 * suffering from a lack of space, give up on the discard
+ */
+ if ((sb->s_flags & SB_ACTIVE) && !err &&
+ !atomic_read(&sbi->s_retry_alloc_pending)) {
+ grp = fd->efd_group;
+ if (grp != load_grp) {
+ if (load_grp != UINT_MAX)
+ ext4_mb_unload_buddy(&e4b);
+
+ err = ext4_mb_load_buddy(sb, grp, &e4b);
+ if (err) {
+ kmem_cache_free(ext4_free_data_cachep, fd);
+ load_grp = UINT_MAX;
+ continue;
+ } else {
+ load_grp = grp;
+ }
+ }
+
+ ext4_lock_group(sb, grp);
+ ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster,
+ fd->efd_start_cluster + fd->efd_count - 1, 1);
+ ext4_unlock_group(sb, grp);
+ }
+ kmem_cache_free(ext4_free_data_cachep, fd);
+ }
+
+ if (load_grp != UINT_MAX)
+ ext4_mb_unload_buddy(&e4b);
+}
+
int ext4_mb_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3376,6 +3447,9 @@ int ext4_mb_init(struct super_block *sb)
spin_lock_init(&sbi->s_md_lock);
sbi->s_mb_free_pending = 0;
INIT_LIST_HEAD(&sbi->s_freed_data_list);
+ INIT_LIST_HEAD(&sbi->s_discard_list);
+ INIT_WORK(&sbi->s_discard_work, ext4_discard_work);
+ atomic_set(&sbi->s_retry_alloc_pending, 0);
sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
@@ -3474,6 +3548,14 @@ int ext4_mb_release(struct super_block *sb)
struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
int count;
+ if (test_opt(sb, DISCARD)) {
+ /*
+		 * Wait for the discard work to drain all of the ext4_free_data entries
+ */
+ flush_work(&sbi->s_discard_work);
+ WARN_ON_ONCE(!list_empty(&sbi->s_discard_list));
+ }
+
if (sbi->s_group_info) {
for (i = 0; i < ngroups; i++) {
cond_resched();
@@ -3596,7 +3678,6 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
put_page(e4b.bd_bitmap_page);
}
ext4_unlock_group(sb, entry->efd_group);
- kmem_cache_free(ext4_free_data_cachep, entry);
ext4_mb_unload_buddy(&e4b);
mb_debug(sb, "freed %d blocks in %d structures\n", count,
@@ -3611,10 +3692,9 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_free_data *entry, *tmp;
- struct bio *discard_bio = NULL;
struct list_head freed_data_list;
struct list_head *cut_pos = NULL;
- int err;
+ bool wake;
INIT_LIST_HEAD(&freed_data_list);
@@ -3629,30 +3709,20 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
cut_pos);
spin_unlock(&sbi->s_md_lock);
- if (test_opt(sb, DISCARD)) {
- list_for_each_entry(entry, &freed_data_list, efd_list) {
- err = ext4_issue_discard(sb, entry->efd_group,
- entry->efd_start_cluster,
- entry->efd_count,
- &discard_bio);
- if (err && err != -EOPNOTSUPP) {
- ext4_msg(sb, KERN_WARNING, "discard request in"
- " group:%d block:%d count:%d failed"
- " with %d", entry->efd_group,
- entry->efd_start_cluster,
- entry->efd_count, err);
- } else if (err == -EOPNOTSUPP)
- break;
- }
+ list_for_each_entry(entry, &freed_data_list, efd_list)
+ ext4_free_data_in_buddy(sb, entry);
- if (discard_bio) {
- submit_bio_wait(discard_bio);
- bio_put(discard_bio);
- }
+ if (test_opt(sb, DISCARD)) {
+ spin_lock(&sbi->s_md_lock);
+ wake = list_empty(&sbi->s_discard_list);
+ list_splice_tail(&freed_data_list, &sbi->s_discard_list);
+ spin_unlock(&sbi->s_md_lock);
+ if (wake)
+ queue_work(system_unbound_wq, &sbi->s_discard_work);
+ } else {
+ list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
+ kmem_cache_free(ext4_free_data_cachep, entry);
}
-
- list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
- ext4_free_data_in_buddy(sb, entry);
}
int __init ext4_init_mballoc(void)
@@ -3726,7 +3796,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
}
BUFFER_TRACE(bitmap_bh, "getting write access");
- err = ext4_journal_get_write_access(handle, bitmap_bh);
+ err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
+ EXT4_JTR_NONE);
if (err)
goto out_err;
@@ -3739,7 +3810,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
ext4_free_group_clusters(sb, gdp));
BUFFER_TRACE(gdp_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gdp_bh);
+ err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE);
if (err)
goto out_err;
@@ -5916,7 +5987,8 @@ do_more:
}
BUFFER_TRACE(bitmap_bh, "getting write access");
- err = ext4_journal_get_write_access(handle, bitmap_bh);
+ err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
+ EXT4_JTR_NONE);
if (err)
goto error_return;
@@ -5926,7 +5998,7 @@ do_more:
* using it
*/
BUFFER_TRACE(gd_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gd_bh);
+ err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE);
if (err)
goto error_return;
#ifdef AGGRESSIVE_CHECK
@@ -6107,7 +6179,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
}
BUFFER_TRACE(bitmap_bh, "getting write access");
- err = ext4_journal_get_write_access(handle, bitmap_bh);
+ err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
+ EXT4_JTR_NONE);
if (err)
goto error_return;
@@ -6117,7 +6190,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
* using it
*/
BUFFER_TRACE(gd_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gd_bh);
+ err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE);
if (err)
goto error_return;
@@ -6183,19 +6256,19 @@ error_return:
* @sb: super block for the file system
* @start: starting block of the free extent in the alloc. group
* @count: number of blocks to TRIM
- * @group: alloc. group we are working with
* @e4b: ext4 buddy for the group
*
 * Trim "count" blocks starting at "start" in the "group". To assure that no
 * one will allocate those blocks, mark them as used in the buddy bitmap. This
 * must be called under the group lock.
*/
-static int ext4_trim_extent(struct super_block *sb, int start, int count,
- ext4_group_t group, struct ext4_buddy *e4b)
+static int ext4_trim_extent(struct super_block *sb,
+ int start, int count, struct ext4_buddy *e4b)
__releases(bitlock)
__acquires(bitlock)
{
struct ext4_free_extent ex;
+ ext4_group_t group = e4b->bd_group;
int ret = 0;
trace_ext4_trim_extent(sb, group, start, count);
@@ -6218,51 +6291,21 @@ __acquires(bitlock)
return ret;
}
-/**
- * ext4_trim_all_free -- function to trim all free space in alloc. group
- * @sb: super block for file system
- * @group: group to be trimmed
- * @start: first group block to examine
- * @max: last group block to examine
- * @minblocks: minimum extent block count
- *
- * ext4_trim_all_free walks through group's buddy bitmap searching for free
- * extents. When the free block is found, ext4_trim_extent is called to TRIM
- * the extent.
- *
- *
- * ext4_trim_all_free walks through group's block bitmap searching for free
- * extents. When the free extent is found, mark it as used in group buddy
- * bitmap. Then issue a TRIM command on this extent and free the extent in
- * the group buddy bitmap. This is done until whole group is scanned.
- */
-static ext4_grpblk_t
-ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
- ext4_grpblk_t start, ext4_grpblk_t max,
- ext4_grpblk_t minblocks)
+static int ext4_try_to_trim_range(struct super_block *sb,
+ struct ext4_buddy *e4b, ext4_grpblk_t start,
+ ext4_grpblk_t max, ext4_grpblk_t minblocks)
+__acquires(ext4_group_lock_ptr(sb, e4b->bd_group))
+__releases(ext4_group_lock_ptr(sb, e4b->bd_group))
{
+ ext4_grpblk_t next, count, free_count;
void *bitmap;
- ext4_grpblk_t next, count = 0, free_count = 0;
- struct ext4_buddy e4b;
int ret = 0;
- trace_ext4_trim_all_free(sb, group, start, max);
-
- ret = ext4_mb_load_buddy(sb, group, &e4b);
- if (ret) {
- ext4_warning(sb, "Error %d loading buddy information for %u",
- ret, group);
- return ret;
- }
- bitmap = e4b.bd_bitmap;
-
- ext4_lock_group(sb, group);
- if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
- minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
- goto out;
-
- start = (e4b.bd_info->bb_first_free > start) ?
- e4b.bd_info->bb_first_free : start;
+ bitmap = e4b->bd_bitmap;
+ start = (e4b->bd_info->bb_first_free > start) ?
+ e4b->bd_info->bb_first_free : start;
+ count = 0;
+ free_count = 0;
while (start <= max) {
start = mb_find_next_zero_bit(bitmap, max + 1, start);
@@ -6271,8 +6314,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
next = mb_find_next_bit(bitmap, max + 1, start);
if ((next - start) >= minblocks) {
- ret = ext4_trim_extent(sb, start,
- next - start, group, &e4b);
+ ret = ext4_trim_extent(sb, start, next - start, e4b);
if (ret && ret != -EOPNOTSUPP)
break;
ret = 0;
@@ -6287,25 +6329,64 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
}
if (need_resched()) {
- ext4_unlock_group(sb, group);
+ ext4_unlock_group(sb, e4b->bd_group);
cond_resched();
- ext4_lock_group(sb, group);
+ ext4_lock_group(sb, e4b->bd_group);
}
- if ((e4b.bd_info->bb_free - free_count) < minblocks)
+ if ((e4b->bd_info->bb_free - free_count) < minblocks)
break;
}
- if (!ret) {
- ret = count;
- EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
+ return count;
+}
+
+/**
+ * ext4_trim_all_free -- function to trim all free space in alloc. group
+ * @sb: super block for file system
+ * @group: group to be trimmed
+ * @start: first group block to examine
+ * @max: last group block to examine
+ * @minblocks: minimum extent block count
+ *
+ * ext4_trim_all_free walks through the group's block bitmap searching for free
+ * extents. When a free extent is found, mark it as used in the group buddy
+ * bitmap. Then issue a TRIM command on this extent and free the extent in
+ * the group buddy bitmap.
+ */
+static ext4_grpblk_t
+ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
+ ext4_grpblk_t start, ext4_grpblk_t max,
+ ext4_grpblk_t minblocks)
+{
+ struct ext4_buddy e4b;
+ int ret;
+
+ trace_ext4_trim_all_free(sb, group, start, max);
+
+ ret = ext4_mb_load_buddy(sb, group, &e4b);
+ if (ret) {
+ ext4_warning(sb, "Error %d loading buddy information for %u",
+ ret, group);
+ return ret;
+ }
+
+ ext4_lock_group(sb, group);
+
+ if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) ||
+ minblocks < atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) {
+ ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
+ if (ret >= 0)
+ EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
+ } else {
+ ret = 0;
}
-out:
+
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
ext4_debug("trimmed %d blocks in the group %d\n",
- count, group);
+ ret, group);
return ret;
}
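
With this change, journal commit no longer issues discards inline: ext4_process_freed_data() returns the extents to the buddy, splices them onto s_discard_list, and kicks s_discard_work on system_unbound_wq only when the list transitions from empty to non-empty; ext4_mb_release() flushes the work at unmount. Below is a minimal user-space sketch of that splice-and-drain pattern, assuming illustrative data structures rather than the real ext4 ones.

/* User-space model of the deferred-discard pattern: the commit path
 * appends ranges to a pending list and only queues work when the list
 * was empty; the worker splices the whole list out and processes it.
 * Illustrative only; not the ext4 data structures. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct range { long start, count; struct range *next; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct range *pending;        /* models sbi->s_discard_list */

static void queue_discard(long start, long count)
{
	struct range *r = malloc(sizeof(*r));
	int wake;

	r->start = start;
	r->count = count;
	pthread_mutex_lock(&lock);
	wake = (pending == NULL);        /* only kick the worker on empty -> non-empty */
	r->next = pending;
	pending = r;
	pthread_mutex_unlock(&lock);
	if (wake)
		printf("queue_work(discard_work)\n");
}

static void discard_work(void)
{
	struct range *list, *r;

	pthread_mutex_lock(&lock);
	list = pending;                  /* splice everything out in one go */
	pending = NULL;
	pthread_mutex_unlock(&lock);

	while ((r = list) != NULL) {
		list = r->next;
		printf("trim range [%ld, +%ld)\n", r->start, r->count);
		free(r);
	}
}

int main(void)
{
	queue_discard(100, 8);
	queue_discard(300, 16);          /* no second wake-up needed */
	discard_work();
	return 0;
}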
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index f3bbcd4efb56..da7698341d7d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -70,7 +70,8 @@ static struct buffer_head *ext4_append(handle_t *handle,
inode->i_size += inode->i_sb->s_blocksize;
EXT4_I(inode)->i_disksize = inode->i_size;
BUFFER_TRACE(bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh);
+ err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (err) {
brelse(bh);
ext4_std_error(inode->i_sb, err);
@@ -1927,12 +1928,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
}
BUFFER_TRACE(*bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, *bh);
+ err = ext4_journal_get_write_access(handle, dir->i_sb, *bh,
+ EXT4_JTR_NONE);
if (err)
goto journal_error;
BUFFER_TRACE(frame->bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, frame->bh);
+ err = ext4_journal_get_write_access(handle, dir->i_sb, frame->bh,
+ EXT4_JTR_NONE);
if (err)
goto journal_error;
@@ -2109,7 +2112,8 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
return err;
}
BUFFER_TRACE(bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh);
+ err = ext4_journal_get_write_access(handle, dir->i_sb, bh,
+ EXT4_JTR_NONE);
if (err) {
ext4_std_error(dir->i_sb, err);
return err;
@@ -2167,7 +2171,8 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
blocksize = dir->i_sb->s_blocksize;
dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
BUFFER_TRACE(bh, "get_write_access");
- retval = ext4_journal_get_write_access(handle, bh);
+ retval = ext4_journal_get_write_access(handle, dir->i_sb, bh,
+ EXT4_JTR_NONE);
if (retval) {
ext4_std_error(dir->i_sb, retval);
brelse(bh);
@@ -2419,7 +2424,7 @@ again:
}
BUFFER_TRACE(bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh);
+ err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE);
if (err)
goto journal_error;
@@ -2476,7 +2481,8 @@ again:
node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
sb->s_blocksize);
BUFFER_TRACE(frame->bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, frame->bh);
+ err = ext4_journal_get_write_access(handle, sb, frame->bh,
+ EXT4_JTR_NONE);
if (err)
goto journal_error;
if (!add_level) {
@@ -2486,8 +2492,9 @@ again:
icount1, icount2));
BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
- err = ext4_journal_get_write_access(handle,
- (frame - 1)->bh);
+ err = ext4_journal_get_write_access(handle, sb,
+ (frame - 1)->bh,
+ EXT4_JTR_NONE);
if (err)
goto journal_error;
@@ -2636,7 +2643,8 @@ static int ext4_delete_entry(handle_t *handle,
csum_size = sizeof(struct ext4_dir_entry_tail);
BUFFER_TRACE(bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh);
+ err = ext4_journal_get_write_access(handle, dir->i_sb, bh,
+ EXT4_JTR_NONE);
if (unlikely(err))
goto out;
@@ -3046,186 +3054,6 @@ bool ext4_empty_dir(struct inode *inode)
return true;
}
-/*
- * ext4_orphan_add() links an unlinked or truncated inode into a list of
- * such inodes, starting at the superblock, in case we crash before the
- * file is closed/deleted, or in case the inode truncate spans multiple
- * transactions and the last transaction is not recovered after a crash.
- *
- * At filesystem recovery time, we walk this list deleting unlinked
- * inodes and truncating linked inodes in ext4_orphan_cleanup().
- *
- * Orphan list manipulation functions must be called under i_mutex unless
- * we are just creating the inode or deleting it.
- */
-int ext4_orphan_add(handle_t *handle, struct inode *inode)
-{
- struct super_block *sb = inode->i_sb;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_iloc iloc;
- int err = 0, rc;
- bool dirty = false;
-
- if (!sbi->s_journal || is_bad_inode(inode))
- return 0;
-
- WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
- !inode_is_locked(inode));
- /*
- * Exit early if inode already is on orphan list. This is a big speedup
- * since we don't have to contend on the global s_orphan_lock.
- */
- if (!list_empty(&EXT4_I(inode)->i_orphan))
- return 0;
-
- /*
- * Orphan handling is only valid for files with data blocks
- * being truncated, or files being unlinked. Note that we either
- * hold i_mutex, or the inode can not be referenced from outside,
- * so i_nlink should not be bumped due to race
- */
- ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
-
- BUFFER_TRACE(sbi->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sbi->s_sbh);
- if (err)
- goto out;
-
- err = ext4_reserve_inode_write(handle, inode, &iloc);
- if (err)
- goto out;
-
- mutex_lock(&sbi->s_orphan_lock);
- /*
- * Due to previous errors inode may be already a part of on-disk
- * orphan list. If so skip on-disk list modification.
- */
- if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) >
- (le32_to_cpu(sbi->s_es->s_inodes_count))) {
- /* Insert this inode at the head of the on-disk orphan list */
- NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan);
- lock_buffer(sbi->s_sbh);
- sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
- ext4_superblock_csum_set(sb);
- unlock_buffer(sbi->s_sbh);
- dirty = true;
- }
- list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan);
- mutex_unlock(&sbi->s_orphan_lock);
-
- if (dirty) {
- err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
- rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
- if (!err)
- err = rc;
- if (err) {
- /*
- * We have to remove inode from in-memory list if
- * addition to on disk orphan list failed. Stray orphan
- * list entries can cause panics at unmount time.
- */
- mutex_lock(&sbi->s_orphan_lock);
- list_del_init(&EXT4_I(inode)->i_orphan);
- mutex_unlock(&sbi->s_orphan_lock);
- }
- } else
- brelse(iloc.bh);
-
- jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
- jbd_debug(4, "orphan inode %lu will point to %d\n",
- inode->i_ino, NEXT_ORPHAN(inode));
-out:
- ext4_std_error(sb, err);
- return err;
-}
-
-/*
- * ext4_orphan_del() removes an unlinked or truncated inode from the list
- * of such inodes stored on disk, because it is finally being cleaned up.
- */
-int ext4_orphan_del(handle_t *handle, struct inode *inode)
-{
- struct list_head *prev;
- struct ext4_inode_info *ei = EXT4_I(inode);
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- __u32 ino_next;
- struct ext4_iloc iloc;
- int err = 0;
-
- if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS))
- return 0;
-
- WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
- !inode_is_locked(inode));
- /* Do this quick check before taking global s_orphan_lock. */
- if (list_empty(&ei->i_orphan))
- return 0;
-
- if (handle) {
- /* Grab inode buffer early before taking global s_orphan_lock */
- err = ext4_reserve_inode_write(handle, inode, &iloc);
- }
-
- mutex_lock(&sbi->s_orphan_lock);
- jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
-
- prev = ei->i_orphan.prev;
- list_del_init(&ei->i_orphan);
-
- /* If we're on an error path, we may not have a valid
- * transaction handle with which to update the orphan list on
- * disk, but we still need to remove the inode from the linked
- * list in memory. */
- if (!handle || err) {
- mutex_unlock(&sbi->s_orphan_lock);
- goto out_err;
- }
-
- ino_next = NEXT_ORPHAN(inode);
- if (prev == &sbi->s_orphan) {
- jbd_debug(4, "superblock will point to %u\n", ino_next);
- BUFFER_TRACE(sbi->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sbi->s_sbh);
- if (err) {
- mutex_unlock(&sbi->s_orphan_lock);
- goto out_brelse;
- }
- lock_buffer(sbi->s_sbh);
- sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
- ext4_superblock_csum_set(inode->i_sb);
- unlock_buffer(sbi->s_sbh);
- mutex_unlock(&sbi->s_orphan_lock);
- err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
- } else {
- struct ext4_iloc iloc2;
- struct inode *i_prev =
- &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
-
- jbd_debug(4, "orphan inode %lu will point to %u\n",
- i_prev->i_ino, ino_next);
- err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
- if (err) {
- mutex_unlock(&sbi->s_orphan_lock);
- goto out_brelse;
- }
- NEXT_ORPHAN(i_prev) = ino_next;
- err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
- mutex_unlock(&sbi->s_orphan_lock);
- }
- if (err)
- goto out_brelse;
- NEXT_ORPHAN(inode) = 0;
- err = ext4_mark_iloc_dirty(handle, inode, &iloc);
-out_err:
- ext4_std_error(inode->i_sb, err);
- return err;
-
-out_brelse:
- brelse(iloc.bh);
- goto out_err;
-}
-
static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
{
int retval;
@@ -3675,7 +3503,8 @@ static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent)
if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino)
return -EFSCORRUPTED;
BUFFER_TRACE(ent->dir_bh, "get_write_access");
- return ext4_journal_get_write_access(handle, ent->dir_bh);
+ return ext4_journal_get_write_access(handle, ent->dir->i_sb,
+ ent->dir_bh, EXT4_JTR_NONE);
}
static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent,
@@ -3710,7 +3539,8 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
int retval, retval2;
BUFFER_TRACE(ent->bh, "get write access");
- retval = ext4_journal_get_write_access(handle, ent->bh);
+ retval = ext4_journal_get_write_access(handle, ent->dir->i_sb, ent->bh,
+ EXT4_JTR_NONE);
if (retval)
return retval;
ent->de->inode = cpu_to_le32(ino);
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
new file mode 100644
index 000000000000..53adc8f570a3
--- /dev/null
+++ b/fs/ext4/orphan.c
@@ -0,0 +1,652 @@
+/*
+ * Ext4 orphan inode handling
+ */
+#include <linux/fs.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+
+#include "ext4.h"
+#include "ext4_jbd2.h"
+
+static int ext4_orphan_file_add(handle_t *handle, struct inode *inode)
+{
+ int i, j, start;
+ struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info;
+ int ret = 0;
+ bool found = false;
+ __le32 *bdata;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb);
+ int looped = 0;
+
+ /*
+	 * Find a block with a free orphan entry. Use the CPU number as a naive
+	 * hash to pick the search start in the orphan file
+ */
+ start = raw_smp_processor_id()*13 % oi->of_blocks;
+ i = start;
+ do {
+ if (atomic_dec_if_positive(&oi->of_binfo[i].ob_free_entries)
+ >= 0) {
+ found = true;
+ break;
+ }
+ if (++i >= oi->of_blocks)
+ i = 0;
+ } while (i != start);
+
+ if (!found) {
+ /*
+ * For now we don't grow or shrink orphan file. We just use
+ * whatever was allocated at mke2fs time. The additional
+ * credits we would have to reserve for each orphan inode
+ * operation just don't seem worth it.
+ */
+ return -ENOSPC;
+ }
+
+ ret = ext4_journal_get_write_access(handle, inode->i_sb,
+ oi->of_binfo[i].ob_bh, EXT4_JTR_ORPHAN_FILE);
+ if (ret) {
+ atomic_inc(&oi->of_binfo[i].ob_free_entries);
+ return ret;
+ }
+
+ bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
+ /* Find empty slot in a block */
+ j = 0;
+ do {
+ if (looped) {
+ /*
+ * Did we walk through the block several times without
+			 * finding a free entry? It is theoretically possible
+			 * if entries get constantly allocated and freed or
+			 * if the block is corrupted. Avoid indefinite looping
+			 * and bail out. We'll use the orphan list instead.
+ */
+ if (looped > 3) {
+ atomic_inc(&oi->of_binfo[i].ob_free_entries);
+ return -ENOSPC;
+ }
+ cond_resched();
+ }
+ while (bdata[j]) {
+ if (++j >= inodes_per_ob) {
+ j = 0;
+ looped++;
+ }
+ }
+ } while (cmpxchg(&bdata[j], (__le32)0, cpu_to_le32(inode->i_ino)) !=
+ (__le32)0);
+
+ EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j;
+ ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
+
+ return ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[i].ob_bh);
+}
+
+/*
+ * ext4_orphan_add() links an unlinked or truncated inode into a list of
+ * such inodes, starting at the superblock, in case we crash before the
+ * file is closed/deleted, or in case the inode truncate spans multiple
+ * transactions and the last transaction is not recovered after a crash.
+ *
+ * At filesystem recovery time, we walk this list deleting unlinked
+ * inodes and truncating linked inodes in ext4_orphan_cleanup().
+ *
+ * Orphan list manipulation functions must be called under i_mutex unless
+ * we are just creating the inode or deleting it.
+ */
+int ext4_orphan_add(handle_t *handle, struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_iloc iloc;
+ int err = 0, rc;
+ bool dirty = false;
+
+ if (!sbi->s_journal || is_bad_inode(inode))
+ return 0;
+
+ WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
+ !inode_is_locked(inode));
+ /*
+ * Inode orphaned in orphan file or in orphan list?
+ */
+ if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) ||
+ !list_empty(&EXT4_I(inode)->i_orphan))
+ return 0;
+
+ /*
+ * Orphan handling is only valid for files with data blocks
+ * being truncated, or files being unlinked. Note that we either
+ * hold i_mutex, or the inode can not be referenced from outside,
+ * so i_nlink should not be bumped due to race
+ */
+ ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
+
+ if (sbi->s_orphan_info.of_blocks) {
+ err = ext4_orphan_file_add(handle, inode);
+ /*
+		 * Fall back to the normal orphan list if the orphan file is
+		 * out of space
+ */
+ if (err != -ENOSPC)
+ return err;
+ }
+
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto out;
+
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (err)
+ goto out;
+
+ mutex_lock(&sbi->s_orphan_lock);
+ /*
+ * Due to previous errors inode may be already a part of on-disk
+ * orphan list. If so skip on-disk list modification.
+ */
+ if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) >
+ (le32_to_cpu(sbi->s_es->s_inodes_count))) {
+ /* Insert this inode at the head of the on-disk orphan list */
+ NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan);
+ lock_buffer(sbi->s_sbh);
+ sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(sbi->s_sbh);
+ dirty = true;
+ }
+ list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan);
+ mutex_unlock(&sbi->s_orphan_lock);
+
+ if (dirty) {
+ err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
+ rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
+ if (!err)
+ err = rc;
+ if (err) {
+ /*
+ * We have to remove inode from in-memory list if
+ * addition to on disk orphan list failed. Stray orphan
+ * list entries can cause panics at unmount time.
+ */
+ mutex_lock(&sbi->s_orphan_lock);
+ list_del_init(&EXT4_I(inode)->i_orphan);
+ mutex_unlock(&sbi->s_orphan_lock);
+ }
+ } else
+ brelse(iloc.bh);
+
+ jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
+ jbd_debug(4, "orphan inode %lu will point to %d\n",
+ inode->i_ino, NEXT_ORPHAN(inode));
+out:
+ ext4_std_error(sb, err);
+ return err;
+}
+
+static int ext4_orphan_file_del(handle_t *handle, struct inode *inode)
+{
+ struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info;
+ __le32 *bdata;
+ int blk, off;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb);
+ int ret = 0;
+
+ if (!handle)
+ goto out;
+ blk = EXT4_I(inode)->i_orphan_idx / inodes_per_ob;
+ off = EXT4_I(inode)->i_orphan_idx % inodes_per_ob;
+ if (WARN_ON_ONCE(blk >= oi->of_blocks))
+ goto out;
+
+ ret = ext4_journal_get_write_access(handle, inode->i_sb,
+ oi->of_binfo[blk].ob_bh, EXT4_JTR_ORPHAN_FILE);
+ if (ret)
+ goto out;
+
+ bdata = (__le32 *)(oi->of_binfo[blk].ob_bh->b_data);
+ bdata[off] = 0;
+ atomic_inc(&oi->of_binfo[blk].ob_free_entries);
+ ret = ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[blk].ob_bh);
+out:
+ ext4_clear_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
+ INIT_LIST_HEAD(&EXT4_I(inode)->i_orphan);
+
+ return ret;
+}
+
+/*
+ * ext4_orphan_del() removes an unlinked or truncated inode from the list
+ * of such inodes stored on disk, because it is finally being cleaned up.
+ */
+int ext4_orphan_del(handle_t *handle, struct inode *inode)
+{
+ struct list_head *prev;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ __u32 ino_next;
+ struct ext4_iloc iloc;
+ int err = 0;
+
+ if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS))
+ return 0;
+
+ WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
+ !inode_is_locked(inode));
+ if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE))
+ return ext4_orphan_file_del(handle, inode);
+
+ /* Do this quick check before taking global s_orphan_lock. */
+ if (list_empty(&ei->i_orphan))
+ return 0;
+
+ if (handle) {
+ /* Grab inode buffer early before taking global s_orphan_lock */
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ }
+
+ mutex_lock(&sbi->s_orphan_lock);
+ jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
+
+ prev = ei->i_orphan.prev;
+ list_del_init(&ei->i_orphan);
+
+ /* If we're on an error path, we may not have a valid
+ * transaction handle with which to update the orphan list on
+ * disk, but we still need to remove the inode from the linked
+ * list in memory. */
+ if (!handle || err) {
+ mutex_unlock(&sbi->s_orphan_lock);
+ goto out_err;
+ }
+
+ ino_next = NEXT_ORPHAN(inode);
+ if (prev == &sbi->s_orphan) {
+ jbd_debug(4, "superblock will point to %u\n", ino_next);
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, inode->i_sb,
+ sbi->s_sbh, EXT4_JTR_NONE);
+ if (err) {
+ mutex_unlock(&sbi->s_orphan_lock);
+ goto out_brelse;
+ }
+ lock_buffer(sbi->s_sbh);
+ sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
+ ext4_superblock_csum_set(inode->i_sb);
+ unlock_buffer(sbi->s_sbh);
+ mutex_unlock(&sbi->s_orphan_lock);
+ err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
+ } else {
+ struct ext4_iloc iloc2;
+ struct inode *i_prev =
+ &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
+
+ jbd_debug(4, "orphan inode %lu will point to %u\n",
+ i_prev->i_ino, ino_next);
+ err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
+ if (err) {
+ mutex_unlock(&sbi->s_orphan_lock);
+ goto out_brelse;
+ }
+ NEXT_ORPHAN(i_prev) = ino_next;
+ err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
+ mutex_unlock(&sbi->s_orphan_lock);
+ }
+ if (err)
+ goto out_brelse;
+ NEXT_ORPHAN(inode) = 0;
+ err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+out_err:
+ ext4_std_error(inode->i_sb, err);
+ return err;
+
+out_brelse:
+ brelse(iloc.bh);
+ goto out_err;
+}
+
+#ifdef CONFIG_QUOTA
+static int ext4_quota_on_mount(struct super_block *sb, int type)
+{
+ return dquot_quota_on_mount(sb,
+ rcu_dereference_protected(EXT4_SB(sb)->s_qf_names[type],
+ lockdep_is_held(&sb->s_umount)),
+ EXT4_SB(sb)->s_jquota_fmt, type);
+}
+#endif
+
+static void ext4_process_orphan(struct inode *inode,
+ int *nr_truncates, int *nr_orphans)
+{
+ struct super_block *sb = inode->i_sb;
+ int ret;
+
+ dquot_initialize(inode);
+ if (inode->i_nlink) {
+ if (test_opt(sb, DEBUG))
+ ext4_msg(sb, KERN_DEBUG,
+ "%s: truncating inode %lu to %lld bytes",
+ __func__, inode->i_ino, inode->i_size);
+ jbd_debug(2, "truncating inode %lu to %lld bytes\n",
+ inode->i_ino, inode->i_size);
+ inode_lock(inode);
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
+ ret = ext4_truncate(inode);
+ if (ret) {
+ /*
+ * We need to clean up the in-core orphan list
+ * manually if ext4_truncate() failed to get a
+ * transaction handle.
+ */
+ ext4_orphan_del(NULL, inode);
+ ext4_std_error(inode->i_sb, ret);
+ }
+ inode_unlock(inode);
+ (*nr_truncates)++;
+ } else {
+ if (test_opt(sb, DEBUG))
+ ext4_msg(sb, KERN_DEBUG,
+ "%s: deleting unreferenced inode %lu",
+ __func__, inode->i_ino);
+ jbd_debug(2, "deleting unreferenced inode %lu\n",
+ inode->i_ino);
+ (*nr_orphans)++;
+ }
+ iput(inode); /* The delete magic happens here! */
+}
+
+/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
+ * the superblock) which were deleted from all directories, but held open by
+ * a process at the time of a crash. We walk the list and try to delete these
+ * inodes at recovery time (only with a read-write filesystem).
+ *
+ * In order to keep the orphan inode chain consistent during traversal (in
+ * case of crash during recovery), we link each inode into the superblock
+ * orphan list_head and handle it the same way as an inode deletion during
+ * normal operation (which journals the operations for us).
+ *
+ * We only do an iget() and an iput() on each inode, which is very safe if we
+ * accidentally point at an in-use or already deleted inode. The worst that
+ * can happen in this case is that we get a "bit already cleared" message from
+ * ext4_free_inode(). The only reason we would point at a wrong inode is if
+ * e2fsck was run on this filesystem, and it must have already done the orphan
+ * inode cleanup for us, so we can safely abort without any further action.
+ */
+void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es)
+{
+ unsigned int s_flags = sb->s_flags;
+ int nr_orphans = 0, nr_truncates = 0;
+ struct inode *inode;
+ int i, j;
+#ifdef CONFIG_QUOTA
+ int quota_update = 0;
+#endif
+ __le32 *bdata;
+ struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
+
+ if (!es->s_last_orphan && !oi->of_blocks) {
+ jbd_debug(4, "no orphan inodes to clean up\n");
+ return;
+ }
+
+ if (bdev_read_only(sb->s_bdev)) {
+ ext4_msg(sb, KERN_ERR, "write access "
+ "unavailable, skipping orphan cleanup");
+ return;
+ }
+
+ /* Check if feature set would not allow a r/w mount */
+ if (!ext4_feature_set_ok(sb, 0)) {
+ ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
+ "unknown ROCOMPAT features");
+ return;
+ }
+
+ if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
+ /* don't clear list on RO mount w/ errors */
+ if (es->s_last_orphan && !(s_flags & SB_RDONLY)) {
+ ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
+ "clearing orphan list.\n");
+ es->s_last_orphan = 0;
+ }
+ jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
+ return;
+ }
+
+ if (s_flags & SB_RDONLY) {
+ ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
+ sb->s_flags &= ~SB_RDONLY;
+ }
+#ifdef CONFIG_QUOTA
+ /*
+ * Turn on quotas which were not enabled for read-only mounts if
+ * filesystem has quota feature, so that they are updated correctly.
+ */
+ if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) {
+ int ret = ext4_enable_quotas(sb);
+
+ if (!ret)
+ quota_update = 1;
+ else
+ ext4_msg(sb, KERN_ERR,
+ "Cannot turn on quotas: error %d", ret);
+ }
+
+	/* Turn on journaled quotas used for the old-style quota options */
+ for (i = 0; i < EXT4_MAXQUOTAS; i++) {
+ if (EXT4_SB(sb)->s_qf_names[i]) {
+ int ret = ext4_quota_on_mount(sb, i);
+
+ if (!ret)
+ quota_update = 1;
+ else
+ ext4_msg(sb, KERN_ERR,
+ "Cannot turn on journaled "
+ "quota: type %d: error %d", i, ret);
+ }
+ }
+#endif
+
+ while (es->s_last_orphan) {
+ /*
+ * We may have encountered an error during cleanup; if
+ * so, skip the rest.
+ */
+ if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
+ jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
+ es->s_last_orphan = 0;
+ break;
+ }
+
+ inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
+ if (IS_ERR(inode)) {
+ es->s_last_orphan = 0;
+ break;
+ }
+
+ list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
+ ext4_process_orphan(inode, &nr_truncates, &nr_orphans);
+ }
+
+ for (i = 0; i < oi->of_blocks; i++) {
+ bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
+ for (j = 0; j < inodes_per_ob; j++) {
+ if (!bdata[j])
+ continue;
+ inode = ext4_orphan_get(sb, le32_to_cpu(bdata[j]));
+ if (IS_ERR(inode))
+ continue;
+ ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
+ EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j;
+ ext4_process_orphan(inode, &nr_truncates, &nr_orphans);
+ }
+ }
+
+#define PLURAL(x) (x), ((x) == 1) ? "" : "s"
+
+ if (nr_orphans)
+ ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
+ PLURAL(nr_orphans));
+ if (nr_truncates)
+ ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
+ PLURAL(nr_truncates));
+#ifdef CONFIG_QUOTA
+ /* Turn off quotas if they were enabled for orphan cleanup */
+ if (quota_update) {
+ for (i = 0; i < EXT4_MAXQUOTAS; i++) {
+ if (sb_dqopt(sb)->files[i])
+ dquot_quota_off(sb, i);
+ }
+ }
+#endif
+ sb->s_flags = s_flags; /* Restore SB_RDONLY status */
+}
+
+void ext4_release_orphan_info(struct super_block *sb)
+{
+ int i;
+ struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
+
+ if (!oi->of_blocks)
+ return;
+ for (i = 0; i < oi->of_blocks; i++)
+ brelse(oi->of_binfo[i].ob_bh);
+ kfree(oi->of_binfo);
+}
+
+static struct ext4_orphan_block_tail *ext4_orphan_block_tail(
+ struct super_block *sb,
+ struct buffer_head *bh)
+{
+ return (struct ext4_orphan_block_tail *)(bh->b_data + sb->s_blocksize -
+ sizeof(struct ext4_orphan_block_tail));
+}
+
+static int ext4_orphan_file_block_csum_verify(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ __u32 calculated;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
+ struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
+ struct ext4_orphan_block_tail *ot;
+ __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr);
+
+ if (!ext4_has_metadata_csum(sb))
+ return 1;
+
+ ot = ext4_orphan_block_tail(sb, bh);
+ calculated = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed,
+ (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr));
+ calculated = ext4_chksum(EXT4_SB(sb), calculated, (__u8 *)bh->b_data,
+ inodes_per_ob * sizeof(__u32));
+ return le32_to_cpu(ot->ob_checksum) == calculated;
+}
+
+/* This gets called only when checksumming is enabled */
+void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers,
+ struct buffer_head *bh,
+ void *data, size_t size)
+{
+ struct super_block *sb = EXT4_TRIGGER(triggers)->sb;
+ __u32 csum;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
+ struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
+ struct ext4_orphan_block_tail *ot;
+ __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr);
+
+ csum = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed,
+ (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr));
+ csum = ext4_chksum(EXT4_SB(sb), csum, (__u8 *)data,
+ inodes_per_ob * sizeof(__u32));
+ ot = ext4_orphan_block_tail(sb, bh);
+ ot->ob_checksum = cpu_to_le32(csum);
+}
+
+int ext4_init_orphan_info(struct super_block *sb)
+{
+ struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
+ struct inode *inode;
+ int i, j;
+ int ret;
+ int free;
+ __le32 *bdata;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
+ struct ext4_orphan_block_tail *ot;
+ ino_t orphan_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_orphan_file_inum);
+
+ if (!ext4_has_feature_orphan_file(sb))
+ return 0;
+
+ inode = ext4_iget(sb, orphan_ino, EXT4_IGET_SPECIAL);
+ if (IS_ERR(inode)) {
+ ext4_msg(sb, KERN_ERR, "get orphan inode failed");
+ return PTR_ERR(inode);
+ }
+ oi->of_blocks = inode->i_size >> sb->s_blocksize_bits;
+ oi->of_csum_seed = EXT4_I(inode)->i_csum_seed;
+ oi->of_binfo = kmalloc(oi->of_blocks*sizeof(struct ext4_orphan_block),
+ GFP_KERNEL);
+ if (!oi->of_binfo) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+ for (i = 0; i < oi->of_blocks; i++) {
+ oi->of_binfo[i].ob_bh = ext4_bread(NULL, inode, i, 0);
+ if (IS_ERR(oi->of_binfo[i].ob_bh)) {
+ ret = PTR_ERR(oi->of_binfo[i].ob_bh);
+ goto out_free;
+ }
+ if (!oi->of_binfo[i].ob_bh) {
+ ret = -EIO;
+ goto out_free;
+ }
+ ot = ext4_orphan_block_tail(sb, oi->of_binfo[i].ob_bh);
+ if (le32_to_cpu(ot->ob_magic) != EXT4_ORPHAN_BLOCK_MAGIC) {
+ ext4_error(sb, "orphan file block %d: bad magic", i);
+ ret = -EIO;
+ goto out_free;
+ }
+ if (!ext4_orphan_file_block_csum_verify(sb,
+ oi->of_binfo[i].ob_bh)) {
+ ext4_error(sb, "orphan file block %d: bad checksum", i);
+ ret = -EIO;
+ goto out_free;
+ }
+ bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
+ free = 0;
+ for (j = 0; j < inodes_per_ob; j++)
+ if (bdata[j] == 0)
+ free++;
+ atomic_set(&oi->of_binfo[i].ob_free_entries, free);
+ }
+ iput(inode);
+ return 0;
+out_free:
+ for (i--; i >= 0; i--)
+ brelse(oi->of_binfo[i].ob_bh);
+ kfree(oi->of_binfo);
+out_put:
+ iput(inode);
+ return ret;
+}
+
+int ext4_orphan_file_empty(struct super_block *sb)
+{
+ struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
+ int i;
+ int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
+
+ if (!ext4_has_feature_orphan_file(sb))
+ return 1;
+ for (i = 0; i < oi->of_blocks; i++)
+ if (atomic_read(&oi->of_binfo[i].ob_free_entries) !=
+ inodes_per_ob)
+ return 0;
+ return 1;
+}
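
ext4_orphan_file_add() above claims an orphan slot without taking the global s_orphan_lock: the CPU number picks a starting block, atomic_dec_if_positive() on that block's free-entry counter reserves room, and cmpxchg() stamps the first zero entry with the inode number. The following sketch models that lock-free claim with C11 atomics; it is an illustration of the algorithm under simplified assumptions, not the ext4 code.

/* User-space model of claiming an orphan-file slot without a global
 * lock: reserve via a free counter, then compare-and-swap a zero entry
 * to the inode number. Illustrative only. */
#include <stdatomic.h>
#include <stdio.h>

#define SLOTS_PER_BLOCK 4

struct orphan_block {
	atomic_int free_entries;
	_Atomic unsigned int slot[SLOTS_PER_BLOCK];
};

static int claim_slot(struct orphan_block *ob, unsigned int ino)
{
	int j;

	/* Reserve one entry; back off if the block is already full. */
	if (atomic_fetch_sub(&ob->free_entries, 1) <= 0) {
		atomic_fetch_add(&ob->free_entries, 1);
		return -1;
	}
	/* The reservation guarantees a zero slot exists; we only race over which one. */
	for (j = 0; ; j = (j + 1) % SLOTS_PER_BLOCK) {
		unsigned int expected = 0;

		if (atomic_compare_exchange_strong(&ob->slot[j], &expected, ino))
			return j;
	}
}

int main(void)
{
	struct orphan_block ob = { .free_entries = SLOTS_PER_BLOCK };

	printf("inode 12 -> slot %d\n", claim_slot(&ob, 12));
	printf("inode 99 -> slot %d\n", claim_slot(&ob, 99));
	return 0;
}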
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 7a9f1adef679..b63cb88ccdae 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -409,7 +409,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
if (unlikely(!bh))
return ERR_PTR(-ENOMEM);
BUFFER_TRACE(bh, "get_write_access");
- if ((err = ext4_journal_get_write_access(handle, bh))) {
+ err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE);
+ if (err) {
brelse(bh);
bh = ERR_PTR(err);
} else {
@@ -474,7 +475,8 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
return -ENOMEM;
BUFFER_TRACE(bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh);
+ err = ext4_journal_get_write_access(handle, sb, bh,
+ EXT4_JTR_NONE);
if (err) {
brelse(bh);
return err;
@@ -569,7 +571,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
}
BUFFER_TRACE(gdb, "get_write_access");
- err = ext4_journal_get_write_access(handle, gdb);
+ err = ext4_journal_get_write_access(handle, sb, gdb,
+ EXT4_JTR_NONE);
if (err) {
brelse(gdb);
goto out;
@@ -837,17 +840,18 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
}
BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ err = ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh,
+ EXT4_JTR_NONE);
if (unlikely(err))
goto errout;
BUFFER_TRACE(gdb_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gdb_bh);
+ err = ext4_journal_get_write_access(handle, sb, gdb_bh, EXT4_JTR_NONE);
if (unlikely(err))
goto errout;
BUFFER_TRACE(dind, "get_write_access");
- err = ext4_journal_get_write_access(handle, dind);
+ err = ext4_journal_get_write_access(handle, sb, dind, EXT4_JTR_NONE);
if (unlikely(err)) {
ext4_std_error(sb, err);
goto errout;
@@ -956,7 +960,7 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
n_group_desc[gdb_num] = gdb_bh;
BUFFER_TRACE(gdb_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gdb_bh);
+ err = ext4_journal_get_write_access(handle, sb, gdb_bh, EXT4_JTR_NONE);
if (err) {
kvfree(n_group_desc);
brelse(gdb_bh);
@@ -1042,7 +1046,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
for (i = 0; i < reserved_gdb; i++) {
BUFFER_TRACE(primary[i], "get_write_access");
- if ((err = ext4_journal_get_write_access(handle, primary[i])))
+ if ((err = ext4_journal_get_write_access(handle, sb, primary[i],
+ EXT4_JTR_NONE)))
goto exit_bh;
}
@@ -1149,10 +1154,9 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
backup_block, backup_block -
ext4_group_first_block_no(sb, group));
BUFFER_TRACE(bh, "get_write_access");
- if ((err = ext4_journal_get_write_access(handle, bh))) {
- brelse(bh);
+ if ((err = ext4_journal_get_write_access(handle, sb, bh,
+ EXT4_JTR_NONE)))
break;
- }
lock_buffer(bh);
memcpy(bh->b_data, data, size);
if (rest)
@@ -1232,7 +1236,8 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc,
gdb_num);
BUFFER_TRACE(gdb_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gdb_bh);
+ err = ext4_journal_get_write_access(handle, sb, gdb_bh,
+ EXT4_JTR_NONE);
if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group))
err = reserve_backup_gdb(handle, resize_inode, group);
@@ -1509,7 +1514,8 @@ static int ext4_flex_group_add(struct super_block *sb,
}
BUFFER_TRACE(sbi->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+ err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
+ EXT4_JTR_NONE);
if (err)
goto exit_journal;
@@ -1722,7 +1728,8 @@ static int ext4_group_extend_no_check(struct super_block *sb,
}
BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ err = ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh,
+ EXT4_JTR_NONE);
if (err) {
ext4_warning(sb, "error %d on journal write access", err);
goto errout;
@@ -1884,7 +1891,8 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
return PTR_ERR(handle);
BUFFER_TRACE(sbi->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+ err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
+ EXT4_JTR_NONE);
if (err)
goto errout;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dfa09a277b56..136940af00b8 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -80,7 +80,6 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
const char *dev_name, void *data);
static inline int ext2_feature_set_ok(struct super_block *sb);
static inline int ext3_feature_set_ok(struct super_block *sb);
-static int ext4_feature_set_ok(struct super_block *sb, int readonly);
static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
@@ -90,12 +89,9 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
/*
* Lock ordering
*
- * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and
- * i_mmap_rwsem (inode->i_mmap_rwsem)!
- *
* page fault path:
- * mmap_lock -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start ->
- * page lock -> i_data_sem (rw)
+ * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start
+ * -> page lock -> i_data_sem (rw)
*
* buffered write path:
* sb_start_write -> i_mutex -> mmap_lock
@@ -103,8 +99,9 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
* i_data_sem (rw)
*
* truncate:
- * sb_start_write -> i_mutex -> i_mmap_sem (w) -> i_mmap_rwsem (w) -> page lock
- * sb_start_write -> i_mutex -> i_mmap_sem (w) -> transaction start ->
+ * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) ->
+ * page lock
+ * sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start ->
* i_data_sem (rw)
*
* direct IO:
@@ -1175,6 +1172,7 @@ static void ext4_put_super(struct super_block *sb)
flush_work(&sbi->s_error_work);
destroy_workqueue(sbi->rsv_conversion_wq);
+ ext4_release_orphan_info(sb);
/*
* Unregister sysfs before destroying jbd2 journal.
@@ -1200,6 +1198,7 @@ static void ext4_put_super(struct super_block *sb)
if (!sb_rdonly(sb) && !aborted) {
ext4_clear_feature_journal_needs_recovery(sb);
+ ext4_clear_feature_orphan_present(sb);
es->s_state = cpu_to_le16(sbi->s_mount_state);
}
if (!sb_rdonly(sb))
@@ -1360,7 +1359,6 @@ static void init_once(void *foo)
INIT_LIST_HEAD(&ei->i_orphan);
init_rwsem(&ei->xattr_sem);
init_rwsem(&ei->i_data_sem);
- init_rwsem(&ei->i_mmap_sem);
inode_init_once(&ei->vfs_inode);
ext4_fc_init_inode(&ei->vfs_inode);
}
@@ -1585,14 +1583,12 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot);
static int ext4_write_info(struct super_block *sb, int type);
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
const struct path *path);
-static int ext4_quota_on_mount(struct super_block *sb, int type);
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
size_t len, loff_t off);
static ssize_t ext4_quota_write(struct super_block *sb, int type,
const char *data, size_t len, loff_t off);
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
unsigned int flags);
-static int ext4_enable_quotas(struct super_block *sb);
static struct dquot **ext4_get_dquots(struct inode *inode)
{
@@ -2687,8 +2683,11 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
le16_add_cpu(&es->s_mnt_count, 1);
ext4_update_tstamp(es, s_mtime);
- if (sbi->s_journal)
+ if (sbi->s_journal) {
ext4_set_feature_journal_needs_recovery(sb);
+ if (ext4_has_feature_orphan_file(sb))
+ ext4_set_feature_orphan_present(sb);
+ }
err = ext4_commit_super(sb);
done:
@@ -2970,169 +2969,6 @@ static int ext4_check_descriptors(struct super_block *sb,
return 1;
}
-/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
- * the superblock) which were deleted from all directories, but held open by
- * a process at the time of a crash. We walk the list and try to delete these
- * inodes at recovery time (only with a read-write filesystem).
- *
- * In order to keep the orphan inode chain consistent during traversal (in
- * case of crash during recovery), we link each inode into the superblock
- * orphan list_head and handle it the same way as an inode deletion during
- * normal operation (which journals the operations for us).
- *
- * We only do an iget() and an iput() on each inode, which is very safe if we
- * accidentally point at an in-use or already deleted inode. The worst that
- * can happen in this case is that we get a "bit already cleared" message from
- * ext4_free_inode(). The only reason we would point at a wrong inode is if
- * e2fsck was run on this filesystem, and it must have already done the orphan
- * inode cleanup for us, so we can safely abort without any further action.
- */
-static void ext4_orphan_cleanup(struct super_block *sb,
- struct ext4_super_block *es)
-{
- unsigned int s_flags = sb->s_flags;
- int ret, nr_orphans = 0, nr_truncates = 0;
-#ifdef CONFIG_QUOTA
- int quota_update = 0;
- int i;
-#endif
- if (!es->s_last_orphan) {
- jbd_debug(4, "no orphan inodes to clean up\n");
- return;
- }
-
- if (bdev_read_only(sb->s_bdev)) {
- ext4_msg(sb, KERN_ERR, "write access "
- "unavailable, skipping orphan cleanup");
- return;
- }
-
- /* Check if feature set would not allow a r/w mount */
- if (!ext4_feature_set_ok(sb, 0)) {
- ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
- "unknown ROCOMPAT features");
- return;
- }
-
- if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
- /* don't clear list on RO mount w/ errors */
- if (es->s_last_orphan && !(s_flags & SB_RDONLY)) {
- ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
- "clearing orphan list.\n");
- es->s_last_orphan = 0;
- }
- jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
- return;
- }
-
- if (s_flags & SB_RDONLY) {
- ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
- sb->s_flags &= ~SB_RDONLY;
- }
-#ifdef CONFIG_QUOTA
- /*
- * Turn on quotas which were not enabled for read-only mounts if
- * filesystem has quota feature, so that they are updated correctly.
- */
- if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) {
- int ret = ext4_enable_quotas(sb);
-
- if (!ret)
- quota_update = 1;
- else
- ext4_msg(sb, KERN_ERR,
- "Cannot turn on quotas: error %d", ret);
- }
-
- /* Turn on journaled quotas used for old sytle */
- for (i = 0; i < EXT4_MAXQUOTAS; i++) {
- if (EXT4_SB(sb)->s_qf_names[i]) {
- int ret = ext4_quota_on_mount(sb, i);
-
- if (!ret)
- quota_update = 1;
- else
- ext4_msg(sb, KERN_ERR,
- "Cannot turn on journaled "
- "quota: type %d: error %d", i, ret);
- }
- }
-#endif
-
- while (es->s_last_orphan) {
- struct inode *inode;
-
- /*
- * We may have encountered an error during cleanup; if
- * so, skip the rest.
- */
- if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
- jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
- es->s_last_orphan = 0;
- break;
- }
-
- inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
- if (IS_ERR(inode)) {
- es->s_last_orphan = 0;
- break;
- }
-
- list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
- dquot_initialize(inode);
- if (inode->i_nlink) {
- if (test_opt(sb, DEBUG))
- ext4_msg(sb, KERN_DEBUG,
- "%s: truncating inode %lu to %lld bytes",
- __func__, inode->i_ino, inode->i_size);
- jbd_debug(2, "truncating inode %lu to %lld bytes\n",
- inode->i_ino, inode->i_size);
- inode_lock(inode);
- truncate_inode_pages(inode->i_mapping, inode->i_size);
- ret = ext4_truncate(inode);
- if (ret) {
- /*
- * We need to clean up the in-core orphan list
- * manually if ext4_truncate() failed to get a
- * transaction handle.
- */
- ext4_orphan_del(NULL, inode);
- ext4_std_error(inode->i_sb, ret);
- }
- inode_unlock(inode);
- nr_truncates++;
- } else {
- if (test_opt(sb, DEBUG))
- ext4_msg(sb, KERN_DEBUG,
- "%s: deleting unreferenced inode %lu",
- __func__, inode->i_ino);
- jbd_debug(2, "deleting unreferenced inode %lu\n",
- inode->i_ino);
- nr_orphans++;
- }
- iput(inode); /* The delete magic happens here! */
- }
-
-#define PLURAL(x) (x), ((x) == 1) ? "" : "s"
-
- if (nr_orphans)
- ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
- PLURAL(nr_orphans));
- if (nr_truncates)
- ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
- PLURAL(nr_truncates));
-#ifdef CONFIG_QUOTA
- /* Turn off quotas if they were enabled for orphan cleanup */
- if (quota_update) {
- for (i = 0; i < EXT4_MAXQUOTAS; i++) {
- if (sb_dqopt(sb)->files[i])
- dquot_quota_off(sb, i);
- }
- }
-#endif
- sb->s_flags = s_flags; /* Restore SB_RDONLY status */
-}
-
/*
* Maximal extent format file size.
* Resulting logical blkno at s_maxbytes must fit in our on-disk
@@ -3312,7 +3148,7 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
* Returns 1 if this filesystem can be mounted as requested,
* 0 if it cannot be.
*/
-static int ext4_feature_set_ok(struct super_block *sb, int readonly)
+int ext4_feature_set_ok(struct super_block *sb, int readonly)
{
if (ext4_has_unknown_ext4_incompat_features(sb)) {
ext4_msg(sb, KERN_ERR,
@@ -4014,6 +3850,20 @@ static const char *ext4_quota_mode(struct super_block *sb)
#endif
}
+static void ext4_setup_csum_trigger(struct super_block *sb,
+ enum ext4_journal_trigger_type type,
+ void (*trigger)(
+ struct jbd2_buffer_trigger_type *type,
+ struct buffer_head *bh,
+ void *mapped_data,
+ size_t size))
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ sbi->s_journal_triggers[type].sb = sb;
+ sbi->s_journal_triggers[type].tr_triggers.t_frozen = trigger;
+}
+
static int ext4_fill_super(struct super_block *sb, void *data, int silent)
{
struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
@@ -4112,6 +3962,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
silent = 1;
goto cantfind_ext4;
}
+ ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE,
+ ext4_orphan_file_block_trigger);
/* Load the checksum driver */
sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
@@ -4776,6 +4628,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_root = NULL;
needs_recovery = (es->s_last_orphan != 0 ||
+ ext4_has_feature_orphan_present(sb) ||
ext4_has_feature_journal_needs_recovery(sb));
if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb))
@@ -5032,6 +4885,14 @@ no_journal:
err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
GFP_KERNEL);
}
+ /*
+ * Update the checksum after updating free space/inode
+ * counters. Otherwise the superblock can have an incorrect
+ * checksum in the buffer cache until it is written out, and
+ * e2fsprogs tools trying to open the file system immediately
+ * after it is mounted can fail.
+ */
+ ext4_superblock_csum_set(sb);
if (!err)
err = percpu_counter_init(&sbi->s_dirs_counter,
ext4_count_dirs(sb), GFP_KERNEL);
@@ -5066,12 +4927,15 @@ no_journal:
if (err)
goto failed_mount7;
+ err = ext4_init_orphan_info(sb);
+ if (err)
+ goto failed_mount8;
#ifdef CONFIG_QUOTA
/* Enable quota usage during mount. */
if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) {
err = ext4_enable_quotas(sb);
if (err)
- goto failed_mount8;
+ goto failed_mount9;
}
#endif /* CONFIG_QUOTA */
@@ -5090,7 +4954,7 @@ no_journal:
ext4_msg(sb, KERN_INFO, "recovery complete");
err = ext4_mark_recovery_complete(sb, es);
if (err)
- goto failed_mount8;
+ goto failed_mount9;
}
if (EXT4_SB(sb)->s_journal) {
if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
@@ -5136,6 +5000,8 @@ cantfind_ext4:
ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
goto failed_mount;
+failed_mount9:
+ ext4_release_orphan_info(sb);
failed_mount8:
ext4_unregister_sysfs(sb);
kobject_put(&sbi->s_kobj);
@@ -5646,8 +5512,15 @@ static int ext4_mark_recovery_complete(struct super_block *sb,
if (err < 0)
goto out;
- if (ext4_has_feature_journal_needs_recovery(sb) && sb_rdonly(sb)) {
+ if (sb_rdonly(sb) && (ext4_has_feature_journal_needs_recovery(sb) ||
+ ext4_has_feature_orphan_present(sb))) {
+ if (!ext4_orphan_file_empty(sb)) {
+ ext4_error(sb, "Orphan file not empty on read-only fs.");
+ err = -EFSCORRUPTED;
+ goto out;
+ }
ext4_clear_feature_journal_needs_recovery(sb);
+ ext4_clear_feature_orphan_present(sb);
ext4_commit_super(sb);
}
out:
@@ -5790,6 +5663,8 @@ static int ext4_freeze(struct super_block *sb)
/* Journal blocked and flushed, clear needs_recovery flag. */
ext4_clear_feature_journal_needs_recovery(sb);
+ if (ext4_orphan_file_empty(sb))
+ ext4_clear_feature_orphan_present(sb);
}
error = ext4_commit_super(sb);
@@ -5812,6 +5687,8 @@ static int ext4_unfreeze(struct super_block *sb)
if (EXT4_SB(sb)->s_journal) {
/* Reset the needs_recovery flag before the fs is unlocked. */
ext4_set_feature_journal_needs_recovery(sb);
+ if (ext4_has_feature_orphan_file(sb))
+ ext4_set_feature_orphan_present(sb);
}
ext4_commit_super(sb);
@@ -6015,7 +5892,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
* around from a previously readonly bdev mount,
* require a full umount/remount for now.
*/
- if (es->s_last_orphan) {
+ if (es->s_last_orphan || !ext4_orphan_file_empty(sb)) {
ext4_msg(sb, KERN_WARNING, "Couldn't "
"remount RDWR because of unprocessed "
"orphan inode list. Please "
@@ -6312,16 +6189,6 @@ static int ext4_write_info(struct super_block *sb, int type)
return ret;
}
-/*
- * Turn on quotas during mount time - we need to find
- * the quota file and such...
- */
-static int ext4_quota_on_mount(struct super_block *sb, int type)
-{
- return dquot_quota_on_mount(sb, get_qf_name(sb, EXT4_SB(sb), type),
- EXT4_SB(sb)->s_jquota_fmt, type);
-}
-
static void lockdep_set_quota_inode(struct inode *inode, int subclass)
{
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -6451,7 +6318,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
}
/* Enable usage tracking for all quota types. */
-static int ext4_enable_quotas(struct super_block *sb)
+int ext4_enable_quotas(struct super_block *sb)
{
int type, err = 0;
unsigned long qf_inums[EXT4_MAXQUOTAS] = {
@@ -6609,7 +6476,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
if (!bh)
goto out;
BUFFER_TRACE(bh, "get write access");
- err = ext4_journal_get_write_access(handle, bh);
+ err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE);
if (err) {
brelse(bh);
return err;
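
The super.c changes above convert every ext4_journal_get_write_access() call to a four-argument form that also names the superblock and a journal trigger type, and register a jbd2 t_frozen callback for the orphan file through ext4_setup_csum_trigger(). A minimal sketch of how such a frozen trigger is wired up, using only the jbd2 interfaces visible in the hunks; the demo_* names and the trigger body are illustrative, not ext4's actual implementation:

#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/buffer_head.h>

/* Container pairing a jbd2 trigger with the superblock it serves,
 * mirroring the s_journal_triggers[] entries filled in above. */
struct demo_csum_trigger {
	struct jbd2_buffer_trigger_type tr_triggers;
	struct super_block *sb;
};

static struct demo_csum_trigger demo_trigger;

/* t_frozen runs on the frozen copy of a metadata buffer just before
 * it is written to the journal, so a checksum computed here covers
 * exactly the bytes that will land on disk. */
static void demo_frozen(struct jbd2_buffer_trigger_type *type,
			struct buffer_head *bh, void *mapped_data,
			size_t size)
{
	/* recompute the block checksum over mapped_data[0..size) */
}

static void demo_setup_trigger(struct super_block *sb)
{
	demo_trigger.sb = sb;
	demo_trigger.tr_triggers.t_frozen = demo_frozen;
}

When the trigger type passed to ext4_journal_get_write_access() is not EXT4_JTR_NONE, the wrapper is presumably expected to attach the registered trigger to the buffer (jbd2 exposes jbd2_journal_set_triggers() for this); the EXT4_JTR_NONE callers in the hunks above simply opt out of any trigger.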
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index dd05af983092..69109746e6e2 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -52,10 +52,20 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry,
return paddr;
}
+static int ext4_encrypted_symlink_getattr(struct user_namespace *mnt_userns,
+ const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+{
+ ext4_getattr(mnt_userns, path, stat, request_mask, query_flags);
+
+ return fscrypt_symlink_getattr(path, stat);
+}
+
const struct inode_operations ext4_encrypted_symlink_inode_operations = {
.get_link = ext4_encrypted_get_link,
.setattr = ext4_setattr,
- .getattr = ext4_getattr,
+ .getattr = ext4_encrypted_symlink_getattr,
.listxattr = ext4_listxattr,
};
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h
index bcbe3668c1d4..ce84aa2786c7 100644
--- a/fs/ext4/truncate.h
+++ b/fs/ext4/truncate.h
@@ -11,14 +11,16 @@
*/
static inline void ext4_truncate_failed_write(struct inode *inode)
{
+ struct address_space *mapping = inode->i_mapping;
+
/*
* We don't need to call ext4_break_layouts() because the blocks we
* are truncating were never visible to userspace.
*/
- down_write(&EXT4_I(inode)->i_mmap_sem);
- truncate_inode_pages(inode->i_mapping, inode->i_size);
+ filemap_invalidate_lock(mapping);
+ truncate_inode_pages(mapping, inode->i_size);
ext4_truncate(inode);
- up_write(&EXT4_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
}
/*
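
The truncate.h hunk (and the matching f2fs changes further down) replaces the filesystem-private i_mmap_sem with the generic invalidate_lock in struct address_space, taken through the filemap_invalidate_lock() helpers. A minimal sketch of the pattern, assuming a kernel that provides these helpers; demo_truncate_range() is a hypothetical caller, not a function from the diff:

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/* Serialize a truncate against page faults and other invalidations by
 * holding mapping->invalidate_lock for write while the page cache is
 * shot down; fault paths take the same lock shared. */
static void demo_truncate_range(struct inode *inode, loff_t newsize)
{
	struct address_space *mapping = inode->i_mapping;

	filemap_invalidate_lock(mapping);
	truncate_inode_pages(mapping, newsize);
	/* filesystem-specific block truncation goes here */
	filemap_invalidate_unlock(mapping);
}

The shared variants (filemap_invalidate_lock_shared/_unlock_shared) show up later in the f2fs fault path, matching the old down_read/up_read on i_mmap_sem.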
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 6dd5c05c444a..1e0fc1ed845b 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -791,7 +791,8 @@ static void ext4_xattr_update_super_block(handle_t *handle,
return;
BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
- if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
+ if (ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh,
+ EXT4_JTR_NONE) == 0) {
lock_buffer(EXT4_SB(sb)->s_sbh);
ext4_set_feature_xattr(sb);
ext4_superblock_csum_set(sb);
@@ -1169,7 +1170,8 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
continue;
}
if (err > 0) {
- err = ext4_journal_get_write_access(handle, bh);
+ err = ext4_journal_get_write_access(handle,
+ parent->i_sb, bh, EXT4_JTR_NONE);
if (err) {
ext4_warning_inode(ea_inode,
"Re-get write access err=%d",
@@ -1230,7 +1232,8 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
int error = 0;
BUFFER_TRACE(bh, "get_write_access");
- error = ext4_journal_get_write_access(handle, bh);
+ error = ext4_journal_get_write_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (error)
goto out;
@@ -1371,7 +1374,8 @@ retry:
"ext4_getblk() return bh = NULL");
return -EFSCORRUPTED;
}
- ret = ext4_journal_get_write_access(handle, bh);
+ ret = ext4_journal_get_write_access(handle, ea_inode->i_sb, bh,
+ EXT4_JTR_NONE);
if (ret)
goto out;
@@ -1855,7 +1859,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
if (s->base) {
BUFFER_TRACE(bs->bh, "get_write_access");
- error = ext4_journal_get_write_access(handle, bs->bh);
+ error = ext4_journal_get_write_access(handle, sb, bs->bh,
+ EXT4_JTR_NONE);
if (error)
goto cleanup;
lock_buffer(bs->bh);
@@ -1987,8 +1992,9 @@ inserted:
if (error)
goto cleanup;
BUFFER_TRACE(new_bh, "get_write_access");
- error = ext4_journal_get_write_access(handle,
- new_bh);
+ error = ext4_journal_get_write_access(
+ handle, sb, new_bh,
+ EXT4_JTR_NONE);
if (error)
goto cleanup_dquot;
lock_buffer(new_bh);
@@ -2092,7 +2098,8 @@ getblk_failed:
}
lock_buffer(new_bh);
- error = ext4_journal_get_create_access(handle, new_bh);
+ error = ext4_journal_get_create_access(handle, sb,
+ new_bh, EXT4_JTR_NONE);
if (error) {
unlock_buffer(new_bh);
error = -EIO;
@@ -2848,7 +2855,8 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
goto cleanup;
}
- error = ext4_journal_get_write_access(handle, iloc.bh);
+ error = ext4_journal_get_write_access(handle, inode->i_sb,
+ iloc.bh, EXT4_JTR_NONE);
if (error) {
EXT4_ERROR_INODE(inode, "write access (error %d)",
error);
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 7669de7b49ce..7eea3cfd894d 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -105,6 +105,13 @@ config F2FS_FS_LZO
help
Support LZO compress algorithm, if unsure, say Y.
+config F2FS_FS_LZORLE
+ bool "LZO-RLE compression support"
+ depends on F2FS_FS_LZO
+ default y
+ help
+ Support LZO-RLE compress algorithm, if unsure, say Y.
+
config F2FS_FS_LZ4
bool "LZ4 compression support"
depends on F2FS_FS_COMPRESSION
@@ -114,7 +121,6 @@ config F2FS_FS_LZ4
config F2FS_FS_LZ4HC
bool "LZ4HC compression support"
- depends on F2FS_FS_COMPRESSION
depends on F2FS_FS_LZ4
default y
help
@@ -128,10 +134,11 @@ config F2FS_FS_ZSTD
help
Support ZSTD compress algorithm, if unsure, say Y.
-config F2FS_FS_LZORLE
- bool "LZO-RLE compression support"
- depends on F2FS_FS_COMPRESSION
- depends on F2FS_FS_LZO
+config F2FS_IOSTAT
+ bool "F2FS IO statistics information"
+ depends on F2FS_FS
default y
help
- Support LZO-RLE compress algorithm, if unsure, say Y.
+ Support getting IO statistics through sysfs and printing out periodic
+ IO statistics tracepoint events. You have to turn on the "iostat_enable"
+ sysfs node to enable this feature.
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index e5295746208b..8a7322d229e4 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -9,3 +9,4 @@ f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
f2fs-$(CONFIG_FS_VERITY) += verity.o
f2fs-$(CONFIG_F2FS_FS_COMPRESSION) += compress.o
+f2fs-$(CONFIG_F2FS_IOSTAT) += iostat.o
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 239ad9453b99..16e826e01f09 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -196,8 +196,11 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type,
return acl;
}
-struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
+struct posix_acl *f2fs_get_acl(struct inode *inode, int type, bool rcu)
{
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
return __f2fs_get_acl(inode, type, NULL);
}
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
index 986fd1bc780b..a26e33cab4ff 100644
--- a/fs/f2fs/acl.h
+++ b/fs/f2fs/acl.h
@@ -33,7 +33,7 @@ struct f2fs_acl_header {
#ifdef CONFIG_F2FS_FS_POSIX_ACL
-extern struct posix_acl *f2fs_get_acl(struct inode *, int);
+extern struct posix_acl *f2fs_get_acl(struct inode *, int, bool);
extern int f2fs_set_acl(struct user_namespace *, struct inode *,
struct posix_acl *, int);
extern int f2fs_init_acl(struct inode *, struct inode *, struct page *,
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 6c208108d69c..83e9bc0f91ff 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -18,6 +18,7 @@
#include "f2fs.h"
#include "node.h"
#include "segment.h"
+#include "iostat.h"
#include <trace/events/f2fs.h>
#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
@@ -465,16 +466,29 @@ static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino,
unsigned int devidx, int type)
{
struct inode_management *im = &sbi->im[type];
- struct ino_entry *e, *tmp;
+ struct ino_entry *e = NULL, *new = NULL;
- tmp = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS);
+ if (type == FLUSH_INO) {
+ rcu_read_lock();
+ e = radix_tree_lookup(&im->ino_root, ino);
+ rcu_read_unlock();
+ }
+
+retry:
+ if (!e)
+ new = f2fs_kmem_cache_alloc(ino_entry_slab,
+ GFP_NOFS, true, NULL);
radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);
spin_lock(&im->ino_lock);
e = radix_tree_lookup(&im->ino_root, ino);
if (!e) {
- e = tmp;
+ if (!new) {
+ spin_unlock(&im->ino_lock);
+ goto retry;
+ }
+ e = new;
if (unlikely(radix_tree_insert(&im->ino_root, ino, e)))
f2fs_bug_on(sbi, 1);
@@ -492,8 +506,8 @@ static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino,
spin_unlock(&im->ino_lock);
radix_tree_preload_end();
- if (e != tmp)
- kmem_cache_free(ino_entry_slab, tmp);
+ if (new && e != new)
+ kmem_cache_free(ino_entry_slab, new);
}
static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
@@ -1289,12 +1303,20 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
unsigned long flags;
- spin_lock_irqsave(&sbi->cp_lock, flags);
+ if (cpc->reason & CP_UMOUNT) {
+ if (le32_to_cpu(ckpt->cp_pack_total_block_count) >
+ sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) {
+ clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
+ f2fs_notice(sbi, "Disable nat_bits due to no space");
+ } else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) &&
+ f2fs_nat_bitmap_enabled(sbi)) {
+ f2fs_enable_nat_bits(sbi);
+ set_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
+ f2fs_notice(sbi, "Rebuild and enable nat_bits");
+ }
+ }
- if ((cpc->reason & CP_UMOUNT) &&
- le32_to_cpu(ckpt->cp_pack_total_block_count) >
- sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks)
- disable_nat_bits(sbi, false);
+ spin_lock_irqsave(&sbi->cp_lock, flags);
if (cpc->reason & CP_TRIMMED)
__set_ckpt_flags(ckpt, CP_TRIMMED_FLAG);
@@ -1480,7 +1502,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
start_blk = __start_cp_next_addr(sbi);
/* write nat bits */
- if (enabled_nat_bits(sbi, cpc)) {
+ if ((cpc->reason & CP_UMOUNT) &&
+ is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) {
__u64 cp_ver = cur_cp_version(ckpt);
block_t blk;
@@ -1639,8 +1662,11 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* write cached NAT/SIT entries to NAT/SIT area */
err = f2fs_flush_nat_entries(sbi, cpc);
- if (err)
+ if (err) {
+ f2fs_err(sbi, "f2fs_flush_nat_entries failed err:%d, stop checkpoint", err);
+ f2fs_bug_on(sbi, !f2fs_cp_error(sbi));
goto stop;
+ }
f2fs_flush_sit_entries(sbi, cpc);
@@ -1648,10 +1674,13 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_save_inmem_curseg(sbi);
err = do_checkpoint(sbi, cpc);
- if (err)
+ if (err) {
+ f2fs_err(sbi, "do_checkpoint failed err:%d, stop checkpoint", err);
+ f2fs_bug_on(sbi, !f2fs_cp_error(sbi));
f2fs_release_discard_addrs(sbi);
- else
+ } else {
f2fs_clear_prefree_segments(sbi, cpc);
+ }
f2fs_restore_inmem_curseg(sbi);
stop:
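
The __add_ino_entry() rework in checkpoint.c above allocates the possible new entry outside the spinlock, re-checks under the lock, and retries only when both the lookup and the earlier allocation came up empty. A condensed sketch of that lookup-or-insert pattern with hypothetical demo_* names (error handling reduced to the same retry-until-success behaviour as the hunk):

#include <linux/radix-tree.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct demo_entry {
	unsigned long key;
};

struct demo_tree {
	spinlock_t lock;
	struct radix_tree_root root;
};

static struct kmem_cache *demo_slab;	/* created at init time */

static struct demo_entry *demo_get_entry(struct demo_tree *tree,
					 unsigned long key)
{
	struct demo_entry *e = NULL, *new = NULL;

	while (!e) {
		/* allocate outside the lock; may fail and be retried */
		if (!new)
			new = kmem_cache_zalloc(demo_slab, GFP_NOFS);

		radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);
		spin_lock(&tree->lock);
		e = radix_tree_lookup(&tree->root, key);
		if (!e && new) {
			/* still missing and a spare is available: insert */
			new->key = key;
			radix_tree_insert(&tree->root, key, new);
			e = new;
			new = NULL;
		}
		spin_unlock(&tree->lock);
		radix_tree_preload_end();
	}

	/* lost the race, or found an existing entry: drop the spare */
	if (new)
		kmem_cache_free(demo_slab, new);
	return e;
}

The FLUSH_INO fast path in the hunk additionally does an RCU lookup before allocating at all, so the common "entry already exists" case never touches the slab.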
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 455561826c7d..c1bf9ad4c220 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -28,7 +28,8 @@ static void *page_array_alloc(struct inode *inode, int nr)
unsigned int size = sizeof(struct page *) * nr;
if (likely(size <= sbi->page_array_slab_size))
- return kmem_cache_zalloc(sbi->page_array_slab, GFP_NOFS);
+ return f2fs_kmem_cache_alloc(sbi->page_array_slab,
+ GFP_F2FS_ZERO, false, F2FS_I_SB(inode));
return f2fs_kzalloc(sbi, size, GFP_NOFS);
}
@@ -898,6 +899,54 @@ static bool cluster_has_invalid_data(struct compress_ctx *cc)
return false;
}
+bool f2fs_sanity_check_cluster(struct dnode_of_data *dn)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
+ unsigned int cluster_size = F2FS_I(dn->inode)->i_cluster_size;
+ bool compressed = dn->data_blkaddr == COMPRESS_ADDR;
+ int cluster_end = 0;
+ int i;
+ char *reason = "";
+
+ if (!compressed)
+ return false;
+
+ /* [..., COMPR_ADDR, ...] */
+ if (dn->ofs_in_node % cluster_size) {
+ reason = "[*|C|*|*]";
+ goto out;
+ }
+
+ for (i = 1; i < cluster_size; i++) {
+ block_t blkaddr = data_blkaddr(dn->inode, dn->node_page,
+ dn->ofs_in_node + i);
+
+ /* [COMPR_ADDR, ..., COMPR_ADDR] */
+ if (blkaddr == COMPRESS_ADDR) {
+ reason = "[C|*|C|*]";
+ goto out;
+ }
+ if (compressed) {
+ if (!__is_valid_data_blkaddr(blkaddr)) {
+ if (!cluster_end)
+ cluster_end = i;
+ continue;
+ }
+ /* [COMPR_ADDR, NULL_ADDR or NEW_ADDR, valid_blkaddr] */
+ if (cluster_end) {
+ reason = "[C|N|N|V]";
+ goto out;
+ }
+ }
+ }
+ return false;
+out:
+ f2fs_warn(sbi, "access invalid cluster, ino:%lu, nid:%u, ofs_in_node:%u, reason:%s",
+ dn->inode->i_ino, dn->nid, dn->ofs_in_node, reason);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ return true;
+}
+
static int __f2fs_cluster_blocks(struct inode *inode,
unsigned int cluster_idx, bool compr)
{
@@ -915,6 +964,11 @@ static int __f2fs_cluster_blocks(struct inode *inode,
goto fail;
}
+ if (f2fs_sanity_check_cluster(&dn)) {
+ ret = -EFSCORRUPTED;
+ goto fail;
+ }
+
if (dn.data_blkaddr == COMPRESS_ADDR) {
int i;
@@ -1228,7 +1282,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
fio.version = ni.version;
- cic = kmem_cache_zalloc(cic_entry_slab, GFP_NOFS);
+ cic = f2fs_kmem_cache_alloc(cic_entry_slab, GFP_F2FS_ZERO, false, sbi);
if (!cic)
goto out_put_dnode;
@@ -1340,12 +1394,6 @@ out_destroy_crypt:
for (--i; i >= 0; i--)
fscrypt_finalize_bounce_page(&cc->cpages[i]);
- for (i = 0; i < cc->nr_cpages; i++) {
- if (!cc->cpages[i])
- continue;
- f2fs_compress_free_page(cc->cpages[i]);
- cc->cpages[i] = NULL;
- }
out_put_cic:
kmem_cache_free(cic_entry_slab, cic);
out_put_dnode:
@@ -1356,6 +1404,12 @@ out_unlock_op:
else
f2fs_unlock_op(sbi);
out_free:
+ for (i = 0; i < cc->nr_cpages; i++) {
+ if (!cc->cpages[i])
+ continue;
+ f2fs_compress_free_page(cc->cpages[i]);
+ cc->cpages[i] = NULL;
+ }
page_array_free(cc->inode, cc->cpages, cc->nr_cpages);
cc->cpages = NULL;
return -EAGAIN;
@@ -1506,7 +1560,8 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
pgoff_t start_idx = start_idx_of_cluster(cc);
int i;
- dic = kmem_cache_zalloc(dic_entry_slab, GFP_NOFS);
+ dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO,
+ false, F2FS_I_SB(cc->inode));
if (!dic)
return ERR_PTR(-ENOMEM);
@@ -1666,6 +1721,30 @@ void f2fs_put_page_dic(struct page *page)
f2fs_put_dic(dic);
}
+/*
+ * check whether cluster blocks are contiguous, and add extent cache entry
+ * only if cluster blocks are logically and physically contiguous.
+ */
+unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn)
+{
+ bool compressed = f2fs_data_blkaddr(dn) == COMPRESS_ADDR;
+ int i = compressed ? 1 : 0;
+ block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_page,
+ dn->ofs_in_node + i);
+
+ for (i += 1; i < F2FS_I(dn->inode)->i_cluster_size; i++) {
+ block_t blkaddr = data_blkaddr(dn->inode, dn->node_page,
+ dn->ofs_in_node + i);
+
+ if (!__is_valid_data_blkaddr(blkaddr))
+ break;
+ if (first_blkaddr + i - (compressed ? 1 : 0) != blkaddr)
+ return 0;
+ }
+
+ return compressed ? i - 1 : i;
+}
+
const struct address_space_operations f2fs_compress_aops = {
.releasepage = f2fs_release_page,
.invalidatepage = f2fs_invalidate_page,
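
f2fs_sanity_check_cluster() above rejects clusters whose block-address layout could not have been produced by the compressed-write path. Reading the reason strings in the hunk, with a cluster size of 4 and C = COMPRESS_ADDR, V = valid block address, N = NULL_ADDR/NEW_ADDR, the layouts break down roughly as follows (the "valid" line is inferred, not spelled out in the diff):

  [C|V|V|N]   valid: COMPRESS_ADDR only in slot 0, data blocks, then unused slots
  [*|C|*|*]   rejected: COMPRESS_ADDR at a non-zero offset within the cluster
  [C|*|C|*]   rejected: more than one COMPRESS_ADDR in the same cluster
  [C|N|N|V]   rejected: a valid block address after a NULL/NEW gap

Any rejected layout sets SBI_NEED_FSCK and makes the caller fail with -EFSCORRUPTED instead of following a bogus address.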
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index d2cf48c5a2e4..f4fd6c246c9a 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -25,6 +25,7 @@
#include "f2fs.h"
#include "node.h"
#include "segment.h"
+#include "iostat.h"
#include <trace/events/f2fs.h>
#define NUM_PREALLOC_POST_READ_CTXS 128
@@ -116,6 +117,7 @@ struct bio_post_read_ctx {
struct f2fs_sb_info *sbi;
struct work_struct work;
unsigned int enabled_steps;
+ block_t fs_blkaddr;
};
static void f2fs_finish_read_bio(struct bio *bio)
@@ -228,7 +230,7 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx)
struct bio_vec *bv;
struct bvec_iter_all iter_all;
bool all_compressed = true;
- block_t blkaddr = SECTOR_TO_BLOCK(ctx->bio->bi_iter.bi_sector);
+ block_t blkaddr = ctx->fs_blkaddr;
bio_for_each_segment_all(bv, ctx->bio, iter_all) {
struct page *page = bv->bv_page;
@@ -269,7 +271,10 @@ static void f2fs_post_read_work(struct work_struct *work)
static void f2fs_read_end_io(struct bio *bio)
{
struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio));
- struct bio_post_read_ctx *ctx = bio->bi_private;
+ struct bio_post_read_ctx *ctx;
+
+ iostat_update_and_unbind_ctx(bio, 0);
+ ctx = bio->bi_private;
if (time_to_inject(sbi, FAULT_READ_IO)) {
f2fs_show_injection_info(sbi, FAULT_READ_IO);
@@ -291,10 +296,13 @@ static void f2fs_read_end_io(struct bio *bio)
static void f2fs_write_end_io(struct bio *bio)
{
- struct f2fs_sb_info *sbi = bio->bi_private;
+ struct f2fs_sb_info *sbi;
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
+ iostat_update_and_unbind_ctx(bio, 1);
+ sbi = bio->bi_private;
+
if (time_to_inject(sbi, FAULT_WRITE_IO)) {
f2fs_show_injection_info(sbi, FAULT_WRITE_IO);
bio->bi_status = BLK_STS_IOERR;
@@ -398,6 +406,8 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi,
fio->type, fio->temp);
}
+ iostat_alloc_and_bind_ctx(sbi, bio, NULL);
+
if (fio->io_wbc)
wbc_init_bio(fio->io_wbc, bio);
@@ -479,6 +489,8 @@ submit_io:
trace_f2fs_submit_read_bio(sbi->sb, type, bio);
else
trace_f2fs_submit_write_bio(sbi->sb, type, bio);
+
+ iostat_update_submit_ctx(bio, type);
submit_bio(bio);
}
@@ -723,7 +735,7 @@ static void add_bio_entry(struct f2fs_sb_info *sbi, struct bio *bio,
struct f2fs_bio_info *io = sbi->write_io[DATA] + temp;
struct bio_entry *be;
- be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS);
+ be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS, true, NULL);
be->bio = bio;
bio_get(bio);
@@ -970,7 +982,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct bio *bio;
- struct bio_post_read_ctx *ctx;
+ struct bio_post_read_ctx *ctx = NULL;
unsigned int post_read_steps = 0;
bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL,
@@ -1003,8 +1015,10 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
ctx->bio = bio;
ctx->sbi = sbi;
ctx->enabled_steps = post_read_steps;
+ ctx->fs_blkaddr = blkaddr;
bio->bi_private = ctx;
}
+ iostat_alloc_and_bind_ctx(sbi, bio, ctx);
return bio;
}
@@ -1133,7 +1147,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index)
{
- struct extent_info ei = {0, 0, 0};
+ struct extent_info ei = {0, };
struct inode *inode = dn->inode;
if (f2fs_lookup_extent_cache(inode, index, &ei)) {
@@ -1150,7 +1164,7 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
struct page *page;
- struct extent_info ei = {0,0,0};
+ struct extent_info ei = {0, };
int err;
page = f2fs_grab_cache_page(mapping, index, for_write);
@@ -1448,7 +1462,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
int err = 0, ofs = 1;
unsigned int ofs_in_node, last_ofs_in_node;
blkcnt_t prealloc;
- struct extent_info ei = {0,0,0};
+ struct extent_info ei = {0, };
block_t blkaddr;
unsigned int start_pgofs;
@@ -1490,7 +1504,21 @@ next_dnode:
if (err) {
if (flag == F2FS_GET_BLOCK_BMAP)
map->m_pblk = 0;
+
if (err == -ENOENT) {
+ /*
+ * There is one exceptional case: read_node_page() may
+ * return -ENOENT because the filesystem has been shut
+ * down or hit a checkpoint error, so force the error
+ * number to EIO in that case.
+ */
+ if (map->m_may_create &&
+ (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) ||
+ f2fs_cp_error(sbi))) {
+ err = -EIO;
+ goto unlock_out;
+ }
+
err = 0;
if (map->m_next_pgofs)
*map->m_next_pgofs =
@@ -1550,6 +1578,13 @@ next_block:
map->m_flags |= F2FS_MAP_NEW;
blkaddr = dn.data_blkaddr;
} else {
+ if (f2fs_compressed_file(inode) &&
+ f2fs_sanity_check_cluster(&dn) &&
+ (flag != F2FS_GET_BLOCK_FIEMAP ||
+ IS_ENABLED(CONFIG_F2FS_CHECK_FS))) {
+ err = -EFSCORRUPTED;
+ goto sync_out;
+ }
if (flag == F2FS_GET_BLOCK_BMAP) {
map->m_pblk = 0;
goto sync_out;
@@ -1843,8 +1878,9 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 logical = 0, phys = 0, size = 0;
u32 flags = 0;
int ret = 0;
- bool compr_cluster = false;
+ bool compr_cluster = false, compr_appended;
unsigned int cluster_size = F2FS_I(inode)->i_cluster_size;
+ unsigned int count_in_cluster = 0;
loff_t maxbytes;
if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
@@ -1892,15 +1928,17 @@ next:
map.m_next_pgofs = &next_pgofs;
map.m_seg_type = NO_CHECK_TYPE;
- if (compr_cluster)
- map.m_len = cluster_size - 1;
+ if (compr_cluster) {
+ map.m_lblk += 1;
+ map.m_len = cluster_size - count_in_cluster;
+ }
ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP);
if (ret)
goto out;
/* HOLE */
- if (!(map.m_flags & F2FS_MAP_FLAGS)) {
+ if (!compr_cluster && !(map.m_flags & F2FS_MAP_FLAGS)) {
start_blk = next_pgofs;
if (blks_to_bytes(inode, start_blk) < blks_to_bytes(inode,
@@ -1910,6 +1948,14 @@ next:
flags |= FIEMAP_EXTENT_LAST;
}
+ compr_appended = false;
+ /* In the case of a compressed cluster, append this to the last extent */
+ if (compr_cluster && ((map.m_flags & F2FS_MAP_UNWRITTEN) ||
+ !(map.m_flags & F2FS_MAP_FLAGS))) {
+ compr_appended = true;
+ goto skip_fill;
+ }
+
if (size) {
flags |= FIEMAP_EXTENT_MERGED;
if (IS_ENCRYPTED(inode))
@@ -1926,38 +1972,36 @@ next:
if (start_blk > last_blk)
goto out;
- if (compr_cluster) {
- compr_cluster = false;
-
-
- logical = blks_to_bytes(inode, start_blk - 1);
- phys = blks_to_bytes(inode, map.m_pblk);
- size = blks_to_bytes(inode, cluster_size);
-
- flags |= FIEMAP_EXTENT_ENCODED;
-
- start_blk += cluster_size - 1;
-
- if (start_blk > last_blk)
- goto out;
-
- goto prep_next;
- }
-
+skip_fill:
if (map.m_pblk == COMPRESS_ADDR) {
compr_cluster = true;
- start_blk++;
- goto prep_next;
- }
-
- logical = blks_to_bytes(inode, start_blk);
- phys = blks_to_bytes(inode, map.m_pblk);
- size = blks_to_bytes(inode, map.m_len);
- flags = 0;
- if (map.m_flags & F2FS_MAP_UNWRITTEN)
- flags = FIEMAP_EXTENT_UNWRITTEN;
+ count_in_cluster = 1;
+ } else if (compr_appended) {
+ unsigned int appended_blks = cluster_size -
+ count_in_cluster + 1;
+ size += blks_to_bytes(inode, appended_blks);
+ start_blk += appended_blks;
+ compr_cluster = false;
+ } else {
+ logical = blks_to_bytes(inode, start_blk);
+ phys = __is_valid_data_blkaddr(map.m_pblk) ?
+ blks_to_bytes(inode, map.m_pblk) : 0;
+ size = blks_to_bytes(inode, map.m_len);
+ flags = 0;
+
+ if (compr_cluster) {
+ flags = FIEMAP_EXTENT_ENCODED;
+ count_in_cluster += map.m_len;
+ if (count_in_cluster == cluster_size) {
+ compr_cluster = false;
+ size += blks_to_bytes(inode, 1);
+ }
+ } else if (map.m_flags & F2FS_MAP_UNWRITTEN) {
+ flags = FIEMAP_EXTENT_UNWRITTEN;
+ }
- start_blk += bytes_to_blks(inode, size);
+ start_blk += bytes_to_blks(inode, size);
+ }
prep_next:
cond_resched();
@@ -2115,6 +2159,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
sector_t last_block_in_file;
const unsigned blocksize = blks_to_bytes(inode, 1);
struct decompress_io_ctx *dic = NULL;
+ struct extent_info ei = {0, };
+ bool from_dnode = true;
int i;
int ret = 0;
@@ -2137,6 +2183,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
continue;
}
unlock_page(page);
+ if (for_write)
+ put_page(page);
cc->rpages[i] = NULL;
cc->nr_rpages--;
}
@@ -2145,6 +2193,12 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
if (f2fs_cluster_is_empty(cc))
goto out;
+ if (f2fs_lookup_extent_cache(inode, start_idx, &ei))
+ from_dnode = false;
+
+ if (!from_dnode)
+ goto skip_reading_dnode;
+
set_new_dnode(&dn, inode, NULL, NULL, 0);
ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE);
if (ret)
@@ -2152,11 +2206,13 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
f2fs_bug_on(sbi, dn.data_blkaddr != COMPRESS_ADDR);
+skip_reading_dnode:
for (i = 1; i < cc->cluster_size; i++) {
block_t blkaddr;
- blkaddr = data_blkaddr(dn.inode, dn.node_page,
- dn.ofs_in_node + i);
+ blkaddr = from_dnode ? data_blkaddr(dn.inode, dn.node_page,
+ dn.ofs_in_node + i) :
+ ei.blk + i - 1;
if (!__is_valid_data_blkaddr(blkaddr))
break;
@@ -2166,6 +2222,9 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
goto out_put_dnode;
}
cc->nr_cpages++;
+
+ if (!from_dnode && i >= ei.c_len)
+ break;
}
/* nothing to decompress */
@@ -2185,8 +2244,9 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
block_t blkaddr;
struct bio_post_read_ctx *ctx;
- blkaddr = data_blkaddr(dn.inode, dn.node_page,
- dn.ofs_in_node + i + 1);
+ blkaddr = from_dnode ? data_blkaddr(dn.inode, dn.node_page,
+ dn.ofs_in_node + i + 1) :
+ ei.blk + i;
f2fs_wait_on_block_writeback(inode, blkaddr);
@@ -2220,7 +2280,7 @@ submit_and_realloc:
if (bio_add_page(bio, page, blocksize, 0) < blocksize)
goto submit_and_realloc;
- ctx = bio->bi_private;
+ ctx = get_post_read_ctx(bio);
ctx->enabled_steps |= STEP_DECOMPRESS;
refcount_inc(&dic->refcnt);
@@ -2231,13 +2291,15 @@ submit_and_realloc:
*last_block_in_bio = blkaddr;
}
- f2fs_put_dnode(&dn);
+ if (from_dnode)
+ f2fs_put_dnode(&dn);
*bio_ret = bio;
return 0;
out_put_dnode:
- f2fs_put_dnode(&dn);
+ if (from_dnode)
+ f2fs_put_dnode(&dn);
out:
for (i = 0; i < cc->cluster_size; i++) {
if (cc->rpages[i]) {
@@ -2272,6 +2334,7 @@ static int f2fs_mpage_readpages(struct inode *inode,
.nr_rpages = 0,
.nr_cpages = 0,
};
+ pgoff_t nc_cluster_idx = NULL_CLUSTER;
#endif
unsigned nr_pages = rac ? readahead_count(rac) : 1;
unsigned max_nr_pages = nr_pages;
@@ -2304,12 +2367,23 @@ static int f2fs_mpage_readpages(struct inode *inode,
if (ret)
goto set_error_page;
}
- ret = f2fs_is_compressed_cluster(inode, page->index);
- if (ret < 0)
- goto set_error_page;
- else if (!ret)
- goto read_single_page;
+ if (cc.cluster_idx == NULL_CLUSTER) {
+ if (nc_cluster_idx ==
+ page->index >> cc.log_cluster_size) {
+ goto read_single_page;
+ }
+ ret = f2fs_is_compressed_cluster(inode, page->index);
+ if (ret < 0)
+ goto set_error_page;
+ else if (!ret) {
+ nc_cluster_idx =
+ page->index >> cc.log_cluster_size;
+ goto read_single_page;
+ }
+
+ nc_cluster_idx = NULL_CLUSTER;
+ }
ret = f2fs_init_compress_ctx(&cc);
if (ret)
goto set_error_page;
@@ -2498,6 +2572,8 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio)
return true;
if (f2fs_is_atomic_file(inode))
return true;
+ if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
+ return true;
/* swap file is migrating in aligned write mode */
if (is_inode_flag_set(inode, FI_ALIGNED_WRITE))
@@ -2530,7 +2606,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
struct page *page = fio->page;
struct inode *inode = page->mapping->host;
struct dnode_of_data dn;
- struct extent_info ei = {0,0,0};
+ struct extent_info ei = {0, };
struct node_info ni;
bool ipu_force = false;
int err = 0;
@@ -3176,9 +3252,8 @@ static int f2fs_write_data_pages(struct address_space *mapping,
FS_CP_DATA_IO : FS_DATA_IO);
}
-static void f2fs_write_failed(struct address_space *mapping, loff_t to)
+static void f2fs_write_failed(struct inode *inode, loff_t to)
{
- struct inode *inode = mapping->host;
loff_t i_size = i_size_read(inode);
if (IS_NOQUOTA(inode))
@@ -3187,12 +3262,12 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to)
/* In the fs-verity case, f2fs_end_enable_verity() does the truncate */
if (to > i_size && !f2fs_verity_in_progress(inode)) {
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- down_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
truncate_pagecache(inode, i_size);
f2fs_truncate_blocks(inode, i_size, true);
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
}
}
@@ -3206,7 +3281,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
struct dnode_of_data dn;
struct page *ipage;
bool locked = false;
- struct extent_info ei = {0,0,0};
+ struct extent_info ei = {0, };
int err = 0;
int flag;
@@ -3328,6 +3403,9 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
*fsdata = NULL;
+ if (len == PAGE_SIZE)
+ goto repeat;
+
ret = f2fs_prepare_compress_overwrite(inode, pagep,
index, fsdata);
if (ret < 0) {
@@ -3410,7 +3488,7 @@ repeat:
fail:
f2fs_put_page(page, 1);
- f2fs_write_failed(mapping, pos + len);
+ f2fs_write_failed(inode, pos + len);
if (drop_atomic)
f2fs_drop_inmem_pages_all(sbi, false);
return err;
@@ -3552,7 +3630,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (f2fs_force_buffered_io(inode, iocb, iter))
return 0;
- do_opu = allow_outplace_dio(inode, iocb, iter);
+ do_opu = rw == WRITE && f2fs_lfs_mode(sbi);
trace_f2fs_direct_IO_enter(inode, offset, count, rw);
@@ -3600,7 +3678,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO,
count - iov_iter_count(iter));
} else if (err < 0) {
- f2fs_write_failed(mapping, offset + count);
+ f2fs_write_failed(inode, offset + count);
}
} else {
if (err > 0)
@@ -3852,7 +3930,7 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk,
int ret = 0;
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- down_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
set_inode_flag(inode, FI_ALIGNED_WRITE);
@@ -3894,7 +3972,7 @@ done:
clear_inode_flag(inode, FI_DO_DEFRAG);
clear_inode_flag(inode, FI_ALIGNED_WRITE);
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
return ret;
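
The read path in data.c now records the filesystem block address in the bio_post_read_ctx at submission time (ctx->fs_blkaddr), and the decompress step uses that instead of converting bio->bi_iter.bi_sector back into a block number at completion. A minimal sketch of the idea with hypothetical demo_* names; the motivation suggested here (the sector may no longer map 1:1 to a filesystem block once the bio has been remapped or spans devices) is an assumption, not stated in the diff:

#include <linux/bio.h>
#include <linux/printk.h>
#include <linux/types.h>

/* Per-bio completion context that carries the fs-level address. */
struct demo_read_ctx {
	struct bio *bio;
	u32 first_blkaddr;	/* fs block of the first page in the bio */
};

static void demo_prepare_read(struct bio *bio, struct demo_read_ctx *ctx,
			      u32 blkaddr)
{
	ctx->bio = bio;
	ctx->first_blkaddr = blkaddr;	/* captured while it is still known */
	bio->bi_private = ctx;
}

static void demo_read_end_io(struct bio *bio)
{
	struct demo_read_ctx *ctx = bio->bi_private;

	/* walk the bio's pages starting from ctx->first_blkaddr rather
	 * than recomputing the address from bio->bi_iter.bi_sector */
	pr_debug("read completed at block %u\n", ctx->first_blkaddr);
}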
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 833325038ef3..8c50518475a9 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -323,11 +323,27 @@ get_cache:
#endif
}
+static char *s_flag[] = {
+ [SBI_IS_DIRTY] = " fs_dirty",
+ [SBI_IS_CLOSE] = " closing",
+ [SBI_NEED_FSCK] = " need_fsck",
+ [SBI_POR_DOING] = " recovering",
+ [SBI_NEED_SB_WRITE] = " sb_dirty",
+ [SBI_NEED_CP] = " need_cp",
+ [SBI_IS_SHUTDOWN] = " shutdown",
+ [SBI_IS_RECOVERED] = " recovered",
+ [SBI_CP_DISABLED] = " cp_disabled",
+ [SBI_CP_DISABLED_QUICK] = " cp_disabled_quick",
+ [SBI_QUOTA_NEED_FLUSH] = " quota_need_flush",
+ [SBI_QUOTA_SKIP_FLUSH] = " quota_skip_flush",
+ [SBI_QUOTA_NEED_REPAIR] = " quota_need_repair",
+ [SBI_IS_RESIZEFS] = " resizefs",
+};
+
static int stat_show(struct seq_file *s, void *v)
{
struct f2fs_stat_info *si;
- int i = 0;
- int j;
+ int i = 0, j = 0;
mutex_lock(&f2fs_stat_mutex);
list_for_each_entry(si, &f2fs_stat_list, stat_list) {
@@ -337,7 +353,13 @@ static int stat_show(struct seq_file *s, void *v)
si->sbi->sb->s_bdev, i++,
f2fs_readonly(si->sbi->sb) ? "RO": "RW",
is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ?
- "Disabled": (f2fs_cp_error(si->sbi) ? "Error": "Good"));
+ "Disabled" : (f2fs_cp_error(si->sbi) ? "Error" : "Good"));
+ if (si->sbi->s_flag) {
+ seq_puts(s, "[SBI:");
+ for_each_set_bit(j, &si->sbi->s_flag, 32)
+ seq_puts(s, s_flag[j]);
+ seq_puts(s, "]\n");
+ }
seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ",
si->sit_area_segs, si->nat_area_segs);
seq_printf(s, "[SSA: %d] [MAIN: %d",
@@ -450,6 +472,15 @@ static int stat_show(struct seq_file *s, void *v)
si->data_segs, si->bg_data_segs);
seq_printf(s, " - node segments : %d (%d)\n",
si->node_segs, si->bg_node_segs);
+ seq_printf(s, " - Reclaimed segs : Normal (%d), Idle CB (%d), "
+ "Idle Greedy (%d), Idle AT (%d), "
+ "Urgent High (%d), Urgent Low (%d)\n",
+ si->sbi->gc_reclaimed_segs[GC_NORMAL],
+ si->sbi->gc_reclaimed_segs[GC_IDLE_CB],
+ si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY],
+ si->sbi->gc_reclaimed_segs[GC_IDLE_AT],
+ si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH],
+ si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]);
seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks,
si->bg_data_blks + si->bg_node_blks);
seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks,
@@ -611,7 +642,7 @@ void __init f2fs_create_root_stats(void)
#ifdef CONFIG_DEBUG_FS
f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL);
- debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL,
+ debugfs_create_file("status", 0444, f2fs_debugfs_root, NULL,
&stat_fops);
#endif
}
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 456651682daf..1820e9c106f7 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -83,8 +83,8 @@ int f2fs_init_casefolded_name(const struct inode *dir,
struct super_block *sb = dir->i_sb;
if (IS_CASEFOLDED(dir)) {
- fname->cf_name.name = kmem_cache_alloc(f2fs_cf_name_slab,
- GFP_NOFS);
+ fname->cf_name.name = f2fs_kmem_cache_alloc(f2fs_cf_name_slab,
+ GFP_NOFS, false, F2FS_SB(sb));
if (!fname->cf_name.name)
return -ENOMEM;
fname->cf_name.len = utf8_casefold(sb->s_encoding,
@@ -1000,6 +1000,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode);
struct blk_plug plug;
bool readdir_ra = sbi->readdir_ra == 1;
+ bool found_valid_dirent = false;
int err = 0;
bit_pos = ((unsigned long)ctx->pos % d->max);
@@ -1014,13 +1015,15 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
de = &d->dentry[bit_pos];
if (de->name_len == 0) {
+ if (found_valid_dirent || !bit_pos) {
+ printk_ratelimited(
+ "%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.",
+ KERN_WARNING, sbi->sb->s_id,
+ le32_to_cpu(de->ino));
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ }
bit_pos++;
ctx->pos = start_pos + bit_pos;
- printk_ratelimited(
- "%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.",
- KERN_WARNING, sbi->sb->s_id,
- le32_to_cpu(de->ino));
- set_sbi_flag(sbi, SBI_NEED_FSCK);
continue;
}
@@ -1063,6 +1066,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
f2fs_ra_node_page(sbi, le32_to_cpu(de->ino));
ctx->pos = start_pos + bit_pos;
+ found_valid_dirent = true;
}
out:
if (readdir_ra)
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 3ebf976a682d..866e72b29bd5 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -239,7 +239,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
{
struct extent_node *en;
- en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC);
+ en = f2fs_kmem_cache_alloc(extent_node_slab, GFP_ATOMIC, false, sbi);
if (!en)
return NULL;
@@ -292,7 +292,8 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode)
mutex_lock(&sbi->extent_tree_lock);
et = radix_tree_lookup(&sbi->extent_tree_root, ino);
if (!et) {
- et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS);
+ et = f2fs_kmem_cache_alloc(extent_tree_slab,
+ GFP_NOFS, true, NULL);
f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et);
memset(et, 0, sizeof(struct extent_tree));
et->ino = ino;
@@ -661,6 +662,47 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
f2fs_mark_inode_dirty_sync(inode, true);
}
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+void f2fs_update_extent_tree_range_compressed(struct inode *inode,
+ pgoff_t fofs, block_t blkaddr, unsigned int llen,
+ unsigned int c_len)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ struct extent_node *en = NULL;
+ struct extent_node *prev_en = NULL, *next_en = NULL;
+ struct extent_info ei;
+ struct rb_node **insert_p = NULL, *insert_parent = NULL;
+ bool leftmost = false;
+
+ trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen);
+
+ /* it is safe here to check FI_NO_EXTENT w/o et->lock in ro image */
+ if (is_inode_flag_set(inode, FI_NO_EXTENT))
+ return;
+
+ write_lock(&et->lock);
+
+ en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root,
+ (struct rb_entry *)et->cached_en, fofs,
+ (struct rb_entry **)&prev_en,
+ (struct rb_entry **)&next_en,
+ &insert_p, &insert_parent, false,
+ &leftmost);
+ if (en)
+ goto unlock_out;
+
+ set_extent_info(&ei, fofs, blkaddr, llen);
+ ei.c_len = c_len;
+
+ if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en))
+ __insert_extent_tree(sbi, et, &ei,
+ insert_p, insert_parent, leftmost);
+unlock_out:
+ write_unlock(&et->lock);
+}
+#endif
+
unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
{
struct extent_tree *et, *next;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index ee8eb33e2c25..b339ae89c1ad 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -43,6 +43,7 @@ enum {
FAULT_KVMALLOC,
FAULT_PAGE_ALLOC,
FAULT_PAGE_GET,
+ FAULT_ALLOC_BIO, /* it's obsolete because bio_alloc() never fails */
FAULT_ALLOC_NID,
FAULT_ORPHAN,
FAULT_BLOCK,
@@ -53,6 +54,7 @@ enum {
FAULT_CHECKPOINT,
FAULT_DISCARD,
FAULT_WRITE_IO,
+ FAULT_SLAB_ALLOC,
FAULT_MAX,
};
@@ -139,6 +141,11 @@ struct f2fs_mount_info {
int fsync_mode; /* fsync policy */
int fs_mode; /* fs mode: LFS or ADAPTIVE */
int bggc_mode; /* bggc mode: off, on or sync */
+ int discard_unit; /*
+ * discard command's offset/size should
+ * be aligned to this unit: block,
+ * segment or section
+ */
struct fscrypt_dummy_policy dummy_enc_policy; /* test dummy encryption */
block_t unusable_cap_perc; /* percentage for cap */
block_t unusable_cap; /* Amount of space allowed to be
@@ -542,7 +549,7 @@ enum {
*/
};
-#define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO count */
+#define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO or flush count */
/* congestion wait timeout value, default: 20ms */
#define DEFAULT_IO_TIMEOUT (msecs_to_jiffies(20))
@@ -575,6 +582,9 @@ struct extent_info {
unsigned int fofs; /* start offset in a file */
unsigned int len; /* length of the extent */
u32 blk; /* start block address of the extent */
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ unsigned int c_len; /* physical extent length of compressed blocks */
+#endif
};
struct extent_node {
@@ -754,7 +764,6 @@ struct f2fs_inode_info {
/* avoid racing between foreground op and gc */
struct rw_semaphore i_gc_rwsem[2];
- struct rw_semaphore i_mmap_sem;
struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */
int i_extra_isize; /* size of extra space located in i_addr */
@@ -794,6 +803,9 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs,
ei->fofs = fofs;
ei->blk = blk;
ei->len = len;
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ ei->c_len = 0;
+#endif
}
static inline bool __is_discard_mergeable(struct discard_info *back,
@@ -818,6 +830,12 @@ static inline bool __is_discard_front_mergeable(struct discard_info *cur,
static inline bool __is_extent_mergeable(struct extent_info *back,
struct extent_info *front)
{
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (back->c_len && back->len != back->c_len)
+ return false;
+ if (front->c_len && front->len != front->c_len)
+ return false;
+#endif
return (back->fofs + back->len == front->fofs &&
back->blk + back->len == front->blk);
}
@@ -1253,6 +1271,7 @@ enum {
GC_IDLE_AT,
GC_URGENT_HIGH,
GC_URGENT_LOW,
+ MAX_GC_MODE,
};
enum {
@@ -1298,6 +1317,12 @@ enum {
*/
};
+enum {
+ DISCARD_UNIT_BLOCK, /* basic discard unit is block */
+ DISCARD_UNIT_SEGMENT, /* basic discard unit is segment */
+ DISCARD_UNIT_SECTION, /* basic discard unit is section */
+};
+
static inline int f2fs_test_bit(unsigned int nr, char *addr);
static inline void f2fs_set_bit(unsigned int nr, char *addr);
static inline void f2fs_clear_bit(unsigned int nr, char *addr);
@@ -1687,14 +1712,6 @@ struct f2fs_sb_info {
#endif
spinlock_t stat_lock; /* lock for stat operations */
- /* For app/fs IO statistics */
- spinlock_t iostat_lock;
- unsigned long long rw_iostat[NR_IO_TYPE];
- unsigned long long prev_rw_iostat[NR_IO_TYPE];
- bool iostat_enable;
- unsigned long iostat_next_period;
- unsigned int iostat_period_ms;
-
/* to attach REQ_META|REQ_FUA flags */
unsigned int data_io_flag;
unsigned int node_io_flag;
@@ -1733,6 +1750,12 @@ struct f2fs_sb_info {
struct kmem_cache *inline_xattr_slab; /* inline xattr entry */
unsigned int inline_xattr_slab_size; /* default inline xattr slab size */
+ /* For reclaimed segs statistics per each GC mode */
+ unsigned int gc_segment_mode; /* GC state for reclaimed segments */
+ unsigned int gc_reclaimed_segs[MAX_GC_MODE]; /* Reclaimed segs for each mode */
+
+ unsigned long seq_file_ra_mul; /* multiplier for ra_pages of seq. files in fadvise */
+
#ifdef CONFIG_F2FS_FS_COMPRESSION
struct kmem_cache *page_array_slab; /* page array entry */
unsigned int page_array_slab_size; /* default page array slab size */
@@ -1748,6 +1771,20 @@ struct f2fs_sb_info {
unsigned int compress_watermark; /* cache page watermark */
atomic_t compress_page_hit; /* cache hit count */
#endif
+
+#ifdef CONFIG_F2FS_IOSTAT
+ /* For app/fs IO statistics */
+ spinlock_t iostat_lock;
+ unsigned long long rw_iostat[NR_IO_TYPE];
+ unsigned long long prev_rw_iostat[NR_IO_TYPE];
+ bool iostat_enable;
+ unsigned long iostat_next_period;
+ unsigned int iostat_period_ms;
+
+ /* For io latency related statistics info in one iostat period */
+ spinlock_t iostat_lat_lock;
+ struct iostat_lat_info *iostat_io_lat;
+#endif
};
struct f2fs_private_dio {
@@ -2035,36 +2072,6 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f)
spin_unlock_irqrestore(&sbi->cp_lock, flags);
}
-static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock)
-{
- unsigned long flags;
- unsigned char *nat_bits;
-
- /*
- * In order to re-enable nat_bits we need to call fsck.f2fs by
- * set_sbi_flag(sbi, SBI_NEED_FSCK). But it may give huge cost,
- * so let's rely on regular fsck or unclean shutdown.
- */
-
- if (lock)
- spin_lock_irqsave(&sbi->cp_lock, flags);
- __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG);
- nat_bits = NM_I(sbi)->nat_bits;
- NM_I(sbi)->nat_bits = NULL;
- if (lock)
- spin_unlock_irqrestore(&sbi->cp_lock, flags);
-
- kvfree(nat_bits);
-}
-
-static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi,
- struct cp_control *cpc)
-{
- bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
-
- return (cpc) ? (cpc->reason & CP_UMOUNT) && set : set;
-}
-
static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
{
down_read(&sbi->cp_rwsem);
@@ -2588,7 +2595,7 @@ static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name,
return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL);
}
-static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep,
+static inline void *f2fs_kmem_cache_alloc_nofail(struct kmem_cache *cachep,
gfp_t flags)
{
void *entry;
@@ -2599,6 +2606,20 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep,
return entry;
}
+static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep,
+ gfp_t flags, bool nofail, struct f2fs_sb_info *sbi)
+{
+ if (nofail)
+ return f2fs_kmem_cache_alloc_nofail(cachep, flags);
+
+ if (time_to_inject(sbi, FAULT_SLAB_ALLOC)) {
+ f2fs_show_injection_info(sbi, FAULT_SLAB_ALLOC);
+ return NULL;
+ }
+
+ return kmem_cache_alloc(cachep, flags);
+}
+
static inline bool is_inflight_io(struct f2fs_sb_info *sbi, int type)
{
if (get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_RD_NODE) ||
@@ -3211,47 +3232,6 @@ static inline int get_inline_xattr_addrs(struct inode *inode)
sizeof((f2fs_inode)->field)) \
<= (F2FS_OLD_ATTRIBUTE_SIZE + (extra_isize))) \
-#define DEFAULT_IOSTAT_PERIOD_MS 3000
-#define MIN_IOSTAT_PERIOD_MS 100
-/* maximum period of iostat tracing is 1 day */
-#define MAX_IOSTAT_PERIOD_MS 8640000
-
-static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi)
-{
- int i;
-
- spin_lock(&sbi->iostat_lock);
- for (i = 0; i < NR_IO_TYPE; i++) {
- sbi->rw_iostat[i] = 0;
- sbi->prev_rw_iostat[i] = 0;
- }
- spin_unlock(&sbi->iostat_lock);
-}
-
-extern void f2fs_record_iostat(struct f2fs_sb_info *sbi);
-
-static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi,
- enum iostat_type type, unsigned long long io_bytes)
-{
- if (!sbi->iostat_enable)
- return;
- spin_lock(&sbi->iostat_lock);
- sbi->rw_iostat[type] += io_bytes;
-
- if (type == APP_WRITE_IO || type == APP_DIRECT_IO)
- sbi->rw_iostat[APP_BUFFERED_IO] =
- sbi->rw_iostat[APP_WRITE_IO] -
- sbi->rw_iostat[APP_DIRECT_IO];
-
- if (type == APP_READ_IO || type == APP_DIRECT_READ_IO)
- sbi->rw_iostat[APP_BUFFERED_READ_IO] =
- sbi->rw_iostat[APP_READ_IO] -
- sbi->rw_iostat[APP_DIRECT_READ_IO];
- spin_unlock(&sbi->iostat_lock);
-
- f2fs_record_iostat(sbi);
-}
-
#define __is_large_section(sbi) ((sbi)->segs_per_sec > 1)
#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO((fio)->type) == META)
@@ -3418,6 +3398,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from);
int f2fs_truncate_xattr_node(struct inode *inode);
int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
unsigned int seq_id);
+bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi);
int f2fs_remove_inode_page(struct inode *inode);
struct page *f2fs_new_inode_page(struct inode *inode);
struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs);
@@ -3442,6 +3423,7 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page);
int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page);
int f2fs_restore_node_summary(struct f2fs_sb_info *sbi,
unsigned int segno, struct f2fs_summary_block *sum);
+void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi);
int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc);
int f2fs_build_node_manager(struct f2fs_sb_info *sbi);
void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi);
@@ -3465,6 +3447,7 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi);
void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free);
void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr);
bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr);
+int f2fs_start_discard_thread(struct f2fs_sb_info *sbi);
void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi);
void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi);
bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi);
@@ -3987,6 +3970,9 @@ void f2fs_destroy_extent_cache(void);
/*
* sysfs.c
*/
+#define MIN_RA_MUL 2
+#define MAX_RA_MUL 256
+
int __init f2fs_init_sysfs(void);
void f2fs_exit_sysfs(void);
int f2fs_register_sysfs(struct f2fs_sb_info *sbi);
@@ -4041,18 +4027,23 @@ void f2fs_end_read_compressed_page(struct page *page, bool failed,
block_t blkaddr);
bool f2fs_cluster_is_empty(struct compress_ctx *cc);
bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index);
+bool f2fs_sanity_check_cluster(struct dnode_of_data *dn);
void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page);
int f2fs_write_multi_pages(struct compress_ctx *cc,
int *submitted,
struct writeback_control *wbc,
enum iostat_type io_type);
int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index);
+void f2fs_update_extent_tree_range_compressed(struct inode *inode,
+ pgoff_t fofs, block_t blkaddr, unsigned int llen,
+ unsigned int c_len);
int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
unsigned nr_pages, sector_t *last_block_in_bio,
bool is_readahead, bool for_write);
struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc);
void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed);
void f2fs_put_page_dic(struct page *page);
+unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn);
int f2fs_init_compress_ctx(struct compress_ctx *cc);
void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse);
void f2fs_init_compress_info(struct f2fs_sb_info *sbi);
@@ -4107,6 +4098,8 @@ static inline void f2fs_put_page_dic(struct page *page)
{
WARN_ON_ONCE(1);
}
+static inline unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn) { return 0; }
+static inline bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) { return false; }
static inline int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) { return 0; }
static inline void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) { }
static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; }
@@ -4122,6 +4115,9 @@ static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi,
static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi,
nid_t ino) { }
#define inc_compr_inode_stat(inode) do { } while (0)
+static inline void f2fs_update_extent_tree_range_compressed(struct inode *inode,
+ pgoff_t fofs, block_t blkaddr, unsigned int llen,
+ unsigned int c_len) { }
#endif
static inline void set_compress_context(struct inode *inode)
@@ -4137,7 +4133,8 @@ static inline void set_compress_context(struct inode *inode)
1 << COMPRESS_CHKSUM : 0;
F2FS_I(inode)->i_cluster_size =
1 << F2FS_I(inode)->i_log_cluster_size;
- if (F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 &&
+ if ((F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 ||
+ F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD) &&
F2FS_OPTION(sbi).compress_level)
F2FS_I(inode)->i_compress_flag |=
F2FS_OPTION(sbi).compress_level <<
@@ -4305,16 +4302,6 @@ static inline int block_unaligned_IO(struct inode *inode,
return align & blocksize_mask;
}
-static inline int allow_outplace_dio(struct inode *inode,
- struct kiocb *iocb, struct iov_iter *iter)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- int rw = iov_iter_rw(iter);
-
- return (f2fs_lfs_mode(sbi) && (rw == WRITE) &&
- !block_unaligned_IO(inode, iocb, iter));
-}
-
static inline bool f2fs_force_buffered_io(struct inode *inode,
struct kiocb *iocb, struct iov_iter *iter)
{
@@ -4369,6 +4356,11 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
return false;
}
+static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi)
+{
+ return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK;
+}
+
#define EFSBADCRC EBADMSG /* Bad CRC detected */
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
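
The f2fs.h hunk above splits slab allocation into f2fs_kmem_cache_alloc_nofail() and a new f2fs_kmem_cache_alloc() that takes a nofail flag plus the sbi, so non-critical allocations can be failed deliberately under FAULT_SLAB_ALLOC injection. A short sketch of how callers pick a mode, mirroring the two call sites visible in the extent_cache.c hunk; the demo_* wrappers are illustrative only:

#include "f2fs.h"

/* A path that cannot back out asks for nofail behaviour; sbi may be
 * NULL because fault injection is skipped entirely in that mode. */
static void *demo_alloc_must_succeed(struct kmem_cache *slab)
{
	return f2fs_kmem_cache_alloc(slab, GFP_NOFS, true, NULL);
}

/* A path with a real error path passes nofail=false and the sbi so
 * FAULT_SLAB_ALLOC can exercise its failure handling. */
static void *demo_alloc_may_fail(struct f2fs_sb_info *sbi,
				 struct kmem_cache *slab)
{
	void *obj = f2fs_kmem_cache_alloc(slab, GFP_ATOMIC, false, sbi);

	if (!obj)
		return NULL;	/* caller must tolerate the failure */
	return obj;
}

This matches the conversions elsewhere in the series: __grab_extent_tree() keeps the nofail behaviour, while __attach_extent_node() and the compression contexts accept a NULL return and unwind.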
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 6afd4562335f..9c8ef33bd8d3 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -23,6 +23,7 @@
#include <linux/nls.h>
#include <linux/sched/signal.h>
#include <linux/fileattr.h>
+#include <linux/fadvise.h>
#include "f2fs.h"
#include "node.h"
@@ -30,6 +31,7 @@
#include "xattr.h"
#include "acl.h"
#include "gc.h"
+#include "iostat.h"
#include <trace/events/f2fs.h>
#include <uapi/linux/f2fs.h>
@@ -38,10 +40,7 @@ static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf)
struct inode *inode = file_inode(vmf->vma->vm_file);
vm_fault_t ret;
- down_read(&F2FS_I(inode)->i_mmap_sem);
ret = filemap_fault(vmf);
- up_read(&F2FS_I(inode)->i_mmap_sem);
-
if (!ret)
f2fs_update_iostat(F2FS_I_SB(inode), APP_MAPPED_READ_IO,
F2FS_BLKSIZE);
@@ -101,7 +100,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
f2fs_bug_on(sbi, f2fs_has_inline_data(inode));
file_update_time(vmf->vma->vm_file);
- down_read(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock_shared(inode->i_mapping);
lock_page(page);
if (unlikely(page->mapping != inode->i_mapping ||
page_offset(page) > i_size_read(inode) ||
@@ -159,7 +158,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
trace_f2fs_vm_page_mkwrite(page, DATA);
out_sem:
- up_read(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock_shared(inode->i_mapping);
sb_end_pagefault(inode->i_sb);
err:
@@ -261,8 +260,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
};
unsigned int seq_id = 0;
- if (unlikely(f2fs_readonly(inode->i_sb) ||
- is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
+ if (unlikely(f2fs_readonly(inode->i_sb)))
return 0;
trace_f2fs_sync_file_enter(inode);
@@ -276,7 +274,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
ret = file_write_and_wait_range(file, start, end);
clear_inode_flag(inode, FI_NEED_IPU);
- if (ret) {
+ if (ret || is_sbi_flag_set(sbi, SBI_CP_DISABLED)) {
trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret);
return ret;
}
@@ -301,6 +299,18 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
f2fs_exist_written_data(sbi, ino, UPDATE_INO))
goto flush_out;
goto out;
+ } else {
+ /*
+ * In the OPU case, the node block can be persisted before the data
+ * during fsync() when the lower device doesn't support write barriers,
+ * resulting in data corruption after an SPO (sudden power-off).
+ * So in strict fsync mode, force atomic write semantics to preserve
+ * the write ordering between data/node blocks and the last node, and
+ * thereby avoid potential data corruption.
+ */
+ if (F2FS_OPTION(sbi).fsync_mode ==
+ FSYNC_MODE_STRICT && !atomic)
+ atomic = true;
}
go_write:
/*
@@ -740,6 +750,14 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock)
return err;
#ifdef CONFIG_F2FS_FS_COMPRESSION
+ /*
+ * For a compressed file, direct writes are not allowed after its compressed
+ * blocks have been released; allow them again once the file is truncated to zero.
+ */
+ if (f2fs_compressed_file(inode) && !free_from
+ && is_inode_flag_set(inode, FI_COMPRESS_RELEASED))
+ clear_inode_flag(inode, FI_COMPRESS_RELEASED);
+
if (from != free_from) {
err = f2fs_truncate_partial_cluster(inode, from, lock);
if (err)
@@ -940,7 +958,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
}
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- down_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
truncate_setsize(inode, attr->ia_size);
@@ -950,7 +968,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
* do not trim all blocks after i_size if target size is
* larger than i_size.
*/
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
if (err)
return err;
@@ -1085,7 +1103,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
if (pg_start < pg_end) {
- struct address_space *mapping = inode->i_mapping;
loff_t blk_start, blk_end;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -1095,16 +1112,15 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
blk_end = (loff_t)pg_end << PAGE_SHIFT;
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- down_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
- truncate_inode_pages_range(mapping, blk_start,
- blk_end - 1);
+ truncate_pagecache_range(inode, blk_start, blk_end - 1);
f2fs_lock_op(sbi);
ret = f2fs_truncate_hole(inode, pg_start, pg_end);
f2fs_unlock_op(sbi);
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
}
}
@@ -1339,7 +1355,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len)
/* avoid gc operation during block exchange */
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- down_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
f2fs_lock_op(sbi);
f2fs_drop_extent_tree(inode);
@@ -1347,7 +1363,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len)
ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true);
f2fs_unlock_op(sbi);
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
return ret;
}
@@ -1378,13 +1394,13 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
return ret;
/* write out all moved pages, if possible */
- down_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
truncate_pagecache(inode, offset);
new_size = i_size_read(inode) - len;
ret = f2fs_truncate_blocks(inode, new_size, true);
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
if (!ret)
f2fs_i_size_write(inode, new_size);
return ret;
@@ -1484,7 +1500,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
pgoff_t end;
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- down_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(mapping);
truncate_pagecache_range(inode,
(loff_t)index << PAGE_SHIFT,
@@ -1496,7 +1512,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE);
if (ret) {
f2fs_unlock_op(sbi);
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
goto out;
}
@@ -1508,7 +1524,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
f2fs_put_dnode(&dn);
f2fs_unlock_op(sbi);
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
f2fs_balance_fs(sbi, dn.node_changed);
@@ -1543,6 +1559,7 @@ out:
static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct address_space *mapping = inode->i_mapping;
pgoff_t nr, pg_start, pg_end, delta, idx;
loff_t new_size;
int ret = 0;
@@ -1565,14 +1582,14 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
f2fs_balance_fs(sbi, true);
- down_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(mapping);
ret = f2fs_truncate_blocks(inode, i_size_read(inode), true);
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
if (ret)
return ret;
/* write out all dirty pages from offset */
- ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
+ ret = filemap_write_and_wait_range(mapping, offset, LLONG_MAX);
if (ret)
return ret;
@@ -1583,7 +1600,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
/* avoid gc operation during block exchange */
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- down_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(mapping);
truncate_pagecache(inode, offset);
while (!ret && idx > pg_start) {
@@ -1599,14 +1616,14 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
idx + delta, nr, false);
f2fs_unlock_op(sbi);
}
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
/* write out all moved pages, if possible */
- down_write(&F2FS_I(inode)->i_mmap_sem);
- filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
+ filemap_invalidate_lock(mapping);
+ filemap_write_and_wait_range(mapping, offset, LLONG_MAX);
truncate_pagecache(inode, offset);
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
if (!ret)
f2fs_i_size_write(inode, new_size);
@@ -3440,7 +3457,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
goto out;
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- down_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
@@ -3475,8 +3492,8 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
released_blocks += ret;
}
+ filemap_invalidate_unlock(inode->i_mapping);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- up_write(&F2FS_I(inode)->i_mmap_sem);
out:
inode_unlock(inode);
@@ -3593,7 +3610,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
}
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- down_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
@@ -3628,8 +3645,8 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
reserved_blocks += ret;
}
+ filemap_invalidate_unlock(inode->i_mapping);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- up_write(&F2FS_I(inode)->i_mmap_sem);
if (ret >= 0) {
clear_inode_flag(inode, FI_COMPRESS_RELEASED);
@@ -3748,7 +3765,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
goto err;
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- down_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(mapping);
ret = filemap_write_and_wait_range(mapping, range.start,
to_end ? LLONG_MAX : end_addr - 1);
@@ -3835,7 +3852,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
ret = f2fs_secure_erase(prev_bdev, inode, prev_index,
prev_block, len, range.flags);
out:
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
err:
inode_unlock(inode);
@@ -4292,7 +4309,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
* back to buffered IO.
*/
if (!f2fs_force_buffered_io(inode, iocb, from) &&
- allow_outplace_dio(inode, iocb, from))
+ f2fs_lfs_mode(F2FS_I_SB(inode)))
goto write;
}
preallocated = true;
@@ -4313,9 +4330,9 @@ write:
/* if we couldn't write data, we should deallocate blocks. */
if (preallocated && i_size_read(inode) < target_size) {
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- down_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
f2fs_truncate(inode);
- up_write(&F2FS_I(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
}
@@ -4332,6 +4349,34 @@ out:
return ret;
}
+static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len,
+ int advice)
+{
+ struct inode *inode;
+ struct address_space *mapping;
+ struct backing_dev_info *bdi;
+
+ if (advice == POSIX_FADV_SEQUENTIAL) {
+ inode = file_inode(filp);
+ if (S_ISFIFO(inode->i_mode))
+ return -ESPIPE;
+
+ mapping = filp->f_mapping;
+ if (!mapping || len < 0)
+ return -EINVAL;
+
+ bdi = inode_to_bdi(mapping->host);
+ filp->f_ra.ra_pages = bdi->ra_pages *
+ F2FS_I_SB(inode)->seq_file_ra_mul;
+ spin_lock(&filp->f_lock);
+ filp->f_mode &= ~FMODE_RANDOM;
+ spin_unlock(&filp->f_lock);
+ return 0;
+ }
+
+ return generic_fadvise(filp, offset, len, advice);
+}
+
#ifdef CONFIG_COMPAT
struct compat_f2fs_gc_range {
u32 sync;
@@ -4460,4 +4505,5 @@ const struct file_operations f2fs_file_operations = {
#endif
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
+ .fadvise = f2fs_file_fadvise,
};
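
A minimal userspace sketch of how an application would exercise the new .fadvise hook above (the path is illustrative; any regular file on an f2fs mount works, and seq_file_ra_mul is initialised to MIN_RA_MUL in the super.c hunk further below):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	char buf[1 << 16];
	ssize_t n;
	int fd = open("/mnt/f2fs/big.dat", O_RDONLY);	/* illustrative path */

	if (fd < 0)
		return 1;
	/*
	 * This advice now lands in f2fs_file_fadvise(), which scales the
	 * per-file readahead window by seq_file_ra_mul instead of applying
	 * the generic default.
	 */
	posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		;	/* sequential scan benefits from the larger window */
	close(fd);
	return n < 0 ? 1 : 0;
}
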
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 0e42ee5f7770..77391e3b7d68 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -19,6 +19,7 @@
#include "node.h"
#include "segment.h"
#include "gc.h"
+#include "iostat.h"
#include <trace/events/f2fs.h>
static struct kmem_cache *victim_entry_slab;
@@ -371,7 +372,8 @@ static struct victim_entry *attach_victim_entry(struct f2fs_sb_info *sbi,
struct atgc_management *am = &sbi->am;
struct victim_entry *ve;
- ve = f2fs_kmem_cache_alloc(victim_entry_slab, GFP_NOFS);
+ ve = f2fs_kmem_cache_alloc(victim_entry_slab,
+ GFP_NOFS, true, NULL);
ve->mtime = mtime;
ve->segno = segno;
@@ -849,7 +851,8 @@ static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
iput(inode);
return;
}
- new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab, GFP_NOFS);
+ new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab,
+ GFP_NOFS, true, NULL);
new_ie->inode = inode;
f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie);
@@ -1497,8 +1500,10 @@ next_step:
int err;
if (S_ISREG(inode->i_mode)) {
- if (!down_write_trylock(&fi->i_gc_rwsem[READ]))
+ if (!down_write_trylock(&fi->i_gc_rwsem[READ])) {
+ sbi->skipped_gc_rwsem++;
continue;
+ }
if (!down_write_trylock(
&fi->i_gc_rwsem[WRITE])) {
sbi->skipped_gc_rwsem++;
@@ -1646,6 +1651,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
force_migrate);
stat_inc_seg_count(sbi, type, gc_type);
+ sbi->gc_reclaimed_segs[sbi->gc_mode]++;
migrated++;
freed:
@@ -1747,7 +1753,7 @@ gc_more:
round++;
}
- if (gc_type == FG_GC && seg_freed)
+ if (gc_type == FG_GC)
sbi->cur_victim_sec = NULL_SEGNO;
if (sync)
diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c
new file mode 100644
index 000000000000..cdcf54ae0db8
--- /dev/null
+++ b/fs/f2fs/iostat.c
@@ -0,0 +1,287 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * f2fs iostat support
+ *
+ * Copyright 2021 Google LLC
+ * Author: Daeho Jeong <daehojeong@google.com>
+ */
+
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/seq_file.h>
+
+#include "f2fs.h"
+#include "iostat.h"
+#include <trace/events/f2fs.h>
+
+#define NUM_PREALLOC_IOSTAT_CTXS 128
+static struct kmem_cache *bio_iostat_ctx_cache;
+static mempool_t *bio_iostat_ctx_pool;
+
+int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset)
+{
+ struct super_block *sb = seq->private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ time64_t now = ktime_get_real_seconds();
+
+ if (!sbi->iostat_enable)
+ return 0;
+
+ seq_printf(seq, "time: %-16llu\n", now);
+
+ /* print app write IOs */
+ seq_puts(seq, "[WRITE]\n");
+ seq_printf(seq, "app buffered: %-16llu\n",
+ sbi->rw_iostat[APP_BUFFERED_IO]);
+ seq_printf(seq, "app direct: %-16llu\n",
+ sbi->rw_iostat[APP_DIRECT_IO]);
+ seq_printf(seq, "app mapped: %-16llu\n",
+ sbi->rw_iostat[APP_MAPPED_IO]);
+
+ /* print fs write IOs */
+ seq_printf(seq, "fs data: %-16llu\n",
+ sbi->rw_iostat[FS_DATA_IO]);
+ seq_printf(seq, "fs node: %-16llu\n",
+ sbi->rw_iostat[FS_NODE_IO]);
+ seq_printf(seq, "fs meta: %-16llu\n",
+ sbi->rw_iostat[FS_META_IO]);
+ seq_printf(seq, "fs gc data: %-16llu\n",
+ sbi->rw_iostat[FS_GC_DATA_IO]);
+ seq_printf(seq, "fs gc node: %-16llu\n",
+ sbi->rw_iostat[FS_GC_NODE_IO]);
+ seq_printf(seq, "fs cp data: %-16llu\n",
+ sbi->rw_iostat[FS_CP_DATA_IO]);
+ seq_printf(seq, "fs cp node: %-16llu\n",
+ sbi->rw_iostat[FS_CP_NODE_IO]);
+ seq_printf(seq, "fs cp meta: %-16llu\n",
+ sbi->rw_iostat[FS_CP_META_IO]);
+
+ /* print app read IOs */
+ seq_puts(seq, "[READ]\n");
+ seq_printf(seq, "app buffered: %-16llu\n",
+ sbi->rw_iostat[APP_BUFFERED_READ_IO]);
+ seq_printf(seq, "app direct: %-16llu\n",
+ sbi->rw_iostat[APP_DIRECT_READ_IO]);
+ seq_printf(seq, "app mapped: %-16llu\n",
+ sbi->rw_iostat[APP_MAPPED_READ_IO]);
+
+ /* print fs read IOs */
+ seq_printf(seq, "fs data: %-16llu\n",
+ sbi->rw_iostat[FS_DATA_READ_IO]);
+ seq_printf(seq, "fs gc data: %-16llu\n",
+ sbi->rw_iostat[FS_GDATA_READ_IO]);
+ seq_printf(seq, "fs compr_data: %-16llu\n",
+ sbi->rw_iostat[FS_CDATA_READ_IO]);
+ seq_printf(seq, "fs node: %-16llu\n",
+ sbi->rw_iostat[FS_NODE_READ_IO]);
+ seq_printf(seq, "fs meta: %-16llu\n",
+ sbi->rw_iostat[FS_META_READ_IO]);
+
+ /* print other IOs */
+ seq_puts(seq, "[OTHER]\n");
+ seq_printf(seq, "fs discard: %-16llu\n",
+ sbi->rw_iostat[FS_DISCARD]);
+
+ return 0;
+}
+
+static inline void __record_iostat_latency(struct f2fs_sb_info *sbi)
+{
+ int io, idx = 0;
+ unsigned int cnt;
+ struct f2fs_iostat_latency iostat_lat[MAX_IO_TYPE][NR_PAGE_TYPE];
+ struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
+
+ spin_lock_irq(&sbi->iostat_lat_lock);
+ for (idx = 0; idx < MAX_IO_TYPE; idx++) {
+ for (io = 0; io < NR_PAGE_TYPE; io++) {
+ cnt = io_lat->bio_cnt[idx][io];
+ iostat_lat[idx][io].peak_lat =
+ jiffies_to_msecs(io_lat->peak_lat[idx][io]);
+ iostat_lat[idx][io].cnt = cnt;
+ iostat_lat[idx][io].avg_lat = cnt ?
+ jiffies_to_msecs(io_lat->sum_lat[idx][io]) / cnt : 0;
+ io_lat->sum_lat[idx][io] = 0;
+ io_lat->peak_lat[idx][io] = 0;
+ io_lat->bio_cnt[idx][io] = 0;
+ }
+ }
+ spin_unlock_irq(&sbi->iostat_lat_lock);
+
+ trace_f2fs_iostat_latency(sbi, iostat_lat);
+}
+
+static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi)
+{
+ unsigned long long iostat_diff[NR_IO_TYPE];
+ int i;
+
+ if (time_is_after_jiffies(sbi->iostat_next_period))
+ return;
+
+ /* Re-check under the lock, since another CPU may have just done this */
+ spin_lock(&sbi->iostat_lock);
+ if (time_is_after_jiffies(sbi->iostat_next_period)) {
+ spin_unlock(&sbi->iostat_lock);
+ return;
+ }
+ sbi->iostat_next_period = jiffies +
+ msecs_to_jiffies(sbi->iostat_period_ms);
+
+ for (i = 0; i < NR_IO_TYPE; i++) {
+ iostat_diff[i] = sbi->rw_iostat[i] -
+ sbi->prev_rw_iostat[i];
+ sbi->prev_rw_iostat[i] = sbi->rw_iostat[i];
+ }
+ spin_unlock(&sbi->iostat_lock);
+
+ trace_f2fs_iostat(sbi, iostat_diff);
+
+ __record_iostat_latency(sbi);
+}
+
+void f2fs_reset_iostat(struct f2fs_sb_info *sbi)
+{
+ struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
+ int i;
+
+ spin_lock(&sbi->iostat_lock);
+ for (i = 0; i < NR_IO_TYPE; i++) {
+ sbi->rw_iostat[i] = 0;
+ sbi->prev_rw_iostat[i] = 0;
+ }
+ spin_unlock(&sbi->iostat_lock);
+
+ spin_lock_irq(&sbi->iostat_lat_lock);
+ memset(io_lat, 0, sizeof(struct iostat_lat_info));
+ spin_unlock_irq(&sbi->iostat_lat_lock);
+}
+
+void f2fs_update_iostat(struct f2fs_sb_info *sbi,
+ enum iostat_type type, unsigned long long io_bytes)
+{
+ if (!sbi->iostat_enable)
+ return;
+
+ spin_lock(&sbi->iostat_lock);
+ sbi->rw_iostat[type] += io_bytes;
+
+ if (type == APP_WRITE_IO || type == APP_DIRECT_IO)
+ sbi->rw_iostat[APP_BUFFERED_IO] =
+ sbi->rw_iostat[APP_WRITE_IO] -
+ sbi->rw_iostat[APP_DIRECT_IO];
+
+ if (type == APP_READ_IO || type == APP_DIRECT_READ_IO)
+ sbi->rw_iostat[APP_BUFFERED_READ_IO] =
+ sbi->rw_iostat[APP_READ_IO] -
+ sbi->rw_iostat[APP_DIRECT_READ_IO];
+ spin_unlock(&sbi->iostat_lock);
+
+ f2fs_record_iostat(sbi);
+}
+
+static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx,
+ int rw, bool is_sync)
+{
+ unsigned long ts_diff;
+ unsigned int iotype = iostat_ctx->type;
+ unsigned long flags;
+ struct f2fs_sb_info *sbi = iostat_ctx->sbi;
+ struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
+ int idx;
+
+ if (!sbi->iostat_enable)
+ return;
+
+ ts_diff = jiffies - iostat_ctx->submit_ts;
+ if (iotype >= META_FLUSH)
+ iotype = META;
+
+ if (rw == 0) {
+ idx = READ_IO;
+ } else {
+ if (is_sync)
+ idx = WRITE_SYNC_IO;
+ else
+ idx = WRITE_ASYNC_IO;
+ }
+
+ spin_lock_irqsave(&sbi->iostat_lat_lock, flags);
+ io_lat->sum_lat[idx][iotype] += ts_diff;
+ io_lat->bio_cnt[idx][iotype]++;
+ if (ts_diff > io_lat->peak_lat[idx][iotype])
+ io_lat->peak_lat[idx][iotype] = ts_diff;
+ spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags);
+}
+
+void iostat_update_and_unbind_ctx(struct bio *bio, int rw)
+{
+ struct bio_iostat_ctx *iostat_ctx = bio->bi_private;
+ bool is_sync = bio->bi_opf & REQ_SYNC;
+
+ if (rw == 0)
+ bio->bi_private = iostat_ctx->post_read_ctx;
+ else
+ bio->bi_private = iostat_ctx->sbi;
+ __update_iostat_latency(iostat_ctx, rw, is_sync);
+ mempool_free(iostat_ctx, bio_iostat_ctx_pool);
+}
+
+void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi,
+ struct bio *bio, struct bio_post_read_ctx *ctx)
+{
+ struct bio_iostat_ctx *iostat_ctx;
+ /* Due to the mempool, this never fails. */
+ iostat_ctx = mempool_alloc(bio_iostat_ctx_pool, GFP_NOFS);
+ iostat_ctx->sbi = sbi;
+ iostat_ctx->submit_ts = 0;
+ iostat_ctx->type = 0;
+ iostat_ctx->post_read_ctx = ctx;
+ bio->bi_private = iostat_ctx;
+}
+
+int __init f2fs_init_iostat_processing(void)
+{
+ bio_iostat_ctx_cache =
+ kmem_cache_create("f2fs_bio_iostat_ctx",
+ sizeof(struct bio_iostat_ctx), 0, 0, NULL);
+ if (!bio_iostat_ctx_cache)
+ goto fail;
+ bio_iostat_ctx_pool =
+ mempool_create_slab_pool(NUM_PREALLOC_IOSTAT_CTXS,
+ bio_iostat_ctx_cache);
+ if (!bio_iostat_ctx_pool)
+ goto fail_free_cache;
+ return 0;
+
+fail_free_cache:
+ kmem_cache_destroy(bio_iostat_ctx_cache);
+fail:
+ return -ENOMEM;
+}
+
+void f2fs_destroy_iostat_processing(void)
+{
+ mempool_destroy(bio_iostat_ctx_pool);
+ kmem_cache_destroy(bio_iostat_ctx_cache);
+}
+
+int f2fs_init_iostat(struct f2fs_sb_info *sbi)
+{
+ /* init iostat info */
+ spin_lock_init(&sbi->iostat_lock);
+ spin_lock_init(&sbi->iostat_lat_lock);
+ sbi->iostat_enable = false;
+ sbi->iostat_period_ms = DEFAULT_IOSTAT_PERIOD_MS;
+ sbi->iostat_io_lat = f2fs_kzalloc(sbi, sizeof(struct iostat_lat_info),
+ GFP_KERNEL);
+ if (!sbi->iostat_io_lat)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void f2fs_destroy_iostat(struct f2fs_sb_info *sbi)
+{
+ kfree(sbi->iostat_io_lat);
+}
diff --git a/fs/f2fs/iostat.h b/fs/f2fs/iostat.h
new file mode 100644
index 000000000000..22a2d01f57ef
--- /dev/null
+++ b/fs/f2fs/iostat.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2021 Google LLC
+ * Author: Daeho Jeong <daehojeong@google.com>
+ */
+#ifndef __F2FS_IOSTAT_H__
+#define __F2FS_IOSTAT_H__
+
+struct bio_post_read_ctx;
+
+#ifdef CONFIG_F2FS_IOSTAT
+
+#define DEFAULT_IOSTAT_PERIOD_MS 3000
+#define MIN_IOSTAT_PERIOD_MS 100
+/* maximum period of iostat tracing is 1 day */
+#define MAX_IOSTAT_PERIOD_MS 8640000
+
+enum {
+ READ_IO,
+ WRITE_SYNC_IO,
+ WRITE_ASYNC_IO,
+ MAX_IO_TYPE,
+};
+
+struct iostat_lat_info {
+ unsigned long sum_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; /* sum of io latencies */
+ unsigned long peak_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; /* peak io latency */
+ unsigned int bio_cnt[MAX_IO_TYPE][NR_PAGE_TYPE]; /* bio count */
+};
+
+extern int __maybe_unused iostat_info_seq_show(struct seq_file *seq,
+ void *offset);
+extern void f2fs_reset_iostat(struct f2fs_sb_info *sbi);
+extern void f2fs_update_iostat(struct f2fs_sb_info *sbi,
+ enum iostat_type type, unsigned long long io_bytes);
+
+struct bio_iostat_ctx {
+ struct f2fs_sb_info *sbi;
+ unsigned long submit_ts;
+ enum page_type type;
+ struct bio_post_read_ctx *post_read_ctx;
+};
+
+static inline void iostat_update_submit_ctx(struct bio *bio,
+ enum page_type type)
+{
+ struct bio_iostat_ctx *iostat_ctx = bio->bi_private;
+
+ iostat_ctx->submit_ts = jiffies;
+ iostat_ctx->type = type;
+}
+
+static inline struct bio_post_read_ctx *get_post_read_ctx(struct bio *bio)
+{
+ struct bio_iostat_ctx *iostat_ctx = bio->bi_private;
+
+ return iostat_ctx->post_read_ctx;
+}
+
+extern void iostat_update_and_unbind_ctx(struct bio *bio, int rw);
+extern void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi,
+ struct bio *bio, struct bio_post_read_ctx *ctx);
+extern int f2fs_init_iostat_processing(void);
+extern void f2fs_destroy_iostat_processing(void);
+extern int f2fs_init_iostat(struct f2fs_sb_info *sbi);
+extern void f2fs_destroy_iostat(struct f2fs_sb_info *sbi);
+#else
+static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi,
+ enum iostat_type type, unsigned long long io_bytes) {}
+static inline void iostat_update_and_unbind_ctx(struct bio *bio, int rw) {}
+static inline void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi,
+ struct bio *bio, struct bio_post_read_ctx *ctx) {}
+static inline void iostat_update_submit_ctx(struct bio *bio,
+ enum page_type type) {}
+static inline struct bio_post_read_ctx *get_post_read_ctx(struct bio *bio)
+{
+ return bio->bi_private;
+}
+static inline int f2fs_init_iostat_processing(void) { return 0; }
+static inline void f2fs_destroy_iostat_processing(void) {}
+static inline int f2fs_init_iostat(struct f2fs_sb_info *sbi) { return 0; }
+static inline void f2fs_destroy_iostat(struct f2fs_sb_info *sbi) {}
+#endif
+#endif /* __F2FS_IOSTAT_H__ */
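
A minimal sketch of how the helpers declared above are meant to bracket a bio's lifetime (the function names below are hypothetical; the real call sites sit in the f2fs data path, which is not part of this hunk), assuming CONFIG_F2FS_IOSTAT is enabled:

/* Sketch only: intended ordering of the iostat ctx helpers around a read bio. */
static void f2fs_iostat_submit_read_sketch(struct f2fs_sb_info *sbi,
					   struct bio *bio,
					   struct bio_post_read_ctx *post_ctx)
{
	/* Wrap the existing bi_private (the post-read ctx) in a bio_iostat_ctx. */
	iostat_alloc_and_bind_ctx(sbi, bio, post_ctx);

	/* Stamp the submit time and page type right before submission. */
	iostat_update_submit_ctx(bio, DATA);
	submit_bio(bio);
}

static void f2fs_iostat_read_end_io_sketch(struct bio *bio)
{
	/*
	 * Record the latency for this bio and restore the original
	 * bi_private before normal end_io processing continues.
	 */
	iostat_update_and_unbind_ctx(bio, 0 /* read */);
	/* bio->bi_private now points at the bio_post_read_ctx again. */
}
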
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index e149c8c66a71..9c528e583c9d 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -1323,9 +1323,19 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
return target;
}
+static int f2fs_encrypted_symlink_getattr(struct user_namespace *mnt_userns,
+ const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+{
+ f2fs_getattr(mnt_userns, path, stat, request_mask, query_flags);
+
+ return fscrypt_symlink_getattr(path, stat);
+}
+
const struct inode_operations f2fs_encrypted_symlink_inode_operations = {
.get_link = f2fs_encrypted_get_link,
- .getattr = f2fs_getattr,
+ .getattr = f2fs_encrypted_symlink_getattr,
.setattr = f2fs_setattr,
.listxattr = f2fs_listxattr,
};
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 0be9e2d7120e..e863136081b4 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -17,6 +17,7 @@
#include "node.h"
#include "segment.h"
#include "xattr.h"
+#include "iostat.h"
#include <trace/events/f2fs.h>
#define on_f2fs_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock)
@@ -162,14 +163,13 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
return dst_page;
}
-static struct nat_entry *__alloc_nat_entry(nid_t nid, bool no_fail)
+static struct nat_entry *__alloc_nat_entry(struct f2fs_sb_info *sbi,
+ nid_t nid, bool no_fail)
{
struct nat_entry *new;
- if (no_fail)
- new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO);
- else
- new = kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO);
+ new = f2fs_kmem_cache_alloc(nat_entry_slab,
+ GFP_F2FS_ZERO, no_fail, sbi);
if (new) {
nat_set_nid(new, nid);
nat_reset_flag(new);
@@ -242,7 +242,8 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
head = radix_tree_lookup(&nm_i->nat_set_root, set);
if (!head) {
- head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS);
+ head = f2fs_kmem_cache_alloc(nat_entry_set_slab,
+ GFP_NOFS, true, NULL);
INIT_LIST_HEAD(&head->entry_list);
INIT_LIST_HEAD(&head->set_list);
@@ -329,7 +330,8 @@ static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi,
unsigned long flags;
unsigned int seq_id;
- fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, GFP_NOFS);
+ fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab,
+ GFP_NOFS, true, NULL);
get_page(page);
fn->page = page;
@@ -428,7 +430,7 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *new, *e;
- new = __alloc_nat_entry(nid, false);
+ new = __alloc_nat_entry(sbi, nid, false);
if (!new)
return;
@@ -451,7 +453,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
- struct nat_entry *new = __alloc_nat_entry(ni->nid, true);
+ struct nat_entry *new = __alloc_nat_entry(sbi, ni->nid, true);
down_write(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ni->nid);
@@ -552,7 +554,7 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
int i;
ni->nid = nid;
-
+retry:
/* Check nat cache */
down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
@@ -564,10 +566,19 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
return 0;
}
- memset(&ne, 0, sizeof(struct f2fs_nat_entry));
+ /*
+ * Check the current segment summary by trying to grab journal_rwsem first.
+ * This rwsem is on the checkpoint's critical path, and checkpoint also takes
+ * the nat_tree_lock held above. Therefore, if checkpoint is running and we
+ * fail to grab it, drop nat_tree_lock and retry instead of blocking checkpoint.
+ */
+ if (!rwsem_is_locked(&sbi->cp_global_sem)) {
+ down_read(&curseg->journal_rwsem);
+ } else if (!down_read_trylock(&curseg->journal_rwsem)) {
+ up_read(&nm_i->nat_tree_lock);
+ goto retry;
+ }
- /* Check current segment summary */
- down_read(&curseg->journal_rwsem);
i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
if (i >= 0) {
ne = nat_in_journal(journal, i);
@@ -832,6 +843,26 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
dn->ofs_in_node = offset[level];
dn->node_page = npage[level];
dn->data_blkaddr = f2fs_data_blkaddr(dn);
+
+ if (is_inode_flag_set(dn->inode, FI_COMPRESSED_FILE) &&
+ f2fs_sb_has_readonly(sbi)) {
+ unsigned int c_len = f2fs_cluster_blocks_are_contiguous(dn);
+ block_t blkaddr;
+
+ if (!c_len)
+ goto out;
+
+ blkaddr = f2fs_data_blkaddr(dn);
+ if (blkaddr == COMPRESS_ADDR)
+ blkaddr = data_blkaddr(dn->inode, dn->node_page,
+ dn->ofs_in_node + 1);
+
+ f2fs_update_extent_tree_range_compressed(dn->inode,
+ index, blkaddr,
+ F2FS_I(dn->inode)->i_cluster_size,
+ c_len);
+ }
+out:
return 0;
release_pages:
@@ -1321,7 +1352,8 @@ static int read_node_page(struct page *page, int op_flags)
if (err)
return err;
- if (unlikely(ni.blk_addr == NULL_ADDR) ||
+ /* NEW_ADDR can be seen after cp_error drops some dirty node pages */
+ if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR) ||
is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) {
ClearPageUptodate(page);
return -ENOENT;
@@ -2181,6 +2213,24 @@ static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i,
}
}
+bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ unsigned int i;
+ bool ret = true;
+
+ down_read(&nm_i->nat_tree_lock);
+ for (i = 0; i < nm_i->nat_blocks; i++) {
+ if (!test_bit_le(i, nm_i->nat_block_bitmap)) {
+ ret = false;
+ break;
+ }
+ }
+ up_read(&nm_i->nat_tree_lock);
+
+ return ret;
+}
+
static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid,
bool set, bool build)
{
@@ -2222,7 +2272,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi,
if (unlikely(f2fs_check_nid_range(sbi, nid)))
return false;
- i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS);
+ i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS, true, NULL);
i->nid = nid;
i->state = FREE_NID;
@@ -2812,7 +2862,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
ne = __lookup_nat_cache(nm_i, nid);
if (!ne) {
- ne = __alloc_nat_entry(nid, true);
+ ne = __alloc_nat_entry(sbi, nid, true);
__init_nat_entry(nm_i, ne, &raw_ne, true);
}
@@ -2852,7 +2902,23 @@ add_out:
list_add_tail(&nes->set_list, head);
}
-static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
+static void __update_nat_bits(struct f2fs_nm_info *nm_i, unsigned int nat_ofs,
+ unsigned int valid)
+{
+ if (valid == 0) {
+ __set_bit_le(nat_ofs, nm_i->empty_nat_bits);
+ __clear_bit_le(nat_ofs, nm_i->full_nat_bits);
+ return;
+ }
+
+ __clear_bit_le(nat_ofs, nm_i->empty_nat_bits);
+ if (valid == NAT_ENTRY_PER_BLOCK)
+ __set_bit_le(nat_ofs, nm_i->full_nat_bits);
+ else
+ __clear_bit_le(nat_ofs, nm_i->full_nat_bits);
+}
+
+static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
struct page *page)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
@@ -2861,7 +2927,7 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
int valid = 0;
int i = 0;
- if (!enabled_nat_bits(sbi, NULL))
+ if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
return;
if (nat_index == 0) {
@@ -2872,17 +2938,36 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
if (le32_to_cpu(nat_blk->entries[i].block_addr) != NULL_ADDR)
valid++;
}
- if (valid == 0) {
- __set_bit_le(nat_index, nm_i->empty_nat_bits);
- __clear_bit_le(nat_index, nm_i->full_nat_bits);
- return;
+
+ __update_nat_bits(nm_i, nat_index, valid);
+}
+
+void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ unsigned int nat_ofs;
+
+ down_read(&nm_i->nat_tree_lock);
+
+ for (nat_ofs = 0; nat_ofs < nm_i->nat_blocks; nat_ofs++) {
+ unsigned int valid = 0, nid_ofs = 0;
+
+ /* nid zero must never be used, so treat it as valid here */
+ if (unlikely(nat_ofs == 0)) {
+ valid = 1;
+ nid_ofs = 1;
+ }
+
+ for (; nid_ofs < NAT_ENTRY_PER_BLOCK; nid_ofs++) {
+ if (!test_bit_le(nid_ofs,
+ nm_i->free_nid_bitmap[nat_ofs]))
+ valid++;
+ }
+
+ __update_nat_bits(nm_i, nat_ofs, valid);
}
- __clear_bit_le(nat_index, nm_i->empty_nat_bits);
- if (valid == NAT_ENTRY_PER_BLOCK)
- __set_bit_le(nat_index, nm_i->full_nat_bits);
- else
- __clear_bit_le(nat_index, nm_i->full_nat_bits);
+ up_read(&nm_i->nat_tree_lock);
}
static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
@@ -2901,7 +2986,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
* #1, flush nat entries to journal in current hot data summary block.
* #2, flush nat entries to nat page.
*/
- if (enabled_nat_bits(sbi, cpc) ||
+ if ((cpc->reason & CP_UMOUNT) ||
!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
to_journal = false;
@@ -2948,7 +3033,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
if (to_journal) {
up_write(&curseg->journal_rwsem);
} else {
- __update_nat_bits(sbi, start_nid, page);
+ update_nat_bits(sbi, start_nid, page);
f2fs_put_page(page, 1);
}
@@ -2979,7 +3064,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
* during unmount, let's flush nat_bits before checking
* nat_cnt[DIRTY_NAT].
*/
- if (enabled_nat_bits(sbi, cpc)) {
+ if (cpc->reason & CP_UMOUNT) {
down_write(&nm_i->nat_tree_lock);
remove_nats_in_journal(sbi);
up_write(&nm_i->nat_tree_lock);
@@ -2995,7 +3080,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
* entries, remove all entries from journal and merge them
* into nat entry set.
*/
- if (enabled_nat_bits(sbi, cpc) ||
+ if (cpc->reason & CP_UMOUNT ||
!__has_cursum_space(journal,
nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL))
remove_nats_in_journal(sbi);
@@ -3032,15 +3117,18 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
__u64 cp_ver = cur_cp_version(ckpt);
block_t nat_bits_addr;
- if (!enabled_nat_bits(sbi, NULL))
- return 0;
-
nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8);
nm_i->nat_bits = f2fs_kvzalloc(sbi,
nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL);
if (!nm_i->nat_bits)
return -ENOMEM;
+ nm_i->full_nat_bits = nm_i->nat_bits + 8;
+ nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes;
+
+ if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
+ return 0;
+
nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg -
nm_i->nat_bits_blocks;
for (i = 0; i < nm_i->nat_bits_blocks; i++) {
@@ -3057,13 +3145,12 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
cp_ver |= (cur_cp_crc(ckpt) << 32);
if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) {
- disable_nat_bits(sbi, true);
+ clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
+ f2fs_notice(sbi, "Disable nat_bits due to incorrect cp_ver (%llu, %llu)",
+ cp_ver, le64_to_cpu(*(__le64 *)nm_i->nat_bits));
return 0;
}
- nm_i->full_nat_bits = nm_i->nat_bits + 8;
- nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes;
-
f2fs_notice(sbi, "Found nat_bits in checkpoint");
return 0;
}
@@ -3074,7 +3161,7 @@ static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi)
unsigned int i = 0;
nid_t nid, last_nid;
- if (!enabled_nat_bits(sbi, NULL))
+ if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
return;
for (i = 0; i < nm_i->nat_blocks; i++) {
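
For scale, a back-of-the-envelope sketch of the nat_bits sizing that __get_nat_bitmaps() above computes and that the super.c hunk further below now sanity-checks against blocks_per_seg (the numbers are illustrative: 4KB blocks, 2MB segments, 8 NAT segments):

/* Sketch only: mirrors nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8). */
static unsigned int example_nat_bits_blocks(void)
{
	unsigned int log_blocks_per_seg = 9;		/* 2MB segment / 4KB block */
	unsigned int nat_segs = 8;			/* illustrative */
	unsigned int blksize = 4096;
	unsigned int nat_blocks = nat_segs << log_blocks_per_seg;	/* 4096 NAT blocks */
	unsigned int nat_bits_bytes = nat_blocks / 8;			/* 512: one bit per NAT block */
	/* full + empty bitmaps plus the 8-byte cp_ver header */
	unsigned int bytes = (nat_bits_bytes << 1) + 8;			/* 1032 bytes */

	return (bytes + blksize - 1) / blksize;				/* rounds up to 1 block */
}
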
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 695eacfe776c..04655511d7f5 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -91,7 +91,8 @@ static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi,
goto err_out;
}
- entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO);
+ entry = f2fs_kmem_cache_alloc(fsync_entry_slab,
+ GFP_F2FS_ZERO, true, NULL);
entry->inode = inode;
list_add_tail(&entry->list, head);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 15cc89eef28d..a135d2247415 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -20,6 +20,7 @@
#include "segment.h"
#include "node.h"
#include "gc.h"
+#include "iostat.h"
#include <trace/events/f2fs.h>
#define __reverse_ffz(x) __reverse_ffs(~(x))
@@ -188,7 +189,8 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page)
set_page_private_atomic(page);
- new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
+ new = f2fs_kmem_cache_alloc(inmem_entry_slab,
+ GFP_NOFS, true, NULL);
/* add atomic page indices to the list */
new->page = page;
@@ -776,11 +778,22 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi)
return 0;
for (i = 1; i < sbi->s_ndevs; i++) {
+ int count = DEFAULT_RETRY_IO_COUNT;
+
if (!f2fs_test_bit(i, (char *)&sbi->dirty_device))
continue;
- ret = __submit_flush_wait(sbi, FDEV(i).bdev);
- if (ret)
+
+ do {
+ ret = __submit_flush_wait(sbi, FDEV(i).bdev);
+ if (ret)
+ congestion_wait(BLK_RW_ASYNC,
+ DEFAULT_IO_TIMEOUT);
+ } while (ret && --count);
+
+ if (ret) {
+ f2fs_stop_checkpoint(sbi, false);
break;
+ }
spin_lock(&sbi->dev_lock);
f2fs_clear_bit(i, (char *)&sbi->dirty_device);
@@ -990,7 +1003,7 @@ static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi,
pend_list = &dcc->pend_list[plist_idx(len)];
- dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS);
+ dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS, true, NULL);
INIT_LIST_HEAD(&dc->list);
dc->bdev = bdev;
dc->lstart = lstart;
@@ -1893,7 +1906,8 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
se = get_seg_entry(sbi, GET_SEGNO(sbi, i));
offset = GET_BLKOFF_FROM_SEG0(sbi, i);
- if (!f2fs_test_and_set_bit(offset, se->discard_map))
+ if (f2fs_block_unit_discard(sbi) &&
+ !f2fs_test_and_set_bit(offset, se->discard_map))
sbi->discard_blks--;
}
@@ -1918,7 +1932,8 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
struct list_head *head = &SM_I(sbi)->dcc_info->entry_list;
int i;
- if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi))
+ if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi) ||
+ !f2fs_block_unit_discard(sbi))
return false;
if (!force) {
@@ -1949,7 +1964,7 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
if (!de) {
de = f2fs_kmem_cache_alloc(discard_entry_slab,
- GFP_F2FS_ZERO);
+ GFP_F2FS_ZERO, true, NULL);
de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start);
list_add_tail(&de->list, head);
}
@@ -2003,14 +2018,18 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
unsigned int start = 0, end = -1;
unsigned int secno, start_segno;
bool force = (cpc->reason & CP_DISCARD);
- bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi);
+ bool section_alignment = F2FS_OPTION(sbi).discard_unit ==
+ DISCARD_UNIT_SECTION;
+
+ if (f2fs_lfs_mode(sbi) && __is_large_section(sbi))
+ section_alignment = true;
mutex_lock(&dirty_i->seglist_lock);
while (1) {
int i;
- if (need_align && end != -1)
+ if (section_alignment && end != -1)
end--;
start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
if (start >= MAIN_SEGS(sbi))
@@ -2018,7 +2037,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi),
start + 1);
- if (need_align) {
+ if (section_alignment) {
start = rounddown(start, sbi->segs_per_sec);
end = roundup(end, sbi->segs_per_sec);
}
@@ -2056,6 +2075,9 @@ next:
}
mutex_unlock(&dirty_i->seglist_lock);
+ if (!f2fs_block_unit_discard(sbi))
+ goto wakeup;
+
/* send small discards */
list_for_each_entry_safe(entry, this, head, list) {
unsigned int cur_pos = 0, next_pos, len, total_len = 0;
@@ -2089,12 +2111,29 @@ skip:
dcc->nr_discards -= total_len;
}
+wakeup:
wake_up_discard_thread(sbi, false);
}
-static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
+int f2fs_start_discard_thread(struct f2fs_sb_info *sbi)
{
dev_t dev = sbi->sb->s_bdev->bd_dev;
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ int err = 0;
+
+ if (!f2fs_realtime_discard_enable(sbi))
+ return 0;
+
+ dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi,
+ "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev));
+ if (IS_ERR(dcc->f2fs_issue_discard))
+ err = PTR_ERR(dcc->f2fs_issue_discard);
+
+ return err;
+}
+
+static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
+{
struct discard_cmd_control *dcc;
int err = 0, i;
@@ -2108,6 +2147,11 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
return -ENOMEM;
dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY;
+ if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT)
+ dcc->discard_granularity = sbi->blocks_per_seg;
+ else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION)
+ dcc->discard_granularity = BLKS_PER_SEC(sbi);
+
INIT_LIST_HEAD(&dcc->entry_list);
for (i = 0; i < MAX_PLIST_NUM; i++)
INIT_LIST_HEAD(&dcc->pend_list[i]);
@@ -2127,13 +2171,10 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
init_waitqueue_head(&dcc->discard_wait_queue);
SM_I(sbi)->dcc_info = dcc;
init_thread:
- dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi,
- "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev));
- if (IS_ERR(dcc->f2fs_issue_discard)) {
- err = PTR_ERR(dcc->f2fs_issue_discard);
+ err = f2fs_start_discard_thread(sbi);
+ if (err) {
kfree(dcc);
SM_I(sbi)->dcc_info = NULL;
- return err;
}
return err;
@@ -2255,7 +2296,8 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
del = 0;
}
- if (!f2fs_test_and_set_bit(offset, se->discard_map))
+ if (f2fs_block_unit_discard(sbi) &&
+ !f2fs_test_and_set_bit(offset, se->discard_map))
sbi->discard_blks--;
/*
@@ -2297,7 +2339,8 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
}
}
- if (f2fs_test_and_clear_bit(offset, se->discard_map))
+ if (f2fs_block_unit_discard(sbi) &&
+ f2fs_test_and_clear_bit(offset, se->discard_map))
sbi->discard_blks++;
}
if (!f2fs_test_bit(offset, se->ckpt_valid_map))
@@ -3563,7 +3606,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio)
goto drop_bio;
}
- if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || f2fs_cp_error(sbi)) {
+ if (f2fs_cp_error(sbi)) {
err = -EIO;
goto drop_bio;
}
@@ -4071,7 +4114,8 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
static struct sit_entry_set *grab_sit_entry_set(void)
{
struct sit_entry_set *ses =
- f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_NOFS);
+ f2fs_kmem_cache_alloc(sit_entry_set_slab,
+ GFP_NOFS, true, NULL);
ses->entry_cnt = 0;
INIT_LIST_HEAD(&ses->set_list);
@@ -4282,6 +4326,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
unsigned int sit_segs, start;
char *src_bitmap, *bitmap;
unsigned int bitmap_size, main_bitmap_size, sit_bitmap_size;
+ unsigned int discard_map = f2fs_block_unit_discard(sbi) ? 1 : 0;
/* allocate memory for SIT information */
sit_i = f2fs_kzalloc(sbi, sizeof(struct sit_info), GFP_KERNEL);
@@ -4304,9 +4349,9 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
return -ENOMEM;
#ifdef CONFIG_F2FS_CHECK_FS
- bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * 4;
+ bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (3 + discard_map);
#else
- bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * 3;
+ bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (2 + discard_map);
#endif
sit_i->bitmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL);
if (!sit_i->bitmap)
@@ -4326,8 +4371,10 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
bitmap += SIT_VBLOCK_MAP_SIZE;
#endif
- sit_i->sentries[start].discard_map = bitmap;
- bitmap += SIT_VBLOCK_MAP_SIZE;
+ if (discard_map) {
+ sit_i->sentries[start].discard_map = bitmap;
+ bitmap += SIT_VBLOCK_MAP_SIZE;
+ }
}
sit_i->tmp_map = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
@@ -4489,17 +4536,19 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
if (IS_NODESEG(se->type))
total_node_blocks += se->valid_blocks;
- /* build discard map only one time */
- if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
- memset(se->discard_map, 0xff,
- SIT_VBLOCK_MAP_SIZE);
- } else {
- memcpy(se->discard_map,
- se->cur_valid_map,
- SIT_VBLOCK_MAP_SIZE);
- sbi->discard_blks +=
- sbi->blocks_per_seg -
- se->valid_blocks;
+ if (f2fs_block_unit_discard(sbi)) {
+ /* build discard map only one time */
+ if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
+ memset(se->discard_map, 0xff,
+ SIT_VBLOCK_MAP_SIZE);
+ } else {
+ memcpy(se->discard_map,
+ se->cur_valid_map,
+ SIT_VBLOCK_MAP_SIZE);
+ sbi->discard_blks +=
+ sbi->blocks_per_seg -
+ se->valid_blocks;
+ }
}
if (__is_large_section(sbi))
@@ -4535,13 +4584,15 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
if (IS_NODESEG(se->type))
total_node_blocks += se->valid_blocks;
- if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
- memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE);
- } else {
- memcpy(se->discard_map, se->cur_valid_map,
- SIT_VBLOCK_MAP_SIZE);
- sbi->discard_blks += old_valid_blocks;
- sbi->discard_blks -= se->valid_blocks;
+ if (f2fs_block_unit_discard(sbi)) {
+ if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
+ memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE);
+ } else {
+ memcpy(se->discard_map, se->cur_valid_map,
+ SIT_VBLOCK_MAP_SIZE);
+ sbi->discard_blks += old_valid_blocks;
+ sbi->discard_blks -= se->valid_blocks;
+ }
}
if (__is_large_section(sbi)) {
@@ -5159,7 +5210,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi)
sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC;
sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
- sm_info->min_seq_blocks = sbi->blocks_per_seg * sbi->segs_per_sec;
+ sm_info->min_seq_blocks = sbi->blocks_per_seg;
sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS;
sm_info->min_ssr_sections = reserved_sections(sbi);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 050230c70a53..89fff258727d 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -142,7 +142,7 @@ enum {
};
/*
- * In the victim_sel_policy->alloc_mode, there are two block allocation modes.
+ * In the victim_sel_policy->alloc_mode, there are three block allocation modes.
* LFS writes data sequentially with cleaning operations.
* SSR (Slack Space Recycle) reuses obsolete space without cleaning operations.
* AT_SSR (Age Threshold based Slack Space Recycle) merges fragments into
@@ -155,7 +155,7 @@ enum {
};
/*
- * In the victim_sel_policy->gc_mode, there are two gc, aka cleaning, modes.
+ * In the victim_sel_policy->gc_mode, there are three gc, aka cleaning, modes.
* GC_CB is based on cost-benefit algorithm.
* GC_GREEDY is based on greedy algorithm.
* GC_AT is based on age-threshold algorithm.
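
The discard_unit behaviour threaded through the segment code above is selected at mount time and parsed in the super.c hunk below; block keeps the pre-existing 4KB small discards, while segment and section align discards to segment or section boundaries (zoned devices are forced to section). A minimal sketch of selecting it from a C helper (device and mount point are illustrative):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Illustrative paths; assumes an existing f2fs filesystem on /dev/vdb. */
	if (mount("/dev/vdb", "/mnt/f2fs", "f2fs", 0,
		  "discard,discard_unit=section") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}
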
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 8fecd3050ccd..78ebc306ee2b 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -33,6 +33,7 @@
#include "segment.h"
#include "xattr.h"
#include "gc.h"
+#include "iostat.h"
#define CREATE_TRACE_POINTS
#include <trace/events/f2fs.h>
@@ -56,6 +57,7 @@ const char *f2fs_fault_name[FAULT_MAX] = {
[FAULT_CHECKPOINT] = "checkpoint error",
[FAULT_DISCARD] = "discard error",
[FAULT_WRITE_IO] = "write IO error",
+ [FAULT_SLAB_ALLOC] = "slab alloc",
};
void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
@@ -155,6 +157,7 @@ enum {
Opt_atgc,
Opt_gc_merge,
Opt_nogc_merge,
+ Opt_discard_unit,
Opt_err,
};
@@ -231,6 +234,7 @@ static match_table_t f2fs_tokens = {
{Opt_atgc, "atgc"},
{Opt_gc_merge, "gc_merge"},
{Opt_nogc_merge, "nogc_merge"},
+ {Opt_discard_unit, "discard_unit=%s"},
{Opt_err, NULL},
};
@@ -657,10 +661,14 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
return -EINVAL;
break;
case Opt_discard:
+ if (!f2fs_hw_support_discard(sbi)) {
+ f2fs_warn(sbi, "device does not support discard");
+ break;
+ }
set_opt(sbi, DISCARD);
break;
case Opt_nodiscard:
- if (f2fs_sb_has_blkzoned(sbi)) {
+ if (f2fs_hw_should_discard(sbi)) {
f2fs_warn(sbi, "discard is required for zoned block devices");
return -EINVAL;
}
@@ -1173,6 +1181,25 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
case Opt_nogc_merge:
clear_opt(sbi, GC_MERGE);
break;
+ case Opt_discard_unit:
+ name = match_strdup(&args[0]);
+ if (!name)
+ return -ENOMEM;
+ if (!strcmp(name, "block")) {
+ F2FS_OPTION(sbi).discard_unit =
+ DISCARD_UNIT_BLOCK;
+ } else if (!strcmp(name, "segment")) {
+ F2FS_OPTION(sbi).discard_unit =
+ DISCARD_UNIT_SEGMENT;
+ } else if (!strcmp(name, "section")) {
+ F2FS_OPTION(sbi).discard_unit =
+ DISCARD_UNIT_SECTION;
+ } else {
+ kfree(name);
+ return -EINVAL;
+ }
+ kfree(name);
+ break;
default:
f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value",
p);
@@ -1211,6 +1238,14 @@ default_check:
return -EINVAL;
}
#endif
+ if (f2fs_sb_has_blkzoned(sbi)) {
+ if (F2FS_OPTION(sbi).discard_unit !=
+ DISCARD_UNIT_SECTION) {
+ f2fs_info(sbi, "Zoned block device doesn't need small discard, set discard_unit=section by default");
+ F2FS_OPTION(sbi).discard_unit =
+ DISCARD_UNIT_SECTION;
+ }
+ }
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (f2fs_test_compress_extension(sbi)) {
@@ -1271,7 +1306,8 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
{
struct f2fs_inode_info *fi;
- fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_F2FS_ZERO);
+ fi = f2fs_kmem_cache_alloc(f2fs_inode_cachep,
+ GFP_F2FS_ZERO, false, F2FS_SB(sb));
if (!fi)
return NULL;
@@ -1289,7 +1325,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
mutex_init(&fi->inmem_lock);
init_rwsem(&fi->i_gc_rwsem[READ]);
init_rwsem(&fi->i_gc_rwsem[WRITE]);
- init_rwsem(&fi->i_mmap_sem);
init_rwsem(&fi->i_xattr_sem);
/* Will be used by directory only */
@@ -1542,6 +1577,7 @@ static void f2fs_put_super(struct super_block *sb)
#endif
fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy);
destroy_percpu_info(sbi);
+ f2fs_destroy_iostat(sbi);
for (i = 0; i < NR_PAGE_TYPE; i++)
kvfree(sbi->write_io[i]);
#ifdef CONFIG_UNICODE
@@ -1925,6 +1961,14 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
if (test_opt(sbi, ATGC))
seq_puts(seq, ",atgc");
+
+ if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK)
+ seq_printf(seq, ",discard_unit=%s", "block");
+ else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT)
+ seq_printf(seq, ",discard_unit=%s", "segment");
+ else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION)
+ seq_printf(seq, ",discard_unit=%s", "section");
+
return 0;
}
@@ -1960,11 +2004,15 @@ static void default_options(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).unusable_cap = 0;
sbi->sb->s_flags |= SB_LAZYTIME;
set_opt(sbi, FLUSH_MERGE);
- set_opt(sbi, DISCARD);
- if (f2fs_sb_has_blkzoned(sbi))
+ if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi))
+ set_opt(sbi, DISCARD);
+ if (f2fs_sb_has_blkzoned(sbi)) {
F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS;
- else
+ F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_SECTION;
+ } else {
F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE;
+ F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_BLOCK;
+ }
#ifdef CONFIG_F2FS_FS_XATTR
set_opt(sbi, XATTR_USER);
@@ -2039,8 +2087,17 @@ restore_flag:
static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
{
+ int retry = DEFAULT_RETRY_IO_COUNT;
+
/* we should flush all the data to keep data consistency */
- sync_inodes_sb(sbi->sb);
+ do {
+ sync_inodes_sb(sbi->sb);
+ cond_resched();
+ congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
+ } while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--);
+
+ if (unlikely(retry < 0))
+ f2fs_warn(sbi, "checkpoint=enable has some unwritten data.");
down_write(&sbi->gc_lock);
f2fs_dirty_to_prefree(sbi);
@@ -2061,12 +2118,15 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
bool need_restart_gc = false, need_stop_gc = false;
bool need_restart_ckpt = false, need_stop_ckpt = false;
bool need_restart_flush = false, need_stop_flush = false;
+ bool need_restart_discard = false, need_stop_discard = false;
bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE);
- bool disable_checkpoint = test_opt(sbi, DISABLE_CHECKPOINT);
+ bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT);
bool no_io_align = !F2FS_IO_ALIGNED(sbi);
bool no_atgc = !test_opt(sbi, ATGC);
+ bool no_discard = !test_opt(sbi, DISCARD);
bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE);
- bool checkpoint_changed;
+ bool block_unit_discard = f2fs_block_unit_discard(sbi);
+ struct discard_cmd_control *dcc;
#ifdef CONFIG_QUOTA
int i, j;
#endif
@@ -2111,8 +2171,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
err = parse_options(sb, data, true);
if (err)
goto restore_opts;
- checkpoint_changed =
- disable_checkpoint != test_opt(sbi, DISABLE_CHECKPOINT);
/*
* Previous and new state of filesystem is RO,
@@ -2169,6 +2227,12 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
+ if (block_unit_discard != f2fs_block_unit_discard(sbi)) {
+ err = -EINVAL;
+ f2fs_warn(sbi, "switch discard_unit option is not allowed");
+ goto restore_opts;
+ }
+
if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) {
err = -EINVAL;
f2fs_warn(sbi, "disabling checkpoint not compatible with read-only");
@@ -2234,11 +2298,26 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
need_stop_flush = true;
}
- if (checkpoint_changed) {
+ if (no_discard == !!test_opt(sbi, DISCARD)) {
+ if (test_opt(sbi, DISCARD)) {
+ err = f2fs_start_discard_thread(sbi);
+ if (err)
+ goto restore_flush;
+ need_stop_discard = true;
+ } else {
+ dcc = SM_I(sbi)->dcc_info;
+ f2fs_stop_discard_thread(sbi);
+ if (atomic_read(&dcc->discard_cmd_cnt))
+ f2fs_issue_discard_timeout(sbi);
+ need_restart_discard = true;
+ }
+ }
+
+ if (enable_checkpoint == !!test_opt(sbi, DISABLE_CHECKPOINT)) {
if (test_opt(sbi, DISABLE_CHECKPOINT)) {
err = f2fs_disable_checkpoint(sbi);
if (err)
- goto restore_flush;
+ goto restore_discard;
} else {
f2fs_enable_checkpoint(sbi);
}
@@ -2258,6 +2337,13 @@ skip:
adjust_unusable_cap_perc(sbi);
*flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME);
return 0;
+restore_discard:
+ if (need_restart_discard) {
+ if (f2fs_start_discard_thread(sbi))
+ f2fs_warn(sbi, "discard has been stopped");
+ } else if (need_stop_discard) {
+ f2fs_stop_discard_thread(sbi);
+ }
restore_flush:
if (need_restart_flush) {
if (f2fs_create_flush_cmd_control(sbi))
@@ -2518,6 +2604,33 @@ static int f2fs_enable_quotas(struct super_block *sb)
return 0;
}
+static int f2fs_quota_sync_file(struct f2fs_sb_info *sbi, int type)
+{
+ struct quota_info *dqopt = sb_dqopt(sbi->sb);
+ struct address_space *mapping = dqopt->files[type]->i_mapping;
+ int ret = 0;
+
+ ret = dquot_writeback_dquots(sbi->sb, type);
+ if (ret)
+ goto out;
+
+ ret = filemap_fdatawrite(mapping);
+ if (ret)
+ goto out;
+
+ /* if we are using journalled quota */
+ if (is_journalled_quota(sbi))
+ goto out;
+
+ ret = filemap_fdatawait(mapping);
+
+ truncate_inode_pages(&dqopt->files[type]->i_data, 0);
+out:
+ if (ret)
+ set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
+ return ret;
+}
+
int f2fs_quota_sync(struct super_block *sb, int type)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
@@ -2526,56 +2639,41 @@ int f2fs_quota_sync(struct super_block *sb, int type)
int ret;
/*
- * do_quotactl
- * f2fs_quota_sync
- * down_read(quota_sem)
- * dquot_writeback_dquots()
- * f2fs_dquot_commit
- * block_operation
- * down_read(quota_sem)
- */
- f2fs_lock_op(sbi);
-
- down_read(&sbi->quota_sem);
- ret = dquot_writeback_dquots(sb, type);
- if (ret)
- goto out;
-
- /*
* Now when everything is written we can discard the pagecache so
* that userspace sees the changes.
*/
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- struct address_space *mapping;
if (type != -1 && cnt != type)
continue;
- if (!sb_has_quota_active(sb, cnt))
- continue;
- mapping = dqopt->files[cnt]->i_mapping;
+ if (!sb_has_quota_active(sb, type))
+ return 0;
- ret = filemap_fdatawrite(mapping);
- if (ret)
- goto out;
+ inode_lock(dqopt->files[cnt]);
- /* if we are using journalled quota */
- if (is_journalled_quota(sbi))
- continue;
+ /*
+ * do_quotactl
+ * f2fs_quota_sync
+ * down_read(quota_sem)
+ * dquot_writeback_dquots()
+ * f2fs_dquot_commit
+ * block_operation
+ * down_read(quota_sem)
+ */
+ f2fs_lock_op(sbi);
+ down_read(&sbi->quota_sem);
- ret = filemap_fdatawait(mapping);
- if (ret)
- set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
+ ret = f2fs_quota_sync_file(sbi, cnt);
+
+ up_read(&sbi->quota_sem);
+ f2fs_unlock_op(sbi);
- inode_lock(dqopt->files[cnt]);
- truncate_inode_pages(&dqopt->files[cnt]->i_data, 0);
inode_unlock(dqopt->files[cnt]);
+
+ if (ret)
+ break;
}
-out:
- if (ret)
- set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
- up_read(&sbi->quota_sem);
- f2fs_unlock_op(sbi);
return ret;
}
@@ -3208,11 +3306,13 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
return -EFSCORRUPTED;
}
- if (le32_to_cpu(raw_super->cp_payload) >
- (blocks_per_seg - F2FS_CP_PACKS)) {
- f2fs_info(sbi, "Insane cp_payload (%u > %u)",
+ if (le32_to_cpu(raw_super->cp_payload) >=
+ (blocks_per_seg - F2FS_CP_PACKS -
+ NR_CURSEG_PERSIST_TYPE)) {
+ f2fs_info(sbi, "Insane cp_payload (%u >= %u)",
le32_to_cpu(raw_super->cp_payload),
- blocks_per_seg - F2FS_CP_PACKS);
+ blocks_per_seg - F2FS_CP_PACKS -
+ NR_CURSEG_PERSIST_TYPE);
return -EFSCORRUPTED;
}
@@ -3248,6 +3348,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi)
unsigned int cp_pack_start_sum, cp_payload;
block_t user_block_count, valid_user_blocks;
block_t avail_node_count, valid_node_count;
+ unsigned int nat_blocks, nat_bits_bytes, nat_bits_blocks;
int i, j;
total = le32_to_cpu(raw_super->segment_count);
@@ -3378,6 +3479,17 @@ skip_cross:
return 1;
}
+ nat_blocks = nat_segs << log_blocks_per_seg;
+ nat_bits_bytes = nat_blocks / BITS_PER_BYTE;
+ nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8);
+ if (__is_set_ckpt_flags(ckpt, CP_NAT_BITS_FLAG) &&
+ (cp_payload + F2FS_CP_PACKS +
+ NR_CURSEG_PERSIST_TYPE + nat_bits_blocks >= blocks_per_seg)) {
+		f2fs_warn(sbi, "Insane cp_payload: %u, nat_bits_blocks: %u",
+ cp_payload, nat_bits_blocks);
+ return -EFSCORRUPTED;
+ }
+
if (unlikely(f2fs_cp_error(sbi))) {
f2fs_err(sbi, "A bug case: need to run fsck");
return 1;
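
The nat_bits space check added above can be spot-checked with a quick calculation. A minimal sketch, assuming 4KB blocks so that F2FS_BLK_ALIGN() rounds a byte count up to whole blocks (the helper below is illustrative only, not part of the patch):

	/*
	 * Blocks needed to persist the NAT full/empty bitmaps in the
	 * checkpoint area: two bitmaps of one bit per NAT block, plus the
	 * 8 extra bytes in the formula, rounded up to 4KB blocks.
	 */
	static unsigned int nat_bits_blocks_needed(unsigned int nat_segs,
						   unsigned int log_blocks_per_seg)
	{
		unsigned int nat_blocks = nat_segs << log_blocks_per_seg;
		unsigned int nat_bits_bytes = nat_blocks / 8;	/* BITS_PER_BYTE */

		return ((nat_bits_bytes << 1) + 8 + 4096 - 1) / 4096;
	}

With 8 NAT segments of 512 blocks each this gives DIV_ROUND_UP(2 * 512 + 8, 4096) = 1 block, so the new check only trips when cp_payload already consumes nearly the whole segment.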
@@ -3410,6 +3522,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->next_victim_seg[FG_GC] = NULL_SEGNO;
sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH;
sbi->migration_granularity = sbi->segs_per_sec;
+ sbi->seq_file_ra_mul = MIN_RA_MUL;
sbi->dir_level = DEF_DIR_LEVEL;
sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL;
@@ -3769,7 +3882,8 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi)
/* adjust parameters according to the volume size */
if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) {
F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE;
- sm_i->dcc_info->discard_granularity = 1;
+ if (f2fs_block_unit_discard(sbi))
+ sm_i->dcc_info->discard_granularity = 1;
sm_i->ipu_policy = 1 << F2FS_IPU_FORCE;
}
@@ -3890,11 +4004,6 @@ try_onemore:
set_sbi_flag(sbi, SBI_POR_DOING);
spin_lock_init(&sbi->stat_lock);
- /* init iostat info */
- spin_lock_init(&sbi->iostat_lock);
- sbi->iostat_enable = false;
- sbi->iostat_period_ms = DEFAULT_IOSTAT_PERIOD_MS;
-
for (i = 0; i < NR_PAGE_TYPE; i++) {
int n = (i == META) ? 1 : NR_TEMP_TYPE;
int j;
@@ -3925,10 +4034,14 @@ try_onemore:
init_waitqueue_head(&sbi->cp_wait);
init_sb_info(sbi);
- err = init_percpu_info(sbi);
+ err = f2fs_init_iostat(sbi);
if (err)
goto free_bio_info;
+ err = init_percpu_info(sbi);
+ if (err)
+ goto free_iostat;
+
if (F2FS_IO_ALIGNED(sbi)) {
sbi->write_io_dummy =
mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0);
@@ -4260,6 +4373,8 @@ free_io_dummy:
mempool_destroy(sbi->write_io_dummy);
free_percpu:
destroy_percpu_info(sbi);
+free_iostat:
+ f2fs_destroy_iostat(sbi);
free_bio_info:
for (i = 0; i < NR_PAGE_TYPE; i++)
kvfree(sbi->write_io[i]);
@@ -4402,9 +4517,12 @@ static int __init init_f2fs_fs(void)
err = f2fs_init_post_read_processing();
if (err)
goto free_root_stats;
- err = f2fs_init_bio_entry_cache();
+ err = f2fs_init_iostat_processing();
if (err)
goto free_post_read;
+ err = f2fs_init_bio_entry_cache();
+ if (err)
+ goto free_iostat;
err = f2fs_init_bioset();
if (err)
goto free_bio_enrty_cache;
@@ -4426,6 +4544,8 @@ free_bioset:
f2fs_destroy_bioset();
free_bio_enrty_cache:
f2fs_destroy_bio_entry_cache();
+free_iostat:
+ f2fs_destroy_iostat_processing();
free_post_read:
f2fs_destroy_post_read_processing();
free_root_stats:
@@ -4460,6 +4580,7 @@ static void __exit exit_f2fs_fs(void)
f2fs_destroy_compress_mempool();
f2fs_destroy_bioset();
f2fs_destroy_bio_entry_cache();
+ f2fs_destroy_iostat_processing();
f2fs_destroy_post_read_processing();
f2fs_destroy_root_stats();
unregister_filesystem(&f2fs_fs_type);
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 6642246206bd..a32fe31c33b8 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -17,6 +17,7 @@
#include "f2fs.h"
#include "segment.h"
#include "gc.h"
+#include "iostat.h"
#include <trace/events/f2fs.h>
static struct proc_dir_entry *f2fs_proc_root;
@@ -307,6 +308,14 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
return sysfs_emit(buf, "%u\n", sbi->compr_new_inode);
#endif
+ if (!strcmp(a->attr.name, "gc_segment_mode"))
+ return sysfs_emit(buf, "%u\n", sbi->gc_segment_mode);
+
+ if (!strcmp(a->attr.name, "gc_reclaimed_segments")) {
+ return sysfs_emit(buf, "%u\n",
+ sbi->gc_reclaimed_segs[sbi->gc_segment_mode]);
+ }
+
ui = (unsigned int *)(ptr + a->offset);
return sprintf(buf, "%u\n", *ui);
@@ -343,7 +352,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a,
set = false;
}
- if (strlen(name) >= F2FS_EXTENSION_LEN)
+ if (!strlen(name) || strlen(name) >= F2FS_EXTENSION_LEN)
return -EINVAL;
down_write(&sbi->sb_lock);
@@ -378,7 +387,7 @@ out:
ret = kstrtol(name, 10, &data);
if (ret)
return ret;
- if (data >= IOPRIO_BE_NR || data < 0)
+ if (data >= IOPRIO_NR_LEVELS || data < 0)
return -EINVAL;
cprc->ckpt_thread_ioprio = IOPRIO_PRIO_VALUE(class, data);
@@ -420,6 +429,8 @@ out:
if (!strcmp(a->attr.name, "discard_granularity")) {
if (t == 0 || t > MAX_PLIST_NUM)
return -EINVAL;
+ if (!f2fs_block_unit_discard(sbi))
+ return -EINVAL;
if (t == *ui)
return count;
*ui = t;
@@ -467,6 +478,7 @@ out:
return count;
}
+#ifdef CONFIG_F2FS_IOSTAT
if (!strcmp(a->attr.name, "iostat_enable")) {
sbi->iostat_enable = !!t;
if (!sbi->iostat_enable)
@@ -482,6 +494,7 @@ out:
spin_unlock(&sbi->iostat_lock);
return count;
}
+#endif
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (!strcmp(a->attr.name, "compr_written_block") ||
@@ -515,6 +528,29 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "gc_segment_mode")) {
+ if (t < MAX_GC_MODE)
+ sbi->gc_segment_mode = t;
+ else
+ return -EINVAL;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "gc_reclaimed_segments")) {
+ if (t != 0)
+ return -EINVAL;
+ sbi->gc_reclaimed_segs[sbi->gc_segment_mode] = 0;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "seq_file_ra_mul")) {
+ if (t >= MIN_RA_MUL && t <= MAX_RA_MUL)
+ sbi->seq_file_ra_mul = t;
+ else
+ return -EINVAL;
+ return count;
+ }
+
*ui = (unsigned int)t;
return count;
@@ -667,8 +703,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, discard_idle_interval,
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle_interval, interval_time[GC_TIME]);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info,
umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]);
+#ifdef CONFIG_F2FS_IOSTAT
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_period_ms, iostat_period_ms);
+#endif
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_io_bytes, max_io_bytes);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold);
@@ -740,6 +778,10 @@ F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_count, max_candidate_cou
F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_weight, age_weight);
F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_threshold, age_threshold);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, seq_file_ra_mul, seq_file_ra_mul);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_segment_mode, gc_segment_mode);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_reclaimed_segments, gc_reclaimed_segs);
+
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
ATTR_LIST(gc_urgent_sleep_time),
@@ -770,8 +812,10 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(discard_idle_interval),
ATTR_LIST(gc_idle_interval),
ATTR_LIST(umount_discard_timeout),
+#ifdef CONFIG_F2FS_IOSTAT
ATTR_LIST(iostat_enable),
ATTR_LIST(iostat_period_ms),
+#endif
ATTR_LIST(readdir_ra),
ATTR_LIST(max_io_bytes),
ATTR_LIST(gc_pin_file_thresh),
@@ -812,6 +856,9 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(atgc_candidate_count),
ATTR_LIST(atgc_age_weight),
ATTR_LIST(atgc_age_threshold),
+ ATTR_LIST(seq_file_ra_mul),
+ ATTR_LIST(gc_segment_mode),
+ ATTR_LIST(gc_reclaimed_segments),
NULL,
};
ATTRIBUTE_GROUPS(f2fs);
@@ -1036,101 +1083,6 @@ static int __maybe_unused segment_bits_seq_show(struct seq_file *seq,
return 0;
}
-void f2fs_record_iostat(struct f2fs_sb_info *sbi)
-{
- unsigned long long iostat_diff[NR_IO_TYPE];
- int i;
-
- if (time_is_after_jiffies(sbi->iostat_next_period))
- return;
-
- /* Need double check under the lock */
- spin_lock(&sbi->iostat_lock);
- if (time_is_after_jiffies(sbi->iostat_next_period)) {
- spin_unlock(&sbi->iostat_lock);
- return;
- }
- sbi->iostat_next_period = jiffies +
- msecs_to_jiffies(sbi->iostat_period_ms);
-
- for (i = 0; i < NR_IO_TYPE; i++) {
- iostat_diff[i] = sbi->rw_iostat[i] -
- sbi->prev_rw_iostat[i];
- sbi->prev_rw_iostat[i] = sbi->rw_iostat[i];
- }
- spin_unlock(&sbi->iostat_lock);
-
- trace_f2fs_iostat(sbi, iostat_diff);
-}
-
-static int __maybe_unused iostat_info_seq_show(struct seq_file *seq,
- void *offset)
-{
- struct super_block *sb = seq->private;
- struct f2fs_sb_info *sbi = F2FS_SB(sb);
- time64_t now = ktime_get_real_seconds();
-
- if (!sbi->iostat_enable)
- return 0;
-
- seq_printf(seq, "time: %-16llu\n", now);
-
- /* print app write IOs */
- seq_puts(seq, "[WRITE]\n");
- seq_printf(seq, "app buffered: %-16llu\n",
- sbi->rw_iostat[APP_BUFFERED_IO]);
- seq_printf(seq, "app direct: %-16llu\n",
- sbi->rw_iostat[APP_DIRECT_IO]);
- seq_printf(seq, "app mapped: %-16llu\n",
- sbi->rw_iostat[APP_MAPPED_IO]);
-
- /* print fs write IOs */
- seq_printf(seq, "fs data: %-16llu\n",
- sbi->rw_iostat[FS_DATA_IO]);
- seq_printf(seq, "fs node: %-16llu\n",
- sbi->rw_iostat[FS_NODE_IO]);
- seq_printf(seq, "fs meta: %-16llu\n",
- sbi->rw_iostat[FS_META_IO]);
- seq_printf(seq, "fs gc data: %-16llu\n",
- sbi->rw_iostat[FS_GC_DATA_IO]);
- seq_printf(seq, "fs gc node: %-16llu\n",
- sbi->rw_iostat[FS_GC_NODE_IO]);
- seq_printf(seq, "fs cp data: %-16llu\n",
- sbi->rw_iostat[FS_CP_DATA_IO]);
- seq_printf(seq, "fs cp node: %-16llu\n",
- sbi->rw_iostat[FS_CP_NODE_IO]);
- seq_printf(seq, "fs cp meta: %-16llu\n",
- sbi->rw_iostat[FS_CP_META_IO]);
-
- /* print app read IOs */
- seq_puts(seq, "[READ]\n");
- seq_printf(seq, "app buffered: %-16llu\n",
- sbi->rw_iostat[APP_BUFFERED_READ_IO]);
- seq_printf(seq, "app direct: %-16llu\n",
- sbi->rw_iostat[APP_DIRECT_READ_IO]);
- seq_printf(seq, "app mapped: %-16llu\n",
- sbi->rw_iostat[APP_MAPPED_READ_IO]);
-
- /* print fs read IOs */
- seq_printf(seq, "fs data: %-16llu\n",
- sbi->rw_iostat[FS_DATA_READ_IO]);
- seq_printf(seq, "fs gc data: %-16llu\n",
- sbi->rw_iostat[FS_GDATA_READ_IO]);
- seq_printf(seq, "fs compr_data: %-16llu\n",
- sbi->rw_iostat[FS_CDATA_READ_IO]);
- seq_printf(seq, "fs node: %-16llu\n",
- sbi->rw_iostat[FS_NODE_READ_IO]);
- seq_printf(seq, "fs meta: %-16llu\n",
- sbi->rw_iostat[FS_META_READ_IO]);
-
- /* print other IOs */
- seq_puts(seq, "[OTHER]\n");
- seq_printf(seq, "fs discard: %-16llu\n",
- sbi->rw_iostat[FS_DISCARD]);
-
- return 0;
-}
-
static int __maybe_unused victim_bits_seq_show(struct seq_file *seq,
void *offset)
{
@@ -1213,13 +1165,15 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi)
sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
if (sbi->s_proc) {
- proc_create_single_data("segment_info", S_IRUGO, sbi->s_proc,
+ proc_create_single_data("segment_info", 0444, sbi->s_proc,
segment_info_seq_show, sb);
- proc_create_single_data("segment_bits", S_IRUGO, sbi->s_proc,
+ proc_create_single_data("segment_bits", 0444, sbi->s_proc,
segment_bits_seq_show, sb);
- proc_create_single_data("iostat_info", S_IRUGO, sbi->s_proc,
+#ifdef CONFIG_F2FS_IOSTAT
+ proc_create_single_data("iostat_info", 0444, sbi->s_proc,
iostat_info_seq_show, sb);
- proc_create_single_data("victim_bits", S_IRUGO, sbi->s_proc,
+#endif
+ proc_create_single_data("victim_bits", 0444, sbi->s_proc,
victim_bits_seq_show, sb);
}
return 0;
@@ -1238,7 +1192,9 @@ put_sb_kobj:
void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi)
{
if (sbi->s_proc) {
+#ifdef CONFIG_F2FS_IOSTAT
remove_proc_entry("iostat_info", sbi->s_proc);
+#endif
remove_proc_entry("segment_info", sbi->s_proc);
remove_proc_entry("segment_bits", sbi->s_proc);
remove_proc_entry("victim_bits", sbi->s_proc);
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index c8f34decbf8e..1d2d29dcd41c 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -27,7 +27,8 @@ static void *xattr_alloc(struct f2fs_sb_info *sbi, int size, bool *is_inline)
{
if (likely(size == sbi->inline_xattr_slab_size)) {
*is_inline = true;
- return kmem_cache_zalloc(sbi->inline_xattr_slab, GFP_NOFS);
+ return f2fs_kmem_cache_alloc(sbi->inline_xattr_slab,
+ GFP_F2FS_ZERO, false, sbi);
}
*is_inline = false;
return f2fs_kzalloc(sbi, size, GFP_NOFS);
diff --git a/fs/fat/.kunitconfig b/fs/fat/.kunitconfig
new file mode 100644
index 000000000000..0a6971dbeccb
--- /dev/null
+++ b/fs/fat/.kunitconfig
@@ -0,0 +1,5 @@
+CONFIG_KUNIT=y
+CONFIG_FAT_FS=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_FAT_KUNIT_TEST=y
diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig
index 66532a71e8fd..238cc55f84c4 100644
--- a/fs/fat/Kconfig
+++ b/fs/fat/Kconfig
@@ -77,7 +77,7 @@ config VFAT_FS
config FAT_DEFAULT_CODEPAGE
int "Default codepage for FAT"
- depends on MSDOS_FS || VFAT_FS
+ depends on FAT_FS
default 437
help
This option should be set to the codepage of your FAT filesystems.
@@ -115,3 +115,15 @@ config FAT_DEFAULT_UTF8
Say Y if you use UTF-8 encoding for file names, N otherwise.
See <file:Documentation/filesystems/vfat.rst> for more information.
+
+config FAT_KUNIT_TEST
+ tristate "Unit Tests for FAT filesystems" if !KUNIT_ALL_TESTS
+ depends on KUNIT && FAT_FS
+ default KUNIT_ALL_TESTS
+ help
+	  This builds the FAT KUnit tests.
+
+ For more information on KUnit and unit tests in general, please refer
+ to the KUnit documentation in Documentation/dev-tools/kunit
+
+	  If unsure, say N.
diff --git a/fs/fat/Makefile b/fs/fat/Makefile
index 70645ce2f7fc..2b034112690d 100644
--- a/fs/fat/Makefile
+++ b/fs/fat/Makefile
@@ -10,3 +10,5 @@ obj-$(CONFIG_MSDOS_FS) += msdos.o
fat-y := cache.o dir.o fatent.o file.o inode.o misc.o nfs.o
vfat-y := namei_vfat.o
msdos-y := namei_msdos.o
+
+obj-$(CONFIG_FAT_KUNIT_TEST) += fat_test.o
diff --git a/fs/fat/fat_test.c b/fs/fat/fat_test.c
new file mode 100644
index 000000000000..2dab4ca1d0d8
--- /dev/null
+++ b/fs/fat/fat_test.c
@@ -0,0 +1,196 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit tests for FAT filesystems.
+ *
+ * Copyright (C) 2020 Google LLC.
+ * Author: David Gow <davidgow@google.com>
+ */
+
+#include <kunit/test.h>
+
+#include "fat.h"
+
+static void fat_checksum_test(struct kunit *test)
+{
+ /* With no extension. */
+ KUNIT_EXPECT_EQ(test, fat_checksum("VMLINUX "), (u8)44);
+ /* With 3-letter extension. */
+ KUNIT_EXPECT_EQ(test, fat_checksum("README TXT"), (u8)115);
+ /* With short (1-letter) extension. */
+ KUNIT_EXPECT_EQ(test, fat_checksum("ABCDEFGHA "), (u8)98);
+}
+
+struct fat_timestamp_testcase {
+ const char *name;
+ struct timespec64 ts;
+ __le16 time;
+ __le16 date;
+ u8 cs;
+ int time_offset;
+};
+
+static struct fat_timestamp_testcase time_test_cases[] = {
+ {
+ .name = "Earliest possible UTC (1980-01-01 00:00:00)",
+ .ts = {.tv_sec = 315532800LL, .tv_nsec = 0L},
+ .time = cpu_to_le16(0),
+ .date = cpu_to_le16(33),
+ .cs = 0,
+ .time_offset = 0,
+ },
+ {
+ .name = "Latest possible UTC (2107-12-31 23:59:58)",
+ .ts = {.tv_sec = 4354819198LL, .tv_nsec = 0L},
+ .time = cpu_to_le16(49021),
+ .date = cpu_to_le16(65439),
+ .cs = 0,
+ .time_offset = 0,
+ },
+ {
+ .name = "Earliest possible (UTC-11) (== 1979-12-31 13:00:00 UTC)",
+ .ts = {.tv_sec = 315493200LL, .tv_nsec = 0L},
+ .time = cpu_to_le16(0),
+ .date = cpu_to_le16(33),
+ .cs = 0,
+ .time_offset = 11 * 60,
+ },
+ {
+ .name = "Latest possible (UTC+11) (== 2108-01-01 10:59:58 UTC)",
+ .ts = {.tv_sec = 4354858798LL, .tv_nsec = 0L},
+ .time = cpu_to_le16(49021),
+ .date = cpu_to_le16(65439),
+ .cs = 0,
+ .time_offset = -11 * 60,
+ },
+ {
+ .name = "Leap Day / Year (1996-02-29 00:00:00)",
+ .ts = {.tv_sec = 825552000LL, .tv_nsec = 0L},
+ .time = cpu_to_le16(0),
+ .date = cpu_to_le16(8285),
+ .cs = 0,
+ .time_offset = 0,
+ },
+ {
+ .name = "Year 2000 is leap year (2000-02-29 00:00:00)",
+ .ts = {.tv_sec = 951782400LL, .tv_nsec = 0L},
+ .time = cpu_to_le16(0),
+ .date = cpu_to_le16(10333),
+ .cs = 0,
+ .time_offset = 0,
+ },
+ {
+ .name = "Year 2100 not leap year (2100-03-01 00:00:00)",
+ .ts = {.tv_sec = 4107542400LL, .tv_nsec = 0L},
+ .time = cpu_to_le16(0),
+ .date = cpu_to_le16(61537),
+ .cs = 0,
+ .time_offset = 0,
+ },
+ {
+ .name = "Leap year + timezone UTC+1 (== 2004-02-29 00:30:00 UTC)",
+ .ts = {.tv_sec = 1078014600LL, .tv_nsec = 0L},
+ .time = cpu_to_le16(48064),
+ .date = cpu_to_le16(12380),
+ .cs = 0,
+ .time_offset = -60,
+ },
+ {
+ .name = "Leap year + timezone UTC-1 (== 2004-02-29 23:30:00 UTC)",
+ .ts = {.tv_sec = 1078097400LL, .tv_nsec = 0L},
+ .time = cpu_to_le16(960),
+ .date = cpu_to_le16(12385),
+ .cs = 0,
+ .time_offset = 60,
+ },
+ {
+ .name = "VFAT odd-second resolution (1999-12-31 23:59:59)",
+ .ts = {.tv_sec = 946684799LL, .tv_nsec = 0L},
+ .time = cpu_to_le16(49021),
+ .date = cpu_to_le16(10143),
+ .cs = 100,
+ .time_offset = 0,
+ },
+ {
+ .name = "VFAT 10ms resolution (1980-01-01 00:00:00:0010)",
+ .ts = {.tv_sec = 315532800LL, .tv_nsec = 10000000L},
+ .time = cpu_to_le16(0),
+ .date = cpu_to_le16(33),
+ .cs = 1,
+ .time_offset = 0,
+ },
+};
+
+static void time_testcase_desc(struct fat_timestamp_testcase *t,
+ char *desc)
+{
+ strscpy(desc, t->name, KUNIT_PARAM_DESC_SIZE);
+}
+
+KUNIT_ARRAY_PARAM(fat_time, time_test_cases, time_testcase_desc);
+
+static void fat_time_fat2unix_test(struct kunit *test)
+{
+ static struct msdos_sb_info fake_sb;
+ struct timespec64 ts;
+ struct fat_timestamp_testcase *testcase =
+ (struct fat_timestamp_testcase *)test->param_value;
+
+ fake_sb.options.tz_set = 1;
+ fake_sb.options.time_offset = testcase->time_offset;
+
+ fat_time_fat2unix(&fake_sb, &ts,
+ testcase->time,
+ testcase->date,
+ testcase->cs);
+ KUNIT_EXPECT_EQ_MSG(test,
+ testcase->ts.tv_sec,
+ ts.tv_sec,
+ "Timestamp mismatch (seconds)\n");
+ KUNIT_EXPECT_EQ_MSG(test,
+ testcase->ts.tv_nsec,
+ ts.tv_nsec,
+ "Timestamp mismatch (nanoseconds)\n");
+}
+
+static void fat_time_unix2fat_test(struct kunit *test)
+{
+ static struct msdos_sb_info fake_sb;
+ __le16 date, time;
+ u8 cs;
+ struct fat_timestamp_testcase *testcase =
+ (struct fat_timestamp_testcase *)test->param_value;
+
+ fake_sb.options.tz_set = 1;
+ fake_sb.options.time_offset = testcase->time_offset;
+
+ fat_time_unix2fat(&fake_sb, &testcase->ts,
+ &time, &date, &cs);
+ KUNIT_EXPECT_EQ_MSG(test,
+ le16_to_cpu(testcase->time),
+ le16_to_cpu(time),
+ "Time mismatch\n");
+ KUNIT_EXPECT_EQ_MSG(test,
+ le16_to_cpu(testcase->date),
+ le16_to_cpu(date),
+ "Date mismatch\n");
+ KUNIT_EXPECT_EQ_MSG(test,
+ testcase->cs,
+ cs,
+ "Centisecond mismatch\n");
+}
+
+static struct kunit_case fat_test_cases[] = {
+ KUNIT_CASE(fat_checksum_test),
+ KUNIT_CASE_PARAM(fat_time_fat2unix_test, fat_time_gen_params),
+ KUNIT_CASE_PARAM(fat_time_unix2fat_test, fat_time_gen_params),
+ {},
+};
+
+static struct kunit_suite fat_test_suite = {
+ .name = "fat_test",
+ .test_cases = fat_test_cases,
+};
+
+kunit_test_suites(&fat_test_suite);
+
+MODULE_LICENSE("GPL v2");
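
The raw values in time_test_cases follow the on-disk FAT layout: the 16-bit date packs year-since-1980/month/day into bits 15-9/8-5/4-0, and the 16-bit time packs hours/minutes/2-second counts into bits 15-11/10-5/4-0. A small stand-alone decoder (a hypothetical userspace helper, not part of this patch) confirms the first and last test cases:

	#include <stdio.h>
	#include <stdint.h>

	/* Decode a FAT date/time pair into a readable timestamp. */
	static void fat_decode(uint16_t date, uint16_t time)
	{
		printf("%04u-%02u-%02u %02u:%02u:%02u\n",
		       1980 + (date >> 9), (date >> 5) & 0xf, date & 0x1f,
		       time >> 11, (time >> 5) & 0x3f, (time & 0x1f) * 2);
	}

	int main(void)
	{
		fat_decode(33, 0);		/* 1980-01-01 00:00:00 */
		fat_decode(65439, 49021);	/* 2107-12-31 23:59:58 */
		return 0;
	}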
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 860e884e56e8..978ac6751aeb 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -5,6 +5,7 @@
#include <linux/blkdev.h>
#include <linux/sched/signal.h>
+#include <linux/backing-dev-defs.h>
#include "fat.h"
struct fatent_operations {
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 18a50a46b57f..91ca3c304211 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -230,6 +230,9 @@ void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts,
}
}
+/* Export fat_time_fat2unix() for the fat_test KUnit tests. */
+EXPORT_SYMBOL_GPL(fat_time_fat2unix);
+
/* Convert linear UNIX date to a FAT time/date pair. */
void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts,
__le16 *time, __le16 *date, u8 *time_cs)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index f946bec8f1f1..9c6c6a3e2de5 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -150,7 +150,8 @@ void f_delown(struct file *filp)
pid_t f_getown(struct file *filp)
{
pid_t pid = 0;
- read_lock(&filp->f_owner.lock);
+
+ read_lock_irq(&filp->f_owner.lock);
rcu_read_lock();
if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) {
pid = pid_vnr(filp->f_owner.pid);
@@ -158,7 +159,7 @@ pid_t f_getown(struct file *filp)
pid = -pid;
}
rcu_read_unlock();
- read_unlock(&filp->f_owner.lock);
+ read_unlock_irq(&filp->f_owner.lock);
return pid;
}
@@ -208,7 +209,7 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
struct f_owner_ex owner = {};
int ret = 0;
- read_lock(&filp->f_owner.lock);
+ read_lock_irq(&filp->f_owner.lock);
rcu_read_lock();
if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type))
owner.pid = pid_vnr(filp->f_owner.pid);
@@ -231,7 +232,7 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
ret = -EINVAL;
break;
}
- read_unlock(&filp->f_owner.lock);
+ read_unlock_irq(&filp->f_owner.lock);
if (!ret) {
ret = copy_to_user(owner_p, &owner, sizeof(owner));
@@ -249,10 +250,10 @@ static int f_getowner_uids(struct file *filp, unsigned long arg)
uid_t src[2];
int err;
- read_lock(&filp->f_owner.lock);
+ read_lock_irq(&filp->f_owner.lock);
src[0] = from_kuid(user_ns, filp->f_owner.uid);
src[1] = from_kuid(user_ns, filp->f_owner.euid);
- read_unlock(&filp->f_owner.lock);
+ read_unlock_irq(&filp->f_owner.lock);
err = put_user(src[0], &dst[0]);
err |= put_user(src[1], &dst[1]);
@@ -1003,13 +1004,14 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
{
while (fa) {
struct fown_struct *fown;
+ unsigned long flags;
if (fa->magic != FASYNC_MAGIC) {
printk(KERN_ERR "kill_fasync: bad magic number in "
"fasync_struct!\n");
return;
}
- read_lock(&fa->fa_lock);
+ read_lock_irqsave(&fa->fa_lock, flags);
if (fa->fa_file) {
fown = &fa->fa_file->f_owner;
/* Don't send SIGURG to processes which have not set a
@@ -1018,7 +1020,7 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
if (!(sig == SIGURG && fown->signum == 0))
send_sigio(fown, fa->fa_fd, band);
}
- read_unlock(&fa->fa_lock);
+ read_unlock_irqrestore(&fa->fa_lock, flags);
fa = rcu_dereference(fa->fa_next);
}
}
@@ -1049,7 +1051,8 @@ static int __init fcntl_init(void)
__FMODE_EXEC | __FMODE_NONOTIFY));
fasync_cache = kmem_cache_create("fasync_cache",
- sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL);
+ sizeof(struct fasync_struct), 0,
+ SLAB_PANIC | SLAB_ACCOUNT, NULL);
return 0;
}
diff --git a/fs/file.c b/fs/file.c
index 86dc9956af32..d8afa8266859 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -596,18 +596,32 @@ void fd_install(unsigned int fd, struct file *file)
EXPORT_SYMBOL(fd_install);
+/**
+ * pick_file - return file associated with fd
+ * @files: file struct to retrieve file from
+ * @fd: file descriptor to retrieve file for
+ *
+ * If this function returns an EINVAL error pointer, the fd was beyond the
+ * current maximum number of file descriptors for that fdtable.
+ *
+ * Returns: The file associated with @fd, or an error pointer on failure.
+ */
static struct file *pick_file(struct files_struct *files, unsigned fd)
{
- struct file *file = NULL;
+ struct file *file;
struct fdtable *fdt;
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
- if (fd >= fdt->max_fds)
+ if (fd >= fdt->max_fds) {
+ file = ERR_PTR(-EINVAL);
goto out_unlock;
+ }
file = fdt->fd[fd];
- if (!file)
+ if (!file) {
+ file = ERR_PTR(-EBADF);
goto out_unlock;
+ }
rcu_assign_pointer(fdt->fd[fd], NULL);
__put_unused_fd(files, fd);
@@ -622,7 +636,7 @@ int close_fd(unsigned fd)
struct file *file;
file = pick_file(files, fd);
- if (!file)
+ if (IS_ERR(file))
return -EBADF;
return filp_close(file, files);
@@ -663,11 +677,16 @@ static inline void __range_close(struct files_struct *cur_fds, unsigned int fd,
struct file *file;
file = pick_file(cur_fds, fd++);
- if (!file)
+ if (!IS_ERR(file)) {
+ /* found a valid file to close */
+ filp_close(file, cur_fds);
+ cond_resched();
continue;
+ }
- filp_close(file, cur_fds);
- cond_resched();
+ /* beyond the last fd in that table */
+ if (PTR_ERR(file) == -EINVAL)
+ return;
}
}
@@ -682,7 +701,6 @@ static inline void __range_close(struct files_struct *cur_fds, unsigned int fd,
*/
int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
{
- unsigned int cur_max;
struct task_struct *me = current;
struct files_struct *cur_fds = me->files, *fds = NULL;
@@ -692,26 +710,26 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
if (fd > max_fd)
return -EINVAL;
- rcu_read_lock();
- cur_max = files_fdtable(cur_fds)->max_fds;
- rcu_read_unlock();
-
- /* cap to last valid index into fdtable */
- cur_max--;
-
if (flags & CLOSE_RANGE_UNSHARE) {
int ret;
unsigned int max_unshare_fds = NR_OPEN_MAX;
/*
- * If the requested range is greater than the current maximum,
- * we're closing everything so only copy all file descriptors
- * beneath the lowest file descriptor.
- * If the caller requested all fds to be made cloexec copy all
- * of the file descriptors since they still want to use them.
+ * If the caller requested all fds to be made cloexec we always
+ * copy all of the file descriptors since they still want to
+ * use them.
*/
- if (!(flags & CLOSE_RANGE_CLOEXEC) && (max_fd >= cur_max))
- max_unshare_fds = fd;
+ if (!(flags & CLOSE_RANGE_CLOEXEC)) {
+ /*
+ * If the requested range is greater than the current
+ * maximum, we're closing everything so only copy all
+ * file descriptors beneath the lowest file descriptor.
+ */
+ rcu_read_lock();
+ if (max_fd >= last_fd(files_fdtable(cur_fds)))
+ max_unshare_fds = fd;
+ rcu_read_unlock();
+ }
ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds);
if (ret)
@@ -725,8 +743,6 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
swap(cur_fds, fds);
}
- max_fd = min(max_fd, cur_max);
-
if (flags & CLOSE_RANGE_CLOEXEC)
__range_cloexec(cur_fds, fd, max_fd);
else
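
For context, the fd-table copy whose sizing is reworked above is only taken on the CLOSE_RANGE_UNSHARE path of the close_range() syscall. A minimal userspace sketch, assuming glibc 2.34+ which exposes the wrapper and flags via <unistd.h> (older toolchains need syscall(__NR_close_range, ...) and <linux/close_range.h>):

	#define _GNU_SOURCE
	#include <unistd.h>

	/* Close every inherited fd above stderr before exec'ing a helper,
	 * without touching the fd table shared with the parent. */
	static void drop_inherited_fds(void)
	{
		close_range(3, ~0U, CLOSE_RANGE_UNSHARE);
	}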
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 4c3370548982..81ec192ce067 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -406,6 +406,11 @@ static bool inode_do_switch_wbs(struct inode *inode,
inc_wb_stat(new_wb, WB_WRITEBACK);
}
+ if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
+ atomic_dec(&old_wb->writeback_inodes);
+ atomic_inc(&new_wb->writeback_inodes);
+ }
+
wb_get(new_wb);
/*
@@ -1034,20 +1039,20 @@ restart:
* cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs
* @bdi_id: target bdi id
* @memcg_id: target memcg css id
- * @nr: number of pages to write, 0 for best-effort dirty flushing
* @reason: reason why some writeback work initiated
* @done: target wb_completion
*
* Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id
* with the specified parameters.
*/
-int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr,
+int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
enum wb_reason reason, struct wb_completion *done)
{
struct backing_dev_info *bdi;
struct cgroup_subsys_state *memcg_css;
struct bdi_writeback *wb;
struct wb_writeback_work *work;
+ unsigned long dirty;
int ret;
/* lookup bdi and memcg */
@@ -1076,24 +1081,22 @@ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr,
}
/*
- * If @nr is zero, the caller is attempting to write out most of
+ * The caller is attempting to write out most of
* the currently dirty pages. Let's take the current dirty page
* count and inflate it by 25% which should be large enough to
* flush out most dirty pages while avoiding getting livelocked by
* concurrent dirtiers.
+ *
+ * BTW the memcg stats are flushed periodically and this is best-effort
+ * estimation, so some potential error is ok.
*/
- if (!nr) {
- unsigned long filepages, headroom, dirty, writeback;
-
- mem_cgroup_wb_stats(wb, &filepages, &headroom, &dirty,
- &writeback);
- nr = dirty * 10 / 8;
- }
+ dirty = memcg_page_state(mem_cgroup_from_css(memcg_css), NR_FILE_DIRTY);
+ dirty = dirty * 10 / 8;
/* issue the writeback work */
work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
if (work) {
- work->nr_pages = nr;
+ work->nr_pages = dirty;
work->sync_mode = WB_SYNC_NONE;
work->range_cyclic = 1;
work->reason = reason;
@@ -1999,7 +2002,6 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
static long wb_writeback(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
- unsigned long wb_start = jiffies;
long nr_pages = work->nr_pages;
unsigned long dirtied_before = jiffies;
struct inode *inode;
@@ -2053,8 +2055,6 @@ static long wb_writeback(struct bdi_writeback *wb,
progress = __writeback_inodes_wb(wb, work);
trace_writeback_written(wb, work);
- wb_update_bandwidth(wb, wb_start);
-
/*
* Did we write something? Try for more
*
@@ -2730,23 +2730,6 @@ int write_inode_now(struct inode *inode, int sync)
EXPORT_SYMBOL(write_inode_now);
/**
- * sync_inode - write an inode and its pages to disk.
- * @inode: the inode to sync
- * @wbc: controls the writeback mode
- *
- * sync_inode() will write an inode and its pages to disk. It will also
- * correctly update the inode on its superblock's dirty inode lists and will
- * update inode->i_state.
- *
- * The caller must have a ref on the inode.
- */
-int sync_inode(struct inode *inode, struct writeback_control *wbc)
-{
- return writeback_single_inode(inode, wbc);
-}
-EXPORT_SYMBOL(sync_inode);
-
-/**
* sync_inode_metadata - write an inode to disk
* @inode: the inode to sync
* @wait: wait for I/O to complete.
@@ -2762,6 +2745,6 @@ int sync_inode_metadata(struct inode *inode, int wait)
.nr_to_write = 0, /* metadata-only */
};
- return sync_inode(inode, &wbc);
+ return writeback_single_inode(inode, &wbc);
}
EXPORT_SYMBOL(sync_inode_metadata);
diff --git a/fs/fs_context.c b/fs/fs_context.c
index de1985eae535..b7e43a780a62 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -254,7 +254,7 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type,
struct fs_context *fc;
int ret = -ENOMEM;
- fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL);
+ fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL_ACCOUNT);
if (!fc)
return ERR_PTR(-ENOMEM);
@@ -649,7 +649,7 @@ const struct fs_context_operations legacy_fs_context_ops = {
*/
static int legacy_init_fs_context(struct fs_context *fc)
{
- fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL);
+ fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL_ACCOUNT);
if (!fc->fs_private)
return -ENOMEM;
fc->ops = &legacy_fs_context_ops;
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
index 427efa73b9bd..b313a978ae0a 100644
--- a/fs/fscache/Kconfig
+++ b/fs/fscache/Kconfig
@@ -14,6 +14,7 @@ config FSCACHE
config FSCACHE_STATS
bool "Gather statistical information on local caching"
depends on FSCACHE && PROC_FS
+ select NETFS_STATS
help
This option causes statistical information to be gathered on local
caching and exported through file:
@@ -28,23 +29,6 @@ config FSCACHE_STATS
See Documentation/filesystems/caching/fscache.rst for more information.
-config FSCACHE_HISTOGRAM
- bool "Gather latency information on local caching"
- depends on FSCACHE && PROC_FS
- help
- This option causes latency information to be gathered on local
- caching and exported through file:
-
- /proc/fs/fscache/histogram
-
- The generation of this histogram adds a certain amount of overhead to
- execution as there are a number of points at which data is gathered,
- and on a multi-CPU system these may be on cachelines that keep
- bouncing between CPUs. On the other hand, the histogram may be
- useful for debugging purposes. Saying 'N' here is recommended.
-
- See Documentation/filesystems/caching/fscache.rst for more information.
-
config FSCACHE_DEBUG
bool "Debug FS-Cache"
depends on FSCACHE
@@ -54,10 +38,3 @@ config FSCACHE_DEBUG
enabled by setting bits in /sys/modules/fscache/parameter/debug.
See Documentation/filesystems/caching/fscache.rst for more information.
-
-config FSCACHE_OBJECT_LIST
- bool "Maintain global object list for debugging purposes"
- depends on FSCACHE && PROC_FS
- help
- Maintain a global list of active fscache objects that can be
- retrieved through /proc/fs/fscache/objects for debugging purposes
diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile
index 3b2ffa93ac18..03a871d689bb 100644
--- a/fs/fscache/Makefile
+++ b/fs/fscache/Makefile
@@ -16,7 +16,5 @@ fscache-y := \
fscache-$(CONFIG_PROC_FS) += proc.o
fscache-$(CONFIG_FSCACHE_STATS) += stats.o
-fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o
-fscache-$(CONFIG_FSCACHE_OBJECT_LIST) += object-list.o
obj-$(CONFIG_FSCACHE) := fscache.o
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index fcc136361415..bd4f44c1cce0 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -116,7 +116,7 @@ struct fscache_cache *fscache_select_cache_for_object(
cache = NULL;
spin_unlock(&cookie->lock);
- _leave(" = %p [parent]", cache);
+ _leave(" = %s [parent]", cache ? cache->tag->name : "NULL");
return cache;
}
@@ -152,14 +152,14 @@ struct fscache_cache *fscache_select_cache_for_object(
if (test_bit(FSCACHE_IOERROR, &tag->cache->flags))
return NULL;
- _leave(" = %p [specific]", tag->cache);
+ _leave(" = %s [specific]", tag->name);
return tag->cache;
no_preference:
/* netfs has no preference - just select first cache */
cache = list_entry(fscache_cache_list.next,
struct fscache_cache, link);
- _leave(" = %p [first]", cache);
+ _leave(" = %s [first]", cache->tag->name);
return cache;
}
@@ -261,7 +261,6 @@ int fscache_add_cache(struct fscache_cache *cache,
spin_lock(&cache->object_list_lock);
list_add_tail(&ifsdef->cache_link, &cache->object_list);
spin_unlock(&cache->object_list_lock);
- fscache_objlist_add(ifsdef);
/* add the cache's netfs definition index object to the top level index
* cookie as a known backing object */
@@ -270,7 +269,7 @@ int fscache_add_cache(struct fscache_cache *cache,
hlist_add_head(&ifsdef->cookie_link,
&fscache_fsdef_index.backing_objects);
- atomic_inc(&fscache_fsdef_index.usage);
+ refcount_inc(&fscache_fsdef_index.ref);
/* done */
spin_unlock(&fscache_fsdef_index.lock);
@@ -335,7 +334,7 @@ static void fscache_withdraw_all_objects(struct fscache_cache *cache,
struct fscache_object, cache_link);
list_move_tail(&object->cache_link, dying_objects);
- _debug("withdraw %p", object->cookie);
+ _debug("withdraw %x", object->cookie->debug_id);
/* This must be done under object_list_lock to prevent
* a race with fscache_drop_object().
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 751bc5b1cddf..cd42be646ed3 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -19,6 +19,8 @@ static atomic_t fscache_object_debug_id = ATOMIC_INIT(0);
#define fscache_cookie_hash_shift 15
static struct hlist_bl_head fscache_cookie_hash[1 << fscache_cookie_hash_shift];
+static LIST_HEAD(fscache_cookies);
+static DEFINE_RWLOCK(fscache_cookies_lock);
static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie,
loff_t object_size);
@@ -29,21 +31,29 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
static void fscache_print_cookie(struct fscache_cookie *cookie, char prefix)
{
- struct hlist_node *object;
+ struct fscache_object *object;
+ struct hlist_node *o;
const u8 *k;
unsigned loop;
- pr_err("%c-cookie c=%p [p=%p fl=%lx nc=%u na=%u]\n",
- prefix, cookie, cookie->parent, cookie->flags,
+ pr_err("%c-cookie c=%08x [p=%08x fl=%lx nc=%u na=%u]\n",
+ prefix,
+ cookie->debug_id,
+ cookie->parent ? cookie->parent->debug_id : 0,
+ cookie->flags,
atomic_read(&cookie->n_children),
atomic_read(&cookie->n_active));
- pr_err("%c-cookie d=%p n=%p\n",
- prefix, cookie->def, cookie->netfs_data);
-
- object = READ_ONCE(cookie->backing_objects.first);
- if (object)
- pr_err("%c-cookie o=%p\n",
- prefix, hlist_entry(object, struct fscache_object, cookie_link));
+ pr_err("%c-cookie d=%p{%s} n=%p\n",
+ prefix,
+ cookie->def,
+ cookie->def ? cookie->def->name : "?",
+ cookie->netfs_data);
+
+ o = READ_ONCE(cookie->backing_objects.first);
+ if (o) {
+ object = hlist_entry(o, struct fscache_object, cookie_link);
+ pr_err("%c-cookie o=%u\n", prefix, object->debug_id);
+ }
pr_err("%c-key=[%u] '", prefix, cookie->key_len);
k = (cookie->key_len <= sizeof(cookie->inline_key)) ?
@@ -57,6 +67,9 @@ void fscache_free_cookie(struct fscache_cookie *cookie)
{
if (cookie) {
BUG_ON(!hlist_empty(&cookie->backing_objects));
+ write_lock(&fscache_cookies_lock);
+ list_del(&cookie->proc_link);
+ write_unlock(&fscache_cookies_lock);
if (cookie->aux_len > sizeof(cookie->inline_aux))
kfree(cookie->aux);
if (cookie->key_len > sizeof(cookie->inline_key))
@@ -74,10 +87,8 @@ void fscache_free_cookie(struct fscache_cookie *cookie)
static int fscache_set_key(struct fscache_cookie *cookie,
const void *index_key, size_t index_key_len)
{
- unsigned long long h;
u32 *buf;
int bufs;
- int i;
bufs = DIV_ROUND_UP(index_key_len, sizeof(*buf));
@@ -91,17 +102,7 @@ static int fscache_set_key(struct fscache_cookie *cookie,
}
memcpy(buf, index_key, index_key_len);
-
- /* Calculate a hash and combine this with the length in the first word
- * or first half word
- */
- h = (unsigned long)cookie->parent;
- h += index_key_len + cookie->type;
-
- for (i = 0; i < bufs; i++)
- h += buf[i];
-
- cookie->key_hash = h ^ (h >> 32);
+ cookie->key_hash = fscache_hash(0, buf, bufs);
return 0;
}
@@ -129,6 +130,8 @@ static long fscache_compare_cookie(const struct fscache_cookie *a,
return memcmp(ka, kb, a->key_len);
}
+static atomic_t fscache_cookie_debug_id = ATOMIC_INIT(1);
+
/*
* Allocate a cookie.
*/
@@ -161,8 +164,9 @@ struct fscache_cookie *fscache_alloc_cookie(
goto nomem;
}
- atomic_set(&cookie->usage, 1);
+ refcount_set(&cookie->ref, 1);
atomic_set(&cookie->n_children, 0);
+ cookie->debug_id = atomic_inc_return(&fscache_cookie_debug_id);
/* We keep the active count elevated until relinquishment to prevent an
* attempt to wake up every time the object operations queue quiesces.
@@ -181,6 +185,10 @@ struct fscache_cookie *fscache_alloc_cookie(
/* radix tree insertion won't use the preallocation pool unless it's
* told it may not wait */
INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+
+ write_lock(&fscache_cookies_lock);
+ list_add_tail(&cookie->proc_link, &fscache_cookies);
+ write_unlock(&fscache_cookies_lock);
return cookie;
nomem:
@@ -217,8 +225,8 @@ struct fscache_cookie *fscache_hash_cookie(struct fscache_cookie *candidate)
collision:
if (test_and_set_bit(FSCACHE_COOKIE_ACQUIRED, &cursor->flags)) {
- trace_fscache_cookie(cursor, fscache_cookie_collision,
- atomic_read(&cursor->usage));
+ trace_fscache_cookie(cursor->debug_id, refcount_read(&cursor->ref),
+ fscache_cookie_collision);
pr_err("Duplicate cookie detected\n");
fscache_print_cookie(cursor, 'O');
fscache_print_cookie(candidate, 'N');
@@ -297,7 +305,8 @@ struct fscache_cookie *__fscache_acquire_cookie(
cookie = fscache_hash_cookie(candidate);
if (!cookie) {
- trace_fscache_cookie(candidate, fscache_cookie_discard, 1);
+ trace_fscache_cookie(candidate->debug_id, 1,
+ fscache_cookie_discard);
goto out;
}
@@ -355,7 +364,7 @@ void __fscache_enable_cookie(struct fscache_cookie *cookie,
bool (*can_enable)(void *data),
void *data)
{
- _enter("%p", cookie);
+ _enter("%x", cookie->debug_id);
trace_fscache_enable(cookie);
@@ -452,10 +461,8 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie,
/* we may be required to wait for lookup to complete at this point */
if (!fscache_defer_lookup) {
- _debug("non-deferred lookup %p", &cookie->flags);
wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
TASK_UNINTERRUPTIBLE);
- _debug("complete");
if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags))
goto unavailable;
}
@@ -480,7 +487,7 @@ static int fscache_alloc_object(struct fscache_cache *cache,
struct fscache_object *object;
int ret;
- _enter("%p,%p{%s}", cache, cookie, cookie->def->name);
+ _enter("%s,%x{%s}", cache->tag->name, cookie->debug_id, cookie->def->name);
spin_lock(&cookie->lock);
hlist_for_each_entry(object, &cookie->backing_objects,
@@ -600,8 +607,6 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
/* Attach to the cookie. The object already has a ref on it. */
hlist_add_head(&object->cookie_link, &cookie->backing_objects);
-
- fscache_objlist_add(object);
ret = 0;
cant_attach_object:
@@ -658,7 +663,7 @@ EXPORT_SYMBOL(__fscache_invalidate);
*/
void __fscache_wait_on_invalidate(struct fscache_cookie *cookie)
{
- _enter("%p", cookie);
+ _enter("%x", cookie->debug_id);
wait_on_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING,
TASK_UNINTERRUPTIBLE);
@@ -713,7 +718,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie,
struct fscache_object *object;
bool awaken = false;
- _enter("%p,%u", cookie, invalidate);
+ _enter("%x,%u", cookie->debug_id, invalidate);
trace_fscache_disable(cookie);
@@ -803,8 +808,8 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie,
return;
}
- _enter("%p{%s,%p,%d},%d",
- cookie, cookie->def->name, cookie->netfs_data,
+ _enter("%x{%s,%d},%d",
+ cookie->debug_id, cookie->def->name,
atomic_read(&cookie->n_active), retire);
trace_fscache_relinquish(cookie, retire);
@@ -821,13 +826,12 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie,
BUG_ON(!radix_tree_empty(&cookie->stores));
if (cookie->parent) {
- ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0);
+ ASSERTCMP(refcount_read(&cookie->parent->ref), >, 0);
ASSERTCMP(atomic_read(&cookie->parent->n_children), >, 0);
atomic_dec(&cookie->parent->n_children);
}
/* Dispose of the netfs's link to the cookie */
- ASSERTCMP(atomic_read(&cookie->usage), >, 0);
fscache_cookie_put(cookie, fscache_cookie_put_relinquish);
_leave("");
@@ -857,17 +861,17 @@ void fscache_cookie_put(struct fscache_cookie *cookie,
enum fscache_cookie_trace where)
{
struct fscache_cookie *parent;
- int usage;
+ int ref;
- _enter("%p", cookie);
+ _enter("%x", cookie->debug_id);
do {
- usage = atomic_dec_return(&cookie->usage);
- trace_fscache_cookie(cookie, where, usage);
+ unsigned int cookie_debug_id = cookie->debug_id;
+ bool zero = __refcount_dec_and_test(&cookie->ref, &ref);
- if (usage > 0)
+ trace_fscache_cookie(cookie_debug_id, ref - 1, where);
+ if (!zero)
return;
- BUG_ON(usage < 0);
parent = cookie->parent;
fscache_unhash_cookie(cookie);
@@ -881,6 +885,19 @@ void fscache_cookie_put(struct fscache_cookie *cookie,
}
/*
+ * Get a reference to a cookie.
+ */
+struct fscache_cookie *fscache_cookie_get(struct fscache_cookie *cookie,
+ enum fscache_cookie_trace where)
+{
+ int ref;
+
+ __refcount_inc(&cookie->ref, &ref);
+ trace_fscache_cookie(cookie->debug_id, ref + 1, where);
+ return cookie;
+}
+
+/*
* check the consistency between the netfs inode and the backing cache
*
* NOTE: it only serves no-index type
@@ -958,3 +975,97 @@ inconsistent:
return -ESTALE;
}
EXPORT_SYMBOL(__fscache_check_consistency);
+
+/*
+ * Generate a list of extant cookies in /proc/fs/fscache/cookies
+ */
+static int fscache_cookies_seq_show(struct seq_file *m, void *v)
+{
+ struct fscache_cookie *cookie;
+ unsigned int keylen = 0, auxlen = 0;
+ char _type[3], *type;
+ u8 *p;
+
+ if (v == &fscache_cookies) {
+ seq_puts(m,
+ "COOKIE PARENT USAGE CHILD ACT TY FL DEF NETFS_DATA\n"
+ "======== ======== ===== ===== === == === ================ ==========\n"
+ );
+ return 0;
+ }
+
+ cookie = list_entry(v, struct fscache_cookie, proc_link);
+
+ switch (cookie->type) {
+ case 0:
+ type = "IX";
+ break;
+ case 1:
+ type = "DT";
+ break;
+ default:
+ snprintf(_type, sizeof(_type), "%02u",
+ cookie->type);
+ type = _type;
+ break;
+ }
+
+ seq_printf(m,
+ "%08x %08x %5u %5u %3u %s %03lx %-16s %px",
+ cookie->debug_id,
+ cookie->parent ? cookie->parent->debug_id : 0,
+ refcount_read(&cookie->ref),
+ atomic_read(&cookie->n_children),
+ atomic_read(&cookie->n_active),
+ type,
+ cookie->flags,
+ cookie->def->name,
+ cookie->netfs_data);
+
+ keylen = cookie->key_len;
+ auxlen = cookie->aux_len;
+
+ if (keylen > 0 || auxlen > 0) {
+ seq_puts(m, " ");
+ p = keylen <= sizeof(cookie->inline_key) ?
+ cookie->inline_key : cookie->key;
+ for (; keylen > 0; keylen--)
+ seq_printf(m, "%02x", *p++);
+ if (auxlen > 0) {
+ seq_puts(m, ", ");
+ p = auxlen <= sizeof(cookie->inline_aux) ?
+ cookie->inline_aux : cookie->aux;
+ for (; auxlen > 0; auxlen--)
+ seq_printf(m, "%02x", *p++);
+ }
+ }
+
+ seq_puts(m, "\n");
+ return 0;
+}
+
+static void *fscache_cookies_seq_start(struct seq_file *m, loff_t *_pos)
+ __acquires(fscache_cookies_lock)
+{
+ read_lock(&fscache_cookies_lock);
+ return seq_list_start_head(&fscache_cookies, *_pos);
+}
+
+static void *fscache_cookies_seq_next(struct seq_file *m, void *v, loff_t *_pos)
+{
+ return seq_list_next(v, &fscache_cookies, _pos);
+}
+
+static void fscache_cookies_seq_stop(struct seq_file *m, void *v)
+ __releases(rcu)
+{
+ read_unlock(&fscache_cookies_lock);
+}
+
+
+const struct seq_operations fscache_cookies_seq_ops = {
+ .start = fscache_cookies_seq_start,
+ .next = fscache_cookies_seq_next,
+ .stop = fscache_cookies_seq_stop,
+ .show = fscache_cookies_seq_show,
+};
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
index 09ed8795ad86..0402673c680e 100644
--- a/fs/fscache/fsdef.c
+++ b/fs/fscache/fsdef.c
@@ -45,7 +45,8 @@ static struct fscache_cookie_def fscache_fsdef_index_def = {
};
struct fscache_cookie fscache_fsdef_index = {
- .usage = ATOMIC_INIT(1),
+ .debug_id = 1,
+ .ref = REFCOUNT_INIT(1),
.n_active = ATOMIC_INIT(1),
.lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock),
.backing_objects = HLIST_HEAD_INIT,
diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c
deleted file mode 100644
index 4e5beeaaf454..000000000000
--- a/fs/fscache/histogram.c
+++ /dev/null
@@ -1,87 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/* FS-Cache latency histogram
- *
- * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#define FSCACHE_DEBUG_LEVEL THREAD
-#include <linux/module.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include "internal.h"
-
-atomic_t fscache_obj_instantiate_histogram[HZ];
-atomic_t fscache_objs_histogram[HZ];
-atomic_t fscache_ops_histogram[HZ];
-atomic_t fscache_retrieval_delay_histogram[HZ];
-atomic_t fscache_retrieval_histogram[HZ];
-
-/*
- * display the time-taken histogram
- */
-static int fscache_histogram_show(struct seq_file *m, void *v)
-{
- unsigned long index;
- unsigned n[5], t;
-
- switch ((unsigned long) v) {
- case 1:
- seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS RETRV DLY RETRIEVLS\n");
- return 0;
- case 2:
- seq_puts(m, "===== ===== ========= ========= ========= ========= =========\n");
- return 0;
- default:
- index = (unsigned long) v - 3;
- n[0] = atomic_read(&fscache_obj_instantiate_histogram[index]);
- n[1] = atomic_read(&fscache_ops_histogram[index]);
- n[2] = atomic_read(&fscache_objs_histogram[index]);
- n[3] = atomic_read(&fscache_retrieval_delay_histogram[index]);
- n[4] = atomic_read(&fscache_retrieval_histogram[index]);
- if (!(n[0] | n[1] | n[2] | n[3] | n[4]))
- return 0;
-
- t = (index * 1000) / HZ;
-
- seq_printf(m, "%4lu 0.%03u %9u %9u %9u %9u %9u\n",
- index, t, n[0], n[1], n[2], n[3], n[4]);
- return 0;
- }
-}
-
-/*
- * set up the iterator to start reading from the first line
- */
-static void *fscache_histogram_start(struct seq_file *m, loff_t *_pos)
-{
- if ((unsigned long long)*_pos >= HZ + 2)
- return NULL;
- if (*_pos == 0)
- *_pos = 1;
- return (void *)(unsigned long) *_pos;
-}
-
-/*
- * move to the next line
- */
-static void *fscache_histogram_next(struct seq_file *m, void *v, loff_t *pos)
-{
- (*pos)++;
- return (unsigned long long)*pos > HZ + 2 ?
- NULL : (void *)(unsigned long) *pos;
-}
-
-/*
- * clean up after reading
- */
-static void fscache_histogram_stop(struct seq_file *m, void *v)
-{
-}
-
-const struct seq_operations fscache_histogram_ops = {
- .start = fscache_histogram_start,
- .stop = fscache_histogram_stop,
- .next = fscache_histogram_next,
- .show = fscache_histogram_show,
-};
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index c483863b740a..c3e4804b8fcb 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -45,6 +45,7 @@ extern struct fscache_cache *fscache_select_cache_for_object(
* cookie.c
*/
extern struct kmem_cache *fscache_cookie_jar;
+extern const struct seq_operations fscache_cookies_seq_ops;
extern void fscache_free_cookie(struct fscache_cookie *);
extern struct fscache_cookie *fscache_alloc_cookie(struct fscache_cookie *,
@@ -53,9 +54,18 @@ extern struct fscache_cookie *fscache_alloc_cookie(struct fscache_cookie *,
const void *, size_t,
void *, loff_t);
extern struct fscache_cookie *fscache_hash_cookie(struct fscache_cookie *);
+extern struct fscache_cookie *fscache_cookie_get(struct fscache_cookie *,
+ enum fscache_cookie_trace);
extern void fscache_cookie_put(struct fscache_cookie *,
enum fscache_cookie_trace);
+static inline void fscache_cookie_see(struct fscache_cookie *cookie,
+ enum fscache_cookie_trace where)
+{
+ trace_fscache_cookie(cookie->debug_id, refcount_read(&cookie->ref),
+ where);
+}
+
/*
* fsdef.c
*/
@@ -63,30 +73,6 @@ extern struct fscache_cookie fscache_fsdef_index;
extern struct fscache_cookie_def fscache_fsdef_netfs_def;
/*
- * histogram.c
- */
-#ifdef CONFIG_FSCACHE_HISTOGRAM
-extern atomic_t fscache_obj_instantiate_histogram[HZ];
-extern atomic_t fscache_objs_histogram[HZ];
-extern atomic_t fscache_ops_histogram[HZ];
-extern atomic_t fscache_retrieval_delay_histogram[HZ];
-extern atomic_t fscache_retrieval_histogram[HZ];
-
-static inline void fscache_hist(atomic_t histogram[], unsigned long start_jif)
-{
- unsigned long jif = jiffies - start_jif;
- if (jif >= HZ)
- jif = HZ - 1;
- atomic_inc(&histogram[jif]);
-}
-
-extern const struct seq_operations fscache_histogram_ops;
-
-#else
-#define fscache_hist(hist, start_jif) do {} while (0)
-#endif
-
-/*
* main.c
*/
extern unsigned fscache_defer_lookup;
@@ -97,6 +83,8 @@ extern struct workqueue_struct *fscache_object_wq;
extern struct workqueue_struct *fscache_op_wq;
DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait);
+extern unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n);
+
static inline bool fscache_object_congested(void)
{
return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq);
@@ -108,19 +96,6 @@ static inline bool fscache_object_congested(void)
extern void fscache_enqueue_object(struct fscache_object *);
/*
- * object-list.c
- */
-#ifdef CONFIG_FSCACHE_OBJECT_LIST
-extern const struct proc_ops fscache_objlist_proc_ops;
-
-extern void fscache_objlist_add(struct fscache_object *);
-extern void fscache_objlist_remove(struct fscache_object *);
-#else
-#define fscache_objlist_add(object) do {} while(0)
-#define fscache_objlist_remove(object) do {} while(0)
-#endif
-
-/*
* operation.c
*/
extern int fscache_submit_exclusive_op(struct fscache_object *,
@@ -320,14 +295,6 @@ static inline void fscache_raise_event(struct fscache_object *object,
fscache_enqueue_object(object);
}
-static inline void fscache_cookie_get(struct fscache_cookie *cookie,
- enum fscache_cookie_trace where)
-{
- int usage = atomic_inc_return(&cookie->usage);
-
- trace_fscache_cookie(cookie, where, usage);
-}
-
/*
* get an extra reference to a netfs retrieval context
*/
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index c1e6cc9091aa..4207f98e405f 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -94,6 +94,45 @@ static struct ctl_table fscache_sysctls_root[] = {
#endif
/*
+ * Mixing scores (in bits) for (7,20):
+ * Input delta: 1-bit 2-bit
+ * 1 round: 330.3 9201.6
+ * 2 rounds: 1246.4 25475.4
+ * 3 rounds: 1907.1 31295.1
+ * 4 rounds: 2042.3 31718.6
+ * Perfect: 2048 31744
+ * (32*64) (32*31/2 * 64)
+ */
+#define HASH_MIX(x, y, a) \
+ ( x ^= (a), \
+ y ^= x, x = rol32(x, 7),\
+ x += y, y = rol32(y,20),\
+ y *= 9 )
+
+static inline unsigned int fold_hash(unsigned long x, unsigned long y)
+{
+ /* Use arch-optimized multiply if one exists */
+ return __hash_32(y ^ __hash_32(x));
+}
+
+/*
+ * Generate a hash. This is derived from full_name_hash(), but we want to be
+ * sure it is arch independent and that it doesn't change as bits of the
+ * computed hash value might appear on disk. The caller also guarantees that
+ * the hashed data will be a series of aligned 32-bit words.
+ */
+unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n)
+{
+ unsigned int a, x = 0, y = salt;
+
+ for (; n; n--) {
+ a = *data++;
+ HASH_MIX(x, y, a);
+ }
+ return fold_hash(x, y);
+}
+
+/*
* initialise the fs caching module
*/
static int __init fscache_init(void)
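
fscache_hash() is meant to be fed whole 32-bit words, which is what fscache_set_key() does earlier in this series: the index key is copied into a zero-padded word buffer and hashed with a zero salt so the value stays stable across architectures. A minimal sketch of that call pattern (the buffer size and key below are for illustration only):

	u32 buf[4] = {};	/* zero padding for a short key */
	size_t key_len = strlen("example");

	memcpy(buf, "example", key_len);
	cookie->key_hash = fscache_hash(0, buf, DIV_ROUND_UP(key_len, sizeof(u32)));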
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
index cce92216fa28..d6bdb7b5e723 100644
--- a/fs/fscache/netfs.c
+++ b/fs/fscache/netfs.c
@@ -37,7 +37,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
if (!cookie)
goto already_registered;
if (cookie != candidate) {
- trace_fscache_cookie(candidate, fscache_cookie_discard, 1);
+ trace_fscache_cookie(candidate->debug_id, 1, fscache_cookie_discard);
fscache_free_cookie(candidate);
}
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
deleted file mode 100644
index e106a1a1600d..000000000000
--- a/fs/fscache/object-list.c
+++ /dev/null
@@ -1,414 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/* Global fscache object list maintainer and viewer
- *
- * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#define FSCACHE_DEBUG_LEVEL COOKIE
-#include <linux/module.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/slab.h>
-#include <linux/key.h>
-#include <keys/user-type.h>
-#include "internal.h"
-
-static struct rb_root fscache_object_list;
-static DEFINE_RWLOCK(fscache_object_list_lock);
-
-struct fscache_objlist_data {
- unsigned long config; /* display configuration */
-#define FSCACHE_OBJLIST_CONFIG_KEY 0x00000001 /* show object keys */
-#define FSCACHE_OBJLIST_CONFIG_AUX 0x00000002 /* show object auxdata */
-#define FSCACHE_OBJLIST_CONFIG_COOKIE 0x00000004 /* show objects with cookies */
-#define FSCACHE_OBJLIST_CONFIG_NOCOOKIE 0x00000008 /* show objects without cookies */
-#define FSCACHE_OBJLIST_CONFIG_BUSY 0x00000010 /* show busy objects */
-#define FSCACHE_OBJLIST_CONFIG_IDLE 0x00000020 /* show idle objects */
-#define FSCACHE_OBJLIST_CONFIG_PENDWR 0x00000040 /* show objects with pending writes */
-#define FSCACHE_OBJLIST_CONFIG_NOPENDWR 0x00000080 /* show objects without pending writes */
-#define FSCACHE_OBJLIST_CONFIG_READS 0x00000100 /* show objects with active reads */
-#define FSCACHE_OBJLIST_CONFIG_NOREADS 0x00000200 /* show objects without active reads */
-#define FSCACHE_OBJLIST_CONFIG_EVENTS 0x00000400 /* show objects with events */
-#define FSCACHE_OBJLIST_CONFIG_NOEVENTS 0x00000800 /* show objects without no events */
-#define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with work */
-#define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without work */
-};
-
-/*
- * Add an object to the object list
- * - we use the address of the fscache_object structure as the key into the
- * tree
- */
-void fscache_objlist_add(struct fscache_object *obj)
-{
- struct fscache_object *xobj;
- struct rb_node **p = &fscache_object_list.rb_node, *parent = NULL;
-
- ASSERT(RB_EMPTY_NODE(&obj->objlist_link));
-
- write_lock(&fscache_object_list_lock);
-
- while (*p) {
- parent = *p;
- xobj = rb_entry(parent, struct fscache_object, objlist_link);
-
- if (obj < xobj)
- p = &(*p)->rb_left;
- else if (obj > xobj)
- p = &(*p)->rb_right;
- else
- BUG();
- }
-
- rb_link_node(&obj->objlist_link, parent, p);
- rb_insert_color(&obj->objlist_link, &fscache_object_list);
-
- write_unlock(&fscache_object_list_lock);
-}
-
-/*
- * Remove an object from the object list.
- */
-void fscache_objlist_remove(struct fscache_object *obj)
-{
- if (RB_EMPTY_NODE(&obj->objlist_link))
- return;
-
- write_lock(&fscache_object_list_lock);
-
- BUG_ON(RB_EMPTY_ROOT(&fscache_object_list));
- rb_erase(&obj->objlist_link, &fscache_object_list);
-
- write_unlock(&fscache_object_list_lock);
-}
-
-/*
- * find the object in the tree on or after the specified index
- */
-static struct fscache_object *fscache_objlist_lookup(loff_t *_pos)
-{
- struct fscache_object *pobj, *obj = NULL, *minobj = NULL;
- struct rb_node *p;
- unsigned long pos;
-
- if (*_pos >= (unsigned long) ERR_PTR(-ENOENT))
- return NULL;
- pos = *_pos;
-
- /* banners (can't represent line 0 by pos 0 as that would involve
- * returning a NULL pointer) */
- if (pos == 0)
- return (struct fscache_object *)(long)++(*_pos);
- if (pos < 3)
- return (struct fscache_object *)pos;
-
- pobj = (struct fscache_object *)pos;
- p = fscache_object_list.rb_node;
- while (p) {
- obj = rb_entry(p, struct fscache_object, objlist_link);
- if (pobj < obj) {
- if (!minobj || minobj > obj)
- minobj = obj;
- p = p->rb_left;
- } else if (pobj > obj) {
- p = p->rb_right;
- } else {
- minobj = obj;
- break;
- }
- obj = NULL;
- }
-
- if (!minobj)
- *_pos = (unsigned long) ERR_PTR(-ENOENT);
- else if (minobj != obj)
- *_pos = (unsigned long) minobj;
- return minobj;
-}
-
-/*
- * set up the iterator to start reading from the first line
- */
-static void *fscache_objlist_start(struct seq_file *m, loff_t *_pos)
- __acquires(&fscache_object_list_lock)
-{
- read_lock(&fscache_object_list_lock);
- return fscache_objlist_lookup(_pos);
-}
-
-/*
- * move to the next line
- */
-static void *fscache_objlist_next(struct seq_file *m, void *v, loff_t *_pos)
-{
- (*_pos)++;
- return fscache_objlist_lookup(_pos);
-}
-
-/*
- * clean up after reading
- */
-static void fscache_objlist_stop(struct seq_file *m, void *v)
- __releases(&fscache_object_list_lock)
-{
- read_unlock(&fscache_object_list_lock);
-}
-
-/*
- * display an object
- */
-static int fscache_objlist_show(struct seq_file *m, void *v)
-{
- struct fscache_objlist_data *data = m->private;
- struct fscache_object *obj = v;
- struct fscache_cookie *cookie;
- unsigned long config = data->config;
- char _type[3], *type;
- u8 *p;
-
- if ((unsigned long) v == 1) {
- seq_puts(m, "OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS"
- " EM EV FL S"
- " | NETFS_COOKIE_DEF TY FL NETFS_DATA");
- if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
- FSCACHE_OBJLIST_CONFIG_AUX))
- seq_puts(m, " ");
- if (config & FSCACHE_OBJLIST_CONFIG_KEY)
- seq_puts(m, "OBJECT_KEY");
- if ((config & (FSCACHE_OBJLIST_CONFIG_KEY |
- FSCACHE_OBJLIST_CONFIG_AUX)) ==
- (FSCACHE_OBJLIST_CONFIG_KEY | FSCACHE_OBJLIST_CONFIG_AUX))
- seq_puts(m, ", ");
- if (config & FSCACHE_OBJLIST_CONFIG_AUX)
- seq_puts(m, "AUX_DATA");
- seq_puts(m, "\n");
- return 0;
- }
-
- if ((unsigned long) v == 2) {
- seq_puts(m, "======== ======== ==== ===== === === === == ====="
- " == == == ="
- " | ================ == == ================");
- if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
- FSCACHE_OBJLIST_CONFIG_AUX))
- seq_puts(m, " ================");
- seq_puts(m, "\n");
- return 0;
- }
-
- /* filter out any unwanted objects */
-#define FILTER(criterion, _yes, _no) \
- do { \
- unsigned long yes = FSCACHE_OBJLIST_CONFIG_##_yes; \
- unsigned long no = FSCACHE_OBJLIST_CONFIG_##_no; \
- if (criterion) { \
- if (!(config & yes)) \
- return 0; \
- } else { \
- if (!(config & no)) \
- return 0; \
- } \
- } while(0)
-
- cookie = obj->cookie;
- if (~config) {
- FILTER(cookie->def,
- COOKIE, NOCOOKIE);
- FILTER(fscache_object_is_active(obj) ||
- obj->n_ops != 0 ||
- obj->n_obj_ops != 0 ||
- obj->flags ||
- !list_empty(&obj->dependents),
- BUSY, IDLE);
- FILTER(test_bit(FSCACHE_OBJECT_PENDING_WRITE, &obj->flags),
- PENDWR, NOPENDWR);
- FILTER(atomic_read(&obj->n_reads),
- READS, NOREADS);
- FILTER(obj->events & obj->event_mask,
- EVENTS, NOEVENTS);
- FILTER(work_busy(&obj->work), WORK, NOWORK);
- }
-
- seq_printf(m,
- "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %2lx %1x | ",
- obj->debug_id,
- obj->parent ? obj->parent->debug_id : -1,
- obj->state->short_name,
- obj->n_children,
- obj->n_ops,
- obj->n_obj_ops,
- obj->n_in_progress,
- obj->n_exclusive,
- atomic_read(&obj->n_reads),
- obj->event_mask,
- obj->events,
- obj->flags,
- work_busy(&obj->work));
-
- if (fscache_use_cookie(obj)) {
- uint16_t keylen = 0, auxlen = 0;
-
- switch (cookie->type) {
- case 0:
- type = "IX";
- break;
- case 1:
- type = "DT";
- break;
- default:
- snprintf(_type, sizeof(_type), "%02u",
- cookie->type);
- type = _type;
- break;
- }
-
- seq_printf(m, "%-16s %s %2lx %16p",
- cookie->def->name,
- type,
- cookie->flags,
- cookie->netfs_data);
-
- if (config & FSCACHE_OBJLIST_CONFIG_KEY)
- keylen = cookie->key_len;
-
- if (config & FSCACHE_OBJLIST_CONFIG_AUX)
- auxlen = cookie->aux_len;
-
- if (keylen > 0 || auxlen > 0) {
- seq_puts(m, " ");
- p = keylen <= sizeof(cookie->inline_key) ?
- cookie->inline_key : cookie->key;
- for (; keylen > 0; keylen--)
- seq_printf(m, "%02x", *p++);
- if (auxlen > 0) {
- if (config & FSCACHE_OBJLIST_CONFIG_KEY)
- seq_puts(m, ", ");
- p = auxlen <= sizeof(cookie->inline_aux) ?
- cookie->inline_aux : cookie->aux;
- for (; auxlen > 0; auxlen--)
- seq_printf(m, "%02x", *p++);
- }
- }
-
- seq_puts(m, "\n");
- fscache_unuse_cookie(obj);
- } else {
- seq_puts(m, "<no_netfs>\n");
- }
- return 0;
-}
-
-static const struct seq_operations fscache_objlist_ops = {
- .start = fscache_objlist_start,
- .stop = fscache_objlist_stop,
- .next = fscache_objlist_next,
- .show = fscache_objlist_show,
-};
-
-/*
- * get the configuration for filtering the list
- */
-static void fscache_objlist_config(struct fscache_objlist_data *data)
-{
-#ifdef CONFIG_KEYS
- const struct user_key_payload *confkey;
- unsigned long config;
- struct key *key;
- const char *buf;
- int len;
-
- key = request_key(&key_type_user, "fscache:objlist", NULL);
- if (IS_ERR(key))
- goto no_config;
-
- config = 0;
- rcu_read_lock();
-
- confkey = user_key_payload_rcu(key);
- if (!confkey) {
- /* key was revoked */
- rcu_read_unlock();
- key_put(key);
- goto no_config;
- }
-
- buf = confkey->data;
-
- for (len = confkey->datalen - 1; len >= 0; len--) {
- switch (buf[len]) {
- case 'K': config |= FSCACHE_OBJLIST_CONFIG_KEY; break;
- case 'A': config |= FSCACHE_OBJLIST_CONFIG_AUX; break;
- case 'C': config |= FSCACHE_OBJLIST_CONFIG_COOKIE; break;
- case 'c': config |= FSCACHE_OBJLIST_CONFIG_NOCOOKIE; break;
- case 'B': config |= FSCACHE_OBJLIST_CONFIG_BUSY; break;
- case 'b': config |= FSCACHE_OBJLIST_CONFIG_IDLE; break;
- case 'W': config |= FSCACHE_OBJLIST_CONFIG_PENDWR; break;
- case 'w': config |= FSCACHE_OBJLIST_CONFIG_NOPENDWR; break;
- case 'R': config |= FSCACHE_OBJLIST_CONFIG_READS; break;
- case 'r': config |= FSCACHE_OBJLIST_CONFIG_NOREADS; break;
- case 'S': config |= FSCACHE_OBJLIST_CONFIG_WORK; break;
- case 's': config |= FSCACHE_OBJLIST_CONFIG_NOWORK; break;
- }
- }
-
- rcu_read_unlock();
- key_put(key);
-
- if (!(config & (FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE)))
- config |= FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE;
- if (!(config & (FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE)))
- config |= FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE;
- if (!(config & (FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR)))
- config |= FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR;
- if (!(config & (FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS)))
- config |= FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS;
- if (!(config & (FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS)))
- config |= FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS;
- if (!(config & (FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK)))
- config |= FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK;
-
- data->config = config;
- return;
-
-no_config:
-#endif
- data->config = ULONG_MAX;
-}
-
-/*
- * open "/proc/fs/fscache/objects" to provide a list of active objects
- * - can be configured by a user-defined key added to the caller's keyrings
- */
-static int fscache_objlist_open(struct inode *inode, struct file *file)
-{
- struct fscache_objlist_data *data;
-
- data = __seq_open_private(file, &fscache_objlist_ops, sizeof(*data));
- if (!data)
- return -ENOMEM;
-
- /* get the configuration key */
- fscache_objlist_config(data);
-
- return 0;
-}
-
-/*
- * clean up on close
- */
-static int fscache_objlist_release(struct inode *inode, struct file *file)
-{
- struct seq_file *m = file->private_data;
-
- kfree(m->private);
- m->private = NULL;
- return seq_release(inode, file);
-}
-
-const struct proc_ops fscache_objlist_proc_ops = {
- .proc_open = fscache_objlist_open,
- .proc_read = seq_read,
- .proc_lseek = seq_lseek,
- .proc_release = fscache_objlist_release,
-};
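For reference, the object listing removed above was filtered through a user key rather than module parameters: fscache_objlist_config() looks up a user-type key named "fscache:objlist" in the reader's keyrings and interprets single characters in its payload (K object key, A auxiliary data, C/c with/without a cookie, B/b busy/idle, W/w with/without pending writes, R/r with/without active reads, S/s with/without queued work); any pair left unset defaults to showing both sides, and with no key at all everything is shown. A hypothetical set-up before reading /proc/fs/fscache/objects, assuming the keyutils tools are installed, would be `keyctl add user fscache:objlist KB @s` to list only busy objects together with their keys.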
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index cb2146e02cd5..f346a78f4bd6 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -277,13 +277,10 @@ static void fscache_object_work_func(struct work_struct *work)
{
struct fscache_object *object =
container_of(work, struct fscache_object, work);
- unsigned long start;
_enter("{OBJ%x}", object->debug_id);
- start = jiffies;
fscache_object_sm_dispatcher(object);
- fscache_hist(fscache_objs_histogram, start);
fscache_put_object(object, fscache_obj_put_work);
}
@@ -436,7 +433,6 @@ static const struct fscache_state *fscache_parent_ready(struct fscache_object *o
spin_lock(&parent->lock);
parent->n_ops++;
parent->n_obj_ops++;
- object->lookup_jif = jiffies;
spin_unlock(&parent->lock);
_leave("");
@@ -522,7 +518,6 @@ void fscache_object_lookup_negative(struct fscache_object *object)
set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
- _debug("wake up lookup %p", &cookie->flags);
clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
}
@@ -596,7 +591,6 @@ static const struct fscache_state *fscache_object_available(struct fscache_objec
object->cache->ops->lookup_complete(object);
fscache_stat_d(&fscache_n_cop_lookup_complete);
- fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif);
fscache_stat(&fscache_n_object_avail);
_leave("");
@@ -799,8 +793,6 @@ static void fscache_put_object(struct fscache_object *object,
*/
void fscache_object_destroy(struct fscache_object *object)
{
- fscache_objlist_remove(object);
-
/* We can get rid of the cookie now */
fscache_cookie_put(object->cookie, fscache_cookie_put_object);
object->cookie = NULL;
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 4a5651d4904e..433877107700 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -616,7 +616,6 @@ void fscache_op_work_func(struct work_struct *work)
{
struct fscache_operation *op =
container_of(work, struct fscache_operation, work);
- unsigned long start;
_enter("{OBJ%x OP%x,%d}",
op->object->debug_id, op->debug_id, atomic_read(&op->usage));
@@ -624,9 +623,7 @@ void fscache_op_work_func(struct work_struct *work)
trace_fscache_op(op->object->cookie, op, fscache_op_work);
ASSERT(op->processor != NULL);
- start = jiffies;
op->processor(op);
- fscache_hist(fscache_ops_histogram, start);
fscache_put_operation(op);
_leave("");
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 991b0a871744..27df94ef0e0b 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -289,7 +289,6 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
ASSERTIFCMP(op->op.state != FSCACHE_OP_ST_INITIALISED,
atomic_read(&op->n_pages), ==, 0);
- fscache_hist(fscache_retrieval_histogram, op->start_time);
if (op->context)
fscache_put_context(op->cookie, op->context);
@@ -324,7 +323,6 @@ struct fscache_retrieval *fscache_alloc_retrieval(
op->mapping = mapping;
op->end_io_func = end_io_func;
op->context = context;
- op->start_time = jiffies;
INIT_LIST_HEAD(&op->to_do);
/* Pin the netfs read context in case we need to do the actual netfs
@@ -340,8 +338,6 @@ struct fscache_retrieval *fscache_alloc_retrieval(
*/
int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
{
- unsigned long jif;
-
_enter("");
if (!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) {
@@ -351,7 +347,6 @@ int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
fscache_stat(&fscache_n_retrievals_wait);
- jif = jiffies;
if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
TASK_INTERRUPTIBLE) != 0) {
fscache_stat(&fscache_n_retrievals_intr);
@@ -362,7 +357,6 @@ int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
ASSERT(!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags));
smp_rmb();
- fscache_hist(fscache_retrieval_delay_histogram, jif);
_leave(" = 0 [dly]");
return 0;
}
diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c
index 90a7bc22f7e1..061df8f61ffc 100644
--- a/fs/fscache/proc.c
+++ b/fs/fscache/proc.c
@@ -21,18 +21,16 @@ int __init fscache_proc_init(void)
if (!proc_mkdir("fs/fscache", NULL))
goto error_dir;
+ if (!proc_create_seq("fs/fscache/cookies", S_IFREG | 0444, NULL,
+ &fscache_cookies_seq_ops))
+ goto error_cookies;
+
#ifdef CONFIG_FSCACHE_STATS
if (!proc_create_single("fs/fscache/stats", S_IFREG | 0444, NULL,
fscache_stats_show))
goto error_stats;
#endif
-#ifdef CONFIG_FSCACHE_HISTOGRAM
- if (!proc_create_seq("fs/fscache/histogram", S_IFREG | 0444, NULL,
- &fscache_histogram_ops))
- goto error_histogram;
-#endif
-
#ifdef CONFIG_FSCACHE_OBJECT_LIST
if (!proc_create("fs/fscache/objects", S_IFREG | 0444, NULL,
&fscache_objlist_proc_ops))
@@ -45,14 +43,12 @@ int __init fscache_proc_init(void)
#ifdef CONFIG_FSCACHE_OBJECT_LIST
error_objects:
#endif
-#ifdef CONFIG_FSCACHE_HISTOGRAM
- remove_proc_entry("fs/fscache/histogram", NULL);
-error_histogram:
-#endif
#ifdef CONFIG_FSCACHE_STATS
remove_proc_entry("fs/fscache/stats", NULL);
error_stats:
#endif
+ remove_proc_entry("fs/fscache/cookies", NULL);
+error_cookies:
remove_proc_entry("fs/fscache", NULL);
error_dir:
_leave(" = -ENOMEM");
@@ -67,11 +63,9 @@ void fscache_proc_cleanup(void)
#ifdef CONFIG_FSCACHE_OBJECT_LIST
remove_proc_entry("fs/fscache/objects", NULL);
#endif
-#ifdef CONFIG_FSCACHE_HISTOGRAM
- remove_proc_entry("fs/fscache/histogram", NULL);
-#endif
#ifdef CONFIG_FSCACHE_STATS
remove_proc_entry("fs/fscache/stats", NULL);
#endif
+ remove_proc_entry("fs/fscache/cookies", NULL);
remove_proc_entry("fs/fscache", NULL);
}
diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c
index 52b165319be1..337cb29a8dd5 100644
--- a/fs/fuse/acl.c
+++ b/fs/fuse/acl.c
@@ -11,7 +11,7 @@
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
-struct posix_acl *fuse_get_acl(struct inode *inode, int type)
+struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu)
{
struct fuse_conn *fc = get_fuse_conn(inode);
int size;
@@ -19,6 +19,9 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type)
void *value = NULL;
struct posix_acl *acl;
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
if (fuse_is_bad(inode))
return ERR_PTR(-EIO);
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 9d58371d22c2..281d79f8b3d3 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -444,12 +444,12 @@ static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos,
/*
* Can't do inline reclaim in fault path. We call
* dax_layout_busy_page() before we free a range. And
- * fuse_wait_dax_page() drops fi->i_mmap_sem lock and requires it.
- * In fault path we enter with fi->i_mmap_sem held and can't drop
- * it. Also in fault path we hold fi->i_mmap_sem shared and not
- * exclusive, so that creates further issues with fuse_wait_dax_page().
- * Hence return -EAGAIN and fuse_dax_fault() will wait for a memory
- * range to become free and retry.
+ * fuse_wait_dax_page() drops mapping->invalidate_lock and requires it.
+ * In fault path we enter with mapping->invalidate_lock held and can't
+ * drop it. Also in fault path we hold mapping->invalidate_lock shared
+ * and not exclusive, so that creates further issues with
+ * fuse_wait_dax_page(). Hence return -EAGAIN and fuse_dax_fault()
+ * will wait for a memory range to become free and retry.
*/
if (flags & IOMAP_FAULT) {
alloc_dmap = alloc_dax_mapping(fcd);
@@ -513,7 +513,7 @@ static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos,
down_write(&fi->dax->sem);
node = interval_tree_iter_first(&fi->dax->tree, idx, idx);
- /* We are holding either inode lock or i_mmap_sem, and that should
+ /* We are holding either inode lock or invalidate_lock, and that should
* ensure that dmap can't be truncated. We are holding a reference
* on dmap and that should make sure it can't be reclaimed. So dmap
* should still be there in tree despite the fact we dropped and
@@ -660,14 +660,12 @@ static const struct iomap_ops fuse_iomap_ops = {
static void fuse_wait_dax_page(struct inode *inode)
{
- struct fuse_inode *fi = get_fuse_inode(inode);
-
- up_write(&fi->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
schedule();
- down_write(&fi->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
}
-/* Should be called with fi->i_mmap_sem lock held exclusively */
+/* Should be called with mapping->invalidate_lock held exclusively */
static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
loff_t start, loff_t end)
{
@@ -813,18 +811,18 @@ retry:
* we do not want any read/write/mmap to make progress and try
* to populate page cache or access memory we are trying to free.
*/
- down_read(&get_fuse_inode(inode)->i_mmap_sem);
+ filemap_invalidate_lock_shared(inode->i_mapping);
ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops);
if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) {
error = 0;
retry = true;
- up_read(&get_fuse_inode(inode)->i_mmap_sem);
+ filemap_invalidate_unlock_shared(inode->i_mapping);
goto retry;
}
if (ret & VM_FAULT_NEEDDSYNC)
ret = dax_finish_sync_fault(vmf, pe_size, pfn);
- up_read(&get_fuse_inode(inode)->i_mmap_sem);
+ filemap_invalidate_unlock_shared(inode->i_mapping);
if (write)
sb_end_pagefault(sb);
@@ -960,7 +958,7 @@ inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode,
int ret;
struct interval_tree_node *node;
- down_write(&fi->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
/* Lookup a dmap and corresponding file offset to reclaim. */
down_read(&fi->dax->sem);
@@ -1021,7 +1019,7 @@ inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode,
out_write_dmap_sem:
up_write(&fi->dax->sem);
out_mmap_sem:
- up_write(&fi->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
return dmap;
}
@@ -1050,10 +1048,10 @@ alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode)
* had a reference or some other temporary failure,
* Try again. We want to give up inline reclaim only
* if there is no range assigned to this node. Otherwise
- * if a deadlock is possible if we sleep with fi->i_mmap_sem
- * held and worker to free memory can't make progress due
- * to unavailability of fi->i_mmap_sem lock. So sleep
- * only if fi->dax->nr=0
+ * a deadlock is possible if we sleep with
+ * mapping->invalidate_lock held and the worker freeing memory
+ * can't make progress because mapping->invalidate_lock is
+ * unavailable. So sleep only if fi->dax->nr=0
*/
if (retry)
continue;
@@ -1061,8 +1059,8 @@ alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode)
* There are no mappings which can be reclaimed. Wait for one.
* We are not holding fi->dax->sem. So it is possible
* that range gets added now. But as we are not holding
- * fi->i_mmap_sem, worker should still be able to free up
- * a range and wake us up.
+ * mapping->invalidate_lock, worker should still be able to
+ * free up a range and wake us up.
*/
if (!fi->dax->nr && !(fcd->nr_free_ranges > 0)) {
if (wait_event_killable_exclusive(fcd->range_waitq,
@@ -1108,7 +1106,7 @@ static int lookup_and_reclaim_dmap_locked(struct fuse_conn_dax *fcd,
/*
* Free a range of memory.
* Locking:
- * 1. Take fi->i_mmap_sem to block dax faults.
+ * 1. Take mapping->invalidate_lock to block dax faults.
* 2. Take fi->dax->sem to protect interval tree and also to make sure
* read/write can not reuse a dmap which we might be freeing.
*/
@@ -1122,7 +1120,7 @@ static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd,
loff_t dmap_start = start_idx << FUSE_DAX_SHIFT;
loff_t dmap_end = (dmap_start + FUSE_DAX_SZ) - 1;
- down_write(&fi->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end);
if (ret) {
pr_debug("virtio_fs: fuse_dax_break_layouts() failed. err=%d\n",
@@ -1134,7 +1132,7 @@ static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd,
ret = lookup_and_reclaim_dmap_locked(fcd, inode, start_idx);
up_write(&fi->dax->sem);
out_mmap_sem:
- up_write(&fi->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
return ret;
}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index eade6f965b2e..d9b977c0f38d 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1556,6 +1556,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_conn *fc = fm->fc;
struct fuse_inode *fi = get_fuse_inode(inode);
+ struct address_space *mapping = inode->i_mapping;
FUSE_ARGS(args);
struct fuse_setattr_in inarg;
struct fuse_attr_out outarg;
@@ -1580,11 +1581,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
}
if (FUSE_IS_DAX(inode) && is_truncate) {
- down_write(&fi->i_mmap_sem);
+ filemap_invalidate_lock(mapping);
fault_blocked = true;
err = fuse_dax_break_layouts(inode, 0, 0);
if (err) {
- up_write(&fi->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
return err;
}
}
@@ -1694,13 +1695,13 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
if ((is_truncate || !is_wb) &&
S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
truncate_pagecache(inode, outarg.attr.size);
- invalidate_inode_pages2(inode->i_mapping);
+ invalidate_inode_pages2(mapping);
}
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
out:
if (fault_blocked)
- up_write(&fi->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
return 0;
@@ -1711,7 +1712,7 @@ error:
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
if (fault_blocked)
- up_write(&fi->i_mmap_sem);
+ filemap_invalidate_unlock(mapping);
return err;
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 97f860cfc195..621a662c19fb 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -243,7 +243,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
}
if (dax_truncate) {
- down_write(&get_fuse_inode(inode)->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
err = fuse_dax_break_layouts(inode, 0, 0);
if (err)
goto out;
@@ -255,7 +255,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
out:
if (dax_truncate)
- up_write(&get_fuse_inode(inode)->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
if (is_wb_truncate | dax_truncate) {
fuse_release_nowrite(inode);
@@ -2920,7 +2920,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
if (lock_inode) {
inode_lock(inode);
if (block_faults) {
- down_write(&fi->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
err = fuse_dax_break_layouts(inode, 0, 0);
if (err)
goto out;
@@ -2976,7 +2976,7 @@ out:
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
if (block_faults)
- up_write(&fi->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
if (lock_inode)
inode_unlock(inode);
@@ -3045,7 +3045,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
* modifications. Yet this does give less guarantees than if the
* copying was performed with write(2).
*
- * To fix this a i_mmap_sem style lock could be used to prevent new
+ * To fix this a mapping->invalidate_lock could be used to prevent new
* faults while the copy is ongoing.
*/
err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 07829ce78695..3d18556a01ad 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -149,13 +149,6 @@ struct fuse_inode {
/** Lock to protect write related fields */
spinlock_t lock;
- /**
- * Can't take inode lock in fault path (leads to circular dependency).
- * Introduce another semaphore which can be taken in fault path and
- * then other filesystem paths can take this to block faults.
- */
- struct rw_semaphore i_mmap_sem;
-
#ifdef CONFIG_FUSE_DAX
/*
* Dax specific inode data
@@ -1216,7 +1209,7 @@ extern const struct xattr_handler *fuse_acl_xattr_handlers[];
extern const struct xattr_handler *fuse_no_acl_xattr_handlers[];
struct posix_acl;
-struct posix_acl *fuse_get_acl(struct inode *inode, int type);
+struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu);
int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct posix_acl *acl, int type);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index b9beb39a4a18..e07e429f32e1 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -85,7 +85,6 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
fi->orig_ino = 0;
fi->state = 0;
mutex_init(&fi->mutex);
- init_rwsem(&fi->i_mmap_sem);
spin_lock_init(&fi->lock);
fi->forget = fuse_alloc_forget();
if (!fi->forget)
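Taken together, the fuse hunks above replace the per-inode i_mmap_sem with the per-mapping invalidate_lock initialised in fs/inode.c later in this diff. A minimal sketch of the resulting locking pattern, using only calls that appear in the hunks above (the function below is illustrative and simplified, not the exact upstream code):

static int fuse_dax_truncate_sketch(struct inode *inode, loff_t newsize)
{
	int err;

	/* Invalidation side: take the lock exclusively to block page faults. */
	filemap_invalidate_lock(inode->i_mapping);
	err = fuse_dax_break_layouts(inode, 0, 0);	/* wait out busy DAX pages */
	if (!err)
		truncate_pagecache(inode, newsize);
	filemap_invalidate_unlock(inode->i_mapping);
	return err;
}

/* The fault side takes the same lock shared (filemap_invalidate_lock_shared /
 * filemap_invalidate_unlock_shared), so faults exclude invalidation without
 * serialising against each other. */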
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 9165d70ead07..734d1f05d823 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -57,13 +57,16 @@ static struct posix_acl *__gfs2_get_acl(struct inode *inode, int type)
return acl;
}
-struct posix_acl *gfs2_get_acl(struct inode *inode, int type)
+struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder gh;
bool need_unlock = false;
struct posix_acl *acl;
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
int ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
LM_FLAG_ANY, &gh);
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index eccc6a43326c..cd180ca7c959 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -11,7 +11,7 @@
#define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12)
-extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type);
+extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu);
extern int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
extern int gfs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct posix_acl *acl, int type);
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 81d8f064126e..005e920f5d4a 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -574,10 +574,9 @@ void adjust_fs_space(struct inode *inode)
{
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
- struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
- struct buffer_head *m_bh, *l_bh;
+ struct buffer_head *m_bh;
u64 fs_total, new_free;
if (gfs2_trans_begin(sdp, 2 * RES_STATFS, 0) != 0)
@@ -600,11 +599,7 @@ void adjust_fs_space(struct inode *inode)
(unsigned long long)new_free);
gfs2_statfs_change(sdp, new_free, new_free, 0);
- if (gfs2_meta_inode_buffer(l_ip, &l_bh) != 0)
- goto out2;
- update_statfs(sdp, m_bh, l_bh);
- brelse(l_bh);
-out2:
+ update_statfs(sdp, m_bh);
brelse(m_bh);
out:
sdp->sd_rindex_uptodate = 0;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index ed8b67b21718..5414c2c33580 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1002,7 +1002,7 @@ static void gfs2_write_unlock(struct inode *inode)
}
static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos,
- unsigned len, struct iomap *iomap)
+ unsigned len)
{
unsigned int blockmask = i_blocksize(inode) - 1;
struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -1013,8 +1013,7 @@ static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos,
}
static void gfs2_iomap_page_done(struct inode *inode, loff_t pos,
- unsigned copied, struct page *page,
- struct iomap *iomap)
+ unsigned copied, struct page *page)
{
struct gfs2_trans *tr = current->journal_info;
struct gfs2_inode *ip = GFS2_I(inode);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 84ec053d43b4..c559827cb6f9 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1237,9 +1237,6 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_POSIX))
return -ENOLCK;
- if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK)
- return -ENOLCK;
-
if (cmd == F_CANCELLK) {
/* Hack: */
cmd = F_SETLK;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 1f3902ecdded..e0eaa9cf9fb6 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1494,12 +1494,11 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
list_del_init(&gh->gh_list);
clear_bit(HIF_HOLDER, &gh->gh_iflags);
- if (find_first_holder(gl) == NULL) {
- if (list_empty(&gl->gl_holders) &&
- !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
- !test_bit(GLF_DEMOTE, &gl->gl_flags))
- fast_path = 1;
- }
+ if (list_empty(&gl->gl_holders) &&
+ !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
+ !test_bit(GLF_DEMOTE, &gl->gl_flags))
+ fast_path = 1;
+
if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl))
gfs2_glock_add_to_lru(gl);
@@ -2077,8 +2076,6 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags)
*p++ = 'H';
if (test_bit(HIF_WAIT, &iflags))
*p++ = 'W';
- if (test_bit(HIF_FIRST, &iflags))
- *p++ = 'F';
*p = 0;
return buf;
}
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 54d3fbeb3002..79c621c7863d 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -33,16 +33,18 @@ extern struct workqueue_struct *gfs2_control_wq;
static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
{
- fs_err(gl->gl_name.ln_sbd,
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+
+ fs_err(sdp,
"AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page "
"state 0x%lx\n",
bh, (unsigned long long)bh->b_blocknr, bh->b_state,
bh->b_page->mapping, bh->b_page->flags);
- fs_err(gl->gl_name.ln_sbd, "AIL glock %u:%llu mapping %p\n",
+ fs_err(sdp, "AIL glock %u:%llu mapping %p\n",
gl->gl_name.ln_type, gl->gl_name.ln_number,
gfs2_glock2aspace(gl));
- gfs2_lm(gl->gl_name.ln_sbd, "AIL error\n");
- gfs2_withdraw(gl->gl_name.ln_sbd);
+ gfs2_lm(sdp, "AIL error\n");
+ gfs2_withdraw_delayed(sdp);
}
/**
@@ -610,16 +612,13 @@ static int freeze_go_xmote_bh(struct gfs2_glock *gl)
j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
error = gfs2_find_jhead(sdp->sd_jdesc, &head, false);
- if (error)
- gfs2_consist(sdp);
- if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT))
- gfs2_consist(sdp);
-
- /* Initialize some head of the log stuff */
- if (!gfs2_withdrawn(sdp)) {
- sdp->sd_log_sequence = head.lh_sequence + 1;
- gfs2_log_pointers_init(sdp, head.lh_blkno);
- }
+ if (gfs2_assert_withdraw_delayed(sdp, !error))
+ return error;
+ if (gfs2_assert_withdraw_delayed(sdp, head.lh_flags &
+ GFS2_LOG_HEAD_UNMOUNT))
+ return -EIO;
+ sdp->sd_log_sequence = head.lh_sequence + 1;
+ gfs2_log_pointers_init(sdp, head.lh_blkno);
}
return 0;
}
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index e6f820f146cb..0fe49770166e 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -253,7 +253,6 @@ struct gfs2_lkstats {
enum {
/* States */
HIF_HOLDER = 6, /* Set for gh that "holds" the glock */
- HIF_FIRST = 7,
HIF_WAIT = 10,
};
@@ -768,6 +767,7 @@ struct gfs2_sbd {
struct gfs2_glock *sd_jinode_gl;
struct gfs2_holder sd_sc_gh;
+ struct buffer_head *sd_sc_bh;
struct gfs2_holder sd_qc_gh;
struct completion sd_journal_ready;
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index dac040162ecc..50578f881e6d 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -299,6 +299,11 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT);
gfs2_update_request_times(gl);
+ /* don't want to call dlm if we've unmounted the lock protocol */
+ if (test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) {
+ gfs2_glock_free(gl);
+ return;
+ }
/* don't want to skip dlm_unlock writing the lvb when lock has one */
if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) &&
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 42c15cfc0821..f0ee3ff6f9a8 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -594,7 +594,7 @@ void gfs2_log_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
{
unsigned int blks = tr->tr_reserved;
unsigned int revokes = tr->tr_revokes;
- unsigned int revoke_blks = 0;
+ unsigned int revoke_blks;
*extra_revokes = 0;
if (revokes) {
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 8ee05d25dfa6..ca0bb3a73912 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -761,6 +761,32 @@ static void buf_lo_before_scan(struct gfs2_jdesc *jd,
jd->jd_replayed_blocks = 0;
}
+#define obsolete_rgrp_replay \
+"Replaying 0x%llx from jid=%d/0x%llx but we already have a bh!\n"
+#define obsolete_rgrp_replay2 \
+"busy:%d, pinned:%d rg_gen:0x%llx, j_gen:0x%llx\n"
+
+static void obsolete_rgrp(struct gfs2_jdesc *jd, struct buffer_head *bh_log,
+ u64 blkno)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+ struct gfs2_rgrpd *rgd;
+ struct gfs2_rgrp *jrgd = (struct gfs2_rgrp *)bh_log->b_data;
+
+ rgd = gfs2_blk2rgrpd(sdp, blkno, false);
+ if (rgd && rgd->rd_addr == blkno &&
+ rgd->rd_bits && rgd->rd_bits->bi_bh) {
+ fs_info(sdp, obsolete_rgrp_replay, (unsigned long long)blkno,
+ jd->jd_jid, bh_log->b_blocknr);
+ fs_info(sdp, obsolete_rgrp_replay2,
+ buffer_busy(rgd->rd_bits->bi_bh) ? 1 : 0,
+ buffer_pinned(rgd->rd_bits->bi_bh),
+ rgd->rd_igeneration,
+ be64_to_cpu(jrgd->rg_igeneration));
+ gfs2_dump_glock(NULL, rgd->rd_gl, true);
+ }
+}
+
static int buf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start,
struct gfs2_log_descriptor *ld, __be64 *ptr,
int pass)
@@ -799,21 +825,9 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start,
struct gfs2_meta_header *mh =
(struct gfs2_meta_header *)bh_ip->b_data;
- if (mh->mh_type == cpu_to_be32(GFS2_METATYPE_RG)) {
- struct gfs2_rgrpd *rgd;
-
- rgd = gfs2_blk2rgrpd(sdp, blkno, false);
- if (rgd && rgd->rd_addr == blkno &&
- rgd->rd_bits && rgd->rd_bits->bi_bh) {
- fs_info(sdp, "Replaying 0x%llx but we "
- "already have a bh!\n",
- (unsigned long long)blkno);
- fs_info(sdp, "busy:%d, pinned:%d\n",
- buffer_busy(rgd->rd_bits->bi_bh) ? 1 : 0,
- buffer_pinned(rgd->rd_bits->bi_bh));
- gfs2_dump_glock(NULL, rgd->rd_gl, true);
- }
- }
+ if (mh->mh_type == cpu_to_be32(GFS2_METATYPE_RG))
+ obsolete_rgrp(jd, bh_log, blkno);
+
mark_buffer_dirty(bh_ip);
}
brelse(bh_log);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 7c9619997355..72d30a682ece 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -258,8 +258,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
struct buffer_head *bh, *bhs[2];
int num = 0;
- if (unlikely(gfs2_withdrawn(sdp)) &&
- (!sdp->sd_jdesc || gl != sdp->sd_jinode_gl)) {
+ if (unlikely(gfs2_withdrawn(sdp)) && !gfs2_withdraw_in_prog(sdp)) {
*bhp = NULL;
return -EIO;
}
@@ -317,7 +316,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
{
- if (unlikely(gfs2_withdrawn(sdp)))
+ if (unlikely(gfs2_withdrawn(sdp)) && !gfs2_withdraw_in_prog(sdp))
return -EIO;
wait_on_buffer(bh);
@@ -328,7 +327,7 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
gfs2_io_error_bh_wd(sdp, bh);
return -EIO;
}
- if (unlikely(gfs2_withdrawn(sdp)))
+ if (unlikely(gfs2_withdrawn(sdp)) && !gfs2_withdraw_in_prog(sdp))
return -EIO;
return 0;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 5f4504dd0875..7f8410d8fdc1 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -614,6 +614,7 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
break;
}
+ d_mark_dontcache(jd->jd_inode);
spin_lock(&sdp->sd_jindex_spin);
jd->jd_jid = sdp->sd_journals++;
jip = GFS2_I(jd->jd_inode);
@@ -677,6 +678,7 @@ static int init_statfs(struct gfs2_sbd *sdp)
error = PTR_ERR(lsi->si_sc_inode);
fs_err(sdp, "can't find local \"sc\" file#%u: %d\n",
jd->jd_jid, error);
+ kfree(lsi);
goto free_local;
}
lsi->si_jid = jd->jd_jid;
@@ -695,8 +697,16 @@ static int init_statfs(struct gfs2_sbd *sdp)
fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
goto free_local;
}
+ /* read in the local statfs buffer - other nodes don't change it. */
+ error = gfs2_meta_inode_buffer(ip, &sdp->sd_sc_bh);
+ if (error) {
+ fs_err(sdp, "Cannot read in local statfs: %d\n", error);
+ goto unlock_sd_gh;
+ }
return 0;
+unlock_sd_gh:
+ gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
free_local:
free_local_statfs_inodes(sdp);
iput(pn);
@@ -710,6 +720,7 @@ out:
static void uninit_statfs(struct gfs2_sbd *sdp)
{
if (!sdp->sd_args.ar_spectator) {
+ brelse(sdp->sd_sc_bh);
gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
free_local_statfs_inodes(sdp);
}
@@ -1088,6 +1099,34 @@ void gfs2_online_uevent(struct gfs2_sbd *sdp)
kobject_uevent_env(&sdp->sd_kobj, KOBJ_ONLINE, envp);
}
+static int init_threads(struct gfs2_sbd *sdp)
+{
+ struct task_struct *p;
+ int error = 0;
+
+ p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
+ if (IS_ERR(p)) {
+ error = PTR_ERR(p);
+ fs_err(sdp, "can't start logd thread: %d\n", error);
+ return error;
+ }
+ sdp->sd_logd_process = p;
+
+ p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
+ if (IS_ERR(p)) {
+ error = PTR_ERR(p);
+ fs_err(sdp, "can't start quotad thread: %d\n", error);
+ goto fail;
+ }
+ sdp->sd_quotad_process = p;
+ return 0;
+
+fail:
+ kthread_stop(sdp->sd_logd_process);
+ sdp->sd_logd_process = NULL;
+ return error;
+}
+
/**
* gfs2_fill_super - Read in superblock
* @sb: The VFS superblock
@@ -1216,6 +1255,14 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
goto fail_per_node;
}
+ if (!sb_rdonly(sb)) {
+ error = init_threads(sdp);
+ if (error) {
+ gfs2_withdraw_delayed(sdp);
+ goto fail_per_node;
+ }
+ }
+
error = gfs2_freeze_lock(sdp, &freeze_gh, 0);
if (error)
goto fail_per_node;
@@ -1225,6 +1272,12 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
gfs2_freeze_unlock(&freeze_gh);
if (error) {
+ if (sdp->sd_quotad_process)
+ kthread_stop(sdp->sd_quotad_process);
+ sdp->sd_quotad_process = NULL;
+ if (sdp->sd_logd_process)
+ kthread_stop(sdp->sd_logd_process);
+ sdp->sd_logd_process = NULL;
fs_err(sdp, "can't make FS RW: %d\n", error);
goto fail_per_node;
}
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 4d4ceb0b6903..6e00d15ef0a8 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -119,34 +119,6 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
return 0;
}
-static int init_threads(struct gfs2_sbd *sdp)
-{
- struct task_struct *p;
- int error = 0;
-
- p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
- if (IS_ERR(p)) {
- error = PTR_ERR(p);
- fs_err(sdp, "can't start logd thread: %d\n", error);
- return error;
- }
- sdp->sd_logd_process = p;
-
- p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
- if (IS_ERR(p)) {
- error = PTR_ERR(p);
- fs_err(sdp, "can't start quotad thread: %d\n", error);
- goto fail;
- }
- sdp->sd_quotad_process = p;
- return 0;
-
-fail:
- kthread_stop(sdp->sd_logd_process);
- sdp->sd_logd_process = NULL;
- return error;
-}
-
/**
* gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one
* @sdp: the filesystem
@@ -161,26 +133,17 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
struct gfs2_log_header_host head;
int error;
- error = init_threads(sdp);
- if (error) {
- gfs2_withdraw_delayed(sdp);
- return error;
- }
-
j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
- if (gfs2_withdrawn(sdp)) {
- error = -EIO;
- goto fail;
- }
+ if (gfs2_withdrawn(sdp))
+ return -EIO;
error = gfs2_find_jhead(sdp->sd_jdesc, &head, false);
if (error || gfs2_withdrawn(sdp))
- goto fail;
+ return error;
if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
gfs2_consist(sdp);
- error = -EIO;
- goto fail;
+ return -EIO;
}
/* Initialize some head of the log stuff */
@@ -188,20 +151,8 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
gfs2_log_pointers_init(sdp, head.lh_blkno);
error = gfs2_quota_init(sdp);
- if (error || gfs2_withdrawn(sdp))
- goto fail;
-
- set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
-
- return 0;
-
-fail:
- if (sdp->sd_quotad_process)
- kthread_stop(sdp->sd_quotad_process);
- sdp->sd_quotad_process = NULL;
- if (sdp->sd_logd_process)
- kthread_stop(sdp->sd_logd_process);
- sdp->sd_logd_process = NULL;
+ if (!error && !gfs2_withdrawn(sdp))
+ set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
return error;
}
@@ -227,9 +178,8 @@ int gfs2_statfs_init(struct gfs2_sbd *sdp)
{
struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
- struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
- struct buffer_head *m_bh, *l_bh;
+ struct buffer_head *m_bh;
struct gfs2_holder gh;
int error;
@@ -248,21 +198,15 @@ int gfs2_statfs_init(struct gfs2_sbd *sdp)
sizeof(struct gfs2_dinode));
spin_unlock(&sdp->sd_statfs_spin);
} else {
- error = gfs2_meta_inode_buffer(l_ip, &l_bh);
- if (error)
- goto out_m_bh;
-
spin_lock(&sdp->sd_statfs_spin);
gfs2_statfs_change_in(m_sc, m_bh->b_data +
sizeof(struct gfs2_dinode));
- gfs2_statfs_change_in(l_sc, l_bh->b_data +
+ gfs2_statfs_change_in(l_sc, sdp->sd_sc_bh->b_data +
sizeof(struct gfs2_dinode));
spin_unlock(&sdp->sd_statfs_spin);
- brelse(l_bh);
}
-out_m_bh:
brelse(m_bh);
out:
gfs2_glock_dq_uninit(&gh);
@@ -275,22 +219,17 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
- struct buffer_head *l_bh;
s64 x, y;
int need_sync = 0;
- int error;
-
- error = gfs2_meta_inode_buffer(l_ip, &l_bh);
- if (error)
- return;
- gfs2_trans_add_meta(l_ip->i_gl, l_bh);
+ gfs2_trans_add_meta(l_ip->i_gl, sdp->sd_sc_bh);
spin_lock(&sdp->sd_statfs_spin);
l_sc->sc_total += total;
l_sc->sc_free += free;
l_sc->sc_dinodes += dinodes;
- gfs2_statfs_change_out(l_sc, l_bh->b_data + sizeof(struct gfs2_dinode));
+ gfs2_statfs_change_out(l_sc, sdp->sd_sc_bh->b_data +
+ sizeof(struct gfs2_dinode));
if (sdp->sd_args.ar_statfs_percent) {
x = 100 * l_sc->sc_free;
y = m_sc->sc_free * sdp->sd_args.ar_statfs_percent;
@@ -299,20 +238,18 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
}
spin_unlock(&sdp->sd_statfs_spin);
- brelse(l_bh);
if (need_sync)
gfs2_wake_up_statfs(sdp);
}
-void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
- struct buffer_head *l_bh)
+void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh)
{
struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
- gfs2_trans_add_meta(l_ip->i_gl, l_bh);
+ gfs2_trans_add_meta(l_ip->i_gl, sdp->sd_sc_bh);
gfs2_trans_add_meta(m_ip->i_gl, m_bh);
spin_lock(&sdp->sd_statfs_spin);
@@ -320,7 +257,7 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
m_sc->sc_free += l_sc->sc_free;
m_sc->sc_dinodes += l_sc->sc_dinodes;
memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
- memset(l_bh->b_data + sizeof(struct gfs2_dinode),
+ memset(sdp->sd_sc_bh->b_data + sizeof(struct gfs2_dinode),
0, sizeof(struct gfs2_statfs_change));
gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
spin_unlock(&sdp->sd_statfs_spin);
@@ -330,11 +267,10 @@ int gfs2_statfs_sync(struct super_block *sb, int type)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
- struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
struct gfs2_holder gh;
- struct buffer_head *m_bh, *l_bh;
+ struct buffer_head *m_bh;
int error;
error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
@@ -355,21 +291,15 @@ int gfs2_statfs_sync(struct super_block *sb, int type)
}
spin_unlock(&sdp->sd_statfs_spin);
- error = gfs2_meta_inode_buffer(l_ip, &l_bh);
- if (error)
- goto out_bh;
-
error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
if (error)
- goto out_bh2;
+ goto out_bh;
- update_statfs(sdp, m_bh, l_bh);
+ update_statfs(sdp, m_bh);
sdp->sd_statfs_force_sync = 0;
gfs2_trans_end(sdp);
-out_bh2:
- brelse(l_bh);
out_bh:
brelse(m_bh);
out_unlock:
@@ -675,6 +605,7 @@ restart:
gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
if (gfs2_holder_initialized(&sdp->sd_jinode_gh))
gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
+ brelse(sdp->sd_sc_bh);
gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
free_local_statfs_inodes(sdp);
@@ -1016,7 +947,7 @@ static int gfs2_drop_inode(struct inode *inode)
gfs2_glock_hold(gl);
if (!gfs2_queue_delete_work(gl, 0))
gfs2_glock_queue_put(gl);
- return false;
+ return 0;
}
return generic_drop_inode(inode);
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index ec4affb33ed5..58d13fd77aed 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -43,8 +43,7 @@ extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc,
const void *buf);
extern void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc,
void *buf);
-extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
- struct buffer_head *l_bh);
+extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh);
extern int gfs2_statfs_sync(struct super_block *sb, int type);
extern void gfs2_freeze_func(struct work_struct *work);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index f4325b44956d..cf345a86ef67 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -278,6 +278,7 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp)
goto skip_recovery;
}
sdp->sd_jdesc->jd_inode = inode;
+ d_mark_dontcache(inode);
/*
* Now wait until recovery is complete.
@@ -295,7 +296,7 @@ skip_recovery:
fs_warn(sdp, "Journal recovery complete for jid %d.\n",
sdp->sd_lockstruct.ls_jid);
else
- fs_warn(sdp, "Journal recovery skipped for %d until next "
+ fs_warn(sdp, "Journal recovery skipped for jid %d until next "
"mount.\n", sdp->sd_lockstruct.ls_jid);
fs_warn(sdp, "Glock dequeues delayed: %lu\n", sdp->sd_glock_dqs_held);
sdp->sd_glock_dqs_held = 0;
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 69e1a0ae5a4d..78ec190f4155 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -218,6 +218,11 @@ static inline bool gfs2_withdrawing(struct gfs2_sbd *sdp)
!test_bit(SDF_WITHDRAWN, &sdp->sd_flags);
}
+static inline bool gfs2_withdraw_in_prog(struct gfs2_sbd *sdp)
+{
+ return test_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags);
+}
+
#define gfs2_tune_get(sdp, field) \
gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
index 2b36dc6f0a10..ec975f466877 100644
--- a/fs/hpfs/Kconfig
+++ b/fs/hpfs/Kconfig
@@ -2,6 +2,7 @@
config HPFS_FS
tristate "OS/2 HPFS file system support"
depends on BLOCK
+ select FS_IOMAP
help
OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
is the file system used for organizing files on OS/2 hard disk
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index c3a49aacf20a..fb37f57130aa 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -9,6 +9,7 @@
#include "hpfs_fn.h"
#include <linux/mpage.h>
+#include <linux/iomap.h>
#include <linux/fiemap.h>
#define BLOCKS(size) (((size) + 511) >> 9)
@@ -116,6 +117,47 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he
return r;
}
+static int hpfs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ unsigned flags, struct iomap *iomap, struct iomap *srcmap)
+{
+ struct super_block *sb = inode->i_sb;
+ unsigned int blkbits = inode->i_blkbits;
+ unsigned int n_secs;
+ secno s;
+
+ if (WARN_ON_ONCE(flags & (IOMAP_WRITE | IOMAP_ZERO)))
+ return -EINVAL;
+
+ iomap->bdev = inode->i_sb->s_bdev;
+ iomap->offset = offset;
+
+ hpfs_lock(sb);
+ s = hpfs_bmap(inode, offset >> blkbits, &n_secs);
+ if (s) {
+ n_secs = hpfs_search_hotfix_map_for_range(sb, s,
+ min_t(loff_t, n_secs, length));
+ if (unlikely(!n_secs)) {
+ s = hpfs_search_hotfix_map(sb, s);
+ n_secs = 1;
+ }
+ iomap->type = IOMAP_MAPPED;
+ iomap->flags = IOMAP_F_MERGED;
+ iomap->addr = (u64)s << blkbits;
+ iomap->length = (u64)n_secs << blkbits;
+ } else {
+ iomap->type = IOMAP_HOLE;
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->length = 1 << blkbits;
+ }
+
+ hpfs_unlock(sb);
+ return 0;
+}
+
+static const struct iomap_ops hpfs_iomap_ops = {
+ .iomap_begin = hpfs_iomap_begin,
+};
+
static int hpfs_readpage(struct file *file, struct page *page)
{
return mpage_readpage(page, hpfs_get_block);
@@ -192,7 +234,14 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
static int hpfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len)
{
- return generic_block_fiemap(inode, fieinfo, start, len, hpfs_get_block);
+ int ret;
+
+ inode_lock(inode);
+ len = min_t(u64, len, i_size_read(inode));
+ ret = iomap_fiemap(inode, fieinfo, start, len, &hpfs_iomap_ops);
+ inode_unlock(inode);
+
+ return ret;
}
const struct address_space_operations hpfs_aops = {
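The new FIEMAP path above can be exercised from user space with the standard FS_IOC_FIEMAP ioctl; the extents it prints come from hpfs_iomap_begin() via iomap_fiemap(). The program below is an illustrative check only, not part of the patch (the 32-extent limit and the minimal error handling are arbitrary simplifications):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	unsigned int i;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;
	/* room for the header plus up to 32 extents */
	fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;
	fm->fm_length = ~0ULL;		/* map the whole file */
	fm->fm_extent_count = 32;
	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0)
		return 1;
	for (i = 0; i < fm->fm_mapped_extents; i++)
		printf("logical %llu physical %llu length %llu flags %x\n",
		       (unsigned long long)fm->fm_extents[i].fe_logical,
		       (unsigned long long)fm->fm_extents[i].fe_physical,
		       (unsigned long long)fm->fm_extents[i].fe_length,
		       fm->fm_extents[i].fe_flags);
	return 0;
}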
diff --git a/fs/inode.c b/fs/inode.c
index c93500d84264..37710ca863b5 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -190,6 +190,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
mapping->private_data = NULL;
mapping->writeback_index = 0;
+ __init_rwsem(&mapping->invalidate_lock, "mapping.invalidate_lock",
+ &sb->s_type->invalidate_lock_key);
inode->i_private = NULL;
inode->i_mapping = mapping;
INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */
@@ -768,7 +770,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
return LRU_ROTATE;
}
- if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+ if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
__iget(inode);
spin_unlock(&inode->i_lock);
spin_unlock(lru_lock);
diff --git a/fs/internal.h b/fs/internal.h
index 82e8eb32ff3d..68a2ae029a27 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -48,8 +48,8 @@ static inline int emergency_thaw_bdev(struct super_block *sb)
/*
* buffer.c
*/
-extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
- get_block_t *get_block, struct iomap *iomap);
+int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+ get_block_t *get_block, const struct iomap *iomap);
/*
* char_dev.c
@@ -71,11 +71,15 @@ extern int filename_lookup(int dfd, struct filename *name, unsigned flags,
struct path *path, struct path *root);
extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
const char *, unsigned int, struct path *);
-long do_rmdir(int dfd, struct filename *name);
-long do_unlinkat(int dfd, struct filename *name);
+int do_rmdir(int dfd, struct filename *name);
+int do_unlinkat(int dfd, struct filename *name);
int may_linkat(struct user_namespace *mnt_userns, struct path *link);
int do_renameat2(int olddfd, struct filename *oldname, int newdfd,
struct filename *newname, unsigned int flags);
+int do_mkdirat(int dfd, struct filename *name, umode_t mode);
+int do_symlinkat(struct filename *from, int newdfd, struct filename *to);
+int do_linkat(int olddfd, struct filename *old, int newdfd,
+ struct filename *new, int flags);
/*
* namespace.c
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 7d2ed8c7dd31..cd9bd095fb1b 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -51,6 +51,10 @@ struct io_worker {
struct completion ref_done;
+ unsigned long create_state;
+ struct callback_head create_work;
+ int create_index;
+
struct rcu_head rcu;
};
@@ -174,7 +178,7 @@ static void io_worker_exit(struct io_worker *worker)
complete(&worker->ref_done);
wait_for_completion(&worker->ref_done);
- raw_spin_lock_irq(&wqe->lock);
+ raw_spin_lock(&wqe->lock);
if (worker->flags & IO_WORKER_F_FREE)
hlist_nulls_del_rcu(&worker->nulls_node);
list_del_rcu(&worker->all_list);
@@ -184,7 +188,7 @@ static void io_worker_exit(struct io_worker *worker)
worker->flags = 0;
current->flags &= ~PF_IO_WORKER;
preempt_enable();
- raw_spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock(&wqe->lock);
kfree_rcu(worker, rcu);
io_worker_ref_put(wqe->wq);
@@ -250,18 +254,19 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
if (!ret) {
bool do_create = false, first = false;
- raw_spin_lock_irq(&wqe->lock);
+ raw_spin_lock(&wqe->lock);
if (acct->nr_workers < acct->max_workers) {
- atomic_inc(&acct->nr_running);
- atomic_inc(&wqe->wq->worker_refs);
if (!acct->nr_workers)
first = true;
acct->nr_workers++;
do_create = true;
}
- raw_spin_unlock_irq(&wqe->lock);
- if (do_create)
+ raw_spin_unlock(&wqe->lock);
+ if (do_create) {
+ atomic_inc(&acct->nr_running);
+ atomic_inc(&wqe->wq->worker_refs);
create_io_worker(wqe->wq, wqe, acct->index, first);
+ }
}
}
@@ -272,60 +277,63 @@ static void io_wqe_inc_running(struct io_worker *worker)
atomic_inc(&acct->nr_running);
}
-struct create_worker_data {
- struct callback_head work;
- struct io_wqe *wqe;
- int index;
-};
-
static void create_worker_cb(struct callback_head *cb)
{
- struct create_worker_data *cwd;
+ struct io_worker *worker;
struct io_wq *wq;
struct io_wqe *wqe;
struct io_wqe_acct *acct;
bool do_create = false, first = false;
- cwd = container_of(cb, struct create_worker_data, work);
- wqe = cwd->wqe;
+ worker = container_of(cb, struct io_worker, create_work);
+ wqe = worker->wqe;
wq = wqe->wq;
- acct = &wqe->acct[cwd->index];
- raw_spin_lock_irq(&wqe->lock);
+ acct = &wqe->acct[worker->create_index];
+ raw_spin_lock(&wqe->lock);
if (acct->nr_workers < acct->max_workers) {
if (!acct->nr_workers)
first = true;
acct->nr_workers++;
do_create = true;
}
- raw_spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock(&wqe->lock);
if (do_create) {
- create_io_worker(wq, wqe, cwd->index, first);
+ create_io_worker(wq, wqe, worker->create_index, first);
} else {
atomic_dec(&acct->nr_running);
io_worker_ref_put(wq);
}
- kfree(cwd);
+ clear_bit_unlock(0, &worker->create_state);
+ io_worker_release(worker);
}
-static void io_queue_worker_create(struct io_wqe *wqe, struct io_wqe_acct *acct)
+static void io_queue_worker_create(struct io_wqe *wqe, struct io_worker *worker,
+ struct io_wqe_acct *acct)
{
- struct create_worker_data *cwd;
struct io_wq *wq = wqe->wq;
/* raced with exit, just ignore create call */
if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
goto fail;
+ if (!io_worker_get(worker))
+ goto fail;
+ /*
+ * create_state manages ownership of create_work/index. We should
+ * only need one entry per worker, as the worker going to sleep
+ * will trigger the condition, and waking will clear it once it
+ * runs the task_work.
+ */
+ if (test_bit(0, &worker->create_state) ||
+ test_and_set_bit_lock(0, &worker->create_state))
+ goto fail_release;
- cwd = kmalloc(sizeof(*cwd), GFP_ATOMIC);
- if (cwd) {
- init_task_work(&cwd->work, create_worker_cb);
- cwd->wqe = wqe;
- cwd->index = acct->index;
- if (!task_work_add(wq->task, &cwd->work, TWA_SIGNAL))
- return;
-
- kfree(cwd);
- }
+ init_task_work(&worker->create_work, create_worker_cb);
+ worker->create_index = acct->index;
+ if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL))
+ return;
+ clear_bit_unlock(0, &worker->create_state);
+fail_release:
+ io_worker_release(worker);
fail:
atomic_dec(&acct->nr_running);
io_worker_ref_put(wq);
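The net effect of the io-wq hunks above is easier to see outside the interleaved diff: the per-request create_worker_data allocation is gone, and worker creation is requested through a callback_head embedded in struct io_worker, owned via bit 0 of create_state. A simplified sketch of that ownership pattern, using the fields and helpers introduced above (refcounting and the accounting failure path are trimmed, so this is not the exact upstream code):

/* Illustrative sketch only: queue at most one "create a worker" request
 * per io_worker, without allocating anything in atomic context. */
static bool io_queue_worker_create_sketch(struct io_wq *wq,
					  struct io_worker *worker, int index)
{
	/* bit 0 of create_state == "a create request is already in flight" */
	if (test_bit(0, &worker->create_state) ||
	    test_and_set_bit_lock(0, &worker->create_state))
		return false;				/* someone else owns it */

	init_task_work(&worker->create_work, create_worker_cb);
	worker->create_index = index;
	if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL))
		return true;	/* create_worker_cb() will clear the bit */

	clear_bit_unlock(0, &worker->create_state);	/* task_work_add failed */
	return false;
}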
@@ -343,7 +351,7 @@ static void io_wqe_dec_running(struct io_worker *worker)
if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) {
atomic_inc(&acct->nr_running);
atomic_inc(&wqe->wq->worker_refs);
- io_queue_worker_create(wqe, acct);
+ io_queue_worker_create(wqe, worker, acct);
}
}
@@ -416,7 +424,28 @@ static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
spin_unlock(&wq->hash->wait.lock);
}
-static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
+/*
+ * We can always run the work if the worker is currently the same type as
+ * the work (eg both are bound, or both are unbound). If they are not the
+ * same, only allow it if incrementing the worker count would be allowed.
+ */
+static bool io_worker_can_run_work(struct io_worker *worker,
+ struct io_wq_work *work)
+{
+ struct io_wqe_acct *acct;
+
+ if (!(worker->flags & IO_WORKER_F_BOUND) !=
+ !(work->flags & IO_WQ_WORK_UNBOUND))
+ return true;
+
+ /* not the same type, check if we'd go over the limit */
+ acct = io_work_get_acct(worker->wqe, work);
+ return acct->nr_workers < acct->max_workers;
+}
+
+static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
+ struct io_worker *worker,
+ bool *stalled)
__must_hold(wqe->lock)
{
struct io_wq_work_node *node, *prev;
@@ -428,6 +457,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
work = container_of(node, struct io_wq_work, list);
+ if (!io_worker_can_run_work(worker, work))
+ break;
+
/* not hashed, can run anytime */
if (!io_wq_is_hashed(work)) {
wq_list_del(&wqe->work_list, node, prev);
@@ -454,6 +486,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
raw_spin_unlock(&wqe->lock);
io_wait_on_hash(wqe, stall_hash);
raw_spin_lock(&wqe->lock);
+ *stalled = true;
}
return NULL;
@@ -477,9 +510,9 @@ static void io_assign_current_work(struct io_worker *worker,
cond_resched();
}
- spin_lock_irq(&worker->lock);
+ spin_lock(&worker->lock);
worker->cur_work = work;
- spin_unlock_irq(&worker->lock);
+ spin_unlock(&worker->lock);
}
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
@@ -493,6 +526,7 @@ static void io_worker_handle_work(struct io_worker *worker)
do {
struct io_wq_work *work;
+ bool stalled;
get_next:
/*
* If we got some work, mark us as busy. If we didn't, but
@@ -501,13 +535,14 @@ get_next:
* can't make progress, any work completion or insertion will
* clear the stalled flag.
*/
- work = io_get_next_work(wqe);
+ stalled = false;
+ work = io_get_next_work(wqe, worker, &stalled);
if (work)
__io_worker_busy(wqe, worker, work);
- else if (!wq_list_empty(&wqe->work_list))
+ else if (stalled)
wqe->flags |= IO_WQE_FLAG_STALLED;
- raw_spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock(&wqe->lock);
if (!work)
break;
io_assign_current_work(worker, work);
@@ -539,16 +574,16 @@ get_next:
clear_bit(hash, &wq->hash->map);
if (wq_has_sleeper(&wq->hash->wait))
wake_up(&wq->hash->wait);
- raw_spin_lock_irq(&wqe->lock);
+ raw_spin_lock(&wqe->lock);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
/* skip unnecessary unlock-lock wqe->lock */
if (!work)
goto get_next;
- raw_spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock(&wqe->lock);
}
} while (work);
- raw_spin_lock_irq(&wqe->lock);
+ raw_spin_lock(&wqe->lock);
} while (1);
}
@@ -569,13 +604,13 @@ static int io_wqe_worker(void *data)
set_current_state(TASK_INTERRUPTIBLE);
loop:
- raw_spin_lock_irq(&wqe->lock);
+ raw_spin_lock(&wqe->lock);
if (io_wqe_run_queue(wqe)) {
io_worker_handle_work(worker);
goto loop;
}
__io_worker_idle(wqe, worker);
- raw_spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock(&wqe->lock);
if (io_flush_signals())
continue;
ret = schedule_timeout(WORKER_IDLE_TIMEOUT);
@@ -594,7 +629,7 @@ loop:
}
if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
- raw_spin_lock_irq(&wqe->lock);
+ raw_spin_lock(&wqe->lock);
io_worker_handle_work(worker);
}
@@ -636,9 +671,9 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
worker->flags &= ~IO_WORKER_F_RUNNING;
- raw_spin_lock_irq(&worker->wqe->lock);
+ raw_spin_lock(&worker->wqe->lock);
io_wqe_dec_running(worker);
- raw_spin_unlock_irq(&worker->wqe->lock);
+ raw_spin_unlock(&worker->wqe->lock);
}
static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bool first)
@@ -664,9 +699,9 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bo
kfree(worker);
fail:
atomic_dec(&acct->nr_running);
- raw_spin_lock_irq(&wqe->lock);
+ raw_spin_lock(&wqe->lock);
acct->nr_workers--;
- raw_spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock(&wqe->lock);
io_worker_ref_put(wq);
return;
}
@@ -676,7 +711,7 @@ fail:
set_cpus_allowed_ptr(tsk, wqe->cpu_mask);
tsk->flags |= PF_NO_SETAFFINITY;
- raw_spin_lock_irq(&wqe->lock);
+ raw_spin_lock(&wqe->lock);
hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
list_add_tail_rcu(&worker->all_list, &wqe->all_list);
worker->flags |= IO_WORKER_F_FREE;
@@ -684,7 +719,7 @@ fail:
worker->flags |= IO_WORKER_F_BOUND;
if (first && (worker->flags & IO_WORKER_F_BOUND))
worker->flags |= IO_WORKER_F_FIXED;
- raw_spin_unlock_irq(&wqe->lock);
+ raw_spin_unlock(&wqe->lock);
wake_up_new_task(tsk);
}
@@ -759,8 +794,7 @@ append:
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
- int work_flags;
- unsigned long flags;
+ bool do_wake;
/*
* If io-wq is exiting for this task, or if the request has explicitly
@@ -772,14 +806,14 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
return;
}
- work_flags = work->flags;
- raw_spin_lock_irqsave(&wqe->lock, flags);
+ raw_spin_lock(&wqe->lock);
io_wqe_insert_work(wqe, work);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
- raw_spin_unlock_irqrestore(&wqe->lock, flags);
+ do_wake = (work->flags & IO_WQ_WORK_CONCURRENT) ||
+ !atomic_read(&acct->nr_running);
+ raw_spin_unlock(&wqe->lock);
- if ((work_flags & IO_WQ_WORK_CONCURRENT) ||
- !atomic_read(&acct->nr_running))
+ if (do_wake)
io_wqe_wake_worker(wqe, acct);
}
@@ -805,19 +839,18 @@ void io_wq_hash_work(struct io_wq_work *work, void *val)
static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
{
struct io_cb_cancel_data *match = data;
- unsigned long flags;
/*
* Hold the lock to avoid ->cur_work going out of scope, caller
* may dereference the passed in work.
*/
- spin_lock_irqsave(&worker->lock, flags);
+ spin_lock(&worker->lock);
if (worker->cur_work &&
match->fn(worker->cur_work, match->data)) {
set_notify_signal(worker->task);
match->nr_running++;
}
- spin_unlock_irqrestore(&worker->lock, flags);
+ spin_unlock(&worker->lock);
return match->nr_running && !match->cancel_all;
}
@@ -845,16 +878,15 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
- unsigned long flags;
retry:
- raw_spin_lock_irqsave(&wqe->lock, flags);
+ raw_spin_lock(&wqe->lock);
wq_list_for_each(node, prev, &wqe->work_list) {
work = container_of(node, struct io_wq_work, list);
if (!match->fn(work, match->data))
continue;
io_wqe_remove_pending(wqe, work, prev);
- raw_spin_unlock_irqrestore(&wqe->lock, flags);
+ raw_spin_unlock(&wqe->lock);
io_run_cancel(work, wqe);
match->nr_pending++;
if (!match->cancel_all)
@@ -863,7 +895,7 @@ retry:
/* not safe to continue after unlock */
goto retry;
}
- raw_spin_unlock_irqrestore(&wqe->lock, flags);
+ raw_spin_unlock(&wqe->lock);
}
static void io_wqe_cancel_running_work(struct io_wqe *wqe,
@@ -1004,12 +1036,12 @@ err_wq:
static bool io_task_work_match(struct callback_head *cb, void *data)
{
- struct create_worker_data *cwd;
+ struct io_worker *worker;
if (cb->func != create_worker_cb)
return false;
- cwd = container_of(cb, struct create_worker_data, work);
- return cwd->wqe->wq == data;
+ worker = container_of(cb, struct io_worker, create_work);
+ return worker->wqe->wq == data;
}
void io_wq_exit_start(struct io_wq *wq)
@@ -1026,12 +1058,13 @@ static void io_wq_exit_workers(struct io_wq *wq)
return;
while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
- struct create_worker_data *cwd;
+ struct io_worker *worker;
- cwd = container_of(cb, struct create_worker_data, work);
- atomic_dec(&cwd->wqe->acct[cwd->index].nr_running);
+ worker = container_of(cb, struct io_worker, create_work);
+ atomic_dec(&worker->wqe->acct[worker->create_index].nr_running);
io_worker_ref_put(wq);
- kfree(cwd);
+ clear_bit_unlock(0, &worker->create_state);
+ io_worker_release(worker);
}
rcu_read_lock();
@@ -1143,6 +1176,35 @@ int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask)
return 0;
}
+/*
+ * Set max number of bound/unbound workers; the old values are returned
+ * through new_count. If a new_count entry is 0, that slot is left unchanged
+ * and only the old value is reported back.
+ */
+int io_wq_max_workers(struct io_wq *wq, int *new_count)
+{
+ int i, node, prev = 0;
+
+ for (i = 0; i < 2; i++) {
+ if (new_count[i] > task_rlimit(current, RLIMIT_NPROC))
+ new_count[i] = task_rlimit(current, RLIMIT_NPROC);
+ }
+
+ rcu_read_lock();
+ for_each_node(node) {
+ struct io_wqe_acct *acct;
+
+ for (i = 0; i < 2; i++) {
+ acct = &wq->wqes[node]->acct[i];
+ prev = max_t(int, acct->max_workers, prev);
+ if (new_count[i])
+ acct->max_workers = new_count[i];
+ new_count[i] = prev;
+ }
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
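A standalone sketch of the update rule implemented by io_wq_max_workers() above: requested counts are clamped to RLIMIT_NPROC, a zero entry means "leave this slot unchanged", and the old limits are handed back through the same array. The sketch keeps one set of limits and per-slot old values for clarity; the kernel loop walks every NUMA node's acct[] instead.

#include <stdio.h>
#include <sys/resource.h>

#define SLOTS 2		/* index 0: bound workers, index 1: unbound workers */

static void max_workers_update(int *limits, int *new_count)
{
	struct rlimit rl;
	int i;

	getrlimit(RLIMIT_NPROC, &rl);
	for (i = 0; i < SLOTS; i++) {
		int prev = limits[i];

		/* never allow more workers than the task's process limit */
		if ((unsigned long)new_count[i] > rl.rlim_cur)
			new_count[i] = rl.rlim_cur;
		if (new_count[i])
			limits[i] = new_count[i];
		new_count[i] = prev;	/* hand the old value back */
	}
}

int main(void)
{
	int limits[SLOTS] = { 8, 128 };
	int req[SLOTS] = { 0, 64 };	/* query bound, set unbound to 64 */

	max_workers_update(limits, req);
	printf("old: bound=%d unbound=%d; now: bound=%d unbound=%d\n",
	       req[0], req[1], limits[0], limits[1]);
	return 0;
}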
static __init int io_wq_init(void)
{
int ret;
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 3999ee58ff26..bf5c4c533760 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -44,6 +44,7 @@ static inline void wq_list_add_after(struct io_wq_work_node *node,
static inline void wq_list_add_tail(struct io_wq_work_node *node,
struct io_wq_work_list *list)
{
+ node->next = NULL;
if (!list->first) {
list->last = node;
WRITE_ONCE(list->first, node);
@@ -51,7 +52,6 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
list->last->next = node;
list->last = node;
}
- node->next = NULL;
}
static inline void wq_list_cut(struct io_wq_work_list *list,
@@ -128,6 +128,7 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_hash_work(struct io_wq_work *work, void *val);
int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask);
+int io_wq_max_workers(struct io_wq *wq, int *new_count);
static inline bool io_wq_is_hashed(struct io_wq_work *work)
{
diff --git a/fs/io_uring.c b/fs/io_uring.c
index a2e20a6fbfed..6f35b1285865 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -92,17 +92,12 @@
#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
-/*
- * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
- */
-#define IORING_FILE_TABLE_SHIFT 9
-#define IORING_MAX_FILES_TABLE (1U << IORING_FILE_TABLE_SHIFT)
-#define IORING_FILE_TABLE_MASK (IORING_MAX_FILES_TABLE - 1)
-#define IORING_MAX_FIXED_FILES (64 * IORING_MAX_FILES_TABLE)
+/* only define max */
+#define IORING_MAX_FIXED_FILES (1U << 15)
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
-#define IO_RSRC_TAG_TABLE_SHIFT 9
+#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3)
#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT)
#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1)
@@ -235,8 +230,7 @@ struct io_rsrc_put {
};
struct io_file_table {
- /* two level table */
- struct io_fixed_file **files;
+ struct io_fixed_file *files;
};
struct io_rsrc_node {
@@ -301,18 +295,10 @@ struct io_sq_data {
struct completion exited;
};
-#define IO_IOPOLL_BATCH 8
#define IO_COMPL_BATCH 32
#define IO_REQ_CACHE_SIZE 32
#define IO_REQ_ALLOC_BATCH 8
-struct io_comp_state {
- struct io_kiocb *reqs[IO_COMPL_BATCH];
- unsigned int nr;
- /* inline/task_work completion list, under ->uring_lock */
- struct list_head free_list;
-};
-
struct io_submit_link {
struct io_kiocb *head;
struct io_kiocb *last;
@@ -333,14 +319,11 @@ struct io_submit_state {
/*
* Batch completion logic
*/
- struct io_comp_state comp;
+ struct io_kiocb *compl_reqs[IO_COMPL_BATCH];
+ unsigned int compl_nr;
+ /* inline/task_work completion list, under ->uring_lock */
+ struct list_head free_list;
- /*
- * File reference cache
- */
- struct file *file;
- unsigned int fd;
- unsigned int file_refs;
unsigned int ios_left;
};
@@ -392,6 +375,7 @@ struct io_ring_ctx {
struct io_submit_state submit_state;
struct list_head timeout_list;
+ struct list_head ltimeout_list;
struct list_head cq_overflow_list;
struct xarray io_buffers;
struct xarray personalities;
@@ -426,6 +410,8 @@ struct io_ring_ctx {
struct {
spinlock_t completion_lock;
+ spinlock_t timeout_lock;
+
/*
* ->iopoll_list is protected by the ctx->uring_lock for
* io_uring instances that don't use IORING_SETUP_SQPOLL.
@@ -487,8 +473,8 @@ struct io_uring_task {
spinlock_t task_lock;
struct io_wq_work_list task_list;
- unsigned long task_state;
struct callback_head task_work;
+ bool task_running;
};
/*
@@ -523,6 +509,7 @@ struct io_timeout_data {
struct hrtimer timer;
struct timespec64 ts;
enum hrtimer_mode mode;
+ u32 flags;
};
struct io_accept {
@@ -530,6 +517,7 @@ struct io_accept {
struct sockaddr __user *addr;
int __user *addr_len;
int flags;
+ u32 file_slot;
unsigned long nofile;
};
@@ -553,6 +541,8 @@ struct io_timeout {
struct list_head list;
/* head of the link, used by linked timeouts only */
struct io_kiocb *head;
+ /* for linked completions */
+ struct io_kiocb *prev;
};
struct io_timeout_rem {
@@ -562,6 +552,7 @@ struct io_timeout_rem {
/* timeout update */
struct timespec64 ts;
u32 flags;
+ bool ltimeout;
};
struct io_rw {
@@ -593,6 +584,7 @@ struct io_sr_msg {
struct io_open {
struct file *file;
int dfd;
+ u32 file_slot;
struct filename *filename;
struct open_how how;
unsigned long nofile;
@@ -675,9 +667,31 @@ struct io_unlink {
struct filename *filename;
};
+struct io_mkdir {
+ struct file *file;
+ int dfd;
+ umode_t mode;
+ struct filename *filename;
+};
+
+struct io_symlink {
+ struct file *file;
+ int new_dfd;
+ struct filename *oldpath;
+ struct filename *newpath;
+};
+
+struct io_hardlink {
+ struct file *file;
+ int old_dfd;
+ int new_dfd;
+ struct filename *oldpath;
+ struct filename *newpath;
+ int flags;
+};
+
struct io_completion {
struct file *file;
- struct list_head list;
u32 cflags;
};
@@ -719,14 +733,15 @@ enum {
REQ_F_NEED_CLEANUP_BIT,
REQ_F_POLLED_BIT,
REQ_F_BUFFER_SELECTED_BIT,
- REQ_F_LTIMEOUT_ACTIVE_BIT,
REQ_F_COMPLETE_INLINE_BIT,
REQ_F_REISSUE_BIT,
REQ_F_DONT_REISSUE_BIT,
REQ_F_CREDS_BIT,
+ REQ_F_REFCOUNT_BIT,
+ REQ_F_ARM_LTIMEOUT_BIT,
/* keep async read/write and isreg together and in order */
- REQ_F_ASYNC_READ_BIT,
- REQ_F_ASYNC_WRITE_BIT,
+ REQ_F_NOWAIT_READ_BIT,
+ REQ_F_NOWAIT_WRITE_BIT,
REQ_F_ISREG_BIT,
/* not a real bit, just to check we're not overflowing the space */
@@ -763,8 +778,6 @@ enum {
REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
/* buffer already selected */
REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
- /* linked timeout is active, i.e. prepared by link's head */
- REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
/* completion is deferred through io_comp_state */
REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
/* caller should reissue async */
@@ -772,13 +785,17 @@ enum {
/* don't attempt request reissue, see io_rw_reissue() */
REQ_F_DONT_REISSUE = BIT(REQ_F_DONT_REISSUE_BIT),
/* supports async reads */
- REQ_F_ASYNC_READ = BIT(REQ_F_ASYNC_READ_BIT),
+ REQ_F_NOWAIT_READ = BIT(REQ_F_NOWAIT_READ_BIT),
/* supports async writes */
- REQ_F_ASYNC_WRITE = BIT(REQ_F_ASYNC_WRITE_BIT),
+ REQ_F_NOWAIT_WRITE = BIT(REQ_F_NOWAIT_WRITE_BIT),
/* regular file */
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
/* has creds assigned */
REQ_F_CREDS = BIT(REQ_F_CREDS_BIT),
+ /* skip refcounting if not set */
+ REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
+ /* there is a linked timeout that has to be armed */
+ REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
};
struct async_poll {
@@ -786,7 +803,7 @@ struct async_poll {
struct io_poll_iocb *double_poll;
};
-typedef void (*io_req_tw_func_t)(struct io_kiocb *req);
+typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
struct io_task_work {
union {
@@ -832,6 +849,9 @@ struct io_kiocb {
struct io_shutdown shutdown;
struct io_rename rename;
struct io_unlink unlink;
+ struct io_mkdir mkdir;
+ struct io_symlink symlink;
+ struct io_hardlink hardlink;
/* use only after cleaning per-op data, see io_clean_op() */
struct io_completion compl;
};
@@ -1043,39 +1063,43 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_RENAMEAT] = {},
[IORING_OP_UNLINKAT] = {},
+ [IORING_OP_MKDIRAT] = {},
+ [IORING_OP_SYMLINKAT] = {},
+ [IORING_OP_LINKAT] = {},
};
+/* requests with any of those set should undergo io_disarm_next() */
+#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
+
static bool io_disarm_next(struct io_kiocb *req);
static void io_uring_del_tctx_node(unsigned long index);
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
bool cancel_all);
static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
-static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
long res, unsigned int cflags);
static void io_put_req(struct io_kiocb *req);
-static void io_put_req_deferred(struct io_kiocb *req, int nr);
+static void io_put_req_deferred(struct io_kiocb *req);
static void io_dismantle_req(struct io_kiocb *req);
-static void io_put_task(struct task_struct *task, int nr);
-static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
struct io_uring_rsrc_update2 *up,
unsigned nr_args);
static void io_clean_op(struct io_kiocb *req);
-static struct file *io_file_get(struct io_submit_state *state,
+static struct file *io_file_get(struct io_ring_ctx *ctx,
struct io_kiocb *req, int fd, bool fixed);
static void __io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);
static void io_req_task_queue(struct io_kiocb *req);
static void io_submit_flush_completions(struct io_ring_ctx *ctx);
-static bool io_poll_remove_waitqs(struct io_kiocb *req);
static int io_req_prep_async(struct io_kiocb *req);
-static void io_fallback_req_func(struct work_struct *unused);
+static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
+ unsigned int issue_flags, u32 slot_index);
+static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
static struct kmem_cache *req_cachep;
@@ -1094,9 +1118,65 @@ struct sock *io_uring_get_socket(struct file *file)
}
EXPORT_SYMBOL(io_uring_get_socket);
+static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
+{
+ if (!*locked) {
+ mutex_lock(&ctx->uring_lock);
+ *locked = true;
+ }
+}
+
#define io_for_each_link(pos, head) \
for (pos = (head); pos; pos = pos->link)
+/*
+ * Shamelessly stolen from the mm implementation of page reference checking,
+ * see commit f958d7b528b1 for details.
+ */
+#define req_ref_zero_or_close_to_overflow(req) \
+ ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)
+
+static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
+{
+ WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
+ return atomic_inc_not_zero(&req->refs);
+}
+
+static inline bool req_ref_put_and_test(struct io_kiocb *req)
+{
+ if (likely(!(req->flags & REQ_F_REFCOUNT)))
+ return true;
+
+ WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
+ return atomic_dec_and_test(&req->refs);
+}
+
+static inline void req_ref_put(struct io_kiocb *req)
+{
+ WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
+ WARN_ON_ONCE(req_ref_put_and_test(req));
+}
+
+static inline void req_ref_get(struct io_kiocb *req)
+{
+ WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
+ WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
+ atomic_inc(&req->refs);
+}
+
+static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
+{
+ if (!(req->flags & REQ_F_REFCOUNT)) {
+ req->flags |= REQ_F_REFCOUNT;
+ atomic_set(&req->refs, nr);
+ }
+}
+
+static inline void io_req_set_refcount(struct io_kiocb *req)
+{
+ __io_req_set_refcount(req, 1);
+}
+
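The helpers above make request refcounting opt-in: until something sets REQ_F_REFCOUNT, a put is simply "free it now" and no atomics are touched. A minimal userspace sketch of that pattern (struct and flag names are illustrative, not the io_uring ones):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define F_REFCOUNT	0x1u

struct req_model {
	unsigned int flags;
	atomic_uint refs;
};

/* first caller arms the refcount; later calls leave it alone */
static void req_set_refcount(struct req_model *r, unsigned int nr)
{
	if (!(r->flags & F_REFCOUNT)) {
		r->flags |= F_REFCOUNT;
		atomic_store(&r->refs, nr);
	}
}

/* true means "last reference dropped, free it" */
static bool req_put_and_test(struct req_model *r)
{
	if (!(r->flags & F_REFCOUNT))
		return true;
	return atomic_fetch_sub(&r->refs, 1) == 1;
}

int main(void)
{
	struct req_model plain = { 0 };
	struct req_model pinned = { 0 };

	printf("unpinned req frees immediately: %d\n", req_put_and_test(&plain));

	req_set_refcount(&pinned, 2);	/* e.g. a linked timeout holds a ref */
	printf("first put frees:  %d\n", req_put_and_test(&pinned));	/* 0 */
	printf("second put frees: %d\n", req_put_and_test(&pinned));	/* 1 */
	return 0;
}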
static inline void io_req_set_rsrc_node(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -1141,6 +1221,12 @@ static inline void req_set_fail(struct io_kiocb *req)
req->flags |= REQ_F_FAIL;
}
+static inline void req_fail_link_node(struct io_kiocb *req, int res)
+{
+ req_set_fail(req);
+ req->result = res;
+}
+
static void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
@@ -1153,6 +1239,27 @@ static inline bool io_is_timeout_noseq(struct io_kiocb *req)
return !req->timeout.off;
}
+static void io_fallback_req_func(struct work_struct *work)
+{
+ struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
+ fallback_work.work);
+ struct llist_node *node = llist_del_all(&ctx->fallback_llist);
+ struct io_kiocb *req, *tmp;
+ bool locked = false;
+
+ percpu_ref_get(&ctx->refs);
+ llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
+ req->io_task_work.func(req, &locked);
+
+ if (locked) {
+ if (ctx->submit_state.compl_nr)
+ io_submit_flush_completions(ctx);
+ mutex_unlock(&ctx->uring_lock);
+ }
+ percpu_ref_put(&ctx->refs);
+
+}
+
static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
@@ -1198,15 +1305,17 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->cq_wait);
spin_lock_init(&ctx->completion_lock);
+ spin_lock_init(&ctx->timeout_lock);
INIT_LIST_HEAD(&ctx->iopoll_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
+ INIT_LIST_HEAD(&ctx->ltimeout_list);
spin_lock_init(&ctx->rsrc_ref_lock);
INIT_LIST_HEAD(&ctx->rsrc_ref_list);
INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
init_llist_head(&ctx->rsrc_put_llist);
INIT_LIST_HEAD(&ctx->tctx_list);
- INIT_LIST_HEAD(&ctx->submit_state.comp.free_list);
+ INIT_LIST_HEAD(&ctx->submit_state.free_list);
INIT_LIST_HEAD(&ctx->locked_free_list);
INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
return ctx;
@@ -1236,6 +1345,20 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
return false;
}
+#define FFS_ASYNC_READ 0x1UL
+#define FFS_ASYNC_WRITE 0x2UL
+#ifdef CONFIG_64BIT
+#define FFS_ISREG 0x4UL
+#else
+#define FFS_ISREG 0x0UL
+#endif
+#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)
+
+static inline bool io_req_ffs_set(struct io_kiocb *req)
+{
+ return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE);
+}
+
static void io_req_track_inflight(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_INFLIGHT)) {
@@ -1244,6 +1367,32 @@ static void io_req_track_inflight(struct io_kiocb *req)
}
}
+static inline void io_unprep_linked_timeout(struct io_kiocb *req)
+{
+ req->flags &= ~REQ_F_LINK_TIMEOUT;
+}
+
+static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
+{
+ if (WARN_ON_ONCE(!req->link))
+ return NULL;
+
+ req->flags &= ~REQ_F_ARM_LTIMEOUT;
+ req->flags |= REQ_F_LINK_TIMEOUT;
+
+ /* linked timeouts should have two refs once prep'ed */
+ io_req_set_refcount(req);
+ __io_req_set_refcount(req->link, 2);
+ return req->link;
+}
+
+static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
+{
+ if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
+ return NULL;
+ return __io_prep_linked_timeout(req);
+}
+
static void io_prep_async_work(struct io_kiocb *req)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
@@ -1283,22 +1432,25 @@ static void io_prep_async_link(struct io_kiocb *req)
if (req->flags & REQ_F_LINK_TIMEOUT) {
struct io_ring_ctx *ctx = req->ctx;
- spin_lock_irq(&ctx->completion_lock);
+ spin_lock(&ctx->completion_lock);
io_for_each_link(cur, req)
io_prep_async_work(cur);
- spin_unlock_irq(&ctx->completion_lock);
+ spin_unlock(&ctx->completion_lock);
} else {
io_for_each_link(cur, req)
io_prep_async_work(cur);
}
}
-static void io_queue_async_work(struct io_kiocb *req)
+static void io_queue_async_work(struct io_kiocb *req, bool *locked)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link = io_prep_linked_timeout(req);
struct io_uring_task *tctx = req->task->io_uring;
+ /* must not take the lock, NULL it as a precaution */
+ locked = NULL;
+
BUG_ON(!tctx);
BUG_ON(!tctx->io_wq);
@@ -1324,6 +1476,7 @@ static void io_queue_async_work(struct io_kiocb *req)
static void io_kill_timeout(struct io_kiocb *req, int status)
__must_hold(&req->ctx->completion_lock)
+ __must_hold(&req->ctx->timeout_lock)
{
struct io_timeout_data *io = req->async_data;
@@ -1332,7 +1485,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status)
atomic_read(&req->ctx->cq_timeouts) + 1);
list_del_init(&req->timeout.list);
io_cqring_fill_event(req->ctx, req->user_data, status, 0);
- io_put_req_deferred(req, 1);
+ io_put_req_deferred(req);
}
}
@@ -1351,9 +1504,11 @@ static void io_queue_deferred(struct io_ring_ctx *ctx)
}
static void io_flush_timeouts(struct io_ring_ctx *ctx)
+ __must_hold(&ctx->completion_lock)
{
u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
+ spin_lock_irq(&ctx->timeout_lock);
while (!list_empty(&ctx->timeout_list)) {
u32 events_needed, events_got;
struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
@@ -1378,6 +1533,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
io_kill_timeout(req, 0);
}
ctx->cq_last_tm_flush = seq;
+ spin_unlock_irq(&ctx->timeout_lock);
}
static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
@@ -1434,13 +1590,22 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
return !ctx->eventfd_async || io_wq_current_is_worker();
}
+/*
+ * This should only get called when at least one event has been posted.
+ * Some applications rely on the eventfd notification count only changing
+ * IFF a new CQE has been added to the CQ ring. There's no dependency on a
+ * 1:1 relationship between how many times this function is called (and
+ * hence the eventfd count) and the number of CQEs posted to the CQ ring.
+ */
static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
- /* see waitqueue_active() comment */
- smp_mb();
-
- if (waitqueue_active(&ctx->cq_wait))
- wake_up(&ctx->cq_wait);
+ /*
+ * wake_up_all() may seem excessive, but io_wake_function() and
+ * io_should_wake() handle the termination of the loop and only
+ * wake as many waiters as we need to.
+ */
+ if (wq_has_sleeper(&ctx->cq_wait))
+ wake_up_all(&ctx->cq_wait);
if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
wake_up(&ctx->sq_data->wait);
if (io_should_trigger_evfd(ctx))
@@ -1453,12 +1618,9 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
{
- /* see waitqueue_active() comment */
- smp_mb();
-
if (ctx->flags & IORING_SETUP_SQPOLL) {
- if (waitqueue_active(&ctx->cq_wait))
- wake_up(&ctx->cq_wait);
+ if (wq_has_sleeper(&ctx->cq_wait))
+ wake_up_all(&ctx->cq_wait);
}
if (io_should_trigger_evfd(ctx))
eventfd_signal(ctx->cq_ev_fd, 1);
@@ -1471,14 +1633,13 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
/* Returns true if there are no backlogged entries after the flush */
static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
- unsigned long flags;
bool all_flushed, posted;
if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
return false;
posted = false;
- spin_lock_irqsave(&ctx->completion_lock, flags);
+ spin_lock(&ctx->completion_lock);
while (!list_empty(&ctx->cq_overflow_list)) {
struct io_uring_cqe *cqe = io_get_cqe(ctx);
struct io_overflow_cqe *ocqe;
@@ -1506,13 +1667,13 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
if (posted)
io_commit_cqring(ctx);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ spin_unlock(&ctx->completion_lock);
if (posted)
io_cqring_ev_posted(ctx);
return all_flushed;
}
-static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
+static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
{
bool ret = true;
@@ -1520,7 +1681,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
/* iopoll syncs against uring_lock, not completion_lock */
if (ctx->flags & IORING_SETUP_IOPOLL)
mutex_lock(&ctx->uring_lock);
- ret = __io_cqring_overflow_flush(ctx, force);
+ ret = __io_cqring_overflow_flush(ctx, false);
if (ctx->flags & IORING_SETUP_IOPOLL)
mutex_unlock(&ctx->uring_lock);
}
@@ -1528,39 +1689,37 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
return ret;
}
-/*
- * Shamelessly stolen from the mm implementation of page reference checking,
- * see commit f958d7b528b1 for details.
- */
-#define req_ref_zero_or_close_to_overflow(req) \
- ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)
-
-static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
+/* must be called shortly after putting a request */
+static inline void io_put_task(struct task_struct *task, int nr)
{
- return atomic_inc_not_zero(&req->refs);
-}
+ struct io_uring_task *tctx = task->io_uring;
-static inline bool req_ref_sub_and_test(struct io_kiocb *req, int refs)
-{
- WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
- return atomic_sub_and_test(refs, &req->refs);
+ if (likely(task == current)) {
+ tctx->cached_refs += nr;
+ } else {
+ percpu_counter_sub(&tctx->inflight, nr);
+ if (unlikely(atomic_read(&tctx->in_idle)))
+ wake_up(&tctx->wait);
+ put_task_struct_many(task, nr);
+ }
}
-static inline bool req_ref_put_and_test(struct io_kiocb *req)
+static void io_task_refs_refill(struct io_uring_task *tctx)
{
- WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
- return atomic_dec_and_test(&req->refs);
-}
+ unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
-static inline void req_ref_put(struct io_kiocb *req)
-{
- WARN_ON_ONCE(req_ref_put_and_test(req));
+ percpu_counter_add(&tctx->inflight, refill);
+ refcount_add(refill, &current->usage);
+ tctx->cached_refs += refill;
}
-static inline void req_ref_get(struct io_kiocb *req)
+static inline void io_get_task_refs(int nr)
{
- WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
- atomic_inc(&req->refs);
+ struct io_uring_task *tctx = current->io_uring;
+
+ tctx->cached_refs -= nr;
+ if (unlikely(tctx->cached_refs < 0))
+ io_task_refs_refill(tctx);
}
static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
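io_put_task()/io_get_task_refs() above amortise task reference counting through a per-task cache that is refilled in chunks of IO_TCTX_REFS_CACHE_NR (defined elsewhere in the patch; 1024 is assumed below). A simplified userspace model of that batching:

#include <stdio.h>

#define REFS_CACHE_NR	1024	/* assumed stand-in for IO_TCTX_REFS_CACHE_NR */

static long global_refs;	/* stands in for tctx->inflight + task usage */
static int cached_refs;		/* stands in for tctx->cached_refs */

static void refs_refill(void)
{
	int refill = -cached_refs + REFS_CACHE_NR;

	global_refs += refill;		/* one bulk operation ... */
	cached_refs += refill;		/* ... backs many cheap ones */
}

static void get_task_refs(int nr)
{
	cached_refs -= nr;
	if (cached_refs < 0)
		refs_refill();
}

int main(void)
{
	get_task_refs(3);	/* first call pays for the whole batch */
	get_task_refs(3);	/* later calls only touch the local cache */
	printf("global refs taken: %ld, still cached: %d\n",
	       global_refs, cached_refs);
	return 0;
}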
@@ -1624,9 +1783,8 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
unsigned int cflags)
{
struct io_ring_ctx *ctx = req->ctx;
- unsigned long flags;
- spin_lock_irqsave(&ctx->completion_lock, flags);
+ spin_lock(&ctx->completion_lock);
__io_cqring_fill_event(ctx, req->user_data, res, cflags);
/*
* If we're the last reference to this request, add to our locked
@@ -1634,7 +1792,7 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
*/
if (req_ref_put_and_test(req)) {
if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
- if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL))
+ if (req->flags & IO_DISARM_MASK)
io_disarm_next(req);
if (req->link) {
io_req_task_queue(req->link);
@@ -1643,14 +1801,14 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
}
io_dismantle_req(req);
io_put_task(req->task, 1);
- list_add(&req->compl.list, &ctx->locked_free_list);
+ list_add(&req->inflight_entry, &ctx->locked_free_list);
ctx->locked_free_nr++;
} else {
if (!percpu_ref_tryget(&ctx->refs))
req = NULL;
}
io_commit_cqring(ctx);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ spin_unlock(&ctx->completion_lock);
if (req) {
io_cqring_ev_posted(ctx);
@@ -1690,24 +1848,35 @@ static inline void io_req_complete(struct io_kiocb *req, long res)
static void io_req_complete_failed(struct io_kiocb *req, long res)
{
req_set_fail(req);
- io_put_req(req);
io_req_complete_post(req, res, 0);
}
+/*
+ * Don't initialise the fields below on every allocation, but do that in
+ * advance and keep them valid across allocations.
+ */
+static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
+{
+ req->ctx = ctx;
+ req->link = NULL;
+ req->async_data = NULL;
+ /* not necessary, but safer to zero */
+ req->result = 0;
+}
+
static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
- struct io_comp_state *cs)
+ struct io_submit_state *state)
{
- spin_lock_irq(&ctx->completion_lock);
- list_splice_init(&ctx->locked_free_list, &cs->free_list);
+ spin_lock(&ctx->completion_lock);
+ list_splice_init(&ctx->locked_free_list, &state->free_list);
ctx->locked_free_nr = 0;
- spin_unlock_irq(&ctx->completion_lock);
+ spin_unlock(&ctx->completion_lock);
}
/* Returns true IFF there are requests in the cache */
static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
{
struct io_submit_state *state = &ctx->submit_state;
- struct io_comp_state *cs = &state->comp;
int nr;
/*
@@ -1716,14 +1885,14 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
* side cache.
*/
if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
- io_flush_cached_locked_reqs(ctx, cs);
+ io_flush_cached_locked_reqs(ctx, state);
nr = state->free_reqs;
- while (!list_empty(&cs->free_list)) {
- struct io_kiocb *req = list_first_entry(&cs->free_list,
- struct io_kiocb, compl.list);
+ while (!list_empty(&state->free_list)) {
+ struct io_kiocb *req = list_first_entry(&state->free_list,
+ struct io_kiocb, inflight_entry);
- list_del(&req->compl.list);
+ list_del(&req->inflight_entry);
state->reqs[nr++] = req;
if (nr == ARRAY_SIZE(state->reqs))
break;
@@ -1733,48 +1902,41 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
return nr != 0;
}
+/*
+ * A request might get retired back into the request caches even before opcode
+ * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
+ * Because of that, io_alloc_req() should be called only under ->uring_lock
+ * and with extra caution to not get a request that is still worked on.
+ */
static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
+ __must_hold(&ctx->uring_lock)
{
struct io_submit_state *state = &ctx->submit_state;
+ gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
+ int ret, i;
BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH);
- if (!state->free_reqs) {
- gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
- int ret, i;
+ if (likely(state->free_reqs || io_flush_cached_reqs(ctx)))
+ goto got_req;
- if (io_flush_cached_reqs(ctx))
- goto got_req;
+ ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
+ state->reqs);
- ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
- state->reqs);
-
- /*
- * Bulk alloc is all-or-nothing. If we fail to get a batch,
- * retry single alloc to be on the safe side.
- */
- if (unlikely(ret <= 0)) {
- state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
- if (!state->reqs[0])
- return NULL;
- ret = 1;
- }
-
- /*
- * Don't initialise the fields below on every allocation, but
- * do that in advance and keep valid on free.
- */
- for (i = 0; i < ret; i++) {
- struct io_kiocb *req = state->reqs[i];
-
- req->ctx = ctx;
- req->link = NULL;
- req->async_data = NULL;
- /* not necessary, but safer to zero */
- req->result = 0;
- }
- state->free_reqs = ret;
+ /*
+ * Bulk alloc is all-or-nothing. If we fail to get a batch,
+ * retry single alloc to be on the safe side.
+ */
+ if (unlikely(ret <= 0)) {
+ state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
+ if (!state->reqs[0])
+ return NULL;
+ ret = 1;
}
+
+ for (i = 0; i < ret; i++)
+ io_preinit_req(state->reqs[i], ctx);
+ state->free_reqs = ret;
got_req:
state->free_reqs--;
return state->reqs[state->free_reqs];
@@ -1802,17 +1964,6 @@ static void io_dismantle_req(struct io_kiocb *req)
}
}
-/* must to be called somewhat shortly after putting a request */
-static inline void io_put_task(struct task_struct *task, int nr)
-{
- struct io_uring_task *tctx = task->io_uring;
-
- percpu_counter_sub(&tctx->inflight, nr);
- if (unlikely(atomic_read(&tctx->in_idle)))
- wake_up(&tctx->wait);
- put_task_struct_many(task, nr);
-}
-
static void __io_free_req(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -1820,7 +1971,11 @@ static void __io_free_req(struct io_kiocb *req)
io_dismantle_req(req);
io_put_task(req->task, 1);
- kmem_cache_free(req_cachep, req);
+ spin_lock(&ctx->completion_lock);
+ list_add(&req->inflight_entry, &ctx->locked_free_list);
+ ctx->locked_free_nr++;
+ spin_unlock(&ctx->completion_lock);
+
percpu_ref_put(&ctx->refs);
}
@@ -1834,22 +1989,20 @@ static inline void io_remove_next_linked(struct io_kiocb *req)
static bool io_kill_linked_timeout(struct io_kiocb *req)
__must_hold(&req->ctx->completion_lock)
+ __must_hold(&req->ctx->timeout_lock)
{
struct io_kiocb *link = req->link;
- /*
- * Can happen if a linked timeout fired and link had been like
- * req -> link t-out -> link t-out [-> ...]
- */
- if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) {
+ if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
struct io_timeout_data *io = link->async_data;
io_remove_next_linked(req);
link->timeout.head = NULL;
if (hrtimer_try_to_cancel(&io->timer) != -1) {
+ list_del(&link->timeout.list);
io_cqring_fill_event(link->ctx, link->user_data,
-ECANCELED, 0);
- io_put_req_deferred(link, 1);
+ io_put_req_deferred(link);
return true;
}
}
@@ -1863,12 +2016,17 @@ static void io_fail_links(struct io_kiocb *req)
req->link = NULL;
while (link) {
+ long res = -ECANCELED;
+
+ if (link->flags & REQ_F_FAIL)
+ res = link->result;
+
nxt = link->link;
link->link = NULL;
trace_io_uring_fail_link(req, link);
- io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0);
- io_put_req_deferred(link, 2);
+ io_cqring_fill_event(link->ctx, link->user_data, res, 0);
+ io_put_req_deferred(link);
link = nxt;
}
}
@@ -1878,8 +2036,24 @@ static bool io_disarm_next(struct io_kiocb *req)
{
bool posted = false;
- if (likely(req->flags & REQ_F_LINK_TIMEOUT))
+ if (req->flags & REQ_F_ARM_LTIMEOUT) {
+ struct io_kiocb *link = req->link;
+
+ req->flags &= ~REQ_F_ARM_LTIMEOUT;
+ if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
+ io_remove_next_linked(req);
+ io_cqring_fill_event(link->ctx, link->user_data,
+ -ECANCELED, 0);
+ io_put_req_deferred(link);
+ posted = true;
+ }
+ } else if (req->flags & REQ_F_LINK_TIMEOUT) {
+ struct io_ring_ctx *ctx = req->ctx;
+
+ spin_lock_irq(&ctx->timeout_lock);
posted = io_kill_linked_timeout(req);
+ spin_unlock_irq(&ctx->timeout_lock);
+ }
if (unlikely((req->flags & REQ_F_FAIL) &&
!(req->flags & REQ_F_HARDLINK))) {
posted |= (req->link != NULL);
@@ -1898,16 +2072,15 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
* dependencies to the next request. In case of failure, fail the rest
* of the chain.
*/
- if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL)) {
+ if (req->flags & IO_DISARM_MASK) {
struct io_ring_ctx *ctx = req->ctx;
- unsigned long flags;
bool posted;
- spin_lock_irqsave(&ctx->completion_lock, flags);
+ spin_lock(&ctx->completion_lock);
posted = io_disarm_next(req);
if (posted)
io_commit_cqring(req->ctx);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ spin_unlock(&ctx->completion_lock);
if (posted)
io_cqring_ev_posted(ctx);
}
@@ -1923,20 +2096,22 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
return __io_req_find_next(req);
}
-static void ctx_flush_and_put(struct io_ring_ctx *ctx)
+static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
{
if (!ctx)
return;
- if (ctx->submit_state.comp.nr) {
- mutex_lock(&ctx->uring_lock);
- io_submit_flush_completions(ctx);
+ if (*locked) {
+ if (ctx->submit_state.compl_nr)
+ io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
+ *locked = false;
}
percpu_ref_put(&ctx->refs);
}
static void tctx_task_work(struct callback_head *cb)
{
+ bool locked = false;
struct io_ring_ctx *ctx = NULL;
struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
task_work);
@@ -1947,37 +2122,32 @@ static void tctx_task_work(struct callback_head *cb)
spin_lock_irq(&tctx->task_lock);
node = tctx->task_list.first;
INIT_WQ_LIST(&tctx->task_list);
+ if (!node)
+ tctx->task_running = false;
spin_unlock_irq(&tctx->task_lock);
+ if (!node)
+ break;
- while (node) {
+ do {
struct io_wq_work_node *next = node->next;
struct io_kiocb *req = container_of(node, struct io_kiocb,
io_task_work.node);
if (req->ctx != ctx) {
- ctx_flush_and_put(ctx);
+ ctx_flush_and_put(ctx, &locked);
ctx = req->ctx;
+ /* if not contended, grab and improve batching */
+ locked = mutex_trylock(&ctx->uring_lock);
percpu_ref_get(&ctx->refs);
}
- req->io_task_work.func(req);
+ req->io_task_work.func(req, &locked);
node = next;
- }
- if (wq_list_empty(&tctx->task_list)) {
- spin_lock_irq(&tctx->task_lock);
- clear_bit(0, &tctx->task_state);
- if (wq_list_empty(&tctx->task_list)) {
- spin_unlock_irq(&tctx->task_lock);
- break;
- }
- spin_unlock_irq(&tctx->task_lock);
- /* another tctx_task_work() is enqueued, yield */
- if (test_and_set_bit(0, &tctx->task_state))
- break;
- }
+ } while (node);
+
cond_resched();
}
- ctx_flush_and_put(ctx);
+ ctx_flush_and_put(ctx, &locked);
}
static void io_req_task_work_add(struct io_kiocb *req)
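io_tw_lock(), ctx_flush_and_put() and the reworked tctx_task_work() above share one idea: take ->uring_lock at most once per task_work run (opportunistically via trylock), remember that in a local flag, and drop it only when switching rings or when the run ends. A small pthread-based sketch of that pattern (nothing here is io_uring API):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t uring_lock = PTHREAD_MUTEX_INITIALIZER;

static void tw_lock(bool *locked)
{
	if (!*locked) {
		pthread_mutex_lock(&uring_lock);
		*locked = true;
	}
}

static void flush_and_put(bool *locked)
{
	if (*locked) {
		/* flush any batched completions here before unlocking */
		pthread_mutex_unlock(&uring_lock);
		*locked = false;
	}
}

static void handler(int id, bool *locked)
{
	tw_lock(locked);	/* cheap no-op if an earlier handler locked */
	printf("handling work %d under the lock\n", id);
}

int main(void)
{
	bool locked = pthread_mutex_trylock(&uring_lock) == 0;

	for (int i = 0; i < 3; i++)
		handler(i, &locked);
	flush_and_put(&locked);
	return 0;
}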
@@ -1987,19 +2157,20 @@ static void io_req_task_work_add(struct io_kiocb *req)
enum task_work_notify_mode notify;
struct io_wq_work_node *node;
unsigned long flags;
+ bool running;
WARN_ON_ONCE(!tctx);
spin_lock_irqsave(&tctx->task_lock, flags);
wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
+ running = tctx->task_running;
+ if (!running)
+ tctx->task_running = true;
spin_unlock_irqrestore(&tctx->task_lock, flags);
/* task_work already pending, we're done */
- if (test_bit(0, &tctx->task_state) ||
- test_and_set_bit(0, &tctx->task_state))
+ if (running)
return;
- if (unlikely(tsk->flags & PF_EXITING))
- goto fail;
/*
* SQPOLL kernel thread doesn't need notification, just a wakeup. For
@@ -2012,9 +2183,9 @@ static void io_req_task_work_add(struct io_kiocb *req)
wake_up_process(tsk);
return;
}
-fail:
- clear_bit(0, &tctx->task_state);
+
spin_lock_irqsave(&tctx->task_lock, flags);
+ tctx->task_running = false;
node = tctx->task_list.first;
INIT_WQ_LIST(&tctx->task_list);
spin_unlock_irqrestore(&tctx->task_lock, flags);
@@ -2028,27 +2199,25 @@ fail:
}
}
-static void io_req_task_cancel(struct io_kiocb *req)
+static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
{
struct io_ring_ctx *ctx = req->ctx;
- /* ctx is guaranteed to stay alive while we hold uring_lock */
- mutex_lock(&ctx->uring_lock);
+ /* not needed for normal modes, but SQPOLL depends on it */
+ io_tw_lock(ctx, locked);
io_req_complete_failed(req, req->result);
- mutex_unlock(&ctx->uring_lock);
}
-static void io_req_task_submit(struct io_kiocb *req)
+static void io_req_task_submit(struct io_kiocb *req, bool *locked)
{
struct io_ring_ctx *ctx = req->ctx;
- /* ctx stays valid until unlock, even if we drop all ours ctx->refs */
- mutex_lock(&ctx->uring_lock);
- if (!(req->task->flags & PF_EXITING) && !req->task->in_execve)
+ io_tw_lock(ctx, locked);
+ /* req->task == current here, checking PF_EXITING is safe */
+ if (likely(!(req->task->flags & PF_EXITING)))
__io_queue_sqe(req);
else
io_req_complete_failed(req, -EFAULT);
- mutex_unlock(&ctx->uring_lock);
}
static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
@@ -2084,6 +2253,11 @@ static void io_free_req(struct io_kiocb *req)
__io_free_req(req);
}
+static void io_free_req_work(struct io_kiocb *req, bool *locked)
+{
+ io_free_req(req);
+}
+
struct req_batch {
struct task_struct *task;
int task_refs;
@@ -2100,10 +2274,10 @@ static inline void io_init_req_batch(struct req_batch *rb)
static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
struct req_batch *rb)
{
- if (rb->task)
- io_put_task(rb->task, rb->task_refs);
if (rb->ctx_refs)
percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
+ if (rb->task)
+ io_put_task(rb->task, rb->task_refs);
}
static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
@@ -2124,37 +2298,37 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
if (state->free_reqs != ARRAY_SIZE(state->reqs))
state->reqs[state->free_reqs++] = req;
else
- list_add(&req->compl.list, &state->comp.free_list);
+ list_add(&req->inflight_entry, &state->free_list);
}
static void io_submit_flush_completions(struct io_ring_ctx *ctx)
+ __must_hold(&ctx->uring_lock)
{
- struct io_comp_state *cs = &ctx->submit_state.comp;
- int i, nr = cs->nr;
+ struct io_submit_state *state = &ctx->submit_state;
+ int i, nr = state->compl_nr;
struct req_batch rb;
- spin_lock_irq(&ctx->completion_lock);
+ spin_lock(&ctx->completion_lock);
for (i = 0; i < nr; i++) {
- struct io_kiocb *req = cs->reqs[i];
+ struct io_kiocb *req = state->compl_reqs[i];
__io_cqring_fill_event(ctx, req->user_data, req->result,
req->compl.cflags);
}
io_commit_cqring(ctx);
- spin_unlock_irq(&ctx->completion_lock);
+ spin_unlock(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
io_init_req_batch(&rb);
for (i = 0; i < nr; i++) {
- struct io_kiocb *req = cs->reqs[i];
+ struct io_kiocb *req = state->compl_reqs[i];
- /* submission and completion refs */
- if (req_ref_sub_and_test(req, 2))
+ if (req_ref_put_and_test(req))
io_req_free_batch(&rb, req, &ctx->submit_state);
}
io_req_free_batch_finish(ctx, &rb);
- cs->nr = 0;
+ state->compl_nr = 0;
}
/*
@@ -2178,16 +2352,12 @@ static inline void io_put_req(struct io_kiocb *req)
io_free_req(req);
}
-static void io_free_req_deferred(struct io_kiocb *req)
+static inline void io_put_req_deferred(struct io_kiocb *req)
{
- req->io_task_work.func = io_free_req;
- io_req_task_work_add(req);
-}
-
-static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
-{
- if (req_ref_sub_and_test(req, refs))
- io_free_req_deferred(req);
+ if (req_ref_put_and_test(req)) {
+ req->io_task_work.func = io_free_req_work;
+ io_req_task_work_add(req);
+ }
}
static unsigned io_cqring_events(struct io_ring_ctx *ctx)
@@ -2220,6 +2390,8 @@ static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
{
struct io_buffer *kbuf;
+ if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ return 0;
kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
return io_put_kbuf(req, kbuf);
}
@@ -2239,7 +2411,7 @@ static inline bool io_run_task_work(void)
* Find and free completed poll iocbs
*/
static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
- struct list_head *done, bool resubmit)
+ struct list_head *done)
{
struct req_batch rb;
struct io_kiocb *req;
@@ -2249,23 +2421,18 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
io_init_req_batch(&rb);
while (!list_empty(done)) {
- int cflags = 0;
-
req = list_first_entry(done, struct io_kiocb, inflight_entry);
list_del(&req->inflight_entry);
- if (READ_ONCE(req->result) == -EAGAIN && resubmit &&
+ if (READ_ONCE(req->result) == -EAGAIN &&
!(req->flags & REQ_F_DONT_REISSUE)) {
req->iopoll_completed = 0;
- req_ref_get(req);
io_req_task_queue_reissue(req);
continue;
}
- if (req->flags & REQ_F_BUFFER_SELECTED)
- cflags = io_put_rw_kbuf(req);
-
- __io_cqring_fill_event(ctx, req->user_data, req->result, cflags);
+ __io_cqring_fill_event(ctx, req->user_data, req->result,
+ io_put_rw_kbuf(req));
(*nr_events)++;
if (req_ref_put_and_test(req))
@@ -2278,12 +2445,11 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
}
static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
- long min, bool resubmit)
+ long min)
{
struct io_kiocb *req, *tmp;
LIST_HEAD(done);
bool spin;
- int ret;
/*
* Only spin for completions if we don't have multiple devices hanging
@@ -2291,9 +2457,9 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
*/
spin = !ctx->poll_multi_queue && *nr_events < min;
- ret = 0;
list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
struct kiocb *kiocb = &req->rw.kiocb;
+ int ret;
/*
* Move completed and retryable entries to our local lists.
@@ -2308,22 +2474,20 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
break;
ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
- if (ret < 0)
- break;
+ if (unlikely(ret < 0))
+ return ret;
+ else if (ret)
+ spin = false;
/* iopoll may have completed current req */
if (READ_ONCE(req->iopoll_completed))
list_move_tail(&req->inflight_entry, &done);
-
- if (ret && spin)
- spin = false;
- ret = 0;
}
if (!list_empty(&done))
- io_iopoll_complete(ctx, nr_events, &done, resubmit);
+ io_iopoll_complete(ctx, nr_events, &done);
- return ret;
+ return 0;
}
/*
@@ -2339,7 +2503,7 @@ static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
while (!list_empty(&ctx->iopoll_list)) {
unsigned int nr_events = 0;
- io_do_iopoll(ctx, &nr_events, 0, false);
+ io_do_iopoll(ctx, &nr_events, 0);
/* let it sleep and repeat later if can't complete a request */
if (nr_events == 0)
@@ -2401,7 +2565,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
list_empty(&ctx->iopoll_list))
break;
}
- ret = io_do_iopoll(ctx, &nr_events, min, true);
+ ret = io_do_iopoll(ctx, &nr_events, min);
} while (!ret && nr_events < min && !need_resched());
out:
mutex_unlock(&ctx->uring_lock);
@@ -2470,44 +2634,57 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
}
#endif
-static void io_fallback_req_func(struct work_struct *work)
-{
- struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
- fallback_work.work);
- struct llist_node *node = llist_del_all(&ctx->fallback_llist);
- struct io_kiocb *req, *tmp;
-
- percpu_ref_get(&ctx->refs);
- llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
- req->io_task_work.func(req);
- percpu_ref_put(&ctx->refs);
-}
-
-static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
- unsigned int issue_flags)
+static bool __io_complete_rw_common(struct io_kiocb *req, long res)
{
- int cflags = 0;
-
if (req->rw.kiocb.ki_flags & IOCB_WRITE)
kiocb_end_write(req);
if (res != req->result) {
if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
io_rw_should_reissue(req)) {
req->flags |= REQ_F_REISSUE;
- return;
+ return true;
}
req_set_fail(req);
+ req->result = res;
}
- if (req->flags & REQ_F_BUFFER_SELECTED)
- cflags = io_put_rw_kbuf(req);
- __io_req_complete(req, issue_flags, res, cflags);
+ return false;
+}
+
+static void io_req_task_complete(struct io_kiocb *req, bool *locked)
+{
+ unsigned int cflags = io_put_rw_kbuf(req);
+ long res = req->result;
+
+ if (*locked) {
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_submit_state *state = &ctx->submit_state;
+
+ io_req_complete_state(req, res, cflags);
+ state->compl_reqs[state->compl_nr++] = req;
+ if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
+ io_submit_flush_completions(ctx);
+ } else {
+ io_req_complete_post(req, res, cflags);
+ }
+}
+
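When the ring lock is already held, io_req_task_complete() above stages the request in submit_state.compl_reqs and only flushes once the array fills (or at the end of the batch). A standalone sketch of that deferred-completion batching (sizes and names are illustrative):

#include <stdio.h>

#define COMPL_BATCH 32

struct batch_state {
	void *compl_reqs[COMPL_BATCH];
	unsigned int compl_nr;
};

static void flush_completions(struct batch_state *s)
{
	/* one lock round-trip / one CQ commit covers the whole batch */
	printf("flushing %u completions\n", s->compl_nr);
	s->compl_nr = 0;
}

static void complete_locked(struct batch_state *s, void *req)
{
	s->compl_reqs[s->compl_nr++] = req;
	if (s->compl_nr == COMPL_BATCH)
		flush_completions(s);
}

int main(void)
{
	struct batch_state s = { .compl_nr = 0 };
	int reqs[40];

	for (int i = 0; i < 40; i++)
		complete_locked(&s, &reqs[i]);
	if (s.compl_nr)		/* leftover partial batch */
		flush_completions(&s);
	return 0;
}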
+static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
+ unsigned int issue_flags)
+{
+ if (__io_complete_rw_common(req, res))
+ return;
+ __io_req_complete(req, 0, req->result, io_put_rw_kbuf(req));
}
static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
- __io_complete_rw(req, res, res2, 0);
+ if (__io_complete_rw_common(req, res))
+ return;
+ req->result = res;
+ req->io_task_work.func = io_req_task_complete;
+ io_req_task_work_add(req);
}
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
@@ -2593,40 +2770,6 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
}
}
-static inline void io_state_file_put(struct io_submit_state *state)
-{
- if (state->file_refs) {
- fput_many(state->file, state->file_refs);
- state->file_refs = 0;
- }
-}
-
-/*
- * Get as many references to a file as we have IOs left in this submission,
- * assuming most submissions are for one file, or at least that each file
- * has more than one submission.
- */
-static struct file *__io_file_get(struct io_submit_state *state, int fd)
-{
- if (!state)
- return fget(fd);
-
- if (state->file_refs) {
- if (state->fd == fd) {
- state->file_refs--;
- return state->file;
- }
- io_state_file_put(state);
- }
- state->file = fget_many(fd, state->ios_left);
- if (unlikely(!state->file))
- return NULL;
-
- state->fd = fd;
- state->file_refs = state->ios_left - 1;
- return state->file;
-}
-
static bool io_bdev_nowait(struct block_device *bdev)
{
return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
@@ -2637,7 +2780,7 @@ static bool io_bdev_nowait(struct block_device *bdev)
* any file. For now, just ensure that anything potentially problematic is done
* inline.
*/
-static bool __io_file_supports_async(struct file *file, int rw)
+static bool __io_file_supports_nowait(struct file *file, int rw)
{
umode_t mode = file_inode(file)->i_mode;
@@ -2670,14 +2813,14 @@ static bool __io_file_supports_async(struct file *file, int rw)
return file->f_op->write_iter != NULL;
}
-static bool io_file_supports_async(struct io_kiocb *req, int rw)
+static bool io_file_supports_nowait(struct io_kiocb *req, int rw)
{
- if (rw == READ && (req->flags & REQ_F_ASYNC_READ))
+ if (rw == READ && (req->flags & REQ_F_NOWAIT_READ))
return true;
- else if (rw == WRITE && (req->flags & REQ_F_ASYNC_WRITE))
+ else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE))
return true;
- return __io_file_supports_async(req->file, rw);
+ return __io_file_supports_nowait(req->file, rw);
}
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -2688,7 +2831,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
unsigned ioprio;
int ret;
- if (!(req->flags & REQ_F_ISREG) && S_ISREG(file_inode(file)->i_mode))
+ if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode))
req->flags |= REQ_F_ISREG;
kiocb->ki_pos = READ_ONCE(sqe->off);
@@ -2721,7 +2864,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
!kiocb->ki_filp->f_op->iopoll)
return -EOPNOTSUPP;
- kiocb->ki_flags |= IOCB_HIPRI;
+ kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
kiocb->ki_complete = io_complete_rw_iopoll;
req->iopoll_completed = 0;
} else {
@@ -2788,15 +2931,11 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
if (check_reissue && (req->flags & REQ_F_REISSUE)) {
req->flags &= ~REQ_F_REISSUE;
if (io_resubmit_prep(req)) {
- req_ref_get(req);
io_req_task_queue_reissue(req);
} else {
- int cflags = 0;
-
req_set_fail(req);
- if (req->flags & REQ_F_BUFFER_SELECTED)
- cflags = io_put_rw_kbuf(req);
- __io_req_complete(req, issue_flags, ret, cflags);
+ __io_req_complete(req, issue_flags, ret,
+ io_put_rw_kbuf(req));
}
}
}
@@ -3214,9 +3353,6 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
list_del_init(&wait->entry);
-
- /* submit ref gets dropped, acquire a new one */
- req_ref_get(req);
io_req_task_queue(req);
return 1;
}
@@ -3301,7 +3437,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
kiocb->ki_flags |= IOCB_NOWAIT;
/* If the file doesn't support async, just async punt */
- if (force_nonblock && !io_file_supports_async(req, READ)) {
+ if (force_nonblock && !io_file_supports_nowait(req, READ)) {
ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
return ret ?: -EAGAIN;
}
@@ -3406,7 +3542,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
kiocb->ki_flags |= IOCB_NOWAIT;
/* If the file doesn't support async, just async punt */
- if (force_nonblock && !io_file_supports_async(req, WRITE))
+ if (force_nonblock && !io_file_supports_nowait(req, WRITE))
goto copy_iov;
/* file path doesn't support NOWAIT for non-direct_IO */
@@ -3481,7 +3617,7 @@ static int io_renameat_prep(struct io_kiocb *req,
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->ioprio || sqe->buf_index)
+ if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -3532,7 +3668,8 @@ static int io_unlinkat_prep(struct io_kiocb *req,
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
+ if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
+ sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -3572,14 +3709,157 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
return 0;
}
+static int io_mkdirat_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_mkdir *mkd = &req->mkdir;
+ const char __user *fname;
+
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index ||
+ sqe->splice_fd_in)
+ return -EINVAL;
+ if (unlikely(req->flags & REQ_F_FIXED_FILE))
+ return -EBADF;
+
+ mkd->dfd = READ_ONCE(sqe->fd);
+ mkd->mode = READ_ONCE(sqe->len);
+
+ fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ mkd->filename = getname(fname);
+ if (IS_ERR(mkd->filename))
+ return PTR_ERR(mkd->filename);
+
+ req->flags |= REQ_F_NEED_CLEANUP;
+ return 0;
+}
+
+static int io_mkdirat(struct io_kiocb *req, int issue_flags)
+{
+ struct io_mkdir *mkd = &req->mkdir;
+ int ret;
+
+ if (issue_flags & IO_URING_F_NONBLOCK)
+ return -EAGAIN;
+
+ ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);
+
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+ if (ret < 0)
+ req_set_fail(req);
+ io_req_complete(req, ret);
+ return 0;
+}
+
+static int io_symlinkat_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_symlink *sl = &req->symlink;
+ const char __user *oldpath, *newpath;
+
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index ||
+ sqe->splice_fd_in)
+ return -EINVAL;
+ if (unlikely(req->flags & REQ_F_FIXED_FILE))
+ return -EBADF;
+
+ sl->new_dfd = READ_ONCE(sqe->fd);
+ oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+
+ sl->oldpath = getname(oldpath);
+ if (IS_ERR(sl->oldpath))
+ return PTR_ERR(sl->oldpath);
+
+ sl->newpath = getname(newpath);
+ if (IS_ERR(sl->newpath)) {
+ putname(sl->oldpath);
+ return PTR_ERR(sl->newpath);
+ }
+
+ req->flags |= REQ_F_NEED_CLEANUP;
+ return 0;
+}
+
+static int io_symlinkat(struct io_kiocb *req, int issue_flags)
+{
+ struct io_symlink *sl = &req->symlink;
+ int ret;
+
+ if (issue_flags & IO_URING_F_NONBLOCK)
+ return -EAGAIN;
+
+ ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath);
+
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+ if (ret < 0)
+ req_set_fail(req);
+ io_req_complete(req, ret);
+ return 0;
+}
+
+static int io_linkat_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_hardlink *lnk = &req->hardlink;
+ const char __user *oldf, *newf;
+
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
+ return -EINVAL;
+ if (unlikely(req->flags & REQ_F_FIXED_FILE))
+ return -EBADF;
+
+ lnk->old_dfd = READ_ONCE(sqe->fd);
+ lnk->new_dfd = READ_ONCE(sqe->len);
+ oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+ lnk->flags = READ_ONCE(sqe->hardlink_flags);
+
+ lnk->oldpath = getname(oldf);
+ if (IS_ERR(lnk->oldpath))
+ return PTR_ERR(lnk->oldpath);
+
+ lnk->newpath = getname(newf);
+ if (IS_ERR(lnk->newpath)) {
+ putname(lnk->oldpath);
+ return PTR_ERR(lnk->newpath);
+ }
+
+ req->flags |= REQ_F_NEED_CLEANUP;
+ return 0;
+}
+
+static int io_linkat(struct io_kiocb *req, int issue_flags)
+{
+ struct io_hardlink *lnk = &req->hardlink;
+ int ret;
+
+ if (issue_flags & IO_URING_F_NONBLOCK)
+ return -EAGAIN;
+
+ ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd,
+ lnk->newpath, lnk->flags);
+
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+ if (ret < 0)
+ req_set_fail(req);
+ io_req_complete(req, ret);
+ return 0;
+}
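IORING_OP_LINKAT needs both an old and a new directory fd, so the new one travels in len (which mkdirat uses for the mode), while hardlink_flags carries the linkat(2) flags such as AT_SYMLINK_FOLLOW. Another sketch under the same assumptions (invented helper name, uapi header from this series):

/* Hypothetical helper: mirrors io_linkat_prep()'s field mapping. */
static void fill_linkat_sqe(struct io_uring_sqe *sqe, int olddirfd,
			    const char *oldpath, int newdirfd,
			    const char *newpath, int flags)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_LINKAT;
	sqe->fd = olddirfd;			/* lnk->old_dfd */
	sqe->len = newdirfd;			/* lnk->new_dfd */
	sqe->addr = (unsigned long) oldpath;	/* lnk->oldpath */
	sqe->addr2 = (unsigned long) newpath;	/* lnk->newpath */
	sqe->hardlink_flags = flags;		/* e.g. AT_SYMLINK_FOLLOW */
}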
+
static int io_shutdown_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_NET)
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
- sqe->buf_index)
+ if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
+ sqe->buf_index || sqe->splice_fd_in))
return -EINVAL;
req->shutdown.how = READ_ONCE(sqe->len);
@@ -3628,7 +3908,7 @@ static int __io_splice_prep(struct io_kiocb *req,
if (unlikely(sp->flags & ~valid_flags))
return -EINVAL;
- sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in),
+ sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in),
(sp->flags & SPLICE_F_FD_IN_FIXED));
if (!sp->file_in)
return -EBADF;
@@ -3727,7 +4007,8 @@ static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
+ if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
+ sqe->splice_fd_in))
return -EINVAL;
req->sync.flags = READ_ONCE(sqe->fsync_flags);
@@ -3760,7 +4041,8 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
static int io_fallocate_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
- if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
+ if (sqe->ioprio || sqe->buf_index || sqe->rw_flags ||
+ sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
@@ -3791,6 +4073,8 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
const char __user *fname;
int ret;
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
if (unlikely(sqe->ioprio || sqe->buf_index))
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
@@ -3808,6 +4092,11 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
req->open.filename = NULL;
return ret;
}
+
+ req->open.file_slot = READ_ONCE(sqe->file_index);
+ if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC))
+ return -EINVAL;
+
req->open.nofile = rlimit(RLIMIT_NOFILE);
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
@@ -3815,12 +4104,9 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- u64 flags, mode;
+ u64 mode = READ_ONCE(sqe->len);
+ u64 flags = READ_ONCE(sqe->open_flags);
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- mode = READ_ONCE(sqe->len);
- flags = READ_ONCE(sqe->open_flags);
req->open.how = build_open_how(flags, mode);
return __io_openat_prep(req, sqe);
}
@@ -3831,8 +4117,6 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
size_t len;
int ret;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
len = READ_ONCE(sqe->len);
if (len < OPEN_HOW_SIZE_VER0)
@@ -3850,8 +4134,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
{
struct open_flags op;
struct file *file;
- bool nonblock_set;
- bool resolve_nonblock;
+ bool resolve_nonblock, nonblock_set;
+ bool fixed = !!req->open.file_slot;
int ret;
ret = build_open_flags(&req->open.how, &op);
@@ -3870,9 +4154,11 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
op.open_flag |= O_NONBLOCK;
}
- ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
- if (ret < 0)
- goto err;
+ if (!fixed) {
+ ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
+ if (ret < 0)
+ goto err;
+ }
file = do_filp_open(req->open.dfd, req->open.filename, &op);
if (IS_ERR(file)) {
@@ -3881,7 +4167,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
* marginal gain for something that is now known to be a slower
* path. So just put it, and we'll get a new one when we retry.
*/
- put_unused_fd(ret);
+ if (!fixed)
+ put_unused_fd(ret);
ret = PTR_ERR(file);
/* only retry if RESOLVE_CACHED wasn't already set by application */
@@ -3894,7 +4181,12 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
file->f_flags &= ~O_NONBLOCK;
fsnotify_open(file);
- fd_install(ret, file);
+
+ if (!fixed)
+ fd_install(ret, file);
+ else
+ ret = io_install_fixed_file(req, file, issue_flags,
+ req->open.file_slot - 1);
err:
putname(req->open.filename);
req->flags &= ~REQ_F_NEED_CLEANUP;
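With the file_slot plumbing above, an open can land directly in the fixed-file table instead of installing a normal fd: sqe->file_index selects the slot, where a value of N means table slot N - 1 and 0 keeps the old fd-install behaviour, and O_CLOEXEC is rejected because registered files have no close-on-exec semantics. A hedged sketch of the submission side, assuming a file table already registered via io_uring_register(); the helper name is invented:

/* Hypothetical helper: open straight into fixed-file slot 'slot'. */
static void fill_openat_direct_sqe(struct io_uring_sqe *sqe, int dfd,
				   const char *path, int flags, mode_t mode,
				   unsigned int slot)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_OPENAT;
	sqe->fd = dfd;
	sqe->addr = (unsigned long) path;
	sqe->len = mode;			/* io_openat_prep() reads the mode from len */
	sqe->open_flags = flags;		/* must not include O_CLOEXEC */
	sqe->file_index = slot + 1;		/* 0 would allocate a normal fd instead */
}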
@@ -3915,7 +4207,8 @@ static int io_remove_buffers_prep(struct io_kiocb *req,
struct io_provide_buf *p = &req->pbuf;
u64 tmp;
- if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
+ if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
+ sqe->splice_fd_in)
return -EINVAL;
tmp = READ_ONCE(sqe->fd);
@@ -3986,7 +4279,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req,
struct io_provide_buf *p = &req->pbuf;
u64 tmp;
- if (sqe->ioprio || sqe->rw_flags)
+ if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
tmp = READ_ONCE(sqe->fd);
@@ -4073,7 +4366,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_EPOLL)
- if (sqe->ioprio || sqe->buf_index)
+ if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
@@ -4119,7 +4412,7 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
- if (sqe->ioprio || sqe->buf_index || sqe->off)
+ if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
@@ -4154,7 +4447,7 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- if (sqe->ioprio || sqe->buf_index || sqe->addr)
+ if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
@@ -4192,7 +4485,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->ioprio || sqe->buf_index)
+ if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
@@ -4228,7 +4521,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
- sqe->rw_flags || sqe->buf_index)
+ sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
@@ -4289,7 +4582,8 @@ static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
+ if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
+ sqe->splice_fd_in))
return -EINVAL;
req->sync.off = READ_ONCE(sqe->off);
@@ -4723,6 +5017,15 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
accept->flags = READ_ONCE(sqe->accept_flags);
accept->nofile = rlimit(RLIMIT_NOFILE);
+
+ accept->file_slot = READ_ONCE(sqe->file_index);
+ if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) ||
+ (accept->flags & SOCK_CLOEXEC)))
+ return -EINVAL;
+ if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
+ return -EINVAL;
+ if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
+ accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
return 0;
}
@@ -4731,20 +5034,35 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
struct io_accept *accept = &req->accept;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
- int ret;
+ bool fixed = !!accept->file_slot;
+ struct file *file;
+ int ret, fd;
if (req->file->f_flags & O_NONBLOCK)
req->flags |= REQ_F_NOWAIT;
- ret = __sys_accept4_file(req->file, file_flags, accept->addr,
- accept->addr_len, accept->flags,
- accept->nofile);
- if (ret == -EAGAIN && force_nonblock)
- return -EAGAIN;
- if (ret < 0) {
+ if (!fixed) {
+ fd = __get_unused_fd_flags(accept->flags, accept->nofile);
+ if (unlikely(fd < 0))
+ return fd;
+ }
+ file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
+ accept->flags);
+ if (IS_ERR(file)) {
+ if (!fixed)
+ put_unused_fd(fd);
+ ret = PTR_ERR(file);
+ if (ret == -EAGAIN && force_nonblock)
+ return -EAGAIN;
if (ret == -ERESTARTSYS)
ret = -EINTR;
req_set_fail(req);
+ } else if (!fixed) {
+ fd_install(fd, file);
+ ret = fd;
+ } else {
+ ret = io_install_fixed_file(req, file, issue_flags,
+ accept->file_slot - 1);
}
__io_req_complete(req, issue_flags, ret, 0);
return 0;
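io_accept() now mirrors that scheme: it calls do_accept() itself and either installs the resulting file into an unused fd or, when sqe->file_index is non-zero, into fixed slot file_index - 1 via io_install_fixed_file(); SOCK_CLOEXEC is rejected in the fixed case for the same reason as O_CLOEXEC above. A sketch of the submission side under the same assumptions (invented helper name; <sys/socket.h> is needed in addition to the headers from the first sketch):

/* Hypothetical helper: accept straight into fixed-file slot 'slot'. */
static void fill_accept_direct_sqe(struct io_uring_sqe *sqe, int listen_fd,
				   struct sockaddr *addr, socklen_t *addrlen,
				   unsigned int slot)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_ACCEPT;
	sqe->fd = listen_fd;
	sqe->addr = (unsigned long) addr;	/* accept->addr */
	sqe->addr2 = (unsigned long) addrlen;	/* accept->addr_len */
	sqe->accept_flags = SOCK_NONBLOCK;	/* SOCK_CLOEXEC would be -EINVAL here */
	sqe->file_index = slot + 1;		/* 0 keeps the normal fd-install path */
}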
@@ -4764,7 +5082,8 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
+ if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags ||
+ sqe->splice_fd_in)
return -EINVAL;
conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
@@ -4877,6 +5196,7 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
{
struct io_ring_ctx *ctx = req->ctx;
+ /* req->task == current here, checking PF_EXITING is safe */
if (unlikely(req->task->flags & PF_EXITING))
WRITE_ONCE(poll->canceled, true);
@@ -4886,7 +5206,7 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
req->result = vfs_poll(req->file, &pt) & poll->events;
}
- spin_lock_irq(&ctx->completion_lock);
+ spin_lock(&ctx->completion_lock);
if (!req->result && !READ_ONCE(poll->canceled)) {
add_wait_queue(poll->head, &poll->wait);
return true;
@@ -4920,12 +5240,12 @@ static void io_poll_remove_double(struct io_kiocb *req)
if (poll && poll->head) {
struct wait_queue_head *head = poll->head;
- spin_lock(&head->lock);
+ spin_lock_irq(&head->lock);
list_del_init(&poll->wait.entry);
if (poll->wait.private)
req_ref_put(req);
poll->head = NULL;
- spin_unlock(&head->lock);
+ spin_unlock_irq(&head->lock);
}
}
@@ -4955,13 +5275,13 @@ static bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
return !(flags & IORING_CQE_F_MORE);
}
-static void io_poll_task_func(struct io_kiocb *req)
+static void io_poll_task_func(struct io_kiocb *req, bool *locked)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *nxt;
if (io_poll_rewait(req, &req->poll)) {
- spin_unlock_irq(&ctx->completion_lock);
+ spin_unlock(&ctx->completion_lock);
} else {
bool done;
@@ -4973,13 +5293,13 @@ static void io_poll_task_func(struct io_kiocb *req)
req->result = 0;
add_wait_queue(req->poll.head, &req->poll.wait);
}
- spin_unlock_irq(&ctx->completion_lock);
+ spin_unlock(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
if (done) {
nxt = io_put_req_find_next(req);
if (nxt)
- io_req_task_submit(nxt);
+ io_req_task_submit(nxt, locked);
}
}
}
@@ -4990,6 +5310,7 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
struct io_kiocb *req = wait->private;
struct io_poll_iocb *poll = io_poll_get_single(req);
__poll_t mask = key_to_poll(key);
+ unsigned long flags;
/* for instances that support it check for an event match first: */
if (mask && !(mask & poll->events))
@@ -5002,13 +5323,13 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
if (poll->head) {
bool done;
- spin_lock(&poll->head->lock);
+ spin_lock_irqsave(&poll->head->lock, flags);
done = list_empty(&poll->wait.entry);
if (!done)
list_del_init(&poll->wait.entry);
/* make sure double remove sees this as being gone */
wait->private = NULL;
- spin_unlock(&poll->head->lock);
+ spin_unlock_irqrestore(&poll->head->lock, flags);
if (!done) {
/* use wait func handler, so it matches the rq type */
poll->wait.func(&poll->wait, mode, sync, key);
@@ -5045,8 +5366,13 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
if (unlikely(pt->nr_entries)) {
struct io_poll_iocb *poll_one = poll;
+ /* double add on the same waitqueue head, ignore */
+ if (poll_one->head == head)
+ return;
/* already have a 2nd entry, fail a third attempt */
if (*poll_ptr) {
+ if ((*poll_ptr)->head == head)
+ return;
pt->error = -EINVAL;
return;
}
@@ -5056,9 +5382,6 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
*/
if (!(poll_one->events & EPOLLONESHOT))
poll_one->events |= EPOLLONESHOT;
- /* double add on the same waitqueue head, ignore */
- if (poll_one->head == head)
- return;
poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
if (!poll) {
pt->error = -ENOMEM;
@@ -5088,7 +5411,7 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
}
-static void io_async_task_func(struct io_kiocb *req)
+static void io_async_task_func(struct io_kiocb *req, bool *locked)
{
struct async_poll *apoll = req->apoll;
struct io_ring_ctx *ctx = req->ctx;
@@ -5096,16 +5419,16 @@ static void io_async_task_func(struct io_kiocb *req)
trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data);
if (io_poll_rewait(req, &apoll->poll)) {
- spin_unlock_irq(&ctx->completion_lock);
+ spin_unlock(&ctx->completion_lock);
return;
}
hash_del(&req->hash_node);
io_poll_remove_double(req);
- spin_unlock_irq(&ctx->completion_lock);
+ spin_unlock(&ctx->completion_lock);
if (!READ_ONCE(apoll->poll.canceled))
- io_req_task_submit(req);
+ io_req_task_submit(req, locked);
else
io_req_complete_failed(req, -ECANCELED);
}
@@ -5154,11 +5477,11 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
if (unlikely(!ipt->nr_entries) && !ipt->error)
ipt->error = -EINVAL;
- spin_lock_irq(&ctx->completion_lock);
+ spin_lock(&ctx->completion_lock);
if (ipt->error || (mask && (poll->events & EPOLLONESHOT)))
io_poll_remove_double(req);
if (likely(poll->head)) {
- spin_lock(&poll->head->lock);
+ spin_lock_irq(&poll->head->lock);
if (unlikely(list_empty(&poll->wait.entry))) {
if (ipt->error)
cancel = true;
@@ -5171,7 +5494,7 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
WRITE_ONCE(poll->canceled, true);
else if (!poll->done) /* actually waiting for an event */
io_poll_req_insert(req);
- spin_unlock(&poll->head->lock);
+ spin_unlock_irq(&poll->head->lock);
}
return mask;
@@ -5213,7 +5536,7 @@ static int io_arm_poll_handler(struct io_kiocb *req)
}
/* if we can't nonblock try, then no point in arming a poll handler */
- if (!io_file_supports_async(req, rw))
+ if (!io_file_supports_nowait(req, rw))
return IO_APOLL_ABORTED;
apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
@@ -5223,16 +5546,14 @@ static int io_arm_poll_handler(struct io_kiocb *req)
req->apoll = apoll;
req->flags |= REQ_F_POLLED;
ipt.pt._qproc = io_async_queue_proc;
+ io_req_set_refcount(req);
ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
io_async_wake);
- if (ret || ipt.error) {
- spin_unlock_irq(&ctx->completion_lock);
- if (ret)
- return IO_APOLL_READY;
- return IO_APOLL_ABORTED;
- }
- spin_unlock_irq(&ctx->completion_lock);
+ spin_unlock(&ctx->completion_lock);
+ if (ret || ipt.error)
+ return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
+
trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
mask, apoll->poll.events);
return IO_APOLL_OK;
@@ -5246,19 +5567,19 @@ static bool __io_poll_remove_one(struct io_kiocb *req,
if (!poll->head)
return false;
- spin_lock(&poll->head->lock);
+ spin_lock_irq(&poll->head->lock);
if (do_cancel)
WRITE_ONCE(poll->canceled, true);
if (!list_empty(&poll->wait.entry)) {
list_del_init(&poll->wait.entry);
do_complete = true;
}
- spin_unlock(&poll->head->lock);
+ spin_unlock_irq(&poll->head->lock);
hash_del(&req->hash_node);
return do_complete;
}
-static bool io_poll_remove_waitqs(struct io_kiocb *req)
+static bool io_poll_remove_one(struct io_kiocb *req)
__must_hold(&req->ctx->completion_lock)
{
bool do_complete;
@@ -5266,26 +5587,12 @@ static bool io_poll_remove_waitqs(struct io_kiocb *req)
io_poll_remove_double(req);
do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true);
- if (req->opcode != IORING_OP_POLL_ADD && do_complete) {
- /* non-poll requests have submit ref still */
- req_ref_put(req);
- }
- return do_complete;
-}
-
-static bool io_poll_remove_one(struct io_kiocb *req)
- __must_hold(&req->ctx->completion_lock)
-{
- bool do_complete;
-
- do_complete = io_poll_remove_waitqs(req);
if (do_complete) {
io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0);
io_commit_cqring(req->ctx);
req_set_fail(req);
- io_put_req_deferred(req, 1);
+ io_put_req_deferred(req);
}
-
return do_complete;
}
@@ -5299,7 +5606,7 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
struct io_kiocb *req;
int posted = 0, i;
- spin_lock_irq(&ctx->completion_lock);
+ spin_lock(&ctx->completion_lock);
for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
struct hlist_head *list;
@@ -5309,7 +5616,7 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
posted += io_poll_remove_one(req);
}
}
- spin_unlock_irq(&ctx->completion_lock);
+ spin_unlock(&ctx->completion_lock);
if (posted)
io_cqring_ev_posted(ctx);
@@ -5372,7 +5679,7 @@ static int io_poll_update_prep(struct io_kiocb *req,
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->ioprio || sqe->buf_index)
+ if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
flags = READ_ONCE(sqe->len);
if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
@@ -5427,6 +5734,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
if (flags & ~IORING_POLL_ADD_MULTI)
return -EINVAL;
+ io_req_set_refcount(req);
poll->events = io_poll_parse_events(sqe, flags);
return 0;
}
@@ -5447,7 +5755,7 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
ipt.error = 0;
io_poll_complete(req, mask);
}
- spin_unlock_irq(&ctx->completion_lock);
+ spin_unlock(&ctx->completion_lock);
if (mask) {
io_cqring_ev_posted(ctx);
@@ -5464,7 +5772,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
bool completing;
int ret;
- spin_lock_irq(&ctx->completion_lock);
+ spin_lock(&ctx->completion_lock);
preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
if (!preq) {
ret = -ENOENT;
@@ -5491,7 +5799,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
ret = 0;
err:
if (ret < 0) {
- spin_unlock_irq(&ctx->completion_lock);
+ spin_unlock(&ctx->completion_lock);
req_set_fail(req);
io_req_complete(req, ret);
return 0;
@@ -5504,7 +5812,7 @@ err:
}
if (req->poll_update.update_user_data)
preq->user_data = req->poll_update.new_user_data;
- spin_unlock_irq(&ctx->completion_lock);
+ spin_unlock(&ctx->completion_lock);
/* complete update request, we're done with it */
io_req_complete(req, ret);
@@ -5519,6 +5827,12 @@ err:
return 0;
}
+static void io_req_task_timeout(struct io_kiocb *req, bool *locked)
+{
+ req_set_fail(req);
+ io_req_complete_post(req, -ETIME, 0);
+}
+
static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
{
struct io_timeout_data *data = container_of(timer,
@@ -5527,24 +5841,20 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
- spin_lock_irqsave(&ctx->completion_lock, flags);
+ spin_lock_irqsave(&ctx->timeout_lock, flags);
list_del_init(&req->timeout.list);
atomic_set(&req->ctx->cq_timeouts,
atomic_read(&req->ctx->cq_timeouts) + 1);
+ spin_unlock_irqrestore(&ctx->timeout_lock, flags);
- io_cqring_fill_event(ctx, req->user_data, -ETIME, 0);
- io_commit_cqring(ctx);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
-
- io_cqring_ev_posted(ctx);
- req_set_fail(req);
- io_put_req(req);
+ req->io_task_work.func = io_req_task_timeout;
+ io_req_task_work_add(req);
return HRTIMER_NORESTART;
}
static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
__u64 user_data)
- __must_hold(&ctx->completion_lock)
+ __must_hold(&ctx->timeout_lock)
{
struct io_timeout_data *io;
struct io_kiocb *req;
@@ -5567,6 +5877,7 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
__must_hold(&ctx->completion_lock)
+ __must_hold(&ctx->timeout_lock)
{
struct io_kiocb *req = io_timeout_extract(ctx, user_data);
@@ -5575,13 +5886,54 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
req_set_fail(req);
io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0);
- io_put_req_deferred(req, 1);
+ io_put_req_deferred(req);
+ return 0;
+}
+
+static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
+{
+ switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
+ case IORING_TIMEOUT_BOOTTIME:
+ return CLOCK_BOOTTIME;
+ case IORING_TIMEOUT_REALTIME:
+ return CLOCK_REALTIME;
+ default:
+ /* can't happen, vetted at prep time */
+ WARN_ON_ONCE(1);
+ fallthrough;
+ case 0:
+ return CLOCK_MONOTONIC;
+ }
+}
+
+static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
+ struct timespec64 *ts, enum hrtimer_mode mode)
+ __must_hold(&ctx->timeout_lock)
+{
+ struct io_timeout_data *io;
+ struct io_kiocb *req;
+ bool found = false;
+
+ list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) {
+ found = user_data == req->user_data;
+ if (found)
+ break;
+ }
+ if (!found)
+ return -ENOENT;
+
+ io = req->async_data;
+ if (hrtimer_try_to_cancel(&io->timer) == -1)
+ return -EALREADY;
+ hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
+ io->timer.function = io_link_timeout_fn;
+ hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
return 0;
}
static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
struct timespec64 *ts, enum hrtimer_mode mode)
- __must_hold(&ctx->completion_lock)
+ __must_hold(&ctx->timeout_lock)
{
struct io_kiocb *req = io_timeout_extract(ctx, user_data);
struct io_timeout_data *data;
@@ -5592,7 +5944,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
req->timeout.off = 0; /* noseq */
data = req->async_data;
list_add_tail(&req->timeout.list, &ctx->timeout_list);
- hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode);
+ hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
data->timer.function = io_timeout_fn;
hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
return 0;
@@ -5607,13 +5959,18 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
return -EINVAL;
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->len)
+ if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in)
return -EINVAL;
+ tr->ltimeout = false;
tr->addr = READ_ONCE(sqe->addr);
tr->flags = READ_ONCE(sqe->timeout_flags);
- if (tr->flags & IORING_TIMEOUT_UPDATE) {
- if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS))
+ if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) {
+ if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
+ return -EINVAL;
+ if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
+ tr->ltimeout = true;
+ if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
return -EINVAL;
if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
return -EFAULT;
@@ -5640,20 +5997,26 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
struct io_ring_ctx *ctx = req->ctx;
int ret;
- spin_lock_irq(&ctx->completion_lock);
- if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE))
+ if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) {
+ spin_lock(&ctx->completion_lock);
+ spin_lock_irq(&ctx->timeout_lock);
ret = io_timeout_cancel(ctx, tr->addr);
- else
- ret = io_timeout_update(ctx, tr->addr, &tr->ts,
- io_translate_timeout_mode(tr->flags));
+ spin_unlock_irq(&ctx->timeout_lock);
+ spin_unlock(&ctx->completion_lock);
+ } else {
+ enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);
+
+ spin_lock_irq(&ctx->timeout_lock);
+ if (tr->ltimeout)
+ ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
+ else
+ ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
+ spin_unlock_irq(&ctx->timeout_lock);
+ }
- io_cqring_fill_event(ctx, req->user_data, ret, 0);
- io_commit_cqring(ctx);
- spin_unlock_irq(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
if (ret < 0)
req_set_fail(req);
- io_put_req(req);
+ io_req_complete_post(req, ret, 0);
return 0;
}
@@ -5666,14 +6029,19 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
+ if (sqe->ioprio || sqe->buf_index || sqe->len != 1 ||
+ sqe->splice_fd_in)
return -EINVAL;
if (off && is_timeout_link)
return -EINVAL;
flags = READ_ONCE(sqe->timeout_flags);
- if (flags & ~IORING_TIMEOUT_ABS)
+ if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK))
+ return -EINVAL;
+ /* more than one clock specified is invalid, obviously */
+ if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
return -EINVAL;
+ INIT_LIST_HEAD(&req->timeout.list);
req->timeout.off = off;
if (unlikely(off && !req->ctx->off_timeout_used))
req->ctx->off_timeout_used = true;
@@ -5683,14 +6051,24 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
data = req->async_data;
data->req = req;
+ data->flags = flags;
if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
return -EFAULT;
data->mode = io_translate_timeout_mode(flags);
- hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
- if (is_timeout_link)
- io_req_track_inflight(req);
+ hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
+
+ if (is_timeout_link) {
+ struct io_submit_link *link = &req->ctx->submit_state.link;
+
+ if (!link->head)
+ return -EINVAL;
+ if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
+ return -EINVAL;
+ req->timeout.head = link->last;
+ link->last->flags |= REQ_F_ARM_LTIMEOUT;
+ }
return 0;
}
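Timeouts are no longer tied to CLOCK_MONOTONIC: at most one of IORING_TIMEOUT_BOOTTIME or IORING_TIMEOUT_REALTIME may be set in timeout_flags, io_timeout_get_clock() maps it to CLOCK_BOOTTIME / CLOCK_REALTIME, and IORING_LINK_TIMEOUT_UPDATE lets a timeout-remove request retarget a linked timeout instead of a normal one. A minimal sketch of arming a boottime-based timeout, with the helper name invented for illustration:

/* Hypothetical helper: one-shot timeout measured on CLOCK_BOOTTIME. */
static void fill_boottime_timeout_sqe(struct io_uring_sqe *sqe,
				      const struct __kernel_timespec *ts)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_TIMEOUT;
	sqe->addr = (unsigned long) ts;		/* pointer to the timespec */
	sqe->len = 1;				/* prep insists on len == 1 */
	sqe->off = 0;				/* no completion-count target */
	sqe->timeout_flags = IORING_TIMEOUT_BOOTTIME;	/* only one clock flag allowed */
}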
@@ -5701,7 +6079,7 @@ static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
struct list_head *entry;
u32 tail, off = req->timeout.off;
- spin_lock_irq(&ctx->completion_lock);
+ spin_lock_irq(&ctx->timeout_lock);
/*
* sqe->off holds how many events that need to occur for this
@@ -5740,7 +6118,7 @@ add:
list_add(&req->timeout.list, entry);
data->timer.function = io_timeout_fn;
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
- spin_unlock_irq(&ctx->completion_lock);
+ spin_unlock_irq(&ctx->timeout_lock);
return 0;
}
@@ -5783,31 +6161,27 @@ static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
return ret;
}
-static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
- struct io_kiocb *req, __u64 sqe_addr,
- int success_ret)
+static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
{
- unsigned long flags;
+ struct io_ring_ctx *ctx = req->ctx;
int ret;
+ WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
+
ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
- spin_lock_irqsave(&ctx->completion_lock, flags);
if (ret != -ENOENT)
- goto done;
+ return ret;
+
+ spin_lock(&ctx->completion_lock);
+ spin_lock_irq(&ctx->timeout_lock);
ret = io_timeout_cancel(ctx, sqe_addr);
+ spin_unlock_irq(&ctx->timeout_lock);
if (ret != -ENOENT)
- goto done;
+ goto out;
ret = io_poll_cancel(ctx, sqe_addr, false);
-done:
- if (!ret)
- ret = success_ret;
- io_cqring_fill_event(ctx, req->user_data, ret, 0);
- io_commit_cqring(ctx);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
- io_cqring_ev_posted(ctx);
-
- if (ret < 0)
- req_set_fail(req);
+out:
+ spin_unlock(&ctx->completion_lock);
+ return ret;
}
static int io_async_cancel_prep(struct io_kiocb *req,
@@ -5817,7 +6191,8 @@ static int io_async_cancel_prep(struct io_kiocb *req,
return -EINVAL;
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags)
+ if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags ||
+ sqe->splice_fd_in)
return -EINVAL;
req->cancel.addr = READ_ONCE(sqe->addr);
@@ -5831,18 +6206,9 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
struct io_tctx_node *node;
int ret;
- /* tasks should wait for their io-wq threads, so safe w/o sync */
- ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
- spin_lock_irq(&ctx->completion_lock);
- if (ret != -ENOENT)
- goto done;
- ret = io_timeout_cancel(ctx, sqe_addr);
- if (ret != -ENOENT)
- goto done;
- ret = io_poll_cancel(ctx, sqe_addr, false);
+ ret = io_try_cancel_userdata(req, sqe_addr);
if (ret != -ENOENT)
goto done;
- spin_unlock_irq(&ctx->completion_lock);
/* slow path, try all io-wq's */
io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
@@ -5855,17 +6221,10 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
break;
}
io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
-
- spin_lock_irq(&ctx->completion_lock);
done:
- io_cqring_fill_event(ctx, req->user_data, ret, 0);
- io_commit_cqring(ctx);
- spin_unlock_irq(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
-
if (ret < 0)
req_set_fail(req);
- io_put_req(req);
+ io_req_complete_post(req, ret, 0);
return 0;
}
@@ -5874,7 +6233,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req,
{
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
return -EINVAL;
- if (sqe->ioprio || sqe->rw_flags)
+ if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
req->rsrc_update.offset = READ_ONCE(sqe->off);
@@ -5982,6 +6341,12 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return io_renameat_prep(req, sqe);
case IORING_OP_UNLINKAT:
return io_unlinkat_prep(req, sqe);
+ case IORING_OP_MKDIRAT:
+ return io_mkdirat_prep(req, sqe);
+ case IORING_OP_SYMLINKAT:
+ return io_symlinkat_prep(req, sqe);
+ case IORING_OP_LINKAT:
+ return io_linkat_prep(req, sqe);
}
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
@@ -6076,11 +6441,11 @@ fail:
return true;
}
- spin_lock_irq(&ctx->completion_lock);
+ spin_lock(&ctx->completion_lock);
if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
- spin_unlock_irq(&ctx->completion_lock);
+ spin_unlock(&ctx->completion_lock);
kfree(de);
- io_queue_async_work(req);
+ io_queue_async_work(req, NULL);
return true;
}
@@ -6088,7 +6453,7 @@ fail:
de->req = req;
de->seq = seq;
list_add_tail(&de->list, &ctx->defer_list);
- spin_unlock_irq(&ctx->completion_lock);
+ spin_unlock(&ctx->completion_lock);
return true;
}
@@ -6145,6 +6510,17 @@ static void io_clean_op(struct io_kiocb *req)
case IORING_OP_UNLINKAT:
putname(req->unlink.filename);
break;
+ case IORING_OP_MKDIRAT:
+ putname(req->mkdir.filename);
+ break;
+ case IORING_OP_SYMLINKAT:
+ putname(req->symlink.oldpath);
+ putname(req->symlink.newpath);
+ break;
+ case IORING_OP_LINKAT:
+ putname(req->hardlink.oldpath);
+ putname(req->hardlink.newpath);
+ break;
}
}
if ((req->flags & REQ_F_POLLED) && req->apoll) {
@@ -6273,6 +6649,15 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
case IORING_OP_UNLINKAT:
ret = io_unlinkat(req, issue_flags);
break;
+ case IORING_OP_MKDIRAT:
+ ret = io_mkdirat(req, issue_flags);
+ break;
+ case IORING_OP_SYMLINKAT:
+ ret = io_symlinkat(req, issue_flags);
+ break;
+ case IORING_OP_LINKAT:
+ ret = io_linkat(req, issue_flags);
+ break;
default:
ret = -EINVAL;
break;
@@ -6289,16 +6674,31 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
return 0;
}
+static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
+{
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+
+ req = io_put_req_find_next(req);
+ return req ? &req->work : NULL;
+}
+
static void io_wq_submit_work(struct io_wq_work *work)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
struct io_kiocb *timeout;
int ret = 0;
+ /* one will be dropped by ->io_free_work() after returning to io-wq */
+ if (!(req->flags & REQ_F_REFCOUNT))
+ __io_req_set_refcount(req, 2);
+ else
+ req_ref_get(req);
+
timeout = io_prep_linked_timeout(req);
if (timeout)
io_queue_linked_timeout(timeout);
+ /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
if (work->flags & IO_WQ_WORK_CANCEL)
ret = -ECANCELED;
@@ -6317,29 +6717,14 @@ static void io_wq_submit_work(struct io_wq_work *work)
}
/* avoid locking problems by failing it from a clean context */
- if (ret) {
- /* io-wq is going to take one down */
- req_ref_get(req);
+ if (ret)
io_req_task_queue_fail(req, ret);
- }
}
-#define FFS_ASYNC_READ 0x1UL
-#define FFS_ASYNC_WRITE 0x2UL
-#ifdef CONFIG_64BIT
-#define FFS_ISREG 0x4UL
-#else
-#define FFS_ISREG 0x0UL
-#endif
-#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)
-
static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
- unsigned i)
+ unsigned i)
{
- struct io_fixed_file *table_l2;
-
- table_l2 = table->files[i >> IORING_FILE_TABLE_SHIFT];
- return &table_l2[i & IORING_FILE_TABLE_MASK];
+ return &table->files[i];
}
static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
@@ -6354,45 +6739,69 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file
{
unsigned long file_ptr = (unsigned long) file;
- if (__io_file_supports_async(file, READ))
+ if (__io_file_supports_nowait(file, READ))
file_ptr |= FFS_ASYNC_READ;
- if (__io_file_supports_async(file, WRITE))
+ if (__io_file_supports_nowait(file, WRITE))
file_ptr |= FFS_ASYNC_WRITE;
if (S_ISREG(file_inode(file)->i_mode))
file_ptr |= FFS_ISREG;
file_slot->file_ptr = file_ptr;
}
-static struct file *io_file_get(struct io_submit_state *state,
- struct io_kiocb *req, int fd, bool fixed)
+static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
+ struct io_kiocb *req, int fd)
{
- struct io_ring_ctx *ctx = req->ctx;
struct file *file;
+ unsigned long file_ptr;
- if (fixed) {
- unsigned long file_ptr;
+ if (unlikely((unsigned int)fd >= ctx->nr_user_files))
+ return NULL;
+ fd = array_index_nospec(fd, ctx->nr_user_files);
+ file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
+ file = (struct file *) (file_ptr & FFS_MASK);
+ file_ptr &= ~FFS_MASK;
+ /* mask in overlapping REQ_F and FFS bits */
+ req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT);
+ io_req_set_rsrc_node(req);
+ return file;
+}
- if (unlikely((unsigned int)fd >= ctx->nr_user_files))
- return NULL;
- fd = array_index_nospec(fd, ctx->nr_user_files);
- file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
- file = (struct file *) (file_ptr & FFS_MASK);
- file_ptr &= ~FFS_MASK;
- /* mask in overlapping REQ_F and FFS bits */
- req->flags |= (file_ptr << REQ_F_ASYNC_READ_BIT);
- io_req_set_rsrc_node(req);
- } else {
- trace_io_uring_file_get(ctx, fd);
- file = __io_file_get(state, fd);
+static struct file *io_file_get_normal(struct io_ring_ctx *ctx,
+ struct io_kiocb *req, int fd)
+{
+ struct file *file = fget(fd);
- /* we don't allow fixed io_uring files */
- if (file && unlikely(file->f_op == &io_uring_fops))
- io_req_track_inflight(req);
- }
+ trace_io_uring_file_get(ctx, fd);
+ /* we don't allow fixed io_uring files */
+ if (file && unlikely(file->f_op == &io_uring_fops))
+ io_req_track_inflight(req);
return file;
}
+static inline struct file *io_file_get(struct io_ring_ctx *ctx,
+ struct io_kiocb *req, int fd, bool fixed)
+{
+ if (fixed)
+ return io_file_get_fixed(ctx, req, fd);
+ else
+ return io_file_get_normal(ctx, req, fd);
+}
+
+static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
+{
+ struct io_kiocb *prev = req->timeout.prev;
+ int ret;
+
+ if (prev) {
+ ret = io_try_cancel_userdata(req, prev->user_data);
+ io_req_complete_post(req, ret ?: -ETIME, 0);
+ io_put_req(prev);
+ } else {
+ io_req_complete_post(req, -ETIME, 0);
+ }
+}
+
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
{
struct io_timeout_data *data = container_of(timer,
@@ -6401,7 +6810,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
- spin_lock_irqsave(&ctx->completion_lock, flags);
+ spin_lock_irqsave(&ctx->timeout_lock, flags);
prev = req->timeout.head;
req->timeout.head = NULL;
@@ -6414,15 +6823,12 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
if (!req_ref_inc_not_zero(prev))
prev = NULL;
}
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ list_del(&req->timeout.list);
+ req->timeout.prev = prev;
+ spin_unlock_irqrestore(&ctx->timeout_lock, flags);
- if (prev) {
- io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
- io_put_req_deferred(prev, 1);
- io_put_req_deferred(req, 1);
- } else {
- io_req_complete_post(req, -ETIME, 0);
- }
+ req->io_task_work.func = io_req_task_link_timeout;
+ io_req_task_work_add(req);
return HRTIMER_NORESTART;
}
@@ -6430,7 +6836,7 @@ static void io_queue_linked_timeout(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- spin_lock_irq(&ctx->completion_lock);
+ spin_lock_irq(&ctx->timeout_lock);
/*
* If the back reference is NULL, then our linked request finished
* before we got a chance to setup the timer
@@ -6441,29 +6847,17 @@ static void io_queue_linked_timeout(struct io_kiocb *req)
data->timer.function = io_link_timeout_fn;
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
data->mode);
+ list_add_tail(&req->timeout.list, &ctx->ltimeout_list);
}
- spin_unlock_irq(&ctx->completion_lock);
+ spin_unlock_irq(&ctx->timeout_lock);
/* drop submission reference */
io_put_req(req);
}
-static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
-{
- struct io_kiocb *nxt = req->link;
-
- if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) ||
- nxt->opcode != IORING_OP_LINK_TIMEOUT)
- return NULL;
-
- nxt->timeout.head = req;
- nxt->flags |= REQ_F_LTIMEOUT_ACTIVE;
- req->flags |= REQ_F_LINK_TIMEOUT;
- return nxt;
-}
-
static void __io_queue_sqe(struct io_kiocb *req)
+ __must_hold(&req->ctx->uring_lock)
{
- struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
+ struct io_kiocb *linked_timeout;
int ret;
issue_sqe:
@@ -6474,50 +6868,60 @@ issue_sqe:
* doesn't support non-blocking read/write attempts
*/
if (likely(!ret)) {
- /* drop submission reference */
if (req->flags & REQ_F_COMPLETE_INLINE) {
struct io_ring_ctx *ctx = req->ctx;
- struct io_comp_state *cs = &ctx->submit_state.comp;
+ struct io_submit_state *state = &ctx->submit_state;
- cs->reqs[cs->nr++] = req;
- if (cs->nr == ARRAY_SIZE(cs->reqs))
+ state->compl_reqs[state->compl_nr++] = req;
+ if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
io_submit_flush_completions(ctx);
- } else {
- io_put_req(req);
+ return;
}
+
+ linked_timeout = io_prep_linked_timeout(req);
+ if (linked_timeout)
+ io_queue_linked_timeout(linked_timeout);
} else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
+ linked_timeout = io_prep_linked_timeout(req);
+
switch (io_arm_poll_handler(req)) {
case IO_APOLL_READY:
+ if (linked_timeout)
+ io_unprep_linked_timeout(req);
goto issue_sqe;
case IO_APOLL_ABORTED:
/*
* Queued up for async execution, worker will release
* submit reference when the iocb is actually submitted.
*/
- io_queue_async_work(req);
+ io_queue_async_work(req, NULL);
break;
}
+
+ if (linked_timeout)
+ io_queue_linked_timeout(linked_timeout);
} else {
io_req_complete_failed(req, ret);
}
- if (linked_timeout)
- io_queue_linked_timeout(linked_timeout);
}
static inline void io_queue_sqe(struct io_kiocb *req)
+ __must_hold(&req->ctx->uring_lock)
{
if (unlikely(req->ctx->drain_active) && io_drain_req(req))
return;
- if (likely(!(req->flags & REQ_F_FORCE_ASYNC))) {
+ if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) {
__io_queue_sqe(req);
+ } else if (req->flags & REQ_F_FAIL) {
+ io_req_complete_failed(req, req->result);
} else {
int ret = io_req_prep_async(req);
if (unlikely(ret))
io_req_complete_failed(req, ret);
else
- io_queue_async_work(req);
+ io_queue_async_work(req, NULL);
}
}
@@ -6549,19 +6953,19 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe)
+ __must_hold(&ctx->uring_lock)
{
struct io_submit_state *state;
unsigned int sqe_flags;
int personality, ret = 0;
+ /* req is partially pre-initialised, see io_preinit_req() */
req->opcode = READ_ONCE(sqe->opcode);
/* same numerical values with corresponding REQ_F_*, safe to copy */
req->flags = sqe_flags = READ_ONCE(sqe->flags);
req->user_data = READ_ONCE(sqe->user_data);
req->file = NULL;
req->fixed_rsrc_refs = NULL;
- /* one is dropped after submission, the other at completion */
- atomic_set(&req->refs, 2);
req->task = current;
/* enforce forwards compatibility on users */
@@ -6599,9 +7003,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
}
if (io_op_defs[req->opcode].needs_file) {
- bool fixed = req->flags & REQ_F_FIXED_FILE;
-
- req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
+ req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
+ (sqe_flags & IOSQE_FIXED_FILE));
if (unlikely(!req->file))
ret = -EBADF;
}
@@ -6612,6 +7015,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe)
+ __must_hold(&ctx->uring_lock)
{
struct io_submit_link *link = &ctx->submit_state.link;
int ret;
@@ -6619,20 +7023,34 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
ret = io_init_req(ctx, req, sqe);
if (unlikely(ret)) {
fail_req:
+ /* fail even hard links since we don't submit */
if (link->head) {
- /* fail even hard links since we don't submit */
- req_set_fail(link->head);
- io_req_complete_failed(link->head, -ECANCELED);
- link->head = NULL;
+ /*
+			 * We can tell whether a link req failed or was cancelled
+			 * by checking REQ_F_FAIL, but the head is an exception:
+			 * it may have REQ_F_FAIL set because another req in the
+			 * chain failed. Use req->result to distinguish a head
+			 * that failed itself from one marked failed because of
+			 * another req, so the correct ret code is set for it.
+			 * Init result here to avoid affecting the normal path.
+ */
+ if (!(link->head->flags & REQ_F_FAIL))
+ req_fail_link_node(link->head, -ECANCELED);
+ } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
+ /*
+			 * the current req is a normal req, so return the
+			 * error and break the submission loop.
+ */
+ io_req_complete_failed(req, ret);
+ return ret;
}
- io_req_complete_failed(req, ret);
- return ret;
+ req_fail_link_node(req, ret);
+ } else {
+ ret = io_req_prep(req, sqe);
+ if (unlikely(ret))
+ goto fail_req;
}
- ret = io_req_prep(req, sqe);
- if (unlikely(ret))
- goto fail_req;
-
/* don't need @sqe from now on */
trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data,
req->flags, true,
@@ -6648,9 +7066,14 @@ fail_req:
if (link->head) {
struct io_kiocb *head = link->head;
- ret = io_req_prep_async(req);
- if (unlikely(ret))
- goto fail_req;
+ if (!(req->flags & REQ_F_FAIL)) {
+ ret = io_req_prep_async(req);
+ if (unlikely(ret)) {
+ req_fail_link_node(req, ret);
+ if (!(head->flags & REQ_F_FAIL))
+ req_fail_link_node(head, -ECANCELED);
+ }
+ }
trace_io_uring_link(ctx, req, head);
link->last->link = req;
link->last = req;
@@ -6680,11 +7103,10 @@ static void io_submit_state_end(struct io_submit_state *state,
{
if (state->link.head)
io_queue_sqe(state->link.head);
- if (state->comp.nr)
+ if (state->compl_nr)
io_submit_flush_completions(ctx);
if (state->plug_started)
blk_finish_plug(&state->plug);
- io_state_file_put(state);
}
/*
@@ -6744,26 +7166,17 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
}
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
+ __must_hold(&ctx->uring_lock)
{
- struct io_uring_task *tctx;
int submitted = 0;
/* make sure SQ entry isn't read before tail */
nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
if (!percpu_ref_tryget_many(&ctx->refs, nr))
return -EAGAIN;
+ io_get_task_refs(nr);
- tctx = current->io_uring;
- tctx->cached_refs -= nr;
- if (unlikely(tctx->cached_refs < 0)) {
- unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
-
- percpu_counter_add(&tctx->inflight, refill);
- refcount_add(refill, &current->usage);
- tctx->cached_refs += refill;
- }
io_submit_state_start(&ctx->submit_state, nr);
-
while (submitted < nr) {
const struct io_uring_sqe *sqe;
struct io_kiocb *req;
@@ -6776,7 +7189,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
}
sqe = io_get_sqe(ctx);
if (unlikely(!sqe)) {
- kmem_cache_free(req_cachep, req);
+ list_add(&req->inflight_entry, &ctx->submit_state.free_list);
break;
}
/* will complete beyond this point, count as submitted */
@@ -6808,18 +7221,18 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
{
/* Tell userspace we may need a wakeup call */
- spin_lock_irq(&ctx->completion_lock);
+ spin_lock(&ctx->completion_lock);
WRITE_ONCE(ctx->rings->sq_flags,
ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
- spin_unlock_irq(&ctx->completion_lock);
+ spin_unlock(&ctx->completion_lock);
}
static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
{
- spin_lock_irq(&ctx->completion_lock);
+ spin_lock(&ctx->completion_lock);
WRITE_ONCE(ctx->rings->sq_flags,
ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
- spin_unlock_irq(&ctx->completion_lock);
+ spin_unlock(&ctx->completion_lock);
}
static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
@@ -6841,7 +7254,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
mutex_lock(&ctx->uring_lock);
if (!list_empty(&ctx->iopoll_list))
- io_do_iopoll(ctx, &nr_events, 0, true);
+ io_do_iopoll(ctx, &nr_events, 0);
/*
* Don't submit if refs are dying, good for io_uring_register(),
@@ -6976,21 +7389,21 @@ static int io_sq_thread(void *data)
struct io_wait_queue {
struct wait_queue_entry wq;
struct io_ring_ctx *ctx;
- unsigned to_wait;
+ unsigned cq_tail;
unsigned nr_timeouts;
};
static inline bool io_should_wake(struct io_wait_queue *iowq)
{
struct io_ring_ctx *ctx = iowq->ctx;
+ int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;
/*
* Wake up if we have enough events, or if a timeout occurred since we
* started waiting. For timeouts, we always want to return to userspace,
* regardless of event count.
*/
- return io_cqring_events(ctx) >= iowq->to_wait ||
- atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
+ return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
}
static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
@@ -7046,21 +7459,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
const sigset_t __user *sig, size_t sigsz,
struct __kernel_timespec __user *uts)
{
- struct io_wait_queue iowq = {
- .wq = {
- .private = current,
- .func = io_wake_function,
- .entry = LIST_HEAD_INIT(iowq.wq.entry),
- },
- .ctx = ctx,
- .to_wait = min_events,
- };
+ struct io_wait_queue iowq;
struct io_rings *rings = ctx->rings;
signed long timeout = MAX_SCHEDULE_TIMEOUT;
int ret;
do {
- io_cqring_overflow_flush(ctx, false);
+ io_cqring_overflow_flush(ctx);
if (io_cqring_events(ctx) >= min_events)
return 0;
if (!io_run_task_work())
@@ -7088,11 +7493,17 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
timeout = timespec64_to_jiffies(&ts);
}
+ init_waitqueue_func_entry(&iowq.wq, io_wake_function);
+ iowq.wq.private = current;
+ INIT_LIST_HEAD(&iowq.wq.entry);
+ iowq.ctx = ctx;
iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
+ iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
+
trace_io_uring_cqring_wait(ctx, min_events);
do {
/* if we can't even flush overflow, don't wait for more */
- if (!io_cqring_overflow_flush(ctx, false)) {
+ if (!io_cqring_overflow_flush(ctx)) {
ret = -EBUSY;
break;
}
@@ -7123,14 +7534,14 @@ static void **io_alloc_page_table(size_t size)
size_t init_size = size;
void **table;
- table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL);
+ table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
if (!table)
return NULL;
for (i = 0; i < nr_tables; i++) {
unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
- table[i] = kzalloc(this_size, GFP_KERNEL);
+ table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
if (!table[i]) {
io_free_page_table(table, init_size);
return NULL;
@@ -7146,6 +7557,50 @@ static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
kfree(ref_node);
}
+static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
+{
+ struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
+ struct io_ring_ctx *ctx = node->rsrc_data->ctx;
+ unsigned long flags;
+ bool first_add = false;
+
+ spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
+ node->done = true;
+
+ while (!list_empty(&ctx->rsrc_ref_list)) {
+ node = list_first_entry(&ctx->rsrc_ref_list,
+ struct io_rsrc_node, node);
+ /* recycle ref nodes in order */
+ if (!node->done)
+ break;
+ list_del(&node->node);
+ first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
+ }
+ spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
+
+ if (first_add)
+ mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ);
+}
+
+static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
+{
+ struct io_rsrc_node *ref_node;
+
+ ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
+ if (!ref_node)
+ return NULL;
+
+ if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
+ 0, GFP_KERNEL)) {
+ kfree(ref_node);
+ return NULL;
+ }
+ INIT_LIST_HEAD(&ref_node->node);
+ INIT_LIST_HEAD(&ref_node->rsrc_list);
+ ref_node->done = false;
+ return ref_node;
+}
+
static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
struct io_rsrc_data *data_to_kill)
{
@@ -7277,17 +7732,14 @@ fail:
static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
{
- size_t size = nr_files * sizeof(struct io_fixed_file);
-
- table->files = (struct io_fixed_file **)io_alloc_page_table(size);
+ table->files = kvcalloc(nr_files, sizeof(table->files[0]),
+ GFP_KERNEL_ACCOUNT);
return !!table->files;
}
-static void io_free_file_tables(struct io_file_table *table, unsigned nr_files)
+static void io_free_file_tables(struct io_file_table *table)
{
- size_t size = nr_files * sizeof(struct io_fixed_file);
-
- io_free_page_table((void **)table->files, size);
+ kvfree(table->files);
table->files = NULL;
}
@@ -7312,7 +7764,7 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
fput(file);
}
#endif
- io_free_file_tables(&ctx->file_table, ctx->nr_user_files);
+ io_free_file_tables(&ctx->file_table);
io_rsrc_data_free(ctx->file_data);
ctx->file_data = NULL;
ctx->nr_user_files = 0;
@@ -7628,11 +8080,11 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;
io_ring_submit_lock(ctx, lock_ring);
- spin_lock_irq(&ctx->completion_lock);
+ spin_lock(&ctx->completion_lock);
io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
ctx->cq_extra++;
io_commit_cqring(ctx);
- spin_unlock_irq(&ctx->completion_lock);
+ spin_unlock(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
io_ring_submit_unlock(ctx, lock_ring);
}
@@ -7664,50 +8116,6 @@ static void io_rsrc_put_work(struct work_struct *work)
}
}
-static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
-{
- struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
- struct io_ring_ctx *ctx = node->rsrc_data->ctx;
- unsigned long flags;
- bool first_add = false;
-
- spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
- node->done = true;
-
- while (!list_empty(&ctx->rsrc_ref_list)) {
- node = list_first_entry(&ctx->rsrc_ref_list,
- struct io_rsrc_node, node);
- /* recycle ref nodes in order */
- if (!node->done)
- break;
- list_del(&node->node);
- first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
- }
- spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
-
- if (first_add)
- mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ);
-}
-
-static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
-{
- struct io_rsrc_node *ref_node;
-
- ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
- if (!ref_node)
- return NULL;
-
- if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
- 0, GFP_KERNEL)) {
- kfree(ref_node);
- return NULL;
- }
- INIT_LIST_HEAD(&ref_node->node);
- INIT_LIST_HEAD(&ref_node->rsrc_list);
- ref_node->done = false;
- return ref_node;
-}
-
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args, u64 __user *tags)
{
@@ -7722,6 +8130,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return -EINVAL;
if (nr_args > IORING_MAX_FIXED_FILES)
return -EMFILE;
+ if (nr_args > rlimit(RLIMIT_NOFILE))
+ return -EMFILE;
ret = io_rsrc_node_switch_start(ctx);
if (ret)
return ret;
@@ -7780,7 +8190,7 @@ out_fput:
if (file)
fput(file);
}
- io_free_file_tables(&ctx->file_table, nr_args);
+ io_free_file_tables(&ctx->file_table);
ctx->nr_user_files = 0;
out_free:
io_rsrc_data_free(ctx->file_data);
@@ -7831,6 +8241,46 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
#endif
}
+static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
+ unsigned int issue_flags, u32 slot_index)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ struct io_fixed_file *file_slot;
+ int ret = -EBADF;
+
+ io_ring_submit_lock(ctx, !force_nonblock);
+ if (file->f_op == &io_uring_fops)
+ goto err;
+ ret = -ENXIO;
+ if (!ctx->file_data)
+ goto err;
+ ret = -EINVAL;
+ if (slot_index >= ctx->nr_user_files)
+ goto err;
+
+ slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
+ file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
+ ret = -EBADF;
+ if (file_slot->file_ptr)
+ goto err;
+
+ *io_get_tag_slot(ctx->file_data, slot_index) = 0;
+ io_fixed_file_set(file_slot, file);
+ ret = io_sqe_file_register(ctx, file, slot_index);
+ if (ret) {
+ file_slot->file_ptr = 0;
+ goto err;
+ }
+
+ ret = 0;
+err:
+ io_ring_submit_unlock(ctx, !force_nonblock);
+ if (ret)
+ fput(file);
+ return ret;
+}
+
static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
struct io_rsrc_node *node, void *rsrc)
{
@@ -7926,14 +8376,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
return done ? done : err;
}
-static struct io_wq_work *io_free_work(struct io_wq_work *work)
-{
- struct io_kiocb *req = container_of(work, struct io_kiocb, work);
-
- req = io_put_req_find_next(req);
- return req ? &req->work : NULL;
-}
-
static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
struct task_struct *task)
{
@@ -7957,7 +8399,7 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
data.hash = hash;
data.task = task;
- data.free_work = io_free_work;
+ data.free_work = io_wq_free_work;
data.do_work = io_wq_submit_work;
/* Do QD, or 4 * CPUS, whatever is smallest */
@@ -8624,33 +9066,29 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)
__io_remove_buffers(ctx, buf, index, -1U);
}
-static void io_req_cache_free(struct list_head *list, struct task_struct *tsk)
+static void io_req_cache_free(struct list_head *list)
{
struct io_kiocb *req, *nxt;
- list_for_each_entry_safe(req, nxt, list, compl.list) {
- if (tsk && req->task != tsk)
- continue;
- list_del(&req->compl.list);
+ list_for_each_entry_safe(req, nxt, list, inflight_entry) {
+ list_del(&req->inflight_entry);
kmem_cache_free(req_cachep, req);
}
}
static void io_req_caches_free(struct io_ring_ctx *ctx)
{
- struct io_submit_state *submit_state = &ctx->submit_state;
- struct io_comp_state *cs = &ctx->submit_state.comp;
+ struct io_submit_state *state = &ctx->submit_state;
mutex_lock(&ctx->uring_lock);
- if (submit_state->free_reqs) {
- kmem_cache_free_bulk(req_cachep, submit_state->free_reqs,
- submit_state->reqs);
- submit_state->free_reqs = 0;
+ if (state->free_reqs) {
+ kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
+ state->free_reqs = 0;
}
- io_flush_cached_locked_reqs(ctx, cs);
- io_req_cache_free(&cs->free_list, NULL);
+ io_flush_cached_locked_reqs(ctx, state);
+ io_req_cache_free(&state->free_list);
mutex_unlock(&ctx->uring_lock);
}
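Editorial note: the bulk slab API used above takes the object count followed by the pointer array, freeing the whole batch in one call instead of a loop of kmem_cache_free(). A short sketch under assumed names (flush_req_cache() is illustrative):

	#include <linux/slab.h>

	/* Free a batch of cached objects with a single slab call. */
	static void flush_req_cache(struct kmem_cache *cachep, void **objs,
				    unsigned int *nr)
	{
		if (*nr) {
			kmem_cache_free_bulk(cachep, *nr, objs);
			*nr = 0;
		}
	}
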
@@ -8702,6 +9140,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
sock_release(ctx->ring_sock);
}
#endif
+ WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
io_mem_free(ctx->rings);
io_mem_free(ctx->sq_sqes);
@@ -8801,6 +9240,7 @@ static void io_ring_exit_work(struct work_struct *work)
{
struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
unsigned long timeout = jiffies + HZ * 60 * 5;
+ unsigned long interval = HZ / 20;
struct io_tctx_exit exit;
struct io_tctx_node *node;
int ret;
@@ -8825,8 +9265,11 @@ static void io_ring_exit_work(struct work_struct *work)
io_sq_thread_unpark(sqd);
}
- WARN_ON_ONCE(time_after(jiffies, timeout));
- } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
+ if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
+ /* there is little hope left, don't run it too often */
+ interval = HZ * 60;
+ }
+ } while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
init_completion(&exit.completion);
init_task_work(&exit.task_work, io_tctx_exit_cb);
@@ -8855,8 +9298,8 @@ static void io_ring_exit_work(struct work_struct *work)
mutex_lock(&ctx->uring_lock);
}
mutex_unlock(&ctx->uring_lock);
- spin_lock_irq(&ctx->completion_lock);
- spin_unlock_irq(&ctx->completion_lock);
+ spin_lock(&ctx->completion_lock);
+ spin_unlock(&ctx->completion_lock);
io_ring_ctx_free(ctx);
}
@@ -8868,16 +9311,18 @@ static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
struct io_kiocb *req, *tmp;
int canceled = 0;
- spin_lock_irq(&ctx->completion_lock);
+ spin_lock(&ctx->completion_lock);
+ spin_lock_irq(&ctx->timeout_lock);
list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
if (io_match_task(req, tsk, cancel_all)) {
io_kill_timeout(req, -ECANCELED);
canceled++;
}
}
+ spin_unlock_irq(&ctx->timeout_lock);
if (canceled != 0)
io_commit_cqring(ctx);
- spin_unlock_irq(&ctx->completion_lock);
+ spin_unlock(&ctx->completion_lock);
if (canceled != 0)
io_cqring_ev_posted(ctx);
return canceled != 0;
@@ -8933,13 +9378,12 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
bool ret;
if (!cancel->all && (req->flags & REQ_F_LINK_TIMEOUT)) {
- unsigned long flags;
struct io_ring_ctx *ctx = req->ctx;
/* protect against races with linked timeouts */
- spin_lock_irqsave(&ctx->completion_lock, flags);
+ spin_lock(&ctx->completion_lock);
ret = io_match_task(req, cancel->task, cancel->all);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ spin_unlock(&ctx->completion_lock);
} else {
ret = io_match_task(req, cancel->task, cancel->all);
}
@@ -8952,14 +9396,14 @@ static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
struct io_defer_entry *de;
LIST_HEAD(list);
- spin_lock_irq(&ctx->completion_lock);
+ spin_lock(&ctx->completion_lock);
list_for_each_entry_reverse(de, &ctx->defer_list, list) {
if (io_match_task(de->req, task, cancel_all)) {
list_cut_position(&list, &ctx->defer_list, &de->list);
break;
}
}
- spin_unlock_irq(&ctx->completion_lock);
+ spin_unlock(&ctx->completion_lock);
if (list_empty(&list))
return false;
@@ -9124,8 +9568,8 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx)
* Must be after io_uring_del_task_file() (removes nodes under
* uring_lock) to avoid race with io_uring_try_cancel_iowq().
*/
- tctx->io_wq = NULL;
io_wq_put_and_exit(wq);
+ tctx->io_wq = NULL;
}
}
@@ -9141,9 +9585,11 @@ static void io_uring_drop_tctx_refs(struct task_struct *task)
struct io_uring_task *tctx = task->io_uring;
unsigned int refs = tctx->cached_refs;
- tctx->cached_refs = 0;
- percpu_counter_sub(&tctx->inflight, refs);
- put_task_struct_many(task, refs);
+ if (refs) {
+ tctx->cached_refs = 0;
+ percpu_counter_sub(&tctx->inflight, refs);
+ put_task_struct_many(task, refs);
+ }
}
/*
@@ -9164,9 +9610,9 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
if (tctx->io_wq)
io_wq_exit_start(tctx->io_wq);
- io_uring_drop_tctx_refs(current);
atomic_inc(&tctx->in_idle);
do {
+ io_uring_drop_tctx_refs(current);
/* read completions before cancelations */
inflight = tctx_inflight(tctx, !cancel_all);
if (!inflight)
@@ -9190,6 +9636,7 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
}
prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
+ io_uring_drop_tctx_refs(current);
/*
* If we've seen completions, retry without waiting. This
* avoids a race where a completion comes in before we did
@@ -9208,9 +9655,9 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
}
}
-void __io_uring_cancel(struct files_struct *files)
+void __io_uring_cancel(bool cancel_all)
{
- io_uring_cancel_generic(!files, NULL);
+ io_uring_cancel_generic(cancel_all, NULL);
}
static void *io_uring_validate_mmap_request(struct file *file,
@@ -9370,7 +9817,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
*/
ret = 0;
if (ctx->flags & IORING_SETUP_SQPOLL) {
- io_cqring_overflow_flush(ctx, false);
+ io_cqring_overflow_flush(ctx);
if (unlikely(ctx->sq_data->thread == NULL)) {
ret = -EOWNERDEAD;
@@ -9506,7 +9953,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
io_uring_show_cred(m, index, cred);
}
seq_printf(m, "PollList:\n");
- spin_lock_irq(&ctx->completion_lock);
+ spin_lock(&ctx->completion_lock);
for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
struct hlist_head *list = &ctx->cancel_hash[i];
struct io_kiocb *req;
@@ -9515,7 +9962,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
req->task->task_works != NULL);
}
- spin_unlock_irq(&ctx->completion_lock);
+ spin_unlock(&ctx->completion_lock);
if (has_lock)
mutex_unlock(&ctx->uring_lock);
}
@@ -10048,6 +10495,31 @@ static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
return io_wq_cpu_affinity(tctx->io_wq, NULL);
}
+static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
+ void __user *arg)
+{
+ struct io_uring_task *tctx = current->io_uring;
+ __u32 new_count[2];
+ int i, ret;
+
+ if (!tctx || !tctx->io_wq)
+ return -EINVAL;
+ if (copy_from_user(new_count, arg, sizeof(new_count)))
+ return -EFAULT;
+ for (i = 0; i < ARRAY_SIZE(new_count); i++)
+ if (new_count[i] > INT_MAX)
+ return -EINVAL;
+
+ ret = io_wq_max_workers(tctx->io_wq, new_count);
+ if (ret)
+ return ret;
+
+ if (copy_to_user(arg, new_count, sizeof(new_count)))
+ return -EFAULT;
+
+ return 0;
+}
+
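Editorial note: from userspace the new opcode is driven through io_uring_register(2) with an array of two __u32 values and nr_args == 2; the kernel copies the array back after applying it. A hedged sketch using the raw syscall — it assumes headers that already carry IORING_REGISTER_IOWQ_MAX_WORKERS, and the bounded/unbounded ordering of the two entries is an assumption here, not something shown in this hunk:

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/io_uring.h>

	static int set_iowq_max_workers(int ring_fd, unsigned int bounded,
					unsigned int unbounded)
	{
		__u32 counts[2] = { bounded, unbounded }; /* assumed: [0]=bounded, [1]=unbounded */

		if (syscall(__NR_io_uring_register, ring_fd,
			    IORING_REGISTER_IOWQ_MAX_WORKERS, counts, 2) < 0)
			return -1;
		/* the kernel copied the array it applied back to us */
		printf("io-wq worker limits now: %u / %u\n", counts[0], counts[1]);
		return 0;
	}
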
static bool io_register_op_must_quiesce(int op)
{
switch (op) {
@@ -10065,12 +10537,40 @@ static bool io_register_op_must_quiesce(int op)
case IORING_REGISTER_BUFFERS_UPDATE:
case IORING_REGISTER_IOWQ_AFF:
case IORING_UNREGISTER_IOWQ_AFF:
+ case IORING_REGISTER_IOWQ_MAX_WORKERS:
return false;
default:
return true;
}
}
+static int io_ctx_quiesce(struct io_ring_ctx *ctx)
+{
+ long ret;
+
+ percpu_ref_kill(&ctx->refs);
+
+ /*
+ * Drop uring mutex before waiting for references to exit. If another
+ * thread is currently inside io_uring_enter() it might need to grab the
+ * uring_lock to make progress. If we hold it here across the drain
+ * wait, then we can deadlock. It's safe to drop the mutex here, since
+ * no new references will come in after we've killed the percpu ref.
+ */
+ mutex_unlock(&ctx->uring_lock);
+ do {
+ ret = wait_for_completion_interruptible(&ctx->ref_comp);
+ if (!ret)
+ break;
+ ret = io_run_task_work_sig();
+ } while (ret >= 0);
+ mutex_lock(&ctx->uring_lock);
+
+ if (ret)
+ io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
+ return ret;
+}
+
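Editorial note: io_ctx_quiesce() is one instance of the generic percpu_ref kill/wait/resurrect quiesce. A loose, generic sketch of that shape (quiesce_and_resume() is illustrative; it assumes the ref's release callback completes @comp, as ctx->ref_comp is completed in io_uring):

	#include <linux/percpu-refcount.h>
	#include <linux/completion.h>

	static void quiesce_and_resume(struct percpu_ref *ref, struct completion *comp)
	{
		percpu_ref_kill(ref);		/* switch to atomic mode, no new users */
		wait_for_completion(comp);	/* all outstanding references dropped */
		percpu_ref_resurrect(ref);	/* back to per-cpu mode for reuse */
		reinit_completion(comp);
	}
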
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
@@ -10095,31 +10595,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
}
if (io_register_op_must_quiesce(opcode)) {
- percpu_ref_kill(&ctx->refs);
-
- /*
- * Drop uring mutex before waiting for references to exit. If
- * another thread is currently inside io_uring_enter() it might
- * need to grab the uring_lock to make progress. If we hold it
- * here across the drain wait, then we can deadlock. It's safe
- * to drop the mutex here, since no new references will come in
- * after we've killed the percpu ref.
- */
- mutex_unlock(&ctx->uring_lock);
- do {
- ret = wait_for_completion_interruptible(&ctx->ref_comp);
- if (!ret)
- break;
- ret = io_run_task_work_sig();
- if (ret < 0)
- break;
- } while (1);
- mutex_lock(&ctx->uring_lock);
-
- if (ret) {
- io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
+ ret = io_ctx_quiesce(ctx);
+ if (ret)
return ret;
- }
}
switch (opcode) {
@@ -10216,6 +10694,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_unregister_iowq_aff(ctx);
break;
+ case IORING_REGISTER_IOWQ_MAX_WORKERS:
+ ret = -EINVAL;
+ if (!arg || nr_args != 2)
+ break;
+ ret = io_register_iowq_max_workers(ctx, arg);
+ break;
default:
ret = -EINVAL;
break;
@@ -10297,11 +10781,16 @@ static int __init io_uring_init(void)
BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
BUILD_BUG_SQE_ELEM(42, __u16, personality);
BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
+ BUILD_BUG_SQE_ELEM(44, __u32, file_index);
BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
sizeof(struct io_uring_rsrc_update));
BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
sizeof(struct io_uring_rsrc_update2));
+
+ /* ->buf_index is u16 */
+ BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
+
/* should fit into one byte */
BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 1e2204fa9963..504e69578112 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -263,209 +263,6 @@ static long ioctl_file_clone_range(struct file *file,
args.src_length, args.dest_offset);
}
-#ifdef CONFIG_BLOCK
-
-static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
-{
- return (offset >> inode->i_blkbits);
-}
-
-static inline loff_t blk_to_logical(struct inode *inode, sector_t blk)
-{
- return (blk << inode->i_blkbits);
-}
-
-/**
- * __generic_block_fiemap - FIEMAP for block based inodes (no locking)
- * @inode: the inode to map
- * @fieinfo: the fiemap info struct that will be passed back to userspace
- * @start: where to start mapping in the inode
- * @len: how much space to map
- * @get_block: the fs's get_block function
- *
- * This does FIEMAP for block based inodes. Basically it will just loop
- * through get_block until we hit the number of extents we want to map, or we
- * go past the end of the file and hit a hole.
- *
- * If it is possible to have data blocks beyond a hole past @inode->i_size, then
- * please do not use this function, it will stop at the first unmapped block
- * beyond i_size.
- *
- * If you use this function directly, you need to do your own locking. Use
- * generic_block_fiemap if you want the locking done for you.
- */
-static int __generic_block_fiemap(struct inode *inode,
- struct fiemap_extent_info *fieinfo, loff_t start,
- loff_t len, get_block_t *get_block)
-{
- struct buffer_head map_bh;
- sector_t start_blk, last_blk;
- loff_t isize = i_size_read(inode);
- u64 logical = 0, phys = 0, size = 0;
- u32 flags = FIEMAP_EXTENT_MERGED;
- bool past_eof = false, whole_file = false;
- int ret = 0;
-
- ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC);
- if (ret)
- return ret;
-
- /*
- * Either the i_mutex or other appropriate locking needs to be held
- * since we expect isize to not change at all through the duration of
- * this call.
- */
- if (len >= isize) {
- whole_file = true;
- len = isize;
- }
-
- /*
- * Some filesystems can't deal with being asked to map less than
- * blocksize, so make sure our len is at least block length.
- */
- if (logical_to_blk(inode, len) == 0)
- len = blk_to_logical(inode, 1);
-
- start_blk = logical_to_blk(inode, start);
- last_blk = logical_to_blk(inode, start + len - 1);
-
- do {
- /*
- * we set b_size to the total size we want so it will map as
- * many contiguous blocks as possible at once
- */
- memset(&map_bh, 0, sizeof(struct buffer_head));
- map_bh.b_size = len;
-
- ret = get_block(inode, start_blk, &map_bh, 0);
- if (ret)
- break;
-
- /* HOLE */
- if (!buffer_mapped(&map_bh)) {
- start_blk++;
-
- /*
- * We want to handle the case where there is an
- * allocated block at the front of the file, and then
- * nothing but holes up to the end of the file properly,
- * to make sure that extent at the front gets properly
- * marked with FIEMAP_EXTENT_LAST
- */
- if (!past_eof &&
- blk_to_logical(inode, start_blk) >= isize)
- past_eof = 1;
-
- /*
- * First hole after going past the EOF, this is our
- * last extent
- */
- if (past_eof && size) {
- flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST;
- ret = fiemap_fill_next_extent(fieinfo, logical,
- phys, size,
- flags);
- } else if (size) {
- ret = fiemap_fill_next_extent(fieinfo, logical,
- phys, size, flags);
- size = 0;
- }
-
- /* if we have holes up to/past EOF then we're done */
- if (start_blk > last_blk || past_eof || ret)
- break;
- } else {
- /*
- * We have gone over the length of what we wanted to
- * map, and it wasn't the entire file, so add the extent
- * we got last time and exit.
- *
- * This is for the case where say we want to map all the
- * way up to the second to the last block in a file, but
- * the last block is a hole, making the second to last
- * block FIEMAP_EXTENT_LAST. In this case we want to
- * see if there is a hole after the second to last block
- * so we can mark it properly. If we found data after
- * we exceeded the length we were requesting, then we
- * are good to go, just add the extent to the fieinfo
- * and break
- */
- if (start_blk > last_blk && !whole_file) {
- ret = fiemap_fill_next_extent(fieinfo, logical,
- phys, size,
- flags);
- break;
- }
-
- /*
- * if size != 0 then we know we already have an extent
- * to add, so add it.
- */
- if (size) {
- ret = fiemap_fill_next_extent(fieinfo, logical,
- phys, size,
- flags);
- if (ret)
- break;
- }
-
- logical = blk_to_logical(inode, start_blk);
- phys = blk_to_logical(inode, map_bh.b_blocknr);
- size = map_bh.b_size;
- flags = FIEMAP_EXTENT_MERGED;
-
- start_blk += logical_to_blk(inode, size);
-
- /*
- * If we are past the EOF, then we need to make sure as
- * soon as we find a hole that the last extent we found
- * is marked with FIEMAP_EXTENT_LAST
- */
- if (!past_eof && logical + size >= isize)
- past_eof = true;
- }
- cond_resched();
- if (fatal_signal_pending(current)) {
- ret = -EINTR;
- break;
- }
-
- } while (1);
-
- /* If ret is 1 then we just hit the end of the extent array */
- if (ret == 1)
- ret = 0;
-
- return ret;
-}
-
-/**
- * generic_block_fiemap - FIEMAP for block based inodes
- * @inode: The inode to map
- * @fieinfo: The mapping information
- * @start: The initial block to map
- * @len: The length of the extect to attempt to map
- * @get_block: The block mapping function for the fs
- *
- * Calls __generic_block_fiemap to map the inode, after taking
- * the inode's mutex lock.
- */
-
-int generic_block_fiemap(struct inode *inode,
- struct fiemap_extent_info *fieinfo, u64 start,
- u64 len, get_block_t *get_block)
-{
- int ret;
- inode_lock(inode);
- ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block);
- inode_unlock(inode);
- return ret;
-}
-EXPORT_SYMBOL(generic_block_fiemap);
-
-#endif /* CONFIG_BLOCK */
-
/*
* This provides compatibility with legacy XFS pre-allocation ioctls
* which predate the fallocate syscall.
@@ -817,6 +614,14 @@ static int fileattr_set_prepare(struct inode *inode,
if ((old_ma->fsx_xflags ^ fa->fsx_xflags) &
FS_XFLAG_PROJINHERIT)
return -EINVAL;
+ } else {
+ /*
+ * Caller is allowed to change the project ID. If it is being
+ * changed, make sure that the new value is valid.
+ */
+ if (old_ma->fsx_projid != fa->fsx_projid &&
+ !projid_valid(make_kprojid(&init_user_ns, fa->fsx_projid)))
+ return -EINVAL;
}
/* Check extent size hints. */
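Editorial note: the new else-branch above rejects a project ID change when the requested value cannot be represented as a kernel projid. A standalone sketch of that check (the helper is illustrative; the use of init_user_ns mirrors the hunk above):

	#include <linux/projid.h>
	#include <linux/user_namespace.h>

	static bool projid_change_ok(__u32 old_id, __u32 new_id)
	{
		if (old_id == new_id)
			return true;	/* unchanged, nothing to validate */
		/* map the raw value and reject anything that isn't a valid kprojid */
		return projid_valid(make_kprojid(&init_user_ns, new_id));
	}
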
diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile
index eef2722d93a1..4143a3ff89db 100644
--- a/fs/iomap/Makefile
+++ b/fs/iomap/Makefile
@@ -9,9 +9,9 @@ ccflags-y += -I $(srctree)/$(src) # needed for trace events
obj-$(CONFIG_FS_IOMAP) += iomap.o
iomap-y += trace.o \
- apply.o \
buffered-io.o \
direct-io.o \
fiemap.o \
+ iter.o \
seek.o
iomap-$(CONFIG_SWAP) += swapfile.o
diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c
deleted file mode 100644
index 26ab6563181f..000000000000
--- a/fs/iomap/apply.c
+++ /dev/null
@@ -1,99 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2010 Red Hat, Inc.
- * Copyright (c) 2016-2018 Christoph Hellwig.
- */
-#include <linux/module.h>
-#include <linux/compiler.h>
-#include <linux/fs.h>
-#include <linux/iomap.h>
-#include "trace.h"
-
-/*
- * Execute a iomap write on a segment of the mapping that spans a
- * contiguous range of pages that have identical block mapping state.
- *
- * This avoids the need to map pages individually, do individual allocations
- * for each page and most importantly avoid the need for filesystem specific
- * locking per page. Instead, all the operations are amortised over the entire
- * range of pages. It is assumed that the filesystems will lock whatever
- * resources they require in the iomap_begin call, and release them in the
- * iomap_end call.
- */
-loff_t
-iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
- const struct iomap_ops *ops, void *data, iomap_actor_t actor)
-{
- struct iomap iomap = { .type = IOMAP_HOLE };
- struct iomap srcmap = { .type = IOMAP_HOLE };
- loff_t written = 0, ret;
- u64 end;
-
- trace_iomap_apply(inode, pos, length, flags, ops, actor, _RET_IP_);
-
- /*
- * Need to map a range from start position for length bytes. This can
- * span multiple pages - it is only guaranteed to return a range of a
- * single type of pages (e.g. all into a hole, all mapped or all
- * unwritten). Failure at this point has nothing to undo.
- *
- * If allocation is required for this range, reserve the space now so
- * that the allocation is guaranteed to succeed later on. Once we copy
- * the data into the page cache pages, then we cannot fail otherwise we
- * expose transient stale data. If the reserve fails, we can safely
- * back out at this point as there is nothing to undo.
- */
- ret = ops->iomap_begin(inode, pos, length, flags, &iomap, &srcmap);
- if (ret)
- return ret;
- if (WARN_ON(iomap.offset > pos)) {
- written = -EIO;
- goto out;
- }
- if (WARN_ON(iomap.length == 0)) {
- written = -EIO;
- goto out;
- }
-
- trace_iomap_apply_dstmap(inode, &iomap);
- if (srcmap.type != IOMAP_HOLE)
- trace_iomap_apply_srcmap(inode, &srcmap);
-
- /*
- * Cut down the length to the one actually provided by the filesystem,
- * as it might not be able to give us the whole size that we requested.
- */
- end = iomap.offset + iomap.length;
- if (srcmap.type != IOMAP_HOLE)
- end = min(end, srcmap.offset + srcmap.length);
- if (pos + length > end)
- length = end - pos;
-
- /*
- * Now that we have guaranteed that the space allocation will succeed,
- * we can do the copy-in page by page without having to worry about
- * failures exposing transient data.
- *
- * To support COW operations, we read in data for partially blocks from
- * the srcmap if the file system filled it in. In that case we the
- * length needs to be limited to the earlier of the ends of the iomaps.
- * If the file system did not provide a srcmap we pass in the normal
- * iomap into the actors so that they don't need to have special
- * handling for the two cases.
- */
- written = actor(inode, pos, length, data, &iomap,
- srcmap.type != IOMAP_HOLE ? &srcmap : &iomap);
-
-out:
- /*
- * Now the data has been copied, commit the range we've copied. This
- * should not fail unless the filesystem has had a fatal error.
- */
- if (ops->iomap_end) {
- ret = ops->iomap_end(inode, pos, length,
- written > 0 ? written : 0,
- flags, &iomap);
- }
-
- return written ? written : ret;
-}
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 87ccb3438bec..9cc5798423d1 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -36,7 +36,7 @@ static inline struct iomap_page *to_iomap_page(struct page *page)
{
/*
* per-block data is stored in the head page. Callers should
- * not be dealing with tail pages (and if they are, they can
+ * not be dealing with tail pages, and if they are, they can
* call thp_head() first.
*/
VM_BUG_ON_PGFLAGS(PageTail(page), page);
@@ -98,7 +98,7 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
unsigned last = (poff + plen - 1) >> block_bits;
/*
- * If the block size is smaller than the page size we need to check the
+ * If the block size is smaller than the page size, we need to check the
* per-block uptodate status and adjust the offset and length if needed
* to avoid reading in already uptodate ranges.
*/
@@ -126,7 +126,7 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
}
/*
- * If the extent spans the block that contains the i_size we need to
+ * If the extent spans the block that contains the i_size, we need to
* handle both halves separately so that we properly zero data in the
* page cache for blocks that are entirely outside of i_size.
*/
@@ -205,60 +205,67 @@ struct iomap_readpage_ctx {
struct readahead_control *rac;
};
-static void
-iomap_read_inline_data(struct inode *inode, struct page *page,
- struct iomap *iomap)
+static loff_t iomap_read_inline_data(const struct iomap_iter *iter,
+ struct page *page)
{
- size_t size = i_size_read(inode);
+ const struct iomap *iomap = iomap_iter_srcmap(iter);
+ size_t size = i_size_read(iter->inode) - iomap->offset;
+ size_t poff = offset_in_page(iomap->offset);
void *addr;
if (PageUptodate(page))
- return;
-
- BUG_ON(page_has_private(page));
- BUG_ON(page->index);
- BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));
-
- addr = kmap_atomic(page);
+ return PAGE_SIZE - poff;
+
+ if (WARN_ON_ONCE(size > PAGE_SIZE - poff))
+ return -EIO;
+ if (WARN_ON_ONCE(size > PAGE_SIZE -
+ offset_in_page(iomap->inline_data)))
+ return -EIO;
+ if (WARN_ON_ONCE(size > iomap->length))
+ return -EIO;
+ if (poff > 0)
+ iomap_page_create(iter->inode, page);
+
+ addr = kmap_local_page(page) + poff;
memcpy(addr, iomap->inline_data, size);
- memset(addr + size, 0, PAGE_SIZE - size);
- kunmap_atomic(addr);
- SetPageUptodate(page);
+ memset(addr + size, 0, PAGE_SIZE - poff - size);
+ kunmap_local(addr);
+ iomap_set_range_uptodate(page, poff, PAGE_SIZE - poff);
+ return PAGE_SIZE - poff;
}
-static inline bool iomap_block_needs_zeroing(struct inode *inode,
- struct iomap *iomap, loff_t pos)
+static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
+ loff_t pos)
{
- return iomap->type != IOMAP_MAPPED ||
- (iomap->flags & IOMAP_F_NEW) ||
- pos >= i_size_read(inode);
+ const struct iomap *srcmap = iomap_iter_srcmap(iter);
+
+ return srcmap->type != IOMAP_MAPPED ||
+ (srcmap->flags & IOMAP_F_NEW) ||
+ pos >= i_size_read(iter->inode);
}
-static loff_t
-iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
- struct iomap *iomap, struct iomap *srcmap)
+static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
+ struct iomap_readpage_ctx *ctx, loff_t offset)
{
- struct iomap_readpage_ctx *ctx = data;
+ const struct iomap *iomap = &iter->iomap;
+ loff_t pos = iter->pos + offset;
+ loff_t length = iomap_length(iter) - offset;
struct page *page = ctx->cur_page;
struct iomap_page *iop;
- bool same_page = false, is_contig = false;
loff_t orig_pos = pos;
unsigned poff, plen;
sector_t sector;
- if (iomap->type == IOMAP_INLINE) {
- WARN_ON_ONCE(pos);
- iomap_read_inline_data(inode, page, iomap);
- return PAGE_SIZE;
- }
+ if (iomap->type == IOMAP_INLINE)
+ return min(iomap_read_inline_data(iter, page), length);
/* zero post-eof blocks as the page may be mapped */
- iop = iomap_page_create(inode, page);
- iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen);
+ iop = iomap_page_create(iter->inode, page);
+ iomap_adjust_read_range(iter->inode, iop, &pos, length, &poff, &plen);
if (plen == 0)
goto done;
- if (iomap_block_needs_zeroing(inode, iomap, pos)) {
+ if (iomap_block_needs_zeroing(iter, pos)) {
zero_user(page, poff, plen);
iomap_set_range_uptodate(page, poff, plen);
goto done;
@@ -268,16 +275,10 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
if (iop)
atomic_add(plen, &iop->read_bytes_pending);
- /* Try to merge into a previous segment if we can */
sector = iomap_sector(iomap, pos);
- if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
- if (__bio_try_merge_page(ctx->bio, page, plen, poff,
- &same_page))
- goto done;
- is_contig = true;
- }
-
- if (!is_contig || bio_full(ctx->bio, plen)) {
+ if (!ctx->bio ||
+ bio_end_sector(ctx->bio) != sector ||
+ bio_add_page(ctx->bio, page, plen, poff) != plen) {
gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
gfp_t orig_gfp = gfp;
unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);
@@ -301,13 +302,12 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
ctx->bio->bi_iter.bi_sector = sector;
bio_set_dev(ctx->bio, iomap->bdev);
ctx->bio->bi_end_io = iomap_read_end_io;
+ __bio_add_page(ctx->bio, page, plen, poff);
}
-
- bio_add_page(ctx->bio, page, plen, poff);
done:
/*
* Move the caller beyond our range so that it keeps making progress.
- * For that we have to include any leading non-uptodate ranges, but
+ * For that, we have to include any leading non-uptodate ranges, but
* we can skip trailing ones as they will be handled in the next
* iteration.
*/
@@ -317,23 +317,23 @@ done:
int
iomap_readpage(struct page *page, const struct iomap_ops *ops)
{
- struct iomap_readpage_ctx ctx = { .cur_page = page };
- struct inode *inode = page->mapping->host;
- unsigned poff;
- loff_t ret;
+ struct iomap_iter iter = {
+ .inode = page->mapping->host,
+ .pos = page_offset(page),
+ .len = PAGE_SIZE,
+ };
+ struct iomap_readpage_ctx ctx = {
+ .cur_page = page,
+ };
+ int ret;
trace_iomap_readpage(page->mapping->host, 1);
- for (poff = 0; poff < PAGE_SIZE; poff += ret) {
- ret = iomap_apply(inode, page_offset(page) + poff,
- PAGE_SIZE - poff, 0, ops, &ctx,
- iomap_readpage_actor);
- if (ret <= 0) {
- WARN_ON_ONCE(ret == 0);
- SetPageError(page);
- break;
- }
- }
+ while ((ret = iomap_iter(&iter, ops)) > 0)
+ iter.processed = iomap_readpage_iter(&iter, &ctx, 0);
+
+ if (ret < 0)
+ SetPageError(page);
if (ctx.bio) {
submit_bio(ctx.bio);
@@ -344,23 +344,22 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops)
}
/*
- * Just like mpage_readahead and block_read_full_page we always
+ * Just like mpage_readahead and block_read_full_page, we always
* return 0 and just mark the page as PageError on errors. This
- * should be cleaned up all through the stack eventually.
+ * should be cleaned up throughout the stack eventually.
*/
return 0;
}
EXPORT_SYMBOL_GPL(iomap_readpage);
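Editorial note: every caller converted in this series follows the same shape — fill in a struct iomap_iter, loop while iomap_iter() returns a positive value, and report progress through iter.processed. A minimal sketch of that idiom for a made-up operation (my_op/my_op_iter are not part of the patch):

	#include <linux/fs.h>
	#include <linux/iomap.h>

	static loff_t my_op_iter(struct iomap_iter *iter)
	{
		/* act on iter->iomap covering [iter->pos, iter->pos + iomap_length(iter)) */
		return iomap_length(iter);	/* bytes handled, or a negative errno */
	}

	static int my_op(struct inode *inode, loff_t pos, loff_t len,
			 const struct iomap_ops *ops)
	{
		struct iomap_iter iter = {
			.inode	= inode,
			.pos	= pos,
			.len	= len,
		};
		int ret;

		/* iomap_iter() maps the next extent and advances by iter.processed */
		while ((ret = iomap_iter(&iter, ops)) > 0)
			iter.processed = my_op_iter(&iter);
		return ret;
	}
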
-static loff_t
-iomap_readahead_actor(struct inode *inode, loff_t pos, loff_t length,
- void *data, struct iomap *iomap, struct iomap *srcmap)
+static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
+ struct iomap_readpage_ctx *ctx)
{
- struct iomap_readpage_ctx *ctx = data;
+ loff_t length = iomap_length(iter);
loff_t done, ret;
for (done = 0; done < length; done += ret) {
- if (ctx->cur_page && offset_in_page(pos + done) == 0) {
+ if (ctx->cur_page && offset_in_page(iter->pos + done) == 0) {
if (!ctx->cur_page_in_bio)
unlock_page(ctx->cur_page);
put_page(ctx->cur_page);
@@ -370,8 +369,7 @@ iomap_readahead_actor(struct inode *inode, loff_t pos, loff_t length,
ctx->cur_page = readahead_page(ctx->rac);
ctx->cur_page_in_bio = false;
}
- ret = iomap_readpage_actor(inode, pos + done, length - done,
- ctx, iomap, srcmap);
+ ret = iomap_readpage_iter(iter, ctx, done);
}
return done;
@@ -394,25 +392,19 @@ iomap_readahead_actor(struct inode *inode, loff_t pos, loff_t length,
*/
void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
{
- struct inode *inode = rac->mapping->host;
- loff_t pos = readahead_pos(rac);
- size_t length = readahead_length(rac);
+ struct iomap_iter iter = {
+ .inode = rac->mapping->host,
+ .pos = readahead_pos(rac),
+ .len = readahead_length(rac),
+ };
struct iomap_readpage_ctx ctx = {
.rac = rac,
};
- trace_iomap_readahead(inode, readahead_count(rac));
+ trace_iomap_readahead(rac->mapping->host, readahead_count(rac));
- while (length > 0) {
- ssize_t ret = iomap_apply(inode, pos, length, 0, ops,
- &ctx, iomap_readahead_actor);
- if (ret <= 0) {
- WARN_ON_ONCE(ret == 0);
- break;
- }
- pos += ret;
- length -= ret;
- }
+ while (iomap_iter(&iter, ops) > 0)
+ iter.processed = iomap_readahead_iter(&iter, &ctx);
if (ctx.bio)
submit_bio(ctx.bio);
@@ -467,7 +459,7 @@ iomap_releasepage(struct page *page, gfp_t gfp_mask)
/*
* mm accommodates an old ext3 case where clean pages might not have had
* the dirty bit cleared. Thus, it can send actual dirty pages to
- * ->releasepage() via shrink_active_list(), skip those here.
+ * ->releasepage() via shrink_active_list(); skip those here.
*/
if (PageDirty(page) || PageWriteback(page))
return 0;
@@ -482,7 +474,7 @@ iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len)
trace_iomap_invalidatepage(page->mapping->host, offset, len);
/*
- * If we are invalidating the entire page, clear the dirty state from it
+ * If we're invalidating the entire page, clear the dirty state from it
* and release it to avoid unnecessary buildup of the LRU.
*/
if (offset == 0 && len == PAGE_SIZE) {
@@ -516,10 +508,6 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage,
EXPORT_SYMBOL_GPL(iomap_migrate_page);
#endif /* CONFIG_MIGRATION */
-enum {
- IOMAP_WRITE_F_UNSHARE = (1 << 0),
-};
-
static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{
@@ -535,7 +523,7 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
static int
iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff,
- unsigned plen, struct iomap *iomap)
+ unsigned plen, const struct iomap *iomap)
{
struct bio_vec bvec;
struct bio bio;
@@ -548,12 +536,12 @@ iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff,
return submit_bio_wait(&bio);
}
-static int
-__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
- struct page *page, struct iomap *srcmap)
+static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
+ unsigned len, struct page *page)
{
- struct iomap_page *iop = iomap_page_create(inode, page);
- loff_t block_size = i_blocksize(inode);
+ const struct iomap *srcmap = iomap_iter_srcmap(iter);
+ struct iomap_page *iop = iomap_page_create(iter->inode, page);
+ loff_t block_size = i_blocksize(iter->inode);
loff_t block_start = round_down(pos, block_size);
loff_t block_end = round_up(pos + len, block_size);
unsigned from = offset_in_page(pos), to = from + len, poff, plen;
@@ -563,18 +551,18 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
ClearPageError(page);
do {
- iomap_adjust_read_range(inode, iop, &block_start,
+ iomap_adjust_read_range(iter->inode, iop, &block_start,
block_end - block_start, &poff, &plen);
if (plen == 0)
break;
- if (!(flags & IOMAP_WRITE_F_UNSHARE) &&
+ if (!(iter->flags & IOMAP_UNSHARE) &&
(from <= poff || from >= poff + plen) &&
(to <= poff || to >= poff + plen))
continue;
- if (iomap_block_needs_zeroing(inode, srcmap, block_start)) {
- if (WARN_ON_ONCE(flags & IOMAP_WRITE_F_UNSHARE))
+ if (iomap_block_needs_zeroing(iter, block_start)) {
+ if (WARN_ON_ONCE(iter->flags & IOMAP_UNSHARE))
return -EIO;
zero_user_segments(page, poff, from, to, poff + plen);
} else {
@@ -589,41 +577,54 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
return 0;
}
-static int
-iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, struct iomap *iomap, struct iomap *srcmap)
+static int iomap_write_begin_inline(const struct iomap_iter *iter,
+ struct page *page)
{
- const struct iomap_page_ops *page_ops = iomap->page_ops;
+ int ret;
+
+ /* needs more work for the tailpacking case; disable for now */
+ if (WARN_ON_ONCE(iomap_iter_srcmap(iter)->offset != 0))
+ return -EIO;
+ ret = iomap_read_inline_data(iter, page);
+ if (ret < 0)
+ return ret;
+ return 0;
+}
+
+static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
+ unsigned len, struct page **pagep)
+{
+ const struct iomap_page_ops *page_ops = iter->iomap.page_ops;
+ const struct iomap *srcmap = iomap_iter_srcmap(iter);
struct page *page;
int status = 0;
- BUG_ON(pos + len > iomap->offset + iomap->length);
- if (srcmap != iomap)
+ BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
+ if (srcmap != &iter->iomap)
BUG_ON(pos + len > srcmap->offset + srcmap->length);
if (fatal_signal_pending(current))
return -EINTR;
if (page_ops && page_ops->page_prepare) {
- status = page_ops->page_prepare(inode, pos, len, iomap);
+ status = page_ops->page_prepare(iter->inode, pos, len);
if (status)
return status;
}
- page = grab_cache_page_write_begin(inode->i_mapping, pos >> PAGE_SHIFT,
- AOP_FLAG_NOFS);
+ page = grab_cache_page_write_begin(iter->inode->i_mapping,
+ pos >> PAGE_SHIFT, AOP_FLAG_NOFS);
if (!page) {
status = -ENOMEM;
goto out_no_page;
}
if (srcmap->type == IOMAP_INLINE)
- iomap_read_inline_data(inode, page, srcmap);
- else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
+ status = iomap_write_begin_inline(iter, page);
+ else if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
status = __block_write_begin_int(page, pos, len, NULL, srcmap);
else
- status = __iomap_write_begin(inode, pos, len, flags, page,
- srcmap);
+ status = __iomap_write_begin(iter, pos, len, page);
if (unlikely(status))
goto out_unlock;
@@ -634,11 +635,11 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
out_unlock:
unlock_page(page);
put_page(page);
- iomap_write_failed(inode, pos, len);
+ iomap_write_failed(iter->inode, pos, len);
out_no_page:
if (page_ops && page_ops->page_done)
- page_ops->page_done(inode, pos, 0, NULL, iomap);
+ page_ops->page_done(iter->inode, pos, 0, NULL);
return status;
}
@@ -650,13 +651,13 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
/*
* The blocks that were entirely written will now be uptodate, so we
* don't have to worry about a readpage reading them and overwriting a
- * partial write. However if we have encountered a short write and only
+ * partial write. However, if we've encountered a short write and only
* partially written into a block, it will not be marked uptodate, so a
* readpage might come in and destroy our partial write.
*
- * Do the simplest thing, and just treat any short write to a non
- * uptodate page as a zero-length write, and force the caller to redo
- * the whole thing.
+ * Do the simplest thing and just treat any short write to a
+ * non-uptodate page as a zero-length write, and force the caller to
+ * redo the whole thing.
*/
if (unlikely(copied < len && !PageUptodate(page)))
return 0;
@@ -665,39 +666,40 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
return copied;
}
-static size_t iomap_write_end_inline(struct inode *inode, struct page *page,
- struct iomap *iomap, loff_t pos, size_t copied)
+static size_t iomap_write_end_inline(const struct iomap_iter *iter,
+ struct page *page, loff_t pos, size_t copied)
{
+ const struct iomap *iomap = &iter->iomap;
void *addr;
WARN_ON_ONCE(!PageUptodate(page));
- BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data));
+ BUG_ON(!iomap_inline_data_valid(iomap));
flush_dcache_page(page);
- addr = kmap_atomic(page);
- memcpy(iomap->inline_data + pos, addr + pos, copied);
- kunmap_atomic(addr);
+ addr = kmap_local_page(page) + pos;
+ memcpy(iomap_inline_data(iomap, pos), addr, copied);
+ kunmap_local(addr);
- mark_inode_dirty(inode);
+ mark_inode_dirty(iter->inode);
return copied;
}
/* Returns the number of bytes copied. May be 0. Cannot be an errno. */
-static size_t iomap_write_end(struct inode *inode, loff_t pos, size_t len,
- size_t copied, struct page *page, struct iomap *iomap,
- struct iomap *srcmap)
+static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
+ size_t copied, struct page *page)
{
- const struct iomap_page_ops *page_ops = iomap->page_ops;
- loff_t old_size = inode->i_size;
+ const struct iomap_page_ops *page_ops = iter->iomap.page_ops;
+ const struct iomap *srcmap = iomap_iter_srcmap(iter);
+ loff_t old_size = iter->inode->i_size;
size_t ret;
if (srcmap->type == IOMAP_INLINE) {
- ret = iomap_write_end_inline(inode, page, iomap, pos, copied);
+ ret = iomap_write_end_inline(iter, page, pos, copied);
} else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
- ret = block_write_end(NULL, inode->i_mapping, pos, len, copied,
- page, NULL);
+ ret = block_write_end(NULL, iter->inode->i_mapping, pos, len,
+ copied, page, NULL);
} else {
- ret = __iomap_write_end(inode, pos, len, copied, page);
+ ret = __iomap_write_end(iter->inode, pos, len, copied, page);
}
/*
@@ -706,29 +708,28 @@ static size_t iomap_write_end(struct inode *inode, loff_t pos, size_t len,
* preferably after I/O completion so that no stale data is exposed.
*/
if (pos + ret > old_size) {
- i_size_write(inode, pos + ret);
- iomap->flags |= IOMAP_F_SIZE_CHANGED;
+ i_size_write(iter->inode, pos + ret);
+ iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
}
unlock_page(page);
if (old_size < pos)
- pagecache_isize_extended(inode, old_size, pos);
+ pagecache_isize_extended(iter->inode, old_size, pos);
if (page_ops && page_ops->page_done)
- page_ops->page_done(inode, pos, ret, page, iomap);
+ page_ops->page_done(iter->inode, pos, ret, page);
put_page(page);
if (ret < len)
- iomap_write_failed(inode, pos, len);
+ iomap_write_failed(iter->inode, pos, len);
return ret;
}
-static loff_t
-iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
- struct iomap *iomap, struct iomap *srcmap)
+static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
{
- struct iov_iter *i = data;
- long status = 0;
+ loff_t length = iomap_length(iter);
+ loff_t pos = iter->pos;
ssize_t written = 0;
+ long status = 0;
do {
struct page *page;
@@ -744,7 +745,7 @@ again:
bytes = length;
/*
- * Bring in the user page that we will copy from _first_.
+ * Bring in the user page that we'll copy from _first_.
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
@@ -754,18 +755,16 @@ again:
break;
}
- status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap,
- srcmap);
+ status = iomap_write_begin(iter, pos, bytes, &page);
if (unlikely(status))
break;
- if (mapping_writably_mapped(inode->i_mapping))
+ if (mapping_writably_mapped(iter->inode->i_mapping))
flush_dcache_page(page);
copied = copy_page_from_iter_atomic(page, offset, bytes, i);
- status = iomap_write_end(inode, pos, bytes, copied, page, iomap,
- srcmap);
+ status = iomap_write_end(iter, pos, bytes, copied, page);
if (unlikely(copied != status))
iov_iter_revert(i, copied - status);
@@ -786,36 +785,38 @@ again:
written += status;
length -= status;
- balance_dirty_pages_ratelimited(inode->i_mapping);
+ balance_dirty_pages_ratelimited(iter->inode->i_mapping);
} while (iov_iter_count(i) && length);
return written ? written : status;
}
ssize_t
-iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
+iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
const struct iomap_ops *ops)
{
- struct inode *inode = iocb->ki_filp->f_mapping->host;
- loff_t pos = iocb->ki_pos, ret = 0, written = 0;
-
- while (iov_iter_count(iter)) {
- ret = iomap_apply(inode, pos, iov_iter_count(iter),
- IOMAP_WRITE, ops, iter, iomap_write_actor);
- if (ret <= 0)
- break;
- pos += ret;
- written += ret;
- }
+ struct iomap_iter iter = {
+ .inode = iocb->ki_filp->f_mapping->host,
+ .pos = iocb->ki_pos,
+ .len = iov_iter_count(i),
+ .flags = IOMAP_WRITE,
+ };
+ int ret;
- return written ? written : ret;
+ while ((ret = iomap_iter(&iter, ops)) > 0)
+ iter.processed = iomap_write_iter(&iter, i);
+ if (iter.pos == iocb->ki_pos)
+ return ret;
+ return iter.pos - iocb->ki_pos;
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
-static loff_t
-iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
- struct iomap *iomap, struct iomap *srcmap)
+static loff_t iomap_unshare_iter(struct iomap_iter *iter)
{
+ struct iomap *iomap = &iter->iomap;
+ const struct iomap *srcmap = iomap_iter_srcmap(iter);
+ loff_t pos = iter->pos;
+ loff_t length = iomap_length(iter);
long status = 0;
loff_t written = 0;
@@ -831,13 +832,11 @@ iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length);
struct page *page;
- status = iomap_write_begin(inode, pos, bytes,
- IOMAP_WRITE_F_UNSHARE, &page, iomap, srcmap);
+ status = iomap_write_begin(iter, pos, bytes, &page);
if (unlikely(status))
return status;
- status = iomap_write_end(inode, pos, bytes, bytes, page, iomap,
- srcmap);
+ status = iomap_write_end(iter, pos, bytes, bytes, page);
if (WARN_ON_ONCE(status == 0))
return -EIO;
@@ -847,7 +846,7 @@ iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
written += status;
length -= status;
- balance_dirty_pages_ratelimited(inode->i_mapping);
+ balance_dirty_pages_ratelimited(iter->inode->i_mapping);
} while (length);
return written;
@@ -857,44 +856,43 @@ int
iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
const struct iomap_ops *ops)
{
- loff_t ret;
-
- while (len) {
- ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL,
- iomap_unshare_actor);
- if (ret <= 0)
- return ret;
- pos += ret;
- len -= ret;
- }
+ struct iomap_iter iter = {
+ .inode = inode,
+ .pos = pos,
+ .len = len,
+ .flags = IOMAP_WRITE | IOMAP_UNSHARE,
+ };
+ int ret;
- return 0;
+ while ((ret = iomap_iter(&iter, ops)) > 0)
+ iter.processed = iomap_unshare_iter(&iter);
+ return ret;
}
EXPORT_SYMBOL_GPL(iomap_file_unshare);
-static s64 iomap_zero(struct inode *inode, loff_t pos, u64 length,
- struct iomap *iomap, struct iomap *srcmap)
+static s64 __iomap_zero_iter(struct iomap_iter *iter, loff_t pos, u64 length)
{
struct page *page;
int status;
unsigned offset = offset_in_page(pos);
unsigned bytes = min_t(u64, PAGE_SIZE - offset, length);
- status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap, srcmap);
+ status = iomap_write_begin(iter, pos, bytes, &page);
if (status)
return status;
zero_user(page, offset, bytes);
mark_page_accessed(page);
- return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap);
+ return iomap_write_end(iter, pos, bytes, bytes, page);
}
-static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos,
- loff_t length, void *data, struct iomap *iomap,
- struct iomap *srcmap)
+static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
{
- bool *did_zero = data;
+ struct iomap *iomap = &iter->iomap;
+ const struct iomap *srcmap = iomap_iter_srcmap(iter);
+ loff_t pos = iter->pos;
+ loff_t length = iomap_length(iter);
loff_t written = 0;
/* already zeroed? we're done. */
@@ -904,10 +902,10 @@ static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos,
do {
s64 bytes;
- if (IS_DAX(inode))
+ if (IS_DAX(iter->inode))
bytes = dax_iomap_zero(pos, length, iomap);
else
- bytes = iomap_zero(inode, pos, length, iomap, srcmap);
+ bytes = __iomap_zero_iter(iter, pos, length);
if (bytes < 0)
return bytes;
@@ -925,19 +923,17 @@ int
iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
const struct iomap_ops *ops)
{
- loff_t ret;
-
- while (len > 0) {
- ret = iomap_apply(inode, pos, len, IOMAP_ZERO,
- ops, did_zero, iomap_zero_range_actor);
- if (ret <= 0)
- return ret;
-
- pos += ret;
- len -= ret;
- }
+ struct iomap_iter iter = {
+ .inode = inode,
+ .pos = pos,
+ .len = len,
+ .flags = IOMAP_ZERO,
+ };
+ int ret;
- return 0;
+ while ((ret = iomap_iter(&iter, ops)) > 0)
+ iter.processed = iomap_zero_iter(&iter, did_zero);
+ return ret;
}
EXPORT_SYMBOL_GPL(iomap_zero_range);
@@ -955,15 +951,15 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
}
EXPORT_SYMBOL_GPL(iomap_truncate_page);
-static loff_t
-iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
- void *data, struct iomap *iomap, struct iomap *srcmap)
+static loff_t iomap_page_mkwrite_iter(struct iomap_iter *iter,
+ struct page *page)
{
- struct page *page = data;
+ loff_t length = iomap_length(iter);
int ret;
- if (iomap->flags & IOMAP_F_BUFFER_HEAD) {
- ret = __block_write_begin_int(page, pos, length, NULL, iomap);
+ if (iter->iomap.flags & IOMAP_F_BUFFER_HEAD) {
+ ret = __block_write_begin_int(page, iter->pos, length, NULL,
+ &iter->iomap);
if (ret)
return ret;
block_commit_write(page, 0, length);
@@ -977,29 +973,24 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
{
+ struct iomap_iter iter = {
+ .inode = file_inode(vmf->vma->vm_file),
+ .flags = IOMAP_WRITE | IOMAP_FAULT,
+ };
struct page *page = vmf->page;
- struct inode *inode = file_inode(vmf->vma->vm_file);
- unsigned long length;
- loff_t offset;
ssize_t ret;
lock_page(page);
- ret = page_mkwrite_check_truncate(page, inode);
+ ret = page_mkwrite_check_truncate(page, iter.inode);
if (ret < 0)
goto out_unlock;
- length = ret;
-
- offset = page_offset(page);
- while (length > 0) {
- ret = iomap_apply(inode, offset, length,
- IOMAP_WRITE | IOMAP_FAULT, ops, page,
- iomap_page_mkwrite_actor);
- if (unlikely(ret <= 0))
- goto out_unlock;
- offset += ret;
- length -= ret;
- }
+ iter.pos = page_offset(page);
+ iter.len = ret;
+ while ((ret = iomap_iter(&iter, ops)) > 0)
+ iter.processed = iomap_page_mkwrite_iter(&iter, page);
+ if (ret < 0)
+ goto out_unlock;
wait_for_stable_page(page);
return VM_FAULT_LOCKED;
out_unlock:
@@ -1016,7 +1007,7 @@ iomap_finish_page_writeback(struct inode *inode, struct page *page,
if (error) {
SetPageError(page);
- mapping_set_error(inode->i_mapping, -EIO);
+ mapping_set_error(inode->i_mapping, error);
}
WARN_ON_ONCE(i_blocks_per_page(inode, page) > 1 && !iop);
@@ -1153,7 +1144,7 @@ static void iomap_writepage_end_bio(struct bio *bio)
* Submit the final bio for an ioend.
*
* If @error is non-zero, it means that we have a situation where some part of
- * the submission process has failed after we have marked paged for writeback
+ * the submission process has failed after we've marked pages for writeback
* and unlocked them. In this situation, we need to fail the bio instead of
* submitting it. This typically only happens on a filesystem shutdown.
*/
@@ -1168,7 +1159,7 @@ iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend,
error = wpc->ops->prepare_ioend(ioend, error);
if (error) {
/*
- * If we are failing the IO now, just mark the ioend with an
+ * If we're failing the IO now, just mark the ioend with an
* error and finish it. This will run IO completion immediately
* as there is only one reference to the ioend at this point in
* time.
@@ -1210,7 +1201,7 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
/*
* Allocate a new bio, and chain the old bio to the new one.
*
- * Note that we have to do perform the chaining in this unintuitive order
+ * Note that we have to perform the chaining in this unintuitive order
* so that the bi_private linkage is set up in the right direction for the
* traversal in iomap_finish_ioend().
*/
@@ -1249,7 +1240,7 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
/*
* Test to see if we have an existing ioend structure that we could append to
- * first, otherwise finish off the current ioend and start another.
+ * first; otherwise finish off the current ioend and start another.
*/
static void
iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page,
@@ -1259,7 +1250,6 @@ iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page,
sector_t sector = iomap_sector(&wpc->iomap, offset);
unsigned len = i_blocksize(inode);
unsigned poff = offset & (PAGE_SIZE - 1);
- bool merged, same_page = false;
if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, offset, sector)) {
if (wpc->ioend)
@@ -1267,19 +1257,13 @@ iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page,
wpc->ioend = iomap_alloc_ioend(inode, wpc, offset, sector, wbc);
}
- merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff,
- &same_page);
- if (iop)
- atomic_add(len, &iop->write_bytes_pending);
-
- if (!merged) {
- if (bio_full(wpc->ioend->io_bio, len)) {
- wpc->ioend->io_bio =
- iomap_chain_bio(wpc->ioend->io_bio);
- }
- bio_add_page(wpc->ioend->io_bio, page, len, poff);
+ if (bio_add_page(wpc->ioend->io_bio, page, len, poff) != len) {
+ wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio);
+ __bio_add_page(wpc->ioend->io_bio, page, len, poff);
}
+ if (iop)
+ atomic_add(len, &iop->write_bytes_pending);
wpc->ioend->io_size += len;
wbc_account_cgroup_owner(wbc, page, len);
}
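Editorial note: the rewritten ioend path leans on bio_add_page() returning the number of bytes it managed to add — when the bio has no room left it returns 0 rather than len, so a fresh bio is chained and __bio_add_page() performs the unchecked append. A compact sketch of that idiom (the helper name is illustrative; iomap_chain_bio() is the local helper shown above):

	#include <linux/bio.h>
	#include <linux/iomap.h>

	static void ioend_append_page(struct iomap_ioend *ioend, struct page *page,
				      unsigned int len, unsigned int poff)
	{
		/* try the cheap append into the current bio first */
		if (bio_add_page(ioend->io_bio, page, len, poff) != len) {
			/* bio full: chain a new one and append there unconditionally */
			ioend->io_bio = iomap_chain_bio(ioend->io_bio);
			__bio_add_page(ioend->io_bio, page, len, poff);
		}
	}
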
@@ -1287,9 +1271,9 @@ iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page,
/*
* We implement an immediate ioend submission policy here to avoid needing to
* chain multiple ioends and hence nest mempool allocations which can violate
- * forward progress guarantees we need to provide. The current ioend we are
- * adding blocks to is cached on the writepage context, and if the new block
- * does not append to the cached ioend it will create a new ioend and cache that
+ * the forward progress guarantees we need to provide. The current ioend we're
+ * adding blocks to is cached in the writepage context, and if the new block
+ * doesn't append to the cached ioend, it will create a new ioend and cache that
* instead.
*
* If a new ioend is created and cached, the old ioend is returned and queued
@@ -1351,7 +1335,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
if (unlikely(error)) {
/*
* Let the filesystem know what portion of the current page
- * failed to map. If the page wasn't been added to ioend, it
+ * failed to map. If the page hasn't been added to ioend, it
* won't be affected by I/O completion and we must unlock it
* now.
*/
@@ -1368,7 +1352,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
unlock_page(page);
/*
- * Preserve the original error if there was one, otherwise catch
+ * Preserve the original error if there was one; catch
* submission errors here and propagate into subsequent ioend
* submissions.
*/
@@ -1395,8 +1379,8 @@ done:
/*
* Write out a dirty page.
*
- * For delalloc space on the page we need to allocate space and flush it.
- * For unwritten space on the page we need to start the conversion to
+ * For delalloc space on the page, we need to allocate space and flush it.
+ * For unwritten space on the page, we need to start the conversion to
* regular allocated space.
*/
static int
@@ -1411,7 +1395,7 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
trace_iomap_writepage(inode, page_offset(page), PAGE_SIZE);
/*
- * Refuse to write the page out if we are called from reclaim context.
+ * Refuse to write the page out if we're called from reclaim context.
*
* This avoids stack overflows when called from deeply used stacks in
* random callers for direct reclaim or memcg reclaim. We explicitly
@@ -1456,20 +1440,20 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
unsigned offset_into_page = offset & (PAGE_SIZE - 1);
/*
- * Skip the page if it is fully outside i_size, e.g. due to a
- * truncate operation that is in progress. We must redirty the
+ * Skip the page if it's fully outside i_size, e.g. due to a
+ * truncate operation that's in progress. We must redirty the
* page so that reclaim stops reclaiming it. Otherwise
* iomap_vm_releasepage() is called on it and gets confused.
*
- * Note that the end_index is unsigned long, it would overflow
- * if the given offset is greater than 16TB on 32-bit system
- * and if we do check the page is fully outside i_size or not
- * via "if (page->index >= end_index + 1)" as "end_index + 1"
- * will be evaluated to 0. Hence this page will be redirtied
- * and be written out repeatedly which would result in an
- * infinite loop, the user program that perform this operation
- * will hang. Instead, we can verify this situation by checking
- * if the page to write is totally beyond the i_size or if it's
+ * Note that the end_index is unsigned long. If the given
+ * offset is greater than 16TB on a 32-bit system then if we
+ * checked if the page is fully outside i_size with
+ * "if (page->index >= end_index + 1)", "end_index + 1" would
+ * overflow and evaluate to 0. Hence this page would be
+ * redirtied and written out repeatedly, which would result in
+ * an infinite loop; the user program performing this operation
+ * would hang. Instead, we can detect this situation by
+ * checking if the page is totally beyond i_size or if its
* offset is just equal to the EOF.
*/
if (page->index > end_index ||
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 9398b8c31323..4ecd255e0511 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2010 Red Hat, Inc.
- * Copyright (c) 2016-2018 Christoph Hellwig.
+ * Copyright (c) 2016-2021 Christoph Hellwig.
*/
#include <linux/module.h>
#include <linux/compiler.h>
@@ -59,19 +59,17 @@ int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
}
EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
-static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap,
- struct bio *bio, loff_t pos)
+static void iomap_dio_submit_bio(const struct iomap_iter *iter,
+ struct iomap_dio *dio, struct bio *bio, loff_t pos)
{
atomic_inc(&dio->ref);
if (dio->iocb->ki_flags & IOCB_HIPRI)
bio_set_polled(bio, dio->iocb);
- dio->submit.last_queue = bdev_get_queue(iomap->bdev);
+ dio->submit.last_queue = bdev_get_queue(iter->iomap.bdev);
if (dio->dops && dio->dops->submit_io)
- dio->submit.cookie = dio->dops->submit_io(
- file_inode(dio->iocb->ki_filp),
- iomap, bio, pos);
+ dio->submit.cookie = dio->dops->submit_io(iter, bio, pos);
else
dio->submit.cookie = submit_bio(bio);
}
@@ -181,24 +179,23 @@ static void iomap_dio_bio_end_io(struct bio *bio)
}
}
-static void
-iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
- unsigned len)
+static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
+ loff_t pos, unsigned len)
{
struct page *page = ZERO_PAGE(0);
int flags = REQ_SYNC | REQ_IDLE;
struct bio *bio;
bio = bio_alloc(GFP_KERNEL, 1);
- bio_set_dev(bio, iomap->bdev);
- bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
+ bio_set_dev(bio, iter->iomap.bdev);
+ bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
get_page(page);
__bio_add_page(bio, page, len, 0);
bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
- iomap_dio_submit_bio(dio, iomap, bio, pos);
+ iomap_dio_submit_bio(iter, dio, bio, pos);
}
/*
@@ -206,8 +203,8 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
* mapping, and whether or not we want FUA. Note that we can end up
* clearing the WRITE_FUA flag in the dio request.
*/
-static inline unsigned int
-iomap_dio_bio_opflags(struct iomap_dio *dio, struct iomap *iomap, bool use_fua)
+static inline unsigned int iomap_dio_bio_opflags(struct iomap_dio *dio,
+ const struct iomap *iomap, bool use_fua)
{
unsigned int opflags = REQ_SYNC | REQ_IDLE;
@@ -229,13 +226,16 @@ iomap_dio_bio_opflags(struct iomap_dio *dio, struct iomap *iomap, bool use_fua)
return opflags;
}
-static loff_t
-iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
- struct iomap_dio *dio, struct iomap *iomap)
+static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
+ struct iomap_dio *dio)
{
+ const struct iomap *iomap = &iter->iomap;
+ struct inode *inode = iter->inode;
unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
unsigned int fs_block_size = i_blocksize(inode), pad;
unsigned int align = iov_iter_alignment(dio->submit.iter);
+ loff_t length = iomap_length(iter);
+ loff_t pos = iter->pos;
unsigned int bio_opf;
struct bio *bio;
bool need_zeroout = false;
@@ -286,7 +286,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
/* zero out from the start of the block to the write offset */
pad = pos & (fs_block_size - 1);
if (pad)
- iomap_dio_zero(dio, iomap, pos - pad, pad);
+ iomap_dio_zero(iter, dio, pos - pad, pad);
}
/*
@@ -339,7 +339,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
BIO_MAX_VECS);
- iomap_dio_submit_bio(dio, iomap, bio, pos);
+ iomap_dio_submit_bio(iter, dio, bio, pos);
pos += n;
} while (nr_pages);
@@ -355,7 +355,7 @@ zero_tail:
/* zero out from the end of the write to the end of the block */
pad = pos & (fs_block_size - 1);
if (pad)
- iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
+ iomap_dio_zero(iter, dio, pos, fs_block_size - pad);
}
out:
/* Undo iter limitation to current extent */
@@ -365,65 +365,67 @@ out:
return ret;
}
-static loff_t
-iomap_dio_hole_actor(loff_t length, struct iomap_dio *dio)
+static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter,
+ struct iomap_dio *dio)
{
- length = iov_iter_zero(length, dio->submit.iter);
+ loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter);
+
dio->size += length;
return length;
}
-static loff_t
-iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
- struct iomap_dio *dio, struct iomap *iomap)
+static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi,
+ struct iomap_dio *dio)
{
+ const struct iomap *iomap = &iomi->iomap;
struct iov_iter *iter = dio->submit.iter;
+ void *inline_data = iomap_inline_data(iomap, iomi->pos);
+ loff_t length = iomap_length(iomi);
+ loff_t pos = iomi->pos;
size_t copied;
- BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data));
+ if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap)))
+ return -EIO;
if (dio->flags & IOMAP_DIO_WRITE) {
- loff_t size = inode->i_size;
+ loff_t size = iomi->inode->i_size;
if (pos > size)
- memset(iomap->inline_data + size, 0, pos - size);
- copied = copy_from_iter(iomap->inline_data + pos, length, iter);
+ memset(iomap_inline_data(iomap, size), 0, pos - size);
+ copied = copy_from_iter(inline_data, length, iter);
if (copied) {
if (pos + copied > size)
- i_size_write(inode, pos + copied);
- mark_inode_dirty(inode);
+ i_size_write(iomi->inode, pos + copied);
+ mark_inode_dirty(iomi->inode);
}
} else {
- copied = copy_to_iter(iomap->inline_data + pos, length, iter);
+ copied = copy_to_iter(inline_data, length, iter);
}
dio->size += copied;
return copied;
}
-static loff_t
-iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
- void *data, struct iomap *iomap, struct iomap *srcmap)
+static loff_t iomap_dio_iter(const struct iomap_iter *iter,
+ struct iomap_dio *dio)
{
- struct iomap_dio *dio = data;
-
- switch (iomap->type) {
+ switch (iter->iomap.type) {
case IOMAP_HOLE:
if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
return -EIO;
- return iomap_dio_hole_actor(length, dio);
+ return iomap_dio_hole_iter(iter, dio);
case IOMAP_UNWRITTEN:
if (!(dio->flags & IOMAP_DIO_WRITE))
- return iomap_dio_hole_actor(length, dio);
- return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
+ return iomap_dio_hole_iter(iter, dio);
+ return iomap_dio_bio_iter(iter, dio);
case IOMAP_MAPPED:
- return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
+ return iomap_dio_bio_iter(iter, dio);
case IOMAP_INLINE:
- return iomap_dio_inline_actor(inode, pos, length, dio, iomap);
+ return iomap_dio_inline_iter(iter, dio);
case IOMAP_DELALLOC:
/*
* DIO is not serialised against mmap() access at all, and so
* if the page_mkwrite occurs between the writeback and the
- * iomap_apply() call in the DIO path, then it will see the
+ * iomap_iter() call in the DIO path, then it will see the
* DELALLOC block that the page-mkwrite allocated.
*/
pr_warn_ratelimited("Direct I/O collision with buffered writes! File: %pD4 Comm: %.20s\n",
@@ -454,16 +456,19 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = file_inode(iocb->ki_filp);
- size_t count = iov_iter_count(iter);
- loff_t pos = iocb->ki_pos;
- loff_t end = iocb->ki_pos + count - 1, ret = 0;
+ struct iomap_iter iomi = {
+ .inode = inode,
+ .pos = iocb->ki_pos,
+ .len = iov_iter_count(iter),
+ .flags = IOMAP_DIRECT,
+ };
+ loff_t end = iomi.pos + iomi.len - 1, ret = 0;
bool wait_for_completion =
is_sync_kiocb(iocb) || (dio_flags & IOMAP_DIO_FORCE_WAIT);
- unsigned int iomap_flags = IOMAP_DIRECT;
struct blk_plug plug;
struct iomap_dio *dio;
- if (!count)
+ if (!iomi.len)
return NULL;
dio = kmalloc(sizeof(*dio), GFP_KERNEL);
@@ -484,29 +489,30 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->submit.last_queue = NULL;
if (iov_iter_rw(iter) == READ) {
- if (pos >= dio->i_size)
+ if (iomi.pos >= dio->i_size)
goto out_free_dio;
if (iocb->ki_flags & IOCB_NOWAIT) {
- if (filemap_range_needs_writeback(mapping, pos, end)) {
+ if (filemap_range_needs_writeback(mapping, iomi.pos,
+ end)) {
ret = -EAGAIN;
goto out_free_dio;
}
- iomap_flags |= IOMAP_NOWAIT;
+ iomi.flags |= IOMAP_NOWAIT;
}
if (iter_is_iovec(iter))
dio->flags |= IOMAP_DIO_DIRTY;
} else {
- iomap_flags |= IOMAP_WRITE;
+ iomi.flags |= IOMAP_WRITE;
dio->flags |= IOMAP_DIO_WRITE;
if (iocb->ki_flags & IOCB_NOWAIT) {
- if (filemap_range_has_page(mapping, pos, end)) {
+ if (filemap_range_has_page(mapping, iomi.pos, end)) {
ret = -EAGAIN;
goto out_free_dio;
}
- iomap_flags |= IOMAP_NOWAIT;
+ iomi.flags |= IOMAP_NOWAIT;
}
/* for data sync or sync, we need sync completion processing */
@@ -525,12 +531,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
ret = -EAGAIN;
- if (pos >= dio->i_size || pos + count > dio->i_size)
+ if (iomi.pos >= dio->i_size ||
+ iomi.pos + iomi.len > dio->i_size)
goto out_free_dio;
- iomap_flags |= IOMAP_OVERWRITE_ONLY;
+ iomi.flags |= IOMAP_OVERWRITE_ONLY;
}
- ret = filemap_write_and_wait_range(mapping, pos, end);
+ ret = filemap_write_and_wait_range(mapping, iomi.pos, end);
if (ret)
goto out_free_dio;
@@ -540,9 +547,10 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
* If this invalidation fails, let the caller fall back to
* buffered I/O.
*/
- if (invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
- end >> PAGE_SHIFT)) {
- trace_iomap_dio_invalidate_fail(inode, pos, count);
+ if (invalidate_inode_pages2_range(mapping,
+ iomi.pos >> PAGE_SHIFT, end >> PAGE_SHIFT)) {
+ trace_iomap_dio_invalidate_fail(inode, iomi.pos,
+ iomi.len);
ret = -ENOTBLK;
goto out_free_dio;
}
@@ -557,31 +565,23 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
inode_dio_begin(inode);
blk_start_plug(&plug);
- do {
- ret = iomap_apply(inode, pos, count, iomap_flags, ops, dio,
- iomap_dio_actor);
- if (ret <= 0) {
- /* magic error code to fall back to buffered I/O */
- if (ret == -ENOTBLK) {
- wait_for_completion = true;
- ret = 0;
- }
- break;
- }
- pos += ret;
-
- if (iov_iter_rw(iter) == READ && pos >= dio->i_size) {
- /*
- * We only report that we've read data up to i_size.
- * Revert iter to a state corresponding to that as
- * some callers (such as splice code) rely on it.
- */
- iov_iter_revert(iter, pos - dio->i_size);
- break;
- }
- } while ((count = iov_iter_count(iter)) > 0);
+ while ((ret = iomap_iter(&iomi, ops)) > 0)
+ iomi.processed = iomap_dio_iter(&iomi, dio);
blk_finish_plug(&plug);
+ /*
+ * We only report that we've read data up to i_size.
+ * Revert iter to a state corresponding to that as some callers (such
+ * as the splice code) rely on it.
+ */
+ if (iov_iter_rw(iter) == READ && iomi.pos >= dio->i_size)
+ iov_iter_revert(iter, iomi.pos - dio->i_size);
+
+ /* magic error code to fall back to buffered I/O */
+ if (ret == -ENOTBLK) {
+ wait_for_completion = true;
+ ret = 0;
+ }
if (ret < 0)
iomap_dio_set_error(dio, ret);
diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c
index aab070df4a21..66cf267c68ae 100644
--- a/fs/iomap/fiemap.c
+++ b/fs/iomap/fiemap.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Copyright (c) 2016-2018 Christoph Hellwig.
+ * Copyright (c) 2016-2021 Christoph Hellwig.
*/
#include <linux/module.h>
#include <linux/compiler.h>
@@ -8,13 +8,8 @@
#include <linux/iomap.h>
#include <linux/fiemap.h>
-struct fiemap_ctx {
- struct fiemap_extent_info *fi;
- struct iomap prev;
-};
-
static int iomap_to_fiemap(struct fiemap_extent_info *fi,
- struct iomap *iomap, u32 flags)
+ const struct iomap *iomap, u32 flags)
{
switch (iomap->type) {
case IOMAP_HOLE:
@@ -43,24 +38,22 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
iomap->length, flags);
}
-static loff_t
-iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
- struct iomap *iomap, struct iomap *srcmap)
+static loff_t iomap_fiemap_iter(const struct iomap_iter *iter,
+ struct fiemap_extent_info *fi, struct iomap *prev)
{
- struct fiemap_ctx *ctx = data;
- loff_t ret = length;
+ int ret;
- if (iomap->type == IOMAP_HOLE)
- return length;
+ if (iter->iomap.type == IOMAP_HOLE)
+ return iomap_length(iter);
- ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0);
- ctx->prev = *iomap;
+ ret = iomap_to_fiemap(fi, prev, 0);
+ *prev = iter->iomap;
switch (ret) {
case 0: /* success */
- return length;
+ return iomap_length(iter);
case 1: /* extent array full */
return 0;
- default:
+ default: /* error */
return ret;
}
}
@@ -68,73 +61,63 @@ iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
u64 start, u64 len, const struct iomap_ops *ops)
{
- struct fiemap_ctx ctx;
- loff_t ret;
-
- memset(&ctx, 0, sizeof(ctx));
- ctx.fi = fi;
- ctx.prev.type = IOMAP_HOLE;
+ struct iomap_iter iter = {
+ .inode = inode,
+ .pos = start,
+ .len = len,
+ .flags = IOMAP_REPORT,
+ };
+ struct iomap prev = {
+ .type = IOMAP_HOLE,
+ };
+ int ret;
- ret = fiemap_prep(inode, fi, start, &len, 0);
+ ret = fiemap_prep(inode, fi, start, &iter.len, 0);
if (ret)
return ret;
- while (len > 0) {
- ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx,
- iomap_fiemap_actor);
- /* inode with no (attribute) mapping will give ENOENT */
- if (ret == -ENOENT)
- break;
- if (ret < 0)
- return ret;
- if (ret == 0)
- break;
-
- start += ret;
- len -= ret;
- }
+ while ((ret = iomap_iter(&iter, ops)) > 0)
+ iter.processed = iomap_fiemap_iter(&iter, fi, &prev);
- if (ctx.prev.type != IOMAP_HOLE) {
- ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST);
+ if (prev.type != IOMAP_HOLE) {
+ ret = iomap_to_fiemap(fi, &prev, FIEMAP_EXTENT_LAST);
if (ret < 0)
return ret;
}
+ /* inode with no (attribute) mapping will give ENOENT */
+ if (ret < 0 && ret != -ENOENT)
+ return ret;
return 0;
}
EXPORT_SYMBOL_GPL(iomap_fiemap);
-static loff_t
-iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length,
- void *data, struct iomap *iomap, struct iomap *srcmap)
-{
- sector_t *bno = data, addr;
-
- if (iomap->type == IOMAP_MAPPED) {
- addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits;
- *bno = addr;
- }
- return 0;
-}
-
/* legacy ->bmap interface. 0 is the error return (!) */
sector_t
iomap_bmap(struct address_space *mapping, sector_t bno,
const struct iomap_ops *ops)
{
- struct inode *inode = mapping->host;
- loff_t pos = bno << inode->i_blkbits;
- unsigned blocksize = i_blocksize(inode);
+ struct iomap_iter iter = {
+ .inode = mapping->host,
+ .pos = (loff_t)bno << mapping->host->i_blkbits,
+ .len = i_blocksize(mapping->host),
+ .flags = IOMAP_REPORT,
+ };
+ const unsigned int blkshift = mapping->host->i_blkbits - SECTOR_SHIFT;
int ret;
if (filemap_write_and_wait(mapping))
return 0;
bno = 0;
- ret = iomap_apply(inode, pos, blocksize, 0, ops, &bno,
- iomap_bmap_actor);
+ while ((ret = iomap_iter(&iter, ops)) > 0) {
+ if (iter.iomap.type == IOMAP_MAPPED)
+ bno = iomap_sector(&iter.iomap, iter.pos) >> blkshift;
+ /* leave iter.processed unset to abort loop */
+ }
if (ret)
return 0;
+
return bno;
}
EXPORT_SYMBOL_GPL(iomap_bmap);
diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c
new file mode 100644
index 000000000000..a1c7592d2ade
--- /dev/null
+++ b/fs/iomap/iter.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * Copyright (c) 2016-2021 Christoph Hellwig.
+ */
+#include <linux/fs.h>
+#include <linux/iomap.h>
+#include "trace.h"
+
+static inline int iomap_iter_advance(struct iomap_iter *iter)
+{
+ /* handle the previous iteration (if any) */
+ if (iter->iomap.length) {
+ if (iter->processed <= 0)
+ return iter->processed;
+ if (WARN_ON_ONCE(iter->processed > iomap_length(iter)))
+ return -EIO;
+ iter->pos += iter->processed;
+ iter->len -= iter->processed;
+ if (!iter->len)
+ return 0;
+ }
+
+ /* clear the state for the next iteration */
+ iter->processed = 0;
+ memset(&iter->iomap, 0, sizeof(iter->iomap));
+ memset(&iter->srcmap, 0, sizeof(iter->srcmap));
+ return 1;
+}
+
+static inline void iomap_iter_done(struct iomap_iter *iter)
+{
+ WARN_ON_ONCE(iter->iomap.offset > iter->pos);
+ WARN_ON_ONCE(iter->iomap.length == 0);
+ WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos);
+
+ trace_iomap_iter_dstmap(iter->inode, &iter->iomap);
+ if (iter->srcmap.type != IOMAP_HOLE)
+ trace_iomap_iter_srcmap(iter->inode, &iter->srcmap);
+}
+
+/**
+ * iomap_iter - iterate over ranges in a file
+ * @iter: iteration structure
+ * @ops: iomap ops provided by the file system
+ *
+ * Iterate over filesystem-provided space mappings for the provided file range.
+ *
+ * This function handles cleanup of resources acquired for iteration when the
+ * filesystem indicates there are no more space mappings, which means that this
+ * function must be called in a loop that continues as long as it returns a
+ * positive value. If 0 or a negative value is returned, the caller must not
+ * return to the loop body. Within a loop body, there are two ways to break out
+ * of the loop body: leave @iter.processed unchanged, or set it to a negative
+ * errno.
+ */
+int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops)
+{
+ int ret;
+
+ if (iter->iomap.length && ops->iomap_end) {
+ ret = ops->iomap_end(iter->inode, iter->pos, iomap_length(iter),
+ iter->processed > 0 ? iter->processed : 0,
+ iter->flags, &iter->iomap);
+ if (ret < 0 && !iter->processed)
+ return ret;
+ }
+
+ trace_iomap_iter(iter, ops, _RET_IP_);
+ ret = iomap_iter_advance(iter);
+ if (ret <= 0)
+ return ret;
+
+ ret = ops->iomap_begin(iter->inode, iter->pos, iter->len, iter->flags,
+ &iter->iomap, &iter->srcmap);
+ if (ret < 0)
+ return ret;
+ iomap_iter_done(iter);
+ return 1;
+}
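
The calling convention documented above is easiest to see from the caller's side. Below is a minimal sketch, not part of the patch: example_extent_iter() and example_walk() are hypothetical names, while iomap_iter(), iomap_length(), IOMAP_REPORT and struct iomap_iter are what this series introduces (the real conversions follow in the seek.c, fiemap.c and swapfile.c hunks below).

/*
 * Minimal sketch of an iomap_iter() caller, for illustration only.
 * example_extent_iter() and example_walk() are hypothetical; only
 * iomap_iter(), iomap_length() and struct iomap_iter come from this patch.
 */
#include <linux/fs.h>
#include <linux/iomap.h>

static loff_t example_extent_iter(const struct iomap_iter *iter)
{
	/* Returning iomap_length() marks the whole mapping as processed. */
	if (iter->iomap.type != IOMAP_HOLE)
		return iomap_length(iter);
	/* Leaving ->processed at 0 ends the walk after ->iomap_end runs. */
	return 0;
}

static int example_walk(struct inode *inode, loff_t pos, loff_t len,
			const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode	= inode,
		.pos	= pos,
		.len	= len,
		.flags	= IOMAP_REPORT,
	};
	int ret;

	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.processed = example_extent_iter(&iter);

	return ret;	/* 0 when iteration has finished, negative errno on error */
}
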
diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c
index ce6fb810854f..a845c012b50c 100644
--- a/fs/iomap/seek.c
+++ b/fs/iomap/seek.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2017 Red Hat, Inc.
- * Copyright (c) 2018 Christoph Hellwig.
+ * Copyright (c) 2018-2021 Christoph Hellwig.
*/
#include <linux/module.h>
#include <linux/compiler.h>
@@ -10,21 +10,20 @@
#include <linux/pagemap.h>
#include <linux/pagevec.h>
-static loff_t
-iomap_seek_hole_actor(struct inode *inode, loff_t start, loff_t length,
- void *data, struct iomap *iomap, struct iomap *srcmap)
+static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter,
+ loff_t *hole_pos)
{
- loff_t offset = start;
+ loff_t length = iomap_length(iter);
- switch (iomap->type) {
+ switch (iter->iomap.type) {
case IOMAP_UNWRITTEN:
- offset = mapping_seek_hole_data(inode->i_mapping, start,
- start + length, SEEK_HOLE);
- if (offset == start + length)
+ *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping,
+ iter->pos, iter->pos + length, SEEK_HOLE);
+ if (*hole_pos == iter->pos + length)
return length;
- fallthrough;
+ return 0;
case IOMAP_HOLE:
- *(loff_t *)data = offset;
+ *hole_pos = iter->pos;
return 0;
default:
return length;
@@ -32,70 +31,73 @@ iomap_seek_hole_actor(struct inode *inode, loff_t start, loff_t length,
}
loff_t
-iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
+iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops)
{
loff_t size = i_size_read(inode);
- loff_t ret;
+ struct iomap_iter iter = {
+ .inode = inode,
+ .pos = pos,
+ .flags = IOMAP_REPORT,
+ };
+ int ret;
/* Nothing to be found before or beyond the end of the file. */
- if (offset < 0 || offset >= size)
+ if (pos < 0 || pos >= size)
return -ENXIO;
- while (offset < size) {
- ret = iomap_apply(inode, offset, size - offset, IOMAP_REPORT,
- ops, &offset, iomap_seek_hole_actor);
- if (ret < 0)
- return ret;
- if (ret == 0)
- break;
- offset += ret;
- }
-
- return offset;
+ iter.len = size - pos;
+ while ((ret = iomap_iter(&iter, ops)) > 0)
+ iter.processed = iomap_seek_hole_iter(&iter, &pos);
+ if (ret < 0)
+ return ret;
+ if (iter.len) /* found hole before EOF */
+ return pos;
+ return size;
}
EXPORT_SYMBOL_GPL(iomap_seek_hole);
-static loff_t
-iomap_seek_data_actor(struct inode *inode, loff_t start, loff_t length,
- void *data, struct iomap *iomap, struct iomap *srcmap)
+static loff_t iomap_seek_data_iter(const struct iomap_iter *iter,
+ loff_t *hole_pos)
{
- loff_t offset = start;
+ loff_t length = iomap_length(iter);
- switch (iomap->type) {
+ switch (iter->iomap.type) {
case IOMAP_HOLE:
return length;
case IOMAP_UNWRITTEN:
- offset = mapping_seek_hole_data(inode->i_mapping, start,
- start + length, SEEK_DATA);
- if (offset < 0)
+ *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping,
+ iter->pos, iter->pos + length, SEEK_DATA);
+ if (*hole_pos < 0)
return length;
- fallthrough;
+ return 0;
default:
- *(loff_t *)data = offset;
+ *hole_pos = iter->pos;
return 0;
}
}
loff_t
-iomap_seek_data(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
+iomap_seek_data(struct inode *inode, loff_t pos, const struct iomap_ops *ops)
{
loff_t size = i_size_read(inode);
- loff_t ret;
+ struct iomap_iter iter = {
+ .inode = inode,
+ .pos = pos,
+ .flags = IOMAP_REPORT,
+ };
+ int ret;
/* Nothing to be found before or beyond the end of the file. */
- if (offset < 0 || offset >= size)
+ if (pos < 0 || pos >= size)
return -ENXIO;
- while (offset < size) {
- ret = iomap_apply(inode, offset, size - offset, IOMAP_REPORT,
- ops, &offset, iomap_seek_data_actor);
- if (ret < 0)
- return ret;
- if (ret == 0)
- return offset;
- offset += ret;
- }
-
+ iter.len = size - pos;
+ while ((ret = iomap_iter(&iter, ops)) > 0)
+ iter.processed = iomap_seek_data_iter(&iter, &pos);
+ if (ret < 0)
+ return ret;
+ if (iter.len) /* found data before EOF */
+ return pos;
/* We've reached the end of the file without finding data */
return -ENXIO;
}
diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c
index 6250ca6a1f85..5fc0ac36dee3 100644
--- a/fs/iomap/swapfile.c
+++ b/fs/iomap/swapfile.c
@@ -31,11 +31,16 @@ static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi)
{
struct iomap *iomap = &isi->iomap;
unsigned long nr_pages;
+ unsigned long max_pages;
uint64_t first_ppage;
uint64_t first_ppage_reported;
uint64_t next_ppage;
int error;
+ if (unlikely(isi->nr_pages >= isi->sis->max))
+ return 0;
+ max_pages = isi->sis->max - isi->nr_pages;
+
/*
* Round the start up and the end down so that the physical
* extent aligns to a page boundary.
@@ -48,6 +53,7 @@ static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi)
if (first_ppage >= next_ppage)
return 0;
nr_pages = next_ppage - first_ppage;
+ nr_pages = min(nr_pages, max_pages);
/*
* Calculate how much swap space we're adding; the first page contains
@@ -88,13 +94,9 @@ static int iomap_swapfile_fail(struct iomap_swapfile_info *isi, const char *str)
* swap only cares about contiguous page-aligned physical extents and makes no
* distinction between written and unwritten extents.
*/
-static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos,
- loff_t count, void *data, struct iomap *iomap,
- struct iomap *srcmap)
+static loff_t iomap_swapfile_iter(const struct iomap_iter *iter,
+ struct iomap *iomap, struct iomap_swapfile_info *isi)
{
- struct iomap_swapfile_info *isi = data;
- int error;
-
switch (iomap->type) {
case IOMAP_MAPPED:
case IOMAP_UNWRITTEN:
@@ -125,12 +127,12 @@ static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos,
isi->iomap.length += iomap->length;
} else {
/* Otherwise, add the retained iomap and store this one. */
- error = iomap_swapfile_add_extent(isi);
+ int error = iomap_swapfile_add_extent(isi);
if (error)
return error;
memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
}
- return count;
+ return iomap_length(iter);
}
/*
@@ -141,16 +143,19 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
struct file *swap_file, sector_t *pagespan,
const struct iomap_ops *ops)
{
+ struct inode *inode = swap_file->f_mapping->host;
+ struct iomap_iter iter = {
+ .inode = inode,
+ .pos = 0,
+ .len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE),
+ .flags = IOMAP_REPORT,
+ };
struct iomap_swapfile_info isi = {
.sis = sis,
.lowest_ppage = (sector_t)-1ULL,
.file = swap_file,
};
- struct address_space *mapping = swap_file->f_mapping;
- struct inode *inode = mapping->host;
- loff_t pos = 0;
- loff_t len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE);
- loff_t ret;
+ int ret;
/*
* Persist all file mapping metadata so that we won't have any
@@ -160,15 +165,10 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
if (ret)
return ret;
- while (len > 0) {
- ret = iomap_apply(inode, pos, len, IOMAP_REPORT,
- ops, &isi, iomap_swapfile_activate_actor);
- if (ret <= 0)
- return ret;
-
- pos += ret;
- len -= ret;
- }
+ while ((ret = iomap_iter(&iter, ops)) > 0)
+ iter.processed = iomap_swapfile_iter(&iter, &iter.iomap, &isi);
+ if (ret < 0)
+ return ret;
if (isi.iomap.length) {
ret = iomap_swapfile_add_extent(&isi);
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index fdc7ae388476..65e39785c284 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -1,9 +1,18 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
- * Copyright (c) 2009-2019 Christoph Hellwig
+ * Copyright (c) 2009-2021 Christoph Hellwig
*
- * NOTE: none of these tracepoints shall be consider a stable kernel ABI
+ * NOTE: none of these tracepoints shall be considered a stable kernel ABI
* as they can change at any time.
+ *
+ * Current conventions for printing numbers measuring specific units:
+ *
+ * offset: byte offset into a subcomponent of a file operation
+ * pos: file offset, in bytes
+ * length: length of a file operation, in bytes
+ * ino: inode number
+ *
+ * Numbers describing space allocations should be formatted in hexadecimal.
*/
#undef TRACE_SYSTEM
#define TRACE_SYSTEM iomap
@@ -42,14 +51,14 @@ DEFINE_READPAGE_EVENT(iomap_readpage);
DEFINE_READPAGE_EVENT(iomap_readahead);
DECLARE_EVENT_CLASS(iomap_range_class,
- TP_PROTO(struct inode *inode, unsigned long off, unsigned int len),
+ TP_PROTO(struct inode *inode, loff_t off, u64 len),
TP_ARGS(inode, off, len),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(u64, ino)
__field(loff_t, size)
- __field(unsigned long, offset)
- __field(unsigned int, length)
+ __field(loff_t, offset)
+ __field(u64, length)
),
TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
@@ -58,8 +67,7 @@ DECLARE_EVENT_CLASS(iomap_range_class,
__entry->offset = off;
__entry->length = len;
),
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset %lx "
- "length %x",
+ TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx length 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
@@ -69,7 +77,7 @@ DECLARE_EVENT_CLASS(iomap_range_class,
#define DEFINE_RANGE_EVENT(name) \
DEFINE_EVENT(iomap_range_class, name, \
- TP_PROTO(struct inode *inode, unsigned long off, unsigned int len),\
+ TP_PROTO(struct inode *inode, loff_t off, u64 len),\
TP_ARGS(inode, off, len))
DEFINE_RANGE_EVENT(iomap_writepage);
DEFINE_RANGE_EVENT(iomap_releasepage);
@@ -122,8 +130,8 @@ DECLARE_EVENT_CLASS(iomap_class,
__entry->flags = iomap->flags;
__entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0;
),
- TP_printk("dev %d:%d ino 0x%llx bdev %d:%d addr %lld offset %lld "
- "length %llu type %s flags %s",
+ TP_printk("dev %d:%d ino 0x%llx bdev %d:%d addr 0x%llx offset 0x%llx "
+ "length 0x%llx type %s flags %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
MAJOR(__entry->bdev), MINOR(__entry->bdev),
@@ -138,36 +146,32 @@ DECLARE_EVENT_CLASS(iomap_class,
DEFINE_EVENT(iomap_class, name, \
TP_PROTO(struct inode *inode, struct iomap *iomap), \
TP_ARGS(inode, iomap))
-DEFINE_IOMAP_EVENT(iomap_apply_dstmap);
-DEFINE_IOMAP_EVENT(iomap_apply_srcmap);
+DEFINE_IOMAP_EVENT(iomap_iter_dstmap);
+DEFINE_IOMAP_EVENT(iomap_iter_srcmap);
-TRACE_EVENT(iomap_apply,
- TP_PROTO(struct inode *inode, loff_t pos, loff_t length,
- unsigned int flags, const void *ops, void *actor,
- unsigned long caller),
- TP_ARGS(inode, pos, length, flags, ops, actor, caller),
+TRACE_EVENT(iomap_iter,
+ TP_PROTO(struct iomap_iter *iter, const void *ops,
+ unsigned long caller),
+ TP_ARGS(iter, ops, caller),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(u64, ino)
__field(loff_t, pos)
- __field(loff_t, length)
+ __field(u64, length)
__field(unsigned int, flags)
__field(const void *, ops)
- __field(void *, actor)
__field(unsigned long, caller)
),
TP_fast_assign(
- __entry->dev = inode->i_sb->s_dev;
- __entry->ino = inode->i_ino;
- __entry->pos = pos;
- __entry->length = length;
- __entry->flags = flags;
+ __entry->dev = iter->inode->i_sb->s_dev;
+ __entry->ino = iter->inode->i_ino;
+ __entry->pos = iter->pos;
+ __entry->length = iomap_length(iter);
+ __entry->flags = iter->flags;
__entry->ops = ops;
- __entry->actor = actor;
__entry->caller = caller;
),
- TP_printk("dev %d:%d ino 0x%llx pos %lld length %lld flags %s (0x%x) "
- "ops %ps caller %pS actor %ps",
+ TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx flags %s (0x%x) ops %ps caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->pos,
@@ -175,8 +179,7 @@ TRACE_EVENT(iomap_apply,
__print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS),
__entry->flags,
__entry->ops,
- (void *)__entry->caller,
- __entry->actor)
+ (void *)__entry->caller)
);
#endif /* _IOMAP_TRACE_H */
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 21edc423b79f..678e2c51b855 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -155,7 +155,6 @@ struct iso9660_options{
unsigned int overriderockperm:1;
unsigned int uid_set:1;
unsigned int gid_set:1;
- unsigned int utf8:1;
unsigned char map;
unsigned char check;
unsigned int blocksize;
@@ -356,7 +355,6 @@ static int parse_options(char *options, struct iso9660_options *popt)
popt->gid = GLOBAL_ROOT_GID;
popt->uid = GLOBAL_ROOT_UID;
popt->iocharset = NULL;
- popt->utf8 = 0;
popt->overriderockperm = 0;
popt->session=-1;
popt->sbsector=-1;
@@ -389,10 +387,13 @@ static int parse_options(char *options, struct iso9660_options *popt)
case Opt_cruft:
popt->cruft = 1;
break;
+#ifdef CONFIG_JOLIET
case Opt_utf8:
- popt->utf8 = 1;
+ kfree(popt->iocharset);
+ popt->iocharset = kstrdup("utf8", GFP_KERNEL);
+ if (!popt->iocharset)
+ return 0;
break;
-#ifdef CONFIG_JOLIET
case Opt_iocharset:
kfree(popt->iocharset);
popt->iocharset = match_strdup(&args[0]);
@@ -495,7 +496,6 @@ static int isofs_show_options(struct seq_file *m, struct dentry *root)
if (sbi->s_nocompress) seq_puts(m, ",nocompress");
if (sbi->s_overriderockperm) seq_puts(m, ",overriderockperm");
if (sbi->s_showassoc) seq_puts(m, ",showassoc");
- if (sbi->s_utf8) seq_puts(m, ",utf8");
if (sbi->s_check) seq_printf(m, ",check=%c", sbi->s_check);
if (sbi->s_mapping) seq_printf(m, ",map=%c", sbi->s_mapping);
@@ -518,9 +518,10 @@ static int isofs_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",fmode=%o", sbi->s_fmode);
#ifdef CONFIG_JOLIET
- if (sbi->s_nls_iocharset &&
- strcmp(sbi->s_nls_iocharset->charset, CONFIG_NLS_DEFAULT) != 0)
+ if (sbi->s_nls_iocharset)
seq_printf(m, ",iocharset=%s", sbi->s_nls_iocharset->charset);
+ else
+ seq_puts(m, ",iocharset=utf8");
#endif
return 0;
}
@@ -863,14 +864,13 @@ root_found:
sbi->s_nls_iocharset = NULL;
#ifdef CONFIG_JOLIET
- if (joliet_level && opt.utf8 == 0) {
+ if (joliet_level) {
char *p = opt.iocharset ? opt.iocharset : CONFIG_NLS_DEFAULT;
- sbi->s_nls_iocharset = load_nls(p);
- if (! sbi->s_nls_iocharset) {
- /* Fail only if explicit charset specified */
- if (opt.iocharset)
+ if (strcmp(p, "utf8") != 0) {
+ sbi->s_nls_iocharset = opt.iocharset ?
+ load_nls(opt.iocharset) : load_nls_default();
+ if (!sbi->s_nls_iocharset)
goto out_freesbi;
- sbi->s_nls_iocharset = load_nls_default();
}
}
#endif
@@ -886,7 +886,6 @@ root_found:
sbi->s_gid = opt.gid;
sbi->s_uid_set = opt.uid_set;
sbi->s_gid_set = opt.gid_set;
- sbi->s_utf8 = opt.utf8;
sbi->s_nocompress = opt.nocompress;
sbi->s_overriderockperm = opt.overriderockperm;
/*
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index 055ec6c586f7..dcdc191ed183 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -44,7 +44,6 @@ struct isofs_sb_info {
unsigned char s_session;
unsigned int s_high_sierra:1;
unsigned int s_rock:2;
- unsigned int s_utf8:1;
unsigned int s_cruft:1; /* Broken disks with high byte of length
* containing junk */
unsigned int s_nocompress:1;
diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c
index be8b6a9d0b92..c0f04a1e7f69 100644
--- a/fs/isofs/joliet.c
+++ b/fs/isofs/joliet.c
@@ -41,14 +41,12 @@ uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls)
int
get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode)
{
- unsigned char utf8;
struct nls_table *nls;
unsigned char len = 0;
- utf8 = ISOFS_SB(inode->i_sb)->s_utf8;
nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset;
- if (utf8) {
+ if (!nls) {
len = utf16s_to_utf8s((const wchar_t *) de->name,
de->name_len[0] >> 1, UTF16_BIG_ENDIAN,
outname, PAGE_SIZE);
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index d47a0d96bf30..8ca3527189f8 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -179,8 +179,8 @@ static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf)
if (!jbd2_journal_has_csum_v2or3(j))
return 1;
- tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize -
- sizeof(struct jbd2_journal_block_tail));
+ tail = (struct jbd2_journal_block_tail *)((char *)buf +
+ j->j_blocksize - sizeof(struct jbd2_journal_block_tail));
provided = tail->t_checksum;
tail->t_checksum = 0;
calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
@@ -196,7 +196,7 @@ static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf)
static int count_tags(journal_t *journal, struct buffer_head *bh)
{
char * tagp;
- journal_block_tag_t * tag;
+ journal_block_tag_t tag;
int nr = 0, size = journal->j_blocksize;
int tag_bytes = journal_tag_bytes(journal);
@@ -206,14 +206,14 @@ static int count_tags(journal_t *journal, struct buffer_head *bh)
tagp = &bh->b_data[sizeof(journal_header_t)];
while ((tagp - bh->b_data + tag_bytes) <= size) {
- tag = (journal_block_tag_t *) tagp;
+ memcpy(&tag, tagp, sizeof(tag));
nr++;
tagp += tag_bytes;
- if (!(tag->t_flags & cpu_to_be16(JBD2_FLAG_SAME_UUID)))
+ if (!(tag.t_flags & cpu_to_be16(JBD2_FLAG_SAME_UUID)))
tagp += 16;
- if (tag->t_flags & cpu_to_be16(JBD2_FLAG_LAST_TAG))
+ if (tag.t_flags & cpu_to_be16(JBD2_FLAG_LAST_TAG))
break;
}
@@ -433,9 +433,9 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
}
static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
+ journal_block_tag3_t *tag3,
void *buf, __u32 sequence)
{
- journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
__u32 csum32;
__be32 seq;
@@ -496,7 +496,7 @@ static int do_one_pass(journal_t *journal,
while (1) {
int flags;
char * tagp;
- journal_block_tag_t * tag;
+ journal_block_tag_t tag;
struct buffer_head * obh;
struct buffer_head * nbh;
@@ -613,8 +613,8 @@ static int do_one_pass(journal_t *journal,
<= journal->j_blocksize - descr_csum_size) {
unsigned long io_block;
- tag = (journal_block_tag_t *) tagp;
- flags = be16_to_cpu(tag->t_flags);
+ memcpy(&tag, tagp, sizeof(tag));
+ flags = be16_to_cpu(tag.t_flags);
io_block = next_log_block++;
wrap(journal, next_log_block);
@@ -632,7 +632,7 @@ static int do_one_pass(journal_t *journal,
J_ASSERT(obh != NULL);
blocknr = read_tag_block(journal,
- tag);
+ &tag);
/* If the block has been
* revoked, then we're all done
@@ -647,8 +647,8 @@ static int do_one_pass(journal_t *journal,
/* Look for block corruption */
if (!jbd2_block_tag_csum_verify(
- journal, tag, obh->b_data,
- be32_to_cpu(tmp->h_sequence))) {
+ journal, &tag, (journal_block_tag3_t *)tagp,
+ obh->b_data, be32_to_cpu(tmp->h_sequence))) {
brelse(obh);
success = -EFSBADCRC;
printk(KERN_ERR "JBD2: Invalid "
@@ -760,7 +760,6 @@ static int do_one_pass(journal_t *journal,
*/
jbd_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n",
next_commit_ID);
- err = 0;
brelse(bh);
goto done;
}
@@ -897,7 +896,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
{
jbd2_journal_revoke_header_t *header;
int offset, max;
- int csum_size = 0;
+ unsigned csum_size = 0;
__u32 rcount;
int record_len = 4;
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 8804e126805f..6a3caedd2285 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -223,9 +223,15 @@ static void sub_reserved_credits(journal_t *journal, int blocks)
* with j_state_lock held for reading. Returns 0 if handle joined the running
* transaction. Returns 1 if we had to wait, j_state_lock is dropped, and
* caller must retry.
+ *
+ * Note: because j_state_lock may be dropped depending on the return
+ * value, we need to fake out sparse so ti doesn't complain about a
+ * locking imbalance. Callers of add_transaction_credits will need to
+ * make a similar accomodation.
*/
static int add_transaction_credits(journal_t *journal, int blocks,
int rsv_blocks)
+__must_hold(&journal->j_state_lock)
{
transaction_t *t = journal->j_running_transaction;
int needed;
@@ -238,6 +244,7 @@ static int add_transaction_credits(journal_t *journal, int blocks,
if (t->t_state != T_RUNNING) {
WARN_ON_ONCE(t->t_state >= T_FLUSH);
wait_transaction_locked(journal);
+ __acquire(&journal->j_state_lock); /* fake out sparse */
return 1;
}
@@ -266,10 +273,12 @@ static int add_transaction_credits(journal_t *journal, int blocks,
wait_event(journal->j_wait_reserved,
atomic_read(&journal->j_reserved_credits) + total <=
journal->j_max_transaction_buffers);
+ __acquire(&journal->j_state_lock); /* fake out sparse */
return 1;
}
wait_transaction_locked(journal);
+ __acquire(&journal->j_state_lock); /* fake out sparse */
return 1;
}
@@ -293,6 +302,7 @@ static int add_transaction_credits(journal_t *journal, int blocks,
journal->j_max_transaction_buffers)
__jbd2_log_wait_for_space(journal);
write_unlock(&journal->j_state_lock);
+ __acquire(&journal->j_state_lock); /* fake out sparse */
return 1;
}
@@ -310,6 +320,7 @@ static int add_transaction_credits(journal_t *journal, int blocks,
wait_event(journal->j_wait_reserved,
atomic_read(&journal->j_reserved_credits) + rsv_blocks
<= journal->j_max_transaction_buffers / 2);
+ __acquire(&journal->j_state_lock); /* fake out sparse */
return 1;
}
return 0;
@@ -413,8 +424,14 @@ repeat:
if (!handle->h_reserved) {
/* We may have dropped j_state_lock - restart in that case */
- if (add_transaction_credits(journal, blocks, rsv_blocks))
+ if (add_transaction_credits(journal, blocks, rsv_blocks)) {
+ /*
+ * add_transaction_credits releases
+ * j_state_lock on a non-zero return
+ */
+ __release(&journal->j_state_lock);
goto repeat;
+ }
} else {
/*
* We have handle reserved so we are allowed to join T_LOCKED
@@ -1404,7 +1421,7 @@ void jbd2_journal_set_triggers(struct buffer_head *bh,
{
struct journal_head *jh = jbd2_journal_grab_journal_head(bh);
- if (WARN_ON(!jh))
+ if (WARN_ON_ONCE(!jh))
return;
jh->b_triggers = type;
jbd2_journal_put_journal_head(jh);
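
The sparse fake-out described in the add_transaction_credits() comment earlier in this fs/jbd2/transaction.c diff is a general pattern for functions that may drop a lock their caller holds. The following standalone sketch is illustrative only: example_lock, example_step(), example_caller() and the stub helpers are hypothetical, while __must_hold(), __acquire() and __release() are the real sparse context annotations used by the patch.

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_lock);

static bool example_need_to_wait(void) { return false; }	/* hypothetical stub */
static void example_wait(void) { }				/* hypothetical stub */

/* Returns 1 if the lock was dropped and the caller must retry. */
static int example_step(void)
__must_hold(&example_lock)
{
	if (example_need_to_wait()) {
		spin_unlock(&example_lock);
		example_wait();
		__acquire(&example_lock);	/* fake out sparse: exit state stays "locked" */
		return 1;
	}
	return 0;
}

static void example_caller(void)
{
repeat:
	spin_lock(&example_lock);
	if (example_step()) {
		/* example_step() really dropped the lock on a non-zero return */
		__release(&example_lock);
		goto repeat;
	}
	/* ... work done under example_lock ... */
	spin_unlock(&example_lock);
}
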
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 55a79df70d24..e945e3484788 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -173,12 +173,15 @@ static void *jffs2_acl_to_medium(const struct posix_acl *acl, size_t *size)
return ERR_PTR(-EINVAL);
}
-struct posix_acl *jffs2_get_acl(struct inode *inode, int type)
+struct posix_acl *jffs2_get_acl(struct inode *inode, int type, bool rcu)
{
struct posix_acl *acl;
char *value = NULL;
int rc, xprefix;
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
switch (type) {
case ACL_TYPE_ACCESS:
xprefix = JFFS2_XPREFIX_ACL_ACCESS;
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index 62c50da9d493..9d9fb7cf093e 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -27,7 +27,7 @@ struct jffs2_acl_header {
#ifdef CONFIG_JFFS2_FS_POSIX_ACL
-struct posix_acl *jffs2_get_acl(struct inode *inode, int type);
+struct posix_acl *jffs2_get_acl(struct inode *inode, int type, bool rcu);
int jffs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct posix_acl *acl, int type);
extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *);
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 43c285c3d2a7..a653f34c6e26 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -14,13 +14,16 @@
#include "jfs_xattr.h"
#include "jfs_acl.h"
-struct posix_acl *jfs_get_acl(struct inode *inode, int type)
+struct posix_acl *jfs_get_acl(struct inode *inode, int type, bool rcu)
{
struct posix_acl *acl;
char *ea_name;
int size;
char *value = NULL;
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
switch(type) {
case ACL_TYPE_ACCESS:
ea_name = XATTR_NAME_POSIX_ACL_ACCESS;
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index 7ae389a7a366..3de40286d31f 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -7,7 +7,7 @@
#ifdef CONFIG_JFS_POSIX_ACL
-struct posix_acl *jfs_get_acl(struct inode *inode, int type);
+struct posix_acl *jfs_get_acl(struct inode *inode, int type, bool rcu);
int jfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct posix_acl *acl, int type);
int jfs_init_acl(tid_t, struct inode *, struct inode *);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 33166ec90a11..ba581429bf7b 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -17,7 +17,7 @@
#include "kernfs-internal.h"
-DEFINE_MUTEX(kernfs_mutex);
+DECLARE_RWSEM(kernfs_rwsem);
static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */
static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */
static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */
@@ -26,7 +26,7 @@ static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */
static bool kernfs_active(struct kernfs_node *kn)
{
- lockdep_assert_held(&kernfs_mutex);
+ lockdep_assert_held(&kernfs_rwsem);
return atomic_read(&kn->active) >= 0;
}
@@ -340,7 +340,7 @@ static int kernfs_sd_compare(const struct kernfs_node *left,
* @kn->parent->dir.children.
*
* Locking:
- * mutex_lock(kernfs_mutex)
+ * kernfs_rwsem held exclusive
*
* RETURNS:
* 0 on susccess -EEXIST on failure.
@@ -372,6 +372,7 @@ static int kernfs_link_sibling(struct kernfs_node *kn)
/* successfully added, account subdir number */
if (kernfs_type(kn) == KERNFS_DIR)
kn->parent->dir.subdirs++;
+ kernfs_inc_rev(kn->parent);
return 0;
}
@@ -385,7 +386,7 @@ static int kernfs_link_sibling(struct kernfs_node *kn)
* removed, %false if @kn wasn't on the rbtree.
*
* Locking:
- * mutex_lock(kernfs_mutex)
+ * kernfs_rwsem held exclusive
*/
static bool kernfs_unlink_sibling(struct kernfs_node *kn)
{
@@ -394,6 +395,7 @@ static bool kernfs_unlink_sibling(struct kernfs_node *kn)
if (kernfs_type(kn) == KERNFS_DIR)
kn->parent->dir.subdirs--;
+ kernfs_inc_rev(kn->parent);
rb_erase(&kn->rb, &kn->parent->dir.children);
RB_CLEAR_NODE(&kn->rb);
@@ -455,14 +457,14 @@ void kernfs_put_active(struct kernfs_node *kn)
* return after draining is complete.
*/
static void kernfs_drain(struct kernfs_node *kn)
- __releases(&kernfs_mutex) __acquires(&kernfs_mutex)
+ __releases(&kernfs_rwsem) __acquires(&kernfs_rwsem)
{
struct kernfs_root *root = kernfs_root(kn);
- lockdep_assert_held(&kernfs_mutex);
+ lockdep_assert_held_write(&kernfs_rwsem);
WARN_ON_ONCE(kernfs_active(kn));
- mutex_unlock(&kernfs_mutex);
+ up_write(&kernfs_rwsem);
if (kernfs_lockdep(kn)) {
rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
@@ -481,7 +483,7 @@ static void kernfs_drain(struct kernfs_node *kn)
kernfs_drain_open_files(kn);
- mutex_lock(&kernfs_mutex);
+ down_write(&kernfs_rwsem);
}
/**
@@ -720,7 +722,7 @@ int kernfs_add_one(struct kernfs_node *kn)
bool has_ns;
int ret;
- mutex_lock(&kernfs_mutex);
+ down_write(&kernfs_rwsem);
ret = -EINVAL;
has_ns = kernfs_ns_enabled(parent);
@@ -751,7 +753,7 @@ int kernfs_add_one(struct kernfs_node *kn)
ps_iattr->ia_mtime = ps_iattr->ia_ctime;
}
- mutex_unlock(&kernfs_mutex);
+ up_write(&kernfs_rwsem);
/*
* Activate the new node unless CREATE_DEACTIVATED is requested.
@@ -765,7 +767,7 @@ int kernfs_add_one(struct kernfs_node *kn)
return 0;
out_unlock:
- mutex_unlock(&kernfs_mutex);
+ up_write(&kernfs_rwsem);
return ret;
}
@@ -786,7 +788,7 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
bool has_ns = kernfs_ns_enabled(parent);
unsigned int hash;
- lockdep_assert_held(&kernfs_mutex);
+ lockdep_assert_held(&kernfs_rwsem);
if (has_ns != (bool)ns) {
WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
@@ -818,7 +820,7 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
size_t len;
char *p, *name;
- lockdep_assert_held(&kernfs_mutex);
+ lockdep_assert_held_read(&kernfs_rwsem);
/* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */
spin_lock_irq(&kernfs_rename_lock);
@@ -858,10 +860,10 @@ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
{
struct kernfs_node *kn;
- mutex_lock(&kernfs_mutex);
+ down_read(&kernfs_rwsem);
kn = kernfs_find_ns(parent, name, ns);
kernfs_get(kn);
- mutex_unlock(&kernfs_mutex);
+ up_read(&kernfs_rwsem);
return kn;
}
@@ -882,10 +884,10 @@ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent,
{
struct kernfs_node *kn;
- mutex_lock(&kernfs_mutex);
+ down_read(&kernfs_rwsem);
kn = kernfs_walk_ns(parent, path, ns);
kernfs_get(kn);
- mutex_unlock(&kernfs_mutex);
+ up_read(&kernfs_rwsem);
return kn;
}
@@ -1037,12 +1039,34 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
if (flags & LOOKUP_RCU)
return -ECHILD;
- /* Always perform fresh lookup for negatives */
- if (d_really_is_negative(dentry))
- goto out_bad_unlocked;
+ /* Negative hashed dentry? */
+ if (d_really_is_negative(dentry)) {
+ struct kernfs_node *parent;
+
+ /* If the kernfs parent node has changed, discard and
+ * proceed to ->lookup.
+ */
+ down_read(&kernfs_rwsem);
+ spin_lock(&dentry->d_lock);
+ parent = kernfs_dentry_node(dentry->d_parent);
+ if (parent) {
+ if (kernfs_dir_changed(parent, dentry)) {
+ spin_unlock(&dentry->d_lock);
+ up_read(&kernfs_rwsem);
+ return 0;
+ }
+ }
+ spin_unlock(&dentry->d_lock);
+ up_read(&kernfs_rwsem);
+
+ /* The kernfs parent node hasn't changed, leave the
+ * dentry negative and return success.
+ */
+ return 1;
+ }
kn = kernfs_dentry_node(dentry);
- mutex_lock(&kernfs_mutex);
+ down_read(&kernfs_rwsem);
/* The kernfs node has been deactivated */
if (!kernfs_active(kn))
@@ -1061,11 +1085,10 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
kernfs_info(dentry->d_sb)->ns != kn->ns)
goto out_bad;
- mutex_unlock(&kernfs_mutex);
+ up_read(&kernfs_rwsem);
return 1;
out_bad:
- mutex_unlock(&kernfs_mutex);
-out_bad_unlocked:
+ up_read(&kernfs_rwsem);
return 0;
}
@@ -1077,37 +1100,29 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
struct dentry *dentry,
unsigned int flags)
{
- struct dentry *ret;
struct kernfs_node *parent = dir->i_private;
struct kernfs_node *kn;
- struct inode *inode;
+ struct inode *inode = NULL;
const void *ns = NULL;
- mutex_lock(&kernfs_mutex);
-
+ down_read(&kernfs_rwsem);
if (kernfs_ns_enabled(parent))
ns = kernfs_info(dir->i_sb)->ns;
kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
-
- /* no such entry */
- if (!kn || !kernfs_active(kn)) {
- ret = NULL;
- goto out_unlock;
- }
-
/* attach dentry and inode */
- inode = kernfs_get_inode(dir->i_sb, kn);
- if (!inode) {
- ret = ERR_PTR(-ENOMEM);
- goto out_unlock;
+ if (kn && kernfs_active(kn)) {
+ inode = kernfs_get_inode(dir->i_sb, kn);
+ if (!inode)
+ inode = ERR_PTR(-ENOMEM);
}
+ /* Needed only for negative dentry validation */
+ if (!inode)
+ kernfs_set_rev(parent, dentry);
+ up_read(&kernfs_rwsem);
- /* instantiate and hash dentry */
- ret = d_splice_alias(inode, dentry);
- out_unlock:
- mutex_unlock(&kernfs_mutex);
- return ret;
+ /* instantiate and hash (possibly negative) dentry */
+ return d_splice_alias(inode, dentry);
}
static int kernfs_iop_mkdir(struct user_namespace *mnt_userns,
@@ -1227,7 +1242,7 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
{
struct rb_node *rbn;
- lockdep_assert_held(&kernfs_mutex);
+ lockdep_assert_held_write(&kernfs_rwsem);
/* if first iteration, visit leftmost descendant which may be root */
if (!pos)
@@ -1263,7 +1278,7 @@ void kernfs_activate(struct kernfs_node *kn)
{
struct kernfs_node *pos;
- mutex_lock(&kernfs_mutex);
+ down_write(&kernfs_rwsem);
pos = NULL;
while ((pos = kernfs_next_descendant_post(pos, kn))) {
@@ -1277,14 +1292,14 @@ void kernfs_activate(struct kernfs_node *kn)
pos->flags |= KERNFS_ACTIVATED;
}
- mutex_unlock(&kernfs_mutex);
+ up_write(&kernfs_rwsem);
}
static void __kernfs_remove(struct kernfs_node *kn)
{
struct kernfs_node *pos;
- lockdep_assert_held(&kernfs_mutex);
+ lockdep_assert_held_write(&kernfs_rwsem);
/*
* Short-circuit if non-root @kn has already finished removal.
@@ -1307,7 +1322,7 @@ static void __kernfs_remove(struct kernfs_node *kn)
pos = kernfs_leftmost_descendant(kn);
/*
- * kernfs_drain() drops kernfs_mutex temporarily and @pos's
+ * kernfs_drain() drops kernfs_rwsem temporarily and @pos's
* base ref could have been put by someone else by the time
* the function returns. Make sure it doesn't go away
* underneath us.
@@ -1354,9 +1369,9 @@ static void __kernfs_remove(struct kernfs_node *kn)
*/
void kernfs_remove(struct kernfs_node *kn)
{
- mutex_lock(&kernfs_mutex);
+ down_write(&kernfs_rwsem);
__kernfs_remove(kn);
- mutex_unlock(&kernfs_mutex);
+ up_write(&kernfs_rwsem);
}
/**
@@ -1443,17 +1458,17 @@ bool kernfs_remove_self(struct kernfs_node *kn)
{
bool ret;
- mutex_lock(&kernfs_mutex);
+ down_write(&kernfs_rwsem);
kernfs_break_active_protection(kn);
/*
* SUICIDAL is used to arbitrate among competing invocations. Only
* the first one will actually perform removal. When the removal
* is complete, SUICIDED is set and the active ref is restored
- * while holding kernfs_mutex. The ones which lost arbitration
- * waits for SUICDED && drained which can happen only after the
- * enclosing kernfs operation which executed the winning instance
- * of kernfs_remove_self() finished.
+ * while kernfs_rwsem is held exclusive. The ones which lost
+ * arbitration wait for SUICIDED && drained, which can happen only
+ * after the enclosing kernfs operation which executed the winning
+ * instance of kernfs_remove_self() finished.
*/
if (!(kn->flags & KERNFS_SUICIDAL)) {
kn->flags |= KERNFS_SUICIDAL;
@@ -1471,9 +1486,9 @@ bool kernfs_remove_self(struct kernfs_node *kn)
atomic_read(&kn->active) == KN_DEACTIVATED_BIAS)
break;
- mutex_unlock(&kernfs_mutex);
+ up_write(&kernfs_rwsem);
schedule();
- mutex_lock(&kernfs_mutex);
+ down_write(&kernfs_rwsem);
}
finish_wait(waitq, &wait);
WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb));
@@ -1481,12 +1496,12 @@ bool kernfs_remove_self(struct kernfs_node *kn)
}
/*
- * This must be done while holding kernfs_mutex; otherwise, waiting
- * for SUICIDED && deactivated could finish prematurely.
+ * This must be done while kernfs_rwsem is held exclusive; otherwise,
+ * waiting for SUICIDED && deactivated could finish prematurely.
*/
kernfs_unbreak_active_protection(kn);
- mutex_unlock(&kernfs_mutex);
+ up_write(&kernfs_rwsem);
return ret;
}
@@ -1510,13 +1525,13 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
return -ENOENT;
}
- mutex_lock(&kernfs_mutex);
+ down_write(&kernfs_rwsem);
kn = kernfs_find_ns(parent, name, ns);
if (kn)
__kernfs_remove(kn);
- mutex_unlock(&kernfs_mutex);
+ up_write(&kernfs_rwsem);
if (kn)
return 0;
@@ -1542,7 +1557,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
if (!kn->parent)
return -EINVAL;
- mutex_lock(&kernfs_mutex);
+ down_write(&kernfs_rwsem);
error = -ENOENT;
if (!kernfs_active(kn) || !kernfs_active(new_parent) ||
@@ -1596,7 +1611,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
error = 0;
out:
- mutex_unlock(&kernfs_mutex);
+ up_write(&kernfs_rwsem);
return error;
}
@@ -1671,7 +1686,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
if (!dir_emit_dots(file, ctx))
return 0;
- mutex_lock(&kernfs_mutex);
+ down_read(&kernfs_rwsem);
if (kernfs_ns_enabled(parent))
ns = kernfs_info(dentry->d_sb)->ns;
@@ -1688,12 +1703,12 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
file->private_data = pos;
kernfs_get(pos);
- mutex_unlock(&kernfs_mutex);
+ up_read(&kernfs_rwsem);
if (!dir_emit(ctx, name, len, ino, type))
return 0;
- mutex_lock(&kernfs_mutex);
+ down_read(&kernfs_rwsem);
}
- mutex_unlock(&kernfs_mutex);
+ up_read(&kernfs_rwsem);
file->private_data = NULL;
ctx->pos = INT_MAX;
return 0;
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index c75719312147..60e2a86c535e 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -860,7 +860,7 @@ repeat:
spin_unlock_irq(&kernfs_notify_lock);
/* kick fsnotify */
- mutex_lock(&kernfs_mutex);
+ down_write(&kernfs_rwsem);
list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
struct kernfs_node *parent;
@@ -898,7 +898,7 @@ repeat:
iput(inode);
}
- mutex_unlock(&kernfs_mutex);
+ up_write(&kernfs_rwsem);
kernfs_put(kn);
goto repeat;
}
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 26f2aa3586f9..c0eae1725435 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -100,9 +100,9 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
{
int ret;
- mutex_lock(&kernfs_mutex);
+ down_write(&kernfs_rwsem);
ret = __kernfs_setattr(kn, iattr);
- mutex_unlock(&kernfs_mutex);
+ up_write(&kernfs_rwsem);
return ret;
}
@@ -116,7 +116,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (!kn)
return -EINVAL;
- mutex_lock(&kernfs_mutex);
+ down_write(&kernfs_rwsem);
error = setattr_prepare(&init_user_ns, dentry, iattr);
if (error)
goto out;
@@ -129,7 +129,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
setattr_copy(&init_user_ns, inode, iattr);
out:
- mutex_unlock(&kernfs_mutex);
+ up_write(&kernfs_rwsem);
return error;
}
@@ -185,11 +185,13 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns,
struct inode *inode = d_inode(path->dentry);
struct kernfs_node *kn = inode->i_private;
- mutex_lock(&kernfs_mutex);
+ down_read(&kernfs_rwsem);
+ spin_lock(&inode->i_lock);
kernfs_refresh_inode(kn, inode);
- mutex_unlock(&kernfs_mutex);
-
generic_fillattr(&init_user_ns, inode, stat);
+ spin_unlock(&inode->i_lock);
+ up_read(&kernfs_rwsem);
+
return 0;
}
@@ -272,17 +274,21 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns,
struct inode *inode, int mask)
{
struct kernfs_node *kn;
+ int ret;
if (mask & MAY_NOT_BLOCK)
return -ECHILD;
kn = inode->i_private;
- mutex_lock(&kernfs_mutex);
+ down_read(&kernfs_rwsem);
+ spin_lock(&inode->i_lock);
kernfs_refresh_inode(kn, inode);
- mutex_unlock(&kernfs_mutex);
+ ret = generic_permission(&init_user_ns, inode, mask);
+ spin_unlock(&inode->i_lock);
+ up_read(&kernfs_rwsem);
- return generic_permission(&init_user_ns, inode, mask);
+ return ret;
}
int kernfs_xattr_get(struct kernfs_node *kn, const char *name,
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index ccc3b44f6306..f9cc912c31e1 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -13,6 +13,7 @@
#include <linux/lockdep.h>
#include <linux/fs.h>
#include <linux/mutex.h>
+#include <linux/rwsem.h>
#include <linux/xattr.h>
#include <linux/kernfs.h>
@@ -69,7 +70,7 @@ struct kernfs_super_info {
*/
const void *ns;
- /* anchored at kernfs_root->supers, protected by kernfs_mutex */
+ /* anchored at kernfs_root->supers, protected by kernfs_rwsem */
struct list_head node;
};
#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info))
@@ -81,6 +82,25 @@ static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry)
return d_inode(dentry)->i_private;
}
+static inline void kernfs_set_rev(struct kernfs_node *parent,
+ struct dentry *dentry)
+{
+ dentry->d_time = parent->dir.rev;
+}
+
+static inline void kernfs_inc_rev(struct kernfs_node *parent)
+{
+ parent->dir.rev++;
+}
+
+static inline bool kernfs_dir_changed(struct kernfs_node *parent,
+ struct dentry *dentry)
+{
+ if (parent->dir.rev != dentry->d_time)
+ return true;
+ return false;
+}
+
extern const struct super_operations kernfs_sops;
extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
@@ -102,7 +122,7 @@ int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr);
/*
* dir.c
*/
-extern struct mutex kernfs_mutex;
+extern struct rw_semaphore kernfs_rwsem;
extern const struct dentry_operations kernfs_dops;
extern const struct file_operations kernfs_dir_fops;
extern const struct inode_operations kernfs_dir_iops;
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 9dc7e7a64e10..f2f909d09f52 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -255,9 +255,9 @@ static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *k
sb->s_shrink.seeks = 0;
/* get root inode, initialize and unlock it */
- mutex_lock(&kernfs_mutex);
+ down_read(&kernfs_rwsem);
inode = kernfs_get_inode(sb, info->root->kn);
- mutex_unlock(&kernfs_mutex);
+ up_read(&kernfs_rwsem);
if (!inode) {
pr_debug("kernfs: could not get root inode\n");
return -ENOMEM;
@@ -344,9 +344,9 @@ int kernfs_get_tree(struct fs_context *fc)
}
sb->s_flags |= SB_ACTIVE;
- mutex_lock(&kernfs_mutex);
+ down_write(&kernfs_rwsem);
list_add(&info->node, &info->root->supers);
- mutex_unlock(&kernfs_mutex);
+ up_write(&kernfs_rwsem);
}
fc->root = dget(sb->s_root);
@@ -372,9 +372,9 @@ void kernfs_kill_sb(struct super_block *sb)
{
struct kernfs_super_info *info = kernfs_info(sb);
- mutex_lock(&kernfs_mutex);
+ down_write(&kernfs_rwsem);
list_del(&info->node);
- mutex_unlock(&kernfs_mutex);
+ up_write(&kernfs_rwsem);
/*
* Remove the superblock from fs_supers/s_instances
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index 5432883d819f..c8f8e41b8411 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -116,9 +116,9 @@ static int kernfs_getlink(struct inode *inode, char *path)
struct kernfs_node *target = kn->symlink.target_kn;
int error;
- mutex_lock(&kernfs_mutex);
+ down_read(&kernfs_rwsem);
error = kernfs_get_target_path(parent, target, path);
- mutex_unlock(&kernfs_mutex);
+ up_read(&kernfs_rwsem);
return error;
}
diff --git a/fs/ksmbd/Kconfig b/fs/ksmbd/Kconfig
new file mode 100644
index 000000000000..b83cbd756ae5
--- /dev/null
+++ b/fs/ksmbd/Kconfig
@@ -0,0 +1,68 @@
+config SMB_SERVER
+ tristate "SMB3 server support (EXPERIMENTAL)"
+ depends on INET
+ depends on MULTIUSER
+ depends on FILE_LOCKING
+ select NLS
+ select NLS_UTF8
+ select CRYPTO
+ select CRYPTO_MD4
+ select CRYPTO_MD5
+ select CRYPTO_HMAC
+ select CRYPTO_ECB
+ select CRYPTO_LIB_DES
+ select CRYPTO_SHA256
+ select CRYPTO_CMAC
+ select CRYPTO_SHA512
+ select CRYPTO_AEAD2
+ select CRYPTO_CCM
+ select CRYPTO_GCM
+ select ASN1
+ select OID_REGISTRY
+ default n
+ help
+ Choose Y here if you want to allow SMB3 compliant clients
+ to access files residing on this system using the SMB3 protocol.
+ To compile the SMB3 server support as a module,
+ choose M here: the module will be called ksmbd.
+
+ You may choose to use a samba server instead, in which
+ case you can choose N here.
+
+ You also need to install user space programs which can be found
+ in ksmbd-tools, available from
+ https://github.com/cifsd-team/ksmbd-tools.
+ More detail about how to run the ksmbd kernel server is
+ available via README file
+ (https://github.com/cifsd-team/ksmbd-tools/blob/master/README).
+
+ ksmbd kernel server includes support for auto-negotiation,
+ Secure negotiate, Pre-authentication integrity, oplock/lease,
+ compound requests, multi-credit, packet signing, RDMA(smbdirect),
+ smb3 encryption, copy-offload, secure per-user session
+ establishment via NTLM or NTLMv2.
+
+config SMB_SERVER_SMBDIRECT
+ bool "Support for SMB Direct protocol"
+ depends on SMB_SERVER=m && INFINIBAND && INFINIBAND_ADDR_TRANS || SMB_SERVER=y && INFINIBAND=y && INFINIBAND_ADDR_TRANS=y
+ select SG_POOL
+ default n
+
+ help
+ Enables SMB Direct support for SMB 3.0, 3.02 and 3.1.1.
+
+ SMB Direct allows transferring SMB packets over RDMA. If unsure,
+ say N.
+
+config SMB_SERVER_CHECK_CAP_NET_ADMIN
+ bool "Enable check network administration capability"
+ depends on SMB_SERVER
+ default y
+
+ help
+ Prevent unprivileged processes from starting the ksmbd kernel server.
+
+config SMB_SERVER_KERBEROS5
+ bool "Support for Kerberos 5"
+ depends on SMB_SERVER
+ default n
diff --git a/fs/ksmbd/Makefile b/fs/ksmbd/Makefile
new file mode 100644
index 000000000000..7d6337a7dee4
--- /dev/null
+++ b/fs/ksmbd/Makefile
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# Makefile for Linux SMB3 kernel server
+#
+obj-$(CONFIG_SMB_SERVER) += ksmbd.o
+
+ksmbd-y := unicode.o auth.o vfs.o vfs_cache.o server.o ndr.o \
+ misc.o oplock.o connection.o ksmbd_work.o crypto_ctx.o \
+ mgmt/ksmbd_ida.o mgmt/user_config.o mgmt/share_config.o \
+ mgmt/tree_connect.o mgmt/user_session.o smb_common.o \
+ transport_tcp.o transport_ipc.o smbacl.o smb2pdu.o \
+ smb2ops.o smb2misc.o ksmbd_spnego_negtokeninit.asn1.o \
+ ksmbd_spnego_negtokentarg.asn1.o asn1.o
+
+$(obj)/asn1.o: $(obj)/ksmbd_spnego_negtokeninit.asn1.h $(obj)/ksmbd_spnego_negtokentarg.asn1.h
+
+$(obj)/ksmbd_spnego_negtokeninit.asn1.o: $(obj)/ksmbd_spnego_negtokeninit.asn1.c $(obj)/ksmbd_spnego_negtokeninit.asn1.h
+$(obj)/ksmbd_spnego_negtokentarg.asn1.o: $(obj)/ksmbd_spnego_negtokentarg.asn1.c $(obj)/ksmbd_spnego_negtokentarg.asn1.h
+
+ksmbd-$(CONFIG_SMB_SERVER_SMBDIRECT) += transport_rdma.o
diff --git a/fs/ksmbd/asn1.c b/fs/ksmbd/asn1.c
new file mode 100644
index 000000000000..b014f4638610
--- /dev/null
+++ b/fs/ksmbd/asn1.c
@@ -0,0 +1,343 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * The ASN.1/BER parsing code is derived from ip_nat_snmp_basic.c which was in
+ * turn derived from the gxsnmp package by Gregory McLean & Jochen Friedrich
+ *
+ * Copyright (c) 2000 RP Internet (www.rpi.net.au).
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/oid_registry.h>
+
+#include "glob.h"
+
+#include "asn1.h"
+#include "connection.h"
+#include "auth.h"
+#include "ksmbd_spnego_negtokeninit.asn1.h"
+#include "ksmbd_spnego_negtokentarg.asn1.h"
+
+#define SPNEGO_OID_LEN 7
+#define NTLMSSP_OID_LEN 10
+#define KRB5_OID_LEN 7
+#define KRB5U2U_OID_LEN 8
+#define MSKRB5_OID_LEN 7
+static unsigned long SPNEGO_OID[7] = { 1, 3, 6, 1, 5, 5, 2 };
+static unsigned long NTLMSSP_OID[10] = { 1, 3, 6, 1, 4, 1, 311, 2, 2, 10 };
+static unsigned long KRB5_OID[7] = { 1, 2, 840, 113554, 1, 2, 2 };
+static unsigned long KRB5U2U_OID[8] = { 1, 2, 840, 113554, 1, 2, 2, 3 };
+static unsigned long MSKRB5_OID[7] = { 1, 2, 840, 48018, 1, 2, 2 };
+
+static char NTLMSSP_OID_STR[NTLMSSP_OID_LEN] = { 0x2b, 0x06, 0x01, 0x04, 0x01,
+ 0x82, 0x37, 0x02, 0x02, 0x0a };
+
+static bool
+asn1_subid_decode(const unsigned char **begin, const unsigned char *end,
+ unsigned long *subid)
+{
+ const unsigned char *ptr = *begin;
+ unsigned char ch;
+
+ *subid = 0;
+
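+ /* base-128 encoding: 7 bits per octet, MSB first; the top bit marks continuation */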
+ do {
+ if (ptr >= end)
+ return false;
+
+ ch = *ptr++;
+ *subid <<= 7;
+ *subid |= ch & 0x7F;
+ } while ((ch & 0x80) == 0x80);
+
+ *begin = ptr;
+ return true;
+}
+
+static bool asn1_oid_decode(const unsigned char *value, size_t vlen,
+ unsigned long **oid, size_t *oidlen)
+{
+ const unsigned char *iptr = value, *end = value + vlen;
+ unsigned long *optr;
+ unsigned long subid;
+
+ vlen += 1;
+ if (vlen < 2 || vlen > UINT_MAX / sizeof(unsigned long))
+ goto fail_nullify;
+
+ *oid = kmalloc(vlen * sizeof(unsigned long), GFP_KERNEL);
+ if (!*oid)
+ return false;
+
+ optr = *oid;
+
+ if (!asn1_subid_decode(&iptr, end, &subid))
+ goto fail;
+
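+ /*
+ * The first subidentifier packs the first two OID arcs as X * 40 + Y,
+ * e.g. 1.3.6.1.5.5.2 (SPNEGO) begins with 0x2b = 1 * 40 + 3.
+ */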
+ if (subid < 40) {
+ optr[0] = 0;
+ optr[1] = subid;
+ } else if (subid < 80) {
+ optr[0] = 1;
+ optr[1] = subid - 40;
+ } else {
+ optr[0] = 2;
+ optr[1] = subid - 80;
+ }
+
+ *oidlen = 2;
+ optr += 2;
+
+ while (iptr < end) {
+ if (++(*oidlen) > vlen)
+ goto fail;
+
+ if (!asn1_subid_decode(&iptr, end, optr++))
+ goto fail;
+ }
+ return true;
+
+fail:
+ kfree(*oid);
+fail_nullify:
+ *oid = NULL;
+ return false;
+}
+
+static bool oid_eq(unsigned long *oid1, unsigned int oid1len,
+ unsigned long *oid2, unsigned int oid2len)
+{
+ if (oid1len != oid2len)
+ return false;
+
+ return memcmp(oid1, oid2, oid1len) == 0;
+}
+
+int
+ksmbd_decode_negTokenInit(unsigned char *security_blob, int length,
+ struct ksmbd_conn *conn)
+{
+ return asn1_ber_decoder(&ksmbd_spnego_negtokeninit_decoder, conn,
+ security_blob, length);
+}
+
+int
+ksmbd_decode_negTokenTarg(unsigned char *security_blob, int length,
+ struct ksmbd_conn *conn)
+{
+ return asn1_ber_decoder(&ksmbd_spnego_negtokentarg_decoder, conn,
+ security_blob, length);
+}
+
+static int compute_asn_hdr_len_bytes(int len)
+{
+ if (len > 0xFFFFFF)
+ return 4;
+ else if (len > 0xFFFF)
+ return 3;
+ else if (len > 0xFF)
+ return 2;
+ else if (len > 0x7F)
+ return 1;
+ else
+ return 0;
+}
+
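+ /*
+ * Emit a context tag followed by an inner tag, each with a DER length:
+ * short form when the length fits in 0x7F, otherwise 0x80 | number of
+ * length octets followed by the big-endian length bytes.
+ */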
+static void encode_asn_tag(char *buf, unsigned int *ofs, char tag, char seq,
+ int length)
+{
+ int i;
+ int index = *ofs;
+ char hdr_len = compute_asn_hdr_len_bytes(length);
+ int len = length + 2 + hdr_len;
+
+ /* insert tag */
+ buf[index++] = tag;
+
+ if (!hdr_len) {
+ buf[index++] = len;
+ } else {
+ buf[index++] = 0x80 | hdr_len;
+ for (i = hdr_len - 1; i >= 0; i--)
+ buf[index++] = (len >> (i * 8)) & 0xFF;
+ }
+
+ /* insert seq */
+ len = len - (index - *ofs);
+ buf[index++] = seq;
+
+ if (!hdr_len) {
+ buf[index++] = len;
+ } else {
+ buf[index++] = 0x80 | hdr_len;
+ for (i = hdr_len - 1; i >= 0; i--)
+ buf[index++] = (len >> (i * 8)) & 0xFF;
+ }
+
+ *ofs += (index - *ofs);
+}
+
+int build_spnego_ntlmssp_neg_blob(unsigned char **pbuffer, u16 *buflen,
+ char *ntlm_blob, int ntlm_blob_len)
+{
+ char *buf;
+ unsigned int ofs = 0;
+ int neg_result_len = 4 + compute_asn_hdr_len_bytes(1) * 2 + 1;
+ int oid_len = 4 + compute_asn_hdr_len_bytes(NTLMSSP_OID_LEN) * 2 +
+ NTLMSSP_OID_LEN;
+ int ntlmssp_len = 4 + compute_asn_hdr_len_bytes(ntlm_blob_len) * 2 +
+ ntlm_blob_len;
+ int total_len = 4 + compute_asn_hdr_len_bytes(neg_result_len +
+ oid_len + ntlmssp_len) * 2 +
+ neg_result_len + oid_len + ntlmssp_len;
+
+ buf = kmalloc(total_len, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ /* insert main gss header */
+ encode_asn_tag(buf, &ofs, 0xa1, 0x30, neg_result_len + oid_len +
+ ntlmssp_len);
+
+ /* insert neg result */
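+ /* negResult value 1 = accept-incomplete (more processing required), per RFC 4178 */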
+ encode_asn_tag(buf, &ofs, 0xa0, 0x0a, 1);
+ buf[ofs++] = 1;
+
+ /* insert oid */
+ encode_asn_tag(buf, &ofs, 0xa1, 0x06, NTLMSSP_OID_LEN);
+ memcpy(buf + ofs, NTLMSSP_OID_STR, NTLMSSP_OID_LEN);
+ ofs += NTLMSSP_OID_LEN;
+
+ /* insert response token - ntlmssp blob */
+ encode_asn_tag(buf, &ofs, 0xa2, 0x04, ntlm_blob_len);
+ memcpy(buf + ofs, ntlm_blob, ntlm_blob_len);
+ ofs += ntlm_blob_len;
+
+ *pbuffer = buf;
+ *buflen = total_len;
+ return 0;
+}
+
+int build_spnego_ntlmssp_auth_blob(unsigned char **pbuffer, u16 *buflen,
+ int neg_result)
+{
+ char *buf;
+ unsigned int ofs = 0;
+ int neg_result_len = 4 + compute_asn_hdr_len_bytes(1) * 2 + 1;
+ int total_len = 4 + compute_asn_hdr_len_bytes(neg_result_len) * 2 +
+ neg_result_len;
+
+ buf = kmalloc(total_len, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ /* insert main gss header */
+ encode_asn_tag(buf, &ofs, 0xa1, 0x30, neg_result_len);
+
+ /* insert neg result */
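+ /* negResult 0 = accept-completed, 2 = reject, per RFC 4178 */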
+ encode_asn_tag(buf, &ofs, 0xa0, 0x0a, 1);
+ if (neg_result)
+ buf[ofs++] = 2;
+ else
+ buf[ofs++] = 0;
+
+ *pbuffer = buf;
+ *buflen = total_len;
+ return 0;
+}
+
+int ksmbd_gssapi_this_mech(void *context, size_t hdrlen, unsigned char tag,
+ const void *value, size_t vlen)
+{
+ unsigned long *oid;
+ size_t oidlen;
+ int err = 0;
+
+ if (!asn1_oid_decode(value, vlen, &oid, &oidlen)) {
+ err = -EBADMSG;
+ goto out;
+ }
+
+ if (!oid_eq(oid, oidlen, SPNEGO_OID, SPNEGO_OID_LEN))
+ err = -EBADMSG;
+ kfree(oid);
+out:
+ if (err) {
+ char buf[50];
+
+ sprint_oid(value, vlen, buf, sizeof(buf));
+ ksmbd_debug(AUTH, "Unexpected OID: %s\n", buf);
+ }
+ return err;
+}
+
+int ksmbd_neg_token_init_mech_type(void *context, size_t hdrlen,
+ unsigned char tag, const void *value,
+ size_t vlen)
+{
+ struct ksmbd_conn *conn = context;
+ unsigned long *oid;
+ size_t oidlen;
+ int mech_type;
+ char buf[50];
+
+ if (!asn1_oid_decode(value, vlen, &oid, &oidlen))
+ goto fail;
+
+ if (oid_eq(oid, oidlen, NTLMSSP_OID, NTLMSSP_OID_LEN))
+ mech_type = KSMBD_AUTH_NTLMSSP;
+ else if (oid_eq(oid, oidlen, MSKRB5_OID, MSKRB5_OID_LEN))
+ mech_type = KSMBD_AUTH_MSKRB5;
+ else if (oid_eq(oid, oidlen, KRB5_OID, KRB5_OID_LEN))
+ mech_type = KSMBD_AUTH_KRB5;
+ else if (oid_eq(oid, oidlen, KRB5U2U_OID, KRB5U2U_OID_LEN))
+ mech_type = KSMBD_AUTH_KRB5U2U;
+ else
+ goto fail;
+
+ conn->auth_mechs |= mech_type;
+ if (conn->preferred_auth_mech == 0)
+ conn->preferred_auth_mech = mech_type;
+
+ kfree(oid);
+ return 0;
+
+fail:
+ kfree(oid);
+ sprint_oid(value, vlen, buf, sizeof(buf));
+ ksmbd_debug(AUTH, "Unexpected OID: %s\n", buf);
+ return -EBADMSG;
+}
+
+int ksmbd_neg_token_init_mech_token(void *context, size_t hdrlen,
+ unsigned char tag, const void *value,
+ size_t vlen)
+{
+ struct ksmbd_conn *conn = context;
+
+ conn->mechToken = kmalloc(vlen + 1, GFP_KERNEL);
+ if (!conn->mechToken)
+ return -ENOMEM;
+
+ memcpy(conn->mechToken, value, vlen);
+ conn->mechToken[vlen] = '\0';
+ return 0;
+}
+
+int ksmbd_neg_token_targ_resp_token(void *context, size_t hdrlen,
+ unsigned char tag, const void *value,
+ size_t vlen)
+{
+ struct ksmbd_conn *conn = context;
+
+ conn->mechToken = kmalloc(vlen + 1, GFP_KERNEL);
+ if (!conn->mechToken)
+ return -ENOMEM;
+
+ memcpy(conn->mechToken, value, vlen);
+ conn->mechToken[vlen] = '\0';
+ return 0;
+}
diff --git a/fs/ksmbd/asn1.h b/fs/ksmbd/asn1.h
new file mode 100644
index 000000000000..ce105f4ce305
--- /dev/null
+++ b/fs/ksmbd/asn1.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * The ASN.1/BER parsing code is derived from ip_nat_snmp_basic.c which was in
+ * turn derived from the gxsnmp package by Gregory McLean & Jochen Friedrich
+ *
+ * Copyright (c) 2000 RP Internet (www.rpi.net.au).
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#ifndef __ASN1_H__
+#define __ASN1_H__
+
+int ksmbd_decode_negTokenInit(unsigned char *security_blob, int length,
+ struct ksmbd_conn *conn);
+int ksmbd_decode_negTokenTarg(unsigned char *security_blob, int length,
+ struct ksmbd_conn *conn);
+int build_spnego_ntlmssp_neg_blob(unsigned char **pbuffer, u16 *buflen,
+ char *ntlm_blob, int ntlm_blob_len);
+int build_spnego_ntlmssp_auth_blob(unsigned char **pbuffer, u16 *buflen,
+ int neg_result);
+#endif /* __ASN1_H__ */
diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c
new file mode 100644
index 000000000000..de36f12070bf
--- /dev/null
+++ b/fs/ksmbd/auth.c
@@ -0,0 +1,1364 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org>
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/backing-dev.h>
+#include <linux/writeback.h>
+#include <linux/uio.h>
+#include <linux/xattr.h>
+#include <crypto/hash.h>
+#include <crypto/aead.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+
+#include "auth.h"
+#include "glob.h"
+
+#include <linux/fips.h>
+#include <crypto/des.h>
+
+#include "server.h"
+#include "smb_common.h"
+#include "connection.h"
+#include "mgmt/user_session.h"
+#include "mgmt/user_config.h"
+#include "crypto_ctx.h"
+#include "transport_ipc.h"
+
+/*
+ * Fixed-format data defining the GSS header and the fixed string
+ * "not_defined_in_RFC4178@please_ignore", so the security blob for the
+ * negotiate phase can be generated statically.
+ */
+static char NEGOTIATE_GSS_HEADER[AUTH_GSS_LENGTH] = {
+#ifdef CONFIG_SMB_SERVER_KERBEROS5
+ 0x60, 0x5e, 0x06, 0x06, 0x2b, 0x06, 0x01, 0x05,
+ 0x05, 0x02, 0xa0, 0x54, 0x30, 0x52, 0xa0, 0x24,
+ 0x30, 0x22, 0x06, 0x09, 0x2a, 0x86, 0x48, 0x86,
+ 0xf7, 0x12, 0x01, 0x02, 0x02, 0x06, 0x09, 0x2a,
+ 0x86, 0x48, 0x82, 0xf7, 0x12, 0x01, 0x02, 0x02,
+ 0x06, 0x0a, 0x2b, 0x06, 0x01, 0x04, 0x01, 0x82,
+ 0x37, 0x02, 0x02, 0x0a, 0xa3, 0x2a, 0x30, 0x28,
+ 0xa0, 0x26, 0x1b, 0x24, 0x6e, 0x6f, 0x74, 0x5f,
+ 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x64, 0x5f,
+ 0x69, 0x6e, 0x5f, 0x52, 0x46, 0x43, 0x34, 0x31,
+ 0x37, 0x38, 0x40, 0x70, 0x6c, 0x65, 0x61, 0x73,
+ 0x65, 0x5f, 0x69, 0x67, 0x6e, 0x6f, 0x72, 0x65
+#else
+ 0x60, 0x48, 0x06, 0x06, 0x2b, 0x06, 0x01, 0x05,
+ 0x05, 0x02, 0xa0, 0x3e, 0x30, 0x3c, 0xa0, 0x0e,
+ 0x30, 0x0c, 0x06, 0x0a, 0x2b, 0x06, 0x01, 0x04,
+ 0x01, 0x82, 0x37, 0x02, 0x02, 0x0a, 0xa3, 0x2a,
+ 0x30, 0x28, 0xa0, 0x26, 0x1b, 0x24, 0x6e, 0x6f,
+ 0x74, 0x5f, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65,
+ 0x64, 0x5f, 0x69, 0x6e, 0x5f, 0x52, 0x46, 0x43,
+ 0x34, 0x31, 0x37, 0x38, 0x40, 0x70, 0x6c, 0x65,
+ 0x61, 0x73, 0x65, 0x5f, 0x69, 0x67, 0x6e, 0x6f,
+ 0x72, 0x65
+#endif
+};
+
+void ksmbd_copy_gss_neg_header(void *buf)
+{
+ memcpy(buf, NEGOTIATE_GSS_HEADER, AUTH_GSS_LENGTH);
+}
+
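+ /*
+ * Spread a 7-byte key fragment across 8 DES key bytes: each output byte
+ * carries 7 key bits in bits 7..1 and leaves bit 0 (the DES parity bit)
+ * clear.
+ */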
+static void
+str_to_key(unsigned char *str, unsigned char *key)
+{
+ int i;
+
+ key[0] = str[0] >> 1;
+ key[1] = ((str[0] & 0x01) << 6) | (str[1] >> 2);
+ key[2] = ((str[1] & 0x03) << 5) | (str[2] >> 3);
+ key[3] = ((str[2] & 0x07) << 4) | (str[3] >> 4);
+ key[4] = ((str[3] & 0x0F) << 3) | (str[4] >> 5);
+ key[5] = ((str[4] & 0x1F) << 2) | (str[5] >> 6);
+ key[6] = ((str[5] & 0x3F) << 1) | (str[6] >> 7);
+ key[7] = str[6] & 0x7F;
+ for (i = 0; i < 8; i++)
+ key[i] = (key[i] << 1);
+}
+
+static int
+smbhash(unsigned char *out, const unsigned char *in, unsigned char *key)
+{
+ unsigned char key2[8];
+ struct des_ctx ctx;
+
+ if (fips_enabled) {
+ ksmbd_debug(AUTH, "FIPS compliance enabled: DES not permitted\n");
+ return -ENOENT;
+ }
+
+ str_to_key(key, key2);
+ des_expand_key(&ctx, key2, DES_KEY_SIZE);
+ des_encrypt(&ctx, out, in);
+ memzero_explicit(&ctx, sizeof(ctx));
+ return 0;
+}
+
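+ /*
+ * Build the 24-byte NTLM response: DES-encrypt the 8-byte challenge c8
+ * under three overlapping 7-byte slices of the padded 21-byte hash p21.
+ */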
+static int ksmbd_enc_p24(unsigned char *p21, const unsigned char *c8, unsigned char *p24)
+{
+ int rc;
+
+ rc = smbhash(p24, c8, p21);
+ if (rc)
+ return rc;
+ rc = smbhash(p24 + 8, c8, p21 + 7);
+ if (rc)
+ return rc;
+ return smbhash(p24 + 16, c8, p21 + 14);
+}
+
+/* produce a md4 message digest from data of length n bytes */
+static int ksmbd_enc_md4(unsigned char *md4_hash, unsigned char *link_str,
+ int link_len)
+{
+ int rc;
+ struct ksmbd_crypto_ctx *ctx;
+
+ ctx = ksmbd_crypto_ctx_find_md4();
+ if (!ctx) {
+ ksmbd_debug(AUTH, "Crypto md4 allocation error\n");
+ return -ENOMEM;
+ }
+
+ rc = crypto_shash_init(CRYPTO_MD4(ctx));
+ if (rc) {
+ ksmbd_debug(AUTH, "Could not init md4 shash\n");
+ goto out;
+ }
+
+ rc = crypto_shash_update(CRYPTO_MD4(ctx), link_str, link_len);
+ if (rc) {
+ ksmbd_debug(AUTH, "Could not update with link_str\n");
+ goto out;
+ }
+
+ rc = crypto_shash_final(CRYPTO_MD4(ctx), md4_hash);
+ if (rc)
+ ksmbd_debug(AUTH, "Could not generate md4 hash\n");
+out:
+ ksmbd_release_crypto_ctx(ctx);
+ return rc;
+}
+
+static int ksmbd_enc_update_sess_key(unsigned char *md5_hash, char *nonce,
+ char *server_challenge, int len)
+{
+ int rc;
+ struct ksmbd_crypto_ctx *ctx;
+
+ ctx = ksmbd_crypto_ctx_find_md5();
+ if (!ctx) {
+ ksmbd_debug(AUTH, "Crypto md5 allocation error\n");
+ return -ENOMEM;
+ }
+
+ rc = crypto_shash_init(CRYPTO_MD5(ctx));
+ if (rc) {
+ ksmbd_debug(AUTH, "Could not init md5 shash\n");
+ goto out;
+ }
+
+ rc = crypto_shash_update(CRYPTO_MD5(ctx), server_challenge, len);
+ if (rc) {
+ ksmbd_debug(AUTH, "Could not update with challenge\n");
+ goto out;
+ }
+
+ rc = crypto_shash_update(CRYPTO_MD5(ctx), nonce, len);
+ if (rc) {
+ ksmbd_debug(AUTH, "Could not update with nonce\n");
+ goto out;
+ }
+
+ rc = crypto_shash_final(CRYPTO_MD5(ctx), md5_hash);
+ if (rc)
+ ksmbd_debug(AUTH, "Could not generate md5 hash\n");
+out:
+ ksmbd_release_crypto_ctx(ctx);
+ return rc;
+}
+
+/**
+ * ksmbd_gen_sess_key() - function to generate session key
+ * @sess: session of connection
+ * @hash: source hash value to be used for finding the session key
+ * @hmac: source hmac value to be used for finding the session key
+ *
+ */
+static int ksmbd_gen_sess_key(struct ksmbd_session *sess, char *hash,
+ char *hmac)
+{
+ struct ksmbd_crypto_ctx *ctx;
+ int rc;
+
+ ctx = ksmbd_crypto_ctx_find_hmacmd5();
+ if (!ctx) {
+ ksmbd_debug(AUTH, "could not crypto alloc hmacmd5\n");
+ return -ENOMEM;
+ }
+
+ rc = crypto_shash_setkey(CRYPTO_HMACMD5_TFM(ctx),
+ hash,
+ CIFS_HMAC_MD5_HASH_SIZE);
+ if (rc) {
+ ksmbd_debug(AUTH, "hmacmd5 set key fail error %d\n", rc);
+ goto out;
+ }
+
+ rc = crypto_shash_init(CRYPTO_HMACMD5(ctx));
+ if (rc) {
+ ksmbd_debug(AUTH, "could not init hmacmd5 error %d\n", rc);
+ goto out;
+ }
+
+ rc = crypto_shash_update(CRYPTO_HMACMD5(ctx),
+ hmac,
+ SMB2_NTLMV2_SESSKEY_SIZE);
+ if (rc) {
+ ksmbd_debug(AUTH, "Could not update with response error %d\n", rc);
+ goto out;
+ }
+
+ rc = crypto_shash_final(CRYPTO_HMACMD5(ctx), sess->sess_key);
+ if (rc) {
+ ksmbd_debug(AUTH, "Could not generate hmacmd5 hash error %d\n", rc);
+ goto out;
+ }
+
+out:
+ ksmbd_release_crypto_ctx(ctx);
+ return rc;
+}
+
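+ /*
+ * NTLMv2 hash = HMAC-MD5(NT hash, UTF-16LE(uppercase(user name)) +
+ * UTF-16LE(domain name)), per MS-NLMP NTOWFv2.
+ */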
+static int calc_ntlmv2_hash(struct ksmbd_session *sess, char *ntlmv2_hash,
+ char *dname)
+{
+ int ret, len, conv_len;
+ wchar_t *domain = NULL;
+ __le16 *uniname = NULL;
+ struct ksmbd_crypto_ctx *ctx;
+
+ ctx = ksmbd_crypto_ctx_find_hmacmd5();
+ if (!ctx) {
+ ksmbd_debug(AUTH, "can't generate ntlmv2 hash\n");
+ return -ENOMEM;
+ }
+
+ ret = crypto_shash_setkey(CRYPTO_HMACMD5_TFM(ctx),
+ user_passkey(sess->user),
+ CIFS_ENCPWD_SIZE);
+ if (ret) {
+ ksmbd_debug(AUTH, "Could not set NT Hash as a key\n");
+ goto out;
+ }
+
+ ret = crypto_shash_init(CRYPTO_HMACMD5(ctx));
+ if (ret) {
+ ksmbd_debug(AUTH, "could not init hmacmd5\n");
+ goto out;
+ }
+
+ /* convert user_name to unicode */
+ len = strlen(user_name(sess->user));
+ uniname = kzalloc(2 + UNICODE_LEN(len), GFP_KERNEL);
+ if (!uniname) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ conv_len = smb_strtoUTF16(uniname, user_name(sess->user), len,
+ sess->conn->local_nls);
+ if (conv_len < 0 || conv_len > len) {
+ ret = -EINVAL;
+ goto out;
+ }
+ UniStrupr(uniname);
+
+ ret = crypto_shash_update(CRYPTO_HMACMD5(ctx),
+ (char *)uniname,
+ UNICODE_LEN(conv_len));
+ if (ret) {
+ ksmbd_debug(AUTH, "Could not update with user\n");
+ goto out;
+ }
+
+ /* Convert domain name or conn name to unicode and uppercase */
+ len = strlen(dname);
+ domain = kzalloc(2 + UNICODE_LEN(len), GFP_KERNEL);
+ if (!domain) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ conv_len = smb_strtoUTF16((__le16 *)domain, dname, len,
+ sess->conn->local_nls);
+ if (conv_len < 0 || conv_len > len) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = crypto_shash_update(CRYPTO_HMACMD5(ctx),
+ (char *)domain,
+ UNICODE_LEN(conv_len));
+ if (ret) {
+ ksmbd_debug(AUTH, "Could not update with domain\n");
+ goto out;
+ }
+
+ ret = crypto_shash_final(CRYPTO_HMACMD5(ctx), ntlmv2_hash);
+ if (ret)
+ ksmbd_debug(AUTH, "Could not generate md5 hash\n");
+out:
+ kfree(uniname);
+ kfree(domain);
+ ksmbd_release_crypto_ctx(ctx);
+ return ret;
+}
+
+/**
+ * ksmbd_auth_ntlm() - NTLM authentication handler
+ * @sess: session of connection
+ * @pw_buf: NTLM challenge response
+ *
+ * Return: 0 on success, error number on error
+ */
+int ksmbd_auth_ntlm(struct ksmbd_session *sess, char *pw_buf)
+{
+ int rc;
+ unsigned char p21[21];
+ char key[CIFS_AUTH_RESP_SIZE];
+
+ memset(p21, '\0', 21);
+ memcpy(p21, user_passkey(sess->user), CIFS_NTHASH_SIZE);
+ rc = ksmbd_enc_p24(p21, sess->ntlmssp.cryptkey, key);
+ if (rc) {
+ pr_err("password processing failed\n");
+ return rc;
+ }
+
+ ksmbd_enc_md4(sess->sess_key, user_passkey(sess->user),
+ CIFS_SMB1_SESSKEY_SIZE);
+ memcpy(sess->sess_key + CIFS_SMB1_SESSKEY_SIZE, key,
+ CIFS_AUTH_RESP_SIZE);
+ sess->sequence_number = 1;
+
+ if (strncmp(pw_buf, key, CIFS_AUTH_RESP_SIZE) != 0) {
+ ksmbd_debug(AUTH, "ntlmv1 authentication failed\n");
+ return -EINVAL;
+ }
+
+ ksmbd_debug(AUTH, "ntlmv1 authentication pass\n");
+ return 0;
+}
+
+/**
+ * ksmbd_auth_ntlmv2() - NTLMv2 authentication handler
+ * @sess: session of connection
+ * @ntlmv2: NTLMv2 challenge response
+ * @blen: NTLMv2 blob length
+ * @domain_name: domain name
+ *
+ * Return: 0 on success, error number on error
+ */
+int ksmbd_auth_ntlmv2(struct ksmbd_session *sess, struct ntlmv2_resp *ntlmv2,
+ int blen, char *domain_name)
+{
+ char ntlmv2_hash[CIFS_ENCPWD_SIZE];
+ char ntlmv2_rsp[CIFS_HMAC_MD5_HASH_SIZE];
+ struct ksmbd_crypto_ctx *ctx;
+ char *construct = NULL;
+ int rc, len;
+
+ ctx = ksmbd_crypto_ctx_find_hmacmd5();
+ if (!ctx) {
+ ksmbd_debug(AUTH, "could not crypto alloc hmacmd5\n");
+ return -ENOMEM;
+ }
+
+ rc = calc_ntlmv2_hash(sess, ntlmv2_hash, domain_name);
+ if (rc) {
+ ksmbd_debug(AUTH, "could not get v2 hash rc %d\n", rc);
+ goto out;
+ }
+
+ rc = crypto_shash_setkey(CRYPTO_HMACMD5_TFM(ctx),
+ ntlmv2_hash,
+ CIFS_HMAC_MD5_HASH_SIZE);
+ if (rc) {
+ ksmbd_debug(AUTH, "Could not set NTLMV2 Hash as a key\n");
+ goto out;
+ }
+
+ rc = crypto_shash_init(CRYPTO_HMACMD5(ctx));
+ if (rc) {
+ ksmbd_debug(AUTH, "Could not init hmacmd5\n");
+ goto out;
+ }
+
+ len = CIFS_CRYPTO_KEY_SIZE + blen;
+ construct = kzalloc(len, GFP_KERNEL);
+ if (!construct) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
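+ /* NTLMv2 proof input: the 8-byte server challenge followed by the client blob */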
+ memcpy(construct, sess->ntlmssp.cryptkey, CIFS_CRYPTO_KEY_SIZE);
+ memcpy(construct + CIFS_CRYPTO_KEY_SIZE, &ntlmv2->blob_signature, blen);
+
+ rc = crypto_shash_update(CRYPTO_HMACMD5(ctx), construct, len);
+ if (rc) {
+ ksmbd_debug(AUTH, "Could not update with response\n");
+ goto out;
+ }
+
+ rc = crypto_shash_final(CRYPTO_HMACMD5(ctx), ntlmv2_rsp);
+ if (rc) {
+ ksmbd_debug(AUTH, "Could not generate md5 hash\n");
+ goto out;
+ }
+
+ rc = ksmbd_gen_sess_key(sess, ntlmv2_hash, ntlmv2_rsp);
+ if (rc) {
+ ksmbd_debug(AUTH, "Could not generate sess key\n");
+ goto out;
+ }
+
+ if (memcmp(ntlmv2->ntlmv2_hash, ntlmv2_rsp, CIFS_HMAC_MD5_HASH_SIZE) != 0)
+ rc = -EINVAL;
+out:
+ ksmbd_release_crypto_ctx(ctx);
+ kfree(construct);
+ return rc;
+}
+
+/**
+ * __ksmbd_auth_ntlmv2() - NTLM2 (extended security) authentication handler
+ * @sess: session of connection
+ * @client_nonce: client nonce from LM response.
+ * @ntlm_resp: ntlm response data from client.
+ *
+ * Return: 0 on success, error number on error
+ */
+static int __ksmbd_auth_ntlmv2(struct ksmbd_session *sess, char *client_nonce,
+ char *ntlm_resp)
+{
+ char sess_key[CIFS_SMB1_SESSKEY_SIZE] = {0};
+ int rc;
+ unsigned char p21[21];
+ char key[CIFS_AUTH_RESP_SIZE];
+
+ rc = ksmbd_enc_update_sess_key(sess_key,
+ client_nonce,
+ (char *)sess->ntlmssp.cryptkey, 8);
+ if (rc) {
+ pr_err("password processing failed\n");
+ goto out;
+ }
+
+ memset(p21, '\0', 21);
+ memcpy(p21, user_passkey(sess->user), CIFS_NTHASH_SIZE);
+ rc = ksmbd_enc_p24(p21, sess_key, key);
+ if (rc) {
+ pr_err("password processing failed\n");
+ goto out;
+ }
+
+ if (memcmp(ntlm_resp, key, CIFS_AUTH_RESP_SIZE) != 0)
+ rc = -EINVAL;
+out:
+ return rc;
+}
+
+/**
+ * ksmbd_decode_ntlmssp_auth_blob() - helper function to decode the
+ * authenticate blob
+ * @authblob: authenticate blob source pointer
+ * @blob_len: length of the authenticate blob
+ * @sess: session of connection
+ *
+ * Return: 0 on success, error number on error
+ */
+int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob,
+ int blob_len, struct ksmbd_session *sess)
+{
+ char *domain_name;
+ unsigned int lm_off, nt_off;
+ unsigned short nt_len;
+ int ret;
+
+ if (blob_len < sizeof(struct authenticate_message)) {
+ ksmbd_debug(AUTH, "negotiate blob len %d too small\n",
+ blob_len);
+ return -EINVAL;
+ }
+
+ if (memcmp(authblob->Signature, "NTLMSSP", 8)) {
+ ksmbd_debug(AUTH, "blob signature incorrect %s\n",
+ authblob->Signature);
+ return -EINVAL;
+ }
+
+ lm_off = le32_to_cpu(authblob->LmChallengeResponse.BufferOffset);
+ nt_off = le32_to_cpu(authblob->NtChallengeResponse.BufferOffset);
+ nt_len = le16_to_cpu(authblob->NtChallengeResponse.Length);
+
+ /* process NTLM authentication */
+ if (nt_len == CIFS_AUTH_RESP_SIZE) {
+ if (le32_to_cpu(authblob->NegotiateFlags) &
+ NTLMSSP_NEGOTIATE_EXTENDED_SEC)
+ return __ksmbd_auth_ntlmv2(sess, (char *)authblob +
+ lm_off, (char *)authblob + nt_off);
+ else
+ return ksmbd_auth_ntlm(sess, (char *)authblob +
+ nt_off);
+ }
+
+ /* TODO : use domain name that imported from configuration file */
+ domain_name = smb_strndup_from_utf16((const char *)authblob +
+ le32_to_cpu(authblob->DomainName.BufferOffset),
+ le16_to_cpu(authblob->DomainName.Length), true,
+ sess->conn->local_nls);
+ if (IS_ERR(domain_name))
+ return PTR_ERR(domain_name);
+
+ /* process NTLMv2 authentication */
+ ksmbd_debug(AUTH, "decode_ntlmssp_authenticate_blob dname%s\n",
+ domain_name);
+ ret = ksmbd_auth_ntlmv2(sess, (struct ntlmv2_resp *)((char *)authblob + nt_off),
+ nt_len - CIFS_ENCPWD_SIZE,
+ domain_name);
+ kfree(domain_name);
+ return ret;
+}
+
+/**
+ * ksmbd_decode_ntlmssp_neg_blob() - helper function to decode the
+ * negotiate blob
+ * @negblob: negotiate blob source pointer
+ * @blob_len: length of the negotiate blob
+ * @sess: session of connection
+ *
+ */
+int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob,
+ int blob_len, struct ksmbd_session *sess)
+{
+ if (blob_len < sizeof(struct negotiate_message)) {
+ ksmbd_debug(AUTH, "negotiate blob len %d too small\n",
+ blob_len);
+ return -EINVAL;
+ }
+
+ if (memcmp(negblob->Signature, "NTLMSSP", 8)) {
+ ksmbd_debug(AUTH, "blob signature incorrect %s\n",
+ negblob->Signature);
+ return -EINVAL;
+ }
+
+ sess->ntlmssp.client_flags = le32_to_cpu(negblob->NegotiateFlags);
+ return 0;
+}
+
+/**
+ * ksmbd_build_ntlmssp_challenge_blob() - helper function to construct
+ * challenge blob
+ * @chgblob: challenge blob source pointer to initialize
+ * @sess: session of connection
+ *
+ */
+unsigned int
+ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob,
+ struct ksmbd_session *sess)
+{
+ struct target_info *tinfo;
+ wchar_t *name;
+ __u8 *target_name;
+ unsigned int flags, blob_off, blob_len, type, target_info_len = 0;
+ int len, uni_len, conv_len;
+ int cflags = sess->ntlmssp.client_flags;
+
+ memcpy(chgblob->Signature, NTLMSSP_SIGNATURE, 8);
+ chgblob->MessageType = NtLmChallenge;
+
+ flags = NTLMSSP_NEGOTIATE_UNICODE |
+ NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_TARGET_TYPE_SERVER |
+ NTLMSSP_NEGOTIATE_TARGET_INFO;
+
+ if (cflags & NTLMSSP_NEGOTIATE_SIGN) {
+ flags |= NTLMSSP_NEGOTIATE_SIGN;
+ flags |= cflags & (NTLMSSP_NEGOTIATE_128 |
+ NTLMSSP_NEGOTIATE_56);
+ }
+
+ if (cflags & NTLMSSP_NEGOTIATE_ALWAYS_SIGN)
+ flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
+
+ if (cflags & NTLMSSP_REQUEST_TARGET)
+ flags |= NTLMSSP_REQUEST_TARGET;
+
+ if (sess->conn->use_spnego &&
+ (cflags & NTLMSSP_NEGOTIATE_EXTENDED_SEC))
+ flags |= NTLMSSP_NEGOTIATE_EXTENDED_SEC;
+
+ chgblob->NegotiateFlags = cpu_to_le32(flags);
+ len = strlen(ksmbd_netbios_name());
+ name = kmalloc(2 + UNICODE_LEN(len), GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+
+ conv_len = smb_strtoUTF16((__le16 *)name, ksmbd_netbios_name(), len,
+ sess->conn->local_nls);
+ if (conv_len < 0 || conv_len > len) {
+ kfree(name);
+ return -EINVAL;
+ }
+
+ uni_len = UNICODE_LEN(conv_len);
+
+ blob_off = sizeof(struct challenge_message);
+ blob_len = blob_off + uni_len;
+
+ chgblob->TargetName.Length = cpu_to_le16(uni_len);
+ chgblob->TargetName.MaximumLength = cpu_to_le16(uni_len);
+ chgblob->TargetName.BufferOffset = cpu_to_le32(blob_off);
+
+ /* Initialize random conn challenge */
+ get_random_bytes(sess->ntlmssp.cryptkey, sizeof(__u64));
+ memcpy(chgblob->Challenge, sess->ntlmssp.cryptkey,
+ CIFS_CRYPTO_KEY_SIZE);
+
+ /* Add Target Information to security buffer */
+ chgblob->TargetInfoArray.BufferOffset = cpu_to_le32(blob_len);
+
+ target_name = (__u8 *)chgblob + blob_off;
+ memcpy(target_name, name, uni_len);
+ tinfo = (struct target_info *)(target_name + uni_len);
+
+ chgblob->TargetInfoArray.Length = 0;
+ /* Add target info list for NetBIOS/DNS settings */
+ for (type = NTLMSSP_AV_NB_COMPUTER_NAME;
+ type <= NTLMSSP_AV_DNS_DOMAIN_NAME; type++) {
+ tinfo->Type = cpu_to_le16(type);
+ tinfo->Length = cpu_to_le16(uni_len);
+ memcpy(tinfo->Content, name, uni_len);
+ tinfo = (struct target_info *)((char *)tinfo + 4 + uni_len);
+ target_info_len += 4 + uni_len;
+ }
+
+ /* Add terminator subblock */
+ tinfo->Type = 0;
+ tinfo->Length = 0;
+ target_info_len += 4;
+
+ chgblob->TargetInfoArray.Length = cpu_to_le16(target_info_len);
+ chgblob->TargetInfoArray.MaximumLength = cpu_to_le16(target_info_len);
+ blob_len += target_info_len;
+ kfree(name);
+ ksmbd_debug(AUTH, "NTLMSSP SecurityBufferLength %d\n", blob_len);
+ return blob_len;
+}
+
+#ifdef CONFIG_SMB_SERVER_KERBEROS5
+int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob,
+ int in_len, char *out_blob, int *out_len)
+{
+ struct ksmbd_spnego_authen_response *resp;
+ struct ksmbd_user *user = NULL;
+ int retval;
+
+ resp = ksmbd_ipc_spnego_authen_request(in_blob, in_len);
+ if (!resp) {
+ ksmbd_debug(AUTH, "SPNEGO_AUTHEN_REQUEST failure\n");
+ return -EINVAL;
+ }
+
+ if (!(resp->login_response.status & KSMBD_USER_FLAG_OK)) {
+ ksmbd_debug(AUTH, "krb5 authentication failure\n");
+ retval = -EPERM;
+ goto out;
+ }
+
+ if (*out_len <= resp->spnego_blob_len) {
+ ksmbd_debug(AUTH, "buf len %d, but blob len %d\n",
+ *out_len, resp->spnego_blob_len);
+ retval = -EINVAL;
+ goto out;
+ }
+
+ if (resp->session_key_len > sizeof(sess->sess_key)) {
+ ksmbd_debug(AUTH, "session key is too long\n");
+ retval = -EINVAL;
+ goto out;
+ }
+
+ user = ksmbd_alloc_user(&resp->login_response);
+ if (!user) {
+ ksmbd_debug(AUTH, "login failure\n");
+ retval = -ENOMEM;
+ goto out;
+ }
+ sess->user = user;
+
+ memcpy(sess->sess_key, resp->payload, resp->session_key_len);
+ memcpy(out_blob, resp->payload + resp->session_key_len,
+ resp->spnego_blob_len);
+ *out_len = resp->spnego_blob_len;
+ retval = 0;
+out:
+ kvfree(resp);
+ return retval;
+}
+#else
+int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob,
+ int in_len, char *out_blob, int *out_len)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+/**
+ * ksmbd_sign_smb2_pdu() - function to generate packet signing
+ * @conn: connection
+ * @key: signing key
+ * @iov: buffer iov array
+ * @n_vec: number of iovecs
+ * @sig: signature value generated for client request packet
+ *
+ */
+int ksmbd_sign_smb2_pdu(struct ksmbd_conn *conn, char *key, struct kvec *iov,
+ int n_vec, char *sig)
+{
+ struct ksmbd_crypto_ctx *ctx;
+ int rc, i;
+
+ ctx = ksmbd_crypto_ctx_find_hmacsha256();
+ if (!ctx) {
+ ksmbd_debug(AUTH, "could not crypto alloc hmacmd5\n");
+ return -ENOMEM;
+ }
+
+ rc = crypto_shash_setkey(CRYPTO_HMACSHA256_TFM(ctx),
+ key,
+ SMB2_NTLMV2_SESSKEY_SIZE);
+ if (rc)
+ goto out;
+
+ rc = crypto_shash_init(CRYPTO_HMACSHA256(ctx));
+ if (rc) {
+ ksmbd_debug(AUTH, "hmacsha256 init error %d\n", rc);
+ goto out;
+ }
+
+ for (i = 0; i < n_vec; i++) {
+ rc = crypto_shash_update(CRYPTO_HMACSHA256(ctx),
+ iov[i].iov_base,
+ iov[i].iov_len);
+ if (rc) {
+ ksmbd_debug(AUTH, "hmacsha256 update error %d\n", rc);
+ goto out;
+ }
+ }
+
+ rc = crypto_shash_final(CRYPTO_HMACSHA256(ctx), sig);
+ if (rc)
+ ksmbd_debug(AUTH, "hmacsha256 generation error %d\n", rc);
+out:
+ ksmbd_release_crypto_ctx(ctx);
+ return rc;
+}
+
+/**
+ * ksmbd_sign_smb3_pdu() - function to generate packet signing
+ * @conn: connection
+ * @key: signing key
+ * @iov: buffer iov array
+ * @n_vec: number of iovecs
+ * @sig: signature value generated for client request packet
+ *
+ */
+int ksmbd_sign_smb3_pdu(struct ksmbd_conn *conn, char *key, struct kvec *iov,
+ int n_vec, char *sig)
+{
+ struct ksmbd_crypto_ctx *ctx;
+ int rc, i;
+
+ ctx = ksmbd_crypto_ctx_find_cmacaes();
+ if (!ctx) {
+ ksmbd_debug(AUTH, "could not crypto alloc cmac\n");
+ return -ENOMEM;
+ }
+
+ rc = crypto_shash_setkey(CRYPTO_CMACAES_TFM(ctx),
+ key,
+ SMB2_CMACAES_SIZE);
+ if (rc)
+ goto out;
+
+ rc = crypto_shash_init(CRYPTO_CMACAES(ctx));
+ if (rc) {
+ ksmbd_debug(AUTH, "cmaces init error %d\n", rc);
+ goto out;
+ }
+
+ for (i = 0; i < n_vec; i++) {
+ rc = crypto_shash_update(CRYPTO_CMACAES(ctx),
+ iov[i].iov_base,
+ iov[i].iov_len);
+ if (rc) {
+ ksmbd_debug(AUTH, "cmaces update error %d\n", rc);
+ goto out;
+ }
+ }
+
+ rc = crypto_shash_final(CRYPTO_CMACAES(ctx), sig);
+ if (rc)
+ ksmbd_debug(AUTH, "cmaces generation error %d\n", rc);
+out:
+ ksmbd_release_crypto_ctx(ctx);
+ return rc;
+}
+
+struct derivation {
+ struct kvec label;
+ struct kvec context;
+ bool binding;
+};
+
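+ /*
+ * SMB 3.x key derivation (SP800-108 KDF in counter mode with HMAC-SHA256):
+ * HMAC(session key, counter || label || 0x00 || context || L), with L =
+ * 128 or 256 bits depending on the negotiated cipher; the output is
+ * truncated to key_size.
+ */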
+static int generate_key(struct ksmbd_session *sess, struct kvec label,
+ struct kvec context, __u8 *key, unsigned int key_size)
+{
+ unsigned char zero = 0x0;
+ __u8 i[4] = {0, 0, 0, 1};
+ __u8 L128[4] = {0, 0, 0, 128};
+ __u8 L256[4] = {0, 0, 1, 0};
+ int rc;
+ unsigned char prfhash[SMB2_HMACSHA256_SIZE];
+ unsigned char *hashptr = prfhash;
+ struct ksmbd_crypto_ctx *ctx;
+
+ memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE);
+ memset(key, 0x0, key_size);
+
+ ctx = ksmbd_crypto_ctx_find_hmacsha256();
+ if (!ctx) {
+ ksmbd_debug(AUTH, "could not crypto alloc hmacmd5\n");
+ return -ENOMEM;
+ }
+
+ rc = crypto_shash_setkey(CRYPTO_HMACSHA256_TFM(ctx),
+ sess->sess_key,
+ SMB2_NTLMV2_SESSKEY_SIZE);
+ if (rc)
+ goto smb3signkey_ret;
+
+ rc = crypto_shash_init(CRYPTO_HMACSHA256(ctx));
+ if (rc) {
+ ksmbd_debug(AUTH, "hmacsha256 init error %d\n", rc);
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_update(CRYPTO_HMACSHA256(ctx), i, 4);
+ if (rc) {
+ ksmbd_debug(AUTH, "could not update with n\n");
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_update(CRYPTO_HMACSHA256(ctx),
+ label.iov_base,
+ label.iov_len);
+ if (rc) {
+ ksmbd_debug(AUTH, "could not update with label\n");
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_update(CRYPTO_HMACSHA256(ctx), &zero, 1);
+ if (rc) {
+ ksmbd_debug(AUTH, "could not update with zero\n");
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_update(CRYPTO_HMACSHA256(ctx),
+ context.iov_base,
+ context.iov_len);
+ if (rc) {
+ ksmbd_debug(AUTH, "could not update with context\n");
+ goto smb3signkey_ret;
+ }
+
+ if (sess->conn->cipher_type == SMB2_ENCRYPTION_AES256_CCM ||
+ sess->conn->cipher_type == SMB2_ENCRYPTION_AES256_GCM)
+ rc = crypto_shash_update(CRYPTO_HMACSHA256(ctx), L256, 4);
+ else
+ rc = crypto_shash_update(CRYPTO_HMACSHA256(ctx), L128, 4);
+ if (rc) {
+ ksmbd_debug(AUTH, "could not update with L\n");
+ goto smb3signkey_ret;
+ }
+
+ rc = crypto_shash_final(CRYPTO_HMACSHA256(ctx), hashptr);
+ if (rc) {
+ ksmbd_debug(AUTH, "Could not generate hmacmd5 hash error %d\n",
+ rc);
+ goto smb3signkey_ret;
+ }
+
+ memcpy(key, hashptr, key_size);
+
+smb3signkey_ret:
+ ksmbd_release_crypto_ctx(ctx);
+ return rc;
+}
+
+static int generate_smb3signingkey(struct ksmbd_session *sess,
+ struct ksmbd_conn *conn,
+ const struct derivation *signing)
+{
+ int rc;
+ struct channel *chann;
+ char *key;
+
+ chann = lookup_chann_list(sess, conn);
+ if (!chann)
+ return 0;
+
+ if (sess->conn->dialect >= SMB30_PROT_ID && signing->binding)
+ key = chann->smb3signingkey;
+ else
+ key = sess->smb3signingkey;
+
+ rc = generate_key(sess, signing->label, signing->context, key,
+ SMB3_SIGN_KEY_SIZE);
+ if (rc)
+ return rc;
+
+ if (!(sess->conn->dialect >= SMB30_PROT_ID && signing->binding))
+ memcpy(chann->smb3signingkey, key, SMB3_SIGN_KEY_SIZE);
+
+ ksmbd_debug(AUTH, "dumping generated AES signing keys\n");
+ ksmbd_debug(AUTH, "Session Id %llu\n", sess->id);
+ ksmbd_debug(AUTH, "Session Key %*ph\n",
+ SMB2_NTLMV2_SESSKEY_SIZE, sess->sess_key);
+ ksmbd_debug(AUTH, "Signing Key %*ph\n",
+ SMB3_SIGN_KEY_SIZE, key);
+ return 0;
+}
+
+int ksmbd_gen_smb30_signingkey(struct ksmbd_session *sess,
+ struct ksmbd_conn *conn)
+{
+ struct derivation d;
+
+ d.label.iov_base = "SMB2AESCMAC";
+ d.label.iov_len = 12;
+ d.context.iov_base = "SmbSign";
+ d.context.iov_len = 8;
+ d.binding = conn->binding;
+
+ return generate_smb3signingkey(sess, conn, &d);
+}
+
+int ksmbd_gen_smb311_signingkey(struct ksmbd_session *sess,
+ struct ksmbd_conn *conn)
+{
+ struct derivation d;
+
+ d.label.iov_base = "SMBSigningKey";
+ d.label.iov_len = 14;
+ if (conn->binding) {
+ struct preauth_session *preauth_sess;
+
+ preauth_sess = ksmbd_preauth_session_lookup(conn, sess->id);
+ if (!preauth_sess)
+ return -ENOENT;
+ d.context.iov_base = preauth_sess->Preauth_HashValue;
+ } else {
+ d.context.iov_base = sess->Preauth_HashValue;
+ }
+ d.context.iov_len = 64;
+ d.binding = conn->binding;
+
+ return generate_smb3signingkey(sess, conn, &d);
+}
+
+struct derivation_twin {
+ struct derivation encryption;
+ struct derivation decryption;
+};
+
+static int generate_smb3encryptionkey(struct ksmbd_session *sess,
+ const struct derivation_twin *ptwin)
+{
+ int rc;
+
+ rc = generate_key(sess, ptwin->encryption.label,
+ ptwin->encryption.context, sess->smb3encryptionkey,
+ SMB3_ENC_DEC_KEY_SIZE);
+ if (rc)
+ return rc;
+
+ rc = generate_key(sess, ptwin->decryption.label,
+ ptwin->decryption.context,
+ sess->smb3decryptionkey, SMB3_ENC_DEC_KEY_SIZE);
+ if (rc)
+ return rc;
+
+ ksmbd_debug(AUTH, "dumping generated AES encryption keys\n");
+ ksmbd_debug(AUTH, "Cipher type %d\n", sess->conn->cipher_type);
+ ksmbd_debug(AUTH, "Session Id %llu\n", sess->id);
+ ksmbd_debug(AUTH, "Session Key %*ph\n",
+ SMB2_NTLMV2_SESSKEY_SIZE, sess->sess_key);
+ if (sess->conn->cipher_type == SMB2_ENCRYPTION_AES256_CCM ||
+ sess->conn->cipher_type == SMB2_ENCRYPTION_AES256_GCM) {
+ ksmbd_debug(AUTH, "ServerIn Key %*ph\n",
+ SMB3_GCM256_CRYPTKEY_SIZE, sess->smb3encryptionkey);
+ ksmbd_debug(AUTH, "ServerOut Key %*ph\n",
+ SMB3_GCM256_CRYPTKEY_SIZE, sess->smb3decryptionkey);
+ } else {
+ ksmbd_debug(AUTH, "ServerIn Key %*ph\n",
+ SMB3_GCM128_CRYPTKEY_SIZE, sess->smb3encryptionkey);
+ ksmbd_debug(AUTH, "ServerOut Key %*ph\n",
+ SMB3_GCM128_CRYPTKEY_SIZE, sess->smb3decryptionkey);
+ }
+ return 0;
+}
+
+int ksmbd_gen_smb30_encryptionkey(struct ksmbd_session *sess)
+{
+ struct derivation_twin twin;
+ struct derivation *d;
+
+ d = &twin.encryption;
+ d->label.iov_base = "SMB2AESCCM";
+ d->label.iov_len = 11;
+ d->context.iov_base = "ServerOut";
+ d->context.iov_len = 10;
+
+ d = &twin.decryption;
+ d->label.iov_base = "SMB2AESCCM";
+ d->label.iov_len = 11;
+ d->context.iov_base = "ServerIn ";
+ d->context.iov_len = 10;
+
+ return generate_smb3encryptionkey(sess, &twin);
+}
+
+int ksmbd_gen_smb311_encryptionkey(struct ksmbd_session *sess)
+{
+ struct derivation_twin twin;
+ struct derivation *d;
+
+ d = &twin.encryption;
+ d->label.iov_base = "SMBS2CCipherKey";
+ d->label.iov_len = 16;
+ d->context.iov_base = sess->Preauth_HashValue;
+ d->context.iov_len = 64;
+
+ d = &twin.decryption;
+ d->label.iov_base = "SMBC2SCipherKey";
+ d->label.iov_len = 16;
+ d->context.iov_base = sess->Preauth_HashValue;
+ d->context.iov_len = 64;
+
+ return generate_smb3encryptionkey(sess, &twin);
+}
+
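+ /*
+ * SMB 3.1.1 preauth integrity: the hash is chained over the negotiate and
+ * session setup exchange as H = SHA-512(previous H || packet).
+ */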
+int ksmbd_gen_preauth_integrity_hash(struct ksmbd_conn *conn, char *buf,
+ __u8 *pi_hash)
+{
+ int rc;
+ struct smb2_hdr *rcv_hdr = (struct smb2_hdr *)buf;
+ char *all_bytes_msg = (char *)&rcv_hdr->ProtocolId;
+ int msg_size = be32_to_cpu(rcv_hdr->smb2_buf_length);
+ struct ksmbd_crypto_ctx *ctx = NULL;
+
+ if (conn->preauth_info->Preauth_HashId !=
+ SMB2_PREAUTH_INTEGRITY_SHA512)
+ return -EINVAL;
+
+ ctx = ksmbd_crypto_ctx_find_sha512();
+ if (!ctx) {
+ ksmbd_debug(AUTH, "could not alloc sha512\n");
+ return -ENOMEM;
+ }
+
+ rc = crypto_shash_init(CRYPTO_SHA512(ctx));
+ if (rc) {
+ ksmbd_debug(AUTH, "could not init shashn");
+ goto out;
+ }
+
+ rc = crypto_shash_update(CRYPTO_SHA512(ctx), pi_hash, 64);
+ if (rc) {
+ ksmbd_debug(AUTH, "could not update with n\n");
+ goto out;
+ }
+
+ rc = crypto_shash_update(CRYPTO_SHA512(ctx), all_bytes_msg, msg_size);
+ if (rc) {
+ ksmbd_debug(AUTH, "could not update with n\n");
+ goto out;
+ }
+
+ rc = crypto_shash_final(CRYPTO_SHA512(ctx), pi_hash);
+ if (rc) {
+ ksmbd_debug(AUTH, "Could not generate hash err : %d\n", rc);
+ goto out;
+ }
+out:
+ ksmbd_release_crypto_ctx(ctx);
+ return rc;
+}
+
+int ksmbd_gen_sd_hash(struct ksmbd_conn *conn, char *sd_buf, int len,
+ __u8 *pi_hash)
+{
+ int rc;
+ struct ksmbd_crypto_ctx *ctx = NULL;
+
+ ctx = ksmbd_crypto_ctx_find_sha256();
+ if (!ctx) {
+ ksmbd_debug(AUTH, "could not alloc sha256\n");
+ return -ENOMEM;
+ }
+
+ rc = crypto_shash_init(CRYPTO_SHA256(ctx));
+ if (rc) {
+ ksmbd_debug(AUTH, "could not init shashn");
+ goto out;
+ }
+
+ rc = crypto_shash_update(CRYPTO_SHA256(ctx), sd_buf, len);
+ if (rc) {
+ ksmbd_debug(AUTH, "could not update with n\n");
+ goto out;
+ }
+
+ rc = crypto_shash_final(CRYPTO_SHA256(ctx), pi_hash);
+ if (rc) {
+ ksmbd_debug(AUTH, "Could not generate hash err : %d\n", rc);
+ goto out;
+ }
+out:
+ ksmbd_release_crypto_ctx(ctx);
+ return rc;
+}
+
+static int ksmbd_get_encryption_key(struct ksmbd_conn *conn, __u64 ses_id,
+ int enc, u8 *key)
+{
+ struct ksmbd_session *sess;
+ u8 *ses_enc_key;
+
+ sess = ksmbd_session_lookup_all(conn, ses_id);
+ if (!sess)
+ return -EINVAL;
+
+ ses_enc_key = enc ? sess->smb3encryptionkey :
+ sess->smb3decryptionkey;
+ memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE);
+
+ return 0;
+}
+
+static inline void smb2_sg_set_buf(struct scatterlist *sg, const void *buf,
+ unsigned int buflen)
+{
+ void *addr;
+
+ if (is_vmalloc_addr(buf))
+ addr = vmalloc_to_page(buf);
+ else
+ addr = virt_to_page(buf);
+ sg_set_page(sg, addr, buflen, offset_in_page(buf));
+}
+
+static struct scatterlist *ksmbd_init_sg(struct kvec *iov, unsigned int nvec,
+ u8 *sign)
+{
+ struct scatterlist *sg;
+ unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 24;
+ int i, nr_entries[3] = {0}, total_entries = 0, sg_idx = 0;
+
+ if (!nvec)
+ return NULL;
+
+ for (i = 0; i < nvec - 1; i++) {
+ unsigned long kaddr = (unsigned long)iov[i + 1].iov_base;
+
+ if (is_vmalloc_addr(iov[i + 1].iov_base)) {
+ nr_entries[i] = ((kaddr + iov[i + 1].iov_len +
+ PAGE_SIZE - 1) >> PAGE_SHIFT) -
+ (kaddr >> PAGE_SHIFT);
+ } else {
+ nr_entries[i]++;
+ }
+ total_entries += nr_entries[i];
+ }
+
+ /* Add two entries for transform header and signature */
+ total_entries += 2;
+
+ sg = kmalloc_array(total_entries, sizeof(struct scatterlist), GFP_KERNEL);
+ if (!sg)
+ return NULL;
+
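+ /*
+ * Scatterlist layout: entry 0 covers the transform header from the Nonce
+ * field onward (the AEAD associated data), followed by entries for each
+ * payload kvec (vmalloc buffers are split per page), and a final entry
+ * for the 16-byte signature.
+ */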
+ sg_init_table(sg, total_entries);
+ smb2_sg_set_buf(&sg[sg_idx++], iov[0].iov_base + 24, assoc_data_len);
+ for (i = 0; i < nvec - 1; i++) {
+ void *data = iov[i + 1].iov_base;
+ int len = iov[i + 1].iov_len;
+
+ if (is_vmalloc_addr(data)) {
+ int j, offset = offset_in_page(data);
+
+ for (j = 0; j < nr_entries[i]; j++) {
+ unsigned int bytes = PAGE_SIZE - offset;
+
+ if (!len)
+ break;
+
+ if (bytes > len)
+ bytes = len;
+
+ sg_set_page(&sg[sg_idx++],
+ vmalloc_to_page(data), bytes,
+ offset_in_page(data));
+
+ data += bytes;
+ len -= bytes;
+ offset = 0;
+ }
+ } else {
+ sg_set_page(&sg[sg_idx++], virt_to_page(data), len,
+ offset_in_page(data));
+ }
+ }
+ smb2_sg_set_buf(&sg[sg_idx], sign, SMB2_SIGNATURE_SIZE);
+ return sg;
+}
+
+int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov,
+ unsigned int nvec, int enc)
+{
+ struct smb2_transform_hdr *tr_hdr =
+ (struct smb2_transform_hdr *)iov[0].iov_base;
+ unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 24;
+ int rc;
+ struct scatterlist *sg;
+ u8 sign[SMB2_SIGNATURE_SIZE] = {};
+ u8 key[SMB3_ENC_DEC_KEY_SIZE];
+ struct aead_request *req;
+ char *iv;
+ unsigned int iv_len;
+ struct crypto_aead *tfm;
+ unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
+ struct ksmbd_crypto_ctx *ctx;
+
+ rc = ksmbd_get_encryption_key(conn,
+ le64_to_cpu(tr_hdr->SessionId),
+ enc,
+ key);
+ if (rc) {
+ pr_err("Could not get %scryption key\n", enc ? "en" : "de");
+ return rc;
+ }
+
+ if (conn->cipher_type == SMB2_ENCRYPTION_AES128_GCM ||
+ conn->cipher_type == SMB2_ENCRYPTION_AES256_GCM)
+ ctx = ksmbd_crypto_ctx_find_gcm();
+ else
+ ctx = ksmbd_crypto_ctx_find_ccm();
+ if (!ctx) {
+ pr_err("crypto alloc failed\n");
+ return -ENOMEM;
+ }
+
+ if (conn->cipher_type == SMB2_ENCRYPTION_AES128_GCM ||
+ conn->cipher_type == SMB2_ENCRYPTION_AES256_GCM)
+ tfm = CRYPTO_GCM(ctx);
+ else
+ tfm = CRYPTO_CCM(ctx);
+
+ if (conn->cipher_type == SMB2_ENCRYPTION_AES256_CCM ||
+ conn->cipher_type == SMB2_ENCRYPTION_AES256_GCM)
+ rc = crypto_aead_setkey(tfm, key, SMB3_GCM256_CRYPTKEY_SIZE);
+ else
+ rc = crypto_aead_setkey(tfm, key, SMB3_GCM128_CRYPTKEY_SIZE);
+ if (rc) {
+ pr_err("Failed to set aead key %d\n", rc);
+ goto free_ctx;
+ }
+
+ rc = crypto_aead_setauthsize(tfm, SMB2_SIGNATURE_SIZE);
+ if (rc) {
+ pr_err("Failed to set authsize %d\n", rc);
+ goto free_ctx;
+ }
+
+ req = aead_request_alloc(tfm, GFP_KERNEL);
+ if (!req) {
+ rc = -ENOMEM;
+ goto free_ctx;
+ }
+
+ if (!enc) {
+ memcpy(sign, &tr_hdr->Signature, SMB2_SIGNATURE_SIZE);
+ crypt_len += SMB2_SIGNATURE_SIZE;
+ }
+
+ sg = ksmbd_init_sg(iov, nvec, sign);
+ if (!sg) {
+ pr_err("Failed to init sg\n");
+ rc = -ENOMEM;
+ goto free_req;
+ }
+
+ iv_len = crypto_aead_ivsize(tfm);
+ iv = kzalloc(iv_len, GFP_KERNEL);
+ if (!iv) {
+ rc = -ENOMEM;
+ goto free_sg;
+ }
+
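+ /*
+ * GCM takes the 12-byte nonce from the transform header directly; for CCM
+ * the first IV octet carries the flag value 3 (15 - 11-byte nonce - 1)
+ * expected by the kernel CCM implementation.
+ */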
+ if (conn->cipher_type == SMB2_ENCRYPTION_AES128_GCM ||
+ conn->cipher_type == SMB2_ENCRYPTION_AES256_GCM) {
+ memcpy(iv, (char *)tr_hdr->Nonce, SMB3_AES_GCM_NONCE);
+ } else {
+ iv[0] = 3;
+ memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES_CCM_NONCE);
+ }
+
+ aead_request_set_crypt(req, sg, sg, crypt_len, iv);
+ aead_request_set_ad(req, assoc_data_len);
+ aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
+
+ if (enc)
+ rc = crypto_aead_encrypt(req);
+ else
+ rc = crypto_aead_decrypt(req);
+ if (rc)
+ goto free_iv;
+
+ if (enc)
+ memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE);
+
+free_iv:
+ kfree(iv);
+free_sg:
+ kfree(sg);
+free_req:
+ kfree(req);
+free_ctx:
+ ksmbd_release_crypto_ctx(ctx);
+ return rc;
+}
diff --git a/fs/ksmbd/auth.h b/fs/ksmbd/auth.h
new file mode 100644
index 000000000000..9c2d4badd05d
--- /dev/null
+++ b/fs/ksmbd/auth.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#ifndef __AUTH_H__
+#define __AUTH_H__
+
+#include "ntlmssp.h"
+
+#ifdef CONFIG_SMB_SERVER_KERBEROS5
+#define AUTH_GSS_LENGTH 96
+#define AUTH_GSS_PADDING 0
+#else
+#define AUTH_GSS_LENGTH 74
+#define AUTH_GSS_PADDING 6
+#endif
+
+#define CIFS_HMAC_MD5_HASH_SIZE (16)
+#define CIFS_NTHASH_SIZE (16)
+
+/*
+ * Size of the ntlm client response
+ */
+#define CIFS_AUTH_RESP_SIZE 24
+#define CIFS_SMB1_SIGNATURE_SIZE 8
+#define CIFS_SMB1_SESSKEY_SIZE 16
+
+#define KSMBD_AUTH_NTLMSSP 0x0001
+#define KSMBD_AUTH_KRB5 0x0002
+#define KSMBD_AUTH_MSKRB5 0x0004
+#define KSMBD_AUTH_KRB5U2U 0x0008
+
+struct ksmbd_session;
+struct ksmbd_conn;
+struct kvec;
+
+int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov,
+ unsigned int nvec, int enc);
+void ksmbd_copy_gss_neg_header(void *buf);
+int ksmbd_auth_ntlm(struct ksmbd_session *sess, char *pw_buf);
+int ksmbd_auth_ntlmv2(struct ksmbd_session *sess, struct ntlmv2_resp *ntlmv2,
+ int blen, char *domain_name);
+int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob,
+ int blob_len, struct ksmbd_session *sess);
+int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob,
+ int blob_len, struct ksmbd_session *sess);
+unsigned int
+ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob,
+ struct ksmbd_session *sess);
+int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob,
+ int in_len, char *out_blob, int *out_len);
+int ksmbd_sign_smb2_pdu(struct ksmbd_conn *conn, char *key, struct kvec *iov,
+ int n_vec, char *sig);
+int ksmbd_sign_smb3_pdu(struct ksmbd_conn *conn, char *key, struct kvec *iov,
+ int n_vec, char *sig);
+int ksmbd_gen_smb30_signingkey(struct ksmbd_session *sess,
+ struct ksmbd_conn *conn);
+int ksmbd_gen_smb311_signingkey(struct ksmbd_session *sess,
+ struct ksmbd_conn *conn);
+int ksmbd_gen_smb30_encryptionkey(struct ksmbd_session *sess);
+int ksmbd_gen_smb311_encryptionkey(struct ksmbd_session *sess);
+int ksmbd_gen_preauth_integrity_hash(struct ksmbd_conn *conn, char *buf,
+ __u8 *pi_hash);
+int ksmbd_gen_sd_hash(struct ksmbd_conn *conn, char *sd_buf, int len,
+ __u8 *pi_hash);
+#endif
diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c
new file mode 100644
index 000000000000..af086d35398a
--- /dev/null
+++ b/fs/ksmbd/connection.c
@@ -0,0 +1,413 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2016 Namjae Jeon <namjae.jeon@protocolfreedom.org>
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/mutex.h>
+#include <linux/freezer.h>
+#include <linux/module.h>
+
+#include "server.h"
+#include "smb_common.h"
+#include "mgmt/ksmbd_ida.h"
+#include "connection.h"
+#include "transport_tcp.h"
+#include "transport_rdma.h"
+
+static DEFINE_MUTEX(init_lock);
+
+static struct ksmbd_conn_ops default_conn_ops;
+
+LIST_HEAD(conn_list);
+DEFINE_RWLOCK(conn_list_lock);
+
+/**
+ * ksmbd_conn_free() - free resources of the connection instance
+ *
+ * @conn: connection instance to be cleaned up
+ *
+ * During thread termination, the corresponding conn instance resources
+ * (sock/memory) are released and finally the conn object is freed.
+ */
+void ksmbd_conn_free(struct ksmbd_conn *conn)
+{
+ write_lock(&conn_list_lock);
+ list_del(&conn->conns_list);
+ write_unlock(&conn_list_lock);
+
+ kvfree(conn->request_buf);
+ kfree(conn->preauth_info);
+ kfree(conn);
+}
+
+/**
+ * ksmbd_conn_alloc() - initialize a new connection instance
+ *
+ * Return: ksmbd_conn struct on success, otherwise NULL
+ */
+struct ksmbd_conn *ksmbd_conn_alloc(void)
+{
+ struct ksmbd_conn *conn;
+
+ conn = kzalloc(sizeof(struct ksmbd_conn), GFP_KERNEL);
+ if (!conn)
+ return NULL;
+
+ conn->need_neg = true;
+ conn->status = KSMBD_SESS_NEW;
+ conn->local_nls = load_nls("utf8");
+ if (!conn->local_nls)
+ conn->local_nls = load_nls_default();
+ atomic_set(&conn->req_running, 0);
+ atomic_set(&conn->r_count, 0);
+ init_waitqueue_head(&conn->req_running_q);
+ INIT_LIST_HEAD(&conn->conns_list);
+ INIT_LIST_HEAD(&conn->sessions);
+ INIT_LIST_HEAD(&conn->requests);
+ INIT_LIST_HEAD(&conn->async_requests);
+ spin_lock_init(&conn->request_lock);
+ spin_lock_init(&conn->credits_lock);
+ ida_init(&conn->async_ida);
+
+ spin_lock_init(&conn->llist_lock);
+ INIT_LIST_HEAD(&conn->lock_list);
+
+ write_lock(&conn_list_lock);
+ list_add(&conn->conns_list, &conn_list);
+ write_unlock(&conn_list_lock);
+ return conn;
+}
+
+bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c)
+{
+ struct ksmbd_conn *t;
+ bool ret = false;
+
+ read_lock(&conn_list_lock);
+ list_for_each_entry(t, &conn_list, conns_list) {
+ if (memcmp(t->ClientGUID, c->ClientGUID, SMB2_CLIENT_GUID_SIZE))
+ continue;
+
+ ret = true;
+ break;
+ }
+ read_unlock(&conn_list_lock);
+ return ret;
+}
+
+void ksmbd_conn_enqueue_request(struct ksmbd_work *work)
+{
+ struct ksmbd_conn *conn = work->conn;
+ struct list_head *requests_queue = NULL;
+
+ if (conn->ops->get_cmd_val(work) != SMB2_CANCEL_HE) {
+ requests_queue = &conn->requests;
+ work->syncronous = true;
+ }
+
+ if (requests_queue) {
+ atomic_inc(&conn->req_running);
+ spin_lock(&conn->request_lock);
+ list_add_tail(&work->request_entry, requests_queue);
+ spin_unlock(&conn->request_lock);
+ }
+}
+
+int ksmbd_conn_try_dequeue_request(struct ksmbd_work *work)
+{
+ struct ksmbd_conn *conn = work->conn;
+ int ret = 1;
+
+ if (list_empty(&work->request_entry) &&
+ list_empty(&work->async_request_entry))
+ return 0;
+
+ if (!work->multiRsp)
+ atomic_dec(&conn->req_running);
+ spin_lock(&conn->request_lock);
+ if (!work->multiRsp) {
+ list_del_init(&work->request_entry);
+ if (work->syncronous == false)
+ list_del_init(&work->async_request_entry);
+ ret = 0;
+ }
+ spin_unlock(&conn->request_lock);
+
+ wake_up_all(&conn->req_running_q);
+ return ret;
+}
+
+static void ksmbd_conn_lock(struct ksmbd_conn *conn)
+{
+ mutex_lock(&conn->srv_mutex);
+}
+
+static void ksmbd_conn_unlock(struct ksmbd_conn *conn)
+{
+ mutex_unlock(&conn->srv_mutex);
+}
+
+void ksmbd_conn_wait_idle(struct ksmbd_conn *conn)
+{
+ wait_event(conn->req_running_q, atomic_read(&conn->req_running) < 2);
+}
+
+int ksmbd_conn_write(struct ksmbd_work *work)
+{
+ struct ksmbd_conn *conn = work->conn;
+ struct smb_hdr *rsp_hdr = work->response_buf;
+ size_t len = 0;
+ int sent;
+ struct kvec iov[3];
+ int iov_idx = 0;
+
+ ksmbd_conn_try_dequeue_request(work);
+ if (!rsp_hdr) {
+ pr_err("NULL response header\n");
+ return -EINVAL;
+ }
+
+ if (work->tr_buf) {
+ iov[iov_idx] = (struct kvec) { work->tr_buf,
+ sizeof(struct smb2_transform_hdr) };
+ len += iov[iov_idx++].iov_len;
+ }
+
+ if (work->aux_payload_sz) {
+ iov[iov_idx] = (struct kvec) { rsp_hdr, work->resp_hdr_sz };
+ len += iov[iov_idx++].iov_len;
+ iov[iov_idx] = (struct kvec) { work->aux_payload_buf, work->aux_payload_sz };
+ len += iov[iov_idx++].iov_len;
+ } else {
+ if (work->tr_buf)
+ iov[iov_idx].iov_len = work->resp_hdr_sz;
+ else
+ iov[iov_idx].iov_len = get_rfc1002_len(rsp_hdr) + 4;
+ iov[iov_idx].iov_base = rsp_hdr;
+ len += iov[iov_idx++].iov_len;
+ }
+
+ ksmbd_conn_lock(conn);
+ sent = conn->transport->ops->writev(conn->transport, &iov[0],
+ iov_idx, len,
+ work->need_invalidate_rkey,
+ work->remote_key);
+ ksmbd_conn_unlock(conn);
+
+ if (sent < 0) {
+ pr_err("Failed to send message: %d\n", sent);
+ return sent;
+ }
+
+ return 0;
+}
+
+int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, void *buf,
+ unsigned int buflen, u32 remote_key, u64 remote_offset,
+ u32 remote_len)
+{
+ int ret = -EINVAL;
+
+ if (conn->transport->ops->rdma_read)
+ ret = conn->transport->ops->rdma_read(conn->transport,
+ buf, buflen,
+ remote_key, remote_offset,
+ remote_len);
+ return ret;
+}
+
+int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, void *buf,
+ unsigned int buflen, u32 remote_key,
+ u64 remote_offset, u32 remote_len)
+{
+ int ret = -EINVAL;
+
+ if (conn->transport->ops->rdma_write)
+ ret = conn->transport->ops->rdma_write(conn->transport,
+ buf, buflen,
+ remote_key, remote_offset,
+ remote_len);
+ return ret;
+}
+
+bool ksmbd_conn_alive(struct ksmbd_conn *conn)
+{
+ if (!ksmbd_server_running())
+ return false;
+
+ if (conn->status == KSMBD_SESS_EXITING)
+ return false;
+
+ if (kthread_should_stop())
+ return false;
+
+ if (atomic_read(&conn->stats.open_files_count) > 0)
+ return true;
+
+ /*
+ * Stop the current session if the time since the last client request
+ * exceeds the user-configured deadtime and no files are open.
+ */
+ if (server_conf.deadtime > 0 &&
+ time_after(jiffies, conn->last_active + server_conf.deadtime)) {
+ ksmbd_debug(CONN, "No response from client in %lu minutes\n",
+ server_conf.deadtime / SMB_ECHO_INTERVAL);
+ return false;
+ }
+ return true;
+}
+
+/**
+ * ksmbd_conn_handler_loop() - session thread to listen for new smb requests
+ * @p: connection instance
+ *
+ * One thread each per connection
+ *
+ * Return: 0 on success
+ */
+int ksmbd_conn_handler_loop(void *p)
+{
+ struct ksmbd_conn *conn = (struct ksmbd_conn *)p;
+ struct ksmbd_transport *t = conn->transport;
+ unsigned int pdu_size;
+ char hdr_buf[4] = {0,};
+ int size;
+
+ mutex_init(&conn->srv_mutex);
+ __module_get(THIS_MODULE);
+
+ if (t->ops->prepare && t->ops->prepare(t))
+ goto out;
+
+ conn->last_active = jiffies;
+ while (ksmbd_conn_alive(conn)) {
+ if (try_to_freeze())
+ continue;
+
+ kvfree(conn->request_buf);
+ conn->request_buf = NULL;
+
+ size = t->ops->read(t, hdr_buf, sizeof(hdr_buf));
+ if (size != sizeof(hdr_buf))
+ break;
+
+ pdu_size = get_rfc1002_len(hdr_buf);
+ ksmbd_debug(CONN, "RFC1002 header %u bytes\n", pdu_size);
+
+ /* make sure we have enough to get to SMB header end */
+ if (!ksmbd_pdu_size_has_room(pdu_size)) {
+ ksmbd_debug(CONN, "SMB request too short (%u bytes)\n",
+ pdu_size);
+ continue;
+ }
+
+ /* 4 for rfc1002 length field */
+ size = pdu_size + 4;
+ conn->request_buf = kvmalloc(size, GFP_KERNEL);
+ if (!conn->request_buf)
+ continue;
+
+ memcpy(conn->request_buf, hdr_buf, sizeof(hdr_buf));
+ if (!ksmbd_smb_request(conn))
+ break;
+
+ /*
+ * We already read 4 bytes to find out PDU size, now
+ * read in PDU
+ */
+ size = t->ops->read(t, conn->request_buf + 4, pdu_size);
+ if (size < 0) {
+ pr_err("sock_read failed: %d\n", size);
+ break;
+ }
+
+ if (size != pdu_size) {
+ pr_err("PDU error. Read: %d, Expected: %d\n",
+ size, pdu_size);
+ continue;
+ }
+
+ if (!default_conn_ops.process_fn) {
+ pr_err("No connection request callback\n");
+ break;
+ }
+
+ if (default_conn_ops.process_fn(conn)) {
+ pr_err("Cannot handle request\n");
+ break;
+ }
+ }
+
+out:
+ /* Wait till all references to the server object have been dropped */
+ while (atomic_read(&conn->r_count) > 0)
+ schedule_timeout(HZ);
+
+ unload_nls(conn->local_nls);
+ if (default_conn_ops.terminate_fn)
+ default_conn_ops.terminate_fn(conn);
+ t->ops->disconnect(t);
+ module_put(THIS_MODULE);
+ return 0;
+}
+
+void ksmbd_conn_init_server_callbacks(struct ksmbd_conn_ops *ops)
+{
+ default_conn_ops.process_fn = ops->process_fn;
+ default_conn_ops.terminate_fn = ops->terminate_fn;
+}
+
+int ksmbd_conn_transport_init(void)
+{
+ int ret;
+
+ mutex_lock(&init_lock);
+ ret = ksmbd_tcp_init();
+ if (ret) {
+ pr_err("Failed to init TCP subsystem: %d\n", ret);
+ goto out;
+ }
+
+ ret = ksmbd_rdma_init();
+ if (ret) {
+ pr_err("Failed to init RDMA subsystem: %d\n", ret);
+ goto out;
+ }
+out:
+ mutex_unlock(&init_lock);
+ return ret;
+}
+
+static void stop_sessions(void)
+{
+ struct ksmbd_conn *conn;
+
+again:
+ read_lock(&conn_list_lock);
+ list_for_each_entry(conn, &conn_list, conns_list) {
+ struct task_struct *task;
+
+ task = conn->transport->handler;
+ if (task)
+ ksmbd_debug(CONN, "Stop session handler %s/%d\n",
+ task->comm, task_pid_nr(task));
+ conn->status = KSMBD_SESS_EXITING;
+ }
+ read_unlock(&conn_list_lock);
+
+ if (!list_empty(&conn_list)) {
+ schedule_timeout_interruptible(HZ / 10); /* 100ms */
+ goto again;
+ }
+}
+
+void ksmbd_conn_transport_destroy(void)
+{
+ mutex_lock(&init_lock);
+ ksmbd_tcp_destroy();
+ ksmbd_rdma_destroy();
+ stop_sessions();
+ mutex_unlock(&init_lock);
+}
diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h
new file mode 100644
index 000000000000..e5403c587a58
--- /dev/null
+++ b/fs/ksmbd/connection.h
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#ifndef __KSMBD_CONNECTION_H__
+#define __KSMBD_CONNECTION_H__
+
+#include <linux/list.h>
+#include <linux/ip.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/inet_connection_sock.h>
+#include <net/request_sock.h>
+#include <linux/kthread.h>
+#include <linux/nls.h>
+
+#include "smb_common.h"
+#include "ksmbd_work.h"
+
+#define KSMBD_SOCKET_BACKLOG 16
+
+/*
+ * WARNING
+ *
+ * This is nothing but a HACK. Session status should move to channel
+ * or to session. As of now we have 1 tcp_conn : 1 ksmbd_session, but
+ * we need to change it to 1 tcp_conn : N ksmbd_sessions.
+ */
+enum {
+ KSMBD_SESS_NEW = 0,
+ KSMBD_SESS_GOOD,
+ KSMBD_SESS_EXITING,
+ KSMBD_SESS_NEED_RECONNECT,
+ KSMBD_SESS_NEED_NEGOTIATE
+};
+
+struct ksmbd_stats {
+ atomic_t open_files_count;
+ atomic64_t request_served;
+};
+
+struct ksmbd_transport;
+
+struct ksmbd_conn {
+ struct smb_version_values *vals;
+ struct smb_version_ops *ops;
+ struct smb_version_cmds *cmds;
+ unsigned int max_cmds;
+ struct mutex srv_mutex;
+ int status;
+ unsigned int cli_cap;
+ char *request_buf;
+ struct ksmbd_transport *transport;
+ struct nls_table *local_nls;
+ struct list_head conns_list;
+ /* smb session 1 per user */
+ struct list_head sessions;
+ unsigned long last_active;
+ /* How many requests are currently running */
+ atomic_t req_running;
+ /* References made to this Server object */
+ atomic_t r_count;
+ unsigned short total_credits;
+ unsigned short max_credits;
+ spinlock_t credits_lock;
+ wait_queue_head_t req_running_q;
+ /* Lock to protect requests list */
+ spinlock_t request_lock;
+ struct list_head requests;
+ struct list_head async_requests;
+ int connection_type;
+ struct ksmbd_stats stats;
+ char ClientGUID[SMB2_CLIENT_GUID_SIZE];
+ union {
+ /* pending trans request table */
+ struct trans_state *recent_trans;
+ /* Used by ntlmssp */
+ char *ntlmssp_cryptkey;
+ };
+
+ spinlock_t llist_lock;
+ struct list_head lock_list;
+
+ struct preauth_integrity_info *preauth_info;
+
+ bool need_neg;
+ unsigned int auth_mechs;
+ unsigned int preferred_auth_mech;
+ bool sign;
+ bool use_spnego:1;
+ __u16 cli_sec_mode;
+ __u16 srv_sec_mode;
+ /* dialect index that server chose */
+ __u16 dialect;
+
+ char *mechToken;
+
+ struct ksmbd_conn_ops *conn_ops;
+
+ /* Preauth Session Table */
+ struct list_head preauth_sess_table;
+
+ struct sockaddr_storage peer_addr;
+
+ /* Identifier for async message */
+ struct ida async_ida;
+
+ __le16 cipher_type;
+ __le16 compress_algorithm;
+ bool posix_ext_supported;
+ bool signing_negotiated;
+ __le16 signing_algorithm;
+ bool binding;
+};
+
+struct ksmbd_conn_ops {
+ int (*process_fn)(struct ksmbd_conn *conn);
+ int (*terminate_fn)(struct ksmbd_conn *conn);
+};
+
+struct ksmbd_transport_ops {
+ int (*prepare)(struct ksmbd_transport *t);
+ void (*disconnect)(struct ksmbd_transport *t);
+ int (*read)(struct ksmbd_transport *t, char *buf, unsigned int size);
+ int (*writev)(struct ksmbd_transport *t, struct kvec *iovs, int niov,
+ int size, bool need_invalidate_rkey,
+ unsigned int remote_key);
+ int (*rdma_read)(struct ksmbd_transport *t, void *buf, unsigned int len,
+ u32 remote_key, u64 remote_offset, u32 remote_len);
+ int (*rdma_write)(struct ksmbd_transport *t, void *buf,
+ unsigned int len, u32 remote_key, u64 remote_offset,
+ u32 remote_len);
+};
+
+struct ksmbd_transport {
+ struct ksmbd_conn *conn;
+ struct ksmbd_transport_ops *ops;
+ struct task_struct *handler;
+};
+
+#define KSMBD_TCP_RECV_TIMEOUT (7 * HZ)
+#define KSMBD_TCP_SEND_TIMEOUT (5 * HZ)
+#define KSMBD_TCP_PEER_SOCKADDR(c) ((struct sockaddr *)&((c)->peer_addr))
+
+extern struct list_head conn_list;
+extern rwlock_t conn_list_lock;
+
+bool ksmbd_conn_alive(struct ksmbd_conn *conn);
+void ksmbd_conn_wait_idle(struct ksmbd_conn *conn);
+struct ksmbd_conn *ksmbd_conn_alloc(void);
+void ksmbd_conn_free(struct ksmbd_conn *conn);
+bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c);
+int ksmbd_conn_write(struct ksmbd_work *work);
+int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, void *buf,
+ unsigned int buflen, u32 remote_key, u64 remote_offset,
+ u32 remote_len);
+int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, void *buf,
+ unsigned int buflen, u32 remote_key, u64 remote_offset,
+ u32 remote_len);
+void ksmbd_conn_enqueue_request(struct ksmbd_work *work);
+int ksmbd_conn_try_dequeue_request(struct ksmbd_work *work);
+void ksmbd_conn_init_server_callbacks(struct ksmbd_conn_ops *ops);
+int ksmbd_conn_handler_loop(void *p);
+int ksmbd_conn_transport_init(void);
+void ksmbd_conn_transport_destroy(void);
+
+/*
+ * WARNING
+ *
+ * This is a hack. We will move status to a proper place once multi-session
+ * support lands.
+ */
+static inline bool ksmbd_conn_good(struct ksmbd_work *work)
+{
+ return work->conn->status == KSMBD_SESS_GOOD;
+}
+
+static inline bool ksmbd_conn_need_negotiate(struct ksmbd_work *work)
+{
+ return work->conn->status == KSMBD_SESS_NEED_NEGOTIATE;
+}
+
+static inline bool ksmbd_conn_need_reconnect(struct ksmbd_work *work)
+{
+ return work->conn->status == KSMBD_SESS_NEED_RECONNECT;
+}
+
+static inline bool ksmbd_conn_exiting(struct ksmbd_work *work)
+{
+ return work->conn->status == KSMBD_SESS_EXITING;
+}
+
+static inline void ksmbd_conn_set_good(struct ksmbd_work *work)
+{
+ work->conn->status = KSMBD_SESS_GOOD;
+}
+
+static inline void ksmbd_conn_set_need_negotiate(struct ksmbd_work *work)
+{
+ work->conn->status = KSMBD_SESS_NEED_NEGOTIATE;
+}
+
+static inline void ksmbd_conn_set_need_reconnect(struct ksmbd_work *work)
+{
+ work->conn->status = KSMBD_SESS_NEED_RECONNECT;
+}
+
+static inline void ksmbd_conn_set_exiting(struct ksmbd_work *work)
+{
+ work->conn->status = KSMBD_SESS_EXITING;
+}
+#endif /* __KSMBD_CONNECTION_H__ */
diff --git a/fs/ksmbd/crypto_ctx.c b/fs/ksmbd/crypto_ctx.c
new file mode 100644
index 000000000000..5f4b1008d17e
--- /dev/null
+++ b/fs/ksmbd/crypto_ctx.c
@@ -0,0 +1,282 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2019 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+
+#include "glob.h"
+#include "crypto_ctx.h"
+
+struct crypto_ctx_list {
+ spinlock_t ctx_lock;
+ int avail_ctx;
+ struct list_head idle_ctx;
+ wait_queue_head_t ctx_wait;
+};
+
+static struct crypto_ctx_list ctx_list;
+
+static inline void free_aead(struct crypto_aead *aead)
+{
+ if (aead)
+ crypto_free_aead(aead);
+}
+
+static void free_shash(struct shash_desc *shash)
+{
+ if (shash) {
+ crypto_free_shash(shash->tfm);
+ kfree(shash);
+ }
+}
+
+static struct crypto_aead *alloc_aead(int id)
+{
+ struct crypto_aead *tfm = NULL;
+
+ switch (id) {
+ case CRYPTO_AEAD_AES_GCM:
+ tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
+ break;
+ case CRYPTO_AEAD_AES_CCM:
+ tfm = crypto_alloc_aead("ccm(aes)", 0, 0);
+ break;
+ default:
+ pr_err("Does not support encrypt ahead(id : %d)\n", id);
+ return NULL;
+ }
+
+ if (IS_ERR(tfm)) {
+ pr_err("Failed to alloc encrypt aead : %ld\n", PTR_ERR(tfm));
+ return NULL;
+ }
+
+ return tfm;
+}
+
+static struct shash_desc *alloc_shash_desc(int id)
+{
+ struct crypto_shash *tfm = NULL;
+ struct shash_desc *shash;
+
+ switch (id) {
+ case CRYPTO_SHASH_HMACMD5:
+ tfm = crypto_alloc_shash("hmac(md5)", 0, 0);
+ break;
+ case CRYPTO_SHASH_HMACSHA256:
+ tfm = crypto_alloc_shash("hmac(sha256)", 0, 0);
+ break;
+ case CRYPTO_SHASH_CMACAES:
+ tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
+ break;
+ case CRYPTO_SHASH_SHA256:
+ tfm = crypto_alloc_shash("sha256", 0, 0);
+ break;
+ case CRYPTO_SHASH_SHA512:
+ tfm = crypto_alloc_shash("sha512", 0, 0);
+ break;
+ case CRYPTO_SHASH_MD4:
+ tfm = crypto_alloc_shash("md4", 0, 0);
+ break;
+ case CRYPTO_SHASH_MD5:
+ tfm = crypto_alloc_shash("md5", 0, 0);
+ break;
+ default:
+ return NULL;
+ }
+
+ if (IS_ERR(tfm))
+ return NULL;
+
+ shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(tfm),
+ GFP_KERNEL);
+ if (!shash)
+ crypto_free_shash(tfm);
+ else
+ shash->tfm = tfm;
+ return shash;
+}
+
+static void ctx_free(struct ksmbd_crypto_ctx *ctx)
+{
+ int i;
+
+ for (i = 0; i < CRYPTO_SHASH_MAX; i++)
+ free_shash(ctx->desc[i]);
+ for (i = 0; i < CRYPTO_AEAD_MAX; i++)
+ free_aead(ctx->ccmaes[i]);
+ kfree(ctx);
+}
+
+static struct ksmbd_crypto_ctx *ksmbd_find_crypto_ctx(void)
+{
+ struct ksmbd_crypto_ctx *ctx;
+
+ while (1) {
+ spin_lock(&ctx_list.ctx_lock);
+ if (!list_empty(&ctx_list.idle_ctx)) {
+ ctx = list_entry(ctx_list.idle_ctx.next,
+ struct ksmbd_crypto_ctx,
+ list);
+ list_del(&ctx->list);
+ spin_unlock(&ctx_list.ctx_lock);
+ return ctx;
+ }
+
+ if (ctx_list.avail_ctx > num_online_cpus()) {
+ spin_unlock(&ctx_list.ctx_lock);
+ wait_event(ctx_list.ctx_wait,
+ !list_empty(&ctx_list.idle_ctx));
+ continue;
+ }
+
+ ctx_list.avail_ctx++;
+ spin_unlock(&ctx_list.ctx_lock);
+
+ ctx = kzalloc(sizeof(struct ksmbd_crypto_ctx), GFP_KERNEL);
+ if (!ctx) {
+ spin_lock(&ctx_list.ctx_lock);
+ ctx_list.avail_ctx--;
+ spin_unlock(&ctx_list.ctx_lock);
+ wait_event(ctx_list.ctx_wait,
+ !list_empty(&ctx_list.idle_ctx));
+ continue;
+ }
+ break;
+ }
+ return ctx;
+}
+
+void ksmbd_release_crypto_ctx(struct ksmbd_crypto_ctx *ctx)
+{
+ if (!ctx)
+ return;
+
+ spin_lock(&ctx_list.ctx_lock);
+ if (ctx_list.avail_ctx <= num_online_cpus()) {
+ list_add(&ctx->list, &ctx_list.idle_ctx);
+ spin_unlock(&ctx_list.ctx_lock);
+ wake_up(&ctx_list.ctx_wait);
+ return;
+ }
+
+ ctx_list.avail_ctx--;
+ spin_unlock(&ctx_list.ctx_lock);
+ ctx_free(ctx);
+}
+
+static struct ksmbd_crypto_ctx *____crypto_shash_ctx_find(int id)
+{
+ struct ksmbd_crypto_ctx *ctx;
+
+ if (id >= CRYPTO_SHASH_MAX)
+ return NULL;
+
+ ctx = ksmbd_find_crypto_ctx();
+ if (ctx->desc[id])
+ return ctx;
+
+ ctx->desc[id] = alloc_shash_desc(id);
+ if (ctx->desc[id])
+ return ctx;
+ ksmbd_release_crypto_ctx(ctx);
+ return NULL;
+}
+
+struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_hmacmd5(void)
+{
+ return ____crypto_shash_ctx_find(CRYPTO_SHASH_HMACMD5);
+}
+
+struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_hmacsha256(void)
+{
+ return ____crypto_shash_ctx_find(CRYPTO_SHASH_HMACSHA256);
+}
+
+struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_cmacaes(void)
+{
+ return ____crypto_shash_ctx_find(CRYPTO_SHASH_CMACAES);
+}
+
+struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha256(void)
+{
+ return ____crypto_shash_ctx_find(CRYPTO_SHASH_SHA256);
+}
+
+struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha512(void)
+{
+ return ____crypto_shash_ctx_find(CRYPTO_SHASH_SHA512);
+}
+
+struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_md4(void)
+{
+ return ____crypto_shash_ctx_find(CRYPTO_SHASH_MD4);
+}
+
+struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_md5(void)
+{
+ return ____crypto_shash_ctx_find(CRYPTO_SHASH_MD5);
+}
+
+static struct ksmbd_crypto_ctx *____crypto_aead_ctx_find(int id)
+{
+ struct ksmbd_crypto_ctx *ctx;
+
+ if (id >= CRYPTO_AEAD_MAX)
+ return NULL;
+
+ ctx = ksmbd_find_crypto_ctx();
+ if (ctx->ccmaes[id])
+ return ctx;
+
+ ctx->ccmaes[id] = alloc_aead(id);
+ if (ctx->ccmaes[id])
+ return ctx;
+ ksmbd_release_crypto_ctx(ctx);
+ return NULL;
+}
+
+struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_gcm(void)
+{
+ return ____crypto_aead_ctx_find(CRYPTO_AEAD_AES_GCM);
+}
+
+struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_ccm(void)
+{
+ return ____crypto_aead_ctx_find(CRYPTO_AEAD_AES_CCM);
+}
+
+void ksmbd_crypto_destroy(void)
+{
+ struct ksmbd_crypto_ctx *ctx;
+
+ while (!list_empty(&ctx_list.idle_ctx)) {
+ ctx = list_entry(ctx_list.idle_ctx.next,
+ struct ksmbd_crypto_ctx,
+ list);
+ list_del(&ctx->list);
+ ctx_free(ctx);
+ }
+}
+
+int ksmbd_crypto_create(void)
+{
+ struct ksmbd_crypto_ctx *ctx;
+
+ spin_lock_init(&ctx_list.ctx_lock);
+ INIT_LIST_HEAD(&ctx_list.idle_ctx);
+ init_waitqueue_head(&ctx_list.ctx_wait);
+ ctx_list.avail_ctx = 1;
+
+ ctx = kzalloc(sizeof(struct ksmbd_crypto_ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ list_add(&ctx->list, &ctx_list.idle_ctx);
+ return 0;
+}
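+
+/*
+ * Usage sketch (illustrative only, not part of the patch): a caller
+ * borrows a SHA-256 context from the pool, computes a digest and hands
+ * the context back so other threads can reuse it.
+ */
+static int __maybe_unused example_sha256_digest(const u8 *data,
+ unsigned int len, u8 *out)
+{
+ struct ksmbd_crypto_ctx *ctx;
+ int rc;
+
+ ctx = ksmbd_crypto_ctx_find_sha256();
+ if (!ctx)
+ return -ENOMEM;
+ rc = crypto_shash_digest(CRYPTO_SHA256(ctx), data, len, out);
+ ksmbd_release_crypto_ctx(ctx);
+ return rc;
+}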
diff --git a/fs/ksmbd/crypto_ctx.h b/fs/ksmbd/crypto_ctx.h
new file mode 100644
index 000000000000..ef11154b43df
--- /dev/null
+++ b/fs/ksmbd/crypto_ctx.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2019 Samsung Electronics Co., Ltd.
+ */
+
+#ifndef __CRYPTO_CTX_H__
+#define __CRYPTO_CTX_H__
+
+#include <crypto/hash.h>
+#include <crypto/aead.h>
+
+enum {
+ CRYPTO_SHASH_HMACMD5 = 0,
+ CRYPTO_SHASH_HMACSHA256,
+ CRYPTO_SHASH_CMACAES,
+ CRYPTO_SHASH_SHA256,
+ CRYPTO_SHASH_SHA512,
+ CRYPTO_SHASH_MD4,
+ CRYPTO_SHASH_MD5,
+ CRYPTO_SHASH_MAX,
+};
+
+enum {
+ CRYPTO_AEAD_AES_GCM = 16,
+ CRYPTO_AEAD_AES_CCM,
+ CRYPTO_AEAD_MAX,
+};
+
+enum {
+ CRYPTO_BLK_ECBDES = 32,
+ CRYPTO_BLK_MAX,
+};
+
+struct ksmbd_crypto_ctx {
+ struct list_head list;
+
+ struct shash_desc *desc[CRYPTO_SHASH_MAX];
+ struct crypto_aead *ccmaes[CRYPTO_AEAD_MAX];
+};
+
+#define CRYPTO_HMACMD5(c) ((c)->desc[CRYPTO_SHASH_HMACMD5])
+#define CRYPTO_HMACSHA256(c) ((c)->desc[CRYPTO_SHASH_HMACSHA256])
+#define CRYPTO_CMACAES(c) ((c)->desc[CRYPTO_SHASH_CMACAES])
+#define CRYPTO_SHA256(c) ((c)->desc[CRYPTO_SHASH_SHA256])
+#define CRYPTO_SHA512(c) ((c)->desc[CRYPTO_SHASH_SHA512])
+#define CRYPTO_MD4(c) ((c)->desc[CRYPTO_SHASH_MD4])
+#define CRYPTO_MD5(c) ((c)->desc[CRYPTO_SHASH_MD5])
+
+#define CRYPTO_HMACMD5_TFM(c) ((c)->desc[CRYPTO_SHASH_HMACMD5]->tfm)
+#define CRYPTO_HMACSHA256_TFM(c)\
+ ((c)->desc[CRYPTO_SHASH_HMACSHA256]->tfm)
+#define CRYPTO_CMACAES_TFM(c) ((c)->desc[CRYPTO_SHASH_CMACAES]->tfm)
+#define CRYPTO_SHA256_TFM(c) ((c)->desc[CRYPTO_SHASH_SHA256]->tfm)
+#define CRYPTO_SHA512_TFM(c) ((c)->desc[CRYPTO_SHASH_SHA512]->tfm)
+#define CRYPTO_MD4_TFM(c) ((c)->desc[CRYPTO_SHASH_MD4]->tfm)
+#define CRYPTO_MD5_TFM(c) ((c)->desc[CRYPTO_SHASH_MD5]->tfm)
+
+#define CRYPTO_GCM(c) ((c)->ccmaes[CRYPTO_AEAD_AES_GCM])
+#define CRYPTO_CCM(c) ((c)->ccmaes[CRYPTO_AEAD_AES_CCM])
+
+void ksmbd_release_crypto_ctx(struct ksmbd_crypto_ctx *ctx);
+struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_hmacmd5(void);
+struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_hmacsha256(void);
+struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_cmacaes(void);
+struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha512(void);
+struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha256(void);
+struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_md4(void);
+struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_md5(void);
+struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_gcm(void);
+struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_ccm(void);
+void ksmbd_crypto_destroy(void);
+int ksmbd_crypto_create(void);
+
+#endif /* __CRYPTO_CTX_H__ */
diff --git a/fs/ksmbd/glob.h b/fs/ksmbd/glob.h
new file mode 100644
index 000000000000..49a5a3afa118
--- /dev/null
+++ b/fs/ksmbd/glob.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org>
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#ifndef __KSMBD_GLOB_H
+#define __KSMBD_GLOB_H
+
+#include <linux/ctype.h>
+
+#include "unicode.h"
+#include "vfs_cache.h"
+
+#define KSMBD_VERSION "3.1.9"
+
+extern int ksmbd_debug_types;
+
+#define KSMBD_DEBUG_SMB BIT(0)
+#define KSMBD_DEBUG_AUTH BIT(1)
+#define KSMBD_DEBUG_VFS BIT(2)
+#define KSMBD_DEBUG_OPLOCK BIT(3)
+#define KSMBD_DEBUG_IPC BIT(4)
+#define KSMBD_DEBUG_CONN BIT(5)
+#define KSMBD_DEBUG_RDMA BIT(6)
+#define KSMBD_DEBUG_ALL (KSMBD_DEBUG_SMB | KSMBD_DEBUG_AUTH | \
+ KSMBD_DEBUG_VFS | KSMBD_DEBUG_OPLOCK | \
+ KSMBD_DEBUG_IPC | KSMBD_DEBUG_CONN | \
+ KSMBD_DEBUG_RDMA)
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#ifdef SUBMOD_NAME
+#define pr_fmt(fmt) "ksmbd: " SUBMOD_NAME ": " fmt
+#else
+#define pr_fmt(fmt) "ksmbd: " fmt
+#endif
+
+#define ksmbd_debug(type, fmt, ...) \
+ do { \
+ if (ksmbd_debug_types & KSMBD_DEBUG_##type) \
+ pr_info(fmt, ##__VA_ARGS__); \
+ } while (0)
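+
+/*
+ * Example (illustrative): the call below prints only when the CONN debug
+ * class is enabled in ksmbd_debug_types.
+ *
+ *   ksmbd_debug(CONN, "RFC1002 header %u bytes\n", pdu_size);
+ */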
+
+#define UNICODE_LEN(x) ((x) * 2)
+
+#endif /* __KSMBD_GLOB_H */
diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h
new file mode 100644
index 000000000000..2fbe2bc1e093
--- /dev/null
+++ b/fs/ksmbd/ksmbd_netlink.h
@@ -0,0 +1,395 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ *
+ * linux-ksmbd-devel@lists.sourceforge.net
+ */
+
+#ifndef _LINUX_KSMBD_SERVER_H
+#define _LINUX_KSMBD_SERVER_H
+
+#include <linux/types.h>
+
+/*
+ * This is a userspace ABI to communicate data between ksmbd and user IPC
+ * daemon using netlink. This is added to track and cache user account DB
+ * and share configuration info from userspace.
+ *
+ * - KSMBD_EVENT_HEARTBEAT_REQUEST(ksmbd_heartbeat)
+ * This event is to check whether the user IPC daemon is alive. If the
+ * daemon is dead, ksmbd keeps existing connections until they disconnect
+ * and denies new connections.
+ *
+ * - KSMBD_EVENT_STARTING_UP(ksmbd_startup_request)
+ * This event is to receive the information that initializes the ksmbd
+ * server from the user IPC daemon and to start the server. The global
+ * section parameters are given from smb.conf as initialization
+ * information.
+ *
+ * - KSMBD_EVENT_SHUTTING_DOWN(ksmbd_shutdown_request)
+ * This event is to shutdown ksmbd server.
+ *
+ * - KSMBD_EVENT_LOGIN_REQUEST/RESPONSE(ksmbd_login_request/response)
+ * This event is to get user account info from the user IPC daemon.
+ *
+ * - KSMBD_EVENT_SHARE_CONFIG_REQUEST/RESPONSE(ksmbd_share_config_request/response)
+ * This event is to get net share configuration info.
+ *
+ * - KSMBD_EVENT_TREE_CONNECT_REQUEST/RESPONSE(ksmbd_tree_connect_request/response)
+ * This event is to get session and tree connect info.
+ *
+ * - KSMBD_EVENT_TREE_DISCONNECT_REQUEST(ksmbd_tree_disconnect_request)
+ * This event is to send tree disconnect info to user IPC daemon.
+ *
+ * - KSMBD_EVENT_LOGOUT_REQUEST(ksmbd_logout_request)
+ * This event is to send logout request to user IPC daemon.
+ *
+ * - KSMBD_EVENT_RPC_REQUEST/RESPONSE(ksmbd_rpc_command)
+ * This event is to make DCE/RPC request like srvsvc, wkssvc, lsarpc,
+ * samr to be processed in userspace.
+ *
+ * - KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST/RESPONSE(ksmbd_spnego_authen_request/response)
+ * This event is to make kerberos authentication to be processed in
+ * userspace.
+ */
+
+#define KSMBD_GENL_NAME "SMBD_GENL"
+#define KSMBD_GENL_VERSION 0x01
+
+#define KSMBD_REQ_MAX_ACCOUNT_NAME_SZ 48
+#define KSMBD_REQ_MAX_HASH_SZ 18
+#define KSMBD_REQ_MAX_SHARE_NAME 64
+
+/*
+ * IPC heartbeat frame to check whether user IPC daemon is alive.
+ */
+struct ksmbd_heartbeat {
+ __u32 handle;
+};
+
+/*
+ * Global config flags.
+ */
+#define KSMBD_GLOBAL_FLAG_INVALID (0)
+#define KSMBD_GLOBAL_FLAG_SMB2_LEASES BIT(0)
+#define KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION BIT(1)
+#define KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL BIT(2)
+
+/*
+ * IPC request for ksmbd server startup
+ */
+struct ksmbd_startup_request {
+ __u32 flags; /* Flags for global config */
+ __s32 signing; /* Signing enabled */
+ __s8 min_prot[16]; /* The minimum SMB protocol version */
+ __s8 max_prot[16]; /* The maximum SMB protocol version */
+ __s8 netbios_name[16];
+ __s8 work_group[64]; /* Workgroup */
+ __s8 server_string[64]; /* Server string */
+ __u16 tcp_port; /* tcp port */
+ __u16 ipc_timeout; /*
+ * specifies the number of seconds
+ * server will wait for the userspace to
+ * reply to heartbeat frames.
+ */
+ __u32 deadtime; /* Number of minutes of inactivity */
+ __u32 file_max; /* Limits the maximum number of open files */
+ __u32 smb2_max_write; /* MAX write size */
+ __u32 smb2_max_read; /* MAX read size */
+ __u32 smb2_max_trans; /* MAX trans size */
+ __u32 share_fake_fscaps; /*
+ * Support some special application that
+ * makes QFSINFO calls to check whether
+ * we set the SPARSE_FILES bit (0x40).
+ */
+ __u32 sub_auth[3]; /* Subauth value for Security ID */
+ __u32 ifc_list_sz; /* interfaces list size */
+ __s8 ____payload[];
+};
+
+#define KSMBD_STARTUP_CONFIG_INTERFACES(s) ((s)->____payload)
+
+/*
+ * IPC request to shutdown ksmbd server.
+ */
+struct ksmbd_shutdown_request {
+ __s32 reserved;
+};
+
+/*
+ * IPC user login request.
+ */
+struct ksmbd_login_request {
+ __u32 handle;
+ __s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; /* user account name */
+};
+
+/*
+ * IPC user login response.
+ */
+struct ksmbd_login_response {
+ __u32 handle;
+ __u32 gid; /* group id */
+ __u32 uid; /* user id */
+ __s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; /* user account name */
+ __u16 status;
+ __u16 hash_sz; /* hash size */
+ __s8 hash[KSMBD_REQ_MAX_HASH_SZ]; /* password hash */
+};
+
+/*
+ * IPC request to fetch net share config.
+ */
+struct ksmbd_share_config_request {
+ __u32 handle;
+ __s8 share_name[KSMBD_REQ_MAX_SHARE_NAME]; /* share name */
+};
+
+/*
+ * IPC response to the net share config request.
+ */
+struct ksmbd_share_config_response {
+ __u32 handle;
+ __u32 flags;
+ __u16 create_mask;
+ __u16 directory_mask;
+ __u16 force_create_mode;
+ __u16 force_directory_mode;
+ __u16 force_uid;
+ __u16 force_gid;
+ __u32 veto_list_sz;
+ __s8 ____payload[];
+};
+
+#define KSMBD_SHARE_CONFIG_VETO_LIST(s) ((s)->____payload)
+
+static inline char *
+ksmbd_share_config_path(struct ksmbd_share_config_response *sc)
+{
+ char *p = sc->____payload;
+
+ if (sc->veto_list_sz)
+ p += sc->veto_list_sz + 1;
+
+ return p;
+}
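+
+/*
+ * Illustrative layout (an assumption inferred from the helper above, not
+ * stated by the patch): with two veto patterns the payload could be
+ *
+ *   ____payload:  "*.tmp\0" "*.bak\0" "\0" "/srv/share\0"
+ *   veto_list_sz: 12
+ *
+ * so ksmbd_share_config_path() skips veto_list_sz + 1 bytes and returns a
+ * pointer to "/srv/share".
+ */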
+
+/*
+ * IPC request for tree connection. This request includes session and tree
+ * connect info from the client.
+ */
+struct ksmbd_tree_connect_request {
+ __u32 handle;
+ __u16 account_flags;
+ __u16 flags;
+ __u64 session_id;
+ __u64 connect_id;
+ __s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ];
+ __s8 share[KSMBD_REQ_MAX_SHARE_NAME];
+ __s8 peer_addr[64];
+};
+
+/*
+ * IPC Response structure for tree connection.
+ */
+struct ksmbd_tree_connect_response {
+ __u32 handle;
+ __u16 status;
+ __u16 connection_flags;
+};
+
+/*
+ * IPC request structure to disconnect a tree connection.
+ */
+struct ksmbd_tree_disconnect_request {
+ __u64 session_id; /* session id */
+ __u64 connect_id; /* tree connection id */
+};
+
+/*
+ * IPC request structure to log out a user account.
+ */
+struct ksmbd_logout_request {
+ __s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; /* user account name */
+};
+
+/*
+ * RPC command structure to send rpc request like srvsvc or wkssvc to
+ * IPC user daemon.
+ */
+struct ksmbd_rpc_command {
+ __u32 handle;
+ __u32 flags;
+ __u32 payload_sz;
+ __u8 payload[];
+};
+
+/*
+ * IPC request for Kerberos authentication
+ */
+struct ksmbd_spnego_authen_request {
+ __u32 handle;
+ __u16 spnego_blob_len; /* the length of spnego_blob */
+ __u8 spnego_blob[0]; /*
+ * the GSS token from SecurityBuffer of
+ * SMB2 SESSION SETUP request
+ */
+};
+
+/*
+ * Response data which includes the GSS token and the session key generated by
+ * user daemon.
+ */
+struct ksmbd_spnego_authen_response {
+ __u32 handle;
+ struct ksmbd_login_response login_response; /*
+ * the login response with
+ * a user identified by the
+ * GSS token from a client
+ */
+ __u16 session_key_len; /* the length of the session key */
+ __u16 spnego_blob_len; /*
+ * the length of the GSS token which will be
+ * stored in SecurityBuffer of SMB2 SESSION
+ * SETUP response
+ */
+ __u8 payload[]; /* session key + AP_REP */
+};
+
+/*
+ * These values are also used as NETLINK attribute types.
+ *
+ * NOTE:
+ * A response message type value should be equal to the corresponding
+ * request message type value + 1 (see the small sketch after this enum).
+ */
+enum ksmbd_event {
+ KSMBD_EVENT_UNSPEC = 0,
+ KSMBD_EVENT_HEARTBEAT_REQUEST,
+
+ KSMBD_EVENT_STARTING_UP,
+ KSMBD_EVENT_SHUTTING_DOWN,
+
+ KSMBD_EVENT_LOGIN_REQUEST,
+ KSMBD_EVENT_LOGIN_RESPONSE = 5,
+
+ KSMBD_EVENT_SHARE_CONFIG_REQUEST,
+ KSMBD_EVENT_SHARE_CONFIG_RESPONSE,
+
+ KSMBD_EVENT_TREE_CONNECT_REQUEST,
+ KSMBD_EVENT_TREE_CONNECT_RESPONSE,
+
+ KSMBD_EVENT_TREE_DISCONNECT_REQUEST = 10,
+
+ KSMBD_EVENT_LOGOUT_REQUEST,
+
+ KSMBD_EVENT_RPC_REQUEST,
+ KSMBD_EVENT_RPC_RESPONSE,
+
+ KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST,
+ KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE = 15,
+
+ KSMBD_EVENT_MAX
+};
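+
+/*
+ * Sketch (illustrative, hypothetical helper not defined by this patch):
+ * for the paired request/response events above, the response type can be
+ * derived directly from the request type.
+ */
+static inline int ksmbd_event_response(int request_event)
+{
+ return request_event + 1;
+}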
+
+/*
+ * Enumeration for IPC tree connect status.
+ */
+enum KSMBD_TREE_CONN_STATUS {
+ KSMBD_TREE_CONN_STATUS_OK = 0,
+ KSMBD_TREE_CONN_STATUS_NOMEM,
+ KSMBD_TREE_CONN_STATUS_NO_SHARE,
+ KSMBD_TREE_CONN_STATUS_NO_USER,
+ KSMBD_TREE_CONN_STATUS_INVALID_USER,
+ KSMBD_TREE_CONN_STATUS_HOST_DENIED = 5,
+ KSMBD_TREE_CONN_STATUS_CONN_EXIST,
+ KSMBD_TREE_CONN_STATUS_TOO_MANY_CONNS,
+ KSMBD_TREE_CONN_STATUS_TOO_MANY_SESSIONS,
+ KSMBD_TREE_CONN_STATUS_ERROR,
+};
+
+/*
+ * User config flags.
+ */
+#define KSMBD_USER_FLAG_INVALID (0)
+#define KSMBD_USER_FLAG_OK BIT(0)
+#define KSMBD_USER_FLAG_BAD_PASSWORD BIT(1)
+#define KSMBD_USER_FLAG_BAD_UID BIT(2)
+#define KSMBD_USER_FLAG_BAD_USER BIT(3)
+#define KSMBD_USER_FLAG_GUEST_ACCOUNT BIT(4)
+
+/*
+ * Share config flags.
+ */
+#define KSMBD_SHARE_FLAG_INVALID (0)
+#define KSMBD_SHARE_FLAG_AVAILABLE BIT(0)
+#define KSMBD_SHARE_FLAG_BROWSEABLE BIT(1)
+#define KSMBD_SHARE_FLAG_WRITEABLE BIT(2)
+#define KSMBD_SHARE_FLAG_READONLY BIT(3)
+#define KSMBD_SHARE_FLAG_GUEST_OK BIT(4)
+#define KSMBD_SHARE_FLAG_GUEST_ONLY BIT(5)
+#define KSMBD_SHARE_FLAG_STORE_DOS_ATTRS BIT(6)
+#define KSMBD_SHARE_FLAG_OPLOCKS BIT(7)
+#define KSMBD_SHARE_FLAG_PIPE BIT(8)
+#define KSMBD_SHARE_FLAG_HIDE_DOT_FILES BIT(9)
+#define KSMBD_SHARE_FLAG_INHERIT_OWNER BIT(10)
+#define KSMBD_SHARE_FLAG_STREAMS BIT(11)
+#define KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS BIT(12)
+#define KSMBD_SHARE_FLAG_ACL_XATTR BIT(13)
+
+/*
+ * Tree connect request flags.
+ */
+#define KSMBD_TREE_CONN_FLAG_REQUEST_SMB1 (0)
+#define KSMBD_TREE_CONN_FLAG_REQUEST_IPV6 BIT(0)
+#define KSMBD_TREE_CONN_FLAG_REQUEST_SMB2 BIT(1)
+
+/*
+ * Tree connect flags.
+ */
+#define KSMBD_TREE_CONN_FLAG_GUEST_ACCOUNT BIT(0)
+#define KSMBD_TREE_CONN_FLAG_READ_ONLY BIT(1)
+#define KSMBD_TREE_CONN_FLAG_WRITABLE BIT(2)
+#define KSMBD_TREE_CONN_FLAG_ADMIN_ACCOUNT BIT(3)
+
+/*
+ * RPC over IPC.
+ */
+#define KSMBD_RPC_METHOD_RETURN BIT(0)
+#define KSMBD_RPC_SRVSVC_METHOD_INVOKE BIT(1)
+#define KSMBD_RPC_SRVSVC_METHOD_RETURN (KSMBD_RPC_SRVSVC_METHOD_INVOKE | KSMBD_RPC_METHOD_RETURN)
+#define KSMBD_RPC_WKSSVC_METHOD_INVOKE BIT(2)
+#define KSMBD_RPC_WKSSVC_METHOD_RETURN (KSMBD_RPC_WKSSVC_METHOD_INVOKE | KSMBD_RPC_METHOD_RETURN)
+#define KSMBD_RPC_IOCTL_METHOD (BIT(3) | KSMBD_RPC_METHOD_RETURN)
+#define KSMBD_RPC_OPEN_METHOD BIT(4)
+#define KSMBD_RPC_WRITE_METHOD BIT(5)
+#define KSMBD_RPC_READ_METHOD (BIT(6) | KSMBD_RPC_METHOD_RETURN)
+#define KSMBD_RPC_CLOSE_METHOD BIT(7)
+#define KSMBD_RPC_RAP_METHOD (BIT(8) | KSMBD_RPC_METHOD_RETURN)
+#define KSMBD_RPC_RESTRICTED_CONTEXT BIT(9)
+#define KSMBD_RPC_SAMR_METHOD_INVOKE BIT(10)
+#define KSMBD_RPC_SAMR_METHOD_RETURN (KSMBD_RPC_SAMR_METHOD_INVOKE | KSMBD_RPC_METHOD_RETURN)
+#define KSMBD_RPC_LSARPC_METHOD_INVOKE BIT(11)
+#define KSMBD_RPC_LSARPC_METHOD_RETURN (KSMBD_RPC_LSARPC_METHOD_INVOKE | KSMBD_RPC_METHOD_RETURN)
+
+/*
+ * RPC status definitions.
+ */
+#define KSMBD_RPC_OK 0
+#define KSMBD_RPC_EBAD_FUNC 0x00000001
+#define KSMBD_RPC_EACCESS_DENIED 0x00000005
+#define KSMBD_RPC_EBAD_FID 0x00000006
+#define KSMBD_RPC_ENOMEM 0x00000008
+#define KSMBD_RPC_EBAD_DATA 0x0000000D
+#define KSMBD_RPC_ENOTIMPLEMENTED 0x00000040
+#define KSMBD_RPC_EINVALID_PARAMETER 0x00000057
+#define KSMBD_RPC_EMORE_DATA 0x000000EA
+#define KSMBD_RPC_EINVALID_LEVEL 0x0000007C
+#define KSMBD_RPC_SOME_NOT_MAPPED 0x00000107
+
+#define KSMBD_CONFIG_OPT_DISABLED 0
+#define KSMBD_CONFIG_OPT_ENABLED 1
+#define KSMBD_CONFIG_OPT_AUTO 2
+#define KSMBD_CONFIG_OPT_MANDATORY 3
+
+#endif /* _LINUX_KSMBD_SERVER_H */
diff --git a/fs/ksmbd/ksmbd_spnego_negtokeninit.asn1 b/fs/ksmbd/ksmbd_spnego_negtokeninit.asn1
new file mode 100644
index 000000000000..0065f191b54b
--- /dev/null
+++ b/fs/ksmbd/ksmbd_spnego_negtokeninit.asn1
@@ -0,0 +1,31 @@
+GSSAPI ::=
+ [APPLICATION 0] IMPLICIT SEQUENCE {
+ thisMech
+ OBJECT IDENTIFIER ({ksmbd_gssapi_this_mech}),
+ negotiationToken
+ NegotiationToken
+ }
+
+MechType ::= OBJECT IDENTIFIER ({ksmbd_neg_token_init_mech_type})
+
+MechTypeList ::= SEQUENCE OF MechType
+
+NegTokenInit ::=
+ SEQUENCE {
+ mechTypes
+ [0] MechTypeList,
+ reqFlags
+ [1] BIT STRING OPTIONAL,
+ mechToken
+ [2] OCTET STRING OPTIONAL ({ksmbd_neg_token_init_mech_token}),
+ mechListMIC
+ [3] OCTET STRING OPTIONAL
+ }
+
+NegotiationToken ::=
+ CHOICE {
+ negTokenInit
+ [0] NegTokenInit,
+ negTokenTarg
+ [1] ANY
+ }
diff --git a/fs/ksmbd/ksmbd_spnego_negtokentarg.asn1 b/fs/ksmbd/ksmbd_spnego_negtokentarg.asn1
new file mode 100644
index 000000000000..1151933e7b9c
--- /dev/null
+++ b/fs/ksmbd/ksmbd_spnego_negtokentarg.asn1
@@ -0,0 +1,19 @@
+GSSAPI ::=
+ CHOICE {
+ negTokenInit
+ [0] ANY,
+ negTokenTarg
+ [1] NegTokenTarg
+ }
+
+NegTokenTarg ::=
+ SEQUENCE {
+ negResult
+ [0] ENUMERATED OPTIONAL,
+ supportedMech
+ [1] OBJECT IDENTIFIER OPTIONAL,
+ responseToken
+ [2] OCTET STRING OPTIONAL ({ksmbd_neg_token_targ_resp_token}),
+ mechListMIC
+ [3] OCTET STRING OPTIONAL
+ }
diff --git a/fs/ksmbd/ksmbd_work.c b/fs/ksmbd/ksmbd_work.c
new file mode 100644
index 000000000000..fd58eb4809f6
--- /dev/null
+++ b/fs/ksmbd/ksmbd_work.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2019 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include "server.h"
+#include "connection.h"
+#include "ksmbd_work.h"
+#include "mgmt/ksmbd_ida.h"
+
+static struct kmem_cache *work_cache;
+static struct workqueue_struct *ksmbd_wq;
+
+struct ksmbd_work *ksmbd_alloc_work_struct(void)
+{
+ struct ksmbd_work *work = kmem_cache_zalloc(work_cache, GFP_KERNEL);
+
+ if (work) {
+ work->compound_fid = KSMBD_NO_FID;
+ work->compound_pfid = KSMBD_NO_FID;
+ INIT_LIST_HEAD(&work->request_entry);
+ INIT_LIST_HEAD(&work->async_request_entry);
+ INIT_LIST_HEAD(&work->fp_entry);
+ INIT_LIST_HEAD(&work->interim_entry);
+ }
+ return work;
+}
+
+void ksmbd_free_work_struct(struct ksmbd_work *work)
+{
+ WARN_ON(work->saved_cred != NULL);
+
+ kvfree(work->response_buf);
+ kvfree(work->aux_payload_buf);
+ kfree(work->tr_buf);
+ kvfree(work->request_buf);
+ if (work->async_id)
+ ksmbd_release_id(&work->conn->async_ida, work->async_id);
+ kmem_cache_free(work_cache, work);
+}
+
+void ksmbd_work_pool_destroy(void)
+{
+ kmem_cache_destroy(work_cache);
+}
+
+int ksmbd_work_pool_init(void)
+{
+ work_cache = kmem_cache_create("ksmbd_work_cache",
+ sizeof(struct ksmbd_work), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!work_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+int ksmbd_workqueue_init(void)
+{
+ ksmbd_wq = alloc_workqueue("ksmbd-io", 0, 0);
+ if (!ksmbd_wq)
+ return -ENOMEM;
+ return 0;
+}
+
+void ksmbd_workqueue_destroy(void)
+{
+ flush_workqueue(ksmbd_wq);
+ destroy_workqueue(ksmbd_wq);
+ ksmbd_wq = NULL;
+}
+
+bool ksmbd_queue_work(struct ksmbd_work *work)
+{
+ return queue_work(ksmbd_wq, &work->work);
+}
diff --git a/fs/ksmbd/ksmbd_work.h b/fs/ksmbd/ksmbd_work.h
new file mode 100644
index 000000000000..f7156bc50049
--- /dev/null
+++ b/fs/ksmbd/ksmbd_work.h
@@ -0,0 +1,117 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2019 Samsung Electronics Co., Ltd.
+ */
+
+#ifndef __KSMBD_WORK_H__
+#define __KSMBD_WORK_H__
+
+#include <linux/ctype.h>
+#include <linux/workqueue.h>
+
+struct ksmbd_conn;
+struct ksmbd_session;
+struct ksmbd_tree_connect;
+
+enum {
+ KSMBD_WORK_ACTIVE = 0,
+ KSMBD_WORK_CANCELLED,
+ KSMBD_WORK_CLOSED,
+};
+
+/* one of these for every pending CIFS request at the connection */
+struct ksmbd_work {
+ /* Server corresponding to this mid */
+ struct ksmbd_conn *conn;
+ struct ksmbd_session *sess;
+ struct ksmbd_tree_connect *tcon;
+
+ /* Pointer to received SMB header */
+ void *request_buf;
+ /* Response buffer */
+ void *response_buf;
+
+ /* Read data buffer */
+ void *aux_payload_buf;
+
+ /* Next cmd hdr in compound req buf */
+ int next_smb2_rcv_hdr_off;
+ /* Next cmd hdr in compound rsp buf */
+ int next_smb2_rsp_hdr_off;
+
+ /*
+ * Current local FID assigned to the compound response if an SMB2 CREATE
+ * command is present in the compound request
+ */
+ u64 compound_fid;
+ u64 compound_pfid;
+ u64 compound_sid;
+
+ const struct cred *saved_cred;
+
+ /* Number of granted credits */
+ unsigned int credits_granted;
+
+ /* response smb header size */
+ unsigned int resp_hdr_sz;
+ unsigned int response_sz;
+ /* Read data count */
+ unsigned int aux_payload_sz;
+
+ void *tr_buf;
+
+ unsigned char state;
+ /* Multiple responses for one request e.g. SMB ECHO */
+ bool multiRsp:1;
+ /* No response for cancelled request */
+ bool send_no_response:1;
+ /* Request is encrypted */
+ bool encrypted:1;
+ /* Is this SYNC or ASYNC ksmbd_work */
+ bool syncronous:1;
+ bool need_invalidate_rkey:1;
+
+ unsigned int remote_key;
+ /* cancel works */
+ int async_id;
+ void **cancel_argv;
+ void (*cancel_fn)(void **argv);
+
+ struct work_struct work;
+ /* List head at conn->requests */
+ struct list_head request_entry;
+ /* List head at conn->async_requests */
+ struct list_head async_request_entry;
+ struct list_head fp_entry;
+ struct list_head interim_entry;
+};
+
+/**
+ * ksmbd_resp_buf_next - Get next buffer on compound response.
+ * @work: smb work containing response buffer
+ */
+static inline void *ksmbd_resp_buf_next(struct ksmbd_work *work)
+{
+ return work->response_buf + work->next_smb2_rsp_hdr_off;
+}
+
+/**
+ * ksmbd_req_buf_next - Get next buffer on compound request.
+ * @work: smb work containing request buffer
+ */
+static inline void *ksmbd_req_buf_next(struct ksmbd_work *work)
+{
+ return work->request_buf + work->next_smb2_rcv_hdr_off;
+}
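+
+/*
+ * Illustrative note (assumption about intended use, not stated by the
+ * patch): while parsing a compound request, next_smb2_rcv_hdr_off is
+ * advanced by the NextCommand offset of the current SMB2 header, after
+ * which ksmbd_req_buf_next() points at the next command in the buffer.
+ */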
+
+struct ksmbd_work *ksmbd_alloc_work_struct(void);
+void ksmbd_free_work_struct(struct ksmbd_work *work);
+
+void ksmbd_work_pool_destroy(void);
+int ksmbd_work_pool_init(void);
+
+int ksmbd_workqueue_init(void);
+void ksmbd_workqueue_destroy(void);
+bool ksmbd_queue_work(struct ksmbd_work *work);
+
+#endif /* __KSMBD_WORK_H__ */
diff --git a/fs/ksmbd/mgmt/ksmbd_ida.c b/fs/ksmbd/mgmt/ksmbd_ida.c
new file mode 100644
index 000000000000..54194d959a5e
--- /dev/null
+++ b/fs/ksmbd/mgmt/ksmbd_ida.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#include "ksmbd_ida.h"
+
+static inline int __acquire_id(struct ida *ida, int from, int to)
+{
+ return ida_simple_get(ida, from, to, GFP_KERNEL);
+}
+
+int ksmbd_acquire_smb2_tid(struct ida *ida)
+{
+ int id;
+
+ id = __acquire_id(ida, 1, 0xFFFFFFFF);
+
+ return id;
+}
+
+int ksmbd_acquire_smb2_uid(struct ida *ida)
+{
+ int id;
+
+ id = __acquire_id(ida, 1, 0);
+ if (id == 0xFFFE)
+ id = __acquire_id(ida, 1, 0);
+
+ return id;
+}
+
+int ksmbd_acquire_async_msg_id(struct ida *ida)
+{
+ return __acquire_id(ida, 1, 0);
+}
+
+int ksmbd_acquire_id(struct ida *ida)
+{
+ return __acquire_id(ida, 0, 0);
+}
+
+void ksmbd_release_id(struct ida *ida, int id)
+{
+ ida_simple_remove(ida, id);
+}
diff --git a/fs/ksmbd/mgmt/ksmbd_ida.h b/fs/ksmbd/mgmt/ksmbd_ida.h
new file mode 100644
index 000000000000..2bc07b16cfde
--- /dev/null
+++ b/fs/ksmbd/mgmt/ksmbd_ida.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#ifndef __KSMBD_IDA_MANAGEMENT_H__
+#define __KSMBD_IDA_MANAGEMENT_H__
+
+#include <linux/slab.h>
+#include <linux/idr.h>
+
+/*
+ * 2.2.1.6.7 TID Generation
+ * The value 0xFFFF MUST NOT be used as a valid TID. All other
+ * possible values for TID, including zero (0x0000), are valid.
+ * The value 0xFFFF is used to specify all TIDs or no TID,
+ * depending upon the context in which it is used.
+ */
+int ksmbd_acquire_smb2_tid(struct ida *ida);
+
+/*
+ * 2.2.1.6.8 UID Generation
+ * The value 0xFFFE was declared reserved in the LAN Manager 1.0
+ * documentation, so a value of 0xFFFE SHOULD NOT be used as a
+ * valid UID.<21> All other possible values for a UID, excluding
+ * zero (0x0000), are valid.
+ */
+int ksmbd_acquire_smb2_uid(struct ida *ida);
+int ksmbd_acquire_async_msg_id(struct ida *ida);
+
+int ksmbd_acquire_id(struct ida *ida);
+
+void ksmbd_release_id(struct ida *ida, int id);
+#endif /* __KSMBD_IDA_MANAGEMENT_H__ */
diff --git a/fs/ksmbd/mgmt/share_config.c b/fs/ksmbd/mgmt/share_config.c
new file mode 100644
index 000000000000..cb72d30f5b71
--- /dev/null
+++ b/fs/ksmbd/mgmt/share_config.c
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+#include <linux/parser.h>
+#include <linux/namei.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+
+#include "share_config.h"
+#include "user_config.h"
+#include "user_session.h"
+#include "../transport_ipc.h"
+
+#define SHARE_HASH_BITS 3
+static DEFINE_HASHTABLE(shares_table, SHARE_HASH_BITS);
+static DECLARE_RWSEM(shares_table_lock);
+
+struct ksmbd_veto_pattern {
+ char *pattern;
+ struct list_head list;
+};
+
+static unsigned int share_name_hash(char *name)
+{
+ return jhash(name, strlen(name), 0);
+}
+
+static void kill_share(struct ksmbd_share_config *share)
+{
+ while (!list_empty(&share->veto_list)) {
+ struct ksmbd_veto_pattern *p;
+
+ p = list_entry(share->veto_list.next,
+ struct ksmbd_veto_pattern,
+ list);
+ list_del(&p->list);
+ kfree(p->pattern);
+ kfree(p);
+ }
+
+ if (share->path)
+ path_put(&share->vfs_path);
+ kfree(share->name);
+ kfree(share->path);
+ kfree(share);
+}
+
+void __ksmbd_share_config_put(struct ksmbd_share_config *share)
+{
+ down_write(&shares_table_lock);
+ hash_del(&share->hlist);
+ up_write(&shares_table_lock);
+
+ kill_share(share);
+}
+
+static struct ksmbd_share_config *
+__get_share_config(struct ksmbd_share_config *share)
+{
+ if (!atomic_inc_not_zero(&share->refcount))
+ return NULL;
+ return share;
+}
+
+static struct ksmbd_share_config *__share_lookup(char *name)
+{
+ struct ksmbd_share_config *share;
+ unsigned int key = share_name_hash(name);
+
+ hash_for_each_possible(shares_table, share, hlist, key) {
+ if (!strcmp(name, share->name))
+ return share;
+ }
+ return NULL;
+}
+
+static int parse_veto_list(struct ksmbd_share_config *share,
+ char *veto_list,
+ int veto_list_sz)
+{
+ int sz = 0;
+
+ if (!veto_list_sz)
+ return 0;
+
+ while (veto_list_sz > 0) {
+ struct ksmbd_veto_pattern *p;
+
+ sz = strlen(veto_list);
+ if (!sz)
+ break;
+
+ p = kzalloc(sizeof(struct ksmbd_veto_pattern), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ p->pattern = kstrdup(veto_list, GFP_KERNEL);
+ if (!p->pattern) {
+ kfree(p);
+ return -ENOMEM;
+ }
+
+ list_add(&p->list, &share->veto_list);
+
+ veto_list += sz + 1;
+ veto_list_sz -= (sz + 1);
+ }
+
+ return 0;
+}
+
+static struct ksmbd_share_config *share_config_request(char *name)
+{
+ struct ksmbd_share_config_response *resp;
+ struct ksmbd_share_config *share = NULL;
+ struct ksmbd_share_config *lookup;
+ int ret;
+
+ resp = ksmbd_ipc_share_config_request(name);
+ if (!resp)
+ return NULL;
+
+ if (resp->flags == KSMBD_SHARE_FLAG_INVALID)
+ goto out;
+
+ share = kzalloc(sizeof(struct ksmbd_share_config), GFP_KERNEL);
+ if (!share)
+ goto out;
+
+ share->flags = resp->flags;
+ atomic_set(&share->refcount, 1);
+ INIT_LIST_HEAD(&share->veto_list);
+ share->name = kstrdup(name, GFP_KERNEL);
+
+ if (!test_share_config_flag(share, KSMBD_SHARE_FLAG_PIPE)) {
+ share->path = kstrdup(ksmbd_share_config_path(resp),
+ GFP_KERNEL);
+ if (share->path)
+ share->path_sz = strlen(share->path);
+ share->create_mask = resp->create_mask;
+ share->directory_mask = resp->directory_mask;
+ share->force_create_mode = resp->force_create_mode;
+ share->force_directory_mode = resp->force_directory_mode;
+ share->force_uid = resp->force_uid;
+ share->force_gid = resp->force_gid;
+ ret = parse_veto_list(share,
+ KSMBD_SHARE_CONFIG_VETO_LIST(resp),
+ resp->veto_list_sz);
+ if (!ret && share->path) {
+ ret = kern_path(share->path, 0, &share->vfs_path);
+ if (ret) {
+ ksmbd_debug(SMB, "failed to access '%s'\n",
+ share->path);
+ /* Avoid path_put() */
+ kfree(share->path);
+ share->path = NULL;
+ }
+ }
+ if (ret || !share->name) {
+ kill_share(share);
+ share = NULL;
+ goto out;
+ }
+ }
+
+ down_write(&shares_table_lock);
+ lookup = __share_lookup(name);
+ if (lookup)
+ lookup = __get_share_config(lookup);
+ if (!lookup) {
+ hash_add(shares_table, &share->hlist, share_name_hash(name));
+ } else {
+ kill_share(share);
+ share = lookup;
+ }
+ up_write(&shares_table_lock);
+
+out:
+ kvfree(resp);
+ return share;
+}
+
+static void strtolower(char *share_name)
+{
+ while (*share_name) {
+ *share_name = tolower(*share_name);
+ share_name++;
+ }
+}
+
+struct ksmbd_share_config *ksmbd_share_config_get(char *name)
+{
+ struct ksmbd_share_config *share;
+
+ strtolower(name);
+
+ down_read(&shares_table_lock);
+ share = __share_lookup(name);
+ if (share)
+ share = __get_share_config(share);
+ up_read(&shares_table_lock);
+
+ if (share)
+ return share;
+ return share_config_request(name);
+}
+
+bool ksmbd_share_veto_filename(struct ksmbd_share_config *share,
+ const char *filename)
+{
+ struct ksmbd_veto_pattern *p;
+
+ list_for_each_entry(p, &share->veto_list, list) {
+ if (match_wildcard(p->pattern, filename))
+ return true;
+ }
+ return false;
+}
+
+void ksmbd_share_configs_cleanup(void)
+{
+ struct ksmbd_share_config *share;
+ struct hlist_node *tmp;
+ int i;
+
+ down_write(&shares_table_lock);
+ hash_for_each_safe(shares_table, i, tmp, share, hlist) {
+ hash_del(&share->hlist);
+ kill_share(share);
+ }
+ up_write(&shares_table_lock);
+}
diff --git a/fs/ksmbd/mgmt/share_config.h b/fs/ksmbd/mgmt/share_config.h
new file mode 100644
index 000000000000..953befc94e84
--- /dev/null
+++ b/fs/ksmbd/mgmt/share_config.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#ifndef __SHARE_CONFIG_MANAGEMENT_H__
+#define __SHARE_CONFIG_MANAGEMENT_H__
+
+#include <linux/workqueue.h>
+#include <linux/hashtable.h>
+#include <linux/path.h>
+
+struct ksmbd_share_config {
+ char *name;
+ char *path;
+
+ unsigned int path_sz;
+ unsigned int flags;
+ struct list_head veto_list;
+
+ struct path vfs_path;
+
+ atomic_t refcount;
+ struct hlist_node hlist;
+ unsigned short create_mask;
+ unsigned short directory_mask;
+ unsigned short force_create_mode;
+ unsigned short force_directory_mode;
+ unsigned short force_uid;
+ unsigned short force_gid;
+};
+
+#define KSMBD_SHARE_INVALID_UID ((__u16)-1)
+#define KSMBD_SHARE_INVALID_GID ((__u16)-1)
+
+static inline int share_config_create_mode(struct ksmbd_share_config *share,
+ umode_t posix_mode)
+{
+ if (!share->force_create_mode) {
+ if (!posix_mode)
+ return share->create_mask;
+ else
+ return posix_mode & share->create_mask;
+ }
+ return share->force_create_mode & share->create_mask;
+}
+
+static inline int share_config_directory_mode(struct ksmbd_share_config *share,
+ umode_t posix_mode)
+{
+ if (!share->force_directory_mode) {
+ if (!posix_mode)
+ return share->directory_mask;
+ else
+ return posix_mode & share->directory_mask;
+ }
+
+ return share->force_directory_mode & share->directory_mask;
+}
+
+static inline int test_share_config_flag(struct ksmbd_share_config *share,
+ int flag)
+{
+ return share->flags & flag;
+}
+
+void __ksmbd_share_config_put(struct ksmbd_share_config *share);
+
+static inline void ksmbd_share_config_put(struct ksmbd_share_config *share)
+{
+ if (!atomic_dec_and_test(&share->refcount))
+ return;
+ __ksmbd_share_config_put(share);
+}
+
+struct ksmbd_share_config *ksmbd_share_config_get(char *name);
+bool ksmbd_share_veto_filename(struct ksmbd_share_config *share,
+ const char *filename);
+void ksmbd_share_configs_cleanup(void);
+
+#endif /* __SHARE_CONFIG_MANAGEMENT_H__ */
diff --git a/fs/ksmbd/mgmt/tree_connect.c b/fs/ksmbd/mgmt/tree_connect.c
new file mode 100644
index 000000000000..0d28e723a28c
--- /dev/null
+++ b/fs/ksmbd/mgmt/tree_connect.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/xarray.h>
+
+#include "../transport_ipc.h"
+#include "../connection.h"
+
+#include "tree_connect.h"
+#include "user_config.h"
+#include "share_config.h"
+#include "user_session.h"
+
+struct ksmbd_tree_conn_status
+ksmbd_tree_conn_connect(struct ksmbd_session *sess, char *share_name)
+{
+ struct ksmbd_tree_conn_status status = {-EINVAL, NULL};
+ struct ksmbd_tree_connect_response *resp = NULL;
+ struct ksmbd_share_config *sc;
+ struct ksmbd_tree_connect *tree_conn = NULL;
+ struct sockaddr *peer_addr;
+ int ret;
+
+ sc = ksmbd_share_config_get(share_name);
+ if (!sc)
+ return status;
+
+ tree_conn = kzalloc(sizeof(struct ksmbd_tree_connect), GFP_KERNEL);
+ if (!tree_conn) {
+ status.ret = -ENOMEM;
+ goto out_error;
+ }
+
+ tree_conn->id = ksmbd_acquire_tree_conn_id(sess);
+ if (tree_conn->id < 0) {
+ status.ret = -EINVAL;
+ goto out_error;
+ }
+
+ peer_addr = KSMBD_TCP_PEER_SOCKADDR(sess->conn);
+ resp = ksmbd_ipc_tree_connect_request(sess,
+ sc,
+ tree_conn,
+ peer_addr);
+ if (!resp) {
+ status.ret = -EINVAL;
+ goto out_error;
+ }
+
+ status.ret = resp->status;
+ if (status.ret != KSMBD_TREE_CONN_STATUS_OK)
+ goto out_error;
+
+ tree_conn->flags = resp->connection_flags;
+ tree_conn->user = sess->user;
+ tree_conn->share_conf = sc;
+ status.tree_conn = tree_conn;
+
+ ret = xa_err(xa_store(&sess->tree_conns, tree_conn->id, tree_conn,
+ GFP_KERNEL));
+ if (ret) {
+ status.ret = -ENOMEM;
+ goto out_error;
+ }
+ kvfree(resp);
+ return status;
+
+out_error:
+ if (tree_conn)
+ ksmbd_release_tree_conn_id(sess, tree_conn->id);
+ ksmbd_share_config_put(sc);
+ kfree(tree_conn);
+ kvfree(resp);
+ return status;
+}
+
+int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess,
+ struct ksmbd_tree_connect *tree_conn)
+{
+ int ret;
+
+ ret = ksmbd_ipc_tree_disconnect_request(sess->id, tree_conn->id);
+ ksmbd_release_tree_conn_id(sess, tree_conn->id);
+ xa_erase(&sess->tree_conns, tree_conn->id);
+ ksmbd_share_config_put(tree_conn->share_conf);
+ kfree(tree_conn);
+ return ret;
+}
+
+struct ksmbd_tree_connect *ksmbd_tree_conn_lookup(struct ksmbd_session *sess,
+ unsigned int id)
+{
+ return xa_load(&sess->tree_conns, id);
+}
+
+struct ksmbd_share_config *ksmbd_tree_conn_share(struct ksmbd_session *sess,
+ unsigned int id)
+{
+ struct ksmbd_tree_connect *tc;
+
+ tc = ksmbd_tree_conn_lookup(sess, id);
+ if (tc)
+ return tc->share_conf;
+ return NULL;
+}
+
+int ksmbd_tree_conn_session_logoff(struct ksmbd_session *sess)
+{
+ int ret = 0;
+ struct ksmbd_tree_connect *tc;
+ unsigned long id;
+
+ xa_for_each(&sess->tree_conns, id, tc)
+ ret |= ksmbd_tree_conn_disconnect(sess, tc);
+ xa_destroy(&sess->tree_conns);
+ return ret;
+}
diff --git a/fs/ksmbd/mgmt/tree_connect.h b/fs/ksmbd/mgmt/tree_connect.h
new file mode 100644
index 000000000000..18e2a996e0aa
--- /dev/null
+++ b/fs/ksmbd/mgmt/tree_connect.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#ifndef __TREE_CONNECT_MANAGEMENT_H__
+#define __TREE_CONNECT_MANAGEMENT_H__
+
+#include <linux/hashtable.h>
+
+#include "../ksmbd_netlink.h"
+
+struct ksmbd_share_config;
+struct ksmbd_user;
+
+struct ksmbd_tree_connect {
+ int id;
+
+ unsigned int flags;
+ struct ksmbd_share_config *share_conf;
+ struct ksmbd_user *user;
+
+ struct list_head list;
+
+ int maximal_access;
+ bool posix_extensions;
+};
+
+struct ksmbd_tree_conn_status {
+ unsigned int ret;
+ struct ksmbd_tree_connect *tree_conn;
+};
+
+static inline int test_tree_conn_flag(struct ksmbd_tree_connect *tree_conn,
+ int flag)
+{
+ return tree_conn->flags & flag;
+}
+
+struct ksmbd_session;
+
+struct ksmbd_tree_conn_status
+ksmbd_tree_conn_connect(struct ksmbd_session *sess, char *share_name);
+
+int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess,
+ struct ksmbd_tree_connect *tree_conn);
+
+struct ksmbd_tree_connect *ksmbd_tree_conn_lookup(struct ksmbd_session *sess,
+ unsigned int id);
+
+struct ksmbd_share_config *ksmbd_tree_conn_share(struct ksmbd_session *sess,
+ unsigned int id);
+
+int ksmbd_tree_conn_session_logoff(struct ksmbd_session *sess);
+
+#endif /* __TREE_CONNECT_MANAGEMENT_H__ */
diff --git a/fs/ksmbd/mgmt/user_config.c b/fs/ksmbd/mgmt/user_config.c
new file mode 100644
index 000000000000..d21629ae5c89
--- /dev/null
+++ b/fs/ksmbd/mgmt/user_config.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/slab.h>
+#include <linux/mm.h>
+
+#include "user_config.h"
+#include "../transport_ipc.h"
+
+struct ksmbd_user *ksmbd_login_user(const char *account)
+{
+ struct ksmbd_login_response *resp;
+ struct ksmbd_user *user = NULL;
+
+ resp = ksmbd_ipc_login_request(account);
+ if (!resp)
+ return NULL;
+
+ if (!(resp->status & KSMBD_USER_FLAG_OK))
+ goto out;
+
+ user = ksmbd_alloc_user(resp);
+out:
+ kvfree(resp);
+ return user;
+}
+
+struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp)
+{
+ struct ksmbd_user *user = NULL;
+
+ user = kmalloc(sizeof(struct ksmbd_user), GFP_KERNEL);
+ if (!user)
+ return NULL;
+
+ user->name = kstrdup(resp->account, GFP_KERNEL);
+ user->flags = resp->status;
+ user->gid = resp->gid;
+ user->uid = resp->uid;
+ user->passkey_sz = resp->hash_sz;
+ user->passkey = kmalloc(resp->hash_sz, GFP_KERNEL);
+ if (user->passkey)
+ memcpy(user->passkey, resp->hash, resp->hash_sz);
+
+ if (!user->name || !user->passkey) {
+ kfree(user->name);
+ kfree(user->passkey);
+ kfree(user);
+ user = NULL;
+ }
+ return user;
+}
+
+void ksmbd_free_user(struct ksmbd_user *user)
+{
+ ksmbd_ipc_logout_request(user->name);
+ kfree(user->name);
+ kfree(user->passkey);
+ kfree(user);
+}
+
+int ksmbd_anonymous_user(struct ksmbd_user *user)
+{
+ if (user->name[0] == '\0')
+ return 1;
+ return 0;
+}
diff --git a/fs/ksmbd/mgmt/user_config.h b/fs/ksmbd/mgmt/user_config.h
new file mode 100644
index 000000000000..b2bb074a0150
--- /dev/null
+++ b/fs/ksmbd/mgmt/user_config.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#ifndef __USER_CONFIG_MANAGEMENT_H__
+#define __USER_CONFIG_MANAGEMENT_H__
+
+#include "../glob.h"
+
+struct ksmbd_user {
+ unsigned short flags;
+
+ unsigned int uid;
+ unsigned int gid;
+
+ char *name;
+
+ size_t passkey_sz;
+ char *passkey;
+};
+
+static inline bool user_guest(struct ksmbd_user *user)
+{
+ return user->flags & KSMBD_USER_FLAG_GUEST_ACCOUNT;
+}
+
+static inline void set_user_flag(struct ksmbd_user *user, int flag)
+{
+ user->flags |= flag;
+}
+
+static inline int test_user_flag(struct ksmbd_user *user, int flag)
+{
+ return user->flags & flag;
+}
+
+static inline void set_user_guest(struct ksmbd_user *user)
+{
+}
+
+static inline char *user_passkey(struct ksmbd_user *user)
+{
+ return user->passkey;
+}
+
+static inline char *user_name(struct ksmbd_user *user)
+{
+ return user->name;
+}
+
+static inline unsigned int user_uid(struct ksmbd_user *user)
+{
+ return user->uid;
+}
+
+static inline unsigned int user_gid(struct ksmbd_user *user)
+{
+ return user->gid;
+}
+
+struct ksmbd_user *ksmbd_login_user(const char *account);
+struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp);
+void ksmbd_free_user(struct ksmbd_user *user);
+int ksmbd_anonymous_user(struct ksmbd_user *user);
+#endif /* __USER_CONFIG_MANAGEMENT_H__ */
diff --git a/fs/ksmbd/mgmt/user_session.c b/fs/ksmbd/mgmt/user_session.c
new file mode 100644
index 000000000000..8d8ffd8c6f19
--- /dev/null
+++ b/fs/ksmbd/mgmt/user_session.c
@@ -0,0 +1,369 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+#include <linux/xarray.h>
+
+#include "ksmbd_ida.h"
+#include "user_session.h"
+#include "user_config.h"
+#include "tree_connect.h"
+#include "../transport_ipc.h"
+#include "../connection.h"
+#include "../vfs_cache.h"
+
+static DEFINE_IDA(session_ida);
+
+#define SESSION_HASH_BITS 3
+static DEFINE_HASHTABLE(sessions_table, SESSION_HASH_BITS);
+static DECLARE_RWSEM(sessions_table_lock);
+
+struct ksmbd_session_rpc {
+ int id;
+ unsigned int method;
+ struct list_head list;
+};
+
+static void free_channel_list(struct ksmbd_session *sess)
+{
+ struct channel *chann, *tmp;
+
+ list_for_each_entry_safe(chann, tmp, &sess->ksmbd_chann_list,
+ chann_list) {
+ list_del(&chann->chann_list);
+ kfree(chann);
+ }
+}
+
+static void __session_rpc_close(struct ksmbd_session *sess,
+ struct ksmbd_session_rpc *entry)
+{
+ struct ksmbd_rpc_command *resp;
+
+ resp = ksmbd_rpc_close(sess, entry->id);
+ if (!resp)
+ pr_err("Unable to close RPC pipe %d\n", entry->id);
+
+ kvfree(resp);
+ ksmbd_rpc_id_free(entry->id);
+ kfree(entry);
+}
+
+static void ksmbd_session_rpc_clear_list(struct ksmbd_session *sess)
+{
+ struct ksmbd_session_rpc *entry;
+
+ while (!list_empty(&sess->rpc_handle_list)) {
+ entry = list_entry(sess->rpc_handle_list.next,
+ struct ksmbd_session_rpc,
+ list);
+
+ list_del(&entry->list);
+ __session_rpc_close(sess, entry);
+ }
+}
+
+static int __rpc_method(char *rpc_name)
+{
+ if (!strcmp(rpc_name, "\\srvsvc") || !strcmp(rpc_name, "srvsvc"))
+ return KSMBD_RPC_SRVSVC_METHOD_INVOKE;
+
+ if (!strcmp(rpc_name, "\\wkssvc") || !strcmp(rpc_name, "wkssvc"))
+ return KSMBD_RPC_WKSSVC_METHOD_INVOKE;
+
+ if (!strcmp(rpc_name, "LANMAN") || !strcmp(rpc_name, "lanman"))
+ return KSMBD_RPC_RAP_METHOD;
+
+ if (!strcmp(rpc_name, "\\samr") || !strcmp(rpc_name, "samr"))
+ return KSMBD_RPC_SAMR_METHOD_INVOKE;
+
+ if (!strcmp(rpc_name, "\\lsarpc") || !strcmp(rpc_name, "lsarpc"))
+ return KSMBD_RPC_LSARPC_METHOD_INVOKE;
+
+ pr_err("Unsupported RPC: %s\n", rpc_name);
+ return 0;
+}
+
+int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name)
+{
+ struct ksmbd_session_rpc *entry;
+ struct ksmbd_rpc_command *resp;
+ int method;
+
+ method = __rpc_method(rpc_name);
+ if (!method)
+ return -EINVAL;
+
+ entry = kzalloc(sizeof(struct ksmbd_session_rpc), GFP_KERNEL);
+ if (!entry)
+ return -EINVAL;
+
+ list_add(&entry->list, &sess->rpc_handle_list);
+ entry->method = method;
+ entry->id = ksmbd_ipc_id_alloc();
+ if (entry->id < 0)
+ goto error;
+
+ resp = ksmbd_rpc_open(sess, entry->id);
+ if (!resp)
+ goto error;
+
+ kvfree(resp);
+ return entry->id;
+error:
+ list_del(&entry->list);
+ kfree(entry);
+ return -EINVAL;
+}
+
+void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id)
+{
+ struct ksmbd_session_rpc *entry;
+
+ list_for_each_entry(entry, &sess->rpc_handle_list, list) {
+ if (entry->id == id) {
+ list_del(&entry->list);
+ __session_rpc_close(sess, entry);
+ break;
+ }
+ }
+}
+
+int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id)
+{
+ struct ksmbd_session_rpc *entry;
+
+ list_for_each_entry(entry, &sess->rpc_handle_list, list) {
+ if (entry->id == id)
+ return entry->method;
+ }
+ return 0;
+}
+
+void ksmbd_session_destroy(struct ksmbd_session *sess)
+{
+ if (!sess)
+ return;
+
+ if (!atomic_dec_and_test(&sess->refcnt))
+ return;
+
+ list_del(&sess->sessions_entry);
+
+ down_write(&sessions_table_lock);
+ hash_del(&sess->hlist);
+ up_write(&sessions_table_lock);
+
+ if (sess->user)
+ ksmbd_free_user(sess->user);
+
+ ksmbd_tree_conn_session_logoff(sess);
+ ksmbd_destroy_file_table(&sess->file_table);
+ ksmbd_session_rpc_clear_list(sess);
+ free_channel_list(sess);
+ kfree(sess->Preauth_HashValue);
+ ksmbd_release_id(&session_ida, sess->id);
+ kfree(sess);
+}
+
+static struct ksmbd_session *__session_lookup(unsigned long long id)
+{
+ struct ksmbd_session *sess;
+
+ hash_for_each_possible(sessions_table, sess, hlist, id) {
+ if (id == sess->id)
+ return sess;
+ }
+ return NULL;
+}
+
+void ksmbd_session_register(struct ksmbd_conn *conn,
+ struct ksmbd_session *sess)
+{
+ sess->conn = conn;
+ list_add(&sess->sessions_entry, &conn->sessions);
+}
+
+void ksmbd_sessions_deregister(struct ksmbd_conn *conn)
+{
+ struct ksmbd_session *sess;
+
+ while (!list_empty(&conn->sessions)) {
+ sess = list_entry(conn->sessions.next,
+ struct ksmbd_session,
+ sessions_entry);
+
+ ksmbd_session_destroy(sess);
+ }
+}
+
+static bool ksmbd_session_id_match(struct ksmbd_session *sess,
+ unsigned long long id)
+{
+ return sess->id == id;
+}
+
+struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn,
+ unsigned long long id)
+{
+ struct ksmbd_session *sess = NULL;
+
+ list_for_each_entry(sess, &conn->sessions, sessions_entry) {
+ if (ksmbd_session_id_match(sess, id))
+ return sess;
+ }
+ return NULL;
+}
+
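+/* Take a reference only while the session is still live (refcnt > 0). */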
+int get_session(struct ksmbd_session *sess)
+{
+ return atomic_inc_not_zero(&sess->refcnt);
+}
+
+void put_session(struct ksmbd_session *sess)
+{
+ if (atomic_dec_and_test(&sess->refcnt))
+ pr_err("get/%s seems to be mismatched.", __func__);
+}
+
+struct ksmbd_session *ksmbd_session_lookup_slowpath(unsigned long long id)
+{
+ struct ksmbd_session *sess;
+
+ down_read(&sessions_table_lock);
+ sess = __session_lookup(id);
+ if (sess) {
+ if (!get_session(sess))
+ sess = NULL;
+ }
+ up_read(&sessions_table_lock);
+
+ return sess;
+}
+
+struct ksmbd_session *ksmbd_session_lookup_all(struct ksmbd_conn *conn,
+ unsigned long long id)
+{
+ struct ksmbd_session *sess;
+
+ sess = ksmbd_session_lookup(conn, id);
+ if (!sess && conn->binding)
+ sess = ksmbd_session_lookup_slowpath(id);
+ return sess;
+}
+
+struct preauth_session *ksmbd_preauth_session_alloc(struct ksmbd_conn *conn,
+ u64 sess_id)
+{
+ struct preauth_session *sess;
+
+ sess = kmalloc(sizeof(struct preauth_session), GFP_KERNEL);
+ if (!sess)
+ return NULL;
+
+ sess->id = sess_id;
+ memcpy(sess->Preauth_HashValue, conn->preauth_info->Preauth_HashValue,
+ PREAUTH_HASHVALUE_SIZE);
+ list_add(&sess->preauth_entry, &conn->preauth_sess_table);
+
+ return sess;
+}
+
+static bool ksmbd_preauth_session_id_match(struct preauth_session *sess,
+ unsigned long long id)
+{
+ return sess->id == id;
+}
+
+struct preauth_session *ksmbd_preauth_session_lookup(struct ksmbd_conn *conn,
+ unsigned long long id)
+{
+ struct preauth_session *sess = NULL;
+
+ list_for_each_entry(sess, &conn->preauth_sess_table, preauth_entry) {
+ if (ksmbd_preauth_session_id_match(sess, id))
+ return sess;
+ }
+ return NULL;
+}
+
+static int __init_smb2_session(struct ksmbd_session *sess)
+{
+ int id = ksmbd_acquire_smb2_uid(&session_ida);
+
+ if (id < 0)
+ return -EINVAL;
+ sess->id = id;
+ return 0;
+}
+
+static struct ksmbd_session *__session_create(int protocol)
+{
+ struct ksmbd_session *sess;
+ int ret;
+
+ sess = kzalloc(sizeof(struct ksmbd_session), GFP_KERNEL);
+ if (!sess)
+ return NULL;
+
+ if (ksmbd_init_file_table(&sess->file_table))
+ goto error;
+
+ set_session_flag(sess, protocol);
+ INIT_LIST_HEAD(&sess->sessions_entry);
+ xa_init(&sess->tree_conns);
+ INIT_LIST_HEAD(&sess->ksmbd_chann_list);
+ INIT_LIST_HEAD(&sess->rpc_handle_list);
+ sess->sequence_number = 1;
+ atomic_set(&sess->refcnt, 1);
+
+ switch (protocol) {
+ case CIFDS_SESSION_FLAG_SMB2:
+ ret = __init_smb2_session(sess);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ if (ret)
+ goto error;
+
+ ida_init(&sess->tree_conn_ida);
+
+ if (protocol == CIFDS_SESSION_FLAG_SMB2) {
+ down_write(&sessions_table_lock);
+ hash_add(sessions_table, &sess->hlist, sess->id);
+ up_write(&sessions_table_lock);
+ }
+ return sess;
+
+error:
+ ksmbd_session_destroy(sess);
+ return NULL;
+}
+
+struct ksmbd_session *ksmbd_smb2_session_create(void)
+{
+ return __session_create(CIFDS_SESSION_FLAG_SMB2);
+}
+
+int ksmbd_acquire_tree_conn_id(struct ksmbd_session *sess)
+{
+ int id = -EINVAL;
+
+ if (test_session_flag(sess, CIFDS_SESSION_FLAG_SMB2))
+ id = ksmbd_acquire_smb2_tid(&sess->tree_conn_ida);
+
+ return id;
+}
+
+void ksmbd_release_tree_conn_id(struct ksmbd_session *sess, int id)
+{
+ if (id >= 0)
+ ksmbd_release_id(&sess->tree_conn_ida, id);
+}
diff --git a/fs/ksmbd/mgmt/user_session.h b/fs/ksmbd/mgmt/user_session.h
new file mode 100644
index 000000000000..82289c3cbd2b
--- /dev/null
+++ b/fs/ksmbd/mgmt/user_session.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#ifndef __USER_SESSION_MANAGEMENT_H__
+#define __USER_SESSION_MANAGEMENT_H__
+
+#include <linux/hashtable.h>
+#include <linux/xarray.h>
+
+#include "../smb_common.h"
+#include "../ntlmssp.h"
+
+#define CIFDS_SESSION_FLAG_SMB2 BIT(1)
+
+#define PREAUTH_HASHVALUE_SIZE 64
+
+struct ksmbd_file_table;
+
+struct channel {
+ __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE];
+ struct ksmbd_conn *conn;
+ struct list_head chann_list;
+};
+
+struct preauth_session {
+ __u8 Preauth_HashValue[PREAUTH_HASHVALUE_SIZE];
+ u64 id;
+ struct list_head preauth_entry;
+};
+
+struct ksmbd_session {
+ u64 id;
+
+ struct ksmbd_user *user;
+ struct ksmbd_conn *conn;
+ unsigned int sequence_number;
+ unsigned int flags;
+
+ bool sign;
+ bool enc;
+ bool is_anonymous;
+
+ int state;
+ __u8 *Preauth_HashValue;
+
+ struct ntlmssp_auth ntlmssp;
+ char sess_key[CIFS_KEY_SIZE];
+
+ struct hlist_node hlist;
+ struct list_head ksmbd_chann_list;
+ struct xarray tree_conns;
+ struct ida tree_conn_ida;
+ struct list_head rpc_handle_list;
+
+ __u8 smb3encryptionkey[SMB3_ENC_DEC_KEY_SIZE];
+ __u8 smb3decryptionkey[SMB3_ENC_DEC_KEY_SIZE];
+ __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE];
+
+ struct list_head sessions_entry;
+ struct ksmbd_file_table file_table;
+ atomic_t refcnt;
+};
+
+static inline int test_session_flag(struct ksmbd_session *sess, int bit)
+{
+ return sess->flags & bit;
+}
+
+static inline void set_session_flag(struct ksmbd_session *sess, int bit)
+{
+ sess->flags |= bit;
+}
+
+static inline void clear_session_flag(struct ksmbd_session *sess, int bit)
+{
+ sess->flags &= ~bit;
+}
+
+struct ksmbd_session *ksmbd_smb2_session_create(void);
+
+void ksmbd_session_destroy(struct ksmbd_session *sess);
+
+struct ksmbd_session *ksmbd_session_lookup_slowpath(unsigned long long id);
+struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn,
+ unsigned long long id);
+void ksmbd_session_register(struct ksmbd_conn *conn,
+ struct ksmbd_session *sess);
+void ksmbd_sessions_deregister(struct ksmbd_conn *conn);
+struct ksmbd_session *ksmbd_session_lookup_all(struct ksmbd_conn *conn,
+ unsigned long long id);
+struct preauth_session *ksmbd_preauth_session_alloc(struct ksmbd_conn *conn,
+ u64 sess_id);
+struct preauth_session *ksmbd_preauth_session_lookup(struct ksmbd_conn *conn,
+ unsigned long long id);
+
+int ksmbd_acquire_tree_conn_id(struct ksmbd_session *sess);
+void ksmbd_release_tree_conn_id(struct ksmbd_session *sess, int id);
+
+int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name);
+void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id);
+int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id);
+int get_session(struct ksmbd_session *sess);
+void put_session(struct ksmbd_session *sess);
+#endif /* __USER_SESSION_MANAGEMENT_H__ */
diff --git a/fs/ksmbd/misc.c b/fs/ksmbd/misc.c
new file mode 100644
index 000000000000..0b307ca28a19
--- /dev/null
+++ b/fs/ksmbd/misc.c
@@ -0,0 +1,338 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org>
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/kernel.h>
+#include <linux/xattr.h>
+#include <linux/fs.h>
+
+#include "misc.h"
+#include "smb_common.h"
+#include "connection.h"
+#include "vfs.h"
+
+#include "mgmt/share_config.h"
+
+/**
+ * match_pattern() - compare a string with a pattern which may include
+ * the wildcards '*' and '?'
+ * TODO: handle DOS_DOT, DOS_QM and DOS_STAR
+ *
+ * @str: string to compare with the pattern
+ * @len: string length
+ * @pattern: pattern string which may include the wildcards '*' and '?'
+ *
+ * Return: true if the pattern matches the string, otherwise false
+ */
+int match_pattern(const char *str, size_t len, const char *pattern)
+{
+ const char *s = str;
+ const char *p = pattern;
+ bool star = false;
+
+ while (*s && len) {
+ switch (*p) {
+ case '?':
+ s++;
+ len--;
+ p++;
+ break;
+ case '*':
+ star = true;
+ str = s;
+ if (!*++p)
+ return true;
+ pattern = p;
+ break;
+ default:
+ if (tolower(*s) == tolower(*p)) {
+ s++;
+ len--;
+ p++;
+ } else {
+ if (!star)
+ return false;
+ str++;
+ s = str;
+ p = pattern;
+ }
+ break;
+ }
+ }
+
+ if (*p == '*')
+ ++p;
+ return !*p;
+}
+
+/*
+ * is_char_allowed() - check for valid character
+ * @ch: input character to be checked
+ *
+ * Return: 1 if char is allowed, otherwise 0
+ */
+static inline int is_char_allowed(char ch)
+{
+ /* check for control chars, wildcards etc. */
+ if (!(ch & 0x80) &&
+ (ch <= 0x1f ||
+ ch == '?' || ch == '"' || ch == '<' ||
+ ch == '>' || ch == '|' || ch == '*'))
+ return 0;
+
+ return 1;
+}
+
+int ksmbd_validate_filename(char *filename)
+{
+ while (*filename) {
+ char c = *filename;
+
+ filename++;
+ if (!is_char_allowed(c)) {
+ ksmbd_debug(VFS, "File name validation failed: 0x%x\n", c);
+ return -ENOENT;
+ }
+ }
+
+ return 0;
+}
+
+static int ksmbd_validate_stream_name(char *stream_name)
+{
+ while (*stream_name) {
+ char c = *stream_name;
+
+ stream_name++;
+ if (c == '/' || c == ':' || c == '\\') {
+ pr_err("Stream name validation failed: %c\n", c);
+ return -ENOENT;
+ }
+ }
+
+ return 0;
+}
+
+int parse_stream_name(char *filename, char **stream_name, int *s_type)
+{
+ char *stream_type;
+ char *s_name;
+ int rc = 0;
+
+ s_name = filename;
+ filename = strsep(&s_name, ":");
+ ksmbd_debug(SMB, "filename : %s, streams : %s\n", filename, s_name);
+ if (strchr(s_name, ':')) {
+ stream_type = s_name;
+ s_name = strsep(&stream_type, ":");
+
+ rc = ksmbd_validate_stream_name(s_name);
+ if (rc < 0) {
+ rc = -ENOENT;
+ goto out;
+ }
+
+ ksmbd_debug(SMB, "stream name : %s, stream type : %s\n", s_name,
+ stream_type);
+ if (!strncasecmp("$data", stream_type, 5))
+ *s_type = DATA_STREAM;
+ else if (!strncasecmp("$index_allocation", stream_type, 17))
+ *s_type = DIR_STREAM;
+ else
+ rc = -ENOENT;
+ }
+
+ *stream_name = s_name;
+out:
+ return rc;
+}
+
+/**
+ * convert_to_nt_pathname() - extract and return a windows-style path string
+ * with the share directory prefix removed from the file path
+ * @filename: unix filename
+ * @sharepath: share path string
+ *
+ * Return: windows-style path string, or NULL on allocation failure
+ */
+
+char *convert_to_nt_pathname(char *filename, char *sharepath)
+{
+ char *ab_pathname;
+ int len, name_len;
+
+ name_len = strlen(filename);
+ ab_pathname = kmalloc(name_len, GFP_KERNEL);
+ if (!ab_pathname)
+ return NULL;
+
+ ab_pathname[0] = '\\';
+ ab_pathname[1] = '\0';
+
+ len = strlen(sharepath);
+ if (!strncmp(filename, sharepath, len) && name_len != len) {
+ strscpy(ab_pathname, &filename[len], name_len);
+ ksmbd_conv_path_to_windows(ab_pathname);
+ }
+
+ return ab_pathname;
+}
+
+int get_nlink(struct kstat *st)
+{
+ int nlink;
+
+ nlink = st->nlink;
+ if (S_ISDIR(st->mode))
+ nlink--;
+
+ return nlink;
+}
+
+void ksmbd_conv_path_to_unix(char *path)
+{
+ strreplace(path, '\\', '/');
+}
+
+void ksmbd_strip_last_slash(char *path)
+{
+ int len = strlen(path);
+
+ while (len && path[len - 1] == '/') {
+ path[len - 1] = '\0';
+ len--;
+ }
+}
+
+void ksmbd_conv_path_to_windows(char *path)
+{
+ strreplace(path, '/', '\\');
+}
+
+/**
+ * ksmbd_extract_sharename() - get share name from tree connect request
+ * @treename: buffer containing tree name and share name
+ *
+ * Return: share name on success, otherwise error
+ */
+char *ksmbd_extract_sharename(char *treename)
+{
+ char *name = treename;
+ char *dst;
+ char *pos = strrchr(name, '\\');
+
+ if (pos)
+ name = (pos + 1);
+
+ /* caller has to free the memory */
+ dst = kstrdup(name, GFP_KERNEL);
+ if (!dst)
+ return ERR_PTR(-ENOMEM);
+ return dst;
+}
+
+/**
+ * convert_to_unix_name() - convert a windows-style name to unix format
+ * @share: share config providing the unix share path prefix
+ * @name: windows-style name to be converted
+ *
+ * Return: converted name on success, otherwise NULL
+ */
+char *convert_to_unix_name(struct ksmbd_share_config *share, char *name)
+{
+ int no_slash = 0, name_len, path_len;
+ char *new_name;
+
+ if (name[0] == '/')
+ name++;
+
+ path_len = share->path_sz;
+ name_len = strlen(name);
+ new_name = kmalloc(path_len + name_len + 2, GFP_KERNEL);
+ if (!new_name)
+ return new_name;
+
+ memcpy(new_name, share->path, path_len);
+ if (new_name[path_len - 1] != '/') {
+ new_name[path_len] = '/';
+ no_slash = 1;
+ }
+
+ memcpy(new_name + path_len + no_slash, name, name_len);
+ path_len += name_len + no_slash;
+ new_name[path_len] = 0x00;
+ return new_name;
+}
+
+char *ksmbd_convert_dir_info_name(struct ksmbd_dir_info *d_info,
+ const struct nls_table *local_nls,
+ int *conv_len)
+{
+ char *conv;
+ int sz = min(4 * d_info->name_len, PATH_MAX);
+
+ if (!sz)
+ return NULL;
+
+ conv = kmalloc(sz, GFP_KERNEL);
+ if (!conv)
+ return NULL;
+
+ /* XXX */
+ *conv_len = smbConvertToUTF16((__le16 *)conv, d_info->name,
+ d_info->name_len, local_nls, 0);
+ *conv_len *= 2;
+
+ /* We allocate a buffer twice as big as needed. */
+ conv[*conv_len] = 0x00;
+ conv[*conv_len + 1] = 0x00;
+ return conv;
+}
+
+/*
+ * Convert NT UTC (epoch 1601-01-01, in 100-nanosecond units) into
+ * Unix UTC (epoch 1970-01-01, in seconds).
+ */
+struct timespec64 ksmbd_NTtimeToUnix(__le64 ntutc)
+{
+ struct timespec64 ts;
+
+ /* Subtract the NTFS time offset, then convert to 1s intervals. */
+ s64 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
+ u64 abs_t;
+
+ /*
+ * We cannot use normal 64-bit division on a 32-bit arch, and the
+ * alternative, do_div(), does not work with negative numbers, so
+ * negative values have to be special-cased.
+ */
+ if (t < 0) {
+ abs_t = -t;
+ ts.tv_nsec = do_div(abs_t, 10000000) * 100;
+ ts.tv_nsec = -ts.tv_nsec;
+ ts.tv_sec = -abs_t;
+ } else {
+ abs_t = t;
+ ts.tv_nsec = do_div(abs_t, 10000000) * 100;
+ ts.tv_sec = abs_t;
+ }
+
+ return ts;
+}
+
+/* Convert the Unix UTC into NT UTC. */
+inline u64 ksmbd_UnixTimeToNT(struct timespec64 t)
+{
+ /* Convert to 100ns intervals and then add the NTFS time offset. */
+ return (u64)t.tv_sec * 10000000 + t.tv_nsec / 100 + NTFS_TIME_OFFSET;
+}
+
+inline long long ksmbd_systime(void)
+{
+ struct timespec64 ts;
+
+ ktime_get_real_ts64(&ts);
+ return ksmbd_UnixTimeToNT(ts);
+}
diff --git a/fs/ksmbd/misc.h b/fs/ksmbd/misc.h
new file mode 100644
index 000000000000..af8717d4d85b
--- /dev/null
+++ b/fs/ksmbd/misc.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#ifndef __KSMBD_MISC_H__
+#define __KSMBD_MISC_H__
+
+struct ksmbd_share_config;
+struct nls_table;
+struct kstat;
+struct ksmbd_file;
+
+int match_pattern(const char *str, size_t len, const char *pattern);
+int ksmbd_validate_filename(char *filename);
+int parse_stream_name(char *filename, char **stream_name, int *s_type);
+char *convert_to_nt_pathname(char *filename, char *sharepath);
+int get_nlink(struct kstat *st);
+void ksmbd_conv_path_to_unix(char *path);
+void ksmbd_strip_last_slash(char *path);
+void ksmbd_conv_path_to_windows(char *path);
+char *ksmbd_extract_sharename(char *treename);
+char *convert_to_unix_name(struct ksmbd_share_config *share, char *name);
+
+#define KSMBD_DIR_INFO_ALIGNMENT 8
+struct ksmbd_dir_info;
+char *ksmbd_convert_dir_info_name(struct ksmbd_dir_info *d_info,
+ const struct nls_table *local_nls,
+ int *conv_len);
+
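+/*
+ * Number of 100-nanosecond intervals between the NT epoch (1601-01-01) and
+ * the Unix epoch (1970-01-01): 369 years including 89 leap days.
+ */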
+#define NTFS_TIME_OFFSET ((u64)(369 * 365 + 89) * 24 * 3600 * 10000000)
+struct timespec64 ksmbd_NTtimeToUnix(__le64 ntutc);
+u64 ksmbd_UnixTimeToNT(struct timespec64 t);
+long long ksmbd_systime(void);
+#endif /* __KSMBD_MISC_H__ */
diff --git a/fs/ksmbd/ndr.c b/fs/ksmbd/ndr.c
new file mode 100644
index 000000000000..2243a2c64b37
--- /dev/null
+++ b/fs/ksmbd/ndr.c
@@ -0,0 +1,345 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2021 Samsung Electronics Co., Ltd.
+ * Author(s): Namjae Jeon <linkinjeon@kernel.org>
+ */
+
+#include <linux/fs.h>
+
+#include "glob.h"
+#include "ndr.h"
+
+static inline char *ndr_get_field(struct ndr *n)
+{
+ return n->data + n->offset;
+}
+
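+/* Grow the NDR data buffer, leaving an extra 1KB of zeroed headroom. */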
+static int try_to_realloc_ndr_blob(struct ndr *n, size_t sz)
+{
+ char *data;
+
+ data = krealloc(n->data, n->offset + sz + 1024, GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ n->data = data;
+ n->length += 1024;
+ memset(n->data + n->offset, 0, 1024);
+ return 0;
+}
+
+static void ndr_write_int16(struct ndr *n, __u16 value)
+{
+ if (n->length <= n->offset + sizeof(value))
+ try_to_realloc_ndr_blob(n, sizeof(value));
+
+ *(__le16 *)ndr_get_field(n) = cpu_to_le16(value);
+ n->offset += sizeof(value);
+}
+
+static void ndr_write_int32(struct ndr *n, __u32 value)
+{
+ if (n->length <= n->offset + sizeof(value))
+ try_to_realloc_ndr_blob(n, sizeof(value));
+
+ *(__le32 *)ndr_get_field(n) = cpu_to_le32(value);
+ n->offset += sizeof(value);
+}
+
+static void ndr_write_int64(struct ndr *n, __u64 value)
+{
+ if (n->length <= n->offset + sizeof(value))
+ try_to_realloc_ndr_blob(n, sizeof(value));
+
+ *(__le64 *)ndr_get_field(n) = cpu_to_le64(value);
+ n->offset += sizeof(value);
+}
+
+static int ndr_write_bytes(struct ndr *n, void *value, size_t sz)
+{
+ if (n->length <= n->offset + sz)
+ try_to_realloc_ndr_blob(n, sz);
+
+ memcpy(ndr_get_field(n), value, sz);
+ n->offset += sz;
+ return 0;
+}
+
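+/* Strings are written NUL-terminated; the offset is then 2-byte aligned. */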
+static int ndr_write_string(struct ndr *n, char *value)
+{
+ size_t sz;
+
+ sz = strlen(value) + 1;
+ if (n->length <= n->offset + sz)
+ try_to_realloc_ndr_blob(n, sz);
+
+ memcpy(ndr_get_field(n), value, sz);
+ n->offset += sz;
+ n->offset = ALIGN(n->offset, 2);
+ return 0;
+}
+
+static int ndr_read_string(struct ndr *n, void *value, size_t sz)
+{
+ int len = strnlen(ndr_get_field(n), sz);
+
+ memcpy(value, ndr_get_field(n), len);
+ len++;
+ n->offset += len;
+ n->offset = ALIGN(n->offset, 2);
+ return 0;
+}
+
+static int ndr_read_bytes(struct ndr *n, void *value, size_t sz)
+{
+ memcpy(value, ndr_get_field(n), sz);
+ n->offset += sz;
+ return 0;
+}
+
+static __u16 ndr_read_int16(struct ndr *n)
+{
+ __u16 ret;
+
+ ret = le16_to_cpu(*(__le16 *)ndr_get_field(n));
+ n->offset += sizeof(__u16);
+ return ret;
+}
+
+static __u32 ndr_read_int32(struct ndr *n)
+{
+ __u32 ret;
+
+ ret = le32_to_cpu(*(__le32 *)ndr_get_field(n));
+ n->offset += sizeof(__u32);
+ return ret;
+}
+
+static __u64 ndr_read_int64(struct ndr *n)
+{
+ __u64 ret;
+
+ ret = le64_to_cpu(*(__le64 *)ndr_get_field(n));
+ n->offset += sizeof(__u64);
+ return ret;
+}
+
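+/*
+ * DOS attribute blobs come in two layouts: version 3 carries the attribute
+ * as a hex string plus ea_size/size/alloc_size, version 4 carries itime
+ * instead.
+ */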
+int ndr_encode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da)
+{
+ char hex_attr[12] = {0};
+
+ n->offset = 0;
+ n->length = 1024;
+ n->data = kzalloc(n->length, GFP_KERNEL);
+ if (!n->data)
+ return -ENOMEM;
+
+ if (da->version == 3) {
+ snprintf(hex_attr, 10, "0x%x", da->attr);
+ ndr_write_string(n, hex_attr);
+ } else {
+ ndr_write_string(n, "");
+ }
+ ndr_write_int16(n, da->version);
+ ndr_write_int32(n, da->version);
+
+ ndr_write_int32(n, da->flags);
+ ndr_write_int32(n, da->attr);
+ if (da->version == 3) {
+ ndr_write_int32(n, da->ea_size);
+ ndr_write_int64(n, da->size);
+ ndr_write_int64(n, da->alloc_size);
+ } else {
+ ndr_write_int64(n, da->itime);
+ }
+ ndr_write_int64(n, da->create_time);
+ if (da->version == 3)
+ ndr_write_int64(n, da->change_time);
+ return 0;
+}
+
+int ndr_decode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da)
+{
+ char *hex_attr;
+ int version2;
+
+ hex_attr = kzalloc(n->length, GFP_KERNEL);
+ if (!hex_attr)
+ return -ENOMEM;
+
+ n->offset = 0;
+ ndr_read_string(n, hex_attr, n->length);
+ kfree(hex_attr);
+ da->version = ndr_read_int16(n);
+
+ if (da->version != 3 && da->version != 4) {
+ pr_err("v%d version is not supported\n", da->version);
+ return -EINVAL;
+ }
+
+ version2 = ndr_read_int32(n);
+ if (da->version != version2) {
+ pr_err("ndr version mismatched(version: %d, version2: %d)\n",
+ da->version, version2);
+ return -EINVAL;
+ }
+
+ ndr_read_int32(n);
+ da->attr = ndr_read_int32(n);
+ if (da->version == 4) {
+ da->itime = ndr_read_int64(n);
+ da->create_time = ndr_read_int64(n);
+ } else {
+ ndr_read_int32(n);
+ ndr_read_int64(n);
+ ndr_read_int64(n);
+ da->create_time = ndr_read_int64(n);
+ ndr_read_int64(n);
+ }
+
+ return 0;
+}
+
+static int ndr_encode_posix_acl_entry(struct ndr *n, struct xattr_smb_acl *acl)
+{
+ int i;
+
+ ndr_write_int32(n, acl->count);
+ n->offset = ALIGN(n->offset, 8);
+ ndr_write_int32(n, acl->count);
+ ndr_write_int32(n, 0);
+
+ for (i = 0; i < acl->count; i++) {
+ n->offset = ALIGN(n->offset, 8);
+ ndr_write_int16(n, acl->entries[i].type);
+ ndr_write_int16(n, acl->entries[i].type);
+
+ if (acl->entries[i].type == SMB_ACL_USER) {
+ n->offset = ALIGN(n->offset, 8);
+ ndr_write_int64(n, acl->entries[i].uid);
+ } else if (acl->entries[i].type == SMB_ACL_GROUP) {
+ n->offset = ALIGN(n->offset, 8);
+ ndr_write_int64(n, acl->entries[i].gid);
+ }
+
+ /* push permission */
+ ndr_write_int32(n, acl->entries[i].perm);
+ }
+
+ return 0;
+}
+
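+/*
+ * A non-zero NDR referent id is written when the corresponding ACL pointer
+ * is present; zero encodes a null pointer.
+ */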
+int ndr_encode_posix_acl(struct ndr *n,
+ struct user_namespace *user_ns,
+ struct inode *inode,
+ struct xattr_smb_acl *acl,
+ struct xattr_smb_acl *def_acl)
+{
+ int ref_id = 0x00020000;
+
+ n->offset = 0;
+ n->length = 1024;
+ n->data = kzalloc(n->length, GFP_KERNEL);
+ if (!n->data)
+ return -ENOMEM;
+
+ if (acl) {
+ /* ACL ACCESS */
+ ndr_write_int32(n, ref_id);
+ ref_id += 4;
+ } else {
+ ndr_write_int32(n, 0);
+ }
+
+ if (def_acl) {
+ /* DEFAULT ACL ACCESS */
+ ndr_write_int32(n, ref_id);
+ ref_id += 4;
+ } else {
+ ndr_write_int32(n, 0);
+ }
+
+ ndr_write_int64(n, from_kuid(user_ns, inode->i_uid));
+ ndr_write_int64(n, from_kgid(user_ns, inode->i_gid));
+ ndr_write_int32(n, inode->i_mode);
+
+ if (acl) {
+ ndr_encode_posix_acl_entry(n, acl);
+ if (def_acl)
+ ndr_encode_posix_acl_entry(n, def_acl);
+ }
+ return 0;
+}
+
+int ndr_encode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl)
+{
+ int ref_id = 0x00020004;
+
+ n->offset = 0;
+ n->length = 2048;
+ n->data = kzalloc(n->length, GFP_KERNEL);
+ if (!n->data)
+ return -ENOMEM;
+
+ ndr_write_int16(n, acl->version);
+ ndr_write_int32(n, acl->version);
+ ndr_write_int16(n, 2);
+ ndr_write_int32(n, ref_id);
+
+ /* push hash type and hash 64bytes */
+ ndr_write_int16(n, acl->hash_type);
+ ndr_write_bytes(n, acl->hash, XATTR_SD_HASH_SIZE);
+ ndr_write_bytes(n, acl->desc, acl->desc_len);
+ ndr_write_int64(n, acl->current_time);
+ ndr_write_bytes(n, acl->posix_acl_hash, XATTR_SD_HASH_SIZE);
+
+ /* push ndr for security descriptor */
+ ndr_write_bytes(n, acl->sd_buf, acl->sd_size);
+
+ return 0;
+}
+
+int ndr_decode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl)
+{
+ int version2;
+
+ n->offset = 0;
+ acl->version = ndr_read_int16(n);
+ if (acl->version != 4) {
+ pr_err("v%d version is not supported\n", acl->version);
+ return -EINVAL;
+ }
+
+ version2 = ndr_read_int32(n);
+ if (acl->version != version2) {
+ pr_err("ndr version mismatched(version: %d, version2: %d)\n",
+ acl->version, version2);
+ return -EINVAL;
+ }
+
+ /* Read Level */
+ ndr_read_int16(n);
+ /* Read Ref Id */
+ ndr_read_int32(n);
+ acl->hash_type = ndr_read_int16(n);
+ ndr_read_bytes(n, acl->hash, XATTR_SD_HASH_SIZE);
+
+ ndr_read_bytes(n, acl->desc, 10);
+ if (strncmp(acl->desc, "posix_acl", 9)) {
+ pr_err("Invalid acl description : %s\n", acl->desc);
+ return -EINVAL;
+ }
+
+ /* Read Time */
+ ndr_read_int64(n);
+ /* Read Posix ACL hash */
+ ndr_read_bytes(n, acl->posix_acl_hash, XATTR_SD_HASH_SIZE);
+ acl->sd_size = n->length - n->offset;
+ acl->sd_buf = kzalloc(acl->sd_size, GFP_KERNEL);
+ if (!acl->sd_buf)
+ return -ENOMEM;
+
+ ndr_read_bytes(n, acl->sd_buf, acl->sd_size);
+
+ return 0;
+}
diff --git a/fs/ksmbd/ndr.h b/fs/ksmbd/ndr.h
new file mode 100644
index 000000000000..60ca265d1bb0
--- /dev/null
+++ b/fs/ksmbd/ndr.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2020 Samsung Electronics Co., Ltd.
+ * Author(s): Namjae Jeon <linkinjeon@kernel.org>
+ */
+
+struct ndr {
+ char *data;
+ int offset;
+ int length;
+};
+
+#define NDR_NTSD_OFFSETOF 0xA0
+
+int ndr_encode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da);
+int ndr_decode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da);
+int ndr_encode_posix_acl(struct ndr *n, struct user_namespace *user_ns,
+ struct inode *inode, struct xattr_smb_acl *acl,
+ struct xattr_smb_acl *def_acl);
+int ndr_encode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl);
+int ndr_encode_v3_ntacl(struct ndr *n, struct xattr_ntacl *acl);
+int ndr_decode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl);
diff --git a/fs/ksmbd/nterr.h b/fs/ksmbd/nterr.h
new file mode 100644
index 000000000000..2f358f88a018
--- /dev/null
+++ b/fs/ksmbd/nterr.h
@@ -0,0 +1,543 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Unix SMB/Netbios implementation.
+ * Version 1.9.
+ * NT error code constants
+ * Copyright (C) Andrew Tridgell 1992-2000
+ * Copyright (C) John H Terpstra 1996-2000
+ * Copyright (C) Luke Kenneth Casson Leighton 1996-2000
+ * Copyright (C) Paul Ashton 1998-2000
+ */
+
+#ifndef _NTERR_H
+#define _NTERR_H
+
+/* Win32 Status codes. */
+#define NT_STATUS_MORE_ENTRIES 0x0105
+#define NT_ERROR_INVALID_PARAMETER 0x0057
+#define NT_ERROR_INSUFFICIENT_BUFFER 0x007a
+#define NT_STATUS_1804 0x070c
+#define NT_STATUS_NOTIFY_ENUM_DIR 0x010c
+#define NT_STATUS_INVALID_LOCK_RANGE (0xC0000000 | 0x01a1)
+/*
+ * Win32 Error codes extracted using a loop in smbclient then printing a netmon
+ * sniff to a file.
+ */
+
+#define NT_STATUS_OK 0x0000
+#define NT_STATUS_SOME_UNMAPPED 0x0107
+#define NT_STATUS_BUFFER_OVERFLOW 0x80000005
+#define NT_STATUS_NO_MORE_ENTRIES 0x8000001a
+#define NT_STATUS_MEDIA_CHANGED 0x8000001c
+#define NT_STATUS_END_OF_MEDIA 0x8000001e
+#define NT_STATUS_MEDIA_CHECK 0x80000020
+#define NT_STATUS_NO_DATA_DETECTED 0x8000001c
+#define NT_STATUS_STOPPED_ON_SYMLINK 0x8000002d
+#define NT_STATUS_DEVICE_REQUIRES_CLEANING 0x80000288
+#define NT_STATUS_DEVICE_DOOR_OPEN 0x80000288
+#define NT_STATUS_UNSUCCESSFUL (0xC0000000 | 0x0001)
+#define NT_STATUS_NOT_IMPLEMENTED (0xC0000000 | 0x0002)
+#define NT_STATUS_INVALID_INFO_CLASS (0xC0000000 | 0x0003)
+#define NT_STATUS_INFO_LENGTH_MISMATCH (0xC0000000 | 0x0004)
+#define NT_STATUS_ACCESS_VIOLATION (0xC0000000 | 0x0005)
+#define NT_STATUS_IN_PAGE_ERROR (0xC0000000 | 0x0006)
+#define NT_STATUS_PAGEFILE_QUOTA (0xC0000000 | 0x0007)
+#define NT_STATUS_INVALID_HANDLE (0xC0000000 | 0x0008)
+#define NT_STATUS_BAD_INITIAL_STACK (0xC0000000 | 0x0009)
+#define NT_STATUS_BAD_INITIAL_PC (0xC0000000 | 0x000a)
+#define NT_STATUS_INVALID_CID (0xC0000000 | 0x000b)
+#define NT_STATUS_TIMER_NOT_CANCELED (0xC0000000 | 0x000c)
+#define NT_STATUS_INVALID_PARAMETER (0xC0000000 | 0x000d)
+#define NT_STATUS_NO_SUCH_DEVICE (0xC0000000 | 0x000e)
+#define NT_STATUS_NO_SUCH_FILE (0xC0000000 | 0x000f)
+#define NT_STATUS_INVALID_DEVICE_REQUEST (0xC0000000 | 0x0010)
+#define NT_STATUS_END_OF_FILE (0xC0000000 | 0x0011)
+#define NT_STATUS_WRONG_VOLUME (0xC0000000 | 0x0012)
+#define NT_STATUS_NO_MEDIA_IN_DEVICE (0xC0000000 | 0x0013)
+#define NT_STATUS_UNRECOGNIZED_MEDIA (0xC0000000 | 0x0014)
+#define NT_STATUS_NONEXISTENT_SECTOR (0xC0000000 | 0x0015)
+#define NT_STATUS_MORE_PROCESSING_REQUIRED (0xC0000000 | 0x0016)
+#define NT_STATUS_NO_MEMORY (0xC0000000 | 0x0017)
+#define NT_STATUS_CONFLICTING_ADDRESSES (0xC0000000 | 0x0018)
+#define NT_STATUS_NOT_MAPPED_VIEW (0xC0000000 | 0x0019)
+#define NT_STATUS_UNABLE_TO_FREE_VM (0x80000000 | 0x001a)
+#define NT_STATUS_UNABLE_TO_DELETE_SECTION (0xC0000000 | 0x001b)
+#define NT_STATUS_INVALID_SYSTEM_SERVICE (0xC0000000 | 0x001c)
+#define NT_STATUS_ILLEGAL_INSTRUCTION (0xC0000000 | 0x001d)
+#define NT_STATUS_INVALID_LOCK_SEQUENCE (0xC0000000 | 0x001e)
+#define NT_STATUS_INVALID_VIEW_SIZE (0xC0000000 | 0x001f)
+#define NT_STATUS_INVALID_FILE_FOR_SECTION (0xC0000000 | 0x0020)
+#define NT_STATUS_ALREADY_COMMITTED (0xC0000000 | 0x0021)
+#define NT_STATUS_ACCESS_DENIED (0xC0000000 | 0x0022)
+#define NT_STATUS_BUFFER_TOO_SMALL (0xC0000000 | 0x0023)
+#define NT_STATUS_OBJECT_TYPE_MISMATCH (0xC0000000 | 0x0024)
+#define NT_STATUS_NONCONTINUABLE_EXCEPTION (0xC0000000 | 0x0025)
+#define NT_STATUS_INVALID_DISPOSITION (0xC0000000 | 0x0026)
+#define NT_STATUS_UNWIND (0xC0000000 | 0x0027)
+#define NT_STATUS_BAD_STACK (0xC0000000 | 0x0028)
+#define NT_STATUS_INVALID_UNWIND_TARGET (0xC0000000 | 0x0029)
+#define NT_STATUS_NOT_LOCKED (0xC0000000 | 0x002a)
+#define NT_STATUS_PARITY_ERROR (0xC0000000 | 0x002b)
+#define NT_STATUS_UNABLE_TO_DECOMMIT_VM (0xC0000000 | 0x002c)
+#define NT_STATUS_NOT_COMMITTED (0xC0000000 | 0x002d)
+#define NT_STATUS_INVALID_PORT_ATTRIBUTES (0xC0000000 | 0x002e)
+#define NT_STATUS_PORT_MESSAGE_TOO_LONG (0xC0000000 | 0x002f)
+#define NT_STATUS_INVALID_PARAMETER_MIX (0xC0000000 | 0x0030)
+#define NT_STATUS_INVALID_QUOTA_LOWER (0xC0000000 | 0x0031)
+#define NT_STATUS_DISK_CORRUPT_ERROR (0xC0000000 | 0x0032)
+#define NT_STATUS_OBJECT_NAME_INVALID (0xC0000000 | 0x0033)
+#define NT_STATUS_OBJECT_NAME_NOT_FOUND (0xC0000000 | 0x0034)
+#define NT_STATUS_OBJECT_NAME_COLLISION (0xC0000000 | 0x0035)
+#define NT_STATUS_HANDLE_NOT_WAITABLE (0xC0000000 | 0x0036)
+#define NT_STATUS_PORT_DISCONNECTED (0xC0000000 | 0x0037)
+#define NT_STATUS_DEVICE_ALREADY_ATTACHED (0xC0000000 | 0x0038)
+#define NT_STATUS_OBJECT_PATH_INVALID (0xC0000000 | 0x0039)
+#define NT_STATUS_OBJECT_PATH_NOT_FOUND (0xC0000000 | 0x003a)
+#define NT_STATUS_OBJECT_PATH_SYNTAX_BAD (0xC0000000 | 0x003b)
+#define NT_STATUS_DATA_OVERRUN (0xC0000000 | 0x003c)
+#define NT_STATUS_DATA_LATE_ERROR (0xC0000000 | 0x003d)
+#define NT_STATUS_DATA_ERROR (0xC0000000 | 0x003e)
+#define NT_STATUS_CRC_ERROR (0xC0000000 | 0x003f)
+#define NT_STATUS_SECTION_TOO_BIG (0xC0000000 | 0x0040)
+#define NT_STATUS_PORT_CONNECTION_REFUSED (0xC0000000 | 0x0041)
+#define NT_STATUS_INVALID_PORT_HANDLE (0xC0000000 | 0x0042)
+#define NT_STATUS_SHARING_VIOLATION (0xC0000000 | 0x0043)
+#define NT_STATUS_QUOTA_EXCEEDED (0xC0000000 | 0x0044)
+#define NT_STATUS_INVALID_PAGE_PROTECTION (0xC0000000 | 0x0045)
+#define NT_STATUS_MUTANT_NOT_OWNED (0xC0000000 | 0x0046)
+#define NT_STATUS_SEMAPHORE_LIMIT_EXCEEDED (0xC0000000 | 0x0047)
+#define NT_STATUS_PORT_ALREADY_SET (0xC0000000 | 0x0048)
+#define NT_STATUS_SECTION_NOT_IMAGE (0xC0000000 | 0x0049)
+#define NT_STATUS_SUSPEND_COUNT_EXCEEDED (0xC0000000 | 0x004a)
+#define NT_STATUS_THREAD_IS_TERMINATING (0xC0000000 | 0x004b)
+#define NT_STATUS_BAD_WORKING_SET_LIMIT (0xC0000000 | 0x004c)
+#define NT_STATUS_INCOMPATIBLE_FILE_MAP (0xC0000000 | 0x004d)
+#define NT_STATUS_SECTION_PROTECTION (0xC0000000 | 0x004e)
+#define NT_STATUS_EAS_NOT_SUPPORTED (0xC0000000 | 0x004f)
+#define NT_STATUS_EA_TOO_LARGE (0xC0000000 | 0x0050)
+#define NT_STATUS_NONEXISTENT_EA_ENTRY (0xC0000000 | 0x0051)
+#define NT_STATUS_NO_EAS_ON_FILE (0xC0000000 | 0x0052)
+#define NT_STATUS_EA_CORRUPT_ERROR (0xC0000000 | 0x0053)
+#define NT_STATUS_FILE_LOCK_CONFLICT (0xC0000000 | 0x0054)
+#define NT_STATUS_LOCK_NOT_GRANTED (0xC0000000 | 0x0055)
+#define NT_STATUS_DELETE_PENDING (0xC0000000 | 0x0056)
+#define NT_STATUS_CTL_FILE_NOT_SUPPORTED (0xC0000000 | 0x0057)
+#define NT_STATUS_UNKNOWN_REVISION (0xC0000000 | 0x0058)
+#define NT_STATUS_REVISION_MISMATCH (0xC0000000 | 0x0059)
+#define NT_STATUS_INVALID_OWNER (0xC0000000 | 0x005a)
+#define NT_STATUS_INVALID_PRIMARY_GROUP (0xC0000000 | 0x005b)
+#define NT_STATUS_NO_IMPERSONATION_TOKEN (0xC0000000 | 0x005c)
+#define NT_STATUS_CANT_DISABLE_MANDATORY (0xC0000000 | 0x005d)
+#define NT_STATUS_NO_LOGON_SERVERS (0xC0000000 | 0x005e)
+#define NT_STATUS_NO_SUCH_LOGON_SESSION (0xC0000000 | 0x005f)
+#define NT_STATUS_NO_SUCH_PRIVILEGE (0xC0000000 | 0x0060)
+#define NT_STATUS_PRIVILEGE_NOT_HELD (0xC0000000 | 0x0061)
+#define NT_STATUS_INVALID_ACCOUNT_NAME (0xC0000000 | 0x0062)
+#define NT_STATUS_USER_EXISTS (0xC0000000 | 0x0063)
+#define NT_STATUS_NO_SUCH_USER (0xC0000000 | 0x0064)
+#define NT_STATUS_GROUP_EXISTS (0xC0000000 | 0x0065)
+#define NT_STATUS_NO_SUCH_GROUP (0xC0000000 | 0x0066)
+#define NT_STATUS_MEMBER_IN_GROUP (0xC0000000 | 0x0067)
+#define NT_STATUS_MEMBER_NOT_IN_GROUP (0xC0000000 | 0x0068)
+#define NT_STATUS_LAST_ADMIN (0xC0000000 | 0x0069)
+#define NT_STATUS_WRONG_PASSWORD (0xC0000000 | 0x006a)
+#define NT_STATUS_ILL_FORMED_PASSWORD (0xC0000000 | 0x006b)
+#define NT_STATUS_PASSWORD_RESTRICTION (0xC0000000 | 0x006c)
+#define NT_STATUS_LOGON_FAILURE (0xC0000000 | 0x006d)
+#define NT_STATUS_ACCOUNT_RESTRICTION (0xC0000000 | 0x006e)
+#define NT_STATUS_INVALID_LOGON_HOURS (0xC0000000 | 0x006f)
+#define NT_STATUS_INVALID_WORKSTATION (0xC0000000 | 0x0070)
+#define NT_STATUS_PASSWORD_EXPIRED (0xC0000000 | 0x0071)
+#define NT_STATUS_ACCOUNT_DISABLED (0xC0000000 | 0x0072)
+#define NT_STATUS_NONE_MAPPED (0xC0000000 | 0x0073)
+#define NT_STATUS_TOO_MANY_LUIDS_REQUESTED (0xC0000000 | 0x0074)
+#define NT_STATUS_LUIDS_EXHAUSTED (0xC0000000 | 0x0075)
+#define NT_STATUS_INVALID_SUB_AUTHORITY (0xC0000000 | 0x0076)
+#define NT_STATUS_INVALID_ACL (0xC0000000 | 0x0077)
+#define NT_STATUS_INVALID_SID (0xC0000000 | 0x0078)
+#define NT_STATUS_INVALID_SECURITY_DESCR (0xC0000000 | 0x0079)
+#define NT_STATUS_PROCEDURE_NOT_FOUND (0xC0000000 | 0x007a)
+#define NT_STATUS_INVALID_IMAGE_FORMAT (0xC0000000 | 0x007b)
+#define NT_STATUS_NO_TOKEN (0xC0000000 | 0x007c)
+#define NT_STATUS_BAD_INHERITANCE_ACL (0xC0000000 | 0x007d)
+#define NT_STATUS_RANGE_NOT_LOCKED (0xC0000000 | 0x007e)
+#define NT_STATUS_DISK_FULL (0xC0000000 | 0x007f)
+#define NT_STATUS_SERVER_DISABLED (0xC0000000 | 0x0080)
+#define NT_STATUS_SERVER_NOT_DISABLED (0xC0000000 | 0x0081)
+#define NT_STATUS_TOO_MANY_GUIDS_REQUESTED (0xC0000000 | 0x0082)
+#define NT_STATUS_GUIDS_EXHAUSTED (0xC0000000 | 0x0083)
+#define NT_STATUS_INVALID_ID_AUTHORITY (0xC0000000 | 0x0084)
+#define NT_STATUS_AGENTS_EXHAUSTED (0xC0000000 | 0x0085)
+#define NT_STATUS_INVALID_VOLUME_LABEL (0xC0000000 | 0x0086)
+#define NT_STATUS_SECTION_NOT_EXTENDED (0xC0000000 | 0x0087)
+#define NT_STATUS_NOT_MAPPED_DATA (0xC0000000 | 0x0088)
+#define NT_STATUS_RESOURCE_DATA_NOT_FOUND (0xC0000000 | 0x0089)
+#define NT_STATUS_RESOURCE_TYPE_NOT_FOUND (0xC0000000 | 0x008a)
+#define NT_STATUS_RESOURCE_NAME_NOT_FOUND (0xC0000000 | 0x008b)
+#define NT_STATUS_ARRAY_BOUNDS_EXCEEDED (0xC0000000 | 0x008c)
+#define NT_STATUS_FLOAT_DENORMAL_OPERAND (0xC0000000 | 0x008d)
+#define NT_STATUS_FLOAT_DIVIDE_BY_ZERO (0xC0000000 | 0x008e)
+#define NT_STATUS_FLOAT_INEXACT_RESULT (0xC0000000 | 0x008f)
+#define NT_STATUS_FLOAT_INVALID_OPERATION (0xC0000000 | 0x0090)
+#define NT_STATUS_FLOAT_OVERFLOW (0xC0000000 | 0x0091)
+#define NT_STATUS_FLOAT_STACK_CHECK (0xC0000000 | 0x0092)
+#define NT_STATUS_FLOAT_UNDERFLOW (0xC0000000 | 0x0093)
+#define NT_STATUS_INTEGER_DIVIDE_BY_ZERO (0xC0000000 | 0x0094)
+#define NT_STATUS_INTEGER_OVERFLOW (0xC0000000 | 0x0095)
+#define NT_STATUS_PRIVILEGED_INSTRUCTION (0xC0000000 | 0x0096)
+#define NT_STATUS_TOO_MANY_PAGING_FILES (0xC0000000 | 0x0097)
+#define NT_STATUS_FILE_INVALID (0xC0000000 | 0x0098)
+#define NT_STATUS_ALLOTTED_SPACE_EXCEEDED (0xC0000000 | 0x0099)
+#define NT_STATUS_INSUFFICIENT_RESOURCES (0xC0000000 | 0x009a)
+#define NT_STATUS_DFS_EXIT_PATH_FOUND (0xC0000000 | 0x009b)
+#define NT_STATUS_DEVICE_DATA_ERROR (0xC0000000 | 0x009c)
+#define NT_STATUS_DEVICE_NOT_CONNECTED (0xC0000000 | 0x009d)
+#define NT_STATUS_DEVICE_POWER_FAILURE (0xC0000000 | 0x009e)
+#define NT_STATUS_FREE_VM_NOT_AT_BASE (0xC0000000 | 0x009f)
+#define NT_STATUS_MEMORY_NOT_ALLOCATED (0xC0000000 | 0x00a0)
+#define NT_STATUS_WORKING_SET_QUOTA (0xC0000000 | 0x00a1)
+#define NT_STATUS_MEDIA_WRITE_PROTECTED (0xC0000000 | 0x00a2)
+#define NT_STATUS_DEVICE_NOT_READY (0xC0000000 | 0x00a3)
+#define NT_STATUS_INVALID_GROUP_ATTRIBUTES (0xC0000000 | 0x00a4)
+#define NT_STATUS_BAD_IMPERSONATION_LEVEL (0xC0000000 | 0x00a5)
+#define NT_STATUS_CANT_OPEN_ANONYMOUS (0xC0000000 | 0x00a6)
+#define NT_STATUS_BAD_VALIDATION_CLASS (0xC0000000 | 0x00a7)
+#define NT_STATUS_BAD_TOKEN_TYPE (0xC0000000 | 0x00a8)
+#define NT_STATUS_BAD_MASTER_BOOT_RECORD (0xC0000000 | 0x00a9)
+#define NT_STATUS_INSTRUCTION_MISALIGNMENT (0xC0000000 | 0x00aa)
+#define NT_STATUS_INSTANCE_NOT_AVAILABLE (0xC0000000 | 0x00ab)
+#define NT_STATUS_PIPE_NOT_AVAILABLE (0xC0000000 | 0x00ac)
+#define NT_STATUS_INVALID_PIPE_STATE (0xC0000000 | 0x00ad)
+#define NT_STATUS_PIPE_BUSY (0xC0000000 | 0x00ae)
+#define NT_STATUS_ILLEGAL_FUNCTION (0xC0000000 | 0x00af)
+#define NT_STATUS_PIPE_DISCONNECTED (0xC0000000 | 0x00b0)
+#define NT_STATUS_PIPE_CLOSING (0xC0000000 | 0x00b1)
+#define NT_STATUS_PIPE_CONNECTED (0xC0000000 | 0x00b2)
+#define NT_STATUS_PIPE_LISTENING (0xC0000000 | 0x00b3)
+#define NT_STATUS_INVALID_READ_MODE (0xC0000000 | 0x00b4)
+#define NT_STATUS_IO_TIMEOUT (0xC0000000 | 0x00b5)
+#define NT_STATUS_FILE_FORCED_CLOSED (0xC0000000 | 0x00b6)
+#define NT_STATUS_PROFILING_NOT_STARTED (0xC0000000 | 0x00b7)
+#define NT_STATUS_PROFILING_NOT_STOPPED (0xC0000000 | 0x00b8)
+#define NT_STATUS_COULD_NOT_INTERPRET (0xC0000000 | 0x00b9)
+#define NT_STATUS_FILE_IS_A_DIRECTORY (0xC0000000 | 0x00ba)
+#define NT_STATUS_NOT_SUPPORTED (0xC0000000 | 0x00bb)
+#define NT_STATUS_REMOTE_NOT_LISTENING (0xC0000000 | 0x00bc)
+#define NT_STATUS_DUPLICATE_NAME (0xC0000000 | 0x00bd)
+#define NT_STATUS_BAD_NETWORK_PATH (0xC0000000 | 0x00be)
+#define NT_STATUS_NETWORK_BUSY (0xC0000000 | 0x00bf)
+#define NT_STATUS_DEVICE_DOES_NOT_EXIST (0xC0000000 | 0x00c0)
+#define NT_STATUS_TOO_MANY_COMMANDS (0xC0000000 | 0x00c1)
+#define NT_STATUS_ADAPTER_HARDWARE_ERROR (0xC0000000 | 0x00c2)
+#define NT_STATUS_INVALID_NETWORK_RESPONSE (0xC0000000 | 0x00c3)
+#define NT_STATUS_UNEXPECTED_NETWORK_ERROR (0xC0000000 | 0x00c4)
+#define NT_STATUS_BAD_REMOTE_ADAPTER (0xC0000000 | 0x00c5)
+#define NT_STATUS_PRINT_QUEUE_FULL (0xC0000000 | 0x00c6)
+#define NT_STATUS_NO_SPOOL_SPACE (0xC0000000 | 0x00c7)
+#define NT_STATUS_PRINT_CANCELLED (0xC0000000 | 0x00c8)
+#define NT_STATUS_NETWORK_NAME_DELETED (0xC0000000 | 0x00c9)
+#define NT_STATUS_NETWORK_ACCESS_DENIED (0xC0000000 | 0x00ca)
+#define NT_STATUS_BAD_DEVICE_TYPE (0xC0000000 | 0x00cb)
+#define NT_STATUS_BAD_NETWORK_NAME (0xC0000000 | 0x00cc)
+#define NT_STATUS_TOO_MANY_NAMES (0xC0000000 | 0x00cd)
+#define NT_STATUS_TOO_MANY_SESSIONS (0xC0000000 | 0x00ce)
+#define NT_STATUS_SHARING_PAUSED (0xC0000000 | 0x00cf)
+#define NT_STATUS_REQUEST_NOT_ACCEPTED (0xC0000000 | 0x00d0)
+#define NT_STATUS_REDIRECTOR_PAUSED (0xC0000000 | 0x00d1)
+#define NT_STATUS_NET_WRITE_FAULT (0xC0000000 | 0x00d2)
+#define NT_STATUS_PROFILING_AT_LIMIT (0xC0000000 | 0x00d3)
+#define NT_STATUS_NOT_SAME_DEVICE (0xC0000000 | 0x00d4)
+#define NT_STATUS_FILE_RENAMED (0xC0000000 | 0x00d5)
+#define NT_STATUS_VIRTUAL_CIRCUIT_CLOSED (0xC0000000 | 0x00d6)
+#define NT_STATUS_NO_SECURITY_ON_OBJECT (0xC0000000 | 0x00d7)
+#define NT_STATUS_CANT_WAIT (0xC0000000 | 0x00d8)
+#define NT_STATUS_PIPE_EMPTY (0xC0000000 | 0x00d9)
+#define NT_STATUS_CANT_ACCESS_DOMAIN_INFO (0xC0000000 | 0x00da)
+#define NT_STATUS_CANT_TERMINATE_SELF (0xC0000000 | 0x00db)
+#define NT_STATUS_INVALID_SERVER_STATE (0xC0000000 | 0x00dc)
+#define NT_STATUS_INVALID_DOMAIN_STATE (0xC0000000 | 0x00dd)
+#define NT_STATUS_INVALID_DOMAIN_ROLE (0xC0000000 | 0x00de)
+#define NT_STATUS_NO_SUCH_DOMAIN (0xC0000000 | 0x00df)
+#define NT_STATUS_DOMAIN_EXISTS (0xC0000000 | 0x00e0)
+#define NT_STATUS_DOMAIN_LIMIT_EXCEEDED (0xC0000000 | 0x00e1)
+#define NT_STATUS_OPLOCK_NOT_GRANTED (0xC0000000 | 0x00e2)
+#define NT_STATUS_INVALID_OPLOCK_PROTOCOL (0xC0000000 | 0x00e3)
+#define NT_STATUS_INTERNAL_DB_CORRUPTION (0xC0000000 | 0x00e4)
+#define NT_STATUS_INTERNAL_ERROR (0xC0000000 | 0x00e5)
+#define NT_STATUS_GENERIC_NOT_MAPPED (0xC0000000 | 0x00e6)
+#define NT_STATUS_BAD_DESCRIPTOR_FORMAT (0xC0000000 | 0x00e7)
+#define NT_STATUS_INVALID_USER_BUFFER (0xC0000000 | 0x00e8)
+#define NT_STATUS_UNEXPECTED_IO_ERROR (0xC0000000 | 0x00e9)
+#define NT_STATUS_UNEXPECTED_MM_CREATE_ERR (0xC0000000 | 0x00ea)
+#define NT_STATUS_UNEXPECTED_MM_MAP_ERROR (0xC0000000 | 0x00eb)
+#define NT_STATUS_UNEXPECTED_MM_EXTEND_ERR (0xC0000000 | 0x00ec)
+#define NT_STATUS_NOT_LOGON_PROCESS (0xC0000000 | 0x00ed)
+#define NT_STATUS_LOGON_SESSION_EXISTS (0xC0000000 | 0x00ee)
+#define NT_STATUS_INVALID_PARAMETER_1 (0xC0000000 | 0x00ef)
+#define NT_STATUS_INVALID_PARAMETER_2 (0xC0000000 | 0x00f0)
+#define NT_STATUS_INVALID_PARAMETER_3 (0xC0000000 | 0x00f1)
+#define NT_STATUS_INVALID_PARAMETER_4 (0xC0000000 | 0x00f2)
+#define NT_STATUS_INVALID_PARAMETER_5 (0xC0000000 | 0x00f3)
+#define NT_STATUS_INVALID_PARAMETER_6 (0xC0000000 | 0x00f4)
+#define NT_STATUS_INVALID_PARAMETER_7 (0xC0000000 | 0x00f5)
+#define NT_STATUS_INVALID_PARAMETER_8 (0xC0000000 | 0x00f6)
+#define NT_STATUS_INVALID_PARAMETER_9 (0xC0000000 | 0x00f7)
+#define NT_STATUS_INVALID_PARAMETER_10 (0xC0000000 | 0x00f8)
+#define NT_STATUS_INVALID_PARAMETER_11 (0xC0000000 | 0x00f9)
+#define NT_STATUS_INVALID_PARAMETER_12 (0xC0000000 | 0x00fa)
+#define NT_STATUS_REDIRECTOR_NOT_STARTED (0xC0000000 | 0x00fb)
+#define NT_STATUS_REDIRECTOR_STARTED (0xC0000000 | 0x00fc)
+#define NT_STATUS_STACK_OVERFLOW (0xC0000000 | 0x00fd)
+#define NT_STATUS_NO_SUCH_PACKAGE (0xC0000000 | 0x00fe)
+#define NT_STATUS_BAD_FUNCTION_TABLE (0xC0000000 | 0x00ff)
+#define NT_STATUS_DIRECTORY_NOT_EMPTY (0xC0000000 | 0x0101)
+#define NT_STATUS_FILE_CORRUPT_ERROR (0xC0000000 | 0x0102)
+#define NT_STATUS_NOT_A_DIRECTORY (0xC0000000 | 0x0103)
+#define NT_STATUS_BAD_LOGON_SESSION_STATE (0xC0000000 | 0x0104)
+#define NT_STATUS_LOGON_SESSION_COLLISION (0xC0000000 | 0x0105)
+#define NT_STATUS_NAME_TOO_LONG (0xC0000000 | 0x0106)
+#define NT_STATUS_FILES_OPEN (0xC0000000 | 0x0107)
+#define NT_STATUS_CONNECTION_IN_USE (0xC0000000 | 0x0108)
+#define NT_STATUS_MESSAGE_NOT_FOUND (0xC0000000 | 0x0109)
+#define NT_STATUS_PROCESS_IS_TERMINATING (0xC0000000 | 0x010a)
+#define NT_STATUS_INVALID_LOGON_TYPE (0xC0000000 | 0x010b)
+#define NT_STATUS_NO_GUID_TRANSLATION (0xC0000000 | 0x010c)
+#define NT_STATUS_CANNOT_IMPERSONATE (0xC0000000 | 0x010d)
+#define NT_STATUS_IMAGE_ALREADY_LOADED (0xC0000000 | 0x010e)
+#define NT_STATUS_ABIOS_NOT_PRESENT (0xC0000000 | 0x010f)
+#define NT_STATUS_ABIOS_LID_NOT_EXIST (0xC0000000 | 0x0110)
+#define NT_STATUS_ABIOS_LID_ALREADY_OWNED (0xC0000000 | 0x0111)
+#define NT_STATUS_ABIOS_NOT_LID_OWNER (0xC0000000 | 0x0112)
+#define NT_STATUS_ABIOS_INVALID_COMMAND (0xC0000000 | 0x0113)
+#define NT_STATUS_ABIOS_INVALID_LID (0xC0000000 | 0x0114)
+#define NT_STATUS_ABIOS_SELECTOR_NOT_AVAILABLE (0xC0000000 | 0x0115)
+#define NT_STATUS_ABIOS_INVALID_SELECTOR (0xC0000000 | 0x0116)
+#define NT_STATUS_NO_LDT (0xC0000000 | 0x0117)
+#define NT_STATUS_INVALID_LDT_SIZE (0xC0000000 | 0x0118)
+#define NT_STATUS_INVALID_LDT_OFFSET (0xC0000000 | 0x0119)
+#define NT_STATUS_INVALID_LDT_DESCRIPTOR (0xC0000000 | 0x011a)
+#define NT_STATUS_INVALID_IMAGE_NE_FORMAT (0xC0000000 | 0x011b)
+#define NT_STATUS_RXACT_INVALID_STATE (0xC0000000 | 0x011c)
+#define NT_STATUS_RXACT_COMMIT_FAILURE (0xC0000000 | 0x011d)
+#define NT_STATUS_MAPPED_FILE_SIZE_ZERO (0xC0000000 | 0x011e)
+#define NT_STATUS_TOO_MANY_OPENED_FILES (0xC0000000 | 0x011f)
+#define NT_STATUS_CANCELLED (0xC0000000 | 0x0120)
+#define NT_STATUS_CANNOT_DELETE (0xC0000000 | 0x0121)
+#define NT_STATUS_INVALID_COMPUTER_NAME (0xC0000000 | 0x0122)
+#define NT_STATUS_FILE_DELETED (0xC0000000 | 0x0123)
+#define NT_STATUS_SPECIAL_ACCOUNT (0xC0000000 | 0x0124)
+#define NT_STATUS_SPECIAL_GROUP (0xC0000000 | 0x0125)
+#define NT_STATUS_SPECIAL_USER (0xC0000000 | 0x0126)
+#define NT_STATUS_MEMBERS_PRIMARY_GROUP (0xC0000000 | 0x0127)
+#define NT_STATUS_FILE_CLOSED (0xC0000000 | 0x0128)
+#define NT_STATUS_TOO_MANY_THREADS (0xC0000000 | 0x0129)
+#define NT_STATUS_THREAD_NOT_IN_PROCESS (0xC0000000 | 0x012a)
+#define NT_STATUS_TOKEN_ALREADY_IN_USE (0xC0000000 | 0x012b)
+#define NT_STATUS_PAGEFILE_QUOTA_EXCEEDED (0xC0000000 | 0x012c)
+#define NT_STATUS_COMMITMENT_LIMIT (0xC0000000 | 0x012d)
+#define NT_STATUS_INVALID_IMAGE_LE_FORMAT (0xC0000000 | 0x012e)
+#define NT_STATUS_INVALID_IMAGE_NOT_MZ (0xC0000000 | 0x012f)
+#define NT_STATUS_INVALID_IMAGE_PROTECT (0xC0000000 | 0x0130)
+#define NT_STATUS_INVALID_IMAGE_WIN_16 (0xC0000000 | 0x0131)
+#define NT_STATUS_LOGON_SERVER_CONFLICT (0xC0000000 | 0x0132)
+#define NT_STATUS_TIME_DIFFERENCE_AT_DC (0xC0000000 | 0x0133)
+#define NT_STATUS_SYNCHRONIZATION_REQUIRED (0xC0000000 | 0x0134)
+#define NT_STATUS_DLL_NOT_FOUND (0xC0000000 | 0x0135)
+#define NT_STATUS_OPEN_FAILED (0xC0000000 | 0x0136)
+#define NT_STATUS_IO_PRIVILEGE_FAILED (0xC0000000 | 0x0137)
+#define NT_STATUS_ORDINAL_NOT_FOUND (0xC0000000 | 0x0138)
+#define NT_STATUS_ENTRYPOINT_NOT_FOUND (0xC0000000 | 0x0139)
+#define NT_STATUS_CONTROL_C_EXIT (0xC0000000 | 0x013a)
+#define NT_STATUS_LOCAL_DISCONNECT (0xC0000000 | 0x013b)
+#define NT_STATUS_REMOTE_DISCONNECT (0xC0000000 | 0x013c)
+#define NT_STATUS_REMOTE_RESOURCES (0xC0000000 | 0x013d)
+#define NT_STATUS_LINK_FAILED (0xC0000000 | 0x013e)
+#define NT_STATUS_LINK_TIMEOUT (0xC0000000 | 0x013f)
+#define NT_STATUS_INVALID_CONNECTION (0xC0000000 | 0x0140)
+#define NT_STATUS_INVALID_ADDRESS (0xC0000000 | 0x0141)
+#define NT_STATUS_DLL_INIT_FAILED (0xC0000000 | 0x0142)
+#define NT_STATUS_MISSING_SYSTEMFILE (0xC0000000 | 0x0143)
+#define NT_STATUS_UNHANDLED_EXCEPTION (0xC0000000 | 0x0144)
+#define NT_STATUS_APP_INIT_FAILURE (0xC0000000 | 0x0145)
+#define NT_STATUS_PAGEFILE_CREATE_FAILED (0xC0000000 | 0x0146)
+#define NT_STATUS_NO_PAGEFILE (0xC0000000 | 0x0147)
+#define NT_STATUS_INVALID_LEVEL (0xC0000000 | 0x0148)
+#define NT_STATUS_WRONG_PASSWORD_CORE (0xC0000000 | 0x0149)
+#define NT_STATUS_ILLEGAL_FLOAT_CONTEXT (0xC0000000 | 0x014a)
+#define NT_STATUS_PIPE_BROKEN (0xC0000000 | 0x014b)
+#define NT_STATUS_REGISTRY_CORRUPT (0xC0000000 | 0x014c)
+#define NT_STATUS_REGISTRY_IO_FAILED (0xC0000000 | 0x014d)
+#define NT_STATUS_NO_EVENT_PAIR (0xC0000000 | 0x014e)
+#define NT_STATUS_UNRECOGNIZED_VOLUME (0xC0000000 | 0x014f)
+#define NT_STATUS_SERIAL_NO_DEVICE_INITED (0xC0000000 | 0x0150)
+#define NT_STATUS_NO_SUCH_ALIAS (0xC0000000 | 0x0151)
+#define NT_STATUS_MEMBER_NOT_IN_ALIAS (0xC0000000 | 0x0152)
+#define NT_STATUS_MEMBER_IN_ALIAS (0xC0000000 | 0x0153)
+#define NT_STATUS_ALIAS_EXISTS (0xC0000000 | 0x0154)
+#define NT_STATUS_LOGON_NOT_GRANTED (0xC0000000 | 0x0155)
+#define NT_STATUS_TOO_MANY_SECRETS (0xC0000000 | 0x0156)
+#define NT_STATUS_SECRET_TOO_LONG (0xC0000000 | 0x0157)
+#define NT_STATUS_INTERNAL_DB_ERROR (0xC0000000 | 0x0158)
+#define NT_STATUS_FULLSCREEN_MODE (0xC0000000 | 0x0159)
+#define NT_STATUS_TOO_MANY_CONTEXT_IDS (0xC0000000 | 0x015a)
+#define NT_STATUS_LOGON_TYPE_NOT_GRANTED (0xC0000000 | 0x015b)
+#define NT_STATUS_NOT_REGISTRY_FILE (0xC0000000 | 0x015c)
+#define NT_STATUS_NT_CROSS_ENCRYPTION_REQUIRED (0xC0000000 | 0x015d)
+#define NT_STATUS_DOMAIN_CTRLR_CONFIG_ERROR (0xC0000000 | 0x015e)
+#define NT_STATUS_FT_MISSING_MEMBER (0xC0000000 | 0x015f)
+#define NT_STATUS_ILL_FORMED_SERVICE_ENTRY (0xC0000000 | 0x0160)
+#define NT_STATUS_ILLEGAL_CHARACTER (0xC0000000 | 0x0161)
+#define NT_STATUS_UNMAPPABLE_CHARACTER (0xC0000000 | 0x0162)
+#define NT_STATUS_UNDEFINED_CHARACTER (0xC0000000 | 0x0163)
+#define NT_STATUS_FLOPPY_VOLUME (0xC0000000 | 0x0164)
+#define NT_STATUS_FLOPPY_ID_MARK_NOT_FOUND (0xC0000000 | 0x0165)
+#define NT_STATUS_FLOPPY_WRONG_CYLINDER (0xC0000000 | 0x0166)
+#define NT_STATUS_FLOPPY_UNKNOWN_ERROR (0xC0000000 | 0x0167)
+#define NT_STATUS_FLOPPY_BAD_REGISTERS (0xC0000000 | 0x0168)
+#define NT_STATUS_DISK_RECALIBRATE_FAILED (0xC0000000 | 0x0169)
+#define NT_STATUS_DISK_OPERATION_FAILED (0xC0000000 | 0x016a)
+#define NT_STATUS_DISK_RESET_FAILED (0xC0000000 | 0x016b)
+#define NT_STATUS_SHARED_IRQ_BUSY (0xC0000000 | 0x016c)
+#define NT_STATUS_FT_ORPHANING (0xC0000000 | 0x016d)
+#define NT_STATUS_PARTITION_FAILURE (0xC0000000 | 0x0172)
+#define NT_STATUS_INVALID_BLOCK_LENGTH (0xC0000000 | 0x0173)
+#define NT_STATUS_DEVICE_NOT_PARTITIONED (0xC0000000 | 0x0174)
+#define NT_STATUS_UNABLE_TO_LOCK_MEDIA (0xC0000000 | 0x0175)
+#define NT_STATUS_UNABLE_TO_UNLOAD_MEDIA (0xC0000000 | 0x0176)
+#define NT_STATUS_EOM_OVERFLOW (0xC0000000 | 0x0177)
+#define NT_STATUS_NO_MEDIA (0xC0000000 | 0x0178)
+#define NT_STATUS_NO_SUCH_MEMBER (0xC0000000 | 0x017a)
+#define NT_STATUS_INVALID_MEMBER (0xC0000000 | 0x017b)
+#define NT_STATUS_KEY_DELETED (0xC0000000 | 0x017c)
+#define NT_STATUS_NO_LOG_SPACE (0xC0000000 | 0x017d)
+#define NT_STATUS_TOO_MANY_SIDS (0xC0000000 | 0x017e)
+#define NT_STATUS_LM_CROSS_ENCRYPTION_REQUIRED (0xC0000000 | 0x017f)
+#define NT_STATUS_KEY_HAS_CHILDREN (0xC0000000 | 0x0180)
+#define NT_STATUS_CHILD_MUST_BE_VOLATILE (0xC0000000 | 0x0181)
+#define NT_STATUS_DEVICE_CONFIGURATION_ERROR (0xC0000000 | 0x0182)
+#define NT_STATUS_DRIVER_INTERNAL_ERROR (0xC0000000 | 0x0183)
+#define NT_STATUS_INVALID_DEVICE_STATE (0xC0000000 | 0x0184)
+#define NT_STATUS_IO_DEVICE_ERROR (0xC0000000 | 0x0185)
+#define NT_STATUS_DEVICE_PROTOCOL_ERROR (0xC0000000 | 0x0186)
+#define NT_STATUS_BACKUP_CONTROLLER (0xC0000000 | 0x0187)
+#define NT_STATUS_LOG_FILE_FULL (0xC0000000 | 0x0188)
+#define NT_STATUS_TOO_LATE (0xC0000000 | 0x0189)
+#define NT_STATUS_NO_TRUST_LSA_SECRET (0xC0000000 | 0x018a)
+#define NT_STATUS_NO_TRUST_SAM_ACCOUNT (0xC0000000 | 0x018b)
+#define NT_STATUS_TRUSTED_DOMAIN_FAILURE (0xC0000000 | 0x018c)
+#define NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE (0xC0000000 | 0x018d)
+#define NT_STATUS_EVENTLOG_FILE_CORRUPT (0xC0000000 | 0x018e)
+#define NT_STATUS_EVENTLOG_CANT_START (0xC0000000 | 0x018f)
+#define NT_STATUS_TRUST_FAILURE (0xC0000000 | 0x0190)
+#define NT_STATUS_MUTANT_LIMIT_EXCEEDED (0xC0000000 | 0x0191)
+#define NT_STATUS_NETLOGON_NOT_STARTED (0xC0000000 | 0x0192)
+#define NT_STATUS_ACCOUNT_EXPIRED (0xC0000000 | 0x0193)
+#define NT_STATUS_POSSIBLE_DEADLOCK (0xC0000000 | 0x0194)
+#define NT_STATUS_NETWORK_CREDENTIAL_CONFLICT (0xC0000000 | 0x0195)
+#define NT_STATUS_REMOTE_SESSION_LIMIT (0xC0000000 | 0x0196)
+#define NT_STATUS_EVENTLOG_FILE_CHANGED (0xC0000000 | 0x0197)
+#define NT_STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT (0xC0000000 | 0x0198)
+#define NT_STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT (0xC0000000 | 0x0199)
+#define NT_STATUS_NOLOGON_SERVER_TRUST_ACCOUNT (0xC0000000 | 0x019a)
+#define NT_STATUS_DOMAIN_TRUST_INCONSISTENT (0xC0000000 | 0x019b)
+#define NT_STATUS_FS_DRIVER_REQUIRED (0xC0000000 | 0x019c)
+#define NT_STATUS_NO_USER_SESSION_KEY (0xC0000000 | 0x0202)
+#define NT_STATUS_USER_SESSION_DELETED (0xC0000000 | 0x0203)
+#define NT_STATUS_RESOURCE_LANG_NOT_FOUND (0xC0000000 | 0x0204)
+#define NT_STATUS_INSUFF_SERVER_RESOURCES (0xC0000000 | 0x0205)
+#define NT_STATUS_INVALID_BUFFER_SIZE (0xC0000000 | 0x0206)
+#define NT_STATUS_INVALID_ADDRESS_COMPONENT (0xC0000000 | 0x0207)
+#define NT_STATUS_INVALID_ADDRESS_WILDCARD (0xC0000000 | 0x0208)
+#define NT_STATUS_TOO_MANY_ADDRESSES (0xC0000000 | 0x0209)
+#define NT_STATUS_ADDRESS_ALREADY_EXISTS (0xC0000000 | 0x020a)
+#define NT_STATUS_ADDRESS_CLOSED (0xC0000000 | 0x020b)
+#define NT_STATUS_CONNECTION_DISCONNECTED (0xC0000000 | 0x020c)
+#define NT_STATUS_CONNECTION_RESET (0xC0000000 | 0x020d)
+#define NT_STATUS_TOO_MANY_NODES (0xC0000000 | 0x020e)
+#define NT_STATUS_TRANSACTION_ABORTED (0xC0000000 | 0x020f)
+#define NT_STATUS_TRANSACTION_TIMED_OUT (0xC0000000 | 0x0210)
+#define NT_STATUS_TRANSACTION_NO_RELEASE (0xC0000000 | 0x0211)
+#define NT_STATUS_TRANSACTION_NO_MATCH (0xC0000000 | 0x0212)
+#define NT_STATUS_TRANSACTION_RESPONDED (0xC0000000 | 0x0213)
+#define NT_STATUS_TRANSACTION_INVALID_ID (0xC0000000 | 0x0214)
+#define NT_STATUS_TRANSACTION_INVALID_TYPE (0xC0000000 | 0x0215)
+#define NT_STATUS_NOT_SERVER_SESSION (0xC0000000 | 0x0216)
+#define NT_STATUS_NOT_CLIENT_SESSION (0xC0000000 | 0x0217)
+#define NT_STATUS_CANNOT_LOAD_REGISTRY_FILE (0xC0000000 | 0x0218)
+#define NT_STATUS_DEBUG_ATTACH_FAILED (0xC0000000 | 0x0219)
+#define NT_STATUS_SYSTEM_PROCESS_TERMINATED (0xC0000000 | 0x021a)
+#define NT_STATUS_DATA_NOT_ACCEPTED (0xC0000000 | 0x021b)
+#define NT_STATUS_NO_BROWSER_SERVERS_FOUND (0xC0000000 | 0x021c)
+#define NT_STATUS_VDM_HARD_ERROR (0xC0000000 | 0x021d)
+#define NT_STATUS_DRIVER_CANCEL_TIMEOUT (0xC0000000 | 0x021e)
+#define NT_STATUS_REPLY_MESSAGE_MISMATCH (0xC0000000 | 0x021f)
+#define NT_STATUS_MAPPED_ALIGNMENT (0xC0000000 | 0x0220)
+#define NT_STATUS_IMAGE_CHECKSUM_MISMATCH (0xC0000000 | 0x0221)
+#define NT_STATUS_LOST_WRITEBEHIND_DATA (0xC0000000 | 0x0222)
+#define NT_STATUS_CLIENT_SERVER_PARAMETERS_INVALID (0xC0000000 | 0x0223)
+#define NT_STATUS_PASSWORD_MUST_CHANGE (0xC0000000 | 0x0224)
+#define NT_STATUS_NOT_FOUND (0xC0000000 | 0x0225)
+#define NT_STATUS_NOT_TINY_STREAM (0xC0000000 | 0x0226)
+#define NT_STATUS_RECOVERY_FAILURE (0xC0000000 | 0x0227)
+#define NT_STATUS_STACK_OVERFLOW_READ (0xC0000000 | 0x0228)
+#define NT_STATUS_FAIL_CHECK (0xC0000000 | 0x0229)
+#define NT_STATUS_DUPLICATE_OBJECTID (0xC0000000 | 0x022a)
+#define NT_STATUS_OBJECTID_EXISTS (0xC0000000 | 0x022b)
+#define NT_STATUS_CONVERT_TO_LARGE (0xC0000000 | 0x022c)
+#define NT_STATUS_RETRY (0xC0000000 | 0x022d)
+#define NT_STATUS_FOUND_OUT_OF_SCOPE (0xC0000000 | 0x022e)
+#define NT_STATUS_ALLOCATE_BUCKET (0xC0000000 | 0x022f)
+#define NT_STATUS_PROPSET_NOT_FOUND (0xC0000000 | 0x0230)
+#define NT_STATUS_MARSHALL_OVERFLOW (0xC0000000 | 0x0231)
+#define NT_STATUS_INVALID_VARIANT (0xC0000000 | 0x0232)
+#define NT_STATUS_DOMAIN_CONTROLLER_NOT_FOUND (0xC0000000 | 0x0233)
+#define NT_STATUS_ACCOUNT_LOCKED_OUT (0xC0000000 | 0x0234)
+#define NT_STATUS_HANDLE_NOT_CLOSABLE (0xC0000000 | 0x0235)
+#define NT_STATUS_CONNECTION_REFUSED (0xC0000000 | 0x0236)
+#define NT_STATUS_GRACEFUL_DISCONNECT (0xC0000000 | 0x0237)
+#define NT_STATUS_ADDRESS_ALREADY_ASSOCIATED (0xC0000000 | 0x0238)
+#define NT_STATUS_ADDRESS_NOT_ASSOCIATED (0xC0000000 | 0x0239)
+#define NT_STATUS_CONNECTION_INVALID (0xC0000000 | 0x023a)
+#define NT_STATUS_CONNECTION_ACTIVE (0xC0000000 | 0x023b)
+#define NT_STATUS_NETWORK_UNREACHABLE (0xC0000000 | 0x023c)
+#define NT_STATUS_HOST_UNREACHABLE (0xC0000000 | 0x023d)
+#define NT_STATUS_PROTOCOL_UNREACHABLE (0xC0000000 | 0x023e)
+#define NT_STATUS_PORT_UNREACHABLE (0xC0000000 | 0x023f)
+#define NT_STATUS_REQUEST_ABORTED (0xC0000000 | 0x0240)
+#define NT_STATUS_CONNECTION_ABORTED (0xC0000000 | 0x0241)
+#define NT_STATUS_BAD_COMPRESSION_BUFFER (0xC0000000 | 0x0242)
+#define NT_STATUS_USER_MAPPED_FILE (0xC0000000 | 0x0243)
+#define NT_STATUS_AUDIT_FAILED (0xC0000000 | 0x0244)
+#define NT_STATUS_TIMER_RESOLUTION_NOT_SET (0xC0000000 | 0x0245)
+#define NT_STATUS_CONNECTION_COUNT_LIMIT (0xC0000000 | 0x0246)
+#define NT_STATUS_LOGIN_TIME_RESTRICTION (0xC0000000 | 0x0247)
+#define NT_STATUS_LOGIN_WKSTA_RESTRICTION (0xC0000000 | 0x0248)
+#define NT_STATUS_IMAGE_MP_UP_MISMATCH (0xC0000000 | 0x0249)
+#define NT_STATUS_INSUFFICIENT_LOGON_INFO (0xC0000000 | 0x0250)
+#define NT_STATUS_BAD_DLL_ENTRYPOINT (0xC0000000 | 0x0251)
+#define NT_STATUS_BAD_SERVICE_ENTRYPOINT (0xC0000000 | 0x0252)
+#define NT_STATUS_LPC_REPLY_LOST (0xC0000000 | 0x0253)
+#define NT_STATUS_IP_ADDRESS_CONFLICT1 (0xC0000000 | 0x0254)
+#define NT_STATUS_IP_ADDRESS_CONFLICT2 (0xC0000000 | 0x0255)
+#define NT_STATUS_REGISTRY_QUOTA_LIMIT (0xC0000000 | 0x0256)
+#define NT_STATUS_PATH_NOT_COVERED (0xC0000000 | 0x0257)
+#define NT_STATUS_NO_CALLBACK_ACTIVE (0xC0000000 | 0x0258)
+#define NT_STATUS_LICENSE_QUOTA_EXCEEDED (0xC0000000 | 0x0259)
+#define NT_STATUS_PWD_TOO_SHORT (0xC0000000 | 0x025a)
+#define NT_STATUS_PWD_TOO_RECENT (0xC0000000 | 0x025b)
+#define NT_STATUS_PWD_HISTORY_CONFLICT (0xC0000000 | 0x025c)
+#define NT_STATUS_PLUGPLAY_NO_DEVICE (0xC0000000 | 0x025e)
+#define NT_STATUS_UNSUPPORTED_COMPRESSION (0xC0000000 | 0x025f)
+#define NT_STATUS_INVALID_HW_PROFILE (0xC0000000 | 0x0260)
+#define NT_STATUS_INVALID_PLUGPLAY_DEVICE_PATH (0xC0000000 | 0x0261)
+#define NT_STATUS_DRIVER_ORDINAL_NOT_FOUND (0xC0000000 | 0x0262)
+#define NT_STATUS_DRIVER_ENTRYPOINT_NOT_FOUND (0xC0000000 | 0x0263)
+#define NT_STATUS_RESOURCE_NOT_OWNED (0xC0000000 | 0x0264)
+#define NT_STATUS_TOO_MANY_LINKS (0xC0000000 | 0x0265)
+#define NT_STATUS_QUOTA_LIST_INCONSISTENT (0xC0000000 | 0x0266)
+#define NT_STATUS_FILE_IS_OFFLINE (0xC0000000 | 0x0267)
+#define NT_STATUS_NETWORK_SESSION_EXPIRED (0xC0000000 | 0x035c)
+#define NT_STATUS_NO_SUCH_JOB (0xC0000000 | 0xEDE) /* scheduler */
+#define NT_STATUS_NO_PREAUTH_INTEGRITY_HASH_OVERLAP (0xC0000000 | 0x5D0000)
+#define NT_STATUS_PENDING 0x00000103
+#endif /* _NTERR_H */
diff --git a/fs/ksmbd/ntlmssp.h b/fs/ksmbd/ntlmssp.h
new file mode 100644
index 000000000000..adaf4c0cbe8f
--- /dev/null
+++ b/fs/ksmbd/ntlmssp.h
@@ -0,0 +1,169 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+/*
+ * Copyright (c) International Business Machines Corp., 2002,2007
+ * Author(s): Steve French (sfrench@us.ibm.com)
+ */
+
+#ifndef __KSMBD_NTLMSSP_H
+#define __KSMBD_NTLMSSP_H
+
+#define NTLMSSP_SIGNATURE "NTLMSSP"
+
+/* Security blob target info data */
+#define TGT_Name "KSMBD"
+
+/*
+ * Size of the crypto key returned on the negotiate SMB in bytes
+ */
+#define CIFS_CRYPTO_KEY_SIZE (8)
+#define CIFS_KEY_SIZE (40)
+
+/*
+ * Size of encrypted user password in bytes
+ */
+#define CIFS_ENCPWD_SIZE (16)
+#define CIFS_CPHTXT_SIZE (16)
+
+/* Message Types */
+#define NtLmNegotiate cpu_to_le32(1)
+#define NtLmChallenge cpu_to_le32(2)
+#define NtLmAuthenticate cpu_to_le32(3)
+#define UnknownMessage cpu_to_le32(8)
+
+/* Negotiate Flags */
+#define NTLMSSP_NEGOTIATE_UNICODE 0x01 /* Text strings are unicode */
+#define NTLMSSP_NEGOTIATE_OEM 0x02 /* Text strings are in OEM */
+#define NTLMSSP_REQUEST_TARGET 0x04 /* Srv returns its auth realm */
+/* define reserved9 0x08 */
+#define NTLMSSP_NEGOTIATE_SIGN 0x0010 /* Request signing capability */
+#define NTLMSSP_NEGOTIATE_SEAL 0x0020 /* Request confidentiality */
+#define NTLMSSP_NEGOTIATE_DGRAM 0x0040
+#define NTLMSSP_NEGOTIATE_LM_KEY 0x0080 /* Use LM session key */
+/* defined reserved 8 0x0100 */
+#define NTLMSSP_NEGOTIATE_NTLM 0x0200 /* NTLM authentication */
+#define NTLMSSP_NEGOTIATE_NT_ONLY 0x0400 /* Lanman not allowed */
+#define NTLMSSP_ANONYMOUS 0x0800
+#define NTLMSSP_NEGOTIATE_DOMAIN_SUPPLIED 0x1000 /* reserved6 */
+#define NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED 0x2000
+#define NTLMSSP_NEGOTIATE_LOCAL_CALL 0x4000 /* client/server same machine */
+#define NTLMSSP_NEGOTIATE_ALWAYS_SIGN 0x8000 /* Sign. All security levels */
+#define NTLMSSP_TARGET_TYPE_DOMAIN 0x10000
+#define NTLMSSP_TARGET_TYPE_SERVER 0x20000
+#define NTLMSSP_TARGET_TYPE_SHARE 0x40000
+#define NTLMSSP_NEGOTIATE_EXTENDED_SEC 0x80000 /* NB:not related to NTLMv2 pwd*/
+/* #define NTLMSSP_REQUEST_INIT_RESP 0x100000 */
+#define NTLMSSP_NEGOTIATE_IDENTIFY 0x100000
+#define NTLMSSP_REQUEST_ACCEPT_RESP 0x200000 /* reserved5 */
+#define NTLMSSP_REQUEST_NON_NT_KEY 0x400000
+#define NTLMSSP_NEGOTIATE_TARGET_INFO 0x800000
+/* #define reserved4 0x1000000 */
+#define NTLMSSP_NEGOTIATE_VERSION 0x2000000 /* we do not set */
+/* #define reserved3 0x4000000 */
+/* #define reserved2 0x8000000 */
+/* #define reserved1 0x10000000 */
+#define NTLMSSP_NEGOTIATE_128 0x20000000
+#define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000
+#define NTLMSSP_NEGOTIATE_56 0x80000000
+
+/* Define AV Pair Field IDs */
+enum av_field_type {
+ NTLMSSP_AV_EOL = 0,
+ NTLMSSP_AV_NB_COMPUTER_NAME,
+ NTLMSSP_AV_NB_DOMAIN_NAME,
+ NTLMSSP_AV_DNS_COMPUTER_NAME,
+ NTLMSSP_AV_DNS_DOMAIN_NAME,
+ NTLMSSP_AV_DNS_TREE_NAME,
+ NTLMSSP_AV_FLAGS,
+ NTLMSSP_AV_TIMESTAMP,
+ NTLMSSP_AV_RESTRICTION,
+ NTLMSSP_AV_TARGET_NAME,
+ NTLMSSP_AV_CHANNEL_BINDINGS
+};
+
+/*
+ * Typedefs are not used here, but the structure and field names below are
+ * kept close to the OpenGroup standards document for NTLMSSP so the code
+ * is easy to compare against the specification.
+ */
+
+struct security_buffer {
+ __le16 Length;
+ __le16 MaximumLength;
+ __le32 BufferOffset; /* offset to buffer */
+} __packed;
+
+struct target_info {
+ __le16 Type;
+ __le16 Length;
+ __u8 Content[0];
+} __packed;
+
+struct negotiate_message {
+ __u8 Signature[sizeof(NTLMSSP_SIGNATURE)];
+ __le32 MessageType; /* NtLmNegotiate = 1 */
+ __le32 NegotiateFlags;
+ struct security_buffer DomainName; /* RFC 1001 style and ASCII */
+ struct security_buffer WorkstationName; /* RFC 1001 and ASCII */
+ /*
+ * struct security_buffer for version info not present since we
+ * do not set the version is present flag
+ */
+ char DomainString[0];
+ /* followed by WorkstationString */
+} __packed;
+
+struct challenge_message {
+ __u8 Signature[sizeof(NTLMSSP_SIGNATURE)];
+ __le32 MessageType; /* NtLmChallenge = 2 */
+ struct security_buffer TargetName;
+ __le32 NegotiateFlags;
+ __u8 Challenge[CIFS_CRYPTO_KEY_SIZE];
+ __u8 Reserved[8];
+ struct security_buffer TargetInfoArray;
+ /*
+ * struct security_buffer for version info not present since we
+ * do not set the version is present flag
+ */
+} __packed;
+
+struct authenticate_message {
+ __u8 Signature[sizeof(NTLMSSP_SIGNATURE)];
+	__le32 MessageType;	/* NtLmAuthenticate = 3 */
+ struct security_buffer LmChallengeResponse;
+ struct security_buffer NtChallengeResponse;
+ struct security_buffer DomainName;
+ struct security_buffer UserName;
+ struct security_buffer WorkstationName;
+ struct security_buffer SessionKey;
+ __le32 NegotiateFlags;
+ /*
+ * struct security_buffer for version info not present since we
+ * do not set the version is present flag
+ */
+ char UserString[0];
+} __packed;
+
+struct ntlmv2_resp {
+ char ntlmv2_hash[CIFS_ENCPWD_SIZE];
+ __le32 blob_signature;
+ __u32 reserved;
+ __le64 time;
+ __u64 client_chal; /* random */
+ __u32 reserved2;
+ /* array of name entries could follow ending in minimum 4 byte struct */
+} __packed;
+
+/* per smb session structure/fields */
+struct ntlmssp_auth {
+ /* whether session key is per smb session */
+ bool sesskey_per_smbsess;
+ /* sent by client in type 1 ntlmsssp exchange */
+ __u32 client_flags;
+ /* sent by server in type 2 ntlmssp exchange */
+ __u32 conn_flags;
+ /* sent to server */
+ unsigned char ciphertext[CIFS_CPHTXT_SIZE];
+ /* used by ntlmssp */
+ char cryptkey[CIFS_CRYPTO_KEY_SIZE];
+};
+#endif /* __KSMBD_NTLMSSP_H */
diff --git a/fs/ksmbd/oplock.c b/fs/ksmbd/oplock.c
new file mode 100644
index 000000000000..6ace6c2f22dc
--- /dev/null
+++ b/fs/ksmbd/oplock.c
@@ -0,0 +1,1709 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org>
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/moduleparam.h>
+
+#include "glob.h"
+#include "oplock.h"
+
+#include "smb_common.h"
+#include "smbstatus.h"
+#include "connection.h"
+#include "mgmt/user_session.h"
+#include "mgmt/share_config.h"
+#include "mgmt/tree_connect.h"
+
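+/*
+ * Global list of per-client lease tables.  Each lease_table is keyed by the
+ * client GUID and keeps that client's leases on its own lease_list (guarded
+ * by lb_lock); the table list itself is guarded by lease_list_lock.
+ */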
+static LIST_HEAD(lease_table_list);
+static DEFINE_RWLOCK(lease_list_lock);
+
+/**
+ * alloc_opinfo() - allocate a new opinfo object for oplock info
+ * @work: smb work
+ * @id: fid of open file
+ * @Tid: tree id of connection
+ *
+ * Return: allocated opinfo object on success, otherwise NULL
+ */
+static struct oplock_info *alloc_opinfo(struct ksmbd_work *work,
+ u64 id, __u16 Tid)
+{
+ struct ksmbd_session *sess = work->sess;
+ struct oplock_info *opinfo;
+
+ opinfo = kzalloc(sizeof(struct oplock_info), GFP_KERNEL);
+ if (!opinfo)
+ return NULL;
+
+ opinfo->sess = sess;
+ opinfo->conn = sess->conn;
+ opinfo->level = SMB2_OPLOCK_LEVEL_NONE;
+ opinfo->op_state = OPLOCK_STATE_NONE;
+ opinfo->pending_break = 0;
+ opinfo->fid = id;
+ opinfo->Tid = Tid;
+ INIT_LIST_HEAD(&opinfo->op_entry);
+ INIT_LIST_HEAD(&opinfo->interim_list);
+ init_waitqueue_head(&opinfo->oplock_q);
+ init_waitqueue_head(&opinfo->oplock_brk);
+ atomic_set(&opinfo->refcount, 1);
+ atomic_set(&opinfo->breaking_cnt, 0);
+
+ return opinfo;
+}
+
+static void lease_add_list(struct oplock_info *opinfo)
+{
+ struct lease_table *lb = opinfo->o_lease->l_lb;
+
+ spin_lock(&lb->lb_lock);
+ list_add_rcu(&opinfo->lease_entry, &lb->lease_list);
+ spin_unlock(&lb->lb_lock);
+}
+
+static void lease_del_list(struct oplock_info *opinfo)
+{
+ struct lease_table *lb = opinfo->o_lease->l_lb;
+
+ if (!lb)
+ return;
+
+ spin_lock(&lb->lb_lock);
+ if (list_empty(&opinfo->lease_entry)) {
+ spin_unlock(&lb->lb_lock);
+ return;
+ }
+
+ list_del_init(&opinfo->lease_entry);
+ opinfo->o_lease->l_lb = NULL;
+ spin_unlock(&lb->lb_lock);
+}
+
+static void lb_add(struct lease_table *lb)
+{
+ write_lock(&lease_list_lock);
+ list_add(&lb->l_entry, &lease_table_list);
+ write_unlock(&lease_list_lock);
+}
+
+static int alloc_lease(struct oplock_info *opinfo, struct lease_ctx_info *lctx)
+{
+ struct lease *lease;
+
+ lease = kmalloc(sizeof(struct lease), GFP_KERNEL);
+ if (!lease)
+ return -ENOMEM;
+
+ memcpy(lease->lease_key, lctx->lease_key, SMB2_LEASE_KEY_SIZE);
+ lease->state = lctx->req_state;
+ lease->new_state = 0;
+ lease->flags = lctx->flags;
+ lease->duration = lctx->duration;
+ memcpy(lease->parent_lease_key, lctx->parent_lease_key, SMB2_LEASE_KEY_SIZE);
+ lease->version = lctx->version;
+ lease->epoch = 0;
+ INIT_LIST_HEAD(&opinfo->lease_entry);
+ opinfo->o_lease = lease;
+
+ return 0;
+}
+
+static void free_lease(struct oplock_info *opinfo)
+{
+ struct lease *lease;
+
+ lease = opinfo->o_lease;
+ kfree(lease);
+}
+
+static void free_opinfo(struct oplock_info *opinfo)
+{
+ if (opinfo->is_lease)
+ free_lease(opinfo);
+ kfree(opinfo);
+}
+
+static inline void opinfo_free_rcu(struct rcu_head *rcu_head)
+{
+ struct oplock_info *opinfo;
+
+ opinfo = container_of(rcu_head, struct oplock_info, rcu_head);
+ free_opinfo(opinfo);
+}
+
+struct oplock_info *opinfo_get(struct ksmbd_file *fp)
+{
+ struct oplock_info *opinfo;
+
+ rcu_read_lock();
+ opinfo = rcu_dereference(fp->f_opinfo);
+ if (opinfo && !atomic_inc_not_zero(&opinfo->refcount))
+ opinfo = NULL;
+ rcu_read_unlock();
+
+ return opinfo;
+}
+
+static struct oplock_info *opinfo_get_list(struct ksmbd_inode *ci)
+{
+ struct oplock_info *opinfo;
+
+ if (list_empty(&ci->m_op_list))
+ return NULL;
+
+ rcu_read_lock();
+ opinfo = list_first_or_null_rcu(&ci->m_op_list, struct oplock_info,
+ op_entry);
+ if (opinfo && !atomic_inc_not_zero(&opinfo->refcount))
+ opinfo = NULL;
+ rcu_read_unlock();
+
+ return opinfo;
+}
+
+void opinfo_put(struct oplock_info *opinfo)
+{
+ if (!atomic_dec_and_test(&opinfo->refcount))
+ return;
+
+ call_rcu(&opinfo->rcu_head, opinfo_free_rcu);
+}
+
+static void opinfo_add(struct oplock_info *opinfo)
+{
+ struct ksmbd_inode *ci = opinfo->o_fp->f_ci;
+
+ write_lock(&ci->m_lock);
+ list_add_rcu(&opinfo->op_entry, &ci->m_op_list);
+ write_unlock(&ci->m_lock);
+}
+
+static void opinfo_del(struct oplock_info *opinfo)
+{
+ struct ksmbd_inode *ci = opinfo->o_fp->f_ci;
+
+ if (opinfo->is_lease) {
+ write_lock(&lease_list_lock);
+ lease_del_list(opinfo);
+ write_unlock(&lease_list_lock);
+ }
+ write_lock(&ci->m_lock);
+ list_del_rcu(&opinfo->op_entry);
+ write_unlock(&ci->m_lock);
+}
+
+static unsigned long opinfo_count(struct ksmbd_file *fp)
+{
+ if (ksmbd_stream_fd(fp))
+ return atomic_read(&fp->f_ci->sop_count);
+ else
+ return atomic_read(&fp->f_ci->op_count);
+}
+
+static void opinfo_count_inc(struct ksmbd_file *fp)
+{
+ if (ksmbd_stream_fd(fp))
+ return atomic_inc(&fp->f_ci->sop_count);
+ else
+ return atomic_inc(&fp->f_ci->op_count);
+}
+
+static void opinfo_count_dec(struct ksmbd_file *fp)
+{
+ if (ksmbd_stream_fd(fp))
+ return atomic_dec(&fp->f_ci->sop_count);
+ else
+ return atomic_dec(&fp->f_ci->op_count);
+}
+
+/**
+ * opinfo_write_to_read() - convert a write oplock to read oplock
+ * @opinfo: current oplock info
+ *
+ * Return: 0 on success, otherwise -EINVAL
+ */
+int opinfo_write_to_read(struct oplock_info *opinfo)
+{
+ struct lease *lease = opinfo->o_lease;
+
+ if (!(opinfo->level == SMB2_OPLOCK_LEVEL_BATCH ||
+ opinfo->level == SMB2_OPLOCK_LEVEL_EXCLUSIVE)) {
+ pr_err("bad oplock(0x%x)\n", opinfo->level);
+ if (opinfo->is_lease)
+ pr_err("lease state(0x%x)\n", lease->state);
+ return -EINVAL;
+ }
+ opinfo->level = SMB2_OPLOCK_LEVEL_II;
+
+ if (opinfo->is_lease)
+ lease->state = lease->new_state;
+ return 0;
+}
+
+/**
+ * opinfo_read_handle_to_read() - convert a read/handle oplock to read oplock
+ * @opinfo: current oplock info
+ *
+ * Return: 0 on success, otherwise -EINVAL
+ */
+int opinfo_read_handle_to_read(struct oplock_info *opinfo)
+{
+ struct lease *lease = opinfo->o_lease;
+
+ lease->state = lease->new_state;
+ opinfo->level = SMB2_OPLOCK_LEVEL_II;
+ return 0;
+}
+
+/**
+ * opinfo_write_to_none() - convert a write oplock to none
+ * @opinfo: current oplock info
+ *
+ * Return: 0 on success, otherwise -EINVAL
+ */
+int opinfo_write_to_none(struct oplock_info *opinfo)
+{
+ struct lease *lease = opinfo->o_lease;
+
+ if (!(opinfo->level == SMB2_OPLOCK_LEVEL_BATCH ||
+ opinfo->level == SMB2_OPLOCK_LEVEL_EXCLUSIVE)) {
+ pr_err("bad oplock(0x%x)\n", opinfo->level);
+ if (opinfo->is_lease)
+ pr_err("lease state(0x%x)\n", lease->state);
+ return -EINVAL;
+ }
+ opinfo->level = SMB2_OPLOCK_LEVEL_NONE;
+ if (opinfo->is_lease)
+ lease->state = lease->new_state;
+ return 0;
+}
+
+/**
+ * opinfo_read_to_none() - convert a read oplock to none
+ * @opinfo: current oplock info
+ *
+ * Return: 0 on success, otherwise -EINVAL
+ */
+int opinfo_read_to_none(struct oplock_info *opinfo)
+{
+ struct lease *lease = opinfo->o_lease;
+
+ if (opinfo->level != SMB2_OPLOCK_LEVEL_II) {
+ pr_err("bad oplock(0x%x)\n", opinfo->level);
+ if (opinfo->is_lease)
+ pr_err("lease state(0x%x)\n", lease->state);
+ return -EINVAL;
+ }
+ opinfo->level = SMB2_OPLOCK_LEVEL_NONE;
+ if (opinfo->is_lease)
+ lease->state = lease->new_state;
+ return 0;
+}
+
+/**
+ * lease_read_to_write() - upgrade lease state from read to write
+ * @opinfo: current lease info
+ *
+ * Return: 0 on success, otherwise -EINVAL
+ */
+int lease_read_to_write(struct oplock_info *opinfo)
+{
+ struct lease *lease = opinfo->o_lease;
+
+ if (!(lease->state & SMB2_LEASE_READ_CACHING_LE)) {
+ ksmbd_debug(OPLOCK, "bad lease state(0x%x)\n", lease->state);
+ return -EINVAL;
+ }
+
+ lease->new_state = SMB2_LEASE_NONE_LE;
+ lease->state |= SMB2_LEASE_WRITE_CACHING_LE;
+ if (lease->state & SMB2_LEASE_HANDLE_CACHING_LE)
+ opinfo->level = SMB2_OPLOCK_LEVEL_BATCH;
+ else
+ opinfo->level = SMB2_OPLOCK_LEVEL_EXCLUSIVE;
+ return 0;
+}
+
+/**
+ * lease_none_upgrade() - upgrade lease state from none
+ * @opinfo: current lease info
+ * @new_state: new lease state
+ *
+ * Return: 0 on success, otherwise -EINVAL
+ */
+static int lease_none_upgrade(struct oplock_info *opinfo, __le32 new_state)
+{
+ struct lease *lease = opinfo->o_lease;
+
+ if (!(lease->state == SMB2_LEASE_NONE_LE)) {
+ ksmbd_debug(OPLOCK, "bad lease state(0x%x)\n", lease->state);
+ return -EINVAL;
+ }
+
+ lease->new_state = SMB2_LEASE_NONE_LE;
+ lease->state = new_state;
+ if (lease->state & SMB2_LEASE_HANDLE_CACHING_LE)
+ if (lease->state & SMB2_LEASE_WRITE_CACHING_LE)
+ opinfo->level = SMB2_OPLOCK_LEVEL_BATCH;
+ else
+ opinfo->level = SMB2_OPLOCK_LEVEL_II;
+ else if (lease->state & SMB2_LEASE_WRITE_CACHING_LE)
+ opinfo->level = SMB2_OPLOCK_LEVEL_EXCLUSIVE;
+ else if (lease->state & SMB2_LEASE_READ_CACHING_LE)
+ opinfo->level = SMB2_OPLOCK_LEVEL_II;
+
+ return 0;
+}
+
+/**
+ * close_id_del_oplock() - release oplock object at file close time
+ * @fp: ksmbd file pointer
+ */
+void close_id_del_oplock(struct ksmbd_file *fp)
+{
+ struct oplock_info *opinfo;
+
+ if (S_ISDIR(file_inode(fp->filp)->i_mode))
+ return;
+
+ opinfo = opinfo_get(fp);
+ if (!opinfo)
+ return;
+
+ opinfo_del(opinfo);
+
+ rcu_assign_pointer(fp->f_opinfo, NULL);
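+	/* If a break is still in flight, mark the open as closing and wake any waiters. */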
+ if (opinfo->op_state == OPLOCK_ACK_WAIT) {
+ opinfo->op_state = OPLOCK_CLOSING;
+ wake_up_interruptible_all(&opinfo->oplock_q);
+ if (opinfo->is_lease) {
+ atomic_set(&opinfo->breaking_cnt, 0);
+ wake_up_interruptible_all(&opinfo->oplock_brk);
+ }
+ }
+
+ opinfo_count_dec(fp);
+ atomic_dec(&opinfo->refcount);
+ opinfo_put(opinfo);
+}
+
+/**
+ * grant_write_oplock() - grant exclusive/batch oplock or write lease
+ * @opinfo_new: new oplock info object
+ * @req_oplock: request oplock
+ * @lctx: lease context information
+ */
+static void grant_write_oplock(struct oplock_info *opinfo_new, int req_oplock,
+ struct lease_ctx_info *lctx)
+{
+ struct lease *lease = opinfo_new->o_lease;
+
+ if (req_oplock == SMB2_OPLOCK_LEVEL_BATCH)
+ opinfo_new->level = SMB2_OPLOCK_LEVEL_BATCH;
+ else
+ opinfo_new->level = SMB2_OPLOCK_LEVEL_EXCLUSIVE;
+
+ if (lctx) {
+ lease->state = lctx->req_state;
+ memcpy(lease->lease_key, lctx->lease_key, SMB2_LEASE_KEY_SIZE);
+ }
+}
+
+/**
+ * grant_read_oplock() - grant level2 oplock or read lease
+ * @opinfo_new: new oplock info object
+ * @lctx: lease context information
+ */
+static void grant_read_oplock(struct oplock_info *opinfo_new,
+ struct lease_ctx_info *lctx)
+{
+ struct lease *lease = opinfo_new->o_lease;
+
+ opinfo_new->level = SMB2_OPLOCK_LEVEL_II;
+
+ if (lctx) {
+ lease->state = SMB2_LEASE_READ_CACHING_LE;
+ if (lctx->req_state & SMB2_LEASE_HANDLE_CACHING_LE)
+ lease->state |= SMB2_LEASE_HANDLE_CACHING_LE;
+ memcpy(lease->lease_key, lctx->lease_key, SMB2_LEASE_KEY_SIZE);
+ }
+}
+
+/**
+ * grant_none_oplock() - grant none oplock or none lease
+ * @opinfo_new: new oplock info object
+ * @lctx: lease context information
+ */
+static void grant_none_oplock(struct oplock_info *opinfo_new,
+ struct lease_ctx_info *lctx)
+{
+ struct lease *lease = opinfo_new->o_lease;
+
+ opinfo_new->level = SMB2_OPLOCK_LEVEL_NONE;
+
+ if (lctx) {
+ lease->state = 0;
+ memcpy(lease->lease_key, lctx->lease_key, SMB2_LEASE_KEY_SIZE);
+ }
+}
+
+static inline int compare_guid_key(struct oplock_info *opinfo,
+ const char *guid1, const char *key1)
+{
+ const char *guid2, *key2;
+
+ guid2 = opinfo->conn->ClientGUID;
+ key2 = opinfo->o_lease->lease_key;
+ if (!memcmp(guid1, guid2, SMB2_CLIENT_GUID_SIZE) &&
+ !memcmp(key1, key2, SMB2_LEASE_KEY_SIZE))
+ return 1;
+
+ return 0;
+}
+
+/**
+ * same_client_has_lease() - check whether current lease request is
+ * from lease owner of file
+ * @ci:		ksmbd inode of the file
+ * @client_guid: Client GUID
+ * @lctx: lease context information
+ *
+ * Return: oplock(lease) object on success, otherwise NULL
+ */
+static struct oplock_info *same_client_has_lease(struct ksmbd_inode *ci,
+ char *client_guid,
+ struct lease_ctx_info *lctx)
+{
+ int ret;
+ struct lease *lease;
+ struct oplock_info *opinfo;
+ struct oplock_info *m_opinfo = NULL;
+
+ if (!lctx)
+ return NULL;
+
+ /*
+ * Compare lease key and client_guid to know request from same owner
+ * of same client
+ */
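+	/*
+	 * m_lock is released while each lease entry is examined and
+	 * re-acquired before advancing to the next entry.
+	 */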
+ read_lock(&ci->m_lock);
+ list_for_each_entry(opinfo, &ci->m_op_list, op_entry) {
+ if (!opinfo->is_lease)
+ continue;
+ read_unlock(&ci->m_lock);
+ lease = opinfo->o_lease;
+
+ ret = compare_guid_key(opinfo, client_guid, lctx->lease_key);
+ if (ret) {
+ m_opinfo = opinfo;
+ /* skip upgrading lease about breaking lease */
+ if (atomic_read(&opinfo->breaking_cnt)) {
+ read_lock(&ci->m_lock);
+ continue;
+ }
+
+ /* upgrading lease */
+ if ((atomic_read(&ci->op_count) +
+ atomic_read(&ci->sop_count)) == 1) {
+ if (lease->state ==
+ (lctx->req_state & lease->state)) {
+ lease->state |= lctx->req_state;
+ if (lctx->req_state &
+ SMB2_LEASE_WRITE_CACHING_LE)
+ lease_read_to_write(opinfo);
+ }
+ } else if ((atomic_read(&ci->op_count) +
+ atomic_read(&ci->sop_count)) > 1) {
+ if (lctx->req_state ==
+ (SMB2_LEASE_READ_CACHING_LE |
+ SMB2_LEASE_HANDLE_CACHING_LE))
+ lease->state = lctx->req_state;
+ }
+
+ if (lctx->req_state && lease->state ==
+ SMB2_LEASE_NONE_LE)
+ lease_none_upgrade(opinfo, lctx->req_state);
+ }
+ read_lock(&ci->m_lock);
+ }
+ read_unlock(&ci->m_lock);
+
+ return m_opinfo;
+}
+
+static void wait_for_break_ack(struct oplock_info *opinfo)
+{
+ int rc = 0;
+
+ rc = wait_event_interruptible_timeout(opinfo->oplock_q,
+ opinfo->op_state == OPLOCK_STATE_NONE ||
+ opinfo->op_state == OPLOCK_CLOSING,
+ OPLOCK_WAIT_TIME);
+
+ /* is this a timeout ? */
+ if (!rc) {
+ if (opinfo->is_lease)
+ opinfo->o_lease->state = SMB2_LEASE_NONE_LE;
+ opinfo->level = SMB2_OPLOCK_LEVEL_NONE;
+ opinfo->op_state = OPLOCK_STATE_NONE;
+ }
+}
+
+static void wake_up_oplock_break(struct oplock_info *opinfo)
+{
+ clear_bit_unlock(0, &opinfo->pending_break);
+ /* memory barrier is needed for wake_up_bit() */
+ smp_mb__after_atomic();
+ wake_up_bit(&opinfo->pending_break, 0);
+}
+
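+/*
+ * Serialize concurrent break attempts on a single opinfo via the
+ * pending_break bit.  Returns -ENOENT if the file is being closed, 1 if the
+ * current (non-lease) oplock level no longer exceeds the requested level so
+ * no break is needed, and 0 when the caller should send the break.
+ */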
+static int oplock_break_pending(struct oplock_info *opinfo, int req_op_level)
+{
+ while (test_and_set_bit(0, &opinfo->pending_break)) {
+ wait_on_bit(&opinfo->pending_break, 0, TASK_UNINTERRUPTIBLE);
+
+		/* Do not break to none immediately. */
+ opinfo->open_trunc = 0;
+
+ if (opinfo->op_state == OPLOCK_CLOSING)
+ return -ENOENT;
+ else if (!opinfo->is_lease && opinfo->level <= req_op_level)
+ return 1;
+ }
+
+ if (!opinfo->is_lease && opinfo->level <= req_op_level) {
+ wake_up_oplock_break(opinfo);
+ return 1;
+ }
+ return 0;
+}
+
+static inline int allocate_oplock_break_buf(struct ksmbd_work *work)
+{
+ work->response_buf = kzalloc(MAX_CIFS_SMALL_BUFFER_SIZE, GFP_KERNEL);
+ if (!work->response_buf)
+ return -ENOMEM;
+ work->response_sz = MAX_CIFS_SMALL_BUFFER_SIZE;
+ return 0;
+}
+
+/**
+ * __smb2_oplock_break_noti() - send smb2 oplock break cmd from conn
+ * to client
+ * @wk: smb work object
+ *
+ * This function is called in two cases: 1) on file open, to break an
+ * exclusive/batch oplock down to a level II oplock, and 2) on file
+ * write/truncate, to break a level II oplock down to no oplock.
+ * work->request_buf contains the oplock break information.
+ */
+static void __smb2_oplock_break_noti(struct work_struct *wk)
+{
+ struct smb2_oplock_break *rsp = NULL;
+ struct ksmbd_work *work = container_of(wk, struct ksmbd_work, work);
+ struct ksmbd_conn *conn = work->conn;
+ struct oplock_break_info *br_info = work->request_buf;
+ struct smb2_hdr *rsp_hdr;
+ struct ksmbd_file *fp;
+
+ fp = ksmbd_lookup_durable_fd(br_info->fid);
+ if (!fp) {
+ atomic_dec(&conn->r_count);
+ ksmbd_free_work_struct(work);
+ return;
+ }
+
+ if (allocate_oplock_break_buf(work)) {
+ pr_err("smb2_allocate_rsp_buf failed! ");
+ atomic_dec(&conn->r_count);
+ ksmbd_fd_put(work, fp);
+ ksmbd_free_work_struct(work);
+ return;
+ }
+
+ rsp_hdr = work->response_buf;
+ memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2);
+ rsp_hdr->smb2_buf_length =
+ cpu_to_be32(smb2_hdr_size_no_buflen(conn->vals));
+ rsp_hdr->ProtocolId = SMB2_PROTO_NUMBER;
+ rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE;
+ rsp_hdr->CreditRequest = cpu_to_le16(0);
+ rsp_hdr->Command = SMB2_OPLOCK_BREAK;
+ rsp_hdr->Flags = (SMB2_FLAGS_SERVER_TO_REDIR);
+ rsp_hdr->NextCommand = 0;
+ rsp_hdr->MessageId = cpu_to_le64(-1);
+ rsp_hdr->Id.SyncId.ProcessId = 0;
+ rsp_hdr->Id.SyncId.TreeId = 0;
+ rsp_hdr->SessionId = 0;
+ memset(rsp_hdr->Signature, 0, 16);
+
+ rsp = work->response_buf;
+
+ rsp->StructureSize = cpu_to_le16(24);
+ if (!br_info->open_trunc &&
+ (br_info->level == SMB2_OPLOCK_LEVEL_BATCH ||
+ br_info->level == SMB2_OPLOCK_LEVEL_EXCLUSIVE))
+ rsp->OplockLevel = SMB2_OPLOCK_LEVEL_II;
+ else
+ rsp->OplockLevel = SMB2_OPLOCK_LEVEL_NONE;
+ rsp->Reserved = 0;
+ rsp->Reserved2 = 0;
+ rsp->PersistentFid = cpu_to_le64(fp->persistent_id);
+ rsp->VolatileFid = cpu_to_le64(fp->volatile_id);
+
+ inc_rfc1001_len(rsp, 24);
+
+ ksmbd_debug(OPLOCK,
+ "sending oplock break v_id %llu p_id = %llu lock level = %d\n",
+ rsp->VolatileFid, rsp->PersistentFid, rsp->OplockLevel);
+
+ ksmbd_fd_put(work, fp);
+ ksmbd_conn_write(work);
+ ksmbd_free_work_struct(work);
+ atomic_dec(&conn->r_count);
+}
+
+/**
+ * smb2_oplock_break_noti() - send an smb2 oplock break (exclusive/batch to
+ *	level2) command from server to client
+ * @opinfo: oplock info object
+ *
+ * Return: 0 on success, otherwise error
+ */
+static int smb2_oplock_break_noti(struct oplock_info *opinfo)
+{
+ struct ksmbd_conn *conn = opinfo->conn;
+ struct oplock_break_info *br_info;
+ int ret = 0;
+ struct ksmbd_work *work = ksmbd_alloc_work_struct();
+
+ if (!work)
+ return -ENOMEM;
+
+ br_info = kmalloc(sizeof(struct oplock_break_info), GFP_KERNEL);
+ if (!br_info) {
+ ksmbd_free_work_struct(work);
+ return -ENOMEM;
+ }
+
+ br_info->level = opinfo->level;
+ br_info->fid = opinfo->fid;
+ br_info->open_trunc = opinfo->open_trunc;
+
+ work->request_buf = (char *)br_info;
+ work->conn = conn;
+ work->sess = opinfo->sess;
+
+ atomic_inc(&conn->r_count);
+ if (opinfo->op_state == OPLOCK_ACK_WAIT) {
+ INIT_WORK(&work->work, __smb2_oplock_break_noti);
+ ksmbd_queue_work(work);
+
+ wait_for_break_ack(opinfo);
+ } else {
+ __smb2_oplock_break_noti(&work->work);
+ if (opinfo->level == SMB2_OPLOCK_LEVEL_II)
+ opinfo->level = SMB2_OPLOCK_LEVEL_NONE;
+ }
+ return ret;
+}
+
+/**
+ * __smb2_lease_break_noti() - send lease break command from server
+ * to client
+ * @wk: smb work object
+ */
+static void __smb2_lease_break_noti(struct work_struct *wk)
+{
+ struct smb2_lease_break *rsp = NULL;
+ struct ksmbd_work *work = container_of(wk, struct ksmbd_work, work);
+ struct lease_break_info *br_info = work->request_buf;
+ struct ksmbd_conn *conn = work->conn;
+ struct smb2_hdr *rsp_hdr;
+
+ if (allocate_oplock_break_buf(work)) {
+ ksmbd_debug(OPLOCK, "smb2_allocate_rsp_buf failed! ");
+ ksmbd_free_work_struct(work);
+ atomic_dec(&conn->r_count);
+ return;
+ }
+
+ rsp_hdr = work->response_buf;
+ memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2);
+ rsp_hdr->smb2_buf_length =
+ cpu_to_be32(smb2_hdr_size_no_buflen(conn->vals));
+ rsp_hdr->ProtocolId = SMB2_PROTO_NUMBER;
+ rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE;
+ rsp_hdr->CreditRequest = cpu_to_le16(0);
+ rsp_hdr->Command = SMB2_OPLOCK_BREAK;
+ rsp_hdr->Flags = (SMB2_FLAGS_SERVER_TO_REDIR);
+ rsp_hdr->NextCommand = 0;
+ rsp_hdr->MessageId = cpu_to_le64(-1);
+ rsp_hdr->Id.SyncId.ProcessId = 0;
+ rsp_hdr->Id.SyncId.TreeId = 0;
+ rsp_hdr->SessionId = 0;
+ memset(rsp_hdr->Signature, 0, 16);
+
+ rsp = work->response_buf;
+ rsp->StructureSize = cpu_to_le16(44);
+ rsp->Epoch = br_info->epoch;
+ rsp->Flags = 0;
+
+ if (br_info->curr_state & (SMB2_LEASE_WRITE_CACHING_LE |
+ SMB2_LEASE_HANDLE_CACHING_LE))
+ rsp->Flags = SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED;
+
+ memcpy(rsp->LeaseKey, br_info->lease_key, SMB2_LEASE_KEY_SIZE);
+ rsp->CurrentLeaseState = br_info->curr_state;
+ rsp->NewLeaseState = br_info->new_state;
+ rsp->BreakReason = 0;
+ rsp->AccessMaskHint = 0;
+ rsp->ShareMaskHint = 0;
+
+ inc_rfc1001_len(rsp, 44);
+
+ ksmbd_conn_write(work);
+ ksmbd_free_work_struct(work);
+ atomic_dec(&conn->r_count);
+}
+
+/**
+ * smb2_lease_break_noti() - break a lease when a new client requests a
+ *	write lease
+ * @opinfo:		contains lease state information
+ *
+ * Return: 0 on success, otherwise error
+ */
+static int smb2_lease_break_noti(struct oplock_info *opinfo)
+{
+ struct ksmbd_conn *conn = opinfo->conn;
+ struct list_head *tmp, *t;
+ struct ksmbd_work *work;
+ struct lease_break_info *br_info;
+ struct lease *lease = opinfo->o_lease;
+
+ work = ksmbd_alloc_work_struct();
+ if (!work)
+ return -ENOMEM;
+
+ br_info = kmalloc(sizeof(struct lease_break_info), GFP_KERNEL);
+ if (!br_info) {
+ ksmbd_free_work_struct(work);
+ return -ENOMEM;
+ }
+
+ br_info->curr_state = lease->state;
+ br_info->new_state = lease->new_state;
+ if (lease->version == 2)
+ br_info->epoch = cpu_to_le16(++lease->epoch);
+ else
+ br_info->epoch = 0;
+ memcpy(br_info->lease_key, lease->lease_key, SMB2_LEASE_KEY_SIZE);
+
+ work->request_buf = (char *)br_info;
+ work->conn = conn;
+ work->sess = opinfo->sess;
+
+ atomic_inc(&conn->r_count);
+ if (opinfo->op_state == OPLOCK_ACK_WAIT) {
+ list_for_each_safe(tmp, t, &opinfo->interim_list) {
+ struct ksmbd_work *in_work;
+
+ in_work = list_entry(tmp, struct ksmbd_work,
+ interim_entry);
+ setup_async_work(in_work, NULL, NULL);
+ smb2_send_interim_resp(in_work, STATUS_PENDING);
+ list_del(&in_work->interim_entry);
+ }
+ INIT_WORK(&work->work, __smb2_lease_break_noti);
+ ksmbd_queue_work(work);
+ wait_for_break_ack(opinfo);
+ } else {
+ __smb2_lease_break_noti(&work->work);
+ if (opinfo->o_lease->new_state == SMB2_LEASE_NONE_LE) {
+ opinfo->level = SMB2_OPLOCK_LEVEL_NONE;
+ opinfo->o_lease->state = SMB2_LEASE_NONE_LE;
+ }
+ }
+ return 0;
+}
+
+static void wait_lease_breaking(struct oplock_info *opinfo)
+{
+ if (!opinfo->is_lease)
+ return;
+
+ wake_up_interruptible_all(&opinfo->oplock_brk);
+ if (atomic_read(&opinfo->breaking_cnt)) {
+ int ret = 0;
+
+ ret = wait_event_interruptible_timeout(opinfo->oplock_brk,
+ atomic_read(&opinfo->breaking_cnt) == 0,
+ HZ);
+ if (!ret)
+ atomic_set(&opinfo->breaking_cnt, 0);
+ }
+}
+
+static int oplock_break(struct oplock_info *brk_opinfo, int req_op_level)
+{
+ int err = 0;
+
+ /* Need to break exclusive/batch oplock, write lease or overwrite_if */
+ ksmbd_debug(OPLOCK,
+ "request to send oplock(level : 0x%x) break notification\n",
+ brk_opinfo->level);
+
+ if (brk_opinfo->is_lease) {
+ struct lease *lease = brk_opinfo->o_lease;
+
+ atomic_inc(&brk_opinfo->breaking_cnt);
+
+ err = oplock_break_pending(brk_opinfo, req_op_level);
+ if (err)
+ return err < 0 ? err : 0;
+
+ if (brk_opinfo->open_trunc) {
+ /*
+ * Create overwrite break trigger the lease break to
+ * none.
+ */
+ lease->new_state = SMB2_LEASE_NONE_LE;
+ } else {
+ if (lease->state & SMB2_LEASE_WRITE_CACHING_LE) {
+ if (lease->state & SMB2_LEASE_HANDLE_CACHING_LE)
+ lease->new_state =
+ SMB2_LEASE_READ_CACHING_LE |
+ SMB2_LEASE_HANDLE_CACHING_LE;
+ else
+ lease->new_state =
+ SMB2_LEASE_READ_CACHING_LE;
+ } else {
+ if (lease->state & SMB2_LEASE_HANDLE_CACHING_LE)
+ lease->new_state =
+ SMB2_LEASE_READ_CACHING_LE;
+ else
+ lease->new_state = SMB2_LEASE_NONE_LE;
+ }
+ }
+
+ if (lease->state & (SMB2_LEASE_WRITE_CACHING_LE |
+ SMB2_LEASE_HANDLE_CACHING_LE))
+ brk_opinfo->op_state = OPLOCK_ACK_WAIT;
+ else
+ atomic_dec(&brk_opinfo->breaking_cnt);
+ } else {
+ err = oplock_break_pending(brk_opinfo, req_op_level);
+ if (err)
+ return err < 0 ? err : 0;
+
+ if (brk_opinfo->level == SMB2_OPLOCK_LEVEL_BATCH ||
+ brk_opinfo->level == SMB2_OPLOCK_LEVEL_EXCLUSIVE)
+ brk_opinfo->op_state = OPLOCK_ACK_WAIT;
+ }
+
+ if (brk_opinfo->is_lease)
+ err = smb2_lease_break_noti(brk_opinfo);
+ else
+ err = smb2_oplock_break_noti(brk_opinfo);
+
+ ksmbd_debug(OPLOCK, "oplock granted = %d\n", brk_opinfo->level);
+ if (brk_opinfo->op_state == OPLOCK_CLOSING)
+ err = -ENOENT;
+ wake_up_oplock_break(brk_opinfo);
+
+ wait_lease_breaking(brk_opinfo);
+
+ return err;
+}
+
+void destroy_lease_table(struct ksmbd_conn *conn)
+{
+ struct lease_table *lb, *lbtmp;
+ struct oplock_info *opinfo;
+
+ write_lock(&lease_list_lock);
+ if (list_empty(&lease_table_list)) {
+ write_unlock(&lease_list_lock);
+ return;
+ }
+
+ list_for_each_entry_safe(lb, lbtmp, &lease_table_list, l_entry) {
+ if (conn && memcmp(lb->client_guid, conn->ClientGUID,
+ SMB2_CLIENT_GUID_SIZE))
+ continue;
+again:
+ rcu_read_lock();
+ list_for_each_entry_rcu(opinfo, &lb->lease_list,
+ lease_entry) {
+ rcu_read_unlock();
+ lease_del_list(opinfo);
+ goto again;
+ }
+ rcu_read_unlock();
+ list_del(&lb->l_entry);
+ kfree(lb);
+ }
+ write_unlock(&lease_list_lock);
+}
+
+int find_same_lease_key(struct ksmbd_session *sess, struct ksmbd_inode *ci,
+ struct lease_ctx_info *lctx)
+{
+ struct oplock_info *opinfo;
+ int err = 0;
+ struct lease_table *lb;
+
+ if (!lctx)
+ return err;
+
+ read_lock(&lease_list_lock);
+ if (list_empty(&lease_table_list)) {
+ read_unlock(&lease_list_lock);
+ return 0;
+ }
+
+ list_for_each_entry(lb, &lease_table_list, l_entry) {
+ if (!memcmp(lb->client_guid, sess->conn->ClientGUID,
+ SMB2_CLIENT_GUID_SIZE))
+ goto found;
+ }
+ read_unlock(&lease_list_lock);
+
+ return 0;
+
+found:
+ rcu_read_lock();
+ list_for_each_entry_rcu(opinfo, &lb->lease_list, lease_entry) {
+ if (!atomic_inc_not_zero(&opinfo->refcount))
+ continue;
+ rcu_read_unlock();
+ if (opinfo->o_fp->f_ci == ci)
+ goto op_next;
+ err = compare_guid_key(opinfo, sess->conn->ClientGUID,
+ lctx->lease_key);
+ if (err) {
+ err = -EINVAL;
+ ksmbd_debug(OPLOCK,
+ "found same lease key is already used in other files\n");
+ opinfo_put(opinfo);
+ goto out;
+ }
+op_next:
+ opinfo_put(opinfo);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+
+out:
+ read_unlock(&lease_list_lock);
+ return err;
+}
+
+static void copy_lease(struct oplock_info *op1, struct oplock_info *op2)
+{
+ struct lease *lease1 = op1->o_lease;
+ struct lease *lease2 = op2->o_lease;
+
+ op2->level = op1->level;
+ lease2->state = lease1->state;
+ memcpy(lease2->lease_key, lease1->lease_key,
+ SMB2_LEASE_KEY_SIZE);
+ lease2->duration = lease1->duration;
+ lease2->flags = lease1->flags;
+}
+
+static int add_lease_global_list(struct oplock_info *opinfo)
+{
+ struct lease_table *lb;
+
+ read_lock(&lease_list_lock);
+ list_for_each_entry(lb, &lease_table_list, l_entry) {
+ if (!memcmp(lb->client_guid, opinfo->conn->ClientGUID,
+ SMB2_CLIENT_GUID_SIZE)) {
+ opinfo->o_lease->l_lb = lb;
+ lease_add_list(opinfo);
+ read_unlock(&lease_list_lock);
+ return 0;
+ }
+ }
+ read_unlock(&lease_list_lock);
+
+ lb = kmalloc(sizeof(struct lease_table), GFP_KERNEL);
+ if (!lb)
+ return -ENOMEM;
+
+ memcpy(lb->client_guid, opinfo->conn->ClientGUID,
+ SMB2_CLIENT_GUID_SIZE);
+ INIT_LIST_HEAD(&lb->lease_list);
+ spin_lock_init(&lb->lb_lock);
+ opinfo->o_lease->l_lb = lb;
+ lease_add_list(opinfo);
+ lb_add(lb);
+ return 0;
+}
+
+static void set_oplock_level(struct oplock_info *opinfo, int level,
+ struct lease_ctx_info *lctx)
+{
+ switch (level) {
+ case SMB2_OPLOCK_LEVEL_BATCH:
+ case SMB2_OPLOCK_LEVEL_EXCLUSIVE:
+ grant_write_oplock(opinfo, level, lctx);
+ break;
+ case SMB2_OPLOCK_LEVEL_II:
+ grant_read_oplock(opinfo, lctx);
+ break;
+ default:
+ grant_none_oplock(opinfo, lctx);
+ break;
+ }
+}
+
+/**
+ * smb_grant_oplock() - handle oplock/lease request on file open
+ * @work: smb work
+ * @req_op_level: oplock level
+ * @pid: id of open file
+ * @fp: ksmbd file pointer
+ * @tid: Tree id of connection
+ * @lctx: lease context information on file open
+ * @share_ret:	share mode check result
+ *
+ * Return: 0 on success, otherwise error
+ */
+int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, u64 pid,
+ struct ksmbd_file *fp, __u16 tid,
+ struct lease_ctx_info *lctx, int share_ret)
+{
+ struct ksmbd_session *sess = work->sess;
+ int err = 0;
+ struct oplock_info *opinfo = NULL, *prev_opinfo = NULL;
+ struct ksmbd_inode *ci = fp->f_ci;
+ bool prev_op_has_lease;
+ __le32 prev_op_state = 0;
+
+	/* directory leases are not supported */
+ if (S_ISDIR(file_inode(fp->filp)->i_mode))
+ return 0;
+
+ opinfo = alloc_opinfo(work, pid, tid);
+ if (!opinfo)
+ return -ENOMEM;
+
+ if (lctx) {
+ err = alloc_lease(opinfo, lctx);
+ if (err)
+ goto err_out;
+ opinfo->is_lease = 1;
+ }
+
+ /* ci does not have any oplock */
+ if (!opinfo_count(fp))
+ goto set_lev;
+
+ /* grant none-oplock if second open is trunc */
+ if (fp->attrib_only && fp->cdoption != FILE_OVERWRITE_IF_LE &&
+ fp->cdoption != FILE_OVERWRITE_LE &&
+ fp->cdoption != FILE_SUPERSEDE_LE) {
+ req_op_level = SMB2_OPLOCK_LEVEL_NONE;
+ goto set_lev;
+ }
+
+ if (lctx) {
+ struct oplock_info *m_opinfo;
+
+ /* is lease already granted ? */
+ m_opinfo = same_client_has_lease(ci, sess->conn->ClientGUID,
+ lctx);
+ if (m_opinfo) {
+ copy_lease(m_opinfo, opinfo);
+ if (atomic_read(&m_opinfo->breaking_cnt))
+ opinfo->o_lease->flags =
+ SMB2_LEASE_FLAG_BREAK_IN_PROGRESS_LE;
+ goto out;
+ }
+ }
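+	/*
+	 * Another open already holds an oplock on this inode; if it holds a
+	 * batch/exclusive oplock, break it down to level II before granting
+	 * this open its oplock or lease.
+	 */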
+ prev_opinfo = opinfo_get_list(ci);
+ if (!prev_opinfo ||
+ (prev_opinfo->level == SMB2_OPLOCK_LEVEL_NONE && lctx))
+ goto set_lev;
+ prev_op_has_lease = prev_opinfo->is_lease;
+ if (prev_op_has_lease)
+ prev_op_state = prev_opinfo->o_lease->state;
+
+ if (share_ret < 0 &&
+ prev_opinfo->level == SMB2_OPLOCK_LEVEL_EXCLUSIVE) {
+ err = share_ret;
+ opinfo_put(prev_opinfo);
+ goto err_out;
+ }
+
+ if (prev_opinfo->level != SMB2_OPLOCK_LEVEL_BATCH &&
+ prev_opinfo->level != SMB2_OPLOCK_LEVEL_EXCLUSIVE) {
+ opinfo_put(prev_opinfo);
+ goto op_break_not_needed;
+ }
+
+ list_add(&work->interim_entry, &prev_opinfo->interim_list);
+ err = oplock_break(prev_opinfo, SMB2_OPLOCK_LEVEL_II);
+ opinfo_put(prev_opinfo);
+ if (err == -ENOENT)
+ goto set_lev;
+	/* err == -ENOENT means the oplock was already freed by a close */
+ else if (err < 0)
+ goto err_out;
+
+op_break_not_needed:
+ if (share_ret < 0) {
+ err = share_ret;
+ goto err_out;
+ }
+
+ if (req_op_level != SMB2_OPLOCK_LEVEL_NONE)
+ req_op_level = SMB2_OPLOCK_LEVEL_II;
+
+ /* grant fixed oplock on stacked locking between lease and oplock */
+ if (prev_op_has_lease && !lctx)
+ if (prev_op_state & SMB2_LEASE_HANDLE_CACHING_LE)
+ req_op_level = SMB2_OPLOCK_LEVEL_NONE;
+
+ if (!prev_op_has_lease && lctx) {
+ req_op_level = SMB2_OPLOCK_LEVEL_II;
+ lctx->req_state = SMB2_LEASE_READ_CACHING_LE;
+ }
+
+set_lev:
+ set_oplock_level(opinfo, req_op_level, lctx);
+
+out:
+ rcu_assign_pointer(fp->f_opinfo, opinfo);
+ opinfo->o_fp = fp;
+
+ opinfo_count_inc(fp);
+ opinfo_add(opinfo);
+ if (opinfo->is_lease) {
+ err = add_lease_global_list(opinfo);
+ if (err)
+ goto err_out;
+ }
+
+ return 0;
+err_out:
+ free_opinfo(opinfo);
+ return err;
+}
+
+/**
+ * smb_break_all_write_oplock() - break batch/exclusive oplock to level2
+ * @work: smb work
+ * @fp: ksmbd file pointer
+ * @is_trunc: truncate on open
+ */
+static void smb_break_all_write_oplock(struct ksmbd_work *work,
+ struct ksmbd_file *fp, int is_trunc)
+{
+ struct oplock_info *brk_opinfo;
+
+ brk_opinfo = opinfo_get_list(fp->f_ci);
+ if (!brk_opinfo)
+ return;
+ if (brk_opinfo->level != SMB2_OPLOCK_LEVEL_BATCH &&
+ brk_opinfo->level != SMB2_OPLOCK_LEVEL_EXCLUSIVE) {
+ opinfo_put(brk_opinfo);
+ return;
+ }
+
+ brk_opinfo->open_trunc = is_trunc;
+ list_add(&work->interim_entry, &brk_opinfo->interim_list);
+ oplock_break(brk_opinfo, SMB2_OPLOCK_LEVEL_II);
+ opinfo_put(brk_opinfo);
+}
+
+/**
+ * smb_break_all_levII_oplock() - send level2 oplock or read lease break command
+ * from server to client
+ * @work: smb work
+ * @fp: ksmbd file pointer
+ * @is_trunc: truncate on open
+ */
+void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp,
+ int is_trunc)
+{
+ struct oplock_info *op, *brk_op;
+ struct ksmbd_inode *ci;
+ struct ksmbd_conn *conn = work->sess->conn;
+
+ if (!test_share_config_flag(work->tcon->share_conf,
+ KSMBD_SHARE_FLAG_OPLOCKS))
+ return;
+
+ ci = fp->f_ci;
+ op = opinfo_get(fp);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(brk_op, &ci->m_op_list, op_entry) {
+ if (!atomic_inc_not_zero(&brk_op->refcount))
+ continue;
+ rcu_read_unlock();
+ if (brk_op->is_lease && (brk_op->o_lease->state &
+ (~(SMB2_LEASE_READ_CACHING_LE |
+ SMB2_LEASE_HANDLE_CACHING_LE)))) {
+ ksmbd_debug(OPLOCK, "unexpected lease state(0x%x)\n",
+ brk_op->o_lease->state);
+ goto next;
+ } else if (brk_op->level !=
+ SMB2_OPLOCK_LEVEL_II) {
+ ksmbd_debug(OPLOCK, "unexpected oplock(0x%x)\n",
+ brk_op->level);
+ goto next;
+ }
+
+		/* Skip an oplock that is already being broken to none */
+ if (brk_op->is_lease &&
+ brk_op->o_lease->new_state == SMB2_LEASE_NONE_LE &&
+ atomic_read(&brk_op->breaking_cnt))
+ goto next;
+
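+		/* Do not break the caller's own lease (same client GUID and lease key). */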
+ if (op && op->is_lease && brk_op->is_lease &&
+ !memcmp(conn->ClientGUID, brk_op->conn->ClientGUID,
+ SMB2_CLIENT_GUID_SIZE) &&
+ !memcmp(op->o_lease->lease_key, brk_op->o_lease->lease_key,
+ SMB2_LEASE_KEY_SIZE))
+ goto next;
+ brk_op->open_trunc = is_trunc;
+ oplock_break(brk_op, SMB2_OPLOCK_LEVEL_NONE);
+next:
+ opinfo_put(brk_op);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+
+ if (op)
+ opinfo_put(op);
+}
+
+/**
+ * smb_break_all_oplock() - break both batch/exclusive and level2 oplock
+ * @work: smb work
+ * @fp: ksmbd file pointer
+ */
+void smb_break_all_oplock(struct ksmbd_work *work, struct ksmbd_file *fp)
+{
+ if (!test_share_config_flag(work->tcon->share_conf,
+ KSMBD_SHARE_FLAG_OPLOCKS))
+ return;
+
+ smb_break_all_write_oplock(work, fp, 1);
+ smb_break_all_levII_oplock(work, fp, 1);
+}
+
+/**
+ * smb2_map_lease_to_oplock() - map lease state to corresponding oplock type
+ * @lease_state: lease type
+ *
+ * Return: 0 if no mapping, otherwise corresponding oplock type
+ */
+__u8 smb2_map_lease_to_oplock(__le32 lease_state)
+{
+ if (lease_state == (SMB2_LEASE_HANDLE_CACHING_LE |
+ SMB2_LEASE_READ_CACHING_LE |
+ SMB2_LEASE_WRITE_CACHING_LE)) {
+ return SMB2_OPLOCK_LEVEL_BATCH;
+ } else if (lease_state != SMB2_LEASE_WRITE_CACHING_LE &&
+ lease_state & SMB2_LEASE_WRITE_CACHING_LE) {
+ if (!(lease_state & SMB2_LEASE_HANDLE_CACHING_LE))
+ return SMB2_OPLOCK_LEVEL_EXCLUSIVE;
+ } else if (lease_state & SMB2_LEASE_READ_CACHING_LE) {
+ return SMB2_OPLOCK_LEVEL_II;
+ }
+ return 0;
+}
+
+/**
+ * create_lease_buf() - create lease context for open cmd response
+ * @rbuf: buffer to create lease context response
+ * @lease: buffer to stored parsed lease state information
+ */
+void create_lease_buf(u8 *rbuf, struct lease *lease)
+{
+ char *LeaseKey = (char *)&lease->lease_key;
+
+ if (lease->version == 2) {
+ struct create_lease_v2 *buf = (struct create_lease_v2 *)rbuf;
+ char *ParentLeaseKey = (char *)&lease->parent_lease_key;
+
+ memset(buf, 0, sizeof(struct create_lease_v2));
+ buf->lcontext.LeaseKeyLow = *((__le64 *)LeaseKey);
+ buf->lcontext.LeaseKeyHigh = *((__le64 *)(LeaseKey + 8));
+ buf->lcontext.LeaseFlags = lease->flags;
+ buf->lcontext.LeaseState = lease->state;
+ buf->lcontext.ParentLeaseKeyLow = *((__le64 *)ParentLeaseKey);
+ buf->lcontext.ParentLeaseKeyHigh = *((__le64 *)(ParentLeaseKey + 8));
+ buf->ccontext.DataOffset = cpu_to_le16(offsetof
+ (struct create_lease_v2, lcontext));
+ buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context_v2));
+ buf->ccontext.NameOffset = cpu_to_le16(offsetof
+ (struct create_lease_v2, Name));
+ buf->ccontext.NameLength = cpu_to_le16(4);
+ buf->Name[0] = 'R';
+ buf->Name[1] = 'q';
+ buf->Name[2] = 'L';
+ buf->Name[3] = 's';
+ } else {
+ struct create_lease *buf = (struct create_lease *)rbuf;
+
+ memset(buf, 0, sizeof(struct create_lease));
+ buf->lcontext.LeaseKeyLow = *((__le64 *)LeaseKey);
+ buf->lcontext.LeaseKeyHigh = *((__le64 *)(LeaseKey + 8));
+ buf->lcontext.LeaseFlags = lease->flags;
+ buf->lcontext.LeaseState = lease->state;
+ buf->ccontext.DataOffset = cpu_to_le16(offsetof
+ (struct create_lease, lcontext));
+ buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context));
+ buf->ccontext.NameOffset = cpu_to_le16(offsetof
+ (struct create_lease, Name));
+ buf->ccontext.NameLength = cpu_to_le16(4);
+ buf->Name[0] = 'R';
+ buf->Name[1] = 'q';
+ buf->Name[2] = 'L';
+ buf->Name[3] = 's';
+ }
+}
+
+/**
+ * parse_lease_state() - parse lease context contained in file open request
+ * @open_req: buffer containing smb2 file open(create) request
+ *
+ * Return: allocated lease context on success, otherwise NULL
+ */
+struct lease_ctx_info *parse_lease_state(void *open_req)
+{
+ char *data_offset;
+ struct create_context *cc;
+ unsigned int next = 0;
+ char *name;
+ bool found = false;
+ struct smb2_create_req *req = (struct smb2_create_req *)open_req;
+ struct lease_ctx_info *lreq = kzalloc(sizeof(struct lease_ctx_info),
+ GFP_KERNEL);
+ if (!lreq)
+ return NULL;
+
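+	/* Walk the chained create contexts looking for the lease ("RqLs") context. */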
+ data_offset = (char *)req + 4 + le32_to_cpu(req->CreateContextsOffset);
+ cc = (struct create_context *)data_offset;
+ do {
+ cc = (struct create_context *)((char *)cc + next);
+ name = le16_to_cpu(cc->NameOffset) + (char *)cc;
+ if (le16_to_cpu(cc->NameLength) != 4 ||
+ strncmp(name, SMB2_CREATE_REQUEST_LEASE, 4)) {
+ next = le32_to_cpu(cc->Next);
+ continue;
+ }
+ found = true;
+ break;
+ } while (next != 0);
+
+ if (found) {
+ if (sizeof(struct lease_context_v2) == le32_to_cpu(cc->DataLength)) {
+ struct create_lease_v2 *lc = (struct create_lease_v2 *)cc;
+
+ *((__le64 *)lreq->lease_key) = lc->lcontext.LeaseKeyLow;
+ *((__le64 *)(lreq->lease_key + 8)) = lc->lcontext.LeaseKeyHigh;
+ lreq->req_state = lc->lcontext.LeaseState;
+ lreq->flags = lc->lcontext.LeaseFlags;
+ lreq->duration = lc->lcontext.LeaseDuration;
+ *((__le64 *)lreq->parent_lease_key) = lc->lcontext.ParentLeaseKeyLow;
+ *((__le64 *)(lreq->parent_lease_key + 8)) = lc->lcontext.ParentLeaseKeyHigh;
+ lreq->version = 2;
+ } else {
+ struct create_lease *lc = (struct create_lease *)cc;
+
+ *((__le64 *)lreq->lease_key) = lc->lcontext.LeaseKeyLow;
+ *((__le64 *)(lreq->lease_key + 8)) = lc->lcontext.LeaseKeyHigh;
+ lreq->req_state = lc->lcontext.LeaseState;
+ lreq->flags = lc->lcontext.LeaseFlags;
+ lreq->duration = lc->lcontext.LeaseDuration;
+ lreq->version = 1;
+ }
+ return lreq;
+ }
+
+ kfree(lreq);
+ return NULL;
+}
+
+/**
+ * smb2_find_context_vals() - find a particular context info in open request
+ * @open_req: buffer containing smb2 file open(create) request
+ * @tag: context name to search for
+ *
+ * Return:	pointer to requested context, NULL if @tag context not found
+ * or error pointer if name length is invalid.
+ */
+struct create_context *smb2_find_context_vals(void *open_req, const char *tag)
+{
+ char *data_offset;
+ struct create_context *cc;
+ unsigned int next = 0;
+ char *name;
+ struct smb2_create_req *req = (struct smb2_create_req *)open_req;
+
+ data_offset = (char *)req + 4 + le32_to_cpu(req->CreateContextsOffset);
+ cc = (struct create_context *)data_offset;
+ do {
+ int val;
+
+ cc = (struct create_context *)((char *)cc + next);
+ name = le16_to_cpu(cc->NameOffset) + (char *)cc;
+ val = le16_to_cpu(cc->NameLength);
+ if (val < 4)
+ return ERR_PTR(-EINVAL);
+
+ if (memcmp(name, tag, val) == 0)
+ return cc;
+ next = le32_to_cpu(cc->Next);
+ } while (next != 0);
+
+ return NULL;
+}
+
+/**
+ * create_durable_rsp_buf() - create durable handle context
+ * @cc: buffer to create durable context response
+ */
+void create_durable_rsp_buf(char *cc)
+{
+ struct create_durable_rsp *buf;
+
+ buf = (struct create_durable_rsp *)cc;
+ memset(buf, 0, sizeof(struct create_durable_rsp));
+ buf->ccontext.DataOffset = cpu_to_le16(offsetof
+ (struct create_durable_rsp, Data));
+ buf->ccontext.DataLength = cpu_to_le32(8);
+ buf->ccontext.NameOffset = cpu_to_le16(offsetof
+ (struct create_durable_rsp, Name));
+ buf->ccontext.NameLength = cpu_to_le16(4);
+ /* SMB2_CREATE_DURABLE_HANDLE_RESPONSE is "DHnQ" */
+ buf->Name[0] = 'D';
+ buf->Name[1] = 'H';
+ buf->Name[2] = 'n';
+ buf->Name[3] = 'Q';
+}
+
+/**
+ * create_durable_v2_rsp_buf() - create durable handle v2 context
+ * @cc: buffer to create durable context response
+ * @fp: ksmbd file pointer
+ */
+void create_durable_v2_rsp_buf(char *cc, struct ksmbd_file *fp)
+{
+ struct create_durable_v2_rsp *buf;
+
+ buf = (struct create_durable_v2_rsp *)cc;
+ memset(buf, 0, sizeof(struct create_durable_rsp));
+ buf->ccontext.DataOffset = cpu_to_le16(offsetof
+ (struct create_durable_rsp, Data));
+ buf->ccontext.DataLength = cpu_to_le32(8);
+ buf->ccontext.NameOffset = cpu_to_le16(offsetof
+ (struct create_durable_rsp, Name));
+ buf->ccontext.NameLength = cpu_to_le16(4);
+ /* SMB2_CREATE_DURABLE_HANDLE_RESPONSE_V2 is "DH2Q" */
+ buf->Name[0] = 'D';
+ buf->Name[1] = 'H';
+ buf->Name[2] = '2';
+ buf->Name[3] = 'Q';
+
+ buf->Timeout = cpu_to_le32(fp->durable_timeout);
+}
+
+/**
+ * create_mxac_rsp_buf() - create query maximal access context
+ * @cc: buffer to create maximal access context response
+ * @maximal_access: maximal access
+ */
+void create_mxac_rsp_buf(char *cc, int maximal_access)
+{
+ struct create_mxac_rsp *buf;
+
+ buf = (struct create_mxac_rsp *)cc;
+ memset(buf, 0, sizeof(struct create_mxac_rsp));
+ buf->ccontext.DataOffset = cpu_to_le16(offsetof
+ (struct create_mxac_rsp, QueryStatus));
+ buf->ccontext.DataLength = cpu_to_le32(8);
+ buf->ccontext.NameOffset = cpu_to_le16(offsetof
+ (struct create_mxac_rsp, Name));
+ buf->ccontext.NameLength = cpu_to_le16(4);
+ /* SMB2_CREATE_QUERY_MAXIMAL_ACCESS_RESPONSE is "MxAc" */
+ buf->Name[0] = 'M';
+ buf->Name[1] = 'x';
+ buf->Name[2] = 'A';
+ buf->Name[3] = 'c';
+
+ buf->QueryStatus = STATUS_SUCCESS;
+ buf->MaximalAccess = cpu_to_le32(maximal_access);
+}
+
+void create_disk_id_rsp_buf(char *cc, __u64 file_id, __u64 vol_id)
+{
+ struct create_disk_id_rsp *buf;
+
+ buf = (struct create_disk_id_rsp *)cc;
+ memset(buf, 0, sizeof(struct create_disk_id_rsp));
+ buf->ccontext.DataOffset = cpu_to_le16(offsetof
+ (struct create_disk_id_rsp, DiskFileId));
+ buf->ccontext.DataLength = cpu_to_le32(32);
+ buf->ccontext.NameOffset = cpu_to_le16(offsetof
+ (struct create_mxac_rsp, Name));
+ buf->ccontext.NameLength = cpu_to_le16(4);
+ /* SMB2_CREATE_QUERY_ON_DISK_ID_RESPONSE is "QFid" */
+ buf->Name[0] = 'Q';
+ buf->Name[1] = 'F';
+ buf->Name[2] = 'i';
+ buf->Name[3] = 'd';
+
+ buf->DiskFileId = cpu_to_le64(file_id);
+ buf->VolumeId = cpu_to_le64(vol_id);
+}
+
+/**
+ * create_posix_rsp_buf() - create posix extension context
+ * @cc: buffer to create posix context response
+ * @fp: ksmbd file pointer
+ */
+void create_posix_rsp_buf(char *cc, struct ksmbd_file *fp)
+{
+ struct create_posix_rsp *buf;
+ struct inode *inode = file_inode(fp->filp);
+ struct user_namespace *user_ns = file_mnt_user_ns(fp->filp);
+
+ buf = (struct create_posix_rsp *)cc;
+ memset(buf, 0, sizeof(struct create_posix_rsp));
+ buf->ccontext.DataOffset = cpu_to_le16(offsetof
+ (struct create_posix_rsp, nlink));
+ buf->ccontext.DataLength = cpu_to_le32(52);
+ buf->ccontext.NameOffset = cpu_to_le16(offsetof
+ (struct create_posix_rsp, Name));
+ buf->ccontext.NameLength = cpu_to_le16(POSIX_CTXT_DATA_LEN);
+ /* SMB2_CREATE_TAG_POSIX is "0x93AD25509CB411E7B42383DE968BCD7C" */
+ buf->Name[0] = 0x93;
+ buf->Name[1] = 0xAD;
+ buf->Name[2] = 0x25;
+ buf->Name[3] = 0x50;
+ buf->Name[4] = 0x9C;
+ buf->Name[5] = 0xB4;
+ buf->Name[6] = 0x11;
+ buf->Name[7] = 0xE7;
+ buf->Name[8] = 0xB4;
+ buf->Name[9] = 0x23;
+ buf->Name[10] = 0x83;
+ buf->Name[11] = 0xDE;
+ buf->Name[12] = 0x96;
+ buf->Name[13] = 0x8B;
+ buf->Name[14] = 0xCD;
+ buf->Name[15] = 0x7C;
+
+ buf->nlink = cpu_to_le32(inode->i_nlink);
+ buf->reparse_tag = cpu_to_le32(fp->volatile_id);
+ buf->mode = cpu_to_le32(inode->i_mode);
+ id_to_sid(from_kuid(user_ns, inode->i_uid),
+ SIDNFS_USER, (struct smb_sid *)&buf->SidBuffer[0]);
+ id_to_sid(from_kgid(user_ns, inode->i_gid),
+ SIDNFS_GROUP, (struct smb_sid *)&buf->SidBuffer[20]);
+}
+
+/*
+ * Find the lease object (opinfo) for a given lease key/fid from the lease
+ * break/file close path.
+ */
+/**
+ * lookup_lease_in_table() - find a matching lease info object
+ * @conn: connection instance
+ * @lease_key: lease key to be searched for
+ *
+ * Return: opinfo if found matching opinfo, otherwise NULL
+ */
+struct oplock_info *lookup_lease_in_table(struct ksmbd_conn *conn,
+ char *lease_key)
+{
+ struct oplock_info *opinfo = NULL, *ret_op = NULL;
+ struct lease_table *lt;
+ int ret;
+
+ read_lock(&lease_list_lock);
+ list_for_each_entry(lt, &lease_table_list, l_entry) {
+ if (!memcmp(lt->client_guid, conn->ClientGUID,
+ SMB2_CLIENT_GUID_SIZE))
+ goto found;
+ }
+
+ read_unlock(&lease_list_lock);
+ return NULL;
+
+found:
+ rcu_read_lock();
+ list_for_each_entry_rcu(opinfo, &lt->lease_list, lease_entry) {
+ if (!atomic_inc_not_zero(&opinfo->refcount))
+ continue;
+ rcu_read_unlock();
+ if (!opinfo->op_state || opinfo->op_state == OPLOCK_CLOSING)
+ goto op_next;
+ if (!(opinfo->o_lease->state &
+ (SMB2_LEASE_HANDLE_CACHING_LE |
+ SMB2_LEASE_WRITE_CACHING_LE)))
+ goto op_next;
+ ret = compare_guid_key(opinfo, conn->ClientGUID,
+ lease_key);
+ if (ret) {
+ ksmbd_debug(OPLOCK, "found opinfo\n");
+ ret_op = opinfo;
+ goto out;
+ }
+op_next:
+ opinfo_put(opinfo);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+
+out:
+ read_unlock(&lease_list_lock);
+ return ret_op;
+}
+
+int smb2_check_durable_oplock(struct ksmbd_file *fp,
+ struct lease_ctx_info *lctx, char *name)
+{
+ struct oplock_info *opinfo = opinfo_get(fp);
+ int ret = 0;
+
+ if (opinfo && opinfo->is_lease) {
+ if (!lctx) {
+ pr_err("open does not include lease\n");
+ ret = -EBADF;
+ goto out;
+ }
+ if (memcmp(opinfo->o_lease->lease_key, lctx->lease_key,
+ SMB2_LEASE_KEY_SIZE)) {
+ pr_err("invalid lease key\n");
+ ret = -EBADF;
+ goto out;
+ }
+ if (name && strcmp(fp->filename, name)) {
+ pr_err("invalid name reconnect %s\n", name);
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+out:
+ if (opinfo)
+ opinfo_put(opinfo);
+ return ret;
+}
diff --git a/fs/ksmbd/oplock.h b/fs/ksmbd/oplock.h
new file mode 100644
index 000000000000..119b8047cfbd
--- /dev/null
+++ b/fs/ksmbd/oplock.h
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org>
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#ifndef __KSMBD_OPLOCK_H
+#define __KSMBD_OPLOCK_H
+
+#include "smb_common.h"
+
+#define OPLOCK_WAIT_TIME (35 * HZ)
+
+/* SMB2 Oplock levels */
+#define SMB2_OPLOCK_LEVEL_NONE 0x00
+#define SMB2_OPLOCK_LEVEL_II 0x01
+#define SMB2_OPLOCK_LEVEL_EXCLUSIVE 0x08
+#define SMB2_OPLOCK_LEVEL_BATCH 0x09
+#define SMB2_OPLOCK_LEVEL_LEASE 0xFF
+
+/* Oplock states */
+#define OPLOCK_STATE_NONE 0x00
+#define OPLOCK_ACK_WAIT 0x01
+#define OPLOCK_CLOSING 0x02
+
+#define OPLOCK_WRITE_TO_READ 0x01
+#define OPLOCK_READ_HANDLE_TO_READ 0x02
+#define OPLOCK_WRITE_TO_NONE 0x04
+#define OPLOCK_READ_TO_NONE 0x08
+
+#define SMB2_LEASE_KEY_SIZE 16
+
+struct lease_ctx_info {
+ __u8 lease_key[SMB2_LEASE_KEY_SIZE];
+ __le32 req_state;
+ __le32 flags;
+ __le64 duration;
+ __u8 parent_lease_key[SMB2_LEASE_KEY_SIZE];
+ int version;
+};
+
+struct lease_table {
+ char client_guid[SMB2_CLIENT_GUID_SIZE];
+ struct list_head lease_list;
+ struct list_head l_entry;
+ spinlock_t lb_lock;
+};
+
+struct lease {
+ __u8 lease_key[SMB2_LEASE_KEY_SIZE];
+ __le32 state;
+ __le32 new_state;
+ __le32 flags;
+ __le64 duration;
+ __u8 parent_lease_key[SMB2_LEASE_KEY_SIZE];
+ int version;
+ unsigned short epoch;
+ struct lease_table *l_lb;
+};
+
+struct oplock_info {
+ struct ksmbd_conn *conn;
+ struct ksmbd_session *sess;
+ struct ksmbd_work *work;
+ struct ksmbd_file *o_fp;
+ int level;
+ int op_state;
+ unsigned long pending_break;
+ u64 fid;
+ atomic_t breaking_cnt;
+ atomic_t refcount;
+ __u16 Tid;
+ bool is_lease;
+ bool open_trunc; /* truncate on open */
+ struct lease *o_lease;
+ struct list_head interim_list;
+ struct list_head op_entry;
+ struct list_head lease_entry;
+ wait_queue_head_t oplock_q; /* Other server threads */
+ wait_queue_head_t oplock_brk; /* oplock breaking wait */
+ struct rcu_head rcu_head;
+};
+
+struct lease_break_info {
+ __le32 curr_state;
+ __le32 new_state;
+ __le16 epoch;
+ char lease_key[SMB2_LEASE_KEY_SIZE];
+};
+
+struct oplock_break_info {
+ int level;
+ int open_trunc;
+ int fid;
+};
+
+int smb_grant_oplock(struct ksmbd_work *work, int req_op_level,
+ u64 pid, struct ksmbd_file *fp, __u16 tid,
+ struct lease_ctx_info *lctx, int share_ret);
+void smb_break_all_levII_oplock(struct ksmbd_work *work,
+ struct ksmbd_file *fp, int is_trunc);
+int opinfo_write_to_read(struct oplock_info *opinfo);
+int opinfo_read_handle_to_read(struct oplock_info *opinfo);
+int opinfo_write_to_none(struct oplock_info *opinfo);
+int opinfo_read_to_none(struct oplock_info *opinfo);
+void close_id_del_oplock(struct ksmbd_file *fp);
+void smb_break_all_oplock(struct ksmbd_work *work, struct ksmbd_file *fp);
+struct oplock_info *opinfo_get(struct ksmbd_file *fp);
+void opinfo_put(struct oplock_info *opinfo);
+
+/* Lease related functions */
+void create_lease_buf(u8 *rbuf, struct lease *lease);
+struct lease_ctx_info *parse_lease_state(void *open_req);
+__u8 smb2_map_lease_to_oplock(__le32 lease_state);
+int lease_read_to_write(struct oplock_info *opinfo);
+
+/* Durable related functions */
+void create_durable_rsp_buf(char *cc);
+void create_durable_v2_rsp_buf(char *cc, struct ksmbd_file *fp);
+void create_mxac_rsp_buf(char *cc, int maximal_access);
+void create_disk_id_rsp_buf(char *cc, __u64 file_id, __u64 vol_id);
+void create_posix_rsp_buf(char *cc, struct ksmbd_file *fp);
+struct create_context *smb2_find_context_vals(void *open_req, const char *str);
+struct oplock_info *lookup_lease_in_table(struct ksmbd_conn *conn,
+ char *lease_key);
+int find_same_lease_key(struct ksmbd_session *sess, struct ksmbd_inode *ci,
+ struct lease_ctx_info *lctx);
+void destroy_lease_table(struct ksmbd_conn *conn);
+int smb2_check_durable_oplock(struct ksmbd_file *fp,
+ struct lease_ctx_info *lctx, char *name);
+#endif /* __KSMBD_OPLOCK_H */
diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c
new file mode 100644
index 000000000000..e6a9f6aa47eb
--- /dev/null
+++ b/fs/ksmbd/server.c
@@ -0,0 +1,633 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org>
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#include "glob.h"
+#include "oplock.h"
+#include "misc.h"
+#include <linux/sched/signal.h>
+#include <linux/workqueue.h>
+#include <linux/sysfs.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+
+#include "server.h"
+#include "smb_common.h"
+#include "smbstatus.h"
+#include "connection.h"
+#include "transport_ipc.h"
+#include "mgmt/user_session.h"
+#include "crypto_ctx.h"
+#include "auth.h"
+
+int ksmbd_debug_types;
+
+struct ksmbd_server_config server_conf;
+
+enum SERVER_CTRL_TYPE {
+ SERVER_CTRL_TYPE_INIT,
+ SERVER_CTRL_TYPE_RESET,
+};
+
+struct server_ctrl_struct {
+ int type;
+ struct work_struct ctrl_work;
+};
+
+static DEFINE_MUTEX(ctrl_lock);
+
+static int ___server_conf_set(int idx, char *val)
+{
+ if (idx >= ARRAY_SIZE(server_conf.conf))
+ return -EINVAL;
+
+ if (!val || val[0] == 0x00)
+ return -EINVAL;
+
+ kfree(server_conf.conf[idx]);
+ server_conf.conf[idx] = kstrdup(val, GFP_KERNEL);
+ if (!server_conf.conf[idx])
+ return -ENOMEM;
+ return 0;
+}
+
+int ksmbd_set_netbios_name(char *v)
+{
+ return ___server_conf_set(SERVER_CONF_NETBIOS_NAME, v);
+}
+
+int ksmbd_set_server_string(char *v)
+{
+ return ___server_conf_set(SERVER_CONF_SERVER_STRING, v);
+}
+
+int ksmbd_set_work_group(char *v)
+{
+ return ___server_conf_set(SERVER_CONF_WORK_GROUP, v);
+}
+
+char *ksmbd_netbios_name(void)
+{
+ return server_conf.conf[SERVER_CONF_NETBIOS_NAME];
+}
+
+char *ksmbd_server_string(void)
+{
+ return server_conf.conf[SERVER_CONF_SERVER_STRING];
+}
+
+char *ksmbd_work_group(void)
+{
+ return server_conf.conf[SERVER_CONF_WORK_GROUP];
+}
+
+/**
+ * check_conn_state() - check state of server thread connection
+ * @work: smb work containing server thread information
+ *
+ * Return: 0 on valid connection, otherwise 1 to reconnect
+ */
+static inline int check_conn_state(struct ksmbd_work *work)
+{
+ struct smb_hdr *rsp_hdr;
+
+ if (ksmbd_conn_exiting(work) || ksmbd_conn_need_reconnect(work)) {
+ rsp_hdr = work->response_buf;
+ rsp_hdr->Status.CifsError = STATUS_CONNECTION_DISCONNECTED;
+ return 1;
+ }
+ return 0;
+}
+
+#define SERVER_HANDLER_CONTINUE 0
+#define SERVER_HANDLER_ABORT 1
+
+static int __process_request(struct ksmbd_work *work, struct ksmbd_conn *conn,
+ u16 *cmd)
+{
+ struct smb_version_cmds *cmds;
+ u16 command;
+ int ret;
+
+ if (check_conn_state(work))
+ return SERVER_HANDLER_CONTINUE;
+
+ if (ksmbd_verify_smb_message(work))
+ return SERVER_HANDLER_ABORT;
+
+ command = conn->ops->get_cmd_val(work);
+ *cmd = command;
+
+andx_again:
+ if (command >= conn->max_cmds) {
+ conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER);
+ return SERVER_HANDLER_CONTINUE;
+ }
+
+ cmds = &conn->cmds[command];
+ if (!cmds->proc) {
+ ksmbd_debug(SMB, "*** not implemented yet cmd = %x\n", command);
+ conn->ops->set_rsp_status(work, STATUS_NOT_IMPLEMENTED);
+ return SERVER_HANDLER_CONTINUE;
+ }
+
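+	/* Verify the signature on signed requests before dispatching them */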
+ if (work->sess && conn->ops->is_sign_req(work, command)) {
+ ret = conn->ops->check_sign_req(work);
+ if (!ret) {
+ conn->ops->set_rsp_status(work, STATUS_ACCESS_DENIED);
+ return SERVER_HANDLER_CONTINUE;
+ }
+ }
+
+ ret = cmds->proc(work);
+
+ if (ret < 0)
+ ksmbd_debug(CONN, "Failed to process %u [%d]\n", command, ret);
+ /* AndX commands - chained request can return positive values */
+ else if (ret > 0) {
+ command = ret;
+ *cmd = command;
+ goto andx_again;
+ }
+
+ if (work->send_no_response)
+ return SERVER_HANDLER_ABORT;
+ return SERVER_HANDLER_CONTINUE;
+}
+
+static void __handle_ksmbd_work(struct ksmbd_work *work,
+ struct ksmbd_conn *conn)
+{
+ u16 command = 0;
+ int rc;
+
+ if (conn->ops->allocate_rsp_buf(work))
+ return;
+
+ if (conn->ops->is_transform_hdr &&
+ conn->ops->is_transform_hdr(work->request_buf)) {
+ rc = conn->ops->decrypt_req(work);
+ if (rc < 0) {
+ conn->ops->set_rsp_status(work, STATUS_DATA_ERROR);
+ goto send;
+ }
+
+ work->encrypted = true;
+ }
+
+ rc = conn->ops->init_rsp_hdr(work);
+ if (rc) {
+ /* either uid or tid is not correct */
+ conn->ops->set_rsp_status(work, STATUS_INVALID_HANDLE);
+ goto send;
+ }
+
+ if (conn->ops->check_user_session) {
+ rc = conn->ops->check_user_session(work);
+ if (rc < 0) {
+ command = conn->ops->get_cmd_val(work);
+ conn->ops->set_rsp_status(work,
+ STATUS_USER_SESSION_DELETED);
+ goto send;
+ } else if (rc > 0) {
+ rc = conn->ops->get_ksmbd_tcon(work);
+ if (rc < 0) {
+ conn->ops->set_rsp_status(work,
+ STATUS_NETWORK_NAME_DELETED);
+ goto send;
+ }
+ }
+ }
+
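+	/* Dispatch each command; compounded requests iterate this loop */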
+ do {
+ rc = __process_request(work, conn, &command);
+ if (rc == SERVER_HANDLER_ABORT)
+ break;
+
+ /*
+		 * Call smb2_set_rsp_credits() to set the number of credits
+		 * granted in the smb2 response header.
+ */
+ if (conn->ops->set_rsp_credits) {
+ spin_lock(&conn->credits_lock);
+ rc = conn->ops->set_rsp_credits(work);
+ spin_unlock(&conn->credits_lock);
+ if (rc < 0) {
+ conn->ops->set_rsp_status(work,
+ STATUS_INVALID_PARAMETER);
+ goto send;
+ }
+ }
+
+ if (work->sess &&
+ (work->sess->sign || smb3_11_final_sess_setup_resp(work) ||
+ conn->ops->is_sign_req(work, command)))
+ conn->ops->set_sign_rsp(work);
+ } while (is_chained_smb2_message(work));
+
+ if (work->send_no_response)
+ return;
+
+send:
+ smb3_preauth_hash_rsp(work);
+ if (work->sess && work->sess->enc && work->encrypted &&
+ conn->ops->encrypt_resp) {
+ rc = conn->ops->encrypt_resp(work);
+ if (rc < 0) {
+ conn->ops->set_rsp_status(work, STATUS_DATA_ERROR);
+ goto send;
+ }
+ }
+
+ ksmbd_conn_write(work);
+}
+
+/**
+ * handle_ksmbd_work() - process pending smb work requests
+ * @wk: smb work containing request command buffer
+ *
+ * Called by kworker threads to process the remaining smb work requests.
+ */
+static void handle_ksmbd_work(struct work_struct *wk)
+{
+ struct ksmbd_work *work = container_of(wk, struct ksmbd_work, work);
+ struct ksmbd_conn *conn = work->conn;
+
+ atomic64_inc(&conn->stats.request_served);
+
+ __handle_ksmbd_work(work, conn);
+
+ ksmbd_conn_try_dequeue_request(work);
+ ksmbd_free_work_struct(work);
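+	/* Drop the connection reference taken in queue_ksmbd_work() */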
+ atomic_dec(&conn->r_count);
+}
+
+/**
+ * queue_ksmbd_work() - queue an smb request to the worker thread queue
+ * for processing the smb command and sending the response
+ * @conn: connection instance
+ *
+ * Read the remaining data from the socket, then create and submit work.
+ */
+static int queue_ksmbd_work(struct ksmbd_conn *conn)
+{
+ struct ksmbd_work *work;
+
+ work = ksmbd_alloc_work_struct();
+ if (!work) {
+ pr_err("allocation for work failed\n");
+ return -ENOMEM;
+ }
+
+ work->conn = conn;
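+	/* Transfer ownership of the received buffer to the work item */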
+ work->request_buf = conn->request_buf;
+ conn->request_buf = NULL;
+
+ if (ksmbd_init_smb_server(work)) {
+ ksmbd_free_work_struct(work);
+ return -EINVAL;
+ }
+
+ ksmbd_conn_enqueue_request(work);
+ atomic_inc(&conn->r_count);
+ /* update activity on connection */
+ conn->last_active = jiffies;
+ INIT_WORK(&work->work, handle_ksmbd_work);
+ ksmbd_queue_work(work);
+ return 0;
+}
+
+static int ksmbd_server_process_request(struct ksmbd_conn *conn)
+{
+ return queue_ksmbd_work(conn);
+}
+
+static int ksmbd_server_terminate_conn(struct ksmbd_conn *conn)
+{
+ ksmbd_sessions_deregister(conn);
+ destroy_lease_table(conn);
+ return 0;
+}
+
+static void ksmbd_server_tcp_callbacks_init(void)
+{
+ struct ksmbd_conn_ops ops;
+
+ ops.process_fn = ksmbd_server_process_request;
+ ops.terminate_fn = ksmbd_server_terminate_conn;
+
+ ksmbd_conn_init_server_callbacks(&ops);
+}
+
+static void server_conf_free(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(server_conf.conf); i++) {
+ kfree(server_conf.conf[i]);
+ server_conf.conf[i] = NULL;
+ }
+}
+
+static int server_conf_init(void)
+{
+ WRITE_ONCE(server_conf.state, SERVER_STATE_STARTING_UP);
+ server_conf.enforced_signing = 0;
+ server_conf.min_protocol = ksmbd_min_protocol();
+ server_conf.max_protocol = ksmbd_max_protocol();
+ server_conf.auth_mechs = KSMBD_AUTH_NTLMSSP;
+#ifdef CONFIG_SMB_SERVER_KERBEROS5
+ server_conf.auth_mechs |= KSMBD_AUTH_KRB5 |
+ KSMBD_AUTH_MSKRB5;
+#endif
+ return 0;
+}
+
+static void server_ctrl_handle_init(struct server_ctrl_struct *ctrl)
+{
+ int ret;
+
+ ret = ksmbd_conn_transport_init();
+ if (ret) {
+ server_queue_ctrl_reset_work();
+ return;
+ }
+
+ WRITE_ONCE(server_conf.state, SERVER_STATE_RUNNING);
+}
+
+static void server_ctrl_handle_reset(struct server_ctrl_struct *ctrl)
+{
+ ksmbd_ipc_soft_reset();
+ ksmbd_conn_transport_destroy();
+ server_conf_free();
+ server_conf_init();
+ WRITE_ONCE(server_conf.state, SERVER_STATE_STARTING_UP);
+}
+
+static void server_ctrl_handle_work(struct work_struct *work)
+{
+ struct server_ctrl_struct *ctrl;
+
+ ctrl = container_of(work, struct server_ctrl_struct, ctrl_work);
+
+ mutex_lock(&ctrl_lock);
+ switch (ctrl->type) {
+ case SERVER_CTRL_TYPE_INIT:
+ server_ctrl_handle_init(ctrl);
+ break;
+ case SERVER_CTRL_TYPE_RESET:
+ server_ctrl_handle_reset(ctrl);
+ break;
+ default:
+ pr_err("Unknown server work type: %d\n", ctrl->type);
+ }
+ mutex_unlock(&ctrl_lock);
+ kfree(ctrl);
+ module_put(THIS_MODULE);
+}
+
+static int __queue_ctrl_work(int type)
+{
+ struct server_ctrl_struct *ctrl;
+
+ ctrl = kmalloc(sizeof(struct server_ctrl_struct), GFP_KERNEL);
+ if (!ctrl)
+ return -ENOMEM;
+
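+	/* Hold a module reference until server_ctrl_handle_work() releases it */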
+ __module_get(THIS_MODULE);
+ ctrl->type = type;
+ INIT_WORK(&ctrl->ctrl_work, server_ctrl_handle_work);
+ queue_work(system_long_wq, &ctrl->ctrl_work);
+ return 0;
+}
+
+int server_queue_ctrl_init_work(void)
+{
+ return __queue_ctrl_work(SERVER_CTRL_TYPE_INIT);
+}
+
+int server_queue_ctrl_reset_work(void)
+{
+ return __queue_ctrl_work(SERVER_CTRL_TYPE_RESET);
+}
+
+static ssize_t stats_show(struct class *class, struct class_attribute *attr,
+ char *buf)
+{
+ /*
+ * Inc this each time you change stats output format,
+ * so user space will know what to do.
+ */
+ static int stats_version = 2;
+ static const char * const state[] = {
+ "startup",
+ "running",
+ "reset",
+ "shutdown"
+ };
+
+ ssize_t sz = scnprintf(buf, PAGE_SIZE, "%d %s %d %lu\n", stats_version,
+ state[server_conf.state], server_conf.tcp_port,
+ server_conf.ipc_last_active / HZ);
+ return sz;
+}
+
+static ssize_t kill_server_store(struct class *class,
+ struct class_attribute *attr, const char *buf,
+ size_t len)
+{
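+	/* Only the exact string "hard" triggers a reset; other input is ignored */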
+ if (!sysfs_streq(buf, "hard"))
+ return len;
+
+ pr_info("kill command received\n");
+ mutex_lock(&ctrl_lock);
+ WRITE_ONCE(server_conf.state, SERVER_STATE_RESETTING);
+ __module_get(THIS_MODULE);
+ server_ctrl_handle_reset(NULL);
+ module_put(THIS_MODULE);
+ mutex_unlock(&ctrl_lock);
+ return len;
+}
+
+static const char * const debug_type_strings[] = {"smb", "auth", "vfs",
+ "oplock", "ipc", "conn",
+ "rdma"};
+
+static ssize_t debug_show(struct class *class, struct class_attribute *attr,
+ char *buf)
+{
+ ssize_t sz = 0;
+ int i, pos = 0;
+
+ for (i = 0; i < ARRAY_SIZE(debug_type_strings); i++) {
+ if ((ksmbd_debug_types >> i) & 1) {
+ pos = scnprintf(buf + sz,
+ PAGE_SIZE - sz,
+ "[%s] ",
+ debug_type_strings[i]);
+ } else {
+ pos = scnprintf(buf + sz,
+ PAGE_SIZE - sz,
+ "%s ",
+ debug_type_strings[i]);
+ }
+ sz += pos;
+ }
+ sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n");
+ return sz;
+}
+
+static ssize_t debug_store(struct class *class, struct class_attribute *attr,
+ const char *buf, size_t len)
+{
+ int i;
+
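+	/* Writing a debug type name toggles it; "all" toggles every type at once */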
+ for (i = 0; i < ARRAY_SIZE(debug_type_strings); i++) {
+ if (sysfs_streq(buf, "all")) {
+ if (ksmbd_debug_types == KSMBD_DEBUG_ALL)
+ ksmbd_debug_types = 0;
+ else
+ ksmbd_debug_types = KSMBD_DEBUG_ALL;
+ break;
+ }
+
+ if (sysfs_streq(buf, debug_type_strings[i])) {
+ if (ksmbd_debug_types & (1 << i))
+ ksmbd_debug_types &= ~(1 << i);
+ else
+ ksmbd_debug_types |= (1 << i);
+ break;
+ }
+ }
+
+ return len;
+}
+
+static CLASS_ATTR_RO(stats);
+static CLASS_ATTR_WO(kill_server);
+static CLASS_ATTR_RW(debug);
+
+static struct attribute *ksmbd_control_class_attrs[] = {
+ &class_attr_stats.attr,
+ &class_attr_kill_server.attr,
+ &class_attr_debug.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(ksmbd_control_class);
+
+static struct class ksmbd_control_class = {
+ .name = "ksmbd-control",
+ .owner = THIS_MODULE,
+ .class_groups = ksmbd_control_class_groups,
+};
+
+static int ksmbd_server_shutdown(void)
+{
+ WRITE_ONCE(server_conf.state, SERVER_STATE_SHUTTING_DOWN);
+
+ class_unregister(&ksmbd_control_class);
+ ksmbd_workqueue_destroy();
+ ksmbd_ipc_release();
+ ksmbd_conn_transport_destroy();
+ ksmbd_crypto_destroy();
+ ksmbd_free_global_file_table();
+ destroy_lease_table(NULL);
+ ksmbd_work_pool_destroy();
+ ksmbd_exit_file_cache();
+ server_conf_free();
+ return 0;
+}
+
+static int __init ksmbd_server_init(void)
+{
+ int ret;
+
+ ret = class_register(&ksmbd_control_class);
+ if (ret) {
+ pr_err("Unable to register ksmbd-control class\n");
+ return ret;
+ }
+
+ ksmbd_server_tcp_callbacks_init();
+
+ ret = server_conf_init();
+ if (ret)
+ goto err_unregister;
+
+ ret = ksmbd_work_pool_init();
+ if (ret)
+ goto err_unregister;
+
+ ret = ksmbd_init_file_cache();
+ if (ret)
+ goto err_destroy_work_pools;
+
+ ret = ksmbd_ipc_init();
+ if (ret)
+ goto err_exit_file_cache;
+
+ ret = ksmbd_init_global_file_table();
+ if (ret)
+ goto err_ipc_release;
+
+ ret = ksmbd_inode_hash_init();
+ if (ret)
+ goto err_destroy_file_table;
+
+ ret = ksmbd_crypto_create();
+ if (ret)
+ goto err_release_inode_hash;
+
+ ret = ksmbd_workqueue_init();
+ if (ret)
+ goto err_crypto_destroy;
+ return 0;
+
+err_crypto_destroy:
+ ksmbd_crypto_destroy();
+err_release_inode_hash:
+ ksmbd_release_inode_hash();
+err_destroy_file_table:
+ ksmbd_free_global_file_table();
+err_ipc_release:
+ ksmbd_ipc_release();
+err_exit_file_cache:
+ ksmbd_exit_file_cache();
+err_destroy_work_pools:
+ ksmbd_work_pool_destroy();
+err_unregister:
+ class_unregister(&ksmbd_control_class);
+
+ return ret;
+}
+
+/**
+ * ksmbd_server_exit() - shut down the forker thread and free memory at module exit
+ */
+static void __exit ksmbd_server_exit(void)
+{
+ ksmbd_server_shutdown();
+ ksmbd_release_inode_hash();
+}
+
+MODULE_AUTHOR("Namjae Jeon <linkinjeon@kernel.org>");
+MODULE_VERSION(KSMBD_VERSION);
+MODULE_DESCRIPTION("Linux kernel CIFS/SMB SERVER");
+MODULE_LICENSE("GPL");
+MODULE_SOFTDEP("pre: ecb");
+MODULE_SOFTDEP("pre: hmac");
+MODULE_SOFTDEP("pre: md4");
+MODULE_SOFTDEP("pre: md5");
+MODULE_SOFTDEP("pre: nls");
+MODULE_SOFTDEP("pre: aes");
+MODULE_SOFTDEP("pre: cmac");
+MODULE_SOFTDEP("pre: sha256");
+MODULE_SOFTDEP("pre: sha512");
+MODULE_SOFTDEP("pre: aead2");
+MODULE_SOFTDEP("pre: ccm");
+MODULE_SOFTDEP("pre: gcm");
+module_init(ksmbd_server_init)
+module_exit(ksmbd_server_exit)
diff --git a/fs/ksmbd/server.h b/fs/ksmbd/server.h
new file mode 100644
index 000000000000..ac9d932f8c8a
--- /dev/null
+++ b/fs/ksmbd/server.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#ifndef __SERVER_H__
+#define __SERVER_H__
+
+#include "smbacl.h"
+
+/*
+ * Server state type
+ */
+enum {
+ SERVER_STATE_STARTING_UP,
+ SERVER_STATE_RUNNING,
+ SERVER_STATE_RESETTING,
+ SERVER_STATE_SHUTTING_DOWN,
+};
+
+/*
+ * Server global config string index
+ */
+enum {
+ SERVER_CONF_NETBIOS_NAME,
+ SERVER_CONF_SERVER_STRING,
+ SERVER_CONF_WORK_GROUP,
+};
+
+struct ksmbd_server_config {
+ unsigned int flags;
+ unsigned int state;
+ short signing;
+ short enforced_signing;
+ short min_protocol;
+ short max_protocol;
+ unsigned short tcp_port;
+ unsigned short ipc_timeout;
+ unsigned long ipc_last_active;
+ unsigned long deadtime;
+ unsigned int share_fake_fscaps;
+ struct smb_sid domain_sid;
+ unsigned int auth_mechs;
+
+ char *conf[SERVER_CONF_WORK_GROUP + 1];
+};
+
+extern struct ksmbd_server_config server_conf;
+
+int ksmbd_set_netbios_name(char *v);
+int ksmbd_set_server_string(char *v);
+int ksmbd_set_work_group(char *v);
+
+char *ksmbd_netbios_name(void);
+char *ksmbd_server_string(void);
+char *ksmbd_work_group(void);
+
+static inline int ksmbd_server_running(void)
+{
+ return READ_ONCE(server_conf.state) == SERVER_STATE_RUNNING;
+}
+
+static inline int ksmbd_server_configurable(void)
+{
+ return READ_ONCE(server_conf.state) < SERVER_STATE_RESETTING;
+}
+
+int server_queue_ctrl_init_work(void);
+int server_queue_ctrl_reset_work(void);
+#endif /* __SERVER_H__ */
diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c
new file mode 100644
index 000000000000..9aa46bb3e10d
--- /dev/null
+++ b/fs/ksmbd/smb2misc.c
@@ -0,0 +1,438 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org>
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#include "glob.h"
+#include "nterr.h"
+#include "smb2pdu.h"
+#include "smb_common.h"
+#include "smbstatus.h"
+#include "mgmt/user_session.h"
+#include "connection.h"
+
+static int check_smb2_hdr(struct smb2_hdr *hdr)
+{
+ /*
+	 * Make sure that this really is an SMB request and not a response.
+ */
+ if (hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR)
+ return 1;
+ return 0;
+}
+
+/*
+ * The following table defines the expected "StructureSize" of SMB2 requests
+ * in order by SMB2 command. This is similar to "wct" in SMB/CIFS requests.
+ *
+ * Note that commands are defined in smb2pdu.h in le16 but the array below is
+ * indexed by command in host byte order
+ */
+static const __le16 smb2_req_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = {
+ /* SMB2_NEGOTIATE */ cpu_to_le16(36),
+ /* SMB2_SESSION_SETUP */ cpu_to_le16(25),
+ /* SMB2_LOGOFF */ cpu_to_le16(4),
+ /* SMB2_TREE_CONNECT */ cpu_to_le16(9),
+ /* SMB2_TREE_DISCONNECT */ cpu_to_le16(4),
+ /* SMB2_CREATE */ cpu_to_le16(57),
+ /* SMB2_CLOSE */ cpu_to_le16(24),
+ /* SMB2_FLUSH */ cpu_to_le16(24),
+ /* SMB2_READ */ cpu_to_le16(49),
+ /* SMB2_WRITE */ cpu_to_le16(49),
+ /* SMB2_LOCK */ cpu_to_le16(48),
+ /* SMB2_IOCTL */ cpu_to_le16(57),
+ /* SMB2_CANCEL */ cpu_to_le16(4),
+ /* SMB2_ECHO */ cpu_to_le16(4),
+ /* SMB2_QUERY_DIRECTORY */ cpu_to_le16(33),
+ /* SMB2_CHANGE_NOTIFY */ cpu_to_le16(32),
+ /* SMB2_QUERY_INFO */ cpu_to_le16(41),
+ /* SMB2_SET_INFO */ cpu_to_le16(33),
+ /* use 44 for lease break */
+ /* SMB2_OPLOCK_BREAK */ cpu_to_le16(36)
+};
+
+/*
+ * The size of the variable area depends on the offset and length fields
+ * located in different fields for various SMB2 requests. SMB2 requests
+ * with no variable length info show an offset of zero for the offset field.
+ */
+static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = {
+ /* SMB2_NEGOTIATE */ true,
+ /* SMB2_SESSION_SETUP */ true,
+ /* SMB2_LOGOFF */ false,
+ /* SMB2_TREE_CONNECT */ true,
+ /* SMB2_TREE_DISCONNECT */ false,
+ /* SMB2_CREATE */ true,
+ /* SMB2_CLOSE */ false,
+ /* SMB2_FLUSH */ false,
+ /* SMB2_READ */ true,
+ /* SMB2_WRITE */ true,
+ /* SMB2_LOCK */ true,
+ /* SMB2_IOCTL */ true,
+ /* SMB2_CANCEL */ false, /* BB CHECK this not listed in documentation */
+ /* SMB2_ECHO */ false,
+ /* SMB2_QUERY_DIRECTORY */ true,
+ /* SMB2_CHANGE_NOTIFY */ false,
+ /* SMB2_QUERY_INFO */ true,
+ /* SMB2_SET_INFO */ true,
+ /* SMB2_OPLOCK_BREAK */ false
+};
+
+/*
+ * Returns the pointer to the beginning of the data area. The length of the
+ * data area and the offset to it (from the beginning of the smb) are also
+ * returned.
+ */
+static char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
+{
+ *off = 0;
+ *len = 0;
+
+	/* error requests do not have a data area */
+ if (hdr->Status && hdr->Status != STATUS_MORE_PROCESSING_REQUIRED &&
+ (((struct smb2_err_rsp *)hdr)->StructureSize) == SMB2_ERROR_STRUCTURE_SIZE2_LE)
+ return NULL;
+
+ /*
+ * Following commands have data areas so we have to get the location
+ * of the data buffer offset and data buffer length for the particular
+ * command.
+ */
+ switch (hdr->Command) {
+ case SMB2_SESSION_SETUP:
+ *off = le16_to_cpu(((struct smb2_sess_setup_req *)hdr)->SecurityBufferOffset);
+ *len = le16_to_cpu(((struct smb2_sess_setup_req *)hdr)->SecurityBufferLength);
+ break;
+ case SMB2_TREE_CONNECT:
+ *off = le16_to_cpu(((struct smb2_tree_connect_req *)hdr)->PathOffset);
+ *len = le16_to_cpu(((struct smb2_tree_connect_req *)hdr)->PathLength);
+ break;
+ case SMB2_CREATE:
+ {
+ if (((struct smb2_create_req *)hdr)->CreateContextsLength) {
+ *off = le32_to_cpu(((struct smb2_create_req *)
+ hdr)->CreateContextsOffset);
+ *len = le32_to_cpu(((struct smb2_create_req *)
+ hdr)->CreateContextsLength);
+ break;
+ }
+
+ *off = le16_to_cpu(((struct smb2_create_req *)hdr)->NameOffset);
+ *len = le16_to_cpu(((struct smb2_create_req *)hdr)->NameLength);
+ break;
+ }
+ case SMB2_QUERY_INFO:
+ *off = le16_to_cpu(((struct smb2_query_info_req *)hdr)->InputBufferOffset);
+ *len = le32_to_cpu(((struct smb2_query_info_req *)hdr)->InputBufferLength);
+ break;
+ case SMB2_SET_INFO:
+ *off = le16_to_cpu(((struct smb2_set_info_req *)hdr)->BufferOffset);
+ *len = le32_to_cpu(((struct smb2_set_info_req *)hdr)->BufferLength);
+ break;
+ case SMB2_READ:
+ *off = le16_to_cpu(((struct smb2_read_req *)hdr)->ReadChannelInfoOffset);
+ *len = le16_to_cpu(((struct smb2_read_req *)hdr)->ReadChannelInfoLength);
+ break;
+ case SMB2_WRITE:
+ if (((struct smb2_write_req *)hdr)->DataOffset) {
+ *off = le16_to_cpu(((struct smb2_write_req *)hdr)->DataOffset);
+ *len = le32_to_cpu(((struct smb2_write_req *)hdr)->Length);
+ break;
+ }
+
+ *off = le16_to_cpu(((struct smb2_write_req *)hdr)->WriteChannelInfoOffset);
+ *len = le16_to_cpu(((struct smb2_write_req *)hdr)->WriteChannelInfoLength);
+ break;
+ case SMB2_QUERY_DIRECTORY:
+ *off = le16_to_cpu(((struct smb2_query_directory_req *)hdr)->FileNameOffset);
+ *len = le16_to_cpu(((struct smb2_query_directory_req *)hdr)->FileNameLength);
+ break;
+ case SMB2_LOCK:
+ {
+ int lock_count;
+
+ /*
+		 * The smb2_lock request size is 48, which already includes a
+		 * single smb2_lock_element structure.
+ */
+ lock_count = le16_to_cpu(((struct smb2_lock_req *)hdr)->LockCount) - 1;
+ if (lock_count > 0) {
+ *off = __SMB2_HEADER_STRUCTURE_SIZE + 48;
+ *len = sizeof(struct smb2_lock_element) * lock_count;
+ }
+ break;
+ }
+ case SMB2_IOCTL:
+ *off = le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputOffset);
+ *len = le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputCount);
+
+ break;
+ default:
+ ksmbd_debug(SMB, "no length check for command\n");
+ break;
+ }
+
+ /*
+ * Invalid length or offset probably means data area is invalid, but
+ * we have little choice but to ignore the data area in this case.
+ */
+ if (*off > 4096) {
+ ksmbd_debug(SMB, "offset %d too large, data area ignored\n",
+ *off);
+ *len = 0;
+ *off = 0;
+ } else if (*off < 0) {
+ ksmbd_debug(SMB,
+ "negative offset %d to data invalid ignore data area\n",
+ *off);
+ *off = 0;
+ *len = 0;
+ } else if (*len < 0) {
+ ksmbd_debug(SMB,
+ "negative data length %d invalid, data area ignored\n",
+ *len);
+ *len = 0;
+ } else if (*len > 128 * 1024) {
+ ksmbd_debug(SMB, "data area larger than 128K: %d\n", *len);
+ *len = 0;
+ }
+
+ /* return pointer to beginning of data area, ie offset from SMB start */
+ if ((*off != 0) && (*len != 0))
+ return (char *)hdr + *off;
+ else
+ return NULL;
+}
+
+/*
+ * Calculate the size of the SMB message based on the fixed header
+ * portion, the number of word parameters and the data portion of the message.
+ */
+static unsigned int smb2_calc_size(void *buf)
+{
+ struct smb2_pdu *pdu = (struct smb2_pdu *)buf;
+ struct smb2_hdr *hdr = &pdu->hdr;
+ int offset; /* the offset from the beginning of SMB to data area */
+ int data_length; /* the length of the variable length data area */
+ /* Structure Size has already been checked to make sure it is 64 */
+ int len = le16_to_cpu(hdr->StructureSize);
+
+ /*
+ * StructureSize2, ie length of fixed parameter area has already
+ * been checked to make sure it is the correct length.
+ */
+ len += le16_to_cpu(pdu->StructureSize2);
+
+ if (has_smb2_data_area[le16_to_cpu(hdr->Command)] == false)
+ goto calc_size_exit;
+
+ smb2_get_data_area_len(&offset, &data_length, hdr);
+ ksmbd_debug(SMB, "SMB2 data length %d offset %d\n", data_length,
+ offset);
+
+ if (data_length > 0) {
+ /*
+		 * Check to make sure that the data area begins after the fixed area.
+		 * Note that the last byte of the fixed area is part of the data area
+ * for some commands, typically those with odd StructureSize,
+ * so we must add one to the calculation.
+ */
+ if (offset + 1 < len)
+ ksmbd_debug(SMB,
+ "data area offset %d overlaps SMB2 header %d\n",
+ offset + 1, len);
+ else
+ len = offset + data_length;
+ }
+calc_size_exit:
+ ksmbd_debug(SMB, "SMB2 len %d\n", len);
+ return len;
+}
+
+static inline int smb2_query_info_req_len(struct smb2_query_info_req *h)
+{
+ return le32_to_cpu(h->InputBufferLength) +
+ le32_to_cpu(h->OutputBufferLength);
+}
+
+static inline int smb2_set_info_req_len(struct smb2_set_info_req *h)
+{
+ return le32_to_cpu(h->BufferLength);
+}
+
+static inline int smb2_read_req_len(struct smb2_read_req *h)
+{
+ return le32_to_cpu(h->Length);
+}
+
+static inline int smb2_write_req_len(struct smb2_write_req *h)
+{
+ return le32_to_cpu(h->Length);
+}
+
+static inline int smb2_query_dir_req_len(struct smb2_query_directory_req *h)
+{
+ return le32_to_cpu(h->OutputBufferLength);
+}
+
+static inline int smb2_ioctl_req_len(struct smb2_ioctl_req *h)
+{
+ return le32_to_cpu(h->InputCount) +
+ le32_to_cpu(h->OutputCount);
+}
+
+static inline int smb2_ioctl_resp_len(struct smb2_ioctl_req *h)
+{
+ return le32_to_cpu(h->MaxInputResponse) +
+ le32_to_cpu(h->MaxOutputResponse);
+}
+
+static int smb2_validate_credit_charge(struct smb2_hdr *hdr)
+{
+ int req_len = 0, expect_resp_len = 0, calc_credit_num, max_len;
+ int credit_charge = le16_to_cpu(hdr->CreditCharge);
+ void *__hdr = hdr;
+
+ switch (hdr->Command) {
+ case SMB2_QUERY_INFO:
+ req_len = smb2_query_info_req_len(__hdr);
+ break;
+ case SMB2_SET_INFO:
+ req_len = smb2_set_info_req_len(__hdr);
+ break;
+ case SMB2_READ:
+ req_len = smb2_read_req_len(__hdr);
+ break;
+ case SMB2_WRITE:
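+	/* EA and all-information queries can return large payloads */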
+ req_len = smb2_write_req_len(__hdr);
+ break;
+ case SMB2_QUERY_DIRECTORY:
+ req_len = smb2_query_dir_req_len(__hdr);
+ break;
+ case SMB2_IOCTL:
+ req_len = smb2_ioctl_req_len(__hdr);
+ expect_resp_len = smb2_ioctl_resp_len(__hdr);
+ break;
+ default:
+ return 0;
+ }
+
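+	/*
+	 * The charge must cover the larger of the request payload and the
+	 * expected response payload, in SMB2_MAX_BUFFER_SIZE units.
+	 */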
+ credit_charge = max(1, credit_charge);
+ max_len = max(req_len, expect_resp_len);
+ calc_credit_num = DIV_ROUND_UP(max_len, SMB2_MAX_BUFFER_SIZE);
+
+ if (credit_charge < calc_credit_num) {
+ pr_err("Insufficient credit charge, given: %d, needed: %d\n",
+ credit_charge, calc_credit_num);
+ return 1;
+ }
+
+ return 0;
+}
+
+int ksmbd_smb2_check_message(struct ksmbd_work *work)
+{
+ struct smb2_pdu *pdu = work->request_buf;
+ struct smb2_hdr *hdr = &pdu->hdr;
+ int command;
+ __u32 clc_len; /* calculated length */
+ __u32 len = get_rfc1002_len(pdu);
+
+ if (work->next_smb2_rcv_hdr_off) {
+ pdu = ksmbd_req_buf_next(work);
+ hdr = &pdu->hdr;
+ }
+
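+	/* For compounded requests, limit the length check to the current PDU */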
+ if (le32_to_cpu(hdr->NextCommand) > 0) {
+ len = le32_to_cpu(hdr->NextCommand);
+ } else if (work->next_smb2_rcv_hdr_off) {
+ len -= work->next_smb2_rcv_hdr_off;
+ len = round_up(len, 8);
+ }
+
+ if (check_smb2_hdr(hdr))
+ return 1;
+
+ if (hdr->StructureSize != SMB2_HEADER_STRUCTURE_SIZE) {
+ ksmbd_debug(SMB, "Illegal structure size %u\n",
+ le16_to_cpu(hdr->StructureSize));
+ return 1;
+ }
+
+ command = le16_to_cpu(hdr->Command);
+ if (command >= NUMBER_OF_SMB2_COMMANDS) {
+ ksmbd_debug(SMB, "Illegal SMB2 command %d\n", command);
+ return 1;
+ }
+
+ if (smb2_req_struct_sizes[command] != pdu->StructureSize2) {
+ if (command != SMB2_OPLOCK_BREAK_HE &&
+ (hdr->Status == 0 || pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2_LE)) {
+ /* error packets have 9 byte structure size */
+ ksmbd_debug(SMB,
+ "Illegal request size %u for command %d\n",
+ le16_to_cpu(pdu->StructureSize2), command);
+ return 1;
+ } else if (command == SMB2_OPLOCK_BREAK_HE &&
+ hdr->Status == 0 &&
+ le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_20 &&
+ le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_21) {
+ /* special case for SMB2.1 lease break message */
+ ksmbd_debug(SMB,
+ "Illegal request size %d for oplock break\n",
+ le16_to_cpu(pdu->StructureSize2));
+ return 1;
+ }
+ }
+
+ if ((work->conn->vals->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU) &&
+ smb2_validate_credit_charge(hdr)) {
+ work->conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER);
+ return 1;
+ }
+
+ clc_len = smb2_calc_size(hdr);
+ if (len != clc_len) {
+ /* server can return one byte more due to implied bcc[0] */
+ if (clc_len == len + 1)
+ return 0;
+
+ /*
+		 * Some windows servers (win2016) will also pad the final
+ * PDU in a compound to 8 bytes.
+ */
+ if (ALIGN(clc_len, 8) == len)
+ return 0;
+
+ /*
+		 * Windows clients also pad up to 8 bytes when compounding.
+		 * If the pad is longer than eight bytes, log the client behavior
+		 * (once), since it may indicate a problem, but allow it and
+		 * continue since the frame is parseable.
+ */
+ if (clc_len < len) {
+ ksmbd_debug(SMB,
+ "cli req padded more than expected. Length %d not %d for cmd:%d mid:%llu\n",
+ len, clc_len, command,
+ le64_to_cpu(hdr->MessageId));
+ return 0;
+ }
+
+ if (command == SMB2_LOCK_HE && len == 88)
+ return 0;
+
+ ksmbd_debug(SMB,
+ "cli req too short, len %d not %d. cmd:%d mid:%llu\n",
+ len, clc_len, command,
+ le64_to_cpu(hdr->MessageId));
+
+ return 1;
+ }
+
+ return 0;
+}
+
+int smb2_negotiate_request(struct ksmbd_work *work)
+{
+ return ksmbd_smb_negotiate_common(work, SMB2_NEGOTIATE_HE);
+}
diff --git a/fs/ksmbd/smb2ops.c b/fs/ksmbd/smb2ops.c
new file mode 100644
index 000000000000..197473871aa4
--- /dev/null
+++ b/fs/ksmbd/smb2ops.c
@@ -0,0 +1,312 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org>
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/slab.h>
+#include "glob.h"
+#include "smb2pdu.h"
+
+#include "auth.h"
+#include "connection.h"
+#include "smb_common.h"
+#include "server.h"
+
+static struct smb_version_values smb21_server_values = {
+ .version_string = SMB21_VERSION_STRING,
+ .protocol_id = SMB21_PROT_ID,
+ .capabilities = SMB2_GLOBAL_CAP_LARGE_MTU,
+ .max_read_size = SMB21_DEFAULT_IOSIZE,
+ .max_write_size = SMB21_DEFAULT_IOSIZE,
+ .max_trans_size = SMB21_DEFAULT_IOSIZE,
+ .large_lock_type = 0,
+ .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE,
+ .shared_lock_type = SMB2_LOCKFLAG_SHARED,
+ .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
+ .header_size = sizeof(struct smb2_hdr),
+ .max_header_size = MAX_SMB2_HDR_SIZE,
+ .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+ .lock_cmd = SMB2_LOCK,
+ .cap_unix = 0,
+ .cap_nt_find = SMB2_NT_FIND,
+ .cap_large_files = SMB2_LARGE_FILES,
+ .create_lease_size = sizeof(struct create_lease),
+ .create_durable_size = sizeof(struct create_durable_rsp),
+ .create_mxac_size = sizeof(struct create_mxac_rsp),
+ .create_disk_id_size = sizeof(struct create_disk_id_rsp),
+ .create_posix_size = sizeof(struct create_posix_rsp),
+};
+
+static struct smb_version_values smb30_server_values = {
+ .version_string = SMB30_VERSION_STRING,
+ .protocol_id = SMB30_PROT_ID,
+ .capabilities = SMB2_GLOBAL_CAP_LARGE_MTU,
+ .max_read_size = SMB3_DEFAULT_IOSIZE,
+ .max_write_size = SMB3_DEFAULT_IOSIZE,
+ .max_trans_size = SMB3_DEFAULT_TRANS_SIZE,
+ .large_lock_type = 0,
+ .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE,
+ .shared_lock_type = SMB2_LOCKFLAG_SHARED,
+ .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
+ .header_size = sizeof(struct smb2_hdr),
+ .max_header_size = MAX_SMB2_HDR_SIZE,
+ .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+ .lock_cmd = SMB2_LOCK,
+ .cap_unix = 0,
+ .cap_nt_find = SMB2_NT_FIND,
+ .cap_large_files = SMB2_LARGE_FILES,
+ .create_lease_size = sizeof(struct create_lease_v2),
+ .create_durable_size = sizeof(struct create_durable_rsp),
+ .create_durable_v2_size = sizeof(struct create_durable_v2_rsp),
+ .create_mxac_size = sizeof(struct create_mxac_rsp),
+ .create_disk_id_size = sizeof(struct create_disk_id_rsp),
+ .create_posix_size = sizeof(struct create_posix_rsp),
+};
+
+static struct smb_version_values smb302_server_values = {
+ .version_string = SMB302_VERSION_STRING,
+ .protocol_id = SMB302_PROT_ID,
+ .capabilities = SMB2_GLOBAL_CAP_LARGE_MTU,
+ .max_read_size = SMB3_DEFAULT_IOSIZE,
+ .max_write_size = SMB3_DEFAULT_IOSIZE,
+ .max_trans_size = SMB3_DEFAULT_TRANS_SIZE,
+ .large_lock_type = 0,
+ .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE,
+ .shared_lock_type = SMB2_LOCKFLAG_SHARED,
+ .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
+ .header_size = sizeof(struct smb2_hdr),
+ .max_header_size = MAX_SMB2_HDR_SIZE,
+ .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+ .lock_cmd = SMB2_LOCK,
+ .cap_unix = 0,
+ .cap_nt_find = SMB2_NT_FIND,
+ .cap_large_files = SMB2_LARGE_FILES,
+ .create_lease_size = sizeof(struct create_lease_v2),
+ .create_durable_size = sizeof(struct create_durable_rsp),
+ .create_durable_v2_size = sizeof(struct create_durable_v2_rsp),
+ .create_mxac_size = sizeof(struct create_mxac_rsp),
+ .create_disk_id_size = sizeof(struct create_disk_id_rsp),
+ .create_posix_size = sizeof(struct create_posix_rsp),
+};
+
+static struct smb_version_values smb311_server_values = {
+ .version_string = SMB311_VERSION_STRING,
+ .protocol_id = SMB311_PROT_ID,
+ .capabilities = SMB2_GLOBAL_CAP_LARGE_MTU,
+ .max_read_size = SMB3_DEFAULT_IOSIZE,
+ .max_write_size = SMB3_DEFAULT_IOSIZE,
+ .max_trans_size = SMB3_DEFAULT_TRANS_SIZE,
+ .large_lock_type = 0,
+ .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE,
+ .shared_lock_type = SMB2_LOCKFLAG_SHARED,
+ .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
+ .header_size = sizeof(struct smb2_hdr),
+ .max_header_size = MAX_SMB2_HDR_SIZE,
+ .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+ .lock_cmd = SMB2_LOCK,
+ .cap_unix = 0,
+ .cap_nt_find = SMB2_NT_FIND,
+ .cap_large_files = SMB2_LARGE_FILES,
+ .create_lease_size = sizeof(struct create_lease_v2),
+ .create_durable_size = sizeof(struct create_durable_rsp),
+ .create_durable_v2_size = sizeof(struct create_durable_v2_rsp),
+ .create_mxac_size = sizeof(struct create_mxac_rsp),
+ .create_disk_id_size = sizeof(struct create_disk_id_rsp),
+ .create_posix_size = sizeof(struct create_posix_rsp),
+};
+
+static struct smb_version_ops smb2_0_server_ops = {
+ .get_cmd_val = get_smb2_cmd_val,
+ .init_rsp_hdr = init_smb2_rsp_hdr,
+ .set_rsp_status = set_smb2_rsp_status,
+ .allocate_rsp_buf = smb2_allocate_rsp_buf,
+ .set_rsp_credits = smb2_set_rsp_credits,
+ .check_user_session = smb2_check_user_session,
+ .get_ksmbd_tcon = smb2_get_ksmbd_tcon,
+ .is_sign_req = smb2_is_sign_req,
+ .check_sign_req = smb2_check_sign_req,
+ .set_sign_rsp = smb2_set_sign_rsp
+};
+
+static struct smb_version_ops smb3_0_server_ops = {
+ .get_cmd_val = get_smb2_cmd_val,
+ .init_rsp_hdr = init_smb2_rsp_hdr,
+ .set_rsp_status = set_smb2_rsp_status,
+ .allocate_rsp_buf = smb2_allocate_rsp_buf,
+ .set_rsp_credits = smb2_set_rsp_credits,
+ .check_user_session = smb2_check_user_session,
+ .get_ksmbd_tcon = smb2_get_ksmbd_tcon,
+ .is_sign_req = smb2_is_sign_req,
+ .check_sign_req = smb3_check_sign_req,
+ .set_sign_rsp = smb3_set_sign_rsp,
+ .generate_signingkey = ksmbd_gen_smb30_signingkey,
+ .generate_encryptionkey = ksmbd_gen_smb30_encryptionkey,
+ .is_transform_hdr = smb3_is_transform_hdr,
+ .decrypt_req = smb3_decrypt_req,
+ .encrypt_resp = smb3_encrypt_resp
+};
+
+static struct smb_version_ops smb3_11_server_ops = {
+ .get_cmd_val = get_smb2_cmd_val,
+ .init_rsp_hdr = init_smb2_rsp_hdr,
+ .set_rsp_status = set_smb2_rsp_status,
+ .allocate_rsp_buf = smb2_allocate_rsp_buf,
+ .set_rsp_credits = smb2_set_rsp_credits,
+ .check_user_session = smb2_check_user_session,
+ .get_ksmbd_tcon = smb2_get_ksmbd_tcon,
+ .is_sign_req = smb2_is_sign_req,
+ .check_sign_req = smb3_check_sign_req,
+ .set_sign_rsp = smb3_set_sign_rsp,
+ .generate_signingkey = ksmbd_gen_smb311_signingkey,
+ .generate_encryptionkey = ksmbd_gen_smb311_encryptionkey,
+ .is_transform_hdr = smb3_is_transform_hdr,
+ .decrypt_req = smb3_decrypt_req,
+ .encrypt_resp = smb3_encrypt_resp
+};
+
+static struct smb_version_cmds smb2_0_server_cmds[NUMBER_OF_SMB2_COMMANDS] = {
+ [SMB2_NEGOTIATE_HE] = { .proc = smb2_negotiate_request, },
+ [SMB2_SESSION_SETUP_HE] = { .proc = smb2_sess_setup, },
+ [SMB2_TREE_CONNECT_HE] = { .proc = smb2_tree_connect,},
+ [SMB2_TREE_DISCONNECT_HE] = { .proc = smb2_tree_disconnect,},
+ [SMB2_LOGOFF_HE] = { .proc = smb2_session_logoff,},
+ [SMB2_CREATE_HE] = { .proc = smb2_open},
+ [SMB2_QUERY_INFO_HE] = { .proc = smb2_query_info},
+ [SMB2_QUERY_DIRECTORY_HE] = { .proc = smb2_query_dir},
+ [SMB2_CLOSE_HE] = { .proc = smb2_close},
+ [SMB2_ECHO_HE] = { .proc = smb2_echo},
+ [SMB2_SET_INFO_HE] = { .proc = smb2_set_info},
+ [SMB2_READ_HE] = { .proc = smb2_read},
+ [SMB2_WRITE_HE] = { .proc = smb2_write},
+ [SMB2_FLUSH_HE] = { .proc = smb2_flush},
+ [SMB2_CANCEL_HE] = { .proc = smb2_cancel},
+ [SMB2_LOCK_HE] = { .proc = smb2_lock},
+ [SMB2_IOCTL_HE] = { .proc = smb2_ioctl},
+ [SMB2_OPLOCK_BREAK_HE] = { .proc = smb2_oplock_break},
+ [SMB2_CHANGE_NOTIFY_HE] = { .proc = smb2_notify},
+};
+
+int init_smb2_0_server(struct ksmbd_conn *conn)
+{
+ return -EOPNOTSUPP;
+}
+
+/**
+ * init_smb2_1_server() - initialize a smb server connection with smb2.1
+ * command dispatcher
+ * @conn: connection instance
+ */
+void init_smb2_1_server(struct ksmbd_conn *conn)
+{
+ conn->vals = &smb21_server_values;
+ conn->ops = &smb2_0_server_ops;
+ conn->cmds = smb2_0_server_cmds;
+ conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds);
+ conn->max_credits = SMB2_MAX_CREDITS;
+ conn->signing_algorithm = SIGNING_ALG_HMAC_SHA256;
+
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES)
+ conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING;
+}
+
+/**
+ * init_smb3_0_server() - initialize a smb server connection with smb3.0
+ * command dispatcher
+ * @conn: connection instance
+ */
+void init_smb3_0_server(struct ksmbd_conn *conn)
+{
+ conn->vals = &smb30_server_values;
+ conn->ops = &smb3_0_server_ops;
+ conn->cmds = smb2_0_server_cmds;
+ conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds);
+ conn->max_credits = SMB2_MAX_CREDITS;
+ conn->signing_algorithm = SIGNING_ALG_AES_CMAC;
+
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES)
+ conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING;
+
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION &&
+ conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION)
+ conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION;
+
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL)
+ conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL;
+}
+
+/**
+ * init_smb3_02_server() - initialize a smb server connection with smb3.02
+ * command dispatcher
+ * @conn: connection instance
+ */
+void init_smb3_02_server(struct ksmbd_conn *conn)
+{
+ conn->vals = &smb302_server_values;
+ conn->ops = &smb3_0_server_ops;
+ conn->cmds = smb2_0_server_cmds;
+ conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds);
+ conn->max_credits = SMB2_MAX_CREDITS;
+ conn->signing_algorithm = SIGNING_ALG_AES_CMAC;
+
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES)
+ conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING;
+
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION &&
+ conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION)
+ conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION;
+
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL)
+ conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL;
+}
+
+/**
+ * init_smb3_11_server() - initialize a smb server connection with smb3.11
+ * command dispatcher
+ * @conn: connection instance
+ */
+int init_smb3_11_server(struct ksmbd_conn *conn)
+{
+ conn->vals = &smb311_server_values;
+ conn->ops = &smb3_11_server_ops;
+ conn->cmds = smb2_0_server_cmds;
+ conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds);
+ conn->max_credits = SMB2_MAX_CREDITS;
+ conn->signing_algorithm = SIGNING_ALG_AES_CMAC;
+
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES)
+ conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING;
+
+ if (conn->cipher_type)
+ conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION;
+
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL)
+ conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL;
+
+ INIT_LIST_HEAD(&conn->preauth_sess_table);
+ return 0;
+}
+
+void init_smb2_max_read_size(unsigned int sz)
+{
+ smb21_server_values.max_read_size = sz;
+ smb30_server_values.max_read_size = sz;
+ smb302_server_values.max_read_size = sz;
+ smb311_server_values.max_read_size = sz;
+}
+
+void init_smb2_max_write_size(unsigned int sz)
+{
+ smb21_server_values.max_write_size = sz;
+ smb30_server_values.max_write_size = sz;
+ smb302_server_values.max_write_size = sz;
+ smb311_server_values.max_write_size = sz;
+}
+
+void init_smb2_max_trans_size(unsigned int sz)
+{
+ smb21_server_values.max_trans_size = sz;
+ smb30_server_values.max_trans_size = sz;
+ smb302_server_values.max_trans_size = sz;
+ smb311_server_values.max_trans_size = sz;
+}
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
new file mode 100644
index 000000000000..d329ea49fa14
--- /dev/null
+++ b/fs/ksmbd/smb2pdu.c
@@ -0,0 +1,8373 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org>
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/inetdevice.h>
+#include <net/addrconf.h>
+#include <linux/syscalls.h>
+#include <linux/namei.h>
+#include <linux/statfs.h>
+#include <linux/ethtool.h>
+#include <linux/falloc.h>
+
+#include "glob.h"
+#include "smb2pdu.h"
+#include "smbfsctl.h"
+#include "oplock.h"
+#include "smbacl.h"
+
+#include "auth.h"
+#include "asn1.h"
+#include "connection.h"
+#include "transport_ipc.h"
+#include "transport_rdma.h"
+#include "vfs.h"
+#include "vfs_cache.h"
+#include "misc.h"
+
+#include "server.h"
+#include "smb_common.h"
+#include "smbstatus.h"
+#include "ksmbd_work.h"
+#include "mgmt/user_config.h"
+#include "mgmt/share_config.h"
+#include "mgmt/tree_connect.h"
+#include "mgmt/user_session.h"
+#include "mgmt/ksmbd_ida.h"
+#include "ndr.h"
+
+static void __wbuf(struct ksmbd_work *work, void **req, void **rsp)
+{
+ if (work->next_smb2_rcv_hdr_off) {
+ *req = ksmbd_req_buf_next(work);
+ *rsp = ksmbd_resp_buf_next(work);
+ } else {
+ *req = work->request_buf;
+ *rsp = work->response_buf;
+ }
+}
+
+#define WORK_BUFFERS(w, rq, rs) __wbuf((w), (void **)&(rq), (void **)&(rs))
+
+/**
+ * check_session_id() - check for valid session id in smb header
+ * @conn: connection instance
+ * @id: session id from smb header
+ *
+ * Return: true if valid session id, otherwise false
+ */
+static inline bool check_session_id(struct ksmbd_conn *conn, u64 id)
+{
+ struct ksmbd_session *sess;
+
+ if (id == 0 || id == -1)
+ return false;
+
+ sess = ksmbd_session_lookup_all(conn, id);
+ if (sess)
+ return true;
+ pr_err("Invalid user session id: %llu\n", id);
+ return false;
+}
+
+struct channel *lookup_chann_list(struct ksmbd_session *sess, struct ksmbd_conn *conn)
+{
+ struct channel *chann;
+
+ list_for_each_entry(chann, &sess->ksmbd_chann_list, chann_list) {
+ if (chann->conn == conn)
+ return chann;
+ }
+
+ return NULL;
+}
+
+/**
+ * smb2_get_ksmbd_tcon() - get tree connection information using a tree id.
+ * @work: smb work
+ *
+ * Return: 0 if the command does not require a tree connection, 1 if a
+ * matching tree connection is found, otherwise error
+ */
+int smb2_get_ksmbd_tcon(struct ksmbd_work *work)
+{
+ struct smb2_hdr *req_hdr = work->request_buf;
+ int tree_id;
+
+ work->tcon = NULL;
+ if (work->conn->ops->get_cmd_val(work) == SMB2_TREE_CONNECT_HE ||
+ work->conn->ops->get_cmd_val(work) == SMB2_CANCEL_HE ||
+ work->conn->ops->get_cmd_val(work) == SMB2_LOGOFF_HE) {
+ ksmbd_debug(SMB, "skip to check tree connect request\n");
+ return 0;
+ }
+
+ if (xa_empty(&work->sess->tree_conns)) {
+ ksmbd_debug(SMB, "NO tree connected\n");
+ return -ENOENT;
+ }
+
+ tree_id = le32_to_cpu(req_hdr->Id.SyncId.TreeId);
+ work->tcon = ksmbd_tree_conn_lookup(work->sess, tree_id);
+ if (!work->tcon) {
+ pr_err("Invalid tid %d\n", tree_id);
+ return -EINVAL;
+ }
+
+ return 1;
+}
+
+/**
+ * smb2_set_err_rsp() - set error response code on smb response
+ * @work: smb work containing response buffer
+ */
+void smb2_set_err_rsp(struct ksmbd_work *work)
+{
+ struct smb2_err_rsp *err_rsp;
+
+ if (work->next_smb2_rcv_hdr_off)
+ err_rsp = ksmbd_resp_buf_next(work);
+ else
+ err_rsp = work->response_buf;
+
+ if (err_rsp->hdr.Status != STATUS_STOPPED_ON_SYMLINK) {
+ err_rsp->StructureSize = SMB2_ERROR_STRUCTURE_SIZE2_LE;
+ err_rsp->ErrorContextCount = 0;
+ err_rsp->Reserved = 0;
+ err_rsp->ByteCount = 0;
+ err_rsp->ErrorData[0] = 0;
+ inc_rfc1001_len(work->response_buf, SMB2_ERROR_STRUCTURE_SIZE2);
+ }
+}
+
+/**
+ * is_smb2_neg_cmd() - is it smb2 negotiation command
+ * @work: smb work containing smb header
+ *
+ * Return: true if smb2 negotiation command, otherwise false
+ */
+bool is_smb2_neg_cmd(struct ksmbd_work *work)
+{
+ struct smb2_hdr *hdr = work->request_buf;
+
+ /* is it SMB2 header ? */
+ if (hdr->ProtocolId != SMB2_PROTO_NUMBER)
+ return false;
+
+ /* make sure it is request not response message */
+ if (hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR)
+ return false;
+
+ if (hdr->Command != SMB2_NEGOTIATE)
+ return false;
+
+ return true;
+}
+
+/**
+ * is_smb2_rsp() - is it smb2 response
+ * @work: smb work containing smb response buffer
+ *
+ * Return: true if smb2 response, otherwise false
+ */
+bool is_smb2_rsp(struct ksmbd_work *work)
+{
+ struct smb2_hdr *hdr = work->response_buf;
+
+ /* is it SMB2 header ? */
+ if (hdr->ProtocolId != SMB2_PROTO_NUMBER)
+ return false;
+
+ /* make sure it is response not request message */
+ if (!(hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR))
+ return false;
+
+ return true;
+}
+
+/**
+ * get_smb2_cmd_val() - get smb command code from smb header
+ * @work: smb work containing smb request buffer
+ *
+ * Return: smb2 request command value
+ */
+u16 get_smb2_cmd_val(struct ksmbd_work *work)
+{
+ struct smb2_hdr *rcv_hdr;
+
+ if (work->next_smb2_rcv_hdr_off)
+ rcv_hdr = ksmbd_req_buf_next(work);
+ else
+ rcv_hdr = work->request_buf;
+ return le16_to_cpu(rcv_hdr->Command);
+}
+
+/**
+ * set_smb2_rsp_status() - set error response code on smb2 header
+ * @work: smb work containing response buffer
+ * @err: error response code
+ */
+void set_smb2_rsp_status(struct ksmbd_work *work, __le32 err)
+{
+ struct smb2_hdr *rsp_hdr;
+
+ if (work->next_smb2_rcv_hdr_off)
+ rsp_hdr = ksmbd_resp_buf_next(work);
+ else
+ rsp_hdr = work->response_buf;
+ rsp_hdr->Status = err;
+ smb2_set_err_rsp(work);
+}
+
+/**
+ * init_smb2_neg_rsp() - initialize smb2 response for negotiate command
+ * @work: smb work containing smb request buffer
+ *
+ * smb2 negotiate response is sent in reply to an smb1 negotiate command for
+ * dialect auto-negotiation.
+ */
+int init_smb2_neg_rsp(struct ksmbd_work *work)
+{
+ struct smb2_hdr *rsp_hdr;
+ struct smb2_negotiate_rsp *rsp;
+ struct ksmbd_conn *conn = work->conn;
+
+ if (conn->need_neg == false)
+ return -EINVAL;
+ if (!(conn->dialect >= SMB20_PROT_ID &&
+ conn->dialect <= SMB311_PROT_ID))
+ return -EINVAL;
+
+ rsp_hdr = work->response_buf;
+
+ memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2);
+
+ rsp_hdr->smb2_buf_length =
+ cpu_to_be32(smb2_hdr_size_no_buflen(conn->vals));
+
+ rsp_hdr->ProtocolId = SMB2_PROTO_NUMBER;
+ rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE;
+ rsp_hdr->CreditRequest = cpu_to_le16(2);
+ rsp_hdr->Command = SMB2_NEGOTIATE;
+ rsp_hdr->Flags = (SMB2_FLAGS_SERVER_TO_REDIR);
+ rsp_hdr->NextCommand = 0;
+ rsp_hdr->MessageId = 0;
+ rsp_hdr->Id.SyncId.ProcessId = 0;
+ rsp_hdr->Id.SyncId.TreeId = 0;
+ rsp_hdr->SessionId = 0;
+ memset(rsp_hdr->Signature, 0, 16);
+
+ rsp = work->response_buf;
+
+ WARN_ON(ksmbd_conn_good(work));
+
+ rsp->StructureSize = cpu_to_le16(65);
+ ksmbd_debug(SMB, "conn->dialect 0x%x\n", conn->dialect);
+ rsp->DialectRevision = cpu_to_le16(conn->dialect);
+	/* Not setting conn guid rsp->ServerGUID, as it is
+	 * not used by the client for identifying the connection
+ */
+ rsp->Capabilities = cpu_to_le32(conn->vals->capabilities);
+ /* Default Max Message Size till SMB2.0, 64K*/
+ rsp->MaxTransactSize = cpu_to_le32(conn->vals->max_trans_size);
+ rsp->MaxReadSize = cpu_to_le32(conn->vals->max_read_size);
+ rsp->MaxWriteSize = cpu_to_le32(conn->vals->max_write_size);
+
+ rsp->SystemTime = cpu_to_le64(ksmbd_systime());
+ rsp->ServerStartTime = 0;
+
+ rsp->SecurityBufferOffset = cpu_to_le16(128);
+ rsp->SecurityBufferLength = cpu_to_le16(AUTH_GSS_LENGTH);
+ ksmbd_copy_gss_neg_header(((char *)(&rsp->hdr) +
+ sizeof(rsp->hdr.smb2_buf_length)) +
+ le16_to_cpu(rsp->SecurityBufferOffset));
+ inc_rfc1001_len(rsp, sizeof(struct smb2_negotiate_rsp) -
+ sizeof(struct smb2_hdr) - sizeof(rsp->Buffer) +
+ AUTH_GSS_LENGTH);
+ rsp->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED_LE;
+ if (server_conf.signing == KSMBD_CONFIG_OPT_MANDATORY)
+ rsp->SecurityMode |= SMB2_NEGOTIATE_SIGNING_REQUIRED_LE;
+ conn->use_spnego = true;
+
+ ksmbd_conn_set_need_negotiate(work);
+ return 0;
+}
+
+static int smb2_consume_credit_charge(struct ksmbd_work *work,
+ unsigned short credit_charge)
+{
+ struct ksmbd_conn *conn = work->conn;
+ unsigned int rsp_credits = 1;
+
+ if (!conn->total_credits)
+ return 0;
+
+ if (credit_charge > 0)
+ rsp_credits = credit_charge;
+
+ conn->total_credits -= rsp_credits;
+ return rsp_credits;
+}
+
+/**
+ * smb2_set_rsp_credits() - set number of credits in response buffer
+ * @work: smb work containing smb response buffer
+ */
+int smb2_set_rsp_credits(struct ksmbd_work *work)
+{
+ struct smb2_hdr *req_hdr = ksmbd_req_buf_next(work);
+ struct smb2_hdr *hdr = ksmbd_resp_buf_next(work);
+ struct ksmbd_conn *conn = work->conn;
+ unsigned short credits_requested = le16_to_cpu(req_hdr->CreditRequest);
+ unsigned short credit_charge = 1, credits_granted = 0;
+ unsigned short aux_max, aux_credits, min_credits;
+ int rsp_credit_charge;
+
+ if (hdr->Command == SMB2_CANCEL)
+ goto out;
+
+ /* get default minimum credits by shifting maximum credits by 4 */
+ min_credits = conn->max_credits >> 4;
+
+ if (conn->total_credits >= conn->max_credits) {
+ pr_err("Total credits overflow: %d\n", conn->total_credits);
+ conn->total_credits = min_credits;
+ }
+
+ rsp_credit_charge =
+ smb2_consume_credit_charge(work, le16_to_cpu(req_hdr->CreditCharge));
+ if (rsp_credit_charge < 0)
+ return -EINVAL;
+
+ hdr->CreditCharge = cpu_to_le16(rsp_credit_charge);
+
+ if (credits_requested > 0) {
+ aux_credits = credits_requested - 1;
+ aux_max = 32;
+ if (hdr->Command == SMB2_NEGOTIATE)
+ aux_max = 0;
+ aux_credits = (aux_credits < aux_max) ? aux_credits : aux_max;
+ credits_granted = aux_credits + credit_charge;
+
+		/* If the credits granted to this client would exceed the default
+		 * minimum credits, cap them within that limit.
+ */
+ if ((conn->total_credits + credits_granted) > min_credits)
+ credits_granted = min_credits - conn->total_credits;
+ /*
+		 * TODO: Need to adjust CreditRequest value according to
+ * current cpu load
+ */
+ } else if (conn->total_credits == 0) {
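+		/* Always grant at least one credit so the client is not starved */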
+ credits_granted = 1;
+ }
+
+ conn->total_credits += credits_granted;
+ work->credits_granted += credits_granted;
+
+ if (!req_hdr->NextCommand) {
+ /* Update CreditRequest in last request */
+ hdr->CreditRequest = cpu_to_le16(work->credits_granted);
+ }
+out:
+ ksmbd_debug(SMB,
+ "credits: requested[%d] granted[%d] total_granted[%d]\n",
+ credits_requested, credits_granted,
+ conn->total_credits);
+ return 0;
+}
+
+/**
+ * init_chained_smb2_rsp() - initialize smb2 chained response
+ * @work: smb work containing smb response buffer
+ */
+static void init_chained_smb2_rsp(struct ksmbd_work *work)
+{
+ struct smb2_hdr *req = ksmbd_req_buf_next(work);
+ struct smb2_hdr *rsp = ksmbd_resp_buf_next(work);
+ struct smb2_hdr *rsp_hdr;
+ struct smb2_hdr *rcv_hdr;
+ int next_hdr_offset = 0;
+ int len, new_len;
+
+ /* Len of this response = updated RFC len - offset of previous cmd
+ * in the compound rsp
+ */
+
+ /* Storing the current local FID which may be needed by subsequent
+ * command in the compound request
+ */
+ if (req->Command == SMB2_CREATE && rsp->Status == STATUS_SUCCESS) {
+ work->compound_fid =
+ le64_to_cpu(((struct smb2_create_rsp *)rsp)->
+ VolatileFileId);
+ work->compound_pfid =
+ le64_to_cpu(((struct smb2_create_rsp *)rsp)->
+ PersistentFileId);
+ work->compound_sid = le64_to_cpu(rsp->SessionId);
+ }
+
+ len = get_rfc1002_len(work->response_buf) - work->next_smb2_rsp_hdr_off;
+ next_hdr_offset = le32_to_cpu(req->NextCommand);
+
+ new_len = ALIGN(len, 8);
+ inc_rfc1001_len(work->response_buf, ((sizeof(struct smb2_hdr) - 4)
+ + new_len - len));
+ rsp->NextCommand = cpu_to_le32(new_len);
+
+ work->next_smb2_rcv_hdr_off += next_hdr_offset;
+ work->next_smb2_rsp_hdr_off += new_len;
+ ksmbd_debug(SMB,
+ "Compound req new_len = %d rcv off = %d rsp off = %d\n",
+ new_len, work->next_smb2_rcv_hdr_off,
+ work->next_smb2_rsp_hdr_off);
+
+ rsp_hdr = ksmbd_resp_buf_next(work);
+ rcv_hdr = ksmbd_req_buf_next(work);
+
+ if (!(rcv_hdr->Flags & SMB2_FLAGS_RELATED_OPERATIONS)) {
+ ksmbd_debug(SMB, "related flag should be set\n");
+ work->compound_fid = KSMBD_NO_FID;
+ work->compound_pfid = KSMBD_NO_FID;
+ }
+ memset((char *)rsp_hdr + 4, 0, sizeof(struct smb2_hdr) + 2);
+ rsp_hdr->ProtocolId = rcv_hdr->ProtocolId;
+ rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE;
+ rsp_hdr->Command = rcv_hdr->Command;
+
+ /*
+ * Message is response. We don't grant oplock yet.
+ */
+ rsp_hdr->Flags = (SMB2_FLAGS_SERVER_TO_REDIR |
+ SMB2_FLAGS_RELATED_OPERATIONS);
+ rsp_hdr->NextCommand = 0;
+ rsp_hdr->MessageId = rcv_hdr->MessageId;
+ rsp_hdr->Id.SyncId.ProcessId = rcv_hdr->Id.SyncId.ProcessId;
+ rsp_hdr->Id.SyncId.TreeId = rcv_hdr->Id.SyncId.TreeId;
+ rsp_hdr->SessionId = rcv_hdr->SessionId;
+ memcpy(rsp_hdr->Signature, rcv_hdr->Signature, 16);
+}
+
+/**
+ * is_chained_smb2_message() - check for chained command
+ * @work: smb work containing smb request buffer
+ *
+ * Return: true if chained request, otherwise false
+ */
+bool is_chained_smb2_message(struct ksmbd_work *work)
+{
+ struct smb2_hdr *hdr = work->request_buf;
+ unsigned int len;
+
+ if (hdr->ProtocolId != SMB2_PROTO_NUMBER)
+ return false;
+
+ hdr = ksmbd_req_buf_next(work);
+ if (le32_to_cpu(hdr->NextCommand) > 0) {
+ ksmbd_debug(SMB, "got SMB2 chained command\n");
+ init_chained_smb2_rsp(work);
+ return true;
+ } else if (work->next_smb2_rcv_hdr_off) {
+ /*
+ * This is last request in chained command,
+ * align response to 8 byte
+ */
+ len = ALIGN(get_rfc1002_len(work->response_buf), 8);
+ len = len - get_rfc1002_len(work->response_buf);
+ if (len) {
+ ksmbd_debug(SMB, "padding len %u\n", len);
+ inc_rfc1001_len(work->response_buf, len);
+ if (work->aux_payload_sz)
+ work->aux_payload_sz += len;
+ }
+ }
+ return false;
+}
+
+/**
+ * init_smb2_rsp_hdr() - initialize smb2 response
+ * @work: smb work containing smb request buffer
+ *
+ * Return: 0
+ */
+int init_smb2_rsp_hdr(struct ksmbd_work *work)
+{
+ struct smb2_hdr *rsp_hdr = work->response_buf;
+ struct smb2_hdr *rcv_hdr = work->request_buf;
+ struct ksmbd_conn *conn = work->conn;
+
+ memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2);
+ rsp_hdr->smb2_buf_length =
+ cpu_to_be32(smb2_hdr_size_no_buflen(conn->vals));
+ rsp_hdr->ProtocolId = rcv_hdr->ProtocolId;
+ rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE;
+ rsp_hdr->Command = rcv_hdr->Command;
+
+ /*
+ * Message is response. We don't grant oplock yet.
+ */
+ rsp_hdr->Flags = (SMB2_FLAGS_SERVER_TO_REDIR);
+ rsp_hdr->NextCommand = 0;
+ rsp_hdr->MessageId = rcv_hdr->MessageId;
+ rsp_hdr->Id.SyncId.ProcessId = rcv_hdr->Id.SyncId.ProcessId;
+ rsp_hdr->Id.SyncId.TreeId = rcv_hdr->Id.SyncId.TreeId;
+ rsp_hdr->SessionId = rcv_hdr->SessionId;
+ memcpy(rsp_hdr->Signature, rcv_hdr->Signature, 16);
+
+ work->syncronous = true;
+ if (work->async_id) {
+ ksmbd_release_id(&conn->async_ida, work->async_id);
+ work->async_id = 0;
+ }
+
+ return 0;
+}
+
+/**
+ * smb2_allocate_rsp_buf() - allocate smb2 response buffer
+ * @work: smb work containing smb request buffer
+ *
+ * Return: 0 on success, otherwise -ENOMEM
+ */
+int smb2_allocate_rsp_buf(struct ksmbd_work *work)
+{
+ struct smb2_hdr *hdr = work->request_buf;
+ size_t small_sz = MAX_CIFS_SMALL_BUFFER_SIZE;
+ size_t large_sz = work->conn->vals->max_trans_size + MAX_SMB2_HDR_SIZE;
+ size_t sz = small_sz;
+ int cmd = le16_to_cpu(hdr->Command);
+
+ if (cmd == SMB2_IOCTL_HE || cmd == SMB2_QUERY_DIRECTORY_HE)
+ sz = large_sz;
+
+ if (cmd == SMB2_QUERY_INFO_HE) {
+ struct smb2_query_info_req *req;
+
+ req = work->request_buf;
+ if (req->InfoType == SMB2_O_INFO_FILE &&
+ (req->FileInfoClass == FILE_FULL_EA_INFORMATION ||
+ req->FileInfoClass == FILE_ALL_INFORMATION))
+ sz = large_sz;
+ }
+
+ /* allocate large response buf for chained commands */
+ if (le32_to_cpu(hdr->NextCommand) > 0)
+ sz = large_sz;
+
+ work->response_buf = kvmalloc(sz, GFP_KERNEL | __GFP_ZERO);
+ if (!work->response_buf)
+ return -ENOMEM;
+
+ work->response_sz = sz;
+ return 0;
+}
+
+/**
+ * smb2_check_user_session() - check for valid session for a user
+ * @work: smb work containing smb request buffer
+ *
+ * Return: 0 on success, otherwise error
+ */
+int smb2_check_user_session(struct ksmbd_work *work)
+{
+ struct smb2_hdr *req_hdr = work->request_buf;
+ struct ksmbd_conn *conn = work->conn;
+ unsigned int cmd = conn->ops->get_cmd_val(work);
+ unsigned long long sess_id;
+
+ work->sess = NULL;
+ /*
+	 * SMB2_ECHO, SMB2_NEGOTIATE and SMB2_SESSION_SETUP commands do not
+	 * require a session id, so there is no need to validate the user
+	 * session for these commands.
+ */
+ if (cmd == SMB2_ECHO_HE || cmd == SMB2_NEGOTIATE_HE ||
+ cmd == SMB2_SESSION_SETUP_HE)
+ return 0;
+
+ if (!ksmbd_conn_good(work))
+ return -EINVAL;
+
+ sess_id = le64_to_cpu(req_hdr->SessionId);
+ /* Check for validity of user session */
+ work->sess = ksmbd_session_lookup_all(conn, sess_id);
+ if (work->sess)
+ return 1;
+ ksmbd_debug(SMB, "Invalid user session, Uid %llu\n", sess_id);
+ return -EINVAL;
+}
+
+static void destroy_previous_session(struct ksmbd_user *user, u64 id)
+{
+ struct ksmbd_session *prev_sess = ksmbd_session_lookup_slowpath(id);
+ struct ksmbd_user *prev_user;
+
+ if (!prev_sess)
+ return;
+
+ prev_user = prev_sess->user;
+
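+	/* Only destroy the previous session if the same user reconnects with the same credentials */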
+ if (!prev_user ||
+ strcmp(user->name, prev_user->name) ||
+ user->passkey_sz != prev_user->passkey_sz ||
+ memcmp(user->passkey, prev_user->passkey, user->passkey_sz)) {
+ put_session(prev_sess);
+ return;
+ }
+
+ put_session(prev_sess);
+ ksmbd_session_destroy(prev_sess);
+}
+
+/**
+ * smb2_get_name() - get filename string from the on-the-wire smb format
+ * @share: ksmbd_share_config pointer
+ * @src: source buffer
+ * @maxlen: maxlen of source string
+ * @local_nls: nls_table pointer
+ *
+ * Return: matching converted filename on success, otherwise error ptr
+ */
+static char *
+smb2_get_name(struct ksmbd_share_config *share, const char *src,
+ const int maxlen, struct nls_table *local_nls)
+{
+ char *name, *unixname;
+
+ name = smb_strndup_from_utf16(src, maxlen, 1, local_nls);
+ if (IS_ERR(name)) {
+ pr_err("failed to get name %ld\n", PTR_ERR(name));
+ return name;
+ }
+
+ /* change it to absolute unix name */
+ ksmbd_conv_path_to_unix(name);
+ ksmbd_strip_last_slash(name);
+
+ unixname = convert_to_unix_name(share, name);
+ kfree(name);
+ if (!unixname) {
+ pr_err("can not convert absolute name\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ ksmbd_debug(SMB, "absolute name = %s\n", unixname);
+ return unixname;
+}
+
+int setup_async_work(struct ksmbd_work *work, void (*fn)(void **), void **arg)
+{
+ struct smb2_hdr *rsp_hdr;
+ struct ksmbd_conn *conn = work->conn;
+ int id;
+
+ rsp_hdr = work->response_buf;
+ rsp_hdr->Flags |= SMB2_FLAGS_ASYNC_COMMAND;
+
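+ /* Allocate a connection-unique async id and switch this work
+ * item to asynchronous handling.
+ */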
+ id = ksmbd_acquire_async_msg_id(&conn->async_ida);
+ if (id < 0) {
+ pr_err("Failed to alloc async message id\n");
+ return id;
+ }
+ work->syncronous = false;
+ work->async_id = id;
+ rsp_hdr->Id.AsyncId = cpu_to_le64(id);
+
+ ksmbd_debug(SMB,
+ "Send interim Response to inform async request id : %d\n",
+ work->async_id);
+
+ work->cancel_fn = fn;
+ work->cancel_argv = arg;
+
+ if (list_empty(&work->async_request_entry)) {
+ spin_lock(&conn->request_lock);
+ list_add_tail(&work->async_request_entry, &conn->async_requests);
+ spin_unlock(&conn->request_lock);
+ }
+
+ return 0;
+}
+
+void smb2_send_interim_resp(struct ksmbd_work *work, __le32 status)
+{
+ struct smb2_hdr *rsp_hdr;
+
+ rsp_hdr = work->response_buf;
+ smb2_set_err_rsp(work);
+ rsp_hdr->Status = status;
+
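+ /* Send the interim error response now, then restore the status
+ * and multiRsp flag so the final response can still be built on
+ * the same buffer.
+ */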
+ work->multiRsp = 1;
+ ksmbd_conn_write(work);
+ rsp_hdr->Status = 0;
+ work->multiRsp = 0;
+}
+
+static __le32 smb2_get_reparse_tag_special_file(umode_t mode)
+{
+ if (S_ISDIR(mode) || S_ISREG(mode))
+ return 0;
+
+ if (S_ISLNK(mode))
+ return IO_REPARSE_TAG_LX_SYMLINK_LE;
+ else if (S_ISFIFO(mode))
+ return IO_REPARSE_TAG_LX_FIFO_LE;
+ else if (S_ISSOCK(mode))
+ return IO_REPARSE_TAG_AF_UNIX_LE;
+ else if (S_ISCHR(mode))
+ return IO_REPARSE_TAG_LX_CHR_LE;
+ else if (S_ISBLK(mode))
+ return IO_REPARSE_TAG_LX_BLK_LE;
+
+ return 0;
+}
+
+/**
+ * smb2_get_dos_mode() - get file mode in dos format from unix mode
+ * @stat: kstat containing file mode
+ * @attribute: attribute flags
+ *
+ * Return: converted dos mode
+ */
+static int smb2_get_dos_mode(struct kstat *stat, int attribute)
+{
+ int attr = 0;
+
+ if (S_ISDIR(stat->mode)) {
+ attr = ATTR_DIRECTORY |
+ (attribute & (ATTR_HIDDEN | ATTR_SYSTEM));
+ } else {
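+ /* For non-directories keep only the supported DOS attribute bits
+ * from the request and always report ARCHIVE.
+ */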
+ attr = (attribute & 0x00005137) | ATTR_ARCHIVE;
+ attr &= ~(ATTR_DIRECTORY);
+ if (S_ISREG(stat->mode) && (server_conf.share_fake_fscaps &
+ FILE_SUPPORTS_SPARSE_FILES))
+ attr |= ATTR_SPARSE;
+
+ if (smb2_get_reparse_tag_special_file(stat->mode))
+ attr |= ATTR_REPARSE;
+ }
+
+ return attr;
+}
+
+static void build_preauth_ctxt(struct smb2_preauth_neg_context *pneg_ctxt,
+ __le16 hash_id)
+{
+ pneg_ctxt->ContextType = SMB2_PREAUTH_INTEGRITY_CAPABILITIES;
+ pneg_ctxt->DataLength = cpu_to_le16(38);
+ pneg_ctxt->HashAlgorithmCount = cpu_to_le16(1);
+ pneg_ctxt->Reserved = cpu_to_le32(0);
+ pneg_ctxt->SaltLength = cpu_to_le16(SMB311_SALT_SIZE);
+ get_random_bytes(pneg_ctxt->Salt, SMB311_SALT_SIZE);
+ pneg_ctxt->HashAlgorithms = hash_id;
+}
+
+static void build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt,
+ __le16 cipher_type)
+{
+ pneg_ctxt->ContextType = SMB2_ENCRYPTION_CAPABILITIES;
+ pneg_ctxt->DataLength = cpu_to_le16(4);
+ pneg_ctxt->Reserved = cpu_to_le32(0);
+ pneg_ctxt->CipherCount = cpu_to_le16(1);
+ pneg_ctxt->Ciphers[0] = cipher_type;
+}
+
+static void build_compression_ctxt(struct smb2_compression_ctx *pneg_ctxt,
+ __le16 comp_algo)
+{
+ pneg_ctxt->ContextType = SMB2_COMPRESSION_CAPABILITIES;
+ pneg_ctxt->DataLength =
+ cpu_to_le16(sizeof(struct smb2_compression_ctx)
+ - sizeof(struct smb2_neg_context));
+ pneg_ctxt->Reserved = cpu_to_le32(0);
+ pneg_ctxt->CompressionAlgorithmCount = cpu_to_le16(1);
+ pneg_ctxt->Reserved1 = cpu_to_le32(0);
+ pneg_ctxt->CompressionAlgorithms[0] = comp_algo;
+}
+
+static void build_sign_cap_ctxt(struct smb2_signing_capabilities *pneg_ctxt,
+ __le16 sign_algo)
+{
+ pneg_ctxt->ContextType = SMB2_SIGNING_CAPABILITIES;
+ pneg_ctxt->DataLength =
+ cpu_to_le16((sizeof(struct smb2_signing_capabilities) + 2)
+ - sizeof(struct smb2_neg_context));
+ pneg_ctxt->Reserved = cpu_to_le32(0);
+ pneg_ctxt->SigningAlgorithmCount = cpu_to_le16(1);
+ pneg_ctxt->SigningAlgorithms[0] = sign_algo;
+}
+
+static void build_posix_ctxt(struct smb2_posix_neg_context *pneg_ctxt)
+{
+ pneg_ctxt->ContextType = SMB2_POSIX_EXTENSIONS_AVAILABLE;
+ pneg_ctxt->DataLength = cpu_to_le16(POSIX_CTXT_DATA_LEN);
+ /* SMB2_CREATE_TAG_POSIX is "0x93AD25509CB411E7B42383DE968BCD7C" */
+ pneg_ctxt->Name[0] = 0x93;
+ pneg_ctxt->Name[1] = 0xAD;
+ pneg_ctxt->Name[2] = 0x25;
+ pneg_ctxt->Name[3] = 0x50;
+ pneg_ctxt->Name[4] = 0x9C;
+ pneg_ctxt->Name[5] = 0xB4;
+ pneg_ctxt->Name[6] = 0x11;
+ pneg_ctxt->Name[7] = 0xE7;
+ pneg_ctxt->Name[8] = 0xB4;
+ pneg_ctxt->Name[9] = 0x23;
+ pneg_ctxt->Name[10] = 0x83;
+ pneg_ctxt->Name[11] = 0xDE;
+ pneg_ctxt->Name[12] = 0x96;
+ pneg_ctxt->Name[13] = 0x8B;
+ pneg_ctxt->Name[14] = 0xCD;
+ pneg_ctxt->Name[15] = 0x7C;
+}
+
+static void assemble_neg_contexts(struct ksmbd_conn *conn,
+ struct smb2_negotiate_rsp *rsp)
+{
+ /* +4 is to account for the RFC1001 len field */
+ char *pneg_ctxt = (char *)rsp +
+ le32_to_cpu(rsp->NegotiateContextOffset) + 4;
+ int neg_ctxt_cnt = 1;
+ int ctxt_size;
+
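+ /* The preauth integrity context is always sent first; additional
+ * contexts are appended 8-byte aligned and ctxt_size accumulates
+ * the total length added to the RFC1001 length below.
+ */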
+ ksmbd_debug(SMB,
+ "assemble SMB2_PREAUTH_INTEGRITY_CAPABILITIES context\n");
+ build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt,
+ conn->preauth_info->Preauth_HashId);
+ rsp->NegotiateContextCount = cpu_to_le16(neg_ctxt_cnt);
+ inc_rfc1001_len(rsp, AUTH_GSS_PADDING);
+ ctxt_size = sizeof(struct smb2_preauth_neg_context);
+ /* Round to 8 byte boundary */
+ pneg_ctxt += round_up(sizeof(struct smb2_preauth_neg_context), 8);
+
+ if (conn->cipher_type) {
+ ctxt_size = round_up(ctxt_size, 8);
+ ksmbd_debug(SMB,
+ "assemble SMB2_ENCRYPTION_CAPABILITIES context\n");
+ build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt,
+ conn->cipher_type);
+ rsp->NegotiateContextCount = cpu_to_le16(++neg_ctxt_cnt);
+ ctxt_size += sizeof(struct smb2_encryption_neg_context) + 2;
+ /* Round to 8 byte boundary */
+ pneg_ctxt +=
+ round_up(sizeof(struct smb2_encryption_neg_context) + 2,
+ 8);
+ }
+
+ if (conn->compress_algorithm) {
+ ctxt_size = round_up(ctxt_size, 8);
+ ksmbd_debug(SMB,
+ "assemble SMB2_COMPRESSION_CAPABILITIES context\n");
+ /* Temporarily set to SMB3_COMPRESS_NONE */
+ build_compression_ctxt((struct smb2_compression_ctx *)pneg_ctxt,
+ conn->compress_algorithm);
+ rsp->NegotiateContextCount = cpu_to_le16(++neg_ctxt_cnt);
+ ctxt_size += sizeof(struct smb2_compression_ctx) + 2;
+ /* Round to 8 byte boundary */
+ pneg_ctxt += round_up(sizeof(struct smb2_compression_ctx) + 2,
+ 8);
+ }
+
+ if (conn->posix_ext_supported) {
+ ctxt_size = round_up(ctxt_size, 8);
+ ksmbd_debug(SMB,
+ "assemble SMB2_POSIX_EXTENSIONS_AVAILABLE context\n");
+ build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt);
+ rsp->NegotiateContextCount = cpu_to_le16(++neg_ctxt_cnt);
+ ctxt_size += sizeof(struct smb2_posix_neg_context);
+ /* Round to 8 byte boundary */
+ pneg_ctxt += round_up(sizeof(struct smb2_posix_neg_context), 8);
+ }
+
+ if (conn->signing_negotiated) {
+ ctxt_size = round_up(ctxt_size, 8);
+ ksmbd_debug(SMB,
+ "assemble SMB2_SIGNING_CAPABILITIES context\n");
+ build_sign_cap_ctxt((struct smb2_signing_capabilities *)pneg_ctxt,
+ conn->signing_algorithm);
+ rsp->NegotiateContextCount = cpu_to_le16(++neg_ctxt_cnt);
+ ctxt_size += sizeof(struct smb2_signing_capabilities) + 2;
+ }
+
+ inc_rfc1001_len(rsp, ctxt_size);
+}
+
+static __le32 decode_preauth_ctxt(struct ksmbd_conn *conn,
+ struct smb2_preauth_neg_context *pneg_ctxt)
+{
+ __le32 err = STATUS_NO_PREAUTH_INTEGRITY_HASH_OVERLAP;
+
+ if (pneg_ctxt->HashAlgorithms == SMB2_PREAUTH_INTEGRITY_SHA512) {
+ conn->preauth_info->Preauth_HashId =
+ SMB2_PREAUTH_INTEGRITY_SHA512;
+ err = STATUS_SUCCESS;
+ }
+
+ return err;
+}
+
+static void decode_encrypt_ctxt(struct ksmbd_conn *conn,
+ struct smb2_encryption_neg_context *pneg_ctxt,
+ int len_of_ctxts)
+{
+ int cph_cnt = le16_to_cpu(pneg_ctxt->CipherCount);
+ int i, cphs_size = cph_cnt * sizeof(__le16);
+
+ conn->cipher_type = 0;
+
+ if (sizeof(struct smb2_encryption_neg_context) + cphs_size >
+ len_of_ctxts) {
+ pr_err("Invalid cipher count(%d)\n", cph_cnt);
+ return;
+ }
+
+ if (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION))
+ return;
+
+ for (i = 0; i < cph_cnt; i++) {
+ if (pneg_ctxt->Ciphers[i] == SMB2_ENCRYPTION_AES128_GCM ||
+ pneg_ctxt->Ciphers[i] == SMB2_ENCRYPTION_AES128_CCM ||
+ pneg_ctxt->Ciphers[i] == SMB2_ENCRYPTION_AES256_CCM ||
+ pneg_ctxt->Ciphers[i] == SMB2_ENCRYPTION_AES256_GCM) {
+ ksmbd_debug(SMB, "Cipher ID = 0x%x\n",
+ pneg_ctxt->Ciphers[i]);
+ conn->cipher_type = pneg_ctxt->Ciphers[i];
+ break;
+ }
+ }
+}
+
+static void decode_compress_ctxt(struct ksmbd_conn *conn,
+ struct smb2_compression_ctx *pneg_ctxt)
+{
+ conn->compress_algorithm = SMB3_COMPRESS_NONE;
+}
+
+static void decode_sign_cap_ctxt(struct ksmbd_conn *conn,
+ struct smb2_signing_capabilities *pneg_ctxt,
+ int len_of_ctxts)
+{
+ int sign_algo_cnt = le16_to_cpu(pneg_ctxt->SigningAlgorithmCount);
+ int i, sign_alos_size = sign_algo_cnt * sizeof(__le16);
+
+ conn->signing_negotiated = false;
+
+ if (sizeof(struct smb2_signing_capabilities) + sign_alos_size >
+ len_of_ctxts) {
+ pr_err("Invalid signing algorithm count(%d)\n", sign_algo_cnt);
+ return;
+ }
+
+ for (i = 0; i < sign_algo_cnt; i++) {
+ if (pneg_ctxt->SigningAlgorithms[i] == SIGNING_ALG_HMAC_SHA256 ||
+ pneg_ctxt->SigningAlgorithms[i] == SIGNING_ALG_AES_CMAC) {
+ ksmbd_debug(SMB, "Signing Algorithm ID = 0x%x\n",
+ pneg_ctxt->SigningAlgorithms[i]);
+ conn->signing_negotiated = true;
+ conn->signing_algorithm =
+ pneg_ctxt->SigningAlgorithms[i];
+ break;
+ }
+ }
+}
+
+static __le32 deassemble_neg_contexts(struct ksmbd_conn *conn,
+ struct smb2_negotiate_req *req)
+{
+ /* +4 is to account for the RFC1001 len field */
+ struct smb2_neg_context *pctx = (struct smb2_neg_context *)((char *)req + 4);
+ int i = 0, len_of_ctxts;
+ int offset = le32_to_cpu(req->NegotiateContextOffset);
+ int neg_ctxt_cnt = le16_to_cpu(req->NegotiateContextCount);
+ int len_of_smb = be32_to_cpu(req->hdr.smb2_buf_length);
+ __le32 status = STATUS_INVALID_PARAMETER;
+
+ ksmbd_debug(SMB, "decoding %d negotiate contexts\n", neg_ctxt_cnt);
+ if (len_of_smb <= offset) {
+ ksmbd_debug(SMB, "Invalid response: negotiate context offset\n");
+ return status;
+ }
+
+ len_of_ctxts = len_of_smb - offset;
+
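+ /* Walk the client's negotiate contexts; each entry starts at an
+ * 8-byte aligned offset from the previous one.
+ */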
+ while (i++ < neg_ctxt_cnt) {
+ int clen;
+
+ /* check that offset is not beyond end of SMB */
+ if (len_of_ctxts == 0)
+ break;
+
+ if (len_of_ctxts < sizeof(struct smb2_neg_context))
+ break;
+
+ pctx = (struct smb2_neg_context *)((char *)pctx + offset);
+ clen = le16_to_cpu(pctx->DataLength);
+ if (clen + sizeof(struct smb2_neg_context) > len_of_ctxts)
+ break;
+
+ if (pctx->ContextType == SMB2_PREAUTH_INTEGRITY_CAPABILITIES) {
+ ksmbd_debug(SMB,
+ "deassemble SMB2_PREAUTH_INTEGRITY_CAPABILITIES context\n");
+ if (conn->preauth_info->Preauth_HashId)
+ break;
+
+ status = decode_preauth_ctxt(conn,
+ (struct smb2_preauth_neg_context *)pctx);
+ if (status != STATUS_SUCCESS)
+ break;
+ } else if (pctx->ContextType == SMB2_ENCRYPTION_CAPABILITIES) {
+ ksmbd_debug(SMB,
+ "deassemble SMB2_ENCRYPTION_CAPABILITIES context\n");
+ if (conn->cipher_type)
+ break;
+
+ decode_encrypt_ctxt(conn,
+ (struct smb2_encryption_neg_context *)pctx,
+ len_of_ctxts);
+ } else if (pctx->ContextType == SMB2_COMPRESSION_CAPABILITIES) {
+ ksmbd_debug(SMB,
+ "deassemble SMB2_COMPRESSION_CAPABILITIES context\n");
+ if (conn->compress_algorithm)
+ break;
+
+ decode_compress_ctxt(conn,
+ (struct smb2_compression_ctx *)pctx);
+ } else if (pctx->ContextType == SMB2_NETNAME_NEGOTIATE_CONTEXT_ID) {
+ ksmbd_debug(SMB,
+ "deassemble SMB2_NETNAME_NEGOTIATE_CONTEXT_ID context\n");
+ } else if (pctx->ContextType == SMB2_POSIX_EXTENSIONS_AVAILABLE) {
+ ksmbd_debug(SMB,
+ "deassemble SMB2_POSIX_EXTENSIONS_AVAILABLE context\n");
+ conn->posix_ext_supported = true;
+ } else if (pctx->ContextType == SMB2_SIGNING_CAPABILITIES) {
+ ksmbd_debug(SMB,
+ "deassemble SMB2_SIGNING_CAPABILITIES context\n");
+ decode_sign_cap_ctxt(conn,
+ (struct smb2_signing_capabilities *)pctx,
+ len_of_ctxts);
+ }
+
+ /* offsets must be 8 byte aligned */
+ clen = (clen + 7) & ~0x7;
+ offset = clen + sizeof(struct smb2_neg_context);
+ len_of_ctxts -= clen + sizeof(struct smb2_neg_context);
+ }
+ return status;
+}
+
+/**
+ * smb2_handle_negotiate() - handler for smb2 negotiate command
+ * @work: smb work containing smb request buffer
+ *
+ * Return: 0 on success, otherwise error
+ */
+int smb2_handle_negotiate(struct ksmbd_work *work)
+{
+ struct ksmbd_conn *conn = work->conn;
+ struct smb2_negotiate_req *req = work->request_buf;
+ struct smb2_negotiate_rsp *rsp = work->response_buf;
+ int rc = 0;
+ __le32 status;
+
+ ksmbd_debug(SMB, "Received negotiate request\n");
+ conn->need_neg = false;
+ if (ksmbd_conn_good(work)) {
+ pr_err("conn->tcp_status is already in CifsGood State\n");
+ work->send_no_response = 1;
+ return rc;
+ }
+
+ if (req->DialectCount == 0) {
+ pr_err("malformed packet\n");
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ rc = -EINVAL;
+ goto err_out;
+ }
+
+ conn->cli_cap = le32_to_cpu(req->Capabilities);
+ switch (conn->dialect) {
+ case SMB311_PROT_ID:
+ conn->preauth_info =
+ kzalloc(sizeof(struct preauth_integrity_info),
+ GFP_KERNEL);
+ if (!conn->preauth_info) {
+ rc = -ENOMEM;
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ goto err_out;
+ }
+
+ status = deassemble_neg_contexts(conn, req);
+ if (status != STATUS_SUCCESS) {
+ pr_err("deassemble_neg_contexts error(0x%x)\n",
+ status);
+ rsp->hdr.Status = status;
+ rc = -EINVAL;
+ goto err_out;
+ }
+
+ rc = init_smb3_11_server(conn);
+ if (rc < 0) {
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ goto err_out;
+ }
+
+ ksmbd_gen_preauth_integrity_hash(conn,
+ work->request_buf,
+ conn->preauth_info->Preauth_HashValue);
+ rsp->NegotiateContextOffset =
+ cpu_to_le32(OFFSET_OF_NEG_CONTEXT);
+ assemble_neg_contexts(conn, rsp);
+ break;
+ case SMB302_PROT_ID:
+ init_smb3_02_server(conn);
+ break;
+ case SMB30_PROT_ID:
+ init_smb3_0_server(conn);
+ break;
+ case SMB21_PROT_ID:
+ init_smb2_1_server(conn);
+ break;
+ case SMB20_PROT_ID:
+ rc = init_smb2_0_server(conn);
+ if (rc) {
+ rsp->hdr.Status = STATUS_NOT_SUPPORTED;
+ goto err_out;
+ }
+ break;
+ case SMB2X_PROT_ID:
+ case BAD_PROT_ID:
+ default:
+ ksmbd_debug(SMB, "Server dialect :0x%x not supported\n",
+ conn->dialect);
+ rsp->hdr.Status = STATUS_NOT_SUPPORTED;
+ rc = -EINVAL;
+ goto err_out;
+ }
+ rsp->Capabilities = cpu_to_le32(conn->vals->capabilities);
+
+ /* For stats */
+ conn->connection_type = conn->dialect;
+
+ rsp->MaxTransactSize = cpu_to_le32(conn->vals->max_trans_size);
+ rsp->MaxReadSize = cpu_to_le32(conn->vals->max_read_size);
+ rsp->MaxWriteSize = cpu_to_le32(conn->vals->max_write_size);
+
+ if (conn->dialect > SMB20_PROT_ID) {
+ memcpy(conn->ClientGUID, req->ClientGUID,
+ SMB2_CLIENT_GUID_SIZE);
+ conn->cli_sec_mode = le16_to_cpu(req->SecurityMode);
+ }
+
+ rsp->StructureSize = cpu_to_le16(65);
+ rsp->DialectRevision = cpu_to_le16(conn->dialect);
+ /* Not setting the conn guid in rsp->ServerGUID, as it is
+ * not used by the client to identify the server
+ */
+ memset(rsp->ServerGUID, 0, SMB2_CLIENT_GUID_SIZE);
+
+ rsp->SystemTime = cpu_to_le64(ksmbd_systime());
+ rsp->ServerStartTime = 0;
+ ksmbd_debug(SMB, "negotiate context offset %d, count %d\n",
+ le32_to_cpu(rsp->NegotiateContextOffset),
+ le16_to_cpu(rsp->NegotiateContextCount));
+
+ rsp->SecurityBufferOffset = cpu_to_le16(128);
+ rsp->SecurityBufferLength = cpu_to_le16(AUTH_GSS_LENGTH);
+ ksmbd_copy_gss_neg_header(((char *)(&rsp->hdr) +
+ sizeof(rsp->hdr.smb2_buf_length)) +
+ le16_to_cpu(rsp->SecurityBufferOffset));
+ inc_rfc1001_len(rsp, sizeof(struct smb2_negotiate_rsp) -
+ sizeof(struct smb2_hdr) - sizeof(rsp->Buffer) +
+ AUTH_GSS_LENGTH);
+ rsp->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED_LE;
+ conn->use_spnego = true;
+
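+ /* Apply the server signing policy: mandatory signing advertises
+ * the REQUIRED flag, otherwise signing is enabled only when the
+ * client requires it.
+ */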
+ if ((server_conf.signing == KSMBD_CONFIG_OPT_AUTO ||
+ server_conf.signing == KSMBD_CONFIG_OPT_DISABLED) &&
+ req->SecurityMode & SMB2_NEGOTIATE_SIGNING_REQUIRED_LE)
+ conn->sign = true;
+ else if (server_conf.signing == KSMBD_CONFIG_OPT_MANDATORY) {
+ server_conf.enforced_signing = true;
+ rsp->SecurityMode |= SMB2_NEGOTIATE_SIGNING_REQUIRED_LE;
+ conn->sign = true;
+ }
+
+ conn->srv_sec_mode = le16_to_cpu(rsp->SecurityMode);
+ ksmbd_conn_set_need_negotiate(work);
+
+err_out:
+ if (rc < 0)
+ smb2_set_err_rsp(work);
+
+ return rc;
+}
+
+static int alloc_preauth_hash(struct ksmbd_session *sess,
+ struct ksmbd_conn *conn)
+{
+ if (sess->Preauth_HashValue)
+ return 0;
+
+ sess->Preauth_HashValue = kmemdup(conn->preauth_info->Preauth_HashValue,
+ PREAUTH_HASHVALUE_SIZE, GFP_KERNEL);
+ if (!sess->Preauth_HashValue)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int generate_preauth_hash(struct ksmbd_work *work)
+{
+ struct ksmbd_conn *conn = work->conn;
+ struct ksmbd_session *sess = work->sess;
+ u8 *preauth_hash;
+
+ if (conn->dialect != SMB311_PROT_ID)
+ return 0;
+
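+ /* For a binding session the running preauth hash is kept in the
+ * per-connection preauth session, otherwise in the session itself.
+ */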
+ if (conn->binding) {
+ struct preauth_session *preauth_sess;
+
+ preauth_sess = ksmbd_preauth_session_lookup(conn, sess->id);
+ if (!preauth_sess) {
+ preauth_sess = ksmbd_preauth_session_alloc(conn, sess->id);
+ if (!preauth_sess)
+ return -ENOMEM;
+ }
+
+ preauth_hash = preauth_sess->Preauth_HashValue;
+ } else {
+ if (!sess->Preauth_HashValue)
+ if (alloc_preauth_hash(sess, conn))
+ return -ENOMEM;
+ preauth_hash = sess->Preauth_HashValue;
+ }
+
+ ksmbd_gen_preauth_integrity_hash(conn, work->request_buf, preauth_hash);
+ return 0;
+}
+
+static int decode_negotiation_token(struct ksmbd_work *work,
+ struct negotiate_message *negblob)
+{
+ struct ksmbd_conn *conn = work->conn;
+ struct smb2_sess_setup_req *req;
+ int sz;
+
+ if (!conn->use_spnego)
+ return -EINVAL;
+
+ req = work->request_buf;
+ sz = le16_to_cpu(req->SecurityBufferLength);
+
+ if (ksmbd_decode_negTokenInit((char *)negblob, sz, conn)) {
+ if (ksmbd_decode_negTokenTarg((char *)negblob, sz, conn)) {
+ conn->auth_mechs |= KSMBD_AUTH_NTLMSSP;
+ conn->preferred_auth_mech = KSMBD_AUTH_NTLMSSP;
+ conn->use_spnego = false;
+ }
+ }
+ return 0;
+}
+
+static int ntlm_negotiate(struct ksmbd_work *work,
+ struct negotiate_message *negblob)
+{
+ struct smb2_sess_setup_req *req = work->request_buf;
+ struct smb2_sess_setup_rsp *rsp = work->response_buf;
+ struct challenge_message *chgblob;
+ unsigned char *spnego_blob = NULL;
+ u16 spnego_blob_len;
+ char *neg_blob;
+ int sz, rc;
+
+ ksmbd_debug(SMB, "negotiate phase\n");
+ sz = le16_to_cpu(req->SecurityBufferLength);
+ rc = ksmbd_decode_ntlmssp_neg_blob(negblob, sz, work->sess);
+ if (rc)
+ return rc;
+
+ sz = le16_to_cpu(rsp->SecurityBufferOffset);
+ chgblob =
+ (struct challenge_message *)((char *)&rsp->hdr.ProtocolId + sz);
+ memset(chgblob, 0, sizeof(struct challenge_message));
+
+ if (!work->conn->use_spnego) {
+ sz = ksmbd_build_ntlmssp_challenge_blob(chgblob, work->sess);
+ if (sz < 0)
+ return -ENOMEM;
+
+ rsp->SecurityBufferLength = cpu_to_le16(sz);
+ return 0;
+ }
+
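+ /* SPNEGO is in use: build the raw NTLMSSP challenge into a
+ * scratch buffer, wrap it in a negTokenTarg blob and copy the
+ * result into the response security buffer.
+ */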
+ sz = sizeof(struct challenge_message);
+ sz += (strlen(ksmbd_netbios_name()) * 2 + 1 + 4) * 6;
+
+ neg_blob = kzalloc(sz, GFP_KERNEL);
+ if (!neg_blob)
+ return -ENOMEM;
+
+ chgblob = (struct challenge_message *)neg_blob;
+ sz = ksmbd_build_ntlmssp_challenge_blob(chgblob, work->sess);
+ if (sz < 0) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ rc = build_spnego_ntlmssp_neg_blob(&spnego_blob, &spnego_blob_len,
+ neg_blob, sz);
+ if (rc) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ sz = le16_to_cpu(rsp->SecurityBufferOffset);
+ memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len);
+ rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len);
+
+out:
+ kfree(spnego_blob);
+ kfree(neg_blob);
+ return rc;
+}
+
+static struct authenticate_message *user_authblob(struct ksmbd_conn *conn,
+ struct smb2_sess_setup_req *req)
+{
+ int sz;
+
+ if (conn->use_spnego && conn->mechToken)
+ return (struct authenticate_message *)conn->mechToken;
+
+ sz = le16_to_cpu(req->SecurityBufferOffset);
+ return (struct authenticate_message *)((char *)&req->hdr.ProtocolId
+ + sz);
+}
+
+static struct ksmbd_user *session_user(struct ksmbd_conn *conn,
+ struct smb2_sess_setup_req *req)
+{
+ struct authenticate_message *authblob;
+ struct ksmbd_user *user;
+ char *name;
+ int sz;
+
+ authblob = user_authblob(conn, req);
+ sz = le32_to_cpu(authblob->UserName.BufferOffset);
+ name = smb_strndup_from_utf16((const char *)authblob + sz,
+ le16_to_cpu(authblob->UserName.Length),
+ true,
+ conn->local_nls);
+ if (IS_ERR(name)) {
+ pr_err("cannot allocate memory\n");
+ return NULL;
+ }
+
+ ksmbd_debug(SMB, "session setup request for user %s\n", name);
+ user = ksmbd_login_user(name);
+ kfree(name);
+ return user;
+}
+
+static int ntlm_authenticate(struct ksmbd_work *work)
+{
+ struct smb2_sess_setup_req *req = work->request_buf;
+ struct smb2_sess_setup_rsp *rsp = work->response_buf;
+ struct ksmbd_conn *conn = work->conn;
+ struct ksmbd_session *sess = work->sess;
+ struct channel *chann = NULL;
+ struct ksmbd_user *user;
+ u64 prev_id;
+ int sz, rc;
+
+ ksmbd_debug(SMB, "authenticate phase\n");
+ if (conn->use_spnego) {
+ unsigned char *spnego_blob;
+ u16 spnego_blob_len;
+
+ rc = build_spnego_ntlmssp_auth_blob(&spnego_blob,
+ &spnego_blob_len,
+ 0);
+ if (rc)
+ return -ENOMEM;
+
+ sz = le16_to_cpu(rsp->SecurityBufferOffset);
+ memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len);
+ rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len);
+ kfree(spnego_blob);
+ inc_rfc1001_len(rsp, spnego_blob_len - 1);
+ }
+
+ user = session_user(conn, req);
+ if (!user) {
+ ksmbd_debug(SMB, "Unknown user name or an error\n");
+ return -EPERM;
+ }
+
+ /* Check for previous session */
+ prev_id = le64_to_cpu(req->PreviousSessionId);
+ if (prev_id && prev_id != sess->id)
+ destroy_previous_session(user, prev_id);
+
+ if (sess->state == SMB2_SESSION_VALID) {
+ /*
+ * Reuse the session if an anonymous user tries to connect
+ * on reauthentication.
+ */
+ if (ksmbd_anonymous_user(user)) {
+ ksmbd_free_user(user);
+ return 0;
+ }
+ ksmbd_free_user(sess->user);
+ }
+
+ sess->user = user;
+ if (user_guest(sess->user)) {
+ if (conn->sign) {
+ ksmbd_debug(SMB, "Guest login not allowed when signing enabled\n");
+ return -EPERM;
+ }
+
+ rsp->SessionFlags = SMB2_SESSION_FLAG_IS_GUEST_LE;
+ } else {
+ struct authenticate_message *authblob;
+
+ authblob = user_authblob(conn, req);
+ sz = le16_to_cpu(req->SecurityBufferLength);
+ rc = ksmbd_decode_ntlmssp_auth_blob(authblob, sz, sess);
+ if (rc) {
+ set_user_flag(sess->user, KSMBD_USER_FLAG_BAD_PASSWORD);
+ ksmbd_debug(SMB, "authentication failed\n");
+ return -EPERM;
+ }
+
+ /*
+ * If the session state is SMB2_SESSION_VALID, we can assume
+ * this is a reauthentication and the user/password has
+ * already been verified, so return here.
+ */
+ if (sess->state == SMB2_SESSION_VALID) {
+ if (conn->binding)
+ goto binding_session;
+ return 0;
+ }
+
+ if ((conn->sign || server_conf.enforced_signing) ||
+ (req->SecurityMode & SMB2_NEGOTIATE_SIGNING_REQUIRED))
+ sess->sign = true;
+
+ if (conn->vals->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION &&
+ conn->ops->generate_encryptionkey &&
+ !(req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) {
+ rc = conn->ops->generate_encryptionkey(sess);
+ if (rc) {
+ ksmbd_debug(SMB,
+ "SMB3 encryption key generation failed\n");
+ return -EINVAL;
+ }
+ sess->enc = true;
+ rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE;
+ /*
+ * signing is disabled if encryption is enabled
+ * on this session
+ */
+ sess->sign = false;
+ }
+ }
+
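+ /* For SMB 3.0 and later, make sure this connection is linked into
+ * the session's channel list before generating the signing key.
+ */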
+binding_session:
+ if (conn->dialect >= SMB30_PROT_ID) {
+ chann = lookup_chann_list(sess, conn);
+ if (!chann) {
+ chann = kmalloc(sizeof(struct channel), GFP_KERNEL);
+ if (!chann)
+ return -ENOMEM;
+
+ chann->conn = conn;
+ INIT_LIST_HEAD(&chann->chann_list);
+ list_add(&chann->chann_list, &sess->ksmbd_chann_list);
+ }
+ }
+
+ if (conn->ops->generate_signingkey) {
+ rc = conn->ops->generate_signingkey(sess, conn);
+ if (rc) {
+ ksmbd_debug(SMB, "SMB3 signing key generation failed\n");
+ return -EINVAL;
+ }
+ }
+
+ if (conn->dialect > SMB20_PROT_ID) {
+ if (!ksmbd_conn_lookup_dialect(conn)) {
+ pr_err("fail to verify the dialect\n");
+ return -ENOENT;
+ }
+ }
+ return 0;
+}
+
+#ifdef CONFIG_SMB_SERVER_KERBEROS5
+static int krb5_authenticate(struct ksmbd_work *work)
+{
+ struct smb2_sess_setup_req *req = work->request_buf;
+ struct smb2_sess_setup_rsp *rsp = work->response_buf;
+ struct ksmbd_conn *conn = work->conn;
+ struct ksmbd_session *sess = work->sess;
+ char *in_blob, *out_blob;
+ struct channel *chann = NULL;
+ u64 prev_sess_id;
+ int in_len, out_len;
+ int retval;
+
+ in_blob = (char *)&req->hdr.ProtocolId +
+ le16_to_cpu(req->SecurityBufferOffset);
+ in_len = le16_to_cpu(req->SecurityBufferLength);
+ out_blob = (char *)&rsp->hdr.ProtocolId +
+ le16_to_cpu(rsp->SecurityBufferOffset);
+ out_len = work->response_sz -
+ offsetof(struct smb2_hdr, smb2_buf_length) -
+ le16_to_cpu(rsp->SecurityBufferOffset);
+
+ /* Check previous session */
+ prev_sess_id = le64_to_cpu(req->PreviousSessionId);
+ if (prev_sess_id && prev_sess_id != sess->id)
+ destroy_previous_session(sess->user, prev_sess_id);
+
+ if (sess->state == SMB2_SESSION_VALID)
+ ksmbd_free_user(sess->user);
+
+ retval = ksmbd_krb5_authenticate(sess, in_blob, in_len,
+ out_blob, &out_len);
+ if (retval) {
+ ksmbd_debug(SMB, "krb5 authentication failed\n");
+ return -EINVAL;
+ }
+ rsp->SecurityBufferLength = cpu_to_le16(out_len);
+ inc_rfc1001_len(rsp, out_len - 1);
+
+ if ((conn->sign || server_conf.enforced_signing) ||
+ (req->SecurityMode & SMB2_NEGOTIATE_SIGNING_REQUIRED))
+ sess->sign = true;
+
+ if ((conn->vals->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION) &&
+ conn->ops->generate_encryptionkey) {
+ retval = conn->ops->generate_encryptionkey(sess);
+ if (retval) {
+ ksmbd_debug(SMB,
+ "SMB3 encryption key generation failed\n");
+ return -EINVAL;
+ }
+ sess->enc = true;
+ rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE;
+ sess->sign = false;
+ }
+
+ if (conn->dialect >= SMB30_PROT_ID) {
+ chann = lookup_chann_list(sess, conn);
+ if (!chann) {
+ chann = kmalloc(sizeof(struct channel), GFP_KERNEL);
+ if (!chann)
+ return -ENOMEM;
+
+ chann->conn = conn;
+ INIT_LIST_HEAD(&chann->chann_list);
+ list_add(&chann->chann_list, &sess->ksmbd_chann_list);
+ }
+ }
+
+ if (conn->ops->generate_signingkey) {
+ retval = conn->ops->generate_signingkey(sess, conn);
+ if (retval) {
+ ksmbd_debug(SMB, "SMB3 signing key generation failed\n");
+ return -EINVAL;
+ }
+ }
+
+ if (conn->dialect > SMB20_PROT_ID) {
+ if (!ksmbd_conn_lookup_dialect(conn)) {
+ pr_err("fail to verify the dialect\n");
+ return -ENOENT;
+ }
+ }
+ return 0;
+}
+#else
+static int krb5_authenticate(struct ksmbd_work *work)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+int smb2_sess_setup(struct ksmbd_work *work)
+{
+ struct ksmbd_conn *conn = work->conn;
+ struct smb2_sess_setup_req *req = work->request_buf;
+ struct smb2_sess_setup_rsp *rsp = work->response_buf;
+ struct ksmbd_session *sess;
+ struct negotiate_message *negblob;
+ int rc = 0;
+
+ ksmbd_debug(SMB, "Received request for session setup\n");
+
+ rsp->StructureSize = cpu_to_le16(9);
+ rsp->SessionFlags = 0;
+ rsp->SecurityBufferOffset = cpu_to_le16(72);
+ rsp->SecurityBufferLength = 0;
+ inc_rfc1001_len(rsp, 9);
+
+ if (!req->hdr.SessionId) {
+ sess = ksmbd_smb2_session_create();
+ if (!sess) {
+ rc = -ENOMEM;
+ goto out_err;
+ }
+ rsp->hdr.SessionId = cpu_to_le64(sess->id);
+ ksmbd_session_register(conn, sess);
+ } else if (conn->dialect >= SMB30_PROT_ID &&
+ (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) &&
+ req->Flags & SMB2_SESSION_REQ_FLAG_BINDING) {
+ u64 sess_id = le64_to_cpu(req->hdr.SessionId);
+
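+ /* Session binding: the client is adding this connection as an
+ * extra channel to an existing session, so validate the original
+ * session before accepting the bind.
+ */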
+ sess = ksmbd_session_lookup_slowpath(sess_id);
+ if (!sess) {
+ rc = -ENOENT;
+ goto out_err;
+ }
+
+ if (conn->dialect != sess->conn->dialect) {
+ rc = -EINVAL;
+ goto out_err;
+ }
+
+ if (!(req->hdr.Flags & SMB2_FLAGS_SIGNED)) {
+ rc = -EINVAL;
+ goto out_err;
+ }
+
+ if (strncmp(conn->ClientGUID, sess->conn->ClientGUID,
+ SMB2_CLIENT_GUID_SIZE)) {
+ rc = -ENOENT;
+ goto out_err;
+ }
+
+ if (sess->state == SMB2_SESSION_IN_PROGRESS) {
+ rc = -EACCES;
+ goto out_err;
+ }
+
+ if (sess->state == SMB2_SESSION_EXPIRED) {
+ rc = -EFAULT;
+ goto out_err;
+ }
+
+ if (ksmbd_session_lookup(conn, sess_id)) {
+ rc = -EACCES;
+ goto out_err;
+ }
+
+ conn->binding = true;
+ } else if ((conn->dialect < SMB30_PROT_ID ||
+ server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) &&
+ (req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) {
+ sess = NULL;
+ rc = -EACCES;
+ goto out_err;
+ } else {
+ sess = ksmbd_session_lookup(conn,
+ le64_to_cpu(req->hdr.SessionId));
+ if (!sess) {
+ rc = -ENOENT;
+ goto out_err;
+ }
+ }
+ work->sess = sess;
+
+ if (sess->state == SMB2_SESSION_EXPIRED)
+ sess->state = SMB2_SESSION_IN_PROGRESS;
+
+ negblob = (struct negotiate_message *)((char *)&req->hdr.ProtocolId +
+ le16_to_cpu(req->SecurityBufferOffset));
+
+ if (decode_negotiation_token(work, negblob) == 0) {
+ if (conn->mechToken)
+ negblob = (struct negotiate_message *)conn->mechToken;
+ }
+
+ if (server_conf.auth_mechs & conn->auth_mechs) {
+ rc = generate_preauth_hash(work);
+ if (rc)
+ goto out_err;
+
+ if (conn->preferred_auth_mech &
+ (KSMBD_AUTH_KRB5 | KSMBD_AUTH_MSKRB5)) {
+ rc = krb5_authenticate(work);
+ if (rc) {
+ rc = -EINVAL;
+ goto out_err;
+ }
+
+ ksmbd_conn_set_good(work);
+ sess->state = SMB2_SESSION_VALID;
+ kfree(sess->Preauth_HashValue);
+ sess->Preauth_HashValue = NULL;
+ } else if (conn->preferred_auth_mech == KSMBD_AUTH_NTLMSSP) {
+ if (negblob->MessageType == NtLmNegotiate) {
+ rc = ntlm_negotiate(work, negblob);
+ if (rc)
+ goto out_err;
+ rsp->hdr.Status =
+ STATUS_MORE_PROCESSING_REQUIRED;
+ /*
+ * Note: the total size is decremented by 1 here as an
+ * adjustment for the zero-size blob
+ */
+ inc_rfc1001_len(rsp, le16_to_cpu(rsp->SecurityBufferLength) - 1);
+
+ } else if (negblob->MessageType == NtLmAuthenticate) {
+ rc = ntlm_authenticate(work);
+ if (rc)
+ goto out_err;
+
+ ksmbd_conn_set_good(work);
+ sess->state = SMB2_SESSION_VALID;
+ if (conn->binding) {
+ struct preauth_session *preauth_sess;
+
+ preauth_sess =
+ ksmbd_preauth_session_lookup(conn, sess->id);
+ if (preauth_sess) {
+ list_del(&preauth_sess->preauth_entry);
+ kfree(preauth_sess);
+ }
+ }
+ kfree(sess->Preauth_HashValue);
+ sess->Preauth_HashValue = NULL;
+ }
+ } else {
+ /* TODO: need one more negotiation */
+ pr_err("Not support the preferred authentication\n");
+ rc = -EINVAL;
+ }
+ } else {
+ pr_err("Not support authentication\n");
+ rc = -EINVAL;
+ }
+
+out_err:
+ if (rc == -EINVAL)
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ else if (rc == -ENOENT)
+ rsp->hdr.Status = STATUS_USER_SESSION_DELETED;
+ else if (rc == -EACCES)
+ rsp->hdr.Status = STATUS_REQUEST_NOT_ACCEPTED;
+ else if (rc == -EFAULT)
+ rsp->hdr.Status = STATUS_NETWORK_SESSION_EXPIRED;
+ else if (rc == -ENOMEM)
+ rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES;
+ else if (rc)
+ rsp->hdr.Status = STATUS_LOGON_FAILURE;
+
+ if (conn->use_spnego && conn->mechToken) {
+ kfree(conn->mechToken);
+ conn->mechToken = NULL;
+ }
+
+ if (rc < 0 && sess) {
+ ksmbd_session_destroy(sess);
+ work->sess = NULL;
+ }
+
+ return rc;
+}
+
+/**
+ * smb2_tree_connect() - handler for smb2 tree connect command
+ * @work: smb work containing smb request buffer
+ *
+ * Return: 0 on success, otherwise error
+ */
+int smb2_tree_connect(struct ksmbd_work *work)
+{
+ struct ksmbd_conn *conn = work->conn;
+ struct smb2_tree_connect_req *req = work->request_buf;
+ struct smb2_tree_connect_rsp *rsp = work->response_buf;
+ struct ksmbd_session *sess = work->sess;
+ char *treename = NULL, *name = NULL;
+ struct ksmbd_tree_conn_status status;
+ struct ksmbd_share_config *share;
+ int rc = -EINVAL;
+
+ treename = smb_strndup_from_utf16(req->Buffer,
+ le16_to_cpu(req->PathLength), true,
+ conn->local_nls);
+ if (IS_ERR(treename)) {
+ pr_err("treename is NULL\n");
+ status.ret = KSMBD_TREE_CONN_STATUS_ERROR;
+ goto out_err1;
+ }
+
+ name = ksmbd_extract_sharename(treename);
+ if (IS_ERR(name)) {
+ status.ret = KSMBD_TREE_CONN_STATUS_ERROR;
+ goto out_err1;
+ }
+
+ ksmbd_debug(SMB, "tree connect request for tree %s treename %s\n",
+ name, treename);
+
+ status = ksmbd_tree_conn_connect(sess, name);
+ if (status.ret == KSMBD_TREE_CONN_STATUS_OK)
+ rsp->hdr.Id.SyncId.TreeId = cpu_to_le32(status.tree_conn->id);
+ else
+ goto out_err1;
+
+ share = status.tree_conn->share_conf;
+ if (test_share_config_flag(share, KSMBD_SHARE_FLAG_PIPE)) {
+ ksmbd_debug(SMB, "IPC share path request\n");
+ rsp->ShareType = SMB2_SHARE_TYPE_PIPE;
+ rsp->MaximalAccess = FILE_READ_DATA_LE | FILE_READ_EA_LE |
+ FILE_EXECUTE_LE | FILE_READ_ATTRIBUTES_LE |
+ FILE_DELETE_LE | FILE_READ_CONTROL_LE |
+ FILE_WRITE_DAC_LE | FILE_WRITE_OWNER_LE |
+ FILE_SYNCHRONIZE_LE;
+ } else {
+ rsp->ShareType = SMB2_SHARE_TYPE_DISK;
+ rsp->MaximalAccess = FILE_READ_DATA_LE | FILE_READ_EA_LE |
+ FILE_EXECUTE_LE | FILE_READ_ATTRIBUTES_LE;
+ if (test_tree_conn_flag(status.tree_conn,
+ KSMBD_TREE_CONN_FLAG_WRITABLE)) {
+ rsp->MaximalAccess |= FILE_WRITE_DATA_LE |
+ FILE_APPEND_DATA_LE | FILE_WRITE_EA_LE |
+ FILE_DELETE_LE | FILE_WRITE_ATTRIBUTES_LE |
+ FILE_DELETE_CHILD_LE | FILE_READ_CONTROL_LE |
+ FILE_WRITE_DAC_LE | FILE_WRITE_OWNER_LE |
+ FILE_SYNCHRONIZE_LE;
+ }
+ }
+
+ status.tree_conn->maximal_access = le32_to_cpu(rsp->MaximalAccess);
+ if (conn->posix_ext_supported)
+ status.tree_conn->posix_extensions = true;
+
+out_err1:
+ rsp->StructureSize = cpu_to_le16(16);
+ rsp->Capabilities = 0;
+ rsp->Reserved = 0;
+ /* default manual caching */
+ rsp->ShareFlags = SMB2_SHAREFLAG_MANUAL_CACHING;
+ inc_rfc1001_len(rsp, 16);
+
+ if (!IS_ERR(treename))
+ kfree(treename);
+ if (!IS_ERR(name))
+ kfree(name);
+
+ switch (status.ret) {
+ case KSMBD_TREE_CONN_STATUS_OK:
+ rsp->hdr.Status = STATUS_SUCCESS;
+ rc = 0;
+ break;
+ case KSMBD_TREE_CONN_STATUS_NO_SHARE:
+ rsp->hdr.Status = STATUS_BAD_NETWORK_PATH;
+ break;
+ case -ENOMEM:
+ case KSMBD_TREE_CONN_STATUS_NOMEM:
+ rsp->hdr.Status = STATUS_NO_MEMORY;
+ break;
+ case KSMBD_TREE_CONN_STATUS_ERROR:
+ case KSMBD_TREE_CONN_STATUS_TOO_MANY_CONNS:
+ case KSMBD_TREE_CONN_STATUS_TOO_MANY_SESSIONS:
+ rsp->hdr.Status = STATUS_ACCESS_DENIED;
+ break;
+ case -EINVAL:
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ break;
+ default:
+ rsp->hdr.Status = STATUS_ACCESS_DENIED;
+ }
+
+ return rc;
+}
+
+/**
+ * smb2_create_open_flags() - convert smb open flags to unix open flags
+ * @file_present: is file already present
+ * @access: file access flags
+ * @disposition: file disposition flags
+ * @may_flags: set with MAY_ flags
+ *
+ * Return: file open flags
+ */
+static int smb2_create_open_flags(bool file_present, __le32 access,
+ __le32 disposition,
+ int *may_flags)
+{
+ int oflags = O_NONBLOCK | O_LARGEFILE;
+
+ if (access & FILE_READ_DESIRED_ACCESS_LE &&
+ access & FILE_WRITE_DESIRE_ACCESS_LE) {
+ oflags |= O_RDWR;
+ *may_flags = MAY_OPEN | MAY_READ | MAY_WRITE;
+ } else if (access & FILE_WRITE_DESIRE_ACCESS_LE) {
+ oflags |= O_WRONLY;
+ *may_flags = MAY_OPEN | MAY_WRITE;
+ } else {
+ oflags |= O_RDONLY;
+ *may_flags = MAY_OPEN | MAY_READ;
+ }
+
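+ /* An attribute-only open does not need a real file descriptor;
+ * O_PATH is sufficient.
+ */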
+ if (access == FILE_READ_ATTRIBUTES_LE)
+ oflags |= O_PATH;
+
+ if (file_present) {
+ switch (disposition & FILE_CREATE_MASK_LE) {
+ case FILE_OPEN_LE:
+ case FILE_CREATE_LE:
+ break;
+ case FILE_SUPERSEDE_LE:
+ case FILE_OVERWRITE_LE:
+ case FILE_OVERWRITE_IF_LE:
+ oflags |= O_TRUNC;
+ break;
+ default:
+ break;
+ }
+ } else {
+ switch (disposition & FILE_CREATE_MASK_LE) {
+ case FILE_SUPERSEDE_LE:
+ case FILE_CREATE_LE:
+ case FILE_OPEN_IF_LE:
+ case FILE_OVERWRITE_IF_LE:
+ oflags |= O_CREAT;
+ break;
+ case FILE_OPEN_LE:
+ case FILE_OVERWRITE_LE:
+ oflags &= ~O_CREAT;
+ break;
+ default:
+ break;
+ }
+ }
+
+ return oflags;
+}
+
+/**
+ * smb2_tree_disconnect() - handler for smb tree disconnect request
+ * @work: smb work containing request buffer
+ *
+ * Return: 0
+ */
+int smb2_tree_disconnect(struct ksmbd_work *work)
+{
+ struct smb2_tree_disconnect_rsp *rsp = work->response_buf;
+ struct ksmbd_session *sess = work->sess;
+ struct ksmbd_tree_connect *tcon = work->tcon;
+
+ rsp->StructureSize = cpu_to_le16(4);
+ inc_rfc1001_len(rsp, 4);
+
+ ksmbd_debug(SMB, "request\n");
+
+ if (!tcon) {
+ struct smb2_tree_disconnect_req *req = work->request_buf;
+
+ ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId);
+ rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED;
+ smb2_set_err_rsp(work);
+ return 0;
+ }
+
+ ksmbd_close_tree_conn_fds(work);
+ ksmbd_tree_conn_disconnect(sess, tcon);
+ return 0;
+}
+
+/**
+ * smb2_session_logoff() - handler for session log off request
+ * @work: smb work containing request buffer
+ *
+ * Return: 0
+ */
+int smb2_session_logoff(struct ksmbd_work *work)
+{
+ struct ksmbd_conn *conn = work->conn;
+ struct smb2_logoff_rsp *rsp = work->response_buf;
+ struct ksmbd_session *sess = work->sess;
+
+ rsp->StructureSize = cpu_to_le16(4);
+ inc_rfc1001_len(rsp, 4);
+
+ ksmbd_debug(SMB, "request\n");
+
+ /* Got a valid session, set connection state */
+ WARN_ON(sess->conn != conn);
+
+ /* setting CifsExiting here may race with start_tcp_sess */
+ ksmbd_conn_set_need_reconnect(work);
+ ksmbd_close_session_fds(work);
+ ksmbd_conn_wait_idle(conn);
+
+ if (ksmbd_tree_conn_session_logoff(sess)) {
+ struct smb2_logoff_req *req = work->request_buf;
+
+ ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId);
+ rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED;
+ smb2_set_err_rsp(work);
+ return 0;
+ }
+
+ ksmbd_destroy_file_table(&sess->file_table);
+ sess->state = SMB2_SESSION_EXPIRED;
+
+ ksmbd_free_user(sess->user);
+ sess->user = NULL;
+
+ /* let start_tcp_sess free connection info now */
+ ksmbd_conn_set_need_negotiate(work);
+ return 0;
+}
+
+/**
+ * create_smb2_pipe() - create IPC pipe
+ * @work: smb work containing request buffer
+ *
+ * Return: 0 on success, otherwise error
+ */
+static noinline int create_smb2_pipe(struct ksmbd_work *work)
+{
+ struct smb2_create_rsp *rsp = work->response_buf;
+ struct smb2_create_req *req = work->request_buf;
+ int id;
+ int err;
+ char *name;
+
+ name = smb_strndup_from_utf16(req->Buffer, le16_to_cpu(req->NameLength),
+ 1, work->conn->local_nls);
+ if (IS_ERR(name)) {
+ rsp->hdr.Status = STATUS_NO_MEMORY;
+ err = PTR_ERR(name);
+ goto out;
+ }
+
+ id = ksmbd_session_rpc_open(work->sess, name);
+ if (id < 0) {
+ pr_err("Unable to open RPC pipe: %d\n", id);
+ err = id;
+ goto out;
+ }
+
+ rsp->hdr.Status = STATUS_SUCCESS;
+ rsp->StructureSize = cpu_to_le16(89);
+ rsp->OplockLevel = SMB2_OPLOCK_LEVEL_NONE;
+ rsp->Reserved = 0;
+ rsp->CreateAction = cpu_to_le32(FILE_OPENED);
+
+ rsp->CreationTime = cpu_to_le64(0);
+ rsp->LastAccessTime = cpu_to_le64(0);
+ rsp->ChangeTime = cpu_to_le64(0);
+ rsp->AllocationSize = cpu_to_le64(0);
+ rsp->EndofFile = cpu_to_le64(0);
+ rsp->FileAttributes = ATTR_NORMAL_LE;
+ rsp->Reserved2 = 0;
+ rsp->VolatileFileId = cpu_to_le64(id);
+ rsp->PersistentFileId = 0;
+ rsp->CreateContextsOffset = 0;
+ rsp->CreateContextsLength = 0;
+
+ inc_rfc1001_len(rsp, 88); /* StructureSize - 1*/
+ kfree(name);
+ return 0;
+
+out:
+ switch (err) {
+ case -EINVAL:
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ break;
+ case -ENOSPC:
+ case -ENOMEM:
+ rsp->hdr.Status = STATUS_NO_MEMORY;
+ break;
+ }
+
+ if (!IS_ERR(name))
+ kfree(name);
+
+ smb2_set_err_rsp(work);
+ return err;
+}
+
+/**
+ * smb2_set_ea() - handler for setting extended attributes using set
+ * info command
+ * @eabuf: set info command buffer
+ * @path: dentry path on which to set the extended attributes
+ *
+ * Return: 0 on success, otherwise error
+ */
+static int smb2_set_ea(struct smb2_ea_info *eabuf, struct path *path)
+{
+ struct user_namespace *user_ns = mnt_user_ns(path->mnt);
+ char *attr_name = NULL, *value;
+ int rc = 0;
+ int next = 0;
+
+ attr_name = kmalloc(XATTR_NAME_MAX + 1, GFP_KERNEL);
+ if (!attr_name)
+ return -ENOMEM;
+
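+ /* Walk the chained EA entries; a zero NextEntryOffset terminates
+ * the list.
+ */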
+ do {
+ if (!eabuf->EaNameLength)
+ goto next;
+
+ ksmbd_debug(SMB,
+ "name : <%s>, name_len : %u, value_len : %u, next : %u\n",
+ eabuf->name, eabuf->EaNameLength,
+ le16_to_cpu(eabuf->EaValueLength),
+ le32_to_cpu(eabuf->NextEntryOffset));
+
+ if (eabuf->EaNameLength >
+ (XATTR_NAME_MAX - XATTR_USER_PREFIX_LEN)) {
+ rc = -EINVAL;
+ break;
+ }
+
+ memcpy(attr_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
+ memcpy(&attr_name[XATTR_USER_PREFIX_LEN], eabuf->name,
+ eabuf->EaNameLength);
+ attr_name[XATTR_USER_PREFIX_LEN + eabuf->EaNameLength] = '\0';
+ value = (char *)&eabuf->name + eabuf->EaNameLength + 1;
+
+ if (!eabuf->EaValueLength) {
+ rc = ksmbd_vfs_casexattr_len(user_ns,
+ path->dentry,
+ attr_name,
+ XATTR_USER_PREFIX_LEN +
+ eabuf->EaNameLength);
+
+ /* delete the EA only when it exists */
+ if (rc > 0) {
+ rc = ksmbd_vfs_remove_xattr(user_ns,
+ path->dentry,
+ attr_name);
+
+ if (rc < 0) {
+ ksmbd_debug(SMB,
+ "remove xattr failed(%d)\n",
+ rc);
+ break;
+ }
+ }
+
+ /* if the EA doesn't exist, just do nothing. */
+ rc = 0;
+ } else {
+ rc = ksmbd_vfs_setxattr(user_ns,
+ path->dentry, attr_name, value,
+ le16_to_cpu(eabuf->EaValueLength), 0);
+ if (rc < 0) {
+ ksmbd_debug(SMB,
+ "ksmbd_vfs_setxattr is failed(%d)\n",
+ rc);
+ break;
+ }
+ }
+
+next:
+ next = le32_to_cpu(eabuf->NextEntryOffset);
+ eabuf = (struct smb2_ea_info *)((char *)eabuf + next);
+ } while (next != 0);
+
+ kfree(attr_name);
+ return rc;
+}
+
+static noinline int smb2_set_stream_name_xattr(struct path *path,
+ struct ksmbd_file *fp,
+ char *stream_name, int s_type)
+{
+ struct user_namespace *user_ns = mnt_user_ns(path->mnt);
+ size_t xattr_stream_size;
+ char *xattr_stream_name;
+ int rc;
+
+ rc = ksmbd_vfs_xattr_stream_name(stream_name,
+ &xattr_stream_name,
+ &xattr_stream_size,
+ s_type);
+ if (rc)
+ return rc;
+
+ fp->stream.name = xattr_stream_name;
+ fp->stream.size = xattr_stream_size;
+
+ /* Check if there is stream prefix in xattr space */
+ rc = ksmbd_vfs_casexattr_len(user_ns,
+ path->dentry,
+ xattr_stream_name,
+ xattr_stream_size);
+ if (rc >= 0)
+ return 0;
+
+ if (fp->cdoption == FILE_OPEN_LE) {
+ ksmbd_debug(SMB, "XATTR stream name lookup failed: %d\n", rc);
+ return -EBADF;
+ }
+
+ rc = ksmbd_vfs_setxattr(user_ns, path->dentry,
+ xattr_stream_name, NULL, 0, 0);
+ if (rc < 0)
+ pr_err("Failed to store XATTR stream name :%d\n", rc);
+ return 0;
+}
+
+static int smb2_remove_smb_xattrs(struct path *path)
+{
+ struct user_namespace *user_ns = mnt_user_ns(path->mnt);
+ char *name, *xattr_list = NULL;
+ ssize_t xattr_list_len;
+ int err = 0;
+
+ xattr_list_len = ksmbd_vfs_listxattr(path->dentry, &xattr_list);
+ if (xattr_list_len < 0) {
+ goto out;
+ } else if (!xattr_list_len) {
+ ksmbd_debug(SMB, "empty xattr in the file\n");
+ goto out;
+ }
+
+ for (name = xattr_list; name - xattr_list < xattr_list_len;
+ name += strlen(name) + 1) {
+ ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name));
+
+ if (strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) &&
+ strncmp(&name[XATTR_USER_PREFIX_LEN], DOS_ATTRIBUTE_PREFIX,
+ DOS_ATTRIBUTE_PREFIX_LEN) &&
+ strncmp(&name[XATTR_USER_PREFIX_LEN], STREAM_PREFIX, STREAM_PREFIX_LEN))
+ continue;
+
+ err = ksmbd_vfs_remove_xattr(user_ns, path->dentry, name);
+ if (err)
+ ksmbd_debug(SMB, "remove xattr failed : %s\n", name);
+ }
+out:
+ kvfree(xattr_list);
+ return err;
+}
+
+static int smb2_create_truncate(struct path *path)
+{
+ int rc = vfs_truncate(path, 0);
+
+ if (rc) {
+ pr_err("vfs_truncate failed, rc %d\n", rc);
+ return rc;
+ }
+
+ rc = smb2_remove_smb_xattrs(path);
+ if (rc == -EOPNOTSUPP)
+ rc = 0;
+ if (rc)
+ ksmbd_debug(SMB,
+ "ksmbd_truncate_stream_name_xattr failed, rc %d\n",
+ rc);
+ return rc;
+}
+
+static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, struct path *path,
+ struct ksmbd_file *fp)
+{
+ struct xattr_dos_attrib da = {0};
+ int rc;
+
+ if (!test_share_config_flag(tcon->share_conf,
+ KSMBD_SHARE_FLAG_STORE_DOS_ATTRS))
+ return;
+
+ da.version = 4;
+ da.attr = le32_to_cpu(fp->f_ci->m_fattr);
+ da.itime = da.create_time = fp->create_time;
+ da.flags = XATTR_DOSINFO_ATTRIB | XATTR_DOSINFO_CREATE_TIME |
+ XATTR_DOSINFO_ITIME;
+
+ rc = ksmbd_vfs_set_dos_attrib_xattr(mnt_user_ns(path->mnt),
+ path->dentry, &da);
+ if (rc)
+ ksmbd_debug(SMB, "failed to store file attribute into xattr\n");
+}
+
+static void smb2_update_xattrs(struct ksmbd_tree_connect *tcon,
+ struct path *path, struct ksmbd_file *fp)
+{
+ struct xattr_dos_attrib da;
+ int rc;
+
+ fp->f_ci->m_fattr &= ~(ATTR_HIDDEN_LE | ATTR_SYSTEM_LE);
+
+ /* get FileAttributes from XATTR_NAME_DOS_ATTRIBUTE */
+ if (!test_share_config_flag(tcon->share_conf,
+ KSMBD_SHARE_FLAG_STORE_DOS_ATTRS))
+ return;
+
+ rc = ksmbd_vfs_get_dos_attrib_xattr(mnt_user_ns(path->mnt),
+ path->dentry, &da);
+ if (rc > 0) {
+ fp->f_ci->m_fattr = cpu_to_le32(da.attr);
+ fp->create_time = da.create_time;
+ fp->itime = da.itime;
+ }
+}
+
+static int smb2_creat(struct ksmbd_work *work, struct path *path, char *name,
+ int open_flags, umode_t posix_mode, bool is_dir)
+{
+ struct ksmbd_tree_connect *tcon = work->tcon;
+ struct ksmbd_share_config *share = tcon->share_conf;
+ umode_t mode;
+ int rc;
+
+ if (!(open_flags & O_CREAT))
+ return -EBADF;
+
+ ksmbd_debug(SMB, "file does not exist, so creating\n");
+ if (is_dir) {
+ ksmbd_debug(SMB, "creating directory\n");
+
+ mode = share_config_directory_mode(share, posix_mode);
+ rc = ksmbd_vfs_mkdir(work, name, mode);
+ if (rc)
+ return rc;
+ } else {
+ ksmbd_debug(SMB, "creating regular file\n");
+
+ mode = share_config_create_mode(share, posix_mode);
+ rc = ksmbd_vfs_create(work, name, mode);
+ if (rc)
+ return rc;
+ }
+
+ rc = ksmbd_vfs_kern_path(name, 0, path, 0);
+ if (rc) {
+ pr_err("cannot get linux path (%s), err = %d\n",
+ name, rc);
+ return rc;
+ }
+ return 0;
+}
+
+static int smb2_create_sd_buffer(struct ksmbd_work *work,
+ struct smb2_create_req *req,
+ struct path *path)
+{
+ struct create_context *context;
+ struct create_sd_buf_req *sd_buf;
+
+ if (!req->CreateContextsOffset)
+ return -ENOENT;
+
+ /* Parse SD BUFFER create contexts */
+ context = smb2_find_context_vals(req, SMB2_CREATE_SD_BUFFER);
+ if (!context)
+ return -ENOENT;
+ else if (IS_ERR(context))
+ return PTR_ERR(context);
+
+ ksmbd_debug(SMB,
+ "Set ACLs using SMB2_CREATE_SD_BUFFER context\n");
+ sd_buf = (struct create_sd_buf_req *)context;
+ return set_info_sec(work->conn, work->tcon, path, &sd_buf->ntsd,
+ le32_to_cpu(sd_buf->ccontext.DataLength), true);
+}
+
+static void ksmbd_acls_fattr(struct smb_fattr *fattr, struct inode *inode)
+{
+ fattr->cf_uid = inode->i_uid;
+ fattr->cf_gid = inode->i_gid;
+ fattr->cf_mode = inode->i_mode;
+ fattr->cf_acls = NULL;
+ fattr->cf_dacls = NULL;
+
+ if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) {
+ fattr->cf_acls = get_acl(inode, ACL_TYPE_ACCESS);
+ if (S_ISDIR(inode->i_mode))
+ fattr->cf_dacls = get_acl(inode, ACL_TYPE_DEFAULT);
+ }
+}
+
+/**
+ * smb2_open() - handler for smb file open request
+ * @work: smb work containing request buffer
+ *
+ * Return: 0 on success, otherwise error
+ */
+int smb2_open(struct ksmbd_work *work)
+{
+ struct ksmbd_conn *conn = work->conn;
+ struct ksmbd_session *sess = work->sess;
+ struct ksmbd_tree_connect *tcon = work->tcon;
+ struct smb2_create_req *req;
+ struct smb2_create_rsp *rsp, *rsp_org;
+ struct path path;
+ struct ksmbd_share_config *share = tcon->share_conf;
+ struct ksmbd_file *fp = NULL;
+ struct file *filp = NULL;
+ struct user_namespace *user_ns = NULL;
+ struct kstat stat;
+ struct create_context *context;
+ struct lease_ctx_info *lc = NULL;
+ struct create_ea_buf_req *ea_buf = NULL;
+ struct oplock_info *opinfo;
+ __le32 *next_ptr = NULL;
+ int req_op_level = 0, open_flags = 0, may_flags = 0, file_info = 0;
+ int rc = 0, len = 0;
+ int contxt_cnt = 0, query_disk_id = 0;
+ int maximal_access_ctxt = 0, posix_ctxt = 0;
+ int s_type = 0;
+ int next_off = 0;
+ char *name = NULL;
+ char *stream_name = NULL;
+ bool file_present = false, created = false, already_permitted = false;
+ int share_ret, need_truncate = 0;
+ u64 time;
+ umode_t posix_mode = 0;
+ __le32 daccess, maximal_access = 0;
+
+ rsp_org = work->response_buf;
+ WORK_BUFFERS(work, req, rsp);
+
+ if (req->hdr.NextCommand && !work->next_smb2_rcv_hdr_off &&
+ (req->hdr.Flags & SMB2_FLAGS_RELATED_OPERATIONS)) {
+ ksmbd_debug(SMB, "invalid flag in chained command\n");
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ smb2_set_err_rsp(work);
+ return -EINVAL;
+ }
+
+ if (test_share_config_flag(share, KSMBD_SHARE_FLAG_PIPE)) {
+ ksmbd_debug(SMB, "IPC pipe create request\n");
+ return create_smb2_pipe(work);
+ }
+
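+ /* An empty NameLength means the client is opening the root of the
+ * share itself.
+ */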
+ if (req->NameLength) {
+ if ((req->CreateOptions & FILE_DIRECTORY_FILE_LE) &&
+ *(char *)req->Buffer == '\\') {
+ pr_err("not allow directory name included leading slash\n");
+ rc = -EINVAL;
+ goto err_out1;
+ }
+
+ name = smb2_get_name(share,
+ req->Buffer,
+ le16_to_cpu(req->NameLength),
+ work->conn->local_nls);
+ if (IS_ERR(name)) {
+ rc = PTR_ERR(name);
+ if (rc != -ENOMEM)
+ rc = -ENOENT;
+ name = NULL;
+ goto err_out1;
+ }
+
+ ksmbd_debug(SMB, "converted name = %s\n", name);
+ if (strchr(name, ':')) {
+ if (!test_share_config_flag(work->tcon->share_conf,
+ KSMBD_SHARE_FLAG_STREAMS)) {
+ rc = -EBADF;
+ goto err_out1;
+ }
+ rc = parse_stream_name(name, &stream_name, &s_type);
+ if (rc < 0)
+ goto err_out1;
+ }
+
+ rc = ksmbd_validate_filename(name);
+ if (rc < 0)
+ goto err_out1;
+
+ if (ksmbd_share_veto_filename(share, name)) {
+ rc = -ENOENT;
+ ksmbd_debug(SMB, "Reject open(), vetoed file: %s\n",
+ name);
+ goto err_out1;
+ }
+ } else {
+ len = strlen(share->path);
+ ksmbd_debug(SMB, "share path len %d\n", len);
+ name = kmalloc(len + 1, GFP_KERNEL);
+ if (!name) {
+ rsp->hdr.Status = STATUS_NO_MEMORY;
+ rc = -ENOMEM;
+ goto err_out1;
+ }
+
+ memcpy(name, share->path, len);
+ *(name + len) = '\0';
+ }
+
+ req_op_level = req->RequestedOplockLevel;
+ if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE)
+ lc = parse_lease_state(req);
+
+ if (le32_to_cpu(req->ImpersonationLevel) > le32_to_cpu(IL_DELEGATE_LE)) {
+ pr_err("Invalid impersonationlevel : 0x%x\n",
+ le32_to_cpu(req->ImpersonationLevel));
+ rc = -EIO;
+ rsp->hdr.Status = STATUS_BAD_IMPERSONATION_LEVEL;
+ goto err_out1;
+ }
+
+ if (req->CreateOptions && !(req->CreateOptions & CREATE_OPTIONS_MASK)) {
+ pr_err("Invalid create options : 0x%x\n",
+ le32_to_cpu(req->CreateOptions));
+ rc = -EINVAL;
+ goto err_out1;
+ } else {
+ if (req->CreateOptions & FILE_SEQUENTIAL_ONLY_LE &&
+ req->CreateOptions & FILE_RANDOM_ACCESS_LE)
+ req->CreateOptions &= ~(FILE_SEQUENTIAL_ONLY_LE);
+
+ if (req->CreateOptions &
+ (FILE_OPEN_BY_FILE_ID_LE | CREATE_TREE_CONNECTION |
+ FILE_RESERVE_OPFILTER_LE)) {
+ rc = -EOPNOTSUPP;
+ goto err_out1;
+ }
+
+ if (req->CreateOptions & FILE_DIRECTORY_FILE_LE) {
+ if (req->CreateOptions & FILE_NON_DIRECTORY_FILE_LE) {
+ rc = -EINVAL;
+ goto err_out1;
+ } else if (req->CreateOptions & FILE_NO_COMPRESSION_LE) {
+ req->CreateOptions &= ~(FILE_NO_COMPRESSION_LE);
+ }
+ }
+ }
+
+ if (le32_to_cpu(req->CreateDisposition) >
+ le32_to_cpu(FILE_OVERWRITE_IF_LE)) {
+ pr_err("Invalid create disposition : 0x%x\n",
+ le32_to_cpu(req->CreateDisposition));
+ rc = -EINVAL;
+ goto err_out1;
+ }
+
+ if (!(req->DesiredAccess & DESIRED_ACCESS_MASK)) {
+ pr_err("Invalid desired access : 0x%x\n",
+ le32_to_cpu(req->DesiredAccess));
+ rc = -EACCES;
+ goto err_out1;
+ }
+
+ if (req->FileAttributes && !(req->FileAttributes & ATTR_MASK_LE)) {
+ pr_err("Invalid file attribute : 0x%x\n",
+ le32_to_cpu(req->FileAttributes));
+ rc = -EINVAL;
+ goto err_out1;
+ }
+
+ if (req->CreateContextsOffset) {
+ /* Parse non-durable handle create contexts */
+ context = smb2_find_context_vals(req, SMB2_CREATE_EA_BUFFER);
+ if (IS_ERR(context)) {
+ rc = PTR_ERR(context);
+ goto err_out1;
+ } else if (context) {
+ ea_buf = (struct create_ea_buf_req *)context;
+ if (req->CreateOptions & FILE_NO_EA_KNOWLEDGE_LE) {
+ rsp->hdr.Status = STATUS_ACCESS_DENIED;
+ rc = -EACCES;
+ goto err_out1;
+ }
+ }
+
+ context = smb2_find_context_vals(req,
+ SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST);
+ if (IS_ERR(context)) {
+ rc = PTR_ERR(context);
+ goto err_out1;
+ } else if (context) {
+ ksmbd_debug(SMB,
+ "get query maximal access context\n");
+ maximal_access_ctxt = 1;
+ }
+
+ context = smb2_find_context_vals(req,
+ SMB2_CREATE_TIMEWARP_REQUEST);
+ if (IS_ERR(context)) {
+ rc = PTR_ERR(context);
+ goto err_out1;
+ } else if (context) {
+ ksmbd_debug(SMB, "get timewarp context\n");
+ rc = -EBADF;
+ goto err_out1;
+ }
+
+ if (tcon->posix_extensions) {
+ context = smb2_find_context_vals(req,
+ SMB2_CREATE_TAG_POSIX);
+ if (IS_ERR(context)) {
+ rc = PTR_ERR(context);
+ goto err_out1;
+ } else if (context) {
+ struct create_posix *posix =
+ (struct create_posix *)context;
+ ksmbd_debug(SMB, "get posix context\n");
+
+ posix_mode = le32_to_cpu(posix->Mode);
+ posix_ctxt = 1;
+ }
+ }
+ }
+
+ if (ksmbd_override_fsids(work)) {
+ rc = -ENOMEM;
+ goto err_out1;
+ }
+
+ if (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE) {
+ /*
+ * On a delete request, look up the current entry itself
+ * instead of following a symlink
+ */
+ rc = ksmbd_vfs_kern_path(name, 0, &path, 1);
+ if (!rc) {
+ /*
+ * If the file already exists under these dispositions,
+ * return an access denied error.
+ */
+ if (req->CreateDisposition == FILE_OVERWRITE_IF_LE ||
+ req->CreateDisposition == FILE_OPEN_IF_LE) {
+ rc = -EACCES;
+ path_put(&path);
+ goto err_out;
+ }
+
+ if (!test_tree_conn_flag(tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) {
+ ksmbd_debug(SMB,
+ "User does not have write permission\n");
+ rc = -EACCES;
+ path_put(&path);
+ goto err_out;
+ }
+ }
+ } else {
+ if (test_share_config_flag(work->tcon->share_conf,
+ KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS)) {
+ /*
+ * Use LOOKUP_FOLLOW to follow symlinks while
+ * building up the path
+ */
+ rc = ksmbd_vfs_kern_path(name, LOOKUP_FOLLOW, &path, 1);
+ if (rc) { /* Case of a broken link? */
+ rc = ksmbd_vfs_kern_path(name, 0, &path, 1);
+ }
+ } else {
+ rc = ksmbd_vfs_kern_path(name, 0, &path, 1);
+ if (!rc && d_is_symlink(path.dentry)) {
+ rc = -EACCES;
+ path_put(&path);
+ goto err_out;
+ }
+ }
+ }
+
+ if (rc) {
+ if (rc == -EACCES) {
+ ksmbd_debug(SMB,
+ "User does not have right permission\n");
+ goto err_out;
+ }
+ ksmbd_debug(SMB, "can not get linux path for %s, rc = %d\n",
+ name, rc);
+ rc = 0;
+ } else {
+ file_present = true;
+ user_ns = mnt_user_ns(path.mnt);
+ generic_fillattr(user_ns, d_inode(path.dentry), &stat);
+ }
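+ /* Stream handling: a named data stream cannot be combined with a
+ * directory open.
+ */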
+ if (stream_name) {
+ if (req->CreateOptions & FILE_DIRECTORY_FILE_LE) {
+ if (s_type == DATA_STREAM) {
+ rc = -EIO;
+ rsp->hdr.Status = STATUS_NOT_A_DIRECTORY;
+ }
+ } else {
+ if (S_ISDIR(stat.mode) && s_type == DATA_STREAM) {
+ rc = -EIO;
+ rsp->hdr.Status = STATUS_FILE_IS_A_DIRECTORY;
+ }
+ }
+
+ if (req->CreateOptions & FILE_DIRECTORY_FILE_LE &&
+ req->FileAttributes & ATTR_NORMAL_LE) {
+ rsp->hdr.Status = STATUS_NOT_A_DIRECTORY;
+ rc = -EIO;
+ }
+
+ if (rc < 0)
+ goto err_out;
+ }
+
+ if (file_present && req->CreateOptions & FILE_NON_DIRECTORY_FILE_LE &&
+ S_ISDIR(stat.mode) && !(req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)) {
+ ksmbd_debug(SMB, "open() argument is a directory: %s, %x\n",
+ name, req->CreateOptions);
+ rsp->hdr.Status = STATUS_FILE_IS_A_DIRECTORY;
+ rc = -EIO;
+ goto err_out;
+ }
+
+ if (file_present && (req->CreateOptions & FILE_DIRECTORY_FILE_LE) &&
+ !(req->CreateDisposition == FILE_CREATE_LE) &&
+ !S_ISDIR(stat.mode)) {
+ rsp->hdr.Status = STATUS_NOT_A_DIRECTORY;
+ rc = -EIO;
+ goto err_out;
+ }
+
+ if (!stream_name && file_present &&
+ req->CreateDisposition == FILE_CREATE_LE) {
+ rc = -EEXIST;
+ goto err_out;
+ }
+
+ daccess = smb_map_generic_desired_access(req->DesiredAccess);
+
+ if (file_present && !(req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)) {
+ rc = smb_check_perm_dacl(conn, &path, &daccess,
+ sess->user->uid);
+ if (rc)
+ goto err_out;
+ }
+
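+ /* FILE_MAXIMAL_ACCESS: grant full access for a file that is about
+ * to be created, otherwise query what the existing object allows.
+ */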
+ if (daccess & FILE_MAXIMAL_ACCESS_LE) {
+ if (!file_present) {
+ daccess = cpu_to_le32(GENERIC_ALL_FLAGS);
+ } else {
+ rc = ksmbd_vfs_query_maximal_access(user_ns,
+ path.dentry,
+ &daccess);
+ if (rc)
+ goto err_out;
+ already_permitted = true;
+ }
+ maximal_access = daccess;
+ }
+
+ open_flags = smb2_create_open_flags(file_present, daccess,
+ req->CreateDisposition,
+ &may_flags);
+
+ if (!test_tree_conn_flag(tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) {
+ if (open_flags & O_CREAT) {
+ ksmbd_debug(SMB,
+ "User does not have write permission\n");
+ rc = -EACCES;
+ goto err_out;
+ }
+ }
+
+ /* create the file if it is not present */
+ if (!file_present) {
+ rc = smb2_creat(work, &path, name, open_flags, posix_mode,
+ req->CreateOptions & FILE_DIRECTORY_FILE_LE);
+ if (rc) {
+ if (rc == -ENOENT) {
+ rc = -EIO;
+ rsp->hdr.Status = STATUS_OBJECT_PATH_NOT_FOUND;
+ }
+ goto err_out;
+ }
+
+ created = true;
+ user_ns = mnt_user_ns(path.mnt);
+ if (ea_buf) {
+ rc = smb2_set_ea(&ea_buf->ea, &path);
+ if (rc == -EOPNOTSUPP)
+ rc = 0;
+ else if (rc)
+ goto err_out;
+ }
+ } else if (!already_permitted) {
+ /* FILE_READ_ATTRIBUTES is allowed without inode_permission,
+ * because execute (search) permission on the parent directory
+ * is already granted.
+ */
+ if (daccess & ~(FILE_READ_ATTRIBUTES_LE | FILE_READ_CONTROL_LE)) {
+ rc = inode_permission(user_ns,
+ d_inode(path.dentry),
+ may_flags);
+ if (rc)
+ goto err_out;
+
+ if ((daccess & FILE_DELETE_LE) ||
+ (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)) {
+ rc = ksmbd_vfs_may_delete(user_ns,
+ path.dentry);
+ if (rc)
+ goto err_out;
+ }
+ }
+ }
+
+ rc = ksmbd_query_inode_status(d_inode(path.dentry->d_parent));
+ if (rc == KSMBD_INODE_STATUS_PENDING_DELETE) {
+ rc = -EBUSY;
+ goto err_out;
+ }
+
+ rc = 0;
+ filp = dentry_open(&path, open_flags, current_cred());
+ if (IS_ERR(filp)) {
+ rc = PTR_ERR(filp);
+ pr_err("dentry open for dir failed, rc %d\n", rc);
+ goto err_out;
+ }
+
+ if (file_present) {
+ if (!(open_flags & O_TRUNC))
+ file_info = FILE_OPENED;
+ else
+ file_info = FILE_OVERWRITTEN;
+
+ if ((req->CreateDisposition & FILE_CREATE_MASK_LE) ==
+ FILE_SUPERSEDE_LE)
+ file_info = FILE_SUPERSEDED;
+ } else if (open_flags & O_CREAT) {
+ file_info = FILE_CREATED;
+ }
+
+ ksmbd_vfs_set_fadvise(filp, req->CreateOptions);
+
+ /* Obtain Volatile-ID */
+ fp = ksmbd_open_fd(work, filp);
+ if (IS_ERR(fp)) {
+ fput(filp);
+ rc = PTR_ERR(fp);
+ fp = NULL;
+ goto err_out;
+ }
+
+ /* Get Persistent-ID */
+ ksmbd_open_durable_fd(fp);
+ if (!has_file_id(fp->persistent_id)) {
+ rc = -ENOMEM;
+ goto err_out;
+ }
+
+ fp->filename = name;
+ fp->cdoption = req->CreateDisposition;
+ fp->daccess = daccess;
+ fp->saccess = req->ShareAccess;
+ fp->coption = req->CreateOptions;
+
+ /* Set default Windows and POSIX ACLs if creating a new file */
+ if (created) {
+ int posix_acl_rc;
+ struct inode *inode = d_inode(path.dentry);
+
+ posix_acl_rc = ksmbd_vfs_inherit_posix_acl(user_ns,
+ inode,
+ d_inode(path.dentry->d_parent));
+ if (posix_acl_rc)
+ ksmbd_debug(SMB, "inherit posix acl failed : %d\n", posix_acl_rc);
+
+ if (test_share_config_flag(work->tcon->share_conf,
+ KSMBD_SHARE_FLAG_ACL_XATTR)) {
+ rc = smb_inherit_dacl(conn, &path, sess->user->uid,
+ sess->user->gid);
+ }
+
+ if (rc) {
+ rc = smb2_create_sd_buffer(work, req, &path);
+ if (rc) {
+ if (posix_acl_rc)
+ ksmbd_vfs_set_init_posix_acl(user_ns,
+ inode);
+
+ if (test_share_config_flag(work->tcon->share_conf,
+ KSMBD_SHARE_FLAG_ACL_XATTR)) {
+ struct smb_fattr fattr;
+ struct smb_ntsd *pntsd;
+ int pntsd_size, ace_num = 0;
+
+ ksmbd_acls_fattr(&fattr, inode);
+ if (fattr.cf_acls)
+ ace_num = fattr.cf_acls->a_count;
+ if (fattr.cf_dacls)
+ ace_num += fattr.cf_dacls->a_count;
+
+ pntsd = kmalloc(sizeof(struct smb_ntsd) +
+ sizeof(struct smb_sid) * 3 +
+ sizeof(struct smb_acl) +
+ sizeof(struct smb_ace) * ace_num * 2,
+ GFP_KERNEL);
+ if (!pntsd)
+ goto err_out;
+
+ rc = build_sec_desc(user_ns,
+ pntsd, NULL,
+ OWNER_SECINFO |
+ GROUP_SECINFO |
+ DACL_SECINFO,
+ &pntsd_size, &fattr);
+ posix_acl_release(fattr.cf_acls);
+ posix_acl_release(fattr.cf_dacls);
+
+ rc = ksmbd_vfs_set_sd_xattr(conn,
+ user_ns,
+ path.dentry,
+ pntsd,
+ pntsd_size);
+ kfree(pntsd);
+ if (rc)
+ pr_err("failed to store ntacl in xattr : %d\n",
+ rc);
+ }
+ }
+ }
+ rc = 0;
+ }
+
+ if (stream_name) {
+ rc = smb2_set_stream_name_xattr(&path,
+ fp,
+ stream_name,
+ s_type);
+ if (rc)
+ goto err_out;
+ file_info = FILE_CREATED;
+ }
+
+ fp->attrib_only = !(req->DesiredAccess & ~(FILE_READ_ATTRIBUTES_LE |
+ FILE_WRITE_ATTRIBUTES_LE | FILE_SYNCHRONIZE_LE));
+ if (!S_ISDIR(file_inode(filp)->i_mode) && open_flags & O_TRUNC &&
+ !fp->attrib_only && !stream_name) {
+ smb_break_all_oplock(work, fp);
+ need_truncate = 1;
+ }
+
+ /* fp should be searchable through ksmbd_inode.m_fp_list
+ * after daccess, saccess, attrib_only, and stream are
+ * initialized.
+ */
+ write_lock(&fp->f_ci->m_lock);
+ list_add(&fp->node, &fp->f_ci->m_fp_list);
+ write_unlock(&fp->f_ci->m_lock);
+
+ rc = ksmbd_vfs_getattr(&path, &stat);
+ if (rc) {
+ generic_fillattr(user_ns, d_inode(path.dentry), &stat);
+ rc = 0;
+ }
+
+ /* Check delete pending among previous fp before oplock break */
+ if (ksmbd_inode_pending_delete(fp)) {
+ rc = -EBUSY;
+ goto err_out;
+ }
+
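+ /* Grant an oplock/lease only if the share enables oplocks and, for lease requests, the connection negotiated leasing */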
+ share_ret = ksmbd_smb_check_shared_mode(fp->filp, fp);
+ if (!test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_OPLOCKS) ||
+ (req_op_level == SMB2_OPLOCK_LEVEL_LEASE &&
+ !(conn->vals->capabilities & SMB2_GLOBAL_CAP_LEASING))) {
+ if (share_ret < 0 && !S_ISDIR(file_inode(fp->filp)->i_mode)) {
+ rc = share_ret;
+ goto err_out;
+ }
+ } else {
+ if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE) {
+ req_op_level = smb2_map_lease_to_oplock(lc->req_state);
+ ksmbd_debug(SMB,
+ "lease req for(%s) req oplock state 0x%x, lease state 0x%x\n",
+ name, req_op_level, lc->req_state);
+ rc = find_same_lease_key(sess, fp->f_ci, lc);
+ if (rc)
+ goto err_out;
+ } else if (open_flags == O_RDONLY &&
+ (req_op_level == SMB2_OPLOCK_LEVEL_BATCH ||
+ req_op_level == SMB2_OPLOCK_LEVEL_EXCLUSIVE))
+ req_op_level = SMB2_OPLOCK_LEVEL_II;
+
+ rc = smb_grant_oplock(work, req_op_level,
+ fp->persistent_id, fp,
+ le32_to_cpu(req->hdr.Id.SyncId.TreeId),
+ lc, share_ret);
+ if (rc < 0)
+ goto err_out;
+ }
+
+ if (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)
+ ksmbd_fd_set_delete_on_close(fp, file_info);
+
+ if (need_truncate) {
+ rc = smb2_create_truncate(&path);
+ if (rc)
+ goto err_out;
+ }
+
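+ /* Handle optional create contexts: allocation size and query-on-disk-id */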
+ if (req->CreateContextsOffset) {
+ struct create_alloc_size_req *az_req;
+
+ az_req = (struct create_alloc_size_req *)smb2_find_context_vals(req,
+ SMB2_CREATE_ALLOCATION_SIZE);
+ if (IS_ERR(az_req)) {
+ rc = PTR_ERR(az_req);
+ goto err_out;
+ } else if (az_req) {
+ loff_t alloc_size = le64_to_cpu(az_req->AllocationSize);
+ int err;
+
+ ksmbd_debug(SMB,
+ "request smb2 create allocate size : %llu\n",
+ alloc_size);
+ smb_break_all_levII_oplock(work, fp, 1);
+ err = vfs_fallocate(fp->filp, FALLOC_FL_KEEP_SIZE, 0,
+ alloc_size);
+ if (err < 0)
+ ksmbd_debug(SMB,
+ "vfs_fallocate is failed : %d\n",
+ err);
+ }
+
+ context = smb2_find_context_vals(req, SMB2_CREATE_QUERY_ON_DISK_ID);
+ if (IS_ERR(context)) {
+ rc = PTR_ERR(context);
+ goto err_out;
+ } else if (context) {
+ ksmbd_debug(SMB, "get query on disk id context\n");
+ query_disk_id = 1;
+ }
+ }
+
+ if (stat.result_mask & STATX_BTIME)
+ fp->create_time = ksmbd_UnixTimeToNT(stat.btime);
+ else
+ fp->create_time = ksmbd_UnixTimeToNT(stat.ctime);
+ if (req->FileAttributes || fp->f_ci->m_fattr == 0)
+ fp->f_ci->m_fattr =
+ cpu_to_le32(smb2_get_dos_mode(&stat, le32_to_cpu(req->FileAttributes)));
+
+ if (!created)
+ smb2_update_xattrs(tcon, &path, fp);
+ else
+ smb2_new_xattrs(tcon, &path, fp);
+
+ memcpy(fp->client_guid, conn->ClientGUID, SMB2_CLIENT_GUID_SIZE);
+
+ generic_fillattr(user_ns, file_inode(fp->filp),
+ &stat);
+
+ rsp->StructureSize = cpu_to_le16(89);
+ rcu_read_lock();
+ opinfo = rcu_dereference(fp->f_opinfo);
+ rsp->OplockLevel = opinfo != NULL ? opinfo->level : 0;
+ rcu_read_unlock();
+ rsp->Reserved = 0;
+ rsp->CreateAction = cpu_to_le32(file_info);
+ rsp->CreationTime = cpu_to_le64(fp->create_time);
+ time = ksmbd_UnixTimeToNT(stat.atime);
+ rsp->LastAccessTime = cpu_to_le64(time);
+ time = ksmbd_UnixTimeToNT(stat.mtime);
+ rsp->LastWriteTime = cpu_to_le64(time);
+ time = ksmbd_UnixTimeToNT(stat.ctime);
+ rsp->ChangeTime = cpu_to_le64(time);
+ rsp->AllocationSize = S_ISDIR(stat.mode) ? 0 :
+ cpu_to_le64(stat.blocks << 9);
+ rsp->EndofFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size);
+ rsp->FileAttributes = fp->f_ci->m_fattr;
+
+ rsp->Reserved2 = 0;
+
+ rsp->PersistentFileId = cpu_to_le64(fp->persistent_id);
+ rsp->VolatileFileId = cpu_to_le64(fp->volatile_id);
+
+ rsp->CreateContextsOffset = 0;
+ rsp->CreateContextsLength = 0;
+ inc_rfc1001_len(rsp_org, 88); /* StructureSize - 1 */
+
+ /* If a lease was requested, send the lease context response */
+ if (opinfo && opinfo->is_lease) {
+ struct create_context *lease_ccontext;
+
+ ksmbd_debug(SMB, "lease granted on(%s) lease state 0x%x\n",
+ name, opinfo->o_lease->state);
+ rsp->OplockLevel = SMB2_OPLOCK_LEVEL_LEASE;
+
+ lease_ccontext = (struct create_context *)rsp->Buffer;
+ contxt_cnt++;
+ create_lease_buf(rsp->Buffer, opinfo->o_lease);
+ le32_add_cpu(&rsp->CreateContextsLength,
+ conn->vals->create_lease_size);
+ inc_rfc1001_len(rsp_org, conn->vals->create_lease_size);
+ next_ptr = &lease_ccontext->Next;
+ next_off = conn->vals->create_lease_size;
+ }
+
+ if (maximal_access_ctxt) {
+ struct create_context *mxac_ccontext;
+
+ if (maximal_access == 0)
+ ksmbd_vfs_query_maximal_access(user_ns,
+ path.dentry,
+ &maximal_access);
+ mxac_ccontext = (struct create_context *)(rsp->Buffer +
+ le32_to_cpu(rsp->CreateContextsLength));
+ contxt_cnt++;
+ create_mxac_rsp_buf(rsp->Buffer +
+ le32_to_cpu(rsp->CreateContextsLength),
+ le32_to_cpu(maximal_access));
+ le32_add_cpu(&rsp->CreateContextsLength,
+ conn->vals->create_mxac_size);
+ inc_rfc1001_len(rsp_org, conn->vals->create_mxac_size);
+ if (next_ptr)
+ *next_ptr = cpu_to_le32(next_off);
+ next_ptr = &mxac_ccontext->Next;
+ next_off = conn->vals->create_mxac_size;
+ }
+
+ if (query_disk_id) {
+ struct create_context *disk_id_ccontext;
+
+ disk_id_ccontext = (struct create_context *)(rsp->Buffer +
+ le32_to_cpu(rsp->CreateContextsLength));
+ contxt_cnt++;
+ create_disk_id_rsp_buf(rsp->Buffer +
+ le32_to_cpu(rsp->CreateContextsLength),
+ stat.ino, tcon->id);
+ le32_add_cpu(&rsp->CreateContextsLength,
+ conn->vals->create_disk_id_size);
+ inc_rfc1001_len(rsp_org, conn->vals->create_disk_id_size);
+ if (next_ptr)
+ *next_ptr = cpu_to_le32(next_off);
+ next_ptr = &disk_id_ccontext->Next;
+ next_off = conn->vals->create_disk_id_size;
+ }
+
+ if (posix_ctxt) {
+ contxt_cnt++;
+ create_posix_rsp_buf(rsp->Buffer +
+ le32_to_cpu(rsp->CreateContextsLength),
+ fp);
+ le32_add_cpu(&rsp->CreateContextsLength,
+ conn->vals->create_posix_size);
+ inc_rfc1001_len(rsp_org, conn->vals->create_posix_size);
+ if (next_ptr)
+ *next_ptr = cpu_to_le32(next_off);
+ }
+
+ if (contxt_cnt > 0) {
+ rsp->CreateContextsOffset =
+ cpu_to_le32(offsetof(struct smb2_create_rsp, Buffer)
+ - 4);
+ }
+
+err_out:
+ if (file_present || created)
+ path_put(&path);
+ ksmbd_revert_fsids(work);
+err_out1:
+ if (rc) {
+ if (rc == -EINVAL)
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ else if (rc == -EOPNOTSUPP)
+ rsp->hdr.Status = STATUS_NOT_SUPPORTED;
+ else if (rc == -EACCES || rc == -ESTALE)
+ rsp->hdr.Status = STATUS_ACCESS_DENIED;
+ else if (rc == -ENOENT)
+ rsp->hdr.Status = STATUS_OBJECT_NAME_INVALID;
+ else if (rc == -EPERM)
+ rsp->hdr.Status = STATUS_SHARING_VIOLATION;
+ else if (rc == -EBUSY)
+ rsp->hdr.Status = STATUS_DELETE_PENDING;
+ else if (rc == -EBADF)
+ rsp->hdr.Status = STATUS_OBJECT_NAME_NOT_FOUND;
+ else if (rc == -ENOEXEC)
+ rsp->hdr.Status = STATUS_DUPLICATE_OBJECTID;
+ else if (rc == -ENXIO)
+ rsp->hdr.Status = STATUS_NO_SUCH_DEVICE;
+ else if (rc == -EEXIST)
+ rsp->hdr.Status = STATUS_OBJECT_NAME_COLLISION;
+ else if (rc == -EMFILE)
+ rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES;
+ if (!rsp->hdr.Status)
+ rsp->hdr.Status = STATUS_UNEXPECTED_IO_ERROR;
+
+ if (!fp || !fp->filename)
+ kfree(name);
+ if (fp)
+ ksmbd_fd_put(work, fp);
+ smb2_set_err_rsp(work);
+ ksmbd_debug(SMB, "Error response: %x\n", rsp->hdr.Status);
+ }
+
+ kfree(lc);
+
+ return 0;
+}
+
+static int readdir_info_level_struct_sz(int info_level)
+{
+ switch (info_level) {
+ case FILE_FULL_DIRECTORY_INFORMATION:
+ return sizeof(struct file_full_directory_info);
+ case FILE_BOTH_DIRECTORY_INFORMATION:
+ return sizeof(struct file_both_directory_info);
+ case FILE_DIRECTORY_INFORMATION:
+ return sizeof(struct file_directory_info);
+ case FILE_NAMES_INFORMATION:
+ return sizeof(struct file_names_info);
+ case FILEID_FULL_DIRECTORY_INFORMATION:
+ return sizeof(struct file_id_full_dir_info);
+ case FILEID_BOTH_DIRECTORY_INFORMATION:
+ return sizeof(struct file_id_both_directory_info);
+ case SMB_FIND_FILE_POSIX_INFO:
+ return sizeof(struct smb2_posix_info);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static int dentry_name(struct ksmbd_dir_info *d_info, int info_level)
+{
+ switch (info_level) {
+ case FILE_FULL_DIRECTORY_INFORMATION:
+ {
+ struct file_full_directory_info *ffdinfo;
+
+ ffdinfo = (struct file_full_directory_info *)d_info->rptr;
+ d_info->rptr += le32_to_cpu(ffdinfo->NextEntryOffset);
+ d_info->name = ffdinfo->FileName;
+ d_info->name_len = le32_to_cpu(ffdinfo->FileNameLength);
+ return 0;
+ }
+ case FILE_BOTH_DIRECTORY_INFORMATION:
+ {
+ struct file_both_directory_info *fbdinfo;
+
+ fbdinfo = (struct file_both_directory_info *)d_info->rptr;
+ d_info->rptr += le32_to_cpu(fbdinfo->NextEntryOffset);
+ d_info->name = fbdinfo->FileName;
+ d_info->name_len = le32_to_cpu(fbdinfo->FileNameLength);
+ return 0;
+ }
+ case FILE_DIRECTORY_INFORMATION:
+ {
+ struct file_directory_info *fdinfo;
+
+ fdinfo = (struct file_directory_info *)d_info->rptr;
+ d_info->rptr += le32_to_cpu(fdinfo->NextEntryOffset);
+ d_info->name = fdinfo->FileName;
+ d_info->name_len = le32_to_cpu(fdinfo->FileNameLength);
+ return 0;
+ }
+ case FILE_NAMES_INFORMATION:
+ {
+ struct file_names_info *fninfo;
+
+ fninfo = (struct file_names_info *)d_info->rptr;
+ d_info->rptr += le32_to_cpu(fninfo->NextEntryOffset);
+ d_info->name = fninfo->FileName;
+ d_info->name_len = le32_to_cpu(fninfo->FileNameLength);
+ return 0;
+ }
+ case FILEID_FULL_DIRECTORY_INFORMATION:
+ {
+ struct file_id_full_dir_info *dinfo;
+
+ dinfo = (struct file_id_full_dir_info *)d_info->rptr;
+ d_info->rptr += le32_to_cpu(dinfo->NextEntryOffset);
+ d_info->name = dinfo->FileName;
+ d_info->name_len = le32_to_cpu(dinfo->FileNameLength);
+ return 0;
+ }
+ case FILEID_BOTH_DIRECTORY_INFORMATION:
+ {
+ struct file_id_both_directory_info *fibdinfo;
+
+ fibdinfo = (struct file_id_both_directory_info *)d_info->rptr;
+ d_info->rptr += le32_to_cpu(fibdinfo->NextEntryOffset);
+ d_info->name = fibdinfo->FileName;
+ d_info->name_len = le32_to_cpu(fibdinfo->FileNameLength);
+ return 0;
+ }
+ case SMB_FIND_FILE_POSIX_INFO:
+ {
+ struct smb2_posix_info *posix_info;
+
+ posix_info = (struct smb2_posix_info *)d_info->rptr;
+ d_info->rptr += le32_to_cpu(posix_info->NextEntryOffset);
+ d_info->name = posix_info->name;
+ d_info->name_len = le32_to_cpu(posix_info->name_len);
+ return 0;
+ }
+ default:
+ return -EINVAL;
+ }
+}
+
+/**
+ * smb2_populate_readdir_entry() - encode directory entry in smb2 response
+ * buffer
+ * @conn: connection instance
+ * @info_level: smb information level
+ * @d_info: structure containing variables for query dir
+ * @user_ns: user namespace
+ * @ksmbd_kstat: ksmbd wrapper of dirent stat information
+ *
+ * If the directory has many entries, a single find-first response can't
+ * return them all; find-next may be called multiple times to read the
+ * remaining entries.
+ *
+ * Return: 0 on success, otherwise error
+ */
+static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level,
+ struct ksmbd_dir_info *d_info,
+ struct user_namespace *user_ns,
+ struct ksmbd_kstat *ksmbd_kstat)
+{
+ int next_entry_offset = 0;
+ char *conv_name;
+ int conv_len;
+ void *kstat;
+ int struct_sz, rc = 0;
+
+ conv_name = ksmbd_convert_dir_info_name(d_info,
+ conn->local_nls,
+ &conv_len);
+ if (!conv_name)
+ return -ENOMEM;
+
+ /* Somehow the name has only terminating NULL bytes */
+ if (conv_len < 0) {
+ rc = -EINVAL;
+ goto free_conv_name;
+ }
+
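+ /* struct_sz already counts one byte of FileName, hence the -1 below; each entry is padded to KSMBD_DIR_INFO_ALIGNMENT */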
+ struct_sz = readdir_info_level_struct_sz(info_level);
+ next_entry_offset = ALIGN(struct_sz - 1 + conv_len,
+ KSMBD_DIR_INFO_ALIGNMENT);
+
+ if (next_entry_offset > d_info->out_buf_len) {
+ d_info->out_buf_len = 0;
+ rc = -ENOSPC;
+ goto free_conv_name;
+ }
+
+ kstat = d_info->wptr;
+ if (info_level != FILE_NAMES_INFORMATION)
+ kstat = ksmbd_vfs_init_kstat(&d_info->wptr, ksmbd_kstat);
+
+ switch (info_level) {
+ case FILE_FULL_DIRECTORY_INFORMATION:
+ {
+ struct file_full_directory_info *ffdinfo;
+
+ ffdinfo = (struct file_full_directory_info *)kstat;
+ ffdinfo->FileNameLength = cpu_to_le32(conv_len);
+ ffdinfo->EaSize =
+ smb2_get_reparse_tag_special_file(ksmbd_kstat->kstat->mode);
+ if (ffdinfo->EaSize)
+ ffdinfo->ExtFileAttributes = ATTR_REPARSE_POINT_LE;
+ if (d_info->hide_dot_file && d_info->name[0] == '.')
+ ffdinfo->ExtFileAttributes |= ATTR_HIDDEN_LE;
+ memcpy(ffdinfo->FileName, conv_name, conv_len);
+ ffdinfo->NextEntryOffset = cpu_to_le32(next_entry_offset);
+ break;
+ }
+ case FILE_BOTH_DIRECTORY_INFORMATION:
+ {
+ struct file_both_directory_info *fbdinfo;
+
+ fbdinfo = (struct file_both_directory_info *)kstat;
+ fbdinfo->FileNameLength = cpu_to_le32(conv_len);
+ fbdinfo->EaSize =
+ smb2_get_reparse_tag_special_file(ksmbd_kstat->kstat->mode);
+ if (fbdinfo->EaSize)
+ fbdinfo->ExtFileAttributes = ATTR_REPARSE_POINT_LE;
+ fbdinfo->ShortNameLength = 0;
+ fbdinfo->Reserved = 0;
+ if (d_info->hide_dot_file && d_info->name[0] == '.')
+ fbdinfo->ExtFileAttributes |= ATTR_HIDDEN_LE;
+ memcpy(fbdinfo->FileName, conv_name, conv_len);
+ fbdinfo->NextEntryOffset = cpu_to_le32(next_entry_offset);
+ break;
+ }
+ case FILE_DIRECTORY_INFORMATION:
+ {
+ struct file_directory_info *fdinfo;
+
+ fdinfo = (struct file_directory_info *)kstat;
+ fdinfo->FileNameLength = cpu_to_le32(conv_len);
+ if (d_info->hide_dot_file && d_info->name[0] == '.')
+ fdinfo->ExtFileAttributes |= ATTR_HIDDEN_LE;
+ memcpy(fdinfo->FileName, conv_name, conv_len);
+ fdinfo->NextEntryOffset = cpu_to_le32(next_entry_offset);
+ break;
+ }
+ case FILE_NAMES_INFORMATION:
+ {
+ struct file_names_info *fninfo;
+
+ fninfo = (struct file_names_info *)kstat;
+ fninfo->FileNameLength = cpu_to_le32(conv_len);
+ memcpy(fninfo->FileName, conv_name, conv_len);
+ fninfo->NextEntryOffset = cpu_to_le32(next_entry_offset);
+ break;
+ }
+ case FILEID_FULL_DIRECTORY_INFORMATION:
+ {
+ struct file_id_full_dir_info *dinfo;
+
+ dinfo = (struct file_id_full_dir_info *)kstat;
+ dinfo->FileNameLength = cpu_to_le32(conv_len);
+ dinfo->EaSize =
+ smb2_get_reparse_tag_special_file(ksmbd_kstat->kstat->mode);
+ if (dinfo->EaSize)
+ dinfo->ExtFileAttributes = ATTR_REPARSE_POINT_LE;
+ dinfo->Reserved = 0;
+ dinfo->UniqueId = cpu_to_le64(ksmbd_kstat->kstat->ino);
+ if (d_info->hide_dot_file && d_info->name[0] == '.')
+ dinfo->ExtFileAttributes |= ATTR_HIDDEN_LE;
+ memcpy(dinfo->FileName, conv_name, conv_len);
+ dinfo->NextEntryOffset = cpu_to_le32(next_entry_offset);
+ break;
+ }
+ case FILEID_BOTH_DIRECTORY_INFORMATION:
+ {
+ struct file_id_both_directory_info *fibdinfo;
+
+ fibdinfo = (struct file_id_both_directory_info *)kstat;
+ fibdinfo->FileNameLength = cpu_to_le32(conv_len);
+ fibdinfo->EaSize =
+ smb2_get_reparse_tag_special_file(ksmbd_kstat->kstat->mode);
+ if (fibdinfo->EaSize)
+ fibdinfo->ExtFileAttributes = ATTR_REPARSE_POINT_LE;
+ fibdinfo->UniqueId = cpu_to_le64(ksmbd_kstat->kstat->ino);
+ fibdinfo->ShortNameLength = 0;
+ fibdinfo->Reserved = 0;
+ fibdinfo->Reserved2 = cpu_to_le16(0);
+ if (d_info->hide_dot_file && d_info->name[0] == '.')
+ fibdinfo->ExtFileAttributes |= ATTR_HIDDEN_LE;
+ memcpy(fibdinfo->FileName, conv_name, conv_len);
+ fibdinfo->NextEntryOffset = cpu_to_le32(next_entry_offset);
+ break;
+ }
+ case SMB_FIND_FILE_POSIX_INFO:
+ {
+ struct smb2_posix_info *posix_info;
+ u64 time;
+
+ posix_info = (struct smb2_posix_info *)kstat;
+ posix_info->Ignored = 0;
+ posix_info->CreationTime = cpu_to_le64(ksmbd_kstat->create_time);
+ time = ksmbd_UnixTimeToNT(ksmbd_kstat->kstat->ctime);
+ posix_info->ChangeTime = cpu_to_le64(time);
+ time = ksmbd_UnixTimeToNT(ksmbd_kstat->kstat->atime);
+ posix_info->LastAccessTime = cpu_to_le64(time);
+ time = ksmbd_UnixTimeToNT(ksmbd_kstat->kstat->mtime);
+ posix_info->LastWriteTime = cpu_to_le64(time);
+ posix_info->EndOfFile = cpu_to_le64(ksmbd_kstat->kstat->size);
+ posix_info->AllocationSize = cpu_to_le64(ksmbd_kstat->kstat->blocks << 9);
+ posix_info->DeviceId = cpu_to_le32(ksmbd_kstat->kstat->rdev);
+ posix_info->HardLinks = cpu_to_le32(ksmbd_kstat->kstat->nlink);
+ posix_info->Mode = cpu_to_le32(ksmbd_kstat->kstat->mode);
+ posix_info->Inode = cpu_to_le64(ksmbd_kstat->kstat->ino);
+ posix_info->DosAttributes =
+ S_ISDIR(ksmbd_kstat->kstat->mode) ? ATTR_DIRECTORY_LE : ATTR_ARCHIVE_LE;
+ if (d_info->hide_dot_file && d_info->name[0] == '.')
+ posix_info->DosAttributes |= ATTR_HIDDEN_LE;
+ id_to_sid(from_kuid(user_ns, ksmbd_kstat->kstat->uid),
+ SIDNFS_USER, (struct smb_sid *)&posix_info->SidBuffer[0]);
+ id_to_sid(from_kgid(user_ns, ksmbd_kstat->kstat->gid),
+ SIDNFS_GROUP, (struct smb_sid *)&posix_info->SidBuffer[20]);
+ memcpy(posix_info->name, conv_name, conv_len);
+ posix_info->name_len = cpu_to_le32(conv_len);
+ posix_info->NextEntryOffset = cpu_to_le32(next_entry_offset);
+ break;
+ }
+
+ } /* switch (info_level) */
+
+ d_info->last_entry_offset = d_info->data_count;
+ d_info->data_count += next_entry_offset;
+ d_info->out_buf_len -= next_entry_offset;
+ d_info->wptr += next_entry_offset;
+
+ ksmbd_debug(SMB,
+ "info_level : %d, buf_len :%d, next_offset : %d, data_count : %d\n",
+ info_level, d_info->out_buf_len,
+ next_entry_offset, d_info->data_count);
+
+free_conv_name:
+ kfree(conv_name);
+ return rc;
+}
+
+struct smb2_query_dir_private {
+ struct ksmbd_work *work;
+ char *search_pattern;
+ struct ksmbd_file *dir_fp;
+
+ struct ksmbd_dir_info *d_info;
+ int info_level;
+};
+
+static void lock_dir(struct ksmbd_file *dir_fp)
+{
+ struct dentry *dir = dir_fp->filp->f_path.dentry;
+
+ inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
+}
+
+static void unlock_dir(struct ksmbd_file *dir_fp)
+{
+ struct dentry *dir = dir_fp->filp->f_path.dentry;
+
+ inode_unlock(d_inode(dir));
+}
+
+static int process_query_dir_entries(struct smb2_query_dir_private *priv)
+{
+ struct user_namespace *user_ns = file_mnt_user_ns(priv->dir_fp->filp);
+ struct kstat kstat;
+ struct ksmbd_kstat ksmbd_kstat;
+ int rc;
+ int i;
+
+ for (i = 0; i < priv->d_info->num_entry; i++) {
+ struct dentry *dent;
+
+ if (dentry_name(priv->d_info, priv->info_level))
+ return -EINVAL;
+
+ lock_dir(priv->dir_fp);
+ dent = lookup_one_len(priv->d_info->name,
+ priv->dir_fp->filp->f_path.dentry,
+ priv->d_info->name_len);
+ unlock_dir(priv->dir_fp);
+
+ if (IS_ERR(dent)) {
+ ksmbd_debug(SMB, "Cannot lookup `%s' [%ld]\n",
+ priv->d_info->name,
+ PTR_ERR(dent));
+ continue;
+ }
+ if (unlikely(d_is_negative(dent))) {
+ dput(dent);
+ ksmbd_debug(SMB, "Negative dentry `%s'\n",
+ priv->d_info->name);
+ continue;
+ }
+
+ ksmbd_kstat.kstat = &kstat;
+ if (priv->info_level != FILE_NAMES_INFORMATION)
+ ksmbd_vfs_fill_dentry_attrs(priv->work,
+ user_ns,
+ dent,
+ &ksmbd_kstat);
+
+ rc = smb2_populate_readdir_entry(priv->work->conn,
+ priv->info_level,
+ priv->d_info,
+ user_ns,
+ &ksmbd_kstat);
+ dput(dent);
+ if (rc)
+ return rc;
+ }
+ return 0;
+}
+
+static int reserve_populate_dentry(struct ksmbd_dir_info *d_info,
+ int info_level)
+{
+ int struct_sz;
+ int conv_len;
+ int next_entry_offset;
+
+ struct_sz = readdir_info_level_struct_sz(info_level);
+ if (struct_sz == -EOPNOTSUPP)
+ return -EOPNOTSUPP;
+
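+ /* Reserve the worst-case UTF-16 length (name plus terminator); the real conversion happens later in smb2_populate_readdir_entry() */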
+ conv_len = (d_info->name_len + 1) * 2;
+ next_entry_offset = ALIGN(struct_sz - 1 + conv_len,
+ KSMBD_DIR_INFO_ALIGNMENT);
+
+ if (next_entry_offset > d_info->out_buf_len) {
+ d_info->out_buf_len = 0;
+ return -ENOSPC;
+ }
+
+ switch (info_level) {
+ case FILE_FULL_DIRECTORY_INFORMATION:
+ {
+ struct file_full_directory_info *ffdinfo;
+
+ ffdinfo = (struct file_full_directory_info *)d_info->wptr;
+ memcpy(ffdinfo->FileName, d_info->name, d_info->name_len);
+ ffdinfo->FileName[d_info->name_len] = 0x00;
+ ffdinfo->FileNameLength = cpu_to_le32(d_info->name_len);
+ ffdinfo->NextEntryOffset = cpu_to_le32(next_entry_offset);
+ break;
+ }
+ case FILE_BOTH_DIRECTORY_INFORMATION:
+ {
+ struct file_both_directory_info *fbdinfo;
+
+ fbdinfo = (struct file_both_directory_info *)d_info->wptr;
+ memcpy(fbdinfo->FileName, d_info->name, d_info->name_len);
+ fbdinfo->FileName[d_info->name_len] = 0x00;
+ fbdinfo->FileNameLength = cpu_to_le32(d_info->name_len);
+ fbdinfo->NextEntryOffset = cpu_to_le32(next_entry_offset);
+ break;
+ }
+ case FILE_DIRECTORY_INFORMATION:
+ {
+ struct file_directory_info *fdinfo;
+
+ fdinfo = (struct file_directory_info *)d_info->wptr;
+ memcpy(fdinfo->FileName, d_info->name, d_info->name_len);
+ fdinfo->FileName[d_info->name_len] = 0x00;
+ fdinfo->FileNameLength = cpu_to_le32(d_info->name_len);
+ fdinfo->NextEntryOffset = cpu_to_le32(next_entry_offset);
+ break;
+ }
+ case FILE_NAMES_INFORMATION:
+ {
+ struct file_names_info *fninfo;
+
+ fninfo = (struct file_names_info *)d_info->wptr;
+ memcpy(fninfo->FileName, d_info->name, d_info->name_len);
+ fninfo->FileName[d_info->name_len] = 0x00;
+ fninfo->FileNameLength = cpu_to_le32(d_info->name_len);
+ fninfo->NextEntryOffset = cpu_to_le32(next_entry_offset);
+ break;
+ }
+ case FILEID_FULL_DIRECTORY_INFORMATION:
+ {
+ struct file_id_full_dir_info *dinfo;
+
+ dinfo = (struct file_id_full_dir_info *)d_info->wptr;
+ memcpy(dinfo->FileName, d_info->name, d_info->name_len);
+ dinfo->FileName[d_info->name_len] = 0x00;
+ dinfo->FileNameLength = cpu_to_le32(d_info->name_len);
+ dinfo->NextEntryOffset = cpu_to_le32(next_entry_offset);
+ break;
+ }
+ case FILEID_BOTH_DIRECTORY_INFORMATION:
+ {
+ struct file_id_both_directory_info *fibdinfo;
+
+ fibdinfo = (struct file_id_both_directory_info *)d_info->wptr;
+ memcpy(fibdinfo->FileName, d_info->name, d_info->name_len);
+ fibdinfo->FileName[d_info->name_len] = 0x00;
+ fibdinfo->FileNameLength = cpu_to_le32(d_info->name_len);
+ fibdinfo->NextEntryOffset = cpu_to_le32(next_entry_offset);
+ break;
+ }
+ case SMB_FIND_FILE_POSIX_INFO:
+ {
+ struct smb2_posix_info *posix_info;
+
+ posix_info = (struct smb2_posix_info *)d_info->wptr;
+ memcpy(posix_info->name, d_info->name, d_info->name_len);
+ posix_info->name[d_info->name_len] = 0x00;
+ posix_info->name_len = cpu_to_le32(d_info->name_len);
+ posix_info->NextEntryOffset =
+ cpu_to_le32(next_entry_offset);
+ break;
+ }
+ } /* switch (info_level) */
+
+ d_info->num_entry++;
+ d_info->out_buf_len -= next_entry_offset;
+ d_info->wptr += next_entry_offset;
+ return 0;
+}
+
+static int __query_dir(struct dir_context *ctx, const char *name, int namlen,
+ loff_t offset, u64 ino, unsigned int d_type)
+{
+ struct ksmbd_readdir_data *buf;
+ struct smb2_query_dir_private *priv;
+ struct ksmbd_dir_info *d_info;
+ int rc;
+
+ buf = container_of(ctx, struct ksmbd_readdir_data, ctx);
+ priv = buf->private;
+ d_info = priv->d_info;
+
+ /* dot and dotdot entries are already reserved */
+ if (!strcmp(".", name) || !strcmp("..", name))
+ return 0;
+ if (ksmbd_share_veto_filename(priv->work->tcon->share_conf, name))
+ return 0;
+ if (!match_pattern(name, namlen, priv->search_pattern))
+ return 0;
+
+ d_info->name = name;
+ d_info->name_len = namlen;
+ rc = reserve_populate_dentry(d_info, priv->info_level);
+ if (rc)
+ return rc;
+ if (d_info->flags & SMB2_RETURN_SINGLE_ENTRY) {
+ d_info->out_buf_len = 0;
+ return 0;
+ }
+ return 0;
+}
+
+static void restart_ctx(struct dir_context *ctx)
+{
+ ctx->pos = 0;
+}
+
+static int verify_info_level(int info_level)
+{
+ switch (info_level) {
+ case FILE_FULL_DIRECTORY_INFORMATION:
+ case FILE_BOTH_DIRECTORY_INFORMATION:
+ case FILE_DIRECTORY_INFORMATION:
+ case FILE_NAMES_INFORMATION:
+ case FILEID_FULL_DIRECTORY_INFORMATION:
+ case FILEID_BOTH_DIRECTORY_INFORMATION:
+ case SMB_FIND_FILE_POSIX_INFO:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+int smb2_query_dir(struct ksmbd_work *work)
+{
+ struct ksmbd_conn *conn = work->conn;
+ struct smb2_query_directory_req *req;
+ struct smb2_query_directory_rsp *rsp, *rsp_org;
+ struct ksmbd_share_config *share = work->tcon->share_conf;
+ struct ksmbd_file *dir_fp = NULL;
+ struct ksmbd_dir_info d_info;
+ int rc = 0;
+ char *srch_ptr = NULL;
+ unsigned char srch_flag;
+ int buffer_sz;
+ struct smb2_query_dir_private query_dir_private = {NULL, };
+
+ rsp_org = work->response_buf;
+ WORK_BUFFERS(work, req, rsp);
+
+ if (ksmbd_override_fsids(work)) {
+ rsp->hdr.Status = STATUS_NO_MEMORY;
+ smb2_set_err_rsp(work);
+ return -ENOMEM;
+ }
+
+ rc = verify_info_level(req->FileInformationClass);
+ if (rc) {
+ rc = -EFAULT;
+ goto err_out2;
+ }
+
+ dir_fp = ksmbd_lookup_fd_slow(work,
+ le64_to_cpu(req->VolatileFileId),
+ le64_to_cpu(req->PersistentFileId));
+ if (!dir_fp) {
+ rc = -EBADF;
+ goto err_out2;
+ }
+
+ if (!(dir_fp->daccess & FILE_LIST_DIRECTORY_LE) ||
+ inode_permission(file_mnt_user_ns(dir_fp->filp),
+ file_inode(dir_fp->filp),
+ MAY_READ | MAY_EXEC)) {
+ pr_err("no right to enumerate directory (%pd)\n",
+ dir_fp->filp->f_path.dentry);
+ rc = -EACCES;
+ goto err_out2;
+ }
+
+ if (!S_ISDIR(file_inode(dir_fp->filp)->i_mode)) {
+ pr_err("can't do query dir for a file\n");
+ rc = -EINVAL;
+ goto err_out2;
+ }
+
+ srch_flag = req->Flags;
+ srch_ptr = smb_strndup_from_utf16(req->Buffer,
+ le16_to_cpu(req->FileNameLength), 1,
+ conn->local_nls);
+ if (IS_ERR(srch_ptr)) {
+ ksmbd_debug(SMB, "Search Pattern not found\n");
+ rc = -EINVAL;
+ goto err_out2;
+ } else {
+ ksmbd_debug(SMB, "Search pattern is %s\n", srch_ptr);
+ }
+
+ ksmbd_debug(SMB, "Directory name is %s\n", dir_fp->filename);
+
+ if (srch_flag & SMB2_REOPEN || srch_flag & SMB2_RESTART_SCANS) {
+ ksmbd_debug(SMB, "Restart directory scan\n");
+ generic_file_llseek(dir_fp->filp, 0, SEEK_SET);
+ restart_ctx(&dir_fp->readdir_data.ctx);
+ }
+
+ memset(&d_info, 0, sizeof(struct ksmbd_dir_info));
+ d_info.wptr = (char *)rsp->Buffer;
+ d_info.rptr = (char *)rsp->Buffer;
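+ /* Output space is limited by both our response buffer and the client's OutputBufferLength */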
+ d_info.out_buf_len = (work->response_sz - (get_rfc1002_len(rsp_org) + 4));
+ d_info.out_buf_len = min_t(int, d_info.out_buf_len, le32_to_cpu(req->OutputBufferLength)) -
+ sizeof(struct smb2_query_directory_rsp);
+ d_info.flags = srch_flag;
+
+ /*
+ * reserve dot and dotdot entries at the head of the buffer
+ * in the first response
+ */
+ rc = ksmbd_populate_dot_dotdot_entries(work, req->FileInformationClass,
+ dir_fp, &d_info, srch_ptr,
+ smb2_populate_readdir_entry);
+ if (rc == -ENOSPC)
+ rc = 0;
+ else if (rc)
+ goto err_out;
+
+ if (test_share_config_flag(share, KSMBD_SHARE_FLAG_HIDE_DOT_FILES))
+ d_info.hide_dot_file = true;
+
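+ /* Two passes: iterate_dir() first reserves entries by name via __query_dir(), then process_query_dir_entries() looks up each dentry and fills in its attributes */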
+ buffer_sz = d_info.out_buf_len;
+ d_info.rptr = d_info.wptr;
+ query_dir_private.work = work;
+ query_dir_private.search_pattern = srch_ptr;
+ query_dir_private.dir_fp = dir_fp;
+ query_dir_private.d_info = &d_info;
+ query_dir_private.info_level = req->FileInformationClass;
+ dir_fp->readdir_data.private = &query_dir_private;
+ set_ctx_actor(&dir_fp->readdir_data.ctx, __query_dir);
+
+ rc = iterate_dir(dir_fp->filp, &dir_fp->readdir_data.ctx);
+ if (rc == 0)
+ restart_ctx(&dir_fp->readdir_data.ctx);
+ if (rc == -ENOSPC)
+ rc = 0;
+ if (rc)
+ goto err_out;
+
+ d_info.wptr = d_info.rptr;
+ d_info.out_buf_len = buffer_sz;
+ rc = process_query_dir_entries(&query_dir_private);
+ if (rc)
+ goto err_out;
+
+ if (!d_info.data_count && d_info.out_buf_len >= 0) {
+ if (srch_flag & SMB2_RETURN_SINGLE_ENTRY && !is_asterisk(srch_ptr)) {
+ rsp->hdr.Status = STATUS_NO_SUCH_FILE;
+ } else {
+ dir_fp->dot_dotdot[0] = dir_fp->dot_dotdot[1] = 0;
+ rsp->hdr.Status = STATUS_NO_MORE_FILES;
+ }
+ rsp->StructureSize = cpu_to_le16(9);
+ rsp->OutputBufferOffset = cpu_to_le16(0);
+ rsp->OutputBufferLength = cpu_to_le32(0);
+ rsp->Buffer[0] = 0;
+ inc_rfc1001_len(rsp_org, 9);
+ } else {
+ ((struct file_directory_info *)
+ ((char *)rsp->Buffer + d_info.last_entry_offset))
+ ->NextEntryOffset = 0;
+
+ rsp->StructureSize = cpu_to_le16(9);
+ rsp->OutputBufferOffset = cpu_to_le16(72);
+ rsp->OutputBufferLength = cpu_to_le32(d_info.data_count);
+ inc_rfc1001_len(rsp_org, 8 + d_info.data_count);
+ }
+
+ kfree(srch_ptr);
+ ksmbd_fd_put(work, dir_fp);
+ ksmbd_revert_fsids(work);
+ return 0;
+
+err_out:
+ pr_err("error while processing smb2 query dir rc = %d\n", rc);
+ kfree(srch_ptr);
+
+err_out2:
+ if (rc == -EINVAL)
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ else if (rc == -EACCES)
+ rsp->hdr.Status = STATUS_ACCESS_DENIED;
+ else if (rc == -ENOENT)
+ rsp->hdr.Status = STATUS_NO_SUCH_FILE;
+ else if (rc == -EBADF)
+ rsp->hdr.Status = STATUS_FILE_CLOSED;
+ else if (rc == -ENOMEM)
+ rsp->hdr.Status = STATUS_NO_MEMORY;
+ else if (rc == -EFAULT)
+ rsp->hdr.Status = STATUS_INVALID_INFO_CLASS;
+ if (!rsp->hdr.Status)
+ rsp->hdr.Status = STATUS_UNEXPECTED_IO_ERROR;
+
+ smb2_set_err_rsp(work);
+ ksmbd_fd_put(work, dir_fp);
+ ksmbd_revert_fsids(work);
+ return 0;
+}
+
+/**
+ * buffer_check_err() - helper function to check buffer errors
+ * @reqOutputBufferLength: max buffer length expected in command response
+ * @rsp: query info response buffer containing the output buffer length
+ * @infoclass_size: query info class response buffer size
+ *
+ * Return: 0 on success, otherwise error
+ */
+static int buffer_check_err(int reqOutputBufferLength,
+ struct smb2_query_info_rsp *rsp, int infoclass_size)
+{
+ if (reqOutputBufferLength < le32_to_cpu(rsp->OutputBufferLength)) {
+ if (reqOutputBufferLength < infoclass_size) {
+ pr_err("Invalid Buffer Size Requested\n");
+ rsp->hdr.Status = STATUS_INFO_LENGTH_MISMATCH;
+ rsp->hdr.smb2_buf_length = cpu_to_be32(sizeof(struct smb2_hdr) - 4);
+ return -EINVAL;
+ }
+
+ ksmbd_debug(SMB, "Buffer Overflow\n");
+ rsp->hdr.Status = STATUS_BUFFER_OVERFLOW;
+ rsp->hdr.smb2_buf_length = cpu_to_be32(sizeof(struct smb2_hdr) - 4 +
+ reqOutputBufferLength);
+ rsp->OutputBufferLength = cpu_to_le32(reqOutputBufferLength);
+ }
+ return 0;
+}
+
+static void get_standard_info_pipe(struct smb2_query_info_rsp *rsp)
+{
+ struct smb2_file_standard_info *sinfo;
+
+ sinfo = (struct smb2_file_standard_info *)rsp->Buffer;
+
+ sinfo->AllocationSize = cpu_to_le64(4096);
+ sinfo->EndOfFile = cpu_to_le64(0);
+ sinfo->NumberOfLinks = cpu_to_le32(1);
+ sinfo->DeletePending = 1;
+ sinfo->Directory = 0;
+ rsp->OutputBufferLength =
+ cpu_to_le32(sizeof(struct smb2_file_standard_info));
+ inc_rfc1001_len(rsp, sizeof(struct smb2_file_standard_info));
+}
+
+static void get_internal_info_pipe(struct smb2_query_info_rsp *rsp, u64 num)
+{
+ struct smb2_file_internal_info *file_info;
+
+ file_info = (struct smb2_file_internal_info *)rsp->Buffer;
+
+ /* any unique number */
+ file_info->IndexNumber = cpu_to_le64(num | (1ULL << 63));
+ rsp->OutputBufferLength =
+ cpu_to_le32(sizeof(struct smb2_file_internal_info));
+ inc_rfc1001_len(rsp, sizeof(struct smb2_file_internal_info));
+}
+
+static int smb2_get_info_file_pipe(struct ksmbd_session *sess,
+ struct smb2_query_info_req *req,
+ struct smb2_query_info_rsp *rsp)
+{
+ u64 id;
+ int rc;
+
+ /*
+ * Windows can sometimes send a query file info request on a
+ * pipe without opening it; check that error condition here.
+ */
+ id = le64_to_cpu(req->VolatileFileId);
+ if (!ksmbd_session_rpc_method(sess, id))
+ return -ENOENT;
+
+ ksmbd_debug(SMB, "FileInfoClass %u, FileId 0x%llx\n",
+ req->FileInfoClass, le64_to_cpu(req->VolatileFileId));
+
+ switch (req->FileInfoClass) {
+ case FILE_STANDARD_INFORMATION:
+ get_standard_info_pipe(rsp);
+ rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength),
+ rsp, FILE_STANDARD_INFORMATION_SIZE);
+ break;
+ case FILE_INTERNAL_INFORMATION:
+ get_internal_info_pipe(rsp, id);
+ rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength),
+ rsp, FILE_INTERNAL_INFORMATION_SIZE);
+ break;
+ default:
+ ksmbd_debug(SMB, "smb2_info_file_pipe for %u not supported\n",
+ req->FileInfoClass);
+ rc = -EOPNOTSUPP;
+ }
+ return rc;
+}
+
+/**
+ * smb2_get_ea() - handler for smb2 get extended attribute command
+ * @work: smb work containing query info command buffer
+ * @fp: ksmbd_file pointer
+ * @req: get extended attribute request
+ * @rsp: response buffer pointer
+ * @rsp_org: base response buffer pointer in case of chained response
+ *
+ * Return: 0 on success, otherwise error
+ */
+static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp,
+ struct smb2_query_info_req *req,
+ struct smb2_query_info_rsp *rsp, void *rsp_org)
+{
+ struct smb2_ea_info *eainfo, *prev_eainfo;
+ char *name, *ptr, *xattr_list = NULL, *buf;
+ int rc, name_len, value_len, xattr_list_len, idx;
+ ssize_t buf_free_len, alignment_bytes, next_offset, rsp_data_cnt = 0;
+ struct smb2_ea_info_req *ea_req = NULL;
+ struct path *path;
+ struct user_namespace *user_ns = file_mnt_user_ns(fp->filp);
+
+ if (!(fp->daccess & FILE_READ_EA_LE)) {
+ pr_err("Not permitted to read ext attr : 0x%x\n",
+ fp->daccess);
+ return -EACCES;
+ }
+
+ path = &fp->filp->f_path;
+ /* single EA entry is requested with given user.* name */
+ if (req->InputBufferLength) {
+ ea_req = (struct smb2_ea_info_req *)req->Buffer;
+ } else {
+ /* need to send all EAs, if no specific EA is requested */
+ if (le32_to_cpu(req->Flags) & SL_RETURN_SINGLE_ENTRY)
+ ksmbd_debug(SMB,
+ "All EAs are requested but need to send single EA entry in rsp flags 0x%x\n",
+ le32_to_cpu(req->Flags));
+ }
+
+ buf_free_len = work->response_sz -
+ (get_rfc1002_len(rsp_org) + 4) -
+ sizeof(struct smb2_query_info_rsp);
+
+ if (le32_to_cpu(req->OutputBufferLength) < buf_free_len)
+ buf_free_len = le32_to_cpu(req->OutputBufferLength);
+
+ rc = ksmbd_vfs_listxattr(path->dentry, &xattr_list);
+ if (rc < 0) {
+ rsp->hdr.Status = STATUS_INVALID_HANDLE;
+ goto out;
+ } else if (!rc) { /* there is no EA in the file */
+ ksmbd_debug(SMB, "no ea data in the file\n");
+ goto done;
+ }
+ xattr_list_len = rc;
+
+ ptr = (char *)rsp->Buffer;
+ eainfo = (struct smb2_ea_info *)ptr;
+ prev_eainfo = eainfo;
+ idx = 0;
+
+ while (idx < xattr_list_len) {
+ name = xattr_list + idx;
+ name_len = strlen(name);
+
+ ksmbd_debug(SMB, "%s, len %d\n", name, name_len);
+ idx += name_len + 1;
+
+ /*
+ * CIFS does not support EAs outside the user.* namespace;
+ * still, keep the framework generic so that other attrs
+ * can be listed in the future.
+ */
+ if (strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
+ continue;
+
+ if (!strncmp(&name[XATTR_USER_PREFIX_LEN], STREAM_PREFIX,
+ STREAM_PREFIX_LEN))
+ continue;
+
+ if (req->InputBufferLength &&
+ strncmp(&name[XATTR_USER_PREFIX_LEN], ea_req->name,
+ ea_req->EaNameLength))
+ continue;
+
+ if (!strncmp(&name[XATTR_USER_PREFIX_LEN],
+ DOS_ATTRIBUTE_PREFIX, DOS_ATTRIBUTE_PREFIX_LEN))
+ continue;
+
+ if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
+ name_len -= XATTR_USER_PREFIX_LEN;
+
+ ptr = (char *)(&eainfo->name + name_len + 1);
+ buf_free_len -= (offsetof(struct smb2_ea_info, name) +
+ name_len + 1);
+ /* bail out if the xattr can't fit in buf_free_len */
+ value_len = ksmbd_vfs_getxattr(user_ns, path->dentry,
+ name, &buf);
+ if (value_len <= 0) {
+ rc = -ENOENT;
+ rsp->hdr.Status = STATUS_INVALID_HANDLE;
+ goto out;
+ }
+
+ buf_free_len -= value_len;
+ if (buf_free_len < 0) {
+ kfree(buf);
+ break;
+ }
+
+ memcpy(ptr, buf, value_len);
+ kfree(buf);
+
+ ptr += value_len;
+ eainfo->Flags = 0;
+ eainfo->EaNameLength = name_len;
+
+ if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
+ memcpy(eainfo->name, &name[XATTR_USER_PREFIX_LEN],
+ name_len);
+ else
+ memcpy(eainfo->name, name, name_len);
+
+ eainfo->name[name_len] = '\0';
+ eainfo->EaValueLength = cpu_to_le16(value_len);
+ next_offset = offsetof(struct smb2_ea_info, name) +
+ name_len + 1 + value_len;
+
+ /* align next xattr entry on a 4 byte boundary */
+ alignment_bytes = ((next_offset + 3) & ~3) - next_offset;
+ if (alignment_bytes) {
+ memset(ptr, '\0', alignment_bytes);
+ ptr += alignment_bytes;
+ next_offset += alignment_bytes;
+ buf_free_len -= alignment_bytes;
+ }
+ eainfo->NextEntryOffset = cpu_to_le32(next_offset);
+ prev_eainfo = eainfo;
+ eainfo = (struct smb2_ea_info *)ptr;
+ rsp_data_cnt += next_offset;
+
+ if (req->InputBufferLength) {
+ ksmbd_debug(SMB, "single entry requested\n");
+ break;
+ }
+ }
+
+ /* no more ea entries */
+ prev_eainfo->NextEntryOffset = 0;
+done:
+ rc = 0;
+ if (rsp_data_cnt == 0)
+ rsp->hdr.Status = STATUS_NO_EAS_ON_FILE;
+ rsp->OutputBufferLength = cpu_to_le32(rsp_data_cnt);
+ inc_rfc1001_len(rsp_org, rsp_data_cnt);
+out:
+ kvfree(xattr_list);
+ return rc;
+}
+
+static void get_file_access_info(struct smb2_query_info_rsp *rsp,
+ struct ksmbd_file *fp, void *rsp_org)
+{
+ struct smb2_file_access_info *file_info;
+
+ file_info = (struct smb2_file_access_info *)rsp->Buffer;
+ file_info->AccessFlags = fp->daccess;
+ rsp->OutputBufferLength =
+ cpu_to_le32(sizeof(struct smb2_file_access_info));
+ inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_access_info));
+}
+
+static int get_file_basic_info(struct smb2_query_info_rsp *rsp,
+ struct ksmbd_file *fp, void *rsp_org)
+{
+ struct smb2_file_all_info *basic_info;
+ struct kstat stat;
+ u64 time;
+
+ if (!(fp->daccess & FILE_READ_ATTRIBUTES_LE)) {
+ pr_err("no right to read the attributes : 0x%x\n",
+ fp->daccess);
+ return -EACCES;
+ }
+
+ basic_info = (struct smb2_file_all_info *)rsp->Buffer;
+ generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp),
+ &stat);
+ basic_info->CreationTime = cpu_to_le64(fp->create_time);
+ time = ksmbd_UnixTimeToNT(stat.atime);
+ basic_info->LastAccessTime = cpu_to_le64(time);
+ time = ksmbd_UnixTimeToNT(stat.mtime);
+ basic_info->LastWriteTime = cpu_to_le64(time);
+ time = ksmbd_UnixTimeToNT(stat.ctime);
+ basic_info->ChangeTime = cpu_to_le64(time);
+ basic_info->Attributes = fp->f_ci->m_fattr;
+ basic_info->Pad1 = 0;
+ rsp->OutputBufferLength =
+ cpu_to_le32(offsetof(struct smb2_file_all_info, AllocationSize));
+ inc_rfc1001_len(rsp_org, offsetof(struct smb2_file_all_info,
+ AllocationSize));
+ return 0;
+}
+
+static unsigned long long get_allocation_size(struct inode *inode,
+ struct kstat *stat)
+{
+ unsigned long long alloc_size = 0;
+
+ if (!S_ISDIR(stat->mode)) {
+ if ((inode->i_blocks << 9) <= stat->size)
+ alloc_size = stat->size;
+ else
+ alloc_size = inode->i_blocks << 9;
+ }
+
+ return alloc_size;
+}
+
+static void get_file_standard_info(struct smb2_query_info_rsp *rsp,
+ struct ksmbd_file *fp, void *rsp_org)
+{
+ struct smb2_file_standard_info *sinfo;
+ unsigned int delete_pending;
+ struct inode *inode;
+ struct kstat stat;
+
+ inode = file_inode(fp->filp);
+ generic_fillattr(file_mnt_user_ns(fp->filp), inode, &stat);
+
+ sinfo = (struct smb2_file_standard_info *)rsp->Buffer;
+ delete_pending = ksmbd_inode_pending_delete(fp);
+
+ sinfo->AllocationSize = cpu_to_le64(get_allocation_size(inode, &stat));
+ sinfo->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size);
+ sinfo->NumberOfLinks = cpu_to_le32(get_nlink(&stat) - delete_pending);
+ sinfo->DeletePending = delete_pending;
+ sinfo->Directory = S_ISDIR(stat.mode) ? 1 : 0;
+ rsp->OutputBufferLength =
+ cpu_to_le32(sizeof(struct smb2_file_standard_info));
+ inc_rfc1001_len(rsp_org,
+ sizeof(struct smb2_file_standard_info));
+}
+
+static void get_file_alignment_info(struct smb2_query_info_rsp *rsp,
+ void *rsp_org)
+{
+ struct smb2_file_alignment_info *file_info;
+
+ file_info = (struct smb2_file_alignment_info *)rsp->Buffer;
+ file_info->AlignmentRequirement = 0;
+ rsp->OutputBufferLength =
+ cpu_to_le32(sizeof(struct smb2_file_alignment_info));
+ inc_rfc1001_len(rsp_org,
+ sizeof(struct smb2_file_alignment_info));
+}
+
+static int get_file_all_info(struct ksmbd_work *work,
+ struct smb2_query_info_rsp *rsp,
+ struct ksmbd_file *fp,
+ void *rsp_org)
+{
+ struct ksmbd_conn *conn = work->conn;
+ struct smb2_file_all_info *file_info;
+ unsigned int delete_pending;
+ struct inode *inode;
+ struct kstat stat;
+ int conv_len;
+ char *filename;
+ u64 time;
+
+ if (!(fp->daccess & FILE_READ_ATTRIBUTES_LE)) {
+ ksmbd_debug(SMB, "no right to read the attributes : 0x%x\n",
+ fp->daccess);
+ return -EACCES;
+ }
+
+ filename = convert_to_nt_pathname(fp->filename,
+ work->tcon->share_conf->path);
+ if (!filename)
+ return -ENOMEM;
+
+ inode = file_inode(fp->filp);
+ generic_fillattr(file_mnt_user_ns(fp->filp), inode, &stat);
+
+ ksmbd_debug(SMB, "filename = %s\n", filename);
+ delete_pending = ksmbd_inode_pending_delete(fp);
+ file_info = (struct smb2_file_all_info *)rsp->Buffer;
+
+ file_info->CreationTime = cpu_to_le64(fp->create_time);
+ time = ksmbd_UnixTimeToNT(stat.atime);
+ file_info->LastAccessTime = cpu_to_le64(time);
+ time = ksmbd_UnixTimeToNT(stat.mtime);
+ file_info->LastWriteTime = cpu_to_le64(time);
+ time = ksmbd_UnixTimeToNT(stat.ctime);
+ file_info->ChangeTime = cpu_to_le64(time);
+ file_info->Attributes = fp->f_ci->m_fattr;
+ file_info->Pad1 = 0;
+ file_info->AllocationSize =
+ cpu_to_le64(get_allocation_size(inode, &stat));
+ file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size);
+ file_info->NumberOfLinks =
+ cpu_to_le32(get_nlink(&stat) - delete_pending);
+ file_info->DeletePending = delete_pending;
+ file_info->Directory = S_ISDIR(stat.mode) ? 1 : 0;
+ file_info->Pad2 = 0;
+ file_info->IndexNumber = cpu_to_le64(stat.ino);
+ file_info->EASize = 0;
+ file_info->AccessFlags = fp->daccess;
+ file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos);
+ file_info->Mode = fp->coption;
+ file_info->AlignmentRequirement = 0;
+ conv_len = smbConvertToUTF16((__le16 *)file_info->FileName, filename,
+ PATH_MAX, conn->local_nls, 0);
+ conv_len *= 2;
+ file_info->FileNameLength = cpu_to_le32(conv_len);
+ rsp->OutputBufferLength =
+ cpu_to_le32(sizeof(struct smb2_file_all_info) + conv_len - 1);
+ kfree(filename);
+ inc_rfc1001_len(rsp_org, le32_to_cpu(rsp->OutputBufferLength));
+ return 0;
+}
+
+static void get_file_alternate_info(struct ksmbd_work *work,
+ struct smb2_query_info_rsp *rsp,
+ struct ksmbd_file *fp,
+ void *rsp_org)
+{
+ struct ksmbd_conn *conn = work->conn;
+ struct smb2_file_alt_name_info *file_info;
+ struct dentry *dentry = fp->filp->f_path.dentry;
+ int conv_len;
+
+ spin_lock(&dentry->d_lock);
+ file_info = (struct smb2_file_alt_name_info *)rsp->Buffer;
+ conv_len = ksmbd_extract_shortname(conn,
+ dentry->d_name.name,
+ file_info->FileName);
+ spin_unlock(&dentry->d_lock);
+ file_info->FileNameLength = cpu_to_le32(conv_len);
+ rsp->OutputBufferLength =
+ cpu_to_le32(sizeof(struct smb2_file_alt_name_info) + conv_len);
+ inc_rfc1001_len(rsp_org, le32_to_cpu(rsp->OutputBufferLength));
+}
+
+static void get_file_stream_info(struct ksmbd_work *work,
+ struct smb2_query_info_rsp *rsp,
+ struct ksmbd_file *fp,
+ void *rsp_org)
+{
+ struct ksmbd_conn *conn = work->conn;
+ struct smb2_file_stream_info *file_info;
+ char *stream_name, *xattr_list = NULL, *stream_buf;
+ struct kstat stat;
+ struct path *path = &fp->filp->f_path;
+ ssize_t xattr_list_len;
+ int nbytes = 0, streamlen, stream_name_len, next, idx = 0;
+
+ generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp),
+ &stat);
+ file_info = (struct smb2_file_stream_info *)rsp->Buffer;
+
+ xattr_list_len = ksmbd_vfs_listxattr(path->dentry, &xattr_list);
+ if (xattr_list_len < 0) {
+ goto out;
+ } else if (!xattr_list_len) {
+ ksmbd_debug(SMB, "empty xattr in the file\n");
+ goto out;
+ }
+
+ while (idx < xattr_list_len) {
+ stream_name = xattr_list + idx;
+ streamlen = strlen(stream_name);
+ idx += streamlen + 1;
+
+ ksmbd_debug(SMB, "%s, len %d\n", stream_name, streamlen);
+
+ if (strncmp(&stream_name[XATTR_USER_PREFIX_LEN],
+ STREAM_PREFIX, STREAM_PREFIX_LEN))
+ continue;
+
+ stream_name_len = streamlen - (XATTR_USER_PREFIX_LEN +
+ STREAM_PREFIX_LEN);
+ streamlen = stream_name_len;
+
+ /* plus 1 for the ':' stream prefix */
+ streamlen += 1;
+ stream_buf = kmalloc(streamlen + 1, GFP_KERNEL);
+ if (!stream_buf)
+ break;
+
+ streamlen = snprintf(stream_buf, streamlen + 1,
+ ":%s", &stream_name[XATTR_NAME_STREAM_LEN]);
+
+ file_info = (struct smb2_file_stream_info *)&rsp->Buffer[nbytes];
+ streamlen = smbConvertToUTF16((__le16 *)file_info->StreamName,
+ stream_buf, streamlen,
+ conn->local_nls, 0);
+ streamlen *= 2;
+ kfree(stream_buf);
+ file_info->StreamNameLength = cpu_to_le32(streamlen);
+ file_info->StreamSize = cpu_to_le64(stream_name_len);
+ file_info->StreamAllocationSize = cpu_to_le64(stream_name_len);
+
+ next = sizeof(struct smb2_file_stream_info) + streamlen;
+ nbytes += next;
+ file_info->NextEntryOffset = cpu_to_le32(next);
+ }
+
+ if (nbytes) {
+ file_info = (struct smb2_file_stream_info *)
+ &rsp->Buffer[nbytes];
+ streamlen = smbConvertToUTF16((__le16 *)file_info->StreamName,
+ "::$DATA", 7, conn->local_nls, 0);
+ streamlen *= 2;
+ file_info->StreamNameLength = cpu_to_le32(streamlen);
+ file_info->StreamSize = S_ISDIR(stat.mode) ? 0 :
+ cpu_to_le64(stat.size);
+ file_info->StreamAllocationSize = S_ISDIR(stat.mode) ? 0 :
+ cpu_to_le64(stat.size);
+ nbytes += sizeof(struct smb2_file_stream_info) + streamlen;
+ }
+
+ /* last entry offset should be 0 */
+ file_info->NextEntryOffset = 0;
+out:
+ kvfree(xattr_list);
+
+ rsp->OutputBufferLength = cpu_to_le32(nbytes);
+ inc_rfc1001_len(rsp_org, nbytes);
+}
+
+static void get_file_internal_info(struct smb2_query_info_rsp *rsp,
+ struct ksmbd_file *fp, void *rsp_org)
+{
+ struct smb2_file_internal_info *file_info;
+ struct kstat stat;
+
+ generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp),
+ &stat);
+ file_info = (struct smb2_file_internal_info *)rsp->Buffer;
+ file_info->IndexNumber = cpu_to_le64(stat.ino);
+ rsp->OutputBufferLength =
+ cpu_to_le32(sizeof(struct smb2_file_internal_info));
+ inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_internal_info));
+}
+
+static int get_file_network_open_info(struct smb2_query_info_rsp *rsp,
+ struct ksmbd_file *fp, void *rsp_org)
+{
+ struct smb2_file_ntwrk_info *file_info;
+ struct inode *inode;
+ struct kstat stat;
+ u64 time;
+
+ if (!(fp->daccess & FILE_READ_ATTRIBUTES_LE)) {
+ pr_err("no right to read the attributes : 0x%x\n",
+ fp->daccess);
+ return -EACCES;
+ }
+
+ file_info = (struct smb2_file_ntwrk_info *)rsp->Buffer;
+
+ inode = file_inode(fp->filp);
+ generic_fillattr(file_mnt_user_ns(fp->filp), inode, &stat);
+
+ file_info->CreationTime = cpu_to_le64(fp->create_time);
+ time = ksmbd_UnixTimeToNT(stat.atime);
+ file_info->LastAccessTime = cpu_to_le64(time);
+ time = ksmbd_UnixTimeToNT(stat.mtime);
+ file_info->LastWriteTime = cpu_to_le64(time);
+ time = ksmbd_UnixTimeToNT(stat.ctime);
+ file_info->ChangeTime = cpu_to_le64(time);
+ file_info->Attributes = fp->f_ci->m_fattr;
+ file_info->AllocationSize =
+ cpu_to_le64(get_allocation_size(inode, &stat));
+ file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size);
+ file_info->Reserved = cpu_to_le32(0);
+ rsp->OutputBufferLength =
+ cpu_to_le32(sizeof(struct smb2_file_ntwrk_info));
+ inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_ntwrk_info));
+ return 0;
+}
+
+static void get_file_ea_info(struct smb2_query_info_rsp *rsp, void *rsp_org)
+{
+ struct smb2_file_ea_info *file_info;
+
+ file_info = (struct smb2_file_ea_info *)rsp->Buffer;
+ file_info->EASize = 0;
+ rsp->OutputBufferLength =
+ cpu_to_le32(sizeof(struct smb2_file_ea_info));
+ inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_ea_info));
+}
+
+static void get_file_position_info(struct smb2_query_info_rsp *rsp,
+ struct ksmbd_file *fp, void *rsp_org)
+{
+ struct smb2_file_pos_info *file_info;
+
+ file_info = (struct smb2_file_pos_info *)rsp->Buffer;
+ file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos);
+ rsp->OutputBufferLength =
+ cpu_to_le32(sizeof(struct smb2_file_pos_info));
+ inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_pos_info));
+}
+
+static void get_file_mode_info(struct smb2_query_info_rsp *rsp,
+ struct ksmbd_file *fp, void *rsp_org)
+{
+ struct smb2_file_mode_info *file_info;
+
+ file_info = (struct smb2_file_mode_info *)rsp->Buffer;
+ file_info->Mode = fp->coption & FILE_MODE_INFO_MASK;
+ rsp->OutputBufferLength =
+ cpu_to_le32(sizeof(struct smb2_file_mode_info));
+ inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_mode_info));
+}
+
+static void get_file_compression_info(struct smb2_query_info_rsp *rsp,
+ struct ksmbd_file *fp, void *rsp_org)
+{
+ struct smb2_file_comp_info *file_info;
+ struct kstat stat;
+
+ generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp),
+ &stat);
+
+ file_info = (struct smb2_file_comp_info *)rsp->Buffer;
+ file_info->CompressedFileSize = cpu_to_le64(stat.blocks << 9);
+ file_info->CompressionFormat = COMPRESSION_FORMAT_NONE;
+ file_info->CompressionUnitShift = 0;
+ file_info->ChunkShift = 0;
+ file_info->ClusterShift = 0;
+ memset(&file_info->Reserved[0], 0, 3);
+
+ rsp->OutputBufferLength =
+ cpu_to_le32(sizeof(struct smb2_file_comp_info));
+ inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_comp_info));
+}
+
+static int get_file_attribute_tag_info(struct smb2_query_info_rsp *rsp,
+ struct ksmbd_file *fp, void *rsp_org)
+{
+ struct smb2_file_attr_tag_info *file_info;
+
+ if (!(fp->daccess & FILE_READ_ATTRIBUTES_LE)) {
+ pr_err("no right to read the attributes : 0x%x\n",
+ fp->daccess);
+ return -EACCES;
+ }
+
+ file_info = (struct smb2_file_attr_tag_info *)rsp->Buffer;
+ file_info->FileAttributes = fp->f_ci->m_fattr;
+ file_info->ReparseTag = 0;
+ rsp->OutputBufferLength =
+ cpu_to_le32(sizeof(struct smb2_file_attr_tag_info));
+ inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_attr_tag_info));
+ return 0;
+}
+
+static int find_file_posix_info(struct smb2_query_info_rsp *rsp,
+ struct ksmbd_file *fp, void *rsp_org)
+{
+ struct smb311_posix_qinfo *file_info;
+ struct inode *inode = file_inode(fp->filp);
+ u64 time;
+
+ file_info = (struct smb311_posix_qinfo *)rsp->Buffer;
+ file_info->CreationTime = cpu_to_le64(fp->create_time);
+ time = ksmbd_UnixTimeToNT(inode->i_atime);
+ file_info->LastAccessTime = cpu_to_le64(time);
+ time = ksmbd_UnixTimeToNT(inode->i_mtime);
+ file_info->LastWriteTime = cpu_to_le64(time);
+ time = ksmbd_UnixTimeToNT(inode->i_ctime);
+ file_info->ChangeTime = cpu_to_le64(time);
+ file_info->DosAttributes = fp->f_ci->m_fattr;
+ file_info->Inode = cpu_to_le64(inode->i_ino);
+ file_info->EndOfFile = cpu_to_le64(inode->i_size);
+ file_info->AllocationSize = cpu_to_le64(inode->i_blocks << 9);
+ file_info->HardLinks = cpu_to_le32(inode->i_nlink);
+ file_info->Mode = cpu_to_le32(inode->i_mode);
+ file_info->DeviceId = cpu_to_le32(inode->i_rdev);
+ rsp->OutputBufferLength =
+ cpu_to_le32(sizeof(struct smb311_posix_qinfo));
+ inc_rfc1001_len(rsp_org, sizeof(struct smb311_posix_qinfo));
+ return 0;
+}
+
+static int smb2_get_info_file(struct ksmbd_work *work,
+ struct smb2_query_info_req *req,
+ struct smb2_query_info_rsp *rsp, void *rsp_org)
+{
+ struct ksmbd_file *fp;
+ int fileinfoclass = 0;
+ int rc = 0;
+ int file_infoclass_size;
+ unsigned int id = KSMBD_NO_FID, pid = KSMBD_NO_FID;
+
+ if (test_share_config_flag(work->tcon->share_conf,
+ KSMBD_SHARE_FLAG_PIPE)) {
+ /* smb2 info file called for pipe */
+ return smb2_get_info_file_pipe(work->sess, req, rsp);
+ }
+
+ if (work->next_smb2_rcv_hdr_off) {
+ if (!has_file_id(le64_to_cpu(req->VolatileFileId))) {
+ ksmbd_debug(SMB, "Compound request set FID = %llu\n",
+ work->compound_fid);
+ id = work->compound_fid;
+ pid = work->compound_pfid;
+ }
+ }
+
+ if (!has_file_id(id)) {
+ id = le64_to_cpu(req->VolatileFileId);
+ pid = le64_to_cpu(req->PersistentFileId);
+ }
+
+ fp = ksmbd_lookup_fd_slow(work, id, pid);
+ if (!fp)
+ return -ENOENT;
+
+ fileinfoclass = req->FileInfoClass;
+
+ switch (fileinfoclass) {
+ case FILE_ACCESS_INFORMATION:
+ get_file_access_info(rsp, fp, rsp_org);
+ file_infoclass_size = FILE_ACCESS_INFORMATION_SIZE;
+ break;
+
+ case FILE_BASIC_INFORMATION:
+ rc = get_file_basic_info(rsp, fp, rsp_org);
+ file_infoclass_size = FILE_BASIC_INFORMATION_SIZE;
+ break;
+
+ case FILE_STANDARD_INFORMATION:
+ get_file_standard_info(rsp, fp, rsp_org);
+ file_infoclass_size = FILE_STANDARD_INFORMATION_SIZE;
+ break;
+
+ case FILE_ALIGNMENT_INFORMATION:
+ get_file_alignment_info(rsp, rsp_org);
+ file_infoclass_size = FILE_ALIGNMENT_INFORMATION_SIZE;
+ break;
+
+ case FILE_ALL_INFORMATION:
+ rc = get_file_all_info(work, rsp, fp, rsp_org);
+ file_infoclass_size = FILE_ALL_INFORMATION_SIZE;
+ break;
+
+ case FILE_ALTERNATE_NAME_INFORMATION:
+ get_file_alternate_info(work, rsp, fp, rsp_org);
+ file_infoclass_size = FILE_ALTERNATE_NAME_INFORMATION_SIZE;
+ break;
+
+ case FILE_STREAM_INFORMATION:
+ get_file_stream_info(work, rsp, fp, rsp_org);
+ file_infoclass_size = FILE_STREAM_INFORMATION_SIZE;
+ break;
+
+ case FILE_INTERNAL_INFORMATION:
+ get_file_internal_info(rsp, fp, rsp_org);
+ file_infoclass_size = FILE_INTERNAL_INFORMATION_SIZE;
+ break;
+
+ case FILE_NETWORK_OPEN_INFORMATION:
+ rc = get_file_network_open_info(rsp, fp, rsp_org);
+ file_infoclass_size = FILE_NETWORK_OPEN_INFORMATION_SIZE;
+ break;
+
+ case FILE_EA_INFORMATION:
+ get_file_ea_info(rsp, rsp_org);
+ file_infoclass_size = FILE_EA_INFORMATION_SIZE;
+ break;
+
+ case FILE_FULL_EA_INFORMATION:
+ rc = smb2_get_ea(work, fp, req, rsp, rsp_org);
+ file_infoclass_size = FILE_FULL_EA_INFORMATION_SIZE;
+ break;
+
+ case FILE_POSITION_INFORMATION:
+ get_file_position_info(rsp, fp, rsp_org);
+ file_infoclass_size = FILE_POSITION_INFORMATION_SIZE;
+ break;
+
+ case FILE_MODE_INFORMATION:
+ get_file_mode_info(rsp, fp, rsp_org);
+ file_infoclass_size = FILE_MODE_INFORMATION_SIZE;
+ break;
+
+ case FILE_COMPRESSION_INFORMATION:
+ get_file_compression_info(rsp, fp, rsp_org);
+ file_infoclass_size = FILE_COMPRESSION_INFORMATION_SIZE;
+ break;
+
+ case FILE_ATTRIBUTE_TAG_INFORMATION:
+ rc = get_file_attribute_tag_info(rsp, fp, rsp_org);
+ file_infoclass_size = FILE_ATTRIBUTE_TAG_INFORMATION_SIZE;
+ break;
+ case SMB_FIND_FILE_POSIX_INFO:
+ if (!work->tcon->posix_extensions) {
+ pr_err("client doesn't negotiate with SMB3.1.1 POSIX Extensions\n");
+ rc = -EOPNOTSUPP;
+ } else {
+ rc = find_file_posix_info(rsp, fp, rsp_org);
+ file_infoclass_size = sizeof(struct smb311_posix_qinfo);
+ }
+ break;
+ default:
+ ksmbd_debug(SMB, "fileinfoclass %d not supported yet\n",
+ fileinfoclass);
+ rc = -EOPNOTSUPP;
+ }
+ if (!rc)
+ rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength),
+ rsp,
+ file_infoclass_size);
+ ksmbd_fd_put(work, fp);
+ return rc;
+}
+
+static int smb2_get_info_filesystem(struct ksmbd_work *work,
+ struct smb2_query_info_req *req,
+ struct smb2_query_info_rsp *rsp, void *rsp_org)
+{
+ struct ksmbd_session *sess = work->sess;
+ struct ksmbd_conn *conn = sess->conn;
+ struct ksmbd_share_config *share = work->tcon->share_conf;
+ int fsinfoclass = 0;
+ struct kstatfs stfs;
+ struct path path;
+ int rc = 0, len;
+ int fs_infoclass_size = 0;
+ int lookup_flags = 0;
+
+ if (test_share_config_flag(share, KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS))
+ lookup_flags = LOOKUP_FOLLOW;
+
+ rc = ksmbd_vfs_kern_path(share->path, lookup_flags, &path, 0);
+ if (rc) {
+ pr_err("cannot create vfs path\n");
+ return -EIO;
+ }
+
+ rc = vfs_statfs(&path, &stfs);
+ if (rc) {
+ pr_err("cannot do stat of path %s\n", share->path);
+ path_put(&path);
+ return -EIO;
+ }
+
+ fsinfoclass = req->FileInfoClass;
+
+ switch (fsinfoclass) {
+ case FS_DEVICE_INFORMATION:
+ {
+ struct filesystem_device_info *info;
+
+ info = (struct filesystem_device_info *)rsp->Buffer;
+
+ info->DeviceType = cpu_to_le32(stfs.f_type);
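+ /* 0x00000020 is the FILE_DEVICE_IS_MOUNTED characteristic (MS-FSCC) */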
+ info->DeviceCharacteristics = cpu_to_le32(0x00000020);
+ rsp->OutputBufferLength = cpu_to_le32(8);
+ inc_rfc1001_len(rsp_org, 8);
+ fs_infoclass_size = FS_DEVICE_INFORMATION_SIZE;
+ break;
+ }
+ case FS_ATTRIBUTE_INFORMATION:
+ {
+ struct filesystem_attribute_info *info;
+ size_t sz;
+
+ info = (struct filesystem_attribute_info *)rsp->Buffer;
+ info->Attributes = cpu_to_le32(FILE_SUPPORTS_OBJECT_IDS |
+ FILE_PERSISTENT_ACLS |
+ FILE_UNICODE_ON_DISK |
+ FILE_CASE_PRESERVED_NAMES |
+ FILE_CASE_SENSITIVE_SEARCH |
+ FILE_SUPPORTS_BLOCK_REFCOUNTING);
+
+ info->Attributes |= cpu_to_le32(server_conf.share_fake_fscaps);
+
+ info->MaxPathNameComponentLength = cpu_to_le32(stfs.f_namelen);
+ len = smbConvertToUTF16((__le16 *)info->FileSystemName,
+ "NTFS", PATH_MAX, conn->local_nls, 0);
+ len = len * 2;
+ info->FileSystemNameLen = cpu_to_le32(len);
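+ /* replace the 2-byte FileSystemName placeholder with the actual UTF-16 name length */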
+ sz = sizeof(struct filesystem_attribute_info) - 2 + len;
+ rsp->OutputBufferLength = cpu_to_le32(sz);
+ inc_rfc1001_len(rsp_org, sz);
+ fs_infoclass_size = FS_ATTRIBUTE_INFORMATION_SIZE;
+ break;
+ }
+ case FS_VOLUME_INFORMATION:
+ {
+ struct filesystem_vol_info *info;
+ size_t sz;
+
+ info = (struct filesystem_vol_info *)(rsp->Buffer);
+ info->VolumeCreationTime = 0;
+ /* Use a dummy value for the volume serial number */
+ info->SerialNumber = cpu_to_le32(0xbc3ac512);
+ len = smbConvertToUTF16((__le16 *)info->VolumeLabel,
+ share->name, PATH_MAX,
+ conn->local_nls, 0);
+ len = len * 2;
+ info->VolumeLabelSize = cpu_to_le32(len);
+ info->Reserved = 0;
+ sz = sizeof(struct filesystem_vol_info) - 2 + len;
+ rsp->OutputBufferLength = cpu_to_le32(sz);
+ inc_rfc1001_len(rsp_org, sz);
+ fs_infoclass_size = FS_VOLUME_INFORMATION_SIZE;
+ break;
+ }
+ case FS_SIZE_INFORMATION:
+ {
+ struct filesystem_info *info;
+
+ info = (struct filesystem_info *)(rsp->Buffer);
+ info->TotalAllocationUnits = cpu_to_le64(stfs.f_blocks);
+ info->FreeAllocationUnits = cpu_to_le64(stfs.f_bfree);
+ info->SectorsPerAllocationUnit = cpu_to_le32(1);
+ info->BytesPerSector = cpu_to_le32(stfs.f_bsize);
+ rsp->OutputBufferLength = cpu_to_le32(24);
+ inc_rfc1001_len(rsp_org, 24);
+ fs_infoclass_size = FS_SIZE_INFORMATION_SIZE;
+ break;
+ }
+ case FS_FULL_SIZE_INFORMATION:
+ {
+ struct smb2_fs_full_size_info *info;
+
+ info = (struct smb2_fs_full_size_info *)(rsp->Buffer);
+ info->TotalAllocationUnits = cpu_to_le64(stfs.f_blocks);
+ info->CallerAvailableAllocationUnits =
+ cpu_to_le64(stfs.f_bavail);
+ info->ActualAvailableAllocationUnits =
+ cpu_to_le64(stfs.f_bfree);
+ info->SectorsPerAllocationUnit = cpu_to_le32(1);
+ info->BytesPerSector = cpu_to_le32(stfs.f_bsize);
+ rsp->OutputBufferLength = cpu_to_le32(32);
+ inc_rfc1001_len(rsp_org, 32);
+ fs_infoclass_size = FS_FULL_SIZE_INFORMATION_SIZE;
+ break;
+ }
+ case FS_OBJECT_ID_INFORMATION:
+ {
+ struct object_id_info *info;
+
+ info = (struct object_id_info *)(rsp->Buffer);
+
+ if (!user_guest(sess->user))
+ memcpy(info->objid, user_passkey(sess->user), 16);
+ else
+ memset(info->objid, 0, 16);
+
+ info->extended_info.magic = cpu_to_le32(EXTENDED_INFO_MAGIC);
+ info->extended_info.version = cpu_to_le32(1);
+ info->extended_info.release = cpu_to_le32(1);
+ info->extended_info.rel_date = 0;
+ memcpy(info->extended_info.version_string, "1.1.0", strlen("1.1.0"));
+ rsp->OutputBufferLength = cpu_to_le32(64);
+ inc_rfc1001_len(rsp_org, 64);
+ fs_infoclass_size = FS_OBJECT_ID_INFORMATION_SIZE;
+ break;
+ }
+ case FS_SECTOR_SIZE_INFORMATION:
+ {
+ struct smb3_fs_ss_info *info;
+
+ info = (struct smb3_fs_ss_info *)(rsp->Buffer);
+
+ info->LogicalBytesPerSector = cpu_to_le32(stfs.f_bsize);
+ info->PhysicalBytesPerSectorForAtomicity =
+ cpu_to_le32(stfs.f_bsize);
+ info->PhysicalBytesPerSectorForPerf = cpu_to_le32(stfs.f_bsize);
+ info->FSEffPhysicalBytesPerSectorForAtomicity =
+ cpu_to_le32(stfs.f_bsize);
+ info->Flags = cpu_to_le32(SSINFO_FLAGS_ALIGNED_DEVICE |
+ SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE);
+ info->ByteOffsetForSectorAlignment = 0;
+ info->ByteOffsetForPartitionAlignment = 0;
+ rsp->OutputBufferLength = cpu_to_le32(28);
+ inc_rfc1001_len(rsp_org, 28);
+ fs_infoclass_size = FS_SECTOR_SIZE_INFORMATION_SIZE;
+ break;
+ }
+ case FS_CONTROL_INFORMATION:
+ {
+ /*
+ * TODO: the values below are modelled on what a Windows 7
+ * (NTFS) server returns. This should be changed to report
+ * real quota values obtained from the Linux kernel.
+ */
+ struct smb2_fs_control_info *info;
+
+ info = (struct smb2_fs_control_info *)(rsp->Buffer);
+ info->FreeSpaceStartFiltering = 0;
+ info->FreeSpaceThreshold = 0;
+ info->FreeSpaceStopFiltering = 0;
+ info->DefaultQuotaThreshold = cpu_to_le64(SMB2_NO_FID);
+ info->DefaultQuotaLimit = cpu_to_le64(SMB2_NO_FID);
+ info->Padding = 0;
+ rsp->OutputBufferLength = cpu_to_le32(48);
+ inc_rfc1001_len(rsp_org, 48);
+ fs_infoclass_size = FS_CONTROL_INFORMATION_SIZE;
+ break;
+ }
+ case FS_POSIX_INFORMATION:
+ {
+ struct filesystem_posix_info *info;
+
+ if (!work->tcon->posix_extensions) {
+ pr_err("client doesn't negotiate with SMB3.1.1 POSIX Extensions\n");
+ rc = -EOPNOTSUPP;
+ } else {
+ info = (struct filesystem_posix_info *)(rsp->Buffer);
+ info->OptimalTransferSize = cpu_to_le32(stfs.f_bsize);
+ info->BlockSize = cpu_to_le32(stfs.f_bsize);
+ info->TotalBlocks = cpu_to_le64(stfs.f_blocks);
+ info->BlocksAvail = cpu_to_le64(stfs.f_bfree);
+ info->UserBlocksAvail = cpu_to_le64(stfs.f_bavail);
+ info->TotalFileNodes = cpu_to_le64(stfs.f_files);
+ info->FreeFileNodes = cpu_to_le64(stfs.f_ffree);
+ rsp->OutputBufferLength = cpu_to_le32(56);
+ inc_rfc1001_len(rsp_org, 56);
+ fs_infoclass_size = FS_POSIX_INFORMATION_SIZE;
+ }
+ break;
+ }
+ default:
+ path_put(&path);
+ return -EOPNOTSUPP;
+ }
+ rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength),
+ rsp,
+ fs_infoclass_size);
+ path_put(&path);
+ return rc;
+}
+
+static int smb2_get_info_sec(struct ksmbd_work *work,
+ struct smb2_query_info_req *req,
+ struct smb2_query_info_rsp *rsp, void *rsp_org)
+{
+ struct ksmbd_file *fp;
+ struct user_namespace *user_ns;
+ struct smb_ntsd *pntsd = (struct smb_ntsd *)rsp->Buffer, *ppntsd = NULL;
+ struct smb_fattr fattr = {{0}};
+ struct inode *inode;
+ __u32 secdesclen;
+ unsigned int id = KSMBD_NO_FID, pid = KSMBD_NO_FID;
+ int addition_info = le32_to_cpu(req->AdditionalInformation);
+ int rc;
+
+ if (addition_info & ~(OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO |
+ PROTECTED_DACL_SECINFO |
+ UNPROTECTED_DACL_SECINFO)) {
+ pr_err("Unsupported addition info: 0x%x)\n",
+ addition_info);
+
+ pntsd->revision = cpu_to_le16(1);
+ pntsd->type = cpu_to_le16(SELF_RELATIVE | DACL_PROTECTED);
+ pntsd->osidoffset = 0;
+ pntsd->gsidoffset = 0;
+ pntsd->sacloffset = 0;
+ pntsd->dacloffset = 0;
+
+ secdesclen = sizeof(struct smb_ntsd);
+ rsp->OutputBufferLength = cpu_to_le32(secdesclen);
+ inc_rfc1001_len(rsp_org, secdesclen);
+
+ return 0;
+ }
+
+ if (work->next_smb2_rcv_hdr_off) {
+ if (!has_file_id(le64_to_cpu(req->VolatileFileId))) {
+ ksmbd_debug(SMB, "Compound request set FID = %llu\n",
+ work->compound_fid);
+ id = work->compound_fid;
+ pid = work->compound_pfid;
+ }
+ }
+
+ if (!has_file_id(id)) {
+ id = le64_to_cpu(req->VolatileFileId);
+ pid = le64_to_cpu(req->PersistentFileId);
+ }
+
+ fp = ksmbd_lookup_fd_slow(work, id, pid);
+ if (!fp)
+ return -ENOENT;
+
+ user_ns = file_mnt_user_ns(fp->filp);
+ inode = file_inode(fp->filp);
+ ksmbd_acls_fattr(&fattr, inode);
+
+ if (test_share_config_flag(work->tcon->share_conf,
+ KSMBD_SHARE_FLAG_ACL_XATTR))
+ ksmbd_vfs_get_sd_xattr(work->conn, user_ns,
+ fp->filp->f_path.dentry, &ppntsd);
+
+ rc = build_sec_desc(user_ns, pntsd, ppntsd, addition_info,
+ &secdesclen, &fattr);
+ posix_acl_release(fattr.cf_acls);
+ posix_acl_release(fattr.cf_dacls);
+ kfree(ppntsd);
+ ksmbd_fd_put(work, fp);
+ if (rc)
+ return rc;
+
+ rsp->OutputBufferLength = cpu_to_le32(secdesclen);
+ inc_rfc1001_len(rsp_org, secdesclen);
+ return 0;
+}
+
+/**
+ * smb2_query_info() - handler for smb2 query info command
+ * @work: smb work containing query info request buffer
+ *
+ * Return: 0 on success, otherwise error
+ */
+int smb2_query_info(struct ksmbd_work *work)
+{
+ struct smb2_query_info_req *req;
+ struct smb2_query_info_rsp *rsp, *rsp_org;
+ int rc = 0;
+
+ rsp_org = work->response_buf;
+ WORK_BUFFERS(work, req, rsp);
+
+ ksmbd_debug(SMB, "GOT query info request\n");
+
+ switch (req->InfoType) {
+ case SMB2_O_INFO_FILE:
+ ksmbd_debug(SMB, "GOT SMB2_O_INFO_FILE\n");
+ rc = smb2_get_info_file(work, req, rsp, (void *)rsp_org);
+ break;
+ case SMB2_O_INFO_FILESYSTEM:
+ ksmbd_debug(SMB, "GOT SMB2_O_INFO_FILESYSTEM\n");
+ rc = smb2_get_info_filesystem(work, req, rsp, (void *)rsp_org);
+ break;
+ case SMB2_O_INFO_SECURITY:
+ ksmbd_debug(SMB, "GOT SMB2_O_INFO_SECURITY\n");
+ rc = smb2_get_info_sec(work, req, rsp, (void *)rsp_org);
+ break;
+ default:
+ ksmbd_debug(SMB, "InfoType %d not supported yet\n",
+ req->InfoType);
+ rc = -EOPNOTSUPP;
+ }
+
+ if (rc < 0) {
+ if (rc == -EACCES)
+ rsp->hdr.Status = STATUS_ACCESS_DENIED;
+ else if (rc == -ENOENT)
+ rsp->hdr.Status = STATUS_FILE_CLOSED;
+ else if (rc == -EIO)
+ rsp->hdr.Status = STATUS_UNEXPECTED_IO_ERROR;
+ else if (rc == -EOPNOTSUPP || rsp->hdr.Status == 0)
+ rsp->hdr.Status = STATUS_INVALID_INFO_CLASS;
+ smb2_set_err_rsp(work);
+
+ ksmbd_debug(SMB, "error while processing smb2 query rc = %d\n",
+ rc);
+ return rc;
+ }
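+ /* fixed QUERY_INFO response is 8 bytes; the data buffer starts at offset 72 (64-byte header + 8) */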
+ rsp->StructureSize = cpu_to_le16(9);
+ rsp->OutputBufferOffset = cpu_to_le16(72);
+ inc_rfc1001_len(rsp_org, 8);
+ return 0;
+}
+
+/**
+ * smb2_close_pipe() - handler for closing IPC pipe
+ * @work: smb work containing close request buffer
+ *
+ * Return: 0
+ */
+static noinline int smb2_close_pipe(struct ksmbd_work *work)
+{
+ u64 id;
+ struct smb2_close_req *req = work->request_buf;
+ struct smb2_close_rsp *rsp = work->response_buf;
+
+ id = le64_to_cpu(req->VolatileFileId);
+ ksmbd_session_rpc_close(work->sess, id);
+
+ rsp->StructureSize = cpu_to_le16(60);
+ rsp->Flags = 0;
+ rsp->Reserved = 0;
+ rsp->CreationTime = 0;
+ rsp->LastAccessTime = 0;
+ rsp->LastWriteTime = 0;
+ rsp->ChangeTime = 0;
+ rsp->AllocationSize = 0;
+ rsp->EndOfFile = 0;
+ rsp->Attributes = 0;
+ inc_rfc1001_len(rsp, 60);
+ return 0;
+}
+
+/**
+ * smb2_close() - handler for smb2 close file command
+ * @work: smb work containing close request buffer
+ *
+ * Return: 0
+ */
+int smb2_close(struct ksmbd_work *work)
+{
+ u64 volatile_id = KSMBD_NO_FID;
+ u64 sess_id;
+ struct smb2_close_req *req;
+ struct smb2_close_rsp *rsp;
+ struct smb2_close_rsp *rsp_org;
+ struct ksmbd_conn *conn = work->conn;
+ struct ksmbd_file *fp;
+ struct inode *inode;
+ u64 time;
+ int err = 0;
+
+ rsp_org = work->response_buf;
+ WORK_BUFFERS(work, req, rsp);
+
+ if (test_share_config_flag(work->tcon->share_conf,
+ KSMBD_SHARE_FLAG_PIPE)) {
+ ksmbd_debug(SMB, "IPC pipe close request\n");
+ return smb2_close_pipe(work);
+ }
+
+ sess_id = le64_to_cpu(req->hdr.SessionId);
+ if (req->hdr.Flags & SMB2_FLAGS_RELATED_OPERATIONS)
+ sess_id = work->compound_sid;
+
+ work->compound_sid = 0;
+ if (check_session_id(conn, sess_id)) {
+ work->compound_sid = sess_id;
+ } else {
+ rsp->hdr.Status = STATUS_USER_SESSION_DELETED;
+ if (req->hdr.Flags & SMB2_FLAGS_RELATED_OPERATIONS)
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ err = -EBADF;
+ goto out;
+ }
+
+ if (work->next_smb2_rcv_hdr_off &&
+ !has_file_id(le64_to_cpu(req->VolatileFileId))) {
+ if (!has_file_id(work->compound_fid)) {
+ /* file already closed, return FILE_CLOSED */
+ ksmbd_debug(SMB, "file already closed\n");
+ rsp->hdr.Status = STATUS_FILE_CLOSED;
+ err = -EBADF;
+ goto out;
+ } else {
+ ksmbd_debug(SMB,
+ "Compound request set FID = %llu:%llu\n",
+ work->compound_fid,
+ work->compound_pfid);
+ volatile_id = work->compound_fid;
+
+ /* file closed, stored id is not valid anymore */
+ work->compound_fid = KSMBD_NO_FID;
+ work->compound_pfid = KSMBD_NO_FID;
+ }
+ } else {
+ volatile_id = le64_to_cpu(req->VolatileFileId);
+ }
+ ksmbd_debug(SMB, "volatile_id = %llu\n", volatile_id);
+
+ rsp->StructureSize = cpu_to_le16(60);
+ rsp->Reserved = 0;
+
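+ /* SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB: client asked for file attributes in the close response */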
+ if (req->Flags == SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB) {
+ fp = ksmbd_lookup_fd_fast(work, volatile_id);
+ if (!fp) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ inode = file_inode(fp->filp);
+ rsp->Flags = SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB;
+ rsp->AllocationSize = S_ISDIR(inode->i_mode) ? 0 :
+ cpu_to_le64(inode->i_blocks << 9);
+ rsp->EndOfFile = cpu_to_le64(inode->i_size);
+ rsp->Attributes = fp->f_ci->m_fattr;
+ rsp->CreationTime = cpu_to_le64(fp->create_time);
+ time = ksmbd_UnixTimeToNT(inode->i_atime);
+ rsp->LastAccessTime = cpu_to_le64(time);
+ time = ksmbd_UnixTimeToNT(inode->i_mtime);
+ rsp->LastWriteTime = cpu_to_le64(time);
+ time = ksmbd_UnixTimeToNT(inode->i_ctime);
+ rsp->ChangeTime = cpu_to_le64(time);
+ ksmbd_fd_put(work, fp);
+ } else {
+ rsp->Flags = 0;
+ rsp->AllocationSize = 0;
+ rsp->EndOfFile = 0;
+ rsp->Attributes = 0;
+ rsp->CreationTime = 0;
+ rsp->LastAccessTime = 0;
+ rsp->LastWriteTime = 0;
+ rsp->ChangeTime = 0;
+ }
+
+ err = ksmbd_close_fd(work, volatile_id);
+out:
+ if (err) {
+ if (rsp->hdr.Status == 0)
+ rsp->hdr.Status = STATUS_FILE_CLOSED;
+ smb2_set_err_rsp(work);
+ } else {
+ inc_rfc1001_len(rsp_org, 60);
+ }
+
+ return 0;
+}
+
+/**
+ * smb2_echo() - handler for smb2 echo(ping) command
+ * @work: smb work containing echo request buffer
+ *
+ * Return: 0
+ */
+int smb2_echo(struct ksmbd_work *work)
+{
+ struct smb2_echo_rsp *rsp = work->response_buf;
+
+ rsp->StructureSize = cpu_to_le16(4);
+ rsp->Reserved = 0;
+ inc_rfc1001_len(rsp, 4);
+ return 0;
+}
+
+static int smb2_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
+ struct smb2_file_rename_info *file_info,
+ struct nls_table *local_nls)
+{
+ struct ksmbd_share_config *share = fp->tcon->share_conf;
+ char *new_name = NULL, *abs_oldname = NULL, *old_name = NULL;
+ char *pathname = NULL;
+ struct path path;
+ bool file_present = true;
+ int rc;
+
+ ksmbd_debug(SMB, "setting FILE_RENAME_INFO\n");
+ pathname = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!pathname)
+ return -ENOMEM;
+
+ abs_oldname = d_path(&fp->filp->f_path, pathname, PATH_MAX);
+ if (IS_ERR(abs_oldname)) {
+ rc = -EINVAL;
+ goto out;
+ }
+ old_name = strrchr(abs_oldname, '/');
+ if (old_name && old_name[1] != '\0') {
+ old_name++;
+ } else {
+ ksmbd_debug(SMB, "can't get last component in path %s\n",
+ abs_oldname);
+ rc = -ENOENT;
+ goto out;
+ }
+
+ new_name = smb2_get_name(share,
+ file_info->FileName,
+ le32_to_cpu(file_info->FileNameLength),
+ local_nls);
+ if (IS_ERR(new_name)) {
+ rc = PTR_ERR(new_name);
+ goto out;
+ }
+
+ if (strchr(new_name, ':')) {
+ int s_type;
+ char *xattr_stream_name, *stream_name = NULL;
+ size_t xattr_stream_size;
+ int len;
+
+ rc = parse_stream_name(new_name, &stream_name, &s_type);
+ if (rc < 0)
+ goto out;
+
+ len = strlen(new_name);
+ if (new_name[len - 1] != '/') {
+ pr_err("not allow base filename in rename\n");
+ rc = -ESHARE;
+ goto out;
+ }
+
+ rc = ksmbd_vfs_xattr_stream_name(stream_name,
+ &xattr_stream_name,
+ &xattr_stream_size,
+ s_type);
+ if (rc)
+ goto out;
+
+ rc = ksmbd_vfs_setxattr(file_mnt_user_ns(fp->filp),
+ fp->filp->f_path.dentry,
+ xattr_stream_name,
+ NULL, 0, 0);
+ if (rc < 0) {
+ pr_err("failed to store stream name in xattr: %d\n",
+ rc);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ goto out;
+ }
+
+ ksmbd_debug(SMB, "new name %s\n", new_name);
+ rc = ksmbd_vfs_kern_path(new_name, 0, &path, 1);
+ if (rc)
+ file_present = false;
+ else
+ path_put(&path);
+
+ if (ksmbd_share_veto_filename(share, new_name)) {
+ rc = -ENOENT;
+ ksmbd_debug(SMB, "Can't rename vetoed file: %s\n", new_name);
+ goto out;
+ }
+
+ if (file_info->ReplaceIfExists) {
+ if (file_present) {
+ rc = ksmbd_vfs_remove_file(work, new_name);
+ if (rc) {
+ if (rc != -ENOTEMPTY)
+ rc = -EINVAL;
+ ksmbd_debug(SMB, "cannot delete %s, rc %d\n",
+ new_name, rc);
+ goto out;
+ }
+ }
+ } else {
+ if (file_present &&
+ strncmp(old_name, path.dentry->d_name.name, strlen(old_name))) {
+ rc = -EEXIST;
+ ksmbd_debug(SMB,
+ "cannot rename already existing file\n");
+ goto out;
+ }
+ }
+
+ rc = ksmbd_vfs_fp_rename(work, fp, new_name);
+out:
+ kfree(pathname);
+ if (!IS_ERR(new_name))
+ kfree(new_name);
+ return rc;
+}
+
+static int smb2_create_link(struct ksmbd_work *work,
+ struct ksmbd_share_config *share,
+ struct smb2_file_link_info *file_info,
+ struct file *filp,
+ struct nls_table *local_nls)
+{
+ char *link_name = NULL, *target_name = NULL, *pathname = NULL;
+ struct path path;
+ bool file_present = true;
+ int rc;
+
+ ksmbd_debug(SMB, "setting FILE_LINK_INFORMATION\n");
+ pathname = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!pathname)
+ return -ENOMEM;
+
+ link_name = smb2_get_name(share,
+ file_info->FileName,
+ le32_to_cpu(file_info->FileNameLength),
+ local_nls);
+ if (IS_ERR(link_name) || S_ISDIR(file_inode(filp)->i_mode)) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ ksmbd_debug(SMB, "link name is %s\n", link_name);
+ target_name = d_path(&filp->f_path, pathname, PATH_MAX);
+ if (IS_ERR(target_name)) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ ksmbd_debug(SMB, "target name is %s\n", target_name);
+ rc = ksmbd_vfs_kern_path(link_name, 0, &path, 0);
+ if (rc)
+ file_present = false;
+ else
+ path_put(&path);
+
+ if (file_info->ReplaceIfExists) {
+ if (file_present) {
+ rc = ksmbd_vfs_remove_file(work, link_name);
+ if (rc) {
+ rc = -EINVAL;
+ ksmbd_debug(SMB, "cannot delete %s\n",
+ link_name);
+ goto out;
+ }
+ }
+ } else {
+ if (file_present) {
+ rc = -EEXIST;
+ ksmbd_debug(SMB, "link already exists\n");
+ goto out;
+ }
+ }
+
+ rc = ksmbd_vfs_link(work, target_name, link_name);
+ if (rc)
+ rc = -EINVAL;
+out:
+ if (!IS_ERR(link_name))
+ kfree(link_name);
+ kfree(pathname);
+ return rc;
+}
+
+static int set_file_basic_info(struct ksmbd_file *fp, char *buf,
+ struct ksmbd_share_config *share)
+{
+ struct smb2_file_all_info *file_info;
+ struct iattr attrs;
+ struct iattr temp_attrs;
+ struct file *filp;
+ struct inode *inode;
+ struct user_namespace *user_ns;
+ int rc;
+
+ if (!(fp->daccess & FILE_WRITE_ATTRIBUTES_LE))
+ return -EACCES;
+
+ file_info = (struct smb2_file_all_info *)buf;
+ attrs.ia_valid = 0;
+ filp = fp->filp;
+ inode = file_inode(filp);
+ user_ns = file_mnt_user_ns(filp);
+
+ if (file_info->CreationTime)
+ fp->create_time = le64_to_cpu(file_info->CreationTime);
+
+ if (file_info->LastAccessTime) {
+ attrs.ia_atime = ksmbd_NTtimeToUnix(file_info->LastAccessTime);
+ attrs.ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET);
+ }
+
+ if (file_info->ChangeTime) {
+ temp_attrs.ia_ctime = ksmbd_NTtimeToUnix(file_info->ChangeTime);
+ attrs.ia_ctime = temp_attrs.ia_ctime;
+ attrs.ia_valid |= ATTR_CTIME;
+ } else {
+ temp_attrs.ia_ctime = inode->i_ctime;
+ }
+
+ if (file_info->LastWriteTime) {
+ attrs.ia_mtime = ksmbd_NTtimeToUnix(file_info->LastWriteTime);
+ attrs.ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET);
+ }
+
+ if (file_info->Attributes) {
+ if (!S_ISDIR(inode->i_mode) &&
+ file_info->Attributes & ATTR_DIRECTORY_LE) {
+ pr_err("can't change a file to a directory\n");
+ return -EINVAL;
+ }
+
+ if (!(S_ISDIR(inode->i_mode) && file_info->Attributes == ATTR_NORMAL_LE))
+ fp->f_ci->m_fattr = file_info->Attributes |
+ (fp->f_ci->m_fattr & ATTR_DIRECTORY_LE);
+ }
+
+ if (test_share_config_flag(share, KSMBD_SHARE_FLAG_STORE_DOS_ATTRS) &&
+ (file_info->CreationTime || file_info->Attributes)) {
+ struct xattr_dos_attrib da = {0};
+
+ da.version = 4;
+ da.itime = fp->itime;
+ da.create_time = fp->create_time;
+ da.attr = le32_to_cpu(fp->f_ci->m_fattr);
+ da.flags = XATTR_DOSINFO_ATTRIB | XATTR_DOSINFO_CREATE_TIME |
+ XATTR_DOSINFO_ITIME;
+
+ rc = ksmbd_vfs_set_dos_attrib_xattr(user_ns,
+ filp->f_path.dentry, &da);
+ if (rc)
+ ksmbd_debug(SMB,
+ "failed to restore file attribute in EA\n");
+ rc = 0;
+ }
+
+ /*
+ * HACK: always carry ctime in the setattr so that the inode
+ * ctime is not changed when file_info->ChangeTime is zero.
+ */
+ attrs.ia_ctime = temp_attrs.ia_ctime;
+ attrs.ia_valid |= ATTR_CTIME;
+
+ if (attrs.ia_valid) {
+ struct dentry *dentry = filp->f_path.dentry;
+ struct inode *inode = d_inode(dentry);
+
+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+ return -EACCES;
+
+ rc = setattr_prepare(user_ns, dentry, &attrs);
+ if (rc)
+ return -EINVAL;
+
+ inode_lock(inode);
+ setattr_copy(user_ns, inode, &attrs);
+ attrs.ia_valid &= ~ATTR_CTIME;
+ rc = notify_change(user_ns, dentry, &attrs, NULL);
+ inode_unlock(inode);
+ }
+ return 0;
+}
+
+static int set_file_allocation_info(struct ksmbd_work *work,
+ struct ksmbd_file *fp, char *buf)
+{
+ /*
+ * TODO: this only works correctly when "store dos attributes"
+ * is not enabled. Implement logic that behaves properly with
+ * any smb.conf setting of that option.
+ */
+
+ struct smb2_file_alloc_info *file_alloc_info;
+ loff_t alloc_blks;
+ struct inode *inode;
+ int rc;
+
+ if (!(fp->daccess & FILE_WRITE_DATA_LE))
+ return -EACCES;
+
+ file_alloc_info = (struct smb2_file_alloc_info *)buf;
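+ /* round the requested allocation size up to 512-byte blocks for comparison with inode->i_blocks */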
+ alloc_blks = (le64_to_cpu(file_alloc_info->AllocationSize) + 511) >> 9;
+ inode = file_inode(fp->filp);
+
+ if (alloc_blks > inode->i_blocks) {
+ smb_break_all_levII_oplock(work, fp, 1);
+ rc = vfs_fallocate(fp->filp, FALLOC_FL_KEEP_SIZE, 0,
+ alloc_blks * 512);
+ if (rc && rc != -EOPNOTSUPP) {
+ pr_err("vfs_fallocate is failed : %d\n", rc);
+ return rc;
+ }
+ } else if (alloc_blks < inode->i_blocks) {
+ loff_t size;
+
+ /*
+ * The requested allocation size can be smaller than the
+ * current one, in which case the extra allocated blocks must
+ * be released. Truncate is used to free them, but truncate
+ * also updates the inode size, so the original size is saved
+ * beforehand and written back afterwards.
+ */
+ size = i_size_read(inode);
+ rc = ksmbd_vfs_truncate(work, NULL, fp, alloc_blks * 512);
+ if (rc) {
+ pr_err("truncate failed! filename : %s, err %d\n",
+ fp->filename, rc);
+ return rc;
+ }
+ if (size < alloc_blks * 512)
+ i_size_write(inode, size);
+ }
+ return 0;
+}
+
+static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp,
+ char *buf)
+{
+ struct smb2_file_eof_info *file_eof_info;
+ loff_t newsize;
+ struct inode *inode;
+ int rc;
+
+ if (!(fp->daccess & FILE_WRITE_DATA_LE))
+ return -EACCES;
+
+ file_eof_info = (struct smb2_file_eof_info *)buf;
+ newsize = le64_to_cpu(file_eof_info->EndOfFile);
+ inode = file_inode(fp->filp);
+
+ /*
+ * If FILE_END_OF_FILE_INFORMATION is set on a FAT32 share,
+ * the truncate can take long enough that the Windows client
+ * reports a network error, because filesystems such as FAT32
+ * zero-fill the truncated range.
+ */
+ if (inode->i_sb->s_magic != MSDOS_SUPER_MAGIC) {
+ ksmbd_debug(SMB, "filename : %s truncated to newsize %lld\n",
+ fp->filename, newsize);
+ rc = ksmbd_vfs_truncate(work, NULL, fp, newsize);
+ if (rc) {
+ ksmbd_debug(SMB, "truncate failed! filename : %s err %d\n",
+ fp->filename, rc);
+ if (rc != -EAGAIN)
+ rc = -EBADF;
+ return rc;
+ }
+ }
+ return 0;
+}
+
+static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp,
+ char *buf)
+{
+ struct ksmbd_file *parent_fp;
+ struct dentry *parent;
+ struct dentry *dentry = fp->filp->f_path.dentry;
+ int ret;
+
+ if (!(fp->daccess & FILE_DELETE_LE)) {
+ pr_err("no right to delete : 0x%x\n", fp->daccess);
+ return -EACCES;
+ }
+
+ if (ksmbd_stream_fd(fp))
+ goto next;
+
+ parent = dget_parent(dentry);
+ ret = ksmbd_vfs_lock_parent(parent, dentry);
+ if (ret) {
+ dput(parent);
+ return ret;
+ }
+
+ parent_fp = ksmbd_lookup_fd_inode(d_inode(parent));
+ inode_unlock(d_inode(parent));
+ dput(parent);
+
+ if (parent_fp) {
+ if (parent_fp->daccess & FILE_DELETE_LE) {
+ pr_err("parent dir is opened with delete access\n");
+ return -ESHARE;
+ }
+ }
+next:
+ return smb2_rename(work, fp,
+ (struct smb2_file_rename_info *)buf,
+ work->sess->conn->local_nls);
+}
+
+static int set_file_disposition_info(struct ksmbd_file *fp, char *buf)
+{
+ struct smb2_file_disposition_info *file_info;
+ struct inode *inode;
+
+ if (!(fp->daccess & FILE_DELETE_LE)) {
+ pr_err("no right to delete : 0x%x\n", fp->daccess);
+ return -EACCES;
+ }
+
+ inode = file_inode(fp->filp);
+ file_info = (struct smb2_file_disposition_info *)buf;
+ if (file_info->DeletePending) {
+ if (S_ISDIR(inode->i_mode) &&
+ ksmbd_vfs_empty_dir(fp) == -ENOTEMPTY)
+ return -EBUSY;
+ ksmbd_set_inode_pending_delete(fp);
+ } else {
+ ksmbd_clear_inode_pending_delete(fp);
+ }
+ return 0;
+}
+
+static int set_file_position_info(struct ksmbd_file *fp, char *buf)
+{
+ struct smb2_file_pos_info *file_info;
+ loff_t current_byte_offset;
+ unsigned long sector_size;
+ struct inode *inode;
+
+ inode = file_inode(fp->filp);
+ file_info = (struct smb2_file_pos_info *)buf;
+ current_byte_offset = le64_to_cpu(file_info->CurrentByteOffset);
+ sector_size = inode->i_sb->s_blocksize;
+
+ if (current_byte_offset < 0 ||
+ (fp->coption == FILE_NO_INTERMEDIATE_BUFFERING_LE &&
+ current_byte_offset & (sector_size - 1))) {
+ pr_err("CurrentByteOffset is not valid : %llu\n",
+ current_byte_offset);
+ return -EINVAL;
+ }
+
+ fp->filp->f_pos = current_byte_offset;
+ return 0;
+}
+
+static int set_file_mode_info(struct ksmbd_file *fp, char *buf)
+{
+ struct smb2_file_mode_info *file_info;
+ __le32 mode;
+
+ file_info = (struct smb2_file_mode_info *)buf;
+ mode = file_info->Mode;
+
+ if ((mode & ~FILE_MODE_INFO_MASK) ||
+ (mode & FILE_SYNCHRONOUS_IO_ALERT_LE &&
+ mode & FILE_SYNCHRONOUS_IO_NONALERT_LE)) {
+ pr_err("Mode is not valid : 0x%x\n", le32_to_cpu(mode));
+ return -EINVAL;
+ }
+
+ /*
+ * TODO: handle FILE_SYNCHRONOUS_IO_ALERT and
+ * FILE_SYNCHRONOUS_IO_NONALERT properly.
+ */
+ ksmbd_vfs_set_fadvise(fp->filp, mode);
+ fp->coption = mode;
+ return 0;
+}
+
+/**
+ * smb2_set_info_file() - handler for smb2 set info command
+ * @work: smb work containing set info command buffer
+ * @fp: ksmbd_file pointer
+ * @info_class: smb2 set info class
+ * @share: ksmbd_share_config pointer
+ *
+ * Return: 0 on success, otherwise error
+ * TODO: need to implement an error handling for STATUS_INFO_LENGTH_MISMATCH
+ */
+static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
+ int info_class, char *buf,
+ struct ksmbd_share_config *share)
+{
+ switch (info_class) {
+ case FILE_BASIC_INFORMATION:
+ return set_file_basic_info(fp, buf, share);
+
+ case FILE_ALLOCATION_INFORMATION:
+ return set_file_allocation_info(work, fp, buf);
+
+ case FILE_END_OF_FILE_INFORMATION:
+ return set_end_of_file_info(work, fp, buf);
+
+ case FILE_RENAME_INFORMATION:
+ if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) {
+ ksmbd_debug(SMB,
+ "User does not have write permission\n");
+ return -EACCES;
+ }
+ return set_rename_info(work, fp, buf);
+
+ case FILE_LINK_INFORMATION:
+ return smb2_create_link(work, work->tcon->share_conf,
+ (struct smb2_file_link_info *)buf, fp->filp,
+ work->sess->conn->local_nls);
+
+ case FILE_DISPOSITION_INFORMATION:
+ if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) {
+ ksmbd_debug(SMB,
+ "User does not have write permission\n");
+ return -EACCES;
+ }
+ return set_file_disposition_info(fp, buf);
+
+ case FILE_FULL_EA_INFORMATION:
+ {
+ if (!(fp->daccess & FILE_WRITE_EA_LE)) {
+ pr_err("Not permitted to write ext attr: 0x%x\n",
+ fp->daccess);
+ return -EACCES;
+ }
+
+ return smb2_set_ea((struct smb2_ea_info *)buf,
+ &fp->filp->f_path);
+ }
+
+ case FILE_POSITION_INFORMATION:
+ return set_file_position_info(fp, buf);
+
+ case FILE_MODE_INFORMATION:
+ return set_file_mode_info(fp, buf);
+ }
+
+ pr_err("Unimplemented Fileinfoclass :%d\n", info_class);
+ return -EOPNOTSUPP;
+}
+
+static int smb2_set_info_sec(struct ksmbd_file *fp, int addition_info,
+ char *buffer, int buf_len)
+{
+ struct smb_ntsd *pntsd = (struct smb_ntsd *)buffer;
+
+ fp->saccess |= FILE_SHARE_DELETE_LE;
+
+ return set_info_sec(fp->conn, fp->tcon, &fp->filp->f_path, pntsd,
+ buf_len, false);
+}
+
+/**
+ * smb2_set_info() - handler for smb2 set info command handler
+ * @work: smb work containing set info request buffer
+ *
+ * Return: 0 on success, otherwise error
+ */
+int smb2_set_info(struct ksmbd_work *work)
+{
+ struct smb2_set_info_req *req;
+ struct smb2_set_info_rsp *rsp, *rsp_org;
+ struct ksmbd_file *fp;
+ int rc = 0;
+ unsigned int id = KSMBD_NO_FID, pid = KSMBD_NO_FID;
+
+ ksmbd_debug(SMB, "Received set info request\n");
+
+ rsp_org = work->response_buf;
+ if (work->next_smb2_rcv_hdr_off) {
+ req = ksmbd_req_buf_next(work);
+ rsp = ksmbd_resp_buf_next(work);
+ if (!has_file_id(le64_to_cpu(req->VolatileFileId))) {
+ ksmbd_debug(SMB, "Compound request set FID = %llu\n",
+ work->compound_fid);
+ id = work->compound_fid;
+ pid = work->compound_pfid;
+ }
+ } else {
+ req = work->request_buf;
+ rsp = work->response_buf;
+ }
+
+ if (!has_file_id(id)) {
+ id = le64_to_cpu(req->VolatileFileId);
+ pid = le64_to_cpu(req->PersistentFileId);
+ }
+
+ fp = ksmbd_lookup_fd_slow(work, id, pid);
+ if (!fp) {
+ ksmbd_debug(SMB, "Invalid id for close: %u\n", id);
+ rc = -ENOENT;
+ goto err_out;
+ }
+
+ switch (req->InfoType) {
+ case SMB2_O_INFO_FILE:
+ ksmbd_debug(SMB, "GOT SMB2_O_INFO_FILE\n");
+ rc = smb2_set_info_file(work, fp, req->FileInfoClass,
+ req->Buffer, work->tcon->share_conf);
+ break;
+ case SMB2_O_INFO_SECURITY:
+ ksmbd_debug(SMB, "GOT SMB2_O_INFO_SECURITY\n");
+ if (ksmbd_override_fsids(work)) {
+ rc = -ENOMEM;
+ goto err_out;
+ }
+ rc = smb2_set_info_sec(fp,
+ le32_to_cpu(req->AdditionalInformation),
+ req->Buffer,
+ le32_to_cpu(req->BufferLength));
+ ksmbd_revert_fsids(work);
+ break;
+ default:
+ rc = -EOPNOTSUPP;
+ }
+
+ if (rc < 0)
+ goto err_out;
+
+ rsp->StructureSize = cpu_to_le16(2);
+ inc_rfc1001_len(rsp_org, 2);
+ ksmbd_fd_put(work, fp);
+ return 0;
+
+err_out:
+ if (rc == -EACCES || rc == -EPERM)
+ rsp->hdr.Status = STATUS_ACCESS_DENIED;
+ else if (rc == -EINVAL)
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ else if (rc == -ESHARE)
+ rsp->hdr.Status = STATUS_SHARING_VIOLATION;
+ else if (rc == -ENOENT)
+ rsp->hdr.Status = STATUS_OBJECT_NAME_INVALID;
+ else if (rc == -EBUSY || rc == -ENOTEMPTY)
+ rsp->hdr.Status = STATUS_DIRECTORY_NOT_EMPTY;
+ else if (rc == -EAGAIN)
+ rsp->hdr.Status = STATUS_FILE_LOCK_CONFLICT;
+ else if (rc == -EBADF || rc == -ESTALE)
+ rsp->hdr.Status = STATUS_INVALID_HANDLE;
+ else if (rc == -EEXIST)
+ rsp->hdr.Status = STATUS_OBJECT_NAME_COLLISION;
+ else if (rsp->hdr.Status == 0 || rc == -EOPNOTSUPP)
+ rsp->hdr.Status = STATUS_INVALID_INFO_CLASS;
+ smb2_set_err_rsp(work);
+ ksmbd_fd_put(work, fp);
+ ksmbd_debug(SMB, "error while processing smb2 query rc = %d\n", rc);
+ return rc;
+}
+
+/**
+ * smb2_read_pipe() - handler for smb2 read from IPC pipe
+ * @work: smb work containing read IPC pipe command buffer
+ *
+ * Return: 0 on success, otherwise error
+ */
+static noinline int smb2_read_pipe(struct ksmbd_work *work)
+{
+ int nbytes = 0, err;
+ u64 id;
+ struct ksmbd_rpc_command *rpc_resp;
+ struct smb2_read_req *req = work->request_buf;
+ struct smb2_read_rsp *rsp = work->response_buf;
+
+ id = le64_to_cpu(req->VolatileFileId);
+
+ inc_rfc1001_len(rsp, 16);
+ rpc_resp = ksmbd_rpc_read(work->sess, id);
+ if (rpc_resp) {
+ if (rpc_resp->flags != KSMBD_RPC_OK) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ work->aux_payload_buf =
+ kvmalloc(rpc_resp->payload_sz, GFP_KERNEL | __GFP_ZERO);
+ if (!work->aux_payload_buf) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ memcpy(work->aux_payload_buf, rpc_resp->payload,
+ rpc_resp->payload_sz);
+
+ nbytes = rpc_resp->payload_sz;
+ work->resp_hdr_sz = get_rfc1002_len(rsp) + 4;
+ work->aux_payload_sz = nbytes;
+ kvfree(rpc_resp);
+ }
+
+ rsp->StructureSize = cpu_to_le16(17);
+ rsp->DataOffset = 80;
+ rsp->Reserved = 0;
+ rsp->DataLength = cpu_to_le32(nbytes);
+ rsp->DataRemaining = 0;
+ rsp->Reserved2 = 0;
+ inc_rfc1001_len(rsp, nbytes);
+ return 0;
+
+out:
+ rsp->hdr.Status = STATUS_UNEXPECTED_IO_ERROR;
+ smb2_set_err_rsp(work);
+ kvfree(rpc_resp);
+ return err;
+}
+
+static ssize_t smb2_read_rdma_channel(struct ksmbd_work *work,
+ struct smb2_read_req *req, void *data_buf,
+ size_t length)
+{
+ struct smb2_buffer_desc_v1 *desc =
+ (struct smb2_buffer_desc_v1 *)&req->Buffer[0];
+ int err;
+
+ if (work->conn->dialect == SMB30_PROT_ID &&
+ req->Channel != SMB2_CHANNEL_RDMA_V1)
+ return -EINVAL;
+
+ if (req->ReadChannelInfoOffset == 0 ||
+ le16_to_cpu(req->ReadChannelInfoLength) < sizeof(*desc))
+ return -EINVAL;
+
+ work->need_invalidate_rkey =
+ (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE);
+ work->remote_key = le32_to_cpu(desc->token);
+
+ err = ksmbd_conn_rdma_write(work->conn, data_buf, length,
+ le32_to_cpu(desc->token),
+ le64_to_cpu(desc->offset),
+ le32_to_cpu(desc->length));
+ if (err)
+ return err;
+
+ return length;
+}
+
+/**
+ * smb2_read() - handler for smb2 read from file
+ * @work: smb work containing read command buffer
+ *
+ * Return: 0 on success, otherwise error
+ */
+int smb2_read(struct ksmbd_work *work)
+{
+ struct ksmbd_conn *conn = work->conn;
+ struct smb2_read_req *req;
+ struct smb2_read_rsp *rsp, *rsp_org;
+ struct ksmbd_file *fp;
+ loff_t offset;
+ size_t length, mincount;
+ ssize_t nbytes = 0, remain_bytes = 0;
+ int err = 0;
+
+ rsp_org = work->response_buf;
+ WORK_BUFFERS(work, req, rsp);
+
+ if (test_share_config_flag(work->tcon->share_conf,
+ KSMBD_SHARE_FLAG_PIPE)) {
+ ksmbd_debug(SMB, "IPC pipe read request\n");
+ return smb2_read_pipe(work);
+ }
+
+ fp = ksmbd_lookup_fd_slow(work, le64_to_cpu(req->VolatileFileId),
+ le64_to_cpu(req->PersistentFileId));
+ if (!fp) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ if (!(fp->daccess & (FILE_READ_DATA_LE | FILE_READ_ATTRIBUTES_LE))) {
+ pr_err("Not permitted to read : 0x%x\n", fp->daccess);
+ err = -EACCES;
+ goto out;
+ }
+
+ offset = le64_to_cpu(req->Offset);
+ length = le32_to_cpu(req->Length);
+ mincount = le32_to_cpu(req->MinimumCount);
+
+ if (length > conn->vals->max_read_size) {
+ ksmbd_debug(SMB, "limiting read size to max size(%u)\n",
+ conn->vals->max_read_size);
+ err = -EINVAL;
+ goto out;
+ }
+
+ ksmbd_debug(SMB, "filename %pd, offset %lld, len %zu\n",
+ fp->filp->f_path.dentry, offset, length);
+
+ work->aux_payload_buf = kvmalloc(length, GFP_KERNEL | __GFP_ZERO);
+ if (!work->aux_payload_buf) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ nbytes = ksmbd_vfs_read(work, fp, length, &offset);
+ if (nbytes < 0) {
+ err = nbytes;
+ goto out;
+ }
+
+ if ((nbytes == 0 && length != 0) || nbytes < mincount) {
+ kvfree(work->aux_payload_buf);
+ work->aux_payload_buf = NULL;
+ rsp->hdr.Status = STATUS_END_OF_FILE;
+ smb2_set_err_rsp(work);
+ ksmbd_fd_put(work, fp);
+ return 0;
+ }
+
+ ksmbd_debug(SMB, "nbytes %zu, offset %lld mincount %zu\n",
+ nbytes, offset, mincount);
+
+ if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE ||
+ req->Channel == SMB2_CHANNEL_RDMA_V1) {
+ /* write data to the client using rdma channel */
+ remain_bytes = smb2_read_rdma_channel(work, req,
+ work->aux_payload_buf,
+ nbytes);
+ kvfree(work->aux_payload_buf);
+ work->aux_payload_buf = NULL;
+
+ nbytes = 0;
+ if (remain_bytes < 0) {
+ err = (int)remain_bytes;
+ goto out;
+ }
+ }
+
+ rsp->StructureSize = cpu_to_le16(17);
+ rsp->DataOffset = 80;
+ rsp->Reserved = 0;
+ rsp->DataLength = cpu_to_le32(nbytes);
+ rsp->DataRemaining = cpu_to_le32(remain_bytes);
+ rsp->Reserved2 = 0;
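+ /* the file data is carried as a separate aux payload appended after the 16-byte fixed response */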
+ inc_rfc1001_len(rsp_org, 16);
+ work->resp_hdr_sz = get_rfc1002_len(rsp_org) + 4;
+ work->aux_payload_sz = nbytes;
+ inc_rfc1001_len(rsp_org, nbytes);
+ ksmbd_fd_put(work, fp);
+ return 0;
+
+out:
+ if (err) {
+ if (err == -EISDIR)
+ rsp->hdr.Status = STATUS_INVALID_DEVICE_REQUEST;
+ else if (err == -EAGAIN)
+ rsp->hdr.Status = STATUS_FILE_LOCK_CONFLICT;
+ else if (err == -ENOENT)
+ rsp->hdr.Status = STATUS_FILE_CLOSED;
+ else if (err == -EACCES)
+ rsp->hdr.Status = STATUS_ACCESS_DENIED;
+ else if (err == -ESHARE)
+ rsp->hdr.Status = STATUS_SHARING_VIOLATION;
+ else if (err == -EINVAL)
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ else
+ rsp->hdr.Status = STATUS_INVALID_HANDLE;
+
+ smb2_set_err_rsp(work);
+ }
+ ksmbd_fd_put(work, fp);
+ return err;
+}
+
+/**
+ * smb2_write_pipe() - handler for smb2 write on IPC pipe
+ * @work: smb work containing write IPC pipe command buffer
+ *
+ * Return: 0 on success, otherwise error
+ */
+static noinline int smb2_write_pipe(struct ksmbd_work *work)
+{
+ struct smb2_write_req *req = work->request_buf;
+ struct smb2_write_rsp *rsp = work->response_buf;
+ struct ksmbd_rpc_command *rpc_resp;
+ u64 id = 0;
+ int err = 0, ret = 0;
+ char *data_buf;
+ size_t length;
+
+ length = le32_to_cpu(req->Length);
+ id = le64_to_cpu(req->VolatileFileId);
+
+ if (le16_to_cpu(req->DataOffset) ==
+ (offsetof(struct smb2_write_req, Buffer) - 4)) {
+ data_buf = (char *)&req->Buffer[0];
+ } else {
+ if ((le16_to_cpu(req->DataOffset) > get_rfc1002_len(req)) ||
+ (le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req))) {
+ pr_err("invalid write data offset %u, smb_len %u\n",
+ le16_to_cpu(req->DataOffset),
+ get_rfc1002_len(req));
+ err = -EINVAL;
+ goto out;
+ }
+
+ data_buf = (char *)(((char *)&req->hdr.ProtocolId) +
+ le16_to_cpu(req->DataOffset));
+ }
+
+ rpc_resp = ksmbd_rpc_write(work->sess, id, data_buf, length);
+ if (rpc_resp) {
+ if (rpc_resp->flags == KSMBD_RPC_ENOTIMPLEMENTED) {
+ rsp->hdr.Status = STATUS_NOT_SUPPORTED;
+ kvfree(rpc_resp);
+ smb2_set_err_rsp(work);
+ return -EOPNOTSUPP;
+ }
+ if (rpc_resp->flags != KSMBD_RPC_OK) {
+ rsp->hdr.Status = STATUS_INVALID_HANDLE;
+ smb2_set_err_rsp(work);
+ kvfree(rpc_resp);
+ return ret;
+ }
+ kvfree(rpc_resp);
+ }
+
+ rsp->StructureSize = cpu_to_le16(17);
+ rsp->DataOffset = 0;
+ rsp->Reserved = 0;
+ rsp->DataLength = cpu_to_le32(length);
+ rsp->DataRemaining = 0;
+ rsp->Reserved2 = 0;
+ inc_rfc1001_len(rsp, 16);
+ return 0;
+out:
+ if (err) {
+ rsp->hdr.Status = STATUS_INVALID_HANDLE;
+ smb2_set_err_rsp(work);
+ }
+
+ return err;
+}
+
+static ssize_t smb2_write_rdma_channel(struct ksmbd_work *work,
+ struct smb2_write_req *req,
+ struct ksmbd_file *fp,
+ loff_t offset, size_t length, bool sync)
+{
+ struct smb2_buffer_desc_v1 *desc;
+ char *data_buf;
+ int ret;
+ ssize_t nbytes;
+
+ desc = (struct smb2_buffer_desc_v1 *)&req->Buffer[0];
+
+ if (work->conn->dialect == SMB30_PROT_ID &&
+ req->Channel != SMB2_CHANNEL_RDMA_V1)
+ return -EINVAL;
+
+ if (req->Length != 0 || req->DataOffset != 0)
+ return -EINVAL;
+
+ if (req->WriteChannelInfoOffset == 0 ||
+ le16_to_cpu(req->WriteChannelInfoLength) < sizeof(*desc))
+ return -EINVAL;
+
+ work->need_invalidate_rkey =
+ (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE);
+ work->remote_key = le32_to_cpu(desc->token);
+
+ data_buf = kvmalloc(length, GFP_KERNEL | __GFP_ZERO);
+ if (!data_buf)
+ return -ENOMEM;
+
+ ret = ksmbd_conn_rdma_read(work->conn, data_buf, length,
+ le32_to_cpu(desc->token),
+ le64_to_cpu(desc->offset),
+ le32_to_cpu(desc->length));
+ if (ret < 0) {
+ kvfree(data_buf);
+ return ret;
+ }
+
+ ret = ksmbd_vfs_write(work, fp, data_buf, length, &offset, sync, &nbytes);
+ kvfree(data_buf);
+ if (ret < 0)
+ return ret;
+
+ return nbytes;
+}
+
+/**
+ * smb2_write() - handler for smb2 write from file
+ * @work: smb work containing write command buffer
+ *
+ * Return: 0 on success, otherwise error
+ */
+int smb2_write(struct ksmbd_work *work)
+{
+ struct smb2_write_req *req;
+ struct smb2_write_rsp *rsp, *rsp_org;
+ struct ksmbd_file *fp = NULL;
+ loff_t offset;
+ size_t length;
+ ssize_t nbytes;
+ char *data_buf;
+ bool writethrough = false;
+ int err = 0;
+
+ rsp_org = work->response_buf;
+ WORK_BUFFERS(work, req, rsp);
+
+ if (test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_PIPE)) {
+ ksmbd_debug(SMB, "IPC pipe write request\n");
+ return smb2_write_pipe(work);
+ }
+
+ if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) {
+ ksmbd_debug(SMB, "User does not have write permission\n");
+ err = -EACCES;
+ goto out;
+ }
+
+ fp = ksmbd_lookup_fd_slow(work, le64_to_cpu(req->VolatileFileId),
+ le64_to_cpu(req->PersistentFileId));
+ if (!fp) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ if (!(fp->daccess & (FILE_WRITE_DATA_LE | FILE_READ_ATTRIBUTES_LE))) {
+ pr_err("Not permitted to write : 0x%x\n", fp->daccess);
+ err = -EACCES;
+ goto out;
+ }
+
+ offset = le64_to_cpu(req->Offset);
+ length = le32_to_cpu(req->Length);
+
+ if (length > work->conn->vals->max_write_size) {
+ ksmbd_debug(SMB, "limiting write size to max size(%u)\n",
+ work->conn->vals->max_write_size);
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH)
+ writethrough = true;
+
+ if (req->Channel != SMB2_CHANNEL_RDMA_V1 &&
+ req->Channel != SMB2_CHANNEL_RDMA_V1_INVALIDATE) {
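+ /* inline write data: validate that DataOffset/Length stay within the received PDU */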
+ if (le16_to_cpu(req->DataOffset) ==
+ (offsetof(struct smb2_write_req, Buffer) - 4)) {
+ data_buf = (char *)&req->Buffer[0];
+ } else {
+ if ((le16_to_cpu(req->DataOffset) > get_rfc1002_len(req)) ||
+ (le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req))) {
+ pr_err("invalid write data offset %u, smb_len %u\n",
+ le16_to_cpu(req->DataOffset),
+ get_rfc1002_len(req));
+ err = -EINVAL;
+ goto out;
+ }
+
+ data_buf = (char *)(((char *)&req->hdr.ProtocolId) +
+ le16_to_cpu(req->DataOffset));
+ }
+
+ ksmbd_debug(SMB, "flags %u\n", le32_to_cpu(req->Flags));
+ if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH)
+ writethrough = true;
+
+ ksmbd_debug(SMB, "filename %pd, offset %lld, len %zu\n",
+ fp->filp->f_path.dentry, offset, length);
+ err = ksmbd_vfs_write(work, fp, data_buf, length, &offset,
+ writethrough, &nbytes);
+ if (err < 0)
+ goto out;
+ } else {
+ /* read data from the client using rdma channel, and
+ * write the data.
+ */
+ nbytes = smb2_write_rdma_channel(work, req, fp, offset,
+ le32_to_cpu(req->RemainingBytes),
+ writethrough);
+ if (nbytes < 0) {
+ err = (int)nbytes;
+ goto out;
+ }
+ }
+
+ rsp->StructureSize = cpu_to_le16(17);
+ rsp->DataOffset = 0;
+ rsp->Reserved = 0;
+ rsp->DataLength = cpu_to_le32(nbytes);
+ rsp->DataRemaining = 0;
+ rsp->Reserved2 = 0;
+ inc_rfc1001_len(rsp_org, 16);
+ ksmbd_fd_put(work, fp);
+ return 0;
+
+out:
+ if (err == -EAGAIN)
+ rsp->hdr.Status = STATUS_FILE_LOCK_CONFLICT;
+ else if (err == -ENOSPC || err == -EFBIG)
+ rsp->hdr.Status = STATUS_DISK_FULL;
+ else if (err == -ENOENT)
+ rsp->hdr.Status = STATUS_FILE_CLOSED;
+ else if (err == -EACCES)
+ rsp->hdr.Status = STATUS_ACCESS_DENIED;
+ else if (err == -ESHARE)
+ rsp->hdr.Status = STATUS_SHARING_VIOLATION;
+ else if (err == -EINVAL)
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ else
+ rsp->hdr.Status = STATUS_INVALID_HANDLE;
+
+ smb2_set_err_rsp(work);
+ ksmbd_fd_put(work, fp);
+ return err;
+}
+
+/**
+ * smb2_flush() - handler for smb2 flush file - fsync
+ * @work: smb work containing flush command buffer
+ *
+ * Return: 0 on success, otherwise error
+ */
+int smb2_flush(struct ksmbd_work *work)
+{
+ struct smb2_flush_req *req;
+ struct smb2_flush_rsp *rsp, *rsp_org;
+ int err;
+
+ rsp_org = work->response_buf;
+ WORK_BUFFERS(work, req, rsp);
+
+ ksmbd_debug(SMB, "SMB2_FLUSH called for fid %llu\n",
+ le64_to_cpu(req->VolatileFileId));
+
+ err = ksmbd_vfs_fsync(work,
+ le64_to_cpu(req->VolatileFileId),
+ le64_to_cpu(req->PersistentFileId));
+ if (err)
+ goto out;
+
+ rsp->StructureSize = cpu_to_le16(4);
+ rsp->Reserved = 0;
+ inc_rfc1001_len(rsp_org, 4);
+ return 0;
+
+out:
+ if (err) {
+ rsp->hdr.Status = STATUS_INVALID_HANDLE;
+ smb2_set_err_rsp(work);
+ }
+
+ return err;
+}
+
+/**
+ * smb2_cancel() - handler for smb2 cancel command
+ * @work: smb work containing cancel command buffer
+ *
+ * Return: 0 on success, otherwise error
+ */
+int smb2_cancel(struct ksmbd_work *work)
+{
+ struct ksmbd_conn *conn = work->conn;
+ struct smb2_hdr *hdr = work->request_buf;
+ struct smb2_hdr *chdr;
+ struct ksmbd_work *cancel_work = NULL;
+ int canceled = 0;
+ struct list_head *command_list;
+
+ ksmbd_debug(SMB, "smb2 cancel called on mid %llu, async flags 0x%x\n",
+ hdr->MessageId, hdr->Flags);
+
+ if (hdr->Flags & SMB2_FLAGS_ASYNC_COMMAND) {
+ command_list = &conn->async_requests;
+
+ spin_lock(&conn->request_lock);
+ list_for_each_entry(cancel_work, command_list,
+ async_request_entry) {
+ chdr = cancel_work->request_buf;
+
+ if (cancel_work->async_id !=
+ le64_to_cpu(hdr->Id.AsyncId))
+ continue;
+
+ ksmbd_debug(SMB,
+ "smb2 with AsyncId %llu cancelled command = 0x%x\n",
+ le64_to_cpu(hdr->Id.AsyncId),
+ le16_to_cpu(chdr->Command));
+ canceled = 1;
+ break;
+ }
+ spin_unlock(&conn->request_lock);
+ } else {
+ command_list = &conn->requests;
+
+ spin_lock(&conn->request_lock);
+ list_for_each_entry(cancel_work, command_list, request_entry) {
+ chdr = cancel_work->request_buf;
+
+ if (chdr->MessageId != hdr->MessageId ||
+ cancel_work == work)
+ continue;
+
+ ksmbd_debug(SMB,
+ "smb2 with mid %llu cancelled command = 0x%x\n",
+ le64_to_cpu(hdr->MessageId),
+ le16_to_cpu(chdr->Command));
+ canceled = 1;
+ break;
+ }
+ spin_unlock(&conn->request_lock);
+ }
+
+ if (canceled) {
+ cancel_work->state = KSMBD_WORK_CANCELLED;
+ if (cancel_work->cancel_fn)
+ cancel_work->cancel_fn(cancel_work->cancel_argv);
+ }
+
+ /* No response is sent for the SMB2_CANCEL command itself */
+ work->send_no_response = 1;
+ return 0;
+}
+
+struct file_lock *smb_flock_init(struct file *f)
+{
+ struct file_lock *fl;
+
+ fl = locks_alloc_lock();
+ if (!fl)
+ goto out;
+
+ locks_init_lock(fl);
+
+ fl->fl_owner = f;
+ fl->fl_pid = current->tgid;
+ fl->fl_file = f;
+ fl->fl_flags = FL_POSIX;
+ fl->fl_ops = NULL;
+ fl->fl_lmops = NULL;
+
+out:
+ return fl;
+}
+
+static int smb2_set_flock_flags(struct file_lock *flock, int flags)
+{
+ int cmd = -EINVAL;
+
+ /* Check for invalid flag combinations in the lock request */
+ switch (flags) {
+ case SMB2_LOCKFLAG_SHARED:
+ ksmbd_debug(SMB, "received shared request\n");
+ cmd = F_SETLKW;
+ flock->fl_type = F_RDLCK;
+ flock->fl_flags |= FL_SLEEP;
+ break;
+ case SMB2_LOCKFLAG_EXCLUSIVE:
+ ksmbd_debug(SMB, "received exclusive request\n");
+ cmd = F_SETLKW;
+ flock->fl_type = F_WRLCK;
+ flock->fl_flags |= FL_SLEEP;
+ break;
+ case SMB2_LOCKFLAG_SHARED | SMB2_LOCKFLAG_FAIL_IMMEDIATELY:
+ ksmbd_debug(SMB,
+ "received shared & fail immediately request\n");
+ cmd = F_SETLK;
+ flock->fl_type = F_RDLCK;
+ break;
+ case SMB2_LOCKFLAG_EXCLUSIVE | SMB2_LOCKFLAG_FAIL_IMMEDIATELY:
+ ksmbd_debug(SMB,
+ "received exclusive & fail immediately request\n");
+ cmd = F_SETLK;
+ flock->fl_type = F_WRLCK;
+ break;
+ case SMB2_LOCKFLAG_UNLOCK:
+ ksmbd_debug(SMB, "received unlock request\n");
+ flock->fl_type = F_UNLCK;
+ cmd = 0;
+ break;
+ }
+
+ return cmd;
+}
+
+static struct ksmbd_lock *smb2_lock_init(struct file_lock *flock,
+ unsigned int cmd, int flags,
+ struct list_head *lock_list)
+{
+ struct ksmbd_lock *lock;
+
+ lock = kzalloc(sizeof(struct ksmbd_lock), GFP_KERNEL);
+ if (!lock)
+ return NULL;
+
+ lock->cmd = cmd;
+ lock->fl = flock;
+ lock->start = flock->fl_start;
+ lock->end = flock->fl_end;
+ lock->flags = flags;
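+ /* remember zero-length ranges; they get special conflict handling in smb2_lock() */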
+ if (lock->start == lock->end)
+ lock->zero_len = 1;
+ INIT_LIST_HEAD(&lock->clist);
+ INIT_LIST_HEAD(&lock->flist);
+ INIT_LIST_HEAD(&lock->llist);
+ list_add_tail(&lock->llist, lock_list);
+
+ return lock;
+}
+
+static void smb2_remove_blocked_lock(void **argv)
+{
+ struct file_lock *flock = (struct file_lock *)argv[0];
+
+ ksmbd_vfs_posix_lock_unblock(flock);
+ wake_up(&flock->fl_wait);
+}
+
+static inline bool lock_defer_pending(struct file_lock *fl)
+{
+ /* check pending lock waiters */
+ return waitqueue_active(&fl->fl_wait);
+}
+
+/**
+ * smb2_lock() - handler for smb2 file lock command
+ * @work: smb work containing lock command buffer
+ *
+ * Return: 0 on success, otherwise error
+ */
+int smb2_lock(struct ksmbd_work *work)
+{
+ struct smb2_lock_req *req = work->request_buf;
+ struct smb2_lock_rsp *rsp = work->response_buf;
+ struct smb2_lock_element *lock_ele;
+ struct ksmbd_file *fp = NULL;
+ struct file_lock *flock = NULL;
+ struct file *filp = NULL;
+ int lock_count;
+ int flags = 0;
+ int cmd = 0;
+ int err = -EIO, i, rc = 0;
+ u64 lock_start, lock_length;
+ struct ksmbd_lock *smb_lock = NULL, *cmp_lock, *tmp, *tmp2;
+ struct ksmbd_conn *conn;
+ int nolock = 0;
+ LIST_HEAD(lock_list);
+ LIST_HEAD(rollback_list);
+ int prior_lock = 0;
+
+ ksmbd_debug(SMB, "Received lock request\n");
+ fp = ksmbd_lookup_fd_slow(work,
+ le64_to_cpu(req->VolatileFileId),
+ le64_to_cpu(req->PersistentFileId));
+ if (!fp) {
+ ksmbd_debug(SMB, "Invalid file id for lock : %llu\n",
+ le64_to_cpu(req->VolatileFileId));
+ err = -ENOENT;
+ goto out2;
+ }
+
+ filp = fp->filp;
+ lock_count = le16_to_cpu(req->LockCount);
+ lock_ele = req->locks;
+
+ ksmbd_debug(SMB, "lock count is %d\n", lock_count);
+ if (!lock_count) {
+ err = -EINVAL;
+ goto out2;
+ }
+
+ for (i = 0; i < lock_count; i++) {
+ flags = le32_to_cpu(lock_ele[i].Flags);
+
+ flock = smb_flock_init(filp);
+ if (!flock)
+ goto out;
+
+ cmd = smb2_set_flock_flags(flock, flags);
+
+ lock_start = le64_to_cpu(lock_ele[i].Offset);
+ lock_length = le64_to_cpu(lock_ele[i].Length);
+ if (lock_start > U64_MAX - lock_length) {
+ pr_err("Invalid lock range requested\n");
+ rsp->hdr.Status = STATUS_INVALID_LOCK_RANGE;
+ goto out;
+ }
+
+ if (lock_start > OFFSET_MAX)
+ flock->fl_start = OFFSET_MAX;
+ else
+ flock->fl_start = lock_start;
+
+ lock_length = le64_to_cpu(lock_ele[i].Length);
+ if (lock_length > OFFSET_MAX - flock->fl_start)
+ lock_length = OFFSET_MAX - flock->fl_start;
+
+ flock->fl_end = flock->fl_start + lock_length;
+
+ if (flock->fl_end < flock->fl_start) {
+ ksmbd_debug(SMB,
+ "the end offset(%llx) is smaller than the start offset(%llx)\n",
+ flock->fl_end, flock->fl_start);
+ rsp->hdr.Status = STATUS_INVALID_LOCK_RANGE;
+ goto out;
+ }
+
+ /* Check conflict locks in one request */
+ list_for_each_entry(cmp_lock, &lock_list, llist) {
+ if (cmp_lock->fl->fl_start <= flock->fl_start &&
+ cmp_lock->fl->fl_end >= flock->fl_end) {
+ if (cmp_lock->fl->fl_type != F_UNLCK &&
+ flock->fl_type != F_UNLCK) {
+ pr_err("conflict two locks in one request\n");
+ err = -EINVAL;
+ goto out;
+ }
+ }
+ }
+
+ smb_lock = smb2_lock_init(flock, cmd, flags, &lock_list);
+ if (!smb_lock) {
+ err = -EINVAL;
+ goto out;
+ }
+ }
+
+ list_for_each_entry_safe(smb_lock, tmp, &lock_list, llist) {
+ if (smb_lock->cmd < 0) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (!(smb_lock->flags & SMB2_LOCKFLAG_MASK)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if ((prior_lock & (SMB2_LOCKFLAG_EXCLUSIVE | SMB2_LOCKFLAG_SHARED) &&
+ smb_lock->flags & SMB2_LOCKFLAG_UNLOCK) ||
+ (prior_lock == SMB2_LOCKFLAG_UNLOCK &&
+ !(smb_lock->flags & SMB2_LOCKFLAG_UNLOCK))) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ prior_lock = smb_lock->flags;
+
+ if (!(smb_lock->flags & SMB2_LOCKFLAG_UNLOCK) &&
+ !(smb_lock->flags & SMB2_LOCKFLAG_FAIL_IMMEDIATELY))
+ goto no_check_cl;
+
+ nolock = 1;
+ /* check locks in connection list */
+ read_lock(&conn_list_lock);
+ list_for_each_entry(conn, &conn_list, conns_list) {
+ spin_lock(&conn->llist_lock);
+ list_for_each_entry_safe(cmp_lock, tmp2, &conn->lock_list, clist) {
+ if (file_inode(cmp_lock->fl->fl_file) !=
+ file_inode(smb_lock->fl->fl_file))
+ continue;
+
+ if (smb_lock->fl->fl_type == F_UNLCK) {
+ if (cmp_lock->fl->fl_file == smb_lock->fl->fl_file &&
+ cmp_lock->start == smb_lock->start &&
+ cmp_lock->end == smb_lock->end &&
+ !lock_defer_pending(cmp_lock->fl)) {
+ nolock = 0;
+ list_del(&cmp_lock->flist);
+ list_del(&cmp_lock->clist);
+ spin_unlock(&conn->llist_lock);
+ read_unlock(&conn_list_lock);
+
+ locks_free_lock(cmp_lock->fl);
+ kfree(cmp_lock);
+ goto out_check_cl;
+ }
+ continue;
+ }
+
+ if (cmp_lock->fl->fl_file == smb_lock->fl->fl_file) {
+ if (smb_lock->flags & SMB2_LOCKFLAG_SHARED)
+ continue;
+ } else {
+ if (cmp_lock->flags & SMB2_LOCKFLAG_SHARED)
+ continue;
+ }
+
+ /* check zero byte lock range */
+ if (cmp_lock->zero_len && !smb_lock->zero_len &&
+ cmp_lock->start > smb_lock->start &&
+ cmp_lock->start < smb_lock->end) {
+ spin_unlock(&conn->llist_lock);
+ read_unlock(&conn_list_lock);
+ pr_err("previous lock conflict with zero byte lock range\n");
+ goto out;
+ }
+
+ if (smb_lock->zero_len && !cmp_lock->zero_len &&
+ smb_lock->start > cmp_lock->start &&
+ smb_lock->start < cmp_lock->end) {
+ spin_unlock(&conn->llist_lock);
+ read_unlock(&conn_list_lock);
+ pr_err("current lock conflict with zero byte lock range\n");
+ goto out;
+ }
+
+ if (((cmp_lock->start <= smb_lock->start &&
+ cmp_lock->end > smb_lock->start) ||
+ (cmp_lock->start < smb_lock->end &&
+ cmp_lock->end >= smb_lock->end)) &&
+ !cmp_lock->zero_len && !smb_lock->zero_len) {
+ spin_unlock(&conn->llist_lock);
+ read_unlock(&conn_list_lock);
+ pr_err("Not allow lock operation on exclusive lock range\n");
+ goto out;
+ }
+ }
+ spin_unlock(&conn->llist_lock);
+ }
+ read_unlock(&conn_list_lock);
+out_check_cl:
+ if (smb_lock->fl->fl_type == F_UNLCK && nolock) {
+ pr_err("Try to unlock nolocked range\n");
+ rsp->hdr.Status = STATUS_RANGE_NOT_LOCKED;
+ goto out;
+ }
+
+no_check_cl:
+ if (smb_lock->zero_len) {
+ err = 0;
+ goto skip;
+ }
+
+ flock = smb_lock->fl;
+ list_del(&smb_lock->llist);
+retry:
+ rc = vfs_lock_file(filp, smb_lock->cmd, flock, NULL);
+skip:
+ if (flags & SMB2_LOCKFLAG_UNLOCK) {
+ if (!rc) {
+ ksmbd_debug(SMB, "File unlocked\n");
+ } else if (rc == -ENOENT) {
+ rsp->hdr.Status = STATUS_NOT_LOCKED;
+ goto out;
+ }
+ locks_free_lock(flock);
+ kfree(smb_lock);
+ } else {
+ if (rc == FILE_LOCK_DEFERRED) {
+ void **argv;
+
+ ksmbd_debug(SMB,
+ "would have to wait for getting lock\n");
+ spin_lock(&work->conn->llist_lock);
+ list_add_tail(&smb_lock->clist,
+ &work->conn->lock_list);
+ spin_unlock(&work->conn->llist_lock);
+ list_add(&smb_lock->llist, &rollback_list);
+
+ argv = kmalloc(sizeof(void *), GFP_KERNEL);
+ if (!argv) {
+ err = -ENOMEM;
+ goto out;
+ }
+ argv[0] = flock;
+
+ rc = setup_async_work(work,
+ smb2_remove_blocked_lock,
+ argv);
+ if (rc) {
+ err = -ENOMEM;
+ goto out;
+ }
+ spin_lock(&fp->f_lock);
+ list_add(&work->fp_entry, &fp->blocked_works);
+ spin_unlock(&fp->f_lock);
+
+ smb2_send_interim_resp(work, STATUS_PENDING);
+
+ ksmbd_vfs_posix_lock_wait(flock);
+
+ if (work->state != KSMBD_WORK_ACTIVE) {
+ list_del(&smb_lock->llist);
+ spin_lock(&work->conn->llist_lock);
+ list_del(&smb_lock->clist);
+ spin_unlock(&work->conn->llist_lock);
+ locks_free_lock(flock);
+
+ if (work->state == KSMBD_WORK_CANCELLED) {
+ spin_lock(&fp->f_lock);
+ list_del(&work->fp_entry);
+ spin_unlock(&fp->f_lock);
+ rsp->hdr.Status =
+ STATUS_CANCELLED;
+ kfree(smb_lock);
+ smb2_send_interim_resp(work,
+ STATUS_CANCELLED);
+ work->send_no_response = 1;
+ goto out;
+ }
+ init_smb2_rsp_hdr(work);
+ smb2_set_err_rsp(work);
+ rsp->hdr.Status =
+ STATUS_RANGE_NOT_LOCKED;
+ kfree(smb_lock);
+ goto out2;
+ }
+
+ list_del(&smb_lock->llist);
+ spin_lock(&work->conn->llist_lock);
+ list_del(&smb_lock->clist);
+ spin_unlock(&work->conn->llist_lock);
+
+ spin_lock(&fp->f_lock);
+ list_del(&work->fp_entry);
+ spin_unlock(&fp->f_lock);
+ goto retry;
+ } else if (!rc) {
+ spin_lock(&work->conn->llist_lock);
+ list_add_tail(&smb_lock->clist,
+ &work->conn->lock_list);
+ list_add_tail(&smb_lock->flist,
+ &fp->lock_list);
+ spin_unlock(&work->conn->llist_lock);
+ list_add(&smb_lock->llist, &rollback_list);
+ ksmbd_debug(SMB, "successful in taking lock\n");
+ } else {
+ goto out;
+ }
+ }
+ }
+
+ if (atomic_read(&fp->f_ci->op_count) > 1)
+ smb_break_all_oplock(work, fp);
+
+ rsp->StructureSize = cpu_to_le16(4);
+ ksmbd_debug(SMB, "successful in taking lock\n");
+ rsp->hdr.Status = STATUS_SUCCESS;
+ rsp->Reserved = 0;
+ inc_rfc1001_len(rsp, 4);
+ ksmbd_fd_put(work, fp);
+ return 0;
+
+out:
+ list_for_each_entry_safe(smb_lock, tmp, &lock_list, llist) {
+ locks_free_lock(smb_lock->fl);
+ list_del(&smb_lock->llist);
+ kfree(smb_lock);
+ }
+
+ list_for_each_entry_safe(smb_lock, tmp, &rollback_list, llist) {
+ struct file_lock *rlock = NULL;
+
+ rlock = smb_flock_init(filp);
+ rlock->fl_type = F_UNLCK;
+ rlock->fl_start = smb_lock->start;
+ rlock->fl_end = smb_lock->end;
+
+ rc = vfs_lock_file(filp, 0, rlock, NULL);
+ if (rc)
+ pr_err("rollback unlock fail : %d\n", rc);
+
+ list_del(&smb_lock->llist);
+ spin_lock(&work->conn->llist_lock);
+ if (!list_empty(&smb_lock->flist))
+ list_del(&smb_lock->flist);
+ list_del(&smb_lock->clist);
+ spin_unlock(&work->conn->llist_lock);
+
+ locks_free_lock(smb_lock->fl);
+ locks_free_lock(rlock);
+ kfree(smb_lock);
+ }
+out2:
+ ksmbd_debug(SMB, "failed in taking lock(flags : %x), err : %d\n", flags, err);
+
+ if (!rsp->hdr.Status) {
+ if (err == -EINVAL)
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ else if (err == -ENOMEM)
+ rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES;
+ else if (err == -ENOENT)
+ rsp->hdr.Status = STATUS_FILE_CLOSED;
+ else
+ rsp->hdr.Status = STATUS_LOCK_NOT_GRANTED;
+ }
+
+ smb2_set_err_rsp(work);
+ ksmbd_fd_put(work, fp);
+ return err;
+}
+
+static int fsctl_copychunk(struct ksmbd_work *work, struct smb2_ioctl_req *req,
+ struct smb2_ioctl_rsp *rsp)
+{
+ struct copychunk_ioctl_req *ci_req;
+ struct copychunk_ioctl_rsp *ci_rsp;
+ struct ksmbd_file *src_fp = NULL, *dst_fp = NULL;
+ struct srv_copychunk *chunks;
+ unsigned int i, chunk_count, chunk_count_written = 0;
+ unsigned int chunk_size_written = 0;
+ loff_t total_size_written = 0;
+ int ret, cnt_code;
+
+ cnt_code = le32_to_cpu(req->CntCode);
+ ci_req = (struct copychunk_ioctl_req *)&req->Buffer[0];
+ ci_rsp = (struct copychunk_ioctl_rsp *)&rsp->Buffer[0];
+
+ rsp->VolatileFileId = req->VolatileFileId;
+ rsp->PersistentFileId = req->PersistentFileId;
+ ci_rsp->ChunksWritten =
+ cpu_to_le32(ksmbd_server_side_copy_max_chunk_count());
+ ci_rsp->ChunkBytesWritten =
+ cpu_to_le32(ksmbd_server_side_copy_max_chunk_size());
+ ci_rsp->TotalBytesWritten =
+ cpu_to_le32(ksmbd_server_side_copy_max_total_size());
+
+ chunks = (struct srv_copychunk *)&ci_req->Chunks[0];
+ chunk_count = le32_to_cpu(ci_req->ChunkCount);
+ total_size_written = 0;
+
+ /* verify the SRV_COPYCHUNK_COPY packet */
+ if (chunk_count > ksmbd_server_side_copy_max_chunk_count() ||
+ le32_to_cpu(req->InputCount) <
+ offsetof(struct copychunk_ioctl_req, Chunks) +
+ chunk_count * sizeof(struct srv_copychunk)) {
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ return -EINVAL;
+ }
+
+ for (i = 0; i < chunk_count; i++) {
+ if (le32_to_cpu(chunks[i].Length) == 0 ||
+ le32_to_cpu(chunks[i].Length) > ksmbd_server_side_copy_max_chunk_size())
+ break;
+ total_size_written += le32_to_cpu(chunks[i].Length);
+ }
+
+ if (i < chunk_count ||
+ total_size_written > ksmbd_server_side_copy_max_total_size()) {
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ return -EINVAL;
+ }
+
+ src_fp = ksmbd_lookup_foreign_fd(work,
+ le64_to_cpu(ci_req->ResumeKey[0]));
+ dst_fp = ksmbd_lookup_fd_slow(work,
+ le64_to_cpu(req->VolatileFileId),
+ le64_to_cpu(req->PersistentFileId));
+ ret = -EINVAL;
+ if (!src_fp ||
+ src_fp->persistent_id != le64_to_cpu(ci_req->ResumeKey[1])) {
+ rsp->hdr.Status = STATUS_OBJECT_NAME_NOT_FOUND;
+ goto out;
+ }
+
+ if (!dst_fp) {
+ rsp->hdr.Status = STATUS_FILE_CLOSED;
+ goto out;
+ }
+
+ /*
+ * FILE_READ_DATA should only be included in
+ * the FSCTL_COPYCHUNK case
+ */
+ if (cnt_code == FSCTL_COPYCHUNK &&
+ !(dst_fp->daccess & (FILE_READ_DATA_LE | FILE_GENERIC_READ_LE))) {
+ rsp->hdr.Status = STATUS_ACCESS_DENIED;
+ goto out;
+ }
+
+ ret = ksmbd_vfs_copy_file_ranges(work, src_fp, dst_fp,
+ chunks, chunk_count,
+ &chunk_count_written,
+ &chunk_size_written,
+ &total_size_written);
+ if (ret < 0) {
+ if (ret == -EACCES)
+ rsp->hdr.Status = STATUS_ACCESS_DENIED;
+		else if (ret == -EAGAIN)
+ rsp->hdr.Status = STATUS_FILE_LOCK_CONFLICT;
+ else if (ret == -EBADF)
+ rsp->hdr.Status = STATUS_INVALID_HANDLE;
+ else if (ret == -EFBIG || ret == -ENOSPC)
+ rsp->hdr.Status = STATUS_DISK_FULL;
+ else if (ret == -EINVAL)
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ else if (ret == -EISDIR)
+ rsp->hdr.Status = STATUS_FILE_IS_A_DIRECTORY;
+ else if (ret == -E2BIG)
+ rsp->hdr.Status = STATUS_INVALID_VIEW_SIZE;
+ else
+ rsp->hdr.Status = STATUS_UNEXPECTED_IO_ERROR;
+ }
+
+ ci_rsp->ChunksWritten = cpu_to_le32(chunk_count_written);
+ ci_rsp->ChunkBytesWritten = cpu_to_le32(chunk_size_written);
+ ci_rsp->TotalBytesWritten = cpu_to_le32(total_size_written);
+out:
+ ksmbd_fd_put(work, src_fp);
+ ksmbd_fd_put(work, dst_fp);
+ return ret;
+}
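
A worked example of the SRV_COPYCHUNK_COPY size check above may help, since the bound mixes a structure offset with a per-chunk multiply. The layout figures below come from MS-SMB2 (24-byte SourceKey, 4-byte ChunkCount, 4-byte Reserved, then 24-byte SRV_COPYCHUNK entries), not from this hunk, so treat them as assumptions.

/*
 * Worked example of the SRV_COPYCHUNK_COPY bounds check (layout per MS-SMB2):
 *   fixed part = 24 (SourceKey) + 4 (ChunkCount) + 4 (Reserved) = 32 bytes
 *   each chunk = 8 (SourceOffset) + 8 (TargetOffset) + 4 (Length) + 4 (Reserved) = 24 bytes
 *   ChunkCount = 3  ->  InputCount must be at least 32 + 3 * 24 = 104 bytes,
 *   otherwise fsctl_copychunk() rejects the request with STATUS_INVALID_PARAMETER.
 */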
+
+static __be32 idev_ipv4_address(struct in_device *idev)
+{
+ __be32 addr = 0;
+
+ struct in_ifaddr *ifa;
+
+ rcu_read_lock();
+ in_dev_for_each_ifa_rcu(ifa, idev) {
+ if (ifa->ifa_flags & IFA_F_SECONDARY)
+ continue;
+
+ addr = ifa->ifa_address;
+ break;
+ }
+ rcu_read_unlock();
+ return addr;
+}
+
+static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn,
+ struct smb2_ioctl_req *req,
+ struct smb2_ioctl_rsp *rsp)
+{
+ struct network_interface_info_ioctl_rsp *nii_rsp = NULL;
+ int nbytes = 0;
+ struct net_device *netdev;
+ struct sockaddr_storage_rsp *sockaddr_storage;
+ unsigned int flags;
+ unsigned long long speed;
+ struct sockaddr_in6 *csin6 = (struct sockaddr_in6 *)&conn->peer_addr;
+
+ rtnl_lock();
+ for_each_netdev(&init_net, netdev) {
+ if (netdev->type == ARPHRD_LOOPBACK)
+ continue;
+
+ flags = dev_get_flags(netdev);
+ if (!(flags & IFF_RUNNING))
+ continue;
+
+ nii_rsp = (struct network_interface_info_ioctl_rsp *)
+ &rsp->Buffer[nbytes];
+ nii_rsp->IfIndex = cpu_to_le32(netdev->ifindex);
+
+ nii_rsp->Capability = 0;
+ if (ksmbd_rdma_capable_netdev(netdev))
+ nii_rsp->Capability |= cpu_to_le32(RDMA_CAPABLE);
+
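+		/* Next is the byte offset of the following entry;
+		 * 152 == sizeof(struct network_interface_info_ioctl_rsp)
+		 */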
+ nii_rsp->Next = cpu_to_le32(152);
+ nii_rsp->Reserved = 0;
+
+ if (netdev->ethtool_ops->get_link_ksettings) {
+ struct ethtool_link_ksettings cmd;
+
+ netdev->ethtool_ops->get_link_ksettings(netdev, &cmd);
+ speed = cmd.base.speed;
+ } else {
+ pr_err("%s %s\n", netdev->name,
+ "speed is unknown, defaulting to 1Gb/sec");
+ speed = SPEED_1000;
+ }
+
+ speed *= 1000000;
+ nii_rsp->LinkSpeed = cpu_to_le64(speed);
+
+ sockaddr_storage = (struct sockaddr_storage_rsp *)
+ nii_rsp->SockAddr_Storage;
+ memset(sockaddr_storage, 0, 128);
+
+ if (conn->peer_addr.ss_family == PF_INET ||
+ ipv6_addr_v4mapped(&csin6->sin6_addr)) {
+ struct in_device *idev;
+
+ sockaddr_storage->Family = cpu_to_le16(INTERNETWORK);
+ sockaddr_storage->addr4.Port = 0;
+
+ idev = __in_dev_get_rtnl(netdev);
+ if (!idev)
+ continue;
+ sockaddr_storage->addr4.IPv4address =
+ idev_ipv4_address(idev);
+ } else {
+ struct inet6_dev *idev6;
+ struct inet6_ifaddr *ifa;
+ __u8 *ipv6_addr = sockaddr_storage->addr6.IPv6address;
+
+ sockaddr_storage->Family = cpu_to_le16(INTERNETWORKV6);
+ sockaddr_storage->addr6.Port = 0;
+ sockaddr_storage->addr6.FlowInfo = 0;
+
+ idev6 = __in6_dev_get(netdev);
+ if (!idev6)
+ continue;
+
+ list_for_each_entry(ifa, &idev6->addr_list, if_list) {
+ if (ifa->flags & (IFA_F_TENTATIVE |
+ IFA_F_DEPRECATED))
+ continue;
+ memcpy(ipv6_addr, ifa->addr.s6_addr, 16);
+ break;
+ }
+ sockaddr_storage->addr6.ScopeId = 0;
+ }
+
+ nbytes += sizeof(struct network_interface_info_ioctl_rsp);
+ }
+ rtnl_unlock();
+
+ /* zero if this is last one */
+ if (nii_rsp)
+ nii_rsp->Next = 0;
+
+ if (!nbytes) {
+ rsp->hdr.Status = STATUS_BUFFER_TOO_SMALL;
+ return -EINVAL;
+ }
+
+ rsp->PersistentFileId = cpu_to_le64(SMB2_NO_FID);
+ rsp->VolatileFileId = cpu_to_le64(SMB2_NO_FID);
+ return nbytes;
+}
+
+static int fsctl_validate_negotiate_info(struct ksmbd_conn *conn,
+ struct validate_negotiate_info_req *neg_req,
+ struct validate_negotiate_info_rsp *neg_rsp)
+{
+ int ret = 0;
+ int dialect;
+
+ dialect = ksmbd_lookup_dialect_by_id(neg_req->Dialects,
+ neg_req->DialectCount);
+ if (dialect == BAD_PROT_ID || dialect != conn->dialect) {
+ ret = -EINVAL;
+ goto err_out;
+ }
+
+ if (strncmp(neg_req->Guid, conn->ClientGUID, SMB2_CLIENT_GUID_SIZE)) {
+ ret = -EINVAL;
+ goto err_out;
+ }
+
+ if (le16_to_cpu(neg_req->SecurityMode) != conn->cli_sec_mode) {
+ ret = -EINVAL;
+ goto err_out;
+ }
+
+ if (le32_to_cpu(neg_req->Capabilities) != conn->cli_cap) {
+ ret = -EINVAL;
+ goto err_out;
+ }
+
+ neg_rsp->Capabilities = cpu_to_le32(conn->vals->capabilities);
+ memset(neg_rsp->Guid, 0, SMB2_CLIENT_GUID_SIZE);
+ neg_rsp->SecurityMode = cpu_to_le16(conn->srv_sec_mode);
+ neg_rsp->Dialect = cpu_to_le16(conn->dialect);
+err_out:
+ return ret;
+}
+
+static int fsctl_query_allocated_ranges(struct ksmbd_work *work, u64 id,
+ struct file_allocated_range_buffer *qar_req,
+ struct file_allocated_range_buffer *qar_rsp,
+ int in_count, int *out_count)
+{
+ struct ksmbd_file *fp;
+ loff_t start, length;
+ int ret = 0;
+
+ *out_count = 0;
+ if (in_count == 0)
+ return -EINVAL;
+
+ fp = ksmbd_lookup_fd_fast(work, id);
+ if (!fp)
+ return -ENOENT;
+
+ start = le64_to_cpu(qar_req->file_offset);
+ length = le64_to_cpu(qar_req->length);
+
+ ret = ksmbd_vfs_fqar_lseek(fp, start, length,
+ qar_rsp, in_count, out_count);
+ if (ret && ret != -E2BIG)
+ *out_count = 0;
+
+ ksmbd_fd_put(work, fp);
+ return ret;
+}
+
+static int fsctl_pipe_transceive(struct ksmbd_work *work, u64 id,
+ int out_buf_len, struct smb2_ioctl_req *req,
+ struct smb2_ioctl_rsp *rsp)
+{
+ struct ksmbd_rpc_command *rpc_resp;
+ char *data_buf = (char *)&req->Buffer[0];
+ int nbytes = 0;
+
+ rpc_resp = ksmbd_rpc_ioctl(work->sess, id, data_buf,
+ le32_to_cpu(req->InputCount));
+ if (rpc_resp) {
+ if (rpc_resp->flags == KSMBD_RPC_SOME_NOT_MAPPED) {
+ /*
+ * set STATUS_SOME_NOT_MAPPED response
+ * for unknown domain sid.
+ */
+ rsp->hdr.Status = STATUS_SOME_NOT_MAPPED;
+ } else if (rpc_resp->flags == KSMBD_RPC_ENOTIMPLEMENTED) {
+ rsp->hdr.Status = STATUS_NOT_SUPPORTED;
+ goto out;
+ } else if (rpc_resp->flags != KSMBD_RPC_OK) {
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ goto out;
+ }
+
+ nbytes = rpc_resp->payload_sz;
+ if (rpc_resp->payload_sz > out_buf_len) {
+ rsp->hdr.Status = STATUS_BUFFER_OVERFLOW;
+ nbytes = out_buf_len;
+ }
+
+ if (!rpc_resp->payload_sz) {
+ rsp->hdr.Status =
+ STATUS_UNEXPECTED_IO_ERROR;
+ goto out;
+ }
+
+ memcpy((char *)rsp->Buffer, rpc_resp->payload, nbytes);
+ }
+out:
+ kvfree(rpc_resp);
+ return nbytes;
+}
+
+static inline int fsctl_set_sparse(struct ksmbd_work *work, u64 id,
+ struct file_sparse *sparse)
+{
+ struct ksmbd_file *fp;
+ struct user_namespace *user_ns;
+ int ret = 0;
+ __le32 old_fattr;
+
+ fp = ksmbd_lookup_fd_fast(work, id);
+ if (!fp)
+ return -ENOENT;
+ user_ns = file_mnt_user_ns(fp->filp);
+
+ old_fattr = fp->f_ci->m_fattr;
+ if (sparse->SetSparse)
+ fp->f_ci->m_fattr |= ATTR_SPARSE_FILE_LE;
+ else
+ fp->f_ci->m_fattr &= ~ATTR_SPARSE_FILE_LE;
+
+ if (fp->f_ci->m_fattr != old_fattr &&
+ test_share_config_flag(work->tcon->share_conf,
+ KSMBD_SHARE_FLAG_STORE_DOS_ATTRS)) {
+ struct xattr_dos_attrib da;
+
+ ret = ksmbd_vfs_get_dos_attrib_xattr(user_ns,
+ fp->filp->f_path.dentry, &da);
+ if (ret <= 0)
+ goto out;
+
+ da.attr = le32_to_cpu(fp->f_ci->m_fattr);
+ ret = ksmbd_vfs_set_dos_attrib_xattr(user_ns,
+ fp->filp->f_path.dentry, &da);
+ if (ret)
+ fp->f_ci->m_fattr = old_fattr;
+ }
+
+out:
+ ksmbd_fd_put(work, fp);
+ return ret;
+}
+
+static int fsctl_request_resume_key(struct ksmbd_work *work,
+ struct smb2_ioctl_req *req,
+ struct resume_key_ioctl_rsp *key_rsp)
+{
+ struct ksmbd_file *fp;
+
+ fp = ksmbd_lookup_fd_slow(work,
+ le64_to_cpu(req->VolatileFileId),
+ le64_to_cpu(req->PersistentFileId));
+ if (!fp)
+ return -ENOENT;
+
+ memset(key_rsp, 0, sizeof(*key_rsp));
+ key_rsp->ResumeKey[0] = req->VolatileFileId;
+ key_rsp->ResumeKey[1] = req->PersistentFileId;
+ ksmbd_fd_put(work, fp);
+
+ return 0;
+}
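
The resume key handed back here is just the source file's two IDs; fsctl_copychunk() above reads the same pair back out of ci_req->ResumeKey[0]/[1] to locate and verify the source file. A minimal sketch of the round trip (the client-side labels are illustrative, not taken from this patch):

/*
 * Resume-key round trip (sketch):
 *
 *   FSCTL_REQUEST_RESUME_KEY on the source handle
 *       -> ResumeKey[0] = source VolatileFileId
 *          ResumeKey[1] = source PersistentFileId
 *
 *   FSCTL_COPYCHUNK / FSCTL_COPYCHUNK_WRITE on the destination handle,
 *   carrying that key as the SourceKey of the request
 *       -> src_fp = ksmbd_lookup_foreign_fd(work, ResumeKey[0])
 *          and ResumeKey[1] must match src_fp->persistent_id
 */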
+
+/**
+ * smb2_ioctl() - handler for smb2 ioctl command
+ * @work: smb work containing ioctl command buffer
+ *
+ * Return: 0 on success, otherwise error
+ */
+int smb2_ioctl(struct ksmbd_work *work)
+{
+ struct smb2_ioctl_req *req;
+ struct smb2_ioctl_rsp *rsp, *rsp_org;
+ int cnt_code, nbytes = 0;
+ int out_buf_len;
+ u64 id = KSMBD_NO_FID;
+ struct ksmbd_conn *conn = work->conn;
+ int ret = 0;
+
+ rsp_org = work->response_buf;
+ if (work->next_smb2_rcv_hdr_off) {
+ req = ksmbd_req_buf_next(work);
+ rsp = ksmbd_resp_buf_next(work);
+ if (!has_file_id(le64_to_cpu(req->VolatileFileId))) {
+ ksmbd_debug(SMB, "Compound request set FID = %llu\n",
+ work->compound_fid);
+ id = work->compound_fid;
+ }
+ } else {
+ req = work->request_buf;
+ rsp = work->response_buf;
+ }
+
+ if (!has_file_id(id))
+ id = le64_to_cpu(req->VolatileFileId);
+
+ if (req->Flags != cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL)) {
+ rsp->hdr.Status = STATUS_NOT_SUPPORTED;
+ goto out;
+ }
+
+ cnt_code = le32_to_cpu(req->CntCode);
+ out_buf_len = le32_to_cpu(req->MaxOutputResponse);
+ out_buf_len = min(KSMBD_IPC_MAX_PAYLOAD, out_buf_len);
+
+ switch (cnt_code) {
+ case FSCTL_DFS_GET_REFERRALS:
+ case FSCTL_DFS_GET_REFERRALS_EX:
+ /* Not support DFS yet */
+ rsp->hdr.Status = STATUS_FS_DRIVER_REQUIRED;
+ goto out;
+ case FSCTL_CREATE_OR_GET_OBJECT_ID:
+ {
+ struct file_object_buf_type1_ioctl_rsp *obj_buf;
+
+ nbytes = sizeof(struct file_object_buf_type1_ioctl_rsp);
+ obj_buf = (struct file_object_buf_type1_ioctl_rsp *)
+ &rsp->Buffer[0];
+
+ /*
+ * TODO: This is dummy implementation to pass smbtorture
+ * Need to check correct response later
+ */
+ memset(obj_buf->ObjectId, 0x0, 16);
+ memset(obj_buf->BirthVolumeId, 0x0, 16);
+ memset(obj_buf->BirthObjectId, 0x0, 16);
+ memset(obj_buf->DomainId, 0x0, 16);
+
+ break;
+ }
+ case FSCTL_PIPE_TRANSCEIVE:
+ nbytes = fsctl_pipe_transceive(work, id, out_buf_len, req, rsp);
+ break;
+ case FSCTL_VALIDATE_NEGOTIATE_INFO:
+ if (conn->dialect < SMB30_PROT_ID) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ ret = fsctl_validate_negotiate_info(conn,
+ (struct validate_negotiate_info_req *)&req->Buffer[0],
+ (struct validate_negotiate_info_rsp *)&rsp->Buffer[0]);
+ if (ret < 0)
+ goto out;
+
+ nbytes = sizeof(struct validate_negotiate_info_rsp);
+ rsp->PersistentFileId = cpu_to_le64(SMB2_NO_FID);
+ rsp->VolatileFileId = cpu_to_le64(SMB2_NO_FID);
+ break;
+ case FSCTL_QUERY_NETWORK_INTERFACE_INFO:
+ nbytes = fsctl_query_iface_info_ioctl(conn, req, rsp);
+ if (nbytes < 0)
+ goto out;
+ break;
+ case FSCTL_REQUEST_RESUME_KEY:
+ if (out_buf_len < sizeof(struct resume_key_ioctl_rsp)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = fsctl_request_resume_key(work, req,
+ (struct resume_key_ioctl_rsp *)&rsp->Buffer[0]);
+ if (ret < 0)
+ goto out;
+ rsp->PersistentFileId = req->PersistentFileId;
+ rsp->VolatileFileId = req->VolatileFileId;
+ nbytes = sizeof(struct resume_key_ioctl_rsp);
+ break;
+ case FSCTL_COPYCHUNK:
+ case FSCTL_COPYCHUNK_WRITE:
+ if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) {
+ ksmbd_debug(SMB,
+ "User does not have write permission\n");
+ ret = -EACCES;
+ goto out;
+ }
+
+ if (out_buf_len < sizeof(struct copychunk_ioctl_rsp)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ nbytes = sizeof(struct copychunk_ioctl_rsp);
+ fsctl_copychunk(work, req, rsp);
+ break;
+ case FSCTL_SET_SPARSE:
+ ret = fsctl_set_sparse(work, id,
+ (struct file_sparse *)&req->Buffer[0]);
+ if (ret < 0)
+ goto out;
+ break;
+ case FSCTL_SET_ZERO_DATA:
+ {
+ struct file_zero_data_information *zero_data;
+ struct ksmbd_file *fp;
+ loff_t off, len;
+
+ if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) {
+ ksmbd_debug(SMB,
+ "User does not have write permission\n");
+ ret = -EACCES;
+ goto out;
+ }
+
+ zero_data =
+ (struct file_zero_data_information *)&req->Buffer[0];
+
+ fp = ksmbd_lookup_fd_fast(work, id);
+ if (!fp) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ off = le64_to_cpu(zero_data->FileOffset);
+ len = le64_to_cpu(zero_data->BeyondFinalZero) - off;
+
+ ret = ksmbd_vfs_zero_data(work, fp, off, len);
+ ksmbd_fd_put(work, fp);
+ if (ret < 0)
+ goto out;
+ break;
+ }
+ case FSCTL_QUERY_ALLOCATED_RANGES:
+ ret = fsctl_query_allocated_ranges(work, id,
+ (struct file_allocated_range_buffer *)&req->Buffer[0],
+ (struct file_allocated_range_buffer *)&rsp->Buffer[0],
+ out_buf_len /
+ sizeof(struct file_allocated_range_buffer), &nbytes);
+ if (ret == -E2BIG) {
+ rsp->hdr.Status = STATUS_BUFFER_OVERFLOW;
+ } else if (ret < 0) {
+ nbytes = 0;
+ goto out;
+ }
+
+ nbytes *= sizeof(struct file_allocated_range_buffer);
+ break;
+ case FSCTL_GET_REPARSE_POINT:
+ {
+ struct reparse_data_buffer *reparse_ptr;
+ struct ksmbd_file *fp;
+
+ reparse_ptr = (struct reparse_data_buffer *)&rsp->Buffer[0];
+ fp = ksmbd_lookup_fd_fast(work, id);
+ if (!fp) {
+ pr_err("not found fp!!\n");
+ ret = -ENOENT;
+ goto out;
+ }
+
+ reparse_ptr->ReparseTag =
+ smb2_get_reparse_tag_special_file(file_inode(fp->filp)->i_mode);
+ reparse_ptr->ReparseDataLength = 0;
+ ksmbd_fd_put(work, fp);
+ nbytes = sizeof(struct reparse_data_buffer);
+ break;
+ }
+ case FSCTL_DUPLICATE_EXTENTS_TO_FILE:
+ {
+ struct ksmbd_file *fp_in, *fp_out = NULL;
+ struct duplicate_extents_to_file *dup_ext;
+ loff_t src_off, dst_off, length, cloned;
+
+ dup_ext = (struct duplicate_extents_to_file *)&req->Buffer[0];
+
+ fp_in = ksmbd_lookup_fd_slow(work, dup_ext->VolatileFileHandle,
+ dup_ext->PersistentFileHandle);
+ if (!fp_in) {
+ pr_err("not found file handle in duplicate extent to file\n");
+ ret = -ENOENT;
+ goto out;
+ }
+
+ fp_out = ksmbd_lookup_fd_fast(work, id);
+ if (!fp_out) {
+ pr_err("not found fp\n");
+ ret = -ENOENT;
+ goto dup_ext_out;
+ }
+
+ src_off = le64_to_cpu(dup_ext->SourceFileOffset);
+ dst_off = le64_to_cpu(dup_ext->TargetFileOffset);
+ length = le64_to_cpu(dup_ext->ByteCount);
+ cloned = vfs_clone_file_range(fp_in->filp, src_off, fp_out->filp,
+ dst_off, length, 0);
+ if (cloned == -EXDEV || cloned == -EOPNOTSUPP) {
+ ret = -EOPNOTSUPP;
+ goto dup_ext_out;
+ } else if (cloned != length) {
+ cloned = vfs_copy_file_range(fp_in->filp, src_off,
+ fp_out->filp, dst_off, length, 0);
+ if (cloned != length) {
+ if (cloned < 0)
+ ret = cloned;
+ else
+ ret = -EINVAL;
+ }
+ }
+
+dup_ext_out:
+ ksmbd_fd_put(work, fp_in);
+ ksmbd_fd_put(work, fp_out);
+ if (ret < 0)
+ goto out;
+ break;
+ }
+ default:
+ ksmbd_debug(SMB, "not implemented yet ioctl command 0x%x\n",
+ cnt_code);
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ rsp->CntCode = cpu_to_le32(cnt_code);
+ rsp->InputCount = cpu_to_le32(0);
+ rsp->InputOffset = cpu_to_le32(112);
+ rsp->OutputOffset = cpu_to_le32(112);
+ rsp->OutputCount = cpu_to_le32(nbytes);
+ rsp->StructureSize = cpu_to_le16(49);
+ rsp->Reserved = cpu_to_le16(0);
+ rsp->Flags = cpu_to_le32(0);
+ rsp->Reserved2 = cpu_to_le32(0);
+ inc_rfc1001_len(rsp_org, 48 + nbytes);
+
+ return 0;
+
+out:
+ if (ret == -EACCES)
+ rsp->hdr.Status = STATUS_ACCESS_DENIED;
+ else if (ret == -ENOENT)
+ rsp->hdr.Status = STATUS_OBJECT_NAME_NOT_FOUND;
+ else if (ret == -EOPNOTSUPP)
+ rsp->hdr.Status = STATUS_NOT_SUPPORTED;
+ else if (ret < 0 || rsp->hdr.Status == 0)
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ smb2_set_err_rsp(work);
+ return 0;
+}
+
+/**
+ * smb20_oplock_break_ack() - handler for smb2.0 oplock break command
+ * @work: smb work containing oplock break command buffer
+ */
+static void smb20_oplock_break_ack(struct ksmbd_work *work)
+{
+ struct smb2_oplock_break *req = work->request_buf;
+ struct smb2_oplock_break *rsp = work->response_buf;
+ struct ksmbd_file *fp;
+ struct oplock_info *opinfo = NULL;
+ __le32 err = 0;
+ int ret = 0;
+ u64 volatile_id, persistent_id;
+ char req_oplevel = 0, rsp_oplevel = 0;
+ unsigned int oplock_change_type;
+
+ volatile_id = le64_to_cpu(req->VolatileFid);
+ persistent_id = le64_to_cpu(req->PersistentFid);
+ req_oplevel = req->OplockLevel;
+ ksmbd_debug(OPLOCK, "v_id %llu, p_id %llu request oplock level %d\n",
+ volatile_id, persistent_id, req_oplevel);
+
+ fp = ksmbd_lookup_fd_slow(work, volatile_id, persistent_id);
+ if (!fp) {
+ rsp->hdr.Status = STATUS_FILE_CLOSED;
+ smb2_set_err_rsp(work);
+ return;
+ }
+
+ opinfo = opinfo_get(fp);
+ if (!opinfo) {
+ pr_err("unexpected null oplock_info\n");
+ rsp->hdr.Status = STATUS_INVALID_OPLOCK_PROTOCOL;
+ smb2_set_err_rsp(work);
+ ksmbd_fd_put(work, fp);
+ return;
+ }
+
+ if (opinfo->level == SMB2_OPLOCK_LEVEL_NONE) {
+ rsp->hdr.Status = STATUS_INVALID_OPLOCK_PROTOCOL;
+ goto err_out;
+ }
+
+ if (opinfo->op_state == OPLOCK_STATE_NONE) {
+ ksmbd_debug(SMB, "unexpected oplock state 0x%x\n", opinfo->op_state);
+ rsp->hdr.Status = STATUS_UNSUCCESSFUL;
+ goto err_out;
+ }
+
+ if ((opinfo->level == SMB2_OPLOCK_LEVEL_EXCLUSIVE ||
+ opinfo->level == SMB2_OPLOCK_LEVEL_BATCH) &&
+ (req_oplevel != SMB2_OPLOCK_LEVEL_II &&
+ req_oplevel != SMB2_OPLOCK_LEVEL_NONE)) {
+ err = STATUS_INVALID_OPLOCK_PROTOCOL;
+ oplock_change_type = OPLOCK_WRITE_TO_NONE;
+ } else if (opinfo->level == SMB2_OPLOCK_LEVEL_II &&
+ req_oplevel != SMB2_OPLOCK_LEVEL_NONE) {
+ err = STATUS_INVALID_OPLOCK_PROTOCOL;
+ oplock_change_type = OPLOCK_READ_TO_NONE;
+ } else if (req_oplevel == SMB2_OPLOCK_LEVEL_II ||
+ req_oplevel == SMB2_OPLOCK_LEVEL_NONE) {
+ err = STATUS_INVALID_DEVICE_STATE;
+ if ((opinfo->level == SMB2_OPLOCK_LEVEL_EXCLUSIVE ||
+ opinfo->level == SMB2_OPLOCK_LEVEL_BATCH) &&
+ req_oplevel == SMB2_OPLOCK_LEVEL_II) {
+ oplock_change_type = OPLOCK_WRITE_TO_READ;
+ } else if ((opinfo->level == SMB2_OPLOCK_LEVEL_EXCLUSIVE ||
+ opinfo->level == SMB2_OPLOCK_LEVEL_BATCH) &&
+ req_oplevel == SMB2_OPLOCK_LEVEL_NONE) {
+ oplock_change_type = OPLOCK_WRITE_TO_NONE;
+ } else if (opinfo->level == SMB2_OPLOCK_LEVEL_II &&
+ req_oplevel == SMB2_OPLOCK_LEVEL_NONE) {
+ oplock_change_type = OPLOCK_READ_TO_NONE;
+ } else {
+ oplock_change_type = 0;
+ }
+ } else {
+ oplock_change_type = 0;
+ }
+
+ switch (oplock_change_type) {
+ case OPLOCK_WRITE_TO_READ:
+ ret = opinfo_write_to_read(opinfo);
+ rsp_oplevel = SMB2_OPLOCK_LEVEL_II;
+ break;
+ case OPLOCK_WRITE_TO_NONE:
+ ret = opinfo_write_to_none(opinfo);
+ rsp_oplevel = SMB2_OPLOCK_LEVEL_NONE;
+ break;
+ case OPLOCK_READ_TO_NONE:
+ ret = opinfo_read_to_none(opinfo);
+ rsp_oplevel = SMB2_OPLOCK_LEVEL_NONE;
+ break;
+ default:
+ pr_err("unknown oplock change 0x%x -> 0x%x\n",
+ opinfo->level, rsp_oplevel);
+ }
+
+ if (ret < 0) {
+ rsp->hdr.Status = err;
+ goto err_out;
+ }
+
+	opinfo->op_state = OPLOCK_STATE_NONE;
+	wake_up_interruptible_all(&opinfo->oplock_q);
+	opinfo_put(opinfo);
+	ksmbd_fd_put(work, fp);
+
+ rsp->StructureSize = cpu_to_le16(24);
+ rsp->OplockLevel = rsp_oplevel;
+ rsp->Reserved = 0;
+ rsp->Reserved2 = 0;
+ rsp->VolatileFid = cpu_to_le64(volatile_id);
+ rsp->PersistentFid = cpu_to_le64(persistent_id);
+ inc_rfc1001_len(rsp, 24);
+ return;
+
+err_out:
+ opinfo->op_state = OPLOCK_STATE_NONE;
+ wake_up_interruptible_all(&opinfo->oplock_q);
+
+ opinfo_put(opinfo);
+ ksmbd_fd_put(work, fp);
+ smb2_set_err_rsp(work);
+}
+
+static int check_lease_state(struct lease *lease, __le32 req_state)
+{
+ if ((lease->new_state ==
+ (SMB2_LEASE_READ_CACHING_LE | SMB2_LEASE_HANDLE_CACHING_LE)) &&
+ !(req_state & SMB2_LEASE_WRITE_CACHING_LE)) {
+ lease->new_state = req_state;
+ return 0;
+ }
+
+ if (lease->new_state == req_state)
+ return 0;
+
+ return 1;
+}
+
+/**
+ * smb21_lease_break_ack() - handler for smb2.1 lease break command
+ * @work: smb work containing lease break command buffer
+ */
+static void smb21_lease_break_ack(struct ksmbd_work *work)
+{
+ struct ksmbd_conn *conn = work->conn;
+ struct smb2_lease_ack *req = work->request_buf;
+ struct smb2_lease_ack *rsp = work->response_buf;
+ struct oplock_info *opinfo;
+ __le32 err = 0;
+ int ret = 0;
+ unsigned int lease_change_type;
+ __le32 lease_state;
+ struct lease *lease;
+
+ ksmbd_debug(OPLOCK, "smb21 lease break, lease state(0x%x)\n",
+ le32_to_cpu(req->LeaseState));
+ opinfo = lookup_lease_in_table(conn, req->LeaseKey);
+ if (!opinfo) {
+ ksmbd_debug(OPLOCK, "file not opened\n");
+ smb2_set_err_rsp(work);
+ rsp->hdr.Status = STATUS_UNSUCCESSFUL;
+ return;
+ }
+ lease = opinfo->o_lease;
+
+ if (opinfo->op_state == OPLOCK_STATE_NONE) {
+ pr_err("unexpected lease break state 0x%x\n",
+ opinfo->op_state);
+ rsp->hdr.Status = STATUS_UNSUCCESSFUL;
+ goto err_out;
+ }
+
+ if (check_lease_state(lease, req->LeaseState)) {
+ rsp->hdr.Status = STATUS_REQUEST_NOT_ACCEPTED;
+ ksmbd_debug(OPLOCK,
+ "req lease state: 0x%x, expected state: 0x%x\n",
+ req->LeaseState, lease->new_state);
+ goto err_out;
+ }
+
+ if (!atomic_read(&opinfo->breaking_cnt)) {
+ rsp->hdr.Status = STATUS_UNSUCCESSFUL;
+ goto err_out;
+ }
+
+ /* check for bad lease state */
+ if (req->LeaseState &
+ (~(SMB2_LEASE_READ_CACHING_LE | SMB2_LEASE_HANDLE_CACHING_LE))) {
+ err = STATUS_INVALID_OPLOCK_PROTOCOL;
+ if (lease->state & SMB2_LEASE_WRITE_CACHING_LE)
+ lease_change_type = OPLOCK_WRITE_TO_NONE;
+ else
+ lease_change_type = OPLOCK_READ_TO_NONE;
+ ksmbd_debug(OPLOCK, "handle bad lease state 0x%x -> 0x%x\n",
+ le32_to_cpu(lease->state),
+ le32_to_cpu(req->LeaseState));
+ } else if (lease->state == SMB2_LEASE_READ_CACHING_LE &&
+ req->LeaseState != SMB2_LEASE_NONE_LE) {
+ err = STATUS_INVALID_OPLOCK_PROTOCOL;
+ lease_change_type = OPLOCK_READ_TO_NONE;
+ ksmbd_debug(OPLOCK, "handle bad lease state 0x%x -> 0x%x\n",
+ le32_to_cpu(lease->state),
+ le32_to_cpu(req->LeaseState));
+ } else {
+ /* valid lease state changes */
+ err = STATUS_INVALID_DEVICE_STATE;
+ if (req->LeaseState == SMB2_LEASE_NONE_LE) {
+ if (lease->state & SMB2_LEASE_WRITE_CACHING_LE)
+ lease_change_type = OPLOCK_WRITE_TO_NONE;
+ else
+ lease_change_type = OPLOCK_READ_TO_NONE;
+ } else if (req->LeaseState & SMB2_LEASE_READ_CACHING_LE) {
+ if (lease->state & SMB2_LEASE_WRITE_CACHING_LE)
+ lease_change_type = OPLOCK_WRITE_TO_READ;
+ else
+ lease_change_type = OPLOCK_READ_HANDLE_TO_READ;
+ } else {
+ lease_change_type = 0;
+ }
+ }
+
+ switch (lease_change_type) {
+ case OPLOCK_WRITE_TO_READ:
+ ret = opinfo_write_to_read(opinfo);
+ break;
+ case OPLOCK_READ_HANDLE_TO_READ:
+ ret = opinfo_read_handle_to_read(opinfo);
+ break;
+ case OPLOCK_WRITE_TO_NONE:
+ ret = opinfo_write_to_none(opinfo);
+ break;
+ case OPLOCK_READ_TO_NONE:
+ ret = opinfo_read_to_none(opinfo);
+ break;
+ default:
+ ksmbd_debug(OPLOCK, "unknown lease change 0x%x -> 0x%x\n",
+ le32_to_cpu(lease->state),
+ le32_to_cpu(req->LeaseState));
+ }
+
+ lease_state = lease->state;
+ opinfo->op_state = OPLOCK_STATE_NONE;
+ wake_up_interruptible_all(&opinfo->oplock_q);
+ atomic_dec(&opinfo->breaking_cnt);
+ wake_up_interruptible_all(&opinfo->oplock_brk);
+ opinfo_put(opinfo);
+
+ if (ret < 0) {
+ rsp->hdr.Status = err;
+ goto err_out;
+ }
+
+ rsp->StructureSize = cpu_to_le16(36);
+ rsp->Reserved = 0;
+ rsp->Flags = 0;
+ memcpy(rsp->LeaseKey, req->LeaseKey, 16);
+ rsp->LeaseState = lease_state;
+ rsp->LeaseDuration = 0;
+ inc_rfc1001_len(rsp, 36);
+ return;
+
+err_out:
+ opinfo->op_state = OPLOCK_STATE_NONE;
+ wake_up_interruptible_all(&opinfo->oplock_q);
+ atomic_dec(&opinfo->breaking_cnt);
+ wake_up_interruptible_all(&opinfo->oplock_brk);
+
+ opinfo_put(opinfo);
+ smb2_set_err_rsp(work);
+}
+
+/**
+ * smb2_oplock_break() - dispatcher for smb2.0 and 2.1 oplock/lease break
+ * @work: smb work containing oplock/lease break command buffer
+ *
+ * Return: 0
+ */
+int smb2_oplock_break(struct ksmbd_work *work)
+{
+ struct smb2_oplock_break *req = work->request_buf;
+ struct smb2_oplock_break *rsp = work->response_buf;
+
+ switch (le16_to_cpu(req->StructureSize)) {
+ case OP_BREAK_STRUCT_SIZE_20:
+ smb20_oplock_break_ack(work);
+ break;
+ case OP_BREAK_STRUCT_SIZE_21:
+ smb21_lease_break_ack(work);
+ break;
+ default:
+ ksmbd_debug(OPLOCK, "invalid break cmd %d\n",
+ le16_to_cpu(req->StructureSize));
+ rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ smb2_set_err_rsp(work);
+ }
+
+ return 0;
+}
+
+/**
+ * smb2_notify() - handler for smb2 notify request
+ * @work: smb work containing notify command buffer
+ *
+ * Return: 0
+ */
+int smb2_notify(struct ksmbd_work *work)
+{
+ struct smb2_notify_req *req;
+ struct smb2_notify_rsp *rsp;
+
+ WORK_BUFFERS(work, req, rsp);
+
+ if (work->next_smb2_rcv_hdr_off && req->hdr.NextCommand) {
+ rsp->hdr.Status = STATUS_INTERNAL_ERROR;
+ smb2_set_err_rsp(work);
+ return 0;
+ }
+
+ smb2_set_err_rsp(work);
+ rsp->hdr.Status = STATUS_NOT_IMPLEMENTED;
+ return 0;
+}
+
+/**
+ * smb2_is_sign_req() - handler for checking packet signing status
+ * @work: smb work containing notify command buffer
+ * @command: SMB2 command id
+ *
+ * Return: true if packet is signed, false otherwise
+ */
+bool smb2_is_sign_req(struct ksmbd_work *work, unsigned int command)
+{
+ struct smb2_hdr *rcv_hdr2 = work->request_buf;
+
+ if ((rcv_hdr2->Flags & SMB2_FLAGS_SIGNED) &&
+ command != SMB2_NEGOTIATE_HE &&
+ command != SMB2_SESSION_SETUP_HE &&
+ command != SMB2_OPLOCK_BREAK_HE)
+ return true;
+
+ return false;
+}
+
+/**
+ * smb2_check_sign_req() - handler for req packet sign processing
+ * @work: smb work containing notify command buffer
+ *
+ * Return: 1 on success, 0 otherwise
+ */
+int smb2_check_sign_req(struct ksmbd_work *work)
+{
+ struct smb2_hdr *hdr, *hdr_org;
+ char signature_req[SMB2_SIGNATURE_SIZE];
+ char signature[SMB2_HMACSHA256_SIZE];
+ struct kvec iov[1];
+ size_t len;
+
+ hdr_org = hdr = work->request_buf;
+ if (work->next_smb2_rcv_hdr_off)
+ hdr = ksmbd_req_buf_next(work);
+
+ if (!hdr->NextCommand && !work->next_smb2_rcv_hdr_off)
+ len = be32_to_cpu(hdr_org->smb2_buf_length);
+ else if (hdr->NextCommand)
+ len = le32_to_cpu(hdr->NextCommand);
+ else
+ len = be32_to_cpu(hdr_org->smb2_buf_length) -
+ work->next_smb2_rcv_hdr_off;
+
+ memcpy(signature_req, hdr->Signature, SMB2_SIGNATURE_SIZE);
+ memset(hdr->Signature, 0, SMB2_SIGNATURE_SIZE);
+
+ iov[0].iov_base = (char *)&hdr->ProtocolId;
+ iov[0].iov_len = len;
+
+ if (ksmbd_sign_smb2_pdu(work->conn, work->sess->sess_key, iov, 1,
+ signature))
+ return 0;
+
+ if (memcmp(signature, signature_req, SMB2_SIGNATURE_SIZE)) {
+ pr_err("bad smb2 signature\n");
+ return 0;
+ }
+
+ return 1;
+}
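
For pre-3.0 dialects the signature checked above is an HMAC-SHA256 over the whole PDU, starting at ProtocolId with the 16-byte Signature field zeroed, truncated to 16 bytes and keyed with the session key. A minimal user-space sketch under those assumptions; hmac_sha256() is a hypothetical helper standing in for ksmbd_sign_smb2_pdu(), not an existing API:

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

/* hypothetical: HMAC-SHA256(key, data[len]) -> 32-byte digest */
void hmac_sha256(const unsigned char *key, const unsigned char *data,
		 size_t len, unsigned char *digest);

/* pdu points at ProtocolId, len is the PDU length used by smb2_check_sign_req() */
static bool verify_smb2_signature(unsigned char *pdu, size_t len,
				  const unsigned char *sess_key)
{
	unsigned char expected[16], digest[32];

	memcpy(expected, pdu + 48, 16);	/* Signature sits at offset 48 of the header */
	memset(pdu + 48, 0, 16);	/* zero it before hashing, as the handler does */
	hmac_sha256(sess_key, pdu, len, digest);
	memcpy(pdu + 48, expected, 16);	/* restore the original bytes */

	return memcmp(digest, expected, 16) == 0;	/* compare the 16-byte truncation */
}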
+
+/**
+ * smb2_set_sign_rsp() - handler for rsp packet sign processing
+ * @work: smb work containing notify command buffer
+ *
+ */
+void smb2_set_sign_rsp(struct ksmbd_work *work)
+{
+ struct smb2_hdr *hdr, *hdr_org;
+ struct smb2_hdr *req_hdr;
+ char signature[SMB2_HMACSHA256_SIZE];
+ struct kvec iov[2];
+ size_t len;
+ int n_vec = 1;
+
+ hdr_org = hdr = work->response_buf;
+ if (work->next_smb2_rsp_hdr_off)
+ hdr = ksmbd_resp_buf_next(work);
+
+ req_hdr = ksmbd_req_buf_next(work);
+
+ if (!work->next_smb2_rsp_hdr_off) {
+ len = get_rfc1002_len(hdr_org);
+ if (req_hdr->NextCommand)
+ len = ALIGN(len, 8);
+ } else {
+ len = get_rfc1002_len(hdr_org) - work->next_smb2_rsp_hdr_off;
+ len = ALIGN(len, 8);
+ }
+
+ if (req_hdr->NextCommand)
+ hdr->NextCommand = cpu_to_le32(len);
+
+ hdr->Flags |= SMB2_FLAGS_SIGNED;
+ memset(hdr->Signature, 0, SMB2_SIGNATURE_SIZE);
+
+ iov[0].iov_base = (char *)&hdr->ProtocolId;
+ iov[0].iov_len = len;
+
+ if (work->aux_payload_sz) {
+ iov[0].iov_len -= work->aux_payload_sz;
+
+ iov[1].iov_base = work->aux_payload_buf;
+ iov[1].iov_len = work->aux_payload_sz;
+ n_vec++;
+ }
+
+ if (!ksmbd_sign_smb2_pdu(work->conn, work->sess->sess_key, iov, n_vec,
+ signature))
+ memcpy(hdr->Signature, signature, SMB2_SIGNATURE_SIZE);
+}
+
+/**
+ * smb3_check_sign_req() - handler for req packet sign processing
+ * @work: smb work containing notify command buffer
+ *
+ * Return: 1 on success, 0 otherwise
+ */
+int smb3_check_sign_req(struct ksmbd_work *work)
+{
+ struct ksmbd_conn *conn = work->conn;
+ char *signing_key;
+ struct smb2_hdr *hdr, *hdr_org;
+ struct channel *chann;
+ char signature_req[SMB2_SIGNATURE_SIZE];
+ char signature[SMB2_CMACAES_SIZE];
+ struct kvec iov[1];
+ size_t len;
+
+ hdr_org = hdr = work->request_buf;
+ if (work->next_smb2_rcv_hdr_off)
+ hdr = ksmbd_req_buf_next(work);
+
+ if (!hdr->NextCommand && !work->next_smb2_rcv_hdr_off)
+ len = be32_to_cpu(hdr_org->smb2_buf_length);
+ else if (hdr->NextCommand)
+ len = le32_to_cpu(hdr->NextCommand);
+ else
+ len = be32_to_cpu(hdr_org->smb2_buf_length) -
+ work->next_smb2_rcv_hdr_off;
+
+ if (le16_to_cpu(hdr->Command) == SMB2_SESSION_SETUP_HE) {
+ signing_key = work->sess->smb3signingkey;
+ } else {
+ chann = lookup_chann_list(work->sess, conn);
+ if (!chann)
+ return 0;
+ signing_key = chann->smb3signingkey;
+ }
+
+ if (!signing_key) {
+ pr_err("SMB3 signing key is not generated\n");
+ return 0;
+ }
+
+ memcpy(signature_req, hdr->Signature, SMB2_SIGNATURE_SIZE);
+ memset(hdr->Signature, 0, SMB2_SIGNATURE_SIZE);
+ iov[0].iov_base = (char *)&hdr->ProtocolId;
+ iov[0].iov_len = len;
+
+ if (ksmbd_sign_smb3_pdu(conn, signing_key, iov, 1, signature))
+ return 0;
+
+ if (memcmp(signature, signature_req, SMB2_SIGNATURE_SIZE)) {
+ pr_err("bad smb2 signature\n");
+ return 0;
+ }
+
+ return 1;
+}
+
+/**
+ * smb3_set_sign_rsp() - handler for rsp packet sign processing
+ * @work: smb work containing notify command buffer
+ *
+ */
+void smb3_set_sign_rsp(struct ksmbd_work *work)
+{
+ struct ksmbd_conn *conn = work->conn;
+ struct smb2_hdr *req_hdr;
+ struct smb2_hdr *hdr, *hdr_org;
+ struct channel *chann;
+ char signature[SMB2_CMACAES_SIZE];
+ struct kvec iov[2];
+ int n_vec = 1;
+ size_t len;
+ char *signing_key;
+
+ hdr_org = hdr = work->response_buf;
+ if (work->next_smb2_rsp_hdr_off)
+ hdr = ksmbd_resp_buf_next(work);
+
+ req_hdr = ksmbd_req_buf_next(work);
+
+ if (!work->next_smb2_rsp_hdr_off) {
+ len = get_rfc1002_len(hdr_org);
+ if (req_hdr->NextCommand)
+ len = ALIGN(len, 8);
+ } else {
+ len = get_rfc1002_len(hdr_org) - work->next_smb2_rsp_hdr_off;
+ len = ALIGN(len, 8);
+ }
+
+ if (conn->binding == false &&
+ le16_to_cpu(hdr->Command) == SMB2_SESSION_SETUP_HE) {
+ signing_key = work->sess->smb3signingkey;
+ } else {
+ chann = lookup_chann_list(work->sess, work->conn);
+ if (!chann)
+ return;
+ signing_key = chann->smb3signingkey;
+ }
+
+ if (!signing_key)
+ return;
+
+ if (req_hdr->NextCommand)
+ hdr->NextCommand = cpu_to_le32(len);
+
+ hdr->Flags |= SMB2_FLAGS_SIGNED;
+ memset(hdr->Signature, 0, SMB2_SIGNATURE_SIZE);
+ iov[0].iov_base = (char *)&hdr->ProtocolId;
+ iov[0].iov_len = len;
+ if (work->aux_payload_sz) {
+ iov[0].iov_len -= work->aux_payload_sz;
+ iov[1].iov_base = work->aux_payload_buf;
+ iov[1].iov_len = work->aux_payload_sz;
+ n_vec++;
+ }
+
+ if (!ksmbd_sign_smb3_pdu(conn, signing_key, iov, n_vec, signature))
+ memcpy(hdr->Signature, signature, SMB2_SIGNATURE_SIZE);
+}
+
+/**
+ * smb3_preauth_hash_rsp() - handler for computing preauth hash on response
+ * @work: smb work containing response buffer
+ *
+ */
+void smb3_preauth_hash_rsp(struct ksmbd_work *work)
+{
+ struct ksmbd_conn *conn = work->conn;
+ struct ksmbd_session *sess = work->sess;
+ struct smb2_hdr *req, *rsp;
+
+ if (conn->dialect != SMB311_PROT_ID)
+ return;
+
+ WORK_BUFFERS(work, req, rsp);
+
+ if (le16_to_cpu(req->Command) == SMB2_NEGOTIATE_HE)
+ ksmbd_gen_preauth_integrity_hash(conn, (char *)rsp,
+ conn->preauth_info->Preauth_HashValue);
+
+ if (le16_to_cpu(rsp->Command) == SMB2_SESSION_SETUP_HE && sess) {
+ __u8 *hash_value;
+
+ if (conn->binding) {
+ struct preauth_session *preauth_sess;
+
+ preauth_sess = ksmbd_preauth_session_lookup(conn, sess->id);
+ if (!preauth_sess)
+ return;
+ hash_value = preauth_sess->Preauth_HashValue;
+ } else {
+ hash_value = sess->Preauth_HashValue;
+ if (!hash_value)
+ return;
+ }
+ ksmbd_gen_preauth_integrity_hash(conn, (char *)rsp,
+ hash_value);
+ }
+}
+
+static void fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, char *old_buf,
+ __le16 cipher_type)
+{
+ struct smb2_hdr *hdr = (struct smb2_hdr *)old_buf;
+ unsigned int orig_len = get_rfc1002_len(old_buf);
+
+ memset(tr_hdr, 0, sizeof(struct smb2_transform_hdr));
+ tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM;
+ tr_hdr->OriginalMessageSize = cpu_to_le32(orig_len);
+ tr_hdr->Flags = cpu_to_le16(0x01);
+ if (cipher_type == SMB2_ENCRYPTION_AES128_GCM ||
+ cipher_type == SMB2_ENCRYPTION_AES256_GCM)
+ get_random_bytes(&tr_hdr->Nonce, SMB3_AES_GCM_NONCE);
+ else
+ get_random_bytes(&tr_hdr->Nonce, SMB3_AES_CCM_NONCE);
+ memcpy(&tr_hdr->SessionId, &hdr->SessionId, 8);
+ inc_rfc1001_len(tr_hdr, sizeof(struct smb2_transform_hdr) - 4);
+ inc_rfc1001_len(tr_hdr, orig_len);
+}
+
+int smb3_encrypt_resp(struct ksmbd_work *work)
+{
+ char *buf = work->response_buf;
+ struct smb2_transform_hdr *tr_hdr;
+ struct kvec iov[3];
+ int rc = -ENOMEM;
+ int buf_size = 0, rq_nvec = 2 + (work->aux_payload_sz ? 1 : 0);
+
+ if (ARRAY_SIZE(iov) < rq_nvec)
+ return -ENOMEM;
+
+ tr_hdr = kzalloc(sizeof(struct smb2_transform_hdr), GFP_KERNEL);
+ if (!tr_hdr)
+ return rc;
+
+ /* fill transform header */
+ fill_transform_hdr(tr_hdr, buf, work->conn->cipher_type);
+
+ iov[0].iov_base = tr_hdr;
+ iov[0].iov_len = sizeof(struct smb2_transform_hdr);
+ buf_size += iov[0].iov_len - 4;
+
+ iov[1].iov_base = buf + 4;
+ iov[1].iov_len = get_rfc1002_len(buf);
+ if (work->aux_payload_sz) {
+ iov[1].iov_len = work->resp_hdr_sz - 4;
+
+ iov[2].iov_base = work->aux_payload_buf;
+ iov[2].iov_len = work->aux_payload_sz;
+ buf_size += iov[2].iov_len;
+ }
+ buf_size += iov[1].iov_len;
+ work->resp_hdr_sz = iov[1].iov_len;
+
+ rc = ksmbd_crypt_message(work->conn, iov, rq_nvec, 1);
+ if (rc)
+ return rc;
+
+ memmove(buf, iov[1].iov_base, iov[1].iov_len);
+ tr_hdr->smb2_buf_length = cpu_to_be32(buf_size);
+ work->tr_buf = tr_hdr;
+
+ return rc;
+}
+
+bool smb3_is_transform_hdr(void *buf)
+{
+ struct smb2_transform_hdr *trhdr = buf;
+
+ return trhdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM;
+}
+
+int smb3_decrypt_req(struct ksmbd_work *work)
+{
+ struct ksmbd_conn *conn = work->conn;
+ struct ksmbd_session *sess;
+ char *buf = work->request_buf;
+ struct smb2_hdr *hdr;
+ unsigned int pdu_length = get_rfc1002_len(buf);
+ struct kvec iov[2];
+ unsigned int buf_data_size = pdu_length + 4 -
+ sizeof(struct smb2_transform_hdr);
+ struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf;
+ unsigned int orig_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
+ int rc = 0;
+
+ sess = ksmbd_session_lookup_all(conn, le64_to_cpu(tr_hdr->SessionId));
+ if (!sess) {
+ pr_err("invalid session id(%llx) in transform header\n",
+ le64_to_cpu(tr_hdr->SessionId));
+ return -ECONNABORTED;
+ }
+
+ if (pdu_length + 4 <
+ sizeof(struct smb2_transform_hdr) + sizeof(struct smb2_hdr)) {
+ pr_err("Transform message is too small (%u)\n",
+ pdu_length);
+ return -ECONNABORTED;
+ }
+
+ if (pdu_length + 4 < orig_len + sizeof(struct smb2_transform_hdr)) {
+ pr_err("Transform message is broken\n");
+ return -ECONNABORTED;
+ }
+
+ iov[0].iov_base = buf;
+ iov[0].iov_len = sizeof(struct smb2_transform_hdr);
+ iov[1].iov_base = buf + sizeof(struct smb2_transform_hdr);
+ iov[1].iov_len = buf_data_size;
+ rc = ksmbd_crypt_message(conn, iov, 2, 0);
+ if (rc)
+ return rc;
+
+ memmove(buf + 4, iov[1].iov_base, buf_data_size);
+ hdr = (struct smb2_hdr *)buf;
+ hdr->smb2_buf_length = cpu_to_be32(buf_data_size);
+
+ return rc;
+}
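
The pdu_length/buf_data_size arithmetic above is easy to misread because get_rfc1002_len() excludes the 4-byte length prefix while sizeof(struct smb2_transform_hdr) in this patch includes it. A worked example, assuming a 4096-byte encrypted SMB2 PDU (the payload size is made up for illustration):

/*
 * Worked example (structure sizes as declared in smb2pdu.h below):
 *   sizeof(struct smb2_transform_hdr) = 4 (length prefix) + 52 (wire header) = 56
 *   on the wire: 4-byte prefix | 52-byte transform header | 4096-byte ciphertext
 *   pdu_length    = get_rfc1002_len(buf)   = 52 + 4096 = 4148
 *   buf_data_size = pdu_length + 4 - 56    = 4096
 * so iov[1] covers exactly the ciphertext that follows the transform header,
 * and after decryption it is moved down to sit right after the 4-byte prefix.
 */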
+
+bool smb3_11_final_sess_setup_resp(struct ksmbd_work *work)
+{
+ struct ksmbd_conn *conn = work->conn;
+ struct smb2_hdr *rsp = work->response_buf;
+
+ if (conn->dialect < SMB30_PROT_ID)
+ return false;
+
+ if (work->next_smb2_rcv_hdr_off)
+ rsp = ksmbd_resp_buf_next(work);
+
+ if (le16_to_cpu(rsp->Command) == SMB2_SESSION_SETUP_HE &&
+ rsp->Status == STATUS_SUCCESS)
+ return true;
+ return false;
+}
diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h
new file mode 100644
index 000000000000..bcec845b03f3
--- /dev/null
+++ b/fs/ksmbd/smb2pdu.h
@@ -0,0 +1,1698 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org>
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#ifndef _SMB2PDU_H
+#define _SMB2PDU_H
+
+#include "ntlmssp.h"
+#include "smbacl.h"
+
+/*
+ * Note that, due to trying to use names similar to the protocol specifications,
+ * there are many mixed case field names in the structures below. Although
+ * this does not match typical Linux kernel style, it is necessary to be
+ * able to match against the protocol specification.
+ *
+ * SMB2 commands
+ * Some commands have minimal (wct=0,bcc=0), or uninteresting, responses
+ * (ie no useful data other than the SMB error code itself) and are marked such.
+ * Knowing this helps avoid response buffer allocations and copy in some cases.
+ */
+
+/* List of commands in host endian */
+#define SMB2_NEGOTIATE_HE 0x0000
+#define SMB2_SESSION_SETUP_HE 0x0001
+#define SMB2_LOGOFF_HE 0x0002 /* trivial request/resp */
+#define SMB2_TREE_CONNECT_HE 0x0003
+#define SMB2_TREE_DISCONNECT_HE 0x0004 /* trivial req/resp */
+#define SMB2_CREATE_HE 0x0005
+#define SMB2_CLOSE_HE 0x0006
+#define SMB2_FLUSH_HE 0x0007 /* trivial resp */
+#define SMB2_READ_HE 0x0008
+#define SMB2_WRITE_HE 0x0009
+#define SMB2_LOCK_HE 0x000A
+#define SMB2_IOCTL_HE 0x000B
+#define SMB2_CANCEL_HE 0x000C
+#define SMB2_ECHO_HE 0x000D
+#define SMB2_QUERY_DIRECTORY_HE 0x000E
+#define SMB2_CHANGE_NOTIFY_HE 0x000F
+#define SMB2_QUERY_INFO_HE 0x0010
+#define SMB2_SET_INFO_HE 0x0011
+#define SMB2_OPLOCK_BREAK_HE 0x0012
+
+/* The same list in little endian */
+#define SMB2_NEGOTIATE cpu_to_le16(SMB2_NEGOTIATE_HE)
+#define SMB2_SESSION_SETUP cpu_to_le16(SMB2_SESSION_SETUP_HE)
+#define SMB2_LOGOFF cpu_to_le16(SMB2_LOGOFF_HE)
+#define SMB2_TREE_CONNECT cpu_to_le16(SMB2_TREE_CONNECT_HE)
+#define SMB2_TREE_DISCONNECT cpu_to_le16(SMB2_TREE_DISCONNECT_HE)
+#define SMB2_CREATE cpu_to_le16(SMB2_CREATE_HE)
+#define SMB2_CLOSE cpu_to_le16(SMB2_CLOSE_HE)
+#define SMB2_FLUSH cpu_to_le16(SMB2_FLUSH_HE)
+#define SMB2_READ cpu_to_le16(SMB2_READ_HE)
+#define SMB2_WRITE cpu_to_le16(SMB2_WRITE_HE)
+#define SMB2_LOCK cpu_to_le16(SMB2_LOCK_HE)
+#define SMB2_IOCTL cpu_to_le16(SMB2_IOCTL_HE)
+#define SMB2_CANCEL cpu_to_le16(SMB2_CANCEL_HE)
+#define SMB2_ECHO cpu_to_le16(SMB2_ECHO_HE)
+#define SMB2_QUERY_DIRECTORY cpu_to_le16(SMB2_QUERY_DIRECTORY_HE)
+#define SMB2_CHANGE_NOTIFY cpu_to_le16(SMB2_CHANGE_NOTIFY_HE)
+#define SMB2_QUERY_INFO cpu_to_le16(SMB2_QUERY_INFO_HE)
+#define SMB2_SET_INFO cpu_to_le16(SMB2_SET_INFO_HE)
+#define SMB2_OPLOCK_BREAK cpu_to_le16(SMB2_OPLOCK_BREAK_HE)
+
+/*Create Action Flags*/
+#define FILE_SUPERSEDED 0x00000000
+#define FILE_OPENED 0x00000001
+#define FILE_CREATED 0x00000002
+#define FILE_OVERWRITTEN 0x00000003
+
+/*
+ * Size of the session key (crypto key encrypted with the password)
+ */
+#define SMB2_NTLMV2_SESSKEY_SIZE 16
+#define SMB2_SIGNATURE_SIZE 16
+#define SMB2_HMACSHA256_SIZE 32
+#define SMB2_CMACAES_SIZE 16
+#define SMB3_GCM128_CRYPTKEY_SIZE 16
+#define SMB3_GCM256_CRYPTKEY_SIZE 32
+
+/*
+ * Size of the smb3 encryption/decryption keys
+ */
+#define SMB3_ENC_DEC_KEY_SIZE 32
+
+/*
+ * Size of the smb3 signing key
+ */
+#define SMB3_SIGN_KEY_SIZE 16
+
+#define CIFS_CLIENT_CHALLENGE_SIZE 8
+#define SMB_SERVER_CHALLENGE_SIZE 8
+
+/* SMB2 Max Credits */
+#define SMB2_MAX_CREDITS 8192
+
+#define SMB2_CLIENT_GUID_SIZE 16
+#define SMB2_CREATE_GUID_SIZE 16
+
+/* Maximum buffer size value we can send with 1 credit */
+#define SMB2_MAX_BUFFER_SIZE 65536
+
+#define NUMBER_OF_SMB2_COMMANDS 0x0013
+
+/* BB FIXME - analyze following length BB */
+#define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */
+
+#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) /* 'B''M''S' */
+#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd)
+
+#define SMB21_DEFAULT_IOSIZE (1024 * 1024)
+#define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024)
+#define SMB3_DEFAULT_TRANS_SIZE (1024 * 1024)
+
+/*
+ * SMB2 Header Definition
+ *
+ * "MBZ" : Must be Zero
+ * "BB" : BugBug, Something to check/review/analyze later
+ * "PDU" : "Protocol Data Unit" (ie a network "frame")
+ *
+ */
+
+#define __SMB2_HEADER_STRUCTURE_SIZE 64
+#define SMB2_HEADER_STRUCTURE_SIZE \
+ cpu_to_le16(__SMB2_HEADER_STRUCTURE_SIZE)
+
+struct smb2_hdr {
+ __be32 smb2_buf_length; /* big endian on wire */
+ /*
+ * length is only two or three bytes - with
+ * one or two byte type preceding it that MBZ
+ */
+ __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */
+ __le16 StructureSize; /* 64 */
+ __le16 CreditCharge; /* MBZ */
+ __le32 Status; /* Error from server */
+ __le16 Command;
+ __le16 CreditRequest; /* CreditResponse */
+ __le32 Flags;
+ __le32 NextCommand;
+ __le64 MessageId;
+ union {
+ struct {
+ __le32 ProcessId;
+ __le32 TreeId;
+ } __packed SyncId;
+ __le64 AsyncId;
+ } __packed Id;
+ __le64 SessionId;
+ __u8 Signature[16];
+} __packed;
+
+struct smb2_pdu {
+ struct smb2_hdr hdr;
+ __le16 StructureSize2; /* size of wct area (varies, request specific) */
+} __packed;
+
+#define SMB3_AES_CCM_NONCE 11
+#define SMB3_AES_GCM_NONCE 12
+
+struct smb2_transform_hdr {
+ __be32 smb2_buf_length; /* big endian on wire */
+ /*
+ * length is only two or three bytes - with
+ * one or two byte type preceding it that MBZ
+ */
+ __le32 ProtocolId; /* 0xFD 'S' 'M' 'B' */
+ __u8 Signature[16];
+ __u8 Nonce[16];
+ __le32 OriginalMessageSize;
+ __u16 Reserved1;
+ __le16 Flags; /* EncryptionAlgorithm */
+ __le64 SessionId;
+} __packed;
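
Both structures above fold the 4-byte RFC1002 length prefix into the C struct, so their sizeof is 4 bytes larger than the on-wire header (64 bytes for the SMB2 header, 52 for the transform header). A small compile-time sanity check along these lines could sit in this header; it is a sketch, not part of the patch (static_assert() is the kernel macro from <linux/build_bug.h>):

/* Hedged sketch: assert that the packed layouts match the wire format
 * (4-byte RFC1002 prefix + protocol header). Placement is illustrative. */
static_assert(sizeof(struct smb2_hdr) == __SMB2_HEADER_STRUCTURE_SIZE + 4);
static_assert(sizeof(struct smb2_transform_hdr) == 52 + 4);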
+
+/*
+ * SMB2 flag definitions
+ */
+#define SMB2_FLAGS_SERVER_TO_REDIR cpu_to_le32(0x00000001)
+#define SMB2_FLAGS_ASYNC_COMMAND cpu_to_le32(0x00000002)
+#define SMB2_FLAGS_RELATED_OPERATIONS cpu_to_le32(0x00000004)
+#define SMB2_FLAGS_SIGNED cpu_to_le32(0x00000008)
+#define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000)
+#define SMB2_FLAGS_REPLAY_OPERATIONS cpu_to_le32(0x20000000)
+
+/*
+ * Definitions for SMB2 Protocol Data Units (network frames)
+ *
+ * See MS-SMB2.PDF specification for protocol details.
+ * The Naming convention is the lower case version of the SMB2
+ * command code name for the struct. Note that structures must be packed.
+ *
+ */
+
+#define SMB2_ERROR_STRUCTURE_SIZE2 9
+#define SMB2_ERROR_STRUCTURE_SIZE2_LE cpu_to_le16(SMB2_ERROR_STRUCTURE_SIZE2)
+
+struct smb2_err_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize;
+ __u8 ErrorContextCount;
+ __u8 Reserved;
+ __le32 ByteCount; /* even if zero, at least one byte follows */
+ __u8 ErrorData[1]; /* variable length */
+} __packed;
+
+struct smb2_negotiate_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 36 */
+ __le16 DialectCount;
+ __le16 SecurityMode;
+ __le16 Reserved; /* MBZ */
+ __le32 Capabilities;
+ __u8 ClientGUID[SMB2_CLIENT_GUID_SIZE];
+ /* In SMB3.02 and earlier next three were MBZ le64 ClientStartTime */
+ __le32 NegotiateContextOffset; /* SMB3.1.1 only. MBZ earlier */
+ __le16 NegotiateContextCount; /* SMB3.1.1 only. MBZ earlier */
+ __le16 Reserved2;
+ __le16 Dialects[1]; /* One dialect (vers=) at a time for now */
+} __packed;
+
+/* SecurityMode flags */
+#define SMB2_NEGOTIATE_SIGNING_ENABLED_LE cpu_to_le16(0x0001)
+#define SMB2_NEGOTIATE_SIGNING_REQUIRED 0x0002
+#define SMB2_NEGOTIATE_SIGNING_REQUIRED_LE cpu_to_le16(0x0002)
+/* Capabilities flags */
+#define SMB2_GLOBAL_CAP_DFS 0x00000001
+#define SMB2_GLOBAL_CAP_LEASING 0x00000002 /* Resp only New to SMB2.1 */
+#define SMB2_GLOBAL_CAP_LARGE_MTU 0X00000004 /* Resp only New to SMB2.1 */
+#define SMB2_GLOBAL_CAP_MULTI_CHANNEL 0x00000008 /* New to SMB3 */
+#define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */
+#define SMB2_GLOBAL_CAP_DIRECTORY_LEASING 0x00000020 /* New to SMB3 */
+#define SMB2_GLOBAL_CAP_ENCRYPTION 0x00000040 /* New to SMB3 */
+/* Internal types */
+#define SMB2_NT_FIND 0x00100000
+#define SMB2_LARGE_FILES 0x00200000
+
+#define SMB311_SALT_SIZE 32
+/* Hash Algorithm Types */
+#define SMB2_PREAUTH_INTEGRITY_SHA512 cpu_to_le16(0x0001)
+
+#define PREAUTH_HASHVALUE_SIZE 64
+
+struct preauth_integrity_info {
+ /* PreAuth integrity Hash ID */
+ __le16 Preauth_HashId;
+ /* PreAuth integrity Hash Value */
+ __u8 Preauth_HashValue[PREAUTH_HASHVALUE_SIZE];
+};
+
+/* offset is sizeof smb2_negotiate_rsp - 4 but rounded up to 8 bytes. */
+#ifdef CONFIG_SMB_SERVER_KERBEROS5
+/* sizeof(struct smb2_negotiate_rsp) - 4 =
+ * header(64) + response(64) + GSS_LENGTH(96) + GSS_PADDING(0)
+ */
+#define OFFSET_OF_NEG_CONTEXT 0xe0
+#else
+/* sizeof(struct smb2_negotiate_rsp) - 4 =
+ * header(64) + response(64) + GSS_LENGTH(74) + GSS_PADDING(6)
+ */
+#define OFFSET_OF_NEG_CONTEXT 0xd0
+#endif
+
+#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1)
+#define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2)
+#define SMB2_COMPRESSION_CAPABILITIES cpu_to_le16(3)
+#define SMB2_NETNAME_NEGOTIATE_CONTEXT_ID cpu_to_le16(5)
+#define SMB2_SIGNING_CAPABILITIES cpu_to_le16(8)
+#define SMB2_POSIX_EXTENSIONS_AVAILABLE cpu_to_le16(0x100)
+
+struct smb2_neg_context {
+ __le16 ContextType;
+ __le16 DataLength;
+ __le32 Reserved;
+ /* Followed by array of data */
+} __packed;
+
+struct smb2_preauth_neg_context {
+ __le16 ContextType; /* 1 */
+ __le16 DataLength;
+ __le32 Reserved;
+ __le16 HashAlgorithmCount; /* 1 */
+ __le16 SaltLength;
+ __le16 HashAlgorithms; /* HashAlgorithms[0] since only one defined */
+ __u8 Salt[SMB311_SALT_SIZE];
+} __packed;
+
+/* Encryption Algorithms Ciphers */
+#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001)
+#define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002)
+#define SMB2_ENCRYPTION_AES256_CCM cpu_to_le16(0x0003)
+#define SMB2_ENCRYPTION_AES256_GCM cpu_to_le16(0x0004)
+
+struct smb2_encryption_neg_context {
+ __le16 ContextType; /* 2 */
+ __le16 DataLength;
+ __le32 Reserved;
+	/* CipherCount usually 2, but can be 3 when AES256-GCM enabled */
+ __le16 CipherCount; /* AES-128-GCM and AES-128-CCM by default */
+ __le16 Ciphers[];
+} __packed;
+
+#define SMB3_COMPRESS_NONE cpu_to_le16(0x0000)
+#define SMB3_COMPRESS_LZNT1 cpu_to_le16(0x0001)
+#define SMB3_COMPRESS_LZ77 cpu_to_le16(0x0002)
+#define SMB3_COMPRESS_LZ77_HUFF cpu_to_le16(0x0003)
+
+struct smb2_compression_ctx {
+ __le16 ContextType; /* 3 */
+ __le16 DataLength;
+ __le32 Reserved;
+ __le16 CompressionAlgorithmCount;
+ __u16 Padding;
+ __le32 Reserved1;
+ __le16 CompressionAlgorithms[];
+} __packed;
+
+#define POSIX_CTXT_DATA_LEN 16
+struct smb2_posix_neg_context {
+ __le16 ContextType; /* 0x100 */
+ __le16 DataLength;
+ __le32 Reserved;
+ __u8 Name[16]; /* POSIX ctxt GUID 93AD25509CB411E7B42383DE968BCD7C */
+} __packed;
+
+struct smb2_netname_neg_context {
+	__le16	ContextType; /* 5 */
+ __le16 DataLength;
+ __le32 Reserved;
+ __le16 NetName[]; /* hostname of target converted to UCS-2 */
+} __packed;
+
+/* Signing algorithms */
+#define SIGNING_ALG_HMAC_SHA256 cpu_to_le16(0)
+#define SIGNING_ALG_AES_CMAC cpu_to_le16(1)
+#define SIGNING_ALG_AES_GMAC cpu_to_le16(2)
+
+struct smb2_signing_capabilities {
+ __le16 ContextType; /* 8 */
+ __le16 DataLength;
+ __le32 Reserved;
+ __le16 SigningAlgorithmCount;
+ __le16 SigningAlgorithms[];
+} __packed;
+
+struct smb2_negotiate_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 65 */
+ __le16 SecurityMode;
+ __le16 DialectRevision;
+ __le16 NegotiateContextCount; /* Prior to SMB3.1.1 was Reserved & MBZ */
+ __u8 ServerGUID[16];
+ __le32 Capabilities;
+ __le32 MaxTransactSize;
+ __le32 MaxReadSize;
+ __le32 MaxWriteSize;
+ __le64 SystemTime; /* MBZ */
+ __le64 ServerStartTime;
+ __le16 SecurityBufferOffset;
+ __le16 SecurityBufferLength;
+ __le32 NegotiateContextOffset; /* Pre:SMB3.1.1 was reserved/ignored */
+ __u8 Buffer[1]; /* variable length GSS security buffer */
+} __packed;
+
+/* Flags */
+#define SMB2_SESSION_REQ_FLAG_BINDING 0x01
+#define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA 0x04
+
+#define SMB2_SESSION_EXPIRED (0)
+#define SMB2_SESSION_IN_PROGRESS BIT(0)
+#define SMB2_SESSION_VALID BIT(1)
+
+struct smb2_sess_setup_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 25 */
+ __u8 Flags;
+ __u8 SecurityMode;
+ __le32 Capabilities;
+ __le32 Channel;
+ __le16 SecurityBufferOffset;
+ __le16 SecurityBufferLength;
+ __le64 PreviousSessionId;
+ __u8 Buffer[1]; /* variable length GSS security buffer */
+} __packed;
+
+/* Flags/Reserved for SMB3.1.1 */
+#define SMB2_SHAREFLAG_CLUSTER_RECONNECT 0x0001
+
+/* Currently defined SessionFlags */
+#define SMB2_SESSION_FLAG_IS_GUEST_LE cpu_to_le16(0x0001)
+#define SMB2_SESSION_FLAG_IS_NULL_LE cpu_to_le16(0x0002)
+#define SMB2_SESSION_FLAG_ENCRYPT_DATA_LE cpu_to_le16(0x0004)
+struct smb2_sess_setup_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 9 */
+ __le16 SessionFlags;
+ __le16 SecurityBufferOffset;
+ __le16 SecurityBufferLength;
+ __u8 Buffer[1]; /* variable length GSS security buffer */
+} __packed;
+
+struct smb2_logoff_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __le16 Reserved;
+} __packed;
+
+struct smb2_logoff_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __le16 Reserved;
+} __packed;
+
+struct smb2_tree_connect_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 9 */
+ __le16 Reserved; /* Flags in SMB3.1.1 */
+ __le16 PathOffset;
+ __le16 PathLength;
+ __u8 Buffer[1]; /* variable length */
+} __packed;
+
+struct smb2_tree_connect_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 16 */
+ __u8 ShareType; /* see below */
+ __u8 Reserved;
+ __le32 ShareFlags; /* see below */
+ __le32 Capabilities; /* see below */
+ __le32 MaximalAccess;
+} __packed;
+
+/* Possible ShareType values */
+#define SMB2_SHARE_TYPE_DISK 0x01
+#define SMB2_SHARE_TYPE_PIPE 0x02
+#define SMB2_SHARE_TYPE_PRINT 0x03
+
+/*
+ * Possible ShareFlags - exactly one of the first 4 caching flags must be
+ * set (any of the remaining SHI1005 flags may be set individually or in
+ * combination).
+ */
+#define SMB2_SHAREFLAG_MANUAL_CACHING 0x00000000
+#define SMB2_SHAREFLAG_AUTO_CACHING 0x00000010
+#define SMB2_SHAREFLAG_VDO_CACHING 0x00000020
+#define SMB2_SHAREFLAG_NO_CACHING 0x00000030
+#define SHI1005_FLAGS_DFS 0x00000001
+#define SHI1005_FLAGS_DFS_ROOT 0x00000002
+#define SHI1005_FLAGS_RESTRICT_EXCLUSIVE_OPENS 0x00000100
+#define SHI1005_FLAGS_FORCE_SHARED_DELETE 0x00000200
+#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING 0x00000400
+#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM 0x00000800
+#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK 0x00001000
+#define SHI1005_FLAGS_ENABLE_HASH 0x00002000
+
+/* Possible share capabilities */
+#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008)
+
+struct smb2_tree_disconnect_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __le16 Reserved;
+} __packed;
+
+struct smb2_tree_disconnect_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __le16 Reserved;
+} __packed;
+
+#define ATTR_READONLY_LE cpu_to_le32(ATTR_READONLY)
+#define ATTR_HIDDEN_LE cpu_to_le32(ATTR_HIDDEN)
+#define ATTR_SYSTEM_LE cpu_to_le32(ATTR_SYSTEM)
+#define ATTR_DIRECTORY_LE cpu_to_le32(ATTR_DIRECTORY)
+#define ATTR_ARCHIVE_LE cpu_to_le32(ATTR_ARCHIVE)
+#define ATTR_NORMAL_LE cpu_to_le32(ATTR_NORMAL)
+#define ATTR_TEMPORARY_LE cpu_to_le32(ATTR_TEMPORARY)
+#define ATTR_SPARSE_FILE_LE cpu_to_le32(ATTR_SPARSE)
+#define ATTR_REPARSE_POINT_LE cpu_to_le32(ATTR_REPARSE)
+#define ATTR_COMPRESSED_LE cpu_to_le32(ATTR_COMPRESSED)
+#define ATTR_OFFLINE_LE cpu_to_le32(ATTR_OFFLINE)
+#define ATTR_NOT_CONTENT_INDEXED_LE cpu_to_le32(ATTR_NOT_CONTENT_INDEXED)
+#define ATTR_ENCRYPTED_LE cpu_to_le32(ATTR_ENCRYPTED)
+#define ATTR_INTEGRITY_STREAML_LE cpu_to_le32(0x00008000)
+#define ATTR_NO_SCRUB_DATA_LE cpu_to_le32(0x00020000)
+#define ATTR_MASK_LE cpu_to_le32(0x00007FB7)
+
+/* Oplock levels */
+#define SMB2_OPLOCK_LEVEL_NONE 0x00
+#define SMB2_OPLOCK_LEVEL_II 0x01
+#define SMB2_OPLOCK_LEVEL_EXCLUSIVE 0x08
+#define SMB2_OPLOCK_LEVEL_BATCH 0x09
+#define SMB2_OPLOCK_LEVEL_LEASE 0xFF
+/* Non-spec internal type */
+#define SMB2_OPLOCK_LEVEL_NOCHANGE 0x99
+
+/* Desired Access Flags */
+#define FILE_READ_DATA_LE cpu_to_le32(0x00000001)
+#define FILE_LIST_DIRECTORY_LE cpu_to_le32(0x00000001)
+#define FILE_WRITE_DATA_LE cpu_to_le32(0x00000002)
+#define FILE_ADD_FILE_LE cpu_to_le32(0x00000002)
+#define FILE_APPEND_DATA_LE cpu_to_le32(0x00000004)
+#define FILE_ADD_SUBDIRECTORY_LE cpu_to_le32(0x00000004)
+#define FILE_READ_EA_LE cpu_to_le32(0x00000008)
+#define FILE_WRITE_EA_LE cpu_to_le32(0x00000010)
+#define FILE_EXECUTE_LE cpu_to_le32(0x00000020)
+#define FILE_TRAVERSE_LE cpu_to_le32(0x00000020)
+#define FILE_DELETE_CHILD_LE cpu_to_le32(0x00000040)
+#define FILE_READ_ATTRIBUTES_LE cpu_to_le32(0x00000080)
+#define FILE_WRITE_ATTRIBUTES_LE cpu_to_le32(0x00000100)
+#define FILE_DELETE_LE cpu_to_le32(0x00010000)
+#define FILE_READ_CONTROL_LE cpu_to_le32(0x00020000)
+#define FILE_WRITE_DAC_LE cpu_to_le32(0x00040000)
+#define FILE_WRITE_OWNER_LE cpu_to_le32(0x00080000)
+#define FILE_SYNCHRONIZE_LE cpu_to_le32(0x00100000)
+#define FILE_ACCESS_SYSTEM_SECURITY_LE cpu_to_le32(0x01000000)
+#define FILE_MAXIMAL_ACCESS_LE cpu_to_le32(0x02000000)
+#define FILE_GENERIC_ALL_LE cpu_to_le32(0x10000000)
+#define FILE_GENERIC_EXECUTE_LE cpu_to_le32(0x20000000)
+#define FILE_GENERIC_WRITE_LE cpu_to_le32(0x40000000)
+#define FILE_GENERIC_READ_LE cpu_to_le32(0x80000000)
+#define DESIRED_ACCESS_MASK cpu_to_le32(0xF21F01FF)
+
+/* ShareAccess Flags */
+#define FILE_SHARE_READ_LE cpu_to_le32(0x00000001)
+#define FILE_SHARE_WRITE_LE cpu_to_le32(0x00000002)
+#define FILE_SHARE_DELETE_LE cpu_to_le32(0x00000004)
+#define FILE_SHARE_ALL_LE cpu_to_le32(0x00000007)
+
+/* CreateDisposition Flags */
+#define FILE_SUPERSEDE_LE cpu_to_le32(0x00000000)
+#define FILE_OPEN_LE cpu_to_le32(0x00000001)
+#define FILE_CREATE_LE cpu_to_le32(0x00000002)
+#define FILE_OPEN_IF_LE cpu_to_le32(0x00000003)
+#define FILE_OVERWRITE_LE cpu_to_le32(0x00000004)
+#define FILE_OVERWRITE_IF_LE cpu_to_le32(0x00000005)
+#define FILE_CREATE_MASK_LE cpu_to_le32(0x00000007)
+
+#define FILE_READ_DESIRED_ACCESS_LE (FILE_READ_DATA_LE | \
+ FILE_READ_EA_LE | \
+ FILE_GENERIC_READ_LE)
+#define FILE_WRITE_DESIRE_ACCESS_LE (FILE_WRITE_DATA_LE | \
+ FILE_APPEND_DATA_LE | \
+ FILE_WRITE_EA_LE | \
+ FILE_WRITE_ATTRIBUTES_LE | \
+ FILE_GENERIC_WRITE_LE)
+
+/* Impersonation Levels */
+#define IL_ANONYMOUS_LE cpu_to_le32(0x00000000)
+#define IL_IDENTIFICATION_LE cpu_to_le32(0x00000001)
+#define IL_IMPERSONATION_LE cpu_to_le32(0x00000002)
+#define IL_DELEGATE_LE cpu_to_le32(0x00000003)
+
+/* Create Context Values */
+#define SMB2_CREATE_EA_BUFFER "ExtA" /* extended attributes */
+#define SMB2_CREATE_SD_BUFFER "SecD" /* security descriptor */
+#define SMB2_CREATE_DURABLE_HANDLE_REQUEST "DHnQ"
+#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT "DHnC"
+#define SMB2_CREATE_ALLOCATION_SIZE "AlSi"
+#define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc"
+#define SMB2_CREATE_TIMEWARP_REQUEST "TWrp"
+#define SMB2_CREATE_QUERY_ON_DISK_ID "QFid"
+#define SMB2_CREATE_REQUEST_LEASE "RqLs"
+#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q"
+#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C"
+#define SMB2_CREATE_APP_INSTANCE_ID "\x45\xBC\xA6\x6A\xEF\xA7\xF7\x4A\x90\x08\xFA\x46\x2E\x14\x4D\x74"
+#define SMB2_CREATE_APP_INSTANCE_VERSION "\xB9\x82\xD0\xB7\x3B\x56\x07\x4F\xA0\x7B\x52\x4A\x81\x16\xA0\x10"
+#define SVHDX_OPEN_DEVICE_CONTEXT 0x83CE6F1AD851E0986E34401CC9BCFCE9
+#define SMB2_CREATE_TAG_POSIX "\x93\xAD\x25\x50\x9C\xB4\x11\xE7\xB4\x23\x83\xDE\x96\x8B\xCD\x7C"
+
+struct smb2_create_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 57 */
+ __u8 SecurityFlags;
+ __u8 RequestedOplockLevel;
+ __le32 ImpersonationLevel;
+ __le64 SmbCreateFlags;
+ __le64 Reserved;
+ __le32 DesiredAccess;
+ __le32 FileAttributes;
+ __le32 ShareAccess;
+ __le32 CreateDisposition;
+ __le32 CreateOptions;
+ __le16 NameOffset;
+ __le16 NameLength;
+ __le32 CreateContextsOffset;
+ __le32 CreateContextsLength;
+ __u8 Buffer[0];
+} __packed;
+
+struct smb2_create_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 89 */
+ __u8 OplockLevel;
+ __u8 Reserved;
+ __le32 CreateAction;
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le64 AllocationSize;
+ __le64 EndofFile;
+ __le32 FileAttributes;
+ __le32 Reserved2;
+ __le64 PersistentFileId;
+ __le64 VolatileFileId;
+ __le32 CreateContextsOffset;
+ __le32 CreateContextsLength;
+ __u8 Buffer[1];
+} __packed;
+
+struct create_context {
+ __le32 Next;
+ __le16 NameOffset;
+ __le16 NameLength;
+ __le16 Reserved;
+ __le16 DataOffset;
+ __le32 DataLength;
+ __u8 Buffer[0];
+} __packed;
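+
+/*
+ * Illustrative sketch (hypothetical helper, not part of this header): create
+ * contexts are chained through the Next field, which holds the byte offset
+ * from the start of the current context to the next one; zero ends the list.
+ * Real parsing code must also bound-check every offset against the buffer.
+ */
+static inline struct create_context *
+example_next_create_context(struct create_context *cc)
+{
+	__u32 next = le32_to_cpu(cc->Next);
+
+	return next ? (struct create_context *)((__u8 *)cc + next) : NULL;
+}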
+
+struct create_durable_req_v2 {
+ struct create_context ccontext;
+ __u8 Name[8];
+ __le32 Timeout;
+ __le32 Flags;
+ __u8 Reserved[8];
+ __u8 CreateGuid[16];
+} __packed;
+
+struct create_durable_reconn_req {
+ struct create_context ccontext;
+ __u8 Name[8];
+ union {
+ __u8 Reserved[16];
+ struct {
+ __le64 PersistentFileId;
+ __le64 VolatileFileId;
+ } Fid;
+ } Data;
+} __packed;
+
+struct create_durable_reconn_v2_req {
+ struct create_context ccontext;
+ __u8 Name[8];
+ struct {
+ __le64 PersistentFileId;
+ __le64 VolatileFileId;
+ } Fid;
+ __u8 CreateGuid[16];
+ __le32 Flags;
+} __packed;
+
+struct create_app_inst_id {
+ struct create_context ccontext;
+ __u8 Name[8];
+ __u8 Reserved[8];
+ __u8 AppInstanceId[16];
+} __packed;
+
+struct create_app_inst_id_vers {
+ struct create_context ccontext;
+ __u8 Name[8];
+ __u8 Reserved[2];
+ __u8 Padding[4];
+ __le64 AppInstanceVersionHigh;
+ __le64 AppInstanceVersionLow;
+} __packed;
+
+struct create_mxac_req {
+ struct create_context ccontext;
+ __u8 Name[8];
+ __le64 Timestamp;
+} __packed;
+
+struct create_alloc_size_req {
+ struct create_context ccontext;
+ __u8 Name[8];
+ __le64 AllocationSize;
+} __packed;
+
+struct create_posix {
+ struct create_context ccontext;
+ __u8 Name[16];
+ __le32 Mode;
+ __u32 Reserved;
+} __packed;
+
+struct create_durable_rsp {
+ struct create_context ccontext;
+ __u8 Name[8];
+ union {
+ __u8 Reserved[8];
+ __u64 data;
+ } Data;
+} __packed;
+
+struct create_durable_v2_rsp {
+ struct create_context ccontext;
+ __u8 Name[8];
+ __le32 Timeout;
+ __le32 Flags;
+} __packed;
+
+struct create_mxac_rsp {
+ struct create_context ccontext;
+ __u8 Name[8];
+ __le32 QueryStatus;
+ __le32 MaximalAccess;
+} __packed;
+
+struct create_disk_id_rsp {
+ struct create_context ccontext;
+ __u8 Name[8];
+ __le64 DiskFileId;
+ __le64 VolumeId;
+ __u8 Reserved[16];
+} __packed;
+
+/* equivalent of the contents of SMB3.1.1 POSIX open context response */
+struct create_posix_rsp {
+ struct create_context ccontext;
+ __u8 Name[16];
+ __le32 nlink;
+ __le32 reparse_tag;
+ __le32 mode;
+ u8 SidBuffer[40];
+} __packed;
+
+#define SMB2_LEASE_NONE_LE cpu_to_le32(0x00)
+#define SMB2_LEASE_READ_CACHING_LE cpu_to_le32(0x01)
+#define SMB2_LEASE_HANDLE_CACHING_LE cpu_to_le32(0x02)
+#define SMB2_LEASE_WRITE_CACHING_LE cpu_to_le32(0x04)
+
+#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS_LE cpu_to_le32(0x02)
+
+struct lease_context {
+ __le64 LeaseKeyLow;
+ __le64 LeaseKeyHigh;
+ __le32 LeaseState;
+ __le32 LeaseFlags;
+ __le64 LeaseDuration;
+} __packed;
+
+struct lease_context_v2 {
+ __le64 LeaseKeyLow;
+ __le64 LeaseKeyHigh;
+ __le32 LeaseState;
+ __le32 LeaseFlags;
+ __le64 LeaseDuration;
+ __le64 ParentLeaseKeyLow;
+ __le64 ParentLeaseKeyHigh;
+ __le16 Epoch;
+ __le16 Reserved;
+} __packed;
+
+struct create_lease {
+ struct create_context ccontext;
+ __u8 Name[8];
+ struct lease_context lcontext;
+} __packed;
+
+struct create_lease_v2 {
+ struct create_context ccontext;
+ __u8 Name[8];
+ struct lease_context_v2 lcontext;
+ __u8 Pad[4];
+} __packed;
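+
+/*
+ * Illustrative sketch (hypothetical helper, not part of this header): how a
+ * V1 lease create context could be laid out. Offsets are relative to the
+ * start of struct create_lease, and SMB2_CREATE_REQUEST_LEASE is the 4-byte
+ * "RqLs" tag defined above.
+ */
+static inline void example_fill_create_lease(struct create_lease *buf,
+					     const __u8 lease_key[16],
+					     __le32 lease_state)
+{
+	buf->ccontext.NameOffset = cpu_to_le16(offsetof(struct create_lease, Name));
+	buf->ccontext.NameLength = cpu_to_le16(4);
+	buf->ccontext.DataOffset = cpu_to_le16(offsetof(struct create_lease, lcontext));
+	buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context));
+	buf->ccontext.Next = 0;		/* last (or only) context in the chain */
+	memcpy(buf->Name, SMB2_CREATE_REQUEST_LEASE, 4);
+	memcpy(&buf->lcontext.LeaseKeyLow, lease_key, 16);
+	buf->lcontext.LeaseState = lease_state;
+	buf->lcontext.LeaseFlags = 0;
+	buf->lcontext.LeaseDuration = 0;
+}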
+
+/* Currently defined values for close flags */
+#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001)
+struct smb2_close_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 24 */
+ __le16 Flags;
+ __le32 Reserved;
+ __le64 PersistentFileId;
+ __le64 VolatileFileId;
+} __packed;
+
+struct smb2_close_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* 60 */
+ __le16 Flags;
+ __le32 Reserved;
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */
+ __le64 EndOfFile;
+ __le32 Attributes;
+} __packed;
+
+struct smb2_flush_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 24 */
+ __le16 Reserved1;
+ __le32 Reserved2;
+ __le64 PersistentFileId;
+ __le64 VolatileFileId;
+} __packed;
+
+struct smb2_flush_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize;
+ __le16 Reserved;
+} __packed;
+
+struct smb2_buffer_desc_v1 {
+ __le64 offset;
+ __le32 token;
+ __le32 length;
+} __packed;
+
+#define SMB2_CHANNEL_NONE cpu_to_le32(0x00000000)
+#define SMB2_CHANNEL_RDMA_V1 cpu_to_le32(0x00000001)
+#define SMB2_CHANNEL_RDMA_V1_INVALIDATE cpu_to_le32(0x00000002)
+
+struct smb2_read_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 49 */
+ __u8 Padding; /* offset from start of SMB2 header to place read */
+ __u8 Reserved;
+ __le32 Length;
+ __le64 Offset;
+ __le64 PersistentFileId;
+ __le64 VolatileFileId;
+ __le32 MinimumCount;
+ __le32 Channel; /* Reserved MBZ */
+ __le32 RemainingBytes;
+ __le16 ReadChannelInfoOffset; /* Reserved MBZ */
+ __le16 ReadChannelInfoLength; /* Reserved MBZ */
+ __u8 Buffer[1];
+} __packed;
+
+struct smb2_read_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 17 */
+ __u8 DataOffset;
+ __u8 Reserved;
+ __le32 DataLength;
+ __le32 DataRemaining;
+ __u32 Reserved2;
+ __u8 Buffer[1];
+} __packed;
+
+/* For write request Flags field below the following flag is defined: */
+#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001
+
+struct smb2_write_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 49 */
+ __le16 DataOffset; /* offset from start of SMB2 header to write data */
+ __le32 Length;
+ __le64 Offset;
+ __le64 PersistentFileId;
+ __le64 VolatileFileId;
+ __le32 Channel; /* Reserved MBZ */
+ __le32 RemainingBytes;
+ __le16 WriteChannelInfoOffset; /* Reserved MBZ */
+ __le16 WriteChannelInfoLength; /* Reserved MBZ */
+ __le32 Flags;
+ __u8 Buffer[1];
+} __packed;
+
+struct smb2_write_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 17 */
+ __u8 DataOffset;
+ __u8 Reserved;
+ __le32 DataLength;
+ __le32 DataRemaining;
+ __u32 Reserved2;
+ __u8 Buffer[1];
+} __packed;
+
+#define SMB2_0_IOCTL_IS_FSCTL 0x00000001
+
+struct duplicate_extents_to_file {
+ __u64 PersistentFileHandle; /* source file handle, opaque endianness */
+ __u64 VolatileFileHandle;
+ __le64 SourceFileOffset;
+ __le64 TargetFileOffset;
+ __le64 ByteCount; /* Bytes to be copied */
+} __packed;
+
+struct smb2_ioctl_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 57 */
+ __le16 Reserved;
+ __le32 CntCode;
+ __le64 PersistentFileId;
+ __le64 VolatileFileId;
+ __le32 InputOffset;
+ __le32 InputCount;
+ __le32 MaxInputResponse;
+ __le32 OutputOffset;
+ __le32 OutputCount;
+ __le32 MaxOutputResponse;
+ __le32 Flags;
+ __le32 Reserved2;
+ __u8 Buffer[1];
+} __packed;
+
+struct smb2_ioctl_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 49 */
+ __le16 Reserved;
+ __le32 CntCode;
+ __le64 PersistentFileId;
+ __le64 VolatileFileId;
+ __le32 InputOffset;
+ __le32 InputCount;
+ __le32 OutputOffset;
+ __le32 OutputCount;
+ __le32 Flags;
+ __le32 Reserved2;
+ __u8 Buffer[1];
+} __packed;
+
+struct validate_negotiate_info_req {
+ __le32 Capabilities;
+ __u8 Guid[SMB2_CLIENT_GUID_SIZE];
+ __le16 SecurityMode;
+ __le16 DialectCount;
+ __le16 Dialects[1]; /* dialect (someday maybe list) client asked for */
+} __packed;
+
+struct validate_negotiate_info_rsp {
+ __le32 Capabilities;
+ __u8 Guid[SMB2_CLIENT_GUID_SIZE];
+ __le16 SecurityMode;
+ __le16 Dialect; /* Dialect in use for the connection */
+} __packed;
+
+struct smb_sockaddr_in {
+ __be16 Port;
+ __be32 IPv4address;
+ __u8 Reserved[8];
+} __packed;
+
+struct smb_sockaddr_in6 {
+ __be16 Port;
+ __be32 FlowInfo;
+ __u8 IPv6address[16];
+ __be32 ScopeId;
+} __packed;
+
+#define INTERNETWORK 0x0002
+#define INTERNETWORKV6 0x0017
+
+struct sockaddr_storage_rsp {
+ __le16 Family;
+ union {
+ struct smb_sockaddr_in addr4;
+ struct smb_sockaddr_in6 addr6;
+ };
+} __packed;
+
+#define RSS_CAPABLE 0x00000001
+#define RDMA_CAPABLE 0x00000002
+
+struct network_interface_info_ioctl_rsp {
+ __le32 Next; /* next interface. zero if this is last one */
+ __le32 IfIndex;
+ __le32 Capability; /* RSS or RDMA Capable */
+ __le32 Reserved;
+ __le64 LinkSpeed;
+ char SockAddr_Storage[128];
+} __packed;
+
+struct file_object_buf_type1_ioctl_rsp {
+ __u8 ObjectId[16];
+ __u8 BirthVolumeId[16];
+ __u8 BirthObjectId[16];
+ __u8 DomainId[16];
+} __packed;
+
+struct resume_key_ioctl_rsp {
+ __le64 ResumeKey[3];
+ __le32 ContextLength;
+ __u8 Context[4]; /* ignored, Windows sets to 4 bytes of zero */
+} __packed;
+
+struct copychunk_ioctl_req {
+ __le64 ResumeKey[3];
+ __le32 ChunkCount;
+ __le32 Reserved;
+ __u8 Chunks[1]; /* array of srv_copychunk */
+} __packed;
+
+struct srv_copychunk {
+ __le64 SourceOffset;
+ __le64 TargetOffset;
+ __le32 Length;
+ __le32 Reserved;
+} __packed;
+
+struct copychunk_ioctl_rsp {
+ __le32 ChunksWritten;
+ __le32 ChunkBytesWritten;
+ __le32 TotalBytesWritten;
+} __packed;
+
+struct file_sparse {
+ __u8 SetSparse;
+} __packed;
+
+struct file_zero_data_information {
+ __le64 FileOffset;
+ __le64 BeyondFinalZero;
+} __packed;
+
+struct file_allocated_range_buffer {
+ __le64 file_offset;
+ __le64 length;
+} __packed;
+
+struct reparse_data_buffer {
+ __le32 ReparseTag;
+ __le16 ReparseDataLength;
+ __u16 Reserved;
+ __u8 DataBuffer[]; /* Variable Length */
+} __packed;
+
+/* Completion Filter flags for Notify */
+#define FILE_NOTIFY_CHANGE_FILE_NAME 0x00000001
+#define FILE_NOTIFY_CHANGE_DIR_NAME 0x00000002
+#define FILE_NOTIFY_CHANGE_NAME 0x00000003
+#define FILE_NOTIFY_CHANGE_ATTRIBUTES 0x00000004
+#define FILE_NOTIFY_CHANGE_SIZE 0x00000008
+#define FILE_NOTIFY_CHANGE_LAST_WRITE 0x00000010
+#define FILE_NOTIFY_CHANGE_LAST_ACCESS 0x00000020
+#define FILE_NOTIFY_CHANGE_CREATION 0x00000040
+#define FILE_NOTIFY_CHANGE_EA 0x00000080
+#define FILE_NOTIFY_CHANGE_SECURITY 0x00000100
+#define FILE_NOTIFY_CHANGE_STREAM_NAME 0x00000200
+#define FILE_NOTIFY_CHANGE_STREAM_SIZE 0x00000400
+#define FILE_NOTIFY_CHANGE_STREAM_WRITE 0x00000800
+
+/* Flags */
+#define SMB2_WATCH_TREE 0x0001
+
+struct smb2_notify_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 32 */
+ __le16 Flags;
+ __le32 OutputBufferLength;
+ __le64 PersistentFileId;
+ __le64 VolatileFileId;
+ __u32 CompletionFileter;
+ __u32 Reserved;
+} __packed;
+
+struct smb2_notify_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 9 */
+ __le16 OutputBufferOffset;
+ __le32 OutputBufferLength;
+ __u8 Buffer[1];
+} __packed;
+
+/* SMB2 Notify Action Flags */
+#define FILE_ACTION_ADDED 0x00000001
+#define FILE_ACTION_REMOVED 0x00000002
+#define FILE_ACTION_MODIFIED 0x00000003
+#define FILE_ACTION_RENAMED_OLD_NAME 0x00000004
+#define FILE_ACTION_RENAMED_NEW_NAME 0x00000005
+#define FILE_ACTION_ADDED_STREAM 0x00000006
+#define FILE_ACTION_REMOVED_STREAM 0x00000007
+#define FILE_ACTION_MODIFIED_STREAM 0x00000008
+#define FILE_ACTION_REMOVED_BY_DELETE 0x00000009
+
+#define SMB2_LOCKFLAG_SHARED 0x0001
+#define SMB2_LOCKFLAG_EXCLUSIVE 0x0002
+#define SMB2_LOCKFLAG_UNLOCK 0x0004
+#define SMB2_LOCKFLAG_FAIL_IMMEDIATELY 0x0010
+#define SMB2_LOCKFLAG_MASK 0x0007
+
+struct smb2_lock_element {
+ __le64 Offset;
+ __le64 Length;
+ __le32 Flags;
+ __le32 Reserved;
+} __packed;
+
+struct smb2_lock_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 48 */
+ __le16 LockCount;
+ __le32 Reserved;
+ __le64 PersistentFileId;
+ __le64 VolatileFileId;
+ /* Followed by at least one */
+ struct smb2_lock_element locks[1];
+} __packed;
+
+struct smb2_lock_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __le16 Reserved;
+} __packed;
+
+struct smb2_echo_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __u16 Reserved;
+} __packed;
+
+struct smb2_echo_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __u16 Reserved;
+} __packed;
+
+/* search (query_directory) Flags field */
+#define SMB2_RESTART_SCANS 0x01
+#define SMB2_RETURN_SINGLE_ENTRY 0x02
+#define SMB2_INDEX_SPECIFIED 0x04
+#define SMB2_REOPEN 0x10
+
+struct smb2_query_directory_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 33 */
+ __u8 FileInformationClass;
+ __u8 Flags;
+ __le32 FileIndex;
+ __le64 PersistentFileId;
+ __le64 VolatileFileId;
+ __le16 FileNameOffset;
+ __le16 FileNameLength;
+ __le32 OutputBufferLength;
+ __u8 Buffer[1];
+} __packed;
+
+struct smb2_query_directory_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 9 */
+ __le16 OutputBufferOffset;
+ __le32 OutputBufferLength;
+ __u8 Buffer[1];
+} __packed;
+
+/* Possible InfoType values */
+#define SMB2_O_INFO_FILE 0x01
+#define SMB2_O_INFO_FILESYSTEM 0x02
+#define SMB2_O_INFO_SECURITY 0x03
+#define SMB2_O_INFO_QUOTA 0x04
+
+/* Security info type additionalinfo flags. See MS-SMB2 (2.2.37) or MS-DTYP */
+#define OWNER_SECINFO 0x00000001
+#define GROUP_SECINFO 0x00000002
+#define DACL_SECINFO 0x00000004
+#define SACL_SECINFO 0x00000008
+#define LABEL_SECINFO 0x00000010
+#define ATTRIBUTE_SECINFO 0x00000020
+#define SCOPE_SECINFO 0x00000040
+#define BACKUP_SECINFO 0x00010000
+#define UNPROTECTED_SACL_SECINFO 0x10000000
+#define UNPROTECTED_DACL_SECINFO 0x20000000
+#define PROTECTED_SACL_SECINFO 0x40000000
+#define PROTECTED_DACL_SECINFO 0x80000000
+
+struct smb2_query_info_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 41 */
+ __u8 InfoType;
+ __u8 FileInfoClass;
+ __le32 OutputBufferLength;
+ __le16 InputBufferOffset;
+ __u16 Reserved;
+ __le32 InputBufferLength;
+ __le32 AdditionalInformation;
+ __le32 Flags;
+ __le64 PersistentFileId;
+ __le64 VolatileFileId;
+ __u8 Buffer[1];
+} __packed;
+
+struct smb2_query_info_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 9 */
+ __le16 OutputBufferOffset;
+ __le32 OutputBufferLength;
+ __u8 Buffer[1];
+} __packed;
+
+struct smb2_set_info_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 33 */
+ __u8 InfoType;
+ __u8 FileInfoClass;
+ __le32 BufferLength;
+ __le16 BufferOffset;
+ __u16 Reserved;
+ __le32 AdditionalInformation;
+ __le64 PersistentFileId;
+ __le64 VolatileFileId;
+ __u8 Buffer[1];
+} __packed;
+
+struct smb2_set_info_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 2 */
+} __packed;
+
+/* FILE Info response size */
+#define FILE_DIRECTORY_INFORMATION_SIZE 1
+#define FILE_FULL_DIRECTORY_INFORMATION_SIZE 2
+#define FILE_BOTH_DIRECTORY_INFORMATION_SIZE 3
+#define FILE_BASIC_INFORMATION_SIZE 40
+#define FILE_STANDARD_INFORMATION_SIZE 24
+#define FILE_INTERNAL_INFORMATION_SIZE 8
+#define FILE_EA_INFORMATION_SIZE 4
+#define FILE_ACCESS_INFORMATION_SIZE 4
+#define FILE_NAME_INFORMATION_SIZE 9
+#define FILE_RENAME_INFORMATION_SIZE 10
+#define FILE_LINK_INFORMATION_SIZE 11
+#define FILE_NAMES_INFORMATION_SIZE 12
+#define FILE_DISPOSITION_INFORMATION_SIZE 13
+#define FILE_POSITION_INFORMATION_SIZE 14
+#define FILE_FULL_EA_INFORMATION_SIZE 15
+#define FILE_MODE_INFORMATION_SIZE 4
+#define FILE_ALIGNMENT_INFORMATION_SIZE 4
+#define FILE_ALL_INFORMATION_SIZE 104
+#define FILE_ALLOCATION_INFORMATION_SIZE 19
+#define FILE_END_OF_FILE_INFORMATION_SIZE 20
+#define FILE_ALTERNATE_NAME_INFORMATION_SIZE 8
+#define FILE_STREAM_INFORMATION_SIZE 32
+#define FILE_PIPE_INFORMATION_SIZE 23
+#define FILE_PIPE_LOCAL_INFORMATION_SIZE 24
+#define FILE_PIPE_REMOTE_INFORMATION_SIZE 25
+#define FILE_MAILSLOT_QUERY_INFORMATION_SIZE 26
+#define FILE_MAILSLOT_SET_INFORMATION_SIZE 27
+#define FILE_COMPRESSION_INFORMATION_SIZE 16
+#define FILE_OBJECT_ID_INFORMATION_SIZE 29
+/* Number 30 not defined in documents */
+#define FILE_MOVE_CLUSTER_INFORMATION_SIZE 31
+#define FILE_QUOTA_INFORMATION_SIZE 32
+#define FILE_REPARSE_POINT_INFORMATION_SIZE 33
+#define FILE_NETWORK_OPEN_INFORMATION_SIZE 56
+#define FILE_ATTRIBUTE_TAG_INFORMATION_SIZE 8
+
+/* FS Info response size */
+#define FS_DEVICE_INFORMATION_SIZE 8
+#define FS_ATTRIBUTE_INFORMATION_SIZE 16
+#define FS_VOLUME_INFORMATION_SIZE 24
+#define FS_SIZE_INFORMATION_SIZE 24
+#define FS_FULL_SIZE_INFORMATION_SIZE 32
+#define FS_SECTOR_SIZE_INFORMATION_SIZE 28
+#define FS_OBJECT_ID_INFORMATION_SIZE 64
+#define FS_CONTROL_INFORMATION_SIZE 48
+#define FS_POSIX_INFORMATION_SIZE 56
+
+/* FS_ATTRIBUTE_File_System_Name */
+#define FS_TYPE_SUPPORT_SIZE 44
+struct fs_type_info {
+ char *fs_name;
+ long magic_number;
+} __packed;
+
+struct smb2_oplock_break {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 24 */
+ __u8 OplockLevel;
+ __u8 Reserved;
+ __le32 Reserved2;
+ __le64 PersistentFid;
+ __le64 VolatileFid;
+} __packed;
+
+#define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01)
+
+struct smb2_lease_break {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 44 */
+ __le16 Epoch;
+ __le32 Flags;
+ __u8 LeaseKey[16];
+ __le32 CurrentLeaseState;
+ __le32 NewLeaseState;
+ __le32 BreakReason;
+ __le32 AccessMaskHint;
+ __le32 ShareMaskHint;
+} __packed;
+
+struct smb2_lease_ack {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 36 */
+ __le16 Reserved;
+ __le32 Flags;
+ __u8 LeaseKey[16];
+ __le32 LeaseState;
+ __le64 LeaseDuration;
+} __packed;
+
+/*
+ * PDU infolevel structure definitions
+ * BB consider moving to a different header
+ */
+
+/* File System Information Classes */
+#define FS_VOLUME_INFORMATION 1 /* Query */
+#define FS_LABEL_INFORMATION 2 /* Set */
+#define FS_SIZE_INFORMATION 3 /* Query */
+#define FS_DEVICE_INFORMATION 4 /* Query */
+#define FS_ATTRIBUTE_INFORMATION 5 /* Query */
+#define FS_CONTROL_INFORMATION 6 /* Query, Set */
+#define FS_FULL_SIZE_INFORMATION 7 /* Query */
+#define FS_OBJECT_ID_INFORMATION 8 /* Query, Set */
+#define FS_DRIVER_PATH_INFORMATION 9 /* Query */
+#define FS_SECTOR_SIZE_INFORMATION 11 /* SMB3 or later. Query */
+#define FS_POSIX_INFORMATION 100 /* SMB3.1.1 POSIX. Query */
+
+struct smb2_fs_full_size_info {
+ __le64 TotalAllocationUnits;
+ __le64 CallerAvailableAllocationUnits;
+ __le64 ActualAvailableAllocationUnits;
+ __le32 SectorsPerAllocationUnit;
+ __le32 BytesPerSector;
+} __packed;
+
+#define SSINFO_FLAGS_ALIGNED_DEVICE 0x00000001
+#define SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE 0x00000002
+#define SSINFO_FLAGS_NO_SEEK_PENALTY 0x00000004
+#define SSINFO_FLAGS_TRIM_ENABLED 0x00000008
+
+/* sector size info struct */
+struct smb3_fs_ss_info {
+ __le32 LogicalBytesPerSector;
+ __le32 PhysicalBytesPerSectorForAtomicity;
+ __le32 PhysicalBytesPerSectorForPerf;
+ __le32 FSEffPhysicalBytesPerSectorForAtomicity;
+ __le32 Flags;
+ __le32 ByteOffsetForSectorAlignment;
+ __le32 ByteOffsetForPartitionAlignment;
+} __packed;
+
+/* File System Control Information */
+struct smb2_fs_control_info {
+ __le64 FreeSpaceStartFiltering;
+ __le64 FreeSpaceThreshold;
+ __le64 FreeSpaceStopFiltering;
+ __le64 DefaultQuotaThreshold;
+ __le64 DefaultQuotaLimit;
+ __le32 FileSystemControlFlags;
+ __le32 Padding;
+} __packed;
+
+/* partial list of QUERY INFO levels */
+#define FILE_DIRECTORY_INFORMATION 1
+#define FILE_FULL_DIRECTORY_INFORMATION 2
+#define FILE_BOTH_DIRECTORY_INFORMATION 3
+#define FILE_BASIC_INFORMATION 4
+#define FILE_STANDARD_INFORMATION 5
+#define FILE_INTERNAL_INFORMATION 6
+#define FILE_EA_INFORMATION 7
+#define FILE_ACCESS_INFORMATION 8
+#define FILE_NAME_INFORMATION 9
+#define FILE_RENAME_INFORMATION 10
+#define FILE_LINK_INFORMATION 11
+#define FILE_NAMES_INFORMATION 12
+#define FILE_DISPOSITION_INFORMATION 13
+#define FILE_POSITION_INFORMATION 14
+#define FILE_FULL_EA_INFORMATION 15
+#define FILE_MODE_INFORMATION 16
+#define FILE_ALIGNMENT_INFORMATION 17
+#define FILE_ALL_INFORMATION 18
+#define FILE_ALLOCATION_INFORMATION 19
+#define FILE_END_OF_FILE_INFORMATION 20
+#define FILE_ALTERNATE_NAME_INFORMATION 21
+#define FILE_STREAM_INFORMATION 22
+#define FILE_PIPE_INFORMATION 23
+#define FILE_PIPE_LOCAL_INFORMATION 24
+#define FILE_PIPE_REMOTE_INFORMATION 25
+#define FILE_MAILSLOT_QUERY_INFORMATION 26
+#define FILE_MAILSLOT_SET_INFORMATION 27
+#define FILE_COMPRESSION_INFORMATION 28
+#define FILE_OBJECT_ID_INFORMATION 29
+/* Number 30 not defined in documents */
+#define FILE_MOVE_CLUSTER_INFORMATION 31
+#define FILE_QUOTA_INFORMATION 32
+#define FILE_REPARSE_POINT_INFORMATION 33
+#define FILE_NETWORK_OPEN_INFORMATION 34
+#define FILE_ATTRIBUTE_TAG_INFORMATION 35
+#define FILE_TRACKING_INFORMATION 36
+#define FILEID_BOTH_DIRECTORY_INFORMATION 37
+#define FILEID_FULL_DIRECTORY_INFORMATION 38
+#define FILE_VALID_DATA_LENGTH_INFORMATION 39
+#define FILE_SHORT_NAME_INFORMATION 40
+#define FILE_SFIO_RESERVE_INFORMATION 44
+#define FILE_SFIO_VOLUME_INFORMATION 45
+#define FILE_HARD_LINK_INFORMATION 46
+#define FILE_NORMALIZED_NAME_INFORMATION 48
+#define FILEID_GLOBAL_TX_DIRECTORY_INFORMATION 50
+#define FILE_STANDARD_LINK_INFORMATION 54
+
+#define OP_BREAK_STRUCT_SIZE_20 24
+#define OP_BREAK_STRUCT_SIZE_21 36
+
+struct smb2_file_access_info {
+ __le32 AccessFlags;
+} __packed;
+
+struct smb2_file_alignment_info {
+ __le32 AlignmentRequirement;
+} __packed;
+
+struct smb2_file_internal_info {
+ __le64 IndexNumber;
+} __packed; /* level 6 Query */
+
+struct smb2_file_rename_info { /* encoding of request for level 10 */
+ __u8 ReplaceIfExists; /* 1 = replace existing target with new */
+ /* 0 = fail if target already exists */
+ __u8 Reserved[7];
+ __u64 RootDirectory; /* MBZ for network operations (per the spec) */
+ __le32 FileNameLength;
+ char FileName[0]; /* New name to be assigned */
+} __packed; /* level 10 Set */
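+
+/*
+ * Illustrative sketch (hypothetical helper, not part of this header): the
+ * SET_INFO payload for a rename is this fixed header followed by the UTF-16
+ * target name, so its total length can be derived from FileNameLength. A
+ * server must still validate that length against the received buffer size.
+ */
+static inline unsigned int
+example_rename_info_size(const struct smb2_file_rename_info *info)
+{
+	return offsetof(struct smb2_file_rename_info, FileName) +
+		le32_to_cpu(info->FileNameLength);
+}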
+
+struct smb2_file_link_info { /* encoding of request for level 11 */
+ __u8 ReplaceIfExists; /* 1 = replace existing link with new */
+ /* 0 = fail if link already exists */
+ __u8 Reserved[7];
+ __u64 RootDirectory; /* MBZ for network operations (per the spec) */
+ __le32 FileNameLength;
+ char FileName[0]; /* Name to be assigned to new link */
+} __packed; /* level 11 Set */
+
+/*
+ * Although this level 18 struct shares its name with the cifs level 0x107
+ * struct, the two differ: level 0x107 has an extra u64 between AccessFlags
+ * and CurrentByteOffset.
+ */
+struct smb2_file_all_info { /* data block encoding of response to level 18 */
+ __le64 CreationTime; /* Beginning of FILE_BASIC_INFO equivalent */
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le32 Attributes;
+ __u32 Pad1; /* End of FILE_BASIC_INFO_INFO equivalent */
+ __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */
+ __le64 EndOfFile; /* size ie offset to first free byte in file */
+ __le32 NumberOfLinks; /* hard links */
+ __u8 DeletePending;
+ __u8 Directory;
+ __u16 Pad2; /* End of FILE_STANDARD_INFO equivalent */
+ __le64 IndexNumber;
+ __le32 EASize;
+ __le32 AccessFlags;
+ __le64 CurrentByteOffset;
+ __le32 Mode;
+ __le32 AlignmentRequirement;
+ __le32 FileNameLength;
+ char FileName[1];
+} __packed; /* level 18 Query */
+
+struct smb2_file_alt_name_info {
+ __le32 FileNameLength;
+ char FileName[0];
+} __packed;
+
+struct smb2_file_stream_info {
+ __le32 NextEntryOffset;
+ __le32 StreamNameLength;
+ __le64 StreamSize;
+ __le64 StreamAllocationSize;
+ char StreamName[0];
+} __packed;
+
+struct smb2_file_eof_info { /* encoding of request for level 20 */
+ __le64 EndOfFile; /* new end of file value */
+} __packed; /* level 20 Set */
+
+struct smb2_file_ntwrk_info {
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le64 AllocationSize;
+ __le64 EndOfFile;
+ __le32 Attributes;
+ __le32 Reserved;
+} __packed;
+
+struct smb2_file_standard_info {
+ __le64 AllocationSize;
+ __le64 EndOfFile;
+ __le32 NumberOfLinks; /* hard links */
+ __u8 DeletePending;
+ __u8 Directory;
+ __le16 Reserved;
+} __packed; /* level 5 Query */
+
+struct smb2_file_ea_info {
+ __le32 EASize;
+} __packed;
+
+struct smb2_file_alloc_info {
+ __le64 AllocationSize;
+} __packed;
+
+struct smb2_file_disposition_info {
+ __u8 DeletePending;
+} __packed;
+
+struct smb2_file_pos_info {
+ __le64 CurrentByteOffset;
+} __packed;
+
+#define FILE_MODE_INFO_MASK cpu_to_le32(0x0000103e)
+
+struct smb2_file_mode_info {
+ __le32 Mode;
+} __packed;
+
+#define COMPRESSION_FORMAT_NONE 0x0000
+#define COMPRESSION_FORMAT_LZNT1 0x0002
+
+struct smb2_file_comp_info {
+ __le64 CompressedFileSize;
+ __le16 CompressionFormat;
+ __u8 CompressionUnitShift;
+ __u8 ChunkShift;
+ __u8 ClusterShift;
+ __u8 Reserved[3];
+} __packed;
+
+struct smb2_file_attr_tag_info {
+ __le32 FileAttributes;
+ __le32 ReparseTag;
+} __packed;
+
+#define SL_RESTART_SCAN 0x00000001
+#define SL_RETURN_SINGLE_ENTRY 0x00000002
+#define SL_INDEX_SPECIFIED 0x00000004
+
+struct smb2_ea_info_req {
+ __le32 NextEntryOffset;
+ __u8 EaNameLength;
+ char name[1];
+} __packed; /* level 15 Query */
+
+struct smb2_ea_info {
+ __le32 NextEntryOffset;
+ __u8 Flags;
+ __u8 EaNameLength;
+ __le16 EaValueLength;
+ char name[1];
+ /* optionally followed by value */
+} __packed; /* level 15 Query */
+
+struct create_ea_buf_req {
+ struct create_context ccontext;
+ __u8 Name[8];
+ struct smb2_ea_info ea;
+} __packed;
+
+struct create_sd_buf_req {
+ struct create_context ccontext;
+ __u8 Name[8];
+ struct smb_ntsd ntsd;
+} __packed;
+
+/* Find File infolevels */
+#define SMB_FIND_FILE_POSIX_INFO 0x064
+
+/* Level 100 query info */
+struct smb311_posix_qinfo {
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le64 EndOfFile;
+ __le64 AllocationSize;
+ __le32 DosAttributes;
+ __le64 Inode;
+ __le32 DeviceId;
+ __le32 Zero;
+ /* beginning of POSIX Create Context Response */
+ __le32 HardLinks;
+ __le32 ReparseTag;
+ __le32 Mode;
+ u8 Sids[];
+ /*
+ * var sized owner SID
+ * var sized group SID
+ * le32 filenamelength
+ * u8 filename[]
+ */
+} __packed;
+
+struct smb2_posix_info {
+ __le32 NextEntryOffset;
+ __u32 Ignored;
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le64 EndOfFile;
+ __le64 AllocationSize;
+ __le32 DosAttributes;
+ __le64 Inode;
+ __le32 DeviceId;
+ __le32 Zero;
+ /* beginning of POSIX Create Context Response */
+ __le32 HardLinks;
+ __le32 ReparseTag;
+ __le32 Mode;
+ u8 SidBuffer[40];
+ __le32 name_len;
+ u8 name[1];
+ /*
+ * var sized owner SID
+ * var sized group SID
+ * le32 filenamelength
+ * u8 filename[]
+ */
+} __packed;
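+
+/*
+ * Illustrative sketch (hypothetical helper, not part of this header): if the
+ * owner and group SIDs together occupy exactly the 40-byte SidBuffer declared
+ * above, the wire size of one entry is the fixed part plus the UTF-16 name.
+ * Real code must account for the variable-length SIDs noted in the comment,
+ * and entries chained via NextEntryOffset are typically 8-byte aligned.
+ */
+static inline unsigned int
+example_posix_info_size(const struct smb2_posix_info *p)
+{
+	return offsetof(struct smb2_posix_info, name) +
+		le32_to_cpu(p->name_len);
+}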
+
+/* functions */
+int init_smb2_0_server(struct ksmbd_conn *conn);
+void init_smb2_1_server(struct ksmbd_conn *conn);
+void init_smb3_0_server(struct ksmbd_conn *conn);
+void init_smb3_02_server(struct ksmbd_conn *conn);
+int init_smb3_11_server(struct ksmbd_conn *conn);
+
+void init_smb2_max_read_size(unsigned int sz);
+void init_smb2_max_write_size(unsigned int sz);
+void init_smb2_max_trans_size(unsigned int sz);
+
+bool is_smb2_neg_cmd(struct ksmbd_work *work);
+bool is_smb2_rsp(struct ksmbd_work *work);
+
+u16 get_smb2_cmd_val(struct ksmbd_work *work);
+void set_smb2_rsp_status(struct ksmbd_work *work, __le32 err);
+int init_smb2_rsp_hdr(struct ksmbd_work *work);
+int smb2_allocate_rsp_buf(struct ksmbd_work *work);
+bool is_chained_smb2_message(struct ksmbd_work *work);
+int init_smb2_neg_rsp(struct ksmbd_work *work);
+void smb2_set_err_rsp(struct ksmbd_work *work);
+int smb2_check_user_session(struct ksmbd_work *work);
+int smb2_get_ksmbd_tcon(struct ksmbd_work *work);
+bool smb2_is_sign_req(struct ksmbd_work *work, unsigned int command);
+int smb2_check_sign_req(struct ksmbd_work *work);
+void smb2_set_sign_rsp(struct ksmbd_work *work);
+int smb3_check_sign_req(struct ksmbd_work *work);
+void smb3_set_sign_rsp(struct ksmbd_work *work);
+int find_matching_smb2_dialect(int start_index, __le16 *cli_dialects,
+ __le16 dialects_count);
+struct file_lock *smb_flock_init(struct file *f);
+int setup_async_work(struct ksmbd_work *work, void (*fn)(void **),
+ void **arg);
+void smb2_send_interim_resp(struct ksmbd_work *work, __le32 status);
+struct channel *lookup_chann_list(struct ksmbd_session *sess,
+ struct ksmbd_conn *conn);
+void smb3_preauth_hash_rsp(struct ksmbd_work *work);
+bool smb3_is_transform_hdr(void *buf);
+int smb3_decrypt_req(struct ksmbd_work *work);
+int smb3_encrypt_resp(struct ksmbd_work *work);
+bool smb3_11_final_sess_setup_resp(struct ksmbd_work *work);
+int smb2_set_rsp_credits(struct ksmbd_work *work);
+
+/* smb2 misc functions */
+int ksmbd_smb2_check_message(struct ksmbd_work *work);
+
+/* smb2 command handlers */
+int smb2_handle_negotiate(struct ksmbd_work *work);
+int smb2_negotiate_request(struct ksmbd_work *work);
+int smb2_sess_setup(struct ksmbd_work *work);
+int smb2_tree_connect(struct ksmbd_work *work);
+int smb2_tree_disconnect(struct ksmbd_work *work);
+int smb2_session_logoff(struct ksmbd_work *work);
+int smb2_open(struct ksmbd_work *work);
+int smb2_query_info(struct ksmbd_work *work);
+int smb2_query_dir(struct ksmbd_work *work);
+int smb2_close(struct ksmbd_work *work);
+int smb2_echo(struct ksmbd_work *work);
+int smb2_set_info(struct ksmbd_work *work);
+int smb2_read(struct ksmbd_work *work);
+int smb2_write(struct ksmbd_work *work);
+int smb2_flush(struct ksmbd_work *work);
+int smb2_cancel(struct ksmbd_work *work);
+int smb2_lock(struct ksmbd_work *work);
+int smb2_ioctl(struct ksmbd_work *work);
+int smb2_oplock_break(struct ksmbd_work *work);
+int smb2_notify(struct ksmbd_work *ksmbd_work);
+
+#endif /* _SMB2PDU_H */
diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c
new file mode 100644
index 000000000000..b108b918ec84
--- /dev/null
+++ b/fs/ksmbd/smb_common.c
@@ -0,0 +1,674 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ * Copyright (C) 2018 Namjae Jeon <linkinjeon@kernel.org>
+ */
+
+#include "smb_common.h"
+#include "server.h"
+#include "misc.h"
+#include "smbstatus.h"
+#include "connection.h"
+#include "ksmbd_work.h"
+#include "mgmt/user_session.h"
+#include "mgmt/user_config.h"
+#include "mgmt/tree_connect.h"
+#include "mgmt/share_config.h"
+
+/* for shortname implementation */
+static const char basechars[43] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_-!@#$%";
+#define MANGLE_BASE (sizeof(basechars) / sizeof(char) - 1)
+#define MAGIC_CHAR '~'
+#define PERIOD '.'
+#define mangle(V) ((char)(basechars[(V) % MANGLE_BASE]))
+#define KSMBD_MIN_SUPPORTED_HEADER_SIZE (sizeof(struct smb2_hdr))
+
+struct smb_protocol {
+ int index;
+ char *name;
+ char *prot;
+ __u16 prot_id;
+};
+
+static struct smb_protocol smb1_protos[] = {
+ {
+ SMB21_PROT,
+ "\2SMB 2.1",
+ "SMB2_10",
+ SMB21_PROT_ID
+ },
+ {
+ SMB2X_PROT,
+ "\2SMB 2.???",
+ "SMB2_22",
+ SMB2X_PROT_ID
+ },
+};
+
+static struct smb_protocol smb2_protos[] = {
+ {
+ SMB21_PROT,
+ "\2SMB 2.1",
+ "SMB2_10",
+ SMB21_PROT_ID
+ },
+ {
+ SMB30_PROT,
+ "\2SMB 3.0",
+ "SMB3_00",
+ SMB30_PROT_ID
+ },
+ {
+ SMB302_PROT,
+ "\2SMB 3.02",
+ "SMB3_02",
+ SMB302_PROT_ID
+ },
+ {
+ SMB311_PROT,
+ "\2SMB 3.1.1",
+ "SMB3_11",
+ SMB311_PROT_ID
+ },
+};
+
+unsigned int ksmbd_server_side_copy_max_chunk_count(void)
+{
+ return 256;
+}
+
+unsigned int ksmbd_server_side_copy_max_chunk_size(void)
+{
+ return (2U << 30) - 1;
+}
+
+unsigned int ksmbd_server_side_copy_max_total_size(void)
+{
+ return (2U << 30) - 1;
+}
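+
+/*
+ * Illustrative sketch (hypothetical helper, not part of this patch): how an
+ * FSCTL_SRV_COPYCHUNK handler could check a parsed request against the three
+ * limits above before copying anything. The caller must already have verified
+ * that the buffer really contains ChunkCount srv_copychunk entries.
+ */
+static bool example_copychunk_within_limits(struct copychunk_ioctl_req *req)
+{
+	unsigned int i, chunks = le32_to_cpu(req->ChunkCount);
+	struct srv_copychunk *chunk = (struct srv_copychunk *)req->Chunks;
+	u64 total = 0;
+
+	if (chunks > ksmbd_server_side_copy_max_chunk_count())
+		return false;
+
+	for (i = 0; i < chunks; i++, chunk++) {
+		u32 len = le32_to_cpu(chunk->Length);
+
+		if (len > ksmbd_server_side_copy_max_chunk_size())
+			return false;
+		total += len;
+	}
+
+	return total <= ksmbd_server_side_copy_max_total_size();
+}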
+
+inline int ksmbd_min_protocol(void)
+{
+ return SMB2_PROT;
+}
+
+inline int ksmbd_max_protocol(void)
+{
+ return SMB311_PROT;
+}
+
+int ksmbd_lookup_protocol_idx(char *str)
+{
+ int offt = ARRAY_SIZE(smb1_protos) - 1;
+ int len = strlen(str);
+
+ while (offt >= 0) {
+ if (!strncmp(str, smb1_protos[offt].prot, len)) {
+ ksmbd_debug(SMB, "selected %s dialect idx = %d\n",
+ smb1_protos[offt].prot, offt);
+ return smb1_protos[offt].index;
+ }
+ offt--;
+ }
+
+ offt = ARRAY_SIZE(smb2_protos) - 1;
+ while (offt >= 0) {
+ if (!strncmp(str, smb2_protos[offt].prot, len)) {
+ ksmbd_debug(SMB, "selected %s dialect idx = %d\n",
+ smb2_protos[offt].prot, offt);
+ return smb2_protos[offt].index;
+ }
+ offt--;
+ }
+ return -1;
+}
+
+/**
+ * ksmbd_verify_smb_message() - check for valid smb2 request header
+ * @work: smb work
+ *
+ * Check for valid smb signature and packet direction (request/response).
+ *
+ * Return: 0 on success, otherwise 1
+ */
+int ksmbd_verify_smb_message(struct ksmbd_work *work)
+{
+ struct smb2_hdr *smb2_hdr = work->request_buf;
+
+ if (smb2_hdr->ProtocolId == SMB2_PROTO_NUMBER)
+ return ksmbd_smb2_check_message(work);
+
+ return 0;
+}
+
+/**
+ * ksmbd_smb_request() - check for valid smb request type
+ * @conn: connection instance
+ *
+ * Return: true on success, otherwise false
+ */
+bool ksmbd_smb_request(struct ksmbd_conn *conn)
+{
+ int type = *(char *)conn->request_buf;
+
+ switch (type) {
+ case RFC1002_SESSION_MESSAGE:
+ /* Regular SMB request */
+ return true;
+ case RFC1002_SESSION_KEEP_ALIVE:
+ ksmbd_debug(SMB, "RFC 1002 session keep alive\n");
+ break;
+ default:
+ ksmbd_debug(SMB, "RFC 1002 unknown request type 0x%x\n", type);
+ }
+
+ return false;
+}
+
+static bool supported_protocol(int idx)
+{
+ if (idx == SMB2X_PROT &&
+ (server_conf.min_protocol >= SMB21_PROT ||
+ server_conf.max_protocol <= SMB311_PROT))
+ return true;
+
+ return (server_conf.min_protocol <= idx &&
+ idx <= server_conf.max_protocol);
+}
+
+static char *next_dialect(char *dialect, int *next_off)
+{
+ dialect = dialect + *next_off;
+ *next_off = strlen(dialect);
+ return dialect;
+}
+
+static int ksmbd_lookup_dialect_by_name(char *cli_dialects, __le16 byte_count)
+{
+ int i, seq_num, bcount, next;
+ char *dialect;
+
+ for (i = ARRAY_SIZE(smb1_protos) - 1; i >= 0; i--) {
+ seq_num = 0;
+ next = 0;
+ dialect = cli_dialects;
+ bcount = le16_to_cpu(byte_count);
+ do {
+ dialect = next_dialect(dialect, &next);
+ ksmbd_debug(SMB, "client requested dialect %s\n",
+ dialect);
+ if (!strcmp(dialect, smb1_protos[i].name)) {
+ if (supported_protocol(smb1_protos[i].index)) {
+ ksmbd_debug(SMB,
+ "selected %s dialect\n",
+ smb1_protos[i].name);
+ if (smb1_protos[i].index == SMB1_PROT)
+ return seq_num;
+ return smb1_protos[i].prot_id;
+ }
+ }
+ seq_num++;
+ bcount -= (++next);
+ } while (bcount > 0);
+ }
+
+ return BAD_PROT_ID;
+}
+
+int ksmbd_lookup_dialect_by_id(__le16 *cli_dialects, __le16 dialects_count)
+{
+ int i;
+ int count;
+
+ for (i = ARRAY_SIZE(smb2_protos) - 1; i >= 0; i--) {
+ count = le16_to_cpu(dialects_count);
+ while (--count >= 0) {
+ ksmbd_debug(SMB, "client requested dialect 0x%x\n",
+ le16_to_cpu(cli_dialects[count]));
+ if (le16_to_cpu(cli_dialects[count]) !=
+ smb2_protos[i].prot_id)
+ continue;
+
+ if (supported_protocol(smb2_protos[i].index)) {
+ ksmbd_debug(SMB, "selected %s dialect\n",
+ smb2_protos[i].name);
+ return smb2_protos[i].prot_id;
+ }
+ }
+ }
+
+ return BAD_PROT_ID;
+}
+
+static int ksmbd_negotiate_smb_dialect(void *buf)
+{
+ __le32 proto;
+
+ proto = ((struct smb2_hdr *)buf)->ProtocolId;
+ if (proto == SMB2_PROTO_NUMBER) {
+ struct smb2_negotiate_req *req;
+
+ req = (struct smb2_negotiate_req *)buf;
+ return ksmbd_lookup_dialect_by_id(req->Dialects,
+ req->DialectCount);
+ }
+
+ proto = *(__le32 *)((struct smb_hdr *)buf)->Protocol;
+ if (proto == SMB1_PROTO_NUMBER) {
+ struct smb_negotiate_req *req;
+
+ req = (struct smb_negotiate_req *)buf;
+ return ksmbd_lookup_dialect_by_name(req->DialectsArray,
+ req->ByteCount);
+ }
+
+ return BAD_PROT_ID;
+}
+
+#define SMB_COM_NEGOTIATE 0x72
+int ksmbd_init_smb_server(struct ksmbd_work *work)
+{
+ struct ksmbd_conn *conn = work->conn;
+
+ if (conn->need_neg == false)
+ return 0;
+
+ init_smb3_11_server(conn);
+
+ if (conn->ops->get_cmd_val(work) != SMB_COM_NEGOTIATE)
+ conn->need_neg = false;
+ return 0;
+}
+
+bool ksmbd_pdu_size_has_room(unsigned int pdu)
+{
+ return (pdu >= KSMBD_MIN_SUPPORTED_HEADER_SIZE - 4);
+}
+
+int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level,
+ struct ksmbd_file *dir,
+ struct ksmbd_dir_info *d_info,
+ char *search_pattern,
+ int (*fn)(struct ksmbd_conn *, int,
+ struct ksmbd_dir_info *,
+ struct user_namespace *,
+ struct ksmbd_kstat *))
+{
+ int i, rc = 0;
+ struct ksmbd_conn *conn = work->conn;
+ struct user_namespace *user_ns = file_mnt_user_ns(dir->filp);
+
+ for (i = 0; i < 2; i++) {
+ struct kstat kstat;
+ struct ksmbd_kstat ksmbd_kstat;
+
+ if (!dir->dot_dotdot[i]) { /* fill dot entry info */
+ if (i == 0) {
+ d_info->name = ".";
+ d_info->name_len = 1;
+ } else {
+ d_info->name = "..";
+ d_info->name_len = 2;
+ }
+
+ if (!match_pattern(d_info->name, d_info->name_len,
+ search_pattern)) {
+ dir->dot_dotdot[i] = 1;
+ continue;
+ }
+
+ ksmbd_kstat.kstat = &kstat;
+ ksmbd_vfs_fill_dentry_attrs(work,
+ user_ns,
+ dir->filp->f_path.dentry->d_parent,
+ &ksmbd_kstat);
+ rc = fn(conn, info_level, d_info,
+ user_ns, &ksmbd_kstat);
+ if (rc)
+ break;
+ if (d_info->out_buf_len <= 0)
+ break;
+
+ dir->dot_dotdot[i] = 1;
+ if (d_info->flags & SMB2_RETURN_SINGLE_ENTRY) {
+ d_info->out_buf_len = 0;
+ break;
+ }
+ }
+ }
+
+ return rc;
+}
+
+/**
+ * ksmbd_extract_shortname() - get shortname from long filename
+ * @conn: connection instance
+ * @longname: source long filename
+ * @shortname: destination short filename
+ *
+ * Return: shortname length or 0 when source long name is '.' or '..'
+ * TODO: Although this function conforms to the 8.3 filename restrictions,
+ * the result differs from what Windows 7 produces; this needs checking.
+ */
+int ksmbd_extract_shortname(struct ksmbd_conn *conn, const char *longname,
+ char *shortname)
+{
+ const char *p;
+ char base[9], extension[4];
+ char out[13] = {0};
+ int baselen = 0;
+ int extlen = 0, len = 0;
+ unsigned int csum = 0;
+ const unsigned char *ptr;
+ bool dot_present = true;
+
+ p = longname;
+ if ((*p == '.') || (!(strcmp(p, "..")))) {
+ /* no mangling required */
+ return 0;
+ }
+
+ p = strrchr(longname, '.');
+ if (p == longname) { /* name starts with a dot */
+ strscpy(extension, "___", sizeof(extension));
+ } else {
+ if (p) {
+ p++;
+ while (*p && extlen < 3) {
+ if (*p != '.')
+ extension[extlen++] = toupper(*p);
+ p++;
+ }
+ extension[extlen] = '\0';
+ } else {
+ dot_present = false;
+ }
+ }
+
+ p = longname;
+ if (*p == '.') {
+ p++;
+ longname++;
+ }
+ while (*p && (baselen < 5)) {
+ if (*p != '.')
+ base[baselen++] = toupper(*p);
+ p++;
+ }
+
+ base[baselen] = MAGIC_CHAR;
+ memcpy(out, base, baselen + 1);
+
+ ptr = longname;
+ len = strlen(longname);
+ for (; len > 0; len--, ptr++)
+ csum += *ptr;
+
+ csum = csum % (MANGLE_BASE * MANGLE_BASE);
+ out[baselen + 1] = mangle(csum / MANGLE_BASE);
+ out[baselen + 2] = mangle(csum);
+ out[baselen + 3] = PERIOD;
+
+ if (dot_present)
+ memcpy(&out[baselen + 4], extension, 4);
+ else
+ out[baselen + 4] = '\0';
+ smbConvertToUTF16((__le16 *)shortname, out, PATH_MAX,
+ conn->local_nls, 0);
+ len = strlen(out) * 2;
+ return len;
+}
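+
+/*
+ * Illustrative usage sketch (hypothetical, not part of this patch): filling
+ * the 8.3 name of a FILE_BOTH_DIRECTORY_INFORMATION entry. For example,
+ * "averylongname.txt" is mangled into something like "AVERY~42.TXT", encoded
+ * as UTF-16 in the 24-byte ShortName field.
+ */
+static void example_fill_short_name(struct ksmbd_conn *conn,
+				    const char *longname,
+				    struct file_both_directory_info *info)
+{
+	char shortname[32] = {0};
+	int len = ksmbd_extract_shortname(conn, longname, shortname);
+
+	info->ShortNameLength = len;	/* 0: "." or "..", nothing mangled */
+	memcpy(info->ShortName, shortname, min_t(int, len, 24));
+}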
+
+static int __smb2_negotiate(struct ksmbd_conn *conn)
+{
+ return (conn->dialect >= SMB20_PROT_ID &&
+ conn->dialect <= SMB311_PROT_ID);
+}
+
+static int smb_handle_negotiate(struct ksmbd_work *work)
+{
+ struct smb_negotiate_rsp *neg_rsp = work->response_buf;
+
+ ksmbd_debug(SMB, "Unsupported SMB protocol\n");
+ neg_rsp->hdr.Status.CifsError = STATUS_INVALID_LOGON_TYPE;
+ return -EINVAL;
+}
+
+int ksmbd_smb_negotiate_common(struct ksmbd_work *work, unsigned int command)
+{
+ struct ksmbd_conn *conn = work->conn;
+ int ret;
+
+ conn->dialect = ksmbd_negotiate_smb_dialect(work->request_buf);
+ ksmbd_debug(SMB, "conn->dialect 0x%x\n", conn->dialect);
+
+ if (command == SMB2_NEGOTIATE_HE) {
+ struct smb2_hdr *smb2_hdr = work->request_buf;
+
+ if (smb2_hdr->ProtocolId != SMB2_PROTO_NUMBER) {
+ ksmbd_debug(SMB, "Downgrade to SMB1 negotiation\n");
+ command = SMB_COM_NEGOTIATE;
+ }
+ }
+
+ if (command == SMB2_NEGOTIATE_HE) {
+ ret = smb2_handle_negotiate(work);
+ init_smb2_neg_rsp(work);
+ return ret;
+ }
+
+ if (command == SMB_COM_NEGOTIATE) {
+ if (__smb2_negotiate(conn)) {
+ conn->need_neg = true;
+ init_smb3_11_server(conn);
+ init_smb2_neg_rsp(work);
+ ksmbd_debug(SMB, "Upgrade to SMB2 negotiation\n");
+ return 0;
+ }
+ return smb_handle_negotiate(work);
+ }
+
+ pr_err("Unknown SMB negotiation command: %u\n", command);
+ return -EINVAL;
+}
+
+enum SHARED_MODE_ERRORS {
+ SHARE_DELETE_ERROR,
+ SHARE_READ_ERROR,
+ SHARE_WRITE_ERROR,
+ FILE_READ_ERROR,
+ FILE_WRITE_ERROR,
+ FILE_DELETE_ERROR,
+};
+
+static const char * const shared_mode_errors[] = {
+ "Current access mode does not permit SHARE_DELETE",
+ "Current access mode does not permit SHARE_READ",
+ "Current access mode does not permit SHARE_WRITE",
+ "Desired access mode does not permit FILE_READ",
+ "Desired access mode does not permit FILE_WRITE",
+ "Desired access mode does not permit FILE_DELETE",
+};
+
+static void smb_shared_mode_error(int error, struct ksmbd_file *prev_fp,
+ struct ksmbd_file *curr_fp)
+{
+ ksmbd_debug(SMB, "%s\n", shared_mode_errors[error]);
+ ksmbd_debug(SMB, "Current mode: 0x%x Desired mode: 0x%x\n",
+ prev_fp->saccess, curr_fp->daccess);
+}
+
+int ksmbd_smb_check_shared_mode(struct file *filp, struct ksmbd_file *curr_fp)
+{
+ int rc = 0;
+ struct ksmbd_file *prev_fp;
+
+ /*
+ * Lookup fp in master fp list, and check desired access and
+ * shared mode between previous open and current open.
+ */
+ read_lock(&curr_fp->f_ci->m_lock);
+ list_for_each_entry(prev_fp, &curr_fp->f_ci->m_fp_list, node) {
+ if (file_inode(filp) != file_inode(prev_fp->filp))
+ continue;
+
+ if (filp == prev_fp->filp)
+ continue;
+
+ if (ksmbd_stream_fd(prev_fp) && ksmbd_stream_fd(curr_fp))
+ if (strcmp(prev_fp->stream.name, curr_fp->stream.name))
+ continue;
+
+ if (prev_fp->attrib_only != curr_fp->attrib_only)
+ continue;
+
+ if (!(prev_fp->saccess & FILE_SHARE_DELETE_LE) &&
+ curr_fp->daccess & FILE_DELETE_LE) {
+ smb_shared_mode_error(SHARE_DELETE_ERROR,
+ prev_fp,
+ curr_fp);
+ rc = -EPERM;
+ break;
+ }
+
+ /*
+ * Only check FILE_SHARE_DELETE if stream opened and
+ * normal file opened.
+ */
+ if (ksmbd_stream_fd(prev_fp) && !ksmbd_stream_fd(curr_fp))
+ continue;
+
+ if (!(prev_fp->saccess & FILE_SHARE_READ_LE) &&
+ curr_fp->daccess & (FILE_EXECUTE_LE | FILE_READ_DATA_LE)) {
+ smb_shared_mode_error(SHARE_READ_ERROR,
+ prev_fp,
+ curr_fp);
+ rc = -EPERM;
+ break;
+ }
+
+ if (!(prev_fp->saccess & FILE_SHARE_WRITE_LE) &&
+ curr_fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE)) {
+ smb_shared_mode_error(SHARE_WRITE_ERROR,
+ prev_fp,
+ curr_fp);
+ rc = -EPERM;
+ break;
+ }
+
+ if (prev_fp->daccess & (FILE_EXECUTE_LE | FILE_READ_DATA_LE) &&
+ !(curr_fp->saccess & FILE_SHARE_READ_LE)) {
+ smb_shared_mode_error(FILE_READ_ERROR,
+ prev_fp,
+ curr_fp);
+ rc = -EPERM;
+ break;
+ }
+
+ if (prev_fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE) &&
+ !(curr_fp->saccess & FILE_SHARE_WRITE_LE)) {
+ smb_shared_mode_error(FILE_WRITE_ERROR,
+ prev_fp,
+ curr_fp);
+ rc = -EPERM;
+ break;
+ }
+
+ if (prev_fp->daccess & FILE_DELETE_LE &&
+ !(curr_fp->saccess & FILE_SHARE_DELETE_LE)) {
+ smb_shared_mode_error(FILE_DELETE_ERROR,
+ prev_fp,
+ curr_fp);
+ rc = -EPERM;
+ break;
+ }
+ }
+ read_unlock(&curr_fp->f_ci->m_lock);
+
+ return rc;
+}
+
+bool is_asterisk(char *p)
+{
+ return p && p[0] == '*';
+}
+
+int ksmbd_override_fsids(struct ksmbd_work *work)
+{
+ struct ksmbd_session *sess = work->sess;
+ struct ksmbd_share_config *share = work->tcon->share_conf;
+ struct cred *cred;
+ struct group_info *gi;
+ unsigned int uid;
+ unsigned int gid;
+
+ uid = user_uid(sess->user);
+ gid = user_gid(sess->user);
+ if (share->force_uid != KSMBD_SHARE_INVALID_UID)
+ uid = share->force_uid;
+ if (share->force_gid != KSMBD_SHARE_INVALID_GID)
+ gid = share->force_gid;
+
+ cred = prepare_kernel_cred(NULL);
+ if (!cred)
+ return -ENOMEM;
+
+ cred->fsuid = make_kuid(current_user_ns(), uid);
+ cred->fsgid = make_kgid(current_user_ns(), gid);
+
+ gi = groups_alloc(0);
+ if (!gi) {
+ abort_creds(cred);
+ return -ENOMEM;
+ }
+ set_groups(cred, gi);
+ put_group_info(gi);
+
+ if (!uid_eq(cred->fsuid, GLOBAL_ROOT_UID))
+ cred->cap_effective = cap_drop_fs_set(cred->cap_effective);
+
+ WARN_ON(work->saved_cred);
+ work->saved_cred = override_creds(cred);
+ if (!work->saved_cred) {
+ abort_creds(cred);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+void ksmbd_revert_fsids(struct ksmbd_work *work)
+{
+ const struct cred *cred;
+
+ WARN_ON(!work->saved_cred);
+
+ cred = current_cred();
+ revert_creds(work->saved_cred);
+ put_cred(cred);
+ work->saved_cred = NULL;
+}
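+
+/*
+ * Illustrative usage sketch (hypothetical, not part of this patch): VFS work
+ * done on behalf of a session is bracketed by the two helpers above so that
+ * the share's force_uid/force_gid apply only while the request is processed.
+ */
+static int example_with_share_creds(struct ksmbd_work *work)
+{
+	int rc = ksmbd_override_fsids(work);
+
+	if (rc)
+		return rc;
+
+	/* ... perform path lookup, open or setattr as the mapped fsuid ... */
+
+	ksmbd_revert_fsids(work);
+	return 0;
+}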
+
+__le32 smb_map_generic_desired_access(__le32 daccess)
+{
+ if (daccess & FILE_GENERIC_READ_LE) {
+ daccess |= cpu_to_le32(GENERIC_READ_FLAGS);
+ daccess &= ~FILE_GENERIC_READ_LE;
+ }
+
+ if (daccess & FILE_GENERIC_WRITE_LE) {
+ daccess |= cpu_to_le32(GENERIC_WRITE_FLAGS);
+ daccess &= ~FILE_GENERIC_WRITE_LE;
+ }
+
+ if (daccess & FILE_GENERIC_EXECUTE_LE) {
+ daccess |= cpu_to_le32(GENERIC_EXECUTE_FLAGS);
+ daccess &= ~FILE_GENERIC_EXECUTE_LE;
+ }
+
+ if (daccess & FILE_GENERIC_ALL_LE) {
+ daccess |= cpu_to_le32(GENERIC_ALL_FLAGS);
+ daccess &= ~FILE_GENERIC_ALL_LE;
+ }
+
+ return daccess;
+}
diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h
new file mode 100644
index 000000000000..eb667d85558e
--- /dev/null
+++ b/fs/ksmbd/smb_common.h
@@ -0,0 +1,542 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#ifndef __SMB_COMMON_H__
+#define __SMB_COMMON_H__
+
+#include <linux/kernel.h>
+
+#include "glob.h"
+#include "nterr.h"
+#include "smb2pdu.h"
+
+/* ksmbd-specific ERRNO */
+#define ESHARE 50000
+
+#define SMB1_PROT 0
+#define SMB2_PROT 1
+#define SMB21_PROT 2
+/* multi-protocol negotiate request */
+#define SMB2X_PROT 3
+#define SMB30_PROT 4
+#define SMB302_PROT 5
+#define SMB311_PROT 6
+#define BAD_PROT 0xFFFF
+
+#define SMB1_VERSION_STRING "1.0"
+#define SMB20_VERSION_STRING "2.0"
+#define SMB21_VERSION_STRING "2.1"
+#define SMB30_VERSION_STRING "3.0"
+#define SMB302_VERSION_STRING "3.02"
+#define SMB311_VERSION_STRING "3.1.1"
+
+/* Dialects */
+#define SMB10_PROT_ID 0x00
+#define SMB20_PROT_ID 0x0202
+#define SMB21_PROT_ID 0x0210
+/* multi-protocol negotiate request */
+#define SMB2X_PROT_ID 0x02FF
+#define SMB30_PROT_ID 0x0300
+#define SMB302_PROT_ID 0x0302
+#define SMB311_PROT_ID 0x0311
+#define BAD_PROT_ID 0xFFFF
+
+#define SMB_ECHO_INTERVAL (60 * HZ)
+
+#define CIFS_DEFAULT_IOSIZE (64 * 1024)
+#define MAX_CIFS_SMALL_BUFFER_SIZE 448 /* big enough for most */
+
+/* RFC 1002 session packet types */
+#define RFC1002_SESSION_MESSAGE 0x00
+#define RFC1002_SESSION_REQUEST 0x81
+#define RFC1002_POSITIVE_SESSION_RESPONSE 0x82
+#define RFC1002_NEGATIVE_SESSION_RESPONSE 0x83
+#define RFC1002_RETARGET_SESSION_RESPONSE 0x84
+#define RFC1002_SESSION_KEEP_ALIVE 0x85
+
+/* Responses when opening a file. */
+#define F_SUPERSEDED 0
+#define F_OPENED 1
+#define F_CREATED 2
+#define F_OVERWRITTEN 3
+
+/*
+ * File Attribute flags
+ */
+#define ATTR_READONLY 0x0001
+#define ATTR_HIDDEN 0x0002
+#define ATTR_SYSTEM 0x0004
+#define ATTR_VOLUME 0x0008
+#define ATTR_DIRECTORY 0x0010
+#define ATTR_ARCHIVE 0x0020
+#define ATTR_DEVICE 0x0040
+#define ATTR_NORMAL 0x0080
+#define ATTR_TEMPORARY 0x0100
+#define ATTR_SPARSE 0x0200
+#define ATTR_REPARSE 0x0400
+#define ATTR_COMPRESSED 0x0800
+#define ATTR_OFFLINE 0x1000
+#define ATTR_NOT_CONTENT_INDEXED 0x2000
+#define ATTR_ENCRYPTED 0x4000
+#define ATTR_POSIX_SEMANTICS 0x01000000
+#define ATTR_BACKUP_SEMANTICS 0x02000000
+#define ATTR_DELETE_ON_CLOSE 0x04000000
+#define ATTR_SEQUENTIAL_SCAN 0x08000000
+#define ATTR_RANDOM_ACCESS 0x10000000
+#define ATTR_NO_BUFFERING 0x20000000
+#define ATTR_WRITE_THROUGH 0x80000000
+
+#define ATTR_READONLY_LE cpu_to_le32(ATTR_READONLY)
+#define ATTR_HIDDEN_LE cpu_to_le32(ATTR_HIDDEN)
+#define ATTR_SYSTEM_LE cpu_to_le32(ATTR_SYSTEM)
+#define ATTR_DIRECTORY_LE cpu_to_le32(ATTR_DIRECTORY)
+#define ATTR_ARCHIVE_LE cpu_to_le32(ATTR_ARCHIVE)
+#define ATTR_NORMAL_LE cpu_to_le32(ATTR_NORMAL)
+#define ATTR_TEMPORARY_LE cpu_to_le32(ATTR_TEMPORARY)
+#define ATTR_SPARSE_FILE_LE cpu_to_le32(ATTR_SPARSE)
+#define ATTR_REPARSE_POINT_LE cpu_to_le32(ATTR_REPARSE)
+#define ATTR_COMPRESSED_LE cpu_to_le32(ATTR_COMPRESSED)
+#define ATTR_OFFLINE_LE cpu_to_le32(ATTR_OFFLINE)
+#define ATTR_NOT_CONTENT_INDEXED_LE cpu_to_le32(ATTR_NOT_CONTENT_INDEXED)
+#define ATTR_ENCRYPTED_LE cpu_to_le32(ATTR_ENCRYPTED)
+#define ATTR_INTEGRITY_STREAML_LE cpu_to_le32(0x00008000)
+#define ATTR_NO_SCRUB_DATA_LE cpu_to_le32(0x00020000)
+#define ATTR_MASK_LE cpu_to_le32(0x00007FB7)
+
+/* List of FileSystemAttributes - see 2.5.1 of MS-FSCC */
+#define FILE_SUPPORTS_SPARSE_VDL 0x10000000 /* faster nonsparse extend */
+#define FILE_SUPPORTS_BLOCK_REFCOUNTING 0x08000000 /* allow ioctl dup extents */
+#define FILE_SUPPORT_INTEGRITY_STREAMS 0x04000000
+#define FILE_SUPPORTS_USN_JOURNAL 0x02000000
+#define FILE_SUPPORTS_OPEN_BY_FILE_ID 0x01000000
+#define FILE_SUPPORTS_EXTENDED_ATTRIBUTES 0x00800000
+#define FILE_SUPPORTS_HARD_LINKS 0x00400000
+#define FILE_SUPPORTS_TRANSACTIONS 0x00200000
+#define FILE_SEQUENTIAL_WRITE_ONCE 0x00100000
+#define FILE_READ_ONLY_VOLUME 0x00080000
+#define FILE_NAMED_STREAMS 0x00040000
+#define FILE_SUPPORTS_ENCRYPTION 0x00020000
+#define FILE_SUPPORTS_OBJECT_IDS 0x00010000
+#define FILE_VOLUME_IS_COMPRESSED 0x00008000
+#define FILE_SUPPORTS_REMOTE_STORAGE 0x00000100
+#define FILE_SUPPORTS_REPARSE_POINTS 0x00000080
+#define FILE_SUPPORTS_SPARSE_FILES 0x00000040
+#define FILE_VOLUME_QUOTAS 0x00000020
+#define FILE_FILE_COMPRESSION 0x00000010
+#define FILE_PERSISTENT_ACLS 0x00000008
+#define FILE_UNICODE_ON_DISK 0x00000004
+#define FILE_CASE_PRESERVED_NAMES 0x00000002
+#define FILE_CASE_SENSITIVE_SEARCH 0x00000001
+
+#define FILE_READ_DATA 0x00000001 /* Data can be read from the file */
+#define FILE_WRITE_DATA 0x00000002 /* Data can be written to the file */
+#define FILE_APPEND_DATA 0x00000004 /* Data can be appended to the file */
+#define FILE_READ_EA 0x00000008 /* Extended attributes associated */
+/* with the file can be read */
+#define FILE_WRITE_EA 0x00000010 /* Extended attributes associated */
+/* with the file can be written */
+#define FILE_EXECUTE 0x00000020 /*Data can be read into memory from */
+/* the file using system paging I/O */
+#define FILE_DELETE_CHILD 0x00000040
+#define FILE_READ_ATTRIBUTES 0x00000080 /* Attributes associated with the */
+/* file can be read */
+#define FILE_WRITE_ATTRIBUTES 0x00000100 /* Attributes associated with the */
+/* file can be written */
+#define DELETE 0x00010000 /* The file can be deleted */
+#define READ_CONTROL 0x00020000 /* The access control list and */
+/* ownership associated with the */
+/* file can be read */
+#define WRITE_DAC 0x00040000 /* The access control list and */
+/* ownership associated with the */
+/* file can be written. */
+#define WRITE_OWNER 0x00080000 /* Ownership information associated */
+/* with the file can be written */
+#define SYNCHRONIZE 0x00100000 /* The file handle can waited on to */
+/* synchronize with the completion */
+/* of an input/output request */
+#define GENERIC_ALL 0x10000000
+#define GENERIC_EXECUTE 0x20000000
+#define GENERIC_WRITE 0x40000000
+#define GENERIC_READ 0x80000000
+/* In summary - Relevant file */
+/* access flags from CIFS are */
+/* file_read_data, file_write_data */
+/* file_execute, file_read_attributes*/
+/* write_dac, and delete. */
+
+#define FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA | FILE_READ_ATTRIBUTES)
+#define FILE_WRITE_RIGHTS (FILE_WRITE_DATA | FILE_APPEND_DATA \
+ | FILE_WRITE_EA | FILE_WRITE_ATTRIBUTES)
+#define FILE_EXEC_RIGHTS (FILE_EXECUTE)
+
+#define SET_FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA \
+ | FILE_READ_ATTRIBUTES \
+ | DELETE | READ_CONTROL | WRITE_DAC \
+ | WRITE_OWNER | SYNCHRONIZE)
+#define SET_FILE_WRITE_RIGHTS (FILE_WRITE_DATA | FILE_APPEND_DATA \
+ | FILE_WRITE_EA \
+ | FILE_DELETE_CHILD \
+ | FILE_WRITE_ATTRIBUTES \
+ | DELETE | READ_CONTROL | WRITE_DAC \
+ | WRITE_OWNER | SYNCHRONIZE)
+#define SET_FILE_EXEC_RIGHTS (FILE_READ_EA | FILE_WRITE_EA | FILE_EXECUTE \
+ | FILE_READ_ATTRIBUTES \
+ | FILE_WRITE_ATTRIBUTES \
+ | DELETE | READ_CONTROL | WRITE_DAC \
+ | WRITE_OWNER | SYNCHRONIZE)
+
+#define SET_MINIMUM_RIGHTS (FILE_READ_EA | FILE_READ_ATTRIBUTES \
+ | READ_CONTROL | SYNCHRONIZE)
+
+/* generic flags for file open */
+#define GENERIC_READ_FLAGS (READ_CONTROL | FILE_READ_DATA | \
+ FILE_READ_ATTRIBUTES | \
+ FILE_READ_EA | SYNCHRONIZE)
+
+#define GENERIC_WRITE_FLAGS (READ_CONTROL | FILE_WRITE_DATA | \
+ FILE_WRITE_ATTRIBUTES | FILE_WRITE_EA | \
+ FILE_APPEND_DATA | SYNCHRONIZE)
+
+#define GENERIC_EXECUTE_FLAGS (READ_CONTROL | FILE_EXECUTE | \
+ FILE_READ_ATTRIBUTES | SYNCHRONIZE)
+
+#define GENERIC_ALL_FLAGS (DELETE | READ_CONTROL | WRITE_DAC | \
+ WRITE_OWNER | SYNCHRONIZE | FILE_READ_DATA | \
+ FILE_WRITE_DATA | FILE_APPEND_DATA | \
+ FILE_READ_EA | FILE_WRITE_EA | \
+ FILE_EXECUTE | FILE_DELETE_CHILD | \
+ FILE_READ_ATTRIBUTES | FILE_WRITE_ATTRIBUTES)
+
+#define SMB1_PROTO_NUMBER cpu_to_le32(0x424d53ff)
+
+#define SMB1_CLIENT_GUID_SIZE (16)
+struct smb_hdr {
+ __be32 smb_buf_length;
+ __u8 Protocol[4];
+ __u8 Command;
+ union {
+ struct {
+ __u8 ErrorClass;
+ __u8 Reserved;
+ __le16 Error;
+ } __packed DosError;
+ __le32 CifsError;
+ } __packed Status;
+ __u8 Flags;
+ __le16 Flags2; /* note: le */
+ __le16 PidHigh;
+ union {
+ struct {
+ __le32 SequenceNumber; /* le */
+ __u32 Reserved; /* zero */
+ } __packed Sequence;
+ __u8 SecuritySignature[8]; /* le */
+ } __packed Signature;
+ __u8 pad[2];
+ __le16 Tid;
+ __le16 Pid;
+ __le16 Uid;
+ __le16 Mid;
+ __u8 WordCount;
+} __packed;
+
+struct smb_negotiate_req {
+ struct smb_hdr hdr; /* wct = 0 */
+ __le16 ByteCount;
+ unsigned char DialectsArray[1];
+} __packed;
+
+struct smb_negotiate_rsp {
+ struct smb_hdr hdr; /* wct = 17 */
+ __le16 DialectIndex; /* 0xFFFF = no dialect acceptable */
+ __u8 SecurityMode;
+ __le16 MaxMpxCount;
+ __le16 MaxNumberVcs;
+ __le32 MaxBufferSize;
+ __le32 MaxRawSize;
+ __le32 SessionKey;
+ __le32 Capabilities; /* see below */
+ __le32 SystemTimeLow;
+ __le32 SystemTimeHigh;
+ __le16 ServerTimeZone;
+ __u8 EncryptionKeyLength;
+ __le16 ByteCount;
+ union {
+ unsigned char EncryptionKey[8]; /* cap extended security off */
+ /* followed by Domain name - if extended security is off */
+ /* followed by 16 bytes of server GUID */
+ /* then security blob if cap_extended_security negotiated */
+ struct {
+ unsigned char GUID[SMB1_CLIENT_GUID_SIZE];
+ unsigned char SecurityBlob[1];
+ } __packed extended_response;
+ } __packed u;
+} __packed;
+
+struct filesystem_attribute_info {
+ __le32 Attributes;
+ __le32 MaxPathNameComponentLength;
+ __le32 FileSystemNameLen;
+ __le16 FileSystemName[1]; /* do not have to save this - get subset? */
+} __packed;
+
+struct filesystem_device_info {
+ __le32 DeviceType;
+ __le32 DeviceCharacteristics;
+} __packed; /* device info level 0x104 */
+
+struct filesystem_vol_info {
+ __le64 VolumeCreationTime;
+ __le32 SerialNumber;
+ __le32 VolumeLabelSize;
+ __le16 Reserved;
+ __le16 VolumeLabel[1];
+} __packed;
+
+struct filesystem_info {
+ __le64 TotalAllocationUnits;
+ __le64 FreeAllocationUnits;
+ __le32 SectorsPerAllocationUnit;
+ __le32 BytesPerSector;
+} __packed; /* size info, level 0x103 */
+
+#define EXTENDED_INFO_MAGIC 0x43667364 /* Cfsd */
+#define STRING_LENGTH 28
+
+struct fs_extended_info {
+ __le32 magic;
+ __le32 version;
+ __le32 release;
+ __u64 rel_date;
+ char version_string[STRING_LENGTH];
+} __packed;
+
+struct object_id_info {
+ char objid[16];
+ struct fs_extended_info extended_info;
+} __packed;
+
+struct file_directory_info {
+ __le32 NextEntryOffset;
+ __u32 FileIndex;
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le64 EndOfFile;
+ __le64 AllocationSize;
+ __le32 ExtFileAttributes;
+ __le32 FileNameLength;
+ char FileName[1];
+} __packed; /* level 0x101 FF resp data */
+
+struct file_names_info {
+ __le32 NextEntryOffset;
+ __u32 FileIndex;
+ __le32 FileNameLength;
+ char FileName[1];
+} __packed; /* level 0xc FF resp data */
+
+struct file_full_directory_info {
+ __le32 NextEntryOffset;
+ __u32 FileIndex;
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le64 EndOfFile;
+ __le64 AllocationSize;
+ __le32 ExtFileAttributes;
+ __le32 FileNameLength;
+ __le32 EaSize;
+ char FileName[1];
+} __packed; /* level 0x102 FF resp */
+
+struct file_both_directory_info {
+ __le32 NextEntryOffset;
+ __u32 FileIndex;
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le64 EndOfFile;
+ __le64 AllocationSize;
+ __le32 ExtFileAttributes;
+ __le32 FileNameLength;
+ __le32 EaSize; /* length of the xattrs */
+ __u8 ShortNameLength;
+ __u8 Reserved;
+ __u8 ShortName[24];
+ char FileName[1];
+} __packed; /* level 0x104 FFrsp data */
+
+struct file_id_both_directory_info {
+ __le32 NextEntryOffset;
+ __u32 FileIndex;
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le64 EndOfFile;
+ __le64 AllocationSize;
+ __le32 ExtFileAttributes;
+ __le32 FileNameLength;
+ __le32 EaSize; /* length of the xattrs */
+ __u8 ShortNameLength;
+ __u8 Reserved;
+ __u8 ShortName[24];
+ __le16 Reserved2;
+ __le64 UniqueId;
+ char FileName[1];
+} __packed;
+
+struct file_id_full_dir_info {
+ __le32 NextEntryOffset;
+ __u32 FileIndex;
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le64 EndOfFile;
+ __le64 AllocationSize;
+ __le32 ExtFileAttributes;
+ __le32 FileNameLength;
+ __le32 EaSize; /* EA size */
+ __le32 Reserved;
+ __le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/
+ char FileName[1];
+} __packed; /* level 0x105 FF rsp data */
+
+struct smb_version_values {
+ char *version_string;
+ __u16 protocol_id;
+ __le16 lock_cmd;
+ __u32 capabilities;
+ __u32 max_read_size;
+ __u32 max_write_size;
+ __u32 max_trans_size;
+ __u32 large_lock_type;
+ __u32 exclusive_lock_type;
+ __u32 shared_lock_type;
+ __u32 unlock_lock_type;
+ size_t header_size;
+ size_t max_header_size;
+ size_t read_rsp_size;
+ unsigned int cap_unix;
+ unsigned int cap_nt_find;
+ unsigned int cap_large_files;
+ __u16 signing_enabled;
+ __u16 signing_required;
+ size_t create_lease_size;
+ size_t create_durable_size;
+ size_t create_durable_v2_size;
+ size_t create_mxac_size;
+ size_t create_disk_id_size;
+ size_t create_posix_size;
+};
+
+struct filesystem_posix_info {
+ /* For undefined recommended transfer size return -1 in that field */
+ __le32 OptimalTransferSize; /* bsize on some os, iosize on other os */
+ __le32 BlockSize;
+	/* The next three fields are in terms of the block size
+	 * (above). If the block size is unknown, 4096 would be a
+	 * reasonable block size for a server to report.
+	 * Note that returning the blocks/blocks avail removes the need
+	 * to make a second call (to QFSInfo level 0x103) to get this info.
+	 * UserBlocksAvail is typically less than or equal to BlocksAvail;
+	 * if no distinction is made, return the same value in each.
+	 */
+ __le64 TotalBlocks;
+ __le64 BlocksAvail; /* bfree */
+ __le64 UserBlocksAvail; /* bavail */
+ /* For undefined Node fields or FSID return -1 */
+ __le64 TotalFileNodes;
+ __le64 FreeFileNodes;
+ __le64 FileSysIdentifier; /* fsid */
+ /* NB Namelen comes from FILE_SYSTEM_ATTRIBUTE_INFO call */
+ /* NB flags can come from FILE_SYSTEM_DEVICE_INFO call */
+} __packed;
+
+struct smb_version_ops {
+ u16 (*get_cmd_val)(struct ksmbd_work *swork);
+ int (*init_rsp_hdr)(struct ksmbd_work *swork);
+ void (*set_rsp_status)(struct ksmbd_work *swork, __le32 err);
+ int (*allocate_rsp_buf)(struct ksmbd_work *work);
+ int (*set_rsp_credits)(struct ksmbd_work *work);
+ int (*check_user_session)(struct ksmbd_work *work);
+ int (*get_ksmbd_tcon)(struct ksmbd_work *work);
+ bool (*is_sign_req)(struct ksmbd_work *work, unsigned int command);
+ int (*check_sign_req)(struct ksmbd_work *work);
+ void (*set_sign_rsp)(struct ksmbd_work *work);
+ int (*generate_signingkey)(struct ksmbd_session *sess, struct ksmbd_conn *conn);
+ int (*generate_encryptionkey)(struct ksmbd_session *sess);
+ bool (*is_transform_hdr)(void *buf);
+ int (*decrypt_req)(struct ksmbd_work *work);
+ int (*encrypt_resp)(struct ksmbd_work *work);
+};
+
+struct smb_version_cmds {
+ int (*proc)(struct ksmbd_work *swork);
+};
+
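+/*
+ * Return the protocol header size minus the 4-byte RFC1001 length field
+ * that is counted in header_size.
+ */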
+static inline size_t
+smb2_hdr_size_no_buflen(struct smb_version_values *vals)
+{
+ return vals->header_size - 4;
+}
+
+int ksmbd_min_protocol(void);
+int ksmbd_max_protocol(void);
+
+int ksmbd_lookup_protocol_idx(char *str);
+
+int ksmbd_verify_smb_message(struct ksmbd_work *work);
+bool ksmbd_smb_request(struct ksmbd_conn *conn);
+
+int ksmbd_lookup_dialect_by_id(__le16 *cli_dialects, __le16 dialects_count);
+
+int ksmbd_init_smb_server(struct ksmbd_work *work);
+
+bool ksmbd_pdu_size_has_room(unsigned int pdu);
+
+struct ksmbd_kstat;
+int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work,
+ int info_level,
+ struct ksmbd_file *dir,
+ struct ksmbd_dir_info *d_info,
+ char *search_pattern,
+ int (*fn)(struct ksmbd_conn *,
+ int,
+ struct ksmbd_dir_info *,
+ struct user_namespace *,
+ struct ksmbd_kstat *));
+
+int ksmbd_extract_shortname(struct ksmbd_conn *conn,
+ const char *longname,
+ char *shortname);
+
+int ksmbd_smb_negotiate_common(struct ksmbd_work *work, unsigned int command);
+
+int ksmbd_smb_check_shared_mode(struct file *filp, struct ksmbd_file *curr_fp);
+int ksmbd_override_fsids(struct ksmbd_work *work);
+void ksmbd_revert_fsids(struct ksmbd_work *work);
+
+unsigned int ksmbd_server_side_copy_max_chunk_count(void);
+unsigned int ksmbd_server_side_copy_max_chunk_size(void);
+unsigned int ksmbd_server_side_copy_max_total_size(void);
+bool is_asterisk(char *p);
+__le32 smb_map_generic_desired_access(__le32 daccess);
+
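+/*
+ * The NetBIOS session header carries the SMB PDU length in the low three
+ * bytes of a big-endian 32-bit field at the start of the buffer; these
+ * helpers read and adjust that length.
+ */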
+static inline unsigned int get_rfc1002_len(void *buf)
+{
+ return be32_to_cpu(*((__be32 *)buf)) & 0xffffff;
+}
+
+static inline void inc_rfc1001_len(void *buf, int count)
+{
+ be32_add_cpu((__be32 *)buf, count);
+}
+#endif /* __SMB_COMMON_H__ */
diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c
new file mode 100644
index 000000000000..5456e3ad943e
--- /dev/null
+++ b/fs/ksmbd/smbacl.c
@@ -0,0 +1,1366 @@
+// SPDX-License-Identifier: LGPL-2.1+
+/*
+ * Copyright (C) International Business Machines Corp., 2007,2008
+ * Author(s): Steve French (sfrench@us.ibm.com)
+ * Copyright (C) 2020 Samsung Electronics Co., Ltd.
+ * Author(s): Namjae Jeon <linkinjeon@kernel.org>
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include "smbacl.h"
+#include "smb_common.h"
+#include "server.h"
+#include "misc.h"
+#include "mgmt/share_config.h"
+
+static const struct smb_sid domain = {1, 4, {0, 0, 0, 0, 0, 5},
+ {cpu_to_le32(21), cpu_to_le32(1), cpu_to_le32(2), cpu_to_le32(3),
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} };
+
+/* security id for creator owner (S-1-3-0) */
+static const struct smb_sid creator_owner = {
+ 1, 1, {0, 0, 0, 0, 0, 3}, {0} };
+/* security id for creator group (S-1-3-1) */
+static const struct smb_sid creator_group = {
+ 1, 1, {0, 0, 0, 0, 0, 3}, {cpu_to_le32(1)} };
+
+/* security id for everyone/world system group */
+static const struct smb_sid sid_everyone = {
+ 1, 1, {0, 0, 0, 0, 0, 1}, {0} };
+/* security id for Authenticated Users system group */
+static const struct smb_sid sid_authusers = {
+ 1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11)} };
+
+/* S-1-22-1 Unmapped Unix users */
+static const struct smb_sid sid_unix_users = {1, 1, {0, 0, 0, 0, 0, 22},
+ {cpu_to_le32(1), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} };
+
+/* S-1-22-2 Unmapped Unix groups */
+static const struct smb_sid sid_unix_groups = { 1, 1, {0, 0, 0, 0, 0, 22},
+ {cpu_to_le32(2), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} };
+
+/*
+ * See http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx
+ */
+
+/* S-1-5-88 MS NFS and Apple style UID/GID/mode */
+
+/* S-1-5-88-1 Unix uid */
+static const struct smb_sid sid_unix_NFS_users = { 1, 2, {0, 0, 0, 0, 0, 5},
+ {cpu_to_le32(88),
+ cpu_to_le32(1), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} };
+
+/* S-1-5-88-2 Unix gid */
+static const struct smb_sid sid_unix_NFS_groups = { 1, 2, {0, 0, 0, 0, 0, 5},
+ {cpu_to_le32(88),
+ cpu_to_le32(2), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} };
+
+/* S-1-5-88-3 Unix mode */
+static const struct smb_sid sid_unix_NFS_mode = { 1, 2, {0, 0, 0, 0, 0, 5},
+ {cpu_to_le32(88),
+ cpu_to_le32(3), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} };
+
+/*
+ * Compare two SIDs (roughly equivalent to a UUID for a user or group).
+ * Returns zero if they match, and a positive or negative value (ordered
+ * like memcmp) if they do not.
+ */
+int compare_sids(const struct smb_sid *ctsid, const struct smb_sid *cwsid)
+{
+ int i;
+ int num_subauth, num_sat, num_saw;
+
+ if (!ctsid || !cwsid)
+ return 1;
+
+ /* compare the revision */
+ if (ctsid->revision != cwsid->revision) {
+ if (ctsid->revision > cwsid->revision)
+ return 1;
+ else
+ return -1;
+ }
+
+ /* compare all of the six auth values */
+ for (i = 0; i < NUM_AUTHS; ++i) {
+ if (ctsid->authority[i] != cwsid->authority[i]) {
+ if (ctsid->authority[i] > cwsid->authority[i])
+ return 1;
+ else
+ return -1;
+ }
+ }
+
+ /* compare all of the subauth values if any */
+ num_sat = ctsid->num_subauth;
+ num_saw = cwsid->num_subauth;
+ num_subauth = num_sat < num_saw ? num_sat : num_saw;
+ if (num_subauth) {
+ for (i = 0; i < num_subauth; ++i) {
+ if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) {
+ if (le32_to_cpu(ctsid->sub_auth[i]) >
+ le32_to_cpu(cwsid->sub_auth[i]))
+ return 1;
+ else
+ return -1;
+ }
+ }
+ }
+
+ return 0; /* sids compare/match */
+}
+
+static void smb_copy_sid(struct smb_sid *dst, const struct smb_sid *src)
+{
+ int i;
+
+ dst->revision = src->revision;
+ dst->num_subauth = min_t(u8, src->num_subauth, SID_MAX_SUB_AUTHORITIES);
+ for (i = 0; i < NUM_AUTHS; ++i)
+ dst->authority[i] = src->authority[i];
+ for (i = 0; i < dst->num_subauth; ++i)
+ dst->sub_auth[i] = src->sub_auth[i];
+}
+
+/*
+ * Convert ACE access flags into POSIX permission bits. The result spans
+ * all of rwxrwxrwx; the caller masks it down to S_IRWXU, S_IRWXG or
+ * S_IRWXO (ie 00700, 00070 or 00007) depending on whose ACE it is.
+ */
+static umode_t access_flags_to_mode(struct smb_fattr *fattr, __le32 ace_flags,
+ int type)
+{
+ __u32 flags = le32_to_cpu(ace_flags);
+ umode_t mode = 0;
+
+ if (flags & GENERIC_ALL) {
+ mode = 0777;
+ ksmbd_debug(SMB, "all perms\n");
+ return mode;
+ }
+
+ if ((flags & GENERIC_READ) || (flags & FILE_READ_RIGHTS))
+ mode = 0444;
+ if ((flags & GENERIC_WRITE) || (flags & FILE_WRITE_RIGHTS)) {
+ mode |= 0222;
+ if (S_ISDIR(fattr->cf_mode))
+ mode |= 0111;
+ }
+ if ((flags & GENERIC_EXECUTE) || (flags & FILE_EXEC_RIGHTS))
+ mode |= 0111;
+
+ if (type == ACCESS_DENIED_ACE_TYPE || type == ACCESS_DENIED_OBJECT_ACE_TYPE)
+ mode = ~mode;
+
+ ksmbd_debug(SMB, "access flags 0x%x mode now %04o\n", flags, mode);
+
+ return mode;
+}
+
+/*
+ * Generate access flags to reflect the permission bits in mode.
+ * This function is called for every ACE in the DACL whose SID matches
+ * either the owner, the group or everyone.
+ */
+static void mode_to_access_flags(umode_t mode, umode_t bits_to_use,
+ __u32 *pace_flags)
+{
+ /* reset access mask */
+ *pace_flags = 0x0;
+
+ /* bits to use are either S_IRWXU or S_IRWXG or S_IRWXO */
+ mode &= bits_to_use;
+
+	/*
+	 * check for R/W/X UGO since we do not know whose flags
+	 * these are, but we have cleared all the bits sans RWX for
+	 * either user, group or other as per bits_to_use
+	 */
+ if (mode & 0444)
+ *pace_flags |= SET_FILE_READ_RIGHTS;
+ if (mode & 0222)
+ *pace_flags |= FILE_WRITE_RIGHTS;
+ if (mode & 0111)
+ *pace_flags |= SET_FILE_EXEC_RIGHTS;
+
+ ksmbd_debug(SMB, "mode: %o, access flags now 0x%x\n",
+ mode, *pace_flags);
+}
+
+static __u16 fill_ace_for_sid(struct smb_ace *pntace,
+ const struct smb_sid *psid, int type, int flags,
+ umode_t mode, umode_t bits)
+{
+ int i;
+ __u16 size = 0;
+ __u32 access_req = 0;
+
+ pntace->type = type;
+ pntace->flags = flags;
+ mode_to_access_flags(mode, bits, &access_req);
+ if (!access_req)
+ access_req = SET_MINIMUM_RIGHTS;
+ pntace->access_req = cpu_to_le32(access_req);
+
+ pntace->sid.revision = psid->revision;
+ pntace->sid.num_subauth = psid->num_subauth;
+ for (i = 0; i < NUM_AUTHS; i++)
+ pntace->sid.authority[i] = psid->authority[i];
+ for (i = 0; i < psid->num_subauth; i++)
+ pntace->sid.sub_auth[i] = psid->sub_auth[i];
+
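+	/*
+	 * ACE size: type(1) + flags(1) + size(2) + access_req(4), plus the
+	 * SID: revision(1) + num_subauth(1) + authority(6) + 4 per subauth.
+	 */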
+ size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth * 4);
+ pntace->size = cpu_to_le16(size);
+
+ return size;
+}
+
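+/*
+ * Build a SID for the given id: start from the base SID selected by
+ * sidtype and, for most types, append the id as the final RID.
+ */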
+void id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid)
+{
+ switch (sidtype) {
+ case SIDOWNER:
+ smb_copy_sid(ssid, &server_conf.domain_sid);
+ break;
+ case SIDUNIX_USER:
+ smb_copy_sid(ssid, &sid_unix_users);
+ break;
+ case SIDUNIX_GROUP:
+ smb_copy_sid(ssid, &sid_unix_groups);
+ break;
+ case SIDCREATOR_OWNER:
+ smb_copy_sid(ssid, &creator_owner);
+ return;
+ case SIDCREATOR_GROUP:
+ smb_copy_sid(ssid, &creator_group);
+ return;
+ case SIDNFS_USER:
+ smb_copy_sid(ssid, &sid_unix_NFS_users);
+ break;
+ case SIDNFS_GROUP:
+ smb_copy_sid(ssid, &sid_unix_NFS_groups);
+ break;
+ case SIDNFS_MODE:
+ smb_copy_sid(ssid, &sid_unix_NFS_mode);
+ break;
+ default:
+ return;
+ }
+
+ /* RID */
+ ssid->sub_auth[ssid->num_subauth] = cpu_to_le32(cid);
+ ssid->num_subauth++;
+}
+
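+/*
+ * Map a SID to a uid or gid: the last sub-authority (the RID) is
+ * translated through the mount's user namespace into fattr->cf_uid
+ * or fattr->cf_gid.
+ */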
+static int sid_to_id(struct user_namespace *user_ns,
+ struct smb_sid *psid, uint sidtype,
+ struct smb_fattr *fattr)
+{
+ int rc = -EINVAL;
+
+ /*
+ * If we have too many subauthorities, then something is really wrong.
+ * Just return an error.
+ */
+ if (unlikely(psid->num_subauth > SID_MAX_SUB_AUTHORITIES)) {
+ pr_err("%s: %u subauthorities is too many!\n",
+ __func__, psid->num_subauth);
+ return -EIO;
+ }
+
+ if (sidtype == SIDOWNER) {
+ kuid_t uid;
+ uid_t id;
+
+ id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]);
+ if (id > 0) {
+ uid = make_kuid(user_ns, id);
+ if (uid_valid(uid) && kuid_has_mapping(user_ns, uid)) {
+ fattr->cf_uid = uid;
+ rc = 0;
+ }
+ }
+ } else {
+ kgid_t gid;
+ gid_t id;
+
+ id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]);
+ if (id > 0) {
+ gid = make_kgid(user_ns, id);
+ if (gid_valid(gid) && kgid_has_mapping(user_ns, gid)) {
+ fattr->cf_gid = gid;
+ rc = 0;
+ }
+ }
+ }
+
+ return rc;
+}
+
+void posix_state_to_acl(struct posix_acl_state *state,
+ struct posix_acl_entry *pace)
+{
+ int i;
+
+ pace->e_tag = ACL_USER_OBJ;
+ pace->e_perm = state->owner.allow;
+ for (i = 0; i < state->users->n; i++) {
+ pace++;
+ pace->e_tag = ACL_USER;
+ pace->e_uid = state->users->aces[i].uid;
+ pace->e_perm = state->users->aces[i].perms.allow;
+ }
+
+ pace++;
+ pace->e_tag = ACL_GROUP_OBJ;
+ pace->e_perm = state->group.allow;
+
+ for (i = 0; i < state->groups->n; i++) {
+ pace++;
+ pace->e_tag = ACL_GROUP;
+ pace->e_gid = state->groups->aces[i].gid;
+ pace->e_perm = state->groups->aces[i].perms.allow;
+ }
+
+ if (state->users->n || state->groups->n) {
+ pace++;
+ pace->e_tag = ACL_MASK;
+ pace->e_perm = state->mask.allow;
+ }
+
+ pace++;
+ pace->e_tag = ACL_OTHER;
+ pace->e_perm = state->other.allow;
+}
+
+int init_acl_state(struct posix_acl_state *state, int cnt)
+{
+ int alloc;
+
+ memset(state, 0, sizeof(struct posix_acl_state));
+ /*
+ * In the worst case, each individual acl could be for a distinct
+ * named user or group, but we don't know which, so we allocate
+ * enough space for either:
+ */
+ alloc = sizeof(struct posix_ace_state_array)
+ + cnt * sizeof(struct posix_user_ace_state);
+ state->users = kzalloc(alloc, GFP_KERNEL);
+ if (!state->users)
+ return -ENOMEM;
+ state->groups = kzalloc(alloc, GFP_KERNEL);
+ if (!state->groups) {
+ kfree(state->users);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+void free_acl_state(struct posix_acl_state *state)
+{
+ kfree(state->users);
+ kfree(state->groups);
+}
+
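+/*
+ * Walk the ACEs of an NT DACL and fold them into POSIX permission bits
+ * and POSIX ACL entries (access and default) in the given smb_fattr.
+ */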
+static void parse_dacl(struct user_namespace *user_ns,
+ struct smb_acl *pdacl, char *end_of_acl,
+ struct smb_sid *pownersid, struct smb_sid *pgrpsid,
+ struct smb_fattr *fattr)
+{
+ int i, ret;
+ int num_aces = 0;
+ int acl_size;
+ char *acl_base;
+ struct smb_ace **ppace;
+ struct posix_acl_entry *cf_pace, *cf_pdace;
+ struct posix_acl_state acl_state, default_acl_state;
+ umode_t mode = 0, acl_mode;
+ bool owner_found = false, group_found = false, others_found = false;
+
+ if (!pdacl)
+ return;
+
+ /* validate that we do not go past end of acl */
+ if (end_of_acl <= (char *)pdacl ||
+ end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) {
+ pr_err("ACL too small to parse DACL\n");
+ return;
+ }
+
+ ksmbd_debug(SMB, "DACL revision %d size %d num aces %d\n",
+ le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size),
+ le32_to_cpu(pdacl->num_aces));
+
+ acl_base = (char *)pdacl;
+ acl_size = sizeof(struct smb_acl);
+
+ num_aces = le32_to_cpu(pdacl->num_aces);
+ if (num_aces <= 0)
+ return;
+
+ if (num_aces > ULONG_MAX / sizeof(struct smb_ace *))
+ return;
+
+ ppace = kmalloc_array(num_aces, sizeof(struct smb_ace *), GFP_KERNEL);
+ if (!ppace)
+ return;
+
+	ret = init_acl_state(&acl_state, num_aces);
+	if (ret) {
+		kfree(ppace);
+		return;
+	}
+	ret = init_acl_state(&default_acl_state, num_aces);
+	if (ret) {
+		free_acl_state(&acl_state);
+		kfree(ppace);
+		return;
+	}
+
+ /*
+ * reset rwx permissions for user/group/other.
+ * Also, if num_aces is 0 i.e. DACL has no ACEs,
+ * user/group/other have no permissions
+ */
+ for (i = 0; i < num_aces; ++i) {
+ ppace[i] = (struct smb_ace *)(acl_base + acl_size);
+ acl_base = (char *)ppace[i];
+ acl_size = le16_to_cpu(ppace[i]->size);
+ ppace[i]->access_req =
+ smb_map_generic_desired_access(ppace[i]->access_req);
+
+ if (!(compare_sids(&ppace[i]->sid, &sid_unix_NFS_mode))) {
+ fattr->cf_mode =
+ le32_to_cpu(ppace[i]->sid.sub_auth[2]);
+ break;
+ } else if (!compare_sids(&ppace[i]->sid, pownersid)) {
+ acl_mode = access_flags_to_mode(fattr,
+ ppace[i]->access_req,
+ ppace[i]->type);
+ acl_mode &= 0700;
+
+ if (!owner_found) {
+ mode &= ~(0700);
+ mode |= acl_mode;
+ }
+ owner_found = true;
+ } else if (!compare_sids(&ppace[i]->sid, pgrpsid) ||
+ ppace[i]->sid.sub_auth[ppace[i]->sid.num_subauth - 1] ==
+ DOMAIN_USER_RID_LE) {
+ acl_mode = access_flags_to_mode(fattr,
+ ppace[i]->access_req,
+ ppace[i]->type);
+ acl_mode &= 0070;
+ if (!group_found) {
+ mode &= ~(0070);
+ mode |= acl_mode;
+ }
+ group_found = true;
+ } else if (!compare_sids(&ppace[i]->sid, &sid_everyone)) {
+ acl_mode = access_flags_to_mode(fattr,
+ ppace[i]->access_req,
+ ppace[i]->type);
+ acl_mode &= 0007;
+ if (!others_found) {
+ mode &= ~(0007);
+ mode |= acl_mode;
+ }
+ others_found = true;
+ } else if (!compare_sids(&ppace[i]->sid, &creator_owner)) {
+ continue;
+ } else if (!compare_sids(&ppace[i]->sid, &creator_group)) {
+ continue;
+ } else if (!compare_sids(&ppace[i]->sid, &sid_authusers)) {
+ continue;
+ } else {
+ struct smb_fattr temp_fattr;
+
+ acl_mode = access_flags_to_mode(fattr, ppace[i]->access_req,
+ ppace[i]->type);
+ temp_fattr.cf_uid = INVALID_UID;
+ ret = sid_to_id(user_ns, &ppace[i]->sid, SIDOWNER, &temp_fattr);
+ if (ret || uid_eq(temp_fattr.cf_uid, INVALID_UID)) {
+ pr_err("%s: Error %d mapping Owner SID to uid\n",
+ __func__, ret);
+ continue;
+ }
+
+ acl_state.owner.allow = ((acl_mode & 0700) >> 6) | 0004;
+ acl_state.users->aces[acl_state.users->n].uid =
+ temp_fattr.cf_uid;
+ acl_state.users->aces[acl_state.users->n++].perms.allow =
+ ((acl_mode & 0700) >> 6) | 0004;
+ default_acl_state.owner.allow = ((acl_mode & 0700) >> 6) | 0004;
+ default_acl_state.users->aces[default_acl_state.users->n].uid =
+ temp_fattr.cf_uid;
+ default_acl_state.users->aces[default_acl_state.users->n++].perms.allow =
+ ((acl_mode & 0700) >> 6) | 0004;
+ }
+ }
+ kfree(ppace);
+
+ if (owner_found) {
+ /* The owner must be set to at least read-only. */
+ acl_state.owner.allow = ((mode & 0700) >> 6) | 0004;
+ acl_state.users->aces[acl_state.users->n].uid = fattr->cf_uid;
+ acl_state.users->aces[acl_state.users->n++].perms.allow =
+ ((mode & 0700) >> 6) | 0004;
+ default_acl_state.owner.allow = ((mode & 0700) >> 6) | 0004;
+ default_acl_state.users->aces[default_acl_state.users->n].uid =
+ fattr->cf_uid;
+ default_acl_state.users->aces[default_acl_state.users->n++].perms.allow =
+ ((mode & 0700) >> 6) | 0004;
+ }
+
+ if (group_found) {
+ acl_state.group.allow = (mode & 0070) >> 3;
+ acl_state.groups->aces[acl_state.groups->n].gid =
+ fattr->cf_gid;
+ acl_state.groups->aces[acl_state.groups->n++].perms.allow =
+ (mode & 0070) >> 3;
+ default_acl_state.group.allow = (mode & 0070) >> 3;
+ default_acl_state.groups->aces[default_acl_state.groups->n].gid =
+ fattr->cf_gid;
+ default_acl_state.groups->aces[default_acl_state.groups->n++].perms.allow =
+ (mode & 0070) >> 3;
+ }
+
+ if (others_found) {
+ fattr->cf_mode &= ~(0007);
+ fattr->cf_mode |= mode & 0007;
+
+ acl_state.other.allow = mode & 0007;
+ default_acl_state.other.allow = mode & 0007;
+ }
+
+ if (acl_state.users->n || acl_state.groups->n) {
+ acl_state.mask.allow = 0x07;
+
+ if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) {
+ fattr->cf_acls =
+ posix_acl_alloc(acl_state.users->n +
+ acl_state.groups->n + 4, GFP_KERNEL);
+ if (fattr->cf_acls) {
+ cf_pace = fattr->cf_acls->a_entries;
+ posix_state_to_acl(&acl_state, cf_pace);
+ }
+ }
+ }
+
+ if (default_acl_state.users->n || default_acl_state.groups->n) {
+ default_acl_state.mask.allow = 0x07;
+
+ if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) {
+ fattr->cf_dacls =
+ posix_acl_alloc(default_acl_state.users->n +
+ default_acl_state.groups->n + 4, GFP_KERNEL);
+ if (fattr->cf_dacls) {
+ cf_pdace = fattr->cf_dacls->a_entries;
+ posix_state_to_acl(&default_acl_state, cf_pdace);
+ }
+ }
+ }
+ free_acl_state(&acl_state);
+ free_acl_state(&default_acl_state);
+}
+
+static void set_posix_acl_entries_dacl(struct user_namespace *user_ns,
+ struct smb_ace *pndace,
+ struct smb_fattr *fattr, u32 *num_aces,
+ u16 *size, u32 nt_aces_num)
+{
+ struct posix_acl_entry *pace;
+ struct smb_sid *sid;
+ struct smb_ace *ntace;
+ int i, j;
+
+ if (!fattr->cf_acls)
+ goto posix_default_acl;
+
+ pace = fattr->cf_acls->a_entries;
+ for (i = 0; i < fattr->cf_acls->a_count; i++, pace++) {
+ int flags = 0;
+
+ sid = kmalloc(sizeof(struct smb_sid), GFP_KERNEL);
+ if (!sid)
+ break;
+
+ if (pace->e_tag == ACL_USER) {
+ uid_t uid;
+ unsigned int sid_type = SIDOWNER;
+
+ uid = from_kuid(user_ns, pace->e_uid);
+ if (!uid)
+ sid_type = SIDUNIX_USER;
+ id_to_sid(uid, sid_type, sid);
+ } else if (pace->e_tag == ACL_GROUP) {
+ gid_t gid;
+
+ gid = from_kgid(user_ns, pace->e_gid);
+ id_to_sid(gid, SIDUNIX_GROUP, sid);
+ } else if (pace->e_tag == ACL_OTHER && !nt_aces_num) {
+ smb_copy_sid(sid, &sid_everyone);
+ } else {
+ kfree(sid);
+ continue;
+ }
+ ntace = pndace;
+ for (j = 0; j < nt_aces_num; j++) {
+ if (ntace->sid.sub_auth[ntace->sid.num_subauth - 1] ==
+ sid->sub_auth[sid->num_subauth - 1])
+ goto pass_same_sid;
+ ntace = (struct smb_ace *)((char *)ntace +
+ le16_to_cpu(ntace->size));
+ }
+
+ if (S_ISDIR(fattr->cf_mode) && pace->e_tag == ACL_OTHER)
+ flags = 0x03;
+
+ ntace = (struct smb_ace *)((char *)pndace + *size);
+ *size += fill_ace_for_sid(ntace, sid, ACCESS_ALLOWED, flags,
+ pace->e_perm, 0777);
+ (*num_aces)++;
+ if (pace->e_tag == ACL_USER)
+ ntace->access_req |=
+ FILE_DELETE_LE | FILE_DELETE_CHILD_LE;
+
+ if (S_ISDIR(fattr->cf_mode) &&
+ (pace->e_tag == ACL_USER || pace->e_tag == ACL_GROUP)) {
+ ntace = (struct smb_ace *)((char *)pndace + *size);
+ *size += fill_ace_for_sid(ntace, sid, ACCESS_ALLOWED,
+ 0x03, pace->e_perm, 0777);
+ (*num_aces)++;
+ if (pace->e_tag == ACL_USER)
+ ntace->access_req |=
+ FILE_DELETE_LE | FILE_DELETE_CHILD_LE;
+ }
+
+pass_same_sid:
+ kfree(sid);
+ }
+
+ if (nt_aces_num)
+ return;
+
+posix_default_acl:
+ if (!fattr->cf_dacls)
+ return;
+
+ pace = fattr->cf_dacls->a_entries;
+ for (i = 0; i < fattr->cf_dacls->a_count; i++, pace++) {
+ sid = kmalloc(sizeof(struct smb_sid), GFP_KERNEL);
+ if (!sid)
+ break;
+
+ if (pace->e_tag == ACL_USER) {
+ uid_t uid;
+
+ uid = from_kuid(user_ns, pace->e_uid);
+ id_to_sid(uid, SIDCREATOR_OWNER, sid);
+ } else if (pace->e_tag == ACL_GROUP) {
+ gid_t gid;
+
+ gid = from_kgid(user_ns, pace->e_gid);
+ id_to_sid(gid, SIDCREATOR_GROUP, sid);
+ } else {
+ kfree(sid);
+ continue;
+ }
+
+ ntace = (struct smb_ace *)((char *)pndace + *size);
+ *size += fill_ace_for_sid(ntace, sid, ACCESS_ALLOWED, 0x0b,
+ pace->e_perm, 0777);
+ (*num_aces)++;
+ if (pace->e_tag == ACL_USER)
+ ntace->access_req |=
+ FILE_DELETE_LE | FILE_DELETE_CHILD_LE;
+ kfree(sid);
+ }
+}
+
+static void set_ntacl_dacl(struct user_namespace *user_ns,
+ struct smb_acl *pndacl,
+ struct smb_acl *nt_dacl,
+ const struct smb_sid *pownersid,
+ const struct smb_sid *pgrpsid,
+ struct smb_fattr *fattr)
+{
+ struct smb_ace *ntace, *pndace;
+ int nt_num_aces = le32_to_cpu(nt_dacl->num_aces), num_aces = 0;
+ unsigned short size = 0;
+ int i;
+
+ pndace = (struct smb_ace *)((char *)pndacl + sizeof(struct smb_acl));
+ if (nt_num_aces) {
+ ntace = (struct smb_ace *)((char *)nt_dacl + sizeof(struct smb_acl));
+ for (i = 0; i < nt_num_aces; i++) {
+ memcpy((char *)pndace + size, ntace, le16_to_cpu(ntace->size));
+ size += le16_to_cpu(ntace->size);
+ ntace = (struct smb_ace *)((char *)ntace + le16_to_cpu(ntace->size));
+ num_aces++;
+ }
+ }
+
+ set_posix_acl_entries_dacl(user_ns, pndace, fattr,
+ &num_aces, &size, nt_num_aces);
+ pndacl->num_aces = cpu_to_le32(num_aces);
+ pndacl->size = cpu_to_le16(le16_to_cpu(pndacl->size) + size);
+}
+
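+/*
+ * Build a DACL that expresses the POSIX mode bits as allow ACEs for the
+ * owner, group and everyone (plus creator owner/group ACEs for
+ * directories), or from the cached POSIX ACLs when present.
+ */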
+static void set_mode_dacl(struct user_namespace *user_ns,
+ struct smb_acl *pndacl, struct smb_fattr *fattr)
+{
+ struct smb_ace *pace, *pndace;
+ u32 num_aces = 0;
+ u16 size = 0, ace_size = 0;
+ uid_t uid;
+ const struct smb_sid *sid;
+
+ pace = pndace = (struct smb_ace *)((char *)pndacl + sizeof(struct smb_acl));
+
+ if (fattr->cf_acls) {
+ set_posix_acl_entries_dacl(user_ns, pndace, fattr,
+ &num_aces, &size, num_aces);
+ goto out;
+ }
+
+ /* owner RID */
+ uid = from_kuid(user_ns, fattr->cf_uid);
+ if (uid)
+ sid = &server_conf.domain_sid;
+ else
+ sid = &sid_unix_users;
+ ace_size = fill_ace_for_sid(pace, sid, ACCESS_ALLOWED, 0,
+ fattr->cf_mode, 0700);
+ pace->sid.sub_auth[pace->sid.num_subauth++] = cpu_to_le32(uid);
+ pace->size = cpu_to_le16(ace_size + 4);
+ size += le16_to_cpu(pace->size);
+ pace = (struct smb_ace *)((char *)pndace + size);
+
+ /* Group RID */
+ ace_size = fill_ace_for_sid(pace, &sid_unix_groups,
+ ACCESS_ALLOWED, 0, fattr->cf_mode, 0070);
+ pace->sid.sub_auth[pace->sid.num_subauth++] =
+ cpu_to_le32(from_kgid(user_ns, fattr->cf_gid));
+ pace->size = cpu_to_le16(ace_size + 4);
+ size += le16_to_cpu(pace->size);
+ pace = (struct smb_ace *)((char *)pndace + size);
+ num_aces = 3;
+
+ if (S_ISDIR(fattr->cf_mode)) {
+ pace = (struct smb_ace *)((char *)pndace + size);
+
+ /* creator owner */
+ size += fill_ace_for_sid(pace, &creator_owner, ACCESS_ALLOWED,
+ 0x0b, fattr->cf_mode, 0700);
+ pace = (struct smb_ace *)((char *)pndace + size);
+
+ /* creator group */
+ size += fill_ace_for_sid(pace, &creator_group, ACCESS_ALLOWED,
+ 0x0b, fattr->cf_mode, 0070);
+ pace = (struct smb_ace *)((char *)pndace + size);
+ num_aces = 5;
+ }
+
+ /* other */
+ size += fill_ace_for_sid(pace, &sid_everyone, ACCESS_ALLOWED, 0,
+ fattr->cf_mode, 0007);
+
+out:
+ pndacl->num_aces = cpu_to_le32(num_aces);
+ pndacl->size = cpu_to_le16(le16_to_cpu(pndacl->size) + size);
+}
+
+static int parse_sid(struct smb_sid *psid, char *end_of_acl)
+{
+	/*
+	 * validate that we do not go past end of ACL - sid must be at least 8
+	 * bytes long (assuming no sub-auths - e.g. the null SID)
+	 */
+ if (end_of_acl < (char *)psid + 8) {
+ pr_err("ACL too small to parse SID %p\n", psid);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* Convert CIFS ACL to POSIX form */
+int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd,
+ int acl_len, struct smb_fattr *fattr)
+{
+ int rc = 0;
+ struct smb_sid *owner_sid_ptr, *group_sid_ptr;
+ struct smb_acl *dacl_ptr; /* no need for SACL ptr */
+ char *end_of_acl = ((char *)pntsd) + acl_len;
+ __u32 dacloffset;
+ int pntsd_type;
+
+ if (!pntsd)
+ return -EIO;
+
+ owner_sid_ptr = (struct smb_sid *)((char *)pntsd +
+ le32_to_cpu(pntsd->osidoffset));
+ group_sid_ptr = (struct smb_sid *)((char *)pntsd +
+ le32_to_cpu(pntsd->gsidoffset));
+ dacloffset = le32_to_cpu(pntsd->dacloffset);
+ dacl_ptr = (struct smb_acl *)((char *)pntsd + dacloffset);
+ ksmbd_debug(SMB,
+ "revision %d type 0x%x ooffset 0x%x goffset 0x%x sacloffset 0x%x dacloffset 0x%x\n",
+ pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset),
+ le32_to_cpu(pntsd->gsidoffset),
+ le32_to_cpu(pntsd->sacloffset), dacloffset);
+
+ pntsd_type = le16_to_cpu(pntsd->type);
+ if (!(pntsd_type & DACL_PRESENT)) {
+ ksmbd_debug(SMB, "DACL_PRESENT in DACL type is not set\n");
+ return rc;
+ }
+
+ pntsd->type = cpu_to_le16(DACL_PRESENT);
+
+ if (pntsd->osidoffset) {
+ rc = parse_sid(owner_sid_ptr, end_of_acl);
+ if (rc) {
+ pr_err("%s: Error %d parsing Owner SID\n", __func__, rc);
+ return rc;
+ }
+
+ rc = sid_to_id(user_ns, owner_sid_ptr, SIDOWNER, fattr);
+ if (rc) {
+ pr_err("%s: Error %d mapping Owner SID to uid\n",
+ __func__, rc);
+ owner_sid_ptr = NULL;
+ }
+ }
+
+ if (pntsd->gsidoffset) {
+ rc = parse_sid(group_sid_ptr, end_of_acl);
+ if (rc) {
+			pr_err("%s: Error %d parsing Group SID\n",
+ __func__, rc);
+ return rc;
+ }
+ rc = sid_to_id(user_ns, group_sid_ptr, SIDUNIX_GROUP, fattr);
+ if (rc) {
+ pr_err("%s: Error %d mapping Group SID to gid\n",
+ __func__, rc);
+ group_sid_ptr = NULL;
+ }
+ }
+
+ if ((pntsd_type & (DACL_AUTO_INHERITED | DACL_AUTO_INHERIT_REQ)) ==
+ (DACL_AUTO_INHERITED | DACL_AUTO_INHERIT_REQ))
+ pntsd->type |= cpu_to_le16(DACL_AUTO_INHERITED);
+ if (pntsd_type & DACL_PROTECTED)
+ pntsd->type |= cpu_to_le16(DACL_PROTECTED);
+
+ if (dacloffset) {
+ parse_dacl(user_ns, dacl_ptr, end_of_acl,
+ owner_sid_ptr, group_sid_ptr, fattr);
+ }
+
+ return 0;
+}
+
+/* Convert permission bits from mode to equivalent CIFS ACL */
+int build_sec_desc(struct user_namespace *user_ns,
+ struct smb_ntsd *pntsd, struct smb_ntsd *ppntsd,
+ int addition_info, __u32 *secdesclen,
+ struct smb_fattr *fattr)
+{
+ int rc = 0;
+ __u32 offset;
+ struct smb_sid *owner_sid_ptr, *group_sid_ptr;
+ struct smb_sid *nowner_sid_ptr, *ngroup_sid_ptr;
+ struct smb_acl *dacl_ptr = NULL; /* no need for SACL ptr */
+ uid_t uid;
+ gid_t gid;
+ unsigned int sid_type = SIDOWNER;
+
+ nowner_sid_ptr = kmalloc(sizeof(struct smb_sid), GFP_KERNEL);
+ if (!nowner_sid_ptr)
+ return -ENOMEM;
+
+ uid = from_kuid(user_ns, fattr->cf_uid);
+ if (!uid)
+ sid_type = SIDUNIX_USER;
+ id_to_sid(uid, sid_type, nowner_sid_ptr);
+
+ ngroup_sid_ptr = kmalloc(sizeof(struct smb_sid), GFP_KERNEL);
+ if (!ngroup_sid_ptr) {
+ kfree(nowner_sid_ptr);
+ return -ENOMEM;
+ }
+
+ gid = from_kgid(user_ns, fattr->cf_gid);
+ id_to_sid(gid, SIDUNIX_GROUP, ngroup_sid_ptr);
+
+ offset = sizeof(struct smb_ntsd);
+ pntsd->sacloffset = 0;
+ pntsd->revision = cpu_to_le16(1);
+ pntsd->type = cpu_to_le16(SELF_RELATIVE);
+ if (ppntsd)
+ pntsd->type |= ppntsd->type;
+
+ if (addition_info & OWNER_SECINFO) {
+ pntsd->osidoffset = cpu_to_le32(offset);
+ owner_sid_ptr = (struct smb_sid *)((char *)pntsd + offset);
+ smb_copy_sid(owner_sid_ptr, nowner_sid_ptr);
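+		/* SID size: revision(1) + num_subauth(1) + authority(6) + 4 per subauth */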
+ offset += 1 + 1 + 6 + (nowner_sid_ptr->num_subauth * 4);
+ }
+
+ if (addition_info & GROUP_SECINFO) {
+ pntsd->gsidoffset = cpu_to_le32(offset);
+ group_sid_ptr = (struct smb_sid *)((char *)pntsd + offset);
+ smb_copy_sid(group_sid_ptr, ngroup_sid_ptr);
+ offset += 1 + 1 + 6 + (ngroup_sid_ptr->num_subauth * 4);
+ }
+
+ if (addition_info & DACL_SECINFO) {
+ pntsd->type |= cpu_to_le16(DACL_PRESENT);
+ dacl_ptr = (struct smb_acl *)((char *)pntsd + offset);
+ dacl_ptr->revision = cpu_to_le16(2);
+ dacl_ptr->size = cpu_to_le16(sizeof(struct smb_acl));
+ dacl_ptr->num_aces = 0;
+
+ if (!ppntsd) {
+ set_mode_dacl(user_ns, dacl_ptr, fattr);
+ } else if (!ppntsd->dacloffset) {
+ goto out;
+ } else {
+ struct smb_acl *ppdacl_ptr;
+
+ ppdacl_ptr = (struct smb_acl *)((char *)ppntsd +
+ le32_to_cpu(ppntsd->dacloffset));
+ set_ntacl_dacl(user_ns, dacl_ptr, ppdacl_ptr,
+ nowner_sid_ptr, ngroup_sid_ptr, fattr);
+ }
+ pntsd->dacloffset = cpu_to_le32(offset);
+ offset += le16_to_cpu(dacl_ptr->size);
+ }
+
+out:
+ kfree(nowner_sid_ptr);
+ kfree(ngroup_sid_ptr);
+ *secdesclen = offset;
+ return rc;
+}
+
+static void smb_set_ace(struct smb_ace *ace, const struct smb_sid *sid, u8 type,
+ u8 flags, __le32 access_req)
+{
+ ace->type = type;
+ ace->flags = flags;
+ ace->access_req = access_req;
+ smb_copy_sid(&ace->sid, sid);
+ ace->size = cpu_to_le16(1 + 1 + 2 + 4 + 1 + 1 + 6 + (sid->num_subauth * 4));
+}
+
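+/*
+ * Build and store a security descriptor for a new file or directory by
+ * copying the inheritable ACEs from the parent directory's stored
+ * security descriptor xattr.
+ */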
+int smb_inherit_dacl(struct ksmbd_conn *conn,
+ struct path *path,
+ unsigned int uid, unsigned int gid)
+{
+ const struct smb_sid *psid, *creator = NULL;
+ struct smb_ace *parent_aces, *aces;
+ struct smb_acl *parent_pdacl;
+ struct smb_ntsd *parent_pntsd = NULL;
+ struct smb_sid owner_sid, group_sid;
+ struct dentry *parent = path->dentry->d_parent;
+ struct user_namespace *user_ns = mnt_user_ns(path->mnt);
+ int inherited_flags = 0, flags = 0, i, ace_cnt = 0, nt_size = 0;
+ int rc = 0, num_aces, dacloffset, pntsd_type, acl_len;
+ char *aces_base;
+ bool is_dir = S_ISDIR(d_inode(path->dentry)->i_mode);
+
+ acl_len = ksmbd_vfs_get_sd_xattr(conn, user_ns,
+ parent, &parent_pntsd);
+ if (acl_len <= 0)
+ return -ENOENT;
+ dacloffset = le32_to_cpu(parent_pntsd->dacloffset);
+ if (!dacloffset) {
+ rc = -EINVAL;
+ goto free_parent_pntsd;
+ }
+
+ parent_pdacl = (struct smb_acl *)((char *)parent_pntsd + dacloffset);
+ num_aces = le32_to_cpu(parent_pdacl->num_aces);
+ pntsd_type = le16_to_cpu(parent_pntsd->type);
+
+ aces_base = kmalloc(sizeof(struct smb_ace) * num_aces * 2, GFP_KERNEL);
+ if (!aces_base) {
+ rc = -ENOMEM;
+ goto free_parent_pntsd;
+ }
+
+ aces = (struct smb_ace *)aces_base;
+ parent_aces = (struct smb_ace *)((char *)parent_pdacl +
+ sizeof(struct smb_acl));
+
+ if (pntsd_type & DACL_AUTO_INHERITED)
+ inherited_flags = INHERITED_ACE;
+
+ for (i = 0; i < num_aces; i++) {
+ flags = parent_aces->flags;
+ if (!smb_inherit_flags(flags, is_dir))
+ goto pass;
+ if (is_dir) {
+ flags &= ~(INHERIT_ONLY_ACE | INHERITED_ACE);
+ if (!(flags & CONTAINER_INHERIT_ACE))
+ flags |= INHERIT_ONLY_ACE;
+ if (flags & NO_PROPAGATE_INHERIT_ACE)
+ flags = 0;
+ } else {
+ flags = 0;
+ }
+
+ if (!compare_sids(&creator_owner, &parent_aces->sid)) {
+ creator = &creator_owner;
+ id_to_sid(uid, SIDOWNER, &owner_sid);
+ psid = &owner_sid;
+ } else if (!compare_sids(&creator_group, &parent_aces->sid)) {
+ creator = &creator_group;
+ id_to_sid(gid, SIDUNIX_GROUP, &group_sid);
+ psid = &group_sid;
+ } else {
+ creator = NULL;
+ psid = &parent_aces->sid;
+ }
+
+ if (is_dir && creator && flags & CONTAINER_INHERIT_ACE) {
+ smb_set_ace(aces, psid, parent_aces->type, inherited_flags,
+ parent_aces->access_req);
+ nt_size += le16_to_cpu(aces->size);
+ ace_cnt++;
+ aces = (struct smb_ace *)((char *)aces + le16_to_cpu(aces->size));
+ flags |= INHERIT_ONLY_ACE;
+ psid = creator;
+ } else if (is_dir && !(parent_aces->flags & NO_PROPAGATE_INHERIT_ACE)) {
+ psid = &parent_aces->sid;
+ }
+
+ smb_set_ace(aces, psid, parent_aces->type, flags | inherited_flags,
+ parent_aces->access_req);
+ nt_size += le16_to_cpu(aces->size);
+ aces = (struct smb_ace *)((char *)aces + le16_to_cpu(aces->size));
+ ace_cnt++;
+pass:
+ parent_aces =
+ (struct smb_ace *)((char *)parent_aces + le16_to_cpu(parent_aces->size));
+ }
+
+ if (nt_size > 0) {
+ struct smb_ntsd *pntsd;
+ struct smb_acl *pdacl;
+ struct smb_sid *powner_sid = NULL, *pgroup_sid = NULL;
+ int powner_sid_size = 0, pgroup_sid_size = 0, pntsd_size;
+
+ if (parent_pntsd->osidoffset) {
+ powner_sid = (struct smb_sid *)((char *)parent_pntsd +
+ le32_to_cpu(parent_pntsd->osidoffset));
+ powner_sid_size = 1 + 1 + 6 + (powner_sid->num_subauth * 4);
+ }
+ if (parent_pntsd->gsidoffset) {
+ pgroup_sid = (struct smb_sid *)((char *)parent_pntsd +
+ le32_to_cpu(parent_pntsd->gsidoffset));
+ pgroup_sid_size = 1 + 1 + 6 + (pgroup_sid->num_subauth * 4);
+ }
+
+ pntsd = kzalloc(sizeof(struct smb_ntsd) + powner_sid_size +
+ pgroup_sid_size + sizeof(struct smb_acl) +
+ nt_size, GFP_KERNEL);
+ if (!pntsd) {
+ rc = -ENOMEM;
+ goto free_aces_base;
+ }
+
+ pntsd->revision = cpu_to_le16(1);
+ pntsd->type = cpu_to_le16(SELF_RELATIVE | DACL_PRESENT);
+ if (le16_to_cpu(parent_pntsd->type) & DACL_AUTO_INHERITED)
+ pntsd->type |= cpu_to_le16(DACL_AUTO_INHERITED);
+ pntsd_size = sizeof(struct smb_ntsd);
+ pntsd->osidoffset = parent_pntsd->osidoffset;
+ pntsd->gsidoffset = parent_pntsd->gsidoffset;
+ pntsd->dacloffset = parent_pntsd->dacloffset;
+
+ if (pntsd->osidoffset) {
+ struct smb_sid *owner_sid = (struct smb_sid *)((char *)pntsd +
+ le32_to_cpu(pntsd->osidoffset));
+ memcpy(owner_sid, powner_sid, powner_sid_size);
+ pntsd_size += powner_sid_size;
+ }
+
+ if (pntsd->gsidoffset) {
+ struct smb_sid *group_sid = (struct smb_sid *)((char *)pntsd +
+ le32_to_cpu(pntsd->gsidoffset));
+ memcpy(group_sid, pgroup_sid, pgroup_sid_size);
+ pntsd_size += pgroup_sid_size;
+ }
+
+ if (pntsd->dacloffset) {
+ struct smb_ace *pace;
+
+ pdacl = (struct smb_acl *)((char *)pntsd + le32_to_cpu(pntsd->dacloffset));
+ pdacl->revision = cpu_to_le16(2);
+ pdacl->size = cpu_to_le16(sizeof(struct smb_acl) + nt_size);
+ pdacl->num_aces = cpu_to_le32(ace_cnt);
+ pace = (struct smb_ace *)((char *)pdacl + sizeof(struct smb_acl));
+ memcpy(pace, aces_base, nt_size);
+ pntsd_size += sizeof(struct smb_acl) + nt_size;
+ }
+
+ ksmbd_vfs_set_sd_xattr(conn, user_ns,
+ path->dentry, pntsd, pntsd_size);
+ kfree(pntsd);
+ }
+
+free_aces_base:
+ kfree(aces_base);
+free_parent_pntsd:
+ kfree(parent_pntsd);
+ return rc;
+}
+
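+/*
+ * Return true if an ACE with these inheritance flags should be
+ * propagated to a child object of the given kind (file or directory).
+ */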
+bool smb_inherit_flags(int flags, bool is_dir)
+{
+ if (!is_dir)
+ return (flags & OBJECT_INHERIT_ACE) != 0;
+
+ if (flags & OBJECT_INHERIT_ACE && !(flags & NO_PROPAGATE_INHERIT_ACE))
+ return true;
+
+ if (flags & CONTAINER_INHERIT_ACE)
+ return true;
+ return false;
+}
+
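+/*
+ * Check the requested access (or resolve FILE_MAXIMAL_ACCESS) against the
+ * DACL stored in the security descriptor xattr, falling back to the POSIX
+ * ACLs or the "everyone" ACE when no ACE matches the user's SID.
+ */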
+int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path,
+ __le32 *pdaccess, int uid)
+{
+ struct user_namespace *user_ns = mnt_user_ns(path->mnt);
+ struct smb_ntsd *pntsd = NULL;
+ struct smb_acl *pdacl;
+ struct posix_acl *posix_acls;
+ int rc = 0, acl_size;
+ struct smb_sid sid;
+ int granted = le32_to_cpu(*pdaccess & ~FILE_MAXIMAL_ACCESS_LE);
+ struct smb_ace *ace;
+ int i, found = 0;
+ unsigned int access_bits = 0;
+ struct smb_ace *others_ace = NULL;
+ struct posix_acl_entry *pa_entry;
+ unsigned int sid_type = SIDOWNER;
+ char *end_of_acl;
+
+ ksmbd_debug(SMB, "check permission using windows acl\n");
+ acl_size = ksmbd_vfs_get_sd_xattr(conn, user_ns,
+ path->dentry, &pntsd);
+ if (acl_size <= 0 || !pntsd || !pntsd->dacloffset) {
+ kfree(pntsd);
+ return 0;
+ }
+
+ pdacl = (struct smb_acl *)((char *)pntsd + le32_to_cpu(pntsd->dacloffset));
+ end_of_acl = ((char *)pntsd) + acl_size;
+ if (end_of_acl <= (char *)pdacl) {
+ kfree(pntsd);
+ return 0;
+ }
+
+ if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size) ||
+ le16_to_cpu(pdacl->size) < sizeof(struct smb_acl)) {
+ kfree(pntsd);
+ return 0;
+ }
+
+ if (!pdacl->num_aces) {
+ if (!(le16_to_cpu(pdacl->size) - sizeof(struct smb_acl)) &&
+ *pdaccess & ~(FILE_READ_CONTROL_LE | FILE_WRITE_DAC_LE)) {
+ rc = -EACCES;
+ goto err_out;
+ }
+ kfree(pntsd);
+ return 0;
+ }
+
+ if (*pdaccess & FILE_MAXIMAL_ACCESS_LE) {
+ granted = READ_CONTROL | WRITE_DAC | FILE_READ_ATTRIBUTES |
+ DELETE;
+
+ ace = (struct smb_ace *)((char *)pdacl + sizeof(struct smb_acl));
+ for (i = 0; i < le32_to_cpu(pdacl->num_aces); i++) {
+ granted |= le32_to_cpu(ace->access_req);
+ ace = (struct smb_ace *)((char *)ace + le16_to_cpu(ace->size));
+ if (end_of_acl < (char *)ace)
+ goto err_out;
+ }
+
+ if (!pdacl->num_aces)
+ granted = GENERIC_ALL_FLAGS;
+ }
+
+ if (!uid)
+ sid_type = SIDUNIX_USER;
+ id_to_sid(uid, sid_type, &sid);
+
+ ace = (struct smb_ace *)((char *)pdacl + sizeof(struct smb_acl));
+ for (i = 0; i < le32_to_cpu(pdacl->num_aces); i++) {
+ if (!compare_sids(&sid, &ace->sid) ||
+ !compare_sids(&sid_unix_NFS_mode, &ace->sid)) {
+ found = 1;
+ break;
+ }
+ if (!compare_sids(&sid_everyone, &ace->sid))
+ others_ace = ace;
+
+ ace = (struct smb_ace *)((char *)ace + le16_to_cpu(ace->size));
+ if (end_of_acl < (char *)ace)
+ goto err_out;
+ }
+
+ if (*pdaccess & FILE_MAXIMAL_ACCESS_LE && found) {
+ granted = READ_CONTROL | WRITE_DAC | FILE_READ_ATTRIBUTES |
+ DELETE;
+
+ granted |= le32_to_cpu(ace->access_req);
+
+ if (!pdacl->num_aces)
+ granted = GENERIC_ALL_FLAGS;
+ }
+
+ if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) {
+ posix_acls = get_acl(d_inode(path->dentry), ACL_TYPE_ACCESS);
+ if (posix_acls && !found) {
+ unsigned int id = -1;
+
+ pa_entry = posix_acls->a_entries;
+ for (i = 0; i < posix_acls->a_count; i++, pa_entry++) {
+ if (pa_entry->e_tag == ACL_USER)
+ id = from_kuid(user_ns,
+ pa_entry->e_uid);
+ else if (pa_entry->e_tag == ACL_GROUP)
+ id = from_kgid(user_ns,
+ pa_entry->e_gid);
+ else
+ continue;
+
+ if (id == uid) {
+ mode_to_access_flags(pa_entry->e_perm,
+ 0777,
+ &access_bits);
+ if (!access_bits)
+ access_bits =
+ SET_MINIMUM_RIGHTS;
+ goto check_access_bits;
+ }
+ }
+ }
+ if (posix_acls)
+ posix_acl_release(posix_acls);
+ }
+
+ if (!found) {
+ if (others_ace) {
+ ace = others_ace;
+ } else {
+ ksmbd_debug(SMB, "Can't find corresponding sid\n");
+ rc = -EACCES;
+ goto err_out;
+ }
+ }
+
+ switch (ace->type) {
+ case ACCESS_ALLOWED_ACE_TYPE:
+ access_bits = le32_to_cpu(ace->access_req);
+ break;
+ case ACCESS_DENIED_ACE_TYPE:
+ case ACCESS_DENIED_CALLBACK_ACE_TYPE:
+ access_bits = le32_to_cpu(~ace->access_req);
+ break;
+ }
+
+check_access_bits:
+ if (granted &
+ ~(access_bits | FILE_READ_ATTRIBUTES | READ_CONTROL | WRITE_DAC | DELETE)) {
+ ksmbd_debug(SMB, "Access denied with winACL, granted : %x, access_req : %x\n",
+ granted, le32_to_cpu(ace->access_req));
+ rc = -EACCES;
+ goto err_out;
+ }
+
+ *pdaccess = cpu_to_le32(granted);
+err_out:
+ kfree(pntsd);
+ return rc;
+}
+
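+/*
+ * Apply a security descriptor sent by the client: map its owner, group and
+ * DACL to ownership, mode and POSIX ACL changes, and optionally store the
+ * raw descriptor in an xattr when the share has ACL xattrs enabled.
+ */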
+int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
+ struct path *path, struct smb_ntsd *pntsd, int ntsd_len,
+ bool type_check)
+{
+ int rc;
+ struct smb_fattr fattr = {{0}};
+ struct inode *inode = d_inode(path->dentry);
+ struct user_namespace *user_ns = mnt_user_ns(path->mnt);
+ struct iattr newattrs;
+
+ fattr.cf_uid = INVALID_UID;
+ fattr.cf_gid = INVALID_GID;
+ fattr.cf_mode = inode->i_mode;
+
+ rc = parse_sec_desc(user_ns, pntsd, ntsd_len, &fattr);
+ if (rc)
+ goto out;
+
+ newattrs.ia_valid = ATTR_CTIME;
+ if (!uid_eq(fattr.cf_uid, INVALID_UID)) {
+ newattrs.ia_valid |= ATTR_UID;
+ newattrs.ia_uid = fattr.cf_uid;
+ }
+ if (!gid_eq(fattr.cf_gid, INVALID_GID)) {
+ newattrs.ia_valid |= ATTR_GID;
+ newattrs.ia_gid = fattr.cf_gid;
+ }
+ newattrs.ia_valid |= ATTR_MODE;
+ newattrs.ia_mode = (inode->i_mode & ~0777) | (fattr.cf_mode & 0777);
+
+ inode_lock(inode);
+ rc = notify_change(user_ns, path->dentry, &newattrs, NULL);
+ inode_unlock(inode);
+ if (rc)
+ goto out;
+
+ ksmbd_vfs_remove_acl_xattrs(user_ns, path->dentry);
+ /* Update posix acls */
+	if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && fattr.cf_acls) {
+ rc = set_posix_acl(user_ns, inode,
+ ACL_TYPE_ACCESS, fattr.cf_acls);
+ if (S_ISDIR(inode->i_mode) && fattr.cf_dacls)
+ rc = set_posix_acl(user_ns, inode,
+ ACL_TYPE_DEFAULT, fattr.cf_dacls);
+ }
+
+ /* Check it only calling from SD BUFFER context */
+ if (type_check && !(le16_to_cpu(pntsd->type) & DACL_PRESENT))
+ goto out;
+
+ if (test_share_config_flag(tcon->share_conf, KSMBD_SHARE_FLAG_ACL_XATTR)) {
+ /* Update WinACL in xattr */
+ ksmbd_vfs_remove_sd_xattrs(user_ns, path->dentry);
+ ksmbd_vfs_set_sd_xattr(conn, user_ns,
+ path->dentry, pntsd, ntsd_len);
+ }
+
+out:
+ posix_acl_release(fattr.cf_acls);
+ posix_acl_release(fattr.cf_dacls);
+ mark_inode_dirty(inode);
+ return rc;
+}
+
+void ksmbd_init_domain(u32 *sub_auth)
+{
+ int i;
+
+ memcpy(&server_conf.domain_sid, &domain, sizeof(struct smb_sid));
+ for (i = 0; i < 3; ++i)
+ server_conf.domain_sid.sub_auth[i + 1] = cpu_to_le32(sub_auth[i]);
+}
diff --git a/fs/ksmbd/smbacl.h b/fs/ksmbd/smbacl.h
new file mode 100644
index 000000000000..940f686a1d95
--- /dev/null
+++ b/fs/ksmbd/smbacl.h
@@ -0,0 +1,212 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+/*
+ * Copyright (c) International Business Machines Corp., 2007
+ * Author(s): Steve French (sfrench@us.ibm.com)
+ * Modified by Namjae Jeon (linkinjeon@kernel.org)
+ */
+
+#ifndef _SMBACL_H
+#define _SMBACL_H
+
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/posix_acl.h>
+
+#include "mgmt/tree_connect.h"
+
+#define NUM_AUTHS (6) /* number of authority fields */
+#define SID_MAX_SUB_AUTHORITIES (15) /* max number of sub authority fields */
+
+/*
+ * ACE types - see MS-DTYP 2.4.4.1
+ */
+enum {
+ ACCESS_ALLOWED,
+ ACCESS_DENIED,
+};
+
+/*
+ * Security ID types
+ */
+enum {
+ SIDOWNER = 1,
+ SIDGROUP,
+ SIDCREATOR_OWNER,
+ SIDCREATOR_GROUP,
+ SIDUNIX_USER,
+ SIDUNIX_GROUP,
+ SIDNFS_USER,
+ SIDNFS_GROUP,
+ SIDNFS_MODE,
+};
+
+/* Revision for ACLs */
+#define SD_REVISION 1
+
+/* Control flags for Security Descriptor */
+#define OWNER_DEFAULTED 0x0001
+#define GROUP_DEFAULTED 0x0002
+#define DACL_PRESENT 0x0004
+#define DACL_DEFAULTED 0x0008
+#define SACL_PRESENT 0x0010
+#define SACL_DEFAULTED 0x0020
+#define DACL_TRUSTED 0x0040
+#define SERVER_SECURITY 0x0080
+#define DACL_AUTO_INHERIT_REQ 0x0100
+#define SACL_AUTO_INHERIT_REQ 0x0200
+#define DACL_AUTO_INHERITED 0x0400
+#define SACL_AUTO_INHERITED 0x0800
+#define DACL_PROTECTED 0x1000
+#define SACL_PROTECTED 0x2000
+#define RM_CONTROL_VALID 0x4000
+#define SELF_RELATIVE 0x8000
+
+/* ACE types - see MS-DTYP 2.4.4.1 */
+#define ACCESS_ALLOWED_ACE_TYPE 0x00
+#define ACCESS_DENIED_ACE_TYPE 0x01
+#define SYSTEM_AUDIT_ACE_TYPE 0x02
+#define SYSTEM_ALARM_ACE_TYPE 0x03
+#define ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04
+#define ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05
+#define ACCESS_DENIED_OBJECT_ACE_TYPE 0x06
+#define SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07
+#define SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08
+#define ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09
+#define ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A
+#define ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B
+#define ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C
+#define SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D
+#define SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E /* Reserved */
+#define SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F
+#define SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 /* reserved */
+#define SYSTEM_MANDATORY_LABEL_ACE_TYPE 0x11
+#define SYSTEM_RESOURCE_ATTRIBUTE_ACE_TYPE 0x12
+#define SYSTEM_SCOPED_POLICY_ID_ACE_TYPE 0x13
+
+/* ACE flags */
+#define OBJECT_INHERIT_ACE 0x01
+#define CONTAINER_INHERIT_ACE 0x02
+#define NO_PROPAGATE_INHERIT_ACE 0x04
+#define INHERIT_ONLY_ACE 0x08
+#define INHERITED_ACE 0x10
+#define SUCCESSFUL_ACCESS_ACE_FLAG 0x40
+#define FAILED_ACCESS_ACE_FLAG 0x80
+
+/*
+ * Maximum size of a string representation of a SID:
+ *
+ * The fields are unsigned values in decimal. So:
+ *
+ * u8: max 3 bytes in decimal
+ * u32: max 10 bytes in decimal
+ *
+ * "S-" + 3 bytes for version field + 15 for authority field + NULL terminator
+ *
+ * For authority field, max is when all 6 values are non-zero and it must be
+ * represented in hex. So "-0x" + 12 hex digits.
+ *
+ * Add 11 bytes for each subauthority field (10 bytes each + 1 for '-')
+ */
+#define SID_STRING_BASE_SIZE (2 + 3 + 15 + 1)
+#define SID_STRING_SUBAUTH_SIZE (11) /* size of a single subauth string */
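+/* e.g. with SID_MAX_SUB_AUTHORITIES (15) subauths: 21 + 15 * 11 = 186 bytes */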
+
+#define DOMAIN_USER_RID_LE cpu_to_le32(513)
+
+struct ksmbd_conn;
+
+struct smb_ntsd {
+ __le16 revision; /* revision level */
+ __le16 type;
+ __le32 osidoffset;
+ __le32 gsidoffset;
+ __le32 sacloffset;
+ __le32 dacloffset;
+} __packed;
+
+struct smb_sid {
+ __u8 revision; /* revision level */
+ __u8 num_subauth;
+ __u8 authority[NUM_AUTHS];
+ __le32 sub_auth[SID_MAX_SUB_AUTHORITIES]; /* sub_auth[num_subauth] */
+} __packed;
+
+/* size of a struct smb_sid, sans sub_auth array */
+#define CIFS_SID_BASE_SIZE (1 + 1 + NUM_AUTHS)
+
+struct smb_acl {
+ __le16 revision; /* revision level */
+ __le16 size;
+ __le32 num_aces;
+} __packed;
+
+struct smb_ace {
+ __u8 type;
+ __u8 flags;
+ __le16 size;
+ __le32 access_req;
+ struct smb_sid sid; /* ie UUID of user or group who gets these perms */
+} __packed;
+
+struct smb_fattr {
+ kuid_t cf_uid;
+ kgid_t cf_gid;
+ umode_t cf_mode;
+ __le32 daccess;
+ struct posix_acl *cf_acls;
+ struct posix_acl *cf_dacls;
+};
+
+struct posix_ace_state {
+ u32 allow;
+ u32 deny;
+};
+
+struct posix_user_ace_state {
+ union {
+ kuid_t uid;
+ kgid_t gid;
+ };
+ struct posix_ace_state perms;
+};
+
+struct posix_ace_state_array {
+ int n;
+ struct posix_user_ace_state aces[];
+};
+
+/*
+ * while processing the ACEs of a DACL, this maintains the partial
+ * permissions calculated so far:
+ */
+
+struct posix_acl_state {
+ struct posix_ace_state owner;
+ struct posix_ace_state group;
+ struct posix_ace_state other;
+ struct posix_ace_state everyone;
+ struct posix_ace_state mask; /* deny unused in this case */
+ struct posix_ace_state_array *users;
+ struct posix_ace_state_array *groups;
+};
+
+int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd,
+ int acl_len, struct smb_fattr *fattr);
+int build_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd,
+ struct smb_ntsd *ppntsd, int addition_info,
+ __u32 *secdesclen, struct smb_fattr *fattr);
+int init_acl_state(struct posix_acl_state *state, int cnt);
+void free_acl_state(struct posix_acl_state *state);
+void posix_state_to_acl(struct posix_acl_state *state,
+ struct posix_acl_entry *pace);
+int compare_sids(const struct smb_sid *ctsid, const struct smb_sid *cwsid);
+bool smb_inherit_flags(int flags, bool is_dir);
+int smb_inherit_dacl(struct ksmbd_conn *conn, struct path *path,
+ unsigned int uid, unsigned int gid);
+int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path,
+ __le32 *pdaccess, int uid);
+int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
+ struct path *path, struct smb_ntsd *pntsd, int ntsd_len,
+ bool type_check);
+void id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid);
+void ksmbd_init_domain(u32 *sub_auth);
+#endif /* _SMBACL_H */
diff --git a/fs/ksmbd/smbfsctl.h b/fs/ksmbd/smbfsctl.h
new file mode 100644
index 000000000000..b98418aae20c
--- /dev/null
+++ b/fs/ksmbd/smbfsctl.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+/*
+ * fs/ksmbd/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions
+ *
+ * Copyright (c) International Business Machines Corp., 2002,2009
+ * Author(s): Steve French (sfrench@us.ibm.com)
+ */
+
+/* IOCTL information */
+/*
+ * List of ioctl/fsctl function codes that are or could be useful in the
+ * future to remote clients like cifs or SMB2 client. There is probably
+ * a slightly larger set of fsctls that NTFS local filesystem could handle,
+ * including the seven below that we do not have struct definitions for.
+ * Even with protocol definitions for most of these now available, we still
+ * need to do some experimentation to identify which are practical to do
+ * remotely. Some of the following, such as the encryption/compression ones
+ * could be invoked from tools via a specialized hook into the VFS rather
+ * than via the standard vfs entry points
+ */
+
+#ifndef __KSMBD_SMBFSCTL_H
+#define __KSMBD_SMBFSCTL_H
+
+#define FSCTL_DFS_GET_REFERRALS 0x00060194
+#define FSCTL_DFS_GET_REFERRALS_EX 0x000601B0
+#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000
+#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004
+#define FSCTL_REQUEST_BATCH_OPLOCK 0x00090008
+#define FSCTL_LOCK_VOLUME 0x00090018
+#define FSCTL_UNLOCK_VOLUME 0x0009001C
+#define FSCTL_IS_PATHNAME_VALID 0x0009002C /* BB add struct */
+#define FSCTL_GET_COMPRESSION 0x0009003C /* BB add struct */
+#define FSCTL_SET_COMPRESSION 0x0009C040 /* BB add struct */
+#define FSCTL_QUERY_FAT_BPB 0x00090058 /* BB add struct */
+/* Verify the next FSCTL number, we had it as 0x00090090 before */
+#define FSCTL_FILESYSTEM_GET_STATS 0x00090060 /* BB add struct */
+#define FSCTL_GET_NTFS_VOLUME_DATA 0x00090064 /* BB add struct */
+#define FSCTL_GET_RETRIEVAL_POINTERS 0x00090073 /* BB add struct */
+#define FSCTL_IS_VOLUME_DIRTY 0x00090078 /* BB add struct */
+#define FSCTL_ALLOW_EXTENDED_DASD_IO 0x00090083 /* BB add struct */
+#define FSCTL_REQUEST_FILTER_OPLOCK 0x0009008C
+#define FSCTL_FIND_FILES_BY_SID 0x0009008F /* BB add struct */
+#define FSCTL_SET_OBJECT_ID 0x00090098 /* BB add struct */
+#define FSCTL_GET_OBJECT_ID 0x0009009C /* BB add struct */
+#define FSCTL_DELETE_OBJECT_ID 0x000900A0 /* BB add struct */
+#define FSCTL_SET_REPARSE_POINT 0x000900A4 /* BB add struct */
+#define FSCTL_GET_REPARSE_POINT 0x000900A8 /* BB add struct */
+#define FSCTL_DELETE_REPARSE_POINT 0x000900AC /* BB add struct */
+#define FSCTL_SET_OBJECT_ID_EXTENDED 0x000900BC /* BB add struct */
+#define FSCTL_CREATE_OR_GET_OBJECT_ID 0x000900C0 /* BB add struct */
+#define FSCTL_SET_SPARSE 0x000900C4 /* BB add struct */
+#define FSCTL_SET_ZERO_DATA 0x000980C8 /* BB add struct */
+#define FSCTL_SET_ENCRYPTION 0x000900D7 /* BB add struct */
+#define FSCTL_ENCRYPTION_FSCTL_IO 0x000900DB /* BB add struct */
+#define FSCTL_WRITE_RAW_ENCRYPTED 0x000900DF /* BB add struct */
+#define FSCTL_READ_RAW_ENCRYPTED 0x000900E3 /* BB add struct */
+#define FSCTL_READ_FILE_USN_DATA 0x000900EB /* BB add struct */
+#define FSCTL_WRITE_USN_CLOSE_RECORD 0x000900EF /* BB add struct */
+#define FSCTL_SIS_COPYFILE 0x00090100 /* BB add struct */
+#define FSCTL_RECALL_FILE 0x00090117 /* BB add struct */
+#define FSCTL_QUERY_SPARING_INFO 0x00090138 /* BB add struct */
+#define FSCTL_SET_ZERO_ON_DEALLOC 0x00090194 /* BB add struct */
+#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */
+#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */
+#define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */
+#define FSCTL_DUPLICATE_EXTENTS_TO_FILE 0x00098344
+#define FSCTL_SIS_LINK_FILES 0x0009C104
+#define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */
+#define FSCTL_PIPE_TRANSCEIVE 0x0011C017 /* BB add struct */
+/* strange that the number for this op is not sequential with previous op */
+#define FSCTL_PIPE_WAIT 0x00110018 /* BB add struct */
+#define FSCTL_REQUEST_RESUME_KEY 0x00140078
+#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */
+#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */
+#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204
+#define FSCTL_QUERY_NETWORK_INTERFACE_INFO 0x001401FC
+#define FSCTL_COPYCHUNK 0x001440F2
+#define FSCTL_COPYCHUNK_WRITE 0x001480F2
+
+#define IO_REPARSE_TAG_MOUNT_POINT 0xA0000003
+#define IO_REPARSE_TAG_HSM 0xC0000004
+#define IO_REPARSE_TAG_SIS 0x80000007
+
+/* WSL reparse tags */
+#define IO_REPARSE_TAG_LX_SYMLINK_LE cpu_to_le32(0xA000001D)
+#define IO_REPARSE_TAG_AF_UNIX_LE cpu_to_le32(0x80000023)
+#define IO_REPARSE_TAG_LX_FIFO_LE cpu_to_le32(0x80000024)
+#define IO_REPARSE_TAG_LX_CHR_LE cpu_to_le32(0x80000025)
+#define IO_REPARSE_TAG_LX_BLK_LE cpu_to_le32(0x80000026)
+#endif /* __KSMBD_SMBFSCTL_H */
diff --git a/fs/ksmbd/smbstatus.h b/fs/ksmbd/smbstatus.h
new file mode 100644
index 000000000000..108a8b6ed24a
--- /dev/null
+++ b/fs/ksmbd/smbstatus.h
@@ -0,0 +1,1822 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+/*
+ * fs/cifs/smb2status.h
+ *
+ * SMB2 Status code (network error) definitions
+ * Definitions are from MS-ERREF
+ *
+ * Copyright (c) International Business Machines Corp., 2009,2011
+ * Author(s): Steve French (sfrench@us.ibm.com)
+ */
+
+/*
+ * 0 1 2 3 4 5 6 7 8 9 0 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ * SEV C N <-------Facility--------> <------Error Status Code------>
+ *
+ * C is set if "customer defined" error, N bit is reserved and MBZ
+ */
+
+#define STATUS_SEVERITY_SUCCESS cpu_to_le32(0x0000)
+#define STATUS_SEVERITY_INFORMATIONAL cpu_to_le32(0x0001)
+#define STATUS_SEVERITY_WARNING cpu_to_le32(0x0002)
+#define STATUS_SEVERITY_ERROR cpu_to_le32(0x0003)
+
+struct ntstatus {
+ /* Facility is the high 12 bits of the following field */
+ __le32 Facility; /* low 2 bits Severity, next is Customer, then rsrvd */
+ __le32 Code;
+};
+
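/*
 * Editor's note (not part of the patch): per the layout comment above, an
 * NTSTATUS packs the severity in bits 31-30, the customer flag in bit 29, a
 * reserved bit in bit 28, a 12-bit facility in bits 27-16 and a 16-bit error
 * code in bits 15-0. A minimal decoding sketch under those assumptions; the
 * helper name is illustrative only, and the macros below wrap the raw values
 * in cpu_to_le32() because they travel on the wire as little-endian __le32.
 */
#include <stdint.h>
#include <stdio.h>

static void decode_ntstatus(uint32_t status)
{
	uint32_t severity = status >> 30;          /* 3 = error, e.g. 0xC0000022 */
	uint32_t customer = (status >> 29) & 0x1;  /* "customer defined" flag */
	uint32_t facility = (status >> 16) & 0xfff;
	uint32_t code     = status & 0xffff;       /* 0x0022 for STATUS_ACCESS_DENIED */

	printf("severity %u customer %u facility 0x%03x code 0x%04x\n",
	       severity, customer, facility, code);
}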
+#define STATUS_SUCCESS 0x00000000
+#define STATUS_WAIT_0 cpu_to_le32(0x00000000)
+#define STATUS_WAIT_1 cpu_to_le32(0x00000001)
+#define STATUS_WAIT_2 cpu_to_le32(0x00000002)
+#define STATUS_WAIT_3 cpu_to_le32(0x00000003)
+#define STATUS_WAIT_63 cpu_to_le32(0x0000003F)
+#define STATUS_ABANDONED cpu_to_le32(0x00000080)
+#define STATUS_ABANDONED_WAIT_0 cpu_to_le32(0x00000080)
+#define STATUS_ABANDONED_WAIT_63 cpu_to_le32(0x000000BF)
+#define STATUS_USER_APC cpu_to_le32(0x000000C0)
+#define STATUS_KERNEL_APC cpu_to_le32(0x00000100)
+#define STATUS_ALERTED cpu_to_le32(0x00000101)
+#define STATUS_TIMEOUT cpu_to_le32(0x00000102)
+#define STATUS_PENDING cpu_to_le32(0x00000103)
+#define STATUS_REPARSE cpu_to_le32(0x00000104)
+#define STATUS_MORE_ENTRIES cpu_to_le32(0x00000105)
+#define STATUS_NOT_ALL_ASSIGNED cpu_to_le32(0x00000106)
+#define STATUS_SOME_NOT_MAPPED cpu_to_le32(0x00000107)
+#define STATUS_OPLOCK_BREAK_IN_PROGRESS cpu_to_le32(0x00000108)
+#define STATUS_VOLUME_MOUNTED cpu_to_le32(0x00000109)
+#define STATUS_RXACT_COMMITTED cpu_to_le32(0x0000010A)
+#define STATUS_NOTIFY_CLEANUP cpu_to_le32(0x0000010B)
+#define STATUS_NOTIFY_ENUM_DIR cpu_to_le32(0x0000010C)
+#define STATUS_NO_QUOTAS_FOR_ACCOUNT cpu_to_le32(0x0000010D)
+#define STATUS_PRIMARY_TRANSPORT_CONNECT_FAILED cpu_to_le32(0x0000010E)
+#define STATUS_PAGE_FAULT_TRANSITION cpu_to_le32(0x00000110)
+#define STATUS_PAGE_FAULT_DEMAND_ZERO cpu_to_le32(0x00000111)
+#define STATUS_PAGE_FAULT_COPY_ON_WRITE cpu_to_le32(0x00000112)
+#define STATUS_PAGE_FAULT_GUARD_PAGE cpu_to_le32(0x00000113)
+#define STATUS_PAGE_FAULT_PAGING_FILE cpu_to_le32(0x00000114)
+#define STATUS_CACHE_PAGE_LOCKED cpu_to_le32(0x00000115)
+#define STATUS_CRASH_DUMP cpu_to_le32(0x00000116)
+#define STATUS_BUFFER_ALL_ZEROS cpu_to_le32(0x00000117)
+#define STATUS_REPARSE_OBJECT cpu_to_le32(0x00000118)
+#define STATUS_RESOURCE_REQUIREMENTS_CHANGED cpu_to_le32(0x00000119)
+#define STATUS_TRANSLATION_COMPLETE cpu_to_le32(0x00000120)
+#define STATUS_DS_MEMBERSHIP_EVALUATED_LOCALLY cpu_to_le32(0x00000121)
+#define STATUS_NOTHING_TO_TERMINATE cpu_to_le32(0x00000122)
+#define STATUS_PROCESS_NOT_IN_JOB cpu_to_le32(0x00000123)
+#define STATUS_PROCESS_IN_JOB cpu_to_le32(0x00000124)
+#define STATUS_VOLSNAP_HIBERNATE_READY cpu_to_le32(0x00000125)
+#define STATUS_FSFILTER_OP_COMPLETED_SUCCESSFULLY cpu_to_le32(0x00000126)
+#define STATUS_INTERRUPT_VECTOR_ALREADY_CONNECTED cpu_to_le32(0x00000127)
+#define STATUS_INTERRUPT_STILL_CONNECTED cpu_to_le32(0x00000128)
+#define STATUS_PROCESS_CLONED cpu_to_le32(0x00000129)
+#define STATUS_FILE_LOCKED_WITH_ONLY_READERS cpu_to_le32(0x0000012A)
+#define STATUS_FILE_LOCKED_WITH_WRITERS cpu_to_le32(0x0000012B)
+#define STATUS_RESOURCEMANAGER_READ_ONLY cpu_to_le32(0x00000202)
+#define STATUS_WAIT_FOR_OPLOCK cpu_to_le32(0x00000367)
+#define DBG_EXCEPTION_HANDLED cpu_to_le32(0x00010001)
+#define DBG_CONTINUE cpu_to_le32(0x00010002)
+#define STATUS_FLT_IO_COMPLETE cpu_to_le32(0x001C0001)
+#define STATUS_OBJECT_NAME_EXISTS cpu_to_le32(0x40000000)
+#define STATUS_THREAD_WAS_SUSPENDED cpu_to_le32(0x40000001)
+#define STATUS_WORKING_SET_LIMIT_RANGE cpu_to_le32(0x40000002)
+#define STATUS_IMAGE_NOT_AT_BASE cpu_to_le32(0x40000003)
+#define STATUS_RXACT_STATE_CREATED cpu_to_le32(0x40000004)
+#define STATUS_SEGMENT_NOTIFICATION cpu_to_le32(0x40000005)
+#define STATUS_LOCAL_USER_SESSION_KEY cpu_to_le32(0x40000006)
+#define STATUS_BAD_CURRENT_DIRECTORY cpu_to_le32(0x40000007)
+#define STATUS_SERIAL_MORE_WRITES cpu_to_le32(0x40000008)
+#define STATUS_REGISTRY_RECOVERED cpu_to_le32(0x40000009)
+#define STATUS_FT_READ_RECOVERY_FROM_BACKUP cpu_to_le32(0x4000000A)
+#define STATUS_FT_WRITE_RECOVERY cpu_to_le32(0x4000000B)
+#define STATUS_SERIAL_COUNTER_TIMEOUT cpu_to_le32(0x4000000C)
+#define STATUS_NULL_LM_PASSWORD cpu_to_le32(0x4000000D)
+#define STATUS_IMAGE_MACHINE_TYPE_MISMATCH cpu_to_le32(0x4000000E)
+#define STATUS_RECEIVE_PARTIAL cpu_to_le32(0x4000000F)
+#define STATUS_RECEIVE_EXPEDITED cpu_to_le32(0x40000010)
+#define STATUS_RECEIVE_PARTIAL_EXPEDITED cpu_to_le32(0x40000011)
+#define STATUS_EVENT_DONE cpu_to_le32(0x40000012)
+#define STATUS_EVENT_PENDING cpu_to_le32(0x40000013)
+#define STATUS_CHECKING_FILE_SYSTEM cpu_to_le32(0x40000014)
+#define STATUS_FATAL_APP_EXIT cpu_to_le32(0x40000015)
+#define STATUS_PREDEFINED_HANDLE cpu_to_le32(0x40000016)
+#define STATUS_WAS_UNLOCKED cpu_to_le32(0x40000017)
+#define STATUS_SERVICE_NOTIFICATION cpu_to_le32(0x40000018)
+#define STATUS_WAS_LOCKED cpu_to_le32(0x40000019)
+#define STATUS_LOG_HARD_ERROR cpu_to_le32(0x4000001A)
+#define STATUS_ALREADY_WIN32 cpu_to_le32(0x4000001B)
+#define STATUS_WX86_UNSIMULATE cpu_to_le32(0x4000001C)
+#define STATUS_WX86_CONTINUE cpu_to_le32(0x4000001D)
+#define STATUS_WX86_SINGLE_STEP cpu_to_le32(0x4000001E)
+#define STATUS_WX86_BREAKPOINT cpu_to_le32(0x4000001F)
+#define STATUS_WX86_EXCEPTION_CONTINUE cpu_to_le32(0x40000020)
+#define STATUS_WX86_EXCEPTION_LASTCHANCE cpu_to_le32(0x40000021)
+#define STATUS_WX86_EXCEPTION_CHAIN cpu_to_le32(0x40000022)
+#define STATUS_IMAGE_MACHINE_TYPE_MISMATCH_EXE cpu_to_le32(0x40000023)
+#define STATUS_NO_YIELD_PERFORMED cpu_to_le32(0x40000024)
+#define STATUS_TIMER_RESUME_IGNORED cpu_to_le32(0x40000025)
+#define STATUS_ARBITRATION_UNHANDLED cpu_to_le32(0x40000026)
+#define STATUS_CARDBUS_NOT_SUPPORTED cpu_to_le32(0x40000027)
+#define STATUS_WX86_CREATEWX86TIB cpu_to_le32(0x40000028)
+#define STATUS_MP_PROCESSOR_MISMATCH cpu_to_le32(0x40000029)
+#define STATUS_HIBERNATED cpu_to_le32(0x4000002A)
+#define STATUS_RESUME_HIBERNATION cpu_to_le32(0x4000002B)
+#define STATUS_FIRMWARE_UPDATED cpu_to_le32(0x4000002C)
+#define STATUS_DRIVERS_LEAKING_LOCKED_PAGES cpu_to_le32(0x4000002D)
+#define STATUS_MESSAGE_RETRIEVED cpu_to_le32(0x4000002E)
+#define STATUS_SYSTEM_POWERSTATE_TRANSITION cpu_to_le32(0x4000002F)
+#define STATUS_ALPC_CHECK_COMPLETION_LIST cpu_to_le32(0x40000030)
+#define STATUS_SYSTEM_POWERSTATE_COMPLEX_TRANSITION cpu_to_le32(0x40000031)
+#define STATUS_ACCESS_AUDIT_BY_POLICY cpu_to_le32(0x40000032)
+#define STATUS_ABANDON_HIBERFILE cpu_to_le32(0x40000033)
+#define STATUS_BIZRULES_NOT_ENABLED cpu_to_le32(0x40000034)
+#define STATUS_WAKE_SYSTEM cpu_to_le32(0x40000294)
+#define STATUS_DS_SHUTTING_DOWN cpu_to_le32(0x40000370)
+#define DBG_REPLY_LATER cpu_to_le32(0x40010001)
+#define DBG_UNABLE_TO_PROVIDE_HANDLE cpu_to_le32(0x40010002)
+#define DBG_TERMINATE_THREAD cpu_to_le32(0x40010003)
+#define DBG_TERMINATE_PROCESS cpu_to_le32(0x40010004)
+#define DBG_CONTROL_C cpu_to_le32(0x40010005)
+#define DBG_PRINTEXCEPTION_C cpu_to_le32(0x40010006)
+#define DBG_RIPEXCEPTION cpu_to_le32(0x40010007)
+#define DBG_CONTROL_BREAK cpu_to_le32(0x40010008)
+#define DBG_COMMAND_EXCEPTION cpu_to_le32(0x40010009)
+#define RPC_NT_UUID_LOCAL_ONLY cpu_to_le32(0x40020056)
+#define RPC_NT_SEND_INCOMPLETE cpu_to_le32(0x400200AF)
+#define STATUS_CTX_CDM_CONNECT cpu_to_le32(0x400A0004)
+#define STATUS_CTX_CDM_DISCONNECT cpu_to_le32(0x400A0005)
+#define STATUS_SXS_RELEASE_ACTIVATION_CONTEXT cpu_to_le32(0x4015000D)
+#define STATUS_RECOVERY_NOT_NEEDED cpu_to_le32(0x40190034)
+#define STATUS_RM_ALREADY_STARTED cpu_to_le32(0x40190035)
+#define STATUS_LOG_NO_RESTART cpu_to_le32(0x401A000C)
+#define STATUS_VIDEO_DRIVER_DEBUG_REPORT_REQUEST cpu_to_le32(0x401B00EC)
+#define STATUS_GRAPHICS_PARTIAL_DATA_POPULATED cpu_to_le32(0x401E000A)
+#define STATUS_GRAPHICS_DRIVER_MISMATCH cpu_to_le32(0x401E0117)
+#define STATUS_GRAPHICS_MODE_NOT_PINNED cpu_to_le32(0x401E0307)
+#define STATUS_GRAPHICS_NO_PREFERRED_MODE cpu_to_le32(0x401E031E)
+#define STATUS_GRAPHICS_DATASET_IS_EMPTY cpu_to_le32(0x401E034B)
+#define STATUS_GRAPHICS_NO_MORE_ELEMENTS_IN_DATASET cpu_to_le32(0x401E034C)
+#define STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_PINNED \
+ cpu_to_le32(0x401E0351)
+#define STATUS_GRAPHICS_UNKNOWN_CHILD_STATUS cpu_to_le32(0x401E042F)
+#define STATUS_GRAPHICS_LEADLINK_START_DEFERRED cpu_to_le32(0x401E0437)
+#define STATUS_GRAPHICS_POLLING_TOO_FREQUENTLY cpu_to_le32(0x401E0439)
+#define STATUS_GRAPHICS_START_DEFERRED cpu_to_le32(0x401E043A)
+#define STATUS_NDIS_INDICATION_REQUIRED cpu_to_le32(0x40230001)
+#define STATUS_GUARD_PAGE_VIOLATION cpu_to_le32(0x80000001)
+#define STATUS_DATATYPE_MISALIGNMENT cpu_to_le32(0x80000002)
+#define STATUS_BREAKPOINT cpu_to_le32(0x80000003)
+#define STATUS_SINGLE_STEP cpu_to_le32(0x80000004)
+#define STATUS_BUFFER_OVERFLOW cpu_to_le32(0x80000005)
+#define STATUS_NO_MORE_FILES cpu_to_le32(0x80000006)
+#define STATUS_WAKE_SYSTEM_DEBUGGER cpu_to_le32(0x80000007)
+#define STATUS_HANDLES_CLOSED cpu_to_le32(0x8000000A)
+#define STATUS_NO_INHERITANCE cpu_to_le32(0x8000000B)
+#define STATUS_GUID_SUBSTITUTION_MADE cpu_to_le32(0x8000000C)
+#define STATUS_PARTIAL_COPY cpu_to_le32(0x8000000D)
+#define STATUS_DEVICE_PAPER_EMPTY cpu_to_le32(0x8000000E)
+#define STATUS_DEVICE_POWERED_OFF cpu_to_le32(0x8000000F)
+#define STATUS_DEVICE_OFF_LINE cpu_to_le32(0x80000010)
+#define STATUS_DEVICE_BUSY cpu_to_le32(0x80000011)
+#define STATUS_NO_MORE_EAS cpu_to_le32(0x80000012)
+#define STATUS_INVALID_EA_NAME cpu_to_le32(0x80000013)
+#define STATUS_EA_LIST_INCONSISTENT cpu_to_le32(0x80000014)
+#define STATUS_INVALID_EA_FLAG cpu_to_le32(0x80000015)
+#define STATUS_VERIFY_REQUIRED cpu_to_le32(0x80000016)
+#define STATUS_EXTRANEOUS_INFORMATION cpu_to_le32(0x80000017)
+#define STATUS_RXACT_COMMIT_NECESSARY cpu_to_le32(0x80000018)
+#define STATUS_NO_MORE_ENTRIES cpu_to_le32(0x8000001A)
+#define STATUS_FILEMARK_DETECTED cpu_to_le32(0x8000001B)
+#define STATUS_MEDIA_CHANGED cpu_to_le32(0x8000001C)
+#define STATUS_BUS_RESET cpu_to_le32(0x8000001D)
+#define STATUS_END_OF_MEDIA cpu_to_le32(0x8000001E)
+#define STATUS_BEGINNING_OF_MEDIA cpu_to_le32(0x8000001F)
+#define STATUS_MEDIA_CHECK cpu_to_le32(0x80000020)
+#define STATUS_SETMARK_DETECTED cpu_to_le32(0x80000021)
+#define STATUS_NO_DATA_DETECTED cpu_to_le32(0x80000022)
+#define STATUS_REDIRECTOR_HAS_OPEN_HANDLES cpu_to_le32(0x80000023)
+#define STATUS_SERVER_HAS_OPEN_HANDLES cpu_to_le32(0x80000024)
+#define STATUS_ALREADY_DISCONNECTED cpu_to_le32(0x80000025)
+#define STATUS_LONGJUMP cpu_to_le32(0x80000026)
+#define STATUS_CLEANER_CARTRIDGE_INSTALLED cpu_to_le32(0x80000027)
+#define STATUS_PLUGPLAY_QUERY_VETOED cpu_to_le32(0x80000028)
+#define STATUS_UNWIND_CONSOLIDATE cpu_to_le32(0x80000029)
+#define STATUS_REGISTRY_HIVE_RECOVERED cpu_to_le32(0x8000002A)
+#define STATUS_DLL_MIGHT_BE_INSECURE cpu_to_le32(0x8000002B)
+#define STATUS_DLL_MIGHT_BE_INCOMPATIBLE cpu_to_le32(0x8000002C)
+#define STATUS_STOPPED_ON_SYMLINK cpu_to_le32(0x8000002D)
+#define STATUS_DEVICE_REQUIRES_CLEANING cpu_to_le32(0x80000288)
+#define STATUS_DEVICE_DOOR_OPEN cpu_to_le32(0x80000289)
+#define STATUS_DATA_LOST_REPAIR cpu_to_le32(0x80000803)
+#define DBG_EXCEPTION_NOT_HANDLED cpu_to_le32(0x80010001)
+#define STATUS_CLUSTER_NODE_ALREADY_UP cpu_to_le32(0x80130001)
+#define STATUS_CLUSTER_NODE_ALREADY_DOWN cpu_to_le32(0x80130002)
+#define STATUS_CLUSTER_NETWORK_ALREADY_ONLINE cpu_to_le32(0x80130003)
+#define STATUS_CLUSTER_NETWORK_ALREADY_OFFLINE cpu_to_le32(0x80130004)
+#define STATUS_CLUSTER_NODE_ALREADY_MEMBER cpu_to_le32(0x80130005)
+#define STATUS_COULD_NOT_RESIZE_LOG cpu_to_le32(0x80190009)
+#define STATUS_NO_TXF_METADATA cpu_to_le32(0x80190029)
+#define STATUS_CANT_RECOVER_WITH_HANDLE_OPEN cpu_to_le32(0x80190031)
+#define STATUS_TXF_METADATA_ALREADY_PRESENT cpu_to_le32(0x80190041)
+#define STATUS_TRANSACTION_SCOPE_CALLBACKS_NOT_SET cpu_to_le32(0x80190042)
+#define STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD_RECOVERED \
+ cpu_to_le32(0x801B00EB)
+#define STATUS_FLT_BUFFER_TOO_SMALL cpu_to_le32(0x801C0001)
+#define STATUS_FVE_PARTIAL_METADATA cpu_to_le32(0x80210001)
+#define STATUS_UNSUCCESSFUL cpu_to_le32(0xC0000001)
+#define STATUS_NOT_IMPLEMENTED cpu_to_le32(0xC0000002)
+#define STATUS_INVALID_INFO_CLASS cpu_to_le32(0xC0000003)
+#define STATUS_INFO_LENGTH_MISMATCH cpu_to_le32(0xC0000004)
+#define STATUS_ACCESS_VIOLATION cpu_to_le32(0xC0000005)
+#define STATUS_IN_PAGE_ERROR cpu_to_le32(0xC0000006)
+#define STATUS_PAGEFILE_QUOTA cpu_to_le32(0xC0000007)
+#define STATUS_INVALID_HANDLE cpu_to_le32(0xC0000008)
+#define STATUS_BAD_INITIAL_STACK cpu_to_le32(0xC0000009)
+#define STATUS_BAD_INITIAL_PC cpu_to_le32(0xC000000A)
+#define STATUS_INVALID_CID cpu_to_le32(0xC000000B)
+#define STATUS_TIMER_NOT_CANCELED cpu_to_le32(0xC000000C)
+#define STATUS_INVALID_PARAMETER cpu_to_le32(0xC000000D)
+#define STATUS_NO_SUCH_DEVICE cpu_to_le32(0xC000000E)
+#define STATUS_NO_SUCH_FILE cpu_to_le32(0xC000000F)
+#define STATUS_INVALID_DEVICE_REQUEST cpu_to_le32(0xC0000010)
+#define STATUS_END_OF_FILE cpu_to_le32(0xC0000011)
+#define STATUS_WRONG_VOLUME cpu_to_le32(0xC0000012)
+#define STATUS_NO_MEDIA_IN_DEVICE cpu_to_le32(0xC0000013)
+#define STATUS_UNRECOGNIZED_MEDIA cpu_to_le32(0xC0000014)
+#define STATUS_NONEXISTENT_SECTOR cpu_to_le32(0xC0000015)
+#define STATUS_MORE_PROCESSING_REQUIRED cpu_to_le32(0xC0000016)
+#define STATUS_NO_MEMORY cpu_to_le32(0xC0000017)
+#define STATUS_CONFLICTING_ADDRESSES cpu_to_le32(0xC0000018)
+#define STATUS_NOT_MAPPED_VIEW cpu_to_le32(0xC0000019)
+#define STATUS_UNABLE_TO_FREE_VM cpu_to_le32(0xC000001A)
+#define STATUS_UNABLE_TO_DELETE_SECTION cpu_to_le32(0xC000001B)
+#define STATUS_INVALID_SYSTEM_SERVICE cpu_to_le32(0xC000001C)
+#define STATUS_ILLEGAL_INSTRUCTION cpu_to_le32(0xC000001D)
+#define STATUS_INVALID_LOCK_SEQUENCE cpu_to_le32(0xC000001E)
+#define STATUS_INVALID_VIEW_SIZE cpu_to_le32(0xC000001F)
+#define STATUS_INVALID_FILE_FOR_SECTION cpu_to_le32(0xC0000020)
+#define STATUS_ALREADY_COMMITTED cpu_to_le32(0xC0000021)
+#define STATUS_ACCESS_DENIED cpu_to_le32(0xC0000022)
+#define STATUS_BUFFER_TOO_SMALL cpu_to_le32(0xC0000023)
+#define STATUS_OBJECT_TYPE_MISMATCH cpu_to_le32(0xC0000024)
+#define STATUS_NONCONTINUABLE_EXCEPTION cpu_to_le32(0xC0000025)
+#define STATUS_INVALID_DISPOSITION cpu_to_le32(0xC0000026)
+#define STATUS_UNWIND cpu_to_le32(0xC0000027)
+#define STATUS_BAD_STACK cpu_to_le32(0xC0000028)
+#define STATUS_INVALID_UNWIND_TARGET cpu_to_le32(0xC0000029)
+#define STATUS_NOT_LOCKED cpu_to_le32(0xC000002A)
+#define STATUS_PARITY_ERROR cpu_to_le32(0xC000002B)
+#define STATUS_UNABLE_TO_DECOMMIT_VM cpu_to_le32(0xC000002C)
+#define STATUS_NOT_COMMITTED cpu_to_le32(0xC000002D)
+#define STATUS_INVALID_PORT_ATTRIBUTES cpu_to_le32(0xC000002E)
+#define STATUS_PORT_MESSAGE_TOO_LONG cpu_to_le32(0xC000002F)
+#define STATUS_INVALID_PARAMETER_MIX cpu_to_le32(0xC0000030)
+#define STATUS_INVALID_QUOTA_LOWER cpu_to_le32(0xC0000031)
+#define STATUS_DISK_CORRUPT_ERROR cpu_to_le32(0xC0000032)
+#define STATUS_OBJECT_NAME_INVALID cpu_to_le32(0xC0000033)
+#define STATUS_OBJECT_NAME_NOT_FOUND cpu_to_le32(0xC0000034)
+#define STATUS_OBJECT_NAME_COLLISION cpu_to_le32(0xC0000035)
+#define STATUS_PORT_DISCONNECTED cpu_to_le32(0xC0000037)
+#define STATUS_DEVICE_ALREADY_ATTACHED cpu_to_le32(0xC0000038)
+#define STATUS_OBJECT_PATH_INVALID cpu_to_le32(0xC0000039)
+#define STATUS_OBJECT_PATH_NOT_FOUND cpu_to_le32(0xC000003A)
+#define STATUS_OBJECT_PATH_SYNTAX_BAD cpu_to_le32(0xC000003B)
+#define STATUS_DATA_OVERRUN cpu_to_le32(0xC000003C)
+#define STATUS_DATA_LATE_ERROR cpu_to_le32(0xC000003D)
+#define STATUS_DATA_ERROR cpu_to_le32(0xC000003E)
+#define STATUS_CRC_ERROR cpu_to_le32(0xC000003F)
+#define STATUS_SECTION_TOO_BIG cpu_to_le32(0xC0000040)
+#define STATUS_PORT_CONNECTION_REFUSED cpu_to_le32(0xC0000041)
+#define STATUS_INVALID_PORT_HANDLE cpu_to_le32(0xC0000042)
+#define STATUS_SHARING_VIOLATION cpu_to_le32(0xC0000043)
+#define STATUS_QUOTA_EXCEEDED cpu_to_le32(0xC0000044)
+#define STATUS_INVALID_PAGE_PROTECTION cpu_to_le32(0xC0000045)
+#define STATUS_MUTANT_NOT_OWNED cpu_to_le32(0xC0000046)
+#define STATUS_SEMAPHORE_LIMIT_EXCEEDED cpu_to_le32(0xC0000047)
+#define STATUS_PORT_ALREADY_SET cpu_to_le32(0xC0000048)
+#define STATUS_SECTION_NOT_IMAGE cpu_to_le32(0xC0000049)
+#define STATUS_SUSPEND_COUNT_EXCEEDED cpu_to_le32(0xC000004A)
+#define STATUS_THREAD_IS_TERMINATING cpu_to_le32(0xC000004B)
+#define STATUS_BAD_WORKING_SET_LIMIT cpu_to_le32(0xC000004C)
+#define STATUS_INCOMPATIBLE_FILE_MAP cpu_to_le32(0xC000004D)
+#define STATUS_SECTION_PROTECTION cpu_to_le32(0xC000004E)
+#define STATUS_EAS_NOT_SUPPORTED cpu_to_le32(0xC000004F)
+#define STATUS_EA_TOO_LARGE cpu_to_le32(0xC0000050)
+#define STATUS_NONEXISTENT_EA_ENTRY cpu_to_le32(0xC0000051)
+#define STATUS_NO_EAS_ON_FILE cpu_to_le32(0xC0000052)
+#define STATUS_EA_CORRUPT_ERROR cpu_to_le32(0xC0000053)
+#define STATUS_FILE_LOCK_CONFLICT cpu_to_le32(0xC0000054)
+#define STATUS_LOCK_NOT_GRANTED cpu_to_le32(0xC0000055)
+#define STATUS_DELETE_PENDING cpu_to_le32(0xC0000056)
+#define STATUS_CTL_FILE_NOT_SUPPORTED cpu_to_le32(0xC0000057)
+#define STATUS_UNKNOWN_REVISION cpu_to_le32(0xC0000058)
+#define STATUS_REVISION_MISMATCH cpu_to_le32(0xC0000059)
+#define STATUS_INVALID_OWNER cpu_to_le32(0xC000005A)
+#define STATUS_INVALID_PRIMARY_GROUP cpu_to_le32(0xC000005B)
+#define STATUS_NO_IMPERSONATION_TOKEN cpu_to_le32(0xC000005C)
+#define STATUS_CANT_DISABLE_MANDATORY cpu_to_le32(0xC000005D)
+#define STATUS_NO_LOGON_SERVERS cpu_to_le32(0xC000005E)
+#define STATUS_NO_SUCH_LOGON_SESSION cpu_to_le32(0xC000005F)
+#define STATUS_NO_SUCH_PRIVILEGE cpu_to_le32(0xC0000060)
+#define STATUS_PRIVILEGE_NOT_HELD cpu_to_le32(0xC0000061)
+#define STATUS_INVALID_ACCOUNT_NAME cpu_to_le32(0xC0000062)
+#define STATUS_USER_EXISTS cpu_to_le32(0xC0000063)
+#define STATUS_NO_SUCH_USER cpu_to_le32(0xC0000064)
+#define STATUS_GROUP_EXISTS cpu_to_le32(0xC0000065)
+#define STATUS_NO_SUCH_GROUP cpu_to_le32(0xC0000066)
+#define STATUS_MEMBER_IN_GROUP cpu_to_le32(0xC0000067)
+#define STATUS_MEMBER_NOT_IN_GROUP cpu_to_le32(0xC0000068)
+#define STATUS_LAST_ADMIN cpu_to_le32(0xC0000069)
+#define STATUS_WRONG_PASSWORD cpu_to_le32(0xC000006A)
+#define STATUS_ILL_FORMED_PASSWORD cpu_to_le32(0xC000006B)
+#define STATUS_PASSWORD_RESTRICTION cpu_to_le32(0xC000006C)
+#define STATUS_LOGON_FAILURE cpu_to_le32(0xC000006D)
+#define STATUS_ACCOUNT_RESTRICTION cpu_to_le32(0xC000006E)
+#define STATUS_INVALID_LOGON_HOURS cpu_to_le32(0xC000006F)
+#define STATUS_INVALID_WORKSTATION cpu_to_le32(0xC0000070)
+#define STATUS_PASSWORD_EXPIRED cpu_to_le32(0xC0000071)
+#define STATUS_ACCOUNT_DISABLED cpu_to_le32(0xC0000072)
+#define STATUS_NONE_MAPPED cpu_to_le32(0xC0000073)
+#define STATUS_TOO_MANY_LUIDS_REQUESTED cpu_to_le32(0xC0000074)
+#define STATUS_LUIDS_EXHAUSTED cpu_to_le32(0xC0000075)
+#define STATUS_INVALID_SUB_AUTHORITY cpu_to_le32(0xC0000076)
+#define STATUS_INVALID_ACL cpu_to_le32(0xC0000077)
+#define STATUS_INVALID_SID cpu_to_le32(0xC0000078)
+#define STATUS_INVALID_SECURITY_DESCR cpu_to_le32(0xC0000079)
+#define STATUS_PROCEDURE_NOT_FOUND cpu_to_le32(0xC000007A)
+#define STATUS_INVALID_IMAGE_FORMAT cpu_to_le32(0xC000007B)
+#define STATUS_NO_TOKEN cpu_to_le32(0xC000007C)
+#define STATUS_BAD_INHERITANCE_ACL cpu_to_le32(0xC000007D)
+#define STATUS_RANGE_NOT_LOCKED cpu_to_le32(0xC000007E)
+#define STATUS_DISK_FULL cpu_to_le32(0xC000007F)
+#define STATUS_SERVER_DISABLED cpu_to_le32(0xC0000080)
+#define STATUS_SERVER_NOT_DISABLED cpu_to_le32(0xC0000081)
+#define STATUS_TOO_MANY_GUIDS_REQUESTED cpu_to_le32(0xC0000082)
+#define STATUS_GUIDS_EXHAUSTED cpu_to_le32(0xC0000083)
+#define STATUS_INVALID_ID_AUTHORITY cpu_to_le32(0xC0000084)
+#define STATUS_AGENTS_EXHAUSTED cpu_to_le32(0xC0000085)
+#define STATUS_INVALID_VOLUME_LABEL cpu_to_le32(0xC0000086)
+#define STATUS_SECTION_NOT_EXTENDED cpu_to_le32(0xC0000087)
+#define STATUS_NOT_MAPPED_DATA cpu_to_le32(0xC0000088)
+#define STATUS_RESOURCE_DATA_NOT_FOUND cpu_to_le32(0xC0000089)
+#define STATUS_RESOURCE_TYPE_NOT_FOUND cpu_to_le32(0xC000008A)
+#define STATUS_RESOURCE_NAME_NOT_FOUND cpu_to_le32(0xC000008B)
+#define STATUS_ARRAY_BOUNDS_EXCEEDED cpu_to_le32(0xC000008C)
+#define STATUS_FLOAT_DENORMAL_OPERAND cpu_to_le32(0xC000008D)
+#define STATUS_FLOAT_DIVIDE_BY_ZERO cpu_to_le32(0xC000008E)
+#define STATUS_FLOAT_INEXACT_RESULT cpu_to_le32(0xC000008F)
+#define STATUS_FLOAT_INVALID_OPERATION cpu_to_le32(0xC0000090)
+#define STATUS_FLOAT_OVERFLOW cpu_to_le32(0xC0000091)
+#define STATUS_FLOAT_STACK_CHECK cpu_to_le32(0xC0000092)
+#define STATUS_FLOAT_UNDERFLOW cpu_to_le32(0xC0000093)
+#define STATUS_INTEGER_DIVIDE_BY_ZERO cpu_to_le32(0xC0000094)
+#define STATUS_INTEGER_OVERFLOW cpu_to_le32(0xC0000095)
+#define STATUS_PRIVILEGED_INSTRUCTION cpu_to_le32(0xC0000096)
+#define STATUS_TOO_MANY_PAGING_FILES cpu_to_le32(0xC0000097)
+#define STATUS_FILE_INVALID cpu_to_le32(0xC0000098)
+#define STATUS_ALLOTTED_SPACE_EXCEEDED cpu_to_le32(0xC0000099)
+#define STATUS_INSUFFICIENT_RESOURCES cpu_to_le32(0xC000009A)
+#define STATUS_DFS_EXIT_PATH_FOUND cpu_to_le32(0xC000009B)
+#define STATUS_DEVICE_DATA_ERROR cpu_to_le32(0xC000009C)
+#define STATUS_DEVICE_NOT_CONNECTED cpu_to_le32(0xC000009D)
+#define STATUS_DEVICE_POWER_FAILURE cpu_to_le32(0xC000009E)
+#define STATUS_FREE_VM_NOT_AT_BASE cpu_to_le32(0xC000009F)
+#define STATUS_MEMORY_NOT_ALLOCATED cpu_to_le32(0xC00000A0)
+#define STATUS_WORKING_SET_QUOTA cpu_to_le32(0xC00000A1)
+#define STATUS_MEDIA_WRITE_PROTECTED cpu_to_le32(0xC00000A2)
+#define STATUS_DEVICE_NOT_READY cpu_to_le32(0xC00000A3)
+#define STATUS_INVALID_GROUP_ATTRIBUTES cpu_to_le32(0xC00000A4)
+#define STATUS_BAD_IMPERSONATION_LEVEL cpu_to_le32(0xC00000A5)
+#define STATUS_CANT_OPEN_ANONYMOUS cpu_to_le32(0xC00000A6)
+#define STATUS_BAD_VALIDATION_CLASS cpu_to_le32(0xC00000A7)
+#define STATUS_BAD_TOKEN_TYPE cpu_to_le32(0xC00000A8)
+#define STATUS_BAD_MASTER_BOOT_RECORD cpu_to_le32(0xC00000A9)
+#define STATUS_INSTRUCTION_MISALIGNMENT cpu_to_le32(0xC00000AA)
+#define STATUS_INSTANCE_NOT_AVAILABLE cpu_to_le32(0xC00000AB)
+#define STATUS_PIPE_NOT_AVAILABLE cpu_to_le32(0xC00000AC)
+#define STATUS_INVALID_PIPE_STATE cpu_to_le32(0xC00000AD)
+#define STATUS_PIPE_BUSY cpu_to_le32(0xC00000AE)
+#define STATUS_ILLEGAL_FUNCTION cpu_to_le32(0xC00000AF)
+#define STATUS_PIPE_DISCONNECTED cpu_to_le32(0xC00000B0)
+#define STATUS_PIPE_CLOSING cpu_to_le32(0xC00000B1)
+#define STATUS_PIPE_CONNECTED cpu_to_le32(0xC00000B2)
+#define STATUS_PIPE_LISTENING cpu_to_le32(0xC00000B3)
+#define STATUS_INVALID_READ_MODE cpu_to_le32(0xC00000B4)
+#define STATUS_IO_TIMEOUT cpu_to_le32(0xC00000B5)
+#define STATUS_FILE_FORCED_CLOSED cpu_to_le32(0xC00000B6)
+#define STATUS_PROFILING_NOT_STARTED cpu_to_le32(0xC00000B7)
+#define STATUS_PROFILING_NOT_STOPPED cpu_to_le32(0xC00000B8)
+#define STATUS_COULD_NOT_INTERPRET cpu_to_le32(0xC00000B9)
+#define STATUS_FILE_IS_A_DIRECTORY cpu_to_le32(0xC00000BA)
+#define STATUS_NOT_SUPPORTED cpu_to_le32(0xC00000BB)
+#define STATUS_REMOTE_NOT_LISTENING cpu_to_le32(0xC00000BC)
+#define STATUS_DUPLICATE_NAME cpu_to_le32(0xC00000BD)
+#define STATUS_BAD_NETWORK_PATH cpu_to_le32(0xC00000BE)
+#define STATUS_NETWORK_BUSY cpu_to_le32(0xC00000BF)
+#define STATUS_DEVICE_DOES_NOT_EXIST cpu_to_le32(0xC00000C0)
+#define STATUS_TOO_MANY_COMMANDS cpu_to_le32(0xC00000C1)
+#define STATUS_ADAPTER_HARDWARE_ERROR cpu_to_le32(0xC00000C2)
+#define STATUS_INVALID_NETWORK_RESPONSE cpu_to_le32(0xC00000C3)
+#define STATUS_UNEXPECTED_NETWORK_ERROR cpu_to_le32(0xC00000C4)
+#define STATUS_BAD_REMOTE_ADAPTER cpu_to_le32(0xC00000C5)
+#define STATUS_PRINT_QUEUE_FULL cpu_to_le32(0xC00000C6)
+#define STATUS_NO_SPOOL_SPACE cpu_to_le32(0xC00000C7)
+#define STATUS_PRINT_CANCELLED cpu_to_le32(0xC00000C8)
+#define STATUS_NETWORK_NAME_DELETED cpu_to_le32(0xC00000C9)
+#define STATUS_NETWORK_ACCESS_DENIED cpu_to_le32(0xC00000CA)
+#define STATUS_BAD_DEVICE_TYPE cpu_to_le32(0xC00000CB)
+#define STATUS_BAD_NETWORK_NAME cpu_to_le32(0xC00000CC)
+#define STATUS_TOO_MANY_NAMES cpu_to_le32(0xC00000CD)
+#define STATUS_TOO_MANY_SESSIONS cpu_to_le32(0xC00000CE)
+#define STATUS_SHARING_PAUSED cpu_to_le32(0xC00000CF)
+#define STATUS_REQUEST_NOT_ACCEPTED cpu_to_le32(0xC00000D0)
+#define STATUS_REDIRECTOR_PAUSED cpu_to_le32(0xC00000D1)
+#define STATUS_NET_WRITE_FAULT cpu_to_le32(0xC00000D2)
+#define STATUS_PROFILING_AT_LIMIT cpu_to_le32(0xC00000D3)
+#define STATUS_NOT_SAME_DEVICE cpu_to_le32(0xC00000D4)
+#define STATUS_FILE_RENAMED cpu_to_le32(0xC00000D5)
+#define STATUS_VIRTUAL_CIRCUIT_CLOSED cpu_to_le32(0xC00000D6)
+#define STATUS_NO_SECURITY_ON_OBJECT cpu_to_le32(0xC00000D7)
+#define STATUS_CANT_WAIT cpu_to_le32(0xC00000D8)
+#define STATUS_PIPE_EMPTY cpu_to_le32(0xC00000D9)
+#define STATUS_CANT_ACCESS_DOMAIN_INFO cpu_to_le32(0xC00000DA)
+#define STATUS_CANT_TERMINATE_SELF cpu_to_le32(0xC00000DB)
+#define STATUS_INVALID_SERVER_STATE cpu_to_le32(0xC00000DC)
+#define STATUS_INVALID_DOMAIN_STATE cpu_to_le32(0xC00000DD)
+#define STATUS_INVALID_DOMAIN_ROLE cpu_to_le32(0xC00000DE)
+#define STATUS_NO_SUCH_DOMAIN cpu_to_le32(0xC00000DF)
+#define STATUS_DOMAIN_EXISTS cpu_to_le32(0xC00000E0)
+#define STATUS_DOMAIN_LIMIT_EXCEEDED cpu_to_le32(0xC00000E1)
+#define STATUS_OPLOCK_NOT_GRANTED cpu_to_le32(0xC00000E2)
+#define STATUS_INVALID_OPLOCK_PROTOCOL cpu_to_le32(0xC00000E3)
+#define STATUS_INTERNAL_DB_CORRUPTION cpu_to_le32(0xC00000E4)
+#define STATUS_INTERNAL_ERROR cpu_to_le32(0xC00000E5)
+#define STATUS_GENERIC_NOT_MAPPED cpu_to_le32(0xC00000E6)
+#define STATUS_BAD_DESCRIPTOR_FORMAT cpu_to_le32(0xC00000E7)
+#define STATUS_INVALID_USER_BUFFER cpu_to_le32(0xC00000E8)
+#define STATUS_UNEXPECTED_IO_ERROR cpu_to_le32(0xC00000E9)
+#define STATUS_UNEXPECTED_MM_CREATE_ERR cpu_to_le32(0xC00000EA)
+#define STATUS_UNEXPECTED_MM_MAP_ERROR cpu_to_le32(0xC00000EB)
+#define STATUS_UNEXPECTED_MM_EXTEND_ERR cpu_to_le32(0xC00000EC)
+#define STATUS_NOT_LOGON_PROCESS cpu_to_le32(0xC00000ED)
+#define STATUS_LOGON_SESSION_EXISTS cpu_to_le32(0xC00000EE)
+#define STATUS_INVALID_PARAMETER_1 cpu_to_le32(0xC00000EF)
+#define STATUS_INVALID_PARAMETER_2 cpu_to_le32(0xC00000F0)
+#define STATUS_INVALID_PARAMETER_3 cpu_to_le32(0xC00000F1)
+#define STATUS_INVALID_PARAMETER_4 cpu_to_le32(0xC00000F2)
+#define STATUS_INVALID_PARAMETER_5 cpu_to_le32(0xC00000F3)
+#define STATUS_INVALID_PARAMETER_6 cpu_to_le32(0xC00000F4)
+#define STATUS_INVALID_PARAMETER_7 cpu_to_le32(0xC00000F5)
+#define STATUS_INVALID_PARAMETER_8 cpu_to_le32(0xC00000F6)
+#define STATUS_INVALID_PARAMETER_9 cpu_to_le32(0xC00000F7)
+#define STATUS_INVALID_PARAMETER_10 cpu_to_le32(0xC00000F8)
+#define STATUS_INVALID_PARAMETER_11 cpu_to_le32(0xC00000F9)
+#define STATUS_INVALID_PARAMETER_12 cpu_to_le32(0xC00000FA)
+#define STATUS_REDIRECTOR_NOT_STARTED cpu_to_le32(0xC00000FB)
+#define STATUS_REDIRECTOR_STARTED cpu_to_le32(0xC00000FC)
+#define STATUS_STACK_OVERFLOW cpu_to_le32(0xC00000FD)
+#define STATUS_NO_SUCH_PACKAGE cpu_to_le32(0xC00000FE)
+#define STATUS_BAD_FUNCTION_TABLE cpu_to_le32(0xC00000FF)
+#define STATUS_VARIABLE_NOT_FOUND cpu_to_le32(0xC0000100)
+#define STATUS_DIRECTORY_NOT_EMPTY cpu_to_le32(0xC0000101)
+#define STATUS_FILE_CORRUPT_ERROR cpu_to_le32(0xC0000102)
+#define STATUS_NOT_A_DIRECTORY cpu_to_le32(0xC0000103)
+#define STATUS_BAD_LOGON_SESSION_STATE cpu_to_le32(0xC0000104)
+#define STATUS_LOGON_SESSION_COLLISION cpu_to_le32(0xC0000105)
+#define STATUS_NAME_TOO_LONG cpu_to_le32(0xC0000106)
+#define STATUS_FILES_OPEN cpu_to_le32(0xC0000107)
+#define STATUS_CONNECTION_IN_USE cpu_to_le32(0xC0000108)
+#define STATUS_MESSAGE_NOT_FOUND cpu_to_le32(0xC0000109)
+#define STATUS_PROCESS_IS_TERMINATING cpu_to_le32(0xC000010A)
+#define STATUS_INVALID_LOGON_TYPE cpu_to_le32(0xC000010B)
+#define STATUS_NO_GUID_TRANSLATION cpu_to_le32(0xC000010C)
+#define STATUS_CANNOT_IMPERSONATE cpu_to_le32(0xC000010D)
+#define STATUS_IMAGE_ALREADY_LOADED cpu_to_le32(0xC000010E)
+#define STATUS_ABIOS_NOT_PRESENT cpu_to_le32(0xC000010F)
+#define STATUS_ABIOS_LID_NOT_EXIST cpu_to_le32(0xC0000110)
+#define STATUS_ABIOS_LID_ALREADY_OWNED cpu_to_le32(0xC0000111)
+#define STATUS_ABIOS_NOT_LID_OWNER cpu_to_le32(0xC0000112)
+#define STATUS_ABIOS_INVALID_COMMAND cpu_to_le32(0xC0000113)
+#define STATUS_ABIOS_INVALID_LID cpu_to_le32(0xC0000114)
+#define STATUS_ABIOS_SELECTOR_NOT_AVAILABLE cpu_to_le32(0xC0000115)
+#define STATUS_ABIOS_INVALID_SELECTOR cpu_to_le32(0xC0000116)
+#define STATUS_NO_LDT cpu_to_le32(0xC0000117)
+#define STATUS_INVALID_LDT_SIZE cpu_to_le32(0xC0000118)
+#define STATUS_INVALID_LDT_OFFSET cpu_to_le32(0xC0000119)
+#define STATUS_INVALID_LDT_DESCRIPTOR cpu_to_le32(0xC000011A)
+#define STATUS_INVALID_IMAGE_NE_FORMAT cpu_to_le32(0xC000011B)
+#define STATUS_RXACT_INVALID_STATE cpu_to_le32(0xC000011C)
+#define STATUS_RXACT_COMMIT_FAILURE cpu_to_le32(0xC000011D)
+#define STATUS_MAPPED_FILE_SIZE_ZERO cpu_to_le32(0xC000011E)
+#define STATUS_TOO_MANY_OPENED_FILES cpu_to_le32(0xC000011F)
+#define STATUS_CANCELLED cpu_to_le32(0xC0000120)
+#define STATUS_CANNOT_DELETE cpu_to_le32(0xC0000121)
+#define STATUS_INVALID_COMPUTER_NAME cpu_to_le32(0xC0000122)
+#define STATUS_FILE_DELETED cpu_to_le32(0xC0000123)
+#define STATUS_SPECIAL_ACCOUNT cpu_to_le32(0xC0000124)
+#define STATUS_SPECIAL_GROUP cpu_to_le32(0xC0000125)
+#define STATUS_SPECIAL_USER cpu_to_le32(0xC0000126)
+#define STATUS_MEMBERS_PRIMARY_GROUP cpu_to_le32(0xC0000127)
+#define STATUS_FILE_CLOSED cpu_to_le32(0xC0000128)
+#define STATUS_TOO_MANY_THREADS cpu_to_le32(0xC0000129)
+#define STATUS_THREAD_NOT_IN_PROCESS cpu_to_le32(0xC000012A)
+#define STATUS_TOKEN_ALREADY_IN_USE cpu_to_le32(0xC000012B)
+#define STATUS_PAGEFILE_QUOTA_EXCEEDED cpu_to_le32(0xC000012C)
+#define STATUS_COMMITMENT_LIMIT cpu_to_le32(0xC000012D)
+#define STATUS_INVALID_IMAGE_LE_FORMAT cpu_to_le32(0xC000012E)
+#define STATUS_INVALID_IMAGE_NOT_MZ cpu_to_le32(0xC000012F)
+#define STATUS_INVALID_IMAGE_PROTECT cpu_to_le32(0xC0000130)
+#define STATUS_INVALID_IMAGE_WIN_16 cpu_to_le32(0xC0000131)
+#define STATUS_LOGON_SERVER_CONFLICT cpu_to_le32(0xC0000132)
+#define STATUS_TIME_DIFFERENCE_AT_DC cpu_to_le32(0xC0000133)
+#define STATUS_SYNCHRONIZATION_REQUIRED cpu_to_le32(0xC0000134)
+#define STATUS_DLL_NOT_FOUND cpu_to_le32(0xC0000135)
+#define STATUS_OPEN_FAILED cpu_to_le32(0xC0000136)
+#define STATUS_IO_PRIVILEGE_FAILED cpu_to_le32(0xC0000137)
+#define STATUS_ORDINAL_NOT_FOUND cpu_to_le32(0xC0000138)
+#define STATUS_ENTRYPOINT_NOT_FOUND cpu_to_le32(0xC0000139)
+#define STATUS_CONTROL_C_EXIT cpu_to_le32(0xC000013A)
+#define STATUS_LOCAL_DISCONNECT cpu_to_le32(0xC000013B)
+#define STATUS_REMOTE_DISCONNECT cpu_to_le32(0xC000013C)
+#define STATUS_REMOTE_RESOURCES cpu_to_le32(0xC000013D)
+#define STATUS_LINK_FAILED cpu_to_le32(0xC000013E)
+#define STATUS_LINK_TIMEOUT cpu_to_le32(0xC000013F)
+#define STATUS_INVALID_CONNECTION cpu_to_le32(0xC0000140)
+#define STATUS_INVALID_ADDRESS cpu_to_le32(0xC0000141)
+#define STATUS_DLL_INIT_FAILED cpu_to_le32(0xC0000142)
+#define STATUS_MISSING_SYSTEMFILE cpu_to_le32(0xC0000143)
+#define STATUS_UNHANDLED_EXCEPTION cpu_to_le32(0xC0000144)
+#define STATUS_APP_INIT_FAILURE cpu_to_le32(0xC0000145)
+#define STATUS_PAGEFILE_CREATE_FAILED cpu_to_le32(0xC0000146)
+#define STATUS_NO_PAGEFILE cpu_to_le32(0xC0000147)
+#define STATUS_INVALID_LEVEL cpu_to_le32(0xC0000148)
+#define STATUS_WRONG_PASSWORD_CORE cpu_to_le32(0xC0000149)
+#define STATUS_ILLEGAL_FLOAT_CONTEXT cpu_to_le32(0xC000014A)
+#define STATUS_PIPE_BROKEN cpu_to_le32(0xC000014B)
+#define STATUS_REGISTRY_CORRUPT cpu_to_le32(0xC000014C)
+#define STATUS_REGISTRY_IO_FAILED cpu_to_le32(0xC000014D)
+#define STATUS_NO_EVENT_PAIR cpu_to_le32(0xC000014E)
+#define STATUS_UNRECOGNIZED_VOLUME cpu_to_le32(0xC000014F)
+#define STATUS_SERIAL_NO_DEVICE_INITED cpu_to_le32(0xC0000150)
+#define STATUS_NO_SUCH_ALIAS cpu_to_le32(0xC0000151)
+#define STATUS_MEMBER_NOT_IN_ALIAS cpu_to_le32(0xC0000152)
+#define STATUS_MEMBER_IN_ALIAS cpu_to_le32(0xC0000153)
+#define STATUS_ALIAS_EXISTS cpu_to_le32(0xC0000154)
+#define STATUS_LOGON_NOT_GRANTED cpu_to_le32(0xC0000155)
+#define STATUS_TOO_MANY_SECRETS cpu_to_le32(0xC0000156)
+#define STATUS_SECRET_TOO_LONG cpu_to_le32(0xC0000157)
+#define STATUS_INTERNAL_DB_ERROR cpu_to_le32(0xC0000158)
+#define STATUS_FULLSCREEN_MODE cpu_to_le32(0xC0000159)
+#define STATUS_TOO_MANY_CONTEXT_IDS cpu_to_le32(0xC000015A)
+#define STATUS_LOGON_TYPE_NOT_GRANTED cpu_to_le32(0xC000015B)
+#define STATUS_NOT_REGISTRY_FILE cpu_to_le32(0xC000015C)
+#define STATUS_NT_CROSS_ENCRYPTION_REQUIRED cpu_to_le32(0xC000015D)
+#define STATUS_DOMAIN_CTRLR_CONFIG_ERROR cpu_to_le32(0xC000015E)
+#define STATUS_FT_MISSING_MEMBER cpu_to_le32(0xC000015F)
+#define STATUS_ILL_FORMED_SERVICE_ENTRY cpu_to_le32(0xC0000160)
+#define STATUS_ILLEGAL_CHARACTER cpu_to_le32(0xC0000161)
+#define STATUS_UNMAPPABLE_CHARACTER cpu_to_le32(0xC0000162)
+#define STATUS_UNDEFINED_CHARACTER cpu_to_le32(0xC0000163)
+#define STATUS_FLOPPY_VOLUME cpu_to_le32(0xC0000164)
+#define STATUS_FLOPPY_ID_MARK_NOT_FOUND cpu_to_le32(0xC0000165)
+#define STATUS_FLOPPY_WRONG_CYLINDER cpu_to_le32(0xC0000166)
+#define STATUS_FLOPPY_UNKNOWN_ERROR cpu_to_le32(0xC0000167)
+#define STATUS_FLOPPY_BAD_REGISTERS cpu_to_le32(0xC0000168)
+#define STATUS_DISK_RECALIBRATE_FAILED cpu_to_le32(0xC0000169)
+#define STATUS_DISK_OPERATION_FAILED cpu_to_le32(0xC000016A)
+#define STATUS_DISK_RESET_FAILED cpu_to_le32(0xC000016B)
+#define STATUS_SHARED_IRQ_BUSY cpu_to_le32(0xC000016C)
+#define STATUS_FT_ORPHANING cpu_to_le32(0xC000016D)
+#define STATUS_BIOS_FAILED_TO_CONNECT_INTERRUPT cpu_to_le32(0xC000016E)
+#define STATUS_PARTITION_FAILURE cpu_to_le32(0xC0000172)
+#define STATUS_INVALID_BLOCK_LENGTH cpu_to_le32(0xC0000173)
+#define STATUS_DEVICE_NOT_PARTITIONED cpu_to_le32(0xC0000174)
+#define STATUS_UNABLE_TO_LOCK_MEDIA cpu_to_le32(0xC0000175)
+#define STATUS_UNABLE_TO_UNLOAD_MEDIA cpu_to_le32(0xC0000176)
+#define STATUS_EOM_OVERFLOW cpu_to_le32(0xC0000177)
+#define STATUS_NO_MEDIA cpu_to_le32(0xC0000178)
+#define STATUS_NO_SUCH_MEMBER cpu_to_le32(0xC000017A)
+#define STATUS_INVALID_MEMBER cpu_to_le32(0xC000017B)
+#define STATUS_KEY_DELETED cpu_to_le32(0xC000017C)
+#define STATUS_NO_LOG_SPACE cpu_to_le32(0xC000017D)
+#define STATUS_TOO_MANY_SIDS cpu_to_le32(0xC000017E)
+#define STATUS_LM_CROSS_ENCRYPTION_REQUIRED cpu_to_le32(0xC000017F)
+#define STATUS_KEY_HAS_CHILDREN cpu_to_le32(0xC0000180)
+#define STATUS_CHILD_MUST_BE_VOLATILE cpu_to_le32(0xC0000181)
+#define STATUS_DEVICE_CONFIGURATION_ERROR cpu_to_le32(0xC0000182)
+#define STATUS_DRIVER_INTERNAL_ERROR cpu_to_le32(0xC0000183)
+#define STATUS_INVALID_DEVICE_STATE cpu_to_le32(0xC0000184)
+#define STATUS_IO_DEVICE_ERROR cpu_to_le32(0xC0000185)
+#define STATUS_DEVICE_PROTOCOL_ERROR cpu_to_le32(0xC0000186)
+#define STATUS_BACKUP_CONTROLLER cpu_to_le32(0xC0000187)
+#define STATUS_LOG_FILE_FULL cpu_to_le32(0xC0000188)
+#define STATUS_TOO_LATE cpu_to_le32(0xC0000189)
+#define STATUS_NO_TRUST_LSA_SECRET cpu_to_le32(0xC000018A)
+#define STATUS_NO_TRUST_SAM_ACCOUNT cpu_to_le32(0xC000018B)
+#define STATUS_TRUSTED_DOMAIN_FAILURE cpu_to_le32(0xC000018C)
+#define STATUS_TRUSTED_RELATIONSHIP_FAILURE cpu_to_le32(0xC000018D)
+#define STATUS_EVENTLOG_FILE_CORRUPT cpu_to_le32(0xC000018E)
+#define STATUS_EVENTLOG_CANT_START cpu_to_le32(0xC000018F)
+#define STATUS_TRUST_FAILURE cpu_to_le32(0xC0000190)
+#define STATUS_MUTANT_LIMIT_EXCEEDED cpu_to_le32(0xC0000191)
+#define STATUS_NETLOGON_NOT_STARTED cpu_to_le32(0xC0000192)
+#define STATUS_ACCOUNT_EXPIRED cpu_to_le32(0xC0000193)
+#define STATUS_POSSIBLE_DEADLOCK cpu_to_le32(0xC0000194)
+#define STATUS_NETWORK_CREDENTIAL_CONFLICT cpu_to_le32(0xC0000195)
+#define STATUS_REMOTE_SESSION_LIMIT cpu_to_le32(0xC0000196)
+#define STATUS_EVENTLOG_FILE_CHANGED cpu_to_le32(0xC0000197)
+#define STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT cpu_to_le32(0xC0000198)
+#define STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT cpu_to_le32(0xC0000199)
+#define STATUS_NOLOGON_SERVER_TRUST_ACCOUNT cpu_to_le32(0xC000019A)
+#define STATUS_DOMAIN_TRUST_INCONSISTENT cpu_to_le32(0xC000019B)
+#define STATUS_FS_DRIVER_REQUIRED cpu_to_le32(0xC000019C)
+#define STATUS_IMAGE_ALREADY_LOADED_AS_DLL cpu_to_le32(0xC000019D)
+#define STATUS_NETWORK_OPEN_RESTRICTION cpu_to_le32(0xC0000201)
+#define STATUS_NO_USER_SESSION_KEY cpu_to_le32(0xC0000202)
+#define STATUS_USER_SESSION_DELETED cpu_to_le32(0xC0000203)
+#define STATUS_RESOURCE_LANG_NOT_FOUND cpu_to_le32(0xC0000204)
+#define STATUS_INSUFF_SERVER_RESOURCES cpu_to_le32(0xC0000205)
+#define STATUS_INVALID_BUFFER_SIZE cpu_to_le32(0xC0000206)
+#define STATUS_INVALID_ADDRESS_COMPONENT cpu_to_le32(0xC0000207)
+#define STATUS_INVALID_ADDRESS_WILDCARD cpu_to_le32(0xC0000208)
+#define STATUS_TOO_MANY_ADDRESSES cpu_to_le32(0xC0000209)
+#define STATUS_ADDRESS_ALREADY_EXISTS cpu_to_le32(0xC000020A)
+#define STATUS_ADDRESS_CLOSED cpu_to_le32(0xC000020B)
+#define STATUS_CONNECTION_DISCONNECTED cpu_to_le32(0xC000020C)
+#define STATUS_CONNECTION_RESET cpu_to_le32(0xC000020D)
+#define STATUS_TOO_MANY_NODES cpu_to_le32(0xC000020E)
+#define STATUS_TRANSACTION_ABORTED cpu_to_le32(0xC000020F)
+#define STATUS_TRANSACTION_TIMED_OUT cpu_to_le32(0xC0000210)
+#define STATUS_TRANSACTION_NO_RELEASE cpu_to_le32(0xC0000211)
+#define STATUS_TRANSACTION_NO_MATCH cpu_to_le32(0xC0000212)
+#define STATUS_TRANSACTION_RESPONDED cpu_to_le32(0xC0000213)
+#define STATUS_TRANSACTION_INVALID_ID cpu_to_le32(0xC0000214)
+#define STATUS_TRANSACTION_INVALID_TYPE cpu_to_le32(0xC0000215)
+#define STATUS_NOT_SERVER_SESSION cpu_to_le32(0xC0000216)
+#define STATUS_NOT_CLIENT_SESSION cpu_to_le32(0xC0000217)
+#define STATUS_CANNOT_LOAD_REGISTRY_FILE cpu_to_le32(0xC0000218)
+#define STATUS_DEBUG_ATTACH_FAILED cpu_to_le32(0xC0000219)
+#define STATUS_SYSTEM_PROCESS_TERMINATED cpu_to_le32(0xC000021A)
+#define STATUS_DATA_NOT_ACCEPTED cpu_to_le32(0xC000021B)
+#define STATUS_NO_BROWSER_SERVERS_FOUND cpu_to_le32(0xC000021C)
+#define STATUS_VDM_HARD_ERROR cpu_to_le32(0xC000021D)
+#define STATUS_DRIVER_CANCEL_TIMEOUT cpu_to_le32(0xC000021E)
+#define STATUS_REPLY_MESSAGE_MISMATCH cpu_to_le32(0xC000021F)
+#define STATUS_MAPPED_ALIGNMENT cpu_to_le32(0xC0000220)
+#define STATUS_IMAGE_CHECKSUM_MISMATCH cpu_to_le32(0xC0000221)
+#define STATUS_LOST_WRITEBEHIND_DATA cpu_to_le32(0xC0000222)
+#define STATUS_CLIENT_SERVER_PARAMETERS_INVALID cpu_to_le32(0xC0000223)
+#define STATUS_PASSWORD_MUST_CHANGE cpu_to_le32(0xC0000224)
+#define STATUS_NOT_FOUND cpu_to_le32(0xC0000225)
+#define STATUS_NOT_TINY_STREAM cpu_to_le32(0xC0000226)
+#define STATUS_RECOVERY_FAILURE cpu_to_le32(0xC0000227)
+#define STATUS_STACK_OVERFLOW_READ cpu_to_le32(0xC0000228)
+#define STATUS_FAIL_CHECK cpu_to_le32(0xC0000229)
+#define STATUS_DUPLICATE_OBJECTID cpu_to_le32(0xC000022A)
+#define STATUS_OBJECTID_EXISTS cpu_to_le32(0xC000022B)
+#define STATUS_CONVERT_TO_LARGE cpu_to_le32(0xC000022C)
+#define STATUS_RETRY cpu_to_le32(0xC000022D)
+#define STATUS_FOUND_OUT_OF_SCOPE cpu_to_le32(0xC000022E)
+#define STATUS_ALLOCATE_BUCKET cpu_to_le32(0xC000022F)
+#define STATUS_PROPSET_NOT_FOUND cpu_to_le32(0xC0000230)
+#define STATUS_MARSHALL_OVERFLOW cpu_to_le32(0xC0000231)
+#define STATUS_INVALID_VARIANT cpu_to_le32(0xC0000232)
+#define STATUS_DOMAIN_CONTROLLER_NOT_FOUND cpu_to_le32(0xC0000233)
+#define STATUS_ACCOUNT_LOCKED_OUT cpu_to_le32(0xC0000234)
+#define STATUS_HANDLE_NOT_CLOSABLE cpu_to_le32(0xC0000235)
+#define STATUS_CONNECTION_REFUSED cpu_to_le32(0xC0000236)
+#define STATUS_GRACEFUL_DISCONNECT cpu_to_le32(0xC0000237)
+#define STATUS_ADDRESS_ALREADY_ASSOCIATED cpu_to_le32(0xC0000238)
+#define STATUS_ADDRESS_NOT_ASSOCIATED cpu_to_le32(0xC0000239)
+#define STATUS_CONNECTION_INVALID cpu_to_le32(0xC000023A)
+#define STATUS_CONNECTION_ACTIVE cpu_to_le32(0xC000023B)
+#define STATUS_NETWORK_UNREACHABLE cpu_to_le32(0xC000023C)
+#define STATUS_HOST_UNREACHABLE cpu_to_le32(0xC000023D)
+#define STATUS_PROTOCOL_UNREACHABLE cpu_to_le32(0xC000023E)
+#define STATUS_PORT_UNREACHABLE cpu_to_le32(0xC000023F)
+#define STATUS_REQUEST_ABORTED cpu_to_le32(0xC0000240)
+#define STATUS_CONNECTION_ABORTED cpu_to_le32(0xC0000241)
+#define STATUS_BAD_COMPRESSION_BUFFER cpu_to_le32(0xC0000242)
+#define STATUS_USER_MAPPED_FILE cpu_to_le32(0xC0000243)
+#define STATUS_AUDIT_FAILED cpu_to_le32(0xC0000244)
+#define STATUS_TIMER_RESOLUTION_NOT_SET cpu_to_le32(0xC0000245)
+#define STATUS_CONNECTION_COUNT_LIMIT cpu_to_le32(0xC0000246)
+#define STATUS_LOGIN_TIME_RESTRICTION cpu_to_le32(0xC0000247)
+#define STATUS_LOGIN_WKSTA_RESTRICTION cpu_to_le32(0xC0000248)
+#define STATUS_IMAGE_MP_UP_MISMATCH cpu_to_le32(0xC0000249)
+#define STATUS_INSUFFICIENT_LOGON_INFO cpu_to_le32(0xC0000250)
+#define STATUS_BAD_DLL_ENTRYPOINT cpu_to_le32(0xC0000251)
+#define STATUS_BAD_SERVICE_ENTRYPOINT cpu_to_le32(0xC0000252)
+#define STATUS_LPC_REPLY_LOST cpu_to_le32(0xC0000253)
+#define STATUS_IP_ADDRESS_CONFLICT1 cpu_to_le32(0xC0000254)
+#define STATUS_IP_ADDRESS_CONFLICT2 cpu_to_le32(0xC0000255)
+#define STATUS_REGISTRY_QUOTA_LIMIT cpu_to_le32(0xC0000256)
+#define STATUS_PATH_NOT_COVERED cpu_to_le32(0xC0000257)
+#define STATUS_NO_CALLBACK_ACTIVE cpu_to_le32(0xC0000258)
+#define STATUS_LICENSE_QUOTA_EXCEEDED cpu_to_le32(0xC0000259)
+#define STATUS_PWD_TOO_SHORT cpu_to_le32(0xC000025A)
+#define STATUS_PWD_TOO_RECENT cpu_to_le32(0xC000025B)
+#define STATUS_PWD_HISTORY_CONFLICT cpu_to_le32(0xC000025C)
+#define STATUS_PLUGPLAY_NO_DEVICE cpu_to_le32(0xC000025E)
+#define STATUS_UNSUPPORTED_COMPRESSION cpu_to_le32(0xC000025F)
+#define STATUS_INVALID_HW_PROFILE cpu_to_le32(0xC0000260)
+#define STATUS_INVALID_PLUGPLAY_DEVICE_PATH cpu_to_le32(0xC0000261)
+#define STATUS_DRIVER_ORDINAL_NOT_FOUND cpu_to_le32(0xC0000262)
+#define STATUS_DRIVER_ENTRYPOINT_NOT_FOUND cpu_to_le32(0xC0000263)
+#define STATUS_RESOURCE_NOT_OWNED cpu_to_le32(0xC0000264)
+#define STATUS_TOO_MANY_LINKS cpu_to_le32(0xC0000265)
+#define STATUS_QUOTA_LIST_INCONSISTENT cpu_to_le32(0xC0000266)
+#define STATUS_FILE_IS_OFFLINE cpu_to_le32(0xC0000267)
+#define STATUS_EVALUATION_EXPIRATION cpu_to_le32(0xC0000268)
+#define STATUS_ILLEGAL_DLL_RELOCATION cpu_to_le32(0xC0000269)
+#define STATUS_LICENSE_VIOLATION cpu_to_le32(0xC000026A)
+#define STATUS_DLL_INIT_FAILED_LOGOFF cpu_to_le32(0xC000026B)
+#define STATUS_DRIVER_UNABLE_TO_LOAD cpu_to_le32(0xC000026C)
+#define STATUS_DFS_UNAVAILABLE cpu_to_le32(0xC000026D)
+#define STATUS_VOLUME_DISMOUNTED cpu_to_le32(0xC000026E)
+#define STATUS_WX86_INTERNAL_ERROR cpu_to_le32(0xC000026F)
+#define STATUS_WX86_FLOAT_STACK_CHECK cpu_to_le32(0xC0000270)
+#define STATUS_VALIDATE_CONTINUE cpu_to_le32(0xC0000271)
+#define STATUS_NO_MATCH cpu_to_le32(0xC0000272)
+#define STATUS_NO_MORE_MATCHES cpu_to_le32(0xC0000273)
+#define STATUS_NOT_A_REPARSE_POINT cpu_to_le32(0xC0000275)
+#define STATUS_IO_REPARSE_TAG_INVALID cpu_to_le32(0xC0000276)
+#define STATUS_IO_REPARSE_TAG_MISMATCH cpu_to_le32(0xC0000277)
+#define STATUS_IO_REPARSE_DATA_INVALID cpu_to_le32(0xC0000278)
+#define STATUS_IO_REPARSE_TAG_NOT_HANDLED cpu_to_le32(0xC0000279)
+#define STATUS_REPARSE_POINT_NOT_RESOLVED cpu_to_le32(0xC0000280)
+#define STATUS_DIRECTORY_IS_A_REPARSE_POINT cpu_to_le32(0xC0000281)
+#define STATUS_RANGE_LIST_CONFLICT cpu_to_le32(0xC0000282)
+#define STATUS_SOURCE_ELEMENT_EMPTY cpu_to_le32(0xC0000283)
+#define STATUS_DESTINATION_ELEMENT_FULL cpu_to_le32(0xC0000284)
+#define STATUS_ILLEGAL_ELEMENT_ADDRESS cpu_to_le32(0xC0000285)
+#define STATUS_MAGAZINE_NOT_PRESENT cpu_to_le32(0xC0000286)
+#define STATUS_REINITIALIZATION_NEEDED cpu_to_le32(0xC0000287)
+#define STATUS_ENCRYPTION_FAILED cpu_to_le32(0xC000028A)
+#define STATUS_DECRYPTION_FAILED cpu_to_le32(0xC000028B)
+#define STATUS_RANGE_NOT_FOUND cpu_to_le32(0xC000028C)
+#define STATUS_NO_RECOVERY_POLICY cpu_to_le32(0xC000028D)
+#define STATUS_NO_EFS cpu_to_le32(0xC000028E)
+#define STATUS_WRONG_EFS cpu_to_le32(0xC000028F)
+#define STATUS_NO_USER_KEYS cpu_to_le32(0xC0000290)
+#define STATUS_FILE_NOT_ENCRYPTED cpu_to_le32(0xC0000291)
+#define STATUS_NOT_EXPORT_FORMAT cpu_to_le32(0xC0000292)
+#define STATUS_FILE_ENCRYPTED cpu_to_le32(0xC0000293)
+#define STATUS_WMI_GUID_NOT_FOUND cpu_to_le32(0xC0000295)
+#define STATUS_WMI_INSTANCE_NOT_FOUND cpu_to_le32(0xC0000296)
+#define STATUS_WMI_ITEMID_NOT_FOUND cpu_to_le32(0xC0000297)
+#define STATUS_WMI_TRY_AGAIN cpu_to_le32(0xC0000298)
+#define STATUS_SHARED_POLICY cpu_to_le32(0xC0000299)
+#define STATUS_POLICY_OBJECT_NOT_FOUND cpu_to_le32(0xC000029A)
+#define STATUS_POLICY_ONLY_IN_DS cpu_to_le32(0xC000029B)
+#define STATUS_VOLUME_NOT_UPGRADED cpu_to_le32(0xC000029C)
+#define STATUS_REMOTE_STORAGE_NOT_ACTIVE cpu_to_le32(0xC000029D)
+#define STATUS_REMOTE_STORAGE_MEDIA_ERROR cpu_to_le32(0xC000029E)
+#define STATUS_NO_TRACKING_SERVICE cpu_to_le32(0xC000029F)
+#define STATUS_SERVER_SID_MISMATCH cpu_to_le32(0xC00002A0)
+#define STATUS_DS_NO_ATTRIBUTE_OR_VALUE cpu_to_le32(0xC00002A1)
+#define STATUS_DS_INVALID_ATTRIBUTE_SYNTAX cpu_to_le32(0xC00002A2)
+#define STATUS_DS_ATTRIBUTE_TYPE_UNDEFINED cpu_to_le32(0xC00002A3)
+#define STATUS_DS_ATTRIBUTE_OR_VALUE_EXISTS cpu_to_le32(0xC00002A4)
+#define STATUS_DS_BUSY cpu_to_le32(0xC00002A5)
+#define STATUS_DS_UNAVAILABLE cpu_to_le32(0xC00002A6)
+#define STATUS_DS_NO_RIDS_ALLOCATED cpu_to_le32(0xC00002A7)
+#define STATUS_DS_NO_MORE_RIDS cpu_to_le32(0xC00002A8)
+#define STATUS_DS_INCORRECT_ROLE_OWNER cpu_to_le32(0xC00002A9)
+#define STATUS_DS_RIDMGR_INIT_ERROR cpu_to_le32(0xC00002AA)
+#define STATUS_DS_OBJ_CLASS_VIOLATION cpu_to_le32(0xC00002AB)
+#define STATUS_DS_CANT_ON_NON_LEAF cpu_to_le32(0xC00002AC)
+#define STATUS_DS_CANT_ON_RDN cpu_to_le32(0xC00002AD)
+#define STATUS_DS_CANT_MOD_OBJ_CLASS cpu_to_le32(0xC00002AE)
+#define STATUS_DS_CROSS_DOM_MOVE_FAILED cpu_to_le32(0xC00002AF)
+#define STATUS_DS_GC_NOT_AVAILABLE cpu_to_le32(0xC00002B0)
+#define STATUS_DIRECTORY_SERVICE_REQUIRED cpu_to_le32(0xC00002B1)
+#define STATUS_REPARSE_ATTRIBUTE_CONFLICT cpu_to_le32(0xC00002B2)
+#define STATUS_CANT_ENABLE_DENY_ONLY cpu_to_le32(0xC00002B3)
+#define STATUS_FLOAT_MULTIPLE_FAULTS cpu_to_le32(0xC00002B4)
+#define STATUS_FLOAT_MULTIPLE_TRAPS cpu_to_le32(0xC00002B5)
+#define STATUS_DEVICE_REMOVED cpu_to_le32(0xC00002B6)
+#define STATUS_JOURNAL_DELETE_IN_PROGRESS cpu_to_le32(0xC00002B7)
+#define STATUS_JOURNAL_NOT_ACTIVE cpu_to_le32(0xC00002B8)
+#define STATUS_NOINTERFACE cpu_to_le32(0xC00002B9)
+#define STATUS_DS_ADMIN_LIMIT_EXCEEDED cpu_to_le32(0xC00002C1)
+#define STATUS_DRIVER_FAILED_SLEEP cpu_to_le32(0xC00002C2)
+#define STATUS_MUTUAL_AUTHENTICATION_FAILED cpu_to_le32(0xC00002C3)
+#define STATUS_CORRUPT_SYSTEM_FILE cpu_to_le32(0xC00002C4)
+#define STATUS_DATATYPE_MISALIGNMENT_ERROR cpu_to_le32(0xC00002C5)
+#define STATUS_WMI_READ_ONLY cpu_to_le32(0xC00002C6)
+#define STATUS_WMI_SET_FAILURE cpu_to_le32(0xC00002C7)
+#define STATUS_COMMITMENT_MINIMUM cpu_to_le32(0xC00002C8)
+#define STATUS_REG_NAT_CONSUMPTION cpu_to_le32(0xC00002C9)
+#define STATUS_TRANSPORT_FULL cpu_to_le32(0xC00002CA)
+#define STATUS_DS_SAM_INIT_FAILURE cpu_to_le32(0xC00002CB)
+#define STATUS_ONLY_IF_CONNECTED cpu_to_le32(0xC00002CC)
+#define STATUS_DS_SENSITIVE_GROUP_VIOLATION cpu_to_le32(0xC00002CD)
+#define STATUS_PNP_RESTART_ENUMERATION cpu_to_le32(0xC00002CE)
+#define STATUS_JOURNAL_ENTRY_DELETED cpu_to_le32(0xC00002CF)
+#define STATUS_DS_CANT_MOD_PRIMARYGROUPID cpu_to_le32(0xC00002D0)
+#define STATUS_SYSTEM_IMAGE_BAD_SIGNATURE cpu_to_le32(0xC00002D1)
+#define STATUS_PNP_REBOOT_REQUIRED cpu_to_le32(0xC00002D2)
+#define STATUS_POWER_STATE_INVALID cpu_to_le32(0xC00002D3)
+#define STATUS_DS_INVALID_GROUP_TYPE cpu_to_le32(0xC00002D4)
+#define STATUS_DS_NO_NEST_GLOBALGROUP_IN_MIXEDDOMAIN cpu_to_le32(0xC00002D5)
+#define STATUS_DS_NO_NEST_LOCALGROUP_IN_MIXEDDOMAIN cpu_to_le32(0xC00002D6)
+#define STATUS_DS_GLOBAL_CANT_HAVE_LOCAL_MEMBER cpu_to_le32(0xC00002D7)
+#define STATUS_DS_GLOBAL_CANT_HAVE_UNIVERSAL_MEMBER cpu_to_le32(0xC00002D8)
+#define STATUS_DS_UNIVERSAL_CANT_HAVE_LOCAL_MEMBER cpu_to_le32(0xC00002D9)
+#define STATUS_DS_GLOBAL_CANT_HAVE_CROSSDOMAIN_MEMBER cpu_to_le32(0xC00002DA)
+#define STATUS_DS_LOCAL_CANT_HAVE_CROSSDOMAIN_LOCAL_MEMBER \
+ cpu_to_le32(0xC00002DB)
+#define STATUS_DS_HAVE_PRIMARY_MEMBERS cpu_to_le32(0xC00002DC)
+#define STATUS_WMI_NOT_SUPPORTED cpu_to_le32(0xC00002DD)
+#define STATUS_INSUFFICIENT_POWER cpu_to_le32(0xC00002DE)
+#define STATUS_SAM_NEED_BOOTKEY_PASSWORD cpu_to_le32(0xC00002DF)
+#define STATUS_SAM_NEED_BOOTKEY_FLOPPY cpu_to_le32(0xC00002E0)
+#define STATUS_DS_CANT_START cpu_to_le32(0xC00002E1)
+#define STATUS_DS_INIT_FAILURE cpu_to_le32(0xC00002E2)
+#define STATUS_SAM_INIT_FAILURE cpu_to_le32(0xC00002E3)
+#define STATUS_DS_GC_REQUIRED cpu_to_le32(0xC00002E4)
+#define STATUS_DS_LOCAL_MEMBER_OF_LOCAL_ONLY cpu_to_le32(0xC00002E5)
+#define STATUS_DS_NO_FPO_IN_UNIVERSAL_GROUPS cpu_to_le32(0xC00002E6)
+#define STATUS_DS_MACHINE_ACCOUNT_QUOTA_EXCEEDED cpu_to_le32(0xC00002E7)
+#define STATUS_MULTIPLE_FAULT_VIOLATION cpu_to_le32(0xC00002E8)
+#define STATUS_CURRENT_DOMAIN_NOT_ALLOWED cpu_to_le32(0xC00002E9)
+#define STATUS_CANNOT_MAKE cpu_to_le32(0xC00002EA)
+#define STATUS_SYSTEM_SHUTDOWN cpu_to_le32(0xC00002EB)
+#define STATUS_DS_INIT_FAILURE_CONSOLE cpu_to_le32(0xC00002EC)
+#define STATUS_DS_SAM_INIT_FAILURE_CONSOLE cpu_to_le32(0xC00002ED)
+#define STATUS_UNFINISHED_CONTEXT_DELETED cpu_to_le32(0xC00002EE)
+#define STATUS_NO_TGT_REPLY cpu_to_le32(0xC00002EF)
+#define STATUS_OBJECTID_NOT_FOUND cpu_to_le32(0xC00002F0)
+#define STATUS_NO_IP_ADDRESSES cpu_to_le32(0xC00002F1)
+#define STATUS_WRONG_CREDENTIAL_HANDLE cpu_to_le32(0xC00002F2)
+#define STATUS_CRYPTO_SYSTEM_INVALID cpu_to_le32(0xC00002F3)
+#define STATUS_MAX_REFERRALS_EXCEEDED cpu_to_le32(0xC00002F4)
+#define STATUS_MUST_BE_KDC cpu_to_le32(0xC00002F5)
+#define STATUS_STRONG_CRYPTO_NOT_SUPPORTED cpu_to_le32(0xC00002F6)
+#define STATUS_TOO_MANY_PRINCIPALS cpu_to_le32(0xC00002F7)
+#define STATUS_NO_PA_DATA cpu_to_le32(0xC00002F8)
+#define STATUS_PKINIT_NAME_MISMATCH cpu_to_le32(0xC00002F9)
+#define STATUS_SMARTCARD_LOGON_REQUIRED cpu_to_le32(0xC00002FA)
+#define STATUS_KDC_INVALID_REQUEST cpu_to_le32(0xC00002FB)
+#define STATUS_KDC_UNABLE_TO_REFER cpu_to_le32(0xC00002FC)
+#define STATUS_KDC_UNKNOWN_ETYPE cpu_to_le32(0xC00002FD)
+#define STATUS_SHUTDOWN_IN_PROGRESS cpu_to_le32(0xC00002FE)
+#define STATUS_SERVER_SHUTDOWN_IN_PROGRESS cpu_to_le32(0xC00002FF)
+#define STATUS_NOT_SUPPORTED_ON_SBS cpu_to_le32(0xC0000300)
+#define STATUS_WMI_GUID_DISCONNECTED cpu_to_le32(0xC0000301)
+#define STATUS_WMI_ALREADY_DISABLED cpu_to_le32(0xC0000302)
+#define STATUS_WMI_ALREADY_ENABLED cpu_to_le32(0xC0000303)
+#define STATUS_MFT_TOO_FRAGMENTED cpu_to_le32(0xC0000304)
+#define STATUS_COPY_PROTECTION_FAILURE cpu_to_le32(0xC0000305)
+#define STATUS_CSS_AUTHENTICATION_FAILURE cpu_to_le32(0xC0000306)
+#define STATUS_CSS_KEY_NOT_PRESENT cpu_to_le32(0xC0000307)
+#define STATUS_CSS_KEY_NOT_ESTABLISHED cpu_to_le32(0xC0000308)
+#define STATUS_CSS_SCRAMBLED_SECTOR cpu_to_le32(0xC0000309)
+#define STATUS_CSS_REGION_MISMATCH cpu_to_le32(0xC000030A)
+#define STATUS_CSS_RESETS_EXHAUSTED cpu_to_le32(0xC000030B)
+#define STATUS_PKINIT_FAILURE cpu_to_le32(0xC0000320)
+#define STATUS_SMARTCARD_SUBSYSTEM_FAILURE cpu_to_le32(0xC0000321)
+#define STATUS_NO_KERB_KEY cpu_to_le32(0xC0000322)
+#define STATUS_HOST_DOWN cpu_to_le32(0xC0000350)
+#define STATUS_UNSUPPORTED_PREAUTH cpu_to_le32(0xC0000351)
+#define STATUS_EFS_ALG_BLOB_TOO_BIG cpu_to_le32(0xC0000352)
+#define STATUS_PORT_NOT_SET cpu_to_le32(0xC0000353)
+#define STATUS_DEBUGGER_INACTIVE cpu_to_le32(0xC0000354)
+#define STATUS_DS_VERSION_CHECK_FAILURE cpu_to_le32(0xC0000355)
+#define STATUS_AUDITING_DISABLED cpu_to_le32(0xC0000356)
+#define STATUS_PRENT4_MACHINE_ACCOUNT cpu_to_le32(0xC0000357)
+#define STATUS_DS_AG_CANT_HAVE_UNIVERSAL_MEMBER cpu_to_le32(0xC0000358)
+#define STATUS_INVALID_IMAGE_WIN_32 cpu_to_le32(0xC0000359)
+#define STATUS_INVALID_IMAGE_WIN_64 cpu_to_le32(0xC000035A)
+#define STATUS_BAD_BINDINGS cpu_to_le32(0xC000035B)
+#define STATUS_NETWORK_SESSION_EXPIRED cpu_to_le32(0xC000035C)
+#define STATUS_APPHELP_BLOCK cpu_to_le32(0xC000035D)
+#define STATUS_ALL_SIDS_FILTERED cpu_to_le32(0xC000035E)
+#define STATUS_NOT_SAFE_MODE_DRIVER cpu_to_le32(0xC000035F)
+#define STATUS_ACCESS_DISABLED_BY_POLICY_DEFAULT cpu_to_le32(0xC0000361)
+#define STATUS_ACCESS_DISABLED_BY_POLICY_PATH cpu_to_le32(0xC0000362)
+#define STATUS_ACCESS_DISABLED_BY_POLICY_PUBLISHER cpu_to_le32(0xC0000363)
+#define STATUS_ACCESS_DISABLED_BY_POLICY_OTHER cpu_to_le32(0xC0000364)
+#define STATUS_FAILED_DRIVER_ENTRY cpu_to_le32(0xC0000365)
+#define STATUS_DEVICE_ENUMERATION_ERROR cpu_to_le32(0xC0000366)
+#define STATUS_MOUNT_POINT_NOT_RESOLVED cpu_to_le32(0xC0000368)
+#define STATUS_INVALID_DEVICE_OBJECT_PARAMETER cpu_to_le32(0xC0000369)
+#define STATUS_MCA_OCCURRED cpu_to_le32(0xC000036A)
+#define STATUS_DRIVER_BLOCKED_CRITICAL cpu_to_le32(0xC000036B)
+#define STATUS_DRIVER_BLOCKED cpu_to_le32(0xC000036C)
+#define STATUS_DRIVER_DATABASE_ERROR cpu_to_le32(0xC000036D)
+#define STATUS_SYSTEM_HIVE_TOO_LARGE cpu_to_le32(0xC000036E)
+#define STATUS_INVALID_IMPORT_OF_NON_DLL cpu_to_le32(0xC000036F)
+#define STATUS_NO_SECRETS cpu_to_le32(0xC0000371)
+#define STATUS_ACCESS_DISABLED_NO_SAFER_UI_BY_POLICY cpu_to_le32(0xC0000372)
+#define STATUS_FAILED_STACK_SWITCH cpu_to_le32(0xC0000373)
+#define STATUS_HEAP_CORRUPTION cpu_to_le32(0xC0000374)
+#define STATUS_SMARTCARD_WRONG_PIN cpu_to_le32(0xC0000380)
+#define STATUS_SMARTCARD_CARD_BLOCKED cpu_to_le32(0xC0000381)
+#define STATUS_SMARTCARD_CARD_NOT_AUTHENTICATED cpu_to_le32(0xC0000382)
+#define STATUS_SMARTCARD_NO_CARD cpu_to_le32(0xC0000383)
+#define STATUS_SMARTCARD_NO_KEY_CONTAINER cpu_to_le32(0xC0000384)
+#define STATUS_SMARTCARD_NO_CERTIFICATE cpu_to_le32(0xC0000385)
+#define STATUS_SMARTCARD_NO_KEYSET cpu_to_le32(0xC0000386)
+#define STATUS_SMARTCARD_IO_ERROR cpu_to_le32(0xC0000387)
+#define STATUS_DOWNGRADE_DETECTED cpu_to_le32(0xC0000388)
+#define STATUS_SMARTCARD_CERT_REVOKED cpu_to_le32(0xC0000389)
+#define STATUS_ISSUING_CA_UNTRUSTED cpu_to_le32(0xC000038A)
+#define STATUS_REVOCATION_OFFLINE_C cpu_to_le32(0xC000038B)
+#define STATUS_PKINIT_CLIENT_FAILURE cpu_to_le32(0xC000038C)
+#define STATUS_SMARTCARD_CERT_EXPIRED cpu_to_le32(0xC000038D)
+#define STATUS_DRIVER_FAILED_PRIOR_UNLOAD cpu_to_le32(0xC000038E)
+#define STATUS_SMARTCARD_SILENT_CONTEXT cpu_to_le32(0xC000038F)
+#define STATUS_PER_USER_TRUST_QUOTA_EXCEEDED cpu_to_le32(0xC0000401)
+#define STATUS_ALL_USER_TRUST_QUOTA_EXCEEDED cpu_to_le32(0xC0000402)
+#define STATUS_USER_DELETE_TRUST_QUOTA_EXCEEDED cpu_to_le32(0xC0000403)
+#define STATUS_DS_NAME_NOT_UNIQUE cpu_to_le32(0xC0000404)
+#define STATUS_DS_DUPLICATE_ID_FOUND cpu_to_le32(0xC0000405)
+#define STATUS_DS_GROUP_CONVERSION_ERROR cpu_to_le32(0xC0000406)
+#define STATUS_VOLSNAP_PREPARE_HIBERNATE cpu_to_le32(0xC0000407)
+#define STATUS_USER2USER_REQUIRED cpu_to_le32(0xC0000408)
+#define STATUS_STACK_BUFFER_OVERRUN cpu_to_le32(0xC0000409)
+#define STATUS_NO_S4U_PROT_SUPPORT cpu_to_le32(0xC000040A)
+#define STATUS_CROSSREALM_DELEGATION_FAILURE cpu_to_le32(0xC000040B)
+#define STATUS_REVOCATION_OFFLINE_KDC cpu_to_le32(0xC000040C)
+#define STATUS_ISSUING_CA_UNTRUSTED_KDC cpu_to_le32(0xC000040D)
+#define STATUS_KDC_CERT_EXPIRED cpu_to_le32(0xC000040E)
+#define STATUS_KDC_CERT_REVOKED cpu_to_le32(0xC000040F)
+#define STATUS_PARAMETER_QUOTA_EXCEEDED cpu_to_le32(0xC0000410)
+#define STATUS_HIBERNATION_FAILURE cpu_to_le32(0xC0000411)
+#define STATUS_DELAY_LOAD_FAILED cpu_to_le32(0xC0000412)
+#define STATUS_AUTHENTICATION_FIREWALL_FAILED cpu_to_le32(0xC0000413)
+#define STATUS_VDM_DISALLOWED cpu_to_le32(0xC0000414)
+#define STATUS_HUNG_DISPLAY_DRIVER_THREAD cpu_to_le32(0xC0000415)
+#define STATUS_INSUFFICIENT_RESOURCE_FOR_SPECIFIED_SHARED_SECTION_SIZE \
+ cpu_to_le32(0xC0000416)
+#define STATUS_INVALID_CRUNTIME_PARAMETER cpu_to_le32(0xC0000417)
+#define STATUS_NTLM_BLOCKED cpu_to_le32(0xC0000418)
+#define STATUS_ASSERTION_FAILURE cpu_to_le32(0xC0000420)
+#define STATUS_VERIFIER_STOP cpu_to_le32(0xC0000421)
+#define STATUS_CALLBACK_POP_STACK cpu_to_le32(0xC0000423)
+#define STATUS_INCOMPATIBLE_DRIVER_BLOCKED cpu_to_le32(0xC0000424)
+#define STATUS_HIVE_UNLOADED cpu_to_le32(0xC0000425)
+#define STATUS_COMPRESSION_DISABLED cpu_to_le32(0xC0000426)
+#define STATUS_FILE_SYSTEM_LIMITATION cpu_to_le32(0xC0000427)
+#define STATUS_INVALID_IMAGE_HASH cpu_to_le32(0xC0000428)
+#define STATUS_NOT_CAPABLE cpu_to_le32(0xC0000429)
+#define STATUS_REQUEST_OUT_OF_SEQUENCE cpu_to_le32(0xC000042A)
+#define STATUS_IMPLEMENTATION_LIMIT cpu_to_le32(0xC000042B)
+#define STATUS_ELEVATION_REQUIRED cpu_to_le32(0xC000042C)
+#define STATUS_BEYOND_VDL cpu_to_le32(0xC0000432)
+#define STATUS_ENCOUNTERED_WRITE_IN_PROGRESS cpu_to_le32(0xC0000433)
+#define STATUS_PTE_CHANGED cpu_to_le32(0xC0000434)
+#define STATUS_PURGE_FAILED cpu_to_le32(0xC0000435)
+#define STATUS_CRED_REQUIRES_CONFIRMATION cpu_to_le32(0xC0000440)
+#define STATUS_CS_ENCRYPTION_INVALID_SERVER_RESPONSE cpu_to_le32(0xC0000441)
+#define STATUS_CS_ENCRYPTION_UNSUPPORTED_SERVER cpu_to_le32(0xC0000442)
+#define STATUS_CS_ENCRYPTION_EXISTING_ENCRYPTED_FILE cpu_to_le32(0xC0000443)
+#define STATUS_CS_ENCRYPTION_NEW_ENCRYPTED_FILE cpu_to_le32(0xC0000444)
+#define STATUS_CS_ENCRYPTION_FILE_NOT_CSE cpu_to_le32(0xC0000445)
+#define STATUS_INVALID_LABEL cpu_to_le32(0xC0000446)
+#define STATUS_DRIVER_PROCESS_TERMINATED cpu_to_le32(0xC0000450)
+#define STATUS_AMBIGUOUS_SYSTEM_DEVICE cpu_to_le32(0xC0000451)
+#define STATUS_SYSTEM_DEVICE_NOT_FOUND cpu_to_le32(0xC0000452)
+#define STATUS_RESTART_BOOT_APPLICATION cpu_to_le32(0xC0000453)
+#define STATUS_INVALID_TASK_NAME cpu_to_le32(0xC0000500)
+#define STATUS_INVALID_TASK_INDEX cpu_to_le32(0xC0000501)
+#define STATUS_THREAD_ALREADY_IN_TASK cpu_to_le32(0xC0000502)
+#define STATUS_CALLBACK_BYPASS cpu_to_le32(0xC0000503)
+#define STATUS_PORT_CLOSED cpu_to_le32(0xC0000700)
+#define STATUS_MESSAGE_LOST cpu_to_le32(0xC0000701)
+#define STATUS_INVALID_MESSAGE cpu_to_le32(0xC0000702)
+#define STATUS_REQUEST_CANCELED cpu_to_le32(0xC0000703)
+#define STATUS_RECURSIVE_DISPATCH cpu_to_le32(0xC0000704)
+#define STATUS_LPC_RECEIVE_BUFFER_EXPECTED cpu_to_le32(0xC0000705)
+#define STATUS_LPC_INVALID_CONNECTION_USAGE cpu_to_le32(0xC0000706)
+#define STATUS_LPC_REQUESTS_NOT_ALLOWED cpu_to_le32(0xC0000707)
+#define STATUS_RESOURCE_IN_USE cpu_to_le32(0xC0000708)
+#define STATUS_HARDWARE_MEMORY_ERROR cpu_to_le32(0xC0000709)
+#define STATUS_THREADPOOL_HANDLE_EXCEPTION cpu_to_le32(0xC000070A)
+#define STATUS_THREADPOOL_SET_EVENT_ON_COMPLETION_FAILED cpu_to_le32(0xC000070B)
+#define STATUS_THREADPOOL_RELEASE_SEMAPHORE_ON_COMPLETION_FAILED \
+ cpu_to_le32(0xC000070C)
+#define STATUS_THREADPOOL_RELEASE_MUTEX_ON_COMPLETION_FAILED \
+ cpu_to_le32(0xC000070D)
+#define STATUS_THREADPOOL_FREE_LIBRARY_ON_COMPLETION_FAILED \
+ cpu_to_le32(0xC000070E)
+#define STATUS_THREADPOOL_RELEASED_DURING_OPERATION cpu_to_le32(0xC000070F)
+#define STATUS_CALLBACK_RETURNED_WHILE_IMPERSONATING cpu_to_le32(0xC0000710)
+#define STATUS_APC_RETURNED_WHILE_IMPERSONATING cpu_to_le32(0xC0000711)
+#define STATUS_PROCESS_IS_PROTECTED cpu_to_le32(0xC0000712)
+#define STATUS_MCA_EXCEPTION cpu_to_le32(0xC0000713)
+#define STATUS_CERTIFICATE_MAPPING_NOT_UNIQUE cpu_to_le32(0xC0000714)
+#define STATUS_SYMLINK_CLASS_DISABLED cpu_to_le32(0xC0000715)
+#define STATUS_INVALID_IDN_NORMALIZATION cpu_to_le32(0xC0000716)
+#define STATUS_NO_UNICODE_TRANSLATION cpu_to_le32(0xC0000717)
+#define STATUS_ALREADY_REGISTERED cpu_to_le32(0xC0000718)
+#define STATUS_CONTEXT_MISMATCH cpu_to_le32(0xC0000719)
+#define STATUS_PORT_ALREADY_HAS_COMPLETION_LIST cpu_to_le32(0xC000071A)
+#define STATUS_CALLBACK_RETURNED_THREAD_PRIORITY cpu_to_le32(0xC000071B)
+#define STATUS_INVALID_THREAD cpu_to_le32(0xC000071C)
+#define STATUS_CALLBACK_RETURNED_TRANSACTION cpu_to_le32(0xC000071D)
+#define STATUS_CALLBACK_RETURNED_LDR_LOCK cpu_to_le32(0xC000071E)
+#define STATUS_CALLBACK_RETURNED_LANG cpu_to_le32(0xC000071F)
+#define STATUS_CALLBACK_RETURNED_PRI_BACK cpu_to_le32(0xC0000720)
+#define STATUS_CALLBACK_RETURNED_THREAD_AFFINITY cpu_to_le32(0xC0000721)
+#define STATUS_DISK_REPAIR_DISABLED cpu_to_le32(0xC0000800)
+#define STATUS_DS_DOMAIN_RENAME_IN_PROGRESS cpu_to_le32(0xC0000801)
+#define STATUS_DISK_QUOTA_EXCEEDED cpu_to_le32(0xC0000802)
+#define STATUS_CONTENT_BLOCKED cpu_to_le32(0xC0000804)
+#define STATUS_BAD_CLUSTERS cpu_to_le32(0xC0000805)
+#define STATUS_VOLUME_DIRTY cpu_to_le32(0xC0000806)
+#define STATUS_FILE_CHECKED_OUT cpu_to_le32(0xC0000901)
+#define STATUS_CHECKOUT_REQUIRED cpu_to_le32(0xC0000902)
+#define STATUS_BAD_FILE_TYPE cpu_to_le32(0xC0000903)
+#define STATUS_FILE_TOO_LARGE cpu_to_le32(0xC0000904)
+#define STATUS_FORMS_AUTH_REQUIRED cpu_to_le32(0xC0000905)
+#define STATUS_VIRUS_INFECTED cpu_to_le32(0xC0000906)
+#define STATUS_VIRUS_DELETED cpu_to_le32(0xC0000907)
+#define STATUS_BAD_MCFG_TABLE cpu_to_le32(0xC0000908)
+#define STATUS_WOW_ASSERTION cpu_to_le32(0xC0009898)
+#define STATUS_INVALID_SIGNATURE cpu_to_le32(0xC000A000)
+#define STATUS_HMAC_NOT_SUPPORTED cpu_to_le32(0xC000A001)
+#define STATUS_IPSEC_QUEUE_OVERFLOW cpu_to_le32(0xC000A010)
+#define STATUS_ND_QUEUE_OVERFLOW cpu_to_le32(0xC000A011)
+#define STATUS_HOPLIMIT_EXCEEDED cpu_to_le32(0xC000A012)
+#define STATUS_PROTOCOL_NOT_SUPPORTED cpu_to_le32(0xC000A013)
+#define STATUS_LOST_WRITEBEHIND_DATA_NETWORK_DISCONNECTED \
+ cpu_to_le32(0xC000A080)
+#define STATUS_LOST_WRITEBEHIND_DATA_NETWORK_SERVER_ERROR \
+ cpu_to_le32(0xC000A081)
+#define STATUS_LOST_WRITEBEHIND_DATA_LOCAL_DISK_ERROR cpu_to_le32(0xC000A082)
+#define STATUS_XML_PARSE_ERROR cpu_to_le32(0xC000A083)
+#define STATUS_XMLDSIG_ERROR cpu_to_le32(0xC000A084)
+#define STATUS_WRONG_COMPARTMENT cpu_to_le32(0xC000A085)
+#define STATUS_AUTHIP_FAILURE cpu_to_le32(0xC000A086)
+#define DBG_NO_STATE_CHANGE cpu_to_le32(0xC0010001)
+#define DBG_APP_NOT_IDLE cpu_to_le32(0xC0010002)
+#define RPC_NT_INVALID_STRING_BINDING cpu_to_le32(0xC0020001)
+#define RPC_NT_WRONG_KIND_OF_BINDING cpu_to_le32(0xC0020002)
+#define RPC_NT_INVALID_BINDING cpu_to_le32(0xC0020003)
+#define RPC_NT_PROTSEQ_NOT_SUPPORTED cpu_to_le32(0xC0020004)
+#define RPC_NT_INVALID_RPC_PROTSEQ cpu_to_le32(0xC0020005)
+#define RPC_NT_INVALID_STRING_UUID cpu_to_le32(0xC0020006)
+#define RPC_NT_INVALID_ENDPOINT_FORMAT cpu_to_le32(0xC0020007)
+#define RPC_NT_INVALID_NET_ADDR cpu_to_le32(0xC0020008)
+#define RPC_NT_NO_ENDPOINT_FOUND cpu_to_le32(0xC0020009)
+#define RPC_NT_INVALID_TIMEOUT cpu_to_le32(0xC002000A)
+#define RPC_NT_OBJECT_NOT_FOUND cpu_to_le32(0xC002000B)
+#define RPC_NT_ALREADY_REGISTERED cpu_to_le32(0xC002000C)
+#define RPC_NT_TYPE_ALREADY_REGISTERED cpu_to_le32(0xC002000D)
+#define RPC_NT_ALREADY_LISTENING cpu_to_le32(0xC002000E)
+#define RPC_NT_NO_PROTSEQS_REGISTERED cpu_to_le32(0xC002000F)
+#define RPC_NT_NOT_LISTENING cpu_to_le32(0xC0020010)
+#define RPC_NT_UNKNOWN_MGR_TYPE cpu_to_le32(0xC0020011)
+#define RPC_NT_UNKNOWN_IF cpu_to_le32(0xC0020012)
+#define RPC_NT_NO_BINDINGS cpu_to_le32(0xC0020013)
+#define RPC_NT_NO_PROTSEQS cpu_to_le32(0xC0020014)
+#define RPC_NT_CANT_CREATE_ENDPOINT cpu_to_le32(0xC0020015)
+#define RPC_NT_OUT_OF_RESOURCES cpu_to_le32(0xC0020016)
+#define RPC_NT_SERVER_UNAVAILABLE cpu_to_le32(0xC0020017)
+#define RPC_NT_SERVER_TOO_BUSY cpu_to_le32(0xC0020018)
+#define RPC_NT_INVALID_NETWORK_OPTIONS cpu_to_le32(0xC0020019)
+#define RPC_NT_NO_CALL_ACTIVE cpu_to_le32(0xC002001A)
+#define RPC_NT_CALL_FAILED cpu_to_le32(0xC002001B)
+#define RPC_NT_CALL_FAILED_DNE cpu_to_le32(0xC002001C)
+#define RPC_NT_PROTOCOL_ERROR cpu_to_le32(0xC002001D)
+#define RPC_NT_UNSUPPORTED_TRANS_SYN cpu_to_le32(0xC002001F)
+#define RPC_NT_UNSUPPORTED_TYPE cpu_to_le32(0xC0020021)
+#define RPC_NT_INVALID_TAG cpu_to_le32(0xC0020022)
+#define RPC_NT_INVALID_BOUND cpu_to_le32(0xC0020023)
+#define RPC_NT_NO_ENTRY_NAME cpu_to_le32(0xC0020024)
+#define RPC_NT_INVALID_NAME_SYNTAX cpu_to_le32(0xC0020025)
+#define RPC_NT_UNSUPPORTED_NAME_SYNTAX cpu_to_le32(0xC0020026)
+#define RPC_NT_UUID_NO_ADDRESS cpu_to_le32(0xC0020028)
+#define RPC_NT_DUPLICATE_ENDPOINT cpu_to_le32(0xC0020029)
+#define RPC_NT_UNKNOWN_AUTHN_TYPE cpu_to_le32(0xC002002A)
+#define RPC_NT_MAX_CALLS_TOO_SMALL cpu_to_le32(0xC002002B)
+#define RPC_NT_STRING_TOO_LONG cpu_to_le32(0xC002002C)
+#define RPC_NT_PROTSEQ_NOT_FOUND cpu_to_le32(0xC002002D)
+#define RPC_NT_PROCNUM_OUT_OF_RANGE cpu_to_le32(0xC002002E)
+#define RPC_NT_BINDING_HAS_NO_AUTH cpu_to_le32(0xC002002F)
+#define RPC_NT_UNKNOWN_AUTHN_SERVICE cpu_to_le32(0xC0020030)
+#define RPC_NT_UNKNOWN_AUTHN_LEVEL cpu_to_le32(0xC0020031)
+#define RPC_NT_INVALID_AUTH_IDENTITY cpu_to_le32(0xC0020032)
+#define RPC_NT_UNKNOWN_AUTHZ_SERVICE cpu_to_le32(0xC0020033)
+#define EPT_NT_INVALID_ENTRY cpu_to_le32(0xC0020034)
+#define EPT_NT_CANT_PERFORM_OP cpu_to_le32(0xC0020035)
+#define EPT_NT_NOT_REGISTERED cpu_to_le32(0xC0020036)
+#define RPC_NT_NOTHING_TO_EXPORT cpu_to_le32(0xC0020037)
+#define RPC_NT_INCOMPLETE_NAME cpu_to_le32(0xC0020038)
+#define RPC_NT_INVALID_VERS_OPTION cpu_to_le32(0xC0020039)
+#define RPC_NT_NO_MORE_MEMBERS cpu_to_le32(0xC002003A)
+#define RPC_NT_NOT_ALL_OBJS_UNEXPORTED cpu_to_le32(0xC002003B)
+#define RPC_NT_INTERFACE_NOT_FOUND cpu_to_le32(0xC002003C)
+#define RPC_NT_ENTRY_ALREADY_EXISTS cpu_to_le32(0xC002003D)
+#define RPC_NT_ENTRY_NOT_FOUND cpu_to_le32(0xC002003E)
+#define RPC_NT_NAME_SERVICE_UNAVAILABLE cpu_to_le32(0xC002003F)
+#define RPC_NT_INVALID_NAF_ID cpu_to_le32(0xC0020040)
+#define RPC_NT_CANNOT_SUPPORT cpu_to_le32(0xC0020041)
+#define RPC_NT_NO_CONTEXT_AVAILABLE cpu_to_le32(0xC0020042)
+#define RPC_NT_INTERNAL_ERROR cpu_to_le32(0xC0020043)
+#define RPC_NT_ZERO_DIVIDE cpu_to_le32(0xC0020044)
+#define RPC_NT_ADDRESS_ERROR cpu_to_le32(0xC0020045)
+#define RPC_NT_FP_DIV_ZERO cpu_to_le32(0xC0020046)
+#define RPC_NT_FP_UNDERFLOW cpu_to_le32(0xC0020047)
+#define RPC_NT_FP_OVERFLOW cpu_to_le32(0xC0020048)
+#define RPC_NT_CALL_IN_PROGRESS cpu_to_le32(0xC0020049)
+#define RPC_NT_NO_MORE_BINDINGS cpu_to_le32(0xC002004A)
+#define RPC_NT_GROUP_MEMBER_NOT_FOUND cpu_to_le32(0xC002004B)
+#define EPT_NT_CANT_CREATE cpu_to_le32(0xC002004C)
+#define RPC_NT_INVALID_OBJECT cpu_to_le32(0xC002004D)
+#define RPC_NT_NO_INTERFACES cpu_to_le32(0xC002004F)
+#define RPC_NT_CALL_CANCELLED cpu_to_le32(0xC0020050)
+#define RPC_NT_BINDING_INCOMPLETE cpu_to_le32(0xC0020051)
+#define RPC_NT_COMM_FAILURE cpu_to_le32(0xC0020052)
+#define RPC_NT_UNSUPPORTED_AUTHN_LEVEL cpu_to_le32(0xC0020053)
+#define RPC_NT_NO_PRINC_NAME cpu_to_le32(0xC0020054)
+#define RPC_NT_NOT_RPC_ERROR cpu_to_le32(0xC0020055)
+#define RPC_NT_SEC_PKG_ERROR cpu_to_le32(0xC0020057)
+#define RPC_NT_NOT_CANCELLED cpu_to_le32(0xC0020058)
+#define RPC_NT_INVALID_ASYNC_HANDLE cpu_to_le32(0xC0020062)
+#define RPC_NT_INVALID_ASYNC_CALL cpu_to_le32(0xC0020063)
+#define RPC_NT_PROXY_ACCESS_DENIED cpu_to_le32(0xC0020064)
+#define RPC_NT_NO_MORE_ENTRIES cpu_to_le32(0xC0030001)
+#define RPC_NT_SS_CHAR_TRANS_OPEN_FAIL cpu_to_le32(0xC0030002)
+#define RPC_NT_SS_CHAR_TRANS_SHORT_FILE cpu_to_le32(0xC0030003)
+#define RPC_NT_SS_IN_NULL_CONTEXT cpu_to_le32(0xC0030004)
+#define RPC_NT_SS_CONTEXT_MISMATCH cpu_to_le32(0xC0030005)
+#define RPC_NT_SS_CONTEXT_DAMAGED cpu_to_le32(0xC0030006)
+#define RPC_NT_SS_HANDLES_MISMATCH cpu_to_le32(0xC0030007)
+#define RPC_NT_SS_CANNOT_GET_CALL_HANDLE cpu_to_le32(0xC0030008)
+#define RPC_NT_NULL_REF_POINTER cpu_to_le32(0xC0030009)
+#define RPC_NT_ENUM_VALUE_OUT_OF_RANGE cpu_to_le32(0xC003000A)
+#define RPC_NT_BYTE_COUNT_TOO_SMALL cpu_to_le32(0xC003000B)
+#define RPC_NT_BAD_STUB_DATA cpu_to_le32(0xC003000C)
+#define RPC_NT_INVALID_ES_ACTION cpu_to_le32(0xC0030059)
+#define RPC_NT_WRONG_ES_VERSION cpu_to_le32(0xC003005A)
+#define RPC_NT_WRONG_STUB_VERSION cpu_to_le32(0xC003005B)
+#define RPC_NT_INVALID_PIPE_OBJECT cpu_to_le32(0xC003005C)
+#define RPC_NT_INVALID_PIPE_OPERATION cpu_to_le32(0xC003005D)
+#define RPC_NT_WRONG_PIPE_VERSION cpu_to_le32(0xC003005E)
+#define RPC_NT_PIPE_CLOSED cpu_to_le32(0xC003005F)
+#define RPC_NT_PIPE_DISCIPLINE_ERROR cpu_to_le32(0xC0030060)
+#define RPC_NT_PIPE_EMPTY cpu_to_le32(0xC0030061)
+#define STATUS_PNP_BAD_MPS_TABLE cpu_to_le32(0xC0040035)
+#define STATUS_PNP_TRANSLATION_FAILED cpu_to_le32(0xC0040036)
+#define STATUS_PNP_IRQ_TRANSLATION_FAILED cpu_to_le32(0xC0040037)
+#define STATUS_PNP_INVALID_ID cpu_to_le32(0xC0040038)
+#define STATUS_IO_REISSUE_AS_CACHED cpu_to_le32(0xC0040039)
+#define STATUS_CTX_WINSTATION_NAME_INVALID cpu_to_le32(0xC00A0001)
+#define STATUS_CTX_INVALID_PD cpu_to_le32(0xC00A0002)
+#define STATUS_CTX_PD_NOT_FOUND cpu_to_le32(0xC00A0003)
+#define STATUS_CTX_CLOSE_PENDING cpu_to_le32(0xC00A0006)
+#define STATUS_CTX_NO_OUTBUF cpu_to_le32(0xC00A0007)
+#define STATUS_CTX_MODEM_INF_NOT_FOUND cpu_to_le32(0xC00A0008)
+#define STATUS_CTX_INVALID_MODEMNAME cpu_to_le32(0xC00A0009)
+#define STATUS_CTX_RESPONSE_ERROR cpu_to_le32(0xC00A000A)
+#define STATUS_CTX_MODEM_RESPONSE_TIMEOUT cpu_to_le32(0xC00A000B)
+#define STATUS_CTX_MODEM_RESPONSE_NO_CARRIER cpu_to_le32(0xC00A000C)
+#define STATUS_CTX_MODEM_RESPONSE_NO_DIALTONE cpu_to_le32(0xC00A000D)
+#define STATUS_CTX_MODEM_RESPONSE_BUSY cpu_to_le32(0xC00A000E)
+#define STATUS_CTX_MODEM_RESPONSE_VOICE cpu_to_le32(0xC00A000F)
+#define STATUS_CTX_TD_ERROR cpu_to_le32(0xC00A0010)
+#define STATUS_CTX_LICENSE_CLIENT_INVALID cpu_to_le32(0xC00A0012)
+#define STATUS_CTX_LICENSE_NOT_AVAILABLE cpu_to_le32(0xC00A0013)
+#define STATUS_CTX_LICENSE_EXPIRED cpu_to_le32(0xC00A0014)
+#define STATUS_CTX_WINSTATION_NOT_FOUND cpu_to_le32(0xC00A0015)
+#define STATUS_CTX_WINSTATION_NAME_COLLISION cpu_to_le32(0xC00A0016)
+#define STATUS_CTX_WINSTATION_BUSY cpu_to_le32(0xC00A0017)
+#define STATUS_CTX_BAD_VIDEO_MODE cpu_to_le32(0xC00A0018)
+#define STATUS_CTX_GRAPHICS_INVALID cpu_to_le32(0xC00A0022)
+#define STATUS_CTX_NOT_CONSOLE cpu_to_le32(0xC00A0024)
+#define STATUS_CTX_CLIENT_QUERY_TIMEOUT cpu_to_le32(0xC00A0026)
+#define STATUS_CTX_CONSOLE_DISCONNECT cpu_to_le32(0xC00A0027)
+#define STATUS_CTX_CONSOLE_CONNECT cpu_to_le32(0xC00A0028)
+#define STATUS_CTX_SHADOW_DENIED cpu_to_le32(0xC00A002A)
+#define STATUS_CTX_WINSTATION_ACCESS_DENIED cpu_to_le32(0xC00A002B)
+#define STATUS_CTX_INVALID_WD cpu_to_le32(0xC00A002E)
+#define STATUS_CTX_WD_NOT_FOUND cpu_to_le32(0xC00A002F)
+#define STATUS_CTX_SHADOW_INVALID cpu_to_le32(0xC00A0030)
+#define STATUS_CTX_SHADOW_DISABLED cpu_to_le32(0xC00A0031)
+#define STATUS_RDP_PROTOCOL_ERROR cpu_to_le32(0xC00A0032)
+#define STATUS_CTX_CLIENT_LICENSE_NOT_SET cpu_to_le32(0xC00A0033)
+#define STATUS_CTX_CLIENT_LICENSE_IN_USE cpu_to_le32(0xC00A0034)
+#define STATUS_CTX_SHADOW_ENDED_BY_MODE_CHANGE cpu_to_le32(0xC00A0035)
+#define STATUS_CTX_SHADOW_NOT_RUNNING cpu_to_le32(0xC00A0036)
+#define STATUS_CTX_LOGON_DISABLED cpu_to_le32(0xC00A0037)
+#define STATUS_CTX_SECURITY_LAYER_ERROR cpu_to_le32(0xC00A0038)
+#define STATUS_TS_INCOMPATIBLE_SESSIONS cpu_to_le32(0xC00A0039)
+#define STATUS_MUI_FILE_NOT_FOUND cpu_to_le32(0xC00B0001)
+#define STATUS_MUI_INVALID_FILE cpu_to_le32(0xC00B0002)
+#define STATUS_MUI_INVALID_RC_CONFIG cpu_to_le32(0xC00B0003)
+#define STATUS_MUI_INVALID_LOCALE_NAME cpu_to_le32(0xC00B0004)
+#define STATUS_MUI_INVALID_ULTIMATEFALLBACK_NAME cpu_to_le32(0xC00B0005)
+#define STATUS_MUI_FILE_NOT_LOADED cpu_to_le32(0xC00B0006)
+#define STATUS_RESOURCE_ENUM_USER_STOP cpu_to_le32(0xC00B0007)
+#define STATUS_CLUSTER_INVALID_NODE cpu_to_le32(0xC0130001)
+#define STATUS_CLUSTER_NODE_EXISTS cpu_to_le32(0xC0130002)
+#define STATUS_CLUSTER_JOIN_IN_PROGRESS cpu_to_le32(0xC0130003)
+#define STATUS_CLUSTER_NODE_NOT_FOUND cpu_to_le32(0xC0130004)
+#define STATUS_CLUSTER_LOCAL_NODE_NOT_FOUND cpu_to_le32(0xC0130005)
+#define STATUS_CLUSTER_NETWORK_EXISTS cpu_to_le32(0xC0130006)
+#define STATUS_CLUSTER_NETWORK_NOT_FOUND cpu_to_le32(0xC0130007)
+#define STATUS_CLUSTER_NETINTERFACE_EXISTS cpu_to_le32(0xC0130008)
+#define STATUS_CLUSTER_NETINTERFACE_NOT_FOUND cpu_to_le32(0xC0130009)
+#define STATUS_CLUSTER_INVALID_REQUEST cpu_to_le32(0xC013000A)
+#define STATUS_CLUSTER_INVALID_NETWORK_PROVIDER cpu_to_le32(0xC013000B)
+#define STATUS_CLUSTER_NODE_DOWN cpu_to_le32(0xC013000C)
+#define STATUS_CLUSTER_NODE_UNREACHABLE cpu_to_le32(0xC013000D)
+#define STATUS_CLUSTER_NODE_NOT_MEMBER cpu_to_le32(0xC013000E)
+#define STATUS_CLUSTER_JOIN_NOT_IN_PROGRESS cpu_to_le32(0xC013000F)
+#define STATUS_CLUSTER_INVALID_NETWORK cpu_to_le32(0xC0130010)
+#define STATUS_CLUSTER_NO_NET_ADAPTERS cpu_to_le32(0xC0130011)
+#define STATUS_CLUSTER_NODE_UP cpu_to_le32(0xC0130012)
+#define STATUS_CLUSTER_NODE_PAUSED cpu_to_le32(0xC0130013)
+#define STATUS_CLUSTER_NODE_NOT_PAUSED cpu_to_le32(0xC0130014)
+#define STATUS_CLUSTER_NO_SECURITY_CONTEXT cpu_to_le32(0xC0130015)
+#define STATUS_CLUSTER_NETWORK_NOT_INTERNAL cpu_to_le32(0xC0130016)
+#define STATUS_CLUSTER_POISONED cpu_to_le32(0xC0130017)
+#define STATUS_ACPI_INVALID_OPCODE cpu_to_le32(0xC0140001)
+#define STATUS_ACPI_STACK_OVERFLOW cpu_to_le32(0xC0140002)
+#define STATUS_ACPI_ASSERT_FAILED cpu_to_le32(0xC0140003)
+#define STATUS_ACPI_INVALID_INDEX cpu_to_le32(0xC0140004)
+#define STATUS_ACPI_INVALID_ARGUMENT cpu_to_le32(0xC0140005)
+#define STATUS_ACPI_FATAL cpu_to_le32(0xC0140006)
+#define STATUS_ACPI_INVALID_SUPERNAME cpu_to_le32(0xC0140007)
+#define STATUS_ACPI_INVALID_ARGTYPE cpu_to_le32(0xC0140008)
+#define STATUS_ACPI_INVALID_OBJTYPE cpu_to_le32(0xC0140009)
+#define STATUS_ACPI_INVALID_TARGETTYPE cpu_to_le32(0xC014000A)
+#define STATUS_ACPI_INCORRECT_ARGUMENT_COUNT cpu_to_le32(0xC014000B)
+#define STATUS_ACPI_ADDRESS_NOT_MAPPED cpu_to_le32(0xC014000C)
+#define STATUS_ACPI_INVALID_EVENTTYPE cpu_to_le32(0xC014000D)
+#define STATUS_ACPI_HANDLER_COLLISION cpu_to_le32(0xC014000E)
+#define STATUS_ACPI_INVALID_DATA cpu_to_le32(0xC014000F)
+#define STATUS_ACPI_INVALID_REGION cpu_to_le32(0xC0140010)
+#define STATUS_ACPI_INVALID_ACCESS_SIZE cpu_to_le32(0xC0140011)
+#define STATUS_ACPI_ACQUIRE_GLOBAL_LOCK cpu_to_le32(0xC0140012)
+#define STATUS_ACPI_ALREADY_INITIALIZED cpu_to_le32(0xC0140013)
+#define STATUS_ACPI_NOT_INITIALIZED cpu_to_le32(0xC0140014)
+#define STATUS_ACPI_INVALID_MUTEX_LEVEL cpu_to_le32(0xC0140015)
+#define STATUS_ACPI_MUTEX_NOT_OWNED cpu_to_le32(0xC0140016)
+#define STATUS_ACPI_MUTEX_NOT_OWNER cpu_to_le32(0xC0140017)
+#define STATUS_ACPI_RS_ACCESS cpu_to_le32(0xC0140018)
+#define STATUS_ACPI_INVALID_TABLE cpu_to_le32(0xC0140019)
+#define STATUS_ACPI_REG_HANDLER_FAILED cpu_to_le32(0xC0140020)
+#define STATUS_ACPI_POWER_REQUEST_FAILED cpu_to_le32(0xC0140021)
+#define STATUS_SXS_SECTION_NOT_FOUND cpu_to_le32(0xC0150001)
+#define STATUS_SXS_CANT_GEN_ACTCTX cpu_to_le32(0xC0150002)
+#define STATUS_SXS_INVALID_ACTCTXDATA_FORMAT cpu_to_le32(0xC0150003)
+#define STATUS_SXS_ASSEMBLY_NOT_FOUND cpu_to_le32(0xC0150004)
+#define STATUS_SXS_MANIFEST_FORMAT_ERROR cpu_to_le32(0xC0150005)
+#define STATUS_SXS_MANIFEST_PARSE_ERROR cpu_to_le32(0xC0150006)
+#define STATUS_SXS_ACTIVATION_CONTEXT_DISABLED cpu_to_le32(0xC0150007)
+#define STATUS_SXS_KEY_NOT_FOUND cpu_to_le32(0xC0150008)
+#define STATUS_SXS_VERSION_CONFLICT cpu_to_le32(0xC0150009)
+#define STATUS_SXS_WRONG_SECTION_TYPE cpu_to_le32(0xC015000A)
+#define STATUS_SXS_THREAD_QUERIES_DISABLED cpu_to_le32(0xC015000B)
+#define STATUS_SXS_ASSEMBLY_MISSING cpu_to_le32(0xC015000C)
+#define STATUS_SXS_PROCESS_DEFAULT_ALREADY_SET cpu_to_le32(0xC015000E)
+#define STATUS_SXS_EARLY_DEACTIVATION cpu_to_le32(0xC015000F)
+#define STATUS_SXS_INVALID_DEACTIVATION cpu_to_le32(0xC0150010)
+#define STATUS_SXS_MULTIPLE_DEACTIVATION cpu_to_le32(0xC0150011)
+#define STATUS_SXS_SYSTEM_DEFAULT_ACTIVATION_CONTEXT_EMPTY \
+ cpu_to_le32(0xC0150012)
+#define STATUS_SXS_PROCESS_TERMINATION_REQUESTED cpu_to_le32(0xC0150013)
+#define STATUS_SXS_CORRUPT_ACTIVATION_STACK cpu_to_le32(0xC0150014)
+#define STATUS_SXS_CORRUPTION cpu_to_le32(0xC0150015)
+#define STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_VALUE cpu_to_le32(0xC0150016)
+#define STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_NAME cpu_to_le32(0xC0150017)
+#define STATUS_SXS_IDENTITY_DUPLICATE_ATTRIBUTE cpu_to_le32(0xC0150018)
+#define STATUS_SXS_IDENTITY_PARSE_ERROR cpu_to_le32(0xC0150019)
+#define STATUS_SXS_COMPONENT_STORE_CORRUPT cpu_to_le32(0xC015001A)
+#define STATUS_SXS_FILE_HASH_MISMATCH cpu_to_le32(0xC015001B)
+#define STATUS_SXS_MANIFEST_IDENTITY_SAME_BUT_CONTENTS_DIFFERENT \
+ cpu_to_le32(0xC015001C)
+#define STATUS_SXS_IDENTITIES_DIFFERENT cpu_to_le32(0xC015001D)
+#define STATUS_SXS_ASSEMBLY_IS_NOT_A_DEPLOYMENT cpu_to_le32(0xC015001E)
+#define STATUS_SXS_FILE_NOT_PART_OF_ASSEMBLY cpu_to_le32(0xC015001F)
+#define STATUS_ADVANCED_INSTALLER_FAILED cpu_to_le32(0xC0150020)
+#define STATUS_XML_ENCODING_MISMATCH cpu_to_le32(0xC0150021)
+#define STATUS_SXS_MANIFEST_TOO_BIG cpu_to_le32(0xC0150022)
+#define STATUS_SXS_SETTING_NOT_REGISTERED cpu_to_le32(0xC0150023)
+#define STATUS_SXS_TRANSACTION_CLOSURE_INCOMPLETE cpu_to_le32(0xC0150024)
+#define STATUS_SMI_PRIMITIVE_INSTALLER_FAILED cpu_to_le32(0xC0150025)
+#define STATUS_GENERIC_COMMAND_FAILED cpu_to_le32(0xC0150026)
+#define STATUS_SXS_FILE_HASH_MISSING cpu_to_le32(0xC0150027)
+#define STATUS_TRANSACTIONAL_CONFLICT cpu_to_le32(0xC0190001)
+#define STATUS_INVALID_TRANSACTION cpu_to_le32(0xC0190002)
+#define STATUS_TRANSACTION_NOT_ACTIVE cpu_to_le32(0xC0190003)
+#define STATUS_TM_INITIALIZATION_FAILED cpu_to_le32(0xC0190004)
+#define STATUS_RM_NOT_ACTIVE cpu_to_le32(0xC0190005)
+#define STATUS_RM_METADATA_CORRUPT cpu_to_le32(0xC0190006)
+#define STATUS_TRANSACTION_NOT_JOINED cpu_to_le32(0xC0190007)
+#define STATUS_DIRECTORY_NOT_RM cpu_to_le32(0xC0190008)
+#define STATUS_TRANSACTIONS_UNSUPPORTED_REMOTE cpu_to_le32(0xC019000A)
+#define STATUS_LOG_RESIZE_INVALID_SIZE cpu_to_le32(0xC019000B)
+#define STATUS_REMOTE_FILE_VERSION_MISMATCH cpu_to_le32(0xC019000C)
+#define STATUS_CRM_PROTOCOL_ALREADY_EXISTS cpu_to_le32(0xC019000F)
+#define STATUS_TRANSACTION_PROPAGATION_FAILED cpu_to_le32(0xC0190010)
+#define STATUS_CRM_PROTOCOL_NOT_FOUND cpu_to_le32(0xC0190011)
+#define STATUS_TRANSACTION_SUPERIOR_EXISTS cpu_to_le32(0xC0190012)
+#define STATUS_TRANSACTION_REQUEST_NOT_VALID cpu_to_le32(0xC0190013)
+#define STATUS_TRANSACTION_NOT_REQUESTED cpu_to_le32(0xC0190014)
+#define STATUS_TRANSACTION_ALREADY_ABORTED cpu_to_le32(0xC0190015)
+#define STATUS_TRANSACTION_ALREADY_COMMITTED cpu_to_le32(0xC0190016)
+#define STATUS_TRANSACTION_INVALID_MARSHALL_BUFFER cpu_to_le32(0xC0190017)
+#define STATUS_CURRENT_TRANSACTION_NOT_VALID cpu_to_le32(0xC0190018)
+#define STATUS_LOG_GROWTH_FAILED cpu_to_le32(0xC0190019)
+#define STATUS_OBJECT_NO_LONGER_EXISTS cpu_to_le32(0xC0190021)
+#define STATUS_STREAM_MINIVERSION_NOT_FOUND cpu_to_le32(0xC0190022)
+#define STATUS_STREAM_MINIVERSION_NOT_VALID cpu_to_le32(0xC0190023)
+#define STATUS_MINIVERSION_INACCESSIBLE_FROM_SPECIFIED_TRANSACTION \
+ cpu_to_le32(0xC0190024)
+#define STATUS_CANT_OPEN_MINIVERSION_WITH_MODIFY_INTENT cpu_to_le32(0xC0190025)
+#define STATUS_CANT_CREATE_MORE_STREAM_MINIVERSIONS cpu_to_le32(0xC0190026)
+#define STATUS_HANDLE_NO_LONGER_VALID cpu_to_le32(0xC0190028)
+#define STATUS_LOG_CORRUPTION_DETECTED cpu_to_le32(0xC0190030)
+#define STATUS_RM_DISCONNECTED cpu_to_le32(0xC0190032)
+#define STATUS_ENLISTMENT_NOT_SUPERIOR cpu_to_le32(0xC0190033)
+#define STATUS_FILE_IDENTITY_NOT_PERSISTENT cpu_to_le32(0xC0190036)
+#define STATUS_CANT_BREAK_TRANSACTIONAL_DEPENDENCY cpu_to_le32(0xC0190037)
+#define STATUS_CANT_CROSS_RM_BOUNDARY cpu_to_le32(0xC0190038)
+#define STATUS_TXF_DIR_NOT_EMPTY cpu_to_le32(0xC0190039)
+#define STATUS_INDOUBT_TRANSACTIONS_EXIST cpu_to_le32(0xC019003A)
+#define STATUS_TM_VOLATILE cpu_to_le32(0xC019003B)
+#define STATUS_ROLLBACK_TIMER_EXPIRED cpu_to_le32(0xC019003C)
+#define STATUS_TXF_ATTRIBUTE_CORRUPT cpu_to_le32(0xC019003D)
+#define STATUS_EFS_NOT_ALLOWED_IN_TRANSACTION cpu_to_le32(0xC019003E)
+#define STATUS_TRANSACTIONAL_OPEN_NOT_ALLOWED cpu_to_le32(0xC019003F)
+#define STATUS_TRANSACTED_MAPPING_UNSUPPORTED_REMOTE cpu_to_le32(0xC0190040)
+#define STATUS_TRANSACTION_REQUIRED_PROMOTION cpu_to_le32(0xC0190043)
+#define STATUS_CANNOT_EXECUTE_FILE_IN_TRANSACTION cpu_to_le32(0xC0190044)
+#define STATUS_TRANSACTIONS_NOT_FROZEN cpu_to_le32(0xC0190045)
+#define STATUS_TRANSACTION_FREEZE_IN_PROGRESS cpu_to_le32(0xC0190046)
+#define STATUS_NOT_SNAPSHOT_VOLUME cpu_to_le32(0xC0190047)
+#define STATUS_NO_SAVEPOINT_WITH_OPEN_FILES cpu_to_le32(0xC0190048)
+#define STATUS_SPARSE_NOT_ALLOWED_IN_TRANSACTION cpu_to_le32(0xC0190049)
+#define STATUS_TM_IDENTITY_MISMATCH cpu_to_le32(0xC019004A)
+#define STATUS_FLOATED_SECTION cpu_to_le32(0xC019004B)
+#define STATUS_CANNOT_ACCEPT_TRANSACTED_WORK cpu_to_le32(0xC019004C)
+#define STATUS_CANNOT_ABORT_TRANSACTIONS cpu_to_le32(0xC019004D)
+#define STATUS_TRANSACTION_NOT_FOUND cpu_to_le32(0xC019004E)
+#define STATUS_RESOURCEMANAGER_NOT_FOUND cpu_to_le32(0xC019004F)
+#define STATUS_ENLISTMENT_NOT_FOUND cpu_to_le32(0xC0190050)
+#define STATUS_TRANSACTIONMANAGER_NOT_FOUND cpu_to_le32(0xC0190051)
+#define STATUS_TRANSACTIONMANAGER_NOT_ONLINE cpu_to_le32(0xC0190052)
+#define STATUS_TRANSACTIONMANAGER_RECOVERY_NAME_COLLISION \
+ cpu_to_le32(0xC0190053)
+#define STATUS_TRANSACTION_NOT_ROOT cpu_to_le32(0xC0190054)
+#define STATUS_TRANSACTION_OBJECT_EXPIRED cpu_to_le32(0xC0190055)
+#define STATUS_COMPRESSION_NOT_ALLOWED_IN_TRANSACTION cpu_to_le32(0xC0190056)
+#define STATUS_TRANSACTION_RESPONSE_NOT_ENLISTED cpu_to_le32(0xC0190057)
+#define STATUS_TRANSACTION_RECORD_TOO_LONG cpu_to_le32(0xC0190058)
+#define STATUS_NO_LINK_TRACKING_IN_TRANSACTION cpu_to_le32(0xC0190059)
+#define STATUS_OPERATION_NOT_SUPPORTED_IN_TRANSACTION cpu_to_le32(0xC019005A)
+#define STATUS_TRANSACTION_INTEGRITY_VIOLATED cpu_to_le32(0xC019005B)
+#define STATUS_LOG_SECTOR_INVALID cpu_to_le32(0xC01A0001)
+#define STATUS_LOG_SECTOR_PARITY_INVALID cpu_to_le32(0xC01A0002)
+#define STATUS_LOG_SECTOR_REMAPPED cpu_to_le32(0xC01A0003)
+#define STATUS_LOG_BLOCK_INCOMPLETE cpu_to_le32(0xC01A0004)
+#define STATUS_LOG_INVALID_RANGE cpu_to_le32(0xC01A0005)
+#define STATUS_LOG_BLOCKS_EXHAUSTED cpu_to_le32(0xC01A0006)
+#define STATUS_LOG_READ_CONTEXT_INVALID cpu_to_le32(0xC01A0007)
+#define STATUS_LOG_RESTART_INVALID cpu_to_le32(0xC01A0008)
+#define STATUS_LOG_BLOCK_VERSION cpu_to_le32(0xC01A0009)
+#define STATUS_LOG_BLOCK_INVALID cpu_to_le32(0xC01A000A)
+#define STATUS_LOG_READ_MODE_INVALID cpu_to_le32(0xC01A000B)
+#define STATUS_LOG_METADATA_CORRUPT cpu_to_le32(0xC01A000D)
+#define STATUS_LOG_METADATA_INVALID cpu_to_le32(0xC01A000E)
+#define STATUS_LOG_METADATA_INCONSISTENT cpu_to_le32(0xC01A000F)
+#define STATUS_LOG_RESERVATION_INVALID cpu_to_le32(0xC01A0010)
+#define STATUS_LOG_CANT_DELETE cpu_to_le32(0xC01A0011)
+#define STATUS_LOG_CONTAINER_LIMIT_EXCEEDED cpu_to_le32(0xC01A0012)
+#define STATUS_LOG_START_OF_LOG cpu_to_le32(0xC01A0013)
+#define STATUS_LOG_POLICY_ALREADY_INSTALLED cpu_to_le32(0xC01A0014)
+#define STATUS_LOG_POLICY_NOT_INSTALLED cpu_to_le32(0xC01A0015)
+#define STATUS_LOG_POLICY_INVALID cpu_to_le32(0xC01A0016)
+#define STATUS_LOG_POLICY_CONFLICT cpu_to_le32(0xC01A0017)
+#define STATUS_LOG_PINNED_ARCHIVE_TAIL cpu_to_le32(0xC01A0018)
+#define STATUS_LOG_RECORD_NONEXISTENT cpu_to_le32(0xC01A0019)
+#define STATUS_LOG_RECORDS_RESERVED_INVALID cpu_to_le32(0xC01A001A)
+#define STATUS_LOG_SPACE_RESERVED_INVALID cpu_to_le32(0xC01A001B)
+#define STATUS_LOG_TAIL_INVALID cpu_to_le32(0xC01A001C)
+#define STATUS_LOG_FULL cpu_to_le32(0xC01A001D)
+#define STATUS_LOG_MULTIPLEXED cpu_to_le32(0xC01A001E)
+#define STATUS_LOG_DEDICATED cpu_to_le32(0xC01A001F)
+#define STATUS_LOG_ARCHIVE_NOT_IN_PROGRESS cpu_to_le32(0xC01A0020)
+#define STATUS_LOG_ARCHIVE_IN_PROGRESS cpu_to_le32(0xC01A0021)
+#define STATUS_LOG_EPHEMERAL cpu_to_le32(0xC01A0022)
+#define STATUS_LOG_NOT_ENOUGH_CONTAINERS cpu_to_le32(0xC01A0023)
+#define STATUS_LOG_CLIENT_ALREADY_REGISTERED cpu_to_le32(0xC01A0024)
+#define STATUS_LOG_CLIENT_NOT_REGISTERED cpu_to_le32(0xC01A0025)
+#define STATUS_LOG_FULL_HANDLER_IN_PROGRESS cpu_to_le32(0xC01A0026)
+#define STATUS_LOG_CONTAINER_READ_FAILED cpu_to_le32(0xC01A0027)
+#define STATUS_LOG_CONTAINER_WRITE_FAILED cpu_to_le32(0xC01A0028)
+#define STATUS_LOG_CONTAINER_OPEN_FAILED cpu_to_le32(0xC01A0029)
+#define STATUS_LOG_CONTAINER_STATE_INVALID cpu_to_le32(0xC01A002A)
+#define STATUS_LOG_STATE_INVALID cpu_to_le32(0xC01A002B)
+#define STATUS_LOG_PINNED cpu_to_le32(0xC01A002C)
+#define STATUS_LOG_METADATA_FLUSH_FAILED cpu_to_le32(0xC01A002D)
+#define STATUS_LOG_INCONSISTENT_SECURITY cpu_to_le32(0xC01A002E)
+#define STATUS_LOG_APPENDED_FLUSH_FAILED cpu_to_le32(0xC01A002F)
+#define STATUS_LOG_PINNED_RESERVATION cpu_to_le32(0xC01A0030)
+#define STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD cpu_to_le32(0xC01B00EA)
+#define STATUS_FLT_NO_HANDLER_DEFINED cpu_to_le32(0xC01C0001)
+#define STATUS_FLT_CONTEXT_ALREADY_DEFINED cpu_to_le32(0xC01C0002)
+#define STATUS_FLT_INVALID_ASYNCHRONOUS_REQUEST cpu_to_le32(0xC01C0003)
+#define STATUS_FLT_DISALLOW_FAST_IO cpu_to_le32(0xC01C0004)
+#define STATUS_FLT_INVALID_NAME_REQUEST cpu_to_le32(0xC01C0005)
+#define STATUS_FLT_NOT_SAFE_TO_POST_OPERATION cpu_to_le32(0xC01C0006)
+#define STATUS_FLT_NOT_INITIALIZED cpu_to_le32(0xC01C0007)
+#define STATUS_FLT_FILTER_NOT_READY cpu_to_le32(0xC01C0008)
+#define STATUS_FLT_POST_OPERATION_CLEANUP cpu_to_le32(0xC01C0009)
+#define STATUS_FLT_INTERNAL_ERROR cpu_to_le32(0xC01C000A)
+#define STATUS_FLT_DELETING_OBJECT cpu_to_le32(0xC01C000B)
+#define STATUS_FLT_MUST_BE_NONPAGED_POOL cpu_to_le32(0xC01C000C)
+#define STATUS_FLT_DUPLICATE_ENTRY cpu_to_le32(0xC01C000D)
+#define STATUS_FLT_CBDQ_DISABLED cpu_to_le32(0xC01C000E)
+#define STATUS_FLT_DO_NOT_ATTACH cpu_to_le32(0xC01C000F)
+#define STATUS_FLT_DO_NOT_DETACH cpu_to_le32(0xC01C0010)
+#define STATUS_FLT_INSTANCE_ALTITUDE_COLLISION cpu_to_le32(0xC01C0011)
+#define STATUS_FLT_INSTANCE_NAME_COLLISION cpu_to_le32(0xC01C0012)
+#define STATUS_FLT_FILTER_NOT_FOUND cpu_to_le32(0xC01C0013)
+#define STATUS_FLT_VOLUME_NOT_FOUND cpu_to_le32(0xC01C0014)
+#define STATUS_FLT_INSTANCE_NOT_FOUND cpu_to_le32(0xC01C0015)
+#define STATUS_FLT_CONTEXT_ALLOCATION_NOT_FOUND cpu_to_le32(0xC01C0016)
+#define STATUS_FLT_INVALID_CONTEXT_REGISTRATION cpu_to_le32(0xC01C0017)
+#define STATUS_FLT_NAME_CACHE_MISS cpu_to_le32(0xC01C0018)
+#define STATUS_FLT_NO_DEVICE_OBJECT cpu_to_le32(0xC01C0019)
+#define STATUS_FLT_VOLUME_ALREADY_MOUNTED cpu_to_le32(0xC01C001A)
+#define STATUS_FLT_ALREADY_ENLISTED cpu_to_le32(0xC01C001B)
+#define STATUS_FLT_CONTEXT_ALREADY_LINKED cpu_to_le32(0xC01C001C)
+#define STATUS_FLT_NO_WAITER_FOR_REPLY cpu_to_le32(0xC01C0020)
+#define STATUS_MONITOR_NO_DESCRIPTOR cpu_to_le32(0xC01D0001)
+#define STATUS_MONITOR_UNKNOWN_DESCRIPTOR_FORMAT cpu_to_le32(0xC01D0002)
+#define STATUS_MONITOR_INVALID_DESCRIPTOR_CHECKSUM cpu_to_le32(0xC01D0003)
+#define STATUS_MONITOR_INVALID_STANDARD_TIMING_BLOCK cpu_to_le32(0xC01D0004)
+#define STATUS_MONITOR_WMI_DATABLOCK_REGISTRATION_FAILED cpu_to_le32(0xC01D0005)
+#define STATUS_MONITOR_INVALID_SERIAL_NUMBER_MONDSC_BLOCK \
+ cpu_to_le32(0xC01D0006)
+#define STATUS_MONITOR_INVALID_USER_FRIENDLY_MONDSC_BLOCK \
+ cpu_to_le32(0xC01D0007)
+#define STATUS_MONITOR_NO_MORE_DESCRIPTOR_DATA cpu_to_le32(0xC01D0008)
+#define STATUS_MONITOR_INVALID_DETAILED_TIMING_BLOCK cpu_to_le32(0xC01D0009)
+#define STATUS_GRAPHICS_NOT_EXCLUSIVE_MODE_OWNER cpu_to_le32(0xC01E0000)
+#define STATUS_GRAPHICS_INSUFFICIENT_DMA_BUFFER cpu_to_le32(0xC01E0001)
+#define STATUS_GRAPHICS_INVALID_DISPLAY_ADAPTER cpu_to_le32(0xC01E0002)
+#define STATUS_GRAPHICS_ADAPTER_WAS_RESET cpu_to_le32(0xC01E0003)
+#define STATUS_GRAPHICS_INVALID_DRIVER_MODEL cpu_to_le32(0xC01E0004)
+#define STATUS_GRAPHICS_PRESENT_MODE_CHANGED cpu_to_le32(0xC01E0005)
+#define STATUS_GRAPHICS_PRESENT_OCCLUDED cpu_to_le32(0xC01E0006)
+#define STATUS_GRAPHICS_PRESENT_DENIED cpu_to_le32(0xC01E0007)
+#define STATUS_GRAPHICS_CANNOTCOLORCONVERT cpu_to_le32(0xC01E0008)
+#define STATUS_GRAPHICS_NO_VIDEO_MEMORY cpu_to_le32(0xC01E0100)
+#define STATUS_GRAPHICS_CANT_LOCK_MEMORY cpu_to_le32(0xC01E0101)
+#define STATUS_GRAPHICS_ALLOCATION_BUSY cpu_to_le32(0xC01E0102)
+#define STATUS_GRAPHICS_TOO_MANY_REFERENCES cpu_to_le32(0xC01E0103)
+#define STATUS_GRAPHICS_TRY_AGAIN_LATER cpu_to_le32(0xC01E0104)
+#define STATUS_GRAPHICS_TRY_AGAIN_NOW cpu_to_le32(0xC01E0105)
+#define STATUS_GRAPHICS_ALLOCATION_INVALID cpu_to_le32(0xC01E0106)
+#define STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNAVAILABLE cpu_to_le32(0xC01E0107)
+#define STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNSUPPORTED cpu_to_le32(0xC01E0108)
+#define STATUS_GRAPHICS_CANT_EVICT_PINNED_ALLOCATION cpu_to_le32(0xC01E0109)
+#define STATUS_GRAPHICS_INVALID_ALLOCATION_USAGE cpu_to_le32(0xC01E0110)
+#define STATUS_GRAPHICS_CANT_RENDER_LOCKED_ALLOCATION cpu_to_le32(0xC01E0111)
+#define STATUS_GRAPHICS_ALLOCATION_CLOSED cpu_to_le32(0xC01E0112)
+#define STATUS_GRAPHICS_INVALID_ALLOCATION_INSTANCE cpu_to_le32(0xC01E0113)
+#define STATUS_GRAPHICS_INVALID_ALLOCATION_HANDLE cpu_to_le32(0xC01E0114)
+#define STATUS_GRAPHICS_WRONG_ALLOCATION_DEVICE cpu_to_le32(0xC01E0115)
+#define STATUS_GRAPHICS_ALLOCATION_CONTENT_LOST cpu_to_le32(0xC01E0116)
+#define STATUS_GRAPHICS_GPU_EXCEPTION_ON_DEVICE cpu_to_le32(0xC01E0200)
+#define STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY cpu_to_le32(0xC01E0300)
+#define STATUS_GRAPHICS_VIDPN_TOPOLOGY_NOT_SUPPORTED cpu_to_le32(0xC01E0301)
+#define STATUS_GRAPHICS_VIDPN_TOPOLOGY_CURRENTLY_NOT_SUPPORTED \
+ cpu_to_le32(0xC01E0302)
+#define STATUS_GRAPHICS_INVALID_VIDPN cpu_to_le32(0xC01E0303)
+#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE cpu_to_le32(0xC01E0304)
+#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET cpu_to_le32(0xC01E0305)
+#define STATUS_GRAPHICS_VIDPN_MODALITY_NOT_SUPPORTED cpu_to_le32(0xC01E0306)
+#define STATUS_GRAPHICS_INVALID_VIDPN_SOURCEMODESET cpu_to_le32(0xC01E0308)
+#define STATUS_GRAPHICS_INVALID_VIDPN_TARGETMODESET cpu_to_le32(0xC01E0309)
+#define STATUS_GRAPHICS_INVALID_FREQUENCY cpu_to_le32(0xC01E030A)
+#define STATUS_GRAPHICS_INVALID_ACTIVE_REGION cpu_to_le32(0xC01E030B)
+#define STATUS_GRAPHICS_INVALID_TOTAL_REGION cpu_to_le32(0xC01E030C)
+#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE_MODE \
+ cpu_to_le32(0xC01E0310)
+#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET_MODE \
+ cpu_to_le32(0xC01E0311)
+#define STATUS_GRAPHICS_PINNED_MODE_MUST_REMAIN_IN_SET cpu_to_le32(0xC01E0312)
+#define STATUS_GRAPHICS_PATH_ALREADY_IN_TOPOLOGY cpu_to_le32(0xC01E0313)
+#define STATUS_GRAPHICS_MODE_ALREADY_IN_MODESET cpu_to_le32(0xC01E0314)
+#define STATUS_GRAPHICS_INVALID_VIDEOPRESENTSOURCESET cpu_to_le32(0xC01E0315)
+#define STATUS_GRAPHICS_INVALID_VIDEOPRESENTTARGETSET cpu_to_le32(0xC01E0316)
+#define STATUS_GRAPHICS_SOURCE_ALREADY_IN_SET cpu_to_le32(0xC01E0317)
+#define STATUS_GRAPHICS_TARGET_ALREADY_IN_SET cpu_to_le32(0xC01E0318)
+#define STATUS_GRAPHICS_INVALID_VIDPN_PRESENT_PATH cpu_to_le32(0xC01E0319)
+#define STATUS_GRAPHICS_NO_RECOMMENDED_VIDPN_TOPOLOGY cpu_to_le32(0xC01E031A)
+#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGESET \
+ cpu_to_le32(0xC01E031B)
+#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE cpu_to_le32(0xC01E031C)
+#define STATUS_GRAPHICS_FREQUENCYRANGE_NOT_IN_SET cpu_to_le32(0xC01E031D)
+#define STATUS_GRAPHICS_FREQUENCYRANGE_ALREADY_IN_SET cpu_to_le32(0xC01E031F)
+#define STATUS_GRAPHICS_STALE_MODESET cpu_to_le32(0xC01E0320)
+#define STATUS_GRAPHICS_INVALID_MONITOR_SOURCEMODESET cpu_to_le32(0xC01E0321)
+#define STATUS_GRAPHICS_INVALID_MONITOR_SOURCE_MODE cpu_to_le32(0xC01E0322)
+#define STATUS_GRAPHICS_NO_RECOMMENDED_FUNCTIONAL_VIDPN cpu_to_le32(0xC01E0323)
+#define STATUS_GRAPHICS_MODE_ID_MUST_BE_UNIQUE cpu_to_le32(0xC01E0324)
+#define STATUS_GRAPHICS_EMPTY_ADAPTER_MONITOR_MODE_SUPPORT_INTERSECTION \
+ cpu_to_le32(0xC01E0325)
+#define STATUS_GRAPHICS_VIDEO_PRESENT_TARGETS_LESS_THAN_SOURCES \
+ cpu_to_le32(0xC01E0326)
+#define STATUS_GRAPHICS_PATH_NOT_IN_TOPOLOGY cpu_to_le32(0xC01E0327)
+#define STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_SOURCE \
+ cpu_to_le32(0xC01E0328)
+#define STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_TARGET \
+ cpu_to_le32(0xC01E0329)
+#define STATUS_GRAPHICS_INVALID_MONITORDESCRIPTORSET cpu_to_le32(0xC01E032A)
+#define STATUS_GRAPHICS_INVALID_MONITORDESCRIPTOR cpu_to_le32(0xC01E032B)
+#define STATUS_GRAPHICS_MONITORDESCRIPTOR_NOT_IN_SET cpu_to_le32(0xC01E032C)
+#define STATUS_GRAPHICS_MONITORDESCRIPTOR_ALREADY_IN_SET cpu_to_le32(0xC01E032D)
+#define STATUS_GRAPHICS_MONITORDESCRIPTOR_ID_MUST_BE_UNIQUE \
+ cpu_to_le32(0xC01E032E)
+#define STATUS_GRAPHICS_INVALID_VIDPN_TARGET_SUBSET_TYPE cpu_to_le32(0xC01E032F)
+#define STATUS_GRAPHICS_RESOURCES_NOT_RELATED cpu_to_le32(0xC01E0330)
+#define STATUS_GRAPHICS_SOURCE_ID_MUST_BE_UNIQUE cpu_to_le32(0xC01E0331)
+#define STATUS_GRAPHICS_TARGET_ID_MUST_BE_UNIQUE cpu_to_le32(0xC01E0332)
+#define STATUS_GRAPHICS_NO_AVAILABLE_VIDPN_TARGET cpu_to_le32(0xC01E0333)
+#define STATUS_GRAPHICS_MONITOR_COULD_NOT_BE_ASSOCIATED_WITH_ADAPTER \
+ cpu_to_le32(0xC01E0334)
+#define STATUS_GRAPHICS_NO_VIDPNMGR cpu_to_le32(0xC01E0335)
+#define STATUS_GRAPHICS_NO_ACTIVE_VIDPN cpu_to_le32(0xC01E0336)
+#define STATUS_GRAPHICS_STALE_VIDPN_TOPOLOGY cpu_to_le32(0xC01E0337)
+#define STATUS_GRAPHICS_MONITOR_NOT_CONNECTED cpu_to_le32(0xC01E0338)
+#define STATUS_GRAPHICS_SOURCE_NOT_IN_TOPOLOGY cpu_to_le32(0xC01E0339)
+#define STATUS_GRAPHICS_INVALID_PRIMARYSURFACE_SIZE cpu_to_le32(0xC01E033A)
+#define STATUS_GRAPHICS_INVALID_VISIBLEREGION_SIZE cpu_to_le32(0xC01E033B)
+#define STATUS_GRAPHICS_INVALID_STRIDE cpu_to_le32(0xC01E033C)
+#define STATUS_GRAPHICS_INVALID_PIXELFORMAT cpu_to_le32(0xC01E033D)
+#define STATUS_GRAPHICS_INVALID_COLORBASIS cpu_to_le32(0xC01E033E)
+#define STATUS_GRAPHICS_INVALID_PIXELVALUEACCESSMODE cpu_to_le32(0xC01E033F)
+#define STATUS_GRAPHICS_TARGET_NOT_IN_TOPOLOGY cpu_to_le32(0xC01E0340)
+#define STATUS_GRAPHICS_NO_DISPLAY_MODE_MANAGEMENT_SUPPORT \
+ cpu_to_le32(0xC01E0341)
+#define STATUS_GRAPHICS_VIDPN_SOURCE_IN_USE cpu_to_le32(0xC01E0342)
+#define STATUS_GRAPHICS_CANT_ACCESS_ACTIVE_VIDPN cpu_to_le32(0xC01E0343)
+#define STATUS_GRAPHICS_INVALID_PATH_IMPORTANCE_ORDINAL cpu_to_le32(0xC01E0344)
+#define STATUS_GRAPHICS_INVALID_PATH_CONTENT_GEOMETRY_TRANSFORMATION \
+ cpu_to_le32(0xC01E0345)
+#define STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_SUPPORTED \
+ cpu_to_le32(0xC01E0346)
+#define STATUS_GRAPHICS_INVALID_GAMMA_RAMP cpu_to_le32(0xC01E0347)
+#define STATUS_GRAPHICS_GAMMA_RAMP_NOT_SUPPORTED cpu_to_le32(0xC01E0348)
+#define STATUS_GRAPHICS_MULTISAMPLING_NOT_SUPPORTED cpu_to_le32(0xC01E0349)
+#define STATUS_GRAPHICS_MODE_NOT_IN_MODESET cpu_to_le32(0xC01E034A)
+#define STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY_RECOMMENDATION_REASON \
+ cpu_to_le32(0xC01E034D)
+#define STATUS_GRAPHICS_INVALID_PATH_CONTENT_TYPE cpu_to_le32(0xC01E034E)
+#define STATUS_GRAPHICS_INVALID_COPYPROTECTION_TYPE cpu_to_le32(0xC01E034F)
+#define STATUS_GRAPHICS_UNASSIGNED_MODESET_ALREADY_EXISTS \
+ cpu_to_le32(0xC01E0350)
+#define STATUS_GRAPHICS_INVALID_SCANLINE_ORDERING cpu_to_le32(0xC01E0352)
+#define STATUS_GRAPHICS_TOPOLOGY_CHANGES_NOT_ALLOWED cpu_to_le32(0xC01E0353)
+#define STATUS_GRAPHICS_NO_AVAILABLE_IMPORTANCE_ORDINALS cpu_to_le32(0xC01E0354)
+#define STATUS_GRAPHICS_INCOMPATIBLE_PRIVATE_FORMAT cpu_to_le32(0xC01E0355)
+#define STATUS_GRAPHICS_INVALID_MODE_PRUNING_ALGORITHM cpu_to_le32(0xC01E0356)
+#define STATUS_GRAPHICS_INVALID_MONITOR_CAPABILITY_ORIGIN \
+ cpu_to_le32(0xC01E0357)
+#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE_CONSTRAINT \
+ cpu_to_le32(0xC01E0358)
+#define STATUS_GRAPHICS_MAX_NUM_PATHS_REACHED cpu_to_le32(0xC01E0359)
+#define STATUS_GRAPHICS_CANCEL_VIDPN_TOPOLOGY_AUGMENTATION \
+ cpu_to_le32(0xC01E035A)
+#define STATUS_GRAPHICS_INVALID_CLIENT_TYPE cpu_to_le32(0xC01E035B)
+#define STATUS_GRAPHICS_CLIENTVIDPN_NOT_SET cpu_to_le32(0xC01E035C)
+#define STATUS_GRAPHICS_SPECIFIED_CHILD_ALREADY_CONNECTED \
+ cpu_to_le32(0xC01E0400)
+#define STATUS_GRAPHICS_CHILD_DESCRIPTOR_NOT_SUPPORTED cpu_to_le32(0xC01E0401)
+#define STATUS_GRAPHICS_NOT_A_LINKED_ADAPTER cpu_to_le32(0xC01E0430)
+#define STATUS_GRAPHICS_LEADLINK_NOT_ENUMERATED cpu_to_le32(0xC01E0431)
+#define STATUS_GRAPHICS_CHAINLINKS_NOT_ENUMERATED cpu_to_le32(0xC01E0432)
+#define STATUS_GRAPHICS_ADAPTER_CHAIN_NOT_READY cpu_to_le32(0xC01E0433)
+#define STATUS_GRAPHICS_CHAINLINKS_NOT_STARTED cpu_to_le32(0xC01E0434)
+#define STATUS_GRAPHICS_CHAINLINKS_NOT_POWERED_ON cpu_to_le32(0xC01E0435)
+#define STATUS_GRAPHICS_INCONSISTENT_DEVICE_LINK_STATE cpu_to_le32(0xC01E0436)
+#define STATUS_GRAPHICS_NOT_POST_DEVICE_DRIVER cpu_to_le32(0xC01E0438)
+#define STATUS_GRAPHICS_ADAPTER_ACCESS_NOT_EXCLUDED cpu_to_le32(0xC01E043B)
+#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_COPP_SEMANTICS \
+ cpu_to_le32(0xC01E051C)
+#define STATUS_GRAPHICS_OPM_INVALID_INFORMATION_REQUEST cpu_to_le32(0xC01E051D)
+#define STATUS_GRAPHICS_OPM_DRIVER_INTERNAL_ERROR cpu_to_le32(0xC01E051E)
+#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_OPM_SEMANTICS \
+ cpu_to_le32(0xC01E051F)
+#define STATUS_GRAPHICS_OPM_SIGNALING_NOT_SUPPORTED cpu_to_le32(0xC01E0520)
+#define STATUS_GRAPHICS_OPM_INVALID_CONFIGURATION_REQUEST \
+ cpu_to_le32(0xC01E0521)
+#define STATUS_GRAPHICS_OPM_NOT_SUPPORTED cpu_to_le32(0xC01E0500)
+#define STATUS_GRAPHICS_COPP_NOT_SUPPORTED cpu_to_le32(0xC01E0501)
+#define STATUS_GRAPHICS_UAB_NOT_SUPPORTED cpu_to_le32(0xC01E0502)
+#define STATUS_GRAPHICS_OPM_INVALID_ENCRYPTED_PARAMETERS cpu_to_le32(0xC01E0503)
+#define STATUS_GRAPHICS_OPM_PARAMETER_ARRAY_TOO_SMALL cpu_to_le32(0xC01E0504)
+#define STATUS_GRAPHICS_OPM_NO_PROTECTED_OUTPUTS_EXIST cpu_to_le32(0xC01E0505)
+#define STATUS_GRAPHICS_PVP_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME \
+ cpu_to_le32(0xC01E0506)
+#define STATUS_GRAPHICS_PVP_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP \
+ cpu_to_le32(0xC01E0507)
+#define STATUS_GRAPHICS_PVP_MIRRORING_DEVICES_NOT_SUPPORTED \
+ cpu_to_le32(0xC01E0508)
+#define STATUS_GRAPHICS_OPM_INVALID_POINTER cpu_to_le32(0xC01E050A)
+#define STATUS_GRAPHICS_OPM_INTERNAL_ERROR cpu_to_le32(0xC01E050B)
+#define STATUS_GRAPHICS_OPM_INVALID_HANDLE cpu_to_le32(0xC01E050C)
+#define STATUS_GRAPHICS_PVP_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE \
+ cpu_to_le32(0xC01E050D)
+#define STATUS_GRAPHICS_PVP_INVALID_CERTIFICATE_LENGTH cpu_to_le32(0xC01E050E)
+#define STATUS_GRAPHICS_OPM_SPANNING_MODE_ENABLED cpu_to_le32(0xC01E050F)
+#define STATUS_GRAPHICS_OPM_THEATER_MODE_ENABLED cpu_to_le32(0xC01E0510)
+#define STATUS_GRAPHICS_PVP_HFS_FAILED cpu_to_le32(0xC01E0511)
+#define STATUS_GRAPHICS_OPM_INVALID_SRM cpu_to_le32(0xC01E0512)
+#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_HDCP cpu_to_le32(0xC01E0513)
+#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_ACP cpu_to_le32(0xC01E0514)
+#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_CGMSA \
+ cpu_to_le32(0xC01E0515)
+#define STATUS_GRAPHICS_OPM_HDCP_SRM_NEVER_SET cpu_to_le32(0xC01E0516)
+#define STATUS_GRAPHICS_OPM_RESOLUTION_TOO_HIGH cpu_to_le32(0xC01E0517)
+#define STATUS_GRAPHICS_OPM_ALL_HDCP_HARDWARE_ALREADY_IN_USE \
+ cpu_to_le32(0xC01E0518)
+#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_NO_LONGER_EXISTS \
+ cpu_to_le32(0xC01E051A)
+#define STATUS_GRAPHICS_OPM_SESSION_TYPE_CHANGE_IN_PROGRESS \
+ cpu_to_le32(0xC01E051B)
+#define STATUS_GRAPHICS_I2C_NOT_SUPPORTED cpu_to_le32(0xC01E0580)
+#define STATUS_GRAPHICS_I2C_DEVICE_DOES_NOT_EXIST cpu_to_le32(0xC01E0581)
+#define STATUS_GRAPHICS_I2C_ERROR_TRANSMITTING_DATA cpu_to_le32(0xC01E0582)
+#define STATUS_GRAPHICS_I2C_ERROR_RECEIVING_DATA cpu_to_le32(0xC01E0583)
+#define STATUS_GRAPHICS_DDCCI_VCP_NOT_SUPPORTED cpu_to_le32(0xC01E0584)
+#define STATUS_GRAPHICS_DDCCI_INVALID_DATA cpu_to_le32(0xC01E0585)
+#define STATUS_GRAPHICS_DDCCI_MONITOR_RETURNED_INVALID_TIMING_STATUS_BYTE \
+ cpu_to_le32(0xC01E0586)
+#define STATUS_GRAPHICS_DDCCI_INVALID_CAPABILITIES_STRING \
+ cpu_to_le32(0xC01E0587)
+#define STATUS_GRAPHICS_MCA_INTERNAL_ERROR cpu_to_le32(0xC01E0588)
+#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_COMMAND cpu_to_le32(0xC01E0589)
+#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_LENGTH cpu_to_le32(0xC01E058A)
+#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_CHECKSUM cpu_to_le32(0xC01E058B)
+#define STATUS_GRAPHICS_INVALID_PHYSICAL_MONITOR_HANDLE cpu_to_le32(0xC01E058C)
+#define STATUS_GRAPHICS_MONITOR_NO_LONGER_EXISTS cpu_to_le32(0xC01E058D)
+#define STATUS_GRAPHICS_ONLY_CONSOLE_SESSION_SUPPORTED cpu_to_le32(0xC01E05E0)
+#define STATUS_GRAPHICS_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME \
+ cpu_to_le32(0xC01E05E1)
+#define STATUS_GRAPHICS_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP \
+ cpu_to_le32(0xC01E05E2)
+#define STATUS_GRAPHICS_MIRRORING_DEVICES_NOT_SUPPORTED cpu_to_le32(0xC01E05E3)
+#define STATUS_GRAPHICS_INVALID_POINTER cpu_to_le32(0xC01E05E4)
+#define STATUS_GRAPHICS_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE \
+ cpu_to_le32(0xC01E05E5)
+#define STATUS_GRAPHICS_PARAMETER_ARRAY_TOO_SMALL cpu_to_le32(0xC01E05E6)
+#define STATUS_GRAPHICS_INTERNAL_ERROR cpu_to_le32(0xC01E05E7)
+#define STATUS_GRAPHICS_SESSION_TYPE_CHANGE_IN_PROGRESS cpu_to_le32(0xC01E05E8)
+#define STATUS_FVE_LOCKED_VOLUME cpu_to_le32(0xC0210000)
+#define STATUS_FVE_NOT_ENCRYPTED cpu_to_le32(0xC0210001)
+#define STATUS_FVE_BAD_INFORMATION cpu_to_le32(0xC0210002)
+#define STATUS_FVE_TOO_SMALL cpu_to_le32(0xC0210003)
+#define STATUS_FVE_FAILED_WRONG_FS cpu_to_le32(0xC0210004)
+#define STATUS_FVE_FAILED_BAD_FS cpu_to_le32(0xC0210005)
+#define STATUS_FVE_FS_NOT_EXTENDED cpu_to_le32(0xC0210006)
+#define STATUS_FVE_FS_MOUNTED cpu_to_le32(0xC0210007)
+#define STATUS_FVE_NO_LICENSE cpu_to_le32(0xC0210008)
+#define STATUS_FVE_ACTION_NOT_ALLOWED cpu_to_le32(0xC0210009)
+#define STATUS_FVE_BAD_DATA cpu_to_le32(0xC021000A)
+#define STATUS_FVE_VOLUME_NOT_BOUND cpu_to_le32(0xC021000B)
+#define STATUS_FVE_NOT_DATA_VOLUME cpu_to_le32(0xC021000C)
+#define STATUS_FVE_CONV_READ_ERROR cpu_to_le32(0xC021000D)
+#define STATUS_FVE_CONV_WRITE_ERROR cpu_to_le32(0xC021000E)
+#define STATUS_FVE_OVERLAPPED_UPDATE cpu_to_le32(0xC021000F)
+#define STATUS_FVE_FAILED_SECTOR_SIZE cpu_to_le32(0xC0210010)
+#define STATUS_FVE_FAILED_AUTHENTICATION cpu_to_le32(0xC0210011)
+#define STATUS_FVE_NOT_OS_VOLUME cpu_to_le32(0xC0210012)
+#define STATUS_FVE_KEYFILE_NOT_FOUND cpu_to_le32(0xC0210013)
+#define STATUS_FVE_KEYFILE_INVALID cpu_to_le32(0xC0210014)
+#define STATUS_FVE_KEYFILE_NO_VMK cpu_to_le32(0xC0210015)
+#define STATUS_FVE_TPM_DISABLED cpu_to_le32(0xC0210016)
+#define STATUS_FVE_TPM_SRK_AUTH_NOT_ZERO cpu_to_le32(0xC0210017)
+#define STATUS_FVE_TPM_INVALID_PCR cpu_to_le32(0xC0210018)
+#define STATUS_FVE_TPM_NO_VMK cpu_to_le32(0xC0210019)
+#define STATUS_FVE_PIN_INVALID cpu_to_le32(0xC021001A)
+#define STATUS_FVE_AUTH_INVALID_APPLICATION cpu_to_le32(0xC021001B)
+#define STATUS_FVE_AUTH_INVALID_CONFIG cpu_to_le32(0xC021001C)
+#define STATUS_FVE_DEBUGGER_ENABLED cpu_to_le32(0xC021001D)
+#define STATUS_FVE_DRY_RUN_FAILED cpu_to_le32(0xC021001E)
+#define STATUS_FVE_BAD_METADATA_POINTER cpu_to_le32(0xC021001F)
+#define STATUS_FVE_OLD_METADATA_COPY cpu_to_le32(0xC0210020)
+#define STATUS_FVE_REBOOT_REQUIRED cpu_to_le32(0xC0210021)
+#define STATUS_FVE_RAW_ACCESS cpu_to_le32(0xC0210022)
+#define STATUS_FVE_RAW_BLOCKED cpu_to_le32(0xC0210023)
+#define STATUS_FWP_CALLOUT_NOT_FOUND cpu_to_le32(0xC0220001)
+#define STATUS_FWP_CONDITION_NOT_FOUND cpu_to_le32(0xC0220002)
+#define STATUS_FWP_FILTER_NOT_FOUND cpu_to_le32(0xC0220003)
+#define STATUS_FWP_LAYER_NOT_FOUND cpu_to_le32(0xC0220004)
+#define STATUS_FWP_PROVIDER_NOT_FOUND cpu_to_le32(0xC0220005)
+#define STATUS_FWP_PROVIDER_CONTEXT_NOT_FOUND cpu_to_le32(0xC0220006)
+#define STATUS_FWP_SUBLAYER_NOT_FOUND cpu_to_le32(0xC0220007)
+#define STATUS_FWP_NOT_FOUND cpu_to_le32(0xC0220008)
+#define STATUS_FWP_ALREADY_EXISTS cpu_to_le32(0xC0220009)
+#define STATUS_FWP_IN_USE cpu_to_le32(0xC022000A)
+#define STATUS_FWP_DYNAMIC_SESSION_IN_PROGRESS cpu_to_le32(0xC022000B)
+#define STATUS_FWP_WRONG_SESSION cpu_to_le32(0xC022000C)
+#define STATUS_FWP_NO_TXN_IN_PROGRESS cpu_to_le32(0xC022000D)
+#define STATUS_FWP_TXN_IN_PROGRESS cpu_to_le32(0xC022000E)
+#define STATUS_FWP_TXN_ABORTED cpu_to_le32(0xC022000F)
+#define STATUS_FWP_SESSION_ABORTED cpu_to_le32(0xC0220010)
+#define STATUS_FWP_INCOMPATIBLE_TXN cpu_to_le32(0xC0220011)
+#define STATUS_FWP_TIMEOUT cpu_to_le32(0xC0220012)
+#define STATUS_FWP_NET_EVENTS_DISABLED cpu_to_le32(0xC0220013)
+#define STATUS_FWP_INCOMPATIBLE_LAYER cpu_to_le32(0xC0220014)
+#define STATUS_FWP_KM_CLIENTS_ONLY cpu_to_le32(0xC0220015)
+#define STATUS_FWP_LIFETIME_MISMATCH cpu_to_le32(0xC0220016)
+#define STATUS_FWP_BUILTIN_OBJECT cpu_to_le32(0xC0220017)
+#define STATUS_FWP_TOO_MANY_BOOTTIME_FILTERS cpu_to_le32(0xC0220018)
+#define STATUS_FWP_TOO_MANY_CALLOUTS cpu_to_le32(0xC0220018)
+#define STATUS_FWP_NOTIFICATION_DROPPED cpu_to_le32(0xC0220019)
+#define STATUS_FWP_TRAFFIC_MISMATCH cpu_to_le32(0xC022001A)
+#define STATUS_FWP_INCOMPATIBLE_SA_STATE cpu_to_le32(0xC022001B)
+#define STATUS_FWP_NULL_POINTER cpu_to_le32(0xC022001C)
+#define STATUS_FWP_INVALID_ENUMERATOR cpu_to_le32(0xC022001D)
+#define STATUS_FWP_INVALID_FLAGS cpu_to_le32(0xC022001E)
+#define STATUS_FWP_INVALID_NET_MASK cpu_to_le32(0xC022001F)
+#define STATUS_FWP_INVALID_RANGE cpu_to_le32(0xC0220020)
+#define STATUS_FWP_INVALID_INTERVAL cpu_to_le32(0xC0220021)
+#define STATUS_FWP_ZERO_LENGTH_ARRAY cpu_to_le32(0xC0220022)
+#define STATUS_FWP_NULL_DISPLAY_NAME cpu_to_le32(0xC0220023)
+#define STATUS_FWP_INVALID_ACTION_TYPE cpu_to_le32(0xC0220024)
+#define STATUS_FWP_INVALID_WEIGHT cpu_to_le32(0xC0220025)
+#define STATUS_FWP_MATCH_TYPE_MISMATCH cpu_to_le32(0xC0220026)
+#define STATUS_FWP_TYPE_MISMATCH cpu_to_le32(0xC0220027)
+#define STATUS_FWP_OUT_OF_BOUNDS cpu_to_le32(0xC0220028)
+#define STATUS_FWP_RESERVED cpu_to_le32(0xC0220029)
+#define STATUS_FWP_DUPLICATE_CONDITION cpu_to_le32(0xC022002A)
+#define STATUS_FWP_DUPLICATE_KEYMOD cpu_to_le32(0xC022002B)
+#define STATUS_FWP_ACTION_INCOMPATIBLE_WITH_LAYER cpu_to_le32(0xC022002C)
+#define STATUS_FWP_ACTION_INCOMPATIBLE_WITH_SUBLAYER cpu_to_le32(0xC022002D)
+#define STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_LAYER cpu_to_le32(0xC022002E)
+#define STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_CALLOUT cpu_to_le32(0xC022002F)
+#define STATUS_FWP_INCOMPATIBLE_AUTH_METHOD cpu_to_le32(0xC0220030)
+#define STATUS_FWP_INCOMPATIBLE_DH_GROUP cpu_to_le32(0xC0220031)
+#define STATUS_FWP_EM_NOT_SUPPORTED cpu_to_le32(0xC0220032)
+#define STATUS_FWP_NEVER_MATCH cpu_to_le32(0xC0220033)
+#define STATUS_FWP_PROVIDER_CONTEXT_MISMATCH cpu_to_le32(0xC0220034)
+#define STATUS_FWP_INVALID_PARAMETER cpu_to_le32(0xC0220035)
+#define STATUS_FWP_TOO_MANY_SUBLAYERS cpu_to_le32(0xC0220036)
+#define STATUS_FWP_CALLOUT_NOTIFICATION_FAILED cpu_to_le32(0xC0220037)
+#define STATUS_FWP_INCOMPATIBLE_AUTH_CONFIG cpu_to_le32(0xC0220038)
+#define STATUS_FWP_INCOMPATIBLE_CIPHER_CONFIG cpu_to_le32(0xC0220039)
+#define STATUS_FWP_TCPIP_NOT_READY cpu_to_le32(0xC0220100)
+#define STATUS_FWP_INJECT_HANDLE_CLOSING cpu_to_le32(0xC0220101)
+#define STATUS_FWP_INJECT_HANDLE_STALE cpu_to_le32(0xC0220102)
+#define STATUS_FWP_CANNOT_PEND cpu_to_le32(0xC0220103)
+#define STATUS_NDIS_CLOSING cpu_to_le32(0xC0230002)
+#define STATUS_NDIS_BAD_VERSION cpu_to_le32(0xC0230004)
+#define STATUS_NDIS_BAD_CHARACTERISTICS cpu_to_le32(0xC0230005)
+#define STATUS_NDIS_ADAPTER_NOT_FOUND cpu_to_le32(0xC0230006)
+#define STATUS_NDIS_OPEN_FAILED cpu_to_le32(0xC0230007)
+#define STATUS_NDIS_DEVICE_FAILED cpu_to_le32(0xC0230008)
+#define STATUS_NDIS_MULTICAST_FULL cpu_to_le32(0xC0230009)
+#define STATUS_NDIS_MULTICAST_EXISTS cpu_to_le32(0xC023000A)
+#define STATUS_NDIS_MULTICAST_NOT_FOUND cpu_to_le32(0xC023000B)
+#define STATUS_NDIS_REQUEST_ABORTED cpu_to_le32(0xC023000C)
+#define STATUS_NDIS_RESET_IN_PROGRESS cpu_to_le32(0xC023000D)
+#define STATUS_NDIS_INVALID_PACKET cpu_to_le32(0xC023000F)
+#define STATUS_NDIS_INVALID_DEVICE_REQUEST cpu_to_le32(0xC0230010)
+#define STATUS_NDIS_ADAPTER_NOT_READY cpu_to_le32(0xC0230011)
+#define STATUS_NDIS_INVALID_LENGTH cpu_to_le32(0xC0230014)
+#define STATUS_NDIS_INVALID_DATA cpu_to_le32(0xC0230015)
+#define STATUS_NDIS_BUFFER_TOO_SHORT cpu_to_le32(0xC0230016)
+#define STATUS_NDIS_INVALID_OID cpu_to_le32(0xC0230017)
+#define STATUS_NDIS_ADAPTER_REMOVED cpu_to_le32(0xC0230018)
+#define STATUS_NDIS_UNSUPPORTED_MEDIA cpu_to_le32(0xC0230019)
+#define STATUS_NDIS_GROUP_ADDRESS_IN_USE cpu_to_le32(0xC023001A)
+#define STATUS_NDIS_FILE_NOT_FOUND cpu_to_le32(0xC023001B)
+#define STATUS_NDIS_ERROR_READING_FILE cpu_to_le32(0xC023001C)
+#define STATUS_NDIS_ALREADY_MAPPED cpu_to_le32(0xC023001D)
+#define STATUS_NDIS_RESOURCE_CONFLICT cpu_to_le32(0xC023001E)
+#define STATUS_NDIS_MEDIA_DISCONNECTED cpu_to_le32(0xC023001F)
+#define STATUS_NDIS_INVALID_ADDRESS cpu_to_le32(0xC0230022)
+#define STATUS_NDIS_PAUSED cpu_to_le32(0xC023002A)
+#define STATUS_NDIS_INTERFACE_NOT_FOUND cpu_to_le32(0xC023002B)
+#define STATUS_NDIS_UNSUPPORTED_REVISION cpu_to_le32(0xC023002C)
+#define STATUS_NDIS_INVALID_PORT cpu_to_le32(0xC023002D)
+#define STATUS_NDIS_INVALID_PORT_STATE cpu_to_le32(0xC023002E)
+#define STATUS_NDIS_LOW_POWER_STATE cpu_to_le32(0xC023002F)
+#define STATUS_NDIS_NOT_SUPPORTED cpu_to_le32(0xC02300BB)
+#define STATUS_NDIS_DOT11_AUTO_CONFIG_ENABLED cpu_to_le32(0xC0232000)
+#define STATUS_NDIS_DOT11_MEDIA_IN_USE cpu_to_le32(0xC0232001)
+#define STATUS_NDIS_DOT11_POWER_STATE_INVALID cpu_to_le32(0xC0232002)
+#define STATUS_IPSEC_BAD_SPI cpu_to_le32(0xC0360001)
+#define STATUS_IPSEC_SA_LIFETIME_EXPIRED cpu_to_le32(0xC0360002)
+#define STATUS_IPSEC_WRONG_SA cpu_to_le32(0xC0360003)
+#define STATUS_IPSEC_REPLAY_CHECK_FAILED cpu_to_le32(0xC0360004)
+#define STATUS_IPSEC_INVALID_PACKET cpu_to_le32(0xC0360005)
+#define STATUS_IPSEC_INTEGRITY_CHECK_FAILED cpu_to_le32(0xC0360006)
+#define STATUS_IPSEC_CLEAR_TEXT_DROP cpu_to_le32(0xC0360007)
+
+#define STATUS_NO_PREAUTH_INTEGRITY_HASH_OVERLAP cpu_to_le32(0xC05D0000)
+#define STATUS_INVALID_LOCK_RANGE cpu_to_le32(0xC00001A1)
diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c
new file mode 100644
index 000000000000..44aea33a67fa
--- /dev/null
+++ b/fs/ksmbd/transport_ipc.c
@@ -0,0 +1,874 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <linux/hashtable.h>
+#include <net/net_namespace.h>
+#include <net/genetlink.h>
+#include <linux/socket.h>
+#include <linux/workqueue.h>
+
+#include "vfs_cache.h"
+#include "transport_ipc.h"
+#include "server.h"
+#include "smb_common.h"
+
+#include "mgmt/user_config.h"
+#include "mgmt/share_config.h"
+#include "mgmt/user_session.h"
+#include "mgmt/tree_connect.h"
+#include "mgmt/ksmbd_ida.h"
+#include "connection.h"
+#include "transport_tcp.h"
+
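+/* How long a caller waits for the user-space daemon to answer an IPC request. */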
+#define IPC_WAIT_TIMEOUT (2 * HZ)
+
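+/*
+ * In-flight requests are tracked in ipc_msg_table, keyed by a handle
+ * allocated from ipc_ida; handle_response() uses that handle to find
+ * the waiting entry when the daemon replies.
+ */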
+#define IPC_MSG_HASH_BITS 3
+static DEFINE_HASHTABLE(ipc_msg_table, IPC_MSG_HASH_BITS);
+static DECLARE_RWSEM(ipc_msg_table_lock);
+static DEFINE_MUTEX(startup_lock);
+
+static DEFINE_IDA(ipc_ida);
+
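+/* Identifies the user-space ksmbd daemon this module talks to over netlink. */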
+static unsigned int ksmbd_tools_pid;
+
+static bool ksmbd_ipc_validate_version(struct genl_info *m)
+{
+ if (m->genlhdr->version != KSMBD_GENL_VERSION) {
+ pr_err("%s. ksmbd: %d, kernel module: %d. %s.\n",
+ "Daemon and kernel module version mismatch",
+ m->genlhdr->version,
+ KSMBD_GENL_VERSION,
+ "User-space ksmbd should terminate");
+ return false;
+ }
+ return true;
+}
+
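+/* Envelope for an IPC message: @type selects the event, @payload holds @sz bytes. */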
+struct ksmbd_ipc_msg {
+ unsigned int type;
+ unsigned int sz;
+ unsigned char payload[];
+};
+
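+/*
+ * One entry per outstanding request: the sender sleeps on @wait until
+ * handle_response() publishes the daemon's reply in @response.
+ */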
+struct ipc_msg_table_entry {
+ unsigned int handle;
+ unsigned int type;
+ wait_queue_head_t wait;
+ struct hlist_node ipc_table_hlist;
+
+ void *response;
+};
+
+static struct delayed_work ipc_timer_work;
+
+static int handle_startup_event(struct sk_buff *skb, struct genl_info *info);
+static int handle_unsupported_event(struct sk_buff *skb, struct genl_info *info);
+static int handle_generic_event(struct sk_buff *skb, struct genl_info *info);
+static int ksmbd_ipc_heartbeat_request(void);
+
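+/* Expected payload length per event; variable-sized events carry no .len constraint. */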
+static const struct nla_policy ksmbd_nl_policy[KSMBD_EVENT_MAX] = {
+ [KSMBD_EVENT_UNSPEC] = {
+ .len = 0,
+ },
+ [KSMBD_EVENT_HEARTBEAT_REQUEST] = {
+ .len = sizeof(struct ksmbd_heartbeat),
+ },
+ [KSMBD_EVENT_STARTING_UP] = {
+ .len = sizeof(struct ksmbd_startup_request),
+ },
+ [KSMBD_EVENT_SHUTTING_DOWN] = {
+ .len = sizeof(struct ksmbd_shutdown_request),
+ },
+ [KSMBD_EVENT_LOGIN_REQUEST] = {
+ .len = sizeof(struct ksmbd_login_request),
+ },
+ [KSMBD_EVENT_LOGIN_RESPONSE] = {
+ .len = sizeof(struct ksmbd_login_response),
+ },
+ [KSMBD_EVENT_SHARE_CONFIG_REQUEST] = {
+ .len = sizeof(struct ksmbd_share_config_request),
+ },
+ [KSMBD_EVENT_SHARE_CONFIG_RESPONSE] = {
+ .len = sizeof(struct ksmbd_share_config_response),
+ },
+ [KSMBD_EVENT_TREE_CONNECT_REQUEST] = {
+ .len = sizeof(struct ksmbd_tree_connect_request),
+ },
+ [KSMBD_EVENT_TREE_CONNECT_RESPONSE] = {
+ .len = sizeof(struct ksmbd_tree_connect_response),
+ },
+ [KSMBD_EVENT_TREE_DISCONNECT_REQUEST] = {
+ .len = sizeof(struct ksmbd_tree_disconnect_request),
+ },
+ [KSMBD_EVENT_LOGOUT_REQUEST] = {
+ .len = sizeof(struct ksmbd_logout_request),
+ },
+ [KSMBD_EVENT_RPC_REQUEST] = {
+ },
+ [KSMBD_EVENT_RPC_RESPONSE] = {
+ },
+ [KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST] = {
+ },
+ [KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE] = {
+ },
+};
+
+static struct genl_ops ksmbd_genl_ops[] = {
+ {
+ .cmd = KSMBD_EVENT_UNSPEC,
+ .doit = handle_unsupported_event,
+ },
+ {
+ .cmd = KSMBD_EVENT_HEARTBEAT_REQUEST,
+ .doit = handle_unsupported_event,
+ },
+ {
+ .cmd = KSMBD_EVENT_STARTING_UP,
+ .doit = handle_startup_event,
+ },
+ {
+ .cmd = KSMBD_EVENT_SHUTTING_DOWN,
+ .doit = handle_unsupported_event,
+ },
+ {
+ .cmd = KSMBD_EVENT_LOGIN_REQUEST,
+ .doit = handle_unsupported_event,
+ },
+ {
+ .cmd = KSMBD_EVENT_LOGIN_RESPONSE,
+ .doit = handle_generic_event,
+ },
+ {
+ .cmd = KSMBD_EVENT_SHARE_CONFIG_REQUEST,
+ .doit = handle_unsupported_event,
+ },
+ {
+ .cmd = KSMBD_EVENT_SHARE_CONFIG_RESPONSE,
+ .doit = handle_generic_event,
+ },
+ {
+ .cmd = KSMBD_EVENT_TREE_CONNECT_REQUEST,
+ .doit = handle_unsupported_event,
+ },
+ {
+ .cmd = KSMBD_EVENT_TREE_CONNECT_RESPONSE,
+ .doit = handle_generic_event,
+ },
+ {
+ .cmd = KSMBD_EVENT_TREE_DISCONNECT_REQUEST,
+ .doit = handle_unsupported_event,
+ },
+ {
+ .cmd = KSMBD_EVENT_LOGOUT_REQUEST,
+ .doit = handle_unsupported_event,
+ },
+ {
+ .cmd = KSMBD_EVENT_RPC_REQUEST,
+ .doit = handle_unsupported_event,
+ },
+ {
+ .cmd = KSMBD_EVENT_RPC_RESPONSE,
+ .doit = handle_generic_event,
+ },
+ {
+ .cmd = KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST,
+ .doit = handle_unsupported_event,
+ },
+ {
+ .cmd = KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE,
+ .doit = handle_generic_event,
+ },
+};
+
+static struct genl_family ksmbd_genl_family = {
+ .name = KSMBD_GENL_NAME,
+ .version = KSMBD_GENL_VERSION,
+ .hdrsize = 0,
+ .maxattr = KSMBD_EVENT_MAX,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .ops = ksmbd_genl_ops,
+ .n_ops = ARRAY_SIZE(ksmbd_genl_ops),
+};
+
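+/*
+ * Attach the attribute policy to the family and relax per-op netlink
+ * validation before the family is registered.
+ */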
+static void ksmbd_nl_init_fixup(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ksmbd_genl_ops); i++)
+ ksmbd_genl_ops[i].validate = GENL_DONT_VALIDATE_STRICT |
+ GENL_DONT_VALIDATE_DUMP;
+
+ ksmbd_genl_family.policy = ksmbd_nl_policy;
+}
+
+static int rpc_context_flags(struct ksmbd_session *sess)
+{
+ if (user_guest(sess->user))
+ return KSMBD_RPC_RESTRICTED_CONTEXT;
+ return 0;
+}
+
+static void ipc_update_last_active(void)
+{
+ if (server_conf.ipc_timeout)
+ server_conf.ipc_last_active = jiffies;
+}
+
+static struct ksmbd_ipc_msg *ipc_msg_alloc(size_t sz)
+{
+ struct ksmbd_ipc_msg *msg;
+ size_t msg_sz = sz + sizeof(struct ksmbd_ipc_msg);
+
+ msg = kvmalloc(msg_sz, GFP_KERNEL | __GFP_ZERO);
+ if (msg)
+ msg->sz = sz;
+ return msg;
+}
+
+static void ipc_msg_free(struct ksmbd_ipc_msg *msg)
+{
+ kvfree(msg);
+}
+
+static void ipc_msg_handle_free(int handle)
+{
+ if (handle >= 0)
+ ksmbd_release_id(&ipc_ida, handle);
+}
+
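+/*
+ * Dispatch an IPC response from the user-space daemon to the request
+ * waiting for it: look up the entry by the handle carried at the start of
+ * the payload, copy the payload into entry->response and wake the waiter.
+ */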
+static int handle_response(int type, void *payload, size_t sz)
+{
+ unsigned int handle = *(unsigned int *)payload;
+ struct ipc_msg_table_entry *entry;
+ int ret = 0;
+
+ ipc_update_last_active();
+ down_read(&ipc_msg_table_lock);
+ hash_for_each_possible(ipc_msg_table, entry, ipc_table_hlist, handle) {
+ if (handle != entry->handle)
+ continue;
+
+ entry->response = NULL;
+ /*
+ * Response message type value should be equal to
+ * request message type + 1.
+ */
+ if (entry->type + 1 != type) {
+ pr_err("Waiting for IPC type %d, got %d. Ignore.\n",
+ entry->type + 1, type);
+ }
+
+ entry->response = kvmalloc(sz, GFP_KERNEL | __GFP_ZERO);
+ if (!entry->response) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ memcpy(entry->response, payload, sz);
+ wake_up_interruptible(&entry->wait);
+ ret = 0;
+ break;
+ }
+ up_read(&ipc_msg_table_lock);
+
+ return ret;
+}
+
+static int ipc_server_config_on_startup(struct ksmbd_startup_request *req)
+{
+ int ret;
+
+ ksmbd_set_fd_limit(req->file_max);
+ server_conf.flags = req->flags;
+ server_conf.signing = req->signing;
+ server_conf.tcp_port = req->tcp_port;
+ server_conf.ipc_timeout = req->ipc_timeout * HZ;
+ server_conf.deadtime = req->deadtime * SMB_ECHO_INTERVAL;
+ server_conf.share_fake_fscaps = req->share_fake_fscaps;
+ ksmbd_init_domain(req->sub_auth);
+
+ if (req->smb2_max_read)
+ init_smb2_max_read_size(req->smb2_max_read);
+ if (req->smb2_max_write)
+ init_smb2_max_write_size(req->smb2_max_write);
+ if (req->smb2_max_trans)
+ init_smb2_max_trans_size(req->smb2_max_trans);
+
+ ret = ksmbd_set_netbios_name(req->netbios_name);
+ ret |= ksmbd_set_server_string(req->server_string);
+ ret |= ksmbd_set_work_group(req->work_group);
+ ret |= ksmbd_tcp_set_interfaces(KSMBD_STARTUP_CONFIG_INTERFACES(req),
+ req->ifc_list_sz);
+ if (ret) {
+ pr_err("Server configuration error: %s %s %s\n",
+ req->netbios_name, req->server_string,
+ req->work_group);
+ return ret;
+ }
+
+ if (req->min_prot[0]) {
+ ret = ksmbd_lookup_protocol_idx(req->min_prot);
+ if (ret >= 0)
+ server_conf.min_protocol = ret;
+ }
+ if (req->max_prot[0]) {
+ ret = ksmbd_lookup_protocol_idx(req->max_prot);
+ if (ret >= 0)
+ server_conf.max_protocol = ret;
+ }
+
+ if (server_conf.ipc_timeout)
+ schedule_delayed_work(&ipc_timer_work, server_conf.ipc_timeout);
+ return 0;
+}
+
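+/*
+ * Handle a starting-up announcement from ksmbd-tools: on first startup the
+ * daemon's configuration is applied; if a previous daemon is registered,
+ * its place is taken over only when it no longer answers heartbeats.
+ */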
+static int handle_startup_event(struct sk_buff *skb, struct genl_info *info)
+{
+ int ret = 0;
+
+#ifdef CONFIG_SMB_SERVER_CHECK_CAP_NET_ADMIN
+ if (!netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+#endif
+
+ if (!ksmbd_ipc_validate_version(info))
+ return -EINVAL;
+
+ if (!info->attrs[KSMBD_EVENT_STARTING_UP])
+ return -EINVAL;
+
+ mutex_lock(&startup_lock);
+ if (!ksmbd_server_configurable()) {
+ mutex_unlock(&startup_lock);
+ pr_err("Server reset is in progress, can't start daemon\n");
+ return -EINVAL;
+ }
+
+ if (ksmbd_tools_pid) {
+ if (ksmbd_ipc_heartbeat_request() == 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ pr_err("Reconnect to a new user space daemon\n");
+ } else {
+ struct ksmbd_startup_request *req;
+
+ req = nla_data(info->attrs[info->genlhdr->cmd]);
+ ret = ipc_server_config_on_startup(req);
+ if (ret)
+ goto out;
+ server_queue_ctrl_init_work();
+ }
+
+ ksmbd_tools_pid = info->snd_portid;
+ ipc_update_last_active();
+
+out:
+ mutex_unlock(&startup_lock);
+ return ret;
+}
+
+static int handle_unsupported_event(struct sk_buff *skb, struct genl_info *info)
+{
+ pr_err("Unknown IPC event: %d, ignore.\n", info->genlhdr->cmd);
+ return -EINVAL;
+}
+
+static int handle_generic_event(struct sk_buff *skb, struct genl_info *info)
+{
+ void *payload;
+ int sz;
+ int type = info->genlhdr->cmd;
+
+#ifdef CONFIG_SMB_SERVER_CHECK_CAP_NET_ADMIN
+ if (!netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+#endif
+
+ if (type >= KSMBD_EVENT_MAX) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ if (!ksmbd_ipc_validate_version(info))
+ return -EINVAL;
+
+ if (!info->attrs[type])
+ return -EINVAL;
+
+ payload = nla_data(info->attrs[info->genlhdr->cmd]);
+ sz = nla_len(info->attrs[info->genlhdr->cmd]);
+ return handle_response(type, payload, sz);
+}
+
+static int ipc_msg_send(struct ksmbd_ipc_msg *msg)
+{
+ struct genlmsghdr *nlh;
+ struct sk_buff *skb;
+ int ret = -EINVAL;
+
+ if (!ksmbd_tools_pid)
+ return ret;
+
+ skb = genlmsg_new(msg->sz, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ nlh = genlmsg_put(skb, 0, 0, &ksmbd_genl_family, 0, msg->type);
+ if (!nlh)
+ goto out;
+
+ ret = nla_put(skb, msg->type, msg->sz, msg->payload);
+ if (ret) {
+ genlmsg_cancel(skb, nlh);
+ goto out;
+ }
+
+ genlmsg_end(skb, nlh);
+ ret = genlmsg_unicast(&init_net, skb, ksmbd_tools_pid);
+ if (!ret)
+ ipc_update_last_active();
+ return ret;
+
+out:
+ nlmsg_free(skb);
+ return ret;
+}
+
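+/*
+ * Send a request to the user-space daemon and wait up to IPC_WAIT_TIMEOUT
+ * for the matching response. The table entry lives on the stack; the
+ * response buffer is allocated in handle_response() and is owned by the
+ * caller afterwards.
+ */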
+static void *ipc_msg_send_request(struct ksmbd_ipc_msg *msg, unsigned int handle)
+{
+ struct ipc_msg_table_entry entry;
+ int ret;
+
+ if ((int)handle < 0)
+ return NULL;
+
+ entry.type = msg->type;
+ entry.response = NULL;
+ init_waitqueue_head(&entry.wait);
+
+ down_write(&ipc_msg_table_lock);
+ entry.handle = handle;
+ hash_add(ipc_msg_table, &entry.ipc_table_hlist, entry.handle);
+ up_write(&ipc_msg_table_lock);
+
+ ret = ipc_msg_send(msg);
+ if (ret)
+ goto out;
+
+ ret = wait_event_interruptible_timeout(entry.wait,
+ entry.response != NULL,
+ IPC_WAIT_TIMEOUT);
+out:
+ down_write(&ipc_msg_table_lock);
+ hash_del(&entry.ipc_table_hlist);
+ up_write(&ipc_msg_table_lock);
+ return entry.response;
+}
+
+static int ksmbd_ipc_heartbeat_request(void)
+{
+ struct ksmbd_ipc_msg *msg;
+ int ret;
+
+ msg = ipc_msg_alloc(sizeof(struct ksmbd_heartbeat));
+ if (!msg)
+ return -EINVAL;
+
+ msg->type = KSMBD_EVENT_HEARTBEAT_REQUEST;
+ ret = ipc_msg_send(msg);
+ ipc_msg_free(msg);
+ return ret;
+}
+
+struct ksmbd_login_response *ksmbd_ipc_login_request(const char *account)
+{
+ struct ksmbd_ipc_msg *msg;
+ struct ksmbd_login_request *req;
+ struct ksmbd_login_response *resp;
+
+ if (strlen(account) >= KSMBD_REQ_MAX_ACCOUNT_NAME_SZ)
+ return NULL;
+
+ msg = ipc_msg_alloc(sizeof(struct ksmbd_login_request));
+ if (!msg)
+ return NULL;
+
+ msg->type = KSMBD_EVENT_LOGIN_REQUEST;
+ req = (struct ksmbd_login_request *)msg->payload;
+ req->handle = ksmbd_acquire_id(&ipc_ida);
+ strscpy(req->account, account, KSMBD_REQ_MAX_ACCOUNT_NAME_SZ);
+
+ resp = ipc_msg_send_request(msg, req->handle);
+ ipc_msg_handle_free(req->handle);
+ ipc_msg_free(msg);
+ return resp;
+}
+
+struct ksmbd_spnego_authen_response *
+ksmbd_ipc_spnego_authen_request(const char *spnego_blob, int blob_len)
+{
+ struct ksmbd_ipc_msg *msg;
+ struct ksmbd_spnego_authen_request *req;
+ struct ksmbd_spnego_authen_response *resp;
+
+ msg = ipc_msg_alloc(sizeof(struct ksmbd_spnego_authen_request) +
+ blob_len + 1);
+ if (!msg)
+ return NULL;
+
+ msg->type = KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST;
+ req = (struct ksmbd_spnego_authen_request *)msg->payload;
+ req->handle = ksmbd_acquire_id(&ipc_ida);
+ req->spnego_blob_len = blob_len;
+ memcpy(req->spnego_blob, spnego_blob, blob_len);
+
+ resp = ipc_msg_send_request(msg, req->handle);
+ ipc_msg_handle_free(req->handle);
+ ipc_msg_free(msg);
+ return resp;
+}
+
+struct ksmbd_tree_connect_response *
+ksmbd_ipc_tree_connect_request(struct ksmbd_session *sess,
+ struct ksmbd_share_config *share,
+ struct ksmbd_tree_connect *tree_conn,
+ struct sockaddr *peer_addr)
+{
+ struct ksmbd_ipc_msg *msg;
+ struct ksmbd_tree_connect_request *req;
+ struct ksmbd_tree_connect_response *resp;
+
+ if (strlen(user_name(sess->user)) >= KSMBD_REQ_MAX_ACCOUNT_NAME_SZ)
+ return NULL;
+
+ if (strlen(share->name) >= KSMBD_REQ_MAX_SHARE_NAME)
+ return NULL;
+
+ msg = ipc_msg_alloc(sizeof(struct ksmbd_tree_connect_request));
+ if (!msg)
+ return NULL;
+
+ msg->type = KSMBD_EVENT_TREE_CONNECT_REQUEST;
+ req = (struct ksmbd_tree_connect_request *)msg->payload;
+
+ req->handle = ksmbd_acquire_id(&ipc_ida);
+ req->account_flags = sess->user->flags;
+ req->session_id = sess->id;
+ req->connect_id = tree_conn->id;
+ strscpy(req->account, user_name(sess->user), KSMBD_REQ_MAX_ACCOUNT_NAME_SZ);
+ strscpy(req->share, share->name, KSMBD_REQ_MAX_SHARE_NAME);
+ snprintf(req->peer_addr, sizeof(req->peer_addr), "%pIS", peer_addr);
+
+ if (peer_addr->sa_family == AF_INET6)
+ req->flags |= KSMBD_TREE_CONN_FLAG_REQUEST_IPV6;
+ if (test_session_flag(sess, CIFDS_SESSION_FLAG_SMB2))
+ req->flags |= KSMBD_TREE_CONN_FLAG_REQUEST_SMB2;
+
+ resp = ipc_msg_send_request(msg, req->handle);
+ ipc_msg_handle_free(req->handle);
+ ipc_msg_free(msg);
+ return resp;
+}
+
+int ksmbd_ipc_tree_disconnect_request(unsigned long long session_id,
+ unsigned long long connect_id)
+{
+ struct ksmbd_ipc_msg *msg;
+ struct ksmbd_tree_disconnect_request *req;
+ int ret;
+
+ msg = ipc_msg_alloc(sizeof(struct ksmbd_tree_disconnect_request));
+ if (!msg)
+ return -ENOMEM;
+
+ msg->type = KSMBD_EVENT_TREE_DISCONNECT_REQUEST;
+ req = (struct ksmbd_tree_disconnect_request *)msg->payload;
+ req->session_id = session_id;
+ req->connect_id = connect_id;
+
+ ret = ipc_msg_send(msg);
+ ipc_msg_free(msg);
+ return ret;
+}
+
+int ksmbd_ipc_logout_request(const char *account)
+{
+ struct ksmbd_ipc_msg *msg;
+ struct ksmbd_logout_request *req;
+ int ret;
+
+ if (strlen(account) >= KSMBD_REQ_MAX_ACCOUNT_NAME_SZ)
+ return -EINVAL;
+
+ msg = ipc_msg_alloc(sizeof(struct ksmbd_logout_request));
+ if (!msg)
+ return -ENOMEM;
+
+ msg->type = KSMBD_EVENT_LOGOUT_REQUEST;
+ req = (struct ksmbd_logout_request *)msg->payload;
+ strscpy(req->account, account, KSMBD_REQ_MAX_ACCOUNT_NAME_SZ);
+
+ ret = ipc_msg_send(msg);
+ ipc_msg_free(msg);
+ return ret;
+}
+
+struct ksmbd_share_config_response *
+ksmbd_ipc_share_config_request(const char *name)
+{
+ struct ksmbd_ipc_msg *msg;
+ struct ksmbd_share_config_request *req;
+ struct ksmbd_share_config_response *resp;
+
+ if (strlen(name) >= KSMBD_REQ_MAX_SHARE_NAME)
+ return NULL;
+
+ msg = ipc_msg_alloc(sizeof(struct ksmbd_share_config_request));
+ if (!msg)
+ return NULL;
+
+ msg->type = KSMBD_EVENT_SHARE_CONFIG_REQUEST;
+ req = (struct ksmbd_share_config_request *)msg->payload;
+ req->handle = ksmbd_acquire_id(&ipc_ida);
+ strscpy(req->share_name, name, KSMBD_REQ_MAX_SHARE_NAME);
+
+ resp = ipc_msg_send_request(msg, req->handle);
+ ipc_msg_handle_free(req->handle);
+ ipc_msg_free(msg);
+ return resp;
+}
+
+struct ksmbd_rpc_command *ksmbd_rpc_open(struct ksmbd_session *sess, int handle)
+{
+ struct ksmbd_ipc_msg *msg;
+ struct ksmbd_rpc_command *req;
+ struct ksmbd_rpc_command *resp;
+
+ msg = ipc_msg_alloc(sizeof(struct ksmbd_rpc_command));
+ if (!msg)
+ return NULL;
+
+ msg->type = KSMBD_EVENT_RPC_REQUEST;
+ req = (struct ksmbd_rpc_command *)msg->payload;
+ req->handle = handle;
+ req->flags = ksmbd_session_rpc_method(sess, handle);
+ req->flags |= KSMBD_RPC_OPEN_METHOD;
+ req->payload_sz = 0;
+
+ resp = ipc_msg_send_request(msg, req->handle);
+ ipc_msg_free(msg);
+ return resp;
+}
+
+struct ksmbd_rpc_command *ksmbd_rpc_close(struct ksmbd_session *sess, int handle)
+{
+ struct ksmbd_ipc_msg *msg;
+ struct ksmbd_rpc_command *req;
+ struct ksmbd_rpc_command *resp;
+
+ msg = ipc_msg_alloc(sizeof(struct ksmbd_rpc_command));
+ if (!msg)
+ return NULL;
+
+ msg->type = KSMBD_EVENT_RPC_REQUEST;
+ req = (struct ksmbd_rpc_command *)msg->payload;
+ req->handle = handle;
+ req->flags = ksmbd_session_rpc_method(sess, handle);
+ req->flags |= KSMBD_RPC_CLOSE_METHOD;
+ req->payload_sz = 0;
+
+ resp = ipc_msg_send_request(msg, req->handle);
+ ipc_msg_free(msg);
+ return resp;
+}
+
+struct ksmbd_rpc_command *ksmbd_rpc_write(struct ksmbd_session *sess, int handle,
+ void *payload, size_t payload_sz)
+{
+ struct ksmbd_ipc_msg *msg;
+ struct ksmbd_rpc_command *req;
+ struct ksmbd_rpc_command *resp;
+
+ msg = ipc_msg_alloc(sizeof(struct ksmbd_rpc_command) + payload_sz + 1);
+ if (!msg)
+ return NULL;
+
+ msg->type = KSMBD_EVENT_RPC_REQUEST;
+ req = (struct ksmbd_rpc_command *)msg->payload;
+ req->handle = handle;
+ req->flags = ksmbd_session_rpc_method(sess, handle);
+ req->flags |= rpc_context_flags(sess);
+ req->flags |= KSMBD_RPC_WRITE_METHOD;
+ req->payload_sz = payload_sz;
+ memcpy(req->payload, payload, payload_sz);
+
+ resp = ipc_msg_send_request(msg, req->handle);
+ ipc_msg_free(msg);
+ return resp;
+}
+
+struct ksmbd_rpc_command *ksmbd_rpc_read(struct ksmbd_session *sess, int handle)
+{
+ struct ksmbd_ipc_msg *msg;
+ struct ksmbd_rpc_command *req;
+ struct ksmbd_rpc_command *resp;
+
+ msg = ipc_msg_alloc(sizeof(struct ksmbd_rpc_command));
+ if (!msg)
+ return NULL;
+
+ msg->type = KSMBD_EVENT_RPC_REQUEST;
+ req = (struct ksmbd_rpc_command *)msg->payload;
+ req->handle = handle;
+ req->flags = ksmbd_session_rpc_method(sess, handle);
+ req->flags |= rpc_context_flags(sess);
+ req->flags |= KSMBD_RPC_READ_METHOD;
+ req->payload_sz = 0;
+
+ resp = ipc_msg_send_request(msg, req->handle);
+ ipc_msg_free(msg);
+ return resp;
+}
+
+struct ksmbd_rpc_command *ksmbd_rpc_ioctl(struct ksmbd_session *sess, int handle,
+ void *payload, size_t payload_sz)
+{
+ struct ksmbd_ipc_msg *msg;
+ struct ksmbd_rpc_command *req;
+ struct ksmbd_rpc_command *resp;
+
+ msg = ipc_msg_alloc(sizeof(struct ksmbd_rpc_command) + payload_sz + 1);
+ if (!msg)
+ return NULL;
+
+ msg->type = KSMBD_EVENT_RPC_REQUEST;
+ req = (struct ksmbd_rpc_command *)msg->payload;
+ req->handle = handle;
+ req->flags = ksmbd_session_rpc_method(sess, handle);
+ req->flags |= rpc_context_flags(sess);
+ req->flags |= KSMBD_RPC_IOCTL_METHOD;
+ req->payload_sz = payload_sz;
+ memcpy(req->payload, payload, payload_sz);
+
+ resp = ipc_msg_send_request(msg, req->handle);
+ ipc_msg_free(msg);
+ return resp;
+}
+
+struct ksmbd_rpc_command *ksmbd_rpc_rap(struct ksmbd_session *sess, void *payload,
+ size_t payload_sz)
+{
+ struct ksmbd_ipc_msg *msg;
+ struct ksmbd_rpc_command *req;
+ struct ksmbd_rpc_command *resp;
+
+ msg = ipc_msg_alloc(sizeof(struct ksmbd_rpc_command) + payload_sz + 1);
+ if (!msg)
+ return NULL;
+
+ msg->type = KSMBD_EVENT_RPC_REQUEST;
+ req = (struct ksmbd_rpc_command *)msg->payload;
+ req->handle = ksmbd_acquire_id(&ipc_ida);
+ req->flags = rpc_context_flags(sess);
+ req->flags |= KSMBD_RPC_RAP_METHOD;
+ req->payload_sz = payload_sz;
+ memcpy(req->payload, payload, payload_sz);
+
+ resp = ipc_msg_send_request(msg, req->handle);
+ ipc_msg_handle_free(req->handle);
+ ipc_msg_free(msg);
+ return resp;
+}
+
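+/*
+ * Periodic liveness check of the user-space daemon: reschedule while the
+ * IPC channel has been active within ipc_timeout, otherwise send a
+ * heartbeat request and, if that gets no response, mark the server for a
+ * reset.
+ */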
+static int __ipc_heartbeat(void)
+{
+ unsigned long delta;
+
+ if (!ksmbd_server_running())
+ return 0;
+
+ if (time_after(jiffies, server_conf.ipc_last_active)) {
+ delta = (jiffies - server_conf.ipc_last_active);
+ } else {
+ ipc_update_last_active();
+ schedule_delayed_work(&ipc_timer_work,
+ server_conf.ipc_timeout);
+ return 0;
+ }
+
+ if (delta < server_conf.ipc_timeout) {
+ schedule_delayed_work(&ipc_timer_work,
+ server_conf.ipc_timeout - delta);
+ return 0;
+ }
+
+ if (ksmbd_ipc_heartbeat_request() == 0) {
+ schedule_delayed_work(&ipc_timer_work,
+ server_conf.ipc_timeout);
+ return 0;
+ }
+
+ mutex_lock(&startup_lock);
+ WRITE_ONCE(server_conf.state, SERVER_STATE_RESETTING);
+ server_conf.ipc_last_active = 0;
+ ksmbd_tools_pid = 0;
+ pr_err("No IPC daemon response for %lus\n", delta / HZ);
+ mutex_unlock(&startup_lock);
+ return -EINVAL;
+}
+
+static void ipc_timer_heartbeat(struct work_struct *w)
+{
+ if (__ipc_heartbeat())
+ server_queue_ctrl_reset_work();
+}
+
+int ksmbd_ipc_id_alloc(void)
+{
+ return ksmbd_acquire_id(&ipc_ida);
+}
+
+void ksmbd_rpc_id_free(int handle)
+{
+ ksmbd_release_id(&ipc_ida, handle);
+}
+
+void ksmbd_ipc_release(void)
+{
+ cancel_delayed_work_sync(&ipc_timer_work);
+ genl_unregister_family(&ksmbd_genl_family);
+}
+
+void ksmbd_ipc_soft_reset(void)
+{
+ mutex_lock(&startup_lock);
+ ksmbd_tools_pid = 0;
+ cancel_delayed_work_sync(&ipc_timer_work);
+ mutex_unlock(&startup_lock);
+}
+
+int ksmbd_ipc_init(void)
+{
+ int ret = 0;
+
+ ksmbd_nl_init_fixup();
+ INIT_DELAYED_WORK(&ipc_timer_work, ipc_timer_heartbeat);
+
+ ret = genl_register_family(&ksmbd_genl_family);
+ if (ret) {
+ pr_err("Failed to register KSMBD netlink interface %d\n", ret);
+ cancel_delayed_work_sync(&ipc_timer_work);
+ }
+
+ return ret;
+}
diff --git a/fs/ksmbd/transport_ipc.h b/fs/ksmbd/transport_ipc.h
new file mode 100644
index 000000000000..9eacc895ffdb
--- /dev/null
+++ b/fs/ksmbd/transport_ipc.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#ifndef __KSMBD_TRANSPORT_IPC_H__
+#define __KSMBD_TRANSPORT_IPC_H__
+
+#include <linux/wait.h>
+
+#define KSMBD_IPC_MAX_PAYLOAD 4096
+
+struct ksmbd_login_response *
+ksmbd_ipc_login_request(const char *account);
+
+struct ksmbd_session;
+struct ksmbd_share_config;
+struct ksmbd_tree_connect;
+struct sockaddr;
+
+struct ksmbd_tree_connect_response *
+ksmbd_ipc_tree_connect_request(struct ksmbd_session *sess,
+ struct ksmbd_share_config *share,
+ struct ksmbd_tree_connect *tree_conn,
+ struct sockaddr *peer_addr);
+int ksmbd_ipc_tree_disconnect_request(unsigned long long session_id,
+ unsigned long long connect_id);
+int ksmbd_ipc_logout_request(const char *account);
+struct ksmbd_share_config_response *
+ksmbd_ipc_share_config_request(const char *name);
+struct ksmbd_spnego_authen_response *
+ksmbd_ipc_spnego_authen_request(const char *spnego_blob, int blob_len);
+int ksmbd_ipc_id_alloc(void);
+void ksmbd_rpc_id_free(int handle);
+struct ksmbd_rpc_command *ksmbd_rpc_open(struct ksmbd_session *sess, int handle);
+struct ksmbd_rpc_command *ksmbd_rpc_close(struct ksmbd_session *sess, int handle);
+struct ksmbd_rpc_command *ksmbd_rpc_write(struct ksmbd_session *sess, int handle,
+ void *payload, size_t payload_sz);
+struct ksmbd_rpc_command *ksmbd_rpc_read(struct ksmbd_session *sess, int handle);
+struct ksmbd_rpc_command *ksmbd_rpc_ioctl(struct ksmbd_session *sess, int handle,
+ void *payload, size_t payload_sz);
+struct ksmbd_rpc_command *ksmbd_rpc_rap(struct ksmbd_session *sess, void *payload,
+ size_t payload_sz);
+void ksmbd_ipc_release(void);
+void ksmbd_ipc_soft_reset(void);
+int ksmbd_ipc_init(void);
+#endif /* __KSMBD_TRANSPORT_IPC_H__ */
diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c
new file mode 100644
index 000000000000..58f530056ac0
--- /dev/null
+++ b/fs/ksmbd/transport_rdma.c
@@ -0,0 +1,2058 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2017, Microsoft Corporation.
+ * Copyright (C) 2018, LG Electronics.
+ *
+ * Author(s): Long Li <longli@microsoft.com>,
+ * Hyunchul Lee <hyc.lee@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ */
+
+#define SUBMOD_NAME "smb_direct"
+
+#include <linux/kthread.h>
+#include <linux/rwlock.h>
+#include <linux/list.h>
+#include <linux/mempool.h>
+#include <linux/highmem.h>
+#include <linux/scatterlist.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/rw.h>
+
+#include "glob.h"
+#include "connection.h"
+#include "smb_common.h"
+#include "smbstatus.h"
+#include "transport_rdma.h"
+
+#define SMB_DIRECT_PORT 5445
+
+#define SMB_DIRECT_VERSION_LE cpu_to_le16(0x0100)
+
+/* SMB_DIRECT negotiation timeout in seconds */
+#define SMB_DIRECT_NEGOTIATE_TIMEOUT 120
+
+#define SMB_DIRECT_MAX_SEND_SGES 8
+#define SMB_DIRECT_MAX_RECV_SGES 1
+
+/*
+ * Default maximum number of outstanding RDMA read/write operations on this
+ * connection. This value may be lowered during QP creation if the hardware
+ * imposes a smaller limit.
+ */
+#define SMB_DIRECT_CM_INITIATOR_DEPTH 8
+
+/* Maximum number of retries on data transfer operations */
+#define SMB_DIRECT_CM_RETRY 6
+/* No need to retry on Receiver Not Ready since SMB_DIRECT manages credits */
+#define SMB_DIRECT_CM_RNR_RETRY 0
+
+/*
+ * User-configurable initial values per SMB_DIRECT transport connection,
+ * as defined in [MS-SMBD] 3.1.1.1.
+ * These may change after SMB_DIRECT negotiation.
+ */
+/* The maximum number of credits the local peer grants to the remote peer */
+static int smb_direct_receive_credit_max = 255;
+
+/* The number of send credits the local peer requests from the remote peer */
+static int smb_direct_send_credit_target = 255;
+
+/* The maximum size of a single message that can be sent to the remote peer */
+static int smb_direct_max_send_size = 8192;
+
+/* The maximum fragmented upper-layer payload receive size supported */
+static int smb_direct_max_fragmented_recv_size = 1024 * 1024;
+
+/* The maximum size of a single message that can be received */
+static int smb_direct_max_receive_size = 8192;
+
+static int smb_direct_max_read_write_size = 1024 * 1024;
+
+static int smb_direct_max_outstanding_rw_ops = 8;
+
+static struct smb_direct_listener {
+ struct rdma_cm_id *cm_id;
+} smb_direct_listener;
+
+static struct workqueue_struct *smb_direct_wq;
+
+enum smb_direct_status {
+ SMB_DIRECT_CS_NEW = 0,
+ SMB_DIRECT_CS_CONNECTED,
+ SMB_DIRECT_CS_DISCONNECTING,
+ SMB_DIRECT_CS_DISCONNECTED,
+};
+
+struct smb_direct_transport {
+ struct ksmbd_transport transport;
+
+ enum smb_direct_status status;
+ bool full_packet_received;
+ wait_queue_head_t wait_status;
+
+ struct rdma_cm_id *cm_id;
+ struct ib_cq *send_cq;
+ struct ib_cq *recv_cq;
+ struct ib_pd *pd;
+ struct ib_qp *qp;
+
+ int max_send_size;
+ int max_recv_size;
+ int max_fragmented_send_size;
+ int max_fragmented_recv_size;
+ int max_rdma_rw_size;
+
+ spinlock_t reassembly_queue_lock;
+ struct list_head reassembly_queue;
+ int reassembly_data_length;
+ int reassembly_queue_length;
+ int first_entry_offset;
+ wait_queue_head_t wait_reassembly_queue;
+
+ spinlock_t receive_credit_lock;
+ int recv_credits;
+ int count_avail_recvmsg;
+ int recv_credit_max;
+ int recv_credit_target;
+
+ spinlock_t recvmsg_queue_lock;
+ struct list_head recvmsg_queue;
+
+ spinlock_t empty_recvmsg_queue_lock;
+ struct list_head empty_recvmsg_queue;
+
+ int send_credit_target;
+ atomic_t send_credits;
+ spinlock_t lock_new_recv_credits;
+ int new_recv_credits;
+ atomic_t rw_avail_ops;
+
+ wait_queue_head_t wait_send_credits;
+ wait_queue_head_t wait_rw_avail_ops;
+
+ mempool_t *sendmsg_mempool;
+ struct kmem_cache *sendmsg_cache;
+ mempool_t *recvmsg_mempool;
+ struct kmem_cache *recvmsg_cache;
+
+ wait_queue_head_t wait_send_payload_pending;
+ atomic_t send_payload_pending;
+ wait_queue_head_t wait_send_pending;
+ atomic_t send_pending;
+
+ struct delayed_work post_recv_credits_work;
+ struct work_struct send_immediate_work;
+ struct work_struct disconnect_work;
+
+ bool negotiation_requested;
+};
+
+#define KSMBD_TRANS(t) ((struct ksmbd_transport *)&((t)->transport))
+
+enum {
+ SMB_DIRECT_MSG_NEGOTIATE_REQ = 0,
+ SMB_DIRECT_MSG_DATA_TRANSFER
+};
+
+static struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops;
+
+struct smb_direct_send_ctx {
+ struct list_head msg_list;
+ int wr_cnt;
+ bool need_invalidate_rkey;
+ unsigned int remote_key;
+};
+
+struct smb_direct_sendmsg {
+ struct smb_direct_transport *transport;
+ struct ib_send_wr wr;
+ struct list_head list;
+ int num_sge;
+ struct ib_sge sge[SMB_DIRECT_MAX_SEND_SGES];
+ struct ib_cqe cqe;
+ u8 packet[];
+};
+
+struct smb_direct_recvmsg {
+ struct smb_direct_transport *transport;
+ struct list_head list;
+ int type;
+ struct ib_sge sge;
+ struct ib_cqe cqe;
+ bool first_segment;
+ u8 packet[];
+};
+
+struct smb_direct_rdma_rw_msg {
+ struct smb_direct_transport *t;
+ struct ib_cqe cqe;
+ struct completion *completion;
+ struct rdma_rw_ctx rw_ctx;
+ struct sg_table sgt;
+ struct scatterlist sg_list[0];
+};
+
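+/* Number of pages spanned by the buffer [buf, buf + size) */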
+static inline int get_buf_page_count(void *buf, int size)
+{
+ return DIV_ROUND_UP((uintptr_t)buf + size, PAGE_SIZE) -
+ (uintptr_t)buf / PAGE_SIZE;
+}
+
+static void smb_direct_destroy_pools(struct smb_direct_transport *transport);
+static void smb_direct_post_recv_credits(struct work_struct *work);
+static int smb_direct_post_send_data(struct smb_direct_transport *t,
+ struct smb_direct_send_ctx *send_ctx,
+ struct kvec *iov, int niov,
+ int remaining_data_length);
+
+static inline struct smb_direct_transport *
+smb_trans_direct_transfort(struct ksmbd_transport *t)
+{
+ return container_of(t, struct smb_direct_transport, transport);
+}
+
+static inline void *
+smb_direct_recvmsg_payload(struct smb_direct_recvmsg *recvmsg)
+{
+ return (void *)recvmsg->packet;
+}
+
+static inline bool is_receive_credit_post_required(int receive_credits,
+ int avail_recvmsg_count)
+{
+ return receive_credits <= (smb_direct_receive_credit_max >> 3) &&
+ avail_recvmsg_count >= (receive_credits >> 2);
+}
+
+static struct smb_direct_recvmsg *
+get_free_recvmsg(struct smb_direct_transport *t)
+{
+ struct smb_direct_recvmsg *recvmsg = NULL;
+
+ spin_lock(&t->recvmsg_queue_lock);
+ if (!list_empty(&t->recvmsg_queue)) {
+ recvmsg = list_first_entry(&t->recvmsg_queue,
+ struct smb_direct_recvmsg,
+ list);
+ list_del(&recvmsg->list);
+ }
+ spin_unlock(&t->recvmsg_queue_lock);
+ return recvmsg;
+}
+
+static void put_recvmsg(struct smb_direct_transport *t,
+ struct smb_direct_recvmsg *recvmsg)
+{
+ ib_dma_unmap_single(t->cm_id->device, recvmsg->sge.addr,
+ recvmsg->sge.length, DMA_FROM_DEVICE);
+
+ spin_lock(&t->recvmsg_queue_lock);
+ list_add(&recvmsg->list, &t->recvmsg_queue);
+ spin_unlock(&t->recvmsg_queue_lock);
+}
+
+static struct smb_direct_recvmsg *
+get_empty_recvmsg(struct smb_direct_transport *t)
+{
+ struct smb_direct_recvmsg *recvmsg = NULL;
+
+ spin_lock(&t->empty_recvmsg_queue_lock);
+ if (!list_empty(&t->empty_recvmsg_queue)) {
+ recvmsg = list_first_entry(&t->empty_recvmsg_queue,
+ struct smb_direct_recvmsg, list);
+ list_del(&recvmsg->list);
+ }
+ spin_unlock(&t->empty_recvmsg_queue_lock);
+ return recvmsg;
+}
+
+static void put_empty_recvmsg(struct smb_direct_transport *t,
+ struct smb_direct_recvmsg *recvmsg)
+{
+ ib_dma_unmap_single(t->cm_id->device, recvmsg->sge.addr,
+ recvmsg->sge.length, DMA_FROM_DEVICE);
+
+ spin_lock(&t->empty_recvmsg_queue_lock);
+ list_add_tail(&recvmsg->list, &t->empty_recvmsg_queue);
+ spin_unlock(&t->empty_recvmsg_queue_lock);
+}
+
+static void enqueue_reassembly(struct smb_direct_transport *t,
+ struct smb_direct_recvmsg *recvmsg,
+ int data_length)
+{
+ spin_lock(&t->reassembly_queue_lock);
+ list_add_tail(&recvmsg->list, &t->reassembly_queue);
+ t->reassembly_queue_length++;
+ /*
+ * Make sure reassembly_data_length is updated after the list and
+ * reassembly_queue_length are updated. On the dequeue side,
+ * reassembly_data_length is checked without a lock to determine
+ * whether reassembly_queue_length and the list are up to date.
+ */
+ virt_wmb();
+ t->reassembly_data_length += data_length;
+ spin_unlock(&t->reassembly_queue_lock);
+}
+
+static struct smb_direct_recvmsg *get_first_reassembly(struct smb_direct_transport *t)
+{
+ if (!list_empty(&t->reassembly_queue))
+ return list_first_entry(&t->reassembly_queue,
+ struct smb_direct_recvmsg, list);
+ else
+ return NULL;
+}
+
+static void smb_direct_disconnect_rdma_work(struct work_struct *work)
+{
+ struct smb_direct_transport *t =
+ container_of(work, struct smb_direct_transport,
+ disconnect_work);
+
+ if (t->status == SMB_DIRECT_CS_CONNECTED) {
+ t->status = SMB_DIRECT_CS_DISCONNECTING;
+ rdma_disconnect(t->cm_id);
+ }
+}
+
+static void
+smb_direct_disconnect_rdma_connection(struct smb_direct_transport *t)
+{
+ if (t->status == SMB_DIRECT_CS_CONNECTED)
+ queue_work(smb_direct_wq, &t->disconnect_work);
+}
+
+static void smb_direct_send_immediate_work(struct work_struct *work)
+{
+ struct smb_direct_transport *t = container_of(work,
+ struct smb_direct_transport, send_immediate_work);
+
+ if (t->status != SMB_DIRECT_CS_CONNECTED)
+ return;
+
+ smb_direct_post_send_data(t, NULL, NULL, 0, 0);
+}
+
+static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id)
+{
+ struct smb_direct_transport *t;
+ struct ksmbd_conn *conn;
+
+ t = kzalloc(sizeof(*t), GFP_KERNEL);
+ if (!t)
+ return NULL;
+
+ t->cm_id = cm_id;
+ cm_id->context = t;
+
+ t->status = SMB_DIRECT_CS_NEW;
+ init_waitqueue_head(&t->wait_status);
+
+ spin_lock_init(&t->reassembly_queue_lock);
+ INIT_LIST_HEAD(&t->reassembly_queue);
+ t->reassembly_data_length = 0;
+ t->reassembly_queue_length = 0;
+ init_waitqueue_head(&t->wait_reassembly_queue);
+ init_waitqueue_head(&t->wait_send_credits);
+ init_waitqueue_head(&t->wait_rw_avail_ops);
+
+ spin_lock_init(&t->receive_credit_lock);
+ spin_lock_init(&t->recvmsg_queue_lock);
+ INIT_LIST_HEAD(&t->recvmsg_queue);
+
+ spin_lock_init(&t->empty_recvmsg_queue_lock);
+ INIT_LIST_HEAD(&t->empty_recvmsg_queue);
+
+ init_waitqueue_head(&t->wait_send_payload_pending);
+ atomic_set(&t->send_payload_pending, 0);
+ init_waitqueue_head(&t->wait_send_pending);
+ atomic_set(&t->send_pending, 0);
+
+ spin_lock_init(&t->lock_new_recv_credits);
+
+ INIT_DELAYED_WORK(&t->post_recv_credits_work,
+ smb_direct_post_recv_credits);
+ INIT_WORK(&t->send_immediate_work, smb_direct_send_immediate_work);
+ INIT_WORK(&t->disconnect_work, smb_direct_disconnect_rdma_work);
+
+ conn = ksmbd_conn_alloc();
+ if (!conn)
+ goto err;
+ conn->transport = KSMBD_TRANS(t);
+ KSMBD_TRANS(t)->conn = conn;
+ KSMBD_TRANS(t)->ops = &ksmbd_smb_direct_transport_ops;
+ return t;
+err:
+ kfree(t);
+ return NULL;
+}
+
+static void free_transport(struct smb_direct_transport *t)
+{
+ struct smb_direct_recvmsg *recvmsg;
+
+ wake_up_interruptible(&t->wait_send_credits);
+
+ ksmbd_debug(RDMA, "wait for all send posted to IB to finish\n");
+ wait_event(t->wait_send_payload_pending,
+ atomic_read(&t->send_payload_pending) == 0);
+ wait_event(t->wait_send_pending,
+ atomic_read(&t->send_pending) == 0);
+
+ cancel_work_sync(&t->disconnect_work);
+ cancel_delayed_work_sync(&t->post_recv_credits_work);
+ cancel_work_sync(&t->send_immediate_work);
+
+ if (t->qp) {
+ ib_drain_qp(t->qp);
+ ib_destroy_qp(t->qp);
+ }
+
+ ksmbd_debug(RDMA, "drain the reassembly queue\n");
+ do {
+ spin_lock(&t->reassembly_queue_lock);
+ recvmsg = get_first_reassembly(t);
+ if (recvmsg) {
+ list_del(&recvmsg->list);
+ spin_unlock(&t->reassembly_queue_lock);
+ put_recvmsg(t, recvmsg);
+ } else {
+ spin_unlock(&t->reassembly_queue_lock);
+ }
+ } while (recvmsg);
+ t->reassembly_data_length = 0;
+
+ if (t->send_cq)
+ ib_free_cq(t->send_cq);
+ if (t->recv_cq)
+ ib_free_cq(t->recv_cq);
+ if (t->pd)
+ ib_dealloc_pd(t->pd);
+ if (t->cm_id)
+ rdma_destroy_id(t->cm_id);
+
+ smb_direct_destroy_pools(t);
+ ksmbd_conn_free(KSMBD_TRANS(t)->conn);
+ kfree(t);
+}
+
+static struct smb_direct_sendmsg *
+smb_direct_alloc_sendmsg(struct smb_direct_transport *t)
+{
+ struct smb_direct_sendmsg *msg;
+
+ msg = mempool_alloc(t->sendmsg_mempool, GFP_KERNEL);
+ if (!msg)
+ return ERR_PTR(-ENOMEM);
+ msg->transport = t;
+ INIT_LIST_HEAD(&msg->list);
+ msg->num_sge = 0;
+ return msg;
+}
+
+static void smb_direct_free_sendmsg(struct smb_direct_transport *t,
+ struct smb_direct_sendmsg *msg)
+{
+ int i;
+
+ if (msg->num_sge > 0) {
+ ib_dma_unmap_single(t->cm_id->device,
+ msg->sge[0].addr, msg->sge[0].length,
+ DMA_TO_DEVICE);
+ for (i = 1; i < msg->num_sge; i++)
+ ib_dma_unmap_page(t->cm_id->device,
+ msg->sge[i].addr, msg->sge[i].length,
+ DMA_TO_DEVICE);
+ }
+ mempool_free(msg, t->sendmsg_mempool);
+}
+
+static int smb_direct_check_recvmsg(struct smb_direct_recvmsg *recvmsg)
+{
+ switch (recvmsg->type) {
+ case SMB_DIRECT_MSG_DATA_TRANSFER: {
+ struct smb_direct_data_transfer *req =
+ (struct smb_direct_data_transfer *)recvmsg->packet;
+ struct smb2_hdr *hdr = (struct smb2_hdr *)(recvmsg->packet
+ + le32_to_cpu(req->data_offset) - 4);
+ ksmbd_debug(RDMA,
+ "CreditGranted: %u, CreditRequested: %u, DataLength: %u, RemainingDataLength: %u, SMB: %x, Command: %u\n",
+ le16_to_cpu(req->credits_granted),
+ le16_to_cpu(req->credits_requested),
+ req->data_length, req->remaining_data_length,
+ hdr->ProtocolId, hdr->Command);
+ break;
+ }
+ case SMB_DIRECT_MSG_NEGOTIATE_REQ: {
+ struct smb_direct_negotiate_req *req =
+ (struct smb_direct_negotiate_req *)recvmsg->packet;
+ ksmbd_debug(RDMA,
+ "MinVersion: %u, MaxVersion: %u, CreditRequested: %u, MaxSendSize: %u, MaxRecvSize: %u, MaxFragmentedSize: %u\n",
+ le16_to_cpu(req->min_version),
+ le16_to_cpu(req->max_version),
+ le16_to_cpu(req->credits_requested),
+ le32_to_cpu(req->preferred_send_size),
+ le32_to_cpu(req->max_receive_size),
+ le32_to_cpu(req->max_fragmented_size));
+ if (le16_to_cpu(req->min_version) > 0x0100 ||
+ le16_to_cpu(req->max_version) < 0x0100)
+ return -EOPNOTSUPP;
+ if (le16_to_cpu(req->credits_requested) <= 0 ||
+ le32_to_cpu(req->max_receive_size) <= 128 ||
+ le32_to_cpu(req->max_fragmented_size) <=
+ 128 * 1024)
+ return -ECONNABORTED;
+
+ break;
+ }
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct smb_direct_recvmsg *recvmsg;
+ struct smb_direct_transport *t;
+
+ recvmsg = container_of(wc->wr_cqe, struct smb_direct_recvmsg, cqe);
+ t = recvmsg->transport;
+
+ if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) {
+ if (wc->status != IB_WC_WR_FLUSH_ERR) {
+ pr_err("Recv error. status='%s (%d)' opcode=%d\n",
+ ib_wc_status_msg(wc->status), wc->status,
+ wc->opcode);
+ smb_direct_disconnect_rdma_connection(t);
+ }
+ put_empty_recvmsg(t, recvmsg);
+ return;
+ }
+
+ ksmbd_debug(RDMA, "Recv completed. status='%s (%d)', opcode=%d\n",
+ ib_wc_status_msg(wc->status), wc->status,
+ wc->opcode);
+
+ ib_dma_sync_single_for_cpu(wc->qp->device, recvmsg->sge.addr,
+ recvmsg->sge.length, DMA_FROM_DEVICE);
+
+ switch (recvmsg->type) {
+ case SMB_DIRECT_MSG_NEGOTIATE_REQ:
+ t->negotiation_requested = true;
+ t->full_packet_received = true;
+ wake_up_interruptible(&t->wait_status);
+ break;
+ case SMB_DIRECT_MSG_DATA_TRANSFER: {
+ struct smb_direct_data_transfer *data_transfer =
+ (struct smb_direct_data_transfer *)recvmsg->packet;
+ int data_length = le32_to_cpu(data_transfer->data_length);
+ int avail_recvmsg_count, receive_credits;
+
+ if (data_length) {
+ if (t->full_packet_received)
+ recvmsg->first_segment = true;
+
+ if (le32_to_cpu(data_transfer->remaining_data_length))
+ t->full_packet_received = false;
+ else
+ t->full_packet_received = true;
+
+ enqueue_reassembly(t, recvmsg, data_length);
+ wake_up_interruptible(&t->wait_reassembly_queue);
+
+ spin_lock(&t->receive_credit_lock);
+ receive_credits = --(t->recv_credits);
+ avail_recvmsg_count = t->count_avail_recvmsg;
+ spin_unlock(&t->receive_credit_lock);
+ } else {
+ put_empty_recvmsg(t, recvmsg);
+
+ spin_lock(&t->receive_credit_lock);
+ receive_credits = --(t->recv_credits);
+ avail_recvmsg_count = ++(t->count_avail_recvmsg);
+ spin_unlock(&t->receive_credit_lock);
+ }
+
+ t->recv_credit_target =
+ le16_to_cpu(data_transfer->credits_requested);
+ atomic_add(le16_to_cpu(data_transfer->credits_granted),
+ &t->send_credits);
+
+ if (le16_to_cpu(data_transfer->flags) &
+ SMB_DIRECT_RESPONSE_REQUESTED)
+ queue_work(smb_direct_wq, &t->send_immediate_work);
+
+ if (atomic_read(&t->send_credits) > 0)
+ wake_up_interruptible(&t->wait_send_credits);
+
+ if (is_receive_credit_post_required(receive_credits, avail_recvmsg_count))
+ mod_delayed_work(smb_direct_wq,
+ &t->post_recv_credits_work, 0);
+ break;
+ }
+ default:
+ break;
+ }
+}
+
+static int smb_direct_post_recv(struct smb_direct_transport *t,
+ struct smb_direct_recvmsg *recvmsg)
+{
+ struct ib_recv_wr wr;
+ int ret;
+
+ recvmsg->sge.addr = ib_dma_map_single(t->cm_id->device,
+ recvmsg->packet, t->max_recv_size,
+ DMA_FROM_DEVICE);
+ ret = ib_dma_mapping_error(t->cm_id->device, recvmsg->sge.addr);
+ if (ret)
+ return ret;
+ recvmsg->sge.length = t->max_recv_size;
+ recvmsg->sge.lkey = t->pd->local_dma_lkey;
+ recvmsg->cqe.done = recv_done;
+
+ wr.wr_cqe = &recvmsg->cqe;
+ wr.next = NULL;
+ wr.sg_list = &recvmsg->sge;
+ wr.num_sge = 1;
+
+ ret = ib_post_recv(t->qp, &wr, NULL);
+ if (ret) {
+ pr_err("Can't post recv: %d\n", ret);
+ ib_dma_unmap_single(t->cm_id->device,
+ recvmsg->sge.addr, recvmsg->sge.length,
+ DMA_FROM_DEVICE);
+ smb_direct_disconnect_rdma_connection(t);
+ return ret;
+ }
+ return ret;
+}
+
+static int smb_direct_read(struct ksmbd_transport *t, char *buf,
+ unsigned int size)
+{
+ struct smb_direct_recvmsg *recvmsg;
+ struct smb_direct_data_transfer *data_transfer;
+ int to_copy, to_read, data_read, offset;
+ u32 data_length, remaining_data_length, data_offset;
+ int rc;
+ struct smb_direct_transport *st = smb_trans_direct_transfort(t);
+
+again:
+ if (st->status != SMB_DIRECT_CS_CONNECTED) {
+ pr_err("disconnected\n");
+ return -ENOTCONN;
+ }
+
+ /*
+ * No need to hold the reassembly queue lock all the time as we are
+ * the only one reading from the front of the queue. The transport
+ * may add more entries to the back of the queue at the same time
+ */
+ if (st->reassembly_data_length >= size) {
+ int queue_length;
+ int queue_removed = 0;
+
+ /*
+ * Make sure reassembly_data_length is read before reading
+ * reassembly_queue_length and calling get_first_reassembly.
+ * This call is lock free as we never read the end of the
+ * queue, which is being updated in softirq context as more
+ * data is received.
+ */
+ virt_rmb();
+ queue_length = st->reassembly_queue_length;
+ data_read = 0;
+ to_read = size;
+ offset = st->first_entry_offset;
+ while (data_read < size) {
+ recvmsg = get_first_reassembly(st);
+ data_transfer = smb_direct_recvmsg_payload(recvmsg);
+ data_length = le32_to_cpu(data_transfer->data_length);
+ remaining_data_length =
+ le32_to_cpu(data_transfer->remaining_data_length);
+ data_offset = le32_to_cpu(data_transfer->data_offset);
+
+ /*
+ * The upper layer expects an RFC 1002 length at the
+ * beginning of the payload. Return it to indicate the
+ * total length of the packet. This minimizes the changes
+ * to the upper-layer packet processing logic and will
+ * eventually be removed when an intermediate transport
+ * layer is added.
+ */
+ if (recvmsg->first_segment && size == 4) {
+ unsigned int rfc1002_len =
+ data_length + remaining_data_length;
+ *((__be32 *)buf) = cpu_to_be32(rfc1002_len);
+ data_read = 4;
+ recvmsg->first_segment = false;
+ ksmbd_debug(RDMA,
+ "returning rfc1002 length %d\n",
+ rfc1002_len);
+ goto read_rfc1002_done;
+ }
+
+ to_copy = min_t(int, data_length - offset, to_read);
+ memcpy(buf + data_read, (char *)data_transfer + data_offset + offset,
+ to_copy);
+
+ /* move on to the next buffer? */
+ if (to_copy == data_length - offset) {
+ queue_length--;
+ /*
+ * No need to lock if we are not at the
+ * end of the queue
+ */
+ if (queue_length) {
+ list_del(&recvmsg->list);
+ } else {
+ spin_lock_irq(&st->reassembly_queue_lock);
+ list_del(&recvmsg->list);
+ spin_unlock_irq(&st->reassembly_queue_lock);
+ }
+ queue_removed++;
+ put_recvmsg(st, recvmsg);
+ offset = 0;
+ } else {
+ offset += to_copy;
+ }
+
+ to_read -= to_copy;
+ data_read += to_copy;
+ }
+
+ spin_lock_irq(&st->reassembly_queue_lock);
+ st->reassembly_data_length -= data_read;
+ st->reassembly_queue_length -= queue_removed;
+ spin_unlock_irq(&st->reassembly_queue_lock);
+
+ spin_lock(&st->receive_credit_lock);
+ st->count_avail_recvmsg += queue_removed;
+ if (is_receive_credit_post_required(st->recv_credits, st->count_avail_recvmsg)) {
+ spin_unlock(&st->receive_credit_lock);
+ mod_delayed_work(smb_direct_wq,
+ &st->post_recv_credits_work, 0);
+ } else {
+ spin_unlock(&st->receive_credit_lock);
+ }
+
+ st->first_entry_offset = offset;
+ ksmbd_debug(RDMA,
+ "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n",
+ data_read, st->reassembly_data_length,
+ st->first_entry_offset);
+read_rfc1002_done:
+ return data_read;
+ }
+
+ ksmbd_debug(RDMA, "wait_event on more data\n");
+ rc = wait_event_interruptible(st->wait_reassembly_queue,
+ st->reassembly_data_length >= size ||
+ st->status != SMB_DIRECT_CS_CONNECTED);
+ if (rc)
+ return -EINTR;
+
+ goto again;
+}
+
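+/*
+ * Top up posted receive buffers towards recv_credit_target: free buffers
+ * are reposted first, then buffers from the empty list, and every buffer
+ * posted here becomes a new receive credit to grant to the peer.
+ */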
+static void smb_direct_post_recv_credits(struct work_struct *work)
+{
+ struct smb_direct_transport *t = container_of(work,
+ struct smb_direct_transport, post_recv_credits_work.work);
+ struct smb_direct_recvmsg *recvmsg;
+ int receive_credits, credits = 0;
+ int ret;
+ int use_free = 1;
+
+ spin_lock(&t->receive_credit_lock);
+ receive_credits = t->recv_credits;
+ spin_unlock(&t->receive_credit_lock);
+
+ if (receive_credits < t->recv_credit_target) {
+ while (true) {
+ if (use_free)
+ recvmsg = get_free_recvmsg(t);
+ else
+ recvmsg = get_empty_recvmsg(t);
+ if (!recvmsg) {
+ if (use_free) {
+ use_free = 0;
+ continue;
+ } else {
+ break;
+ }
+ }
+
+ recvmsg->type = SMB_DIRECT_MSG_DATA_TRANSFER;
+ recvmsg->first_segment = false;
+
+ ret = smb_direct_post_recv(t, recvmsg);
+ if (ret) {
+ pr_err("Can't post recv: %d\n", ret);
+ put_recvmsg(t, recvmsg);
+ break;
+ }
+ credits++;
+ }
+ }
+
+ spin_lock(&t->receive_credit_lock);
+ t->recv_credits += credits;
+ t->count_avail_recvmsg -= credits;
+ spin_unlock(&t->receive_credit_lock);
+
+ spin_lock(&t->lock_new_recv_credits);
+ t->new_recv_credits += credits;
+ spin_unlock(&t->lock_new_recv_credits);
+
+ if (credits)
+ queue_work(smb_direct_wq, &t->send_immediate_work);
+}
+
+static void send_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct smb_direct_sendmsg *sendmsg, *sibling;
+ struct smb_direct_transport *t;
+ struct list_head *pos, *prev, *end;
+
+ sendmsg = container_of(wc->wr_cqe, struct smb_direct_sendmsg, cqe);
+ t = sendmsg->transport;
+
+ ksmbd_debug(RDMA, "Send completed. status='%s (%d)', opcode=%d\n",
+ ib_wc_status_msg(wc->status), wc->status,
+ wc->opcode);
+
+ if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
+ pr_err("Send error. status='%s (%d)', opcode=%d\n",
+ ib_wc_status_msg(wc->status), wc->status,
+ wc->opcode);
+ smb_direct_disconnect_rdma_connection(t);
+ }
+
+ if (sendmsg->num_sge > 1) {
+ if (atomic_dec_and_test(&t->send_payload_pending))
+ wake_up(&t->wait_send_payload_pending);
+ } else {
+ if (atomic_dec_and_test(&t->send_pending))
+ wake_up(&t->wait_send_pending);
+ }
+
+ /*
+ * Iterate the chained messages in reverse and free each one; the
+ * original list head is no longer valid.
+ */
+ for (pos = &sendmsg->list, prev = pos->prev, end = sendmsg->list.next;
+ prev != end; pos = prev, prev = prev->prev) {
+ sibling = container_of(pos, struct smb_direct_sendmsg, list);
+ smb_direct_free_sendmsg(t, sibling);
+ }
+
+ sibling = container_of(pos, struct smb_direct_sendmsg, list);
+ smb_direct_free_sendmsg(t, sibling);
+}
+
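+/*
+ * Take the receive credits posted since the last send so they can be
+ * granted to the peer in the next data transfer header.
+ */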
+static int manage_credits_prior_sending(struct smb_direct_transport *t)
+{
+ int new_credits;
+
+ spin_lock(&t->lock_new_recv_credits);
+ new_credits = t->new_recv_credits;
+ t->new_recv_credits = 0;
+ spin_unlock(&t->lock_new_recv_credits);
+
+ return new_credits;
+}
+
+static int smb_direct_post_send(struct smb_direct_transport *t,
+ struct ib_send_wr *wr)
+{
+ int ret;
+
+ if (wr->num_sge > 1)
+ atomic_inc(&t->send_payload_pending);
+ else
+ atomic_inc(&t->send_pending);
+
+ ret = ib_post_send(t->qp, wr, NULL);
+ if (ret) {
+ pr_err("failed to post send: %d\n", ret);
+ if (wr->num_sge > 1) {
+ if (atomic_dec_and_test(&t->send_payload_pending))
+ wake_up(&t->wait_send_payload_pending);
+ } else {
+ if (atomic_dec_and_test(&t->send_pending))
+ wake_up(&t->wait_send_pending);
+ }
+ smb_direct_disconnect_rdma_connection(t);
+ }
+ return ret;
+}
+
+static void smb_direct_send_ctx_init(struct smb_direct_transport *t,
+ struct smb_direct_send_ctx *send_ctx,
+ bool need_invalidate_rkey,
+ unsigned int remote_key)
+{
+ INIT_LIST_HEAD(&send_ctx->msg_list);
+ send_ctx->wr_cnt = 0;
+ send_ctx->need_invalidate_rkey = need_invalidate_rkey;
+ send_ctx->remote_key = remote_key;
+}
+
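+/*
+ * Post the send WRs accumulated in send_ctx as a single chained request:
+ * only the last WR is signaled, and it may carry a remote key
+ * invalidation.
+ */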
+static int smb_direct_flush_send_list(struct smb_direct_transport *t,
+ struct smb_direct_send_ctx *send_ctx,
+ bool is_last)
+{
+ struct smb_direct_sendmsg *first, *last;
+ int ret;
+
+ if (list_empty(&send_ctx->msg_list))
+ return 0;
+
+ first = list_first_entry(&send_ctx->msg_list,
+ struct smb_direct_sendmsg,
+ list);
+ last = list_last_entry(&send_ctx->msg_list,
+ struct smb_direct_sendmsg,
+ list);
+
+ last->wr.send_flags = IB_SEND_SIGNALED;
+ last->wr.wr_cqe = &last->cqe;
+ if (is_last && send_ctx->need_invalidate_rkey) {
+ last->wr.opcode = IB_WR_SEND_WITH_INV;
+ last->wr.ex.invalidate_rkey = send_ctx->remote_key;
+ }
+
+ ret = smb_direct_post_send(t, &first->wr);
+ if (!ret) {
+ smb_direct_send_ctx_init(t, send_ctx,
+ send_ctx->need_invalidate_rkey,
+ send_ctx->remote_key);
+ } else {
+ atomic_add(send_ctx->wr_cnt, &t->send_credits);
+ wake_up(&t->wait_send_credits);
+ list_for_each_entry_safe(first, last, &send_ctx->msg_list,
+ list) {
+ smb_direct_free_sendmsg(t, first);
+ }
+ }
+ return ret;
+}
+
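+/*
+ * Reserve one credit by decrementing the counter; if none is available,
+ * undo the decrement and sleep until credits are replenished or the
+ * connection drops.
+ */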
+static int wait_for_credits(struct smb_direct_transport *t,
+ wait_queue_head_t *waitq, atomic_t *credits)
+{
+ int ret;
+
+ do {
+ if (atomic_dec_return(credits) >= 0)
+ return 0;
+
+ atomic_inc(credits);
+ ret = wait_event_interruptible(*waitq,
+ atomic_read(credits) > 0 ||
+ t->status != SMB_DIRECT_CS_CONNECTED);
+
+ if (t->status != SMB_DIRECT_CS_CONNECTED)
+ return -ENOTCONN;
+ else if (ret < 0)
+ return ret;
+ } while (true);
+}
+
+static int wait_for_send_credits(struct smb_direct_transport *t,
+ struct smb_direct_send_ctx *send_ctx)
+{
+ int ret;
+
+ if (send_ctx &&
+ (send_ctx->wr_cnt >= 16 || atomic_read(&t->send_credits) <= 1)) {
+ ret = smb_direct_flush_send_list(t, send_ctx, false);
+ if (ret)
+ return ret;
+ }
+
+ return wait_for_credits(t, &t->wait_send_credits, &t->send_credits);
+}
+
+static int smb_direct_create_header(struct smb_direct_transport *t,
+ int size, int remaining_data_length,
+ struct smb_direct_sendmsg **sendmsg_out)
+{
+ struct smb_direct_sendmsg *sendmsg;
+ struct smb_direct_data_transfer *packet;
+ int header_length;
+ int ret;
+
+ sendmsg = smb_direct_alloc_sendmsg(t);
+ if (IS_ERR(sendmsg))
+ return PTR_ERR(sendmsg);
+
+ /* Fill in the packet header */
+ packet = (struct smb_direct_data_transfer *)sendmsg->packet;
+ packet->credits_requested = cpu_to_le16(t->send_credit_target);
+ packet->credits_granted = cpu_to_le16(manage_credits_prior_sending(t));
+
+ packet->flags = 0;
+ packet->reserved = 0;
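+ /* The payload, if any, starts right after the 24-byte transfer header */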
+ if (!size)
+ packet->data_offset = 0;
+ else
+ packet->data_offset = cpu_to_le32(24);
+ packet->data_length = cpu_to_le32(size);
+ packet->remaining_data_length = cpu_to_le32(remaining_data_length);
+ packet->padding = 0;
+
+ ksmbd_debug(RDMA,
+ "credits_requested=%d credits_granted=%d data_offset=%d data_length=%d remaining_data_length=%d\n",
+ le16_to_cpu(packet->credits_requested),
+ le16_to_cpu(packet->credits_granted),
+ le32_to_cpu(packet->data_offset),
+ le32_to_cpu(packet->data_length),
+ le32_to_cpu(packet->remaining_data_length));
+
+ /* Map the packet to DMA */
+ header_length = sizeof(struct smb_direct_data_transfer);
+ /* If this is a packet without payload, don't send padding */
+ if (!size)
+ header_length =
+ offsetof(struct smb_direct_data_transfer, padding);
+
+ sendmsg->sge[0].addr = ib_dma_map_single(t->cm_id->device,
+ (void *)packet,
+ header_length,
+ DMA_TO_DEVICE);
+ ret = ib_dma_mapping_error(t->cm_id->device, sendmsg->sge[0].addr);
+ if (ret) {
+ smb_direct_free_sendmsg(t, sendmsg);
+ return ret;
+ }
+
+ sendmsg->num_sge = 1;
+ sendmsg->sge[0].length = header_length;
+ sendmsg->sge[0].lkey = t->pd->local_dma_lkey;
+
+ *sendmsg_out = sendmsg;
+ return 0;
+}
+
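+/*
+ * Split a linear or vmalloc'ed kernel buffer into per-page scatterlist
+ * entries. Returns the number of entries used, or -EINVAL if nentries is
+ * too small for the buffer.
+ */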
+static int get_sg_list(void *buf, int size, struct scatterlist *sg_list, int nentries)
+{
+ bool high = is_vmalloc_addr(buf);
+ struct page *page;
+ int offset, len;
+ int i = 0;
+
+ if (nentries < get_buf_page_count(buf, size))
+ return -EINVAL;
+
+ offset = offset_in_page(buf);
+ buf -= offset;
+ while (size > 0) {
+ len = min_t(int, PAGE_SIZE - offset, size);
+ if (high)
+ page = vmalloc_to_page(buf);
+ else
+ page = kmap_to_page(buf);
+
+ if (!sg_list)
+ return -EINVAL;
+ sg_set_page(sg_list, page, len, offset);
+ sg_list = sg_next(sg_list);
+
+ buf += PAGE_SIZE;
+ size -= len;
+ offset = 0;
+ i++;
+ }
+ return i;
+}
+
+static int get_mapped_sg_list(struct ib_device *device, void *buf, int size,
+ struct scatterlist *sg_list, int nentries,
+ enum dma_data_direction dir)
+{
+ int npages;
+
+ npages = get_sg_list(buf, size, sg_list, nentries);
+ if (npages <= 0)
+ return -EINVAL;
+ return ib_dma_map_sg(device, sg_list, npages, dir);
+}
+
+static int post_sendmsg(struct smb_direct_transport *t,
+ struct smb_direct_send_ctx *send_ctx,
+ struct smb_direct_sendmsg *msg)
+{
+ int i;
+
+ for (i = 0; i < msg->num_sge; i++)
+ ib_dma_sync_single_for_device(t->cm_id->device,
+ msg->sge[i].addr, msg->sge[i].length,
+ DMA_TO_DEVICE);
+
+ msg->cqe.done = send_done;
+ msg->wr.opcode = IB_WR_SEND;
+ msg->wr.sg_list = &msg->sge[0];
+ msg->wr.num_sge = msg->num_sge;
+ msg->wr.next = NULL;
+
+ if (send_ctx) {
+ msg->wr.wr_cqe = NULL;
+ msg->wr.send_flags = 0;
+ if (!list_empty(&send_ctx->msg_list)) {
+ struct smb_direct_sendmsg *last;
+
+ last = list_last_entry(&send_ctx->msg_list,
+ struct smb_direct_sendmsg,
+ list);
+ last->wr.next = &msg->wr;
+ }
+ list_add_tail(&msg->list, &send_ctx->msg_list);
+ send_ctx->wr_cnt++;
+ return 0;
+ }
+
+ msg->wr.wr_cqe = &msg->cqe;
+ msg->wr.send_flags = IB_SEND_SIGNALED;
+ return smb_direct_post_send(t, &msg->wr);
+}
+
+static int smb_direct_post_send_data(struct smb_direct_transport *t,
+ struct smb_direct_send_ctx *send_ctx,
+ struct kvec *iov, int niov,
+ int remaining_data_length)
+{
+ int i, j, ret;
+ struct smb_direct_sendmsg *msg;
+ int data_length;
+ struct scatterlist sg[SMB_DIRECT_MAX_SEND_SGES - 1];
+
+ ret = wait_for_send_credits(t, send_ctx);
+ if (ret)
+ return ret;
+
+ data_length = 0;
+ for (i = 0; i < niov; i++)
+ data_length += iov[i].iov_len;
+
+ ret = smb_direct_create_header(t, data_length, remaining_data_length,
+ &msg);
+ if (ret) {
+ atomic_inc(&t->send_credits);
+ return ret;
+ }
+
+ for (i = 0; i < niov; i++) {
+ struct ib_sge *sge;
+ int sg_cnt;
+
+ sg_init_table(sg, SMB_DIRECT_MAX_SEND_SGES - 1);
+ sg_cnt = get_mapped_sg_list(t->cm_id->device,
+ iov[i].iov_base, iov[i].iov_len,
+ sg, SMB_DIRECT_MAX_SEND_SGES - 1,
+ DMA_TO_DEVICE);
+ if (sg_cnt <= 0) {
+ pr_err("failed to map buffer\n");
+ ret = -ENOMEM;
+ goto err;
+ } else if (sg_cnt + msg->num_sge > SMB_DIRECT_MAX_SEND_SGES - 1) {
+ pr_err("buffer not fitted into sges\n");
+ ret = -E2BIG;
+ ib_dma_unmap_sg(t->cm_id->device, sg, sg_cnt,
+ DMA_TO_DEVICE);
+ goto err;
+ }
+
+ for (j = 0; j < sg_cnt; j++) {
+ sge = &msg->sge[msg->num_sge];
+ sge->addr = sg_dma_address(&sg[j]);
+ sge->length = sg_dma_len(&sg[j]);
+ sge->lkey = t->pd->local_dma_lkey;
+ msg->num_sge++;
+ }
+ }
+
+ ret = post_sendmsg(t, send_ctx, msg);
+ if (ret)
+ goto err;
+ return 0;
+err:
+ smb_direct_free_sendmsg(t, msg);
+ atomic_inc(&t->send_credits);
+ return ret;
+}
+
+static int smb_direct_writev(struct ksmbd_transport *t,
+ struct kvec *iov, int niovs, int buflen,
+ bool need_invalidate, unsigned int remote_key)
+{
+ struct smb_direct_transport *st = smb_trans_direct_transfort(t);
+ int remaining_data_length;
+ int start, i, j;
+ int max_iov_size = st->max_send_size -
+ sizeof(struct smb_direct_data_transfer);
+ int ret;
+ struct kvec vec;
+ struct smb_direct_send_ctx send_ctx;
+
+ if (st->status != SMB_DIRECT_CS_CONNECTED)
+ return -ENOTCONN;
+
+ //FIXME: skip RFC1002 header..
+ buflen -= 4;
+ iov[0].iov_base += 4;
+ iov[0].iov_len -= 4;
+
+ remaining_data_length = buflen;
+ ksmbd_debug(RDMA, "Sending smb (RDMA): smb_len=%u\n", buflen);
+
+ smb_direct_send_ctx_init(st, &send_ctx, need_invalidate, remote_key);
+ start = i = 0;
+ buflen = 0;
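+ /*
+ * Pack consecutive iovecs into SMB Direct messages carrying at most
+ * max_iov_size bytes of payload each; an iovec larger than that is
+ * split into max_iov_size chunks.
+ */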
+ while (true) {
+ buflen += iov[i].iov_len;
+ if (buflen > max_iov_size) {
+ if (i > start) {
+ remaining_data_length -=
+ (buflen - iov[i].iov_len);
+ ret = smb_direct_post_send_data(st, &send_ctx,
+ &iov[start], i - start,
+ remaining_data_length);
+ if (ret)
+ goto done;
+ } else {
+ /* iov[start] is too big, break it */
+ int nvec = (buflen + max_iov_size - 1) /
+ max_iov_size;
+
+ for (j = 0; j < nvec; j++) {
+ vec.iov_base =
+ (char *)iov[start].iov_base +
+ j * max_iov_size;
+ vec.iov_len =
+ min_t(int, max_iov_size,
+ buflen - max_iov_size * j);
+ remaining_data_length -= vec.iov_len;
+ ret = smb_direct_post_send_data(st, &send_ctx, &vec, 1,
+ remaining_data_length);
+ if (ret)
+ goto done;
+ }
+ i++;
+ if (i == niovs)
+ break;
+ }
+ start = i;
+ buflen = 0;
+ } else {
+ i++;
+ if (i == niovs) {
+ /* send out all remaining vecs */
+ remaining_data_length -= buflen;
+ ret = smb_direct_post_send_data(st, &send_ctx,
+ &iov[start], i - start,
+ remaining_data_length);
+ if (ret)
+ goto done;
+ break;
+ }
+ }
+ }
+
+done:
+ ret = smb_direct_flush_send_list(st, &send_ctx, true);
+
+ /*
+ * As an optimization, we don't wait for individual I/Os to finish
+ * before sending the next one. Send them all and wait for the
+ * pending send count to reach 0, which means all the I/Os have
+ * gone out and we are good to return.
+ */
+
+ wait_event(st->wait_send_payload_pending,
+ atomic_read(&st->send_payload_pending) == 0);
+ return ret;
+}
+
+static void read_write_done(struct ib_cq *cq, struct ib_wc *wc,
+ enum dma_data_direction dir)
+{
+ struct smb_direct_rdma_rw_msg *msg = container_of(wc->wr_cqe,
+ struct smb_direct_rdma_rw_msg, cqe);
+ struct smb_direct_transport *t = msg->t;
+
+ if (wc->status != IB_WC_SUCCESS) {
+ pr_err("read/write error. opcode = %d, status = %s(%d)\n",
+ wc->opcode, ib_wc_status_msg(wc->status), wc->status);
+ smb_direct_disconnect_rdma_connection(t);
+ }
+
+ if (atomic_inc_return(&t->rw_avail_ops) > 0)
+ wake_up(&t->wait_rw_avail_ops);
+
+ rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port,
+ msg->sg_list, msg->sgt.nents, dir);
+ sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
+ complete(msg->completion);
+ kfree(msg);
+}
+
+static void read_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ read_write_done(cq, wc, DMA_FROM_DEVICE);
+}
+
+static void write_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ read_write_done(cq, wc, DMA_TO_DEVICE);
+}
+
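+/*
+ * Perform a single RDMA read or write against the peer's advertised
+ * buffer: map the local buffer into a scatterlist, set up an rdma_rw
+ * context for the remote key/offset, post it and wait for completion.
+ */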
+static int smb_direct_rdma_xmit(struct smb_direct_transport *t, void *buf,
+ int buf_len, u32 remote_key, u64 remote_offset,
+ u32 remote_len, bool is_read)
+{
+ struct smb_direct_rdma_rw_msg *msg;
+ int ret;
+ DECLARE_COMPLETION_ONSTACK(completion);
+ struct ib_send_wr *first_wr = NULL;
+
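+	/* Take one R/W credit; it is returned by the completion handler or on the error paths below. */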
+ ret = wait_for_credits(t, &t->wait_rw_avail_ops, &t->rw_avail_ops);
+ if (ret < 0)
+ return ret;
+
+ /* TODO: mempool */
+ msg = kmalloc(offsetof(struct smb_direct_rdma_rw_msg, sg_list) +
+ sizeof(struct scatterlist) * SG_CHUNK_SIZE, GFP_KERNEL);
+ if (!msg) {
+ atomic_inc(&t->rw_avail_ops);
+ return -ENOMEM;
+ }
+
+ msg->sgt.sgl = &msg->sg_list[0];
+ ret = sg_alloc_table_chained(&msg->sgt,
+ get_buf_page_count(buf, buf_len),
+ msg->sg_list, SG_CHUNK_SIZE);
+ if (ret) {
+ atomic_inc(&t->rw_avail_ops);
+ kfree(msg);
+ return -ENOMEM;
+ }
+
+ ret = get_sg_list(buf, buf_len, msg->sgt.sgl, msg->sgt.orig_nents);
+ if (ret <= 0) {
+ pr_err("failed to get pages\n");
+ goto err;
+ }
+
+ ret = rdma_rw_ctx_init(&msg->rw_ctx, t->qp, t->qp->port,
+ msg->sg_list, get_buf_page_count(buf, buf_len),
+ 0, remote_offset, remote_key,
+ is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
+ if (ret < 0) {
+ pr_err("failed to init rdma_rw_ctx: %d\n", ret);
+ goto err;
+ }
+
+ msg->t = t;
+ msg->cqe.done = is_read ? read_done : write_done;
+ msg->completion = &completion;
+ first_wr = rdma_rw_ctx_wrs(&msg->rw_ctx, t->qp, t->qp->port,
+ &msg->cqe, NULL);
+
+ ret = ib_post_send(t->qp, first_wr, NULL);
+ if (ret) {
+ pr_err("failed to post send wr: %d\n", ret);
+ goto err;
+ }
+
+ wait_for_completion(&completion);
+ return 0;
+
+err:
+ atomic_inc(&t->rw_avail_ops);
+ if (first_wr)
+ rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port,
+ msg->sg_list, msg->sgt.nents,
+ is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
+ sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
+ kfree(msg);
+ return ret;
+}
+
+static int smb_direct_rdma_write(struct ksmbd_transport *t, void *buf,
+ unsigned int buflen, u32 remote_key,
+ u64 remote_offset, u32 remote_len)
+{
+ return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen,
+ remote_key, remote_offset,
+ remote_len, false);
+}
+
+static int smb_direct_rdma_read(struct ksmbd_transport *t, void *buf,
+ unsigned int buflen, u32 remote_key,
+ u64 remote_offset, u32 remote_len)
+{
+ return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen,
+ remote_key, remote_offset,
+ remote_len, true);
+}
+
+static void smb_direct_disconnect(struct ksmbd_transport *t)
+{
+ struct smb_direct_transport *st = smb_trans_direct_transfort(t);
+
+ ksmbd_debug(RDMA, "Disconnecting cm_id=%p\n", st->cm_id);
+
+ smb_direct_disconnect_rdma_work(&st->disconnect_work);
+ wait_event_interruptible(st->wait_status,
+ st->status == SMB_DIRECT_CS_DISCONNECTED);
+ free_transport(st);
+}
+
+static int smb_direct_cm_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event)
+{
+ struct smb_direct_transport *t = cm_id->context;
+
+ ksmbd_debug(RDMA, "RDMA CM event. cm_id=%p event=%s (%d)\n",
+ cm_id, rdma_event_msg(event->event), event->event);
+
+ switch (event->event) {
+ case RDMA_CM_EVENT_ESTABLISHED: {
+ t->status = SMB_DIRECT_CS_CONNECTED;
+ wake_up_interruptible(&t->wait_status);
+ break;
+ }
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+ case RDMA_CM_EVENT_DISCONNECTED: {
+ t->status = SMB_DIRECT_CS_DISCONNECTED;
+ wake_up_interruptible(&t->wait_status);
+ wake_up_interruptible(&t->wait_reassembly_queue);
+ wake_up(&t->wait_send_credits);
+ break;
+ }
+ case RDMA_CM_EVENT_CONNECT_ERROR: {
+ t->status = SMB_DIRECT_CS_DISCONNECTED;
+ wake_up_interruptible(&t->wait_status);
+ break;
+ }
+ default:
+ pr_err("Unexpected RDMA CM event. cm_id=%p, event=%s (%d)\n",
+ cm_id, rdma_event_msg(event->event),
+ event->event);
+ break;
+ }
+ return 0;
+}
+
+static void smb_direct_qpair_handler(struct ib_event *event, void *context)
+{
+ struct smb_direct_transport *t = context;
+
+ ksmbd_debug(RDMA, "Received QP event. cm_id=%p, event=%s (%d)\n",
+ t->cm_id, ib_event_msg(event->event), event->event);
+
+ switch (event->event) {
+ case IB_EVENT_CQ_ERR:
+ case IB_EVENT_QP_FATAL:
+ smb_direct_disconnect_rdma_connection(t);
+ break;
+ default:
+ break;
+ }
+}
+
+static int smb_direct_send_negotiate_response(struct smb_direct_transport *t,
+ int failed)
+{
+ struct smb_direct_sendmsg *sendmsg;
+ struct smb_direct_negotiate_resp *resp;
+ int ret;
+
+ sendmsg = smb_direct_alloc_sendmsg(t);
+ if (IS_ERR(sendmsg))
+ return -ENOMEM;
+
+ resp = (struct smb_direct_negotiate_resp *)sendmsg->packet;
+ if (failed) {
+ memset(resp, 0, sizeof(*resp));
+ resp->min_version = cpu_to_le16(0x0100);
+ resp->max_version = cpu_to_le16(0x0100);
+ resp->status = STATUS_NOT_SUPPORTED;
+ } else {
+ resp->status = STATUS_SUCCESS;
+ resp->min_version = SMB_DIRECT_VERSION_LE;
+ resp->max_version = SMB_DIRECT_VERSION_LE;
+ resp->negotiated_version = SMB_DIRECT_VERSION_LE;
+ resp->reserved = 0;
+ resp->credits_requested =
+ cpu_to_le16(t->send_credit_target);
+ resp->credits_granted = cpu_to_le16(manage_credits_prior_sending(t));
+ resp->max_readwrite_size = cpu_to_le32(t->max_rdma_rw_size);
+ resp->preferred_send_size = cpu_to_le32(t->max_send_size);
+ resp->max_receive_size = cpu_to_le32(t->max_recv_size);
+ resp->max_fragmented_size =
+ cpu_to_le32(t->max_fragmented_recv_size);
+ }
+
+ sendmsg->sge[0].addr = ib_dma_map_single(t->cm_id->device,
+ (void *)resp, sizeof(*resp),
+ DMA_TO_DEVICE);
+ ret = ib_dma_mapping_error(t->cm_id->device, sendmsg->sge[0].addr);
+ if (ret) {
+ smb_direct_free_sendmsg(t, sendmsg);
+ return ret;
+ }
+
+ sendmsg->num_sge = 1;
+ sendmsg->sge[0].length = sizeof(*resp);
+ sendmsg->sge[0].lkey = t->pd->local_dma_lkey;
+
+ ret = post_sendmsg(t, NULL, sendmsg);
+ if (ret) {
+ smb_direct_free_sendmsg(t, sendmsg);
+ return ret;
+ }
+
+ wait_event(t->wait_send_pending,
+ atomic_read(&t->send_pending) == 0);
+ return 0;
+}
+
+static int smb_direct_accept_client(struct smb_direct_transport *t)
+{
+ struct rdma_conn_param conn_param;
+ struct ib_port_immutable port_immutable;
+ u32 ird_ord_hdr[2];
+ int ret;
+
+ memset(&conn_param, 0, sizeof(conn_param));
+ conn_param.initiator_depth = min_t(u8, t->cm_id->device->attrs.max_qp_rd_atom,
+ SMB_DIRECT_CM_INITIATOR_DEPTH);
+ conn_param.responder_resources = 0;
+
+ t->cm_id->device->ops.get_port_immutable(t->cm_id->device,
+ t->cm_id->port_num,
+ &port_immutable);
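+	/*
+	 * For iWARP ports, advertise the IRD/ORD values to the peer in the
+	 * connection private data.
+	 */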
+ if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) {
+ ird_ord_hdr[0] = conn_param.responder_resources;
+ ird_ord_hdr[1] = 1;
+ conn_param.private_data = ird_ord_hdr;
+ conn_param.private_data_len = sizeof(ird_ord_hdr);
+ } else {
+ conn_param.private_data = NULL;
+ conn_param.private_data_len = 0;
+ }
+ conn_param.retry_count = SMB_DIRECT_CM_RETRY;
+ conn_param.rnr_retry_count = SMB_DIRECT_CM_RNR_RETRY;
+ conn_param.flow_control = 0;
+
+ ret = rdma_accept(t->cm_id, &conn_param);
+ if (ret) {
+ pr_err("error at rdma_accept: %d\n", ret);
+ return ret;
+ }
+
+ wait_event_interruptible(t->wait_status,
+ t->status != SMB_DIRECT_CS_NEW);
+ if (t->status != SMB_DIRECT_CS_CONNECTED)
+ return -ENOTCONN;
+ return 0;
+}
+
+static int smb_direct_negotiate(struct smb_direct_transport *t)
+{
+ int ret;
+ struct smb_direct_recvmsg *recvmsg;
+ struct smb_direct_negotiate_req *req;
+
+ recvmsg = get_free_recvmsg(t);
+ if (!recvmsg)
+ return -ENOMEM;
+ recvmsg->type = SMB_DIRECT_MSG_NEGOTIATE_REQ;
+
+ ret = smb_direct_post_recv(t, recvmsg);
+ if (ret) {
+ pr_err("Can't post recv: %d\n", ret);
+ goto out;
+ }
+
+ t->negotiation_requested = false;
+ ret = smb_direct_accept_client(t);
+ if (ret) {
+ pr_err("Can't accept client\n");
+ goto out;
+ }
+
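+	/* Post the remaining receive buffers (credits) before waiting for the client's negotiate request. */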
+ smb_direct_post_recv_credits(&t->post_recv_credits_work.work);
+
+ ksmbd_debug(RDMA, "Waiting for SMB_DIRECT negotiate request\n");
+ ret = wait_event_interruptible_timeout(t->wait_status,
+ t->negotiation_requested ||
+ t->status == SMB_DIRECT_CS_DISCONNECTED,
+ SMB_DIRECT_NEGOTIATE_TIMEOUT * HZ);
+ if (ret <= 0 || t->status == SMB_DIRECT_CS_DISCONNECTED) {
+ ret = ret < 0 ? ret : -ETIMEDOUT;
+ goto out;
+ }
+
+ ret = smb_direct_check_recvmsg(recvmsg);
+ if (ret == -ECONNABORTED)
+ goto out;
+
+ req = (struct smb_direct_negotiate_req *)recvmsg->packet;
+ t->max_recv_size = min_t(int, t->max_recv_size,
+ le32_to_cpu(req->preferred_send_size));
+ t->max_send_size = min_t(int, t->max_send_size,
+ le32_to_cpu(req->max_receive_size));
+ t->max_fragmented_send_size =
+ le32_to_cpu(req->max_fragmented_size);
+
+ ret = smb_direct_send_negotiate_response(t, ret);
+out:
+ if (recvmsg)
+ put_recvmsg(t, recvmsg);
+ return ret;
+}
+
+static int smb_direct_init_params(struct smb_direct_transport *t,
+ struct ib_qp_cap *cap)
+{
+ struct ib_device *device = t->cm_id->device;
+ int max_send_sges, max_pages, max_rw_wrs, max_send_wrs;
+
+	/* Two extra SGEs are needed: one because the SMB_DIRECT header is
+	 * mapped separately, and one because the send buffer may not be
+	 * page aligned.
+	 */
+ t->max_send_size = smb_direct_max_send_size;
+ max_send_sges = DIV_ROUND_UP(t->max_send_size, PAGE_SIZE) + 2;
+ if (max_send_sges > SMB_DIRECT_MAX_SEND_SGES) {
+ pr_err("max_send_size %d is too large\n", t->max_send_size);
+ return -EINVAL;
+ }
+
+	/*
+	 * Allow smb_direct_max_outstanding_rw_ops in-flight RDMA
+	 * reads/writes. The HCA guarantees at least max_send_sge SGEs for
+	 * an RDMA read/write work request, and if memory registration is
+	 * used, we also need reg_mr and local_inv WRs for each read/write.
+	 */
+ t->max_rdma_rw_size = smb_direct_max_read_write_size;
+ max_pages = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1;
+ max_rw_wrs = DIV_ROUND_UP(max_pages, SMB_DIRECT_MAX_SEND_SGES);
+ max_rw_wrs += rdma_rw_mr_factor(device, t->cm_id->port_num,
+ max_pages) * 2;
+ max_rw_wrs *= smb_direct_max_outstanding_rw_ops;
+
+ max_send_wrs = smb_direct_send_credit_target + max_rw_wrs;
+ if (max_send_wrs > device->attrs.max_cqe ||
+ max_send_wrs > device->attrs.max_qp_wr) {
+ pr_err("consider lowering send_credit_target = %d, or max_outstanding_rw_ops = %d\n",
+ smb_direct_send_credit_target,
+ smb_direct_max_outstanding_rw_ops);
+ pr_err("Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
+ device->attrs.max_cqe, device->attrs.max_qp_wr);
+ return -EINVAL;
+ }
+
+ if (smb_direct_receive_credit_max > device->attrs.max_cqe ||
+ smb_direct_receive_credit_max > device->attrs.max_qp_wr) {
+ pr_err("consider lowering receive_credit_max = %d\n",
+ smb_direct_receive_credit_max);
+		pr_err("Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
+ device->attrs.max_cqe, device->attrs.max_qp_wr);
+ return -EINVAL;
+ }
+
+ if (device->attrs.max_send_sge < SMB_DIRECT_MAX_SEND_SGES) {
+ pr_err("warning: device max_send_sge = %d too small\n",
+ device->attrs.max_send_sge);
+ return -EINVAL;
+ }
+ if (device->attrs.max_recv_sge < SMB_DIRECT_MAX_RECV_SGES) {
+ pr_err("warning: device max_recv_sge = %d too small\n",
+ device->attrs.max_recv_sge);
+ return -EINVAL;
+ }
+
+ t->recv_credits = 0;
+ t->count_avail_recvmsg = 0;
+
+ t->recv_credit_max = smb_direct_receive_credit_max;
+ t->recv_credit_target = 10;
+ t->new_recv_credits = 0;
+
+ t->send_credit_target = smb_direct_send_credit_target;
+ atomic_set(&t->send_credits, 0);
+ atomic_set(&t->rw_avail_ops, smb_direct_max_outstanding_rw_ops);
+
+ t->max_send_size = smb_direct_max_send_size;
+ t->max_recv_size = smb_direct_max_receive_size;
+ t->max_fragmented_recv_size = smb_direct_max_fragmented_recv_size;
+
+ cap->max_send_wr = max_send_wrs;
+ cap->max_recv_wr = t->recv_credit_max;
+ cap->max_send_sge = SMB_DIRECT_MAX_SEND_SGES;
+ cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES;
+ cap->max_inline_data = 0;
+ cap->max_rdma_ctxs = 0;
+ return 0;
+}
+
+static void smb_direct_destroy_pools(struct smb_direct_transport *t)
+{
+ struct smb_direct_recvmsg *recvmsg;
+
+ while ((recvmsg = get_free_recvmsg(t)))
+ mempool_free(recvmsg, t->recvmsg_mempool);
+ while ((recvmsg = get_empty_recvmsg(t)))
+ mempool_free(recvmsg, t->recvmsg_mempool);
+
+ mempool_destroy(t->recvmsg_mempool);
+ t->recvmsg_mempool = NULL;
+
+ kmem_cache_destroy(t->recvmsg_cache);
+ t->recvmsg_cache = NULL;
+
+ mempool_destroy(t->sendmsg_mempool);
+ t->sendmsg_mempool = NULL;
+
+ kmem_cache_destroy(t->sendmsg_cache);
+ t->sendmsg_cache = NULL;
+}
+
+static int smb_direct_create_pools(struct smb_direct_transport *t)
+{
+ char name[80];
+ int i;
+ struct smb_direct_recvmsg *recvmsg;
+
+ snprintf(name, sizeof(name), "smb_direct_rqst_pool_%p", t);
+ t->sendmsg_cache = kmem_cache_create(name,
+ sizeof(struct smb_direct_sendmsg) +
+ sizeof(struct smb_direct_negotiate_resp),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (!t->sendmsg_cache)
+ return -ENOMEM;
+
+ t->sendmsg_mempool = mempool_create(t->send_credit_target,
+ mempool_alloc_slab, mempool_free_slab,
+ t->sendmsg_cache);
+ if (!t->sendmsg_mempool)
+ goto err;
+
+ snprintf(name, sizeof(name), "smb_direct_resp_%p", t);
+ t->recvmsg_cache = kmem_cache_create(name,
+ sizeof(struct smb_direct_recvmsg) +
+ t->max_recv_size,
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (!t->recvmsg_cache)
+ goto err;
+
+ t->recvmsg_mempool =
+ mempool_create(t->recv_credit_max, mempool_alloc_slab,
+ mempool_free_slab, t->recvmsg_cache);
+ if (!t->recvmsg_mempool)
+ goto err;
+
+ INIT_LIST_HEAD(&t->recvmsg_queue);
+
+ for (i = 0; i < t->recv_credit_max; i++) {
+ recvmsg = mempool_alloc(t->recvmsg_mempool, GFP_KERNEL);
+ if (!recvmsg)
+ goto err;
+ recvmsg->transport = t;
+ list_add(&recvmsg->list, &t->recvmsg_queue);
+ }
+ t->count_avail_recvmsg = t->recv_credit_max;
+
+ return 0;
+err:
+ smb_direct_destroy_pools(t);
+ return -ENOMEM;
+}
+
+static int smb_direct_create_qpair(struct smb_direct_transport *t,
+ struct ib_qp_cap *cap)
+{
+ int ret;
+ struct ib_qp_init_attr qp_attr;
+
+ t->pd = ib_alloc_pd(t->cm_id->device, 0);
+ if (IS_ERR(t->pd)) {
+ pr_err("Can't create RDMA PD\n");
+ ret = PTR_ERR(t->pd);
+ t->pd = NULL;
+ return ret;
+ }
+
+ t->send_cq = ib_alloc_cq(t->cm_id->device, t,
+ t->send_credit_target, 0, IB_POLL_WORKQUEUE);
+ if (IS_ERR(t->send_cq)) {
+ pr_err("Can't create RDMA send CQ\n");
+ ret = PTR_ERR(t->send_cq);
+ t->send_cq = NULL;
+ goto err;
+ }
+
+ t->recv_cq = ib_alloc_cq(t->cm_id->device, t,
+ cap->max_send_wr + cap->max_rdma_ctxs,
+ 0, IB_POLL_WORKQUEUE);
+ if (IS_ERR(t->recv_cq)) {
+ pr_err("Can't create RDMA recv CQ\n");
+ ret = PTR_ERR(t->recv_cq);
+ t->recv_cq = NULL;
+ goto err;
+ }
+
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.event_handler = smb_direct_qpair_handler;
+ qp_attr.qp_context = t;
+ qp_attr.cap = *cap;
+ qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+ qp_attr.qp_type = IB_QPT_RC;
+ qp_attr.send_cq = t->send_cq;
+ qp_attr.recv_cq = t->recv_cq;
+ qp_attr.port_num = ~0;
+
+ ret = rdma_create_qp(t->cm_id, t->pd, &qp_attr);
+ if (ret) {
+ pr_err("Can't create RDMA QP: %d\n", ret);
+ goto err;
+ }
+
+ t->qp = t->cm_id->qp;
+ t->cm_id->event_handler = smb_direct_cm_handler;
+
+ return 0;
+err:
+ if (t->qp) {
+ ib_destroy_qp(t->qp);
+ t->qp = NULL;
+ }
+ if (t->recv_cq) {
+ ib_destroy_cq(t->recv_cq);
+ t->recv_cq = NULL;
+ }
+ if (t->send_cq) {
+ ib_destroy_cq(t->send_cq);
+ t->send_cq = NULL;
+ }
+ if (t->pd) {
+ ib_dealloc_pd(t->pd);
+ t->pd = NULL;
+ }
+ return ret;
+}
+
+static int smb_direct_prepare(struct ksmbd_transport *t)
+{
+ struct smb_direct_transport *st = smb_trans_direct_transfort(t);
+ int ret;
+ struct ib_qp_cap qp_cap;
+
+ ret = smb_direct_init_params(st, &qp_cap);
+ if (ret) {
+ pr_err("Can't configure RDMA parameters\n");
+ return ret;
+ }
+
+ ret = smb_direct_create_pools(st);
+ if (ret) {
+ pr_err("Can't init RDMA pool: %d\n", ret);
+ return ret;
+ }
+
+ ret = smb_direct_create_qpair(st, &qp_cap);
+ if (ret) {
+ pr_err("Can't accept RDMA client: %d\n", ret);
+ return ret;
+ }
+
+ ret = smb_direct_negotiate(st);
+ if (ret) {
+ pr_err("Can't negotiate: %d\n", ret);
+ return ret;
+ }
+
+ st->status = SMB_DIRECT_CS_CONNECTED;
+ return 0;
+}
+
+static bool rdma_frwr_is_supported(struct ib_device_attr *attrs)
+{
+ if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
+ return false;
+ if (attrs->max_fast_reg_page_list_len == 0)
+ return false;
+ return true;
+}
+
+static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id)
+{
+ struct smb_direct_transport *t;
+
+ if (!rdma_frwr_is_supported(&new_cm_id->device->attrs)) {
+ ksmbd_debug(RDMA,
+			    "Fast Registration Work Requests are not supported. device capabilities=%llx\n",
+ new_cm_id->device->attrs.device_cap_flags);
+ return -EPROTONOSUPPORT;
+ }
+
+ t = alloc_transport(new_cm_id);
+ if (!t)
+ return -ENOMEM;
+
+ KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop,
+ KSMBD_TRANS(t)->conn, "ksmbd:r%u",
+ SMB_DIRECT_PORT);
+ if (IS_ERR(KSMBD_TRANS(t)->handler)) {
+ int ret = PTR_ERR(KSMBD_TRANS(t)->handler);
+
+ pr_err("Can't start thread\n");
+ free_transport(t);
+ return ret;
+ }
+
+ return 0;
+}
+
+static int smb_direct_listen_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event)
+{
+ switch (event->event) {
+ case RDMA_CM_EVENT_CONNECT_REQUEST: {
+ int ret = smb_direct_handle_connect_request(cm_id);
+
+ if (ret) {
+ pr_err("Can't create transport: %d\n", ret);
+ return ret;
+ }
+
+ ksmbd_debug(RDMA, "Received connection request. cm_id=%p\n",
+ cm_id);
+ break;
+ }
+ default:
+ pr_err("Unexpected listen event. cm_id=%p, event=%s (%d)\n",
+ cm_id, rdma_event_msg(event->event), event->event);
+ break;
+ }
+ return 0;
+}
+
+static int smb_direct_listen(int port)
+{
+ int ret;
+ struct rdma_cm_id *cm_id;
+ struct sockaddr_in sin = {
+ .sin_family = AF_INET,
+ .sin_addr.s_addr = htonl(INADDR_ANY),
+ .sin_port = htons(port),
+ };
+
+ cm_id = rdma_create_id(&init_net, smb_direct_listen_handler,
+ &smb_direct_listener, RDMA_PS_TCP, IB_QPT_RC);
+ if (IS_ERR(cm_id)) {
+ pr_err("Can't create cm id: %ld\n", PTR_ERR(cm_id));
+ return PTR_ERR(cm_id);
+ }
+
+ ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+ if (ret) {
+ pr_err("Can't bind: %d\n", ret);
+ goto err;
+ }
+
+ smb_direct_listener.cm_id = cm_id;
+
+ ret = rdma_listen(cm_id, 10);
+ if (ret) {
+ pr_err("Can't listen: %d\n", ret);
+ goto err;
+ }
+ return 0;
+err:
+ smb_direct_listener.cm_id = NULL;
+ rdma_destroy_id(cm_id);
+ return ret;
+}
+
+int ksmbd_rdma_init(void)
+{
+ int ret;
+
+ smb_direct_listener.cm_id = NULL;
+
+	/* When a client runs out of send credits, the server grants new
+	 * credits by sending a packet through this workqueue. This avoids
+	 * the situation where a client cannot send packets for lack of
+	 * credits.
+	 */
+ smb_direct_wq = alloc_workqueue("ksmbd-smb_direct-wq",
+ WQ_HIGHPRI | WQ_MEM_RECLAIM, 0);
+ if (!smb_direct_wq)
+ return -ENOMEM;
+
+ ret = smb_direct_listen(SMB_DIRECT_PORT);
+ if (ret) {
+ destroy_workqueue(smb_direct_wq);
+ smb_direct_wq = NULL;
+ pr_err("Can't listen: %d\n", ret);
+ return ret;
+ }
+
+ ksmbd_debug(RDMA, "init RDMA listener. cm_id=%p\n",
+ smb_direct_listener.cm_id);
+ return 0;
+}
+
+int ksmbd_rdma_destroy(void)
+{
+ if (smb_direct_listener.cm_id)
+ rdma_destroy_id(smb_direct_listener.cm_id);
+ smb_direct_listener.cm_id = NULL;
+
+ if (smb_direct_wq) {
+ flush_workqueue(smb_direct_wq);
+ destroy_workqueue(smb_direct_wq);
+ smb_direct_wq = NULL;
+ }
+ return 0;
+}
+
+bool ksmbd_rdma_capable_netdev(struct net_device *netdev)
+{
+ struct ib_device *ibdev;
+ bool rdma_capable = false;
+
+ ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_UNKNOWN);
+ if (ibdev) {
+ if (rdma_frwr_is_supported(&ibdev->attrs))
+ rdma_capable = true;
+ ib_device_put(ibdev);
+ }
+ return rdma_capable;
+}
+
+static struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops = {
+ .prepare = smb_direct_prepare,
+ .disconnect = smb_direct_disconnect,
+ .writev = smb_direct_writev,
+ .read = smb_direct_read,
+ .rdma_read = smb_direct_rdma_read,
+ .rdma_write = smb_direct_rdma_write,
+};
diff --git a/fs/ksmbd/transport_rdma.h b/fs/ksmbd/transport_rdma.h
new file mode 100644
index 000000000000..0fa8adc0776f
--- /dev/null
+++ b/fs/ksmbd/transport_rdma.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2017, Microsoft Corporation.
+ * Copyright (C) 2018, LG Electronics.
+ */
+
+#ifndef __KSMBD_TRANSPORT_RDMA_H__
+#define __KSMBD_TRANSPORT_RDMA_H__
+
+#define SMB_DIRECT_PORT 5445
+
+/* SMB DIRECT negotiation request packet [MS-SMBD] 2.2.1 */
+struct smb_direct_negotiate_req {
+ __le16 min_version;
+ __le16 max_version;
+ __le16 reserved;
+ __le16 credits_requested;
+ __le32 preferred_send_size;
+ __le32 max_receive_size;
+ __le32 max_fragmented_size;
+} __packed;
+
+/* SMB DIRECT negotiation response packet [MS-SMBD] 2.2.2 */
+struct smb_direct_negotiate_resp {
+ __le16 min_version;
+ __le16 max_version;
+ __le16 negotiated_version;
+ __le16 reserved;
+ __le16 credits_requested;
+ __le16 credits_granted;
+ __le32 status;
+ __le32 max_readwrite_size;
+ __le32 preferred_send_size;
+ __le32 max_receive_size;
+ __le32 max_fragmented_size;
+} __packed;
+
+#define SMB_DIRECT_RESPONSE_REQUESTED 0x0001
+
+/* SMB DIRECT data transfer packet with payload [MS-SMBD] 2.2.3 */
+struct smb_direct_data_transfer {
+ __le16 credits_requested;
+ __le16 credits_granted;
+ __le16 flags;
+ __le16 reserved;
+ __le32 remaining_data_length;
+ __le32 data_offset;
+ __le32 data_length;
+ __le32 padding;
+ __u8 buffer[];
+} __packed;
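+/*
+ * Per [MS-SMBD], data_offset is the offset of the payload from the start of
+ * the packet, data_length is the payload size, and remaining_data_length is
+ * non-zero while further fragments of the same SMB2 message are pending.
+ */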
+
+#ifdef CONFIG_SMB_SERVER_SMBDIRECT
+int ksmbd_rdma_init(void);
+int ksmbd_rdma_destroy(void);
+bool ksmbd_rdma_capable_netdev(struct net_device *netdev);
+#else
+static inline int ksmbd_rdma_init(void) { return 0; }
+static inline int ksmbd_rdma_destroy(void) { return 0; }
+static inline bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { return false; }
+#endif
+
+#endif /* __KSMBD_TRANSPORT_RDMA_H__ */
diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c
new file mode 100644
index 000000000000..dc15a5ecd2e0
--- /dev/null
+++ b/fs/ksmbd/transport_tcp.c
@@ -0,0 +1,618 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org>
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/freezer.h>
+
+#include "smb_common.h"
+#include "server.h"
+#include "auth.h"
+#include "connection.h"
+#include "transport_tcp.h"
+
+#define IFACE_STATE_DOWN BIT(0)
+#define IFACE_STATE_CONFIGURED BIT(1)
+
+struct interface {
+ struct task_struct *ksmbd_kthread;
+ struct socket *ksmbd_socket;
+ struct list_head entry;
+ char *name;
+ struct mutex sock_release_lock;
+ int state;
+};
+
+static LIST_HEAD(iface_list);
+
+static int bind_additional_ifaces;
+
+struct tcp_transport {
+ struct ksmbd_transport transport;
+ struct socket *sock;
+ struct kvec *iov;
+ unsigned int nr_iov;
+};
+
+static struct ksmbd_transport_ops ksmbd_tcp_transport_ops;
+
+static void tcp_stop_kthread(struct task_struct *kthread);
+static struct interface *alloc_iface(char *ifname);
+
+#define KSMBD_TRANS(t) (&(t)->transport)
+#define TCP_TRANS(t) ((struct tcp_transport *)container_of(t, \
+ struct tcp_transport, transport))
+
+static inline void ksmbd_tcp_nodelay(struct socket *sock)
+{
+ tcp_sock_set_nodelay(sock->sk);
+}
+
+static inline void ksmbd_tcp_reuseaddr(struct socket *sock)
+{
+ sock_set_reuseaddr(sock->sk);
+}
+
+static inline void ksmbd_tcp_rcv_timeout(struct socket *sock, s64 secs)
+{
+ lock_sock(sock->sk);
+ if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
+ sock->sk->sk_rcvtimeo = secs * HZ;
+ else
+ sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+ release_sock(sock->sk);
+}
+
+static inline void ksmbd_tcp_snd_timeout(struct socket *sock, s64 secs)
+{
+ sock_set_sndtimeo(sock->sk, secs);
+}
+
+static struct tcp_transport *alloc_transport(struct socket *client_sk)
+{
+ struct tcp_transport *t;
+ struct ksmbd_conn *conn;
+
+ t = kzalloc(sizeof(*t), GFP_KERNEL);
+ if (!t)
+ return NULL;
+ t->sock = client_sk;
+
+ conn = ksmbd_conn_alloc();
+ if (!conn) {
+ kfree(t);
+ return NULL;
+ }
+
+ conn->transport = KSMBD_TRANS(t);
+ KSMBD_TRANS(t)->conn = conn;
+ KSMBD_TRANS(t)->ops = &ksmbd_tcp_transport_ops;
+ return t;
+}
+
+static void free_transport(struct tcp_transport *t)
+{
+ kernel_sock_shutdown(t->sock, SHUT_RDWR);
+ sock_release(t->sock);
+ t->sock = NULL;
+
+ ksmbd_conn_free(KSMBD_TRANS(t)->conn);
+ kfree(t->iov);
+ kfree(t);
+}
+
+/**
+ * kvec_array_init() - initialize an IO vector segment
+ * @new: IO vector to be initialized
+ * @iov: base IO vector
+ * @nr_segs: number of segments in base iov
+ * @bytes: total iovec length so far for read
+ *
+ * Return: Number of IO segments
+ */
+static unsigned int kvec_array_init(struct kvec *new, struct kvec *iov,
+ unsigned int nr_segs, size_t bytes)
+{
+ size_t base = 0;
+
+ while (bytes || !iov->iov_len) {
+ int copy = min(bytes, iov->iov_len);
+
+ bytes -= copy;
+ base += copy;
+ if (iov->iov_len == base) {
+ iov++;
+ nr_segs--;
+ base = 0;
+ }
+ }
+
+ memcpy(new, iov, sizeof(*iov) * nr_segs);
+ new->iov_base += base;
+ new->iov_len -= base;
+ return nr_segs;
+}
+
+/**
+ * get_conn_iovec() - get connection iovec for reading from socket
+ * @t: TCP transport instance
+ * @nr_segs: number of segments in iov
+ *
+ * Return: existing or newly allocated iovec, or NULL on allocation failure
+ */
+static struct kvec *get_conn_iovec(struct tcp_transport *t, unsigned int nr_segs)
+{
+ struct kvec *new_iov;
+
+ if (t->iov && nr_segs <= t->nr_iov)
+ return t->iov;
+
+ /* not big enough -- allocate a new one and release the old */
+ new_iov = kmalloc_array(nr_segs, sizeof(*new_iov), GFP_KERNEL);
+ if (new_iov) {
+ kfree(t->iov);
+ t->iov = new_iov;
+ t->nr_iov = nr_segs;
+ }
+ return new_iov;
+}
+
+static unsigned short ksmbd_tcp_get_port(const struct sockaddr *sa)
+{
+ switch (sa->sa_family) {
+ case AF_INET:
+ return ntohs(((struct sockaddr_in *)sa)->sin_port);
+ case AF_INET6:
+ return ntohs(((struct sockaddr_in6 *)sa)->sin6_port);
+ }
+ return 0;
+}
+
+/**
+ * ksmbd_tcp_new_connection() - create a new tcp session for an accepted socket
+ * @client_sk: socket associated with new connection
+ *
+ * Whenever a new connection is accepted, create a connection thread
+ * (session thread) to handle incoming SMB requests from that connection.
+ *
+ * Return: 0 on success, otherwise error
+ */
+static int ksmbd_tcp_new_connection(struct socket *client_sk)
+{
+ struct sockaddr *csin;
+ int rc = 0;
+ struct tcp_transport *t;
+
+ t = alloc_transport(client_sk);
+ if (!t)
+ return -ENOMEM;
+
+ csin = KSMBD_TCP_PEER_SOCKADDR(KSMBD_TRANS(t)->conn);
+ if (kernel_getpeername(client_sk, csin) < 0) {
+ pr_err("client ip resolution failed\n");
+ rc = -EINVAL;
+ goto out_error;
+ }
+
+ KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop,
+ KSMBD_TRANS(t)->conn,
+ "ksmbd:%u",
+ ksmbd_tcp_get_port(csin));
+ if (IS_ERR(KSMBD_TRANS(t)->handler)) {
+ pr_err("cannot start conn thread\n");
+ rc = PTR_ERR(KSMBD_TRANS(t)->handler);
+ free_transport(t);
+ }
+ return rc;
+
+out_error:
+ free_transport(t);
+ return rc;
+}
+
+/**
+ * ksmbd_kthread_fn() - listen for new TCP connections and hand them to the server
+ * @p: pointer to the listening interface (struct interface)
+ *
+ * Return: 0 when the listener thread stops
+ */
+static int ksmbd_kthread_fn(void *p)
+{
+ struct socket *client_sk = NULL;
+ struct interface *iface = (struct interface *)p;
+ int ret;
+
+ while (!kthread_should_stop()) {
+ mutex_lock(&iface->sock_release_lock);
+ if (!iface->ksmbd_socket) {
+ mutex_unlock(&iface->sock_release_lock);
+ break;
+ }
+ ret = kernel_accept(iface->ksmbd_socket, &client_sk,
+ O_NONBLOCK);
+ mutex_unlock(&iface->sock_release_lock);
+ if (ret) {
+ if (ret == -EAGAIN)
+ /* check for new connections every 100 msecs */
+ schedule_timeout_interruptible(HZ / 10);
+ continue;
+ }
+
+ ksmbd_debug(CONN, "connect success: accepted new connection\n");
+ client_sk->sk->sk_rcvtimeo = KSMBD_TCP_RECV_TIMEOUT;
+ client_sk->sk->sk_sndtimeo = KSMBD_TCP_SEND_TIMEOUT;
+
+ ksmbd_tcp_new_connection(client_sk);
+ }
+
+ ksmbd_debug(CONN, "releasing socket\n");
+ return 0;
+}
+
+/**
+ * ksmbd_tcp_run_kthread() - start forker thread
+ * @iface: pointer to struct interface
+ *
+ * Start the forker thread (ksmbd-<ifname>) that listens on the configured
+ * TCP port for new SMB connection requests and creates a per-connection
+ * server thread (ksmbd:<port>) for each accepted connection.
+ *
+ * Return: 0 on success or error number
+ */
+static int ksmbd_tcp_run_kthread(struct interface *iface)
+{
+ int rc;
+ struct task_struct *kthread;
+
+ kthread = kthread_run(ksmbd_kthread_fn, (void *)iface, "ksmbd-%s",
+ iface->name);
+ if (IS_ERR(kthread)) {
+ rc = PTR_ERR(kthread);
+ return rc;
+ }
+ iface->ksmbd_kthread = kthread;
+
+ return 0;
+}
+
+/**
+ * ksmbd_tcp_readv() - read data from socket in given iovec
+ * @t: TCP transport instance
+ * @iov_orig: base IO vector
+ * @nr_segs: number of segments in base iov
+ * @to_read: number of bytes to read from socket
+ *
+ * Return: on success return number of bytes read from socket,
+ * otherwise return error number
+ */
+static int ksmbd_tcp_readv(struct tcp_transport *t, struct kvec *iov_orig,
+ unsigned int nr_segs, unsigned int to_read)
+{
+ int length = 0;
+ int total_read;
+ unsigned int segs;
+ struct msghdr ksmbd_msg;
+ struct kvec *iov;
+ struct ksmbd_conn *conn = KSMBD_TRANS(t)->conn;
+
+ iov = get_conn_iovec(t, nr_segs);
+ if (!iov)
+ return -ENOMEM;
+
+ ksmbd_msg.msg_control = NULL;
+ ksmbd_msg.msg_controllen = 0;
+
+ for (total_read = 0; to_read; total_read += length, to_read -= length) {
+ try_to_freeze();
+
+ if (!ksmbd_conn_alive(conn)) {
+ total_read = -ESHUTDOWN;
+ break;
+ }
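+		/*
+		 * Rebuild the iovec so it skips the bytes already read; a
+		 * short recv then resumes at the correct offset on the next
+		 * iteration.
+		 */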
+ segs = kvec_array_init(iov, iov_orig, nr_segs, total_read);
+
+ length = kernel_recvmsg(t->sock, &ksmbd_msg,
+ iov, segs, to_read, 0);
+
+ if (length == -EINTR) {
+ total_read = -ESHUTDOWN;
+ break;
+ } else if (conn->status == KSMBD_SESS_NEED_RECONNECT) {
+ total_read = -EAGAIN;
+ break;
+ } else if (length == -ERESTARTSYS || length == -EAGAIN) {
+ usleep_range(1000, 2000);
+ length = 0;
+ continue;
+ } else if (length <= 0) {
+ total_read = -EAGAIN;
+ break;
+ }
+ }
+ return total_read;
+}
+
+/**
+ * ksmbd_tcp_read() - read data from socket in given buffer
+ * @t: TCP transport instance
+ * @buf: buffer to store read data from socket
+ * @to_read: number of bytes to read from socket
+ *
+ * Return: on success return number of bytes read from socket,
+ * otherwise return error number
+ */
+static int ksmbd_tcp_read(struct ksmbd_transport *t, char *buf, unsigned int to_read)
+{
+ struct kvec iov;
+
+ iov.iov_base = buf;
+ iov.iov_len = to_read;
+
+ return ksmbd_tcp_readv(TCP_TRANS(t), &iov, 1, to_read);
+}
+
+static int ksmbd_tcp_writev(struct ksmbd_transport *t, struct kvec *iov,
+ int nvecs, int size, bool need_invalidate,
+ unsigned int remote_key)
+
+{
+ struct msghdr smb_msg = {.msg_flags = MSG_NOSIGNAL};
+
+ return kernel_sendmsg(TCP_TRANS(t)->sock, &smb_msg, iov, nvecs, size);
+}
+
+static void ksmbd_tcp_disconnect(struct ksmbd_transport *t)
+{
+ free_transport(TCP_TRANS(t));
+}
+
+static void tcp_destroy_socket(struct socket *ksmbd_socket)
+{
+ int ret;
+
+ if (!ksmbd_socket)
+ return;
+
+	/* clear the send/receive timeouts */
+ ksmbd_tcp_rcv_timeout(ksmbd_socket, 0);
+ ksmbd_tcp_snd_timeout(ksmbd_socket, 0);
+
+ ret = kernel_sock_shutdown(ksmbd_socket, SHUT_RDWR);
+ if (ret)
+ pr_err("Failed to shutdown socket: %d\n", ret);
+ sock_release(ksmbd_socket);
+}
+
+/**
+ * create_socket - create a listening socket for an interface
+ * @iface: interface to create the listening socket on
+ *
+ * Return: 0 on success, otherwise error number
+ */
+static int create_socket(struct interface *iface)
+{
+ int ret;
+ struct sockaddr_in6 sin6;
+ struct sockaddr_in sin;
+ struct socket *ksmbd_socket;
+ bool ipv4 = false;
+
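+	/* Try an IPv6 listening socket first; fall back to IPv4 if that fails. */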
+ ret = sock_create(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &ksmbd_socket);
+ if (ret) {
+ pr_err("Can't create socket for ipv6, try ipv4: %d\n", ret);
+ ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &ksmbd_socket);
+ if (ret) {
+ pr_err("Can't create socket for ipv4: %d\n", ret);
+ goto out_error;
+ }
+
+ sin.sin_family = PF_INET;
+ sin.sin_addr.s_addr = htonl(INADDR_ANY);
+ sin.sin_port = htons(server_conf.tcp_port);
+ ipv4 = true;
+ } else {
+ sin6.sin6_family = PF_INET6;
+ sin6.sin6_addr = in6addr_any;
+ sin6.sin6_port = htons(server_conf.tcp_port);
+ }
+
+ ksmbd_tcp_nodelay(ksmbd_socket);
+ ksmbd_tcp_reuseaddr(ksmbd_socket);
+
+ ret = sock_setsockopt(ksmbd_socket,
+ SOL_SOCKET,
+ SO_BINDTODEVICE,
+ KERNEL_SOCKPTR(iface->name),
+ strlen(iface->name));
+ if (ret != -ENODEV && ret < 0) {
+ pr_err("Failed to set SO_BINDTODEVICE: %d\n", ret);
+ goto out_error;
+ }
+
+ if (ipv4)
+ ret = kernel_bind(ksmbd_socket, (struct sockaddr *)&sin,
+ sizeof(sin));
+ else
+ ret = kernel_bind(ksmbd_socket, (struct sockaddr *)&sin6,
+ sizeof(sin6));
+ if (ret) {
+ pr_err("Failed to bind socket: %d\n", ret);
+ goto out_error;
+ }
+
+ ksmbd_socket->sk->sk_rcvtimeo = KSMBD_TCP_RECV_TIMEOUT;
+ ksmbd_socket->sk->sk_sndtimeo = KSMBD_TCP_SEND_TIMEOUT;
+
+ ret = kernel_listen(ksmbd_socket, KSMBD_SOCKET_BACKLOG);
+ if (ret) {
+ pr_err("Port listen() error: %d\n", ret);
+ goto out_error;
+ }
+
+ iface->ksmbd_socket = ksmbd_socket;
+ ret = ksmbd_tcp_run_kthread(iface);
+ if (ret) {
+ pr_err("Can't start ksmbd main kthread: %d\n", ret);
+ goto out_error;
+ }
+ iface->state = IFACE_STATE_CONFIGURED;
+
+ return 0;
+
+out_error:
+ tcp_destroy_socket(ksmbd_socket);
+ iface->ksmbd_socket = NULL;
+ return ret;
+}
+
+static int ksmbd_netdev_event(struct notifier_block *nb, unsigned long event,
+ void *ptr)
+{
+ struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+ struct interface *iface;
+ int ret, found = 0;
+
+ switch (event) {
+ case NETDEV_UP:
+ if (netdev->priv_flags & IFF_BRIDGE_PORT)
+ return NOTIFY_OK;
+
+ list_for_each_entry(iface, &iface_list, entry) {
+ if (!strcmp(iface->name, netdev->name)) {
+ found = 1;
+ if (iface->state != IFACE_STATE_DOWN)
+ break;
+ ret = create_socket(iface);
+ if (ret)
+ return NOTIFY_OK;
+ break;
+ }
+ }
+ if (!found && bind_additional_ifaces) {
+ iface = alloc_iface(kstrdup(netdev->name, GFP_KERNEL));
+ if (!iface)
+ return NOTIFY_OK;
+ ret = create_socket(iface);
+ if (ret)
+ break;
+ }
+ break;
+ case NETDEV_DOWN:
+ list_for_each_entry(iface, &iface_list, entry) {
+ if (!strcmp(iface->name, netdev->name) &&
+ iface->state == IFACE_STATE_CONFIGURED) {
+ tcp_stop_kthread(iface->ksmbd_kthread);
+ iface->ksmbd_kthread = NULL;
+ mutex_lock(&iface->sock_release_lock);
+ tcp_destroy_socket(iface->ksmbd_socket);
+ iface->ksmbd_socket = NULL;
+ mutex_unlock(&iface->sock_release_lock);
+
+ iface->state = IFACE_STATE_DOWN;
+ break;
+ }
+ }
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block ksmbd_netdev_notifier = {
+ .notifier_call = ksmbd_netdev_event,
+};
+
+int ksmbd_tcp_init(void)
+{
+ register_netdevice_notifier(&ksmbd_netdev_notifier);
+
+ return 0;
+}
+
+static void tcp_stop_kthread(struct task_struct *kthread)
+{
+ int ret;
+
+ if (!kthread)
+ return;
+
+ ret = kthread_stop(kthread);
+ if (ret)
+ pr_err("failed to stop forker thread\n");
+}
+
+void ksmbd_tcp_destroy(void)
+{
+ struct interface *iface, *tmp;
+
+ unregister_netdevice_notifier(&ksmbd_netdev_notifier);
+
+ list_for_each_entry_safe(iface, tmp, &iface_list, entry) {
+ list_del(&iface->entry);
+ kfree(iface->name);
+ kfree(iface);
+ }
+}
+
+static struct interface *alloc_iface(char *ifname)
+{
+ struct interface *iface;
+
+ if (!ifname)
+ return NULL;
+
+ iface = kzalloc(sizeof(struct interface), GFP_KERNEL);
+ if (!iface) {
+ kfree(ifname);
+ return NULL;
+ }
+
+ iface->name = ifname;
+ iface->state = IFACE_STATE_DOWN;
+ list_add(&iface->entry, &iface_list);
+ mutex_init(&iface->sock_release_lock);
+ return iface;
+}
+
+int ksmbd_tcp_set_interfaces(char *ifc_list, int ifc_list_sz)
+{
+ int sz = 0;
+
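+	/*
+	 * With no interface list, bind every existing non-bridge netdev and
+	 * let the netdev notifier pick up interfaces that appear later;
+	 * otherwise bind only the NUL-separated names in ifc_list.
+	 */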
+ if (!ifc_list_sz) {
+ struct net_device *netdev;
+
+ rtnl_lock();
+ for_each_netdev(&init_net, netdev) {
+ if (netdev->priv_flags & IFF_BRIDGE_PORT)
+ continue;
+			if (!alloc_iface(kstrdup(netdev->name, GFP_KERNEL))) {
+				rtnl_unlock();
+				return -ENOMEM;
+			}
+ }
+ rtnl_unlock();
+ bind_additional_ifaces = 1;
+ return 0;
+ }
+
+ while (ifc_list_sz > 0) {
+ if (!alloc_iface(kstrdup(ifc_list, GFP_KERNEL)))
+ return -ENOMEM;
+
+ sz = strlen(ifc_list);
+ if (!sz)
+ break;
+
+ ifc_list += sz + 1;
+ ifc_list_sz -= (sz + 1);
+ }
+
+ bind_additional_ifaces = 0;
+
+ return 0;
+}
+
+static struct ksmbd_transport_ops ksmbd_tcp_transport_ops = {
+ .read = ksmbd_tcp_read,
+ .writev = ksmbd_tcp_writev,
+ .disconnect = ksmbd_tcp_disconnect,
+};
diff --git a/fs/ksmbd/transport_tcp.h b/fs/ksmbd/transport_tcp.h
new file mode 100644
index 000000000000..e338bebe322f
--- /dev/null
+++ b/fs/ksmbd/transport_tcp.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#ifndef __KSMBD_TRANSPORT_TCP_H__
+#define __KSMBD_TRANSPORT_TCP_H__
+
+int ksmbd_tcp_set_interfaces(char *ifc_list, int ifc_list_sz);
+int ksmbd_tcp_init(void);
+void ksmbd_tcp_destroy(void);
+
+#endif /* __KSMBD_TRANSPORT_TCP_H__ */
diff --git a/fs/ksmbd/unicode.c b/fs/ksmbd/unicode.c
new file mode 100644
index 000000000000..a0db699ddafd
--- /dev/null
+++ b/fs/ksmbd/unicode.c
@@ -0,0 +1,384 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Some of the source code in this file came from fs/cifs/cifs_unicode.c
+ *
+ * Copyright (c) International Business Machines Corp., 2000,2009
+ * Modified by Steve French (sfrench@us.ibm.com)
+ * Modified by Namjae Jeon (linkinjeon@kernel.org)
+ */
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+#include "glob.h"
+#include "unicode.h"
+#include "uniupr.h"
+#include "smb_common.h"
+
+/*
+ * smb_utf16_bytes() - how long will a string be after conversion?
+ * @from: pointer to input string
+ * @maxbytes: don't go past this many bytes of input string
+ * @codepage: destination codepage
+ *
+ * Walk a utf16le string and return the number of bytes that the string will
+ * be after being converted to the given charset, not including any null
+ * termination required. Don't walk past maxbytes in the source buffer.
+ *
+ * Return: string length after conversion
+ */
+static int smb_utf16_bytes(const __le16 *from, int maxbytes,
+ const struct nls_table *codepage)
+{
+ int i;
+ int charlen, outlen = 0;
+ int maxwords = maxbytes / 2;
+ char tmp[NLS_MAX_CHARSET_SIZE];
+ __u16 ftmp;
+
+ for (i = 0; i < maxwords; i++) {
+ ftmp = get_unaligned_le16(&from[i]);
+ if (ftmp == 0)
+ break;
+
+ charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE);
+ if (charlen > 0)
+ outlen += charlen;
+ else
+ outlen++;
+ }
+
+ return outlen;
+}
+
+/*
+ * cifs_mapchar() - convert a host-endian char to proper char in codepage
+ * @target: where converted character should be copied
+ * @src_char: 2 byte host-endian source character
+ * @cp: codepage to which character should be converted
+ * @mapchar: should character be mapped according to mapchars mount option?
+ *
+ * This function handles the conversion of a single character. It is the
+ * responsibility of the caller to ensure that the target buffer is large
+ * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
+ *
+ * Return: string length after conversion
+ */
+static int
+cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
+ bool mapchar)
+{
+ int len = 1;
+
+ if (!mapchar)
+ goto cp_convert;
+
+ /*
+ * BB: Cannot handle remapping UNI_SLASH until all the calls to
+ * build_path_from_dentry are modified, as they use slash as
+ * separator.
+ */
+ switch (src_char) {
+ case UNI_COLON:
+ *target = ':';
+ break;
+ case UNI_ASTERISK:
+ *target = '*';
+ break;
+ case UNI_QUESTION:
+ *target = '?';
+ break;
+ case UNI_PIPE:
+ *target = '|';
+ break;
+ case UNI_GRTRTHAN:
+ *target = '>';
+ break;
+ case UNI_LESSTHAN:
+ *target = '<';
+ break;
+ default:
+ goto cp_convert;
+ }
+
+out:
+ return len;
+
+cp_convert:
+ len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
+ if (len <= 0) {
+ *target = '?';
+ len = 1;
+ }
+
+ goto out;
+}
+
+/*
+ * is_char_allowed() - check for valid character
+ * @ch: input character to be checked
+ *
+ * Return: 1 if char is allowed, otherwise 0
+ */
+static inline int is_char_allowed(char *ch)
+{
+ /* check for control chars, wildcards etc. */
+ if (!(*ch & 0x80) &&
+ (*ch <= 0x1f ||
+ *ch == '?' || *ch == '"' || *ch == '<' ||
+ *ch == '>' || *ch == '|'))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * smb_from_utf16() - convert utf16le string to local charset
+ * @to: destination buffer
+ * @from: source buffer
+ * @tolen: destination buffer size (in bytes)
+ * @fromlen: source buffer size (in bytes)
+ * @codepage: codepage to which characters should be converted
+ * @mapchar: should characters be remapped according to the mapchars option?
+ *
+ * Convert a little-endian utf16le string (as sent by the server) to a string
+ * in the provided codepage. The tolen and fromlen parameters are to ensure
+ * that the code doesn't walk off of the end of the buffer (which is always
+ * a danger if the alignment of the source buffer is off). The destination
+ * string is always properly null terminated and fits in the destination
+ * buffer. Returns the length of the destination string in bytes (including
+ * null terminator).
+ *
+ * Note that some windows versions actually send multiword UTF-16 characters
+ * instead of straight UCS-2. The Linux nls routines however aren't able to
+ * deal with those characters properly. In the event that we get some of
+ * those characters, they won't be translated properly.
+ *
+ * Return: string length after conversion
+ */
+static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
+ const struct nls_table *codepage, bool mapchar)
+{
+ int i, charlen, safelen;
+ int outlen = 0;
+ int nullsize = nls_nullsize(codepage);
+ int fromwords = fromlen / 2;
+ char tmp[NLS_MAX_CHARSET_SIZE];
+ __u16 ftmp;
+
+ /*
+ * because the chars can be of varying widths, we need to take care
+ * not to overflow the destination buffer when we get close to the
+ * end of it. Until we get to this offset, we don't need to check
+ * for overflow however.
+ */
+ safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
+
+ for (i = 0; i < fromwords; i++) {
+ ftmp = get_unaligned_le16(&from[i]);
+ if (ftmp == 0)
+ break;
+
+ /*
+ * check to see if converting this character might make the
+ * conversion bleed into the null terminator
+ */
+ if (outlen >= safelen) {
+ charlen = cifs_mapchar(tmp, ftmp, codepage, mapchar);
+ if ((outlen + charlen) > (tolen - nullsize))
+ break;
+ }
+
+ /* put converted char into 'to' buffer */
+ charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar);
+ outlen += charlen;
+ }
+
+ /* properly null-terminate string */
+ for (i = 0; i < nullsize; i++)
+ to[outlen++] = 0;
+
+ return outlen;
+}
+
+/*
+ * smb_strtoUTF16() - Convert character string to unicode string
+ * @to: destination buffer
+ * @from: source buffer
+ * @len: destination buffer size (in bytes)
+ * @codepage: codepage to which characters should be converted
+ *
+ * Return: string length after conversion
+ */
+int smb_strtoUTF16(__le16 *to, const char *from, int len,
+ const struct nls_table *codepage)
+{
+ int charlen;
+ int i;
+ wchar_t wchar_to; /* needed to quiet sparse */
+
+ /* special case for utf8 to handle no plane0 chars */
+ if (!strcmp(codepage->charset, "utf8")) {
+		/*
+		 * Convert utf8 -> utf16. We assume the destination has enough
+		 * space, as the caller should have ensured the conversion does
+		 * not overflow it; len is the length in wchar_t (16-bit) units.
+		 */
+ i = utf8s_to_utf16s(from, len, UTF16_LITTLE_ENDIAN,
+ (wchar_t *)to, len);
+
+ /* if success terminate and exit */
+ if (i >= 0)
+ goto success;
+		/*
+		 * If it fails, fall back to UCS encoding, as this
+		 * function should not return negative values.
+		 * Currently it can fail only if the source contains
+		 * invalidly encoded characters.
+		 */
+ }
+
+ for (i = 0; len > 0 && *from; i++, from += charlen, len -= charlen) {
+ charlen = codepage->char2uni(from, len, &wchar_to);
+ if (charlen < 1) {
+ /* A question mark */
+ wchar_to = 0x003f;
+ charlen = 1;
+ }
+ put_unaligned_le16(wchar_to, &to[i]);
+ }
+
+success:
+ put_unaligned_le16(0, &to[i]);
+ return i;
+}
+
+/*
+ * smb_strndup_from_utf16() - copy a string from wire format to the local
+ * codepage
+ * @src: source string
+ * @maxlen: don't walk past this many bytes in the source string
+ * @is_unicode: is this a unicode string?
+ * @codepage: destination codepage
+ *
+ * Take a string given by the server, convert it to the local codepage and
+ * put it in a new buffer. Returns a pointer to the new string or NULL on
+ * error.
+ *
+ * Return: destination string buffer or error ptr
+ */
+char *smb_strndup_from_utf16(const char *src, const int maxlen,
+ const bool is_unicode,
+ const struct nls_table *codepage)
+{
+ int len, ret;
+ char *dst;
+
+ if (is_unicode) {
+ len = smb_utf16_bytes((__le16 *)src, maxlen, codepage);
+ len += nls_nullsize(codepage);
+ dst = kmalloc(len, GFP_KERNEL);
+ if (!dst)
+ return ERR_PTR(-ENOMEM);
+ ret = smb_from_utf16(dst, (__le16 *)src, len, maxlen, codepage,
+ false);
+ if (ret < 0) {
+ kfree(dst);
+ return ERR_PTR(-EINVAL);
+ }
+ } else {
+ len = strnlen(src, maxlen);
+ len++;
+ dst = kmalloc(len, GFP_KERNEL);
+ if (!dst)
+ return ERR_PTR(-ENOMEM);
+ strscpy(dst, src, len);
+ }
+
+ return dst;
+}
+
+/*
+ * smbConvertToUTF16() - convert string from local charset to utf16
+ * @target: destination buffer
+ * @source: source buffer
+ * @srclen: source buffer size (in bytes)
+ * @cp: codepage to which characters should be converted
+ * @mapchars: should characters be remapped according to the mapchars option?
+ *
+ * Convert a pathname in the current code page to 16-bit Unicode wire format.
+ * Conversion may involve remapping up to the six characters that are only
+ * legal in POSIX-like OSes (if they are present in the string). Path names
+ * are little-endian 16-bit Unicode on the wire.
+ *
+ * Return: length (in 16-bit units) of the converted string
+ */
+int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
+ const struct nls_table *cp, int mapchars)
+{
+ int i, j, charlen;
+ char src_char;
+ __le16 dst_char;
+ wchar_t tmp;
+
+ if (!mapchars)
+ return smb_strtoUTF16(target, source, srclen, cp);
+
+ for (i = 0, j = 0; i < srclen; j++) {
+ src_char = source[i];
+ charlen = 1;
+ switch (src_char) {
+ case 0:
+ put_unaligned(0, &target[j]);
+ return j;
+ case ':':
+ dst_char = cpu_to_le16(UNI_COLON);
+ break;
+ case '*':
+ dst_char = cpu_to_le16(UNI_ASTERISK);
+ break;
+ case '?':
+ dst_char = cpu_to_le16(UNI_QUESTION);
+ break;
+ case '<':
+ dst_char = cpu_to_le16(UNI_LESSTHAN);
+ break;
+ case '>':
+ dst_char = cpu_to_le16(UNI_GRTRTHAN);
+ break;
+ case '|':
+ dst_char = cpu_to_le16(UNI_PIPE);
+ break;
+ /*
+		 * FIXME: We cannot handle remapping backslash (UNI_SLASH)
+ * until all the calls to build_path_from_dentry are modified,
+ * as they use backslash as separator.
+ */
+ default:
+ charlen = cp->char2uni(source + i, srclen - i, &tmp);
+ dst_char = cpu_to_le16(tmp);
+
+ /*
+ * if no match, use question mark, which at least in
+ * some cases serves as wild card
+ */
+ if (charlen < 1) {
+ dst_char = cpu_to_le16(0x003f);
+ charlen = 1;
+ }
+ }
+ /*
+ * character may take more than one byte in the source string,
+ * but will take exactly two bytes in the target string
+ */
+ i += charlen;
+ put_unaligned(dst_char, &target[j]);
+ }
+
+ return j;
+}
diff --git a/fs/ksmbd/unicode.h b/fs/ksmbd/unicode.h
new file mode 100644
index 000000000000..5593024230ae
--- /dev/null
+++ b/fs/ksmbd/unicode.h
@@ -0,0 +1,357 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Some of the source code in this file came from fs/cifs/cifs_unicode.c
+ * cifs_unicode: Unicode kernel case support
+ *
+ * Function:
+ * Convert a unicode character to upper or lower case using
+ * compressed tables.
+ *
+ * Copyright (c) International Business Machines Corp., 2000,2009
+ *
+ *
+ * Notes:
+ * These APIs are based on the C library functions. The semantics
+ * should match the C functions but with expanded size operands.
+ *
+ * The upper/lower functions are based on a table created by mkupr.
+ * This is a compressed table of upper and lower case conversion.
+ *
+ */
+#ifndef _CIFS_UNICODE_H
+#define _CIFS_UNICODE_H
+
+#include <asm/byteorder.h>
+#include <linux/types.h>
+#include <linux/nls.h>
+
+#define UNIUPR_NOLOWER /* Example to not expand lower case tables */
+
+/*
+ * Windows maps these to the user defined 16 bit Unicode range since they are
+ * reserved symbols (along with \ and /), otherwise illegal to store
+ * in filenames in NTFS
+ */
+#define UNI_ASTERISK ((__u16)('*' + 0xF000))
+#define UNI_QUESTION ((__u16)('?' + 0xF000))
+#define UNI_COLON ((__u16)(':' + 0xF000))
+#define UNI_GRTRTHAN ((__u16)('>' + 0xF000))
+#define UNI_LESSTHAN ((__u16)('<' + 0xF000))
+#define UNI_PIPE ((__u16)('|' + 0xF000))
+#define UNI_SLASH ((__u16)('\\' + 0xF000))
+
+/* Just define what we want from uniupr.h. We don't want to define the tables
+ * in each source file.
+ */
+#ifndef UNICASERANGE_DEFINED
+struct UniCaseRange {
+ wchar_t start;
+ wchar_t end;
+ signed char *table;
+};
+#endif /* UNICASERANGE_DEFINED */
+
+#ifndef UNIUPR_NOUPPER
+extern signed char SmbUniUpperTable[512];
+extern const struct UniCaseRange SmbUniUpperRange[];
+#endif /* UNIUPR_NOUPPER */
+
+#ifndef UNIUPR_NOLOWER
+extern signed char CifsUniLowerTable[512];
+extern const struct UniCaseRange CifsUniLowerRange[];
+#endif /* UNIUPR_NOLOWER */
+
+#ifdef __KERNEL__
+int smb_strtoUTF16(__le16 *to, const char *from, int len,
+ const struct nls_table *codepage);
+char *smb_strndup_from_utf16(const char *src, const int maxlen,
+ const bool is_unicode,
+ const struct nls_table *codepage);
+int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
+ const struct nls_table *cp, int mapchars);
+char *ksmbd_extract_sharename(char *treename);
+#endif
+
+/*
+ * UniStrcat: Concatenate the second string to the first
+ *
+ * Returns:
+ * Address of the first string
+ */
+static inline wchar_t *UniStrcat(wchar_t *ucs1, const wchar_t *ucs2)
+{
+ wchar_t *anchor = ucs1; /* save a pointer to start of ucs1 */
+
+ while (*ucs1++)
+ /*NULL*/; /* To end of first string */
+ ucs1--; /* Return to the null */
+ while ((*ucs1++ = *ucs2++))
+ /*NULL*/; /* copy string 2 over */
+ return anchor;
+}
+
+/*
+ * UniStrchr: Find a character in a string
+ *
+ * Returns:
+ * Address of first occurrence of character in string
+ * or NULL if the character is not in the string
+ */
+static inline wchar_t *UniStrchr(const wchar_t *ucs, wchar_t uc)
+{
+ while ((*ucs != uc) && *ucs)
+ ucs++;
+
+ if (*ucs == uc)
+ return (wchar_t *)ucs;
+ return NULL;
+}
+
+/*
+ * UniStrcmp: Compare two strings
+ *
+ * Returns:
+ * < 0: First string is less than second
+ * = 0: Strings are equal
+ * > 0: First string is greater than second
+ */
+static inline int UniStrcmp(const wchar_t *ucs1, const wchar_t *ucs2)
+{
+ while ((*ucs1 == *ucs2) && *ucs1) {
+ ucs1++;
+ ucs2++;
+ }
+ return (int)*ucs1 - (int)*ucs2;
+}
+
+/*
+ * UniStrcpy: Copy a string
+ */
+static inline wchar_t *UniStrcpy(wchar_t *ucs1, const wchar_t *ucs2)
+{
+ wchar_t *anchor = ucs1; /* save the start of result string */
+
+ while ((*ucs1++ = *ucs2++))
+ /*NULL*/;
+ return anchor;
+}
+
+/*
+ * UniStrlen: Return the length of a string (in 16 bit Unicode chars not bytes)
+ */
+static inline size_t UniStrlen(const wchar_t *ucs1)
+{
+ int i = 0;
+
+ while (*ucs1++)
+ i++;
+ return i;
+}
+
+/*
+ * UniStrnlen: Return the length (in 16 bit Unicode chars not bytes) of a
+ * string (length limited)
+ */
+static inline size_t UniStrnlen(const wchar_t *ucs1, int maxlen)
+{
+ int i = 0;
+
+ while (*ucs1++) {
+ i++;
+ if (i >= maxlen)
+ break;
+ }
+ return i;
+}
+
+/*
+ * UniStrncat: Concatenate length limited string
+ */
+static inline wchar_t *UniStrncat(wchar_t *ucs1, const wchar_t *ucs2, size_t n)
+{
+ wchar_t *anchor = ucs1; /* save pointer to string 1 */
+
+ while (*ucs1++)
+ /*NULL*/;
+ ucs1--; /* point to null terminator of s1 */
+ while (n-- && (*ucs1 = *ucs2)) { /* copy s2 after s1 */
+ ucs1++;
+ ucs2++;
+ }
+ *ucs1 = 0; /* Null terminate the result */
+ return anchor;
+}
+
+/*
+ * UniStrncmp: Compare length limited string
+ */
+static inline int UniStrncmp(const wchar_t *ucs1, const wchar_t *ucs2, size_t n)
+{
+ if (!n)
+ return 0; /* Null strings are equal */
+ while ((*ucs1 == *ucs2) && *ucs1 && --n) {
+ ucs1++;
+ ucs2++;
+ }
+ return (int)*ucs1 - (int)*ucs2;
+}
+
+/*
+ * UniStrncmp_le: Compare length limited string - native to little-endian
+ */
+static inline int
+UniStrncmp_le(const wchar_t *ucs1, const wchar_t *ucs2, size_t n)
+{
+ if (!n)
+ return 0; /* Null strings are equal */
+ while ((*ucs1 == __le16_to_cpu(*ucs2)) && *ucs1 && --n) {
+ ucs1++;
+ ucs2++;
+ }
+ return (int)*ucs1 - (int)__le16_to_cpu(*ucs2);
+}
+
+/*
+ * UniStrncpy: Copy length limited string with pad
+ */
+static inline wchar_t *UniStrncpy(wchar_t *ucs1, const wchar_t *ucs2, size_t n)
+{
+ wchar_t *anchor = ucs1;
+
+ while (n-- && *ucs2) /* Copy the strings */
+ *ucs1++ = *ucs2++;
+
+ n++;
+ while (n--) /* Pad with nulls */
+ *ucs1++ = 0;
+ return anchor;
+}
+
+/*
+ * UniStrncpy_le: Copy length limited string with pad to little-endian
+ */
+static inline wchar_t *UniStrncpy_le(wchar_t *ucs1, const wchar_t *ucs2, size_t n)
+{
+ wchar_t *anchor = ucs1;
+
+ while (n-- && *ucs2) /* Copy the strings */
+ *ucs1++ = __le16_to_cpu(*ucs2++);
+
+ n++;
+ while (n--) /* Pad with nulls */
+ *ucs1++ = 0;
+ return anchor;
+}
+
+/*
+ * UniStrstr: Find a string in a string
+ *
+ * Returns:
+ * Address of first match found
+ * NULL if no matching string is found
+ */
+static inline wchar_t *UniStrstr(const wchar_t *ucs1, const wchar_t *ucs2)
+{
+ const wchar_t *anchor1 = ucs1;
+ const wchar_t *anchor2 = ucs2;
+
+ while (*ucs1) {
+ if (*ucs1 == *ucs2) {
+ /* Partial match found */
+ ucs1++;
+ ucs2++;
+ } else {
+ if (!*ucs2) /* Match found */
+ return (wchar_t *)anchor1;
+ ucs1 = ++anchor1; /* No match */
+ ucs2 = anchor2;
+ }
+ }
+
+ if (!*ucs2) /* Both end together */
+ return (wchar_t *)anchor1; /* Match found */
+ return NULL; /* No match */
+}
+
+#ifndef UNIUPR_NOUPPER
+/*
+ * UniToupper: Convert a unicode character to upper case
+ */
+static inline wchar_t UniToupper(register wchar_t uc)
+{
+ register const struct UniCaseRange *rp;
+
+ if (uc < sizeof(SmbUniUpperTable)) {
+ /* Latin characters */
+ return uc + SmbUniUpperTable[uc]; /* Use base tables */
+ }
+
+ rp = SmbUniUpperRange; /* Use range tables */
+ while (rp->start) {
+ if (uc < rp->start) /* Before start of range */
+ return uc; /* Uppercase = input */
+ if (uc <= rp->end) /* In range */
+ return uc + rp->table[uc - rp->start];
+ rp++; /* Try next range */
+ }
+ return uc; /* Past last range */
+}
+
+/*
+ * UniStrupr: Upper case a unicode string
+ */
+static inline __le16 *UniStrupr(register __le16 *upin)
+{
+ register __le16 *up;
+
+ up = upin;
+ while (*up) { /* For all characters */
+ *up = cpu_to_le16(UniToupper(le16_to_cpu(*up)));
+ up++;
+ }
+ return upin; /* Return input pointer */
+}
+#endif /* UNIUPR_NOUPPER */
+
+#ifndef UNIUPR_NOLOWER
+/*
+ * UniTolower: Convert a unicode character to lower case
+ */
+static inline wchar_t UniTolower(register wchar_t uc)
+{
+ register const struct UniCaseRange *rp;
+
+ if (uc < sizeof(CifsUniLowerTable)) {
+ /* Latin characters */
+ return uc + CifsUniLowerTable[uc]; /* Use base tables */
+ }
+
+ rp = CifsUniLowerRange; /* Use range tables */
+ while (rp->start) {
+ if (uc < rp->start) /* Before start of range */
+			return uc;	/* Lowercase = input */
+ if (uc <= rp->end) /* In range */
+ return uc + rp->table[uc - rp->start];
+ rp++; /* Try next range */
+ }
+ return uc; /* Past last range */
+}
+
+/*
+ * UniStrlwr: Lower case a unicode string
+ */
+static inline wchar_t *UniStrlwr(register wchar_t *upin)
+{
+ register wchar_t *up;
+
+ up = upin;
+ while (*up) { /* For all characters */
+ *up = UniTolower(*up);
+ up++;
+ }
+ return upin; /* Return input pointer */
+}
+
+#endif
+
+#endif /* _CIFS_UNICODE_H */
diff --git a/fs/ksmbd/uniupr.h b/fs/ksmbd/uniupr.h
new file mode 100644
index 000000000000..26583b776897
--- /dev/null
+++ b/fs/ksmbd/uniupr.h
@@ -0,0 +1,268 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Some of the source code in this file came from fs/cifs/uniupr.h
+ * Copyright (c) International Business Machines Corp., 2000,2002
+ *
+ * uniupr.h - Unicode compressed case ranges
+ *
+ */
+#ifndef __KSMBD_UNIUPR_H
+#define __KSMBD_UNIUPR_H
+
+#ifndef UNIUPR_NOUPPER
+/*
+ * Latin upper case
+ */
+signed char SmbUniUpperTable[512] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 040-04f */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 050-05f */
+ 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
+ -32, -32, -32, -32, -32, /* 060-06f */
+ -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
+ -32, 0, 0, 0, 0, 0, /* 070-07f */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0c0-0cf */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0d0-0df */
+ -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
+ -32, -32, -32, -32, -32, -32, /* 0e0-0ef */
+ -32, -32, -32, -32, -32, -32, -32, 0, -32, -32,
+ -32, -32, -32, -32, -32, 121, /* 0f0-0ff */
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 100-10f */
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 110-11f */
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 120-12f */
+ 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 130-13f */
+ -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, /* 140-14f */
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 150-15f */
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 160-16f */
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 170-17f */
+ 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, /* 180-18f */
+ 0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, /* 190-19f */
+ 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, /* 1a0-1af */
+ -1, 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, /* 1b0-1bf */
+ 0, 0, 0, 0, 0, -1, -2, 0, -1, -2, 0, -1, -2, 0, -1, 0, /* 1c0-1cf */
+ -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, -79, 0, -1, /* 1d0-1df */
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e0-1ef */
+ 0, 0, -1, -2, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, /* 1f0-1ff */
+};
+
+/* Upper case range - Greek */
+static signed char UniCaseRangeU03a0[47] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -38, -37, -37, -37, /* 3a0-3af */
+ 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
+ -32, -32, -32, -32, /* 3b0-3bf */
+ -32, -32, -31, -32, -32, -32, -32, -32, -32, -32, -32, -32, -64,
+ -63, -63,
+};
+
+/* Upper case range - Cyrillic */
+static signed char UniCaseRangeU0430[48] = {
+ -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
+ -32, -32, -32, -32, /* 430-43f */
+ -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
+ -32, -32, -32, -32, /* 440-44f */
+ 0, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80,
+ -80, -80, 0, -80, -80, /* 450-45f */
+};
+
+/* Upper case range - Extended cyrillic */
+static signed char UniCaseRangeU0490[61] = {
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 490-49f */
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4a0-4af */
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4b0-4bf */
+ 0, 0, -1, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1,
+};
+
+/* Upper case range - Extended latin and greek */
+static signed char UniCaseRangeU1e00[509] = {
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e00-1e0f */
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e10-1e1f */
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e20-1e2f */
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e30-1e3f */
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e40-1e4f */
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e50-1e5f */
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e60-1e6f */
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e70-1e7f */
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e80-1e8f */
+ 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, -59, 0, -1, 0, -1, /* 1e90-1e9f */
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ea0-1eaf */
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1eb0-1ebf */
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ec0-1ecf */
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ed0-1edf */
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ee0-1eef */
+ 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */
+ 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f00-1f0f */
+ 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f10-1f1f */
+ 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f20-1f2f */
+ 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f30-1f3f */
+ 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f40-1f4f */
+ 0, 8, 0, 8, 0, 8, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f50-1f5f */
+ 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f60-1f6f */
+ 74, 74, 86, 86, 86, 86, 100, 100, 0, 0, 112, 112,
+ 126, 126, 0, 0, /* 1f70-1f7f */
+ 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f80-1f8f */
+ 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f90-1f9f */
+ 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fa0-1faf */
+ 8, 8, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fb0-1fbf */
+ 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fc0-1fcf */
+ 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fd0-1fdf */
+ 8, 8, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fe0-1fef */
+ 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+/* Upper case range - Wide latin */
+static signed char UniCaseRangeUff40[27] = {
+ 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
+ -32, -32, -32, -32, -32, /* ff40-ff4f */
+ -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
+};
+
+/*
+ * Upper Case Range
+ */
+const struct UniCaseRange SmbUniUpperRange[] = {
+ {0x03a0, 0x03ce, UniCaseRangeU03a0},
+ {0x0430, 0x045f, UniCaseRangeU0430},
+ {0x0490, 0x04cc, UniCaseRangeU0490},
+ {0x1e00, 0x1ffc, UniCaseRangeU1e00},
+ {0xff40, 0xff5a, UniCaseRangeUff40},
+ {0}
+};
+#endif
+
+#ifndef UNIUPR_NOLOWER
+/*
+ * Latin lower case
+ */
+signed char CifsUniLowerTable[512] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */
+ 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, /* 040-04f */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0,
+ 0, 0, 0, /* 050-05f */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 060-06f */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 070-07f */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, /* 0c0-0cf */
+ 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32,
+ 32, 32, 32, 0, /* 0d0-0df */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0e0-0ef */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0f0-0ff */
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 100-10f */
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 110-11f */
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 120-12f */
+ 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, /* 130-13f */
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, /* 140-14f */
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 150-15f */
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 160-16f */
+ 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0,
+ 0, /* 170-17f */
+ 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 79,
+ 0, /* 180-18f */
+ 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, /* 190-19f */
+ 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, /* 1a0-1af */
+ 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, /* 1b0-1bf */
+ 0, 0, 0, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 1, 0, 1, /* 1c0-1cf */
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, /* 1d0-1df */
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e0-1ef */
+ 0, 2, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1f0-1ff */
+};
+
+/* Lower case range - Greek */
+static signed char UniCaseRangeL0380[44] = {
+ 0, 0, 0, 0, 0, 0, 38, 0, 37, 37, 37, 0, 64, 0, 63, 63, /* 380-38f */
+ 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, /* 390-39f */
+ 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+};
+
+/* Lower case range - Cyrillic */
+static signed char UniCaseRangeL0400[48] = {
+ 0, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
+ 0, 80, 80, /* 400-40f */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, /* 410-41f */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, /* 420-42f */
+};
+
+/* Lower case range - Extended cyrillic */
+static signed char UniCaseRangeL0490[60] = {
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 490-49f */
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 4a0-4af */
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 4b0-4bf */
+ 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
+};
+
+/* Lower case range - Extended latin and greek */
+static signed char UniCaseRangeL1e00[504] = {
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e00-1e0f */
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e10-1e1f */
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e20-1e2f */
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e30-1e3f */
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e40-1e4f */
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e50-1e5f */
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e60-1e6f */
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e70-1e7f */
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e80-1e8f */
+ 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, /* 1e90-1e9f */
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ea0-1eaf */
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1eb0-1ebf */
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ec0-1ecf */
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ed0-1edf */
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ee0-1eef */
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */
+ 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f00-1f0f */
+ 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, 0, 0, /* 1f10-1f1f */
+ 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f20-1f2f */
+ 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f30-1f3f */
+ 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, 0, 0, /* 1f40-1f4f */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, -8, 0, -8, 0, -8, 0, -8, /* 1f50-1f5f */
+ 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f60-1f6f */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f70-1f7f */
+ 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f80-1f8f */
+ 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f90-1f9f */
+ 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1fa0-1faf */
+ 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -74, -74, -9, 0, 0, 0, /* 1fb0-1fbf */
+ 0, 0, 0, 0, 0, 0, 0, 0, -86, -86, -86, -86, -9, 0,
+ 0, 0, /* 1fc0-1fcf */
+ 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -100, -100, 0, 0, 0, 0, /* 1fd0-1fdf */
+ 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -112, -112, -7, 0,
+ 0, 0, /* 1fe0-1fef */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+/* Lower case range - Wide latin */
+static signed char UniCaseRangeLff20[27] = {
+ 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, /* ff20-ff2f */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+};
+
+/*
+ * Lower Case Range
+ */
+const struct UniCaseRange CifsUniLowerRange[] = {
+ {0x0380, 0x03ab, UniCaseRangeL0380},
+ {0x0400, 0x042f, UniCaseRangeL0400},
+ {0x0490, 0x04cb, UniCaseRangeL0490},
+ {0x1e00, 0x1ff7, UniCaseRangeL1e00},
+ {0xff20, 0xff3a, UniCaseRangeLff20},
+ {0}
+};
+#endif
+
+#endif /* __KSMBD_UNIUPR_H */
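
The case conversion is two-level: code points below 512 use the flat signed-delta tables above directly, while higher code points are matched against the {start, end, table} ranges and offset by the per-character delta. A small sketch, assuming the UniToupper() helper from the companion unicode header:

	/*
	 * 0x0430 (Cyrillic small 'а') is >= 512, so the flat table is skipped;
	 * the {0x0430, 0x045f} range matches and the delta is
	 * UniCaseRangeU0430[0x0430 - 0x0430] == -32, giving 0x0410 ('А').
	 */
	wchar_t upper = UniToupper(0x0430);	/* upper == 0x0410 */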
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
new file mode 100644
index 000000000000..aee28ee6b19c
--- /dev/null
+++ b/fs/ksmbd/vfs.c
@@ -0,0 +1,1895 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org>
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/backing-dev.h>
+#include <linux/writeback.h>
+#include <linux/xattr.h>
+#include <linux/falloc.h>
+#include <linux/genhd.h>
+#include <linux/fsnotify.h>
+#include <linux/dcache.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/sched/xacct.h>
+#include <linux/crc32c.h>
+
+#include "glob.h"
+#include "oplock.h"
+#include "connection.h"
+#include "vfs.h"
+#include "vfs_cache.h"
+#include "smbacl.h"
+#include "ndr.h"
+#include "auth.h"
+#include "misc.h"
+
+#include "smb_common.h"
+#include "mgmt/share_config.h"
+#include "mgmt/tree_connect.h"
+#include "mgmt/user_session.h"
+#include "mgmt/user_config.h"
+
+static char *extract_last_component(char *path)
+{
+ char *p = strrchr(path, '/');
+
+ if (p && p[1] != '\0') {
+ *p = '\0';
+ p++;
+ } else {
+ p = NULL;
+ pr_err("Invalid path %s\n", path);
+ }
+ return p;
+}
+
+static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work,
+ struct inode *parent_inode,
+ struct inode *inode)
+{
+ if (!test_share_config_flag(work->tcon->share_conf,
+ KSMBD_SHARE_FLAG_INHERIT_OWNER))
+ return;
+
+ i_uid_write(inode, i_uid_read(parent_inode));
+}
+
+/**
+ * ksmbd_vfs_lock_parent() - lock parent dentry if it is stable
+ *
+ * The parent dentry obtained via dget_parent() (or passed in as @parent)
+ * may be unstable, so lock the parent inode and look up the child
+ * dentry again to confirm it is still the same dentry.
+ *
+ * The reference count of @parent is not incremented.
+ */
+int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child)
+{
+ struct dentry *dentry;
+ int ret = 0;
+
+ inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
+ dentry = lookup_one_len(child->d_name.name, parent,
+ child->d_name.len);
+ if (IS_ERR(dentry)) {
+ ret = PTR_ERR(dentry);
+ goto out_err;
+ }
+
+ if (dentry != child) {
+ ret = -ESTALE;
+ dput(dentry);
+ goto out_err;
+ }
+
+ dput(dentry);
+ return 0;
+out_err:
+ inode_unlock(d_inode(parent));
+ return ret;
+}
+
+int ksmbd_vfs_may_delete(struct user_namespace *user_ns,
+ struct dentry *dentry)
+{
+ struct dentry *parent;
+ int ret;
+
+ parent = dget_parent(dentry);
+ ret = ksmbd_vfs_lock_parent(parent, dentry);
+ if (ret) {
+ dput(parent);
+ return ret;
+ }
+
+ ret = inode_permission(user_ns, d_inode(parent),
+ MAY_EXEC | MAY_WRITE);
+
+ inode_unlock(d_inode(parent));
+ dput(parent);
+ return ret;
+}
+
+int ksmbd_vfs_query_maximal_access(struct user_namespace *user_ns,
+ struct dentry *dentry, __le32 *daccess)
+{
+ struct dentry *parent;
+ int ret = 0;
+
+ *daccess = cpu_to_le32(FILE_READ_ATTRIBUTES | READ_CONTROL);
+
+ if (!inode_permission(user_ns, d_inode(dentry), MAY_OPEN | MAY_WRITE))
+ *daccess |= cpu_to_le32(WRITE_DAC | WRITE_OWNER | SYNCHRONIZE |
+ FILE_WRITE_DATA | FILE_APPEND_DATA |
+ FILE_WRITE_EA | FILE_WRITE_ATTRIBUTES |
+ FILE_DELETE_CHILD);
+
+ if (!inode_permission(user_ns, d_inode(dentry), MAY_OPEN | MAY_READ))
+ *daccess |= FILE_READ_DATA_LE | FILE_READ_EA_LE;
+
+ if (!inode_permission(user_ns, d_inode(dentry), MAY_OPEN | MAY_EXEC))
+ *daccess |= FILE_EXECUTE_LE;
+
+ parent = dget_parent(dentry);
+ ret = ksmbd_vfs_lock_parent(parent, dentry);
+ if (ret) {
+ dput(parent);
+ return ret;
+ }
+
+ if (!inode_permission(user_ns, d_inode(parent), MAY_EXEC | MAY_WRITE))
+ *daccess |= FILE_DELETE_LE;
+
+ inode_unlock(d_inode(parent));
+ dput(parent);
+ return ret;
+}
+
+/**
+ * ksmbd_vfs_create() - vfs helper for smb create file
+ * @work: work
+ * @name: file name
+ * @mode: file create mode
+ *
+ * Return: 0 on success, otherwise error
+ */
+int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode)
+{
+ struct path path;
+ struct dentry *dentry;
+ int err;
+
+ dentry = kern_path_create(AT_FDCWD, name, &path, 0);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ if (err != -ENOENT)
+ pr_err("path create failed for %s, err %d\n",
+ name, err);
+ return err;
+ }
+
+ mode |= S_IFREG;
+ err = vfs_create(mnt_user_ns(path.mnt), d_inode(path.dentry),
+ dentry, mode, true);
+ if (!err) {
+ ksmbd_vfs_inherit_owner(work, d_inode(path.dentry),
+ d_inode(dentry));
+ } else {
+ pr_err("File(%s): creation failed (err:%d)\n", name, err);
+ }
+ done_path_create(&path, dentry);
+ return err;
+}
+
+/**
+ * ksmbd_vfs_mkdir() - vfs helper for smb create directory
+ * @work: work
+ * @name: directory name
+ * @mode: directory create mode
+ *
+ * Return: 0 on success, otherwise error
+ */
+int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode)
+{
+ struct path path;
+ struct dentry *dentry;
+ int err;
+
+ dentry = kern_path_create(AT_FDCWD, name, &path, LOOKUP_DIRECTORY);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ if (err != -EEXIST)
+ ksmbd_debug(VFS, "path create failed for %s, err %d\n",
+ name, err);
+ return err;
+ }
+
+ mode |= S_IFDIR;
+ err = vfs_mkdir(mnt_user_ns(path.mnt), d_inode(path.dentry),
+ dentry, mode);
+ if (err) {
+ goto out;
+ } else if (d_unhashed(dentry)) {
+ struct dentry *d;
+
+ d = lookup_one_len(dentry->d_name.name, dentry->d_parent,
+ dentry->d_name.len);
+ if (IS_ERR(d)) {
+ err = PTR_ERR(d);
+ goto out;
+ }
+ if (unlikely(d_is_negative(d))) {
+ dput(d);
+ err = -ENOENT;
+ goto out;
+ }
+
+ ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(d));
+ dput(d);
+ }
+out:
+ done_path_create(&path, dentry);
+ if (err)
+ pr_err("mkdir(%s): creation failed (err:%d)\n", name, err);
+ return err;
+}
+
+static ssize_t ksmbd_vfs_getcasexattr(struct user_namespace *user_ns,
+ struct dentry *dentry, char *attr_name,
+ int attr_name_len, char **attr_value)
+{
+ char *name, *xattr_list = NULL;
+ ssize_t value_len = -ENOENT, xattr_list_len;
+
+ xattr_list_len = ksmbd_vfs_listxattr(dentry, &xattr_list);
+ if (xattr_list_len <= 0)
+ goto out;
+
+ for (name = xattr_list; name - xattr_list < xattr_list_len;
+ name += strlen(name) + 1) {
+ ksmbd_debug(VFS, "%s, len %zd\n", name, strlen(name));
+ if (strncasecmp(attr_name, name, attr_name_len))
+ continue;
+
+ value_len = ksmbd_vfs_getxattr(user_ns,
+ dentry,
+ name,
+ attr_value);
+ if (value_len < 0)
+ pr_err("failed to get xattr in file\n");
+ break;
+ }
+
+out:
+ kvfree(xattr_list);
+ return value_len;
+}
+
+static int ksmbd_vfs_stream_read(struct ksmbd_file *fp, char *buf, loff_t *pos,
+ size_t count)
+{
+ ssize_t v_len;
+ char *stream_buf = NULL;
+
+ ksmbd_debug(VFS, "read stream data pos : %llu, count : %zd\n",
+ *pos, count);
+
+ v_len = ksmbd_vfs_getcasexattr(file_mnt_user_ns(fp->filp),
+ fp->filp->f_path.dentry,
+ fp->stream.name,
+ fp->stream.size,
+ &stream_buf);
+ if ((int)v_len <= 0)
+ return (int)v_len;
+
+ if (v_len <= *pos) {
+ count = -EINVAL;
+ goto free_buf;
+ }
+
+ if (v_len - *pos < count)
+ count = v_len - *pos;
+
+ memcpy(buf, &stream_buf[*pos], count);
+
+free_buf:
+ kvfree(stream_buf);
+ return count;
+}
+
+/**
+ * check_lock_range() - vfs helper for smb byte range file locking
+ * @filp: the file to check byte-range locks on
+ * @start: lock start byte offset
+ * @end: lock end byte offset
+ * @type: byte range type read/write
+ *
+ * Return: 0 on success, otherwise error
+ */
+static int check_lock_range(struct file *filp, loff_t start, loff_t end,
+ unsigned char type)
+{
+ struct file_lock *flock;
+ struct file_lock_context *ctx = file_inode(filp)->i_flctx;
+ int error = 0;
+
+ if (!ctx || list_empty_careful(&ctx->flc_posix))
+ return 0;
+
+ spin_lock(&ctx->flc_lock);
+ list_for_each_entry(flock, &ctx->flc_posix, fl_list) {
+ /* check conflict locks */
+ if (flock->fl_end >= start && end >= flock->fl_start) {
+ if (flock->fl_type == F_RDLCK) {
+ if (type == WRITE) {
+ pr_err("not allow write by shared lock\n");
+ error = 1;
+ goto out;
+ }
+ } else if (flock->fl_type == F_WRLCK) {
+ /* check owner in lock */
+ if (flock->fl_file != filp) {
+ error = 1;
+ pr_err("not allow rw access by exclusive lock from other opens\n");
+ goto out;
+ }
+ }
+ }
+ }
+out:
+ spin_unlock(&ctx->flc_lock);
+ return error;
+}
+
+/**
+ * ksmbd_vfs_read() - vfs helper for smb file read
+ * @work: smb work
+ * @fp: ksmbd file pointer of the open file
+ * @count: read byte count
+ * @pos: file pos
+ *
+ * Return: number of read bytes on success, otherwise error
+ */
+int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp, size_t count,
+ loff_t *pos)
+{
+ struct file *filp = fp->filp;
+ ssize_t nbytes = 0;
+ char *rbuf = work->aux_payload_buf;
+ struct inode *inode = file_inode(filp);
+
+ if (S_ISDIR(inode->i_mode))
+ return -EISDIR;
+
+ if (unlikely(count == 0))
+ return 0;
+
+ if (work->conn->connection_type) {
+ if (!(fp->daccess & (FILE_READ_DATA_LE | FILE_EXECUTE_LE))) {
+ pr_err("no right to read(%pd)\n",
+ fp->filp->f_path.dentry);
+ return -EACCES;
+ }
+ }
+
+ if (ksmbd_stream_fd(fp))
+ return ksmbd_vfs_stream_read(fp, rbuf, pos, count);
+
+ if (!work->tcon->posix_extensions) {
+ int ret;
+
+ ret = check_lock_range(filp, *pos, *pos + count - 1, READ);
+ if (ret) {
+ pr_err("unable to read due to lock\n");
+ return -EAGAIN;
+ }
+ }
+
+ nbytes = kernel_read(filp, rbuf, count, pos);
+ if (nbytes < 0) {
+ pr_err("smb read failed for (%s), err = %zd\n",
+ fp->filename, nbytes);
+ return nbytes;
+ }
+
+ filp->f_pos = *pos;
+ return nbytes;
+}
+
+static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos,
+ size_t count)
+{
+ char *stream_buf = NULL, *wbuf;
+ struct user_namespace *user_ns = file_mnt_user_ns(fp->filp);
+ size_t size, v_len;
+ int err = 0;
+
+ ksmbd_debug(VFS, "write stream data pos : %llu, count : %zd\n",
+ *pos, count);
+
+ size = *pos + count;
+ if (size > XATTR_SIZE_MAX) {
+ size = XATTR_SIZE_MAX;
+ count = (*pos + count) - XATTR_SIZE_MAX;
+ }
+
+ v_len = ksmbd_vfs_getcasexattr(user_ns,
+ fp->filp->f_path.dentry,
+ fp->stream.name,
+ fp->stream.size,
+ &stream_buf);
+ if ((int)v_len < 0) {
+ pr_err("not found stream in xattr : %zd\n", v_len);
+ err = (int)v_len;
+ goto out;
+ }
+
+ if (v_len < size) {
+ wbuf = kvmalloc(size, GFP_KERNEL | __GFP_ZERO);
+ if (!wbuf) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ if (v_len > 0)
+ memcpy(wbuf, stream_buf, v_len);
+ kvfree(stream_buf);
+ stream_buf = wbuf;
+ }
+
+ memcpy(&stream_buf[*pos], buf, count);
+
+ err = ksmbd_vfs_setxattr(user_ns,
+ fp->filp->f_path.dentry,
+ fp->stream.name,
+ (void *)stream_buf,
+ size,
+ 0);
+ if (err < 0)
+ goto out;
+
+ fp->filp->f_pos = *pos;
+ err = 0;
+out:
+ kvfree(stream_buf);
+ return err;
+}
+
+/**
+ * ksmbd_vfs_write() - vfs helper for smb file write
+ * @work: work
+ * @fp: ksmbd file pointer of the open file
+ * @buf: buffer containing the data to write
+ * @count: byte count to write
+ * @pos: file pos
+ * @sync: fsync after write
+ * @written: number of bytes written
+ *
+ * Return: 0 on success, otherwise error
+ */
+int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp,
+ char *buf, size_t count, loff_t *pos, bool sync,
+ ssize_t *written)
+{
+ struct ksmbd_session *sess = work->sess;
+ struct file *filp;
+ loff_t offset = *pos;
+ int err = 0;
+
+ if (sess->conn->connection_type) {
+ if (!(fp->daccess & FILE_WRITE_DATA_LE)) {
+ pr_err("no right to write(%pd)\n",
+ fp->filp->f_path.dentry);
+ err = -EACCES;
+ goto out;
+ }
+ }
+
+ filp = fp->filp;
+
+ if (ksmbd_stream_fd(fp)) {
+ err = ksmbd_vfs_stream_write(fp, buf, pos, count);
+ if (!err)
+ *written = count;
+ goto out;
+ }
+
+ if (!work->tcon->posix_extensions) {
+ err = check_lock_range(filp, *pos, *pos + count - 1, WRITE);
+ if (err) {
+ pr_err("unable to write due to lock\n");
+ err = -EAGAIN;
+ goto out;
+ }
+ }
+
+ /* Do we need to break any of a levelII oplock? */
+ smb_break_all_levII_oplock(work, fp, 1);
+
+ err = kernel_write(filp, buf, count, pos);
+ if (err < 0) {
+ ksmbd_debug(VFS, "smb write failed, err = %d\n", err);
+ goto out;
+ }
+
+ filp->f_pos = *pos;
+ *written = err;
+ err = 0;
+ if (sync) {
+ err = vfs_fsync_range(filp, offset, offset + *written, 0);
+ if (err < 0)
+ pr_err("fsync failed for filename = %pd, err = %d\n",
+ fp->filp->f_path.dentry, err);
+ }
+
+out:
+ return err;
+}
+
+/**
+ * ksmbd_vfs_getattr() - vfs helper for smb getattr
+ * @path: path of the file to stat
+ * @stat: destination kstat buffer
+ *
+ * Return: 0 on success, otherwise error
+ */
+int ksmbd_vfs_getattr(struct path *path, struct kstat *stat)
+{
+ int err;
+
+ err = vfs_getattr(path, stat, STATX_BTIME, AT_STATX_SYNC_AS_STAT);
+ if (err)
+ pr_err("getattr failed, err %d\n", err);
+ return err;
+}
+
+/**
+ * ksmbd_vfs_fsync() - vfs helper for smb fsync
+ * @work: work
+ * @fid: volatile file id of the open file
+ * @p_id: persistent file id of the open file
+ *
+ * Return: 0 on success, otherwise error
+ */
+int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id)
+{
+ struct ksmbd_file *fp;
+ int err;
+
+ fp = ksmbd_lookup_fd_slow(work, fid, p_id);
+ if (!fp) {
+ pr_err("failed to get filp for fid %llu\n", fid);
+ return -ENOENT;
+ }
+ err = vfs_fsync(fp->filp, 0);
+ if (err < 0)
+ pr_err("smb fsync failed, err = %d\n", err);
+ ksmbd_fd_put(work, fp);
+ return err;
+}
+
+/**
+ * ksmbd_vfs_remove_file() - vfs helper for smb rmdir or unlink
+ * @name: absolute directory or file name
+ *
+ * Return: 0 on success, otherwise error
+ */
+int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name)
+{
+ struct path path;
+ struct dentry *parent;
+ int err;
+ int flags = 0;
+
+ if (ksmbd_override_fsids(work))
+ return -ENOMEM;
+
+ if (test_share_config_flag(work->tcon->share_conf,
+ KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS))
+ flags = LOOKUP_FOLLOW;
+
+ err = kern_path(name, flags, &path);
+ if (err) {
+ ksmbd_debug(VFS, "can't get %s, err %d\n", name, err);
+ ksmbd_revert_fsids(work);
+ return err;
+ }
+
+ parent = dget_parent(path.dentry);
+ err = ksmbd_vfs_lock_parent(parent, path.dentry);
+ if (err) {
+ dput(parent);
+ path_put(&path);
+ ksmbd_revert_fsids(work);
+ return err;
+ }
+
+ if (!d_inode(path.dentry)->i_nlink) {
+ err = -ENOENT;
+ goto out_err;
+ }
+
+ if (S_ISDIR(d_inode(path.dentry)->i_mode)) {
+ err = vfs_rmdir(mnt_user_ns(path.mnt), d_inode(parent),
+ path.dentry);
+ if (err && err != -ENOTEMPTY)
+ ksmbd_debug(VFS, "%s: rmdir failed, err %d\n", name,
+ err);
+ } else {
+ err = vfs_unlink(mnt_user_ns(path.mnt), d_inode(parent),
+ path.dentry, NULL);
+ if (err)
+ ksmbd_debug(VFS, "%s: unlink failed, err %d\n", name,
+ err);
+ }
+
+out_err:
+ inode_unlock(d_inode(parent));
+ dput(parent);
+ path_put(&path);
+ ksmbd_revert_fsids(work);
+ return err;
+}
+
+/**
+ * ksmbd_vfs_link() - vfs helper for creating smb hardlink
+ * @oldname: source file name
+ * @newname: hardlink name
+ *
+ * Return: 0 on success, otherwise error
+ */
+int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname,
+ const char *newname)
+{
+ struct path oldpath, newpath;
+ struct dentry *dentry;
+ int err;
+ int flags = 0;
+
+ if (ksmbd_override_fsids(work))
+ return -ENOMEM;
+
+ if (test_share_config_flag(work->tcon->share_conf,
+ KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS))
+ flags = LOOKUP_FOLLOW;
+
+ err = kern_path(oldname, flags, &oldpath);
+ if (err) {
+ pr_err("cannot get linux path for %s, err = %d\n",
+ oldname, err);
+ goto out1;
+ }
+
+ dentry = kern_path_create(AT_FDCWD, newname, &newpath,
+ flags | LOOKUP_REVAL);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ pr_err("path create err for %s, err %d\n", newname, err);
+ goto out2;
+ }
+
+ err = -EXDEV;
+ if (oldpath.mnt != newpath.mnt) {
+ pr_err("vfs_link failed err %d\n", err);
+ goto out3;
+ }
+
+ err = vfs_link(oldpath.dentry, mnt_user_ns(newpath.mnt),
+ d_inode(newpath.dentry),
+ dentry, NULL);
+ if (err)
+ ksmbd_debug(VFS, "vfs_link failed err %d\n", err);
+
+out3:
+ done_path_create(&newpath, dentry);
+out2:
+ path_put(&oldpath);
+out1:
+ ksmbd_revert_fsids(work);
+ return err;
+}
+
+static int ksmbd_validate_entry_in_use(struct dentry *src_dent)
+{
+ struct dentry *dst_dent;
+
+ spin_lock(&src_dent->d_lock);
+ list_for_each_entry(dst_dent, &src_dent->d_subdirs, d_child) {
+ struct ksmbd_file *child_fp;
+
+ if (d_really_is_negative(dst_dent))
+ continue;
+
+ child_fp = ksmbd_lookup_fd_inode(d_inode(dst_dent));
+ if (child_fp) {
+ spin_unlock(&src_dent->d_lock);
+ ksmbd_debug(VFS, "Forbid rename, sub file/dir is in use\n");
+ return -EACCES;
+ }
+ }
+ spin_unlock(&src_dent->d_lock);
+
+ return 0;
+}
+
+static int __ksmbd_vfs_rename(struct ksmbd_work *work,
+ struct user_namespace *src_user_ns,
+ struct dentry *src_dent_parent,
+ struct dentry *src_dent,
+ struct user_namespace *dst_user_ns,
+ struct dentry *dst_dent_parent,
+ struct dentry *trap_dent,
+ char *dst_name)
+{
+ struct dentry *dst_dent;
+ int err;
+
+ if (!work->tcon->posix_extensions) {
+ err = ksmbd_validate_entry_in_use(src_dent);
+ if (err)
+ return err;
+ }
+
+ if (d_really_is_negative(src_dent_parent))
+ return -ENOENT;
+ if (d_really_is_negative(dst_dent_parent))
+ return -ENOENT;
+ if (d_really_is_negative(src_dent))
+ return -ENOENT;
+ if (src_dent == trap_dent)
+ return -EINVAL;
+
+ if (ksmbd_override_fsids(work))
+ return -ENOMEM;
+
+ dst_dent = lookup_one_len(dst_name, dst_dent_parent, strlen(dst_name));
+ err = PTR_ERR(dst_dent);
+ if (IS_ERR(dst_dent)) {
+ pr_err("lookup failed %s [%d]\n", dst_name, err);
+ goto out;
+ }
+
+ err = -ENOTEMPTY;
+ if (dst_dent != trap_dent && !d_really_is_positive(dst_dent)) {
+ struct renamedata rd = {
+ .old_mnt_userns = src_user_ns,
+ .old_dir = d_inode(src_dent_parent),
+ .old_dentry = src_dent,
+ .new_mnt_userns = dst_user_ns,
+ .new_dir = d_inode(dst_dent_parent),
+ .new_dentry = dst_dent,
+ };
+ err = vfs_rename(&rd);
+ }
+ if (err)
+ pr_err("vfs_rename failed err %d\n", err);
+ if (dst_dent)
+ dput(dst_dent);
+out:
+ ksmbd_revert_fsids(work);
+ return err;
+}
+
+int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
+ char *newname)
+{
+ struct path dst_path;
+ struct dentry *src_dent_parent, *dst_dent_parent;
+ struct dentry *src_dent, *trap_dent, *src_child;
+ char *dst_name;
+ int err;
+ int flags;
+
+ dst_name = extract_last_component(newname);
+ if (!dst_name)
+ return -EINVAL;
+
+ src_dent_parent = dget_parent(fp->filp->f_path.dentry);
+ src_dent = fp->filp->f_path.dentry;
+
+ flags = LOOKUP_DIRECTORY;
+ if (test_share_config_flag(work->tcon->share_conf,
+ KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS))
+ flags |= LOOKUP_FOLLOW;
+
+ err = kern_path(newname, flags, &dst_path);
+ if (err) {
+ ksmbd_debug(VFS, "Cannot get path for %s [%d]\n", newname, err);
+ goto out;
+ }
+ dst_dent_parent = dst_path.dentry;
+
+ trap_dent = lock_rename(src_dent_parent, dst_dent_parent);
+ dget(src_dent);
+ dget(dst_dent_parent);
+ src_child = lookup_one_len(src_dent->d_name.name, src_dent_parent,
+ src_dent->d_name.len);
+ if (IS_ERR(src_child)) {
+ err = PTR_ERR(src_child);
+ goto out_lock;
+ }
+
+ if (src_child != src_dent) {
+ err = -ESTALE;
+ dput(src_child);
+ goto out_lock;
+ }
+ dput(src_child);
+
+ err = __ksmbd_vfs_rename(work,
+ file_mnt_user_ns(fp->filp),
+ src_dent_parent,
+ src_dent,
+ mnt_user_ns(dst_path.mnt),
+ dst_dent_parent,
+ trap_dent,
+ dst_name);
+out_lock:
+ dput(src_dent);
+ dput(dst_dent_parent);
+ unlock_rename(src_dent_parent, dst_dent_parent);
+ path_put(&dst_path);
+out:
+ dput(src_dent_parent);
+ return err;
+}
+
+/**
+ * ksmbd_vfs_truncate() - vfs helper for smb file truncate
+ * @work: work
+ * @name: file name to truncate (NULL when truncating via @fp)
+ * @fp: ksmbd file pointer of the open file (used when @name is NULL)
+ * @size: truncate to given size
+ *
+ * Return: 0 on success, otherwise error
+ */
+int ksmbd_vfs_truncate(struct ksmbd_work *work, const char *name,
+ struct ksmbd_file *fp, loff_t size)
+{
+ struct path path;
+ int err = 0;
+
+ if (name) {
+ err = kern_path(name, 0, &path);
+ if (err) {
+ pr_err("cannot get linux path for %s, err %d\n",
+ name, err);
+ return err;
+ }
+ err = vfs_truncate(&path, size);
+ if (err)
+ pr_err("truncate failed for %s err %d\n",
+ name, err);
+ path_put(&path);
+ } else {
+ struct file *filp;
+
+ filp = fp->filp;
+
+ /* Do we need to break any of a levelII oplock? */
+ smb_break_all_levII_oplock(work, fp, 1);
+
+ if (!work->tcon->posix_extensions) {
+ struct inode *inode = file_inode(filp);
+
+ if (size < inode->i_size) {
+ err = check_lock_range(filp, size,
+ inode->i_size - 1, WRITE);
+ } else {
+ err = check_lock_range(filp, inode->i_size,
+ size - 1, WRITE);
+ }
+
+ if (err) {
+ pr_err("failed due to lock\n");
+ return -EAGAIN;
+ }
+ }
+
+ err = vfs_truncate(&filp->f_path, size);
+ if (err)
+ pr_err("truncate failed for filename : %s err %d\n",
+ fp->filename, err);
+ }
+
+ return err;
+}
+
+/**
+ * ksmbd_vfs_listxattr() - vfs helper for smb list extended attributes
+ * @dentry: dentry of file for listing xattrs
+ * @list: on success, set to a kvmalloc'ed buffer holding the xattr name list
+ *
+ * Return: xattr list length on success, otherwise error
+ */
+ssize_t ksmbd_vfs_listxattr(struct dentry *dentry, char **list)
+{
+ ssize_t size;
+ char *vlist = NULL;
+
+ size = vfs_listxattr(dentry, NULL, 0);
+ if (size <= 0)
+ return size;
+
+ vlist = kvmalloc(size, GFP_KERNEL | __GFP_ZERO);
+ if (!vlist)
+ return -ENOMEM;
+
+ *list = vlist;
+ size = vfs_listxattr(dentry, vlist, size);
+ if (size < 0) {
+ ksmbd_debug(VFS, "listxattr failed\n");
+ kvfree(vlist);
+ *list = NULL;
+ }
+
+ return size;
+}
+
+static ssize_t ksmbd_vfs_xattr_len(struct user_namespace *user_ns,
+ struct dentry *dentry, char *xattr_name)
+{
+ return vfs_getxattr(user_ns, dentry, xattr_name, NULL, 0);
+}
+
+/**
+ * ksmbd_vfs_getxattr() - vfs helper for smb get extended attributes value
+ * @user_ns: user namespace
+ * @dentry: dentry of file for getting xattrs
+ * @xattr_name: name of xattr name to query
+ * @xattr_buf: destination buffer xattr value
+ *
+ * Return: read xattr value length on success, otherwise error
+ */
+ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns,
+ struct dentry *dentry,
+ char *xattr_name, char **xattr_buf)
+{
+ ssize_t xattr_len;
+ char *buf;
+
+ *xattr_buf = NULL;
+ xattr_len = ksmbd_vfs_xattr_len(user_ns, dentry, xattr_name);
+ if (xattr_len < 0)
+ return xattr_len;
+
+ buf = kmalloc(xattr_len + 1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ xattr_len = vfs_getxattr(user_ns, dentry, xattr_name,
+ (void *)buf, xattr_len);
+ if (xattr_len > 0)
+ *xattr_buf = buf;
+ else
+ kfree(buf);
+ return xattr_len;
+}
+
+/**
+ * ksmbd_vfs_setxattr() - vfs helper for smb set extended attributes value
+ * @user_ns: user namespace
+ * @dentry: dentry to set XATTR at
+ * @attr_name: xattr name to set
+ * @attr_value: xattr value to set
+ * @attr_size: size of the xattr value
+ * @flags: setxattr flags (XATTR_CREATE, XATTR_REPLACE or 0)
+ *
+ * Return: 0 on success, otherwise error
+ */
+int ksmbd_vfs_setxattr(struct user_namespace *user_ns,
+ struct dentry *dentry, const char *attr_name,
+ const void *attr_value, size_t attr_size, int flags)
+{
+ int err;
+
+ err = vfs_setxattr(user_ns,
+ dentry,
+ attr_name,
+ attr_value,
+ attr_size,
+ flags);
+ if (err)
+ ksmbd_debug(VFS, "setxattr failed, err %d\n", err);
+ return err;
+}
+
+/**
+ * ksmbd_vfs_set_fadvise() - convert smb IO caching options to linux options
+ * @filp: file pointer for IO
+ * @option: smb IO options
+ */
+void ksmbd_vfs_set_fadvise(struct file *filp, __le32 option)
+{
+ struct address_space *mapping;
+
+ mapping = filp->f_mapping;
+
+ if (!option || !mapping)
+ return;
+
+ if (option & FILE_WRITE_THROUGH_LE) {
+ filp->f_flags |= O_SYNC;
+ } else if (option & FILE_SEQUENTIAL_ONLY_LE) {
+ filp->f_ra.ra_pages = inode_to_bdi(mapping->host)->ra_pages * 2;
+ spin_lock(&filp->f_lock);
+ filp->f_mode &= ~FMODE_RANDOM;
+ spin_unlock(&filp->f_lock);
+ } else if (option & FILE_RANDOM_ACCESS_LE) {
+ spin_lock(&filp->f_lock);
+ filp->f_mode |= FMODE_RANDOM;
+ spin_unlock(&filp->f_lock);
+ }
+}
+
+int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp,
+ loff_t off, loff_t len)
+{
+ smb_break_all_levII_oplock(work, fp, 1);
+ if (fp->f_ci->m_fattr & ATTR_SPARSE_FILE_LE)
+ return vfs_fallocate(fp->filp,
+ FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+ off, len);
+
+ return vfs_fallocate(fp->filp, FALLOC_FL_ZERO_RANGE, off, len);
+}
+
+int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
+ struct file_allocated_range_buffer *ranges,
+ int in_count, int *out_count)
+{
+ struct file *f = fp->filp;
+ struct inode *inode = file_inode(fp->filp);
+ loff_t maxbytes = (u64)inode->i_sb->s_maxbytes, end;
+ loff_t extent_start, extent_end;
+ int ret = 0;
+
+ if (start > maxbytes)
+ return -EFBIG;
+
+ if (!in_count)
+ return 0;
+
+ /*
+ * Shrink request scope to what the fs can actually handle.
+ */
+ if (length > maxbytes || (maxbytes - length) < start)
+ length = maxbytes - start;
+
+ if (start + length > inode->i_size)
+ length = inode->i_size - start;
+
+ *out_count = 0;
+ end = start + length;
+ while (start < end && *out_count < in_count) {
+ extent_start = f->f_op->llseek(f, start, SEEK_DATA);
+ if (extent_start < 0) {
+ if (extent_start != -ENXIO)
+ ret = (int)extent_start;
+ break;
+ }
+
+ if (extent_start >= end)
+ break;
+
+ extent_end = f->f_op->llseek(f, extent_start, SEEK_HOLE);
+ if (extent_end < 0) {
+ if (extent_end != -ENXIO)
+ ret = (int)extent_end;
+ break;
+ } else if (extent_start >= extent_end) {
+ break;
+ }
+
+ ranges[*out_count].file_offset = cpu_to_le64(extent_start);
+ ranges[(*out_count)++].length =
+ cpu_to_le64(min(extent_end, end) - extent_start);
+
+ start = extent_end;
+ }
+
+ return ret;
+}
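
As a worked example (hypothetical file layout, not taken from the patch): for a sparse file whose data lives at [0, 4 KiB) and [1 MiB, 1 MiB + 8 KiB) with i_size at the end of the second extent, a query for start 0 and length 2 MiB is first clamped to i_size and then walked with SEEK_DATA/SEEK_HOLE:

	struct file_allocated_range_buffer ranges[2];
	int out = 0;

	ksmbd_vfs_fqar_lseek(fp, 0, 2 * 1024 * 1024, ranges, 2, &out);
	/*
	 * out == 2; ranges[0] covers offset 0, length 4096 and ranges[1]
	 * covers offset 1048576, length 8192 (both stored little-endian).
	 */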
+
+int ksmbd_vfs_remove_xattr(struct user_namespace *user_ns,
+ struct dentry *dentry, char *attr_name)
+{
+ return vfs_removexattr(user_ns, dentry, attr_name);
+}
+
+int ksmbd_vfs_unlink(struct user_namespace *user_ns,
+ struct dentry *dir, struct dentry *dentry)
+{
+ int err = 0;
+
+ err = ksmbd_vfs_lock_parent(dir, dentry);
+ if (err)
+ return err;
+ dget(dentry);
+
+ if (S_ISDIR(d_inode(dentry)->i_mode))
+ err = vfs_rmdir(user_ns, d_inode(dir), dentry);
+ else
+ err = vfs_unlink(user_ns, d_inode(dir), dentry, NULL);
+
+ dput(dentry);
+ inode_unlock(d_inode(dir));
+ if (err)
+ ksmbd_debug(VFS, "failed to delete, err %d\n", err);
+
+ return err;
+}
+
+static int __dir_empty(struct dir_context *ctx, const char *name, int namlen,
+ loff_t offset, u64 ino, unsigned int d_type)
+{
+ struct ksmbd_readdir_data *buf;
+
+ buf = container_of(ctx, struct ksmbd_readdir_data, ctx);
+ buf->dirent_count++;
+
+ if (buf->dirent_count > 2)
+ return -ENOTEMPTY;
+ return 0;
+}
+
+/**
+ * ksmbd_vfs_empty_dir() - check for empty directory
+ * @fp: ksmbd file pointer
+ *
+ * Return: 0 if the directory is empty, otherwise -ENOTEMPTY
+ */
+int ksmbd_vfs_empty_dir(struct ksmbd_file *fp)
+{
+ int err;
+ struct ksmbd_readdir_data readdir_data;
+
+ memset(&readdir_data, 0, sizeof(struct ksmbd_readdir_data));
+
+ set_ctx_actor(&readdir_data.ctx, __dir_empty);
+ readdir_data.dirent_count = 0;
+
+ err = iterate_dir(fp->filp, &readdir_data.ctx);
+ if (readdir_data.dirent_count > 2)
+ err = -ENOTEMPTY;
+ else
+ err = 0;
+ return err;
+}
+
+static int __caseless_lookup(struct dir_context *ctx, const char *name,
+ int namlen, loff_t offset, u64 ino,
+ unsigned int d_type)
+{
+ struct ksmbd_readdir_data *buf;
+
+ buf = container_of(ctx, struct ksmbd_readdir_data, ctx);
+
+ if (buf->used != namlen)
+ return 0;
+ if (!strncasecmp((char *)buf->private, name, namlen)) {
+ memcpy((char *)buf->private, name, namlen);
+ buf->dirent_count = 1;
+ return -EEXIST;
+ }
+ return 0;
+}
+
+/**
+ * ksmbd_vfs_lookup_in_dir() - lookup a file in a directory
+ * @dir: path of the directory to search in
+ * @name: filename to lookup
+ * @namelen: filename length
+ *
+ * Return: 0 on success, otherwise error
+ */
+static int ksmbd_vfs_lookup_in_dir(struct path *dir, char *name, size_t namelen)
+{
+ int ret;
+ struct file *dfilp;
+ int flags = O_RDONLY | O_LARGEFILE;
+ struct ksmbd_readdir_data readdir_data = {
+ .ctx.actor = __caseless_lookup,
+ .private = name,
+ .used = namelen,
+ .dirent_count = 0,
+ };
+
+ dfilp = dentry_open(dir, flags, current_cred());
+ if (IS_ERR(dfilp))
+ return PTR_ERR(dfilp);
+
+ ret = iterate_dir(dfilp, &readdir_data.ctx);
+ if (readdir_data.dirent_count > 0)
+ ret = 0;
+ fput(dfilp);
+ return ret;
+}
+
+/**
+ * ksmbd_vfs_kern_path() - lookup a file and get path info
+ * @name: name of file for lookup
+ * @flags: lookup flags
+ * @path: if lookup succeed, return path info
+ * @caseless: caseless filename lookup
+ *
+ * Return: 0 on success, otherwise error
+ */
+int ksmbd_vfs_kern_path(char *name, unsigned int flags, struct path *path,
+ bool caseless)
+{
+ int err;
+
+ if (name[0] != '/')
+ return -EINVAL;
+
+ err = kern_path(name, flags, path);
+ if (!err)
+ return 0;
+
+ if (caseless) {
+ char *filepath;
+ struct path parent;
+ size_t path_len, remain_len;
+
+ filepath = kstrdup(name, GFP_KERNEL);
+ if (!filepath)
+ return -ENOMEM;
+
+ path_len = strlen(filepath);
+ remain_len = path_len - 1;
+
+ err = kern_path("/", flags, &parent);
+ if (err)
+ goto out;
+
+ while (d_can_lookup(parent.dentry)) {
+ char *filename = filepath + path_len - remain_len;
+ char *next = strchrnul(filename, '/');
+ size_t filename_len = next - filename;
+ bool is_last = !next[0];
+
+ if (filename_len == 0)
+ break;
+
+ err = ksmbd_vfs_lookup_in_dir(&parent, filename,
+ filename_len);
+ if (err) {
+ path_put(&parent);
+ goto out;
+ }
+
+ path_put(&parent);
+ next[0] = '\0';
+
+ err = kern_path(filepath, flags, &parent);
+ if (err)
+ goto out;
+
+ if (is_last) {
+ path->mnt = parent.mnt;
+ path->dentry = parent.dentry;
+ goto out;
+ }
+
+ next[0] = '/';
+ remain_len -= filename_len + 1;
+ }
+
+ path_put(&parent);
+ err = -EINVAL;
+out:
+ kfree(filepath);
+ }
+ return err;
+}
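
When the exact-case lookup fails and @caseless is set, the helper walks its private copy of the path from "/" one component at a time, matching each component case-insensitively against its parent directory via iterate_dir() and overwriting the component with the on-disk spelling before descending. A usage sketch (the share layout and names are made up):

	/* Suppose /srv/share/Docs/Readme.TXT exists on disk */
	char name[] = "/srv/share/docs/readme.txt";
	struct path path;
	int err;

	err = ksmbd_vfs_kern_path(name, LOOKUP_FOLLOW, &path, true);
	if (!err) {
		/*
		 * path now refers to the on-disk Docs/Readme.TXT entry; only
		 * the internal kstrdup'ed copy was case-corrected, the
		 * caller's buffer is left untouched.
		 */
		path_put(&path);
	}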
+
+int ksmbd_vfs_remove_acl_xattrs(struct user_namespace *user_ns,
+ struct dentry *dentry)
+{
+ char *name, *xattr_list = NULL;
+ ssize_t xattr_list_len;
+ int err = 0;
+
+ xattr_list_len = ksmbd_vfs_listxattr(dentry, &xattr_list);
+ if (xattr_list_len < 0) {
+ goto out;
+ } else if (!xattr_list_len) {
+ ksmbd_debug(SMB, "empty xattr in the file\n");
+ goto out;
+ }
+
+ for (name = xattr_list; name - xattr_list < xattr_list_len;
+ name += strlen(name) + 1) {
+ ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name));
+
+ if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS,
+ sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1) ||
+ !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT,
+ sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1)) {
+ err = ksmbd_vfs_remove_xattr(user_ns, dentry, name);
+ if (err)
+ ksmbd_debug(SMB,
+ "remove acl xattr failed : %s\n", name);
+ }
+ }
+out:
+ kvfree(xattr_list);
+ return err;
+}
+
+int ksmbd_vfs_remove_sd_xattrs(struct user_namespace *user_ns,
+ struct dentry *dentry)
+{
+ char *name, *xattr_list = NULL;
+ ssize_t xattr_list_len;
+ int err = 0;
+
+ xattr_list_len = ksmbd_vfs_listxattr(dentry, &xattr_list);
+ if (xattr_list_len < 0) {
+ goto out;
+ } else if (!xattr_list_len) {
+ ksmbd_debug(SMB, "empty xattr in the file\n");
+ goto out;
+ }
+
+ for (name = xattr_list; name - xattr_list < xattr_list_len;
+ name += strlen(name) + 1) {
+ ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name));
+
+ if (!strncmp(name, XATTR_NAME_SD, XATTR_NAME_SD_LEN)) {
+ err = ksmbd_vfs_remove_xattr(user_ns, dentry, name);
+ if (err)
+ ksmbd_debug(SMB, "remove xattr failed : %s\n", name);
+ }
+ }
+out:
+ kvfree(xattr_list);
+ return err;
+}
+
+static struct xattr_smb_acl *ksmbd_vfs_make_xattr_posix_acl(struct user_namespace *user_ns,
+ struct inode *inode,
+ int acl_type)
+{
+ struct xattr_smb_acl *smb_acl = NULL;
+ struct posix_acl *posix_acls;
+ struct posix_acl_entry *pa_entry;
+ struct xattr_acl_entry *xa_entry;
+ int i;
+
+ if (!IS_ENABLED(CONFIG_FS_POSIX_ACL))
+ return NULL;
+
+ posix_acls = get_acl(inode, acl_type);
+ if (!posix_acls)
+ return NULL;
+
+ smb_acl = kzalloc(sizeof(struct xattr_smb_acl) +
+ sizeof(struct xattr_acl_entry) * posix_acls->a_count,
+ GFP_KERNEL);
+ if (!smb_acl)
+ goto out;
+
+ smb_acl->count = posix_acls->a_count;
+ pa_entry = posix_acls->a_entries;
+ xa_entry = smb_acl->entries;
+ for (i = 0; i < posix_acls->a_count; i++, pa_entry++, xa_entry++) {
+ switch (pa_entry->e_tag) {
+ case ACL_USER:
+ xa_entry->type = SMB_ACL_USER;
+ xa_entry->uid = from_kuid(user_ns, pa_entry->e_uid);
+ break;
+ case ACL_USER_OBJ:
+ xa_entry->type = SMB_ACL_USER_OBJ;
+ break;
+ case ACL_GROUP:
+ xa_entry->type = SMB_ACL_GROUP;
+ xa_entry->gid = from_kgid(user_ns, pa_entry->e_gid);
+ break;
+ case ACL_GROUP_OBJ:
+ xa_entry->type = SMB_ACL_GROUP_OBJ;
+ break;
+ case ACL_OTHER:
+ xa_entry->type = SMB_ACL_OTHER;
+ break;
+ case ACL_MASK:
+ xa_entry->type = SMB_ACL_MASK;
+ break;
+ default:
+ pr_err("unknown type : 0x%x\n", pa_entry->e_tag);
+ goto out;
+ }
+
+ if (pa_entry->e_perm & ACL_READ)
+ xa_entry->perm |= SMB_ACL_READ;
+ if (pa_entry->e_perm & ACL_WRITE)
+ xa_entry->perm |= SMB_ACL_WRITE;
+ if (pa_entry->e_perm & ACL_EXECUTE)
+ xa_entry->perm |= SMB_ACL_EXECUTE;
+ }
+out:
+ posix_acl_release(posix_acls);
+ return smb_acl;
+}
+
+int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn,
+ struct user_namespace *user_ns,
+ struct dentry *dentry,
+ struct smb_ntsd *pntsd, int len)
+{
+ int rc;
+ struct ndr sd_ndr = {0}, acl_ndr = {0};
+ struct xattr_ntacl acl = {0};
+ struct xattr_smb_acl *smb_acl, *def_smb_acl = NULL;
+ struct inode *inode = d_inode(dentry);
+
+ acl.version = 4;
+ acl.hash_type = XATTR_SD_HASH_TYPE_SHA256;
+ acl.current_time = ksmbd_UnixTimeToNT(current_time(inode));
+
+ memcpy(acl.desc, "posix_acl", 9);
+ acl.desc_len = 10;
+
+ pntsd->osidoffset =
+ cpu_to_le32(le32_to_cpu(pntsd->osidoffset) + NDR_NTSD_OFFSETOF);
+ pntsd->gsidoffset =
+ cpu_to_le32(le32_to_cpu(pntsd->gsidoffset) + NDR_NTSD_OFFSETOF);
+ pntsd->dacloffset =
+ cpu_to_le32(le32_to_cpu(pntsd->dacloffset) + NDR_NTSD_OFFSETOF);
+
+ acl.sd_buf = (char *)pntsd;
+ acl.sd_size = len;
+
+ rc = ksmbd_gen_sd_hash(conn, acl.sd_buf, acl.sd_size, acl.hash);
+ if (rc) {
+ pr_err("failed to generate hash for ndr acl\n");
+ return rc;
+ }
+
+ smb_acl = ksmbd_vfs_make_xattr_posix_acl(user_ns, inode,
+ ACL_TYPE_ACCESS);
+ if (S_ISDIR(inode->i_mode))
+ def_smb_acl = ksmbd_vfs_make_xattr_posix_acl(user_ns, inode,
+ ACL_TYPE_DEFAULT);
+
+ rc = ndr_encode_posix_acl(&acl_ndr, user_ns, inode,
+ smb_acl, def_smb_acl);
+ if (rc) {
+ pr_err("failed to encode ndr to posix acl\n");
+ goto out;
+ }
+
+ rc = ksmbd_gen_sd_hash(conn, acl_ndr.data, acl_ndr.offset,
+ acl.posix_acl_hash);
+ if (rc) {
+ pr_err("failed to generate hash for ndr acl\n");
+ goto out;
+ }
+
+ rc = ndr_encode_v4_ntacl(&sd_ndr, &acl);
+ if (rc) {
+ pr_err("failed to encode ndr to posix acl\n");
+ goto out;
+ }
+
+ rc = ksmbd_vfs_setxattr(user_ns, dentry,
+ XATTR_NAME_SD, sd_ndr.data,
+ sd_ndr.offset, 0);
+ if (rc < 0)
+ pr_err("Failed to store XATTR ntacl :%d\n", rc);
+
+ kfree(sd_ndr.data);
+out:
+ kfree(acl_ndr.data);
+ kfree(smb_acl);
+ kfree(def_smb_acl);
+ return rc;
+}
+
+int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn,
+ struct user_namespace *user_ns,
+ struct dentry *dentry,
+ struct smb_ntsd **pntsd)
+{
+ int rc;
+ struct ndr n;
+ struct inode *inode = d_inode(dentry);
+ struct ndr acl_ndr = {0};
+ struct xattr_ntacl acl;
+ struct xattr_smb_acl *smb_acl = NULL, *def_smb_acl = NULL;
+ __u8 cmp_hash[XATTR_SD_HASH_SIZE] = {0};
+
+ rc = ksmbd_vfs_getxattr(user_ns, dentry, XATTR_NAME_SD, &n.data);
+ if (rc <= 0)
+ return rc;
+
+ n.length = rc;
+ rc = ndr_decode_v4_ntacl(&n, &acl);
+ if (rc)
+ goto free_n_data;
+
+ smb_acl = ksmbd_vfs_make_xattr_posix_acl(user_ns, inode,
+ ACL_TYPE_ACCESS);
+ if (S_ISDIR(inode->i_mode))
+ def_smb_acl = ksmbd_vfs_make_xattr_posix_acl(user_ns, inode,
+ ACL_TYPE_DEFAULT);
+
+ rc = ndr_encode_posix_acl(&acl_ndr, user_ns, inode, smb_acl,
+ def_smb_acl);
+ if (rc) {
+ pr_err("failed to encode ndr to posix acl\n");
+ goto out_free;
+ }
+
+ rc = ksmbd_gen_sd_hash(conn, acl_ndr.data, acl_ndr.offset, cmp_hash);
+ if (rc) {
+ pr_err("failed to generate hash for ndr acl\n");
+ goto out_free;
+ }
+
+ if (memcmp(cmp_hash, acl.posix_acl_hash, XATTR_SD_HASH_SIZE)) {
+ pr_err("hash value diff\n");
+ rc = -EINVAL;
+ goto out_free;
+ }
+
+ *pntsd = acl.sd_buf;
+ (*pntsd)->osidoffset = cpu_to_le32(le32_to_cpu((*pntsd)->osidoffset) -
+ NDR_NTSD_OFFSETOF);
+ (*pntsd)->gsidoffset = cpu_to_le32(le32_to_cpu((*pntsd)->gsidoffset) -
+ NDR_NTSD_OFFSETOF);
+ (*pntsd)->dacloffset = cpu_to_le32(le32_to_cpu((*pntsd)->dacloffset) -
+ NDR_NTSD_OFFSETOF);
+
+ rc = acl.sd_size;
+out_free:
+ kfree(acl_ndr.data);
+ kfree(smb_acl);
+ kfree(def_smb_acl);
+ if (rc < 0) {
+ kfree(acl.sd_buf);
+ *pntsd = NULL;
+ }
+
+free_n_data:
+ kfree(n.data);
+ return rc;
+}
+
+int ksmbd_vfs_set_dos_attrib_xattr(struct user_namespace *user_ns,
+ struct dentry *dentry,
+ struct xattr_dos_attrib *da)
+{
+ struct ndr n;
+ int err;
+
+ err = ndr_encode_dos_attr(&n, da);
+ if (err)
+ return err;
+
+ err = ksmbd_vfs_setxattr(user_ns, dentry, XATTR_NAME_DOS_ATTRIBUTE,
+ (void *)n.data, n.offset, 0);
+ if (err)
+ ksmbd_debug(SMB, "failed to store dos attribute in xattr\n");
+ kfree(n.data);
+
+ return err;
+}
+
+int ksmbd_vfs_get_dos_attrib_xattr(struct user_namespace *user_ns,
+ struct dentry *dentry,
+ struct xattr_dos_attrib *da)
+{
+ struct ndr n;
+ int err;
+
+ err = ksmbd_vfs_getxattr(user_ns, dentry, XATTR_NAME_DOS_ATTRIBUTE,
+ (char **)&n.data);
+ if (err > 0) {
+ n.length = err;
+ if (ndr_decode_dos_attr(&n, da))
+ err = -EINVAL;
+ kfree(n.data);
+ } else {
+ ksmbd_debug(SMB, "failed to load dos attribute in xattr\n");
+ }
+
+ return err;
+}
+
+/**
+ * ksmbd_vfs_init_kstat() - convert unix stat information to smb stat format
+ * @p: destination buffer
+ * @ksmbd_kstat: ksmbd kstat wrapper
+ */
+void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat)
+{
+ struct file_directory_info *info = (struct file_directory_info *)(*p);
+ struct kstat *kstat = ksmbd_kstat->kstat;
+ u64 time;
+
+ info->FileIndex = 0;
+ info->CreationTime = cpu_to_le64(ksmbd_kstat->create_time);
+ time = ksmbd_UnixTimeToNT(kstat->atime);
+ info->LastAccessTime = cpu_to_le64(time);
+ time = ksmbd_UnixTimeToNT(kstat->mtime);
+ info->LastWriteTime = cpu_to_le64(time);
+ time = ksmbd_UnixTimeToNT(kstat->ctime);
+ info->ChangeTime = cpu_to_le64(time);
+
+ if (ksmbd_kstat->file_attributes & ATTR_DIRECTORY_LE) {
+ info->EndOfFile = 0;
+ info->AllocationSize = 0;
+ } else {
+ info->EndOfFile = cpu_to_le64(kstat->size);
+ info->AllocationSize = cpu_to_le64(kstat->blocks << 9);
+ }
+ info->ExtFileAttributes = ksmbd_kstat->file_attributes;
+
+ return info;
+}
+
+int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work,
+ struct user_namespace *user_ns,
+ struct dentry *dentry,
+ struct ksmbd_kstat *ksmbd_kstat)
+{
+ u64 time;
+ int rc;
+
+ generic_fillattr(user_ns, d_inode(dentry), ksmbd_kstat->kstat);
+
+ time = ksmbd_UnixTimeToNT(ksmbd_kstat->kstat->ctime);
+ ksmbd_kstat->create_time = time;
+
+ /*
+ * Set a default value for the case where storing DOS attributes is
+ * disabled, or where ACLs are disabled in the server's filesystem
+ * while the config option is enabled.
+ */
+ if (S_ISDIR(ksmbd_kstat->kstat->mode))
+ ksmbd_kstat->file_attributes = ATTR_DIRECTORY_LE;
+ else
+ ksmbd_kstat->file_attributes = ATTR_ARCHIVE_LE;
+
+ if (test_share_config_flag(work->tcon->share_conf,
+ KSMBD_SHARE_FLAG_STORE_DOS_ATTRS)) {
+ struct xattr_dos_attrib da;
+
+ rc = ksmbd_vfs_get_dos_attrib_xattr(user_ns, dentry, &da);
+ if (rc > 0) {
+ ksmbd_kstat->file_attributes = cpu_to_le32(da.attr);
+ ksmbd_kstat->create_time = da.create_time;
+ } else {
+ ksmbd_debug(VFS, "fail to load dos attribute.\n");
+ }
+ }
+
+ return 0;
+}
+
+ssize_t ksmbd_vfs_casexattr_len(struct user_namespace *user_ns,
+ struct dentry *dentry, char *attr_name,
+ int attr_name_len)
+{
+ char *name, *xattr_list = NULL;
+ ssize_t value_len = -ENOENT, xattr_list_len;
+
+ xattr_list_len = ksmbd_vfs_listxattr(dentry, &xattr_list);
+ if (xattr_list_len <= 0)
+ goto out;
+
+ for (name = xattr_list; name - xattr_list < xattr_list_len;
+ name += strlen(name) + 1) {
+ ksmbd_debug(VFS, "%s, len %zd\n", name, strlen(name));
+ if (strncasecmp(attr_name, name, attr_name_len))
+ continue;
+
+ value_len = ksmbd_vfs_xattr_len(user_ns, dentry, name);
+ break;
+ }
+
+out:
+ kvfree(xattr_list);
+ return value_len;
+}
+
+int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name,
+ size_t *xattr_stream_name_size, int s_type)
+{
+ char *type, *buf;
+
+ if (s_type == DIR_STREAM)
+ type = ":$INDEX_ALLOCATION";
+ else
+ type = ":$DATA";
+
+ buf = kasprintf(GFP_KERNEL, "%s%s%s",
+ XATTR_NAME_STREAM, stream_name, type);
+ if (!buf)
+ return -ENOMEM;
+
+ *xattr_stream_name = buf;
+ *xattr_stream_name_size = strlen(buf) + 1;
+
+ return 0;
+}
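
The generated name is simply the stream prefix, the client-supplied stream name and a type suffix concatenated together. A short sketch, assuming XATTR_NAME_STREAM expands to the "user.DosStream." prefix defined in ksmbd's xattr.h:

	char stream[] = "Zone.Identifier";
	char *xattr_name;
	size_t xattr_name_sz;

	if (!ksmbd_vfs_xattr_stream_name(stream, &xattr_name,
					 &xattr_name_sz, DATA_STREAM)) {
		/* xattr_name == "user.DosStream.Zone.Identifier:$DATA" */
		kfree(xattr_name);
	}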
+
+int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work,
+ struct ksmbd_file *src_fp,
+ struct ksmbd_file *dst_fp,
+ struct srv_copychunk *chunks,
+ unsigned int chunk_count,
+ unsigned int *chunk_count_written,
+ unsigned int *chunk_size_written,
+ loff_t *total_size_written)
+{
+ unsigned int i;
+ loff_t src_off, dst_off, src_file_size;
+ size_t len;
+ int ret;
+
+ *chunk_count_written = 0;
+ *chunk_size_written = 0;
+ *total_size_written = 0;
+
+ if (!(src_fp->daccess & (FILE_READ_DATA_LE | FILE_EXECUTE_LE))) {
+ pr_err("no right to read(%pd)\n", src_fp->filp->f_path.dentry);
+ return -EACCES;
+ }
+ if (!(dst_fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE))) {
+ pr_err("no right to write(%pd)\n", dst_fp->filp->f_path.dentry);
+ return -EACCES;
+ }
+
+ if (ksmbd_stream_fd(src_fp) || ksmbd_stream_fd(dst_fp))
+ return -EBADF;
+
+ smb_break_all_levII_oplock(work, dst_fp, 1);
+
+ if (!work->tcon->posix_extensions) {
+ for (i = 0; i < chunk_count; i++) {
+ src_off = le64_to_cpu(chunks[i].SourceOffset);
+ dst_off = le64_to_cpu(chunks[i].TargetOffset);
+ len = le32_to_cpu(chunks[i].Length);
+
+ if (check_lock_range(src_fp->filp, src_off,
+ src_off + len - 1, READ))
+ return -EAGAIN;
+ if (check_lock_range(dst_fp->filp, dst_off,
+ dst_off + len - 1, WRITE))
+ return -EAGAIN;
+ }
+ }
+
+ src_file_size = i_size_read(file_inode(src_fp->filp));
+
+ for (i = 0; i < chunk_count; i++) {
+ src_off = le64_to_cpu(chunks[i].SourceOffset);
+ dst_off = le64_to_cpu(chunks[i].TargetOffset);
+ len = le32_to_cpu(chunks[i].Length);
+
+ if (src_off + len > src_file_size)
+ return -E2BIG;
+
+ ret = vfs_copy_file_range(src_fp->filp, src_off,
+ dst_fp->filp, dst_off, len, 0);
+ if (ret < 0)
+ return ret;
+
+ *chunk_count_written += 1;
+ *total_size_written += ret;
+ }
+ return 0;
+}
+
+void ksmbd_vfs_posix_lock_wait(struct file_lock *flock)
+{
+ wait_event(flock->fl_wait, !flock->fl_blocker);
+}
+
+int ksmbd_vfs_posix_lock_wait_timeout(struct file_lock *flock, long timeout)
+{
+ return wait_event_interruptible_timeout(flock->fl_wait,
+ !flock->fl_blocker,
+ timeout);
+}
+
+void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock)
+{
+ locks_delete_block(flock);
+}
+
+int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns,
+ struct inode *inode)
+{
+ struct posix_acl_state acl_state;
+ struct posix_acl *acls;
+ int rc;
+
+ if (!IS_ENABLED(CONFIG_FS_POSIX_ACL))
+ return -EOPNOTSUPP;
+
+ ksmbd_debug(SMB, "Set posix acls\n");
+ rc = init_acl_state(&acl_state, 1);
+ if (rc)
+ return rc;
+
+ /* Set default owner group */
+ acl_state.owner.allow = (inode->i_mode & 0700) >> 6;
+ acl_state.group.allow = (inode->i_mode & 0070) >> 3;
+ acl_state.other.allow = inode->i_mode & 0007;
+ acl_state.users->aces[acl_state.users->n].uid = inode->i_uid;
+ acl_state.users->aces[acl_state.users->n++].perms.allow =
+ acl_state.owner.allow;
+ acl_state.groups->aces[acl_state.groups->n].gid = inode->i_gid;
+ acl_state.groups->aces[acl_state.groups->n++].perms.allow =
+ acl_state.group.allow;
+ acl_state.mask.allow = 0x07;
+
+ acls = posix_acl_alloc(6, GFP_KERNEL);
+ if (!acls) {
+ free_acl_state(&acl_state);
+ return -ENOMEM;
+ }
+ posix_state_to_acl(&acl_state, acls->a_entries);
+ rc = set_posix_acl(user_ns, inode, ACL_TYPE_ACCESS, acls);
+ if (rc < 0)
+ ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n",
+ rc);
+ else if (S_ISDIR(inode->i_mode)) {
+ posix_state_to_acl(&acl_state, acls->a_entries);
+ rc = set_posix_acl(user_ns, inode, ACL_TYPE_DEFAULT,
+ acls);
+ if (rc < 0)
+ ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n",
+ rc);
+ }
+ free_acl_state(&acl_state);
+ posix_acl_release(acls);
+ return rc;
+}
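
As a worked example of the mode-to-ACL mapping above (sketch only; posix_state_to_acl() is assumed to emit the usual USER_OBJ/USER/GROUP_OBJ/GROUP/MASK/OTHER ordering): for an inode with mode 0644,

	owner.allow = (0644 & 0700) >> 6 = 6   /* rw- */
	group.allow = (0644 & 0070) >> 3 = 4   /* r-- */
	other.allow =  0644 & 0007       = 4   /* r-- */

plus an ACL_USER entry for i_uid with permission 6, an ACL_GROUP entry for i_gid with permission 4, and ACL_MASK forced to 7.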
+
+int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns,
+ struct inode *inode, struct inode *parent_inode)
+{
+ struct posix_acl *acls;
+ struct posix_acl_entry *pace;
+ int rc, i;
+
+ if (!IS_ENABLED(CONFIG_FS_POSIX_ACL))
+ return -EOPNOTSUPP;
+
+ acls = get_acl(parent_inode, ACL_TYPE_DEFAULT);
+ if (!acls)
+ return -ENOENT;
+ pace = acls->a_entries;
+
+ for (i = 0; i < acls->a_count; i++, pace++) {
+ if (pace->e_tag == ACL_MASK) {
+ pace->e_perm = 0x07;
+ break;
+ }
+ }
+
+ rc = set_posix_acl(user_ns, inode, ACL_TYPE_ACCESS, acls);
+ if (rc < 0)
+ ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n",
+ rc);
+ if (S_ISDIR(inode->i_mode)) {
+ rc = set_posix_acl(user_ns, inode, ACL_TYPE_DEFAULT,
+ acls);
+ if (rc < 0)
+ ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n",
+ rc);
+ }
+ posix_acl_release(acls);
+ return rc;
+}
diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h
new file mode 100644
index 000000000000..cb0cba0d5d07
--- /dev/null
+++ b/fs/ksmbd/vfs.h
@@ -0,0 +1,197 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org>
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */
+
+#ifndef __KSMBD_VFS_H__
+#define __KSMBD_VFS_H__
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <uapi/linux/xattr.h>
+#include <linux/posix_acl.h>
+
+#include "smbacl.h"
+#include "xattr.h"
+
+/*
+ * Enumeration for stream type.
+ */
+enum {
+ DATA_STREAM = 1, /* type $DATA */
+ DIR_STREAM /* type $INDEX_ALLOCATION */
+};
+
+/* CreateOptions */
+/* If the flag is set, it must not be a file; valid for a directory only */
+#define FILE_DIRECTORY_FILE_LE cpu_to_le32(0x00000001)
+#define FILE_WRITE_THROUGH_LE cpu_to_le32(0x00000002)
+#define FILE_SEQUENTIAL_ONLY_LE cpu_to_le32(0x00000004)
+
+/* Should not buffer on server */
+#define FILE_NO_INTERMEDIATE_BUFFERING_LE cpu_to_le32(0x00000008)
+/* MBZ */
+#define FILE_SYNCHRONOUS_IO_ALERT_LE cpu_to_le32(0x00000010)
+/* MBZ */
+#define FILE_SYNCHRONOUS_IO_NONALERT_LE cpu_to_le32(0x00000020)
+
+/* Flag must not be set for a directory */
+#define FILE_NON_DIRECTORY_FILE_LE cpu_to_le32(0x00000040)
+
+/* Should be zero */
+#define CREATE_TREE_CONNECTION cpu_to_le32(0x00000080)
+#define FILE_COMPLETE_IF_OPLOCKED_LE cpu_to_le32(0x00000100)
+#define FILE_NO_EA_KNOWLEDGE_LE cpu_to_le32(0x00000200)
+#define FILE_OPEN_REMOTE_INSTANCE cpu_to_le32(0x00000400)
+
+/*
+ * Doc says this is an obsolete "open for recovery" flag; it should be zero
+ * in any case.
+ */
+#define CREATE_OPEN_FOR_RECOVERY cpu_to_le32(0x00000400)
+#define FILE_RANDOM_ACCESS_LE cpu_to_le32(0x00000800)
+#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000)
+#define FILE_OPEN_BY_FILE_ID_LE cpu_to_le32(0x00002000)
+#define FILE_OPEN_FOR_BACKUP_INTENT_LE cpu_to_le32(0x00004000)
+#define FILE_NO_COMPRESSION_LE cpu_to_le32(0x00008000)
+
+/* Should be zero */
+#define FILE_OPEN_REQUIRING_OPLOCK cpu_to_le32(0x00010000)
+#define FILE_DISALLOW_EXCLUSIVE cpu_to_le32(0x00020000)
+#define FILE_RESERVE_OPFILTER_LE cpu_to_le32(0x00100000)
+#define FILE_OPEN_REPARSE_POINT_LE cpu_to_le32(0x00200000)
+#define FILE_OPEN_NO_RECALL_LE cpu_to_le32(0x00400000)
+
+/* Should be zero */
+#define FILE_OPEN_FOR_FREE_SPACE_QUERY_LE cpu_to_le32(0x00800000)
+#define CREATE_OPTIONS_MASK cpu_to_le32(0x00FFFFFF)
+#define CREATE_OPTION_READONLY 0x10000000
+/* system. NB not sent over wire */
+#define CREATE_OPTION_SPECIAL 0x20000000
+
+struct ksmbd_work;
+struct ksmbd_file;
+struct ksmbd_conn;
+
+struct ksmbd_dir_info {
+ const char *name;
+ char *wptr;
+ char *rptr;
+ int name_len;
+ int out_buf_len;
+ int num_entry;
+ int data_count;
+ int last_entry_offset;
+ bool hide_dot_file;
+ int flags;
+};
+
+struct ksmbd_readdir_data {
+ struct dir_context ctx;
+ union {
+ void *private;
+ char *dirent;
+ };
+
+ unsigned int used;
+ unsigned int dirent_count;
+ unsigned int file_attr;
+};
+
+/* ksmbd kstat wrapper to get valid create time when reading dir entry */
+struct ksmbd_kstat {
+ struct kstat *kstat;
+ unsigned long long create_time;
+ __le32 file_attributes;
+};
+
+int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child);
+int ksmbd_vfs_may_delete(struct user_namespace *user_ns, struct dentry *dentry);
+int ksmbd_vfs_query_maximal_access(struct user_namespace *user_ns,
+ struct dentry *dentry, __le32 *daccess);
+int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode);
+int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode);
+int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp,
+ size_t count, loff_t *pos);
+int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp,
+ char *buf, size_t count, loff_t *pos, bool sync,
+ ssize_t *written);
+int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id);
+int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name);
+int ksmbd_vfs_link(struct ksmbd_work *work,
+ const char *oldname, const char *newname);
+int ksmbd_vfs_getattr(struct path *path, struct kstat *stat);
+int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
+ char *newname);
+int ksmbd_vfs_truncate(struct ksmbd_work *work, const char *name,
+ struct ksmbd_file *fp, loff_t size);
+struct srv_copychunk;
+int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work,
+ struct ksmbd_file *src_fp,
+ struct ksmbd_file *dst_fp,
+ struct srv_copychunk *chunks,
+ unsigned int chunk_count,
+ unsigned int *chunk_count_written,
+ unsigned int *chunk_size_written,
+ loff_t *total_size_written);
+ssize_t ksmbd_vfs_listxattr(struct dentry *dentry, char **list);
+ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns,
+ struct dentry *dentry,
+ char *xattr_name,
+ char **xattr_buf);
+ssize_t ksmbd_vfs_casexattr_len(struct user_namespace *user_ns,
+ struct dentry *dentry, char *attr_name,
+ int attr_name_len);
+int ksmbd_vfs_setxattr(struct user_namespace *user_ns,
+ struct dentry *dentry, const char *attr_name,
+ const void *attr_value, size_t attr_size, int flags);
+int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name,
+ size_t *xattr_stream_name_size, int s_type);
+int ksmbd_vfs_remove_xattr(struct user_namespace *user_ns,
+ struct dentry *dentry, char *attr_name);
+int ksmbd_vfs_kern_path(char *name, unsigned int flags, struct path *path,
+ bool caseless);
+int ksmbd_vfs_empty_dir(struct ksmbd_file *fp);
+void ksmbd_vfs_set_fadvise(struct file *filp, __le32 option);
+int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp,
+ loff_t off, loff_t len);
+struct file_allocated_range_buffer;
+int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
+ struct file_allocated_range_buffer *ranges,
+ int in_count, int *out_count);
+int ksmbd_vfs_unlink(struct user_namespace *user_ns,
+ struct dentry *dir, struct dentry *dentry);
+void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat);
+int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work,
+ struct user_namespace *user_ns,
+ struct dentry *dentry,
+ struct ksmbd_kstat *ksmbd_kstat);
+void ksmbd_vfs_posix_lock_wait(struct file_lock *flock);
+int ksmbd_vfs_posix_lock_wait_timeout(struct file_lock *flock, long timeout);
+void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock);
+int ksmbd_vfs_remove_acl_xattrs(struct user_namespace *user_ns,
+ struct dentry *dentry);
+int ksmbd_vfs_remove_sd_xattrs(struct user_namespace *user_ns,
+ struct dentry *dentry);
+int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn,
+ struct user_namespace *user_ns,
+ struct dentry *dentry,
+ struct smb_ntsd *pntsd, int len);
+int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn,
+ struct user_namespace *user_ns,
+ struct dentry *dentry,
+ struct smb_ntsd **pntsd);
+int ksmbd_vfs_set_dos_attrib_xattr(struct user_namespace *user_ns,
+ struct dentry *dentry,
+ struct xattr_dos_attrib *da);
+int ksmbd_vfs_get_dos_attrib_xattr(struct user_namespace *user_ns,
+ struct dentry *dentry,
+ struct xattr_dos_attrib *da);
+int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns,
+ struct inode *inode);
+int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns,
+ struct inode *inode,
+ struct inode *parent_inode);
+#endif /* __KSMBD_VFS_H__ */
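
The CreateOptions masks above are pre-swapped with cpu_to_le32() so that request fields can be tested directly in wire byte order, with no per-check conversion. A small sketch of how an __le32 CreateOptions field could be screened against them; the helper name and the exact policy here are illustrative, not necessarily what ksmbd's SMB2 CREATE handler does:

    #include <linux/types.h>
    #include "vfs.h"

    /* Hypothetical sanity check on a little-endian CreateOptions field. */
    static bool create_options_supported(__le32 create_options)
    {
            /* Bits outside the defined mask are unknown. */
            if (create_options & ~CREATE_OPTIONS_MASK)
                    return false;

            /* "Must be a directory" and "must not be a directory" conflict. */
            if ((create_options & FILE_DIRECTORY_FILE_LE) &&
                (create_options & FILE_NON_DIRECTORY_FILE_LE))
                    return false;

            return true;
    }

Because both operands of each test are already little-endian, the checks behave the same on big- and little-endian hosts without any byte swapping.
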
diff --git a/fs/ksmbd/vfs_cache.c b/fs/ksmbd/vfs_cache.c
new file mode 100644
index 000000000000..92d8c61ffd2a
--- /dev/null
+++ b/fs/ksmbd/vfs_cache.c
@@ -0,0 +1,725 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org>
+ * Copyright (C) 2019 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "glob.h"
+#include "vfs_cache.h"
+#include "oplock.h"
+#include "vfs.h"
+#include "connection.h"
+#include "mgmt/tree_connect.h"
+#include "mgmt/user_session.h"
+#include "smb_common.h"
+
+#define S_DEL_PENDING 1
+#define S_DEL_ON_CLS 2
+#define S_DEL_ON_CLS_STREAM 8
+
+static unsigned int inode_hash_mask __read_mostly;
+static unsigned int inode_hash_shift __read_mostly;
+static struct hlist_head *inode_hashtable __read_mostly;
+static DEFINE_RWLOCK(inode_hash_lock);
+
+static struct ksmbd_file_table global_ft;
+static atomic_long_t fd_limit;
+static struct kmem_cache *filp_cache;
+
+void ksmbd_set_fd_limit(unsigned long limit)
+{
+ limit = min(limit, get_max_files());
+ atomic_long_set(&fd_limit, limit);
+}
+
+static bool fd_limit_depleted(void)
+{
+ long v = atomic_long_dec_return(&fd_limit);
+
+ if (v >= 0)
+ return false;
+ atomic_long_inc(&fd_limit);
+ return true;
+}
+
+static void fd_limit_close(void)
+{
+ atomic_long_inc(&fd_limit);
+}
+
+/*
+ * INODE hash
+ */
+
+static unsigned long inode_hash(struct super_block *sb, unsigned long hashval)
+{
+ unsigned long tmp;
+
+ tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
+ L1_CACHE_BYTES;
+ tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> inode_hash_shift);
+ return tmp & inode_hash_mask;
+}
+
+static struct ksmbd_inode *__ksmbd_inode_lookup(struct inode *inode)
+{
+ struct hlist_head *head = inode_hashtable +
+ inode_hash(inode->i_sb, inode->i_ino);
+ struct ksmbd_inode *ci = NULL, *ret_ci = NULL;
+
+ hlist_for_each_entry(ci, head, m_hash) {
+ if (ci->m_inode == inode) {
+ if (atomic_inc_not_zero(&ci->m_count))
+ ret_ci = ci;
+ break;
+ }
+ }
+ return ret_ci;
+}
+
+static struct ksmbd_inode *ksmbd_inode_lookup(struct ksmbd_file *fp)
+{
+ return __ksmbd_inode_lookup(file_inode(fp->filp));
+}
+
+static struct ksmbd_inode *ksmbd_inode_lookup_by_vfsinode(struct inode *inode)
+{
+ struct ksmbd_inode *ci;
+
+ read_lock(&inode_hash_lock);
+ ci = __ksmbd_inode_lookup(inode);
+ read_unlock(&inode_hash_lock);
+ return ci;
+}
+
+int ksmbd_query_inode_status(struct inode *inode)
+{
+ struct ksmbd_inode *ci;
+ int ret = KSMBD_INODE_STATUS_UNKNOWN;
+
+ read_lock(&inode_hash_lock);
+ ci = __ksmbd_inode_lookup(inode);
+ if (ci) {
+ ret = KSMBD_INODE_STATUS_OK;
+ if (ci->m_flags & S_DEL_PENDING)
+ ret = KSMBD_INODE_STATUS_PENDING_DELETE;
+ atomic_dec(&ci->m_count);
+ }
+ read_unlock(&inode_hash_lock);
+ return ret;
+}
+
+bool ksmbd_inode_pending_delete(struct ksmbd_file *fp)
+{
+ return (fp->f_ci->m_flags & S_DEL_PENDING);
+}
+
+void ksmbd_set_inode_pending_delete(struct ksmbd_file *fp)
+{
+ fp->f_ci->m_flags |= S_DEL_PENDING;
+}
+
+void ksmbd_clear_inode_pending_delete(struct ksmbd_file *fp)
+{
+ fp->f_ci->m_flags &= ~S_DEL_PENDING;
+}
+
+void ksmbd_fd_set_delete_on_close(struct ksmbd_file *fp,
+ int file_info)
+{
+ if (ksmbd_stream_fd(fp)) {
+ fp->f_ci->m_flags |= S_DEL_ON_CLS_STREAM;
+ return;
+ }
+
+ fp->f_ci->m_flags |= S_DEL_ON_CLS;
+}
+
+static void ksmbd_inode_hash(struct ksmbd_inode *ci)
+{
+ struct hlist_head *b = inode_hashtable +
+ inode_hash(ci->m_inode->i_sb, ci->m_inode->i_ino);
+
+ hlist_add_head(&ci->m_hash, b);
+}
+
+static void ksmbd_inode_unhash(struct ksmbd_inode *ci)
+{
+ write_lock(&inode_hash_lock);
+ hlist_del_init(&ci->m_hash);
+ write_unlock(&inode_hash_lock);
+}
+
+static int ksmbd_inode_init(struct ksmbd_inode *ci, struct ksmbd_file *fp)
+{
+ ci->m_inode = file_inode(fp->filp);
+ atomic_set(&ci->m_count, 1);
+ atomic_set(&ci->op_count, 0);
+ atomic_set(&ci->sop_count, 0);
+ ci->m_flags = 0;
+ ci->m_fattr = 0;
+ INIT_LIST_HEAD(&ci->m_fp_list);
+ INIT_LIST_HEAD(&ci->m_op_list);
+ rwlock_init(&ci->m_lock);
+ return 0;
+}
+
+static struct ksmbd_inode *ksmbd_inode_get(struct ksmbd_file *fp)
+{
+ struct ksmbd_inode *ci, *tmpci;
+ int rc;
+
+ read_lock(&inode_hash_lock);
+ ci = ksmbd_inode_lookup(fp);
+ read_unlock(&inode_hash_lock);
+ if (ci)
+ return ci;
+
+ ci = kmalloc(sizeof(struct ksmbd_inode), GFP_KERNEL);
+ if (!ci)
+ return NULL;
+
+ rc = ksmbd_inode_init(ci, fp);
+ if (rc) {
+ pr_err("inode initialization failed\n");
+ kfree(ci);
+ return NULL;
+ }
+
+ write_lock(&inode_hash_lock);
+ tmpci = ksmbd_inode_lookup(fp);
+ if (!tmpci) {
+ ksmbd_inode_hash(ci);
+ } else {
+ kfree(ci);
+ ci = tmpci;
+ }
+ write_unlock(&inode_hash_lock);
+ return ci;
+}
+
+static void ksmbd_inode_free(struct ksmbd_inode *ci)
+{
+ ksmbd_inode_unhash(ci);
+ kfree(ci);
+}
+
+static void ksmbd_inode_put(struct ksmbd_inode *ci)
+{
+ if (atomic_dec_and_test(&ci->m_count))
+ ksmbd_inode_free(ci);
+}
+
+int __init ksmbd_inode_hash_init(void)
+{
+ unsigned int loop;
+ unsigned long numentries = 16384;
+ unsigned long bucketsize = sizeof(struct hlist_head);
+ unsigned long size;
+
+ inode_hash_shift = ilog2(numentries);
+ inode_hash_mask = (1 << inode_hash_shift) - 1;
+
+ size = bucketsize << inode_hash_shift;
+
+ /* init master fp hash table */
+ inode_hashtable = vmalloc(size);
+ if (!inode_hashtable)
+ return -ENOMEM;
+
+ for (loop = 0; loop < (1U << inode_hash_shift); loop++)
+ INIT_HLIST_HEAD(&inode_hashtable[loop]);
+ return 0;
+}
+
+void ksmbd_release_inode_hash(void)
+{
+ vfree(inode_hashtable);
+}
+
+static void __ksmbd_inode_close(struct ksmbd_file *fp)
+{
+ struct dentry *dir, *dentry;
+ struct ksmbd_inode *ci = fp->f_ci;
+ int err;
+ struct file *filp;
+
+ filp = fp->filp;
+ if (ksmbd_stream_fd(fp) && (ci->m_flags & S_DEL_ON_CLS_STREAM)) {
+ ci->m_flags &= ~S_DEL_ON_CLS_STREAM;
+ err = ksmbd_vfs_remove_xattr(file_mnt_user_ns(filp),
+ filp->f_path.dentry,
+ fp->stream.name);
+ if (err)
+ pr_err("remove xattr failed : %s\n",
+ fp->stream.name);
+ }
+
+ if (atomic_dec_and_test(&ci->m_count)) {
+ write_lock(&ci->m_lock);
+ if (ci->m_flags & (S_DEL_ON_CLS | S_DEL_PENDING)) {
+ dentry = filp->f_path.dentry;
+ dir = dentry->d_parent;
+ ci->m_flags &= ~(S_DEL_ON_CLS | S_DEL_PENDING);
+ write_unlock(&ci->m_lock);
+ ksmbd_vfs_unlink(file_mnt_user_ns(filp), dir, dentry);
+ write_lock(&ci->m_lock);
+ }
+ write_unlock(&ci->m_lock);
+
+ ksmbd_inode_free(ci);
+ }
+}
+
+static void __ksmbd_remove_durable_fd(struct ksmbd_file *fp)
+{
+ if (!has_file_id(fp->persistent_id))
+ return;
+
+ write_lock(&global_ft.lock);
+ idr_remove(global_ft.idr, fp->persistent_id);
+ write_unlock(&global_ft.lock);
+}
+
+static void __ksmbd_remove_fd(struct ksmbd_file_table *ft, struct ksmbd_file *fp)
+{
+ if (!has_file_id(fp->volatile_id))
+ return;
+
+ write_lock(&fp->f_ci->m_lock);
+ list_del_init(&fp->node);
+ write_unlock(&fp->f_ci->m_lock);
+
+ write_lock(&ft->lock);
+ idr_remove(ft->idr, fp->volatile_id);
+ write_unlock(&ft->lock);
+}
+
+static void __ksmbd_close_fd(struct ksmbd_file_table *ft, struct ksmbd_file *fp)
+{
+ struct file *filp;
+ struct ksmbd_lock *smb_lock, *tmp_lock;
+
+ fd_limit_close();
+ __ksmbd_remove_durable_fd(fp);
+ __ksmbd_remove_fd(ft, fp);
+
+ close_id_del_oplock(fp);
+ filp = fp->filp;
+
+ __ksmbd_inode_close(fp);
+ if (!IS_ERR_OR_NULL(filp))
+ fput(filp);
+
+ /* because the reference count of fp is 0, it is guaranteed that
+ * there are no accesses to fp->lock_list.
+ */
+ list_for_each_entry_safe(smb_lock, tmp_lock, &fp->lock_list, flist) {
+ spin_lock(&fp->conn->llist_lock);
+ list_del(&smb_lock->clist);
+ spin_unlock(&fp->conn->llist_lock);
+
+ list_del(&smb_lock->flist);
+ locks_free_lock(smb_lock->fl);
+ kfree(smb_lock);
+ }
+
+ kfree(fp->filename);
+ if (ksmbd_stream_fd(fp))
+ kfree(fp->stream.name);
+ kmem_cache_free(filp_cache, fp);
+}
+
+static struct ksmbd_file *ksmbd_fp_get(struct ksmbd_file *fp)
+{
+ if (!atomic_inc_not_zero(&fp->refcount))
+ return NULL;
+ return fp;
+}
+
+static struct ksmbd_file *__ksmbd_lookup_fd(struct ksmbd_file_table *ft,
+ u64 id)
+{
+ struct ksmbd_file *fp;
+
+ if (!has_file_id(id))
+ return NULL;
+
+ read_lock(&ft->lock);
+ fp = idr_find(ft->idr, id);
+ if (fp)
+ fp = ksmbd_fp_get(fp);
+ read_unlock(&ft->lock);
+ return fp;
+}
+
+static void __put_fd_final(struct ksmbd_work *work, struct ksmbd_file *fp)
+{
+ __ksmbd_close_fd(&work->sess->file_table, fp);
+ atomic_dec(&work->conn->stats.open_files_count);
+}
+
+static void set_close_state_blocked_works(struct ksmbd_file *fp)
+{
+ struct ksmbd_work *cancel_work, *ctmp;
+
+ spin_lock(&fp->f_lock);
+ list_for_each_entry_safe(cancel_work, ctmp, &fp->blocked_works,
+ fp_entry) {
+ list_del(&cancel_work->fp_entry);
+ cancel_work->state = KSMBD_WORK_CLOSED;
+ cancel_work->cancel_fn(cancel_work->cancel_argv);
+ }
+ spin_unlock(&fp->f_lock);
+}
+
+int ksmbd_close_fd(struct ksmbd_work *work, u64 id)
+{
+ struct ksmbd_file *fp;
+ struct ksmbd_file_table *ft;
+
+ if (!has_file_id(id))
+ return 0;
+
+ ft = &work->sess->file_table;
+ read_lock(&ft->lock);
+ fp = idr_find(ft->idr, id);
+ if (fp) {
+ set_close_state_blocked_works(fp);
+
+ if (!atomic_dec_and_test(&fp->refcount))
+ fp = NULL;
+ }
+ read_unlock(&ft->lock);
+
+ if (!fp)
+ return -EINVAL;
+
+ __put_fd_final(work, fp);
+ return 0;
+}
+
+void ksmbd_fd_put(struct ksmbd_work *work, struct ksmbd_file *fp)
+{
+ if (!fp)
+ return;
+
+ if (!atomic_dec_and_test(&fp->refcount))
+ return;
+ __put_fd_final(work, fp);
+}
+
+static bool __sanity_check(struct ksmbd_tree_connect *tcon, struct ksmbd_file *fp)
+{
+ if (!fp)
+ return false;
+ if (fp->tcon != tcon)
+ return false;
+ return true;
+}
+
+struct ksmbd_file *ksmbd_lookup_foreign_fd(struct ksmbd_work *work, u64 id)
+{
+ return __ksmbd_lookup_fd(&work->sess->file_table, id);
+}
+
+struct ksmbd_file *ksmbd_lookup_fd_fast(struct ksmbd_work *work, u64 id)
+{
+ struct ksmbd_file *fp = __ksmbd_lookup_fd(&work->sess->file_table, id);
+
+ if (__sanity_check(work->tcon, fp))
+ return fp;
+
+ ksmbd_fd_put(work, fp);
+ return NULL;
+}
+
+struct ksmbd_file *ksmbd_lookup_fd_slow(struct ksmbd_work *work, u64 id,
+ u64 pid)
+{
+ struct ksmbd_file *fp;
+
+ if (!has_file_id(id)) {
+ id = work->compound_fid;
+ pid = work->compound_pfid;
+ }
+
+ fp = __ksmbd_lookup_fd(&work->sess->file_table, id);
+ if (!__sanity_check(work->tcon, fp)) {
+ ksmbd_fd_put(work, fp);
+ return NULL;
+ }
+ if (fp->persistent_id != pid) {
+ ksmbd_fd_put(work, fp);
+ return NULL;
+ }
+ return fp;
+}
+
+struct ksmbd_file *ksmbd_lookup_durable_fd(unsigned long long id)
+{
+ return __ksmbd_lookup_fd(&global_ft, id);
+}
+
+struct ksmbd_file *ksmbd_lookup_fd_cguid(char *cguid)
+{
+ struct ksmbd_file *fp = NULL;
+ unsigned int id;
+
+ read_lock(&global_ft.lock);
+ idr_for_each_entry(global_ft.idr, fp, id) {
+ if (!memcmp(fp->create_guid,
+ cguid,
+ SMB2_CREATE_GUID_SIZE)) {
+ fp = ksmbd_fp_get(fp);
+ break;
+ }
+ }
+ read_unlock(&global_ft.lock);
+
+ return fp;
+}
+
+struct ksmbd_file *ksmbd_lookup_fd_inode(struct inode *inode)
+{
+ struct ksmbd_file *lfp;
+ struct ksmbd_inode *ci;
+
+ ci = ksmbd_inode_lookup_by_vfsinode(inode);
+ if (!ci)
+ return NULL;
+
+ read_lock(&ci->m_lock);
+ list_for_each_entry(lfp, &ci->m_fp_list, node) {
+ if (inode == file_inode(lfp->filp)) {
+ atomic_dec(&ci->m_count);
+ read_unlock(&ci->m_lock);
+ return lfp;
+ }
+ }
+ atomic_dec(&ci->m_count);
+ read_unlock(&ci->m_lock);
+ return NULL;
+}
+
+#define OPEN_ID_TYPE_VOLATILE_ID (0)
+#define OPEN_ID_TYPE_PERSISTENT_ID (1)
+
+static void __open_id_set(struct ksmbd_file *fp, u64 id, int type)
+{
+ if (type == OPEN_ID_TYPE_VOLATILE_ID)
+ fp->volatile_id = id;
+ if (type == OPEN_ID_TYPE_PERSISTENT_ID)
+ fp->persistent_id = id;
+}
+
+static int __open_id(struct ksmbd_file_table *ft, struct ksmbd_file *fp,
+ int type)
+{
+ u64 id = 0;
+ int ret;
+
+ if (type == OPEN_ID_TYPE_VOLATILE_ID && fd_limit_depleted()) {
+ __open_id_set(fp, KSMBD_NO_FID, type);
+ return -EMFILE;
+ }
+
+ idr_preload(GFP_KERNEL);
+ write_lock(&ft->lock);
+ ret = idr_alloc_cyclic(ft->idr, fp, 0, INT_MAX - 1, GFP_NOWAIT);
+ if (ret >= 0) {
+ id = ret;
+ ret = 0;
+ } else {
+ id = KSMBD_NO_FID;
+ fd_limit_close();
+ }
+
+ __open_id_set(fp, id, type);
+ write_unlock(&ft->lock);
+ idr_preload_end();
+ return ret;
+}
+
+unsigned int ksmbd_open_durable_fd(struct ksmbd_file *fp)
+{
+ __open_id(&global_ft, fp, OPEN_ID_TYPE_PERSISTENT_ID);
+ return fp->persistent_id;
+}
+
+struct ksmbd_file *ksmbd_open_fd(struct ksmbd_work *work, struct file *filp)
+{
+ struct ksmbd_file *fp;
+ int ret;
+
+ fp = kmem_cache_zalloc(filp_cache, GFP_KERNEL);
+ if (!fp) {
+ pr_err("Failed to allocate memory\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ INIT_LIST_HEAD(&fp->blocked_works);
+ INIT_LIST_HEAD(&fp->node);
+ INIT_LIST_HEAD(&fp->lock_list);
+ spin_lock_init(&fp->f_lock);
+ atomic_set(&fp->refcount, 1);
+
+ fp->filp = filp;
+ fp->conn = work->sess->conn;
+ fp->tcon = work->tcon;
+ fp->volatile_id = KSMBD_NO_FID;
+ fp->persistent_id = KSMBD_NO_FID;
+ fp->f_ci = ksmbd_inode_get(fp);
+
+ if (!fp->f_ci) {
+ ret = -ENOMEM;
+ goto err_out;
+ }
+
+ ret = __open_id(&work->sess->file_table, fp, OPEN_ID_TYPE_VOLATILE_ID);
+ if (ret) {
+ ksmbd_inode_put(fp->f_ci);
+ goto err_out;
+ }
+
+ atomic_inc(&work->conn->stats.open_files_count);
+ return fp;
+
+err_out:
+ kmem_cache_free(filp_cache, fp);
+ return ERR_PTR(ret);
+}
+
+static int
+__close_file_table_ids(struct ksmbd_file_table *ft,
+ struct ksmbd_tree_connect *tcon,
+ bool (*skip)(struct ksmbd_tree_connect *tcon,
+ struct ksmbd_file *fp))
+{
+ unsigned int id;
+ struct ksmbd_file *fp;
+ int num = 0;
+
+ idr_for_each_entry(ft->idr, fp, id) {
+ if (skip(tcon, fp))
+ continue;
+
+ set_close_state_blocked_works(fp);
+
+ if (!atomic_dec_and_test(&fp->refcount))
+ continue;
+ __ksmbd_close_fd(ft, fp);
+ num++;
+ }
+ return num;
+}
+
+static bool tree_conn_fd_check(struct ksmbd_tree_connect *tcon,
+ struct ksmbd_file *fp)
+{
+ return fp->tcon != tcon;
+}
+
+static bool session_fd_check(struct ksmbd_tree_connect *tcon,
+ struct ksmbd_file *fp)
+{
+ return false;
+}
+
+void ksmbd_close_tree_conn_fds(struct ksmbd_work *work)
+{
+ int num = __close_file_table_ids(&work->sess->file_table,
+ work->tcon,
+ tree_conn_fd_check);
+
+ atomic_sub(num, &work->conn->stats.open_files_count);
+}
+
+void ksmbd_close_session_fds(struct ksmbd_work *work)
+{
+ int num = __close_file_table_ids(&work->sess->file_table,
+ work->tcon,
+ session_fd_check);
+
+ atomic_sub(num, &work->conn->stats.open_files_count);
+}
+
+int ksmbd_init_global_file_table(void)
+{
+ return ksmbd_init_file_table(&global_ft);
+}
+
+void ksmbd_free_global_file_table(void)
+{
+ struct ksmbd_file *fp = NULL;
+ unsigned int id;
+
+ idr_for_each_entry(global_ft.idr, fp, id) {
+ __ksmbd_remove_durable_fd(fp);
+ kmem_cache_free(filp_cache, fp);
+ }
+
+ ksmbd_destroy_file_table(&global_ft);
+}
+
+int ksmbd_file_table_flush(struct ksmbd_work *work)
+{
+ struct ksmbd_file *fp = NULL;
+ unsigned int id;
+ int ret = 0;
+
+ read_lock(&work->sess->file_table.lock);
+ idr_for_each_entry(work->sess->file_table.idr, fp, id) {
+ ret = ksmbd_vfs_fsync(work, fp->volatile_id, KSMBD_NO_FID);
+ if (ret)
+ break;
+ }
+ read_unlock(&work->sess->file_table.lock);
+ return ret;
+}
+
+int ksmbd_init_file_table(struct ksmbd_file_table *ft)
+{
+ ft->idr = kzalloc(sizeof(struct idr), GFP_KERNEL);
+ if (!ft->idr)
+ return -ENOMEM;
+
+ idr_init(ft->idr);
+ rwlock_init(&ft->lock);
+ return 0;
+}
+
+void ksmbd_destroy_file_table(struct ksmbd_file_table *ft)
+{
+ if (!ft->idr)
+ return;
+
+ __close_file_table_ids(ft, NULL, session_fd_check);
+ idr_destroy(ft->idr);
+ kfree(ft->idr);
+ ft->idr = NULL;
+}
+
+int ksmbd_init_file_cache(void)
+{
+ filp_cache = kmem_cache_create("ksmbd_file_cache",
+ sizeof(struct ksmbd_file), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!filp_cache)
+ goto out;
+
+ return 0;
+
+out:
+ pr_err("failed to allocate file cache\n");
+ return -ENOMEM;
+}
+
+void ksmbd_exit_file_cache(void)
+{
+ kmem_cache_destroy(filp_cache);
+}
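
ksmbd_inode_get() above uses a common lookup-or-insert pattern on the inode hash: look up under the read lock, allocate a candidate outside any lock, then re-check under the write lock and throw the candidate away if another opener raced in first. A stripped-down user-space sketch of that pattern, with generic names, a pthread rwlock in place of the kernel rwlock, and reference counting left out:

    #include <pthread.h>
    #include <stdlib.h>

    #define NBUCKETS 256

    struct node {
            unsigned long key;
            struct node *next;
    };

    static struct node *table[NBUCKETS];
    static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;

    static struct node *lookup(unsigned long key)
    {
            struct node *n;

            for (n = table[key % NBUCKETS]; n; n = n->next)
                    if (n->key == key)
                            return n;
            return NULL;
    }

    static struct node *get_or_insert(unsigned long key)
    {
            struct node *n, *candidate;

            /* Optimistic lookup under the read lock. */
            pthread_rwlock_rdlock(&table_lock);
            n = lookup(key);
            pthread_rwlock_unlock(&table_lock);
            if (n)
                    return n;

            /* Allocate outside any lock, as ksmbd_inode_get() does. */
            candidate = calloc(1, sizeof(*candidate));
            if (!candidate)
                    return NULL;
            candidate->key = key;

            /* Re-check under the write lock; the loser frees its copy. */
            pthread_rwlock_wrlock(&table_lock);
            n = lookup(key);
            if (!n) {
                    candidate->next = table[key % NBUCKETS];
                    table[key % NBUCKETS] = candidate;
                    n = candidate;
            } else {
                    free(candidate);
            }
            pthread_rwlock_unlock(&table_lock);
            return n;
    }
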
diff --git a/fs/ksmbd/vfs_cache.h b/fs/ksmbd/vfs_cache.h
new file mode 100644
index 000000000000..70dfe6a99f13
--- /dev/null
+++ b/fs/ksmbd/vfs_cache.h
@@ -0,0 +1,178 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2019 Samsung Electronics Co., Ltd.
+ */
+
+#ifndef __VFS_CACHE_H__
+#define __VFS_CACHE_H__
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/rwsem.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+#include <linux/workqueue.h>
+
+#include "vfs.h"
+
+/* Windows style file permissions for extended response */
+#define FILE_GENERIC_ALL 0x1F01FF
+#define FILE_GENERIC_READ 0x120089
+#define FILE_GENERIC_WRITE 0x120116
+#define FILE_GENERIC_EXECUTE 0X1200a0
+
+#define KSMBD_START_FID 0
+#define KSMBD_NO_FID (INT_MAX)
+#define SMB2_NO_FID (0xFFFFFFFFFFFFFFFFULL)
+
+struct ksmbd_conn;
+struct ksmbd_session;
+
+struct ksmbd_lock {
+ struct file_lock *fl;
+ struct list_head clist;
+ struct list_head flist;
+ struct list_head llist;
+ unsigned int flags;
+ int cmd;
+ int zero_len;
+ unsigned long long start;
+ unsigned long long end;
+};
+
+struct stream {
+ char *name;
+ ssize_t size;
+};
+
+struct ksmbd_inode {
+ rwlock_t m_lock;
+ atomic_t m_count;
+ atomic_t op_count;
+ /* opinfo count for streams */
+ atomic_t sop_count;
+ struct inode *m_inode;
+ unsigned int m_flags;
+ struct hlist_node m_hash;
+ struct list_head m_fp_list;
+ struct list_head m_op_list;
+ struct oplock_info *m_opinfo;
+ __le32 m_fattr;
+};
+
+struct ksmbd_file {
+ struct file *filp;
+ char *filename;
+ u64 persistent_id;
+ u64 volatile_id;
+
+ spinlock_t f_lock;
+
+ struct ksmbd_inode *f_ci;
+ struct ksmbd_inode *f_parent_ci;
+ struct oplock_info __rcu *f_opinfo;
+ struct ksmbd_conn *conn;
+ struct ksmbd_tree_connect *tcon;
+
+ atomic_t refcount;
+ __le32 daccess;
+ __le32 saccess;
+ __le32 coption;
+ __le32 cdoption;
+ __u64 create_time;
+ __u64 itime;
+
+ bool is_nt_open;
+ bool attrib_only;
+
+ char client_guid[16];
+ char create_guid[16];
+ char app_instance_id[16];
+
+ struct stream stream;
+ struct list_head node;
+ struct list_head blocked_works;
+ struct list_head lock_list;
+
+ int durable_timeout;
+
+ /* for SMB1 */
+ int pid;
+
+ /* conflict lock fail count for SMB1 */
+ unsigned int cflock_cnt;
+ /* last lock failure start offset for SMB1 */
+ unsigned long long llock_fstart;
+
+ int dirent_offset;
+
+ /* if ls is happening on a directory, the fields below are valid */
+ struct ksmbd_readdir_data readdir_data;
+ int dot_dotdot[2];
+};
+
+static inline void set_ctx_actor(struct dir_context *ctx,
+ filldir_t actor)
+{
+ ctx->actor = actor;
+}
+
+#define KSMBD_NR_OPEN_DEFAULT BITS_PER_LONG
+
+struct ksmbd_file_table {
+ rwlock_t lock;
+ struct idr *idr;
+};
+
+static inline bool has_file_id(u64 id)
+{
+ return id < KSMBD_NO_FID;
+}
+
+static inline bool ksmbd_stream_fd(struct ksmbd_file *fp)
+{
+ return fp->stream.name != NULL;
+}
+
+int ksmbd_init_file_table(struct ksmbd_file_table *ft);
+void ksmbd_destroy_file_table(struct ksmbd_file_table *ft);
+int ksmbd_close_fd(struct ksmbd_work *work, u64 id);
+struct ksmbd_file *ksmbd_lookup_fd_fast(struct ksmbd_work *work, u64 id);
+struct ksmbd_file *ksmbd_lookup_foreign_fd(struct ksmbd_work *work, u64 id);
+struct ksmbd_file *ksmbd_lookup_fd_slow(struct ksmbd_work *work, u64 id,
+ u64 pid);
+void ksmbd_fd_put(struct ksmbd_work *work, struct ksmbd_file *fp);
+struct ksmbd_file *ksmbd_lookup_durable_fd(unsigned long long id);
+struct ksmbd_file *ksmbd_lookup_fd_cguid(char *cguid);
+struct ksmbd_file *ksmbd_lookup_fd_inode(struct inode *inode);
+unsigned int ksmbd_open_durable_fd(struct ksmbd_file *fp);
+struct ksmbd_file *ksmbd_open_fd(struct ksmbd_work *work, struct file *filp);
+void ksmbd_close_tree_conn_fds(struct ksmbd_work *work);
+void ksmbd_close_session_fds(struct ksmbd_work *work);
+int ksmbd_close_inode_fds(struct ksmbd_work *work, struct inode *inode);
+int ksmbd_init_global_file_table(void);
+void ksmbd_free_global_file_table(void);
+int ksmbd_file_table_flush(struct ksmbd_work *work);
+void ksmbd_set_fd_limit(unsigned long limit);
+
+/*
+ * INODE hash
+ */
+int __init ksmbd_inode_hash_init(void);
+void ksmbd_release_inode_hash(void);
+
+enum KSMBD_INODE_STATUS {
+ KSMBD_INODE_STATUS_OK,
+ KSMBD_INODE_STATUS_UNKNOWN,
+ KSMBD_INODE_STATUS_PENDING_DELETE,
+};
+
+int ksmbd_query_inode_status(struct inode *inode);
+bool ksmbd_inode_pending_delete(struct ksmbd_file *fp);
+void ksmbd_set_inode_pending_delete(struct ksmbd_file *fp);
+void ksmbd_clear_inode_pending_delete(struct ksmbd_file *fp);
+void ksmbd_fd_set_delete_on_close(struct ksmbd_file *fp,
+ int file_info);
+int ksmbd_init_file_cache(void);
+void ksmbd_exit_file_cache(void);
+#endif /* __VFS_CACHE_H__ */
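
ksmbd_fp_get() and ksmbd_fd_put() follow the usual rule that a lookup may only take a reference while the count is still non-zero, and whoever drops the count to zero performs the teardown; the kernel code expresses this with atomic_inc_not_zero() and atomic_dec_and_test() on fp->refcount under ft->lock. A minimal user-space sketch of the same rule using C11 atomics and an invented obj type:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdlib.h>

    struct obj {
            atomic_int refcount;    /* 1 when the object is first published */
    };

    /* Succeeds only while at least one reference is still held elsewhere,
     * so an object whose final reference is being dropped is never revived. */
    static bool obj_get(struct obj *o)
    {
            int old = atomic_load(&o->refcount);

            while (old != 0)
                    if (atomic_compare_exchange_weak(&o->refcount, &old, old + 1))
                            return true;
            return false;
    }

    static void obj_put(struct obj *o)
    {
            /* The thread that takes the count to zero frees the object. */
            if (atomic_fetch_sub(&o->refcount, 1) == 1)
                    free(o);
    }
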
diff --git a/fs/ksmbd/xattr.h b/fs/ksmbd/xattr.h
new file mode 100644
index 000000000000..8857c01093d9
--- /dev/null
+++ b/fs/ksmbd/xattr.h
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2021 Samsung Electronics Co., Ltd.
+ */
+
+#ifndef __XATTR_H__
+#define __XATTR_H__
+
+/*
+ * These are on-disk structures that store additional metadata in xattrs to
+ * reproduce Windows filesystem semantics. They are NDR-encoded to stay
+ * compatible with Samba's xattr metadata format. Compatibility with Samba
+ * matters because, without it, the information (file attributes, creation
+ * time, ACLs) attached to existing files would be lost when switching
+ * between ksmbd and Samba.
+ */
+
+/*
+ * DOS attribute flags indicating which of the fields below are valid.
+ */
+enum {
+ XATTR_DOSINFO_ATTRIB = 0x00000001,
+ XATTR_DOSINFO_EA_SIZE = 0x00000002,
+ XATTR_DOSINFO_SIZE = 0x00000004,
+ XATTR_DOSINFO_ALLOC_SIZE = 0x00000008,
+ XATTR_DOSINFO_CREATE_TIME = 0x00000010,
+ XATTR_DOSINFO_CHANGE_TIME = 0x00000020,
+ XATTR_DOSINFO_ITIME = 0x00000040
+};
+
+/*
+ * DOS attribute structure, compatible with Samba's.
+ * Storing it in the xattr named "DOSATTRIB", separately from the inode,
+ * allows ksmbd to faithfully reproduce Windows filesystem semantics
+ * on top of a POSIX filesystem.
+ */
+struct xattr_dos_attrib {
+ __u16 version; /* version 3 or version 4 */
+ __u32 flags; /* valid flags */
+ __u32 attr; /* Dos attribute */
+ __u32 ea_size; /* EA size */
+ __u64 size;
+ __u64 alloc_size;
+ __u64 create_time; /* File creation time */
+ __u64 change_time; /* File change time */
+ __u64 itime; /* Invented/Initial time */
+};
+
+/*
+ * Enumeration used for computing the posix acl hash.
+ */
+enum {
+ SMB_ACL_TAG_INVALID = 0,
+ SMB_ACL_USER,
+ SMB_ACL_USER_OBJ,
+ SMB_ACL_GROUP,
+ SMB_ACL_GROUP_OBJ,
+ SMB_ACL_OTHER,
+ SMB_ACL_MASK
+};
+
+#define SMB_ACL_READ 4
+#define SMB_ACL_WRITE 2
+#define SMB_ACL_EXECUTE 1
+
+struct xattr_acl_entry {
+ int type;
+ uid_t uid;
+ gid_t gid;
+ mode_t perm;
+};
+
+/*
+ * xattr_smb_acl structure is used for computing posix acl hash.
+ */
+struct xattr_smb_acl {
+ int count;
+ int next;
+ struct xattr_acl_entry entries[0];
+};
+
+/* The 64-byte hash in xattr_ntacl is computed with SHA-256 */
+#define XATTR_SD_HASH_TYPE_SHA256 0x1
+#define XATTR_SD_HASH_SIZE 64
+
+/*
+ * xattr_ntacl is used for storing the NT ACL and its hashes.
+ * The hashes are used to check that the posix acl and NT ACL in xattrs are valid.
+ */
+struct xattr_ntacl {
+ __u16 version; /* version 4 */
+ void *sd_buf;
+ __u32 sd_size;
+ __u16 hash_type; /* hash type */
+ __u8 desc[10]; /* posix_acl description */
+ __u16 desc_len;
+ __u64 current_time;
+ __u8 hash[XATTR_SD_HASH_SIZE]; /* 64-byte hash for ntacl */
+ __u8 posix_acl_hash[XATTR_SD_HASH_SIZE]; /* 64-byte hash for posix acl */
+};
+
+/* DOS ATTRIBUTE XATTR PREFIX */
+#define DOS_ATTRIBUTE_PREFIX "DOSATTRIB"
+#define DOS_ATTRIBUTE_PREFIX_LEN (sizeof(DOS_ATTRIBUTE_PREFIX) - 1)
+#define XATTR_NAME_DOS_ATTRIBUTE (XATTR_USER_PREFIX DOS_ATTRIBUTE_PREFIX)
+#define XATTR_NAME_DOS_ATTRIBUTE_LEN \
+ (sizeof(XATTR_USER_PREFIX DOS_ATTRIBUTE_PREFIX) - 1)
+
+/* STREAM XATTR PREFIX */
+#define STREAM_PREFIX "DosStream."
+#define STREAM_PREFIX_LEN (sizeof(STREAM_PREFIX) - 1)
+#define XATTR_NAME_STREAM (XATTR_USER_PREFIX STREAM_PREFIX)
+#define XATTR_NAME_STREAM_LEN (sizeof(XATTR_NAME_STREAM) - 1)
+
+/* SECURITY DESCRIPTOR(NTACL) XATTR PREFIX */
+#define SD_PREFIX "NTACL"
+#define SD_PREFIX_LEN (sizeof(SD_PREFIX) - 1)
+#define XATTR_NAME_SD (XATTR_SECURITY_PREFIX SD_PREFIX)
+#define XATTR_NAME_SD_LEN \
+ (sizeof(XATTR_SECURITY_PREFIX SD_PREFIX) - 1)
+
+#endif /* __XATTR_H__ */
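
With the prefixes above, ksmbd stores DOS attributes in a user.DOSATTRIB xattr, named streams under user.DosStream.<stream name>, and the NT security descriptor in security.NTACL, the same names Samba uses. A small sketch that only reports the size of the raw DOS-attribute blob from user space (the payload is NDR-encoded, so nothing is decoded here):

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/xattr.h>

    int main(int argc, char **argv)
    {
            char buf[4096];
            ssize_t len;

            if (argc != 2) {
                    fprintf(stderr, "usage: %s <file>\n", argv[0]);
                    return 1;
            }

            /* XATTR_USER_PREFIX "DOSATTRIB" == "user.DOSATTRIB" */
            len = getxattr(argv[1], "user.DOSATTRIB", buf, sizeof(buf));
            if (len < 0) {
                    perror("getxattr");
                    return 1;
            }
            printf("user.DOSATTRIB: %zd bytes of NDR-encoded data\n", len);
            return 0;
    }
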
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 2de048f80eb8..b632be3ad57b 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -584,7 +584,7 @@ static struct ctl_table nlm_sysctls[] = {
.data = &nsm_use_hostnames,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dobool,
},
{
.procname = "nsm_local_state",
@@ -649,6 +649,7 @@ static int lockd_authenticate(struct svc_rqst *rqstp)
switch (rqstp->rq_authop->flavour) {
case RPC_AUTH_NULL:
case RPC_AUTH_UNIX:
+ rqstp->rq_auth_stat = rpc_auth_ok;
if (rqstp->rq_proc == 0)
return SVC_OK;
if (is_callback(rqstp->rq_proc)) {
@@ -659,6 +660,7 @@ static int lockd_authenticate(struct svc_rqst *rqstp)
}
return svc_set_client(rqstp);
}
+ rqstp->rq_auth_stat = rpc_autherr_badcred;
return SVC_DENIED;
}
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 4c10fb5138f1..e10ae2c41279 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -40,12 +40,15 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
/* Obtain file pointer. Not used by FREE_ALL call. */
if (filp != NULL) {
- if ((error = nlm_lookup_file(rqstp, &file, &lock->fh)) != 0)
+ int mode = lock_to_openmode(&lock->fl);
+
+ error = nlm_lookup_file(rqstp, &file, lock);
+ if (error)
goto no_locks;
*filp = file;
/* Set up the missing parts of the file_lock structure */
- lock->fl.fl_file = file->f_file;
+ lock->fl.fl_file = file->f_file[mode];
lock->fl.fl_pid = current->tgid;
lock->fl.fl_lmops = &nlmsvc_lock_operations;
nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid);
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 61d3cc2283dc..e9b85d8fd5fe 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -31,6 +31,7 @@
#include <linux/lockd/nlm.h>
#include <linux/lockd/lockd.h>
#include <linux/kthread.h>
+#include <linux/exportfs.h>
#define NLMDBG_FACILITY NLMDBG_SVCLOCK
@@ -395,28 +396,10 @@ nlmsvc_release_lockowner(struct nlm_lock *lock)
nlmsvc_put_lockowner(lock->fl.fl_owner);
}
-static void nlmsvc_locks_copy_lock(struct file_lock *new, struct file_lock *fl)
-{
- struct nlm_lockowner *nlm_lo = (struct nlm_lockowner *)fl->fl_owner;
- new->fl_owner = nlmsvc_get_lockowner(nlm_lo);
-}
-
-static void nlmsvc_locks_release_private(struct file_lock *fl)
-{
- nlmsvc_put_lockowner((struct nlm_lockowner *)fl->fl_owner);
-}
-
-static const struct file_lock_operations nlmsvc_lock_ops = {
- .fl_copy_lock = nlmsvc_locks_copy_lock,
- .fl_release_private = nlmsvc_locks_release_private,
-};
-
void nlmsvc_locks_init_private(struct file_lock *fl, struct nlm_host *host,
pid_t pid)
{
fl->fl_owner = nlmsvc_find_lockowner(host, pid);
- if (fl->fl_owner != NULL)
- fl->fl_ops = &nlmsvc_lock_ops;
}
/*
@@ -488,17 +471,24 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
struct nlm_cookie *cookie, int reclaim)
{
struct nlm_block *block = NULL;
+ struct inode *inode = nlmsvc_file_inode(file);
int error;
+ int mode;
+ int async_block = 0;
__be32 ret;
dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n",
- locks_inode(file->f_file)->i_sb->s_id,
- locks_inode(file->f_file)->i_ino,
+ inode->i_sb->s_id, inode->i_ino,
lock->fl.fl_type, lock->fl.fl_pid,
(long long)lock->fl.fl_start,
(long long)lock->fl.fl_end,
wait);
+ if (inode->i_sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS) {
+ async_block = wait;
+ wait = 0;
+ }
+
/* Lock file against concurrent access */
mutex_lock(&file->f_mutex);
/* Get existing block (in case client is busy-waiting)
@@ -542,7 +532,8 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
if (!wait)
lock->fl.fl_flags &= ~FL_SLEEP;
- error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
+ mode = lock_to_openmode(&lock->fl);
+ error = vfs_lock_file(file->f_file[mode], F_SETLK, &lock->fl, NULL);
lock->fl.fl_flags &= ~FL_SLEEP;
dprintk("lockd: vfs_lock_file returned %d\n", error);
@@ -558,7 +549,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
*/
if (wait)
break;
- ret = nlm_lck_denied;
+ ret = async_block ? nlm_lck_blocked : nlm_lck_denied;
goto out;
case FILE_LOCK_DEFERRED:
if (wait)
@@ -595,12 +586,13 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
struct nlm_lock *conflock, struct nlm_cookie *cookie)
{
int error;
+ int mode;
__be32 ret;
struct nlm_lockowner *test_owner;
dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n",
- locks_inode(file->f_file)->i_sb->s_id,
- locks_inode(file->f_file)->i_ino,
+ nlmsvc_file_inode(file)->i_sb->s_id,
+ nlmsvc_file_inode(file)->i_ino,
lock->fl.fl_type,
(long long)lock->fl.fl_start,
(long long)lock->fl.fl_end);
@@ -613,7 +605,8 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
/* If there's a conflicting lock, remember to clean up the test lock */
test_owner = (struct nlm_lockowner *)lock->fl.fl_owner;
- error = vfs_test_lock(file->f_file, &lock->fl);
+ mode = lock_to_openmode(&lock->fl);
+ error = vfs_test_lock(file->f_file[mode], &lock->fl);
if (error) {
/* We can't currently deal with deferred test requests */
if (error == FILE_LOCK_DEFERRED)
@@ -634,7 +627,7 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
conflock->caller = "somehost"; /* FIXME */
conflock->len = strlen(conflock->caller);
conflock->oh.len = 0; /* don't return OH info */
- conflock->svid = ((struct nlm_lockowner *)lock->fl.fl_owner)->pid;
+ conflock->svid = lock->fl.fl_pid;
conflock->fl.fl_type = lock->fl.fl_type;
conflock->fl.fl_start = lock->fl.fl_start;
conflock->fl.fl_end = lock->fl.fl_end;
@@ -659,11 +652,11 @@ out:
__be32
nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock)
{
- int error;
+ int error = 0;
dprintk("lockd: nlmsvc_unlock(%s/%ld, pi=%d, %Ld-%Ld)\n",
- locks_inode(file->f_file)->i_sb->s_id,
- locks_inode(file->f_file)->i_ino,
+ nlmsvc_file_inode(file)->i_sb->s_id,
+ nlmsvc_file_inode(file)->i_ino,
lock->fl.fl_pid,
(long long)lock->fl.fl_start,
(long long)lock->fl.fl_end);
@@ -672,7 +665,12 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock)
nlmsvc_cancel_blocked(net, file, lock);
lock->fl.fl_type = F_UNLCK;
- error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
+ if (file->f_file[O_RDONLY])
+ error = vfs_lock_file(file->f_file[O_RDONLY], F_SETLK,
+ &lock->fl, NULL);
+ if (file->f_file[O_WRONLY])
+ error = vfs_lock_file(file->f_file[O_WRONLY], F_SETLK,
+ &lock->fl, NULL);
return (error < 0)? nlm_lck_denied_nolocks : nlm_granted;
}
@@ -689,10 +687,11 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l
{
struct nlm_block *block;
int status = 0;
+ int mode;
dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n",
- locks_inode(file->f_file)->i_sb->s_id,
- locks_inode(file->f_file)->i_ino,
+ nlmsvc_file_inode(file)->i_sb->s_id,
+ nlmsvc_file_inode(file)->i_ino,
lock->fl.fl_pid,
(long long)lock->fl.fl_start,
(long long)lock->fl.fl_end);
@@ -704,7 +703,8 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l
block = nlmsvc_lookup_block(file, lock);
mutex_unlock(&file->f_mutex);
if (block != NULL) {
- vfs_cancel_lock(block->b_file->f_file,
+ mode = lock_to_openmode(&lock->fl);
+ vfs_cancel_lock(block->b_file->f_file[mode],
&block->b_call->a_args.lock.fl);
status = nlmsvc_unlink_block(block);
nlmsvc_release_block(block);
@@ -788,9 +788,21 @@ nlmsvc_notify_blocked(struct file_lock *fl)
printk(KERN_WARNING "lockd: notification for unknown block!\n");
}
+static fl_owner_t nlmsvc_get_owner(fl_owner_t owner)
+{
+ return nlmsvc_get_lockowner(owner);
+}
+
+static void nlmsvc_put_owner(fl_owner_t owner)
+{
+ nlmsvc_put_lockowner(owner);
+}
+
const struct lock_manager_operations nlmsvc_lock_operations = {
.lm_notify = nlmsvc_notify_blocked,
.lm_grant = nlmsvc_grant_deferred,
+ .lm_get_owner = nlmsvc_get_owner,
+ .lm_put_owner = nlmsvc_put_owner,
};
/*
@@ -809,6 +821,7 @@ nlmsvc_grant_blocked(struct nlm_block *block)
{
struct nlm_file *file = block->b_file;
struct nlm_lock *lock = &block->b_call->a_args.lock;
+ int mode;
int error;
loff_t fl_start, fl_end;
@@ -834,7 +847,8 @@ nlmsvc_grant_blocked(struct nlm_block *block)
lock->fl.fl_flags |= FL_SLEEP;
fl_start = lock->fl.fl_start;
fl_end = lock->fl.fl_end;
- error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
+ mode = lock_to_openmode(&lock->fl);
+ error = vfs_lock_file(file->f_file[mode], F_SETLK, &lock->fl, NULL);
lock->fl.fl_flags &= ~FL_SLEEP;
lock->fl.fl_start = fl_start;
lock->fl.fl_end = fl_end;
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 4ae4b63b5392..99696d3f6dd6 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -55,6 +55,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
struct nlm_host *host = NULL;
struct nlm_file *file = NULL;
struct nlm_lock *lock = &argp->lock;
+ int mode;
__be32 error = 0;
/* nfsd callbacks must have been installed for this procedure */
@@ -69,13 +70,14 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
/* Obtain file pointer. Not used by FREE_ALL call. */
if (filp != NULL) {
- error = cast_status(nlm_lookup_file(rqstp, &file, &lock->fh));
+ error = cast_status(nlm_lookup_file(rqstp, &file, lock));
if (error != 0)
goto no_locks;
*filp = file;
/* Set up the missing parts of the file_lock structure */
- lock->fl.fl_file = file->f_file;
+ mode = lock_to_openmode(&lock->fl);
+ lock->fl.fl_file = file->f_file[mode];
lock->fl.fl_pid = current->tgid;
lock->fl.fl_lmops = &nlmsvc_lock_operations;
nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid);
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 028fc152da22..cb3a7512c33e 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -45,7 +45,7 @@ static inline void nlm_debug_print_fh(char *msg, struct nfs_fh *f)
static inline void nlm_debug_print_file(char *msg, struct nlm_file *file)
{
- struct inode *inode = locks_inode(file->f_file);
+ struct inode *inode = nlmsvc_file_inode(file);
dprintk("lockd: %s %s/%ld\n",
msg, inode->i_sb->s_id, inode->i_ino);
@@ -71,56 +71,75 @@ static inline unsigned int file_hash(struct nfs_fh *f)
return tmp & (FILE_NRHASH - 1);
}
+int lock_to_openmode(struct file_lock *lock)
+{
+ return (lock->fl_type == F_WRLCK) ? O_WRONLY : O_RDONLY;
+}
+
+/*
+ * Open the file. Note that if we're reexporting, for example,
+ * this could block the lockd thread for a while.
+ *
+ * We have to make sure we have the right credential to open
+ * the file.
+ */
+static __be32 nlm_do_fopen(struct svc_rqst *rqstp,
+ struct nlm_file *file, int mode)
+{
+ struct file **fp = &file->f_file[mode];
+ __be32 nfserr;
+
+ if (*fp)
+ return 0;
+ nfserr = nlmsvc_ops->fopen(rqstp, &file->f_handle, fp, mode);
+ if (nfserr)
+ dprintk("lockd: open failed (error %d)\n", nfserr);
+ return nfserr;
+}
+
/*
* Lookup file info. If it doesn't exist, create a file info struct
* and open a (VFS) file for the given inode.
- *
- * FIXME:
- * Note that we open the file O_RDONLY even when creating write locks.
- * This is not quite right, but for now, we assume the client performs
- * the proper R/W checking.
*/
__be32
nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
- struct nfs_fh *f)
+ struct nlm_lock *lock)
{
struct nlm_file *file;
unsigned int hash;
__be32 nfserr;
+ int mode;
- nlm_debug_print_fh("nlm_lookup_file", f);
+ nlm_debug_print_fh("nlm_lookup_file", &lock->fh);
- hash = file_hash(f);
+ hash = file_hash(&lock->fh);
+ mode = lock_to_openmode(&lock->fl);
/* Lock file table */
mutex_lock(&nlm_file_mutex);
hlist_for_each_entry(file, &nlm_files[hash], f_list)
- if (!nfs_compare_fh(&file->f_handle, f))
+ if (!nfs_compare_fh(&file->f_handle, &lock->fh)) {
+ mutex_lock(&file->f_mutex);
+ nfserr = nlm_do_fopen(rqstp, file, mode);
+ mutex_unlock(&file->f_mutex);
goto found;
-
- nlm_debug_print_fh("creating file for", f);
+ }
+ nlm_debug_print_fh("creating file for", &lock->fh);
nfserr = nlm_lck_denied_nolocks;
file = kzalloc(sizeof(*file), GFP_KERNEL);
if (!file)
- goto out_unlock;
+ goto out_free;
- memcpy(&file->f_handle, f, sizeof(struct nfs_fh));
+ memcpy(&file->f_handle, &lock->fh, sizeof(struct nfs_fh));
mutex_init(&file->f_mutex);
INIT_HLIST_NODE(&file->f_list);
INIT_LIST_HEAD(&file->f_blocks);
- /* Open the file. Note that this must not sleep for too long, else
- * we would lock up lockd:-) So no NFS re-exports, folks.
- *
- * We have to make sure we have the right credential to open
- * the file.
- */
- if ((nfserr = nlmsvc_ops->fopen(rqstp, f, &file->f_file)) != 0) {
- dprintk("lockd: open failed (error %d)\n", nfserr);
- goto out_free;
- }
+ nfserr = nlm_do_fopen(rqstp, file, mode);
+ if (nfserr)
+ goto out_unlock;
hlist_add_head(&file->f_list, &nlm_files[hash]);
@@ -128,7 +147,6 @@ found:
dprintk("lockd: found file %p (count %d)\n", file, file->f_count);
*result = file;
file->f_count++;
- nfserr = 0;
out_unlock:
mutex_unlock(&nlm_file_mutex);
@@ -148,13 +166,34 @@ nlm_delete_file(struct nlm_file *file)
nlm_debug_print_file("closing file", file);
if (!hlist_unhashed(&file->f_list)) {
hlist_del(&file->f_list);
- nlmsvc_ops->fclose(file->f_file);
+ if (file->f_file[O_RDONLY])
+ nlmsvc_ops->fclose(file->f_file[O_RDONLY]);
+ if (file->f_file[O_WRONLY])
+ nlmsvc_ops->fclose(file->f_file[O_WRONLY]);
kfree(file);
} else {
printk(KERN_WARNING "lockd: attempt to release unknown file!\n");
}
}
+static int nlm_unlock_files(struct nlm_file *file)
+{
+ struct file_lock lock;
+ unsigned int i;
+
+ lock.fl_type = F_UNLCK;
+ lock.fl_start = 0;
+ lock.fl_end = OFFSET_MAX;
+ /* f_file[] holds up to two opens (read and write); unlock on each */
+ for (i = 0; i < ARRAY_SIZE(file->f_file); i++) {
+ struct file *f = file->f_file[i];
+
+ if (f && vfs_lock_file(f, F_SETLK, &lock, NULL) < 0) {
+ pr_warn("lockd: unlock failure in %s:%d\n",
+ __FILE__, __LINE__);
+ return 1;
+ }
+ }
+ return 0;
+}
+
/*
* Loop over all locks on the given file and perform the specified
* action.
@@ -182,17 +221,10 @@ again:
lockhost = ((struct nlm_lockowner *)fl->fl_owner)->host;
if (match(lockhost, host)) {
- struct file_lock lock = *fl;
spin_unlock(&flctx->flc_lock);
- lock.fl_type = F_UNLCK;
- lock.fl_start = 0;
- lock.fl_end = OFFSET_MAX;
- if (vfs_lock_file(file->f_file, F_SETLK, &lock, NULL) < 0) {
- printk("lockd: unlock failure in %s:%d\n",
- __FILE__, __LINE__);
+ if (nlm_unlock_files(file))
return 1;
- }
goto again;
}
}
@@ -246,6 +278,15 @@ nlm_file_inuse(struct nlm_file *file)
return 0;
}
+static void nlm_close_files(struct nlm_file *file)
+{
+ /* Close whichever opens exist; f_file[] is indexed by open mode. */
+ if (file->f_file[O_RDONLY])
+ nlmsvc_ops->fclose(file->f_file[O_RDONLY]);
+ if (file->f_file[O_WRONLY])
+ nlmsvc_ops->fclose(file->f_file[O_WRONLY]);
+}
+
/*
* Loop over all files in the file table.
*/
@@ -276,7 +317,7 @@ nlm_traverse_files(void *data, nlm_host_match_fn_t match,
if (list_empty(&file->f_blocks) && !file->f_locks
&& !file->f_shares && !file->f_count) {
hlist_del(&file->f_list);
- nlmsvc_ops->fclose(file->f_file);
+ nlm_close_files(file);
kfree(file);
}
}
@@ -410,12 +451,13 @@ nlmsvc_invalidate_all(void)
nlm_traverse_files(NULL, nlmsvc_is_client, NULL);
}
+
static int
nlmsvc_match_sb(void *datap, struct nlm_file *file)
{
struct super_block *sb = datap;
- return sb == locks_inode(file->f_file)->i_sb;
+ return sb == nlmsvc_file_inode(file)->i_sb;
}
/**
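
The lockd changes above replace the single f_file pointer with a two-element array indexed by the open mode returned from lock_to_openmode(), which works because O_RDONLY is 0 and O_WRONLY is 1: a read lock is applied through the read-only open and a write lock through the write-only one. A toy user-space sketch of the indexing convention, with a made-up demo_file type and placeholder fd numbers:

    #include <fcntl.h>
    #include <stdio.h>

    struct demo_file {
            /* Indexed by open mode: [O_RDONLY] == [0], [O_WRONLY] == [1]. */
            int fds[2];
    };

    static int lock_type_to_openmode(int fl_type)
    {
            return (fl_type == F_WRLCK) ? O_WRONLY : O_RDONLY;
    }

    int main(void)
    {
            struct demo_file f = { .fds = { 3, 4 } };

            printf("read lock  -> fd %d\n", f.fds[lock_type_to_openmode(F_RDLCK)]);
            printf("write lock -> fd %d\n", f.fds[lock_type_to_openmode(F_WRLCK)]);
            return 0;
    }

Unlock and close paths have to visit both slots, since a file may have been opened for reading, writing, or both over its lifetime; that is what nlm_unlock_files() and nlm_close_files() above do.
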
diff --git a/fs/locks.c b/fs/locks.c
index 74b2a1dfe8d8..51a5b72ef302 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1397,103 +1397,6 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
return error;
}
-#ifdef CONFIG_MANDATORY_FILE_LOCKING
-/**
- * locks_mandatory_locked - Check for an active lock
- * @file: the file to check
- *
- * Searches the inode's list of locks to find any POSIX locks which conflict.
- * This function is called from locks_verify_locked() only.
- */
-int locks_mandatory_locked(struct file *file)
-{
- int ret;
- struct inode *inode = locks_inode(file);
- struct file_lock_context *ctx;
- struct file_lock *fl;
-
- ctx = smp_load_acquire(&inode->i_flctx);
- if (!ctx || list_empty_careful(&ctx->flc_posix))
- return 0;
-
- /*
- * Search the lock list for this inode for any POSIX locks.
- */
- spin_lock(&ctx->flc_lock);
- ret = 0;
- list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
- if (fl->fl_owner != current->files &&
- fl->fl_owner != file) {
- ret = -EAGAIN;
- break;
- }
- }
- spin_unlock(&ctx->flc_lock);
- return ret;
-}
-
-/**
- * locks_mandatory_area - Check for a conflicting lock
- * @inode: the file to check
- * @filp: how the file was opened (if it was)
- * @start: first byte in the file to check
- * @end: lastbyte in the file to check
- * @type: %F_WRLCK for a write lock, else %F_RDLCK
- *
- * Searches the inode's list of locks to find any POSIX locks which conflict.
- */
-int locks_mandatory_area(struct inode *inode, struct file *filp, loff_t start,
- loff_t end, unsigned char type)
-{
- struct file_lock fl;
- int error;
- bool sleep = false;
-
- locks_init_lock(&fl);
- fl.fl_pid = current->tgid;
- fl.fl_file = filp;
- fl.fl_flags = FL_POSIX | FL_ACCESS;
- if (filp && !(filp->f_flags & O_NONBLOCK))
- sleep = true;
- fl.fl_type = type;
- fl.fl_start = start;
- fl.fl_end = end;
-
- for (;;) {
- if (filp) {
- fl.fl_owner = filp;
- fl.fl_flags &= ~FL_SLEEP;
- error = posix_lock_inode(inode, &fl, NULL);
- if (!error)
- break;
- }
-
- if (sleep)
- fl.fl_flags |= FL_SLEEP;
- fl.fl_owner = current->files;
- error = posix_lock_inode(inode, &fl, NULL);
- if (error != FILE_LOCK_DEFERRED)
- break;
- error = wait_event_interruptible(fl.fl_wait,
- list_empty(&fl.fl_blocked_member));
- if (!error) {
- /*
- * If we've been sleeping someone might have
- * changed the permissions behind our back.
- */
- if (__mandatory_lock(inode))
- continue;
- }
-
- break;
- }
- locks_delete_block(&fl);
-
- return error;
-}
-EXPORT_SYMBOL(locks_mandatory_area);
-#endif /* CONFIG_MANDATORY_FILE_LOCKING */
-
static void lease_clear_pending(struct file_lock *fl, int arg)
{
switch (arg) {
@@ -2486,14 +2389,6 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
if (file_lock == NULL)
return -ENOLCK;
- /* Don't allow mandatory locks on files that may be memory mapped
- * and shared.
- */
- if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) {
- error = -EAGAIN;
- goto out;
- }
-
error = flock_to_posix_lock(filp, file_lock, flock);
if (error)
goto out;
@@ -2611,21 +2506,12 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
struct flock64 *flock)
{
struct file_lock *file_lock = locks_alloc_lock();
- struct inode *inode = locks_inode(filp);
struct file *f;
int error;
if (file_lock == NULL)
return -ENOLCK;
- /* Don't allow mandatory locks on files that may be memory mapped
- * and shared.
- */
- if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) {
- error = -EAGAIN;
- goto out;
- }
-
error = flock64_to_posix_lock(filp, file_lock, flock);
if (error)
goto out;
@@ -2857,8 +2743,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
seq_puts(f, "POSIX ");
seq_printf(f, " %s ",
- (inode == NULL) ? "*NOINODE*" :
- mandatory_lock(inode) ? "MANDATORY" : "ADVISORY ");
+ (inode == NULL) ? "*NOINODE*" : "ADVISORY ");
} else if (IS_FLOCK(fl)) {
if (fl->fl_type & LOCK_MAND) {
seq_puts(f, "FLOCK MSNFS ");
@@ -3056,10 +2941,12 @@ static int __init filelock_init(void)
int i;
flctx_cache = kmem_cache_create("file_lock_ctx",
- sizeof(struct file_lock_context), 0, SLAB_PANIC, NULL);
+ sizeof(struct file_lock_context), 0,
+ SLAB_PANIC | SLAB_ACCOUNT, NULL);
filelock_cache = kmem_cache_create("file_lock_cache",
- sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
+ sizeof(struct file_lock), 0,
+ SLAB_PANIC | SLAB_ACCOUNT, NULL);
for_each_possible_cpu(i) {
struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i);
diff --git a/fs/namei.c b/fs/namei.c
index bf6d8a738c59..95a881e0552b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -204,6 +204,14 @@ getname_flags(const char __user *filename, int flags, int *empty)
}
struct filename *
+getname_uflags(const char __user *filename, int uflags)
+{
+ int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
+
+ return getname_flags(filename, flags, NULL);
+}
+
+struct filename *
getname(const char __user * filename)
{
return getname_flags(filename, 0, NULL);
@@ -247,6 +255,9 @@ getname_kernel(const char * filename)
void putname(struct filename *name)
{
+ if (IS_ERR_OR_NULL(name))
+ return;
+
BUG_ON(name->refcnt <= 0);
if (--name->refcnt > 0)
@@ -2456,7 +2467,7 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path
return err;
}
-int filename_lookup(int dfd, struct filename *name, unsigned flags,
+static int __filename_lookup(int dfd, struct filename *name, unsigned flags,
struct path *path, struct path *root)
{
int retval;
@@ -2474,6 +2485,14 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags,
audit_inode(name, path->dentry,
flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0);
restore_nameidata();
+ return retval;
+}
+
+int filename_lookup(int dfd, struct filename *name, unsigned flags,
+ struct path *path, struct path *root)
+{
+ int retval = __filename_lookup(dfd, name, flags, path, root);
+
putname(name);
return retval;
}
@@ -2495,7 +2514,7 @@ static int path_parentat(struct nameidata *nd, unsigned flags,
return err;
}
-static struct filename *filename_parentat(int dfd, struct filename *name,
+static int __filename_parentat(int dfd, struct filename *name,
unsigned int flags, struct path *parent,
struct qstr *last, int *type)
{
@@ -2503,7 +2522,7 @@ static struct filename *filename_parentat(int dfd, struct filename *name,
struct nameidata nd;
if (IS_ERR(name))
- return name;
+ return PTR_ERR(name);
set_nameidata(&nd, dfd, name, NULL);
retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
if (unlikely(retval == -ECHILD))
@@ -2514,29 +2533,34 @@ static struct filename *filename_parentat(int dfd, struct filename *name,
*last = nd.last;
*type = nd.last_type;
audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
- } else {
- putname(name);
- name = ERR_PTR(retval);
}
restore_nameidata();
- return name;
+ return retval;
+}
+
+static int filename_parentat(int dfd, struct filename *name,
+ unsigned int flags, struct path *parent,
+ struct qstr *last, int *type)
+{
+ int retval = __filename_parentat(dfd, name, flags, parent, last, type);
+
+ putname(name);
+ return retval;
}
/* does lookup, returns the object with parent locked */
struct dentry *kern_path_locked(const char *name, struct path *path)
{
- struct filename *filename;
struct dentry *d;
struct qstr last;
- int type;
+ int type, error;
- filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
+ error = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
&last, &type);
- if (IS_ERR(filename))
- return ERR_CAST(filename);
+ if (error)
+ return ERR_PTR(error);
if (unlikely(type != LAST_NORM)) {
path_put(path);
- putname(filename);
return ERR_PTR(-EINVAL);
}
inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
@@ -2545,7 +2569,6 @@ struct dentry *kern_path_locked(const char *name, struct path *path)
inode_unlock(path->dentry->d_inode);
path_put(path);
}
- putname(filename);
return d;
}
@@ -2575,8 +2598,9 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
}
EXPORT_SYMBOL(vfs_path_lookup);
-static int lookup_one_len_common(const char *name, struct dentry *base,
- int len, struct qstr *this)
+static int lookup_one_common(struct user_namespace *mnt_userns,
+ const char *name, struct dentry *base, int len,
+ struct qstr *this)
{
this->name = name;
this->len = len;
@@ -2604,7 +2628,7 @@ static int lookup_one_len_common(const char *name, struct dentry *base,
return err;
}
- return inode_permission(&init_user_ns, base->d_inode, MAY_EXEC);
+ return inode_permission(mnt_userns, base->d_inode, MAY_EXEC);
}
/**
@@ -2628,7 +2652,7 @@ struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len
WARN_ON_ONCE(!inode_is_locked(base->d_inode));
- err = lookup_one_len_common(name, base, len, &this);
+ err = lookup_one_common(&init_user_ns, name, base, len, &this);
if (err)
return ERR_PTR(err);
@@ -2655,7 +2679,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
WARN_ON_ONCE(!inode_is_locked(base->d_inode));
- err = lookup_one_len_common(name, base, len, &this);
+ err = lookup_one_common(&init_user_ns, name, base, len, &this);
if (err)
return ERR_PTR(err);
@@ -2665,6 +2689,36 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
EXPORT_SYMBOL(lookup_one_len);
/**
+ * lookup_one - filesystem helper to lookup single pathname component
+ * @mnt_userns: user namespace of the mount the lookup is performed from
+ * @name: pathname component to lookup
+ * @base: base directory to lookup from
+ * @len: maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * The caller must hold base->i_mutex.
+ */
+struct dentry *lookup_one(struct user_namespace *mnt_userns, const char *name,
+ struct dentry *base, int len)
+{
+ struct dentry *dentry;
+ struct qstr this;
+ int err;
+
+ WARN_ON_ONCE(!inode_is_locked(base->d_inode));
+
+ err = lookup_one_common(mnt_userns, name, base, len, &this);
+ if (err)
+ return ERR_PTR(err);
+
+ dentry = lookup_dcache(&this, base, 0);
+ return dentry ? dentry : __lookup_slow(&this, base, 0);
+}
+EXPORT_SYMBOL(lookup_one);
+
+/**
* lookup_one_len_unlocked - filesystem helper to lookup single pathname component
* @name: pathname component to lookup
* @base: base directory to lookup from
@@ -2683,7 +2737,7 @@ struct dentry *lookup_one_len_unlocked(const char *name,
int err;
struct dentry *ret;
- err = lookup_one_len_common(name, base, len, &this);
+ err = lookup_one_common(&init_user_ns, name, base, len, &this);
if (err)
return ERR_PTR(err);
@@ -3023,9 +3077,7 @@ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp)
/*
* Refuse to truncate files with mandatory locks held on them.
*/
- error = locks_verify_locked(filp);
- if (!error)
- error = security_path_truncate(path);
+ error = security_path_truncate(path);
if (!error) {
error = do_truncate(mnt_userns, path->dentry, 0,
ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
@@ -3566,7 +3618,7 @@ struct file *do_file_open_root(const struct path *root,
return file;
}
-static struct dentry *filename_create(int dfd, struct filename *name,
+static struct dentry *__filename_create(int dfd, struct filename *name,
struct path *path, unsigned int lookup_flags)
{
struct dentry *dentry = ERR_PTR(-EEXIST);
@@ -3582,9 +3634,9 @@ static struct dentry *filename_create(int dfd, struct filename *name,
*/
lookup_flags &= LOOKUP_REVAL;
- name = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
- if (IS_ERR(name))
- return ERR_CAST(name);
+ error = __filename_parentat(dfd, name, lookup_flags, path, &last, &type);
+ if (error)
+ return ERR_PTR(error);
/*
* Yucky last component or no last component at all?
@@ -3622,7 +3674,6 @@ static struct dentry *filename_create(int dfd, struct filename *name,
error = err2;
goto fail;
}
- putname(name);
return dentry;
fail:
dput(dentry);
@@ -3633,10 +3684,18 @@ unlock:
mnt_drop_write(path->mnt);
out:
path_put(path);
- putname(name);
return dentry;
}
+static inline struct dentry *filename_create(int dfd, struct filename *name,
+ struct path *path, unsigned int lookup_flags)
+{
+ struct dentry *res = __filename_create(dfd, name, path, lookup_flags);
+
+ putname(name);
+ return res;
+}
+
struct dentry *kern_path_create(int dfd, const char *pathname,
struct path *path, unsigned int lookup_flags)
{
@@ -3725,7 +3784,7 @@ static int may_mknod(umode_t mode)
}
}
-static long do_mknodat(int dfd, const char __user *filename, umode_t mode,
+static int do_mknodat(int dfd, struct filename *name, umode_t mode,
unsigned int dev)
{
struct user_namespace *mnt_userns;
@@ -3736,17 +3795,18 @@ static long do_mknodat(int dfd, const char __user *filename, umode_t mode,
error = may_mknod(mode);
if (error)
- return error;
+ goto out1;
retry:
- dentry = user_path_create(dfd, filename, &path, lookup_flags);
+ dentry = __filename_create(dfd, name, &path, lookup_flags);
+ error = PTR_ERR(dentry);
if (IS_ERR(dentry))
- return PTR_ERR(dentry);
+ goto out1;
if (!IS_POSIXACL(path.dentry->d_inode))
mode &= ~current_umask();
error = security_path_mknod(&path, dentry, mode, dev);
if (error)
- goto out;
+ goto out2;
mnt_userns = mnt_user_ns(path.mnt);
switch (mode & S_IFMT) {
@@ -3765,24 +3825,26 @@ retry:
dentry, mode, 0);
break;
}
-out:
+out2:
done_path_create(&path, dentry);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
+out1:
+ putname(name);
return error;
}
SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
unsigned int, dev)
{
- return do_mknodat(dfd, filename, mode, dev);
+ return do_mknodat(dfd, getname(filename), mode, dev);
}
SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
- return do_mknodat(AT_FDCWD, filename, mode, dev);
+ return do_mknodat(AT_FDCWD, getname(filename), mode, dev);
}
/**
@@ -3827,7 +3889,7 @@ int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
}
EXPORT_SYMBOL(vfs_mkdir);
-static long do_mkdirat(int dfd, const char __user *pathname, umode_t mode)
+int do_mkdirat(int dfd, struct filename *name, umode_t mode)
{
struct dentry *dentry;
struct path path;
@@ -3835,9 +3897,10 @@ static long do_mkdirat(int dfd, const char __user *pathname, umode_t mode)
unsigned int lookup_flags = LOOKUP_DIRECTORY;
retry:
- dentry = user_path_create(dfd, pathname, &path, lookup_flags);
+ dentry = __filename_create(dfd, name, &path, lookup_flags);
+ error = PTR_ERR(dentry);
if (IS_ERR(dentry))
- return PTR_ERR(dentry);
+ goto out_putname;
if (!IS_POSIXACL(path.dentry->d_inode))
mode &= ~current_umask();
@@ -3853,17 +3916,19 @@ retry:
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
+out_putname:
+ putname(name);
return error;
}
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
{
- return do_mkdirat(dfd, pathname, mode);
+ return do_mkdirat(dfd, getname(pathname), mode);
}
SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
{
- return do_mkdirat(AT_FDCWD, pathname, mode);
+ return do_mkdirat(AT_FDCWD, getname(pathname), mode);
}
/**
@@ -3921,62 +3986,62 @@ out:
}
EXPORT_SYMBOL(vfs_rmdir);
-long do_rmdir(int dfd, struct filename *name)
+int do_rmdir(int dfd, struct filename *name)
{
struct user_namespace *mnt_userns;
- int error = 0;
+ int error;
struct dentry *dentry;
struct path path;
struct qstr last;
int type;
unsigned int lookup_flags = 0;
retry:
- name = filename_parentat(dfd, name, lookup_flags,
- &path, &last, &type);
- if (IS_ERR(name))
- return PTR_ERR(name);
+ error = __filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
+ if (error)
+ goto exit1;
switch (type) {
case LAST_DOTDOT:
error = -ENOTEMPTY;
- goto exit1;
+ goto exit2;
case LAST_DOT:
error = -EINVAL;
- goto exit1;
+ goto exit2;
case LAST_ROOT:
error = -EBUSY;
- goto exit1;
+ goto exit2;
}
error = mnt_want_write(path.mnt);
if (error)
- goto exit1;
+ goto exit2;
inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
dentry = __lookup_hash(&last, path.dentry, lookup_flags);
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
- goto exit2;
+ goto exit3;
if (!dentry->d_inode) {
error = -ENOENT;
- goto exit3;
+ goto exit4;
}
error = security_path_rmdir(&path, dentry);
if (error)
- goto exit3;
+ goto exit4;
mnt_userns = mnt_user_ns(path.mnt);
error = vfs_rmdir(mnt_userns, path.dentry->d_inode, dentry);
-exit3:
+exit4:
dput(dentry);
-exit2:
+exit3:
inode_unlock(path.dentry->d_inode);
mnt_drop_write(path.mnt);
-exit1:
+exit2:
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
+exit1:
putname(name);
return error;
}
@@ -4024,7 +4089,9 @@ int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir,
return -EPERM;
inode_lock(target);
- if (is_local_mountpoint(dentry))
+ if (IS_SWAPFILE(target))
+ error = -EPERM;
+ else if (is_local_mountpoint(dentry))
error = -EBUSY;
else {
error = security_inode_unlink(dir, dentry);
@@ -4059,7 +4126,7 @@ EXPORT_SYMBOL(vfs_unlink);
* writeout happening, and we don't want to prevent access to the directory
* while waiting on the I/O.
*/
-long do_unlinkat(int dfd, struct filename *name)
+int do_unlinkat(int dfd, struct filename *name)
{
int error;
struct dentry *dentry;
@@ -4070,17 +4137,17 @@ long do_unlinkat(int dfd, struct filename *name)
struct inode *delegated_inode = NULL;
unsigned int lookup_flags = 0;
retry:
- name = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
- if (IS_ERR(name))
- return PTR_ERR(name);
+ error = __filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
+ if (error)
+ goto exit1;
error = -EISDIR;
if (type != LAST_NORM)
- goto exit1;
+ goto exit2;
error = mnt_want_write(path.mnt);
if (error)
- goto exit1;
+ goto exit2;
retry_deleg:
inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
dentry = __lookup_hash(&last, path.dentry, lookup_flags);
@@ -4097,11 +4164,11 @@ retry_deleg:
ihold(inode);
error = security_path_unlink(&path, dentry);
if (error)
- goto exit2;
+ goto exit3;
mnt_userns = mnt_user_ns(path.mnt);
error = vfs_unlink(mnt_userns, path.dentry->d_inode, dentry,
&delegated_inode);
-exit2:
+exit3:
dput(dentry);
}
inode_unlock(path.dentry->d_inode);
@@ -4114,13 +4181,14 @@ exit2:
goto retry_deleg;
}
mnt_drop_write(path.mnt);
-exit1:
+exit2:
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
inode = NULL;
goto retry;
}
+exit1:
putname(name);
return error;
@@ -4131,7 +4199,7 @@ slashes:
error = -EISDIR;
else
error = -ENOTDIR;
- goto exit2;
+ goto exit3;
}
SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
@@ -4186,23 +4254,22 @@ int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
}
EXPORT_SYMBOL(vfs_symlink);
-static long do_symlinkat(const char __user *oldname, int newdfd,
- const char __user *newname)
+int do_symlinkat(struct filename *from, int newdfd, struct filename *to)
{
int error;
- struct filename *from;
struct dentry *dentry;
struct path path;
unsigned int lookup_flags = 0;
- from = getname(oldname);
- if (IS_ERR(from))
- return PTR_ERR(from);
+ if (IS_ERR(from)) {
+ error = PTR_ERR(from);
+ goto out_putnames;
+ }
retry:
- dentry = user_path_create(newdfd, newname, &path, lookup_flags);
+ dentry = __filename_create(newdfd, to, &path, lookup_flags);
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
- goto out_putname;
+ goto out_putnames;
error = security_path_symlink(&path, dentry, from->name);
if (!error) {
@@ -4217,7 +4284,8 @@ retry:
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
-out_putname:
+out_putnames:
+ putname(to);
putname(from);
return error;
}
@@ -4225,12 +4293,12 @@ out_putname:
SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
int, newdfd, const char __user *, newname)
{
- return do_symlinkat(oldname, newdfd, newname);
+ return do_symlinkat(getname(oldname), newdfd, getname(newname));
}
SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
{
- return do_symlinkat(oldname, AT_FDCWD, newname);
+ return do_symlinkat(getname(oldname), AT_FDCWD, getname(newname));
}
/**
@@ -4331,8 +4399,8 @@ EXPORT_SYMBOL(vfs_link);
* with linux 2.0, and to avoid hard-linking to directories
* and other special files. --ADM
*/
-static int do_linkat(int olddfd, const char __user *oldname, int newdfd,
- const char __user *newname, int flags)
+int do_linkat(int olddfd, struct filename *old, int newdfd,
+ struct filename *new, int flags)
{
struct user_namespace *mnt_userns;
struct dentry *new_dentry;
@@ -4341,31 +4409,32 @@ static int do_linkat(int olddfd, const char __user *oldname, int newdfd,
int how = 0;
int error;
- if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
- return -EINVAL;
+ if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) {
+ error = -EINVAL;
+ goto out_putnames;
+ }
/*
* To use null names we require CAP_DAC_READ_SEARCH.
* This ensures that not everyone will be able to create
* hard links using the passed file descriptor.
*/
- if (flags & AT_EMPTY_PATH) {
- if (!capable(CAP_DAC_READ_SEARCH))
- return -ENOENT;
- how = LOOKUP_EMPTY;
+ if (flags & AT_EMPTY_PATH && !capable(CAP_DAC_READ_SEARCH)) {
+ error = -ENOENT;
+ goto out_putnames;
}
if (flags & AT_SYMLINK_FOLLOW)
how |= LOOKUP_FOLLOW;
retry:
- error = user_path_at(olddfd, oldname, how, &old_path);
+ error = __filename_lookup(olddfd, old, how, &old_path, NULL);
if (error)
- return error;
+ goto out_putnames;
- new_dentry = user_path_create(newdfd, newname, &new_path,
+ new_dentry = __filename_create(newdfd, new, &new_path,
(how & LOOKUP_REVAL));
error = PTR_ERR(new_dentry);
if (IS_ERR(new_dentry))
- goto out;
+ goto out_putpath;
error = -EXDEV;
if (old_path.mnt != new_path.mnt)
@@ -4393,8 +4462,11 @@ out_dput:
how |= LOOKUP_REVAL;
goto retry;
}
-out:
+out_putpath:
path_put(&old_path);
+out_putnames:
+ putname(old);
+ putname(new);
return error;
}
@@ -4402,12 +4474,13 @@ out:
SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
int, newdfd, const char __user *, newname, int, flags)
{
- return do_linkat(olddfd, oldname, newdfd, newname, flags);
+ return do_linkat(olddfd, getname_uflags(oldname, flags),
+ newdfd, getname(newname), flags);
}
SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
{
- return do_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
+ return do_linkat(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0);
}
/**
@@ -4526,6 +4599,10 @@ int vfs_rename(struct renamedata *rd)
else if (target)
inode_lock(target);
+ error = -EPERM;
+ if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target)))
+ goto out;
+
error = -EBUSY;
if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
goto out;
@@ -4602,29 +4679,25 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
int error = -EINVAL;
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
- goto put_both;
+ goto put_names;
if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
(flags & RENAME_EXCHANGE))
- goto put_both;
+ goto put_names;
if (flags & RENAME_EXCHANGE)
target_flags = 0;
retry:
- from = filename_parentat(olddfd, from, lookup_flags, &old_path,
+ error = __filename_parentat(olddfd, from, lookup_flags, &old_path,
&old_last, &old_type);
- if (IS_ERR(from)) {
- error = PTR_ERR(from);
- goto put_new;
- }
+ if (error)
+ goto put_names;
- to = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
+ error = __filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
&new_type);
- if (IS_ERR(to)) {
- error = PTR_ERR(to);
+ if (error)
goto exit1;
- }
error = -EXDEV;
if (old_path.mnt != new_path.mnt)
@@ -4727,12 +4800,9 @@ exit1:
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
-put_both:
- if (!IS_ERR(from))
- putname(from);
-put_new:
- if (!IS_ERR(to))
- putname(to);
+put_names:
+ putname(from);
+ putname(to);
return error;
}
diff --git a/fs/namespace.c b/fs/namespace.c
index 97adcb5ab5d5..659a8f39c61a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -203,7 +203,8 @@ static struct mount *alloc_vfsmnt(const char *name)
goto out_free_cache;
if (name) {
- mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL);
+ mnt->mnt_devname = kstrdup_const(name,
+ GFP_KERNEL_ACCOUNT);
if (!mnt->mnt_devname)
goto out_free_id;
}
@@ -1715,22 +1716,14 @@ static inline bool may_mount(void)
return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
}
-#ifdef CONFIG_MANDATORY_FILE_LOCKING
-static bool may_mandlock(void)
+static void warn_mandlock(void)
{
- pr_warn_once("======================================================\n"
- "WARNING: the mand mount option is being deprecated and\n"
- " will be removed in v5.15!\n"
- "======================================================\n");
- return capable(CAP_SYS_ADMIN);
+ pr_warn_once("=======================================================\n"
+ "WARNING: The mand mount option has been deprecated and\n"
+ " and is ignored by this kernel. Remove the mand\n"
+ " option from the mount to silence this warning.\n"
+ "=======================================================\n");
}
-#else
-static inline bool may_mandlock(void)
-{
- pr_warn("VFS: \"mand\" mount option not supported");
- return false;
-}
-#endif
static int can_umount(const struct path *path, int flags)
{
@@ -2702,6 +2695,78 @@ out:
return ret;
}
+static int do_set_group(struct path *from_path, struct path *to_path)
+{
+ struct mount *from, *to;
+ int err;
+
+ from = real_mount(from_path->mnt);
+ to = real_mount(to_path->mnt);
+
+ namespace_lock();
+
+ err = -EINVAL;
+ /* To and From must be mounted */
+ if (!is_mounted(&from->mnt))
+ goto out;
+ if (!is_mounted(&to->mnt))
+ goto out;
+
+ err = -EPERM;
+ /* We should be allowed to modify mount namespaces of both mounts */
+ if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN))
+ goto out;
+ if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN))
+ goto out;
+
+ err = -EINVAL;
+ /* To and From paths should be mount roots */
+ if (from_path->dentry != from_path->mnt->mnt_root)
+ goto out;
+ if (to_path->dentry != to_path->mnt->mnt_root)
+ goto out;
+
+ /* Setting sharing groups is only allowed across same superblock */
+ if (from->mnt.mnt_sb != to->mnt.mnt_sb)
+ goto out;
+
+ /* From mount root should be wider than To mount root */
+ if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root))
+ goto out;
+
+ /* From mount should not have locked children in place of To's root */
+ if (has_locked_children(from, to->mnt.mnt_root))
+ goto out;
+
+ /* Setting sharing groups is only allowed on private mounts */
+ if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to))
+ goto out;
+
+ /* From should not be private */
+ if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from))
+ goto out;
+
+ if (IS_MNT_SLAVE(from)) {
+ struct mount *m = from->mnt_master;
+
+ list_add(&to->mnt_slave, &m->mnt_slave_list);
+ to->mnt_master = m;
+ }
+
+ if (IS_MNT_SHARED(from)) {
+ to->mnt_group_id = from->mnt_group_id;
+ list_add(&to->mnt_share, &from->mnt_share);
+ lock_mount_hash();
+ set_mnt_shared(to);
+ unlock_mount_hash();
+ }
+
+ err = 0;
+out:
+ namespace_unlock();
+ return err;
+}
+
static int do_move_mount(struct path *old_path, struct path *new_path)
{
struct mnt_namespace *ns;
@@ -3197,8 +3262,8 @@ int path_mount(const char *dev_name, struct path *path,
return ret;
if (!may_mount())
return -EPERM;
- if ((flags & SB_MANDLOCK) && !may_mandlock())
- return -EPERM;
+ if (flags & SB_MANDLOCK)
+ warn_mandlock();
/* Default to relatime unless overridden */
if (!(flags & MS_NOATIME))
@@ -3306,7 +3371,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
if (!ucounts)
return ERR_PTR(-ENOSPC);
- new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
+ new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL_ACCOUNT);
if (!new_ns) {
dec_mnt_namespaces(ucounts);
return ERR_PTR(-ENOMEM);
@@ -3581,9 +3646,8 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
goto err_unlock;
- ret = -EPERM;
- if ((fc->sb_flags & SB_MANDLOCK) && !may_mandlock())
- goto err_unlock;
+ if (fc->sb_flags & SB_MANDLOCK)
+ warn_mandlock();
newmount.mnt = vfs_create_mount(fc);
if (IS_ERR(newmount.mnt)) {
@@ -3687,7 +3751,10 @@ SYSCALL_DEFINE5(move_mount,
if (ret < 0)
goto out_to;
- ret = do_move_mount(&from_path, &to_path);
+ if (flags & MOVE_MOUNT_SET_GROUP)
+ ret = do_set_group(&from_path, &to_path);
+ else
+ ret = do_move_mount(&from_path, &to_path);
out_to:
path_put(&to_path);
@@ -4240,7 +4307,7 @@ void __init mnt_init(void)
int err;
mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
- 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+ 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
mount_hashtable = alloc_large_system_hash("Mount-cache",
sizeof(struct hlist_head),
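
Aside (illustrative, not part of the patch): the do_set_group() helper added above backs the new MOVE_MOUNT_SET_GROUP case of move_mount(2), which copies the propagation group of one mount root onto another mount root on the same superblock. A minimal user-space sketch of how it might be invoked follows; the two mount paths are hypothetical, and the fallback #define is only an assumption for headers that predate the flag.

/* Illustrative sketch only -- not part of the patch above. */
#define _GNU_SOURCE
#include <fcntl.h>		/* AT_FDCWD */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef MOVE_MOUNT_SET_GROUP
#define MOVE_MOUNT_SET_GROUP 0x00000100	/* assumed value for older headers */
#endif

int main(void)
{
	/*
	 * Copy the propagation group of the mount at /mnt/a onto /mnt/b.
	 * Both paths are hypothetical mount roots on the same superblock.
	 */
	if (syscall(SYS_move_mount, AT_FDCWD, "/mnt/a",
		    AT_FDCWD, "/mnt/b", MOVE_MOUNT_SET_GROUP) == -1) {
		perror("move_mount");
		return 1;
	}
	return 0;
}
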
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 7817ad94a6ba..86d856de1389 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -429,6 +429,8 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
*/
static int nfs_callback_authenticate(struct svc_rqst *rqstp)
{
+ rqstp->rq_auth_stat = rpc_autherr_badcred;
+
switch (rqstp->rq_authop->flavour) {
case RPC_AUTH_NULL:
if (rqstp->rq_proc != CB_NULL)
@@ -439,6 +441,8 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
if (svc_is_backchannel(rqstp))
return SVC_DENIED;
}
+
+ rqstp->rq_auth_stat = rpc_auth_ok;
return SVC_OK;
}
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index c5348ba81129..4c48d85f6517 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -63,11 +63,10 @@ static __be32 nfs4_callback_null(struct svc_rqst *rqstp)
return htonl(NFS4_OK);
}
-static int nfs4_decode_void(struct svc_rqst *rqstp, __be32 *p)
-{
- return xdr_argsize_check(rqstp, p);
-}
-
+/*
+ * svc_process_common() looks for an XDR encoder to know when
+ * not to drop a Reply.
+ */
static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p)
{
return xdr_ressize_check(rqstp, p);
@@ -864,17 +863,16 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
}
static __be32 process_op(int nop, struct svc_rqst *rqstp,
- struct xdr_stream *xdr_in, void *argp,
- struct xdr_stream *xdr_out, void *resp,
- struct cb_process_state *cps)
+ struct cb_process_state *cps)
{
+ struct xdr_stream *xdr_out = &rqstp->rq_res_stream;
struct callback_op *op = &callback_ops[0];
unsigned int op_nr;
__be32 status;
long maxlen;
__be32 res;
- status = decode_op_hdr(xdr_in, &op_nr);
+ status = decode_op_hdr(&rqstp->rq_arg_stream, &op_nr);
if (unlikely(status))
return status;
@@ -904,9 +902,11 @@ static __be32 process_op(int nop, struct svc_rqst *rqstp,
maxlen = xdr_out->end - xdr_out->p;
if (maxlen > 0 && maxlen < PAGE_SIZE) {
- status = op->decode_args(rqstp, xdr_in, argp);
+ status = op->decode_args(rqstp, &rqstp->rq_arg_stream,
+ rqstp->rq_argp);
if (likely(status == 0))
- status = op->process_op(argp, resp, cps);
+ status = op->process_op(rqstp->rq_argp, rqstp->rq_resp,
+ cps);
} else
status = htonl(NFS4ERR_RESOURCE);
@@ -915,7 +915,7 @@ encode_hdr:
if (unlikely(res))
return res;
if (op->encode_res != NULL && status == 0)
- status = op->encode_res(rqstp, xdr_out, resp);
+ status = op->encode_res(rqstp, xdr_out, rqstp->rq_resp);
return status;
}
@@ -926,22 +926,15 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp)
{
struct cb_compound_hdr_arg hdr_arg = { 0 };
struct cb_compound_hdr_res hdr_res = { NULL };
- struct xdr_stream xdr_in, xdr_out;
- __be32 *p, status;
struct cb_process_state cps = {
.drc_status = 0,
.clp = NULL,
.net = SVC_NET(rqstp),
};
unsigned int nops = 0;
+ __be32 status;
- xdr_init_decode(&xdr_in, &rqstp->rq_arg,
- rqstp->rq_arg.head[0].iov_base, NULL);
-
- p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len);
- xdr_init_encode(&xdr_out, &rqstp->rq_res, p, NULL);
-
- status = decode_compound_hdr_arg(&xdr_in, &hdr_arg);
+ status = decode_compound_hdr_arg(&rqstp->rq_arg_stream, &hdr_arg);
if (status == htonl(NFS4ERR_RESOURCE))
return rpc_garbage_args;
@@ -961,15 +954,13 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp)
cps.minorversion = hdr_arg.minorversion;
hdr_res.taglen = hdr_arg.taglen;
hdr_res.tag = hdr_arg.tag;
- if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) {
+ if (encode_compound_hdr_res(&rqstp->rq_res_stream, &hdr_res) != 0) {
if (cps.clp)
nfs_put_client(cps.clp);
return rpc_system_err;
}
while (status == 0 && nops != hdr_arg.nops) {
- status = process_op(nops, rqstp, &xdr_in,
- rqstp->rq_argp, &xdr_out, rqstp->rq_resp,
- &cps);
+ status = process_op(nops, rqstp, &cps);
nops++;
}
@@ -988,7 +979,20 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp)
out_invalidcred:
pr_warn_ratelimited("NFS: NFSv4 callback contains invalid cred\n");
- return svc_return_autherr(rqstp, rpc_autherr_badcred);
+ rqstp->rq_auth_stat = rpc_autherr_badcred;
+ return rpc_success;
+}
+
+static int
+nfs_callback_dispatch(struct svc_rqst *rqstp, __be32 *statp)
+{
+ const struct svc_procedure *procp = rqstp->rq_procinfo;
+
+ svcxdr_init_decode(rqstp);
+ svcxdr_init_encode(rqstp);
+
+ *statp = procp->pc_func(rqstp);
+ return 1;
}
/*
@@ -1057,7 +1061,6 @@ static struct callback_op callback_ops[] = {
static const struct svc_procedure nfs4_callback_procedures1[] = {
[CB_NULL] = {
.pc_func = nfs4_callback_null,
- .pc_decode = nfs4_decode_void,
.pc_encode = nfs4_encode_void,
.pc_xdrressize = 1,
.pc_name = "NULL",
@@ -1079,7 +1082,7 @@ const struct svc_version nfs4_callback_version1 = {
.vs_proc = nfs4_callback_procedures1,
.vs_count = nfs4_callback_count1,
.vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
- .vs_dispatch = NULL,
+ .vs_dispatch = nfs_callback_dispatch,
.vs_hidden = true,
.vs_need_cong_ctrl = true,
};
@@ -1091,7 +1094,7 @@ const struct svc_version nfs4_callback_version4 = {
.vs_proc = nfs4_callback_procedures1,
.vs_count = nfs4_callback_count4,
.vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
- .vs_dispatch = NULL,
+ .vs_dispatch = nfs_callback_dispatch,
.vs_hidden = true,
.vs_need_cong_ctrl = true,
};
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 330f65727c45..23e165d5ec9c 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -179,6 +179,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
clp->cl_proto = cl_init->proto;
clp->cl_nconnect = cl_init->nconnect;
+ clp->cl_max_connect = cl_init->max_connect ? cl_init->max_connect : 1;
clp->cl_net = get_net(cl_init->net);
clp->cl_principal = "*";
@@ -540,6 +541,7 @@ int nfs_create_rpc_client(struct nfs_client *clp,
clnt->cl_principal = clp->cl_principal;
clp->cl_rpcclient = clnt;
+ clnt->cl_max_connect = clp->cl_max_connect;
return 0;
}
EXPORT_SYMBOL_GPL(nfs_create_rpc_client);
diff --git a/fs/nfs/export.c b/fs/nfs/export.c
index 37a1a88df771..d772c20bbfd1 100644
--- a/fs/nfs/export.c
+++ b/fs/nfs/export.c
@@ -180,5 +180,5 @@ const struct export_operations nfs_export_ops = {
.fetch_iversion = nfs_fetch_iversion,
.flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK|
EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS|
- EXPORT_OP_NOATOMIC_ATTR,
+ EXPORT_OP_NOATOMIC_ATTR|EXPORT_OP_SYNC_LOCKS,
};
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 1fef107961bc..aa353fd58240 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -806,9 +806,8 @@ int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
nfs_inc_stats(inode, NFSIOS_VFSLOCK);
- /* No mandatory locks over NFS */
- if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
- goto out_err;
+ if (fl->fl_flags & FL_RECLAIM)
+ return -ENOGRACE;
if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL)
is_local = 1;
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index d95c9a39bc70..0d444a90f513 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -60,6 +60,7 @@ enum nfs_param {
Opt_mountvers,
Opt_namelen,
Opt_nconnect,
+ Opt_max_connect,
Opt_port,
Opt_posix,
Opt_proto,
@@ -158,6 +159,7 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = {
fsparam_u32 ("mountvers", Opt_mountvers),
fsparam_u32 ("namlen", Opt_namelen),
fsparam_u32 ("nconnect", Opt_nconnect),
+ fsparam_u32 ("max_connect", Opt_max_connect),
fsparam_string("nfsvers", Opt_vers),
fsparam_u32 ("port", Opt_port),
fsparam_flag_no("posix", Opt_posix),
@@ -770,6 +772,11 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
goto out_of_bounds;
ctx->nfs_server.nconnect = result.uint_32;
break;
+ case Opt_max_connect:
+ if (result.uint_32 < 1 || result.uint_32 > NFS_MAX_TRANSPORTS)
+ goto out_of_bounds;
+ ctx->nfs_server.max_connect = result.uint_32;
+ break;
case Opt_lookupcache:
switch (result.uint_32) {
case Opt_lookupcache_all:
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index a36af04188c2..66fc936834f2 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -67,6 +67,7 @@ struct nfs_client_initdata {
int proto;
u32 minorversion;
unsigned int nconnect;
+ unsigned int max_connect;
struct net *net;
const struct rpc_timeout *timeparms;
const struct cred *cred;
@@ -121,6 +122,7 @@ struct nfs_fs_context {
int port;
unsigned short protocol;
unsigned short nconnect;
+ unsigned short max_connect;
unsigned short export_path_len;
} nfs_server;
diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h
index c8a192802dda..03a4e679fd99 100644
--- a/fs/nfs/nfs3_fs.h
+++ b/fs/nfs/nfs3_fs.h
@@ -11,7 +11,7 @@
* nfs3acl.c
*/
#ifdef CONFIG_NFS_V3_ACL
-extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type);
+extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu);
extern int nfs3_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct posix_acl *acl, int type);
extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 9ec560aa4a50..93de0b58647a 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -44,7 +44,7 @@ static void nfs3_abort_get_acl(struct posix_acl **p)
cmpxchg(p, sentinel, ACL_NOT_CACHED);
}
-struct posix_acl *nfs3_get_acl(struct inode *inode, int type)
+struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu)
{
struct nfs_server *server = NFS_SERVER(inode);
struct page *pages[NFSACL_MAXPAGES] = { };
@@ -62,6 +62,9 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type)
};
int status, count;
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
if (!nfs_server_capable(inode, NFS_CAP_ACLS))
return ERR_PTR(-EOPNOTSUPP);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 2299446b3b89..f7524310ddf4 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -49,8 +49,7 @@ nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode)
{
if (task->tk_status != -EJUKEBOX)
return 0;
- if (task->tk_status == -EJUKEBOX)
- nfs_inc_stats(inode, NFSIOS_DELAY);
+ nfs_inc_stats(inode, NFSIOS_DELAY);
task->tk_status = 0;
rpc_restart_call(task);
rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 28431acd1230..af57332503be 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -402,6 +402,33 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
return nfs4_init_callback(clp);
}
+static void nfs4_add_trunk(struct nfs_client *clp, struct nfs_client *old)
+{
+ struct sockaddr_storage clp_addr, old_addr;
+ struct sockaddr *clp_sap = (struct sockaddr *)&clp_addr;
+ struct sockaddr *old_sap = (struct sockaddr *)&old_addr;
+ size_t clp_salen;
+ struct xprt_create xprt_args = {
+ .ident = old->cl_proto,
+ .net = old->cl_net,
+ .servername = old->cl_hostname,
+ };
+
+ if (clp->cl_proto != old->cl_proto)
+ return;
+ clp_salen = rpc_peeraddr(clp->cl_rpcclient, clp_sap, sizeof(clp_addr));
+ rpc_peeraddr(old->cl_rpcclient, old_sap, sizeof(old_addr));
+
+ if (clp_addr.ss_family != old_addr.ss_family)
+ return;
+
+ xprt_args.dstaddr = clp_sap;
+ xprt_args.addrlen = clp_salen;
+
+ rpc_clnt_add_xprt(old->cl_rpcclient, &xprt_args,
+ rpc_clnt_test_and_add_xprt, NULL);
+}
+
/**
* nfs4_init_client - Initialise an NFS4 client record
*
@@ -436,6 +463,8 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
* won't try to use it.
*/
nfs_mark_client_ready(clp, -EPERM);
+ if (old->cl_mvops->session_trunk)
+ nfs4_add_trunk(clp, old);
}
clear_bit(NFS_CS_TSM_POSSIBLE, &clp->cl_flags);
nfs_put_client(clp);
@@ -865,6 +894,7 @@ static int nfs4_set_client(struct nfs_server *server,
const char *ip_addr,
int proto, const struct rpc_timeout *timeparms,
u32 minorversion, unsigned int nconnect,
+ unsigned int max_connect,
struct net *net)
{
struct nfs_client_initdata cl_init = {
@@ -883,6 +913,8 @@ static int nfs4_set_client(struct nfs_server *server,
if (minorversion == 0)
__set_bit(NFS_CS_REUSEPORT, &cl_init.init_flags);
+ else
+ cl_init.max_connect = max_connect;
if (proto == XPRT_TRANSPORT_TCP)
cl_init.nconnect = nconnect;
@@ -952,8 +984,10 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
return ERR_PTR(-EINVAL);
cl_init.hostname = buf;
- if (mds_clp->cl_nconnect > 1 && ds_proto == XPRT_TRANSPORT_TCP)
+ if (mds_clp->cl_nconnect > 1 && ds_proto == XPRT_TRANSPORT_TCP) {
cl_init.nconnect = mds_clp->cl_nconnect;
+ cl_init.max_connect = NFS_MAX_TRANSPORTS;
+ }
if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
__set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
@@ -1122,6 +1156,7 @@ static int nfs4_init_server(struct nfs_server *server, struct fs_context *fc)
&timeparms,
ctx->minorversion,
ctx->nfs_server.nconnect,
+ ctx->nfs_server.max_connect,
fc->net_ns);
if (error < 0)
return error;
@@ -1211,6 +1246,7 @@ struct nfs_server *nfs4_create_referral_server(struct fs_context *fc)
parent_server->client->cl_timeout,
parent_client->cl_mvops->minor_version,
parent_client->cl_nconnect,
+ parent_client->cl_max_connect,
parent_client->cl_net);
if (!error)
goto init_server;
@@ -1226,6 +1262,7 @@ struct nfs_server *nfs4_create_referral_server(struct fs_context *fc)
parent_server->client->cl_timeout,
parent_client->cl_mvops->minor_version,
parent_client->cl_nconnect,
+ parent_client->cl_max_connect,
parent_client->cl_net);
if (error < 0)
goto error;
@@ -1323,7 +1360,7 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname,
error = nfs4_set_client(server, hostname, sap, salen, buf,
clp->cl_proto, clnt->cl_timeout,
clp->cl_minorversion,
- clp->cl_nconnect, net);
+ clp->cl_nconnect, clp->cl_max_connect, net);
clear_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status);
if (error != 0) {
nfs_server_insert_lists(server);
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index c820de58a661..c91565227ea2 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -158,13 +158,11 @@ static ssize_t __nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
sync = true;
retry:
if (!nfs42_files_from_same_server(file_in, file_out)) {
- /* for inter copy, if copy size if smaller than 12 RPC
- * payloads, fallback to traditional copy. There are
- * 14 RPCs during an NFSv4.x mount between source/dest
- * servers.
+ /*
+ * For inter copy, if the copy size is too small
+ * then fall back to generic copy.
*/
- if (sync ||
- count <= 14 * NFS_SERVER(file_inode(file_in))->rsize)
+ if (sync)
return -EOPNOTSUPP;
cn_resp = kzalloc(sizeof(struct nfs42_copy_notify_res),
GFP_NOFS);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index ef14ea0b6ab8..7c9090a28e5c 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -335,7 +335,7 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
static void pnfs_barrier_update(struct pnfs_layout_hdr *lo, u32 newseq)
{
- if (pnfs_seqid_is_newer(newseq, lo->plh_barrier))
+ if (pnfs_seqid_is_newer(newseq, lo->plh_barrier) || !lo->plh_barrier)
lo->plh_barrier = newseq;
}
@@ -347,11 +347,15 @@ pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
iomode = IOMODE_ANY;
lo->plh_return_iomode = iomode;
set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
- if (seq != 0) {
- WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq);
+ /*
+ * We must set lo->plh_return_seq to avoid livelocks with
+ * pnfs_layout_need_return()
+ */
+ if (seq == 0)
+ seq = be32_to_cpu(lo->plh_stateid.seqid);
+ if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq))
lo->plh_return_seq = seq;
- pnfs_barrier_update(lo, seq);
- }
+ pnfs_barrier_update(lo, seq);
}
static void
@@ -592,10 +596,6 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
inode = lo->plh_inode;
if (refcount_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
- if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
- spin_unlock(&inode->i_lock);
- return;
- }
pnfs_get_layout_hdr(lo);
pnfs_layout_remove_lseg(lo, lseg);
if (pnfs_cache_lseg_for_layoutreturn(lo, lseg))
@@ -1000,7 +1000,7 @@ pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
{
u32 seqid = be32_to_cpu(stateid->seqid);
- return !pnfs_seqid_is_newer(seqid, lo->plh_barrier) && lo->plh_barrier;
+ return lo->plh_barrier && pnfs_seqid_is_newer(lo->plh_barrier, seqid);
}
/* lget is set to 1 if called from inside send_layoutget call chain */
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 9f39e0a1a38b..08d6cc57cbc3 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -293,15 +293,19 @@ static int
readpage_async_filler(void *data, struct page *page)
{
struct nfs_readdesc *desc = data;
+ struct inode *inode = page_file_mapping(page)->host;
+ unsigned int rsize = NFS_SERVER(inode)->rsize;
struct nfs_page *new;
- unsigned int len;
+ unsigned int len, aligned_len;
int error;
len = nfs_page_length(page);
if (len == 0)
return nfs_return_empty_page(page);
- new = nfs_create_request(desc->ctx, page, 0, len);
+ aligned_len = min_t(unsigned int, ALIGN(len, rsize), PAGE_SIZE);
+
+ new = nfs_create_request(desc->ctx, page, 0, aligned_len);
if (IS_ERR(new))
goto out_error;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index fe58525cfed4..e65c83494c05 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -480,6 +480,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
if (clp->cl_nconnect > 0)
seq_printf(m, ",nconnect=%u", clp->cl_nconnect);
if (version == 4) {
+ if (clp->cl_max_connect > 1)
+ seq_printf(m, ",max_connect=%u", clp->cl_max_connect);
if (nfss->port != NFS_PORT)
seq_printf(m, ",port=%u", nfss->port);
} else
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index f229172652be..6e9ea4ee0f73 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -109,7 +109,7 @@ config NFSD_SCSILAYOUT
depends on NFSD_V4 && BLOCK
select NFSD_PNFS
select EXPORTFS_BLOCK_OPS
- select BLK_SCSI_REQUEST
+ select SCSI_COMMON
help
This option enables support for the exporting pNFS SCSI layouts
in the kernel's NFS server. The pNFS SCSI layout enables NFS
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 3f5b3d7b62b7..606fa155c28a 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -25,9 +25,11 @@
* Note: we hold the dentry use count while the file is open.
*/
static __be32
-nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
+nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp,
+ int mode)
{
__be32 nfserr;
+ int access;
struct svc_fh fh;
/* must initialize before using! but maxsize doesn't matter */
@@ -36,7 +38,9 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
memcpy((char*)&fh.fh_handle.fh_base, f->data, f->size);
fh.fh_export = NULL;
- nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp);
+ access = (mode == O_WRONLY) ? NFSD_MAY_WRITE : NFSD_MAY_READ;
+ access |= NFSD_MAY_LOCK;
+ nfserr = nfsd_open(rqstp, &fh, S_IFREG, access, filp);
fh_put(&fh);
/* We return nlm error codes as nlm doesn't know
* about nfsd, but nfsd does know about nlm..
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index fa67ecd5fe63..42356416f0a0 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2687,9 +2687,9 @@ static void force_expire_client(struct nfs4_client *clp)
trace_nfsd_clid_admin_expired(&clp->cl_clientid);
- spin_lock(&clp->cl_lock);
+ spin_lock(&nn->client_lock);
clp->cl_time = 0;
- spin_unlock(&clp->cl_lock);
+ spin_unlock(&nn->client_lock);
wait_event(expiry_wq, atomic_read(&clp->cl_rpc_users) == 0);
spin_lock(&nn->client_lock);
@@ -5735,16 +5735,6 @@ check_special_stateids(struct net *net, svc_fh *current_fh, stateid_t *stateid,
NFS4_SHARE_DENY_READ);
}
-/*
- * Allow READ/WRITE during grace period on recovered state only for files
- * that are not able to provide mandatory locking.
- */
-static inline int
-grace_disallows_io(struct net *net, struct inode *inode)
-{
- return opens_in_grace(net) && mandatory_lock(inode);
-}
-
static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
{
/*
@@ -6026,7 +6016,6 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
stateid_t *stateid, int flags, struct nfsd_file **nfp,
struct nfs4_stid **cstid)
{
- struct inode *ino = d_inode(fhp->fh_dentry);
struct net *net = SVC_NET(rqstp);
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
struct nfs4_stid *s = NULL;
@@ -6035,9 +6024,6 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
if (nfp)
*nfp = NULL;
- if (grace_disallows_io(net, ino))
- return nfserr_grace;
-
if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) {
status = check_special_stateids(net, fhp, stateid, flags);
goto done;
@@ -6835,6 +6821,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_blocked_lock *nbl = NULL;
struct file_lock *file_lock = NULL;
struct file_lock *conflock = NULL;
+ struct super_block *sb;
__be32 status = 0;
int lkflg;
int err;
@@ -6856,6 +6843,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
dprintk("NFSD: nfsd4_lock: permission denied!\n");
return status;
}
+ sb = cstate->current_fh.fh_dentry->d_sb;
if (lock->lk_is_new) {
if (nfsd4_has_session(cstate))
@@ -6901,10 +6889,14 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (!locks_in_grace(net) && lock->lk_reclaim)
goto out;
+ if (lock->lk_reclaim)
+ fl_flags |= FL_RECLAIM;
+
fp = lock_stp->st_stid.sc_file;
switch (lock->lk_type) {
case NFS4_READW_LT:
- if (nfsd4_has_session(cstate))
+ if (nfsd4_has_session(cstate) &&
+ !(sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS))
fl_flags |= FL_SLEEP;
fallthrough;
case NFS4_READ_LT:
@@ -6916,7 +6908,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
fl_type = F_RDLCK;
break;
case NFS4_WRITEW_LT:
- if (nfsd4_has_session(cstate))
+ if (nfsd4_has_session(cstate) &&
+ !(sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS))
fl_flags |= FL_SLEEP;
fallthrough;
case NFS4_WRITE_LT:
@@ -7036,8 +7029,7 @@ out:
/*
* The NFSv4 spec allows a client to do a LOCKT without holding an OPEN,
* so we do a temporary open here just to get an open file to pass to
- * vfs_test_lock. (Arguably perhaps test_lock should be done with an
- * inode operation.)
+ * vfs_test_lock.
*/
static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock)
{
@@ -7052,7 +7044,9 @@ static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct
NFSD_MAY_READ));
if (err)
goto out;
+ lock->fl_file = nf->nf_file;
err = nfserrno(vfs_test_lock(nf->nf_file, lock));
+ lock->fl_file = NULL;
out:
fh_unlock(fhp);
nfsd_file_put(nf);
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 60d7c59e7935..90fcd6178823 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -881,6 +881,7 @@ nfserrno (int errno)
{ nfserr_serverfault, -ENFILE },
{ nfserr_io, -EUCLEAN },
{ nfserr_perm, -ENOKEY },
+ { nfserr_no_grace, -ENOGRACE },
};
int i;
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index adaec43548d1..538520957a81 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -400,18 +400,16 @@ TRACE_EVENT(nfsd_dirent,
TP_STRUCT__entry(
__field(u32, fh_hash)
__field(u64, ino)
- __field(int, len)
- __dynamic_array(unsigned char, name, namlen)
+ __string_len(name, name, namlen)
),
TP_fast_assign(
__entry->fh_hash = fhp ? knfsd_fh_hash(&fhp->fh_handle) : 0;
__entry->ino = ino;
- __entry->len = namlen;
- memcpy(__get_str(name), name, namlen);
+ __assign_str_len(name, name, namlen)
),
- TP_printk("fh_hash=0x%08x ino=%llu name=%.*s",
- __entry->fh_hash, __entry->ino,
- __entry->len, __get_str(name))
+ TP_printk("fh_hash=0x%08x ino=%llu name=%s",
+ __entry->fh_hash, __entry->ino, __get_str(name)
+ )
)
#include "state.h"
@@ -608,7 +606,7 @@ DECLARE_EVENT_CLASS(nfsd_clid_class,
__array(unsigned char, addr, sizeof(struct sockaddr_in6))
__field(unsigned long, flavor)
__array(unsigned char, verifier, NFS4_VERIFIER_SIZE)
- __dynamic_array(char, name, clp->cl_name.len + 1)
+ __string_len(name, name, clp->cl_name.len)
),
TP_fast_assign(
__entry->cl_boot = clp->cl_clientid.cl_boot;
@@ -618,8 +616,7 @@ DECLARE_EVENT_CLASS(nfsd_clid_class,
__entry->flavor = clp->cl_cred.cr_flavor;
memcpy(__entry->verifier, (void *)&clp->cl_verifier,
NFS4_VERIFIER_SIZE);
- memcpy(__get_str(name), clp->cl_name.data, clp->cl_name.len);
- __get_str(name)[clp->cl_name.len] = '\0';
+ __assign_str_len(name, clp->cl_name.data, clp->cl_name.len);
),
TP_printk("addr=%pISpc name='%s' verifier=0x%s flavor=%s client=%08x:%08x",
__entry->addr, __get_str(name),
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a224a5e23cc1..738d564ca4ce 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -244,7 +244,6 @@ out_nfserr:
* returned. Otherwise the covered directory is returned.
* NOTE: this mountpoint crossing is not supported properly by all
* clients and is explicitly disallowed for NFSv3
- * NeilBrown <neilb@cse.unsw.edu.au>
*/
__be32
nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
@@ -333,7 +332,6 @@ nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct iattr *iap)
{
struct inode *inode = d_inode(fhp->fh_dentry);
- int host_err;
if (iap->ia_size < inode->i_size) {
__be32 err;
@@ -343,20 +341,7 @@ nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (err)
return err;
}
-
- host_err = get_write_access(inode);
- if (host_err)
- goto out_nfserrno;
-
- host_err = locks_verify_truncate(inode, NULL, iap->ia_size);
- if (host_err)
- goto out_put_write_access;
- return 0;
-
-out_put_write_access:
- put_write_access(inode);
-out_nfserrno:
- return nfserrno(host_err);
+ return nfserrno(get_write_access(inode));
}
/*
@@ -750,13 +735,6 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
err = nfserr_perm;
if (IS_APPEND(inode) && (may_flags & NFSD_MAY_WRITE))
goto out;
- /*
- * We must ignore files (but only files) which might have mandatory
- * locks on them because there is no way to know if the accesser has
- * the lock.
- */
- if (S_ISREG((inode)->i_mode) && mandatory_lock(inode))
- goto out;
if (!inode->i_fop)
goto out;
@@ -847,26 +825,16 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
struct svc_rqst *rqstp = sd->u.data;
struct page **pp = rqstp->rq_next_page;
struct page *page = buf->page;
- size_t size;
-
- size = sd->len;
if (rqstp->rq_res.page_len == 0) {
- get_page(page);
- put_page(*rqstp->rq_next_page);
- *(rqstp->rq_next_page++) = page;
+ svc_rqst_replace_page(rqstp, page);
rqstp->rq_res.page_base = buf->offset;
- rqstp->rq_res.page_len = size;
} else if (page != pp[-1]) {
- get_page(page);
- if (*rqstp->rq_next_page)
- put_page(*rqstp->rq_next_page);
- *(rqstp->rq_next_page++) = page;
- rqstp->rq_res.page_len += size;
- } else
- rqstp->rq_res.page_len += size;
+ svc_rqst_replace_page(rqstp, page);
+ }
+ rqstp->rq_res.page_len += sd->len;
- return size;
+ return sd->len;
}
static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 4abd928b0bc8..f6b2d280aab5 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1053,7 +1053,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_time_gran = 1;
sb->s_max_links = NILFS_LINK_MAX;
- sb->s_bdi = bdi_get(sb->s_bdev->bd_bdi);
+ sb->s_bdi = bdi_get(sb->s_bdev->bd_disk->bdi);
err = load_nilfs(nilfs, sb);
if (err)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 28b67cb9458d..6facdf476255 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/fanotify.h>
#include <linux/fcntl.h>
+#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/anon_inodes.h>
@@ -109,8 +110,10 @@ struct kmem_cache *fanotify_path_event_cachep __read_mostly;
struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
#define FANOTIFY_EVENT_ALIGN 4
-#define FANOTIFY_INFO_HDR_LEN \
+#define FANOTIFY_FID_INFO_HDR_LEN \
(sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
+#define FANOTIFY_PIDFD_INFO_HDR_LEN \
+ sizeof(struct fanotify_event_info_pidfd)
static int fanotify_fid_info_len(int fh_len, int name_len)
{
@@ -119,10 +122,11 @@ static int fanotify_fid_info_len(int fh_len, int name_len)
if (name_len)
info_len += name_len + 1;
- return roundup(FANOTIFY_INFO_HDR_LEN + info_len, FANOTIFY_EVENT_ALIGN);
+ return roundup(FANOTIFY_FID_INFO_HDR_LEN + info_len,
+ FANOTIFY_EVENT_ALIGN);
}
-static int fanotify_event_info_len(unsigned int fid_mode,
+static int fanotify_event_info_len(unsigned int info_mode,
struct fanotify_event *event)
{
struct fanotify_info *info = fanotify_event_info(event);
@@ -133,7 +137,8 @@ static int fanotify_event_info_len(unsigned int fid_mode,
if (dir_fh_len) {
info_len += fanotify_fid_info_len(dir_fh_len, info->name_len);
- } else if ((fid_mode & FAN_REPORT_NAME) && (event->mask & FAN_ONDIR)) {
+ } else if ((info_mode & FAN_REPORT_NAME) &&
+ (event->mask & FAN_ONDIR)) {
/*
* With group flag FAN_REPORT_NAME, if name was not recorded in
* event on a directory, we will report the name ".".
@@ -141,6 +146,9 @@ static int fanotify_event_info_len(unsigned int fid_mode,
dot_len = 1;
}
+ if (info_mode & FAN_REPORT_PIDFD)
+ info_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
+
if (fh_len)
info_len += fanotify_fid_info_len(fh_len, dot_len);
@@ -176,7 +184,7 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group,
size_t event_size = FAN_EVENT_METADATA_LEN;
struct fanotify_event *event = NULL;
struct fsnotify_event *fsn_event;
- unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
+ unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
@@ -186,8 +194,8 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group,
goto out;
event = FANOTIFY_E(fsn_event);
- if (fid_mode)
- event_size += fanotify_event_info_len(fid_mode, event);
+ if (info_mode)
+ event_size += fanotify_event_info_len(info_mode, event);
if (event_size > count) {
event = ERR_PTR(-EINVAL);
@@ -308,9 +316,10 @@ static int process_access_response(struct fsnotify_group *group,
return -ENOENT;
}
-static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
- int info_type, const char *name, size_t name_len,
- char __user *buf, size_t count)
+static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
+ int info_type, const char *name,
+ size_t name_len,
+ char __user *buf, size_t count)
{
struct fanotify_event_info_fid info = { };
struct file_handle handle = { };
@@ -403,6 +412,117 @@ static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
return info_len;
}
+static int copy_pidfd_info_to_user(int pidfd,
+ char __user *buf,
+ size_t count)
+{
+ struct fanotify_event_info_pidfd info = { };
+ size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;
+
+ if (WARN_ON_ONCE(info_len > count))
+ return -EFAULT;
+
+ info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
+ info.hdr.len = info_len;
+ info.pidfd = pidfd;
+
+ if (copy_to_user(buf, &info, info_len))
+ return -EFAULT;
+
+ return info_len;
+}
+
+static int copy_info_records_to_user(struct fanotify_event *event,
+ struct fanotify_info *info,
+ unsigned int info_mode, int pidfd,
+ char __user *buf, size_t count)
+{
+ int ret, total_bytes = 0, info_type = 0;
+ unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS;
+ unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
+
+ /*
+ * Event info records order is as follows: dir fid + name, child fid.
+ */
+ if (fanotify_event_dir_fh_len(event)) {
+ info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
+ FAN_EVENT_INFO_TYPE_DFID;
+ ret = copy_fid_info_to_user(fanotify_event_fsid(event),
+ fanotify_info_dir_fh(info),
+ info_type,
+ fanotify_info_name(info),
+ info->name_len, buf, count);
+ if (ret < 0)
+ return ret;
+
+ buf += ret;
+ count -= ret;
+ total_bytes += ret;
+ }
+
+ if (fanotify_event_object_fh_len(event)) {
+ const char *dot = NULL;
+ int dot_len = 0;
+
+ if (fid_mode == FAN_REPORT_FID || info_type) {
+ /*
+ * With only group flag FAN_REPORT_FID only type FID is
+ * reported. Second info record type is always FID.
+ */
+ info_type = FAN_EVENT_INFO_TYPE_FID;
+ } else if ((fid_mode & FAN_REPORT_NAME) &&
+ (event->mask & FAN_ONDIR)) {
+ /*
+ * With group flag FAN_REPORT_NAME, if name was not
+ * recorded in an event on a directory, report the name
+ * "." with info type DFID_NAME.
+ */
+ info_type = FAN_EVENT_INFO_TYPE_DFID_NAME;
+ dot = ".";
+ dot_len = 1;
+ } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) ||
+ (event->mask & FAN_ONDIR)) {
+ /*
+ * With group flag FAN_REPORT_DIR_FID, a single info
+ * record has type DFID for directory entry modification
+ * event and for event on a directory.
+ */
+ info_type = FAN_EVENT_INFO_TYPE_DFID;
+ } else {
+ /*
+ * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID,
+ * a single info record has type FID for event on a
+ * non-directory, when there is no directory to report.
+ * For example, on FAN_DELETE_SELF event.
+ */
+ info_type = FAN_EVENT_INFO_TYPE_FID;
+ }
+
+ ret = copy_fid_info_to_user(fanotify_event_fsid(event),
+ fanotify_event_object_fh(event),
+ info_type, dot, dot_len,
+ buf, count);
+ if (ret < 0)
+ return ret;
+
+ buf += ret;
+ count -= ret;
+ total_bytes += ret;
+ }
+
+ if (pidfd_mode) {
+ ret = copy_pidfd_info_to_user(pidfd, buf, count);
+ if (ret < 0)
+ return ret;
+
+ buf += ret;
+ count -= ret;
+ total_bytes += ret;
+ }
+
+ return total_bytes;
+}
+
static ssize_t copy_event_to_user(struct fsnotify_group *group,
struct fanotify_event *event,
char __user *buf, size_t count)
@@ -410,15 +530,15 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
struct fanotify_event_metadata metadata;
struct path *path = fanotify_event_path(event);
struct fanotify_info *info = fanotify_event_info(event);
- unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
+ unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
+ unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
struct file *f = NULL;
- int ret, fd = FAN_NOFD;
- int info_type = 0;
+ int ret, pidfd = FAN_NOPIDFD, fd = FAN_NOFD;
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
metadata.event_len = FAN_EVENT_METADATA_LEN +
- fanotify_event_info_len(fid_mode, event);
+ fanotify_event_info_len(info_mode, event);
metadata.metadata_len = FAN_EVENT_METADATA_LEN;
metadata.vers = FANOTIFY_METADATA_VERSION;
metadata.reserved = 0;
@@ -447,6 +567,33 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
}
metadata.fd = fd;
+ if (pidfd_mode) {
+ /*
+ * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual
+ * exclusion is ever lifted. At the time of incorporating pidfd
+ * support within fanotify, the pidfd API only supported the
+ * creation of pidfds for thread-group leaders.
+ */
+ WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID));
+
+ /*
+ * The PIDTYPE_TGID check for an event->pid is performed
+ * preemptively in an attempt to catch out cases where the event
+ * listener reads events after the event generating process has
+ * already terminated. Report FAN_NOPIDFD to the event listener
+ * in those cases, with all other pidfd creation errors being
+ * reported as FAN_EPIDFD.
+ */
+ if (metadata.pid == 0 ||
+ !pid_has_task(event->pid, PIDTYPE_TGID)) {
+ pidfd = FAN_NOPIDFD;
+ } else {
+ pidfd = pidfd_create(event->pid, 0);
+ if (pidfd < 0)
+ pidfd = FAN_EPIDFD;
+ }
+ }
+
ret = -EFAULT;
/*
* Sanity check copy size in case get_one_event() and
@@ -467,67 +614,11 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
if (f)
fd_install(fd, f);
- /* Event info records order is: dir fid + name, child fid */
- if (fanotify_event_dir_fh_len(event)) {
- info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
- FAN_EVENT_INFO_TYPE_DFID;
- ret = copy_info_to_user(fanotify_event_fsid(event),
- fanotify_info_dir_fh(info),
- info_type, fanotify_info_name(info),
- info->name_len, buf, count);
+ if (info_mode) {
+ ret = copy_info_records_to_user(event, info, info_mode, pidfd,
+ buf, count);
if (ret < 0)
goto out_close_fd;
-
- buf += ret;
- count -= ret;
- }
-
- if (fanotify_event_object_fh_len(event)) {
- const char *dot = NULL;
- int dot_len = 0;
-
- if (fid_mode == FAN_REPORT_FID || info_type) {
- /*
- * With only group flag FAN_REPORT_FID only type FID is
- * reported. Second info record type is always FID.
- */
- info_type = FAN_EVENT_INFO_TYPE_FID;
- } else if ((fid_mode & FAN_REPORT_NAME) &&
- (event->mask & FAN_ONDIR)) {
- /*
- * With group flag FAN_REPORT_NAME, if name was not
- * recorded in an event on a directory, report the
- * name "." with info type DFID_NAME.
- */
- info_type = FAN_EVENT_INFO_TYPE_DFID_NAME;
- dot = ".";
- dot_len = 1;
- } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) ||
- (event->mask & FAN_ONDIR)) {
- /*
- * With group flag FAN_REPORT_DIR_FID, a single info
- * record has type DFID for directory entry modification
- * event and for event on a directory.
- */
- info_type = FAN_EVENT_INFO_TYPE_DFID;
- } else {
- /*
- * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID,
- * a single info record has type FID for event on a
- * non-directory, when there is no directory to report.
- * For example, on FAN_DELETE_SELF event.
- */
- info_type = FAN_EVENT_INFO_TYPE_FID;
- }
-
- ret = copy_info_to_user(fanotify_event_fsid(event),
- fanotify_event_object_fh(event),
- info_type, dot, dot_len, buf, count);
- if (ret < 0)
- goto out_close_fd;
-
- buf += ret;
- count -= ret;
}
return metadata.event_len;
@@ -537,6 +628,10 @@ out_close_fd:
put_unused_fd(fd);
fput(f);
}
+
+ if (pidfd >= 0)
+ close_fd(pidfd);
+
return ret;
}
@@ -1082,6 +1177,14 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
#endif
return -EINVAL;
+ /*
+ * A pidfd can only be returned for a thread-group leader; thus
+ * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually
+ * exclusive.
+ */
+ if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID))
+ return -EINVAL;
+
if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
return -EINVAL;
@@ -1483,7 +1586,7 @@ static int __init fanotify_user_setup(void)
FANOTIFY_DEFAULT_MAX_USER_MARKS);
BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
- BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10);
+ BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 11);
BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
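
Aside (illustrative, not part of the patch): the fanotify changes above introduce FAN_REPORT_PIDFD, which makes the kernel append a FAN_EVENT_INFO_TYPE_PIDFD info record carrying a pidfd for the event's thread-group leader (or FAN_NOPIDFD/FAN_EPIDFD on failure). A minimal reader sketch follows; it assumes libc headers new enough to define the new flag and struct fanotify_event_info_pidfd, watches the arbitrary path /tmp, and only inspects the first info record of each event.

/* Illustrative sketch only -- not part of the patch above. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/fanotify.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t len;
	int fan;

	fan = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_PIDFD, O_RDONLY);
	if (fan == -1) {
		perror("fanotify_init");
		return 1;
	}
	if (fanotify_mark(fan, FAN_MARK_ADD, FAN_OPEN | FAN_CLOSE_WRITE,
			  AT_FDCWD, "/tmp") == -1) {
		perror("fanotify_mark");
		return 1;
	}

	len = read(fan, buf, sizeof(buf));
	for (struct fanotify_event_metadata *md = (void *)buf;
	     FAN_EVENT_OK(md, len); md = FAN_EVENT_NEXT(md, len)) {
		/* Info records, if any, follow the fixed-size metadata.
		 * This sketch only looks at the first record per event.
		 */
		if (md->event_len > md->metadata_len) {
			struct fanotify_event_info_header *hdr =
				(void *)((char *)md + md->metadata_len);

			if (hdr->info_type == FAN_EVENT_INFO_TYPE_PIDFD) {
				struct fanotify_event_info_pidfd *p =
					(void *)hdr;

				printf("pid %d, pidfd %d\n",
				       md->pid, p->pidfd);
				if (p->pidfd >= 0)
					close(p->pidfd);
			}
		}
		if (md->fd >= 0)
			close(md->fd);
	}
	close(fan);
	return 0;
}
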
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 30d422b8c0fc..963e6ce75b96 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -87,15 +87,15 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
if (iput_inode)
iput(iput_inode);
- /* Wait for outstanding inode references from connectors */
- wait_var_event(&sb->s_fsnotify_inode_refs,
- !atomic_long_read(&sb->s_fsnotify_inode_refs));
}
void fsnotify_sb_delete(struct super_block *sb)
{
fsnotify_unmount_inodes(sb);
fsnotify_clear_marks_by_sb(sb);
+ /* Wait for outstanding object references from connectors */
+ wait_var_event(&sb->s_fsnotify_connectors,
+ !atomic_long_read(&sb->s_fsnotify_connectors));
}
/*
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index ff2063ec6b0f..87d8a50ee803 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -27,6 +27,21 @@ static inline struct super_block *fsnotify_conn_sb(
return container_of(conn->obj, struct super_block, s_fsnotify_marks);
}
+static inline struct super_block *fsnotify_connector_sb(
+ struct fsnotify_mark_connector *conn)
+{
+ switch (conn->type) {
+ case FSNOTIFY_OBJ_TYPE_INODE:
+ return fsnotify_conn_inode(conn)->i_sb;
+ case FSNOTIFY_OBJ_TYPE_VFSMOUNT:
+ return fsnotify_conn_mount(conn)->mnt.mnt_sb;
+ case FSNOTIFY_OBJ_TYPE_SB:
+ return fsnotify_conn_sb(conn);
+ default:
+ return NULL;
+ }
+}
+
/* destroy all events sitting in this groups notification queue */
extern void fsnotify_flush_notify(struct fsnotify_group *group);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index d32ab349db74..95006d1d29ab 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -169,6 +169,37 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work)
}
}
+static void fsnotify_get_inode_ref(struct inode *inode)
+{
+ ihold(inode);
+ atomic_long_inc(&inode->i_sb->s_fsnotify_connectors);
+}
+
+static void fsnotify_put_inode_ref(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ iput(inode);
+ if (atomic_long_dec_and_test(&sb->s_fsnotify_connectors))
+ wake_up_var(&sb->s_fsnotify_connectors);
+}
+
+static void fsnotify_get_sb_connectors(struct fsnotify_mark_connector *conn)
+{
+ struct super_block *sb = fsnotify_connector_sb(conn);
+
+ if (sb)
+ atomic_long_inc(&sb->s_fsnotify_connectors);
+}
+
+static void fsnotify_put_sb_connectors(struct fsnotify_mark_connector *conn)
+{
+ struct super_block *sb = fsnotify_connector_sb(conn);
+
+ if (sb && atomic_long_dec_and_test(&sb->s_fsnotify_connectors))
+ wake_up_var(&sb->s_fsnotify_connectors);
+}
+
static void *fsnotify_detach_connector_from_object(
struct fsnotify_mark_connector *conn,
unsigned int *type)
@@ -182,13 +213,13 @@ static void *fsnotify_detach_connector_from_object(
if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
inode = fsnotify_conn_inode(conn);
inode->i_fsnotify_mask = 0;
- atomic_long_inc(&inode->i_sb->s_fsnotify_inode_refs);
} else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0;
} else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) {
fsnotify_conn_sb(conn)->s_fsnotify_mask = 0;
}
+ fsnotify_put_sb_connectors(conn);
rcu_assign_pointer(*(conn->obj), NULL);
conn->obj = NULL;
conn->type = FSNOTIFY_OBJ_TYPE_DETACHED;
@@ -209,19 +240,12 @@ static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark)
/* Drop object reference originally held by a connector */
static void fsnotify_drop_object(unsigned int type, void *objp)
{
- struct inode *inode;
- struct super_block *sb;
-
if (!objp)
return;
/* Currently only inode references are passed to be dropped */
if (WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE))
return;
- inode = objp;
- sb = inode->i_sb;
- iput(inode);
- if (atomic_long_dec_and_test(&sb->s_fsnotify_inode_refs))
- wake_up_var(&sb->s_fsnotify_inode_refs);
+ fsnotify_put_inode_ref(objp);
}
void fsnotify_put_mark(struct fsnotify_mark *mark)
@@ -493,8 +517,12 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
conn->fsid.val[0] = conn->fsid.val[1] = 0;
conn->flags = 0;
}
- if (conn->type == FSNOTIFY_OBJ_TYPE_INODE)
- inode = igrab(fsnotify_conn_inode(conn));
+ if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
+ inode = fsnotify_conn_inode(conn);
+ fsnotify_get_inode_ref(inode);
+ }
+ fsnotify_get_sb_connectors(conn);
+
/*
* cmpxchg() provides the barrier so that readers of *connp can see
* only initialized structure
@@ -502,7 +530,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
if (cmpxchg(connp, NULL, conn)) {
/* Someone else created list structure for us */
if (inode)
- iput(inode);
+ fsnotify_put_inode_ref(inode);
kmem_cache_free(fsnotify_mark_connector_cachep, conn);
}
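The s_fsnotify_connectors accounting above is the usual elevated-counter teardown pattern: every connector bumps a per-sb count, the final put wakes any waiter, and fsnotify_sb_delete() blocks until the count drains. A stripped-down sketch of that pattern with generic names (not the fsnotify code itself):

/* Sketch: per-object counter that a teardown path can wait on. */
static atomic_long_t nr_live_objects;

static void object_get(void)
{
	atomic_long_inc(&nr_live_objects);
}

static void object_put(void)
{
	/* The final put wakes whoever is waiting for the count to reach 0 */
	if (atomic_long_dec_and_test(&nr_live_objects))
		wake_up_var(&nr_live_objects);
}

static void teardown_wait(void)
{
	wait_var_event(&nr_live_objects, !atomic_long_read(&nr_live_objects));
}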
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index 97e682ebcfb9..dc71c59fd445 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -809,7 +809,7 @@ int ntfs_cmp_names_cpu(const struct cpu_str *uni1, const struct le_str *uni2,
/* globals from xattr.c */
#ifdef CONFIG_NTFS3_FS_POSIX_ACL
-struct posix_acl *ntfs_get_acl(struct inode *inode, int type);
+struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu);
int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct posix_acl *acl, int type);
int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode,
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index b15d532e4a17..7282d85c4ece 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -535,8 +535,11 @@ static struct posix_acl *ntfs_get_acl_ex(struct user_namespace *mnt_userns,
/*
* ntfs_get_acl - inode_operations::get_acl
*/
-struct posix_acl *ntfs_get_acl(struct inode *inode, int type)
+struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu)
{
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
/* TODO: init_user_ns? */
return ntfs_get_acl_ex(&init_user_ns, inode, type, 0);
}
@@ -635,7 +638,7 @@ static int ntfs_xattr_get_acl(struct user_namespace *mnt_userns,
return -EOPNOTSUPP;
}
- acl = ntfs_get_acl(inode, type);
+ acl = ntfs_get_acl(inode, type, false);
if (IS_ERR(acl))
return PTR_ERR(acl);
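The ->get_acl() change above (and the matching ocfs2, orangefs, reiserfs and overlayfs hunks below) adds a bool rcu argument so RCU path walk can ask for an ACL without sleeping; filesystems that would have to read xattrs simply decline. A minimal sketch of the pattern for a hypothetical filesystem (the myfs_* names and the on-disk helper are illustrative, not part of this series):

static struct posix_acl *myfs_get_acl(struct inode *inode, int type, bool rcu)
{
	/*
	 * Fetching the ACL hits the xattr code and may sleep, so an
	 * RCU-walk caller must fall back to ref-walk and retry.
	 */
	if (rcu)
		return ERR_PTR(-ECHILD);

	return myfs_read_acl(inode, type);	/* assumed on-disk helper */
}

static const struct inode_operations myfs_inode_operations = {
	.get_acl	= myfs_get_acl,
	/* ... */
};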
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 5c72a7e6d6c5..23a72a423955 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -289,7 +289,7 @@ unlock:
return status;
}
-struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
+struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type, bool rcu)
{
struct ocfs2_super *osb;
struct buffer_head *di_bh = NULL;
@@ -297,6 +297,9 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
int had_lock;
struct ocfs2_lock_holder oh;
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
osb = OCFS2_SB(inode->i_sb);
if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
return NULL;
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index f59d8d0a61fa..95a57c888ab6 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -16,7 +16,7 @@ struct ocfs2_acl_entry {
__le32 e_id;
};
-struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type);
+struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type, bool rcu);
int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct posix_acl *acl, int type);
extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 48fd369c29a4..359524b7341f 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -16,6 +16,7 @@
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/time.h>
+#include <linux/delay.h>
#include <linux/quotaops.h>
#include <linux/sched/signal.h>
@@ -2721,7 +2722,7 @@ int ocfs2_inode_lock_tracker(struct inode *inode,
return status;
}
}
- return tmp_oh ? 1 : 0;
+ return 1;
}
void ocfs2_inode_unlock_tracker(struct inode *inode,
@@ -3912,6 +3913,17 @@ downconvert:
spin_unlock_irqrestore(&lockres->l_lock, flags);
ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb,
gen);
+ /* The dlm lock convert is being cancelled in background,
+ * ocfs2_cancel_convert() is asynchronous in fs/dlm,
+ * requeue it, try again later.
+ */
+ if (ret == -EBUSY) {
+ ctl->requeue = 1;
+ mlog(ML_BASTS, "lockres %s, ReQ: Downconvert busy\n",
+ lockres->l_name);
+ ret = 0;
+ msleep(20);
+ }
leave:
if (ret)
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index fab7c6a4a7d0..73a3854b2afb 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -101,8 +101,6 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
- if (__mandatory_lock(inode))
- return -ENOLCK;
if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
ocfs2_mount_local(osb))
@@ -121,8 +119,6 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_POSIX))
return -ENOLCK;
- if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
- return -ENOLCK;
return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
}
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index eda83487c9ec..f033de733adb 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -357,7 +357,6 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
}
oinfo->dqi_gi.dqi_sb = sb;
oinfo->dqi_gi.dqi_type = type;
- ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk);
oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
oinfo->dqi_gqi_bh = NULL;
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index b1a8b046f4c2..0e4b16d4c037 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -702,6 +702,8 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
info->dqi_priv = oinfo;
oinfo->dqi_type = type;
INIT_LIST_HEAD(&oinfo->dqi_chunk);
+ oinfo->dqi_gqinode = NULL;
+ ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
oinfo->dqi_rec = NULL;
oinfo->dqi_lqi_bh = NULL;
oinfo->dqi_libh = NULL;
diff --git a/fs/open.c b/fs/open.c
index 94bef26ff1b6..daa324606a41 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -105,9 +105,7 @@ long vfs_truncate(const struct path *path, loff_t length)
if (error)
goto put_write_and_out;
- error = locks_verify_truncate(inode, NULL, length);
- if (!error)
- error = security_path_truncate(path);
+ error = security_path_truncate(path);
if (!error)
error = do_truncate(mnt_userns, path->dentry, length, 0, NULL);
@@ -189,9 +187,7 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
if (IS_APPEND(file_inode(f.file)))
goto out_putf;
sb_start_write(inode->i_sb);
- error = locks_verify_truncate(inode, f.file, length);
- if (!error)
- error = security_path_truncate(&f.file->f_path);
+ error = security_path_truncate(&f.file->f_path);
if (!error)
error = do_truncate(file_mnt_user_ns(f.file), dentry, length,
ATTR_MTIME | ATTR_CTIME, f.file);
diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c
index 18852b9ed82b..605e5a3506ec 100644
--- a/fs/orangefs/acl.c
+++ b/fs/orangefs/acl.c
@@ -10,12 +10,15 @@
#include "orangefs-bufmap.h"
#include <linux/posix_acl_xattr.h>
-struct posix_acl *orangefs_get_acl(struct inode *inode, int type)
+struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu)
{
struct posix_acl *acl;
int ret;
char *key = NULL, *value = NULL;
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
switch (type) {
case ACL_TYPE_ACCESS:
key = XATTR_NAME_POSIX_ACL_ACCESS;
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 16ac617df7d7..c1bb4c4b5d67 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -882,12 +882,7 @@ int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path,
if (!(request_mask & STATX_SIZE))
stat->result_mask &= ~STATX_SIZE;
- stat->attributes_mask = STATX_ATTR_IMMUTABLE |
- STATX_ATTR_APPEND;
- if (inode->i_flags & S_IMMUTABLE)
- stat->attributes |= STATX_ATTR_IMMUTABLE;
- if (inode->i_flags & S_APPEND)
- stat->attributes |= STATX_ATTR_APPEND;
+ generic_fill_statx_attr(inode, stat);
}
return ret;
}
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 0e6b97682e41..b5940ec1836a 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -106,7 +106,7 @@ enum orangefs_vfs_op_states {
extern int orangefs_init_acl(struct inode *inode, struct inode *dir);
extern const struct xattr_handler *orangefs_xattr_handlers[];
-extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type);
+extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu);
extern int orangefs_set_acl(struct user_namespace *mnt_userns,
struct inode *inode, struct posix_acl *acl,
int type);
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 2846b943e80c..4e7d5bfa2949 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -8,6 +8,7 @@
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/file.h>
+#include <linux/fileattr.h>
#include <linux/splice.h>
#include <linux/xattr.h>
#include <linux/security.h>
@@ -62,7 +63,7 @@ int ovl_copy_xattr(struct super_block *sb, struct dentry *old,
return list_size;
}
- buf = kzalloc(list_size, GFP_KERNEL);
+ buf = kvzalloc(list_size, GFP_KERNEL);
if (!buf)
return -ENOMEM;
@@ -105,11 +106,12 @@ retry:
if (size > value_size) {
void *new;
- new = krealloc(value, size, GFP_KERNEL);
+ new = kvmalloc(size, GFP_KERNEL);
if (!new) {
error = -ENOMEM;
break;
}
+ kvfree(value);
value = new;
value_size = size;
goto retry;
@@ -124,12 +126,50 @@ retry:
error = 0;
}
}
- kfree(value);
+ kvfree(value);
out:
- kfree(buf);
+ kvfree(buf);
return error;
}
+static int ovl_copy_fileattr(struct inode *inode, struct path *old,
+ struct path *new)
+{
+ struct fileattr oldfa = { .flags_valid = true };
+ struct fileattr newfa = { .flags_valid = true };
+ int err;
+
+ err = ovl_real_fileattr_get(old, &oldfa);
+ if (err)
+ return err;
+
+ err = ovl_real_fileattr_get(new, &newfa);
+ if (err)
+ return err;
+
+ /*
+ * We cannot set immutable and append-only flags on upper inode,
+ * because we would not be able to link upper inode to upper dir
+ * nor set overlay private xattr on upper inode.
+ * Store these flags in overlay.protattr xattr instead.
+ */
+ if (oldfa.flags & OVL_PROT_FS_FLAGS_MASK) {
+ err = ovl_set_protattr(inode, new->dentry, &oldfa);
+ if (err)
+ return err;
+ }
+
+ BUILD_BUG_ON(OVL_COPY_FS_FLAGS_MASK & ~FS_COMMON_FL);
+ newfa.flags &= ~OVL_COPY_FS_FLAGS_MASK;
+ newfa.flags |= (oldfa.flags & OVL_COPY_FS_FLAGS_MASK);
+
+ BUILD_BUG_ON(OVL_COPY_FSX_FLAGS_MASK & ~FS_XFLAG_COMMON);
+ newfa.fsx_xflags &= ~OVL_COPY_FSX_FLAGS_MASK;
+ newfa.fsx_xflags |= (oldfa.fsx_xflags & OVL_COPY_FSX_FLAGS_MASK);
+
+ return ovl_real_fileattr_set(new, &newfa);
+}
+
static int ovl_copy_up_data(struct ovl_fs *ofs, struct path *old,
struct path *new, loff_t len)
{
@@ -331,8 +371,8 @@ out_err:
return ERR_PTR(err);
}
-int ovl_set_origin(struct ovl_fs *ofs, struct dentry *dentry,
- struct dentry *lower, struct dentry *upper)
+int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower,
+ struct dentry *upper)
{
const struct ovl_fh *fh = NULL;
int err;
@@ -351,7 +391,7 @@ int ovl_set_origin(struct ovl_fs *ofs, struct dentry *dentry,
/*
* Do not fail when upper doesn't support xattrs.
*/
- err = ovl_check_setxattr(dentry, upper, OVL_XATTR_ORIGIN, fh->buf,
+ err = ovl_check_setxattr(ofs, upper, OVL_XATTR_ORIGIN, fh->buf,
fh ? fh->fb.len : 0, 0);
kfree(fh);
@@ -493,20 +533,21 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
{
struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
+ struct inode *inode = d_inode(c->dentry);
+ struct path upperpath, datapath;
int err;
+ ovl_path_upper(c->dentry, &upperpath);
+ if (WARN_ON(upperpath.dentry != NULL))
+ return -EIO;
+
+ upperpath.dentry = temp;
+
/*
* Copy up data first and then xattrs. Writing data after
* xattrs will remove security.capability xattr automatically.
*/
if (S_ISREG(c->stat.mode) && !c->metacopy) {
- struct path upperpath, datapath;
-
- ovl_path_upper(c->dentry, &upperpath);
- if (WARN_ON(upperpath.dentry != NULL))
- return -EIO;
- upperpath.dentry = temp;
-
ovl_path_lowerdata(c->dentry, &datapath);
err = ovl_copy_up_data(ofs, &datapath, &upperpath,
c->stat.size);
@@ -518,6 +559,16 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
if (err)
return err;
+ if (inode->i_flags & OVL_COPY_I_FLAGS_MASK) {
+ /*
+ * Copy the fileattr inode flags that are the source of already
+ * copied i_flags
+ */
+ err = ovl_copy_fileattr(inode, &c->lowerpath, &upperpath);
+ if (err)
+ return err;
+ }
+
/*
* Store identifier of lower inode in upper inode xattr to
* allow lookup of the copy up origin inode.
@@ -526,13 +577,13 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
* hard link.
*/
if (c->origin) {
- err = ovl_set_origin(ofs, c->dentry, c->lowerpath.dentry, temp);
+ err = ovl_set_origin(ofs, c->lowerpath.dentry, temp);
if (err)
return err;
}
if (c->metacopy) {
- err = ovl_check_setxattr(c->dentry, temp, OVL_XATTR_METACOPY,
+ err = ovl_check_setxattr(ofs, temp, OVL_XATTR_METACOPY,
NULL, 0, -EOPNOTSUPP);
if (err)
return err;
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 93efe7048a77..1fefb2b8960e 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -233,9 +233,10 @@ struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr)
static int ovl_set_opaque_xerr(struct dentry *dentry, struct dentry *upper,
int xerr)
{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
int err;
- err = ovl_check_setxattr(dentry, upper, OVL_XATTR_OPAQUE, "y", 1, xerr);
+ err = ovl_check_setxattr(ofs, upper, OVL_XATTR_OPAQUE, "y", 1, xerr);
if (!err)
ovl_dentry_set_opaque(dentry);
@@ -320,6 +321,7 @@ static bool ovl_type_origin(struct dentry *dentry)
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
struct ovl_cattr *attr)
{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry;
@@ -338,7 +340,8 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
if (IS_ERR(newdentry))
goto out_unlock;
- if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry)) {
+ if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry) &&
+ !ovl_allow_offline_changes(ofs)) {
/* Setting opaque here is just an optimization, allow to fail */
ovl_set_opaque(dentry, newdentry);
}
@@ -542,8 +545,10 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
goto out_cleanup;
}
err = ovl_instantiate(dentry, inode, newdentry, hardlink);
- if (err)
- goto out_cleanup;
+ if (err) {
+ ovl_cleanup(udir, newdentry);
+ dput(newdentry);
+ }
out_dput:
dput(upper);
out_unlock:
@@ -1043,6 +1048,7 @@ static bool ovl_need_absolute_redirect(struct dentry *dentry, bool samedir)
static int ovl_set_redirect(struct dentry *dentry, bool samedir)
{
int err;
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
const char *redirect = ovl_dentry_get_redirect(dentry);
bool absolute_redirect = ovl_need_absolute_redirect(dentry, samedir);
@@ -1053,7 +1059,7 @@ static int ovl_set_redirect(struct dentry *dentry, bool samedir)
if (IS_ERR(redirect))
return PTR_ERR(redirect);
- err = ovl_check_setxattr(dentry, ovl_dentry_upper(dentry),
+ err = ovl_check_setxattr(ofs, ovl_dentry_upper(dentry),
OVL_XATTR_REDIRECT,
redirect, strlen(redirect), -EXDEV);
if (!err) {
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 5e828a1c98a8..832b17589733 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -13,6 +13,7 @@
#include <linux/fiemap.h>
#include <linux/fileattr.h>
#include <linux/security.h>
+#include <linux/namei.h>
#include "overlayfs.h"
@@ -33,12 +34,6 @@ int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
goto out;
if (attr->ia_valid & ATTR_SIZE) {
- struct inode *realinode = d_inode(ovl_dentry_real(dentry));
-
- err = -ETXTBSY;
- if (atomic_read(&realinode->i_writecount) < 0)
- goto out_drop_write;
-
/* Truncate should trigger data copy up as well */
full_copy_up = true;
}
@@ -162,7 +157,8 @@ int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path,
enum ovl_path_type type;
struct path realpath;
const struct cred *old_cred;
- bool is_dir = S_ISDIR(dentry->d_inode->i_mode);
+ struct inode *inode = d_inode(dentry);
+ bool is_dir = S_ISDIR(inode->i_mode);
int fsid = 0;
int err;
bool metacopy_blocks = false;
@@ -175,6 +171,9 @@ int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path,
if (err)
goto out;
+ /* Report the effective immutable/append-only STATX flags */
+ generic_fill_statx_attr(inode, stat);
+
/*
* For non-dir or same fs, we use st_ino of the copy up origin.
* This guaranties constant st_dev/st_ino across copy up.
@@ -448,7 +447,7 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
return res;
}
-struct posix_acl *ovl_get_acl(struct inode *inode, int type)
+struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu)
{
struct inode *realinode = ovl_inode_real(inode);
const struct cred *old_cred;
@@ -457,6 +456,9 @@ struct posix_acl *ovl_get_acl(struct inode *inode, int type)
if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode))
return NULL;
+ if (rcu)
+ return get_cached_acl_rcu(realinode, type);
+
old_cred = ovl_override_creds(inode->i_sb);
acl = get_acl(realinode, type);
revert_creds(old_cred);
@@ -503,16 +505,14 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* Introducing security_inode_fileattr_get/set() hooks would solve this issue
* properly.
*/
-static int ovl_security_fileattr(struct dentry *dentry, struct fileattr *fa,
+static int ovl_security_fileattr(struct path *realpath, struct fileattr *fa,
bool set)
{
- struct path realpath;
struct file *file;
unsigned int cmd;
int err;
- ovl_path_real(dentry, &realpath);
- file = dentry_open(&realpath, O_RDONLY, current_cred());
+ file = dentry_open(realpath, O_RDONLY, current_cred());
if (IS_ERR(file))
return PTR_ERR(file);
@@ -527,12 +527,24 @@ static int ovl_security_fileattr(struct dentry *dentry, struct fileattr *fa,
return err;
}
+int ovl_real_fileattr_set(struct path *realpath, struct fileattr *fa)
+{
+ int err;
+
+ err = ovl_security_fileattr(realpath, fa, true);
+ if (err)
+ return err;
+
+ return vfs_fileattr_set(&init_user_ns, realpath->dentry, fa);
+}
+
int ovl_fileattr_set(struct user_namespace *mnt_userns,
struct dentry *dentry, struct fileattr *fa)
{
struct inode *inode = d_inode(dentry);
- struct dentry *upperdentry;
+ struct path upperpath;
const struct cred *old_cred;
+ unsigned int flags;
int err;
err = ovl_want_write(dentry);
@@ -541,31 +553,78 @@ int ovl_fileattr_set(struct user_namespace *mnt_userns,
err = ovl_copy_up(dentry);
if (!err) {
- upperdentry = ovl_dentry_upper(dentry);
+ ovl_path_real(dentry, &upperpath);
old_cred = ovl_override_creds(inode->i_sb);
- err = ovl_security_fileattr(dentry, fa, true);
+ /*
+ * Store immutable/append-only flags in xattr and clear them
+ * in upper fileattr (in case they were set by older kernel)
+ * so children of "ovl-immutable" directories and lower aliases of
+ * "ovl-immutable" hardlinks could be copied up.
+ * Clear xattr when flags are cleared.
+ */
+ err = ovl_set_protattr(inode, upperpath.dentry, fa);
if (!err)
- err = vfs_fileattr_set(&init_user_ns, upperdentry, fa);
+ err = ovl_real_fileattr_set(&upperpath, fa);
revert_creds(old_cred);
- ovl_copyflags(ovl_inode_real(inode), inode);
+
+ /*
+ * Merge real inode flags with inode flags read from
+ * overlay.protattr xattr
+ */
+ flags = ovl_inode_real(inode)->i_flags & OVL_COPY_I_FLAGS_MASK;
+
+ BUILD_BUG_ON(OVL_PROT_I_FLAGS_MASK & ~OVL_COPY_I_FLAGS_MASK);
+ flags |= inode->i_flags & OVL_PROT_I_FLAGS_MASK;
+ inode_set_flags(inode, flags, OVL_COPY_I_FLAGS_MASK);
+
+ /* Update ctime */
+ ovl_copyattr(ovl_inode_real(inode), inode);
}
ovl_drop_write(dentry);
out:
return err;
}
+/* Convert inode protection flags to fileattr flags */
+static void ovl_fileattr_prot_flags(struct inode *inode, struct fileattr *fa)
+{
+ BUILD_BUG_ON(OVL_PROT_FS_FLAGS_MASK & ~FS_COMMON_FL);
+ BUILD_BUG_ON(OVL_PROT_FSX_FLAGS_MASK & ~FS_XFLAG_COMMON);
+
+ if (inode->i_flags & S_APPEND) {
+ fa->flags |= FS_APPEND_FL;
+ fa->fsx_xflags |= FS_XFLAG_APPEND;
+ }
+ if (inode->i_flags & S_IMMUTABLE) {
+ fa->flags |= FS_IMMUTABLE_FL;
+ fa->fsx_xflags |= FS_XFLAG_IMMUTABLE;
+ }
+}
+
+int ovl_real_fileattr_get(struct path *realpath, struct fileattr *fa)
+{
+ int err;
+
+ err = ovl_security_fileattr(realpath, fa, false);
+ if (err)
+ return err;
+
+ return vfs_fileattr_get(realpath->dentry, fa);
+}
+
int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa)
{
struct inode *inode = d_inode(dentry);
- struct dentry *realdentry = ovl_dentry_real(dentry);
+ struct path realpath;
const struct cred *old_cred;
int err;
+ ovl_path_real(dentry, &realpath);
+
old_cred = ovl_override_creds(inode->i_sb);
- err = ovl_security_fileattr(dentry, fa, false);
- if (!err)
- err = vfs_fileattr_get(realdentry, fa);
+ err = ovl_real_fileattr_get(&realpath, fa);
+ ovl_fileattr_prot_flags(inode, fa);
revert_creds(old_cred);
return err;
@@ -1118,6 +1177,10 @@ struct inode *ovl_get_inode(struct super_block *sb,
}
}
+ /* Check for immutable/append-only inode flags in xattr */
+ if (upperdentry)
+ ovl_check_protattr(inode, upperdentry);
+
if (inode->i_state & I_NEW)
unlock_new_inode(inode);
out:
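A small userspace sketch of the path exercised by ovl_fileattr_set() above: setting FS_IMMUTABLE_FL through the regular ioctl on an overlay file, which this series records in the overlay.protattr xattr on the upper inode rather than making the upper inode itself immutable. The path is illustrative and the caller needs CAP_LINUX_IMMUTABLE.

#include <fcntl.h>
#include <linux/fs.h>
#include <sys/ioctl.h>
#include <unistd.h>

static int make_immutable(const char *path)
{
	int flags, err = -1;
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return -1;
	if (!ioctl(fd, FS_IOC_GETFLAGS, &flags)) {
		flags |= FS_IMMUTABLE_FL;
		err = ioctl(fd, FS_IOC_SETFLAGS, &flags);
	}
	close(fd);
	return err;
}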
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 210cd6f66e28..1a9b515fc45d 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -392,7 +392,7 @@ invalid:
upperdentry, d_inode(upperdentry)->i_mode & S_IFMT,
d_inode(origin)->i_mode & S_IFMT);
dput(origin);
- return -EIO;
+ return -ESTALE;
}
static int ovl_check_origin(struct ovl_fs *ofs, struct dentry *upperdentry,
@@ -811,7 +811,7 @@ static int ovl_fix_origin(struct ovl_fs *ofs, struct dentry *dentry,
if (err)
return err;
- err = ovl_set_origin(ofs, dentry, lower, upper);
+ err = ovl_set_origin(ofs, lower, upper);
if (!err)
err = ovl_set_impure(dentry->d_parent, upper->d_parent);
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 6ec73db4bf9e..3894f3347955 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -34,6 +34,7 @@ enum ovl_xattr {
OVL_XATTR_NLINK,
OVL_XATTR_UPPER,
OVL_XATTR_METACOPY,
+ OVL_XATTR_PROTATTR,
};
enum ovl_inode_flag {
@@ -262,6 +263,18 @@ static inline bool ovl_open_flags_need_copy_up(int flags)
return ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC));
}
+static inline bool ovl_allow_offline_changes(struct ovl_fs *ofs)
+{
+ /*
+ * To avoid regressions in existing setups with overlay lower offline
+ * changes, we allow lower changes only if none of the new features
+ * are used.
+ */
+ return (!ofs->config.index && !ofs->config.metacopy &&
+ !ofs->config.redirect_dir && ofs->config.xino != OVL_XINO_ON);
+}
+
+
/* util.c */
int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
@@ -320,7 +333,7 @@ bool ovl_already_copied_up(struct dentry *dentry, int flags);
bool ovl_check_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry);
bool ovl_check_dir_xattr(struct super_block *sb, struct dentry *dentry,
enum ovl_xattr ox);
-int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry,
+int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry,
enum ovl_xattr ox, const void *value, size_t size,
int xerr);
int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry);
@@ -485,7 +498,7 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
-struct posix_acl *ovl_get_acl(struct inode *inode, int type);
+struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu);
int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags);
bool ovl_is_private_xattr(struct super_block *sb, const char *name);
@@ -518,9 +531,28 @@ static inline void ovl_copyattr(struct inode *from, struct inode *to)
i_size_write(to, i_size_read(from));
}
+/* vfs inode flags copied from real to ovl inode */
+#define OVL_COPY_I_FLAGS_MASK (S_SYNC | S_NOATIME | S_APPEND | S_IMMUTABLE)
+/* vfs inode flags read from overlay.protattr xattr to ovl inode */
+#define OVL_PROT_I_FLAGS_MASK (S_APPEND | S_IMMUTABLE)
+
+/*
+ * fileattr flags copied from lower to upper inode on copy up.
+ * We cannot copy up immutable/append-only flags, because that would prevent
+ * linking temp inode to upper dir, so we store them in xattr instead.
+ */
+#define OVL_COPY_FS_FLAGS_MASK (FS_SYNC_FL | FS_NOATIME_FL)
+#define OVL_COPY_FSX_FLAGS_MASK (FS_XFLAG_SYNC | FS_XFLAG_NOATIME)
+#define OVL_PROT_FS_FLAGS_MASK (FS_APPEND_FL | FS_IMMUTABLE_FL)
+#define OVL_PROT_FSX_FLAGS_MASK (FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE)
+
+void ovl_check_protattr(struct inode *inode, struct dentry *upper);
+int ovl_set_protattr(struct inode *inode, struct dentry *upper,
+ struct fileattr *fa);
+
static inline void ovl_copyflags(struct inode *from, struct inode *to)
{
- unsigned int mask = S_SYNC | S_IMMUTABLE | S_APPEND | S_NOATIME;
+ unsigned int mask = OVL_COPY_I_FLAGS_MASK;
inode_set_flags(to, from->i_flags & mask, mask);
}
@@ -548,6 +580,8 @@ struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr);
extern const struct file_operations ovl_file_operations;
int __init ovl_aio_request_cache_init(void);
void ovl_aio_request_cache_destroy(void);
+int ovl_real_fileattr_get(struct path *realpath, struct fileattr *fa);
+int ovl_real_fileattr_set(struct path *realpath, struct fileattr *fa);
int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa);
int ovl_fileattr_set(struct user_namespace *mnt_userns,
struct dentry *dentry, struct fileattr *fa);
@@ -561,8 +595,8 @@ int ovl_copy_xattr(struct super_block *sb, struct dentry *old,
int ovl_set_attr(struct dentry *upper, struct kstat *stat);
struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real,
bool is_upper);
-int ovl_set_origin(struct ovl_fs *ofs, struct dentry *dentry,
- struct dentry *lower, struct dentry *upper);
+int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower,
+ struct dentry *upper);
/* export.c */
extern const struct export_operations ovl_export_operations;
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index b01d4147520d..178daa5e82c9 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1599,9 +1599,7 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid)
* user opted-in to one of the new features that require following the
* lower inode of non-dir upper.
*/
- if (!ofs->config.index && !ofs->config.metacopy &&
- ofs->config.xino != OVL_XINO_ON &&
- uuid_is_null(uuid))
+ if (ovl_allow_offline_changes(ofs) && uuid_is_null(uuid))
return false;
for (i = 0; i < ofs->numfs; i++) {
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index b9d03627f364..f48284a2a896 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -10,6 +10,7 @@
#include <linux/cred.h>
#include <linux/xattr.h>
#include <linux/exportfs.h>
+#include <linux/fileattr.h>
#include <linux/uuid.h>
#include <linux/namei.h>
#include <linux/ratelimit.h>
@@ -585,6 +586,7 @@ bool ovl_check_dir_xattr(struct super_block *sb, struct dentry *dentry,
#define OVL_XATTR_NLINK_POSTFIX "nlink"
#define OVL_XATTR_UPPER_POSTFIX "upper"
#define OVL_XATTR_METACOPY_POSTFIX "metacopy"
+#define OVL_XATTR_PROTATTR_POSTFIX "protattr"
#define OVL_XATTR_TAB_ENTRY(x) \
[x] = { [false] = OVL_XATTR_TRUSTED_PREFIX x ## _POSTFIX, \
@@ -598,14 +600,14 @@ const char *const ovl_xattr_table[][2] = {
OVL_XATTR_TAB_ENTRY(OVL_XATTR_NLINK),
OVL_XATTR_TAB_ENTRY(OVL_XATTR_UPPER),
OVL_XATTR_TAB_ENTRY(OVL_XATTR_METACOPY),
+ OVL_XATTR_TAB_ENTRY(OVL_XATTR_PROTATTR),
};
-int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry,
+int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry,
enum ovl_xattr ox, const void *value, size_t size,
int xerr)
{
int err;
- struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
if (ofs->noxattr)
return xerr;
@@ -623,6 +625,7 @@ int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry,
int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry)
{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
int err;
if (ovl_test_flag(OVL_IMPURE, d_inode(dentry)))
@@ -632,14 +635,95 @@ int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry)
* Do not fail when upper doesn't support xattrs.
* Upper inodes won't have origin nor redirect xattr anyway.
*/
- err = ovl_check_setxattr(dentry, upperdentry, OVL_XATTR_IMPURE,
- "y", 1, 0);
+ err = ovl_check_setxattr(ofs, upperdentry, OVL_XATTR_IMPURE, "y", 1, 0);
if (!err)
ovl_set_flag(OVL_IMPURE, d_inode(dentry));
return err;
}
+
+#define OVL_PROTATTR_MAX 32 /* Reserved for future flags */
+
+void ovl_check_protattr(struct inode *inode, struct dentry *upper)
+{
+ struct ovl_fs *ofs = OVL_FS(inode->i_sb);
+ u32 iflags = inode->i_flags & OVL_PROT_I_FLAGS_MASK;
+ char buf[OVL_PROTATTR_MAX+1];
+ int res, n;
+
+ res = ovl_do_getxattr(ofs, upper, OVL_XATTR_PROTATTR, buf,
+ OVL_PROTATTR_MAX);
+ if (res < 0)
+ return;
+
+ /*
+ * Initialize inode flags from overlay.protattr xattr and upper inode
+ * flags. If upper inode has those fileattr flags set (i.e. from old
+ * kernel), we do not clear them on ovl_get_inode(), but we will clear
+ * them on next fileattr_set().
+ */
+ for (n = 0; n < res; n++) {
+ if (buf[n] == 'a')
+ iflags |= S_APPEND;
+ else if (buf[n] == 'i')
+ iflags |= S_IMMUTABLE;
+ else
+ break;
+ }
+
+ if (!res || n < res) {
+ pr_warn_ratelimited("incompatible overlay.protattr format (%pd2, len=%d)\n",
+ upper, res);
+ } else {
+ inode_set_flags(inode, iflags, OVL_PROT_I_FLAGS_MASK);
+ }
+}
+
+int ovl_set_protattr(struct inode *inode, struct dentry *upper,
+ struct fileattr *fa)
+{
+ struct ovl_fs *ofs = OVL_FS(inode->i_sb);
+ char buf[OVL_PROTATTR_MAX];
+ int len = 0, err = 0;
+ u32 iflags = 0;
+
+ BUILD_BUG_ON(HWEIGHT32(OVL_PROT_FS_FLAGS_MASK) > OVL_PROTATTR_MAX);
+
+ if (fa->flags & FS_APPEND_FL) {
+ buf[len++] = 'a';
+ iflags |= S_APPEND;
+ }
+ if (fa->flags & FS_IMMUTABLE_FL) {
+ buf[len++] = 'i';
+ iflags |= S_IMMUTABLE;
+ }
+
+ /*
+ * Do not allow to set protection flags when upper doesn't support
+ * xattrs, because we do not set those fileattr flags on upper inode.
+ * Remove the xattr if it exists and all protection flags are cleared.
+ */
+ if (len) {
+ err = ovl_check_setxattr(ofs, upper, OVL_XATTR_PROTATTR,
+ buf, len, -EPERM);
+ } else if (inode->i_flags & OVL_PROT_I_FLAGS_MASK) {
+ err = ovl_do_removexattr(ofs, upper, OVL_XATTR_PROTATTR);
+ if (err == -EOPNOTSUPP || err == -ENODATA)
+ err = 0;
+ }
+ if (err)
+ return err;
+
+ inode_set_flags(inode, iflags, OVL_PROT_I_FLAGS_MASK);
+
+ /* Mask out the fileattr flags that should not be set in upper inode */
+ fa->flags &= ~OVL_PROT_FS_FLAGS_MASK;
+ fa->fsx_xflags &= ~OVL_PROT_FSX_FLAGS_MASK;
+
+ return 0;
+}
+
/**
* Caller must hold a reference to inode to prevent it from being freed while
* it is marked inuse.
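For reference, the xattr written by ovl_set_protattr() above is a short string of flag characters ('a' for append-only, 'i' for immutable). A userspace sketch reading it back from the upper layer, assuming the default trusted.overlay.* namespace (mounts using the userxattr option would use user.overlay.protattr instead):

#include <stdio.h>
#include <sys/xattr.h>

static void show_protattr(const char *upper_path)
{
	char buf[33];	/* OVL_PROTATTR_MAX is 32 */
	ssize_t len = getxattr(upper_path, "trusted.overlay.protattr",
			       buf, sizeof(buf) - 1);

	if (len > 0) {
		buf[len] = '\0';
		printf("%s: protattr=%s\n", upper_path, buf);
	}
}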
diff --git a/fs/pipe.c b/fs/pipe.c
index 678dee2a8228..1fa1f52763f0 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -191,7 +191,7 @@ EXPORT_SYMBOL(generic_pipe_buf_try_steal);
*/
bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
- return try_get_page(buf->page);
+ return try_get_compound_head(buf->page, 1);
}
EXPORT_SYMBOL(generic_pipe_buf_get);
@@ -363,10 +363,9 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
* _very_ unlikely case that the pipe was full, but we got
* no data.
*/
- if (unlikely(was_full)) {
+ if (unlikely(was_full))
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
- kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
- }
+ kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
/*
* But because we didn't read anything, at this point we can
@@ -385,12 +384,11 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
wake_next_reader = false;
__pipe_unlock(pipe);
- if (was_full) {
+ if (was_full)
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
- kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
- }
if (wake_next_reader)
wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
+ kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
if (ret > 0)
file_accessed(filp);
return ret;
@@ -565,10 +563,9 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
* become empty while we dropped the lock.
*/
__pipe_unlock(pipe);
- if (was_empty) {
+ if (was_empty)
wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
- kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
- }
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
__pipe_lock(pipe);
was_empty = pipe_empty(pipe->head, pipe->tail);
@@ -591,10 +588,9 @@ out:
* Epoll nonsensically wants a wakeup whether the pipe
* was already empty or not.
*/
- if (was_empty || pipe->poll_usage) {
+ if (was_empty || pipe->poll_usage)
wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
- kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
- }
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
if (wake_next_writer)
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index f3309a7edb49..f5c25f580dd9 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -22,6 +22,7 @@
#include <linux/xattr.h>
#include <linux/export.h>
#include <linux/user_namespace.h>
+#include <linux/namei.h>
static struct posix_acl **acl_by_type(struct inode *inode, int type)
{
@@ -56,7 +57,17 @@ EXPORT_SYMBOL(get_cached_acl);
struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type)
{
- return rcu_dereference(*acl_by_type(inode, type));
+ struct posix_acl *acl = rcu_dereference(*acl_by_type(inode, type));
+
+ if (acl == ACL_DONT_CACHE) {
+ struct posix_acl *ret;
+
+ ret = inode->i_op->get_acl(inode, type, LOOKUP_RCU);
+ if (!IS_ERR(ret))
+ acl = ret;
+ }
+
+ return acl;
}
EXPORT_SYMBOL(get_cached_acl_rcu);
@@ -138,7 +149,7 @@ struct posix_acl *get_acl(struct inode *inode, int type)
set_cached_acl(inode, type, NULL);
return NULL;
}
- acl = inode->i_op->get_acl(inode, type);
+ acl = inode->i_op->get_acl(inode, type, false);
if (IS_ERR(acl)) {
/*
diff --git a/fs/read_write.c b/fs/read_write.c
index 9db7adf160d2..af057c57bdc6 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -365,12 +365,8 @@ out_putf:
int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
- struct inode *inode;
- int retval = -EINVAL;
-
- inode = file_inode(file);
if (unlikely((ssize_t) count < 0))
- return retval;
+ return -EINVAL;
/*
* ranged mandatory locking does not apply to streams - it makes sense
@@ -381,19 +377,12 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
if (unlikely(pos < 0)) {
if (!unsigned_offsets(file))
- return retval;
+ return -EINVAL;
if (count >= -pos) /* both values are in 0..LLONG_MAX */
return -EOVERFLOW;
} else if (unlikely((loff_t) (pos + count) < 0)) {
if (!unsigned_offsets(file))
- return retval;
- }
-
- if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
- retval = locks_mandatory_area(inode, file, pos, pos + count - 1,
- read_write == READ ? F_RDLCK : F_WRLCK);
- if (retval < 0)
- return retval;
+ return -EINVAL;
}
}
diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h
index fd58618da360..d9052b8ce6dd 100644
--- a/fs/reiserfs/acl.h
+++ b/fs/reiserfs/acl.h
@@ -48,7 +48,7 @@ static inline int reiserfs_acl_count(size_t size)
}
#ifdef CONFIG_REISERFS_FS_POSIX_ACL
-struct posix_acl *reiserfs_get_acl(struct inode *inode, int type);
+struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu);
int reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct posix_acl *acl, int type);
int reiserfs_acl_chmod(struct inode *inode);
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 500f2000eb41..30319dc33c18 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -8,7 +8,7 @@
#include <linux/string.h>
#include <linux/buffer_head.h>
-#include <stdarg.h>
+#include <linux/stdarg.h>
static char error_buf[1024];
static char fmt_buf[1024];
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index a9547144a099..d6fcddc46f5b 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -190,13 +190,16 @@ fail:
* inode->i_mutex: down
* BKL held [before 2.5.x]
*/
-struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
+struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu)
{
char *name, *value;
struct posix_acl *acl;
int size;
int retval;
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
switch (type) {
case ACL_TYPE_ACCESS:
name = XATTR_NAME_POSIX_ACL_ACCESS;
diff --git a/fs/remap_range.c b/fs/remap_range.c
index e4a5fdd7ad7b..6d4a9beaa097 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -99,24 +99,12 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in,
static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
bool write)
{
- struct inode *inode = file_inode(file);
-
if (unlikely(pos < 0 || len < 0))
return -EINVAL;
if (unlikely((loff_t) (pos + len) < 0))
return -EINVAL;
- if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
- loff_t end = len ? pos + len - 1 : OFFSET_MAX;
- int retval;
-
- retval = locks_mandatory_area(inode, file, pos, end,
- write ? F_WRLCK : F_RDLCK);
- if (retval < 0)
- return retval;
- }
-
return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
}
diff --git a/fs/select.c b/fs/select.c
index 945896d0ac9e..e83e563a351d 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -655,7 +655,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
goto out_nofds;
alloc_size = 6 * size;
- bits = kvmalloc(alloc_size, GFP_KERNEL);
+ bits = kvmalloc(alloc_size, GFP_KERNEL_ACCOUNT);
if (!bits)
goto out_nofds;
}
@@ -1000,7 +1000,7 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
len = min(todo, POLLFD_PER_PAGE);
walk = walk->next = kmalloc(struct_size(walk, entries, len),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!walk) {
err = -ENOMEM;
goto out_fds;
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 167b5889db4b..040e1cf90528 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -114,10 +114,10 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
break;
case SIL_FAULT_BNDERR:
case SIL_FAULT_PKUERR:
- case SIL_PERF_EVENT:
+ case SIL_FAULT_PERF_EVENT:
/*
* Fall through to the SIL_FAULT case. SIL_FAULT_BNDERR,
- * SIL_FAULT_PKUERR, and SIL_PERF_EVENT are only
+ * SIL_FAULT_PKUERR, and SIL_FAULT_PERF_EVENT are only
* generated by faults that deliver them synchronously to
* userspace. In case someone injects one of these signals
* and signalfd catches it treat it as SIL_FAULT.
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 855f0e87066d..2db8bcf7ff85 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -49,8 +49,7 @@ static int copy_bio_to_actor(struct bio *bio,
bytes_to_copy = min_t(int, bytes_to_copy,
req_length - copied_bytes);
- memcpy(actor_addr + actor_offset,
- page_address(bvec->bv_page) + bvec->bv_offset + offset,
+ memcpy(actor_addr + actor_offset, bvec_virt(bvec) + offset,
bytes_to_copy);
actor_offset += bytes_to_copy;
@@ -177,7 +176,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length,
goto out_free_bio;
}
/* Extract the length of the metadata block */
- data = page_address(bvec->bv_page) + bvec->bv_offset;
+ data = bvec_virt(bvec);
length = data[offset];
if (offset < bvec->bv_len - 1) {
length |= data[offset + 1] << 8;
@@ -186,7 +185,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length,
res = -EIO;
goto out_free_bio;
}
- data = page_address(bvec->bv_page) + bvec->bv_offset;
+ data = bvec_virt(bvec);
length |= data[0] << 8;
}
bio_free_pages(bio);
diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c
index 233d5582fbee..b685b6238316 100644
--- a/fs/squashfs/lz4_wrapper.c
+++ b/fs/squashfs/lz4_wrapper.c
@@ -101,7 +101,7 @@ static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm,
while (bio_next_segment(bio, &iter_all)) {
int avail = min(bytes, ((int)bvec->bv_len) - offset);
- data = page_address(bvec->bv_page) + bvec->bv_offset;
+ data = bvec_virt(bvec);
memcpy(buff, data + offset, avail);
buff += avail;
bytes -= avail;
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index 97bb7d92ddcd..cb510a631968 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -76,7 +76,7 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm,
while (bio_next_segment(bio, &iter_all)) {
int avail = min(bytes, ((int)bvec->bv_len) - offset);
- data = page_address(bvec->bv_page) + bvec->bv_offset;
+ data = bvec_virt(bvec);
memcpy(buff, data + offset, avail);
buff += avail;
bytes -= avail;
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index e80419aed862..68f6d09bb3a2 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -146,7 +146,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
}
avail = min(length, ((int)bvec->bv_len) - offset);
- data = page_address(bvec->bv_page) + bvec->bv_offset;
+ data = bvec_virt(bvec);
length -= avail;
stream->buf.in = data + offset;
stream->buf.in_size = avail;
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index bcb881ec47f2..a20e9042146b 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -76,7 +76,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
}
avail = min(length, ((int)bvec->bv_len) - offset);
- data = page_address(bvec->bv_page) + bvec->bv_offset;
+ data = bvec_virt(bvec);
length -= avail;
stream->next_in = data + offset;
stream->avail_in = avail;
diff --git a/fs/squashfs/zstd_wrapper.c b/fs/squashfs/zstd_wrapper.c
index b7cb1faa652d..0015cf8b5582 100644
--- a/fs/squashfs/zstd_wrapper.c
+++ b/fs/squashfs/zstd_wrapper.c
@@ -94,7 +94,7 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm,
}
avail = min(length, ((int)bvec->bv_len) - offset);
- data = page_address(bvec->bv_page) + bvec->bv_offset;
+ data = bvec_virt(bvec);
length -= avail;
in_buf.src = data + offset;
in_buf.size = avail;
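The squashfs hunks above are a mechanical conversion to the bvec_virt() helper, which (paraphrasing include/linux/bvec.h) is just the open-coded expression it replaces and is only valid for lowmem pages:

/* Equivalent of the replaced open-coded form */
static inline void *bvec_virt(struct bio_vec *bvec)
{
	return page_address(bvec->bv_page) + bvec->bv_offset;
}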
diff --git a/fs/stat.c b/fs/stat.c
index 1fa38bdec1a6..28d2020ba1f4 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -60,6 +60,24 @@ void generic_fillattr(struct user_namespace *mnt_userns, struct inode *inode,
EXPORT_SYMBOL(generic_fillattr);
/**
+ * generic_fill_statx_attr - Fill in the statx attributes from the inode flags
+ * @inode: Inode to use as the source
+ * @stat: Where to fill in the attribute flags
+ *
+ * Fill in the STATX_ATTR_* flags in the kstat structure for properties of the
+ * inode that are published on i_flags and enforced by the VFS.
+ */
+void generic_fill_statx_attr(struct inode *inode, struct kstat *stat)
+{
+ if (inode->i_flags & S_IMMUTABLE)
+ stat->attributes |= STATX_ATTR_IMMUTABLE;
+ if (inode->i_flags & S_APPEND)
+ stat->attributes |= STATX_ATTR_APPEND;
+ stat->attributes_mask |= KSTAT_ATTR_VFS_FLAGS;
+}
+EXPORT_SYMBOL(generic_fill_statx_attr);
+
+/**
* vfs_getattr_nosec - getattr without security checks
* @path: file to get attributes from
* @stat: structure to return attributes in
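A minimal sketch of how a filesystem's getattr would use the new helper; myfs_getattr is a hypothetical example (orangefs above and overlayfs earlier in this series are the real callers):

static int myfs_getattr(struct user_namespace *mnt_userns,
			const struct path *path, struct kstat *stat,
			u32 request_mask, unsigned int query_flags)
{
	struct inode *inode = d_inode(path->dentry);

	generic_fillattr(mnt_userns, inode, stat);
	/* Publish S_IMMUTABLE/S_APPEND as STATX_ATTR_* bits */
	generic_fill_statx_attr(inode, stat);
	return 0;
}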
diff --git a/fs/super.c b/fs/super.c
index 91b7f156735b..bcef3a6f4c4b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1203,7 +1203,7 @@ static int set_bdev_super(struct super_block *s, void *data)
{
s->s_bdev = data;
s->s_dev = s->s_bdev->bd_dev;
- s->s_bdi = bdi_get(s->s_bdev->bd_bdi);
+ s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi);
if (blk_queue_stable_writes(s->s_bdev->bd_disk->queue))
s->s_iflags |= SB_I_STABLE_WRITES;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 9aefa7779b29..d019d6ac6ad0 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -174,8 +174,8 @@ static int sysfs_kf_bin_open(struct kernfs_open_file *of)
{
struct bin_attribute *battr = of->kn->priv;
- if (battr->mapping)
- of->file->f_mapping = battr->mapping;
+ if (battr->f_mapping)
+ of->file->f_mapping = battr->f_mapping();
return 0;
}
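Context for the hunk above: struct bin_attribute's mapping field became an f_mapping() callback returning the address_space to install at open time. A hedged sketch of a provider, assuming (as the PCI resource files do) that iomem_get_mapping() is the mapping being returned; the attribute name and size are illustrative:

static struct bin_attribute example_iomem_attr = {
	.attr		= { .name = "example_resource", .mode = 0400 },
	.size		= 4096,
	/* Installed into file->f_mapping by sysfs_kf_bin_open() above */
	.f_mapping	= iomem_get_mapping,
};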
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 64e6a6698935..f29d62004527 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -446,7 +446,7 @@ int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj,
if (!target)
return -ENOENT;
- entry = kernfs_find_and_get(target_kobj->sd, target_name);
+ entry = kernfs_find_and_get(target, target_name);
if (!entry) {
kernfs_put(target);
return -ENOENT;
diff --git a/fs/timerfd.c b/fs/timerfd.c
index c5509d2448e3..e9c96a0c79f1 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -115,6 +115,22 @@ void timerfd_clock_was_set(void)
rcu_read_unlock();
}
+static void timerfd_resume_work(struct work_struct *work)
+{
+ timerfd_clock_was_set();
+}
+
+static DECLARE_WORK(timerfd_work, timerfd_resume_work);
+
+/*
+ * Invoked from timekeeping_resume(). Defer the actual update to work so
+ * timerfd_clock_was_set() runs in task context.
+ */
+void timerfd_resume(void)
+{
+ schedule_work(&timerfd_work);
+}
+
static void __timerfd_remove_cancel(struct timerfd_ctx *ctx)
{
if (ctx->might_cancel) {
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 2e4e1d159969..5cfa28cd00cd 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1630,6 +1630,17 @@ static const char *ubifs_get_link(struct dentry *dentry,
return fscrypt_get_symlink(inode, ui->data, ui->data_len, done);
}
+static int ubifs_symlink_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ ubifs_getattr(mnt_userns, path, stat, request_mask, query_flags);
+
+ if (IS_ENCRYPTED(d_inode(path->dentry)))
+ return fscrypt_symlink_getattr(path, stat);
+ return 0;
+}
+
const struct address_space_operations ubifs_file_address_operations = {
.readpage = ubifs_readpage,
.writepage = ubifs_writepage,
@@ -1655,7 +1666,7 @@ const struct inode_operations ubifs_file_inode_operations = {
const struct inode_operations ubifs_symlink_inode_operations = {
.get_link = ubifs_get_link,
.setattr = ubifs_setattr,
- .getattr = ubifs_getattr,
+ .getattr = ubifs_symlink_getattr,
.listxattr = ubifs_listxattr,
.update_time = ubifs_update_time,
};
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index c19dba45aa20..70abdfad2df1 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -35,7 +35,6 @@
#include "udf_i.h"
#include "udf_sb.h"
-
static int udf_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *dir = file_inode(file);
@@ -135,7 +134,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
lfi = cfi.lengthFileIdent;
if (fibh.sbh == fibh.ebh) {
- nameptr = fi->fileIdent + liu;
+ nameptr = udf_get_fi_ident(fi);
} else {
int poffset; /* Unpadded ending offset */
@@ -153,7 +152,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
}
}
nameptr = copy_name;
- memcpy(nameptr, fi->fileIdent + liu,
+ memcpy(nameptr, udf_get_fi_ident(fi),
lfi - poffset);
memcpy(nameptr + lfi - poffset,
fibh.ebh->b_data, poffset);
diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h
index 185c3e247648..de17a97e8667 100644
--- a/fs/udf/ecma_167.h
+++ b/fs/udf/ecma_167.h
@@ -307,14 +307,14 @@ struct logicalVolDesc {
struct regid impIdent;
uint8_t impUse[128];
struct extent_ad integritySeqExt;
- uint8_t partitionMaps[0];
+ uint8_t partitionMaps[];
} __packed;
/* Generic Partition Map (ECMA 167r3 3/10.7.1) */
struct genericPartitionMap {
uint8_t partitionMapType;
uint8_t partitionMapLength;
- uint8_t partitionMapping[0];
+ uint8_t partitionMapping[];
} __packed;
/* Partition Map Type (ECMA 167r3 3/10.7.1.1) */
@@ -342,7 +342,7 @@ struct unallocSpaceDesc {
struct tag descTag;
__le32 volDescSeqNum;
__le32 numAllocDescs;
- struct extent_ad allocDescs[0];
+ struct extent_ad allocDescs[];
} __packed;
/* Terminating Descriptor (ECMA 167r3 3/10.9) */
@@ -360,9 +360,9 @@ struct logicalVolIntegrityDesc {
uint8_t logicalVolContentsUse[32];
__le32 numOfPartitions;
__le32 lengthOfImpUse;
- __le32 freeSpaceTable[0];
- __le32 sizeTable[0];
- uint8_t impUse[0];
+ __le32 freeSpaceTable[];
+ /* __le32 sizeTable[]; */
+ /* uint8_t impUse[]; */
} __packed;
/* Integrity Type (ECMA 167r3 3/10.10.3) */
@@ -471,9 +471,9 @@ struct fileIdentDesc {
uint8_t lengthFileIdent;
struct long_ad icb;
__le16 lengthOfImpUse;
- uint8_t impUse[0];
- uint8_t fileIdent[0];
- uint8_t padding[0];
+ uint8_t impUse[];
+ /* uint8_t fileIdent[]; */
+ /* uint8_t padding[]; */
} __packed;
/* File Characteristics (ECMA 167r3 4/14.4.3) */
@@ -578,8 +578,8 @@ struct fileEntry {
__le64 uniqueID;
__le32 lengthExtendedAttr;
__le32 lengthAllocDescs;
- uint8_t extendedAttr[0];
- uint8_t allocDescs[0];
+ uint8_t extendedAttr[];
+ /* uint8_t allocDescs[]; */
} __packed;
/* Permissions (ECMA 167r3 4/14.9.5) */
@@ -632,7 +632,7 @@ struct genericFormat {
uint8_t attrSubtype;
uint8_t reserved[3];
__le32 attrLength;
- uint8_t attrData[0];
+ uint8_t attrData[];
} __packed;
/* Character Set Information (ECMA 167r3 4/14.10.3) */
@@ -643,7 +643,7 @@ struct charSetInfo {
__le32 attrLength;
__le32 escapeSeqLength;
uint8_t charSetType;
- uint8_t escapeSeq[0];
+ uint8_t escapeSeq[];
} __packed;
/* Alternate Permissions (ECMA 167r3 4/14.10.4) */
@@ -682,7 +682,7 @@ struct infoTimesExtAttr {
__le32 attrLength;
__le32 dataLength;
__le32 infoTimeExistence;
- uint8_t infoTimes[0];
+ uint8_t infoTimes[];
} __packed;
/* Device Specification (ECMA 167r3 4/14.10.7) */
@@ -694,7 +694,7 @@ struct deviceSpec {
__le32 impUseLength;
__le32 majorDeviceIdent;
__le32 minorDeviceIdent;
- uint8_t impUse[0];
+ uint8_t impUse[];
} __packed;
/* Implementation Use Extended Attr (ECMA 167r3 4/14.10.8) */
@@ -705,7 +705,7 @@ struct impUseExtAttr {
__le32 attrLength;
__le32 impUseLength;
struct regid impIdent;
- uint8_t impUse[0];
+ uint8_t impUse[];
} __packed;
/* Application Use Extended Attribute (ECMA 167r3 4/14.10.9) */
@@ -716,7 +716,7 @@ struct appUseExtAttr {
__le32 attrLength;
__le32 appUseLength;
struct regid appIdent;
- uint8_t appUse[0];
+ uint8_t appUse[];
} __packed;
#define EXTATTR_CHAR_SET 1
@@ -733,7 +733,7 @@ struct unallocSpaceEntry {
struct tag descTag;
struct icbtag icbTag;
__le32 lengthAllocDescs;
- uint8_t allocDescs[0];
+ uint8_t allocDescs[];
} __packed;
/* Space Bitmap Descriptor (ECMA 167r3 4/14.12) */
@@ -741,7 +741,7 @@ struct spaceBitmapDesc {
struct tag descTag;
__le32 numOfBits;
__le32 numOfBytes;
- uint8_t bitmap[0];
+ uint8_t bitmap[];
} __packed;
/* Partition Integrity Entry (ECMA 167r3 4/14.13) */
@@ -780,7 +780,7 @@ struct pathComponent {
uint8_t componentType;
uint8_t lengthComponentIdent;
__le16 componentFileVersionNum;
- dchars componentIdent[0];
+ dchars componentIdent[];
} __packed;
/* File Entry (ECMA 167r3 4/14.17) */
@@ -809,8 +809,8 @@ struct extendedFileEntry {
__le64 uniqueID;
__le32 lengthExtendedAttr;
__le32 lengthAllocDescs;
- uint8_t extendedAttr[0];
- uint8_t allocDescs[0];
+ uint8_t extendedAttr[];
+ /* uint8_t allocDescs[]; */
} __packed;
#endif /* _ECMA_167_H */
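The [0]-to-[] conversions above also drop trailing pseudo-members that overlapped the first variable-length field (fileIdent[] and padding[] both began where impUse[] ends), so accesses now go through a helper instead. A sketch of the idea with an illustrative name; the real helper used by the inode.c and namei.c hunks below is udf_get_fi_ident():

/*
 * The file identifier starts lengthOfImpUse bytes into the variable
 * impUse[] area, which is why a separate fileIdent[0] member on
 * struct fileIdentDesc is no longer needed.
 */
static inline uint8_t *example_get_fi_ident(struct fileIdentDesc *fi)
{
	return fi->impUse + le16_to_cpu(fi->lengthOfImpUse);
}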
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 4917670860a0..1d6b7a50736b 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -390,8 +390,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode,
dfibh.eoffset += (sfibh.eoffset - sfibh.soffset);
dfi = (struct fileIdentDesc *)(dbh->b_data + dfibh.soffset);
if (udf_write_fi(inode, sfi, dfi, &dfibh, sfi->impUse,
- sfi->fileIdent +
- le16_to_cpu(sfi->lengthOfImpUse))) {
+ udf_get_fi_ident(sfi))) {
iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
brelse(dbh);
return NULL;
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index eab94527340d..1614d308d0f0 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -173,13 +173,22 @@ struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type,
else
offset = le32_to_cpu(eahd->appAttrLocation);
- while (offset < iinfo->i_lenEAttr) {
+ while (offset + sizeof(*gaf) < iinfo->i_lenEAttr) {
+ uint32_t attrLength;
+
gaf = (struct genericFormat *)&ea[offset];
+ attrLength = le32_to_cpu(gaf->attrLength);
+
+ /* Detect undersized elements and buffer overflows */
+ if ((attrLength < sizeof(*gaf)) ||
+ (attrLength > (iinfo->i_lenEAttr - offset)))
+ break;
+
if (le32_to_cpu(gaf->attrType) == type &&
gaf->attrSubtype == subtype)
return gaf;
else
- offset += le32_to_cpu(gaf->attrLength);
+ offset += attrLength;
}
}
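The new check above guards a classic length-prefixed record walk: each element declares its own length, so both an undersized and an oversized attrLength must terminate the loop, otherwise the cursor can spin forever or run past the buffer. A generic sketch of the same guard with hypothetical names:

struct rec { u32 type; u32 length; /* length covers the whole record */ };

static const struct rec *find_rec(const u8 *buf, u32 len, u32 want)
{
	u32 off = 0;

	while (off + sizeof(struct rec) <= len) {
		const struct rec *r = (const void *)(buf + off);
		u32 rlen = r->length;

		/* Reject undersized records and records overrunning the buffer */
		if (rlen < sizeof(struct rec) || rlen > len - off)
			break;
		if (r->type == want)
			return r;
		off += rlen;
	}
	return NULL;
}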
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 7c7c9bbbfa57..caeef08efed2 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -74,12 +74,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
if (fileident) {
if (adinicb || (offset + lfi < 0)) {
- memcpy((uint8_t *)sfi->fileIdent + liu, fileident, lfi);
+ memcpy(udf_get_fi_ident(sfi), fileident, lfi);
} else if (offset >= 0) {
memcpy(fibh->ebh->b_data + offset, fileident, lfi);
} else {
- memcpy((uint8_t *)sfi->fileIdent + liu, fileident,
- -offset);
+ memcpy(udf_get_fi_ident(sfi), fileident, -offset);
memcpy(fibh->ebh->b_data, fileident - offset,
lfi + offset);
}
@@ -88,11 +87,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
offset += lfi;
if (adinicb || (offset + padlen < 0)) {
- memset((uint8_t *)sfi->padding + liu + lfi, 0x00, padlen);
+ memset(udf_get_fi_ident(sfi) + lfi, 0x00, padlen);
} else if (offset >= 0) {
memset(fibh->ebh->b_data + offset, 0x00, padlen);
} else {
- memset((uint8_t *)sfi->padding + liu + lfi, 0x00, -offset);
+ memset(udf_get_fi_ident(sfi) + lfi, 0x00, -offset);
memset(fibh->ebh->b_data, 0x00, padlen + offset);
}
@@ -226,7 +225,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
lfi = cfi->lengthFileIdent;
if (fibh->sbh == fibh->ebh) {
- nameptr = fi->fileIdent + liu;
+ nameptr = udf_get_fi_ident(fi);
} else {
int poffset; /* Unpadded ending offset */
@@ -246,7 +245,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
}
}
nameptr = copy_name;
- memcpy(nameptr, fi->fileIdent + liu,
+ memcpy(nameptr, udf_get_fi_ident(fi),
lfi - poffset);
memcpy(nameptr + lfi - poffset,
fibh->ebh->b_data, poffset);
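The udf_get_fi_ident() helper used above (added to udfdecl.h later in this series) replaces the open-coded fi->fileIdent + liu arithmetic: the identifier sits immediately after the fixed fileIdentDesc header plus lengthOfImpUse bytes of implementation-use data. A standalone sketch of that pointer arithmetic with a simplified, hypothetical descriptor layout:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical entry: fixed header, then impUse bytes, then the identifier. */
struct fid_hdr {
    uint16_t length_of_imp_use;
    uint8_t  length_file_ident;
};

/* The identifier starts right past the header plus the impUse area. */
static uint8_t *fid_ident(struct fid_hdr *fid)
{
    return (uint8_t *)(fid + 1) + fid->length_of_imp_use;
}

int main(void)
{
    const char name[] = "example";
    struct fid_hdr *fid = calloc(1, sizeof(*fid) + 4 + sizeof(name));

    if (!fid)
        return 1;
    fid->length_of_imp_use = 4;              /* 4 bytes of impUse data */
    fid->length_file_ident = sizeof(name);
    memcpy(fid_ident(fid), name, sizeof(name));
    printf("ident: %s\n", fid_ident(fid));
    free(fid);
    return 0;
}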
diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h
index 22bc4fb2feb9..157de0ec0cd5 100644
--- a/fs/udf/osta_udf.h
+++ b/fs/udf/osta_udf.h
@@ -111,7 +111,7 @@ struct logicalVolIntegrityDescImpUse {
__le16 minUDFReadRev;
__le16 minUDFWriteRev;
__le16 maxUDFWriteRev;
- uint8_t impUse[0];
+ uint8_t impUse[];
} __packed;
/* Implementation Use Volume Descriptor (UDF 2.60 2.2.7) */
@@ -178,15 +178,6 @@ struct metadataPartitionMap {
uint8_t reserved2[5];
} __packed;
-/* Virtual Allocation Table (UDF 1.5 2.2.10) */
-struct virtualAllocationTable15 {
- __le32 vatEntry[0];
- struct regid vatIdent;
- __le32 previousVATICBLoc;
-} __packed;
-
-#define ICBTAG_FILE_TYPE_VAT15 0x00U
-
/* Virtual Allocation Table (UDF 2.60 2.2.11) */
struct virtualAllocationTable20 {
__le16 lengthHeader;
@@ -199,8 +190,8 @@ struct virtualAllocationTable20 {
__le16 minUDFWriteRev;
__le16 maxUDFWriteRev;
__le16 reserved;
- uint8_t impUse[0];
- __le32 vatEntry[0];
+ uint8_t impUse[];
+ /* __le32 vatEntry[]; */
} __packed;
#define ICBTAG_FILE_TYPE_VAT20 0xF8U
@@ -217,8 +208,7 @@ struct sparingTable {
__le16 reallocationTableLen;
__le16 reserved;
__le32 sequenceNum;
- struct sparingEntry
- mapEntry[0];
+ struct sparingEntry mapEntry[];
} __packed;
/* Metadata File (and Metadata Mirror File) (UDF 2.60 2.2.13.1) */
@@ -241,7 +231,7 @@ struct allocDescImpUse {
/* FreeEASpace (UDF 2.60 3.3.4.5.1.1) */
struct freeEaSpace {
__le16 headerChecksum;
- uint8_t freeEASpace[0];
+ uint8_t freeEASpace[];
} __packed;
/* DVD Copyright Management Information (UDF 2.60 3.3.4.5.1.2) */
@@ -265,7 +255,7 @@ struct LVExtensionEA {
/* FreeAppEASpace (UDF 2.60 3.3.4.6.1) */
struct freeAppEASpace {
__le16 headerChecksum;
- uint8_t freeEASpace[0];
+ uint8_t freeEASpace[];
} __packed;
/* UDF Defined System Stream (UDF 2.60 3.3.7) */
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 2f83c1204e20..b2d7c57d0688 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -108,16 +108,10 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct super_block *sb)
return NULL;
lvid = (struct logicalVolIntegrityDesc *)UDF_SB(sb)->s_lvid_bh->b_data;
partnum = le32_to_cpu(lvid->numOfPartitions);
- if ((sb->s_blocksize - sizeof(struct logicalVolIntegrityDescImpUse) -
- offsetof(struct logicalVolIntegrityDesc, impUse)) /
- (2 * sizeof(uint32_t)) < partnum) {
- udf_err(sb, "Logical volume integrity descriptor corrupted "
- "(numOfPartitions = %u)!\n", partnum);
- return NULL;
- }
/* The offset is to skip freeSpaceTable and sizeTable arrays */
offset = partnum * 2 * sizeof(uint32_t);
- return (struct logicalVolIntegrityDescImpUse *)&(lvid->impUse[offset]);
+ return (struct logicalVolIntegrityDescImpUse *)
+ (((uint8_t *)(lvid + 1)) + offset);
}
/* UDF filesystem type */
@@ -349,10 +343,10 @@ static int udf_show_options(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",lastblock=%u", sbi->s_last_block);
if (sbi->s_anchor != 0)
seq_printf(seq, ",anchor=%u", sbi->s_anchor);
- if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8))
- seq_puts(seq, ",utf8");
- if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP) && sbi->s_nls_map)
+ if (sbi->s_nls_map)
seq_printf(seq, ",iocharset=%s", sbi->s_nls_map->charset);
+ else
+ seq_puts(seq, ",iocharset=utf8");
return 0;
}
@@ -558,19 +552,24 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
/* Ignored (never implemented properly) */
break;
case Opt_utf8:
- uopt->flags |= (1 << UDF_FLAG_UTF8);
+ if (!remount) {
+ unload_nls(uopt->nls_map);
+ uopt->nls_map = NULL;
+ }
break;
case Opt_iocharset:
if (!remount) {
- if (uopt->nls_map)
- unload_nls(uopt->nls_map);
- /*
- * load_nls() failure is handled later in
- * udf_fill_super() after all options are
- * parsed.
- */
+ unload_nls(uopt->nls_map);
+ uopt->nls_map = NULL;
+ }
+ /* When nls_map is not loaded then UTF-8 is used */
+ if (!remount && strcmp(args[0].from, "utf8") != 0) {
uopt->nls_map = load_nls(args[0].from);
- uopt->flags |= (1 << UDF_FLAG_NLS_MAP);
+ if (!uopt->nls_map) {
+ pr_err("iocharset %s not found\n",
+ args[0].from);
+ return 0;
+ }
}
break;
case Opt_uforget:
@@ -1542,6 +1541,7 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_
struct udf_sb_info *sbi = UDF_SB(sb);
struct logicalVolIntegrityDesc *lvid;
int indirections = 0;
+ u32 parts, impuselen;
while (++indirections <= UDF_MAX_LVID_NESTING) {
final_bh = NULL;
@@ -1568,15 +1568,27 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_
lvid = (struct logicalVolIntegrityDesc *)final_bh->b_data;
if (lvid->nextIntegrityExt.extLength == 0)
- return;
+ goto check;
loc = leea_to_cpu(lvid->nextIntegrityExt);
}
udf_warn(sb, "Too many LVID indirections (max %u), ignoring.\n",
UDF_MAX_LVID_NESTING);
+out_err:
brelse(sbi->s_lvid_bh);
sbi->s_lvid_bh = NULL;
+ return;
+check:
+ parts = le32_to_cpu(lvid->numOfPartitions);
+ impuselen = le32_to_cpu(lvid->lengthOfImpUse);
+ if (parts >= sb->s_blocksize || impuselen >= sb->s_blocksize ||
+ sizeof(struct logicalVolIntegrityDesc) + impuselen +
+ 2 * parts * sizeof(u32) > sb->s_blocksize) {
+ udf_warn(sb, "Corrupted LVID (parts=%u, impuselen=%u), "
+ "ignoring.\n", parts, impuselen);
+ goto out_err;
+ }
}
/*
@@ -2139,21 +2151,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
if (!udf_parse_options((char *)options, &uopt, false))
goto parse_options_failure;
- if (uopt.flags & (1 << UDF_FLAG_UTF8) &&
- uopt.flags & (1 << UDF_FLAG_NLS_MAP)) {
- udf_err(sb, "utf8 cannot be combined with iocharset\n");
- goto parse_options_failure;
- }
- if ((uopt.flags & (1 << UDF_FLAG_NLS_MAP)) && !uopt.nls_map) {
- uopt.nls_map = load_nls_default();
- if (!uopt.nls_map)
- uopt.flags &= ~(1 << UDF_FLAG_NLS_MAP);
- else
- udf_debug("Using default NLS map\n");
- }
- if (!(uopt.flags & (1 << UDF_FLAG_NLS_MAP)))
- uopt.flags |= (1 << UDF_FLAG_UTF8);
-
fileset.logicalBlockNum = 0xFFFFFFFF;
fileset.partitionReferenceNum = 0xFFFF;
@@ -2308,8 +2305,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
error_out:
iput(sbi->s_vat_inode);
parse_options_failure:
- if (uopt.nls_map)
- unload_nls(uopt.nls_map);
+ unload_nls(uopt.nls_map);
if (lvid_open)
udf_close_lvid(sb);
brelse(sbi->s_lvid_bh);
@@ -2359,8 +2355,7 @@ static void udf_put_super(struct super_block *sb)
sbi = UDF_SB(sb);
iput(sbi->s_vat_inode);
- if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
- unload_nls(sbi->s_nls_map);
+ unload_nls(sbi->s_nls_map);
if (!sb_rdonly(sb))
udf_close_lvid(sb);
brelse(sbi->s_lvid_bh);
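The check: block added above validates the LVID before udf_sb_lvidiu() trusts it: numOfPartitions and lengthOfImpUse must each stay below the block size, and the fixed descriptor plus the two per-partition u32 tables plus the impUse area must still fit in one block. A small sketch of that overflow-safe bound (the field names and the fixed size are placeholders, not the real structure):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define LVID_FIXED_SIZE 80u  /* stand-in for the fixed descriptor size */

/* True when parts/impuselen describe data that fits in one block.
 * Each value is first capped below blocksize so the sum cannot overflow. */
static bool lvid_sane(uint32_t parts, uint32_t impuselen, uint32_t blocksize)
{
    if (parts >= blocksize || impuselen >= blocksize)
        return false;
    return LVID_FIXED_SIZE + impuselen + 2ull * parts * sizeof(uint32_t)
            <= blocksize;
}

int main(void)
{
    printf("parts=2,   impuse=32: %d\n", lvid_sane(2, 32, 2048));
    printf("parts=600, impuse=0:  %d\n", lvid_sane(600, 0, 2048));
    return 0;
}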
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 758efe557a19..4fa620543d30 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -20,8 +20,6 @@
#define UDF_FLAG_UNDELETE 6
#define UDF_FLAG_UNHIDE 7
#define UDF_FLAG_VARCONV 8
-#define UDF_FLAG_NLS_MAP 9
-#define UDF_FLAG_UTF8 10
#define UDF_FLAG_UID_FORGET 11 /* save -1 for uid to disk */
#define UDF_FLAG_GID_FORGET 12
#define UDF_FLAG_UID_SET 13
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 9dd0814f1077..7e258f15b8ef 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -130,6 +130,10 @@ static inline unsigned int udf_dir_entry_len(struct fileIdentDesc *cfi)
le16_to_cpu(cfi->lengthOfImpUse) + cfi->lengthFileIdent,
UDF_NAME_PAD);
}
+static inline uint8_t *udf_get_fi_ident(struct fileIdentDesc *fi)
+{
+ return ((uint8_t *)(fi + 1)) + le16_to_cpu(fi->lengthOfImpUse);
+}
/* file.c */
extern long udf_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 5fcfa96463eb..622569007b53 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -177,7 +177,7 @@ static int udf_name_from_CS0(struct super_block *sb,
return 0;
}
- if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
+ if (UDF_SB(sb)->s_nls_map)
conv_f = UDF_SB(sb)->s_nls_map->uni2char;
else
conv_f = NULL;
@@ -285,7 +285,7 @@ static int udf_name_to_CS0(struct super_block *sb,
if (ocu_max_len <= 0)
return 0;
- if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
+ if (UDF_SB(sb)->s_nls_map)
conv_f = UDF_SB(sb)->s_nls_map->char2uni;
else
conv_f = NULL;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 74028b5a7b0a..00a01471ea05 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -70,7 +70,7 @@
#include <linux/module.h>
#include <linux/bitops.h>
-#include <stdarg.h>
+#include <linux/stdarg.h>
#include <linux/uaccess.h>
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 5c2d806e6ae5..003f0d31743e 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -33,11 +33,6 @@ int sysctl_unprivileged_userfaultfd __read_mostly;
static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
-enum userfaultfd_state {
- UFFD_STATE_WAIT_API,
- UFFD_STATE_RUNNING,
-};
-
/*
* Start with fault_pending_wqh and fault_wqh so they're more likely
* to be in the same cacheline.
@@ -69,12 +64,10 @@ struct userfaultfd_ctx {
unsigned int flags;
/* features requested from the userspace */
unsigned int features;
- /* state machine */
- enum userfaultfd_state state;
/* released */
bool released;
/* memory mappings are changing because of non-cooperative event */
- bool mmap_changing;
+ atomic_t mmap_changing;
/* mm with one or more vmas attached to this userfaultfd_ctx */
struct mm_struct *mm;
};
@@ -104,6 +97,14 @@ struct userfaultfd_wake_range {
unsigned long len;
};
+/* internal indication that UFFD_API ioctl was successfully executed */
+#define UFFD_FEATURE_INITIALIZED (1u << 31)
+
+static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
+{
+ return ctx->features & UFFD_FEATURE_INITIALIZED;
+}
+
static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
int wake_flags, void *key)
{
@@ -623,7 +624,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
* already released.
*/
out:
- WRITE_ONCE(ctx->mmap_changing, false);
+ atomic_dec(&ctx->mmap_changing);
+ VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0);
userfaultfd_ctx_put(ctx);
}
@@ -666,15 +668,14 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
refcount_set(&ctx->refcount, 1);
ctx->flags = octx->flags;
- ctx->state = UFFD_STATE_RUNNING;
ctx->features = octx->features;
ctx->released = false;
- ctx->mmap_changing = false;
+ atomic_set(&ctx->mmap_changing, 0);
ctx->mm = vma->vm_mm;
mmgrab(ctx->mm);
userfaultfd_ctx_get(octx);
- WRITE_ONCE(octx->mmap_changing, true);
+ atomic_inc(&octx->mmap_changing);
fctx->orig = octx;
fctx->new = ctx;
list_add_tail(&fctx->list, fcs);
@@ -721,7 +722,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
vm_ctx->ctx = ctx;
userfaultfd_ctx_get(ctx);
- WRITE_ONCE(ctx->mmap_changing, true);
+ atomic_inc(&ctx->mmap_changing);
} else {
/* Drop uffd context if remap feature not enabled */
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
@@ -766,7 +767,7 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
return true;
userfaultfd_ctx_get(ctx);
- WRITE_ONCE(ctx->mmap_changing, true);
+ atomic_inc(&ctx->mmap_changing);
mmap_read_unlock(mm);
msg_init(&ewq.msg);
@@ -810,7 +811,7 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma,
return -ENOMEM;
userfaultfd_ctx_get(ctx);
- WRITE_ONCE(ctx->mmap_changing, true);
+ atomic_inc(&ctx->mmap_changing);
unmap_ctx->ctx = ctx;
unmap_ctx->start = start;
unmap_ctx->end = end;
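The hunks above turn mmap_changing from a bool into an atomic counter: every non-cooperative event that starts bumps it, completion in userfaultfd_event_wait_completion() drops it, and the ioctl paths return -EAGAIN while it is non-zero, so overlapping events no longer clear each other's flag. A userspace sketch of the same idea with C11 atomics (hypothetical names):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int mmap_changing;   /* 0 = stable, >0 = events in flight */

static void event_start(void) { atomic_fetch_add(&mmap_changing, 1); }
static void event_done(void)  { atomic_fetch_sub(&mmap_changing, 1); }

/* Copy-style operation: refuse to run while any event is still pending. */
static int try_copy(void)
{
    if (atomic_load(&mmap_changing))
        return -1;                 /* caller retries, analogous to -EAGAIN */
    return 0;
}

int main(void)
{
    event_start();
    event_start();                 /* two overlapping events */
    printf("copy while both pending: %d\n", try_copy());
    event_done();
    printf("copy, one still pending: %d\n", try_copy());
    event_done();
    printf("copy after both done:    %d\n", try_copy());
    return 0;
}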
@@ -943,38 +944,33 @@ static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
poll_wait(file, &ctx->fd_wqh, wait);
- switch (ctx->state) {
- case UFFD_STATE_WAIT_API:
+ if (!userfaultfd_is_initialized(ctx))
return EPOLLERR;
- case UFFD_STATE_RUNNING:
- /*
- * poll() never guarantees that read won't block.
- * userfaults can be woken before they're read().
- */
- if (unlikely(!(file->f_flags & O_NONBLOCK)))
- return EPOLLERR;
- /*
- * lockless access to see if there are pending faults
- * __pollwait last action is the add_wait_queue but
- * the spin_unlock would allow the waitqueue_active to
- * pass above the actual list_add inside
- * add_wait_queue critical section. So use a full
- * memory barrier to serialize the list_add write of
- * add_wait_queue() with the waitqueue_active read
- * below.
- */
- ret = 0;
- smp_mb();
- if (waitqueue_active(&ctx->fault_pending_wqh))
- ret = EPOLLIN;
- else if (waitqueue_active(&ctx->event_wqh))
- ret = EPOLLIN;
- return ret;
- default:
- WARN_ON_ONCE(1);
+ /*
+ * poll() never guarantees that read won't block.
+ * userfaults can be woken before they're read().
+ */
+ if (unlikely(!(file->f_flags & O_NONBLOCK)))
return EPOLLERR;
- }
+ /*
+ * lockless access to see if there are pending faults
+ * __pollwait last action is the add_wait_queue but
+ * the spin_unlock would allow the waitqueue_active to
+ * pass above the actual list_add inside
+ * add_wait_queue critical section. So use a full
+ * memory barrier to serialize the list_add write of
+ * add_wait_queue() with the waitqueue_active read
+ * below.
+ */
+ ret = 0;
+ smp_mb();
+ if (waitqueue_active(&ctx->fault_pending_wqh))
+ ret = EPOLLIN;
+ else if (waitqueue_active(&ctx->event_wqh))
+ ret = EPOLLIN;
+
+ return ret;
}
static const struct file_operations userfaultfd_fops;
@@ -1169,7 +1165,7 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf,
int no_wait = file->f_flags & O_NONBLOCK;
struct inode *inode = file_inode(file);
- if (ctx->state == UFFD_STATE_WAIT_API)
+ if (!userfaultfd_is_initialized(ctx))
return -EINVAL;
for (;;) {
@@ -1700,7 +1696,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
user_uffdio_copy = (struct uffdio_copy __user *) arg;
ret = -EAGAIN;
- if (READ_ONCE(ctx->mmap_changing))
+ if (atomic_read(&ctx->mmap_changing))
goto out;
ret = -EFAULT;
@@ -1757,7 +1753,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
ret = -EAGAIN;
- if (READ_ONCE(ctx->mmap_changing))
+ if (atomic_read(&ctx->mmap_changing))
goto out;
ret = -EFAULT;
@@ -1807,7 +1803,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
struct userfaultfd_wake_range range;
bool mode_wp, mode_dontwake;
- if (READ_ONCE(ctx->mmap_changing))
+ if (atomic_read(&ctx->mmap_changing))
return -EAGAIN;
user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
@@ -1855,7 +1851,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
user_uffdio_continue = (struct uffdio_continue __user *)arg;
ret = -EAGAIN;
- if (READ_ONCE(ctx->mmap_changing))
+ if (atomic_read(&ctx->mmap_changing))
goto out;
ret = -EFAULT;
@@ -1908,9 +1904,10 @@ out:
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
/*
- * For the current set of features the bits just coincide
+ * For the current set of features the bits just coincide. Set
+ * UFFD_FEATURE_INITIALIZED to mark the features as enabled.
*/
- return (unsigned int)user_features;
+ return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
}
/*
@@ -1923,12 +1920,10 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
{
struct uffdio_api uffdio_api;
void __user *buf = (void __user *)arg;
+ unsigned int ctx_features;
int ret;
__u64 features;
- ret = -EINVAL;
- if (ctx->state != UFFD_STATE_WAIT_API)
- goto out;
ret = -EFAULT;
if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
goto out;
@@ -1952,9 +1947,13 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
ret = -EFAULT;
if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
goto out;
- ctx->state = UFFD_STATE_RUNNING;
+
/* only enable the requested features for this uffd context */
- ctx->features = uffd_ctx_features(features);
+ ctx_features = uffd_ctx_features(features);
+ ret = -EINVAL;
+ if (cmpxchg(&ctx->features, 0, ctx_features) != 0)
+ goto err_out;
+
ret = 0;
out:
return ret;
@@ -1971,7 +1970,7 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
int ret = -EINVAL;
struct userfaultfd_ctx *ctx = file->private_data;
- if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API)
+ if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx))
return -EINVAL;
switch(cmd) {
@@ -2085,9 +2084,8 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
refcount_set(&ctx->refcount, 1);
ctx->flags = flags;
ctx->features = 0;
- ctx->state = UFFD_STATE_WAIT_API;
ctx->released = false;
- ctx->mmap_changing = false;
+ atomic_set(&ctx->mmap_changing, 0);
ctx->mm = current->mm;
/* prevent the mm struct to be freed */
mmgrab(ctx->mm);
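With the state field gone, a reserved feature bit (UFFD_FEATURE_INITIALIZED) records that UFFDIO_API completed, and userfaultfd_api() publishes the negotiated features with a single cmpxchg from 0 so two racing UFFDIO_API calls cannot both succeed. A compact userspace analogue using C11 compare-exchange (the names are illustrative):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define FEATURE_INITIALIZED (1u << 31)   /* reserved "API done" bit */

static _Atomic unsigned int ctx_features;

/* Enable the requested features exactly once; a racing second caller loses. */
static bool api_handshake(unsigned int requested)
{
    unsigned int expected = 0;
    unsigned int desired = requested | FEATURE_INITIALIZED;

    return atomic_compare_exchange_strong(&ctx_features, &expected, desired);
}

static bool is_initialized(void)
{
    return atomic_load(&ctx_features) & FEATURE_INITIALIZED;
}

int main(void)
{
    printf("first handshake:  %s\n", api_handshake(0x3) ? "ok" : "busy");
    printf("second handshake: %s\n", api_handshake(0x1) ? "ok" : "busy");
    printf("initialized: %d\n", is_initialized());
    return 0;
}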
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index e986b95d94c9..6f49bf39183c 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -29,67 +29,3 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
congestion_wait(BLK_RW_ASYNC, HZ/50);
} while (1);
}
-
-
-/*
- * __vmalloc() will allocate data pages and auxiliary structures (e.g.
- * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context here. Hence
- * we need to tell memory reclaim that we are in such a context via
- * PF_MEMALLOC_NOFS to prevent memory reclaim re-entering the filesystem here
- * and potentially deadlocking.
- */
-static void *
-__kmem_vmalloc(size_t size, xfs_km_flags_t flags)
-{
- unsigned nofs_flag = 0;
- void *ptr;
- gfp_t lflags = kmem_flags_convert(flags);
-
- if (flags & KM_NOFS)
- nofs_flag = memalloc_nofs_save();
-
- ptr = __vmalloc(size, lflags);
-
- if (flags & KM_NOFS)
- memalloc_nofs_restore(nofs_flag);
-
- return ptr;
-}
-
-/*
- * Same as kmem_alloc_large, except we guarantee the buffer returned is aligned
- * to the @align_mask. We only guarantee alignment up to page size, we'll clamp
- * alignment at page size if it is larger. vmalloc always returns a PAGE_SIZE
- * aligned region.
- */
-void *
-kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags)
-{
- void *ptr;
-
- trace_kmem_alloc_io(size, flags, _RET_IP_);
-
- if (WARN_ON_ONCE(align_mask >= PAGE_SIZE))
- align_mask = PAGE_SIZE - 1;
-
- ptr = kmem_alloc(size, flags | KM_MAYFAIL);
- if (ptr) {
- if (!((uintptr_t)ptr & align_mask))
- return ptr;
- kfree(ptr);
- }
- return __kmem_vmalloc(size, flags);
-}
-
-void *
-kmem_alloc_large(size_t size, xfs_km_flags_t flags)
-{
- void *ptr;
-
- trace_kmem_alloc_large(size, flags, _RET_IP_);
-
- ptr = kmem_alloc(size, flags | KM_MAYFAIL);
- if (ptr)
- return ptr;
- return __kmem_vmalloc(size, flags);
-}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 38007117697e..54da6d717a06 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -57,8 +57,6 @@ kmem_flags_convert(xfs_km_flags_t flags)
}
extern void *kmem_alloc(size_t, xfs_km_flags_t);
-extern void *kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags);
-extern void *kmem_alloc_large(size_t size, xfs_km_flags_t);
static inline void kmem_free(const void *ptr)
{
kvfree(ptr);
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index ee9ec0c50bec..005abfd9fd34 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -313,7 +313,6 @@ xfs_get_aghdr_buf(
if (error)
return error;
- bp->b_bn = blkno;
bp->b_maps[0].bm_bn = blkno;
bp->b_ops = ops;
@@ -469,7 +468,7 @@ xfs_rmaproot_init(
rrec->rm_offset = 0;
/* account for refc btree root */
- if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ if (xfs_has_reflink(mp)) {
rrec = XFS_RMAP_REC_ADDR(block, 5);
rrec->rm_startblock = cpu_to_be32(xfs_refc_block(mp));
rrec->rm_blockcount = cpu_to_be32(1);
@@ -528,7 +527,7 @@ xfs_agfblock_init(
agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
- if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+ if (xfs_has_rmapbt(mp)) {
agf->agf_roots[XFS_BTNUM_RMAPi] =
cpu_to_be32(XFS_RMAP_BLOCK(mp));
agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1);
@@ -541,9 +540,9 @@ xfs_agfblock_init(
tmpsize = id->agsize - mp->m_ag_prealloc_blocks;
agf->agf_freeblks = cpu_to_be32(tmpsize);
agf->agf_longest = cpu_to_be32(tmpsize);
- if (xfs_sb_version_hascrc(&mp->m_sb))
+ if (xfs_has_crc(mp))
uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
- if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ if (xfs_has_reflink(mp)) {
agf->agf_refcount_root = cpu_to_be32(
xfs_refc_block(mp));
agf->agf_refcount_level = cpu_to_be32(1);
@@ -569,7 +568,7 @@ xfs_agflblock_init(
__be32 *agfl_bno;
int bucket;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
agfl->agfl_seqno = cpu_to_be32(id->agno);
uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
@@ -599,17 +598,17 @@ xfs_agiblock_init(
agi->agi_freecount = 0;
agi->agi_newino = cpu_to_be32(NULLAGINO);
agi->agi_dirino = cpu_to_be32(NULLAGINO);
- if (xfs_sb_version_hascrc(&mp->m_sb))
+ if (xfs_has_crc(mp))
uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
- if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ if (xfs_has_finobt(mp)) {
agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp));
agi->agi_free_level = cpu_to_be32(1);
}
for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
- if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) {
+ if (xfs_has_inobtcounts(mp)) {
agi->agi_iblocks = cpu_to_be32(1);
- if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ if (xfs_has_finobt(mp))
agi->agi_fblocks = cpu_to_be32(1);
}
}
@@ -719,14 +718,14 @@ xfs_ag_init_headers(
.ops = &xfs_finobt_buf_ops,
.work = &xfs_btroot_init,
.type = XFS_BTNUM_FINO,
- .need_init = xfs_sb_version_hasfinobt(&mp->m_sb)
+ .need_init = xfs_has_finobt(mp)
},
{ /* RMAP root block */
.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_RMAP_BLOCK(mp)),
.numblks = BTOBB(mp->m_sb.sb_blocksize),
.ops = &xfs_rmapbt_buf_ops,
.work = &xfs_rmaproot_init,
- .need_init = xfs_sb_version_hasrmapbt(&mp->m_sb)
+ .need_init = xfs_has_rmapbt(mp)
},
{ /* REFC root block */
.daddr = XFS_AGB_TO_DADDR(mp, id->agno, xfs_refc_block(mp)),
@@ -734,7 +733,7 @@ xfs_ag_init_headers(
.ops = &xfs_refcountbt_buf_ops,
.work = &xfs_btroot_init,
.type = XFS_BTNUM_REFC,
- .need_init = xfs_sb_version_hasreflink(&mp->m_sb)
+ .need_init = xfs_has_reflink(mp)
},
{ /* NULL terminating block */
.daddr = XFS_BUF_DADDR_NULL,
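The xfs_sb_version_has*() to xfs_has_*() conversions above (and throughout the rest of this series) follow one pattern: instead of decoding superblock version bits on every call, feature state is cached in a mount-wide flag word once at mount time and tested with trivial inline helpers. A generic sketch of that pattern (the names and bit layout are illustrative, not the XFS ones):

#include <stdbool.h>
#include <stdio.h>

#define FEAT_CRC     (1u << 0)
#define FEAT_REFLINK (1u << 1)

struct mount {
    unsigned int features;     /* populated once when the fs is mounted */
};

/* Cheap predicates: one load and a mask, no on-disk format decoding. */
static inline bool has_crc(const struct mount *m)     { return m->features & FEAT_CRC; }
static inline bool has_reflink(const struct mount *m) { return m->features & FEAT_REFLINK; }

/* Mount time: translate the on-disk format into cached feature bits once. */
static void cache_features(struct mount *m, bool v5, bool reflink)
{
    m->features = 0;
    if (v5)
        m->features |= FEAT_CRC;
    if (reflink)
        m->features |= FEAT_REFLINK;
}

int main(void)
{
    struct mount m;

    cache_features(&m, true, false);
    printf("crc=%d reflink=%d\n", has_crc(&m), has_reflink(&m));
    return 0;
}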
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 6929157d8d6e..95157f5a5a6c 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -51,7 +51,7 @@ xfs_agfl_size(
{
unsigned int size = mp->m_sb.sb_sectsize;
- if (xfs_sb_version_hascrc(&mp->m_sb))
+ if (xfs_has_crc(mp))
size -= sizeof(struct xfs_agfl);
return size / sizeof(xfs_agblock_t);
@@ -61,9 +61,9 @@ unsigned int
xfs_refc_block(
struct xfs_mount *mp)
{
- if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (xfs_has_rmapbt(mp))
return XFS_RMAP_BLOCK(mp) + 1;
- if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ if (xfs_has_finobt(mp))
return XFS_FIBT_BLOCK(mp) + 1;
return XFS_IBT_BLOCK(mp) + 1;
}
@@ -72,11 +72,11 @@ xfs_extlen_t
xfs_prealloc_blocks(
struct xfs_mount *mp)
{
- if (xfs_sb_version_hasreflink(&mp->m_sb))
+ if (xfs_has_reflink(mp))
return xfs_refc_block(mp) + 1;
- if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (xfs_has_rmapbt(mp))
return XFS_RMAP_BLOCK(mp) + 1;
- if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ if (xfs_has_finobt(mp))
return XFS_FIBT_BLOCK(mp) + 1;
return XFS_IBT_BLOCK(mp) + 1;
}
@@ -126,11 +126,11 @@ xfs_alloc_ag_max_usable(
blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
blocks += XFS_ALLOC_AGFL_RESERVE;
blocks += 3; /* AGF, AGI btree root blocks */
- if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ if (xfs_has_finobt(mp))
blocks++; /* finobt root block */
- if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (xfs_has_rmapbt(mp))
blocks++; /* rmap root block */
- if (xfs_sb_version_hasreflink(&mp->m_sb))
+ if (xfs_has_reflink(mp))
blocks++; /* refcount root block */
return mp->m_sb.sb_agblocks - blocks;
@@ -598,7 +598,7 @@ xfs_agfl_verify(
* AGFL is what the AGF says is active. We can't get to the AGF, so we
* can't verify just those entries are valid.
*/
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return NULL;
if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
@@ -638,7 +638,7 @@ xfs_agfl_read_verify(
* AGFL is what the AGF says is active. We can't get to the AGF, so we
* can't verify just those entries are valid.
*/
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
@@ -659,7 +659,7 @@ xfs_agfl_write_verify(
xfs_failaddr_t fa;
/* no verification of non-crc AGFLs */
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
fa = xfs_agfl_verify(bp);
@@ -2264,7 +2264,7 @@ xfs_alloc_min_freelist(
min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
mp->m_ag_maxlevels);
/* space needed reverse mapping used space btree */
- if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (xfs_has_rmapbt(mp))
min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
mp->m_rmap_maxlevels);
@@ -2373,7 +2373,7 @@ xfs_agfl_needs_reset(
int active;
/* no agfl header on v4 supers */
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return false;
/*
@@ -2877,7 +2877,7 @@ xfs_agf_verify(
struct xfs_mount *mp = bp->b_mount;
struct xfs_agf *agf = bp->b_addr;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn)))
@@ -2907,12 +2907,12 @@ xfs_agf_verify(
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > mp->m_ag_maxlevels)
return __this_address;
- if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
+ if (xfs_has_rmapbt(mp) &&
(be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > mp->m_rmap_maxlevels))
return __this_address;
- if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
+ if (xfs_has_rmapbt(mp) &&
be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length))
return __this_address;
@@ -2925,16 +2925,16 @@ xfs_agf_verify(
if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
return __this_address;
- if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
+ if (xfs_has_lazysbcount(mp) &&
be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
return __this_address;
- if (xfs_sb_version_hasreflink(&mp->m_sb) &&
+ if (xfs_has_reflink(mp) &&
be32_to_cpu(agf->agf_refcount_blocks) >
be32_to_cpu(agf->agf_length))
return __this_address;
- if (xfs_sb_version_hasreflink(&mp->m_sb) &&
+ if (xfs_has_reflink(mp) &&
(be32_to_cpu(agf->agf_refcount_level) < 1 ||
be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels))
return __this_address;
@@ -2950,7 +2950,7 @@ xfs_agf_read_verify(
struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ if (xfs_has_crc(mp) &&
!xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
@@ -2975,7 +2975,7 @@ xfs_agf_write_verify(
return;
}
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
if (bip)
@@ -3073,13 +3073,13 @@ xfs_alloc_read_agf(
* counter only tracks non-root blocks.
*/
allocbt_blks = pag->pagf_btreeblks;
- if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (xfs_has_rmapbt(mp))
allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1;
if (allocbt_blks > 0)
atomic64_add(allocbt_blks, &mp->m_allocbt_blks);
}
#ifdef DEBUG
- else if (!XFS_FORCED_SHUTDOWN(mp)) {
+ else if (!xfs_is_shutdown(mp)) {
ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
@@ -3166,7 +3166,7 @@ xfs_alloc_vextent(
* the first a.g. fails.
*/
if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
- (mp->m_flags & XFS_MOUNT_32BITINODES)) {
+ xfs_is_inode32(mp)) {
args->fsbno = XFS_AGB_TO_FSB(mp,
((mp->m_agfrotor / rotorstep) %
mp->m_sb.sb_agcount), 0);
@@ -3392,7 +3392,7 @@ struct xfs_alloc_query_range_info {
STATIC int
xfs_alloc_query_range_helper(
struct xfs_btree_cur *cur,
- union xfs_btree_rec *rec,
+ const union xfs_btree_rec *rec,
void *priv)
{
struct xfs_alloc_query_range_info *query = priv;
@@ -3407,8 +3407,8 @@ xfs_alloc_query_range_helper(
int
xfs_alloc_query_range(
struct xfs_btree_cur *cur,
- struct xfs_alloc_rec_incore *low_rec,
- struct xfs_alloc_rec_incore *high_rec,
+ const struct xfs_alloc_rec_incore *low_rec,
+ const struct xfs_alloc_rec_incore *high_rec,
xfs_alloc_query_range_fn fn,
void *priv)
{
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index e30900b6f8ba..df4aefaf0046 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -220,13 +220,13 @@ int xfs_free_extent_fix_freelist(struct xfs_trans *tp, struct xfs_perag *pag,
xfs_extlen_t xfs_prealloc_blocks(struct xfs_mount *mp);
typedef int (*xfs_alloc_query_range_fn)(
- struct xfs_btree_cur *cur,
- struct xfs_alloc_rec_incore *rec,
- void *priv);
+ struct xfs_btree_cur *cur,
+ const struct xfs_alloc_rec_incore *rec,
+ void *priv);
int xfs_alloc_query_range(struct xfs_btree_cur *cur,
- struct xfs_alloc_rec_incore *low_rec,
- struct xfs_alloc_rec_incore *high_rec,
+ const struct xfs_alloc_rec_incore *low_rec,
+ const struct xfs_alloc_rec_incore *high_rec,
xfs_alloc_query_range_fn fn, void *priv);
int xfs_alloc_query_all(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn,
void *priv);
@@ -243,7 +243,7 @@ static inline __be32 *
xfs_buf_to_agfl_bno(
struct xfs_buf *bp)
{
- if (xfs_sb_version_hascrc(&bp->b_mount->m_sb))
+ if (xfs_has_crc(bp->b_mount))
return bp->b_addr + sizeof(struct xfs_agfl);
return bp->b_addr;
}
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 6b363f78cfa2..6746fd735550 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -31,9 +31,9 @@ xfs_allocbt_dup_cursor(
STATIC void
xfs_allocbt_set_root(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *ptr,
- int inc)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int inc)
{
struct xfs_buf *agbp = cur->bc_ag.agbp;
struct xfs_agf *agf = agbp->b_addr;
@@ -50,10 +50,10 @@ xfs_allocbt_set_root(
STATIC int
xfs_allocbt_alloc_block(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *start,
- union xfs_btree_ptr *new,
- int *stat)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
{
int error;
xfs_agblock_t bno;
@@ -87,7 +87,7 @@ xfs_allocbt_free_block(
xfs_agblock_t bno;
int error;
- bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
+ bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
if (error)
return error;
@@ -103,11 +103,11 @@ xfs_allocbt_free_block(
*/
STATIC void
xfs_allocbt_update_lastrec(
- struct xfs_btree_cur *cur,
- struct xfs_btree_block *block,
- union xfs_btree_rec *rec,
- int ptr,
- int reason)
+ struct xfs_btree_cur *cur,
+ const struct xfs_btree_block *block,
+ const union xfs_btree_rec *rec,
+ int ptr,
+ int reason)
{
struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
struct xfs_perag *pag;
@@ -177,8 +177,8 @@ xfs_allocbt_get_maxrecs(
STATIC void
xfs_allocbt_init_key_from_rec(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
{
key->alloc.ar_startblock = rec->alloc.ar_startblock;
key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
@@ -186,10 +186,10 @@ xfs_allocbt_init_key_from_rec(
STATIC void
xfs_bnobt_init_high_key_from_rec(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
{
- __u32 x;
+ __u32 x;
x = be32_to_cpu(rec->alloc.ar_startblock);
x += be32_to_cpu(rec->alloc.ar_blockcount) - 1;
@@ -199,8 +199,8 @@ xfs_bnobt_init_high_key_from_rec(
STATIC void
xfs_cntbt_init_high_key_from_rec(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
{
key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
key->alloc.ar_startblock = 0;
@@ -229,23 +229,23 @@ xfs_allocbt_init_ptr_from_cur(
STATIC int64_t
xfs_bnobt_key_diff(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *key)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
{
- xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a;
- xfs_alloc_key_t *kp = &key->alloc;
+ struct xfs_alloc_rec_incore *rec = &cur->bc_rec.a;
+ const struct xfs_alloc_rec *kp = &key->alloc;
return (int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
}
STATIC int64_t
xfs_cntbt_key_diff(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *key)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
{
- xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a;
- xfs_alloc_key_t *kp = &key->alloc;
- int64_t diff;
+ struct xfs_alloc_rec_incore *rec = &cur->bc_rec.a;
+ const struct xfs_alloc_rec *kp = &key->alloc;
+ int64_t diff;
diff = (int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
if (diff)
@@ -256,9 +256,9 @@ xfs_cntbt_key_diff(
STATIC int64_t
xfs_bnobt_diff_two_keys(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *k1,
- union xfs_btree_key *k2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
{
return (int64_t)be32_to_cpu(k1->alloc.ar_startblock) -
be32_to_cpu(k2->alloc.ar_startblock);
@@ -266,11 +266,11 @@ xfs_bnobt_diff_two_keys(
STATIC int64_t
xfs_cntbt_diff_two_keys(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *k1,
- union xfs_btree_key *k2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
{
- int64_t diff;
+ int64_t diff;
diff = be32_to_cpu(k1->alloc.ar_blockcount) -
be32_to_cpu(k2->alloc.ar_blockcount);
@@ -295,7 +295,7 @@ xfs_allocbt_verify(
if (!xfs_verify_magic(bp, block->bb_magic))
return __this_address;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
fa = xfs_btree_sblock_v5hdr_verify(bp);
if (fa)
return fa;
@@ -376,9 +376,9 @@ const struct xfs_buf_ops xfs_cntbt_buf_ops = {
STATIC int
xfs_bnobt_keys_inorder(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *k1,
- union xfs_btree_key *k2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
{
return be32_to_cpu(k1->alloc.ar_startblock) <
be32_to_cpu(k2->alloc.ar_startblock);
@@ -386,9 +386,9 @@ xfs_bnobt_keys_inorder(
STATIC int
xfs_bnobt_recs_inorder(
- struct xfs_btree_cur *cur,
- union xfs_btree_rec *r1,
- union xfs_btree_rec *r2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
{
return be32_to_cpu(r1->alloc.ar_startblock) +
be32_to_cpu(r1->alloc.ar_blockcount) <=
@@ -397,9 +397,9 @@ xfs_bnobt_recs_inorder(
STATIC int
xfs_cntbt_keys_inorder(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *k1,
- union xfs_btree_key *k2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
{
return be32_to_cpu(k1->alloc.ar_blockcount) <
be32_to_cpu(k2->alloc.ar_blockcount) ||
@@ -410,9 +410,9 @@ xfs_cntbt_keys_inorder(
STATIC int
xfs_cntbt_recs_inorder(
- struct xfs_btree_cur *cur,
- union xfs_btree_rec *r1,
- union xfs_btree_rec *r2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
{
return be32_to_cpu(r1->alloc.ar_blockcount) <
be32_to_cpu(r2->alloc.ar_blockcount) ||
@@ -498,7 +498,7 @@ xfs_allocbt_init_common(
atomic_inc(&pag->pag_ref);
cur->bc_ag.pag = pag;
- if (xfs_sb_version_hascrc(&mp->m_sb))
+ if (xfs_has_crc(mp))
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
return cur;
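The signature churn in this file constifies the btree callback arguments: key and record pointers that the comparison and init helpers only read are now passed as const, so the shared btree code can hand out pointers into read-only buffers without casting. A small sketch of a const-correct callback table (generic types, not the XFS unions):

#include <stdio.h>

struct rec { int start; int count; };
struct key { int start; };

/* Callbacks receive const pointers: they may inspect but never modify. */
struct btree_ops {
    void (*init_key_from_rec)(struct key *key, const struct rec *rec);
    long (*diff_two_keys)(const struct key *k1, const struct key *k2);
};

static void init_key(struct key *key, const struct rec *rec)
{
    key->start = rec->start;
}

static long diff_keys(const struct key *k1, const struct key *k2)
{
    return (long)k1->start - k2->start;
}

static const struct btree_ops ops = {
    .init_key_from_rec = init_key,
    .diff_two_keys     = diff_keys,
};

int main(void)
{
    const struct rec r = { .start = 42, .count = 8 };
    struct key k, probe = { .start = 40 };

    ops.init_key_from_rec(&k, &r);
    printf("diff = %ld\n", ops.diff_two_keys(&probe, &k));
    return 0;
}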
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
index 9eb4c667a6b8..2f6b816aaf9f 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.h
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
@@ -20,7 +20,7 @@ struct xbtree_afakeroot;
* Btree block header size depends on a superblock flag.
*/
#define XFS_ALLOC_BLOCK_LEN(mp) \
- (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+ (xfs_has_crc(((mp))) ? \
XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN)
/*
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 191d51725988..fbc9d816882c 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -146,7 +146,7 @@ xfs_attr_get(
XFS_STATS_INC(args->dp->i_mount, xs_attr_get);
- if (XFS_FORCED_SHUTDOWN(args->dp->i_mount))
+ if (xfs_is_shutdown(args->dp->i_mount))
return -EIO;
args->geo = args->dp->i_mount->m_attr_geo;
@@ -224,7 +224,7 @@ xfs_attr_try_sf_addname(
if (!error && !(args->op_flags & XFS_DA_OP_NOTIME))
xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
- if (dp->i_mount->m_flags & XFS_MOUNT_WSYNC)
+ if (xfs_has_wsync(dp->i_mount))
xfs_trans_set_sync(args->trans);
return error;
@@ -335,6 +335,7 @@ xfs_attr_sf_addname(
* the attr fork to leaf format and will restart with the leaf
* add.
*/
+ trace_xfs_attr_sf_addname_return(XFS_DAS_UNINIT, args->dp);
dac->flags |= XFS_DAC_DEFER_FINISH;
return -EAGAIN;
}
@@ -394,6 +395,8 @@ xfs_attr_set_iter(
* handling code below
*/
dac->flags |= XFS_DAC_DEFER_FINISH;
+ trace_xfs_attr_set_iter_return(
+ dac->dela_state, args->dp);
return -EAGAIN;
} else if (error) {
return error;
@@ -411,6 +414,7 @@ xfs_attr_set_iter(
dac->dela_state = XFS_DAS_FOUND_NBLK;
}
+ trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
return -EAGAIN;
case XFS_DAS_FOUND_LBLK:
/*
@@ -438,6 +442,8 @@ xfs_attr_set_iter(
error = xfs_attr_rmtval_set_blk(dac);
if (error)
return error;
+ trace_xfs_attr_set_iter_return(dac->dela_state,
+ args->dp);
return -EAGAIN;
}
@@ -472,6 +478,7 @@ xfs_attr_set_iter(
* series.
*/
dac->dela_state = XFS_DAS_FLIP_LFLAG;
+ trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
return -EAGAIN;
case XFS_DAS_FLIP_LFLAG:
/*
@@ -488,11 +495,15 @@ xfs_attr_set_iter(
/* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */
dac->dela_state = XFS_DAS_RM_LBLK;
if (args->rmtblkno) {
- error = __xfs_attr_rmtval_remove(dac);
+ error = xfs_attr_rmtval_remove(dac);
+ if (error == -EAGAIN)
+ trace_xfs_attr_set_iter_return(
+ dac->dela_state, args->dp);
if (error)
return error;
dac->dela_state = XFS_DAS_RD_LEAF;
+ trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
return -EAGAIN;
}
@@ -542,6 +553,8 @@ xfs_attr_set_iter(
error = xfs_attr_rmtval_set_blk(dac);
if (error)
return error;
+ trace_xfs_attr_set_iter_return(
+ dac->dela_state, args->dp);
return -EAGAIN;
}
@@ -577,6 +590,7 @@ xfs_attr_set_iter(
* series
*/
dac->dela_state = XFS_DAS_FLIP_NFLAG;
+ trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
return -EAGAIN;
case XFS_DAS_FLIP_NFLAG:
@@ -595,11 +609,16 @@ xfs_attr_set_iter(
/* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */
dac->dela_state = XFS_DAS_RM_NBLK;
if (args->rmtblkno) {
- error = __xfs_attr_rmtval_remove(dac);
+ error = xfs_attr_rmtval_remove(dac);
+ if (error == -EAGAIN)
+ trace_xfs_attr_set_iter_return(
+ dac->dela_state, args->dp);
+
if (error)
return error;
dac->dela_state = XFS_DAS_CLR_FLAG;
+ trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
return -EAGAIN;
}
@@ -623,8 +642,8 @@ out:
/*
* Return EEXIST if attr is found, or ENOATTR if not
*/
-int
-xfs_has_attr(
+static int
+xfs_attr_lookup(
struct xfs_da_args *args)
{
struct xfs_inode *dp = args->dp;
@@ -691,7 +710,7 @@ xfs_attr_set(
int rmt_blks = 0;
unsigned int total;
- if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+ if (xfs_is_shutdown(dp->i_mount))
return -EIO;
error = xfs_qm_dqattach(dp);
@@ -761,8 +780,8 @@ xfs_attr_set(
goto out_trans_cancel;
}
+ error = xfs_attr_lookup(args);
if (args->value) {
- error = xfs_has_attr(args);
if (error == -EEXIST && (args->attr_flags & XATTR_CREATE))
goto out_trans_cancel;
if (error == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
@@ -777,7 +796,6 @@ xfs_attr_set(
if (!args->trans)
goto out_unlock;
} else {
- error = xfs_has_attr(args);
if (error != -EEXIST)
goto out_trans_cancel;
@@ -790,7 +808,7 @@ xfs_attr_set(
* If this is a synchronous mount, make sure that the
* transaction goes to disk before returning to the user.
*/
- if (mp->m_flags & XFS_MOUNT_WSYNC)
+ if (xfs_has_wsync(mp))
xfs_trans_set_sync(args->trans);
if (!(args->op_flags & XFS_DA_OP_NOTIME))
@@ -1176,6 +1194,8 @@ xfs_attr_node_addname(
* this point.
*/
dac->flags |= XFS_DAC_DEFER_FINISH;
+ trace_xfs_attr_node_addname_return(
+ dac->dela_state, args->dp);
return -EAGAIN;
}
@@ -1421,11 +1441,14 @@ xfs_attr_remove_iter(
* May return -EAGAIN. Roll and repeat until all remote
* blocks are removed.
*/
- error = __xfs_attr_rmtval_remove(dac);
- if (error == -EAGAIN)
+ error = xfs_attr_rmtval_remove(dac);
+ if (error == -EAGAIN) {
+ trace_xfs_attr_remove_iter_return(
+ dac->dela_state, args->dp);
return error;
- else if (error)
+ } else if (error) {
goto out;
+ }
/*
* Refill the state structure with buffers (the prior
@@ -1438,6 +1461,7 @@ xfs_attr_remove_iter(
goto out;
dac->dela_state = XFS_DAS_RM_NAME;
dac->flags |= XFS_DAC_DEFER_FINISH;
+ trace_xfs_attr_remove_iter_return(dac->dela_state, args->dp);
return -EAGAIN;
}
@@ -1466,6 +1490,8 @@ xfs_attr_remove_iter(
dac->flags |= XFS_DAC_DEFER_FINISH;
dac->dela_state = XFS_DAS_RM_SHRINK;
+ trace_xfs_attr_remove_iter_return(
+ dac->dela_state, args->dp);
return -EAGAIN;
}
@@ -1514,7 +1540,7 @@ xfs_attr_fillstate(xfs_da_state_t *state)
ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
if (blk->bp) {
- blk->disk_blkno = XFS_BUF_ADDR(blk->bp);
+ blk->disk_blkno = xfs_buf_daddr(blk->bp);
blk->bp = NULL;
} else {
blk->disk_blkno = 0;
@@ -1529,7 +1555,7 @@ xfs_attr_fillstate(xfs_da_state_t *state)
ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
if (blk->bp) {
- blk->disk_blkno = XFS_BUF_ADDR(blk->bp);
+ blk->disk_blkno = xfs_buf_daddr(blk->bp);
blk->bp = NULL;
} else {
blk->disk_blkno = 0;
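The tracepoints added throughout xfs_attr_set_iter() and xfs_attr_remove_iter() above fire just before each -EAGAIN return, recording which dela_state the deferred attr state machine will resume in. A stripped-down sketch of logging a state machine's exit state each time it yields (the enum values and the log call are placeholders, not the XFS tracepoints):

#include <stdio.h>

enum das_state { DAS_UNINIT, DAS_FOUND_BLK, DAS_FLIP_FLAG, DAS_DONE };

static const char *state_name[] = { "UNINIT", "FOUND_BLK", "FLIP_FLAG", "DONE" };

#define EAGAIN 11

/* Trace the state we will resume in, then yield back to the caller. */
static int yield(enum das_state next)
{
    printf("trace: resume in state %s\n", state_name[next]);
    return -EAGAIN;
}

/* One step of a deferred operation; returns 0 when finished. */
static int step(enum das_state *state)
{
    switch (*state) {
    case DAS_UNINIT:
        *state = DAS_FOUND_BLK;
        return yield(*state);
    case DAS_FOUND_BLK:
        *state = DAS_FLIP_FLAG;
        return yield(*state);
    case DAS_FLIP_FLAG:
        *state = DAS_DONE;
        return 0;
    default:
        return 0;
    }
}

int main(void)
{
    enum das_state state = DAS_UNINIT;

    while (step(&state) == -EAGAIN)
        ;           /* caller rolls the transaction and retries the step */
    return 0;
}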
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index 8de5d1d2733e..5e71f719bdd5 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -490,7 +490,6 @@ int xfs_attr_get_ilocked(struct xfs_da_args *args);
int xfs_attr_get(struct xfs_da_args *args);
int xfs_attr_set(struct xfs_da_args *args);
int xfs_attr_set_args(struct xfs_da_args *args);
-int xfs_has_attr(struct xfs_da_args *args);
int xfs_attr_remove_args(struct xfs_da_args *args);
int xfs_attr_remove_iter(struct xfs_delattr_context *dac);
bool xfs_attr_namecheck(const void *name, size_t length);
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index b910bd209949..e1d11e314228 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -384,7 +384,7 @@ xfs_attr3_leaf_write_verify(
return;
}
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
if (bip)
@@ -406,7 +406,7 @@ xfs_attr3_leaf_read_verify(
struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ if (xfs_has_crc(mp) &&
!xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
@@ -489,7 +489,7 @@ xfs_attr_copy_value(
}
if (!args->value) {
- args->value = kmem_alloc_large(valuelen, KM_NOLOCKDEP);
+ args->value = kvmalloc(valuelen, GFP_KERNEL | __GFP_NOLOCKDEP);
if (!args->value)
return -ENOMEM;
}
@@ -568,7 +568,7 @@ xfs_attr_shortform_bytesfit(
* literal area, but for the old format we are done if there is no
* space in the fixed attribute fork.
*/
- if (!(mp->m_flags & XFS_MOUNT_ATTR2))
+ if (!xfs_has_attr2(mp))
return 0;
dsize = dp->i_df.if_bytes;
@@ -576,7 +576,7 @@ xfs_attr_shortform_bytesfit(
switch (dp->i_df.if_format) {
case XFS_DINODE_FMT_EXTENTS:
/*
- * If there is no attr fork and the data fork is extents,
+ * If there is no attr fork and the data fork is extents,
* determine if creating the default attr fork will result
* in the extents form migrating to btree. If so, the
* minimum offset only needs to be the space required for
@@ -621,21 +621,27 @@ xfs_attr_shortform_bytesfit(
}
/*
- * Switch on the ATTR2 superblock bit (implies also FEATURES2)
+ * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless:
+ * - noattr2 mount option is set,
+ * - on-disk version bit says it is already set, or
+ * - the attr2 mount option is not set to enable automatic upgrade from attr1.
*/
STATIC void
-xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp)
+xfs_sbversion_add_attr2(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp)
{
- if ((mp->m_flags & XFS_MOUNT_ATTR2) &&
- !(xfs_sb_version_hasattr2(&mp->m_sb))) {
- spin_lock(&mp->m_sb_lock);
- if (!xfs_sb_version_hasattr2(&mp->m_sb)) {
- xfs_sb_version_addattr2(&mp->m_sb);
- spin_unlock(&mp->m_sb_lock);
- xfs_log_sb(tp);
- } else
- spin_unlock(&mp->m_sb_lock);
- }
+ if (xfs_has_noattr2(mp))
+ return;
+ if (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)
+ return;
+ if (!xfs_has_attr2(mp))
+ return;
+
+ spin_lock(&mp->m_sb_lock);
+ xfs_add_attr2(mp);
+ spin_unlock(&mp->m_sb_lock);
+ xfs_log_sb(tp);
}
/*
@@ -810,8 +816,7 @@ xfs_attr_sf_removename(
* Fix up the start offset of the attribute fork
*/
totsize -= size;
- if (totsize == sizeof(xfs_attr_sf_hdr_t) &&
- (mp->m_flags & XFS_MOUNT_ATTR2) &&
+ if (totsize == sizeof(xfs_attr_sf_hdr_t) && xfs_has_attr2(mp) &&
(dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
!(args->op_flags & XFS_DA_OP_ADDNAME)) {
xfs_attr_fork_remove(dp, args->trans);
@@ -821,7 +826,7 @@ xfs_attr_sf_removename(
ASSERT(dp->i_forkoff);
ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) ||
(args->op_flags & XFS_DA_OP_ADDNAME) ||
- !(mp->m_flags & XFS_MOUNT_ATTR2) ||
+ !xfs_has_attr2(mp) ||
dp->i_df.if_format == XFS_DINODE_FMT_BTREE);
xfs_trans_log_inode(args->trans, dp,
XFS_ILOG_CORE | XFS_ILOG_ADATA);
@@ -997,7 +1002,7 @@ xfs_attr_shortform_allfit(
bytes += xfs_attr_sf_entsize_byname(name_loc->namelen,
be16_to_cpu(name_loc->valuelen));
}
- if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) &&
+ if (xfs_has_attr2(dp->i_mount) &&
(dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
(bytes == sizeof(struct xfs_attr_sf_hdr)))
return -1;
@@ -1122,7 +1127,7 @@ xfs_attr3_leaf_to_shortform(
goto out;
if (forkoff == -1) {
- ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2);
+ ASSERT(xfs_has_attr2(dp->i_mount));
ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE);
xfs_attr_fork_remove(dp, args->trans);
goto out;
@@ -1199,9 +1204,9 @@ xfs_attr3_leaf_to_node(
xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF);
bp2->b_ops = bp1->b_ops;
memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_da3_blkinfo *hdr3 = bp2->b_addr;
- hdr3->blkno = cpu_to_be64(bp2->b_bn);
+ hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp2));
}
xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1);
@@ -1264,12 +1269,12 @@ xfs_attr3_leaf_create(
memset(&ichdr, 0, sizeof(ichdr));
ichdr.firstused = args->geo->blksize;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_da3_blkinfo *hdr3 = bp->b_addr;
ichdr.magic = XFS_ATTR3_LEAF_MAGIC;
- hdr3->blkno = cpu_to_be64(bp->b_bn);
+ hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp));
hdr3->owner = cpu_to_be64(dp->i_ino);
uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 0c8bee3abc3b..83b95be9ded8 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -51,7 +51,7 @@ xfs_attr3_rmt_blocks(
struct xfs_mount *mp,
int attrlen)
{
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
return (attrlen + buflen - 1) / buflen;
}
@@ -126,11 +126,11 @@ __xfs_attr3_rmt_read_verify(
int blksize = mp->m_attr_geo->blksize;
/* no verification of non-crc buffers */
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return 0;
ptr = bp->b_addr;
- bno = bp->b_bn;
+ bno = xfs_buf_daddr(bp);
len = BBTOB(bp->b_length);
ASSERT(len >= blksize);
@@ -191,11 +191,11 @@ xfs_attr3_rmt_write_verify(
xfs_daddr_t bno;
/* no verification of non-crc buffers */
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
ptr = bp->b_addr;
- bno = bp->b_bn;
+ bno = xfs_buf_daddr(bp);
len = BBTOB(bp->b_length);
ASSERT(len >= blksize);
@@ -246,7 +246,7 @@ xfs_attr3_rmt_hdr_set(
{
struct xfs_attr3_rmt_hdr *rmt = ptr;
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return 0;
rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC);
@@ -284,7 +284,7 @@ xfs_attr_rmtval_copyout(
uint8_t **dst)
{
char *src = bp->b_addr;
- xfs_daddr_t bno = bp->b_bn;
+ xfs_daddr_t bno = xfs_buf_daddr(bp);
int len = BBTOB(bp->b_length);
int blksize = mp->m_attr_geo->blksize;
@@ -296,7 +296,7 @@ xfs_attr_rmtval_copyout(
byte_cnt = min(*valuelen, byte_cnt);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
if (xfs_attr3_rmt_hdr_ok(src, ino, *offset,
byte_cnt, bno)) {
xfs_alert(mp,
@@ -332,7 +332,7 @@ xfs_attr_rmtval_copyin(
uint8_t **src)
{
char *dst = bp->b_addr;
- xfs_daddr_t bno = bp->b_bn;
+ xfs_daddr_t bno = xfs_buf_daddr(bp);
int len = BBTOB(bp->b_length);
int blksize = mp->m_attr_geo->blksize;
@@ -672,7 +672,7 @@ xfs_attr_rmtval_invalidate(
* routine until it returns something other than -EAGAIN.
*/
int
-__xfs_attr_rmtval_remove(
+xfs_attr_rmtval_remove(
struct xfs_delattr_context *dac)
{
struct xfs_da_args *args = dac->da_args;
@@ -696,6 +696,7 @@ __xfs_attr_rmtval_remove(
*/
if (!done) {
dac->flags |= XFS_DAC_DEFER_FINISH;
+ trace_xfs_attr_rmtval_remove_return(dac->dela_state, args->dp);
return -EAGAIN;
}
diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h
index 61b85b918db8..d72eff30ca18 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.h
+++ b/fs/xfs/libxfs/xfs_attr_remote.h
@@ -12,7 +12,7 @@ int xfs_attr_rmtval_get(struct xfs_da_args *args);
int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map,
xfs_buf_flags_t incore_flags);
int xfs_attr_rmtval_invalidate(struct xfs_da_args *args);
-int __xfs_attr_rmtval_remove(struct xfs_delattr_context *dac);
+int xfs_attr_rmtval_remove(struct xfs_delattr_context *dac);
int xfs_attr_rmt_find_hole(struct xfs_da_args *args);
int xfs_attr_rmtval_set_value(struct xfs_da_args *args);
int xfs_attr_rmtval_set_blk(struct xfs_delattr_context *dac);
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 948092babb6a..b48230f1a361 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -242,7 +242,7 @@ xfs_bmap_get_bp(
for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
if (!cur->bc_bufs[i])
break;
- if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno)
+ if (xfs_buf_daddr(cur->bc_bufs[i]) == bno)
return cur->bc_bufs[i];
}
@@ -251,7 +251,7 @@ xfs_bmap_get_bp(
struct xfs_buf_log_item *bip = (struct xfs_buf_log_item *)lip;
if (bip->bli_item.li_type == XFS_LI_BUF &&
- XFS_BUF_ADDR(bip->bli_buf) == bno)
+ xfs_buf_daddr(bip->bli_buf) == bno)
return bip->bli_buf;
}
@@ -739,7 +739,7 @@ xfs_bmap_extents_to_btree(
*/
abp->b_ops = &xfs_bmbt_buf_ops;
ablock = XFS_BUF_TO_BLOCK(abp);
- xfs_btree_init_block_int(mp, ablock, abp->b_bn,
+ xfs_btree_init_block_int(mp, ablock, xfs_buf_daddr(abp),
XFS_BTNUM_BMAP, 0, 0, ip->i_ino,
XFS_BTREE_LONG_PTRS);
@@ -1047,7 +1047,7 @@ xfs_bmap_set_attrforkoff(
ip->i_forkoff = xfs_attr_shortform_bytesfit(ip, size);
if (!ip->i_forkoff)
ip->i_forkoff = default_size;
- else if ((ip->i_mount->m_flags & XFS_MOUNT_ATTR2) && version)
+ else if (xfs_has_attr2(ip->i_mount) && version)
*version = 2;
break;
default:
@@ -1115,17 +1115,17 @@ xfs_bmap_add_attrfork(
xfs_trans_log_inode(tp, ip, logflags);
if (error)
goto trans_cancel;
- if (!xfs_sb_version_hasattr(&mp->m_sb) ||
- (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
+ if (!xfs_has_attr(mp) ||
+ (!xfs_has_attr2(mp) && version == 2)) {
bool log_sb = false;
spin_lock(&mp->m_sb_lock);
- if (!xfs_sb_version_hasattr(&mp->m_sb)) {
- xfs_sb_version_addattr(&mp->m_sb);
+ if (!xfs_has_attr(mp)) {
+ xfs_add_attr(mp);
log_sb = true;
}
- if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) {
- xfs_sb_version_addattr2(&mp->m_sb);
+ if (!xfs_has_attr2(mp) && version == 2) {
+ xfs_add_attr2(mp);
log_sb = true;
}
spin_unlock(&mp->m_sb_lock);
@@ -3422,7 +3422,7 @@ xfs_bmap_compute_alignments(
int stripe_align = 0;
/* stripe alignment for allocation is determined by mount parameters */
- if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
+ if (mp->m_swidth && xfs_has_swalloc(mp))
stripe_align = mp->m_swidth;
else if (mp->m_dalign)
stripe_align = mp->m_dalign;
@@ -3938,7 +3938,7 @@ xfs_bmapi_read(
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT))
return -EFSCORRUPTED;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
XFS_STATS_INC(mp, xs_blk_mapr);
@@ -4420,7 +4420,7 @@ xfs_bmapi_write(
return -EFSCORRUPTED;
}
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
XFS_STATS_INC(mp, xs_blk_mapw);
@@ -4703,7 +4703,7 @@ xfs_bmapi_remap(
return -EFSCORRUPTED;
}
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
error = xfs_iread_extents(tp, ip, whichfork);
@@ -5361,7 +5361,7 @@ __xfs_bunmapi(
ifp = XFS_IFORK_PTR(ip, whichfork);
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)))
return -EFSCORRUPTED;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -5852,7 +5852,7 @@ xfs_bmap_collapse_extents(
return -EFSCORRUPTED;
}
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL));
@@ -5930,7 +5930,7 @@ xfs_bmap_can_insert_extents(
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ if (xfs_is_shutdown(ip->i_mount))
return -EIO;
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -5967,7 +5967,7 @@ xfs_bmap_insert_extents(
return -EFSCORRUPTED;
}
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL));
@@ -6070,7 +6070,7 @@ xfs_bmap_split_extent(
return -EFSCORRUPTED;
}
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
/* Read in all the extents */
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 1ceba020940e..72444b8b38a6 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -58,7 +58,7 @@ xfs_bmdr_to_bmbt(
void
xfs_bmbt_disk_get_all(
- struct xfs_bmbt_rec *rec,
+ const struct xfs_bmbt_rec *rec,
struct xfs_bmbt_irec *irec)
{
uint64_t l0 = get_unaligned_be64(&rec->l0);
@@ -78,7 +78,7 @@ xfs_bmbt_disk_get_all(
*/
xfs_filblks_t
xfs_bmbt_disk_get_blockcount(
- xfs_bmbt_rec_t *r)
+ const struct xfs_bmbt_rec *r)
{
return (xfs_filblks_t)(be64_to_cpu(r->l1) & xfs_mask64lo(21));
}
@@ -88,7 +88,7 @@ xfs_bmbt_disk_get_blockcount(
*/
xfs_fileoff_t
xfs_bmbt_disk_get_startoff(
- xfs_bmbt_rec_t *r)
+ const struct xfs_bmbt_rec *r)
{
return ((xfs_fileoff_t)be64_to_cpu(r->l0) &
xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
@@ -136,7 +136,7 @@ xfs_bmbt_to_bmdr(
xfs_bmbt_key_t *tkp;
__be64 *tpp;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC));
ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid,
&mp->m_sb.sb_meta_uuid));
@@ -193,10 +193,10 @@ xfs_bmbt_update_cursor(
STATIC int
xfs_bmbt_alloc_block(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *start,
- union xfs_btree_ptr *new,
- int *stat)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
{
xfs_alloc_arg_t args; /* block allocation args */
int error; /* error return value */
@@ -282,7 +282,7 @@ xfs_bmbt_free_block(
struct xfs_mount *mp = cur->bc_mp;
struct xfs_inode *ip = cur->bc_ino.ip;
struct xfs_trans *tp = cur->bc_tp;
- xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
+ xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
struct xfs_owner_info oinfo;
xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork);
@@ -352,8 +352,8 @@ xfs_bmbt_get_dmaxrecs(
STATIC void
xfs_bmbt_init_key_from_rec(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
{
key->bmbt.br_startoff =
cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt));
@@ -361,8 +361,8 @@ xfs_bmbt_init_key_from_rec(
STATIC void
xfs_bmbt_init_high_key_from_rec(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
{
key->bmbt.br_startoff = cpu_to_be64(
xfs_bmbt_disk_get_startoff(&rec->bmbt) +
@@ -387,8 +387,8 @@ xfs_bmbt_init_ptr_from_cur(
STATIC int64_t
xfs_bmbt_key_diff(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *key)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
{
return (int64_t)be64_to_cpu(key->bmbt.br_startoff) -
cur->bc_rec.b.br_startoff;
@@ -396,12 +396,12 @@ xfs_bmbt_key_diff(
STATIC int64_t
xfs_bmbt_diff_two_keys(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *k1,
- union xfs_btree_key *k2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
{
- uint64_t a = be64_to_cpu(k1->bmbt.br_startoff);
- uint64_t b = be64_to_cpu(k2->bmbt.br_startoff);
+ uint64_t a = be64_to_cpu(k1->bmbt.br_startoff);
+ uint64_t b = be64_to_cpu(k2->bmbt.br_startoff);
/*
* Note: This routine previously casted a and b to int64 and subtracted
@@ -428,7 +428,7 @@ xfs_bmbt_verify(
if (!xfs_verify_magic(bp, block->bb_magic))
return __this_address;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
/*
* XXX: need a better way of verifying the owner here. Right now
* just make sure there has been one set.
@@ -497,9 +497,9 @@ const struct xfs_buf_ops xfs_bmbt_buf_ops = {
STATIC int
xfs_bmbt_keys_inorder(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *k1,
- union xfs_btree_key *k2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
{
return be64_to_cpu(k1->bmbt.br_startoff) <
be64_to_cpu(k2->bmbt.br_startoff);
@@ -507,9 +507,9 @@ xfs_bmbt_keys_inorder(
STATIC int
xfs_bmbt_recs_inorder(
- struct xfs_btree_cur *cur,
- union xfs_btree_rec *r1,
- union xfs_btree_rec *r2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
{
return xfs_bmbt_disk_get_startoff(&r1->bmbt) +
xfs_bmbt_disk_get_blockcount(&r1->bmbt) <=
@@ -563,7 +563,7 @@ xfs_bmbt_init_cursor(
cur->bc_ops = &xfs_bmbt_ops;
cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
- if (xfs_sb_version_hascrc(&mp->m_sb))
+ if (xfs_has_crc(mp))
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
cur->bc_ino.forksize = XFS_IFORK_SIZE(ip, whichfork);
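The hunks above also stop reading bp->b_bn (and the XFS_BUF_ADDR() macro) directly and go through xfs_buf_daddr() instead. As a hedged sketch, such an accessor can be as small as the following; the struct layout is a stand-in, not the real struct xfs_buf.

typedef long long sketch_daddr_t;	/* 512-byte sector address */

struct sketch_buf {
	sketch_daddr_t	b_bn;		/* stand-in for the buffer's start sector */
};

static inline sketch_daddr_t sketch_buf_daddr(const struct sketch_buf *bp)
{
	return bp->b_bn;
}

Funnelling every reader through one helper means the field backing the disk address can later be renamed or moved without touching the call sites converted here.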
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 72bf74c79fb9..729e3bc569be 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -16,7 +16,7 @@ struct xfs_trans;
* Btree block header size depends on a superblock flag.
*/
#define XFS_BMBT_BLOCK_LEN(mp) \
- (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+ (xfs_has_crc(((mp))) ? \
XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN)
#define XFS_BMBT_REC_ADDR(mp, block, index) \
@@ -88,9 +88,10 @@ extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int,
struct xfs_btree_block *, int);
void xfs_bmbt_disk_set_all(struct xfs_bmbt_rec *r, struct xfs_bmbt_irec *s);
-extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
-extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
-extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
+extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(const struct xfs_bmbt_rec *r);
+extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(const struct xfs_bmbt_rec *r);
+void xfs_bmbt_disk_get_all(const struct xfs_bmbt_rec *r,
+ struct xfs_bmbt_irec *s);
extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
xfs_bmdr_block_t *, int);
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index be74a6b53689..298395481713 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -64,13 +64,13 @@ __xfs_btree_check_lblock(
{
struct xfs_mount *mp = cur->bc_mp;
xfs_btnum_t btnum = cur->bc_btnum;
- int crc = xfs_sb_version_hascrc(&mp->m_sb);
+ int crc = xfs_has_crc(mp);
if (crc) {
if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (block->bb_u.l.bb_blkno !=
- cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL))
+ cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL))
return __this_address;
if (block->bb_u.l.bb_pad != cpu_to_be32(0))
return __this_address;
@@ -129,13 +129,13 @@ __xfs_btree_check_sblock(
{
struct xfs_mount *mp = cur->bc_mp;
xfs_btnum_t btnum = cur->bc_btnum;
- int crc = xfs_sb_version_hascrc(&mp->m_sb);
+ int crc = xfs_has_crc(mp);
if (crc) {
if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (block->bb_u.s.bb_blkno !=
- cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL))
+ cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL))
return __this_address;
}
@@ -225,10 +225,10 @@ xfs_btree_check_sptr(
*/
static int
xfs_btree_check_ptr(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *ptr,
- int index,
- int level)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int index,
+ int level)
{
if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]),
@@ -273,7 +273,7 @@ xfs_btree_lblock_calc_crc(
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_buf_log_item *bip = bp->b_log_item;
- if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb))
+ if (!xfs_has_crc(bp->b_mount))
return;
if (bip)
block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
@@ -287,7 +287,7 @@ xfs_btree_lblock_verify_crc(
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_mount *mp = bp->b_mount;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
return false;
return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
@@ -311,7 +311,7 @@ xfs_btree_sblock_calc_crc(
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_buf_log_item *bip = bp->b_log_item;
- if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb))
+ if (!xfs_has_crc(bp->b_mount))
return;
if (bip)
block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
@@ -325,7 +325,7 @@ xfs_btree_sblock_verify_crc(
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_mount *mp = bp->b_mount;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
return false;
return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
@@ -374,7 +374,7 @@ xfs_btree_del_cursor(
}
ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 ||
- XFS_FORCED_SHUTDOWN(cur->bc_mp));
+ xfs_is_shutdown(cur->bc_mp));
if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
kmem_free(cur->bc_ops);
if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag)
@@ -420,7 +420,7 @@ xfs_btree_dup_cursor(
bp = cur->bc_bufs[i];
if (bp) {
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
- XFS_BUF_ADDR(bp), mp->m_bsize,
+ xfs_buf_daddr(bp), mp->m_bsize,
0, &bp,
cur->bc_ops->buf_ops);
if (error) {
@@ -935,9 +935,9 @@ xfs_btree_readahead(
STATIC int
xfs_btree_ptr_to_daddr(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *ptr,
- xfs_daddr_t *daddr)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ xfs_daddr_t *daddr)
{
xfs_fsblock_t fsbno;
xfs_agblock_t agbno;
@@ -1012,8 +1012,8 @@ xfs_btree_setbuf(
bool
xfs_btree_ptr_is_null(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *ptr)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr)
{
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
return ptr->l == cpu_to_be64(NULLFSBLOCK);
@@ -1059,10 +1059,10 @@ xfs_btree_get_sibling(
void
xfs_btree_set_sibling(
- struct xfs_btree_cur *cur,
- struct xfs_btree_block *block,
- union xfs_btree_ptr *ptr,
- int lr)
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ const union xfs_btree_ptr *ptr,
+ int lr)
{
ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
@@ -1090,7 +1090,7 @@ xfs_btree_init_block_int(
__u64 owner,
unsigned int flags)
{
- int crc = xfs_sb_version_hascrc(&mp->m_sb);
+ int crc = xfs_has_crc(mp);
__u32 magic = xfs_btree_magic(crc, btnum);
buf->bb_magic = cpu_to_be32(magic);
@@ -1131,7 +1131,7 @@ xfs_btree_init_block(
__u16 numrecs,
__u64 owner)
{
- xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
+ xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), xfs_buf_daddr(bp),
btnum, level, numrecs, owner, 0);
}
@@ -1155,9 +1155,9 @@ xfs_btree_init_block_cur(
else
owner = cur->bc_ag.pag->pag_agno;
- xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
- cur->bc_btnum, level, numrecs,
- owner, cur->bc_flags);
+ xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp),
+ xfs_buf_daddr(bp), cur->bc_btnum, level,
+ numrecs, owner, cur->bc_flags);
}
/*
@@ -1192,10 +1192,10 @@ xfs_btree_buf_to_ptr(
{
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
- XFS_BUF_ADDR(bp)));
+ xfs_buf_daddr(bp)));
else {
ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
- XFS_BUF_ADDR(bp)));
+ xfs_buf_daddr(bp)));
}
}
@@ -1229,10 +1229,10 @@ xfs_btree_set_refs(
int
xfs_btree_get_buf_block(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *ptr,
- struct xfs_btree_block **block,
- struct xfs_buf **bpp)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ struct xfs_btree_block **block,
+ struct xfs_buf **bpp)
{
struct xfs_mount *mp = cur->bc_mp;
xfs_daddr_t d;
@@ -1257,11 +1257,11 @@ xfs_btree_get_buf_block(
*/
STATIC int
xfs_btree_read_buf_block(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *ptr,
- int flags,
- struct xfs_btree_block **block,
- struct xfs_buf **bpp)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int flags,
+ struct xfs_btree_block **block,
+ struct xfs_buf **bpp)
{
struct xfs_mount *mp = cur->bc_mp;
xfs_daddr_t d;
@@ -1289,10 +1289,10 @@ xfs_btree_read_buf_block(
*/
void
xfs_btree_copy_keys(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *dst_key,
- union xfs_btree_key *src_key,
- int numkeys)
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *dst_key,
+ const union xfs_btree_key *src_key,
+ int numkeys)
{
ASSERT(numkeys >= 0);
memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
@@ -1713,10 +1713,10 @@ error0:
int
xfs_btree_lookup_get_block(
- struct xfs_btree_cur *cur, /* btree cursor */
- int level, /* level in the btree */
- union xfs_btree_ptr *pp, /* ptr to btree block */
- struct xfs_btree_block **blkp) /* return btree block */
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level, /* level in the btree */
+ const union xfs_btree_ptr *pp, /* ptr to btree block */
+ struct xfs_btree_block **blkp) /* return btree block */
{
struct xfs_buf *bp; /* buffer pointer for btree block */
xfs_daddr_t daddr;
@@ -1739,7 +1739,7 @@ xfs_btree_lookup_get_block(
error = xfs_btree_ptr_to_daddr(cur, pp, &daddr);
if (error)
return error;
- if (bp && XFS_BUF_ADDR(bp) == daddr) {
+ if (bp && xfs_buf_daddr(bp) == daddr) {
*blkp = XFS_BUF_TO_BLOCK(bp);
return 0;
}
@@ -1749,7 +1749,7 @@ xfs_btree_lookup_get_block(
return error;
/* Check the inode owner since the verifiers don't. */
- if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
+ if (xfs_has_crc(cur->bc_mp) &&
!(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) &&
(cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
@@ -2923,10 +2923,11 @@ xfs_btree_new_iroot(
*/
memcpy(cblock, block, xfs_btree_block_len(cur));
if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+ __be64 bno = cpu_to_be64(xfs_buf_daddr(cbp));
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
- cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
+ cblock->bb_u.l.bb_blkno = bno;
else
- cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
+ cblock->bb_u.s.bb_blkno = bno;
}
be16_add_cpu(&block->bb_level, 1);
@@ -3225,7 +3226,7 @@ xfs_btree_insrec(
/* Get pointers to the btree buffer and block. */
block = xfs_btree_get_block(cur, level, &bp);
- old_bn = bp ? bp->b_bn : XFS_BUF_DADDR_NULL;
+ old_bn = bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL;
numrecs = xfs_btree_get_numrecs(block);
#ifdef DEBUG
@@ -3341,7 +3342,7 @@ xfs_btree_insrec(
* some records into the new tree block), so use the regular key
* update mechanism.
*/
- if (bp && bp->b_bn != old_bn) {
+ if (bp && xfs_buf_daddr(bp) != old_bn) {
xfs_btree_get_keys(cur, block, lkey);
} else if (xfs_btree_needs_key_update(cur, optr)) {
error = xfs_btree_update_keys(cur, level);
@@ -4418,11 +4419,11 @@ xfs_btree_lblock_v5hdr_verify(
struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return __this_address;
if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
- if (block->bb_u.l.bb_blkno != cpu_to_be64(bp->b_bn))
+ if (block->bb_u.l.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
return __this_address;
if (owner != XFS_RMAP_OWN_UNKNOWN &&
be64_to_cpu(block->bb_u.l.bb_owner) != owner)
@@ -4468,11 +4469,11 @@ xfs_btree_sblock_v5hdr_verify(
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_perag *pag = bp->b_pag;
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return __this_address;
if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
- if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+ if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
return __this_address;
if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
return __this_address;
@@ -4499,7 +4500,7 @@ xfs_btree_sblock_verify(
return __this_address;
/* sibling pointer verification */
- agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp));
+ agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
!xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib)))
return __this_address;
@@ -4536,8 +4537,8 @@ xfs_btree_compute_maxlevels(
STATIC int
xfs_btree_simple_query_range(
struct xfs_btree_cur *cur,
- union xfs_btree_key *low_key,
- union xfs_btree_key *high_key,
+ const union xfs_btree_key *low_key,
+ const union xfs_btree_key *high_key,
xfs_btree_query_range_fn fn,
void *priv)
{
@@ -4627,8 +4628,8 @@ out:
STATIC int
xfs_btree_overlapped_query_range(
struct xfs_btree_cur *cur,
- union xfs_btree_key *low_key,
- union xfs_btree_key *high_key,
+ const union xfs_btree_key *low_key,
+ const union xfs_btree_key *high_key,
xfs_btree_query_range_fn fn,
void *priv)
{
@@ -4769,8 +4770,8 @@ out:
int
xfs_btree_query_range(
struct xfs_btree_cur *cur,
- union xfs_btree_irec *low_rec,
- union xfs_btree_irec *high_rec,
+ const union xfs_btree_irec *low_rec,
+ const union xfs_btree_irec *high_rec,
xfs_btree_query_range_fn fn,
void *priv)
{
@@ -4877,7 +4878,7 @@ xfs_btree_diff_two_ptrs(
STATIC int
xfs_btree_has_record_helper(
struct xfs_btree_cur *cur,
- union xfs_btree_rec *rec,
+ const union xfs_btree_rec *rec,
void *priv)
{
return -ECANCELED;
@@ -4886,12 +4887,12 @@ xfs_btree_has_record_helper(
/* Is there a record covering a given range of keys? */
int
xfs_btree_has_record(
- struct xfs_btree_cur *cur,
- union xfs_btree_irec *low,
- union xfs_btree_irec *high,
- bool *exists)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_irec *low,
+ const union xfs_btree_irec *high,
+ bool *exists)
{
- int error;
+ int error;
error = xfs_btree_query_range(cur, low, high,
&xfs_btree_has_record_helper, NULL);
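xfs_btree_has_record() above keeps its trick of using a helper that returns -ECANCELED to abort the range query as soon as any record is seen; only the argument types gained const. A self-contained sketch of that early-exit pattern, with the query loop and error value simplified into stand-ins:

#include <stdbool.h>

#define SKETCH_ECANCELED	125	/* stand-in errno value */

static int sketch_helper(void *rec, void *priv)
{
	(void)rec; (void)priv;
	return -SKETCH_ECANCELED;	/* stop the walk at the first record */
}

static int sketch_query(int (*fn)(void *, void *), void *priv, int nrecs)
{
	for (int i = 0; i < nrecs; i++) {
		int error = fn(&i /* record stand-in */, priv);
		if (error)
			return error;	/* propagate, including the sentinel */
	}
	return 0;
}

static int sketch_has_record(int nrecs, bool *exists)
{
	int error = sketch_query(sketch_helper, NULL, nrecs);

	if (error == -SKETCH_ECANCELED) {	/* a record was found */
		*exists = true;
		return 0;
	}
	*exists = false;
	return error;
}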
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 4dbdc659c396..4eaf8517f850 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -106,19 +106,19 @@ struct xfs_btree_ops {
/* update btree root pointer */
void (*set_root)(struct xfs_btree_cur *cur,
- union xfs_btree_ptr *nptr, int level_change);
+ const union xfs_btree_ptr *nptr, int level_change);
/* block allocation / freeing */
int (*alloc_block)(struct xfs_btree_cur *cur,
- union xfs_btree_ptr *start_bno,
+ const union xfs_btree_ptr *start_bno,
union xfs_btree_ptr *new_bno,
int *stat);
int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp);
/* update last record information */
void (*update_lastrec)(struct xfs_btree_cur *cur,
- struct xfs_btree_block *block,
- union xfs_btree_rec *rec,
+ const struct xfs_btree_block *block,
+ const union xfs_btree_rec *rec,
int ptr, int reason);
/* records in block/level */
@@ -130,37 +130,37 @@ struct xfs_btree_ops {
/* init values of btree structures */
void (*init_key_from_rec)(union xfs_btree_key *key,
- union xfs_btree_rec *rec);
+ const union xfs_btree_rec *rec);
void (*init_rec_from_cur)(struct xfs_btree_cur *cur,
union xfs_btree_rec *rec);
void (*init_ptr_from_cur)(struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr);
void (*init_high_key_from_rec)(union xfs_btree_key *key,
- union xfs_btree_rec *rec);
+ const union xfs_btree_rec *rec);
/* difference between key value and cursor value */
int64_t (*key_diff)(struct xfs_btree_cur *cur,
- union xfs_btree_key *key);
+ const union xfs_btree_key *key);
/*
* Difference between key2 and key1 -- positive if key1 > key2,
* negative if key1 < key2, and zero if equal.
*/
int64_t (*diff_two_keys)(struct xfs_btree_cur *cur,
- union xfs_btree_key *key1,
- union xfs_btree_key *key2);
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2);
const struct xfs_buf_ops *buf_ops;
/* check that k1 is lower than k2 */
int (*keys_inorder)(struct xfs_btree_cur *cur,
- union xfs_btree_key *k1,
- union xfs_btree_key *k2);
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2);
/* check that r1 is lower than r2 */
int (*recs_inorder)(struct xfs_btree_cur *cur,
- union xfs_btree_rec *r1,
- union xfs_btree_rec *r2);
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2);
};
/*
@@ -423,7 +423,7 @@ void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int);
/*
* Helpers.
*/
-static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block)
+static inline int xfs_btree_get_numrecs(const struct xfs_btree_block *block)
{
return be16_to_cpu(block->bb_numrecs);
}
@@ -434,7 +434,7 @@ static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block,
block->bb_numrecs = cpu_to_be16(numrecs);
}
-static inline int xfs_btree_get_level(struct xfs_btree_block *block)
+static inline int xfs_btree_get_level(const struct xfs_btree_block *block)
{
return be16_to_cpu(block->bb_level);
}
@@ -471,10 +471,11 @@ unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len);
* code on its own.
*/
typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur,
- union xfs_btree_rec *rec, void *priv);
+ const union xfs_btree_rec *rec, void *priv);
int xfs_btree_query_range(struct xfs_btree_cur *cur,
- union xfs_btree_irec *low_rec, union xfs_btree_irec *high_rec,
+ const union xfs_btree_irec *low_rec,
+ const union xfs_btree_irec *high_rec,
xfs_btree_query_range_fn fn, void *priv);
int xfs_btree_query_all(struct xfs_btree_cur *cur, xfs_btree_query_range_fn fn,
void *priv);
@@ -502,10 +503,11 @@ union xfs_btree_key *xfs_btree_high_key_addr(struct xfs_btree_cur *cur, int n,
union xfs_btree_ptr *xfs_btree_ptr_addr(struct xfs_btree_cur *cur, int n,
struct xfs_btree_block *block);
int xfs_btree_lookup_get_block(struct xfs_btree_cur *cur, int level,
- union xfs_btree_ptr *pp, struct xfs_btree_block **blkp);
+ const union xfs_btree_ptr *pp, struct xfs_btree_block **blkp);
struct xfs_btree_block *xfs_btree_get_block(struct xfs_btree_cur *cur,
int level, struct xfs_buf **bpp);
-bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr);
+bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr);
int64_t xfs_btree_diff_two_ptrs(struct xfs_btree_cur *cur,
const union xfs_btree_ptr *a,
const union xfs_btree_ptr *b);
@@ -516,8 +518,9 @@ void xfs_btree_get_keys(struct xfs_btree_cur *cur,
struct xfs_btree_block *block, union xfs_btree_key *key);
union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur,
union xfs_btree_key *key);
-int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low,
- union xfs_btree_irec *high, bool *exists);
+int xfs_btree_has_record(struct xfs_btree_cur *cur,
+ const union xfs_btree_irec *low,
+ const union xfs_btree_irec *high, bool *exists);
bool xfs_btree_has_more_records(struct xfs_btree_cur *cur);
struct xfs_ifork *xfs_btree_ifork_ptr(struct xfs_btree_cur *cur);
@@ -540,10 +543,11 @@ xfs_btree_islastblock(
void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr);
-int xfs_btree_get_buf_block(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr,
- struct xfs_btree_block **block, struct xfs_buf **bpp);
+int xfs_btree_get_buf_block(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr, struct xfs_btree_block **block,
+ struct xfs_buf **bpp);
void xfs_btree_set_sibling(struct xfs_btree_cur *cur,
- struct xfs_btree_block *block, union xfs_btree_ptr *ptr,
+ struct xfs_btree_block *block, const union xfs_btree_ptr *ptr,
int lr);
void xfs_btree_init_block_cur(struct xfs_btree_cur *cur,
struct xfs_buf *bp, int level, int numrecs);
@@ -551,7 +555,7 @@ void xfs_btree_copy_ptrs(struct xfs_btree_cur *cur,
union xfs_btree_ptr *dst_ptr,
const union xfs_btree_ptr *src_ptr, int numptrs);
void xfs_btree_copy_keys(struct xfs_btree_cur *cur,
- union xfs_btree_key *dst_key, union xfs_btree_key *src_key,
- int numkeys);
+ union xfs_btree_key *dst_key,
+ const union xfs_btree_key *src_key, int numkeys);
#endif /* __XFS_BTREE_H__ */
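The xfs_btree_ops and prototype changes in this header only add const to key, record and pointer arguments that the implementations never modify. A small illustration of what the qualifier buys at a call site; the types here are stand-ins for the union types above.

union sketch_key { unsigned long long startoff; };

/* With the const-qualified parameter the compiler rejects accidental
 * writes inside the comparison, and callers can pass keys derived from
 * read-only data without casting away const. */
static long long sketch_key_diff(const union sketch_key *key,
				 unsigned long long cursor_startoff)
{
	/* key->startoff = 0; would now be a compile error */
	return (long long)key->startoff - (long long)cursor_startoff;
}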
diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c
index aa8dc9521c39..ac9e80152b5c 100644
--- a/fs/xfs/libxfs/xfs_btree_staging.c
+++ b/fs/xfs/libxfs/xfs_btree_staging.c
@@ -59,10 +59,10 @@ xfs_btree_fakeroot_dup_cursor(
*/
STATIC int
xfs_btree_fakeroot_alloc_block(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *start_bno,
- union xfs_btree_ptr *new_bno,
- int *stat)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start_bno,
+ union xfs_btree_ptr *new_bno,
+ int *stat)
{
ASSERT(0);
return -EFSCORRUPTED;
@@ -112,9 +112,9 @@ xfs_btree_fakeroot_init_ptr_from_cur(
/* Update the btree root information for a per-AG fake root. */
STATIC void
xfs_btree_afakeroot_set_root(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *ptr,
- int inc)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int inc)
{
struct xbtree_afakeroot *afake = cur->bc_ag.afake;
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 747ec77912c3..c062e2c85178 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -129,7 +129,7 @@ xfs_da3_node_hdr_from_disk(
struct xfs_da3_icnode_hdr *to,
struct xfs_da_intnode *from)
{
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_da3_intnode *from3 = (struct xfs_da3_intnode *)from;
to->forw = be32_to_cpu(from3->hdr.info.hdr.forw);
@@ -156,7 +156,7 @@ xfs_da3_node_hdr_to_disk(
struct xfs_da_intnode *to,
struct xfs_da3_icnode_hdr *from)
{
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_da3_intnode *to3 = (struct xfs_da3_intnode *)to;
ASSERT(from->magic == XFS_DA3_NODE_MAGIC);
@@ -191,10 +191,10 @@ xfs_da3_blkinfo_verify(
if (!xfs_verify_magic16(bp, hdr->magic))
return __this_address;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
- if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+ if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp))
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
return __this_address;
@@ -253,7 +253,7 @@ xfs_da3_node_write_verify(
return;
}
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
if (bip)
@@ -442,12 +442,12 @@ xfs_da3_node_create(
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
node = bp->b_addr;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
memset(hdr3, 0, sizeof(struct xfs_da3_node_hdr));
ichdr.magic = XFS_DA3_NODE_MAGIC;
- hdr3->info.blkno = cpu_to_be64(bp->b_bn);
+ hdr3->info.blkno = cpu_to_be64(xfs_buf_daddr(bp));
hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid);
} else {
@@ -711,7 +711,7 @@ xfs_da3_root_split(
oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node;
- node3->hdr.info.blkno = cpu_to_be64(bp->b_bn);
+ node3->hdr.info.blkno = cpu_to_be64(xfs_buf_daddr(bp));
}
xfs_trans_log_buf(tp, bp, 0, size - 1);
@@ -1219,7 +1219,7 @@ xfs_da3_root_join(
xfs_trans_buf_copy_type(root_blk->bp, bp);
if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) {
struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr;
- da3->blkno = cpu_to_be64(root_blk->bp->b_bn);
+ da3->blkno = cpu_to_be64(xfs_buf_daddr(root_blk->bp));
}
xfs_trans_log_buf(args->trans, root_blk->bp, 0,
args->geo->blksize - 1);
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index b876b44c0204..5a49caa5c9df 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -789,7 +789,7 @@ struct xfs_attr3_rmt_hdr {
#define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc)
#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \
- ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
+ ((bufsize) - (xfs_has_crc((mp)) ? \
sizeof(struct xfs_attr3_rmt_hdr) : 0))
/* Number of bytes in a directory block. */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 050bdcc4fe73..50546eadaae2 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -115,7 +115,7 @@ xfs_da_mount(
dageo->fsblog = mp->m_sb.sb_blocklog;
dageo->blksize = xfs_dir2_dirblock_bytes(&mp->m_sb);
dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
dageo->node_hdr_size = sizeof(struct xfs_da3_node_hdr);
dageo->leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr);
dageo->free_hdr_size = sizeof(struct xfs_dir3_free_hdr);
@@ -730,7 +730,7 @@ xfs_dir2_hashname(
struct xfs_mount *mp,
struct xfs_name *name)
{
- if (unlikely(xfs_sb_version_hasasciici(&mp->m_sb)))
+ if (unlikely(xfs_has_asciici(mp)))
return xfs_ascii_ci_hashname(name);
return xfs_da_hashname(name->name, name->len);
}
@@ -741,7 +741,7 @@ xfs_dir2_compname(
const unsigned char *name,
int len)
{
- if (unlikely(xfs_sb_version_hasasciici(&args->dp->i_mount->m_sb)))
+ if (unlikely(xfs_has_asciici(args->dp->i_mount)))
return xfs_ascii_ci_compname(args, name, len);
return xfs_da_compname(args, name, len);
}
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 75e1421f69c4..df0869bba275 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -53,10 +53,10 @@ xfs_dir3_block_verify(
if (!xfs_verify_magic(bp, hdr3->magic))
return __this_address;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
- if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+ if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp))
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
return __this_address;
@@ -71,7 +71,7 @@ xfs_dir3_block_read_verify(
struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ if (xfs_has_crc(mp) &&
!xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
@@ -96,7 +96,7 @@ xfs_dir3_block_write_verify(
return;
}
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
if (bip)
@@ -121,7 +121,7 @@ xfs_dir3_block_header_check(
{
struct xfs_mount *mp = dp->i_mount;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
if (be64_to_cpu(hdr3->owner) != dp->i_ino)
@@ -171,10 +171,10 @@ xfs_dir3_block_init(
bp->b_ops = &xfs_dir3_block_buf_ops;
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
memset(hdr3, 0, sizeof(*hdr3));
hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
- hdr3->blkno = cpu_to_be64(bp->b_bn);
+ hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp));
hdr3->owner = cpu_to_be64(dp->i_ino);
uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
return;
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index e67fa086f2c1..dbcf58979a59 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -29,7 +29,7 @@ xfs_dir2_data_bestfree_p(
struct xfs_mount *mp,
struct xfs_dir2_data_hdr *hdr)
{
- if (xfs_sb_version_hascrc(&mp->m_sb))
+ if (xfs_has_crc(mp))
return ((struct xfs_dir3_data_hdr *)hdr)->best_free;
return hdr->bestfree;
}
@@ -51,7 +51,7 @@ xfs_dir2_data_get_ftype(
struct xfs_mount *mp,
struct xfs_dir2_data_entry *dep)
{
- if (xfs_sb_version_hasftype(&mp->m_sb)) {
+ if (xfs_has_ftype(mp)) {
uint8_t ftype = dep->name[dep->namelen];
if (likely(ftype < XFS_DIR3_FT_MAX))
@@ -70,7 +70,7 @@ xfs_dir2_data_put_ftype(
ASSERT(ftype < XFS_DIR3_FT_MAX);
ASSERT(dep->namelen != 0);
- if (xfs_sb_version_hasftype(&mp->m_sb))
+ if (xfs_has_ftype(mp))
dep->name[dep->namelen] = ftype;
}
@@ -297,10 +297,10 @@ xfs_dir3_data_verify(
if (!xfs_verify_magic(bp, hdr3->magic))
return __this_address;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
- if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+ if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp))
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
return __this_address;
@@ -343,7 +343,7 @@ xfs_dir3_data_read_verify(
struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ if (xfs_has_crc(mp) &&
!xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
@@ -368,7 +368,7 @@ xfs_dir3_data_write_verify(
return;
}
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
if (bip)
@@ -401,7 +401,7 @@ xfs_dir3_data_header_check(
{
struct xfs_mount *mp = dp->i_mount;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_dir3_data_hdr *hdr3 = bp->b_addr;
if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino)
@@ -717,12 +717,12 @@ xfs_dir3_data_init(
* Initialize the header.
*/
hdr = bp->b_addr;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
memset(hdr3, 0, sizeof(*hdr3));
hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
- hdr3->blkno = cpu_to_be64(bp->b_bn);
+ hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp));
hdr3->owner = cpu_to_be64(dp->i_ino);
uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index 5369d8bb2593..d9b66306a9a7 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -37,7 +37,7 @@ xfs_dir2_leaf_hdr_from_disk(
struct xfs_dir3_icleaf_hdr *to,
struct xfs_dir2_leaf *from)
{
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_dir3_leaf *from3 = (struct xfs_dir3_leaf *)from;
to->forw = be32_to_cpu(from3->hdr.info.hdr.forw);
@@ -68,7 +68,7 @@ xfs_dir2_leaf_hdr_to_disk(
struct xfs_dir2_leaf *to,
struct xfs_dir3_icleaf_hdr *from)
{
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_dir3_leaf *to3 = (struct xfs_dir3_leaf *)to;
ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC ||
@@ -108,7 +108,7 @@ xfs_dir3_leaf1_check(
if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) {
struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
- if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+ if (be64_to_cpu(leaf3->info.blkno) != xfs_buf_daddr(bp))
return __this_address;
} else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC)
return __this_address;
@@ -209,7 +209,7 @@ xfs_dir3_leaf_read_verify(
struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ if (xfs_has_crc(mp) &&
!xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
@@ -234,7 +234,7 @@ xfs_dir3_leaf_write_verify(
return;
}
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
if (bip)
@@ -308,7 +308,7 @@ xfs_dir3_leaf_init(
ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
memset(leaf3, 0, sizeof(*leaf3));
@@ -316,7 +316,7 @@ xfs_dir3_leaf_init(
leaf3->info.hdr.magic = (type == XFS_DIR2_LEAF1_MAGIC)
? cpu_to_be16(XFS_DIR3_LEAF1_MAGIC)
: cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
- leaf3->info.blkno = cpu_to_be64(bp->b_bn);
+ leaf3->info.blkno = cpu_to_be64(xfs_buf_daddr(bp));
leaf3->info.owner = cpu_to_be64(owner);
uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid);
} else {
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index d0520afb913a..7a03aeb9f4c9 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -68,7 +68,7 @@ xfs_dir3_leafn_check(
if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) {
struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
- if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+ if (be64_to_cpu(leaf3->info.blkno) != xfs_buf_daddr(bp))
return __this_address;
} else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC)
return __this_address;
@@ -105,12 +105,12 @@ xfs_dir3_free_verify(
if (!xfs_verify_magic(bp, hdr->magic))
return __this_address;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
- if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+ if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp))
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
return __this_address;
@@ -128,7 +128,7 @@ xfs_dir3_free_read_verify(
struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ if (xfs_has_crc(mp) &&
!xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
@@ -153,7 +153,7 @@ xfs_dir3_free_write_verify(
return;
}
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
if (bip)
@@ -185,7 +185,7 @@ xfs_dir3_free_header_check(
firstdb = (xfs_dir2_da_to_db(mp->m_dir_geo, fbno) -
xfs_dir2_byte_to_db(mp->m_dir_geo, XFS_DIR2_FREE_OFFSET)) *
maxbests;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
if (be32_to_cpu(hdr3->firstdb) != firstdb)
@@ -247,7 +247,7 @@ xfs_dir2_free_hdr_from_disk(
struct xfs_dir3_icfree_hdr *to,
struct xfs_dir2_free *from)
{
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_dir3_free *from3 = (struct xfs_dir3_free *)from;
to->magic = be32_to_cpu(from3->hdr.hdr.magic);
@@ -274,7 +274,7 @@ xfs_dir2_free_hdr_to_disk(
struct xfs_dir2_free *to,
struct xfs_dir3_icfree_hdr *from)
{
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_dir3_free *to3 = (struct xfs_dir3_free *)to;
ASSERT(from->magic == XFS_DIR3_FREE_MAGIC);
@@ -341,12 +341,12 @@ xfs_dir3_free_get_buf(
memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr));
memset(&hdr, 0, sizeof(hdr));
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
hdr.magic = XFS_DIR3_FREE_MAGIC;
- hdr3->hdr.blkno = cpu_to_be64(bp->b_bn);
+ hdr3->hdr.blkno = cpu_to_be64(xfs_buf_daddr(bp));
hdr3->hdr.owner = cpu_to_be64(dp->i_ino);
uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_meta_uuid);
} else
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index 94943ce49cab..711709a2aa53 100644
--- a/fs/xfs/libxfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -196,7 +196,7 @@ xfs_dir2_data_entsize(
len = offsetof(struct xfs_dir2_data_entry, name[0]) + namelen +
sizeof(xfs_dir2_data_off_t) /* tag */;
- if (xfs_sb_version_hasftype(&mp->m_sb))
+ if (xfs_has_ftype(mp))
len += sizeof(uint8_t);
return round_up(len, XFS_DIR2_DATA_ALIGN);
}
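xfs_dir2_data_entsize() above now asks xfs_has_ftype(mp) instead of probing the superblock; the arithmetic is unchanged. A worked example, assuming the usual layout of an 8-byte inode number, 1-byte name length, 2-byte tag and 8-byte alignment (all values assumed for illustration):

#define SKETCH_DATA_ALIGN	8	/* assumed XFS_DIR2_DATA_ALIGN */

static unsigned int sketch_entsize(unsigned int namelen, int has_ftype)
{
	unsigned int len = 8 /* inumber */ + 1 /* namelen */ + namelen
			 + 2 /* tag */;

	if (has_ftype)
		len += 1;		/* one file-type byte */
	/* round_up(len, SKETCH_DATA_ALIGN) */
	return (len + SKETCH_DATA_ALIGN - 1) & ~(SKETCH_DATA_ALIGN - 1);
}

/* Example: a 4-character name needs 8 + 1 + 4 + 2 = 15 bytes, rounded up
 * to 16; adding the file-type byte makes it exactly 16, so still 16. */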
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 46d18bf9d5e1..5a97a87eaa20 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -48,7 +48,7 @@ xfs_dir2_sf_entsize(
count += sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */
count += hdr->i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */
- if (xfs_sb_version_hasftype(&mp->m_sb))
+ if (xfs_has_ftype(mp))
count += sizeof(uint8_t);
return count;
}
@@ -76,7 +76,7 @@ xfs_dir2_sf_get_ino(
{
uint8_t *from = sfep->name + sfep->namelen;
- if (xfs_sb_version_hasftype(&mp->m_sb))
+ if (xfs_has_ftype(mp))
from++;
if (!hdr->i8count)
@@ -95,7 +95,7 @@ xfs_dir2_sf_put_ino(
ASSERT(ino <= XFS_MAXINUMBER);
- if (xfs_sb_version_hasftype(&mp->m_sb))
+ if (xfs_has_ftype(mp))
to++;
if (hdr->i8count)
@@ -135,7 +135,7 @@ xfs_dir2_sf_get_ftype(
struct xfs_mount *mp,
struct xfs_dir2_sf_entry *sfep)
{
- if (xfs_sb_version_hasftype(&mp->m_sb)) {
+ if (xfs_has_ftype(mp)) {
uint8_t ftype = sfep->name[sfep->namelen];
if (ftype < XFS_DIR3_FT_MAX)
@@ -153,7 +153,7 @@ xfs_dir2_sf_put_ftype(
{
ASSERT(ftype < XFS_DIR3_FT_MAX);
- if (xfs_sb_version_hasftype(&mp->m_sb))
+ if (xfs_has_ftype(mp))
sfep->name[sfep->namelen] = ftype;
}
@@ -192,7 +192,7 @@ xfs_dir2_block_sfsize(
* if there is a filetype field, add the extra byte to the namelen
* for each entry that we see.
*/
- has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 1 : 0;
+ has_ftype = xfs_has_ftype(mp) ? 1 : 0;
count = i8count = namelen = 0;
btp = xfs_dir2_block_tail_p(geo, hdr);
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index 6766417d5ba4..deeb74becabc 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -70,7 +70,7 @@ xfs_dquot_verify(
return __this_address;
if ((ddq->d_type & XFS_DQTYPE_BIGTIME) &&
- !xfs_sb_version_hasbigtime(&mp->m_sb))
+ !xfs_has_bigtime(mp))
return __this_address;
if ((ddq->d_type & XFS_DQTYPE_BIGTIME) && !ddq->d_id)
@@ -106,7 +106,7 @@ xfs_dqblk_verify(
struct xfs_dqblk *dqb,
xfs_dqid_t id) /* used only during quotacheck */
{
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ if (xfs_has_crc(mp) &&
!uuid_equal(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
@@ -134,7 +134,7 @@ xfs_dqblk_repair(
dqb->dd_diskdq.d_type = type;
dqb->dd_diskdq.d_id = cpu_to_be32(id);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
uuid_copy(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid);
xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
@@ -151,7 +151,7 @@ xfs_dquot_buf_verify_crc(
int ndquots;
int i;
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return true;
/*
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 76e2461b9e66..2d7057b7984b 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -9,7 +9,7 @@
/*
* XFS On Disk Format Definitions
*
- * This header file defines all the on-disk format definitions for
+ * This header file defines all the on-disk format definitions for
* general XFS objects. Directory and attribute related objects are defined in
* xfs_da_format.h, which log and log item formats are defined in
* xfs_log_format.h. Everything else goes here.
@@ -265,7 +265,6 @@ typedef struct xfs_dsb {
/* must be padded to 64 bit alignment */
} xfs_dsb_t;
-
/*
* Misc. Flags - warning - these will be cleared by xfs_repair unless
* a feature bit is set when the flag is used.
@@ -280,37 +279,9 @@ typedef struct xfs_dsb {
#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
-/*
- * The first XFS version we support is a v4 superblock with V2 directories.
- */
-static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp)
-{
- if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
- return false;
- if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT))
- return false;
-
- /* check for unknown features in the fs */
- if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
- ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
- (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
- return false;
-
- return true;
-}
-
-static inline bool xfs_sb_good_version(struct xfs_sb *sbp)
+static inline bool xfs_sb_is_v5(struct xfs_sb *sbp)
{
- if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
- return true;
- if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
- return xfs_sb_good_v4_features(sbp);
- return false;
-}
-
-static inline bool xfs_sb_version_hasrealtime(struct xfs_sb *sbp)
-{
- return sbp->sb_rblocks > 0;
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
}
/*
@@ -322,9 +293,10 @@ static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp)
return sbp->sb_bad_features2 != sbp->sb_features2;
}
-static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp)
+static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp)
{
- return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT);
+ return xfs_sb_is_v5(sbp) ||
+ (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
}
static inline void xfs_sb_version_addattr(struct xfs_sb *sbp)
@@ -332,87 +304,18 @@ static inline void xfs_sb_version_addattr(struct xfs_sb *sbp)
sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
}
-static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp)
-{
- return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
-}
-
static inline void xfs_sb_version_addquota(struct xfs_sb *sbp)
{
sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
}
-static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp)
-{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
- (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT));
-}
-
-static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp)
-{
- return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
-}
-
-static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp)
-{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
- (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
-}
-
-static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp)
-{
- return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
-}
-
-static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp)
-{
- return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
-}
-
-static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp)
-{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
- (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
-}
-
-/*
- * sb_features2 bit version macros.
- */
-static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp)
-{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
- (xfs_sb_version_hasmorebits(sbp) &&
- (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT));
-}
-
-static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp)
-{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
- (xfs_sb_version_hasmorebits(sbp) &&
- (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT));
-}
-
static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp)
{
sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
}
-static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp)
-{
- sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
- if (!sbp->sb_features2)
- sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
-}
-
-static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp)
-{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
- (xfs_sb_version_hasmorebits(sbp) &&
- (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT));
-}
-
-static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp)
+static inline void xfs_sb_version_addprojid32(struct xfs_sb *sbp)
{
sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT;
@@ -495,106 +398,21 @@ xfs_sb_has_incompat_log_feature(
return (sbp->sb_features_log_incompat & feature) != 0;
}
-/*
- * V5 superblock specific feature checks
- */
-static inline bool xfs_sb_version_hascrc(struct xfs_sb *sbp)
-{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
-}
-
-/*
- * v5 file systems support V3 inodes only, earlier file systems support
- * v2 and v1 inodes.
- */
-static inline bool xfs_sb_version_has_v3inode(struct xfs_sb *sbp)
-{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
-}
-
-static inline bool xfs_dinode_good_version(struct xfs_sb *sbp,
- uint8_t version)
-{
- if (xfs_sb_version_has_v3inode(sbp))
- return version == 3;
- return version == 1 || version == 2;
-}
-
-static inline bool xfs_sb_version_has_pquotino(struct xfs_sb *sbp)
-{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
-}
-
-static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp)
-{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
- xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) ||
- (xfs_sb_version_hasmorebits(sbp) &&
- (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE));
-}
-
-static inline bool xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
-{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
- (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT);
-}
-
-static inline bool xfs_sb_version_hassparseinodes(struct xfs_sb *sbp)
-{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
- xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_SPINODES);
-}
-
-/*
- * XFS_SB_FEAT_INCOMPAT_META_UUID indicates that the metadata UUID
- * is stored separately from the user-visible UUID; this allows the
- * user-visible UUID to be changed on V5 filesystems which have a
- * filesystem UUID stamped into every piece of metadata.
- */
-static inline bool xfs_sb_version_hasmetauuid(struct xfs_sb *sbp)
-{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
- (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID);
-}
-
-static inline bool xfs_sb_version_hasrmapbt(struct xfs_sb *sbp)
-{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
- (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT);
-}
-
-static inline bool xfs_sb_version_hasreflink(struct xfs_sb *sbp)
-{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
- (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK);
-}
-
-static inline bool xfs_sb_version_hasbigtime(struct xfs_sb *sbp)
-{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
- (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_BIGTIME);
-}
-
-/*
- * Inode btree block counter. We record the number of inobt and finobt blocks
- * in the AGI header so that we can skip the finobt walk at mount time when
- * setting up per-AG reservations.
- */
-static inline bool xfs_sb_version_hasinobtcounts(struct xfs_sb *sbp)
+static inline void
+xfs_sb_remove_incompat_log_features(
+ struct xfs_sb *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
- (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT);
+ sbp->sb_features_log_incompat &= ~XFS_SB_FEAT_INCOMPAT_LOG_ALL;
}
-static inline bool xfs_sb_version_needsrepair(struct xfs_sb *sbp)
+static inline void
+xfs_sb_add_incompat_log_features(
+ struct xfs_sb *sbp,
+ unsigned int features)
{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
- (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR);
+ sbp->sb_features_log_incompat |= features;
}
-/*
- * end of superblock version macros
- */
static inline bool
xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
@@ -1062,12 +880,12 @@ enum xfs_dinode_fmt {
/*
* Inode size for given fs.
*/
-#define XFS_DINODE_SIZE(sbp) \
- (xfs_sb_version_has_v3inode(sbp) ? \
+#define XFS_DINODE_SIZE(mp) \
+ (xfs_has_v3inodes(mp) ? \
sizeof(struct xfs_dinode) : \
offsetof(struct xfs_dinode, di_crc))
#define XFS_LITINO(mp) \
- ((mp)->m_sb.sb_inodesize - XFS_DINODE_SIZE(&(mp)->m_sb))
+ ((mp)->m_sb.sb_inodesize - XFS_DINODE_SIZE(mp))
/*
* Inode data & attribute fork sizes, per inode.
@@ -1454,7 +1272,7 @@ struct xfs_dsymlink_hdr {
#define XFS_SYMLINK_MAPS 3
#define XFS_SYMLINK_BUF_SPACE(mp, bufsize) \
- ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
+ ((bufsize) - (xfs_has_crc((mp)) ? \
sizeof(struct xfs_dsymlink_hdr) : 0))
@@ -1686,7 +1504,7 @@ struct xfs_rmap_key {
typedef __be32 xfs_rmap_ptr_t;
#define XFS_RMAP_BLOCK(mp) \
- (xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \
+ (xfs_has_finobt(((mp))) ? \
XFS_FIBT_BLOCK(mp) + 1 : \
XFS_IBT_BLOCK(mp) + 1)
@@ -1918,7 +1736,7 @@ struct xfs_acl {
* limited only by the maximum size of the xattr that stores the information.
*/
#define XFS_ACL_MAX_ENTRIES(mp) \
- (xfs_sb_version_hascrc(&mp->m_sb) \
+ (xfs_has_crc(mp) \
? (XFS_XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
sizeof(struct xfs_acl_entry) \
: 25)
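The block of xfs_sb_version_has*() helpers deleted above all re-derived a yes/no answer from the on-disk superblock on every call; the rest of this patch answers those questions from mount-level state instead. A hedged sketch of how such version bits might be folded into a single feature word when the superblock is read; the flag values and the helper name are assumptions, not taken from this diff.

#include <stdint.h>

#define SKETCH_VERSION_NUMBITS	0x000f
#define SKETCH_VERSION_5	5
#define SKETCH_VERSION_ATTRBIT	0x0010	/* assumed version bit */

#define SKETCH_FEAT_ATTR	(1ULL << 0)
#define SKETCH_FEAT_CRC		(1ULL << 1)

struct sketch_sb {
	uint16_t	sb_versionnum;
};

/* Done once when the superblock is read in; every later check reduces to
 * a mask test against the mount's feature word. */
static uint64_t sketch_sb_to_features(const struct sketch_sb *sbp)
{
	uint64_t features = 0;

	if (sbp->sb_versionnum & SKETCH_VERSION_ATTRBIT)
		features |= SKETCH_FEAT_ATTR;
	if ((sbp->sb_versionnum & SKETCH_VERSION_NUMBITS) == SKETCH_VERSION_5)
		features |= SKETCH_FEAT_CRC;	/* v5 superblocks always carry CRCs */
	return features;
}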
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index aaf8805a82df..994ad783d407 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -58,7 +58,7 @@ xfs_inobt_update(
union xfs_btree_rec rec;
rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
- if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
+ if (xfs_has_sparseinodes(cur->bc_mp)) {
rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask);
rec.inobt.ir_u.sp.ir_count = irec->ir_count;
rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount;
@@ -74,11 +74,11 @@ xfs_inobt_update(
void
xfs_inobt_btrec_to_irec(
struct xfs_mount *mp,
- union xfs_btree_rec *rec,
+ const union xfs_btree_rec *rec,
struct xfs_inobt_rec_incore *irec)
{
irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
- if (xfs_sb_version_hassparseinodes(&mp->m_sb)) {
+ if (xfs_has_sparseinodes(mp)) {
irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask);
irec->ir_count = rec->inobt.ir_u.sp.ir_count;
irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount;
@@ -241,7 +241,7 @@ xfs_check_agi_freecount(
}
} while (i == 1);
- if (!XFS_FORCED_SHUTDOWN(cur->bc_mp))
+ if (!xfs_is_shutdown(cur->bc_mp))
ASSERT(freecount == cur->bc_ag.pag->pagi_freecount);
}
return 0;
@@ -302,7 +302,7 @@ xfs_ialloc_inode_init(
* That means for v3 inode we log the entire buffer rather than just the
* inode cores.
*/
- if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
+ if (xfs_has_v3inodes(mp)) {
version = 3;
ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno));
@@ -337,7 +337,6 @@ xfs_ialloc_inode_init(
xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) {
int ioffset = i << mp->m_sb.sb_inodelog;
- uint isize = XFS_DINODE_SIZE(&mp->m_sb);
free = xfs_make_iptr(mp, fbuf, i);
free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
@@ -354,7 +353,7 @@ xfs_ialloc_inode_init(
} else if (tp) {
/* just log the inode core */
xfs_trans_log_buf(tp, fbuf, ioffset,
- ioffset + isize - 1);
+ ioffset + XFS_DINODE_SIZE(mp) - 1);
}
}
@@ -635,7 +634,7 @@ xfs_ialloc_ag_alloc(
#ifdef DEBUG
/* randomly do sparse inode allocations */
- if (xfs_sb_version_hassparseinodes(&tp->t_mountp->m_sb) &&
+ if (xfs_has_sparseinodes(tp->t_mountp) &&
igeo->ialloc_min_blks < igeo->ialloc_blks)
do_sparse = prandom_u32() & 1;
#endif
@@ -712,7 +711,7 @@ xfs_ialloc_ag_alloc(
*/
isaligned = 0;
if (igeo->ialloc_align) {
- ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
+ ASSERT(!xfs_has_noalign(args.mp));
args.alignment = args.mp->m_dalign;
isaligned = 1;
} else
@@ -754,7 +753,7 @@ xfs_ialloc_ag_alloc(
* Finally, try a sparse allocation if the filesystem supports it and
* the sparse allocation length is smaller than a full chunk.
*/
- if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) &&
+ if (xfs_has_sparseinodes(args.mp) &&
igeo->ialloc_min_blks < igeo->ialloc_blks &&
args.fsbno == NULLFSBLOCK) {
sparse_alloc:
@@ -856,7 +855,7 @@ sparse_alloc:
* from the previous call. Set merge false to replace any
* existing record with this one.
*/
- if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+ if (xfs_has_finobt(args.mp)) {
error = xfs_inobt_insert_sprec(args.mp, tp, agbp, pag,
XFS_BTNUM_FINO, &rec, false);
if (error)
@@ -869,7 +868,7 @@ sparse_alloc:
if (error)
return error;
- if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+ if (xfs_has_finobt(args.mp)) {
error = xfs_inobt_insert(args.mp, tp, agbp, pag, newino,
newlen, XFS_BTNUM_FINO);
if (error)
@@ -1448,7 +1447,7 @@ xfs_dialloc_ag(
int offset;
int i;
- if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+ if (!xfs_has_finobt(mp))
return xfs_dialloc_ag_inobt(tp, agbp, pag, parent, inop);
/*
@@ -1784,7 +1783,7 @@ xfs_dialloc(
break;
}
- if (XFS_FORCED_SHUTDOWN(mp)) {
+ if (xfs_is_shutdown(mp)) {
error = -EFSCORRUPTED;
break;
}
@@ -1953,8 +1952,7 @@ xfs_difree_inobt(
* remove the chunk if the block size is large enough for multiple inode
* chunks (that might not be free).
*/
- if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
- rec.ir_free == XFS_INOBT_ALL_FREE &&
+ if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
struct xfs_perag *pag = agbp->b_pag;
@@ -1994,7 +1992,7 @@ xfs_difree_inobt(
goto error0;
}
- /*
+ /*
* Change the inode free counts and log the ag/sb changes.
*/
be32_add_cpu(&agi->agi_freecount, 1);
@@ -2098,9 +2096,8 @@ xfs_difree_finobt(
* enough for multiple chunks. Leave the finobt record to remain in sync
* with the inobt.
*/
- if (rec.ir_free == XFS_INOBT_ALL_FREE &&
- mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK &&
- !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+ if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
+ mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
error = xfs_btree_delete(cur, &i);
if (error)
goto error;
@@ -2189,7 +2186,7 @@ xfs_difree(
/*
* Fix up the free inode btree.
*/
- if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ if (xfs_has_finobt(mp)) {
error = xfs_difree_finobt(mp, tp, agbp, pag, agino, &rec);
if (error)
goto error0;
@@ -2478,7 +2475,7 @@ xfs_agi_verify(
struct xfs_agi *agi = bp->b_addr;
int i;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(agi->agi_lsn)))
@@ -2497,7 +2494,7 @@ xfs_agi_verify(
be32_to_cpu(agi->agi_level) > M_IGEO(mp)->inobt_maxlevels)
return __this_address;
- if (xfs_sb_version_hasfinobt(&mp->m_sb) &&
+ if (xfs_has_finobt(mp) &&
(be32_to_cpu(agi->agi_free_level) < 1 ||
be32_to_cpu(agi->agi_free_level) > M_IGEO(mp)->inobt_maxlevels))
return __this_address;
@@ -2528,7 +2525,7 @@ xfs_agi_read_verify(
struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ if (xfs_has_crc(mp) &&
!xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
@@ -2553,7 +2550,7 @@ xfs_agi_write_verify(
return;
}
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
if (bip)
@@ -2626,7 +2623,7 @@ xfs_ialloc_read_agi(
* we are in the middle of a forced shutdown.
*/
ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
- XFS_FORCED_SHUTDOWN(mp));
+ xfs_is_shutdown(mp));
return 0;
}
@@ -2716,7 +2713,7 @@ struct xfs_ialloc_count_inodes {
STATIC int
xfs_ialloc_count_inodes_rec(
struct xfs_btree_cur *cur,
- union xfs_btree_rec *rec,
+ const union xfs_btree_rec *rec,
void *priv)
{
struct xfs_inobt_rec_incore irec;
@@ -2773,7 +2770,7 @@ xfs_ialloc_setup_geometry(
uint inodes;
igeo->new_diflags2 = 0;
- if (xfs_sb_version_hasbigtime(&mp->m_sb))
+ if (xfs_has_bigtime(mp))
igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME;
/* Compute inode btree geometry. */
@@ -2828,7 +2825,7 @@ xfs_ialloc_setup_geometry(
* cannot change the behavior.
*/
igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE;
- if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
+ if (xfs_has_v3inodes(mp)) {
int new_size = igeo->inode_cluster_size_raw;
new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
@@ -2846,7 +2843,7 @@ xfs_ialloc_setup_geometry(
igeo->inodes_per_cluster = XFS_FSB_TO_INO(mp, igeo->blocks_per_cluster);
/* Calculate inode cluster alignment. */
- if (xfs_sb_version_hasalign(&mp->m_sb) &&
+ if (xfs_has_align(mp) &&
mp->m_sb.sb_inoalignmt >= igeo->blocks_per_cluster)
igeo->cluster_align = mp->m_sb.sb_inoalignmt;
else
@@ -2894,15 +2891,15 @@ xfs_ialloc_calc_rootino(
first_bno += xfs_alloc_min_freelist(mp, NULL);
/* ...the free inode btree root... */
- if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ if (xfs_has_finobt(mp))
first_bno++;
/* ...the reverse mapping btree root... */
- if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (xfs_has_rmapbt(mp))
first_bno++;
/* ...the reference count btree... */
- if (xfs_sb_version_hasreflink(&mp->m_sb))
+ if (xfs_has_reflink(mp))
first_bno++;
/*
@@ -2920,9 +2917,9 @@ xfs_ialloc_calc_rootino(
* Now round first_bno up to whatever allocation alignment is given
* by the filesystem or was passed in.
*/
- if (xfs_sb_version_hasdalign(&mp->m_sb) && igeo->ialloc_align > 0)
+ if (xfs_has_dalign(mp) && igeo->ialloc_align > 0)
first_bno = roundup(first_bno, sunit);
- else if (xfs_sb_version_hasalign(&mp->m_sb) &&
+ else if (xfs_has_align(mp) &&
mp->m_sb.sb_inoalignmt > 1)
first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt);
@@ -2953,7 +2950,7 @@ xfs_ialloc_check_shrink(
int has;
int error;
- if (!xfs_sb_version_hassparseinodes(&mp->m_sb))
+ if (!xfs_has_sparseinodes(mp))
return 0;
pag = xfs_perag_get(mp, agno);
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 9a2112b4ad5e..8b5c2b709022 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -106,7 +106,8 @@ int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, struct xfs_buf **bpp);
union xfs_btree_rec;
-void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, union xfs_btree_rec *rec,
+void xfs_inobt_btrec_to_irec(struct xfs_mount *mp,
+ const union xfs_btree_rec *rec,
struct xfs_inobt_rec_incore *irec);
int xfs_ialloc_has_inodes_at_extent(struct xfs_btree_cur *cur,
xfs_agblock_t bno, xfs_extlen_t len, bool *exists);
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 823a038939f8..27190840c5d8 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -40,9 +40,9 @@ xfs_inobt_dup_cursor(
STATIC void
xfs_inobt_set_root(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *nptr,
- int inc) /* level change */
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *nptr,
+ int inc) /* level change */
{
struct xfs_buf *agbp = cur->bc_ag.agbp;
struct xfs_agi *agi = agbp->b_addr;
@@ -54,9 +54,9 @@ xfs_inobt_set_root(
STATIC void
xfs_finobt_set_root(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *nptr,
- int inc) /* level change */
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *nptr,
+ int inc) /* level change */
{
struct xfs_buf *agbp = cur->bc_ag.agbp;
struct xfs_agi *agi = agbp->b_addr;
@@ -76,7 +76,7 @@ xfs_inobt_mod_blockcount(
struct xfs_buf *agbp = cur->bc_ag.agbp;
struct xfs_agi *agi = agbp->b_addr;
- if (!xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb))
+ if (!xfs_has_inobtcounts(cur->bc_mp))
return;
if (cur->bc_btnum == XFS_BTNUM_FINO)
@@ -88,11 +88,11 @@ xfs_inobt_mod_blockcount(
STATIC int
__xfs_inobt_alloc_block(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *start,
- union xfs_btree_ptr *new,
- int *stat,
- enum xfs_ag_resv_type resv)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat,
+ enum xfs_ag_resv_type resv)
{
xfs_alloc_arg_t args; /* block allocation args */
int error; /* error return value */
@@ -127,20 +127,20 @@ __xfs_inobt_alloc_block(
STATIC int
xfs_inobt_alloc_block(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *start,
- union xfs_btree_ptr *new,
- int *stat)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
{
return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE);
}
STATIC int
xfs_finobt_alloc_block(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *start,
- union xfs_btree_ptr *new,
- int *stat)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
{
if (cur->bc_mp->m_finobt_nores)
return xfs_inobt_alloc_block(cur, start, new, stat);
@@ -156,7 +156,7 @@ __xfs_inobt_free_block(
{
xfs_inobt_mod_blockcount(cur, -1);
return xfs_free_extent(cur->bc_tp,
- XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1,
+ XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)), 1,
&XFS_RMAP_OINFO_INOBT, resv);
}
@@ -188,18 +188,18 @@ xfs_inobt_get_maxrecs(
STATIC void
xfs_inobt_init_key_from_rec(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
{
key->inobt.ir_startino = rec->inobt.ir_startino;
}
STATIC void
xfs_inobt_init_high_key_from_rec(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
{
- __u32 x;
+ __u32 x;
x = be32_to_cpu(rec->inobt.ir_startino);
x += XFS_INODES_PER_CHUNK - 1;
@@ -212,7 +212,7 @@ xfs_inobt_init_rec_from_cur(
union xfs_btree_rec *rec)
{
rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
- if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
+ if (xfs_has_sparseinodes(cur->bc_mp)) {
rec->inobt.ir_u.sp.ir_holemask =
cpu_to_be16(cur->bc_rec.i.ir_holemask);
rec->inobt.ir_u.sp.ir_count = cur->bc_rec.i.ir_count;
@@ -253,8 +253,8 @@ xfs_finobt_init_ptr_from_cur(
STATIC int64_t
xfs_inobt_key_diff(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *key)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
{
return (int64_t)be32_to_cpu(key->inobt.ir_startino) -
cur->bc_rec.i.ir_startino;
@@ -262,9 +262,9 @@ xfs_inobt_key_diff(
STATIC int64_t
xfs_inobt_diff_two_keys(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *k1,
- union xfs_btree_key *k2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
{
return (int64_t)be32_to_cpu(k1->inobt.ir_startino) -
be32_to_cpu(k2->inobt.ir_startino);
@@ -292,7 +292,7 @@ xfs_inobt_verify(
* but beware of the landmine (i.e. need to check pag->pagi_init) if we
* ever do.
*/
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
fa = xfs_btree_sblock_v5hdr_verify(bp);
if (fa)
return fa;
@@ -360,9 +360,9 @@ const struct xfs_buf_ops xfs_finobt_buf_ops = {
STATIC int
xfs_inobt_keys_inorder(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *k1,
- union xfs_btree_key *k2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
{
return be32_to_cpu(k1->inobt.ir_startino) <
be32_to_cpu(k2->inobt.ir_startino);
@@ -370,9 +370,9 @@ xfs_inobt_keys_inorder(
STATIC int
xfs_inobt_recs_inorder(
- struct xfs_btree_cur *cur,
- union xfs_btree_rec *r1,
- union xfs_btree_rec *r2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
{
return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <=
be32_to_cpu(r2->inobt.ir_startino);
@@ -446,7 +446,7 @@ xfs_inobt_init_common(
cur->bc_blocklog = mp->m_sb.sb_blocklog;
- if (xfs_sb_version_hascrc(&mp->m_sb))
+ if (xfs_has_crc(mp))
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
/* take a reference for the cursor */
@@ -511,7 +511,7 @@ xfs_inobt_commit_staged_btree(
fields = XFS_AGI_ROOT | XFS_AGI_LEVEL;
agi->agi_root = cpu_to_be32(afake->af_root);
agi->agi_level = cpu_to_be32(afake->af_levels);
- if (xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb)) {
+ if (xfs_has_inobtcounts(cur->bc_mp)) {
agi->agi_iblocks = cpu_to_be32(afake->af_blocks);
fields |= XFS_AGI_IBLOCKS;
}
@@ -521,7 +521,7 @@ xfs_inobt_commit_staged_btree(
fields = XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL;
agi->agi_free_root = cpu_to_be32(afake->af_root);
agi->agi_free_level = cpu_to_be32(afake->af_levels);
- if (xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb)) {
+ if (xfs_has_inobtcounts(cur->bc_mp)) {
agi->agi_fblocks = cpu_to_be32(afake->af_blocks);
fields |= XFS_AGI_IBLOCKS;
}
@@ -737,10 +737,10 @@ xfs_finobt_calc_reserves(
xfs_extlen_t tree_len = 0;
int error;
- if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+ if (!xfs_has_finobt(mp))
return 0;
- if (xfs_sb_version_hasinobtcounts(&mp->m_sb))
+ if (xfs_has_inobtcounts(mp))
error = xfs_finobt_read_blocks(mp, tp, pag, &tree_len);
else
error = xfs_inobt_count_blocks(mp, tp, pag, XFS_BTNUM_FINO,
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index e530c82b2217..8a322d402e61 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -19,7 +19,7 @@ struct xfs_perag;
* Btree block header size depends on a superblock flag.
*/
#define XFS_INOBT_BLOCK_LEN(mp) \
- (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+ (xfs_has_crc(((mp))) ? \
XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN)
/*
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 84ea2e0af9f0..3932b4ebf903 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -48,7 +48,7 @@ xfs_inode_buf_verify(
/*
* Validate the magic number and version of every inode in the buffer
*/
- agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp));
+ agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
for (i = 0; i < ni; i++) {
int di_ok;
@@ -58,7 +58,7 @@ xfs_inode_buf_verify(
dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
unlinked_ino = be32_to_cpu(dip->di_next_unlinked);
di_ok = xfs_verify_magic16(bp, dip->di_magic) &&
- xfs_dinode_good_version(&mp->m_sb, dip->di_version) &&
+ xfs_dinode_good_version(mp, dip->di_version) &&
xfs_verify_agino_or_null(mp, agno, unlinked_ino);
if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
XFS_ERRTAG_ITOBP_INOTOBP))) {
@@ -71,7 +71,7 @@ xfs_inode_buf_verify(
#ifdef DEBUG
xfs_alert(mp,
"bad inode magic/vsn daddr %lld #%d (magic=%x)",
- (unsigned long long)bp->b_bn, i,
+ (unsigned long long)xfs_buf_daddr(bp), i,
be16_to_cpu(dip->di_magic));
#endif
xfs_buf_verifier_error(bp, -EFSCORRUPTED,
@@ -192,7 +192,7 @@ xfs_inode_from_disk(
* inode. If the inode is unused, mode is zero and we shouldn't mess
* with the uninitialized part of it.
*/
- if (!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb))
+ if (!xfs_has_v3inodes(ip->i_mount))
ip->i_flushiter = be16_to_cpu(from->di_flushiter);
inode->i_generation = be32_to_cpu(from->di_gen);
inode->i_mode = be16_to_cpu(from->di_mode);
@@ -235,7 +235,7 @@ xfs_inode_from_disk(
if (from->di_dmevmask || from->di_dmstate)
xfs_iflags_set(ip, XFS_IPRESERVE_DM_FIELDS);
- if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
+ if (xfs_has_v3inodes(ip->i_mount)) {
inode_set_iversion_queried(inode,
be64_to_cpu(from->di_changecount));
ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime);
@@ -313,7 +313,7 @@ xfs_inode_to_disk(
to->di_aformat = xfs_ifork_format(ip->i_afp);
to->di_flags = cpu_to_be16(ip->i_diflags);
- if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
+ if (xfs_has_v3inodes(ip->i_mount)) {
to->di_version = 3;
to->di_changecount = cpu_to_be64(inode_peek_iversion(inode));
to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime);
@@ -413,7 +413,7 @@ xfs_dinode_verify(
/* Verify v3 integrity information first */
if (dip->di_version >= 3) {
- if (!xfs_sb_version_has_v3inode(&mp->m_sb))
+ if (!xfs_has_v3inodes(mp))
return __this_address;
if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
XFS_DINODE_CRC_OFF))
@@ -515,7 +515,7 @@ xfs_dinode_verify(
/* don't allow reflink/cowextsize if we don't have reflink */
if ((flags2 & (XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)) &&
- !xfs_sb_version_hasreflink(&mp->m_sb))
+ !xfs_has_reflink(mp))
return __this_address;
/* only regular files get reflink */
@@ -534,7 +534,7 @@ xfs_dinode_verify(
/* bigtime iflag can only happen on bigtime filesystems */
if (xfs_dinode_has_bigtime(dip) &&
- !xfs_sb_version_hasbigtime(&mp->m_sb))
+ !xfs_has_bigtime(mp))
return __this_address;
return NULL;
@@ -550,7 +550,7 @@ xfs_dinode_calc_crc(
if (dip->di_version < 3)
return;
- ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
+ ASSERT(xfs_has_crc(mp));
crc = xfs_start_cksum_update((char *)dip, mp->m_sb.sb_inodesize,
XFS_DINODE_CRC_OFF);
dip->di_crc = xfs_end_cksum(crc);
@@ -677,7 +677,7 @@ xfs_inode_validate_cowextsize(
hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE);
cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize);
- if (hint_flag && !xfs_sb_version_hasreflink(&mp->m_sb))
+ if (hint_flag && !xfs_has_reflink(mp))
return __this_address;
if (hint_flag && !(S_ISDIR(mode) || S_ISREG(mode)))
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 7f865bb4df84..585ed5a110af 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -21,7 +21,7 @@ struct xfs_imap {
int xfs_imap_to_bp(struct xfs_mount *mp, struct xfs_trans *tp,
struct xfs_imap *imap, struct xfs_buf **bpp);
-void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
+void xfs_dinode_calc_crc(struct xfs_mount *mp, struct xfs_dinode *dip);
void xfs_inode_to_disk(struct xfs_inode *ip, struct xfs_dinode *to,
xfs_lsn_t lsn);
int xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from);
@@ -42,4 +42,13 @@ static inline uint64_t xfs_inode_encode_bigtime(struct timespec64 tv)
struct timespec64 xfs_inode_from_disk_ts(struct xfs_dinode *dip,
const xfs_timestamp_t ts);
+static inline bool
+xfs_dinode_good_version(struct xfs_mount *mp, uint8_t version)
+{
+ if (xfs_has_v3inodes(mp))
+ return version == 3;
+ return version == 1 || version == 2;
+}
+
+
#endif /* __XFS_INODE_BUF_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 2c5bcbc19264..b322db523d65 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -41,10 +41,10 @@ typedef uint32_t xlog_tid_t;
#define XFS_MIN_LOG_FACTOR 3
#define XLOG_REC_SHIFT(log) \
- BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
+ BTOBB(1 << (xfs_has_logv2(log->l_mp) ? \
XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
#define XLOG_TOTAL_REC_SHIFT(log) \
- BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
+ BTOBB(XLOG_MAX_ICLOGS << (xfs_has_logv2(log->l_mp) ? \
XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
/* get lsn fields */
@@ -434,7 +434,7 @@ struct xfs_log_dinode {
};
#define xfs_log_dinode_size(mp) \
- (xfs_sb_version_has_v3inode(&(mp)->m_sb) ? \
+ (xfs_has_v3inodes((mp)) ? \
sizeof(struct xfs_log_dinode) : \
offsetof(struct xfs_log_dinode, di_next_unlinked))
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 3cca2bfe714c..ff69a0000817 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -122,6 +122,8 @@ void xlog_buf_readahead(struct xlog *log, xfs_daddr_t blkno, uint len,
const struct xfs_buf_ops *ops);
bool xlog_is_buffer_cancelled(struct xlog *log, xfs_daddr_t blkno, uint len);
+int xlog_recover_iget(struct xfs_mount *mp, xfs_ino_t ino,
+ struct xfs_inode **ipp);
void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type,
uint64_t intent_id);
diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
index 7f55eb3f3653..67798ff5e14e 100644
--- a/fs/xfs/libxfs/xfs_log_rlimit.c
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
@@ -92,7 +92,7 @@ xfs_log_calc_minimum_size(
if (tres.tr_logcount > 1)
max_logres *= tres.tr_logcount;
- if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1)
+ if (xfs_has_logv2(mp) && mp->m_sb.sb_logsunit > 1)
lsunit = BTOBB(mp->m_sb.sb_logsunit);
/*
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index 0f0af4e35032..a02c5062f9b2 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -60,37 +60,15 @@ typedef uint8_t xfs_dqtype_t;
#define XFS_DQUOT_LOGRES(mp) \
((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6)
-#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
-#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT)
-#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT)
-#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT)
+#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
+#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT)
+#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT)
+#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT)
#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD)
#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD)
#define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD)
/*
- * Incore only flags for quotaoff - these bits get cleared when quota(s)
- * are in the process of getting turned off. These flags are in m_qflags but
- * never in sb_qflags.
- */
-#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */
-#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */
-#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */
-#define XFS_ALL_QUOTA_ACTIVE \
- (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
-
-/*
- * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
- * quota will be not be switched off as long as that inode lock is held.
- */
-#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \
- XFS_GQUOTA_ACTIVE | \
- XFS_PQUOTA_ACTIVE))
-#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
-#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
-#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE)
-
-/*
* Flags to tell various functions what to do. Not all of these are meaningful
* to a single function. None of these XFS_QMOPT_* flags are meant to have
* persistent values (ie. their values can and will change between versions)
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 860a0c9801ba..e5d767a7fc5d 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -91,7 +91,7 @@ xfs_refcount_lookup_eq(
/* Convert on-disk record to in-core format. */
void
xfs_refcount_btrec_to_irec(
- union xfs_btree_rec *rec,
+ const union xfs_btree_rec *rec,
struct xfs_refcount_irec *irec)
{
irec->rc_startblock = be32_to_cpu(rec->refc.rc_startblock);
@@ -1253,7 +1253,7 @@ xfs_refcount_increase_extent(
struct xfs_trans *tp,
struct xfs_bmbt_irec *PREV)
{
- if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb))
+ if (!xfs_has_reflink(tp->t_mountp))
return;
__xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock,
@@ -1268,7 +1268,7 @@ xfs_refcount_decrease_extent(
struct xfs_trans *tp,
struct xfs_bmbt_irec *PREV)
{
- if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb))
+ if (!xfs_has_reflink(tp->t_mountp))
return;
__xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock,
@@ -1617,7 +1617,7 @@ xfs_refcount_alloc_cow_extent(
{
struct xfs_mount *mp = tp->t_mountp;
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ if (!xfs_has_reflink(mp))
return;
__xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len);
@@ -1636,7 +1636,7 @@ xfs_refcount_free_cow_extent(
{
struct xfs_mount *mp = tp->t_mountp;
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ if (!xfs_has_reflink(mp))
return;
/* Remove rmap entry */
@@ -1654,7 +1654,7 @@ struct xfs_refcount_recovery {
STATIC int
xfs_refcount_recover_extent(
struct xfs_btree_cur *cur,
- union xfs_btree_rec *rec,
+ const union xfs_btree_rec *rec,
void *priv)
{
struct list_head *debris = priv;
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 9f6e9aae4da0..02cb3aa405be 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -78,7 +78,7 @@ static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res)
extern int xfs_refcount_has_record(struct xfs_btree_cur *cur,
xfs_agblock_t bno, xfs_extlen_t len, bool *exists);
union xfs_btree_rec;
-extern void xfs_refcount_btrec_to_irec(union xfs_btree_rec *rec,
+extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec,
struct xfs_refcount_irec *irec);
extern int xfs_refcount_insert(struct xfs_btree_cur *cur,
struct xfs_refcount_irec *irec, int *stat);
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 92d336c17e83..1ef9b99962ab 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -31,9 +31,9 @@ xfs_refcountbt_dup_cursor(
STATIC void
xfs_refcountbt_set_root(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *ptr,
- int inc)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int inc)
{
struct xfs_buf *agbp = cur->bc_ag.agbp;
struct xfs_agf *agf = agbp->b_addr;
@@ -51,10 +51,10 @@ xfs_refcountbt_set_root(
STATIC int
xfs_refcountbt_alloc_block(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *start,
- union xfs_btree_ptr *new,
- int *stat)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
{
struct xfs_buf *agbp = cur->bc_ag.agbp;
struct xfs_agf *agf = agbp->b_addr;
@@ -102,7 +102,7 @@ xfs_refcountbt_free_block(
struct xfs_mount *mp = cur->bc_mp;
struct xfs_buf *agbp = cur->bc_ag.agbp;
struct xfs_agf *agf = agbp->b_addr;
- xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
+ xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
int error;
trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.pag->pag_agno,
@@ -135,18 +135,18 @@ xfs_refcountbt_get_maxrecs(
STATIC void
xfs_refcountbt_init_key_from_rec(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
{
key->refc.rc_startblock = rec->refc.rc_startblock;
}
STATIC void
xfs_refcountbt_init_high_key_from_rec(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
{
- __u32 x;
+ __u32 x;
x = be32_to_cpu(rec->refc.rc_startblock);
x += be32_to_cpu(rec->refc.rc_blockcount) - 1;
@@ -177,20 +177,20 @@ xfs_refcountbt_init_ptr_from_cur(
STATIC int64_t
xfs_refcountbt_key_diff(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *key)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
{
struct xfs_refcount_irec *rec = &cur->bc_rec.rc;
- struct xfs_refcount_key *kp = &key->refc;
+ const struct xfs_refcount_key *kp = &key->refc;
return (int64_t)be32_to_cpu(kp->rc_startblock) - rec->rc_startblock;
}
STATIC int64_t
xfs_refcountbt_diff_two_keys(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *k1,
- union xfs_btree_key *k2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
{
return (int64_t)be32_to_cpu(k1->refc.rc_startblock) -
be32_to_cpu(k2->refc.rc_startblock);
@@ -209,7 +209,7 @@ xfs_refcountbt_verify(
if (!xfs_verify_magic(bp, block->bb_magic))
return __this_address;
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ if (!xfs_has_reflink(mp))
return __this_address;
fa = xfs_btree_sblock_v5hdr_verify(bp);
if (fa)
@@ -269,9 +269,9 @@ const struct xfs_buf_ops xfs_refcountbt_buf_ops = {
STATIC int
xfs_refcountbt_keys_inorder(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *k1,
- union xfs_btree_key *k2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
{
return be32_to_cpu(k1->refc.rc_startblock) <
be32_to_cpu(k2->refc.rc_startblock);
@@ -279,9 +279,9 @@ xfs_refcountbt_keys_inorder(
STATIC int
xfs_refcountbt_recs_inorder(
- struct xfs_btree_cur *cur,
- union xfs_btree_rec *r1,
- union xfs_btree_rec *r2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
{
return be32_to_cpu(r1->refc.rc_startblock) +
be32_to_cpu(r1->refc.rc_blockcount) <=
@@ -462,7 +462,7 @@ xfs_refcountbt_calc_reserves(
xfs_extlen_t tree_len;
int error;
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ if (!xfs_has_reflink(mp))
return 0;
error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp);
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index d1dfad0204e3..f45929b1b94a 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -179,8 +179,8 @@ done:
/* Convert an internal btree record to an rmap record. */
int
xfs_rmap_btrec_to_irec(
- union xfs_btree_rec *rec,
- struct xfs_rmap_irec *irec)
+ const union xfs_btree_rec *rec,
+ struct xfs_rmap_irec *irec)
{
irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock);
irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount);
@@ -255,9 +255,9 @@ struct xfs_find_left_neighbor_info {
/* For each rmap given, figure out if it matches the key we want. */
STATIC int
xfs_rmap_find_left_neighbor_helper(
- struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *rec,
- void *priv)
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
{
struct xfs_find_left_neighbor_info *info = priv;
@@ -331,9 +331,9 @@ xfs_rmap_find_left_neighbor(
/* For each rmap given, figure out if it matches the key we want. */
STATIC int
xfs_rmap_lookup_le_range_helper(
- struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *rec,
- void *priv)
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
{
struct xfs_find_left_neighbor_info *info = priv;
@@ -705,7 +705,7 @@ xfs_rmap_free(
struct xfs_btree_cur *cur;
int error;
- if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (!xfs_has_rmapbt(mp))
return 0;
cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
@@ -959,7 +959,7 @@ xfs_rmap_alloc(
struct xfs_btree_cur *cur;
int error;
- if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (!xfs_has_rmapbt(mp))
return 0;
cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
@@ -2278,9 +2278,9 @@ struct xfs_rmap_query_range_info {
/* Format btree record and pass to our callback. */
STATIC int
xfs_rmap_query_range_helper(
- struct xfs_btree_cur *cur,
- union xfs_btree_rec *rec,
- void *priv)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec,
+ void *priv)
{
struct xfs_rmap_query_range_info *query = priv;
struct xfs_rmap_irec irec;
@@ -2296,8 +2296,8 @@ xfs_rmap_query_range_helper(
int
xfs_rmap_query_range(
struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *low_rec,
- struct xfs_rmap_irec *high_rec,
+ const struct xfs_rmap_irec *low_rec,
+ const struct xfs_rmap_irec *high_rec,
xfs_rmap_query_range_fn fn,
void *priv)
{
@@ -2459,7 +2459,7 @@ xfs_rmap_update_is_needed(
struct xfs_mount *mp,
int whichfork)
{
- return xfs_sb_version_hasrmapbt(&mp->m_sb) && whichfork != XFS_COW_FORK;
+ return xfs_has_rmapbt(mp) && whichfork != XFS_COW_FORK;
}
/*
@@ -2707,7 +2707,7 @@ struct xfs_rmap_key_state {
STATIC int
xfs_rmap_has_other_keys_helper(
struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *rec,
+ const struct xfs_rmap_irec *rec,
void *priv)
{
struct xfs_rmap_key_state *rks = priv;
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index f2423cf7f1e2..fd67904ed446 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -134,12 +134,13 @@ int xfs_rmap_get_rec(struct xfs_btree_cur *cur, struct xfs_rmap_irec *irec,
int *stat);
typedef int (*xfs_rmap_query_range_fn)(
- struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *rec,
- void *priv);
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv);
int xfs_rmap_query_range(struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *low_rec, struct xfs_rmap_irec *high_rec,
+ const struct xfs_rmap_irec *low_rec,
+ const struct xfs_rmap_irec *high_rec,
xfs_rmap_query_range_fn fn, void *priv);
int xfs_rmap_query_all(struct xfs_btree_cur *cur, xfs_rmap_query_range_fn fn,
void *priv);
@@ -192,7 +193,7 @@ int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno,
int xfs_rmap_compare(const struct xfs_rmap_irec *a,
const struct xfs_rmap_irec *b);
union xfs_btree_rec;
-int xfs_rmap_btrec_to_irec(union xfs_btree_rec *rec,
+int xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec,
struct xfs_rmap_irec *irec);
int xfs_rmap_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno,
xfs_extlen_t len, bool *exists);
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index f29bc71b9950..b7dbbfb3aeed 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -57,9 +57,9 @@ xfs_rmapbt_dup_cursor(
STATIC void
xfs_rmapbt_set_root(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *ptr,
- int inc)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int inc)
{
struct xfs_buf *agbp = cur->bc_ag.agbp;
struct xfs_agf *agf = agbp->b_addr;
@@ -76,10 +76,10 @@ xfs_rmapbt_set_root(
STATIC int
xfs_rmapbt_alloc_block(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *start,
- union xfs_btree_ptr *new,
- int *stat)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
{
struct xfs_buf *agbp = cur->bc_ag.agbp;
struct xfs_agf *agf = agbp->b_addr;
@@ -122,7 +122,7 @@ xfs_rmapbt_free_block(
xfs_agblock_t bno;
int error;
- bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
+ bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
trace_xfs_rmapbt_free_block(cur->bc_mp, pag->pag_agno,
bno, 1);
be32_add_cpu(&agf->agf_rmap_blocks, -1);
@@ -156,8 +156,8 @@ xfs_rmapbt_get_maxrecs(
STATIC void
xfs_rmapbt_init_key_from_rec(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
{
key->rmap.rm_startblock = rec->rmap.rm_startblock;
key->rmap.rm_owner = rec->rmap.rm_owner;
@@ -173,11 +173,11 @@ xfs_rmapbt_init_key_from_rec(
*/
STATIC void
xfs_rmapbt_init_high_key_from_rec(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
{
- uint64_t off;
- int adj;
+ uint64_t off;
+ int adj;
adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;
@@ -219,13 +219,13 @@ xfs_rmapbt_init_ptr_from_cur(
STATIC int64_t
xfs_rmapbt_key_diff(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *key)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
{
- struct xfs_rmap_irec *rec = &cur->bc_rec.r;
- struct xfs_rmap_key *kp = &key->rmap;
- __u64 x, y;
- int64_t d;
+ struct xfs_rmap_irec *rec = &cur->bc_rec.r;
+ const struct xfs_rmap_key *kp = &key->rmap;
+ __u64 x, y;
+ int64_t d;
d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
if (d)
@@ -249,14 +249,14 @@ xfs_rmapbt_key_diff(
STATIC int64_t
xfs_rmapbt_diff_two_keys(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *k1,
- union xfs_btree_key *k2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
{
- struct xfs_rmap_key *kp1 = &k1->rmap;
- struct xfs_rmap_key *kp2 = &k2->rmap;
- int64_t d;
- __u64 x, y;
+ const struct xfs_rmap_key *kp1 = &k1->rmap;
+ const struct xfs_rmap_key *kp2 = &k2->rmap;
+ int64_t d;
+ __u64 x, y;
d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
be32_to_cpu(kp2->rm_startblock);
@@ -304,7 +304,7 @@ xfs_rmapbt_verify(
if (!xfs_verify_magic(bp, block->bb_magic))
return __this_address;
- if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (!xfs_has_rmapbt(mp))
return __this_address;
fa = xfs_btree_sblock_v5hdr_verify(bp);
if (fa)
@@ -364,9 +364,9 @@ const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
STATIC int
xfs_rmapbt_keys_inorder(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *k1,
- union xfs_btree_key *k2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
{
uint32_t x;
uint32_t y;
@@ -394,9 +394,9 @@ xfs_rmapbt_keys_inorder(
STATIC int
xfs_rmapbt_recs_inorder(
- struct xfs_btree_cur *cur,
- union xfs_btree_rec *r1,
- union xfs_btree_rec *r2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
{
uint32_t x;
uint32_t y;
@@ -558,7 +558,7 @@ xfs_rmapbt_compute_maxlevels(
* disallow reflinking when less than 10% of the per-AG metadata
* block reservation since the fallback is a regular file copy.
*/
- if (xfs_sb_version_hasreflink(&mp->m_sb))
+ if (xfs_has_reflink(mp))
mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS;
else
mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(
@@ -606,7 +606,7 @@ xfs_rmapbt_calc_reserves(
xfs_extlen_t tree_len;
int error;
- if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (!xfs_has_rmapbt(mp))
return 0;
error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp);
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
index 88d8d18788a2..f2eee6572af4 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rmap_btree.h
@@ -59,4 +59,4 @@ extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp,
extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp,
struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used);
-#endif /* __XFS_RMAP_BTREE_H__ */
+#endif /* __XFS_RMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 483375c6a735..5740ba664867 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1009,8 +1009,8 @@ xfs_rtfree_extent(
int
xfs_rtalloc_query_range(
struct xfs_trans *tp,
- struct xfs_rtalloc_rec *low_rec,
- struct xfs_rtalloc_rec *high_rec,
+ const struct xfs_rtalloc_rec *low_rec,
+ const struct xfs_rtalloc_rec *high_rec,
xfs_rtalloc_query_range_fn fn,
void *priv)
{
@@ -1018,6 +1018,7 @@ xfs_rtalloc_query_range(
struct xfs_mount *mp = tp->t_mountp;
xfs_rtblock_t rtstart;
xfs_rtblock_t rtend;
+ xfs_rtblock_t high_key;
int is_free;
int error = 0;
@@ -1026,12 +1027,12 @@ xfs_rtalloc_query_range(
if (low_rec->ar_startext >= mp->m_sb.sb_rextents ||
low_rec->ar_startext == high_rec->ar_startext)
return 0;
- high_rec->ar_startext = min(high_rec->ar_startext,
- mp->m_sb.sb_rextents - 1);
+
+ high_key = min(high_rec->ar_startext, mp->m_sb.sb_rextents - 1);
/* Iterate the bitmap, looking for discrepancies. */
rtstart = low_rec->ar_startext;
- while (rtstart <= high_rec->ar_startext) {
+ while (rtstart <= high_key) {
/* Is the first block free? */
error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend,
&is_free);
@@ -1039,8 +1040,7 @@ xfs_rtalloc_query_range(
break;
/* How long does the extent go for? */
- error = xfs_rtfind_forw(mp, tp, rtstart,
- high_rec->ar_startext, &rtend);
+ error = xfs_rtfind_forw(mp, tp, rtstart, high_key, &rtend);
if (error)
break;
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 04f5386446db..e58349be78bd 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -30,13 +30,110 @@
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
*/
+/*
+ * We support all XFS versions newer than a v4 superblock with V2 directories.
+ */
+bool
+xfs_sb_good_version(
+ struct xfs_sb *sbp)
+{
+ /* all v5 filesystems are supported */
+ if (xfs_sb_is_v5(sbp))
+ return true;
+
+ /* versions prior to v4 are not supported */
+ if (XFS_SB_VERSION_NUM(sbp) < XFS_SB_VERSION_4)
+ return false;
+
+ /* V4 filesystems need v2 directories and unwritten extents */
+ if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
+ return false;
+ if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT))
+ return false;
+
+ /* And must not have any unknown v4 feature bits set */
+ if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
+ ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
+ (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
+ return false;
+
+ /* It's a supported v4 filesystem */
+ return true;
+}
+
+uint64_t
+xfs_sb_version_to_features(
+ struct xfs_sb *sbp)
+{
+ uint64_t features = 0;
+
+ /* optional V4 features */
+ if (sbp->sb_rblocks > 0)
+ features |= XFS_FEAT_REALTIME;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT)
+ features |= XFS_FEAT_ATTR;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT)
+ features |= XFS_FEAT_QUOTA;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT)
+ features |= XFS_FEAT_ALIGN;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT)
+ features |= XFS_FEAT_LOGV2;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT)
+ features |= XFS_FEAT_DALIGN;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT)
+ features |= XFS_FEAT_EXTFLG;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT)
+ features |= XFS_FEAT_SECTOR;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT)
+ features |= XFS_FEAT_ASCIICI;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) {
+ if (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)
+ features |= XFS_FEAT_LAZYSBCOUNT;
+ if (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT)
+ features |= XFS_FEAT_ATTR2;
+ if (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT)
+ features |= XFS_FEAT_PROJID32;
+ if (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)
+ features |= XFS_FEAT_FTYPE;
+ }
+
+ if (!xfs_sb_is_v5(sbp))
+ return features;
+
+ /* Always on V5 features */
+ features |= XFS_FEAT_ALIGN | XFS_FEAT_LOGV2 | XFS_FEAT_EXTFLG |
+ XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_ATTR2 | XFS_FEAT_PROJID32 |
+ XFS_FEAT_V3INODES | XFS_FEAT_CRC | XFS_FEAT_PQUOTINO;
+
+ /* Optional V5 features */
+ if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT)
+ features |= XFS_FEAT_FINOBT;
+ if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT)
+ features |= XFS_FEAT_RMAPBT;
+ if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK)
+ features |= XFS_FEAT_REFLINK;
+ if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT)
+ features |= XFS_FEAT_INOBTCNT;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE)
+ features |= XFS_FEAT_FTYPE;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES)
+ features |= XFS_FEAT_SPINODES;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID)
+ features |= XFS_FEAT_META_UUID;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_BIGTIME)
+ features |= XFS_FEAT_BIGTIME;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR)
+ features |= XFS_FEAT_NEEDSREPAIR;
+ return features;
+}
+
/* Check all the superblock fields we care about when reading one in. */
STATIC int
xfs_validate_sb_read(
struct xfs_mount *mp,
struct xfs_sb *sbp)
{
- if (XFS_SB_VERSION_NUM(sbp) != XFS_SB_VERSION_5)
+ if (!xfs_sb_is_v5(sbp))
return 0;
/*
@@ -56,7 +153,7 @@ xfs_validate_sb_read(
"Superblock has unknown read-only compatible features (0x%x) enabled.",
(sbp->sb_features_ro_compat &
XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
- if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+ if (!xfs_is_readonly(mp)) {
xfs_warn(mp,
"Attempted to mount read-only compatible filesystem read-write.");
xfs_warn(mp,
@@ -95,7 +192,7 @@ xfs_validate_sb_write(
* secondary superblocks, so allow this usage to continue because
* we never read counters from such superblocks.
*/
- if (XFS_BUF_ADDR(bp) == XFS_SB_DADDR && !sbp->sb_inprogress &&
+ if (xfs_buf_daddr(bp) == XFS_SB_DADDR && !sbp->sb_inprogress &&
(sbp->sb_fdblocks > sbp->sb_dblocks ||
!xfs_verify_icount(mp, sbp->sb_icount) ||
sbp->sb_ifree > sbp->sb_icount)) {
@@ -103,7 +200,7 @@ xfs_validate_sb_write(
return -EFSCORRUPTED;
}
- if (XFS_SB_VERSION_NUM(sbp) != XFS_SB_VERSION_5)
+ if (!xfs_sb_is_v5(sbp))
return 0;
/*
@@ -162,6 +259,7 @@ xfs_validate_sb_common(
struct xfs_dsb *dsb = bp->b_addr;
uint32_t agcount = 0;
uint32_t rem;
+ bool has_dalign;
if (!xfs_verify_magic(bp, dsb->sb_magicnum)) {
xfs_warn(mp, "bad magic number");
@@ -173,12 +271,41 @@ xfs_validate_sb_common(
return -EWRONGFS;
}
- if (xfs_sb_version_has_pquotino(sbp)) {
+ /*
+ * Validate feature flags and state
+ */
+ if (xfs_sb_is_v5(sbp)) {
+ if (sbp->sb_blocksize < XFS_MIN_CRC_BLOCKSIZE) {
+ xfs_notice(mp,
+"Block size (%u bytes) too small for Version 5 superblock (minimum %d bytes)",
+ sbp->sb_blocksize, XFS_MIN_CRC_BLOCKSIZE);
+ return -EFSCORRUPTED;
+ }
+
+ /* V5 has a separate project quota inode */
if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) {
xfs_notice(mp,
"Version 5 of Super block has XFS_OQUOTA bits.");
return -EFSCORRUPTED;
}
+
+ /*
+ * Full inode chunks must be aligned to inode chunk size when
+ * sparse inodes are enabled to support the sparse chunk
+ * allocation algorithm and prevent overlapping inode records.
+ */
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) {
+ uint32_t align;
+
+ align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize
+ >> sbp->sb_blocklog;
+ if (sbp->sb_inoalignmt != align) {
+ xfs_warn(mp,
+"Inode block alignment (%u) must match chunk size (%u) for sparse inodes.",
+ sbp->sb_inoalignmt, align);
+ return -EINVAL;
+ }
+ }
} else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
xfs_notice(mp,
@@ -186,24 +313,6 @@ xfs_validate_sb_common(
return -EFSCORRUPTED;
}
- /*
- * Full inode chunks must be aligned to inode chunk size when
- * sparse inodes are enabled to support the sparse chunk
- * allocation algorithm and prevent overlapping inode records.
- */
- if (xfs_sb_version_hassparseinodes(sbp)) {
- uint32_t align;
-
- align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize
- >> sbp->sb_blocklog;
- if (sbp->sb_inoalignmt != align) {
- xfs_warn(mp,
-"Inode block alignment (%u) must match chunk size (%u) for sparse inodes.",
- sbp->sb_inoalignmt, align);
- return -EINVAL;
- }
- }
-
if (unlikely(
sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
xfs_warn(mp,
@@ -303,7 +412,8 @@ xfs_validate_sb_common(
* Either (sb_unit and !hasdalign) or (!sb_unit and hasdalign)
* would imply the image is corrupted.
*/
- if (!!sbp->sb_unit ^ xfs_sb_version_hasdalign(sbp)) {
+ has_dalign = sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT;
+ if (!!sbp->sb_unit ^ has_dalign) {
xfs_notice(mp, "SB stripe alignment sanity check failed");
return -EFSCORRUPTED;
}
@@ -312,12 +422,6 @@ xfs_validate_sb_common(
XFS_FSB_TO_B(mp, sbp->sb_width), 0, false))
return -EFSCORRUPTED;
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
- sbp->sb_blocksize < XFS_MIN_CRC_BLOCKSIZE) {
- xfs_notice(mp, "v5 SB sanity check failed");
- return -EFSCORRUPTED;
- }
-
/*
* Currently only very few inode sizes are supported.
*/
@@ -361,7 +465,7 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp)
* We need to do these manipulations only if we are working
* with an older version of on-disk superblock.
*/
- if (xfs_sb_version_has_pquotino(sbp))
+ if (xfs_sb_is_v5(sbp))
return;
if (sbp->sb_qflags & XFS_OQUOTA_ENFD)
@@ -454,7 +558,8 @@ __xfs_sb_from_disk(
* sb_meta_uuid is only on disk if it differs from sb_uuid and the
* feature flag is set; if not set we keep it only in memory.
*/
- if (xfs_sb_version_hasmetauuid(to))
+ if (xfs_sb_is_v5(to) &&
+ (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID))
uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid);
else
uuid_copy(&to->sb_meta_uuid, &from->sb_uuid);
@@ -479,7 +584,12 @@ xfs_sb_quota_to_disk(
uint16_t qflags = from->sb_qflags;
to->sb_uquotino = cpu_to_be64(from->sb_uquotino);
- if (xfs_sb_version_has_pquotino(from)) {
+
+ /*
+ * The in-memory superblock quota state matches the v5 on-disk format so
+ * just write them out and return
+ */
+ if (xfs_sb_is_v5(from)) {
to->sb_qflags = cpu_to_be16(from->sb_qflags);
to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
to->sb_pquotino = cpu_to_be64(from->sb_pquotino);
@@ -487,9 +597,9 @@ xfs_sb_quota_to_disk(
}
/*
- * The in-core version of sb_qflags do not have XFS_OQUOTA_*
- * flags, whereas the on-disk version does. So, convert incore
- * XFS_{PG}QUOTA_* flags to on-disk XFS_OQUOTA_* flags.
+ * For older superblocks (v4), the in-core version of sb_qflags does not
+ * have XFS_OQUOTA_* flags, whereas the on-disk version does. So,
+ * convert incore XFS_{PG}QUOTA_* flags to on-disk XFS_OQUOTA_* flags.
*/
qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
@@ -589,19 +699,20 @@ xfs_sb_to_disk(
to->sb_features2 = cpu_to_be32(from->sb_features2);
to->sb_bad_features2 = cpu_to_be32(from->sb_bad_features2);
- if (xfs_sb_version_hascrc(from)) {
- to->sb_features_compat = cpu_to_be32(from->sb_features_compat);
- to->sb_features_ro_compat =
- cpu_to_be32(from->sb_features_ro_compat);
- to->sb_features_incompat =
- cpu_to_be32(from->sb_features_incompat);
- to->sb_features_log_incompat =
- cpu_to_be32(from->sb_features_log_incompat);
- to->sb_spino_align = cpu_to_be32(from->sb_spino_align);
- to->sb_lsn = cpu_to_be64(from->sb_lsn);
- if (xfs_sb_version_hasmetauuid(from))
- uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid);
- }
+ if (!xfs_sb_is_v5(from))
+ return;
+
+ to->sb_features_compat = cpu_to_be32(from->sb_features_compat);
+ to->sb_features_ro_compat =
+ cpu_to_be32(from->sb_features_ro_compat);
+ to->sb_features_incompat =
+ cpu_to_be32(from->sb_features_incompat);
+ to->sb_features_log_incompat =
+ cpu_to_be32(from->sb_features_log_incompat);
+ to->sb_spino_align = cpu_to_be32(from->sb_spino_align);
+ to->sb_lsn = cpu_to_be64(from->sb_lsn);
+ if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID)
+ uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid);
}
/*
@@ -636,8 +747,8 @@ xfs_sb_read_verify(
if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) {
/* Only fail bad secondaries on a known V5 filesystem */
- if (bp->b_bn == XFS_SB_DADDR ||
- xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_buf_daddr(bp) == XFS_SB_DADDR ||
+ xfs_has_crc(mp)) {
error = -EFSBADCRC;
goto out_error;
}
@@ -704,7 +815,7 @@ xfs_sb_write_verify(
if (error)
goto out_error;
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_sb_is_v5(&sb))
return;
if (bip)
@@ -801,7 +912,7 @@ xfs_log_sb(
* unclean shutdown, this will be corrected by log recovery rebuilding
* the counters from the AGF block counts.
*/
- if (xfs_sb_version_haslazysbcount(&mp->m_sb)) {
+ if (xfs_has_lazysbcount(mp)) {
mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
@@ -950,10 +1061,12 @@ out:
void
xfs_fs_geometry(
- struct xfs_sb *sbp,
+ struct xfs_mount *mp,
struct xfs_fsop_geom *geo,
int struct_version)
{
+ struct xfs_sb *sbp = &mp->m_sb;
+
memset(geo, 0, sizeof(struct xfs_fsop_geom));
geo->blocksize = sbp->sb_blocksize;
@@ -984,51 +1097,51 @@ xfs_fs_geometry(
geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK |
XFS_FSOP_GEOM_FLAGS_DIRV2 |
XFS_FSOP_GEOM_FLAGS_EXTFLG;
- if (xfs_sb_version_hasattr(sbp))
+ if (xfs_has_attr(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR;
- if (xfs_sb_version_hasquota(sbp))
+ if (xfs_has_quota(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_QUOTA;
- if (xfs_sb_version_hasalign(sbp))
+ if (xfs_has_align(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_IALIGN;
- if (xfs_sb_version_hasdalign(sbp))
+ if (xfs_has_dalign(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_DALIGN;
- if (xfs_sb_version_hassector(sbp))
- geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR;
- if (xfs_sb_version_hasasciici(sbp))
+ if (xfs_has_asciici(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_DIRV2CI;
- if (xfs_sb_version_haslazysbcount(sbp))
+ if (xfs_has_lazysbcount(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_LAZYSB;
- if (xfs_sb_version_hasattr2(sbp))
+ if (xfs_has_attr2(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR2;
- if (xfs_sb_version_hasprojid32bit(sbp))
+ if (xfs_has_projid32(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_PROJID32;
- if (xfs_sb_version_hascrc(sbp))
+ if (xfs_has_crc(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_V5SB;
- if (xfs_sb_version_hasftype(sbp))
+ if (xfs_has_ftype(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_FTYPE;
- if (xfs_sb_version_hasfinobt(sbp))
+ if (xfs_has_finobt(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_FINOBT;
- if (xfs_sb_version_hassparseinodes(sbp))
+ if (xfs_has_sparseinodes(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_SPINODES;
- if (xfs_sb_version_hasrmapbt(sbp))
+ if (xfs_has_rmapbt(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_RMAPBT;
- if (xfs_sb_version_hasreflink(sbp))
+ if (xfs_has_reflink(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_REFLINK;
- if (xfs_sb_version_hasbigtime(sbp))
+ if (xfs_has_bigtime(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME;
- if (xfs_sb_version_hasinobtcounts(sbp))
+ if (xfs_has_inobtcounts(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_INOBTCNT;
- if (xfs_sb_version_hassector(sbp))
+ if (xfs_has_sector(mp)) {
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR;
geo->logsectsize = sbp->sb_logsectsize;
- else
+ } else {
geo->logsectsize = BBSIZE;
+ }
geo->rtsectsize = sbp->sb_blocksize;
geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);
if (struct_version < 4)
return;
- if (xfs_sb_version_haslogv2(sbp))
+ if (xfs_has_logv2(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_LOGV2;
geo->logsunit = sbp->sb_logsunit;
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 0c1602d9b53d..a5e14740ec9a 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -20,11 +20,13 @@ extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp);
extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from);
extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from);
extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp);
+extern bool xfs_sb_good_version(struct xfs_sb *sbp);
+extern uint64_t xfs_sb_version_to_features(struct xfs_sb *sbp);
extern int xfs_update_secondary_sbs(struct xfs_mount *mp);
#define XFS_FS_GEOM_MAX_STRUCT_VER (4)
-extern void xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo,
+extern void xfs_fs_geometry(struct xfs_mount *mp, struct xfs_fsop_geom *geo,
int struct_version);
extern int xfs_sb_read_secondary(struct xfs_mount *mp,
struct xfs_trans *tp, xfs_agnumber_t agno,
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 594bc447a7dd..f0b38f4aba80 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -42,7 +42,7 @@ xfs_symlink_hdr_set(
{
struct xfs_dsymlink_hdr *dsl = bp->b_addr;
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return 0;
memset(dsl, 0, sizeof(struct xfs_dsymlink_hdr));
@@ -51,7 +51,7 @@ xfs_symlink_hdr_set(
dsl->sl_bytes = cpu_to_be32(size);
uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid);
dsl->sl_owner = cpu_to_be64(ino);
- dsl->sl_blkno = cpu_to_be64(bp->b_bn);
+ dsl->sl_blkno = cpu_to_be64(xfs_buf_daddr(bp));
bp->b_ops = &xfs_symlink_buf_ops;
return sizeof(struct xfs_dsymlink_hdr);
@@ -89,13 +89,13 @@ xfs_symlink_verify(
struct xfs_mount *mp = bp->b_mount;
struct xfs_dsymlink_hdr *dsl = bp->b_addr;
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return __this_address;
if (!xfs_verify_magic(bp, dsl->sl_magic))
return __this_address;
if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
- if (bp->b_bn != be64_to_cpu(dsl->sl_blkno))
+ if (xfs_buf_daddr(bp) != be64_to_cpu(dsl->sl_blkno))
return __this_address;
if (be32_to_cpu(dsl->sl_offset) +
be32_to_cpu(dsl->sl_bytes) >= XFS_SYMLINK_MAXLEN)
@@ -116,7 +116,7 @@ xfs_symlink_read_verify(
xfs_failaddr_t fa;
/* no verification of non-crc buffers */
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF))
@@ -137,7 +137,7 @@ xfs_symlink_write_verify(
xfs_failaddr_t fa;
/* no verification of non-crc buffers */
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
fa = xfs_symlink_verify(bp);
@@ -173,7 +173,7 @@ xfs_symlink_local_to_remote(
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF);
- if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!xfs_has_crc(mp)) {
bp->b_ops = NULL;
memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index 16f723ebe8dd..8b5547073379 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -136,7 +136,7 @@ xfs_trans_log_inode(
* to upgrade this inode to bigtime format, do so now.
*/
if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) &&
- xfs_sb_version_hasbigtime(&ip->i_mount->m_sb) &&
+ xfs_has_bigtime(ip->i_mount) &&
!xfs_inode_has_bigtime(ip)) {
ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME;
flags |= XFS_ILOG_CORE;
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index d1a0848cb52e..5e300daa2559 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -71,9 +71,9 @@ xfs_allocfree_log_count(
uint blocks;
blocks = num_ops * 2 * (2 * mp->m_ag_maxlevels - 1);
- if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (xfs_has_rmapbt(mp))
blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1);
- if (xfs_sb_version_hasreflink(&mp->m_sb))
+ if (xfs_has_reflink(mp))
blocks += num_ops * (2 * mp->m_refc_maxlevels - 1);
return blocks;
@@ -155,7 +155,7 @@ STATIC uint
xfs_calc_finobt_res(
struct xfs_mount *mp)
{
- if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+ if (!xfs_has_finobt(mp))
return 0;
return xfs_calc_inobt_res(mp);
@@ -187,7 +187,7 @@ xfs_calc_inode_chunk_res(
XFS_FSB_TO_B(mp, 1));
if (alloc) {
/* icreate tx uses ordered buffers */
- if (xfs_sb_version_has_v3inode(&mp->m_sb))
+ if (xfs_has_v3inodes(mp))
return res;
size = XFS_FSB_TO_B(mp, 1);
}
@@ -268,7 +268,7 @@ xfs_calc_write_reservation(
xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz);
- if (xfs_sb_version_hasrealtime(&mp->m_sb)) {
+ if (xfs_has_realtime(mp)) {
t2 = xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
blksz) +
@@ -317,7 +317,7 @@ xfs_calc_itruncate_reservation(
t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), blksz);
- if (xfs_sb_version_hasrealtime(&mp->m_sb)) {
+ if (xfs_has_realtime(mp)) {
t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 2), blksz) +
xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz);
@@ -799,29 +799,6 @@ xfs_calc_qm_dqalloc_reservation(
}
/*
- * Turning off quotas.
- * the quota off logitems: sizeof(struct xfs_qoff_logitem) * 2
- * the superblock for the quota flags: sector size
- */
-STATIC uint
-xfs_calc_qm_quotaoff_reservation(
- struct xfs_mount *mp)
-{
- return sizeof(struct xfs_qoff_logitem) * 2 +
- xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
-}
-
-/*
- * End of turning off quotas.
- * the quota off logitems: sizeof(struct xfs_qoff_logitem) * 2
- */
-STATIC uint
-xfs_calc_qm_quotaoff_end_reservation(void)
-{
- return sizeof(struct xfs_qoff_logitem) * 2;
-}
-
-/*
* Syncing the incore super block changes to disk.
* the super block to reflect the changes: sector size
*/
@@ -842,14 +819,14 @@ xfs_trans_resv_calc(
* require a permanent reservation on space.
*/
resp->tr_write.tr_logres = xfs_calc_write_reservation(mp);
- if (xfs_sb_version_hasreflink(&mp->m_sb))
+ if (xfs_has_reflink(mp))
resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
else
resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp);
- if (xfs_sb_version_hasreflink(&mp->m_sb))
+ if (xfs_has_reflink(mp))
resp->tr_itruncate.tr_logcount =
XFS_ITRUNCATE_LOG_COUNT_REFLINK;
else
@@ -910,7 +887,7 @@ xfs_trans_resv_calc(
resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp);
- if (xfs_sb_version_hasreflink(&mp->m_sb))
+ if (xfs_has_reflink(mp))
resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
else
resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
@@ -923,13 +900,6 @@ xfs_trans_resv_calc(
resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation();
resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT;
- resp->tr_qm_quotaoff.tr_logres = xfs_calc_qm_quotaoff_reservation(mp);
- resp->tr_qm_quotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
-
- resp->tr_qm_equotaoff.tr_logres =
- xfs_calc_qm_quotaoff_end_reservation();
- resp->tr_qm_equotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
-
resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp);
resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT;
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 7241ab28cf84..fc4e9b369a3a 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -46,8 +46,6 @@ struct xfs_trans_resv {
struct xfs_trans_res tr_growrtfree; /* grow realtime freeing */
struct xfs_trans_res tr_qm_setqlim; /* adjust quota limits */
struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */
- struct xfs_trans_res tr_qm_quotaoff; /* turn quota off */
- struct xfs_trans_res tr_qm_equotaoff;/* end of turn quota off */
struct xfs_trans_res tr_sb; /* modify superblock */
struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */
};
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index 7ad3659c5d2a..50332be34388 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -57,8 +57,7 @@
XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
#define XFS_IALLOC_SPACE_RES(mp) \
(M_IGEO(mp)->ialloc_blks + \
- ((xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1) * \
- M_IGEO(mp)->inobt_maxlevels))
+ ((xfs_has_finobt(mp) ? 2 : 1) * M_IGEO(mp)->inobt_maxlevels))
/*
* Space reservation values for various transactions.
@@ -94,8 +93,7 @@
#define XFS_SYMLINK_SPACE_RES(mp,nl,b) \
(XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b))
#define XFS_IFREE_SPACE_RES(mp) \
- (xfs_sb_version_hasfinobt(&mp->m_sb) ? \
- M_IGEO(mp)->inobt_maxlevels : 0)
+ (xfs_has_finobt(mp) ? M_IGEO(mp)->inobt_maxlevels : 0)
#endif /* __XFS_TRANS_SPACE_H__ */
diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
index e8f4abee7892..e810d23f2d97 100644
--- a/fs/xfs/libxfs/xfs_types.c
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -169,7 +169,7 @@ xfs_internal_inum(
xfs_ino_t ino)
{
return ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
- (xfs_sb_version_hasquota(&mp->m_sb) &&
+ (xfs_has_quota(mp) &&
xfs_is_quota_inode(&mp->m_sb, ino));
}
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 0870ef6f933d..b6da06b40989 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -87,6 +87,11 @@ typedef void * xfs_failaddr_t;
#define XFS_ATTR_FORK 1
#define XFS_COW_FORK 2
+#define XFS_WHICHFORK_STRINGS \
+ { XFS_DATA_FORK, "data" }, \
+ { XFS_ATTR_FORK, "attr" }, \
+ { XFS_COW_FORK, "cow" }
+
/*
* Min numbers of data/attr fork btree root pointers.
*/
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index be1a7e1e65f7..ae3c9f6e2c69 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -36,7 +36,7 @@ xchk_superblock_xref(
agbno = XFS_SB_BLOCK(mp);
- error = xchk_ag_init(sc, agno, &sc->sa);
+ error = xchk_ag_init_existing(sc, agno, &sc->sa);
if (!xchk_xref_process_error(sc, agno, agbno, &error))
return;
@@ -63,6 +63,7 @@ xchk_superblock(
struct xfs_mount *mp = sc->mp;
struct xfs_buf *bp;
struct xfs_dsb *sb;
+ struct xfs_perag *pag;
xfs_agnumber_t agno;
uint32_t v2_ok;
__be32 features_mask;
@@ -73,6 +74,15 @@ xchk_superblock(
if (agno == 0)
return 0;
+ /*
+ * Grab an active reference to the perag structure. If we can't get
+ * it, we're racing with something that's tearing down the AG, so
+ * signal that the AG no longer exists.
+ */
+ pag = xfs_perag_get(mp, agno);
+ if (!pag)
+ return -ENOENT;
+
error = xfs_sb_read_secondary(mp, sc->tp, agno, &bp);
/*
* The superblock verifier can return several different error codes
@@ -92,7 +102,7 @@ xchk_superblock(
break;
}
if (!xchk_process_error(sc, agno, XFS_SB_BLOCK(mp), &error))
- return error;
+ goto out_pag;
sb = bp->b_addr;
@@ -248,7 +258,7 @@ xchk_superblock(
xchk_block_set_corrupt(sc, bp);
} else {
v2_ok = XFS_SB_VERSION2_OKBITS;
- if (XFS_SB_VERSION_NUM(&mp->m_sb) >= XFS_SB_VERSION_5)
+ if (xfs_sb_is_v5(&mp->m_sb))
v2_ok |= XFS_SB_VERSION2_CRCBIT;
if (!!(sb->sb_features2 & cpu_to_be32(~v2_ok)))
@@ -273,7 +283,7 @@ xchk_superblock(
(cpu_to_be32(mp->m_sb.sb_features2) & features_mask))
xchk_block_set_corrupt(sc, bp);
- if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!xfs_has_crc(mp)) {
/* all v5 fields must be zero */
if (memchr_inv(&sb->sb_features_compat, 0,
sizeof(struct xfs_dsb) -
@@ -324,7 +334,7 @@ xchk_superblock(
/* Don't care about sb_lsn */
}
- if (xfs_sb_version_hasmetauuid(&mp->m_sb)) {
+ if (xfs_has_metauuid(mp)) {
/* The metadata UUID must be the same for all supers */
if (!uuid_equal(&sb->sb_meta_uuid, &mp->m_sb.sb_meta_uuid))
xchk_block_set_corrupt(sc, bp);
@@ -336,7 +346,8 @@ xchk_superblock(
xchk_block_set_corrupt(sc, bp);
xchk_superblock_xref(sc, bp);
-
+out_pag:
+ xfs_perag_put(pag);
return error;
}
@@ -346,7 +357,7 @@ xchk_superblock(
STATIC int
xchk_agf_record_bno_lengths(
struct xfs_btree_cur *cur,
- struct xfs_alloc_rec_incore *rec,
+ const struct xfs_alloc_rec_incore *rec,
void *priv)
{
xfs_extlen_t *blocks = priv;
@@ -419,7 +430,7 @@ xchk_agf_xref_btreeblks(
int error;
/* agf_btreeblks didn't exist before lazysbcount */
- if (!xfs_sb_version_haslazysbcount(&sc->mp->m_sb))
+ if (!xfs_has_lazysbcount(sc->mp))
return;
/* Check agf_rmap_blocks; set up for agf_btreeblks check */
@@ -438,7 +449,7 @@ xchk_agf_xref_btreeblks(
* No rmap cursor; we can't xref if we have the rmapbt feature.
* We also can't do it if we're missing the free space btree cursors.
*/
- if ((xfs_sb_version_hasrmapbt(&mp->m_sb) && !sc->sa.rmap_cur) ||
+ if ((xfs_has_rmapbt(mp) && !sc->sa.rmap_cur) ||
!sc->sa.bno_cur || !sc->sa.cnt_cur)
return;
@@ -527,6 +538,7 @@ xchk_agf(
xchk_buffer_recheck(sc, sc->sa.agf_bp);
agf = sc->sa.agf_bp->b_addr;
+ pag = sc->sa.pag;
/* Check the AG length */
eoag = be32_to_cpu(agf->agf_length);
@@ -550,7 +562,7 @@ xchk_agf(
if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
- if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+ if (xfs_has_rmapbt(mp)) {
agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]);
if (!xfs_verify_agbno(mp, agno, agbno))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
@@ -560,7 +572,7 @@ xchk_agf(
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
}
- if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ if (xfs_has_reflink(mp)) {
agbno = be32_to_cpu(agf->agf_refcount_root);
if (!xfs_verify_agbno(mp, agno, agbno))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
@@ -582,15 +594,13 @@ xchk_agf(
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
/* Do the incore counters match? */
- pag = xfs_perag_get(mp, agno);
if (pag->pagf_freeblks != be32_to_cpu(agf->agf_freeblks))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
if (pag->pagf_flcount != be32_to_cpu(agf->agf_flcount))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
- if (xfs_sb_version_haslazysbcount(&sc->mp->m_sb) &&
+ if (xfs_has_lazysbcount(sc->mp) &&
pag->pagf_btreeblks != be32_to_cpu(agf->agf_btreeblks))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
- xfs_perag_put(pag);
xchk_agf_xref(sc);
out:
@@ -630,7 +640,7 @@ xchk_agfl_block(
{
struct xchk_agfl_info *sai = priv;
struct xfs_scrub *sc = sai->sc;
- xfs_agnumber_t agno = sc->sa.agno;
+ xfs_agnumber_t agno = sc->sa.pag->pag_agno;
if (xfs_verify_agbno(mp, agno, agbno) &&
sai->nr_entries < sai->sz_entries)
@@ -787,7 +797,7 @@ xchk_agi_xref_fiblocks(
xfs_agblock_t blocks;
int error = 0;
- if (!xfs_sb_version_hasinobtcounts(&sc->mp->m_sb))
+ if (!xfs_has_inobtcounts(sc->mp))
return;
if (sc->sa.ino_cur) {
@@ -857,6 +867,7 @@ xchk_agi(
xchk_buffer_recheck(sc, sc->sa.agi_bp);
agi = sc->sa.agi_bp->b_addr;
+ pag = sc->sa.pag;
/* Check the AG length */
eoag = be32_to_cpu(agi->agi_length);
@@ -872,7 +883,7 @@ xchk_agi(
if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
- if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ if (xfs_has_finobt(mp)) {
agbno = be32_to_cpu(agi->agi_free_root);
if (!xfs_verify_agbno(mp, agno, agbno))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
@@ -909,12 +920,10 @@ xchk_agi(
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
/* Do the incore counters match? */
- pag = xfs_perag_get(mp, agno);
if (pag->pagi_count != be32_to_cpu(agi->agi_count))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
if (pag->pagi_freecount != be32_to_cpu(agi->agi_freecount))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
- xfs_perag_put(pag);
xchk_agi_xref(sc);
out:
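
The xchk_superblock change above pins the per-AG structure for the whole check and funnels every exit through out_pag so the reference is always dropped. As a minimal illustration of that shape, here is a standalone userspace sketch; struct ag, ag_get(), ag_put() and check_ag() are invented for the example and are not XFS APIs.

/*
 * Illustrative sketch only: a userspace analogue of "grab an active per-AG
 * reference first, release it on every exit path".  Nothing here is kernel
 * code; the names and fields are made up.
 */
#include <stdbool.h>
#include <stdio.h>
#include <errno.h>

struct ag {
	int	refcount;
	bool	alive;		/* cleared while the AG is being torn down */
	int	freeblks;
};

/* Take an active reference, or fail if teardown already started. */
static struct ag *ag_get(struct ag *ag)
{
	if (!ag->alive)
		return NULL;
	ag->refcount++;
	return ag;
}

static void ag_put(struct ag *ag)
{
	ag->refcount--;
}

/* Every exit path after a successful ag_get() goes through out_put. */
static int check_ag(struct ag *candidate)
{
	struct ag *ag;
	int error = 0;

	ag = ag_get(candidate);
	if (!ag)
		return -ENOENT;	/* racing with teardown; AG no longer exists */

	if (ag->freeblks < 0) {
		error = -EIO;	/* stand-in error for a failed check */
		goto out_put;
	}
	printf("ag ok, freeblks=%d\n", ag->freeblks);
out_put:
	ag_put(ag);
	return error;
}

int main(void)
{
	struct ag ag = { .alive = true, .freeblks = 128 };

	return check_ag(&ag) ? 1 : 0;
}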
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index e95f8c98f0f7..0f8deee66f15 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -70,7 +70,7 @@ struct xrep_agf_allocbt {
STATIC int
xrep_agf_walk_allocbt(
struct xfs_btree_cur *cur,
- struct xfs_alloc_rec_incore *rec,
+ const struct xfs_alloc_rec_incore *rec,
void *priv)
{
struct xrep_agf_allocbt *raa = priv;
@@ -94,7 +94,7 @@ xrep_agf_check_agfl_block(
{
struct xfs_scrub *sc = priv;
- if (!xfs_verify_agbno(mp, sc->sa.agno, agbno))
+ if (!xfs_verify_agbno(mp, sc->sa.pag->pag_agno, agbno))
return -EFSCORRUPTED;
return 0;
}
@@ -164,7 +164,7 @@ xrep_agf_find_btrees(
return -EFSCORRUPTED;
/* We must find the refcountbt root if that feature is enabled. */
- if (xfs_sb_version_hasreflink(&sc->mp->m_sb) &&
+ if (xfs_has_reflink(sc->mp) &&
!xrep_check_btree_root(sc, &fab[XREP_AGF_REFCOUNTBT]))
return -EFSCORRUPTED;
@@ -188,12 +188,13 @@ xrep_agf_init_header(
memset(agf, 0, BBTOB(agf_bp->b_length));
agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
- agf->agf_seqno = cpu_to_be32(sc->sa.agno);
- agf->agf_length = cpu_to_be32(xfs_ag_block_count(mp, sc->sa.agno));
+ agf->agf_seqno = cpu_to_be32(sc->sa.pag->pag_agno);
+ agf->agf_length = cpu_to_be32(xfs_ag_block_count(mp,
+ sc->sa.pag->pag_agno));
agf->agf_flfirst = old_agf->agf_flfirst;
agf->agf_fllast = old_agf->agf_fllast;
agf->agf_flcount = old_agf->agf_flcount;
- if (xfs_sb_version_hascrc(&mp->m_sb))
+ if (xfs_has_crc(mp))
uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
/* Mark the incore AGF data stale until we're done fixing things. */
@@ -223,7 +224,7 @@ xrep_agf_set_roots(
agf->agf_levels[XFS_BTNUM_RMAPi] =
cpu_to_be32(fab[XREP_AGF_RMAPBT].height);
- if (xfs_sb_version_hasreflink(&sc->mp->m_sb)) {
+ if (xfs_has_reflink(sc->mp)) {
agf->agf_refcount_root =
cpu_to_be32(fab[XREP_AGF_REFCOUNTBT].root);
agf->agf_refcount_level =
@@ -280,7 +281,7 @@ xrep_agf_calc_from_btrees(
agf->agf_btreeblks = cpu_to_be32(btreeblks);
/* Update the AGF counters from the refcountbt. */
- if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ if (xfs_has_reflink(mp)) {
cur = xfs_refcountbt_init_cursor(mp, sc->tp, agf_bp,
sc->sa.pag);
error = xfs_btree_count_blocks(cur, &blocks);
@@ -363,16 +364,16 @@ xrep_agf(
int error;
/* We require the rmapbt to rebuild anything. */
- if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (!xfs_has_rmapbt(mp))
return -EOPNOTSUPP;
- xchk_perag_get(sc->mp, &sc->sa);
/*
* Make sure we have the AGF buffer, as scrub might have decided it
* was corrupt after xfs_alloc_read_agf failed with -EFSCORRUPTED.
*/
error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, sc->sa.agno, XFS_AGF_DADDR(mp)),
+ XFS_AG_DADDR(mp, sc->sa.pag->pag_agno,
+ XFS_AGF_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &agf_bp, NULL);
if (error)
return error;
@@ -388,7 +389,7 @@ xrep_agf(
* btrees rooted in the AGF. If the AGFL contents are obviously bad
* then we'll bail out.
*/
- error = xfs_alloc_read_agfl(mp, sc->tp, sc->sa.agno, &agfl_bp);
+ error = xfs_alloc_read_agfl(mp, sc->tp, sc->sa.pag->pag_agno, &agfl_bp);
if (error)
return error;
@@ -442,7 +443,7 @@ struct xrep_agfl {
STATIC int
xrep_agfl_walk_rmap(
struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *rec,
+ const struct xfs_rmap_irec *rec,
void *priv)
{
struct xrep_agfl *ra = priv;
@@ -586,7 +587,7 @@ xrep_agfl_init_header(
agfl = XFS_BUF_TO_AGFL(agfl_bp);
memset(agfl, 0xFF, BBTOB(agfl_bp->b_length));
agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
- agfl->agfl_seqno = cpu_to_be32(sc->sa.agno);
+ agfl->agfl_seqno = cpu_to_be32(sc->sa.pag->pag_agno);
uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
/*
@@ -599,7 +600,8 @@ xrep_agfl_init_header(
for_each_xbitmap_extent(br, n, agfl_extents) {
agbno = XFS_FSB_TO_AGBNO(mp, br->start);
- trace_xrep_agfl_insert(mp, sc->sa.agno, agbno, br->len);
+ trace_xrep_agfl_insert(mp, sc->sa.pag->pag_agno, agbno,
+ br->len);
while (br->len > 0 && fl_off < flcount) {
agfl_bno[fl_off] = cpu_to_be32(agbno);
@@ -638,10 +640,9 @@ xrep_agfl(
int error;
/* We require the rmapbt to rebuild anything. */
- if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (!xfs_has_rmapbt(mp))
return -EOPNOTSUPP;
- xchk_perag_get(sc->mp, &sc->sa);
xbitmap_init(&agfl_extents);
/*
@@ -649,7 +650,8 @@ xrep_agfl(
* nothing wrong with the AGF, but all the AG header repair functions
* have this chicken-and-egg problem.
*/
- error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.agno, 0, &agf_bp);
+ error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.pag->pag_agno, 0,
+ &agf_bp);
if (error)
return error;
@@ -658,7 +660,8 @@ xrep_agfl(
* was corrupt after xfs_alloc_read_agfl failed with -EFSCORRUPTED.
*/
error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, sc->sa.agno, XFS_AGFL_DADDR(mp)),
+ XFS_AG_DADDR(mp, sc->sa.pag->pag_agno,
+ XFS_AGFL_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &agfl_bp, NULL);
if (error)
return error;
@@ -723,7 +726,8 @@ xrep_agi_find_btrees(
int error;
/* Read the AGF. */
- error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.agno, 0, &agf_bp);
+ error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.pag->pag_agno, 0,
+ &agf_bp);
if (error)
return error;
@@ -737,7 +741,7 @@ xrep_agi_find_btrees(
return -EFSCORRUPTED;
/* We must find the finobt root if that feature is enabled. */
- if (xfs_sb_version_hasfinobt(&mp->m_sb) &&
+ if (xfs_has_finobt(mp) &&
!xrep_check_btree_root(sc, &fab[XREP_AGI_FINOBT]))
return -EFSCORRUPTED;
@@ -761,11 +765,12 @@ xrep_agi_init_header(
memset(agi, 0, BBTOB(agi_bp->b_length));
agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
- agi->agi_seqno = cpu_to_be32(sc->sa.agno);
- agi->agi_length = cpu_to_be32(xfs_ag_block_count(mp, sc->sa.agno));
+ agi->agi_seqno = cpu_to_be32(sc->sa.pag->pag_agno);
+ agi->agi_length = cpu_to_be32(xfs_ag_block_count(mp,
+ sc->sa.pag->pag_agno));
agi->agi_newino = cpu_to_be32(NULLAGINO);
agi->agi_dirino = cpu_to_be32(NULLAGINO);
- if (xfs_sb_version_hascrc(&mp->m_sb))
+ if (xfs_has_crc(mp))
uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
/* We don't know how to fix the unlinked list yet. */
@@ -787,7 +792,7 @@ xrep_agi_set_roots(
agi->agi_root = cpu_to_be32(fab[XREP_AGI_INOBT].root);
agi->agi_level = cpu_to_be32(fab[XREP_AGI_INOBT].height);
- if (xfs_sb_version_hasfinobt(&sc->mp->m_sb)) {
+ if (xfs_has_finobt(sc->mp)) {
agi->agi_free_root = cpu_to_be32(fab[XREP_AGI_FINOBT].root);
agi->agi_free_level = cpu_to_be32(fab[XREP_AGI_FINOBT].height);
}
@@ -811,7 +816,7 @@ xrep_agi_calc_from_btrees(
error = xfs_ialloc_count_inodes(cur, &count, &freecount);
if (error)
goto err;
- if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) {
+ if (xfs_has_inobtcounts(mp)) {
xfs_agblock_t blocks;
error = xfs_btree_count_blocks(cur, &blocks);
@@ -824,8 +829,7 @@ xrep_agi_calc_from_btrees(
agi->agi_count = cpu_to_be32(count);
agi->agi_freecount = cpu_to_be32(freecount);
- if (xfs_sb_version_hasfinobt(&mp->m_sb) &&
- xfs_sb_version_hasinobtcounts(&mp->m_sb)) {
+ if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) {
xfs_agblock_t blocks;
cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp,
@@ -893,16 +897,16 @@ xrep_agi(
int error;
/* We require the rmapbt to rebuild anything. */
- if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (!xfs_has_rmapbt(mp))
return -EOPNOTSUPP;
- xchk_perag_get(sc->mp, &sc->sa);
/*
* Make sure we have the AGI buffer, as scrub might have decided it
* was corrupt after xfs_ialloc_read_agi failed with -EFSCORRUPTED.
*/
error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, sc->sa.agno, XFS_AGI_DADDR(mp)),
+ XFS_AG_DADDR(mp, sc->sa.pag->pag_agno,
+ XFS_AGI_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &agi_bp, NULL);
if (error)
return error;
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index d5741980094a..87518e1292f8 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -91,7 +91,7 @@ xchk_allocbt_xref(
STATIC int
xchk_allocbt_rec(
struct xchk_btree *bs,
- union xfs_btree_rec *rec)
+ const union xfs_btree_rec *rec)
{
struct xfs_mount *mp = bs->cur->bc_mp;
xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno;
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 552af0cf8482..b6f0c9f3f124 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -25,11 +25,11 @@
* reallocating the buffer if necessary. Buffer contents are not preserved
* across a reallocation.
*/
-int
+static int
xchk_setup_xattr_buf(
struct xfs_scrub *sc,
size_t value_size,
- xfs_km_flags_t flags)
+ gfp_t flags)
{
size_t sz;
struct xchk_xattr_buf *ab = sc->buf;
@@ -57,7 +57,7 @@ xchk_setup_xattr_buf(
* Don't zero the buffer upon allocation to avoid runtime overhead.
* All users must be careful never to read uninitialized contents.
*/
- ab = kmem_alloc_large(sizeof(*ab) + sz, flags);
+ ab = kvmalloc(sizeof(*ab) + sz, flags);
if (!ab)
return -ENOMEM;
@@ -79,7 +79,7 @@ xchk_setup_xattr(
* without the inode lock held, which means we can sleep.
*/
if (sc->flags & XCHK_TRY_HARDER) {
- error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, 0);
+ error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, GFP_KERNEL);
if (error)
return error;
}
@@ -138,7 +138,8 @@ xchk_xattr_listent(
* doesn't work, we overload the seen_enough variable to convey
* the error message back to the main scrub function.
*/
- error = xchk_setup_xattr_buf(sx->sc, valuelen, KM_MAYFAIL);
+ error = xchk_setup_xattr_buf(sx->sc, valuelen,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (error == -ENOMEM)
error = -EDEADLOCK;
if (error) {
@@ -323,7 +324,8 @@ xchk_xattr_block(
return 0;
/* Allocate memory for block usage checking. */
- error = xchk_setup_xattr_buf(ds->sc, 0, KM_MAYFAIL);
+ error = xchk_setup_xattr_buf(ds->sc, 0,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (error == -ENOMEM)
return -EDEADLOCK;
if (error)
@@ -334,7 +336,7 @@ xchk_xattr_block(
bitmap_zero(usedmap, mp->m_attr_geo->blksize);
/* Check all the padding. */
- if (xfs_sb_version_hascrc(&ds->sc->mp->m_sb)) {
+ if (xfs_has_crc(ds->sc->mp)) {
struct xfs_attr3_leafblock *leaf = bp->b_addr;
if (leaf->hdr.pad1 != 0 || leaf->hdr.pad2 != 0 ||
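
The attr.c hunks above swap the old kmem flags for GFP flags while keeping the existing convention: a failed opportunistic allocation is reported as -EDEADLOCK so the scrub core retries with the worst-case buffer preallocated up front. Below is a rough userspace sketch of that retry convention; setup_buf(), try_scrub() and MAX_VALUE_SIZE are invented stand-ins (the latter for XATTR_SIZE_MAX), not kernel symbols.

/*
 * Sketch of "allocate opportunistically, ask the caller to retry harder on
 * failure".  Purely illustrative userspace code.
 */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

#define MAX_VALUE_SIZE	65536	/* stand-in for the worst-case size */

struct xattr_buf {
	size_t	size;
	char	data[];
};

/* First pass: sized for what we found, allowed to fail. */
static struct xattr_buf *setup_buf(size_t value_size, int *error)
{
	struct xattr_buf *buf = malloc(sizeof(*buf) + value_size);

	if (!buf) {
		*error = -ENOMEM;
		return NULL;
	}
	buf->size = value_size;
	*error = 0;
	return buf;
}

static int try_scrub(size_t value_size, int try_harder)
{
	struct xattr_buf *buf;
	int error;

	if (try_harder)
		value_size = MAX_VALUE_SIZE;	/* preallocate the worst case */

	buf = setup_buf(value_size, &error);
	if (error == -ENOMEM && !try_harder)
		return -EDEADLK;	/* tell the caller to restart and try harder */
	if (error)
		return error;

	memset(buf->data, 0, buf->size);	/* ... do the real work ... */
	free(buf);
	return 0;
}

int main(void)
{
	int error = try_scrub(512, 0);

	if (error == -EDEADLK)		/* second attempt, "try harder" */
		error = try_scrub(512, 1);
	printf("scrub result: %d\n", error);
	return error ? 1 : 0;
}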
diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h
index 13a1d2e8424d..1719e1c4da59 100644
--- a/fs/xfs/scrub/attr.h
+++ b/fs/xfs/scrub/attr.h
@@ -65,7 +65,4 @@ xchk_xattr_dstmap(
BITS_TO_LONGS(sc->mp->m_attr_geo->blksize);
}
-int xchk_setup_xattr_buf(struct xfs_scrub *sc, size_t value_size,
- xfs_km_flags_t flags);
-
#endif /* __XFS_SCRUB_ATTR_H__ */
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index 813b5f219113..d6d24c866bc4 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -260,7 +260,7 @@ xbitmap_set_btcur_path(
xfs_btree_get_block(cur, i, &bp);
if (!bp)
continue;
- fsb = XFS_DADDR_TO_FSB(cur->bc_mp, bp->b_bn);
+ fsb = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
error = xbitmap_set(bitmap, fsb, 1);
if (error)
return error;
@@ -284,7 +284,7 @@ xbitmap_collect_btblock(
if (!bp)
return 0;
- fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, bp->b_bn);
+ fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
return xbitmap_set(bitmap, fsbno, 1);
}
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 1d146c9d9de1..017da9ceaee9 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -260,10 +260,10 @@ xchk_bmap_iextent_xref(
agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
len = irec->br_blockcount;
- error = xchk_ag_init(info->sc, agno, &info->sc->sa);
+ error = xchk_ag_init_existing(info->sc, agno, &info->sc->sa);
if (!xchk_fblock_process_error(info->sc, info->whichfork,
irec->br_startoff, &error))
- return;
+ goto out_free;
xchk_xref_is_used_space(info->sc, agbno, len);
xchk_xref_is_not_inode_chunk(info->sc, agbno, len);
@@ -283,6 +283,7 @@ xchk_bmap_iextent_xref(
break;
}
+out_free:
xchk_ag_free(info->sc, &info->sc->sa);
}
@@ -383,7 +384,7 @@ xchk_bmap_iextent(
STATIC int
xchk_bmapbt_rec(
struct xchk_btree *bs,
- union xfs_btree_rec *rec)
+ const union xfs_btree_rec *rec)
{
struct xfs_bmbt_irec irec;
struct xfs_bmbt_irec iext_irec;
@@ -400,7 +401,7 @@ xchk_bmapbt_rec(
* Check the owners of the btree blocks up to the level below
* the root since the verifiers don't do that.
*/
- if (xfs_sb_version_hascrc(&bs->cur->bc_mp->m_sb) &&
+ if (xfs_has_crc(bs->cur->bc_mp) &&
bs->cur->bc_ptrs[0] == 1) {
for (i = 0; i < bs->cur->bc_nlevels - 1; i++) {
block = xfs_btree_get_block(bs->cur, i, &bp);
@@ -473,10 +474,11 @@ struct xchk_bmap_check_rmap_info {
STATIC int
xchk_bmap_check_rmap(
struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *rec,
+ const struct xfs_rmap_irec *rec,
void *priv)
{
struct xfs_bmbt_irec irec;
+ struct xfs_rmap_irec check_rec;
struct xchk_bmap_check_rmap_info *sbcri = priv;
struct xfs_ifork *ifp;
struct xfs_scrub *sc = sbcri->sc;
@@ -510,28 +512,30 @@ xchk_bmap_check_rmap(
* length, so we have to loop through the bmbt to make sure that the
* entire rmap is covered by bmbt records.
*/
+ check_rec = *rec;
while (have_map) {
- if (irec.br_startoff != rec->rm_offset)
+ if (irec.br_startoff != check_rec.rm_offset)
xchk_fblock_set_corrupt(sc, sbcri->whichfork,
- rec->rm_offset);
+ check_rec.rm_offset);
if (irec.br_startblock != XFS_AGB_TO_FSB(sc->mp,
- cur->bc_ag.pag->pag_agno, rec->rm_startblock))
+ cur->bc_ag.pag->pag_agno,
+ check_rec.rm_startblock))
xchk_fblock_set_corrupt(sc, sbcri->whichfork,
- rec->rm_offset);
- if (irec.br_blockcount > rec->rm_blockcount)
+ check_rec.rm_offset);
+ if (irec.br_blockcount > check_rec.rm_blockcount)
xchk_fblock_set_corrupt(sc, sbcri->whichfork,
- rec->rm_offset);
+ check_rec.rm_offset);
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
break;
- rec->rm_startblock += irec.br_blockcount;
- rec->rm_offset += irec.br_blockcount;
- rec->rm_blockcount -= irec.br_blockcount;
- if (rec->rm_blockcount == 0)
+ check_rec.rm_startblock += irec.br_blockcount;
+ check_rec.rm_offset += irec.br_blockcount;
+ check_rec.rm_blockcount -= irec.br_blockcount;
+ if (check_rec.rm_blockcount == 0)
break;
have_map = xfs_iext_next_extent(ifp, &sbcri->icur, &irec);
if (!have_map)
xchk_fblock_set_corrupt(sc, sbcri->whichfork,
- rec->rm_offset);
+ check_rec.rm_offset);
}
out:
@@ -581,7 +585,7 @@ xchk_bmap_check_rmaps(
bool zero_size;
int error;
- if (!xfs_sb_version_hasrmapbt(&sc->mp->m_sb) ||
+ if (!xfs_has_rmapbt(sc->mp) ||
whichfork == XFS_COW_FORK ||
(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
return 0;
@@ -659,8 +663,7 @@ xchk_bmap(
}
break;
case XFS_ATTR_FORK:
- if (!xfs_sb_version_hasattr(&mp->m_sb) &&
- !xfs_sb_version_hasattr2(&mp->m_sb))
+ if (!xfs_has_attr(mp) && !xfs_has_attr2(mp))
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
break;
default:
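
Because the rmap walker now hands records to its callbacks as const pointers, xchk_bmap_check_rmap above copies the record into a local check_rec and advances the copy instead of mutating the caller's data. Here is a self-contained sketch of that pattern, using illustrative types (struct extent, walk_extents) rather than the real btree walker.

/*
 * Const-record callback pattern: the walker promises not to let callbacks
 * change its records, so a callback that consumes a record piecewise works
 * on a local copy.
 */
#include <stdio.h>

struct extent {
	unsigned long long	start;
	unsigned long long	len;
};

typedef int (*extent_fn)(const struct extent *rec, void *priv);

static int walk_extents(const struct extent *recs, int nr, extent_fn fn,
		void *priv)
{
	int i, error;

	for (i = 0; i < nr; i++) {
		error = fn(&recs[i], priv);
		if (error)
			return error;
	}
	return 0;
}

/* Consume each record in fixed-size chunks without touching the original. */
static int check_extent(const struct extent *rec, void *priv)
{
	unsigned long long chunk = *(unsigned long long *)priv;
	struct extent check_rec = *rec;	/* local, mutable copy */

	while (check_rec.len > 0) {
		unsigned long long step = check_rec.len < chunk ?
					  check_rec.len : chunk;

		printf("checking [%llu, +%llu)\n", check_rec.start, step);
		check_rec.start += step;
		check_rec.len -= step;
	}
	return 0;
}

int main(void)
{
	struct extent recs[] = { { 0, 10 }, { 100, 3 } };
	unsigned long long chunk = 4;

	return walk_extents(recs, 2, check_extent, &chunk);
}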
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index bd1172358964..eccb855dc904 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -374,10 +374,10 @@ xchk_btree_check_block_owner(
init_sa = bs->cur->bc_flags & XFS_BTREE_LONG_PTRS;
if (init_sa) {
- error = xchk_ag_init(bs->sc, agno, &bs->sc->sa);
+ error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa);
if (!xchk_btree_xref_process_error(bs->sc, bs->cur,
level, &error))
- return error;
+ goto out_free;
}
xchk_xref_is_used_space(bs->sc, agbno, 1);
@@ -393,6 +393,7 @@ xchk_btree_check_block_owner(
if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP)
bs->cur = NULL;
+out_free:
if (init_sa)
xchk_ag_free(bs->sc, &bs->sc->sa);
@@ -435,12 +436,12 @@ xchk_btree_check_owner(
if (!co)
return -ENOMEM;
co->level = level;
- co->daddr = XFS_BUF_ADDR(bp);
+ co->daddr = xfs_buf_daddr(bp);
list_add_tail(&co->list, &bs->to_check);
return 0;
}
- return xchk_btree_check_block_owner(bs, level, XFS_BUF_ADDR(bp));
+ return xchk_btree_check_block_owner(bs, level, xfs_buf_daddr(bp));
}
/* Decide if we want to check minrecs of a btree block in the inode root. */
diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h
index 5572e475f8ed..b7d2fc01fbf9 100644
--- a/fs/xfs/scrub/btree.h
+++ b/fs/xfs/scrub/btree.h
@@ -26,8 +26,8 @@ void xchk_btree_xref_set_corrupt(struct xfs_scrub *sc,
struct xchk_btree;
typedef int (*xchk_btree_rec_fn)(
- struct xchk_btree *bs,
- union xfs_btree_rec *rec);
+ struct xchk_btree *bs,
+ const union xfs_btree_rec *rec);
struct xchk_btree {
/* caller-provided scrub state */
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 8558ca05e11d..bf1f3607d0b6 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -186,7 +186,7 @@ xchk_block_set_preen(
struct xfs_buf *bp)
{
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
- trace_xchk_block_preen(sc, bp->b_bn, __return_address);
+ trace_xchk_block_preen(sc, xfs_buf_daddr(bp), __return_address);
}
/*
@@ -219,7 +219,7 @@ xchk_block_set_corrupt(
struct xfs_buf *bp)
{
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
- trace_xchk_block_error(sc, bp->b_bn, __return_address);
+ trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
}
/* Record a corruption while cross-referencing. */
@@ -229,7 +229,7 @@ xchk_block_xref_set_corrupt(
struct xfs_buf *bp)
{
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
- trace_xchk_block_error(sc, bp->b_bn, __return_address);
+ trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
}
/*
@@ -324,7 +324,7 @@ struct xchk_rmap_ownedby_info {
STATIC int
xchk_count_rmap_ownedby_irec(
struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *rec,
+ const struct xfs_rmap_irec *rec,
void *priv)
{
struct xchk_rmap_ownedby_info *sroi = priv;
@@ -394,11 +394,11 @@ want_ag_read_header_failure(
}
/*
- * Grab all the headers for an AG.
+ * Grab the perag structure and all the headers for an AG.
*
- * The headers should be released by xchk_ag_free, but as a fail
- * safe we attach all the buffers we grab to the scrub transaction so
- * they'll all be freed when we cancel it.
+ * The headers should be released by xchk_ag_free, but as a fail safe we attach
+ * all the buffers we grab to the scrub transaction so they'll all be freed
+ * when we cancel it. Returns ENOENT if we can't grab the perag structure.
*/
int
xchk_ag_read_headers(
@@ -409,22 +409,24 @@ xchk_ag_read_headers(
struct xfs_mount *mp = sc->mp;
int error;
- sa->agno = agno;
+ ASSERT(!sa->pag);
+ sa->pag = xfs_perag_get(mp, agno);
+ if (!sa->pag)
+ return -ENOENT;
error = xfs_ialloc_read_agi(mp, sc->tp, agno, &sa->agi_bp);
if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
- goto out;
+ return error;
error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &sa->agf_bp);
if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
- goto out;
+ return error;
error = xfs_alloc_read_agfl(mp, sc->tp, agno, &sa->agfl_bp);
if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL))
- goto out;
- error = 0;
-out:
- return error;
+ return error;
+
+ return 0;
}
/* Release all the AG btree cursors. */
@@ -461,7 +463,6 @@ xchk_ag_btcur_init(
{
struct xfs_mount *mp = sc->mp;
- xchk_perag_get(sc->mp, sa);
if (sa->agf_bp &&
xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) {
/* Set up a bnobt cursor for cross-referencing. */
@@ -484,21 +485,21 @@ xchk_ag_btcur_init(
}
/* Set up a finobt cursor for cross-referencing. */
- if (sa->agi_bp && xfs_sb_version_hasfinobt(&mp->m_sb) &&
+ if (sa->agi_bp && xfs_has_finobt(mp) &&
xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) {
sa->fino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp,
sa->pag, XFS_BTNUM_FINO);
}
/* Set up a rmapbt cursor for cross-referencing. */
- if (sa->agf_bp && xfs_sb_version_hasrmapbt(&mp->m_sb) &&
+ if (sa->agf_bp && xfs_has_rmapbt(mp) &&
xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) {
sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
sa->pag);
}
/* Set up a refcountbt cursor for cross-referencing. */
- if (sa->agf_bp && xfs_sb_version_hasreflink(&mp->m_sb) &&
+ if (sa->agf_bp && xfs_has_reflink(mp) &&
xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) {
sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
sa->agf_bp, sa->pag);
@@ -528,15 +529,14 @@ xchk_ag_free(
xfs_perag_put(sa->pag);
sa->pag = NULL;
}
- sa->agno = NULLAGNUMBER;
}
/*
- * For scrub, grab the AGI and the AGF headers, in that order. Locking
- * order requires us to get the AGI before the AGF. We use the
- * transaction to avoid deadlocking on crosslinked metadata buffers;
- * either the caller passes one in (bmap scrub) or we have to create a
- * transaction ourselves.
+ * For scrub, grab the perag structure, the AGI, and the AGF headers, in that
+ * order. Locking order requires us to get the AGI before the AGF. We use the
+ * transaction to avoid deadlocking on crosslinked metadata buffers; either the
+ * caller passes one in (bmap scrub) or we have to create a transaction
+ * ourselves. Returns ENOENT if the perag struct cannot be grabbed.
*/
int
xchk_ag_init(
@@ -554,19 +554,6 @@ xchk_ag_init(
return 0;
}
-/*
- * Grab the per-ag structure if we haven't already gotten it. Teardown of the
- * xchk_ag will release it for us.
- */
-void
-xchk_perag_get(
- struct xfs_mount *mp,
- struct xchk_ag *sa)
-{
- if (!sa->pag)
- sa->pag = xfs_perag_get(mp, sa->agno);
-}
-
/* Per-scrubber setup functions */
/*
@@ -797,7 +784,7 @@ xchk_buffer_recheck(
if (!fa)
return;
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
- trace_xchk_block_error(sc, bp->b_bn, fa);
+ trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa);
}
/*
@@ -842,7 +829,7 @@ xchk_metadata_inode_forks(
return error;
/* Look for incorrect shared blocks. */
- if (xfs_sb_version_hasreflink(&sc->mp->m_sb)) {
+ if (xfs_has_reflink(sc->mp)) {
error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
&shared);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0,
@@ -884,6 +871,7 @@ xchk_stop_reaping(
{
sc->flags |= XCHK_REAPING_DISABLED;
xfs_blockgc_stop(sc->mp);
+ xfs_inodegc_stop(sc->mp);
}
/* Restart background reaping of resources. */
@@ -891,6 +879,13 @@ void
xchk_start_reaping(
struct xfs_scrub *sc)
{
- xfs_blockgc_start(sc->mp);
+ /*
+ * Readonly filesystems do not perform inactivation or speculative
+ * preallocation, so there's no need to restart the workers.
+ */
+ if (!xfs_is_readonly(sc->mp)) {
+ xfs_inodegc_start(sc->mp);
+ xfs_blockgc_start(sc->mp);
+ }
sc->flags &= ~XCHK_REAPING_DISABLED;
}
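
The reaping hunks above also quiesce inode inactivation while scrub runs and only restart the background workers on writable mounts. A trivial, purely illustrative sketch of that stop/start pairing follows; all names are invented and none of this is kernel code.

#include <stdbool.h>
#include <stdio.h>

struct fs {
	bool	readonly;
	bool	blockgc_running;
	bool	inodegc_running;
};

static void scrub_stop_reaping(struct fs *fs)
{
	fs->blockgc_running = false;
	fs->inodegc_running = false;
}

static void scrub_start_reaping(struct fs *fs)
{
	/* Read-only mounts do no inactivation or speculative prealloc. */
	if (!fs->readonly) {
		fs->inodegc_running = true;
		fs->blockgc_running = true;
	}
}

int main(void)
{
	struct fs fs = { .readonly = false };

	scrub_stop_reaping(&fs);
	/* ... scrub runs with background reclaim quiesced ... */
	scrub_start_reaping(&fs);
	printf("blockgc=%d inodegc=%d\n", fs.blockgc_running,
			fs.inodegc_running);
	return 0;
}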
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 0410faf7d735..454145db10e7 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -107,7 +107,23 @@ int xchk_setup_fscounters(struct xfs_scrub *sc);
void xchk_ag_free(struct xfs_scrub *sc, struct xchk_ag *sa);
int xchk_ag_init(struct xfs_scrub *sc, xfs_agnumber_t agno,
struct xchk_ag *sa);
-void xchk_perag_get(struct xfs_mount *mp, struct xchk_ag *sa);
+
+/*
+ * Grab all AG resources, treating the inability to grab the perag structure as
+ * a fs corruption. This is intended for callers checking an ondisk reference
+ * to a given AG, which means that the AG must still exist.
+ */
+static inline int
+xchk_ag_init_existing(
+ struct xfs_scrub *sc,
+ xfs_agnumber_t agno,
+ struct xchk_ag *sa)
+{
+ int error = xchk_ag_init(sc, agno, sa);
+
+ return error == -ENOENT ? -EFSCORRUPTED : error;
+}
+
int xchk_ag_read_headers(struct xfs_scrub *sc, xfs_agnumber_t agno,
struct xchk_ag *sa);
void xchk_ag_btcur_free(struct xchk_ag *sa);
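
The xchk_ag_init_existing helper above promotes -ENOENT to -EFSCORRUPTED: the caller followed an on-disk reference to that AG, so the AG's absence is itself evidence of corruption. A tiny standalone sketch of that error-translation wrapper; lookup_ag() and lookup_existing_ag() are made up, and EUCLEAN merely stands in for EFSCORRUPTED in this userspace example.

#include <errno.h>
#include <stdio.h>

#ifndef EFSCORRUPTED
#define EFSCORRUPTED	EUCLEAN		/* stand-in for the example only */
#endif

/* Generic path: simply report that the AG could not be found. */
static int lookup_ag(int agno, int ag_count)
{
	return (agno < ag_count) ? 0 : -ENOENT;
}

/* Caller holds an on-disk pointer to agno, so absence means corruption. */
static int lookup_existing_ag(int agno, int ag_count)
{
	int error = lookup_ag(agno, ag_count);

	return error == -ENOENT ? -EFSCORRUPTED : error;
}

int main(void)
{
	printf("%d\n", lookup_existing_ag(5, 4));	/* negative: corrupt */
	return 0;
}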
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index 9f0dbb47c82c..8a52514bc1ff 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -367,11 +367,11 @@ xchk_da_btree_block(
pmaxrecs = &ds->maxrecs[level];
/* We only started zeroing the header on v5 filesystems. */
- if (xfs_sb_version_hascrc(&ds->sc->mp->m_sb) && hdr3->hdr.pad)
+ if (xfs_has_crc(ds->sc->mp) && hdr3->hdr.pad)
xchk_da_set_corrupt(ds, level);
/* Check the owner. */
- if (xfs_sb_version_hascrc(&ip->i_mount->m_sb)) {
+ if (xfs_has_crc(ip->i_mount)) {
owner = be64_to_cpu(hdr3->owner);
if (owner != ip->i_ino)
xchk_da_set_corrupt(ds, level);
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 28dda391d5df..200a63f58fe7 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -51,7 +51,7 @@ xchk_dir_check_ftype(
int ino_dtype;
int error = 0;
- if (!xfs_sb_version_hasftype(&mp->m_sb)) {
+ if (!xfs_has_ftype(mp)) {
if (dtype != DT_UNKNOWN && dtype != DT_DIR)
xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
offset);
@@ -140,7 +140,7 @@ xchk_dir_actor(
if (!strncmp(".", name, namelen)) {
/* If this is "." then check that the inum matches the dir. */
- if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR)
+ if (xfs_has_ftype(mp) && type != DT_DIR)
xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
offset);
checked_ftype = true;
@@ -152,7 +152,7 @@ xchk_dir_actor(
* If this is ".." in the root inode, check that the inum
* matches this dir.
*/
- if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR)
+ if (xfs_has_ftype(mp) && type != DT_DIR)
xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
offset);
checked_ftype = true;
@@ -526,7 +526,7 @@ xchk_directory_leaf1_bestfree(
bestcount = be32_to_cpu(ltp->bestcount);
bestp = xfs_dir2_leaf_bests_p(ltp);
- if (xfs_sb_version_hascrc(&sc->mp->m_sb)) {
+ if (xfs_has_crc(sc->mp)) {
struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
if (hdr3->pad != cpu_to_be32(0))
@@ -623,7 +623,7 @@ xchk_directory_free_bestfree(
return error;
xchk_buffer_recheck(sc, bp);
- if (xfs_sb_version_hascrc(&sc->mp->m_sb)) {
+ if (xfs_has_crc(sc->mp)) {
struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
if (hdr3->pad != cpu_to_be32(0))
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index fd7941e04ae1..48a6cbdf95d0 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -148,9 +148,9 @@ xchk_fscount_btreeblks(
xfs_extlen_t blocks;
int error;
- error = xchk_ag_init(sc, agno, &sc->sa);
+ error = xchk_ag_init_existing(sc, agno, &sc->sa);
if (error)
- return error;
+ goto out_free;
error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks);
if (error)
@@ -207,7 +207,7 @@ retry:
/* Add up the free/freelist/bnobt/cntbt blocks */
fsc->fdblocks += pag->pagf_freeblks;
fsc->fdblocks += pag->pagf_flcount;
- if (xfs_sb_version_haslazysbcount(&sc->mp->m_sb)) {
+ if (xfs_has_lazysbcount(sc->mp)) {
fsc->fdblocks += pag->pagf_btreeblks;
} else {
error = xchk_fscount_btreeblks(sc, fsc, agno);
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index 30e568596b79..00848ee542fb 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -418,7 +418,7 @@ xchk_iallocbt_rec_alignment(
STATIC int
xchk_iallocbt_rec(
struct xchk_btree *bs,
- union xfs_btree_rec *rec)
+ const union xfs_btree_rec *rec)
{
struct xfs_mount *mp = bs->cur->bc_mp;
struct xchk_iallocbt *iabt = bs->private;
@@ -517,7 +517,7 @@ xchk_iallocbt_xref_rmap_btreeblks(
int error;
if (!sc->sa.ino_cur || !sc->sa.rmap_cur ||
- (xfs_sb_version_hasfinobt(&sc->mp->m_sb) && !sc->sa.fino_cur) ||
+ (xfs_has_finobt(sc->mp) && !sc->sa.fino_cur) ||
xchk_skip_xref(sc->sm))
return;
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 76fbc7ca4cec..2405b09d03d0 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -181,7 +181,7 @@ xchk_inode_flags2(
/* reflink flag requires reflink feature */
if ((flags2 & XFS_DIFLAG2_REFLINK) &&
- !xfs_sb_version_hasreflink(&mp->m_sb))
+ !xfs_has_reflink(mp))
goto bad;
/* cowextsize flag is checked w.r.t. mode separately */
@@ -199,8 +199,7 @@ xchk_inode_flags2(
goto bad;
/* no bigtime iflag without the bigtime feature */
- if (xfs_dinode_has_bigtime(dip) &&
- !xfs_sb_version_hasbigtime(&mp->m_sb))
+ if (xfs_dinode_has_bigtime(dip) && !xfs_has_bigtime(mp))
goto bad;
return;
@@ -278,7 +277,7 @@ xchk_dinode(
xchk_ino_set_corrupt(sc, ino);
if (dip->di_projid_hi != 0 &&
- !xfs_sb_version_hasprojid32bit(&mp->m_sb))
+ !xfs_has_projid32(mp))
xchk_ino_set_corrupt(sc, ino);
break;
default:
@@ -532,9 +531,9 @@ xchk_inode_xref(
agno = XFS_INO_TO_AGNO(sc->mp, ino);
agbno = XFS_INO_TO_AGBNO(sc->mp, ino);
- error = xchk_ag_init(sc, agno, &sc->sa);
+ error = xchk_ag_init_existing(sc, agno, &sc->sa);
if (!xchk_xref_process_error(sc, agno, agbno, &error))
- return;
+ goto out_free;
xchk_xref_is_used_space(sc, agbno, 1);
xchk_inode_xref_finobt(sc, ino);
@@ -542,6 +541,7 @@ xchk_inode_xref(
xchk_xref_is_not_shared(sc, agbno, 1);
xchk_inode_xref_bmap(sc, dip);
+out_free:
xchk_ag_free(sc, &sc->sa);
}
@@ -560,7 +560,7 @@ xchk_inode_check_reflink_iflag(
bool has_shared;
int error;
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ if (!xfs_has_reflink(mp))
return;
error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index acbb9839d42f..d6c1b00a4fc8 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -42,7 +42,7 @@ xchk_setup_quota(
xfs_dqtype_t dqtype;
int error;
- if (!XFS_IS_QUOTA_RUNNING(sc->mp) || !XFS_IS_QUOTA_ON(sc->mp))
+ if (!XFS_IS_QUOTA_ON(sc->mp))
return -ENOENT;
dqtype = xchk_quota_to_dqtype(sc);
@@ -127,7 +127,7 @@ xchk_quota_item(
* a reflink filesystem we're allowed to exceed physical space
* if there are no quota limits.
*/
- if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ if (xfs_has_reflink(mp)) {
if (mp->m_sb.sb_dblocks < dq->q_blk.count)
xchk_fblock_set_warning(sc, XFS_DATA_FORK,
offset);
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index 7014b7408bad..2744eecdbaf0 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -91,7 +91,7 @@ struct xchk_refcnt_check {
STATIC int
xchk_refcountbt_rmap_check(
struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *rec,
+ const struct xfs_rmap_irec *rec,
void *priv)
{
struct xchk_refcnt_check *refchk = priv;
@@ -330,7 +330,7 @@ xchk_refcountbt_xref(
STATIC int
xchk_refcountbt_rec(
struct xchk_btree *bs,
- union xfs_btree_rec *rec)
+ const union xfs_btree_rec *rec)
{
struct xfs_mount *mp = bs->cur->bc_mp;
xfs_agblock_t *cow_blocks = bs->private;
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index ebb0e245aa72..8f3cba14ada3 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -248,19 +248,19 @@ xrep_calc_ag_resblks(
* bnobt/cntbt or inobt/finobt as pairs.
*/
bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
- if (xfs_sb_version_hassparseinodes(&mp->m_sb))
+ if (xfs_has_sparseinodes(mp))
inobt_sz = xfs_iallocbt_calc_size(mp, icount /
XFS_INODES_PER_HOLEMASK_BIT);
else
inobt_sz = xfs_iallocbt_calc_size(mp, icount /
XFS_INODES_PER_CHUNK);
- if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ if (xfs_has_finobt(mp))
inobt_sz *= 2;
- if (xfs_sb_version_hasreflink(&mp->m_sb))
+ if (xfs_has_reflink(mp))
refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
else
refcbt_sz = 0;
- if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+ if (xfs_has_rmapbt(mp)) {
/*
* Guess how many blocks we need to rebuild the rmapbt.
* For non-reflink filesystems we can't have more records than
@@ -269,7 +269,7 @@ xrep_calc_ag_resblks(
* many rmaps there could be in the AG, so we start off with
* what we hope is a generous over-estimation.
*/
- if (xfs_sb_version_hasreflink(&mp->m_sb))
+ if (xfs_has_reflink(mp))
rmapbt_sz = xfs_rmapbt_calc_size(mp,
(unsigned long long)aglen * 2);
else
@@ -306,9 +306,9 @@ xrep_alloc_ag_block(
return -ENOSPC;
xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno,
1, false);
- *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, bno);
+ *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, bno);
if (resv == XFS_AG_RESV_RMAPBT)
- xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.agno);
+ xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.pag->pag_agno);
return 0;
default:
break;
@@ -317,7 +317,7 @@ xrep_alloc_ag_block(
args.tp = sc->tp;
args.mp = sc->mp;
args.oinfo = *oinfo;
- args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.agno, 0);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.pag->pag_agno, 0);
args.minlen = 1;
args.maxlen = 1;
args.prod = 1;
@@ -352,14 +352,14 @@ xrep_init_btblock(
trace_xrep_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb),
XFS_FSB_TO_AGBNO(mp, fsb), btnum);
- ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.agno);
+ ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.pag->pag_agno);
error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, fsb), XFS_FSB_TO_BB(mp, 1), 0,
&bp);
if (error)
return error;
xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
- xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno);
+ xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.pag->pag_agno);
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1);
bp->b_ops = ops;
@@ -481,7 +481,7 @@ xrep_fix_freelist(
args.mp = sc->mp;
args.tp = sc->tp;
- args.agno = sc->sa.agno;
+ args.agno = sc->sa.pag->pag_agno;
args.alignment = 1;
args.pag = sc->sa.pag;
@@ -611,11 +611,11 @@ xrep_reap_extents(
xfs_fsblock_t fsbno;
int error = 0;
- ASSERT(xfs_sb_version_hasrmapbt(&sc->mp->m_sb));
+ ASSERT(xfs_has_rmapbt(sc->mp));
for_each_xbitmap_block(fsbno, bmr, n, bitmap) {
ASSERT(sc->ip != NULL ||
- XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.agno);
+ XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno);
trace_xrep_dispose_btree_extent(sc->mp,
XFS_FSB_TO_AGNO(sc->mp, fsbno),
XFS_FSB_TO_AGBNO(sc->mp, fsbno), 1);
@@ -690,7 +690,7 @@ xrep_findroot_block(
int block_level;
int error = 0;
- daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.agno, agbno);
+ daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.pag->pag_agno, agbno);
/*
* Blocks in the AGFL have stale contents that might just happen to
@@ -819,7 +819,7 @@ xrep_findroot_block(
else
fab->root = NULLAGBLOCK;
- trace_xrep_findroot_block(mp, ri->sc->sa.agno, agbno,
+ trace_xrep_findroot_block(mp, ri->sc->sa.pag->pag_agno, agbno,
be32_to_cpu(btblock->bb_magic), fab->height - 1);
out:
xfs_trans_brelse(ri->sc->tp, bp);
@@ -833,7 +833,7 @@ out:
STATIC int
xrep_findroot_rmap(
struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *rec,
+ const struct xfs_rmap_irec *rec,
void *priv)
{
struct xrep_findroot *ri = priv;
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index fc306573f0ac..8dae0345c7df 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -88,7 +88,7 @@ xchk_rmapbt_xref(
STATIC int
xchk_rmapbt_rec(
struct xchk_btree *bs,
- union xfs_btree_rec *rec)
+ const union xfs_btree_rec *rec)
{
struct xfs_mount *mp = bs->cur->bc_mp;
struct xfs_rmap_irec irec;
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 37c0e2266c85..8fa012057405 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -41,7 +41,7 @@ xchk_setup_rt(
STATIC int
xchk_rtbitmap_rec(
struct xfs_trans *tp,
- struct xfs_rtalloc_rec *rec,
+ const struct xfs_rtalloc_rec *rec,
void *priv)
{
struct xfs_scrub *sc = priv;
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 0e542636227c..51e4c61916d2 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -239,21 +239,21 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.type = ST_PERAG,
.setup = xchk_setup_ag_iallocbt,
.scrub = xchk_finobt,
- .has = xfs_sb_version_hasfinobt,
+ .has = xfs_has_finobt,
.repair = xrep_notsupported,
},
[XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */
.type = ST_PERAG,
.setup = xchk_setup_ag_rmapbt,
.scrub = xchk_rmapbt,
- .has = xfs_sb_version_hasrmapbt,
+ .has = xfs_has_rmapbt,
.repair = xrep_notsupported,
},
[XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */
.type = ST_PERAG,
.setup = xchk_setup_ag_refcountbt,
.scrub = xchk_refcountbt,
- .has = xfs_sb_version_hasreflink,
+ .has = xfs_has_reflink,
.repair = xrep_notsupported,
},
[XFS_SCRUB_TYPE_INODE] = { /* inode record */
@@ -308,14 +308,14 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.type = ST_FS,
.setup = xchk_setup_rt,
.scrub = xchk_rtbitmap,
- .has = xfs_sb_version_hasrealtime,
+ .has = xfs_has_realtime,
.repair = xrep_notsupported,
},
[XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */
.type = ST_FS,
.setup = xchk_setup_rt,
.scrub = xchk_rtsummary,
- .has = xfs_sb_version_hasrealtime,
+ .has = xfs_has_realtime,
.repair = xrep_notsupported,
},
[XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */
@@ -383,7 +383,7 @@ xchk_validate_inputs(
if (ops->setup == NULL || ops->scrub == NULL)
goto out;
/* Does this fs even support this type of metadata? */
- if (ops->has && !ops->has(&mp->m_sb))
+ if (ops->has && !ops->has(mp))
goto out;
error = -EINVAL;
@@ -415,11 +415,11 @@ xchk_validate_inputs(
*/
if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
error = -EOPNOTSUPP;
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
goto out;
error = -EROFS;
- if (mp->m_flags & XFS_MOUNT_RDONLY)
+ if (xfs_is_readonly(mp))
goto out;
}
@@ -464,9 +464,6 @@ xfs_scrub_metadata(
struct xfs_scrub sc = {
.file = file,
.sm = sm,
- .sa = {
- .agno = NULLAGNUMBER,
- },
};
struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount;
int error = 0;
@@ -480,10 +477,10 @@ xfs_scrub_metadata(
/* Forbidden if we are shut down or mounted norecovery. */
error = -ESHUTDOWN;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
goto out;
error = -ENOTRECOVERABLE;
- if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+ if (xfs_has_norecovery(mp))
goto out;
error = xchk_validate_inputs(mp, sm);
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 08a483cb46e2..80e5026bba44 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -27,7 +27,7 @@ struct xchk_meta_ops {
int (*repair)(struct xfs_scrub *);
/* Decide if we even have this piece of metadata. */
- bool (*has)(struct xfs_sb *);
+ bool (*has)(struct xfs_mount *);
/* type describing required/allowed inputs */
enum xchk_type type;
@@ -35,7 +35,6 @@ struct xchk_meta_ops {
/* Buffer pointers and btree cursors for an entire AG. */
struct xchk_ag {
- xfs_agnumber_t agno;
struct xfs_perag *pag;
/* AG btree roots */
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 03882a605a3c..c0ef53fe6611 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -22,11 +22,11 @@ xchk_btree_cur_fsbno(
int level)
{
if (level < cur->bc_nlevels && cur->bc_bufs[level])
- return XFS_DADDR_TO_FSB(cur->bc_mp, cur->bc_bufs[level]->b_bn);
- else if (level == cur->bc_nlevels - 1 &&
- cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return XFS_DADDR_TO_FSB(cur->bc_mp,
+ xfs_buf_daddr(cur->bc_bufs[level]));
+ if (level == cur->bc_nlevels - 1 && cur->bc_flags & XFS_BTREE_LONG_PTRS)
return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino);
- else if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS))
+ if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS))
return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, 0);
return NULLFSBLOCK;
}
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index e46f5cef90da..a7bbb84f91a7 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -2,6 +2,10 @@
/*
* Copyright (C) 2017 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * NOTE: none of these tracepoints shall be considered a stable kernel ABI
+ * as they can change at any time. See xfs_trace.h for documentation of
+ * specific units found in tracepoint output.
*/
#undef TRACE_SYSTEM
#define TRACE_SYSTEM xfs_scrub
@@ -79,6 +83,16 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
{ XFS_SCRUB_TYPE_PQUOTA, "prjquota" }, \
{ XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" }
+#define XFS_SCRUB_FLAG_STRINGS \
+ { XFS_SCRUB_IFLAG_REPAIR, "repair" }, \
+ { XFS_SCRUB_OFLAG_CORRUPT, "corrupt" }, \
+ { XFS_SCRUB_OFLAG_PREEN, "preen" }, \
+ { XFS_SCRUB_OFLAG_XFAIL, "xfail" }, \
+ { XFS_SCRUB_OFLAG_XCORRUPT, "xcorrupt" }, \
+ { XFS_SCRUB_OFLAG_INCOMPLETE, "incomplete" }, \
+ { XFS_SCRUB_OFLAG_WARNING, "warning" }, \
+ { XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED, "norepair" }
+
DECLARE_EVENT_CLASS(xchk_class,
TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm,
int error),
@@ -103,14 +117,14 @@ DECLARE_EVENT_CLASS(xchk_class,
__entry->flags = sm->sm_flags;
__entry->error = error;
),
- TP_printk("dev %d:%d ino 0x%llx type %s agno %u inum %llu gen %u flags 0x%x error %d",
+ TP_printk("dev %d:%d ino 0x%llx type %s agno 0x%x inum 0x%llx gen 0x%x flags (%s) error %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->agno,
__entry->inum,
__entry->gen,
- __entry->flags,
+ __print_flags(__entry->flags, "|", XFS_SCRUB_FLAG_STRINGS),
__entry->error)
)
#define DEFINE_SCRUB_EVENT(name) \
@@ -145,7 +159,7 @@ TRACE_EVENT(xchk_op_error,
__entry->error = error;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %s agno %u agbno %u error %d ret_ip %pS",
+ TP_printk("dev %d:%d type %s agno 0x%x agbno 0x%x error %d ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->agno,
@@ -176,10 +190,10 @@ TRACE_EVENT(xchk_file_op_error,
__entry->error = error;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx fork %d type %s offset %llu error %d ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx fork %s type %s fileoff 0x%llx error %d ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
- __entry->whichfork,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->offset,
__entry->error,
@@ -193,29 +207,21 @@ DECLARE_EVENT_CLASS(xchk_block_error_class,
__field(dev_t, dev)
__field(unsigned int, type)
__field(xfs_agnumber_t, agno)
- __field(xfs_agblock_t, bno)
+ __field(xfs_agblock_t, agbno)
__field(void *, ret_ip)
),
TP_fast_assign(
- xfs_fsblock_t fsbno;
- xfs_agnumber_t agno;
- xfs_agblock_t bno;
-
- fsbno = XFS_DADDR_TO_FSB(sc->mp, daddr);
- agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
- bno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
-
__entry->dev = sc->mp->m_super->s_dev;
__entry->type = sc->sm->sm_type;
- __entry->agno = agno;
- __entry->bno = bno;
+ __entry->agno = xfs_daddr_to_agno(sc->mp, daddr);
+ __entry->agbno = xfs_daddr_to_agbno(sc->mp, daddr);
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %s agno %u agbno %u ret_ip %pS",
+ TP_printk("dev %d:%d type %s agno 0x%x agbno 0x%x ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->agno,
- __entry->bno,
+ __entry->agbno,
__entry->ret_ip)
)
@@ -281,10 +287,10 @@ DECLARE_EVENT_CLASS(xchk_fblock_error_class,
__entry->offset = offset;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx fork %d type %s offset %llu ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx fork %s type %s fileoff 0x%llx ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
- __entry->whichfork,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->offset,
__entry->ret_ip)
@@ -346,7 +352,7 @@ TRACE_EVENT(xchk_btree_op_error,
__entry->error = error;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno %u agbno %u error %d ret_ip %pS",
+ TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
@@ -389,10 +395,10 @@ TRACE_EVENT(xchk_ifork_btree_op_error,
__entry->error = error;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx fork %d type %s btree %s level %d ptr %d agno %u agbno %u error %d ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
- __entry->whichfork,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
__entry->level,
@@ -428,7 +434,7 @@ TRACE_EVENT(xchk_btree_error,
__entry->ptr = cur->bc_ptrs[level];
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno %u agbno %u ret_ip %pS",
+ TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
@@ -468,10 +474,10 @@ TRACE_EVENT(xchk_ifork_btree_error,
__entry->ptr = cur->bc_ptrs[level];
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx fork %d type %s btree %s level %d ptr %d agno %u agbno %u ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
- __entry->whichfork,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
__entry->level,
@@ -507,7 +513,7 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class,
__entry->nlevels = cur->bc_nlevels;
__entry->ptr = cur->bc_ptrs[level];
),
- TP_printk("dev %d:%d type %s btree %s agno %u agbno %u level %d nlevels %d ptr %d",
+ TP_printk("dev %d:%d type %s btree %s agno 0x%x agbno 0x%x level %d nlevels %d ptr %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
@@ -580,7 +586,7 @@ TRACE_EVENT(xchk_iallocbt_check_cluster,
__entry->holemask = holemask;
__entry->cluster_ino = cluster_ino;
),
- TP_printk("dev %d:%d agno %d startino %u daddr 0x%llx len %d chunkino %u nr_inodes %u cluster_mask 0x%x holemask 0x%x cluster_ino %u",
+ TP_printk("dev %d:%d agno 0x%x startino 0x%x daddr 0x%llx bbcount 0x%x chunkino 0x%x nr_inodes %u cluster_mask 0x%x holemask 0x%x cluster_ino 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->startino,
@@ -670,7 +676,7 @@ DECLARE_EVENT_CLASS(xrep_extent_class,
__entry->agbno = agbno;
__entry->len = len;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -707,7 +713,7 @@ DECLARE_EVENT_CLASS(xrep_rmap_class,
__entry->offset = offset;
__entry->flags = flags;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld offset %llu flags 0x%x",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -745,7 +751,7 @@ TRACE_EVENT(xrep_refcount_extent_fn,
__entry->blockcount = irec->rc_blockcount;
__entry->refcount = irec->rc_refcount;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->startblock,
@@ -769,7 +775,7 @@ TRACE_EVENT(xrep_init_btblock,
__entry->agbno = agbno;
__entry->btnum = btnum;
),
- TP_printk("dev %d:%d agno %u agbno %u btree %s",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x btree %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -793,7 +799,7 @@ TRACE_EVENT(xrep_findroot_block,
__entry->magic = magic;
__entry->level = level;
),
- TP_printk("dev %d:%d agno %u agbno %u magic 0x%x level %u",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x magic 0x%x level %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -821,7 +827,7 @@ TRACE_EVENT(xrep_calc_ag_resblks,
__entry->freelen = freelen;
__entry->usedlen = usedlen;
),
- TP_printk("dev %d:%d agno %d icount %u aglen %u freelen %u usedlen %u",
+ TP_printk("dev %d:%d agno 0x%x icount %u aglen %u freelen %u usedlen %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->icount,
@@ -850,7 +856,7 @@ TRACE_EVENT(xrep_calc_ag_resblks_btsize,
__entry->rmapbt_sz = rmapbt_sz;
__entry->refcbt_sz = refcbt_sz;
),
- TP_printk("dev %d:%d agno %d bno %u ino %u rmap %u refcount %u",
+ TP_printk("dev %d:%d agno 0x%x bnobt %u inobt %u rmapbt %u refcountbt %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->bnobt_sz,
@@ -894,7 +900,7 @@ TRACE_EVENT(xrep_ialloc_insert,
__entry->freecount = freecount;
__entry->freemask = freemask;
),
- TP_printk("dev %d:%d agno %d startino %u holemask 0x%x count %u freecount %u freemask 0x%llx",
+ TP_printk("dev %d:%d agno 0x%x startino 0x%x holemask 0x%x count %u freecount %u freemask 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->startino,
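
The tracepoint changes above decode sm_flags through a { bit, name } table via __print_flags() instead of dumping a raw hex value. The userspace sketch below shows the same table-driven decoding idea; the bit values and names are illustrative, not the real XFS_SCRUB_* flags, and this is not the tracing infrastructure itself.

#include <stdio.h>

struct flag_name {
	unsigned int	mask;
	const char	*name;
};

static const struct flag_name scrub_flags[] = {
	{ 1U << 0, "repair" },
	{ 1U << 1, "corrupt" },
	{ 1U << 2, "preen" },
	{ 1U << 3, "xfail" },
};

/* Print every set bit's name, joined with '|', like __print_flags(). */
static void print_flags(unsigned int flags)
{
	const char *sep = "";
	size_t i;

	for (i = 0; i < sizeof(scrub_flags) / sizeof(scrub_flags[0]); i++) {
		if (flags & scrub_flags[i].mask) {
			printf("%s%s", sep, scrub_flags[i].name);
			sep = "|";
		}
	}
	if (!*sep)
		printf("none");
	printf("\n");
}

int main(void)
{
	print_flags((1U << 1) | (1U << 2));	/* corrupt|preen */
	return 0;
}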
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index d02bef24b32b..5c52ee869272 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -125,7 +125,7 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
}
struct posix_acl *
-xfs_get_acl(struct inode *inode, int type)
+xfs_get_acl(struct inode *inode, int type, bool rcu)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -137,6 +137,9 @@ xfs_get_acl(struct inode *inode, int type)
};
int error;
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
trace_xfs_get_acl(ip);
switch (type) {
@@ -232,7 +235,7 @@ xfs_acl_set_mode(
inode->i_ctime = current_time(inode);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- if (mp->m_flags & XFS_MOUNT_WSYNC)
+ if (xfs_has_wsync(mp))
xfs_trans_set_sync(tp);
return xfs_trans_commit(tp);
}
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 7bdb3a4ed798..bb6abdcb265d 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -10,13 +10,13 @@ struct inode;
struct posix_acl;
#ifdef CONFIG_XFS_POSIX_ACL
-extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
+extern struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu);
extern int xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct posix_acl *acl, int type);
extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
void xfs_forget_acl(struct inode *inode, const char *name);
#else
-static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type)
+static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu)
{
return NULL;
}
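
The ->get_acl change above adds an rcu argument so XFS can refuse to run under RCU-walk, where it must not sleep, by returning ERR_PTR(-ECHILD) and letting the caller retry in ref-walk mode. Below is a standalone sketch of that bail-out convention; the ERR_PTR helpers are re-implemented locally purely for illustration and get_acl()/struct acl are invented names.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

static inline void *ERR_PTR(long error)	/* encode -errno in a pointer */
{
	return (void *)(intptr_t)error;
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)(intptr_t)ptr;
}

static inline bool IS_ERR(const void *ptr)
{
	return (uintptr_t)ptr >= (uintptr_t)-4095;
}

struct acl { int nentries; };

static struct acl fake_acl = { .nentries = 3 };

/* May sleep to read metadata, so it cannot run in the lockless mode. */
static struct acl *get_acl(bool rcu)
{
	if (rcu)
		return ERR_PTR(-ECHILD);	/* caller must retry in ref-walk mode */
	return &fake_acl;			/* pretend we read it from disk */
}

int main(void)
{
	struct acl *acl = get_acl(true);

	if (IS_ERR(acl) && PTR_ERR(acl) == -ECHILD)
		acl = get_acl(false);		/* retry in the sleepable mode */
	printf("acl entries: %d\n", IS_ERR(acl) ? -1 : acl->nentries);
	return 0;
}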
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index cb4e0fcf4c76..34fc6148032a 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -97,7 +97,7 @@ xfs_end_ioend(
/*
* Just clean up the in-memory structures if the fs has been shut down.
*/
- if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ if (xfs_is_shutdown(ip->i_mount)) {
error = -EIO;
goto done;
}
@@ -260,7 +260,7 @@ xfs_map_blocks(
int retries = 0;
int error = 0;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
/*
@@ -440,7 +440,7 @@ xfs_discard_page(
xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, pageoff);
int error;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
goto out_invalidate;
xfs_alert_ratelimited(mp,
@@ -449,7 +449,7 @@ xfs_discard_page(
error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
i_blocks_per_page(inode, page) - pageoff_fsb);
- if (error && !XFS_FORCED_SHUTDOWN(mp))
+ if (error && !xfs_is_shutdown(mp))
xfs_alert(mp, "page discard unable to remove delalloc mapping.");
out_invalidate:
iomap_invalidatepage(page, pageoff, PAGE_SIZE - pageoff);
@@ -462,22 +462,6 @@ static const struct iomap_writeback_ops xfs_writeback_ops = {
};
STATIC int
-xfs_vm_writepage(
- struct page *page,
- struct writeback_control *wbc)
-{
- struct xfs_writepage_ctx wpc = { };
-
- if (WARN_ON_ONCE(current->journal_info)) {
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return 0;
- }
-
- return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops);
-}
-
-STATIC int
xfs_vm_writepages(
struct address_space *mapping,
struct writeback_control *wbc)
@@ -559,7 +543,6 @@ xfs_iomap_swapfile_activate(
const struct address_space_operations xfs_address_space_operations = {
.readpage = xfs_vm_readpage,
.readahead = xfs_vm_readahead,
- .writepage = xfs_vm_writepage,
.writepages = xfs_vm_writepages,
.set_page_dirty = __set_page_dirty_nobuffers,
.releasepage = iomap_releasepage,
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index aaa7e66c42d7..2b5da6218977 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -151,7 +151,7 @@ xfs_attr3_node_inactive(
}
xfs_da3_node_hdr_from_disk(dp->i_mount, &ichdr, bp->b_addr);
- parent_blkno = bp->b_bn;
+ parent_blkno = xfs_buf_daddr(bp);
if (!ichdr.count) {
xfs_trans_brelse(*trans, bp);
return 0;
@@ -177,7 +177,7 @@ xfs_attr3_node_inactive(
return error;
/* save for re-read later */
- child_blkno = XFS_BUF_ADDR(child_bp);
+ child_blkno = xfs_buf_daddr(child_bp);
/*
* Invalidate the subtree, however we have to.
@@ -271,7 +271,7 @@ xfs_attr3_root_inactive(
error = xfs_da3_node_read(*trans, dp, 0, &bp, XFS_ATTR_FORK);
if (error)
return error;
- blkno = bp->b_bn;
+ blkno = xfs_buf_daddr(bp);
/*
* Invalidate the tree, even if the "tree" is only a single leaf block.
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 25dcc98d50e6..2d1e5134cebe 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -529,7 +529,7 @@ xfs_attr_list(
XFS_STATS_INC(dp->i_mount, xs_attr_list);
- if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+ if (xfs_is_shutdown(dp->i_mount))
return -EIO;
lock_mode = xfs_ilock_attr_map_shared(dp);
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index e3a691937e92..03159970133f 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -24,7 +24,6 @@
#include "xfs_error.h"
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
-#include "xfs_quota.h"
kmem_zone_t *xfs_bui_zone;
kmem_zone_t *xfs_bud_zone;
@@ -487,18 +486,10 @@ xfs_bui_item_recover(
XFS_ATTR_FORK : XFS_DATA_FORK;
bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
- /* Grab the inode. */
- error = xfs_iget(mp, NULL, bmap->me_owner, 0, 0, &ip);
+ error = xlog_recover_iget(mp, bmap->me_owner, &ip);
if (error)
return error;
- error = xfs_qm_dqattach(ip);
- if (error)
- goto err_rele;
-
- if (VFS_I(ip)->i_nlink == 0)
- xfs_iflags_set(ip, XFS_IRECOVERY);
-
/* Allocate transaction and do the work. */
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
@@ -522,6 +513,9 @@ xfs_bui_item_recover(
error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip,
whichfork, bmap->me_startoff, bmap->me_startblock,
&count, state);
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bmap,
+ sizeof(*bmap));
if (error)
goto err_cancel;
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 213a97a921bb..73a36b7be3bd 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -731,7 +731,7 @@ xfs_free_eofblocks(
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
if (error) {
- ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ ASSERT(xfs_is_shutdown(mp));
return error;
}
@@ -789,7 +789,7 @@ xfs_alloc_file_space(
trace_xfs_alloc_file_space(ip);
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
error = xfs_qm_dqattach(ip);
@@ -1282,7 +1282,7 @@ xfs_swap_extents_check_format(
* If we have to use the (expensive) rmap swap method, we can
* handle any number of extents and any format.
*/
- if (xfs_sb_version_hasrmapbt(&ip->i_mount->m_sb))
+ if (xfs_has_rmapbt(ip->i_mount))
return 0;
/*
@@ -1516,7 +1516,7 @@ xfs_swap_extent_forks(
* event of a crash. Set the owner change log flags now and leave the
* bmbt scan as the last step.
*/
- if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
+ if (xfs_has_v3inodes(ip->i_mount)) {
if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE)
(*target_log_flags) |= XFS_ILOG_DOWNER;
if (tip->i_df.if_format == XFS_DINODE_FMT_BTREE)
@@ -1553,7 +1553,7 @@ xfs_swap_extent_forks(
(*src_log_flags) |= XFS_ILOG_DEXT;
break;
case XFS_DINODE_FMT_BTREE:
- ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) ||
+ ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
(*src_log_flags & XFS_ILOG_DOWNER));
(*src_log_flags) |= XFS_ILOG_DBROOT;
break;
@@ -1565,7 +1565,7 @@ xfs_swap_extent_forks(
break;
case XFS_DINODE_FMT_BTREE:
(*target_log_flags) |= XFS_ILOG_DBROOT;
- ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) ||
+ ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
(*target_log_flags & XFS_ILOG_DOWNER));
break;
}
@@ -1626,7 +1626,6 @@ xfs_swap_extents(
struct xfs_bstat *sbp = &sxp->sx_stat;
int src_log_flags, target_log_flags;
int error = 0;
- int lock_flags;
uint64_t f;
int resblks = 0;
unsigned int flags = 0;
@@ -1638,8 +1637,8 @@ xfs_swap_extents(
* do the rest of the checks.
*/
lock_two_nondirectories(VFS_I(ip), VFS_I(tip));
- lock_flags = XFS_MMAPLOCK_EXCL;
- xfs_lock_two_inodes(ip, XFS_MMAPLOCK_EXCL, tip, XFS_MMAPLOCK_EXCL);
+ filemap_invalidate_lock_two(VFS_I(ip)->i_mapping,
+ VFS_I(tip)->i_mapping);
/* Verify that both files have the same format */
if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) {
@@ -1679,7 +1678,7 @@ xfs_swap_extents(
* a block reservation because it's really just a remap operation
* performed with log redo items!
*/
- if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+ if (xfs_has_rmapbt(mp)) {
int w = XFS_DATA_FORK;
uint32_t ipnext = ip->i_df.if_nextents;
uint32_t tipnext = tip->i_df.if_nextents;
@@ -1711,7 +1710,6 @@ xfs_swap_extents(
* or cancel will unlock the inodes from this point onwards.
*/
xfs_lock_two_inodes(ip, XFS_ILOCK_EXCL, tip, XFS_ILOCK_EXCL);
- lock_flags |= XFS_ILOCK_EXCL;
xfs_trans_ijoin(tp, ip, 0);
xfs_trans_ijoin(tp, tip, 0);
@@ -1761,7 +1759,7 @@ xfs_swap_extents(
src_log_flags = XFS_ILOG_CORE;
target_log_flags = XFS_ILOG_CORE;
- if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (xfs_has_rmapbt(mp))
error = xfs_swap_extent_rmap(&tp, ip, tip);
else
error = xfs_swap_extent_forks(tp, ip, tip, &src_log_flags,
@@ -1780,7 +1778,7 @@ xfs_swap_extents(
}
/* Swap the cow forks. */
- if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ if (xfs_has_reflink(mp)) {
ASSERT(!ip->i_cowfp ||
ip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);
ASSERT(!tip->i_cowfp ||
@@ -1822,7 +1820,7 @@ xfs_swap_extents(
* If this is a synchronous mount, make sure that the
* transaction goes to disk before returning to the user.
*/
- if (mp->m_flags & XFS_MOUNT_WSYNC)
+ if (xfs_has_wsync(mp))
xfs_trans_set_sync(tp);
error = xfs_trans_commit(tp);
@@ -1830,13 +1828,16 @@ xfs_swap_extents(
trace_xfs_swap_extent_after(ip, 0);
trace_xfs_swap_extent_after(tip, 1);
+out_unlock_ilock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(tip, XFS_ILOCK_EXCL);
out_unlock:
- xfs_iunlock(ip, lock_flags);
- xfs_iunlock(tip, lock_flags);
+ filemap_invalidate_unlock_two(VFS_I(ip)->i_mapping,
+ VFS_I(tip)->i_mapping);
unlock_two_nondirectories(VFS_I(ip), VFS_I(tip));
return error;
out_trans_cancel:
xfs_trans_cancel(tp);
- goto out_unlock;
+ goto out_unlock_ilock;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 8ff42b3585e0..5fa6cd947dd4 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -251,7 +251,7 @@ _xfs_buf_alloc(
return error;
}
- bp->b_bn = map[0].bm_bn;
+ bp->b_rhash_key = map[0].bm_bn;
bp->b_length = 0;
for (i = 0; i < nmaps; i++) {
bp->b_maps[i].bm_bn = map[i].bm_bn;
@@ -315,7 +315,6 @@ xfs_buf_alloc_kmem(
struct xfs_buf *bp,
xfs_buf_flags_t flags)
{
- int align_mask = xfs_buftarg_dma_alignment(bp->b_target);
xfs_km_flags_t kmflag_mask = KM_NOFS;
size_t size = BBTOB(bp->b_length);
@@ -323,7 +322,7 @@ xfs_buf_alloc_kmem(
if (!(flags & XBF_READ))
kmflag_mask |= KM_ZERO;
- bp->b_addr = kmem_alloc_io(size, align_mask, kmflag_mask);
+ bp->b_addr = kmem_alloc(size, kmflag_mask);
if (!bp->b_addr)
return -ENOMEM;
@@ -460,7 +459,7 @@ _xfs_buf_obj_cmp(
*/
BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);
- if (bp->b_bn != map->bm_bn)
+ if (bp->b_rhash_key != map->bm_bn)
return 1;
if (unlikely(bp->b_length != map->bm_len)) {
@@ -482,7 +481,7 @@ static const struct rhashtable_params xfs_buf_hash_params = {
.min_size = 32, /* empty AGs have minimal footprint */
.nelem_hint = 16,
.key_len = sizeof(xfs_daddr_t),
- .key_offset = offsetof(struct xfs_buf, b_bn),
+ .key_offset = offsetof(struct xfs_buf, b_rhash_key),
.head_offset = offsetof(struct xfs_buf, b_rhash_head),
.automatic_shrinking = true,
.obj_cmpfn = _xfs_buf_obj_cmp,
@@ -814,7 +813,7 @@ xfs_buf_read_map(
* buffer.
*/
if (error) {
- if (!XFS_FORCED_SHUTDOWN(target->bt_mount))
+ if (!xfs_is_shutdown(target->bt_mount))
xfs_buf_ioerror_alert(bp, fa);
bp->b_flags &= ~XBF_DONE;
@@ -844,7 +843,7 @@ xfs_buf_readahead_map(
{
struct xfs_buf *bp;
- if (bdi_read_congested(target->bt_bdev->bd_bdi))
+ if (bdi_read_congested(target->bt_bdev->bd_disk->bdi))
return;
xfs_buf_read_map(target, map, nmaps,
@@ -854,7 +853,9 @@ xfs_buf_readahead_map(
/*
* Read an uncached buffer from disk. Allocates and returns a locked
- * buffer containing the disk contents or nothing.
+ * buffer containing the disk contents or nothing. Uncached buffers always have
+ * a cache index of XFS_BUF_DADDR_NULL so we can easily determine if the buffer
+ * is cached or uncached during fault diagnosis.
*/
int
xfs_buf_read_uncached(
@@ -876,7 +877,7 @@ xfs_buf_read_uncached(
/* set up the buffer for a read IO */
ASSERT(bp->b_map_count == 1);
- bp->b_bn = XFS_BUF_DADDR_NULL; /* always null for uncached buffers */
+ bp->b_rhash_key = XFS_BUF_DADDR_NULL;
bp->b_maps[0].bm_bn = daddr;
bp->b_flags |= XBF_READ;
bp->b_ops = ops;
@@ -1145,7 +1146,7 @@ xfs_buf_ioerror_permanent(
return true;
/* At unmount we may treat errors differently */
- if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
+ if (xfs_is_unmounting(mp) && mp->m_fail_unmount)
return true;
return false;
@@ -1179,7 +1180,7 @@ xfs_buf_ioend_handle_error(
* If we've already decided to shutdown the filesystem because of I/O
* errors, there's no point in giving this a retry.
*/
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
goto out_stale;
xfs_buf_ioerror_alert_ratelimited(bp);
@@ -1336,7 +1337,7 @@ xfs_buf_ioerror_alert(
{
xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error",
"metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
- func, (uint64_t)XFS_BUF_ADDR(bp),
+ func, (uint64_t)xfs_buf_daddr(bp),
bp->b_length, -bp->b_error);
}
@@ -1514,17 +1515,18 @@ _xfs_buf_ioapply(
SHUTDOWN_CORRUPT_INCORE);
return;
}
- } else if (bp->b_bn != XFS_BUF_DADDR_NULL) {
+ } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) {
struct xfs_mount *mp = bp->b_mount;
/*
* non-crc filesystems don't attach verifiers during
* log recovery, so don't warn for such filesystems.
*/
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
xfs_warn(mp,
"%s: no buf ops on daddr 0x%llx len %d",
- __func__, bp->b_bn, bp->b_length);
+ __func__, xfs_buf_daddr(bp),
+ bp->b_length);
xfs_hex_dump(bp->b_addr,
XFS_CORRUPTION_DUMP_LEN);
dump_stack();
@@ -1592,7 +1594,7 @@ __xfs_buf_submit(
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
/* on shutdown we stale and complete the buffer immediately */
- if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
+ if (xfs_is_shutdown(bp->b_mount)) {
xfs_buf_ioend_fail(bp);
return -EIO;
}
@@ -1794,7 +1796,7 @@ xfs_buftarg_drain(
xfs_buf_alert_ratelimited(bp,
"XFS: Corruption Alert",
"Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
- (long long)bp->b_bn);
+ (long long)xfs_buf_daddr(bp));
}
xfs_buf_rele(bp);
}
@@ -1809,7 +1811,7 @@ xfs_buftarg_drain(
* down the fs.
*/
if (write_fail) {
- ASSERT(XFS_FORCED_SHUTDOWN(btp->bt_mount));
+ ASSERT(xfs_is_shutdown(btp->bt_mount));
xfs_alert(btp->bt_mount,
"Please run xfs_repair to determine the extent of the problem.");
}
@@ -2302,7 +2304,7 @@ xfs_verify_magic(
struct xfs_mount *mp = bp->b_mount;
int idx;
- idx = xfs_sb_version_hascrc(&mp->m_sb);
+ idx = xfs_has_crc(mp);
if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
return false;
return dmagic == bp->b_ops->magic[idx];
@@ -2320,7 +2322,7 @@ xfs_verify_magic16(
struct xfs_mount *mp = bp->b_mount;
int idx;
- idx = xfs_sb_version_hascrc(&mp->m_sb);
+ idx = xfs_has_crc(mp);
if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))
return false;
return dmagic == bp->b_ops->magic16[idx];
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 464dc548fa23..6b0200b8007d 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -133,7 +133,8 @@ struct xfs_buf {
* fast-path on locking.
*/
struct rhash_head b_rhash_head; /* pag buffer hash node */
- xfs_daddr_t b_bn; /* block number of buffer */
+
+ xfs_daddr_t b_rhash_key; /* buffer cache index */
int b_length; /* size of buffer in BBs */
atomic_t b_hold; /* reference count */
atomic_t b_lru_ref; /* lru reclaim ref count */
@@ -296,18 +297,10 @@ extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *);
extern int xfs_buf_init(void);
extern void xfs_buf_terminate(void);
-/*
- * These macros use the IO block map rather than b_bn. b_bn is now really
- * just for the buffer cache index for cached buffers. As IO does not use b_bn
- * anymore, uncached buffers do not use b_bn at all and hence must modify the IO
- * map directly. Uncached buffers are not allowed to be discontiguous, so this
- * is safe to do.
- *
- * In future, uncached buffers will pass the block number directly to the io
- * request function and hence these macros will go away at that point.
- */
-#define XFS_BUF_ADDR(bp) ((bp)->b_maps[0].bm_bn)
-#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_maps[0].bm_bn = (xfs_daddr_t)(bno))
+static inline xfs_daddr_t xfs_buf_daddr(struct xfs_buf *bp)
+{
+ return bp->b_maps[0].bm_bn;
+}
void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref);
@@ -355,12 +348,6 @@ extern int xfs_setsize_buftarg(struct xfs_buftarg *, unsigned int);
#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
-static inline int
-xfs_buftarg_dma_alignment(struct xfs_buftarg *bt)
-{
- return queue_dma_alignment(bt->bt_bdev->bd_disk->queue);
-}
-
int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic);
bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 2828ce45b701..b1ab100c09e1 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -428,7 +428,7 @@ xfs_buf_item_format(
* occurs during recovery.
*/
if (bip->bli_flags & XFS_BLI_INODE_BUF) {
- if (xfs_sb_version_has_v3inode(&lip->li_mountp->m_sb) ||
+ if (xfs_has_v3inodes(lip->li_mountp) ||
!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
xfs_log_item_in_current_chkpt(lip)))
bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
@@ -581,7 +581,7 @@ xfs_buf_item_push(
if (bp->b_flags & XBF_WRITE_FAIL) {
xfs_buf_alert_ratelimited(bp, "XFS: Failing async write",
"Failing async write on buffer block 0x%llx. Retrying async write.",
- (long long)bp->b_bn);
+ (long long)xfs_buf_daddr(bp));
}
if (!xfs_buf_delwri_queue(bp, buffer_list))
@@ -616,7 +616,7 @@ xfs_buf_item_put(
* that case, the bli is freed on buffer writeback completion.
*/
aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) ||
- XFS_FORCED_SHUTDOWN(lip->li_mountp);
+ xfs_is_shutdown(lip->li_mountp);
dirty = bip->bli_flags & XFS_BLI_DIRTY;
if (dirty && !aborted)
return false;
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index 4775485b4062..a476c7ef5d53 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -219,7 +219,7 @@ xlog_recover_validate_buf_type(
* inconsistent state resulting in verification failures. Hence for now
* just avoid the verification stage for non-crc filesystems
*/
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
@@ -497,7 +497,7 @@ xlog_recover_do_reg_buffer(
if (fa) {
xfs_alert(mp,
"dquot corrupt at %pS trying to replay into block 0x%llx",
- fa, bp->b_bn);
+ fa, xfs_buf_daddr(bp));
goto next;
}
}
@@ -597,7 +597,7 @@ xlog_recover_do_inode_buffer(
* Post recovery validation only works properly on CRC enabled
* filesystems.
*/
- if (xfs_sb_version_hascrc(&mp->m_sb))
+ if (xfs_has_crc(mp))
bp->b_ops = &xfs_inode_buf_ops;
inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog;
@@ -710,7 +710,7 @@ xlog_recover_get_buf_lsn(
uint16_t blft;
/* v4 filesystems always recover immediately */
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
goto recover_immediately;
/*
@@ -787,7 +787,7 @@ xlog_recover_get_buf_lsn(
* the relevant UUID in the superblock.
*/
lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
- if (xfs_sb_version_hasmetauuid(&mp->m_sb))
+ if (xfs_has_metauuid(mp))
uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid;
else
uuid = &((struct xfs_dsb *)blk)->sb_uuid;
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index da1cc683560c..8310005af00f 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -32,7 +32,7 @@ xfs_dir3_get_dtype(
struct xfs_mount *mp,
uint8_t filetype)
{
- if (!xfs_sb_version_hasftype(&mp->m_sb))
+ if (!xfs_has_ftype(mp))
return DT_UNKNOWN;
if (filetype >= XFS_DIR3_FT_MAX)
@@ -512,7 +512,7 @@ xfs_readdir(
trace_xfs_readdir(dp);
- if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+ if (xfs_is_shutdown(dp->i_mount))
return -EIO;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 736df5660f1f..0191de8ce9ce 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -169,7 +169,7 @@ xfs_ioc_trim(
* We haven't recovered the log, so we cannot use our bnobt-guided
* storage zapping commands.
*/
- if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+ if (xfs_has_norecovery(mp))
return -EROFS;
if (copy_from_user(&range, urange, sizeof(range)))
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index ecd5059d6928..c15d61d47a06 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -223,9 +223,9 @@ xfs_qm_init_dquot_blk(
d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
d->dd_diskdq.d_id = cpu_to_be32(curid);
d->dd_diskdq.d_type = type;
- if (curid > 0 && xfs_sb_version_hasbigtime(&mp->m_sb))
+ if (curid > 0 && xfs_has_bigtime(mp))
d->dd_diskdq.d_type |= XFS_DQTYPE_BIGTIME;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid);
xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
@@ -526,7 +526,7 @@ xfs_dquot_check_type(
* expect an exact match for user dquots and for non-root group and
* project dquots.
*/
- if (xfs_sb_version_hascrc(&dqp->q_mount->m_sb) ||
+ if (xfs_has_crc(dqp->q_mount) ||
dqp_type == XFS_DQTYPE_USER || dqp->q_id != 0)
return ddqp_type == dqp_type;
@@ -847,9 +847,6 @@ xfs_qm_dqget_checks(
struct xfs_mount *mp,
xfs_dqtype_t type)
{
- if (WARN_ON_ONCE(!XFS_IS_QUOTA_RUNNING(mp)))
- return -ESRCH;
-
switch (type) {
case XFS_DQTYPE_USER:
if (!XFS_IS_UQUOTA_ON(mp))
@@ -1222,7 +1219,7 @@ xfs_qm_dqflush_check(
/* bigtime flag should never be set on root dquots */
if (dqp->q_type & XFS_DQTYPE_BIGTIME) {
- if (!xfs_sb_version_hasbigtime(&dqp->q_mount->m_sb))
+ if (!xfs_has_bigtime(dqp->q_mount))
return __this_address;
if (dqp->q_id == 0)
return __this_address;
@@ -1301,7 +1298,7 @@ xfs_qm_dqflush(
* buffer always has a valid CRC. This ensures there is no possibility
* of a dquot without an up-to-date CRC getting to disk.
*/
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
dqblk->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn);
xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index f642884a6834..6b5e3cf40c8b 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -54,6 +54,16 @@ struct xfs_dquot_res {
xfs_qwarncnt_t warnings;
};
+static inline bool
+xfs_dquot_res_over_limits(
+ const struct xfs_dquot_res *qres)
+{
+ if ((qres->softlimit && qres->softlimit < qres->reserved) ||
+ (qres->hardlimit && qres->hardlimit < qres->reserved))
+ return true;
+ return false;
+}
+
/*
* The incore dquot structure
*/
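xfs_dquot_res_over_limits() reports whether a reservation has crossed either its soft or hard limit, treating a zero limit as "no limit". A hedged usage sketch, assuming the q_blk/q_ino resource fields of the incore dquot (illustrative only, not a call site from this patch):

	/* hypothetical caller: warn if either block or inode usage is over a limit */
	if (xfs_dquot_res_over_limits(&dqp->q_blk) ||
	    xfs_dquot_res_over_limits(&dqp->q_ino))
		xfs_warn(dqp->q_mount, "dquot 0x%x over its limits", dqp->q_id);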
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 8ed47b739b6c..6a1aae799cf1 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -218,137 +218,3 @@ xfs_qm_dquot_logitem_init(
&xfs_dquot_item_ops);
lp->qli_dquot = dqp;
}
-
-/*------------------ QUOTAOFF LOG ITEMS -------------------*/
-
-static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip)
-{
- return container_of(lip, struct xfs_qoff_logitem, qql_item);
-}
-
-
-/*
- * This returns the number of iovecs needed to log the given quotaoff item.
- * We only need 1 iovec for an quotaoff item. It just logs the
- * quotaoff_log_format structure.
- */
-STATIC void
-xfs_qm_qoff_logitem_size(
- struct xfs_log_item *lip,
- int *nvecs,
- int *nbytes)
-{
- *nvecs += 1;
- *nbytes += sizeof(struct xfs_qoff_logitem);
-}
-
-STATIC void
-xfs_qm_qoff_logitem_format(
- struct xfs_log_item *lip,
- struct xfs_log_vec *lv)
-{
- struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip);
- struct xfs_log_iovec *vecp = NULL;
- struct xfs_qoff_logformat *qlf;
-
- qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QUOTAOFF);
- qlf->qf_type = XFS_LI_QUOTAOFF;
- qlf->qf_size = 1;
- qlf->qf_flags = qflip->qql_flags;
- xlog_finish_iovec(lv, vecp, sizeof(struct xfs_qoff_logitem));
-}
-
-/*
- * There isn't much you can do to push a quotaoff item. It is simply
- * stuck waiting for the log to be flushed to disk.
- */
-STATIC uint
-xfs_qm_qoff_logitem_push(
- struct xfs_log_item *lip,
- struct list_head *buffer_list)
-{
- return XFS_ITEM_LOCKED;
-}
-
-STATIC xfs_lsn_t
-xfs_qm_qoffend_logitem_committed(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
- struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip);
- struct xfs_qoff_logitem *qfs = qfe->qql_start_lip;
-
- xfs_qm_qoff_logitem_relse(qfs);
-
- kmem_free(lip->li_lv_shadow);
- kmem_free(qfe);
- return (xfs_lsn_t)-1;
-}
-
-STATIC void
-xfs_qm_qoff_logitem_release(
- struct xfs_log_item *lip)
-{
- struct xfs_qoff_logitem *qoff = QOFF_ITEM(lip);
-
- if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
- if (qoff->qql_start_lip)
- xfs_qm_qoff_logitem_relse(qoff->qql_start_lip);
- xfs_qm_qoff_logitem_relse(qoff);
- }
-}
-
-static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
- .iop_size = xfs_qm_qoff_logitem_size,
- .iop_format = xfs_qm_qoff_logitem_format,
- .iop_committed = xfs_qm_qoffend_logitem_committed,
- .iop_push = xfs_qm_qoff_logitem_push,
- .iop_release = xfs_qm_qoff_logitem_release,
-};
-
-static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
- .iop_size = xfs_qm_qoff_logitem_size,
- .iop_format = xfs_qm_qoff_logitem_format,
- .iop_push = xfs_qm_qoff_logitem_push,
- .iop_release = xfs_qm_qoff_logitem_release,
-};
-
-/*
- * Delete the quotaoff intent from the AIL and free it. On success,
- * this should only be called for the start item. It can be used for
- * either on shutdown or abort.
- */
-void
-xfs_qm_qoff_logitem_relse(
- struct xfs_qoff_logitem *qoff)
-{
- struct xfs_log_item *lip = &qoff->qql_item;
-
- ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags) ||
- test_bit(XFS_LI_ABORTED, &lip->li_flags) ||
- XFS_FORCED_SHUTDOWN(lip->li_mountp));
- xfs_trans_ail_delete(lip, 0);
- kmem_free(lip->li_lv_shadow);
- kmem_free(qoff);
-}
-
-/*
- * Allocate and initialize an quotaoff item of the correct quota type(s).
- */
-struct xfs_qoff_logitem *
-xfs_qm_qoff_logitem_init(
- struct xfs_mount *mp,
- struct xfs_qoff_logitem *start,
- uint flags)
-{
- struct xfs_qoff_logitem *qf;
-
- qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), 0);
-
- xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
- &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
- qf->qql_item.li_mountp = mp;
- qf->qql_start_lip = start;
- qf->qql_flags = flags;
- return qf;
-}
diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h
index 2b86a43d7ce2..794710c24474 100644
--- a/fs/xfs/xfs_dquot_item.h
+++ b/fs/xfs/xfs_dquot_item.h
@@ -9,7 +9,6 @@
struct xfs_dquot;
struct xfs_trans;
struct xfs_mount;
-struct xfs_qoff_logitem;
struct xfs_dq_logitem {
struct xfs_log_item qli_item; /* common portion */
@@ -17,22 +16,6 @@ struct xfs_dq_logitem {
xfs_lsn_t qli_flush_lsn; /* lsn at last flush */
};
-struct xfs_qoff_logitem {
- struct xfs_log_item qql_item; /* common portion */
- struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */
- unsigned int qql_flags;
-};
-
-
void xfs_qm_dquot_logitem_init(struct xfs_dquot *dqp);
-struct xfs_qoff_logitem *xfs_qm_qoff_logitem_init(struct xfs_mount *mp,
- struct xfs_qoff_logitem *start,
- uint flags);
-void xfs_qm_qoff_logitem_relse(struct xfs_qoff_logitem *);
-struct xfs_qoff_logitem *xfs_trans_get_qoff_item(struct xfs_trans *tp,
- struct xfs_qoff_logitem *startqoff,
- uint flags);
-void xfs_trans_log_quotaoff_item(struct xfs_trans *tp,
- struct xfs_qoff_logitem *qlp);
#endif /* __XFS_DQUOT_ITEM_H__ */
diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c
index 5875c7e1bd28..8966ba842395 100644
--- a/fs/xfs/xfs_dquot_item_recover.c
+++ b/fs/xfs/xfs_dquot_item_recover.c
@@ -136,7 +136,7 @@ xlog_recover_dquot_commit_pass2(
* If the dquot has an LSN in it, recover the dquot only if it's less
* than the lsn of the transaction we are replaying.
*/
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn);
@@ -146,7 +146,7 @@ xlog_recover_dquot_commit_pass2(
}
memcpy(ddq, recddq, item->ri_buf[1].i_len);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
}
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index ce3bc1b291a1..81c445e9489b 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -371,7 +371,7 @@ xfs_buf_corruption_error(
xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR,
"Metadata corruption detected at %pS, %s block 0x%llx",
- fa, bp->b_ops->name, bp->b_bn);
+ fa, bp->b_ops->name, xfs_buf_daddr(bp));
xfs_alert(mp, "Unmount and run xfs_repair");
@@ -402,7 +402,7 @@ xfs_buf_verifier_error(
xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR,
"Metadata %s detected at %pS, %s block 0x%llx %s",
bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
- fa, bp->b_ops->name, bp->b_bn, name);
+ fa, bp->b_ops->name, xfs_buf_daddr(bp), name);
xfs_alert(mp, "Unmount and run xfs_repair");
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 1717b7508356..5735d5ea87ee 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -75,4 +75,16 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp);
#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
#define XFS_PTAG_VERIFIER_ERROR 0x00000100
+#define XFS_PTAG_STRINGS \
+ { XFS_NO_PTAG, "none" }, \
+ { XFS_PTAG_IFLUSH, "iflush" }, \
+ { XFS_PTAG_LOGRES, "logres" }, \
+ { XFS_PTAG_AILDELETE, "aildelete" }, \
+ { XFS_PTAG_ERROR_REPORT , "error_report" }, \
+ { XFS_PTAG_SHUTDOWN_CORRUPT, "corrupt" }, \
+ { XFS_PTAG_SHUTDOWN_IOERROR, "ioerror" }, \
+ { XFS_PTAG_SHUTDOWN_LOGERROR, "logerror" }, \
+ { XFS_PTAG_FSBLOCK_ZERO, "fsb_zero" }, \
+ { XFS_PTAG_VERIFIER_ERROR, "verifier" }
+
#endif /* __XFS_ERROR_H__ */
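XFS_PTAG_STRINGS is a { flag, name } table in the form consumed by the tracepoint pretty-printers, presumably so the new shutdown tracepoint can print the panic tag symbolically. A hedged sketch of how such a table is typically used (the ptag field name is assumed for illustration):

	TP_printk("dev %d:%d tag %s",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __print_flags(__entry->ptag, "|", XFS_PTAG_STRINGS))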
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 1da59bdff245..1064c2342876 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -44,6 +44,7 @@ xfs_fs_encode_fh(
int *max_len,
struct inode *parent)
{
+ struct xfs_mount *mp = XFS_M(inode->i_sb);
struct fid *fid = (struct fid *)fh;
struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fh;
int fileid_type;
@@ -63,8 +64,7 @@ xfs_fs_encode_fh(
* large enough filesystem may contain them, thus the slightly
* confusing looking conditional below.
*/
- if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
- (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES))
+ if (!xfs_has_small_inums(mp) || xfs_is_inode32(mp))
fileid_type |= XFS_FILEID_TYPE_64FLAG;
/*
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 2424230ca2c3..3f8a0713573a 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -629,6 +629,9 @@ xfs_efi_item_recover(
error = xfs_trans_free_extent(tp, efdp, extp->ext_start,
extp->ext_len,
&XFS_RMAP_OINFO_ANY_OWNER, false);
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ extp, sizeof(*extp));
if (error)
goto abort_error;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index cc3cfb12df53..7aa943edfc02 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -185,7 +185,7 @@ xfs_file_fsync(
if (error)
return error;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
xfs_iflags_clear(ip, XFS_ITRUNCATED);
@@ -318,7 +318,7 @@ xfs_file_read_iter(
XFS_STATS_INC(mp, xs_read_calls);
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
if (IS_DAX(inode))
@@ -462,7 +462,7 @@ xfs_dio_write_end_io(
trace_xfs_end_io_direct_write(ip, offset, size);
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ if (xfs_is_shutdown(ip->i_mount))
return -EIO;
if (error)
@@ -814,7 +814,7 @@ xfs_file_write_iter(
if (ocount == 0)
return 0;
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ if (xfs_is_shutdown(ip->i_mount))
return -EIO;
if (IS_DAX(inode))
@@ -1122,7 +1122,7 @@ static inline bool xfs_file_sync_writes(struct file *filp)
{
struct xfs_inode *ip = XFS_I(file_inode(filp));
- if (ip->i_mount->m_flags & XFS_MOUNT_WSYNC)
+ if (xfs_has_wsync(ip->i_mount))
return true;
if (filp->f_flags & (__O_SYNC | O_DSYNC))
return true;
@@ -1153,10 +1153,10 @@ xfs_file_remap_range(
if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ if (!xfs_has_reflink(mp))
return -EOPNOTSUPP;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
/* Prepare and then clone file data. */
@@ -1205,7 +1205,7 @@ xfs_file_open(
{
if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
return -EFBIG;
- if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
+ if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
return 0;
@@ -1277,7 +1277,7 @@ xfs_file_llseek(
{
struct inode *inode = file->f_mapping->host;
- if (XFS_FORCED_SHUTDOWN(XFS_I(inode)->i_mount))
+ if (xfs_is_shutdown(XFS_I(inode)->i_mount))
return -EIO;
switch (whence) {
@@ -1302,7 +1302,7 @@ xfs_file_llseek(
*
* mmap_lock (MM)
* sb_start_pagefault(vfs, freeze)
- * i_mmaplock (XFS - truncate serialisation)
+ * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
* page_lock (MM)
* i_lock (XFS - extent map serialisation)
*/
@@ -1323,24 +1323,27 @@ __xfs_filemap_fault(
file_update_time(vmf->vma->vm_file);
}
- xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode)) {
pfn_t pfn;
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
(write_fault && !vmf->cow_page) ?
&xfs_direct_write_iomap_ops :
&xfs_read_iomap_ops);
if (ret & VM_FAULT_NEEDDSYNC)
ret = dax_finish_sync_fault(vmf, pe_size, pfn);
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
} else {
- if (write_fault)
+ if (write_fault) {
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
ret = iomap_page_mkwrite(vmf,
&xfs_buffered_write_iomap_ops);
- else
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ } else {
ret = filemap_fault(vmf);
+ }
}
- xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (write_fault)
sb_end_pagefault(inode->i_sb);
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index eed6ca5f8f91..6a3ce0f6dc9e 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -295,7 +295,7 @@ xfs_filestream_lookup_ag(
* Set the starting AG using the rotor for inode32, otherwise
* use the directory inode's AG.
*/
- if (mp->m_flags & XFS_MOUNT_32BITINODES) {
+ if (xfs_is_inode32(mp)) {
xfs_agnumber_t rotorstep = xfs_rotorstep;
startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount;
mp->m_agfrotor = (mp->m_agfrotor + 1) %
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
index 3af963743e4d..403226ebb80b 100644
--- a/fs/xfs/xfs_filestream.h
+++ b/fs/xfs/xfs_filestream.h
@@ -21,7 +21,7 @@ static inline int
xfs_inode_is_filestream(
struct xfs_inode *ip)
{
- return (ip->i_mount->m_flags & XFS_MOUNT_FILESTREAMS) ||
+ return xfs_has_filestreams(ip->i_mount) ||
(ip->i_diflags & XFS_DIFLAG_FILESTREAM);
}
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 7d0b09c1366e..48287caad28b 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -61,7 +61,7 @@ xfs_fsmap_to_internal(
static int
xfs_fsmap_owner_to_rmap(
struct xfs_rmap_irec *dest,
- struct xfs_fsmap *src)
+ const struct xfs_fsmap *src)
{
if (!(src->fmr_flags & FMR_OF_SPECIAL_OWNER)) {
dest->rm_owner = src->fmr_owner;
@@ -111,8 +111,8 @@ xfs_fsmap_owner_to_rmap(
/* Convert an rmapbt owner into an fsmap owner. */
static int
xfs_fsmap_owner_from_rmap(
- struct xfs_fsmap *dest,
- struct xfs_rmap_irec *src)
+ struct xfs_fsmap *dest,
+ const struct xfs_rmap_irec *src)
{
dest->fmr_flags = 0;
if (!XFS_RMAP_NON_INODE_OWNER(src->rm_owner)) {
@@ -171,7 +171,7 @@ struct xfs_getfsmap_info {
struct xfs_getfsmap_dev {
u32 dev;
int (*fn)(struct xfs_trans *tp,
- struct xfs_fsmap *keys,
+ const struct xfs_fsmap *keys,
struct xfs_getfsmap_info *info);
};
@@ -192,7 +192,7 @@ STATIC int
xfs_getfsmap_is_shared(
struct xfs_trans *tp,
struct xfs_getfsmap_info *info,
- struct xfs_rmap_irec *rec,
+ const struct xfs_rmap_irec *rec,
bool *stat)
{
struct xfs_mount *mp = tp->t_mountp;
@@ -202,7 +202,7 @@ xfs_getfsmap_is_shared(
int error;
*stat = false;
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ if (!xfs_has_reflink(mp))
return 0;
/* rt files will have no perag structure */
if (!info->pag)
@@ -245,7 +245,7 @@ STATIC int
xfs_getfsmap_helper(
struct xfs_trans *tp,
struct xfs_getfsmap_info *info,
- struct xfs_rmap_irec *rec,
+ const struct xfs_rmap_irec *rec,
xfs_daddr_t rec_daddr)
{
struct xfs_fsmap fmr;
@@ -347,7 +347,7 @@ out:
STATIC int
xfs_getfsmap_datadev_helper(
struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *rec,
+ const struct xfs_rmap_irec *rec,
void *priv)
{
struct xfs_mount *mp = cur->bc_mp;
@@ -365,7 +365,7 @@ xfs_getfsmap_datadev_helper(
STATIC int
xfs_getfsmap_datadev_bnobt_helper(
struct xfs_btree_cur *cur,
- struct xfs_alloc_rec_incore *rec,
+ const struct xfs_alloc_rec_incore *rec,
void *priv)
{
struct xfs_mount *mp = cur->bc_mp;
@@ -389,7 +389,7 @@ xfs_getfsmap_datadev_bnobt_helper(
static void
xfs_getfsmap_set_irec_flags(
struct xfs_rmap_irec *irec,
- struct xfs_fsmap *fmr)
+ const struct xfs_fsmap *fmr)
{
irec->rm_flags = 0;
if (fmr->fmr_flags & FMR_OF_ATTR_FORK)
@@ -404,7 +404,7 @@ xfs_getfsmap_set_irec_flags(
STATIC int
xfs_getfsmap_logdev(
struct xfs_trans *tp,
- struct xfs_fsmap *keys,
+ const struct xfs_fsmap *keys,
struct xfs_getfsmap_info *info)
{
struct xfs_mount *mp = tp->t_mountp;
@@ -451,7 +451,7 @@ xfs_getfsmap_logdev(
STATIC int
xfs_getfsmap_rtdev_rtbitmap_helper(
struct xfs_trans *tp,
- struct xfs_rtalloc_rec *rec,
+ const struct xfs_rtalloc_rec *rec,
void *priv)
{
struct xfs_mount *mp = tp->t_mountp;
@@ -473,7 +473,7 @@ xfs_getfsmap_rtdev_rtbitmap_helper(
STATIC int
__xfs_getfsmap_rtdev(
struct xfs_trans *tp,
- struct xfs_fsmap *keys,
+ const struct xfs_fsmap *keys,
int (*query_fn)(struct xfs_trans *,
struct xfs_getfsmap_info *),
struct xfs_getfsmap_info *info)
@@ -481,16 +481,14 @@ __xfs_getfsmap_rtdev(
struct xfs_mount *mp = tp->t_mountp;
xfs_fsblock_t start_fsb;
xfs_fsblock_t end_fsb;
- xfs_daddr_t eofs;
+ uint64_t eofs;
int error = 0;
eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
if (keys[0].fmr_physical >= eofs)
return 0;
- if (keys[1].fmr_physical >= eofs)
- keys[1].fmr_physical = eofs - 1;
start_fsb = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical);
- end_fsb = XFS_BB_TO_FSB(mp, keys[1].fmr_physical);
+ end_fsb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical));
/* Set up search keys */
info->low.rm_startblock = start_fsb;
@@ -523,27 +521,37 @@ xfs_getfsmap_rtdev_rtbitmap_query(
{
struct xfs_rtalloc_rec alow = { 0 };
struct xfs_rtalloc_rec ahigh = { 0 };
+ struct xfs_mount *mp = tp->t_mountp;
int error;
- xfs_ilock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED);
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED);
+ /*
+ * Set up query parameters to return free rtextents covering the range
+ * we want.
+ */
alow.ar_startext = info->low.rm_startblock;
ahigh.ar_startext = info->high.rm_startblock;
- do_div(alow.ar_startext, tp->t_mountp->m_sb.sb_rextsize);
- if (do_div(ahigh.ar_startext, tp->t_mountp->m_sb.sb_rextsize))
+ do_div(alow.ar_startext, mp->m_sb.sb_rextsize);
+ if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize))
ahigh.ar_startext++;
error = xfs_rtalloc_query_range(tp, &alow, &ahigh,
xfs_getfsmap_rtdev_rtbitmap_helper, info);
if (error)
goto err;
- /* Report any gaps at the end of the rtbitmap */
+ /*
+ * Report any gaps at the end of the rtbitmap by simulating a null
+ * rmap starting at the block after the end of the query range.
+ */
info->last = true;
+ ahigh.ar_startext = min(mp->m_sb.sb_rextents, ahigh.ar_startext);
+
error = xfs_getfsmap_rtdev_rtbitmap_helper(tp, &ahigh, info);
if (error)
goto err;
err:
- xfs_iunlock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED);
+ xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED);
return error;
}
@@ -551,7 +559,7 @@ err:
STATIC int
xfs_getfsmap_rtdev_rtbitmap(
struct xfs_trans *tp,
- struct xfs_fsmap *keys,
+ const struct xfs_fsmap *keys,
struct xfs_getfsmap_info *info)
{
info->missing_owner = XFS_FMR_OWN_UNKNOWN;
@@ -564,7 +572,7 @@ xfs_getfsmap_rtdev_rtbitmap(
STATIC int
__xfs_getfsmap_datadev(
struct xfs_trans *tp,
- struct xfs_fsmap *keys,
+ const struct xfs_fsmap *keys,
struct xfs_getfsmap_info *info,
int (*query_fn)(struct xfs_trans *,
struct xfs_getfsmap_info *,
@@ -579,16 +587,14 @@ __xfs_getfsmap_datadev(
xfs_fsblock_t end_fsb;
xfs_agnumber_t start_ag;
xfs_agnumber_t end_ag;
- xfs_daddr_t eofs;
+ uint64_t eofs;
int error = 0;
eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
if (keys[0].fmr_physical >= eofs)
return 0;
- if (keys[1].fmr_physical >= eofs)
- keys[1].fmr_physical = eofs - 1;
start_fsb = XFS_DADDR_TO_FSB(mp, keys[0].fmr_physical);
- end_fsb = XFS_DADDR_TO_FSB(mp, keys[1].fmr_physical);
+ end_fsb = XFS_DADDR_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical));
/*
* Convert the fsmap low/high keys to AG based keys. Initialize
@@ -716,7 +722,7 @@ xfs_getfsmap_datadev_rmapbt_query(
STATIC int
xfs_getfsmap_datadev_rmapbt(
struct xfs_trans *tp,
- struct xfs_fsmap *keys,
+ const struct xfs_fsmap *keys,
struct xfs_getfsmap_info *info)
{
info->missing_owner = XFS_FMR_OWN_FREE;
@@ -751,7 +757,7 @@ xfs_getfsmap_datadev_bnobt_query(
STATIC int
xfs_getfsmap_datadev_bnobt(
struct xfs_trans *tp,
- struct xfs_fsmap *keys,
+ const struct xfs_fsmap *keys,
struct xfs_getfsmap_info *info)
{
struct xfs_alloc_rec_incore akeys[2];
@@ -859,7 +865,7 @@ xfs_getfsmap(
return -EINVAL;
use_rmap = capable(CAP_SYS_ADMIN) &&
- xfs_sb_version_hasrmapbt(&mp->m_sb);
+ xfs_has_rmapbt(mp);
head->fmh_entries = 0;
/* Set up our device handlers. */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 6ed29b158312..33e26690a8c4 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -19,6 +19,7 @@
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
+#include "xfs_trace.h"
/*
* Write new AG headers to disk. Non-transactional, but need to be
@@ -177,7 +178,7 @@ xfs_growfs_data_private(
* particularly important for shrink because the write verifier
* will fail if sb_fdblocks is ever larger than sb_dblocks.
*/
- if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+ if (xfs_has_lazysbcount(mp))
xfs_log_sb(tp);
xfs_trans_set_sync(tp);
@@ -511,6 +512,11 @@ xfs_fs_goingdown(
* consistent. We don't do an unmount here; just shutdown the shop, make sure
* that absolutely nothing persistent happens to this filesystem after this
* point.
+ *
+ * The shutdown state change is atomic, resulting in the first and only the
+ * first shutdown call processing the shutdown. This means we only shutdown the
+ * log once as it requires, and we don't spam the logs when multiple concurrent
+ * shutdowns race to set the shutdown flags.
*/
void
xfs_do_force_shutdown(
@@ -519,48 +525,37 @@ xfs_do_force_shutdown(
char *fname,
int lnnum)
{
- bool logerror = flags & SHUTDOWN_LOG_IO_ERROR;
-
- /*
- * No need to duplicate efforts.
- */
- if (XFS_FORCED_SHUTDOWN(mp) && !logerror)
- return;
-
- /*
- * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't
- * queue up anybody new on the log reservations, and wakes up
- * everybody who's sleeping on log reservations to tell them
- * the bad news.
- */
- if (xfs_log_force_umount(mp, logerror))
- return;
+ int tag;
+ const char *why;
- if (flags & SHUTDOWN_FORCE_UMOUNT) {
- xfs_alert(mp,
-"User initiated shutdown (0x%x) received. Shutting down filesystem",
- flags);
+ if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate))
return;
- }
-
- if (flags & SHUTDOWN_CORRUPT_INCORE) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
-"Corruption of in-memory data (0x%x) detected at %pS (%s:%d). Shutting down filesystem",
- flags, __return_address, fname, lnnum);
- if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
- xfs_stack_trace();
- } else if (logerror) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
-"Log I/O error (0x%x) detected at %pS (%s:%d). Shutting down filesystem",
- flags, __return_address, fname, lnnum);
+ if (mp->m_sb_bp)
+ mp->m_sb_bp->b_flags |= XBF_DONE;
+
+ if (flags & SHUTDOWN_FORCE_UMOUNT)
+ xfs_alert(mp, "User initiated shutdown received.");
+
+ if (xlog_force_shutdown(mp->m_log, flags)) {
+ tag = XFS_PTAG_SHUTDOWN_LOGERROR;
+ why = "Log I/O Error";
+ } else if (flags & SHUTDOWN_CORRUPT_INCORE) {
+ tag = XFS_PTAG_SHUTDOWN_CORRUPT;
+ why = "Corruption of in-memory data";
} else {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
-"I/O error (0x%x) detected at %pS (%s:%d). Shutting down filesystem",
- flags, __return_address, fname, lnnum);
+ tag = XFS_PTAG_SHUTDOWN_IOERROR;
+ why = "Metadata I/O Error";
}
+ trace_xfs_force_shutdown(mp, tag, flags, fname, lnnum);
+
+ xfs_alert_tag(mp, tag,
+"%s (0x%x) detected at %pS (%s:%d). Shutting down filesystem.",
+ why, flags, __return_address, fname, lnnum);
xfs_alert(mp,
"Please unmount the filesystem and rectify the problem(s)");
+ if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
+ xfs_stack_trace();
}
/*
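The rewritten xfs_do_force_shutdown() above leans on test_and_set_bit() being atomic: exactly one caller sees the bit clear and performs the one-time shutdown work, while every later or concurrent caller returns early. A generic sketch of the idiom (illustrative, not XFS code):

	static unsigned long example_state;	/* bit 0: shutdown already done */

	void example_shutdown_once(void)
	{
		if (test_and_set_bit(0, &example_state))
			return;		/* another caller already won the race */

		/* one-time teardown runs exactly once, even under concurrency */
	}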
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index eb10eacabc8f..72a075bb2c10 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -30,7 +30,7 @@ xfs_health_unmount(
unsigned int checked = 0;
bool warn = false;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return;
/* Measure AG corruption levels. */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 6007683482c6..f2210d927481 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -38,23 +38,11 @@
* radix tree tags when convenient. Avoid existing XFS_IWALK namespace.
*/
enum xfs_icwalk_goal {
- /* Goals that are not related to tags; these must be < 0. */
- XFS_ICWALK_DQRELE = -1,
-
/* Goals directly associated with tagged inodes. */
XFS_ICWALK_BLOCKGC = XFS_ICI_BLOCKGC_TAG,
XFS_ICWALK_RECLAIM = XFS_ICI_RECLAIM_TAG,
};
-#define XFS_ICWALK_NULL_TAG (-1U)
-
-/* Compute the inode radix tree tag for this goal. */
-static inline unsigned int
-xfs_icwalk_tag(enum xfs_icwalk_goal goal)
-{
- return goal < 0 ? XFS_ICWALK_NULL_TAG : goal;
-}
-
static int xfs_icwalk(struct xfs_mount *mp,
enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
static int xfs_icwalk_ag(struct xfs_perag *pag,
@@ -64,9 +52,6 @@ static int xfs_icwalk_ag(struct xfs_perag *pag,
* Private inode cache walk flags for struct xfs_icwalk. Must not
* coincide with XFS_ICWALK_FLAGS_VALID.
*/
-#define XFS_ICWALK_FLAG_DROP_UDQUOT (1U << 31)
-#define XFS_ICWALK_FLAG_DROP_GDQUOT (1U << 30)
-#define XFS_ICWALK_FLAG_DROP_PDQUOT (1U << 29)
/* Stop scanning after icw_scan_limit inodes. */
#define XFS_ICWALK_FLAG_SCAN_LIMIT (1U << 28)
@@ -74,10 +59,7 @@ static int xfs_icwalk_ag(struct xfs_perag *pag,
#define XFS_ICWALK_FLAG_RECLAIM_SICK (1U << 27)
#define XFS_ICWALK_FLAG_UNION (1U << 26) /* union filter algorithm */
-#define XFS_ICWALK_PRIVATE_FLAGS (XFS_ICWALK_FLAG_DROP_UDQUOT | \
- XFS_ICWALK_FLAG_DROP_GDQUOT | \
- XFS_ICWALK_FLAG_DROP_PDQUOT | \
- XFS_ICWALK_FLAG_SCAN_LIMIT | \
+#define XFS_ICWALK_PRIVATE_FLAGS (XFS_ICWALK_FLAG_SCAN_LIMIT | \
XFS_ICWALK_FLAG_RECLAIM_SICK | \
XFS_ICWALK_FLAG_UNION)
@@ -102,8 +84,9 @@ xfs_inode_alloc(
return NULL;
}
- /* VFS doesn't initialise i_mode! */
+ /* VFS doesn't initialise i_mode or i_state! */
VFS_I(ip)->i_mode = 0;
+ VFS_I(ip)->i_state = 0;
XFS_STATS_INC(mp, vn_active);
ASSERT(atomic_read(&ip->i_pincount) == 0);
@@ -220,9 +203,14 @@ static inline void
xfs_blockgc_queue(
struct xfs_perag *pag)
{
+ struct xfs_mount *mp = pag->pag_mount;
+
+ if (!xfs_is_blockgc_enabled(mp))
+ return;
+
rcu_read_lock();
if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
- queue_delayed_work(pag->pag_mount->m_gc_workqueue,
+ queue_delayed_work(pag->pag_mount->m_blockgc_wq,
&pag->pag_blockgc_work,
msecs_to_jiffies(xfs_blockgc_secs * 1000));
rcu_read_unlock();
@@ -301,31 +289,6 @@ xfs_perag_clear_inode_tag(
trace_xfs_perag_clear_inode_tag(mp, pag->pag_agno, tag, _RET_IP_);
}
-/*
- * We set the inode flag atomically with the radix tree tag.
- * Once we get tag lookups on the radix tree, this inode flag
- * can go away.
- */
-void
-xfs_inode_mark_reclaimable(
- struct xfs_inode *ip)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_perag *pag;
-
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
- spin_lock(&pag->pag_ici_lock);
- spin_lock(&ip->i_flags_lock);
-
- xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
- XFS_ICI_RECLAIM_TAG);
- __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
-
- spin_unlock(&ip->i_flags_lock);
- spin_unlock(&pag->pag_ici_lock);
- xfs_perag_put(pag);
-}
-
static inline void
xfs_inew_wait(
struct xfs_inode *ip)
@@ -484,6 +447,21 @@ xfs_iget_check_free_state(
return 0;
}
+/* Make all pending inactivation work start immediately. */
+static void
+xfs_inodegc_queue_all(
+ struct xfs_mount *mp)
+{
+ struct xfs_inodegc *gc;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ gc = per_cpu_ptr(mp->m_inodegc, cpu);
+ if (!llist_empty(&gc->list))
+ queue_work_on(cpu, mp->m_inodegc_wq, &gc->work);
+ }
+}
+
/*
 * Check the validity of the inode we just found in the cache
*/
@@ -516,13 +494,30 @@ xfs_iget_cache_hit(
* reclaimable state, wait for the initialisation to complete
* before continuing.
*
+ * If we're racing with the inactivation worker we also want to wait.
+ * If we're creating a new file, it's possible that the worker
+ * previously marked the inode as free on disk but hasn't finished
+ * updating the incore state yet. The AGI buffer will be dirty and
+ * locked to the icreate transaction, so a synchronous push of the
+ * inodegc workers would result in deadlock. For a regular iget, the
+ * worker is running already, so we might as well wait.
+ *
* XXX(hch): eventually we should do something equivalent to
* wait_on_inode to wait for these flags to be cleared
* instead of polling for it.
*/
- if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM))
+ if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING))
goto out_skip;
+ if (ip->i_flags & XFS_NEED_INACTIVE) {
+ /* Unlinked inodes cannot be re-grabbed. */
+ if (VFS_I(ip)->i_nlink == 0) {
+ error = -ENOENT;
+ goto out_error;
+ }
+ goto out_inodegc_flush;
+ }
+
/*
* Check the inode free state is valid. This also detects lookup
* racing with unlinks.
@@ -570,6 +565,17 @@ out_error:
spin_unlock(&ip->i_flags_lock);
rcu_read_unlock();
return error;
+
+out_inodegc_flush:
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+ /*
+ * Do not wait for the workers, because the caller could hold an AGI
+ * buffer lock. We're just going to sleep in a loop anyway.
+ */
+ if (xfs_is_inodegc_enabled(mp))
+ xfs_inodegc_queue_all(mp);
+ return -EAGAIN;
}
static int
@@ -597,7 +603,7 @@ xfs_iget_cache_miss(
/*
* For version 5 superblocks, if we are initialising a new inode and we
- * are not utilising the XFS_MOUNT_IKEEP inode cluster mode, we can
+ * are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can
* simply build the new inode core with a random generation number.
*
* For version 4 (and older) superblocks, log recovery is dependent on
@@ -605,8 +611,8 @@ xfs_iget_cache_miss(
* value and hence we must also read the inode off disk even when
* initializing new inodes.
*/
- if (xfs_sb_version_has_v3inode(&mp->m_sb) &&
- (flags & XFS_IGET_CREATE) && !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+ if (xfs_has_v3inodes(mp) &&
+ (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) {
VFS_I(ip)->i_generation = prandom_u32();
} else {
struct xfs_buf *bp;
@@ -817,97 +823,6 @@ xfs_icache_inode_is_allocated(
return 0;
}
-#ifdef CONFIG_XFS_QUOTA
-/* Decide if we want to grab this inode to drop its dquots. */
-static bool
-xfs_dqrele_igrab(
- struct xfs_inode *ip)
-{
- bool ret = false;
-
- ASSERT(rcu_read_lock_held());
-
- /* Check for stale RCU freed inode */
- spin_lock(&ip->i_flags_lock);
- if (!ip->i_ino)
- goto out_unlock;
-
- /*
- * Skip inodes that are anywhere in the reclaim machinery because we
- * drop dquots before tagging an inode for reclamation.
- */
- if (ip->i_flags & (XFS_IRECLAIM | XFS_IRECLAIMABLE))
- goto out_unlock;
-
- /*
- * The inode looks alive; try to grab a VFS reference so that it won't
- * get destroyed. If we got the reference, return true to say that
- * we grabbed the inode.
- *
- * If we can't get the reference, then we know the inode had its VFS
- * state torn down and hasn't yet entered the reclaim machinery. Since
- * we also know that dquots are detached from an inode before it enters
- * reclaim, we can skip the inode.
- */
- ret = igrab(VFS_I(ip)) != NULL;
-
-out_unlock:
- spin_unlock(&ip->i_flags_lock);
- return ret;
-}
-
-/* Drop this inode's dquots. */
-static void
-xfs_dqrele_inode(
- struct xfs_inode *ip,
- struct xfs_icwalk *icw)
-{
- if (xfs_iflags_test(ip, XFS_INEW))
- xfs_inew_wait(ip);
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_UDQUOT) {
- xfs_qm_dqrele(ip->i_udquot);
- ip->i_udquot = NULL;
- }
- if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_GDQUOT) {
- xfs_qm_dqrele(ip->i_gdquot);
- ip->i_gdquot = NULL;
- }
- if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_PDQUOT) {
- xfs_qm_dqrele(ip->i_pdquot);
- ip->i_pdquot = NULL;
- }
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_irele(ip);
-}
-
-/*
- * Detach all dquots from incore inodes if we can. The caller must already
- * have dropped the relevant XFS_[UGP]QUOTA_ACTIVE flags so that dquots will
- * not get reattached.
- */
-int
-xfs_dqrele_all_inodes(
- struct xfs_mount *mp,
- unsigned int qflags)
-{
- struct xfs_icwalk icw = { .icw_flags = 0 };
-
- if (qflags & XFS_UQUOTA_ACCT)
- icw.icw_flags |= XFS_ICWALK_FLAG_DROP_UDQUOT;
- if (qflags & XFS_GQUOTA_ACCT)
- icw.icw_flags |= XFS_ICWALK_FLAG_DROP_GDQUOT;
- if (qflags & XFS_PQUOTA_ACCT)
- icw.icw_flags |= XFS_ICWALK_FLAG_DROP_PDQUOT;
-
- return xfs_icwalk(mp, XFS_ICWALK_DQRELE, &icw);
-}
-#else
-# define xfs_dqrele_igrab(ip) (false)
-# define xfs_dqrele_inode(ip, priv) ((void)0)
-#endif /* CONFIG_XFS_QUOTA */
-
/*
* Grab the inode for reclaim exclusively.
*
@@ -976,7 +891,7 @@ xfs_reclaim_inode(
if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
goto out_iunlock;
- if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ if (xfs_is_shutdown(ip->i_mount)) {
xfs_iunpin_wait(ip);
xfs_iflush_abort(ip);
goto reclaim;
@@ -988,6 +903,7 @@ xfs_reclaim_inode(
xfs_iflags_clear(ip, XFS_IFLUSHING);
reclaim:
+ trace_xfs_inode_reclaiming(ip);
/*
* Because we use RCU freeing we need to ensure the inode always appears
@@ -1052,9 +968,8 @@ static inline bool
xfs_want_reclaim_sick(
struct xfs_mount *mp)
{
- return (mp->m_flags & XFS_MOUNT_UNMOUNTING) ||
- (mp->m_flags & XFS_MOUNT_NORECOVERY) ||
- XFS_FORCED_SHUTDOWN(mp);
+ return xfs_is_unmounting(mp) || xfs_has_norecovery(mp) ||
+ xfs_is_shutdown(mp);
}
void
@@ -1447,8 +1362,12 @@ xfs_blockgc_stop(
struct xfs_perag *pag;
xfs_agnumber_t agno;
- for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
+ if (!xfs_clear_blockgc_enabled(mp))
+ return;
+
+ for_each_perag(mp, agno, pag)
cancel_delayed_work_sync(&pag->pag_blockgc_work);
+ trace_xfs_blockgc_stop(mp, __return_address);
}
/* Enable post-EOF and CoW block auto-reclamation. */
@@ -1459,12 +1378,18 @@ xfs_blockgc_start(
struct xfs_perag *pag;
xfs_agnumber_t agno;
+ if (xfs_set_blockgc_enabled(mp))
+ return;
+
+ trace_xfs_blockgc_start(mp, __return_address);
for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
xfs_blockgc_queue(pag);
}
/* Don't try to run block gc on an inode that's in any of these states. */
#define XFS_BLOCKGC_NOGRAB_IFLAGS (XFS_INEW | \
+ XFS_NEED_INACTIVE | \
+ XFS_INACTIVATING | \
XFS_IRECLAIMABLE | \
XFS_IRECLAIM)
/*
@@ -1490,7 +1415,7 @@ xfs_blockgc_igrab(
spin_unlock(&ip->i_flags_lock);
/* nothing to sync during shutdown */
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ if (xfs_is_shutdown(ip->i_mount))
return false;
/* If we can't grab the inode, it must be on its way to reclaim. */
@@ -1536,27 +1461,62 @@ xfs_blockgc_worker(
struct xfs_mount *mp = pag->pag_mount;
int error;
- if (!sb_start_write_trylock(mp->m_super))
- return;
+ trace_xfs_blockgc_worker(mp, __return_address);
+
error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL);
if (error)
xfs_info(mp, "AG %u preallocation gc worker failed, err=%d",
pag->pag_agno, error);
- sb_end_write(mp->m_super);
xfs_blockgc_queue(pag);
}
/*
- * Try to free space in the filesystem by purging eofblocks and cowblocks.
+ * Try to free space in the filesystem by purging inactive inodes, eofblocks
+ * and cowblocks.
*/
int
xfs_blockgc_free_space(
struct xfs_mount *mp,
struct xfs_icwalk *icw)
{
+ int error;
+
trace_xfs_blockgc_free_space(mp, icw, _RET_IP_);
- return xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw);
+ error = xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw);
+ if (error)
+ return error;
+
+ xfs_inodegc_flush(mp);
+ return 0;
+}
+
+/*
+ * Reclaim all the free space that we can by scheduling the background blockgc
+ * and inodegc workers immediately and waiting for them all to clear.
+ */
+void
+xfs_blockgc_flush_all(
+ struct xfs_mount *mp)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ trace_xfs_blockgc_flush_all(mp, __return_address);
+
+ /*
+ * For each blockgc worker, move its queue time up to now. If it
+ * wasn't queued, it will not be requeued. Then flush whatever's
+ * left.
+ */
+ for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
+ mod_delayed_work(pag->pag_mount->m_blockgc_wq,
+ &pag->pag_blockgc_work, 0);
+
+ for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
+ flush_delayed_work(&pag->pag_blockgc_work);
+
+ xfs_inodegc_flush(mp);
}
/*
@@ -1647,8 +1607,6 @@ xfs_icwalk_igrab(
struct xfs_icwalk *icw)
{
switch (goal) {
- case XFS_ICWALK_DQRELE:
- return xfs_dqrele_igrab(ip);
case XFS_ICWALK_BLOCKGC:
return xfs_blockgc_igrab(ip);
case XFS_ICWALK_RECLAIM:
@@ -1672,9 +1630,6 @@ xfs_icwalk_process_inode(
int error = 0;
switch (goal) {
- case XFS_ICWALK_DQRELE:
- xfs_dqrele_inode(ip, icw);
- break;
case XFS_ICWALK_BLOCKGC:
error = xfs_blockgc_scan_inode(ip, icw);
break;
@@ -1712,22 +1667,14 @@ restart:
nr_found = 0;
do {
struct xfs_inode *batch[XFS_LOOKUP_BATCH];
- unsigned int tag = xfs_icwalk_tag(goal);
int error = 0;
int i;
rcu_read_lock();
- if (tag == XFS_ICWALK_NULL_TAG)
- nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
- (void **)batch, first_index,
- XFS_LOOKUP_BATCH);
- else
- nr_found = radix_tree_gang_lookup_tag(
- &pag->pag_ici_root,
- (void **) batch, first_index,
- XFS_LOOKUP_BATCH, tag);
-
+ nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
+ (void **) batch, first_index,
+ XFS_LOOKUP_BATCH, goal);
if (!nr_found) {
done = true;
rcu_read_unlock();
@@ -1805,20 +1752,6 @@ restart:
return last_error;
}
-/* Fetch the next (possibly tagged) per-AG structure. */
-static inline struct xfs_perag *
-xfs_icwalk_get_perag(
- struct xfs_mount *mp,
- xfs_agnumber_t agno,
- enum xfs_icwalk_goal goal)
-{
- unsigned int tag = xfs_icwalk_tag(goal);
-
- if (tag == XFS_ICWALK_NULL_TAG)
- return xfs_perag_get(mp, agno);
- return xfs_perag_get_tag(mp, agno, tag);
-}
-
/* Walk all incore inodes to achieve a given goal. */
static int
xfs_icwalk(
@@ -1829,18 +1762,465 @@ xfs_icwalk(
struct xfs_perag *pag;
int error = 0;
int last_error = 0;
- xfs_agnumber_t agno = 0;
+ xfs_agnumber_t agno;
- while ((pag = xfs_icwalk_get_perag(mp, agno, goal))) {
- agno = pag->pag_agno + 1;
+ for_each_perag_tag(mp, agno, pag, goal) {
error = xfs_icwalk_ag(pag, goal, icw);
- xfs_perag_put(pag);
if (error) {
last_error = error;
- if (error == -EFSCORRUPTED)
+ if (error == -EFSCORRUPTED) {
+ xfs_perag_put(pag);
break;
+ }
}
}
return last_error;
BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID);
}
+
+#ifdef DEBUG
+static void
+xfs_check_delalloc(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_bmbt_irec got;
+ struct xfs_iext_cursor icur;
+
+ if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got))
+ return;
+ do {
+ if (isnullstartblock(got.br_startblock)) {
+ xfs_warn(ip->i_mount,
+ "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]",
+ ip->i_ino,
+ whichfork == XFS_DATA_FORK ? "data" : "cow",
+ got.br_startoff, got.br_blockcount);
+ }
+ } while (xfs_iext_next_extent(ifp, &icur, &got));
+}
+#else
+#define xfs_check_delalloc(ip, whichfork) do { } while (0)
+#endif
+
+/* Schedule the inode for reclaim. */
+static void
+xfs_inodegc_set_reclaimable(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
+
+ if (!xfs_is_shutdown(mp) && ip->i_delayed_blks) {
+ xfs_check_delalloc(ip, XFS_DATA_FORK);
+ xfs_check_delalloc(ip, XFS_COW_FORK);
+ ASSERT(0);
+ }
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+ spin_lock(&pag->pag_ici_lock);
+ spin_lock(&ip->i_flags_lock);
+
+ trace_xfs_inode_set_reclaimable(ip);
+ ip->i_flags &= ~(XFS_NEED_INACTIVE | XFS_INACTIVATING);
+ ip->i_flags |= XFS_IRECLAIMABLE;
+ xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+
+ spin_unlock(&ip->i_flags_lock);
+ spin_unlock(&pag->pag_ici_lock);
+ xfs_perag_put(pag);
+}
+
+/*
+ * Free all speculative preallocations and possibly even the inode itself.
+ * This is the last chance to make changes to an otherwise unreferenced file
+ * before incore reclamation happens.
+ */
+static void
+xfs_inodegc_inactivate(
+ struct xfs_inode *ip)
+{
+ trace_xfs_inode_inactivating(ip);
+ xfs_inactive(ip);
+ xfs_inodegc_set_reclaimable(ip);
+}
+
+void
+xfs_inodegc_worker(
+ struct work_struct *work)
+{
+ struct xfs_inodegc *gc = container_of(work, struct xfs_inodegc,
+ work);
+ struct llist_node *node = llist_del_all(&gc->list);
+ struct xfs_inode *ip, *n;
+
+ WRITE_ONCE(gc->items, 0);
+
+ if (!node)
+ return;
+
+ ip = llist_entry(node, struct xfs_inode, i_gclist);
+ trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits));
+
+ WRITE_ONCE(gc->shrinker_hits, 0);
+ llist_for_each_entry_safe(ip, n, node, i_gclist) {
+ xfs_iflags_set(ip, XFS_INACTIVATING);
+ xfs_inodegc_inactivate(ip);
+ }
+}
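
The worker drains a per-cpu llist: producers push inodes with llist_add() and the worker detaches the whole chain at once with llist_del_all(), then walks it with llist_for_each_entry_safe() because each inode is torn down during the walk. A self-contained userspace sketch of the same lock-free push/drain pattern using C11 atomics follows; the node layout and the single-threaded driver are invented for illustration.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node	*next;
	int		ino;		/* stand-in for the queued inode */
};

static _Atomic(struct node *) gc_list;

/* Producer side: push one node, lock free, like llist_add(). */
static void gc_queue(struct node *n)
{
	struct node *old = atomic_load(&gc_list);

	do {
		n->next = old;
	} while (!atomic_compare_exchange_weak(&gc_list, &old, n));
}

/* Worker side: detach the whole chain in one shot, like llist_del_all(). */
static struct node *gc_drain(void)
{
	return atomic_exchange(&gc_list, NULL);
}

int main(void)
{
	for (int i = 0; i < 4; i++) {
		struct node *n = malloc(sizeof(*n));

		if (!n)
			break;
		n->ino = 100 + i;
		gc_queue(n);
	}

	/* "safe" walk: fetch ->next before inactivating (here: freeing). */
	for (struct node *n = gc_drain(), *next; n; n = next) {
		next = n->next;
		printf("inactivating ino %d\n", n->ino);
		free(n);
	}
	return 0;
}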
+
+/*
+ * Force all currently queued inode inactivation work to run immediately, and
+ * wait for the work to finish. This is done in two passes: queue all the
+ * work in the first pass, then wait for it in the second.
+ */
+void
+xfs_inodegc_flush(
+ struct xfs_mount *mp)
+{
+ struct xfs_inodegc *gc;
+ int cpu;
+
+ if (!xfs_is_inodegc_enabled(mp))
+ return;
+
+ trace_xfs_inodegc_flush(mp, __return_address);
+
+ xfs_inodegc_queue_all(mp);
+
+ for_each_online_cpu(cpu) {
+ gc = per_cpu_ptr(mp->m_inodegc, cpu);
+ flush_work(&gc->work);
+ }
+}
+
+/*
+ * Flush all the pending work and then disable the inode inactivation background
+ * workers and wait for them to stop.
+ */
+void
+xfs_inodegc_stop(
+ struct xfs_mount *mp)
+{
+ struct xfs_inodegc *gc;
+ int cpu;
+
+ if (!xfs_clear_inodegc_enabled(mp))
+ return;
+
+ xfs_inodegc_queue_all(mp);
+
+ for_each_online_cpu(cpu) {
+ gc = per_cpu_ptr(mp->m_inodegc, cpu);
+ cancel_work_sync(&gc->work);
+ }
+ trace_xfs_inodegc_stop(mp, __return_address);
+}
+
+/*
+ * Enable the inode inactivation background workers and schedule deferred inode
+ * inactivation work if there is any.
+ */
+void
+xfs_inodegc_start(
+ struct xfs_mount *mp)
+{
+ if (xfs_set_inodegc_enabled(mp))
+ return;
+
+ trace_xfs_inodegc_start(mp, __return_address);
+ xfs_inodegc_queue_all(mp);
+}
+
+#ifdef CONFIG_XFS_RT
+static inline bool
+xfs_inodegc_want_queue_rt_file(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ uint64_t freertx;
+
+ if (!XFS_IS_REALTIME_INODE(ip))
+ return false;
+
+ freertx = READ_ONCE(mp->m_sb.sb_frextents);
+ return freertx < mp->m_low_rtexts[XFS_LOWSP_5_PCNT];
+}
+#else
+# define xfs_inodegc_want_queue_rt_file(ip) (false)
+#endif /* CONFIG_XFS_RT */
+
+/*
+ * Schedule the inactivation worker when:
+ *
+ * - We've accumulated more than one inode cluster buffer's worth of inodes.
+ * - There is less than 5% free space left.
+ * - Any of the quotas for this inode are near an enforcement limit.
+ */
+static inline bool
+xfs_inodegc_want_queue_work(
+ struct xfs_inode *ip,
+ unsigned int items)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (items > mp->m_ino_geo.inodes_per_cluster)
+ return true;
+
+ if (__percpu_counter_compare(&mp->m_fdblocks,
+ mp->m_low_space[XFS_LOWSP_5_PCNT],
+ XFS_FDBLOCKS_BATCH) < 0)
+ return true;
+
+ if (xfs_inodegc_want_queue_rt_file(ip))
+ return true;
+
+ if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_USER))
+ return true;
+
+ if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_GROUP))
+ return true;
+
+ if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_PROJ))
+ return true;
+
+ return false;
+}
+
+/*
+ * Upper bound on the number of inodes in each AG that can be queued for
+ * inactivation at any given time, to avoid monopolizing the workqueue.
+ */
+#define XFS_INODEGC_MAX_BACKLOG (4 * XFS_INODES_PER_CHUNK)
+
+/*
+ * Make the frontend wait for inactivations when:
+ *
+ * - Memory shrinkers queued the inactivation worker and it hasn't finished.
+ * - The queue depth exceeds the maximum allowable percpu backlog.
+ *
+ * Note: If the current thread is running a transaction, we don't ever want to
+ * wait for other transactions because that could introduce a deadlock.
+ */
+static inline bool
+xfs_inodegc_want_flush_work(
+ struct xfs_inode *ip,
+ unsigned int items,
+ unsigned int shrinker_hits)
+{
+ if (current->journal_info)
+ return false;
+
+ if (shrinker_hits > 0)
+ return true;
+
+ if (items > XFS_INODEGC_MAX_BACKLOG)
+ return true;
+
+ return false;
+}
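
Taken together, the two predicates above separate "kick the background worker soon" from "make the queueing thread wait right now". A standalone sketch of both decisions follows, with invented field names that collapse the free-space, realtime and quota checks into a single flag.

#include <stdbool.h>
#include <stdio.h>

#define MAX_BACKLOG	256	/* stand-in for XFS_INODEGC_MAX_BACKLOG */

struct queue_state {
	unsigned int	items;		/* inodes queued on this cpu */
	unsigned int	cluster_inodes;	/* inodes per cluster buffer */
	bool		low_space;	/* under 5% free, or rt/quota pressure */
	unsigned int	shrinker_hits;	/* memory pressure seen by shrinker */
	bool		in_transaction;	/* caller holds a transaction */
};

/* Should the background worker be scheduled now? */
static bool want_queue_work(const struct queue_state *q)
{
	return q->items > q->cluster_inodes || q->low_space;
}

/* Should the caller block until the worker has drained the queue? */
static bool want_flush_work(const struct queue_state *q)
{
	if (q->in_transaction)
		return false;		/* never wait while in a transaction */
	return q->shrinker_hits > 0 || q->items > MAX_BACKLOG;
}

int main(void)
{
	struct queue_state q = { .items = 300, .cluster_inodes = 32 };

	printf("queue worker: %d, throttle caller: %d\n",
	       want_queue_work(&q), want_flush_work(&q));
	return 0;
}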
+
+/*
+ * Queue a background inactivation worker if there are inodes that need to be
+ * inactivated and higher level xfs code hasn't disabled the background
+ * workers.
+ */
+static void
+xfs_inodegc_queue(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_inodegc *gc;
+ int items;
+ unsigned int shrinker_hits;
+
+ trace_xfs_inode_set_need_inactive(ip);
+ spin_lock(&ip->i_flags_lock);
+ ip->i_flags |= XFS_NEED_INACTIVE;
+ spin_unlock(&ip->i_flags_lock);
+
+ gc = get_cpu_ptr(mp->m_inodegc);
+ llist_add(&ip->i_gclist, &gc->list);
+ items = READ_ONCE(gc->items);
+ WRITE_ONCE(gc->items, items + 1);
+ shrinker_hits = READ_ONCE(gc->shrinker_hits);
+ put_cpu_ptr(gc);
+
+ if (!xfs_is_inodegc_enabled(mp))
+ return;
+
+ if (xfs_inodegc_want_queue_work(ip, items)) {
+ trace_xfs_inodegc_queue(mp, __return_address);
+ queue_work(mp->m_inodegc_wq, &gc->work);
+ }
+
+ if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
+ trace_xfs_inodegc_throttle(mp, __return_address);
+ flush_work(&gc->work);
+ }
+}
+
+/*
+ * Fold the dead CPU inodegc queue into the current CPUs queue.
+ */
+void
+xfs_inodegc_cpu_dead(
+ struct xfs_mount *mp,
+ unsigned int dead_cpu)
+{
+ struct xfs_inodegc *dead_gc, *gc;
+ struct llist_node *first, *last;
+ unsigned int count = 0;
+
+ dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
+ cancel_work_sync(&dead_gc->work);
+
+ if (llist_empty(&dead_gc->list))
+ return;
+
+ first = dead_gc->list.first;
+ last = first;
+ while (last->next) {
+ last = last->next;
+ count++;
+ }
+ dead_gc->list.first = NULL;
+ dead_gc->items = 0;
+
+ /* Add pending work to current CPU */
+ gc = get_cpu_ptr(mp->m_inodegc);
+ llist_add_batch(first, last, &gc->list);
+ count += READ_ONCE(gc->items);
+ WRITE_ONCE(gc->items, count);
+ put_cpu_ptr(gc);
+
+ if (xfs_is_inodegc_enabled(mp)) {
+ trace_xfs_inodegc_queue(mp, __return_address);
+ queue_work(mp->m_inodegc_wq, &gc->work);
+ }
+}
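
Folding the dead CPU's backlog relies on llist_add_batch(), which splices an already-linked chain (first ... last) onto the destination list with a single compare-and-swap. A self-contained sketch of just that splice follows; the node type and driver are illustrative only.

#include <stdatomic.h>
#include <stdio.h>

struct node { struct node *next; int ino; };

static _Atomic(struct node *) dst_head;

/* Splice a pre-linked chain [first..last] onto dst_head in one CAS. */
static void add_batch(struct node *first, struct node *last)
{
	struct node *old = atomic_load(&dst_head);

	do {
		last->next = old;	/* chain tail -> current head */
	} while (!atomic_compare_exchange_weak(&dst_head, &old, first));
}

int main(void)
{
	/* A three-node chain as it might look on the dead CPU's list. */
	struct node c = { NULL, 3 }, b = { &c, 2 }, a = { &b, 1 };

	add_batch(&a, &c);
	for (struct node *n = atomic_load(&dst_head); n; n = n->next)
		printf("moved ino %d\n", n->ino);
	return 0;
}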
+
+/*
+ * We set the inode flag atomically with the radix tree tag. Once we get tag
+ * lookups on the radix tree, this inode flag can go away.
+ *
+ * We always use background reclaim here because even if the inode is clean, it
+ * still may be under IO and hence we have to wait for IO completion to occur
+ * before we can reclaim the inode. The background reclaim path handles this
+ * more efficiently than we can here, so simply let background reclaim tear down
+ * all inodes.
+ */
+void
+xfs_inode_mark_reclaimable(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ bool need_inactive;
+
+ XFS_STATS_INC(mp, vn_reclaim);
+
+ /*
+ * We should never get here with any of the reclaim flags already set.
+ */
+ ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_ALL_IRECLAIM_FLAGS));
+
+ need_inactive = xfs_inode_needs_inactive(ip);
+ if (need_inactive) {
+ xfs_inodegc_queue(ip);
+ return;
+ }
+
+ /* Going straight to reclaim, so drop the dquots. */
+ xfs_qm_dqdetach(ip);
+ xfs_inodegc_set_reclaimable(ip);
+}
+
+/*
+ * Register a phony shrinker so that we can run background inodegc sooner when
+ * there's memory pressure. Inactivation does not itself free any memory but
+ * it does make inodes reclaimable, which eventually frees memory.
+ *
+ * The count function, seek value, and batch value are crafted to trigger the
+ * scan function during the second round of scanning. Hopefully this means
+ * that we reclaimed enough memory that initiating metadata transactions won't
+ * make things worse.
+ */
+#define XFS_INODEGC_SHRINKER_COUNT (1UL << DEF_PRIORITY)
+#define XFS_INODEGC_SHRINKER_BATCH ((XFS_INODEGC_SHRINKER_COUNT / 2) + 1)
+
+static unsigned long
+xfs_inodegc_shrinker_count(
+ struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct xfs_mount *mp = container_of(shrink, struct xfs_mount,
+ m_inodegc_shrinker);
+ struct xfs_inodegc *gc;
+ int cpu;
+
+ if (!xfs_is_inodegc_enabled(mp))
+ return 0;
+
+ for_each_online_cpu(cpu) {
+ gc = per_cpu_ptr(mp->m_inodegc, cpu);
+ if (!llist_empty(&gc->list))
+ return XFS_INODEGC_SHRINKER_COUNT;
+ }
+
+ return 0;
+}
+
+static unsigned long
+xfs_inodegc_shrinker_scan(
+ struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct xfs_mount *mp = container_of(shrink, struct xfs_mount,
+ m_inodegc_shrinker);
+ struct xfs_inodegc *gc;
+ int cpu;
+ bool no_items = true;
+
+ if (!xfs_is_inodegc_enabled(mp))
+ return SHRINK_STOP;
+
+ trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);
+
+ for_each_online_cpu(cpu) {
+ gc = per_cpu_ptr(mp->m_inodegc, cpu);
+ if (!llist_empty(&gc->list)) {
+ unsigned int h = READ_ONCE(gc->shrinker_hits);
+
+ WRITE_ONCE(gc->shrinker_hits, h + 1);
+ queue_work_on(cpu, mp->m_inodegc_wq, &gc->work);
+ no_items = false;
+ }
+ }
+
+ /*
+ * If there are no inodes to inactivate, we don't want the shrinker
+ * to think there's deferred work to call us back about.
+ */
+ if (no_items)
+ return LONG_MAX;
+
+ return SHRINK_STOP;
+}
+
+/* Register a shrinker so we can accelerate inodegc and throttle queuing. */
+int
+xfs_inodegc_register_shrinker(
+ struct xfs_mount *mp)
+{
+ struct shrinker *shrink = &mp->m_inodegc_shrinker;
+
+ shrink->count_objects = xfs_inodegc_shrinker_count;
+ shrink->scan_objects = xfs_inodegc_shrinker_scan;
+ shrink->seeks = 0;
+ shrink->flags = SHRINKER_NONSLAB;
+ shrink->batch = XFS_INODEGC_SHRINKER_BATCH;
+
+ return register_shrinker(shrink);
+}
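
The count, seeks and batch values registered here are what make scan_objects() fire on the second reclaim pass: with seeks == 0 the core shrinker credits roughly count/2 of work per pass and only scans once the accumulated total reaches the batch size. The standalone model below is a deliberate simplification of that bookkeeping, not the real mm/vmscan.c algorithm.

#include <stdio.h>

#define DEF_PRIORITY		12
#define SHRINKER_COUNT		(1UL << DEF_PRIORITY)
#define SHRINKER_BATCH		((SHRINKER_COUNT / 2) + 1)

int main(void)
{
	unsigned long deferred = 0;

	for (int pass = 1; pass <= 3; pass++) {
		/* seeks == 0: each pass contributes about half the count */
		unsigned long total = deferred + SHRINKER_COUNT / 2;

		if (total >= SHRINKER_BATCH) {
			printf("pass %d: scan_objects() runs, inodegc kicked\n",
			       pass);
			deferred = 0;
		} else {
			printf("pass %d: %lu below batch %lu, deferred\n",
			       pass, total, SHRINKER_BATCH);
			deferred = total;
		}
	}
	return 0;
}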
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index c751cc32dc46..2e4cfddf8b8e 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -59,6 +59,7 @@ int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp,
unsigned int iwalk_flags);
int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int iwalk_flags);
int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_icwalk *icm);
+void xfs_blockgc_flush_all(struct xfs_mount *mp);
void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
@@ -68,16 +69,17 @@ void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip);
void xfs_blockgc_worker(struct work_struct *work);
-#ifdef CONFIG_XFS_QUOTA
-int xfs_dqrele_all_inodes(struct xfs_mount *mp, unsigned int qflags);
-#else
-# define xfs_dqrele_all_inodes(mp, qflags) (0)
-#endif
-
int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_ino_t ino, bool *inuse);
void xfs_blockgc_stop(struct xfs_mount *mp);
void xfs_blockgc_start(struct xfs_mount *mp);
+void xfs_inodegc_worker(struct work_struct *work);
+void xfs_inodegc_flush(struct xfs_mount *mp);
+void xfs_inodegc_stop(struct xfs_mount *mp);
+void xfs_inodegc_start(struct xfs_mount *mp);
+void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu);
+int xfs_inodegc_register_shrinker(struct xfs_mount *mp);
+
#endif
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 9b3994b9c716..017904a34c02 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -201,7 +201,7 @@ xlog_recover_icreate_commit_pass2(
if (length != igeo->ialloc_blks &&
length != igeo->ialloc_min_blks) {
xfs_warn(log->l_mp,
- "%s: unsupported chunk length", __FUNCTION__);
+ "%s: unsupported chunk length", __func__);
return -EINVAL;
}
@@ -209,7 +209,7 @@ xlog_recover_icreate_commit_pass2(
if ((count >> mp->m_sb.sb_inopblog) != length) {
xfs_warn(log->l_mp,
"%s: inconsistent inode count and chunk length",
- __FUNCTION__);
+ __func__);
return -EINVAL;
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 990b72ae3635..a4f6f034fb81 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -132,7 +132,7 @@ xfs_ilock_attr_map_shared(
/*
* In addition to i_rwsem in the VFS inode, the xfs inode contains 2
- * multi-reader locks: i_mmap_lock and the i_lock. This routine allows
+ * multi-reader locks: invalidate_lock and the i_lock. This routine allows
* various combinations of the locks to be obtained.
*
* The 3 locks should always be ordered so that the IO lock is obtained first,
@@ -140,23 +140,23 @@ xfs_ilock_attr_map_shared(
*
* Basic locking order:
*
- * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
+ * i_rwsem -> invalidate_lock -> page_lock -> i_ilock
*
* mmap_lock locking order:
*
* i_rwsem -> page lock -> mmap_lock
- * mmap_lock -> i_mmap_lock -> page_lock
+ * mmap_lock -> invalidate_lock -> page_lock
*
* The difference in mmap_lock locking order means that we cannot hold the
- * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
- * fault in pages during copy in/out (for buffered IO) or require the mmap_lock
- * in get_user_pages() to map the user pages into the kernel address space for
- * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because
- * page faults already hold the mmap_lock.
+ * invalidate_lock over syscall based read(2)/write(2) based IO. These IO paths
+ * can fault in pages during copy in/out (for buffered IO) or require the
+ * mmap_lock in get_user_pages() to map the user pages into the kernel address
+ * space for direct IO. Similarly the i_rwsem cannot be taken inside a page
+ * fault because page faults already hold the mmap_lock.
*
* Hence to serialise fully against both syscall and mmap based IO, we need to
- * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both
- * taken in places where we need to invalidate the page cache in a race
+ * take both the i_rwsem and the invalidate_lock. These locks should *only* be
+ * both taken in places where we need to invalidate the page cache in a race
* free manner (e.g. truncate, hole punch and other extent manipulation
* functions).
*/
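
A minimal pthread sketch of the ordering described above follows; the helper names are invented. The point is that a truncate-like operation takes i_rwsem, then invalidate_lock, then the inode lock, while the fault path takes only invalidate_lock (mmap_lock is already held by its caller), so the two orderings never conflict.

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t i_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static pthread_rwlock_t invalidate_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_rwlock_t i_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Extent-manipulation path: serialise against both IO and page faults. */
static void truncate_like_op(void)
{
	pthread_rwlock_wrlock(&i_rwsem);		/* 1: block new IO */
	pthread_rwlock_wrlock(&invalidate_lock);	/* 2: block page faults */
	pthread_rwlock_wrlock(&i_lock);			/* 3: inode metadata */

	printf("page cache invalidated race-free\n");

	pthread_rwlock_unlock(&i_lock);
	pthread_rwlock_unlock(&invalidate_lock);
	pthread_rwlock_unlock(&i_rwsem);
}

/* Fault path: may not take i_rwsem (mmap_lock is already held). */
static void fault_like_op(void)
{
	pthread_rwlock_rdlock(&invalidate_lock);
	printf("fault filled page under invalidate_lock\n");
	pthread_rwlock_unlock(&invalidate_lock);
}

int main(void)
{
	truncate_like_op();
	fault_like_op();
	return 0;
}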
@@ -188,10 +188,13 @@ xfs_ilock(
XFS_IOLOCK_DEP(lock_flags));
}
- if (lock_flags & XFS_MMAPLOCK_EXCL)
- mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
- else if (lock_flags & XFS_MMAPLOCK_SHARED)
- mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
+ if (lock_flags & XFS_MMAPLOCK_EXCL) {
+ down_write_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
+ XFS_MMAPLOCK_DEP(lock_flags));
+ } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
+ down_read_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
+ XFS_MMAPLOCK_DEP(lock_flags));
+ }
if (lock_flags & XFS_ILOCK_EXCL)
mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
@@ -240,10 +243,10 @@ xfs_ilock_nowait(
}
if (lock_flags & XFS_MMAPLOCK_EXCL) {
- if (!mrtryupdate(&ip->i_mmaplock))
+ if (!down_write_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
goto out_undo_iolock;
} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
- if (!mrtryaccess(&ip->i_mmaplock))
+ if (!down_read_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
goto out_undo_iolock;
}
@@ -258,9 +261,9 @@ xfs_ilock_nowait(
out_undo_mmaplock:
if (lock_flags & XFS_MMAPLOCK_EXCL)
- mrunlock_excl(&ip->i_mmaplock);
+ up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
else if (lock_flags & XFS_MMAPLOCK_SHARED)
- mrunlock_shared(&ip->i_mmaplock);
+ up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
out_undo_iolock:
if (lock_flags & XFS_IOLOCK_EXCL)
up_write(&VFS_I(ip)->i_rwsem);
@@ -307,9 +310,9 @@ xfs_iunlock(
up_read(&VFS_I(ip)->i_rwsem);
if (lock_flags & XFS_MMAPLOCK_EXCL)
- mrunlock_excl(&ip->i_mmaplock);
+ up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
else if (lock_flags & XFS_MMAPLOCK_SHARED)
- mrunlock_shared(&ip->i_mmaplock);
+ up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
if (lock_flags & XFS_ILOCK_EXCL)
mrunlock_excl(&ip->i_lock);
@@ -335,7 +338,7 @@ xfs_ilock_demote(
if (lock_flags & XFS_ILOCK_EXCL)
mrdemote(&ip->i_lock);
if (lock_flags & XFS_MMAPLOCK_EXCL)
- mrdemote(&ip->i_mmaplock);
+ downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock);
if (lock_flags & XFS_IOLOCK_EXCL)
downgrade_write(&VFS_I(ip)->i_rwsem);
@@ -343,9 +346,29 @@ xfs_ilock_demote(
}
#if defined(DEBUG) || defined(XFS_WARN)
-int
+static inline bool
+__xfs_rwsem_islocked(
+ struct rw_semaphore *rwsem,
+ bool shared)
+{
+ if (!debug_locks)
+ return rwsem_is_locked(rwsem);
+
+ if (!shared)
+ return lockdep_is_held_type(rwsem, 0);
+
+ /*
+ * We are checking that the lock is held at least in shared
+ * mode but don't care that it might be held exclusively
+ * (i.e. shared | excl). Hence we check if the lock is held
+ * in any mode rather than an explicit shared mode.
+ */
+ return lockdep_is_held_type(rwsem, -1);
+}
+
+bool
xfs_isilocked(
- xfs_inode_t *ip,
+ struct xfs_inode *ip,
uint lock_flags)
{
if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
@@ -355,20 +378,17 @@ xfs_isilocked(
}
if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
- if (!(lock_flags & XFS_MMAPLOCK_SHARED))
- return !!ip->i_mmaplock.mr_writer;
- return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
+ return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock,
+ (lock_flags & XFS_MMAPLOCK_SHARED));
}
- if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
- if (!(lock_flags & XFS_IOLOCK_SHARED))
- return !debug_locks ||
- lockdep_is_held_type(&VFS_I(ip)->i_rwsem, 0);
- return rwsem_is_locked(&VFS_I(ip)->i_rwsem);
+ if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
+ return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem,
+ (lock_flags & XFS_IOLOCK_SHARED));
}
ASSERT(0);
- return 0;
+ return false;
}
#endif
@@ -532,12 +552,10 @@ again:
}
/*
- * xfs_lock_two_inodes() can only be used to lock one type of lock at a time -
- * the mmaplock or the ilock, but not more than one type at a time. If we lock
- * more than one at a time, lockdep will report false positives saying we have
- * violated locking orders. The iolock must be double-locked separately since
- * we use i_rwsem for that. We now support taking one lock EXCL and the other
- * SHARED.
+ * xfs_lock_two_inodes() can only be used to lock ilock. The iolock and
+ * mmaplock must be double-locked separately since we use i_rwsem and
+ * invalidate_lock for that. We now support taking one lock EXCL and the
+ * other SHARED.
*/
void
xfs_lock_two_inodes(
@@ -555,15 +573,8 @@ xfs_lock_two_inodes(
ASSERT(hweight32(ip1_mode) == 1);
ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
- ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
- !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
- ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
- !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
- ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
- !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
- ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
- !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
-
+ ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
+ ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
ASSERT(ip0->i_ino != ip1->i_ino);
if (ip0->i_ino > ip1->i_ino) {
@@ -663,7 +674,7 @@ xfs_lookup(
trace_xfs_lookup(dp, name);
- if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+ if (xfs_is_shutdown(dp->i_mount))
return -EIO;
error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
@@ -705,7 +716,7 @@ xfs_inode_inherit_flags(
di_flags |= XFS_DIFLAG_PROJINHERIT;
} else if (S_ISREG(mode)) {
if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
- xfs_sb_version_hasrealtime(&ip->i_mount->m_sb))
+ xfs_has_realtime(ip->i_mount))
di_flags |= XFS_DIFLAG_REALTIME;
if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
di_flags |= XFS_DIFLAG_EXTSIZE;
@@ -826,8 +837,7 @@ xfs_init_new_inode(
inode->i_rdev = rdev;
ip->i_projid = prid;
- if (dir && !(dir->i_mode & S_ISGID) &&
- (mp->m_flags & XFS_MOUNT_GRPID)) {
+ if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) {
inode_fsuid_set(inode, mnt_userns);
inode->i_gid = dir->i_gid;
inode->i_mode = mode;
@@ -857,7 +867,7 @@ xfs_init_new_inode(
ip->i_extsize = 0;
ip->i_diflags = 0;
- if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
+ if (xfs_has_v3inodes(mp)) {
inode_set_iversion(inode, 1);
ip->i_cowextsize = 0;
ip->i_crtime = tv;
@@ -897,7 +907,7 @@ xfs_init_new_inode(
* this saves us from needing to run a separate transaction to set the
* fork offset in the immediate future.
*/
- if (init_xattrs && xfs_sb_version_hasattr(&mp->m_sb)) {
+ if (init_xattrs && xfs_has_attr(mp)) {
ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0);
}
@@ -976,7 +986,7 @@ xfs_create(
trace_xfs_create(dp, name);
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
prid = xfs_get_initial_prid(dp);
@@ -1068,7 +1078,7 @@ xfs_create(
* create transaction goes to disk before returning to
* the user.
*/
- if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+ if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
xfs_trans_set_sync(tp);
/*
@@ -1130,7 +1140,7 @@ xfs_create_tmpfile(
uint resblks;
xfs_ino_t ino;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
prid = xfs_get_initial_prid(dp);
@@ -1160,7 +1170,7 @@ xfs_create_tmpfile(
if (error)
goto out_trans_cancel;
- if (mp->m_flags & XFS_MOUNT_WSYNC)
+ if (xfs_has_wsync(mp))
xfs_trans_set_sync(tp);
/*
@@ -1220,7 +1230,7 @@ xfs_link(
ASSERT(!S_ISDIR(VFS_I(sip)->i_mode));
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
error = xfs_qm_dqattach(sip);
@@ -1294,7 +1304,7 @@ xfs_link(
* link transaction goes to disk before returning to
* the user.
*/
- if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+ if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
xfs_trans_set_sync(tp);
return xfs_trans_commit(tp);
@@ -1435,10 +1445,10 @@ xfs_release(
return 0;
/* If this is a read-only mount, don't do this (would generate I/O) */
- if (mp->m_flags & XFS_MOUNT_RDONLY)
+ if (xfs_is_readonly(mp))
return 0;
- if (!XFS_FORCED_SHUTDOWN(mp)) {
+ if (!xfs_is_shutdown(mp)) {
int truncated;
/*
@@ -1521,7 +1531,7 @@ xfs_inactive_truncate(
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
if (error) {
- ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ ASSERT(xfs_is_shutdown(mp));
return error;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1592,7 +1602,7 @@ xfs_inactive_ifree(
"Failed to remove inode(s) from unlinked list. "
"Please free space, unmount and run xfs_repair.");
} else {
- ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ ASSERT(xfs_is_shutdown(mp));
}
return error;
}
@@ -1628,7 +1638,7 @@ xfs_inactive_ifree(
* might do that, we need to make sure. Otherwise the
* inode might be lost for a long time or forever.
*/
- if (!XFS_FORCED_SHUTDOWN(mp)) {
+ if (!xfs_is_shutdown(mp)) {
xfs_notice(mp, "%s: xfs_ifree returned error %d",
__func__, error);
xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
@@ -1655,6 +1665,59 @@ xfs_inactive_ifree(
}
/*
+ * Returns true if we need to update the on-disk metadata before we can free
+ * the memory used by this inode. Updates include freeing post-eof
+ * preallocations; freeing COW staging extents; and marking the inode free in
+ * the inobt if it is on the unlinked list.
+ */
+bool
+xfs_inode_needs_inactive(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *cow_ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+
+ /*
+ * If the inode is already free, then there can be nothing
+ * to clean up here.
+ */
+ if (VFS_I(ip)->i_mode == 0)
+ return false;
+
+ /* If this is a read-only mount, don't do this (would generate I/O) */
+ if (xfs_is_readonly(mp))
+ return false;
+
+ /* If the log isn't running, push inodes straight to reclaim. */
+ if (xfs_is_shutdown(mp) || xfs_has_norecovery(mp))
+ return false;
+
+ /* Metadata inodes require explicit resource cleanup. */
+ if (xfs_is_metadata_inode(ip))
+ return false;
+
+ /* Want to clean out the cow blocks if there are any. */
+ if (cow_ifp && cow_ifp->if_bytes > 0)
+ return true;
+
+ /* Unlinked files must be freed. */
+ if (VFS_I(ip)->i_nlink == 0)
+ return true;
+
+ /*
+ * This file isn't being freed, so check if there are post-eof blocks
+ * to free. @force is true because we are evicting an inode from the
+ * cache. Post-eof blocks must be freed, lest we end up with broken
+ * free space accounting.
+ *
+ * Note: don't bother with iolock here since lockdep complains about
+ * acquiring it in reclaim context. We have the only reference to the
+ * inode at this point anyway.
+ */
+ return xfs_can_free_eofblocks(ip, true);
+}
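
As a rough summary, the checks above amount to "skip if nothing can be written, otherwise inactivate if there is on-disk state left to fix up". A hedged standalone sketch with invented fields:

#include <stdbool.h>
#include <stdio.h>

struct inode_state {
	bool	freed;		/* i_mode == 0: already torn down */
	bool	readonly;	/* RO mount: never generate I/O */
	bool	shutdown;	/* fs shut down or norecovery mount */
	bool	metadata;	/* metadata inodes are cleaned up explicitly */
	bool	has_cow;	/* COW fork has staging extents */
	bool	unlinked;	/* i_nlink == 0: must be freed on disk */
	bool	has_posteof;	/* speculative post-EOF blocks to trim */
};

static bool needs_inactive(const struct inode_state *i)
{
	if (i->freed || i->readonly || i->shutdown || i->metadata)
		return false;
	return i->has_cow || i->unlinked || i->has_posteof;
}

int main(void)
{
	struct inode_state i = { .has_posteof = true };

	printf("defer to inodegc? %s\n", needs_inactive(&i) ? "yes" : "no");
	return 0;
}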
+
+/*
* xfs_inactive
*
* This is called when the vnode reference count for the vnode
@@ -1683,7 +1746,7 @@ xfs_inactive(
ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
/* If this is a read-only mount, don't do this (would generate I/O) */
- if (mp->m_flags & XFS_MOUNT_RDONLY)
+ if (xfs_is_readonly(mp))
goto out;
/* Metadata inodes require explicit resource cleanup. */
@@ -1958,7 +2021,7 @@ xfs_iunlink_destroy(
rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
xfs_iunlink_free_item, &freed_anything);
- ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount));
+ ASSERT(freed_anything == false || xfs_is_shutdown(pag->pag_mount));
}
/*
@@ -2703,7 +2766,7 @@ xfs_remove(
trace_xfs_remove(dp, name);
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
error = xfs_qm_dqattach(dp);
@@ -2802,7 +2865,7 @@ xfs_remove(
* remove transaction goes to disk before returning to
* the user.
*/
- if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+ if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
xfs_trans_set_sync(tp);
error = xfs_trans_commit(tp);
@@ -2879,7 +2942,7 @@ xfs_finish_rename(
* If this is a synchronous mount, make sure that the rename transaction
* goes to disk before returning to the user.
*/
- if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+ if (xfs_has_wsync(tp->t_mountp) || xfs_has_dirsync(tp->t_mountp))
xfs_trans_set_sync(tp);
return xfs_trans_commit(tp);
@@ -3462,7 +3525,7 @@ xfs_iflush(
* happen but we need to still do it to ensure backwards compatibility
* with old kernels that predate logging all inode changes.
*/
- if (!xfs_sb_version_has_v3inode(&mp->m_sb))
+ if (!xfs_has_v3inodes(mp))
ip->i_flushiter++;
/*
@@ -3484,7 +3547,7 @@ xfs_iflush(
xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn);
/* Wrap, we never let the log put out DI_MAX_FLUSH */
- if (!xfs_sb_version_has_v3inode(&mp->m_sb)) {
+ if (!xfs_has_v3inodes(mp)) {
if (ip->i_flushiter == DI_MAX_FLUSH)
ip->i_flushiter = 0;
}
@@ -3603,7 +3666,7 @@ xfs_iflush_cluster(
* AIL, leaving a dirty/unpinned inode attached to the buffer
* that otherwise looks like it should be flushed.
*/
- if (XFS_FORCED_SHUTDOWN(mp)) {
+ if (xfs_is_shutdown(mp)) {
xfs_iunpin_wait(ip);
xfs_iflush_abort(ip);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -3741,11 +3804,8 @@ xfs_ilock2_io_mmap(
ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
if (ret)
return ret;
- if (ip1 == ip2)
- xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
- else
- xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL,
- ip2, XFS_MMAPLOCK_EXCL);
+ filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping,
+ VFS_I(ip2)->i_mapping);
return 0;
}
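
filemap_invalidate_lock_two() can replace the open-coded same-inode special case because it takes the two invalidate_locks in a stable, address-based order (and only once when both mappings are the same), so concurrent callers locking the same pair in either order cannot deadlock. A userspace sketch of that ordering rule with plain mutexes; the helper name is invented.

#include <pthread.h>
#include <stdio.h>

/* Lock two mutexes in a stable order so A,B and B,A callers can't deadlock. */
static void lock_two(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {			/* same object: lock it once */
		pthread_mutex_lock(a);
		return;
	}
	if (a > b) {			/* order by address */
		pthread_mutex_t *tmp = a; a = b; b = tmp;
	}
	pthread_mutex_lock(a);
	pthread_mutex_lock(b);
}

int main(void)
{
	pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

	lock_two(&m1, &m2);		/* same order as lock_two(&m2, &m1) */
	printf("both mappings locked\n");
	pthread_mutex_unlock(&m2);
	pthread_mutex_unlock(&m1);
	return 0;
}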
@@ -3755,12 +3815,9 @@ xfs_iunlock2_io_mmap(
struct xfs_inode *ip1,
struct xfs_inode *ip2)
{
- bool same_inode = (ip1 == ip2);
-
- xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
- if (!same_inode)
- xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
+ filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping,
+ VFS_I(ip2)->i_mapping);
inode_unlock(VFS_I(ip2));
- if (!same_inode)
+ if (ip1 != ip2)
inode_unlock(VFS_I(ip1));
}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 4b6703dbffb8..b21b177832d1 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -40,8 +40,8 @@ typedef struct xfs_inode {
/* Transaction and locking information. */
struct xfs_inode_log_item *i_itemp; /* logging information */
mrlock_t i_lock; /* inode lock */
- mrlock_t i_mmaplock; /* inode mmap IO lock */
atomic_t i_pincount; /* inode pin count */
+ struct llist_node i_gclist; /* deferred inactivation list */
/*
* Bitsets of inode metadata that have been checked and/or are sick.
@@ -240,6 +240,7 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */
#define XFS_IPINNED (1 << __XFS_IPINNED_BIT)
#define XFS_IEOFBLOCKS (1 << 9) /* has the preallocblocks tag set */
+#define XFS_NEED_INACTIVE (1 << 10) /* see XFS_INACTIVATING below */
/*
* If this unlinked inode is in the middle of recovery, don't let drop_inode
* truncate and free the inode. This can happen if we iget the inode during
@@ -249,13 +250,29 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
#define XFS_ICOWBLOCKS (1 << 12)/* has the cowblocks tag set */
/*
+ * If we need to update on-disk metadata before this IRECLAIMABLE inode can be
+ * freed, then NEED_INACTIVE will be set. Once we start the updates, the
+ * INACTIVATING bit will be set to keep iget away from this inode. After the
+ * inactivation completes, both flags will be cleared and the inode is a
+ * plain old IRECLAIMABLE inode.
+ */
+#define XFS_INACTIVATING (1 << 13)
+
+/* All inode state flags related to inode reclaim. */
+#define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \
+ XFS_IRECLAIM | \
+ XFS_NEED_INACTIVE | \
+ XFS_INACTIVATING)
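
The lifecycle described above is: a queued inode carries NEED_INACTIVE, the worker additionally sets INACTIVATING while it runs xfs_inactive(), and both bits are cleared once the inode becomes a plain IRECLAIMABLE inode. A tiny standalone sketch of those transitions; the NEED_INACTIVE and INACTIVATING bit positions match the header, while the IRECLAIMABLE value and the driver are placeholders.

#include <stdio.h>

#define IRECLAIMABLE	(1 << 0)	/* placeholder bit for the sketch */
#define NEED_INACTIVE	(1 << 10)
#define INACTIVATING	(1 << 13)

static void show(const char *when, unsigned int flags)
{
	printf("%-14s need=%d inactivating=%d reclaimable=%d\n", when,
	       !!(flags & NEED_INACTIVE), !!(flags & INACTIVATING),
	       !!(flags & IRECLAIMABLE));
}

int main(void)
{
	unsigned int i_flags = 0;

	i_flags |= NEED_INACTIVE;			/* queued for inodegc */
	show("queued", i_flags);

	i_flags |= INACTIVATING;			/* worker picked it up */
	show("inactivating", i_flags);

	i_flags &= ~(NEED_INACTIVE | INACTIVATING);	/* updates done */
	i_flags |= IRECLAIMABLE;
	show("reclaimable", i_flags);
	return 0;
}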
+
+/*
* Per-lifetime flags need to be reset when re-using a reclaimable inode during
* inode lookup. This prevents unintended behaviour on the new inode from
* occurring.
*/
#define XFS_IRECLAIM_RESET_FLAGS \
(XFS_IRECLAIMABLE | XFS_IRECLAIM | \
- XFS_IDIRTY_RELEASE | XFS_ITRUNCATED)
+ XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | XFS_NEED_INACTIVE | \
+ XFS_INACTIVATING)
/*
* Flags for inode locking.
@@ -382,8 +399,7 @@ enum layout_break_reason {
* new subdirectory gets S_ISGID bit from parent.
*/
#define XFS_INHERIT_GID(pip) \
- (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \
- (VFS_I(pip)->i_mode & S_ISGID))
+ (xfs_has_grpid((pip)->i_mount) || (VFS_I(pip)->i_mode & S_ISGID))
int xfs_release(struct xfs_inode *ip);
void xfs_inactive(struct xfs_inode *ip);
@@ -410,7 +426,7 @@ void xfs_ilock(xfs_inode_t *, uint);
int xfs_ilock_nowait(xfs_inode_t *, uint);
void xfs_iunlock(xfs_inode_t *, uint);
void xfs_ilock_demote(xfs_inode_t *, uint);
-int xfs_isilocked(xfs_inode_t *, uint);
+bool xfs_isilocked(struct xfs_inode *, uint);
uint xfs_ilock_data_map_shared(struct xfs_inode *);
uint xfs_ilock_attr_map_shared(struct xfs_inode *);
@@ -493,6 +509,8 @@ extern struct kmem_zone *xfs_inode_zone;
/* The default CoW extent size hint. */
#define XFS_DEFAULT_COWEXTSZ_HINT 32
+bool xfs_inode_needs_inactive(struct xfs_inode *ip);
+
int xfs_iunlink_init(struct xfs_perag *pag);
void xfs_iunlink_destroy(struct xfs_perag *pag);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 35de30849fcc..0659d19c211e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -396,7 +396,7 @@ xfs_inode_to_log_dinode(
/* log a dummy value to ensure log structure is fully initialised */
to->di_next_unlinked = NULLAGINO;
- if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
+ if (xfs_has_v3inodes(ip->i_mount)) {
to->di_version = 3;
to->di_changecount = inode_peek_iversion(inode);
to->di_crtime = xfs_inode_to_log_dinode_ts(ip, ip->i_crtime);
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index e0072a6cd2d3..239dd2e3384e 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -295,7 +295,7 @@ xlog_recover_inode_commit_pass2(
* superblock flag to determine whether we need to look at di_flushiter
* to skip replay when the on disk inode is newer than the log one
*/
- if (!xfs_sb_version_has_v3inode(&mp->m_sb) &&
+ if (!xfs_has_v3inodes(mp) &&
ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
/*
* Deal with the wrap case, DI_MAX_FLUSH is less
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 16039ea10ac9..0c795dc093ef 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -756,7 +756,7 @@ xfs_ioc_fsbulkstat(
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
if (copy_from_user(&bulkreq, arg, sizeof(struct xfs_fsop_bulkreq)))
@@ -927,7 +927,7 @@ xfs_ioc_bulkstat(
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
if (copy_from_user(&hdr, &arg->hdr, sizeof(hdr)))
@@ -977,7 +977,7 @@ xfs_ioc_inumbers(
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
if (copy_from_user(&hdr, &arg->hdr, sizeof(hdr)))
@@ -1010,7 +1010,7 @@ xfs_ioc_fsgeometry(
struct xfs_fsop_geom fsgeo;
size_t len;
- xfs_fs_geometry(&mp->m_sb, &fsgeo, struct_version);
+ xfs_fs_geometry(mp, &fsgeo, struct_version);
if (struct_version <= 3)
len = sizeof(struct xfs_fsop_geom_v1);
@@ -1213,7 +1213,7 @@ xfs_ioctl_setattr_xflags(
/* diflags2 only valid for v3 inodes. */
i_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
- if (i_flags2 && !xfs_sb_version_has_v3inode(&mp->m_sb))
+ if (i_flags2 && !xfs_has_v3inodes(mp))
return -EINVAL;
ip->i_diflags = xfs_flags2diflags(ip, fa->fsx_xflags);
@@ -1237,8 +1237,7 @@ xfs_ioctl_setattr_prepare_dax(
if (S_ISDIR(inode->i_mode))
return;
- if ((mp->m_flags & XFS_MOUNT_DAX_ALWAYS) ||
- (mp->m_flags & XFS_MOUNT_DAX_NEVER))
+ if (xfs_has_dax_always(mp) || xfs_has_dax_never(mp))
return;
if (((fa->fsx_xflags & FS_XFLAG_DAX) &&
@@ -1263,10 +1262,10 @@ xfs_ioctl_setattr_get_trans(
struct xfs_trans *tp;
int error = -EROFS;
- if (mp->m_flags & XFS_MOUNT_RDONLY)
+ if (xfs_is_readonly(mp))
goto out_error;
error = -EIO;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
goto out_error;
error = xfs_trans_alloc_ichange(ip, NULL, NULL, pdqp,
@@ -1274,7 +1273,7 @@ xfs_ioctl_setattr_get_trans(
if (error)
goto out_error;
- if (mp->m_flags & XFS_MOUNT_WSYNC)
+ if (xfs_has_wsync(mp))
xfs_trans_set_sync(tp);
return tp;
@@ -1362,9 +1361,9 @@ xfs_ioctl_setattr_check_projid(
if (!fa->fsx_valid)
return 0;
- /* Disallow 32bit project ids if projid32bit feature is not enabled. */
+ /* Disallow 32bit project ids if 32bit IDs are not enabled. */
if (fa->fsx_projid > (uint16_t)-1 &&
- !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
+ !xfs_has_projid32(ip->i_mount))
return -EINVAL;
return 0;
}
@@ -1450,7 +1449,7 @@ xfs_fileattr_set(
/* Change the ownerships and register project quota modifications */
if (ip->i_projid != fa->fsx_projid) {
- if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
+ if (XFS_IS_PQUOTA_ON(mp)) {
olddquot = xfs_qm_vop_chown(tp, ip,
&ip->i_pdquot, pdqp);
}
@@ -1467,7 +1466,7 @@ xfs_fileattr_set(
else
ip->i_extsize = 0;
- if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
+ if (xfs_has_v3inodes(mp)) {
if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
ip->i_cowextsize = XFS_B_TO_FSB(mp, fa->fsx_cowextsize);
else
@@ -1792,7 +1791,7 @@ xfs_ioc_swapext(
goto out_put_tmp_file;
}
- if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ if (xfs_is_shutdown(ip->i_mount)) {
error = -EIO;
goto out_put_tmp_file;
}
@@ -2081,7 +2080,7 @@ xfs_file_ioctl(
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (mp->m_flags & XFS_MOUNT_RDONLY)
+ if (xfs_is_readonly(mp))
return -EROFS;
if (copy_from_user(&inout, arg, sizeof(inout)))
@@ -2198,7 +2197,7 @@ xfs_file_ioctl(
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (mp->m_flags & XFS_MOUNT_RDONLY)
+ if (xfs_is_readonly(mp))
return -EROFS;
if (copy_from_user(&eofb, arg, sizeof(eofb)))
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index e6506773ba55..8783af203cfc 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -50,7 +50,7 @@ xfs_compat_ioc_fsgeometry_v1(
{
struct xfs_fsop_geom fsgeo;
- xfs_fs_geometry(&mp->m_sb, &fsgeo, 3);
+ xfs_fs_geometry(mp, &fsgeo, 3);
/* The 32-bit variant simply has some padding at the end */
if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1)))
return -EFAULT;
@@ -254,7 +254,7 @@ xfs_compat_ioc_fsbulkstat(
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
if (get_user(addr, &p32->lastip))
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index d8cd2583dedb..093758440ad5 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -132,7 +132,7 @@ xfs_eof_alignment(
* If mounted with the "-o swalloc" option the alignment is
* increased from the strip unit size to the stripe width.
*/
- if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
+ if (mp->m_swidth && xfs_has_swalloc(mp))
align = mp->m_swidth;
else if (mp->m_dalign)
align = mp->m_dalign;
@@ -734,7 +734,7 @@ xfs_direct_write_iomap_begin(
ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
/*
@@ -874,7 +874,7 @@ xfs_buffered_write_iomap_begin(
int allocfork = XFS_DATA_FORK;
int error = 0;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
/* we can't use delayed allocations when using extent size hints */
@@ -994,7 +994,7 @@ xfs_buffered_write_iomap_begin(
* Determine the initial size of the preallocation.
* We clean up any extra preallocation when the file is closed.
*/
- if (mp->m_flags & XFS_MOUNT_ALLOCSIZE)
+ if (xfs_has_allocsize(mp))
prealloc_blocks = mp->m_allocsize_blocks;
else
prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork,
@@ -1064,11 +1064,11 @@ found_cow:
error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0);
if (error)
return error;
- } else {
- xfs_trim_extent(&cmap, offset_fsb,
- imap.br_startoff - offset_fsb);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
}
- return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
+
+ xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, 0);
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -1127,7 +1127,7 @@ xfs_buffered_write_iomap_end(
error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
end_fsb - start_fsb);
- if (error && !XFS_FORCED_SHUTDOWN(mp)) {
+ if (error && !xfs_is_shutdown(mp)) {
xfs_alert(mp, "%s: unable to clean up ino %lld",
__func__, ip->i_ino);
return error;
@@ -1162,7 +1162,7 @@ xfs_read_iomap_begin(
ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
error = xfs_ilock_for_iomap(ip, flags, &lockmode);
@@ -1203,7 +1203,7 @@ xfs_seek_iomap_begin(
int error = 0;
unsigned lockmode;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
lockmode = xfs_ilock_data_map_shared(ip);
@@ -1285,7 +1285,7 @@ xfs_xattr_iomap_begin(
int nimaps = 1, error = 0;
unsigned lockmode;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
lockmode = xfs_ilock_attr_map_shared(ip);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 93c082db04b7..a607d6aca5c4 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -393,7 +393,7 @@ xfs_vn_unlink(
* but still hashed. This is incompatible with case-insensitive
* mode, so invalidate (unhash) the dentry in CI-mode.
*/
- if (xfs_sb_version_hasasciici(&XFS_M(dir->i_sb)->m_sb))
+ if (xfs_has_asciici(XFS_M(dir->i_sb)))
d_invalidate(dentry);
return 0;
}
@@ -558,10 +558,10 @@ xfs_stat_blksize(
* default buffered I/O size, return that, otherwise return the compat
* default.
*/
- if (mp->m_flags & XFS_MOUNT_LARGEIO) {
+ if (xfs_has_large_iosize(mp)) {
if (mp->m_swidth)
return XFS_FSB_TO_B(mp, mp->m_swidth);
- if (mp->m_flags & XFS_MOUNT_ALLOCSIZE)
+ if (xfs_has_allocsize(mp))
return 1U << mp->m_allocsize_log;
}
@@ -582,7 +582,7 @@ xfs_vn_getattr(
trace_xfs_getattr(ip);
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
stat->size = XFS_ISIZE(ip);
@@ -597,7 +597,7 @@ xfs_vn_getattr(
stat->ctime = inode->i_ctime;
stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks);
- if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
+ if (xfs_has_v3inodes(mp)) {
if (request_mask & STATX_BTIME) {
stat->result_mask |= STATX_BTIME;
stat->btime = ip->i_crtime;
@@ -673,10 +673,10 @@ xfs_vn_change_ok(
{
struct xfs_mount *mp = XFS_I(d_inode(dentry))->i_mount;
- if (mp->m_flags & XFS_MOUNT_RDONLY)
+ if (xfs_is_readonly(mp))
return -EROFS;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
return setattr_prepare(mnt_userns, dentry, iattr);
@@ -778,7 +778,7 @@ xfs_setattr_nonsize(
* in the transaction.
*/
if (!uid_eq(iuid, uid)) {
- if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) {
+ if (XFS_IS_UQUOTA_ON(mp)) {
ASSERT(mask & ATTR_UID);
ASSERT(udqp);
olddquot1 = xfs_qm_vop_chown(tp, ip,
@@ -787,8 +787,8 @@ xfs_setattr_nonsize(
inode->i_uid = uid;
}
if (!gid_eq(igid, gid)) {
- if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
- ASSERT(xfs_sb_version_has_pquotino(&mp->m_sb) ||
+ if (XFS_IS_GQUOTA_ON(mp)) {
+ ASSERT(xfs_has_pquotino(mp) ||
!XFS_IS_PQUOTA_ON(mp));
ASSERT(mask & ATTR_GID);
ASSERT(gdqp);
@@ -808,7 +808,7 @@ xfs_setattr_nonsize(
XFS_STATS_INC(mp, xs_ig_attrchg);
- if (mp->m_flags & XFS_MOUNT_WSYNC)
+ if (xfs_has_wsync(mp))
xfs_trans_set_sync(tp);
error = xfs_trans_commit(tp);
@@ -1037,7 +1037,7 @@ xfs_setattr_size(
XFS_STATS_INC(mp, xs_ig_attrchg);
- if (mp->m_flags & XFS_MOUNT_WSYNC)
+ if (xfs_has_wsync(mp))
xfs_trans_set_sync(tp);
error = xfs_trans_commit(tp);
@@ -1287,11 +1287,11 @@ xfs_inode_should_enable_dax(
{
if (!IS_ENABLED(CONFIG_FS_DAX))
return false;
- if (ip->i_mount->m_flags & XFS_MOUNT_DAX_NEVER)
+ if (xfs_has_dax_never(ip->i_mount))
return false;
if (!xfs_inode_supports_dax(ip))
return false;
- if (ip->i_mount->m_flags & XFS_MOUNT_DAX_ALWAYS)
+ if (xfs_has_dax_always(ip->i_mount))
return true;
if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
return true;
@@ -1344,7 +1344,7 @@ xfs_setup_inode(
gfp_t gfp_mask;
inode->i_ino = ip->i_ino;
- inode->i_state = I_NEW;
+ inode->i_state |= I_NEW;
inode_sb_list_add(inode);
/* make the inode look hashed for the writeback code */
@@ -1401,7 +1401,7 @@ xfs_setup_iops(
inode->i_mapping->a_ops = &xfs_address_space_operations;
break;
case S_IFDIR:
- if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
+ if (xfs_has_asciici(XFS_M(inode->i_sb)))
inode->i_op = &xfs_dir_ci_inode_operations;
else
inode->i_op = &xfs_dir_inode_operations;
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f331975a16de..c08c79d9e311 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -19,6 +19,7 @@
#include "xfs_error.h"
#include "xfs_icache.h"
#include "xfs_health.h"
+#include "xfs_trans.h"
/*
* Bulk Stat
@@ -107,7 +108,7 @@ xfs_bulkstat_one_int(
buf->bs_forkoff = XFS_IFORK_BOFF(ip);
buf->bs_version = XFS_BULKSTAT_VERSION_V5;
- if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
+ if (xfs_has_v3inodes(mp)) {
buf->bs_btime = ip->i_crtime.tv_sec;
buf->bs_btime_nsec = ip->i_crtime.tv_nsec;
if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
@@ -163,6 +164,7 @@ xfs_bulkstat_one(
.formatter = formatter,
.breq = breq,
};
+ struct xfs_trans *tp;
int error;
if (breq->mnt_userns != &init_user_ns) {
@@ -178,9 +180,18 @@ xfs_bulkstat_one(
if (!bc.buf)
return -ENOMEM;
- error = xfs_bulkstat_one_int(breq->mp, breq->mnt_userns, NULL,
- breq->startino, &bc);
+ /*
+ * Grab an empty transaction so that we can use its recursive buffer
+ * locking abilities to detect cycles in the inobt without deadlocking.
+ */
+ error = xfs_trans_alloc_empty(breq->mp, &tp);
+ if (error)
+ goto out;
+ error = xfs_bulkstat_one_int(breq->mp, breq->mnt_userns, tp,
+ breq->startino, &bc);
+ xfs_trans_cancel(tp);
+out:
kmem_free(bc.buf);
/*
@@ -244,6 +255,7 @@ xfs_bulkstat(
.formatter = formatter,
.breq = breq,
};
+ struct xfs_trans *tp;
int error;
if (breq->mnt_userns != &init_user_ns) {
@@ -259,9 +271,18 @@ xfs_bulkstat(
if (!bc.buf)
return -ENOMEM;
- error = xfs_iwalk(breq->mp, NULL, breq->startino, breq->flags,
- xfs_bulkstat_iwalk, breq->icount, &bc);
+ /*
+ * Grab an empty transaction so that we can use its recursive buffer
+ * locking abilities to detect cycles in the inobt without deadlocking.
+ */
+ error = xfs_trans_alloc_empty(breq->mp, &tp);
+ if (error)
+ goto out;
+ error = xfs_iwalk(breq->mp, tp, breq->startino, breq->flags,
+ xfs_bulkstat_iwalk, breq->icount, &bc);
+ xfs_trans_cancel(tp);
+out:
kmem_free(bc.buf);
/*
@@ -374,13 +395,24 @@ xfs_inumbers(
.formatter = formatter,
.breq = breq,
};
+ struct xfs_trans *tp;
int error = 0;
if (xfs_bulkstat_already_done(breq->mp, breq->startino))
return 0;
- error = xfs_inobt_walk(breq->mp, NULL, breq->startino, breq->flags,
+ /*
+ * Grab an empty transaction so that we can use its recursive buffer
+ * locking abilities to detect cycles in the inobt without deadlocking.
+ */
+ error = xfs_trans_alloc_empty(breq->mp, &tp);
+ if (error)
+ goto out;
+
+ error = xfs_inobt_walk(breq->mp, tp, breq->startino, breq->flags,
xfs_inumbers_walk, breq->icount, &ic);
+ xfs_trans_cancel(tp);
+out:
/*
* We found some inode groups, so clear the error status and return
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index 917d51eefee3..7558486f4937 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -83,6 +83,9 @@ struct xfs_iwalk_ag {
/* Skip empty inobt records? */
unsigned int skip_empty:1;
+
+ /* Drop the (hopefully empty) transaction when calling iwalk_fn. */
+ unsigned int drop_trans:1;
};
/*
@@ -352,7 +355,6 @@ xfs_iwalk_run_callbacks(
int *has_more)
{
struct xfs_mount *mp = iwag->mp;
- struct xfs_trans *tp = iwag->tp;
struct xfs_inobt_rec_incore *irec;
xfs_agino_t next_agino;
int error;
@@ -362,10 +364,15 @@ xfs_iwalk_run_callbacks(
ASSERT(iwag->nr_recs > 0);
/* Delete cursor but remember the last record we cached... */
- xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0);
+ xfs_iwalk_del_inobt(iwag->tp, curpp, agi_bpp, 0);
irec = &iwag->recs[iwag->nr_recs - 1];
ASSERT(next_agino >= irec->ir_startino + XFS_INODES_PER_CHUNK);
+ if (iwag->drop_trans) {
+ xfs_trans_cancel(iwag->tp);
+ iwag->tp = NULL;
+ }
+
error = xfs_iwalk_ag_recs(iwag);
if (error)
return error;
@@ -376,8 +383,15 @@ xfs_iwalk_run_callbacks(
if (!has_more)
return 0;
+ if (iwag->drop_trans) {
+ error = xfs_trans_alloc_empty(mp, &iwag->tp);
+ if (error)
+ return error;
+ }
+
/* ...and recreate the cursor just past where we left off. */
- error = xfs_inobt_cur(mp, tp, iwag->pag, XFS_BTNUM_INO, curpp, agi_bpp);
+ error = xfs_inobt_cur(mp, iwag->tp, iwag->pag, XFS_BTNUM_INO, curpp,
+ agi_bpp);
if (error)
return error;
@@ -390,7 +404,6 @@ xfs_iwalk_ag(
struct xfs_iwalk_ag *iwag)
{
struct xfs_mount *mp = iwag->mp;
- struct xfs_trans *tp = iwag->tp;
struct xfs_perag *pag = iwag->pag;
struct xfs_buf *agi_bp = NULL;
struct xfs_btree_cur *cur = NULL;
@@ -469,7 +482,7 @@ xfs_iwalk_ag(
error = xfs_iwalk_run_callbacks(iwag, &cur, &agi_bp, &has_more);
out:
- xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error);
+ xfs_iwalk_del_inobt(iwag->tp, &cur, &agi_bp, error);
return error;
}
@@ -599,8 +612,18 @@ xfs_iwalk_ag_work(
error = xfs_iwalk_alloc(iwag);
if (error)
goto out;
+ /*
+ * Grab an empty transaction so that we can use its recursive buffer
+ * locking abilities to detect cycles in the inobt without deadlocking.
+ */
+ error = xfs_trans_alloc_empty(mp, &iwag->tp);
+ if (error)
+ goto out;
+ iwag->drop_trans = 1;
error = xfs_iwalk_ag(iwag);
+ if (iwag->tp)
+ xfs_trans_cancel(iwag->tp);
xfs_iwalk_free(iwag);
out:
xfs_perag_put(iwag->pag);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 60ac5fd63f1e..f6cd2d4aa770 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -41,6 +41,8 @@ xlog_dealloc_log(
/* local state machine functions */
STATIC void xlog_state_done_syncing(
struct xlog_in_core *iclog);
+STATIC void xlog_state_do_callback(
+ struct xlog *log);
STATIC int
xlog_state_get_iclog_space(
struct xlog *log,
@@ -50,11 +52,6 @@ xlog_state_get_iclog_space(
int *continued_write,
int *logoffsetp);
STATIC void
-xlog_state_switch_iclogs(
- struct xlog *log,
- struct xlog_in_core *iclog,
- int eventual_size);
-STATIC void
xlog_grant_push_ail(
struct xlog *log,
int need_bytes);
@@ -246,7 +243,7 @@ xlog_grant_head_wait(
list_add_tail(&tic->t_queue, &head->waiters);
do {
- if (XLOG_FORCED_SHUTDOWN(log))
+ if (xlog_is_shutdown(log))
goto shutdown;
xlog_grant_push_ail(log, need_bytes);
@@ -260,7 +257,7 @@ xlog_grant_head_wait(
trace_xfs_log_grant_wake(log, tic);
spin_lock(&head->lock);
- if (XLOG_FORCED_SHUTDOWN(log))
+ if (xlog_is_shutdown(log))
goto shutdown;
} while (xlog_space_left(log, &head->grant) < need_bytes);
@@ -298,7 +295,7 @@ xlog_grant_head_check(
int free_bytes;
int error = 0;
- ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
+ ASSERT(!xlog_in_recovery(log));
/*
* If there are other waiters on the queue then give them a chance at
@@ -359,13 +356,13 @@ xfs_log_writable(
* mounts allow internal writes for log recovery and unmount purposes,
* so don't restrict that case.
*/
- if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+ if (xfs_has_norecovery(mp))
return false;
if (xfs_readonly_buftarg(mp->m_ddev_targp))
return false;
if (xfs_readonly_buftarg(mp->m_log->l_targ))
return false;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xlog_is_shutdown(mp->m_log))
return false;
return true;
}
@@ -382,7 +379,7 @@ xfs_log_regrant(
int need_bytes;
int error = 0;
- if (XLOG_FORCED_SHUTDOWN(log))
+ if (xlog_is_shutdown(log))
return -EIO;
XFS_STATS_INC(mp, xs_try_logspace);
@@ -450,7 +447,7 @@ xfs_log_reserve(
ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
- if (XLOG_FORCED_SHUTDOWN(log))
+ if (xlog_is_shutdown(log))
return -EIO;
XFS_STATS_INC(mp, xs_try_logspace);
@@ -487,6 +484,42 @@ out_error:
}
/*
+ * Run all the pending iclog callbacks and wake log force waiters and iclog
+ * space waiters so they can process the newly set shutdown state. We really
+ * don't care what order we process callbacks here because the log is shut down
+ * and so state cannot change on disk anymore.
+ *
+ * We avoid processing actively referenced iclogs so that we don't run callbacks
+ * while the iclog owner might still be preparing the iclog for IO submission.
+ * These will be caught by xlog_state_iclog_release() and call this function
+ * again to process any callbacks that may have been added to that iclog.
+ */
+static void
+xlog_state_shutdown_callbacks(
+ struct xlog *log)
+{
+ struct xlog_in_core *iclog;
+ LIST_HEAD(cb_list);
+
+ spin_lock(&log->l_icloglock);
+ iclog = log->l_iclog;
+ do {
+ if (atomic_read(&iclog->ic_refcnt)) {
+ /* Reference holder will re-run iclog callbacks. */
+ continue;
+ }
+ list_splice_init(&iclog->ic_callbacks, &cb_list);
+ wake_up_all(&iclog->ic_write_wait);
+ wake_up_all(&iclog->ic_force_wait);
+ } while ((iclog = iclog->ic_next) != log->l_iclog);
+
+ wake_up_all(&log->l_flush_wait);
+ spin_unlock(&log->l_icloglock);
+
+ xlog_cil_process_committed(&cb_list);
+}
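
The shutdown path uses a common splice-then-process idiom: while holding l_icloglock it moves each iclog's callback list onto a private list head, then drops the lock and runs the callbacks outside it, since they may block or take other locks. A self-contained userspace sketch of that idiom follows; the intrusive list and callback type are invented.

#include <pthread.h>
#include <stdio.h>

struct cb {
	struct cb	*next;
	const char	*name;
};

static pthread_mutex_t icloglock = PTHREAD_MUTEX_INITIALIZER;
static struct cb *pending;		/* protected by icloglock */

static void run_shutdown_callbacks(void)
{
	struct cb *local;

	pthread_mutex_lock(&icloglock);
	local = pending;		/* splice the whole list... */
	pending = NULL;			/* ...leaving the shared head empty */
	pthread_mutex_unlock(&icloglock);

	/* Callbacks may sleep or take other locks: run them unlocked. */
	for (struct cb *c = local; c; c = c->next)
		printf("callback %s aborted\n", c->name);
}

int main(void)
{
	struct cb b = { NULL, "commit-2" }, a = { &b, "commit-1" };

	pending = &a;
	run_shutdown_callbacks();
	return 0;
}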
+
+/*
* Flush iclog to disk if this is the last reference to the given iclog and it
* is in the WANT_SYNC state.
*
@@ -520,12 +553,11 @@ xlog_state_release_iclog(
xfs_lsn_t old_tail_lsn)
{
xfs_lsn_t tail_lsn;
+ bool last_ref;
+
lockdep_assert_held(&log->l_icloglock);
trace_xlog_iclog_release(iclog, _RET_IP_);
- if (iclog->ic_state == XLOG_STATE_IOERROR)
- return -EIO;
-
/*
* Grabbing the current log tail needs to be atomic w.r.t. the writing
* of the tail LSN into the iclog so we guarantee that the log tail does
@@ -543,7 +575,23 @@ xlog_state_release_iclog(
iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
}
- if (!atomic_dec_and_test(&iclog->ic_refcnt))
+ last_ref = atomic_dec_and_test(&iclog->ic_refcnt);
+
+ if (xlog_is_shutdown(log)) {
+ /*
+ * If there are no more references to this iclog, process the
+ * pending iclog callbacks that were waiting on the release of
+ * this iclog.
+ */
+ if (last_ref) {
+ spin_unlock(&log->l_icloglock);
+ xlog_state_shutdown_callbacks(log);
+ spin_lock(&log->l_icloglock);
+ }
+ return -EIO;
+ }
+
+ if (!last_ref)
return 0;
if (iclog->ic_state != XLOG_STATE_WANT_SYNC) {
@@ -580,25 +628,27 @@ xfs_log_mount(
xfs_daddr_t blk_offset,
int num_bblks)
{
- bool fatal = xfs_sb_version_hascrc(&mp->m_sb);
+ struct xlog *log;
+ bool fatal = xfs_has_crc(mp);
int error = 0;
int min_logfsbs;
- if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
+ if (!xfs_has_norecovery(mp)) {
xfs_notice(mp, "Mounting V%d Filesystem",
XFS_SB_VERSION_NUM(&mp->m_sb));
} else {
xfs_notice(mp,
"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.",
XFS_SB_VERSION_NUM(&mp->m_sb));
- ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
+ ASSERT(xfs_is_readonly(mp));
}
- mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
- if (IS_ERR(mp->m_log)) {
- error = PTR_ERR(mp->m_log);
+ log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
+ if (IS_ERR(log)) {
+ error = PTR_ERR(log);
goto out;
}
+ mp->m_log = log;
/*
* Validate the given log space and drop a critical message via syslog
@@ -663,51 +713,51 @@ xfs_log_mount(
xfs_warn(mp, "AIL initialisation failed: error %d", error);
goto out_free_log;
}
- mp->m_log->l_ailp = mp->m_ail;
+ log->l_ailp = mp->m_ail;
/*
* skip log recovery on a norecovery mount. pretend it all
* just worked.
*/
- if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
- int readonly = (mp->m_flags & XFS_MOUNT_RDONLY);
-
- if (readonly)
- mp->m_flags &= ~XFS_MOUNT_RDONLY;
-
- error = xlog_recover(mp->m_log);
-
+ if (!xfs_has_norecovery(mp)) {
+ /*
+ * log recovery ignores readonly state and so we need to clear
+ * mount-based read only state so it can write to disk.
+ */
+ bool readonly = test_and_clear_bit(XFS_OPSTATE_READONLY,
+ &mp->m_opstate);
+ error = xlog_recover(log);
if (readonly)
- mp->m_flags |= XFS_MOUNT_RDONLY;
+ set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
if (error) {
xfs_warn(mp, "log mount/recovery failed: error %d",
error);
- xlog_recover_cancel(mp->m_log);
+ xlog_recover_cancel(log);
goto out_destroy_ail;
}
}
- error = xfs_sysfs_init(&mp->m_log->l_kobj, &xfs_log_ktype, &mp->m_kobj,
+ error = xfs_sysfs_init(&log->l_kobj, &xfs_log_ktype, &mp->m_kobj,
"log");
if (error)
goto out_destroy_ail;
/* Normal transactions can now occur */
- mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
+ clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);
/*
* Now the log has been fully initialised and we know where our
* space grant counters are, we can initialise the permanent ticket
* needed for delayed logging to work.
*/
- xlog_cil_init_post_recovery(mp->m_log);
+ xlog_cil_init_post_recovery(log);
return 0;
out_destroy_ail:
xfs_trans_ail_destroy(mp);
out_free_log:
- xlog_dealloc_log(mp->m_log);
+ xlog_dealloc_log(log);
out:
return error;
}
@@ -726,19 +776,22 @@ int
xfs_log_mount_finish(
struct xfs_mount *mp)
{
- int error = 0;
- bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY);
- bool recovered = mp->m_log->l_flags & XLOG_RECOVERY_NEEDED;
+ struct xlog *log = mp->m_log;
+ bool readonly;
+ int error = 0;
- if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
- ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
+ if (xfs_has_norecovery(mp)) {
+ ASSERT(xfs_is_readonly(mp));
return 0;
- } else if (readonly) {
- /* Allow unlinked processing to proceed */
- mp->m_flags &= ~XFS_MOUNT_RDONLY;
}
/*
+ * log recovery ignores readonly state and so we need to clear
+ * mount-based read only state so it can write to disk.
+ */
+ readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
+
+ /*
* During the second phase of log recovery, we need iget and
* iput to behave like they do for an active filesystem.
* xfs_fs_drop_inode needs to be able to prevent the deletion
@@ -759,7 +812,8 @@ xfs_log_mount_finish(
* mount failure occurs.
*/
mp->m_super->s_flags |= SB_ACTIVE;
- error = xlog_recover_finish(mp->m_log);
+ if (xlog_recovery_needed(log))
+ error = xlog_recover_finish(log);
if (!error)
xfs_log_work_queue(mp);
mp->m_super->s_flags &= ~SB_ACTIVE;
@@ -774,17 +828,24 @@ xfs_log_mount_finish(
* Don't push in the error case because the AIL may have pending intents
* that aren't removed until recovery is cancelled.
*/
- if (!error && recovered) {
- xfs_log_force(mp, XFS_LOG_SYNC);
- xfs_ail_push_all_sync(mp->m_ail);
+ if (xlog_recovery_needed(log)) {
+ if (!error) {
+ xfs_log_force(mp, XFS_LOG_SYNC);
+ xfs_ail_push_all_sync(mp->m_ail);
+ }
+ xfs_notice(mp, "Ending recovery (logdev: %s)",
+ mp->m_logname ? mp->m_logname : "internal");
+ } else {
+ xfs_info(mp, "Ending clean mount");
}
xfs_buftarg_drain(mp->m_ddev_targp);
+ clear_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate);
if (readonly)
- mp->m_flags |= XFS_MOUNT_RDONLY;
+ set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
/* Make sure the log is dead if we're returning failure. */
- ASSERT(!error || (mp->m_log->l_flags & XLOG_IO_ERROR));
+ ASSERT(!error || xlog_is_shutdown(log));
return error;
}
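/*
 * Editorial sketch (not part of the patch): both mount paths above use the
 * same save/restore idiom around recovery now that read-only state lives in
 * an atomic opstate word:
 *
 *	bool was_ro = test_and_clear_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
 *	error = do_log_recovery_work(mp);	(hypothetical placeholder)
 *	if (was_ro)
 *		set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
 *
 * test_and_clear_bit() returns the previous value, so the read-only state is
 * restored exactly as it was found once recovery has finished writing.
 */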
@@ -830,7 +891,7 @@ xlog_wait_on_iclog(
struct xlog *log = iclog->ic_log;
trace_xlog_iclog_wait_on(iclog, _RET_IP_);
- if (!XLOG_FORCED_SHUTDOWN(log) &&
+ if (!xlog_is_shutdown(log) &&
iclog->ic_state != XLOG_STATE_ACTIVE &&
iclog->ic_state != XLOG_STATE_DIRTY) {
XFS_STATS_INC(log->l_mp, xs_log_force_sleep);
@@ -839,7 +900,7 @@ xlog_wait_on_iclog(
spin_unlock(&log->l_icloglock);
}
- if (XLOG_FORCED_SHUTDOWN(log))
+ if (xlog_is_shutdown(log))
return -EIO;
return 0;
}
@@ -870,7 +931,7 @@ xlog_write_unmount_record(
/* account for space used by record data */
ticket->t_curr_res -= sizeof(ulf);
- return xlog_write(log, &vec, ticket, NULL, NULL, XLOG_UNMOUNT_TRANS);
+ return xlog_write(log, NULL, &vec, ticket, XLOG_UNMOUNT_TRANS);
}
/*
@@ -893,7 +954,7 @@ xlog_unmount_write(
error = xlog_write_unmount_record(log, tic);
/*
* At this point, we're umounting anyway, so there's no point in
- * transitioning log state to IOERROR. Just continue...
+ * transitioning log state to shutdown. Just continue...
*/
out_err:
if (error)
@@ -940,7 +1001,7 @@ xfs_log_unmount_write(
xfs_log_force(mp, XFS_LOG_SYNC);
- if (XLOG_FORCED_SHUTDOWN(log))
+ if (xlog_is_shutdown(log))
return;
/*
@@ -972,6 +1033,20 @@ int
xfs_log_quiesce(
struct xfs_mount *mp)
{
+ /*
+ * Clear log incompat features since we're quiescing the log. Report
+ * failures, though it's not fatal to have a higher log feature
+ * protection level than the log contents actually require.
+ */
+ if (xfs_clear_incompat_log_features(mp)) {
+ int error;
+
+ error = xfs_sync_sb(mp, false);
+ if (error)
+ xfs_warn(mp,
+ "Failed to clear log incompat features on quiesce");
+ }
+
cancel_delayed_work_sync(&mp->m_log->l_work);
xfs_log_force(mp, XFS_LOG_SYNC);
@@ -1049,11 +1124,11 @@ xfs_log_space_wake(
struct xlog *log = mp->m_log;
int free_bytes;
- if (XLOG_FORCED_SHUTDOWN(log))
+ if (xlog_is_shutdown(log))
return;
if (!list_empty_careful(&log->l_write_head.waiters)) {
- ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
+ ASSERT(!xlog_in_recovery(log));
spin_lock(&log->l_write_head.lock);
free_bytes = xlog_space_left(log, &log->l_write_head.grant);
@@ -1062,7 +1137,7 @@ xfs_log_space_wake(
}
if (!list_empty_careful(&log->l_reserve_head.waiters)) {
- ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
+ ASSERT(!xlog_in_recovery(log));
spin_lock(&log->l_reserve_head.lock);
free_bytes = xlog_space_left(log, &log->l_reserve_head.grant);
@@ -1140,7 +1215,7 @@ xfs_log_cover(
ASSERT((xlog_cil_empty(mp->m_log) && xlog_iclogs_empty(mp->m_log) &&
!xfs_ail_min_lsn(mp->m_log->l_ailp)) ||
- XFS_FORCED_SHUTDOWN(mp));
+ xlog_is_shutdown(mp->m_log));
if (!xfs_log_writable(mp))
return 0;
@@ -1157,7 +1232,7 @@ xfs_log_cover(
* handles this for us.
*/
need_covered = xfs_log_need_covered(mp);
- if (!need_covered && !xfs_sb_version_haslazysbcount(&mp->m_sb))
+ if (!need_covered && !xfs_has_lazysbcount(mp))
return 0;
/*
@@ -1230,16 +1305,18 @@ xlog_assign_tail_lsn(
* wrap the tail, we should blow up. Rather than catch this case here,
* we depend on other ASSERTions in other parts of the code. XXXmiken
*
- * This code also handles the case where the reservation head is behind
- * the tail. The details of this case are described below, but the end
- * result is that we return the size of the log as the amount of space left.
+ * If the reservation head is behind the tail, we have a problem. Warn about it,
+ * but then treat it as if the log is empty.
+ *
+ * If the log is shut down, the head and tail may be invalid or out of whack, so
+ * shortcut invalidity asserts in this case so that we don't trigger them
+ * falsely.
*/
STATIC int
xlog_space_left(
struct xlog *log,
atomic64_t *head)
{
- int free_bytes;
int tail_bytes;
int tail_cycle;
int head_cycle;
@@ -1249,29 +1326,30 @@ xlog_space_left(
xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes);
tail_bytes = BBTOB(tail_bytes);
if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
- free_bytes = log->l_logsize - (head_bytes - tail_bytes);
- else if (tail_cycle + 1 < head_cycle)
+ return log->l_logsize - (head_bytes - tail_bytes);
+ if (tail_cycle + 1 < head_cycle)
return 0;
- else if (tail_cycle < head_cycle) {
+
+ /* Ignore potential inconsistency when shutdown. */
+ if (xlog_is_shutdown(log))
+ return log->l_logsize;
+
+ if (tail_cycle < head_cycle) {
ASSERT(tail_cycle == (head_cycle - 1));
- free_bytes = tail_bytes - head_bytes;
- } else {
- /*
- * The reservation head is behind the tail.
- * In this case we just want to return the size of the
- * log as the amount of space left.
- */
- xfs_alert(log->l_mp, "xlog_space_left: head behind tail");
- xfs_alert(log->l_mp,
- " tail_cycle = %d, tail_bytes = %d",
- tail_cycle, tail_bytes);
- xfs_alert(log->l_mp,
- " GH cycle = %d, GH bytes = %d",
- head_cycle, head_bytes);
- ASSERT(0);
- free_bytes = log->l_logsize;
+ return tail_bytes - head_bytes;
}
- return free_bytes;
+
+ /*
+ * The reservation head is behind the tail. In this case we just want to
+ * return the size of the log as the amount of space left.
+ */
+ xfs_alert(log->l_mp, "xlog_space_left: head behind tail");
+ xfs_alert(log->l_mp, " tail_cycle = %d, tail_bytes = %d",
+ tail_cycle, tail_bytes);
+ xfs_alert(log->l_mp, " GH cycle = %d, GH bytes = %d",
+ head_cycle, head_bytes);
+ ASSERT(0);
+ return log->l_logsize;
}
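/*
 * Editorial worked example (illustrative numbers only): for a 16 MiB log,
 * if the tail is at cycle 4, byte 9 MiB and the grant head at cycle 5,
 * byte 1 MiB, the head has wrapped once past the tail, so the
 * tail_cycle < head_cycle branch above returns
 *
 *	tail_bytes - head_bytes = 9 MiB - 1 MiB = 8 MiB
 *
 * of free space. Had both been in cycle 5 with head_bytes >= tail_bytes,
 * the first branch would instead return
 *
 *	l_logsize - (head_bytes - tail_bytes)
 */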
@@ -1349,6 +1427,32 @@ xfs_log_work_queue(
}
/*
+ * Clear the log incompat flags if we have the opportunity.
+ *
+ * This only happens if we're about to log the second dummy transaction as part
+ * of covering the log and we can get the log incompat feature usage lock.
+ */
+static inline void
+xlog_clear_incompat(
+ struct xlog *log)
+{
+ struct xfs_mount *mp = log->l_mp;
+
+ if (!xfs_sb_has_incompat_log_feature(&mp->m_sb,
+ XFS_SB_FEAT_INCOMPAT_LOG_ALL))
+ return;
+
+ if (log->l_covered_state != XLOG_STATE_COVER_DONE2)
+ return;
+
+ if (!down_write_trylock(&log->l_incompat_users))
+ return;
+
+ xfs_clear_incompat_log_features(mp);
+ up_write(&log->l_incompat_users);
+}
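/*
 * Editorial note (assumed usage; the accessors are added later in this
 * patch): code that relies on a log incompat feature brackets its use with
 * the read side of the rwsem so log covering cannot clear the feature bit
 * underneath it:
 *
 *	xlog_use_incompat_feat(log);	(down_read of l_incompat_users)
 *	...log items in the incompat format...
 *	xlog_drop_incompat_feat(log);	(up_read of l_incompat_users)
 *
 * xlog_clear_incompat() above only clears the flags when
 * down_write_trylock() succeeds, i.e. when no such user is active.
 */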
+
+/*
* Every sync period we need to unpin all items in the AIL and push them to
* disk. If there is nothing dirty, then we might need to cover the log to
* indicate that the filesystem is idle.
@@ -1374,6 +1478,7 @@ xfs_log_worker(
* synchronously log the superblock instead to ensure the
* superblock is immediately unpinned and can be written back.
*/
+ xlog_clear_incompat(log);
xfs_sync_sb(mp, true);
} else
xfs_log_force(mp, 0);
@@ -1417,7 +1522,7 @@ xlog_alloc_log(
log->l_logBBstart = blk_offset;
log->l_logBBsize = num_bblks;
log->l_covered_state = XLOG_STATE_COVER_IDLE;
- log->l_flags |= XLOG_ACTIVE_RECOVERY;
+ set_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);
INIT_DELAYED_WORK(&log->l_work, xfs_log_worker);
log->l_prev_block = -1;
@@ -1426,7 +1531,7 @@ xlog_alloc_log(
xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
- if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1)
+ if (xfs_has_logv2(mp) && mp->m_sb.sb_logsunit > 1)
log->l_iclog_roundoff = mp->m_sb.sb_logsunit;
else
log->l_iclog_roundoff = BBSIZE;
@@ -1435,7 +1540,7 @@ xlog_alloc_log(
xlog_grant_head_init(&log->l_write_head);
error = -EFSCORRUPTED;
- if (xfs_sb_version_hassector(&mp->m_sb)) {
+ if (xfs_has_sector(mp)) {
log2_size = mp->m_sb.sb_logsectlog;
if (log2_size < BBSHIFT) {
xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)",
@@ -1452,7 +1557,7 @@ xlog_alloc_log(
/* for larger sector sizes, must have v2 or external log */
if (log2_size && log->l_logBBstart > 0 &&
- !xfs_sb_version_haslogv2(&mp->m_sb)) {
+ !xfs_has_logv2(mp)) {
xfs_warn(mp,
"log sector size (0x%x) invalid for configuration.",
log2_size);
@@ -1461,6 +1566,8 @@ xlog_alloc_log(
}
log->l_sectBBsize = 1 << log2_size;
+ init_rwsem(&log->l_incompat_users);
+
xlog_get_iclog_buffer_size(mp, log);
spin_lock_init(&log->l_icloglock);
@@ -1476,7 +1583,6 @@ xlog_alloc_log(
*/
ASSERT(log->l_iclog_size >= 4096);
for (i = 0; i < log->l_iclog_bufs; i++) {
- int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp);
size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) *
sizeof(struct bio_vec);
@@ -1488,8 +1594,8 @@ xlog_alloc_log(
iclog->ic_prev = prev_iclog;
prev_iclog = iclog;
- iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask,
- KM_MAYFAIL | KM_ZERO);
+ iclog->ic_data = kvzalloc(log->l_iclog_size,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!iclog->ic_data)
goto out_free_iclog;
#ifdef DEBUG
@@ -1499,7 +1605,7 @@ xlog_alloc_log(
memset(head, 0, sizeof(xlog_rec_header_t));
head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
head->h_version = cpu_to_be32(
- xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
+ xfs_has_logv2(log->l_mp) ? 2 : 1);
head->h_size = cpu_to_be32(log->l_iclog_size);
/* new fields */
head->h_fmt = cpu_to_be32(XLOG_FMT);
@@ -1551,37 +1657,6 @@ out:
} /* xlog_alloc_log */
/*
- * Write out the commit record of a transaction associated with the given
- * ticket to close off a running log write. Return the lsn of the commit record.
- */
-int
-xlog_commit_record(
- struct xlog *log,
- struct xlog_ticket *ticket,
- struct xlog_in_core **iclog,
- xfs_lsn_t *lsn)
-{
- struct xfs_log_iovec reg = {
- .i_addr = NULL,
- .i_len = 0,
- .i_type = XLOG_REG_TYPE_COMMIT,
- };
- struct xfs_log_vec vec = {
- .lv_niovecs = 1,
- .lv_iovecp = &reg,
- };
- int error;
-
- if (XLOG_FORCED_SHUTDOWN(log))
- return -EIO;
-
- error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS);
- if (error)
- xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
- return error;
-}
-
-/*
* Compute the LSN that we'd need to push the log tail towards in order to have
* (a) enough on-disk log space to log the number of bytes specified, (b) at
* least 25% of the log space free, and (c) at least 256 blocks free. If the
@@ -1653,7 +1728,7 @@ xlog_grant_push_ail(
xfs_lsn_t threshold_lsn;
threshold_lsn = xlog_grant_push_threshold(log, need_bytes);
- if (threshold_lsn == NULLCOMMITLSN || XLOG_FORCED_SHUTDOWN(log))
+ if (threshold_lsn == NULLCOMMITLSN || xlog_is_shutdown(log))
return;
/*
@@ -1689,7 +1764,7 @@ xlog_pack_data(
dp += BBSIZE;
}
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ if (xfs_has_logv2(log->l_mp)) {
xlog_in_core_2_t *xhdr = iclog->ic_data;
for ( ; i < BTOBB(size); i++) {
@@ -1726,7 +1801,7 @@ xlog_cksum(
offsetof(struct xlog_rec_header, h_crc));
/* ... then for additional cycle data for v2 logs ... */
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ if (xfs_has_logv2(log->l_mp)) {
union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead;
int i;
int xheads;
@@ -1795,7 +1870,7 @@ xlog_write_iclog(
* across the log IO to achieve that.
*/
down(&iclog->ic_sema);
- if (unlikely(iclog->ic_state == XLOG_STATE_IOERROR)) {
+ if (xlog_is_shutdown(log)) {
/*
* It would seem logical to return EIO here, but we rely on
* the log state machine to propagate I/O errors instead of
@@ -1953,7 +2028,7 @@ xlog_sync(
/* real byte length */
size = iclog->ic_offset;
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb))
+ if (xfs_has_logv2(log->l_mp))
size += roundoff;
iclog->ic_header.h_len = cpu_to_be32(size);
@@ -2303,8 +2378,7 @@ xlog_write_copy_finish(
int *data_cnt,
int *partial_copy,
int *partial_copy_len,
- int log_offset,
- struct xlog_in_core **commit_iclog)
+ int log_offset)
{
int error;
@@ -2323,27 +2397,20 @@ xlog_write_copy_finish(
*partial_copy = 0;
*partial_copy_len = 0;
- if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
- /* no more space in this iclog - push it. */
- spin_lock(&log->l_icloglock);
- xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
- *record_cnt = 0;
- *data_cnt = 0;
-
- if (iclog->ic_state == XLOG_STATE_ACTIVE)
- xlog_state_switch_iclogs(log, iclog, 0);
- else
- ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
- iclog->ic_state == XLOG_STATE_IOERROR);
- if (!commit_iclog)
- goto release_iclog;
- spin_unlock(&log->l_icloglock);
- ASSERT(flags & XLOG_COMMIT_TRANS);
- *commit_iclog = iclog;
- }
+ if (iclog->ic_size - log_offset > sizeof(xlog_op_header_t))
+ return 0;
- return 0;
+ /* no more space in this iclog - push it. */
+ spin_lock(&log->l_icloglock);
+ xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
+ *record_cnt = 0;
+ *data_cnt = 0;
+ if (iclog->ic_state == XLOG_STATE_ACTIVE)
+ xlog_state_switch_iclogs(log, iclog, 0);
+ else
+ ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
+ xlog_is_shutdown(log));
release_iclog:
error = xlog_state_release_iclog(log, iclog, 0);
spin_unlock(&log->l_icloglock);
@@ -2393,10 +2460,9 @@ release_iclog:
int
xlog_write(
struct xlog *log,
+ struct xfs_cil_ctx *ctx,
struct xfs_log_vec *log_vector,
struct xlog_ticket *ticket,
- xfs_lsn_t *start_lsn,
- struct xlog_in_core **commit_iclog,
uint optype)
{
struct xlog_in_core *iclog = NULL;
@@ -2426,8 +2492,6 @@ xlog_write(
}
len = xlog_write_calc_vec_length(ticket, log_vector, optype);
- if (start_lsn)
- *start_lsn = 0;
while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
void *ptr;
int log_offset;
@@ -2440,9 +2504,15 @@ xlog_write(
ASSERT(log_offset <= iclog->ic_size - 1);
ptr = iclog->ic_datap + log_offset;
- /* Start_lsn is the first lsn written to. */
- if (start_lsn && !*start_lsn)
- *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ /*
+ * If we have a context pointer, pass it the first iclog we are
+ * writing to so it can record state needed for iclog write
+ * ordering.
+ */
+ if (ctx) {
+ xlog_cil_set_ctx_write_state(ctx, iclog);
+ ctx = NULL;
+ }
/*
* This loop writes out as many regions as can fit in the amount
@@ -2521,8 +2591,7 @@ xlog_write(
&record_cnt, &data_cnt,
&partial_copy,
&partial_copy_len,
- log_offset,
- commit_iclog);
+ log_offset);
if (error)
return error;
@@ -2560,12 +2629,7 @@ next_lv:
spin_lock(&log->l_icloglock);
xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
- if (commit_iclog) {
- ASSERT(optype & XLOG_COMMIT_TRANS);
- *commit_iclog = iclog;
- } else {
- error = xlog_state_release_iclog(log, iclog, 0);
- }
+ error = xlog_state_release_iclog(log, iclog, 0);
spin_unlock(&log->l_icloglock);
return error;
@@ -2751,8 +2815,7 @@ xlog_state_set_callback(
static bool
xlog_state_iodone_process_iclog(
struct xlog *log,
- struct xlog_in_core *iclog,
- bool *ioerror)
+ struct xlog_in_core *iclog)
{
xfs_lsn_t lowest_lsn;
xfs_lsn_t header_lsn;
@@ -2764,15 +2827,6 @@ xlog_state_iodone_process_iclog(
* Skip all iclogs in the ACTIVE & DIRTY states:
*/
return false;
- case XLOG_STATE_IOERROR:
- /*
- * Between marking a filesystem SHUTDOWN and stopping the log,
- * we do flush all iclogs to disk (if there wasn't a log I/O
- * error). So, we do want things to go smoothly in case of just
- * a SHUTDOWN w/o a LOG_IO_ERROR.
- */
- *ioerror = true;
- return false;
case XLOG_STATE_DONE_SYNC:
/*
* Now that we have an iclog that is in the DONE_SYNC state, do
@@ -2796,72 +2850,75 @@ xlog_state_iodone_process_iclog(
}
}
-STATIC void
-xlog_state_do_callback(
+/*
+ * Loop over all the iclogs, running attached callbacks on them. Return true if
+ * we ran any callbacks, indicating that we dropped the icloglock. We don't need
+ * to handle transient shutdown state here at all because
+ * xlog_state_shutdown_callbacks() will be run to do the necessary shutdown
+ * cleanup of the callbacks.
+ */
+static bool
+xlog_state_do_iclog_callbacks(
struct xlog *log)
+ __releases(&log->l_icloglock)
+ __acquires(&log->l_icloglock)
{
- struct xlog_in_core *iclog;
- struct xlog_in_core *first_iclog;
- bool cycled_icloglock;
- bool ioerror;
- int flushcnt = 0;
- int repeats = 0;
+ struct xlog_in_core *first_iclog = log->l_iclog;
+ struct xlog_in_core *iclog = first_iclog;
+ bool ran_callback = false;
- spin_lock(&log->l_icloglock);
do {
- /*
- * Scan all iclogs starting with the one pointed to by the
- * log. Reset this starting point each time the log is
- * unlocked (during callbacks).
- *
- * Keep looping through iclogs until one full pass is made
- * without running any callbacks.
- */
- first_iclog = log->l_iclog;
- iclog = log->l_iclog;
- cycled_icloglock = false;
- ioerror = false;
- repeats++;
+ LIST_HEAD(cb_list);
- do {
- LIST_HEAD(cb_list);
+ if (xlog_state_iodone_process_iclog(log, iclog))
+ break;
+ if (iclog->ic_state != XLOG_STATE_CALLBACK) {
+ iclog = iclog->ic_next;
+ continue;
+ }
+ list_splice_init(&iclog->ic_callbacks, &cb_list);
+ spin_unlock(&log->l_icloglock);
- if (xlog_state_iodone_process_iclog(log, iclog,
- &ioerror))
- break;
+ trace_xlog_iclog_callbacks_start(iclog, _RET_IP_);
+ xlog_cil_process_committed(&cb_list);
+ trace_xlog_iclog_callbacks_done(iclog, _RET_IP_);
+ ran_callback = true;
- if (iclog->ic_state != XLOG_STATE_CALLBACK &&
- iclog->ic_state != XLOG_STATE_IOERROR) {
- iclog = iclog->ic_next;
- continue;
- }
- list_splice_init(&iclog->ic_callbacks, &cb_list);
- spin_unlock(&log->l_icloglock);
+ spin_lock(&log->l_icloglock);
+ xlog_state_clean_iclog(log, iclog);
+ iclog = iclog->ic_next;
+ } while (iclog != first_iclog);
+
+ return ran_callback;
+}
- trace_xlog_iclog_callbacks_start(iclog, _RET_IP_);
- xlog_cil_process_committed(&cb_list);
- trace_xlog_iclog_callbacks_done(iclog, _RET_IP_);
- cycled_icloglock = true;
- spin_lock(&log->l_icloglock);
- if (XLOG_FORCED_SHUTDOWN(log))
- wake_up_all(&iclog->ic_force_wait);
- else
- xlog_state_clean_iclog(log, iclog);
- iclog = iclog->ic_next;
- } while (first_iclog != iclog);
+/*
+ * Loop running iclog completion callbacks until there are no more iclogs in a
+ * state that can run callbacks.
+ */
+STATIC void
+xlog_state_do_callback(
+ struct xlog *log)
+{
+ int flushcnt = 0;
+ int repeats = 0;
+
+ spin_lock(&log->l_icloglock);
+ while (xlog_state_do_iclog_callbacks(log)) {
+ if (xlog_is_shutdown(log))
+ break;
- if (repeats > 5000) {
+ if (++repeats > 5000) {
flushcnt += repeats;
repeats = 0;
xfs_warn(log->l_mp,
"%s: possible infinite loop (%d iterations)",
__func__, flushcnt);
}
- } while (!ioerror && cycled_icloglock);
+ }
- if (log->l_iclog->ic_state == XLOG_STATE_ACTIVE ||
- log->l_iclog->ic_state == XLOG_STATE_IOERROR)
+ if (log->l_iclog->ic_state == XLOG_STATE_ACTIVE)
wake_up_all(&log->l_flush_wait);
spin_unlock(&log->l_icloglock);
@@ -2871,13 +2928,6 @@ xlog_state_do_callback(
/*
* Finish transitioning this iclog to the dirty state.
*
- * Make sure that we completely execute this routine only when this is
- * the last call to the iclog. There is a good chance that iclog flushes,
- * when we reach the end of the physical log, get turned into 2 separate
- * calls to bwrite. Hence, one iclog flush could generate two calls to this
- * routine. By using the reference count bwritecnt, we guarantee that only
- * the second completion goes through.
- *
* Callbacks could take time, so they are done outside the scope of the
* global state machine log lock.
*/
@@ -2896,7 +2946,7 @@ xlog_state_done_syncing(
* split log writes, on the second, we shut down the file system and
* no iclogs should ever be attempted to be written to disk again.
*/
- if (!XLOG_FORCED_SHUTDOWN(log)) {
+ if (!xlog_is_shutdown(log)) {
ASSERT(iclog->ic_state == XLOG_STATE_SYNCING);
iclog->ic_state = XLOG_STATE_DONE_SYNC;
}
@@ -2944,7 +2994,7 @@ xlog_state_get_iclog_space(
restart:
spin_lock(&log->l_icloglock);
- if (XLOG_FORCED_SHUTDOWN(log)) {
+ if (xlog_is_shutdown(log)) {
spin_unlock(&log->l_icloglock);
return -EIO;
}
@@ -3122,7 +3172,7 @@ xfs_log_ticket_ungrant(
* This routine will mark the current iclog in the ring as WANT_SYNC and move
* the current iclog pointer to the next iclog in the ring.
*/
-STATIC void
+void
xlog_state_switch_iclogs(
struct xlog *log,
struct xlog_in_core *iclog,
@@ -3237,10 +3287,10 @@ xfs_log_force(
xlog_cil_force(log);
spin_lock(&log->l_icloglock);
- iclog = log->l_iclog;
- if (iclog->ic_state == XLOG_STATE_IOERROR)
+ if (xlog_is_shutdown(log))
goto out_error;
+ iclog = log->l_iclog;
trace_xlog_iclog_force(iclog, _RET_IP_);
if (iclog->ic_state == XLOG_STATE_DIRTY ||
@@ -3294,6 +3344,20 @@ out_error:
return -EIO;
}
+/*
+ * Force the log to a specific LSN.
+ *
+ * If an iclog with that lsn can be found:
+ * If it is in the DIRTY state, just return.
+ * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC
+ * state and go to sleep or return.
+ * If it is in any other state, go to sleep or return.
+ *
+ * Synchronous forces are implemented with a wait queue. All callers trying
+ * to force a given lsn to disk must wait on the queue attached to the
+ * specific in-core log. When the given in-core log finally completes its write
+ * to disk, that thread will wake up all threads waiting on the queue.
+ */
static int
xlog_force_lsn(
struct xlog *log,
@@ -3306,10 +3370,10 @@ xlog_force_lsn(
bool completed;
spin_lock(&log->l_icloglock);
- iclog = log->l_iclog;
- if (iclog->ic_state == XLOG_STATE_IOERROR)
+ if (xlog_is_shutdown(log))
goto out_error;
+ iclog = log->l_iclog;
while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
trace_xlog_iclog_force_lsn(iclog, _RET_IP_);
iclog = iclog->ic_next;
@@ -3379,18 +3443,13 @@ out_error:
}
/*
- * Force the in-core log to disk for a specific LSN.
- *
- * Find in-core log with lsn.
- * If it is in the DIRTY state, just return.
- * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC
- * state and go to sleep or return.
- * If it is in any other state, go to sleep or return.
+ * Force the log to a specific checkpoint sequence.
*
- * Synchronous forces are implemented with a wait queue. All callers trying
- * to force a given lsn to disk must wait on the queue attached to the
- * specific in-core log. When given in-core log finally completes its write
- * to disk, that thread will wake up all threads waiting on the queue.
+ * First force the CIL so that all the required changes have been flushed to the
+ * iclogs. If the CIL force completed it will return a commit LSN that indicates
+ * the iclog that needs to be flushed to stable storage. If the caller needs
+ * a synchronous log force, we will wait on the iclog with the LSN returned by
+ * xlog_cil_force_seq() to be completed.
*/
int
xfs_log_force_seq(
@@ -3619,17 +3678,15 @@ xlog_verify_grant_tail(
xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
if (tail_cycle != cycle) {
if (cycle - 1 != tail_cycle &&
- !(log->l_flags & XLOG_TAIL_WARN)) {
+ !test_and_set_bit(XLOG_TAIL_WARN, &log->l_opstate)) {
xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
"%s: cycle - 1 != tail_cycle", __func__);
- log->l_flags |= XLOG_TAIL_WARN;
}
if (space > BBTOB(tail_blocks) &&
- !(log->l_flags & XLOG_TAIL_WARN)) {
+ !test_and_set_bit(XLOG_TAIL_WARN, &log->l_opstate)) {
xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
"%s: space > BBTOB(tail_blocks)", __func__);
- log->l_flags |= XLOG_TAIL_WARN;
}
}
}
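/*
 * Editorial note: the conversion above turns XLOG_TAIL_WARN into a
 * warn-once latch. test_and_set_bit() checks and sets the flag in one
 * atomic step, so the idiom is simply
 *
 *	if (bad && !test_and_set_bit(XLOG_TAIL_WARN, &log->l_opstate))
 *		xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, ...);
 *
 * and only the first inconsistency detected emits an alert.
 */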
@@ -3765,105 +3822,66 @@ xlog_verify_iclog(
#endif
/*
- * Mark all iclogs IOERROR. l_icloglock is held by the caller.
- */
-STATIC int
-xlog_state_ioerror(
- struct xlog *log)
-{
- xlog_in_core_t *iclog, *ic;
-
- iclog = log->l_iclog;
- if (iclog->ic_state != XLOG_STATE_IOERROR) {
- /*
- * Mark all the incore logs IOERROR.
- * From now on, no log flushes will result.
- */
- ic = iclog;
- do {
- ic->ic_state = XLOG_STATE_IOERROR;
- ic = ic->ic_next;
- } while (ic != iclog);
- return 0;
- }
- /*
- * Return non-zero, if state transition has already happened.
- */
- return 1;
-}
-
-/*
- * This is called from xfs_force_shutdown, when we're forcibly
- * shutting down the filesystem, typically because of an IO error.
+ * Perform a forced shutdown on the log. This should be called once and once
+ * only by the high level filesystem shutdown code to shut the log subsystem
+ * down cleanly.
+ *
* Our main objectives here are to make sure that:
- * a. if !logerror, flush the logs to disk. Anything modified
- * after this is ignored.
- * b. the filesystem gets marked 'SHUTDOWN' for all interested
- * parties to find out, 'atomically'.
- * c. those who're sleeping on log reservations, pinned objects and
- * other resources get woken up, and be told the bad news.
- * d. nothing new gets queued up after (b) and (c) are done.
+ * a. if the shutdown was not due to a log IO error, flush the logs to
+ * disk. Anything modified after this is ignored.
+ * b. the log gets atomically marked 'XLOG_IO_ERROR' for all interested
+ * parties to find out. Nothing new gets queued after this is done.
+ * c. Tasks sleeping on log reservations, pinned objects and
+ * other resources get woken up.
*
- * Note: for the !logerror case we need to flush the regions held in memory out
- * to disk first. This needs to be done before the log is marked as shutdown,
- * otherwise the iclog writes will fail.
+ * Return true if the shutdown cause was a log IO error and we actually shut the
+ * log down.
*/
-int
-xfs_log_force_umount(
- struct xfs_mount *mp,
- int logerror)
+bool
+xlog_force_shutdown(
+ struct xlog *log,
+ int shutdown_flags)
{
- struct xlog *log;
- int retval;
-
- log = mp->m_log;
+ bool log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR);
/*
- * If this happens during log recovery, don't worry about
- * locking; the log isn't open for business yet.
+ * If this happens during log recovery then we aren't using the runtime
+ * log mechanisms yet so there's nothing to shut down.
*/
- if (!log ||
- log->l_flags & XLOG_ACTIVE_RECOVERY) {
- mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
- if (mp->m_sb_bp)
- mp->m_sb_bp->b_flags |= XBF_DONE;
- return 0;
- }
+ if (!log || xlog_in_recovery(log))
+ return false;
- /*
- * Somebody could've already done the hard work for us.
- * No need to get locks for this.
- */
- if (logerror && log->l_iclog->ic_state == XLOG_STATE_IOERROR) {
- ASSERT(XLOG_FORCED_SHUTDOWN(log));
- return 1;
- }
+ ASSERT(!xlog_is_shutdown(log));
/*
* Flush all the completed transactions to disk before marking the log
- * being shut down. We need to do it in this order to ensure that
- * completed operations are safely on disk before we shut down, and that
- * we don't have to issue any buffer IO after the shutdown flags are set
- * to guarantee this.
+ * being shut down. We need to do this first as shutting down the log
+ * before the force will prevent the log force from flushing the iclogs
+ * to disk.
+ *
+ * Re-entry due to a log IO error shutdown during the log force is
+ * prevented by the atomicity of higher level shutdown code.
*/
- if (!logerror)
- xfs_log_force(mp, XFS_LOG_SYNC);
+ if (!log_error)
+ xfs_log_force(log->l_mp, XFS_LOG_SYNC);
/*
- * mark the filesystem and the as in a shutdown state and wake
- * everybody up to tell them the bad news.
+ * Atomically set the shutdown state. If the shutdown state is already
+ * set, then someone else is performing the shutdown and so we are done
+ * here. This should never happen because we should only ever get called
+ * once by the first shutdown caller.
+ *
+ * Much of the log state machine transitions assume that shutdown state
+ * cannot change once they hold the log->l_icloglock. Hence we need to
+ * hold that lock here, even though we use the atomic test_and_set_bit()
+ * operation to set the shutdown state.
*/
spin_lock(&log->l_icloglock);
- mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
- if (mp->m_sb_bp)
- mp->m_sb_bp->b_flags |= XBF_DONE;
-
- /*
- * Mark the log and the iclogs with IO error flags to prevent any
- * further log IO from being issued or completed.
- */
- log->l_flags |= XLOG_IO_ERROR;
- retval = xlog_state_ioerror(log);
+ if (test_and_set_bit(XLOG_IO_ERROR, &log->l_opstate)) {
+ spin_unlock(&log->l_icloglock);
+ ASSERT(0);
+ return false;
+ }
spin_unlock(&log->l_icloglock);
/*
@@ -3883,12 +3901,12 @@ xfs_log_force_umount(
* avoid races.
*/
spin_lock(&log->l_cilp->xc_push_lock);
+ wake_up_all(&log->l_cilp->xc_start_wait);
wake_up_all(&log->l_cilp->xc_commit_wait);
spin_unlock(&log->l_cilp->xc_push_lock);
- xlog_state_do_callback(log);
+ xlog_state_shutdown_callbacks(log);
- /* return non-zero if log IOERROR transition had already happened */
- return retval;
+ return log_error;
}
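/*
 * Editorial summary (not part of the patch): shutdown is now a single
 * latched bit, so any path may recheck it cheaply and without locking:
 *
 *	if (xlog_is_shutdown(log))
 *		return -EIO;
 *
 * Only the one-time transition itself is serialised against the iclog
 * state machine, by taking l_icloglock around the test_and_set_bit()
 * above.
 */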
STATIC int
@@ -3926,7 +3944,7 @@ xfs_log_check_lsn(
* resets the in-core LSN. We can't validate in this mode, but
* modifications are not allowed anyways so just return true.
*/
- if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+ if (xfs_has_norecovery(mp))
return true;
/*
@@ -3952,11 +3970,22 @@ xfs_log_check_lsn(
return valid;
}
-bool
-xfs_log_in_recovery(
- struct xfs_mount *mp)
+/*
+ * Notify the log that we're about to start using a feature that is protected
+ * by a log incompat feature flag. This will prevent log covering from
+ * clearing those flags.
+ */
+void
+xlog_use_incompat_feat(
+ struct xlog *log)
{
- struct xlog *log = mp->m_log;
+ down_read(&log->l_incompat_users);
+}
- return log->l_flags & XLOG_ACTIVE_RECOVERY;
+/* Notify the log that we've finished using log incompat features. */
+void
+xlog_drop_incompat_feat(
+ struct xlog *log)
+{
+ up_read(&log->l_incompat_users);
}
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 813b972e9788..dc1b77b92fc1 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -104,6 +104,7 @@ struct xlog_ticket;
struct xfs_log_item;
struct xfs_item_ops;
struct xfs_trans;
+struct xlog;
int xfs_log_force(struct xfs_mount *mp, uint flags);
int xfs_log_force_seq(struct xfs_mount *mp, xfs_csn_t seq, uint flags,
@@ -125,7 +126,6 @@ int xfs_log_reserve(struct xfs_mount *mp,
bool permanent);
int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
void xfs_log_unmount(struct xfs_mount *mp);
-int xfs_log_force_umount(struct xfs_mount *mp, int logerror);
bool xfs_log_writable(struct xfs_mount *mp);
struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
@@ -138,8 +138,11 @@ void xfs_log_work_queue(struct xfs_mount *mp);
int xfs_log_quiesce(struct xfs_mount *mp);
void xfs_log_clean(struct xfs_mount *mp);
bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
-bool xfs_log_in_recovery(struct xfs_mount *);
xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes);
+bool xlog_force_shutdown(struct xlog *log, int shutdown_flags);
+
+void xlog_use_incompat_feat(struct xlog *log);
+void xlog_drop_incompat_feat(struct xlog *log);
#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 4c44bc3786c0..6c93c8ada6f3 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -48,6 +48,34 @@ xlog_cil_ticket_alloc(
}
/*
+ * Unavoidable forward declaration - xlog_cil_push_work() calls
+ * xlog_cil_ctx_alloc() itself.
+ */
+static void xlog_cil_push_work(struct work_struct *work);
+
+static struct xfs_cil_ctx *
+xlog_cil_ctx_alloc(void)
+{
+ struct xfs_cil_ctx *ctx;
+
+ ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS);
+ INIT_LIST_HEAD(&ctx->committing);
+ INIT_LIST_HEAD(&ctx->busy_extents);
+ INIT_WORK(&ctx->push_work, xlog_cil_push_work);
+ return ctx;
+}
+
+static void
+xlog_cil_ctx_switch(
+ struct xfs_cil *cil,
+ struct xfs_cil_ctx *ctx)
+{
+ ctx->sequence = ++cil->xc_current_sequence;
+ ctx->cil = cil;
+ cil->xc_ctx = ctx;
+}
+
+/*
* After the first stage of log recovery is done, we know where the head and
* tail of the log are. We need this log initialisation done before we can
* initialise the first CIL checkpoint context.
@@ -185,7 +213,15 @@ xlog_cil_alloc_shadow_bufs(
*/
kmem_free(lip->li_lv_shadow);
- lv = kmem_alloc_large(buf_size, KM_NOFS);
+ /*
+ * We are in transaction context, which means this
+ * allocation will pick up GFP_NOFS from the
+ * memalloc_nofs_save/restore context the transaction
+ * holds. This means we can use GFP_KERNEL here so the
+ * generic kvmalloc() code will run vmalloc on
+ * contiguous page allocation failure as we require.
+ */
+ lv = kvmalloc(buf_size, GFP_KERNEL);
memset(lv, 0, xlog_cil_iovec_space(niovecs));
lv->lv_item = lip;
@@ -535,7 +571,7 @@ xlog_discard_busy_extents(
struct blk_plug plug;
int error = 0;
- ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
+ ASSERT(xfs_has_discard(mp));
blk_start_plug(&plug);
list_for_each_entry(busyp, list, list) {
@@ -576,7 +612,7 @@ xlog_cil_committed(
struct xfs_cil_ctx *ctx)
{
struct xfs_mount *mp = ctx->cil->xc_log->l_mp;
- bool abort = XLOG_FORCED_SHUTDOWN(ctx->cil->xc_log);
+ bool abort = xlog_is_shutdown(ctx->cil->xc_log);
/*
* If the I/O failed, we're aborting the commit and already shutdown.
@@ -587,6 +623,7 @@ xlog_cil_committed(
*/
if (abort) {
spin_lock(&ctx->cil->xc_push_lock);
+ wake_up_all(&ctx->cil->xc_start_wait);
wake_up_all(&ctx->cil->xc_commit_wait);
spin_unlock(&ctx->cil->xc_push_lock);
}
@@ -596,7 +633,7 @@ xlog_cil_committed(
xfs_extent_busy_sort(&ctx->busy_extents);
xfs_extent_busy_clear(mp, &ctx->busy_extents,
- (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
+ xfs_has_discard(mp) && !abort);
spin_lock(&ctx->cil->xc_push_lock);
list_del(&ctx->committing);
@@ -624,6 +661,180 @@ xlog_cil_process_committed(
}
/*
+ * Record the LSN of the iclog we were just granted space to start writing into.
+ * If the context doesn't have a start_lsn recorded, then this iclog will
+ * contain the start record for the checkpoint. Otherwise this write contains
+ * the commit record for the checkpoint.
+ */
+void
+xlog_cil_set_ctx_write_state(
+ struct xfs_cil_ctx *ctx,
+ struct xlog_in_core *iclog)
+{
+ struct xfs_cil *cil = ctx->cil;
+ xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+
+ ASSERT(!ctx->commit_lsn);
+ if (!ctx->start_lsn) {
+ spin_lock(&cil->xc_push_lock);
+ /*
+ * The LSN we need to pass to the log items on transaction
+ * commit is the LSN reported by the first log vector write, not
+ * the commit lsn. If we use the commit record lsn then we can
+ * move the tail beyond the grant write head.
+ */
+ ctx->start_lsn = lsn;
+ wake_up_all(&cil->xc_start_wait);
+ spin_unlock(&cil->xc_push_lock);
+ return;
+ }
+
+ /*
+ * Take a reference to the iclog for the context so that we still hold
+ * it when xlog_write is done and has released it. This means the
+ * context controls when the iclog is released for IO.
+ */
+ atomic_inc(&iclog->ic_refcnt);
+
+ /*
+ * xlog_state_get_iclog_space() guarantees there is enough space in the
+ * iclog for an entire commit record, so we can attach the context
+ * callbacks now. This needs to be done before we make the commit_lsn
+ * visible to waiters so that checkpoints with commit records in the
+ * same iclog order their IO completion callbacks in the same order that
+ * the commit records appear in the iclog.
+ */
+ spin_lock(&cil->xc_log->l_icloglock);
+ list_add_tail(&ctx->iclog_entry, &iclog->ic_callbacks);
+ spin_unlock(&cil->xc_log->l_icloglock);
+
+ /*
+ * Now we can record the commit LSN and wake anyone waiting for this
+ * sequence to have the ordered commit record assigned to a physical
+ * location in the log.
+ */
+ spin_lock(&cil->xc_push_lock);
+ ctx->commit_iclog = iclog;
+ ctx->commit_lsn = lsn;
+ wake_up_all(&cil->xc_commit_wait);
+ spin_unlock(&cil->xc_push_lock);
+}
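/*
 * Editorial note (illustrative): xlog_write() calls this once per log write
 * with the first iclog it obtains, then clears its local ctx pointer, so for
 * a checkpoint the function runs exactly twice. The first call (start
 * record) publishes start_lsn and wakes xc_start_wait; the second (commit
 * record) pins the iclog, attaches the context to ic_callbacks and publishes
 * commit_lsn before waking xc_commit_wait.
 */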
+
+
+/*
+ * Ensure that the order of log writes follows checkpoint sequence order. This
+ * relies on the context LSN being zero until the log write has guaranteed the
+ * LSN that the log write will start at via xlog_state_get_iclog_space().
+ */
+enum _record_type {
+ _START_RECORD,
+ _COMMIT_RECORD,
+};
+
+static int
+xlog_cil_order_write(
+ struct xfs_cil *cil,
+ xfs_csn_t sequence,
+ enum _record_type record)
+{
+ struct xfs_cil_ctx *ctx;
+
+restart:
+ spin_lock(&cil->xc_push_lock);
+ list_for_each_entry(ctx, &cil->xc_committing, committing) {
+ /*
+ * Avoid getting stuck in this loop because we were woken by the
+ * shutdown, but then went back to sleep once already in the
+ * shutdown state.
+ */
+ if (xlog_is_shutdown(cil->xc_log)) {
+ spin_unlock(&cil->xc_push_lock);
+ return -EIO;
+ }
+
+ /*
+ * Higher sequences will wait for this one so skip them.
+ * Don't wait for our own sequence, either.
+ */
+ if (ctx->sequence >= sequence)
+ continue;
+
+ /* Wait until the LSN for the record has been recorded. */
+ switch (record) {
+ case _START_RECORD:
+ if (!ctx->start_lsn) {
+ xlog_wait(&cil->xc_start_wait, &cil->xc_push_lock);
+ goto restart;
+ }
+ break;
+ case _COMMIT_RECORD:
+ if (!ctx->commit_lsn) {
+ xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
+ goto restart;
+ }
+ break;
+ }
+ }
+ spin_unlock(&cil->xc_push_lock);
+ return 0;
+}
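/*
 * Editorial example (illustrative): if checkpoint sequence 7 is about to
 * write its start record, it walks xc_committing and sleeps on xc_start_wait
 * until every earlier sequence still on the list (say 5 and 6) has recorded
 * a start_lsn, then rescans from the top. The commit-record case does the
 * same against commit_lsn and xc_commit_wait, which is what keeps both
 * record types in ascending checkpoint sequence order in the log.
 */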
+
+/*
+ * Write out the log vector change now attached to the CIL context. This will
+ * write a start record that needs to be strictly ordered in ascending CIL
+ * sequence order so that log recovery will always use in-order start LSNs when
+ * replaying checkpoints.
+ */
+static int
+xlog_cil_write_chain(
+ struct xfs_cil_ctx *ctx,
+ struct xfs_log_vec *chain)
+{
+ struct xlog *log = ctx->cil->xc_log;
+ int error;
+
+ error = xlog_cil_order_write(ctx->cil, ctx->sequence, _START_RECORD);
+ if (error)
+ return error;
+ return xlog_write(log, ctx, chain, ctx->ticket, XLOG_START_TRANS);
+}
+
+/*
+ * Write out the commit record of a checkpoint transaction to close off a
+ * running log write. These commit records are strictly ordered in ascending CIL
+ * sequence order so that log recovery will always replay the checkpoints in the
+ * correct order.
+ */
+static int
+xlog_cil_write_commit_record(
+ struct xfs_cil_ctx *ctx)
+{
+ struct xlog *log = ctx->cil->xc_log;
+ struct xfs_log_iovec reg = {
+ .i_addr = NULL,
+ .i_len = 0,
+ .i_type = XLOG_REG_TYPE_COMMIT,
+ };
+ struct xfs_log_vec vec = {
+ .lv_niovecs = 1,
+ .lv_iovecp = &reg,
+ };
+ int error;
+
+ if (xlog_is_shutdown(log))
+ return -EIO;
+
+ error = xlog_cil_order_write(ctx->cil, ctx->sequence, _COMMIT_RECORD);
+ if (error)
+ return error;
+
+ error = xlog_write(log, ctx, &vec, ctx->ticket, XLOG_COMMIT_TRANS);
+ if (error)
+ xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+ return error;
+}
+
+/*
* Push the Committed Item List to the log.
*
* If the current sequence is the same as xc_push_seq we need to do a flush. If
@@ -641,13 +852,12 @@ static void
xlog_cil_push_work(
struct work_struct *work)
{
- struct xfs_cil *cil =
- container_of(work, struct xfs_cil, xc_push_work);
+ struct xfs_cil_ctx *ctx =
+ container_of(work, struct xfs_cil_ctx, push_work);
+ struct xfs_cil *cil = ctx->cil;
struct xlog *log = cil->xc_log;
struct xfs_log_vec *lv;
- struct xfs_cil_ctx *ctx;
struct xfs_cil_ctx *new_ctx;
- struct xlog_in_core *commit_iclog;
struct xlog_ticket *tic;
int num_iovecs;
int error = 0;
@@ -655,20 +865,21 @@ xlog_cil_push_work(
struct xfs_log_iovec lhdr;
struct xfs_log_vec lvhdr = { NULL };
xfs_lsn_t preflush_tail_lsn;
- xfs_lsn_t commit_lsn;
xfs_csn_t push_seq;
struct bio bio;
DECLARE_COMPLETION_ONSTACK(bdev_flush);
+ bool push_commit_stable;
- new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS);
+ new_ctx = xlog_cil_ctx_alloc();
new_ctx->ticket = xlog_cil_ticket_alloc(log);
down_write(&cil->xc_ctx_lock);
- ctx = cil->xc_ctx;
spin_lock(&cil->xc_push_lock);
push_seq = cil->xc_push_seq;
ASSERT(push_seq <= ctx->sequence);
+ push_commit_stable = cil->xc_push_commit_stable;
+ cil->xc_push_commit_stable = false;
/*
* As we are about to switch to a new, empty CIL context, we no longer
@@ -694,7 +905,7 @@ xlog_cil_push_work(
/* check for a previously pushed sequence */
- if (push_seq < cil->xc_ctx->sequence) {
+ if (push_seq < ctx->sequence) {
spin_unlock(&cil->xc_push_lock);
goto out_skip;
}
@@ -767,19 +978,7 @@ xlog_cil_push_work(
}
/*
- * initialise the new context and attach it to the CIL. Then attach
- * the current context to the CIL committing list so it can be found
- * during log forces to extract the commit lsn of the sequence that
- * needs to be forced.
- */
- INIT_LIST_HEAD(&new_ctx->committing);
- INIT_LIST_HEAD(&new_ctx->busy_extents);
- new_ctx->sequence = ctx->sequence + 1;
- new_ctx->cil = cil;
- cil->xc_ctx = new_ctx;
-
- /*
- * The switch is now done, so we can drop the context lock and move out
+ * Switch the contexts so we can drop the context lock and move out
* of a shared context. We can't just go straight to the commit record,
* though - we need to synchronise with previous and future commits so
* that the commit records are correctly ordered in the log to ensure
@@ -804,7 +1003,7 @@ xlog_cil_push_work(
* dereferencing a freed context pointer.
*/
spin_lock(&cil->xc_push_lock);
- cil->xc_current_sequence = new_ctx->sequence;
+ xlog_cil_ctx_switch(cil, new_ctx);
spin_unlock(&cil->xc_push_lock);
up_write(&cil->xc_ctx_lock);
@@ -837,78 +1036,17 @@ xlog_cil_push_work(
*/
wait_for_completion(&bdev_flush);
- error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL,
- XLOG_START_TRANS);
+ error = xlog_cil_write_chain(ctx, &lvhdr);
if (error)
goto out_abort_free_ticket;
- /*
- * now that we've written the checkpoint into the log, strictly
- * order the commit records so replay will get them in the right order.
- */
-restart:
- spin_lock(&cil->xc_push_lock);
- list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
- /*
- * Avoid getting stuck in this loop because we were woken by the
- * shutdown, but then went back to sleep once already in the
- * shutdown state.
- */
- if (XLOG_FORCED_SHUTDOWN(log)) {
- spin_unlock(&cil->xc_push_lock);
- goto out_abort_free_ticket;
- }
-
- /*
- * Higher sequences will wait for this one so skip them.
- * Don't wait for our own sequence, either.
- */
- if (new_ctx->sequence >= ctx->sequence)
- continue;
- if (!new_ctx->commit_lsn) {
- /*
- * It is still being pushed! Wait for the push to
- * complete, then start again from the beginning.
- */
- xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
- goto restart;
- }
- }
- spin_unlock(&cil->xc_push_lock);
-
- error = xlog_commit_record(log, tic, &commit_iclog, &commit_lsn);
+ error = xlog_cil_write_commit_record(ctx);
if (error)
goto out_abort_free_ticket;
xfs_log_ticket_ungrant(log, tic);
/*
- * Once we attach the ctx to the iclog, a shutdown can process the
- * iclog, run the callbacks and free the ctx. The only thing preventing
- * this potential UAF situation here is that we are holding the
- * icloglock. Hence we cannot access the ctx once we have attached the
- * callbacks and dropped the icloglock.
- */
- spin_lock(&log->l_icloglock);
- if (commit_iclog->ic_state == XLOG_STATE_IOERROR) {
- spin_unlock(&log->l_icloglock);
- goto out_abort;
- }
- ASSERT_ALWAYS(commit_iclog->ic_state == XLOG_STATE_ACTIVE ||
- commit_iclog->ic_state == XLOG_STATE_WANT_SYNC);
- list_add_tail(&ctx->iclog_entry, &commit_iclog->ic_callbacks);
-
- /*
- * now the checkpoint commit is complete and we've attached the
- * callbacks to the iclog we can assign the commit LSN to the context
- * and wake up anyone who is waiting for the commit to complete.
- */
- spin_lock(&cil->xc_push_lock);
- ctx->commit_lsn = commit_lsn;
- wake_up_all(&cil->xc_commit_wait);
- spin_unlock(&cil->xc_push_lock);
-
- /*
* If the checkpoint spans multiple iclogs, wait for all previous iclogs
* to complete before we submit the commit_iclog. We can't use state
* checks for this - ACTIVE can be either a past completed iclog or a
@@ -919,21 +1057,19 @@ restart:
* wakeup until this commit_iclog is written to disk. Hence we use the
* iclog header lsn and compare it to the commit lsn to determine if we
* need to wait on iclogs or not.
- *
- * NOTE: It is not safe to reference the ctx after this check as we drop
- * the icloglock if we have to wait for completion of other iclogs.
*/
- if (ctx->start_lsn != commit_lsn) {
+ spin_lock(&log->l_icloglock);
+ if (ctx->start_lsn != ctx->commit_lsn) {
xfs_lsn_t plsn;
- plsn = be64_to_cpu(commit_iclog->ic_prev->ic_header.h_lsn);
- if (plsn && XFS_LSN_CMP(plsn, commit_lsn) < 0) {
+ plsn = be64_to_cpu(ctx->commit_iclog->ic_prev->ic_header.h_lsn);
+ if (plsn && XFS_LSN_CMP(plsn, ctx->commit_lsn) < 0) {
/*
* Waiting on ic_force_wait orders the completion of
* iclogs older than ic_prev. Hence we only need to wait
* on the most recent older iclog here.
*/
- xlog_wait_on_iclog(commit_iclog->ic_prev);
+ xlog_wait_on_iclog(ctx->commit_iclog->ic_prev);
spin_lock(&log->l_icloglock);
}
@@ -941,16 +1077,27 @@ restart:
* We need to issue a pre-flush so that the ordering for this
* checkpoint is correctly preserved down to stable storage.
*/
- commit_iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
+ ctx->commit_iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
}
/*
* The commit iclog must be written to stable storage to guarantee
* journal IO vs metadata writeback IO is correctly ordered on stable
* storage.
+ *
+ * If the push caller needs the commit to be immediately stable and the
+ * commit_iclog is not yet marked as XLOG_STATE_WANT_SYNC to indicate it
+ * will be written when released, switch its state to WANT_SYNC right
+ * now.
*/
- commit_iclog->ic_flags |= XLOG_ICL_NEED_FUA;
- xlog_state_release_iclog(log, commit_iclog, preflush_tail_lsn);
+ ctx->commit_iclog->ic_flags |= XLOG_ICL_NEED_FUA;
+ if (push_commit_stable &&
+ ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE)
+ xlog_state_switch_iclogs(log, ctx->commit_iclog, 0);
+ xlog_state_release_iclog(log, ctx->commit_iclog, preflush_tail_lsn);
+
+ /* Not safe to reference ctx now! */
+
spin_unlock(&log->l_icloglock);
return;
@@ -962,9 +1109,15 @@ out_skip:
out_abort_free_ticket:
xfs_log_ticket_ungrant(log, tic);
-out_abort:
- ASSERT(XLOG_FORCED_SHUTDOWN(log));
- xlog_cil_committed(ctx);
+ ASSERT(xlog_is_shutdown(log));
+ if (!ctx->commit_iclog) {
+ xlog_cil_committed(ctx);
+ return;
+ }
+ spin_lock(&log->l_icloglock);
+ xlog_state_release_iclog(log, ctx->commit_iclog, 0);
+ /* Not safe to reference ctx now! */
+ spin_unlock(&log->l_icloglock);
}
/*
@@ -998,7 +1151,7 @@ xlog_cil_push_background(
spin_lock(&cil->xc_push_lock);
if (cil->xc_push_seq < cil->xc_current_sequence) {
cil->xc_push_seq = cil->xc_current_sequence;
- queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
+ queue_work(cil->xc_push_wq, &cil->xc_ctx->push_work);
}
/*
@@ -1034,13 +1187,26 @@ xlog_cil_push_background(
/*
* xlog_cil_push_now() is used to trigger an immediate CIL push to the sequence
* number that is passed. When it returns, the work will be queued for
- * @push_seq, but it won't be completed. The caller is expected to do any
- * waiting for push_seq to complete if it is required.
+ * @push_seq, but it won't be completed.
+ *
+ * If the caller is performing a synchronous force, we will flush the workqueue
+ * to get previously queued work moving to minimise the wait time they will
+ * undergo waiting for all outstanding pushes to complete. The caller is
+ * expected to do the required waiting for push_seq to complete.
+ *
+ * If the caller is performing an async push, we need to ensure that the
+ * checkpoint is fully flushed out of the iclogs when we finish the push. If we
+ * don't do this, then the commit record may remain sitting in memory in an
+ * ACTIVE iclog. This then requires another full log force to push to disk,
+ * which defeats the purpose of having an async, non-blocking CIL force
+ * mechanism. Hence in this case we need to pass a flag to the push work to
+ * indicate it needs to flush the commit record itself.
*/
static void
xlog_cil_push_now(
struct xlog *log,
- xfs_lsn_t push_seq)
+ xfs_lsn_t push_seq,
+ bool async)
{
struct xfs_cil *cil = log->l_cilp;
@@ -1050,7 +1216,8 @@ xlog_cil_push_now(
ASSERT(push_seq && push_seq <= cil->xc_current_sequence);
/* start on any pending background push to minimise wait time on it */
- flush_work(&cil->xc_push_work);
+ if (!async)
+ flush_workqueue(cil->xc_push_wq);
/*
* If the CIL is empty or we've already pushed the sequence then
@@ -1063,7 +1230,8 @@ xlog_cil_push_now(
}
cil->xc_push_seq = push_seq;
- queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
+ cil->xc_push_commit_stable = async;
+ queue_work(cil->xc_push_wq, &cil->xc_ctx->push_work);
spin_unlock(&cil->xc_push_lock);
}
@@ -1116,7 +1284,7 @@ xlog_cil_commit(
xlog_cil_insert_items(log, tp);
- if (regrant && !XLOG_FORCED_SHUTDOWN(log))
+ if (regrant && !xlog_is_shutdown(log))
xfs_log_ticket_regrant(log, tp->t_ticket);
else
xfs_log_ticket_ungrant(log, tp->t_ticket);
@@ -1148,11 +1316,26 @@ xlog_cil_commit(
}
/*
+ * Flush the CIL to stable storage but don't wait for it to complete. This
+ * requires the CIL push to ensure the commit record for the push hits the disk,
+ * but otherwise is no different to a push done from a log force.
+ */
+void
+xlog_cil_flush(
+ struct xlog *log)
+{
+ xfs_csn_t seq = log->l_cilp->xc_current_sequence;
+
+ trace_xfs_log_force(log->l_mp, seq, _RET_IP_);
+ xlog_cil_push_now(log, seq, true);
+}
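/*
 * Editorial usage sketch (assumed caller, not shown in this hunk): an
 * asynchronous log force now reduces to
 *
 *	xlog_cil_flush(log);
 *
 * which queues a push of the current sequence with xc_push_commit_stable
 * set, so the commit record is switched out of an ACTIVE iclog and
 * submitted without the caller having to wait for completion.
 */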
+
+/*
* Conditionally push the CIL based on the sequence passed in.
*
- * We only need to push if we haven't already pushed the sequence
- * number given. Hence the only time we will trigger a push here is
- * if the push sequence is the same as the current context.
+ * We only need to push if we haven't already pushed the sequence number given.
+ * Hence the only time we will trigger a push here is if the push sequence is
+ * the same as the current context.
*
* We return the current commit lsn to allow the callers to determine if an
* iclog flush is necessary following this call.
@@ -1168,13 +1351,17 @@ xlog_cil_force_seq(
ASSERT(sequence <= cil->xc_current_sequence);
+ if (!sequence)
+ sequence = cil->xc_current_sequence;
+ trace_xfs_log_force(log->l_mp, sequence, _RET_IP_);
+
/*
* check to see if we need to force out the current context.
* xlog_cil_push() handles racing pushes for the same sequence,
* so no need to deal with it here.
*/
restart:
- xlog_cil_push_now(log, sequence);
+ xlog_cil_push_now(log, sequence, false);
/*
* See if we can find a previous sequence still committing.
@@ -1189,7 +1376,7 @@ restart:
* shutdown, but then went back to sleep once already in the
* shutdown state.
*/
- if (XLOG_FORCED_SHUTDOWN(log))
+ if (xlog_is_shutdown(log))
goto out_shutdown;
if (ctx->sequence > sequence)
continue;
@@ -1198,6 +1385,7 @@ restart:
* It is still being pushed! Wait for the push to
* complete, then start again from the beginning.
*/
+ XFS_STATS_INC(log->l_mp, xs_log_force_sleep);
xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
goto restart;
}
@@ -1282,32 +1470,35 @@ xlog_cil_init(
cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL);
if (!cil)
return -ENOMEM;
+ /*
+ * Limit the CIL pipeline depth to 4 concurrent works to bound the
+ * concurrency the log spinlocks will be exposed to.
+ */
+ cil->xc_push_wq = alloc_workqueue("xfs-cil/%s",
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_UNBOUND),
+ 4, log->l_mp->m_super->s_id);
+ if (!cil->xc_push_wq)
+ goto out_destroy_cil;
- ctx = kmem_zalloc(sizeof(*ctx), KM_MAYFAIL);
- if (!ctx) {
- kmem_free(cil);
- return -ENOMEM;
- }
-
- INIT_WORK(&cil->xc_push_work, xlog_cil_push_work);
INIT_LIST_HEAD(&cil->xc_cil);
INIT_LIST_HEAD(&cil->xc_committing);
spin_lock_init(&cil->xc_cil_lock);
spin_lock_init(&cil->xc_push_lock);
init_waitqueue_head(&cil->xc_push_wait);
init_rwsem(&cil->xc_ctx_lock);
+ init_waitqueue_head(&cil->xc_start_wait);
init_waitqueue_head(&cil->xc_commit_wait);
-
- INIT_LIST_HEAD(&ctx->committing);
- INIT_LIST_HEAD(&ctx->busy_extents);
- ctx->sequence = 1;
- ctx->cil = cil;
- cil->xc_ctx = ctx;
- cil->xc_current_sequence = ctx->sequence;
-
cil->xc_log = log;
log->l_cilp = cil;
+
+ ctx = xlog_cil_ctx_alloc();
+ xlog_cil_ctx_switch(cil, ctx);
+
return 0;
+
+out_destroy_cil:
+ kmem_free(cil);
+ return -ENOMEM;
}
void
@@ -1321,6 +1512,7 @@ xlog_cil_destroy(
}
ASSERT(list_empty(&log->l_cilp->xc_cil));
+ destroy_workqueue(log->l_cilp->xc_push_wq);
kmem_free(log->l_cilp);
}
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index f3e79a45d60a..844fbeec3545 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -12,15 +12,6 @@ struct xlog_ticket;
struct xfs_mount;
/*
- * Flags for log structure
- */
-#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */
-#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
-#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
- shutdown */
-#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */
-
-/*
* get client id from packed copy.
*
* this hack is here because the xlog_pack code copies four bytes
@@ -47,7 +38,6 @@ enum xlog_iclog_state {
XLOG_STATE_DONE_SYNC, /* Done syncing to disk */
XLOG_STATE_CALLBACK, /* Callback functions now */
XLOG_STATE_DIRTY, /* Dirty IC log, not ready for ACTIVE status */
- XLOG_STATE_IOERROR, /* IO error happened in sync'ing log */
};
#define XLOG_STATE_STRINGS \
@@ -56,8 +46,7 @@ enum xlog_iclog_state {
{ XLOG_STATE_SYNCING, "XLOG_STATE_SYNCING" }, \
{ XLOG_STATE_DONE_SYNC, "XLOG_STATE_DONE_SYNC" }, \
{ XLOG_STATE_CALLBACK, "XLOG_STATE_CALLBACK" }, \
- { XLOG_STATE_DIRTY, "XLOG_STATE_DIRTY" }, \
- { XLOG_STATE_IOERROR, "XLOG_STATE_IOERROR" }
+ { XLOG_STATE_DIRTY, "XLOG_STATE_DIRTY" }
/*
* In core log flags
@@ -251,6 +240,7 @@ struct xfs_cil_ctx {
xfs_csn_t sequence; /* chkpt sequence # */
xfs_lsn_t start_lsn; /* first LSN of chkpt commit */
xfs_lsn_t commit_lsn; /* chkpt commit record lsn */
+ struct xlog_in_core *commit_iclog;
struct xlog_ticket *ticket; /* chkpt ticket */
int nvecs; /* number of regions */
int space_used; /* aggregate size of regions */
@@ -259,6 +249,7 @@ struct xfs_cil_ctx {
struct list_head iclog_entry;
struct list_head committing; /* ctx committing list */
struct work_struct discard_endio_work;
+ struct work_struct push_work;
};
/*
@@ -281,16 +272,18 @@ struct xfs_cil {
struct xlog *xc_log;
struct list_head xc_cil;
spinlock_t xc_cil_lock;
+ struct workqueue_struct *xc_push_wq;
struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp;
struct xfs_cil_ctx *xc_ctx;
spinlock_t xc_push_lock ____cacheline_aligned_in_smp;
xfs_csn_t xc_push_seq;
+ bool xc_push_commit_stable;
struct list_head xc_committing;
wait_queue_head_t xc_commit_wait;
+ wait_queue_head_t xc_start_wait;
xfs_csn_t xc_current_sequence;
- struct work_struct xc_push_work;
wait_queue_head_t xc_push_wait; /* background push throttle */
} ____cacheline_aligned_in_smp;
@@ -407,7 +400,7 @@ struct xlog {
struct xfs_buftarg *l_targ; /* buftarg of log */
struct workqueue_struct *l_ioend_workqueue; /* for I/O completions */
struct delayed_work l_work; /* background flush work */
- uint l_flags;
+ long l_opstate; /* operational state */
uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
struct list_head *l_buf_cancel_table;
int l_iclog_hsize; /* size of iclog header */
@@ -456,13 +449,40 @@ struct xlog {
xfs_lsn_t l_recovery_lsn;
uint32_t l_iclog_roundoff;/* padding roundoff */
+
+ /* Users of log incompat features should take a read lock. */
+ struct rw_semaphore l_incompat_users;
};
#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE))
-#define XLOG_FORCED_SHUTDOWN(log) \
- (unlikely((log)->l_flags & XLOG_IO_ERROR))
+/*
+ * Bits for operational state
+ */
+#define XLOG_ACTIVE_RECOVERY 0 /* in the middle of recovery */
+#define XLOG_RECOVERY_NEEDED 1 /* log was recovered */
+#define XLOG_IO_ERROR 2 /* log hit an I/O error, and being
+ shutdown */
+#define XLOG_TAIL_WARN 3 /* log tail verify warning issued */
+
+static inline bool
+xlog_recovery_needed(struct xlog *log)
+{
+ return test_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate);
+}
+
+static inline bool
+xlog_in_recovery(struct xlog *log)
+{
+ return test_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);
+}
+
+static inline bool
+xlog_is_shutdown(struct xlog *log)
+{
+ return test_bit(XLOG_IO_ERROR, &log->l_opstate);
+}
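/*
 * Editorial sketch (hypothetical helper, not part of the patch): because
 * l_opstate is only manipulated with atomic bitops, simple predicates can be
 * composed from the accessors above without any locking, e.g.:
 *
 *	static inline bool xlog_example_writable(struct xlog *log)
 *	{
 *		return !xlog_in_recovery(log) && !xlog_is_shutdown(log);
 *	}
 */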
/* common routines */
extern int
@@ -496,14 +516,14 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
void xlog_print_trans(struct xfs_trans *);
-int xlog_write(struct xlog *log, struct xfs_log_vec *log_vector,
- struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
- struct xlog_in_core **commit_iclog, uint optype);
-int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket,
- struct xlog_in_core **iclog, xfs_lsn_t *lsn);
+int xlog_write(struct xlog *log, struct xfs_cil_ctx *ctx,
+ struct xfs_log_vec *log_vector, struct xlog_ticket *tic,
+ uint optype);
void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
+void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog,
+ int eventual_size);
int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog,
xfs_lsn_t log_tail_lsn);
@@ -571,10 +591,14 @@ void xlog_cil_destroy(struct xlog *log);
bool xlog_cil_empty(struct xlog *log);
void xlog_cil_commit(struct xlog *log, struct xfs_trans *tp,
xfs_csn_t *commit_seq, bool regrant);
+void xlog_cil_set_ctx_write_state(struct xfs_cil_ctx *ctx,
+ struct xlog_in_core *iclog);
+
/*
* CIL force routines
*/
+void xlog_cil_flush(struct xlog *log);
xfs_lsn_t xlog_cil_force_seq(struct xlog *log, xfs_csn_t sequence);
static inline void
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1721fce2ec94..10562ecbd9ea 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -26,6 +26,8 @@
#include "xfs_error.h"
#include "xfs_buf_item.h"
#include "xfs_ag.h"
+#include "xfs_quota.h"
+
#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
@@ -79,8 +81,6 @@ xlog_alloc_buffer(
struct xlog *log,
int nbblks)
{
- int align_mask = xfs_buftarg_dma_alignment(log->l_targ);
-
/*
* Pass log block 0 since we don't have an addr yet, buffer will be
* verified on read.
@@ -108,7 +108,7 @@ xlog_alloc_buffer(
if (nbblks > 1 && log->l_sectBBsize > 1)
nbblks += log->l_sectBBsize;
nbblks = round_up(nbblks, log->l_sectBBsize);
- return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL | KM_ZERO);
+ return kvzalloc(BBTOB(nbblks), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
}
/*
@@ -146,7 +146,7 @@ xlog_do_io(
error = xfs_rw_bdev(log->l_targ->bt_bdev, log->l_logBBstart + blk_no,
BBTOB(nbblks), data, op);
- if (error && !XFS_FORCED_SHUTDOWN(log->l_mp)) {
+ if (error && !xlog_is_shutdown(log)) {
xfs_alert(log->l_mp,
"log recovery %s I/O error at daddr 0x%llx len %d error %d",
op == REQ_OP_WRITE ? "write" : "read",
@@ -375,7 +375,7 @@ out:
static inline int
xlog_logrec_hblks(struct xlog *log, struct xlog_rec_header *rh)
{
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ if (xfs_has_logv2(log->l_mp)) {
int h_size = be32_to_cpu(rh->h_size);
if ((be32_to_cpu(rh->h_version) & XLOG_VERSION_2) &&
@@ -1347,7 +1347,7 @@ xlog_find_tail(
* headers if we have a filesystem using non-persistent counters.
*/
if (clean)
- log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
+ set_bit(XFS_OPSTATE_CLEAN, &log->l_mp->m_opstate);
/*
* Make sure that there are no blocks in front of the head
@@ -1504,7 +1504,7 @@ xlog_add_record(
recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
recp->h_cycle = cpu_to_be32(cycle);
recp->h_version = cpu_to_be32(
- xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
+ xfs_has_logv2(log->l_mp) ? 2 : 1);
recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
recp->h_fmt = cpu_to_be32(XLOG_FMT);
@@ -1756,6 +1756,30 @@ xlog_recover_release_intent(
spin_unlock(&ailp->ail_lock);
}
+int
+xlog_recover_iget(
+ struct xfs_mount *mp,
+ xfs_ino_t ino,
+ struct xfs_inode **ipp)
+{
+ int error;
+
+ error = xfs_iget(mp, NULL, ino, 0, 0, ipp);
+ if (error)
+ return error;
+
+ error = xfs_qm_dqattach(*ipp);
+ if (error) {
+ xfs_irele(*ipp);
+ return error;
+ }
+
+ if (VFS_I(*ipp)->i_nlink == 0)
+ xfs_iflags_set(*ipp, XFS_IRECOVERY);
+
+ return 0;
+}
+
/******************************************************************************
*
* Log recover routines
@@ -2062,7 +2086,9 @@ xlog_recover_add_to_cont_trans(
old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
old_len = item->ri_buf[item->ri_cnt-1].i_len;
- ptr = krealloc(old_ptr, len + old_len, GFP_KERNEL | __GFP_NOFAIL);
+ ptr = kvrealloc(old_ptr, old_len, len + old_len, GFP_KERNEL);
+ if (!ptr)
+ return -ENOMEM;
memcpy(&ptr[old_len], dp, len);
item->ri_buf[item->ri_cnt-1].i_len += len;
item->ri_buf[item->ri_cnt-1].i_addr = ptr;
@@ -2786,6 +2812,13 @@ xlog_recover_process_iunlinks(
}
xfs_buf_rele(agibp);
}
+
+ /*
+ * Flush the pending unlinked inodes to ensure that the inactivations
+ * are fully completed on disk and the incore inodes can be reclaimed
+ * before we signal that recovery is complete.
+ */
+ xfs_inodegc_flush(mp);
}
STATIC void
@@ -2802,7 +2835,7 @@ xlog_unpack_data(
dp += BBSIZE;
}
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ if (xfs_has_logv2(log->l_mp)) {
xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -2850,7 +2883,7 @@ xlog_recover_process(
* the kernel from one that does not add CRCs by default.
*/
if (crc != old_crc) {
- if (old_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
+ if (old_crc || xfs_has_crc(log->l_mp)) {
xfs_alert(log->l_mp,
"log record CRC mismatch: found 0x%x, expected 0x%x.",
le32_to_cpu(old_crc),
@@ -2862,7 +2895,7 @@ xlog_recover_process(
* If the filesystem is CRC enabled, this mismatch becomes a
* fatal log corruption failure.
*/
- if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
+ if (xfs_has_crc(log->l_mp)) {
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
return -EFSCORRUPTED;
}
@@ -2948,7 +2981,7 @@ xlog_do_recovery_pass(
* Read the header of the tail block and get the iclog buffer size from
* h_size. Use this to tell how many sectors make up the log header.
*/
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ if (xfs_has_logv2(log->l_mp)) {
/*
* When using variable length iclogs, read first sector of
* iclog header and extract the header size from it. Get a
@@ -3280,10 +3313,7 @@ xlog_do_recover(
if (error)
return error;
- /*
- * If IO errors happened during recovery, bail out.
- */
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xlog_is_shutdown(log))
return -EIO;
/*
@@ -3305,7 +3335,7 @@ xlog_do_recover(
xfs_buf_hold(bp);
error = _xfs_buf_read(bp, XBF_READ);
if (error) {
- if (!XFS_FORCED_SHUTDOWN(mp)) {
+ if (!xlog_is_shutdown(log)) {
xfs_buf_ioerror_alert(bp, __this_address);
ASSERT(0);
}
@@ -3318,6 +3348,7 @@ xlog_do_recover(
xfs_buf_relse(bp);
/* re-initialise in-core superblock and geometry structures */
+ mp->m_features |= xfs_sb_version_to_features(sbp);
xfs_reinit_percpu_counters(mp);
error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
if (error) {
@@ -3329,7 +3360,7 @@ xlog_do_recover(
xlog_recover_check_summary(log);
/* Normal transactions can now occur */
- log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
+ clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);
return 0;
}
@@ -3355,7 +3386,7 @@ xlog_recover(
* could not be verified. Check the superblock LSN against the current
* LSN now that it's known.
*/
- if (xfs_sb_version_hascrc(&log->l_mp->m_sb) &&
+ if (xfs_has_crc(log->l_mp) &&
!xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn))
return -EINVAL;
@@ -3382,7 +3413,7 @@ xlog_recover(
* (e.g. unsupported transactions, then simply reject the
* attempt at recovery before touching anything.
*/
- if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 &&
+ if (xfs_sb_is_v5(&log->l_mp->m_sb) &&
xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
xfs_warn(log->l_mp,
@@ -3413,68 +3444,64 @@ xlog_recover(
: "internal");
error = xlog_do_recover(log, head_blk, tail_blk);
- log->l_flags |= XLOG_RECOVERY_NEEDED;
+ set_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate);
}
return error;
}
/*
- * In the first part of recovery we replay inodes and buffers and build
- * up the list of extent free items which need to be processed. Here
- * we process the extent free items and clean up the on disk unlinked
- * inode lists. This is separated from the first part of recovery so
- * that the root and real-time bitmap inodes can be read in from disk in
- * between the two stages. This is necessary so that we can free space
- * in the real-time portion of the file system.
+ * In the first part of recovery we replay inodes and buffers and build up the
+ * list of intents which need to be processed. Here we process the intents and
+ * clean up the on disk unlinked inode lists. This is separated from the first
+ * part of recovery so that the root and real-time bitmap inodes can be read in
+ * from disk in between the two stages. This is necessary so that we can free
+ * space in the real-time portion of the file system.
*/
int
xlog_recover_finish(
struct xlog *log)
{
- /*
- * Now we're ready to do the transactions needed for the
- * rest of recovery. Start with completing all the extent
- * free intent records and then process the unlinked inode
- * lists. At this point, we essentially run in normal mode
- * except that we're still performing recovery actions
- * rather than accepting new requests.
- */
- if (log->l_flags & XLOG_RECOVERY_NEEDED) {
- int error;
- error = xlog_recover_process_intents(log);
- if (error) {
- /*
- * Cancel all the unprocessed intent items now so that
- * we don't leave them pinned in the AIL. This can
- * cause the AIL to livelock on the pinned item if
- * anyone tries to push the AIL (inode reclaim does
- * this) before we get around to xfs_log_mount_cancel.
- */
- xlog_recover_cancel_intents(log);
- xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
- xfs_alert(log->l_mp, "Failed to recover intents");
- return error;
- }
+ int error;
+ error = xlog_recover_process_intents(log);
+ if (error) {
/*
- * Sync the log to get all the intents out of the AIL.
- * This isn't absolutely necessary, but it helps in
- * case the unlink transactions would have problems
- * pushing the intents out of the way.
+ * Cancel all the unprocessed intent items now so that we don't
+ * leave them pinned in the AIL. This can cause the AIL to
+ * livelock on the pinned item if anyone tries to push the AIL
+ * (inode reclaim does this) before we get around to
+ * xfs_log_mount_cancel.
*/
- xfs_log_force(log->l_mp, XFS_LOG_SYNC);
-
- xlog_recover_process_iunlinks(log);
+ xlog_recover_cancel_intents(log);
+ xfs_alert(log->l_mp, "Failed to recover intents");
+ xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+ return error;
+ }
- xlog_recover_check_summary(log);
+ /*
+ * Sync the log to get all the intents out of the AIL. This isn't
+ * absolutely necessary, but it helps in case the unlink transactions
+ * would have problems pushing the intents out of the way.
+ */
+ xfs_log_force(log->l_mp, XFS_LOG_SYNC);
- xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
- log->l_mp->m_logname ? log->l_mp->m_logname
- : "internal");
- log->l_flags &= ~XLOG_RECOVERY_NEEDED;
- } else {
- xfs_info(log->l_mp, "Ending clean mount");
+ /*
+ * Now that we've recovered the log and all the intents, we can clear
+ * the log incompat feature bits in the superblock because there's no
+ * longer anything to protect. We rely on the AIL push to write out the
+ * updated superblock after everything else.
+ */
+ if (xfs_clear_incompat_log_features(log->l_mp)) {
+ error = xfs_sync_sb(log->l_mp, false);
+ if (error < 0) {
+ xfs_alert(log->l_mp,
+ "Failed to clear log incompat features on recovery");
+ return error;
+ }
}
+
+ xlog_recover_process_iunlinks(log);
+ xlog_recover_check_summary(log);
return 0;
}
@@ -3482,7 +3509,7 @@ void
xlog_recover_cancel(
struct xlog *log)
{
- if (log->l_flags & XLOG_RECOVERY_NEEDED)
+ if (xlog_recovery_needed(log))
xlog_recover_cancel_intents(log);
}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d0755494597f..06dac09eddbd 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -62,7 +62,7 @@ xfs_uuid_mount(
/* Publish UUID in struct super_block */
uuid_copy(&mp->m_super->s_uuid, uuid);
- if (mp->m_flags & XFS_MOUNT_NOUUID)
+ if (xfs_has_nouuid(mp))
return 0;
if (uuid_is_null(uuid)) {
@@ -104,7 +104,7 @@ xfs_uuid_unmount(
uuid_t *uuid = &mp->m_sb.sb_uuid;
int i;
- if (mp->m_flags & XFS_MOUNT_NOUUID)
+ if (xfs_has_nouuid(mp))
return;
mutex_lock(&xfs_uuid_table_mutex);
@@ -225,6 +225,7 @@ reread:
goto reread;
}
+ mp->m_features |= xfs_sb_version_to_features(sbp);
xfs_reinit_percpu_counters(mp);
/* no need to be quiet anymore, so reset the buf ops */
@@ -318,7 +319,7 @@ xfs_validate_new_dalign(
}
}
- if (!xfs_sb_version_hasdalign(&mp->m_sb)) {
+ if (!xfs_has_dalign(mp)) {
xfs_warn(mp,
"cannot change alignment: superblock does not support data alignment");
return -EINVAL;
@@ -349,8 +350,7 @@ xfs_update_alignment(
sbp->sb_unit = mp->m_dalign;
sbp->sb_width = mp->m_swidth;
mp->m_update_sb = true;
- } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
- xfs_sb_version_hasdalign(&mp->m_sb)) {
+ } else if (!xfs_has_noalign(mp) && xfs_has_dalign(mp)) {
mp->m_dalign = sbp->sb_unit;
mp->m_swidth = sbp->sb_width;
}
@@ -365,13 +365,16 @@ void
xfs_set_low_space_thresholds(
struct xfs_mount *mp)
{
- int i;
+ uint64_t dblocks = mp->m_sb.sb_dblocks;
+ uint64_t rtexts = mp->m_sb.sb_rextents;
+ int i;
- for (i = 0; i < XFS_LOWSP_MAX; i++) {
- uint64_t space = mp->m_sb.sb_dblocks;
+ do_div(dblocks, 100);
+ do_div(rtexts, 100);
- do_div(space, 100);
- mp->m_low_space[i] = space * (i + 1);
+ for (i = 0; i < XFS_LOWSP_MAX; i++) {
+ mp->m_low_space[i] = dblocks * (i + 1);
+ mp->m_low_rtexts[i] = rtexts * (i + 1);
}
}
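/*
 * Worked example (assuming XFS_LOWSP_MAX covers the 1%..5% levels): with
 * sb_dblocks = 1,000,000 the loop fills m_low_space[] with 10,000, 20,000,
 * 30,000, 40,000 and 50,000 blocks, and m_low_rtexts[] is filled the same
 * way from sb_rextents.
 */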
@@ -485,7 +488,7 @@ xfs_check_summary_counts(
* counters. If any of them are obviously incorrect, we can recompute
* them from the AGF headers in the next step.
*/
- if (XFS_LAST_UNMOUNT_WAS_CLEAN(mp) &&
+ if (xfs_is_clean(mp) &&
(mp->m_sb.sb_fdblocks > mp->m_sb.sb_dblocks ||
!xfs_verify_icount(mp, mp->m_sb.sb_icount) ||
mp->m_sb.sb_ifree > mp->m_sb.sb_icount))
@@ -502,8 +505,7 @@ xfs_check_summary_counts(
* superblock to be correct and we don't need to do anything here.
* Otherwise, recalculate the summary counters.
*/
- if ((!xfs_sb_version_haslazysbcount(&mp->m_sb) ||
- XFS_LAST_UNMOUNT_WAS_CLEAN(mp)) &&
+ if ((!xfs_has_lazysbcount(mp) || xfs_is_clean(mp)) &&
!xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS))
return 0;
@@ -514,7 +516,8 @@ xfs_check_summary_counts(
* Flush and reclaim dirty inodes in preparation for unmount. Inodes and
* internal inode structures can be sitting in the CIL and AIL at this point,
* so we need to unpin them, write them back and/or reclaim them before unmount
- * can proceed.
+ * can proceed. In other words, callers are required to have inactivated all
+ * inodes.
*
* An inode cluster that has been freed can have its buffer still pinned in
* memory because the transaction is still sitting in a iclog. The stale inodes
@@ -543,9 +546,10 @@ xfs_unmount_flush_inodes(
xfs_extent_busy_wait_all(mp);
flush_workqueue(xfs_discard_wq);
- mp->m_flags |= XFS_MOUNT_UNMOUNTING;
+ set_bit(XFS_OPSTATE_UNMOUNTING, &mp->m_opstate);
xfs_ail_push_all_sync(mp->m_ail);
+ xfs_inodegc_stop(mp);
cancel_delayed_work_sync(&mp->m_reclaim_work);
xfs_reclaim_inodes(mp);
xfs_health_unmount(mp);
@@ -607,29 +611,13 @@ xfs_mountfs(
xfs_warn(mp, "correcting sb_features alignment problem");
sbp->sb_features2 |= sbp->sb_bad_features2;
mp->m_update_sb = true;
-
- /*
- * Re-check for ATTR2 in case it was found in bad_features2
- * slot.
- */
- if (xfs_sb_version_hasattr2(&mp->m_sb) &&
- !(mp->m_flags & XFS_MOUNT_NOATTR2))
- mp->m_flags |= XFS_MOUNT_ATTR2;
}
- if (xfs_sb_version_hasattr2(&mp->m_sb) &&
- (mp->m_flags & XFS_MOUNT_NOATTR2)) {
- xfs_sb_version_removeattr2(&mp->m_sb);
- mp->m_update_sb = true;
-
- /* update sb_versionnum for the clearing of the morebits */
- if (!sbp->sb_features2)
- mp->m_update_sb = true;
- }
/* always use v2 inodes by default now */
if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) {
mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
+ mp->m_features |= XFS_FEAT_NLINK;
mp->m_update_sb = true;
}
@@ -702,7 +690,7 @@ xfs_mountfs(
* cluster size. Full inode chunk alignment must match the chunk size,
* but that is checked on sb read verification...
*/
- if (xfs_sb_version_hassparseinodes(&mp->m_sb) &&
+ if (xfs_has_sparseinodes(mp) &&
mp->m_sb.sb_spino_align !=
XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw)) {
xfs_warn(mp,
@@ -764,6 +752,10 @@ xfs_mountfs(
goto out_free_perag;
}
+ error = xfs_inodegc_register_shrinker(mp);
+ if (error)
+ goto out_fail_wait;
+
/*
* Log's mount-time initialization. The first part of recovery can place
* some items on the AIL, to be handled when recovery is finished or
@@ -774,7 +766,7 @@ xfs_mountfs(
XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
if (error) {
xfs_warn(mp, "log mount failed");
- goto out_fail_wait;
+ goto out_inodegc_shrinker;
}
/* Make sure the summary counts are ok. */
@@ -782,6 +774,23 @@ xfs_mountfs(
if (error)
goto out_log_dealloc;
+ /* Enable background inode inactivation workers. */
+ xfs_inodegc_start(mp);
+ xfs_blockgc_start(mp);
+
+ /*
+ * Now that we've recovered any pending superblock feature bit
+ * additions, we can finish setting up the attr2 behaviour for the
+ * mount. The noattr2 option overrides the superblock flag, so only
+ * check the superblock feature flag if the mount option is not set.
+ */
+ if (xfs_has_noattr2(mp)) {
+ mp->m_features &= ~XFS_FEAT_ATTR2;
+ } else if (!xfs_has_attr2(mp) &&
+ (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)) {
+ mp->m_features |= XFS_FEAT_ATTR2;
+ }
+
/*
* Get and sanity-check the root inode.
* Save the pointer to it in the mount structure.
@@ -825,7 +834,7 @@ xfs_mountfs(
* the next remount into writeable mode. Otherwise we would never
* perform the update e.g. for the root filesystem.
*/
- if (mp->m_update_sb && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
+ if (mp->m_update_sb && !xfs_is_readonly(mp)) {
error = xfs_sync_sb(mp, false);
if (error) {
xfs_warn(mp, "failed to write sb changes");
@@ -836,13 +845,11 @@ xfs_mountfs(
/*
* Initialise the XFS quota management subsystem for this mount
*/
- if (XFS_IS_QUOTA_RUNNING(mp)) {
+ if (XFS_IS_QUOTA_ON(mp)) {
error = xfs_qm_newmount(mp, &quotamount, &quotaflags);
if (error)
goto out_rtunmount;
} else {
- ASSERT(!XFS_IS_QUOTA_ON(mp));
-
/*
* If a file system had quotas running earlier, but decided to
* mount without -o uquota/pquota/gquota options, revoke the
@@ -884,10 +891,8 @@ xfs_mountfs(
* We use the same quiesce mechanism as the rw->ro remount, as they are
* semantically identical operations.
*/
- if ((mp->m_flags & (XFS_MOUNT_RDONLY|XFS_MOUNT_NORECOVERY)) ==
- XFS_MOUNT_RDONLY) {
+ if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp))
xfs_log_clean(mp);
- }
/*
* Complete the quota initialisation, post-log-replay component.
@@ -910,7 +915,7 @@ xfs_mountfs(
* This may drive us straight to ENOSPC on mount, but that implies
* we were already there on the last unmount. Warn if this occurs.
*/
- if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+ if (!xfs_is_readonly(mp)) {
resblks = xfs_default_resblks(mp);
error = xfs_reserve_blocks(mp, &resblks, NULL);
if (error)
@@ -944,6 +949,15 @@ xfs_mountfs(
xfs_irele(rip);
/* Clean out dquots that might be in memory after quotacheck. */
xfs_qm_unmount(mp);
+
+ /*
+ * Inactivate all inodes that might still be in memory after a log
+ * intent recovery failure so that reclaim can free them. Metadata
+ * inodes and the root directory shouldn't need inactivation, but the
+ * mount failed for some reason, so pull down all the state and flee.
+ */
+ xfs_inodegc_flush(mp);
+
/*
* Flush all inode reclamation work and flush the log.
* We have to do this /after/ rtunmount and qm_unmount because those
@@ -958,6 +972,8 @@ xfs_mountfs(
xfs_unmount_flush_inodes(mp);
out_log_dealloc:
xfs_log_mount_cancel(mp);
+ out_inodegc_shrinker:
+ unregister_shrinker(&mp->m_inodegc_shrinker);
out_fail_wait:
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
xfs_buftarg_drain(mp->m_logdev_targp);
@@ -991,6 +1007,16 @@ xfs_unmountfs(
uint64_t resblks;
int error;
+ /*
+ * Perform all on-disk metadata updates required to inactivate inodes
+ * that the VFS evicted earlier in the unmount process. Freeing inodes
+ * and discarding CoW fork preallocations can cause shape changes to
+ * the free inode and refcount btrees, respectively, so we must finish
+ * this before we discard the metadata space reservations. Metadata
+ * inodes and the root directory do not require inactivation.
+ */
+ xfs_inodegc_flush(mp);
+
xfs_blockgc_stop(mp);
xfs_fs_unreserve_ag_blocks(mp);
xfs_qm_unmount_quotas(mp);
@@ -1028,6 +1054,7 @@ xfs_unmountfs(
#if defined(DEBUG)
xfs_errortag_clearall(mp);
#endif
+ unregister_shrinker(&mp->m_inodegc_shrinker);
xfs_free_perag(mp);
xfs_errortag_del(mp);
@@ -1049,20 +1076,12 @@ xfs_fs_writable(
{
ASSERT(level > SB_UNFROZEN);
if ((mp->m_super->s_writers.frozen >= level) ||
- XFS_FORCED_SHUTDOWN(mp) || (mp->m_flags & XFS_MOUNT_RDONLY))
+ xfs_is_shutdown(mp) || xfs_is_readonly(mp))
return false;
return true;
}
-/*
- * Deltas for the block count can vary from 1 to very large, but lock contention
- * only occurs on frequent small block count updates such as in the delayed
- * allocation path for buffered writes (page a time updates). Hence we set
- * a large batch count (1024) to minimise global counter updates except when
- * we get near to ENOSPC and we have to be very accurate with our updates.
- */
-#define XFS_FDBLOCKS_BATCH 1024
int
xfs_mod_fdblocks(
struct xfs_mount *mp,
@@ -1210,13 +1229,123 @@ void
xfs_force_summary_recalc(
struct xfs_mount *mp)
{
- if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
+ if (!xfs_has_lazysbcount(mp))
return;
xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
}
/*
+ * Enable a log incompat feature flag in the primary superblock. The caller
+ * cannot have any other transactions in progress.
+ */
+int
+xfs_add_incompat_log_feature(
+ struct xfs_mount *mp,
+ uint32_t feature)
+{
+ struct xfs_dsb *dsb;
+ int error;
+
+ ASSERT(hweight32(feature) == 1);
+ ASSERT(!(feature & XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
+
+ /*
+ * Force the log to disk and kick the background AIL thread to reduce
+ * the chances that the bwrite will stall waiting for the AIL to unpin
+ * the primary superblock buffer. This isn't a data integrity
+ * operation, so we don't need a synchronous push.
+ */
+ error = xfs_log_force(mp, XFS_LOG_SYNC);
+ if (error)
+ return error;
+ xfs_ail_push_all(mp->m_ail);
+
+ /*
+ * Lock the primary superblock buffer to serialize all callers that
+ * are trying to set feature bits.
+ */
+ xfs_buf_lock(mp->m_sb_bp);
+ xfs_buf_hold(mp->m_sb_bp);
+
+ if (xfs_is_shutdown(mp)) {
+ error = -EIO;
+ goto rele;
+ }
+
+ if (xfs_sb_has_incompat_log_feature(&mp->m_sb, feature))
+ goto rele;
+
+ /*
+ * Write the primary superblock to disk immediately, because we need
+ * the log_incompat bit to be set in the primary super now to protect
+ * the log items that we're going to commit later.
+ */
+ dsb = mp->m_sb_bp->b_addr;
+ xfs_sb_to_disk(dsb, &mp->m_sb);
+ dsb->sb_features_log_incompat |= cpu_to_be32(feature);
+ error = xfs_bwrite(mp->m_sb_bp);
+ if (error)
+ goto shutdown;
+
+ /*
+ * Add the feature bits to the incore superblock before we unlock the
+ * buffer.
+ */
+ xfs_sb_add_incompat_log_features(&mp->m_sb, feature);
+ xfs_buf_relse(mp->m_sb_bp);
+
+ /* Log the superblock to disk. */
+ return xfs_sync_sb(mp, false);
+shutdown:
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+rele:
+ xfs_buf_relse(mp->m_sb_bp);
+ return error;
+}
+
+/*
+ * Clear all the log incompat flags from the superblock.
+ *
+ * The caller cannot be in a transaction, must ensure that the log does not
+ * contain any log items protected by any log incompat bit, and must ensure
+ * that there are no other threads that depend on the state of the log incompat
+ * feature flags in the primary super.
+ *
+ * Returns true if the superblock is dirty.
+ */
+bool
+xfs_clear_incompat_log_features(
+ struct xfs_mount *mp)
+{
+ bool ret = false;
+
+ if (!xfs_has_crc(mp) ||
+ !xfs_sb_has_incompat_log_feature(&mp->m_sb,
+ XFS_SB_FEAT_INCOMPAT_LOG_ALL) ||
+ xfs_is_shutdown(mp))
+ return false;
+
+ /*
+ * Update the incore superblock. We synchronize on the primary super
+ * buffer lock to be consistent with the add function, though at least
+ * in theory this shouldn't be necessary.
+ */
+ xfs_buf_lock(mp->m_sb_bp);
+ xfs_buf_hold(mp->m_sb_bp);
+
+ if (xfs_sb_has_incompat_log_feature(&mp->m_sb,
+ XFS_SB_FEAT_INCOMPAT_LOG_ALL)) {
+ xfs_info(mp, "Clearing log incompat feature flags.");
+ xfs_sb_remove_incompat_log_features(&mp->m_sb);
+ ret = true;
+ }
+
+ xfs_buf_relse(mp->m_sb_bp);
+ return ret;
+}
+
+/*
* Update the in-core delayed block counter.
*
* We prefer to update the counter without having to take a spinlock for every
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index c78b63fe779a..e091f3b3fa15 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -57,6 +57,18 @@ struct xfs_error_cfg {
};
/*
+ * Per-cpu deferred inode inactivation GC lists.
+ */
+struct xfs_inodegc {
+ struct llist_head list;
+ struct work_struct work;
+
+ /* approximate count of inodes in the list */
+ unsigned int items;
+ unsigned int shrinker_hits;
+};
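/*
 * Rough sketch of how such a per-cpu structure might be fed (hypothetical
 * helper, not part of the patch; assumes the caller hands in an llist_node
 * embedded in the inode, which is not shown in this diff):
 */
static inline void xfs_inodegc_queue_sketch(struct xfs_mount *mp,
		struct llist_node *gclist)
{
	struct xfs_inodegc	*gc;

	gc = get_cpu_ptr(mp->m_inodegc);
	llist_add(gclist, &gc->list);
	gc->items++;
	queue_work(mp->m_inodegc_wq, &gc->work);
	put_cpu_ptr(mp->m_inodegc);
}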
+
+/*
* The struct xfsmount layout is optimised to separate read-mostly variables
* from variables that are frequently modified. We put the read-mostly variables
* first, then place all the other variables at the end.
@@ -82,6 +94,9 @@ typedef struct xfs_mount {
xfs_buftarg_t *m_ddev_targp; /* saves taking the address */
xfs_buftarg_t *m_logdev_targp;/* ptr to log device */
xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */
+ struct list_head m_mount_list; /* global mount list */
+ void __percpu *m_inodegc; /* percpu inodegc structures */
+
/*
* Optional cache of rt summary level per bitmap block with the
* invariant that m_rsum_cache[bbno] <= the minimum i for which
@@ -92,10 +107,10 @@ typedef struct xfs_mount {
struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
struct workqueue_struct *m_buf_workqueue;
struct workqueue_struct *m_unwritten_workqueue;
- struct workqueue_struct *m_cil_workqueue;
struct workqueue_struct *m_reclaim_workqueue;
- struct workqueue_struct *m_gc_workqueue;
struct workqueue_struct *m_sync_workqueue;
+ struct workqueue_struct *m_blockgc_wq;
+ struct workqueue_struct *m_inodegc_wq;
int m_bsize; /* fs logical block size */
uint8_t m_blkbit_log; /* blocklog + NBBY */
@@ -131,11 +146,13 @@ typedef struct xfs_mount {
uint m_rsumsize; /* size of rt summary, bytes */
int m_fixedfsid[2]; /* unchanged for life of FS */
uint m_qflags; /* quota status flags */
- uint64_t m_flags; /* global mount flags */
- int64_t m_low_space[XFS_LOWSP_MAX];
+ uint64_t m_features; /* active filesystem features */
+ uint64_t m_low_space[XFS_LOWSP_MAX];
+ uint64_t m_low_rtexts[XFS_LOWSP_MAX];
struct xfs_ino_geometry m_ino_geo; /* inode geometry */
struct xfs_trans_resv m_resv; /* precomputed res values */
/* low free space thresholds */
+ unsigned long m_opstate; /* dynamic state flags */
bool m_always_cow;
bool m_fail_unmount;
bool m_finobt_nores; /* no per-AG finobt resv. */
@@ -193,6 +210,8 @@ typedef struct xfs_mount {
xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */
spinlock_t m_agirotor_lock;/* .. and lock protecting it */
+ /* Memory shrinker to throttle and reprioritize inodegc */
+ struct shrinker m_inodegc_shrinker;
/*
* Workqueue item so that we can coalesce multiple inode flush attempts
* into a single flush.
@@ -225,38 +244,178 @@ typedef struct xfs_mount {
#define M_IGEO(mp) (&(mp)->m_ino_geo)
/*
- * Flags for m_flags.
+ * Flags for m_features.
+ *
+ * These are all the active features in the filesystem, regardless of how
+ * they are configured.
*/
-#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops
- must be synchronous except
- for space allocations */
-#define XFS_MOUNT_UNMOUNTING (1ULL << 1) /* filesystem is unmounting */
-#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
-#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
- operations, typically for
- disk errors in metadata */
-#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */
-#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment
- allocations */
-#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */
-#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */
-#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */
-#define XFS_MOUNT_ALLOCSIZE (1ULL << 12) /* specified allocation size */
-#define XFS_MOUNT_SMALL_INUMS (1ULL << 14) /* user wants 32bit inodes */
-#define XFS_MOUNT_32BITINODES (1ULL << 15) /* inode32 allocator active */
-#define XFS_MOUNT_NOUUID (1ULL << 16) /* ignore uuid during mount */
-#define XFS_MOUNT_IKEEP (1ULL << 18) /* keep empty inode clusters*/
-#define XFS_MOUNT_SWALLOC (1ULL << 19) /* turn on stripe width
- * allocation */
-#define XFS_MOUNT_RDONLY (1ULL << 20) /* read-only fs */
-#define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */
-#define XFS_MOUNT_LARGEIO (1ULL << 22) /* report large preferred
+#define XFS_FEAT_ATTR (1ULL << 0) /* xattrs present in fs */
+#define XFS_FEAT_NLINK (1ULL << 1) /* 32 bit link counts */
+#define XFS_FEAT_QUOTA (1ULL << 2) /* quota active */
+#define XFS_FEAT_ALIGN (1ULL << 3) /* inode alignment */
+#define XFS_FEAT_DALIGN (1ULL << 4) /* data alignment */
+#define XFS_FEAT_LOGV2 (1ULL << 5) /* version 2 logs */
+#define XFS_FEAT_SECTOR (1ULL << 6) /* sector size > 512 bytes */
+#define XFS_FEAT_EXTFLG (1ULL << 7) /* unwritten extents */
+#define XFS_FEAT_ASCIICI (1ULL << 8) /* ASCII only case-insens. */
+#define XFS_FEAT_LAZYSBCOUNT (1ULL << 9) /* Superblk counters */
+#define XFS_FEAT_ATTR2 (1ULL << 10) /* dynamic attr fork */
+#define XFS_FEAT_PARENT (1ULL << 11) /* parent pointers */
+#define XFS_FEAT_PROJID32 (1ULL << 12) /* 32 bit project id */
+#define XFS_FEAT_CRC (1ULL << 13) /* metadata CRCs */
+#define XFS_FEAT_V3INODES (1ULL << 14) /* Version 3 inodes */
+#define XFS_FEAT_PQUOTINO (1ULL << 15) /* non-shared proj/grp quotas */
+#define XFS_FEAT_FTYPE (1ULL << 16) /* inode type in dir */
+#define XFS_FEAT_FINOBT (1ULL << 17) /* free inode btree */
+#define XFS_FEAT_RMAPBT (1ULL << 18) /* reverse map btree */
+#define XFS_FEAT_REFLINK (1ULL << 19) /* reflinked files */
+#define XFS_FEAT_SPINODES (1ULL << 20) /* sparse inode chunks */
+#define XFS_FEAT_META_UUID (1ULL << 21) /* metadata UUID */
+#define XFS_FEAT_REALTIME (1ULL << 22) /* realtime device present */
+#define XFS_FEAT_INOBTCNT (1ULL << 23) /* inobt block counts */
+#define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */
+#define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */
+
+/* Mount features */
+#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
+#define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */
+#define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */
+#define XFS_FEAT_LARGE_IOSIZE (1ULL << 51) /* report large preferred
* I/O size in stat() */
-#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams
- allocator */
-#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */
-#define XFS_MOUNT_DAX_ALWAYS (1ULL << 26)
-#define XFS_MOUNT_DAX_NEVER (1ULL << 27)
+#define XFS_FEAT_WSYNC (1ULL << 52) /* synchronous metadata ops */
+#define XFS_FEAT_DIRSYNC (1ULL << 53) /* synchronous directory ops */
+#define XFS_FEAT_DISCARD (1ULL << 54) /* discard unused blocks */
+#define XFS_FEAT_GRPID (1ULL << 55) /* group-ID assigned from directory */
+#define XFS_FEAT_SMALL_INUMS (1ULL << 56) /* user wants 32bit inodes */
+#define XFS_FEAT_IKEEP (1ULL << 57) /* keep empty inode clusters*/
+#define XFS_FEAT_SWALLOC (1ULL << 58) /* stripe width allocation */
+#define XFS_FEAT_FILESTREAMS (1ULL << 59) /* use filestreams allocator */
+#define XFS_FEAT_DAX_ALWAYS (1ULL << 60) /* DAX always enabled */
+#define XFS_FEAT_DAX_NEVER (1ULL << 61) /* DAX never enabled */
+#define XFS_FEAT_NORECOVERY (1ULL << 62) /* no recovery - dirty fs */
+#define XFS_FEAT_NOUUID (1ULL << 63) /* ignore uuid during mount */
+
+#define __XFS_HAS_FEAT(name, NAME) \
+static inline bool xfs_has_ ## name (struct xfs_mount *mp) \
+{ \
+ return mp->m_features & XFS_FEAT_ ## NAME; \
+}
+
+/* Some features can be added dynamically so they need a set wrapper, too. */
+#define __XFS_ADD_FEAT(name, NAME) \
+ __XFS_HAS_FEAT(name, NAME); \
+static inline void xfs_add_ ## name (struct xfs_mount *mp) \
+{ \
+ mp->m_features |= XFS_FEAT_ ## NAME; \
+ xfs_sb_version_add ## name(&mp->m_sb); \
+}
+
+/* Superblock features */
+__XFS_ADD_FEAT(attr, ATTR)
+__XFS_HAS_FEAT(nlink, NLINK)
+__XFS_ADD_FEAT(quota, QUOTA)
+__XFS_HAS_FEAT(align, ALIGN)
+__XFS_HAS_FEAT(dalign, DALIGN)
+__XFS_HAS_FEAT(logv2, LOGV2)
+__XFS_HAS_FEAT(sector, SECTOR)
+__XFS_HAS_FEAT(extflg, EXTFLG)
+__XFS_HAS_FEAT(asciici, ASCIICI)
+__XFS_HAS_FEAT(lazysbcount, LAZYSBCOUNT)
+__XFS_ADD_FEAT(attr2, ATTR2)
+__XFS_HAS_FEAT(parent, PARENT)
+__XFS_ADD_FEAT(projid32, PROJID32)
+__XFS_HAS_FEAT(crc, CRC)
+__XFS_HAS_FEAT(v3inodes, V3INODES)
+__XFS_HAS_FEAT(pquotino, PQUOTINO)
+__XFS_HAS_FEAT(ftype, FTYPE)
+__XFS_HAS_FEAT(finobt, FINOBT)
+__XFS_HAS_FEAT(rmapbt, RMAPBT)
+__XFS_HAS_FEAT(reflink, REFLINK)
+__XFS_HAS_FEAT(sparseinodes, SPINODES)
+__XFS_HAS_FEAT(metauuid, META_UUID)
+__XFS_HAS_FEAT(realtime, REALTIME)
+__XFS_HAS_FEAT(inobtcounts, INOBTCNT)
+__XFS_HAS_FEAT(bigtime, BIGTIME)
+__XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
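/*
 * For illustration, __XFS_ADD_FEAT(quota, QUOTA) above generates roughly the
 * following pair of helpers; xfs_add_quota() is what xfs_qm_qino_alloc()
 * calls further down in this diff when it upgrades the superblock:
 */
static inline bool xfs_has_quota(struct xfs_mount *mp)
{
	return mp->m_features & XFS_FEAT_QUOTA;
}
static inline void xfs_add_quota(struct xfs_mount *mp)
{
	mp->m_features |= XFS_FEAT_QUOTA;
	xfs_sb_version_addquota(&mp->m_sb);
}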
+
+/*
+ * Mount features
+ *
+ * These do not change dynamically - features that can come and go, such as 32
+ * bit inodes and read-only state, are kept as operational state rather than
+ * features.
+ */
+__XFS_HAS_FEAT(noattr2, NOATTR2)
+__XFS_HAS_FEAT(noalign, NOALIGN)
+__XFS_HAS_FEAT(allocsize, ALLOCSIZE)
+__XFS_HAS_FEAT(large_iosize, LARGE_IOSIZE)
+__XFS_HAS_FEAT(wsync, WSYNC)
+__XFS_HAS_FEAT(dirsync, DIRSYNC)
+__XFS_HAS_FEAT(discard, DISCARD)
+__XFS_HAS_FEAT(grpid, GRPID)
+__XFS_HAS_FEAT(small_inums, SMALL_INUMS)
+__XFS_HAS_FEAT(ikeep, IKEEP)
+__XFS_HAS_FEAT(swalloc, SWALLOC)
+__XFS_HAS_FEAT(filestreams, FILESTREAMS)
+__XFS_HAS_FEAT(dax_always, DAX_ALWAYS)
+__XFS_HAS_FEAT(dax_never, DAX_NEVER)
+__XFS_HAS_FEAT(norecovery, NORECOVERY)
+__XFS_HAS_FEAT(nouuid, NOUUID)
+
+/*
+ * Operational mount state flags
+ *
+ * Use these with atomic bit ops only!
+ */
+#define XFS_OPSTATE_UNMOUNTING 0 /* filesystem is unmounting */
+#define XFS_OPSTATE_CLEAN 1 /* mount was clean */
+#define XFS_OPSTATE_SHUTDOWN 2 /* stop all fs operations */
+#define XFS_OPSTATE_INODE32 3 /* inode32 allocator active */
+#define XFS_OPSTATE_READONLY 4 /* read-only fs */
+
+/*
+ * If set, inactivation worker threads will be scheduled to process queued
+ * inodegc work. If not, queued inodes remain in memory waiting to be
+ * processed.
+ */
+#define XFS_OPSTATE_INODEGC_ENABLED 5
+/*
+ * If set, background speculative prealloc gc worker threads will be scheduled
+ * to process queued blockgc work. If not, inodes retain their preallocations
+ * until explicitly deleted.
+ */
+#define XFS_OPSTATE_BLOCKGC_ENABLED 6
+
+#define __XFS_IS_OPSTATE(name, NAME) \
+static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
+{ \
+ return test_bit(XFS_OPSTATE_ ## NAME, &mp->m_opstate); \
+} \
+static inline bool xfs_clear_ ## name (struct xfs_mount *mp) \
+{ \
+ return test_and_clear_bit(XFS_OPSTATE_ ## NAME, &mp->m_opstate); \
+} \
+static inline bool xfs_set_ ## name (struct xfs_mount *mp) \
+{ \
+ return test_and_set_bit(XFS_OPSTATE_ ## NAME, &mp->m_opstate); \
+}
+
+__XFS_IS_OPSTATE(unmounting, UNMOUNTING)
+__XFS_IS_OPSTATE(clean, CLEAN)
+__XFS_IS_OPSTATE(shutdown, SHUTDOWN)
+__XFS_IS_OPSTATE(inode32, INODE32)
+__XFS_IS_OPSTATE(readonly, READONLY)
+__XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED)
+__XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED)
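/*
 * For illustration, __XFS_IS_OPSTATE(shutdown, SHUTDOWN) above expands to
 * roughly the helpers below; they replace the old XFS_FORCED_SHUTDOWN(mp)
 * flag test used throughout the rest of this diff:
 */
static inline bool xfs_is_shutdown(struct xfs_mount *mp)
{
	return test_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate);
}
static inline bool xfs_clear_shutdown(struct xfs_mount *mp)
{
	return test_and_clear_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate);
}
static inline bool xfs_set_shutdown(struct xfs_mount *mp)
{
	return test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate);
}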
+
+#define XFS_OPSTATE_STRINGS \
+ { (1UL << XFS_OPSTATE_UNMOUNTING), "unmounting" }, \
+ { (1UL << XFS_OPSTATE_CLEAN), "clean" }, \
+ { (1UL << XFS_OPSTATE_SHUTDOWN), "shutdown" }, \
+ { (1UL << XFS_OPSTATE_INODE32), "inode32" }, \
+ { (1UL << XFS_OPSTATE_READONLY), "read_only" }, \
+ { (1UL << XFS_OPSTATE_INODEGC_ENABLED), "inodegc" }, \
+ { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }
/*
* Max and min values for mount-option defined I/O
@@ -265,9 +424,7 @@ typedef struct xfs_mount {
#define XFS_MAX_IO_LOG 30 /* 1G */
#define XFS_MIN_IO_LOG PAGE_SHIFT
-#define XFS_LAST_UNMOUNT_WAS_CLEAN(mp) \
- ((mp)->m_flags & XFS_MOUNT_WAS_CLEAN)
-#define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN)
+#define xfs_is_shutdown(mp) xfs_is_shutdown(mp)
void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
int lnnum);
#define xfs_force_shutdown(m,f) \
@@ -278,6 +435,12 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */
#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */
+#define XFS_SHUTDOWN_STRINGS \
+ { SHUTDOWN_META_IO_ERROR, "metadata_io" }, \
+ { SHUTDOWN_LOG_IO_ERROR, "log_io" }, \
+ { SHUTDOWN_FORCE_UMOUNT, "force_umount" }, \
+ { SHUTDOWN_CORRUPT_INCORE, "corruption" }
+
/*
* Flags for xfs_mountfs
*/
@@ -306,6 +469,15 @@ extern uint64_t xfs_default_resblks(xfs_mount_t *mp);
extern int xfs_mountfs(xfs_mount_t *mp);
extern void xfs_unmountfs(xfs_mount_t *);
+/*
+ * Deltas for the block count can vary from 1 to very large, but lock contention
+ * only occurs on frequent small block count updates such as in the delayed
+ * allocation path for buffered writes (page a time updates). Hence we set
+ * a large batch count (1024) to minimise global counter updates except when
+ * we get near to ENOSPC and we have to be very accurate with our updates.
+ */
+#define XFS_FDBLOCKS_BATCH 1024
+
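/*
 * Minimal sketch of the kind of update this batch size is tuned for
 * (hypothetical helper, not part of the patch; assumes m_fdblocks is a
 * struct percpu_counter, which is not shown in this hunk): use the large
 * batch normally, and fall back to a batch of 1 near ENOSPC so the global
 * count stays accurate.
 */
static inline void xfs_mod_fdblocks_sketch(struct xfs_mount *mp, int64_t delta,
		bool near_enospc)
{
	s32	batch = near_enospc ? 1 : XFS_FDBLOCKS_BATCH;

	percpu_counter_add_batch(&mp->m_fdblocks, delta, batch);
}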
extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
bool reserved);
extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
@@ -325,6 +497,8 @@ int xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb,
struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp,
int error_class, int error);
void xfs_force_summary_recalc(struct xfs_mount *mp);
+int xfs_add_incompat_log_feature(struct xfs_mount *mp, uint32_t feature);
+bool xfs_clear_incompat_log_features(struct xfs_mount *mp);
void xfs_mod_delalloc(struct xfs_mount *mp, int64_t delta);
#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 956cca24e67f..5e1d29d8b2e7 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -92,7 +92,7 @@ xfs_fs_map_blocks(
uint lock_flags;
int error = 0;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
/*
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index fe341f3fd419..5608066d6e53 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -157,7 +157,7 @@ xfs_qm_dqpurge(
}
ASSERT(atomic_read(&dqp->q_pincount) == 0);
- ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
+ ASSERT(xfs_is_shutdown(mp) ||
!test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags));
xfs_dqfunlock(dqp);
@@ -185,17 +185,13 @@ out_unlock:
/*
* Purge the dquot cache.
*/
-void
+static void
xfs_qm_dqpurge_all(
- struct xfs_mount *mp,
- uint flags)
+ struct xfs_mount *mp)
{
- if (flags & XFS_QMOPT_UQUOTA)
- xfs_qm_dquot_walk(mp, XFS_DQTYPE_USER, xfs_qm_dqpurge, NULL);
- if (flags & XFS_QMOPT_GQUOTA)
- xfs_qm_dquot_walk(mp, XFS_DQTYPE_GROUP, xfs_qm_dqpurge, NULL);
- if (flags & XFS_QMOPT_PQUOTA)
- xfs_qm_dquot_walk(mp, XFS_DQTYPE_PROJ, xfs_qm_dqpurge, NULL);
+ xfs_qm_dquot_walk(mp, XFS_DQTYPE_USER, xfs_qm_dqpurge, NULL);
+ xfs_qm_dquot_walk(mp, XFS_DQTYPE_GROUP, xfs_qm_dqpurge, NULL);
+ xfs_qm_dquot_walk(mp, XFS_DQTYPE_PROJ, xfs_qm_dqpurge, NULL);
}
/*
@@ -206,7 +202,7 @@ xfs_qm_unmount(
struct xfs_mount *mp)
{
if (mp->m_quotainfo) {
- xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
+ xfs_qm_dqpurge_all(mp);
xfs_qm_destroy_quotainfo(mp);
}
}
@@ -299,8 +295,6 @@ xfs_qm_need_dqattach(
{
struct xfs_mount *mp = ip->i_mount;
- if (!XFS_IS_QUOTA_RUNNING(mp))
- return false;
if (!XFS_IS_QUOTA_ON(mp))
return false;
if (!XFS_NOT_DQATTACHED(mp, ip))
@@ -635,7 +629,7 @@ xfs_qm_init_quotainfo(
struct xfs_quotainfo *qinf;
int error;
- ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+ ASSERT(XFS_IS_QUOTA_ON(mp));
qinf = mp->m_quotainfo = kmem_zalloc(sizeof(struct xfs_quotainfo), 0);
@@ -662,7 +656,7 @@ xfs_qm_init_quotainfo(
/* Precalc some constants */
qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(qinf->qi_dqchunklen);
- if (xfs_sb_version_hasbigtime(&mp->m_sb)) {
+ if (xfs_has_bigtime(mp)) {
qinf->qi_expiry_min =
xfs_dq_bigtime_to_unix(XFS_DQ_BIGTIME_EXPIRY_MIN);
qinf->qi_expiry_max =
@@ -680,11 +674,11 @@ xfs_qm_init_quotainfo(
xfs_qm_init_timelimits(mp, XFS_DQTYPE_GROUP);
xfs_qm_init_timelimits(mp, XFS_DQTYPE_PROJ);
- if (XFS_IS_UQUOTA_RUNNING(mp))
+ if (XFS_IS_UQUOTA_ON(mp))
xfs_qm_set_defquota(mp, XFS_DQTYPE_USER, qinf);
- if (XFS_IS_GQUOTA_RUNNING(mp))
+ if (XFS_IS_GQUOTA_ON(mp))
xfs_qm_set_defquota(mp, XFS_DQTYPE_GROUP, qinf);
- if (XFS_IS_PQUOTA_RUNNING(mp))
+ if (XFS_IS_PQUOTA_ON(mp))
xfs_qm_set_defquota(mp, XFS_DQTYPE_PROJ, qinf);
qinf->qi_shrinker.count_objects = xfs_qm_shrink_count;
@@ -755,7 +749,7 @@ xfs_qm_qino_alloc(
* with PQUOTA, just use sb_gquotino for sb_pquotino and
* vice-versa.
*/
- if (!xfs_sb_version_has_pquotino(&mp->m_sb) &&
+ if (!xfs_has_pquotino(mp) &&
(flags & (XFS_QMOPT_PQUOTA|XFS_QMOPT_GQUOTA))) {
xfs_ino_t ino = NULLFSINO;
@@ -808,9 +802,9 @@ xfs_qm_qino_alloc(
*/
spin_lock(&mp->m_sb_lock);
if (flags & XFS_QMOPT_SBVERSION) {
- ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
+ ASSERT(!xfs_has_quota(mp));
- xfs_sb_version_addquota(&mp->m_sb);
+ xfs_add_quota(mp);
mp->m_sb.sb_uquotino = NULLFSINO;
mp->m_sb.sb_gquotino = NULLFSINO;
mp->m_sb.sb_pquotino = NULLFSINO;
@@ -829,7 +823,7 @@ xfs_qm_qino_alloc(
error = xfs_trans_commit(tp);
if (error) {
- ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ ASSERT(xfs_is_shutdown(mp));
xfs_alert(mp, "%s failed (error %d)!", __func__, error);
}
if (need_alloc)
@@ -896,11 +890,11 @@ xfs_qm_reset_dqcounts(
ddq->d_bwarns = 0;
ddq->d_iwarns = 0;
ddq->d_rtbwarns = 0;
- if (xfs_sb_version_hasbigtime(&mp->m_sb))
+ if (xfs_has_bigtime(mp))
ddq->d_type |= XFS_DQTYPE_BIGTIME;
}
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
xfs_update_cksum((char *)&dqb[j],
sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
@@ -1147,7 +1141,7 @@ xfs_qm_dqusage_adjust(
xfs_filblks_t rtblks = 0; /* total rt blks */
int error;
- ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+ ASSERT(XFS_IS_QUOTA_ON(mp));
/*
* rootino must have its resources accounted for, not so with the quota
@@ -1288,7 +1282,7 @@ xfs_qm_quotacheck(
flags = 0;
ASSERT(uip || gip || pip);
- ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+ ASSERT(XFS_IS_QUOTA_ON(mp));
xfs_notice(mp, "Quotacheck needed: Please wait.");
@@ -1359,7 +1353,7 @@ xfs_qm_quotacheck(
* at this point (because we intentionally didn't in dqget_noattach).
*/
if (error) {
- xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
+ xfs_qm_dqpurge_all(mp);
goto error_return;
}
@@ -1418,7 +1412,7 @@ xfs_qm_mount_quotas(
goto write_changes;
}
- ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+ ASSERT(XFS_IS_QUOTA_ON(mp));
/*
* Allocate the quotainfo structure inside the mount struct, and
@@ -1473,7 +1467,7 @@ xfs_qm_mount_quotas(
* the incore structures are convinced that quotas are
* off, but the on disk superblock doesn't know that !
*/
- ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
+ ASSERT(!(XFS_IS_QUOTA_ON(mp)));
xfs_alert(mp, "%s: Superblock update failed!",
__func__);
}
@@ -1504,7 +1498,7 @@ xfs_qm_init_quotainos(
/*
* Get the uquota and gquota inodes
*/
- if (xfs_sb_version_hasquota(&mp->m_sb)) {
+ if (xfs_has_quota(mp)) {
if (XFS_IS_UQUOTA_ON(mp) &&
mp->m_sb.sb_uquotino != NULLFSINO) {
ASSERT(mp->m_sb.sb_uquotino > 0);
@@ -1645,7 +1639,7 @@ xfs_qm_vop_dqalloc(
int error;
uint lockflags;
- if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+ if (!XFS_IS_QUOTA_ON(mp))
return 0;
lockflags = XFS_ILOCK_EXCL;
@@ -1776,7 +1770,7 @@ xfs_qm_vop_chown(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount));
+ ASSERT(XFS_IS_QUOTA_ON(ip->i_mount));
/* old dquot */
prevdq = *IO_olddq;
@@ -1829,7 +1823,7 @@ xfs_qm_vop_rename_dqattach(
struct xfs_mount *mp = i_tab[0]->i_mount;
int i;
- if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+ if (!XFS_IS_QUOTA_ON(mp))
return 0;
for (i = 0; (i < 4 && i_tab[i]); i++) {
@@ -1860,7 +1854,7 @@ xfs_qm_vop_create_dqattach(
{
struct xfs_mount *mp = tp->t_mountp;
- if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+ if (!XFS_IS_QUOTA_ON(mp))
return;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -1888,3 +1882,37 @@ xfs_qm_vop_create_dqattach(
}
}
+/* Decide if this inode's dquot is near an enforcement boundary. */
+bool
+xfs_inode_near_dquot_enforcement(
+ struct xfs_inode *ip,
+ xfs_dqtype_t type)
+{
+ struct xfs_dquot *dqp;
+ int64_t freesp;
+
+ /* We only care for quotas that are enabled and enforced. */
+ dqp = xfs_inode_dquot(ip, type);
+ if (!dqp || !xfs_dquot_is_enforced(dqp))
+ return false;
+
+ if (xfs_dquot_res_over_limits(&dqp->q_ino) ||
+ xfs_dquot_res_over_limits(&dqp->q_rtb))
+ return true;
+
+ /* For space on the data device, check the various thresholds. */
+ if (!dqp->q_prealloc_hi_wmark)
+ return false;
+
+ if (dqp->q_blk.reserved < dqp->q_prealloc_lo_wmark)
+ return false;
+
+ if (dqp->q_blk.reserved >= dqp->q_prealloc_hi_wmark)
+ return true;
+
+ freesp = dqp->q_prealloc_hi_wmark - dqp->q_blk.reserved;
+ if (freesp < dqp->q_low_space[XFS_QLOWSP_5_PCNT])
+ return true;
+
+ return false;
+}
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index ebbb484c49dc..442a0f97a9d4 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -140,9 +140,6 @@ struct xfs_dquot_acct {
extern void xfs_qm_destroy_quotainfo(struct xfs_mount *);
-/* dquot stuff */
-extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint);
-
/* quota ops */
extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
extern int xfs_qm_scall_getquota(struct xfs_mount *mp,
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index df00dfbf5c9d..b77673dd0558 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -75,7 +75,7 @@ xfs_qm_newmount(
uint quotaondisk;
uint uquotaondisk = 0, gquotaondisk = 0, pquotaondisk = 0;
- quotaondisk = xfs_sb_version_hasquota(&mp->m_sb) &&
+ quotaondisk = xfs_has_quota(mp) &&
(mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT);
if (quotaondisk) {
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 13a56e1ea15c..47fe60e1a887 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -19,91 +19,11 @@
#include "xfs_qm.h"
#include "xfs_icache.h"
-STATIC int
-xfs_qm_log_quotaoff(
- struct xfs_mount *mp,
- struct xfs_qoff_logitem **qoffstartp,
- uint flags)
-{
- struct xfs_trans *tp;
- int error;
- struct xfs_qoff_logitem *qoffi;
-
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp);
- if (error)
- goto out;
-
- qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
- xfs_trans_log_quotaoff_item(tp, qoffi);
-
- spin_lock(&mp->m_sb_lock);
- mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL;
- spin_unlock(&mp->m_sb_lock);
-
- xfs_log_sb(tp);
-
- /*
- * We have to make sure that the transaction is secure on disk before we
- * return and actually stop quota accounting. So, make it synchronous.
- * We don't care about quotoff's performance.
- */
- xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp);
- if (error)
- goto out;
-
- *qoffstartp = qoffi;
-out:
- return error;
-}
-
-STATIC int
-xfs_qm_log_quotaoff_end(
- struct xfs_mount *mp,
- struct xfs_qoff_logitem **startqoff,
- uint flags)
-{
- struct xfs_trans *tp;
- int error;
- struct xfs_qoff_logitem *qoffi;
-
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp);
- if (error)
- return error;
-
- qoffi = xfs_trans_get_qoff_item(tp, *startqoff,
- flags & XFS_ALL_QUOTA_ACCT);
- xfs_trans_log_quotaoff_item(tp, qoffi);
- *startqoff = NULL;
-
- /*
- * We have to make sure that the transaction is secure on disk before we
- * return and actually stop quota accounting. So, make it synchronous.
- * We don't care about quotoff's performance.
- */
- xfs_trans_set_sync(tp);
- return xfs_trans_commit(tp);
-}
-
-/*
- * Turn off quota accounting and/or enforcement for all udquots and/or
- * gdquots. Called only at unmount time.
- *
- * This assumes that there are no dquots of this file system cached
- * incore, and modifies the ondisk dquot directly. Therefore, for example,
- * it is an error to call this twice, without purging the cache.
- */
int
xfs_qm_scall_quotaoff(
xfs_mount_t *mp,
uint flags)
{
- struct xfs_quotainfo *q = mp->m_quotainfo;
- uint dqtype;
- int error;
- uint inactivate_flags;
- struct xfs_qoff_logitem *qoffstart = NULL;
-
/*
* No file system can have quotas enabled on disk but not in core.
* Note that quota utilities (like quotaoff) _expect_
@@ -111,160 +31,23 @@ xfs_qm_scall_quotaoff(
*/
if ((mp->m_qflags & flags) == 0)
return -EEXIST;
- error = 0;
-
- flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
-
- /*
- * We don't want to deal with two quotaoffs messing up each other,
- * so we're going to serialize it. quotaoff isn't exactly a performance
- * critical thing.
- * If quotaoff, then we must be dealing with the root filesystem.
- */
- ASSERT(q);
- mutex_lock(&q->qi_quotaofflock);
/*
- * If we're just turning off quota enforcement, change mp and go.
+ * We do not support actually turning off quota accounting any more.
+ * Just log a warning and ignore the accounting related flags.
*/
- if ((flags & XFS_ALL_QUOTA_ACCT) == 0) {
- mp->m_qflags &= ~(flags);
+ if (flags & XFS_ALL_QUOTA_ACCT)
+ xfs_info(mp, "disabling of quota accounting not supported.");
- spin_lock(&mp->m_sb_lock);
- mp->m_sb.sb_qflags = mp->m_qflags;
- spin_unlock(&mp->m_sb_lock);
- mutex_unlock(&q->qi_quotaofflock);
-
- /* XXX what to do if error ? Revert back to old vals incore ? */
- return xfs_sync_sb(mp, false);
- }
-
- dqtype = 0;
- inactivate_flags = 0;
- /*
- * If accounting is off, we must turn enforcement off, clear the
- * quota 'CHKD' certificate to make it known that we have to
- * do a quotacheck the next time this quota is turned on.
- */
- if (flags & XFS_UQUOTA_ACCT) {
- dqtype |= XFS_QMOPT_UQUOTA;
- flags |= (XFS_UQUOTA_CHKD | XFS_UQUOTA_ENFD);
- inactivate_flags |= XFS_UQUOTA_ACTIVE;
- }
- if (flags & XFS_GQUOTA_ACCT) {
- dqtype |= XFS_QMOPT_GQUOTA;
- flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD);
- inactivate_flags |= XFS_GQUOTA_ACTIVE;
- }
- if (flags & XFS_PQUOTA_ACCT) {
- dqtype |= XFS_QMOPT_PQUOTA;
- flags |= (XFS_PQUOTA_CHKD | XFS_PQUOTA_ENFD);
- inactivate_flags |= XFS_PQUOTA_ACTIVE;
- }
-
- /*
- * Nothing to do? Don't complain. This happens when we're just
- * turning off quota enforcement.
- */
- if ((mp->m_qflags & flags) == 0)
- goto out_unlock;
-
- /*
- * Write the LI_QUOTAOFF log record, and do SB changes atomically,
- * and synchronously. If we fail to write, we should abort the
- * operation as it cannot be recovered safely if we crash.
- */
- error = xfs_qm_log_quotaoff(mp, &qoffstart, flags);
- if (error)
- goto out_unlock;
-
- /*
- * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
- * to take care of the race between dqget and quotaoff. We don't take
- * any special locks to reset these bits. All processes need to check
- * these bits *after* taking inode lock(s) to see if the particular
- * quota type is in the process of being turned off. If *ACTIVE, it is
- * guaranteed that all dquot structures and all quotainode ptrs will all
- * stay valid as long as that inode is kept locked.
- *
- * There is no turning back after this.
- */
- mp->m_qflags &= ~inactivate_flags;
-
- /*
- * Give back all the dquot reference(s) held by inodes.
- * Here we go thru every single incore inode in this file system, and
- * do a dqrele on the i_udquot/i_gdquot that it may have.
- * Essentially, as long as somebody has an inode locked, this guarantees
- * that quotas will not be turned off. This is handy because in a
- * transaction once we lock the inode(s) and check for quotaon, we can
- * depend on the quota inodes (and other things) being valid as long as
- * we keep the lock(s).
- */
- error = xfs_dqrele_all_inodes(mp, flags);
- ASSERT(!error);
-
- /*
- * Next we make the changes in the quota flag in the mount struct.
- * This isn't protected by a particular lock directly, because we
- * don't want to take a mrlock every time we depend on quotas being on.
- */
- mp->m_qflags &= ~flags;
-
- /*
- * Go through all the dquots of this file system and purge them,
- * according to what was turned off.
- */
- xfs_qm_dqpurge_all(mp, dqtype);
-
- /*
- * Transactions that had started before ACTIVE state bit was cleared
- * could have logged many dquots, so they'd have higher LSNs than
- * the first QUOTAOFF log record does. If we happen to crash when
- * the tail of the log has gone past the QUOTAOFF record, but
- * before the last dquot modification, those dquots __will__
- * recover, and that's not good.
- *
- * So, we have QUOTAOFF start and end logitems; the start
- * logitem won't get overwritten until the end logitem appears...
- */
- error = xfs_qm_log_quotaoff_end(mp, &qoffstart, flags);
- if (error) {
- /* We're screwed now. Shutdown is the only option. */
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- goto out_unlock;
- }
-
- /*
- * If all quotas are completely turned off, close shop.
- */
- if (mp->m_qflags == 0) {
- mutex_unlock(&q->qi_quotaofflock);
- xfs_qm_destroy_quotainfo(mp);
- return 0;
- }
-
- /*
- * Release our quotainode references if we don't need them anymore.
- */
- if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) {
- xfs_irele(q->qi_uquotaip);
- q->qi_uquotaip = NULL;
- }
- if ((dqtype & XFS_QMOPT_GQUOTA) && q->qi_gquotaip) {
- xfs_irele(q->qi_gquotaip);
- q->qi_gquotaip = NULL;
- }
- if ((dqtype & XFS_QMOPT_PQUOTA) && q->qi_pquotaip) {
- xfs_irele(q->qi_pquotaip);
- q->qi_pquotaip = NULL;
- }
+ mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
+ mp->m_qflags &= ~(flags & XFS_ALL_QUOTA_ENFD);
+ spin_lock(&mp->m_sb_lock);
+ mp->m_sb.sb_qflags = mp->m_qflags;
+ spin_unlock(&mp->m_sb_lock);
+ mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
-out_unlock:
- if (error && qoffstart)
- xfs_qm_qoff_logitem_relse(qoffstart);
- mutex_unlock(&q->qi_quotaofflock);
- return error;
+ /* XXX what to do if error ? Revert back to old vals incore ? */
+ return xfs_sync_sb(mp, false);
}
STATIC int
@@ -322,7 +105,7 @@ xfs_qm_scall_trunc_qfiles(
{
int error = -EINVAL;
- if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 ||
+ if (!xfs_has_quota(mp) || flags == 0 ||
(flags & ~XFS_QMOPT_QUOTALL)) {
xfs_debug(mp, "%s: flags=%x m_qflags=%x",
__func__, flags, mp->m_qflags);
@@ -421,7 +204,7 @@ xfs_qm_scall_quotaon(
(mp->m_qflags & XFS_GQUOTA_ACCT)))
return 0;
- if (! XFS_IS_QUOTA_RUNNING(mp))
+ if (!XFS_IS_QUOTA_ON(mp))
return -ESRCH;
/*
@@ -698,6 +481,10 @@ xfs_qm_scall_getquota(
struct xfs_dquot *dqp;
int error;
+ /* Flush inodegc work at the start of a quota reporting scan. */
+ if (id == 0)
+ xfs_inodegc_flush(mp);
+
/*
* Try to get the dquot. We don't want it allocated on disk, so don't
* set doalloc. If it doesn't exist, we'll get ENOENT back.
@@ -736,6 +523,10 @@ xfs_qm_scall_getquota_next(
struct xfs_dquot *dqp;
int error;
+ /* Flush inodegc work at the start of a quota reporting scan. */
+ if (*id == 0)
+ xfs_inodegc_flush(mp);
+
error = xfs_qm_dqget_next(mp, *id, type, &dqp);
if (error)
return error;
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index d00d01302545..dcc785fdd345 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -113,6 +113,7 @@ xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks)
{
return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false);
}
+bool xfs_inode_near_dquot_enforcement(struct xfs_inode *ip, xfs_dqtype_t type);
#else
static inline int
xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid,
@@ -168,6 +169,7 @@ xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp,
#define xfs_qm_mount_quotas(mp)
#define xfs_qm_unmount(mp)
#define xfs_qm_unmount_quotas(mp)
+#define xfs_inode_near_dquot_enforcement(ip, type) (false)
#endif /* CONFIG_XFS_QUOTA */
static inline int
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 88d70c236a54..07989bd67728 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -60,18 +60,18 @@ xfs_fs_get_quota_state(
struct xfs_quotainfo *q = mp->m_quotainfo;
memset(state, 0, sizeof(*state));
- if (!XFS_IS_QUOTA_RUNNING(mp))
+ if (!XFS_IS_QUOTA_ON(mp))
return 0;
state->s_incoredqs = q->qi_dquots;
- if (XFS_IS_UQUOTA_RUNNING(mp))
+ if (XFS_IS_UQUOTA_ON(mp))
state->s_state[USRQUOTA].flags |= QCI_ACCT_ENABLED;
if (XFS_IS_UQUOTA_ENFORCED(mp))
state->s_state[USRQUOTA].flags |= QCI_LIMITS_ENFORCED;
- if (XFS_IS_GQUOTA_RUNNING(mp))
+ if (XFS_IS_GQUOTA_ON(mp))
state->s_state[GRPQUOTA].flags |= QCI_ACCT_ENABLED;
if (XFS_IS_GQUOTA_ENFORCED(mp))
state->s_state[GRPQUOTA].flags |= QCI_LIMITS_ENFORCED;
- if (XFS_IS_PQUOTA_RUNNING(mp))
+ if (XFS_IS_PQUOTA_ON(mp))
state->s_state[PRJQUOTA].flags |= QCI_ACCT_ENABLED;
if (XFS_IS_PQUOTA_ENFORCED(mp))
state->s_state[PRJQUOTA].flags |= QCI_LIMITS_ENFORCED;
@@ -114,10 +114,8 @@ xfs_fs_set_info(
if (sb_rdonly(sb))
return -EROFS;
- if (!XFS_IS_QUOTA_RUNNING(mp))
- return -ENOSYS;
if (!XFS_IS_QUOTA_ON(mp))
- return -ESRCH;
+ return -ENOSYS;
if (info->i_fieldmask & ~XFS_QC_SETINFO_MASK)
return -EINVAL;
if ((info->i_fieldmask & XFS_QC_SETINFO_MASK) == 0)
@@ -164,7 +162,7 @@ xfs_quota_enable(
if (sb_rdonly(sb))
return -EROFS;
- if (!XFS_IS_QUOTA_RUNNING(mp))
+ if (!XFS_IS_QUOTA_ON(mp))
return -ENOSYS;
return xfs_qm_scall_quotaon(mp, xfs_quota_flags(uflags));
@@ -179,10 +177,8 @@ xfs_quota_disable(
if (sb_rdonly(sb))
return -EROFS;
- if (!XFS_IS_QUOTA_RUNNING(mp))
- return -ENOSYS;
if (!XFS_IS_QUOTA_ON(mp))
- return -EINVAL;
+ return -ENOSYS;
return xfs_qm_scall_quotaoff(mp, xfs_quota_flags(uflags));
}
@@ -223,10 +219,8 @@ xfs_fs_get_dqblk(
struct xfs_mount *mp = XFS_M(sb);
xfs_dqid_t id;
- if (!XFS_IS_QUOTA_RUNNING(mp))
- return -ENOSYS;
if (!XFS_IS_QUOTA_ON(mp))
- return -ESRCH;
+ return -ENOSYS;
id = from_kqid(&init_user_ns, qid);
return xfs_qm_scall_getquota(mp, id, xfs_quota_type(qid.type), qdq);
@@ -243,10 +237,8 @@ xfs_fs_get_nextdqblk(
struct xfs_mount *mp = XFS_M(sb);
xfs_dqid_t id;
- if (!XFS_IS_QUOTA_RUNNING(mp))
- return -ENOSYS;
if (!XFS_IS_QUOTA_ON(mp))
- return -ESRCH;
+ return -ENOSYS;
id = from_kqid(&init_user_ns, *qid);
ret = xfs_qm_scall_getquota_next(mp, &id, xfs_quota_type(qid->type),
@@ -269,10 +261,8 @@ xfs_fs_set_dqblk(
if (sb_rdonly(sb))
return -EROFS;
- if (!XFS_IS_QUOTA_RUNNING(mp))
- return -ENOSYS;
if (!XFS_IS_QUOTA_ON(mp))
- return -ESRCH;
+ return -ENOSYS;
return xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid),
xfs_quota_type(qid.type), qdq);
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 746f4eda724c..46904b793bd4 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -423,7 +423,7 @@ xfs_cui_validate_phys(
struct xfs_mount *mp,
struct xfs_phys_extent *refc)
{
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ if (!xfs_has_reflink(mp))
return false;
if (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS)
@@ -522,6 +522,9 @@ xfs_cui_item_recover(
error = xfs_trans_log_finish_refcount_update(tp, cudp,
type, refc->pe_startblock, refc->pe_len,
&new_fsb, &new_len, &rcur);
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ refc, sizeof(*refc));
if (error)
goto abort_error;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index c256104772cb..76355f293488 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -759,7 +759,7 @@ xfs_reflink_recover_cow(
xfs_agnumber_t agno;
int error = 0;
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ if (!xfs_has_reflink(mp))
return 0;
for_each_perag(mp, agno, pag) {
@@ -967,7 +967,7 @@ xfs_reflink_ag_has_free_space(
struct xfs_perag *pag;
int error = 0;
- if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (!xfs_has_rmapbt(mp))
return 0;
pag = xfs_perag_get(mp, agno);
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 487b00434b96..bea65f2fe657 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -8,8 +8,7 @@
static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
{
- return ip->i_mount->m_always_cow &&
- xfs_sb_version_hasreflink(&ip->i_mount->m_sb);
+ return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount);
}
static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index dc4f0c9f0897..5f0695980467 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -466,7 +466,7 @@ xfs_rui_validate_map(
struct xfs_mount *mp,
struct xfs_map_extent *rmap)
{
- if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (!xfs_has_rmapbt(mp))
return false;
if (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS)
@@ -578,6 +578,9 @@ xfs_rui_item_recover(
rmap->me_owner, whichfork,
rmap->me_startoff, rmap->me_startblock,
rmap->me_len, state, &rcur);
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ rmap, sizeof(*rmap));
if (error)
goto abort_error;
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 699066fb9052..b8c79ee791af 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -951,8 +951,7 @@ xfs_growfs_rt(
return -EINVAL;
/* Unsupported realtime features. */
- if (xfs_sb_version_hasrmapbt(&mp->m_sb) ||
- xfs_sb_version_hasreflink(&mp->m_sb))
+ if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp))
return -EOPNOTSUPP;
nrblocks = in->newblocks;
@@ -1131,6 +1130,9 @@ error_cancel:
error = xfs_trans_commit(tp);
if (error)
break;
+
+ /* Ensure the mount RT feature flag is now set. */
+ mp->m_features |= XFS_FEAT_REALTIME;
}
if (error)
goto out_free;
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index ed885620589c..91b00289509b 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -22,9 +22,9 @@ struct xfs_rtalloc_rec {
};
typedef int (*xfs_rtalloc_query_range_fn)(
- struct xfs_trans *tp,
- struct xfs_rtalloc_rec *rec,
- void *priv);
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *rec,
+ void *priv);
#ifdef CONFIG_XFS_RT
/*
@@ -124,10 +124,9 @@ int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtblock_t start, xfs_extlen_t len,
struct xfs_buf **rbpp, xfs_fsblock_t *rsb);
int xfs_rtalloc_query_range(struct xfs_trans *tp,
- struct xfs_rtalloc_rec *low_rec,
- struct xfs_rtalloc_rec *high_rec,
- xfs_rtalloc_query_range_fn fn,
- void *priv);
+ const struct xfs_rtalloc_rec *low_rec,
+ const struct xfs_rtalloc_rec *high_rec,
+ xfs_rtalloc_query_range_fn fn, void *priv);
int xfs_rtalloc_query_all(struct xfs_trans *tp,
xfs_rtalloc_query_range_fn fn,
void *priv);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 2c9e26a44546..9a86d3ec2cb6 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -49,6 +49,28 @@ static struct kset *xfs_kset; /* top-level xfs sysfs dir */
static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */
#endif
+#ifdef CONFIG_HOTPLUG_CPU
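+/*
+ * Every mount is added to this list so that the CPU-dead handler further
+ * down can find all filesystems with per-cpu state that needs cleaning up
+ * when a CPU goes offline.
+ */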
+static LIST_HEAD(xfs_mount_list);
+static DEFINE_SPINLOCK(xfs_mount_list_lock);
+
+static inline void xfs_mount_list_add(struct xfs_mount *mp)
+{
+ spin_lock(&xfs_mount_list_lock);
+ list_add(&mp->m_mount_list, &xfs_mount_list);
+ spin_unlock(&xfs_mount_list_lock);
+}
+
+static inline void xfs_mount_list_del(struct xfs_mount *mp)
+{
+ spin_lock(&xfs_mount_list_lock);
+ list_del(&mp->m_mount_list);
+ spin_unlock(&xfs_mount_list_lock);
+}
+#else /* !CONFIG_HOTPLUG_CPU */
+static inline void xfs_mount_list_add(struct xfs_mount *mp) {}
+static inline void xfs_mount_list_del(struct xfs_mount *mp) {}
+#endif
+
enum xfs_dax_mode {
XFS_DAX_INODE = 0,
XFS_DAX_ALWAYS = 1,
@@ -62,15 +84,15 @@ xfs_mount_set_dax_mode(
{
switch (mode) {
case XFS_DAX_INODE:
- mp->m_flags &= ~(XFS_MOUNT_DAX_ALWAYS | XFS_MOUNT_DAX_NEVER);
+ mp->m_features &= ~(XFS_FEAT_DAX_ALWAYS | XFS_FEAT_DAX_NEVER);
break;
case XFS_DAX_ALWAYS:
- mp->m_flags |= XFS_MOUNT_DAX_ALWAYS;
- mp->m_flags &= ~XFS_MOUNT_DAX_NEVER;
+ mp->m_features |= XFS_FEAT_DAX_ALWAYS;
+ mp->m_features &= ~XFS_FEAT_DAX_NEVER;
break;
case XFS_DAX_NEVER:
- mp->m_flags |= XFS_MOUNT_DAX_NEVER;
- mp->m_flags &= ~XFS_MOUNT_DAX_ALWAYS;
+ mp->m_features |= XFS_FEAT_DAX_NEVER;
+ mp->m_features &= ~XFS_FEAT_DAX_ALWAYS;
break;
}
}
@@ -154,33 +176,32 @@ xfs_fs_show_options(
{
static struct proc_xfs_info xfs_info_set[] = {
/* the few simple ones we can get from the mount struct */
- { XFS_MOUNT_IKEEP, ",ikeep" },
- { XFS_MOUNT_WSYNC, ",wsync" },
- { XFS_MOUNT_NOALIGN, ",noalign" },
- { XFS_MOUNT_SWALLOC, ",swalloc" },
- { XFS_MOUNT_NOUUID, ",nouuid" },
- { XFS_MOUNT_NORECOVERY, ",norecovery" },
- { XFS_MOUNT_ATTR2, ",attr2" },
- { XFS_MOUNT_FILESTREAMS, ",filestreams" },
- { XFS_MOUNT_GRPID, ",grpid" },
- { XFS_MOUNT_DISCARD, ",discard" },
- { XFS_MOUNT_LARGEIO, ",largeio" },
- { XFS_MOUNT_DAX_ALWAYS, ",dax=always" },
- { XFS_MOUNT_DAX_NEVER, ",dax=never" },
+ { XFS_FEAT_IKEEP, ",ikeep" },
+ { XFS_FEAT_WSYNC, ",wsync" },
+ { XFS_FEAT_NOALIGN, ",noalign" },
+ { XFS_FEAT_SWALLOC, ",swalloc" },
+ { XFS_FEAT_NOUUID, ",nouuid" },
+ { XFS_FEAT_NORECOVERY, ",norecovery" },
+ { XFS_FEAT_ATTR2, ",attr2" },
+ { XFS_FEAT_FILESTREAMS, ",filestreams" },
+ { XFS_FEAT_GRPID, ",grpid" },
+ { XFS_FEAT_DISCARD, ",discard" },
+ { XFS_FEAT_LARGE_IOSIZE, ",largeio" },
+ { XFS_FEAT_DAX_ALWAYS, ",dax=always" },
+ { XFS_FEAT_DAX_NEVER, ",dax=never" },
{ 0, NULL }
};
struct xfs_mount *mp = XFS_M(root->d_sb);
struct proc_xfs_info *xfs_infop;
for (xfs_infop = xfs_info_set; xfs_infop->flag; xfs_infop++) {
- if (mp->m_flags & xfs_infop->flag)
+ if (mp->m_features & xfs_infop->flag)
seq_puts(m, xfs_infop->str);
}
- seq_printf(m, ",inode%d",
- (mp->m_flags & XFS_MOUNT_SMALL_INUMS) ? 32 : 64);
+ seq_printf(m, ",inode%d", xfs_has_small_inums(mp) ? 32 : 64);
- if (mp->m_flags & XFS_MOUNT_ALLOCSIZE)
+ if (xfs_has_allocsize(mp))
seq_printf(m, ",allocsize=%dk",
(1 << mp->m_allocsize_log) >> 10);
@@ -201,25 +222,20 @@ xfs_fs_show_options(
seq_printf(m, ",swidth=%d",
(int)XFS_FSB_TO_BB(mp, mp->m_swidth));
- if (mp->m_qflags & XFS_UQUOTA_ACCT) {
- if (mp->m_qflags & XFS_UQUOTA_ENFD)
- seq_puts(m, ",usrquota");
- else
- seq_puts(m, ",uqnoenforce");
- }
+ if (mp->m_qflags & XFS_UQUOTA_ENFD)
+ seq_puts(m, ",usrquota");
+ else if (mp->m_qflags & XFS_UQUOTA_ACCT)
+ seq_puts(m, ",uqnoenforce");
- if (mp->m_qflags & XFS_PQUOTA_ACCT) {
- if (mp->m_qflags & XFS_PQUOTA_ENFD)
- seq_puts(m, ",prjquota");
- else
- seq_puts(m, ",pqnoenforce");
- }
- if (mp->m_qflags & XFS_GQUOTA_ACCT) {
- if (mp->m_qflags & XFS_GQUOTA_ENFD)
- seq_puts(m, ",grpquota");
- else
- seq_puts(m, ",gqnoenforce");
- }
+ if (mp->m_qflags & XFS_PQUOTA_ENFD)
+ seq_puts(m, ",prjquota");
+ else if (mp->m_qflags & XFS_PQUOTA_ACCT)
+ seq_puts(m, ",pqnoenforce");
+
+ if (mp->m_qflags & XFS_GQUOTA_ENFD)
+ seq_puts(m, ",grpquota");
+ else if (mp->m_qflags & XFS_GQUOTA_ACCT)
+ seq_puts(m, ",gqnoenforce");
if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
seq_puts(m, ",noquota");
@@ -230,11 +246,11 @@ xfs_fs_show_options(
/*
* Set parameters for inode allocation heuristics, taking into account
* filesystem size and inode32/inode64 mount options; i.e. specifically
- * whether or not XFS_MOUNT_SMALL_INUMS is set.
+ * whether or not XFS_FEAT_SMALL_INUMS is set.
*
* Inode allocation patterns are altered only if inode32 is requested
- * (XFS_MOUNT_SMALL_INUMS), and the filesystem is sufficiently large.
- * If altered, XFS_MOUNT_32BITINODES is set as well.
+ * (XFS_FEAT_SMALL_INUMS), and the filesystem is sufficiently large.
+ * If altered, XFS_OPSTATE_INODE32 is set as well.
*
* An agcount independent of that in the mount structure is provided
* because in the growfs case, mp->m_sb.sb_agcount is not yet updated
@@ -276,13 +292,13 @@ xfs_set_inode_alloc(
/*
* If user asked for no more than 32-bit inodes, and the fs is
- * sufficiently large, set XFS_MOUNT_32BITINODES if we must alter
+ * sufficiently large, set XFS_OPSTATE_INODE32 if we must alter
* the allocator to accommodate the request.
*/
- if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
- mp->m_flags |= XFS_MOUNT_32BITINODES;
+ if (xfs_has_small_inums(mp) && ino > XFS_MAXINUMBER_32)
+ set_bit(XFS_OPSTATE_INODE32, &mp->m_opstate);
else
- mp->m_flags &= ~XFS_MOUNT_32BITINODES;
+ clear_bit(XFS_OPSTATE_INODE32, &mp->m_opstate);
for (index = 0; index < agcount; index++) {
struct xfs_perag *pag;
@@ -291,7 +307,7 @@ xfs_set_inode_alloc(
pag = xfs_perag_get(mp, index);
- if (mp->m_flags & XFS_MOUNT_32BITINODES) {
+ if (xfs_is_inode32(mp)) {
if (ino > XFS_MAXINUMBER_32) {
pag->pagi_inodeok = 0;
pag->pagf_metadata = 0;
@@ -311,7 +327,7 @@ xfs_set_inode_alloc(
xfs_perag_put(pag);
}
- return (mp->m_flags & XFS_MOUNT_32BITINODES) ? maxagi : agcount;
+ return xfs_is_inode32(mp) ? maxagi : agcount;
}
STATIC int
@@ -468,7 +484,7 @@ xfs_setup_devices(
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
unsigned int log_sector_size = BBSIZE;
- if (xfs_sb_version_hassector(&mp->m_sb))
+ if (xfs_has_sector(mp))
log_sector_size = mp->m_sb.sb_logsectsize;
error = xfs_setsize_buftarg(mp->m_logdev_targp,
log_sector_size);
@@ -501,37 +517,37 @@ xfs_init_mount_workqueues(
if (!mp->m_unwritten_workqueue)
goto out_destroy_buf;
- mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
- XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_UNBOUND),
- 0, mp->m_super->s_id);
- if (!mp->m_cil_workqueue)
- goto out_destroy_unwritten;
-
mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
0, mp->m_super->s_id);
if (!mp->m_reclaim_workqueue)
- goto out_destroy_cil;
+ goto out_destroy_unwritten;
- mp->m_gc_workqueue = alloc_workqueue("xfs-gc/%s",
- WQ_SYSFS | WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM,
+ mp->m_blockgc_wq = alloc_workqueue("xfs-blockgc/%s",
+ XFS_WQFLAGS(WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM),
0, mp->m_super->s_id);
- if (!mp->m_gc_workqueue)
+ if (!mp->m_blockgc_wq)
goto out_destroy_reclaim;
+ mp->m_inodegc_wq = alloc_workqueue("xfs-inodegc/%s",
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ 1, mp->m_super->s_id);
+ if (!mp->m_inodegc_wq)
+ goto out_destroy_blockgc;
+
mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s",
XFS_WQFLAGS(WQ_FREEZABLE), 0, mp->m_super->s_id);
if (!mp->m_sync_workqueue)
- goto out_destroy_eofb;
+ goto out_destroy_inodegc;
return 0;
-out_destroy_eofb:
- destroy_workqueue(mp->m_gc_workqueue);
+out_destroy_inodegc:
+ destroy_workqueue(mp->m_inodegc_wq);
+out_destroy_blockgc:
+ destroy_workqueue(mp->m_blockgc_wq);
out_destroy_reclaim:
destroy_workqueue(mp->m_reclaim_workqueue);
-out_destroy_cil:
- destroy_workqueue(mp->m_cil_workqueue);
out_destroy_unwritten:
destroy_workqueue(mp->m_unwritten_workqueue);
out_destroy_buf:
@@ -545,9 +561,9 @@ xfs_destroy_mount_workqueues(
struct xfs_mount *mp)
{
destroy_workqueue(mp->m_sync_workqueue);
- destroy_workqueue(mp->m_gc_workqueue);
+ destroy_workqueue(mp->m_blockgc_wq);
+ destroy_workqueue(mp->m_inodegc_wq);
destroy_workqueue(mp->m_reclaim_workqueue);
- destroy_workqueue(mp->m_cil_workqueue);
destroy_workqueue(mp->m_unwritten_workqueue);
destroy_workqueue(mp->m_buf_workqueue);
}
@@ -596,32 +612,6 @@ xfs_fs_alloc_inode(
return NULL;
}
-#ifdef DEBUG
-static void
-xfs_check_delalloc(
- struct xfs_inode *ip,
- int whichfork)
-{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
- struct xfs_bmbt_irec got;
- struct xfs_iext_cursor icur;
-
- if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got))
- return;
- do {
- if (isnullstartblock(got.br_startblock)) {
- xfs_warn(ip->i_mount,
- "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]",
- ip->i_ino,
- whichfork == XFS_DATA_FORK ? "data" : "cow",
- got.br_startoff, got.br_blockcount);
- }
- } while (xfs_iext_next_extent(ifp, &icur, &got));
-}
-#else
-#define xfs_check_delalloc(ip, whichfork) do { } while (0)
-#endif
-
/*
* Now that the generic code is guaranteed not to be accessing
* the linux inode, we can inactivate and reclaim the inode.
@@ -637,30 +627,6 @@ xfs_fs_destroy_inode(
ASSERT(!rwsem_is_locked(&inode->i_rwsem));
XFS_STATS_INC(ip->i_mount, vn_rele);
XFS_STATS_INC(ip->i_mount, vn_remove);
-
- xfs_inactive(ip);
-
- if (!XFS_FORCED_SHUTDOWN(ip->i_mount) && ip->i_delayed_blks) {
- xfs_check_delalloc(ip, XFS_DATA_FORK);
- xfs_check_delalloc(ip, XFS_COW_FORK);
- ASSERT(0);
- }
-
- XFS_STATS_INC(ip->i_mount, vn_reclaim);
-
- /*
- * We should never get here with one of the reclaim flags already set.
- */
- ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE));
- ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
-
- /*
- * We always use background reclaim here because even if the inode is
- * clean, it still may be under IO and hence we have wait for IO
- * completion to occur before we can reclaim the inode. The background
- * reclaim path handles this more efficiently than we can here, so
- * simply let background reclaim tear down all inodes.
- */
xfs_inode_mark_reclaimable(ip);
}
@@ -709,8 +675,6 @@ xfs_fs_inode_init_once(
atomic_set(&ip->i_pincount, 0);
spin_lock_init(&ip->i_flags_lock);
- mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
- "xfsino", ip->i_ino);
mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
"xfsino", ip->i_ino);
}
@@ -734,7 +698,7 @@ xfs_fs_drop_inode(
* that. See the comment for this inode flag.
*/
if (ip->i_flags & XFS_IRECOVERY) {
- ASSERT(ip->i_mount->m_log->l_flags & XLOG_RECOVERY_NEEDED);
+ ASSERT(xlog_recovery_needed(ip->i_mount->m_log));
return 0;
}
@@ -757,6 +721,8 @@ xfs_fs_sync_fs(
{
struct xfs_mount *mp = XFS_M(sb);
+ trace_xfs_fs_sync_fs(mp, __return_address);
+
/*
* Doing anything during the async pass would be counterproductive.
*/
@@ -773,6 +739,25 @@ xfs_fs_sync_fs(
flush_delayed_work(&mp->m_log->l_work);
}
+ /*
+ * If we are called with page faults frozen out, it means we are about
+ * to freeze the transaction subsystem. Take the opportunity to shut
+ * down inodegc because once SB_FREEZE_FS is set it's too late to
+ * prevent inactivation races with freeze. The fs doesn't get called
+ * again by the freezing process until after SB_FREEZE_FS has been set,
+ * so it's now or never. Same logic applies to speculative allocation
+ * garbage collection.
+ *
+ * We don't care if this is a normal syncfs call that does this or
+ * freeze that does this - we can run this multiple times without issue
+ * and we won't race with a restart because a restart can only occur
+ * when the state is either SB_FREEZE_FS or SB_FREEZE_COMPLETE.
+ */
+ if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT) {
+ xfs_inodegc_stop(mp);
+ xfs_blockgc_stop(mp);
+ }
+
return 0;
}
@@ -791,6 +776,9 @@ xfs_fs_statfs(
xfs_extlen_t lsize;
int64_t ffree;
+ /* Wait for whatever inactivations are in progress. */
+ xfs_inodegc_flush(mp);
+
statp->f_type = XFS_SUPER_MAGIC;
statp->f_namelen = MAXNAMELEN - 1;
@@ -886,10 +874,22 @@ xfs_fs_freeze(
* set a GFP_NOFS context here to avoid recursion deadlocks.
*/
flags = memalloc_nofs_save();
- xfs_blockgc_stop(mp);
xfs_save_resvblks(mp);
ret = xfs_log_quiesce(mp);
memalloc_nofs_restore(flags);
+
+ /*
+ * For read-write filesystems, we need to restart the inodegc on error
+ * because we stopped it at SB_FREEZE_PAGEFAULT level and a thaw is not
+ * going to be run to restart it now. We are at SB_FREEZE_FS level
+ * here, so we can restart safely without racing with a stop in
+ * xfs_fs_sync_fs().
+ */
+ if (ret && !xfs_is_readonly(mp)) {
+ xfs_blockgc_start(mp);
+ xfs_inodegc_start(mp);
+ }
+
return ret;
}
@@ -901,7 +901,18 @@ xfs_fs_unfreeze(
xfs_restore_resvblks(mp);
xfs_log_work_queue(mp);
- xfs_blockgc_start(mp);
+
+ /*
+ * Don't reactivate the inodegc worker on a readonly filesystem because
+ * inodes are sent directly to reclaim. Don't reactivate the blockgc
+ * worker because there are no speculative preallocations on a readonly
+ * filesystem.
+ */
+ if (!xfs_is_readonly(mp)) {
+ xfs_blockgc_start(mp);
+ xfs_inodegc_start(mp);
+ }
+
return 0;
}
@@ -913,10 +924,8 @@ STATIC int
xfs_finish_flags(
struct xfs_mount *mp)
{
- int ronly = (mp->m_flags & XFS_MOUNT_RDONLY);
-
/* Fail a mount where the logbuf is smaller than the log stripe */
- if (xfs_sb_version_haslogv2(&mp->m_sb)) {
+ if (xfs_has_logv2(mp)) {
if (mp->m_logbsize <= 0 &&
mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) {
mp->m_logbsize = mp->m_sb.sb_logsunit;
@@ -938,33 +947,24 @@ xfs_finish_flags(
/*
* V5 filesystems always use attr2 format for attributes.
*/
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
- (mp->m_flags & XFS_MOUNT_NOATTR2)) {
+ if (xfs_has_crc(mp) && xfs_has_noattr2(mp)) {
xfs_warn(mp, "Cannot mount a V5 filesystem as noattr2. "
"attr2 is always enabled for V5 filesystems.");
return -EINVAL;
}
/*
- * mkfs'ed attr2 will turn on attr2 mount unless explicitly
- * told by noattr2 to turn it off
- */
- if (xfs_sb_version_hasattr2(&mp->m_sb) &&
- !(mp->m_flags & XFS_MOUNT_NOATTR2))
- mp->m_flags |= XFS_MOUNT_ATTR2;
-
- /*
* prohibit r/w mounts of read-only filesystems
*/
- if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) {
+ if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !xfs_is_readonly(mp)) {
xfs_warn(mp,
"cannot mount a read-only filesystem as read-write");
return -EROFS;
}
- if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
- (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE)) &&
- !xfs_sb_version_has_pquotino(&mp->m_sb)) {
+ if ((mp->m_qflags & XFS_GQUOTA_ACCT) &&
+ (mp->m_qflags & XFS_PQUOTA_ACCT) &&
+ !xfs_has_pquotino(mp)) {
xfs_warn(mp,
"Super block does not support project and group quota together");
return -EINVAL;
@@ -1022,11 +1022,40 @@ xfs_destroy_percpu_counters(
percpu_counter_destroy(&mp->m_icount);
percpu_counter_destroy(&mp->m_ifree);
percpu_counter_destroy(&mp->m_fdblocks);
- ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
+ ASSERT(xfs_is_shutdown(mp) ||
percpu_counter_sum(&mp->m_delalloc_blks) == 0);
percpu_counter_destroy(&mp->m_delalloc_blks);
}
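+/*
+ * Allocate the per-cpu inodegc queues: each CPU gets a lockless list, an
+ * item counter, and a work item that runs xfs_inodegc_worker to process
+ * queued inode inactivation work.
+ */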
+static int
+xfs_inodegc_init_percpu(
+ struct xfs_mount *mp)
+{
+ struct xfs_inodegc *gc;
+ int cpu;
+
+ mp->m_inodegc = alloc_percpu(struct xfs_inodegc);
+ if (!mp->m_inodegc)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ gc = per_cpu_ptr(mp->m_inodegc, cpu);
+ init_llist_head(&gc->list);
+ gc->items = 0;
+ INIT_WORK(&gc->work, xfs_inodegc_worker);
+ }
+ return 0;
+}
+
+static void
+xfs_inodegc_free_percpu(
+ struct xfs_mount *mp)
+{
+ if (!mp->m_inodegc)
+ return;
+ free_percpu(mp->m_inodegc);
+}
+
static void
xfs_fs_put_super(
struct super_block *sb)
@@ -1043,6 +1072,8 @@ xfs_fs_put_super(
xfs_freesb(mp);
free_percpu(mp->m_stats.xs_stats);
+ xfs_mount_list_del(mp);
+ xfs_inodegc_free_percpu(mp);
xfs_destroy_percpu_counters(mp);
xfs_destroy_mount_workqueues(mp);
xfs_close_devices(mp);
@@ -1131,7 +1162,7 @@ xfs_fs_warn_deprecated(
* already had the flag set
*/
if ((fc->purpose & FS_CONTEXT_FOR_RECONFIGURE) &&
- !!(XFS_M(fc->root->d_sb)->m_flags & flag) == value)
+ !!(XFS_M(fc->root->d_sb)->m_features & flag) == value)
return;
xfs_warn(fc->s_fs_info, "%s mount option is deprecated.", param->key);
}
@@ -1179,27 +1210,27 @@ xfs_fs_parse_param(
if (suffix_kstrtoint(param->string, 10, &size))
return -EINVAL;
parsing_mp->m_allocsize_log = ffs(size) - 1;
- parsing_mp->m_flags |= XFS_MOUNT_ALLOCSIZE;
+ parsing_mp->m_features |= XFS_FEAT_ALLOCSIZE;
return 0;
case Opt_grpid:
case Opt_bsdgroups:
- parsing_mp->m_flags |= XFS_MOUNT_GRPID;
+ parsing_mp->m_features |= XFS_FEAT_GRPID;
return 0;
case Opt_nogrpid:
case Opt_sysvgroups:
- parsing_mp->m_flags &= ~XFS_MOUNT_GRPID;
+ parsing_mp->m_features &= ~XFS_FEAT_GRPID;
return 0;
case Opt_wsync:
- parsing_mp->m_flags |= XFS_MOUNT_WSYNC;
+ parsing_mp->m_features |= XFS_FEAT_WSYNC;
return 0;
case Opt_norecovery:
- parsing_mp->m_flags |= XFS_MOUNT_NORECOVERY;
+ parsing_mp->m_features |= XFS_FEAT_NORECOVERY;
return 0;
case Opt_noalign:
- parsing_mp->m_flags |= XFS_MOUNT_NOALIGN;
+ parsing_mp->m_features |= XFS_FEAT_NOALIGN;
return 0;
case Opt_swalloc:
- parsing_mp->m_flags |= XFS_MOUNT_SWALLOC;
+ parsing_mp->m_features |= XFS_FEAT_SWALLOC;
return 0;
case Opt_sunit:
parsing_mp->m_dalign = result.uint_32;
@@ -1208,62 +1239,58 @@ xfs_fs_parse_param(
parsing_mp->m_swidth = result.uint_32;
return 0;
case Opt_inode32:
- parsing_mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
+ parsing_mp->m_features |= XFS_FEAT_SMALL_INUMS;
return 0;
case Opt_inode64:
- parsing_mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
+ parsing_mp->m_features &= ~XFS_FEAT_SMALL_INUMS;
return 0;
case Opt_nouuid:
- parsing_mp->m_flags |= XFS_MOUNT_NOUUID;
+ parsing_mp->m_features |= XFS_FEAT_NOUUID;
return 0;
case Opt_largeio:
- parsing_mp->m_flags |= XFS_MOUNT_LARGEIO;
+ parsing_mp->m_features |= XFS_FEAT_LARGE_IOSIZE;
return 0;
case Opt_nolargeio:
- parsing_mp->m_flags &= ~XFS_MOUNT_LARGEIO;
+ parsing_mp->m_features &= ~XFS_FEAT_LARGE_IOSIZE;
return 0;
case Opt_filestreams:
- parsing_mp->m_flags |= XFS_MOUNT_FILESTREAMS;
+ parsing_mp->m_features |= XFS_FEAT_FILESTREAMS;
return 0;
case Opt_noquota:
parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT;
parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD;
- parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE;
return 0;
case Opt_quota:
case Opt_uquota:
case Opt_usrquota:
- parsing_mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
- XFS_UQUOTA_ENFD);
+ parsing_mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ENFD);
return 0;
case Opt_qnoenforce:
case Opt_uqnoenforce:
- parsing_mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
+ parsing_mp->m_qflags |= XFS_UQUOTA_ACCT;
parsing_mp->m_qflags &= ~XFS_UQUOTA_ENFD;
return 0;
case Opt_pquota:
case Opt_prjquota:
- parsing_mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
- XFS_PQUOTA_ENFD);
+ parsing_mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ENFD);
return 0;
case Opt_pqnoenforce:
- parsing_mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
+ parsing_mp->m_qflags |= XFS_PQUOTA_ACCT;
parsing_mp->m_qflags &= ~XFS_PQUOTA_ENFD;
return 0;
case Opt_gquota:
case Opt_grpquota:
- parsing_mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
- XFS_GQUOTA_ENFD);
+ parsing_mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ENFD);
return 0;
case Opt_gqnoenforce:
- parsing_mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
+ parsing_mp->m_qflags |= XFS_GQUOTA_ACCT;
parsing_mp->m_qflags &= ~XFS_GQUOTA_ENFD;
return 0;
case Opt_discard:
- parsing_mp->m_flags |= XFS_MOUNT_DISCARD;
+ parsing_mp->m_features |= XFS_FEAT_DISCARD;
return 0;
case Opt_nodiscard:
- parsing_mp->m_flags &= ~XFS_MOUNT_DISCARD;
+ parsing_mp->m_features &= ~XFS_FEAT_DISCARD;
return 0;
#ifdef CONFIG_FS_DAX
case Opt_dax:
@@ -1275,21 +1302,20 @@ xfs_fs_parse_param(
#endif
/* Following mount options will be removed in September 2025 */
case Opt_ikeep:
- xfs_fs_warn_deprecated(fc, param, XFS_MOUNT_IKEEP, true);
- parsing_mp->m_flags |= XFS_MOUNT_IKEEP;
+ xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, true);
+ parsing_mp->m_features |= XFS_FEAT_IKEEP;
return 0;
case Opt_noikeep:
- xfs_fs_warn_deprecated(fc, param, XFS_MOUNT_IKEEP, false);
- parsing_mp->m_flags &= ~XFS_MOUNT_IKEEP;
+ xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, false);
+ parsing_mp->m_features &= ~XFS_FEAT_IKEEP;
return 0;
case Opt_attr2:
- xfs_fs_warn_deprecated(fc, param, XFS_MOUNT_ATTR2, true);
- parsing_mp->m_flags |= XFS_MOUNT_ATTR2;
+ xfs_fs_warn_deprecated(fc, param, XFS_FEAT_ATTR2, true);
+ parsing_mp->m_features |= XFS_FEAT_ATTR2;
return 0;
case Opt_noattr2:
- xfs_fs_warn_deprecated(fc, param, XFS_MOUNT_NOATTR2, true);
- parsing_mp->m_flags &= ~XFS_MOUNT_ATTR2;
- parsing_mp->m_flags |= XFS_MOUNT_NOATTR2;
+ xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true);
+ parsing_mp->m_features |= XFS_FEAT_NOATTR2;
return 0;
default:
xfs_warn(parsing_mp, "unknown mount option [%s].", param->key);
@@ -1303,17 +1329,23 @@ static int
xfs_fs_validate_params(
struct xfs_mount *mp)
{
+ /* No recovery flag requires a read-only mount */
+ if (xfs_has_norecovery(mp) && !xfs_is_readonly(mp)) {
+ xfs_warn(mp, "no-recovery mounts must be read-only.");
+ return -EINVAL;
+ }
+
/*
- * no recovery flag requires a read-only mount
+ * We have not read the superblock at this point, so only the attr2
+ * mount option can set the attr2 feature by this stage.
*/
- if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
- !(mp->m_flags & XFS_MOUNT_RDONLY)) {
- xfs_warn(mp, "no-recovery mounts must be read-only.");
+ if (xfs_has_attr2(mp) && xfs_has_noattr2(mp)) {
+ xfs_warn(mp, "attr2 and noattr2 cannot both be specified.");
return -EINVAL;
}
- if ((mp->m_flags & XFS_MOUNT_NOALIGN) &&
- (mp->m_dalign || mp->m_swidth)) {
+
+ if (xfs_has_noalign(mp) && (mp->m_dalign || mp->m_swidth)) {
xfs_warn(mp,
"sunit and swidth options incompatible with the noalign option");
return -EINVAL;
@@ -1357,7 +1389,7 @@ xfs_fs_validate_params(
return -EINVAL;
}
- if ((mp->m_flags & XFS_MOUNT_ALLOCSIZE) &&
+ if (xfs_has_allocsize(mp) &&
(mp->m_allocsize_log > XFS_MAX_IO_LOG ||
mp->m_allocsize_log < XFS_MIN_IO_LOG)) {
xfs_warn(mp, "invalid log iosize: %d [not %d-%d]",
@@ -1418,11 +1450,22 @@ xfs_fs_fill_super(
if (error)
goto out_destroy_workqueues;
+ error = xfs_inodegc_init_percpu(mp);
+ if (error)
+ goto out_destroy_counters;
+
+ /*
+ * All percpu data structures requiring cleanup when a cpu goes offline
+ * must be allocated before adding this @mp to the cpu-dead handler's
+ * mount list.
+ */
+ xfs_mount_list_add(mp);
+
/* Allocate stats memory before we do operations that might use it */
mp->m_stats.xs_stats = alloc_percpu(struct xfsstats);
if (!mp->m_stats.xs_stats) {
error = -ENOMEM;
- goto out_destroy_counters;
+ goto out_destroy_inodegc;
}
error = xfs_readsb(mp, flags);
@@ -1438,7 +1481,7 @@ xfs_fs_fill_super(
goto out_free_sb;
/* V4 support is undergoing deprecation. */
- if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!xfs_has_crc(mp)) {
#ifdef CONFIG_XFS_SUPPORT_V4
xfs_warn_once(mp,
"Deprecated V4 format (crc=0) will not be supported after September 2030.");
@@ -1451,7 +1494,7 @@ xfs_fs_fill_super(
}
/* Filesystem claims it needs repair, so refuse the mount. */
- if (xfs_sb_version_needsrepair(&mp->m_sb)) {
+ if (xfs_has_needsrepair(mp)) {
xfs_warn(mp, "Filesystem needs repair. Please run xfs_repair.");
error = -EFSCORRUPTED;
goto out_free_sb;
@@ -1523,7 +1566,7 @@ xfs_fs_fill_super(
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_max_links = XFS_MAXLINK;
sb->s_time_gran = 1;
- if (xfs_sb_version_hasbigtime(&mp->m_sb)) {
+ if (xfs_has_bigtime(mp)) {
sb->s_time_min = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MIN);
sb->s_time_max = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MAX);
} else {
@@ -1536,14 +1579,10 @@ xfs_fs_fill_super(
set_posix_acl_flag(sb);
/* version 5 superblocks support inode version counters. */
- if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
+ if (xfs_has_crc(mp))
sb->s_flags |= SB_I_VERSION;
- if (xfs_sb_version_hasbigtime(&mp->m_sb))
- xfs_warn(mp,
- "EXPERIMENTAL big timestamp feature in use. Use at your own risk!");
-
- if (mp->m_flags & XFS_MOUNT_DAX_ALWAYS) {
+ if (xfs_has_dax_always(mp)) {
bool rtdev_is_dax = false, datadev_is_dax;
xfs_warn(mp,
@@ -1559,7 +1598,7 @@ xfs_fs_fill_super(
"DAX unsupported by block device. Turning off DAX.");
xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER);
}
- if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ if (xfs_has_reflink(mp)) {
xfs_alert(mp,
"DAX and reflink cannot be used together!");
error = -EINVAL;
@@ -1567,17 +1606,17 @@ xfs_fs_fill_super(
}
}
- if (mp->m_flags & XFS_MOUNT_DISCARD) {
+ if (xfs_has_discard(mp)) {
struct request_queue *q = bdev_get_queue(sb->s_bdev);
if (!blk_queue_discard(q)) {
xfs_warn(mp, "mounting with \"discard\" option, but "
"the device does not support discard");
- mp->m_flags &= ~XFS_MOUNT_DISCARD;
+ mp->m_features &= ~XFS_FEAT_DISCARD;
}
}
- if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ if (xfs_has_reflink(mp)) {
if (mp->m_sb.sb_rblocks) {
xfs_alert(mp,
"reflink not compatible with realtime device!");
@@ -1591,17 +1630,13 @@ xfs_fs_fill_super(
}
}
- if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) {
+ if (xfs_has_rmapbt(mp) && mp->m_sb.sb_rblocks) {
xfs_alert(mp,
"reverse mapping btree not compatible with realtime device!");
error = -EINVAL;
goto out_filestream_unmount;
}
- if (xfs_sb_version_hasinobtcounts(&mp->m_sb))
- xfs_warn(mp,
- "EXPERIMENTAL inode btree counters feature in use. Use at your own risk!");
-
error = xfs_mountfs(mp);
if (error)
goto out_filestream_unmount;
@@ -1625,6 +1660,9 @@ xfs_fs_fill_super(
xfs_freesb(mp);
out_free_stats:
free_percpu(mp->m_stats.xs_stats);
+ out_destroy_inodegc:
+ xfs_mount_list_del(mp);
+ xfs_inodegc_free_percpu(mp);
out_destroy_counters:
xfs_destroy_percpu_counters(mp);
out_destroy_workqueues:
@@ -1656,13 +1694,13 @@ xfs_remount_rw(
struct xfs_sb *sbp = &mp->m_sb;
int error;
- if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
+ if (xfs_has_norecovery(mp)) {
xfs_warn(mp,
"ro->rw transition prohibited on norecovery mount");
return -EINVAL;
}
- if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+ if (xfs_sb_is_v5(sbp) &&
xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
xfs_warn(mp,
"ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem",
@@ -1671,7 +1709,7 @@ xfs_remount_rw(
return -EINVAL;
}
- mp->m_flags &= ~XFS_MOUNT_RDONLY;
+ clear_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
/*
* If this is the first remount to writeable state we might have some
@@ -1708,6 +1746,9 @@ xfs_remount_rw(
if (error && error != -ENOSPC)
return error;
+ /* Re-enable the background inode inactivation worker. */
+ xfs_inodegc_start(mp);
+
return 0;
}
@@ -1730,6 +1771,15 @@ xfs_remount_ro(
return error;
}
+ /*
+ * Stop the inodegc background worker. xfs_fs_reconfigure already
+ * flushed all pending inodegc work when it sync'd the filesystem.
+ * The VFS holds s_umount, so we know that inodes cannot enter
+ * xfs_fs_destroy_inode during a remount operation. In readonly mode
+ * we send inodes straight to reclaim, so no inodes will be queued.
+ */
+ xfs_inodegc_stop(mp);
+
/* Free the per-AG metadata reservation pool. */
error = xfs_fs_unreserve_ag_blocks(mp);
if (error) {
@@ -1747,7 +1797,7 @@ xfs_remount_ro(
xfs_save_resvblks(mp);
xfs_log_clean(mp);
- mp->m_flags |= XFS_MOUNT_RDONLY;
+ set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
return 0;
}
@@ -1770,12 +1820,11 @@ xfs_fs_reconfigure(
{
struct xfs_mount *mp = XFS_M(fc->root->d_sb);
struct xfs_mount *new_mp = fc->s_fs_info;
- xfs_sb_t *sbp = &mp->m_sb;
int flags = fc->sb_flags;
int error;
/* version 5 superblocks always support version counters. */
- if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
+ if (xfs_has_crc(mp))
fc->sb_flags |= SB_I_VERSION;
error = xfs_fs_validate_params(new_mp);
@@ -1785,28 +1834,26 @@ xfs_fs_reconfigure(
sync_filesystem(mp->m_super);
/* inode32 -> inode64 */
- if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) &&
- !(new_mp->m_flags & XFS_MOUNT_SMALL_INUMS)) {
- mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
- mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount);
+ if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) {
+ mp->m_features &= ~XFS_FEAT_SMALL_INUMS;
+ mp->m_maxagi = xfs_set_inode_alloc(mp, mp->m_sb.sb_agcount);
}
/* inode64 -> inode32 */
- if (!(mp->m_flags & XFS_MOUNT_SMALL_INUMS) &&
- (new_mp->m_flags & XFS_MOUNT_SMALL_INUMS)) {
- mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
- mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount);
+ if (!xfs_has_small_inums(mp) && xfs_has_small_inums(new_mp)) {
+ mp->m_features |= XFS_FEAT_SMALL_INUMS;
+ mp->m_maxagi = xfs_set_inode_alloc(mp, mp->m_sb.sb_agcount);
}
/* ro -> rw */
- if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(flags & SB_RDONLY)) {
+ if (xfs_is_readonly(mp) && !(flags & SB_RDONLY)) {
error = xfs_remount_rw(mp);
if (error)
return error;
}
/* rw -> ro */
- if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (flags & SB_RDONLY)) {
+ if (!xfs_is_readonly(mp) && (flags & SB_RDONLY)) {
error = xfs_remount_ro(mp);
if (error)
return error;
@@ -1873,11 +1920,11 @@ static int xfs_init_fs_context(
* Copy binary VFS mount flags we are interested in.
*/
if (fc->sb_flags & SB_RDONLY)
- mp->m_flags |= XFS_MOUNT_RDONLY;
+ set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
if (fc->sb_flags & SB_DIRSYNC)
- mp->m_flags |= XFS_MOUNT_DIRSYNC;
+ mp->m_features |= XFS_FEAT_DIRSYNC;
if (fc->sb_flags & SB_SYNCHRONOUS)
- mp->m_flags |= XFS_MOUNT_WSYNC;
+ mp->m_features |= XFS_FEAT_WSYNC;
fc->s_fs_info = mp;
fc->ops = &xfs_context_ops;
@@ -2120,6 +2167,48 @@ xfs_destroy_workqueues(void)
destroy_workqueue(xfs_alloc_wq);
}
+#ifdef CONFIG_HOTPLUG_CPU
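+/*
+ * A CPU has gone offline; walk every mounted filesystem and let it clean up
+ * the per-cpu inodegc state that belonged to the departed CPU.
+ */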
+static int
+xfs_cpu_dead(
+ unsigned int cpu)
+{
+ struct xfs_mount *mp, *n;
+
+ spin_lock(&xfs_mount_list_lock);
+ list_for_each_entry_safe(mp, n, &xfs_mount_list, m_mount_list) {
+ spin_unlock(&xfs_mount_list_lock);
+ xfs_inodegc_cpu_dead(mp, cpu);
+ spin_lock(&xfs_mount_list_lock);
+ }
+ spin_unlock(&xfs_mount_list_lock);
+ return 0;
+}
+
+static int __init
+xfs_cpu_hotplug_init(void)
+{
+ int error;
+
+ error = cpuhp_setup_state_nocalls(CPUHP_XFS_DEAD, "xfs:dead", NULL,
+ xfs_cpu_dead);
+ if (error < 0)
+ xfs_alert(NULL,
+"Failed to initialise CPU hotplug, error %d. XFS is non-functional.",
+ error);
+ return error;
+}
+
+static void
+xfs_cpu_hotplug_destroy(void)
+{
+ cpuhp_remove_state_nocalls(CPUHP_XFS_DEAD);
+}
+
+#else /* !CONFIG_HOTPLUG_CPU */
+static inline int xfs_cpu_hotplug_init(void) { return 0; }
+static inline void xfs_cpu_hotplug_destroy(void) {}
+#endif
+
STATIC int __init
init_xfs_fs(void)
{
@@ -2132,10 +2221,14 @@ init_xfs_fs(void)
xfs_dir_startup();
- error = xfs_init_zones();
+ error = xfs_cpu_hotplug_init();
if (error)
goto out;
+ error = xfs_init_zones();
+ if (error)
+ goto out_destroy_hp;
+
error = xfs_init_workqueues();
if (error)
goto out_destroy_zones;
@@ -2215,6 +2308,8 @@ init_xfs_fs(void)
xfs_destroy_workqueues();
out_destroy_zones:
xfs_destroy_zones();
+ out_destroy_hp:
+ xfs_cpu_hotplug_destroy();
out:
return error;
}
@@ -2237,6 +2332,7 @@ exit_xfs_fs(void)
xfs_destroy_workqueues();
xfs_destroy_zones();
xfs_uuid_table_free();
+ xfs_cpu_hotplug_destroy();
}
module_init(init_xfs_fs);
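Note on the conversions above: the xfs_has_*() feature and xfs_is_*() state predicates that replace the old mp->m_flags tests are defined outside the hunks shown here (in xfs_mount.h elsewhere in this series). Judging from the call sites, they presumably reduce to simple bit tests along the lines of the sketch below; the helper bodies are an assumption for illustration, not part of this diff.

	/* Feature flags: tested with a plain bitwise AND of mp->m_features. */
	static inline bool xfs_has_discard(struct xfs_mount *mp)
	{
		return mp->m_features & XFS_FEAT_DISCARD;
	}

	/* Operational state: toggled at runtime, so it uses atomic bitops. */
	static inline bool xfs_is_readonly(struct xfs_mount *mp)
	{
		return test_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
	}

This split is why the remount paths above flip XFS_OPSTATE_READONLY with set_bit()/clear_bit() on mp->m_opstate, while the mount option parser keeps using plain |= and &= on mp->m_features.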
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 1525636f4065..fc2c6a404647 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -63,7 +63,7 @@ xfs_readlink_bmap_ilocked(
byte_cnt = pathlen;
cur_chunk = bp->b_addr;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
if (!xfs_symlink_hdr_ok(ip->i_ino, offset,
byte_cnt, bp)) {
error = -EFSCORRUPTED;
@@ -107,7 +107,7 @@ xfs_readlink(
ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_LOCAL);
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
xfs_ilock(ip, XFS_ILOCK_SHARED);
@@ -168,7 +168,7 @@ xfs_symlink(
trace_xfs_symlink(dp, link_name);
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
/*
@@ -321,9 +321,8 @@ xfs_symlink(
* symlink transaction goes to disk before returning to
* the user.
*/
- if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+ if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
xfs_trans_set_sync(tp);
- }
error = xfs_trans_commit(tp);
if (error)
@@ -445,7 +444,7 @@ xfs_inactive_symlink_rmt(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
error = xfs_trans_commit(tp);
if (error) {
- ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ ASSERT(xfs_is_shutdown(mp));
goto error_unlock;
}
@@ -478,7 +477,7 @@ xfs_inactive_symlink(
trace_xfs_inactive_symlink(ip);
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
xfs_ilock(ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index f1bc88f4367c..18dc5eca6c04 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -10,6 +10,7 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sysfs.h"
+#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_mount.h"
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 7e01e00550ac..d269ef57ff01 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -20,6 +20,7 @@
#include "xfs_bmap.h"
#include "xfs_attr.h"
#include "xfs_trans.h"
+#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_buf_item.h"
#include "xfs_quota.h"
@@ -32,6 +33,7 @@
#include "xfs_icache.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
+#include "xfs_error.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 19260291ff8b..1033a95fbf8e 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -2,6 +2,41 @@
/*
* Copyright (c) 2009, Christoph Hellwig
* All Rights Reserved.
+ *
+ * NOTE: none of these tracepoints shall be considered a stable kernel ABI
+ * as they can change at any time.
+ *
+ * Current conventions for printing numbers measuring specific units:
+ *
+ * agno: allocation group number
+ *
+ * agino: per-AG inode number
+ * ino: filesystem inode number
+ *
+ * agbno: per-AG block number in fs blocks
+ * startblock: physical block number for file mappings. This is either a
+ * segmented fsblock for data device mappings, or a rfsblock
+ * for realtime device mappings
+ * fsbcount: number of blocks in an extent, in fs blocks
+ *
+ * daddr: physical block number in 512b blocks
+ * bbcount: number of blocks in a physical extent, in 512b blocks
+ *
+ * owner: reverse-mapping owner, usually inodes
+ *
+ * fileoff: file offset, in fs blocks
+ * pos: file offset, in bytes
+ * bytecount: number of bytes
+ *
+ * disize: ondisk file size, in bytes
+ * isize: incore file size, in bytes
+ *
+ * forkoff: inode fork offset, in bytes
+ *
+ * ireccount: number of inode records
+ *
+ * Numbers describing space allocations (blocks, extents, inodes) should be
+ * formatted in hexadecimal.
*/
#undef TRACE_SYSTEM
#define TRACE_SYSTEM xfs
@@ -139,7 +174,7 @@ DECLARE_EVENT_CLASS(xfs_perag_class,
__entry->refcount = refcount;
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d agno %u refcount %d caller %pS",
+ TP_printk("dev %d:%d agno 0x%x refcount %d caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->refcount,
@@ -157,6 +192,84 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_put);
DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag);
DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag);
+TRACE_EVENT(xfs_inodegc_worker,
+ TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits),
+ TP_ARGS(mp, shrinker_hits),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, shrinker_hits)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->shrinker_hits = shrinker_hits;
+ ),
+ TP_printk("dev %d:%d shrinker_hits %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->shrinker_hits)
+);
+
+DECLARE_EVENT_CLASS(xfs_fs_class,
+ TP_PROTO(struct xfs_mount *mp, void *caller_ip),
+ TP_ARGS(mp, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long long, mflags)
+ __field(unsigned long, opstate)
+ __field(unsigned long, sbflags)
+ __field(void *, caller_ip)
+ ),
+ TP_fast_assign(
+ if (mp) {
+ __entry->dev = mp->m_super->s_dev;
+ __entry->mflags = mp->m_features;
+ __entry->opstate = mp->m_opstate;
+ __entry->sbflags = mp->m_super->s_flags;
+ }
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d m_features 0x%llx opstate (%s) s_flags 0x%lx caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->mflags,
+ __print_flags(__entry->opstate, "|", XFS_OPSTATE_STRINGS),
+ __entry->sbflags,
+ __entry->caller_ip)
+);
+
+#define DEFINE_FS_EVENT(name) \
+DEFINE_EVENT(xfs_fs_class, name, \
+ TP_PROTO(struct xfs_mount *mp, void *caller_ip), \
+ TP_ARGS(mp, caller_ip))
+DEFINE_FS_EVENT(xfs_inodegc_flush);
+DEFINE_FS_EVENT(xfs_inodegc_start);
+DEFINE_FS_EVENT(xfs_inodegc_stop);
+DEFINE_FS_EVENT(xfs_inodegc_queue);
+DEFINE_FS_EVENT(xfs_inodegc_throttle);
+DEFINE_FS_EVENT(xfs_fs_sync_fs);
+DEFINE_FS_EVENT(xfs_blockgc_start);
+DEFINE_FS_EVENT(xfs_blockgc_stop);
+DEFINE_FS_EVENT(xfs_blockgc_worker);
+DEFINE_FS_EVENT(xfs_blockgc_flush_all);
+
+TRACE_EVENT(xfs_inodegc_shrinker_scan,
+ TP_PROTO(struct xfs_mount *mp, struct shrink_control *sc,
+ void *caller_ip),
+ TP_ARGS(mp, sc, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long, nr_to_scan)
+ __field(void *, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->nr_to_scan = sc->nr_to_scan;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d nr_to_scan %lu caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->nr_to_scan,
+ __entry->caller_ip)
+);
+
DECLARE_EVENT_CLASS(xfs_ag_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno),
TP_ARGS(mp, agno),
@@ -168,7 +281,7 @@ DECLARE_EVENT_CLASS(xfs_ag_class,
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
),
- TP_printk("dev %d:%d agno %u",
+ TP_printk("dev %d:%d agno 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno)
);
@@ -268,7 +381,7 @@ DECLARE_EVENT_CLASS(xfs_bmap_class,
__entry->caller_ip = caller_ip;
),
TP_printk("dev %d:%d ino 0x%llx state %s cur %p/%d "
- "offset %lld block %lld count %lld flag %d caller %pS",
+ "fileoff 0x%llx startblock 0x%llx fsbcount 0x%llx flag %d caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
@@ -308,10 +421,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
),
TP_fast_assign(
__entry->dev = bp->b_target->bt_dev;
- if (bp->b_bn == XFS_BUF_DADDR_NULL)
- __entry->bno = bp->b_maps[0].bm_bn;
- else
- __entry->bno = bp->b_bn;
+ __entry->bno = xfs_buf_daddr(bp);
__entry->nblks = bp->b_length;
__entry->hold = atomic_read(&bp->b_hold);
__entry->pincount = atomic_read(&bp->b_pin_count);
@@ -319,7 +429,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
__entry->flags = bp->b_flags;
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d "
+ TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d "
"lock %d flags %s caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->bno,
@@ -370,7 +480,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_daddr_t, bno)
- __field(size_t, buffer_length)
+ __field(unsigned int, length)
__field(int, hold)
__field(int, pincount)
__field(unsigned, lockval)
@@ -379,19 +489,19 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class,
),
TP_fast_assign(
__entry->dev = bp->b_target->bt_dev;
- __entry->bno = bp->b_bn;
- __entry->buffer_length = BBTOB(bp->b_length);
+ __entry->bno = xfs_buf_daddr(bp);
+ __entry->length = bp->b_length;
__entry->flags = flags;
__entry->hold = atomic_read(&bp->b_hold);
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
+ TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d "
"lock %d flags %s caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->bno,
- __entry->buffer_length,
+ __entry->length,
__entry->hold,
__entry->pincount,
__entry->lockval,
@@ -413,7 +523,7 @@ TRACE_EVENT(xfs_buf_ioerror,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_daddr_t, bno)
- __field(size_t, buffer_length)
+ __field(unsigned int, length)
__field(unsigned, flags)
__field(int, hold)
__field(int, pincount)
@@ -423,8 +533,8 @@ TRACE_EVENT(xfs_buf_ioerror,
),
TP_fast_assign(
__entry->dev = bp->b_target->bt_dev;
- __entry->bno = bp->b_bn;
- __entry->buffer_length = BBTOB(bp->b_length);
+ __entry->bno = xfs_buf_daddr(bp);
+ __entry->length = bp->b_length;
__entry->hold = atomic_read(&bp->b_hold);
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
@@ -432,11 +542,11 @@ TRACE_EVENT(xfs_buf_ioerror,
__entry->flags = bp->b_flags;
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
+ TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d "
"lock %d error %d flags %s caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->bno,
- __entry->buffer_length,
+ __entry->length,
__entry->hold,
__entry->pincount,
__entry->lockval,
@@ -451,7 +561,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_daddr_t, buf_bno)
- __field(size_t, buf_len)
+ __field(unsigned int, buf_len)
__field(int, buf_hold)
__field(int, buf_pincount)
__field(int, buf_lockval)
@@ -466,15 +576,15 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
__entry->bli_flags = bip->bli_flags;
__entry->bli_recur = bip->bli_recur;
__entry->bli_refcount = atomic_read(&bip->bli_refcount);
- __entry->buf_bno = bip->bli_buf->b_bn;
- __entry->buf_len = BBTOB(bip->bli_buf->b_length);
+ __entry->buf_bno = xfs_buf_daddr(bip->bli_buf);
+ __entry->buf_len = bip->bli_buf->b_length;
__entry->buf_flags = bip->bli_buf->b_flags;
__entry->buf_hold = atomic_read(&bip->bli_buf->b_hold);
__entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
__entry->buf_lockval = bip->bli_buf->b_sema.count;
__entry->li_flags = bip->bli_item.li_flags;
),
- TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
+ TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d "
"lock %d flags %s recur %d refcount %d bliflags %s "
"liflags %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -534,7 +644,7 @@ DECLARE_EVENT_CLASS(xfs_filestream_class,
__entry->agno = agno;
__entry->streams = xfs_filestream_peek_ag(mp, agno);
),
- TP_printk("dev %d:%d ino 0x%llx agno %u streams %d",
+ TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->agno,
@@ -568,7 +678,7 @@ TRACE_EVENT(xfs_filestream_pick,
__entry->free = free;
__entry->nscan = nscan;
),
- TP_printk("dev %d:%d ino 0x%llx agno %u streams %d free %d nscan %d",
+ TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d free %d nscan %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->agno,
@@ -616,14 +726,17 @@ DECLARE_EVENT_CLASS(xfs_inode_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
+ __field(unsigned long, iflags)
),
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
+ __entry->iflags = ip->i_flags;
),
- TP_printk("dev %d:%d ino 0x%llx",
+ TP_printk("dev %d:%d ino 0x%llx iflags 0x%lx",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ino)
+ __entry->ino,
+ __entry->iflags)
)
#define DEFINE_INODE_EVENT(name) \
@@ -667,6 +780,10 @@ DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid);
+DEFINE_INODE_EVENT(xfs_inode_set_reclaimable);
+DEFINE_INODE_EVENT(xfs_inode_reclaiming);
+DEFINE_INODE_EVENT(xfs_inode_set_need_inactive);
+DEFINE_INODE_EVENT(xfs_inode_inactivating);
/*
* ftrace's __print_symbolic requires that all enum values be wrapped in the
@@ -773,9 +890,12 @@ TRACE_EVENT(xfs_irec_merge_pre,
__entry->nagino = nagino;
__entry->nholemask = holemask;
),
- TP_printk("dev %d:%d agno %d inobt (%u:0x%x) new (%u:0x%x)",
- MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
- __entry->agino, __entry->holemask, __entry->nagino,
+ TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x new_agino 0x%x new_holemask 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino,
+ __entry->holemask,
+ __entry->nagino,
__entry->nholemask)
)
@@ -795,8 +915,11 @@ TRACE_EVENT(xfs_irec_merge_post,
__entry->agino = agino;
__entry->holemask = holemask;
),
- TP_printk("dev %d:%d agno %d inobt (%u:0x%x)", MAJOR(__entry->dev),
- MINOR(__entry->dev), __entry->agno, __entry->agino,
+ TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x",
+ MAJOR(__entry->dev),
+ MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino,
__entry->holemask)
)
@@ -1301,7 +1424,7 @@ DECLARE_EVENT_CLASS(xfs_file_class,
__entry->offset = iocb->ki_pos;
__entry->count = iov_iter_count(iter);
),
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx",
+ TP_printk("dev %d:%d ino 0x%llx disize 0x%llx pos 0x%llx bytecount 0x%zx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
@@ -1348,14 +1471,14 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
__entry->startblock = irec ? irec->br_startblock : 0;
__entry->blockcount = irec ? irec->br_blockcount : 0;
),
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd "
- "fork %s startoff 0x%llx startblock %lld blockcount 0x%llx",
+ TP_printk("dev %d:%d ino 0x%llx disize 0x%llx pos 0x%llx bytecount 0x%zx "
+ "fork %s startoff 0x%llx startblock 0x%llx fsbcount 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
__entry->offset,
__entry->count,
- __entry->whichfork == XFS_COW_FORK ? "cow" : "data",
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
__entry->startoff,
(int64_t)__entry->startblock,
__entry->blockcount)
@@ -1391,7 +1514,7 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class,
__entry->count = count;
),
TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx "
- "offset 0x%llx count %zd",
+ "pos 0x%llx bytecount 0x%zx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->isize,
@@ -1427,7 +1550,7 @@ DECLARE_EVENT_CLASS(xfs_itrunc_class,
__entry->size = ip->i_disk_size;
__entry->new_size = new_size;
),
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx",
+ TP_printk("dev %d:%d ino 0x%llx disize 0x%llx new_size 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
@@ -1458,7 +1581,7 @@ TRACE_EVENT(xfs_pagecache_inval,
__entry->start = start;
__entry->finish = finish;
),
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx start 0x%llx finish 0x%llx",
+ TP_printk("dev %d:%d ino 0x%llx disize 0x%llx start 0x%llx finish 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
@@ -1467,14 +1590,14 @@ TRACE_EVENT(xfs_pagecache_inval,
);
TRACE_EVENT(xfs_bunmap,
- TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len,
+ TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t fileoff, xfs_filblks_t len,
int flags, unsigned long caller_ip),
- TP_ARGS(ip, bno, len, flags, caller_ip),
+ TP_ARGS(ip, fileoff, len, flags, caller_ip),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(xfs_fsize_t, size)
- __field(xfs_fileoff_t, bno)
+ __field(xfs_fileoff_t, fileoff)
__field(xfs_filblks_t, len)
__field(unsigned long, caller_ip)
__field(int, flags)
@@ -1483,17 +1606,17 @@ TRACE_EVENT(xfs_bunmap,
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
__entry->size = ip->i_disk_size;
- __entry->bno = bno;
+ __entry->fileoff = fileoff;
__entry->len = len;
__entry->caller_ip = caller_ip;
__entry->flags = flags;
),
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx"
+ TP_printk("dev %d:%d ino 0x%llx disize 0x%llx fileoff 0x%llx fsbcount 0x%llx"
"flags %s caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
- __entry->bno,
+ __entry->fileoff,
__entry->len,
__print_flags(__entry->flags, "|", XFS_BMAPI_FLAGS),
(void *)__entry->caller_ip)
@@ -1516,7 +1639,7 @@ DECLARE_EVENT_CLASS(xfs_extent_busy_class,
__entry->agbno = agbno;
__entry->len = len;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -1554,7 +1677,7 @@ TRACE_EVENT(xfs_extent_busy_trim,
__entry->tbno = tbno;
__entry->tlen = tlen;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u tbno %u tlen %u",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x found_agbno 0x%x found_fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -1601,7 +1724,7 @@ DECLARE_EVENT_CLASS(xfs_agf_class,
__entry->longest = be32_to_cpu(agf->agf_longest);
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u "
+ TP_printk("dev %d:%d agno 0x%x flags %s length %u roots b %u c %u "
"levels b %u c %u flfirst %u fllast %u flcount %u "
"freeblks %u longest %u caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -1650,7 +1773,7 @@ TRACE_EVENT(xfs_free_extent,
__entry->haveleft = haveleft;
__entry->haveright = haveright;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u resv %d %s",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x resv %d %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -1707,7 +1830,7 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
__entry->datatype = args->datatype;
__entry->firstblock = args->tp->t_firstblock;
),
- TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u "
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x minlen %u maxlen %u mod %u "
"prod %u minleft %u total %u alignment %u minalignslop %u "
"len %u type %s otype %s wasdel %d wasfromfl %d resv %d "
"datatype 0x%x firstblock 0x%llx",
@@ -1785,7 +1908,7 @@ TRACE_EVENT(xfs_alloc_cur_check,
__entry->diff = diff;
__entry->new = new;
),
- TP_printk("dev %d:%d btree %s bno 0x%x len 0x%x diff 0x%x new %d",
+ TP_printk("dev %d:%d btree %s agbno 0x%x fsbcount 0x%x diff 0x%x new %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
__entry->bno, __entry->len, __entry->diff, __entry->new)
@@ -2060,7 +2183,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
__entry->fork_off = XFS_IFORK_BOFF(ip);
),
TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
- "broot size %d, fork offset %d",
+ "broot size %d, forkoff 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
@@ -2186,7 +2309,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class,
__entry->size = buf_f->blf_size;
__entry->map_size = buf_f->blf_map_size;
),
- TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, "
+ TP_printk("dev %d:%d daddr 0x%llx, bbcount 0x%x, flags 0x%x, size %d, "
"map_size %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->blkno,
@@ -2237,7 +2360,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class,
__entry->boffset = in_f->ilf_boffset;
),
TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, "
- "dsize %d, blkno 0x%llx, len %d, boffset %d",
+ "dsize %d, daddr 0x%llx, bbcount 0x%x, boffset %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
@@ -2278,10 +2401,14 @@ DECLARE_EVENT_CLASS(xfs_log_recover_icreate_item_class,
__entry->length = be32_to_cpu(in_f->icl_length);
__entry->gen = be32_to_cpu(in_f->icl_gen);
),
- TP_printk("dev %d:%d agno %u agbno %u count %u isize %u length %u "
- "gen %u", MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->agno, __entry->agbno, __entry->count, __entry->isize,
- __entry->length, __entry->gen)
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x ireccount %u isize %u gen 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->length,
+ __entry->count,
+ __entry->isize,
+ __entry->gen)
)
#define DEFINE_LOG_RECOVER_ICREATE_ITEM(name) \
DEFINE_EVENT(xfs_log_recover_icreate_item_class, name, \
@@ -2307,7 +2434,7 @@ DECLARE_EVENT_CLASS(xfs_discard_class,
__entry->agbno = agbno;
__entry->len = len;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -2350,7 +2477,7 @@ DECLARE_EVENT_CLASS(xfs_btree_cur_class,
__entry->level = level;
__entry->nlevels = cur->bc_nlevels;
__entry->ptr = cur->bc_ptrs[level];
- __entry->daddr = bp ? bp->b_bn : -1;
+ __entry->daddr = bp ? xfs_buf_daddr(bp) : -1;
),
TP_printk("dev %d:%d btree %s level %d/%d ptr %d daddr 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -2466,7 +2593,7 @@ DECLARE_EVENT_CLASS(xfs_phys_extent_deferred_class,
__entry->agbno = agbno;
__entry->len = len;
),
- TP_printk("dev %d:%d op %d agno %u agbno %u len %u",
+ TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->type,
__entry->agno,
@@ -2513,13 +2640,13 @@ DECLARE_EVENT_CLASS(xfs_map_extent_deferred_class,
__entry->l_state = state;
__entry->op = op;
),
- TP_printk("dev %d:%d op %d agno %u agbno %u owner %lld %s offset %llu len %llu state %d",
+ TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->op,
__entry->agno,
__entry->agbno,
__entry->ino,
- __entry->whichfork == XFS_ATTR_FORK ? "attr" : "data",
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
__entry->l_loff,
__entry->l_len,
__entry->l_state)
@@ -2583,7 +2710,7 @@ DECLARE_EVENT_CLASS(xfs_rmap_class,
if (unwritten)
__entry->flags |= XFS_RMAP_UNWRITTEN;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld offset %llu flags 0x%lx",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%lx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -2616,7 +2743,7 @@ DECLARE_EVENT_CLASS(xfs_ag_error_class,
__entry->error = error;
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d agno %u error %d caller %pS",
+ TP_printk("dev %d:%d agno 0x%x error %d caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->error,
@@ -2663,7 +2790,7 @@ DECLARE_EVENT_CLASS(xfs_rmapbt_class,
__entry->offset = offset;
__entry->flags = flags;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld offset %llu flags 0x%x",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -2732,7 +2859,7 @@ DECLARE_EVENT_CLASS(xfs_ag_resv_class,
__entry->asked = r ? r->ar_asked : 0;
__entry->len = len;
),
- TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u "
+ TP_printk("dev %d:%d agno 0x%x resv %d freeblks %u flcount %u "
"resv %u ask %u len %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
@@ -2785,7 +2912,7 @@ DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class,
__entry->agbno = agbno;
__entry->dir = dir;
),
- TP_printk("dev %d:%d agno %u agbno %u cmp %s(%d)",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x cmp %s(%d)",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -2818,7 +2945,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
__entry->blockcount = irec->rc_blockcount;
__entry->refcount = irec->rc_refcount;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->startblock,
@@ -2853,7 +2980,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class,
__entry->refcount = irec->rc_refcount;
__entry->agbno = agbno;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u @ agbno %u",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->startblock,
@@ -2893,8 +3020,8 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
__entry->i2_blockcount = i2->rc_blockcount;
__entry->i2_refcount = i2->rc_refcount;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
- "agbno %u len %u refcount %u",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- "
+ "agbno 0x%x fsbcount 0x%x refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->i1_startblock,
@@ -2939,8 +3066,8 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
__entry->i2_refcount = i2->rc_refcount;
__entry->agbno = agbno;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
- "agbno %u len %u refcount %u @ agbno %u",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- "
+ "agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->i1_startblock,
@@ -2991,9 +3118,9 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
__entry->i3_blockcount = i3->rc_blockcount;
__entry->i3_refcount = i3->rc_refcount;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
- "agbno %u len %u refcount %u -- "
- "agbno %u len %u refcount %u",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- "
+ "agbno 0x%x fsbcount 0x%x refcount %u -- "
+ "agbno 0x%x fsbcount 0x%x refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->i1_startblock,
@@ -3080,7 +3207,7 @@ TRACE_EVENT(xfs_refcount_finish_one_leftover,
__entry->new_agbno = new_agbno;
__entry->new_len = new_len;
),
- TP_printk("dev %d:%d type %d agno %u agbno %u len %u new_agbno %u new_len %u",
+ TP_printk("dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x new_agbno 0x%x new_fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->type,
__entry->agno,
@@ -3106,7 +3233,7 @@ DECLARE_EVENT_CLASS(xfs_inode_error_class,
__entry->error = error;
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d ino %llx error %d caller %pS",
+ TP_printk("dev %d:%d ino 0x%llx error %d caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->error,
@@ -3132,7 +3259,7 @@ DECLARE_EVENT_CLASS(xfs_double_io_class,
__field(loff_t, src_isize)
__field(loff_t, src_disize)
__field(loff_t, src_offset)
- __field(size_t, len)
+ __field(long long, len)
__field(xfs_ino_t, dest_ino)
__field(loff_t, dest_isize)
__field(loff_t, dest_disize)
@@ -3150,9 +3277,9 @@ DECLARE_EVENT_CLASS(xfs_double_io_class,
__entry->dest_disize = dest->i_disk_size;
__entry->dest_offset = doffset;
),
- TP_printk("dev %d:%d count %zd "
- "ino 0x%llx isize 0x%llx disize 0x%llx offset 0x%llx -> "
- "ino 0x%llx isize 0x%llx disize 0x%llx offset 0x%llx",
+ TP_printk("dev %d:%d bytecount 0x%llx "
+ "ino 0x%llx isize 0x%llx disize 0x%llx pos 0x%llx -> "
+ "ino 0x%llx isize 0x%llx disize 0x%llx pos 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->len,
__entry->src_ino,
@@ -3191,7 +3318,7 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class,
__entry->pblk = irec->br_startblock;
__entry->state = irec->br_state;
),
- TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu st %d",
+ TP_printk("dev %d:%d ino 0x%llx fileoff 0x%llx fsbcount 0x%x startblock 0x%llx st %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->lblk,
@@ -3231,9 +3358,8 @@ TRACE_EVENT(xfs_reflink_remap_blocks,
__entry->dest_ino = dest->i_ino;
__entry->dest_lblk = doffset;
),
- TP_printk("dev %d:%d len 0x%llx "
- "ino 0x%llx offset 0x%llx blocks -> "
- "ino 0x%llx offset 0x%llx blocks",
+ TP_printk("dev %d:%d fsbcount 0x%llx "
+ "ino 0x%llx fileoff 0x%llx -> ino 0x%llx fileoff 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->len,
__entry->src_ino,
@@ -3272,9 +3398,7 @@ TRACE_EVENT(xfs_ioctl_clone,
__entry->dest_ino = dest->i_ino;
__entry->dest_isize = i_size_read(dest);
),
- TP_printk("dev %d:%d "
- "ino 0x%lx isize 0x%llx -> "
- "ino 0x%lx isize 0x%llx",
+ TP_printk("dev %d:%d ino 0x%lx isize 0x%llx -> ino 0x%lx isize 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->src_ino,
__entry->src_isize,
@@ -3310,7 +3434,7 @@ DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error);
/* fsmap traces */
DECLARE_EVENT_CLASS(xfs_fsmap_class,
TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno,
- struct xfs_rmap_irec *rmap),
+ const struct xfs_rmap_irec *rmap),
TP_ARGS(mp, keydev, agno, rmap),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -3332,7 +3456,7 @@ DECLARE_EVENT_CLASS(xfs_fsmap_class,
__entry->offset = rmap->rm_offset;
__entry->flags = rmap->rm_flags;
),
- TP_printk("dev %d:%d keydev %d:%d agno %u bno %llu len %llu owner %lld offset %llu flags 0x%x",
+ TP_printk("dev %d:%d keydev %d:%d agno 0x%x startblock 0x%llx fsbcount 0x%llx owner 0x%llx fileoff 0x%llx flags 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
MAJOR(__entry->keydev), MINOR(__entry->keydev),
__entry->agno,
@@ -3345,7 +3469,7 @@ DECLARE_EVENT_CLASS(xfs_fsmap_class,
#define DEFINE_FSMAP_EVENT(name) \
DEFINE_EVENT(xfs_fsmap_class, name, \
TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, \
- struct xfs_rmap_irec *rmap), \
+ const struct xfs_rmap_irec *rmap), \
TP_ARGS(mp, keydev, agno, rmap))
DEFINE_FSMAP_EVENT(xfs_fsmap_low_key);
DEFINE_FSMAP_EVENT(xfs_fsmap_high_key);
@@ -3372,7 +3496,7 @@ DECLARE_EVENT_CLASS(xfs_getfsmap_class,
__entry->offset = fsmap->fmr_offset;
__entry->flags = fsmap->fmr_flags;
),
- TP_printk("dev %d:%d keydev %d:%d block %llu len %llu owner %lld offset %llu flags 0x%llx",
+ TP_printk("dev %d:%d keydev %d:%d daddr 0x%llx bbcount 0x%llx owner 0x%llx fileoff_daddr 0x%llx flags 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
MAJOR(__entry->keydev), MINOR(__entry->keydev),
__entry->block,
@@ -3471,7 +3595,7 @@ TRACE_EVENT(xfs_iunlink_update_bucket,
__entry->old_ptr = old_ptr;
__entry->new_ptr = new_ptr;
),
- TP_printk("dev %d:%d agno %u bucket %u old 0x%x new 0x%x",
+ TP_printk("dev %d:%d agno 0x%x bucket %u old 0x%x new 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->bucket,
@@ -3497,7 +3621,7 @@ TRACE_EVENT(xfs_iunlink_update_dinode,
__entry->old_ptr = old_ptr;
__entry->new_ptr = new_ptr;
),
- TP_printk("dev %d:%d agno %u agino 0x%x old 0x%x new 0x%x",
+ TP_printk("dev %d:%d agno 0x%x agino 0x%x old 0x%x new 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agino,
@@ -3518,7 +3642,7 @@ DECLARE_EVENT_CLASS(xfs_ag_inode_class,
__entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
__entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
),
- TP_printk("dev %d:%d agno %u agino %u",
+ TP_printk("dev %d:%d agno 0x%x agino 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno, __entry->agino)
)
@@ -3570,7 +3694,7 @@ DECLARE_EVENT_CLASS(xfs_ag_corrupt_class,
__entry->agno = agno;
__entry->flags = flags;
),
- TP_printk("dev %d:%d agno %u flags 0x%x",
+ TP_printk("dev %d:%d agno 0x%x flags 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno, __entry->flags)
);
@@ -3621,7 +3745,7 @@ TRACE_EVENT(xfs_iwalk_ag,
__entry->agno = agno;
__entry->startino = startino;
),
- TP_printk("dev %d:%d agno %d startino %u",
+ TP_printk("dev %d:%d agno 0x%x startino 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
__entry->startino)
)
@@ -3642,7 +3766,7 @@ TRACE_EVENT(xfs_iwalk_ag_rec,
__entry->startino = irec->ir_startino;
__entry->freemask = irec->ir_free;
),
- TP_printk("dev %d:%d agno %d startino %u freemask 0x%llx",
+ TP_printk("dev %d:%d agno 0x%x startino 0x%x freemask 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
__entry->startino, __entry->freemask)
)
@@ -3689,8 +3813,6 @@ DEFINE_EVENT(xfs_kmem_class, name, \
TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \
TP_ARGS(size, flags, caller_ip))
DEFINE_KMEM_EVENT(kmem_alloc);
-DEFINE_KMEM_EVENT(kmem_alloc_io);
-DEFINE_KMEM_EVENT(kmem_alloc_large);
TRACE_EVENT(xfs_check_new_dalign,
TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino),
@@ -3707,7 +3829,7 @@ TRACE_EVENT(xfs_check_new_dalign,
__entry->sb_rootino = mp->m_sb.sb_rootino;
__entry->calc_rootino = calc_rootino;
),
- TP_printk("dev %d:%d new_dalign %d sb_rootino %llu calc_rootino %llu",
+ TP_printk("dev %d:%d new_dalign %d sb_rootino 0x%llx calc_rootino 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->new_dalign, __entry->sb_rootino,
__entry->calc_rootino)
@@ -3732,7 +3854,7 @@ TRACE_EVENT(xfs_btree_commit_afakeroot,
__entry->levels = cur->bc_ag.afake->af_levels;
__entry->blocks = cur->bc_ag.afake->af_blocks;
),
- TP_printk("dev %d:%d btree %s ag %u levels %u blocks %u root %u",
+ TP_printk("dev %d:%d btree %s agno 0x%x levels %u blocks %u root %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
__entry->agno,
@@ -3764,12 +3886,12 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot,
__entry->blocks = cur->bc_ino.ifake->if_blocks;
__entry->whichfork = cur->bc_ino.whichfork;
),
- TP_printk("dev %d:%d btree %s ag %u agino %u whichfork %s levels %u blocks %u",
+ TP_printk("dev %d:%d btree %s agno 0x%x agino 0x%x whichfork %s levels %u blocks %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
__entry->agno,
__entry->agino,
- __entry->whichfork == XFS_ATTR_FORK ? "attr" : "data",
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
__entry->levels,
__entry->blocks)
)
@@ -3847,7 +3969,7 @@ TRACE_EVENT(xfs_btree_bload_block,
}
__entry->nr_records = nr_records;
),
- TP_printk("dev %d:%d btree %s level %u block %llu/%llu fsb (%u/%u) recs %u",
+ TP_printk("dev %d:%d btree %s level %u block %llu/%llu agno 0x%x agbno 0x%x recs %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
__entry->level,
@@ -3934,7 +4056,6 @@ TRACE_DEFINE_ENUM(XLOG_STATE_SYNCING);
TRACE_DEFINE_ENUM(XLOG_STATE_DONE_SYNC);
TRACE_DEFINE_ENUM(XLOG_STATE_CALLBACK);
TRACE_DEFINE_ENUM(XLOG_STATE_DIRTY);
-TRACE_DEFINE_ENUM(XLOG_STATE_IOERROR);
DECLARE_EVENT_CLASS(xlog_iclog_class,
TP_PROTO(struct xlog_in_core *iclog, unsigned long caller_ip),
@@ -3990,6 +4111,57 @@ DEFINE_ICLOG_EVENT(xlog_iclog_want_sync);
DEFINE_ICLOG_EVENT(xlog_iclog_wait_on);
DEFINE_ICLOG_EVENT(xlog_iclog_write);
+DECLARE_EVENT_CLASS(xfs_das_state_class,
+ TP_PROTO(int das, struct xfs_inode *ip),
+ TP_ARGS(das, ip),
+ TP_STRUCT__entry(
+ __field(int, das)
+ __field(xfs_ino_t, ino)
+ ),
+ TP_fast_assign(
+ __entry->das = das;
+ __entry->ino = ip->i_ino;
+ ),
+ TP_printk("state change %d ino 0x%llx",
+ __entry->das, __entry->ino)
+)
+
+#define DEFINE_DAS_STATE_EVENT(name) \
+DEFINE_EVENT(xfs_das_state_class, name, \
+ TP_PROTO(int das, struct xfs_inode *ip), \
+ TP_ARGS(das, ip))
+DEFINE_DAS_STATE_EVENT(xfs_attr_sf_addname_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_set_iter_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_node_addname_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return);
+
+TRACE_EVENT(xfs_force_shutdown,
+ TP_PROTO(struct xfs_mount *mp, int ptag, int flags, const char *fname,
+ int line_num),
+ TP_ARGS(mp, ptag, flags, fname, line_num),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, ptag)
+ __field(int, flags)
+ __string(fname, fname)
+ __field(int, line_num)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ptag = ptag;
+ __entry->flags = flags;
+ __assign_str(fname, fname);
+ __entry->line_num = line_num;
+ ),
+ TP_printk("dev %d:%d tag %s flags %s file %s line_num %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_flags(__entry->ptag, "|", XFS_PTAG_STRINGS),
+ __print_flags(__entry->flags, "|", XFS_SHUTDOWN_STRINGS),
+ __get_str(fname),
+ __entry->line_num)
+);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
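The xfs_trace.h hunks above mostly convert disk-address, block-count and offset fields in the TP_printk() format strings from decimal (%u, %llu) to hexadecimal (0x%x, 0x%llx), renaming them to the fsbcount/daddr/fileoff vocabulary used by the new field names. A minimal userspace C sketch of what that change means for the rendered trace line; the values are made up for illustration and nothing below is kernel code:

/* Same agbno/fsbcount values, printed in the old decimal style and in
 * the new hex style used by the updated TP_printk() strings. */
#include <stdio.h>

int main(void)
{
	unsigned int agno = 3, agbno = 77824, fsbcount = 512;

	/* old style: "agno %u agbno %u len %u" */
	printf("agno %u agbno %u len %u\n", agno, agbno, fsbcount);

	/* new style: "agno 0x%x agbno 0x%x fsbcount 0x%x" */
	printf("agno 0x%x agbno 0x%x fsbcount 0x%x\n", agno, agbno, fsbcount);
	return 0;
}

The practical effect is that each quantity now prints in the same base as the other hex-formatted fields on the line, so related values can be compared directly in the trace output.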
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 87bffd12c20c..67dec11e34c7 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -9,7 +9,6 @@
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
-#include "xfs_log_priv.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_extent_busy.h"
@@ -17,6 +16,7 @@
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_log.h"
+#include "xfs_log_priv.h"
#include "xfs_trace.h"
#include "xfs_error.h"
#include "xfs_defer.h"
@@ -275,7 +275,7 @@ retry:
WARN_ON(resp->tr_logres > 0 &&
mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) ||
- xfs_sb_version_haslazysbcount(&mp->m_sb));
+ xfs_has_lazysbcount(mp));
tp->t_magic = XFS_TRANS_HEADER_MAGIC;
tp->t_flags = flags;
@@ -295,10 +295,7 @@ retry:
* Do not perform a synchronous scan because callers can hold
* other locks.
*/
- error = xfs_blockgc_free_space(mp, NULL);
- if (error)
- return error;
-
+ xfs_blockgc_flush_all(mp);
want_retry = false;
goto retry;
}
@@ -367,12 +364,12 @@ xfs_trans_mod_sb(
switch (field) {
case XFS_TRANS_SB_ICOUNT:
tp->t_icount_delta += delta;
- if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+ if (xfs_has_lazysbcount(mp))
flags &= ~XFS_TRANS_SB_DIRTY;
break;
case XFS_TRANS_SB_IFREE:
tp->t_ifree_delta += delta;
- if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+ if (xfs_has_lazysbcount(mp))
flags &= ~XFS_TRANS_SB_DIRTY;
break;
case XFS_TRANS_SB_FDBLOCKS:
@@ -401,7 +398,7 @@ xfs_trans_mod_sb(
delta -= blkres_delta;
}
tp->t_fdblocks_delta += delta;
- if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+ if (xfs_has_lazysbcount(mp))
flags &= ~XFS_TRANS_SB_DIRTY;
break;
case XFS_TRANS_SB_RES_FDBLOCKS:
@@ -411,7 +408,7 @@ xfs_trans_mod_sb(
* be applied to the on-disk superblock.
*/
tp->t_res_fdblocks_delta += delta;
- if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+ if (xfs_has_lazysbcount(mp))
flags &= ~XFS_TRANS_SB_DIRTY;
break;
case XFS_TRANS_SB_FREXTENTS:
@@ -490,7 +487,7 @@ xfs_trans_apply_sb_deltas(
/*
* Only update the superblock counters if we are logging them
*/
- if (!xfs_sb_version_haslazysbcount(&(tp->t_mountp->m_sb))) {
+ if (!xfs_has_lazysbcount((tp->t_mountp))) {
if (tp->t_icount_delta)
be64_add_cpu(&sbp->sb_icount, tp->t_icount_delta);
if (tp->t_ifree_delta)
@@ -588,7 +585,7 @@ xfs_trans_unreserve_and_mod_sb(
if (tp->t_blk_res > 0)
blkdelta = tp->t_blk_res;
if ((tp->t_fdblocks_delta != 0) &&
- (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
+ (xfs_has_lazysbcount(mp) ||
(tp->t_flags & XFS_TRANS_SB_DIRTY)))
blkdelta += tp->t_fdblocks_delta;
@@ -598,7 +595,7 @@ xfs_trans_unreserve_and_mod_sb(
(tp->t_flags & XFS_TRANS_SB_DIRTY))
rtxdelta += tp->t_frextents_delta;
- if (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
+ if (xfs_has_lazysbcount(mp) ||
(tp->t_flags & XFS_TRANS_SB_DIRTY)) {
idelta = tp->t_icount_delta;
ifreedelta = tp->t_ifree_delta;
@@ -778,7 +775,7 @@ xfs_trans_committed_bulk(
* object into the AIL as we are in a shutdown situation.
*/
if (aborted) {
- ASSERT(XFS_FORCED_SHUTDOWN(ailp->ail_mount));
+ ASSERT(xfs_is_shutdown(ailp->ail_mount));
if (lip->li_ops->iop_unpin)
lip->li_ops->iop_unpin(lip, 1);
continue;
@@ -867,7 +864,7 @@ __xfs_trans_commit(
if (!(tp->t_flags & XFS_TRANS_DIRTY))
goto out_unreserve;
- if (XFS_FORCED_SHUTDOWN(mp)) {
+ if (xfs_is_shutdown(mp)) {
error = -EIO;
goto out_unreserve;
}
@@ -908,7 +905,7 @@ out_unreserve:
*/
xfs_trans_unreserve_and_mod_dquots(tp);
if (tp->t_ticket) {
- if (regrant && !XLOG_FORCED_SHUTDOWN(mp->m_log))
+ if (regrant && !xlog_is_shutdown(mp->m_log))
xfs_log_ticket_regrant(mp->m_log, tp->t_ticket);
else
xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
@@ -953,12 +950,12 @@ xfs_trans_cancel(
* filesystem. This happens in paths where we detect
* corruption and decide to give up.
*/
- if (dirty && !XFS_FORCED_SHUTDOWN(mp)) {
+ if (dirty && !xfs_is_shutdown(mp)) {
XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
}
#ifdef DEBUG
- if (!dirty && !XFS_FORCED_SHUTDOWN(mp)) {
+ if (!dirty && !xfs_is_shutdown(mp)) {
struct xfs_log_item *lip;
list_for_each_entry(lip, &tp->t_items, li_trans)
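The xfs_trans.c hunks replace open-coded superblock feature tests such as xfs_sb_version_haslazysbcount(&mp->m_sb) and the XFS_FORCED_SHUTDOWN() macro with mount-level predicates, xfs_has_lazysbcount(mp) and xfs_is_shutdown(mp). A self-contained sketch of that helper pattern; the type, flag bits and function bodies here are invented for illustration and are not the kernel's implementation:

/* Feature and state tests move from open-coded field checks to small
 * predicates over mount-wide flags words. */
#include <stdbool.h>
#include <stdio.h>

#define DEMO_FEAT_LAZYSBCOUNT	(1u << 0)	/* set once at mount time */
#define DEMO_OPSTATE_SHUTDOWN	(1u << 1)	/* runtime state bit */

struct demo_mount {
	unsigned int m_features;
	unsigned int m_opstate;
};

static bool demo_has_lazysbcount(const struct demo_mount *mp)
{
	return mp->m_features & DEMO_FEAT_LAZYSBCOUNT;
}

static bool demo_is_shutdown(const struct demo_mount *mp)
{
	return mp->m_opstate & DEMO_OPSTATE_SHUTDOWN;
}

int main(void)
{
	struct demo_mount mp = { .m_features = DEMO_FEAT_LAZYSBCOUNT };

	if (demo_has_lazysbcount(&mp) && !demo_is_shutdown(&mp))
		printf("lazy sb counters active, fs not shut down\n");
	return 0;
}

Callers then read as a question about the mount rather than a dereference into the in-core superblock, which is the shape every converted call site above takes.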
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index dbb69b4bf3ed..2a8c8dc54c95 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -17,6 +17,7 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_log.h"
+#include "xfs_log_priv.h"
#ifdef DEBUG
/*
@@ -429,8 +430,12 @@ xfsaild_push(
/*
* If we encountered pinned items or did not finish writing out all
- * buffers the last time we ran, force the log first and wait for it
- * before pushing again.
+ * buffers the last time we ran, force a background CIL push to get the
+ * items unpinned in the near future. We do not wait on the CIL push as
+ * that could stall us for seconds if there is enough background IO
+ * load. Stalling for that long when the tail of the log is pinned and
+ * needs flushing will hard stop the transaction subsystem when log
+ * space runs out.
*/
if (ailp->ail_log_flush && ailp->ail_last_pushed_lsn == 0 &&
(!list_empty_careful(&ailp->ail_buf_list) ||
@@ -438,7 +443,7 @@ xfsaild_push(
ailp->ail_log_flush = 0;
XFS_STATS_INC(mp, xs_push_ail_flush);
- xfs_log_force(mp, XFS_LOG_SYNC);
+ xlog_cil_flush(mp->m_log);
}
spin_lock(&ailp->ail_lock);
@@ -615,7 +620,7 @@ xfsaild(
* opportunity to release such buffers from the queue.
*/
ASSERT(list_empty(&ailp->ail_buf_list) ||
- XFS_FORCED_SHUTDOWN(ailp->ail_mount));
+ xfs_is_shutdown(ailp->ail_mount));
xfs_buf_delwri_cancel(&ailp->ail_buf_list);
break;
}
@@ -678,7 +683,7 @@ xfs_ail_push(
struct xfs_log_item *lip;
lip = xfs_ail_min(ailp);
- if (!lip || XFS_FORCED_SHUTDOWN(ailp->ail_mount) ||
+ if (!lip || xfs_is_shutdown(ailp->ail_mount) ||
XFS_LSN_CMP(threshold_lsn, ailp->ail_target) <= 0)
return;
@@ -743,7 +748,7 @@ xfs_ail_update_finish(
return;
}
- if (!XFS_FORCED_SHUTDOWN(mp))
+ if (!xfs_is_shutdown(mp))
xlog_assign_tail_lsn_locked(mp);
if (list_empty(&ailp->ail_head))
@@ -863,7 +868,7 @@ xfs_trans_ail_delete(
spin_lock(&ailp->ail_lock);
if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
spin_unlock(&ailp->ail_lock);
- if (shutdown_type && !XFS_FORCED_SHUTDOWN(mp)) {
+ if (shutdown_type && !xfs_is_shutdown(mp)) {
xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
"%s: attempting to delete a log item that is not in the AIL",
__func__);
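The xfsaild_push() hunk swaps the synchronous xfs_log_force(mp, XFS_LOG_SYNC) for a non-blocking xlog_cil_flush(mp->m_log), for the reason spelled out in the updated comment: the pusher must not stall behind log IO. A toy pthread model of that submit-and-continue behaviour; the names and structure are stand-ins, not kernel APIs (build with -lpthread):

/* The pusher signals a background flush worker and keeps going instead
 * of submitting the flush itself and waiting for completion. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int flush_requested;

static void *flush_worker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!flush_requested)
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
	printf("worker: flushing pinned items in the background\n");
	return NULL;
}

int main(void)
{
	pthread_t worker;

	pthread_create(&worker, NULL, flush_worker, NULL);

	/* pusher: request the flush without waiting for it to finish */
	pthread_mutex_lock(&lock);
	flush_requested = 1;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
	printf("pusher: flush requested, continuing to push the AIL\n");

	pthread_join(worker, NULL);	/* join only to end the demo cleanly */
	return 0;
}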
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index d11d032da0b4..6549e50d852c 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -38,7 +38,7 @@ xfs_trans_buf_item_match(
blip = (struct xfs_buf_log_item *)lip;
if (blip->bli_item.li_type == XFS_LI_BUF &&
blip->bli_buf->b_target == target &&
- XFS_BUF_ADDR(blip->bli_buf) == map[0].bm_bn &&
+ xfs_buf_daddr(blip->bli_buf) == map[0].bm_bn &&
blip->bli_buf->b_length == len) {
ASSERT(blip->bli_buf->b_map_count == nmaps);
return blip->bli_buf;
@@ -138,7 +138,7 @@ xfs_trans_get_buf_map(
bp = xfs_trans_buf_item_match(tp, target, map, nmaps);
if (bp != NULL) {
ASSERT(xfs_buf_islocked(bp));
- if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) {
+ if (xfs_is_shutdown(tp->t_mountp)) {
xfs_buf_stale(bp);
bp->b_flags |= XBF_DONE;
}
@@ -244,7 +244,7 @@ xfs_trans_read_buf_map(
* We never locked this buf ourselves, so we shouldn't
* brelse it either. Just get out.
*/
- if (XFS_FORCED_SHUTDOWN(mp)) {
+ if (xfs_is_shutdown(mp)) {
trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
return -EIO;
}
@@ -300,7 +300,7 @@ xfs_trans_read_buf_map(
return error;
}
- if (XFS_FORCED_SHUTDOWN(mp)) {
+ if (xfs_is_shutdown(mp)) {
xfs_buf_relse(bp);
trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
return -EIO;
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 48e09ea30ee5..3872ce671411 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -58,7 +58,7 @@ xfs_trans_log_dquot(
/* Upgrade the dquot to bigtime format if possible. */
if (dqp->q_id != 0 &&
- xfs_sb_version_hasbigtime(&tp->t_mountp->m_sb) &&
+ xfs_has_bigtime(tp->t_mountp) &&
!(dqp->q_type & XFS_DQTYPE_BIGTIME))
dqp->q_type |= XFS_DQTYPE_BIGTIME;
@@ -132,8 +132,7 @@ xfs_trans_mod_dquot_byino(
{
xfs_mount_t *mp = tp->t_mountp;
- if (!XFS_IS_QUOTA_RUNNING(mp) ||
- !XFS_IS_QUOTA_ON(mp) ||
+ if (!XFS_IS_QUOTA_ON(mp) ||
xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
return;
@@ -192,7 +191,7 @@ xfs_trans_mod_dquot(
struct xfs_dqtrx *qtrx;
ASSERT(tp);
- ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp));
+ ASSERT(XFS_IS_QUOTA_ON(tp->t_mountp));
qtrx = NULL;
if (!delta)
@@ -738,7 +737,7 @@ xfs_trans_reserve_quota_bydquots(
{
int error;
- if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+ if (!XFS_IS_QUOTA_ON(mp))
return 0;
ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
@@ -795,7 +794,7 @@ xfs_trans_reserve_quota_nblks(
unsigned int qflags = 0;
int error;
- if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+ if (!XFS_IS_QUOTA_ON(mp))
return 0;
ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino));
@@ -836,51 +835,13 @@ xfs_trans_reserve_quota_icreate(
{
struct xfs_mount *mp = tp->t_mountp;
- if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+ if (!XFS_IS_QUOTA_ON(mp))
return 0;
return xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp, pdqp,
dblocks, 1, XFS_QMOPT_RES_REGBLKS);
}
-/*
- * This routine is called to allocate a quotaoff log item.
- */
-struct xfs_qoff_logitem *
-xfs_trans_get_qoff_item(
- struct xfs_trans *tp,
- struct xfs_qoff_logitem *startqoff,
- uint flags)
-{
- struct xfs_qoff_logitem *q;
-
- ASSERT(tp != NULL);
-
- q = xfs_qm_qoff_logitem_init(tp->t_mountp, startqoff, flags);
- ASSERT(q != NULL);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- xfs_trans_add_item(tp, &q->qql_item);
- return q;
-}
-
-
-/*
- * This is called to mark the quotaoff logitem as needing
- * to be logged when the transaction is committed. The logitem must
- * already be associated with the given transaction.
- */
-void
-xfs_trans_log_quotaoff_item(
- struct xfs_trans *tp,
- struct xfs_qoff_logitem *qlp)
-{
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &qlp->qql_item.li_flags);
-}
-
STATIC void
xfs_trans_alloc_dqinfo(
xfs_trans_t *tp)
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 70055d486bf7..ddc346a9df9b 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -462,7 +462,7 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize)
inode_dio_wait(inode);
/* Serialize against page faults */
- down_write(&zi->i_mmap_sem);
+ filemap_invalidate_lock(inode->i_mapping);
/* Serialize against zonefs_iomap_begin() */
mutex_lock(&zi->i_truncate_mutex);
@@ -500,7 +500,7 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize)
unlock:
mutex_unlock(&zi->i_truncate_mutex);
- up_write(&zi->i_mmap_sem);
+ filemap_invalidate_unlock(inode->i_mapping);
return ret;
}
@@ -575,18 +575,6 @@ static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
return ret;
}
-static vm_fault_t zonefs_filemap_fault(struct vm_fault *vmf)
-{
- struct zonefs_inode_info *zi = ZONEFS_I(file_inode(vmf->vma->vm_file));
- vm_fault_t ret;
-
- down_read(&zi->i_mmap_sem);
- ret = filemap_fault(vmf);
- up_read(&zi->i_mmap_sem);
-
- return ret;
-}
-
static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
@@ -607,16 +595,16 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
file_update_time(vmf->vma->vm_file);
/* Serialize against truncates */
- down_read(&zi->i_mmap_sem);
+ filemap_invalidate_lock_shared(inode->i_mapping);
ret = iomap_page_mkwrite(vmf, &zonefs_iomap_ops);
- up_read(&zi->i_mmap_sem);
+ filemap_invalidate_unlock_shared(inode->i_mapping);
sb_end_pagefault(inode->i_sb);
return ret;
}
static const struct vm_operations_struct zonefs_file_vm_ops = {
- .fault = zonefs_filemap_fault,
+ .fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = zonefs_filemap_page_mkwrite,
};
@@ -1155,7 +1143,6 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb)
inode_init_once(&zi->i_vnode);
mutex_init(&zi->i_truncate_mutex);
- init_rwsem(&zi->i_mmap_sem);
zi->i_wr_refcnt = 0;
return &zi->i_vnode;
diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h
index 51141907097c..7b147907c328 100644
--- a/fs/zonefs/zonefs.h
+++ b/fs/zonefs/zonefs.h
@@ -70,12 +70,11 @@ struct zonefs_inode_info {
* and changes to the inode private data, and in particular changes to
* a sequential file size on completion of direct IO writes.
* Serialization of mmap read IOs with truncate and syscall IO
- * operations is done with i_mmap_sem in addition to i_truncate_mutex.
- * Only zonefs_seq_file_truncate() takes both lock (i_mmap_sem first,
- * i_truncate_mutex second).
+ * operations is done with invalidate_lock in addition to
+ * i_truncate_mutex. Only zonefs_seq_file_truncate() takes both lock
+ * (invalidate_lock first, i_truncate_mutex second).
*/
struct mutex i_truncate_mutex;
- struct rw_semaphore i_mmap_sem;
/* guarded by i_truncate_mutex */
unsigned int i_wr_refcnt;
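The zonefs hunks drop the filesystem's private i_mmap_sem in favour of the address_space invalidate_lock taken through the filemap_invalidate_lock() helpers, with the ordering stated in the updated zonefs.h comment: invalidate_lock first, i_truncate_mutex second, and page faults taking the shared side. A self-contained pthread sketch of that serialization pattern between truncate and page faults; it models the locking rule only and is not the kernel code (build with -lpthread):

/* Truncate takes the invalidating lock exclusively and nests the
 * per-inode mutex inside it; a page_mkwrite-style path takes the
 * shared side so it only excludes truncation. */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t invalidate_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t truncate_mutex = PTHREAD_MUTEX_INITIALIZER;

static void demo_truncate(void)
{
	pthread_rwlock_wrlock(&invalidate_lock);	/* blocks new faults */
	pthread_mutex_lock(&truncate_mutex);		/* serializes size updates */
	printf("truncate: size updated, stale pages invalidated\n");
	pthread_mutex_unlock(&truncate_mutex);
	pthread_rwlock_unlock(&invalidate_lock);
}

static void demo_page_mkwrite(void)
{
	pthread_rwlock_rdlock(&invalidate_lock);	/* excludes truncate only */
	printf("page_mkwrite: safe against concurrent truncate\n");
	pthread_rwlock_unlock(&invalidate_lock);
}

int main(void)
{
	demo_page_mkwrite();
	demo_truncate();
	return 0;
}

Because the read fault path no longer needs a zonefs-private lock at all, the diff can point .fault straight at filemap_fault and delete zonefs_filemap_fault().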