aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Makefile4
-rw-r--r--fs/afs/cell.c1
-rw-r--r--fs/afs/flock.c4
-rw-r--r--fs/afs/inode.c3
-rw-r--r--fs/afs/protocol_yfs.h11
-rw-r--r--fs/afs/rxrpc.c53
-rw-r--r--fs/afs/server_list.c4
-rw-r--r--fs/afs/yfsclient.c2
-rw-r--r--fs/aio.c95
-rw-r--r--fs/autofs/autofs_i.h3
-rw-r--r--fs/autofs/expire.c3
-rw-r--r--fs/autofs/inode.c23
-rw-r--r--fs/binfmt_aout.c83
-rw-r--r--fs/binfmt_elf.c32
-rw-r--r--fs/binfmt_script.c57
-rw-r--r--fs/block_dev.c56
-rw-r--r--fs/btrfs/acl.c9
-rw-r--r--fs/btrfs/async-thread.c10
-rw-r--r--fs/btrfs/backref.c22
-rw-r--r--fs/btrfs/compression.c256
-rw-r--r--fs/btrfs/compression.h52
-rw-r--r--fs/btrfs/ctree.c150
-rw-r--r--fs/btrfs/ctree.h102
-rw-r--r--fs/btrfs/delayed-ref.c15
-rw-r--r--fs/btrfs/delayed-ref.h11
-rw-r--r--fs/btrfs/dev-replace.c9
-rw-r--r--fs/btrfs/disk-io.c54
-rw-r--r--fs/btrfs/extent-tree.c309
-rw-r--r--fs/btrfs/extent_io.c99
-rw-r--r--fs/btrfs/extent_io.h15
-rw-r--r--fs/btrfs/extent_map.c5
-rw-r--r--fs/btrfs/extent_map.h1
-rw-r--r--fs/btrfs/file.c3
-rw-r--r--fs/btrfs/inode.c218
-rw-r--r--fs/btrfs/ioctl.c109
-rw-r--r--fs/btrfs/locking.c108
-rw-r--r--fs/btrfs/locking.h15
-rw-r--r--fs/btrfs/lzo.c31
-rw-r--r--fs/btrfs/qgroup.c372
-rw-r--r--fs/btrfs/qgroup.h120
-rw-r--r--fs/btrfs/raid56.c3
-rw-r--r--fs/btrfs/ref-verify.c4
-rw-r--r--fs/btrfs/relocation.c119
-rw-r--r--fs/btrfs/root-tree.c4
-rw-r--r--fs/btrfs/scrub.c49
-rw-r--r--fs/btrfs/super.c16
-rw-r--r--fs/btrfs/transaction.c33
-rw-r--r--fs/btrfs/tree-defrag.c2
-rw-r--r--fs/btrfs/tree-log.c282
-rw-r--r--fs/btrfs/volumes.c216
-rw-r--r--fs/btrfs/volumes.h5
-rw-r--r--fs/btrfs/zlib.c45
-rw-r--r--fs/btrfs/zstd.c316
-rw-r--r--fs/buffer.c31
-rw-r--r--fs/ceph/addr.c5
-rw-r--r--fs/ceph/caps.c2
-rw-r--r--fs/ceph/quota.c13
-rw-r--r--fs/ceph/snap.c3
-rw-r--r--fs/ceph/super.c4
-rw-r--r--fs/cifs/Kconfig120
-rw-r--r--fs/cifs/cifs_debug.c1
-rw-r--r--fs/cifs/cifs_dfs_ref.c4
-rw-r--r--fs/cifs/cifs_fs_sb.h1
-rw-r--r--fs/cifs/cifsfs.c3
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h85
-rw-r--r--fs/cifs/cifsproto.h8
-rw-r--r--fs/cifs/cifssmb.c109
-rw-r--r--fs/cifs/connect.c94
-rw-r--r--fs/cifs/dfs_cache.c1
-rw-r--r--fs/cifs/file.c325
-rw-r--r--fs/cifs/inode.c12
-rw-r--r--fs/cifs/link.c14
-rw-r--r--fs/cifs/smb1ops.c8
-rw-r--r--fs/cifs/smb2file.c8
-rw-r--r--fs/cifs/smb2inode.c17
-rw-r--r--fs/cifs/smb2misc.c17
-rw-r--r--fs/cifs/smb2ops.c179
-rw-r--r--fs/cifs/smb2pdu.c261
-rw-r--r--fs/cifs/smb2pdu.h23
-rw-r--r--fs/cifs/smb2transport.c25
-rw-r--r--fs/cifs/smbdirect.c6
-rw-r--r--fs/cifs/trace.c10
-rw-r--r--fs/cifs/trace.h99
-rw-r--r--fs/cifs/transport.c295
-rw-r--r--fs/crypto/Kconfig6
-rw-r--r--fs/crypto/bio.c3
-rw-r--r--fs/crypto/fscrypt_private.h1
-rw-r--r--fs/crypto/hooks.c6
-rw-r--r--fs/crypto/keyinfo.c4
-rw-r--r--fs/crypto/policy.c3
-rw-r--r--fs/dcache.c38
-rw-r--r--fs/debugfs/inode.c40
-rw-r--r--fs/direct-io.c9
-rw-r--r--fs/dlm/lowcomms.c4
-rw-r--r--fs/drop_caches.c8
-rw-r--r--fs/ecryptfs/crypto.c5
-rw-r--r--fs/eventpoll.c173
-rw-r--r--fs/exec.c15
-rw-r--r--fs/ext2/dir.c35
-rw-r--r--fs/ext2/ext2.h17
-rw-r--r--fs/ext2/file.c1
-rw-r--r--fs/ext2/ialloc.c2
-rw-r--r--fs/ext2/inode.c30
-rw-r--r--fs/ext2/namei.c2
-rw-r--r--fs/ext2/super.c44
-rw-r--r--fs/ext2/symlink.c2
-rw-r--r--fs/ext2/xattr.c1
-rw-r--r--fs/ext4/Kconfig15
-rw-r--r--fs/ext4/dir.c10
-rw-r--r--fs/ext4/ext4.h12
-rw-r--r--fs/ext4/ext4_jbd2.h2
-rw-r--r--fs/ext4/extents.c4
-rw-r--r--fs/ext4/fsync.c13
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/inode.c24
-rw-r--r--fs/ext4/ioctl.c4
-rw-r--r--fs/ext4/move_extent.c3
-rw-r--r--fs/ext4/namei.c18
-rw-r--r--fs/ext4/page-io.c12
-rw-r--r--fs/ext4/readpage.c8
-rw-r--r--fs/ext4/super.c6
-rw-r--r--fs/ext4/sysfs.c4
-rw-r--r--fs/f2fs/Kconfig12
-rw-r--r--fs/f2fs/data.c13
-rw-r--r--fs/f2fs/debug.c20
-rw-r--r--fs/f2fs/dir.c10
-rw-r--r--fs/f2fs/f2fs.h18
-rw-r--r--fs/f2fs/file.c10
-rw-r--r--fs/f2fs/inode.c4
-rw-r--r--fs/f2fs/namei.c6
-rw-r--r--fs/f2fs/super.c13
-rw-r--r--fs/f2fs/sysfs.c4
-rw-r--r--fs/fat/file.c1
-rw-r--r--fs/file.c16
-rw-r--r--fs/file_table.c9
-rw-r--r--fs/fs-writeback.c40
-rw-r--r--fs/fs_types.c105
-rw-r--r--fs/fuse/dev.c4
-rw-r--r--fs/fuse/file.c2
-rw-r--r--fs/fuse/inode.c2
-rw-r--r--fs/gfs2/file.c2
-rw-r--r--fs/gfs2/glock.c72
-rw-r--r--fs/gfs2/glock.h4
-rw-r--r--fs/gfs2/glops.c1
-rw-r--r--fs/gfs2/incore.h3
-rw-r--r--fs/gfs2/inode.h4
-rw-r--r--fs/gfs2/log.c4
-rw-r--r--fs/gfs2/lops.c196
-rw-r--r--fs/gfs2/lops.h4
-rw-r--r--fs/gfs2/main.c6
-rw-r--r--fs/gfs2/meta_io.c3
-rw-r--r--fs/gfs2/ops_fstype.c1
-rw-r--r--fs/gfs2/recovery.c123
-rw-r--r--fs/gfs2/recovery.h2
-rw-r--r--fs/gfs2/rgrp.c2
-rw-r--r--fs/gfs2/super.c1
-rw-r--r--fs/hugetlbfs/inode.c75
-rw-r--r--fs/inode.c15
-rw-r--r--fs/io_uring.c2971
-rw-r--r--fs/iomap.c90
-rw-r--r--fs/kernfs/dir.c2
-rw-r--r--fs/kernfs/file.c31
-rw-r--r--fs/kernfs/inode.c2
-rw-r--r--fs/kernfs/kernfs-internal.h2
-rw-r--r--fs/kernfs/mount.c15
-rw-r--r--fs/locks.c32
-rw-r--r--fs/mpage.c3
-rw-r--r--fs/namei.c4
-rw-r--r--fs/namespace.c6
-rw-r--r--fs/nfs/nfs4file.c8
-rw-r--r--fs/nfs/nfs4idmap.c31
-rw-r--r--fs/nfs/super.c5
-rw-r--r--fs/nfs/write.c20
-rw-r--r--fs/nfsd/nfsctl.c4
-rw-r--r--fs/nfsd/vfs.c6
-rw-r--r--fs/notify/fanotify/Kconfig1
-rw-r--r--fs/notify/fanotify/fanotify.c267
-rw-r--r--fs/notify/fanotify/fanotify.h116
-rw-r--r--fs/notify/fanotify/fanotify_user.c373
-rw-r--r--fs/notify/fsnotify.c15
-rw-r--r--fs/notify/inotify/inotify.h1
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c18
-rw-r--r--fs/notify/inotify/inotify_user.c11
-rw-r--r--fs/notify/mark.c42
-rw-r--r--fs/notify/notification.c42
-rw-r--r--fs/ocfs2/alloc.c159
-rw-r--r--fs/ocfs2/cluster/nodemanager.c14
-rw-r--r--fs/ocfs2/dlmglue.c5
-rw-r--r--fs/ocfs2/ocfs2.h1
-rw-r--r--fs/ocfs2/ocfs2_trace.h2
-rw-r--r--fs/ocfs2/slot_map.c8
-rw-r--r--fs/ocfs2/super.c2
-rw-r--r--fs/orangefs/file.c4
-rw-r--r--fs/pipe.c3
-rw-r--r--fs/proc/array.c16
-rw-r--r--fs/proc/base.c78
-rw-r--r--fs/proc/generic.c4
-rw-r--r--fs/proc/internal.h4
-rw-r--r--fs/proc/page.c4
-rw-r--r--fs/proc/proc_net.c20
-rw-r--r--fs/proc/root.c2
-rw-r--r--fs/proc/self.c16
-rw-r--r--fs/proc/stat.c89
-rw-r--r--fs/proc/task_mmu.c32
-rw-r--r--fs/proc/task_nommu.c4
-rw-r--r--fs/proc/thread_self.c16
-rw-r--r--fs/pstore/platform.c3
-rw-r--r--fs/pstore/ram.c72
-rw-r--r--fs/read_write.c6
-rw-r--r--fs/select.c4
-rw-r--r--fs/splice.c14
-rw-r--r--fs/statfs.c14
-rw-r--r--fs/sysfs/dir.c3
-rw-r--r--fs/sysfs/file.c8
-rw-r--r--fs/sysfs/group.c3
-rw-r--r--fs/sysfs/symlink.c3
-rw-r--r--fs/timerfd.c4
-rw-r--r--fs/ubifs/Kconfig12
-rw-r--r--fs/ubifs/Makefile2
-rw-r--r--fs/ubifs/ioctl.c4
-rw-r--r--fs/ubifs/sb.c2
-rw-r--r--fs/ubifs/super.c2
-rw-r--r--fs/ubifs/ubifs.h5
-rw-r--r--fs/udf/super.c51
-rw-r--r--fs/utimes.c10
-rw-r--r--fs/xfs/libxfs/xfs_ag.c6
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c2
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c12
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c74
-rw-r--r--fs/xfs/libxfs/xfs_attr.c17
-rw-r--r--fs/xfs/libxfs/xfs_attr.h2
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c21
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c8
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c302
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h16
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c13
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c49
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h3
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c17
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h1
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c10
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c12
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c100
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c10
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c4
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h4
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c3
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c29
-rw-r--r--fs/xfs/libxfs/xfs_iext_tree.c13
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c11
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h2
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c3
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c3
-rw-r--r--fs/xfs/libxfs/xfs_sb.c7
-rw-r--r--fs/xfs/libxfs/xfs_shared.h4
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c3
-rw-r--r--fs/xfs/libxfs/xfs_types.c24
-rw-r--r--fs/xfs/libxfs/xfs_types.h3
-rw-r--r--fs/xfs/scrub/agheader.c10
-rw-r--r--fs/xfs/scrub/agheader_repair.c12
-rw-r--r--fs/xfs/scrub/attr.c11
-rw-r--r--fs/xfs/scrub/bmap.c27
-rw-r--r--fs/xfs/scrub/dir.c6
-rw-r--r--fs/xfs/scrub/ialloc.c330
-rw-r--r--fs/xfs/scrub/repair.c14
-rw-r--r--fs/xfs/scrub/repair.h3
-rw-r--r--fs/xfs/scrub/rtbitmap.c5
-rw-r--r--fs/xfs/scrub/trace.h45
-rw-r--r--fs/xfs/xfs_aops.c273
-rw-r--r--fs/xfs/xfs_aops.h24
-rw-r--r--fs/xfs/xfs_attr_list.c1
-rw-r--r--fs/xfs/xfs_bmap_util.c9
-rw-r--r--fs/xfs/xfs_buf.c63
-rw-r--r--fs/xfs/xfs_buf.h8
-rw-r--r--fs/xfs/xfs_error.c6
-rw-r--r--fs/xfs/xfs_error.h1
-rw-r--r--fs/xfs/xfs_file.c32
-rw-r--r--fs/xfs/xfs_fsops.c1
-rw-r--r--fs/xfs/xfs_globals.c2
-rw-r--r--fs/xfs/xfs_inode.c769
-rw-r--r--fs/xfs/xfs_inode.h3
-rw-r--r--fs/xfs/xfs_iomap.c518
-rw-r--r--fs/xfs/xfs_iomap.h7
-rw-r--r--fs/xfs/xfs_iops.c21
-rw-r--r--fs/xfs/xfs_log_recover.c14
-rw-r--r--fs/xfs/xfs_mount.c5
-rw-r--r--fs/xfs/xfs_mount.h10
-rw-r--r--fs/xfs/xfs_ondisk.h21
-rw-r--r--fs/xfs/xfs_pnfs.c2
-rw-r--r--fs/xfs/xfs_reflink.c150
-rw-r--r--fs/xfs/xfs_reflink.h18
-rw-r--r--fs/xfs/xfs_super.c22
-rw-r--r--fs/xfs/xfs_sysctl.h1
-rw-r--r--fs/xfs/xfs_sysfs.c24
-rw-r--r--fs/xfs/xfs_trace.h115
-rw-r--r--fs/xfs/xfs_trans_bmap.c1
-rw-r--r--fs/xfs/xfs_trans_buf.c2
-rw-r--r--fs/xfs/xfs_trans_extfree.c1
-rw-r--r--fs/xfs/xfs_trans_refcount.c1
-rw-r--r--fs/xfs/xfs_trans_rmap.c1
-rw-r--r--fs/xfs/xfs_xattr.c3
302 files changed, 10853 insertions, 4151 deletions
diff --git a/fs/Makefile b/fs/Makefile
index 4a930ee78d68..7bff9abecfa4 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -12,7 +12,8 @@ obj-y := open.o read_write.o file_table.o super.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o splice.o sync.o utimes.o d_path.o \
- stack.o fs_struct.o statfs.o fs_pin.o nsfs.o
+ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
+ fs_types.o
ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o block_dev.o direct-io.o mpage.o
@@ -30,6 +31,7 @@ obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_AIO) += aio.o
+obj-$(CONFIG_IO_URING) += io_uring.o
obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FILE_LOCKING) += locks.o
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index cf445dbd5f2e..9de46116c749 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -173,6 +173,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
rcu_assign_pointer(cell->vl_servers, vllist);
cell->dns_expiry = TIME64_MAX;
+ __clear_bit(AFS_CELL_FL_NO_LOOKUP_YET, &cell->flags);
} else {
cell->dns_expiry = ktime_get_real_seconds();
}
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 0568fd986821..e432bd27a2e7 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -208,7 +208,7 @@ again:
/* The new front of the queue now owns the state variables. */
next = list_entry(vnode->pending_locks.next,
struct file_lock, fl_u.afs.link);
- vnode->lock_key = afs_file_key(next->fl_file);
+ vnode->lock_key = key_get(afs_file_key(next->fl_file));
vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB;
goto again;
@@ -413,7 +413,7 @@ static void afs_dequeue_lock(struct afs_vnode *vnode, struct file_lock *fl)
/* The new front of the queue now owns the state variables. */
next = list_entry(vnode->pending_locks.next,
struct file_lock, fl_u.afs.link);
- vnode->lock_key = afs_file_key(next->fl_file);
+ vnode->lock_key = key_get(afs_file_key(next->fl_file));
vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB;
afs_lock_may_be_available(vnode);
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 6b17d3620414..1a4ce07fb406 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -414,7 +414,6 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
} else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
valid = true;
} else {
- vnode->cb_s_break = vnode->cb_interest->server->cb_s_break;
vnode->cb_v_break = vnode->volume->cb_v_break;
valid = false;
}
@@ -546,6 +545,8 @@ void afs_evict_inode(struct inode *inode)
#endif
afs_put_permits(rcu_access_pointer(vnode->permit_cache));
+ key_put(vnode->lock_key);
+ vnode->lock_key = NULL;
_leave("");
}
diff --git a/fs/afs/protocol_yfs.h b/fs/afs/protocol_yfs.h
index 07bc10f076aa..d443e2bfa094 100644
--- a/fs/afs/protocol_yfs.h
+++ b/fs/afs/protocol_yfs.h
@@ -161,3 +161,14 @@ struct yfs_xdr_YFSStoreVolumeStatus {
struct yfs_xdr_u64 max_quota;
struct yfs_xdr_u64 file_quota;
} __packed;
+
+enum yfs_lock_type {
+ yfs_LockNone = -1,
+ yfs_LockRead = 0,
+ yfs_LockWrite = 1,
+ yfs_LockExtend = 2,
+ yfs_LockRelease = 3,
+ yfs_LockMandatoryRead = 0x100,
+ yfs_LockMandatoryWrite = 0x101,
+ yfs_LockMandatoryExtend = 0x102,
+};
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index a7b44863d502..2c588f9bbbda 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -23,6 +23,7 @@ struct workqueue_struct *afs_async_calls;
static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long);
static long afs_wait_for_call_to_complete(struct afs_call *, struct afs_addr_cursor *);
static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long);
+static void afs_delete_async_call(struct work_struct *);
static void afs_process_async_call(struct work_struct *);
static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long);
static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long);
@@ -203,20 +204,26 @@ void afs_put_call(struct afs_call *call)
}
}
+static struct afs_call *afs_get_call(struct afs_call *call,
+ enum afs_call_trace why)
+{
+ int u = atomic_inc_return(&call->usage);
+
+ trace_afs_call(call, why, u,
+ atomic_read(&call->net->nr_outstanding_calls),
+ __builtin_return_address(0));
+ return call;
+}
+
/*
* Queue the call for actual work.
*/
static void afs_queue_call_work(struct afs_call *call)
{
if (call->type->work) {
- int u = atomic_inc_return(&call->usage);
-
- trace_afs_call(call, afs_call_trace_work, u,
- atomic_read(&call->net->nr_outstanding_calls),
- __builtin_return_address(0));
-
INIT_WORK(&call->work, call->type->work);
+ afs_get_call(call, afs_call_trace_work);
if (!queue_work(afs_wq, &call->work))
afs_put_call(call);
}
@@ -398,6 +405,12 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call,
}
}
+ /* If the call is going to be asynchronous, we need an extra ref for
+ * the call to hold itself so the caller need not hang on to its ref.
+ */
+ if (call->async)
+ afs_get_call(call, afs_call_trace_get);
+
/* create a call */
rxcall = rxrpc_kernel_begin_call(call->net->socket, srx, call->key,
(unsigned long)call,
@@ -438,15 +451,17 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call,
goto error_do_abort;
}
- /* at this point, an async call may no longer exist as it may have
- * already completed */
- if (call->async)
+ /* Note that at this point, we may have received the reply or an abort
+ * - and an asynchronous call may already have completed.
+ */
+ if (call->async) {
+ afs_put_call(call);
return -EINPROGRESS;
+ }
return afs_wait_for_call_to_complete(call, ac);
error_do_abort:
- call->state = AFS_CALL_COMPLETE;
if (ret != -ECONNABORTED) {
rxrpc_kernel_abort_call(call->net->socket, rxcall,
RX_USER_ABORT, ret, "KSD");
@@ -463,8 +478,24 @@ error_do_abort:
error_kill_call:
if (call->type->done)
call->type->done(call);
- afs_put_call(call);
+
+ /* We need to dispose of the extra ref we grabbed for an async call.
+ * The call, however, might be queued on afs_async_calls and we need to
+ * make sure we don't get any more notifications that might requeue it.
+ */
+ if (call->rxcall) {
+ rxrpc_kernel_end_call(call->net->socket, call->rxcall);
+ call->rxcall = NULL;
+ }
+ if (call->async) {
+ if (cancel_work_sync(&call->async_work))
+ afs_put_call(call);
+ afs_put_call(call);
+ }
+
ac->error = ret;
+ call->state = AFS_CALL_COMPLETE;
+ afs_put_call(call);
_leave(" = %d", ret);
return ret;
}
diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c
index 95d0761cdb34..155dc14caef9 100644
--- a/fs/afs/server_list.c
+++ b/fs/afs/server_list.c
@@ -42,9 +42,7 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell,
if (vldb->fs_mask[i] & type_mask)
nr_servers++;
- slist = kzalloc(sizeof(struct afs_server_list) +
- sizeof(struct afs_server_entry) * nr_servers,
- GFP_KERNEL);
+ slist = kzalloc(struct_size(slist, servers, nr_servers), GFP_KERNEL);
if (!slist)
goto error;
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 12658c1363ae..5aa57929e8c2 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -803,7 +803,7 @@ int yfs_fs_create_file(struct afs_fs_cursor *fc,
bp = xdr_encode_YFSFid(bp, &vnode->fid);
bp = xdr_encode_string(bp, name, namesz);
bp = xdr_encode_YFSStoreStatus_mode(bp, mode);
- bp = xdr_encode_u32(bp, 0); /* ViceLockType */
+ bp = xdr_encode_u32(bp, yfs_LockNone); /* ViceLockType */
yfs_check_req(call, bp);
afs_use_fs_server(call, fc->cbi);
diff --git a/fs/aio.c b/fs/aio.c
index b906ff70c90f..38b741aef0bf 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -167,9 +167,13 @@ struct kioctx {
unsigned id;
};
+/*
+ * First field must be the file pointer in all the
+ * iocb unions! See also 'struct kiocb' in <linux/fs.h>
+ */
struct fsync_iocb {
- struct work_struct work;
struct file *file;
+ struct work_struct work;
bool datasync;
};
@@ -183,8 +187,15 @@ struct poll_iocb {
struct work_struct work;
};
+/*
+ * NOTE! Each of the iocb union members has the file pointer
+ * as the first entry in their struct definition. So you can
+ * access the file pointer through any of the sub-structs,
+ * or directly as just 'ki_filp' in this struct.
+ */
struct aio_kiocb {
union {
+ struct file *ki_filp;
struct kiocb rw;
struct fsync_iocb fsync;
struct poll_iocb poll;
@@ -1060,6 +1071,8 @@ static inline void iocb_put(struct aio_kiocb *iocb)
{
if (refcount_read(&iocb->ki_refcnt) == 0 ||
refcount_dec_and_test(&iocb->ki_refcnt)) {
+ if (iocb->ki_filp)
+ fput(iocb->ki_filp);
percpu_ref_put(&iocb->ki_ctx->reqs);
kmem_cache_free(kiocb_cachep, iocb);
}
@@ -1424,7 +1437,6 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
file_end_write(kiocb->ki_filp);
}
- fput(kiocb->ki_filp);
aio_complete(iocb, res, res2);
}
@@ -1432,10 +1444,8 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
{
int ret;
- req->ki_filp = fget(iocb->aio_fildes);
- if (unlikely(!req->ki_filp))
- return -EBADF;
req->ki_complete = aio_complete_rw;
+ req->private = NULL;
req->ki_pos = iocb->aio_offset;
req->ki_flags = iocb_flags(req->ki_filp);
if (iocb->aio_flags & IOCB_FLAG_RESFD)
@@ -1450,7 +1460,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
ret = ioprio_check_cap(iocb->aio_reqprio);
if (ret) {
pr_debug("aio ioprio check cap error: %d\n", ret);
- goto out_fput;
+ return ret;
}
req->ki_ioprio = iocb->aio_reqprio;
@@ -1459,14 +1469,10 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
if (unlikely(ret))
- goto out_fput;
+ return ret;
req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */
return 0;
-
-out_fput:
- fput(req->ki_filp);
- return ret;
}
static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec,
@@ -1520,24 +1526,19 @@ static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb,
if (ret)
return ret;
file = req->ki_filp;
-
- ret = -EBADF;
if (unlikely(!(file->f_mode & FMODE_READ)))
- goto out_fput;
+ return -EBADF;
ret = -EINVAL;
if (unlikely(!file->f_op->read_iter))
- goto out_fput;
+ return -EINVAL;
ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter);
if (ret)
- goto out_fput;
+ return ret;
ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
if (!ret)
aio_rw_done(req, call_read_iter(file, req, &iter));
kfree(iovec);
-out_fput:
- if (unlikely(ret))
- fput(file);
return ret;
}
@@ -1554,16 +1555,14 @@ static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb,
return ret;
file = req->ki_filp;
- ret = -EBADF;
if (unlikely(!(file->f_mode & FMODE_WRITE)))
- goto out_fput;
- ret = -EINVAL;
+ return -EBADF;
if (unlikely(!file->f_op->write_iter))
- goto out_fput;
+ return -EINVAL;
ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter);
if (ret)
- goto out_fput;
+ return ret;
ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
if (!ret) {
/*
@@ -1581,9 +1580,6 @@ static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb,
aio_rw_done(req, call_write_iter(file, req, &iter));
}
kfree(iovec);
-out_fput:
- if (unlikely(ret))
- fput(file);
return ret;
}
@@ -1593,7 +1589,6 @@ static void aio_fsync_work(struct work_struct *work)
int ret;
ret = vfs_fsync(req->file, req->datasync);
- fput(req->file);
aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0);
}
@@ -1604,13 +1599,8 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
iocb->aio_rw_flags))
return -EINVAL;
- req->file = fget(iocb->aio_fildes);
- if (unlikely(!req->file))
- return -EBADF;
- if (unlikely(!req->file->f_op->fsync)) {
- fput(req->file);
+ if (unlikely(!req->file->f_op->fsync))
return -EINVAL;
- }
req->datasync = datasync;
INIT_WORK(&req->work, aio_fsync_work);
@@ -1620,10 +1610,7 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask)
{
- struct file *file = iocb->poll.file;
-
aio_complete(iocb, mangle_poll(mask), 0);
- fput(file);
}
static void aio_poll_complete_work(struct work_struct *work)
@@ -1679,6 +1666,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
__poll_t mask = key_to_poll(key);
+ unsigned long flags;
req->woken = true;
@@ -1687,10 +1675,15 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (!(mask & req->events))
return 0;
- /* try to complete the iocb inline if we can: */
- if (spin_trylock(&iocb->ki_ctx->ctx_lock)) {
+ /*
+ * Try to complete the iocb inline if we can. Use
+ * irqsave/irqrestore because not all filesystems (e.g. fuse)
+ * call this function with IRQs disabled and because IRQs
+ * have to be disabled before ctx_lock is obtained.
+ */
+ if (spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
list_del(&iocb->ki_list);
- spin_unlock(&iocb->ki_ctx->ctx_lock);
+ spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags);
list_del_init(&req->wait.entry);
aio_poll_complete(iocb, mask);
@@ -1742,9 +1735,6 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
INIT_WORK(&req->work, aio_poll_complete_work);
req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
- req->file = fget(iocb->aio_fildes);
- if (unlikely(!req->file))
- return -EBADF;
req->head = NULL;
req->woken = false;
@@ -1787,10 +1777,8 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
spin_unlock_irq(&ctx->ctx_lock);
out:
- if (unlikely(apt.error)) {
- fput(req->file);
+ if (unlikely(apt.error))
return apt.error;
- }
if (mask)
aio_poll_complete(aiocb, mask);
@@ -1828,6 +1816,11 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
if (unlikely(!req))
goto out_put_reqs_available;
+ req->ki_filp = fget(iocb->aio_fildes);
+ ret = -EBADF;
+ if (unlikely(!req->ki_filp))
+ goto out_put_req;
+
if (iocb->aio_flags & IOCB_FLAG_RESFD) {
/*
* If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
@@ -2198,11 +2191,11 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
#if defined(CONFIG_COMPAT_32BIT_TIME)
-COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
- compat_long_t, min_nr,
- compat_long_t, nr,
- struct io_event __user *, events,
- struct old_timespec32 __user *, timeout)
+SYSCALL_DEFINE5(io_getevents_time32, __u32, ctx_id,
+ __s32, min_nr,
+ __s32, nr,
+ struct io_event __user *, events,
+ struct old_timespec32 __user *, timeout)
{
struct timespec64 t;
int ret;
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 3e59f0ed777b..70c132acdab1 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -105,6 +105,7 @@ struct autofs_wait_queue {
#define AUTOFS_SBI_CATATONIC 0x0001
#define AUTOFS_SBI_STRICTEXPIRE 0x0002
+#define AUTOFS_SBI_IGNORE 0x0004
struct autofs_sb_info {
u32 magic;
@@ -215,6 +216,8 @@ static inline int autofs_prepare_pipe(struct file *pipe)
return -EINVAL;
/* We want a packet pipe */
pipe->f_flags |= O_DIRECT;
+ /* We don't expect -EAGAIN */
+ pipe->f_flags &= ~O_NONBLOCK;
return 0;
}
diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index d441244b79df..28d9c2b1b3bb 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -596,7 +596,6 @@ int autofs_expire_run(struct super_block *sb,
pkt.len = dentry->d_name.len;
memcpy(pkt.name, dentry->d_name.name, pkt.len);
pkt.name[pkt.len] = '\0';
- dput(dentry);
if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)))
ret = -EFAULT;
@@ -609,6 +608,8 @@ int autofs_expire_run(struct super_block *sb,
complete_all(&ino->expire_complete);
spin_unlock(&sbi->fs_lock);
+ dput(dentry);
+
return ret;
}
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 0e8ea2d9a2bb..80597b88718b 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -82,18 +82,20 @@ static int autofs_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",maxproto=%d", sbi->max_proto);
if (autofs_type_offset(sbi->type))
- seq_printf(m, ",offset");
+ seq_puts(m, ",offset");
else if (autofs_type_direct(sbi->type))
- seq_printf(m, ",direct");
+ seq_puts(m, ",direct");
else
- seq_printf(m, ",indirect");
+ seq_puts(m, ",indirect");
if (sbi->flags & AUTOFS_SBI_STRICTEXPIRE)
- seq_printf(m, ",strictexpire");
+ seq_puts(m, ",strictexpire");
+ if (sbi->flags & AUTOFS_SBI_IGNORE)
+ seq_puts(m, ",ignore");
#ifdef CONFIG_CHECKPOINT_RESTORE
if (sbi->pipe)
seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino);
else
- seq_printf(m, ",pipe_ino=-1");
+ seq_puts(m, ",pipe_ino=-1");
#endif
return 0;
}
@@ -111,7 +113,8 @@ static const struct super_operations autofs_sops = {
};
enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
- Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire};
+ Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire,
+ Opt_ignore};
static const match_table_t tokens = {
{Opt_fd, "fd=%u"},
@@ -124,6 +127,7 @@ static const match_table_t tokens = {
{Opt_direct, "direct"},
{Opt_offset, "offset"},
{Opt_strictexpire, "strictexpire"},
+ {Opt_ignore, "ignore"},
{Opt_err, NULL}
};
@@ -206,6 +210,9 @@ static int parse_options(char *options,
case Opt_strictexpire:
sbi->flags |= AUTOFS_SBI_STRICTEXPIRE;
break;
+ case Opt_ignore:
+ sbi->flags |= AUTOFS_SBI_IGNORE;
+ break;
default:
return 1;
}
@@ -266,8 +273,10 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
}
root_inode = autofs_get_inode(s, S_IFDIR | 0755);
root = d_make_root(root_inode);
- if (!root)
+ if (!root) {
+ ret = -ENOMEM;
goto fail_ino;
+ }
pipe = NULL;
root->d_fsdata = ino;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index ca9725f18e00..1fefd87eb4b4 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -29,97 +29,14 @@
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
-#include <asm/a.out-core.h>
static int load_aout_binary(struct linux_binprm *);
static int load_aout_library(struct file*);
-#ifdef CONFIG_COREDUMP
-/*
- * Routine writes a core dump image in the current directory.
- * Currently only a stub-function.
- *
- * Note that setuid/setgid files won't make a core-dump if the uid/gid
- * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable"
- * field, which also makes sure the core-dumps won't be recursive if the
- * dumping of the process results in another error..
- */
-static int aout_core_dump(struct coredump_params *cprm)
-{
- mm_segment_t fs;
- int has_dumped = 0;
- void __user *dump_start;
- int dump_size;
- struct user dump;
-#ifdef __alpha__
-# define START_DATA(u) ((void __user *)u.start_data)
-#else
-# define START_DATA(u) ((void __user *)((u.u_tsize << PAGE_SHIFT) + \
- u.start_code))
-#endif
-# define START_STACK(u) ((void __user *)u.start_stack)
-
- fs = get_fs();
- set_fs(KERNEL_DS);
- has_dumped = 1;
- strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm));
- dump.u_ar0 = offsetof(struct user, regs);
- dump.signal = cprm->siginfo->si_signo;
- aout_dump_thread(cprm->regs, &dump);
-
-/* If the size of the dump file exceeds the rlimit, then see what would happen
- if we wrote the stack, but not the data area. */
- if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > cprm->limit)
- dump.u_dsize = 0;
-
-/* Make sure we have enough room to write the stack and data areas. */
- if ((dump.u_ssize + 1) * PAGE_SIZE > cprm->limit)
- dump.u_ssize = 0;
-
-/* make sure we actually have a data and stack area to dump */
- set_fs(USER_DS);
- if (!access_ok(START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
- dump.u_dsize = 0;
- if (!access_ok(START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
- dump.u_ssize = 0;
-
- set_fs(KERNEL_DS);
-/* struct user */
- if (!dump_emit(cprm, &dump, sizeof(dump)))
- goto end_coredump;
-/* Now dump all of the user data. Include malloced stuff as well */
- if (!dump_skip(cprm, PAGE_SIZE - sizeof(dump)))
- goto end_coredump;
-/* now we start writing out the user space info */
- set_fs(USER_DS);
-/* Dump the data area */
- if (dump.u_dsize != 0) {
- dump_start = START_DATA(dump);
- dump_size = dump.u_dsize << PAGE_SHIFT;
- if (!dump_emit(cprm, dump_start, dump_size))
- goto end_coredump;
- }
-/* Now prepare to dump the stack area */
- if (dump.u_ssize != 0) {
- dump_start = START_STACK(dump);
- dump_size = dump.u_ssize << PAGE_SHIFT;
- if (!dump_emit(cprm, dump_start, dump_size))
- goto end_coredump;
- }
-end_coredump:
- set_fs(fs);
- return has_dumped;
-}
-#else
-#define aout_core_dump NULL
-#endif
-
static struct linux_binfmt aout_format = {
.module = THIS_MODULE,
.load_binary = load_aout_binary,
.load_shlib = load_aout_library,
- .core_dump = aout_core_dump,
- .min_coredump = PAGE_SIZE
};
#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 54207327f98f..7d09d125f148 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -57,8 +57,6 @@
#endif
static int load_elf_binary(struct linux_binprm *bprm);
-static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
- int, int, unsigned long);
#ifdef CONFIG_USELIB
static int load_elf_library(struct file *);
@@ -347,7 +345,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
#ifndef elf_map
static unsigned long elf_map(struct file *filep, unsigned long addr,
- struct elf_phdr *eppnt, int prot, int type,
+ const struct elf_phdr *eppnt, int prot, int type,
unsigned long total_size)
{
unsigned long map_addr;
@@ -387,7 +385,7 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
#endif /* !elf_map */
-static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
+static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr)
{
int i, first_idx = -1, last_idx = -1;
@@ -414,12 +412,13 @@ static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
* header pointed to by elf_ex, into a newly allocated array. The caller is
* responsible for freeing the allocated data. Returns an ERR_PTR upon failure.
*/
-static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex,
+static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex,
struct file *elf_file)
{
struct elf_phdr *elf_phdata = NULL;
- int retval, size, err = -1;
+ int retval, err = -1;
loff_t pos = elf_ex->e_phoff;
+ unsigned int size;
/*
* If the size of this structure has changed, then punt, since
@@ -429,13 +428,9 @@ static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex,
goto out;
/* Sanity check the number of program headers... */
- if (elf_ex->e_phnum < 1 ||
- elf_ex->e_phnum > 65536U / sizeof(struct elf_phdr))
- goto out;
-
/* ...and their total size. */
size = sizeof(struct elf_phdr) * elf_ex->e_phnum;
- if (size > ELF_MIN_ALIGN)
+ if (size == 0 || size > 65536 || size > ELF_MIN_ALIGN)
goto out;
elf_phdata = kmalloc(size, GFP_KERNEL);
@@ -2033,7 +2028,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
const kernel_siginfo_t *siginfo, struct pt_regs *regs)
{
- struct list_head *t;
struct core_thread *ct;
struct elf_thread_status *ets;
@@ -2050,10 +2044,9 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
list_add(&ets->list, &info->thread_list);
}
- list_for_each(t, &info->thread_list) {
+ list_for_each_entry(ets, &info->thread_list, list) {
int sz;
- ets = list_entry(t, struct elf_thread_status, list);
sz = elf_dump_thread_status(siginfo->si_signo, ets);
info->thread_status_size += sz;
}
@@ -2117,20 +2110,17 @@ static size_t get_note_info_size(struct elf_note_info *info)
static int write_note_info(struct elf_note_info *info,
struct coredump_params *cprm)
{
+ struct elf_thread_status *ets;
int i;
- struct list_head *t;
for (i = 0; i < info->numnote; i++)
if (!writenote(info->notes + i, cprm))
return 0;
/* write out the thread status notes section */
- list_for_each(t, &info->thread_list) {
- struct elf_thread_status *tmp =
- list_entry(t, struct elf_thread_status, list);
-
- for (i = 0; i < tmp->num_notes; i++)
- if (!writenote(&tmp->notes[i], cprm))
+ list_for_each_entry(ets, &info->thread_list, list) {
+ for (i = 0; i < ets->num_notes; i++)
+ if (!writenote(&ets->notes[i], cprm))
return 0;
}
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index d0078cbb718b..e996174cbfc0 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -14,13 +14,30 @@
#include <linux/err.h>
#include <linux/fs.h>
+static inline bool spacetab(char c) { return c == ' ' || c == '\t'; }
+static inline char *next_non_spacetab(char *first, const char *last)
+{
+ for (; first <= last; first++)
+ if (!spacetab(*first))
+ return first;
+ return NULL;
+}
+static inline char *next_terminator(char *first, const char *last)
+{
+ for (; first <= last; first++)
+ if (spacetab(*first) || !*first)
+ return first;
+ return NULL;
+}
+
static int load_script(struct linux_binprm *bprm)
{
const char *i_arg, *i_name;
- char *cp;
+ char *cp, *buf_end;
struct file *file;
int retval;
+ /* Not ours to exec if we don't start with "#!". */
if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!'))
return -ENOEXEC;
@@ -33,23 +50,41 @@ static int load_script(struct linux_binprm *bprm)
if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
return -ENOENT;
- /*
- * This section does the #! interpretation.
- * Sorta complicated, but hopefully it will work. -TYT
- */
-
+ /* Release since we are not mapping a binary into memory. */
allow_write_access(bprm->file);
fput(bprm->file);
bprm->file = NULL;
- for (cp = bprm->buf+2;; cp++) {
- if (cp >= bprm->buf + BINPRM_BUF_SIZE)
+ /*
+ * This section handles parsing the #! line into separate
+ * interpreter path and argument strings. We must be careful
+ * because bprm->buf is not yet guaranteed to be NUL-terminated
+ * (though the buffer will have trailing NUL padding when the
+ * file size was smaller than the buffer size).
+ *
+ * We do not want to exec a truncated interpreter path, so either
+ * we find a newline (which indicates nothing is truncated), or
+ * we find a space/tab/NUL after the interpreter path (which
+ * itself may be preceded by spaces/tabs). Truncating the
+ * arguments is fine: the interpreter can re-read the script to
+ * parse them on its own.
+ */
+ buf_end = bprm->buf + sizeof(bprm->buf) - 1;
+ cp = strnchr(bprm->buf, sizeof(bprm->buf), '\n');
+ if (!cp) {
+ cp = next_non_spacetab(bprm->buf + 2, buf_end);
+ if (!cp)
+ return -ENOEXEC; /* Entire buf is spaces/tabs */
+ /*
+ * If there is no later space/tab/NUL we must assume the
+ * interpreter path is truncated.
+ */
+ if (!next_terminator(cp, buf_end))
return -ENOEXEC;
- if (!*cp || (*cp == '\n'))
- break;
+ cp = buf_end;
}
+ /* NUL-terminate the buffer and any trailing spaces/tabs. */
*cp = '\0';
-
while (cp > bprm->buf) {
cp--;
if ((*cp == ' ') || (*cp == '\t'))
diff --git a/fs/block_dev.c b/fs/block_dev.c
index c546cdce77e6..e9faa52bb489 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -104,6 +104,20 @@ void invalidate_bdev(struct block_device *bdev)
}
EXPORT_SYMBOL(invalidate_bdev);
+static void set_init_blocksize(struct block_device *bdev)
+{
+ unsigned bsize = bdev_logical_block_size(bdev);
+ loff_t size = i_size_read(bdev->bd_inode);
+
+ while (bsize < PAGE_SIZE) {
+ if (size & bsize)
+ break;
+ bsize <<= 1;
+ }
+ bdev->bd_block_size = bsize;
+ bdev->bd_inode->i_blkbits = blksize_bits(bsize);
+}
+
int set_blocksize(struct block_device *bdev, int size)
{
/* Size must be a power of two, and between 512 and PAGE_SIZE */
@@ -197,6 +211,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
ssize_t ret;
blk_qc_t qc;
int i;
+ struct bvec_iter_all iter_all;
if ((pos | iov_iter_alignment(iter)) &
(bdev_logical_block_size(bdev) - 1))
@@ -233,7 +248,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
task_io_account_write(ret);
}
if (iocb->ki_flags & IOCB_HIPRI)
- bio.bi_opf |= REQ_HIPRI;
+ bio_set_polled(&bio, iocb);
qc = submit_bio(&bio);
for (;;) {
@@ -246,7 +261,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
}
__set_current_state(TASK_RUNNING);
- bio_for_each_segment_all(bvec, &bio, i) {
+ bio_for_each_segment_all(bvec, &bio, i, iter_all) {
if (should_dirty && !PageCompound(bvec->bv_page))
set_page_dirty_lock(bvec->bv_page);
put_page(bvec->bv_page);
@@ -279,6 +294,14 @@ struct blkdev_dio {
static struct bio_set blkdev_dio_pool;
+static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
+{
+ struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
+}
+
static void blkdev_bio_end_io(struct bio *bio)
{
struct blkdev_dio *dio = bio->bi_private;
@@ -315,8 +338,9 @@ static void blkdev_bio_end_io(struct bio *bio)
} else {
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
put_page(bvec->bv_page);
bio_put(bio);
}
@@ -392,10 +416,17 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
if (!nr_pages) {
- if (iocb->ki_flags & IOCB_HIPRI)
- bio->bi_opf |= REQ_HIPRI;
+ bool polled = false;
+
+ if (iocb->ki_flags & IOCB_HIPRI) {
+ bio_set_polled(bio, iocb);
+ polled = true;
+ }
qc = submit_bio(bio);
+
+ if (polled)
+ WRITE_ONCE(iocb->ki_cookie, qc);
break;
}
@@ -1431,18 +1462,9 @@ EXPORT_SYMBOL(check_disk_change);
void bd_set_size(struct block_device *bdev, loff_t size)
{
- unsigned bsize = bdev_logical_block_size(bdev);
-
inode_lock(bdev->bd_inode);
i_size_write(bdev->bd_inode, size);
inode_unlock(bdev->bd_inode);
- while (bsize < PAGE_SIZE) {
- if (size & bsize)
- break;
- bsize <<= 1;
- }
- bdev->bd_block_size = bsize;
- bdev->bd_inode->i_blkbits = blksize_bits(bsize);
}
EXPORT_SYMBOL(bd_set_size);
@@ -1519,8 +1541,10 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
}
}
- if (!ret)
+ if (!ret) {
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
+ set_init_blocksize(bdev);
+ }
/*
* If the device is invalidated, rescan partition
@@ -1555,6 +1579,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
goto out_clear;
}
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
+ set_init_blocksize(bdev);
}
if (bdev->bd_bdi == &noop_backing_dev_info)
@@ -2068,6 +2093,7 @@ const struct file_operations def_blk_fops = {
.llseek = block_llseek,
.read_iter = blkdev_read_iter,
.write_iter = blkdev_write_iter,
+ .iopoll = blkdev_iopoll,
.mmap = generic_file_mmap,
.fsync = blkdev_fsync,
.unlocked_ioctl = block_ioctl,
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 3b66c957ea6f..5810463dc6d2 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -9,6 +9,7 @@
#include <linux/posix_acl_xattr.h>
#include <linux/posix_acl.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/slab.h>
#include "ctree.h"
@@ -72,8 +73,16 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
}
if (acl) {
+ unsigned int nofs_flag;
+
size = posix_acl_xattr_size(acl->a_count);
+ /*
+ * We're holding a transaction handle, so use a NOFS memory
+ * allocation context to avoid deadlock if reclaim happens.
+ */
+ nofs_flag = memalloc_nofs_save();
value = kmalloc(size, GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
if (!value) {
ret = -ENOMEM;
goto out;
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index d522494698fa..122cb97c7909 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -139,13 +139,11 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
}
if (flags & WQ_HIGHPRI)
- ret->normal_wq = alloc_workqueue("%s-%s-high", flags,
- ret->current_active, "btrfs",
- name);
+ ret->normal_wq = alloc_workqueue("btrfs-%s-high", flags,
+ ret->current_active, name);
else
- ret->normal_wq = alloc_workqueue("%s-%s", flags,
- ret->current_active, "btrfs",
- name);
+ ret->normal_wq = alloc_workqueue("btrfs-%s", flags,
+ ret->current_active, name);
if (!ret->normal_wq) {
kfree(ret);
return NULL;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 78556447e1d5..11459fe84a29 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -712,7 +712,7 @@ out:
* read tree blocks and add keys where required.
*/
static int add_missing_keys(struct btrfs_fs_info *fs_info,
- struct preftrees *preftrees)
+ struct preftrees *preftrees, bool lock)
{
struct prelim_ref *ref;
struct extent_buffer *eb;
@@ -737,12 +737,14 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
free_extent_buffer(eb);
return -EIO;
}
- btrfs_tree_read_lock(eb);
+ if (lock)
+ btrfs_tree_read_lock(eb);
if (btrfs_header_level(eb) == 0)
btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
else
btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
- btrfs_tree_read_unlock(eb);
+ if (lock)
+ btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
prelim_ref_insert(fs_info, &preftrees->indirect, ref, NULL);
cond_resched();
@@ -1227,7 +1229,7 @@ again:
btrfs_release_path(path);
- ret = add_missing_keys(fs_info, &preftrees);
+ ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0);
if (ret)
goto out;
@@ -1288,11 +1290,15 @@ again:
ret = -EIO;
goto out;
}
- btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+
+ if (!path->skip_locking) {
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_read(eb);
+ }
ret = find_extent_in_eb(eb, bytenr,
*extent_item_pos, &eie, ignore_offset);
- btrfs_tree_read_unlock_blocking(eb);
+ if (!path->skip_locking)
+ btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
if (ret < 0)
goto out;
@@ -1650,7 +1656,7 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
/* make sure we can use eb after releasing the path */
if (eb != eb_in) {
if (!path->skip_locking)
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
path->nodes[0] = NULL;
path->locks[0] = 0;
}
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 548057630b69..4f2a8ae0aa42 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -162,13 +162,14 @@ csum_failed:
} else {
int i;
struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
/*
* we have verified the checksum already, set page
* checked so the end_io handlers know about it
*/
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, cb->orig_bio, i)
+ bio_for_each_segment_all(bvec, cb->orig_bio, i, iter_all)
SetPageChecked(bvec->bv_page);
bio_endio(cb->orig_bio);
@@ -730,6 +731,28 @@ struct heuristic_ws {
struct list_head list;
};
+static struct workspace_manager heuristic_wsm;
+
+static void heuristic_init_workspace_manager(void)
+{
+ btrfs_init_workspace_manager(&heuristic_wsm, &btrfs_heuristic_compress);
+}
+
+static void heuristic_cleanup_workspace_manager(void)
+{
+ btrfs_cleanup_workspace_manager(&heuristic_wsm);
+}
+
+static struct list_head *heuristic_get_workspace(unsigned int level)
+{
+ return btrfs_get_workspace(&heuristic_wsm, level);
+}
+
+static void heuristic_put_workspace(struct list_head *ws)
+{
+ btrfs_put_workspace(&heuristic_wsm, ws);
+}
+
static void free_heuristic_ws(struct list_head *ws)
{
struct heuristic_ws *workspace;
@@ -742,7 +765,7 @@ static void free_heuristic_ws(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *alloc_heuristic_ws(void)
+static struct list_head *alloc_heuristic_ws(unsigned int level)
{
struct heuristic_ws *ws;
@@ -769,65 +792,59 @@ fail:
return ERR_PTR(-ENOMEM);
}
-struct workspaces_list {
- struct list_head idle_ws;
- spinlock_t ws_lock;
- /* Number of free workspaces */
- int free_ws;
- /* Total number of allocated workspaces */
- atomic_t total_ws;
- /* Waiters for a free workspace */
- wait_queue_head_t ws_wait;
+const struct btrfs_compress_op btrfs_heuristic_compress = {
+ .init_workspace_manager = heuristic_init_workspace_manager,
+ .cleanup_workspace_manager = heuristic_cleanup_workspace_manager,
+ .get_workspace = heuristic_get_workspace,
+ .put_workspace = heuristic_put_workspace,
+ .alloc_workspace = alloc_heuristic_ws,
+ .free_workspace = free_heuristic_ws,
};
-static struct workspaces_list btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
-
-static struct workspaces_list btrfs_heuristic_ws;
-
static const struct btrfs_compress_op * const btrfs_compress_op[] = {
+ /* The heuristic is represented as compression type 0 */
+ &btrfs_heuristic_compress,
&btrfs_zlib_compress,
&btrfs_lzo_compress,
&btrfs_zstd_compress,
};
-void __init btrfs_init_compress(void)
+void btrfs_init_workspace_manager(struct workspace_manager *wsm,
+ const struct btrfs_compress_op *ops)
{
struct list_head *workspace;
- int i;
- INIT_LIST_HEAD(&btrfs_heuristic_ws.idle_ws);
- spin_lock_init(&btrfs_heuristic_ws.ws_lock);
- atomic_set(&btrfs_heuristic_ws.total_ws, 0);
- init_waitqueue_head(&btrfs_heuristic_ws.ws_wait);
+ wsm->ops = ops;
- workspace = alloc_heuristic_ws();
+ INIT_LIST_HEAD(&wsm->idle_ws);
+ spin_lock_init(&wsm->ws_lock);
+ atomic_set(&wsm->total_ws, 0);
+ init_waitqueue_head(&wsm->ws_wait);
+
+ /*
+ * Preallocate one workspace for each compression type so we can
+ * guarantee forward progress in the worst case
+ */
+ workspace = wsm->ops->alloc_workspace(0);
if (IS_ERR(workspace)) {
pr_warn(
- "BTRFS: cannot preallocate heuristic workspace, will try later\n");
+ "BTRFS: cannot preallocate compression workspace, will try later\n");
} else {
- atomic_set(&btrfs_heuristic_ws.total_ws, 1);
- btrfs_heuristic_ws.free_ws = 1;
- list_add(workspace, &btrfs_heuristic_ws.idle_ws);
+ atomic_set(&wsm->total_ws, 1);
+ wsm->free_ws = 1;
+ list_add(workspace, &wsm->idle_ws);
}
+}
- for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
- INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws);
- spin_lock_init(&btrfs_comp_ws[i].ws_lock);
- atomic_set(&btrfs_comp_ws[i].total_ws, 0);
- init_waitqueue_head(&btrfs_comp_ws[i].ws_wait);
+void btrfs_cleanup_workspace_manager(struct workspace_manager *wsman)
+{
+ struct list_head *ws;
- /*
- * Preallocate one workspace for each compression type so
- * we can guarantee forward progress in the worst case
- */
- workspace = btrfs_compress_op[i]->alloc_workspace();
- if (IS_ERR(workspace)) {
- pr_warn("BTRFS: cannot preallocate compression workspace, will try later\n");
- } else {
- atomic_set(&btrfs_comp_ws[i].total_ws, 1);
- btrfs_comp_ws[i].free_ws = 1;
- list_add(workspace, &btrfs_comp_ws[i].idle_ws);
- }
+ while (!list_empty(&wsman->idle_ws)) {
+ ws = wsman->idle_ws.next;
+ list_del(ws);
+ wsman->ops->free_workspace(ws);
+ atomic_dec(&wsman->total_ws);
}
}
@@ -837,11 +854,11 @@ void __init btrfs_init_compress(void)
* Preallocation makes a forward progress guarantees and we do not return
* errors.
*/
-static struct list_head *__find_workspace(int type, bool heuristic)
+struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
+ unsigned int level)
{
struct list_head *workspace;
int cpus = num_online_cpus();
- int idx = type - 1;
unsigned nofs_flag;
struct list_head *idle_ws;
spinlock_t *ws_lock;
@@ -849,19 +866,11 @@ static struct list_head *__find_workspace(int type, bool heuristic)
wait_queue_head_t *ws_wait;
int *free_ws;
- if (heuristic) {
- idle_ws = &btrfs_heuristic_ws.idle_ws;
- ws_lock = &btrfs_heuristic_ws.ws_lock;
- total_ws = &btrfs_heuristic_ws.total_ws;
- ws_wait = &btrfs_heuristic_ws.ws_wait;
- free_ws = &btrfs_heuristic_ws.free_ws;
- } else {
- idle_ws = &btrfs_comp_ws[idx].idle_ws;
- ws_lock = &btrfs_comp_ws[idx].ws_lock;
- total_ws = &btrfs_comp_ws[idx].total_ws;
- ws_wait = &btrfs_comp_ws[idx].ws_wait;
- free_ws = &btrfs_comp_ws[idx].free_ws;
- }
+ idle_ws = &wsm->idle_ws;
+ ws_lock = &wsm->ws_lock;
+ total_ws = &wsm->total_ws;
+ ws_wait = &wsm->ws_wait;
+ free_ws = &wsm->free_ws;
again:
spin_lock(ws_lock);
@@ -892,10 +901,7 @@ again:
* context of btrfs_compress_bio/btrfs_compress_pages
*/
nofs_flag = memalloc_nofs_save();
- if (heuristic)
- workspace = alloc_heuristic_ws();
- else
- workspace = btrfs_compress_op[idx]->alloc_workspace();
+ workspace = wsm->ops->alloc_workspace(level);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(workspace)) {
@@ -926,85 +932,47 @@ again:
return workspace;
}
-static struct list_head *find_workspace(int type)
+static struct list_head *get_workspace(int type, int level)
{
- return __find_workspace(type, false);
+ return btrfs_compress_op[type]->get_workspace(level);
}
/*
* put a workspace struct back on the list or free it if we have enough
* idle ones sitting around
*/
-static void __free_workspace(int type, struct list_head *workspace,
- bool heuristic)
+void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws)
{
- int idx = type - 1;
struct list_head *idle_ws;
spinlock_t *ws_lock;
atomic_t *total_ws;
wait_queue_head_t *ws_wait;
int *free_ws;
- if (heuristic) {
- idle_ws = &btrfs_heuristic_ws.idle_ws;
- ws_lock = &btrfs_heuristic_ws.ws_lock;
- total_ws = &btrfs_heuristic_ws.total_ws;
- ws_wait = &btrfs_heuristic_ws.ws_wait;
- free_ws = &btrfs_heuristic_ws.free_ws;
- } else {
- idle_ws = &btrfs_comp_ws[idx].idle_ws;
- ws_lock = &btrfs_comp_ws[idx].ws_lock;
- total_ws = &btrfs_comp_ws[idx].total_ws;
- ws_wait = &btrfs_comp_ws[idx].ws_wait;
- free_ws = &btrfs_comp_ws[idx].free_ws;
- }
+ idle_ws = &wsm->idle_ws;
+ ws_lock = &wsm->ws_lock;
+ total_ws = &wsm->total_ws;
+ ws_wait = &wsm->ws_wait;
+ free_ws = &wsm->free_ws;
spin_lock(ws_lock);
if (*free_ws <= num_online_cpus()) {
- list_add(workspace, idle_ws);
+ list_add(ws, idle_ws);
(*free_ws)++;
spin_unlock(ws_lock);
goto wake;
}
spin_unlock(ws_lock);
- if (heuristic)
- free_heuristic_ws(workspace);
- else
- btrfs_compress_op[idx]->free_workspace(workspace);
+ wsm->ops->free_workspace(ws);
atomic_dec(total_ws);
wake:
cond_wake_up(ws_wait);
}
-static void free_workspace(int type, struct list_head *ws)
+static void put_workspace(int type, struct list_head *ws)
{
- return __free_workspace(type, ws, false);
-}
-
-/*
- * cleanup function for module exit
- */
-static void free_workspaces(void)
-{
- struct list_head *workspace;
- int i;
-
- while (!list_empty(&btrfs_heuristic_ws.idle_ws)) {
- workspace = btrfs_heuristic_ws.idle_ws.next;
- list_del(workspace);
- free_heuristic_ws(workspace);
- atomic_dec(&btrfs_heuristic_ws.total_ws);
- }
-
- for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
- while (!list_empty(&btrfs_comp_ws[i].idle_ws)) {
- workspace = btrfs_comp_ws[i].idle_ws.next;
- list_del(workspace);
- btrfs_compress_op[i]->free_workspace(workspace);
- atomic_dec(&btrfs_comp_ws[i].total_ws);
- }
- }
+ return btrfs_compress_op[type]->put_workspace(ws);
}
/*
@@ -1036,18 +1004,17 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
unsigned long *total_in,
unsigned long *total_out)
{
+ int type = btrfs_compress_type(type_level);
+ int level = btrfs_compress_level(type_level);
struct list_head *workspace;
int ret;
- int type = type_level & 0xF;
-
- workspace = find_workspace(type);
- btrfs_compress_op[type - 1]->set_level(workspace, type_level);
- ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
+ workspace = get_workspace(type, level);
+ ret = btrfs_compress_op[type]->compress_pages(workspace, mapping,
start, pages,
out_pages,
total_in, total_out);
- free_workspace(type, workspace);
+ put_workspace(type, workspace);
return ret;
}
@@ -1071,9 +1038,9 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
int ret;
int type = cb->compress_type;
- workspace = find_workspace(type);
- ret = btrfs_compress_op[type - 1]->decompress_bio(workspace, cb);
- free_workspace(type, workspace);
+ workspace = get_workspace(type, 0);
+ ret = btrfs_compress_op[type]->decompress_bio(workspace, cb);
+ put_workspace(type, workspace);
return ret;
}
@@ -1089,19 +1056,29 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
struct list_head *workspace;
int ret;
- workspace = find_workspace(type);
-
- ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
+ workspace = get_workspace(type, 0);
+ ret = btrfs_compress_op[type]->decompress(workspace, data_in,
dest_page, start_byte,
srclen, destlen);
+ put_workspace(type, workspace);
- free_workspace(type, workspace);
return ret;
}
+void __init btrfs_init_compress(void)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
+ btrfs_compress_op[i]->init_workspace_manager();
+}
+
void __cold btrfs_exit_compress(void)
{
- free_workspaces();
+ int i;
+
+ for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
+ btrfs_compress_op[i]->cleanup_workspace_manager();
}
/*
@@ -1512,7 +1489,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
*/
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
{
- struct list_head *ws_list = __find_workspace(0, true);
+ struct list_head *ws_list = get_workspace(0, 0);
struct heuristic_ws *ws;
u32 i;
u8 byte;
@@ -1581,18 +1558,29 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
}
out:
- __free_workspace(0, ws_list, true);
+ put_workspace(0, ws_list);
return ret;
}
-unsigned int btrfs_compress_str2level(const char *str)
+/*
+ * Convert the compression suffix (eg. after "zlib" starting with ":") to
+ * level, unrecognized string will set the default level
+ */
+unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
{
- if (strncmp(str, "zlib", 4) != 0)
+ unsigned int level = 0;
+ int ret;
+
+ if (!type)
return 0;
- /* Accepted form: zlib:1 up to zlib:9 and nothing left after the number */
- if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0)
- return str[5] - '0';
+ if (str[0] == ':') {
+ ret = kstrtouint(str + 1, 10, &level);
+ if (ret)
+ level = 0;
+ }
+
+ level = btrfs_compress_op[type]->set_level(level);
- return BTRFS_ZLIB_DEFAULT_LEVEL;
+ return level;
}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index ddda9b80bf20..9976fe0f7526 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -64,6 +64,16 @@ struct compressed_bio {
u32 sums;
};
+static inline unsigned int btrfs_compress_type(unsigned int type_level)
+{
+ return (type_level & 0xF);
+}
+
+static inline unsigned int btrfs_compress_level(unsigned int type_level)
+{
+ return ((type_level & 0xF0) >> 4);
+}
+
void __init btrfs_init_compress(void);
void __cold btrfs_exit_compress(void);
@@ -87,7 +97,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
-unsigned btrfs_compress_str2level(const char *str);
+unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
enum btrfs_compression_type {
BTRFS_COMPRESS_NONE = 0,
@@ -97,8 +107,35 @@ enum btrfs_compression_type {
BTRFS_COMPRESS_TYPES = 3,
};
+struct workspace_manager {
+ const struct btrfs_compress_op *ops;
+ struct list_head idle_ws;
+ spinlock_t ws_lock;
+ /* Number of free workspaces */
+ int free_ws;
+ /* Total number of allocated workspaces */
+ atomic_t total_ws;
+ /* Waiters for a free workspace */
+ wait_queue_head_t ws_wait;
+};
+
+void btrfs_init_workspace_manager(struct workspace_manager *wsm,
+ const struct btrfs_compress_op *ops);
+struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
+ unsigned int level);
+void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws);
+void btrfs_cleanup_workspace_manager(struct workspace_manager *wsm);
+
struct btrfs_compress_op {
- struct list_head *(*alloc_workspace)(void);
+ void (*init_workspace_manager)(void);
+
+ void (*cleanup_workspace_manager)(void);
+
+ struct list_head *(*get_workspace)(unsigned int level);
+
+ void (*put_workspace)(struct list_head *ws);
+
+ struct list_head *(*alloc_workspace)(unsigned int level);
void (*free_workspace)(struct list_head *workspace);
@@ -119,9 +156,18 @@ struct btrfs_compress_op {
unsigned long start_byte,
size_t srclen, size_t destlen);
- void (*set_level)(struct list_head *ws, unsigned int type);
+ /*
+ * This bounds the level set by the user to be within range of a
+ * particular compression type. It returns the level that will be used
+ * if the level is out of bounds or the default if 0 is passed in.
+ */
+ unsigned int (*set_level)(unsigned int level);
};
+/* The heuristic workspaces are managed via the 0th workspace manager */
+#define BTRFS_NR_WORKSPACE_MANAGERS (BTRFS_COMPRESS_TYPES + 1)
+
+extern const struct btrfs_compress_op btrfs_heuristic_compress;
extern const struct btrfs_compress_op btrfs_zlib_compress;
extern const struct btrfs_compress_op btrfs_lzo_compress;
extern const struct btrfs_compress_op btrfs_zstd_compress;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index d92462fe66c8..324df36d28bf 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -13,6 +13,7 @@
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
+#include "qgroup.h"
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int level);
@@ -45,11 +46,18 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
if (!p->nodes[i] || !p->locks[i])
continue;
- btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
- if (p->locks[i] == BTRFS_READ_LOCK)
+ /*
+ * If we currently have a spinning reader or writer lock this
+ * will bump the count of blocking holders and drop the
+ * spinlock.
+ */
+ if (p->locks[i] == BTRFS_READ_LOCK) {
+ btrfs_set_lock_blocking_read(p->nodes[i]);
p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
- else if (p->locks[i] == BTRFS_WRITE_LOCK)
+ } else if (p->locks[i] == BTRFS_WRITE_LOCK) {
+ btrfs_set_lock_blocking_write(p->nodes[i]);
p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
+ }
}
}
@@ -968,6 +976,48 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
return 0;
}
+static struct extent_buffer *alloc_tree_block_no_bg_flush(
+ struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 parent_start,
+ const struct btrfs_disk_key *disk_key,
+ int level,
+ u64 hint,
+ u64 empty_size)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_buffer *ret;
+
+ /*
+ * If we are COWing a node/leaf from the extent, chunk, device or free
+ * space trees, make sure that we do not finish block group creation of
+ * pending block groups. We do this to avoid a deadlock.
+ * COWing can result in allocation of a new chunk, and flushing pending
+ * block groups (btrfs_create_pending_block_groups()) can be triggered
+ * when finishing allocation of a new chunk. Creation of a pending block
+ * group modifies the extent, chunk, device and free space trees,
+ * therefore we could deadlock with ourselves since we are holding a
+ * lock on an extent buffer that btrfs_create_pending_block_groups() may
+ * try to COW later.
+ * For similar reasons, we also need to delay flushing pending block
+ * groups when splitting a leaf or node, from one of those trees, since
+ * we are holding a write lock on it and its parent or when inserting a
+ * new root node for one of those trees.
+ */
+ if (root == fs_info->extent_root ||
+ root == fs_info->chunk_root ||
+ root == fs_info->dev_root ||
+ root == fs_info->free_space_root)
+ trans->can_flush_pending_bgs = false;
+
+ ret = btrfs_alloc_tree_block(trans, root, parent_start,
+ root->root_key.objectid, disk_key, level,
+ hint, empty_size);
+ trans->can_flush_pending_bgs = true;
+
+ return ret;
+}
+
/*
* does the dirty work in cow of a single block. The parent block (if
* supplied) is updated to point to the new cow copy. The new buffer is marked
@@ -1015,26 +1065,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
parent_start = parent->start;
- /*
- * If we are COWing a node/leaf from the extent, chunk or device trees,
- * make sure that we do not finish block group creation of pending block
- * groups. We do this to avoid a deadlock.
- * COWing can result in allocation of a new chunk, and flushing pending
- * block groups (btrfs_create_pending_block_groups()) can be triggered
- * when finishing allocation of a new chunk. Creation of a pending block
- * group modifies the extent, chunk and device trees, therefore we could
- * deadlock with ourselves since we are holding a lock on an extent
- * buffer that btrfs_create_pending_block_groups() may try to COW later.
- */
- if (root == fs_info->extent_root ||
- root == fs_info->chunk_root ||
- root == fs_info->dev_root)
- trans->can_flush_pending_bgs = false;
-
- cow = btrfs_alloc_tree_block(trans, root, parent_start,
- root->root_key.objectid, &disk_key, level,
- search_start, empty_size);
- trans->can_flush_pending_bgs = true;
+ cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
+ level, search_start, empty_size);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -1264,7 +1296,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
return eb;
btrfs_set_path_blocking(path);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
BUG_ON(tm->slot != 0);
@@ -1354,7 +1386,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
free_extent_buffer(eb_root);
eb = alloc_dummy_extent_buffer(fs_info, logical);
} else {
- btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb_root);
eb = btrfs_clone_extent_buffer(eb_root);
btrfs_tree_read_unlock_blocking(eb_root);
free_extent_buffer(eb_root);
@@ -1462,9 +1494,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
search_start = buf->start & ~((u64)SZ_1G - 1);
if (parent)
- btrfs_set_lock_blocking(parent);
- btrfs_set_lock_blocking(buf);
+ btrfs_set_lock_blocking_write(parent);
+ btrfs_set_lock_blocking_write(buf);
+ /*
+ * Before CoWing this block for later modification, check if it's
+ * the subtree root and do the delayed subtree trace if needed.
+ *
+ * Also We don't care about the error, as it's handled internally.
+ */
+ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
ret = __btrfs_cow_block(trans, root, buf, parent,
parent_slot, cow_ret, search_start, 0);
@@ -1558,7 +1597,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
if (parent_nritems <= 1)
return 0;
- btrfs_set_lock_blocking(parent);
+ btrfs_set_lock_blocking_write(parent);
for (i = start_slot; i <= end_slot; i++) {
struct btrfs_key first_key;
@@ -1617,7 +1656,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
search_start = last_block;
btrfs_tree_lock(cur);
- btrfs_set_lock_blocking(cur);
+ btrfs_set_lock_blocking_write(cur);
err = __btrfs_cow_block(trans, root, cur, parent, i,
&cur, search_start,
min(16 * blocksize,
@@ -1832,7 +1871,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
}
btrfs_tree_lock(child);
- btrfs_set_lock_blocking(child);
+ btrfs_set_lock_blocking_write(child);
ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
if (ret) {
btrfs_tree_unlock(child);
@@ -1870,7 +1909,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (left) {
btrfs_tree_lock(left);
- btrfs_set_lock_blocking(left);
+ btrfs_set_lock_blocking_write(left);
wret = btrfs_cow_block(trans, root, left,
parent, pslot - 1, &left);
if (wret) {
@@ -1885,7 +1924,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (right) {
btrfs_tree_lock(right);
- btrfs_set_lock_blocking(right);
+ btrfs_set_lock_blocking_write(right);
wret = btrfs_cow_block(trans, root, right,
parent, pslot + 1, &right);
if (wret) {
@@ -2048,7 +2087,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
u32 left_nr;
btrfs_tree_lock(left);
- btrfs_set_lock_blocking(left);
+ btrfs_set_lock_blocking_write(left);
left_nr = btrfs_header_nritems(left);
if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@@ -2103,7 +2142,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
u32 right_nr;
btrfs_tree_lock(right);
- btrfs_set_lock_blocking(right);
+ btrfs_set_lock_blocking_write(right);
right_nr = btrfs_header_nritems(right);
if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@@ -2505,26 +2544,6 @@ done:
return ret;
}
-static void key_search_validate(struct extent_buffer *b,
- const struct btrfs_key *key,
- int level)
-{
-#ifdef CONFIG_BTRFS_ASSERT
- struct btrfs_disk_key disk_key;
-
- btrfs_cpu_key_to_disk(&disk_key, key);
-
- if (level == 0)
- ASSERT(!memcmp_extent_buffer(b, &disk_key,
- offsetof(struct btrfs_leaf, items[0].key),
- sizeof(disk_key)));
- else
- ASSERT(!memcmp_extent_buffer(b, &disk_key,
- offsetof(struct btrfs_node, ptrs[0].key),
- sizeof(disk_key)));
-#endif
-}
-
static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
int level, int *prev_cmp, int *slot)
{
@@ -2533,7 +2552,6 @@ static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
return *prev_cmp;
}
- key_search_validate(b, key, level);
*slot = 0;
return 0;
@@ -2981,6 +2999,8 @@ again:
*/
prev_cmp = -1;
ret = key_search(b, key, level, &prev_cmp, &slot);
+ if (ret < 0)
+ goto done;
if (level != 0) {
int dec = 0;
@@ -3343,8 +3363,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
else
btrfs_node_key(lower, &lower_key, 0);
- c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
- &lower_key, level, root->node->start, 0);
+ c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
+ root->node->start, 0);
if (IS_ERR(c))
return PTR_ERR(c);
@@ -3473,8 +3493,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
mid = (c_nritems + 1) / 2;
btrfs_node_key(c, &disk_key, mid);
- split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
- &disk_key, level, c->start, 0);
+ split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
+ c->start, 0);
if (IS_ERR(split))
return PTR_ERR(split);
@@ -3747,7 +3767,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
return 1;
btrfs_tree_lock(right);
- btrfs_set_lock_blocking(right);
+ btrfs_set_lock_blocking_write(right);
free_space = btrfs_leaf_free_space(fs_info, right);
if (free_space < data_size)
@@ -3981,7 +4001,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
return 1;
btrfs_tree_lock(left);
- btrfs_set_lock_blocking(left);
+ btrfs_set_lock_blocking_write(left);
free_space = btrfs_leaf_free_space(fs_info, left);
if (free_space < data_size) {
@@ -4258,8 +4278,8 @@ again:
else
btrfs_item_key(l, &disk_key, mid);
- right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
- &disk_key, 0, l->start, 0);
+ right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
+ l->start, 0);
if (IS_ERR(right))
return PTR_ERR(right);
@@ -5132,6 +5152,10 @@ again:
nritems = btrfs_header_nritems(cur);
level = btrfs_header_level(cur);
sret = btrfs_bin_search(cur, min_key, level, &slot);
+ if (sret < 0) {
+ ret = sret;
+ goto out;
+ }
/* at the lowest level, we're done, setup the path and exit */
if (level == path->lowest_level) {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0a68cf7032f5..129d26226e70 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -35,6 +35,7 @@
struct btrfs_trans_handle;
struct btrfs_transaction;
struct btrfs_pending_snapshot;
+struct btrfs_delayed_ref_root;
extern struct kmem_cache *btrfs_trans_handle_cachep;
extern struct kmem_cache *btrfs_bit_radix_cachep;
extern struct kmem_cache *btrfs_path_cachep;
@@ -786,6 +787,9 @@ enum {
* main phase. The fs_info::balance_ctl is initialized.
*/
BTRFS_FS_BALANCE_RUNNING,
+
+ /* Indicate that the cleaner thread is awake and doing something. */
+ BTRFS_FS_CLEANER_RUNNING,
};
struct btrfs_fs_info {
@@ -930,7 +934,8 @@ struct btrfs_fs_info {
spinlock_t delayed_iput_lock;
struct list_head delayed_iputs;
- struct mutex cleaner_delayed_iput_mutex;
+ atomic_t nr_delayed_iputs;
+ wait_queue_head_t delayed_iputs_wait;
/* this protects tree_mod_seq_list */
spinlock_t tree_mod_seq_lock;
@@ -1070,10 +1075,13 @@ struct btrfs_fs_info {
atomic_t scrubs_paused;
atomic_t scrub_cancel_req;
wait_queue_head_t scrub_pause_wait;
- int scrub_workers_refcnt;
+ /*
+ * The worker pointers are NULL iff the refcount is 0, ie. scrub is not
+ * running.
+ */
+ refcount_t scrub_workers_refcnt;
struct btrfs_workqueue *scrub_workers;
struct btrfs_workqueue *scrub_wr_completion_workers;
- struct btrfs_workqueue *scrub_nocow_workers;
struct btrfs_workqueue *scrub_parity_workers;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
@@ -1195,6 +1203,24 @@ enum {
BTRFS_ROOT_MULTI_LOG_TASKS,
BTRFS_ROOT_DIRTY,
BTRFS_ROOT_DELETING,
+
+ /*
+ * Reloc tree is orphan, only kept here for qgroup delayed subtree scan
+ *
+ * Set for the subvolume tree owning the reloc tree.
+ */
+ BTRFS_ROOT_DEAD_RELOC_TREE,
+};
+
+/*
+ * Record swapped tree blocks of a subvolume tree for delayed subtree trace
+ * code. For detail check comment in fs/btrfs/qgroup.c.
+ */
+struct btrfs_qgroup_swapped_blocks {
+ spinlock_t lock;
+ /* RM_EMPTY_ROOT() of above blocks[] */
+ bool swapped;
+ struct rb_root blocks[BTRFS_MAX_LEVEL];
};
/*
@@ -1308,6 +1334,14 @@ struct btrfs_root {
u64 nr_ordered_extents;
/*
+ * Not empty if this subvolume root has gone through tree block swap
+ * (relocation)
+ *
+ * Will be used by reloc_control::dirty_subvol_roots.
+ */
+ struct list_head reloc_dirty_list;
+
+ /*
* Number of currently running SEND ioctls to prevent
* manipulation with the read-only status via SUBVOL_SETFLAGS
*/
@@ -1324,6 +1358,9 @@ struct btrfs_root {
/* Number of active swapfiles */
atomic_t nr_swapfiles;
+ /* Record pairs of swapped blocks for qgroup */
+ struct btrfs_qgroup_swapped_blocks swapped_blocks;
+
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
u64 alloc_bytenr;
#endif
@@ -2661,6 +2698,9 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
unsigned long count);
int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
unsigned long count, u64 transid, int wait);
+void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head);
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
@@ -2768,7 +2808,8 @@ enum btrfs_flush_state {
FLUSH_DELALLOC = 5,
FLUSH_DELALLOC_WAIT = 6,
ALLOC_CHUNK = 7,
- COMMIT_TRANS = 8,
+ ALLOC_CHUNK_FORCE = 8,
+ COMMIT_TRANS = 9,
};
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
@@ -3174,8 +3215,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
/* inode.c */
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset, u64 start,
- u64 len, int create);
+ u64 start, u64 len);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
u64 *ram_bytes);
@@ -3247,6 +3287,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root);
int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
void btrfs_add_delayed_iput(struct inode *inode);
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
+int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info);
int btrfs_prealloc_file_range(struct inode *inode, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
@@ -3254,7 +3295,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
struct btrfs_trans_handle *trans, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
-int btrfs_run_delalloc_range(void *private_data, struct page *locked_page,
+int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started, unsigned long *nr_written,
struct writeback_control *wbc);
int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
@@ -3408,31 +3449,17 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
#if defined(CONFIG_DYNAMIC_DEBUG)
#define btrfs_debug(fs_info, fmt, args...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \
- btrfs_printk(fs_info, KERN_DEBUG fmt, ##args); \
-} while (0)
-#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \
- btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args); \
-} while (0)
+ _dynamic_func_call_no_desc(fmt, btrfs_printk, \
+ fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \
+ fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, \
- ##args);\
-} while (0)
-#define btrfs_debug_rl(fs_info, fmt, args...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \
- btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, \
- ##args); \
-} while (0)
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \
+ fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \
+ fs_info, KERN_DEBUG fmt, ##args)
#elif defined(DEBUG)
#define btrfs_debug(fs_info, fmt, args...) \
btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
@@ -3483,21 +3510,18 @@ do { \
rcu_read_unlock(); \
} while (0)
-#ifdef CONFIG_BTRFS_ASSERT
-
__cold
static inline void assfail(const char *expr, const char *file, int line)
{
- pr_err("assertion failed: %s, file: %s, line: %d\n",
- expr, file, line);
- BUG();
+ if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) {
+ pr_err("assertion failed: %s, file: %s, line: %d\n",
+ expr, file, line);
+ BUG();
+ }
}
#define ASSERT(expr) \
(likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
-#else
-#define ASSERT(expr) ((void)0)
-#endif
/*
* Use that for functions that are conditionally exported for sanity tests but
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index cad36c99a483..7d2a413df90d 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -602,17 +602,14 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
RB_CLEAR_NODE(&head_ref->href_node);
head_ref->processing = 0;
head_ref->total_ref_mod = count_mod;
- head_ref->qgroup_reserved = 0;
- head_ref->qgroup_ref_root = 0;
spin_lock_init(&head_ref->lock);
mutex_init(&head_ref->mutex);
if (qrecord) {
if (ref_root && reserved) {
- head_ref->qgroup_ref_root = ref_root;
- head_ref->qgroup_reserved = reserved;
+ qrecord->data_rsv = reserved;
+ qrecord->data_rsv_refroot = ref_root;
}
-
qrecord->bytenr = bytenr;
qrecord->num_bytes = num_bytes;
qrecord->old_roots = NULL;
@@ -651,10 +648,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
existing = htree_insert(&delayed_refs->href_root,
&head_ref->href_node);
if (existing) {
- WARN_ON(qrecord && head_ref->qgroup_ref_root
- && head_ref->qgroup_reserved
- && existing->qgroup_ref_root
- && existing->qgroup_reserved);
update_existing_head_ref(trans, existing, head_ref,
old_ref_mod);
/*
@@ -770,7 +763,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
is_fstree(ref_root)) {
- record = kmalloc(sizeof(*record), GFP_NOFS);
+ record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
@@ -867,7 +860,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
is_fstree(ref_root)) {
- record = kmalloc(sizeof(*record), GFP_NOFS);
+ record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
kmem_cache_free(btrfs_delayed_ref_head_cachep,
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index d2af974f68a1..70606da440aa 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -103,17 +103,6 @@ struct btrfs_delayed_ref_head {
int ref_mod;
/*
- * For qgroup reserved space freeing.
- *
- * ref_root and reserved will be recorded after
- * BTRFS_ADD_DELAYED_EXTENT is called.
- * And will be used to free reserved qgroup space at
- * run_delayed_refs() time.
- */
- u64 qgroup_ref_root;
- u64 qgroup_reserved;
-
- /*
* when a new extent is allocated, it is just reserved in memory
* The actual extent isn't inserted into the extent allocation tree
* until the delayed ref is processed. must_insert_reserved is
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 8750c835f535..ee193c5222b2 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -111,11 +111,11 @@ no_valid_dev_replace_entry_found:
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
- dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
- NULL, NULL);
- dev_replace->tgtdev = btrfs_find_device(fs_info,
+ dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
+ src_devid, NULL, NULL, true);
+ dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
BTRFS_DEV_REPLACE_DEVID,
- NULL, NULL);
+ NULL, NULL, true);
/*
* allow 'btrfs dev replace_cancel' if src/tgt device is
* missing
@@ -862,6 +862,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
btrfs_destroy_dev_replace_tgtdev(tgt_device);
break;
default:
+ up_write(&dev_replace->rwsem);
result = -EINVAL;
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8da2f380d3c0..f0cdb53f3e2d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -17,6 +17,7 @@
#include <linux/semaphore.h>
#include <linux/error-injection.h>
#include <linux/crc32c.h>
+#include <linux/sched/mm.h>
#include <asm/unaligned.h>
#include "ctree.h"
#include "disk-io.h"
@@ -341,7 +342,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
if (need_lock) {
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
}
lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
@@ -832,9 +833,10 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
struct bio_vec *bvec;
struct btrfs_root *root;
int i, ret = 0;
+ struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
root = BTRFS_I(bvec->bv_page->mapping->host)->root;
ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
if (ret)
@@ -1120,7 +1122,7 @@ void clean_tree_block(struct btrfs_fs_info *fs_info,
-buf->len,
fs_info->dirty_metadata_batch);
/* ugh, clear_extent_buffer_dirty needs to lock the page */
- btrfs_set_lock_blocking(buf);
+ btrfs_set_lock_blocking_write(buf);
clear_extent_buffer_dirty(buf);
}
}
@@ -1175,6 +1177,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&root->delalloc_root);
INIT_LIST_HEAD(&root->ordered_extents);
INIT_LIST_HEAD(&root->ordered_root);
+ INIT_LIST_HEAD(&root->reloc_dirty_list);
INIT_LIST_HEAD(&root->logged_list[0]);
INIT_LIST_HEAD(&root->logged_list[1]);
spin_lock_init(&root->inode_lock);
@@ -1218,6 +1221,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->anon_dev = 0;
spin_lock_init(&root->root_item_lock);
+ btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
}
static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
@@ -1258,10 +1262,17 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_root *root;
struct btrfs_key key;
+ unsigned int nofs_flag;
int ret = 0;
uuid_le uuid = NULL_UUID_LE;
+ /*
+ * We're holding a transaction handle, so use a NOFS memory allocation
+ * context to avoid deadlock if reclaim happens.
+ */
+ nofs_flag = memalloc_nofs_save();
root = btrfs_alloc_root(fs_info, GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
if (!root)
return ERR_PTR(-ENOMEM);
@@ -1682,6 +1693,8 @@ static int cleaner_kthread(void *arg)
while (1) {
again = 0;
+ set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
+
/* Make the cleaner go to sleep early. */
if (btrfs_need_cleaner_sleep(fs_info))
goto sleep;
@@ -1705,9 +1718,7 @@ static int cleaner_kthread(void *arg)
goto sleep;
}
- mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
btrfs_run_delayed_iputs(fs_info);
- mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
again = btrfs_clean_one_deleted_snapshot(root);
mutex_unlock(&fs_info->cleaner_mutex);
@@ -1728,6 +1739,7 @@ static int cleaner_kthread(void *arg)
*/
btrfs_delete_unused_bgs(fs_info);
sleep:
+ clear_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
if (kthread_should_park())
kthread_parkme();
if (kthread_should_stop())
@@ -2098,7 +2110,7 @@ static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
atomic_set(&fs_info->scrubs_paused, 0);
atomic_set(&fs_info->scrub_cancel_req, 0);
init_waitqueue_head(&fs_info->scrub_pause_wait);
- fs_info->scrub_workers_refcnt = 0;
+ refcount_set(&fs_info->scrub_workers_refcnt, 0);
}
static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
@@ -2663,7 +2675,6 @@ int open_ctree(struct super_block *sb,
mutex_init(&fs_info->delete_unused_bgs_mutex);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
- mutex_init(&fs_info->cleaner_delayed_iput_mutex);
seqlock_init(&fs_info->profiles_lock);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
@@ -2685,6 +2696,7 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->defrag_running, 0);
atomic_set(&fs_info->qgroup_op_seq, 0);
atomic_set(&fs_info->reada_works_cnt, 0);
+ atomic_set(&fs_info->nr_delayed_iputs, 0);
atomic64_set(&fs_info->tree_mod_seq, 0);
fs_info->sb = sb;
fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
@@ -2762,6 +2774,7 @@ int open_ctree(struct super_block *sb,
init_waitqueue_head(&fs_info->transaction_wait);
init_waitqueue_head(&fs_info->transaction_blocked_wait);
init_waitqueue_head(&fs_info->async_submit_wait);
+ init_waitqueue_head(&fs_info->delayed_iputs_wait);
INIT_LIST_HEAD(&fs_info->pinned_chunks);
@@ -4201,6 +4214,14 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
spin_lock(&fs_info->ordered_root_lock);
}
spin_unlock(&fs_info->ordered_root_lock);
+
+ /*
+ * We need this here because if we've been flipped read-only we won't
+ * get sync() from the umount, so we need to make sure any ordered
+ * extents that haven't had their dirty pages IO start writeout yet
+ * actually get run and error out properly.
+ */
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
}
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
@@ -4227,16 +4248,9 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
head = rb_entry(node, struct btrfs_delayed_ref_head,
href_node);
- if (!mutex_trylock(&head->mutex)) {
- refcount_inc(&head->refs);
- spin_unlock(&delayed_refs->lock);
-
- mutex_lock(&head->mutex);
- mutex_unlock(&head->mutex);
- btrfs_put_delayed_ref_head(head);
- spin_lock(&delayed_refs->lock);
+ if (btrfs_delayed_ref_lock(delayed_refs, head))
continue;
- }
+
spin_lock(&head->lock);
while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
ref = rb_entry(n, struct btrfs_delayed_ref_node,
@@ -4252,12 +4266,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
if (head->must_insert_reserved)
pin_bytes = true;
btrfs_free_delayed_extent_op(head->extent_op);
- delayed_refs->num_heads--;
- if (head->processing == 0)
- delayed_refs->num_heads_ready--;
- atomic_dec(&delayed_refs->num_entries);
- rb_erase_cached(&head->href_node, &delayed_refs->href_root);
- RB_CLEAR_NODE(&head->href_node);
+ btrfs_delete_ref_head(delayed_refs, head);
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
mutex_unlock(&head->mutex);
@@ -4265,6 +4274,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
if (pin_bytes)
btrfs_pin_extent(fs_info, head->bytenr,
head->num_bytes, 1);
+ btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
btrfs_put_delayed_ref_head(head);
cond_resched();
spin_lock(&delayed_refs->lock);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b15afeae16df..994f0cc41799 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2456,12 +2456,10 @@ static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
return ret ? ret : 1;
}
-static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_head *head)
+void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_delayed_ref_root *delayed_refs =
- &trans->transaction->delayed_refs;
int nr_items = 1; /* Dropping this ref head update. */
if (head->total_ref_mod < 0) {
@@ -2494,9 +2492,6 @@ static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans,
}
}
- /* Also free its reserved qgroup space */
- btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
- head->qgroup_reserved);
btrfs_delayed_refs_rsv_release(fs_info, nr_items);
}
@@ -2544,7 +2539,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
}
}
- cleanup_ref_head_accounting(trans, head);
+ btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
trace_run_delayed_ref_head(fs_info, head, 0);
btrfs_delayed_ref_unlock(head);
@@ -3015,8 +3010,7 @@ again:
}
if (run_all) {
- if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans);
+ btrfs_create_pending_block_groups(trans);
spin_lock(&delayed_refs->lock);
node = rb_first_cached(&delayed_refs->href_root);
@@ -4282,10 +4276,14 @@ commit_trans:
/*
* The cleaner kthread might still be doing iput
* operations. Wait for it to finish so that
- * more space is released.
+ * more space is released. We don't need to
+ * explicitly run the delayed iputs here because
+ * the commit_transaction would have woken up
+ * the cleaner.
*/
- mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
- mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
+ ret = btrfs_wait_on_delayed_iputs(fs_info);
+ if (ret)
+ return ret;
goto again;
} else {
btrfs_end_transaction(trans);
@@ -4398,7 +4396,6 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *sinfo, int force)
{
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
u64 bytes_used = btrfs_space_info_used(sinfo, false);
u64 thresh;
@@ -4406,14 +4403,6 @@ static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
return 1;
/*
- * We need to take into account the global rsv because for all intents
- * and purposes it's used space. Don't worry about locking the
- * global_rsv, it doesn't change except when the transaction commits.
- */
- if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
- bytes_used += calc_global_rsv_need_space(global_rsv);
-
- /*
* in limited mode, we want to have some free space up to
* about 1% of the FS size.
*/
@@ -4743,7 +4732,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
u64 delalloc_bytes;
- u64 max_reclaim;
+ u64 async_pages;
u64 items;
long time_left;
unsigned long nr_pages;
@@ -4768,25 +4757,36 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
loops = 0;
while (delalloc_bytes && loops < 3) {
- max_reclaim = min(delalloc_bytes, to_reclaim);
- nr_pages = max_reclaim >> PAGE_SHIFT;
+ nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
+
+ /*
+ * Triggers inode writeback for up to nr_pages. This will invoke
+ * ->writepages callback and trigger delalloc filling
+ * (btrfs_run_delalloc_range()).
+ */
btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
+
/*
- * We need to wait for the async pages to actually start before
- * we do anything.
+ * We need to wait for the compressed pages to start before
+ * we continue.
*/
- max_reclaim = atomic_read(&fs_info->async_delalloc_pages);
- if (!max_reclaim)
+ async_pages = atomic_read(&fs_info->async_delalloc_pages);
+ if (!async_pages)
goto skip_async;
- if (max_reclaim <= nr_pages)
- max_reclaim = 0;
+ /*
+ * Calculate how many compressed pages we want to be written
+ * before we continue. I.e if there are more async pages than we
+ * require wait_event will wait until nr_pages are written.
+ */
+ if (async_pages <= nr_pages)
+ async_pages = 0;
else
- max_reclaim -= nr_pages;
+ async_pages -= nr_pages;
wait_event(fs_info->async_submit_wait,
atomic_read(&fs_info->async_delalloc_pages) <=
- (int)max_reclaim);
+ (int)async_pages);
skip_async:
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets) &&
@@ -4810,6 +4810,7 @@ skip_async:
}
struct reserve_ticket {
+ u64 orig_bytes;
u64 bytes;
int error;
struct list_head list;
@@ -4853,10 +4854,19 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
if (!bytes_needed)
return 0;
- /* See if there is enough pinned space to make this reservation */
- if (__percpu_counter_compare(&space_info->total_bytes_pinned,
- bytes_needed,
- BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
+ trans = btrfs_join_transaction(fs_info->extent_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ /*
+ * See if there is enough pinned space to make this reservation, or if
+ * we have block groups that are going to be freed, allowing us to
+ * possibly do a chunk allocation the next loop through.
+ */
+ if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
+ __percpu_counter_compare(&space_info->total_bytes_pinned,
+ bytes_needed,
+ BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
goto commit;
/*
@@ -4864,7 +4874,7 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
* this reservation.
*/
if (space_info != delayed_rsv->space_info)
- return -ENOSPC;
+ goto enospc;
spin_lock(&delayed_rsv->lock);
reclaim_bytes += delayed_rsv->reserved;
@@ -4879,16 +4889,14 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
if (__percpu_counter_compare(&space_info->total_bytes_pinned,
bytes_needed,
- BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) {
- return -ENOSPC;
- }
+ BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
+ goto enospc;
commit:
- trans = btrfs_join_transaction(fs_info->extent_root);
- if (IS_ERR(trans))
- return -ENOSPC;
-
return btrfs_commit_transaction(trans);
+enospc:
+ btrfs_end_transaction(trans);
+ return -ENOSPC;
}
/*
@@ -4941,6 +4949,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
btrfs_end_transaction(trans);
break;
case ALLOC_CHUNK:
+ case ALLOC_CHUNK_FORCE:
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -4948,12 +4957,21 @@ static void flush_space(struct btrfs_fs_info *fs_info,
}
ret = do_chunk_alloc(trans,
btrfs_metadata_alloc_profile(fs_info),
- CHUNK_ALLOC_NO_FORCE);
+ (state == ALLOC_CHUNK) ?
+ CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE);
btrfs_end_transaction(trans);
if (ret > 0 || ret == -ENOSPC)
ret = 0;
break;
case COMMIT_TRANS:
+ /*
+ * If we have pending delayed iputs then we could free up a
+ * bunch of pinned space, so make sure we run the iputs before
+ * we do our pinned bytes check below.
+ */
+ btrfs_run_delayed_iputs(fs_info);
+ btrfs_wait_on_delayed_iputs(fs_info);
+
ret = may_commit_transaction(fs_info, space_info);
break;
default:
@@ -5023,7 +5041,7 @@ static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}
-static void wake_all_tickets(struct list_head *head)
+static bool wake_all_tickets(struct list_head *head)
{
struct reserve_ticket *ticket;
@@ -5032,7 +5050,10 @@ static void wake_all_tickets(struct list_head *head)
list_del_init(&ticket->list);
ticket->error = -ENOSPC;
wake_up(&ticket->wait);
+ if (ticket->bytes != ticket->orig_bytes)
+ return true;
}
+ return false;
}
/*
@@ -5084,11 +5105,28 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
commit_cycles--;
}
+ /*
+ * We don't want to force a chunk allocation until we've tried
+ * pretty hard to reclaim space. Think of the case where we
+ * freed up a bunch of space and so have a lot of pinned space
+ * to reclaim. We would rather use that than possibly create a
+ * underutilized metadata chunk. So if this is our first run
+ * through the flushing state machine skip ALLOC_CHUNK_FORCE and
+ * commit the transaction. If nothing has changed the next go
+ * around then we can force a chunk allocation.
+ */
+ if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
+ flush_state++;
+
if (flush_state > COMMIT_TRANS) {
commit_cycles++;
if (commit_cycles > 2) {
- wake_all_tickets(&space_info->tickets);
- space_info->flush = 0;
+ if (wake_all_tickets(&space_info->tickets)) {
+ flush_state = FLUSH_DELAYED_ITEMS_NR;
+ commit_cycles--;
+ } else {
+ space_info->flush = 0;
+ }
} else {
flush_state = FLUSH_DELAYED_ITEMS_NR;
}
@@ -5102,12 +5140,18 @@ void btrfs_init_async_reclaim_work(struct work_struct *work)
INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}
+static const enum btrfs_flush_state priority_flush_states[] = {
+ FLUSH_DELAYED_ITEMS_NR,
+ FLUSH_DELAYED_ITEMS,
+ ALLOC_CHUNK,
+};
+
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
struct reserve_ticket *ticket)
{
u64 to_reclaim;
- int flush_state = FLUSH_DELAYED_ITEMS_NR;
+ int flush_state;
spin_lock(&space_info->lock);
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
@@ -5118,8 +5162,10 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
}
spin_unlock(&space_info->lock);
+ flush_state = 0;
do {
- flush_space(fs_info, space_info, to_reclaim, flush_state);
+ flush_space(fs_info, space_info, to_reclaim,
+ priority_flush_states[flush_state]);
flush_state++;
spin_lock(&space_info->lock);
if (ticket->bytes == 0) {
@@ -5127,23 +5173,16 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
return;
}
spin_unlock(&space_info->lock);
-
- /*
- * Priority flushers can't wait on delalloc without
- * deadlocking.
- */
- if (flush_state == FLUSH_DELALLOC ||
- flush_state == FLUSH_DELALLOC_WAIT)
- flush_state = ALLOC_CHUNK;
- } while (flush_state < COMMIT_TRANS);
+ } while (flush_state < ARRAY_SIZE(priority_flush_states));
}
static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
- struct reserve_ticket *ticket, u64 orig_bytes)
+ struct reserve_ticket *ticket)
{
DEFINE_WAIT(wait);
+ u64 reclaim_bytes = 0;
int ret = 0;
spin_lock(&space_info->lock);
@@ -5164,14 +5203,12 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
ret = ticket->error;
if (!list_empty(&ticket->list))
list_del_init(&ticket->list);
- if (ticket->bytes && ticket->bytes < orig_bytes) {
- u64 num_bytes = orig_bytes - ticket->bytes;
- update_bytes_may_use(space_info, -num_bytes);
- trace_btrfs_space_reservation(fs_info, "space_info",
- space_info->flags, num_bytes, 0);
- }
+ if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
+ reclaim_bytes = ticket->orig_bytes - ticket->bytes;
spin_unlock(&space_info->lock);
+ if (reclaim_bytes)
+ space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
return ret;
}
@@ -5197,6 +5234,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
{
struct reserve_ticket ticket;
u64 used;
+ u64 reclaim_bytes = 0;
int ret = 0;
ASSERT(orig_bytes);
@@ -5232,6 +5270,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
* the list and we will do our own flushing further down.
*/
if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
+ ticket.orig_bytes = orig_bytes;
ticket.bytes = orig_bytes;
ticket.error = 0;
init_waitqueue_head(&ticket.wait);
@@ -5272,25 +5311,21 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
return ret;
if (flush == BTRFS_RESERVE_FLUSH_ALL)
- return wait_reserve_ticket(fs_info, space_info, &ticket,
- orig_bytes);
+ return wait_reserve_ticket(fs_info, space_info, &ticket);
ret = 0;
priority_reclaim_metadata_space(fs_info, space_info, &ticket);
spin_lock(&space_info->lock);
if (ticket.bytes) {
- if (ticket.bytes < orig_bytes) {
- u64 num_bytes = orig_bytes - ticket.bytes;
- update_bytes_may_use(space_info, -num_bytes);
- trace_btrfs_space_reservation(fs_info, "space_info",
- space_info->flags,
- num_bytes, 0);
-
- }
+ if (ticket.bytes < orig_bytes)
+ reclaim_bytes = orig_bytes - ticket.bytes;
list_del_init(&ticket.list);
ret = -ENOSPC;
}
spin_unlock(&space_info->lock);
+
+ if (reclaim_bytes)
+ space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
ASSERT(list_empty(&ticket.list));
return ret;
}
@@ -5768,6 +5803,21 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
return ret;
}
+static void calc_refill_bytes(struct btrfs_block_rsv *block_rsv,
+ u64 *metadata_bytes, u64 *qgroup_bytes)
+{
+ *metadata_bytes = 0;
+ *qgroup_bytes = 0;
+
+ spin_lock(&block_rsv->lock);
+ if (block_rsv->reserved < block_rsv->size)
+ *metadata_bytes = block_rsv->size - block_rsv->reserved;
+ if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
+ *qgroup_bytes = block_rsv->qgroup_rsv_size -
+ block_rsv->qgroup_rsv_reserved;
+ spin_unlock(&block_rsv->lock);
+}
+
/**
* btrfs_inode_rsv_refill - refill the inode block rsv.
* @inode - the inode we are refilling.
@@ -5783,25 +5833,42 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
{
struct btrfs_root *root = inode->root;
struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
- u64 num_bytes = 0;
- u64 qgroup_num_bytes = 0;
+ u64 num_bytes, last = 0;
+ u64 qgroup_num_bytes;
int ret = -ENOSPC;
- spin_lock(&block_rsv->lock);
- if (block_rsv->reserved < block_rsv->size)
- num_bytes = block_rsv->size - block_rsv->reserved;
- if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
- qgroup_num_bytes = block_rsv->qgroup_rsv_size -
- block_rsv->qgroup_rsv_reserved;
- spin_unlock(&block_rsv->lock);
-
+ calc_refill_bytes(block_rsv, &num_bytes, &qgroup_num_bytes);
if (num_bytes == 0)
return 0;
- ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
- if (ret)
- return ret;
- ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+ do {
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes,
+ true);
+ if (ret)
+ return ret;
+ ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+ if (ret) {
+ btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
+ last = num_bytes;
+ /*
+ * If we are fragmented we can end up with a lot of
+ * outstanding extents which will make our size be much
+ * larger than our reserved amount.
+ *
+ * If the reservation happens here, it might be very
+ * big though not needed in the end, if the delalloc
+ * flushing happens.
+ *
+ * If this is the case try and do the reserve again.
+ */
+ if (flush == BTRFS_RESERVE_FLUSH_ALL)
+ calc_refill_bytes(block_rsv, &num_bytes,
+ &qgroup_num_bytes);
+ if (num_bytes == 0)
+ return 0;
+ }
+ } while (ret && last != num_bytes);
+
if (!ret) {
block_rsv_add_bytes(block_rsv, num_bytes, false);
trace_btrfs_space_reservation(root->fs_info, "delalloc",
@@ -5811,8 +5878,7 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
spin_lock(&block_rsv->lock);
block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
spin_unlock(&block_rsv->lock);
- } else
- btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
+ }
return ret;
}
@@ -7188,7 +7254,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
if (head->must_insert_reserved)
ret = 1;
- cleanup_ref_head_accounting(trans, head);
+ btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
mutex_unlock(&head->mutex);
btrfs_put_delayed_ref_head(head);
return ret;
@@ -8059,6 +8125,15 @@ loop:
return ret;
}
+#define DUMP_BLOCK_RSV(fs_info, rsv_name) \
+do { \
+ struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \
+ spin_lock(&__rsv->lock); \
+ btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \
+ __rsv->size, __rsv->reserved); \
+ spin_unlock(&__rsv->lock); \
+} while (0)
+
static void dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info, u64 bytes,
int dump_block_groups)
@@ -8078,6 +8153,12 @@ static void dump_space_info(struct btrfs_fs_info *fs_info,
info->bytes_readonly);
spin_unlock(&info->lock);
+ DUMP_BLOCK_RSV(fs_info, global_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
+
if (!dump_block_groups)
return;
@@ -8485,7 +8566,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
clean_tree_block(fs_info, buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
- btrfs_set_lock_blocking(buf);
+ btrfs_set_lock_blocking_write(buf);
set_extent_buffer_uptodate(buf);
memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
@@ -8910,7 +8991,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
reada = 1;
}
btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
+ btrfs_set_lock_blocking_write(next);
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
&wc->refs[level - 1],
@@ -8970,7 +9051,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
return -EIO;
}
btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
+ btrfs_set_lock_blocking_write(next);
}
level--;
@@ -9082,7 +9163,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (!path->locks[level]) {
BUG_ON(level == 0);
btrfs_tree_lock(eb);
- btrfs_set_lock_blocking(eb);
+ btrfs_set_lock_blocking_write(eb);
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
ret = btrfs_lookup_extent_info(trans, fs_info,
@@ -9124,7 +9205,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (!path->locks[level] &&
btrfs_header_generation(eb) == trans->transid) {
btrfs_tree_lock(eb);
- btrfs_set_lock_blocking(eb);
+ btrfs_set_lock_blocking_write(eb);
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
}
clean_tree_block(fs_info, eb);
@@ -9291,7 +9372,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
level = btrfs_header_level(root->node);
path->nodes[level] = btrfs_lock_root_node(root);
- btrfs_set_lock_blocking(path->nodes[level]);
+ btrfs_set_lock_blocking_write(path->nodes[level]);
path->slots[level] = 0;
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
memset(&wc->update_progress, 0,
@@ -9321,7 +9402,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
level = btrfs_header_level(root->node);
while (1) {
btrfs_tree_lock(path->nodes[level]);
- btrfs_set_lock_blocking(path->nodes[level]);
+ btrfs_set_lock_blocking_write(path->nodes[level]);
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
ret = btrfs_lookup_extent_info(trans, fs_info,
@@ -9588,6 +9669,7 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
{
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
+ u64 sinfo_used;
u64 min_allocable_bytes;
int ret = -ENOSPC;
@@ -9614,9 +9696,10 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
num_bytes = cache->key.offset - cache->reserved - cache->pinned -
cache->bytes_super - btrfs_block_group_used(&cache->item);
+ sinfo_used = btrfs_space_info_used(sinfo, true);
- if (btrfs_space_info_used(sinfo, true) + num_bytes +
- min_allocable_bytes <= sinfo->total_bytes) {
+ if (sinfo_used + num_bytes + min_allocable_bytes <=
+ sinfo->total_bytes) {
sinfo->bytes_readonly += num_bytes;
cache->ro++;
list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
@@ -9625,6 +9708,15 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
out:
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock);
+ if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
+ btrfs_info(cache->fs_info,
+ "unable to make block group %llu ro",
+ cache->key.objectid);
+ btrfs_info(cache->fs_info,
+ "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
+ sinfo_used, num_bytes, min_allocable_bytes);
+ dump_space_info(cache->fs_info, cache->space_info, 0, 0);
+ }
return ret;
}
@@ -10774,13 +10866,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
}
spin_lock(&trans->transaction->dirty_bgs_lock);
- if (!list_empty(&block_group->dirty_list)) {
- WARN_ON(1);
- }
- if (!list_empty(&block_group->io_list)) {
- WARN_ON(1);
- }
+ WARN_ON(!list_empty(&block_group->dirty_list));
+ WARN_ON(!list_empty(&block_group->io_list));
spin_unlock(&trans->transaction->dirty_bgs_lock);
+
btrfs_remove_free_space_cache(block_group);
spin_lock(&block_group->space_info->lock);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 52abe4082680..ab705183d749 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -147,7 +147,39 @@ static int add_extent_changeset(struct extent_state *state, unsigned bits,
return ret;
}
-static void flush_write_bio(struct extent_page_data *epd);
+static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
+ unsigned long bio_flags)
+{
+ blk_status_t ret = 0;
+ struct bio_vec *bvec = bio_last_bvec_all(bio);
+ struct bio_vec bv;
+ struct extent_io_tree *tree = bio->bi_private;
+ u64 start;
+
+ mp_bvec_last_segment(bvec, &bv);
+ start = page_offset(bv.bv_page) + bv.bv_offset;
+
+ bio->bi_private = NULL;
+
+ if (tree->ops)
+ ret = tree->ops->submit_bio_hook(tree->private_data, bio,
+ mirror_num, bio_flags, start);
+ else
+ btrfsic_submit_bio(bio);
+
+ return blk_status_to_errno(ret);
+}
+
+static void flush_write_bio(struct extent_page_data *epd)
+{
+ if (epd->bio) {
+ int ret;
+
+ ret = submit_one_bio(epd->bio, 0, 0);
+ BUG_ON(ret < 0); /* -ENOMEM */
+ epd->bio = NULL;
+ }
+}
int __init extent_io_init(void)
{
@@ -281,8 +313,8 @@ do_insert:
}
static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
- struct rb_node **prev_ret,
struct rb_node **next_ret,
+ struct rb_node **prev_ret,
struct rb_node ***p_ret,
struct rb_node **parent_ret)
{
@@ -311,23 +343,23 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
if (parent_ret)
*parent_ret = prev;
- if (prev_ret) {
+ if (next_ret) {
orig_prev = prev;
while (prev && offset > prev_entry->end) {
prev = rb_next(prev);
prev_entry = rb_entry(prev, struct tree_entry, rb_node);
}
- *prev_ret = prev;
+ *next_ret = prev;
prev = orig_prev;
}
- if (next_ret) {
+ if (prev_ret) {
prev_entry = rb_entry(prev, struct tree_entry, rb_node);
while (prev && offset < prev_entry->start) {
prev = rb_prev(prev);
prev_entry = rb_entry(prev, struct tree_entry, rb_node);
}
- *next_ret = prev;
+ *prev_ret = prev;
}
return NULL;
}
@@ -338,12 +370,12 @@ tree_search_for_insert(struct extent_io_tree *tree,
struct rb_node ***p_ret,
struct rb_node **parent_ret)
{
- struct rb_node *prev = NULL;
+ struct rb_node *next= NULL;
struct rb_node *ret;
- ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret);
+ ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
if (!ret)
- return prev;
+ return next;
return ret;
}
@@ -585,7 +617,6 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
if (delete)
bits |= ~EXTENT_CTLBITS;
- bits |= EXTENT_FIRST_DELALLOC;
if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
clear = 1;
@@ -850,7 +881,6 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
btrfs_debug_check_extent_io_range(tree, start, end);
- bits |= EXTENT_FIRST_DELALLOC;
again:
if (!prealloc && gfpflags_allow_blocking(mask)) {
/*
@@ -2350,7 +2380,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
int read_mode = 0;
blk_status_t status;
int ret;
- unsigned failed_bio_pages = bio_pages_all(failed_bio);
+ unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT;
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
@@ -2422,9 +2452,10 @@ static void end_bio_extent_writepage(struct bio *bio)
u64 start;
u64 end;
int i;
+ struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2493,9 +2524,10 @@ static void end_bio_extent_readpage(struct bio *bio)
int mirror;
int ret;
int i;
+ struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2692,28 +2724,6 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
return bio;
}
-static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
- unsigned long bio_flags)
-{
- blk_status_t ret = 0;
- struct bio_vec *bvec = bio_last_bvec_all(bio);
- struct page *page = bvec->bv_page;
- struct extent_io_tree *tree = bio->bi_private;
- u64 start;
-
- start = page_offset(page) + bvec->bv_offset;
-
- bio->bi_private = NULL;
-
- if (tree->ops)
- ret = tree->ops->submit_bio_hook(tree->private_data, bio,
- mirror_num, bio_flags, start);
- else
- btrfsic_submit_bio(bio);
-
- return blk_status_to_errno(ret);
-}
-
/*
* @opf: bio REQ_OP_* and REQ_* flags as one value
* @tree: tree so we can call our merge_bio hook
@@ -3634,9 +3644,10 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
struct bio_vec *bvec;
struct extent_buffer *eb;
int i, done;
+ struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
eb = (struct extent_buffer *)page->private;
@@ -4007,17 +4018,6 @@ retry:
return ret;
}
-static void flush_write_bio(struct extent_page_data *epd)
-{
- if (epd->bio) {
- int ret;
-
- ret = submit_one_bio(epd->bio, 0, 0);
- BUG_ON(ret < 0); /* -ENOMEM */
- epd->bio = NULL;
- }
-}
-
int extent_write_full_page(struct page *page, struct writeback_control *wbc)
{
int ret;
@@ -4259,8 +4259,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
if (len == 0)
break;
len = ALIGN(len, sectorsize);
- em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, offset,
- len, 0);
+ em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len);
if (IS_ERR_OR_NULL(em))
return em;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 9673be3f3d1f..08749e0b9c32 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -18,17 +18,16 @@
#define EXTENT_BOUNDARY (1U << 9)
#define EXTENT_NODATASUM (1U << 10)
#define EXTENT_CLEAR_META_RESV (1U << 11)
-#define EXTENT_FIRST_DELALLOC (1U << 12)
-#define EXTENT_NEED_WAIT (1U << 13)
-#define EXTENT_DAMAGED (1U << 14)
-#define EXTENT_NORESERVE (1U << 15)
-#define EXTENT_QGROUP_RESERVED (1U << 16)
-#define EXTENT_CLEAR_DATA_RESV (1U << 17)
-#define EXTENT_DELALLOC_NEW (1U << 18)
+#define EXTENT_NEED_WAIT (1U << 12)
+#define EXTENT_DAMAGED (1U << 13)
+#define EXTENT_NORESERVE (1U << 14)
+#define EXTENT_QGROUP_RESERVED (1U << 15)
+#define EXTENT_CLEAR_DATA_RESV (1U << 16)
+#define EXTENT_DELALLOC_NEW (1U << 17)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
EXTENT_CLEAR_DATA_RESV)
-#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
+#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING)
/*
* flags for bio submission. The high bits indicate the compression
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index a042a193c120..928f729c55ba 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -210,6 +210,9 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
if (!list_empty(&prev->list) || !list_empty(&next->list))
return 0;
+ ASSERT(next->block_start != EXTENT_MAP_DELALLOC &&
+ prev->block_start != EXTENT_MAP_DELALLOC);
+
if (extent_map_end(prev) == next->start &&
prev->flags == next->flags &&
prev->bdev == next->bdev &&
@@ -217,8 +220,6 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
prev->block_start == EXTENT_MAP_HOLE) ||
(next->block_start == EXTENT_MAP_INLINE &&
prev->block_start == EXTENT_MAP_INLINE) ||
- (next->block_start == EXTENT_MAP_DELALLOC &&
- prev->block_start == EXTENT_MAP_DELALLOC) ||
(next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
next->block_start == extent_map_block_end(prev)))) {
return 1;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index ef05a0121652..473f039fcd7c 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -9,6 +9,7 @@
#define EXTENT_MAP_LAST_BYTE ((u64)-4)
#define EXTENT_MAP_HOLE ((u64)-3)
#define EXTENT_MAP_INLINE ((u64)-2)
+/* used only during fiemap calls */
#define EXTENT_MAP_DELALLOC ((u64)-1)
/* bits for the extent_map::flags field */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d38dc8c31533..34fe8a58b0e9 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3218,8 +3218,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
&cached_state);
while (start < inode->i_size) {
- em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0,
- start, len, 0);
+ em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
em = NULL;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 43eb4535319d..82fdda8ff5ab 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -453,7 +453,6 @@ static noinline void compress_file_range(struct inode *inode,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 blocksize = fs_info->sectorsize;
u64 actual_end;
- u64 isize = i_size_read(inode);
int ret = 0;
struct page **pages = NULL;
unsigned long nr_pages;
@@ -467,7 +466,7 @@ static noinline void compress_file_range(struct inode *inode,
inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
SZ_16K);
- actual_end = min_t(u64, isize, end + 1);
+ actual_end = min_t(u64, i_size_read(inode), end + 1);
again:
will_compress = 0;
nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
@@ -714,9 +713,9 @@ static void free_async_extent_pages(struct async_extent *async_extent)
* queued. We walk all the async extents created by compress_file_range
* and send them down to the disk.
*/
-static noinline void submit_compressed_extents(struct inode *inode,
- struct async_cow *async_cow)
+static noinline void submit_compressed_extents(struct async_cow *async_cow)
{
+ struct inode *inode = async_cow->inode;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct async_extent *async_extent;
u64 alloc_hint = 0;
@@ -1166,8 +1165,14 @@ static noinline void async_cow_submit(struct btrfs_work *work)
5 * SZ_1M)
cond_wake_up_nomb(&fs_info->async_submit_wait);
+ /*
+ * ->inode could be NULL if async_cow_start has failed to compress,
+ * in which case we don't have anything to submit, yet we need to
+ * always adjust ->async_delalloc_pages as its paired with the init
+ * happening in cow_file_range_async
+ */
if (async_cow->inode)
- submit_compressed_extents(async_cow->inode, async_cow);
+ submit_compressed_extents(async_cow);
}
static noinline void async_cow_free(struct btrfs_work *work)
@@ -1194,7 +1199,12 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
while (start < end) {
async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
BUG_ON(!async_cow); /* -ENOMEM */
- async_cow->inode = igrab(inode);
+ /*
+ * igrab is called higher up in the call chain, take only the
+ * lightweight reference for the callback lifetime
+ */
+ ihold(inode);
+ async_cow->inode = inode;
async_cow->fs_info = fs_info;
async_cow->locked_page = locked_page;
async_cow->start = start;
@@ -1586,11 +1596,10 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
* Function to process delayed allocation (create CoW) for ranges which are
* being touched for the first time.
*/
-int btrfs_run_delalloc_range(void *private_data, struct page *locked_page,
+int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started, unsigned long *nr_written,
struct writeback_control *wbc)
{
- struct inode *inode = private_data;
int ret;
int force_cow = need_force_cow(inode, start, end);
unsigned int write_flags = wbc_to_write_flags(wbc);
@@ -3129,9 +3138,6 @@ out:
/* once for the tree */
btrfs_put_ordered_extent(ordered_extent);
- /* Try to release some metadata so we don't get an OOM but don't wait */
- btrfs_btree_balance_dirty_nodelay(fs_info);
-
return ret;
}
@@ -3250,10 +3256,13 @@ void btrfs_add_delayed_iput(struct inode *inode)
if (atomic_add_unless(&inode->i_count, -1, 1))
return;
+ atomic_inc(&fs_info->nr_delayed_iputs);
spin_lock(&fs_info->delayed_iput_lock);
ASSERT(list_empty(&binode->delayed_iput));
list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
spin_unlock(&fs_info->delayed_iput_lock);
+ if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
+ wake_up_process(fs_info->cleaner_kthread);
}
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
@@ -3268,11 +3277,32 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
list_del_init(&inode->delayed_iput);
spin_unlock(&fs_info->delayed_iput_lock);
iput(&inode->vfs_inode);
+ if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
+ wake_up(&fs_info->delayed_iputs_wait);
spin_lock(&fs_info->delayed_iput_lock);
}
spin_unlock(&fs_info->delayed_iput_lock);
}
+/**
+ * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running
+ * @fs_info - the fs_info for this fs
+ * @return - EINTR if we were killed, 0 if nothing's pending
+ *
+ * This will wait on any delayed iputs that are currently running with KILLABLE
+ * set. Once they are all done running we will return, unless we are killed in
+ * which case we return EINTR. This helps in user operations like fallocate etc
+ * that might get blocked on the iputs.
+ */
+int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
+{
+ int ret = wait_event_killable(fs_info->delayed_iputs_wait,
+ atomic_read(&fs_info->nr_delayed_iputs) == 0);
+ if (ret)
+ return -EINTR;
+ return 0;
+}
+
/*
* This creates an orphan entry for the given inode in case something goes wrong
* in the middle of an unlink.
@@ -5263,13 +5293,15 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ u64 delayed_refs_extra = btrfs_calc_trans_metadata_size(fs_info, 1);
int failures = 0;
for (;;) {
struct btrfs_trans_handle *trans;
int ret;
- ret = btrfs_block_rsv_refill(root, rsv, rsv->size,
+ ret = btrfs_block_rsv_refill(root, rsv,
+ rsv->size + delayed_refs_extra,
BTRFS_RESERVE_FLUSH_LIMIT);
if (ret && ++failures > 2) {
@@ -5278,9 +5310,28 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
return ERR_PTR(-ENOSPC);
}
+ /*
+ * Evict can generate a large amount of delayed refs without
+ * having a way to add space back since we exhaust our temporary
+ * block rsv. We aren't allowed to do FLUSH_ALL in this case
+ * because we could deadlock with so many things in the flushing
+ * code, so we have to try and hold some extra space to
+ * compensate for our delayed ref generation. If we can't get
+ * that space then we need see if we can steal our minimum from
+ * the global reserve. We will be ratelimited by the amount of
+ * space we have for the delayed refs rsv, so we'll end up
+ * committing and trying again.
+ */
trans = btrfs_join_transaction(root);
- if (IS_ERR(trans) || !ret)
+ if (IS_ERR(trans) || !ret) {
+ if (!IS_ERR(trans)) {
+ trans->block_rsv = &fs_info->trans_block_rsv;
+ trans->bytes_reserved = delayed_refs_extra;
+ btrfs_block_rsv_migrate(rsv, trans->block_rsv,
+ delayed_refs_extra, 1);
+ }
return trans;
+ }
/*
* Try to steal from the global reserve if there is space for
@@ -6732,7 +6783,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
u64 extent_start = 0;
u64 extent_end = 0;
u64 objectid = btrfs_ino(inode);
- u32 found_type;
+ u8 extent_type;
struct btrfs_path *path = NULL;
struct btrfs_root *root = inode->root;
struct btrfs_file_extent_item *item;
@@ -6787,9 +6838,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
if (ret < 0) {
err = ret;
goto out;
- }
-
- if (ret != 0) {
+ } else if (ret > 0) {
if (path->slots[0] == 0)
goto not_found;
path->slots[0]--;
@@ -6798,11 +6847,9 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- /* are we inside the extent that was found? */
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- found_type = found_key.type;
if (found_key.objectid != objectid ||
- found_type != BTRFS_EXTENT_DATA_KEY) {
+ found_key.type != BTRFS_EXTENT_DATA_KEY) {
/*
* If we backup past the first extent we want to move forward
* and see if there is an extent in front of us, otherwise we'll
@@ -6813,16 +6860,16 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
goto next;
}
- found_type = btrfs_file_extent_type(leaf, item);
+ extent_type = btrfs_file_extent_type(leaf, item);
extent_start = found_key.offset;
- if (found_type == BTRFS_FILE_EXTENT_REG ||
- found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ if (extent_type == BTRFS_FILE_EXTENT_REG ||
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
extent_end = extent_start +
btrfs_file_extent_num_bytes(leaf, item);
trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
extent_start);
- } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
size_t size;
size = btrfs_file_extent_ram_bytes(leaf, item);
@@ -6841,9 +6888,9 @@ next:
if (ret < 0) {
err = ret;
goto out;
- }
- if (ret > 0)
+ } else if (ret > 0) {
goto not_found;
+ }
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -6854,19 +6901,22 @@ next:
goto not_found;
if (start > found_key.offset)
goto next;
+
+ /* New extent overlaps with existing one */
em->start = start;
em->orig_start = start;
em->len = found_key.offset - start;
- goto not_found_em;
+ em->block_start = EXTENT_MAP_HOLE;
+ goto insert;
}
btrfs_extent_item_to_extent_map(inode, path, item,
new_inline, em);
- if (found_type == BTRFS_FILE_EXTENT_REG ||
- found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ if (extent_type == BTRFS_FILE_EXTENT_REG ||
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
goto insert;
- } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
unsigned long ptr;
char *map;
size_t size;
@@ -6917,7 +6967,6 @@ not_found:
em->start = start;
em->orig_start = start;
em->len = len;
-not_found_em:
em->block_start = EXTENT_MAP_HOLE;
insert:
btrfs_release_path(path);
@@ -6947,19 +6996,17 @@ out:
}
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
- struct page *page,
- size_t pg_offset, u64 start, u64 len,
- int create)
+ u64 start, u64 len)
{
struct extent_map *em;
struct extent_map *hole_em = NULL;
- u64 range_start = start;
+ u64 delalloc_start = start;
u64 end;
- u64 found;
- u64 found_end;
+ u64 delalloc_len;
+ u64 delalloc_end;
int err = 0;
- em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
+ em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
if (IS_ERR(em))
return em;
/*
@@ -6984,80 +7031,84 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
em = NULL;
/* ok, we didn't find anything, lets look for delalloc */
- found = count_range_bits(&inode->io_tree, &range_start,
+ delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start,
end, len, EXTENT_DELALLOC, 1);
- found_end = range_start + found;
- if (found_end < range_start)
- found_end = (u64)-1;
+ delalloc_end = delalloc_start + delalloc_len;
+ if (delalloc_end < delalloc_start)
+ delalloc_end = (u64)-1;
/*
- * we didn't find anything useful, return
- * the original results from get_extent()
+ * We didn't find anything useful, return the original results from
+ * get_extent()
*/
- if (range_start > end || found_end <= start) {
+ if (delalloc_start > end || delalloc_end <= start) {
em = hole_em;
hole_em = NULL;
goto out;
}
- /* adjust the range_start to make sure it doesn't
- * go backwards from the start they passed in
+ /*
+ * Adjust the delalloc_start to make sure it doesn't go backwards from
+ * the start they passed in
*/
- range_start = max(start, range_start);
- found = found_end - range_start;
+ delalloc_start = max(start, delalloc_start);
+ delalloc_len = delalloc_end - delalloc_start;
- if (found > 0) {
- u64 hole_start = start;
- u64 hole_len = len;
+ if (delalloc_len > 0) {
+ u64 hole_start;
+ u64 hole_len;
+ const u64 hole_end = extent_map_end(hole_em);
em = alloc_extent_map();
if (!em) {
err = -ENOMEM;
goto out;
}
+ em->bdev = NULL;
+
+ ASSERT(hole_em);
/*
- * when btrfs_get_extent can't find anything it
- * returns one huge hole
+ * When btrfs_get_extent can't find anything it returns one
+ * huge hole
*
- * make sure what it found really fits our range, and
- * adjust to make sure it is based on the start from
- * the caller
+ * Make sure what it found really fits our range, and adjust to
+ * make sure it is based on the start from the caller
*/
- if (hole_em) {
- u64 calc_end = extent_map_end(hole_em);
-
- if (calc_end <= start || (hole_em->start > end)) {
- free_extent_map(hole_em);
- hole_em = NULL;
- } else {
- hole_start = max(hole_em->start, start);
- hole_len = calc_end - hole_start;
- }
+ if (hole_end <= start || hole_em->start > end) {
+ free_extent_map(hole_em);
+ hole_em = NULL;
+ } else {
+ hole_start = max(hole_em->start, start);
+ hole_len = hole_end - hole_start;
}
- em->bdev = NULL;
- if (hole_em && range_start > hole_start) {
- /* our hole starts before our delalloc, so we
- * have to return just the parts of the hole
- * that go until the delalloc starts
+
+ if (hole_em && delalloc_start > hole_start) {
+ /*
+ * Our hole starts before our delalloc, so we have to
+ * return just the parts of the hole that go until the
+ * delalloc starts
*/
- em->len = min(hole_len,
- range_start - hole_start);
+ em->len = min(hole_len, delalloc_start - hole_start);
em->start = hole_start;
em->orig_start = hole_start;
/*
- * don't adjust block start at all,
- * it is fixed at EXTENT_MAP_HOLE
+ * Don't adjust block start at all, it is fixed at
+ * EXTENT_MAP_HOLE
*/
em->block_start = hole_em->block_start;
em->block_len = hole_len;
if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
} else {
- em->start = range_start;
- em->len = found;
- em->orig_start = range_start;
+ /*
+ * Hole is out of passed range or it starts after
+ * delalloc range
+ */
+ em->start = delalloc_start;
+ em->len = delalloc_len;
+ em->orig_start = delalloc_start;
em->block_start = EXTENT_MAP_DELALLOC;
- em->block_len = found;
+ em->block_len = delalloc_len;
}
} else {
return hole_em;
@@ -7778,6 +7829,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
struct bio_vec *bvec;
struct extent_io_tree *io_tree, *failure_tree;
int i;
+ struct bvec_iter_all iter_all;
if (bio->bi_status)
goto end;
@@ -7789,7 +7841,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
done->uptodate = 1;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
io_tree, done->start, bvec->bv_page,
btrfs_ino(BTRFS_I(inode)), 0);
@@ -7868,6 +7920,7 @@ static void btrfs_retry_endio(struct bio *bio)
int uptodate;
int ret;
int i;
+ struct bvec_iter_all iter_all;
if (bio->bi_status)
goto end;
@@ -7881,7 +7934,7 @@ static void btrfs_retry_endio(struct bio *bio)
failure_tree = &BTRFS_I(inode)->io_failure_tree;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
bvec->bv_offset, done->start,
bvec->bv_len);
@@ -9911,7 +9964,6 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
init_completion(&work->completion);
INIT_LIST_HEAD(&work->list);
work->inode = inode;
- WARN_ON_ONCE(!inode);
btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
btrfs_run_delalloc_work, NULL, NULL);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index fab9443f6a42..494f0f10d70e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1642,7 +1642,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
btrfs_info(fs_info, "resizing devid %llu", devid);
}
- device = btrfs_find_device(fs_info, devid, NULL, NULL);
+ device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
if (!device) {
btrfs_info(fs_info, "resizer unable to find device %llu",
devid);
@@ -3178,7 +3178,8 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
s_uuid = di_args->uuid;
rcu_read_lock();
- dev = btrfs_find_device(fs_info, di_args->devid, s_uuid, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid,
+ NULL, true);
if (!dev) {
ret = -ENODEV;
@@ -3221,32 +3222,38 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
inode_lock_nested(inode2, I_MUTEX_CHILD);
}
-static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen,
+static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len)
+{
+ unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
+ unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+}
+
+static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len)
+{
+ if (inode1 < inode2) {
+ swap(inode1, inode2);
+ swap(loff1, loff2);
+ } else if (inode1 == inode2 && loff2 < loff1) {
+ swap(loff1, loff2);
+ }
+ lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
+ lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+}
+
+static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
struct inode *dst, u64 dst_loff)
{
- u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
int ret;
- u64 len = olen;
- if (loff + len == src->i_size)
- len = ALIGN(src->i_size, bs) - loff;
/*
- * For same inode case we don't want our length pushed out past i_size
- * as comparing that data range makes no sense.
- *
- * This effectively means we require aligned extents for the single
- * inode case, whereas the other cases allow an unaligned length so long
- * as it ends at i_size.
- */
- if (dst == src && len != olen)
- return -EINVAL;
-
- /*
- * Lock destination range to serialize with concurrent readpages().
+ * Lock destination range to serialize with concurrent readpages() and
+ * source range to serialize with relocation.
*/
- lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, dst_loff + len - 1);
- ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
- unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, dst_loff + len - 1);
+ btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
+ ret = btrfs_clone(src, dst, loff, len, len, dst_loff, 1);
+ btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
return ret;
}
@@ -3257,21 +3264,10 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
struct inode *dst, u64 dst_loff)
{
int ret;
- int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT;
u64 i, tail_len, chunk_count;
- /* don't make the dst file partly checksummed */
- if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
- (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM))
- return -EINVAL;
-
- if (IS_SWAPFILE(src) || IS_SWAPFILE(dst))
- return -ETXTBSY;
-
tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
- if (chunk_count == 0)
- num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT;
for (i = 0; i < chunk_count; i++) {
ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
@@ -3887,14 +3883,6 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
* be either compressed or non-compressed.
*/
- /* don't make the dst file partly checksummed */
- if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
- (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
- return -EINVAL;
-
- if (IS_SWAPFILE(src) || IS_SWAPFILE(inode))
- return -ETXTBSY;
-
/*
* VFS's generic_remap_file_range_prep() protects us from cloning the
* eof block into the middle of a file, which would result in corruption
@@ -3905,17 +3893,33 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
len = ALIGN(src->i_size, bs) - off;
if (destoff > inode->i_size) {
+ const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
+
ret = btrfs_cont_expand(inode, inode->i_size, destoff);
if (ret)
return ret;
+ /*
+ * We may have truncated the last block if the inode's size is
+ * not sector size aligned, so we need to wait for writeback to
+ * complete before proceeding further, otherwise we can race
+ * with cloning and attempt to increment a reference to an
+ * extent that no longer exists (writeback completed right after
+ * we found the previous extent covering eof and before we
+ * attempted to increment its reference count).
+ */
+ ret = btrfs_wait_ordered_range(inode, wb_start,
+ destoff - wb_start);
+ if (ret)
+ return ret;
}
/*
- * Lock destination range to serialize with concurrent readpages().
+ * Lock destination range to serialize with concurrent readpages() and
+ * source range to serialize with relocation.
*/
- lock_extent(&BTRFS_I(inode)->io_tree, destoff, destoff + len - 1);
+ btrfs_double_extent_lock(src, off, inode, destoff, len);
ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
- unlock_extent(&BTRFS_I(inode)->io_tree, destoff, destoff + len - 1);
+ btrfs_double_extent_unlock(src, off, inode, destoff, len);
/*
* Truncate page cache pages so that future reads will see the cloned
* data immediately and not the previous data.
@@ -3954,6 +3958,13 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
else
btrfs_double_inode_lock(inode_in, inode_out);
+ /* don't make the dst file partly checksummed */
+ if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
+ (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
/*
* Now that the inodes are locked, we need to start writeback ourselves
* and can not rely on the writeback from the VFS's generic helper
@@ -4344,7 +4355,7 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
&sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
0);
- if (copy_to_user(arg, sa, sizeof(*sa)))
+ if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
ret = -EFAULT;
if (!(sa->flags & BTRFS_SCRUB_READONLY))
@@ -4377,7 +4388,7 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info,
ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress);
- if (copy_to_user(arg, sa, sizeof(*sa)))
+ if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
ret = -EFAULT;
kfree(sa);
@@ -4401,7 +4412,7 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info,
ret = btrfs_get_dev_stats(fs_info, sa);
- if (copy_to_user(arg, sa, sizeof(*sa)))
+ if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
ret = -EFAULT;
kfree(sa);
@@ -4447,7 +4458,7 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
break;
}
- if (copy_to_user(arg, p, sizeof(*p)))
+ if ((ret == 0 || ret == -ECANCELED) && copy_to_user(arg, p, sizeof(*p)))
ret = -EFAULT;
out:
kfree(p);
@@ -4753,7 +4764,7 @@ do_balance:
ret = btrfs_balance(fs_info, bctl, bargs);
bctl = NULL;
- if (arg) {
+ if ((ret == 0 || ret == -ECANCELED) && arg) {
if (copy_to_user(arg, bargs, sizeof(*bargs)))
ret = -EFAULT;
}
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 1da768e5ef75..82b84e4daad1 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -14,43 +14,58 @@
static void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
-/*
- * if we currently have a spinning reader or writer lock
- * (indicated by the rw flag) this will bump the count
- * of blocking holders and drop the spinlock.
- */
-void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
+void btrfs_set_lock_blocking_read(struct extent_buffer *eb)
{
/*
- * no lock is required. The lock owner may change if
- * we have a read lock, but it won't change to or away
- * from us. If we have the write lock, we are the owner
- * and it'll never change.
+ * No lock is required. The lock owner may change if we have a read
+ * lock, but it won't change to or away from us. If we have the write
+ * lock, we are the owner and it'll never change.
*/
if (eb->lock_nested && current->pid == eb->lock_owner)
return;
- if (rw == BTRFS_WRITE_LOCK) {
- if (atomic_read(&eb->blocking_writers) == 0) {
- WARN_ON(atomic_read(&eb->spinning_writers) != 1);
- atomic_dec(&eb->spinning_writers);
- btrfs_assert_tree_locked(eb);
- atomic_inc(&eb->blocking_writers);
- write_unlock(&eb->lock);
- }
- } else if (rw == BTRFS_READ_LOCK) {
- btrfs_assert_tree_read_locked(eb);
- atomic_inc(&eb->blocking_readers);
- WARN_ON(atomic_read(&eb->spinning_readers) == 0);
- atomic_dec(&eb->spinning_readers);
- read_unlock(&eb->lock);
+ btrfs_assert_tree_read_locked(eb);
+ atomic_inc(&eb->blocking_readers);
+ WARN_ON(atomic_read(&eb->spinning_readers) == 0);
+ atomic_dec(&eb->spinning_readers);
+ read_unlock(&eb->lock);
+}
+
+void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
+{
+ /*
+ * No lock is required. The lock owner may change if we have a read
+ * lock, but it won't change to or away from us. If we have the write
+ * lock, we are the owner and it'll never change.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner)
+ return;
+ if (atomic_read(&eb->blocking_writers) == 0) {
+ WARN_ON(atomic_read(&eb->spinning_writers) != 1);
+ atomic_dec(&eb->spinning_writers);
+ btrfs_assert_tree_locked(eb);
+ atomic_inc(&eb->blocking_writers);
+ write_unlock(&eb->lock);
}
}
-/*
- * if we currently have a blocking lock, take the spinlock
- * and drop our blocking count
- */
-void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
+void btrfs_clear_lock_blocking_read(struct extent_buffer *eb)
+{
+ /*
+ * No lock is required. The lock owner may change if we have a read
+ * lock, but it won't change to or away from us. If we have the write
+ * lock, we are the owner and it'll never change.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner)
+ return;
+ BUG_ON(atomic_read(&eb->blocking_readers) == 0);
+ read_lock(&eb->lock);
+ atomic_inc(&eb->spinning_readers);
+ /* atomic_dec_and_test implies a barrier */
+ if (atomic_dec_and_test(&eb->blocking_readers))
+ cond_wake_up_nomb(&eb->read_lock_wq);
+}
+
+void btrfs_clear_lock_blocking_write(struct extent_buffer *eb)
{
/*
* no lock is required. The lock owner may change if
@@ -60,23 +75,13 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
*/
if (eb->lock_nested && current->pid == eb->lock_owner)
return;
-
- if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
- BUG_ON(atomic_read(&eb->blocking_writers) != 1);
- write_lock(&eb->lock);
- WARN_ON(atomic_read(&eb->spinning_writers));
- atomic_inc(&eb->spinning_writers);
- /* atomic_dec_and_test implies a barrier */
- if (atomic_dec_and_test(&eb->blocking_writers))
- cond_wake_up_nomb(&eb->write_lock_wq);
- } else if (rw == BTRFS_READ_LOCK_BLOCKING) {
- BUG_ON(atomic_read(&eb->blocking_readers) == 0);
- read_lock(&eb->lock);
- atomic_inc(&eb->spinning_readers);
- /* atomic_dec_and_test implies a barrier */
- if (atomic_dec_and_test(&eb->blocking_readers))
- cond_wake_up_nomb(&eb->read_lock_wq);
- }
+ BUG_ON(atomic_read(&eb->blocking_writers) != 1);
+ write_lock(&eb->lock);
+ WARN_ON(atomic_read(&eb->spinning_writers));
+ atomic_inc(&eb->spinning_writers);
+ /* atomic_dec_and_test implies a barrier */
+ if (atomic_dec_and_test(&eb->blocking_writers))
+ cond_wake_up_nomb(&eb->write_lock_wq);
}
/*
@@ -232,16 +237,9 @@ again:
wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
write_lock(&eb->lock);
- if (atomic_read(&eb->blocking_readers)) {
+ if (atomic_read(&eb->blocking_readers) ||
+ atomic_read(&eb->blocking_writers)) {
write_unlock(&eb->lock);
- wait_event(eb->read_lock_wq,
- atomic_read(&eb->blocking_readers) == 0);
- goto again;
- }
- if (atomic_read(&eb->blocking_writers)) {
- write_unlock(&eb->lock);
- wait_event(eb->write_lock_wq,
- atomic_read(&eb->blocking_writers) == 0);
goto again;
}
WARN_ON(atomic_read(&eb->spinning_writers));
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 29135def468e..595014f64830 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -17,8 +17,10 @@ void btrfs_tree_unlock(struct extent_buffer *eb);
void btrfs_tree_read_lock(struct extent_buffer *eb);
void btrfs_tree_read_unlock(struct extent_buffer *eb);
void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
-void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw);
-void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw);
+void btrfs_set_lock_blocking_read(struct extent_buffer *eb);
+void btrfs_set_lock_blocking_write(struct extent_buffer *eb);
+void btrfs_clear_lock_blocking_read(struct extent_buffer *eb);
+void btrfs_clear_lock_blocking_write(struct extent_buffer *eb);
void btrfs_assert_tree_locked(struct extent_buffer *eb);
int btrfs_try_tree_read_lock(struct extent_buffer *eb);
int btrfs_try_tree_write_lock(struct extent_buffer *eb);
@@ -37,13 +39,4 @@ static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
BUG();
}
-static inline void btrfs_set_lock_blocking(struct extent_buffer *eb)
-{
- btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK);
-}
-
-static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb)
-{
- btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING);
-}
#endif
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 90639140439f..579d53ae256f 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -61,6 +61,28 @@ struct workspace {
struct list_head list;
};
+static struct workspace_manager wsm;
+
+static void lzo_init_workspace_manager(void)
+{
+ btrfs_init_workspace_manager(&wsm, &btrfs_lzo_compress);
+}
+
+static void lzo_cleanup_workspace_manager(void)
+{
+ btrfs_cleanup_workspace_manager(&wsm);
+}
+
+static struct list_head *lzo_get_workspace(unsigned int level)
+{
+ return btrfs_get_workspace(&wsm, level);
+}
+
+static void lzo_put_workspace(struct list_head *ws)
+{
+ btrfs_put_workspace(&wsm, ws);
+}
+
static void lzo_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -71,7 +93,7 @@ static void lzo_free_workspace(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *lzo_alloc_workspace(void)
+static struct list_head *lzo_alloc_workspace(unsigned int level)
{
struct workspace *workspace;
@@ -485,11 +507,16 @@ out:
return ret;
}
-static void lzo_set_level(struct list_head *ws, unsigned int type)
+static unsigned int lzo_set_level(unsigned int level)
{
+ return 0;
}
const struct btrfs_compress_op btrfs_lzo_compress = {
+ .init_workspace_manager = lzo_init_workspace_manager,
+ .cleanup_workspace_manager = lzo_cleanup_workspace_manager,
+ .get_workspace = lzo_get_workspace,
+ .put_workspace = lzo_put_workspace,
.alloc_workspace = lzo_alloc_workspace,
.free_workspace = lzo_free_workspace,
.compress_pages = lzo_compress_pages,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 4e473a998219..c1cd5558a646 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1546,12 +1546,18 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
parent_node = *p;
entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
node);
- if (bytenr < entry->bytenr)
+ if (bytenr < entry->bytenr) {
p = &(*p)->rb_left;
- else if (bytenr > entry->bytenr)
+ } else if (bytenr > entry->bytenr) {
p = &(*p)->rb_right;
- else
+ } else {
+ if (record->data_rsv && !entry->data_rsv) {
+ entry->data_rsv = record->data_rsv;
+ entry->data_rsv_refroot =
+ record->data_rsv_refroot;
+ }
return 1;
+ }
}
rb_link_node(&record->node, parent_node, p);
@@ -1597,7 +1603,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
|| bytenr == 0 || num_bytes == 0)
return 0;
- record = kmalloc(sizeof(*record), gfp_flag);
+ record = kzalloc(sizeof(*record), gfp_flag);
if (!record)
return -ENOMEM;
@@ -1832,7 +1838,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
src_path->nodes[cur_level] = eb;
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
}
@@ -1973,7 +1979,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
dst_path->slots[cur_level] = 0;
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
need_cleanup = true;
}
@@ -2017,86 +2023,30 @@ out:
return ret;
}
-/*
- * Inform qgroup to trace subtree swap used in balance.
- *
- * Unlike btrfs_qgroup_trace_subtree(), this function will only trace
- * new tree blocks whose generation is equal to (or larger than) @last_snapshot.
- *
- * Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and
- * @dst_slot), and find any tree blocks whose generation is at @last_snapshot,
- * and then go down @src_eb (pointed by @src_parent and @src_slot) to find
- * the counterpart of the tree block, then mark both tree blocks as qgroup dirty,
- * and skip all tree blocks whose generation is smaller than last_snapshot.
- *
- * This would skip tons of tree blocks of original btrfs_qgroup_trace_subtree(),
- * which could be the cause of very slow balance if the file tree is large.
- *
- * @src_parent, @src_slot: pointer to src (file tree) eb.
- * @dst_parent, @dst_slot: pointer to dst (reloc tree) eb.
- */
-int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *bg_cache,
- struct extent_buffer *src_parent, int src_slot,
- struct extent_buffer *dst_parent, int dst_slot,
- u64 last_snapshot)
+static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
+ struct extent_buffer *src_eb,
+ struct extent_buffer *dst_eb,
+ u64 last_snapshot, bool trace_leaf)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_path *dst_path = NULL;
- struct btrfs_key first_key;
- struct extent_buffer *src_eb = NULL;
- struct extent_buffer *dst_eb = NULL;
- bool trace_leaf = false;
- u64 child_gen;
- u64 child_bytenr;
int level;
int ret;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
return 0;
- /* Check parameter order */
- if (btrfs_node_ptr_generation(src_parent, src_slot) >
- btrfs_node_ptr_generation(dst_parent, dst_slot)) {
+ /* Wrong parameter order */
+ if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
btrfs_err_rl(fs_info,
"%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
- btrfs_node_ptr_generation(src_parent, src_slot),
- btrfs_node_ptr_generation(dst_parent, dst_slot));
+ btrfs_header_generation(src_eb),
+ btrfs_header_generation(dst_eb));
return -EUCLEAN;
}
- /*
- * Only trace leaf if we're relocating data block groups, this could
- * reduce tons of data extents tracing for meta/sys bg relocation.
- */
- if (bg_cache->flags & BTRFS_BLOCK_GROUP_DATA)
- trace_leaf = true;
- /* Read out real @src_eb, pointed by @src_parent and @src_slot */
- child_bytenr = btrfs_node_blockptr(src_parent, src_slot);
- child_gen = btrfs_node_ptr_generation(src_parent, src_slot);
- btrfs_node_key_to_cpu(src_parent, &first_key, src_slot);
-
- src_eb = read_tree_block(fs_info, child_bytenr, child_gen,
- btrfs_header_level(src_parent) - 1, &first_key);
- if (IS_ERR(src_eb)) {
- ret = PTR_ERR(src_eb);
- goto out;
- }
-
- /* Read out real @dst_eb, pointed by @src_parent and @src_slot */
- child_bytenr = btrfs_node_blockptr(dst_parent, dst_slot);
- child_gen = btrfs_node_ptr_generation(dst_parent, dst_slot);
- btrfs_node_key_to_cpu(dst_parent, &first_key, dst_slot);
-
- dst_eb = read_tree_block(fs_info, child_bytenr, child_gen,
- btrfs_header_level(dst_parent) - 1, &first_key);
- if (IS_ERR(dst_eb)) {
- ret = PTR_ERR(dst_eb);
- goto out;
- }
-
if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
- ret = -EINVAL;
+ ret = -EIO;
goto out;
}
@@ -2106,14 +2056,13 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
ret = -ENOMEM;
goto out;
}
-
/* For dst_path */
extent_buffer_get(dst_eb);
dst_path->nodes[level] = dst_eb;
dst_path->slots[level] = 0;
dst_path->locks[level] = 0;
- /* Do the generation-aware breadth-first search */
+ /* Do the generation aware breadth-first search */
ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
level, last_snapshot, trace_leaf);
if (ret < 0)
@@ -2121,8 +2070,6 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
ret = 0;
out:
- free_extent_buffer(src_eb);
- free_extent_buffer(dst_eb);
btrfs_free_path(dst_path);
if (ret < 0)
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
@@ -2207,7 +2154,7 @@ walk_down:
path->slots[level] = 0;
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
@@ -2576,6 +2523,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
goto cleanup;
}
+ /* Free the reserved data space */
+ btrfs_qgroup_free_refroot(fs_info,
+ record->data_rsv_refroot,
+ record->data_rsv,
+ BTRFS_QGROUP_RSV_DATA);
/*
* Use SEQ_LAST as time_seq to do special search, which
* doesn't lock tree or delayed_refs and search current
@@ -2842,16 +2794,15 @@ out:
/*
* Two limits to commit transaction in advance.
*
- * For RATIO, it will be 1/RATIO of the remaining limit
- * (excluding data and prealloc meta) as threshold.
+ * For RATIO, it will be 1/RATIO of the remaining limit as threshold.
* For SIZE, it will be in byte unit as threshold.
*/
-#define QGROUP_PERTRANS_RATIO 32
-#define QGROUP_PERTRANS_SIZE SZ_32M
+#define QGROUP_FREE_RATIO 32
+#define QGROUP_FREE_SIZE SZ_32M
static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
const struct btrfs_qgroup *qg, u64 num_bytes)
{
- u64 limit;
+ u64 free;
u64 threshold;
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
@@ -2870,20 +2821,21 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
*/
if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER |
BTRFS_QGROUP_LIMIT_MAX_EXCL))) {
- if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL)
- limit = qg->max_excl;
- else
- limit = qg->max_rfer;
- threshold = (limit - qg->rsv.values[BTRFS_QGROUP_RSV_DATA] -
- qg->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC]) /
- QGROUP_PERTRANS_RATIO;
- threshold = min_t(u64, threshold, QGROUP_PERTRANS_SIZE);
+ if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
+ free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl;
+ threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO,
+ QGROUP_FREE_SIZE);
+ } else {
+ free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer;
+ threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO,
+ QGROUP_FREE_SIZE);
+ }
/*
* Use transaction_kthread to commit transaction, so we no
* longer need to bother nested transaction nor lock context.
*/
- if (qg->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > threshold)
+ if (free < threshold)
btrfs_commit_transaction_locksafe(fs_info);
}
@@ -2959,7 +2911,6 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
qg = unode_aux_to_qgroup(unode);
- trace_qgroup_update_reserve(fs_info, qg, num_bytes, type);
qgroup_rsv_add(fs_info, qg, num_bytes, type);
}
@@ -3026,7 +2977,6 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
qg = unode_aux_to_qgroup(unode);
- trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes, type);
qgroup_rsv_release(fs_info, qg, num_bytes, type);
list_for_each_entry(glist, &qg->groups, next_group) {
@@ -3783,3 +3733,241 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
}
extent_changeset_release(&changeset);
}
+
+void btrfs_qgroup_init_swapped_blocks(
+ struct btrfs_qgroup_swapped_blocks *swapped_blocks)
+{
+ int i;
+
+ spin_lock_init(&swapped_blocks->lock);
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+ swapped_blocks->blocks[i] = RB_ROOT;
+ swapped_blocks->swapped = false;
+}
+
+/*
+ * Delete all swapped blocks record of @root.
+ * Every record here means we skipped a full subtree scan for qgroup.
+ *
+ * Gets called when committing one transaction.
+ */
+void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
+{
+ struct btrfs_qgroup_swapped_blocks *swapped_blocks;
+ int i;
+
+ swapped_blocks = &root->swapped_blocks;
+
+ spin_lock(&swapped_blocks->lock);
+ if (!swapped_blocks->swapped)
+ goto out;
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+ struct rb_root *cur_root = &swapped_blocks->blocks[i];
+ struct btrfs_qgroup_swapped_block *entry;
+ struct btrfs_qgroup_swapped_block *next;
+
+ rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
+ node)
+ kfree(entry);
+ swapped_blocks->blocks[i] = RB_ROOT;
+ }
+ swapped_blocks->swapped = false;
+out:
+ spin_unlock(&swapped_blocks->lock);
+}
+
+/*
+ * Add subtree roots record into @subvol_root.
+ *
+ * @subvol_root: tree root of the subvolume tree get swapped
+ * @bg: block group under balance
+ * @subvol_parent/slot: pointer to the subtree root in subvolume tree
+ * @reloc_parent/slot: pointer to the subtree root in reloc tree
+ * BOTH POINTERS ARE BEFORE TREE SWAP
+ * @last_snapshot: last snapshot generation of the subvolume tree
+ */
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
+ struct btrfs_root *subvol_root,
+ struct btrfs_block_group_cache *bg,
+ struct extent_buffer *subvol_parent, int subvol_slot,
+ struct extent_buffer *reloc_parent, int reloc_slot,
+ u64 last_snapshot)
+{
+ struct btrfs_fs_info *fs_info = subvol_root->fs_info;
+ struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
+ struct btrfs_qgroup_swapped_block *block;
+ struct rb_node **cur;
+ struct rb_node *parent = NULL;
+ int level = btrfs_header_level(subvol_parent) - 1;
+ int ret = 0;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return 0;
+
+ if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
+ btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
+ btrfs_err_rl(fs_info,
+ "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
+ __func__,
+ btrfs_node_ptr_generation(subvol_parent, subvol_slot),
+ btrfs_node_ptr_generation(reloc_parent, reloc_slot));
+ return -EUCLEAN;
+ }
+
+ block = kmalloc(sizeof(*block), GFP_NOFS);
+ if (!block) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * @reloc_parent/slot is still before swap, while @block is going to
+ * record the bytenr after swap, so we do the swap here.
+ */
+ block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
+ block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
+ reloc_slot);
+ block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
+ block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
+ subvol_slot);
+ block->last_snapshot = last_snapshot;
+ block->level = level;
+ if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
+ block->trace_leaf = true;
+ else
+ block->trace_leaf = false;
+ btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
+
+ /* Insert @block into @blocks */
+ spin_lock(&blocks->lock);
+ cur = &blocks->blocks[level].rb_node;
+ while (*cur) {
+ struct btrfs_qgroup_swapped_block *entry;
+
+ parent = *cur;
+ entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
+ node);
+
+ if (entry->subvol_bytenr < block->subvol_bytenr) {
+ cur = &(*cur)->rb_left;
+ } else if (entry->subvol_bytenr > block->subvol_bytenr) {
+ cur = &(*cur)->rb_right;
+ } else {
+ if (entry->subvol_generation !=
+ block->subvol_generation ||
+ entry->reloc_bytenr != block->reloc_bytenr ||
+ entry->reloc_generation !=
+ block->reloc_generation) {
+ /*
+ * Duplicated but mismatch entry found.
+ * Shouldn't happen.
+ *
+ * Marking qgroup inconsistent should be enough
+ * for end users.
+ */
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ ret = -EEXIST;
+ }
+ kfree(block);
+ goto out_unlock;
+ }
+ }
+ rb_link_node(&block->node, parent, cur);
+ rb_insert_color(&block->node, &blocks->blocks[level]);
+ blocks->swapped = true;
+out_unlock:
+ spin_unlock(&blocks->lock);
+out:
+ if (ret < 0)
+ fs_info->qgroup_flags |=
+ BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ return ret;
+}
+
+/*
+ * Check if the tree block is a subtree root, and if so do the needed
+ * delayed subtree trace for qgroup.
+ *
+ * This is called during btrfs_cow_block().
+ */
+int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *subvol_eb)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
+ struct btrfs_qgroup_swapped_block *block;
+ struct extent_buffer *reloc_eb = NULL;
+ struct rb_node *node;
+ bool found = false;
+ bool swapped = false;
+ int level = btrfs_header_level(subvol_eb);
+ int ret = 0;
+ int i;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return 0;
+ if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
+ return 0;
+
+ spin_lock(&blocks->lock);
+ if (!blocks->swapped) {
+ spin_unlock(&blocks->lock);
+ return 0;
+ }
+ node = blocks->blocks[level].rb_node;
+
+ while (node) {
+ block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
+ if (block->subvol_bytenr < subvol_eb->start) {
+ node = node->rb_left;
+ } else if (block->subvol_bytenr > subvol_eb->start) {
+ node = node->rb_right;
+ } else {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ spin_unlock(&blocks->lock);
+ goto out;
+ }
+ /* Found one, remove it from @blocks first and update blocks->swapped */
+ rb_erase(&block->node, &blocks->blocks[level]);
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+ if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
+ swapped = true;
+ break;
+ }
+ }
+ blocks->swapped = swapped;
+ spin_unlock(&blocks->lock);
+
+ /* Read out reloc subtree root */
+ reloc_eb = read_tree_block(fs_info, block->reloc_bytenr,
+ block->reloc_generation, block->level,
+ &block->first_key);
+ if (IS_ERR(reloc_eb)) {
+ ret = PTR_ERR(reloc_eb);
+ reloc_eb = NULL;
+ goto free_out;
+ }
+ if (!extent_buffer_uptodate(reloc_eb)) {
+ ret = -EIO;
+ goto free_out;
+ }
+
+ ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
+ block->last_snapshot, block->trace_leaf);
+free_out:
+ kfree(block);
+ free_extent_buffer(reloc_eb);
+out:
+ if (ret < 0) {
+ btrfs_err_rl(fs_info,
+ "failed to account subtree at bytenr %llu: %d",
+ subvol_eb->start, ret);
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ }
+ return ret;
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 20c6bd5fa701..46ba7bd2961c 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -6,6 +6,8 @@
#ifndef BTRFS_QGROUP_H
#define BTRFS_QGROUP_H
+#include <linux/spinlock.h>
+#include <linux/rbtree.h>
#include "ulist.h"
#include "delayed-ref.h"
@@ -38,6 +40,66 @@
*/
/*
+ * Special performance optimization for balance.
+ *
+ * For balance, we need to swap subtree of subvolume and reloc trees.
+ * In theory, we need to trace all subtree blocks of both subvolume and reloc
+ * trees, since their owner has changed during such swap.
+ *
+ * However since balance has ensured that both subtrees are containing the
+ * same contents and have the same tree structures, such swap won't cause
+ * qgroup number change.
+ *
+ * But there is a race window between subtree swap and transaction commit,
+ * during that window, if we increase/decrease tree level or merge/split tree
+ * blocks, we still need to trace the original subtrees.
+ *
+ * So for balance, we use a delayed subtree tracing, whose workflow is:
+ *
+ * 1) Record the subtree root block get swapped.
+ *
+ * During subtree swap:
+ * O = Old tree blocks
+ * N = New tree blocks
+ * reloc tree subvolume tree X
+ * Root Root
+ * / \ / \
+ * NA OB OA OB
+ * / | | \ / | | \
+ * NC ND OE OF OC OD OE OF
+ *
+ * In this case, NA and OA are going to be swapped, record (NA, OA) into
+ * subvolume tree X.
+ *
+ * 2) After subtree swap.
+ * reloc tree subvolume tree X
+ * Root Root
+ * / \ / \
+ * OA OB NA OB
+ * / | | \ / | | \
+ * OC OD OE OF NC ND OE OF
+ *
+ * 3a) COW happens for OB
+ * If we are going to COW tree block OB, we check OB's bytenr against
+ * tree X's swapped_blocks structure.
+ * If it doesn't fit any, nothing will happen.
+ *
+ * 3b) COW happens for NA
+ * Check NA's bytenr against tree X's swapped_blocks, and get a hit.
+ * Then we do subtree scan on both subtrees OA and NA.
+ * Resulting 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND).
+ *
+ * Then no matter what we do to subvolume tree X, qgroup numbers will
+ * still be correct.
+ * Then NA's record gets removed from X's swapped_blocks.
+ *
+ * 4) Transaction commit
+ * Any record in X's swapped_blocks gets removed, since there is no
+ * modification to the swapped subtrees, no need to trigger heavy qgroup
+ * subtree rescan for them.
+ */
+
+/*
* Record a dirty extent, and info qgroup to update quota on it
* TODO: Use kmem cache to alloc it.
*/
@@ -45,9 +107,38 @@ struct btrfs_qgroup_extent_record {
struct rb_node node;
u64 bytenr;
u64 num_bytes;
+
+ /*
+ * For qgroup reserved data space freeing.
+ *
+ * @data_rsv_refroot and @data_rsv will be recorded after
+ * BTRFS_ADD_DELAYED_EXTENT is called.
+ * And will be used to free reserved qgroup space at
+ * transaction commit time.
+ */
+ u32 data_rsv; /* reserved data space needs to be freed */
+ u64 data_rsv_refroot; /* which root the reserved data belongs to */
struct ulist *old_roots;
};
+struct btrfs_qgroup_swapped_block {
+ struct rb_node node;
+
+ int level;
+ bool trace_leaf;
+
+ /* bytenr/generation of the tree block in subvolume tree after swap */
+ u64 subvol_bytenr;
+ u64 subvol_generation;
+
+ /* bytenr/generation of the tree block in reloc tree after swap */
+ u64 reloc_bytenr;
+ u64 reloc_generation;
+
+ u64 last_snapshot;
+ struct btrfs_key first_key;
+};
+
/*
* Qgroup reservation types:
*
@@ -236,12 +327,6 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *root_eb,
u64 root_gen, int root_level);
-
-int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *bg_cache,
- struct extent_buffer *src_parent, int src_slot,
- struct extent_buffer *dst_parent, int dst_slot,
- u64 last_snapshot);
int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes, struct ulist *old_roots,
struct ulist *new_roots);
@@ -252,15 +337,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes,
enum btrfs_qgroup_rsv_type type);
-static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
- u64 ref_root, u64 num_bytes)
-{
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
- return;
- trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
- btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes,
- BTRFS_QGROUP_RSV_DATA);
-}
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
@@ -325,4 +401,18 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
void btrfs_qgroup_check_reserved_leak(struct inode *inode);
+/* btrfs_qgroup_swapped_blocks related functions */
+void btrfs_qgroup_init_swapped_blocks(
+ struct btrfs_qgroup_swapped_blocks *swapped_blocks);
+
+void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root);
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
+ struct btrfs_root *subvol_root,
+ struct btrfs_block_group_cache *bg,
+ struct extent_buffer *subvol_parent, int subvol_slot,
+ struct extent_buffer *reloc_parent, int reloc_slot,
+ u64 last_snapshot);
+int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *eb);
+
#endif
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index e74455eb42f9..1869ba8e5981 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1443,10 +1443,11 @@ static void set_bio_pages_uptodate(struct bio *bio)
{
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
SetPageUptodate(bvec->bv_page);
}
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index c3557c12656b..d09b6cdb785a 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -583,7 +583,7 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
return -EIO;
}
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
path->nodes[level-1] = eb;
path->slots[level-1] = 0;
path->locks[level-1] = BTRFS_READ_LOCK_BLOCKING;
@@ -987,7 +987,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
return -ENOMEM;
eb = btrfs_read_lock_root_node(fs_info->extent_root);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
level = btrfs_header_level(eb);
path->nodes[level] = eb;
path->slots[level] = 0;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 272b287f8cf0..ddf028509931 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -162,6 +162,8 @@ struct reloc_control {
struct mapping_tree reloc_root_tree;
/* list of reloc trees */
struct list_head reloc_roots;
+ /* list of subvolume trees that get relocated */
+ struct list_head dirty_subvol_roots;
/* size of metadata reservation for merging reloc trees */
u64 merging_rsv_size;
/* size of relocated tree nodes */
@@ -1467,15 +1469,17 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root_item *root_item;
int ret;
- if (!root->reloc_root)
+ if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state) ||
+ !root->reloc_root)
goto out;
reloc_root = root->reloc_root;
root_item = &reloc_root->root_item;
+ /* root->reloc_root will stay until current relocation finished */
if (fs_info->reloc_ctl->merge_reloc_tree &&
btrfs_root_refs(root_item) == 0) {
- root->reloc_root = NULL;
+ set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
__del_reloc_root(reloc_root);
}
@@ -1773,7 +1777,7 @@ again:
btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
eb = btrfs_lock_root_node(dest);
- btrfs_set_lock_blocking(eb);
+ btrfs_set_lock_blocking_write(eb);
level = btrfs_header_level(eb);
if (level < lowest_level) {
@@ -1786,7 +1790,7 @@ again:
ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
BUG_ON(ret);
}
- btrfs_set_lock_blocking(eb);
+ btrfs_set_lock_blocking_write(eb);
if (next_key) {
next_key->objectid = (u64)-1;
@@ -1802,6 +1806,8 @@ again:
BUG_ON(level < lowest_level);
ret = btrfs_bin_search(parent, &key, level, &slot);
+ if (ret < 0)
+ break;
if (ret && slot > 0)
slot--;
@@ -1852,7 +1858,7 @@ again:
slot, &eb);
BUG_ON(ret);
}
- btrfs_set_lock_blocking(eb);
+ btrfs_set_lock_blocking_write(eb);
btrfs_tree_unlock(parent);
free_extent_buffer(parent);
@@ -1885,15 +1891,18 @@ again:
* If not traced, we will leak data numbers
* 2) Fs subtree
* If not traced, we will double count old data
- * and tree block numbers, if current trans doesn't free
- * data reloc tree inode.
+ *
+ * We don't scan the subtree right now, but only record
+ * the swapped tree blocks.
+ * The real subtree rescan is delayed until we have new
+ * CoW on the subtree root node before transaction commit.
*/
- ret = btrfs_qgroup_trace_subtree_swap(trans, rc->block_group,
- parent, slot, path->nodes[level],
- path->slots[level], last_snapshot);
+ ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
+ rc->block_group, parent, slot,
+ path->nodes[level], path->slots[level],
+ last_snapshot);
if (ret < 0)
break;
-
/*
* swap blocks in fs tree and reloc tree.
*/
@@ -2121,6 +2130,58 @@ static int find_next_key(struct btrfs_path *path, int level,
}
/*
+ * Insert current subvolume into reloc_control::dirty_subvol_roots
+ */
+static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_root *root)
+{
+ struct btrfs_root *reloc_root = root->reloc_root;
+ struct btrfs_root_item *reloc_root_item;
+
+ /* @root must be a subvolume tree root with a valid reloc tree */
+ ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+ ASSERT(reloc_root);
+
+ reloc_root_item = &reloc_root->root_item;
+ memset(&reloc_root_item->drop_progress, 0,
+ sizeof(reloc_root_item->drop_progress));
+ reloc_root_item->drop_level = 0;
+ btrfs_set_root_refs(reloc_root_item, 0);
+ btrfs_update_reloc_root(trans, root);
+
+ if (list_empty(&root->reloc_dirty_list)) {
+ btrfs_grab_fs_root(root);
+ list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
+ }
+}
+
+static int clean_dirty_subvols(struct reloc_control *rc)
+{
+ struct btrfs_root *root;
+ struct btrfs_root *next;
+ int ret = 0;
+
+ list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
+ reloc_dirty_list) {
+ struct btrfs_root *reloc_root = root->reloc_root;
+
+ clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
+ list_del_init(&root->reloc_dirty_list);
+ root->reloc_root = NULL;
+ if (reloc_root) {
+ int ret2;
+
+ ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1);
+ if (ret2 < 0 && !ret)
+ ret = ret2;
+ }
+ btrfs_put_fs_root(root);
+ }
+ return ret;
+}
+
+/*
* merge the relocated tree blocks in reloc tree with corresponding
* fs tree.
*/
@@ -2128,7 +2189,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
- LIST_HEAD(inode_list);
struct btrfs_key key;
struct btrfs_key next_key;
struct btrfs_trans_handle *trans = NULL;
@@ -2259,13 +2319,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
out:
btrfs_free_path(path);
- if (err == 0) {
- memset(&root_item->drop_progress, 0,
- sizeof(root_item->drop_progress));
- root_item->drop_level = 0;
- btrfs_set_root_refs(root_item, 0);
- btrfs_update_reloc_root(trans, root);
- }
+ if (err == 0)
+ insert_dirty_subvol(trans, rc, root);
if (trans)
btrfs_end_transaction_throttle(trans);
@@ -2410,14 +2465,6 @@ again:
} else {
list_del_init(&reloc_root->root_list);
}
-
- ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
- if (ret < 0) {
- if (list_empty(&reloc_root->root_list))
- list_add_tail(&reloc_root->root_list,
- &reloc_roots);
- goto out;
- }
}
if (found) {
@@ -2685,6 +2732,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
if (!lowest) {
ret = btrfs_bin_search(upper->eb, key,
upper->level, &slot);
+ if (ret < 0) {
+ err = ret;
+ goto next;
+ }
BUG_ON(ret);
bytenr = btrfs_node_blockptr(upper->eb, slot);
if (node->eb->start == bytenr)
@@ -2720,6 +2771,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
} else {
ret = btrfs_bin_search(upper->eb, key, upper->level,
&slot);
+ if (ret < 0) {
+ err = ret;
+ goto next;
+ }
BUG_ON(ret);
}
@@ -2752,7 +2807,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
goto next;
}
btrfs_tree_lock(eb);
- btrfs_set_lock_blocking(eb);
+ btrfs_set_lock_blocking_write(eb);
if (!node->eb) {
ret = btrfs_cow_block(trans, root, eb, upper->eb,
@@ -4079,6 +4134,9 @@ restart:
goto out_free;
}
btrfs_commit_transaction(trans);
+ ret = clean_dirty_subvols(rc);
+ if (ret < 0 && !err)
+ err = ret;
out_free:
btrfs_free_block_rsv(fs_info, rc->block_rsv);
btrfs_free_path(path);
@@ -4173,6 +4231,7 @@ static struct reloc_control *alloc_reloc_control(void)
return NULL;
INIT_LIST_HEAD(&rc->reloc_roots);
+ INIT_LIST_HEAD(&rc->dirty_subvol_roots);
backref_cache_init(&rc->backref_cache);
mapping_tree_init(&rc->reloc_root_tree);
extent_io_tree_init(&rc->processed_blocks, NULL);
@@ -4468,6 +4527,10 @@ int btrfs_recover_relocation(struct btrfs_root *root)
goto out_free;
}
err = btrfs_commit_transaction(trans);
+
+ ret = clean_dirty_subvols(rc);
+ if (ret < 0 && !err)
+ err = ret;
out_free:
kfree(rc);
out:
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 65bda0682928..0d2b957ca3a3 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -21,12 +21,12 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
struct btrfs_root_item *item)
{
uuid_le uuid;
- int len;
+ u32 len;
int need_reset = 0;
len = btrfs_item_size_nr(eb, slot);
read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot),
- min_t(int, len, (int)sizeof(*item)));
+ min_t(u32, len, sizeof(*item)));
if (len < sizeof(*item))
need_reset = 1;
if (!need_reset && btrfs_root_generation(item)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 6dcd36d7b849..a99588536c79 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -584,6 +584,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
sctx->curr = -1;
sctx->fs_info = fs_info;
+ INIT_LIST_HEAD(&sctx->csum_list);
for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
struct scrub_bio *sbio;
@@ -608,7 +609,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
atomic_set(&sctx->workers_pending, 0);
atomic_set(&sctx->cancel_req, 0);
sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
- INIT_LIST_HEAD(&sctx->csum_list);
spin_lock_init(&sctx->list_lock);
spin_lock_init(&sctx->stat_lock);
@@ -3741,25 +3741,33 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
int max_active = fs_info->thread_pool_size;
- if (fs_info->scrub_workers_refcnt == 0) {
+ lockdep_assert_held(&fs_info->scrub_lock);
+
+ if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
+ ASSERT(fs_info->scrub_workers == NULL);
fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
flags, is_dev_replace ? 1 : max_active, 4);
if (!fs_info->scrub_workers)
goto fail_scrub_workers;
+ ASSERT(fs_info->scrub_wr_completion_workers == NULL);
fs_info->scrub_wr_completion_workers =
btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
max_active, 2);
if (!fs_info->scrub_wr_completion_workers)
goto fail_scrub_wr_completion_workers;
+ ASSERT(fs_info->scrub_parity_workers == NULL);
fs_info->scrub_parity_workers =
btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
max_active, 2);
if (!fs_info->scrub_parity_workers)
goto fail_scrub_parity_workers;
+
+ refcount_set(&fs_info->scrub_workers_refcnt, 1);
+ } else {
+ refcount_inc(&fs_info->scrub_workers_refcnt);
}
- ++fs_info->scrub_workers_refcnt;
return 0;
fail_scrub_parity_workers:
@@ -3770,16 +3778,6 @@ fail_scrub_workers:
return -ENOMEM;
}
-static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
-{
- if (--fs_info->scrub_workers_refcnt == 0) {
- btrfs_destroy_workqueue(fs_info->scrub_workers);
- btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
- btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
- }
- WARN_ON(fs_info->scrub_workers_refcnt < 0);
-}
-
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
int readonly, int is_dev_replace)
@@ -3788,6 +3786,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
int ret;
struct btrfs_device *dev;
unsigned int nofs_flag;
+ struct btrfs_workqueue *scrub_workers = NULL;
+ struct btrfs_workqueue *scrub_wr_comp = NULL;
+ struct btrfs_workqueue *scrub_parity = NULL;
if (btrfs_fs_closing(fs_info))
return -EINVAL;
@@ -3835,7 +3836,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
return PTR_ERR(sctx);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
!is_dev_replace)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -3903,6 +3904,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
*/
nofs_flag = memalloc_nofs_save();
if (!is_dev_replace) {
+ btrfs_info(fs_info, "scrub: started on devid %llu", devid);
/*
* by holding device list mutex, we can
* kick off writing super in log tree sync.
@@ -3925,11 +3927,26 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (progress)
memcpy(progress, &sctx->stat, sizeof(*progress));
+ if (!is_dev_replace)
+ btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
+ ret ? "not finished" : "finished", devid, ret);
+
mutex_lock(&fs_info->scrub_lock);
dev->scrub_ctx = NULL;
- scrub_workers_put(fs_info);
+ if (refcount_dec_and_test(&fs_info->scrub_workers_refcnt)) {
+ scrub_workers = fs_info->scrub_workers;
+ scrub_wr_comp = fs_info->scrub_wr_completion_workers;
+ scrub_parity = fs_info->scrub_parity_workers;
+
+ fs_info->scrub_workers = NULL;
+ fs_info->scrub_wr_completion_workers = NULL;
+ fs_info->scrub_parity_workers = NULL;
+ }
mutex_unlock(&fs_info->scrub_lock);
+ btrfs_destroy_workqueue(scrub_workers);
+ btrfs_destroy_workqueue(scrub_wr_comp);
+ btrfs_destroy_workqueue(scrub_parity);
scrub_put_ctx(sctx);
return ret;
@@ -4012,7 +4029,7 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
struct scrub_ctx *sctx = NULL;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
if (dev)
sctx = dev->scrub_ctx;
if (sctx)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c5586ffd1426..120e4340792a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -529,7 +529,9 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
if (token != Opt_compress &&
token != Opt_compress_force)
info->compress_level =
- btrfs_compress_str2level(args[0].from);
+ btrfs_compress_str2level(
+ BTRFS_COMPRESS_ZLIB,
+ args[0].from + 4);
btrfs_set_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
@@ -542,9 +544,13 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
btrfs_clear_opt(info->mount_opt, NODATASUM);
btrfs_set_fs_incompat(info, COMPRESS_LZO);
no_compress = 0;
- } else if (strcmp(args[0].from, "zstd") == 0) {
+ } else if (strncmp(args[0].from, "zstd", 4) == 0) {
compress_type = "zstd";
info->compress_type = BTRFS_COMPRESS_ZSTD;
+ info->compress_level =
+ btrfs_compress_str2level(
+ BTRFS_COMPRESS_ZSTD,
+ args[0].from + 4);
btrfs_set_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
@@ -1621,6 +1627,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
flags | SB_RDONLY, device_name, data);
if (IS_ERR(mnt_root)) {
root = ERR_CAST(mnt_root);
+ kfree(subvol_name);
goto out;
}
@@ -1630,12 +1637,14 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
if (error < 0) {
root = ERR_PTR(error);
mntput(mnt_root);
+ kfree(subvol_name);
goto out;
}
}
}
if (IS_ERR(mnt_root)) {
root = ERR_CAST(mnt_root);
+ kfree(subvol_name);
goto out;
}
@@ -2187,6 +2196,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
ret = PTR_ERR_OR_ZERO(device);
mutex_unlock(&uuid_mutex);
break;
+ case BTRFS_IOC_FORGET_DEV:
+ ret = btrfs_forget_devices(vol->name);
+ break;
case BTRFS_IOC_DEVICES_READY:
mutex_lock(&uuid_mutex);
device = btrfs_scan_one_device(vol->name, FMODE_READ,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 127fa1535f58..acdad6d658f5 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -122,6 +122,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans)
if (is_fstree(root->root_key.objectid))
btrfs_unpin_free_ino(root);
clear_btree_io_tree(&root->dirty_log_pages);
+ btrfs_qgroup_clean_swapped_blocks(root);
}
/* We can free old roots now. */
@@ -845,19 +846,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
btrfs_trans_release_metadata(trans);
trans->block_rsv = NULL;
- if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans);
+ btrfs_create_pending_block_groups(trans);
btrfs_trans_release_chunk_metadata(trans);
- if (lock && should_end_transaction(trans) &&
- READ_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
- spin_lock(&info->trans_lock);
- if (cur_trans->state == TRANS_STATE_RUNNING)
- cur_trans->state = TRANS_STATE_BLOCKED;
- spin_unlock(&info->trans_lock);
- }
-
if (lock && READ_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
if (throttle)
return btrfs_commit_transaction(trans);
@@ -1540,7 +1532,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
- btrfs_set_lock_blocking(old);
+ btrfs_set_lock_blocking_write(old);
ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
/* clean up in any case */
@@ -1879,6 +1871,21 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
kmem_cache_free(btrfs_trans_handle_cachep, trans);
}
+/*
+ * Release reserved delayed ref space of all pending block groups of the
+ * transaction and remove them from the list
+ */
+static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_block_group_cache *block_group, *tmp;
+
+ list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
+ btrfs_delayed_refs_rsv_release(fs_info, 1);
+ list_del_init(&block_group->bg_list);
+ }
+}
+
static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
{
/*
@@ -1936,8 +1943,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
cur_trans->delayed_refs.flushing = 1;
smp_wmb();
- if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans);
+ btrfs_create_pending_block_groups(trans);
ret = btrfs_run_delayed_refs(trans, 0);
if (ret) {
@@ -2270,6 +2276,7 @@ scrub_continue:
btrfs_scrub_continue(fs_info);
cleanup_transaction:
btrfs_trans_release_metadata(trans);
+ btrfs_cleanup_pending_block_groups(trans);
btrfs_trans_release_chunk_metadata(trans);
trans->block_rsv = NULL;
btrfs_warn(fs_info, "Skipping commit of aborted transaction.");
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 3c0987ab587d..5f9e2dd413af 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -52,7 +52,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
u32 nritems;
root_node = btrfs_lock_root_node(root);
- btrfs_set_lock_blocking(root_node);
+ btrfs_set_lock_blocking_write(root_node);
nritems = btrfs_header_nritems(root_node);
root->defrag_max.objectid = 0;
/* from above we know this is not a leaf */
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index ac232b3d6d7e..f06454a55e00 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -27,6 +27,7 @@
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1
#define LOG_OTHER_INODE 2
+#define LOG_OTHER_INODE_ALL 3
/*
* directory trouble cases
@@ -1330,6 +1331,67 @@ out:
return ret;
}
+static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct inode *dir, struct inode *inode, const char *name,
+ int namelen, u64 ref_index)
+{
+ struct btrfs_dir_item *dir_item;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct inode *other_inode = NULL;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ dir_item = btrfs_lookup_dir_item(NULL, root, path,
+ btrfs_ino(BTRFS_I(dir)),
+ name, namelen, 0);
+ if (!dir_item) {
+ btrfs_release_path(path);
+ goto add_link;
+ } else if (IS_ERR(dir_item)) {
+ ret = PTR_ERR(dir_item);
+ goto out;
+ }
+
+ /*
+ * Our inode's dentry collides with the dentry of another inode which is
+ * in the log but not yet processed since it has a higher inode number.
+ * So delete that other dentry.
+ */
+ btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
+ btrfs_release_path(path);
+ other_inode = read_one_inode(root, key.objectid);
+ if (!other_inode) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
+ name, namelen);
+ if (ret)
+ goto out;
+ /*
+ * If we dropped the link count to 0, bump it so that later the iput()
+ * on the inode will not free it. We will fixup the link count later.
+ */
+ if (other_inode->i_nlink == 0)
+ inc_nlink(other_inode);
+
+ ret = btrfs_run_delayed_items(trans);
+ if (ret)
+ goto out;
+add_link:
+ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
+ name, namelen, 0, ref_index);
+out:
+ iput(other_inode);
+ btrfs_free_path(path);
+
+ return ret;
+}
+
/*
* replay one inode back reference item found in the log tree.
* eb, slot and key refer to the buffer and key found in the log tree.
@@ -1466,9 +1528,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
goto out;
/* insert our name */
- ret = btrfs_add_link(trans, BTRFS_I(dir),
- BTRFS_I(inode),
- name, namelen, 0, ref_index);
+ ret = add_link(trans, root, dir, inode, name, namelen,
+ ref_index);
if (ret)
goto out;
@@ -2663,7 +2724,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
+ btrfs_set_lock_blocking_write(next);
clean_tree_block(fs_info, next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
@@ -2747,7 +2808,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
+ btrfs_set_lock_blocking_write(next);
clean_tree_block(fs_info, next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
@@ -2829,7 +2890,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
+ btrfs_set_lock_blocking_write(next);
clean_tree_block(fs_info, next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
@@ -3706,6 +3767,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
found_key.type = 0;
ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
&start_slot);
+ if (ret < 0)
+ break;
ret = btrfs_del_items(trans, log, path, start_slot,
path->slots[0] - start_slot + 1);
@@ -4717,7 +4780,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
const int slot,
const struct btrfs_key *key,
struct btrfs_inode *inode,
- u64 *other_ino)
+ u64 *other_ino, u64 *other_parent)
{
int ret;
struct btrfs_path *search_path;
@@ -4780,8 +4843,13 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
btrfs_dir_item_key_to_cpu(search_path->nodes[0],
di, &di_key);
if (di_key.type == BTRFS_INODE_ITEM_KEY) {
- ret = 1;
- *other_ino = di_key.objectid;
+ if (di_key.objectid != key->objectid) {
+ ret = 1;
+ *other_ino = di_key.objectid;
+ *other_parent = parent;
+ } else {
+ ret = 0;
+ }
} else {
ret = -EAGAIN;
}
@@ -4801,6 +4869,144 @@ out:
return ret;
}
+struct btrfs_ino_list {
+ u64 ino;
+ u64 parent;
+ struct list_head list;
+};
+
+static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_log_ctx *ctx,
+ u64 ino, u64 parent)
+{
+ struct btrfs_ino_list *ino_elem;
+ LIST_HEAD(inode_list);
+ int ret = 0;
+
+ ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+ if (!ino_elem)
+ return -ENOMEM;
+ ino_elem->ino = ino;
+ ino_elem->parent = parent;
+ list_add_tail(&ino_elem->list, &inode_list);
+
+ while (!list_empty(&inode_list)) {
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_key key;
+ struct inode *inode;
+
+ ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
+ list);
+ ino = ino_elem->ino;
+ parent = ino_elem->parent;
+ list_del(&ino_elem->list);
+ kfree(ino_elem);
+ if (ret)
+ continue;
+
+ btrfs_release_path(path);
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ /*
+ * If the other inode that had a conflicting dir entry was
+ * deleted in the current transaction, we need to log its parent
+ * directory.
+ */
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ if (ret == -ENOENT) {
+ key.objectid = parent;
+ inode = btrfs_iget(fs_info->sb, &key, root,
+ NULL);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ } else {
+ ret = btrfs_log_inode(trans, root,
+ BTRFS_I(inode),
+ LOG_OTHER_INODE_ALL,
+ 0, LLONG_MAX, ctx);
+ iput(inode);
+ }
+ }
+ continue;
+ }
+ /*
+ * We are safe logging the other inode without acquiring its
+ * lock as long as we log with the LOG_INODE_EXISTS mode. We
+ * are safe against concurrent renames of the other inode as
+ * well because during a rename we pin the log and update the
+ * log with the new name before we unpin it.
+ */
+ ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
+ LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
+ if (ret) {
+ iput(inode);
+ continue;
+ }
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ iput(inode);
+ continue;
+ }
+
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ u64 other_ino = 0;
+ u64 other_parent = 0;
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ break;
+ } else if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != ino ||
+ (key.type != BTRFS_INODE_REF_KEY &&
+ key.type != BTRFS_INODE_EXTREF_KEY)) {
+ ret = 0;
+ break;
+ }
+
+ ret = btrfs_check_ref_name_override(leaf, slot, &key,
+ BTRFS_I(inode), &other_ino,
+ &other_parent);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+ if (!ino_elem) {
+ ret = -ENOMEM;
+ break;
+ }
+ ino_elem->ino = other_ino;
+ ino_elem->parent = other_parent;
+ list_add_tail(&ino_elem->list, &inode_list);
+ ret = 0;
+ }
+ path->slots[0]++;
+ }
+ iput(inode);
+ }
+
+ return ret;
+}
+
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
@@ -4840,6 +5046,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
u64 logged_isize = 0;
bool need_log_inode_item = true;
bool xattrs_logged = false;
+ bool recursive_logging = false;
path = btrfs_alloc_path();
if (!path)
@@ -4885,8 +5092,12 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
return ret;
}
- if (inode_only == LOG_OTHER_INODE) {
- inode_only = LOG_INODE_EXISTS;
+ if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
+ recursive_logging = true;
+ if (inode_only == LOG_OTHER_INODE)
+ inode_only = LOG_INODE_EXISTS;
+ else
+ inode_only = LOG_INODE_ALL;
mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
} else {
mutex_lock(&inode->log_mutex);
@@ -4981,20 +5192,19 @@ again:
if ((min_key.type == BTRFS_INODE_REF_KEY ||
min_key.type == BTRFS_INODE_EXTREF_KEY) &&
- inode->generation == trans->transid) {
+ inode->generation == trans->transid &&
+ !recursive_logging) {
u64 other_ino = 0;
+ u64 other_parent = 0;
ret = btrfs_check_ref_name_override(path->nodes[0],
path->slots[0], &min_key, inode,
- &other_ino);
+ &other_ino, &other_parent);
if (ret < 0) {
err = ret;
goto out_unlock;
} else if (ret > 0 && ctx &&
other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
- struct btrfs_key inode_key;
- struct inode *other_inode;
-
if (ins_nr > 0) {
ins_nr++;
} else {
@@ -5010,43 +5220,13 @@ again:
goto out_unlock;
}
ins_nr = 0;
- btrfs_release_path(path);
- inode_key.objectid = other_ino;
- inode_key.type = BTRFS_INODE_ITEM_KEY;
- inode_key.offset = 0;
- other_inode = btrfs_iget(fs_info->sb,
- &inode_key, root,
- NULL);
- /*
- * If the other inode that had a conflicting dir
- * entry was deleted in the current transaction,
- * we don't need to do more work nor fallback to
- * a transaction commit.
- */
- if (other_inode == ERR_PTR(-ENOENT)) {
- goto next_key;
- } else if (IS_ERR(other_inode)) {
- err = PTR_ERR(other_inode);
- goto out_unlock;
- }
- /*
- * We are safe logging the other inode without
- * acquiring its i_mutex as long as we log with
- * the LOG_INODE_EXISTS mode. We're safe against
- * concurrent renames of the other inode as well
- * because during a rename we pin the log and
- * update the log with the new name before we
- * unpin it.
- */
- err = btrfs_log_inode(trans, root,
- BTRFS_I(other_inode),
- LOG_OTHER_INODE, 0, LLONG_MAX,
- ctx);
- iput(other_inode);
+
+ err = log_conflicting_inodes(trans, root, path,
+ ctx, other_ino, other_parent);
if (err)
goto out_unlock;
- else
- goto next_key;
+ btrfs_release_path(path);
+ goto next_key;
}
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2576b1a379c9..9024eee889b9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -415,27 +415,6 @@ static struct btrfs_device *__alloc_device(void)
return dev;
}
-/*
- * Find a device specified by @devid or @uuid in the list of @fs_devices, or
- * return NULL.
- *
- * If devid and uuid are both specified, the match must be exact, otherwise
- * only devid is used.
- */
-static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
- u64 devid, const u8 *uuid)
-{
- struct btrfs_device *dev;
-
- list_for_each_entry(dev, &fs_devices->devices, dev_list) {
- if (dev->devid == devid &&
- (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
- return dev;
- }
- }
- return NULL;
-}
-
static noinline struct btrfs_fs_devices *find_fsid(
const u8 *fsid, const u8 *metadata_fsid)
{
@@ -734,6 +713,17 @@ static void pending_bios_fn(struct btrfs_work *work)
run_scheduled_bios(device);
}
+static bool device_path_matched(const char *path, struct btrfs_device *device)
+{
+ int found;
+
+ rcu_read_lock();
+ found = strcmp(rcu_str_deref(device->name), path);
+ rcu_read_unlock();
+
+ return found == 0;
+}
+
/*
* Search and remove all stale (devices which are not mounted) devices.
* When both inputs are NULL, it will search and release all stale devices.
@@ -741,52 +731,57 @@ static void pending_bios_fn(struct btrfs_work *work)
* matching this path only.
* skip_dev: Optional. Will skip this device when searching for the stale
* devices.
+ * Return: 0 for success or if @path is NULL.
+ * -EBUSY if @path is a mounted device.
+ * -ENOENT if @path does not match any device in the list.
*/
-static void btrfs_free_stale_devices(const char *path,
+static int btrfs_free_stale_devices(const char *path,
struct btrfs_device *skip_device)
{
struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
struct btrfs_device *device, *tmp_device;
+ int ret = 0;
+
+ if (path)
+ ret = -ENOENT;
list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
- mutex_lock(&fs_devices->device_list_mutex);
- if (fs_devices->opened) {
- mutex_unlock(&fs_devices->device_list_mutex);
- continue;
- }
+ mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry_safe(device, tmp_device,
&fs_devices->devices, dev_list) {
- int not_found = 0;
-
if (skip_device && skip_device == device)
continue;
if (path && !device->name)
continue;
-
- rcu_read_lock();
- if (path)
- not_found = strcmp(rcu_str_deref(device->name),
- path);
- rcu_read_unlock();
- if (not_found)
+ if (path && !device_path_matched(path, device))
continue;
+ if (fs_devices->opened) {
+ /* for an already deleted device return 0 */
+ if (path && ret != 0)
+ ret = -EBUSY;
+ break;
+ }
/* delete the stale device */
fs_devices->num_devices--;
list_del(&device->dev_list);
btrfs_free_device(device);
+ ret = 0;
if (fs_devices->num_devices == 0)
break;
}
mutex_unlock(&fs_devices->device_list_mutex);
+
if (fs_devices->num_devices == 0) {
btrfs_sysfs_remove_fsid(fs_devices);
list_del(&fs_devices->fs_list);
free_fs_devices(fs_devices);
}
}
+
+ return ret;
}
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
@@ -957,19 +952,19 @@ static noinline struct btrfs_device *device_list_add(const char *path,
else
fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
- fs_devices->fsid_change = fsid_change_in_progress;
-
if (IS_ERR(fs_devices))
return ERR_CAST(fs_devices);
+ fs_devices->fsid_change = fsid_change_in_progress;
+
mutex_lock(&fs_devices->device_list_mutex);
list_add(&fs_devices->fs_list, &fs_uuids);
device = NULL;
} else {
mutex_lock(&fs_devices->device_list_mutex);
- device = find_device(fs_devices, devid,
- disk_super->dev_item.uuid);
+ device = btrfs_find_device(fs_devices, devid,
+ disk_super->dev_item.uuid, NULL, false);
/*
* If this disk has been pulled into an fs devices created by
@@ -1134,7 +1129,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
mutex_lock(&orig->device_list_mutex);
fs_devices->total_devices = orig->total_devices;
- /* We have held the volume lock, it is safe to get the devices. */
list_for_each_entry(orig_dev, &orig->devices, dev_list) {
struct rcu_string *name;
@@ -1451,6 +1445,17 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
return 0;
}
+int btrfs_forget_devices(const char *path)
+{
+ int ret;
+
+ mutex_lock(&uuid_mutex);
+ ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
+ mutex_unlock(&uuid_mutex);
+
+ return ret;
+}
+
/*
* Look for a btrfs signature on a device. This may be called out of the mount path
* and we are not allowed to call set_blocksize during the scan. The superblock
@@ -2385,11 +2390,11 @@ static struct btrfs_device *btrfs_find_device_by_path(
devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_uuid = disk_super->dev_item.uuid;
if (btrfs_fs_incompat(fs_info, METADATA_UUID))
- device = btrfs_find_device(fs_info, devid, dev_uuid,
- disk_super->metadata_uuid);
+ device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
+ disk_super->metadata_uuid, true);
else
- device = btrfs_find_device(fs_info, devid,
- dev_uuid, disk_super->fsid);
+ device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
+ disk_super->fsid, true);
brelse(bh);
if (!device)
@@ -2398,50 +2403,38 @@ static struct btrfs_device *btrfs_find_device_by_path(
return device;
}
-static struct btrfs_device *btrfs_find_device_missing_or_by_path(
- struct btrfs_fs_info *fs_info, const char *device_path)
-{
- struct btrfs_device *device = NULL;
- if (strcmp(device_path, "missing") == 0) {
- struct list_head *devices;
- struct btrfs_device *tmp;
-
- devices = &fs_info->fs_devices->devices;
- list_for_each_entry(tmp, devices, dev_list) {
- if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
- &tmp->dev_state) && !tmp->bdev) {
- device = tmp;
- break;
- }
- }
-
- if (!device)
- return ERR_PTR(-ENOENT);
- } else {
- device = btrfs_find_device_by_path(fs_info, device_path);
- }
-
- return device;
-}
-
/*
* Lookup a device given by device id, or the path if the id is 0.
*/
struct btrfs_device *btrfs_find_device_by_devspec(
- struct btrfs_fs_info *fs_info, u64 devid, const char *devpath)
+ struct btrfs_fs_info *fs_info, u64 devid,
+ const char *device_path)
{
struct btrfs_device *device;
if (devid) {
- device = btrfs_find_device(fs_info, devid, NULL, NULL);
+ device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
+ NULL, true);
if (!device)
return ERR_PTR(-ENOENT);
- } else {
- if (!devpath || !devpath[0])
- return ERR_PTR(-EINVAL);
- device = btrfs_find_device_missing_or_by_path(fs_info, devpath);
+ return device;
}
- return device;
+
+ if (!device_path || !device_path[0])
+ return ERR_PTR(-EINVAL);
+
+ if (strcmp(device_path, "missing") == 0) {
+ /* Find first missing device */
+ list_for_each_entry(device, &fs_info->fs_devices->devices,
+ dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+ &device->dev_state) && !device->bdev)
+ return device;
+ }
+ return ERR_PTR(-ENOENT);
+ }
+
+ return btrfs_find_device_by_path(fs_info, device_path);
}
/*
@@ -2563,7 +2556,8 @@ next_slot:
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
- device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
+ device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
+ fs_uuid, true);
BUG_ON(!device); /* Logic error */
if (device->fs_devices->seeding) {
@@ -6616,21 +6610,36 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
return BLK_STS_OK;
}
-struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
- u8 *uuid, u8 *fsid)
+/*
+ * Find a device specified by @devid or @uuid in the list of @fs_devices, or
+ * return NULL.
+ *
+ * If devid and uuid are both specified, the match must be exact, otherwise
+ * only devid is used.
+ *
+ * If @seed is true, traverse through the seed devices.
+ */
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
+ u64 devid, u8 *uuid, u8 *fsid,
+ bool seed)
{
struct btrfs_device *device;
- struct btrfs_fs_devices *cur_devices;
- cur_devices = fs_info->fs_devices;
- while (cur_devices) {
+ while (fs_devices) {
if (!fsid ||
- !memcmp(cur_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
- device = find_device(cur_devices, devid, uuid);
- if (device)
- return device;
+ !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
+ list_for_each_entry(device, &fs_devices->devices,
+ dev_list) {
+ if (device->devid == devid &&
+ (!uuid || memcmp(device->uuid, uuid,
+ BTRFS_UUID_SIZE) == 0))
+ return device;
+ }
}
- cur_devices = cur_devices->seed;
+ if (seed)
+ fs_devices = fs_devices->seed;
+ else
+ return NULL;
}
return NULL;
}
@@ -6782,10 +6791,10 @@ static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
}
if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
- (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) ||
+ (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) ||
(type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
(type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
- (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) ||
+ (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) ||
((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
num_stripes != 1)) {
btrfs_err(fs_info,
@@ -6875,8 +6884,8 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
read_extent_buffer(leaf, uuid, (unsigned long)
btrfs_stripe_dev_uuid_nr(chunk, i),
BTRFS_UUID_SIZE);
- map->stripes[i].dev = btrfs_find_device(fs_info, devid,
- uuid, NULL);
+ map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
+ devid, uuid, NULL, true);
if (!map->stripes[i].dev &&
!btrfs_test_opt(fs_info, DEGRADED)) {
free_extent_map(em);
@@ -7015,7 +7024,8 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
return PTR_ERR(fs_devices);
}
- device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
+ device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
+ fs_uuid, true);
if (!device) {
if (!btrfs_test_opt(fs_info, DEGRADED)) {
btrfs_report_missing_device(fs_info, devid,
@@ -7605,7 +7615,8 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
int i;
mutex_lock(&fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
+ true);
mutex_unlock(&fs_devices->device_list_mutex);
if (!dev) {
@@ -7819,12 +7830,25 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
/* Make sure no dev extent is beyond device bondary */
- dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
if (!dev) {
btrfs_err(fs_info, "failed to find devid %llu", devid);
ret = -EUCLEAN;
goto out;
}
+
+ /* It's possible this device is a dummy for seed device */
+ if (dev->disk_total_bytes == 0) {
+ dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL,
+ NULL, false);
+ if (!dev) {
+ btrfs_err(fs_info, "failed to find seed devid %llu",
+ devid);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+
if (physical_offset + physical_len > dev->disk_total_bytes) {
btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ed806649a473..3ad9d58d1b66 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -416,6 +416,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder);
struct btrfs_device *btrfs_scan_one_device(const char *path,
fmode_t flags, void *holder);
+int btrfs_forget_devices(const char *path);
int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
void btrfs_assign_next_active_device(struct btrfs_device *device,
@@ -433,8 +434,8 @@ void __exit btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size);
-struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
- u8 *uuid, u8 *fsid);
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
+ u64 devid, u8 *uuid, u8 *fsid, bool seed);
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path);
int btrfs_balance(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 970ff3e35bb3..b86b7ad6b900 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -27,6 +27,33 @@ struct workspace {
int level;
};
+static struct workspace_manager wsm;
+
+static void zlib_init_workspace_manager(void)
+{
+ btrfs_init_workspace_manager(&wsm, &btrfs_zlib_compress);
+}
+
+static void zlib_cleanup_workspace_manager(void)
+{
+ btrfs_cleanup_workspace_manager(&wsm);
+}
+
+static struct list_head *zlib_get_workspace(unsigned int level)
+{
+ struct list_head *ws = btrfs_get_workspace(&wsm, level);
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+
+ workspace->level = level;
+
+ return ws;
+}
+
+static void zlib_put_workspace(struct list_head *ws)
+{
+ btrfs_put_workspace(&wsm, ws);
+}
+
static void zlib_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -36,7 +63,7 @@ static void zlib_free_workspace(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *zlib_alloc_workspace(void)
+static struct list_head *zlib_alloc_workspace(unsigned int level)
{
struct workspace *workspace;
int workspacesize;
@@ -48,6 +75,7 @@ static struct list_head *zlib_alloc_workspace(void)
workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
zlib_inflate_workspacesize());
workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL);
+ workspace->level = level;
workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!workspace->strm.workspace || !workspace->buf)
goto fail;
@@ -390,18 +418,19 @@ next:
return ret;
}
-static void zlib_set_level(struct list_head *ws, unsigned int type)
+static unsigned int zlib_set_level(unsigned int level)
{
- struct workspace *workspace = list_entry(ws, struct workspace, list);
- unsigned level = (type & 0xF0) >> 4;
-
- if (level > 9)
- level = 9;
+ if (!level)
+ return BTRFS_ZLIB_DEFAULT_LEVEL;
- workspace->level = level > 0 ? level : 3;
+ return min_t(unsigned int, level, 9);
}
const struct btrfs_compress_op btrfs_zlib_compress = {
+ .init_workspace_manager = zlib_init_workspace_manager,
+ .cleanup_workspace_manager = zlib_cleanup_workspace_manager,
+ .get_workspace = zlib_get_workspace,
+ .put_workspace = zlib_put_workspace,
.alloc_workspace = zlib_alloc_workspace,
.free_workspace = zlib_free_workspace,
.compress_pages = zlib_compress_pages,
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index af6ec59972f5..3e418a3aeb11 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -6,25 +6,31 @@
*/
#include <linux/bio.h>
+#include <linux/bitmap.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mm.h>
+#include <linux/sched/mm.h>
#include <linux/pagemap.h>
#include <linux/refcount.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/zstd.h>
#include "compression.h"
+#include "ctree.h"
#define ZSTD_BTRFS_MAX_WINDOWLOG 17
#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG)
#define ZSTD_BTRFS_DEFAULT_LEVEL 3
+#define ZSTD_BTRFS_MAX_LEVEL 15
+/* 307s to avoid pathologically clashing with transaction commit */
+#define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ)
-static ZSTD_parameters zstd_get_btrfs_parameters(size_t src_len)
+static ZSTD_parameters zstd_get_btrfs_parameters(unsigned int level,
+ size_t src_len)
{
- ZSTD_parameters params = ZSTD_getParams(ZSTD_BTRFS_DEFAULT_LEVEL,
- src_len, 0);
+ ZSTD_parameters params = ZSTD_getParams(level, src_len, 0);
if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG)
params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG;
@@ -36,11 +42,290 @@ struct workspace {
void *mem;
size_t size;
char *buf;
+ unsigned int level;
+ unsigned int req_level;
+ unsigned long last_used; /* jiffies */
struct list_head list;
+ struct list_head lru_list;
ZSTD_inBuffer in_buf;
ZSTD_outBuffer out_buf;
};
+/*
+ * Zstd Workspace Management
+ *
+ * Zstd workspaces have different memory requirements depending on the level.
+ * The zstd workspaces are managed by having individual lists for each level
+ * and a global lru. Forward progress is maintained by protecting a max level
+ * workspace.
+ *
+ * Getting a workspace is done by using the bitmap to identify the levels that
+ * have available workspaces and scans up. This lets us recycle higher level
+ * workspaces because of the monotonic memory guarantee. A workspace's
+ * last_used is only updated if it is being used by the corresponding memory
+ * level. Putting a workspace involves adding it back to the appropriate places
+ * and adding it back to the lru if necessary.
+ *
+ * A timer is used to reclaim workspaces if they have not been used for
+ * ZSTD_BTRFS_RECLAIM_JIFFIES. This helps keep only active workspaces around.
+ * The upper bound is provided by the workqueue limit which is 2 (percpu limit).
+ */
+
+struct zstd_workspace_manager {
+ const struct btrfs_compress_op *ops;
+ spinlock_t lock;
+ struct list_head lru_list;
+ struct list_head idle_ws[ZSTD_BTRFS_MAX_LEVEL];
+ unsigned long active_map;
+ wait_queue_head_t wait;
+ struct timer_list timer;
+};
+
+static struct zstd_workspace_manager wsm;
+
+static size_t zstd_ws_mem_sizes[ZSTD_BTRFS_MAX_LEVEL];
+
+static inline struct workspace *list_to_workspace(struct list_head *list)
+{
+ return container_of(list, struct workspace, list);
+}
+
+/*
+ * zstd_reclaim_timer_fn - reclaim timer
+ * @t: timer
+ *
+ * This scans the lru_list and attempts to reclaim any workspace that hasn't
+ * been used for ZSTD_BTRFS_RECLAIM_JIFFIES.
+ */
+static void zstd_reclaim_timer_fn(struct timer_list *timer)
+{
+ unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES;
+ struct list_head *pos, *next;
+
+ spin_lock(&wsm.lock);
+
+ if (list_empty(&wsm.lru_list)) {
+ spin_unlock(&wsm.lock);
+ return;
+ }
+
+ list_for_each_prev_safe(pos, next, &wsm.lru_list) {
+ struct workspace *victim = container_of(pos, struct workspace,
+ lru_list);
+ unsigned int level;
+
+ if (time_after(victim->last_used, reclaim_threshold))
+ break;
+
+ /* workspace is in use */
+ if (victim->req_level)
+ continue;
+
+ level = victim->level;
+ list_del(&victim->lru_list);
+ list_del(&victim->list);
+ wsm.ops->free_workspace(&victim->list);
+
+ if (list_empty(&wsm.idle_ws[level - 1]))
+ clear_bit(level - 1, &wsm.active_map);
+
+ }
+
+ if (!list_empty(&wsm.lru_list))
+ mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
+
+ spin_unlock(&wsm.lock);
+}
+
+/*
+ * zstd_calc_ws_mem_sizes - calculate monotonic memory bounds
+ *
+ * It is possible based on the level configurations that a higher level
+ * workspace uses less memory than a lower level workspace. In order to reuse
+ * workspaces, this must be made a monotonic relationship. This precomputes
+ * the required memory for each level and enforces the monotonicity between
+ * level and memory required.
+ */
+static void zstd_calc_ws_mem_sizes(void)
+{
+ size_t max_size = 0;
+ unsigned int level;
+
+ for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) {
+ ZSTD_parameters params =
+ zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT);
+ size_t level_size =
+ max_t(size_t,
+ ZSTD_CStreamWorkspaceBound(params.cParams),
+ ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT));
+
+ max_size = max_t(size_t, max_size, level_size);
+ zstd_ws_mem_sizes[level - 1] = max_size;
+ }
+}
+
+static void zstd_init_workspace_manager(void)
+{
+ struct list_head *ws;
+ int i;
+
+ zstd_calc_ws_mem_sizes();
+
+ wsm.ops = &btrfs_zstd_compress;
+ spin_lock_init(&wsm.lock);
+ init_waitqueue_head(&wsm.wait);
+ timer_setup(&wsm.timer, zstd_reclaim_timer_fn, 0);
+
+ INIT_LIST_HEAD(&wsm.lru_list);
+ for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++)
+ INIT_LIST_HEAD(&wsm.idle_ws[i]);
+
+ ws = wsm.ops->alloc_workspace(ZSTD_BTRFS_MAX_LEVEL);
+ if (IS_ERR(ws)) {
+ pr_warn(
+ "BTRFS: cannot preallocate zstd compression workspace\n");
+ } else {
+ set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map);
+ list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]);
+ }
+}
+
+static void zstd_cleanup_workspace_manager(void)
+{
+ struct workspace *workspace;
+ int i;
+
+ del_timer(&wsm.timer);
+
+ for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) {
+ while (!list_empty(&wsm.idle_ws[i])) {
+ workspace = container_of(wsm.idle_ws[i].next,
+ struct workspace, list);
+ list_del(&workspace->list);
+ list_del(&workspace->lru_list);
+ wsm.ops->free_workspace(&workspace->list);
+ }
+ }
+}
+
+/*
+ * zstd_find_workspace - find workspace
+ * @level: compression level
+ *
+ * This iterates over the set bits in the active_map beginning at the requested
+ * compression level. This lets us utilize already allocated workspaces before
+ * allocating a new one. If the workspace is of a larger size, it is used, but
+ * the place in the lru_list and last_used times are not updated. This is to
+ * offer the opportunity to reclaim the workspace in favor of allocating an
+ * appropriately sized one in the future.
+ */
+static struct list_head *zstd_find_workspace(unsigned int level)
+{
+ struct list_head *ws;
+ struct workspace *workspace;
+ int i = level - 1;
+
+ spin_lock(&wsm.lock);
+ for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) {
+ if (!list_empty(&wsm.idle_ws[i])) {
+ ws = wsm.idle_ws[i].next;
+ workspace = list_to_workspace(ws);
+ list_del_init(ws);
+ /* keep its place if it's a lower level using this */
+ workspace->req_level = level;
+ if (level == workspace->level)
+ list_del(&workspace->lru_list);
+ if (list_empty(&wsm.idle_ws[i]))
+ clear_bit(i, &wsm.active_map);
+ spin_unlock(&wsm.lock);
+ return ws;
+ }
+ }
+ spin_unlock(&wsm.lock);
+
+ return NULL;
+}
+
+/*
+ * zstd_get_workspace - zstd's get_workspace
+ * @level: compression level
+ *
+ * If @level is 0, then any compression level can be used. Therefore, we begin
+ * scanning from 1. We first scan through possible workspaces and then after
+ * attempt to allocate a new workspace. If we fail to allocate one due to
+ * memory pressure, go to sleep waiting for the max level workspace to free up.
+ */
+static struct list_head *zstd_get_workspace(unsigned int level)
+{
+ struct list_head *ws;
+ unsigned int nofs_flag;
+
+ /* level == 0 means we can use any workspace */
+ if (!level)
+ level = 1;
+
+again:
+ ws = zstd_find_workspace(level);
+ if (ws)
+ return ws;
+
+ nofs_flag = memalloc_nofs_save();
+ ws = wsm.ops->alloc_workspace(level);
+ memalloc_nofs_restore(nofs_flag);
+
+ if (IS_ERR(ws)) {
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&wsm.wait, &wait, TASK_UNINTERRUPTIBLE);
+ schedule();
+ finish_wait(&wsm.wait, &wait);
+
+ goto again;
+ }
+
+ return ws;
+}
+
+/*
+ * zstd_put_workspace - zstd put_workspace
+ * @ws: list_head for the workspace
+ *
+ * When putting back a workspace, we only need to update the LRU if we are of
+ * the requested compression level. Here is where we continue to protect the
+ * max level workspace or update last_used accordingly. If the reclaim timer
+ * isn't set, it is also set here. Only the max level workspace tries and wakes
+ * up waiting workspaces.
+ */
+static void zstd_put_workspace(struct list_head *ws)
+{
+ struct workspace *workspace = list_to_workspace(ws);
+
+ spin_lock(&wsm.lock);
+
+ /* A node is only taken off the lru if we are the corresponding level */
+ if (workspace->req_level == workspace->level) {
+ /* Hide a max level workspace from reclaim */
+ if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) {
+ INIT_LIST_HEAD(&workspace->lru_list);
+ } else {
+ workspace->last_used = jiffies;
+ list_add(&workspace->lru_list, &wsm.lru_list);
+ if (!timer_pending(&wsm.timer))
+ mod_timer(&wsm.timer,
+ jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
+ }
+ }
+
+ set_bit(workspace->level - 1, &wsm.active_map);
+ list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]);
+ workspace->req_level = 0;
+
+ spin_unlock(&wsm.lock);
+
+ if (workspace->level == ZSTD_BTRFS_MAX_LEVEL)
+ cond_wake_up(&wsm.wait);
+}
+
static void zstd_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -50,25 +335,25 @@ static void zstd_free_workspace(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *zstd_alloc_workspace(void)
+static struct list_head *zstd_alloc_workspace(unsigned int level)
{
- ZSTD_parameters params =
- zstd_get_btrfs_parameters(ZSTD_BTRFS_MAX_INPUT);
struct workspace *workspace;
workspace = kzalloc(sizeof(*workspace), GFP_KERNEL);
if (!workspace)
return ERR_PTR(-ENOMEM);
- workspace->size = max_t(size_t,
- ZSTD_CStreamWorkspaceBound(params.cParams),
- ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT));
+ workspace->size = zstd_ws_mem_sizes[level - 1];
+ workspace->level = level;
+ workspace->req_level = level;
+ workspace->last_used = jiffies;
workspace->mem = kvmalloc(workspace->size, GFP_KERNEL);
workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!workspace->mem || !workspace->buf)
goto fail;
INIT_LIST_HEAD(&workspace->list);
+ INIT_LIST_HEAD(&workspace->lru_list);
return &workspace->list;
fail:
@@ -95,7 +380,8 @@ static int zstd_compress_pages(struct list_head *ws,
unsigned long len = *total_out;
const unsigned long nr_dest_pages = *out_pages;
unsigned long max_out = nr_dest_pages * PAGE_SIZE;
- ZSTD_parameters params = zstd_get_btrfs_parameters(len);
+ ZSTD_parameters params = zstd_get_btrfs_parameters(workspace->req_level,
+ len);
*out_pages = 0;
*total_out = 0;
@@ -419,11 +705,19 @@ finish:
return ret;
}
-static void zstd_set_level(struct list_head *ws, unsigned int type)
+static unsigned int zstd_set_level(unsigned int level)
{
+ if (!level)
+ return ZSTD_BTRFS_DEFAULT_LEVEL;
+
+ return min_t(unsigned int, level, ZSTD_BTRFS_MAX_LEVEL);
}
const struct btrfs_compress_op btrfs_zstd_compress = {
+ .init_workspace_manager = zstd_init_workspace_manager,
+ .cleanup_workspace_manager = zstd_cleanup_workspace_manager,
+ .get_workspace = zstd_get_workspace,
+ .put_workspace = zstd_put_workspace,
.alloc_workspace = zstd_alloc_workspace,
.free_workspace = zstd_free_workspace,
.compress_pages = zstd_compress_pages,
diff --git a/fs/buffer.c b/fs/buffer.c
index 52d024bfdbc1..ce357602f471 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -200,6 +200,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
struct buffer_head *head;
struct page *page;
int all_mapped = 1;
+ static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);
index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
@@ -227,15 +228,15 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
* file io on the block device and getblk. It gets dealt with
* elsewhere, don't buffer_error if we had some unmapped buffers
*/
- if (all_mapped) {
- printk("__find_get_block_slow() failed. "
- "block=%llu, b_blocknr=%llu\n",
- (unsigned long long)block,
- (unsigned long long)bh->b_blocknr);
- printk("b_state=0x%08lx, b_size=%zu\n",
- bh->b_state, bh->b_size);
- printk("device %pg blocksize: %d\n", bdev,
- 1 << bd_inode->i_blkbits);
+ ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
+ if (all_mapped && __ratelimit(&last_warned)) {
+ printk("__find_get_block_slow() failed. block=%llu, "
+ "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
+ "device %pg blocksize: %d\n",
+ (unsigned long long)block,
+ (unsigned long long)bh->b_blocknr,
+ bh->b_state, bh->b_size, bdev,
+ 1 << bd_inode->i_blkbits);
}
out_unlock:
spin_unlock(&bd_mapping->private_lock);
@@ -3026,13 +3027,23 @@ void guard_bio_eod(int op, struct bio *bio)
/* Uhhuh. We've got a bio that straddles the device size! */
truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
+ /*
+ * The bio contains more than one segment which spans EOD, just return
+ * and let IO layer turn it into an EIO
+ */
+ if (truncated_bytes > bvec->bv_len)
+ return;
+
/* Truncate the bio.. */
bio->bi_iter.bi_size -= truncated_bytes;
bvec->bv_len -= truncated_bytes;
/* ..and clear the end of the buffer for reads */
if (op == REQ_OP_READ) {
- zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
+ struct bio_vec bv;
+
+ mp_bvec_last_segment(bvec, &bv);
+ zero_user(bv.bv_page, bv.bv_offset + bv.bv_len,
truncated_bytes);
}
}
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 5d0c05e288cc..a47c541f8006 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1494,10 +1494,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
if (err < 0 || off >= i_size_read(inode)) {
unlock_page(page);
put_page(page);
- if (err == -ENOMEM)
- ret = VM_FAULT_OOM;
- else
- ret = VM_FAULT_SIGBUS;
+ ret = vmf_error(err);
goto out_inline;
}
if (err < PAGE_SIZE)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 94c026bba2c2..bba28a5034ba 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1035,6 +1035,8 @@ static void drop_inode_snap_realm(struct ceph_inode_info *ci)
list_del_init(&ci->i_snap_realm_item);
ci->i_snap_realm_counter++;
ci->i_snap_realm = NULL;
+ if (realm->ino == ci->i_vino.ino)
+ realm->inode = NULL;
spin_unlock(&realm->inodes_with_caps_lock);
ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc,
realm);
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index 03f4d24db8fe..9455d3aef0c3 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -3,19 +3,6 @@
* quota.c - CephFS quota
*
* Copyright (C) 2017-2018 SUSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <linux/statfs.h>
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 041c27ea8de1..f74193da0e09 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -616,7 +616,8 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
capsnap->size);
spin_lock(&mdsc->snap_flush_lock);
- list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
+ if (list_empty(&ci->i_snap_flush_item))
+ list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
spin_unlock(&mdsc->snap_flush_lock);
return 1; /* caller may want to ceph_flush_snaps */
}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 4e9a7cc488da..da2cd8e89062 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -530,7 +530,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_putc(m, ',');
pos = m->count;
- ret = ceph_print_client_options(m, fsc->client);
+ ret = ceph_print_client_options(m, fsc->client, false);
if (ret)
return ret;
@@ -640,7 +640,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
opt = NULL; /* fsc->client now owns this */
fsc->client->extra_mon_dispatch = extra_mon_dispatch;
- fsc->client->osdc.abort_on_full = true;
+ ceph_set_opt(fsc->client, ABORT_ON_FULL);
if (!fsopt->mds_namespace) {
ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index f1ddc9d03c10..76724efc831c 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -117,25 +117,25 @@ config CIFS_UPCALL
secure Kerberos authentication is required). If unsure, say Y.
config CIFS_XATTR
- bool "CIFS extended attributes"
- depends on CIFS
- help
- Extended attributes are name:value pairs associated with inodes by
- the kernel or by users (see the attr(5) manual page for details).
- CIFS maps the name of extended attributes beginning with the user
- namespace prefix to SMB/CIFS EAs. EAs are stored on Windows
- servers without the user namespace prefix, but their names are
- seen by Linux cifs clients prefaced by the user namespace prefix.
- The system namespace (used by some filesystems to store ACLs) is
- not supported at this time.
-
- If unsure, say Y.
+ bool "CIFS extended attributes"
+ depends on CIFS
+ help
+ Extended attributes are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page for details).
+ CIFS maps the name of extended attributes beginning with the user
+ namespace prefix to SMB/CIFS EAs. EAs are stored on Windows
+ servers without the user namespace prefix, but their names are
+ seen by Linux cifs clients prefaced by the user namespace prefix.
+ The system namespace (used by some filesystems to store ACLs) is
+ not supported at this time.
+
+ If unsure, say Y.
config CIFS_POSIX
- bool "CIFS POSIX Extensions"
- depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY && CIFS_XATTR
- help
- Enabling this option will cause the cifs client to attempt to
+ bool "CIFS POSIX Extensions"
+ depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY && CIFS_XATTR
+ help
+ Enabling this option will cause the cifs client to attempt to
negotiate a newer dialect with servers, such as Samba 3.0.5
or later, that optionally can handle more POSIX like (rather
than Windows like) file behavior. It also enables
@@ -144,61 +144,62 @@ config CIFS_POSIX
CIFS POSIX ACL support. If unsure, say N.
config CIFS_ACL
- bool "Provide CIFS ACL support"
- depends on CIFS_XATTR && KEYS
- help
- Allows fetching CIFS/NTFS ACL from the server. The DACL blob
- is handed over to the application/caller. See the man
- page for getcifsacl for more information. If unsure, say Y.
+ bool "Provide CIFS ACL support"
+ depends on CIFS_XATTR && KEYS
+ help
+ Allows fetching CIFS/NTFS ACL from the server. The DACL blob
+ is handed over to the application/caller. See the man
+ page for getcifsacl for more information. If unsure, say Y.
config CIFS_DEBUG
bool "Enable CIFS debugging routines"
default y
depends on CIFS
help
- Enabling this option adds helpful debugging messages to
- the cifs code which increases the size of the cifs module.
- If unsure, say Y.
+ Enabling this option adds helpful debugging messages to
+ the cifs code which increases the size of the cifs module.
+ If unsure, say Y.
+
config CIFS_DEBUG2
bool "Enable additional CIFS debugging routines"
depends on CIFS_DEBUG
help
- Enabling this option adds a few more debugging routines
- to the cifs code which slightly increases the size of
- the cifs module and can cause additional logging of debug
- messages in some error paths, slowing performance. This
- option can be turned off unless you are debugging
- cifs problems. If unsure, say N.
+ Enabling this option adds a few more debugging routines
+ to the cifs code which slightly increases the size of
+ the cifs module and can cause additional logging of debug
+ messages in some error paths, slowing performance. This
+ option can be turned off unless you are debugging
+ cifs problems. If unsure, say N.
config CIFS_DEBUG_DUMP_KEYS
bool "Dump encryption keys for offline decryption (Unsafe)"
depends on CIFS_DEBUG
help
- Enabling this will dump the encryption and decryption keys
- used to communicate on an encrypted share connection on the
- console. This allows Wireshark to decrypt and dissect
- encrypted network captures. Enable this carefully.
- If unsure, say N.
+ Enabling this will dump the encryption and decryption keys
+ used to communicate on an encrypted share connection on the
+ console. This allows Wireshark to decrypt and dissect
+ encrypted network captures. Enable this carefully.
+ If unsure, say N.
config CIFS_DFS_UPCALL
- bool "DFS feature support"
- depends on CIFS && KEYS
- select DNS_RESOLVER
- help
- Distributed File System (DFS) support is used to access shares
- transparently in an enterprise name space, even if the share
- moves to a different server. This feature also enables
- an upcall mechanism for CIFS which contacts userspace helper
- utilities to provide server name resolution (host names to
- IP addresses) which is needed in order to reconnect to
- servers if their addresses change or for implicit mounts of
- DFS junction points. If unsure, say Y.
+ bool "DFS feature support"
+ depends on CIFS && KEYS
+ select DNS_RESOLVER
+ help
+ Distributed File System (DFS) support is used to access shares
+ transparently in an enterprise name space, even if the share
+ moves to a different server. This feature also enables
+ an upcall mechanism for CIFS which contacts userspace helper
+ utilities to provide server name resolution (host names to
+ IP addresses) which is needed in order to reconnect to
+ servers if their addresses change or for implicit mounts of
+ DFS junction points. If unsure, say Y.
config CIFS_NFSD_EXPORT
- bool "Allow nfsd to export CIFS file system"
- depends on CIFS && BROKEN
- help
- Allows NFS server to export a CIFS mounted share (nfsd over cifs)
+ bool "Allow nfsd to export CIFS file system"
+ depends on CIFS && BROKEN
+ help
+ Allows NFS server to export a CIFS mounted share (nfsd over cifs)
config CIFS_SMB_DIRECT
bool "SMB Direct support (Experimental)"
@@ -209,10 +210,9 @@ config CIFS_SMB_DIRECT
say N.
config CIFS_FSCACHE
- bool "Provide CIFS client caching support"
- depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
- help
- Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
- to be cached locally on disk through the general filesystem cache
- manager. If unsure, say N.
-
+ bool "Provide CIFS client caching support"
+ depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
+ help
+ Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
+ to be cached locally on disk through the general filesystem cache
+ manager. If unsure, say N.
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 593fb422d0f3..e92a2fee3c57 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -252,6 +252,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
seq_printf(m, ",ACL");
#endif
seq_putc(m, '\n');
+ seq_printf(m, "CIFSMaxBufSize: %d\n", CIFSMaxBufSize);
seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid);
seq_printf(m, "Servers:");
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index d9b99abe1243..5d83c924cc47 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -285,9 +285,9 @@ static void dump_referral(const struct dfs_info3_param *ref)
{
cifs_dbg(FYI, "DFS: ref path: %s\n", ref->path_name);
cifs_dbg(FYI, "DFS: node path: %s\n", ref->node_name);
- cifs_dbg(FYI, "DFS: fl: %hd, srv_type: %hd\n",
+ cifs_dbg(FYI, "DFS: fl: %d, srv_type: %d\n",
ref->flags, ref->server_type);
- cifs_dbg(FYI, "DFS: ref_flags: %hd, path_consumed: %hd\n",
+ cifs_dbg(FYI, "DFS: ref_flags: %d, path_consumed: %d\n",
ref->ref_flag, ref->path_consumed);
}
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 42f0d67f1054..ed49222abecb 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -58,6 +58,7 @@ struct cifs_sb_info {
spinlock_t tlink_tree_lock;
struct tcon_link *master_tlink;
struct nls_table *local_nls;
+ unsigned int bsize;
unsigned int rsize;
unsigned int wsize;
unsigned long actimeo; /* attribute cache timeout (jiffies) */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 62d48d486d8f..217276b8b942 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -381,7 +381,7 @@ cifs_show_security(struct seq_file *s, struct cifs_ses *ses)
seq_puts(s, "ntlm");
break;
case Kerberos:
- seq_puts(s, "krb5");
+ seq_printf(s, "krb5,cruid=%u", from_kuid_munged(&init_user_ns,ses->cred_uid));
break;
case RawNTLMSSP:
seq_puts(s, "ntlmssp");
@@ -554,6 +554,7 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",rsize=%u", cifs_sb->rsize);
seq_printf(s, ",wsize=%u", cifs_sb->wsize);
+ seq_printf(s, ",bsize=%u", cifs_sb->bsize);
seq_printf(s, ",echo_interval=%lu",
tcon->ses->server->echo_interval / HZ);
if (tcon->snapshot_time)
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 26776eddd85d..142164ef1f05 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -150,5 +150,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.15"
+#define CIFS_VERSION "2.18"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 01ded7038b19..f293e052e351 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -216,6 +216,7 @@ struct cifs_io_parms;
struct cifs_search_info;
struct cifsInodeInfo;
struct cifs_open_parms;
+struct cifs_credits;
struct smb_version_operations {
int (*send_cancel)(struct TCP_Server_Info *, struct smb_rqst *,
@@ -230,12 +231,15 @@ struct smb_version_operations {
/* check response: verify signature, map error */
int (*check_receive)(struct mid_q_entry *, struct TCP_Server_Info *,
bool);
- void (*add_credits)(struct TCP_Server_Info *, const unsigned int,
- const int);
+ void (*add_credits)(struct TCP_Server_Info *server,
+ const struct cifs_credits *credits,
+ const int optype);
void (*set_credits)(struct TCP_Server_Info *, const int);
int * (*get_credits_field)(struct TCP_Server_Info *, const int);
unsigned int (*get_credits)(struct mid_q_entry *);
__u64 (*get_next_mid)(struct TCP_Server_Info *);
+ void (*revert_current_mid)(struct TCP_Server_Info *server,
+ const unsigned int val);
/* data offset from read response message */
unsigned int (*read_data_offset)(char *);
/*
@@ -383,8 +387,8 @@ struct smb_version_operations {
struct cifs_fid *);
/* calculate a size of SMB message */
unsigned int (*calc_smb_size)(void *buf, struct TCP_Server_Info *ptcpi);
- /* check for STATUS_PENDING and process it in a positive case */
- bool (*is_status_pending)(char *, struct TCP_Server_Info *, int);
+ /* check for STATUS_PENDING and process the response if yes */
+ bool (*is_status_pending)(char *buf, struct TCP_Server_Info *server);
/* check for STATUS_NETWORK_SESSION_EXPIRED */
bool (*is_session_expired)(char *);
/* send oplock break response */
@@ -452,7 +456,11 @@ struct smb_version_operations {
unsigned int (*wp_retry_size)(struct inode *);
/* get mtu credits */
int (*wait_mtu_credits)(struct TCP_Server_Info *, unsigned int,
- unsigned int *, unsigned int *);
+ unsigned int *, struct cifs_credits *);
+ /* adjust previously taken mtu credits to request size */
+ int (*adjust_credits)(struct TCP_Server_Info *server,
+ struct cifs_credits *credits,
+ const unsigned int payload_size);
/* check if we need to issue closedir */
bool (*dir_needs_close)(struct cifsFileInfo *);
long (*fallocate)(struct file *, struct cifs_tcon *, int, loff_t,
@@ -557,6 +565,7 @@ struct smb_vol {
bool resilient:1; /* noresilient not required since not fored for CA */
bool domainauto:1;
bool rdma:1;
+ unsigned int bsize;
unsigned int rsize;
unsigned int wsize;
bool sockopt_tcp_nodelay:1;
@@ -710,6 +719,11 @@ struct TCP_Server_Info {
int nr_targets;
};
+struct cifs_credits {
+ unsigned int value;
+ unsigned int instance;
+};
+
static inline unsigned int
in_flight(struct TCP_Server_Info *server)
{
@@ -731,18 +745,18 @@ has_credits(struct TCP_Server_Info *server, int *credits)
}
static inline void
-add_credits(struct TCP_Server_Info *server, const unsigned int add,
+add_credits(struct TCP_Server_Info *server, const struct cifs_credits *credits,
const int optype)
{
- server->ops->add_credits(server, add, optype);
+ server->ops->add_credits(server, credits, optype);
}
static inline void
-add_credits_and_wake_if(struct TCP_Server_Info *server, const unsigned int add,
- const int optype)
+add_credits_and_wake_if(struct TCP_Server_Info *server,
+ const struct cifs_credits *credits, const int optype)
{
- if (add) {
- server->ops->add_credits(server, add, optype);
+ if (credits->value) {
+ server->ops->add_credits(server, credits, optype);
wake_up(&server->request_q);
}
}
@@ -753,6 +767,14 @@ set_credits(struct TCP_Server_Info *server, const int val)
server->ops->set_credits(server, val);
}
+static inline int
+adjust_credits(struct TCP_Server_Info *server, struct cifs_credits *credits,
+ const unsigned int payload_size)
+{
+ return server->ops->adjust_credits ?
+ server->ops->adjust_credits(server, credits, payload_size) : 0;
+}
+
static inline __le64
get_next_mid64(struct TCP_Server_Info *server)
{
@@ -770,6 +792,22 @@ get_next_mid(struct TCP_Server_Info *server)
return cpu_to_le16(mid);
}
+static inline void
+revert_current_mid(struct TCP_Server_Info *server, const unsigned int val)
+{
+ if (server->ops->revert_current_mid)
+ server->ops->revert_current_mid(server, val);
+}
+
+static inline void
+revert_current_mid_from_hdr(struct TCP_Server_Info *server,
+ const struct smb2_sync_hdr *shdr)
+{
+ unsigned int num = le16_to_cpu(shdr->CreditCharge);
+
+ return revert_current_mid(server, num > 0 ? num : 1);
+}
+
static inline __u16
get_mid(const struct smb_hdr *smb)
{
@@ -1234,7 +1272,7 @@ struct cifs_readdata {
unsigned int pagesz;
unsigned int page_offset;
unsigned int tailsz;
- unsigned int credits;
+ struct cifs_credits credits;
unsigned int nr_pages;
struct page **pages;
};
@@ -1260,7 +1298,7 @@ struct cifs_writedata {
unsigned int pagesz;
unsigned int page_offset;
unsigned int tailsz;
- unsigned int credits;
+ struct cifs_credits credits;
unsigned int nr_pages;
struct page **pages;
};
@@ -1422,6 +1460,7 @@ struct mid_q_entry {
struct kref refcount;
struct TCP_Server_Info *server; /* server corresponding to this mid */
__u64 mid; /* multiplex id */
+ __u16 credits; /* number of credits consumed by this mid */
__u32 pid; /* process id */
__u32 sequence_number; /* for CIFS signing */
unsigned long when_alloc; /* when mid was created */
@@ -1438,6 +1477,7 @@ struct mid_q_entry {
int mid_state; /* wish this were enum but can not pass to wait_event */
unsigned int mid_flags;
__le16 command; /* smb command code */
+ unsigned int optype; /* operation type */
bool large_buf:1; /* if valid response, is pointer to large buf */
bool multiRsp:1; /* multiple trans2 responses for one request */
bool multiEnd:1; /* both received */
@@ -1574,6 +1614,25 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
kfree(param);
}
+static inline bool is_interrupt_error(int error)
+{
+ switch (error) {
+ case -EINTR:
+ case -ERESTARTSYS:
+ case -ERESTARTNOHAND:
+ case -ERESTARTNOINTR:
+ return true;
+ }
+ return false;
+}
+
+static inline bool is_retryable_error(int error)
+{
+ if (is_interrupt_error(error) || error == -EAGAIN)
+ return true;
+ return false;
+}
+
#define MID_FREE 0
#define MID_REQUEST_ALLOCATED 1
#define MID_REQUEST_SUBMITTED 2
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 336c116995d7..4f96b3b00a7a 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -93,7 +93,8 @@ extern int cifs_discard_remaining_data(struct TCP_Server_Info *server);
extern int cifs_call_async(struct TCP_Server_Info *server,
struct smb_rqst *rqst,
mid_receive_t *receive, mid_callback_t *callback,
- mid_handle_t *handle, void *cbdata, const int flags);
+ mid_handle_t *handle, void *cbdata, const int flags,
+ const struct cifs_credits *exist_credits);
extern int cifs_send_recv(const unsigned int xid, struct cifs_ses *ses,
struct smb_rqst *rqst, int *resp_buf_type,
const int flags, struct kvec *resp_iov);
@@ -115,7 +116,7 @@ extern int cifs_check_receive(struct mid_q_entry *mid,
struct TCP_Server_Info *server, bool log_error);
extern int cifs_wait_mtu_credits(struct TCP_Server_Info *server,
unsigned int size, unsigned int *num,
- unsigned int *credits);
+ struct cifs_credits *credits);
extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *,
struct kvec *, int /* nvec to send */,
int * /* type of buf returned */, const int flags,
@@ -133,6 +134,9 @@ extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
unsigned int bytes_written);
extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
+extern int cifs_get_writable_file(struct cifsInodeInfo *cifs_inode,
+ bool fsuid_only,
+ struct cifsFileInfo **ret_file);
extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
extern unsigned int smbCalcSize(void *buf, struct TCP_Server_Info *server);
extern int decode_negTokenInit(unsigned char *security_blob, int length,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index b1f49c1c543a..f43747c062a7 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -128,24 +128,31 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc,
int rc;
struct dfs_cache_tgt_list tl;
struct dfs_cache_tgt_iterator *it = NULL;
- char tree[MAX_TREE_SIZE + 1];
+ char *tree;
const char *tcp_host;
size_t tcp_host_len;
const char *dfs_host;
size_t dfs_host_len;
+ tree = kzalloc(MAX_TREE_SIZE, GFP_KERNEL);
+ if (!tree)
+ return -ENOMEM;
+
if (tcon->ipc) {
- snprintf(tree, sizeof(tree), "\\\\%s\\IPC$",
- tcon->ses->server->hostname);
- return CIFSTCon(0, tcon->ses, tree, tcon, nlsc);
+ scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$",
+ tcon->ses->server->hostname);
+ rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc);
+ goto out;
}
- if (!tcon->dfs_path)
- return CIFSTCon(0, tcon->ses, tcon->treeName, tcon, nlsc);
+ if (!tcon->dfs_path) {
+ rc = CIFSTCon(0, tcon->ses, tcon->treeName, tcon, nlsc);
+ goto out;
+ }
rc = dfs_cache_noreq_find(tcon->dfs_path + 1, NULL, &tl);
if (rc)
- return rc;
+ goto out;
extract_unc_hostname(tcon->ses->server->hostname, &tcp_host,
&tcp_host_len);
@@ -165,7 +172,7 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc,
continue;
}
- snprintf(tree, sizeof(tree), "\\%s", tgt);
+ scnprintf(tree, MAX_TREE_SIZE, "\\%s", tgt);
rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc);
if (!rc)
@@ -182,6 +189,8 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc,
rc = -ENOENT;
}
dfs_cache_free_tgts(&tl);
+out:
+ kfree(tree);
return rc;
}
#else
@@ -813,9 +822,10 @@ static void
cifs_echo_callback(struct mid_q_entry *mid)
{
struct TCP_Server_Info *server = mid->callback_data;
+ struct cifs_credits credits = { .value = 1, .instance = 0 };
DeleteMidQEntry(mid);
- add_credits(server, 1, CIFS_ECHO_OP);
+ add_credits(server, &credits, CIFS_ECHO_OP);
}
int
@@ -850,7 +860,7 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
iov[1].iov_base = (char *)smb + 4;
rc = cifs_call_async(server, &rqst, NULL, cifs_echo_callback, NULL,
- server, CIFS_ASYNC_OP | CIFS_ECHO_OP);
+ server, CIFS_ASYNC_OP | CIFS_ECHO_OP, NULL);
if (rc)
cifs_dbg(FYI, "Echo request failed: %d\n", rc);
@@ -1540,18 +1550,26 @@ cifs_discard_remaining_data(struct TCP_Server_Info *server)
}
static int
-cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+__cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid,
+ bool malformed)
{
int length;
- struct cifs_readdata *rdata = mid->callback_data;
length = cifs_discard_remaining_data(server);
- dequeue_mid(mid, rdata->result);
+ dequeue_mid(mid, malformed);
mid->resp_buf = server->smallbuf;
server->smallbuf = NULL;
return length;
}
+static int
+cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+ struct cifs_readdata *rdata = mid->callback_data;
+
+ return __cifs_readv_discard(server, mid, rdata->result);
+}
+
int
cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
@@ -1588,17 +1606,29 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
}
if (server->ops->is_status_pending &&
- server->ops->is_status_pending(buf, server, 0)) {
+ server->ops->is_status_pending(buf, server)) {
cifs_discard_remaining_data(server);
return -1;
}
+ /* set up first two iov for signature check and to get credits */
+ rdata->iov[0].iov_base = buf;
+ rdata->iov[0].iov_len = server->vals->header_preamble_size;
+ rdata->iov[1].iov_base = buf + server->vals->header_preamble_size;
+ rdata->iov[1].iov_len =
+ server->total_read - server->vals->header_preamble_size;
+ cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
+ rdata->iov[0].iov_base, rdata->iov[0].iov_len);
+ cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n",
+ rdata->iov[1].iov_base, rdata->iov[1].iov_len);
+
/* Was the SMB read successful? */
rdata->result = server->ops->map_error(buf, false);
if (rdata->result != 0) {
cifs_dbg(FYI, "%s: server returned error %d\n",
__func__, rdata->result);
- return cifs_readv_discard(server, mid);
+ /* normal error on read response */
+ return __cifs_readv_discard(server, mid, false);
}
/* Is there enough to get to the rest of the READ_RSP header? */
@@ -1642,14 +1672,6 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
server->total_read += length;
}
- /* set up first iov for signature check */
- rdata->iov[0].iov_base = buf;
- rdata->iov[0].iov_len = 4;
- rdata->iov[1].iov_base = buf + 4;
- rdata->iov[1].iov_len = server->total_read - 4;
- cifs_dbg(FYI, "0: iov_base=%p iov_len=%u\n",
- rdata->iov[0].iov_base, server->total_read);
-
/* how much data is in the response? */
#ifdef CONFIG_CIFS_SMB_DIRECT
use_rdma_mr = rdata->mr;
@@ -1693,6 +1715,7 @@ cifs_readv_callback(struct mid_q_entry *mid)
.rq_npages = rdata->nr_pages,
.rq_pagesz = rdata->pagesz,
.rq_tailsz = rdata->tailsz };
+ struct cifs_credits credits = { .value = 1, .instance = 0 };
cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n",
__func__, mid->mid, mid->mid_state, rdata->result,
@@ -1730,7 +1753,7 @@ cifs_readv_callback(struct mid_q_entry *mid)
queue_work(cifsiod_wq, &rdata->work);
DeleteMidQEntry(mid);
- add_credits(server, 1, 0);
+ add_credits(server, &credits, 0);
}
/* cifs_async_readv - send an async write, and set up mid to handle result */
@@ -1789,7 +1812,7 @@ cifs_async_readv(struct cifs_readdata *rdata)
kref_get(&rdata->refcount);
rc = cifs_call_async(tcon->ses->server, &rqst, cifs_readv_receive,
- cifs_readv_callback, NULL, rdata, 0);
+ cifs_readv_callback, NULL, rdata, 0, NULL);
if (rc == 0)
cifs_stats_inc(&tcon->stats.cifs_stats.num_reads);
@@ -2103,18 +2126,22 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
wdata2->tailsz = tailsz;
wdata2->bytes = cur_len;
- wdata2->cfile = find_writable_file(CIFS_I(inode), false);
+ rc = cifs_get_writable_file(CIFS_I(inode), false,
+ &wdata2->cfile);
if (!wdata2->cfile) {
- cifs_dbg(VFS, "No writable handles for inode\n");
- rc = -EBADF;
- break;
+ cifs_dbg(VFS, "No writable handle to retry writepages rc=%d\n",
+ rc);
+ if (!is_retryable_error(rc))
+ rc = -EBADF;
+ } else {
+ wdata2->pid = wdata2->cfile->pid;
+ rc = server->ops->async_writev(wdata2,
+ cifs_writedata_release);
}
- wdata2->pid = wdata2->cfile->pid;
- rc = server->ops->async_writev(wdata2, cifs_writedata_release);
for (j = 0; j < nr_pages; j++) {
unlock_page(wdata2->pages[j]);
- if (rc != 0 && rc != -EAGAIN) {
+ if (rc != 0 && !is_retryable_error(rc)) {
SetPageError(wdata2->pages[j]);
end_page_writeback(wdata2->pages[j]);
put_page(wdata2->pages[j]);
@@ -2123,8 +2150,9 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
if (rc) {
kref_put(&wdata2->refcount, cifs_writedata_release);
- if (rc == -EAGAIN)
+ if (is_retryable_error(rc))
continue;
+ i += nr_pages;
break;
}
@@ -2132,7 +2160,15 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
i += nr_pages;
} while (i < wdata->nr_pages);
- mapping_set_error(inode->i_mapping, rc);
+ /* cleanup remaining pages from the original wdata */
+ for (; i < wdata->nr_pages; i++) {
+ SetPageError(wdata->pages[i]);
+ end_page_writeback(wdata->pages[i]);
+ put_page(wdata->pages[i]);
+ }
+
+ if (rc != 0 && !is_retryable_error(rc))
+ mapping_set_error(inode->i_mapping, rc);
kref_put(&wdata->refcount, cifs_writedata_release);
}
@@ -2205,6 +2241,7 @@ cifs_writev_callback(struct mid_q_entry *mid)
struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
unsigned int written;
WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf;
+ struct cifs_credits credits = { .value = 1, .instance = 0 };
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
@@ -2240,7 +2277,7 @@ cifs_writev_callback(struct mid_q_entry *mid)
queue_work(cifsiod_wq, &wdata->work);
DeleteMidQEntry(mid);
- add_credits(tcon->ses->server, 1, 0);
+ add_credits(tcon->ses->server, &credits, 0);
}
/* cifs_async_writev - send an async write, and set up mid to handle result */
@@ -2318,7 +2355,7 @@ cifs_async_writev(struct cifs_writedata *wdata,
kref_get(&wdata->refcount);
rc = cifs_call_async(tcon->ses->server, &rqst, NULL,
- cifs_writev_callback, NULL, wdata, 0);
+ cifs_writev_callback, NULL, wdata, 0, NULL);
if (rc == 0)
cifs_stats_inc(&tcon->stats.cifs_stats.num_writes);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index f66529679ca2..b95db2b593cb 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -102,7 +102,7 @@ enum {
Opt_backupuid, Opt_backupgid, Opt_uid,
Opt_cruid, Opt_gid, Opt_file_mode,
Opt_dirmode, Opt_port,
- Opt_rsize, Opt_wsize, Opt_actimeo,
+ Opt_blocksize, Opt_rsize, Opt_wsize, Opt_actimeo,
Opt_echo_interval, Opt_max_credits,
Opt_snapshot,
@@ -204,6 +204,7 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_dirmode, "dirmode=%s" },
{ Opt_dirmode, "dir_mode=%s" },
{ Opt_port, "port=%s" },
+ { Opt_blocksize, "bsize=%s" },
{ Opt_rsize, "rsize=%s" },
{ Opt_wsize, "wsize=%s" },
{ Opt_actimeo, "actimeo=%s" },
@@ -348,7 +349,7 @@ static int reconn_set_ipaddr(struct TCP_Server_Info *server)
cifs_dbg(FYI, "%s: failed to create UNC path\n", __func__);
return -ENOMEM;
}
- snprintf(unc, len, "\\\\%s", server->hostname);
+ scnprintf(unc, len, "\\\\%s", server->hostname);
rc = dns_resolve_server_name_to_ip(unc, &ipaddr);
kfree(unc);
@@ -433,9 +434,10 @@ static void reconn_inval_dfs_target(struct TCP_Server_Info *server,
kfree(server->hostname);
server->hostname = extract_hostname(name);
- if (!server->hostname) {
- cifs_dbg(FYI, "%s: failed to extract hostname from target: %d\n",
- __func__, -ENOMEM);
+ if (IS_ERR(server->hostname)) {
+ cifs_dbg(FYI,
+ "%s: failed to extract hostname from target: %ld\n",
+ __func__, PTR_ERR(server->hostname));
}
}
@@ -591,6 +593,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
msleep(3000);
} else {
atomic_inc(&tcpSesReconnectCount);
+ set_credits(server, 1);
spin_lock(&GlobalMid_Lock);
if (server->tcpStatus != CifsExiting)
server->tcpStatus = CifsNeedNegotiate;
@@ -719,6 +722,21 @@ server_unresponsive(struct TCP_Server_Info *server)
return false;
}
+static inline bool
+zero_credits(struct TCP_Server_Info *server)
+{
+ int val;
+
+ spin_lock(&server->req_lock);
+ val = server->credits + server->echo_credits + server->oplock_credits;
+ if (server->in_flight == 0 && val == 0) {
+ spin_unlock(&server->req_lock);
+ return true;
+ }
+ spin_unlock(&server->req_lock);
+ return false;
+}
+
static int
cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg)
{
@@ -731,6 +749,12 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg)
for (total_read = 0; msg_data_left(smb_msg); total_read += length) {
try_to_freeze();
+ /* reconnect if no credits and no requests in flight */
+ if (zero_credits(server)) {
+ cifs_reconnect(server);
+ return -ECONNABORTED;
+ }
+
if (server_unresponsive(server))
return -ECONNABORTED;
if (cifs_rdma_enabled(server) && server->smbd_conn)
@@ -1031,7 +1055,7 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
}
if (server->ops->is_status_pending &&
- server->ops->is_status_pending(buf, server, length))
+ server->ops->is_status_pending(buf, server))
return -1;
if (!mid)
@@ -1041,6 +1065,26 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
return 0;
}
+static void
+smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server)
+{
+ struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer;
+
+ /*
+ * SMB1 does not use credits.
+ */
+ if (server->vals->header_preamble_size)
+ return;
+
+ if (shdr->CreditRequest) {
+ spin_lock(&server->req_lock);
+ server->credits += le16_to_cpu(shdr->CreditRequest);
+ spin_unlock(&server->req_lock);
+ wake_up(&server->request_q);
+ }
+}
+
+
static int
cifs_demultiplex_thread(void *p)
{
@@ -1170,6 +1214,7 @@ next_pdu:
} else if (server->ops->is_oplock_break &&
server->ops->is_oplock_break(bufs[i],
server)) {
+ smb2_add_credits_from_hdr(bufs[i], server);
cifs_dbg(FYI, "Received oplock break\n");
} else {
cifs_dbg(VFS, "No task to wake, unknown frame "
@@ -1181,6 +1226,7 @@ next_pdu:
if (server->ops->dump_detail)
server->ops->dump_detail(bufs[i],
server);
+ smb2_add_credits_from_hdr(bufs[i], server);
cifs_dump_mids(server);
#endif /* CIFS_DEBUG2 */
}
@@ -1464,6 +1510,11 @@ cifs_parse_devname(const char *devname, struct smb_vol *vol)
const char *delims = "/\\";
size_t len;
+ if (unlikely(!devname || !*devname)) {
+ cifs_dbg(VFS, "Device name not specified.\n");
+ return -EINVAL;
+ }
+
/* make sure we have a valid UNC double delimiter prefix */
len = strspn(devname, delims);
if (len != 2)
@@ -1549,7 +1600,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->cred_uid = current_uid();
vol->linux_uid = current_uid();
vol->linux_gid = current_gid();
-
+ vol->bsize = 1024 * 1024; /* can improve cp performance significantly */
/*
* default to SFM style remapping of seven reserved characters
* unless user overrides it or we negotiate CIFS POSIX where
@@ -1922,6 +1973,26 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
}
port = (unsigned short)option;
break;
+ case Opt_blocksize:
+ if (get_option_ul(args, &option)) {
+ cifs_dbg(VFS, "%s: Invalid blocksize value\n",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ /*
+ * inode blocksize realistically should never need to be
+ * less than 16K or greater than 16M and default is 1MB.
+ * Note that small inode block sizes (e.g. 64K) can lead
+ * to very poor performance of common tools like cp and scp
+ */
+ if ((option < CIFS_MAX_MSGSIZE) ||
+ (option > (4 * SMB3_DEFAULT_IOSIZE))) {
+ cifs_dbg(VFS, "%s: Invalid blocksize\n",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->bsize = option;
+ break;
case Opt_rsize:
if (get_option_ul(args, &option)) {
cifs_dbg(VFS, "%s: Invalid rsize value\n",
@@ -2587,7 +2658,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
tcp_ses->session_estab = false;
tcp_ses->sequence_number = 0;
- tcp_ses->reconnect_instance = 0;
+ tcp_ses->reconnect_instance = 1;
tcp_ses->lstrp = jiffies;
spin_lock_init(&tcp_ses->req_lock);
INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
@@ -2748,7 +2819,7 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb_vol *volume_info)
if (tcon == NULL)
return -ENOMEM;
- snprintf(unc, sizeof(unc), "\\\\%s\\IPC$", ses->server->hostname);
+ scnprintf(unc, sizeof(unc), "\\\\%s\\IPC$", ses->server->hostname);
/* cannot fail */
nls_codepage = load_nls_default();
@@ -3817,6 +3888,7 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
spin_lock_init(&cifs_sb->tlink_tree_lock);
cifs_sb->tlink_tree = RB_ROOT;
+ cifs_sb->bsize = pvolume_info->bsize;
/*
* Temporarily set r/wsize for matching superblock. If we end up using
* new sb then client will later negotiate it downward if needed.
@@ -4176,7 +4248,7 @@ static int update_vol_info(const struct dfs_cache_tgt_iterator *tgt_it,
new_unc = kmalloc(len, GFP_KERNEL);
if (!new_unc)
return -ENOMEM;
- snprintf(new_unc, len, "\\%s", tgt);
+ scnprintf(new_unc, len, "\\%s", tgt);
kfree(vol->UNC);
vol->UNC = new_unc;
@@ -4880,8 +4952,6 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses)
if (!server->ops->need_neg(server))
return 0;
- set_credits(server, 1);
-
rc = server->ops->negotiate(xid, ses);
if (rc == 0) {
spin_lock(&GlobalMid_Lock);
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index cd63c4a70875..09b7d0d4f6e4 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -776,6 +776,7 @@ static int get_tgt_list(const struct dfs_cache_entry *ce,
it->it_name = kstrndup(t->t_name, strlen(t->t_name),
GFP_KERNEL);
if (!it->it_name) {
+ kfree(it);
rc = -ENOMEM;
goto err_free_it;
}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e3e3a7550205..4c144c1f50eb 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -733,7 +733,8 @@ reopen_success:
if (can_flush) {
rc = filemap_write_and_wait(inode->i_mapping);
- mapping_set_error(inode->i_mapping, rc);
+ if (!is_interrupt_error(rc))
+ mapping_set_error(inode->i_mapping, rc);
if (tcon->unix_ext)
rc = cifs_get_inode_info_unix(&inode, full_path,
@@ -1132,14 +1133,18 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
/*
* Accessing maxBuf is racy with cifs_reconnect - need to store value
- * and check it for zero before using.
+ * and check it before using.
*/
max_buf = tcon->ses->server->maxBuf;
- if (!max_buf) {
+ if (max_buf < (sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE))) {
free_xid(xid);
return -EINVAL;
}
+ BUILD_BUG_ON(sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE) >
+ PAGE_SIZE);
+ max_buf = min_t(unsigned int, max_buf - sizeof(struct smb_hdr),
+ PAGE_SIZE);
max_num = (max_buf - sizeof(struct smb_hdr)) /
sizeof(LOCKING_ANDX_RANGE);
buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
@@ -1472,12 +1477,16 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
/*
* Accessing maxBuf is racy with cifs_reconnect - need to store value
- * and check it for zero before using.
+ * and check it before using.
*/
max_buf = tcon->ses->server->maxBuf;
- if (!max_buf)
+ if (max_buf < (sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE)))
return -EINVAL;
+ BUILD_BUG_ON(sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE) >
+ PAGE_SIZE);
+ max_buf = min_t(unsigned int, max_buf - sizeof(struct smb_hdr),
+ PAGE_SIZE);
max_num = (max_buf - sizeof(struct smb_hdr)) /
sizeof(LOCKING_ANDX_RANGE);
buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
@@ -1833,24 +1842,30 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
return NULL;
}
-struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
- bool fsuid_only)
+/* Return -EBADF if no handle is found and general rc otherwise */
+int
+cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only,
+ struct cifsFileInfo **ret_file)
{
struct cifsFileInfo *open_file, *inv_file = NULL;
struct cifs_sb_info *cifs_sb;
struct cifs_tcon *tcon;
bool any_available = false;
- int rc;
+ int rc = -EBADF;
unsigned int refind = 0;
- /* Having a null inode here (because mapping->host was set to zero by
- the VFS or MM) should not happen but we had reports of on oops (due to
- it being zero) during stress testcases so we need to check for it */
+ *ret_file = NULL;
+
+ /*
+ * Having a null inode here (because mapping->host was set to zero by
+ * the VFS or MM) should not happen but we had reports of on oops (due
+ * to it being zero) during stress testcases so we need to check for it
+ */
if (cifs_inode == NULL) {
cifs_dbg(VFS, "Null inode passed to cifs_writeable_file\n");
dump_stack();
- return NULL;
+ return rc;
}
cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
@@ -1864,7 +1879,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
refind_writable:
if (refind > MAX_REOPEN_ATT) {
spin_unlock(&tcon->open_file_lock);
- return NULL;
+ return rc;
}
list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
if (!any_available && open_file->pid != current->tgid)
@@ -1876,7 +1891,8 @@ refind_writable:
/* found a good writable file */
cifsFileInfo_get(open_file);
spin_unlock(&tcon->open_file_lock);
- return open_file;
+ *ret_file = open_file;
+ return 0;
} else {
if (!inv_file)
inv_file = open_file;
@@ -1898,22 +1914,35 @@ refind_writable:
if (inv_file) {
rc = cifs_reopen_file(inv_file, false);
- if (!rc)
- return inv_file;
- else {
- spin_lock(&tcon->open_file_lock);
- list_move_tail(&inv_file->flist,
- &cifs_inode->openFileList);
- spin_unlock(&tcon->open_file_lock);
- cifsFileInfo_put(inv_file);
- ++refind;
- inv_file = NULL;
- spin_lock(&tcon->open_file_lock);
- goto refind_writable;
+ if (!rc) {
+ *ret_file = inv_file;
+ return 0;
}
+
+ spin_lock(&tcon->open_file_lock);
+ list_move_tail(&inv_file->flist, &cifs_inode->openFileList);
+ spin_unlock(&tcon->open_file_lock);
+ cifsFileInfo_put(inv_file);
+ ++refind;
+ inv_file = NULL;
+ spin_lock(&tcon->open_file_lock);
+ goto refind_writable;
}
- return NULL;
+ return rc;
+}
+
+struct cifsFileInfo *
+find_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only)
+{
+ struct cifsFileInfo *cfile;
+ int rc;
+
+ rc = cifs_get_writable_file(cifs_inode, fsuid_only, &cfile);
+ if (rc)
+ cifs_dbg(FYI, "couldn't find writable handle rc=%d", rc);
+
+ return cfile;
}
static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
@@ -1950,8 +1979,8 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
if (mapping->host->i_size - offset < (loff_t)to)
to = (unsigned)(mapping->host->i_size - offset);
- open_file = find_writable_file(CIFS_I(mapping->host), false);
- if (open_file) {
+ rc = cifs_get_writable_file(CIFS_I(mapping->host), false, &open_file);
+ if (!rc) {
bytes_written = cifs_write(open_file, open_file->pid,
write_data, to - from, &offset);
cifsFileInfo_put(open_file);
@@ -1961,9 +1990,12 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
rc = 0;
else if (bytes_written < 0)
rc = bytes_written;
+ else
+ rc = -EFAULT;
} else {
- cifs_dbg(FYI, "No writeable filehandles for inode\n");
- rc = -EIO;
+ cifs_dbg(FYI, "No writable handle for write page rc=%d\n", rc);
+ if (!is_retryable_error(rc))
+ rc = -EIO;
}
kunmap(page);
@@ -2070,9 +2102,9 @@ static int
wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
struct address_space *mapping, struct writeback_control *wbc)
{
- int rc = 0;
- struct TCP_Server_Info *server;
- unsigned int i;
+ int rc;
+ struct TCP_Server_Info *server =
+ tlink_tcon(wdata->cfile->tlink)->ses->server;
wdata->sync_mode = wbc->sync_mode;
wdata->nr_pages = nr_pages;
@@ -2082,21 +2114,16 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
page_offset(wdata->pages[nr_pages - 1]),
(loff_t)PAGE_SIZE);
wdata->bytes = ((nr_pages - 1) * PAGE_SIZE) + wdata->tailsz;
+ wdata->pid = wdata->cfile->pid;
- if (wdata->cfile != NULL)
- cifsFileInfo_put(wdata->cfile);
- wdata->cfile = find_writable_file(CIFS_I(mapping->host), false);
- if (!wdata->cfile) {
- cifs_dbg(VFS, "No writable handles for inode\n");
- rc = -EBADF;
- } else {
- wdata->pid = wdata->cfile->pid;
- server = tlink_tcon(wdata->cfile->tlink)->ses->server;
- rc = server->ops->async_writev(wdata, cifs_writedata_release);
- }
+ rc = adjust_credits(server, &wdata->credits, wdata->bytes);
+ if (rc)
+ return rc;
- for (i = 0; i < nr_pages; ++i)
- unlock_page(wdata->pages[i]);
+ if (wdata->cfile->invalidHandle)
+ rc = -EAGAIN;
+ else
+ rc = server->ops->async_writev(wdata, cifs_writedata_release);
return rc;
}
@@ -2104,12 +2131,15 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
static int cifs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb);
+ struct inode *inode = mapping->host;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct TCP_Server_Info *server;
bool done = false, scanned = false, range_whole = false;
pgoff_t end, index;
struct cifs_writedata *wdata;
+ struct cifsFileInfo *cfile = NULL;
int rc = 0;
+ int saved_rc = 0;
unsigned int xid;
/*
@@ -2133,13 +2163,27 @@ static int cifs_writepages(struct address_space *mapping,
server = cifs_sb_master_tcon(cifs_sb)->ses->server;
retry:
while (!done && index <= end) {
- unsigned int i, nr_pages, found_pages, wsize, credits;
+ unsigned int i, nr_pages, found_pages, wsize;
pgoff_t next = 0, tofind, saved_index = index;
+ struct cifs_credits credits_on_stack;
+ struct cifs_credits *credits = &credits_on_stack;
+ int get_file_rc = 0;
- rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
- &wsize, &credits);
+ if (cfile)
+ cifsFileInfo_put(cfile);
+
+ rc = cifs_get_writable_file(CIFS_I(inode), false, &cfile);
+
+ /* in case of an error store it to return later */
if (rc)
+ get_file_rc = rc;
+
+ rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
+ &wsize, credits);
+ if (rc != 0) {
+ done = true;
break;
+ }
tofind = min((wsize / PAGE_SIZE) - 1, end - index) + 1;
@@ -2147,6 +2191,7 @@ retry:
&found_pages);
if (!wdata) {
rc = -ENOMEM;
+ done = true;
add_credits_and_wake_if(server, credits, 0);
break;
}
@@ -2167,15 +2212,28 @@ retry:
continue;
}
- wdata->credits = credits;
+ wdata->credits = credits_on_stack;
+ wdata->cfile = cfile;
+ cfile = NULL;
+
+ if (!wdata->cfile) {
+ cifs_dbg(VFS, "No writable handle in writepages rc=%d\n",
+ get_file_rc);
+ if (is_retryable_error(get_file_rc))
+ rc = get_file_rc;
+ else
+ rc = -EBADF;
+ } else
+ rc = wdata_send_pages(wdata, nr_pages, mapping, wbc);
- rc = wdata_send_pages(wdata, nr_pages, mapping, wbc);
+ for (i = 0; i < nr_pages; ++i)
+ unlock_page(wdata->pages[i]);
/* send failure -- clean up the mess */
if (rc != 0) {
- add_credits_and_wake_if(server, wdata->credits, 0);
+ add_credits_and_wake_if(server, &wdata->credits, 0);
for (i = 0; i < nr_pages; ++i) {
- if (rc == -EAGAIN)
+ if (is_retryable_error(rc))
redirty_page_for_writepage(wbc,
wdata->pages[i]);
else
@@ -2183,7 +2241,7 @@ retry:
end_page_writeback(wdata->pages[i]);
put_page(wdata->pages[i]);
}
- if (rc != -EAGAIN)
+ if (!is_retryable_error(rc))
mapping_set_error(mapping, rc);
}
kref_put(&wdata->refcount, cifs_writedata_release);
@@ -2193,6 +2251,15 @@ retry:
continue;
}
+ /* Return immediately if we received a signal during writing */
+ if (is_interrupt_error(rc)) {
+ done = true;
+ break;
+ }
+
+ if (rc != 0 && saved_rc == 0)
+ saved_rc = rc;
+
wbc->nr_to_write -= nr_pages;
if (wbc->nr_to_write <= 0)
done = true;
@@ -2210,9 +2277,14 @@ retry:
goto retry;
}
+ if (saved_rc != 0)
+ rc = saved_rc;
+
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = index;
+ if (cfile)
+ cifsFileInfo_put(cfile);
free_xid(xid);
return rc;
}
@@ -2242,8 +2314,8 @@ cifs_writepage_locked(struct page *page, struct writeback_control *wbc)
set_page_writeback(page);
retry_write:
rc = cifs_partialpagewrite(page, 0, PAGE_SIZE);
- if (rc == -EAGAIN) {
- if (wbc->sync_mode == WB_SYNC_ALL)
+ if (is_retryable_error(rc)) {
+ if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN)
goto retry_write;
redirty_page_for_writepage(wbc, page);
} else if (rc != 0) {
@@ -2542,7 +2614,8 @@ static int
cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list,
struct cifs_aio_ctx *ctx)
{
- unsigned int wsize, credits;
+ unsigned int wsize;
+ struct cifs_credits credits;
int rc;
struct TCP_Server_Info *server =
tlink_tcon(wdata->cfile->tlink)->ses->server;
@@ -2552,18 +2625,19 @@ cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list,
* Note: we are attempting to resend the whole wdata not in segments
*/
do {
- rc = server->ops->wait_mtu_credits(
- server, wdata->bytes, &wsize, &credits);
+ rc = server->ops->wait_mtu_credits(server, wdata->bytes, &wsize,
+ &credits);
if (rc)
goto out;
if (wsize < wdata->bytes) {
- add_credits_and_wake_if(server, credits, 0);
+ add_credits_and_wake_if(server, &credits, 0);
msleep(1000);
}
} while (wsize < wdata->bytes);
+ wdata->credits = credits;
rc = -EAGAIN;
while (rc == -EAGAIN) {
rc = 0;
@@ -2579,7 +2653,7 @@ cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list,
return 0;
}
- add_credits_and_wake_if(server, wdata->credits, 0);
+ add_credits_and_wake_if(server, &wdata->credits, 0);
out:
kref_put(&wdata->refcount, cifs_uncached_writedata_release);
@@ -2602,6 +2676,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
struct TCP_Server_Info *server;
struct page **pagevec;
size_t start;
+ unsigned int xid;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
pid = open_file->pid;
@@ -2609,12 +2684,23 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
pid = current->tgid;
server = tlink_tcon(open_file->tlink)->ses->server;
+ xid = get_xid();
do {
- unsigned int wsize, credits;
+ unsigned int wsize;
+ struct cifs_credits credits_on_stack;
+ struct cifs_credits *credits = &credits_on_stack;
+
+ if (open_file->invalidHandle) {
+ rc = cifs_reopen_file(open_file, false);
+ if (rc == -EAGAIN)
+ continue;
+ else if (rc)
+ break;
+ }
rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
- &wsize, &credits);
+ &wsize, credits);
if (rc)
break;
@@ -2671,6 +2757,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
if (rc) {
+ kvfree(wdata->pages);
kfree(wdata);
add_credits_and_wake_if(server, credits, 0);
break;
@@ -2682,6 +2769,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
if (rc) {
for (i = 0; i < nr_pages; i++)
put_page(wdata->pages[i]);
+ kvfree(wdata->pages);
kfree(wdata);
add_credits_and_wake_if(server, credits, 0);
break;
@@ -2704,16 +2792,22 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
wdata->pid = pid;
wdata->bytes = cur_len;
wdata->pagesz = PAGE_SIZE;
- wdata->credits = credits;
+ wdata->credits = credits_on_stack;
wdata->ctx = ctx;
kref_get(&ctx->refcount);
- if (!wdata->cfile->invalidHandle ||
- !(rc = cifs_reopen_file(wdata->cfile, false)))
- rc = server->ops->async_writev(wdata,
+ rc = adjust_credits(server, &wdata->credits, wdata->bytes);
+
+ if (!rc) {
+ if (wdata->cfile->invalidHandle)
+ rc = -EAGAIN;
+ else
+ rc = server->ops->async_writev(wdata,
cifs_uncached_writedata_release);
+ }
+
if (rc) {
- add_credits_and_wake_if(server, wdata->credits, 0);
+ add_credits_and_wake_if(server, &wdata->credits, 0);
kref_put(&wdata->refcount,
cifs_uncached_writedata_release);
if (rc == -EAGAIN) {
@@ -2729,6 +2823,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
len -= cur_len;
} while (len > 0);
+ free_xid(xid);
return rc;
}
@@ -3001,14 +3096,16 @@ cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from)
* these pages but not on the region from pos to ppos+len-1.
*/
written = cifs_user_writev(iocb, from);
- if (written > 0 && CIFS_CACHE_READ(cinode)) {
+ if (CIFS_CACHE_READ(cinode)) {
/*
- * Windows 7 server can delay breaking level2 oplock if a write
- * request comes - break it on the client to prevent reading
- * an old data.
+ * We have read level caching and we have just sent a write
+ * request to the server thus making data in the cache stale.
+ * Zap the cache and set oplock/lease level to NONE to avoid
+ * reading stale data from the cache. All subsequent read
+ * operations will read new data from the server.
*/
cifs_zap_mapping(inode);
- cifs_dbg(FYI, "Set no oplock for inode=%p after a write operation\n",
+ cifs_dbg(FYI, "Set Oplock/Lease to NONE for inode=%p after write\n",
inode);
cinode->oplock = 0;
}
@@ -3233,7 +3330,8 @@ static int cifs_resend_rdata(struct cifs_readdata *rdata,
struct list_head *rdata_list,
struct cifs_aio_ctx *ctx)
{
- unsigned int rsize, credits;
+ unsigned int rsize;
+ struct cifs_credits credits;
int rc;
struct TCP_Server_Info *server =
tlink_tcon(rdata->cfile->tlink)->ses->server;
@@ -3250,11 +3348,12 @@ static int cifs_resend_rdata(struct cifs_readdata *rdata,
goto out;
if (rsize < rdata->bytes) {
- add_credits_and_wake_if(server, credits, 0);
+ add_credits_and_wake_if(server, &credits, 0);
msleep(1000);
}
} while (rsize < rdata->bytes);
+ rdata->credits = credits;
rc = -EAGAIN;
while (rc == -EAGAIN) {
rc = 0;
@@ -3270,7 +3369,7 @@ static int cifs_resend_rdata(struct cifs_readdata *rdata,
return 0;
}
- add_credits_and_wake_if(server, rdata->credits, 0);
+ add_credits_and_wake_if(server, &rdata->credits, 0);
out:
kref_put(&rdata->refcount,
cifs_uncached_readdata_release);
@@ -3284,7 +3383,9 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
struct cifs_aio_ctx *ctx)
{
struct cifs_readdata *rdata;
- unsigned int npages, rsize, credits;
+ unsigned int npages, rsize;
+ struct cifs_credits credits_on_stack;
+ struct cifs_credits *credits = &credits_on_stack;
size_t cur_len;
int rc;
pid_t pid;
@@ -3304,8 +3405,16 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
iov_iter_advance(&direct_iov, offset - ctx->pos);
do {
+ if (open_file->invalidHandle) {
+ rc = cifs_reopen_file(open_file, true);
+ if (rc == -EAGAIN)
+ continue;
+ else if (rc)
+ break;
+ }
+
rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
- &rsize, &credits);
+ &rsize, credits);
if (rc)
break;
@@ -3361,8 +3470,12 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
}
rc = cifs_read_allocate_pages(rdata, npages);
- if (rc)
- goto error;
+ if (rc) {
+ kvfree(rdata->pages);
+ kfree(rdata);
+ add_credits_and_wake_if(server, credits, 0);
+ break;
+ }
rdata->tailsz = PAGE_SIZE;
}
@@ -3375,16 +3488,21 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
rdata->pagesz = PAGE_SIZE;
rdata->read_into_pages = cifs_uncached_read_into_pages;
rdata->copy_into_pages = cifs_uncached_copy_into_pages;
- rdata->credits = credits;
+ rdata->credits = credits_on_stack;
rdata->ctx = ctx;
kref_get(&ctx->refcount);
- if (!rdata->cfile->invalidHandle ||
- !(rc = cifs_reopen_file(rdata->cfile, true)))
- rc = server->ops->async_readv(rdata);
-error:
+ rc = adjust_credits(server, &rdata->credits, rdata->bytes);
+
+ if (!rc) {
+ if (rdata->cfile->invalidHandle)
+ rc = -EAGAIN;
+ else
+ rc = server->ops->async_readv(rdata);
+ }
+
if (rc) {
- add_credits_and_wake_if(server, rdata->credits, 0);
+ add_credits_and_wake_if(server, &rdata->credits, 0);
kref_put(&rdata->refcount,
cifs_uncached_readdata_release);
if (rc == -EAGAIN) {
@@ -3503,8 +3621,6 @@ again:
ctx->total_len = ctx->len - iov_iter_count(to);
}
- cifs_stats_bytes_read(tcon, ctx->total_len);
-
/* mask nodata case */
if (rc == -ENODATA)
rc = 0;
@@ -4065,10 +4181,19 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
loff_t offset;
struct page *page, *tpage;
struct cifs_readdata *rdata;
- unsigned credits;
+ struct cifs_credits credits_on_stack;
+ struct cifs_credits *credits = &credits_on_stack;
+
+ if (open_file->invalidHandle) {
+ rc = cifs_reopen_file(open_file, true);
+ if (rc == -EAGAIN)
+ continue;
+ else if (rc)
+ break;
+ }
rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
- &rsize, &credits);
+ &rsize, credits);
if (rc)
break;
@@ -4114,18 +4239,24 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
rdata->tailsz = PAGE_SIZE;
rdata->read_into_pages = cifs_readpages_read_into_pages;
rdata->copy_into_pages = cifs_readpages_copy_into_pages;
- rdata->credits = credits;
+ rdata->credits = credits_on_stack;
list_for_each_entry_safe(page, tpage, &tmplist, lru) {
list_del(&page->lru);
rdata->pages[rdata->nr_pages++] = page;
}
- if (!rdata->cfile->invalidHandle ||
- !(rc = cifs_reopen_file(rdata->cfile, true)))
- rc = server->ops->async_readv(rdata);
+ rc = adjust_credits(server, &rdata->credits, rdata->bytes);
+
+ if (!rc) {
+ if (rdata->cfile->invalidHandle)
+ rc = -EAGAIN;
+ else
+ rc = server->ops->async_readv(rdata);
+ }
+
if (rc) {
- add_credits_and_wake_if(server, rdata->credits, 0);
+ add_credits_and_wake_if(server, &rdata->credits, 0);
for (i = 0; i < rdata->nr_pages; i++) {
page = rdata->pages[i];
lru_cache_add_file(page);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 13fb59aadebc..53fdb5df0d2e 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -2080,7 +2080,7 @@ int cifs_getattr(const struct path *path, struct kstat *stat,
return rc;
generic_fillattr(inode, stat);
- stat->blksize = CIFS_MAX_MSGSIZE;
+ stat->blksize = cifs_sb->bsize;
stat->ino = CIFS_I(inode)->uniqueid;
/* old CIFS Unix Extensions doesn't return create time */
@@ -2257,6 +2257,11 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
* the flush returns error?
*/
rc = filemap_write_and_wait(inode->i_mapping);
+ if (is_interrupt_error(rc)) {
+ rc = -ERESTARTSYS;
+ goto out;
+ }
+
mapping_set_error(inode->i_mapping, rc);
rc = 0;
@@ -2400,6 +2405,11 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
* the flush returns error?
*/
rc = filemap_write_and_wait(inode->i_mapping);
+ if (is_interrupt_error(rc)) {
+ rc = -ERESTARTSYS;
+ goto cifs_setattr_exit;
+ }
+
mapping_set_error(inode->i_mapping, rc);
rc = 0;
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 2148b0f60e5e..62216dc8f9f5 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -103,9 +103,9 @@ parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len,
return rc;
}
- snprintf(md5_str2, sizeof(md5_str2),
- CIFS_MF_SYMLINK_MD5_FORMAT,
- CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
+ scnprintf(md5_str2, sizeof(md5_str2),
+ CIFS_MF_SYMLINK_MD5_FORMAT,
+ CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
if (strncmp(md5_str1, md5_str2, 17) != 0)
return -EINVAL;
@@ -142,10 +142,10 @@ format_mf_symlink(u8 *buf, unsigned int buf_len, const char *link_str)
return rc;
}
- snprintf(buf, buf_len,
- CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
- link_len,
- CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
+ scnprintf(buf, buf_len,
+ CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
+ link_len,
+ CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
ofs = CIFS_MF_SYMLINK_LINK_OFFSET;
memcpy(buf + ofs, link_str, link_len);
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 32a6c020478f..f0ce27c3c6e4 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -117,11 +117,11 @@ cifs_find_mid(struct TCP_Server_Info *server, char *buffer)
}
static void
-cifs_add_credits(struct TCP_Server_Info *server, const unsigned int add,
- const int optype)
+cifs_add_credits(struct TCP_Server_Info *server,
+ const struct cifs_credits *credits, const int optype)
{
spin_lock(&server->req_lock);
- server->credits += add;
+ server->credits += credits->value;
server->in_flight--;
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
@@ -308,7 +308,7 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
remaining = tgt_total_cnt - total_in_tgt;
if (remaining < 0) {
- cifs_dbg(FYI, "Server sent too much data. tgt_total_cnt=%hu total_in_tgt=%hu\n",
+ cifs_dbg(FYI, "Server sent too much data. tgt_total_cnt=%hu total_in_tgt=%u\n",
tgt_total_cnt, total_in_tgt);
return -EPROTO;
}
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 4ed10dd086e6..b204e84b87fb 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -122,12 +122,14 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
/*
* Accessing maxBuf is racy with cifs_reconnect - need to store value
- * and check it for zero before using.
+ * and check it before using.
*/
max_buf = tcon->ses->server->maxBuf;
- if (!max_buf)
+ if (max_buf < sizeof(struct smb2_lock_element))
return -EINVAL;
+ BUILD_BUG_ON(sizeof(struct smb2_lock_element) > PAGE_SIZE);
+ max_buf = min_t(unsigned int, max_buf, PAGE_SIZE);
max_num = max_buf / sizeof(struct smb2_lock_element);
buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL);
if (!buf)
@@ -264,6 +266,8 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
return -EINVAL;
}
+ BUILD_BUG_ON(sizeof(struct smb2_lock_element) > PAGE_SIZE);
+ max_buf = min_t(unsigned int, max_buf, PAGE_SIZE);
max_num = max_buf / sizeof(struct smb2_lock_element);
buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL);
if (!buf) {
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index f14533da3a93..01a76bccdb8d 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -293,6 +293,8 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
int rc;
struct smb2_file_all_info *smb2_data;
__u32 create_options = 0;
+ struct cifs_fid fid;
+ bool no_cached_open = tcon->nohandlecache;
*adjust_tz = false;
*symlink = false;
@@ -301,6 +303,21 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
GFP_KERNEL);
if (smb2_data == NULL)
return -ENOMEM;
+
+ /* If it is a root and its handle is cached then use it */
+ if (!strlen(full_path) && !no_cached_open) {
+ rc = open_shroot(xid, tcon, &fid);
+ if (rc)
+ goto out;
+ rc = SMB2_query_info(xid, tcon, fid.persistent_fid,
+ fid.volatile_fid, smb2_data);
+ close_shroot(&tcon->crfid);
+ if (rc)
+ goto out;
+ move_smb2_info_to_cifs(data, smb2_data);
+ goto out;
+ }
+
if (backup_cred(cifs_sb))
create_options |= CREATE_OPEN_BACKUP_INTENT;
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 6a9c47541c53..0e3570e40ff8 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -517,7 +517,6 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
__u8 lease_state;
struct list_head *tmp;
struct cifsFileInfo *cfile;
- struct TCP_Server_Info *server = tcon->ses->server;
struct cifs_pending_open *open;
struct cifsInodeInfo *cinode;
int ack_req = le32_to_cpu(rsp->Flags &
@@ -537,13 +536,25 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
cifs_dbg(FYI, "lease key match, lease break 0x%x\n",
le32_to_cpu(rsp->NewLeaseState));
- server->ops->set_oplock_level(cinode, lease_state, 0, NULL);
-
if (ack_req)
cfile->oplock_break_cancelled = false;
else
cfile->oplock_break_cancelled = true;
+ set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags);
+
+ /*
+ * Set or clear flags depending on the lease state being READ.
+ * HANDLE caching flag should be added when the client starts
+ * to defer closing remote file handles with HANDLE leases.
+ */
+ if (lease_state & SMB2_LEASE_READ_CACHING_HE)
+ set_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+ &cinode->flags);
+ else
+ clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+ &cinode->flags);
+
queue_work(cifsoplockd_wq, &cfile->oplock_break);
kfree(lw);
return true;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index cf7eb891804f..085e91436da7 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -34,6 +34,7 @@
#include "cifs_ioctl.h"
#include "smbdirect.h"
+/* Change credits for different ops and return the total number of credits */
static int
change_conf(struct TCP_Server_Info *server)
{
@@ -41,17 +42,15 @@ change_conf(struct TCP_Server_Info *server)
server->oplock_credits = server->echo_credits = 0;
switch (server->credits) {
case 0:
- return -1;
+ return 0;
case 1:
server->echoes = false;
server->oplocks = false;
- cifs_dbg(VFS, "disabling echoes and oplocks\n");
break;
case 2:
server->echoes = true;
server->oplocks = false;
server->echo_credits = 1;
- cifs_dbg(FYI, "disabling oplocks\n");
break;
default:
server->echoes = true;
@@ -64,14 +63,18 @@ change_conf(struct TCP_Server_Info *server)
server->echo_credits = 1;
}
server->credits -= server->echo_credits + server->oplock_credits;
- return 0;
+ return server->credits + server->echo_credits + server->oplock_credits;
}
static void
-smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add,
- const int optype)
+smb2_add_credits(struct TCP_Server_Info *server,
+ const struct cifs_credits *credits, const int optype)
{
- int *val, rc = 0;
+ int *val, rc = -1;
+ unsigned int add = credits->value;
+ unsigned int instance = credits->instance;
+ bool reconnect_detected = false;
+
spin_lock(&server->req_lock);
val = server->ops->get_credits_field(server, optype);
@@ -79,8 +82,11 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add,
if (((optype & CIFS_OP_MASK) == CIFS_NEG_OP) && (*val != 0))
trace_smb3_reconnect_with_invalid_credits(server->CurrentMid,
server->hostname, *val);
+ if ((instance == 0) || (instance == server->reconnect_instance))
+ *val += add;
+ else
+ reconnect_detected = true;
- *val += add;
if (*val > 65000) {
*val = 65000; /* Don't get near 64K credits, avoid srv bugs */
printk_once(KERN_WARNING "server overflowed SMB3 credits\n");
@@ -101,8 +107,31 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add,
}
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
- if (rc)
- cifs_reconnect(server);
+
+ if (reconnect_detected)
+ cifs_dbg(FYI, "trying to put %d credits from the old server instance %d\n",
+ add, instance);
+
+ if (server->tcpStatus == CifsNeedReconnect
+ || server->tcpStatus == CifsExiting)
+ return;
+
+ switch (rc) {
+ case -1:
+ /* change_conf hasn't been executed */
+ break;
+ case 0:
+ cifs_dbg(VFS, "Possible client or server bug - zero credits\n");
+ break;
+ case 1:
+ cifs_dbg(VFS, "disabling echoes and oplocks\n");
+ break;
+ case 2:
+ cifs_dbg(FYI, "disabling oplocks\n");
+ break;
+ default:
+ cifs_dbg(FYI, "add %u credits total=%d\n", add, rc);
+ }
}
static void
@@ -136,12 +165,16 @@ smb2_get_credits(struct mid_q_entry *mid)
{
struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)mid->resp_buf;
- return le16_to_cpu(shdr->CreditRequest);
+ if (mid->mid_state == MID_RESPONSE_RECEIVED
+ || mid->mid_state == MID_RESPONSE_MALFORMED)
+ return le16_to_cpu(shdr->CreditRequest);
+
+ return 0;
}
static int
smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
- unsigned int *num, unsigned int *credits)
+ unsigned int *num, struct cifs_credits *credits)
{
int rc = 0;
unsigned int scredits;
@@ -165,19 +198,22 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
scredits = server->credits;
/* can deadlock with reopen */
- if (scredits == 1) {
+ if (scredits <= 8) {
*num = SMB2_MAX_BUFFER_SIZE;
- *credits = 0;
+ credits->value = 0;
+ credits->instance = 0;
break;
}
- /* leave one credit for a possible reopen */
- scredits--;
+ /* leave some credits for reopen and other ops */
+ scredits -= 8;
*num = min_t(unsigned int, size,
scredits * SMB2_MAX_BUFFER_SIZE);
- *credits = DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE);
- server->credits -= *credits;
+ credits->value =
+ DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE);
+ credits->instance = server->reconnect_instance;
+ server->credits -= credits->value;
server->in_flight++;
break;
}
@@ -186,6 +222,38 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
return rc;
}
+static int
+smb2_adjust_credits(struct TCP_Server_Info *server,
+ struct cifs_credits *credits,
+ const unsigned int payload_size)
+{
+ int new_val = DIV_ROUND_UP(payload_size, SMB2_MAX_BUFFER_SIZE);
+
+ if (!credits->value || credits->value == new_val)
+ return 0;
+
+ if (credits->value < new_val) {
+ WARN_ONCE(1, "request has less credits (%d) than required (%d)",
+ credits->value, new_val);
+ return -ENOTSUPP;
+ }
+
+ spin_lock(&server->req_lock);
+
+ if (server->reconnect_instance != credits->instance) {
+ spin_unlock(&server->req_lock);
+ cifs_dbg(VFS, "trying to return %d credits to old session\n",
+ credits->value - new_val);
+ return -EAGAIN;
+ }
+
+ server->credits += credits->value - new_val;
+ spin_unlock(&server->req_lock);
+ wake_up(&server->request_q);
+ credits->value = new_val;
+ return 0;
+}
+
static __u64
smb2_get_next_mid(struct TCP_Server_Info *server)
{
@@ -197,6 +265,15 @@ smb2_get_next_mid(struct TCP_Server_Info *server)
return mid;
}
+static void
+smb2_revert_current_mid(struct TCP_Server_Info *server, const unsigned int val)
+{
+ spin_lock(&GlobalMid_Lock);
+ if (server->CurrentMid >= val)
+ server->CurrentMid -= val;
+ spin_unlock(&GlobalMid_Lock);
+}
+
static struct mid_q_entry *
smb2_find_mid(struct TCP_Server_Info *server, char *buf)
{
@@ -844,7 +921,9 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon,
FILE_READ_EA,
FILE_FULL_EA_INFORMATION,
SMB2_O_INFO_FILE,
- SMB2_MAX_EA_BUF,
+ CIFSMaxBufSize -
+ MAX_SMB2_CREATE_RESPONSE_SIZE -
+ MAX_SMB2_CLOSE_RESPONSE_SIZE,
&rsp_iov, &buftype, cifs_sb);
if (rc) {
/*
@@ -916,6 +995,16 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
memset(rsp_iov, 0, sizeof(rsp_iov));
+ if (ses->server->ops->query_all_EAs) {
+ if (!ea_value) {
+ rc = ses->server->ops->query_all_EAs(xid, tcon, path,
+ ea_name, NULL, 0,
+ cifs_sb);
+ if (rc == -ENODATA)
+ goto sea_exit;
+ }
+ }
+
/* Open */
memset(&open_iov, 0, sizeof(open_iov));
rqst[0].rq_iov = open_iov;
@@ -1729,14 +1818,14 @@ smb2_close_dir(const unsigned int xid, struct cifs_tcon *tcon,
* the number of credits and return true. Otherwise - return false.
*/
static bool
-smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length)
+smb2_is_status_pending(char *buf, struct TCP_Server_Info *server)
{
struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
if (shdr->Status != STATUS_PENDING)
return false;
- if (!length) {
+ if (shdr->CreditRequest) {
spin_lock(&server->req_lock);
server->credits += le16_to_cpu(shdr->CreditRequest);
spin_unlock(&server->req_lock);
@@ -2571,6 +2660,15 @@ smb2_downgrade_oplock(struct TCP_Server_Info *server,
}
static void
+smb21_downgrade_oplock(struct TCP_Server_Info *server,
+ struct cifsInodeInfo *cinode, bool set_level2)
+{
+ server->ops->set_oplock_level(cinode,
+ set_level2 ? SMB2_LEASE_READ_CACHING_HE :
+ 0, 0, NULL);
+}
+
+static void
smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
unsigned int epoch, bool *purge_cache)
{
@@ -3186,14 +3284,26 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
}
if (server->ops->is_status_pending &&
- server->ops->is_status_pending(buf, server, 0))
+ server->ops->is_status_pending(buf, server))
return -1;
- rdata->result = server->ops->map_error(buf, false);
+ /* set up first two iov to get credits */
+ rdata->iov[0].iov_base = buf;
+ rdata->iov[0].iov_len = 0;
+ rdata->iov[1].iov_base = buf;
+ rdata->iov[1].iov_len =
+ min_t(unsigned int, buf_len, server->vals->read_rsp_size);
+ cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
+ rdata->iov[0].iov_base, rdata->iov[0].iov_len);
+ cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n",
+ rdata->iov[1].iov_base, rdata->iov[1].iov_len);
+
+ rdata->result = server->ops->map_error(buf, true);
if (rdata->result != 0) {
cifs_dbg(FYI, "%s: server returned error %d\n",
__func__, rdata->result);
- dequeue_mid(mid, rdata->result);
+ /* normal error on read response */
+ dequeue_mid(mid, false);
return 0;
}
@@ -3266,14 +3376,6 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
return 0;
}
- /* set up first iov for signature check */
- rdata->iov[0].iov_base = buf;
- rdata->iov[0].iov_len = 4;
- rdata->iov[1].iov_base = buf + 4;
- rdata->iov[1].iov_len = server->vals->read_rsp_size - 4;
- cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
- rdata->iov[0].iov_base, server->vals->read_rsp_size);
-
length = rdata->copy_into_pages(server, rdata, &iter);
kfree(bvec);
@@ -3513,6 +3615,7 @@ struct smb_version_operations smb20_operations = {
.get_credits = smb2_get_credits,
.wait_mtu_credits = cifs_wait_mtu_credits,
.get_next_mid = smb2_get_next_mid,
+ .revert_current_mid = smb2_revert_current_mid,
.read_data_offset = smb2_read_data_offset,
.read_data_length = smb2_read_data_length,
.map_error = map_smb2_to_linux_error,
@@ -3607,7 +3710,9 @@ struct smb_version_operations smb21_operations = {
.get_credits_field = smb2_get_credits_field,
.get_credits = smb2_get_credits,
.wait_mtu_credits = smb2_wait_mtu_credits,
+ .adjust_credits = smb2_adjust_credits,
.get_next_mid = smb2_get_next_mid,
+ .revert_current_mid = smb2_revert_current_mid,
.read_data_offset = smb2_read_data_offset,
.read_data_length = smb2_read_data_length,
.map_error = map_smb2_to_linux_error,
@@ -3618,7 +3723,7 @@ struct smb_version_operations smb21_operations = {
.print_stats = smb2_print_stats,
.is_oplock_break = smb2_is_valid_oplock_break,
.handle_cancelled_mid = smb2_handle_cancelled_mid,
- .downgrade_oplock = smb2_downgrade_oplock,
+ .downgrade_oplock = smb21_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb2_negotiate_wsize,
@@ -3703,7 +3808,9 @@ struct smb_version_operations smb30_operations = {
.get_credits_field = smb2_get_credits_field,
.get_credits = smb2_get_credits,
.wait_mtu_credits = smb2_wait_mtu_credits,
+ .adjust_credits = smb2_adjust_credits,
.get_next_mid = smb2_get_next_mid,
+ .revert_current_mid = smb2_revert_current_mid,
.read_data_offset = smb2_read_data_offset,
.read_data_length = smb2_read_data_length,
.map_error = map_smb2_to_linux_error,
@@ -3715,7 +3822,7 @@ struct smb_version_operations smb30_operations = {
.dump_share_caps = smb2_dump_share_caps,
.is_oplock_break = smb2_is_valid_oplock_break,
.handle_cancelled_mid = smb2_handle_cancelled_mid,
- .downgrade_oplock = smb2_downgrade_oplock,
+ .downgrade_oplock = smb21_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb3_negotiate_wsize,
@@ -3808,7 +3915,9 @@ struct smb_version_operations smb311_operations = {
.get_credits_field = smb2_get_credits_field,
.get_credits = smb2_get_credits,
.wait_mtu_credits = smb2_wait_mtu_credits,
+ .adjust_credits = smb2_adjust_credits,
.get_next_mid = smb2_get_next_mid,
+ .revert_current_mid = smb2_revert_current_mid,
.read_data_offset = smb2_read_data_offset,
.read_data_length = smb2_read_data_length,
.map_error = map_smb2_to_linux_error,
@@ -3820,7 +3929,7 @@ struct smb_version_operations smb311_operations = {
.dump_share_caps = smb2_dump_share_caps,
.is_oplock_break = smb2_is_valid_oplock_break,
.handle_cancelled_mid = smb2_handle_cancelled_mid,
- .downgrade_oplock = smb2_downgrade_oplock,
+ .downgrade_oplock = smb21_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb3_negotiate_wsize,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index e57f6aa1d638..60fbe306f604 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -107,13 +107,13 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd,
struct TCP_Server_Info *server = tcon->ses->server;
spin_lock(&server->req_lock);
- /* Request up to 2 credits but don't go over the limit. */
+ /* Request up to 10 credits but don't go over the limit. */
if (server->credits >= server->max_credits)
shdr->CreditRequest = cpu_to_le16(0);
else
shdr->CreditRequest = cpu_to_le16(
min_t(int, server->max_credits -
- server->credits, 2));
+ server->credits, 10));
spin_unlock(&server->req_lock);
} else {
shdr->CreditRequest = cpu_to_le16(2);
@@ -162,24 +162,31 @@ static int __smb2_reconnect(const struct nls_table *nlsc,
int rc;
struct dfs_cache_tgt_list tl;
struct dfs_cache_tgt_iterator *it = NULL;
- char tree[MAX_TREE_SIZE + 1];
+ char *tree;
const char *tcp_host;
size_t tcp_host_len;
const char *dfs_host;
size_t dfs_host_len;
+ tree = kzalloc(MAX_TREE_SIZE, GFP_KERNEL);
+ if (!tree)
+ return -ENOMEM;
+
if (tcon->ipc) {
- snprintf(tree, sizeof(tree), "\\\\%s\\IPC$",
- tcon->ses->server->hostname);
- return SMB2_tcon(0, tcon->ses, tree, tcon, nlsc);
+ scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$",
+ tcon->ses->server->hostname);
+ rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc);
+ goto out;
}
- if (!tcon->dfs_path)
- return SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nlsc);
+ if (!tcon->dfs_path) {
+ rc = SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nlsc);
+ goto out;
+ }
rc = dfs_cache_noreq_find(tcon->dfs_path + 1, NULL, &tl);
if (rc)
- return rc;
+ goto out;
extract_unc_hostname(tcon->ses->server->hostname, &tcp_host,
&tcp_host_len);
@@ -199,7 +206,7 @@ static int __smb2_reconnect(const struct nls_table *nlsc,
continue;
}
- snprintf(tree, sizeof(tree), "\\%s", tgt);
+ scnprintf(tree, MAX_TREE_SIZE, "\\%s", tgt);
rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc);
if (!rc)
@@ -216,6 +223,8 @@ static int __smb2_reconnect(const struct nls_table *nlsc,
rc = -ENOENT;
}
dfs_cache_free_tgts(&tl);
+out:
+ kfree(tree);
return rc;
}
#else
@@ -481,6 +490,23 @@ build_posix_ctxt(struct smb2_posix_neg_context *pneg_ctxt)
{
pneg_ctxt->ContextType = SMB2_POSIX_EXTENSIONS_AVAILABLE;
pneg_ctxt->DataLength = cpu_to_le16(POSIX_CTXT_DATA_LEN);
+ /* SMB2_CREATE_TAG_POSIX is "0x93AD25509CB411E7B42383DE968BCD7C" */
+ pneg_ctxt->Name[0] = 0x93;
+ pneg_ctxt->Name[1] = 0xAD;
+ pneg_ctxt->Name[2] = 0x25;
+ pneg_ctxt->Name[3] = 0x50;
+ pneg_ctxt->Name[4] = 0x9C;
+ pneg_ctxt->Name[5] = 0xB4;
+ pneg_ctxt->Name[6] = 0x11;
+ pneg_ctxt->Name[7] = 0xE7;
+ pneg_ctxt->Name[8] = 0xB4;
+ pneg_ctxt->Name[9] = 0x23;
+ pneg_ctxt->Name[10] = 0x83;
+ pneg_ctxt->Name[11] = 0xDE;
+ pneg_ctxt->Name[12] = 0x96;
+ pneg_ctxt->Name[13] = 0x8B;
+ pneg_ctxt->Name[14] = 0xCD;
+ pneg_ctxt->Name[15] = 0x7C;
}
static void
@@ -977,8 +1003,14 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */,
(char *)pneg_inbuf, inbuflen, (char **)&pneg_rsp, &rsplen);
-
- if (rc != 0) {
+ if (rc == -EOPNOTSUPP) {
+ /*
+ * Old Windows versions or Netapp SMB server can return
+ * not supported error. Client should accept it.
+ */
+ cifs_dbg(VFS, "Server does not support validate negotiate\n");
+ return 0;
+ } else if (rc != 0) {
cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc);
rc = -EIO;
goto out_free_inbuf;
@@ -1605,6 +1637,9 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
rqst.rq_iov = iov;
rqst.rq_nvec = 2;
+ /* Need 64 for max size write so ask for more in case not there yet */
+ req->sync_hdr.CreditRequest = cpu_to_le16(64);
+
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
cifs_small_buf_release(req);
rsp = (struct smb2_tree_connect_rsp *)rsp_iov.iov_base;
@@ -2161,6 +2196,8 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
rqst.rq_iov = iov;
rqst.rq_nvec = n_iov;
+ trace_smb3_posix_mkdir_enter(xid, tcon->tid, ses->Suid, CREATE_NOT_FILE,
+ FILE_WRITE_ATTRIBUTES);
/* resource #4: response buffer */
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
if (rc) {
@@ -2379,6 +2416,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
if (rc)
goto creat_exit;
+ trace_smb3_open_enter(xid, tcon->tid, tcon->ses->Suid,
+ oparms->create_options, oparms->desired_access);
+
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags,
&rsp_iov);
rsp = (struct smb2_create_rsp *)rsp_iov.iov_base;
@@ -2807,6 +2847,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
int resp_buftype = CIFS_NO_BUFFER;
struct cifs_ses *ses = tcon->ses;
int flags = 0;
+ bool allocated = false;
cifs_dbg(FYI, "Query Info\n");
@@ -2827,6 +2868,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
goto qinf_exit;
+ trace_smb3_query_info_enter(xid, persistent_fid, tcon->tid,
+ ses->Suid, info_class, (__u32)info_type);
+
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base;
@@ -2837,6 +2881,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
goto qinf_exit;
}
+ trace_smb3_query_info_done(xid, persistent_fid, tcon->tid,
+ ses->Suid, info_class, (__u32)info_type);
+
if (dlen) {
*dlen = le32_to_cpu(rsp->OutputBufferLength);
if (!*data) {
@@ -2846,14 +2893,21 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
"Error %d allocating memory for acl\n",
rc);
*dlen = 0;
+ rc = -ENOMEM;
goto qinf_exit;
}
+ allocated = true;
}
}
rc = smb2_validate_and_copy_iov(le16_to_cpu(rsp->OutputBufferOffset),
le32_to_cpu(rsp->OutputBufferLength),
&rsp_iov, min_len, *data);
+ if (rc && allocated) {
+ kfree(*data);
+ *data = NULL;
+ *dlen = 0;
+ }
qinf_exit:
SMB2_query_info_free(&rqst);
@@ -2907,13 +2961,16 @@ smb2_echo_callback(struct mid_q_entry *mid)
{
struct TCP_Server_Info *server = mid->callback_data;
struct smb2_echo_rsp *rsp = (struct smb2_echo_rsp *)mid->resp_buf;
- unsigned int credits_received = 1;
+ struct cifs_credits credits = { .value = 0, .instance = 0 };
- if (mid->mid_state == MID_RESPONSE_RECEIVED)
- credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ if (mid->mid_state == MID_RESPONSE_RECEIVED
+ || mid->mid_state == MID_RESPONSE_MALFORMED) {
+ credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ credits.instance = server->reconnect_instance;
+ }
DeleteMidQEntry(mid);
- add_credits(server, credits_received, CIFS_ECHO_OP);
+ add_credits(server, &credits, CIFS_ECHO_OP);
}
void smb2_reconnect_server(struct work_struct *work)
@@ -3005,7 +3062,7 @@ SMB2_echo(struct TCP_Server_Info *server)
iov[0].iov_base = (char *)req;
rc = cifs_call_async(server, &rqst, NULL, smb2_echo_callback, NULL,
- server, CIFS_ECHO_OP);
+ server, CIFS_ECHO_OP, NULL);
if (rc)
cifs_dbg(FYI, "Echo request failed: %d\n", rc);
@@ -3096,6 +3153,11 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
req->MinimumCount = 0;
req->Length = cpu_to_le32(io_parms->length);
req->Offset = cpu_to_le64(io_parms->offset);
+
+ trace_smb3_read_enter(0 /* xid */,
+ io_parms->persistent_fid,
+ io_parms->tcon->tid, io_parms->tcon->ses->Suid,
+ io_parms->offset, io_parms->length);
#ifdef CONFIG_CIFS_SMB_DIRECT
/*
* If we want to do a RDMA write, fill in and append
@@ -3166,7 +3228,7 @@ smb2_readv_callback(struct mid_q_entry *mid)
struct TCP_Server_Info *server = tcon->ses->server;
struct smb2_sync_hdr *shdr =
(struct smb2_sync_hdr *)rdata->iov[0].iov_base;
- unsigned int credits_received = 1;
+ struct cifs_credits credits = { .value = 0, .instance = 0 };
struct smb_rqst rqst = { .rq_iov = rdata->iov,
.rq_nvec = 2,
.rq_pages = rdata->pages,
@@ -3181,7 +3243,8 @@ smb2_readv_callback(struct mid_q_entry *mid)
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
- credits_received = le16_to_cpu(shdr->CreditRequest);
+ credits.value = le16_to_cpu(shdr->CreditRequest);
+ credits.instance = server->reconnect_instance;
/* result already set, check signature */
if (server->sign && !mid->decrypted) {
int rc;
@@ -3205,9 +3268,12 @@ smb2_readv_callback(struct mid_q_entry *mid)
task_io_account_read(rdata->got_bytes);
cifs_stats_bytes_read(tcon, rdata->got_bytes);
break;
+ case MID_RESPONSE_MALFORMED:
+ credits.value = le16_to_cpu(shdr->CreditRequest);
+ credits.instance = server->reconnect_instance;
+ /* fall through */
default:
- if (rdata->result != -ENODATA)
- rdata->result = -EIO;
+ rdata->result = -EIO;
}
#ifdef CONFIG_CIFS_SMB_DIRECT
/*
@@ -3220,12 +3286,21 @@ smb2_readv_callback(struct mid_q_entry *mid)
rdata->mr = NULL;
}
#endif
- if (rdata->result)
+ if (rdata->result && rdata->result != -ENODATA) {
cifs_stats_fail_inc(tcon, SMB2_READ_HE);
+ trace_smb3_read_err(0 /* xid */,
+ rdata->cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, rdata->offset,
+ rdata->bytes, rdata->result);
+ } else
+ trace_smb3_read_done(0 /* xid */,
+ rdata->cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid,
+ rdata->offset, rdata->got_bytes);
queue_work(cifsiod_wq, &rdata->work);
DeleteMidQEntry(mid);
- add_credits(server, credits_received, 0);
+ add_credits(server, &credits, 0);
}
/* smb2_async_readv - send an async read, and set up mid to handle result */
@@ -3255,17 +3330,8 @@ smb2_async_readv(struct cifs_readdata *rdata)
rc = smb2_new_read_req(
(void **) &buf, &total_len, &io_parms, rdata, 0, 0);
- if (rc) {
- if (rc == -EAGAIN && rdata->credits) {
- /* credits was reset by reconnect */
- rdata->credits = 0;
- /* reduce in_flight value since we won't send the req */
- spin_lock(&server->req_lock);
- server->in_flight--;
- spin_unlock(&server->req_lock);
- }
+ if (rc)
return rc;
- }
if (smb3_encryption_required(io_parms.tcon))
flags |= CIFS_TRANSFORM_REQ;
@@ -3275,33 +3341,34 @@ smb2_async_readv(struct cifs_readdata *rdata)
shdr = (struct smb2_sync_hdr *)buf;
- if (rdata->credits) {
+ if (rdata->credits.value > 0) {
shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes,
SMB2_MAX_BUFFER_SIZE));
- shdr->CreditRequest = shdr->CreditCharge;
- spin_lock(&server->req_lock);
- server->credits += rdata->credits -
- le16_to_cpu(shdr->CreditCharge);
- spin_unlock(&server->req_lock);
- wake_up(&server->request_q);
+ shdr->CreditRequest =
+ cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1);
+
+ rc = adjust_credits(server, &rdata->credits, rdata->bytes);
+ if (rc)
+ goto async_readv_out;
+
flags |= CIFS_HAS_CREDITS;
}
kref_get(&rdata->refcount);
rc = cifs_call_async(io_parms.tcon->ses->server, &rqst,
cifs_readv_receive, smb2_readv_callback,
- smb3_handle_read_data, rdata, flags);
+ smb3_handle_read_data, rdata, flags,
+ &rdata->credits);
if (rc) {
kref_put(&rdata->refcount, cifs_readdata_release);
cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE);
- trace_smb3_read_err(rc, 0 /* xid */, io_parms.persistent_fid,
- io_parms.tcon->tid, io_parms.tcon->ses->Suid,
- io_parms.offset, io_parms.length);
- } else
- trace_smb3_read_done(0 /* xid */, io_parms.persistent_fid,
- io_parms.tcon->tid, io_parms.tcon->ses->Suid,
- io_parms.offset, io_parms.length);
+ trace_smb3_read_err(0 /* xid */, io_parms.persistent_fid,
+ io_parms.tcon->tid,
+ io_parms.tcon->ses->Suid,
+ io_parms.offset, io_parms.length, rc);
+ }
+async_readv_out:
cifs_small_buf_release(buf);
return rc;
}
@@ -3344,10 +3411,14 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
if (rc != -ENODATA) {
cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE);
cifs_dbg(VFS, "Send error in read = %d\n", rc);
- }
- trace_smb3_read_err(rc, xid, req->PersistentFileId,
+ trace_smb3_read_err(xid, req->PersistentFileId,
+ io_parms->tcon->tid, ses->Suid,
+ io_parms->offset, io_parms->length,
+ rc);
+ } else
+ trace_smb3_read_done(xid, req->PersistentFileId,
io_parms->tcon->tid, ses->Suid,
- io_parms->offset, io_parms->length);
+ io_parms->offset, 0);
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
return rc == -ENODATA ? 0 : rc;
} else
@@ -3386,14 +3457,16 @@ smb2_writev_callback(struct mid_q_entry *mid)
{
struct cifs_writedata *wdata = mid->callback_data;
struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+ struct TCP_Server_Info *server = tcon->ses->server;
unsigned int written;
struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf;
- unsigned int credits_received = 1;
+ struct cifs_credits credits = { .value = 0, .instance = 0 };
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
- credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest);
- wdata->result = smb2_check_receive(mid, tcon->ses->server, 0);
+ credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ credits.instance = server->reconnect_instance;
+ wdata->result = smb2_check_receive(mid, server, 0);
if (wdata->result != 0)
break;
@@ -3416,6 +3489,10 @@ smb2_writev_callback(struct mid_q_entry *mid)
case MID_RETRY_NEEDED:
wdata->result = -EAGAIN;
break;
+ case MID_RESPONSE_MALFORMED:
+ credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ credits.instance = server->reconnect_instance;
+ /* fall through */
default:
wdata->result = -EIO;
break;
@@ -3433,12 +3510,21 @@ smb2_writev_callback(struct mid_q_entry *mid)
wdata->mr = NULL;
}
#endif
- if (wdata->result)
+ if (wdata->result) {
cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
+ trace_smb3_write_err(0 /* no xid */,
+ wdata->cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, wdata->offset,
+ wdata->bytes, wdata->result);
+ } else
+ trace_smb3_write_done(0 /* no xid */,
+ wdata->cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid,
+ wdata->offset, wdata->bytes);
queue_work(cifsiod_wq, &wdata->work);
DeleteMidQEntry(mid);
- add_credits(tcon->ses->server, credits_received, 0);
+ add_credits(server, &credits, 0);
}
/* smb2_async_writev - send an async write, and set up mid to handle result */
@@ -3456,17 +3542,8 @@ smb2_async_writev(struct cifs_writedata *wdata,
unsigned int total_len;
rc = smb2_plain_req_init(SMB2_WRITE, tcon, (void **) &req, &total_len);
- if (rc) {
- if (rc == -EAGAIN && wdata->credits) {
- /* credits was reset by reconnect */
- wdata->credits = 0;
- /* reduce in_flight value since we won't send the req */
- spin_lock(&server->req_lock);
- server->in_flight--;
- spin_unlock(&server->req_lock);
- }
- goto async_writev_out;
- }
+ if (rc)
+ return rc;
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
@@ -3483,6 +3560,9 @@ smb2_async_writev(struct cifs_writedata *wdata,
req->DataOffset = cpu_to_le16(
offsetof(struct smb2_write_req, Buffer));
req->RemainingBytes = 0;
+
+ trace_smb3_write_enter(0 /* xid */, wdata->cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid, wdata->offset, wdata->bytes);
#ifdef CONFIG_CIFS_SMB_DIRECT
/*
* If we want to do a server RDMA read, fill in and append
@@ -3552,21 +3632,22 @@ smb2_async_writev(struct cifs_writedata *wdata,
req->Length = cpu_to_le32(wdata->bytes);
#endif
- if (wdata->credits) {
+ if (wdata->credits.value > 0) {
shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes,
SMB2_MAX_BUFFER_SIZE));
- shdr->CreditRequest = shdr->CreditCharge;
- spin_lock(&server->req_lock);
- server->credits += wdata->credits -
- le16_to_cpu(shdr->CreditCharge);
- spin_unlock(&server->req_lock);
- wake_up(&server->request_q);
+ shdr->CreditRequest =
+ cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1);
+
+ rc = adjust_credits(server, &wdata->credits, wdata->bytes);
+ if (rc)
+ goto async_writev_out;
+
flags |= CIFS_HAS_CREDITS;
}
kref_get(&wdata->refcount);
rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, NULL,
- wdata, flags);
+ wdata, flags, &wdata->credits);
if (rc) {
trace_smb3_write_err(0 /* no xid */, req->PersistentFileId,
@@ -3574,10 +3655,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
wdata->bytes, rc);
kref_put(&wdata->refcount, release);
cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
- } else
- trace_smb3_write_done(0 /* no xid */, req->PersistentFileId,
- tcon->tid, tcon->ses->Suid, wdata->offset,
- wdata->bytes);
+ }
async_writev_out:
cifs_small_buf_release(req);
@@ -3632,6 +3710,10 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
offsetof(struct smb2_write_req, Buffer));
req->RemainingBytes = 0;
+ trace_smb3_write_enter(xid, io_parms->persistent_fid,
+ io_parms->tcon->tid, io_parms->tcon->ses->Suid,
+ io_parms->offset, io_parms->length);
+
iov[0].iov_base = (char *)req;
/* 1 for Buffer */
iov[0].iov_len = total_len - 1;
@@ -3794,6 +3876,9 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
rqst.rq_iov = iov;
rqst.rq_nvec = 2;
+ trace_smb3_query_dir_enter(xid, persistent_fid, tcon->tid,
+ tcon->ses->Suid, index, output_size);
+
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
cifs_small_buf_release(req);
rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base;
@@ -3801,18 +3886,26 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
if (rc) {
if (rc == -ENODATA &&
rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) {
+ trace_smb3_query_dir_done(xid, persistent_fid,
+ tcon->tid, tcon->ses->Suid, index, 0);
srch_inf->endOfSearch = true;
rc = 0;
+ } else {
+ trace_smb3_query_dir_err(xid, persistent_fid, tcon->tid,
+ tcon->ses->Suid, index, 0, rc);
+ cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE);
}
- cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE);
goto qdir_exit;
}
rc = smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset),
le32_to_cpu(rsp->OutputBufferLength), &rsp_iov,
info_buf_size);
- if (rc)
+ if (rc) {
+ trace_smb3_query_dir_err(xid, persistent_fid, tcon->tid,
+ tcon->ses->Suid, index, 0, rc);
goto qdir_exit;
+ }
srch_inf->unicode = true;
@@ -3840,6 +3933,8 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
else
cifs_dbg(VFS, "illegal search buffer type\n");
+ trace_smb3_query_dir_done(xid, persistent_fid, tcon->tid,
+ tcon->ses->Suid, index, srch_inf->entries_in_buffer);
return rc;
qdir_exit:
@@ -4399,8 +4494,8 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
rc = cifs_send_recv(xid, ses, &rqst, &resp_buf_type, flags, &rsp_iov);
cifs_small_buf_release(req);
- please_key_low = (__u64 *)req->LeaseKey;
- please_key_high = (__u64 *)(req->LeaseKey+8);
+ please_key_low = (__u64 *)lease_key;
+ please_key_high = (__u64 *)(lease_key+8);
if (rc) {
cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE);
trace_smb3_lease_err(le32_to_cpu(lease_state), tcon->tid,
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 7a2d0a2255e6..0bd4d4802701 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -84,8 +84,9 @@
#define NUMBER_OF_SMB2_COMMANDS 0x0013
-/* 4 len + 52 transform hdr + 64 hdr + 56 create rsp */
-#define MAX_SMB2_HDR_SIZE 0x00b0
+/* 52 transform hdr + 64 hdr + 88 create rsp */
+#define SMB2_TRANSFORM_HEADER_SIZE 52
+#define MAX_SMB2_HDR_SIZE 204
#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe)
#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd)
@@ -287,12 +288,12 @@ struct smb2_encryption_neg_context {
__le16 Ciphers[1]; /* Ciphers[0] since only one used now */
} __packed;
-#define POSIX_CTXT_DATA_LEN 8
+#define POSIX_CTXT_DATA_LEN 16
struct smb2_posix_neg_context {
__le16 ContextType; /* 0x100 */
__le16 DataLength;
__le32 Reserved;
- __le64 Reserved1; /* In case needed for future (eg version or caps) */
+ __u8 Name[16]; /* POSIX ctxt GUID 93AD25509CB411E7B42383DE968BCD7C */
} __packed;
struct smb2_negotiate_rsp {
@@ -648,6 +649,13 @@ struct smb2_create_req {
__u8 Buffer[0];
} __packed;
+/*
+ * Maximum size of a SMB2_CREATE response is 64 (smb2 header) +
+ * 88 (fixed part of create response) + 520 (path) + 150 (contexts) +
+ * 2 bytes of padding.
+ */
+#define MAX_SMB2_CREATE_RESPONSE_SIZE 824
+
struct smb2_create_rsp {
struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 89 */
@@ -996,6 +1004,11 @@ struct smb2_close_req {
__u64 VolatileFileId; /* opaque endianness */
} __packed;
+/*
+ * Maximum size of a SMB2_CLOSE response is 64 (smb2 header) + 60 (data)
+ */
+#define MAX_SMB2_CLOSE_RESPONSE_SIZE 124
+
struct smb2_close_rsp {
struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* 60 */
@@ -1398,8 +1411,6 @@ struct smb2_file_link_info { /* encoding of request for level 11 */
char FileName[0]; /* Name to be assigned to new link */
} __packed; /* level 11 Set */
-#define SMB2_MAX_EA_BUF 65536
-
struct smb2_file_full_ea_info { /* encoding of response for level 15 */
__le32 next_entry_offset;
__u8 flags;
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 7b351c65ee46..d1181572758b 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -576,6 +576,7 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
struct TCP_Server_Info *server)
{
struct mid_q_entry *temp;
+ unsigned int credits = le16_to_cpu(shdr->CreditCharge);
if (server == NULL) {
cifs_dbg(VFS, "Null TCP session in smb2_mid_entry_alloc\n");
@@ -586,6 +587,7 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
memset(temp, 0, sizeof(struct mid_q_entry));
kref_init(&temp->refcount);
temp->mid = le64_to_cpu(shdr->MessageId);
+ temp->credits = credits > 0 ? credits : 1;
temp->pid = current->pid;
temp->command = shdr->Command; /* Always LE */
temp->when_alloc = jiffies;
@@ -600,6 +602,8 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
atomic_inc(&midCount);
temp->mid_state = MID_REQUEST_ALLOCATED;
+ trace_smb3_cmd_enter(shdr->TreeId, shdr->SessionId,
+ le16_to_cpu(shdr->Command), temp->mid);
return temp;
}
@@ -615,6 +619,10 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_sync_hdr *shdr,
return -EAGAIN;
}
+ if (ses->server->tcpStatus == CifsNeedNegotiate &&
+ shdr->Command != SMB2_NEGOTIATE)
+ return -EAGAIN;
+
if (ses->status == CifsNew) {
if ((shdr->Command != SMB2_SESSION_SETUP) &&
(shdr->Command != SMB2_NEGOTIATE))
@@ -634,6 +642,7 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_sync_hdr *shdr,
spin_lock(&GlobalMid_Lock);
list_add_tail(&(*mid)->qhead, &ses->server->pending_mid_q);
spin_unlock(&GlobalMid_Lock);
+
return 0;
}
@@ -674,13 +683,18 @@ smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst)
smb2_seq_num_into_buf(ses->server, shdr);
rc = smb2_get_mid_entry(ses, shdr, &mid);
- if (rc)
+ if (rc) {
+ revert_current_mid_from_hdr(ses->server, shdr);
return ERR_PTR(rc);
+ }
+
rc = smb2_sign_rqst(rqst, ses->server);
if (rc) {
+ revert_current_mid_from_hdr(ses->server, shdr);
cifs_delete_mid(mid);
return ERR_PTR(rc);
}
+
return mid;
}
@@ -692,14 +706,21 @@ smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
(struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
struct mid_q_entry *mid;
+ if (server->tcpStatus == CifsNeedNegotiate &&
+ shdr->Command != SMB2_NEGOTIATE)
+ return ERR_PTR(-EAGAIN);
+
smb2_seq_num_into_buf(server, shdr);
mid = smb2_mid_entry_alloc(shdr, server);
- if (mid == NULL)
+ if (mid == NULL) {
+ revert_current_mid_from_hdr(server, shdr);
return ERR_PTR(-ENOMEM);
+ }
rc = smb2_sign_rqst(rqst, server);
if (rc) {
+ revert_current_mid_from_hdr(server, shdr);
DeleteMidQEntry(mid);
return ERR_PTR(rc);
}
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index a568dac7b3a1..b943b74cd246 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -1550,7 +1550,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info)
char name[MAX_NAME_LEN];
int rc;
- snprintf(name, MAX_NAME_LEN, "smbd_request_%p", info);
+ scnprintf(name, MAX_NAME_LEN, "smbd_request_%p", info);
info->request_cache =
kmem_cache_create(
name,
@@ -1566,7 +1566,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info)
if (!info->request_mempool)
goto out1;
- snprintf(name, MAX_NAME_LEN, "smbd_response_%p", info);
+ scnprintf(name, MAX_NAME_LEN, "smbd_response_%p", info);
info->response_cache =
kmem_cache_create(
name,
@@ -1582,7 +1582,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info)
if (!info->response_mempool)
goto out3;
- snprintf(name, MAX_NAME_LEN, "smbd_%p", info);
+ scnprintf(name, MAX_NAME_LEN, "smbd_%p", info);
info->workqueue = create_workqueue(name);
if (!info->workqueue)
goto out4;
diff --git a/fs/cifs/trace.c b/fs/cifs/trace.c
index bd4a546feec1..465483787193 100644
--- a/fs/cifs/trace.c
+++ b/fs/cifs/trace.c
@@ -3,16 +3,6 @@
* Copyright (C) 2018, Microsoft Corporation.
*
* Author(s): Steve French <stfrench@microsoft.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU General Public License for more details.
*/
#define CREATE_TRACE_POINTS
#include "trace.h"
diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h
index fb049809555f..d8b049afa606 100644
--- a/fs/cifs/trace.h
+++ b/fs/cifs/trace.h
@@ -3,16 +3,6 @@
* Copyright (C) 2018, Microsoft Corporation.
*
* Author(s): Steve French <stfrench@microsoft.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU General Public License for more details.
*/
#undef TRACE_SYSTEM
#define TRACE_SYSTEM cifs
@@ -68,6 +58,7 @@ DEFINE_EVENT(smb3_rw_err_class, smb3_##name, \
DEFINE_SMB3_RW_ERR_EVENT(write_err);
DEFINE_SMB3_RW_ERR_EVENT(read_err);
+DEFINE_SMB3_RW_ERR_EVENT(query_dir_err);
/* For logging successful read or write */
@@ -110,8 +101,12 @@ DEFINE_EVENT(smb3_rw_done_class, smb3_##name, \
__u32 len), \
TP_ARGS(xid, fid, tid, sesid, offset, len))
+DEFINE_SMB3_RW_DONE_EVENT(write_enter);
+DEFINE_SMB3_RW_DONE_EVENT(read_enter);
+DEFINE_SMB3_RW_DONE_EVENT(query_dir_enter);
DEFINE_SMB3_RW_DONE_EVENT(write_done);
DEFINE_SMB3_RW_DONE_EVENT(read_done);
+DEFINE_SMB3_RW_DONE_EVENT(query_dir_done);
/*
* For handle based calls other than read and write, and get/set info
@@ -158,6 +153,48 @@ DEFINE_SMB3_FD_ERR_EVENT(close_err);
/*
* For handle based query/set info calls
*/
+DECLARE_EVENT_CLASS(smb3_inf_enter_class,
+ TP_PROTO(unsigned int xid,
+ __u64 fid,
+ __u32 tid,
+ __u64 sesid,
+ __u8 infclass,
+ __u32 type),
+ TP_ARGS(xid, fid, tid, sesid, infclass, type),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u64, fid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u8, infclass)
+ __field(__u32, type)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->fid = fid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->infclass = infclass;
+ __entry->type = type;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx class=%u type=0x%x",
+ __entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+ __entry->infclass, __entry->type)
+)
+
+#define DEFINE_SMB3_INF_ENTER_EVENT(name) \
+DEFINE_EVENT(smb3_inf_enter_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u64 fid, \
+ __u32 tid, \
+ __u64 sesid, \
+ __u8 infclass, \
+ __u32 type), \
+ TP_ARGS(xid, fid, tid, sesid, infclass, type))
+
+DEFINE_SMB3_INF_ENTER_EVENT(query_info_enter);
+DEFINE_SMB3_INF_ENTER_EVENT(query_info_done);
+
DECLARE_EVENT_CLASS(smb3_inf_err_class,
TP_PROTO(unsigned int xid,
__u64 fid,
@@ -280,6 +317,7 @@ DEFINE_EVENT(smb3_cmd_done_class, smb3_##name, \
__u64 mid), \
TP_ARGS(tid, sesid, cmd, mid))
+DEFINE_SMB3_CMD_DONE_EVENT(cmd_enter);
DEFINE_SMB3_CMD_DONE_EVENT(cmd_done);
DEFINE_SMB3_CMD_DONE_EVENT(ses_expired);
@@ -416,8 +454,47 @@ DEFINE_SMB3_TCON_EVENT(tcon);
/*
- * For smb2/smb3 open call
+ * For smb2/smb3 open (including create and mkdir) calls
*/
+
+DECLARE_EVENT_CLASS(smb3_open_enter_class,
+ TP_PROTO(unsigned int xid,
+ __u32 tid,
+ __u64 sesid,
+ int create_options,
+ int desired_access),
+ TP_ARGS(xid, tid, sesid, create_options, desired_access),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(int, create_options)
+ __field(int, desired_access)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->create_options = create_options;
+ __entry->desired_access = desired_access;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x cr_opts=0x%x des_access=0x%x",
+ __entry->xid, __entry->sesid, __entry->tid,
+ __entry->create_options, __entry->desired_access)
+)
+
+#define DEFINE_SMB3_OPEN_ENTER_EVENT(name) \
+DEFINE_EVENT(smb3_open_enter_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u32 tid, \
+ __u64 sesid, \
+ int create_options, \
+ int desired_access), \
+ TP_ARGS(xid, tid, sesid, create_options, desired_access))
+
+DEFINE_SMB3_OPEN_ENTER_EVENT(open_enter);
+DEFINE_SMB3_OPEN_ENTER_EVENT(posix_mkdir_enter);
+
DECLARE_EVENT_CLASS(smb3_open_err_class,
TP_PROTO(unsigned int xid,
__u32 tid,
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 5be7302853b6..7ce8a585abd6 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -33,6 +33,7 @@
#include <linux/uaccess.h>
#include <asm/processor.h>
#include <linux/mempool.h>
+#include <linux/signal.h>
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
@@ -291,6 +292,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
int n_vec;
unsigned int send_length = 0;
unsigned int i, j;
+ sigset_t mask, oldmask;
size_t total_len = 0, sent, size;
struct socket *ssocket = server->ssocket;
struct msghdr smb_msg;
@@ -301,8 +303,14 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
rc = smbd_send(server, rqst);
goto smbd_done;
}
+
if (ssocket == NULL)
- return -ENOTSOCK;
+ return -EAGAIN;
+
+ if (signal_pending(current)) {
+ cifs_dbg(FYI, "signal is pending before sending any data\n");
+ return -EINTR;
+ }
/* cork the socket */
kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK,
@@ -312,6 +320,16 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
send_length += smb_rqst_len(server, &rqst[j]);
rfc1002_marker = cpu_to_be32(send_length);
+ /*
+ * We should not allow signals to interrupt the network send because
+ * any partial send will cause session reconnects thus increasing
+ * latency of system calls and overload a server with unnecessary
+ * requests.
+ */
+
+ sigfillset(&mask);
+ sigprocmask(SIG_BLOCK, &mask, &oldmask);
+
/* Generate a rfc1002 marker for SMB2+ */
if (server->vals->header_preamble_size == 0) {
struct kvec hiov = {
@@ -321,7 +339,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
iov_iter_kvec(&smb_msg.msg_iter, WRITE, &hiov, 1, 4);
rc = smb_send_kvec(server, &smb_msg, &sent);
if (rc < 0)
- goto uncork;
+ goto unmask;
total_len += sent;
send_length += 4;
@@ -343,7 +361,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
rc = smb_send_kvec(server, &smb_msg, &sent);
if (rc < 0)
- goto uncork;
+ goto unmask;
total_len += sent;
@@ -365,7 +383,25 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
}
}
-uncork:
+unmask:
+ sigprocmask(SIG_SETMASK, &oldmask, NULL);
+
+ /*
+ * If signal is pending but we have already sent the whole packet to
+ * the server we need to return success status to allow a corresponding
+ * mid entry to be kept in the pending requests queue thus allowing
+ * to handle responses from the server by the client.
+ *
+ * If only part of the packet has been sent there is no need to hide
+ * interrupt because the session will be reconnected anyway, so there
+ * won't be any response from the server to handle.
+ */
+
+ if (signal_pending(current) && (total_len != send_length)) {
+ cifs_dbg(FYI, "signal is pending after attempt to send\n");
+ rc = -EINTR;
+ }
+
/* uncork it */
val = 0;
kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK,
@@ -387,7 +423,7 @@ smbd_done:
if (rc < 0 && rc != -EINTR)
cifs_dbg(VFS, "Error %d sending data on socket to server\n",
rc);
- else
+ else if (rc > 0)
rc = 0;
return rc;
@@ -451,15 +487,18 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
static int
wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
- int *credits)
+ int *credits, unsigned int *instance)
{
int rc;
+ *instance = 0;
+
spin_lock(&server->req_lock);
if (timeout == CIFS_ASYNC_OP) {
/* oplock breaks must not be held up */
server->in_flight++;
*credits -= 1;
+ *instance = server->reconnect_instance;
spin_unlock(&server->req_lock);
return 0;
}
@@ -489,6 +528,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
if (timeout != CIFS_BLOCKING_OP) {
*credits -= 1;
server->in_flight++;
+ *instance = server->reconnect_instance;
}
spin_unlock(&server->req_lock);
break;
@@ -499,7 +539,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout,
static int
wait_for_free_request(struct TCP_Server_Info *server, const int timeout,
- const int optype)
+ const int optype, unsigned int *instance)
{
int *val;
@@ -507,15 +547,16 @@ wait_for_free_request(struct TCP_Server_Info *server, const int timeout,
/* Since an echo is already inflight, no need to wait to send another */
if (*val <= 0 && optype == CIFS_ECHO_OP)
return -EAGAIN;
- return wait_for_free_credits(server, timeout, val);
+ return wait_for_free_credits(server, timeout, val, instance);
}
int
cifs_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
- unsigned int *num, unsigned int *credits)
+ unsigned int *num, struct cifs_credits *credits)
{
*num = size;
- *credits = 0;
+ credits->value = 0;
+ credits->instance = server->reconnect_instance;
return 0;
}
@@ -602,27 +643,43 @@ cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
int
cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
mid_receive_t *receive, mid_callback_t *callback,
- mid_handle_t *handle, void *cbdata, const int flags)
+ mid_handle_t *handle, void *cbdata, const int flags,
+ const struct cifs_credits *exist_credits)
{
int rc, timeout, optype;
struct mid_q_entry *mid;
- unsigned int credits = 0;
+ struct cifs_credits credits = { .value = 0, .instance = 0 };
+ unsigned int instance;
timeout = flags & CIFS_TIMEOUT_MASK;
optype = flags & CIFS_OP_MASK;
if ((flags & CIFS_HAS_CREDITS) == 0) {
- rc = wait_for_free_request(server, timeout, optype);
+ rc = wait_for_free_request(server, timeout, optype, &instance);
if (rc)
return rc;
- credits = 1;
- }
+ credits.value = 1;
+ credits.instance = instance;
+ } else
+ instance = exist_credits->instance;
mutex_lock(&server->srv_mutex);
+
+ /*
+ * We can't use credits obtained from the previous session to send this
+ * request. Check if there were reconnects after we obtained credits and
+ * return -EAGAIN in such cases to let callers handle it.
+ */
+ if (instance != server->reconnect_instance) {
+ mutex_unlock(&server->srv_mutex);
+ add_credits_and_wake_if(server, &credits, optype);
+ return -EAGAIN;
+ }
+
mid = server->ops->setup_async_request(server, rqst);
if (IS_ERR(mid)) {
mutex_unlock(&server->srv_mutex);
- add_credits_and_wake_if(server, credits, optype);
+ add_credits_and_wake_if(server, &credits, optype);
return PTR_ERR(mid);
}
@@ -647,6 +704,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
cifs_in_send_dec(server);
if (rc < 0) {
+ revert_current_mid(server, mid->credits);
server->sequence_number -= 2;
cifs_delete_mid(mid);
}
@@ -656,7 +714,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
if (rc == 0)
return 0;
- add_credits_and_wake_if(server, credits, optype);
+ add_credits_and_wake_if(server, &credits, optype);
return rc;
}
@@ -783,8 +841,29 @@ cifs_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst)
}
static void
-cifs_noop_callback(struct mid_q_entry *mid)
+cifs_compound_callback(struct mid_q_entry *mid)
+{
+ struct TCP_Server_Info *server = mid->server;
+ struct cifs_credits credits;
+
+ credits.value = server->ops->get_credits(mid);
+ credits.instance = server->reconnect_instance;
+
+ add_credits(server, &credits, mid->optype);
+}
+
+static void
+cifs_compound_last_callback(struct mid_q_entry *mid)
{
+ cifs_compound_callback(mid);
+ cifs_wake_up_task(mid);
+}
+
+static void
+cifs_cancelled_callback(struct mid_q_entry *mid)
+{
+ cifs_compound_callback(mid);
+ DeleteMidQEntry(mid);
}
int
@@ -795,7 +874,12 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
int i, j, rc = 0;
int timeout, optype;
struct mid_q_entry *midQ[MAX_COMPOUND];
- unsigned int credits = 0;
+ bool cancelled_mid[MAX_COMPOUND] = {false};
+ struct cifs_credits credits[MAX_COMPOUND] = {
+ { .value = 0, .instance = 0 }
+ };
+ unsigned int instance;
+ unsigned int first_instance = 0;
char *buf;
timeout = flags & CIFS_TIMEOUT_MASK;
@@ -812,15 +896,81 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
if (ses->server->tcpStatus == CifsExiting)
return -ENOENT;
+ spin_lock(&ses->server->req_lock);
+ if (ses->server->credits < num_rqst) {
+ /*
+ * Return immediately if not too many requests in flight since
+ * we will likely be stuck on waiting for credits.
+ */
+ if (ses->server->in_flight < num_rqst - ses->server->credits) {
+ spin_unlock(&ses->server->req_lock);
+ return -ENOTSUPP;
+ }
+ } else {
+ /* enough credits to send the whole compounded request */
+ ses->server->credits -= num_rqst;
+ ses->server->in_flight += num_rqst;
+ first_instance = ses->server->reconnect_instance;
+ }
+ spin_unlock(&ses->server->req_lock);
+
+ if (first_instance) {
+ cifs_dbg(FYI, "Acquired %d credits at once\n", num_rqst);
+ for (i = 0; i < num_rqst; i++) {
+ credits[i].value = 1;
+ credits[i].instance = first_instance;
+ }
+ goto setup_rqsts;
+ }
+
/*
- * Ensure that we do not send more than 50 overlapping requests
- * to the same server. We may make this configurable later or
- * use ses->maxReq.
+ * There are not enough credits to send the whole compound request but
+ * there are requests in flight that may bring credits from the server.
+ * This approach still leaves the possibility to be stuck waiting for
+ * credits if the server doesn't grant credits to the outstanding
+ * requests. This should be fixed by returning immediately and letting
+ * a caller fallback to sequential commands instead of compounding.
+ * Ensure we obtain 1 credit per request in the compound chain.
*/
- rc = wait_for_free_request(ses->server, timeout, optype);
- if (rc)
- return rc;
+ for (i = 0; i < num_rqst; i++) {
+ rc = wait_for_free_request(ses->server, timeout, optype,
+ &instance);
+
+ if (rc == 0) {
+ credits[i].value = 1;
+ credits[i].instance = instance;
+ /*
+ * All parts of the compound chain must get credits from
+ * the same session, otherwise we may end up using more
+ * credits than the server granted. If there were
+ * reconnects in between, return -EAGAIN and let callers
+ * handle it.
+ */
+ if (i == 0)
+ first_instance = instance;
+ else if (first_instance != instance) {
+ i++;
+ rc = -EAGAIN;
+ }
+ }
+ if (rc) {
+ /*
+ * We haven't sent an SMB packet to the server yet but
+ * we already obtained credits for i requests in the
+ * compound chain - need to return those credits back
+ * for future use. Note that we need to call add_credits
+ * multiple times to match the way we obtained credits
+ * in the first place and to account for in flight
+ * requests correctly.
+ */
+ for (j = 0; j < i; j++)
+ add_credits(ses->server, &credits[j], optype);
+ return rc;
+ }
+ }
+
+setup_rqsts:
/*
* Make sure that we sign in the same order that we send on this socket
* and avoid races inside tcp sendmsg code that could cause corruption
@@ -829,24 +979,47 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
mutex_lock(&ses->server->srv_mutex);
+ /*
+ * All the parts of the compound chain belong obtained credits from the
+ * same session (see the appropriate checks above). In the same time
+ * there might be reconnects after those checks but before we acquired
+ * the srv_mutex. We can not use credits obtained from the previous
+ * session to send this request. Check if there were reconnects after
+ * we obtained credits and return -EAGAIN in such cases to let callers
+ * handle it.
+ */
+ if (first_instance != ses->server->reconnect_instance) {
+ mutex_unlock(&ses->server->srv_mutex);
+ for (j = 0; j < num_rqst; j++)
+ add_credits(ses->server, &credits[j], optype);
+ return -EAGAIN;
+ }
+
for (i = 0; i < num_rqst; i++) {
midQ[i] = ses->server->ops->setup_request(ses, &rqst[i]);
if (IS_ERR(midQ[i])) {
+ revert_current_mid(ses->server, i);
for (j = 0; j < i; j++)
cifs_delete_mid(midQ[j]);
mutex_unlock(&ses->server->srv_mutex);
+
/* Update # of requests on wire to server */
- add_credits(ses->server, 1, optype);
+ for (j = 0; j < num_rqst; j++)
+ add_credits(ses->server, &credits[j], optype);
return PTR_ERR(midQ[i]);
}
midQ[i]->mid_state = MID_REQUEST_SUBMITTED;
+ midQ[i]->optype = optype;
/*
- * We don't invoke the callback compounds unless it is the last
- * request.
+ * Invoke callback for every part of the compound chain
+ * to calculate credits properly. Wake up this thread only when
+ * the last element is received.
*/
if (i < num_rqst - 1)
- midQ[i]->callback = cifs_noop_callback;
+ midQ[i]->callback = cifs_compound_callback;
+ else
+ midQ[i]->callback = cifs_compound_last_callback;
}
cifs_in_send_inc(ses->server);
rc = smb_send_rqst(ses->server, num_rqst, rqst, flags);
@@ -855,13 +1028,27 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
for (i = 0; i < num_rqst; i++)
cifs_save_when_sent(midQ[i]);
- if (rc < 0)
+ if (rc < 0) {
+ revert_current_mid(ses->server, num_rqst);
ses->server->sequence_number -= 2;
+ }
mutex_unlock(&ses->server->srv_mutex);
- if (rc < 0)
+ if (rc < 0) {
+ /* Sending failed for some reason - return credits back */
+ for (i = 0; i < num_rqst; i++)
+ add_credits(ses->server, &credits[i], optype);
goto out;
+ }
+
+ /*
+ * At this point the request is passed to the network stack - we assume
+ * that any credits taken from the server structure on the client have
+ * been spent and we can't return them back. Once we receive responses
+ * we will collect credits granted by the server in the mid callbacks
+ * and add those credits to the server structure.
+ */
/*
* Compounding is never used during session establish.
@@ -875,36 +1062,34 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
for (i = 0; i < num_rqst; i++) {
rc = wait_for_response(ses->server, midQ[i]);
- if (rc != 0) {
+ if (rc != 0)
+ break;
+ }
+ if (rc != 0) {
+ for (; i < num_rqst; i++) {
cifs_dbg(VFS, "Cancelling wait for mid %llu cmd: %d\n",
midQ[i]->mid, le16_to_cpu(midQ[i]->command));
send_cancel(ses->server, &rqst[i], midQ[i]);
spin_lock(&GlobalMid_Lock);
if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) {
midQ[i]->mid_flags |= MID_WAIT_CANCELLED;
- midQ[i]->callback = DeleteMidQEntry;
- spin_unlock(&GlobalMid_Lock);
- add_credits(ses->server, 1, optype);
- return rc;
+ midQ[i]->callback = cifs_cancelled_callback;
+ cancelled_mid[i] = true;
+ credits[i].value = 0;
}
spin_unlock(&GlobalMid_Lock);
}
}
- for (i = 0; i < num_rqst; i++)
- if (midQ[i]->resp_buf)
- credits += ses->server->ops->get_credits(midQ[i]);
- if (!credits)
- credits = 1;
-
for (i = 0; i < num_rqst; i++) {
if (rc < 0)
goto out;
rc = cifs_sync_mid_result(midQ[i], ses->server);
if (rc != 0) {
- add_credits(ses->server, credits, optype);
- return rc;
+ /* mark this mid as cancelled to not free it below */
+ cancelled_mid[i] = true;
+ goto out;
}
if (!midQ[i]->resp_buf ||
@@ -951,9 +1136,10 @@ out:
* This is prevented above by using a noop callback that will not
* wake this thread except for the very last PDU.
*/
- for (i = 0; i < num_rqst; i++)
- cifs_delete_mid(midQ[i]);
- add_credits(ses->server, credits, optype);
+ for (i = 0; i < num_rqst; i++) {
+ if (!cancelled_mid[i])
+ cifs_delete_mid(midQ[i]);
+ }
return rc;
}
@@ -1015,6 +1201,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
unsigned int len = be32_to_cpu(in_buf->smb_buf_length);
struct kvec iov = { .iov_base = in_buf, .iov_len = len };
struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 };
+ struct cifs_credits credits = { .value = 1, .instance = 0 };
if (ses == NULL) {
cifs_dbg(VFS, "Null smb session\n");
@@ -1038,7 +1225,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
return -EIO;
}
- rc = wait_for_free_request(ses->server, timeout, 0);
+ rc = wait_for_free_request(ses->server, timeout, 0, &credits.instance);
if (rc)
return rc;
@@ -1052,7 +1239,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
if (rc) {
mutex_unlock(&ses->server->srv_mutex);
/* Update # of requests on wire to server */
- add_credits(ses->server, 1, 0);
+ add_credits(ses->server, &credits, 0);
return rc;
}
@@ -1088,7 +1275,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
/* no longer considered to be "in-flight" */
midQ->callback = DeleteMidQEntry;
spin_unlock(&GlobalMid_Lock);
- add_credits(ses->server, 1, 0);
+ add_credits(ses->server, &credits, 0);
return rc;
}
spin_unlock(&GlobalMid_Lock);
@@ -1096,7 +1283,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
rc = cifs_sync_mid_result(midQ, ses->server);
if (rc != 0) {
- add_credits(ses->server, 1, 0);
+ add_credits(ses->server, &credits, 0);
return rc;
}
@@ -1112,7 +1299,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
rc = cifs_check_receive(midQ, ses->server, 0);
out:
cifs_delete_mid(midQ);
- add_credits(ses->server, 1, 0);
+ add_credits(ses->server, &credits, 0);
return rc;
}
@@ -1154,6 +1341,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
unsigned int len = be32_to_cpu(in_buf->smb_buf_length);
struct kvec iov = { .iov_base = in_buf, .iov_len = len };
struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 };
+ unsigned int instance;
if (tcon == NULL || tcon->ses == NULL) {
cifs_dbg(VFS, "Null smb session\n");
@@ -1179,7 +1367,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
return -EIO;
}
- rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, 0);
+ rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, 0,
+ &instance);
if (rc)
return rc;
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
index 02b7d91c9231..f0de238000c0 100644
--- a/fs/crypto/Kconfig
+++ b/fs/crypto/Kconfig
@@ -1,16 +1,16 @@
config FS_ENCRYPTION
- tristate "FS Encryption (Per-file encryption)"
+ bool "FS Encryption (Per-file encryption)"
select CRYPTO
select CRYPTO_AES
select CRYPTO_CBC
select CRYPTO_ECB
select CRYPTO_XTS
select CRYPTO_CTS
- select CRYPTO_CTR
select CRYPTO_SHA256
select KEYS
help
Enable encryption of files and directories. This
feature is similar to ecryptfs, but it is more memory
efficient since it avoids caching the encrypted and
- decrypted pages in the page cache.
+ decrypted pages in the page cache. Currently Ext4,
+ F2FS and UBIFS make use of this feature.
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 0959044c5cee..5759bcd018cd 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -30,8 +30,9 @@ static void __fscrypt_decrypt_bio(struct bio *bio, bool done)
{
struct bio_vec *bv;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, i, iter_all) {
struct page *page = bv->bv_page;
int ret = fscrypt_decrypt_page(page->mapping->host, page,
PAGE_SIZE, 0, page->index);
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 7424f851eb5c..7da276159593 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -12,7 +12,6 @@
#ifndef _FSCRYPT_PRIVATE_H
#define _FSCRYPT_PRIVATE_H
-#define __FS_HAS_ENCRYPTION 1
#include <linux/fscrypt.h>
#include <crypto/hash.h>
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index 926e5df20ec3..56debb1fcf5e 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -58,7 +58,7 @@ int __fscrypt_prepare_link(struct inode *inode, struct inode *dir)
return err;
if (!fscrypt_has_permitted_context(dir, inode))
- return -EPERM;
+ return -EXDEV;
return 0;
}
@@ -82,13 +82,13 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry,
if (IS_ENCRYPTED(new_dir) &&
!fscrypt_has_permitted_context(new_dir,
d_inode(old_dentry)))
- return -EPERM;
+ return -EXDEV;
if ((flags & RENAME_EXCHANGE) &&
IS_ENCRYPTED(old_dir) &&
!fscrypt_has_permitted_context(old_dir,
d_inode(new_dentry)))
- return -EPERM;
+ return -EXDEV;
}
return 0;
}
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 1e11a683f63d..322ce9686bdb 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -47,7 +47,7 @@ static int derive_key_aes(const u8 *master_key,
tfm = NULL;
goto out;
}
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
req = skcipher_request_alloc(tfm, GFP_NOFS);
if (!req) {
res = -ENOMEM;
@@ -257,7 +257,7 @@ allocate_skcipher_for_mode(struct fscrypt_mode *mode, const u8 *raw_key,
mode->friendly_name,
crypto_skcipher_alg(tfm)->base.cra_driver_name);
}
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize);
if (err)
goto err_free_tfm;
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index f490de921ce8..bd7eaf9b3f00 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -151,8 +151,7 @@ EXPORT_SYMBOL(fscrypt_ioctl_get_policy);
* malicious offline violations of this constraint, while the link and rename
* checks are needed to prevent online violations of this constraint.
*
- * Return: 1 if permitted, 0 if forbidden. If forbidden, the caller must fail
- * the filesystem operation with EPERM.
+ * Return: 1 if permitted, 0 if forbidden.
*/
int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
{
diff --git a/fs/dcache.c b/fs/dcache.c
index 2593153471cf..aac41adf4743 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -119,6 +119,7 @@ struct dentry_stat_t dentry_stat = {
static DEFINE_PER_CPU(long, nr_dentry);
static DEFINE_PER_CPU(long, nr_dentry_unused);
+static DEFINE_PER_CPU(long, nr_dentry_negative);
#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
@@ -152,11 +153,22 @@ static long get_nr_dentry_unused(void)
return sum < 0 ? 0 : sum;
}
+static long get_nr_dentry_negative(void)
+{
+ int i;
+ long sum = 0;
+
+ for_each_possible_cpu(i)
+ sum += per_cpu(nr_dentry_negative, i);
+ return sum < 0 ? 0 : sum;
+}
+
int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer,
size_t *lenp, loff_t *ppos)
{
dentry_stat.nr_dentry = get_nr_dentry();
dentry_stat.nr_unused = get_nr_dentry_unused();
+ dentry_stat.nr_negative = get_nr_dentry_negative();
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
#endif
@@ -317,6 +329,8 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry)
flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
WRITE_ONCE(dentry->d_flags, flags);
dentry->d_inode = NULL;
+ if (dentry->d_flags & DCACHE_LRU_LIST)
+ this_cpu_inc(nr_dentry_negative);
}
static void dentry_free(struct dentry *dentry)
@@ -371,6 +385,11 @@ static void dentry_unlink_inode(struct dentry * dentry)
* The per-cpu "nr_dentry_unused" counters are updated with
* the DCACHE_LRU_LIST bit.
*
+ * The per-cpu "nr_dentry_negative" counters are only updated
+ * when deleted from or added to the per-superblock LRU list, not
+ * from/to the shrink list. That is to avoid an unneeded dec/inc
+ * pair when moving from LRU to shrink list in select_collect().
+ *
* These helper functions make sure we always follow the
* rules. d_lock must be held by the caller.
*/
@@ -380,6 +399,8 @@ static void d_lru_add(struct dentry *dentry)
D_FLAG_VERIFY(dentry, 0);
dentry->d_flags |= DCACHE_LRU_LIST;
this_cpu_inc(nr_dentry_unused);
+ if (d_is_negative(dentry))
+ this_cpu_inc(nr_dentry_negative);
WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
}
@@ -388,6 +409,8 @@ static void d_lru_del(struct dentry *dentry)
D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
dentry->d_flags &= ~DCACHE_LRU_LIST;
this_cpu_dec(nr_dentry_unused);
+ if (d_is_negative(dentry))
+ this_cpu_dec(nr_dentry_negative);
WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
}
@@ -418,6 +441,8 @@ static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry)
D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
dentry->d_flags &= ~DCACHE_LRU_LIST;
this_cpu_dec(nr_dentry_unused);
+ if (d_is_negative(dentry))
+ this_cpu_dec(nr_dentry_negative);
list_lru_isolate(lru, &dentry->d_lru);
}
@@ -426,6 +451,8 @@ static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
{
D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
dentry->d_flags |= DCACHE_SHRINK_LIST;
+ if (d_is_negative(dentry))
+ this_cpu_dec(nr_dentry_negative);
list_lru_isolate_move(lru, &dentry->d_lru, list);
}
@@ -1188,15 +1215,11 @@ static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
*/
void shrink_dcache_sb(struct super_block *sb)
{
- long freed;
-
do {
LIST_HEAD(dispose);
- freed = list_lru_walk(&sb->s_dentry_lru,
+ list_lru_walk(&sb->s_dentry_lru,
dentry_lru_isolate_shrink, &dispose, 1024);
-
- this_cpu_sub(nr_dentry_unused, freed);
shrink_dentry_list(&dispose);
} while (list_lru_count(&sb->s_dentry_lru) > 0);
}
@@ -1820,6 +1843,11 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
WARN_ON(d_in_lookup(dentry));
spin_lock(&dentry->d_lock);
+ /*
+ * Decrement negative dentry count if it was in the LRU list.
+ */
+ if (dentry->d_flags & DCACHE_LRU_LIST)
+ this_cpu_dec(nr_dentry_negative);
hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
raw_write_seqcount_begin(&dentry->d_seq);
__d_set_inode_and_type(dentry, inode, add_flags);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 13b01351dd1c..95b5e78c22b1 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -324,7 +324,7 @@ static struct dentry *failed_creating(struct dentry *dentry)
inode_unlock(d_inode(dentry->d_parent));
dput(dentry);
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
static struct dentry *end_creating(struct dentry *dentry)
@@ -347,7 +347,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
dentry = start_creating(name, parent);
if (IS_ERR(dentry))
- return NULL;
+ return dentry;
inode = debugfs_get_inode(dentry->d_sb);
if (unlikely(!inode))
@@ -386,7 +386,8 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
* This function will return a pointer to a dentry if it succeeds. This
* pointer must be passed to the debugfs_remove() function when the file is
* to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, %NULL will be returned.
+ * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be
+ * returned.
*
* If debugfs is not enabled in the kernel, the value -%ENODEV will be
* returned.
@@ -422,8 +423,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_file);
* debugfs core.
*
* It is your responsibility to protect your struct file_operation
- * methods against file removals by means of debugfs_use_file_start()
- * and debugfs_use_file_finish(). ->open() is still protected by
+ * methods against file removals by means of debugfs_file_get()
+ * and debugfs_file_put(). ->open() is still protected by
* debugfs though.
*
* Any struct file_operations defined by means of
@@ -464,7 +465,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe);
* This function will return a pointer to a dentry if it succeeds. This
* pointer must be passed to the debugfs_remove() function when the file is
* to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, %NULL will be returned.
+ * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be
+ * returned.
*
* If debugfs is not enabled in the kernel, the value -%ENODEV will be
* returned.
@@ -495,7 +497,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_size);
* This function will return a pointer to a dentry if it succeeds. This
* pointer must be passed to the debugfs_remove() function when the file is
* to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, %NULL will be returned.
+ * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be
+ * returned.
*
* If debugfs is not enabled in the kernel, the value -%ENODEV will be
* returned.
@@ -506,7 +509,7 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
struct inode *inode;
if (IS_ERR(dentry))
- return NULL;
+ return dentry;
inode = debugfs_get_inode(dentry->d_sb);
if (unlikely(!inode))
@@ -545,7 +548,7 @@ struct dentry *debugfs_create_automount(const char *name,
struct inode *inode;
if (IS_ERR(dentry))
- return NULL;
+ return dentry;
inode = debugfs_get_inode(dentry->d_sb);
if (unlikely(!inode))
@@ -581,8 +584,8 @@ EXPORT_SYMBOL(debugfs_create_automount);
* This function will return a pointer to a dentry if it succeeds. This
* pointer must be passed to the debugfs_remove() function when the symbolic
* link is to be removed (no automatic cleanup happens if your module is
- * unloaded, you are responsible here.) If an error occurs, %NULL will be
- * returned.
+ * unloaded, you are responsible here.) If an error occurs, %ERR_PTR(-ERROR)
+ * will be returned.
*
* If debugfs is not enabled in the kernel, the value -%ENODEV will be
* returned.
@@ -594,12 +597,12 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
struct inode *inode;
char *link = kstrdup(target, GFP_KERNEL);
if (!link)
- return NULL;
+ return ERR_PTR(-ENOMEM);
dentry = start_creating(name, parent);
if (IS_ERR(dentry)) {
kfree(link);
- return NULL;
+ return dentry;
}
inode = debugfs_get_inode(dentry->d_sb);
@@ -787,6 +790,13 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
struct dentry *dentry = NULL, *trap;
struct name_snapshot old_name;
+ if (IS_ERR(old_dir))
+ return old_dir;
+ if (IS_ERR(new_dir))
+ return new_dir;
+ if (IS_ERR_OR_NULL(old_dentry))
+ return old_dentry;
+
trap = lock_rename(new_dir, old_dir);
/* Source or destination directories don't exist? */
if (d_really_is_negative(old_dir) || d_really_is_negative(new_dir))
@@ -820,7 +830,9 @@ exit:
if (dentry && !IS_ERR(dentry))
dput(dentry);
unlock_rename(new_dir, old_dir);
- return NULL;
+ if (IS_ERR(dentry))
+ return dentry;
+ return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL_GPL(debugfs_rename);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index dbc1a1f080ce..9bb015bc4a83 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -551,7 +551,9 @@ static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio)
if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) {
bio_check_pages_dirty(bio); /* transfers ownership */
} else {
- bio_for_each_segment_all(bvec, bio, i) {
+ struct bvec_iter_all iter_all;
+
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
if (dio->op == REQ_OP_READ && !PageCompound(page) &&
@@ -679,6 +681,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
unsigned long fs_count; /* Number of filesystem-sized blocks */
int create;
unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor;
+ loff_t i_size;
/*
* If there was a memory error and we've overwritten all the
@@ -708,8 +711,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
*/
create = dio->op == REQ_OP_WRITE;
if (dio->flags & DIO_SKIP_HOLES) {
- if (fs_startblk <= ((i_size_read(dio->inode) - 1) >>
- i_blkbits))
+ i_size = i_size_read(dio->inode);
+ if (i_size && fs_startblk <= (i_size - 1) >> i_blkbits)
create = 0;
}
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 76976d6e50f9..c98ad9777ad9 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1089,12 +1089,12 @@ static void sctp_connect_to_sock(struct connection *con)
* since O_NONBLOCK argument in connect() function does not work here,
* then, we should restore the default value of this attribute.
*/
- kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv,
+ kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_OLD, (char *)&tv,
sizeof(tv));
result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len,
0);
memset(&tv, 0, sizeof(tv));
- kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv,
+ kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_OLD, (char *)&tv,
sizeof(tv));
if (result == -EINPROGRESS)
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 82377017130f..d31b6c72b476 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -21,8 +21,13 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
spin_lock(&sb->s_inode_list_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
spin_lock(&inode->i_lock);
+ /*
+ * We must skip inodes in unusual state. We may also skip
+ * inodes without pages but we deliberately won't in case
+ * we need to reschedule to avoid softlockups.
+ */
if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
- (inode->i_mapping->nrpages == 0)) {
+ (inode->i_mapping->nrpages == 0 && !need_resched())) {
spin_unlock(&inode->i_lock);
continue;
}
@@ -30,6 +35,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
spin_unlock(&inode->i_lock);
spin_unlock(&sb->s_inode_list_lock);
+ cond_resched();
invalidate_mapping_pages(inode->i_mapping, 0, -1);
iput(toput_inode);
toput_inode = inode;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 4dd842f72846..f664da55234e 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -610,7 +610,8 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
full_alg_name);
goto out_free;
}
- crypto_skcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ crypto_skcipher_set_flags(crypt_stat->tfm,
+ CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
rc = 0;
out_free:
kfree(full_alg_name);
@@ -1590,7 +1591,7 @@ ecryptfs_process_key_cipher(struct crypto_skcipher **key_tfm,
"[%s]; rc = [%d]\n", full_alg_name, rc);
goto out;
}
- crypto_skcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ crypto_skcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
if (*key_size == 0)
*key_size = crypto_skcipher_default_keysize(*key_tfm);
get_random_bytes(dummy_key, *key_size);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index a5d219d920e7..4a0e98d87fcc 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -50,10 +50,10 @@
*
* 1) epmutex (mutex)
* 2) ep->mtx (mutex)
- * 3) ep->wq.lock (spinlock)
+ * 3) ep->lock (rwlock)
*
* The acquire order is the one listed above, from 1 to 3.
- * We need a spinlock (ep->wq.lock) because we manipulate objects
+ * We need a rwlock (ep->lock) because we manipulate objects
* from inside the poll callback, that might be triggered from
* a wake_up() that in turn might be called from IRQ context.
* So we can't sleep inside the poll callback and hence we need
@@ -85,7 +85,7 @@
* of epoll file descriptors, we use the current recursion depth as
* the lockdep subkey.
* It is possible to drop the "ep->mtx" and to use the global
- * mutex "epmutex" (together with "ep->wq.lock") to have it working,
+ * mutex "epmutex" (together with "ep->lock") to have it working,
* but having "ep->mtx" will make the interface more scalable.
* Events that require holding "epmutex" are very rare, while for
* normal operations the epoll private "ep->mtx" will guarantee
@@ -182,8 +182,6 @@ struct epitem {
* This structure is stored inside the "private_data" member of the file
* structure and represents the main data structure for the eventpoll
* interface.
- *
- * Access to it is protected by the lock inside wq.
*/
struct eventpoll {
/*
@@ -203,13 +201,16 @@ struct eventpoll {
/* List of ready file descriptors */
struct list_head rdllist;
+ /* Lock which protects rdllist and ovflist */
+ rwlock_t lock;
+
/* RB tree root used to store monitored fd structs */
struct rb_root_cached rbr;
/*
* This is a single linked list that chains all the "struct epitem" that
* happened while transferring ready events to userspace w/out
- * holding ->wq.lock.
+ * holding ->lock.
*/
struct epitem *ovflist;
@@ -697,17 +698,17 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* because we want the "sproc" callback to be able to do it
* in a lockless way.
*/
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
list_splice_init(&ep->rdllist, &txlist);
WRITE_ONCE(ep->ovflist, NULL);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
/*
* Now call the callback function.
*/
res = (*sproc)(ep, &txlist, priv);
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
/*
* During the time we spent inside the "sproc" callback, some
* other events might have been queued by the poll callback.
@@ -722,7 +723,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* contain them, and the list_splice() below takes care of them.
*/
if (!ep_is_linked(epi)) {
- list_add_tail(&epi->rdllink, &ep->rdllist);
+ /*
+ * ->ovflist is LIFO, so we have to reverse it in order
+ * to keep in FIFO.
+ */
+ list_add(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
}
}
@@ -745,11 +750,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* the ->poll() wait list (delayed after we release the lock).
*/
if (waitqueue_active(&ep->wq))
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
if (!ep_locked)
mutex_unlock(&ep->mtx);
@@ -789,10 +794,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
rb_erase_cached(&epi->rbn, &ep->rbr);
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
/*
@@ -842,7 +847,7 @@ static void ep_free(struct eventpoll *ep)
* Walks through the whole tree by freeing each "struct epitem". At this
* point we are sure no poll callbacks will be lingering around, and also by
* holding "epmutex" we can be sure that no file cleanup code will hit
- * us during this operation. So we can avoid the lock on "ep->wq.lock".
+ * us during this operation. So we can avoid the lock on "ep->lock".
* We do not need to lock ep->mtx, either, we only do it to prevent
* a lockdep warning.
*/
@@ -1023,6 +1028,7 @@ static int ep_alloc(struct eventpoll **pep)
goto free_uid;
mutex_init(&ep->mtx);
+ rwlock_init(&ep->lock);
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
INIT_LIST_HEAD(&ep->rdllist);
@@ -1112,21 +1118,107 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
}
#endif /* CONFIG_CHECKPOINT_RESTORE */
+/**
+ * Adds a new entry to the tail of the list in a lockless way, i.e.
+ * multiple CPUs are allowed to call this function concurrently.
+ *
+ * Beware: it is necessary to prevent any other modifications of the
+ * existing list until all changes are completed, in other words
+ * concurrent list_add_tail_lockless() calls should be protected
+ * with a read lock, where write lock acts as a barrier which
+ * makes sure all list_add_tail_lockless() calls are fully
+ * completed.
+ *
+ * Also an element can be locklessly added to the list only in one
+ * direction i.e. either to the tail either to the head, otherwise
+ * concurrent access will corrupt the list.
+ *
+ * Returns %false if element has been already added to the list, %true
+ * otherwise.
+ */
+static inline bool list_add_tail_lockless(struct list_head *new,
+ struct list_head *head)
+{
+ struct list_head *prev;
+
+ /*
+ * This is simple 'new->next = head' operation, but cmpxchg()
+ * is used in order to detect that same element has been just
+ * added to the list from another CPU: the winner observes
+ * new->next == new.
+ */
+ if (cmpxchg(&new->next, new, head) != new)
+ return false;
+
+ /*
+ * Initially ->next of a new element must be updated with the head
+ * (we are inserting to the tail) and only then pointers are atomically
+ * exchanged. XCHG guarantees memory ordering, thus ->next should be
+ * updated before pointers are actually swapped and pointers are
+ * swapped before prev->next is updated.
+ */
+
+ prev = xchg(&head->prev, new);
+
+ /*
+ * It is safe to modify prev->next and new->prev, because a new element
+ * is added only to the tail and new->next is updated before XCHG.
+ */
+
+ prev->next = new;
+ new->prev = prev;
+
+ return true;
+}
+
+/**
+ * Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
+ * i.e. multiple CPUs are allowed to call this function concurrently.
+ *
+ * Returns %false if epi element has been already chained, %true otherwise.
+ */
+static inline bool chain_epi_lockless(struct epitem *epi)
+{
+ struct eventpoll *ep = epi->ep;
+
+ /* Check that the same epi has not been just chained from another CPU */
+ if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
+ return false;
+
+ /* Atomically exchange tail */
+ epi->next = xchg(&ep->ovflist, epi);
+
+ return true;
+}
+
/*
* This is the callback that is passed to the wait queue wakeup
* mechanism. It is called by the stored file descriptors when they
* have events to report.
+ *
+ * This callback takes a read lock in order not to content with concurrent
+ * events from another file descriptors, thus all modifications to ->rdllist
+ * or ->ovflist are lockless. Read lock is paired with the write lock from
+ * ep_scan_ready_list(), which stops all list modifications and guarantees
+ * that lists state is seen correctly.
+ *
+ * Another thing worth to mention is that ep_poll_callback() can be called
+ * concurrently for the same @epi from different CPUs if poll table was inited
+ * with several wait queues entries. Plural wakeup from different CPUs of a
+ * single wait queue is serialized by wq.lock, but the case when multiple wait
+ * queues are used should be detected accordingly. This is detected using
+ * cmpxchg() operation.
*/
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
int pwake = 0;
- unsigned long flags;
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;
__poll_t pollflags = key_to_poll(key);
+ unsigned long flags;
int ewake = 0;
- spin_lock_irqsave(&ep->wq.lock, flags);
+ read_lock_irqsave(&ep->lock, flags);
ep_set_busy_poll_napi_id(epi);
@@ -1155,24 +1247,15 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
* chained in ep->ovflist and requeued later on.
*/
if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
- if (epi->next == EP_UNACTIVE_PTR) {
- epi->next = READ_ONCE(ep->ovflist);
- WRITE_ONCE(ep->ovflist, epi);
- if (epi->ws) {
- /*
- * Activate ep->ws since epi->ws may get
- * deactivated at any time.
- */
- __pm_stay_awake(ep->ws);
- }
-
- }
+ if (epi->next == EP_UNACTIVE_PTR &&
+ chain_epi_lockless(epi))
+ ep_pm_stay_awake_rcu(epi);
goto out_unlock;
}
/* If this file is already in the ready list we exit soon */
- if (!ep_is_linked(epi)) {
- list_add_tail(&epi->rdllink, &ep->rdllist);
+ if (!ep_is_linked(epi) &&
+ list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) {
ep_pm_stay_awake_rcu(epi);
}
@@ -1197,13 +1280,13 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
break;
}
}
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
}
if (waitqueue_active(&ep->poll_wait))
pwake++;
out_unlock:
- spin_unlock_irqrestore(&ep->wq.lock, flags);
+ read_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
@@ -1488,7 +1571,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
goto error_remove_epi;
/* We have to drop the new item inside our item list to keep track of it */
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
/* record NAPI ID of new item if present */
ep_set_busy_poll_napi_id(epi);
@@ -1500,12 +1583,12 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
atomic_long_inc(&ep->user->epoll_watches);
@@ -1531,10 +1614,10 @@ error_unregister:
* list, since that is used/cleaned only inside a section bound by "mtx".
* And ep_insert() is called with "mtx" held.
*/
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
@@ -1578,9 +1661,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
* 1) Flush epi changes above to other CPUs. This ensures
* we do not miss events from ep_poll_callback if an
* event occurs immediately after we call f_op->poll().
- * We need this because we did not take ep->wq.lock while
+ * We need this because we did not take ep->lock while
* changing epi above (but ep_poll_callback does take
- * ep->wq.lock).
+ * ep->lock).
*
* 2) We also need to ensure we do not miss _past_ events
* when calling f_op->poll(). This barrier also
@@ -1599,18 +1682,18 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
* list, push it inside.
*/
if (ep_item_poll(epi, &pt, 1)) {
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
if (!ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
}
/* We have to call this outside the lock */
@@ -1771,9 +1854,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
*/
timed_out = 1;
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
eavail = ep_events_available(ep);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
goto send_events;
}
diff --git a/fs/exec.c b/fs/exec.c
index fb72d36f7823..2e0033348d8e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -932,7 +932,7 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size,
bytes = kernel_read(file, *buf + pos, i_size - pos, &pos);
if (bytes < 0) {
ret = bytes;
- goto out;
+ goto out_free;
}
if (bytes == 0)
@@ -1189,7 +1189,7 @@ no_thread_group:
flush_itimer_signals();
#endif
- if (atomic_read(&oldsighand->count) != 1) {
+ if (refcount_read(&oldsighand->count) != 1) {
struct sighand_struct *newsighand;
/*
* This ->sighand is shared with the CLONE_SIGHAND
@@ -1199,7 +1199,7 @@ no_thread_group:
if (!newsighand)
return -ENOMEM;
- atomic_set(&newsighand->count, 1);
+ refcount_set(&newsighand->count, 1);
memcpy(newsighand->action, oldsighand->action,
sizeof(newsighand->action));
@@ -1563,7 +1563,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm)
/*
* Fill the binprm structure from the inode.
- * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
+ * Check permissions, then read the first BINPRM_BUF_SIZE bytes
*
* This may be called multiple times for binary chains (scripts for example).
*/
@@ -1944,15 +1944,10 @@ EXPORT_SYMBOL(set_binfmt);
*/
void set_dumpable(struct mm_struct *mm, int value)
{
- unsigned long old, new;
-
if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
return;
- do {
- old = READ_ONCE(mm->flags);
- new = (old & ~MMF_DUMPABLE_MASK) | value;
- } while (cmpxchg(&mm->flags, old, new) != old);
+ set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value);
}
SYSCALL_DEFINE3(execve,
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 3b8114def693..13318e255ebf 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -252,33 +252,10 @@ ext2_validate_entry(char *base, unsigned offset, unsigned mask)
return (char *)p - base;
}
-static unsigned char ext2_filetype_table[EXT2_FT_MAX] = {
- [EXT2_FT_UNKNOWN] = DT_UNKNOWN,
- [EXT2_FT_REG_FILE] = DT_REG,
- [EXT2_FT_DIR] = DT_DIR,
- [EXT2_FT_CHRDEV] = DT_CHR,
- [EXT2_FT_BLKDEV] = DT_BLK,
- [EXT2_FT_FIFO] = DT_FIFO,
- [EXT2_FT_SOCK] = DT_SOCK,
- [EXT2_FT_SYMLINK] = DT_LNK,
-};
-
-#define S_SHIFT 12
-static unsigned char ext2_type_by_mode[S_IFMT >> S_SHIFT] = {
- [S_IFREG >> S_SHIFT] = EXT2_FT_REG_FILE,
- [S_IFDIR >> S_SHIFT] = EXT2_FT_DIR,
- [S_IFCHR >> S_SHIFT] = EXT2_FT_CHRDEV,
- [S_IFBLK >> S_SHIFT] = EXT2_FT_BLKDEV,
- [S_IFIFO >> S_SHIFT] = EXT2_FT_FIFO,
- [S_IFSOCK >> S_SHIFT] = EXT2_FT_SOCK,
- [S_IFLNK >> S_SHIFT] = EXT2_FT_SYMLINK,
-};
-
static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode)
{
- umode_t mode = inode->i_mode;
if (EXT2_HAS_INCOMPAT_FEATURE(inode->i_sb, EXT2_FEATURE_INCOMPAT_FILETYPE))
- de->file_type = ext2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+ de->file_type = fs_umode_to_ftype(inode->i_mode);
else
de->file_type = 0;
}
@@ -293,14 +270,14 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
unsigned long n = pos >> PAGE_SHIFT;
unsigned long npages = dir_pages(inode);
unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
- unsigned char *types = NULL;
bool need_revalidate = !inode_eq_iversion(inode, file->f_version);
+ bool has_filetype;
if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
return 0;
- if (EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE))
- types = ext2_filetype_table;
+ has_filetype =
+ EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE);
for ( ; n < npages; n++, offset = 0) {
char *kaddr, *limit;
@@ -335,8 +312,8 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
if (de->inode) {
unsigned char d_type = DT_UNKNOWN;
- if (types && de->file_type < EXT2_FT_MAX)
- d_type = types[de->file_type];
+ if (has_filetype)
+ d_type = fs_ftype_to_dtype(de->file_type);
if (!dir_emit(ctx, de->name, de->name_len,
le32_to_cpu(de->inode),
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index e770cd100a6a..10ab238de9a6 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -604,22 +604,6 @@ struct ext2_dir_entry_2 {
};
/*
- * Ext2 directory file types. Only the low 3 bits are used. The
- * other bits are reserved for now.
- */
-enum {
- EXT2_FT_UNKNOWN = 0,
- EXT2_FT_REG_FILE = 1,
- EXT2_FT_DIR = 2,
- EXT2_FT_CHRDEV = 3,
- EXT2_FT_BLKDEV = 4,
- EXT2_FT_FIFO = 5,
- EXT2_FT_SOCK = 6,
- EXT2_FT_SYMLINK = 7,
- EXT2_FT_MAX
-};
-
-/*
* EXT2_DIR_PAD defines the directory entries boundaries
*
* NOTE: It must be a multiple of 4
@@ -774,6 +758,7 @@ extern int ext2_write_inode (struct inode *, struct writeback_control *);
extern void ext2_evict_inode(struct inode *);
extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
extern int ext2_setattr (struct dentry *, struct iattr *);
+extern int ext2_getattr (const struct path *, struct kstat *, u32, unsigned int);
extern void ext2_set_inode_flags(struct inode *inode);
extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 28b2609f25c1..39c4772e96c9 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -199,6 +199,7 @@ const struct inode_operations ext2_file_inode_operations = {
#ifdef CONFIG_EXT2_FS_XATTR
.listxattr = ext2_listxattr,
#endif
+ .getattr = ext2_getattr,
.setattr = ext2_setattr,
.get_acl = ext2_get_acl,
.set_acl = ext2_set_acl,
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 5c3d7b7e4975..a0c5ea91fcd4 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -222,8 +222,6 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
best_desc = desc;
}
}
- if (!best_desc)
- return -1;
return best_group;
}
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index e4bb9386c045..c27c27300d95 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -717,7 +717,7 @@ static int ext2_get_blocks(struct inode *inode,
/* the number of blocks need to allocate for [d,t]indirect blocks */
indirect_blks = (chain + depth) - partial - 1;
/*
- * Next look up the indirect map to count the totoal number of
+ * Next look up the indirect map to count the total number of
* direct blocks to allocate for this branch.
*/
count = ext2_blks_to_allocate(partial, indirect_blks,
@@ -1239,6 +1239,7 @@ do_indirects:
mark_inode_dirty(inode);
ext2_free_branches(inode, &nr, &nr+1, 1);
}
+ /* fall through */
case EXT2_IND_BLOCK:
nr = i_data[EXT2_DIND_BLOCK];
if (nr) {
@@ -1246,6 +1247,7 @@ do_indirects:
mark_inode_dirty(inode);
ext2_free_branches(inode, &nr, &nr+1, 2);
}
+ /* fall through */
case EXT2_DIND_BLOCK:
nr = i_data[EXT2_TIND_BLOCK];
if (nr) {
@@ -1635,6 +1637,32 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
}
+int ext2_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_falgs)
+{
+ struct inode *inode = d_inode(path->dentry);
+ struct ext2_inode_info *ei = EXT2_I(inode);
+ unsigned int flags;
+
+ flags = ei->i_flags & EXT2_FL_USER_VISIBLE;
+ if (flags & EXT2_APPEND_FL)
+ stat->attributes |= STATX_ATTR_APPEND;
+ if (flags & EXT2_COMPR_FL)
+ stat->attributes |= STATX_ATTR_COMPRESSED;
+ if (flags & EXT2_IMMUTABLE_FL)
+ stat->attributes |= STATX_ATTR_IMMUTABLE;
+ if (flags & EXT2_NODUMP_FL)
+ stat->attributes |= STATX_ATTR_NODUMP;
+ stat->attributes_mask |= (STATX_ATTR_APPEND |
+ STATX_ATTR_COMPRESSED |
+ STATX_ATTR_ENCRYPTED |
+ STATX_ATTR_IMMUTABLE |
+ STATX_ATTR_NODUMP);
+
+ generic_fillattr(inode, stat);
+ return 0;
+}
+
int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
{
struct inode *inode = d_inode(dentry);
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 0c26dcc5d850..ccfbbf59e2fc 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -416,6 +416,7 @@ const struct inode_operations ext2_dir_inode_operations = {
#ifdef CONFIG_EXT2_FS_XATTR
.listxattr = ext2_listxattr,
#endif
+ .getattr = ext2_getattr,
.setattr = ext2_setattr,
.get_acl = ext2_get_acl,
.set_acl = ext2_set_acl,
@@ -426,6 +427,7 @@ const struct inode_operations ext2_special_inode_operations = {
#ifdef CONFIG_EXT2_FS_XATTR
.listxattr = ext2_listxattr,
#endif
+ .getattr = ext2_getattr,
.setattr = ext2_setattr,
.get_acl = ext2_get_acl,
.set_acl = ext2_set_acl,
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 73b2d528237f..0128010a0874 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -757,7 +757,8 @@ static loff_t ext2_max_size(int bits)
{
loff_t res = EXT2_NDIR_BLOCKS;
int meta_blocks;
- loff_t upper_limit;
+ unsigned int upper_limit;
+ unsigned int ppb = 1 << (bits-2);
/* This is calculated to be the largest file size for a
* dense, file such that the total number of
@@ -771,24 +772,34 @@ static loff_t ext2_max_size(int bits)
/* total blocks in file system block size */
upper_limit >>= (bits - 9);
+ /* Compute how many blocks we can address by block tree */
+ res += 1LL << (bits-2);
+ res += 1LL << (2*(bits-2));
+ res += 1LL << (3*(bits-2));
+ /* Does block tree limit file size? */
+ if (res < upper_limit)
+ goto check_lfs;
+ res = upper_limit;
+ /* How many metadata blocks are needed for addressing upper_limit? */
+ upper_limit -= EXT2_NDIR_BLOCKS;
/* indirect blocks */
meta_blocks = 1;
+ upper_limit -= ppb;
/* double indirect blocks */
- meta_blocks += 1 + (1LL << (bits-2));
- /* tripple indirect blocks */
- meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
-
- upper_limit -= meta_blocks;
- upper_limit <<= bits;
-
- res += 1LL << (bits-2);
- res += 1LL << (2*(bits-2));
- res += 1LL << (3*(bits-2));
+ if (upper_limit < ppb * ppb) {
+ meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb);
+ res -= meta_blocks;
+ goto check_lfs;
+ }
+ meta_blocks += 1 + ppb;
+ upper_limit -= ppb * ppb;
+ /* tripple indirect blocks for the rest */
+ meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb) +
+ DIV_ROUND_UP(upper_limit, ppb*ppb);
+ res -= meta_blocks;
+check_lfs:
res <<= bits;
- if (res > upper_limit)
- res = upper_limit;
-
if (res > MAX_LFS_FILESIZE)
res = MAX_LFS_FILESIZE;
@@ -1024,8 +1035,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
- if (EXT2_INODE_SIZE(sb) == 0)
- goto cantfind_ext2;
sbi->s_inodes_per_block = sb->s_blocksize / EXT2_INODE_SIZE(sb);
if (sbi->s_inodes_per_block == 0 || sbi->s_inodes_per_group == 0)
goto cantfind_ext2;
@@ -1087,12 +1096,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
sizeof(struct buffer_head *),
GFP_KERNEL);
if (sbi->s_group_desc == NULL) {
+ ret = -ENOMEM;
ext2_msg(sb, KERN_ERR, "error: not enough memory");
goto failed_mount;
}
bgl_lock_init(sbi->s_blockgroup_lock);
sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL);
if (!sbi->s_debts) {
+ ret = -ENOMEM;
ext2_msg(sb, KERN_ERR, "error: not enough memory");
goto failed_mount_group_desc;
}
@@ -1148,6 +1159,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_EXT2_FS_XATTR
sbi->s_ea_block_cache = ext2_xattr_create_cache();
if (!sbi->s_ea_block_cache) {
+ ret = -ENOMEM;
ext2_msg(sb, KERN_ERR, "Failed to create ea_block_cache");
goto failed_mount3;
}
diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c
index d5589ddcc281..00cdb8679486 100644
--- a/fs/ext2/symlink.c
+++ b/fs/ext2/symlink.c
@@ -23,6 +23,7 @@
const struct inode_operations ext2_symlink_inode_operations = {
.get_link = page_get_link,
+ .getattr = ext2_getattr,
.setattr = ext2_setattr,
#ifdef CONFIG_EXT2_FS_XATTR
.listxattr = ext2_listxattr,
@@ -31,6 +32,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
const struct inode_operations ext2_fast_symlink_inode_operations = {
.get_link = simple_get_link,
+ .getattr = ext2_getattr,
.setattr = ext2_setattr,
#ifdef CONFIG_EXT2_FS_XATTR
.listxattr = ext2_listxattr,
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 4f30876ee325..1e33e0ac8cf1 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -342,6 +342,7 @@ static void ext2_xattr_update_super_block(struct super_block *sb)
return;
spin_lock(&EXT2_SB(sb)->s_lock);
+ ext2_update_dynamic_rev(sb);
EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR);
spin_unlock(&EXT2_SB(sb)->s_lock);
mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index a453cc87082b..031e5a82d556 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -96,21 +96,6 @@ config EXT4_FS_SECURITY
If you are not using a security module that requires using
extended attributes for file security labels, say N.
-config EXT4_ENCRYPTION
- bool "Ext4 Encryption"
- depends on EXT4_FS
- select FS_ENCRYPTION
- help
- Enable encryption of ext4 files and directories. This
- feature is similar to ecryptfs, but it is more memory
- efficient since it avoids caching the encrypted and
- decrypted pages in the page cache.
-
-config EXT4_FS_ENCRYPTION
- bool
- default y
- depends on EXT4_ENCRYPTION
-
config EXT4_DEBUG
bool "EXT4 debugging support"
depends on EXT4_FS
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index f93f9881ec18..0ccd51f72048 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -111,7 +111,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
int dir_has_error = 0;
struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
- if (ext4_encrypted_inode(inode)) {
+ if (IS_ENCRYPTED(inode)) {
err = fscrypt_get_encryption_info(inode);
if (err && err != -ENOKEY)
return err;
@@ -138,7 +138,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
return err;
}
- if (ext4_encrypted_inode(inode)) {
+ if (IS_ENCRYPTED(inode)) {
err = fscrypt_fname_alloc_buffer(inode, EXT4_NAME_LEN, &fstr);
if (err < 0)
return err;
@@ -245,7 +245,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
offset += ext4_rec_len_from_disk(de->rec_len,
sb->s_blocksize);
if (le32_to_cpu(de->inode)) {
- if (!ext4_encrypted_inode(inode)) {
+ if (!IS_ENCRYPTED(inode)) {
if (!dir_emit(ctx, de->name,
de->name_len,
le32_to_cpu(de->inode),
@@ -283,9 +283,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
done:
err = 0;
errout:
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
fscrypt_fname_free_buffer(&fstr);
-#endif
brelse(bh);
return err;
}
@@ -613,7 +611,7 @@ finished:
static int ext4_dir_open(struct inode * inode, struct file * filp)
{
- if (ext4_encrypted_inode(inode))
+ if (IS_ENCRYPTED(inode))
return fscrypt_get_encryption_info(inode) ? -EACCES : 0;
return 0;
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 185a05d3257e..5012ddb6daf9 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -40,7 +40,6 @@
#include <linux/compat.h>
#endif
-#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_EXT4_FS_ENCRYPTION)
#include <linux/fscrypt.h>
#include <linux/compiler.h>
@@ -1326,7 +1325,7 @@ struct ext4_super_block {
#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
#define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
#define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \
EXT4_MF_TEST_DUMMY_ENCRYPTION))
#else
@@ -2051,7 +2050,7 @@ struct ext4_filename {
const struct qstr *usr_fname;
struct fscrypt_str disk_name;
struct dx_hash_info hinfo;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
struct fscrypt_str crypto_buf;
#endif
};
@@ -2279,12 +2278,7 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
struct ext4_group_desc *gdp);
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
-static inline bool ext4_encrypted_inode(struct inode *inode)
-{
- return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT);
-}
-
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
static inline int ext4_fname_setup_filename(struct inode *dir,
const struct qstr *iname,
int lookup, struct ext4_filename *fname)
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 15b6dd733780..a1ac7e9245ec 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -411,7 +411,7 @@ static inline int ext4_inode_journal_mode(struct inode *inode)
(ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
!test_opt(inode->i_sb, DELALLOC))) {
/* We do not support data journalling for encrypted data */
- if (S_ISREG(inode->i_mode) && ext4_encrypted_inode(inode))
+ if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */
}
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 240b6dea5441..79d986dbf5af 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3631,7 +3631,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
max_zeroout = sbi->s_extent_max_zeroout_kb >>
(inode->i_sb->s_blocksize_bits - 10);
- if (ext4_encrypted_inode(inode))
+ if (IS_ENCRYPTED(inode))
max_zeroout = 0;
/*
@@ -4818,7 +4818,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
* leave it disabled for encrypted inodes for now. This is a
* bug we should fix....
*/
- if (ext4_encrypted_inode(inode) &&
+ if (IS_ENCRYPTED(inode) &&
(mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE |
FALLOC_FL_ZERO_RANGE)))
return -EOPNOTSUPP;
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 712f00995390..5508baa11bb6 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -116,16 +116,8 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
}
- ret = file_write_and_wait_range(file, start, end);
- if (ret)
- return ret;
-
if (!journal) {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL
- };
-
- ret = ext4_write_inode(inode, &wbc);
+ ret = __generic_file_fsync(file, start, end, datasync);
if (!ret)
ret = ext4_sync_parent(inode);
if (test_opt(inode->i_sb, BARRIER))
@@ -133,6 +125,9 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
}
+ ret = file_write_and_wait_range(file, start, end);
+ if (ret)
+ return ret;
/*
* data=writeback,ordered:
* The caller's filemap_fdatawrite()/wait will sync the data.
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 7ff14a1adba3..f3e17a8c84b4 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -771,7 +771,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
if (unlikely(ext4_forced_shutdown(sbi)))
return ERR_PTR(-EIO);
- if ((ext4_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
+ if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) &&
!(i_flags & EXT4_EA_INODE_FL)) {
err = fscrypt_get_encryption_info(dir);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 34d7e0703cc6..4356ef6d728e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -415,7 +415,7 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
{
int ret;
- if (ext4_encrypted_inode(inode))
+ if (IS_ENCRYPTED(inode))
return fscrypt_zeroout_range(inode, lblk, pblk, len);
ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);
@@ -1150,7 +1150,7 @@ int do_journal_get_write_access(handle_t *handle,
return ret;
}
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
get_block_t *get_block)
{
@@ -1217,8 +1217,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
(block_start < from || block_end > to)) {
ll_rw_block(REQ_OP_READ, 0, 1, &bh);
*wait_bh++ = bh;
- decrypt = ext4_encrypted_inode(inode) &&
- S_ISREG(inode->i_mode);
+ decrypt = IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode);
}
}
/*
@@ -1303,7 +1302,7 @@ retry_journal:
/* In case writeback began while the page was unlocked */
wait_for_stable_page(page);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
if (ext4_should_dioread_nolock(inode))
ret = ext4_block_write_begin(page, pos, len,
ext4_get_block_unwritten);
@@ -3105,7 +3104,7 @@ retry_journal:
/* In case writeback began while the page was unlocked */
wait_for_stable_page(page);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
ret = ext4_block_write_begin(page, pos, len,
ext4_da_get_block_prep);
#else
@@ -3880,8 +3879,8 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
loff_t offset = iocb->ki_pos;
ssize_t ret;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
- if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+#ifdef CONFIG_FS_ENCRYPTION
+ if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
return 0;
#endif
@@ -4065,8 +4064,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
/* Uhhuh. Read error. Complain and punt. */
if (!buffer_uptodate(bh))
goto unlock;
- if (S_ISREG(inode->i_mode) &&
- ext4_encrypted_inode(inode)) {
+ if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) {
/* We expect the key to be set. */
BUG_ON(!fscrypt_has_encryption_key(inode));
BUG_ON(blocksize != PAGE_SIZE);
@@ -4142,7 +4140,7 @@ static int ext4_block_truncate_page(handle_t *handle,
struct inode *inode = mapping->host;
/* If we are processing an encrypted inode during orphan list handling */
- if (ext4_encrypted_inode(inode) && !fscrypt_has_encryption_key(inode))
+ if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode))
return 0;
blocksize = inode->i_sb->s_blocksize;
@@ -4722,7 +4720,7 @@ static bool ext4_should_use_dax(struct inode *inode)
return false;
if (ext4_has_inline_data(inode))
return false;
- if (ext4_encrypted_inode(inode))
+ if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT))
return false;
return true;
}
@@ -5072,7 +5070,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ret = -EFSCORRUPTED;
goto bad_inode;
}
- if (ext4_encrypted_inode(inode)) {
+ if (IS_ENCRYPTED(inode)) {
inode->i_op = &ext4_encrypted_symlink_inode_operations;
ext4_set_aops(inode);
} else if (ext4_inode_is_fast_symlink(inode)) {
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index d37dafa1d133..d26bcac291bb 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -210,7 +210,7 @@ journal_err_out:
return err;
}
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
static int uuid_is_zero(__u8 u[16])
{
int i;
@@ -978,7 +978,7 @@ resizefs_out:
return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
case EXT4_IOC_GET_ENCRYPTION_PWSALT: {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
int err, err2;
struct ext4_sb_info *sbi = EXT4_SB(sb);
handle_t *handle;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 2f5be02fc6f6..1083a9f3f16a 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -592,8 +592,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
return -EOPNOTSUPP;
}
- if (ext4_encrypted_inode(orig_inode) ||
- ext4_encrypted_inode(donor_inode)) {
+ if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) {
ext4_msg(orig_inode->i_sb, KERN_ERR,
"Online defrag not supported for encrypted files");
return -EOPNOTSUPP;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2b928eb07fa2..980166a8122a 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -612,7 +612,7 @@ static struct stats dx_show_leaf(struct inode *dir,
{
if (show_names)
{
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
int len;
char *name;
struct fscrypt_str fname_crypto_str =
@@ -621,7 +621,7 @@ static struct stats dx_show_leaf(struct inode *dir,
name = de->name;
len = de->name_len;
- if (ext4_encrypted_inode(dir))
+ if (IS_ENCRYPTED(dir))
res = fscrypt_get_encryption_info(dir);
if (res) {
printk(KERN_WARNING "Error setting up"
@@ -984,9 +984,9 @@ static int htree_dirblock_to_tree(struct file *dir_file,
top = (struct ext4_dir_entry_2 *) ((char *) de +
dir->i_sb->s_blocksize -
EXT4_DIR_REC_LEN(0));
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
/* Check if the directory is encrypted */
- if (ext4_encrypted_inode(dir)) {
+ if (IS_ENCRYPTED(dir)) {
err = fscrypt_get_encryption_info(dir);
if (err < 0) {
brelse(bh);
@@ -1015,7 +1015,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
continue;
if (de->inode == 0)
continue;
- if (!ext4_encrypted_inode(dir)) {
+ if (!IS_ENCRYPTED(dir)) {
tmp_str.name = de->name;
tmp_str.len = de->name_len;
err = ext4_htree_store_dirent(dir_file,
@@ -1047,7 +1047,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
}
errout:
brelse(bh);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
fscrypt_fname_free_buffer(&fname_crypto_str);
#endif
return count;
@@ -1267,7 +1267,7 @@ static inline bool ext4_match(const struct ext4_filename *fname,
f.usr_fname = fname->usr_fname;
f.disk_name = fname->disk_name;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
f.crypto_buf = fname->crypto_buf;
#endif
return fscrypt_match_name(&f, de->name, de->name_len);
@@ -1498,7 +1498,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
ext4_lblk_t block;
int retval;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
*res_dir = NULL;
#endif
frame = dx_probe(fname, dir, NULL, frames);
@@ -1578,7 +1578,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
ino);
return ERR_PTR(-EFSCORRUPTED);
}
- if (!IS_ERR(inode) && ext4_encrypted_inode(dir) &&
+ if (!IS_ERR(inode) && IS_ENCRYPTED(dir) &&
(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
!fscrypt_has_permitted_context(dir, inode)) {
ext4_warning(inode->i_sb,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 2aa62d58d8dd..6f5305e9a6ac 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -63,10 +63,11 @@ static void ext4_finish_bio(struct bio *bio)
{
int i;
struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
struct page *data_page = NULL;
#endif
struct buffer_head *bh, *head;
@@ -78,7 +79,7 @@ static void ext4_finish_bio(struct bio *bio)
if (!page)
continue;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
if (!page->mapping) {
/* The bounce data pages are unmapped. */
data_page = page;
@@ -111,7 +112,7 @@ static void ext4_finish_bio(struct bio *bio)
bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
local_irq_restore(flags);
if (!under_io) {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
if (data_page)
fscrypt_restore_control_page(data_page);
#endif
@@ -477,8 +478,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
bh = head = page_buffers(page);
- if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode) &&
- nr_to_submit) {
+ if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode) && nr_to_submit) {
gfp_t gfp_flags = GFP_NOFS;
retry_encrypt:
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 6aa282ee455a..3adadf461825 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -49,7 +49,7 @@
static inline bool ext4_bio_encrypted(struct bio *bio)
{
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
return unlikely(bio->bi_private != NULL);
#else
return false;
@@ -72,6 +72,7 @@ static void mpage_end_io(struct bio *bio)
{
struct bio_vec *bv;
int i;
+ struct bvec_iter_all iter_all;
if (ext4_bio_encrypted(bio)) {
if (bio->bi_status) {
@@ -81,7 +82,7 @@ static void mpage_end_io(struct bio *bio)
return;
}
}
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, i, iter_all) {
struct page *page = bv->bv_page;
if (!bio->bi_status) {
@@ -242,8 +243,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
if (bio == NULL) {
struct fscrypt_ctx *ctx = NULL;
- if (ext4_encrypted_inode(inode) &&
- S_ISREG(inode->i_mode)) {
+ if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) {
ctx = fscrypt_get_ctx(inode, GFP_NOFS);
if (IS_ERR(ctx))
goto set_error_page;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index fb12d3c17c1b..60da0a6e4d86 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1232,7 +1232,7 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
return try_to_free_buffers(page);
}
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
{
return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
@@ -1922,7 +1922,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
*journal_ioprio =
IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
} else if (token == Opt_test_dummy_encryption) {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION;
ext4_msg(sb, KERN_WARNING,
"Test dummy encryption mode enabled");
@@ -4167,7 +4167,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_op = &ext4_sops;
sb->s_export_op = &ext4_export_ops;
sb->s_xattr = ext4_xattr_handlers;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
sb->s_cop = &ext4_cryptops;
#endif
#ifdef CONFIG_QUOTA
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 9212a026a1f1..5e4e78fc0b3a 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -224,7 +224,7 @@ static struct attribute *ext4_attrs[] = {
EXT4_ATTR_FEATURE(lazy_itable_init);
EXT4_ATTR_FEATURE(batched_discard);
EXT4_ATTR_FEATURE(meta_bg_resize);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
EXT4_ATTR_FEATURE(encryption);
#endif
EXT4_ATTR_FEATURE(metadata_csum_seed);
@@ -233,7 +233,7 @@ static struct attribute *ext4_feat_attrs[] = {
ATTR_LIST(lazy_itable_init),
ATTR_LIST(batched_discard),
ATTR_LIST(meta_bg_resize),
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
ATTR_LIST(encryption),
#endif
ATTR_LIST(metadata_csum_seed),
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 9a20ef42fadd..e57cc754d543 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -3,6 +3,7 @@ config F2FS_FS
depends on BLOCK
select CRYPTO
select CRYPTO_CRC32
+ select F2FS_FS_XATTR if FS_ENCRYPTION
help
F2FS is based on Log-structured File System (LFS), which supports
versatile "flash-friendly" features. The design has been focused on
@@ -70,17 +71,6 @@ config F2FS_CHECK_FS
If you want to improve the performance, say N.
-config F2FS_FS_ENCRYPTION
- bool "F2FS Encryption"
- depends on F2FS_FS
- depends on F2FS_FS_XATTR
- select FS_ENCRYPTION
- help
- Enable encryption of f2fs files and directories. This
- feature is similar to ecryptfs, but it is more memory
- efficient since it avoids caching the encrypted and
- decrypted pages in the page cache.
-
config F2FS_IO_TRACE
bool "F2FS IO tracer"
depends on F2FS_FS
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index f91d8630c9a2..568e1d09eb48 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -87,8 +87,9 @@ static void __read_end_io(struct bio *bio)
struct page *page;
struct bio_vec *bv;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, i, iter_all) {
page = bv->bv_page;
/* PG_error was set if any post_read step failed */
@@ -164,13 +165,14 @@ static void f2fs_write_end_io(struct bio *bio)
struct f2fs_sb_info *sbi = bio->bi_private;
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
if (time_to_inject(sbi, FAULT_WRITE_IO)) {
f2fs_show_injection_info(FAULT_WRITE_IO);
bio->bi_status = BLK_STS_IOERR;
}
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
enum count_type type = WB_DATA_TYPE(page);
@@ -347,6 +349,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode,
struct bio_vec *bvec;
struct page *target;
int i;
+ struct bvec_iter_all iter_all;
if (!io->bio)
return false;
@@ -354,7 +357,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode,
if (!inode && !page && !ino)
return true;
- bio_for_each_segment_all(bvec, io->bio, i) {
+ bio_for_each_segment_all(bvec, io->bio, i, iter_all) {
if (bvec->bv_page->mapping)
target = bvec->bv_page;
@@ -1465,7 +1468,7 @@ next:
}
if (size) {
- if (f2fs_encrypted_inode(inode))
+ if (IS_ENCRYPTED(inode))
flags |= FIEMAP_EXTENT_DATA_ENCRYPTED;
ret = fiemap_fill_next_extent(fieinfo, logical,
@@ -1736,7 +1739,7 @@ static inline bool check_inplace_update_policy(struct inode *inode,
if (policy & (0x1 << F2FS_IPU_ASYNC) &&
fio && fio->op == REQ_OP_WRITE &&
!(fio->op_flags & REQ_SYNC) &&
- !f2fs_encrypted_inode(inode))
+ !IS_ENCRYPTED(inode))
return true;
/* this is only set during fdatasync */
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index ebcc121920ba..fd7f170e2f2d 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -506,30 +506,16 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
kvfree(si);
}
-int __init f2fs_create_root_stats(void)
+void __init f2fs_create_root_stats(void)
{
- struct dentry *file;
-
f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL);
- if (!f2fs_debugfs_root)
- return -ENOMEM;
- file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root,
- NULL, &stat_fops);
- if (!file) {
- debugfs_remove(f2fs_debugfs_root);
- f2fs_debugfs_root = NULL;
- return -ENOMEM;
- }
-
- return 0;
+ debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL,
+ &stat_fops);
}
void f2fs_destroy_root_stats(void)
{
- if (!f2fs_debugfs_root)
- return;
-
debugfs_remove_recursive(f2fs_debugfs_root);
f2fs_debugfs_root = NULL;
}
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 50d0d36280fa..713b36a10a79 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -385,7 +385,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
if (err)
goto put_error;
- if ((f2fs_encrypted_inode(dir) || dummy_encrypt) &&
+ if ((IS_ENCRYPTED(dir) || dummy_encrypt) &&
f2fs_may_encrypt(inode)) {
err = fscrypt_inherit_context(dir, inode, page, false);
if (err)
@@ -399,7 +399,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
if (new_name) {
init_dent_inode(new_name, page);
- if (f2fs_encrypted_inode(dir))
+ if (IS_ENCRYPTED(dir))
file_set_enc_name(inode);
}
@@ -819,7 +819,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
goto out;
}
- if (f2fs_encrypted_inode(d->inode)) {
+ if (IS_ENCRYPTED(d->inode)) {
int save_len = fstr->len;
err = fscrypt_fname_disk_to_usr(d->inode,
@@ -862,7 +862,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
int err = 0;
- if (f2fs_encrypted_inode(inode)) {
+ if (IS_ENCRYPTED(inode)) {
err = fscrypt_get_encryption_info(inode);
if (err && err != -ENOKEY)
goto out;
@@ -924,7 +924,7 @@ out:
static int f2fs_dir_open(struct inode *inode, struct file *filp)
{
- if (f2fs_encrypted_inode(inode))
+ if (IS_ENCRYPTED(inode))
return fscrypt_get_encryption_info(inode) ? -EACCES : 0;
return 0;
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 12fabd6735dd..7ea5c9cede37 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -24,7 +24,6 @@
#include <linux/quotaops.h>
#include <crypto/hash.h>
-#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_F2FS_FS_ENCRYPTION)
#include <linux/fscrypt.h>
#ifdef CONFIG_F2FS_CHECK_FS
@@ -1137,7 +1136,7 @@ enum fsync_mode {
FSYNC_MODE_NOBARRIER, /* fsync behaves nobarrier based on posix */
};
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
#define DUMMY_ENCRYPTION_ENABLED(sbi) \
(unlikely(F2FS_OPTION(sbi).test_dummy_encryption))
#else
@@ -3328,7 +3327,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
int f2fs_build_stats(struct f2fs_sb_info *sbi);
void f2fs_destroy_stats(struct f2fs_sb_info *sbi);
-int __init f2fs_create_root_stats(void);
+void __init f2fs_create_root_stats(void);
void f2fs_destroy_root_stats(void);
#else
#define stat_inc_cp_count(si) do { } while (0)
@@ -3366,7 +3365,7 @@ void f2fs_destroy_root_stats(void);
static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; }
static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { }
-static inline int __init f2fs_create_root_stats(void) { return 0; }
+static inline void __init f2fs_create_root_stats(void) { }
static inline void f2fs_destroy_root_stats(void) { }
#endif
@@ -3463,19 +3462,14 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi);
/*
* crypto support
*/
-static inline bool f2fs_encrypted_inode(struct inode *inode)
-{
- return file_is_encrypt(inode);
-}
-
static inline bool f2fs_encrypted_file(struct inode *inode)
{
- return f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode);
+ return IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode);
}
static inline void f2fs_set_encrypted_inode(struct inode *inode)
{
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
file_set_encrypt(inode);
f2fs_set_inode_flags(inode);
#endif
@@ -3554,7 +3548,7 @@ static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt)
static inline bool f2fs_may_encrypt(struct inode *inode)
{
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
umode_t mode = inode->i_mode;
return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode));
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index bba56b39dcc5..ba5954f41e14 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -582,7 +582,7 @@ truncate_out:
zero_user(page, offset, PAGE_SIZE - offset);
/* An encrypted inode should have a key and truncate the last page. */
- f2fs_bug_on(F2FS_I_SB(inode), cache_only && f2fs_encrypted_inode(inode));
+ f2fs_bug_on(F2FS_I_SB(inode), cache_only && IS_ENCRYPTED(inode));
if (!cache_only)
set_page_dirty(page);
f2fs_put_page(page, 1);
@@ -711,7 +711,7 @@ int f2fs_getattr(const struct path *path, struct kstat *stat,
stat->attributes |= STATX_ATTR_APPEND;
if (flags & F2FS_COMPR_FL)
stat->attributes |= STATX_ATTR_COMPRESSED;
- if (f2fs_encrypted_inode(inode))
+ if (IS_ENCRYPTED(inode))
stat->attributes |= STATX_ATTR_ENCRYPTED;
if (flags & F2FS_IMMUTABLE_FL)
stat->attributes |= STATX_ATTR_IMMUTABLE;
@@ -1563,7 +1563,7 @@ static long f2fs_fallocate(struct file *file, int mode,
if (!S_ISREG(inode->i_mode))
return -EINVAL;
- if (f2fs_encrypted_inode(inode) &&
+ if (IS_ENCRYPTED(inode) &&
(mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
return -EOPNOTSUPP;
@@ -1647,7 +1647,7 @@ static int f2fs_ioc_getflags(struct file *filp, unsigned long arg)
struct f2fs_inode_info *fi = F2FS_I(inode);
unsigned int flags = fi->i_flags;
- if (f2fs_encrypted_inode(inode))
+ if (IS_ENCRYPTED(inode))
flags |= F2FS_ENCRYPT_FL;
if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode))
flags |= F2FS_INLINE_DATA_FL;
@@ -2414,7 +2414,7 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
if (!S_ISREG(src->i_mode) || !S_ISREG(dst->i_mode))
return -EINVAL;
- if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst))
+ if (IS_ENCRYPTED(src) || IS_ENCRYPTED(dst))
return -EOPNOTSUPP;
if (src == dst) {
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index bec52961630b..d910a820ae67 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -43,7 +43,7 @@ void f2fs_set_inode_flags(struct inode *inode)
new_fl |= S_NOATIME;
if (flags & F2FS_DIRSYNC_FL)
new_fl |= S_DIRSYNC;
- if (f2fs_encrypted_inode(inode))
+ if (file_is_encrypt(inode))
new_fl |= S_ENCRYPTED;
inode_set_flags(inode, new_fl,
S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|
@@ -453,7 +453,7 @@ make_now:
inode->i_mapping->a_ops = &f2fs_dblock_aops;
inode_nohighmem(inode);
} else if (S_ISLNK(inode->i_mode)) {
- if (f2fs_encrypted_inode(inode))
+ if (file_is_encrypt(inode))
inode->i_op = &f2fs_encrypted_symlink_inode_operations;
else
inode->i_op = &f2fs_symlink_inode_operations;
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 62d9829f3a6a..e967d27c1a89 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -75,7 +75,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
set_inode_flag(inode, FI_NEW_INODE);
/* If the directory encrypted, then we should encrypt the inode. */
- if ((f2fs_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
+ if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
f2fs_may_encrypt(inode))
f2fs_set_encrypted_inode(inode);
@@ -476,7 +476,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
if (err)
goto out_iput;
}
- if (f2fs_encrypted_inode(dir) &&
+ if (IS_ENCRYPTED(dir) &&
(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
!fscrypt_has_permitted_context(dir, inode)) {
f2fs_msg(inode->i_sb, KERN_WARNING,
@@ -803,7 +803,7 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
if (unlikely(f2fs_cp_error(sbi)))
return -EIO;
- if (f2fs_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) {
+ if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) {
int err = fscrypt_get_encryption_info(dir);
if (err)
return err;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index c46a1d4318d4..d1ccc52afc93 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -757,7 +757,7 @@ static int parse_options(struct super_block *sb, char *options)
kvfree(name);
break;
case Opt_test_dummy_encryption:
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
if (!f2fs_sb_has_encrypt(sbi)) {
f2fs_msg(sb, KERN_ERR, "Encrypt feature is off");
return -EINVAL;
@@ -1390,7 +1390,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",whint_mode=%s", "user-based");
else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS)
seq_printf(seq, ",whint_mode=%s", "fs-based");
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
if (F2FS_OPTION(sbi).test_dummy_encryption)
seq_puts(seq, ",test_dummy_encryption");
#endif
@@ -2154,7 +2154,7 @@ static const struct super_operations f2fs_sops = {
.remount_fs = f2fs_remount,
};
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
static int f2fs_get_context(struct inode *inode, void *ctx, size_t len)
{
return f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
@@ -3116,7 +3116,7 @@ try_onemore:
#endif
sb->s_op = &f2fs_sops;
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
sb->s_cop = &f2fs_cryptops;
#endif
sb->s_xattr = f2fs_xattr_handlers;
@@ -3545,9 +3545,7 @@ static int __init init_f2fs_fs(void)
err = register_filesystem(&f2fs_fs_type);
if (err)
goto free_shrinker;
- err = f2fs_create_root_stats();
- if (err)
- goto free_filesystem;
+ f2fs_create_root_stats();
err = f2fs_init_post_read_processing();
if (err)
goto free_root_stats;
@@ -3555,7 +3553,6 @@ static int __init init_f2fs_fs(void)
free_root_stats:
f2fs_destroy_root_stats();
-free_filesystem:
unregister_filesystem(&f2fs_fs_type);
free_shrinker:
unregister_shrinker(&f2fs_shrinker_info);
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 0575edbe3ed6..70da6801c86f 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -431,7 +431,7 @@ F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes);
F2FS_GENERAL_RO_ATTR(features);
F2FS_GENERAL_RO_ATTR(current_reserved_blocks);
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO);
#endif
#ifdef CONFIG_BLK_DEV_ZONED
@@ -492,7 +492,7 @@ static struct attribute *f2fs_attrs[] = {
};
static struct attribute *f2fs_feat_attrs[] = {
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
ATTR_LIST(encryption),
#endif
#ifdef CONFIG_BLK_DEV_ZONED
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 13935ee99e1e..b3bed32946b1 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -214,6 +214,7 @@ const struct file_operations fat_file_operations = {
#endif
.fsync = fat_file_fsync,
.splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
.fallocate = fat_fallocate,
};
diff --git a/fs/file.c b/fs/file.c
index 3209ee271c41..3da91a112bab 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -457,6 +457,7 @@ struct files_struct init_files = {
.full_fds_bits = init_files.full_fds_bits_init,
},
.file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
+ .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
};
static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
@@ -705,7 +706,7 @@ void do_close_on_exec(struct files_struct *files)
spin_unlock(&files->file_lock);
}
-static struct file *__fget(unsigned int fd, fmode_t mask)
+static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs)
{
struct files_struct *files = current->files;
struct file *file;
@@ -720,7 +721,7 @@ loop:
*/
if (file->f_mode & mask)
file = NULL;
- else if (!get_file_rcu(file))
+ else if (!get_file_rcu_many(file, refs))
goto loop;
}
rcu_read_unlock();
@@ -728,15 +729,20 @@ loop:
return file;
}
+struct file *fget_many(unsigned int fd, unsigned int refs)
+{
+ return __fget(fd, FMODE_PATH, refs);
+}
+
struct file *fget(unsigned int fd)
{
- return __fget(fd, FMODE_PATH);
+ return __fget(fd, FMODE_PATH, 1);
}
EXPORT_SYMBOL(fget);
struct file *fget_raw(unsigned int fd)
{
- return __fget(fd, 0);
+ return __fget(fd, 0, 1);
}
EXPORT_SYMBOL(fget_raw);
@@ -767,7 +773,7 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask)
return 0;
return (unsigned long)file;
} else {
- file = __fget(fd, mask);
+ file = __fget(fd, mask, 1);
if (!file)
return 0;
return FDPUT_FPUT | (unsigned long)file;
diff --git a/fs/file_table.c b/fs/file_table.c
index 5679e7fcb6b0..155d7514a094 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -326,9 +326,9 @@ void flush_delayed_fput(void)
static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
-void fput(struct file *file)
+void fput_many(struct file *file, unsigned int refs)
{
- if (atomic_long_dec_and_test(&file->f_count)) {
+ if (atomic_long_sub_and_test(refs, &file->f_count)) {
struct task_struct *task = current;
if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
@@ -347,6 +347,11 @@ void fput(struct file *file)
}
}
+void fput(struct file *file)
+{
+ fput_many(file, 1);
+}
+
/*
* synchronous analog of fput(); for kernel threads that might be needed
* in some umount() (and thus can't use flush_delayed_fput() without
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index b40168fcc94a..36855c1f8daf 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -331,11 +331,22 @@ struct inode_switch_wbs_context {
struct work_struct work;
};
+static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
+{
+ down_write(&bdi->wb_switch_rwsem);
+}
+
+static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
+{
+ up_write(&bdi->wb_switch_rwsem);
+}
+
static void inode_switch_wbs_work_fn(struct work_struct *work)
{
struct inode_switch_wbs_context *isw =
container_of(work, struct inode_switch_wbs_context, work);
struct inode *inode = isw->inode;
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
struct address_space *mapping = inode->i_mapping;
struct bdi_writeback *old_wb = inode->i_wb;
struct bdi_writeback *new_wb = isw->new_wb;
@@ -344,6 +355,12 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
bool switched = false;
/*
+ * If @inode switches cgwb membership while sync_inodes_sb() is
+ * being issued, sync_inodes_sb() might miss it. Synchronize.
+ */
+ down_read(&bdi->wb_switch_rwsem);
+
+ /*
* By the time control reaches here, RCU grace period has passed
* since I_WB_SWITCH assertion and all wb stat update transactions
* between unlocked_inode_to_wb_begin/end() are guaranteed to be
@@ -428,6 +445,8 @@ skip_switch:
spin_unlock(&new_wb->list_lock);
spin_unlock(&old_wb->list_lock);
+ up_read(&bdi->wb_switch_rwsem);
+
if (switched) {
wb_wakeup(new_wb);
wb_put(old_wb);
@@ -468,9 +487,18 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
if (inode->i_state & I_WB_SWITCH)
return;
+ /*
+ * Avoid starting new switches while sync_inodes_sb() is in
+ * progress. Otherwise, if the down_write protected issue path
+ * blocks heavily, we might end up starting a large number of
+ * switches which will block on the rwsem.
+ */
+ if (!down_read_trylock(&bdi->wb_switch_rwsem))
+ return;
+
isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
if (!isw)
- return;
+ goto out_unlock;
/* find and pin the new wb */
rcu_read_lock();
@@ -504,12 +532,14 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
* Let's continue after I_WB_SWITCH is guaranteed to be visible.
*/
call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
- return;
+ goto out_unlock;
out_free:
if (isw->new_wb)
wb_put(isw->new_wb);
kfree(isw);
+out_unlock:
+ up_read(&bdi->wb_switch_rwsem);
}
/**
@@ -887,6 +917,9 @@ fs_initcall(cgroup_writeback_init);
#else /* CONFIG_CGROUP_WRITEBACK */
+static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
+static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
+
static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
__releases(&inode->i_lock)
@@ -2413,8 +2446,11 @@ void sync_inodes_sb(struct super_block *sb)
return;
WARN_ON(!rwsem_is_locked(&sb->s_umount));
+ /* protect against inode wb switch, see inode_switch_wbs_work_fn() */
+ bdi_down_write_wb_switch_rwsem(bdi);
bdi_split_work_to_wbs(bdi, &work, false);
wb_wait_for_completion(bdi, &done);
+ bdi_up_write_wb_switch_rwsem(bdi);
wait_sb_inodes(sb);
}
diff --git a/fs/fs_types.c b/fs/fs_types.c
new file mode 100644
index 000000000000..78365e5dc08c
--- /dev/null
+++ b/fs/fs_types.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fs.h>
+#include <linux/export.h>
+
+/*
+ * fs on-disk file type to dirent file type conversion
+ */
+static const unsigned char fs_dtype_by_ftype[FT_MAX] = {
+ [FT_UNKNOWN] = DT_UNKNOWN,
+ [FT_REG_FILE] = DT_REG,
+ [FT_DIR] = DT_DIR,
+ [FT_CHRDEV] = DT_CHR,
+ [FT_BLKDEV] = DT_BLK,
+ [FT_FIFO] = DT_FIFO,
+ [FT_SOCK] = DT_SOCK,
+ [FT_SYMLINK] = DT_LNK
+};
+
+/**
+ * fs_ftype_to_dtype() - fs on-disk file type to dirent type.
+ * @filetype: The on-disk file type to convert.
+ *
+ * This function converts the on-disk file type value (FT_*) to the directory
+ * entry type (DT_*).
+ *
+ * Context: Any context.
+ * Return:
+ * * DT_UNKNOWN - Unknown type
+ * * DT_FIFO - FIFO
+ * * DT_CHR - Character device
+ * * DT_DIR - Directory
+ * * DT_BLK - Block device
+ * * DT_REG - Regular file
+ * * DT_LNK - Symbolic link
+ * * DT_SOCK - Local-domain socket
+ */
+unsigned char fs_ftype_to_dtype(unsigned int filetype)
+{
+ if (filetype >= FT_MAX)
+ return DT_UNKNOWN;
+
+ return fs_dtype_by_ftype[filetype];
+}
+EXPORT_SYMBOL_GPL(fs_ftype_to_dtype);
+
+/*
+ * dirent file type to fs on-disk file type conversion
+ * Values not initialized explicitly are FT_UNKNOWN (0).
+ */
+static const unsigned char fs_ftype_by_dtype[DT_MAX] = {
+ [DT_REG] = FT_REG_FILE,
+ [DT_DIR] = FT_DIR,
+ [DT_LNK] = FT_SYMLINK,
+ [DT_CHR] = FT_CHRDEV,
+ [DT_BLK] = FT_BLKDEV,
+ [DT_FIFO] = FT_FIFO,
+ [DT_SOCK] = FT_SOCK,
+};
+
+/**
+ * fs_umode_to_ftype() - file mode to on-disk file type.
+ * @mode: The file mode to convert.
+ *
+ * This function converts the file mode value to the on-disk file type (FT_*).
+ *
+ * Context: Any context.
+ * Return:
+ * * FT_UNKNOWN - Unknown type
+ * * FT_REG_FILE - Regular file
+ * * FT_DIR - Directory
+ * * FT_CHRDEV - Character device
+ * * FT_BLKDEV - Block device
+ * * FT_FIFO - FIFO
+ * * FT_SOCK - Local-domain socket
+ * * FT_SYMLINK - Symbolic link
+ */
+unsigned char fs_umode_to_ftype(umode_t mode)
+{
+ return fs_ftype_by_dtype[S_DT(mode)];
+}
+EXPORT_SYMBOL_GPL(fs_umode_to_ftype);
+
+/**
+ * fs_umode_to_dtype() - file mode to dirent file type.
+ * @mode: The file mode to convert.
+ *
+ * This function converts the file mode value to the directory
+ * entry type (DT_*).
+ *
+ * Context: Any context.
+ * Return:
+ * * DT_UNKNOWN - Unknown type
+ * * DT_FIFO - FIFO
+ * * DT_CHR - Character device
+ * * DT_DIR - Directory
+ * * DT_BLK - Block device
+ * * DT_REG - Regular file
+ * * DT_LNK - Symbolic link
+ * * DT_SOCK - Local-domain socket
+ */
+unsigned char fs_umode_to_dtype(umode_t mode)
+{
+ return fs_ftype_to_dtype(fs_umode_to_ftype(mode));
+}
+EXPORT_SYMBOL_GPL(fs_umode_to_dtype);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index a5e516a40e7a..809c0f2f9942 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1742,7 +1742,6 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
req->in.h.nodeid = outarg->nodeid;
req->in.numargs = 2;
req->in.argpages = 1;
- req->page_descs[0].offset = offset;
req->end = fuse_retrieve_end;
index = outarg->offset >> PAGE_SHIFT;
@@ -1757,6 +1756,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
this_num = min_t(unsigned, num, PAGE_SIZE - offset);
req->pages[req->num_pages] = page;
+ req->page_descs[req->num_pages].offset = offset;
req->page_descs[req->num_pages].length = this_num;
req->num_pages++;
@@ -2077,8 +2077,10 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
ret = fuse_dev_do_write(fud, &cs, len);
+ pipe_lock(pipe);
for (idx = 0; idx < nbuf; idx++)
pipe_buf_release(pipe, &bufs[idx]);
+ pipe_unlock(pipe);
out:
kvfree(bufs);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index ffaffe18352a..a59c16bd90ac 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1782,7 +1782,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
spin_unlock(&fc->lock);
dec_wb_stat(&bdi->wb, WB_WRITEBACK);
- dec_node_page_state(page, NR_WRITEBACK_TEMP);
+ dec_node_page_state(new_req->pages[0], NR_WRITEBACK_TEMP);
wb_writeout_inc(&bdi->wb);
fuse_writepage_free(fc, new_req);
fuse_request_free(new_req);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 76baaa6be393..c2d4099429be 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -628,6 +628,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns)
get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
fc->pid_ns = get_pid_ns(task_active_pid_ns(current));
fc->user_ns = get_user_ns(user_ns);
+ fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ;
}
EXPORT_SYMBOL_GPL(fuse_conn_init);
@@ -1162,7 +1163,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
fc->user_id = d.user_id;
fc->group_id = d.group_id;
fc->max_read = max_t(unsigned, 4096, d.max_read);
- fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ;
/* Used by get_root_inode() */
sb->s_fs_info = fc;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index a2dea5bc0427..58a768e59712 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1280,6 +1280,7 @@ const struct file_operations gfs2_file_fops = {
.llseek = gfs2_llseek,
.read_iter = gfs2_file_read_iter,
.write_iter = gfs2_file_write_iter,
+ .iopoll = iomap_dio_iopoll,
.unlocked_ioctl = gfs2_ioctl,
.mmap = gfs2_mmap,
.open = gfs2_open,
@@ -1310,6 +1311,7 @@ const struct file_operations gfs2_file_fops_nolock = {
.llseek = gfs2_llseek,
.read_iter = gfs2_file_read_iter,
.write_iter = gfs2_file_write_iter,
+ .iopoll = iomap_dio_iopoll,
.unlocked_ioctl = gfs2_ioctl,
.mmap = gfs2_mmap,
.open = gfs2_open,
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index b92740edc416..d32964cd1117 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -107,7 +107,7 @@ static int glock_wake_function(wait_queue_entry_t *wait, unsigned int mode,
static wait_queue_head_t *glock_waitqueue(struct lm_lockname *name)
{
- u32 hash = jhash2((u32 *)name, sizeof(*name) / 4, 0);
+ u32 hash = jhash2((u32 *)name, ht_parms.key_len / 4, 0);
return glock_wait_table + hash_32(hash, GLOCK_WAIT_TABLE_BITS);
}
@@ -2131,71 +2131,29 @@ static const struct file_operations gfs2_sbstats_fops = {
.release = seq_release,
};
-int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
-{
- struct dentry *dent;
-
- dent = debugfs_create_dir(sdp->sd_table_name, gfs2_root);
- if (IS_ERR_OR_NULL(dent))
- goto fail;
- sdp->debugfs_dir = dent;
-
- dent = debugfs_create_file("glocks",
- S_IFREG | S_IRUGO,
- sdp->debugfs_dir, sdp,
- &gfs2_glocks_fops);
- if (IS_ERR_OR_NULL(dent))
- goto fail;
- sdp->debugfs_dentry_glocks = dent;
-
- dent = debugfs_create_file("glstats",
- S_IFREG | S_IRUGO,
- sdp->debugfs_dir, sdp,
- &gfs2_glstats_fops);
- if (IS_ERR_OR_NULL(dent))
- goto fail;
- sdp->debugfs_dentry_glstats = dent;
-
- dent = debugfs_create_file("sbstats",
- S_IFREG | S_IRUGO,
- sdp->debugfs_dir, sdp,
- &gfs2_sbstats_fops);
- if (IS_ERR_OR_NULL(dent))
- goto fail;
- sdp->debugfs_dentry_sbstats = dent;
+void gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
+{
+ sdp->debugfs_dir = debugfs_create_dir(sdp->sd_table_name, gfs2_root);
- return 0;
-fail:
- gfs2_delete_debugfs_file(sdp);
- return dent ? PTR_ERR(dent) : -ENOMEM;
+ debugfs_create_file("glocks", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp,
+ &gfs2_glocks_fops);
+
+ debugfs_create_file("glstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp,
+ &gfs2_glstats_fops);
+
+ debugfs_create_file("sbstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp,
+ &gfs2_sbstats_fops);
}
void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp)
{
- if (sdp->debugfs_dir) {
- if (sdp->debugfs_dentry_glocks) {
- debugfs_remove(sdp->debugfs_dentry_glocks);
- sdp->debugfs_dentry_glocks = NULL;
- }
- if (sdp->debugfs_dentry_glstats) {
- debugfs_remove(sdp->debugfs_dentry_glstats);
- sdp->debugfs_dentry_glstats = NULL;
- }
- if (sdp->debugfs_dentry_sbstats) {
- debugfs_remove(sdp->debugfs_dentry_sbstats);
- sdp->debugfs_dentry_sbstats = NULL;
- }
- debugfs_remove(sdp->debugfs_dir);
- sdp->debugfs_dir = NULL;
- }
+ debugfs_remove_recursive(sdp->debugfs_dir);
+ sdp->debugfs_dir = NULL;
}
-int gfs2_register_debugfs(void)
+void gfs2_register_debugfs(void)
{
gfs2_root = debugfs_create_dir("gfs2", NULL);
- if (IS_ERR(gfs2_root))
- return PTR_ERR(gfs2_root);
- return gfs2_root ? 0 : -ENOMEM;
}
void gfs2_unregister_debugfs(void)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 8949bf28b249..936b3295839c 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -243,9 +243,9 @@ extern void gfs2_glock_free(struct gfs2_glock *gl);
extern int __init gfs2_glock_init(void);
extern void gfs2_glock_exit(void);
-extern int gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
+extern void gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
-extern int gfs2_register_debugfs(void);
+extern void gfs2_register_debugfs(void);
extern void gfs2_unregister_debugfs(void);
extern const struct lm_lockops gfs2_dlm_ops;
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index f15b4c57c4bd..78510ab91835 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -28,7 +28,6 @@
#include "util.h"
#include "trans.h"
#include "dir.h"
-#include "lops.h"
struct workqueue_struct *gfs2_freeze_wq;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index e10e0b0a7cd5..cdf07b408f54 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -853,9 +853,6 @@ struct gfs2_sbd {
unsigned long sd_last_warning;
struct dentry *debugfs_dir; /* debugfs directory */
- struct dentry *debugfs_dentry_glocks;
- struct dentry *debugfs_dentry_glstats;
- struct dentry *debugfs_dentry_sbstats;
};
static inline void gfs2_glstats_inc(struct gfs2_glock *gl, int which)
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 793808263c6d..18d4af7417fa 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -59,8 +59,8 @@ static inline u64 gfs2_get_inode_blocks(const struct inode *inode)
static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change)
{
- gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks > -change));
- change *= (GFS2_SB(inode)->sd_sb.sb_bsize/GFS2_BASIC_BLOCK);
+ change <<= inode->i_blkbits - GFS2_BASIC_BLOCK_SHIFT;
+ gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks >= -change));
inode->i_blocks += change;
}
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 5bfaf381921a..b8830fda51e8 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -733,7 +733,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
lh->lh_crc = cpu_to_be32(crc);
gfs2_log_write(sdp, page, sb->s_blocksize, 0, addr);
- gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE | op_flags);
+ gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE, op_flags);
log_flush_wait(sdp);
}
@@ -810,7 +810,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
gfs2_ordered_write(sdp);
lops_before_commit(sdp, tr);
- gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE);
+ gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE, 0);
if (sdp->sd_log_head != sdp->sd_log_flush_head) {
log_flush_wait(sdp);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 94dcab655bc0..8722c60b11fe 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -17,9 +17,7 @@
#include <linux/bio.h>
#include <linux/fs.h>
#include <linux/list_sort.h>
-#include <linux/blkdev.h>
-#include "bmap.h"
#include "dir.h"
#include "gfs2.h"
#include "incore.h"
@@ -170,7 +168,8 @@ u64 gfs2_log_bmap(struct gfs2_sbd *sdp)
* that is pinned in the pagecache.
*/
-static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec,
+static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp,
+ struct bio_vec *bvec,
blk_status_t error)
{
struct buffer_head *bh, *next;
@@ -195,6 +194,7 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec,
/**
* gfs2_end_log_write - end of i/o to the log
* @bio: The bio
+ * @error: Status of i/o request
*
* Each bio_vec contains either data from the pagecache or data
* relating to the log itself. Here we iterate over the bio_vec
@@ -208,6 +208,7 @@ static void gfs2_end_log_write(struct bio *bio)
struct bio_vec *bvec;
struct page *page;
int i;
+ struct bvec_iter_all iter_all;
if (bio->bi_status) {
fs_err(sdp, "Error %d writing to journal, jid=%u\n",
@@ -215,7 +216,7 @@ static void gfs2_end_log_write(struct bio *bio)
wake_up(&sdp->sd_logd_waitq);
}
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
page = bvec->bv_page;
if (page_has_buffers(page))
gfs2_end_log_write_bh(sdp, bvec, bio->bi_status);
@@ -231,19 +232,20 @@ static void gfs2_end_log_write(struct bio *bio)
/**
* gfs2_log_submit_bio - Submit any pending log bio
* @biop: Address of the bio pointer
- * @opf: REQ_OP | op_flags
+ * @op: REQ_OP
+ * @op_flags: req_flag_bits
*
* Submit any pending part-built or full bio to the block device. If
* there is no pending bio, then this is a no-op.
*/
-void gfs2_log_submit_bio(struct bio **biop, int opf)
+void gfs2_log_submit_bio(struct bio **biop, int op, int op_flags)
{
struct bio *bio = *biop;
if (bio) {
struct gfs2_sbd *sdp = bio->bi_private;
atomic_inc(&sdp->sd_log_in_flight);
- bio->bi_opf = opf;
+ bio_set_op_attrs(bio, op, op_flags);
submit_bio(bio);
*biop = NULL;
}
@@ -304,7 +306,7 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno,
nblk >>= sdp->sd_fsb2bb_shift;
if (blkno == nblk && !flush)
return bio;
- gfs2_log_submit_bio(biop, op);
+ gfs2_log_submit_bio(biop, op, 0);
}
*biop = gfs2_log_alloc_bio(sdp, blkno, end_io);
@@ -375,184 +377,6 @@ void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page)
gfs2_log_bmap(sdp));
}
-/**
- * gfs2_end_log_read - end I/O callback for reads from the log
- * @bio: The bio
- *
- * Simply unlock the pages in the bio. The main thread will wait on them and
- * process them in order as necessary.
- */
-
-static void gfs2_end_log_read(struct bio *bio)
-{
- struct page *page;
- struct bio_vec *bvec;
- int i;
-
- bio_for_each_segment_all(bvec, bio, i) {
- page = bvec->bv_page;
- if (bio->bi_status) {
- int err = blk_status_to_errno(bio->bi_status);
-
- SetPageError(page);
- mapping_set_error(page->mapping, err);
- }
- unlock_page(page);
- }
-
- bio_put(bio);
-}
-
-/**
- * gfs2_jhead_pg_srch - Look for the journal head in a given page.
- * @jd: The journal descriptor
- * @page: The page to look in
- *
- * Returns: 1 if found, 0 otherwise.
- */
-
-static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd,
- struct gfs2_log_header_host *head,
- struct page *page)
-{
- struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
- struct gfs2_log_header_host uninitialized_var(lh);
- void *kaddr = kmap_atomic(page);
- unsigned int offset;
- bool ret = false;
-
- for (offset = 0; offset < PAGE_SIZE; offset += sdp->sd_sb.sb_bsize) {
- if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) {
- if (lh.lh_sequence > head->lh_sequence)
- *head = lh;
- else {
- ret = true;
- break;
- }
- }
- }
- kunmap_atomic(kaddr);
- return ret;
-}
-
-/**
- * gfs2_jhead_process_page - Search/cleanup a page
- * @jd: The journal descriptor
- * @index: Index of the page to look into
- * @done: If set, perform only cleanup, else search and set if found.
- *
- * Find the page with 'index' in the journal's mapping. Search the page for
- * the journal head if requested (cleanup == false). Release refs on the
- * page so the page cache can reclaim it (put_page() twice). We grabbed a
- * reference on this page two times, first when we did a find_or_create_page()
- * to obtain the page to add it to the bio and second when we do a
- * find_get_page() here to get the page to wait on while I/O on it is being
- * completed.
- * This function is also used to free up a page we might've grabbed but not
- * used. Maybe we added it to a bio, but not submitted it for I/O. Or we
- * submitted the I/O, but we already found the jhead so we only need to drop
- * our references to the page.
- */
-
-static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index,
- struct gfs2_log_header_host *head,
- bool *done)
-{
- struct page *page;
-
- page = find_get_page(jd->jd_inode->i_mapping, index);
- wait_on_page_locked(page);
-
- if (PageError(page))
- *done = true;
-
- if (!*done)
- *done = gfs2_jhead_pg_srch(jd, head, page);
-
- put_page(page); /* Once for find_get_page */
- put_page(page); /* Once more for find_or_create_page */
-}
-
-/**
- * gfs2_find_jhead - find the head of a log
- * @jd: The journal descriptor
- * @head: The log descriptor for the head of the log is returned here
- *
- * Do a search of a journal by reading it in large chunks using bios and find
- * the valid log entry with the highest sequence number. (i.e. the log head)
- *
- * Returns: 0 on success, errno otherwise
- */
-
-int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head)
-{
- struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
- struct address_space *mapping = jd->jd_inode->i_mapping;
- struct gfs2_journal_extent *je;
- u32 block, read_idx = 0, submit_idx = 0, index = 0;
- int shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift;
- int blocks_per_page = 1 << shift, sz, ret = 0;
- struct bio *bio = NULL;
- struct page *page;
- bool done = false;
- errseq_t since;
-
- memset(head, 0, sizeof(*head));
- if (list_empty(&jd->extent_list))
- gfs2_map_journal_extents(sdp, jd);
-
- since = filemap_sample_wb_err(mapping);
- list_for_each_entry(je, &jd->extent_list, list) {
- for (block = 0; block < je->blocks; block += blocks_per_page) {
- index = (je->lblock + block) >> shift;
-
- page = find_or_create_page(mapping, index, GFP_NOFS);
- if (!page) {
- ret = -ENOMEM;
- done = true;
- goto out;
- }
-
- if (bio) {
- sz = bio_add_page(bio, page, PAGE_SIZE, 0);
- if (sz == PAGE_SIZE)
- goto page_added;
- submit_idx = index;
- submit_bio(bio);
- bio = NULL;
- }
-
- bio = gfs2_log_alloc_bio(sdp,
- je->dblock + (index << shift),
- gfs2_end_log_read);
- bio->bi_opf = REQ_OP_READ;
- sz = bio_add_page(bio, page, PAGE_SIZE, 0);
- gfs2_assert_warn(sdp, sz == PAGE_SIZE);
-
-page_added:
- if (submit_idx <= read_idx + BIO_MAX_PAGES) {
- /* Keep at least one bio in flight */
- continue;
- }
-
- gfs2_jhead_process_page(jd, read_idx++, head, &done);
- if (done)
- goto out; /* found */
- }
- }
-
-out:
- if (bio)
- submit_bio(bio);
- while (read_idx <= index)
- gfs2_jhead_process_page(jd, read_idx++, head, &done);
-
- if (!ret)
- ret = filemap_check_wb_err(mapping, since);
-
- return ret;
-}
-
static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type,
u32 ld_length, u32 ld_data1)
{
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 331160fc568b..711c4d89c063 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -30,10 +30,8 @@ extern u64 gfs2_log_bmap(struct gfs2_sbd *sdp);
extern void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page,
unsigned size, unsigned offset, u64 blkno);
extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page);
-extern void gfs2_log_submit_bio(struct bio **biop, int opf);
+extern void gfs2_log_submit_bio(struct bio **biop, int op, int op_flags);
extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
-extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
- struct gfs2_log_header_host *head);
static inline unsigned int buf_limit(struct gfs2_sbd *sdp)
{
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index c7603063f861..136484ef35d3 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -178,16 +178,12 @@ static int __init init_gfs2_fs(void)
if (!gfs2_page_pool)
goto fail_mempool;
- error = gfs2_register_debugfs();
- if (error)
- goto fail_debugfs;
+ gfs2_register_debugfs();
pr_info("GFS2 installed\n");
return 0;
-fail_debugfs:
- mempool_destroy(gfs2_page_pool);
fail_mempool:
destroy_workqueue(gfs2_freeze_wq);
fail_wq3:
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index be9c0bf697fe..3201342404a7 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -190,8 +190,9 @@ static void gfs2_meta_read_endio(struct bio *bio)
{
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
struct buffer_head *bh = page_buffers(page);
unsigned int len = bvec->bv_len;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1179763f6370..b041cb8ae383 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -41,7 +41,6 @@
#include "dir.h"
#include "meta_io.h"
#include "trace_gfs2.h"
-#include "lops.h"
#define DO 0
#define UNDO 1
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 7389e445a7a7..2dac43065382 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -182,6 +182,129 @@ static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
}
/**
+ * find_good_lh - find a good log header
+ * @jd: the journal
+ * @blk: the segment to start searching from
+ * @lh: the log header to fill in
+ * @forward: if true search forward in the log, else search backward
+ *
+ * Call get_log_header() to get a log header for a segment, but if the
+ * segment is bad, either scan forward or backward until we find a good one.
+ *
+ * Returns: errno
+ */
+
+static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk,
+ struct gfs2_log_header_host *head)
+{
+ unsigned int orig_blk = *blk;
+ int error;
+
+ for (;;) {
+ error = get_log_header(jd, *blk, head);
+ if (error <= 0)
+ return error;
+
+ if (++*blk == jd->jd_blocks)
+ *blk = 0;
+
+ if (*blk == orig_blk) {
+ gfs2_consist_inode(GFS2_I(jd->jd_inode));
+ return -EIO;
+ }
+ }
+}
+
+/**
+ * jhead_scan - make sure we've found the head of the log
+ * @jd: the journal
+ * @head: this is filled in with the log descriptor of the head
+ *
+ * At this point, seg and lh should be either the head of the log or just
+ * before. Scan forward until we find the head.
+ *
+ * Returns: errno
+ */
+
+static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head)
+{
+ unsigned int blk = head->lh_blkno;
+ struct gfs2_log_header_host lh;
+ int error;
+
+ for (;;) {
+ if (++blk == jd->jd_blocks)
+ blk = 0;
+
+ error = get_log_header(jd, blk, &lh);
+ if (error < 0)
+ return error;
+ if (error == 1)
+ continue;
+
+ if (lh.lh_sequence == head->lh_sequence) {
+ gfs2_consist_inode(GFS2_I(jd->jd_inode));
+ return -EIO;
+ }
+ if (lh.lh_sequence < head->lh_sequence)
+ break;
+
+ *head = lh;
+ }
+
+ return 0;
+}
+
+/**
+ * gfs2_find_jhead - find the head of a log
+ * @jd: the journal
+ * @head: the log descriptor for the head of the log is returned here
+ *
+ * Do a binary search of a journal and find the valid log entry with the
+ * highest sequence number. (i.e. the log head)
+ *
+ * Returns: errno
+ */
+
+int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head)
+{
+ struct gfs2_log_header_host lh_1, lh_m;
+ u32 blk_1, blk_2, blk_m;
+ int error;
+
+ blk_1 = 0;
+ blk_2 = jd->jd_blocks - 1;
+
+ for (;;) {
+ blk_m = (blk_1 + blk_2) / 2;
+
+ error = find_good_lh(jd, &blk_1, &lh_1);
+ if (error)
+ return error;
+
+ error = find_good_lh(jd, &blk_m, &lh_m);
+ if (error)
+ return error;
+
+ if (blk_1 == blk_m || blk_m == blk_2)
+ break;
+
+ if (lh_1.lh_sequence <= lh_m.lh_sequence)
+ blk_1 = blk_m;
+ else
+ blk_2 = blk_m;
+ }
+
+ error = jhead_scan(jd, &lh_1);
+ if (error)
+ return error;
+
+ *head = lh_1;
+
+ return error;
+}
+
+/**
* foreach_descriptor - go through the active part of the log
* @jd: the journal
* @start: the first log header in the active region
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index 99575ab81202..11d81248be85 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -27,6 +27,8 @@ extern int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where)
extern int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
extern void gfs2_revoke_clean(struct gfs2_jdesc *jd);
+extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
+ struct gfs2_log_header_host *head);
extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait);
extern void gfs2_recover_func(struct work_struct *work);
extern int __get_log_header(struct gfs2_sbd *sdp,
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 831d7cb5a49c..17a8d3b43990 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1780,9 +1780,9 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
goto next_iter;
}
if (ret == -E2BIG) {
- n += rbm->bii - initial_bii;
rbm->bii = 0;
rbm->offset = 0;
+ n += (rbm->bii - initial_bii);
goto res_covered_end_of_rgrp;
}
return ret;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index d4b11c903971..ca71163ff7cf 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -45,7 +45,6 @@
#include "util.h"
#include "sys.h"
#include "xattr.h"
-#include "lops.h"
#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a2fcea5f8225..b0eef008de67 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -383,16 +383,17 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
* truncation is indicated by end of range being LLONG_MAX
* In this case, we first scan the range and release found pages.
* After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
- * maps and global counts.
+ * maps and global counts. Page faults can not race with truncation
+ * in this routine. hugetlb_no_page() prevents page faults in the
+ * truncated range. It checks i_size before allocation, and again after
+ * with the page table lock for the page held. The same lock must be
+ * acquired to unmap a page.
* hole punch is indicated if end is not LLONG_MAX
* In the hole punch case we scan the range and release found pages.
* Only when releasing a page is the associated region/reserv map
* deleted. The region/reserv map for ranges without associated
- * pages are not modified.
- *
- * Callers of this routine must hold the i_mmap_rwsem in write mode to prevent
- * races with page faults.
- *
+ * pages are not modified. Page faults can race with hole punch.
+ * This is indicated if we find a mapped page.
* Note: If the passed end of range value is beyond the end of file, but
* not LLONG_MAX this routine still performs a hole punch operation.
*/
@@ -422,14 +423,32 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
for (i = 0; i < pagevec_count(&pvec); ++i) {
struct page *page = pvec.pages[i];
+ u32 hash;
index = page->index;
+ hash = hugetlb_fault_mutex_hash(h, current->mm,
+ &pseudo_vma,
+ mapping, index, 0);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
/*
- * A mapped page is impossible as callers should unmap
- * all references before calling. And, i_mmap_rwsem
- * prevents the creation of additional mappings.
+ * If page is mapped, it was faulted in after being
+ * unmapped in caller. Unmap (again) now after taking
+ * the fault mutex. The mutex will prevent faults
+ * until we finish removing the page.
+ *
+ * This race can only happen in the hole punch case.
+ * Getting here in a truncate operation is a bug.
*/
- VM_BUG_ON(page_mapped(page));
+ if (unlikely(page_mapped(page))) {
+ BUG_ON(truncate_op);
+
+ i_mmap_lock_write(mapping);
+ hugetlb_vmdelete_list(&mapping->i_mmap,
+ index * pages_per_huge_page(h),
+ (index + 1) * pages_per_huge_page(h));
+ i_mmap_unlock_write(mapping);
+ }
lock_page(page);
/*
@@ -451,6 +470,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
}
unlock_page(page);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
huge_pagevec_release(&pvec);
cond_resched();
@@ -462,20 +482,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
static void hugetlbfs_evict_inode(struct inode *inode)
{
- struct address_space *mapping = inode->i_mapping;
struct resv_map *resv_map;
- /*
- * The vfs layer guarantees that there are no other users of this
- * inode. Therefore, it would be safe to call remove_inode_hugepages
- * without holding i_mmap_rwsem. We acquire and hold here to be
- * consistent with other callers. Since there will be no contention
- * on the semaphore, overhead is negligible.
- */
- i_mmap_lock_write(mapping);
remove_inode_hugepages(inode, 0, LLONG_MAX);
- i_mmap_unlock_write(mapping);
-
resv_map = (struct resv_map *)inode->i_mapping->private_data;
/* root inode doesn't have the resv_map, so we should check it */
if (resv_map)
@@ -496,8 +505,8 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
i_mmap_lock_write(mapping);
if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
- remove_inode_hugepages(inode, offset, LLONG_MAX);
i_mmap_unlock_write(mapping);
+ remove_inode_hugepages(inode, offset, LLONG_MAX);
return 0;
}
@@ -521,7 +530,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
inode_lock(inode);
/* protected by i_mutex */
- if (info->seals & F_SEAL_WRITE) {
+ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
inode_unlock(inode);
return -EPERM;
}
@@ -531,8 +540,8 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
hugetlb_vmdelete_list(&mapping->i_mmap,
hole_start >> PAGE_SHIFT,
hole_end >> PAGE_SHIFT);
- remove_inode_hugepages(inode, hole_start, hole_end);
i_mmap_unlock_write(mapping);
+ remove_inode_hugepages(inode, hole_start, hole_end);
inode_unlock(inode);
}
@@ -615,11 +624,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
/* addr is the offset within the file (zero based) */
addr = index * hpage_size;
- /*
- * fault mutex taken here, protects against fault path
- * and hole punch. inode_lock previously taken protects
- * against truncation.
- */
+ /* mutex taken here, fault path and hole punch */
hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
index, addr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -854,6 +859,18 @@ static int hugetlbfs_migrate_page(struct address_space *mapping,
rc = migrate_huge_page_move_mapping(mapping, newpage, page);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
+
+ /*
+ * page_private is subpool pointer in hugetlb pages. Transfer to
+ * new page. PagePrivate is not associated with page_private for
+ * hugetlb pages and can not be set here as only page_huge_active
+ * pages can be migrated.
+ */
+ if (page_private(page)) {
+ set_page_private(newpage, page_private(page));
+ set_page_private(page, 0);
+ }
+
if (mode != MIGRATE_SYNC_NO_COPY)
migrate_page_copy(newpage, page);
else
diff --git a/fs/inode.c b/fs/inode.c
index 0cd47fe0dbe5..e9d97add2b36 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -730,11 +730,8 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
return LRU_REMOVED;
}
- /*
- * Recently referenced inodes and inodes with many attached pages
- * get one more pass.
- */
- if (inode->i_state & I_REFERENCED || inode->i_data.nrpages > 1) {
+ /* recently referenced inodes get one more pass */
+ if (inode->i_state & I_REFERENCED) {
inode->i_state &= ~I_REFERENCED;
spin_unlock(&inode->i_lock);
return LRU_ROTATE;
@@ -2096,14 +2093,8 @@ EXPORT_SYMBOL(inode_dio_wait);
void inode_set_flags(struct inode *inode, unsigned int flags,
unsigned int mask)
{
- unsigned int old_flags, new_flags;
-
WARN_ON_ONCE(flags & ~mask);
- do {
- old_flags = READ_ONCE(inode->i_flags);
- new_flags = (old_flags & ~mask) | flags;
- } while (unlikely(cmpxchg(&inode->i_flags, old_flags,
- new_flags) != old_flags));
+ set_mask_bits(&inode->i_flags, mask, flags);
}
EXPORT_SYMBOL(inode_set_flags);
diff --git a/fs/io_uring.c b/fs/io_uring.c
new file mode 100644
index 000000000000..5d99376d2369
--- /dev/null
+++ b/fs/io_uring.c
@@ -0,0 +1,2971 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared application/kernel submission and completion ring pairs, for
+ * supporting fast/efficient IO.
+ *
+ * A note on the read/write ordering memory barriers that are matched between
+ * the application and kernel side. When the application reads the CQ ring
+ * tail, it must use an appropriate smp_rmb() to order with the smp_wmb()
+ * the kernel uses after writing the tail. Failure to do so could cause a
+ * delay in when the application notices that completion events available.
+ * This isn't a fatal condition. Likewise, the application must use an
+ * appropriate smp_wmb() both before writing the SQ tail, and after writing
+ * the SQ tail. The first one orders the sqe writes with the tail write, and
+ * the latter is paired with the smp_rmb() the kernel will issue before
+ * reading the SQ tail on submission.
+ *
+ * Also see the examples in the liburing library:
+ *
+ * git://git.kernel.dk/liburing
+ *
+ * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
+ * from data shared between the kernel and application. This is done both
+ * for ordering purposes, but also to ensure that once a value is loaded from
+ * data that the application could potentially modify, it remains stable.
+ *
+ * Copyright (C) 2018-2019 Jens Axboe
+ * Copyright (c) 2018-2019 Christoph Hellwig
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/syscalls.h>
+#include <linux/compat.h>
+#include <linux/refcount.h>
+#include <linux/uio.h>
+
+#include <linux/sched/signal.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/mmu_context.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/blkdev.h>
+#include <linux/bvec.h>
+#include <linux/net.h>
+#include <net/sock.h>
+#include <net/af_unix.h>
+#include <net/scm.h>
+#include <linux/anon_inodes.h>
+#include <linux/sched/mm.h>
+#include <linux/uaccess.h>
+#include <linux/nospec.h>
+#include <linux/sizes.h>
+#include <linux/hugetlb.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "internal.h"
+
+#define IORING_MAX_ENTRIES 4096
+#define IORING_MAX_FIXED_FILES 1024
+
+struct io_uring {
+ u32 head ____cacheline_aligned_in_smp;
+ u32 tail ____cacheline_aligned_in_smp;
+};
+
+struct io_sq_ring {
+ struct io_uring r;
+ u32 ring_mask;
+ u32 ring_entries;
+ u32 dropped;
+ u32 flags;
+ u32 array[];
+};
+
+struct io_cq_ring {
+ struct io_uring r;
+ u32 ring_mask;
+ u32 ring_entries;
+ u32 overflow;
+ struct io_uring_cqe cqes[];
+};
+
+struct io_mapped_ubuf {
+ u64 ubuf;
+ size_t len;
+ struct bio_vec *bvec;
+ unsigned int nr_bvecs;
+};
+
+struct async_list {
+ spinlock_t lock;
+ atomic_t cnt;
+ struct list_head list;
+
+ struct file *file;
+ off_t io_end;
+ size_t io_pages;
+};
+
+struct io_ring_ctx {
+ struct {
+ struct percpu_ref refs;
+ } ____cacheline_aligned_in_smp;
+
+ struct {
+ unsigned int flags;
+ bool compat;
+ bool account_mem;
+
+ /* SQ ring */
+ struct io_sq_ring *sq_ring;
+ unsigned cached_sq_head;
+ unsigned sq_entries;
+ unsigned sq_mask;
+ unsigned sq_thread_idle;
+ struct io_uring_sqe *sq_sqes;
+ } ____cacheline_aligned_in_smp;
+
+ /* IO offload */
+ struct workqueue_struct *sqo_wq;
+ struct task_struct *sqo_thread; /* if using sq thread polling */
+ struct mm_struct *sqo_mm;
+ wait_queue_head_t sqo_wait;
+ unsigned sqo_stop;
+
+ struct {
+ /* CQ ring */
+ struct io_cq_ring *cq_ring;
+ unsigned cached_cq_tail;
+ unsigned cq_entries;
+ unsigned cq_mask;
+ struct wait_queue_head cq_wait;
+ struct fasync_struct *cq_fasync;
+ } ____cacheline_aligned_in_smp;
+
+ /*
+ * If used, fixed file set. Writers must ensure that ->refs is dead,
+ * readers must ensure that ->refs is alive as long as the file* is
+ * used. Only updated through io_uring_register(2).
+ */
+ struct file **user_files;
+ unsigned nr_user_files;
+
+ /* if used, fixed mapped user buffers */
+ unsigned nr_user_bufs;
+ struct io_mapped_ubuf *user_bufs;
+
+ struct user_struct *user;
+
+ struct completion ctx_done;
+
+ struct {
+ struct mutex uring_lock;
+ wait_queue_head_t wait;
+ } ____cacheline_aligned_in_smp;
+
+ struct {
+ spinlock_t completion_lock;
+ bool poll_multi_file;
+ /*
+ * ->poll_list is protected by the ctx->uring_lock for
+ * io_uring instances that don't use IORING_SETUP_SQPOLL.
+ * For SQPOLL, only the single threaded io_sq_thread() will
+ * manipulate the list, hence no extra locking is needed there.
+ */
+ struct list_head poll_list;
+ struct list_head cancel_list;
+ } ____cacheline_aligned_in_smp;
+
+ struct async_list pending_async[2];
+
+#if defined(CONFIG_UNIX)
+ struct socket *ring_sock;
+#endif
+};
+
+struct sqe_submit {
+ const struct io_uring_sqe *sqe;
+ unsigned short index;
+ bool has_user;
+ bool needs_lock;
+ bool needs_fixed_file;
+};
+
+struct io_poll_iocb {
+ struct file *file;
+ struct wait_queue_head *head;
+ __poll_t events;
+ bool woken;
+ bool canceled;
+ struct wait_queue_entry wait;
+};
+
+struct io_kiocb {
+ union {
+ struct kiocb rw;
+ struct io_poll_iocb poll;
+ };
+
+ struct sqe_submit submit;
+
+ struct io_ring_ctx *ctx;
+ struct list_head list;
+ unsigned int flags;
+ refcount_t refs;
+#define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */
+#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
+#define REQ_F_FIXED_FILE 4 /* ctx owns file */
+#define REQ_F_SEQ_PREV 8 /* sequential with previous */
+ u64 user_data;
+ u64 error;
+
+ struct work_struct work;
+};
+
+#define IO_PLUG_THRESHOLD 2
+#define IO_IOPOLL_BATCH 8
+
+struct io_submit_state {
+ struct blk_plug plug;
+
+ /*
+ * io_kiocb alloc cache
+ */
+ void *reqs[IO_IOPOLL_BATCH];
+ unsigned int free_reqs;
+ unsigned int cur_req;
+
+ /*
+ * File reference cache
+ */
+ struct file *file;
+ unsigned int fd;
+ unsigned int has_refs;
+ unsigned int used_refs;
+ unsigned int ios_left;
+};
+
+static struct kmem_cache *req_cachep;
+
+static const struct file_operations io_uring_fops;
+
+struct sock *io_uring_get_socket(struct file *file)
+{
+#if defined(CONFIG_UNIX)
+ if (file->f_op == &io_uring_fops) {
+ struct io_ring_ctx *ctx = file->private_data;
+
+ return ctx->ring_sock->sk;
+ }
+#endif
+ return NULL;
+}
+EXPORT_SYMBOL(io_uring_get_socket);
+
+static void io_ring_ctx_ref_free(struct percpu_ref *ref)
+{
+ struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
+
+ complete(&ctx->ctx_done);
+}
+
+static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
+{
+ struct io_ring_ctx *ctx;
+ int i;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return NULL;
+
+ if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) {
+ kfree(ctx);
+ return NULL;
+ }
+
+ ctx->flags = p->flags;
+ init_waitqueue_head(&ctx->cq_wait);
+ init_completion(&ctx->ctx_done);
+ mutex_init(&ctx->uring_lock);
+ init_waitqueue_head(&ctx->wait);
+ for (i = 0; i < ARRAY_SIZE(ctx->pending_async); i++) {
+ spin_lock_init(&ctx->pending_async[i].lock);
+ INIT_LIST_HEAD(&ctx->pending_async[i].list);
+ atomic_set(&ctx->pending_async[i].cnt, 0);
+ }
+ spin_lock_init(&ctx->completion_lock);
+ INIT_LIST_HEAD(&ctx->poll_list);
+ INIT_LIST_HEAD(&ctx->cancel_list);
+ return ctx;
+}
+
+static void io_commit_cqring(struct io_ring_ctx *ctx)
+{
+ struct io_cq_ring *ring = ctx->cq_ring;
+
+ if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) {
+ /* order cqe stores with ring update */
+ smp_store_release(&ring->r.tail, ctx->cached_cq_tail);
+
+ /*
+ * Write sider barrier of tail update, app has read side. See
+ * comment at the top of this file.
+ */
+ smp_wmb();
+
+ if (wq_has_sleeper(&ctx->cq_wait)) {
+ wake_up_interruptible(&ctx->cq_wait);
+ kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
+ }
+ }
+}
+
+static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
+{
+ struct io_cq_ring *ring = ctx->cq_ring;
+ unsigned tail;
+
+ tail = ctx->cached_cq_tail;
+ /* See comment at the top of the file */
+ smp_rmb();
+ if (tail + 1 == READ_ONCE(ring->r.head))
+ return NULL;
+
+ ctx->cached_cq_tail++;
+ return &ring->cqes[tail & ctx->cq_mask];
+}
+
+static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
+ long res, unsigned ev_flags)
+{
+ struct io_uring_cqe *cqe;
+
+ /*
+ * If we can't get a cq entry, userspace overflowed the
+ * submission (by quite a lot). Increment the overflow count in
+ * the ring.
+ */
+ cqe = io_get_cqring(ctx);
+ if (cqe) {
+ WRITE_ONCE(cqe->user_data, ki_user_data);
+ WRITE_ONCE(cqe->res, res);
+ WRITE_ONCE(cqe->flags, ev_flags);
+ } else {
+ unsigned overflow = READ_ONCE(ctx->cq_ring->overflow);
+
+ WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1);
+ }
+}
+
+static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data,
+ long res, unsigned ev_flags)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ io_cqring_fill_event(ctx, ki_user_data, res, ev_flags);
+ io_commit_cqring(ctx);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+ if (waitqueue_active(&ctx->wait))
+ wake_up(&ctx->wait);
+ if (waitqueue_active(&ctx->sqo_wait))
+ wake_up(&ctx->sqo_wait);
+}
+
+static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
+{
+ percpu_ref_put_many(&ctx->refs, refs);
+
+ if (waitqueue_active(&ctx->wait))
+ wake_up(&ctx->wait);
+}
+
+static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
+ struct io_submit_state *state)
+{
+ struct io_kiocb *req;
+
+ if (!percpu_ref_tryget(&ctx->refs))
+ return NULL;
+
+ if (!state) {
+ req = kmem_cache_alloc(req_cachep, __GFP_NOWARN);
+ if (unlikely(!req))
+ goto out;
+ } else if (!state->free_reqs) {
+ size_t sz;
+ int ret;
+
+ sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
+ ret = kmem_cache_alloc_bulk(req_cachep, __GFP_NOWARN, sz,
+ state->reqs);
+ if (unlikely(ret <= 0))
+ goto out;
+ state->free_reqs = ret - 1;
+ state->cur_req = 1;
+ req = state->reqs[0];
+ } else {
+ req = state->reqs[state->cur_req];
+ state->free_reqs--;
+ state->cur_req++;
+ }
+
+ req->ctx = ctx;
+ req->flags = 0;
+ refcount_set(&req->refs, 0);
+ return req;
+out:
+ io_ring_drop_ctx_refs(ctx, 1);
+ return NULL;
+}
+
+static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
+{
+ if (*nr) {
+ kmem_cache_free_bulk(req_cachep, *nr, reqs);
+ io_ring_drop_ctx_refs(ctx, *nr);
+ *nr = 0;
+ }
+}
+
+static void io_free_req(struct io_kiocb *req)
+{
+ if (!refcount_read(&req->refs) || refcount_dec_and_test(&req->refs)) {
+ io_ring_drop_ctx_refs(req->ctx, 1);
+ kmem_cache_free(req_cachep, req);
+ }
+}
+
+/*
+ * Find and free completed poll iocbs
+ */
+static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
+ struct list_head *done)
+{
+ void *reqs[IO_IOPOLL_BATCH];
+ int file_count, to_free;
+ struct file *file = NULL;
+ struct io_kiocb *req;
+
+ file_count = to_free = 0;
+ while (!list_empty(done)) {
+ req = list_first_entry(done, struct io_kiocb, list);
+ list_del(&req->list);
+
+ io_cqring_fill_event(ctx, req->user_data, req->error, 0);
+
+ reqs[to_free++] = req;
+ (*nr_events)++;
+
+ /*
+ * Batched puts of the same file, to avoid dirtying the
+ * file usage count multiple times, if avoidable.
+ */
+ if (!(req->flags & REQ_F_FIXED_FILE)) {
+ if (!file) {
+ file = req->rw.ki_filp;
+ file_count = 1;
+ } else if (file == req->rw.ki_filp) {
+ file_count++;
+ } else {
+ fput_many(file, file_count);
+ file = req->rw.ki_filp;
+ file_count = 1;
+ }
+ }
+
+ if (to_free == ARRAY_SIZE(reqs))
+ io_free_req_many(ctx, reqs, &to_free);
+ }
+ io_commit_cqring(ctx);
+
+ if (file)
+ fput_many(file, file_count);
+ io_free_req_many(ctx, reqs, &to_free);
+}
+
+static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
+ long min)
+{
+ struct io_kiocb *req, *tmp;
+ LIST_HEAD(done);
+ bool spin;
+ int ret;
+
+ /*
+ * Only spin for completions if we don't have multiple devices hanging
+ * off our complete list, and we're under the requested amount.
+ */
+ spin = !ctx->poll_multi_file && *nr_events < min;
+
+ ret = 0;
+ list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
+ struct kiocb *kiocb = &req->rw;
+
+ /*
+ * Move completed entries to our local list. If we find a
+ * request that requires polling, break out and complete
+ * the done list first, if we have entries there.
+ */
+ if (req->flags & REQ_F_IOPOLL_COMPLETED) {
+ list_move_tail(&req->list, &done);
+ continue;
+ }
+ if (!list_empty(&done))
+ break;
+
+ ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
+ if (ret < 0)
+ break;
+
+ if (ret && spin)
+ spin = false;
+ ret = 0;
+ }
+
+ if (!list_empty(&done))
+ io_iopoll_complete(ctx, nr_events, &done);
+
+ return ret;
+}
+
+/*
+ * Poll for a mininum of 'min' events. Note that if min == 0 we consider that a
+ * non-spinning poll check - we'll still enter the driver poll loop, but only
+ * as a non-spinning completion check.
+ */
+static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
+ long min)
+{
+ while (!list_empty(&ctx->poll_list)) {
+ int ret;
+
+ ret = io_do_iopoll(ctx, nr_events, min);
+ if (ret < 0)
+ return ret;
+ if (!min || *nr_events >= min)
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * We can't just wait for polled events to come to us, we have to actively
+ * find and complete them.
+ */
+static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
+{
+ if (!(ctx->flags & IORING_SETUP_IOPOLL))
+ return;
+
+ mutex_lock(&ctx->uring_lock);
+ while (!list_empty(&ctx->poll_list)) {
+ unsigned int nr_events = 0;
+
+ io_iopoll_getevents(ctx, &nr_events, 1);
+ }
+ mutex_unlock(&ctx->uring_lock);
+}
+
+static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
+ long min)
+{
+ int ret = 0;
+
+ do {
+ int tmin = 0;
+
+ if (*nr_events < min)
+ tmin = min - *nr_events;
+
+ ret = io_iopoll_getevents(ctx, nr_events, tmin);
+ if (ret <= 0)
+ break;
+ ret = 0;
+ } while (min && !*nr_events && !need_resched());
+
+ return ret;
+}
+
+static void kiocb_end_write(struct kiocb *kiocb)
+{
+ if (kiocb->ki_flags & IOCB_WRITE) {
+ struct inode *inode = file_inode(kiocb->ki_filp);
+
+ /*
+ * Tell lockdep we inherited freeze protection from submission
+ * thread.
+ */
+ if (S_ISREG(inode->i_mode))
+ __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
+ file_end_write(kiocb->ki_filp);
+ }
+}
+
+static void io_fput(struct io_kiocb *req)
+{
+ if (!(req->flags & REQ_F_FIXED_FILE))
+ fput(req->rw.ki_filp);
+}
+
+static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
+{
+ struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
+
+ kiocb_end_write(kiocb);
+
+ io_fput(req);
+ io_cqring_add_event(req->ctx, req->user_data, res, 0);
+ io_free_req(req);
+}
+
+static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
+{
+ struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
+
+ kiocb_end_write(kiocb);
+
+ req->error = res;
+ if (res != -EAGAIN)
+ req->flags |= REQ_F_IOPOLL_COMPLETED;
+}
+
+/*
+ * After the iocb has been issued, it's safe to be found on the poll list.
+ * Adding the kiocb to the list AFTER submission ensures that we don't
+ * find it from a io_iopoll_getevents() thread before the issuer is done
+ * accessing the kiocb cookie.
+ */
+static void io_iopoll_req_issued(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ /*
+ * Track whether we have multiple files in our lists. This will impact
+ * how we do polling eventually, not spinning if we're on potentially
+ * different devices.
+ */
+ if (list_empty(&ctx->poll_list)) {
+ ctx->poll_multi_file = false;
+ } else if (!ctx->poll_multi_file) {
+ struct io_kiocb *list_req;
+
+ list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
+ list);
+ if (list_req->rw.ki_filp != req->rw.ki_filp)
+ ctx->poll_multi_file = true;
+ }
+
+ /*
+ * For fast devices, IO may have already completed. If it has, add
+ * it to the front so we find it first.
+ */
+ if (req->flags & REQ_F_IOPOLL_COMPLETED)
+ list_add(&req->list, &ctx->poll_list);
+ else
+ list_add_tail(&req->list, &ctx->poll_list);
+}
+
+static void io_file_put(struct io_submit_state *state, struct file *file)
+{
+ if (!state) {
+ fput(file);
+ } else if (state->file) {
+ int diff = state->has_refs - state->used_refs;
+
+ if (diff)
+ fput_many(state->file, diff);
+ state->file = NULL;
+ }
+}
+
+/*
+ * Get as many references to a file as we have IOs left in this submission,
+ * assuming most submissions are for one file, or at least that each file
+ * has more than one submission.
+ */
+static struct file *io_file_get(struct io_submit_state *state, int fd)
+{
+ if (!state)
+ return fget(fd);
+
+ if (state->file) {
+ if (state->fd == fd) {
+ state->used_refs++;
+ state->ios_left--;
+ return state->file;
+ }
+ io_file_put(state, NULL);
+ }
+ state->file = fget_many(fd, state->ios_left);
+ if (!state->file)
+ return NULL;
+
+ state->fd = fd;
+ state->has_refs = state->ios_left;
+ state->used_refs = 1;
+ state->ios_left--;
+ return state->file;
+}
+
+/*
+ * If we tracked the file through the SCM inflight mechanism, we could support
+ * any file. For now, just ensure that anything potentially problematic is done
+ * inline.
+ */
+static bool io_file_supports_async(struct file *file)
+{
+ umode_t mode = file_inode(file)->i_mode;
+
+ if (S_ISBLK(mode) || S_ISCHR(mode))
+ return true;
+ if (S_ISREG(mode) && file->f_op != &io_uring_fops)
+ return true;
+
+ return false;
+}
+
+static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
+ bool force_nonblock, struct io_submit_state *state)
+{
+ const struct io_uring_sqe *sqe = s->sqe;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct kiocb *kiocb = &req->rw;
+ unsigned ioprio, flags;
+ int fd, ret;
+
+ /* For -EAGAIN retry, everything is already prepped */
+ if (kiocb->ki_filp)
+ return 0;
+
+ flags = READ_ONCE(sqe->flags);
+ fd = READ_ONCE(sqe->fd);
+
+ if (flags & IOSQE_FIXED_FILE) {
+ if (unlikely(!ctx->user_files ||
+ (unsigned) fd >= ctx->nr_user_files))
+ return -EBADF;
+ kiocb->ki_filp = ctx->user_files[fd];
+ req->flags |= REQ_F_FIXED_FILE;
+ } else {
+ if (s->needs_fixed_file)
+ return -EBADF;
+ kiocb->ki_filp = io_file_get(state, fd);
+ if (unlikely(!kiocb->ki_filp))
+ return -EBADF;
+ if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
+ force_nonblock = false;
+ }
+ kiocb->ki_pos = READ_ONCE(sqe->off);
+ kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
+ kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
+
+ ioprio = READ_ONCE(sqe->ioprio);
+ if (ioprio) {
+ ret = ioprio_check_cap(ioprio);
+ if (ret)
+ goto out_fput;
+
+ kiocb->ki_ioprio = ioprio;
+ } else
+ kiocb->ki_ioprio = get_current_ioprio();
+
+ ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
+ if (unlikely(ret))
+ goto out_fput;
+ if (force_nonblock) {
+ kiocb->ki_flags |= IOCB_NOWAIT;
+ req->flags |= REQ_F_FORCE_NONBLOCK;
+ }
+ if (ctx->flags & IORING_SETUP_IOPOLL) {
+ ret = -EOPNOTSUPP;
+ if (!(kiocb->ki_flags & IOCB_DIRECT) ||
+ !kiocb->ki_filp->f_op->iopoll)
+ goto out_fput;
+
+ req->error = 0;
+ kiocb->ki_flags |= IOCB_HIPRI;
+ kiocb->ki_complete = io_complete_rw_iopoll;
+ } else {
+ if (kiocb->ki_flags & IOCB_HIPRI) {
+ ret = -EINVAL;
+ goto out_fput;
+ }
+ kiocb->ki_complete = io_complete_rw;
+ }
+ return 0;
+out_fput:
+ if (!(flags & IOSQE_FIXED_FILE)) {
+ /*
+ * in case of error, we didn't use this file reference. drop it.
+ */
+ if (state)
+ state->used_refs--;
+ io_file_put(state, kiocb->ki_filp);
+ }
+ return ret;
+}
+
+static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
+{
+ switch (ret) {
+ case -EIOCBQUEUED:
+ break;
+ case -ERESTARTSYS:
+ case -ERESTARTNOINTR:
+ case -ERESTARTNOHAND:
+ case -ERESTART_RESTARTBLOCK:
+ /*
+ * We can't just restart the syscall, since previously
+ * submitted sqes may already be in progress. Just fail this
+ * IO with EINTR.
+ */
+ ret = -EINTR;
+ /* fall through */
+ default:
+ kiocb->ki_complete(kiocb, ret, 0);
+ }
+}
+
+static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
+ const struct io_uring_sqe *sqe,
+ struct iov_iter *iter)
+{
+ size_t len = READ_ONCE(sqe->len);
+ struct io_mapped_ubuf *imu;
+ unsigned index, buf_index;
+ size_t offset;
+ u64 buf_addr;
+
+ /* attempt to use fixed buffers without having provided iovecs */
+ if (unlikely(!ctx->user_bufs))
+ return -EFAULT;
+
+ buf_index = READ_ONCE(sqe->buf_index);
+ if (unlikely(buf_index >= ctx->nr_user_bufs))
+ return -EFAULT;
+
+ index = array_index_nospec(buf_index, ctx->nr_user_bufs);
+ imu = &ctx->user_bufs[index];
+ buf_addr = READ_ONCE(sqe->addr);
+
+ /* overflow */
+ if (buf_addr + len < buf_addr)
+ return -EFAULT;
+ /* not inside the mapped region */
+ if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
+ return -EFAULT;
+
+ /*
+ * May not be a start of buffer, set size appropriately
+ * and advance us to the beginning.
+ */
+ offset = buf_addr - imu->ubuf;
+ iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
+ if (offset)
+ iov_iter_advance(iter, offset);
+ return 0;
+}
+
+static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
+ const struct sqe_submit *s, struct iovec **iovec,
+ struct iov_iter *iter)
+{
+ const struct io_uring_sqe *sqe = s->sqe;
+ void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ size_t sqe_len = READ_ONCE(sqe->len);
+ u8 opcode;
+
+ /*
+ * We're reading ->opcode for the second time, but the first read
+ * doesn't care whether it's _FIXED or not, so it doesn't matter
+ * whether ->opcode changes concurrently. The first read does care
+ * about whether it is a READ or a WRITE, so we don't trust this read
+ * for that purpose and instead let the caller pass in the read/write
+ * flag.
+ */
+ opcode = READ_ONCE(sqe->opcode);
+ if (opcode == IORING_OP_READ_FIXED ||
+ opcode == IORING_OP_WRITE_FIXED) {
+ ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
+ *iovec = NULL;
+ return ret;
+ }
+
+ if (!s->has_user)
+ return -EFAULT;
+
+#ifdef CONFIG_COMPAT
+ if (ctx->compat)
+ return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
+ iovec, iter);
+#endif
+
+ return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
+}
+
+/*
+ * Make a note of the last file/offset/direction we punted to async
+ * context. We'll use this information to see if we can piggy back a
+ * sequential request onto the previous one, if it's still hasn't been
+ * completed by the async worker.
+ */
+static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
+{
+ struct async_list *async_list = &req->ctx->pending_async[rw];
+ struct kiocb *kiocb = &req->rw;
+ struct file *filp = kiocb->ki_filp;
+ off_t io_end = kiocb->ki_pos + len;
+
+ if (filp == async_list->file && kiocb->ki_pos == async_list->io_end) {
+ unsigned long max_pages;
+
+ /* Use 8x RA size as a decent limiter for both reads/writes */
+ max_pages = filp->f_ra.ra_pages;
+ if (!max_pages)
+ max_pages = VM_MAX_READAHEAD >> (PAGE_SHIFT - 10);
+ max_pages *= 8;
+
+ /* If max pages are exceeded, reset the state */
+ len >>= PAGE_SHIFT;
+ if (async_list->io_pages + len <= max_pages) {
+ req->flags |= REQ_F_SEQ_PREV;
+ async_list->io_pages += len;
+ } else {
+ io_end = 0;
+ async_list->io_pages = 0;
+ }
+ }
+
+ /* New file? Reset state. */
+ if (async_list->file != filp) {
+ async_list->io_pages = 0;
+ async_list->file = filp;
+ }
+ async_list->io_end = io_end;
+}
+
+static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
+ bool force_nonblock, struct io_submit_state *state)
+{
+ struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+ struct kiocb *kiocb = &req->rw;
+ struct iov_iter iter;
+ struct file *file;
+ size_t iov_count;
+ ssize_t ret;
+
+ ret = io_prep_rw(req, s, force_nonblock, state);
+ if (ret)
+ return ret;
+ file = kiocb->ki_filp;
+
+ ret = -EBADF;
+ if (unlikely(!(file->f_mode & FMODE_READ)))
+ goto out_fput;
+ ret = -EINVAL;
+ if (unlikely(!file->f_op->read_iter))
+ goto out_fput;
+
+ ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
+ if (ret)
+ goto out_fput;
+
+ iov_count = iov_iter_count(&iter);
+ ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
+ if (!ret) {
+ ssize_t ret2;
+
+ /* Catch -EAGAIN return for forced non-blocking submission */
+ ret2 = call_read_iter(file, kiocb, &iter);
+ if (!force_nonblock || ret2 != -EAGAIN) {
+ io_rw_done(kiocb, ret2);
+ } else {
+ /*
+ * If ->needs_lock is true, we're already in async
+ * context.
+ */
+ if (!s->needs_lock)
+ io_async_list_note(READ, req, iov_count);
+ ret = -EAGAIN;
+ }
+ }
+ kfree(iovec);
+out_fput:
+ /* Hold on to the file for -EAGAIN */
+ if (unlikely(ret && ret != -EAGAIN))
+ io_fput(req);
+ return ret;
+}
+
+static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
+ bool force_nonblock, struct io_submit_state *state)
+{
+ struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+ struct kiocb *kiocb = &req->rw;
+ struct iov_iter iter;
+ struct file *file;
+ size_t iov_count;
+ ssize_t ret;
+
+ ret = io_prep_rw(req, s, force_nonblock, state);
+ if (ret)
+ return ret;
+
+ ret = -EBADF;
+ file = kiocb->ki_filp;
+ if (unlikely(!(file->f_mode & FMODE_WRITE)))
+ goto out_fput;
+ ret = -EINVAL;
+ if (unlikely(!file->f_op->write_iter))
+ goto out_fput;
+
+ ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
+ if (ret)
+ goto out_fput;
+
+ iov_count = iov_iter_count(&iter);
+
+ ret = -EAGAIN;
+ if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) {
+ /* If ->needs_lock is true, we're already in async context. */
+ if (!s->needs_lock)
+ io_async_list_note(WRITE, req, iov_count);
+ goto out_free;
+ }
+
+ ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count);
+ if (!ret) {
+ /*
+ * Open-code file_start_write here to grab freeze protection,
+ * which will be released by another thread in
+ * io_complete_rw(). Fool lockdep by telling it the lock got
+ * released so that it doesn't complain about the held lock when
+ * we return to userspace.
+ */
+ if (S_ISREG(file_inode(file)->i_mode)) {
+ __sb_start_write(file_inode(file)->i_sb,
+ SB_FREEZE_WRITE, true);
+ __sb_writers_release(file_inode(file)->i_sb,
+ SB_FREEZE_WRITE);
+ }
+ kiocb->ki_flags |= IOCB_WRITE;
+ io_rw_done(kiocb, call_write_iter(file, kiocb, &iter));
+ }
+out_free:
+ kfree(iovec);
+out_fput:
+ /* Hold on to the file for -EAGAIN */
+ if (unlikely(ret && ret != -EAGAIN))
+ io_fput(req);
+ return ret;
+}
+
+/*
+ * IORING_OP_NOP just posts a completion event, nothing else.
+ */
+static int io_nop(struct io_kiocb *req, u64 user_data)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ long err = 0;
+
+ if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+
+ /*
+ * Twilight zone - it's possible that someone issued an opcode that
+ * has a file attached, then got -EAGAIN on submission, and changed
+ * the sqe before we retried it from async context. Avoid dropping
+ * a file reference for this malicious case, and flag the error.
+ */
+ if (req->rw.ki_filp) {
+ err = -EBADF;
+ io_fput(req);
+ }
+ io_cqring_add_event(ctx, user_data, err, 0);
+ io_free_req(req);
+ return 0;
+}
+
+static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ unsigned flags;
+ int fd;
+
+ /* Prep already done */
+ if (req->rw.ki_filp)
+ return 0;
+
+ if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+ if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
+ return -EINVAL;
+
+ fd = READ_ONCE(sqe->fd);
+ flags = READ_ONCE(sqe->flags);
+
+ if (flags & IOSQE_FIXED_FILE) {
+ if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
+ return -EBADF;
+ req->rw.ki_filp = ctx->user_files[fd];
+ req->flags |= REQ_F_FIXED_FILE;
+ } else {
+ req->rw.ki_filp = fget(fd);
+ if (unlikely(!req->rw.ki_filp))
+ return -EBADF;
+ }
+
+ return 0;
+}
+
+static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ bool force_nonblock)
+{
+ loff_t sqe_off = READ_ONCE(sqe->off);
+ loff_t sqe_len = READ_ONCE(sqe->len);
+ loff_t end = sqe_off + sqe_len;
+ unsigned fsync_flags;
+ int ret;
+
+ fsync_flags = READ_ONCE(sqe->fsync_flags);
+ if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC))
+ return -EINVAL;
+
+ ret = io_prep_fsync(req, sqe);
+ if (ret)
+ return ret;
+
+ /* fsync always requires a blocking context */
+ if (force_nonblock)
+ return -EAGAIN;
+
+ ret = vfs_fsync_range(req->rw.ki_filp, sqe_off,
+ end > 0 ? end : LLONG_MAX,
+ fsync_flags & IORING_FSYNC_DATASYNC);
+
+ io_fput(req);
+ io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
+ io_free_req(req);
+ return 0;
+}
+
+static void io_poll_remove_one(struct io_kiocb *req)
+{
+ struct io_poll_iocb *poll = &req->poll;
+
+ spin_lock(&poll->head->lock);
+ WRITE_ONCE(poll->canceled, true);
+ if (!list_empty(&poll->wait.entry)) {
+ list_del_init(&poll->wait.entry);
+ queue_work(req->ctx->sqo_wq, &req->work);
+ }
+ spin_unlock(&poll->head->lock);
+
+ list_del_init(&req->list);
+}
+
+static void io_poll_remove_all(struct io_ring_ctx *ctx)
+{
+ struct io_kiocb *req;
+
+ spin_lock_irq(&ctx->completion_lock);
+ while (!list_empty(&ctx->cancel_list)) {
+ req = list_first_entry(&ctx->cancel_list, struct io_kiocb,list);
+ io_poll_remove_one(req);
+ }
+ spin_unlock_irq(&ctx->completion_lock);
+}
+
+/*
+ * Find a running poll command that matches one specified in sqe->addr,
+ * and remove it if found.
+ */
+static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_kiocb *poll_req, *next;
+ int ret = -ENOENT;
+
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
+ sqe->poll_events)
+ return -EINVAL;
+
+ spin_lock_irq(&ctx->completion_lock);
+ list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) {
+ if (READ_ONCE(sqe->addr) == poll_req->user_data) {
+ io_poll_remove_one(poll_req);
+ ret = 0;
+ break;
+ }
+ }
+ spin_unlock_irq(&ctx->completion_lock);
+
+ io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
+ io_free_req(req);
+ return 0;
+}
+
+static void io_poll_complete(struct io_kiocb *req, __poll_t mask)
+{
+ io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0);
+ io_fput(req);
+ io_free_req(req);
+}
+
+static void io_poll_complete_work(struct work_struct *work)
+{
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ struct io_poll_iocb *poll = &req->poll;
+ struct poll_table_struct pt = { ._key = poll->events };
+ struct io_ring_ctx *ctx = req->ctx;
+ __poll_t mask = 0;
+
+ if (!READ_ONCE(poll->canceled))
+ mask = vfs_poll(poll->file, &pt) & poll->events;
+
+ /*
+ * Note that ->ki_cancel callers also delete iocb from active_reqs after
+ * calling ->ki_cancel. We need the ctx_lock roundtrip here to
+ * synchronize with them. In the cancellation case the list_del_init
+ * itself is not actually needed, but harmless so we keep it in to
+ * avoid further branches in the fast path.
+ */
+ spin_lock_irq(&ctx->completion_lock);
+ if (!mask && !READ_ONCE(poll->canceled)) {
+ add_wait_queue(poll->head, &poll->wait);
+ spin_unlock_irq(&ctx->completion_lock);
+ return;
+ }
+ list_del_init(&req->list);
+ spin_unlock_irq(&ctx->completion_lock);
+
+ io_poll_complete(req, mask);
+}
+
+static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+ void *key)
+{
+ struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
+ wait);
+ struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
+ struct io_ring_ctx *ctx = req->ctx;
+ __poll_t mask = key_to_poll(key);
+
+ poll->woken = true;
+
+ /* for instances that support it check for an event match first: */
+ if (mask) {
+ unsigned long flags;
+
+ if (!(mask & poll->events))
+ return 0;
+
+ /* try to complete the iocb inline if we can: */
+ if (spin_trylock_irqsave(&ctx->completion_lock, flags)) {
+ list_del(&req->list);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+ list_del_init(&poll->wait.entry);
+ io_poll_complete(req, mask);
+ return 1;
+ }
+ }
+
+ list_del_init(&poll->wait.entry);
+ queue_work(ctx->sqo_wq, &req->work);
+ return 1;
+}
+
+struct io_poll_table {
+ struct poll_table_struct pt;
+ struct io_kiocb *req;
+ int error;
+};
+
+static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
+ struct poll_table_struct *p)
+{
+ struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
+
+ if (unlikely(pt->req->poll.head)) {
+ pt->error = -EINVAL;
+ return;
+ }
+
+ pt->error = 0;
+ pt->req->poll.head = head;
+ add_wait_queue(head, &pt->req->poll.wait);
+}
+
+static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_poll_iocb *poll = &req->poll;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_poll_table ipt;
+ unsigned flags;
+ __poll_t mask;
+ u16 events;
+ int fd;
+
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+ if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
+ return -EINVAL;
+
+ INIT_WORK(&req->work, io_poll_complete_work);
+ events = READ_ONCE(sqe->poll_events);
+ poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
+
+ flags = READ_ONCE(sqe->flags);
+ fd = READ_ONCE(sqe->fd);
+
+ if (flags & IOSQE_FIXED_FILE) {
+ if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
+ return -EBADF;
+ poll->file = ctx->user_files[fd];
+ req->flags |= REQ_F_FIXED_FILE;
+ } else {
+ poll->file = fget(fd);
+ }
+ if (unlikely(!poll->file))
+ return -EBADF;
+
+ poll->head = NULL;
+ poll->woken = false;
+ poll->canceled = false;
+
+ ipt.pt._qproc = io_poll_queue_proc;
+ ipt.pt._key = poll->events;
+ ipt.req = req;
+ ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
+
+ /* initialized the list so that we can do list_empty checks */
+ INIT_LIST_HEAD(&poll->wait.entry);
+ init_waitqueue_func_entry(&poll->wait, io_poll_wake);
+
+ /* one for removal from waitqueue, one for this function */
+ refcount_set(&req->refs, 2);
+
+ mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
+ if (unlikely(!poll->head)) {
+ /* we did not manage to set up a waitqueue, done */
+ goto out;
+ }
+
+ spin_lock_irq(&ctx->completion_lock);
+ spin_lock(&poll->head->lock);
+ if (poll->woken) {
+ /* wake_up context handles the rest */
+ mask = 0;
+ ipt.error = 0;
+ } else if (mask || ipt.error) {
+ /* if we get an error or a mask we are done */
+ WARN_ON_ONCE(list_empty(&poll->wait.entry));
+ list_del_init(&poll->wait.entry);
+ } else {
+ /* actually waiting for an event */
+ list_add_tail(&req->list, &ctx->cancel_list);
+ }
+ spin_unlock(&poll->head->lock);
+ spin_unlock_irq(&ctx->completion_lock);
+
+out:
+ if (unlikely(ipt.error)) {
+ if (!(flags & IOSQE_FIXED_FILE))
+ fput(poll->file);
+ /*
+ * Drop one of our refs to this req, __io_submit_sqe() will
+ * drop the other one since we're returning an error.
+ */
+ io_free_req(req);
+ return ipt.error;
+ }
+
+ if (mask)
+ io_poll_complete(req, mask);
+ io_free_req(req);
+ return 0;
+}
+
+static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
+ const struct sqe_submit *s, bool force_nonblock,
+ struct io_submit_state *state)
+{
+ ssize_t ret;
+ int opcode;
+
+ if (unlikely(s->index >= ctx->sq_entries))
+ return -EINVAL;
+ req->user_data = READ_ONCE(s->sqe->user_data);
+
+ opcode = READ_ONCE(s->sqe->opcode);
+ switch (opcode) {
+ case IORING_OP_NOP:
+ ret = io_nop(req, req->user_data);
+ break;
+ case IORING_OP_READV:
+ if (unlikely(s->sqe->buf_index))
+ return -EINVAL;
+ ret = io_read(req, s, force_nonblock, state);
+ break;
+ case IORING_OP_WRITEV:
+ if (unlikely(s->sqe->buf_index))
+ return -EINVAL;
+ ret = io_write(req, s, force_nonblock, state);
+ break;
+ case IORING_OP_READ_FIXED:
+ ret = io_read(req, s, force_nonblock, state);
+ break;
+ case IORING_OP_WRITE_FIXED:
+ ret = io_write(req, s, force_nonblock, state);
+ break;
+ case IORING_OP_FSYNC:
+ ret = io_fsync(req, s->sqe, force_nonblock);
+ break;
+ case IORING_OP_POLL_ADD:
+ ret = io_poll_add(req, s->sqe);
+ break;
+ case IORING_OP_POLL_REMOVE:
+ ret = io_poll_remove(req, s->sqe);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ if (ret)
+ return ret;
+
+ if (ctx->flags & IORING_SETUP_IOPOLL) {
+ if (req->error == -EAGAIN)
+ return -EAGAIN;
+
+ /* workqueue context doesn't hold uring_lock, grab it now */
+ if (s->needs_lock)
+ mutex_lock(&ctx->uring_lock);
+ io_iopoll_req_issued(req);
+ if (s->needs_lock)
+ mutex_unlock(&ctx->uring_lock);
+ }
+
+ return 0;
+}
+
+static struct async_list *io_async_list_from_sqe(struct io_ring_ctx *ctx,
+ const struct io_uring_sqe *sqe)
+{
+ switch (sqe->opcode) {
+ case IORING_OP_READV:
+ case IORING_OP_READ_FIXED:
+ return &ctx->pending_async[READ];
+ case IORING_OP_WRITEV:
+ case IORING_OP_WRITE_FIXED:
+ return &ctx->pending_async[WRITE];
+ default:
+ return NULL;
+ }
+}
+
+static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
+{
+ u8 opcode = READ_ONCE(sqe->opcode);
+
+ return !(opcode == IORING_OP_READ_FIXED ||
+ opcode == IORING_OP_WRITE_FIXED);
+}
+
+static void io_sq_wq_submit_work(struct work_struct *work)
+{
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ struct io_ring_ctx *ctx = req->ctx;
+ struct mm_struct *cur_mm = NULL;
+ struct async_list *async_list;
+ LIST_HEAD(req_list);
+ mm_segment_t old_fs;
+ int ret;
+
+ async_list = io_async_list_from_sqe(ctx, req->submit.sqe);
+restart:
+ do {
+ struct sqe_submit *s = &req->submit;
+ const struct io_uring_sqe *sqe = s->sqe;
+
+ /* Ensure we clear previously set forced non-block flag */
+ req->flags &= ~REQ_F_FORCE_NONBLOCK;
+ req->rw.ki_flags &= ~IOCB_NOWAIT;
+
+ ret = 0;
+ if (io_sqe_needs_user(sqe) && !cur_mm) {
+ if (!mmget_not_zero(ctx->sqo_mm)) {
+ ret = -EFAULT;
+ } else {
+ cur_mm = ctx->sqo_mm;
+ use_mm(cur_mm);
+ old_fs = get_fs();
+ set_fs(USER_DS);
+ }
+ }
+
+ if (!ret) {
+ s->has_user = cur_mm != NULL;
+ s->needs_lock = true;
+ do {
+ ret = __io_submit_sqe(ctx, req, s, false, NULL);
+ /*
+ * We can get EAGAIN for polled IO even though
+ * we're forcing a sync submission from here,
+ * since we can't wait for request slots on the
+ * block side.
+ */
+ if (ret != -EAGAIN)
+ break;
+ cond_resched();
+ } while (1);
+ }
+ if (ret) {
+ io_cqring_add_event(ctx, sqe->user_data, ret, 0);
+ io_free_req(req);
+ }
+
+ /* async context always use a copy of the sqe */
+ kfree(sqe);
+
+ if (!async_list)
+ break;
+ if (!list_empty(&req_list)) {
+ req = list_first_entry(&req_list, struct io_kiocb,
+ list);
+ list_del(&req->list);
+ continue;
+ }
+ if (list_empty(&async_list->list))
+ break;
+
+ req = NULL;
+ spin_lock(&async_list->lock);
+ if (list_empty(&async_list->list)) {
+ spin_unlock(&async_list->lock);
+ break;
+ }
+ list_splice_init(&async_list->list, &req_list);
+ spin_unlock(&async_list->lock);
+
+ req = list_first_entry(&req_list, struct io_kiocb, list);
+ list_del(&req->list);
+ } while (req);
+
+ /*
+ * Rare case of racing with a submitter. If we find the count has
+ * dropped to zero AND we have pending work items, then restart
+ * the processing. This is a tiny race window.
+ */
+ if (async_list) {
+ ret = atomic_dec_return(&async_list->cnt);
+ while (!ret && !list_empty(&async_list->list)) {
+ spin_lock(&async_list->lock);
+ atomic_inc(&async_list->cnt);
+ list_splice_init(&async_list->list, &req_list);
+ spin_unlock(&async_list->lock);
+
+ if (!list_empty(&req_list)) {
+ req = list_first_entry(&req_list,
+ struct io_kiocb, list);
+ list_del(&req->list);
+ goto restart;
+ }
+ ret = atomic_dec_return(&async_list->cnt);
+ }
+ }
+
+ if (cur_mm) {
+ set_fs(old_fs);
+ unuse_mm(cur_mm);
+ mmput(cur_mm);
+ }
+}
+
+/*
+ * See if we can piggy back onto previously submitted work, that is still
+ * running. We currently only allow this if the new request is sequential
+ * to the previous one we punted.
+ */
+static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
+{
+ bool ret = false;
+
+ if (!list)
+ return false;
+ if (!(req->flags & REQ_F_SEQ_PREV))
+ return false;
+ if (!atomic_read(&list->cnt))
+ return false;
+
+ ret = true;
+ spin_lock(&list->lock);
+ list_add_tail(&req->list, &list->list);
+ if (!atomic_read(&list->cnt)) {
+ list_del_init(&req->list);
+ ret = false;
+ }
+ spin_unlock(&list->lock);
+ return ret;
+}
+
+static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
+ struct io_submit_state *state)
+{
+ struct io_kiocb *req;
+ ssize_t ret;
+
+ /* enforce forwards compatibility on users */
+ if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE))
+ return -EINVAL;
+
+ req = io_get_req(ctx, state);
+ if (unlikely(!req))
+ return -EAGAIN;
+
+ req->rw.ki_filp = NULL;
+
+ ret = __io_submit_sqe(ctx, req, s, true, state);
+ if (ret == -EAGAIN) {
+ struct io_uring_sqe *sqe_copy;
+
+ sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
+ if (sqe_copy) {
+ struct async_list *list;
+
+ memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy));
+ s->sqe = sqe_copy;
+
+ memcpy(&req->submit, s, sizeof(*s));
+ list = io_async_list_from_sqe(ctx, s->sqe);
+ if (!io_add_to_prev_work(list, req)) {
+ if (list)
+ atomic_inc(&list->cnt);
+ INIT_WORK(&req->work, io_sq_wq_submit_work);
+ queue_work(ctx->sqo_wq, &req->work);
+ }
+ ret = 0;
+ }
+ }
+ if (ret)
+ io_free_req(req);
+
+ return ret;
+}
+
+/*
+ * Batched submission is done, ensure local IO is flushed out.
+ */
+static void io_submit_state_end(struct io_submit_state *state)
+{
+ blk_finish_plug(&state->plug);
+ io_file_put(state, NULL);
+ if (state->free_reqs)
+ kmem_cache_free_bulk(req_cachep, state->free_reqs,
+ &state->reqs[state->cur_req]);
+}
+
+/*
+ * Start submission side cache.
+ */
+static void io_submit_state_start(struct io_submit_state *state,
+ struct io_ring_ctx *ctx, unsigned max_ios)
+{
+ blk_start_plug(&state->plug);
+ state->free_reqs = 0;
+ state->file = NULL;
+ state->ios_left = max_ios;
+}
+
+static void io_commit_sqring(struct io_ring_ctx *ctx)
+{
+ struct io_sq_ring *ring = ctx->sq_ring;
+
+ if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) {
+ /*
+ * Ensure any loads from the SQEs are done at this point,
+ * since once we write the new head, the application could
+ * write new data to them.
+ */
+ smp_store_release(&ring->r.head, ctx->cached_sq_head);
+
+ /*
+ * write side barrier of head update, app has read side. See
+ * comment at the top of this file
+ */
+ smp_wmb();
+ }
+}
+
+/*
+ * Undo last io_get_sqring()
+ */
+static void io_drop_sqring(struct io_ring_ctx *ctx)
+{
+ ctx->cached_sq_head--;
+}
+
+/*
+ * Fetch an sqe, if one is available. Note that s->sqe will point to memory
+ * that is mapped by userspace. This means that care needs to be taken to
+ * ensure that reads are stable, as we cannot rely on userspace always
+ * being a good citizen. If members of the sqe are validated and then later
+ * used, it's important that those reads are done through READ_ONCE() to
+ * prevent a re-load down the line.
+ */
+static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
+{
+ struct io_sq_ring *ring = ctx->sq_ring;
+ unsigned head;
+
+ /*
+ * The cached sq head (or cq tail) serves two purposes:
+ *
+ * 1) allows us to batch the cost of updating the user visible
+ * head updates.
+ * 2) allows the kernel side to track the head on its own, even
+ * though the application is the one updating it.
+ */
+ head = ctx->cached_sq_head;
+ /* See comment at the top of this file */
+ smp_rmb();
+ if (head == READ_ONCE(ring->r.tail))
+ return false;
+
+ head = READ_ONCE(ring->array[head & ctx->sq_mask]);
+ if (head < ctx->sq_entries) {
+ s->index = head;
+ s->sqe = &ctx->sq_sqes[head];
+ ctx->cached_sq_head++;
+ return true;
+ }
+
+ /* drop invalid entries */
+ ctx->cached_sq_head++;
+ ring->dropped++;
+ /* See comment at the top of this file */
+ smp_wmb();
+ return false;
+}
+
+static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
+ unsigned int nr, bool has_user, bool mm_fault)
+{
+ struct io_submit_state state, *statep = NULL;
+ int ret, i, submitted = 0;
+
+ if (nr > IO_PLUG_THRESHOLD) {
+ io_submit_state_start(&state, ctx, nr);
+ statep = &state;
+ }
+
+ for (i = 0; i < nr; i++) {
+ if (unlikely(mm_fault)) {
+ ret = -EFAULT;
+ } else {
+ sqes[i].has_user = has_user;
+ sqes[i].needs_lock = true;
+ sqes[i].needs_fixed_file = true;
+ ret = io_submit_sqe(ctx, &sqes[i], statep);
+ }
+ if (!ret) {
+ submitted++;
+ continue;
+ }
+
+ io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret, 0);
+ }
+
+ if (statep)
+ io_submit_state_end(&state);
+
+ return submitted;
+}
+
+static int io_sq_thread(void *data)
+{
+ struct sqe_submit sqes[IO_IOPOLL_BATCH];
+ struct io_ring_ctx *ctx = data;
+ struct mm_struct *cur_mm = NULL;
+ mm_segment_t old_fs;
+ DEFINE_WAIT(wait);
+ unsigned inflight;
+ unsigned long timeout;
+
+ old_fs = get_fs();
+ set_fs(USER_DS);
+
+ timeout = inflight = 0;
+ while (!kthread_should_stop() && !ctx->sqo_stop) {
+ bool all_fixed, mm_fault = false;
+ int i;
+
+ if (inflight) {
+ unsigned nr_events = 0;
+
+ if (ctx->flags & IORING_SETUP_IOPOLL) {
+ /*
+ * We disallow the app entering submit/complete
+ * with polling, but we still need to lock the
+ * ring to prevent racing with polled issue
+ * that got punted to a workqueue.
+ */
+ mutex_lock(&ctx->uring_lock);
+ io_iopoll_check(ctx, &nr_events, 0);
+ mutex_unlock(&ctx->uring_lock);
+ } else {
+ /*
+ * Normal IO, just pretend everything completed.
+ * We don't have to poll completions for that.
+ */
+ nr_events = inflight;
+ }
+
+ inflight -= nr_events;
+ if (!inflight)
+ timeout = jiffies + ctx->sq_thread_idle;
+ }
+
+ if (!io_get_sqring(ctx, &sqes[0])) {
+ /*
+ * We're polling. If we're within the defined idle
+ * period, then let us spin without work before going
+ * to sleep.
+ */
+ if (inflight || !time_after(jiffies, timeout)) {
+ cpu_relax();
+ continue;
+ }
+
+ /*
+ * Drop cur_mm before scheduling, we can't hold it for
+ * long periods (or over schedule()). Do this before
+ * adding ourselves to the waitqueue, as the unuse/drop
+ * may sleep.
+ */
+ if (cur_mm) {
+ unuse_mm(cur_mm);
+ mmput(cur_mm);
+ cur_mm = NULL;
+ }
+
+ prepare_to_wait(&ctx->sqo_wait, &wait,
+ TASK_INTERRUPTIBLE);
+
+ /* Tell userspace we may need a wakeup call */
+ ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP;
+ smp_wmb();
+
+ if (!io_get_sqring(ctx, &sqes[0])) {
+ if (kthread_should_stop()) {
+ finish_wait(&ctx->sqo_wait, &wait);
+ break;
+ }
+ if (signal_pending(current))
+ flush_signals(current);
+ schedule();
+ finish_wait(&ctx->sqo_wait, &wait);
+
+ ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
+ smp_wmb();
+ continue;
+ }
+ finish_wait(&ctx->sqo_wait, &wait);
+
+ ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
+ smp_wmb();
+ }
+
+ i = 0;
+ all_fixed = true;
+ do {
+ if (all_fixed && io_sqe_needs_user(sqes[i].sqe))
+ all_fixed = false;
+
+ i++;
+ if (i == ARRAY_SIZE(sqes))
+ break;
+ } while (io_get_sqring(ctx, &sqes[i]));
+
+ /* Unless all new commands are FIXED regions, grab mm */
+ if (!all_fixed && !cur_mm) {
+ mm_fault = !mmget_not_zero(ctx->sqo_mm);
+ if (!mm_fault) {
+ use_mm(ctx->sqo_mm);
+ cur_mm = ctx->sqo_mm;
+ }
+ }
+
+ inflight += io_submit_sqes(ctx, sqes, i, cur_mm != NULL,
+ mm_fault);
+
+ /* Commit SQ ring head once we've consumed all SQEs */
+ io_commit_sqring(ctx);
+ }
+
+ set_fs(old_fs);
+ if (cur_mm) {
+ unuse_mm(cur_mm);
+ mmput(cur_mm);
+ }
+ return 0;
+}
+
+static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
+{
+ struct io_submit_state state, *statep = NULL;
+ int i, ret = 0, submit = 0;
+
+ if (to_submit > IO_PLUG_THRESHOLD) {
+ io_submit_state_start(&state, ctx, to_submit);
+ statep = &state;
+ }
+
+ for (i = 0; i < to_submit; i++) {
+ struct sqe_submit s;
+
+ if (!io_get_sqring(ctx, &s))
+ break;
+
+ s.has_user = true;
+ s.needs_lock = false;
+ s.needs_fixed_file = false;
+
+ ret = io_submit_sqe(ctx, &s, statep);
+ if (ret) {
+ io_drop_sqring(ctx);
+ break;
+ }
+
+ submit++;
+ }
+ io_commit_sqring(ctx);
+
+ if (statep)
+ io_submit_state_end(statep);
+
+ return submit ? submit : ret;
+}
+
+static unsigned io_cqring_events(struct io_cq_ring *ring)
+{
+ return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head);
+}
+
+/*
+ * Wait until events become available, if we don't already have some. The
+ * application must reap them itself, as they reside on the shared cq ring.
+ */
+static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
+ const sigset_t __user *sig, size_t sigsz)
+{
+ struct io_cq_ring *ring = ctx->cq_ring;
+ sigset_t ksigmask, sigsaved;
+ DEFINE_WAIT(wait);
+ int ret;
+
+ /* See comment at the top of this file */
+ smp_rmb();
+ if (io_cqring_events(ring) >= min_events)
+ return 0;
+
+ if (sig) {
+ ret = set_user_sigmask(sig, &ksigmask, &sigsaved, sigsz);
+ if (ret)
+ return ret;
+ }
+
+ do {
+ prepare_to_wait(&ctx->wait, &wait, TASK_INTERRUPTIBLE);
+
+ ret = 0;
+ /* See comment at the top of this file */
+ smp_rmb();
+ if (io_cqring_events(ring) >= min_events)
+ break;
+
+ schedule();
+
+ ret = -EINTR;
+ if (signal_pending(current))
+ break;
+ } while (1);
+
+ finish_wait(&ctx->wait, &wait);
+
+ if (sig)
+ restore_user_sigmask(sig, &sigsaved);
+
+ return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
+}
+
+static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
+{
+#if defined(CONFIG_UNIX)
+ if (ctx->ring_sock) {
+ struct sock *sock = ctx->ring_sock->sk;
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
+ kfree_skb(skb);
+ }
+#else
+ int i;
+
+ for (i = 0; i < ctx->nr_user_files; i++)
+ fput(ctx->user_files[i]);
+#endif
+}
+
+static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
+{
+ if (!ctx->user_files)
+ return -ENXIO;
+
+ __io_sqe_files_unregister(ctx);
+ kfree(ctx->user_files);
+ ctx->user_files = NULL;
+ ctx->nr_user_files = 0;
+ return 0;
+}
+
+static void io_sq_thread_stop(struct io_ring_ctx *ctx)
+{
+ if (ctx->sqo_thread) {
+ ctx->sqo_stop = 1;
+ mb();
+ kthread_stop(ctx->sqo_thread);
+ ctx->sqo_thread = NULL;
+ }
+}
+
+static void io_finish_async(struct io_ring_ctx *ctx)
+{
+ io_sq_thread_stop(ctx);
+
+ if (ctx->sqo_wq) {
+ destroy_workqueue(ctx->sqo_wq);
+ ctx->sqo_wq = NULL;
+ }
+}
+
+#if defined(CONFIG_UNIX)
+static void io_destruct_skb(struct sk_buff *skb)
+{
+ struct io_ring_ctx *ctx = skb->sk->sk_user_data;
+
+ io_finish_async(ctx);
+ unix_destruct_scm(skb);
+}
+
+/*
+ * Ensure the UNIX gc is aware of our file set, so we are certain that
+ * the io_uring can be safely unregistered on process exit, even if we have
+ * loops in the file referencing.
+ */
+static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
+{
+ struct sock *sk = ctx->ring_sock->sk;
+ struct scm_fp_list *fpl;
+ struct sk_buff *skb;
+ int i;
+
+ if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
+ unsigned long inflight = ctx->user->unix_inflight + nr;
+
+ if (inflight > task_rlimit(current, RLIMIT_NOFILE))
+ return -EMFILE;
+ }
+
+ fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
+ if (!fpl)
+ return -ENOMEM;
+
+ skb = alloc_skb(0, GFP_KERNEL);
+ if (!skb) {
+ kfree(fpl);
+ return -ENOMEM;
+ }
+
+ skb->sk = sk;
+ skb->destructor = io_destruct_skb;
+
+ fpl->user = get_uid(ctx->user);
+ for (i = 0; i < nr; i++) {
+ fpl->fp[i] = get_file(ctx->user_files[i + offset]);
+ unix_inflight(fpl->user, fpl->fp[i]);
+ }
+
+ fpl->max = fpl->count = nr;
+ UNIXCB(skb).fp = fpl;
+ refcount_add(skb->truesize, &sk->sk_wmem_alloc);
+ skb_queue_head(&sk->sk_receive_queue, skb);
+
+ for (i = 0; i < nr; i++)
+ fput(fpl->fp[i]);
+
+ return 0;
+}
+
+/*
+ * If UNIX sockets are enabled, fd passing can cause a reference cycle which
+ * causes regular reference counting to break down. We rely on the UNIX
+ * garbage collection to take care of this problem for us.
+ */
+static int io_sqe_files_scm(struct io_ring_ctx *ctx)
+{
+ unsigned left, total;
+ int ret = 0;
+
+ total = 0;
+ left = ctx->nr_user_files;
+ while (left) {
+ unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
+ int ret;
+
+ ret = __io_sqe_files_scm(ctx, this_files, total);
+ if (ret)
+ break;
+ left -= this_files;
+ total += this_files;
+ }
+
+ if (!ret)
+ return 0;
+
+ while (total < ctx->nr_user_files) {
+ fput(ctx->user_files[total]);
+ total++;
+ }
+
+ return ret;
+}
+#else
+static int io_sqe_files_scm(struct io_ring_ctx *ctx)
+{
+ return 0;
+}
+#endif
+
+static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned nr_args)
+{
+ __s32 __user *fds = (__s32 __user *) arg;
+ int fd, ret = 0;
+ unsigned i;
+
+ if (ctx->user_files)
+ return -EBUSY;
+ if (!nr_args)
+ return -EINVAL;
+ if (nr_args > IORING_MAX_FIXED_FILES)
+ return -EMFILE;
+
+ ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL);
+ if (!ctx->user_files)
+ return -ENOMEM;
+
+ for (i = 0; i < nr_args; i++) {
+ ret = -EFAULT;
+ if (copy_from_user(&fd, &fds[i], sizeof(fd)))
+ break;
+
+ ctx->user_files[i] = fget(fd);
+
+ ret = -EBADF;
+ if (!ctx->user_files[i])
+ break;
+ /*
+ * Don't allow io_uring instances to be registered. If UNIX
+ * isn't enabled, then this causes a reference cycle and this
+ * instance can never get freed. If UNIX is enabled we'll
+ * handle it just fine, but there's still no point in allowing
+ * a ring fd as it doesn't support regular read/write anyway.
+ */
+ if (ctx->user_files[i]->f_op == &io_uring_fops) {
+ fput(ctx->user_files[i]);
+ break;
+ }
+ ctx->nr_user_files++;
+ ret = 0;
+ }
+
+ if (ret) {
+ for (i = 0; i < ctx->nr_user_files; i++)
+ fput(ctx->user_files[i]);
+
+ kfree(ctx->user_files);
+ ctx->nr_user_files = 0;
+ return ret;
+ }
+
+ ret = io_sqe_files_scm(ctx);
+ if (ret)
+ io_sqe_files_unregister(ctx);
+
+ return ret;
+}
+
+static int io_sq_offload_start(struct io_ring_ctx *ctx,
+ struct io_uring_params *p)
+{
+ int ret;
+
+ init_waitqueue_head(&ctx->sqo_wait);
+ mmgrab(current->mm);
+ ctx->sqo_mm = current->mm;
+
+ ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
+ if (!ctx->sq_thread_idle)
+ ctx->sq_thread_idle = HZ;
+
+ ret = -EINVAL;
+ if (!cpu_possible(p->sq_thread_cpu))
+ goto err;
+
+ if (ctx->flags & IORING_SETUP_SQPOLL) {
+ if (p->flags & IORING_SETUP_SQ_AFF) {
+ int cpu;
+
+ cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS);
+ ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
+ ctx, cpu,
+ "io_uring-sq");
+ } else {
+ ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
+ "io_uring-sq");
+ }
+ if (IS_ERR(ctx->sqo_thread)) {
+ ret = PTR_ERR(ctx->sqo_thread);
+ ctx->sqo_thread = NULL;
+ goto err;
+ }
+ wake_up_process(ctx->sqo_thread);
+ } else if (p->flags & IORING_SETUP_SQ_AFF) {
+ /* Can't have SQ_AFF without SQPOLL */
+ ret = -EINVAL;
+ goto err;
+ }
+
+ /* Do QD, or 2 * CPUS, whatever is smallest */
+ ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE,
+ min(ctx->sq_entries - 1, 2 * num_online_cpus()));
+ if (!ctx->sqo_wq) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ return 0;
+err:
+ io_sq_thread_stop(ctx);
+ mmdrop(ctx->sqo_mm);
+ ctx->sqo_mm = NULL;
+ return ret;
+}
+
+static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
+{
+ atomic_long_sub(nr_pages, &user->locked_vm);
+}
+
+static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
+{
+ unsigned long page_limit, cur_pages, new_pages;
+
+ /* Don't allow more pages than we can safely lock */
+ page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+ do {
+ cur_pages = atomic_long_read(&user->locked_vm);
+ new_pages = cur_pages + nr_pages;
+ if (new_pages > page_limit)
+ return -ENOMEM;
+ } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
+ new_pages) != cur_pages);
+
+ return 0;
+}
+
+static void io_mem_free(void *ptr)
+{
+ struct page *page = virt_to_head_page(ptr);
+
+ if (put_page_testzero(page))
+ free_compound_page(page);
+}
+
+static void *io_mem_alloc(size_t size)
+{
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
+ __GFP_NORETRY;
+
+ return (void *) __get_free_pages(gfp_flags, get_order(size));
+}
+
+static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
+{
+ struct io_sq_ring *sq_ring;
+ struct io_cq_ring *cq_ring;
+ size_t bytes;
+
+ bytes = struct_size(sq_ring, array, sq_entries);
+ bytes += array_size(sizeof(struct io_uring_sqe), sq_entries);
+ bytes += struct_size(cq_ring, cqes, cq_entries);
+
+ return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+}
+
+static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
+{
+ int i, j;
+
+ if (!ctx->user_bufs)
+ return -ENXIO;
+
+ for (i = 0; i < ctx->nr_user_bufs; i++) {
+ struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+
+ for (j = 0; j < imu->nr_bvecs; j++)
+ put_page(imu->bvec[j].bv_page);
+
+ if (ctx->account_mem)
+ io_unaccount_mem(ctx->user, imu->nr_bvecs);
+ kfree(imu->bvec);
+ imu->nr_bvecs = 0;
+ }
+
+ kfree(ctx->user_bufs);
+ ctx->user_bufs = NULL;
+ ctx->nr_user_bufs = 0;
+ return 0;
+}
+
+static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
+ void __user *arg, unsigned index)
+{
+ struct iovec __user *src;
+
+#ifdef CONFIG_COMPAT
+ if (ctx->compat) {
+ struct compat_iovec __user *ciovs;
+ struct compat_iovec ciov;
+
+ ciovs = (struct compat_iovec __user *) arg;
+ if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
+ return -EFAULT;
+
+ dst->iov_base = (void __user *) (unsigned long) ciov.iov_base;
+ dst->iov_len = ciov.iov_len;
+ return 0;
+ }
+#endif
+ src = (struct iovec __user *) arg;
+ if (copy_from_user(dst, &src[index], sizeof(*dst)))
+ return -EFAULT;
+ return 0;
+}
+
+static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned nr_args)
+{
+ struct vm_area_struct **vmas = NULL;
+ struct page **pages = NULL;
+ int i, j, got_pages = 0;
+ int ret = -EINVAL;
+
+ if (ctx->user_bufs)
+ return -EBUSY;
+ if (!nr_args || nr_args > UIO_MAXIOV)
+ return -EINVAL;
+
+ ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
+ GFP_KERNEL);
+ if (!ctx->user_bufs)
+ return -ENOMEM;
+
+ for (i = 0; i < nr_args; i++) {
+ struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+ unsigned long off, start, end, ubuf;
+ int pret, nr_pages;
+ struct iovec iov;
+ size_t size;
+
+ ret = io_copy_iov(ctx, &iov, arg, i);
+ if (ret)
+ break;
+
+ /*
+ * Don't impose further limits on the size and buffer
+ * constraints here, we'll -EINVAL later when IO is
+ * submitted if they are wrong.
+ */
+ ret = -EFAULT;
+ if (!iov.iov_base || !iov.iov_len)
+ goto err;
+
+ /* arbitrary limit, but we need something */
+ if (iov.iov_len > SZ_1G)
+ goto err;
+
+ ubuf = (unsigned long) iov.iov_base;
+ end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ start = ubuf >> PAGE_SHIFT;
+ nr_pages = end - start;
+
+ if (ctx->account_mem) {
+ ret = io_account_mem(ctx->user, nr_pages);
+ if (ret)
+ goto err;
+ }
+
+ ret = 0;
+ if (!pages || nr_pages > got_pages) {
+ kfree(vmas);
+ kfree(pages);
+ pages = kmalloc_array(nr_pages, sizeof(struct page *),
+ GFP_KERNEL);
+ vmas = kmalloc_array(nr_pages,
+ sizeof(struct vm_area_struct *),
+ GFP_KERNEL);
+ if (!pages || !vmas) {
+ ret = -ENOMEM;
+ if (ctx->account_mem)
+ io_unaccount_mem(ctx->user, nr_pages);
+ goto err;
+ }
+ got_pages = nr_pages;
+ }
+
+ imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec),
+ GFP_KERNEL);
+ ret = -ENOMEM;
+ if (!imu->bvec) {
+ if (ctx->account_mem)
+ io_unaccount_mem(ctx->user, nr_pages);
+ goto err;
+ }
+
+ ret = 0;
+ down_read(&current->mm->mmap_sem);
+ pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE,
+ pages, vmas);
+ if (pret == nr_pages) {
+ /* don't support file backed memory */
+ for (j = 0; j < nr_pages; j++) {
+ struct vm_area_struct *vma = vmas[j];
+
+ if (vma->vm_file &&
+ !is_file_hugepages(vma->vm_file)) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+ }
+ } else {
+ ret = pret < 0 ? pret : -EFAULT;
+ }
+ up_read(&current->mm->mmap_sem);
+ if (ret) {
+ /*
+ * if we did partial map, or found file backed vmas,
+ * release any pages we did get
+ */
+ if (pret > 0) {
+ for (j = 0; j < pret; j++)
+ put_page(pages[j]);
+ }
+ if (ctx->account_mem)
+ io_unaccount_mem(ctx->user, nr_pages);
+ goto err;
+ }
+
+ off = ubuf & ~PAGE_MASK;
+ size = iov.iov_len;
+ for (j = 0; j < nr_pages; j++) {
+ size_t vec_len;
+
+ vec_len = min_t(size_t, size, PAGE_SIZE - off);
+ imu->bvec[j].bv_page = pages[j];
+ imu->bvec[j].bv_len = vec_len;
+ imu->bvec[j].bv_offset = off;
+ off = 0;
+ size -= vec_len;
+ }
+ /* store original address for later verification */
+ imu->ubuf = ubuf;
+ imu->len = iov.iov_len;
+ imu->nr_bvecs = nr_pages;
+
+ ctx->nr_user_bufs++;
+ }
+ kfree(pages);
+ kfree(vmas);
+ return 0;
+err:
+ kfree(pages);
+ kfree(vmas);
+ io_sqe_buffer_unregister(ctx);
+ return ret;
+}
+
+static void io_ring_ctx_free(struct io_ring_ctx *ctx)
+{
+ io_finish_async(ctx);
+ if (ctx->sqo_mm)
+ mmdrop(ctx->sqo_mm);
+
+ io_iopoll_reap_events(ctx);
+ io_sqe_buffer_unregister(ctx);
+ io_sqe_files_unregister(ctx);
+
+#if defined(CONFIG_UNIX)
+ if (ctx->ring_sock)
+ sock_release(ctx->ring_sock);
+#endif
+
+ io_mem_free(ctx->sq_ring);
+ io_mem_free(ctx->sq_sqes);
+ io_mem_free(ctx->cq_ring);
+
+ percpu_ref_exit(&ctx->refs);
+ if (ctx->account_mem)
+ io_unaccount_mem(ctx->user,
+ ring_pages(ctx->sq_entries, ctx->cq_entries));
+ free_uid(ctx->user);
+ kfree(ctx);
+}
+
+static __poll_t io_uring_poll(struct file *file, poll_table *wait)
+{
+ struct io_ring_ctx *ctx = file->private_data;
+ __poll_t mask = 0;
+
+ poll_wait(file, &ctx->cq_wait, wait);
+ /* See comment at the top of this file */
+ smp_rmb();
+ if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head)
+ mask |= EPOLLOUT | EPOLLWRNORM;
+ if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail)
+ mask |= EPOLLIN | EPOLLRDNORM;
+
+ return mask;
+}
+
+static int io_uring_fasync(int fd, struct file *file, int on)
+{
+ struct io_ring_ctx *ctx = file->private_data;
+
+ return fasync_helper(fd, file, on, &ctx->cq_fasync);
+}
+
+static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
+{
+ mutex_lock(&ctx->uring_lock);
+ percpu_ref_kill(&ctx->refs);
+ mutex_unlock(&ctx->uring_lock);
+
+ io_poll_remove_all(ctx);
+ io_iopoll_reap_events(ctx);
+ wait_for_completion(&ctx->ctx_done);
+ io_ring_ctx_free(ctx);
+}
+
+static int io_uring_release(struct inode *inode, struct file *file)
+{
+ struct io_ring_ctx *ctx = file->private_data;
+
+ file->private_data = NULL;
+ io_ring_ctx_wait_and_kill(ctx);
+ return 0;
+}
+
+static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
+ unsigned long sz = vma->vm_end - vma->vm_start;
+ struct io_ring_ctx *ctx = file->private_data;
+ unsigned long pfn;
+ struct page *page;
+ void *ptr;
+
+ switch (offset) {
+ case IORING_OFF_SQ_RING:
+ ptr = ctx->sq_ring;
+ break;
+ case IORING_OFF_SQES:
+ ptr = ctx->sq_sqes;
+ break;
+ case IORING_OFF_CQ_RING:
+ ptr = ctx->cq_ring;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ page = virt_to_head_page(ptr);
+ if (sz > (PAGE_SIZE << compound_order(page)))
+ return -EINVAL;
+
+ pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
+ return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
+}
+
+SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
+ u32, min_complete, u32, flags, const sigset_t __user *, sig,
+ size_t, sigsz)
+{
+ struct io_ring_ctx *ctx;
+ long ret = -EBADF;
+ int submitted = 0;
+ struct fd f;
+
+ if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
+ return -EINVAL;
+
+ f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+
+ ret = -EOPNOTSUPP;
+ if (f.file->f_op != &io_uring_fops)
+ goto out_fput;
+
+ ret = -ENXIO;
+ ctx = f.file->private_data;
+ if (!percpu_ref_tryget(&ctx->refs))
+ goto out_fput;
+
+ /*
+ * For SQ polling, the thread will do all submissions and completions.
+ * Just return the requested submit count, and wake the thread if
+ * we were asked to.
+ */
+ if (ctx->flags & IORING_SETUP_SQPOLL) {
+ if (flags & IORING_ENTER_SQ_WAKEUP)
+ wake_up(&ctx->sqo_wait);
+ submitted = to_submit;
+ goto out_ctx;
+ }
+
+ ret = 0;
+ if (to_submit) {
+ to_submit = min(to_submit, ctx->sq_entries);
+
+ mutex_lock(&ctx->uring_lock);
+ submitted = io_ring_submit(ctx, to_submit);
+ mutex_unlock(&ctx->uring_lock);
+
+ if (submitted < 0)
+ goto out_ctx;
+ }
+ if (flags & IORING_ENTER_GETEVENTS) {
+ unsigned nr_events = 0;
+
+ min_complete = min(min_complete, ctx->cq_entries);
+
+ /*
+ * The application could have included the 'to_submit' count
+ * in how many events it wanted to wait for. If we failed to
+ * submit the desired count, we may need to adjust the number
+ * of events to poll/wait for.
+ */
+ if (submitted < to_submit)
+ min_complete = min_t(unsigned, submitted, min_complete);
+
+ if (ctx->flags & IORING_SETUP_IOPOLL) {
+ mutex_lock(&ctx->uring_lock);
+ ret = io_iopoll_check(ctx, &nr_events, min_complete);
+ mutex_unlock(&ctx->uring_lock);
+ } else {
+ ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
+ }
+ }
+
+out_ctx:
+ io_ring_drop_ctx_refs(ctx, 1);
+out_fput:
+ fdput(f);
+ return submitted ? submitted : ret;
+}
+
+static const struct file_operations io_uring_fops = {
+ .release = io_uring_release,
+ .mmap = io_uring_mmap,
+ .poll = io_uring_poll,
+ .fasync = io_uring_fasync,
+};
+
+static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
+ struct io_uring_params *p)
+{
+ struct io_sq_ring *sq_ring;
+ struct io_cq_ring *cq_ring;
+ size_t size;
+
+ sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries));
+ if (!sq_ring)
+ return -ENOMEM;
+
+ ctx->sq_ring = sq_ring;
+ sq_ring->ring_mask = p->sq_entries - 1;
+ sq_ring->ring_entries = p->sq_entries;
+ ctx->sq_mask = sq_ring->ring_mask;
+ ctx->sq_entries = sq_ring->ring_entries;
+
+ size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
+ if (size == SIZE_MAX)
+ return -EOVERFLOW;
+
+ ctx->sq_sqes = io_mem_alloc(size);
+ if (!ctx->sq_sqes) {
+ io_mem_free(ctx->sq_ring);
+ return -ENOMEM;
+ }
+
+ cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries));
+ if (!cq_ring) {
+ io_mem_free(ctx->sq_ring);
+ io_mem_free(ctx->sq_sqes);
+ return -ENOMEM;
+ }
+
+ ctx->cq_ring = cq_ring;
+ cq_ring->ring_mask = p->cq_entries - 1;
+ cq_ring->ring_entries = p->cq_entries;
+ ctx->cq_mask = cq_ring->ring_mask;
+ ctx->cq_entries = cq_ring->ring_entries;
+ return 0;
+}
+
+/*
+ * Allocate an anonymous fd, this is what constitutes the application
+ * visible backing of an io_uring instance. The application mmaps this
+ * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
+ * we have to tie this fd to a socket for file garbage collection purposes.
+ */
+static int io_uring_get_fd(struct io_ring_ctx *ctx)
+{
+ struct file *file;
+ int ret;
+
+#if defined(CONFIG_UNIX)
+ ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
+ &ctx->ring_sock);
+ if (ret)
+ return ret;
+#endif
+
+ ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ if (ret < 0)
+ goto err;
+
+ file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
+ O_RDWR | O_CLOEXEC);
+ if (IS_ERR(file)) {
+ put_unused_fd(ret);
+ ret = PTR_ERR(file);
+ goto err;
+ }
+
+#if defined(CONFIG_UNIX)
+ ctx->ring_sock->file = file;
+ ctx->ring_sock->sk->sk_user_data = ctx;
+#endif
+ fd_install(ret, file);
+ return ret;
+err:
+#if defined(CONFIG_UNIX)
+ sock_release(ctx->ring_sock);
+ ctx->ring_sock = NULL;
+#endif
+ return ret;
+}
+
+static int io_uring_create(unsigned entries, struct io_uring_params *p)
+{
+ struct user_struct *user = NULL;
+ struct io_ring_ctx *ctx;
+ bool account_mem;
+ int ret;
+
+ if (!entries || entries > IORING_MAX_ENTRIES)
+ return -EINVAL;
+
+ /*
+ * Use twice as many entries for the CQ ring. It's possible for the
+ * application to drive a higher depth than the size of the SQ ring,
+ * since the sqes are only used at submission time. This allows for
+ * some flexibility in overcommitting a bit.
+ */
+ p->sq_entries = roundup_pow_of_two(entries);
+ p->cq_entries = 2 * p->sq_entries;
+
+ user = get_uid(current_user());
+ account_mem = !capable(CAP_IPC_LOCK);
+
+ if (account_mem) {
+ ret = io_account_mem(user,
+ ring_pages(p->sq_entries, p->cq_entries));
+ if (ret) {
+ free_uid(user);
+ return ret;
+ }
+ }
+
+ ctx = io_ring_ctx_alloc(p);
+ if (!ctx) {
+ if (account_mem)
+ io_unaccount_mem(user, ring_pages(p->sq_entries,
+ p->cq_entries));
+ free_uid(user);
+ return -ENOMEM;
+ }
+ ctx->compat = in_compat_syscall();
+ ctx->account_mem = account_mem;
+ ctx->user = user;
+
+ ret = io_allocate_scq_urings(ctx, p);
+ if (ret)
+ goto err;
+
+ ret = io_sq_offload_start(ctx, p);
+ if (ret)
+ goto err;
+
+ ret = io_uring_get_fd(ctx);
+ if (ret < 0)
+ goto err;
+
+ memset(&p->sq_off, 0, sizeof(p->sq_off));
+ p->sq_off.head = offsetof(struct io_sq_ring, r.head);
+ p->sq_off.tail = offsetof(struct io_sq_ring, r.tail);
+ p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask);
+ p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries);
+ p->sq_off.flags = offsetof(struct io_sq_ring, flags);
+ p->sq_off.dropped = offsetof(struct io_sq_ring, dropped);
+ p->sq_off.array = offsetof(struct io_sq_ring, array);
+
+ memset(&p->cq_off, 0, sizeof(p->cq_off));
+ p->cq_off.head = offsetof(struct io_cq_ring, r.head);
+ p->cq_off.tail = offsetof(struct io_cq_ring, r.tail);
+ p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask);
+ p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries);
+ p->cq_off.overflow = offsetof(struct io_cq_ring, overflow);
+ p->cq_off.cqes = offsetof(struct io_cq_ring, cqes);
+ return ret;
+err:
+ io_ring_ctx_wait_and_kill(ctx);
+ return ret;
+}
+
+/*
+ * Sets up an aio uring context, and returns the fd. Applications asks for a
+ * ring size, we return the actual sq/cq ring sizes (among other things) in the
+ * params structure passed in.
+ */
+static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
+{
+ struct io_uring_params p;
+ long ret;
+ int i;
+
+ if (copy_from_user(&p, params, sizeof(p)))
+ return -EFAULT;
+ for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
+ if (p.resv[i])
+ return -EINVAL;
+ }
+
+ if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
+ IORING_SETUP_SQ_AFF))
+ return -EINVAL;
+
+ ret = io_uring_create(entries, &p);
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user(params, &p, sizeof(p)))
+ return -EFAULT;
+
+ return ret;
+}
+
+SYSCALL_DEFINE2(io_uring_setup, u32, entries,
+ struct io_uring_params __user *, params)
+{
+ return io_uring_setup(entries, params);
+}
+
+static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
+ void __user *arg, unsigned nr_args)
+{
+ int ret;
+
+ percpu_ref_kill(&ctx->refs);
+ wait_for_completion(&ctx->ctx_done);
+
+ switch (opcode) {
+ case IORING_REGISTER_BUFFERS:
+ ret = io_sqe_buffer_register(ctx, arg, nr_args);
+ break;
+ case IORING_UNREGISTER_BUFFERS:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_sqe_buffer_unregister(ctx);
+ break;
+ case IORING_REGISTER_FILES:
+ ret = io_sqe_files_register(ctx, arg, nr_args);
+ break;
+ case IORING_UNREGISTER_FILES:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_sqe_files_unregister(ctx);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ /* bring the ctx back to life */
+ reinit_completion(&ctx->ctx_done);
+ percpu_ref_reinit(&ctx->refs);
+ return ret;
+}
+
+SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
+ void __user *, arg, unsigned int, nr_args)
+{
+ struct io_ring_ctx *ctx;
+ long ret = -EBADF;
+ struct fd f;
+
+ f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+
+ ret = -EOPNOTSUPP;
+ if (f.file->f_op != &io_uring_fops)
+ goto out_fput;
+
+ ctx = f.file->private_data;
+
+ mutex_lock(&ctx->uring_lock);
+ ret = __io_uring_register(ctx, opcode, arg, nr_args);
+ mutex_unlock(&ctx->uring_lock);
+out_fput:
+ fdput(f);
+ return ret;
+}
+
+static int __init io_uring_init(void)
+{
+ req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+ return 0;
+};
+__initcall(io_uring_init);
diff --git a/fs/iomap.c b/fs/iomap.c
index a3088fae567b..97cb9d486a7d 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -116,6 +116,12 @@ iomap_page_create(struct inode *inode, struct page *page)
atomic_set(&iop->read_count, 0);
atomic_set(&iop->write_count, 0);
bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);
+
+ /*
+ * migrate_page_move_mapping() assumes that pages with private data have
+ * their count elevated by 1.
+ */
+ get_page(page);
set_page_private(page, (unsigned long)iop);
SetPagePrivate(page);
return iop;
@@ -132,6 +138,7 @@ iomap_page_release(struct page *page)
WARN_ON_ONCE(atomic_read(&iop->write_count));
ClearPagePrivate(page);
set_page_private(page, 0);
+ put_page(page);
kfree(iop);
}
@@ -267,8 +274,9 @@ iomap_read_end_io(struct bio *bio)
int error = blk_status_to_errno(bio->bi_status);
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
iomap_read_page_end_io(bvec, error);
bio_put(bio);
}
@@ -317,7 +325,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
*/
sector = iomap_sector(iomap, pos);
if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
- if (__bio_try_merge_page(ctx->bio, page, plen, poff))
+ if (__bio_try_merge_page(ctx->bio, page, plen, poff, true))
goto done;
is_contig = true;
}
@@ -348,7 +356,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
ctx->bio->bi_end_io = iomap_read_end_io;
}
- __bio_add_page(ctx->bio, page, plen, poff);
+ bio_add_page(ctx->bio, page, plen, poff);
done:
/*
* Move the caller beyond our range so that it keeps making progress.
@@ -569,8 +577,10 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage,
if (page_has_private(page)) {
ClearPagePrivate(page);
+ get_page(newpage);
set_page_private(newpage, page_private(page));
set_page_private(page, 0);
+ put_page(page);
SetPagePrivate(newpage);
}
@@ -1454,6 +1464,28 @@ struct iomap_dio {
};
};
+int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
+{
+ struct request_queue *q = READ_ONCE(kiocb->private);
+
+ if (!q)
+ return 0;
+ return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
+}
+EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
+
+static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap,
+ struct bio *bio)
+{
+ atomic_inc(&dio->ref);
+
+ if (dio->iocb->ki_flags & IOCB_HIPRI)
+ bio_set_polled(bio, dio->iocb);
+
+ dio->submit.last_queue = bdev_get_queue(iomap->bdev);
+ dio->submit.cookie = submit_bio(bio);
+}
+
static ssize_t iomap_dio_complete(struct iomap_dio *dio)
{
struct kiocb *iocb = dio->iocb;
@@ -1559,14 +1591,15 @@ static void iomap_dio_bio_end_io(struct bio *bio)
} else {
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
put_page(bvec->bv_page);
bio_put(bio);
}
}
-static blk_qc_t
+static void
iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
unsigned len)
{
@@ -1580,15 +1613,10 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
- if (dio->iocb->ki_flags & IOCB_HIPRI)
- flags |= REQ_HIPRI;
-
get_page(page);
__bio_add_page(bio, page, len, 0);
bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
-
- atomic_inc(&dio->ref);
- return submit_bio(bio);
+ iomap_dio_submit_bio(dio, iomap, bio);
}
static loff_t
@@ -1691,9 +1719,6 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
bio_set_pages_dirty(bio);
}
- if (dio->iocb->ki_flags & IOCB_HIPRI)
- bio->bi_opf |= REQ_HIPRI;
-
iov_iter_advance(dio->submit.iter, n);
dio->size += n;
@@ -1701,11 +1726,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
copied += n;
nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
-
- atomic_inc(&dio->ref);
-
- dio->submit.last_queue = bdev_get_queue(iomap->bdev);
- dio->submit.cookie = submit_bio(bio);
+ iomap_dio_submit_bio(dio, iomap, bio);
} while (nr_pages);
/*
@@ -1804,6 +1825,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
loff_t pos = iocb->ki_pos, start = pos;
loff_t end = iocb->ki_pos + count - 1, ret = 0;
unsigned int flags = IOMAP_DIRECT;
+ bool wait_for_completion = is_sync_kiocb(iocb);
struct blk_plug plug;
struct iomap_dio *dio;
@@ -1823,7 +1845,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->end_io = end_io;
dio->error = 0;
dio->flags = 0;
- dio->wait_for_completion = is_sync_kiocb(iocb);
dio->submit.iter = iter;
dio->submit.waiter = current;
@@ -1878,7 +1899,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio_warn_stale_pagecache(iocb->ki_filp);
ret = 0;
- if (iov_iter_rw(iter) == WRITE && !dio->wait_for_completion &&
+ if (iov_iter_rw(iter) == WRITE && !wait_for_completion &&
!inode->i_sb->s_dio_done_wq) {
ret = sb_init_dio_done_wq(inode->i_sb);
if (ret < 0)
@@ -1894,7 +1915,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (ret <= 0) {
/* magic error code to fall back to buffered I/O */
if (ret == -ENOTBLK) {
- dio->wait_for_completion = true;
+ wait_for_completion = true;
ret = 0;
}
break;
@@ -1916,8 +1937,27 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (dio->flags & IOMAP_DIO_WRITE_FUA)
dio->flags &= ~IOMAP_DIO_NEED_SYNC;
+ WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
+ WRITE_ONCE(iocb->private, dio->submit.last_queue);
+
+ /*
+ * We are about to drop our additional submission reference, which
+ * might be the last reference to the dio. There are three three
+ * different ways we can progress here:
+ *
+ * (a) If this is the last reference we will always complete and free
+ * the dio ourselves.
+ * (b) If this is not the last reference, and we serve an asynchronous
+ * iocb, we must never touch the dio after the decrement, the
+ * I/O completion handler will complete and free it.
+ * (c) If this is not the last reference, but we serve a synchronous
+ * iocb, the I/O completion handler will wake us up on the drop
+ * of the final reference, and we will complete and free it here
+ * after we got woken by the I/O completion handler.
+ */
+ dio->wait_for_completion = wait_for_completion;
if (!atomic_dec_and_test(&dio->ref)) {
- if (!dio->wait_for_completion)
+ if (!wait_for_completion)
return -EIOCBQUEUED;
for (;;) {
@@ -1934,9 +1974,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
__set_current_state(TASK_RUNNING);
}
- ret = iomap_dio_complete(dio);
-
- return ret;
+ return iomap_dio_complete(dio);
out_free_dio:
kfree(dio);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 4ca0b5c18192..b84d635567d3 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -536,8 +536,8 @@ void kernfs_put(struct kernfs_node *kn)
security_release_secctx(kn->iattr->ia_secdata,
kn->iattr->ia_secdata_len);
simple_xattrs_free(&kn->iattr->xattrs);
+ kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
}
- kfree(kn->iattr);
spin_lock(&kernfs_idr_lock);
idr_remove(&root->ino_idr, kn->id.ino);
spin_unlock(&kernfs_idr_lock);
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index f8d5021a652e..ae948aaa4c53 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -832,26 +832,35 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
* to see if it supports poll (Neither 'poll' nor 'select' return
* an appropriate error code). When in doubt, set a suitable timeout value.
*/
+__poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait)
+{
+ struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry);
+ struct kernfs_open_node *on = kn->attr.open;
+
+ poll_wait(of->file, &on->poll, wait);
+
+ if (of->event != atomic_read(&on->event))
+ return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
+
+ return DEFAULT_POLLMASK;
+}
+
static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
{
struct kernfs_open_file *of = kernfs_of(filp);
struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry);
- struct kernfs_open_node *on = kn->attr.open;
+ __poll_t ret;
if (!kernfs_get_active(kn))
- goto trigger;
+ return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
- poll_wait(filp, &on->poll, wait);
+ if (kn->attr.ops->poll)
+ ret = kn->attr.ops->poll(of, wait);
+ else
+ ret = kernfs_generic_poll(of, wait);
kernfs_put_active(kn);
-
- if (of->event != atomic_read(&on->event))
- goto trigger;
-
- return DEFAULT_POLLMASK;
-
- trigger:
- return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
+ return ret;
}
static void kernfs_notify_workfn(struct work_struct *work)
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 80cebcd94c90..0c1fd945ce42 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -42,7 +42,7 @@ static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
if (kn->iattr)
goto out_unlock;
- kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL);
+ kn->iattr = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL);
if (!kn->iattr)
goto out_unlock;
iattrs = &kn->iattr->ia_iattr;
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 3d83b114bb08..dba810cd83b1 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -78,7 +78,7 @@ static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry)
}
extern const struct super_operations kernfs_sops;
-extern struct kmem_cache *kernfs_node_cache;
+extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
/*
* inode.c
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index fdf527b6d79c..f3ac352699cf 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -20,7 +20,7 @@
#include "kernfs-internal.h"
-struct kmem_cache *kernfs_node_cache;
+struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data)
{
@@ -196,8 +196,10 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
return dentry;
knparent = find_next_ancestor(kn, NULL);
- if (WARN_ON(!knparent))
+ if (WARN_ON(!knparent)) {
+ dput(dentry);
return ERR_PTR(-EINVAL);
+ }
do {
struct dentry *dtmp;
@@ -206,8 +208,10 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
if (kn == knparent)
return dentry;
kntmp = find_next_ancestor(kn, knparent);
- if (WARN_ON(!kntmp))
+ if (WARN_ON(!kntmp)) {
+ dput(dentry);
return ERR_PTR(-EINVAL);
+ }
dtmp = lookup_one_len_unlocked(kntmp->name, dentry,
strlen(kntmp->name));
dput(dentry);
@@ -417,4 +421,9 @@ void __init kernfs_init(void)
0,
SLAB_PANIC | SLAB_TYPESAFE_BY_RCU,
NULL);
+
+ /* Creates slab cache for kernfs inode attributes */
+ kernfs_iattrs_cache = kmem_cache_create("kernfs_iattrs_cache",
+ sizeof(struct kernfs_iattrs),
+ 0, SLAB_PANIC, NULL);
}
diff --git a/fs/locks.c b/fs/locks.c
index ff6af2c32601..eaa1cfaf73b0 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1058,7 +1058,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
return -ENOMEM;
}
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
if (request->fl_flags & FL_ACCESS)
goto find_conflict;
@@ -1100,7 +1100,7 @@ find_conflict:
out:
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
if (new_fl)
locks_free_lock(new_fl);
locks_dispose_list(&dispose);
@@ -1138,7 +1138,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
new_fl2 = locks_alloc_lock();
}
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
/*
* New lock request. Walk all POSIX locks and look for conflicts. If
@@ -1312,7 +1312,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
}
out:
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
/*
* Free any unused locks.
*/
@@ -1584,7 +1584,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
return error;
}
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
@@ -1636,13 +1636,13 @@ restart:
locks_insert_block(fl, new_fl, leases_conflict);
trace_break_lease_block(inode, new_fl);
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
error = wait_event_interruptible_timeout(new_fl->fl_wait,
!new_fl->fl_blocker, break_time);
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
trace_break_lease_unblock(inode, new_fl);
locks_delete_block(new_fl);
@@ -1659,7 +1659,7 @@ restart:
}
out:
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
locks_free_lock(new_fl);
return error;
@@ -1729,7 +1729,7 @@ int fcntl_getlease(struct file *filp)
ctx = smp_load_acquire(&inode->i_flctx);
if (ctx && !list_empty_careful(&ctx->flc_lease)) {
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
@@ -1739,7 +1739,7 @@ int fcntl_getlease(struct file *filp)
break;
}
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
}
@@ -1813,7 +1813,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
return -EINVAL;
}
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
error = check_conflicting_open(dentry, arg, lease->fl_flags);
@@ -1884,7 +1884,7 @@ out_setup:
lease->fl_lmops->lm_setup(lease, priv);
out:
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
if (is_deleg)
inode_unlock(inode);
@@ -1907,7 +1907,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
return error;
}
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
if (fl->fl_file == filp &&
@@ -1920,7 +1920,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
if (victim)
error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
return error;
}
@@ -2643,13 +2643,13 @@ locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
if (list_empty(&ctx->flc_lease))
return;
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
if (filp == fl->fl_file)
lease_modify(fl, F_UNLCK, &dispose);
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
}
diff --git a/fs/mpage.c b/fs/mpage.c
index c820dc9bebab..3f19da75178b 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -48,8 +48,9 @@ static void mpage_end_io(struct bio *bio)
{
struct bio_vec *bv;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, i, iter_all) {
struct page *page = bv->bv_page;
page_endio(page, bio_op(bio),
blk_status_to_errno(bio->bi_status));
diff --git a/fs/namei.c b/fs/namei.c
index 914178cdbe94..0a8c5c27f90e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -39,7 +39,6 @@
#include <linux/bitops.h>
#include <linux/init_task.h>
#include <linux/uaccess.h>
-#include <linux/build_bug.h>
#include "internal.h"
#include "mount.h"
@@ -131,7 +130,6 @@ getname_flags(const char __user *filename, int flags, int *empty)
struct filename *result;
char *kname;
int len;
- BUILD_BUG_ON(offsetof(struct filename, iname) % sizeof(long) != 0);
result = audit_reusename(filename);
if (result)
@@ -2720,7 +2718,7 @@ filename_mountpoint(int dfd, struct filename *name, struct path *path,
if (unlikely(error == -ESTALE))
error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path);
if (likely(!error))
- audit_inode(name, path->dentry, 0);
+ audit_inode(name, path->dentry, flags & LOOKUP_NO_EVAL);
restore_nameidata();
putname(name);
return error;
diff --git a/fs/namespace.c b/fs/namespace.c
index a677b59efd74..98a8c182af4f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1640,6 +1640,8 @@ int ksys_umount(char __user *name, int flags)
if (!(flags & UMOUNT_NOFOLLOW))
lookup_flags |= LOOKUP_FOLLOW;
+ lookup_flags |= LOOKUP_NO_EVAL;
+
retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
if (retval)
goto out;
@@ -2698,7 +2700,6 @@ static long exact_copy_from_user(void *to, const void __user * from,
if (!access_ok(from, n))
return n;
- current->kernel_uaccess_faults_ok++;
while (n) {
if (__get_user(c, f)) {
memset(t, 0, n);
@@ -2708,7 +2709,6 @@ static long exact_copy_from_user(void *to, const void __user * from,
f++;
n--;
}
- current->kernel_uaccess_faults_ok--;
return n;
}
@@ -2746,7 +2746,7 @@ void *copy_mount_options(const void __user * data)
char *copy_mount_string(const void __user *data)
{
- return data ? strndup_user(data, PAGE_SIZE) : NULL;
+ return data ? strndup_user(data, PATH_MAX) : NULL;
}
/*
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 46d691ba04bc..45b2322e092d 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -133,15 +133,9 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
size_t count, unsigned int flags)
{
- ssize_t ret;
-
if (file_inode(file_in) == file_inode(file_out))
return -EINVAL;
-retry:
- ret = nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
- if (ret == -EAGAIN)
- goto retry;
- return ret;
+ return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
}
static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 3f23b6840547..bf34ddaa2ad7 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -44,6 +44,7 @@
#include <linux/keyctl.h>
#include <linux/key-type.h>
#include <keys/user-type.h>
+#include <keys/request_key_auth-type.h>
#include <linux/module.h>
#include "internal.h"
@@ -59,7 +60,7 @@ static struct key_type key_type_id_resolver_legacy;
struct idmap_legacy_upcalldata {
struct rpc_pipe_msg pipe_msg;
struct idmap_msg idmap_msg;
- struct key_construction *key_cons;
+ struct key *authkey;
struct idmap *idmap;
};
@@ -384,7 +385,7 @@ static const match_table_t nfs_idmap_tokens = {
{ Opt_find_err, NULL }
};
-static int nfs_idmap_legacy_upcall(struct key_construction *, const char *, void *);
+static int nfs_idmap_legacy_upcall(struct key *, void *);
static ssize_t idmap_pipe_downcall(struct file *, const char __user *,
size_t);
static void idmap_release_pipe(struct inode *);
@@ -549,11 +550,12 @@ nfs_idmap_prepare_pipe_upcall(struct idmap *idmap,
static void
nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret)
{
- struct key_construction *cons = idmap->idmap_upcall_data->key_cons;
+ struct key *authkey = idmap->idmap_upcall_data->authkey;
kfree(idmap->idmap_upcall_data);
idmap->idmap_upcall_data = NULL;
- complete_request_key(cons, ret);
+ complete_request_key(authkey, ret);
+ key_put(authkey);
}
static void
@@ -563,15 +565,14 @@ nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret)
nfs_idmap_complete_pipe_upcall_locked(idmap, ret);
}
-static int nfs_idmap_legacy_upcall(struct key_construction *cons,
- const char *op,
- void *aux)
+static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux)
{
struct idmap_legacy_upcalldata *data;
+ struct request_key_auth *rka = get_request_key_auth(authkey);
struct rpc_pipe_msg *msg;
struct idmap_msg *im;
struct idmap *idmap = (struct idmap *)aux;
- struct key *key = cons->key;
+ struct key *key = rka->target_key;
int ret = -ENOKEY;
if (!aux)
@@ -586,7 +587,7 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons,
msg = &data->pipe_msg;
im = &data->idmap_msg;
data->idmap = idmap;
- data->key_cons = cons;
+ data->authkey = key_get(authkey);
ret = nfs_idmap_prepare_message(key->description, idmap, im, msg);
if (ret < 0)
@@ -604,7 +605,7 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons,
out2:
kfree(data);
out1:
- complete_request_key(cons, ret);
+ complete_request_key(authkey, ret);
return ret;
}
@@ -651,9 +652,10 @@ out:
static ssize_t
idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
{
+ struct request_key_auth *rka;
struct rpc_inode *rpci = RPC_I(file_inode(filp));
struct idmap *idmap = (struct idmap *)rpci->private;
- struct key_construction *cons;
+ struct key *authkey;
struct idmap_msg im;
size_t namelen_in;
int ret = -ENOKEY;
@@ -665,7 +667,8 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
if (idmap->idmap_upcall_data == NULL)
goto out_noupcall;
- cons = idmap->idmap_upcall_data->key_cons;
+ authkey = idmap->idmap_upcall_data->authkey;
+ rka = get_request_key_auth(authkey);
if (mlen != sizeof(im)) {
ret = -ENOSPC;
@@ -690,9 +693,9 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
ret = nfs_idmap_read_and_verify_message(&im,
&idmap->idmap_upcall_data->idmap_msg,
- cons->key, cons->authkey);
+ rka->target_key, authkey);
if (ret >= 0) {
- key_set_timeout(cons->key, nfs_idmap_cache_timeout);
+ key_set_timeout(rka->target_key, nfs_idmap_cache_timeout);
ret = mlen;
}
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 22ce3c8a2f46..0570391eaa16 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1895,6 +1895,11 @@ static int nfs_parse_devname(const char *dev_name,
size_t len;
char *end;
+ if (unlikely(!dev_name || !*dev_name)) {
+ dfprintk(MOUNT, "NFS: device name not specified\n");
+ return -EINVAL;
+ }
+
/* Is the host name protected with square brakcets? */
if (*dev_name == '[') {
end = strchr(++dev_name, ']');
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5a0bbf917a32..d09c9f878141 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -238,9 +238,9 @@ out:
}
/* A writeback failed: mark the page as bad, and invalidate the page cache */
-static void nfs_set_pageerror(struct page *page)
+static void nfs_set_pageerror(struct address_space *mapping)
{
- nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page));
+ nfs_zap_mapping(mapping->host, mapping);
}
/*
@@ -621,11 +621,12 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
nfs_set_page_writeback(page);
WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags));
- ret = 0;
+ ret = req->wb_context->error;
/* If there is a fatal error that covers this write, just exit */
- if (nfs_error_is_fatal_on_server(req->wb_context->error))
+ if (nfs_error_is_fatal_on_server(ret))
goto out_launder;
+ ret = 0;
if (!nfs_pageio_add_request(pgio, req)) {
ret = pgio->pg_error;
/*
@@ -635,9 +636,9 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
nfs_context_set_write_error(req->wb_context, ret);
if (nfs_error_is_fatal_on_server(ret))
goto out_launder;
- }
+ } else
+ ret = -EAGAIN;
nfs_redirty_request(req);
- ret = -EAGAIN;
} else
nfs_add_stats(page_file_mapping(page)->host,
NFSIOS_WRITEPAGES, 1);
@@ -993,7 +994,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
nfs_list_remove_request(req);
if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) &&
(hdr->good_bytes < bytes)) {
- nfs_set_pageerror(req->wb_page);
+ nfs_set_pageerror(page_file_mapping(req->wb_page));
nfs_context_set_write_error(req->wb_context, hdr->error);
goto remove_req;
}
@@ -1347,7 +1348,8 @@ int nfs_updatepage(struct file *file, struct page *page,
unsigned int offset, unsigned int count)
{
struct nfs_open_context *ctx = nfs_file_open_context(file);
- struct inode *inode = page_file_mapping(page)->host;
+ struct address_space *mapping = page_file_mapping(page);
+ struct inode *inode = mapping->host;
int status = 0;
nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
@@ -1365,7 +1367,7 @@ int nfs_updatepage(struct file *file, struct page *page,
status = nfs_writepage_setup(ctx, page, offset, count);
if (status < 0)
- nfs_set_pageerror(page);
+ nfs_set_pageerror(mapping);
else
__set_page_dirty_nobuffers(page);
out:
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index b33f9785b756..72a7681f4046 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1239,8 +1239,8 @@ static __net_init int nfsd_init_net(struct net *net)
retval = nfsd_idmap_init(net);
if (retval)
goto out_idmap_error;
- nn->nfsd4_lease = 45; /* default lease time */
- nn->nfsd4_grace = 45;
+ nn->nfsd4_lease = 90; /* default lease time */
+ nn->nfsd4_grace = 90;
nn->somebody_reclaimed = false;
nn->clverifier_counter = prandom_u32();
nn->clientid_counter = prandom_u32();
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 9824e32b2f23..7dc98e14655d 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -557,9 +557,11 @@ __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst,
loff_t cloned;
cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0);
+ if (cloned < 0)
+ return nfserrno(cloned);
if (count && cloned != count)
- cloned = -EINVAL;
- return nfserrno(cloned < 0 ? cloned : 0);
+ return nfserrno(-EINVAL);
+ return 0;
}
ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
index 41355ce74ac0..735bfb2e9190 100644
--- a/fs/notify/fanotify/Kconfig
+++ b/fs/notify/fanotify/Kconfig
@@ -2,6 +2,7 @@ config FANOTIFY
bool "Filesystem wide access notification"
select FSNOTIFY
select ANON_INODES
+ select EXPORTFS
default n
---help---
Say Y here to enable fanotify support. fanotify is a file access
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 3723f3d18d20..6b9c27548997 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -13,22 +13,40 @@
#include <linux/wait.h>
#include <linux/audit.h>
#include <linux/sched/mm.h>
+#include <linux/statfs.h>
#include "fanotify.h"
static bool should_merge(struct fsnotify_event *old_fsn,
struct fsnotify_event *new_fsn)
{
- struct fanotify_event_info *old, *new;
+ struct fanotify_event *old, *new;
pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn);
old = FANOTIFY_E(old_fsn);
new = FANOTIFY_E(new_fsn);
- if (old_fsn->inode == new_fsn->inode && old->pid == new->pid &&
- old->path.mnt == new->path.mnt &&
- old->path.dentry == new->path.dentry)
- return true;
+ if (old_fsn->inode != new_fsn->inode || old->pid != new->pid ||
+ old->fh_type != new->fh_type || old->fh_len != new->fh_len)
+ return false;
+
+ if (fanotify_event_has_path(old)) {
+ return old->path.mnt == new->path.mnt &&
+ old->path.dentry == new->path.dentry;
+ } else if (fanotify_event_has_fid(old)) {
+ /*
+ * We want to merge many dirent events in the same dir (i.e.
+ * creates/unlinks/renames), but we do not want to merge dirent
+ * events referring to subdirs with dirent events referring to
+ * non subdirs, otherwise, user won't be able to tell from a
+ * mask FAN_CREATE|FAN_DELETE|FAN_ONDIR if it describes mkdir+
+ * unlink pair or rmdir+create pair of events.
+ */
+ return (old->mask & FS_ISDIR) == (new->mask & FS_ISDIR) &&
+ fanotify_fid_equal(&old->fid, &new->fid, old->fh_len);
+ }
+
+ /* Do not merge events if we failed to encode fid */
return false;
}
@@ -36,20 +54,22 @@ static bool should_merge(struct fsnotify_event *old_fsn,
static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
{
struct fsnotify_event *test_event;
+ struct fanotify_event *new;
pr_debug("%s: list=%p event=%p\n", __func__, list, event);
+ new = FANOTIFY_E(event);
/*
* Don't merge a permission event with any other event so that we know
* the event structure we have created in fanotify_handle_event() is the
* one we should check for permission response.
*/
- if (fanotify_is_perm_event(event->mask))
+ if (fanotify_is_perm_event(new->mask))
return 0;
list_for_each_entry_reverse(test_event, list, list) {
if (should_merge(test_event, event)) {
- test_event->mask |= event->mask;
+ FANOTIFY_E(test_event)->mask |= new->mask;
return 1;
}
}
@@ -57,15 +77,44 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
return 0;
}
+/*
+ * Wait for response to permission event. The function also takes care of
+ * freeing the permission event (or offloads that in case the wait is canceled
+ * by a signal). The function returns 0 in case access got allowed by userspace,
+ * -EPERM in case userspace disallowed the access, and -ERESTARTSYS in case
+ * the wait got interrupted by a signal.
+ */
static int fanotify_get_response(struct fsnotify_group *group,
- struct fanotify_perm_event_info *event,
+ struct fanotify_perm_event *event,
struct fsnotify_iter_info *iter_info)
{
int ret;
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
- wait_event(group->fanotify_data.access_waitq, event->response);
+ ret = wait_event_killable(group->fanotify_data.access_waitq,
+ event->state == FAN_EVENT_ANSWERED);
+ /* Signal pending? */
+ if (ret < 0) {
+ spin_lock(&group->notification_lock);
+ /* Event reported to userspace and no answer yet? */
+ if (event->state == FAN_EVENT_REPORTED) {
+ /* Event will get freed once userspace answers to it */
+ event->state = FAN_EVENT_CANCELED;
+ spin_unlock(&group->notification_lock);
+ return ret;
+ }
+ /* Event not yet reported? Just remove it. */
+ if (event->state == FAN_EVENT_INIT)
+ fsnotify_remove_queued_event(group, &event->fae.fse);
+ /*
+ * Event may be also answered in case signal delivery raced
+ * with wakeup. In that case we have nothing to do besides
+ * freeing the event and reporting error.
+ */
+ spin_unlock(&group->notification_lock);
+ goto out;
+ }
/* userspace responded, convert to something usable */
switch (event->response & ~FAN_AUDIT) {
@@ -81,11 +130,11 @@ static int fanotify_get_response(struct fsnotify_group *group,
if (event->response & FAN_AUDIT)
audit_fanotify(event->response & ~FAN_AUDIT);
- event->response = 0;
-
pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
group, event, ret);
-
+out:
+ fsnotify_destroy_event(group, &event->fae.fse);
+
return ret;
}
@@ -95,11 +144,13 @@ static int fanotify_get_response(struct fsnotify_group *group,
* been included within the event mask, but have not been explicitly
* requested by the user, will not be present in the returned mask.
*/
-static u32 fanotify_group_event_mask(struct fsnotify_iter_info *iter_info,
- u32 event_mask, const void *data,
- int data_type)
+static u32 fanotify_group_event_mask(struct fsnotify_group *group,
+ struct fsnotify_iter_info *iter_info,
+ u32 event_mask, const void *data,
+ int data_type)
{
__u32 marks_mask = 0, marks_ignored_mask = 0;
+ __u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS;
const struct path *path = data;
struct fsnotify_mark *mark;
int type;
@@ -107,14 +158,14 @@ static u32 fanotify_group_event_mask(struct fsnotify_iter_info *iter_info,
pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n",
__func__, iter_info->report_mask, event_mask, data, data_type);
- /* If we don't have enough info to send an event to userspace say no */
- if (data_type != FSNOTIFY_EVENT_PATH)
- return 0;
-
- /* Sorry, fanotify only gives a damn about files and dirs */
- if (!d_is_reg(path->dentry) &&
- !d_can_lookup(path->dentry))
- return 0;
+ if (!FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+ /* Do we have path to open a file descriptor? */
+ if (data_type != FSNOTIFY_EVENT_PATH)
+ return 0;
+ /* Path type events are only relevant for files and dirs */
+ if (!d_is_reg(path->dentry) && !d_can_lookup(path->dentry))
+ return 0;
+ }
fsnotify_foreach_obj_type(type) {
if (!fsnotify_iter_should_report_type(iter_info, type))
@@ -133,20 +184,106 @@ static u32 fanotify_group_event_mask(struct fsnotify_iter_info *iter_info,
marks_ignored_mask |= mark->ignored_mask;
}
- if (d_is_dir(path->dentry) &&
+ test_mask = event_mask & marks_mask & ~marks_ignored_mask;
+
+ /*
+ * dirent modification events (create/delete/move) do not carry the
+ * child entry name/inode information. Instead, we report FAN_ONDIR
+ * for mkdir/rmdir so user can differentiate them from creat/unlink.
+ *
+ * For backward compatibility and consistency, do not report FAN_ONDIR
+ * to user in legacy fanotify mode (reporting fd) and report FAN_ONDIR
+ * to user in FAN_REPORT_FID mode for all event types.
+ */
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+ /* Do not report FAN_ONDIR without any event */
+ if (!(test_mask & ~FAN_ONDIR))
+ return 0;
+ } else {
+ user_mask &= ~FAN_ONDIR;
+ }
+
+ if (event_mask & FS_ISDIR &&
!(marks_mask & FS_ISDIR & ~marks_ignored_mask))
return 0;
- return event_mask & FANOTIFY_OUTGOING_EVENTS & marks_mask &
- ~marks_ignored_mask;
+ return test_mask & user_mask;
+}
+
+static int fanotify_encode_fid(struct fanotify_event *event,
+ struct inode *inode, gfp_t gfp,
+ __kernel_fsid_t *fsid)
+{
+ struct fanotify_fid *fid = &event->fid;
+ int dwords, bytes = 0;
+ int err, type;
+
+ fid->ext_fh = NULL;
+ dwords = 0;
+ err = -ENOENT;
+ type = exportfs_encode_inode_fh(inode, NULL, &dwords, NULL);
+ if (!dwords)
+ goto out_err;
+
+ bytes = dwords << 2;
+ if (bytes > FANOTIFY_INLINE_FH_LEN) {
+ /* Treat failure to allocate fh as failure to allocate event */
+ err = -ENOMEM;
+ fid->ext_fh = kmalloc(bytes, gfp);
+ if (!fid->ext_fh)
+ goto out_err;
+ }
+
+ type = exportfs_encode_inode_fh(inode, fanotify_fid_fh(fid, bytes),
+ &dwords, NULL);
+ err = -EINVAL;
+ if (!type || type == FILEID_INVALID || bytes != dwords << 2)
+ goto out_err;
+
+ fid->fsid = *fsid;
+ event->fh_len = bytes;
+
+ return type;
+
+out_err:
+ pr_warn_ratelimited("fanotify: failed to encode fid (fsid=%x.%x, "
+ "type=%d, bytes=%d, err=%i)\n",
+ fsid->val[0], fsid->val[1], type, bytes, err);
+ kfree(fid->ext_fh);
+ fid->ext_fh = NULL;
+ event->fh_len = 0;
+
+ return FILEID_INVALID;
}
-struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
- struct inode *inode, u32 mask,
- const struct path *path)
+/*
+ * The inode to use as identifier when reporting fid depends on the event.
+ * Report the modified directory inode on dirent modification events.
+ * Report the "victim" inode otherwise.
+ * For example:
+ * FS_ATTRIB reports the child inode even if reported on a watched parent.
+ * FS_CREATE reports the modified dir inode and not the created inode.
+ */
+static struct inode *fanotify_fid_inode(struct inode *to_tell, u32 event_mask,
+ const void *data, int data_type)
{
- struct fanotify_event_info *event = NULL;
+ if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS)
+ return to_tell;
+ else if (data_type == FSNOTIFY_EVENT_INODE)
+ return (struct inode *)data;
+ else if (data_type == FSNOTIFY_EVENT_PATH)
+ return d_inode(((struct path *)data)->dentry);
+ return NULL;
+}
+
+struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
+ struct inode *inode, u32 mask,
+ const void *data, int data_type,
+ __kernel_fsid_t *fsid)
+{
+ struct fanotify_event *event = NULL;
gfp_t gfp = GFP_KERNEL_ACCOUNT;
+ struct inode *id = fanotify_fid_inode(inode, mask, data, data_type);
/*
* For queues with unlimited length lost events are not expected and
@@ -160,28 +297,36 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
memalloc_use_memcg(group->memcg);
if (fanotify_is_perm_event(mask)) {
- struct fanotify_perm_event_info *pevent;
+ struct fanotify_perm_event *pevent;
pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp);
if (!pevent)
goto out;
event = &pevent->fae;
pevent->response = 0;
+ pevent->state = FAN_EVENT_INIT;
goto init;
}
event = kmem_cache_alloc(fanotify_event_cachep, gfp);
if (!event)
goto out;
init: __maybe_unused
- fsnotify_init_event(&event->fse, inode, mask);
+ fsnotify_init_event(&event->fse, inode);
+ event->mask = mask;
if (FAN_GROUP_FLAG(group, FAN_REPORT_TID))
event->pid = get_pid(task_pid(current));
else
event->pid = get_pid(task_tgid(current));
- if (path) {
- event->path = *path;
+ event->fh_len = 0;
+ if (id && FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+ /* Report the event without a file identifier on encode error */
+ event->fh_type = fanotify_encode_fid(event, id, gfp, fsid);
+ } else if (data_type == FSNOTIFY_EVENT_PATH) {
+ event->fh_type = FILEID_ROOT;
+ event->path = *((struct path *)data);
path_get(&event->path);
} else {
+ event->fh_type = FILEID_INVALID;
event->path.mnt = NULL;
event->path.dentry = NULL;
}
@@ -190,6 +335,29 @@ out:
return event;
}
+/*
+ * Get cached fsid of the filesystem containing the object from any connector.
+ * All connectors are supposed to have the same fsid, but we do not verify that
+ * here.
+ */
+static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info)
+{
+ int type;
+ __kernel_fsid_t fsid = {};
+
+ fsnotify_foreach_obj_type(type) {
+ if (!fsnotify_iter_should_report_type(iter_info, type))
+ continue;
+
+ fsid = iter_info->marks[type]->connector->fsid;
+ if (WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1]))
+ continue;
+ return fsid;
+ }
+
+ return fsid;
+}
+
static int fanotify_handle_event(struct fsnotify_group *group,
struct inode *inode,
u32 mask, const void *data, int data_type,
@@ -197,14 +365,22 @@ static int fanotify_handle_event(struct fsnotify_group *group,
struct fsnotify_iter_info *iter_info)
{
int ret = 0;
- struct fanotify_event_info *event;
+ struct fanotify_event *event;
struct fsnotify_event *fsn_event;
+ __kernel_fsid_t fsid = {};
BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
+ BUILD_BUG_ON(FAN_ATTRIB != FS_ATTRIB);
BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
+ BUILD_BUG_ON(FAN_MOVED_TO != FS_MOVED_TO);
+ BUILD_BUG_ON(FAN_MOVED_FROM != FS_MOVED_FROM);
+ BUILD_BUG_ON(FAN_CREATE != FS_CREATE);
+ BUILD_BUG_ON(FAN_DELETE != FS_DELETE);
+ BUILD_BUG_ON(FAN_DELETE_SELF != FS_DELETE_SELF);
+ BUILD_BUG_ON(FAN_MOVE_SELF != FS_MOVE_SELF);
BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
@@ -213,9 +389,10 @@ static int fanotify_handle_event(struct fsnotify_group *group,
BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC);
BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM);
- BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 12);
+ BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19);
- mask = fanotify_group_event_mask(iter_info, mask, data, data_type);
+ mask = fanotify_group_event_mask(group, iter_info, mask, data,
+ data_type);
if (!mask)
return 0;
@@ -231,7 +408,11 @@ static int fanotify_handle_event(struct fsnotify_group *group,
return 0;
}
- event = fanotify_alloc_event(group, inode, mask, data);
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_FID))
+ fsid = fanotify_get_fsid(iter_info);
+
+ event = fanotify_alloc_event(group, inode, mask, data, data_type,
+ &fsid);
ret = -ENOMEM;
if (unlikely(!event)) {
/*
@@ -255,7 +436,6 @@ static int fanotify_handle_event(struct fsnotify_group *group,
} else if (fanotify_is_perm_event(mask)) {
ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event),
iter_info);
- fsnotify_destroy_event(group, fsn_event);
}
finish:
if (fanotify_is_perm_event(mask))
@@ -275,12 +455,15 @@ static void fanotify_free_group_priv(struct fsnotify_group *group)
static void fanotify_free_event(struct fsnotify_event *fsn_event)
{
- struct fanotify_event_info *event;
+ struct fanotify_event *event;
event = FANOTIFY_E(fsn_event);
- path_put(&event->path);
+ if (fanotify_event_has_path(event))
+ path_put(&event->path);
+ else if (fanotify_event_has_ext_fh(event))
+ kfree(event->fid.ext_fh);
put_pid(event->pid);
- if (fanotify_is_perm_event(fsn_event->mask)) {
+ if (fanotify_is_perm_event(event->mask)) {
kmem_cache_free(fanotify_perm_event_cachep,
FANOTIFY_PE(fsn_event));
return;
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index ea05b8a401e7..68b30504284c 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -2,26 +2,112 @@
#include <linux/fsnotify_backend.h>
#include <linux/path.h>
#include <linux/slab.h>
+#include <linux/exportfs.h>
extern struct kmem_cache *fanotify_mark_cache;
extern struct kmem_cache *fanotify_event_cachep;
extern struct kmem_cache *fanotify_perm_event_cachep;
+/* Possible states of the permission event */
+enum {
+ FAN_EVENT_INIT,
+ FAN_EVENT_REPORTED,
+ FAN_EVENT_ANSWERED,
+ FAN_EVENT_CANCELED,
+};
+
+/*
+ * 3 dwords are sufficient for most local fs (64bit ino, 32bit generation).
+ * For 32bit arch, fid increases the size of fanotify_event by 12 bytes and
+ * fh_* fields increase the size of fanotify_event by another 4 bytes.
+ * For 64bit arch, fid increases the size of fanotify_fid by 8 bytes and
+ * fh_* fields are packed in a hole after mask.
+ */
+#if BITS_PER_LONG == 32
+#define FANOTIFY_INLINE_FH_LEN (3 << 2)
+#else
+#define FANOTIFY_INLINE_FH_LEN (4 << 2)
+#endif
+
+struct fanotify_fid {
+ __kernel_fsid_t fsid;
+ union {
+ unsigned char fh[FANOTIFY_INLINE_FH_LEN];
+ unsigned char *ext_fh;
+ };
+};
+
+static inline void *fanotify_fid_fh(struct fanotify_fid *fid,
+ unsigned int fh_len)
+{
+ return fh_len <= FANOTIFY_INLINE_FH_LEN ? fid->fh : fid->ext_fh;
+}
+
+static inline bool fanotify_fid_equal(struct fanotify_fid *fid1,
+ struct fanotify_fid *fid2,
+ unsigned int fh_len)
+{
+ return fid1->fsid.val[0] == fid2->fsid.val[0] &&
+ fid1->fsid.val[1] == fid2->fsid.val[1] &&
+ !memcmp(fanotify_fid_fh(fid1, fh_len),
+ fanotify_fid_fh(fid2, fh_len), fh_len);
+}
+
/*
* Structure for normal fanotify events. It gets allocated in
* fanotify_handle_event() and freed when the information is retrieved by
* userspace
*/
-struct fanotify_event_info {
+struct fanotify_event {
struct fsnotify_event fse;
+ u32 mask;
/*
- * We hold ref to this path so it may be dereferenced at any point
- * during this object's lifetime
+ * Those fields are outside fanotify_fid to pack fanotify_event nicely
+ * on 64bit arch and to use fh_type as an indication of whether path
+ * or fid are used in the union:
+ * FILEID_ROOT (0) for path, > 0 for fid, FILEID_INVALID for neither.
*/
- struct path path;
+ u8 fh_type;
+ u8 fh_len;
+ u16 pad;
+ union {
+ /*
+ * We hold ref to this path so it may be dereferenced at any
+ * point during this object's lifetime
+ */
+ struct path path;
+ /*
+ * With FAN_REPORT_FID, we do not hold any reference on the
+ * victim object. Instead we store its NFS file handle and its
+ * filesystem's fsid as a unique identifier.
+ */
+ struct fanotify_fid fid;
+ };
struct pid *pid;
};
+static inline bool fanotify_event_has_path(struct fanotify_event *event)
+{
+ return event->fh_type == FILEID_ROOT;
+}
+
+static inline bool fanotify_event_has_fid(struct fanotify_event *event)
+{
+ return event->fh_type != FILEID_ROOT &&
+ event->fh_type != FILEID_INVALID;
+}
+
+static inline bool fanotify_event_has_ext_fh(struct fanotify_event *event)
+{
+ return fanotify_event_has_fid(event) &&
+ event->fh_len > FANOTIFY_INLINE_FH_LEN;
+}
+
+static inline void *fanotify_event_fh(struct fanotify_event *event)
+{
+ return fanotify_fid_fh(&event->fid, event->fh_len);
+}
+
/*
* Structure for permission fanotify events. It gets allocated and freed in
* fanotify_handle_event() since we wait there for user response. When the
@@ -29,16 +115,17 @@ struct fanotify_event_info {
* group->notification_list to group->fanotify_data.access_list to wait for
* user response.
*/
-struct fanotify_perm_event_info {
- struct fanotify_event_info fae;
- int response; /* userspace answer to question */
+struct fanotify_perm_event {
+ struct fanotify_event fae;
+ unsigned short response; /* userspace answer to the event */
+ unsigned short state; /* state of the event */
int fd; /* fd we passed to userspace for this event */
};
-static inline struct fanotify_perm_event_info *
+static inline struct fanotify_perm_event *
FANOTIFY_PE(struct fsnotify_event *fse)
{
- return container_of(fse, struct fanotify_perm_event_info, fae.fse);
+ return container_of(fse, struct fanotify_perm_event, fae.fse);
}
static inline bool fanotify_is_perm_event(u32 mask)
@@ -47,11 +134,12 @@ static inline bool fanotify_is_perm_event(u32 mask)
mask & FANOTIFY_PERM_EVENTS;
}
-static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse)
+static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse)
{
- return container_of(fse, struct fanotify_event_info, fse);
+ return container_of(fse, struct fanotify_event, fse);
}
-struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
- struct inode *inode, u32 mask,
- const struct path *path);
+struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
+ struct inode *inode, u32 mask,
+ const void *data, int data_type,
+ __kernel_fsid_t *fsid);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 9c870b0d2b56..56992b32c6bb 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -17,6 +17,8 @@
#include <linux/compat.h>
#include <linux/sched/signal.h>
#include <linux/memcontrol.h>
+#include <linux/statfs.h>
+#include <linux/exportfs.h>
#include <asm/ioctls.h>
@@ -47,33 +49,55 @@ struct kmem_cache *fanotify_mark_cache __read_mostly;
struct kmem_cache *fanotify_event_cachep __read_mostly;
struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
+#define FANOTIFY_EVENT_ALIGN 4
+
+static int fanotify_event_info_len(struct fanotify_event *event)
+{
+ if (!fanotify_event_has_fid(event))
+ return 0;
+
+ return roundup(sizeof(struct fanotify_event_info_fid) +
+ sizeof(struct file_handle) + event->fh_len,
+ FANOTIFY_EVENT_ALIGN);
+}
+
/*
* Get an fsnotify notification event if one exists and is small
* enough to fit in "count". Return an error pointer if the count
- * is not large enough.
- *
- * Called with the group->notification_lock held.
+ * is not large enough. When permission event is dequeued, its state is
+ * updated accordingly.
*/
static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
size_t count)
{
- assert_spin_locked(&group->notification_lock);
+ size_t event_size = FAN_EVENT_METADATA_LEN;
+ struct fsnotify_event *fsn_event = NULL;
pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
+ spin_lock(&group->notification_lock);
if (fsnotify_notify_queue_is_empty(group))
- return NULL;
+ goto out;
- if (FAN_EVENT_METADATA_LEN > count)
- return ERR_PTR(-EINVAL);
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+ event_size += fanotify_event_info_len(
+ FANOTIFY_E(fsnotify_peek_first_event(group)));
+ }
- /* held the notification_lock the whole time, so this is the
- * same event we peeked above */
- return fsnotify_remove_first_event(group);
+ if (event_size > count) {
+ fsn_event = ERR_PTR(-EINVAL);
+ goto out;
+ }
+ fsn_event = fsnotify_remove_first_event(group);
+ if (fanotify_is_perm_event(FANOTIFY_E(fsn_event)->mask))
+ FANOTIFY_PE(fsn_event)->state = FAN_EVENT_REPORTED;
+out:
+ spin_unlock(&group->notification_lock);
+ return fsn_event;
}
static int create_fd(struct fsnotify_group *group,
- struct fanotify_event_info *event,
+ struct fanotify_event *event,
struct file **file)
{
int client_fd;
@@ -114,62 +138,32 @@ static int create_fd(struct fsnotify_group *group,
return client_fd;
}
-static int fill_event_metadata(struct fsnotify_group *group,
- struct fanotify_event_metadata *metadata,
- struct fsnotify_event *fsn_event,
- struct file **file)
-{
- int ret = 0;
- struct fanotify_event_info *event;
-
- pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
- group, metadata, fsn_event);
-
- *file = NULL;
- event = container_of(fsn_event, struct fanotify_event_info, fse);
- metadata->event_len = FAN_EVENT_METADATA_LEN;
- metadata->metadata_len = FAN_EVENT_METADATA_LEN;
- metadata->vers = FANOTIFY_METADATA_VERSION;
- metadata->reserved = 0;
- metadata->mask = fsn_event->mask & FANOTIFY_OUTGOING_EVENTS;
- metadata->pid = pid_vnr(event->pid);
- if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW))
- metadata->fd = FAN_NOFD;
- else {
- metadata->fd = create_fd(group, event, file);
- if (metadata->fd < 0)
- ret = metadata->fd;
- }
-
- return ret;
-}
-
-static struct fanotify_perm_event_info *dequeue_event(
- struct fsnotify_group *group, int fd)
+/*
+ * Finish processing of permission event by setting it to ANSWERED state and
+ * drop group->notification_lock.
+ */
+static void finish_permission_event(struct fsnotify_group *group,
+ struct fanotify_perm_event *event,
+ unsigned int response)
+ __releases(&group->notification_lock)
{
- struct fanotify_perm_event_info *event, *return_e = NULL;
-
- spin_lock(&group->notification_lock);
- list_for_each_entry(event, &group->fanotify_data.access_list,
- fae.fse.list) {
- if (event->fd != fd)
- continue;
+ bool destroy = false;
- list_del_init(&event->fae.fse.list);
- return_e = event;
- break;
- }
+ assert_spin_locked(&group->notification_lock);
+ event->response = response;
+ if (event->state == FAN_EVENT_CANCELED)
+ destroy = true;
+ else
+ event->state = FAN_EVENT_ANSWERED;
spin_unlock(&group->notification_lock);
-
- pr_debug("%s: found return_re=%p\n", __func__, return_e);
-
- return return_e;
+ if (destroy)
+ fsnotify_destroy_event(group, &event->fae.fse);
}
static int process_access_response(struct fsnotify_group *group,
struct fanotify_response *response_struct)
{
- struct fanotify_perm_event_info *event;
+ struct fanotify_perm_event *event;
int fd = response_struct->fd;
int response = response_struct->response;
@@ -194,48 +188,115 @@ static int process_access_response(struct fsnotify_group *group,
if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT))
return -EINVAL;
- event = dequeue_event(group, fd);
- if (!event)
- return -ENOENT;
+ spin_lock(&group->notification_lock);
+ list_for_each_entry(event, &group->fanotify_data.access_list,
+ fae.fse.list) {
+ if (event->fd != fd)
+ continue;
- event->response = response;
- wake_up(&group->fanotify_data.access_waitq);
+ list_del_init(&event->fae.fse.list);
+ finish_permission_event(group, event, response);
+ wake_up(&group->fanotify_data.access_waitq);
+ return 0;
+ }
+ spin_unlock(&group->notification_lock);
+
+ return -ENOENT;
+}
+
+static int copy_fid_to_user(struct fanotify_event *event, char __user *buf)
+{
+ struct fanotify_event_info_fid info = { };
+ struct file_handle handle = { };
+ size_t fh_len = event->fh_len;
+ size_t len = fanotify_event_info_len(event);
+
+ if (!len)
+ return 0;
+
+ if (WARN_ON_ONCE(len < sizeof(info) + sizeof(handle) + fh_len))
+ return -EFAULT;
+
+ /* Copy event info fid header followed by vaiable sized file handle */
+ info.hdr.info_type = FAN_EVENT_INFO_TYPE_FID;
+ info.hdr.len = len;
+ info.fsid = event->fid.fsid;
+ if (copy_to_user(buf, &info, sizeof(info)))
+ return -EFAULT;
+
+ buf += sizeof(info);
+ len -= sizeof(info);
+ handle.handle_type = event->fh_type;
+ handle.handle_bytes = fh_len;
+ if (copy_to_user(buf, &handle, sizeof(handle)))
+ return -EFAULT;
+
+ buf += sizeof(handle);
+ len -= sizeof(handle);
+ if (copy_to_user(buf, fanotify_event_fh(event), fh_len))
+ return -EFAULT;
+
+ /* Pad with 0's */
+ buf += fh_len;
+ len -= fh_len;
+ WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN);
+ if (len > 0 && clear_user(buf, len))
+ return -EFAULT;
return 0;
}
static ssize_t copy_event_to_user(struct fsnotify_group *group,
- struct fsnotify_event *event,
+ struct fsnotify_event *fsn_event,
char __user *buf, size_t count)
{
- struct fanotify_event_metadata fanotify_event_metadata;
- struct file *f;
- int fd, ret;
-
- pr_debug("%s: group=%p event=%p\n", __func__, group, event);
-
- ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f);
- if (ret < 0)
- return ret;
+ struct fanotify_event_metadata metadata;
+ struct fanotify_event *event;
+ struct file *f = NULL;
+ int ret, fd = FAN_NOFD;
+
+ pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event);
+
+ event = container_of(fsn_event, struct fanotify_event, fse);
+ metadata.event_len = FAN_EVENT_METADATA_LEN;
+ metadata.metadata_len = FAN_EVENT_METADATA_LEN;
+ metadata.vers = FANOTIFY_METADATA_VERSION;
+ metadata.reserved = 0;
+ metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
+ metadata.pid = pid_vnr(event->pid);
+
+ if (fanotify_event_has_path(event)) {
+ fd = create_fd(group, event, &f);
+ if (fd < 0)
+ return fd;
+ } else if (fanotify_event_has_fid(event)) {
+ metadata.event_len += fanotify_event_info_len(event);
+ }
+ metadata.fd = fd;
- fd = fanotify_event_metadata.fd;
ret = -EFAULT;
/*
* Sanity check copy size in case get_one_event() and
* fill_event_metadata() event_len sizes ever get out of sync.
*/
- if (WARN_ON_ONCE(fanotify_event_metadata.event_len > count))
+ if (WARN_ON_ONCE(metadata.event_len > count))
goto out_close_fd;
- if (copy_to_user(buf, &fanotify_event_metadata,
- fanotify_event_metadata.event_len))
+
+ if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
goto out_close_fd;
if (fanotify_is_perm_event(event->mask))
- FANOTIFY_PE(event)->fd = fd;
+ FANOTIFY_PE(fsn_event)->fd = fd;
- if (fd != FAN_NOFD)
+ if (fanotify_event_has_path(event)) {
fd_install(fd, f);
- return fanotify_event_metadata.event_len;
+ } else if (fanotify_event_has_fid(event)) {
+ ret = copy_fid_to_user(event, buf + FAN_EVENT_METADATA_LEN);
+ if (ret < 0)
+ return ret;
+ }
+
+ return metadata.event_len;
out_close_fd:
if (fd != FAN_NOFD) {
@@ -276,10 +337,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
add_wait_queue(&group->notification_waitq, &wait);
while (1) {
- spin_lock(&group->notification_lock);
kevent = get_one_event(group, count);
- spin_unlock(&group->notification_lock);
-
if (IS_ERR(kevent)) {
ret = PTR_ERR(kevent);
break;
@@ -316,11 +374,13 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
* Permission events get queued to wait for response. Other
* events can be destroyed now.
*/
- if (!fanotify_is_perm_event(kevent->mask)) {
+ if (!fanotify_is_perm_event(FANOTIFY_E(kevent)->mask)) {
fsnotify_destroy_event(group, kevent);
} else {
if (ret <= 0) {
- FANOTIFY_PE(kevent)->response = FAN_DENY;
+ spin_lock(&group->notification_lock);
+ finish_permission_event(group,
+ FANOTIFY_PE(kevent), FAN_DENY);
wake_up(&group->fanotify_data.access_waitq);
} else {
spin_lock(&group->notification_lock);
@@ -370,7 +430,7 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t
static int fanotify_release(struct inode *ignored, struct file *file)
{
struct fsnotify_group *group = file->private_data;
- struct fanotify_perm_event_info *event, *next;
+ struct fanotify_perm_event *event;
struct fsnotify_event *fsn_event;
/*
@@ -385,13 +445,12 @@ static int fanotify_release(struct inode *ignored, struct file *file)
* and simulate reply from userspace.
*/
spin_lock(&group->notification_lock);
- list_for_each_entry_safe(event, next, &group->fanotify_data.access_list,
- fae.fse.list) {
- pr_debug("%s: found group=%p event=%p\n", __func__, group,
- event);
-
+ while (!list_empty(&group->fanotify_data.access_list)) {
+ event = list_first_entry(&group->fanotify_data.access_list,
+ struct fanotify_perm_event, fae.fse.list);
list_del_init(&event->fae.fse.list);
- event->response = FAN_ALLOW;
+ finish_permission_event(group, event, FAN_ALLOW);
+ spin_lock(&group->notification_lock);
}
/*
@@ -401,13 +460,14 @@ static int fanotify_release(struct inode *ignored, struct file *file)
*/
while (!fsnotify_notify_queue_is_empty(group)) {
fsn_event = fsnotify_remove_first_event(group);
- if (!(fsn_event->mask & FANOTIFY_PERM_EVENTS)) {
+ if (!(FANOTIFY_E(fsn_event)->mask & FANOTIFY_PERM_EVENTS)) {
spin_unlock(&group->notification_lock);
fsnotify_destroy_event(group, fsn_event);
- spin_lock(&group->notification_lock);
} else {
- FANOTIFY_PE(fsn_event)->response = FAN_ALLOW;
+ finish_permission_event(group, FANOTIFY_PE(fsn_event),
+ FAN_ALLOW);
}
+ spin_lock(&group->notification_lock);
}
spin_unlock(&group->notification_lock);
@@ -598,7 +658,8 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
fsnotify_connp_t *connp,
- unsigned int type)
+ unsigned int type,
+ __kernel_fsid_t *fsid)
{
struct fsnotify_mark *mark;
int ret;
@@ -611,7 +672,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
return ERR_PTR(-ENOMEM);
fsnotify_init_mark(mark, group);
- ret = fsnotify_add_mark_locked(mark, connp, type, 0);
+ ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid);
if (ret) {
fsnotify_put_mark(mark);
return ERR_PTR(ret);
@@ -623,7 +684,8 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
static int fanotify_add_mark(struct fsnotify_group *group,
fsnotify_connp_t *connp, unsigned int type,
- __u32 mask, unsigned int flags)
+ __u32 mask, unsigned int flags,
+ __kernel_fsid_t *fsid)
{
struct fsnotify_mark *fsn_mark;
__u32 added;
@@ -631,7 +693,7 @@ static int fanotify_add_mark(struct fsnotify_group *group,
mutex_lock(&group->mark_mutex);
fsn_mark = fsnotify_find_mark(connp, group);
if (!fsn_mark) {
- fsn_mark = fanotify_add_new_mark(group, connp, type);
+ fsn_mark = fanotify_add_new_mark(group, connp, type, fsid);
if (IS_ERR(fsn_mark)) {
mutex_unlock(&group->mark_mutex);
return PTR_ERR(fsn_mark);
@@ -648,23 +710,23 @@ static int fanotify_add_mark(struct fsnotify_group *group,
static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
struct vfsmount *mnt, __u32 mask,
- unsigned int flags)
+ unsigned int flags, __kernel_fsid_t *fsid)
{
return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
- FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags);
+ FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid);
}
static int fanotify_add_sb_mark(struct fsnotify_group *group,
- struct super_block *sb, __u32 mask,
- unsigned int flags)
+ struct super_block *sb, __u32 mask,
+ unsigned int flags, __kernel_fsid_t *fsid)
{
return fanotify_add_mark(group, &sb->s_fsnotify_marks,
- FSNOTIFY_OBJ_TYPE_SB, mask, flags);
+ FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid);
}
static int fanotify_add_inode_mark(struct fsnotify_group *group,
struct inode *inode, __u32 mask,
- unsigned int flags)
+ unsigned int flags, __kernel_fsid_t *fsid)
{
pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
@@ -679,7 +741,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
return 0;
return fanotify_add_mark(group, &inode->i_fsnotify_marks,
- FSNOTIFY_OBJ_TYPE_INODE, mask, flags);
+ FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid);
}
/* fanotify syscalls */
@@ -688,7 +750,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
struct fsnotify_group *group;
int f_flags, fd;
struct user_struct *user;
- struct fanotify_event_info *oevent;
+ struct fanotify_event *oevent;
pr_debug("%s: flags=%x event_f_flags=%x\n",
__func__, flags, event_f_flags);
@@ -715,6 +777,10 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
return -EINVAL;
}
+ if ((flags & FAN_REPORT_FID) &&
+ (flags & FANOTIFY_CLASS_BITS) != FAN_CLASS_NOTIF)
+ return -EINVAL;
+
user = get_current_user();
if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) {
free_uid(user);
@@ -739,7 +805,8 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
atomic_inc(&user->fanotify_listeners);
group->memcg = get_mem_cgroup_from_mm(current->mm);
- oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL);
+ oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL,
+ FSNOTIFY_EVENT_NONE, NULL);
if (unlikely(!oevent)) {
fd = -ENOMEM;
goto out_destroy_group;
@@ -801,6 +868,48 @@ out_destroy_group:
return fd;
}
+/* Check if filesystem can encode a unique fid */
+static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
+{
+ __kernel_fsid_t root_fsid;
+ int err;
+
+ /*
+ * Make sure path is not in filesystem with zero fsid (e.g. tmpfs).
+ */
+ err = vfs_get_fsid(path->dentry, fsid);
+ if (err)
+ return err;
+
+ if (!fsid->val[0] && !fsid->val[1])
+ return -ENODEV;
+
+ /*
+ * Make sure path is not inside a filesystem subvolume (e.g. btrfs)
+ * which uses a different fsid than sb root.
+ */
+ err = vfs_get_fsid(path->dentry->d_sb->s_root, &root_fsid);
+ if (err)
+ return err;
+
+ if (root_fsid.val[0] != fsid->val[0] ||
+ root_fsid.val[1] != fsid->val[1])
+ return -EXDEV;
+
+ /*
+ * We need to make sure that the file system supports at least
+ * encoding a file handle so user can use name_to_handle_at() to
+ * compare fid returned with event to the file handle of watched
+ * objects. However, name_to_handle_at() requires that the
+ * filesystem also supports decoding file handles.
+ */
+ if (!path->dentry->d_sb->s_export_op ||
+ !path->dentry->d_sb->s_export_op->fh_to_dentry)
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+
static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
int dfd, const char __user *pathname)
{
@@ -809,6 +918,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
struct fsnotify_group *group;
struct fd f;
struct path path;
+ __kernel_fsid_t __fsid, *fsid = NULL;
u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
int ret;
@@ -871,6 +981,18 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
group->priority == FS_PRIO_0)
goto fput_and_out;
+ /*
+ * Events with data type inode do not carry enough information to report
+ * event->fd, so we do not allow setting a mask for inode events unless
+ * group supports reporting fid.
+ * inode events are not supported on a mount mark, because they do not
+ * carry enough information (i.e. path) to be filtered by mount point.
+ */
+ if (mask & FANOTIFY_INODE_EVENTS &&
+ (!FAN_GROUP_FLAG(group, FAN_REPORT_FID) ||
+ mark_type == FAN_MARK_MOUNT))
+ goto fput_and_out;
+
if (flags & FAN_MARK_FLUSH) {
ret = 0;
if (mark_type == FAN_MARK_MOUNT)
@@ -886,6 +1008,14 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
if (ret)
goto fput_and_out;
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+ ret = fanotify_test_fid(&path, &__fsid);
+ if (ret)
+ goto path_put_and_out;
+
+ fsid = &__fsid;
+ }
+
/* inode held in place by reference to path; group by fget on fd */
if (mark_type == FAN_MARK_INODE)
inode = path.dentry->d_inode;
@@ -896,24 +1026,31 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
case FAN_MARK_ADD:
if (mark_type == FAN_MARK_MOUNT)
- ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags);
+ ret = fanotify_add_vfsmount_mark(group, mnt, mask,
+ flags, fsid);
else if (mark_type == FAN_MARK_FILESYSTEM)
- ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask, flags);
+ ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask,
+ flags, fsid);
else
- ret = fanotify_add_inode_mark(group, inode, mask, flags);
+ ret = fanotify_add_inode_mark(group, inode, mask,
+ flags, fsid);
break;
case FAN_MARK_REMOVE:
if (mark_type == FAN_MARK_MOUNT)
- ret = fanotify_remove_vfsmount_mark(group, mnt, mask, flags);
+ ret = fanotify_remove_vfsmount_mark(group, mnt, mask,
+ flags);
else if (mark_type == FAN_MARK_FILESYSTEM)
- ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask, flags);
+ ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask,
+ flags);
else
- ret = fanotify_remove_inode_mark(group, inode, mask, flags);
+ ret = fanotify_remove_inode_mark(group, inode, mask,
+ flags);
break;
default:
ret = -EINVAL;
}
+path_put_and_out:
path_put(&path);
fput_and_out:
fdput(f);
@@ -950,15 +1087,15 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark,
*/
static int __init fanotify_user_setup(void)
{
- BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 7);
+ BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 8);
BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
SLAB_PANIC|SLAB_ACCOUNT);
- fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC);
+ fanotify_event_cachep = KMEM_CACHE(fanotify_event, SLAB_PANIC);
if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
fanotify_perm_event_cachep =
- KMEM_CACHE(fanotify_perm_event_info, SLAB_PANIC);
+ KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
}
return 0;
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index ecf09b6243d9..df06f3da166c 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -328,16 +328,15 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
const unsigned char *file_name, u32 cookie)
{
struct fsnotify_iter_info iter_info = {};
- struct super_block *sb = NULL;
+ struct super_block *sb = to_tell->i_sb;
struct mount *mnt = NULL;
- __u32 mnt_or_sb_mask = 0;
+ __u32 mnt_or_sb_mask = sb->s_fsnotify_mask;
int ret = 0;
__u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS);
if (data_is == FSNOTIFY_EVENT_PATH) {
mnt = real_mount(((const struct path *)data)->mnt);
- sb = mnt->mnt.mnt_sb;
- mnt_or_sb_mask = mnt->mnt_fsnotify_mask | sb->s_fsnotify_mask;
+ mnt_or_sb_mask |= mnt->mnt_fsnotify_mask;
}
/* An event "on child" is not intended for a mount/sb mark */
if (mask & FS_EVENT_ON_CHILD)
@@ -350,8 +349,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
* SRCU because we have no references to any objects and do not
* need SRCU to keep them "alive".
*/
- if (!to_tell->i_fsnotify_marks &&
- (!mnt || (!mnt->mnt_fsnotify_marks && !sb->s_fsnotify_marks)))
+ if (!to_tell->i_fsnotify_marks && !sb->s_fsnotify_marks &&
+ (!mnt || !mnt->mnt_fsnotify_marks))
return 0;
/*
* if this is a modify event we may need to clear the ignored masks
@@ -366,11 +365,11 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
iter_info.marks[FSNOTIFY_OBJ_TYPE_INODE] =
fsnotify_first_mark(&to_tell->i_fsnotify_marks);
+ iter_info.marks[FSNOTIFY_OBJ_TYPE_SB] =
+ fsnotify_first_mark(&sb->s_fsnotify_marks);
if (mnt) {
iter_info.marks[FSNOTIFY_OBJ_TYPE_VFSMOUNT] =
fsnotify_first_mark(&mnt->mnt_fsnotify_marks);
- iter_info.marks[FSNOTIFY_OBJ_TYPE_SB] =
- fsnotify_first_mark(&sb->s_fsnotify_marks);
}
/*
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index 7e4578d35b61..74ae60305189 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -5,6 +5,7 @@
struct inotify_event_info {
struct fsnotify_event fse;
+ u32 mask;
int wd;
u32 sync_cookie;
int name_len;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index f4184b4f3815..ff30abd6a49b 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -43,11 +43,11 @@ static bool event_compare(struct fsnotify_event *old_fsn,
{
struct inotify_event_info *old, *new;
- if (old_fsn->mask & FS_IN_IGNORED)
- return false;
old = INOTIFY_E(old_fsn);
new = INOTIFY_E(new_fsn);
- if ((old_fsn->mask == new_fsn->mask) &&
+ if (old->mask & FS_IN_IGNORED)
+ return false;
+ if ((old->mask == new->mask) &&
(old_fsn->inode == new_fsn->inode) &&
(old->name_len == new->name_len) &&
(!old->name_len || !strcmp(old->name, new->name)))
@@ -113,8 +113,18 @@ int inotify_handle_event(struct fsnotify_group *group,
return -ENOMEM;
}
+ /*
+ * We now report FS_ISDIR flag with MOVE_SELF and DELETE_SELF events
+ * for fanotify. inotify never reported IN_ISDIR with those events.
+ * It looks like an oversight, but to avoid the risk of breaking
+ * existing inotify programs, mask the flag out from those events.
+ */
+ if (mask & (IN_MOVE_SELF | IN_DELETE_SELF))
+ mask &= ~IN_ISDIR;
+
fsn_event = &event->fse;
- fsnotify_init_event(fsn_event, inode, mask);
+ fsnotify_init_event(fsn_event, inode);
+ event->mask = mask;
event->wd = i_mark->wd;
event->sync_cookie = cookie;
event->name_len = len;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 105576daca4a..e2901fbb9f76 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -189,7 +189,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
*/
pad_name_len = round_event_name_len(fsn_event);
inotify_event.len = pad_name_len;
- inotify_event.mask = inotify_mask_to_arg(fsn_event->mask);
+ inotify_event.mask = inotify_mask_to_arg(event->mask);
inotify_event.wd = event->wd;
inotify_event.cookie = event->sync_cookie;
@@ -634,7 +634,8 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
return ERR_PTR(-ENOMEM);
}
group->overflow_event = &oevent->fse;
- fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW);
+ fsnotify_init_event(group->overflow_event, NULL);
+ oevent->mask = FS_Q_OVERFLOW;
oevent->wd = -1;
oevent->sync_cookie = 0;
oevent->name_len = 0;
@@ -724,8 +725,10 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
return -EBADF;
/* IN_MASK_ADD and IN_MASK_CREATE don't make sense together */
- if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE)))
- return -EINVAL;
+ if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE))) {
+ ret = -EINVAL;
+ goto fput_and_out;
+ }
/* verify that this is indeed an inotify instance */
if (unlikely(f.file->f_op != &inotify_fops)) {
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index d2dd16cb5989..d593d4269561 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -82,6 +82,7 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/srcu.h>
+#include <linux/ratelimit.h>
#include <linux/atomic.h>
@@ -481,7 +482,8 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
}
static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
- unsigned int type)
+ unsigned int type,
+ __kernel_fsid_t *fsid)
{
struct inode *inode = NULL;
struct fsnotify_mark_connector *conn;
@@ -493,6 +495,11 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
INIT_HLIST_HEAD(&conn->list);
conn->type = type;
conn->obj = connp;
+ /* Cache fsid of filesystem containing the object */
+ if (fsid)
+ conn->fsid = *fsid;
+ else
+ conn->fsid.val[0] = conn->fsid.val[1] = 0;
if (conn->type == FSNOTIFY_OBJ_TYPE_INODE)
inode = igrab(fsnotify_conn_inode(conn));
/*
@@ -544,7 +551,7 @@ out:
*/
static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
fsnotify_connp_t *connp, unsigned int type,
- int allow_dups)
+ int allow_dups, __kernel_fsid_t *fsid)
{
struct fsnotify_mark *lmark, *last = NULL;
struct fsnotify_mark_connector *conn;
@@ -553,15 +560,36 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
if (WARN_ON(!fsnotify_valid_obj_type(type)))
return -EINVAL;
+
+ /* Backend is expected to check for zero fsid (e.g. tmpfs) */
+ if (fsid && WARN_ON_ONCE(!fsid->val[0] && !fsid->val[1]))
+ return -ENODEV;
+
restart:
spin_lock(&mark->lock);
conn = fsnotify_grab_connector(connp);
if (!conn) {
spin_unlock(&mark->lock);
- err = fsnotify_attach_connector_to_object(connp, type);
+ err = fsnotify_attach_connector_to_object(connp, type, fsid);
if (err)
return err;
goto restart;
+ } else if (fsid && (conn->fsid.val[0] || conn->fsid.val[1]) &&
+ (fsid->val[0] != conn->fsid.val[0] ||
+ fsid->val[1] != conn->fsid.val[1])) {
+ /*
+ * Backend is expected to check for non uniform fsid
+ * (e.g. btrfs), but maybe we missed something?
+ * Only allow setting conn->fsid once to non zero fsid.
+ * inotify and non-fid fanotify groups do not set nor test
+ * conn->fsid.
+ */
+ pr_warn_ratelimited("%s: fsid mismatch on object of type %u: "
+ "%x.%x != %x.%x\n", __func__, conn->type,
+ fsid->val[0], fsid->val[1],
+ conn->fsid.val[0], conn->fsid.val[1]);
+ err = -EXDEV;
+ goto out_err;
}
/* is mark the first mark? */
@@ -606,7 +634,7 @@ out_err:
*/
int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
fsnotify_connp_t *connp, unsigned int type,
- int allow_dups)
+ int allow_dups, __kernel_fsid_t *fsid)
{
struct fsnotify_group *group = mark->group;
int ret = 0;
@@ -627,7 +655,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
fsnotify_get_mark(mark); /* for g_list */
spin_unlock(&mark->lock);
- ret = fsnotify_add_mark_list(mark, connp, type, allow_dups);
+ ret = fsnotify_add_mark_list(mark, connp, type, allow_dups, fsid);
if (ret)
goto err;
@@ -648,13 +676,13 @@ err:
}
int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp,
- unsigned int type, int allow_dups)
+ unsigned int type, int allow_dups, __kernel_fsid_t *fsid)
{
int ret;
struct fsnotify_group *group = mark->group;
mutex_lock(&group->mark_mutex);
- ret = fsnotify_add_mark_locked(mark, connp, type, allow_dups);
+ ret = fsnotify_add_mark_locked(mark, connp, type, allow_dups, fsid);
mutex_unlock(&group->mark_mutex);
return ret;
}
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 3c3e36745f59..5f3a54d444b5 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -71,7 +71,7 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
struct fsnotify_event *event)
{
/* Overflow events are per-group and we don't want to free them */
- if (!event || event->mask == FS_Q_OVERFLOW)
+ if (!event || event == group->overflow_event)
return;
/*
* If the event is still queued, we have a problem... Do an unreliable
@@ -141,6 +141,18 @@ queue:
return ret;
}
+void fsnotify_remove_queued_event(struct fsnotify_group *group,
+ struct fsnotify_event *event)
+{
+ assert_spin_locked(&group->notification_lock);
+ /*
+ * We need to init list head for the case of overflow event so that
+ * check in fsnotify_add_event() works
+ */
+ list_del_init(&event->list);
+ group->q_len--;
+}
+
/*
* Remove and return the first event from the notification list. It is the
* responsibility of the caller to destroy the obtained event
@@ -155,13 +167,7 @@ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
event = list_first_entry(&group->notification_list,
struct fsnotify_event, list);
- /*
- * We need to init list head for the case of overflow event so that
- * check in fsnotify_add_event() works
- */
- list_del_init(&event->list);
- group->q_len--;
-
+ fsnotify_remove_queued_event(group, event);
return event;
}
@@ -194,23 +200,3 @@ void fsnotify_flush_notify(struct fsnotify_group *group)
}
spin_unlock(&group->notification_lock);
}
-
-/*
- * fsnotify_create_event - Allocate a new event which will be sent to each
- * group's handle_event function if the group was interested in this
- * particular event.
- *
- * @inode the inode which is supposed to receive the event (sometimes a
- * parent of the inode to which the event happened.
- * @mask what actually happened.
- * @data pointer to the object which was actually affected
- * @data_type flag indication if the data is a file, path, inode, nothing...
- * @name the filename, if available
- */
-void fsnotify_init_event(struct fsnotify_event *event, struct inode *inode,
- u32 mask)
-{
- INIT_LIST_HEAD(&event->list);
- event->inode = inode;
- event->mask = mask;
-}
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d1cbb27808e2..6f0999015a44 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7532,10 +7532,11 @@ static int ocfs2_trim_group(struct super_block *sb,
return count;
}
-int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+static
+int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range)
{
struct ocfs2_super *osb = OCFS2_SB(sb);
- u64 start, len, trimmed, first_group, last_group, group;
+ u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0;
int ret, cnt;
u32 first_bit, last_bit, minlen;
struct buffer_head *main_bm_bh = NULL;
@@ -7543,7 +7544,6 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
struct buffer_head *gd_bh = NULL;
struct ocfs2_dinode *main_bm;
struct ocfs2_group_desc *gd = NULL;
- struct ocfs2_trim_fs_info info, *pinfo = NULL;
start = range->start >> osb->s_clustersize_bits;
len = range->len >> osb->s_clustersize_bits;
@@ -7552,6 +7552,9 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
return -EINVAL;
+ trace_ocfs2_trim_mainbm(start, len, minlen);
+
+next_group:
main_bm_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
@@ -7570,64 +7573,34 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
}
main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
- if (start >= le32_to_cpu(main_bm->i_clusters)) {
- ret = -EINVAL;
- goto out_unlock;
- }
-
- len = range->len >> osb->s_clustersize_bits;
- if (start + len > le32_to_cpu(main_bm->i_clusters))
- len = le32_to_cpu(main_bm->i_clusters) - start;
-
- trace_ocfs2_trim_fs(start, len, minlen);
-
- ocfs2_trim_fs_lock_res_init(osb);
- ret = ocfs2_trim_fs_lock(osb, NULL, 1);
- if (ret < 0) {
- if (ret != -EAGAIN) {
- mlog_errno(ret);
- ocfs2_trim_fs_lock_res_uninit(osb);
+ /*
+ * Do some check before trim the first group.
+ */
+ if (!group) {
+ if (start >= le32_to_cpu(main_bm->i_clusters)) {
+ ret = -EINVAL;
goto out_unlock;
}
- mlog(ML_NOTICE, "Wait for trim on device (%s) to "
- "finish, which is running from another node.\n",
- osb->dev_str);
- ret = ocfs2_trim_fs_lock(osb, &info, 0);
- if (ret < 0) {
- mlog_errno(ret);
- ocfs2_trim_fs_lock_res_uninit(osb);
- goto out_unlock;
- }
+ if (start + len > le32_to_cpu(main_bm->i_clusters))
+ len = le32_to_cpu(main_bm->i_clusters) - start;
- if (info.tf_valid && info.tf_success &&
- info.tf_start == start && info.tf_len == len &&
- info.tf_minlen == minlen) {
- /* Avoid sending duplicated trim to a shared device */
- mlog(ML_NOTICE, "The same trim on device (%s) was "
- "just done from node (%u), return.\n",
- osb->dev_str, info.tf_nodenum);
- range->len = info.tf_trimlen;
- goto out_trimunlock;
- }
+ /*
+ * Determine first and last group to examine based on
+ * start and len
+ */
+ first_group = ocfs2_which_cluster_group(main_bm_inode, start);
+ if (first_group == osb->first_cluster_group_blkno)
+ first_bit = start;
+ else
+ first_bit = start - ocfs2_blocks_to_clusters(sb,
+ first_group);
+ last_group = ocfs2_which_cluster_group(main_bm_inode,
+ start + len - 1);
+ group = first_group;
}
- info.tf_nodenum = osb->node_num;
- info.tf_start = start;
- info.tf_len = len;
- info.tf_minlen = minlen;
-
- /* Determine first and last group to examine based on start and len */
- first_group = ocfs2_which_cluster_group(main_bm_inode, start);
- if (first_group == osb->first_cluster_group_blkno)
- first_bit = start;
- else
- first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
- last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
- last_bit = osb->bitmap_cpg;
-
- trimmed = 0;
- for (group = first_group; group <= last_group;) {
+ do {
if (first_bit + len >= osb->bitmap_cpg)
last_bit = osb->bitmap_cpg;
else
@@ -7659,21 +7632,81 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
else
group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
- }
- range->len = trimmed * sb->s_blocksize;
+ } while (0);
- info.tf_trimlen = range->len;
- info.tf_success = (ret ? 0 : 1);
- pinfo = &info;
-out_trimunlock:
- ocfs2_trim_fs_unlock(osb, pinfo);
- ocfs2_trim_fs_lock_res_uninit(osb);
out_unlock:
ocfs2_inode_unlock(main_bm_inode, 0);
brelse(main_bm_bh);
+ main_bm_bh = NULL;
out_mutex:
inode_unlock(main_bm_inode);
iput(main_bm_inode);
+
+ /*
+ * If all the groups trim are not done or failed, but we should release
+ * main_bm related locks for avoiding the current IO starve, then go to
+ * trim the next group
+ */
+ if (ret >= 0 && group <= last_group)
+ goto next_group;
out:
+ range->len = trimmed * sb->s_blocksize;
+ return ret;
+}
+
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+ int ret;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ struct ocfs2_trim_fs_info info, *pinfo = NULL;
+
+ ocfs2_trim_fs_lock_res_init(osb);
+
+ trace_ocfs2_trim_fs(range->start, range->len, range->minlen);
+
+ ret = ocfs2_trim_fs_lock(osb, NULL, 1);
+ if (ret < 0) {
+ if (ret != -EAGAIN) {
+ mlog_errno(ret);
+ ocfs2_trim_fs_lock_res_uninit(osb);
+ return ret;
+ }
+
+ mlog(ML_NOTICE, "Wait for trim on device (%s) to "
+ "finish, which is running from another node.\n",
+ osb->dev_str);
+ ret = ocfs2_trim_fs_lock(osb, &info, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ ocfs2_trim_fs_lock_res_uninit(osb);
+ return ret;
+ }
+
+ if (info.tf_valid && info.tf_success &&
+ info.tf_start == range->start &&
+ info.tf_len == range->len &&
+ info.tf_minlen == range->minlen) {
+ /* Avoid sending duplicated trim to a shared device */
+ mlog(ML_NOTICE, "The same trim on device (%s) was "
+ "just done from node (%u), return.\n",
+ osb->dev_str, info.tf_nodenum);
+ range->len = info.tf_trimlen;
+ goto out;
+ }
+ }
+
+ info.tf_nodenum = osb->node_num;
+ info.tf_start = range->start;
+ info.tf_len = range->len;
+ info.tf_minlen = range->minlen;
+
+ ret = ocfs2_trim_mainbm(sb, range);
+
+ info.tf_trimlen = range->len;
+ info.tf_success = (ret < 0 ? 0 : 1);
+ pinfo = &info;
+out:
+ ocfs2_trim_fs_unlock(osb, pinfo);
+ ocfs2_trim_fs_lock_res_uninit(osb);
return ret;
}
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 0e4166cc23a0..4ac775e32240 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -621,13 +621,15 @@ static void o2nm_node_group_drop_item(struct config_group *group,
struct o2nm_node *node = to_o2nm_node(item);
struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent);
- o2net_disconnect_node(node);
+ if (cluster->cl_nodes[node->nd_num] == node) {
+ o2net_disconnect_node(node);
- if (cluster->cl_has_local &&
- (cluster->cl_local_node == node->nd_num)) {
- cluster->cl_has_local = 0;
- cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
- o2net_stop_listening(node);
+ if (cluster->cl_has_local &&
+ (cluster->cl_local_node == node->nd_num)) {
+ cluster->cl_has_local = 0;
+ cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
+ o2net_stop_listening(node);
+ }
}
/* XXX call into net to stop this node from trading messages */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 7c835824247e..af405586c5b1 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -686,6 +686,9 @@ void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
{
struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+ /* Only one trimfs thread are allowed to work at the same time. */
+ mutex_lock(&osb->obs_trim_fs_mutex);
+
ocfs2_lock_res_init_once(lockres);
ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
@@ -698,6 +701,8 @@ void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb)
ocfs2_simple_drop_lockres(osb, lockres);
ocfs2_lock_res_free(lockres);
+
+ mutex_unlock(&osb->obs_trim_fs_mutex);
}
static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 4f86ac0027b5..1f029fbe8b8d 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -407,6 +407,7 @@ struct ocfs2_super
struct ocfs2_lock_res osb_rename_lockres;
struct ocfs2_lock_res osb_nfs_sync_lockres;
struct ocfs2_lock_res osb_trim_fs_lockres;
+ struct mutex obs_trim_fs_mutex;
struct ocfs2_dlm_debug *osb_dlm_debug;
struct dentry *osb_debug_root;
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 2ee76a90ba8f..dc4bce1649c1 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -712,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent,
DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm);
+
DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
/* End of trace events for fs/ocfs2/alloc.c. */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index d7407994f308..ea0756d83250 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -55,7 +55,7 @@ struct ocfs2_slot_info {
unsigned int si_blocks;
struct buffer_head **si_bh;
unsigned int si_num_slots;
- struct ocfs2_slot *si_slots;
+ struct ocfs2_slot si_slots[];
};
@@ -420,9 +420,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
struct inode *inode = NULL;
struct ocfs2_slot_info *si;
- si = kzalloc(sizeof(struct ocfs2_slot_info) +
- (sizeof(struct ocfs2_slot) * osb->max_slots),
- GFP_KERNEL);
+ si = kzalloc(struct_size(si, si_slots, osb->max_slots), GFP_KERNEL);
if (!si) {
status = -ENOMEM;
mlog_errno(status);
@@ -431,8 +429,6 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
si->si_extended = ocfs2_uses_extended_slot_map(osb);
si->si_num_slots = osb->max_slots;
- si->si_slots = (struct ocfs2_slot *)((char *)si +
- sizeof(struct ocfs2_slot_info));
inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 3415e0b09398..96ae7cedd487 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1847,6 +1847,8 @@ static int ocfs2_mount_volume(struct super_block *sb)
if (ocfs2_is_hard_readonly(osb))
goto leave;
+ mutex_init(&osb->obs_trim_fs_mutex);
+
status = ocfs2_dlm_init(osb);
if (status < 0) {
mlog_errno(status);
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index a5a2fe76568f..b094d3d79354 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -398,8 +398,6 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter
loff_t pos = iocb->ki_pos;
ssize_t rc = 0;
- BUG_ON(iocb->private);
-
gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_read_iter\n");
orangefs_stats.reads++;
@@ -416,8 +414,6 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite
loff_t pos;
ssize_t rc;
- BUG_ON(iocb->private);
-
gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n");
inode_lock(file->f_mapping->host);
diff --git a/fs/pipe.c b/fs/pipe.c
index bdc5d3c0977d..51d5fd8840ab 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -140,8 +140,7 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
struct page *page = buf->page;
if (page_count(page) == 1) {
- if (memcg_kmem_enabled())
- memcg_kmem_uncharge(page, 0);
+ memcg_kmem_uncharge(page, 0);
__SetPageLocked(page);
return 0;
}
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 9d428d5a0ac8..2edbb657f859 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -343,28 +343,28 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
#ifdef CONFIG_SECCOMP
seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
#endif
- seq_printf(m, "\nSpeculation_Store_Bypass:\t");
+ seq_puts(m, "\nSpeculation_Store_Bypass:\t");
switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) {
case -EINVAL:
- seq_printf(m, "unknown");
+ seq_puts(m, "unknown");
break;
case PR_SPEC_NOT_AFFECTED:
- seq_printf(m, "not vulnerable");
+ seq_puts(m, "not vulnerable");
break;
case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE:
- seq_printf(m, "thread force mitigated");
+ seq_puts(m, "thread force mitigated");
break;
case PR_SPEC_PRCTL | PR_SPEC_DISABLE:
- seq_printf(m, "thread mitigated");
+ seq_puts(m, "thread mitigated");
break;
case PR_SPEC_PRCTL | PR_SPEC_ENABLE:
- seq_printf(m, "thread vulnerable");
+ seq_puts(m, "thread vulnerable");
break;
case PR_SPEC_DISABLE:
- seq_printf(m, "globally mitigated");
+ seq_puts(m, "globally mitigated");
break;
default:
- seq_printf(m, "vulnerable");
+ seq_puts(m, "vulnerable");
break;
}
seq_putc(m, '\n');
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 633a63462573..5ab1849971b4 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -140,9 +140,13 @@ struct pid_entry {
#define REG(NAME, MODE, fops) \
NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
#define ONE(NAME, MODE, show) \
- NOD(NAME, (S_IFREG|(MODE)), \
+ NOD(NAME, (S_IFREG|(MODE)), \
NULL, &proc_single_file_operations, \
{ .proc_show = show } )
+#define ATTR(LSM, NAME, MODE) \
+ NOD(NAME, (S_IFREG|(MODE)), \
+ NULL, &proc_pid_attr_operations, \
+ { .lsm = LSM })
/*
* Count the number of hardlinks for the pid_entry table, excluding the .
@@ -456,7 +460,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
if (unlikely(!sched_info_on()))
- seq_printf(m, "0 0 0\n");
+ seq_puts(m, "0 0 0\n");
else
seq_printf(m, "%llu %llu %lu\n",
(unsigned long long)task->se.sum_exec_runtime,
@@ -1086,10 +1090,6 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
task_lock(p);
if (!p->vfork_done && process_shares_mm(p, mm)) {
- pr_info("updating oom_score_adj for %d (%s) from %d to %d because it shares mm with %d (%s). Report if this is unexpected.\n",
- task_pid_nr(p), p->comm,
- p->signal->oom_score_adj, oom_adj,
- task_pid_nr(task), task->comm);
p->signal->oom_score_adj = oom_adj;
if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
p->signal->oom_score_adj_min = (short)oom_adj;
@@ -1210,7 +1210,7 @@ static const struct file_operations proc_oom_score_adj_operations = {
.llseek = default_llseek,
};
-#ifdef CONFIG_AUDITSYSCALL
+#ifdef CONFIG_AUDIT
#define TMPBUFLEN 11
static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
size_t count, loff_t *ppos)
@@ -2525,7 +2525,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
if (!task)
return -ESRCH;
- length = security_getprocattr(task,
+ length = security_getprocattr(task, PROC_I(inode)->op.lsm,
(char*)file->f_path.dentry->d_name.name,
&p);
put_task_struct(task);
@@ -2574,7 +2574,9 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
if (rv < 0)
goto out_free;
- rv = security_setprocattr(file->f_path.dentry->d_name.name, page, count);
+ rv = security_setprocattr(PROC_I(inode)->op.lsm,
+ file->f_path.dentry->d_name.name, page,
+ count);
mutex_unlock(&current->signal->cred_guard_mutex);
out_free:
kfree(page);
@@ -2588,13 +2590,53 @@ static const struct file_operations proc_pid_attr_operations = {
.llseek = generic_file_llseek,
};
+#define LSM_DIR_OPS(LSM) \
+static int proc_##LSM##_attr_dir_iterate(struct file *filp, \
+ struct dir_context *ctx) \
+{ \
+ return proc_pident_readdir(filp, ctx, \
+ LSM##_attr_dir_stuff, \
+ ARRAY_SIZE(LSM##_attr_dir_stuff)); \
+} \
+\
+static const struct file_operations proc_##LSM##_attr_dir_ops = { \
+ .read = generic_read_dir, \
+ .iterate = proc_##LSM##_attr_dir_iterate, \
+ .llseek = default_llseek, \
+}; \
+\
+static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \
+ struct dentry *dentry, unsigned int flags) \
+{ \
+ return proc_pident_lookup(dir, dentry, \
+ LSM##_attr_dir_stuff, \
+ ARRAY_SIZE(LSM##_attr_dir_stuff)); \
+} \
+\
+static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \
+ .lookup = proc_##LSM##_attr_dir_lookup, \
+ .getattr = pid_getattr, \
+ .setattr = proc_setattr, \
+}
+
+#ifdef CONFIG_SECURITY_SMACK
+static const struct pid_entry smack_attr_dir_stuff[] = {
+ ATTR("smack", "current", 0666),
+};
+LSM_DIR_OPS(smack);
+#endif
+
static const struct pid_entry attr_dir_stuff[] = {
- REG("current", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
- REG("prev", S_IRUGO, proc_pid_attr_operations),
- REG("exec", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
- REG("fscreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
- REG("keycreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
- REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
+ ATTR(NULL, "current", 0666),
+ ATTR(NULL, "prev", 0444),
+ ATTR(NULL, "exec", 0666),
+ ATTR(NULL, "fscreate", 0666),
+ ATTR(NULL, "keycreate", 0666),
+ ATTR(NULL, "sockcreate", 0666),
+#ifdef CONFIG_SECURITY_SMACK
+ DIR("smack", 0555,
+ proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops),
+#endif
};
static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
@@ -3002,7 +3044,7 @@ static const struct pid_entry tgid_base_stuff[] = {
ONE("oom_score", S_IRUGO, proc_oom_score),
REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
-#ifdef CONFIG_AUDITSYSCALL
+#ifdef CONFIG_AUDIT
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
REG("sessionid", S_IRUGO, proc_sessionid_operations),
#endif
@@ -3165,7 +3207,7 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry,
return d_splice_alias(inode, dentry);
}
-struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
+struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
{
struct task_struct *task;
unsigned tgid;
@@ -3390,7 +3432,7 @@ static const struct pid_entry tid_base_stuff[] = {
ONE("oom_score", S_IRUGO, proc_oom_score),
REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
-#ifdef CONFIG_AUDITSYSCALL
+#ifdef CONFIG_AUDIT
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
REG("sessionid", S_IRUGO, proc_sessionid_operations),
#endif
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 8ae109429a88..e39bac94dead 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -256,7 +256,7 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry,
inode = proc_get_inode(dir->i_sb, de);
if (!inode)
return ERR_PTR(-ENOMEM);
- d_set_d_op(dentry, &proc_misc_dentry_ops);
+ d_set_d_op(dentry, de->proc_dops);
return d_splice_alias(inode, dentry);
}
read_unlock(&proc_subdir_lock);
@@ -429,6 +429,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
INIT_LIST_HEAD(&ent->pde_openers);
proc_set_user(ent, (*parent)->uid, (*parent)->gid);
+ ent->proc_dops = &proc_misc_dentry_ops;
+
out:
return ent;
}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 5185d7f6a51e..ea575375f210 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -44,6 +44,7 @@ struct proc_dir_entry {
struct completion *pde_unload_completion;
const struct inode_operations *proc_iops;
const struct file_operations *proc_fops;
+ const struct dentry_operations *proc_dops;
union {
const struct seq_operations *seq_ops;
int (*single_show)(struct seq_file *, void *);
@@ -81,6 +82,7 @@ union proc_op {
int (*proc_show)(struct seq_file *m,
struct pid_namespace *ns, struct pid *pid,
struct task_struct *task);
+ const char *lsm;
};
struct proc_inode {
@@ -161,7 +163,7 @@ extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struc
extern void pid_update_inode(struct task_struct *, struct inode *);
extern int pid_delete_dentry(const struct dentry *);
extern int proc_pid_readdir(struct file *, struct dir_context *);
-extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int);
+struct dentry *proc_pid_lookup(struct dentry *, unsigned int);
extern loff_t mem_lseek(struct file *, loff_t, int);
/* Lookups */
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 40b05e0d4274..544d1ee15aee 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -152,8 +152,8 @@ u64 stable_page_flags(struct page *page)
else if (page_count(page) == 0 && is_free_buddy_page(page))
u |= 1 << KPF_BUDDY;
- if (PageBalloon(page))
- u |= 1 << KPF_BALLOON;
+ if (PageOffline(page))
+ u |= 1 << KPF_OFFLINE;
if (PageTable(page))
u |= 1 << KPF_PGTABLE;
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index d5e0fcb3439e..a7b12435519e 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -38,6 +38,22 @@ static struct net *get_proc_net(const struct inode *inode)
return maybe_get_net(PDE_NET(PDE(inode)));
}
+static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ return 0;
+}
+
+static const struct dentry_operations proc_net_dentry_ops = {
+ .d_revalidate = proc_net_d_revalidate,
+ .d_delete = always_delete_dentry,
+};
+
+static void pde_force_lookup(struct proc_dir_entry *pde)
+{
+ /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */
+ pde->proc_dops = &proc_net_dentry_ops;
+}
+
static int seq_open_net(struct inode *inode, struct file *file)
{
unsigned int state_size = PDE(inode)->state_size;
@@ -90,6 +106,7 @@ struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode,
p = proc_create_reg(name, mode, &parent, data);
if (!p)
return NULL;
+ pde_force_lookup(p);
p->proc_fops = &proc_net_seq_fops;
p->seq_ops = ops;
p->state_size = state_size;
@@ -133,6 +150,7 @@ struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode
p = proc_create_reg(name, mode, &parent, data);
if (!p)
return NULL;
+ pde_force_lookup(p);
p->proc_fops = &proc_net_seq_fops;
p->seq_ops = ops;
p->state_size = state_size;
@@ -181,6 +199,7 @@ struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode,
p = proc_create_reg(name, mode, &parent, data);
if (!p)
return NULL;
+ pde_force_lookup(p);
p->proc_fops = &proc_net_single_fops;
p->single_show = show;
return proc_register(parent, p);
@@ -223,6 +242,7 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo
p = proc_create_reg(name, mode, &parent, data);
if (!p)
return NULL;
+ pde_force_lookup(p);
p->proc_fops = &proc_net_single_fops;
p->single_show = show;
p->write = write;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index f4b1a9d2eca6..621e6ec322ca 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -154,7 +154,7 @@ static int proc_root_getattr(const struct path *path, struct kstat *stat,
static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags)
{
- if (!proc_pid_lookup(dir, dentry, flags))
+ if (!proc_pid_lookup(dentry, flags))
return NULL;
return proc_lookup(dir, dentry, flags);
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 127265e5c55f..57c0a1047250 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -38,6 +38,7 @@ int proc_setup_self(struct super_block *s)
struct inode *root_inode = d_inode(s->s_root);
struct pid_namespace *ns = proc_pid_ns(root_inode);
struct dentry *self;
+ int ret = -ENOMEM;
inode_lock(root_inode);
self = d_alloc_name(s->s_root, "self");
@@ -51,20 +52,19 @@ int proc_setup_self(struct super_block *s)
inode->i_gid = GLOBAL_ROOT_GID;
inode->i_op = &proc_self_inode_operations;
d_add(self, inode);
+ ret = 0;
} else {
dput(self);
- self = ERR_PTR(-ENOMEM);
}
- } else {
- self = ERR_PTR(-ENOMEM);
}
inode_unlock(root_inode);
- if (IS_ERR(self)) {
+
+ if (ret)
pr_err("proc_fill_super: can't allocate /proc/self\n");
- return PTR_ERR(self);
- }
- ns->proc_self = self;
- return 0;
+ else
+ ns->proc_self = self;
+
+ return ret;
}
void __init proc_self_init(void)
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 535eda7857cf..80c305f206bb 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -23,21 +23,21 @@
#ifdef arch_idle_time
-static u64 get_idle_time(int cpu)
+static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
{
u64 idle;
- idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+ idle = kcs->cpustat[CPUTIME_IDLE];
if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
idle += arch_idle_time(cpu);
return idle;
}
-static u64 get_iowait_time(int cpu)
+static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
{
u64 iowait;
- iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+ iowait = kcs->cpustat[CPUTIME_IOWAIT];
if (cpu_online(cpu) && nr_iowait_cpu(cpu))
iowait += arch_idle_time(cpu);
return iowait;
@@ -45,7 +45,7 @@ static u64 get_iowait_time(int cpu)
#else
-static u64 get_idle_time(int cpu)
+static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
{
u64 idle, idle_usecs = -1ULL;
@@ -54,14 +54,14 @@ static u64 get_idle_time(int cpu)
if (idle_usecs == -1ULL)
/* !NO_HZ or cpu offline so we can rely on cpustat.idle */
- idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+ idle = kcs->cpustat[CPUTIME_IDLE];
else
idle = idle_usecs * NSEC_PER_USEC;
return idle;
}
-static u64 get_iowait_time(int cpu)
+static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
{
u64 iowait, iowait_usecs = -1ULL;
@@ -70,7 +70,7 @@ static u64 get_iowait_time(int cpu)
if (iowait_usecs == -1ULL)
/* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
- iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+ iowait = kcs->cpustat[CPUTIME_IOWAIT];
else
iowait = iowait_usecs * NSEC_PER_USEC;
@@ -79,6 +79,31 @@ static u64 get_iowait_time(int cpu)
#endif
+static void show_irq_gap(struct seq_file *p, unsigned int gap)
+{
+ static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0";
+
+ while (gap > 0) {
+ unsigned int inc;
+
+ inc = min_t(unsigned int, gap, ARRAY_SIZE(zeros) / 2);
+ seq_write(p, zeros, 2 * inc);
+ gap -= inc;
+ }
+}
+
+static void show_all_irqs(struct seq_file *p)
+{
+ unsigned int i, next = 0;
+
+ for_each_active_irq(i) {
+ show_irq_gap(p, i - next);
+ seq_put_decimal_ull(p, " ", kstat_irqs_usr(i));
+ next = i + 1;
+ }
+ show_irq_gap(p, nr_irqs - next);
+}
+
static int show_stat(struct seq_file *p, void *v)
{
int i, j;
@@ -95,16 +120,18 @@ static int show_stat(struct seq_file *p, void *v)
getboottime64(&boottime);
for_each_possible_cpu(i) {
- user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
- nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE];
- system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
- idle += get_idle_time(i);
- iowait += get_iowait_time(i);
- irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
- softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
- steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
- guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
- guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+ struct kernel_cpustat *kcs = &kcpustat_cpu(i);
+
+ user += kcs->cpustat[CPUTIME_USER];
+ nice += kcs->cpustat[CPUTIME_NICE];
+ system += kcs->cpustat[CPUTIME_SYSTEM];
+ idle += get_idle_time(kcs, i);
+ iowait += get_iowait_time(kcs, i);
+ irq += kcs->cpustat[CPUTIME_IRQ];
+ softirq += kcs->cpustat[CPUTIME_SOFTIRQ];
+ steal += kcs->cpustat[CPUTIME_STEAL];
+ guest += kcs->cpustat[CPUTIME_GUEST];
+ guest_nice += kcs->cpustat[CPUTIME_GUEST_NICE];
sum += kstat_cpu_irqs_sum(i);
sum += arch_irq_stat_cpu(i);
@@ -130,17 +157,19 @@ static int show_stat(struct seq_file *p, void *v)
seq_putc(p, '\n');
for_each_online_cpu(i) {
+ struct kernel_cpustat *kcs = &kcpustat_cpu(i);
+
/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
- user = kcpustat_cpu(i).cpustat[CPUTIME_USER];
- nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE];
- system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
- idle = get_idle_time(i);
- iowait = get_iowait_time(i);
- irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
- softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
- steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
- guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
- guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+ user = kcs->cpustat[CPUTIME_USER];
+ nice = kcs->cpustat[CPUTIME_NICE];
+ system = kcs->cpustat[CPUTIME_SYSTEM];
+ idle = get_idle_time(kcs, i);
+ iowait = get_iowait_time(kcs, i);
+ irq = kcs->cpustat[CPUTIME_IRQ];
+ softirq = kcs->cpustat[CPUTIME_SOFTIRQ];
+ steal = kcs->cpustat[CPUTIME_STEAL];
+ guest = kcs->cpustat[CPUTIME_GUEST];
+ guest_nice = kcs->cpustat[CPUTIME_GUEST_NICE];
seq_printf(p, "cpu%d", i);
seq_put_decimal_ull(p, " ", nsec_to_clock_t(user));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
@@ -156,9 +185,7 @@ static int show_stat(struct seq_file *p, void *v)
}
seq_put_decimal_ull(p, "intr ", (unsigned long long)sum);
- /* sum again ? it could be updated? */
- for_each_irq_nr(j)
- seq_put_decimal_ull(p, " ", kstat_irqs_usr(j));
+ show_all_irqs(p);
seq_printf(p,
"\nctxt %llu\n"
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f0ec9edab2f3..92a91e7816d8 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -59,7 +59,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
- SEQ_PUT_DEC(" kB\nVmPin:\t", mm->pinned_vm);
+ SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm));
SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss);
SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss);
SEQ_PUT_DEC(" kB\nRssAnon:\t", anon);
@@ -423,7 +423,7 @@ struct mem_size_stats {
};
static void smaps_account(struct mem_size_stats *mss, struct page *page,
- bool compound, bool young, bool dirty)
+ bool compound, bool young, bool dirty, bool locked)
{
int i, nr = compound ? 1 << compound_order(page) : 1;
unsigned long size = nr * PAGE_SIZE;
@@ -450,24 +450,31 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
else
mss->private_clean += size;
mss->pss += (u64)size << PSS_SHIFT;
+ if (locked)
+ mss->pss_locked += (u64)size << PSS_SHIFT;
return;
}
for (i = 0; i < nr; i++, page++) {
int mapcount = page_mapcount(page);
+ unsigned long pss = (PAGE_SIZE << PSS_SHIFT);
if (mapcount >= 2) {
if (dirty || PageDirty(page))
mss->shared_dirty += PAGE_SIZE;
else
mss->shared_clean += PAGE_SIZE;
- mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
+ mss->pss += pss / mapcount;
+ if (locked)
+ mss->pss_locked += pss / mapcount;
} else {
if (dirty || PageDirty(page))
mss->private_dirty += PAGE_SIZE;
else
mss->private_clean += PAGE_SIZE;
- mss->pss += PAGE_SIZE << PSS_SHIFT;
+ mss->pss += pss;
+ if (locked)
+ mss->pss_locked += pss;
}
}
}
@@ -490,6 +497,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
{
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = walk->vma;
+ bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page = NULL;
if (pte_present(*pte)) {
@@ -532,7 +540,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
if (!page)
return;
- smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte));
+ smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -541,6 +549,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
{
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = walk->vma;
+ bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page;
/* FOLL_DUMP will return -EFAULT on huge zero page */
@@ -555,7 +564,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
/* pass */;
else
VM_BUG_ON_PAGE(1, page);
- smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
+ smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked);
}
#else
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -737,11 +746,8 @@ static void smap_gather_stats(struct vm_area_struct *vma,
}
}
#endif
-
/* mmap_sem is held in m_start */
walk_page_vma(vma, &smaps_walk);
- if (vma->vm_flags & VM_LOCKED)
- mss->pss_locked += mss->pss;
}
#define SEQ_PUT_DEC(str, val) \
@@ -942,10 +948,12 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
pte_t ptent = *pte;
if (pte_present(ptent)) {
- ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte);
- ptent = pte_wrprotect(ptent);
+ pte_t old_pte;
+
+ old_pte = ptep_modify_prot_start(vma, addr, pte);
+ ptent = pte_wrprotect(old_pte);
ptent = pte_clear_soft_dirty(ptent);
- ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent);
+ ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
} else if (is_swap_pte(ptent)) {
ptent = pte_swp_clear_soft_dirty(ptent);
set_pte_at(vma->vm_mm, addr, pte, ptent);
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 0b63d68dedb2..36bf0f2e102e 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -64,7 +64,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
else
bytes += kobjsize(current->files);
- if (current->sighand && atomic_read(&current->sighand->count) > 1)
+ if (current->sighand && refcount_read(&current->sighand->count) > 1)
sbytes += kobjsize(current->sighand);
else
bytes += kobjsize(current->sighand);
@@ -178,7 +178,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
seq_file_path(m, file, "");
} else if (mm && is_stack(vma)) {
seq_pad(m, ' ');
- seq_printf(m, "[stack]");
+ seq_puts(m, "[stack]");
}
seq_putc(m, '\n');
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index b905010ca9eb..f61ae53533f5 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -38,6 +38,7 @@ int proc_setup_thread_self(struct super_block *s)
struct inode *root_inode = d_inode(s->s_root);
struct pid_namespace *ns = proc_pid_ns(root_inode);
struct dentry *thread_self;
+ int ret = -ENOMEM;
inode_lock(root_inode);
thread_self = d_alloc_name(s->s_root, "thread-self");
@@ -51,20 +52,19 @@ int proc_setup_thread_self(struct super_block *s)
inode->i_gid = GLOBAL_ROOT_GID;
inode->i_op = &proc_thread_self_inode_operations;
d_add(thread_self, inode);
+ ret = 0;
} else {
dput(thread_self);
- thread_self = ERR_PTR(-ENOMEM);
}
- } else {
- thread_self = ERR_PTR(-ENOMEM);
}
inode_unlock(root_inode);
- if (IS_ERR(thread_self)) {
+
+ if (ret)
pr_err("proc_fill_super: can't allocate /proc/thread_self\n");
- return PTR_ERR(thread_self);
- }
- ns->proc_thread_self = thread_self;
- return 0;
+ else
+ ns->proc_thread_self = thread_self;
+
+ return ret;
}
void __init proc_thread_self_init(void)
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 2d1066ed3c28..75887a269b64 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -501,6 +501,9 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
{
struct pstore_record record;
+ if (!c)
+ return;
+
pstore_record_init(&record, psinfo);
record.type = PSTORE_TYPE_CONSOLE;
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 96f7d32cd184..c5c685589e36 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -110,7 +110,6 @@ struct ramoops_context {
};
static struct platform_device *dummy;
-static struct ramoops_platform_data *dummy_data;
static int ramoops_pstore_open(struct pstore_info *psi)
{
@@ -128,7 +127,6 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], int id,
struct pstore_record *record)
{
struct persistent_ram_zone *prz;
- bool update = (record->type == PSTORE_TYPE_DMESG);
/* Give up if we never existed or have hit the end. */
if (!przs)
@@ -139,7 +137,7 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], int id,
return NULL;
/* Update old/shadowed buffer. */
- if (update)
+ if (prz->type == PSTORE_TYPE_DMESG)
persistent_ram_save_old(prz);
if (!persistent_ram_old_size(prz))
@@ -347,17 +345,15 @@ out:
static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz,
struct pstore_record *record)
{
- char *hdr;
+ char hdr[36]; /* "===="(4), %lld(20), "."(1), %06lu(6), "-%c\n"(3) */
size_t len;
- hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lld.%06lu-%c\n",
+ len = scnprintf(hdr, sizeof(hdr),
+ RAMOOPS_KERNMSG_HDR "%lld.%06lu-%c\n",
(time64_t)record->time.tv_sec,
record->time.tv_nsec / 1000,
record->compressed ? 'C' : 'D');
- WARN_ON_ONCE(!hdr);
- len = hdr ? strlen(hdr) : 0;
persistent_ram_write(prz, hdr, len);
- kfree(hdr);
return len;
}
@@ -425,6 +421,9 @@ static int notrace ramoops_pstore_write(struct pstore_record *record)
/* Build header and append record contents. */
hlen = ramoops_write_kmsg_hdr(prz, record);
+ if (!hlen)
+ return -ENOMEM;
+
size = record->size;
if (size + hlen > prz->buffer_size)
size = prz->buffer_size - hlen;
@@ -711,24 +710,12 @@ static int ramoops_probe(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
struct ramoops_platform_data *pdata = dev->platform_data;
+ struct ramoops_platform_data pdata_local;
struct ramoops_context *cxt = &oops_cxt;
size_t dump_mem_sz;
phys_addr_t paddr;
int err = -EINVAL;
- if (dev_of_node(dev) && !pdata) {
- pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
- if (!pdata) {
- pr_err("cannot allocate platform data buffer\n");
- err = -ENOMEM;
- goto fail_out;
- }
-
- err = ramoops_parse_dt(pdev, pdata);
- if (err < 0)
- goto fail_out;
- }
-
/*
* Only a single ramoops area allowed at a time, so fail extra
* probes.
@@ -738,6 +725,15 @@ static int ramoops_probe(struct platform_device *pdev)
goto fail_out;
}
+ if (dev_of_node(dev) && !pdata) {
+ pdata = &pdata_local;
+ memset(pdata, 0, sizeof(*pdata));
+
+ err = ramoops_parse_dt(pdev, pdata);
+ if (err < 0)
+ goto fail_out;
+ }
+
/* Make sure we didn't get bogus platform data pointer. */
if (!pdata) {
pr_err("NULL platform data\n");
@@ -896,13 +892,12 @@ static inline void ramoops_unregister_dummy(void)
{
platform_device_unregister(dummy);
dummy = NULL;
-
- kfree(dummy_data);
- dummy_data = NULL;
}
static void __init ramoops_register_dummy(void)
{
+ struct ramoops_platform_data pdata;
+
/*
* Prepare a dummy platform data structure to carry the module
* parameters. If mem_size isn't set, then there are no module
@@ -913,30 +908,25 @@ static void __init ramoops_register_dummy(void)
pr_info("using module parameters\n");
- dummy_data = kzalloc(sizeof(*dummy_data), GFP_KERNEL);
- if (!dummy_data) {
- pr_info("could not allocate pdata\n");
- return;
- }
-
- dummy_data->mem_size = mem_size;
- dummy_data->mem_address = mem_address;
- dummy_data->mem_type = mem_type;
- dummy_data->record_size = record_size;
- dummy_data->console_size = ramoops_console_size;
- dummy_data->ftrace_size = ramoops_ftrace_size;
- dummy_data->pmsg_size = ramoops_pmsg_size;
- dummy_data->dump_oops = dump_oops;
- dummy_data->flags = RAMOOPS_FLAG_FTRACE_PER_CPU;
+ memset(&pdata, 0, sizeof(pdata));
+ pdata.mem_size = mem_size;
+ pdata.mem_address = mem_address;
+ pdata.mem_type = mem_type;
+ pdata.record_size = record_size;
+ pdata.console_size = ramoops_console_size;
+ pdata.ftrace_size = ramoops_ftrace_size;
+ pdata.pmsg_size = ramoops_pmsg_size;
+ pdata.dump_oops = dump_oops;
+ pdata.flags = RAMOOPS_FLAG_FTRACE_PER_CPU;
/*
* For backwards compatibility ramoops.ecc=1 means 16 bytes ECC
* (using 1 byte for ECC isn't much of use anyway).
*/
- dummy_data->ecc_info.ecc_size = ramoops_ecc == 1 ? 16 : ramoops_ecc;
+ pdata.ecc_info.ecc_size = ramoops_ecc == 1 ? 16 : ramoops_ecc;
dummy = platform_device_register_data(NULL, "ramoops", -1,
- dummy_data, sizeof(struct ramoops_platform_data));
+ &pdata, sizeof(pdata));
if (IS_ERR(dummy)) {
pr_info("could not create platform device: %ld\n",
PTR_ERR(dummy));
diff --git a/fs/read_write.c b/fs/read_write.c
index ff3c5e6f87cf..30df848b7451 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -426,7 +426,7 @@ ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
ssize_t result;
old_fs = get_fs();
- set_fs(get_ds());
+ set_fs(KERNEL_DS);
/* The cast to a user pointer is valid due to the set_fs() */
result = vfs_read(file, (void __user *)buf, count, pos);
set_fs(old_fs);
@@ -499,7 +499,7 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
return -EINVAL;
old_fs = get_fs();
- set_fs(get_ds());
+ set_fs(KERNEL_DS);
p = (__force const char __user *)buf;
if (count > MAX_RW_COUNT)
count = MAX_RW_COUNT;
@@ -521,7 +521,7 @@ ssize_t kernel_write(struct file *file, const void *buf, size_t count,
ssize_t res;
old_fs = get_fs();
- set_fs(get_ds());
+ set_fs(KERNEL_DS);
/* The cast to a user pointer is valid due to the set_fs() */
res = vfs_write(file, (__force const char __user *)buf, count, pos);
set_fs(old_fs);
diff --git a/fs/select.c b/fs/select.c
index d0f35dbc0e8f..6cbc9ff56ba0 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -1379,7 +1379,7 @@ COMPAT_SYSCALL_DEFINE6(pselect6_time64, int, n, compat_ulong_t __user *, inp,
#if defined(CONFIG_COMPAT_32BIT_TIME)
-COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
+COMPAT_SYSCALL_DEFINE6(pselect6_time32, int, n, compat_ulong_t __user *, inp,
compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
struct old_timespec32 __user *, tsp, void __user *, sig)
{
@@ -1402,7 +1402,7 @@ COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
#endif
#if defined(CONFIG_COMPAT_32BIT_TIME)
-COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
+COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
unsigned int, nfds, struct old_timespec32 __user *, tsp,
const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
{
diff --git a/fs/splice.c b/fs/splice.c
index de2ede048473..6489fb9436e4 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -357,7 +357,7 @@ static ssize_t kernel_readv(struct file *file, const struct kvec *vec,
ssize_t res;
old_fs = get_fs();
- set_fs(get_ds());
+ set_fs(KERNEL_DS);
/* The cast to a user pointer is valid due to the set_fs() */
res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0);
set_fs(old_fs);
@@ -1123,6 +1123,9 @@ static long do_splice(struct file *in, loff_t __user *off_in,
if (ipipe == opipe)
return -EINVAL;
+ if ((in->f_flags | out->f_flags) & O_NONBLOCK)
+ flags |= SPLICE_F_NONBLOCK;
+
return splice_pipe_to_pipe(ipipe, opipe, len, flags);
}
@@ -1148,6 +1151,9 @@ static long do_splice(struct file *in, loff_t __user *off_in,
if (unlikely(ret < 0))
return ret;
+ if (in->f_flags & O_NONBLOCK)
+ flags |= SPLICE_F_NONBLOCK;
+
file_start_write(out);
ret = do_splice_from(ipipe, out, &offset, len, flags);
file_end_write(out);
@@ -1172,6 +1178,9 @@ static long do_splice(struct file *in, loff_t __user *off_in,
offset = in->f_pos;
}
+ if (out->f_flags & O_NONBLOCK)
+ flags |= SPLICE_F_NONBLOCK;
+
pipe_lock(opipe);
ret = wait_for_space(opipe, flags);
if (!ret)
@@ -1717,6 +1726,9 @@ static long do_tee(struct file *in, struct file *out, size_t len,
* copying the data.
*/
if (ipipe && opipe && ipipe != opipe) {
+ if ((in->f_flags | out->f_flags) & O_NONBLOCK)
+ flags |= SPLICE_F_NONBLOCK;
+
/*
* Keep going, unless we encounter an error. The ipipe/opipe
* ordering doesn't really matter.
diff --git a/fs/statfs.c b/fs/statfs.c
index f0216629621d..eea7af6f2f22 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -67,6 +67,20 @@ static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
return retval;
}
+int vfs_get_fsid(struct dentry *dentry, __kernel_fsid_t *fsid)
+{
+ struct kstatfs st;
+ int error;
+
+ error = statfs_by_dentry(dentry, &st);
+ if (error)
+ return error;
+
+ *fsid = st.f_fsid;
+ return 0;
+}
+EXPORT_SYMBOL(vfs_get_fsid);
+
int vfs_statfs(const struct path *path, struct kstatfs *buf)
{
int error;
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index feeae8081c22..aa85f2874a9f 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -43,7 +43,8 @@ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
kuid_t uid;
kgid_t gid;
- BUG_ON(!kobj);
+ if (WARN_ON(!kobj))
+ return -EINVAL;
if (kobj->parent)
parent = kobj->parent->sd;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index bb71db63c99c..130fc6fbcc03 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -17,7 +17,6 @@
#include <linux/seq_file.h>
#include "sysfs.h"
-#include "../kernfs/kernfs-internal.h"
/*
* Determine ktype->sysfs_ops for the given kernfs_node. This function
@@ -325,7 +324,8 @@ int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr,
kuid_t uid;
kgid_t gid;
- BUG_ON(!kobj || !kobj->sd || !attr);
+ if (WARN_ON(!kobj || !kobj->sd || !attr))
+ return -EINVAL;
kobject_get_ownership(kobj, &uid, &gid);
return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode,
@@ -496,6 +496,7 @@ bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr)
void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *ptr)
{
int i;
+
for (i = 0; ptr[i]; i++)
sysfs_remove_file(kobj, ptr[i]);
}
@@ -537,7 +538,8 @@ int sysfs_create_bin_file(struct kobject *kobj,
kuid_t uid;
kgid_t gid;
- BUG_ON(!kobj || !kobj->sd || !attr);
+ if (WARN_ON(!kobj || !kobj->sd || !attr))
+ return -EINVAL;
kobject_get_ownership(kobj, &uid, &gid);
return sysfs_add_file_mode_ns(kobj->sd, &attr->attr, true,
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 1eb2d6307663..57038604d4a8 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -112,7 +112,8 @@ static int internal_create_group(struct kobject *kobj, int update,
kgid_t gid;
int error;
- BUG_ON(!kobj || (!update && !kobj->sd));
+ if (WARN_ON(!kobj || (!update && !kobj->sd)))
+ return -EINVAL;
/* Updates may happen before the object has been instantiated */
if (unlikely(update && !kobj->sd))
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 215c225b2ca1..c4deecc80f67 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -23,7 +23,8 @@ static int sysfs_do_create_link_sd(struct kernfs_node *parent,
{
struct kernfs_node *kn, *target = NULL;
- BUG_ON(!name || !parent);
+ if (WARN_ON(!name || !parent))
+ return -EINVAL;
/*
* We don't own @target_kobj and it may be removed at any time.
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 803ca070d42e..6a6fc8aa1de7 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -560,7 +560,7 @@ SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct __kernel_itimerspec __user *,
}
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
+SYSCALL_DEFINE4(timerfd_settime32, int, ufd, int, flags,
const struct old_itimerspec32 __user *, utmr,
struct old_itimerspec32 __user *, otmr)
{
@@ -577,7 +577,7 @@ COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
return ret;
}
-COMPAT_SYSCALL_DEFINE2(timerfd_gettime, int, ufd,
+SYSCALL_DEFINE2(timerfd_gettime32, int, ufd,
struct old_itimerspec32 __user *, otmr)
{
struct itimerspec64 kotmr;
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index bc1e082d921d..9da2f135121b 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -8,6 +8,7 @@ config UBIFS_FS
select CRYPTO_LZO if UBIFS_FS_LZO
select CRYPTO_DEFLATE if UBIFS_FS_ZLIB
select CRYPTO_HASH_INFO
+ select UBIFS_FS_XATTR if FS_ENCRYPTION
depends on MTD_UBI
help
UBIFS is a file system for flash devices which works on top of UBI.
@@ -60,17 +61,6 @@ config UBIFS_FS_XATTR
If unsure, say Y.
-config UBIFS_FS_ENCRYPTION
- bool "UBIFS Encryption"
- depends on UBIFS_FS_XATTR && BLOCK
- select FS_ENCRYPTION
- default n
- help
- Enable encryption of UBIFS files and directories. This
- feature is similar to ecryptfs, but it is more memory
- efficient since it avoids caching the encrypted and
- decrypted pages in the page cache.
-
config UBIFS_FS_SECURITY
bool "UBIFS Security Labels"
depends on UBIFS_FS_XATTR
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile
index 5f838319c8d5..5c4b845754a7 100644
--- a/fs/ubifs/Makefile
+++ b/fs/ubifs/Makefile
@@ -6,6 +6,6 @@ ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o
ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o
ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o debug.o
ubifs-y += misc.o
-ubifs-$(CONFIG_UBIFS_FS_ENCRYPTION) += crypto.o
+ubifs-$(CONFIG_FS_ENCRYPTION) += crypto.o
ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o
ubifs-$(CONFIG_UBIFS_FS_AUTHENTICATION) += auth.o
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 0164bcc827f8..0f9c362a3402 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -185,7 +185,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return err;
}
case FS_IOC_SET_ENCRYPTION_POLICY: {
-#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
struct ubifs_info *c = inode->i_sb->s_fs_info;
err = ubifs_enable_encryption(c);
@@ -198,7 +198,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
#endif
}
case FS_IOC_GET_ENCRYPTION_POLICY: {
-#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
return fscrypt_ioctl_get_policy(file, (void __user *)arg);
#else
return -EOPNOTSUPP;
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 3da90c951c23..67fac1e8adfb 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -748,7 +748,7 @@ int ubifs_read_superblock(struct ubifs_info *c)
goto out;
}
-#ifndef CONFIG_UBIFS_FS_ENCRYPTION
+#ifndef CONFIG_FS_ENCRYPTION
if (c->encrypted) {
ubifs_err(c, "file system contains encrypted files but UBIFS"
" was built without crypto support.");
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 1fac1133dadd..8dc2818fdd84 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2146,7 +2146,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_UBIFS_FS_XATTR
sb->s_xattr = ubifs_xattr_handlers;
#endif
-#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
sb->s_cop = &ubifs_crypt_operations;
#endif
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 38401adaa00d..1ae12900e01d 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -43,7 +43,6 @@
#include <crypto/hash.h>
#include <crypto/algapi.h>
-#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_UBIFS_FS_ENCRYPTION)
#include <linux/fscrypt.h>
#include "ubifs-media.h"
@@ -142,7 +141,7 @@
*/
#define WORST_COMPR_FACTOR 2
-#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
#define UBIFS_CIPHER_BLOCK_SIZE FS_CRYPTO_BLOCK_SIZE
#else
#define UBIFS_CIPHER_BLOCK_SIZE 0
@@ -2072,7 +2071,7 @@ int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len,
#include "misc.h"
#include "key.h"
-#ifndef CONFIG_UBIFS_FS_ENCRYPTION
+#ifndef CONFIG_FS_ENCRYPTION
static inline int ubifs_encrypt(const struct inode *inode,
struct ubifs_data_node *dn,
unsigned int in_len, unsigned int *out_len,
diff --git a/fs/udf/super.c b/fs/udf/super.c
index e3d684ea3203..ffd8038ff728 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1474,6 +1474,17 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
if (lvd->integritySeqExt.extLength)
udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt));
ret = 0;
+
+ if (!sbi->s_lvid_bh) {
+ /* We can't generate unique IDs without a valid LVID */
+ if (sb_rdonly(sb)) {
+ UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
+ } else {
+ udf_warn(sb, "Damaged or missing LVID, forcing "
+ "readonly mount\n");
+ ret = -EACCES;
+ }
+ }
out_bh:
brelse(bh);
return ret;
@@ -1943,13 +1954,24 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
return 0;
}
+static void udf_finalize_lvid(struct logicalVolIntegrityDesc *lvid)
+{
+ struct timespec64 ts;
+
+ ktime_get_real_ts64(&ts);
+ udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts);
+ lvid->descTag.descCRC = cpu_to_le16(
+ crc_itu_t(0, (char *)lvid + sizeof(struct tag),
+ le16_to_cpu(lvid->descTag.descCRCLength)));
+ lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
+}
+
static void udf_open_lvid(struct super_block *sb)
{
struct udf_sb_info *sbi = UDF_SB(sb);
struct buffer_head *bh = sbi->s_lvid_bh;
struct logicalVolIntegrityDesc *lvid;
struct logicalVolIntegrityDescImpUse *lvidiu;
- struct timespec64 ts;
if (!bh)
return;
@@ -1961,18 +1983,12 @@ static void udf_open_lvid(struct super_block *sb)
mutex_lock(&sbi->s_alloc_mutex);
lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
- ktime_get_real_ts64(&ts);
- udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts);
if (le32_to_cpu(lvid->integrityType) == LVID_INTEGRITY_TYPE_CLOSE)
lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN);
else
UDF_SET_FLAG(sb, UDF_FLAG_INCONSISTENT);
- lvid->descTag.descCRC = cpu_to_le16(
- crc_itu_t(0, (char *)lvid + sizeof(struct tag),
- le16_to_cpu(lvid->descTag.descCRCLength)));
-
- lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
+ udf_finalize_lvid(lvid);
mark_buffer_dirty(bh);
sbi->s_lvid_dirty = 0;
mutex_unlock(&sbi->s_alloc_mutex);
@@ -1986,7 +2002,6 @@ static void udf_close_lvid(struct super_block *sb)
struct buffer_head *bh = sbi->s_lvid_bh;
struct logicalVolIntegrityDesc *lvid;
struct logicalVolIntegrityDescImpUse *lvidiu;
- struct timespec64 ts;
if (!bh)
return;
@@ -1998,8 +2013,6 @@ static void udf_close_lvid(struct super_block *sb)
mutex_lock(&sbi->s_alloc_mutex);
lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
- ktime_get_real_ts64(&ts);
- udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts);
if (UDF_MAX_WRITE_VERSION > le16_to_cpu(lvidiu->maxUDFWriteRev))
lvidiu->maxUDFWriteRev = cpu_to_le16(UDF_MAX_WRITE_VERSION);
if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFReadRev))
@@ -2009,17 +2022,13 @@ static void udf_close_lvid(struct super_block *sb)
if (!UDF_QUERY_FLAG(sb, UDF_FLAG_INCONSISTENT))
lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE);
- lvid->descTag.descCRC = cpu_to_le16(
- crc_itu_t(0, (char *)lvid + sizeof(struct tag),
- le16_to_cpu(lvid->descTag.descCRCLength)));
-
- lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
/*
* We set buffer uptodate unconditionally here to avoid spurious
* warnings from mark_buffer_dirty() when previous EIO has marked
* the buffer as !uptodate
*/
set_buffer_uptodate(bh);
+ udf_finalize_lvid(lvid);
mark_buffer_dirty(bh);
sbi->s_lvid_dirty = 0;
mutex_unlock(&sbi->s_alloc_mutex);
@@ -2048,8 +2057,8 @@ u64 lvid_get_unique_id(struct super_block *sb)
if (!(++uniqueID & 0xFFFFFFFF))
uniqueID += 16;
lvhd->uniqueID = cpu_to_le64(uniqueID);
+ udf_updated_lvid(sb);
mutex_unlock(&sbi->s_alloc_mutex);
- mark_buffer_dirty(bh);
return ret;
}
@@ -2320,11 +2329,17 @@ static int udf_sync_fs(struct super_block *sb, int wait)
mutex_lock(&sbi->s_alloc_mutex);
if (sbi->s_lvid_dirty) {
+ struct buffer_head *bh = sbi->s_lvid_bh;
+ struct logicalVolIntegrityDesc *lvid;
+
+ lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
+ udf_finalize_lvid(lvid);
+
/*
* Blockdevice will be synced later so we don't have to submit
* the buffer for IO
*/
- mark_buffer_dirty(sbi->s_lvid_bh);
+ mark_buffer_dirty(bh);
sbi->s_lvid_dirty = 0;
}
mutex_unlock(&sbi->s_alloc_mutex);
diff --git a/fs/utimes.c b/fs/utimes.c
index bdcf2daf39c1..350c9c16ace1 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -224,8 +224,8 @@ SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times)
* of sys_utimes.
*/
#ifdef __ARCH_WANT_SYS_UTIME32
-COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename,
- struct old_utimbuf32 __user *, t)
+SYSCALL_DEFINE2(utime32, const char __user *, filename,
+ struct old_utimbuf32 __user *, t)
{
struct timespec64 tv[2];
@@ -240,7 +240,7 @@ COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename,
}
#endif
-COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct old_timespec32 __user *, t, int, flags)
+SYSCALL_DEFINE4(utimensat_time32, unsigned int, dfd, const char __user *, filename, struct old_timespec32 __user *, t, int, flags)
{
struct timespec64 tv[2];
@@ -276,14 +276,14 @@ static long do_compat_futimesat(unsigned int dfd, const char __user *filename,
return do_utimes(dfd, filename, t ? tv : NULL, 0);
}
-COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd,
+SYSCALL_DEFINE3(futimesat_time32, unsigned int, dfd,
const char __user *, filename,
struct old_timeval32 __user *, t)
{
return do_compat_futimesat(dfd, filename, t);
}
-COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct old_timeval32 __user *, t)
+SYSCALL_DEFINE2(utimes_time32, const char __user *, filename, struct old_timeval32 __user *, t)
{
return do_compat_futimesat(AT_FDCWD, filename, t);
}
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 999ad8d00d43..1ef8acf35e7d 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -339,14 +339,14 @@ xfs_ag_init_headers(
{ /* BNO root block */
.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)),
.numblks = BTOBB(mp->m_sb.sb_blocksize),
- .ops = &xfs_allocbt_buf_ops,
+ .ops = &xfs_bnobt_buf_ops,
.work = &xfs_bnoroot_init,
.need_init = true
},
{ /* CNT root block */
.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)),
.numblks = BTOBB(mp->m_sb.sb_blocksize),
- .ops = &xfs_allocbt_buf_ops,
+ .ops = &xfs_cntbt_buf_ops,
.work = &xfs_cntroot_init,
.need_init = true
},
@@ -361,7 +361,7 @@ xfs_ag_init_headers(
{ /* FINO root block */
.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)),
.numblks = BTOBB(mp->m_sb.sb_blocksize),
- .ops = &xfs_inobt_buf_ops,
+ .ops = &xfs_finobt_buf_ops,
.work = &xfs_btroot_init,
.type = XFS_BTNUM_FINO,
.need_init = xfs_sb_version_hasfinobt(&mp->m_sb)
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index e701ebc36c06..e2ba2a3b63b2 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -281,7 +281,7 @@ xfs_ag_resv_init(
*/
ask = used = 0;
- mp->m_inotbt_nores = true;
+ mp->m_finobt_nores = true;
error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask,
&used);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index b715668886a4..bc3367b8b7bb 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -568,9 +568,9 @@ xfs_agfl_verify(
if (!xfs_sb_version_hascrc(&mp->m_sb))
return NULL;
- if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
+ if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
return __this_address;
- if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
+ if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
/*
* during growfs operations, the perag is not fully initialised,
@@ -643,6 +643,7 @@ xfs_agfl_write_verify(
const struct xfs_buf_ops xfs_agfl_buf_ops = {
.name = "xfs_agfl",
+ .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
.verify_read = xfs_agfl_read_verify,
.verify_write = xfs_agfl_write_verify,
.verify_struct = xfs_agfl_verify,
@@ -2587,8 +2588,10 @@ xfs_agf_verify(
return __this_address;
}
- if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
- XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+ if (!xfs_verify_magic(bp, agf->agf_magicnum))
+ return __this_address;
+
+ if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
@@ -2670,6 +2673,7 @@ xfs_agf_write_verify(
const struct xfs_buf_ops xfs_agf_buf_ops = {
.name = "xfs_agf",
+ .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
.verify_read = xfs_agf_read_verify,
.verify_write = xfs_agf_write_verify,
.verify_struct = xfs_agf_verify,
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 4e59cc8a2802..9fe949f6055e 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -297,48 +297,34 @@ xfs_allocbt_verify(
struct xfs_perag *pag = bp->b_pag;
xfs_failaddr_t fa;
unsigned int level;
+ xfs_btnum_t btnum = XFS_BTNUM_BNOi;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ fa = xfs_btree_sblock_v5hdr_verify(bp);
+ if (fa)
+ return fa;
+ }
/*
- * magic number and level verification
- *
- * During growfs operations, we can't verify the exact level or owner as
- * the perag is not fully initialised and hence not attached to the
- * buffer. In this case, check against the maximum tree depth.
+ * The perag may not be attached during grow operations or fully
+ * initialized from the AGF during log recovery. Therefore we can only
+ * check against maximum tree depth from those contexts.
*
- * Similarly, during log recovery we will have a perag structure
- * attached, but the agf information will not yet have been initialised
- * from the on disk AGF. Again, we can only check against maximum limits
- * in this case.
+ * Otherwise check against the per-tree limit. Peek at one of the
+ * verifier magic values to determine the type of tree we're verifying
+ * against.
*/
level = be16_to_cpu(block->bb_level);
- switch (block->bb_magic) {
- case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
- fa = xfs_btree_sblock_v5hdr_verify(bp);
- if (fa)
- return fa;
- /* fall through */
- case cpu_to_be32(XFS_ABTB_MAGIC):
- if (pag && pag->pagf_init) {
- if (level >= pag->pagf_levels[XFS_BTNUM_BNOi])
- return __this_address;
- } else if (level >= mp->m_ag_maxlevels)
+ if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC))
+ btnum = XFS_BTNUM_CNTi;
+ if (pag && pag->pagf_init) {
+ if (level >= pag->pagf_levels[btnum])
return __this_address;
- break;
- case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
- fa = xfs_btree_sblock_v5hdr_verify(bp);
- if (fa)
- return fa;
- /* fall through */
- case cpu_to_be32(XFS_ABTC_MAGIC):
- if (pag && pag->pagf_init) {
- if (level >= pag->pagf_levels[XFS_BTNUM_CNTi])
- return __this_address;
- } else if (level >= mp->m_ag_maxlevels)
- return __this_address;
- break;
- default:
+ } else if (level >= mp->m_ag_maxlevels)
return __this_address;
- }
return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]);
}
@@ -377,13 +363,23 @@ xfs_allocbt_write_verify(
}
-const struct xfs_buf_ops xfs_allocbt_buf_ops = {
- .name = "xfs_allocbt",
+const struct xfs_buf_ops xfs_bnobt_buf_ops = {
+ .name = "xfs_bnobt",
+ .magic = { cpu_to_be32(XFS_ABTB_MAGIC),
+ cpu_to_be32(XFS_ABTB_CRC_MAGIC) },
.verify_read = xfs_allocbt_read_verify,
.verify_write = xfs_allocbt_write_verify,
.verify_struct = xfs_allocbt_verify,
};
+const struct xfs_buf_ops xfs_cntbt_buf_ops = {
+ .name = "xfs_cntbt",
+ .magic = { cpu_to_be32(XFS_ABTC_MAGIC),
+ cpu_to_be32(XFS_ABTC_CRC_MAGIC) },
+ .verify_read = xfs_allocbt_read_verify,
+ .verify_write = xfs_allocbt_write_verify,
+ .verify_struct = xfs_allocbt_verify,
+};
STATIC int
xfs_bnobt_keys_inorder(
@@ -448,7 +444,7 @@ static const struct xfs_btree_ops xfs_bnobt_ops = {
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
.key_diff = xfs_bnobt_key_diff,
- .buf_ops = &xfs_allocbt_buf_ops,
+ .buf_ops = &xfs_bnobt_buf_ops,
.diff_two_keys = xfs_bnobt_diff_two_keys,
.keys_inorder = xfs_bnobt_keys_inorder,
.recs_inorder = xfs_bnobt_recs_inorder,
@@ -470,7 +466,7 @@ static const struct xfs_btree_ops xfs_cntbt_ops = {
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
.key_diff = xfs_cntbt_key_diff,
- .buf_ops = &xfs_allocbt_buf_ops,
+ .buf_ops = &xfs_cntbt_buf_ops,
.diff_two_keys = xfs_cntbt_diff_two_keys,
.keys_inorder = xfs_cntbt_keys_inorder,
.recs_inorder = xfs_cntbt_recs_inorder,
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 844ed87b1900..2dd9ee2a2e08 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -1336,3 +1336,20 @@ xfs_attr_node_get(xfs_da_args_t *args)
xfs_da_state_free(state);
return retval;
}
+
+/* Returns true if the attribute entry name is valid. */
+bool
+xfs_attr_namecheck(
+ const void *name,
+ size_t length)
+{
+ /*
+ * MAXNAMELEN includes the trailing null, but (name/length) leave it
+ * out, so use >= for the length check.
+ */
+ if (length >= MAXNAMELEN)
+ return false;
+
+ /* There shouldn't be any nulls here */
+ return !memchr(name, 0, length);
+}
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index bdf52a333f3f..2297d8467666 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -145,6 +145,6 @@ int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
int xfs_attr_remove_args(struct xfs_da_args *args);
int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
int flags, struct attrlist_cursor_kern *cursor);
-
+bool xfs_attr_namecheck(const void *name, size_t length);
#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 2652d00842d6..1f6e3965ff74 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -245,25 +245,14 @@ xfs_attr3_leaf_verify(
struct xfs_attr_leaf_entry *entries;
uint32_t end; /* must be 32bit - see below */
int i;
+ xfs_failaddr_t fa;
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
-
- if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC)
- return __this_address;
+ fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+ if (fa)
+ return fa;
- if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
- return __this_address;
- if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
- return __this_address;
- if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
- return __this_address;
- } else {
- if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
- return __this_address;
- }
/*
* In recovery there is a transient state where count == 0 is valid
* because we may have transitioned an empty shortform attr to a leaf
@@ -369,6 +358,8 @@ xfs_attr3_leaf_read_verify(
const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
.name = "xfs_attr3_leaf",
+ .magic16 = { cpu_to_be16(XFS_ATTR_LEAF_MAGIC),
+ cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) },
.verify_read = xfs_attr3_leaf_read_verify,
.verify_write = xfs_attr3_leaf_write_verify,
.verify_struct = xfs_attr3_leaf_verify,
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index d89363c6b523..65ff600a8067 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -79,6 +79,7 @@ xfs_attr3_rmt_hdr_ok(
static xfs_failaddr_t
xfs_attr3_rmt_verify(
struct xfs_mount *mp,
+ struct xfs_buf *bp,
void *ptr,
int fsbsize,
xfs_daddr_t bno)
@@ -87,7 +88,7 @@ xfs_attr3_rmt_verify(
if (!xfs_sb_version_hascrc(&mp->m_sb))
return __this_address;
- if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC))
+ if (!xfs_verify_magic(bp, rmt->rm_magic))
return __this_address;
if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
@@ -131,7 +132,7 @@ __xfs_attr3_rmt_read_verify(
*failaddr = __this_address;
return -EFSBADCRC;
}
- *failaddr = xfs_attr3_rmt_verify(mp, ptr, blksize, bno);
+ *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
if (*failaddr)
return -EFSCORRUPTED;
len -= blksize;
@@ -193,7 +194,7 @@ xfs_attr3_rmt_write_verify(
while (len > 0) {
struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr;
- fa = xfs_attr3_rmt_verify(mp, ptr, blksize, bno);
+ fa = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
if (fa) {
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
@@ -220,6 +221,7 @@ xfs_attr3_rmt_write_verify(
const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
.name = "xfs_attr3_rmt",
+ .magic = { 0, cpu_to_be32(XFS_ATTR3_RMT_MAGIC) },
.verify_read = xfs_attr3_rmt_read_verify,
.verify_write = xfs_attr3_rmt_write_verify,
.verify_struct = xfs_attr3_rmt_verify_struct,
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 332eefa2700b..48502cb9990f 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -577,42 +577,44 @@ __xfs_bmap_add_free(
*/
/*
- * Transform a btree format file with only one leaf node, where the
- * extents list will fit in the inode, into an extents format file.
- * Since the file extents are already in-core, all we have to do is
- * give up the space for the btree root and pitch the leaf block.
+ * Convert the inode format to extent format if it currently is in btree format,
+ * but the extent list is small enough that it fits into the extent format.
+ *
+ * Since the extents are already in-core, all we have to do is give up the space
+ * for the btree root and pitch the leaf block.
*/
STATIC int /* error */
xfs_bmap_btree_to_extents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_btree_cur_t *cur, /* btree cursor */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode pointer */
+ struct xfs_btree_cur *cur, /* btree cursor */
int *logflagsp, /* inode logging flags */
int whichfork) /* data or attr fork */
{
- /* REFERENCED */
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_btree_block *rblock = ifp->if_broot;
struct xfs_btree_block *cblock;/* child btree block */
xfs_fsblock_t cbno; /* child block number */
xfs_buf_t *cbp; /* child block's buffer */
int error; /* error return value */
- struct xfs_ifork *ifp; /* inode fork data */
- xfs_mount_t *mp; /* mount point structure */
__be64 *pp; /* ptr to block address */
- struct xfs_btree_block *rblock;/* root btree block */
struct xfs_owner_info oinfo;
- mp = ip->i_mount;
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ /* check if we actually need the extent format first: */
+ if (!xfs_bmap_wants_extents(ip, whichfork))
+ return 0;
+
+ ASSERT(cur);
ASSERT(whichfork != XFS_COW_FORK);
ASSERT(ifp->if_flags & XFS_IFEXTENTS);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
- rblock = ifp->if_broot;
ASSERT(be16_to_cpu(rblock->bb_level) == 1);
ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+
pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
cbno = be64_to_cpu(*pp);
- *logflagsp = 0;
#ifdef DEBUG
XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
xfs_btree_check_lptr(cur, cbno, 1));
@@ -635,7 +637,7 @@ xfs_bmap_btree_to_extents(
ASSERT(ifp->if_broot == NULL);
ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
- *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+ *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
return 0;
}
@@ -2029,7 +2031,7 @@ done:
/*
* Convert an unwritten allocation to a real allocation or vice versa.
*/
-STATIC int /* error */
+int /* error */
xfs_bmap_add_extent_unwritten_real(
struct xfs_trans *tp,
xfs_inode_t *ip, /* incore inode pointer */
@@ -3685,17 +3687,6 @@ xfs_trim_extent(
}
}
-/* trim extent to within eof */
-void
-xfs_trim_extent_eof(
- struct xfs_bmbt_irec *irec,
- struct xfs_inode *ip)
-
-{
- xfs_trim_extent(irec, 0, XFS_B_TO_FSB(ip->i_mount,
- i_size_read(VFS_I(ip))));
-}
-
/*
* Trim the returned map to the required bounds
*/
@@ -4203,6 +4194,44 @@ xfs_bmapi_convert_unwritten(
return 0;
}
+static inline xfs_extlen_t
+xfs_bmapi_minleft(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int fork)
+{
+ if (tp && tp->t_firstblock != NULLFSBLOCK)
+ return 0;
+ if (XFS_IFORK_FORMAT(ip, fork) != XFS_DINODE_FMT_BTREE)
+ return 1;
+ return be16_to_cpu(XFS_IFORK_PTR(ip, fork)->if_broot->bb_level) + 1;
+}
+
+/*
+ * Log whatever the flags say, even if error. Otherwise we might miss detecting
+ * a case where the data is changed, there's an error, and it's not logged so we
+ * don't shutdown when we should. Don't bother logging extents/btree changes if
+ * we converted to the other format.
+ */
+static void
+xfs_bmapi_finish(
+ struct xfs_bmalloca *bma,
+ int whichfork,
+ int error)
+{
+ if ((bma->logflags & xfs_ilog_fext(whichfork)) &&
+ XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+ bma->logflags &= ~xfs_ilog_fext(whichfork);
+ else if ((bma->logflags & xfs_ilog_fbroot(whichfork)) &&
+ XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_BTREE)
+ bma->logflags &= ~xfs_ilog_fbroot(whichfork);
+
+ if (bma->logflags)
+ xfs_trans_log_inode(bma->tp, bma->ip, bma->logflags);
+ if (bma->cur)
+ xfs_btree_del_cursor(bma->cur, error);
+}
+
/*
* Map file blocks to filesystem blocks, and allocate blocks or convert the
* extent state if necessary. Details behaviour is controlled by the flags
@@ -4247,9 +4276,7 @@ xfs_bmapi_write(
ASSERT(*nmap >= 1);
ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
- ASSERT(tp != NULL ||
- (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) ==
- (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK));
+ ASSERT(tp != NULL);
ASSERT(len > 0);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -4282,25 +4309,12 @@ xfs_bmapi_write(
XFS_STATS_INC(mp, xs_blk_mapw);
- if (!tp || tp->t_firstblock == NULLFSBLOCK) {
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
- bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
- else
- bma.minleft = 1;
- } else {
- bma.minleft = 0;
- }
-
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(tp, ip, whichfork);
if (error)
goto error0;
}
- n = 0;
- end = bno + len;
- obno = bno;
-
if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got))
eof = true;
if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
@@ -4309,7 +4323,11 @@ xfs_bmapi_write(
bma.ip = ip;
bma.total = total;
bma.datatype = 0;
+ bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
+ n = 0;
+ end = bno + len;
+ obno = bno;
while (bno < end && n < *nmap) {
bool need_alloc = false, wasdelay = false;
@@ -4323,26 +4341,7 @@ xfs_bmapi_write(
ASSERT(!((flags & XFS_BMAPI_CONVERT) &&
(flags & XFS_BMAPI_COWFORK)));
- if (flags & XFS_BMAPI_DELALLOC) {
- /*
- * For the COW fork we can reasonably get a
- * request for converting an extent that races
- * with other threads already having converted
- * part of it, as there converting COW to
- * regular blocks is not protected using the
- * IOLOCK.
- */
- ASSERT(flags & XFS_BMAPI_COWFORK);
- if (!(flags & XFS_BMAPI_COWFORK)) {
- error = -EIO;
- goto error0;
- }
-
- if (eof || bno >= end)
- break;
- } else {
- need_alloc = true;
- }
+ need_alloc = true;
} else if (isnullstartblock(bma.got.br_startblock)) {
wasdelay = true;
}
@@ -4351,8 +4350,7 @@ xfs_bmapi_write(
* First, deal with the hole before the allocated space
* that we found, if any.
*/
- if ((need_alloc || wasdelay) &&
- !(flags & XFS_BMAPI_CONVERT_ONLY)) {
+ if (need_alloc || wasdelay) {
bma.eof = eof;
bma.conv = !!(flags & XFS_BMAPI_CONVERT);
bma.wasdel = wasdelay;
@@ -4420,49 +4418,130 @@ xfs_bmapi_write(
}
*nmap = n;
- /*
- * Transform from btree to extents, give it cur.
- */
- if (xfs_bmap_wants_extents(ip, whichfork)) {
- int tmp_logflags = 0;
-
- ASSERT(bma.cur);
- error = xfs_bmap_btree_to_extents(tp, ip, bma.cur,
- &tmp_logflags, whichfork);
- bma.logflags |= tmp_logflags;
- if (error)
- goto error0;
- }
+ error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
+ whichfork);
+ if (error)
+ goto error0;
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
XFS_IFORK_NEXTENTS(ip, whichfork) >
XFS_IFORK_MAXEXT(ip, whichfork));
- error = 0;
+ xfs_bmapi_finish(&bma, whichfork, 0);
+ xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
+ orig_nmap, *nmap);
+ return 0;
error0:
+ xfs_bmapi_finish(&bma, whichfork, error);
+ return error;
+}
+
+/*
+ * Convert an existing delalloc extent to real blocks based on file offset. This
+ * attempts to allocate the entire delalloc extent and may require multiple
+ * invocations to allocate the target offset if a large enough physical extent
+ * is not available.
+ */
+int
+xfs_bmapi_convert_delalloc(
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_fileoff_t offset_fsb,
+ struct xfs_bmbt_irec *imap,
+ unsigned int *seq)
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmalloca bma = { NULL };
+ struct xfs_trans *tp;
+ int error;
+
/*
- * Log everything. Do this after conversion, there's no point in
- * logging the extent records if we've converted to btree format.
+ * Space for the extent and indirect blocks was reserved when the
+ * delalloc extent was created so there's no need to do so here.
*/
- if ((bma.logflags & xfs_ilog_fext(whichfork)) &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
- bma.logflags &= ~xfs_ilog_fext(whichfork);
- else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
- bma.logflags &= ~xfs_ilog_fbroot(whichfork);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0,
+ XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) ||
+ bma.got.br_startoff > offset_fsb) {
+ /*
+ * No extent found in the range we are trying to convert. This
+ * should only happen for the COW fork, where another thread
+ * might have moved the extent to the data fork in the meantime.
+ */
+ WARN_ON_ONCE(whichfork != XFS_COW_FORK);
+ error = -EAGAIN;
+ goto out_trans_cancel;
+ }
+
/*
- * Log whatever the flags say, even if error. Otherwise we might miss
- * detecting a case where the data is changed, there's an error,
- * and it's not logged so we don't shutdown when we should.
+ * If we find a real extent here we raced with another thread converting
+ * the extent. Just return the real extent at this offset.
*/
- if (bma.logflags)
- xfs_trans_log_inode(tp, ip, bma.logflags);
+ if (!isnullstartblock(bma.got.br_startblock)) {
+ *imap = bma.got;
+ *seq = READ_ONCE(ifp->if_seq);
+ goto out_trans_cancel;
+ }
+
+ bma.tp = tp;
+ bma.ip = ip;
+ bma.wasdel = true;
+ bma.offset = bma.got.br_startoff;
+ bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN);
+ bma.total = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
+ bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
+ if (whichfork == XFS_COW_FORK)
+ bma.flags = XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
- if (bma.cur) {
- xfs_btree_del_cursor(bma.cur, error);
+ if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
+ bma.prev.br_startoff = NULLFILEOFF;
+
+ error = xfs_bmapi_allocate(&bma);
+ if (error)
+ goto out_finish;
+
+ error = -ENOSPC;
+ if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
+ goto out_finish;
+ error = -EFSCORRUPTED;
+ if (WARN_ON_ONCE(!bma.got.br_startblock && !XFS_IS_REALTIME_INODE(ip)))
+ goto out_finish;
+
+ XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length));
+ XFS_STATS_INC(mp, xs_xstrat_quick);
+
+ ASSERT(!isnullstartblock(bma.got.br_startblock));
+ *imap = bma.got;
+ *seq = READ_ONCE(ifp->if_seq);
+
+ if (whichfork == XFS_COW_FORK) {
+ error = xfs_refcount_alloc_cow_extent(tp, bma.blkno,
+ bma.length);
+ if (error)
+ goto out_finish;
}
- if (!error)
- xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
- orig_nmap, *nmap);
+
+ error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
+ whichfork);
+ if (error)
+ goto out_finish;
+
+ xfs_bmapi_finish(&bma, whichfork, 0);
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+
+out_finish:
+ xfs_bmapi_finish(&bma, whichfork, error);
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -4536,13 +4615,7 @@ xfs_bmapi_remap(
if (error)
goto error0;
- if (xfs_bmap_wants_extents(ip, whichfork)) {
- int tmp_logflags = 0;
-
- error = xfs_bmap_btree_to_extents(tp, ip, cur,
- &tmp_logflags, whichfork);
- logflags |= tmp_logflags;
- }
+ error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork);
error0:
if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS)
@@ -5406,24 +5479,11 @@ nodelete:
error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0,
&tmp_logflags, whichfork);
logflags |= tmp_logflags;
- if (error)
- goto error0;
- }
- /*
- * transform from btree to extents, give it cur
- */
- else if (xfs_bmap_wants_extents(ip, whichfork)) {
- ASSERT(cur != NULL);
- error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
+ } else {
+ error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags,
whichfork);
- logflags |= tmp_logflags;
- if (error)
- goto error0;
}
- /*
- * transform from extents to local?
- */
- error = 0;
+
error0:
/*
* Log everything. Do this after conversion, there's no point in
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 09d3ea97cc15..8f597f9abdbe 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -95,12 +95,6 @@ struct xfs_extent_free_item
/* Map something in the CoW fork. */
#define XFS_BMAPI_COWFORK 0x200
-/* Only convert delalloc space, don't allocate entirely new extents */
-#define XFS_BMAPI_DELALLOC 0x400
-
-/* Only convert unwritten extents, don't allocate new blocks */
-#define XFS_BMAPI_CONVERT_ONLY 0x800
-
/* Skip online discard of freed extents */
#define XFS_BMAPI_NODISCARD 0x1000
@@ -117,8 +111,6 @@ struct xfs_extent_free_item
{ XFS_BMAPI_ZERO, "ZERO" }, \
{ XFS_BMAPI_REMAP, "REMAP" }, \
{ XFS_BMAPI_COWFORK, "COWFORK" }, \
- { XFS_BMAPI_DELALLOC, "DELALLOC" }, \
- { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \
{ XFS_BMAPI_NODISCARD, "NODISCARD" }, \
{ XFS_BMAPI_NORMAP, "NORMAP" }
@@ -181,7 +173,6 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
xfs_filblks_t len);
-void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *);
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version);
void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
@@ -228,6 +219,13 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
int eof);
+int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
+ xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap,
+ unsigned int *seq);
+int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
+ struct xfs_inode *ip, int whichfork,
+ struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
+ struct xfs_bmbt_irec *new, int *logflagsp);
static inline void
xfs_bmap_add_free(
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index cdb74d2e2a43..aff82ed112c9 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -416,8 +416,10 @@ xfs_bmbt_verify(
xfs_failaddr_t fa;
unsigned int level;
- switch (block->bb_magic) {
- case cpu_to_be32(XFS_BMAP_CRC_MAGIC):
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
/*
* XXX: need a better way of verifying the owner here. Right now
* just make sure there has been one set.
@@ -425,11 +427,6 @@ xfs_bmbt_verify(
fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
if (fa)
return fa;
- /* fall through */
- case cpu_to_be32(XFS_BMAP_MAGIC):
- break;
- default:
- return __this_address;
}
/*
@@ -481,6 +478,8 @@ xfs_bmbt_write_verify(
const struct xfs_buf_ops xfs_bmbt_buf_ops = {
.name = "xfs_bmbt",
+ .magic = { cpu_to_be32(XFS_BMAP_MAGIC),
+ cpu_to_be32(XFS_BMAP_CRC_MAGIC) },
.verify_read = xfs_bmbt_read_verify,
.verify_write = xfs_bmbt_write_verify,
.verify_struct = xfs_bmbt_verify,
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 376bee94b5dd..e2737e2ac2ae 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -116,6 +116,34 @@ xfs_da_state_free(xfs_da_state_t *state)
kmem_zone_free(xfs_da_state_zone, state);
}
+/*
+ * Verify an xfs_da3_blkinfo structure. Note that the da3 fields are only
+ * accessible on v5 filesystems. This header format is common across da node,
+ * attr leaf and dir leaf blocks.
+ */
+xfs_failaddr_t
+xfs_da3_blkinfo_verify(
+ struct xfs_buf *bp,
+ struct xfs_da3_blkinfo *hdr3)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_da_blkinfo *hdr = &hdr3->hdr;
+
+ if (!xfs_verify_magic16(bp, hdr->magic))
+ return __this_address;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+ return __this_address;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return __this_address;
+ }
+
+ return NULL;
+}
+
static xfs_failaddr_t
xfs_da3_node_verify(
struct xfs_buf *bp)
@@ -124,27 +152,16 @@ xfs_da3_node_verify(
struct xfs_da_intnode *hdr = bp->b_addr;
struct xfs_da3_icnode_hdr ichdr;
const struct xfs_dir_ops *ops;
+ xfs_failaddr_t fa;
ops = xfs_dir_get_ops(mp, NULL);
ops->node_hdr_from_disk(&ichdr, hdr);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
-
- if (ichdr.magic != XFS_DA3_NODE_MAGIC)
- return __this_address;
+ fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+ if (fa)
+ return fa;
- if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
- return __this_address;
- if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
- return __this_address;
- if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
- return __this_address;
- } else {
- if (ichdr.magic != XFS_DA_NODE_MAGIC)
- return __this_address;
- }
if (ichdr.level == 0)
return __this_address;
if (ichdr.level > XFS_DA_NODE_MAXDEPTH)
@@ -257,6 +274,8 @@ xfs_da3_node_verify_struct(
const struct xfs_buf_ops xfs_da3_node_buf_ops = {
.name = "xfs_da3_node",
+ .magic16 = { cpu_to_be16(XFS_DA_NODE_MAGIC),
+ cpu_to_be16(XFS_DA3_NODE_MAGIC) },
.verify_read = xfs_da3_node_read_verify,
.verify_write = xfs_da3_node_write_verify,
.verify_struct = xfs_da3_node_verify_struct,
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 5d5bf3bffc78..ae654e06b2fb 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -869,4 +869,7 @@ static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp)
return 1 << (sbp->sb_blocklog + sbp->sb_dirblklog);
}
+xfs_failaddr_t xfs_da3_blkinfo_verify(struct xfs_buf *bp,
+ struct xfs_da3_blkinfo *hdr3);
+
#endif /* __XFS_DA_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 229152cd1a24..156ce95c9c45 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -703,3 +703,20 @@ xfs_dir2_shrink_inode(
xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
return 0;
}
+
+/* Returns true if the directory entry name is valid. */
+bool
+xfs_dir2_namecheck(
+ const void *name,
+ size_t length)
+{
+ /*
+ * MAXNAMELEN includes the trailing null, but (name/length) leave it
+ * out, so use >= for the length check.
+ */
+ if (length >= MAXNAMELEN)
+ return false;
+
+ /* There shouldn't be any slashes or nulls here */
+ return !memchr(name, '/', length) && !memchr(name, 0, length);
+}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index c3e3f6b813d8..f54244779492 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -326,5 +326,6 @@ xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype);
void *xfs_dir3_data_endp(struct xfs_da_geometry *geo,
struct xfs_dir2_data_hdr *hdr);
+bool xfs_dir2_namecheck(const void *name, size_t length);
#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 30ed5919da72..b7d6d78f4ce2 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -53,18 +53,16 @@ xfs_dir3_block_verify(
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+ if (!xfs_verify_magic(bp, hdr3->magic))
+ return __this_address;
+
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
- return __this_address;
if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
return __this_address;
- } else {
- if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
- return __this_address;
}
return __xfs_dir3_data_check(NULL, bp);
}
@@ -112,6 +110,8 @@ xfs_dir3_block_write_verify(
const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
.name = "xfs_dir3_block",
+ .magic = { cpu_to_be32(XFS_DIR2_BLOCK_MAGIC),
+ cpu_to_be32(XFS_DIR3_BLOCK_MAGIC) },
.verify_read = xfs_dir3_block_read_verify,
.verify_write = xfs_dir3_block_write_verify,
.verify_struct = xfs_dir3_block_verify,
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 01162c62ec8f..b7b9ce002cb9 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -252,18 +252,16 @@ xfs_dir3_data_verify(
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+ if (!xfs_verify_magic(bp, hdr3->magic))
+ return __this_address;
+
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
- return __this_address;
if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
return __this_address;
- } else {
- if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
- return __this_address;
}
return __xfs_dir3_data_check(NULL, bp);
}
@@ -339,6 +337,8 @@ xfs_dir3_data_write_verify(
const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
.name = "xfs_dir3_data",
+ .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC),
+ cpu_to_be32(XFS_DIR3_DATA_MAGIC) },
.verify_read = xfs_dir3_data_read_verify,
.verify_write = xfs_dir3_data_write_verify,
.verify_struct = xfs_dir3_data_verify,
@@ -346,6 +346,8 @@ const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
.name = "xfs_dir3_data_reada",
+ .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC),
+ cpu_to_be32(XFS_DIR3_DATA_MAGIC) },
.verify_read = xfs_dir3_data_reada_verify,
.verify_write = xfs_dir3_data_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index 1728a3e6f5cf..9a3767818c50 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -142,41 +142,22 @@ xfs_dir3_leaf_check_int(
*/
static xfs_failaddr_t
xfs_dir3_leaf_verify(
- struct xfs_buf *bp,
- uint16_t magic)
+ struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_dir2_leaf *leaf = bp->b_addr;
+ xfs_failaddr_t fa;
- ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
-
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
- uint16_t magic3;
-
- magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC
- : XFS_DIR3_LEAFN_MAGIC;
-
- if (leaf3->info.hdr.magic != cpu_to_be16(magic3))
- return __this_address;
- if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid))
- return __this_address;
- if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
- return __this_address;
- if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn)))
- return __this_address;
- } else {
- if (leaf->hdr.info.magic != cpu_to_be16(magic))
- return __this_address;
- }
+ fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+ if (fa)
+ return fa;
return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf);
}
static void
-__read_verify(
- struct xfs_buf *bp,
- uint16_t magic)
+xfs_dir3_leaf_read_verify(
+ struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
xfs_failaddr_t fa;
@@ -185,23 +166,22 @@ __read_verify(
!xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
- fa = xfs_dir3_leaf_verify(bp, magic);
+ fa = xfs_dir3_leaf_verify(bp);
if (fa)
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
}
}
static void
-__write_verify(
- struct xfs_buf *bp,
- uint16_t magic)
+xfs_dir3_leaf_write_verify(
+ struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
xfs_failaddr_t fa;
- fa = xfs_dir3_leaf_verify(bp, magic);
+ fa = xfs_dir3_leaf_verify(bp);
if (fa) {
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
@@ -216,60 +196,22 @@ __write_verify(
xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
}
-static xfs_failaddr_t
-xfs_dir3_leaf1_verify(
- struct xfs_buf *bp)
-{
- return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static void
-xfs_dir3_leaf1_read_verify(
- struct xfs_buf *bp)
-{
- __read_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static void
-xfs_dir3_leaf1_write_verify(
- struct xfs_buf *bp)
-{
- __write_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static xfs_failaddr_t
-xfs_dir3_leafn_verify(
- struct xfs_buf *bp)
-{
- return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir3_leafn_read_verify(
- struct xfs_buf *bp)
-{
- __read_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir3_leafn_write_verify(
- struct xfs_buf *bp)
-{
- __write_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
.name = "xfs_dir3_leaf1",
- .verify_read = xfs_dir3_leaf1_read_verify,
- .verify_write = xfs_dir3_leaf1_write_verify,
- .verify_struct = xfs_dir3_leaf1_verify,
+ .magic16 = { cpu_to_be16(XFS_DIR2_LEAF1_MAGIC),
+ cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) },
+ .verify_read = xfs_dir3_leaf_read_verify,
+ .verify_write = xfs_dir3_leaf_write_verify,
+ .verify_struct = xfs_dir3_leaf_verify,
};
const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
.name = "xfs_dir3_leafn",
- .verify_read = xfs_dir3_leafn_read_verify,
- .verify_write = xfs_dir3_leafn_write_verify,
- .verify_struct = xfs_dir3_leafn_verify,
+ .magic16 = { cpu_to_be16(XFS_DIR2_LEAFN_MAGIC),
+ cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) },
+ .verify_read = xfs_dir3_leaf_read_verify,
+ .verify_write = xfs_dir3_leaf_write_verify,
+ .verify_struct = xfs_dir3_leaf_verify,
};
int
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index f1bb3434f51c..3b03703c5c3d 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -87,20 +87,18 @@ xfs_dir3_free_verify(
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_dir2_free_hdr *hdr = bp->b_addr;
+ if (!xfs_verify_magic(bp, hdr->magic))
+ return __this_address;
+
if (xfs_sb_version_hascrc(&mp->m_sb)) {
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
- if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC))
- return __this_address;
if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
return __this_address;
- } else {
- if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
- return __this_address;
}
/* XXX: should bounds check the xfs_dir3_icfree_hdr here */
@@ -151,6 +149,8 @@ xfs_dir3_free_write_verify(
const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
.name = "xfs_dir3_free",
+ .magic = { cpu_to_be32(XFS_DIR2_FREE_MAGIC),
+ cpu_to_be32(XFS_DIR3_FREE_MAGIC) },
.verify_read = xfs_dir3_free_read_verify,
.verify_write = xfs_dir3_free_write_verify,
.verify_struct = xfs_dir3_free_verify,
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index d293f371dd54..fb5bd9a804f6 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -277,6 +277,8 @@ xfs_dquot_buf_write_verify(
const struct xfs_buf_ops xfs_dquot_buf_ops = {
.name = "xfs_dquot",
+ .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC),
+ cpu_to_be16(XFS_DQUOT_MAGIC) },
.verify_read = xfs_dquot_buf_read_verify,
.verify_write = xfs_dquot_buf_write_verify,
.verify_struct = xfs_dquot_buf_verify_struct,
@@ -284,6 +286,8 @@ const struct xfs_buf_ops xfs_dquot_buf_ops = {
const struct xfs_buf_ops xfs_dquot_buf_ra_ops = {
.name = "xfs_dquot_ra",
+ .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC),
+ cpu_to_be16(XFS_DQUOT_MAGIC) },
.verify_read = xfs_dquot_buf_readahead_verify,
.verify_write = xfs_dquot_buf_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index 66077a105cbb..79e6c4fb1d8a 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -54,7 +54,8 @@
#define XFS_ERRTAG_BUF_LRU_REF 31
#define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32
#define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33
-#define XFS_ERRTAG_MAX 34
+#define XFS_ERRTAG_IUNLINK_FALLBACK 34
+#define XFS_ERRTAG_MAX 35
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -93,5 +94,6 @@
#define XFS_RANDOM_BUF_LRU_REF 2
#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1
#define XFS_RANDOM_FORCE_SUMMARY_RECALC 1
+#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10)
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index d32152fc8a6c..fe9898875097 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2508,7 +2508,7 @@ xfs_agi_verify(
/*
* Validate the magic number of the agi block.
*/
- if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC))
+ if (!xfs_verify_magic(bp, agi->agi_magicnum))
return __this_address;
if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
return __this_address;
@@ -2582,6 +2582,7 @@ xfs_agi_write_verify(
const struct xfs_buf_ops xfs_agi_buf_ops = {
.name = "xfs_agi",
+ .magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) },
.verify_read = xfs_agi_read_verify,
.verify_write = xfs_agi_write_verify,
.verify_struct = xfs_agi_verify,
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 9b25e7a0df47..1080381ff243 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -124,7 +124,7 @@ xfs_finobt_alloc_block(
union xfs_btree_ptr *new,
int *stat)
{
- if (cur->bc_mp->m_inotbt_nores)
+ if (cur->bc_mp->m_finobt_nores)
return xfs_inobt_alloc_block(cur, start, new, stat);
return __xfs_inobt_alloc_block(cur, start, new, stat,
XFS_AG_RESV_METADATA);
@@ -154,7 +154,7 @@ xfs_finobt_free_block(
struct xfs_btree_cur *cur,
struct xfs_buf *bp)
{
- if (cur->bc_mp->m_inotbt_nores)
+ if (cur->bc_mp->m_finobt_nores)
return xfs_inobt_free_block(cur, bp);
return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_METADATA);
}
@@ -260,6 +260,9 @@ xfs_inobt_verify(
xfs_failaddr_t fa;
unsigned int level;
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
/*
* During growfs operations, we can't verify the exact owner as the
* perag is not fully initialised and hence not attached to the buffer.
@@ -270,18 +273,10 @@ xfs_inobt_verify(
* but beware of the landmine (i.e. need to check pag->pagi_init) if we
* ever do.
*/
- switch (block->bb_magic) {
- case cpu_to_be32(XFS_IBT_CRC_MAGIC):
- case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
fa = xfs_btree_sblock_v5hdr_verify(bp);
if (fa)
return fa;
- /* fall through */
- case cpu_to_be32(XFS_IBT_MAGIC):
- case cpu_to_be32(XFS_FIBT_MAGIC):
- break;
- default:
- return __this_address;
}
/* level verification */
@@ -328,6 +323,16 @@ xfs_inobt_write_verify(
const struct xfs_buf_ops xfs_inobt_buf_ops = {
.name = "xfs_inobt",
+ .magic = { cpu_to_be32(XFS_IBT_MAGIC), cpu_to_be32(XFS_IBT_CRC_MAGIC) },
+ .verify_read = xfs_inobt_read_verify,
+ .verify_write = xfs_inobt_write_verify,
+ .verify_struct = xfs_inobt_verify,
+};
+
+const struct xfs_buf_ops xfs_finobt_buf_ops = {
+ .name = "xfs_finobt",
+ .magic = { cpu_to_be32(XFS_FIBT_MAGIC),
+ cpu_to_be32(XFS_FIBT_CRC_MAGIC) },
.verify_read = xfs_inobt_read_verify,
.verify_write = xfs_inobt_write_verify,
.verify_struct = xfs_inobt_verify,
@@ -389,7 +394,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
.init_rec_from_cur = xfs_inobt_init_rec_from_cur,
.init_ptr_from_cur = xfs_finobt_init_ptr_from_cur,
.key_diff = xfs_inobt_key_diff,
- .buf_ops = &xfs_inobt_buf_ops,
+ .buf_ops = &xfs_finobt_buf_ops,
.diff_two_keys = xfs_inobt_diff_two_keys,
.keys_inorder = xfs_inobt_keys_inorder,
.recs_inorder = xfs_inobt_recs_inorder,
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index 771dd072015d..bc690f2409fa 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -614,16 +614,15 @@ xfs_iext_realloc_root(
}
/*
- * Increment the sequence counter if we are on a COW fork. This allows
- * the writeback code to skip looking for a COW extent if the COW fork
- * hasn't changed. We use WRITE_ONCE here to ensure the update to the
- * sequence counter is seen before the modifications to the extent
- * tree itself take effect.
+ * Increment the sequence counter on extent tree changes. If we are on a COW
+ * fork, this allows the writeback code to skip looking for a COW extent if the
+ * COW fork hasn't changed. We use WRITE_ONCE here to ensure the update to the
+ * sequence counter is seen before the modifications to the extent tree itself
+ * take effect.
*/
static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state)
{
- if (state & BMAP_COWFORK)
- WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
+ WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
}
void
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 09d9c8cfa4a0..e021d5133ccb 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -97,10 +97,9 @@ xfs_inode_buf_verify(
dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
unlinked_ino = be32_to_cpu(dip->di_next_unlinked);
- di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+ di_ok = xfs_verify_magic16(bp, dip->di_magic) &&
xfs_dinode_good_version(mp, dip->di_version) &&
- (unlinked_ino == NULLAGINO ||
- xfs_verify_agino(mp, agno, unlinked_ino));
+ xfs_verify_agino_or_null(mp, agno, unlinked_ino);
if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
XFS_ERRTAG_ITOBP_INOTOBP))) {
if (readahead) {
@@ -147,12 +146,16 @@ xfs_inode_buf_write_verify(
const struct xfs_buf_ops xfs_inode_buf_ops = {
.name = "xfs_inode",
+ .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC),
+ cpu_to_be16(XFS_DINODE_MAGIC) },
.verify_read = xfs_inode_buf_read_verify,
.verify_write = xfs_inode_buf_write_verify,
};
const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
- .name = "xxfs_inode_ra",
+ .name = "xfs_inode_ra",
+ .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC),
+ cpu_to_be16(XFS_DINODE_MAGIC) },
.verify_read = xfs_inode_buf_readahead_verify,
.verify_write = xfs_inode_buf_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 60361d2d74a1..00c62ce170d0 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -14,7 +14,7 @@ struct xfs_dinode;
*/
struct xfs_ifork {
int if_bytes; /* bytes in if_u1 */
- unsigned int if_seq; /* cow fork mod counter */
+ unsigned int if_seq; /* fork mod counter */
struct xfs_btree_block *if_broot; /* file's incore btree root */
short if_broot_bytes; /* bytes allocated for root */
unsigned char if_flags; /* per-fork flags */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index d9eab657b63e..6f47ab876d90 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -209,7 +209,7 @@ xfs_refcountbt_verify(
xfs_failaddr_t fa;
unsigned int level;
- if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC))
+ if (!xfs_verify_magic(bp, block->bb_magic))
return __this_address;
if (!xfs_sb_version_hasreflink(&mp->m_sb))
@@ -264,6 +264,7 @@ xfs_refcountbt_write_verify(
const struct xfs_buf_ops xfs_refcountbt_buf_ops = {
.name = "xfs_refcountbt",
+ .magic = { 0, cpu_to_be32(XFS_REFC_CRC_MAGIC) },
.verify_read = xfs_refcountbt_read_verify,
.verify_write = xfs_refcountbt_write_verify,
.verify_struct = xfs_refcountbt_verify,
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index f79cf040d745..5738e11055e6 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -310,7 +310,7 @@ xfs_rmapbt_verify(
* from the on disk AGF. Again, we can only check against maximum limits
* in this case.
*/
- if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC))
+ if (!xfs_verify_magic(bp, block->bb_magic))
return __this_address;
if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
@@ -365,6 +365,7 @@ xfs_rmapbt_write_verify(
const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
.name = "xfs_rmapbt",
+ .magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
.verify_read = xfs_rmapbt_read_verify,
.verify_write = xfs_rmapbt_write_verify,
.verify_struct = xfs_rmapbt_verify,
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index b5a82acd7dfe..77a3a4085de3 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -225,10 +225,11 @@ xfs_validate_sb_common(
struct xfs_buf *bp,
struct xfs_sb *sbp)
{
+ struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
uint32_t agcount = 0;
uint32_t rem;
- if (sbp->sb_magicnum != XFS_SB_MAGIC) {
+ if (!xfs_verify_magic(bp, dsb->sb_magicnum)) {
xfs_warn(mp, "bad magic number");
return -EWRONGFS;
}
@@ -781,12 +782,14 @@ out_error:
const struct xfs_buf_ops xfs_sb_buf_ops = {
.name = "xfs_sb",
+ .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) },
.verify_read = xfs_sb_read_verify,
.verify_write = xfs_sb_write_verify,
};
const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
.name = "xfs_sb_quiet",
+ .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) },
.verify_read = xfs_sb_quiet_read_verify,
.verify_write = xfs_sb_write_verify,
};
@@ -874,7 +877,7 @@ xfs_initialize_perag_data(
uint64_t bfreelst = 0;
uint64_t btree = 0;
uint64_t fdblocks;
- int error;
+ int error = 0;
for (index = 0; index < agcount; index++) {
/*
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 1c5debe748f0..4e909791aeac 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -25,7 +25,8 @@ extern const struct xfs_buf_ops xfs_agf_buf_ops;
extern const struct xfs_buf_ops xfs_agi_buf_ops;
extern const struct xfs_buf_ops xfs_agf_buf_ops;
extern const struct xfs_buf_ops xfs_agfl_buf_ops;
-extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
+extern const struct xfs_buf_ops xfs_bnobt_buf_ops;
+extern const struct xfs_buf_ops xfs_cntbt_buf_ops;
extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
@@ -36,6 +37,7 @@ extern const struct xfs_buf_ops xfs_dquot_buf_ops;
extern const struct xfs_buf_ops xfs_symlink_buf_ops;
extern const struct xfs_buf_ops xfs_agi_buf_ops;
extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+extern const struct xfs_buf_ops xfs_finobt_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
extern const struct xfs_buf_ops xfs_dquot_buf_ops;
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 77d80106f989..a0ccc253c43d 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -95,7 +95,7 @@ xfs_symlink_verify(
if (!xfs_sb_version_hascrc(&mp->m_sb))
return __this_address;
- if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
+ if (!xfs_verify_magic(bp, dsl->sl_magic))
return __this_address;
if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
@@ -159,6 +159,7 @@ xfs_symlink_write_verify(
const struct xfs_buf_ops xfs_symlink_buf_ops = {
.name = "xfs_symlink",
+ .magic = { 0, cpu_to_be32(XFS_SYMLINK_MAGIC) },
.verify_read = xfs_symlink_read_verify,
.verify_write = xfs_symlink_write_verify,
.verify_struct = xfs_symlink_verify,
diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
index 3306fc42cfad..de310712dd6d 100644
--- a/fs/xfs/libxfs/xfs_types.c
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -116,6 +116,19 @@ xfs_verify_agino(
}
/*
+ * Verify that an AG inode number pointer neither points outside the AG
+ * nor points at static metadata, or is NULLAGINO.
+ */
+bool
+xfs_verify_agino_or_null(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino)
+{
+ return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino);
+}
+
+/*
* Verify that an FS inode number pointer neither points outside the
* filesystem nor points at static AG metadata.
*/
@@ -204,3 +217,14 @@ xfs_verify_icount(
xfs_icount_range(mp, &min, &max);
return icount >= min && icount <= max;
}
+
+/* Sanity-checking of dir/attr block offsets. */
+bool
+xfs_verify_dablk(
+ struct xfs_mount *mp,
+ xfs_fileoff_t dabno)
+{
+ xfs_dablk_t max_dablk = -1U;
+
+ return dabno <= max_dablk;
+}
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 8f02855a019a..c5a25403b4db 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -183,10 +183,13 @@ void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agino_t *first, xfs_agino_t *last);
bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agino_t agino);
+bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t agino);
bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount);
+bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off);
#endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 90955ab1e895..ddf06bfaa29d 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -399,7 +399,7 @@ xchk_agf_xref_cntbt(
if (!xchk_should_check_xref(sc, &error, &sc->sa.cnt_cur))
return;
if (!have) {
- if (agf->agf_freeblks != be32_to_cpu(0))
+ if (agf->agf_freeblks != cpu_to_be32(0))
xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp);
return;
}
@@ -864,19 +864,17 @@ xchk_agi(
/* Check inode pointers */
agino = be32_to_cpu(agi->agi_newino);
- if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(mp, agno, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
agino = be32_to_cpu(agi->agi_dirino);
- if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(mp, agno, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
/* Check unlinked inode buckets */
for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
agino = be32_to_cpu(agi->agi_unlinked[i]);
- if (agino == NULLAGINO)
- continue;
- if (!xfs_verify_agino(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(mp, agno, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
}
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 03d1e15cceba..64e31f87d490 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -341,23 +341,19 @@ xrep_agf(
struct xrep_find_ag_btree fab[XREP_AGF_MAX] = {
[XREP_AGF_BNOBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
- .buf_ops = &xfs_allocbt_buf_ops,
- .magic = XFS_ABTB_CRC_MAGIC,
+ .buf_ops = &xfs_bnobt_buf_ops,
},
[XREP_AGF_CNTBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
- .buf_ops = &xfs_allocbt_buf_ops,
- .magic = XFS_ABTC_CRC_MAGIC,
+ .buf_ops = &xfs_cntbt_buf_ops,
},
[XREP_AGF_RMAPBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
.buf_ops = &xfs_rmapbt_buf_ops,
- .magic = XFS_RMAP_CRC_MAGIC,
},
[XREP_AGF_REFCOUNTBT] = {
.rmap_owner = XFS_RMAP_OWN_REFC,
.buf_ops = &xfs_refcountbt_buf_ops,
- .magic = XFS_REFC_CRC_MAGIC,
},
[XREP_AGF_END] = {
.buf_ops = NULL,
@@ -875,12 +871,10 @@ xrep_agi(
[XREP_AGI_INOBT] = {
.rmap_owner = XFS_RMAP_OWN_INOBT,
.buf_ops = &xfs_inobt_buf_ops,
- .magic = XFS_IBT_CRC_MAGIC,
},
[XREP_AGI_FINOBT] = {
.rmap_owner = XFS_RMAP_OWN_INOBT,
- .buf_ops = &xfs_inobt_buf_ops,
- .magic = XFS_FIBT_CRC_MAGIC,
+ .buf_ops = &xfs_finobt_buf_ops,
},
[XREP_AGI_END] = {
.buf_ops = NULL
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 81d5e90547a1..dce74ec57038 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -82,12 +82,23 @@ xchk_xattr_listent(
sx = container_of(context, struct xchk_xattr, context);
+ if (xchk_should_terminate(sx->sc, &error)) {
+ context->seen_enough = 1;
+ return;
+ }
+
if (flags & XFS_ATTR_INCOMPLETE) {
/* Incomplete attr key, just mark the inode for preening. */
xchk_ino_set_preen(sx->sc, context->dp->i_ino);
return;
}
+ /* Does this name make sense? */
+ if (!xfs_attr_namecheck(name, namelen)) {
+ xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
+ return;
+ }
+
args.flags = ATTR_KERNOTIME;
if (flags & XFS_ATTR_ROOT)
args.flags |= ATTR_ROOT;
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index e1d11f3223e3..a703cd58a90e 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -281,6 +281,31 @@ xchk_bmap_extent_xref(
xchk_ag_free(info->sc, &info->sc->sa);
}
+/*
+ * Directories and attr forks should never have blocks that can't be addressed
+ * by a xfs_dablk_t.
+ */
+STATIC void
+xchk_bmap_dirattr_extent(
+ struct xfs_inode *ip,
+ struct xchk_bmap_info *info,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t off;
+
+ if (!S_ISDIR(VFS_I(ip)->i_mode) && info->whichfork != XFS_ATTR_FORK)
+ return;
+
+ if (!xfs_verify_dablk(mp, irec->br_startoff))
+ xchk_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ off = irec->br_startoff + irec->br_blockcount - 1;
+ if (!xfs_verify_dablk(mp, off))
+ xchk_fblock_set_corrupt(info->sc, info->whichfork, off);
+}
+
/* Scrub a single extent record. */
STATIC int
xchk_bmap_extent(
@@ -305,6 +330,8 @@ xchk_bmap_extent(
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
+ xchk_bmap_dirattr_extent(ip, info, irec);
+
/* There should never be a "hole" extent in either extent list. */
if (irec->br_startblock == HOLESTARTBLOCK)
xchk_fblock_set_corrupt(info->sc, info->whichfork,
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index cd3e4d768a18..a38a22785a1a 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -129,6 +129,12 @@ xchk_dir_actor(
goto out;
}
+ /* Does this name make sense? */
+ if (!xfs_dir2_namecheck(name, namelen)) {
+ xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
+ goto out;
+ }
+
if (!strncmp(".", name, namelen)) {
/* If this is "." then check that the inum matches the dir. */
if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR)
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index 882dc56c5c21..700114f79a7d 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -47,6 +47,12 @@ xchk_setup_ag_iallocbt(
struct xchk_iallocbt {
/* Number of inodes we see while scanning inobt. */
unsigned long long inodes;
+
+ /* Expected next startino, for big block filesystems. */
+ xfs_agino_t next_startino;
+
+ /* Expected end of the current inode cluster. */
+ xfs_agino_t next_cluster_ino;
};
/*
@@ -128,41 +134,57 @@ xchk_iallocbt_freecount(
return hweight64(freemask);
}
-/* Check a particular inode with ir_free. */
+/*
+ * Check that an inode's allocation status matches ir_free in the inobt
+ * record. First we try querying the in-core inode state, and if the inode
+ * isn't loaded we examine the on-disk inode directly.
+ *
+ * Since there can be 1:M and M:1 mappings between inobt records and inode
+ * clusters, we pass in the inode location information as an inobt record;
+ * the index of an inode cluster within the inobt record (as well as the
+ * cluster buffer itself); and the index of the inode within the cluster.
+ *
+ * @irec is the inobt record.
+ * @irec_ino is the inode offset from the start of the record.
+ * @dip is the on-disk inode.
+ */
STATIC int
-xchk_iallocbt_check_cluster_freemask(
+xchk_iallocbt_check_cluster_ifree(
struct xchk_btree *bs,
- xfs_ino_t fsino,
- xfs_agino_t chunkino,
- xfs_agino_t clusterino,
struct xfs_inobt_rec_incore *irec,
- struct xfs_buf *bp)
+ unsigned int irec_ino,
+ struct xfs_dinode *dip)
{
- struct xfs_dinode *dip;
struct xfs_mount *mp = bs->cur->bc_mp;
- bool inode_is_free = false;
+ xfs_ino_t fsino;
+ xfs_agino_t agino;
+ bool irec_free;
+ bool ino_inuse;
bool freemask_ok;
- bool inuse;
int error = 0;
if (xchk_should_terminate(bs->sc, &error))
return error;
- dip = xfs_buf_offset(bp, clusterino * mp->m_sb.sb_inodesize);
+ /*
+ * Given an inobt record and the offset of an inode from the start of
+ * the record, compute which fs inode we're talking about.
+ */
+ agino = irec->ir_startino + irec_ino;
+ fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
+ irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino));
+
if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
- (dip->di_version >= 3 &&
- be64_to_cpu(dip->di_ino) != fsino + clusterino)) {
+ (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
goto out;
}
- if (irec->ir_free & XFS_INOBT_MASK(chunkino + clusterino))
- inode_is_free = true;
- error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp,
- fsino + clusterino, &inuse);
+ error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, fsino,
+ &ino_inuse);
if (error == -ENODATA) {
/* Not cached, just read the disk buffer */
- freemask_ok = inode_is_free ^ !!(dip->di_mode);
+ freemask_ok = irec_free ^ !!(dip->di_mode);
if (!bs->sc->try_harder && !freemask_ok)
return -EDEADLOCK;
} else if (error < 0) {
@@ -174,7 +196,7 @@ xchk_iallocbt_check_cluster_freemask(
goto out;
} else {
/* Inode is all there. */
- freemask_ok = inode_is_free ^ inuse;
+ freemask_ok = irec_free ^ ino_inuse;
}
if (!freemask_ok)
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
@@ -182,86 +204,221 @@ out:
return 0;
}
-/* Make sure the free mask is consistent with what the inodes think. */
+/*
+ * Check that the holemask and freemask of a hypothetical inode cluster match
+ * what's actually on disk. If sparse inodes are enabled, the cluster does
+ * not actually have to map to inodes if the corresponding holemask bit is set.
+ *
+ * @cluster_base is the first inode in the cluster within the @irec.
+ */
STATIC int
-xchk_iallocbt_check_freemask(
+xchk_iallocbt_check_cluster(
struct xchk_btree *bs,
- struct xfs_inobt_rec_incore *irec)
+ struct xfs_inobt_rec_incore *irec,
+ unsigned int cluster_base)
{
struct xfs_imap imap;
struct xfs_mount *mp = bs->cur->bc_mp;
struct xfs_dinode *dip;
- struct xfs_buf *bp;
- xfs_ino_t fsino;
- xfs_agino_t nr_inodes;
- xfs_agino_t agino;
- xfs_agino_t chunkino;
- xfs_agino_t clusterino;
+ struct xfs_buf *cluster_bp;
+ unsigned int nr_inodes;
+ xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
xfs_agblock_t agbno;
- uint16_t holemask;
+ unsigned int cluster_index;
+ uint16_t cluster_mask = 0;
uint16_t ir_holemask;
int error = 0;
- /* Make sure the freemask matches the inode records. */
- nr_inodes = mp->m_inodes_per_cluster;
-
- for (agino = irec->ir_startino;
- agino < irec->ir_startino + XFS_INODES_PER_CHUNK;
- agino += mp->m_inodes_per_cluster) {
- fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
- chunkino = agino - irec->ir_startino;
- agbno = XFS_AGINO_TO_AGBNO(mp, agino);
-
- /* Compute the holemask mask for this cluster. */
- for (clusterino = 0, holemask = 0; clusterino < nr_inodes;
- clusterino += XFS_INODES_PER_HOLEMASK_BIT)
- holemask |= XFS_INOBT_MASK((chunkino + clusterino) /
- XFS_INODES_PER_HOLEMASK_BIT);
-
- /* The whole cluster must be a hole or not a hole. */
- ir_holemask = (irec->ir_holemask & holemask);
- if (ir_holemask != holemask && ir_holemask != 0) {
+ nr_inodes = min_t(unsigned int, XFS_INODES_PER_CHUNK,
+ mp->m_inodes_per_cluster);
+
+ /* Map this inode cluster */
+ agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino + cluster_base);
+
+ /* Compute a bitmask for this cluster that can be used for holemask. */
+ for (cluster_index = 0;
+ cluster_index < nr_inodes;
+ cluster_index += XFS_INODES_PER_HOLEMASK_BIT)
+ cluster_mask |= XFS_INOBT_MASK((cluster_base + cluster_index) /
+ XFS_INODES_PER_HOLEMASK_BIT);
+
+ /*
+ * Map the first inode of this cluster to a buffer and offset.
+ * Be careful about inobt records that don't align with the start of
+ * the inode buffer when block sizes are large enough to hold multiple
+ * inode chunks. When this happens, cluster_base will be zero but
+ * ir_startino can be large enough to make im_boffset nonzero.
+ */
+ ir_holemask = (irec->ir_holemask & cluster_mask);
+ imap.im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+ imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster);
+ imap.im_boffset = XFS_INO_TO_OFFSET(mp, irec->ir_startino);
+
+ if (imap.im_boffset != 0 && cluster_base != 0) {
+ ASSERT(imap.im_boffset == 0 || cluster_base == 0);
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return 0;
+ }
+
+ trace_xchk_iallocbt_check_cluster(mp, agno, irec->ir_startino,
+ imap.im_blkno, imap.im_len, cluster_base, nr_inodes,
+ cluster_mask, ir_holemask,
+ XFS_INO_TO_OFFSET(mp, irec->ir_startino +
+ cluster_base));
+
+ /* The whole cluster must be a hole or not a hole. */
+ if (ir_holemask != cluster_mask && ir_holemask != 0) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return 0;
+ }
+
+ /* If any part of this is a hole, skip it. */
+ if (ir_holemask) {
+ xchk_xref_is_not_owned_by(bs->sc, agbno,
+ mp->m_blocks_per_cluster,
+ &XFS_RMAP_OINFO_INODES);
+ return 0;
+ }
+
+ xchk_xref_is_owned_by(bs->sc, agbno, mp->m_blocks_per_cluster,
+ &XFS_RMAP_OINFO_INODES);
+
+ /* Grab the inode cluster buffer. */
+ error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp,
+ 0, 0);
+ if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, &error))
+ return error;
+
+ /* Check free status of each inode within this cluster. */
+ for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) {
+ struct xfs_dinode *dip;
+
+ if (imap.im_boffset >= BBTOB(cluster_bp->b_length)) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
- continue;
+ break;
}
- /* If any part of this is a hole, skip it. */
- if (ir_holemask) {
- xchk_xref_is_not_owned_by(bs->sc, agbno,
- mp->m_blocks_per_cluster,
- &XFS_RMAP_OINFO_INODES);
- continue;
+ dip = xfs_buf_offset(cluster_bp, imap.im_boffset);
+ error = xchk_iallocbt_check_cluster_ifree(bs, irec,
+ cluster_base + cluster_index, dip);
+ if (error)
+ break;
+ imap.im_boffset += mp->m_sb.sb_inodesize;
+ }
+
+ xfs_trans_brelse(bs->cur->bc_tp, cluster_bp);
+ return error;
+}
+
+/*
+ * For all the inode clusters that could map to this inobt record, make sure
+ * that the holemask makes sense and that the allocation status of each inode
+ * matches the freemask.
+ */
+STATIC int
+xchk_iallocbt_check_clusters(
+ struct xchk_btree *bs,
+ struct xfs_inobt_rec_incore *irec)
+{
+ unsigned int cluster_base;
+ int error = 0;
+
+ /*
+ * For the common case where this inobt record maps to multiple inode
+ * clusters this will call _check_cluster for each cluster.
+ *
+ * For the case that multiple inobt records map to a single cluster,
+ * this will call _check_cluster once.
+ */
+ for (cluster_base = 0;
+ cluster_base < XFS_INODES_PER_CHUNK;
+ cluster_base += bs->sc->mp->m_inodes_per_cluster) {
+ error = xchk_iallocbt_check_cluster(bs, irec, cluster_base);
+ if (error)
+ break;
+ }
+
+ return error;
+}
+
+/*
+ * Make sure this inode btree record is aligned properly. Because a fs block
+ * contains multiple inodes, we check that the inobt record is aligned to the
+ * correct inode, not just the correct block on disk. This results in a finer
+ * grained corruption check.
+ */
+STATIC void
+xchk_iallocbt_rec_alignment(
+ struct xchk_btree *bs,
+ struct xfs_inobt_rec_incore *irec)
+{
+ struct xfs_mount *mp = bs->sc->mp;
+ struct xchk_iallocbt *iabt = bs->private;
+
+ /*
+ * finobt records have different positioning requirements than inobt
+ * records: each finobt record must have a corresponding inobt record.
+ * That is checked in the xref function, so for now we only catch the
+ * obvious case where the record isn't at all aligned properly.
+ *
+ * Note that if a fs block contains more than a single chunk of inodes,
+ * we will have finobt records only for those chunks containing free
+ * inodes, and therefore expect chunk alignment of finobt records.
+ * Otherwise, we expect that the finobt record is aligned to the
+ * cluster alignment as told by the superblock.
+ */
+ if (bs->cur->bc_btnum == XFS_BTNUM_FINO) {
+ unsigned int imask;
+
+ imask = min_t(unsigned int, XFS_INODES_PER_CHUNK,
+ mp->m_cluster_align_inodes) - 1;
+ if (irec->ir_startino & imask)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
+ }
+
+ if (iabt->next_startino != NULLAGINO) {
+ /*
+ * We're midway through a cluster of inodes that is mapped by
+ * multiple inobt records. Did we get the record for the next
+ * irec in the sequence?
+ */
+ if (irec->ir_startino != iabt->next_startino) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
}
- xchk_xref_is_owned_by(bs->sc, agbno, mp->m_blocks_per_cluster,
- &XFS_RMAP_OINFO_INODES);
+ iabt->next_startino += XFS_INODES_PER_CHUNK;
- /* Grab the inode cluster buffer. */
- imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno,
- agbno);
- imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster);
- imap.im_boffset = 0;
-
- error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap,
- &dip, &bp, 0, 0);
- if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0,
- &error))
- continue;
-
- /* Which inodes are free? */
- for (clusterino = 0; clusterino < nr_inodes; clusterino++) {
- error = xchk_iallocbt_check_cluster_freemask(bs,
- fsino, chunkino, clusterino, irec, bp);
- if (error) {
- xfs_trans_brelse(bs->cur->bc_tp, bp);
- return error;
- }
+ /* Are we done with the cluster? */
+ if (iabt->next_startino >= iabt->next_cluster_ino) {
+ iabt->next_startino = NULLAGINO;
+ iabt->next_cluster_ino = NULLAGINO;
}
+ return;
+ }
+
+ /* inobt records must be aligned to cluster and inoalignmnt size. */
+ if (irec->ir_startino & (mp->m_cluster_align_inodes - 1)) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
+ }
- xfs_trans_brelse(bs->cur->bc_tp, bp);
+ if (irec->ir_startino & (mp->m_inodes_per_cluster - 1)) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
}
- return error;
+ if (mp->m_inodes_per_cluster <= XFS_INODES_PER_CHUNK)
+ return;
+
+ /*
+ * If this is the start of an inode cluster that can be mapped by
+ * multiple inobt records, the next inobt record must follow exactly
+ * after this one.
+ */
+ iabt->next_startino = irec->ir_startino + XFS_INODES_PER_CHUNK;
+ iabt->next_cluster_ino = irec->ir_startino + mp->m_inodes_per_cluster;
}
/* Scrub an inobt/finobt record. */
@@ -276,7 +433,6 @@ xchk_iallocbt_rec(
uint64_t holes;
xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
xfs_agino_t agino;
- xfs_agblock_t agbno;
xfs_extlen_t len;
int holecount;
int i;
@@ -303,11 +459,9 @@ xchk_iallocbt_rec(
goto out;
}
- /* Make sure this record is aligned to cluster and inoalignmnt size. */
- agbno = XFS_AGINO_TO_AGBNO(mp, irec.ir_startino);
- if ((agbno & (mp->m_cluster_align - 1)) ||
- (agbno & (mp->m_blocks_per_cluster - 1)))
- xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ xchk_iallocbt_rec_alignment(bs, &irec);
+ if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
iabt->inodes += irec.ir_count;
@@ -320,7 +474,7 @@ xchk_iallocbt_rec(
if (!xchk_iallocbt_chunk(bs, &irec, agino, len))
goto out;
- goto check_freemask;
+ goto check_clusters;
}
/* Check each chunk of a sparse inode cluster. */
@@ -346,8 +500,8 @@ xchk_iallocbt_rec(
holecount + irec.ir_count != XFS_INODES_PER_CHUNK)
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
-check_freemask:
- error = xchk_iallocbt_check_freemask(bs, &irec);
+check_clusters:
+ error = xchk_iallocbt_check_clusters(bs, &irec);
if (error)
goto out;
@@ -429,6 +583,8 @@ xchk_iallocbt(
struct xfs_btree_cur *cur;
struct xchk_iallocbt iabt = {
.inodes = 0,
+ .next_startino = NULLAGINO,
+ .next_cluster_ino = NULLAGINO,
};
int error;
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 1c8eecfe52b8..f28f4bad317b 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -743,7 +743,8 @@ xrep_findroot_block(
/* Ensure the block magic matches the btree type we're looking for. */
btblock = XFS_BUF_TO_BLOCK(bp);
- if (be32_to_cpu(btblock->bb_magic) != fab->magic)
+ ASSERT(fab->buf_ops->magic[1] != 0);
+ if (btblock->bb_magic != fab->buf_ops->magic[1])
goto out;
/*
@@ -768,18 +769,23 @@ xrep_findroot_block(
if (!uuid_equal(&btblock->bb_u.s.bb_uuid,
&mp->m_sb.sb_meta_uuid))
goto out;
+ /*
+ * Read verifiers can reference b_ops, so we set the pointer
+ * here. If the verifier fails we'll reset the buffer state
+ * to what it was before we touched the buffer.
+ */
+ bp->b_ops = fab->buf_ops;
fab->buf_ops->verify_read(bp);
if (bp->b_error) {
+ bp->b_ops = NULL;
bp->b_error = 0;
goto out;
}
/*
* Some read verifiers will (re)set b_ops, so we must be
- * careful not to blow away any such assignment.
+ * careful not to change b_ops after running the verifier.
*/
- if (!bp->b_ops)
- bp->b_ops = fab->buf_ops;
}
/*
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index f2fc18bb7605..d990314eb08b 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -42,9 +42,6 @@ struct xrep_find_ag_btree {
/* in: buffer ops */
const struct xfs_buf_ops *buf_ops;
- /* in: magic number of the btree */
- uint32_t magic;
-
/* out: the highest btree block found and the tree height */
xfs_agblock_t root;
unsigned int height;
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 665d4bbb17cc..dbe115b075f7 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -141,9 +141,8 @@ xchk_xref_is_used_rt_space(
startext = fsbno;
endext = fsbno + len - 1;
do_div(startext, sc->mp->m_sb.sb_rextsize);
- if (do_div(endext, sc->mp->m_sb.sb_rextsize))
- endext++;
- extcount = endext - startext;
+ do_div(endext, sc->mp->m_sb.sb_rextsize);
+ extcount = endext - startext + 1;
xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount,
&is_free);
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 8344b14031ef..3c83e8b3b39c 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -545,6 +545,51 @@ TRACE_EVENT(xchk_xref_error,
__entry->ret_ip)
);
+TRACE_EVENT(xchk_iallocbt_check_cluster,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t startino, xfs_daddr_t map_daddr,
+ unsigned short map_len, unsigned int chunk_ino,
+ unsigned int nr_inodes, uint16_t cluster_mask,
+ uint16_t holemask, unsigned int cluster_ino),
+ TP_ARGS(mp, agno, startino, map_daddr, map_len, chunk_ino, nr_inodes,
+ cluster_mask, holemask, cluster_ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, startino)
+ __field(xfs_daddr_t, map_daddr)
+ __field(unsigned short, map_len)
+ __field(unsigned int, chunk_ino)
+ __field(unsigned int, nr_inodes)
+ __field(unsigned int, cluster_ino)
+ __field(uint16_t, cluster_mask)
+ __field(uint16_t, holemask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->startino = startino;
+ __entry->map_daddr = map_daddr;
+ __entry->map_len = map_len;
+ __entry->chunk_ino = chunk_ino;
+ __entry->nr_inodes = nr_inodes;
+ __entry->cluster_mask = cluster_mask;
+ __entry->holemask = holemask;
+ __entry->cluster_ino = cluster_ino;
+ ),
+ TP_printk("dev %d:%d agno %d startino %u daddr 0x%llx len %d chunkino %u nr_inodes %u cluster_mask 0x%x holemask 0x%x cluster_ino %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->startino,
+ __entry->map_daddr,
+ __entry->map_len,
+ __entry->chunk_ino,
+ __entry->nr_inodes,
+ __entry->cluster_mask,
+ __entry->holemask,
+ __entry->cluster_ino)
+)
+
/* repair tracepoints */
#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 338b9d9984e0..3619e9e8d359 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -28,7 +28,8 @@
*/
struct xfs_writepage_ctx {
struct xfs_bmbt_irec imap;
- unsigned int io_type;
+ int fork;
+ unsigned int data_seq;
unsigned int cow_seq;
struct xfs_ioend *ioend;
};
@@ -62,7 +63,7 @@ xfs_find_daxdev_for_inode(
static void
xfs_finish_page_writeback(
struct inode *inode,
- struct bio_vec *bvec,
+ struct bio_vec *bvec,
int error)
{
struct iomap_page *iop = to_iomap_page(bvec->bv_page);
@@ -98,6 +99,7 @@ xfs_destroy_ioend(
for (bio = &ioend->io_inline_bio; bio; bio = next) {
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
/*
* For the last bio, bi_private points to the ioend, so we
@@ -109,7 +111,7 @@ xfs_destroy_ioend(
next = bio->bi_private;
/* walk each page on bio, ending page IO on them */
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
xfs_finish_page_writeback(inode, bvec, error);
bio_put(bio);
}
@@ -255,30 +257,20 @@ xfs_end_io(
*/
error = blk_status_to_errno(ioend->io_bio->bi_status);
if (unlikely(error)) {
- switch (ioend->io_type) {
- case XFS_IO_COW:
+ if (ioend->io_fork == XFS_COW_FORK)
xfs_reflink_cancel_cow_range(ip, offset, size, true);
- break;
- }
-
goto done;
}
/*
- * Success: commit the COW or unwritten blocks if needed.
+ * Success: commit the COW or unwritten blocks if needed.
*/
- switch (ioend->io_type) {
- case XFS_IO_COW:
+ if (ioend->io_fork == XFS_COW_FORK)
error = xfs_reflink_end_cow(ip, offset, size);
- break;
- case XFS_IO_UNWRITTEN:
- /* writeback should never update isize */
+ else if (ioend->io_state == XFS_EXT_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);
- break;
- default:
+ else
ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
- break;
- }
done:
if (ioend->io_append_trans)
@@ -293,7 +285,8 @@ xfs_end_bio(
struct xfs_ioend *ioend = bio->bi_private;
struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
- if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
+ if (ioend->io_fork == XFS_COW_FORK ||
+ ioend->io_state == XFS_EXT_UNWRITTEN)
queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
else if (ioend->io_append_trans)
queue_work(mp->m_data_workqueue, &ioend->io_work);
@@ -301,6 +294,75 @@ xfs_end_bio(
xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
}
+/*
+ * Fast revalidation of the cached writeback mapping. Return true if the current
+ * mapping is valid, false otherwise.
+ */
+static bool
+xfs_imap_valid(
+ struct xfs_writepage_ctx *wpc,
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb)
+{
+ if (offset_fsb < wpc->imap.br_startoff ||
+ offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount)
+ return false;
+ /*
+ * If this is a COW mapping, it is sufficient to check that the mapping
+ * covers the offset. Be careful to check this first because the caller
+ * can revalidate a COW mapping without updating the data seqno.
+ */
+ if (wpc->fork == XFS_COW_FORK)
+ return true;
+
+ /*
+ * This is not a COW mapping. Check the sequence number of the data fork
+ * because concurrent changes could have invalidated the extent. Check
+ * the COW fork because concurrent changes since the last time we
+ * checked (and found nothing at this offset) could have added
+ * overlapping blocks.
+ */
+ if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq))
+ return false;
+ if (xfs_inode_has_cow_data(ip) &&
+ wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
+ return false;
+ return true;
+}
+
+/*
+ * Pass in a dellalloc extent and convert it to real extents, return the real
+ * extent that maps offset_fsb in wpc->imap.
+ *
+ * The current page is held locked so nothing could have removed the block
+ * backing offset_fsb, although it could have moved from the COW to the data
+ * fork by another thread.
+ */
+static int
+xfs_convert_blocks(
+ struct xfs_writepage_ctx *wpc,
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb)
+{
+ int error;
+
+ /*
+ * Attempt to allocate whatever delalloc extent currently backs
+ * offset_fsb and put the result into wpc->imap. Allocate in a loop
+ * because it may take several attempts to allocate real blocks for a
+ * contiguous delalloc extent if free space is sufficiently fragmented.
+ */
+ do {
+ error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb,
+ &wpc->imap, wpc->fork == XFS_COW_FORK ?
+ &wpc->cow_seq : &wpc->data_seq);
+ if (error)
+ return error;
+ } while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb);
+
+ return 0;
+}
+
STATIC int
xfs_map_blocks(
struct xfs_writepage_ctx *wpc,
@@ -310,26 +372,16 @@ xfs_map_blocks(
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
ssize_t count = i_blocksize(inode);
- xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
xfs_fileoff_t cow_fsb = NULLFILEOFF;
struct xfs_bmbt_irec imap;
- int whichfork = XFS_DATA_FORK;
struct xfs_iext_cursor icur;
- bool imap_valid;
+ int retries = 0;
int error = 0;
- /*
- * We have to make sure the cached mapping is within EOF to protect
- * against eofblocks trimming on file release leaving us with a stale
- * mapping. Otherwise, a page for a subsequent file extending buffered
- * write could get picked up by this writeback cycle and written to the
- * wrong blocks.
- *
- * Note that what we really want here is a generic mapping invalidation
- * mechanism to protect us from arbitrary extent modifying contexts, not
- * just eofblocks.
- */
- xfs_trim_extent_eof(&wpc->imap, ip);
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
/*
* COW fork blocks can overlap data fork blocks even if the blocks
@@ -346,31 +398,19 @@ xfs_map_blocks(
* against concurrent updates and provides a memory barrier on the way
* out that ensures that we always see the current value.
*/
- imap_valid = offset_fsb >= wpc->imap.br_startoff &&
- offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount;
- if (imap_valid &&
- (!xfs_inode_has_cow_data(ip) ||
- wpc->io_type == XFS_IO_COW ||
- wpc->cow_seq == READ_ONCE(ip->i_cowfp->if_seq)))
+ if (xfs_imap_valid(wpc, ip, offset_fsb))
return 0;
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
/*
* If we don't have a valid map, now it's time to get a new one for this
* offset. This will convert delayed allocations (including COW ones)
* into real extents. If we return without a valid map, it means we
* landed in a hole and we skip the block.
*/
+retry:
xfs_ilock(ip, XFS_ILOCK_SHARED);
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
(ip->i_df.if_flags & XFS_IFEXTENTS));
- ASSERT(offset <= mp->m_super->s_maxbytes);
-
- if (offset > mp->m_super->s_maxbytes - count)
- count = mp->m_super->s_maxbytes - offset;
- end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
/*
* Check if this is offset is covered by a COW extents, and if yes use
@@ -382,30 +422,16 @@ xfs_map_blocks(
if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
- /*
- * Truncate can race with writeback since writeback doesn't
- * take the iolock and truncate decreases the file size before
- * it starts truncating the pages between new_size and old_size.
- * Therefore, we can end up in the situation where writeback
- * gets a CoW fork mapping but the truncate makes the mapping
- * invalid and we end up in here trying to get a new mapping.
- * bail out here so that we simply never get a valid mapping
- * and so we drop the write altogether. The page truncation
- * will kill the contents anyway.
- */
- if (offset > i_size_read(inode)) {
- wpc->io_type = XFS_IO_HOLE;
- return 0;
- }
- whichfork = XFS_COW_FORK;
- wpc->io_type = XFS_IO_COW;
+
+ wpc->fork = XFS_COW_FORK;
goto allocate_blocks;
}
/*
- * Map valid and no COW extent in the way? We're done.
+ * No COW extent overlap. Revalidate now that we may have updated
+ * ->cow_seq. If the data mapping is still valid, we're done.
*/
- if (imap_valid) {
+ if (xfs_imap_valid(wpc, ip, offset_fsb)) {
xfs_iunlock(ip, XFS_ILOCK_SHARED);
return 0;
}
@@ -417,49 +443,65 @@ xfs_map_blocks(
*/
if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
imap.br_startoff = end_fsb; /* fake a hole past EOF */
+ wpc->data_seq = READ_ONCE(ip->i_df.if_seq);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ wpc->fork = XFS_DATA_FORK;
+
+ /* landed in a hole or beyond EOF? */
if (imap.br_startoff > offset_fsb) {
- /* landed in a hole or beyond EOF */
imap.br_blockcount = imap.br_startoff - offset_fsb;
imap.br_startoff = offset_fsb;
imap.br_startblock = HOLESTARTBLOCK;
- wpc->io_type = XFS_IO_HOLE;
- } else {
- /*
- * Truncate to the next COW extent if there is one. This is the
- * only opportunity to do this because we can skip COW fork
- * lookups for the subsequent blocks in the mapping; however,
- * the requirement to treat the COW range separately remains.
- */
- if (cow_fsb != NULLFILEOFF &&
- cow_fsb < imap.br_startoff + imap.br_blockcount)
- imap.br_blockcount = cow_fsb - imap.br_startoff;
-
- if (isnullstartblock(imap.br_startblock)) {
- /* got a delalloc extent */
- wpc->io_type = XFS_IO_DELALLOC;
- goto allocate_blocks;
- }
-
- if (imap.br_state == XFS_EXT_UNWRITTEN)
- wpc->io_type = XFS_IO_UNWRITTEN;
- else
- wpc->io_type = XFS_IO_OVERWRITE;
+ imap.br_state = XFS_EXT_NORM;
}
+ /*
+ * Truncate to the next COW extent if there is one. This is the only
+ * opportunity to do this because we can skip COW fork lookups for the
+ * subsequent blocks in the mapping; however, the requirement to treat
+ * the COW range separately remains.
+ */
+ if (cow_fsb != NULLFILEOFF &&
+ cow_fsb < imap.br_startoff + imap.br_blockcount)
+ imap.br_blockcount = cow_fsb - imap.br_startoff;
+
+ /* got a delalloc extent? */
+ if (imap.br_startblock != HOLESTARTBLOCK &&
+ isnullstartblock(imap.br_startblock))
+ goto allocate_blocks;
+
wpc->imap = imap;
- trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap);
+ trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap);
return 0;
allocate_blocks:
- error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap,
- &wpc->cow_seq);
- if (error)
+ error = xfs_convert_blocks(wpc, ip, offset_fsb);
+ if (error) {
+ /*
+ * If we failed to find the extent in the COW fork we might have
+ * raced with a COW to data fork conversion or truncate.
+ * Restart the lookup to catch the extent in the data fork for
+ * the former case, but prevent additional retries to avoid
+ * looping forever for the latter case.
+ */
+ if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++)
+ goto retry;
+ ASSERT(error != -EAGAIN);
return error;
- ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF ||
- imap.br_startoff + imap.br_blockcount <= cow_fsb);
- wpc->imap = imap;
- trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap);
+ }
+
+ /*
+ * Due to merging the return real extent might be larger than the
+ * original delalloc one. Trim the return extent to the next COW
+ * boundary again to force a re-lookup.
+ */
+ if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF &&
+ cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount)
+ wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff;
+
+ ASSERT(wpc->imap.br_startoff <= offset_fsb);
+ ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb);
+ trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap);
return 0;
}
@@ -484,7 +526,7 @@ xfs_submit_ioend(
int status)
{
/* Convert CoW extents to regular */
- if (!status && ioend->io_type == XFS_IO_COW) {
+ if (!status && ioend->io_fork == XFS_COW_FORK) {
/*
* Yuk. This can do memory allocation, but is not a
* transactional operation so everything is done in GFP_KERNEL
@@ -502,7 +544,8 @@ xfs_submit_ioend(
/* Reserve log space if we might write beyond the on-disk inode size. */
if (!status &&
- ioend->io_type != XFS_IO_UNWRITTEN &&
+ (ioend->io_fork == XFS_COW_FORK ||
+ ioend->io_state != XFS_EXT_UNWRITTEN) &&
xfs_ioend_is_append(ioend) &&
!ioend->io_append_trans)
status = xfs_setfilesize_trans_alloc(ioend);
@@ -531,7 +574,8 @@ xfs_submit_ioend(
static struct xfs_ioend *
xfs_alloc_ioend(
struct inode *inode,
- unsigned int type,
+ int fork,
+ xfs_exntst_t state,
xfs_off_t offset,
struct block_device *bdev,
sector_t sector)
@@ -545,7 +589,8 @@ xfs_alloc_ioend(
ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
INIT_LIST_HEAD(&ioend->io_list);
- ioend->io_type = type;
+ ioend->io_fork = fork;
+ ioend->io_state = state;
ioend->io_inode = inode;
ioend->io_size = 0;
ioend->io_offset = offset;
@@ -606,21 +651,23 @@ xfs_add_to_ioend(
sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);
- if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
+ if (!wpc->ioend ||
+ wpc->fork != wpc->ioend->io_fork ||
+ wpc->imap.br_state != wpc->ioend->io_state ||
sector != bio_end_sector(wpc->ioend->io_bio) ||
offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
if (wpc->ioend)
list_add(&wpc->ioend->io_list, iolist);
- wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
- bdev, sector);
+ wpc->ioend = xfs_alloc_ioend(inode, wpc->fork,
+ wpc->imap.br_state, offset, bdev, sector);
}
- if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
+ if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, true)) {
if (iop)
atomic_inc(&iop->write_count);
if (bio_full(wpc->ioend->io_bio))
xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
- __bio_add_page(wpc->ioend->io_bio, page, len, poff);
+ bio_add_page(wpc->ioend->io_bio, page, len, poff);
}
wpc->ioend->io_size += len;
@@ -721,7 +768,7 @@ xfs_writepage_map(
error = xfs_map_blocks(wpc, inode, file_offset);
if (error)
break;
- if (wpc->io_type == XFS_IO_HOLE)
+ if (wpc->imap.br_startblock == HOLESTARTBLOCK)
continue;
xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
&submit_list);
@@ -916,9 +963,7 @@ xfs_vm_writepage(
struct page *page,
struct writeback_control *wbc)
{
- struct xfs_writepage_ctx wpc = {
- .io_type = XFS_IO_HOLE,
- };
+ struct xfs_writepage_ctx wpc = { };
int ret;
ret = xfs_do_writepage(page, wbc, &wpc);
@@ -932,9 +977,7 @@ xfs_vm_writepages(
struct address_space *mapping,
struct writeback_control *wbc)
{
- struct xfs_writepage_ctx wpc = {
- .io_type = XFS_IO_HOLE,
- };
+ struct xfs_writepage_ctx wpc = { };
int ret;
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
@@ -981,7 +1024,7 @@ xfs_vm_bmap(
* Since we don't pass back blockdev info, we can't return bmap
* information for rt files either.
*/
- if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
+ if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
return 0;
return iomap_bmap(mapping, block, &xfs_iomap_ops);
}
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index e5c23948a8ab..6c2615b83c5d 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -9,32 +9,12 @@
extern struct bio_set xfs_ioend_bioset;
/*
- * Types of I/O for bmap clustering and I/O completion tracking.
- *
- * This enum is used in string mapping in xfs_trace.h; please keep the
- * TRACE_DEFINE_ENUMs for it up to date.
- */
-enum {
- XFS_IO_HOLE, /* covers region without any block allocation */
- XFS_IO_DELALLOC, /* covers delalloc region */
- XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */
- XFS_IO_OVERWRITE, /* covers already allocated extent */
- XFS_IO_COW, /* covers copy-on-write extent */
-};
-
-#define XFS_IO_TYPES \
- { XFS_IO_HOLE, "hole" }, \
- { XFS_IO_DELALLOC, "delalloc" }, \
- { XFS_IO_UNWRITTEN, "unwritten" }, \
- { XFS_IO_OVERWRITE, "overwrite" }, \
- { XFS_IO_COW, "CoW" }
-
-/*
* Structure for buffered I/O completions.
*/
struct xfs_ioend {
struct list_head io_list; /* next ioend in chain */
- unsigned int io_type; /* delalloc / unwritten */
+ int io_fork; /* inode fork written back */
+ xfs_exntst_t io_state; /* extent state */
struct inode *io_inode; /* file being written to */
size_t io_size; /* size of the extent */
xfs_off_t io_offset; /* offset in the file */
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index a58034049995..3d213a7394c5 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -555,6 +555,7 @@ xfs_attr_put_listent(
attrlist_ent_t *aep;
int arraytop;
+ ASSERT(!context->seen_enough);
ASSERT(!(context->flags & ATTR_KERNOVAL));
ASSERT(context->count >= 0);
ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 1ee8c5539fa4..2db43ff4f8b5 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1162,16 +1162,13 @@ xfs_zero_file_space(
* by virtue of the hole punch.
*/
error = xfs_free_file_space(ip, offset, len);
- if (error)
- goto out;
+ if (error || xfs_is_always_cow_inode(ip))
+ return error;
- error = xfs_alloc_file_space(ip, round_down(offset, blksize),
+ return xfs_alloc_file_space(ip, round_down(offset, blksize),
round_up(offset + len, blksize) -
round_down(offset, blksize),
XFS_BMAPI_PREALLOC);
-out:
- return error;
-
}
static int
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index eedc5e0156ff..548344e25128 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -776,13 +776,24 @@ _xfs_buf_read(
}
/*
- * If the caller passed in an ops structure and the buffer doesn't have ops
- * assigned, set the ops and use them to verify the contents. If the contents
- * cannot be verified, we'll clear XBF_DONE. We assume the buffer has no
- * recorded errors and is already in XBF_DONE state.
+ * Reverify a buffer found in cache without an attached ->b_ops.
+ *
+ * If the caller passed an ops structure and the buffer doesn't have ops
+ * assigned, set the ops and use it to verify the contents. If verification
+ * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
+ * already in XBF_DONE state on entry.
+ *
+ * Under normal operations, every in-core buffer is verified on read I/O
+ * completion. There are two scenarios that can lead to in-core buffers without
+ * an assigned ->b_ops. The first is during log recovery of buffers on a V4
+ * filesystem, though these buffers are purged at the end of recovery. The
+ * other is online repair, which intentionally reads with a NULL buffer ops to
+ * run several verifiers across an in-core buffer in order to establish buffer
+ * type. If repair can't establish that, the buffer will be left in memory
+ * with NULL buffer ops.
*/
int
-xfs_buf_ensure_ops(
+xfs_buf_reverify(
struct xfs_buf *bp,
const struct xfs_buf_ops *ops)
{
@@ -824,7 +835,7 @@ xfs_buf_read_map(
return bp;
}
- xfs_buf_ensure_ops(bp, ops);
+ xfs_buf_reverify(bp, ops);
if (flags & XBF_ASYNC) {
/*
@@ -1536,8 +1547,7 @@ __xfs_buf_submit(
xfs_buf_ioerror(bp, -EIO);
bp->b_flags &= ~XBF_DONE;
xfs_buf_stale(bp);
- if (bp->b_flags & XBF_ASYNC)
- xfs_buf_ioend(bp);
+ xfs_buf_ioend(bp);
return -EIO;
}
@@ -2194,3 +2204,40 @@ void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
atomic_set(&bp->b_lru_ref, lru_ref);
}
+
+/*
+ * Verify an on-disk magic value against the magic value specified in the
+ * verifier structure. The verifier magic is in disk byte order so the caller is
+ * expected to pass the value directly from disk.
+ */
+bool
+xfs_verify_magic(
+ struct xfs_buf *bp,
+ __be32 dmagic)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ int idx;
+
+ idx = xfs_sb_version_hascrc(&mp->m_sb);
+ if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])))
+ return false;
+ return dmagic == bp->b_ops->magic[idx];
+}
+/*
+ * Verify an on-disk magic value against the magic value specified in the
+ * verifier structure. The verifier magic is in disk byte order so the caller is
+ * expected to pass the value directly from disk.
+ */
+bool
+xfs_verify_magic16(
+ struct xfs_buf *bp,
+ __be16 dmagic)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ int idx;
+
+ idx = xfs_sb_version_hascrc(&mp->m_sb);
+ if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])))
+ return false;
+ return dmagic == bp->b_ops->magic16[idx];
+}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index b9f5511ea998..d0b96e071cec 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -125,6 +125,10 @@ struct xfs_buf_map {
struct xfs_buf_ops {
char *name;
+ union {
+ __be32 magic[2]; /* v4 and v5 on disk magic values */
+ __be16 magic16[2]; /* v4 and v5 on disk magic values */
+ };
void (*verify_read)(struct xfs_buf *);
void (*verify_write)(struct xfs_buf *);
xfs_failaddr_t (*verify_struct)(struct xfs_buf *bp);
@@ -385,6 +389,8 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int);
#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
-int xfs_buf_ensure_ops(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic);
+bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
#endif /* __XFS_BUF_H__ */
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 9866f542e77b..a1e177f66404 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -51,6 +51,7 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_BUF_LRU_REF,
XFS_RANDOM_FORCE_SCRUB_REPAIR,
XFS_RANDOM_FORCE_SUMMARY_RECALC,
+ XFS_RANDOM_IUNLINK_FALLBACK,
};
struct xfs_errortag_attr {
@@ -159,6 +160,7 @@ XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN);
XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF);
XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR);
XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC);
+XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -195,6 +197,7 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
XFS_ERRORTAG_ATTR_LIST(force_repair),
XFS_ERRORTAG_ATTR_LIST(bad_summary),
+ XFS_ERRORTAG_ATTR_LIST(iunlink_fallback),
NULL,
};
@@ -357,7 +360,8 @@ xfs_buf_verifier_error(
fa = failaddr ? failaddr : __return_address;
__xfs_buf_ioerror(bp, error, fa);
- xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx %s",
+ xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR,
+ "Metadata %s detected at %pS, %s block 0x%llx %s",
bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
fa, bp->b_ops->name, bp->b_bn, name);
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 246d3e989c6c..602aa7d62b66 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -98,5 +98,6 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp);
#define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020
#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
+#define XFS_PTAG_VERIFIER_ERROR 0x00000100
#endif /* __XFS_ERROR_H__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e47425071e65..1f2e2845eb76 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -507,7 +507,7 @@ xfs_file_dio_aio_write(
* We can't properly handle unaligned direct I/O to reflink
* files yet, as we can't unshare a partial block.
*/
- if (xfs_is_reflink_inode(ip)) {
+ if (xfs_is_cow_inode(ip)) {
trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
return -EREMCHG;
}
@@ -872,14 +872,27 @@ xfs_file_fallocate(
goto out_unlock;
}
- if (mode & FALLOC_FL_ZERO_RANGE)
+ if (mode & FALLOC_FL_ZERO_RANGE) {
error = xfs_zero_file_space(ip, offset, len);
- else {
- if (mode & FALLOC_FL_UNSHARE_RANGE) {
- error = xfs_reflink_unshare(ip, offset, len);
- if (error)
- goto out_unlock;
+ } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
+ error = xfs_reflink_unshare(ip, offset, len);
+ if (error)
+ goto out_unlock;
+
+ if (!xfs_is_always_cow_inode(ip)) {
+ error = xfs_alloc_file_space(ip, offset, len,
+ XFS_BMAPI_PREALLOC);
}
+ } else {
+ /*
+ * If always_cow mode we can't use preallocations and
+ * thus should not create them.
+ */
+ if (xfs_is_always_cow_inode(ip)) {
+ error = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
error = xfs_alloc_file_space(ip, offset, len,
XFS_BMAPI_PREALLOC);
}
@@ -1068,10 +1081,10 @@ xfs_file_llseek(
default:
return generic_file_llseek(file, offset, whence);
case SEEK_HOLE:
- offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops);
+ offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
break;
case SEEK_DATA:
- offset = iomap_seek_data(inode, offset, &xfs_iomap_ops);
+ offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
break;
}
@@ -1203,6 +1216,7 @@ const struct file_operations xfs_file_operations = {
.write_iter = xfs_file_write_iter,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
+ .iopoll = iomap_dio_iopoll,
.unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = xfs_file_compat_ioctl,
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index f3ef70c542e1..584648582ba7 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -533,6 +533,7 @@ xfs_fs_reserve_ag_blocks(
int error = 0;
int err2;
+ mp->m_finobt_nores = false;
for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
pag = xfs_perag_get(mp, agno);
err2 = xfs_ag_resv_init(pag, NULL);
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 5169e84ae382..d0d377384120 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -16,7 +16,7 @@ xfs_param_t xfs_params = {
/* MIN DFLT MAX */
.sgid_inherit = { 0, 0, 1 },
.symlink_mode = { 0, 0, 1 },
- .panic_mask = { 0, 0, 255 },
+ .panic_mask = { 0, 0, 256 },
.error_level = { 0, 3, 11 },
.syncd_timer = { 1*100, 30*100, 7200*100},
.stats_clear = { 0, 0, 1 },
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ae667ba74a1c..f643a9295179 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1332,7 +1332,7 @@ xfs_create_tmpfile(
if (error)
goto out_trans_cancel;
- error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip);
+ error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip);
if (error)
goto out_trans_cancel;
@@ -1754,7 +1754,7 @@ xfs_inactive_ifree(
* now remains allocated and sits on the unlinked list until the fs is
* repaired.
*/
- if (unlikely(mp->m_inotbt_nores)) {
+ if (unlikely(mp->m_finobt_nores)) {
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
&tp);
@@ -1907,86 +1907,510 @@ xfs_inactive(
}
/*
- * This is called when the inode's link count goes to 0 or we are creating a
- * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be
- * set to true as the link count is dropped to zero by the VFS after we've
- * created the file successfully, so we have to add it to the unlinked list
- * while the link count is non-zero.
+ * In-Core Unlinked List Lookups
+ * =============================
+ *
+ * Every inode is supposed to be reachable from some other piece of metadata
+ * with the exception of the root directory. Inodes with a connection to a
+ * file descriptor but not linked from anywhere in the on-disk directory tree
+ * are collectively known as unlinked inodes, though the filesystem itself
+ * maintains links to these inodes so that on-disk metadata are consistent.
+ *
+ * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI
+ * header contains a number of buckets that point to an inode, and each inode
+ * record has a pointer to the next inode in the hash chain. This
+ * singly-linked list causes scaling problems in the iunlink remove function
+ * because we must walk that list to find the inode that points to the inode
+ * being removed from the unlinked hash bucket list.
+ *
+ * What if we modelled the unlinked list as a collection of records capturing
+ * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd
+ * have a fast way to look up unlinked list predecessors, which avoids the
+ * slow list walk. That's exactly what we do here (in-core) with a per-AG
+ * rhashtable.
+ *
+ * Because this is a backref cache, we ignore operational failures since the
+ * iunlink code can fall back to the slow bucket walk. The only errors that
+ * should bubble out are for obviously incorrect situations.
+ *
+ * All users of the backref cache MUST hold the AGI buffer lock to serialize
+ * access or have otherwise provided for concurrency control.
+ */
+
+/* Capture a "X.next_unlinked = Y" relationship. */
+struct xfs_iunlink {
+ struct rhash_head iu_rhash_head;
+ xfs_agino_t iu_agino; /* X */
+ xfs_agino_t iu_next_unlinked; /* Y */
+};
+
+/* Unlinked list predecessor lookup hashtable construction */
+static int
+xfs_iunlink_obj_cmpfn(
+ struct rhashtable_compare_arg *arg,
+ const void *obj)
+{
+ const xfs_agino_t *key = arg->key;
+ const struct xfs_iunlink *iu = obj;
+
+ if (iu->iu_next_unlinked != *key)
+ return 1;
+ return 0;
+}
+
+static const struct rhashtable_params xfs_iunlink_hash_params = {
+ .min_size = XFS_AGI_UNLINKED_BUCKETS,
+ .key_len = sizeof(xfs_agino_t),
+ .key_offset = offsetof(struct xfs_iunlink,
+ iu_next_unlinked),
+ .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head),
+ .automatic_shrinking = true,
+ .obj_cmpfn = xfs_iunlink_obj_cmpfn,
+};
+
+/*
+ * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such
+ * relation is found.
+ */
+static xfs_agino_t
+xfs_iunlink_lookup_backref(
+ struct xfs_perag *pag,
+ xfs_agino_t agino)
+{
+ struct xfs_iunlink *iu;
+
+ iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
+ xfs_iunlink_hash_params);
+ return iu ? iu->iu_agino : NULLAGINO;
+}
+
+/*
+ * Take ownership of an iunlink cache entry and insert it into the hash table.
+ * If successful, the entry will be owned by the cache; if not, it is freed.
+ * Either way, the caller does not own @iu after this call.
+ */
+static int
+xfs_iunlink_insert_backref(
+ struct xfs_perag *pag,
+ struct xfs_iunlink *iu)
+{
+ int error;
+
+ error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
+ &iu->iu_rhash_head, xfs_iunlink_hash_params);
+ /*
+ * Fail loudly if there already was an entry because that's a sign of
+ * corruption of in-memory data. Also fail loudly if we see an error
+ * code we didn't anticipate from the rhashtable code. Currently we
+ * only anticipate ENOMEM.
+ */
+ if (error) {
+ WARN(error != -ENOMEM, "iunlink cache insert error %d", error);
+ kmem_free(iu);
+ }
+ /*
+ * Absorb any runtime errors that aren't a result of corruption because
+ * this is a cache and we can always fall back to bucket list scanning.
+ */
+ if (error != 0 && error != -EEXIST)
+ error = 0;
+ return error;
+}
+
+/* Remember that @prev_agino.next_unlinked = @this_agino. */
+static int
+xfs_iunlink_add_backref(
+ struct xfs_perag *pag,
+ xfs_agino_t prev_agino,
+ xfs_agino_t this_agino)
+{
+ struct xfs_iunlink *iu;
+
+ if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
+ return 0;
+
+ iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS);
+ iu->iu_agino = prev_agino;
+ iu->iu_next_unlinked = this_agino;
+
+ return xfs_iunlink_insert_backref(pag, iu);
+}
+
+/*
+ * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
+ * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there
+ * wasn't any such entry then we don't bother.
+ */
+static int
+xfs_iunlink_change_backref(
+ struct xfs_perag *pag,
+ xfs_agino_t agino,
+ xfs_agino_t next_unlinked)
+{
+ struct xfs_iunlink *iu;
+ int error;
+
+ /* Look up the old entry; if there wasn't one then exit. */
+ iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
+ xfs_iunlink_hash_params);
+ if (!iu)
+ return 0;
+
+ /*
+ * Remove the entry. This shouldn't ever return an error, but if we
+ * couldn't remove the old entry we don't want to add it again to the
+ * hash table, and if the entry disappeared on us then someone's
+ * violated the locking rules and we need to fail loudly. Either way
+ * we cannot remove the inode because internal state is or would have
+ * been corrupt.
+ */
+ error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
+ &iu->iu_rhash_head, xfs_iunlink_hash_params);
+ if (error)
+ return error;
+
+ /* If there is no new next entry just free our item and return. */
+ if (next_unlinked == NULLAGINO) {
+ kmem_free(iu);
+ return 0;
+ }
+
+ /* Update the entry and re-add it to the hash table. */
+ iu->iu_next_unlinked = next_unlinked;
+ return xfs_iunlink_insert_backref(pag, iu);
+}
+
+/* Set up the in-core predecessor structures. */
+int
+xfs_iunlink_init(
+ struct xfs_perag *pag)
+{
+ return rhashtable_init(&pag->pagi_unlinked_hash,
+ &xfs_iunlink_hash_params);
+}
+
+/* Free the in-core predecessor structures. */
+static void
+xfs_iunlink_free_item(
+ void *ptr,
+ void *arg)
+{
+ struct xfs_iunlink *iu = ptr;
+ bool *freed_anything = arg;
+
+ *freed_anything = true;
+ kmem_free(iu);
+}
+
+void
+xfs_iunlink_destroy(
+ struct xfs_perag *pag)
+{
+ bool freed_anything = false;
+
+ rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
+ xfs_iunlink_free_item, &freed_anything);
+
+ ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount));
+}
+
+/*
+ * Point the AGI unlinked bucket at an inode and log the results. The caller
+ * is responsible for validating the old value.
+ */
+STATIC int
+xfs_iunlink_update_bucket(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ struct xfs_buf *agibp,
+ unsigned int bucket_index,
+ xfs_agino_t new_agino)
+{
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
+ xfs_agino_t old_value;
+ int offset;
+
+ ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino));
+
+ old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index,
+ old_value, new_agino);
+
+ /*
+ * We should never find the head of the list already set to the value
+ * passed in because either we're adding or removing ourselves from the
+ * head of the list.
+ */
+ if (old_value == new_agino)
+ return -EFSCORRUPTED;
+
+ agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
+ offset = offsetof(struct xfs_agi, agi_unlinked) +
+ (sizeof(xfs_agino_t) * bucket_index);
+ xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
+ return 0;
+}
+
+/* Set an on-disk inode's next_unlinked pointer. */
+STATIC void
+xfs_iunlink_update_dinode(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino,
+ struct xfs_buf *ibp,
+ struct xfs_dinode *dip,
+ struct xfs_imap *imap,
+ xfs_agino_t next_agino)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ int offset;
+
+ ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
+
+ trace_xfs_iunlink_update_dinode(mp, agno, agino,
+ be32_to_cpu(dip->di_next_unlinked), next_agino);
+
+ dip->di_next_unlinked = cpu_to_be32(next_agino);
+ offset = imap->im_boffset +
+ offsetof(struct xfs_dinode, di_next_unlinked);
+
+ /* need to recalc the inode CRC if appropriate */
+ xfs_dinode_calc_crc(mp, dip);
+ xfs_trans_inode_buf(tp, ibp);
+ xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
+ xfs_inobp_check(mp, ibp);
+}
+
+/* Set an in-core inode's unlinked pointer and return the old value. */
+STATIC int
+xfs_iunlink_update_inode(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ xfs_agnumber_t agno,
+ xfs_agino_t next_agino,
+ xfs_agino_t *old_next_agino)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_dinode *dip;
+ struct xfs_buf *ibp;
+ xfs_agino_t old_value;
+ int error;
+
+ ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
+
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0);
+ if (error)
+ return error;
+
+ /* Make sure the old pointer isn't garbage. */
+ old_value = be32_to_cpu(dip->di_next_unlinked);
+ if (!xfs_verify_agino_or_null(mp, agno, old_value)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /*
+ * Since we're updating a linked list, we should never find that the
+ * current pointer is the same as the new value, unless we're
+ * terminating the list.
+ */
+ *old_next_agino = old_value;
+ if (old_value == next_agino) {
+ if (next_agino != NULLAGINO)
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /* Ok, update the new pointer. */
+ xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino),
+ ibp, dip, &ip->i_imap, next_agino);
+ return 0;
+out:
+ xfs_trans_brelse(tp, ibp);
+ return error;
+}
+
+/*
+ * This is called when the inode's link count has gone to 0 or we are creating
+ * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0.
*
* We place the on-disk inode on a list in the AGI. It will be pulled from this
* list when the inode is freed.
*/
STATIC int
xfs_iunlink(
- struct xfs_trans *tp,
- struct xfs_inode *ip)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
{
- xfs_mount_t *mp = tp->t_mountp;
- xfs_agi_t *agi;
- xfs_dinode_t *dip;
- xfs_buf_t *agibp;
- xfs_buf_t *ibp;
- xfs_agino_t agino;
- short bucket_index;
- int offset;
- int error;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi;
+ struct xfs_buf *agibp;
+ xfs_agino_t next_agino;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+ int error;
+ ASSERT(VFS_I(ip)->i_nlink == 0);
ASSERT(VFS_I(ip)->i_mode != 0);
+ trace_xfs_iunlink(ip);
- /*
- * Get the agi buffer first. It ensures lock ordering
- * on the list.
- */
- error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
+ /* Get the agi buffer first. It ensures lock ordering on the list. */
+ error = xfs_read_agi(mp, tp, agno, &agibp);
if (error)
return error;
agi = XFS_BUF_TO_AGI(agibp);
/*
- * Get the index into the agi hash table for the
- * list this inode will go on.
+ * Get the index into the agi hash table for the list this inode will
+ * go on. Make sure the pointer isn't garbage and that this inode
+ * isn't already on the list.
*/
- agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
- ASSERT(agino != 0);
- bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
- ASSERT(agi->agi_unlinked[bucket_index]);
- ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
+ next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ if (next_agino == agino ||
+ !xfs_verify_agino_or_null(mp, agno, next_agino))
+ return -EFSCORRUPTED;
+
+ if (next_agino != NULLAGINO) {
+ struct xfs_perag *pag;
+ xfs_agino_t old_agino;
+
+ /*
+ * There is already another inode in the bucket, so point this
+ * inode to the current head of the list.
+ */
+ error = xfs_iunlink_update_inode(tp, ip, agno, next_agino,
+ &old_agino);
+ if (error)
+ return error;
+ ASSERT(old_agino == NULLAGINO);
- if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
/*
- * There is already another inode in the bucket we need
- * to add ourselves to. Add us at the front of the list.
- * Here we put the head pointer into our next pointer,
- * and then we fall through to point the head at us.
+ * agino has been unlinked, add a backref from the next inode
+ * back to agino.
*/
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
- 0, 0);
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_iunlink_add_backref(pag, agino, next_agino);
+ xfs_perag_put(pag);
if (error)
return error;
+ }
+
+ /* Point the head of the list to point to this inode. */
+ return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino);
+}
- ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
- dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
- offset = ip->i_imap.im_boffset +
- offsetof(xfs_dinode_t, di_next_unlinked);
+/* Return the imap, dinode pointer, and buffer for an inode. */
+STATIC int
+xfs_iunlink_map_ino(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino,
+ struct xfs_imap *imap,
+ struct xfs_dinode **dipp,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ int error;
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, dip);
+ imap->im_blkno = 0;
+ error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_imap returned error %d.",
+ __func__, error);
+ return error;
+ }
- xfs_trans_inode_buf(tp, ibp);
- xfs_trans_log_buf(tp, ibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- xfs_inobp_check(mp, ibp);
+ error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
+ __func__, error);
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Walk the unlinked chain from @head_agino until we find the inode that
+ * points to @target_agino. Return the inode number, map, dinode pointer,
+ * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp.
+ *
+ * @tp, @pag, @head_agino, and @target_agino are input parameters.
+ * @agino, @imap, @dipp, and @bpp are all output parameters.
+ *
+ * Do not call this function if @target_agino is the head of the list.
+ */
+STATIC int
+xfs_iunlink_map_prev(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agino_t head_agino,
+ xfs_agino_t target_agino,
+ xfs_agino_t *agino,
+ struct xfs_imap *imap,
+ struct xfs_dinode **dipp,
+ struct xfs_buf **bpp,
+ struct xfs_perag *pag)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_agino_t next_agino;
+ int error;
+
+ ASSERT(head_agino != target_agino);
+ *bpp = NULL;
+
+ /* See if our backref cache can find it faster. */
+ *agino = xfs_iunlink_lookup_backref(pag, target_agino);
+ if (*agino != NULLAGINO) {
+ error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp);
+ if (error)
+ return error;
+
+ if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
+ return 0;
+
+ /*
+ * If we get here the cache contents were corrupt, so drop the
+ * buffer and fall back to walking the bucket list.
+ */
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ WARN_ON_ONCE(1);
+ }
+
+ trace_xfs_iunlink_map_prev_fallback(mp, agno);
+
+ /* Otherwise, walk the entire bucket until we find it. */
+ next_agino = head_agino;
+ while (next_agino != target_agino) {
+ xfs_agino_t unlinked_agino;
+
+ if (*bpp)
+ xfs_trans_brelse(tp, *bpp);
+
+ *agino = next_agino;
+ error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp,
+ bpp);
+ if (error)
+ return error;
+
+ unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked);
+ /*
+ * Make sure this pointer is valid and isn't an obvious
+ * infinite loop.
+ */
+ if (!xfs_verify_agino(mp, agno, unlinked_agino) ||
+ next_agino == unlinked_agino) {
+ XFS_CORRUPTION_ERROR(__func__,
+ XFS_ERRLEVEL_LOW, mp,
+ *dipp, sizeof(**dipp));
+ error = -EFSCORRUPTED;
+ return error;
+ }
+ next_agino = unlinked_agino;
}
- /*
- * Point the bucket head pointer at the inode being inserted.
- */
- ASSERT(agino != 0);
- agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
- offset = offsetof(xfs_agi_t, agi_unlinked) +
- (sizeof(xfs_agino_t) * bucket_index);
- xfs_trans_log_buf(tp, agibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
return 0;
}
@@ -1995,181 +2419,106 @@ xfs_iunlink(
*/
STATIC int
xfs_iunlink_remove(
- xfs_trans_t *tp,
- xfs_inode_t *ip)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
{
- xfs_ino_t next_ino;
- xfs_mount_t *mp;
- xfs_agi_t *agi;
- xfs_dinode_t *dip;
- xfs_buf_t *agibp;
- xfs_buf_t *ibp;
- xfs_agnumber_t agno;
- xfs_agino_t agino;
- xfs_agino_t next_agino;
- xfs_buf_t *last_ibp;
- xfs_dinode_t *last_dip = NULL;
- short bucket_index;
- int offset, last_offset = 0;
- int error;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi;
+ struct xfs_buf *agibp;
+ struct xfs_buf *last_ibp;
+ struct xfs_dinode *last_dip = NULL;
+ struct xfs_perag *pag = NULL;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ xfs_agino_t next_agino;
+ xfs_agino_t head_agino;
+ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+ int error;
- mp = tp->t_mountp;
- agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+ trace_xfs_iunlink_remove(ip);
- /*
- * Get the agi buffer first. It ensures lock ordering
- * on the list.
- */
+ /* Get the agi buffer first. It ensures lock ordering on the list. */
error = xfs_read_agi(mp, tp, agno, &agibp);
if (error)
return error;
-
agi = XFS_BUF_TO_AGI(agibp);
/*
- * Get the index into the agi hash table for the
- * list this inode will go on.
+ * Get the index into the agi hash table for the list this inode will
+ * go on. Make sure the head pointer isn't garbage.
*/
- agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
- if (!xfs_verify_agino(mp, agno, agino))
- return -EFSCORRUPTED;
- bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
- if (!xfs_verify_agino(mp, agno,
- be32_to_cpu(agi->agi_unlinked[bucket_index]))) {
+ head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ if (!xfs_verify_agino(mp, agno, head_agino)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
agi, sizeof(*agi));
return -EFSCORRUPTED;
}
- if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
- /*
- * We're at the head of the list. Get the inode's on-disk
- * buffer to see if there is anyone after us on the list.
- * Only modify our next pointer if it is not already NULLAGINO.
- * This saves us the overhead of dealing with the buffer when
- * there is no need to change it.
- */
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
- 0, 0);
- if (error) {
- xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
- __func__, error);
- return error;
- }
- next_agino = be32_to_cpu(dip->di_next_unlinked);
- ASSERT(next_agino != 0);
- if (next_agino != NULLAGINO) {
- dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
- offset = ip->i_imap.im_boffset +
- offsetof(xfs_dinode_t, di_next_unlinked);
-
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, dip);
-
- xfs_trans_inode_buf(tp, ibp);
- xfs_trans_log_buf(tp, ibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- xfs_inobp_check(mp, ibp);
- } else {
- xfs_trans_brelse(tp, ibp);
- }
- /*
- * Point the bucket head pointer at the next inode.
- */
- ASSERT(next_agino != 0);
- ASSERT(next_agino != agino);
- agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
- offset = offsetof(xfs_agi_t, agi_unlinked) +
- (sizeof(xfs_agino_t) * bucket_index);
- xfs_trans_log_buf(tp, agibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- } else {
- /*
- * We need to search the list for the inode being freed.
- */
- next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
- last_ibp = NULL;
- while (next_agino != agino) {
- struct xfs_imap imap;
+ /*
+ * Set our inode's next_unlinked pointer to NULL and then return
+ * the old pointer value so that we can update whatever was previous
+ * to us in the list to point to whatever was next in the list.
+ */
+ error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino);
+ if (error)
+ return error;
- if (last_ibp)
- xfs_trans_brelse(tp, last_ibp);
+ /*
+ * If there was a backref pointing from the next inode back to this
+ * one, remove it because we've removed this inode from the list.
+ *
+ * Later, if this inode was in the middle of the list we'll update
+ * this inode's backref to point from the next inode.
+ */
+ if (next_agino != NULLAGINO) {
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_iunlink_change_backref(pag, next_agino,
+ NULLAGINO);
+ if (error)
+ goto out;
+ }
- imap.im_blkno = 0;
- next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
+ if (head_agino == agino) {
+ /* Point the head of the list to the next unlinked inode. */
+ error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
+ next_agino);
+ if (error)
+ goto out;
+ } else {
+ struct xfs_imap imap;
+ xfs_agino_t prev_agino;
- error = xfs_imap(mp, tp, next_ino, &imap, 0);
- if (error) {
- xfs_warn(mp,
- "%s: xfs_imap returned error %d.",
- __func__, error);
- return error;
- }
+ if (!pag)
+ pag = xfs_perag_get(mp, agno);
- error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
- &last_ibp, 0, 0);
- if (error) {
- xfs_warn(mp,
- "%s: xfs_imap_to_bp returned error %d.",
- __func__, error);
- return error;
- }
+ /* We need to search the list for the inode being freed. */
+ error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
+ &prev_agino, &imap, &last_dip, &last_ibp,
+ pag);
+ if (error)
+ goto out;
- last_offset = imap.im_boffset;
- next_agino = be32_to_cpu(last_dip->di_next_unlinked);
- if (!xfs_verify_agino(mp, agno, next_agino)) {
- XFS_CORRUPTION_ERROR(__func__,
- XFS_ERRLEVEL_LOW, mp,
- last_dip, sizeof(*last_dip));
- return -EFSCORRUPTED;
- }
- }
+ /* Point the previous inode on the list to the next inode. */
+ xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
+ last_dip, &imap, next_agino);
/*
- * Now last_ibp points to the buffer previous to us on the
- * unlinked list. Pull us from the list.
+ * Now we deal with the backref for this inode. If this inode
+ * pointed at a real inode, change the backref that pointed to
+ * us to point to our old next. If this inode was the end of
+ * the list, delete the backref that pointed to us. Note that
+ * change_backref takes care of deleting the backref if
+ * next_agino is NULLAGINO.
*/
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
- 0, 0);
- if (error) {
- xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
- __func__, error);
- return error;
- }
- next_agino = be32_to_cpu(dip->di_next_unlinked);
- ASSERT(next_agino != 0);
- ASSERT(next_agino != agino);
- if (next_agino != NULLAGINO) {
- dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
- offset = ip->i_imap.im_boffset +
- offsetof(xfs_dinode_t, di_next_unlinked);
-
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, dip);
-
- xfs_trans_inode_buf(tp, ibp);
- xfs_trans_log_buf(tp, ibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- xfs_inobp_check(mp, ibp);
- } else {
- xfs_trans_brelse(tp, ibp);
- }
- /*
- * Point the previous inode on the list to the next inode.
- */
- last_dip->di_next_unlinked = cpu_to_be32(next_agino);
- ASSERT(next_agino != 0);
- offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
-
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, last_dip);
-
- xfs_trans_inode_buf(tp, last_ibp);
- xfs_trans_log_buf(tp, last_ibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- xfs_inobp_check(mp, last_ibp);
+ error = xfs_iunlink_change_backref(pag, agino, next_agino);
+ if (error)
+ goto out;
}
- return 0;
+
+out:
+ if (pag)
+ xfs_perag_put(pag);
+ return error;
}
/*
@@ -2833,11 +3182,9 @@ xfs_rename_alloc_whiteout(
/*
* Prepare the tmpfile inode as if it were created through the VFS.
- * Otherwise, the link increment paths will complain about nlink 0->1.
- * Drop the link count as done by d_tmpfile(), complete the inode setup
- * and flag it as linkable.
+ * Complete the inode setup and flag it as linkable. nlink is already
+ * zero, so we can skip the drop_nlink.
*/
- drop_nlink(VFS_I(tmpfile));
xfs_setup_iops(tmpfile);
xfs_finish_inode_setup(tmpfile);
VFS_I(tmpfile)->i_state |= I_LINKABLE;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index be2014520155..e62074a5257c 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -500,4 +500,7 @@ extern struct kmem_zone *xfs_inode_zone;
bool xfs_inode_verify_forks(struct xfs_inode *ip);
+int xfs_iunlink_init(struct xfs_perag *pag);
+void xfs_iunlink_destroy(struct xfs_perag *pag);
+
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 27c93b5f029d..63d323916bba 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -35,18 +35,40 @@
#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
<< mp->m_writeio_log)
-void
+static int
+xfs_alert_fsblock_zero(
+ xfs_inode_t *ip,
+ xfs_bmbt_irec_t *imap)
+{
+ xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
+ "Access to block zero in inode %llu "
+ "start_block: %llx start_off: %llx "
+ "blkcnt: %llx extent-state: %x",
+ (unsigned long long)ip->i_ino,
+ (unsigned long long)imap->br_startblock,
+ (unsigned long long)imap->br_startoff,
+ (unsigned long long)imap->br_blockcount,
+ imap->br_state);
+ return -EFSCORRUPTED;
+}
+
+int
xfs_bmbt_to_iomap(
struct xfs_inode *ip,
struct iomap *iomap,
- struct xfs_bmbt_irec *imap)
+ struct xfs_bmbt_irec *imap,
+ bool shared)
{
struct xfs_mount *mp = ip->i_mount;
+ if (unlikely(!imap->br_startblock && !XFS_IS_REALTIME_INODE(ip)))
+ return xfs_alert_fsblock_zero(ip, imap);
+
if (imap->br_startblock == HOLESTARTBLOCK) {
iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_HOLE;
- } else if (imap->br_startblock == DELAYSTARTBLOCK) {
+ } else if (imap->br_startblock == DELAYSTARTBLOCK ||
+ isnullstartblock(imap->br_startblock)) {
iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_DELALLOC;
} else {
@@ -60,6 +82,13 @@ xfs_bmbt_to_iomap(
iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
+
+ if (xfs_ipincount(ip) &&
+ (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+ iomap->flags |= IOMAP_F_DIRTY;
+ if (shared)
+ iomap->flags |= IOMAP_F_SHARED;
+ return 0;
}
static void
@@ -138,23 +167,6 @@ xfs_iomap_eof_align_last_fsb(
return 0;
}
-STATIC int
-xfs_alert_fsblock_zero(
- xfs_inode_t *ip,
- xfs_bmbt_irec_t *imap)
-{
- xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
- "Access to block zero in inode %llu "
- "start_block: %llx start_off: %llx "
- "blkcnt: %llx extent-state: %x",
- (unsigned long long)ip->i_ino,
- (unsigned long long)imap->br_startblock,
- (unsigned long long)imap->br_startoff,
- (unsigned long long)imap->br_blockcount,
- imap->br_state);
- return -EFSCORRUPTED;
-}
-
int
xfs_iomap_write_direct(
xfs_inode_t *ip,
@@ -383,12 +395,13 @@ xfs_quota_calc_throttle(
STATIC xfs_fsblock_t
xfs_iomap_prealloc_size(
struct xfs_inode *ip,
+ int whichfork,
loff_t offset,
loff_t count,
struct xfs_iext_cursor *icur)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
struct xfs_bmbt_irec prev;
int shift = 0;
@@ -522,15 +535,16 @@ xfs_file_iomap_begin_delay(
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t maxbytes_fsb =
XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
xfs_fileoff_t end_fsb;
- int error = 0, eof = 0;
- struct xfs_bmbt_irec got;
- struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec imap, cmap;
+ struct xfs_iext_cursor icur, ccur;
xfs_fsblock_t prealloc_blocks = 0;
+ bool eof = false, cow_eof = false, shared = false;
+ int whichfork = XFS_DATA_FORK;
+ int error = 0;
ASSERT(!XFS_IS_REALTIME_INODE(ip));
ASSERT(!xfs_get_extsz_hint(ip));
@@ -548,7 +562,7 @@ xfs_file_iomap_begin_delay(
XFS_STATS_INC(mp, xs_blk_mapw);
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
if (error)
goto out_unlock;
@@ -556,53 +570,101 @@ xfs_file_iomap_begin_delay(
end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
- eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
+ /*
+ * Search the data fork fork first to look up our source mapping. We
+ * always need the data fork map, as we have to return it to the
+ * iomap code so that the higher level write code can read data in to
+ * perform read-modify-write cycles for unaligned writes.
+ */
+ eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
if (eof)
- got.br_startoff = end_fsb; /* fake hole until the end */
+ imap.br_startoff = end_fsb; /* fake hole until the end */
+
+ /* We never need to allocate blocks for zeroing a hole. */
+ if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
+ xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
+ goto out_unlock;
+ }
- if (got.br_startoff <= offset_fsb) {
+ /*
+ * Search the COW fork extent list even if we did not find a data fork
+ * extent. This serves two purposes: first this implements the
+ * speculative preallocation using cowextsize, so that we also unshare
+ * block adjacent to shared blocks instead of just the shared blocks
+ * themselves. Second the lookup in the extent list is generally faster
+ * than going out to the shared extent tree.
+ */
+ if (xfs_is_cow_inode(ip)) {
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
+ cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
+ &ccur, &cmap);
+ if (!cow_eof && cmap.br_startoff <= offset_fsb) {
+ trace_xfs_reflink_cow_found(ip, &cmap);
+ whichfork = XFS_COW_FORK;
+ goto done;
+ }
+ }
+
+ if (imap.br_startoff <= offset_fsb) {
/*
* For reflink files we may need a delalloc reservation when
* overwriting shared extents. This includes zeroing of
* existing extents that contain data.
*/
- if (xfs_is_reflink_inode(ip) &&
- ((flags & IOMAP_WRITE) ||
- got.br_state != XFS_EXT_UNWRITTEN)) {
- xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
- error = xfs_reflink_reserve_cow(ip, &got);
- if (error)
- goto out_unlock;
+ if (!xfs_is_cow_inode(ip) ||
+ ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
+ trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+ &imap);
+ goto done;
}
- trace_xfs_iomap_found(ip, offset, count, 0, &got);
- goto done;
- }
+ xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
- if (flags & IOMAP_ZERO) {
- xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff);
- goto out_unlock;
+ /* Trim the mapping to the nearest shared extent boundary. */
+ error = xfs_inode_need_cow(ip, &imap, &shared);
+ if (error)
+ goto out_unlock;
+
+ /* Not shared? Just report the (potentially capped) extent. */
+ if (!shared) {
+ trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+ &imap);
+ goto done;
+ }
+
+ /*
+ * Fork all the shared blocks from our write offset until the
+ * end of the extent.
+ */
+ whichfork = XFS_COW_FORK;
+ end_fsb = imap.br_startoff + imap.br_blockcount;
+ } else {
+ /*
+ * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
+ * pages to keep the chunks of work done where somewhat
+ * symmetric with the work writeback does. This is a completely
+ * arbitrary number pulled out of thin air.
+ *
+ * Note that the values needs to be less than 32-bits wide until
+ * the lower level functions are updated.
+ */
+ count = min_t(loff_t, count, 1024 * PAGE_SIZE);
+ end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
+
+ if (xfs_is_always_cow_inode(ip))
+ whichfork = XFS_COW_FORK;
}
error = xfs_qm_dqattach_locked(ip, false);
if (error)
goto out_unlock;
- /*
- * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
- * to keep the chunks of work done where somewhat symmetric with the
- * work writeback does. This is a completely arbitrary number pulled
- * out of thin air as a best guess for initial testing.
- *
- * Note that the values needs to be less than 32-bits wide until
- * the lower level functions are updated.
- */
- count = min_t(loff_t, count, 1024 * PAGE_SIZE);
- end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
-
if (eof) {
- prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
- &icur);
+ prealloc_blocks = xfs_iomap_prealloc_size(ip, whichfork, offset,
+ count, &icur);
if (prealloc_blocks) {
xfs_extlen_t align;
xfs_off_t end_offset;
@@ -623,9 +685,11 @@ xfs_file_iomap_begin_delay(
}
retry:
- error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb,
- end_fsb - offset_fsb, prealloc_blocks, &got, &icur,
- eof);
+ error = xfs_bmapi_reserve_delalloc(ip, whichfork, offset_fsb,
+ end_fsb - offset_fsb, prealloc_blocks,
+ whichfork == XFS_DATA_FORK ? &imap : &cmap,
+ whichfork == XFS_DATA_FORK ? &icur : &ccur,
+ whichfork == XFS_DATA_FORK ? eof : cow_eof);
switch (error) {
case 0:
break;
@@ -647,186 +711,22 @@ retry:
* them out if the write happens to fail.
*/
iomap->flags |= IOMAP_F_NEW;
- trace_xfs_iomap_alloc(ip, offset, count, 0, &got);
+ trace_xfs_iomap_alloc(ip, offset, count, whichfork,
+ whichfork == XFS_DATA_FORK ? &imap : &cmap);
done:
- if (isnullstartblock(got.br_startblock))
- got.br_startblock = DELAYSTARTBLOCK;
-
- if (!got.br_startblock) {
- error = xfs_alert_fsblock_zero(ip, &got);
- if (error)
+ if (whichfork == XFS_COW_FORK) {
+ if (imap.br_startoff > offset_fsb) {
+ xfs_trim_extent(&cmap, offset_fsb,
+ imap.br_startoff - offset_fsb);
+ error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
goto out_unlock;
- }
-
- xfs_bmbt_to_iomap(ip, iomap, &got);
-
-out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return error;
-}
-
-/*
- * Pass in a delayed allocate extent, convert it to real extents;
- * return to the caller the extent we create which maps on top of
- * the originating callers request.
- *
- * Called without a lock on the inode.
- *
- * We no longer bother to look at the incoming map - all we have to
- * guarantee is that whatever we allocate fills the required range.
- */
-int
-xfs_iomap_write_allocate(
- xfs_inode_t *ip,
- int whichfork,
- xfs_off_t offset,
- xfs_bmbt_irec_t *imap,
- unsigned int *cow_seq)
-{
- xfs_mount_t *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
- xfs_fileoff_t offset_fsb, last_block;
- xfs_fileoff_t end_fsb, map_start_fsb;
- xfs_filblks_t count_fsb;
- xfs_trans_t *tp;
- int nimaps;
- int error = 0;
- int flags = XFS_BMAPI_DELALLOC;
- int nres;
-
- if (whichfork == XFS_COW_FORK)
- flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
-
- /*
- * Make sure that the dquots are there.
- */
- error = xfs_qm_dqattach(ip);
- if (error)
- return error;
-
- offset_fsb = XFS_B_TO_FSBT(mp, offset);
- count_fsb = imap->br_blockcount;
- map_start_fsb = imap->br_startoff;
-
- XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
-
- while (count_fsb != 0) {
- /*
- * Set up a transaction with which to allocate the
- * backing store for the file. Do allocations in a
- * loop until we get some space in the range we are
- * interested in. The other space that might be allocated
- * is in the delayed allocation extent on which we sit
- * but before our buffer starts.
- */
- nimaps = 0;
- while (nimaps == 0) {
- nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
- /*
- * We have already reserved space for the extent and any
- * indirect blocks when creating the delalloc extent,
- * there is no need to reserve space in this transaction
- * again.
- */
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0,
- 0, XFS_TRANS_RESERVE, &tp);
- if (error)
- return error;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
-
- /*
- * it is possible that the extents have changed since
- * we did the read call as we dropped the ilock for a
- * while. We have to be careful about truncates or hole
- * punchs here - we are not allowed to allocate
- * non-delalloc blocks here.
- *
- * The only protection against truncation is the pages
- * for the range we are being asked to convert are
- * locked and hence a truncate will block on them
- * first.
- *
- * As a result, if we go beyond the range we really
- * need and hit an delalloc extent boundary followed by
- * a hole while we have excess blocks in the map, we
- * will fill the hole incorrectly and overrun the
- * transaction reservation.
- *
- * Using a single map prevents this as we are forced to
- * check each map we look for overlap with the desired
- * range and abort as soon as we find it. Also, given
- * that we only return a single map, having one beyond
- * what we can return is probably a bit silly.
- *
- * We also need to check that we don't go beyond EOF;
- * this is a truncate optimisation as a truncate sets
- * the new file size before block on the pages we
- * currently have locked under writeback. Because they
- * are about to be tossed, we don't need to write them
- * back....
- */
- nimaps = 1;
- end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
- error = xfs_bmap_last_offset(ip, &last_block,
- XFS_DATA_FORK);
- if (error)
- goto trans_cancel;
-
- last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
- if ((map_start_fsb + count_fsb) > last_block) {
- count_fsb = last_block - map_start_fsb;
- if (count_fsb == 0) {
- error = -EAGAIN;
- goto trans_cancel;
- }
- }
-
- /*
- * From this point onwards we overwrite the imap
- * pointer that the caller gave to us.
- */
- error = xfs_bmapi_write(tp, ip, map_start_fsb,
- count_fsb, flags, nres, imap,
- &nimaps);
- if (error)
- goto trans_cancel;
-
- error = xfs_trans_commit(tp);
- if (error)
- goto error0;
-
- if (whichfork == XFS_COW_FORK)
- *cow_seq = READ_ONCE(ifp->if_seq);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
-
- /*
- * See if we were able to allocate an extent that
- * covers at least part of the callers request
- */
- if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
- return xfs_alert_fsblock_zero(ip, imap);
-
- if ((offset_fsb >= imap->br_startoff) &&
- (offset_fsb < (imap->br_startoff +
- imap->br_blockcount))) {
- XFS_STATS_INC(mp, xs_xstrat_quick);
- return 0;
}
-
- /*
- * So far we have not mapped the requested part of the
- * file, just surrounding data, try again.
- */
- count_fsb -= imap->br_blockcount;
- map_start_fsb = imap->br_startoff + imap->br_blockcount;
+ /* ensure we only report blocks we have a reservation for */
+ xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
+ shared = true;
}
-
-trans_cancel:
- xfs_trans_cancel(tp);
-error0:
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
+out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -975,7 +875,7 @@ xfs_ilock_for_iomap(
* COW writes may allocate delalloc space or convert unwritten COW
* extents, so we need to make sure to take the lock exclusively here.
*/
- if (xfs_is_reflink_inode(ip) && is_write) {
+ if (xfs_is_cow_inode(ip) && is_write) {
/*
* FIXME: It could still overwrite on unshared extents and not
* need allocation.
@@ -1009,7 +909,7 @@ relock:
* check, so if we got ILOCK_SHARED for a write and but we're now a
* reflink inode we have to switch to ILOCK_EXCL and relock.
*/
- if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_reflink_inode(ip)) {
+ if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) {
xfs_iunlock(ip, mode);
mode = XFS_ILOCK_EXCL;
goto relock;
@@ -1081,23 +981,33 @@ xfs_file_iomap_begin(
* Break shared extents if necessary. Checks for non-blocking IO have
* been done up front, so we don't need to do them here.
*/
- if (xfs_is_reflink_inode(ip)) {
+ if (xfs_is_cow_inode(ip)) {
+ struct xfs_bmbt_irec cmap;
+ bool directio = (flags & IOMAP_DIRECT);
+
/* if zeroing doesn't need COW allocation, then we are done. */
if ((flags & IOMAP_ZERO) &&
!needs_cow_for_zeroing(&imap, nimaps))
goto out_found;
- if (flags & IOMAP_DIRECT) {
- /* may drop and re-acquire the ilock */
- error = xfs_reflink_allocate_cow(ip, &imap, &shared,
- &lockmode);
- if (error)
- goto out_unlock;
- } else {
- error = xfs_reflink_reserve_cow(ip, &imap);
- if (error)
- goto out_unlock;
- }
+ /* may drop and re-acquire the ilock */
+ cmap = imap;
+ error = xfs_reflink_allocate_cow(ip, &cmap, &shared, &lockmode,
+ directio);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * For buffered writes we need to report the address of the
+ * previous block (if there was any) so that the higher level
+ * write code can perform read-modify-write operations; we
+ * won't need the CoW fork mapping until writeback. For direct
+ * I/O, which must be block aligned, we need to report the
+ * newly allocated address. If the data fork has a hole, copy
+ * the COW fork mapping to avoid allocating to the data fork.
+ */
+ if (directio || imap.br_startblock == HOLESTARTBLOCK)
+ imap = cmap;
end_fsb = imap.br_startoff + imap.br_blockcount;
length = XFS_FSB_TO_B(mp, end_fsb) - offset;
@@ -1139,23 +1049,15 @@ xfs_file_iomap_begin(
return error;
iomap->flags |= IOMAP_F_NEW;
- trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
+ trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
out_finish:
- if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields
- & ~XFS_ILOG_TIMESTAMP))
- iomap->flags |= IOMAP_F_DIRTY;
-
- xfs_bmbt_to_iomap(ip, iomap, &imap);
-
- if (shared)
- iomap->flags |= IOMAP_F_SHARED;
- return 0;
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
out_found:
ASSERT(nimaps);
xfs_iunlock(ip, lockmode);
- trace_xfs_iomap_found(ip, offset, length, 0, &imap);
+ trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
goto out_finish;
out_unlock:
@@ -1241,6 +1143,92 @@ const struct iomap_ops xfs_iomap_ops = {
};
static int
+xfs_seek_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ unsigned flags,
+ struct iomap *iomap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length);
+ xfs_fileoff_t cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec imap, cmap;
+ int error = 0;
+ unsigned lockmode;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ lockmode = xfs_ilock_data_map_shared(ip);
+ if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
+ }
+
+ if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) {
+ /*
+ * If we found a data extent we are done.
+ */
+ if (imap.br_startoff <= offset_fsb)
+ goto done;
+ data_fsb = imap.br_startoff;
+ } else {
+ /*
+ * Fake a hole until the end of the file.
+ */
+ data_fsb = min(XFS_B_TO_FSB(mp, offset + length),
+ XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
+ }
+
+ /*
+ * If a COW fork extent covers the hole, report it - capped to the next
+ * data fork extent:
+ */
+ if (xfs_inode_has_cow_data(ip) &&
+ xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
+ cow_fsb = cmap.br_startoff;
+ if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
+ if (data_fsb < cow_fsb + cmap.br_blockcount)
+ end_fsb = min(end_fsb, data_fsb);
+ xfs_trim_extent(&cmap, offset_fsb, end_fsb);
+ error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
+ /*
+ * This is a COW extent, so we must probe the page cache
+ * because there could be dirty page cache being backed
+ * by this extent.
+ */
+ iomap->type = IOMAP_UNWRITTEN;
+ goto out_unlock;
+ }
+
+ /*
+ * Else report a hole, capped to the next found data or COW extent.
+ */
+ if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb)
+ imap.br_blockcount = cow_fsb - offset_fsb;
+ else
+ imap.br_blockcount = data_fsb - offset_fsb;
+ imap.br_startoff = offset_fsb;
+ imap.br_startblock = HOLESTARTBLOCK;
+ imap.br_state = XFS_EXT_NORM;
+done:
+ xfs_trim_extent(&imap, offset_fsb, end_fsb);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
+out_unlock:
+ xfs_iunlock(ip, lockmode);
+ return error;
+}
+
+const struct iomap_ops xfs_seek_iomap_ops = {
+ .iomap_begin = xfs_seek_iomap_begin,
+};
+
+static int
xfs_xattr_iomap_begin(
struct inode *inode,
loff_t offset,
@@ -1273,12 +1261,10 @@ xfs_xattr_iomap_begin(
out_unlock:
xfs_iunlock(ip, lockmode);
- if (!error) {
- ASSERT(nimaps);
- xfs_bmbt_to_iomap(ip, iomap, &imap);
- }
-
- return error;
+ if (error)
+ return error;
+ ASSERT(nimaps);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, false);
}
const struct iomap_ops xfs_xattr_iomap_ops = {
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index c6170548831b..5c2f6aa6d78f 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -13,12 +13,10 @@ struct xfs_bmbt_irec;
int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
struct xfs_bmbt_irec *, int);
-int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t,
- struct xfs_bmbt_irec *, unsigned int *);
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
-void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
- struct xfs_bmbt_irec *);
+int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
+ struct xfs_bmbt_irec *, bool shared);
xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize);
static inline xfs_filblks_t
@@ -42,6 +40,7 @@ xfs_aligned_fsb_count(
}
extern const struct iomap_ops xfs_iomap_ops;
+extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;
#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index f48ffd7a8d3e..74047bd0c1ae 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -191,9 +191,18 @@ xfs_generic_create(
xfs_setup_iops(ip);
- if (tmpfile)
+ if (tmpfile) {
+ /*
+ * The VFS requires that any inode fed to d_tmpfile must have
+ * nlink == 1 so that it can decrement the nlink in d_tmpfile.
+ * However, we created the temp file with nlink == 0 because
+ * we're not allowed to put an inode with nlink > 0 on the
+ * unlinked list. Therefore we have to set nlink to 1 so that
+ * d_tmpfile can immediately set it back to zero.
+ */
+ set_nlink(inode, 1);
d_tmpfile(dentry, inode);
- else
+ } else
d_instantiate(dentry, inode);
xfs_finish_inode_setup(ip);
@@ -522,6 +531,10 @@ xfs_vn_getattr(
}
}
+ /*
+ * Note: If you add another clause to set an attribute flag, please
+ * update attributes_mask below.
+ */
if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
stat->attributes |= STATX_ATTR_IMMUTABLE;
if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
@@ -529,6 +542,10 @@ xfs_vn_getattr(
if (ip->i_d.di_flags & XFS_DIFLAG_NODUMP)
stat->attributes |= STATX_ATTR_NODUMP;
+ stat->attributes_mask |= (STATX_ATTR_IMMUTABLE |
+ STATX_ATTR_APPEND |
+ STATX_ATTR_NODUMP);
+
switch (inode->i_mode & S_IFMT) {
case S_IFBLK:
case S_IFCHR:
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 9fe88d125f0a..3371d1ff27c4 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2439,17 +2439,21 @@ xlog_recover_validate_buf_type(
case XFS_BLFT_BTREE_BUF:
switch (magic32) {
case XFS_ABTB_CRC_MAGIC:
- case XFS_ABTC_CRC_MAGIC:
case XFS_ABTB_MAGIC:
+ bp->b_ops = &xfs_bnobt_buf_ops;
+ break;
+ case XFS_ABTC_CRC_MAGIC:
case XFS_ABTC_MAGIC:
- bp->b_ops = &xfs_allocbt_buf_ops;
+ bp->b_ops = &xfs_cntbt_buf_ops;
break;
case XFS_IBT_CRC_MAGIC:
- case XFS_FIBT_CRC_MAGIC:
case XFS_IBT_MAGIC:
- case XFS_FIBT_MAGIC:
bp->b_ops = &xfs_inobt_buf_ops;
break;
+ case XFS_FIBT_CRC_MAGIC:
+ case XFS_FIBT_MAGIC:
+ bp->b_ops = &xfs_finobt_buf_ops;
+ break;
case XFS_BMAP_CRC_MAGIC:
case XFS_BMAP_MAGIC:
bp->b_ops = &xfs_bmbt_buf_ops;
@@ -3045,7 +3049,7 @@ xlog_recover_inode_pass2(
* Make sure the place we're flushing out to really looks
* like an inode!
*/
- if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
+ if (unlikely(!xfs_verify_magic16(bp, dip->di_magic))) {
xfs_alert(mp,
"%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld",
__func__, dip, bp, in_f->ilf_ino);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b4d8c318be3c..fd63b0b1307c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -149,6 +149,7 @@ xfs_free_perag(
spin_unlock(&mp->m_perag_lock);
ASSERT(pag);
ASSERT(atomic_read(&pag->pag_ref) == 0);
+ xfs_iunlink_destroy(pag);
xfs_buf_hash_destroy(pag);
mutex_destroy(&pag->pag_ici_reclaim_lock);
call_rcu(&pag->rcu_head, __xfs_free_perag);
@@ -227,6 +228,9 @@ xfs_initialize_perag(
/* first new pag is fully initialized */
if (first_initialised == NULLAGNUMBER)
first_initialised = index;
+ error = xfs_iunlink_init(pag);
+ if (error)
+ goto out_hash_destroy;
}
index = xfs_set_inode_alloc(mp, agcount);
@@ -249,6 +253,7 @@ out_unwind_new_pags:
if (!pag)
break;
xfs_buf_hash_destroy(pag);
+ xfs_iunlink_destroy(pag);
mutex_destroy(&pag->pag_ici_reclaim_lock);
kmem_free(pag);
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7daafe064af8..110f927cf943 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -138,7 +138,7 @@ typedef struct xfs_mount {
struct mutex m_growlock; /* growfs mutex */
int m_fixedfsid[2]; /* unchanged for life of FS */
uint64_t m_flags; /* global mount flags */
- bool m_inotbt_nores; /* no per-AG finobt resv. */
+ bool m_finobt_nores; /* no per-AG finobt resv. */
int m_ialloc_inos; /* inodes in inode allocation */
int m_ialloc_blks; /* blocks in inode allocation */
int m_ialloc_min_blks;/* min blocks in sparse inode
@@ -194,6 +194,7 @@ typedef struct xfs_mount {
*/
uint32_t m_generation;
+ bool m_always_cow;
bool m_fail_unmount;
#ifdef DEBUG
/*
@@ -396,6 +397,13 @@ typedef struct xfs_perag {
/* reference count */
uint8_t pagf_refcount_level;
+
+ /*
+ * Unlinked inode information. This incore information reflects
+ * data stored in the AGI, so callers must hold the AGI buffer lock
+ * or have some other means to control concurrency.
+ */
+ struct rhashtable pagi_unlinked_hash;
} xfs_perag_t;
static inline struct xfs_ag_resv *
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index d3e04d20d8d4..c8ba98fae30a 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -125,6 +125,27 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20);
XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16);
+
+ /*
+ * The v5 superblock format extended several v4 header structures with
+ * additional data. While new fields are only accessible on v5
+ * superblocks, it's important that the v5 structures place original v4
+ * fields/headers in the correct location on-disk. For example, we must
+ * be able to find magic values at the same location in certain blocks
+ * regardless of superblock version.
+ *
+ * The following checks ensure that various v5 data structures place the
+ * subset of v4 metadata associated with the same type of block at the
+ * start of the on-disk block. If there is no data structure definition
+ * for certain types of v4 blocks, traverse down to the first field of
+ * common metadata (e.g., magic value) and make sure it is at offset
+ * zero.
+ */
+ XFS_CHECK_OFFSET(struct xfs_dir3_leaf, hdr.info.hdr, 0);
+ XFS_CHECK_OFFSET(struct xfs_da3_intnode, hdr.info.hdr, 0);
+ XFS_CHECK_OFFSET(struct xfs_dir3_data_hdr, hdr.magic, 0);
+ XFS_CHECK_OFFSET(struct xfs_dir3_free, hdr.hdr.magic, 0);
+ XFS_CHECK_OFFSET(struct xfs_attr3_leafblock, hdr.info.hdr, 0);
}
#endif /* __XFS_ONDISK_H */
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index f44c3599527d..bde2c9f56a46 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -185,7 +185,7 @@ xfs_fs_map_blocks(
}
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- xfs_bmbt_to_iomap(ip, iomap, &imap);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
*device_generation = mp->m_generation;
return error;
out_unlock:
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index c5b4fa004ca4..680ae7662a78 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -192,7 +192,7 @@ xfs_reflink_trim_around_shared(
int error = 0;
/* Holes, unwritten, and delalloc extents cannot be shared */
- if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
+ if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
*shared = false;
return 0;
}
@@ -234,93 +234,59 @@ xfs_reflink_trim_around_shared(
}
}
-/*
- * Trim the passed in imap to the next shared/unshared extent boundary, and
- * if imap->br_startoff points to a shared extent reserve space for it in the
- * COW fork.
- *
- * Note that imap will always contain the block numbers for the existing blocks
- * in the data fork, as the upper layers need them for read-modify-write
- * operations.
- */
-int
-xfs_reflink_reserve_cow(
+bool
+xfs_inode_need_cow(
struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap)
+ struct xfs_bmbt_irec *imap,
+ bool *shared)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
- struct xfs_bmbt_irec got;
- int error = 0;
- bool eof = false;
- struct xfs_iext_cursor icur;
- bool shared;
-
- /*
- * Search the COW fork extent list first. This serves two purposes:
- * first this implement the speculative preallocation using cowextisze,
- * so that we also unshared block adjacent to shared blocks instead
- * of just the shared blocks themselves. Second the lookup in the
- * extent list is generally faster than going out to the shared extent
- * tree.
- */
-
- if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
- eof = true;
- if (!eof && got.br_startoff <= imap->br_startoff) {
- trace_xfs_reflink_cow_found(ip, imap);
- xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
+ /* We can't update any real extents in always COW mode. */
+ if (xfs_is_always_cow_inode(ip) &&
+ !isnullstartblock(imap->br_startblock)) {
+ *shared = true;
return 0;
}
/* Trim the mapping to the nearest shared extent boundary. */
- error = xfs_reflink_trim_around_shared(ip, imap, &shared);
- if (error)
- return error;
-
- /* Not shared? Just report the (potentially capped) extent. */
- if (!shared)
- return 0;
-
- /*
- * Fork all the shared blocks from our write offset until the end of
- * the extent.
- */
- error = xfs_qm_dqattach_locked(ip, false);
- if (error)
- return error;
-
- error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
- imap->br_blockcount, 0, &got, &icur, eof);
- if (error == -ENOSPC || error == -EDQUOT)
- trace_xfs_reflink_cow_enospc(ip, imap);
- if (error)
- return error;
-
- xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
- trace_xfs_reflink_cow_alloc(ip, &got);
- return 0;
+ return xfs_reflink_trim_around_shared(ip, imap, shared);
}
-/* Convert part of an unwritten CoW extent to a real one. */
-STATIC int
-xfs_reflink_convert_cow_extent(
- struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap,
- xfs_fileoff_t offset_fsb,
- xfs_filblks_t count_fsb)
+static int
+xfs_reflink_convert_cow_locked(
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb,
+ xfs_filblks_t count_fsb)
{
- int nimaps = 1;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got;
+ struct xfs_btree_cur *dummy_cur = NULL;
+ int dummy_logflags;
+ int error = 0;
- if (imap->br_state == XFS_EXT_NORM)
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
return 0;
- xfs_trim_extent(imap, offset_fsb, count_fsb);
- trace_xfs_reflink_convert_cow(ip, imap);
- if (imap->br_blockcount == 0)
- return 0;
- return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount,
- XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, 0, imap,
- &nimaps);
+ do {
+ if (got.br_startoff >= offset_fsb + count_fsb)
+ break;
+ if (got.br_state == XFS_EXT_NORM)
+ continue;
+ if (WARN_ON_ONCE(isnullstartblock(got.br_startblock)))
+ return -EIO;
+
+ xfs_trim_extent(&got, offset_fsb, count_fsb);
+ if (!got.br_blockcount)
+ continue;
+
+ got.br_state = XFS_EXT_NORM;
+ error = xfs_bmap_add_extent_unwritten_real(NULL, ip,
+ XFS_COW_FORK, &icur, &dummy_cur, &got,
+ &dummy_logflags);
+ if (error)
+ return error;
+ } while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got));
+
+ return error;
}
/* Convert all of the unwritten CoW extents in a file's range to real ones. */
@@ -334,15 +300,12 @@ xfs_reflink_convert_cow(
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
xfs_filblks_t count_fsb = end_fsb - offset_fsb;
- struct xfs_bmbt_irec imap;
- int nimaps = 1, error = 0;
+ int error;
ASSERT(count != 0);
xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb,
- XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT |
- XFS_BMAPI_CONVERT_ONLY, 0, &imap, &nimaps);
+ error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -375,7 +338,7 @@ xfs_find_trim_cow_extent(
if (got.br_startoff > offset_fsb) {
xfs_trim_extent(imap, imap->br_startoff,
got.br_startoff - imap->br_startoff);
- return xfs_reflink_trim_around_shared(ip, imap, shared);
+ return xfs_inode_need_cow(ip, imap, shared);
}
*shared = true;
@@ -397,7 +360,8 @@ xfs_reflink_allocate_cow(
struct xfs_inode *ip,
struct xfs_bmbt_irec *imap,
bool *shared,
- uint *lockmode)
+ uint *lockmode,
+ bool convert_now)
{
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = imap->br_startoff;
@@ -409,7 +373,10 @@ xfs_reflink_allocate_cow(
xfs_extlen_t resblks = 0;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(xfs_is_reflink_inode(ip));
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
if (error || !*shared)
@@ -471,7 +438,16 @@ xfs_reflink_allocate_cow(
if (nimaps == 0)
return -ENOSPC;
convert:
- return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
+ xfs_trim_extent(imap, offset_fsb, count_fsb);
+ /*
+ * COW fork extents are supposed to remain unwritten until we're ready
+ * to initiate a disk write. For direct I/O we are going to write the
+ * data and need the conversion, but for buffered writes we're done.
+ */
+ if (!convert_now || imap->br_state == XFS_EXT_NORM)
+ return 0;
+ trace_xfs_reflink_convert_cow(ip, imap);
+ return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
out_unreserve:
xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
@@ -586,7 +562,7 @@ xfs_reflink_cancel_cow_range(
int error;
trace_xfs_reflink_cancel_cow_range(ip, offset, count);
- ASSERT(xfs_is_reflink_inode(ip));
+ ASSERT(ip->i_cowfp);
offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
if (count == NULLFILEOFF)
@@ -1192,7 +1168,7 @@ xfs_reflink_remap_blocks(
break;
ASSERT(nimaps == 1);
- trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE,
+ trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK,
&imap);
/* Translate imap into the destination file. */
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 6d73daef1f13..28a43b7f581d 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -6,16 +6,28 @@
#ifndef __XFS_REFLINK_H
#define __XFS_REFLINK_H 1
+static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
+{
+ return ip->i_mount->m_always_cow &&
+ xfs_sb_version_hasreflink(&ip->i_mount->m_sb);
+}
+
+static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
+{
+ return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
+}
+
extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen,
xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
struct xfs_bmbt_irec *irec, bool *shared);
+bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
+ bool *shared);
-extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap);
extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
+ struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
+ bool convert_now);
extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index c9097cb0b955..f093ea244849 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1594,6 +1594,13 @@ xfs_mount_alloc(
INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
mp->m_kobj.kobject.kset = xfs_kset;
+ /*
+ * We don't create the finobt per-ag space reservation until after log
+ * recovery, so we must set this to true so that an ifree transaction
+ * started during log recovery will not depend on space reservations
+ * for finobt expansion.
+ */
+ mp->m_finobt_nores = true;
return mp;
}
@@ -1729,11 +1736,18 @@ xfs_fs_fill_super(
}
}
- if (xfs_sb_version_hasreflink(&mp->m_sb) && mp->m_sb.sb_rblocks) {
- xfs_alert(mp,
+ if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ if (mp->m_sb.sb_rblocks) {
+ xfs_alert(mp,
"reflink not compatible with realtime device!");
- error = -EINVAL;
- goto out_filestream_unmount;
+ error = -EINVAL;
+ goto out_filestream_unmount;
+ }
+
+ if (xfs_globals.always_cow) {
+ xfs_info(mp, "using DEBUG-only always_cow mode.");
+ mp->m_always_cow = true;
+ }
}
if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) {
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 168488130a19..ad7f9be13087 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -85,6 +85,7 @@ struct xfs_globals {
int log_recovery_delay; /* log recovery delay (secs) */
int mount_delay; /* mount setup delay (secs) */
bool bug_on_assert; /* BUG() the kernel on assert failure */
+ bool always_cow; /* use COW fork for all overwrites */
};
extern struct xfs_globals xfs_globals;
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index cd6a994a7250..cabda13f3c64 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -183,10 +183,34 @@ mount_delay_show(
}
XFS_SYSFS_ATTR_RW(mount_delay);
+static ssize_t
+always_cow_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ ssize_t ret;
+
+ ret = kstrtobool(buf, &xfs_globals.always_cow);
+ if (ret < 0)
+ return ret;
+ return count;
+}
+
+static ssize_t
+always_cow_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow);
+}
+XFS_SYSFS_ATTR_RW(always_cow);
+
static struct attribute *xfs_dbg_attrs[] = {
ATTR_LIST(bug_on_assert),
ATTR_LIST(log_recovery_delay),
ATTR_LIST(mount_delay),
+ ATTR_LIST(always_cow),
NULL,
};
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 6fcc893dfc91..47fb07d86efd 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1218,23 +1218,17 @@ DEFINE_EVENT(xfs_readpage_class, name, \
DEFINE_READPAGE_EVENT(xfs_vm_readpage);
DEFINE_READPAGE_EVENT(xfs_vm_readpages);
-TRACE_DEFINE_ENUM(XFS_IO_HOLE);
-TRACE_DEFINE_ENUM(XFS_IO_DELALLOC);
-TRACE_DEFINE_ENUM(XFS_IO_UNWRITTEN);
-TRACE_DEFINE_ENUM(XFS_IO_OVERWRITE);
-TRACE_DEFINE_ENUM(XFS_IO_COW);
-
DECLARE_EVENT_CLASS(xfs_imap_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
- int type, struct xfs_bmbt_irec *irec),
- TP_ARGS(ip, offset, count, type, irec),
+ int whichfork, struct xfs_bmbt_irec *irec),
+ TP_ARGS(ip, offset, count, whichfork, irec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(loff_t, size)
__field(loff_t, offset)
__field(size_t, count)
- __field(int, type)
+ __field(int, whichfork)
__field(xfs_fileoff_t, startoff)
__field(xfs_fsblock_t, startblock)
__field(xfs_filblks_t, blockcount)
@@ -1245,33 +1239,33 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
__entry->size = ip->i_d.di_size;
__entry->offset = offset;
__entry->count = count;
- __entry->type = type;
+ __entry->whichfork = whichfork;
__entry->startoff = irec ? irec->br_startoff : 0;
__entry->startblock = irec ? irec->br_startblock : 0;
__entry->blockcount = irec ? irec->br_blockcount : 0;
),
TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd "
- "type %s startoff 0x%llx startblock %lld blockcount 0x%llx",
+ "fork %s startoff 0x%llx startblock %lld blockcount 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
__entry->offset,
__entry->count,
- __print_symbolic(__entry->type, XFS_IO_TYPES),
+ __entry->whichfork == XFS_COW_FORK ? "cow" : "data",
__entry->startoff,
(int64_t)__entry->startblock,
__entry->blockcount)
)
-#define DEFINE_IOMAP_EVENT(name) \
+#define DEFINE_IMAP_EVENT(name) \
DEFINE_EVENT(xfs_imap_class, name, \
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
- int type, struct xfs_bmbt_irec *irec), \
- TP_ARGS(ip, offset, count, type, irec))
-DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
-DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
-DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
-DEFINE_IOMAP_EVENT(xfs_iomap_found);
+ int whichfork, struct xfs_bmbt_irec *irec), \
+ TP_ARGS(ip, offset, count, whichfork, irec))
+DEFINE_IMAP_EVENT(xfs_map_blocks_found);
+DEFINE_IMAP_EVENT(xfs_map_blocks_alloc);
+DEFINE_IMAP_EVENT(xfs_iomap_alloc);
+DEFINE_IMAP_EVENT(xfs_iomap_found);
DECLARE_EVENT_CLASS(xfs_simple_io_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -3078,7 +3072,7 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \
DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag);
DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag);
DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size);
-DEFINE_IOMAP_EVENT(xfs_reflink_remap_imap);
+DEFINE_IMAP_EVENT(xfs_reflink_remap_imap);
TRACE_EVENT(xfs_reflink_remap_blocks_loop,
TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset,
xfs_filblks_t len, struct xfs_inode *dest,
@@ -3202,13 +3196,10 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
/* copy on write */
DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
-DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
-DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
-
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
@@ -3371,6 +3362,84 @@ DEFINE_TRANS_EVENT(xfs_trans_roll);
DEFINE_TRANS_EVENT(xfs_trans_add_item);
DEFINE_TRANS_EVENT(xfs_trans_free_items);
+TRACE_EVENT(xfs_iunlink_update_bucket,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int bucket,
+ xfs_agino_t old_ptr, xfs_agino_t new_ptr),
+ TP_ARGS(mp, agno, bucket, old_ptr, new_ptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(unsigned int, bucket)
+ __field(xfs_agino_t, old_ptr)
+ __field(xfs_agino_t, new_ptr)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->bucket = bucket;
+ __entry->old_ptr = old_ptr;
+ __entry->new_ptr = new_ptr;
+ ),
+ TP_printk("dev %d:%d agno %u bucket %u old 0x%x new 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->bucket,
+ __entry->old_ptr,
+ __entry->new_ptr)
+);
+
+TRACE_EVENT(xfs_iunlink_update_dinode,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+ xfs_agino_t old_ptr, xfs_agino_t new_ptr),
+ TP_ARGS(mp, agno, agino, old_ptr, new_ptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(xfs_agino_t, old_ptr)
+ __field(xfs_agino_t, new_ptr)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agino = agino;
+ __entry->old_ptr = old_ptr;
+ __entry->new_ptr = new_ptr;
+ ),
+ TP_printk("dev %d:%d agno %u agino 0x%x old 0x%x new 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino,
+ __entry->old_ptr,
+ __entry->new_ptr)
+);
+
+DECLARE_EVENT_CLASS(xfs_ag_inode_class,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+ ),
+ TP_printk("dev %d:%d agno %u agino %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno, __entry->agino)
+)
+
+#define DEFINE_AGINODE_EVENT(name) \
+DEFINE_EVENT(xfs_ag_inode_class, name, \
+ TP_PROTO(struct xfs_inode *ip), \
+ TP_ARGS(ip))
+DEFINE_AGINODE_EVENT(xfs_iunlink);
+DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
+DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c
index 11cff449d055..e1c7d55b32c3 100644
--- a/fs/xfs/xfs_trans_bmap.c
+++ b/fs/xfs/xfs_trans_bmap.c
@@ -17,7 +17,6 @@
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_inode.h"
-#include "xfs_defer.h"
/*
* This routine is called to allocate a "bmap update done"
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 629f1479c9d2..7d65ebf1e847 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -277,7 +277,7 @@ xfs_trans_read_buf_map(
* release this buffer when it kills the tranaction.
*/
ASSERT(bp->b_ops != NULL);
- error = xfs_buf_ensure_ops(bp, ops);
+ error = xfs_buf_reverify(bp, ops);
if (error) {
xfs_buf_ioerror_alert(bp, __func__);
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 0710434eb240..8ee7a3f8bb20 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -18,7 +18,6 @@
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_trace.h"
-#include "xfs_defer.h"
/*
* This routine is called to allocate an "extent free done"
diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c
index 6c947ff4faf6..8d734728dd1b 100644
--- a/fs/xfs/xfs_trans_refcount.c
+++ b/fs/xfs/xfs_trans_refcount.c
@@ -16,7 +16,6 @@
#include "xfs_refcount_item.h"
#include "xfs_alloc.h"
#include "xfs_refcount.h"
-#include "xfs_defer.h"
/*
* This routine is called to allocate a "refcount update done"
diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c
index a42890931ecd..5c7936b1be13 100644
--- a/fs/xfs/xfs_trans_rmap.c
+++ b/fs/xfs/xfs_trans_rmap.c
@@ -16,7 +16,6 @@
#include "xfs_rmap_item.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
-#include "xfs_defer.h"
/* Set the map extent flags for this reverse mapping. */
static void
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 63ee1d5bf1d7..9a63016009a1 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -129,6 +129,9 @@ __xfs_xattr_put_listent(
char *offset;
int arraytop;
+ if (context->count < 0 || context->seen_enough)
+ return;
+
if (!context->alist)
goto compute_size;