aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/ceph/caps.c17
-rw-r--r--fs/ceph/mds_client.c25
-rw-r--r--fs/ceph/snap.c34
-rw-r--r--fs/ceph/super.h2
-rw-r--r--fs/cifs/cifsglob.h5
-rw-r--r--fs/cifs/dir.c2
-rw-r--r--fs/cifs/file.c35
-rw-r--r--fs/cifs/inode.c19
-rw-r--r--fs/cifs/misc.c50
-rw-r--r--fs/cifs/smb2pdu.c2
-rw-r--r--fs/io-wq.c26
-rw-r--r--fs/io_uring.c58
-rw-r--r--fs/namespace.c42
-rw-r--r--fs/notify/fanotify/fanotify_user.c17
-rw-r--r--fs/notify/inotify/inotify_user.c17
-rw-r--r--fs/overlayfs/export.c2
-rw-r--r--fs/overlayfs/file.c47
-rw-r--r--fs/overlayfs/readdir.c5
18 files changed, 280 insertions, 125 deletions
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 7bdefd0c789a..2a2900903f8c 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4150,11 +4150,19 @@ bad:
/*
* Delayed work handler to process end of delayed cap release LRU list.
+ *
+ * If new caps are added to the list while processing it, these won't get
+ * processed in this run. In this case, the ci->i_hold_caps_max will be
+ * returned so that the work can be scheduled accordingly.
*/
-void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
+unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
{
struct inode *inode;
struct ceph_inode_info *ci;
+ struct ceph_mount_options *opt = mdsc->fsc->mount_options;
+ unsigned long delay_max = opt->caps_wanted_delay_max * HZ;
+ unsigned long loop_start = jiffies;
+ unsigned long delay = 0;
dout("check_delayed_caps\n");
spin_lock(&mdsc->cap_delay_lock);
@@ -4162,6 +4170,11 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
ci = list_first_entry(&mdsc->cap_delay_list,
struct ceph_inode_info,
i_cap_delay_list);
+ if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) {
+ dout("%s caps added recently. Exiting loop", __func__);
+ delay = ci->i_hold_caps_max;
+ break;
+ }
if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
time_before(jiffies, ci->i_hold_caps_max))
break;
@@ -4177,6 +4190,8 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
}
}
spin_unlock(&mdsc->cap_delay_lock);
+
+ return delay;
}
/*
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 9db1b39df773..afdc20213876 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4490,22 +4490,29 @@ void inc_session_sequence(struct ceph_mds_session *s)
}
/*
- * delayed work -- periodically trim expired leases, renew caps with mds
+ * delayed work -- periodically trim expired leases, renew caps with mds. If
+ * the @delay parameter is set to 0 or if it's more than 5 secs, the default
+ * workqueue delay value of 5 secs will be used.
*/
-static void schedule_delayed(struct ceph_mds_client *mdsc)
+static void schedule_delayed(struct ceph_mds_client *mdsc, unsigned long delay)
{
- int delay = 5;
- unsigned hz = round_jiffies_relative(HZ * delay);
- schedule_delayed_work(&mdsc->delayed_work, hz);
+ unsigned long max_delay = HZ * 5;
+
+ /* 5 secs default delay */
+ if (!delay || (delay > max_delay))
+ delay = max_delay;
+ schedule_delayed_work(&mdsc->delayed_work,
+ round_jiffies_relative(delay));
}
static void delayed_work(struct work_struct *work)
{
- int i;
struct ceph_mds_client *mdsc =
container_of(work, struct ceph_mds_client, delayed_work.work);
+ unsigned long delay;
int renew_interval;
int renew_caps;
+ int i;
dout("mdsc delayed_work\n");
@@ -4545,7 +4552,7 @@ static void delayed_work(struct work_struct *work)
}
mutex_unlock(&mdsc->mutex);
- ceph_check_delayed_caps(mdsc);
+ delay = ceph_check_delayed_caps(mdsc);
ceph_queue_cap_reclaim_work(mdsc);
@@ -4553,7 +4560,7 @@ static void delayed_work(struct work_struct *work)
maybe_recover_session(mdsc);
- schedule_delayed(mdsc);
+ schedule_delayed(mdsc, delay);
}
int ceph_mdsc_init(struct ceph_fs_client *fsc)
@@ -5030,7 +5037,7 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
mdsc->mdsmap->m_epoch);
mutex_unlock(&mdsc->mutex);
- schedule_delayed(mdsc);
+ schedule_delayed(mdsc, 0);
return;
bad_unlock:
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 4ac0606dcbd4..4c6bd1042c94 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -67,19 +67,19 @@ void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
{
lockdep_assert_held(&mdsc->snap_rwsem);
- dout("get_realm %p %d -> %d\n", realm,
- atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
/*
- * since we _only_ increment realm refs or empty the empty
- * list with snap_rwsem held, adjusting the empty list here is
- * safe. we do need to protect against concurrent empty list
- * additions, however.
+ * The 0->1 and 1->0 transitions must take the snap_empty_lock
+ * atomically with the refcount change. Go ahead and bump the
+ * nref here, unless it's 0, in which case we take the spinlock
+ * and then do the increment and remove it from the list.
*/
- if (atomic_inc_return(&realm->nref) == 1) {
- spin_lock(&mdsc->snap_empty_lock);
+ if (atomic_inc_not_zero(&realm->nref))
+ return;
+
+ spin_lock(&mdsc->snap_empty_lock);
+ if (atomic_inc_return(&realm->nref) == 1)
list_del_init(&realm->empty_item);
- spin_unlock(&mdsc->snap_empty_lock);
- }
+ spin_unlock(&mdsc->snap_empty_lock);
}
static void __insert_snap_realm(struct rb_root *root,
@@ -208,28 +208,28 @@ static void __put_snap_realm(struct ceph_mds_client *mdsc,
{
lockdep_assert_held_write(&mdsc->snap_rwsem);
- dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
- atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
+ /*
+ * We do not require the snap_empty_lock here, as any caller that
+ * increments the value must hold the snap_rwsem.
+ */
if (atomic_dec_and_test(&realm->nref))
__destroy_snap_realm(mdsc, realm);
}
/*
- * caller needn't hold any locks
+ * See comments in ceph_get_snap_realm. Caller needn't hold any locks.
*/
void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm)
{
- dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
- atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
- if (!atomic_dec_and_test(&realm->nref))
+ if (!atomic_dec_and_lock(&realm->nref, &mdsc->snap_empty_lock))
return;
if (down_write_trylock(&mdsc->snap_rwsem)) {
+ spin_unlock(&mdsc->snap_empty_lock);
__destroy_snap_realm(mdsc, realm);
up_write(&mdsc->snap_rwsem);
} else {
- spin_lock(&mdsc->snap_empty_lock);
list_add(&realm->empty_item, &mdsc->snap_empty);
spin_unlock(&mdsc->snap_empty_lock);
}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 6b6332a5c113..9215a2f4535c 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1167,7 +1167,7 @@ extern void ceph_flush_snaps(struct ceph_inode_info *ci,
extern bool __ceph_should_report_size(struct ceph_inode_info *ci);
extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session);
-extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
+extern unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
extern int ceph_drop_caps_for_unlink(struct inode *inode);
extern int ceph_encode_inode_release(void **p, struct inode *inode,
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index c0bfc2f01030..c6a9542ca281 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1611,6 +1611,11 @@ struct dfs_info3_param {
int ttl;
};
+struct file_list {
+ struct list_head list;
+ struct cifsFileInfo *cfile;
+};
+
/*
* common struct for holding inode info when searching for or updating an
* inode with new info
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 79402ca0ddfa..5f8a302ffcb2 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -100,7 +100,7 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)
pplen = cifs_sb->prepath ? strlen(cifs_sb->prepath) + 1 : 0;
- s = dentry_path_raw(direntry, page, PAGE_SIZE);
+ s = dentry_path_raw(direntry, page, PATH_MAX);
if (IS_ERR(s))
return s;
if (!s[1]) // for root we want "", not "/"
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0a72840a88f1..bb98fbdd22a9 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -4848,34 +4848,33 @@ void cifs_oplock_break(struct work_struct *work)
oplock_break_ack:
/*
- * releasing stale oplock after recent reconnect of smb session using
- * a now incorrect file handle is not a data integrity issue but do
- * not bother sending an oplock release if session to server still is
- * disconnected since oplock already released by the server
- */
- if (!cfile->oplock_break_cancelled) {
- rc = tcon->ses->server->ops->oplock_response(tcon, &cfile->fid,
- cinode);
- cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
- }
- /*
* When oplock break is received and there are no active
* file handles but cached, then schedule deferred close immediately.
* So, new open will not use cached handle.
*/
spin_lock(&CIFS_I(inode)->deferred_lock);
is_deferred = cifs_is_deferred_close(cfile, &dclose);
+ spin_unlock(&CIFS_I(inode)->deferred_lock);
if (is_deferred &&
cfile->deferred_close_scheduled &&
delayed_work_pending(&cfile->deferred)) {
- /*
- * If there is no pending work, mod_delayed_work queues new work.
- * So, Increase the ref count to avoid use-after-free.
- */
- if (!mod_delayed_work(deferredclose_wq, &cfile->deferred, 0))
- cifsFileInfo_get(cfile);
+ if (cancel_delayed_work(&cfile->deferred)) {
+ _cifsFileInfo_put(cfile, false, false);
+ goto oplock_break_done;
+ }
}
- spin_unlock(&CIFS_I(inode)->deferred_lock);
+ /*
+ * releasing stale oplock after recent reconnect of smb session using
+ * a now incorrect file handle is not a data integrity issue but do
+ * not bother sending an oplock release if session to server still is
+ * disconnected since oplock already released by the server
+ */
+ if (!cfile->oplock_break_cancelled) {
+ rc = tcon->ses->server->ops->oplock_response(tcon, &cfile->fid,
+ cinode);
+ cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
+ }
+oplock_break_done:
_cifsFileInfo_put(cfile, false /* do not wait for ourself */, false);
cifs_done_oplock_break(cinode);
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index b96b253e7635..65f8a70cece3 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1625,7 +1625,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
goto unlink_out;
}
- cifs_close_all_deferred_files(tcon);
+ cifs_close_deferred_file(CIFS_I(inode));
if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP &
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
rc = CIFSPOSIXDelFile(xid, tcon, full_path,
@@ -2084,6 +2084,7 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir,
FILE_UNIX_BASIC_INFO *info_buf_target;
unsigned int xid;
int rc, tmprc;
+ int retry_count = 0;
if (flags & ~RENAME_NOREPLACE)
return -EINVAL;
@@ -2113,10 +2114,24 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir,
goto cifs_rename_exit;
}
- cifs_close_all_deferred_files(tcon);
+ cifs_close_deferred_file(CIFS_I(d_inode(source_dentry)));
+ if (d_inode(target_dentry) != NULL)
+ cifs_close_deferred_file(CIFS_I(d_inode(target_dentry)));
+
rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry,
to_name);
+ if (rc == -EACCES) {
+ while (retry_count < 3) {
+ cifs_close_all_deferred_files(tcon);
+ rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry,
+ to_name);
+ if (rc != -EACCES)
+ break;
+ retry_count++;
+ }
+ }
+
/*
* No-replace is the natural behavior for CIFS, so skip unlink hacks.
*/
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 844abeb2b48f..9469f1cf0b46 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -723,13 +723,31 @@ void
cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode)
{
struct cifsFileInfo *cfile = NULL;
- struct cifs_deferred_close *dclose;
+ struct file_list *tmp_list, *tmp_next_list;
+ struct list_head file_head;
+
+ if (cifs_inode == NULL)
+ return;
+ INIT_LIST_HEAD(&file_head);
+ spin_lock(&cifs_inode->open_file_lock);
list_for_each_entry(cfile, &cifs_inode->openFileList, flist) {
- spin_lock(&cifs_inode->deferred_lock);
- if (cifs_is_deferred_close(cfile, &dclose))
- mod_delayed_work(deferredclose_wq, &cfile->deferred, 0);
- spin_unlock(&cifs_inode->deferred_lock);
+ if (delayed_work_pending(&cfile->deferred)) {
+ if (cancel_delayed_work(&cfile->deferred)) {
+ tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
+ if (tmp_list == NULL)
+ continue;
+ tmp_list->cfile = cfile;
+ list_add_tail(&tmp_list->list, &file_head);
+ }
+ }
+ }
+ spin_unlock(&cifs_inode->open_file_lock);
+
+ list_for_each_entry_safe(tmp_list, tmp_next_list, &file_head, list) {
+ _cifsFileInfo_put(tmp_list->cfile, true, false);
+ list_del(&tmp_list->list);
+ kfree(tmp_list);
}
}
@@ -738,20 +756,30 @@ cifs_close_all_deferred_files(struct cifs_tcon *tcon)
{
struct cifsFileInfo *cfile;
struct list_head *tmp;
+ struct file_list *tmp_list, *tmp_next_list;
+ struct list_head file_head;
+ INIT_LIST_HEAD(&file_head);
spin_lock(&tcon->open_file_lock);
list_for_each(tmp, &tcon->openFileList) {
cfile = list_entry(tmp, struct cifsFileInfo, tlist);
if (delayed_work_pending(&cfile->deferred)) {
- /*
- * If there is no pending work, mod_delayed_work queues new work.
- * So, Increase the ref count to avoid use-after-free.
- */
- if (!mod_delayed_work(deferredclose_wq, &cfile->deferred, 0))
- cifsFileInfo_get(cfile);
+ if (cancel_delayed_work(&cfile->deferred)) {
+ tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
+ if (tmp_list == NULL)
+ continue;
+ tmp_list->cfile = cfile;
+ list_add_tail(&tmp_list->list, &file_head);
+ }
}
}
spin_unlock(&tcon->open_file_lock);
+
+ list_for_each_entry_safe(tmp_list, tmp_next_list, &file_head, list) {
+ _cifsFileInfo_put(tmp_list->cfile, true, false);
+ list_del(&tmp_list->list);
+ kfree(tmp_list);
+ }
}
/* parses DFS refferal V3 structure
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 781d14e5f2af..b6d2e3591927 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -2426,7 +2426,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)
memcpy(aclptr, &acl, sizeof(struct cifs_acl));
buf->ccontext.DataLength = cpu_to_le32(ptr - (__u8 *)&buf->sd);
- *len = ptr - (__u8 *)buf;
+ *len = roundup(ptr - (__u8 *)buf, 8);
return buf;
}
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 12fc19353bb0..7d2ed8c7dd31 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -129,7 +129,7 @@ struct io_cb_cancel_data {
bool cancel_all;
};
-static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index);
+static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bool first);
static void io_wqe_dec_running(struct io_worker *worker);
static bool io_worker_get(struct io_worker *worker)
@@ -248,18 +248,20 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
rcu_read_unlock();
if (!ret) {
- bool do_create = false;
+ bool do_create = false, first = false;
raw_spin_lock_irq(&wqe->lock);
if (acct->nr_workers < acct->max_workers) {
atomic_inc(&acct->nr_running);
atomic_inc(&wqe->wq->worker_refs);
+ if (!acct->nr_workers)
+ first = true;
acct->nr_workers++;
do_create = true;
}
raw_spin_unlock_irq(&wqe->lock);
if (do_create)
- create_io_worker(wqe->wq, wqe, acct->index);
+ create_io_worker(wqe->wq, wqe, acct->index, first);
}
}
@@ -282,16 +284,26 @@ static void create_worker_cb(struct callback_head *cb)
struct io_wq *wq;
struct io_wqe *wqe;
struct io_wqe_acct *acct;
+ bool do_create = false, first = false;
cwd = container_of(cb, struct create_worker_data, work);
wqe = cwd->wqe;
wq = wqe->wq;
acct = &wqe->acct[cwd->index];
raw_spin_lock_irq(&wqe->lock);
- if (acct->nr_workers < acct->max_workers)
+ if (acct->nr_workers < acct->max_workers) {
+ if (!acct->nr_workers)
+ first = true;
acct->nr_workers++;
+ do_create = true;
+ }
raw_spin_unlock_irq(&wqe->lock);
- create_io_worker(wq, cwd->wqe, cwd->index);
+ if (do_create) {
+ create_io_worker(wq, wqe, cwd->index, first);
+ } else {
+ atomic_dec(&acct->nr_running);
+ io_worker_ref_put(wq);
+ }
kfree(cwd);
}
@@ -629,7 +641,7 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
raw_spin_unlock_irq(&worker->wqe->lock);
}
-static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
+static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bool first)
{
struct io_wqe_acct *acct = &wqe->acct[index];
struct io_worker *worker;
@@ -670,7 +682,7 @@ fail:
worker->flags |= IO_WORKER_F_FREE;
if (index == IO_WQ_ACCT_BOUND)
worker->flags |= IO_WORKER_F_BOUND;
- if ((acct->nr_workers == 1) && (worker->flags & IO_WORKER_F_BOUND))
+ if (first && (worker->flags & IO_WORKER_F_BOUND))
worker->flags |= IO_WORKER_F_FIXED;
raw_spin_unlock_irq(&wqe->lock);
wake_up_new_task(tsk);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index bf548af0426c..04c6d059ea94 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -78,6 +78,7 @@
#include <linux/task_work.h>
#include <linux/pagemap.h>
#include <linux/io_uring.h>
+#include <linux/tracehook.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -1499,7 +1500,8 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
all_flushed = list_empty(&ctx->cq_overflow_list);
if (all_flushed) {
clear_bit(0, &ctx->check_cq_overflow);
- ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
+ WRITE_ONCE(ctx->rings->sq_flags,
+ ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
}
if (posted)
@@ -1578,7 +1580,9 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
}
if (list_empty(&ctx->cq_overflow_list)) {
set_bit(0, &ctx->check_cq_overflow);
- ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
+ WRITE_ONCE(ctx->rings->sq_flags,
+ ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
+
}
ocqe->cqe.user_data = user_data;
ocqe->cqe.res = res;
@@ -2222,9 +2226,9 @@ static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
static inline bool io_run_task_work(void)
{
- if (current->task_works) {
+ if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) {
__set_current_state(TASK_RUNNING);
- task_work_run();
+ tracehook_notify_signal();
return true;
}
@@ -6803,14 +6807,16 @@ static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
{
/* Tell userspace we may need a wakeup call */
spin_lock_irq(&ctx->completion_lock);
- ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
+ WRITE_ONCE(ctx->rings->sq_flags,
+ ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
spin_unlock_irq(&ctx->completion_lock);
}
static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
{
spin_lock_irq(&ctx->completion_lock);
- ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+ WRITE_ONCE(ctx->rings->sq_flags,
+ ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
spin_unlock_irq(&ctx->completion_lock);
}
@@ -7132,16 +7138,6 @@ static void **io_alloc_page_table(size_t size)
return table;
}
-static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx)
-{
- spin_lock_bh(&ctx->rsrc_ref_lock);
-}
-
-static inline void io_rsrc_ref_unlock(struct io_ring_ctx *ctx)
-{
- spin_unlock_bh(&ctx->rsrc_ref_lock);
-}
-
static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
{
percpu_ref_exit(&ref_node->refs);
@@ -7158,9 +7154,9 @@ static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
rsrc_node->rsrc_data = data_to_kill;
- io_rsrc_ref_lock(ctx);
+ spin_lock_irq(&ctx->rsrc_ref_lock);
list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
- io_rsrc_ref_unlock(ctx);
+ spin_unlock_irq(&ctx->rsrc_ref_lock);
atomic_inc(&data_to_kill->refs);
percpu_ref_kill(&rsrc_node->refs);
@@ -7199,17 +7195,19 @@ static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ct
/* kill initial ref, already quiesced if zero */
if (atomic_dec_and_test(&data->refs))
break;
+ mutex_unlock(&ctx->uring_lock);
flush_delayed_work(&ctx->rsrc_put_work);
ret = wait_for_completion_interruptible(&data->done);
- if (!ret)
+ if (!ret) {
+ mutex_lock(&ctx->uring_lock);
break;
+ }
atomic_inc(&data->refs);
/* wait for all works potentially completing data->done */
flush_delayed_work(&ctx->rsrc_put_work);
reinit_completion(&data->done);
- mutex_unlock(&ctx->uring_lock);
ret = io_run_task_work_sig();
mutex_lock(&ctx->uring_lock);
} while (ret >= 0);
@@ -7668,9 +7666,10 @@ static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
{
struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
struct io_ring_ctx *ctx = node->rsrc_data->ctx;
+ unsigned long flags;
bool first_add = false;
- io_rsrc_ref_lock(ctx);
+ spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
node->done = true;
while (!list_empty(&ctx->rsrc_ref_list)) {
@@ -7682,7 +7681,7 @@ static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
list_del(&node->node);
first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
}
- io_rsrc_ref_unlock(ctx);
+ spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
if (first_add)
mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ);
@@ -8653,13 +8652,10 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
mutex_unlock(&ctx->uring_lock);
}
-static bool io_wait_rsrc_data(struct io_rsrc_data *data)
+static void io_wait_rsrc_data(struct io_rsrc_data *data)
{
- if (!data)
- return false;
- if (!atomic_dec_and_test(&data->refs))
+ if (data && !atomic_dec_and_test(&data->refs))
wait_for_completion(&data->done);
- return true;
}
static void io_ring_ctx_free(struct io_ring_ctx *ctx)
@@ -8671,10 +8667,14 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
ctx->mm_account = NULL;
}
+ /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
+ io_wait_rsrc_data(ctx->buf_data);
+ io_wait_rsrc_data(ctx->file_data);
+
mutex_lock(&ctx->uring_lock);
- if (io_wait_rsrc_data(ctx->buf_data))
+ if (ctx->buf_data)
__io_sqe_buffers_unregister(ctx);
- if (io_wait_rsrc_data(ctx->file_data))
+ if (ctx->file_data)
__io_sqe_files_unregister(ctx);
if (ctx->rings)
__io_cqring_overflow_flush(ctx, true);
diff --git a/fs/namespace.c b/fs/namespace.c
index ab4174a3c802..f79d9471cb76 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1938,6 +1938,20 @@ void drop_collected_mounts(struct vfsmount *mnt)
namespace_unlock();
}
+static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
+{
+ struct mount *child;
+
+ list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
+ if (!is_subdir(child->mnt_mountpoint, dentry))
+ continue;
+
+ if (child->mnt.mnt_flags & MNT_LOCKED)
+ return true;
+ }
+ return false;
+}
+
/**
* clone_private_mount - create a private clone of a path
* @path: path to clone
@@ -1953,10 +1967,19 @@ struct vfsmount *clone_private_mount(const struct path *path)
struct mount *old_mnt = real_mount(path->mnt);
struct mount *new_mnt;
+ down_read(&namespace_sem);
if (IS_MNT_UNBINDABLE(old_mnt))
- return ERR_PTR(-EINVAL);
+ goto invalid;
+
+ if (!check_mnt(old_mnt))
+ goto invalid;
+
+ if (has_locked_children(old_mnt, path->dentry))
+ goto invalid;
new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
+ up_read(&namespace_sem);
+
if (IS_ERR(new_mnt))
return ERR_CAST(new_mnt);
@@ -1964,6 +1987,10 @@ struct vfsmount *clone_private_mount(const struct path *path)
new_mnt->mnt_ns = MNT_NS_INTERNAL;
return &new_mnt->mnt;
+
+invalid:
+ up_read(&namespace_sem);
+ return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL_GPL(clone_private_mount);
@@ -2315,19 +2342,6 @@ static int do_change_type(struct path *path, int ms_flags)
return err;
}
-static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
-{
- struct mount *child;
- list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
- if (!is_subdir(child->mnt_mountpoint, dentry))
- continue;
-
- if (child->mnt.mnt_flags & MNT_LOCKED)
- return true;
- }
- return false;
-}
-
static struct mount *__do_loopback(struct path *old_path, int recurse)
{
struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 64864fb40b40..28b67cb9458d 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -54,22 +54,27 @@ static int fanotify_max_queued_events __read_mostly;
#include <linux/sysctl.h>
+static long ft_zero = 0;
+static long ft_int_max = INT_MAX;
+
struct ctl_table fanotify_table[] = {
{
.procname = "max_user_groups",
.data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(long),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = &ft_zero,
+ .extra2 = &ft_int_max,
},
{
.procname = "max_user_marks",
.data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(long),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = &ft_zero,
+ .extra2 = &ft_int_max,
},
{
.procname = "max_queued_events",
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 98f61b31745a..62051247f6d2 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -55,22 +55,27 @@ struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
#include <linux/sysctl.h>
+static long it_zero = 0;
+static long it_int_max = INT_MAX;
+
struct ctl_table inotify_table[] = {
{
.procname = "max_user_instances",
.data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(long),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = &it_zero,
+ .extra2 = &it_int_max,
},
{
.procname = "max_user_watches",
.data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(long),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = &it_zero,
+ .extra2 = &it_int_max,
},
{
.procname = "max_queued_events",
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 41ebf52f1bbc..ebde05c9cf62 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -392,6 +392,7 @@ static struct dentry *ovl_lookup_real_one(struct dentry *connected,
*/
take_dentry_name_snapshot(&name, real);
this = lookup_one_len(name.name.name, connected, name.name.len);
+ release_dentry_name_snapshot(&name);
err = PTR_ERR(this);
if (IS_ERR(this)) {
goto fail;
@@ -406,7 +407,6 @@ static struct dentry *ovl_lookup_real_one(struct dentry *connected,
}
out:
- release_dentry_name_snapshot(&name);
dput(parent);
inode_unlock(dir);
return this;
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 4d53d3b7e5fe..d081faa55e83 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -392,6 +392,51 @@ out_unlock:
return ret;
}
+/*
+ * Calling iter_file_splice_write() directly from overlay's f_op may deadlock
+ * due to lock order inversion between pipe->mutex in iter_file_splice_write()
+ * and file_start_write(real.file) in ovl_write_iter().
+ *
+ * So do everything ovl_write_iter() does and call iter_file_splice_write() on
+ * the real file.
+ */
+static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
+ loff_t *ppos, size_t len, unsigned int flags)
+{
+ struct fd real;
+ const struct cred *old_cred;
+ struct inode *inode = file_inode(out);
+ struct inode *realinode = ovl_inode_real(inode);
+ ssize_t ret;
+
+ inode_lock(inode);
+ /* Update mode */
+ ovl_copyattr(realinode, inode);
+ ret = file_remove_privs(out);
+ if (ret)
+ goto out_unlock;
+
+ ret = ovl_real_fdget(out, &real);
+ if (ret)
+ goto out_unlock;
+
+ old_cred = ovl_override_creds(inode->i_sb);
+ file_start_write(real.file);
+
+ ret = iter_file_splice_write(pipe, real.file, ppos, len, flags);
+
+ file_end_write(real.file);
+ /* Update size */
+ ovl_copyattr(realinode, inode);
+ revert_creds(old_cred);
+ fdput(real);
+
+out_unlock:
+ inode_unlock(inode);
+
+ return ret;
+}
+
static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
struct fd real;
@@ -603,7 +648,7 @@ const struct file_operations ovl_file_operations = {
.fadvise = ovl_fadvise,
.flush = ovl_flush,
.splice_read = generic_file_splice_read,
- .splice_write = iter_file_splice_write,
+ .splice_write = ovl_splice_write,
.copy_file_range = ovl_copy_file_range,
.remap_file_range = ovl_remap_file_range,
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index e8ad2c2c77dd..150fdf3bc68d 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -481,6 +481,8 @@ static int ovl_cache_update_ino(struct path *path, struct ovl_cache_entry *p)
}
this = lookup_one_len(p->name, dir, p->len);
if (IS_ERR_OR_NULL(this) || !this->d_inode) {
+ /* Mark a stale entry */
+ p->is_whiteout = true;
if (IS_ERR(this)) {
err = PTR_ERR(this);
this = NULL;
@@ -776,6 +778,9 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx)
if (err)
goto out;
}
+ }
+ /* ovl_cache_update_ino() sets is_whiteout on stale entry */
+ if (!p->is_whiteout) {
if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
break;
}