aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/block_dev.c58
-rw-r--r--fs/btrfs/Kconfig1
-rw-r--r--fs/btrfs/disk-io.c1
-rw-r--r--fs/btrfs/inode.c24
-rw-r--r--fs/btrfs/locking.c9
-rw-r--r--fs/btrfs/ordered-data.c11
-rw-r--r--fs/btrfs/volumes.c10
-rw-r--r--fs/io_uring.c81
-rw-r--r--fs/namespace.c4
-rw-r--r--fs/open.c19
10 files changed, 179 insertions, 39 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4707dfff991b..c2a85b587922 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -345,15 +345,24 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
struct bio *bio;
bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
bool is_read = (iov_iter_rw(iter) == READ), is_sync;
+ bool nowait = (iocb->ki_flags & IOCB_NOWAIT) != 0;
loff_t pos = iocb->ki_pos;
blk_qc_t qc = BLK_QC_T_NONE;
- int ret = 0;
+ gfp_t gfp;
+ ssize_t ret;
if ((pos | iov_iter_alignment(iter)) &
(bdev_logical_block_size(bdev) - 1))
return -EINVAL;
- bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
+ if (nowait)
+ gfp = GFP_NOWAIT;
+ else
+ gfp = GFP_KERNEL;
+
+ bio = bio_alloc_bioset(gfp, nr_pages, &blkdev_dio_pool);
+ if (!bio)
+ return -EAGAIN;
dio = container_of(bio, struct blkdev_dio, bio);
dio->is_sync = is_sync = is_sync_kiocb(iocb);
@@ -375,7 +384,10 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
if (!is_poll)
blk_start_plug(&plug);
+ ret = 0;
for (;;) {
+ int err;
+
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = pos >> 9;
bio->bi_write_hint = iocb->ki_hint;
@@ -383,8 +395,10 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
bio->bi_end_io = blkdev_bio_end_io;
bio->bi_ioprio = iocb->ki_ioprio;
- ret = bio_iov_iter_get_pages(bio, iter);
- if (unlikely(ret)) {
+ err = bio_iov_iter_get_pages(bio, iter);
+ if (unlikely(err)) {
+ if (!ret)
+ ret = err;
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
break;
@@ -399,6 +413,14 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
task_io_account_write(bio->bi_iter.bi_size);
}
+ /*
+ * Tell underlying layer to not block for resource shortage.
+ * And if we would have blocked, return error inline instead
+ * of through the bio->bi_end_io() callback.
+ */
+ if (nowait)
+ bio->bi_opf |= (REQ_NOWAIT | REQ_NOWAIT_INLINE);
+
dio->size += bio->bi_iter.bi_size;
pos += bio->bi_iter.bi_size;
@@ -412,6 +434,11 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
}
qc = submit_bio(bio);
+ if (qc == BLK_QC_T_EAGAIN) {
+ if (!ret)
+ ret = -EAGAIN;
+ goto error;
+ }
if (polled)
WRITE_ONCE(iocb->ki_cookie, qc);
@@ -432,8 +459,20 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
atomic_inc(&dio->ref);
}
- submit_bio(bio);
- bio = bio_alloc(GFP_KERNEL, nr_pages);
+ qc = submit_bio(bio);
+ if (qc == BLK_QC_T_EAGAIN) {
+ if (!ret)
+ ret = -EAGAIN;
+ goto error;
+ }
+ ret += bio->bi_iter.bi_size;
+
+ bio = bio_alloc(gfp, nr_pages);
+ if (!bio) {
+ if (!ret)
+ ret = -EAGAIN;
+ goto error;
+ }
}
if (!is_poll)
@@ -453,13 +492,16 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
}
__set_current_state(TASK_RUNNING);
+out:
if (!ret)
ret = blk_status_to_errno(dio->bio.bi_status);
- if (likely(!ret))
- ret = dio->size;
bio_put(&dio->bio);
return ret;
+error:
+ if (!is_poll)
+ blk_finish_plug(&plug);
+ goto out;
}
static ssize_t
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 212b4a854f2c..38651fae7f21 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -4,6 +4,7 @@ config BTRFS_FS
tristate "Btrfs filesystem support"
select CRYPTO
select CRYPTO_CRC32C
+ select LIBCRC32C
select ZLIB_INFLATE
select ZLIB_DEFLATE
select LZO_COMPRESS
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 41a2bd2e0c56..5f7ee70b3d1a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4106,6 +4106,7 @@ void close_ctree(struct btrfs_fs_info *fs_info)
percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
cleanup_srcu_struct(&fs_info->subvol_srcu);
+ btrfs_free_csum_hash(fs_info);
btrfs_free_stripe_hash_table(fs_info);
btrfs_free_ref_cache(fs_info);
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1af069a9a0c7..ee582a36653d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -395,10 +395,31 @@ static noinline int add_async_extent(struct async_chunk *cow,
return 0;
}
+/*
+ * Check if the inode has flags compatible with compression
+ */
+static inline bool inode_can_compress(struct inode *inode)
+{
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW ||
+ BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+ return false;
+ return true;
+}
+
+/*
+ * Check if the inode needs to be submitted to compression, based on mount
+ * options, defragmentation, properties or heuristics.
+ */
static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ if (!inode_can_compress(inode)) {
+ WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
+ KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
+ btrfs_ino(BTRFS_I(inode)));
+ return 0;
+ }
/* force compress */
if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
return 1;
@@ -1631,7 +1652,8 @@ int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 0, nr_written);
- } else if (!inode_need_compress(inode, start, end)) {
+ } else if (!inode_can_compress(inode) ||
+ !inode_need_compress(inode, start, end)) {
ret = cow_file_range(inode, locked_page, start, end, end,
page_started, nr_written, 1, NULL);
} else {
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 98fccce4208c..393eceda57c8 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -346,9 +346,12 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
if (blockers) {
btrfs_assert_no_spinning_writers(eb);
eb->blocking_writers--;
- /* Use the lighter barrier after atomic */
- smp_mb__after_atomic();
- cond_wake_up_nomb(&eb->write_lock_wq);
+ /*
+ * We need to order modifying blocking_writers above with
+ * actually waking up the sleepers to ensure they see the
+ * updated value of blocking_writers
+ */
+ cond_wake_up(&eb->write_lock_wq);
} else {
btrfs_assert_spinning_writers_put(eb);
write_unlock(&eb->lock);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 1744ba8b2754..ae7f64a8facb 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -985,13 +985,14 @@ void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
struct extent_state **cached_state)
{
struct btrfs_ordered_extent *ordered;
- struct extent_state *cachedp = NULL;
+ struct extent_state *cache = NULL;
+ struct extent_state **cachedp = &cache;
if (cached_state)
- cachedp = *cached_state;
+ cachedp = cached_state;
while (1) {
- lock_extent_bits(tree, start, end, &cachedp);
+ lock_extent_bits(tree, start, end, cachedp);
ordered = btrfs_lookup_ordered_range(inode, start,
end - start + 1);
if (!ordered) {
@@ -1001,10 +1002,10 @@ void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
* aren't exposing it outside of this function
*/
if (!cached_state)
- refcount_dec(&cachedp->refs);
+ refcount_dec(&cache->refs);
break;
}
- unlock_extent_cached(tree, start, end, &cachedp);
+ unlock_extent_cached(tree, start, end, cachedp);
btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a13ddba1ebc3..d74b74ca07af 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5941,6 +5941,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 stripe_len;
u64 raid56_full_stripe_start = (u64)-1;
int data_stripes;
+ int ret = 0;
ASSERT(op != BTRFS_MAP_DISCARD);
@@ -5961,8 +5962,8 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
btrfs_crit(fs_info,
"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
stripe_offset, offset, em->start, logical, stripe_len);
- free_extent_map(em);
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
}
/* stripe_offset is the offset of this block in its stripe */
@@ -6009,7 +6010,10 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
io_geom->stripe_offset = stripe_offset;
io_geom->raid56_stripe_offset = raid56_full_stripe_start;
- return 0;
+out:
+ /* once for us */
+ free_extent_map(em);
+ return ret;
}
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
diff --git a/fs/io_uring.c b/fs/io_uring.c
index e2a66e12fbc6..012bc0efb9d3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -202,7 +202,7 @@ struct async_list {
struct file *file;
off_t io_end;
- size_t io_pages;
+ size_t io_len;
};
struct io_ring_ctx {
@@ -333,7 +333,8 @@ struct io_kiocb {
#define REQ_F_IO_DRAIN 16 /* drain existing IO first */
#define REQ_F_IO_DRAINED 32 /* drain done */
#define REQ_F_LINK 64 /* linked sqes */
-#define REQ_F_FAIL_LINK 128 /* fail rest of links */
+#define REQ_F_LINK_DONE 128 /* linked sqes done */
+#define REQ_F_FAIL_LINK 256 /* fail rest of links */
u64 user_data;
u32 result;
u32 sequence;
@@ -429,7 +430,7 @@ static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
return false;
- return req->sequence > ctx->cached_cq_tail + ctx->sq_ring->dropped;
+ return req->sequence != ctx->cached_cq_tail + ctx->sq_ring->dropped;
}
static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
@@ -632,6 +633,7 @@ static void io_req_link_next(struct io_kiocb *req)
nxt->flags |= REQ_F_LINK;
}
+ nxt->flags |= REQ_F_LINK_DONE;
INIT_WORK(&nxt->work, io_sq_wq_submit_work);
queue_work(req->ctx->sqo_wq, &nxt->work);
}
@@ -1064,8 +1066,44 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
*/
offset = buf_addr - imu->ubuf;
iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
- if (offset)
- iov_iter_advance(iter, offset);
+
+ if (offset) {
+ /*
+ * Don't use iov_iter_advance() here, as it's really slow for
+ * using the latter parts of a big fixed buffer - it iterates
+ * over each segment manually. We can cheat a bit here, because
+ * we know that:
+ *
+ * 1) it's a BVEC iter, we set it up
+ * 2) all bvecs are PAGE_SIZE in size, except potentially the
+ * first and last bvec
+ *
+ * So just find our index, and adjust the iterator afterwards.
+ * If the offset is within the first bvec (or the whole first
+ * bvec, just use iov_iter_advance(). This makes it easier
+ * since we can just skip the first segment, which may not
+ * be PAGE_SIZE aligned.
+ */
+ const struct bio_vec *bvec = imu->bvec;
+
+ if (offset <= bvec->bv_len) {
+ iov_iter_advance(iter, offset);
+ } else {
+ unsigned long seg_skip;
+
+ /* skip first vec */
+ offset -= bvec->bv_len;
+ seg_skip = 1 + (offset >> PAGE_SHIFT);
+
+ iter->bvec = bvec + seg_skip;
+ iter->nr_segs -= seg_skip;
+ iter->count -= (seg_skip << PAGE_SHIFT);
+ iter->iov_offset = offset & ~PAGE_MASK;
+ if (iter->iov_offset)
+ iter->count -= iter->iov_offset;
+ }
+ }
+
return 0;
}
@@ -1120,28 +1158,26 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
off_t io_end = kiocb->ki_pos + len;
if (filp == async_list->file && kiocb->ki_pos == async_list->io_end) {
- unsigned long max_pages;
+ unsigned long max_bytes;
/* Use 8x RA size as a decent limiter for both reads/writes */
- max_pages = filp->f_ra.ra_pages;
- if (!max_pages)
- max_pages = VM_READAHEAD_PAGES;
- max_pages *= 8;
-
- /* If max pages are exceeded, reset the state */
- len >>= PAGE_SHIFT;
- if (async_list->io_pages + len <= max_pages) {
+ max_bytes = filp->f_ra.ra_pages << (PAGE_SHIFT + 3);
+ if (!max_bytes)
+ max_bytes = VM_READAHEAD_PAGES << (PAGE_SHIFT + 3);
+
+ /* If max len are exceeded, reset the state */
+ if (async_list->io_len + len <= max_bytes) {
req->flags |= REQ_F_SEQ_PREV;
- async_list->io_pages += len;
+ async_list->io_len += len;
} else {
io_end = 0;
- async_list->io_pages = 0;
+ async_list->io_len = 0;
}
}
/* New file? Reset state. */
if (async_list->file != filp) {
- async_list->io_pages = 0;
+ async_list->io_len = 0;
async_list->file = filp;
}
async_list->io_end = io_end;
@@ -1630,6 +1666,8 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
INIT_LIST_HEAD(&poll->wait.entry);
init_waitqueue_func_entry(&poll->wait, io_poll_wake);
+ INIT_LIST_HEAD(&req->list);
+
mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
spin_lock_irq(&ctx->completion_lock);
@@ -1844,6 +1882,10 @@ restart:
/* async context always use a copy of the sqe */
kfree(sqe);
+ /* req from defer and link list needn't decrease async cnt */
+ if (req->flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE))
+ goto out;
+
if (!async_list)
break;
if (!list_empty(&req_list)) {
@@ -1891,6 +1933,7 @@ restart:
}
}
+out:
if (cur_mm) {
set_fs(old_fs);
unuse_mm(cur_mm);
@@ -1917,6 +1960,10 @@ static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
ret = true;
spin_lock(&list->lock);
list_add_tail(&req->list, &list->list);
+ /*
+ * Ensure we see a simultaneous modification from io_sq_wq_submit_work()
+ */
+ smp_mb();
if (!atomic_read(&list->cnt)) {
list_del_init(&req->list);
ret = false;
diff --git a/fs/namespace.c b/fs/namespace.c
index 6464ea4acba9..d28d30b13043 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1463,7 +1463,6 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
disconnect = disconnect_mount(p, how);
-
if (mnt_has_parent(p)) {
mnt_add_count(p->mnt_parent, -1);
if (!disconnect) {
@@ -1471,10 +1470,11 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
} else {
umount_mnt(p);
- hlist_add_head(&p->mnt_umount, &unmounted);
}
}
change_mnt_propagation(p, MS_PRIVATE);
+ if (disconnect)
+ hlist_add_head(&p->mnt_umount, &unmounted);
}
}
diff --git a/fs/open.c b/fs/open.c
index b5b80469b93d..a59abe3c669a 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -374,6 +374,25 @@ long do_faccessat(int dfd, const char __user *filename, int mode)
override_cred->cap_permitted;
}
+ /*
+ * The new set of credentials can *only* be used in
+ * task-synchronous circumstances, and does not need
+ * RCU freeing, unless somebody then takes a separate
+ * reference to it.
+ *
+ * NOTE! This is _only_ true because this credential
+ * is used purely for override_creds() that installs
+ * it as the subjective cred. Other threads will be
+ * accessing ->real_cred, not the subjective cred.
+ *
+ * If somebody _does_ make a copy of this (using the
+ * 'get_current_cred()' function), that will clear the
+ * non_rcu field, because now that other user may be
+ * expecting RCU freeing. But normal thread-synchronous
+ * cred accesses will keep things non-RCY.
+ */
+ override_cred->non_rcu = 1;
+
old_cred = override_creds(override_cred);
retry:
res = user_path_at(dfd, filename, lookup_flags, &path);