aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ceph/file.c
diff options
context:
space:
mode:
authorJeff Layton <jlayton@kernel.org>2019-08-02 13:15:39 -0400
committerIlya Dryomov <idryomov@gmail.com>2019-09-16 12:06:25 +0200
commit321fe13c939876b55fbb7780a243c86e577a2151 (patch)
tree9963961399b7648c1207012d31b1dc51f50a389c /fs/ceph/file.c
parentlibceph: handle OSD op ceph_pagelist_append() errors (diff)
downloadlinux-dev-321fe13c939876b55fbb7780a243c86e577a2151.tar.xz
linux-dev-321fe13c939876b55fbb7780a243c86e577a2151.zip
ceph: add buffered/direct exclusionary locking for reads and writes
xfstest generic/451 intermittently fails. The test does O_DIRECT writes to a file, and then reads back the result using buffered I/O, while running a separate set of tasks that are also doing buffered reads. The client will invalidate the cache prior to a direct write, but it's easy for one of the other readers' replies to race in and reinstantiate the invalidated range with stale data. To fix this, we must to serialize direct I/O writes and buffered reads. We could just sprinkle in some shared locks on the i_rwsem for reads, and increase the exclusive footprint on the write side, but that would cause O_DIRECT writes to end up serialized vs. other direct requests. Instead, borrow the scheme used by nfs.ko. Buffered writes take the i_rwsem exclusively, but buffered reads take a shared lock, allowing them to run in parallel. O_DIRECT requests also take a shared lock, but we need for them to not run in parallel with buffered reads. A flag on the ceph_inode_info is used to indicate whether it's in direct or buffered I/O mode. When a conflicting request is submitted, it will block until the inode can be flipped to the necessary mode. Link: https://tracker.ceph.com/issues/40985 Signed-off-by: Jeff Layton <jlayton@kernel.org> Reviewed-by: "Yan, Zheng" <zyan@redhat.com> Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Diffstat (limited to 'fs/ceph/file.c')
-rw-r--r--fs/ceph/file.c35
1 files changed, 22 insertions, 13 deletions
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 5182e1a49d6f..ff17a81bf2e2 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -15,6 +15,7 @@
#include "super.h"
#include "mds_client.h"
#include "cache.h"
+#include "io.h"
static __le32 ceph_flags_sys2wire(u32 flags)
{
@@ -930,7 +931,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
struct ceph_aio_request *aio_req = NULL;
int num_pages = 0;
int flags;
- int ret;
+ int ret = 0;
struct timespec64 mtime = current_time(inode);
size_t count = iov_iter_count(iter);
loff_t pos = iocb->ki_pos;
@@ -944,11 +945,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
(write ? "write" : "read"), file, pos, (unsigned)count,
snapc, snapc ? snapc->seq : 0);
- ret = filemap_write_and_wait_range(inode->i_mapping,
- pos, pos + count - 1);
- if (ret < 0)
- return ret;
-
if (write) {
int ret2 = invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_SHIFT,
@@ -1284,12 +1280,16 @@ again:
if (ci->i_inline_version == CEPH_INLINE_NONE) {
if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
+ ceph_start_io_direct(inode);
ret = ceph_direct_read_write(iocb, to,
NULL, NULL);
+ ceph_end_io_direct(inode);
if (ret >= 0 && ret < len)
retry_op = CHECK_EOF;
} else {
+ ceph_start_io_read(inode);
ret = ceph_sync_read(iocb, to, &retry_op);
+ ceph_end_io_read(inode);
}
} else {
retry_op = READ_INLINE;
@@ -1300,7 +1300,9 @@ again:
inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
ceph_cap_string(got));
ceph_add_rw_context(fi, &rw_ctx);
+ ceph_start_io_read(inode);
ret = generic_file_read_iter(iocb, to);
+ ceph_end_io_read(inode);
ceph_del_rw_context(fi, &rw_ctx);
}
dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
@@ -1409,7 +1411,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
return -ENOMEM;
retry_snap:
- inode_lock(inode);
+ if (iocb->ki_flags & IOCB_DIRECT)
+ ceph_start_io_direct(inode);
+ else
+ ceph_start_io_write(inode);
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode);
@@ -1480,7 +1485,6 @@ retry_snap:
(ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
struct ceph_snap_context *snapc;
struct iov_iter data;
- inode_unlock(inode);
spin_lock(&ci->i_ceph_lock);
if (__ceph_have_pending_cap_snap(ci)) {
@@ -1497,11 +1501,14 @@ retry_snap:
/* we might need to revert back to that point */
data = *from;
- if (iocb->ki_flags & IOCB_DIRECT)
+ if (iocb->ki_flags & IOCB_DIRECT) {
written = ceph_direct_read_write(iocb, &data, snapc,
&prealloc_cf);
- else
+ ceph_end_io_direct(inode);
+ } else {
written = ceph_sync_write(iocb, &data, pos, snapc);
+ ceph_end_io_write(inode);
+ }
if (written > 0)
iov_iter_advance(from, written);
ceph_put_snap_context(snapc);
@@ -1516,7 +1523,7 @@ retry_snap:
written = generic_perform_write(file, from, pos);
if (likely(written >= 0))
iocb->ki_pos = pos + written;
- inode_unlock(inode);
+ ceph_end_io_write(inode);
}
if (written >= 0) {
@@ -1551,9 +1558,11 @@ retry_snap:
}
goto out_unlocked;
-
out:
- inode_unlock(inode);
+ if (iocb->ki_flags & IOCB_DIRECT)
+ ceph_end_io_direct(inode);
+ else
+ ceph_end_io_write(inode);
out_unlocked:
ceph_free_cap_flush(prealloc_cf);
current->backing_dev_info = NULL;