aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ceph
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ceph')
-rw-r--r--fs/ceph/addr.c619
-rw-r--r--fs/ceph/cache.c220
-rw-r--r--fs/ceph/cache.h103
-rw-r--r--fs/ceph/caps.c330
-rw-r--r--fs/ceph/debugfs.c5
-rw-r--r--fs/ceph/dir.c96
-rw-r--r--fs/ceph/export.c3
-rw-r--r--fs/ceph/file.c285
-rw-r--r--fs/ceph/inode.c178
-rw-r--r--fs/ceph/locks.c8
-rw-r--r--fs/ceph/mds_client.c379
-rw-r--r--fs/ceph/mds_client.h34
-rw-r--r--fs/ceph/mdsmap.c24
-rw-r--r--fs/ceph/metric.c65
-rw-r--r--fs/ceph/metric.h63
-rw-r--r--fs/ceph/quota.c36
-rw-r--r--fs/ceph/snap.c271
-rw-r--r--fs/ceph/strings.c1
-rw-r--r--fs/ceph/super.c209
-rw-r--r--fs/ceph/super.h118
-rw-r--r--fs/ceph/xattr.c49
21 files changed, 1964 insertions, 1132 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index e53c8541f5b2..dcf701b05cc1 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -4,8 +4,8 @@
#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/swap.h>
#include <linux/pagemap.h>
-#include <linux/writeback.h> /* generic_writepages */
#include <linux/slab.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
@@ -63,7 +63,7 @@
(CONGESTION_ON_THRESH(congestion_kb) >> 2))
static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
- struct folio *folio, void **_fsdata);
+ struct folio **foliop, void **_fsdata);
static inline struct ceph_snap_context *page_snap_context(struct page *page)
{
@@ -76,18 +76,17 @@ static inline struct ceph_snap_context *page_snap_context(struct page *page)
* Dirty a page. Optimistically adjust accounting, on the assumption
* that we won't race with invalidate. If we do, readjust.
*/
-static int ceph_set_page_dirty(struct page *page)
+static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- struct address_space *mapping = page->mapping;
struct inode *inode;
struct ceph_inode_info *ci;
struct ceph_snap_context *snapc;
- if (PageDirty(page)) {
- dout("%p set_page_dirty %p idx %lu -- already dirty\n",
- mapping->host, page, page->index);
- BUG_ON(!PagePrivate(page));
- return 0;
+ if (folio_test_dirty(folio)) {
+ dout("%p dirty_folio %p idx %lu -- already dirty\n",
+ mapping->host, folio, folio->index);
+ VM_BUG_ON_FOLIO(!folio_test_private(folio), folio);
+ return false;
}
inode = mapping->host;
@@ -111,75 +110,81 @@ static int ceph_set_page_dirty(struct page *page)
if (ci->i_wrbuffer_ref == 0)
ihold(inode);
++ci->i_wrbuffer_ref;
- dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
+ dout("%p dirty_folio %p idx %lu head %d/%d -> %d/%d "
"snapc %p seq %lld (%d snaps)\n",
- mapping->host, page, page->index,
+ mapping->host, folio, folio->index,
ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
snapc, snapc->seq, snapc->num_snaps);
spin_unlock(&ci->i_ceph_lock);
/*
- * Reference snap context in page->private. Also set
- * PagePrivate so that we get invalidatepage callback.
+ * Reference snap context in folio->private. Also set
+ * PagePrivate so that we get invalidate_folio callback.
*/
- BUG_ON(PagePrivate(page));
- attach_page_private(page, snapc);
+ VM_WARN_ON_FOLIO(folio->private, folio);
+ folio_attach_private(folio, snapc);
- return __set_page_dirty_nobuffers(page);
+ return ceph_fscache_dirty_folio(mapping, folio);
}
/*
- * If we are truncating the full page (i.e. offset == 0), adjust the
- * dirty page counters appropriately. Only called if there is private
- * data on the page.
+ * If we are truncating the full folio (i.e. offset == 0), adjust the
+ * dirty folio counters appropriately. Only called if there is private
+ * data on the folio.
*/
-static void ceph_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void ceph_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
struct inode *inode;
struct ceph_inode_info *ci;
struct ceph_snap_context *snapc;
- wait_on_page_fscache(page);
-
- inode = page->mapping->host;
+ inode = folio->mapping->host;
ci = ceph_inode(inode);
- if (offset != 0 || length != thp_size(page)) {
- dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n",
- inode, page, page->index, offset, length);
+ if (offset != 0 || length != folio_size(folio)) {
+ dout("%p invalidate_folio idx %lu partial dirty page %zu~%zu\n",
+ inode, folio->index, offset, length);
return;
}
- WARN_ON(!PageLocked(page));
- if (!PagePrivate(page))
- return;
+ WARN_ON(!folio_test_locked(folio));
+ if (folio_test_private(folio)) {
+ dout("%p invalidate_folio idx %lu full dirty page\n",
+ inode, folio->index);
- dout("%p invalidatepage %p idx %lu full dirty page\n",
- inode, page, page->index);
+ snapc = folio_detach_private(folio);
+ ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+ ceph_put_snap_context(snapc);
+ }
- snapc = detach_page_private(page);
- ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
- ceph_put_snap_context(snapc);
+ folio_wait_fscache(folio);
}
-static int ceph_releasepage(struct page *page, gfp_t gfp)
+static bool ceph_release_folio(struct folio *folio, gfp_t gfp)
{
- dout("%p releasepage %p idx %lu (%sdirty)\n", page->mapping->host,
- page, page->index, PageDirty(page) ? "" : "not ");
+ struct inode *inode = folio->mapping->host;
- if (PageFsCache(page)) {
- if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS))
- return 0;
- wait_on_page_fscache(page);
+ dout("%llx:%llx release_folio idx %lu (%sdirty)\n",
+ ceph_vinop(inode),
+ folio->index, folio_test_dirty(folio) ? "" : "not ");
+
+ if (folio_test_private(folio))
+ return false;
+
+ if (folio_test_fscache(folio)) {
+ if (current_is_kswapd() || !(gfp & __GFP_FS))
+ return false;
+ folio_wait_fscache(folio);
}
- return !PagePrivate(page);
+ ceph_fscache_note_page_release(inode);
+ return true;
}
-static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq)
+static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
{
- struct inode *inode = rreq->mapping->host;
+ struct inode *inode = rreq->inode;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_file_layout *lo = &ci->i_layout;
u32 blockoff;
@@ -194,9 +199,9 @@ static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq)
rreq->len = roundup(rreq->len, lo->stripe_unit);
}
-static bool ceph_netfs_clamp_length(struct netfs_read_subrequest *subreq)
+static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq)
{
- struct inode *inode = subreq->rreq->mapping->host;
+ struct inode *inode = subreq->rreq->inode;
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u64 objno, objoff;
@@ -213,7 +218,7 @@ static void finish_netfs_read(struct ceph_osd_request *req)
{
struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode);
struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
- struct netfs_read_subrequest *subreq = req->r_priv;
+ struct netfs_io_subrequest *subreq = req->r_priv;
int num_pages;
int err = req->r_result;
@@ -232,17 +237,72 @@ static void finish_netfs_read(struct ceph_osd_request *req)
if (err >= 0 && err < subreq->len)
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
- netfs_subreq_terminated(subreq, err, true);
+ netfs_subreq_terminated(subreq, err, false);
num_pages = calc_pages_for(osd_data->alignment, osd_data->length);
ceph_put_page_vector(osd_data->pages, num_pages, false);
iput(req->r_inode);
}
-static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
+static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
{
- struct netfs_read_request *rreq = subreq->rreq;
- struct inode *inode = rreq->mapping->host;
+ struct netfs_io_request *rreq = subreq->rreq;
+ struct inode *inode = rreq->inode;
+ struct ceph_mds_reply_info_parsed *rinfo;
+ struct ceph_mds_reply_info_in *iinfo;
+ struct ceph_mds_request *req;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct iov_iter iter;
+ ssize_t err = 0;
+ size_t len;
+ int mode;
+
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
+
+ if (subreq->start >= inode->i_size)
+ goto out;
+
+ /* We need to fetch the inline data. */
+ mode = ceph_try_to_choose_auth_mds(inode, CEPH_STAT_CAP_INLINE_DATA);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+ req->r_ino1 = ci->i_vino;
+ req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA);
+ req->r_num_caps = 2;
+
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err < 0)
+ goto out;
+
+ rinfo = &req->r_reply_info;
+ iinfo = &rinfo->targeti;
+ if (iinfo->inline_version == CEPH_INLINE_NONE) {
+ /* The data got uninlined */
+ ceph_mdsc_put_request(req);
+ return false;
+ }
+
+ len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len);
+ iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
+ err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter);
+ if (err == 0)
+ err = -EFAULT;
+
+ ceph_mdsc_put_request(req);
+out:
+ netfs_subreq_terminated(subreq, err, false);
+ return true;
+}
+
+static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *rreq = subreq->rreq;
+ struct inode *inode = rreq->inode;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_osd_request *req;
@@ -253,6 +313,9 @@ static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
int err = 0;
u64 len = subreq->len;
+ if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq))
+ return;
+
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len,
0, 1, CEPH_OSD_OP_READ,
CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
@@ -265,7 +328,7 @@ static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len);
iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
- err = iov_iter_get_pages_alloc(&iter, &pages, len, &page_off);
+ err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off);
if (err < 0) {
dout("%s: iov_ter_get_pages_alloc returned %d\n", __func__, err);
goto out;
@@ -274,6 +337,7 @@ static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
/* should always give us a page-aligned read */
WARN_ON_ONCE(page_off);
len = err;
+ err = 0;
osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
req->r_callback = finish_netfs_read;
@@ -281,9 +345,7 @@ static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
req->r_inode = inode;
ihold(inode);
- err = ceph_osdc_start_request(req->r_osdc, req, false);
- if (err)
- iput(inode);
+ ceph_osdc_start_request(req->r_osdc, req);
out:
ceph_osdc_put_request(req);
if (err)
@@ -291,92 +353,95 @@ out:
dout("%s: result %d\n", __func__, err);
}
-static void ceph_init_rreq(struct netfs_read_request *rreq, struct file *file)
+static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)
{
+ struct inode *inode = rreq->inode;
+ int got = 0, want = CEPH_CAP_FILE_CACHE;
+ int ret = 0;
+
+ if (rreq->origin != NETFS_READAHEAD)
+ return 0;
+
+ if (file) {
+ struct ceph_rw_context *rw_ctx;
+ struct ceph_file_info *fi = file->private_data;
+
+ rw_ctx = ceph_find_rw_context(fi);
+ if (rw_ctx)
+ return 0;
+ }
+
+ /*
+ * readahead callers do not necessarily hold Fcb caps
+ * (e.g. fadvise, madvise).
+ */
+ ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got);
+ if (ret < 0) {
+ dout("start_read %p, error getting cap\n", inode);
+ return ret;
+ }
+
+ if (!(got & want)) {
+ dout("start_read %p, no cache cap\n", inode);
+ return -EACCES;
+ }
+ if (ret == 0)
+ return -EACCES;
+
+ rreq->netfs_priv = (void *)(uintptr_t)got;
+ return 0;
}
-static void ceph_readahead_cleanup(struct address_space *mapping, void *priv)
+static void ceph_netfs_free_request(struct netfs_io_request *rreq)
{
- struct inode *inode = mapping->host;
- struct ceph_inode_info *ci = ceph_inode(inode);
- int got = (uintptr_t)priv;
+ struct ceph_inode_info *ci = ceph_inode(rreq->inode);
+ int got = (uintptr_t)rreq->netfs_priv;
if (got)
ceph_put_cap_refs(ci, got);
}
-static const struct netfs_read_request_ops ceph_netfs_read_ops = {
- .init_rreq = ceph_init_rreq,
- .is_cache_enabled = ceph_is_cache_enabled,
+const struct netfs_request_ops ceph_netfs_ops = {
+ .init_request = ceph_init_request,
+ .free_request = ceph_netfs_free_request,
.begin_cache_operation = ceph_begin_cache_operation,
- .issue_op = ceph_netfs_issue_op,
+ .issue_read = ceph_netfs_issue_read,
.expand_readahead = ceph_netfs_expand_readahead,
.clamp_length = ceph_netfs_clamp_length,
.check_write_begin = ceph_netfs_check_write_begin,
- .cleanup = ceph_readahead_cleanup,
};
-/* read a single page, without unlocking it. */
-static int ceph_readpage(struct file *file, struct page *subpage)
+#ifdef CONFIG_CEPH_FSCACHE
+static void ceph_set_page_fscache(struct page *page)
{
- struct folio *folio = page_folio(subpage);
- struct inode *inode = file_inode(file);
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_vino vino = ceph_vino(inode);
- size_t len = folio_size(folio);
- u64 off = folio_file_pos(folio);
-
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- /*
- * Uptodate inline data should have been added
- * into page cache while getting Fcr caps.
- */
- if (off == 0) {
- folio_unlock(folio);
- return -EINVAL;
- }
- zero_user_segment(&folio->page, 0, folio_size(folio));
- folio_mark_uptodate(folio);
- folio_unlock(folio);
- return 0;
- }
-
- dout("readpage ino %llx.%llx file %p off %llu len %zu folio %p index %lu\n",
- vino.ino, vino.snap, file, off, len, folio, folio_index(folio));
-
- return netfs_readpage(file, folio, &ceph_netfs_read_ops, NULL);
+ set_page_fscache(page);
}
-static void ceph_readahead(struct readahead_control *ractl)
+static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async)
{
- struct inode *inode = file_inode(ractl->file);
- struct ceph_file_info *fi = ractl->file->private_data;
- struct ceph_rw_context *rw_ctx;
- int got = 0;
- int ret = 0;
+ struct inode *inode = priv;
- if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE)
- return;
+ if (IS_ERR_VALUE(error) && error != -ENOBUFS)
+ ceph_fscache_invalidate(inode, false);
+}
- rw_ctx = ceph_find_rw_context(fi);
- if (!rw_ctx) {
- /*
- * readahead callers do not necessarily hold Fcb caps
- * (e.g. fadvise, madvise).
- */
- int want = CEPH_CAP_FILE_CACHE;
+static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct fscache_cookie *cookie = ceph_fscache_cookie(ci);
- ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got);
- if (ret < 0)
- dout("start_read %p, error getting cap\n", inode);
- else if (!(got & want))
- dout("start_read %p, no cache cap\n", inode);
+ fscache_write_to_cache(cookie, inode->i_mapping, off, len, i_size_read(inode),
+ ceph_fscache_write_terminated, inode, caching);
+}
+#else
+static inline void ceph_set_page_fscache(struct page *page)
+{
+}
- if (ret <= 0)
- return;
- }
- netfs_readahead(ractl, &ceph_netfs_read_ops, (void *)(uintptr_t)got);
+static inline void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching)
+{
}
+#endif /* CONFIG_CEPH_FSCACHE */
struct ceph_writeback_ctl
{
@@ -483,6 +548,7 @@ static u64 get_writepages_data_length(struct inode *inode,
*/
static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
{
+ struct folio *folio = page_folio(page);
struct inode *inode = page->mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
@@ -493,6 +559,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
struct ceph_writeback_ctl ceph_wbc;
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_osd_request *req;
+ bool caching = ceph_is_cache_enabled(inode);
dout("writepage %p idx %lu\n", page, page->index);
@@ -516,8 +583,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
/* is this a partial page at end of file? */
if (page_off >= ceph_wbc.i_size) {
- dout("%p page eof %llu\n", page, ceph_wbc.i_size);
- page->mapping->a_ops->invalidatepage(page, 0, thp_size(page));
+ dout("folio at %lu beyond eof %llu\n", folio->index,
+ ceph_wbc.i_size);
+ folio_invalidate(folio, 0, folio_size(folio));
return 0;
}
@@ -529,28 +597,30 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
if (atomic_long_inc_return(&fsc->writeback_count) >
CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
- set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
+ fsc->write_congested = true;
- set_page_writeback(page);
req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1,
CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc,
ceph_wbc.truncate_seq, ceph_wbc.truncate_size,
true);
if (IS_ERR(req)) {
redirty_page_for_writepage(wbc, page);
- end_page_writeback(page);
return PTR_ERR(req);
}
+ set_page_writeback(page);
+ if (caching)
+ ceph_set_page_fscache(page);
+ ceph_fscache_write_to_cache(inode, page_off, len, caching);
+
/* it may be a short write due to an object boundary */
WARN_ON_ONCE(len > thp_size(page));
osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len);
req->r_mtime = inode->i_mtime;
- err = ceph_osdc_start_request(osdc, req, true);
- if (!err)
- err = ceph_osdc_wait_request(osdc, req);
+ ceph_osdc_start_request(osdc, req);
+ err = ceph_osdc_wait_request(osdc, req);
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, len, err);
@@ -588,7 +658,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
if (atomic_long_dec_return(&fsc->writeback_count) <
CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
- clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
+ fsc->write_congested = false;
return err;
}
@@ -599,6 +669,13 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
struct inode *inode = page->mapping->host;
BUG_ON(!inode);
ihold(inode);
+
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ ceph_inode_to_client(inode)->write_congested)
+ return AOP_WRITEPAGE_ACTIVATE;
+
+ wait_on_page_fscache(page);
+
err = writepage_nounlock(page, wbc);
if (err == -ERESTARTSYS) {
/* direct memory reclaimer was killed by SIGKILL. return 0
@@ -652,8 +729,11 @@ static void writepages_finish(struct ceph_osd_request *req)
/* clean all pages */
for (i = 0; i < req->r_num_ops; i++) {
- if (req->r_ops[i].op != CEPH_OSD_OP_WRITE)
+ if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) {
+ pr_warn("%s incorrect op %d req %p index %d tid %llu\n",
+ __func__, req->r_ops[i].op, req, i, req->r_tid);
break;
+ }
osd_data = osd_req_op_extent_osd_data(req, i);
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
@@ -669,8 +749,7 @@ static void writepages_finish(struct ceph_osd_request *req)
if (atomic_long_dec_return(&fsc->writeback_count) <
CONGESTION_OFF_THRESH(
fsc->mount_options->congestion_kb))
- clear_bdi_congested(inode_to_bdi(inode),
- BLK_RW_ASYNC);
+ fsc->write_congested = false;
ceph_put_snap_context(detach_page_private(page));
end_page_writeback(page);
@@ -720,6 +799,11 @@ static int ceph_writepages_start(struct address_space *mapping,
struct ceph_writeback_ctl ceph_wbc;
bool should_loop, range_whole = false;
bool done = false;
+ bool caching = ceph_is_cache_enabled(inode);
+
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ fsc->write_congested)
+ return 0;
dout("writepages_start %p (mode=%s)\n", inode,
wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
@@ -828,14 +912,16 @@ get_more_pages:
continue;
}
if (page_offset(page) >= ceph_wbc.i_size) {
- dout("%p page eof %llu\n",
- page, ceph_wbc.i_size);
+ struct folio *folio = page_folio(page);
+
+ dout("folio at %lu beyond eof %llu\n",
+ folio->index, ceph_wbc.i_size);
if ((ceph_wbc.size_stable ||
- page_offset(page) >= i_size_read(inode)) &&
- clear_page_dirty_for_io(page))
- mapping->a_ops->invalidatepage(page,
- 0, thp_size(page));
- unlock_page(page);
+ folio_pos(folio) >= i_size_read(inode)) &&
+ folio_clear_dirty_for_io(folio))
+ folio_invalidate(folio, 0,
+ folio_size(folio));
+ folio_unlock(folio);
continue;
}
if (strip_unit_end && (page->index > strip_unit_end)) {
@@ -843,7 +929,7 @@ get_more_pages:
unlock_page(page);
break;
}
- if (PageWriteback(page)) {
+ if (PageWriteback(page) || PageFsCache(page)) {
if (wbc->sync_mode == WB_SYNC_NONE) {
dout("%p under writeback\n", page);
unlock_page(page);
@@ -851,6 +937,7 @@ get_more_pages:
}
dout("waiting on writeback %p\n", page);
wait_on_page_writeback(page);
+ wait_on_page_fscache(page);
}
if (!clear_page_dirty_for_io(page)) {
@@ -914,11 +1001,8 @@ get_more_pages:
if (atomic_long_inc_return(&fsc->writeback_count) >
CONGESTION_ON_THRESH(
- fsc->mount_options->congestion_kb)) {
- set_bdi_congested(inode_to_bdi(inode),
- BLK_RW_ASYNC);
- }
-
+ fsc->mount_options->congestion_kb))
+ fsc->write_congested = true;
pages[locked_pages++] = page;
pvec.pages[i] = NULL;
@@ -983,9 +1067,19 @@ new_request:
op_idx = 0;
for (i = 0; i < locked_pages; i++) {
u64 cur_offset = page_offset(pages[i]);
+ /*
+ * Discontinuity in page range? Ceph can handle that by just passing
+ * multiple extents in the write op.
+ */
if (offset + len != cur_offset) {
+ /* If it's full, stop here */
if (op_idx + 1 == req->r_num_ops)
break;
+
+ /* Kick off an fscache write with what we have so far. */
+ ceph_fscache_write_to_cache(inode, offset, len, caching);
+
+ /* Start a new extent */
osd_req_op_extent_dup_last(req, op_idx,
cur_offset - offset);
dout("writepages got pages at %llu~%llu\n",
@@ -996,14 +1090,17 @@ new_request:
osd_req_op_extent_update(req, op_idx, len);
len = 0;
- offset = cur_offset;
+ offset = cur_offset;
data_pages = pages + i;
op_idx++;
}
set_page_writeback(pages[i]);
+ if (caching)
+ ceph_set_page_fscache(pages[i]);
len += thp_size(page);
}
+ ceph_fscache_write_to_cache(inode, offset, len, caching);
if (ceph_wbc.size_stable) {
len = min(len, ceph_wbc.i_size - offset);
@@ -1051,8 +1148,7 @@ new_request:
}
req->r_mtime = inode->i_mtime;
- rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
- BUG_ON(rc);
+ ceph_osdc_start_request(&fsc->client->osdc, req);
req = NULL;
wbc->nr_to_write -= i;
@@ -1188,18 +1284,19 @@ ceph_find_incompatible(struct page *page)
}
static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
- struct folio *folio, void **_fsdata)
+ struct folio **foliop, void **_fsdata)
{
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_snap_context *snapc;
- snapc = ceph_find_incompatible(folio_page(folio, 0));
+ snapc = ceph_find_incompatible(folio_page(*foliop, 0));
if (snapc) {
int r;
- folio_unlock(folio);
- folio_put(folio);
+ folio_unlock(*foliop);
+ folio_put(*foliop);
+ *foliop = NULL;
if (IS_ERR(snapc))
return PTR_ERR(snapc);
@@ -1217,59 +1314,22 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned
* clean, or already dirty within the same snap context.
*/
static int ceph_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned aop_flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct folio *folio = NULL;
- pgoff_t index = pos >> PAGE_SHIFT;
int r;
- /*
- * Uninlining should have already been done and everything updated, EXCEPT
- * for inline_version sent to the MDS.
- */
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
- if (aop_flags & AOP_FLAG_NOFS)
- fgp_flags |= FGP_NOFS;
- folio = __filemap_get_folio(mapping, index, fgp_flags,
- mapping_gfp_mask(mapping));
- if (!folio)
- return -ENOMEM;
+ r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, &folio, NULL);
+ if (r < 0)
+ return r;
- /*
- * The inline_version on a new inode is set to 1. If that's the
- * case, then the folio is brand new and isn't yet Uptodate.
- */
- r = 0;
- if (index == 0 && ci->i_inline_version != 1) {
- if (!folio_test_uptodate(folio)) {
- WARN_ONCE(1, "ceph: write_begin called on still-inlined inode (inline_version %llu)!\n",
- ci->i_inline_version);
- r = -EINVAL;
- }
- goto out;
- }
- zero_user_segment(&folio->page, 0, folio_size(folio));
- folio_mark_uptodate(folio);
- goto out;
- }
-
- r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &folio, NULL,
- &ceph_netfs_read_ops, NULL);
-out:
- if (r == 0)
- folio_wait_fscache(folio);
- if (r < 0) {
- if (folio)
- folio_put(folio);
- } else {
- WARN_ON_ONCE(!folio_test_locked(folio));
- *pagep = &folio->page;
- }
- return r;
+ folio_wait_fscache(folio);
+ WARN_ON_ONCE(!folio_test_locked(folio));
+ *pagep = &folio->page;
+ return 0;
}
/*
@@ -1313,15 +1373,15 @@ out:
}
const struct address_space_operations ceph_aops = {
- .readpage = ceph_readpage,
- .readahead = ceph_readahead,
+ .read_folio = netfs_read_folio,
+ .readahead = netfs_readahead,
.writepage = ceph_writepage,
.writepages = ceph_writepages_start,
.write_begin = ceph_write_begin,
.write_end = ceph_write_end,
- .set_page_dirty = ceph_set_page_dirty,
- .invalidatepage = ceph_invalidatepage,
- .releasepage = ceph_releasepage,
+ .dirty_folio = ceph_dirty_folio,
+ .invalidate_folio = ceph_invalidate_folio,
+ .release_folio = ceph_release_folio,
.direct_IO = noop_direct_IO,
};
@@ -1372,7 +1432,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
inode, off, ceph_cap_string(got));
if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
- ci->i_inline_version == CEPH_INLINE_NONE) {
+ !ceph_has_inline_data(ci)) {
CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
ceph_add_rw_context(fi, &rw_ctx);
ret = filemap_fault(vmf);
@@ -1455,19 +1515,6 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
ceph_block_sigs(&oldset);
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- struct page *locked_page = NULL;
- if (off == 0) {
- lock_page(page);
- locked_page = page;
- }
- err = ceph_uninline_data(vma->vm_file, locked_page);
- if (locked_page)
- unlock_page(locked_page);
- if (err < 0)
- goto out_free;
- }
-
if (off + thp_size(page) <= size)
len = thp_size(page);
else
@@ -1524,11 +1571,9 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
ceph_put_snap_context(snapc);
} while (err == 0);
- if (ret == VM_FAULT_LOCKED ||
- ci->i_inline_version != CEPH_INLINE_NONE) {
+ if (ret == VM_FAULT_LOCKED) {
int dirty;
spin_lock(&ci->i_ceph_lock);
- ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
@@ -1592,16 +1637,18 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
}
}
-int ceph_uninline_data(struct file *filp, struct page *locked_page)
+int ceph_uninline_data(struct file *file)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_osd_request *req;
- struct page *page = NULL;
- u64 len, inline_version;
+ struct ceph_osd_request *req = NULL;
+ struct ceph_cap_flush *prealloc_cf;
+ struct folio *folio = NULL;
+ u64 inline_version = CEPH_INLINE_NONE;
+ struct page *pages[1];
int err = 0;
- bool from_pagecache = false;
+ u64 len;
spin_lock(&ci->i_ceph_lock);
inline_version = ci->i_inline_version;
@@ -1610,64 +1657,43 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
dout("uninline_data %p %llx.%llx inline_version %llu\n",
inode, ceph_vinop(inode), inline_version);
- if (inline_version == 1 || /* initial version, no data */
- inline_version == CEPH_INLINE_NONE)
- goto out;
+ if (inline_version == CEPH_INLINE_NONE)
+ return 0;
- if (locked_page) {
- page = locked_page;
- WARN_ON(!PageUptodate(page));
- } else if (ceph_caps_issued(ci) &
- (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) {
- page = find_get_page(inode->i_mapping, 0);
- if (page) {
- if (PageUptodate(page)) {
- from_pagecache = true;
- lock_page(page);
- } else {
- put_page(page);
- page = NULL;
- }
- }
- }
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return -ENOMEM;
- if (page) {
- len = i_size_read(inode);
- if (len > PAGE_SIZE)
- len = PAGE_SIZE;
- } else {
- page = __page_cache_alloc(GFP_NOFS);
- if (!page) {
- err = -ENOMEM;
- goto out;
- }
- err = __ceph_do_getattr(inode, page,
- CEPH_STAT_CAP_INLINE_DATA, true);
- if (err < 0) {
- /* no inline data */
- if (err == -ENODATA)
- err = 0;
- goto out;
- }
- len = err;
+ if (inline_version == 1) /* initial version, no data */
+ goto out_uninline;
+
+ folio = read_mapping_folio(inode->i_mapping, 0, file);
+ if (IS_ERR(folio)) {
+ err = PTR_ERR(folio);
+ goto out;
}
+ folio_lock(folio);
+
+ len = i_size_read(inode);
+ if (len > folio_size(folio))
+ len = folio_size(folio);
+
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
ceph_vino(inode), 0, &len, 0, 1,
CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE,
NULL, 0, 0, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
- goto out;
+ goto out_unlock;
}
req->r_mtime = inode->i_mtime;
- err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
- if (!err)
- err = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ ceph_osdc_start_request(&fsc->client->osdc, req);
+ err = ceph_osdc_wait_request(&fsc->client->osdc, req);
ceph_osdc_put_request(req);
if (err < 0)
- goto out;
+ goto out_unlock;
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
ceph_vino(inode), 0, &len, 1, 3,
@@ -1676,10 +1702,11 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
ci->i_truncate_size, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
- goto out;
+ goto out_unlock;
}
- osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false);
+ pages[0] = folio_page(folio, 0);
+ osd_req_op_extent_osd_data_pages(req, 1, pages, len, 0, false, false);
{
__le64 xattr_buf = cpu_to_le64(inline_version);
@@ -1689,7 +1716,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
CEPH_OSD_CMPXATTR_OP_GT,
CEPH_OSD_CMPXATTR_MODE_U64);
if (err)
- goto out_put;
+ goto out_put_req;
}
{
@@ -1700,30 +1727,41 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
"inline_version",
xattr_buf, xattr_len, 0, 0);
if (err)
- goto out_put;
+ goto out_put_req;
}
req->r_mtime = inode->i_mtime;
- err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
- if (!err)
- err = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ ceph_osdc_start_request(&fsc->client->osdc, req);
+ err = ceph_osdc_wait_request(&fsc->client->osdc, req);
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, len, err);
-out_put:
+out_uninline:
+ if (!err) {
+ int dirty;
+
+ /* Set to CAP_INLINE_NONE and dirty the caps */
+ down_read(&fsc->mdsc->snap_rwsem);
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_inline_version = CEPH_INLINE_NONE;
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf);
+ spin_unlock(&ci->i_ceph_lock);
+ up_read(&fsc->mdsc->snap_rwsem);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+ }
+out_put_req:
ceph_osdc_put_request(req);
if (err == -ECANCELED)
err = 0;
-out:
- if (page && page != locked_page) {
- if (from_pagecache) {
- unlock_page(page);
- put_page(page);
- } else
- __free_pages(page, 0);
+out_unlock:
+ if (folio) {
+ folio_unlock(folio);
+ folio_put(folio);
}
-
+out:
+ ceph_free_cap_flush(prealloc_cf);
dout("uninline_data %p %llx.%llx inline_version %llu = %d\n",
inode, ceph_vinop(inode), inline_version, err);
return err;
@@ -1738,9 +1776,8 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma)
{
struct address_space *mapping = file->f_mapping;
- if (!mapping->a_ops->readpage)
+ if (!mapping->a_ops->read_folio)
return -ENOEXEC;
- file_accessed(file);
vma->vm_ops = &ceph_vmops;
return 0;
}
@@ -1753,7 +1790,7 @@ enum {
static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
s64 pool, struct ceph_string *pool_ns)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->netfs.inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_osd_request *rd_req = NULL, *wr_req = NULL;
struct rb_node **p, *parent;
@@ -1866,15 +1903,13 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
0, false, true);
- err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
+ ceph_osdc_start_request(&fsc->client->osdc, rd_req);
- wr_req->r_mtime = ci->vfs_inode.i_mtime;
- err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
+ wr_req->r_mtime = ci->netfs.inode.i_mtime;
+ ceph_osdc_start_request(&fsc->client->osdc, wr_req);
- if (!err)
- err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
- if (!err2)
- err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req);
+ err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
+ err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req);
if (err >= 0 || err == -ENOENT)
have |= POOL_READ;
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 457afda5498a..177d8e8d73fe 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -12,199 +12,99 @@
#include "super.h"
#include "cache.h"
-struct fscache_netfs ceph_cache_netfs = {
- .name = "ceph",
- .version = 0,
-};
-
-static DEFINE_MUTEX(ceph_fscache_lock);
-static LIST_HEAD(ceph_fscache_list);
-
-struct ceph_fscache_entry {
- struct list_head list;
- struct fscache_cookie *fscache;
- size_t uniq_len;
- /* The following members must be last */
- struct ceph_fsid fsid;
- char uniquifier[];
-};
-
-static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
- .name = "CEPH.fsid",
- .type = FSCACHE_COOKIE_TYPE_INDEX,
-};
-
-int __init ceph_fscache_register(void)
-{
- return fscache_register_netfs(&ceph_cache_netfs);
-}
-
-void ceph_fscache_unregister(void)
-{
- fscache_unregister_netfs(&ceph_cache_netfs);
-}
-
-int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc)
+void ceph_fscache_register_inode_cookie(struct inode *inode)
{
- const struct ceph_fsid *fsid = &fsc->client->fsid;
- const char *fscache_uniq = fsc->mount_options->fscache_uniq;
- size_t uniq_len = fscache_uniq ? strlen(fscache_uniq) : 0;
- struct ceph_fscache_entry *ent;
- int err = 0;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- mutex_lock(&ceph_fscache_lock);
- list_for_each_entry(ent, &ceph_fscache_list, list) {
- if (memcmp(&ent->fsid, fsid, sizeof(*fsid)))
- continue;
- if (ent->uniq_len != uniq_len)
- continue;
- if (uniq_len && memcmp(ent->uniquifier, fscache_uniq, uniq_len))
- continue;
-
- errorfc(fc, "fscache cookie already registered for fsid %pU, use fsc=<uniquifier> option",
- fsid);
- err = -EBUSY;
- goto out_unlock;
- }
+ /* No caching for filesystem? */
+ if (!fsc->fscache)
+ return;
- ent = kzalloc(sizeof(*ent) + uniq_len, GFP_KERNEL);
- if (!ent) {
- err = -ENOMEM;
- goto out_unlock;
- }
+ /* Regular files only */
+ if (!S_ISREG(inode->i_mode))
+ return;
- memcpy(&ent->fsid, fsid, sizeof(*fsid));
- if (uniq_len > 0) {
- memcpy(&ent->uniquifier, fscache_uniq, uniq_len);
- ent->uniq_len = uniq_len;
- }
+ /* Only new inodes! */
+ if (!(inode->i_state & I_NEW))
+ return;
- fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
- &ceph_fscache_fsid_object_def,
- &ent->fsid, sizeof(ent->fsid) + uniq_len,
- NULL, 0,
- fsc, 0, true);
+ WARN_ON_ONCE(ci->netfs.cache);
- if (fsc->fscache) {
- ent->fscache = fsc->fscache;
- list_add_tail(&ent->list, &ceph_fscache_list);
- } else {
- kfree(ent);
- errorfc(fc, "unable to register fscache cookie for fsid %pU",
- fsid);
- /* all other fs ignore this error */
- }
-out_unlock:
- mutex_unlock(&ceph_fscache_lock);
- return err;
+ ci->netfs.cache =
+ fscache_acquire_cookie(fsc->fscache, 0,
+ &ci->i_vino, sizeof(ci->i_vino),
+ &ci->i_version, sizeof(ci->i_version),
+ i_size_read(inode));
}
-static enum fscache_checkaux ceph_fscache_inode_check_aux(
- void *cookie_netfs_data, const void *data, uint16_t dlen,
- loff_t object_size)
+void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info *ci)
{
- struct ceph_inode_info* ci = cookie_netfs_data;
- struct inode* inode = &ci->vfs_inode;
-
- if (dlen != sizeof(ci->i_version) ||
- i_size_read(inode) != object_size)
- return FSCACHE_CHECKAUX_OBSOLETE;
+ fscache_relinquish_cookie(ceph_fscache_cookie(ci), false);
+}
- if (*(u64 *)data != ci->i_version)
- return FSCACHE_CHECKAUX_OBSOLETE;
+void ceph_fscache_use_cookie(struct inode *inode, bool will_modify)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
- dout("ceph inode 0x%p cached okay\n", ci);
- return FSCACHE_CHECKAUX_OKAY;
+ fscache_use_cookie(ceph_fscache_cookie(ci), will_modify);
}
-static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
- .name = "CEPH.inode",
- .type = FSCACHE_COOKIE_TYPE_DATAFILE,
- .check_aux = ceph_fscache_inode_check_aux,
-};
-
-void ceph_fscache_register_inode_cookie(struct inode *inode)
+void ceph_fscache_unuse_cookie(struct inode *inode, bool update)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-
- /* No caching for filesystem */
- if (!fsc->fscache)
- return;
- /* Only cache for regular files that are read only */
- if (!S_ISREG(inode->i_mode))
- return;
+ if (update) {
+ loff_t i_size = i_size_read(inode);
- inode_lock_nested(inode, I_MUTEX_CHILD);
- if (!ci->fscache) {
- ci->fscache = fscache_acquire_cookie(fsc->fscache,
- &ceph_fscache_inode_object_def,
- &ci->i_vino, sizeof(ci->i_vino),
- &ci->i_version, sizeof(ci->i_version),
- ci, i_size_read(inode), false);
+ fscache_unuse_cookie(ceph_fscache_cookie(ci),
+ &ci->i_version, &i_size);
+ } else {
+ fscache_unuse_cookie(ceph_fscache_cookie(ci), NULL, NULL);
}
- inode_unlock(inode);
}
-void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
+void ceph_fscache_update(struct inode *inode)
{
- struct fscache_cookie* cookie;
-
- if ((cookie = ci->fscache) == NULL)
- return;
-
- ci->fscache = NULL;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ loff_t i_size = i_size_read(inode);
- fscache_relinquish_cookie(cookie, &ci->i_vino, false);
+ fscache_update_cookie(ceph_fscache_cookie(ci), &ci->i_version, &i_size);
}
-static bool ceph_fscache_can_enable(void *data)
+void ceph_fscache_invalidate(struct inode *inode, bool dio_write)
{
- struct inode *inode = data;
- return !inode_is_open_for_write(inode);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ fscache_invalidate(ceph_fscache_cookie(ci),
+ &ci->i_version, i_size_read(inode),
+ dio_write ? FSCACHE_INVAL_DIO_WRITE : 0);
}
-void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp)
+int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc)
{
- struct ceph_inode_info *ci = ceph_inode(inode);
+ const struct ceph_fsid *fsid = &fsc->client->fsid;
+ const char *fscache_uniq = fsc->mount_options->fscache_uniq;
+ size_t uniq_len = fscache_uniq ? strlen(fscache_uniq) : 0;
+ char *name;
+ int err = 0;
- if (!fscache_cookie_valid(ci->fscache))
- return;
+ name = kasprintf(GFP_KERNEL, "ceph,%pU%s%s", fsid, uniq_len ? "," : "",
+ uniq_len ? fscache_uniq : "");
+ if (!name)
+ return -ENOMEM;
- if (inode_is_open_for_write(inode)) {
- dout("fscache_file_set_cookie %p %p disabling cache\n",
- inode, filp);
- fscache_disable_cookie(ci->fscache, &ci->i_vino, false);
- } else {
- fscache_enable_cookie(ci->fscache, &ci->i_vino, i_size_read(inode),
- ceph_fscache_can_enable, inode);
- if (fscache_cookie_enabled(ci->fscache)) {
- dout("fscache_file_set_cookie %p %p enabling cache\n",
- inode, filp);
- }
+ fsc->fscache = fscache_acquire_volume(name, NULL, NULL, 0);
+ if (IS_ERR_OR_NULL(fsc->fscache)) {
+ errorfc(fc, "Unable to register fscache cookie for %s", name);
+ err = fsc->fscache ? PTR_ERR(fsc->fscache) : -EOPNOTSUPP;
+ fsc->fscache = NULL;
}
+ kfree(name);
+ return err;
}
void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
{
- if (fscache_cookie_valid(fsc->fscache)) {
- struct ceph_fscache_entry *ent;
- bool found = false;
-
- mutex_lock(&ceph_fscache_lock);
- list_for_each_entry(ent, &ceph_fscache_list, list) {
- if (ent->fscache == fsc->fscache) {
- list_del(&ent->list);
- kfree(ent);
- found = true;
- break;
- }
- }
- WARN_ON_ONCE(!found);
- mutex_unlock(&ceph_fscache_lock);
-
- __fscache_relinquish_cookie(fsc->fscache, NULL, false);
- }
- fsc->fscache = NULL;
+ fscache_relinquish_volume(fsc->fscache, NULL, false);
}
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index 058ea2a04376..dc502daac49a 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -12,61 +12,70 @@
#include <linux/netfs.h>
#ifdef CONFIG_CEPH_FSCACHE
-
-extern struct fscache_netfs ceph_cache_netfs;
-
-int ceph_fscache_register(void);
-void ceph_fscache_unregister(void);
+#include <linux/fscache.h>
int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc);
void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc);
void ceph_fscache_register_inode_cookie(struct inode *inode);
void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci);
-void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp);
-void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci);
-static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
+void ceph_fscache_use_cookie(struct inode *inode, bool will_modify);
+void ceph_fscache_unuse_cookie(struct inode *inode, bool update);
+
+void ceph_fscache_update(struct inode *inode);
+void ceph_fscache_invalidate(struct inode *inode, bool dio_write);
+
+static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci)
{
- ci->fscache = NULL;
+ return netfs_i_cookie(&ci->netfs);
}
-static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci)
+static inline void ceph_fscache_resize(struct inode *inode, loff_t to)
{
- return ci->fscache;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct fscache_cookie *cookie = ceph_fscache_cookie(ci);
+
+ if (cookie) {
+ ceph_fscache_use_cookie(inode, true);
+ fscache_resize_cookie(cookie, to);
+ ceph_fscache_unuse_cookie(inode, true);
+ }
}
-static inline void ceph_fscache_invalidate(struct inode *inode)
+static inline void ceph_fscache_unpin_writeback(struct inode *inode,
+ struct writeback_control *wbc)
{
- fscache_invalidate(ceph_inode(inode)->fscache);
+ fscache_unpin_writeback(wbc, ceph_fscache_cookie(ceph_inode(inode)));
}
-static inline bool ceph_is_cache_enabled(struct inode *inode)
+static inline int ceph_fscache_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
- struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(inode));
+ struct ceph_inode_info *ci = ceph_inode(mapping->host);
- if (!cookie)
- return false;
- return fscache_cookie_enabled(cookie);
+ return fscache_dirty_folio(mapping, folio, ceph_fscache_cookie(ci));
}
-static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq)
+static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq)
{
struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(rreq->inode));
- return fscache_begin_read_operation(rreq, cookie);
+ return fscache_begin_read_operation(&rreq->cache_resources, cookie);
}
-#else
-static inline int ceph_fscache_register(void)
+static inline bool ceph_is_cache_enabled(struct inode *inode)
{
- return 0;
+ return fscache_cookie_enabled(ceph_fscache_cookie(ceph_inode(inode)));
}
-static inline void ceph_fscache_unregister(void)
+static inline void ceph_fscache_note_page_release(struct inode *inode)
{
-}
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ fscache_note_page_release(ceph_fscache_cookie(ci));
+}
+#else /* CONFIG_CEPH_FSCACHE */
static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc,
struct fs_context *fc)
{
@@ -77,41 +86,63 @@ static inline void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
{
}
-static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
+static inline void ceph_fscache_register_inode_cookie(struct inode *inode)
{
}
-static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci)
+static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
{
- return NULL;
}
-static inline void ceph_fscache_register_inode_cookie(struct inode *inode)
+static inline void ceph_fscache_use_cookie(struct inode *inode, bool will_modify)
{
}
-static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
+static inline void ceph_fscache_unuse_cookie(struct inode *inode, bool update)
{
}
-static inline void ceph_fscache_file_set_cookie(struct inode *inode,
- struct file *filp)
+static inline void ceph_fscache_update(struct inode *inode)
{
}
-static inline void ceph_fscache_invalidate(struct inode *inode)
+static inline void ceph_fscache_invalidate(struct inode *inode, bool dio_write)
{
}
+static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci)
+{
+ return NULL;
+}
+
+static inline void ceph_fscache_resize(struct inode *inode, loff_t to)
+{
+}
+
+static inline void ceph_fscache_unpin_writeback(struct inode *inode,
+ struct writeback_control *wbc)
+{
+}
+
+static inline int ceph_fscache_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
+{
+ return filemap_dirty_folio(mapping, folio);
+}
+
static inline bool ceph_is_cache_enabled(struct inode *inode)
{
return false;
}
-static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq)
+static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq)
{
return -ENOBUFS;
}
-#endif
-#endif /* _CEPH_CACHE_H */
+static inline void ceph_fscache_note_page_release(struct inode *inode)
+{
+}
+#endif /* CONFIG_CEPH_FSCACHE */
+
+#endif
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index b9460b6fb76f..fb023f9fafcb 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -492,7 +492,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
struct ceph_mount_options *opt = mdsc->fsc->mount_options;
ci->i_hold_caps_max = round_jiffies(jiffies +
opt->caps_wanted_delay_max * HZ);
- dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode,
+ dout("__cap_set_timeouts %p %lu\n", &ci->netfs.inode,
ci->i_hold_caps_max - jiffies);
}
@@ -507,7 +507,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
+ dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->netfs.inode,
ci->i_ceph_flags, ci->i_hold_caps_max);
if (!mdsc->stopping) {
spin_lock(&mdsc->cap_delay_lock);
@@ -531,7 +531,7 @@ no_change:
static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
+ dout("__cap_delay_requeue_front %p\n", &ci->netfs.inode);
spin_lock(&mdsc->cap_delay_lock);
ci->i_ceph_flags |= CEPH_I_FLUSH;
if (!list_empty(&ci->i_cap_delay_list))
@@ -548,7 +548,7 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
+ dout("__cap_delay_cancel %p\n", &ci->netfs.inode);
if (list_empty(&ci->i_cap_delay_list))
return;
spin_lock(&mdsc->cap_delay_lock);
@@ -568,7 +568,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
* Each time we receive FILE_CACHE anew, we increment
* i_rdcache_gen.
*/
- if (S_ISREG(ci->vfs_inode.i_mode) &&
+ if (S_ISREG(ci->netfs.inode.i_mode) &&
(issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
(had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {
ci->i_rdcache_gen++;
@@ -583,14 +583,14 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) {
if (issued & CEPH_CAP_FILE_SHARED)
atomic_inc(&ci->i_shared_gen);
- if (S_ISDIR(ci->vfs_inode.i_mode)) {
- dout(" marking %p NOT complete\n", &ci->vfs_inode);
+ if (S_ISDIR(ci->netfs.inode.i_mode)) {
+ dout(" marking %p NOT complete\n", &ci->netfs.inode);
__ceph_dir_clear_complete(ci);
}
}
/* Wipe saved layout if we're losing DIR_CREATE caps */
- if (S_ISDIR(ci->vfs_inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) &&
+ if (S_ISDIR(ci->netfs.inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) &&
!(issued & CEPH_CAP_DIR_CREATE)) {
ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
@@ -602,8 +602,8 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
* @ci: inode to be moved
* @session: new auth caps session
*/
-static void change_auth_cap_ses(struct ceph_inode_info *ci,
- struct ceph_mds_session *session)
+void change_auth_cap_ses(struct ceph_inode_info *ci,
+ struct ceph_mds_session *session)
{
lockdep_assert_held(&ci->i_ceph_lock);
@@ -754,6 +754,7 @@ void ceph_add_cap(struct inode *inode,
cap->issue_seq = seq;
cap->mseq = mseq;
cap->cap_gen = gen;
+ wake_up_all(&ci->i_cap_wq);
}
/*
@@ -771,7 +772,7 @@ static int __cap_is_valid(struct ceph_cap *cap)
if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
dout("__cap_is_valid %p cap %p issued %s "
- "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
+ "but STALE (gen %u vs %u)\n", &cap->ci->netfs.inode,
cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
return 0;
}
@@ -797,7 +798,7 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
if (!__cap_is_valid(cap))
continue;
dout("__ceph_caps_issued %p cap %p issued %s\n",
- &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
+ &ci->netfs.inode, cap, ceph_cap_string(cap->issued));
have |= cap->issued;
if (implemented)
*implemented |= cap->implemented;
@@ -844,12 +845,12 @@ static void __touch_cap(struct ceph_cap *cap)
spin_lock(&s->s_cap_lock);
if (!s->s_cap_iterator) {
- dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
+ dout("__touch_cap %p cap %p mds%d\n", &cap->ci->netfs.inode, cap,
s->s_mds);
list_move_tail(&cap->session_caps, &s->s_caps);
} else {
dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
- &cap->ci->vfs_inode, cap, s->s_mds);
+ &cap->ci->netfs.inode, cap, s->s_mds);
}
spin_unlock(&s->s_cap_lock);
}
@@ -867,7 +868,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
if ((have & mask) == mask) {
dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
- " (mask %s)\n", ceph_ino(&ci->vfs_inode),
+ " (mask %s)\n", ceph_ino(&ci->netfs.inode),
ceph_cap_string(have),
ceph_cap_string(mask));
return 1;
@@ -879,7 +880,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
continue;
if ((cap->issued & mask) == mask) {
dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
- " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap,
+ " (mask %s)\n", ceph_ino(&ci->netfs.inode), cap,
ceph_cap_string(cap->issued),
ceph_cap_string(mask));
if (touch)
@@ -891,7 +892,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
have |= cap->issued;
if ((have & mask) == mask) {
dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
- " (mask %s)\n", ceph_ino(&ci->vfs_inode),
+ " (mask %s)\n", ceph_ino(&ci->netfs.inode),
ceph_cap_string(cap->issued),
ceph_cap_string(mask));
if (touch) {
@@ -919,7 +920,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask,
int touch)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
int r;
r = __ceph_caps_issued_mask(ci, mask, touch);
@@ -950,7 +951,7 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
int ret;
spin_lock(&ci->i_ceph_lock);
@@ -969,8 +970,8 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
if (ci->i_rd_ref)
used |= CEPH_CAP_FILE_RD;
if (ci->i_rdcache_ref ||
- (S_ISREG(ci->vfs_inode.i_mode) &&
- ci->vfs_inode.i_data.nrpages))
+ (S_ISREG(ci->netfs.inode.i_mode) &&
+ ci->netfs.inode.i_data.nrpages))
used |= CEPH_CAP_FILE_CACHE;
if (ci->i_wr_ref)
used |= CEPH_CAP_FILE_WR;
@@ -993,11 +994,11 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR);
const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY);
struct ceph_mount_options *opt =
- ceph_inode_to_client(&ci->vfs_inode)->mount_options;
+ ceph_inode_to_client(&ci->netfs.inode)->mount_options;
unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ;
unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ;
- if (S_ISDIR(ci->vfs_inode.i_mode)) {
+ if (S_ISDIR(ci->netfs.inode.i_mode)) {
int want = 0;
/* use used_cutoff here, to keep dir's wanted caps longer */
@@ -1050,7 +1051,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
int __ceph_caps_wanted(struct ceph_inode_info *ci)
{
int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
- if (S_ISDIR(ci->vfs_inode.i_mode)) {
+ if (S_ISDIR(ci->netfs.inode.i_mode)) {
/* we want EXCL if holding caps of dir ops */
if (w & CEPH_CAP_ANY_DIR_OPS)
w |= CEPH_CAP_FILE_EXCL;
@@ -1116,9 +1117,9 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
lockdep_assert_held(&ci->i_ceph_lock);
- dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
+ dout("__ceph_remove_cap %p from %p\n", cap, &ci->netfs.inode);
- mdsc = ceph_inode_to_client(&ci->vfs_inode)->mdsc;
+ mdsc = ceph_inode_to_client(&ci->netfs.inode)->mdsc;
/* remove from inode's cap rbtree, and clear auth cap */
rb_erase(&cap->ci_node, &ci->i_caps);
@@ -1169,7 +1170,7 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
* keep i_snap_realm.
*/
if (ci->i_wr_ref == 0 && ci->i_snap_realm)
- ceph_change_snap_realm(&ci->vfs_inode, NULL);
+ ceph_change_snap_realm(&ci->netfs.inode, NULL);
__cap_delay_cancel(mdsc, ci);
}
@@ -1188,11 +1189,11 @@ void ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
lockdep_assert_held(&ci->i_ceph_lock);
- fsc = ceph_inode_to_client(&ci->vfs_inode);
+ fsc = ceph_inode_to_client(&ci->netfs.inode);
WARN_ON_ONCE(ci->i_auth_cap == cap &&
!list_empty(&ci->i_dirty_item) &&
!fsc->blocklisted &&
- !ceph_inode_is_shutdown(&ci->vfs_inode));
+ !ceph_inode_is_shutdown(&ci->netfs.inode));
__ceph_remove_cap(cap, queue_release);
}
@@ -1343,7 +1344,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
int flushing, u64 flush_tid, u64 oldest_flush_tid)
{
struct ceph_inode_info *ci = cap->ci;
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
int held, revoking;
lockdep_assert_held(&ci->i_ceph_lock);
@@ -1440,7 +1441,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
{
struct ceph_msg *msg;
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
if (!msg) {
@@ -1528,7 +1529,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
__releases(ci->i_ceph_lock)
__acquires(ci->i_ceph_lock)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = session->s_mdsc;
struct ceph_cap_snap *capsnap;
u64 oldest_flush_tid = 0;
@@ -1577,7 +1578,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
while (first_tid <= last_tid) {
struct ceph_cap *cap = ci->i_auth_cap;
- struct ceph_cap_flush *cf;
+ struct ceph_cap_flush *cf = NULL, *iter;
int ret;
if (!(cap && cap->session == session)) {
@@ -1587,8 +1588,9 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
}
ret = -ENOENT;
- list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
- if (cf->tid >= first_tid) {
+ list_for_each_entry(iter, &ci->i_cap_flush_list, i_list) {
+ if (iter->tid >= first_tid) {
+ cf = iter;
ret = 0;
break;
}
@@ -1621,7 +1623,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
void ceph_flush_snaps(struct ceph_inode_info *ci,
struct ceph_mds_session **psession)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_mds_session *session = NULL;
int mds;
@@ -1681,8 +1683,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
struct ceph_cap_flush **pcf)
{
struct ceph_mds_client *mdsc =
- ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
- struct inode *inode = &ci->vfs_inode;
+ ceph_sb_to_client(ci->netfs.inode.i_sb)->mdsc;
+ struct inode *inode = &ci->netfs.inode;
int was = ci->i_dirty_caps;
int dirty = 0;
@@ -1695,7 +1697,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
return 0;
}
- dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
+ dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->netfs.inode,
ceph_cap_string(mask), ceph_cap_string(was),
ceph_cap_string(was | mask));
ci->i_dirty_caps |= mask;
@@ -1711,7 +1713,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
ci->i_snap_realm->cached_context);
}
dout(" inode %p now dirty snapc %p auth cap %p\n",
- &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
+ &ci->netfs.inode, ci->i_head_snapc, ci->i_auth_cap);
BUG_ON(!list_empty(&ci->i_dirty_item));
spin_lock(&mdsc->cap_dirty_lock);
list_add(&ci->i_dirty_item, &session->s_cap_dirty);
@@ -1856,7 +1858,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
u32 invalidating_gen = ci->i_rdcache_gen;
spin_unlock(&ci->i_ceph_lock);
- ceph_fscache_invalidate(inode);
+ ceph_fscache_invalidate(inode, false);
invalidate_mapping_pages(&inode->i_data, 0, -1);
spin_lock(&ci->i_ceph_lock);
@@ -1874,7 +1876,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
bool __ceph_should_report_size(struct ceph_inode_info *ci)
{
- loff_t size = i_size_read(&ci->vfs_inode);
+ loff_t size = i_size_read(&ci->netfs.inode);
/* mds will adjust max size according to the reported size */
if (ci->i_flushing_caps & CEPH_CAP_FILE_WR)
return false;
@@ -1899,7 +1901,7 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_cap *cap;
u64 flush_tid, oldest_flush_tid;
@@ -1910,11 +1912,19 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct rb_node *p;
bool queue_invalidate = false;
bool tried_invalidate = false;
+ bool queue_writeback = false;
if (session)
ceph_get_mds_session(session);
spin_lock(&ci->i_ceph_lock);
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
+ /* Don't send messages until we get async create reply */
+ spin_unlock(&ci->i_ceph_lock);
+ ceph_put_mds_session(session);
+ return;
+ }
+
if (ci->i_ceph_flags & CEPH_I_FLUSH)
flags |= CHECK_CAPS_FLUSH;
retry:
@@ -1969,14 +1979,15 @@ retry:
}
dout("check_caps %llx.%llx file_want %s used %s dirty %s flushing %s"
- " issued %s revoking %s retain %s %s%s\n", ceph_vinop(inode),
+ " issued %s revoking %s retain %s %s%s%s\n", ceph_vinop(inode),
ceph_cap_string(file_wanted),
ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
ceph_cap_string(ci->i_flushing_caps),
ceph_cap_string(issued), ceph_cap_string(revoking),
ceph_cap_string(retain),
(flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
- (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
+ (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "",
+ (flags & CHECK_CAPS_NOINVAL) ? " NOINVAL" : "");
/*
* If we no longer need to hold onto old our caps, and we may
@@ -2055,10 +2066,27 @@ retry:
}
/* completed revocation? going down and there are no caps? */
- if (revoking && (revoking & cap_used) == 0) {
- dout("completed revocation of %s\n",
- ceph_cap_string(cap->implemented & ~cap->issued));
- goto ack;
+ if (revoking) {
+ if ((revoking & cap_used) == 0) {
+ dout("completed revocation of %s\n",
+ ceph_cap_string(cap->implemented & ~cap->issued));
+ goto ack;
+ }
+
+ /*
+ * If the "i_wrbuffer_ref" was increased by mmap or generic
+ * cache write just before the ceph_check_caps() is called,
+ * the Fb capability revoking will fail this time. Then we
+ * must wait for the BDI's delayed work to flush the dirty
+ * pages and to release the "i_wrbuffer_ref", which will cost
+ * at most 5 seconds. That means the MDS needs to wait at
+ * most 5 seconds to finished the Fb capability's revocation.
+ *
+ * Let's queue a writeback for it.
+ */
+ if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref &&
+ (revoking & CEPH_CAP_FILE_BUFFER))
+ queue_writeback = true;
}
/* want more caps from mds? */
@@ -2128,6 +2156,8 @@ ack:
spin_unlock(&ci->i_ceph_lock);
ceph_put_mds_session(session);
+ if (queue_writeback)
+ ceph_queue_writeback(inode);
if (queue_invalidate)
ceph_queue_invalidate(inode);
}
@@ -2211,13 +2241,14 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
}
/*
- * wait for any unsafe requests to complete.
+ * flush the mdlog and wait for any unsafe requests to complete.
*/
-static int unsafe_request_wait(struct inode *inode)
+static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req1 = NULL, *req2 = NULL;
+ unsigned int max_sessions;
int ret, err = 0;
spin_lock(&ci->i_unsafe_lock);
@@ -2236,36 +2267,46 @@ static int unsafe_request_wait(struct inode *inode)
spin_unlock(&ci->i_unsafe_lock);
/*
+ * The mdsc->max_sessions is unlikely to be changed
+ * mostly, here we will retry it by reallocating the
+ * sessions array memory to get rid of the mdsc->mutex
+ * lock.
+ */
+retry:
+ max_sessions = mdsc->max_sessions;
+
+ /*
* Trigger to flush the journal logs in all the relevant MDSes
* manually, or in the worst case we must wait at most 5 seconds
* to wait the journal logs to be flushed by the MDSes periodically.
*/
- if (req1 || req2) {
+ if ((req1 || req2) && likely(max_sessions)) {
struct ceph_mds_session **sessions = NULL;
struct ceph_mds_session *s;
struct ceph_mds_request *req;
- unsigned int max;
int i;
- /*
- * The mdsc->max_sessions is unlikely to be changed
- * mostly, here we will retry it by reallocating the
- * sessions arrary memory to get rid of the mdsc->mutex
- * lock.
- */
-retry:
- max = mdsc->max_sessions;
- sessions = krealloc(sessions, max * sizeof(s), __GFP_ZERO);
- if (!sessions)
- return -ENOMEM;
+ sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL);
+ if (!sessions) {
+ err = -ENOMEM;
+ goto out;
+ }
spin_lock(&ci->i_unsafe_lock);
if (req1) {
list_for_each_entry(req, &ci->i_unsafe_dirops,
r_unsafe_dir_item) {
s = req->r_session;
- if (unlikely(s->s_mds >= max)) {
+ if (!s)
+ continue;
+ if (unlikely(s->s_mds >= max_sessions)) {
spin_unlock(&ci->i_unsafe_lock);
+ for (i = 0; i < max_sessions; i++) {
+ s = sessions[i];
+ if (s)
+ ceph_put_mds_session(s);
+ }
+ kfree(sessions);
goto retry;
}
if (!sessions[s->s_mds]) {
@@ -2278,8 +2319,16 @@ retry:
list_for_each_entry(req, &ci->i_unsafe_iops,
r_unsafe_target_item) {
s = req->r_session;
- if (unlikely(s->s_mds >= max)) {
+ if (!s)
+ continue;
+ if (unlikely(s->s_mds >= max_sessions)) {
spin_unlock(&ci->i_unsafe_lock);
+ for (i = 0; i < max_sessions; i++) {
+ s = sessions[i];
+ if (s)
+ ceph_put_mds_session(s);
+ }
+ kfree(sessions);
goto retry;
}
if (!sessions[s->s_mds]) {
@@ -2300,7 +2349,7 @@ retry:
spin_unlock(&ci->i_ceph_lock);
/* send flush mdlog request to MDSes */
- for (i = 0; i < max; i++) {
+ for (i = 0; i < max_sessions; i++) {
s = sessions[i];
if (s) {
send_flush_mdlog(s);
@@ -2310,22 +2359,26 @@ retry:
kfree(sessions);
}
- dout("unsafe_request_wait %p wait on tid %llu %llu\n",
+ dout("%s %p wait on tid %llu %llu\n", __func__,
inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
if (req1) {
ret = !wait_for_completion_timeout(&req1->r_safe_completion,
ceph_timeout_jiffies(req1->r_timeout));
if (ret)
err = -EIO;
- ceph_mdsc_put_request(req1);
}
if (req2) {
ret = !wait_for_completion_timeout(&req2->r_safe_completion,
ceph_timeout_jiffies(req2->r_timeout));
if (ret)
err = -EIO;
- ceph_mdsc_put_request(req2);
}
+
+out:
+ if (req1)
+ ceph_mdsc_put_request(req1);
+ if (req2)
+ ceph_mdsc_put_request(req2);
return err;
}
@@ -2350,7 +2403,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
dirty = try_flush_caps(inode, &flush_tid);
dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
- err = unsafe_request_wait(inode);
+ err = flush_mdlog_and_wait_inode_unsafe_requests(inode);
/*
* only wait on non-file metadata writeback (the mds
@@ -2388,7 +2441,11 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync);
dout("write_inode %p wait=%d\n", inode, wait);
+ ceph_fscache_unpin_writeback(inode, wbc);
if (wait) {
+ err = ceph_wait_on_async_create(inode);
+ if (err)
+ return err;
dirty = try_flush_caps(inode, &flush_tid);
if (dirty)
err = wait_event_interruptible(ci->i_cap_wq,
@@ -2412,13 +2469,17 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
__releases(ci->i_ceph_lock)
__acquires(ci->i_ceph_lock)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_cap *cap;
struct ceph_cap_flush *cf;
int ret;
u64 first_tid = 0;
u64 last_snap_flush = 0;
+ /* Don't do anything until create reply comes in */
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE)
+ return;
+
ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) {
@@ -2501,7 +2562,7 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
cap = ci->i_auth_cap;
if (!(cap && cap->session == session)) {
pr_err("%p auth cap %p not mds%d ???\n",
- &ci->vfs_inode, cap, session->s_mds);
+ &ci->netfs.inode, cap, session->s_mds);
spin_unlock(&ci->i_ceph_lock);
continue;
}
@@ -2551,7 +2612,7 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
cap = ci->i_auth_cap;
if (!(cap && cap->session == session)) {
pr_err("%p auth cap %p not mds%d ???\n",
- &ci->vfs_inode, cap, session->s_mds);
+ &ci->netfs.inode, cap, session->s_mds);
spin_unlock(&ci->i_ceph_lock);
continue;
}
@@ -2571,7 +2632,7 @@ void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
lockdep_assert_held(&ci->i_ceph_lock);
- dout("%s %p flushing %s\n", __func__, &ci->vfs_inode,
+ dout("%s %p flushing %s\n", __func__, &ci->netfs.inode,
ceph_cap_string(ci->i_flushing_caps));
if (!list_empty(&ci->i_cap_flush_list)) {
@@ -2614,10 +2675,10 @@ void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
}
if (got & CEPH_CAP_FILE_BUFFER) {
if (ci->i_wb_ref == 0)
- ihold(&ci->vfs_inode);
+ ihold(&ci->netfs.inode);
ci->i_wb_ref++;
dout("%s %p wb %d -> %d (?)\n", __func__,
- &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref);
+ &ci->netfs.inode, ci->i_wb_ref-1, ci->i_wb_ref);
}
}
@@ -2699,13 +2760,17 @@ again:
* on transition from wanted -> needed caps. This is needed
* for WRBUFFER|WR -> WR to avoid a new WR sync write from
* going before a prior buffered writeback happens.
+ *
+ * For RDCACHE|RD -> RD, there is not need to wait and we can
+ * just exclude the revoking caps and force to sync read.
*/
int not = want & ~(have & need);
int revoking = implemented & ~have;
+ int exclude = revoking & not;
dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
inode, ceph_cap_string(have), ceph_cap_string(not),
ceph_cap_string(revoking));
- if ((revoking & not) == 0) {
+ if (!exclude || !(exclude & CEPH_CAP_FILE_BUFFER)) {
if (!snap_rwsem_locked &&
!ci->i_head_snapc &&
(need & CEPH_CAP_FILE_WR)) {
@@ -2727,7 +2792,7 @@ again:
snap_rwsem_locked = true;
}
if ((have & want) == want)
- *got = need | want;
+ *got = need | (want & ~exclude);
else
*got = need;
ceph_take_cap_refs(ci, *got, true);
@@ -2945,8 +3010,8 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got
return ret;
}
- if (S_ISREG(ci->vfs_inode.i_mode) &&
- ci->i_inline_version != CEPH_INLINE_NONE &&
+ if (S_ISREG(ci->netfs.inode.i_mode) &&
+ ceph_has_inline_data(ci) &&
(_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
i_size_read(inode) > 0) {
struct page *page =
@@ -3035,7 +3100,7 @@ enum put_cap_refs_mode {
static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
enum put_cap_refs_mode mode)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
int last = 0, put = 0, flushsnaps = 0, wake = 0;
bool check_flushsnaps = false;
@@ -3143,11 +3208,10 @@ void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had)
void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc)
{
- struct inode *inode = &ci->vfs_inode;
- struct ceph_cap_snap *capsnap = NULL;
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_cap_snap *capsnap = NULL, *iter;
int put = 0;
bool last = false;
- bool found = false;
bool flush_snaps = false;
bool complete_capsnap = false;
@@ -3174,14 +3238,14 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
last ? " LAST" : "");
} else {
- list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- if (capsnap->context == snapc) {
- found = true;
+ list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
+ if (iter->context == snapc) {
+ capsnap = iter;
break;
}
}
- if (!found) {
+ if (!capsnap) {
/*
* The capsnap should already be removed when removing
* auth cap in the case of a forced unmount.
@@ -3375,8 +3439,7 @@ static void handle_cap_grant(struct inode *inode,
if ((newcaps & CEPH_CAP_LINK_SHARED) &&
(extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) {
set_nlink(inode, le32_to_cpu(grant->nlink));
- if (inode->i_nlink == 0 &&
- (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
+ if (inode->i_nlink == 0)
deleted_inode = true;
}
@@ -3492,6 +3555,9 @@ static void handle_cap_grant(struct inode *inode,
check_caps = 1; /* check auth cap only */
else
check_caps = 2; /* check all caps */
+ /* If there is new caps, try to wake up the waiters */
+ if (~cap->issued & newcaps)
+ wake = true;
cap->issued = newcaps;
cap->implemented |= newcaps;
} else if (cap->issued == newcaps) {
@@ -3521,24 +3587,23 @@ static void handle_cap_grant(struct inode *inode,
fill_inline = true;
}
- if (ci->i_auth_cap == cap &&
- le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
- if (newcaps & ~extra_info->issued)
- wake = true;
+ if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
+ if (ci->i_auth_cap == cap) {
+ if (newcaps & ~extra_info->issued)
+ wake = true;
- if (ci->i_requested_max_size > max_size ||
- !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) {
- /* re-request max_size if necessary */
- ci->i_requested_max_size = 0;
- wake = true;
- }
+ if (ci->i_requested_max_size > max_size ||
+ !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) {
+ /* re-request max_size if necessary */
+ ci->i_requested_max_size = 0;
+ wake = true;
+ }
- ceph_kick_flushing_inode_caps(session, ci);
- spin_unlock(&ci->i_ceph_lock);
+ ceph_kick_flushing_inode_caps(session, ci);
+ }
up_read(&session->s_mdsc->snap_rwsem);
- } else {
- spin_unlock(&ci->i_ceph_lock);
}
+ spin_unlock(&ci->i_ceph_lock);
if (fill_inline)
ceph_fill_inline_data(inode, NULL, extra_info->inline_data,
@@ -3641,7 +3706,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
session->s_mds,
&list_first_entry(&session->s_cap_flushing,
struct ceph_inode_info,
- i_flushing_item)->vfs_inode);
+ i_flushing_item)->netfs.inode);
}
}
mdsc->num_cap_flushing--;
@@ -3732,8 +3797,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
u64 follows = le64_to_cpu(m->snap_follows);
- struct ceph_cap_snap *capsnap;
- bool flushed = false;
+ struct ceph_cap_snap *capsnap = NULL, *iter;
bool wake_ci = false;
bool wake_mdsc = false;
@@ -3741,26 +3805,26 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
inode, ci, session->s_mds, follows);
spin_lock(&ci->i_ceph_lock);
- list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- if (capsnap->follows == follows) {
- if (capsnap->cap_flush.tid != flush_tid) {
+ list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
+ if (iter->follows == follows) {
+ if (iter->cap_flush.tid != flush_tid) {
dout(" cap_snap %p follows %lld tid %lld !="
- " %lld\n", capsnap, follows,
- flush_tid, capsnap->cap_flush.tid);
+ " %lld\n", iter, follows,
+ flush_tid, iter->cap_flush.tid);
break;
}
- flushed = true;
+ capsnap = iter;
break;
} else {
dout(" skipping cap_snap %p follows %lld\n",
- capsnap, capsnap->follows);
+ iter, iter->follows);
}
}
- if (flushed)
+ if (capsnap)
ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc);
spin_unlock(&ci->i_ceph_lock);
- if (flushed) {
+ if (capsnap) {
ceph_put_snap_context(capsnap->context);
ceph_put_cap_snap(capsnap);
if (wake_ci)
@@ -3837,6 +3901,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n",
inode, ci, mds, mseq, target);
retry:
+ down_read(&mdsc->snap_rwsem);
spin_lock(&ci->i_ceph_lock);
cap = __get_cap_for_mds(ci, mds);
if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id))
@@ -3900,6 +3965,7 @@ retry:
}
spin_unlock(&ci->i_ceph_lock);
+ up_read(&mdsc->snap_rwsem);
mutex_unlock(&session->s_mutex);
/* open target session */
@@ -3925,6 +3991,7 @@ retry:
out_unlock:
spin_unlock(&ci->i_ceph_lock);
+ up_read(&mdsc->snap_rwsem);
mutex_unlock(&session->s_mutex);
if (tsession) {
mutex_unlock(&tsession->s_mutex);
@@ -4133,7 +4200,6 @@ void ceph_handle_caps(struct ceph_mds_session *session,
/* lookup ino */
inode = ceph_find_inode(mdsc->fsc->sb, vino);
- ci = ceph_inode(inode);
dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
vino.snap, inode);
@@ -4159,6 +4225,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
}
goto flush_cap_releases;
}
+ ci = ceph_inode(inode);
/* these will work even if we don't have a cap yet */
switch (op) {
@@ -4286,7 +4353,7 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
break;
list_del_init(&ci->i_cap_delay_list);
- inode = igrab(&ci->vfs_inode);
+ inode = igrab(&ci->netfs.inode);
if (inode) {
spin_unlock(&mdsc->cap_delay_lock);
dout("check_delayed_caps on %p\n", inode);
@@ -4314,10 +4381,11 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s)
while (!list_empty(&s->s_cap_dirty)) {
ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info,
i_dirty_item);
- inode = &ci->vfs_inode;
+ inode = &ci->netfs.inode;
ihold(inode);
dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode));
spin_unlock(&mdsc->cap_dirty_lock);
+ ceph_wait_on_async_create(inode);
ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
iput(inode);
spin_lock(&mdsc->cap_dirty_lock);
@@ -4348,9 +4416,9 @@ void __ceph_touch_fmode(struct ceph_inode_info *ci,
void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb);
int bits = (fmode << 1) | 1;
- bool is_opened = false;
+ bool already_opened = false;
int i;
if (count == 1)
@@ -4358,19 +4426,19 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
spin_lock(&ci->i_ceph_lock);
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
- if (bits & (1 << i))
- ci->i_nr_by_mode[i] += count;
-
/*
- * If any of the mode ref is larger than 1,
+ * If any of the mode ref is larger than 0,
* that means it has been already opened by
* others. Just skip checking the PIN ref.
*/
- if (i && ci->i_nr_by_mode[i] > 1)
- is_opened = true;
+ if (i && ci->i_nr_by_mode[i])
+ already_opened = true;
+
+ if (bits & (1 << i))
+ ci->i_nr_by_mode[i] += count;
}
- if (!is_opened)
+ if (!already_opened)
percpu_counter_inc(&mdsc->metric.opened_inodes);
spin_unlock(&ci->i_ceph_lock);
}
@@ -4382,7 +4450,7 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
*/
void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb);
int bits = (fmode << 1) | 1;
bool is_closed = true;
int i;
@@ -4597,7 +4665,7 @@ int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invali
lockdep_assert_held(&ci->i_ceph_lock);
dout("removing cap %p, ci is %p, inode is %p\n",
- cap, ci, &ci->vfs_inode);
+ cap, ci, &ci->netfs.inode);
is_auth = (cap == ci->i_auth_cap);
__ceph_remove_cap(cap, false);
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 3cf7c9c1085b..bec3c4549c07 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -175,7 +175,7 @@ static int metrics_latency_show(struct seq_file *s, void *p)
struct ceph_fs_client *fsc = s->private;
struct ceph_client_metric *cm = &fsc->mdsc->metric;
struct ceph_metric *m;
- s64 total, sum, avg, min, max, sq;
+ s64 total, avg, min, max, sq;
int i;
seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n");
@@ -185,8 +185,7 @@ static int metrics_latency_show(struct seq_file *s, void *p)
m = &cm->metric[i];
spin_lock(&m->lock);
total = m->total;
- sum = m->latency_sum;
- avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0;
+ avg = m->latency_avg;
min = m->latency_min;
max = m->latency_max;
sq = m->latency_sq_sum;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 133dbd9338e7..e7e2ebac330d 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -145,7 +145,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
return ERR_PTR(-EAGAIN);
}
/* reading/filling the cache are serialized by
- i_mutex, no need to use page lock */
+ i_rwsem, no need to use page lock */
unlock_page(cache_ctl->page);
cache_ctl->dentries = kmap(cache_ctl->page);
}
@@ -155,7 +155,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
rcu_read_lock();
spin_lock(&parent->d_lock);
/* check i_size again here, because empty directory can be
- * marked as complete while not holding the i_mutex. */
+ * marked as complete while not holding the i_rwsem. */
if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir))
dentry = cache_ctl->dentries[cache_ctl->index];
else
@@ -478,8 +478,11 @@ more:
2 : (fpos_off(rde->offset) + 1);
err = note_last_dentry(dfi, rde->name, rde->name_len,
next_offset);
- if (err)
+ if (err) {
+ ceph_mdsc_put_request(dfi->last_readdir);
+ dfi->last_readdir = NULL;
return err;
+ }
} else if (req->r_reply_info.dir_end) {
dfi->next_offset = 2;
/* keep last name */
@@ -520,6 +523,12 @@ more:
if (!dir_emit(ctx, rde->name, rde->name_len,
ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
le32_to_cpu(rde->inode.in->mode) >> 12)) {
+ /*
+ * NOTE: Here no need to put the 'dfi->last_readdir',
+ * because when dir_emit stops us it's most likely
+ * doesn't have enough memory, etc. So for next readdir
+ * it will continue.
+ */
dout("filldir stopping us...\n");
return 0;
}
@@ -671,7 +680,7 @@ struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
struct dentry *dentry)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
- struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */
+ struct inode *parent = d_inode(dentry->d_parent); /* we hold i_rwsem */
/* .snap dir? */
if (ceph_snap(parent) == CEPH_NOSNAP &&
@@ -847,6 +856,10 @@ static int ceph_mknod(struct user_namespace *mnt_userns, struct inode *dir,
if (ceph_snap(dir) != CEPH_NOSNAP)
return -EROFS;
+ err = ceph_wait_on_conflict_unlink(dentry);
+ if (err)
+ return err;
+
if (ceph_quota_is_max_files_exceeded(dir)) {
err = -EDQUOT;
goto out;
@@ -909,6 +922,10 @@ static int ceph_symlink(struct user_namespace *mnt_userns, struct inode *dir,
if (ceph_snap(dir) != CEPH_NOSNAP)
return -EROFS;
+ err = ceph_wait_on_conflict_unlink(dentry);
+ if (err)
+ return err;
+
if (ceph_quota_is_max_files_exceeded(dir)) {
err = -EDQUOT;
goto out;
@@ -959,9 +976,13 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
- int err = -EROFS;
+ int err;
int op;
+ err = ceph_wait_on_conflict_unlink(dentry);
+ if (err)
+ return err;
+
if (ceph_snap(dir) == CEPH_SNAPDIR) {
/* mkdir .snap/foo is a MKSNAP */
op = CEPH_MDS_OP_MKSNAP;
@@ -971,6 +992,7 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
op = CEPH_MDS_OP_MKDIR;
} else {
+ err = -EROFS;
goto out;
}
@@ -1028,6 +1050,10 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
struct ceph_mds_request *req;
int err;
+ err = ceph_wait_on_conflict_unlink(dentry);
+ if (err)
+ return err;
+
if (ceph_snap(dir) != CEPH_NOSNAP)
return -EROFS;
@@ -1062,9 +1088,27 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
+ struct dentry *dentry = req->r_dentry;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
int result = req->r_err ? req->r_err :
le32_to_cpu(req->r_reply_info.head->result);
+ if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
+ pr_warn("%s dentry %p:%pd async unlink bit is not set\n",
+ __func__, dentry, dentry);
+
+ spin_lock(&fsc->async_unlink_conflict_lock);
+ hash_del_rcu(&di->hnode);
+ spin_unlock(&fsc->async_unlink_conflict_lock);
+
+ spin_lock(&dentry->d_lock);
+ di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
+ wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT);
+ spin_unlock(&dentry->d_lock);
+
+ synchronize_rcu();
+
if (result == -EJUKEBOX)
goto out;
@@ -1072,7 +1116,7 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
if (result) {
int pathlen = 0;
u64 base = 0;
- char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+ char *path = ceph_mdsc_build_path(dentry, &pathlen,
&base, 0);
/* mark error on parent + clear complete */
@@ -1080,13 +1124,13 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
ceph_dir_clear_complete(req->r_parent);
/* drop the dentry -- we don't know its status */
- if (!d_unhashed(req->r_dentry))
- d_drop(req->r_dentry);
+ if (!d_unhashed(dentry))
+ d_drop(dentry);
/* mark inode itself for an error (since metadata is bogus) */
mapping_set_error(req->r_old_inode->i_mapping, result);
- pr_warn("ceph: async unlink failure path=(%llx)%s result=%d!\n",
+ pr_warn("async unlink failure path=(%llx)%s result=%d!\n",
base, IS_ERR(path) ? "<<bad>>" : path, result);
ceph_mdsc_free_path(path, pathlen);
}
@@ -1171,6 +1215,8 @@ retry:
if (try_async && op == CEPH_MDS_OP_UNLINK &&
(req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+
dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir),
dentry->d_name.len, dentry->d_name.name,
ceph_cap_string(req->r_dir_caps));
@@ -1178,6 +1224,16 @@ retry:
req->r_callback = ceph_async_unlink_cb;
req->r_old_inode = d_inode(dentry);
ihold(req->r_old_inode);
+
+ spin_lock(&dentry->d_lock);
+ di->flags |= CEPH_DENTRY_ASYNC_UNLINK;
+ spin_unlock(&dentry->d_lock);
+
+ spin_lock(&fsc->async_unlink_conflict_lock);
+ hash_add_rcu(fsc->async_unlink_conflict, &di->hnode,
+ dentry->d_name.hash);
+ spin_unlock(&fsc->async_unlink_conflict_lock);
+
err = ceph_mdsc_submit_request(mdsc, dir, req);
if (!err) {
/*
@@ -1186,10 +1242,20 @@ retry:
*/
drop_nlink(inode);
d_delete(dentry);
- } else if (err == -EJUKEBOX) {
- try_async = false;
- ceph_mdsc_put_request(req);
- goto retry;
+ } else {
+ spin_lock(&fsc->async_unlink_conflict_lock);
+ hash_del_rcu(&di->hnode);
+ spin_unlock(&fsc->async_unlink_conflict_lock);
+
+ spin_lock(&dentry->d_lock);
+ di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
+ spin_unlock(&dentry->d_lock);
+
+ if (err == -EJUKEBOX) {
+ try_async = false;
+ ceph_mdsc_put_request(req);
+ goto retry;
+ }
}
} else {
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
@@ -1228,6 +1294,10 @@ static int ceph_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
(!ceph_quota_is_same_realm(old_dir, new_dir)))
return -EXDEV;
+ err = ceph_wait_on_conflict_unlink(new_dentry);
+ if (err)
+ return err;
+
dout("rename dir %p dentry %p to dir %p dentry %p\n",
old_dir, old_dentry, new_dir, new_dentry);
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index e0fa66ac8b9f..f780e4e0d062 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -181,6 +181,7 @@ struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino)
static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
{
struct inode *inode = __lookup_inode(sb, ino);
+ struct ceph_inode_info *ci = ceph_inode(inode);
int err;
if (IS_ERR(inode))
@@ -192,7 +193,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
return ERR_PTR(err);
}
/* -ESTALE if inode as been unlinked and no file is open */
- if ((inode->i_nlink == 0) && (atomic_read(&inode->i_count) == 1)) {
+ if ((inode->i_nlink == 0) && !__ceph_is_file_opened(ci)) {
iput(inode);
return ERR_PTR(-ESTALE);
}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 02a0a0fd9ccd..04fd34557de8 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -95,12 +95,11 @@ static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize,
size_t start;
int idx = 0;
- bytes = iov_iter_get_pages(iter, pages, maxsize - size,
+ bytes = iov_iter_get_pages2(iter, pages, maxsize - size,
ITER_GET_BVECS_PAGES, &start);
if (bytes < 0)
return size ?: bytes;
- iov_iter_advance(iter, bytes);
size += bytes;
for ( ; bytes; idx++, bvec_idx++) {
@@ -204,7 +203,10 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
int fmode, bool isdir)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mount_options *opt =
+ ceph_inode_to_client(&ci->netfs.inode)->mount_options;
struct ceph_file_info *fi;
+ int ret;
dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
inode->i_mode, isdir ? "dir" : "regular");
@@ -225,6 +227,9 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
if (!fi)
return -ENOMEM;
+ if (opt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
+ fi->flags |= CEPH_F_SYNC;
+
file->private_data = fi;
}
@@ -235,7 +240,21 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
INIT_LIST_HEAD(&fi->rw_contexts);
fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen);
+ if ((file->f_mode & FMODE_WRITE) && ceph_has_inline_data(ci)) {
+ ret = ceph_uninline_data(file);
+ if (ret < 0)
+ goto error;
+ }
+
return 0;
+
+error:
+ ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE);
+ ceph_put_fmode(ci, fi->fmode, 1);
+ kmem_cache_free(ceph_file_cachep, fi);
+ /* wake up anyone waiting for caps on this inode */
+ wake_up_all(&ci->i_cap_wq);
+ return ret;
}
/*
@@ -248,8 +267,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
- ceph_fscache_register_inode_cookie(inode);
- ceph_fscache_file_set_cookie(inode, file);
+ ceph_fscache_use_cookie(inode, file->f_mode & FMODE_WRITE);
fallthrough;
case S_IFDIR:
ret = ceph_init_file_info(inode, file, fmode,
@@ -512,52 +530,67 @@ static void restore_deleg_ino(struct inode *dir, u64 ino)
}
}
+static void wake_async_create_waiters(struct inode *inode,
+ struct ceph_mds_session *session)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
+ ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE;
+ wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT);
+ }
+ ceph_kick_flushing_inode_caps(session, ci);
+ spin_unlock(&ci->i_ceph_lock);
+}
+
static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
+ struct dentry *dentry = req->r_dentry;
+ struct inode *dinode = d_inode(dentry);
+ struct inode *tinode = req->r_target_inode;
int result = req->r_err ? req->r_err :
le32_to_cpu(req->r_reply_info.head->result);
+ WARN_ON_ONCE(dinode && tinode && dinode != tinode);
+
+ /* MDS changed -- caller must resubmit */
if (result == -EJUKEBOX)
goto out;
mapping_set_error(req->r_parent->i_mapping, result);
if (result) {
- struct dentry *dentry = req->r_dentry;
- struct inode *inode = d_inode(dentry);
int pathlen = 0;
u64 base = 0;
char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
&base, 0);
+ pr_warn("async create failure path=(%llx)%s result=%d!\n",
+ base, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path(path, pathlen);
+
ceph_dir_clear_complete(req->r_parent);
if (!d_unhashed(dentry))
d_drop(dentry);
- ceph_inode_shutdown(inode);
-
- pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n",
- base, IS_ERR(path) ? "<<bad>>" : path, result);
- ceph_mdsc_free_path(path, pathlen);
+ if (dinode) {
+ mapping_set_error(dinode->i_mapping, result);
+ ceph_inode_shutdown(dinode);
+ wake_async_create_waiters(dinode, req->r_session);
+ }
}
- if (req->r_target_inode) {
- struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
- u64 ino = ceph_vino(req->r_target_inode).ino;
+ if (tinode) {
+ u64 ino = ceph_vino(tinode).ino;
if (req->r_deleg_ino != ino)
pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n",
__func__, req->r_err, req->r_deleg_ino, ino);
- mapping_set_error(req->r_target_inode->i_mapping, result);
- spin_lock(&ci->i_ceph_lock);
- if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
- ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE;
- wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT);
- }
- ceph_kick_flushing_inode_caps(req->r_session, ci);
- spin_unlock(&ci->i_ceph_lock);
+ mapping_set_error(tinode->i_mapping, result);
+ wake_async_create_waiters(tinode, req->r_session);
} else if (!result) {
pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__,
req->r_deleg_ino);
@@ -577,8 +610,10 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
struct ceph_mds_reply_inode in = { };
struct ceph_mds_reply_info_in iinfo = { .in = &in };
struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
struct inode *inode;
struct timespec64 now;
+ struct ceph_string *pool_ns;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_vino vino = { .ino = req->r_deleg_ino,
.snap = CEPH_NOSNAP };
@@ -593,9 +628,15 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
iinfo.change_attr = 1;
ceph_encode_timespec64(&iinfo.btime, &now);
- iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
- iinfo.xattr_data = xattr_buf;
- memset(iinfo.xattr_data, 0, iinfo.xattr_len);
+ if (req->r_pagelist) {
+ iinfo.xattr_len = req->r_pagelist->length;
+ iinfo.xattr_data = req->r_pagelist->mapped_tail;
+ } else {
+ /* fake it */
+ iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
+ iinfo.xattr_data = xattr_buf;
+ memset(iinfo.xattr_data, 0, iinfo.xattr_len);
+ }
in.ino = cpu_to_le64(vino.ino);
in.snapid = cpu_to_le64(CEPH_NOSNAP);
@@ -605,17 +646,31 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino);
in.cap.flags = CEPH_CAP_FLAG_AUTH;
in.ctime = in.mtime = in.atime = iinfo.btime;
- in.mode = cpu_to_le32((u32)mode);
in.truncate_seq = cpu_to_le32(1);
in.truncate_size = cpu_to_le64(-1ULL);
in.xattr_version = cpu_to_le64(1);
in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid()));
- in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_mode & S_ISGID ?
- dir->i_gid : current_fsgid()));
+ if (dir->i_mode & S_ISGID) {
+ in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_gid));
+
+ /* Directories always inherit the setgid bit. */
+ if (S_ISDIR(mode))
+ mode |= S_ISGID;
+ } else {
+ in.gid = cpu_to_le32(from_kgid(&init_user_ns, current_fsgid()));
+ }
+ in.mode = cpu_to_le32((u32)mode);
+
in.nlink = cpu_to_le32(1);
in.max_size = cpu_to_le64(lo->stripe_unit);
ceph_file_layout_to_legacy(lo, &in.layout);
+ /* lo is private, so pool_ns can't change */
+ pool_ns = rcu_dereference_raw(lo->pool_ns);
+ if (pool_ns) {
+ iinfo.pool_ns_len = pool_ns->len;
+ iinfo.pool_ns_data = pool_ns->str;
+ }
down_read(&mdsc->snap_rwsem);
ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session,
@@ -654,6 +709,12 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
file->f_mode |= FMODE_CREATED;
ret = finish_open(file, dentry, ceph_open);
}
+
+ spin_lock(&dentry->d_lock);
+ di->flags &= ~CEPH_DENTRY_ASYNC_CREATE;
+ wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT);
+ spin_unlock(&dentry->d_lock);
+
return ret;
}
@@ -680,6 +741,15 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
if (dentry->d_name.len > NAME_MAX)
return -ENAMETOOLONG;
+ err = ceph_wait_on_conflict_unlink(dentry);
+ if (err)
+ return err;
+ /*
+ * Do not truncate the file, since atomic_open is called before the
+ * permission check. The caller will do the truncation afterward.
+ */
+ flags &= ~O_TRUNC;
+
if (flags & O_CREAT) {
if (ceph_quota_is_max_files_exceeded(dir))
return -EDQUOT;
@@ -689,6 +759,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
err = ceph_security_init_secctx(dentry, mode, &as_ctx);
if (err < 0)
goto out_ctx;
+ /* Async create can't handle more than a page of xattrs */
+ if (as_ctx.pagelist &&
+ !list_is_singular(&as_ctx.pagelist->head))
+ try_async = false;
} else if (!d_in_lookup(dentry)) {
/* If it's not being looked up, it's negative */
return -ENOENT;
@@ -722,9 +796,16 @@ retry:
(req->r_dir_caps =
try_prep_async_create(dir, dentry, &lo,
&req->r_deleg_ino))) {
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+
set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL);
req->r_callback = ceph_async_create_cb;
+
+ spin_lock(&dentry->d_lock);
+ di->flags |= CEPH_DENTRY_ASYNC_CREATE;
+ spin_unlock(&dentry->d_lock);
+
err = ceph_mdsc_submit_request(mdsc, dir, req);
if (!err) {
err = ceph_finish_async_create(dir, dentry,
@@ -734,16 +815,16 @@ retry:
restore_deleg_ino(dir, req->r_deleg_ino);
ceph_mdsc_put_request(req);
try_async = false;
+ ceph_put_string(rcu_dereference_raw(lo.pool_ns));
goto retry;
}
+ ceph_put_string(rcu_dereference_raw(lo.pool_ns));
goto out_req;
}
}
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
- err = ceph_mdsc_do_request(mdsc,
- (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
- req);
+ err = ceph_mdsc_do_request(mdsc, (flags & O_CREAT) ? dir : NULL, req);
if (err == -ENOENT) {
dentry = ceph_handle_snapdir(req, dentry);
if (IS_ERR(dentry)) {
@@ -810,6 +891,7 @@ int ceph_release(struct inode *inode, struct file *file)
dout("release inode %p regular file %p\n", inode, file);
WARN_ON(!list_empty(&fi->rw_contexts));
+ ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE);
ceph_put_fmode(ci, fi->fmode, 1);
kmem_cache_free(ceph_file_cachep, fi);
@@ -847,7 +929,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
ssize_t ret;
u64 off = iocb->ki_pos;
u64 len = iov_iter_count(to);
- u64 i_size;
+ u64 i_size = i_size_read(inode);
dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
(file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
@@ -898,9 +980,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
false, false);
- ret = ceph_osdc_start_request(osdc, req, false);
- if (!ret)
- ret = ceph_osdc_wait_request(osdc, req);
+ ceph_osdc_start_request(osdc, req);
+ ret = ceph_osdc_wait_request(osdc, req);
ceph_update_read_metrics(&fsc->mdsc->metric,
req->r_start_latency,
@@ -1015,7 +1096,6 @@ static void ceph_aio_complete(struct inode *inode,
}
spin_lock(&ci->i_ceph_lock);
- ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&aio_req->prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
@@ -1164,7 +1244,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
req->r_inode = inode;
req->r_priv = aio_req;
- ret = ceph_osdc_start_request(req->r_osdc, req, false);
+ ceph_osdc_start_request(req->r_osdc, req);
out:
if (ret < 0) {
req->r_result = ret;
@@ -1196,7 +1276,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
size_t count = iov_iter_count(iter);
loff_t pos = iocb->ki_pos;
bool write = iov_iter_rw(iter) == WRITE;
- bool should_dirty = !write && iter_is_iovec(iter);
+ bool should_dirty = !write && user_backed_iter(iter);
if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS;
@@ -1206,7 +1286,11 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
snapc, snapc ? snapc->seq : 0);
if (write) {
- int ret2 = invalidate_inode_pages2_range(inode->i_mapping,
+ int ret2;
+
+ ceph_fscache_invalidate(inode, true);
+
+ ret2 = invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_SHIFT,
(pos + count - 1) >> PAGE_SHIFT);
if (ret2 < 0)
@@ -1297,9 +1381,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
continue;
}
- ret = ceph_osdc_start_request(req->r_osdc, req, false);
- if (!ret)
- ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ ceph_osdc_start_request(req->r_osdc, req);
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
if (write)
ceph_update_write_metrics(metric, req->r_start_latency,
@@ -1362,8 +1445,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
r_private_item);
list_del_init(&req->r_private_item);
if (ret >= 0)
- ret = ceph_osdc_start_request(req->r_osdc,
- req, false);
+ ceph_osdc_start_request(req->r_osdc, req);
if (ret < 0) {
req->r_result = ret;
ceph_aio_complete_req(req);
@@ -1417,6 +1499,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
if (ret < 0)
return ret;
+ ceph_fscache_invalidate(inode, false);
ret = invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_SHIFT,
(pos + count - 1) >> PAGE_SHIFT);
@@ -1475,9 +1558,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
false, true);
req->r_mtime = mtime;
- ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
- if (!ret)
- ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ ceph_osdc_start_request(&fsc->client->osdc, req);
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, len, ret);
@@ -1524,7 +1606,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct ceph_inode_info *ci = ceph_inode(inode);
bool direct_lock = iocb->ki_flags & IOCB_DIRECT;
ssize_t ret;
- int want, got = 0;
+ int want = 0, got = 0;
int retry_op = 0, read = 0;
again:
@@ -1539,13 +1621,14 @@ again:
else
ceph_start_io_read(inode);
+ if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
+ want |= CEPH_CAP_FILE_CACHE;
if (fi->fmode & CEPH_FILE_MODE_LAZY)
- want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
- else
- want = CEPH_CAP_FILE_CACHE;
+ want |= CEPH_CAP_FILE_LAZYIO;
+
ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got);
if (ret < 0) {
- if (iocb->ki_flags & IOCB_DIRECT)
+ if (direct_lock)
ceph_end_io_direct(inode);
else
ceph_end_io_read(inode);
@@ -1560,7 +1643,7 @@ again:
inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
ceph_cap_string(got));
- if (ci->i_inline_version == CEPH_INLINE_NONE) {
+ if (!ceph_has_inline_data(ci)) {
if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
ret = ceph_direct_read_write(iocb, to,
NULL, NULL);
@@ -1679,7 +1762,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_cap_flush *prealloc_cf;
ssize_t count, written = 0;
- int err, want, got;
+ int err, want = 0, got;
bool direct_lock = false;
u32 map_flags;
u64 pool_flags;
@@ -1746,18 +1829,12 @@ retry_snap:
if (err)
goto out;
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- err = ceph_uninline_data(file, NULL);
- if (err < 0)
- goto out;
- }
-
dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
inode, ceph_vinop(inode), pos, count, i_size_read(inode));
+ if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
+ want |= CEPH_CAP_FILE_BUFFER;
if (fi->fmode & CEPH_FILE_MODE_LAZY)
- want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
- else
- want = CEPH_CAP_FILE_BUFFER;
+ want |= CEPH_CAP_FILE_LAZYIO;
got = 0;
err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got);
if (err < 0)
@@ -1813,7 +1890,7 @@ retry_snap:
* are pending vmtruncate. So write and vmtruncate
* can not run at the same time
*/
- written = generic_perform_write(file, from, pos);
+ written = generic_perform_write(iocb, from);
if (likely(written >= 0))
iocb->ki_pos = pos + written;
ceph_end_io_write(inode);
@@ -1823,14 +1900,13 @@ retry_snap:
int dirty;
spin_lock(&ci->i_ceph_lock);
- ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
}
dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
@@ -1870,57 +1946,15 @@ out_unlocked:
*/
static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
{
- struct inode *inode = file->f_mapping->host;
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- loff_t i_size;
- loff_t ret;
-
- inode_lock(inode);
-
if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
+ struct inode *inode = file_inode(file);
+ int ret;
+
ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
if (ret < 0)
- goto out;
- }
-
- i_size = i_size_read(inode);
- switch (whence) {
- case SEEK_END:
- offset += i_size;
- break;
- case SEEK_CUR:
- /*
- * Here we special-case the lseek(fd, 0, SEEK_CUR)
- * position-querying operation. Avoid rewriting the "same"
- * f_pos value back to the file because a concurrent read(),
- * write() or lseek() might have altered it
- */
- if (offset == 0) {
- ret = file->f_pos;
- goto out;
- }
- offset += file->f_pos;
- break;
- case SEEK_DATA:
- if (offset < 0 || offset >= i_size) {
- ret = -ENXIO;
- goto out;
- }
- break;
- case SEEK_HOLE:
- if (offset < 0 || offset >= i_size) {
- ret = -ENXIO;
- goto out;
- }
- offset = i_size;
- break;
+ return ret;
}
-
- ret = vfs_setpos(file, offset, max(i_size, fsc->max_file_size));
-
-out:
- inode_unlock(inode);
- return ret;
+ return generic_file_llseek(file, offset, whence);
}
static inline void ceph_zero_partial_page(
@@ -1989,12 +2023,10 @@ static int ceph_zero_partial_object(struct inode *inode,
}
req->r_mtime = inode->i_mtime;
- ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
- if (!ret) {
- ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
- if (ret == -ENOENT)
- ret = 0;
- }
+ ceph_osdc_start_request(&fsc->client->osdc, req);
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ if (ret == -ENOENT)
+ ret = 0;
ceph_osdc_put_request(req);
out:
@@ -2077,12 +2109,6 @@ static long ceph_fallocate(struct file *file, int mode,
goto unlock;
}
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- ret = ceph_uninline_data(file, NULL);
- if (ret < 0)
- goto unlock;
- }
-
size = i_size_read(inode);
/* Are we punching a hole beyond EOF? */
@@ -2101,12 +2127,12 @@ static long ceph_fallocate(struct file *file, int mode,
goto unlock;
filemap_invalidate_lock(inode->i_mapping);
+ ceph_fscache_invalidate(inode, false);
ceph_zero_pagecache_range(inode, offset, length);
ret = ceph_zero_objects(inode, offset, length);
if (!ret) {
spin_lock(&ci->i_ceph_lock);
- ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
@@ -2302,7 +2328,7 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off
if (IS_ERR(req))
ret = PTR_ERR(req);
else {
- ceph_osdc_start_request(osdc, req, false);
+ ceph_osdc_start_request(osdc, req);
ret = ceph_osdc_wait_request(osdc, req);
ceph_update_copyfrom_metrics(&fsc->mdsc->metric,
req->r_start_latency,
@@ -2425,6 +2451,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
goto out_caps;
/* Drop dst file cached pages */
+ ceph_fscache_invalidate(dst_inode, false);
ret = invalidate_inode_pages2_range(dst_inode->i_mapping,
dst_off >> PAGE_SHIFT,
(dst_off + len) >> PAGE_SHIFT);
@@ -2494,11 +2521,11 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
/* Let the MDS know about dst file size change */
if (ceph_inode_set_size(dst_inode, dst_off) ||
ceph_quota_is_max_bytes_approaching(dst_inode, dst_off))
- ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY, NULL);
+ ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH,
+ NULL);
}
/* Mark Fw dirty */
spin_lock(&dst_ci->i_ceph_lock);
- dst_ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf);
spin_unlock(&dst_ci->i_ceph_lock);
if (dirty)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e3322fcb2e8d..4af5e55abc15 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -87,13 +87,13 @@ struct inode *ceph_get_snapdir(struct inode *parent)
if (!S_ISDIR(parent->i_mode)) {
pr_warn_once("bad snapdir parent type (mode=0%o)\n",
parent->i_mode);
- return ERR_PTR(-ENOTDIR);
+ goto err;
}
if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) {
pr_warn_once("bad snapdir inode type (mode=0%o)\n",
inode->i_mode);
- return ERR_PTR(-ENOTDIR);
+ goto err;
}
inode->i_mode = parent->i_mode;
@@ -113,6 +113,12 @@ struct inode *ceph_get_snapdir(struct inode *parent)
}
return inode;
+err:
+ if ((inode->i_state & I_NEW))
+ discard_new_inode(inode);
+ else
+ iput(inode);
+ return ERR_PTR(-ENOTDIR);
}
const struct inode_operations ceph_file_iops = {
@@ -170,7 +176,7 @@ static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
rb_insert_color(&frag->node, &ci->i_fragtree);
dout("get_or_create_frag added %llx.%llx frag %x\n",
- ceph_vinop(&ci->vfs_inode), f);
+ ceph_vinop(&ci->netfs.inode), f);
return frag;
}
@@ -356,7 +362,7 @@ static int ceph_fill_fragtree(struct inode *inode,
if (nsplits != ci->i_fragtree_nsplits) {
update = true;
} else if (nsplits) {
- i = prandom_u32() % nsplits;
+ i = prandom_u32_max(nsplits);
id = le32_to_cpu(fragtree->splits[i].frag);
if (!__ceph_find_frag(ci, id))
update = true;
@@ -447,11 +453,14 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
struct ceph_inode_info *ci;
int i;
- ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
+ ci = alloc_inode_sb(sb, ceph_inode_cachep, GFP_NOFS);
if (!ci)
return NULL;
- dout("alloc_inode %p\n", &ci->vfs_inode);
+ dout("alloc_inode %p\n", &ci->netfs.inode);
+
+ /* Set parameters for the netfs library */
+ netfs_inode_init(&ci->netfs, &ceph_netfs_ops);
spin_lock_init(&ci->i_ceph_lock);
@@ -538,10 +547,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_WORK(&ci->i_work, ceph_inode_work);
ci->i_work_mask = 0;
memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
-
- ceph_fscache_inode_init(ci);
-
- return &ci->vfs_inode;
+ return &ci->netfs.inode;
}
void ceph_free_inode(struct inode *inode)
@@ -564,13 +570,15 @@ void ceph_evict_inode(struct inode *inode)
percpu_counter_dec(&mdsc->metric.total_inodes);
truncate_inode_pages_final(&inode->i_data);
+ if (inode->i_state & I_PINNING_FSCACHE_WB)
+ ceph_fscache_unuse_cookie(inode, true);
clear_inode(inode);
ceph_fscache_unregister_inode_cookie(ci);
__ceph_remove_caps(ci);
- if (__ceph_has_any_quota(ci))
+ if (__ceph_has_quota(ci, QUOTA_GET_ANY))
ceph_adjust_quota_realms_count(inode, false);
/*
@@ -634,6 +642,12 @@ int ceph_fill_file_size(struct inode *inode, int issued,
}
i_size_write(inode, size);
inode->i_blocks = calc_inode_blocks(size);
+ /*
+ * If we're expanding, then we should be able to just update
+ * the existing cookie.
+ */
+ if (size > isize)
+ ceph_fscache_update(inode);
ci->i_reported_size = size;
if (truncate_seq != ci->i_truncate_seq) {
dout("truncate_seq %u -> %u\n",
@@ -666,10 +680,6 @@ int ceph_fill_file_size(struct inode *inode, int issued,
truncate_size);
ci->i_truncate_size = truncate_size;
}
-
- if (queue_trunc)
- ceph_fscache_invalidate(inode);
-
return queue_trunc;
}
@@ -1039,7 +1049,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
iinfo->inline_version >= ci->i_inline_version) {
int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
ci->i_inline_version = iinfo->inline_version;
- if (ci->i_inline_version != CEPH_INLINE_NONE &&
+ if (ceph_has_inline_data(ci) &&
(locked_page || (info_caps & cache_caps)))
fill_inline = true;
}
@@ -1053,6 +1063,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
spin_unlock(&ci->i_ceph_lock);
+ ceph_fscache_register_inode_cookie(inode);
+
if (fill_inline)
ceph_fill_inline_data(inode, locked_page,
iinfo->inline_data, iinfo->inline_len);
@@ -1195,7 +1207,7 @@ out_unlock:
/*
* splice a dentry to an inode.
- * caller must hold directory i_mutex for this to be safe.
+ * caller must hold directory i_rwsem for this to be safe.
*/
static int splice_dentry(struct dentry **pdn, struct inode *in)
{
@@ -1454,10 +1466,12 @@ retry_lookup:
} else if (have_lease) {
if (d_unhashed(dn))
d_add(dn, NULL);
+ }
+
+ if (!d_unhashed(dn) && have_lease)
update_dentry_lease(dir, dn,
rinfo->dlease, session,
req->r_request_started);
- }
goto done;
}
@@ -1592,7 +1606,7 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
return idx == 0 ? -ENOMEM : 0;
}
/* reading/filling the cache are serialized by
- * i_mutex, no need to use page lock */
+ * i_rwsem, no need to use page lock */
unlock_page(ctl->page);
ctl->dentries = kmap(ctl->page);
if (idx == 0)
@@ -1814,11 +1828,13 @@ bool ceph_inode_set_size(struct inode *inode, loff_t size)
spin_lock(&ci->i_ceph_lock);
dout("set_size %p %llu -> %llu\n", inode, i_size_read(inode), size);
i_size_write(inode, size);
+ ceph_fscache_update(inode);
inode->i_blocks = calc_inode_blocks(size);
ret = __ceph_should_report_size(ci);
spin_unlock(&ci->i_ceph_lock);
+
return ret;
}
@@ -1844,6 +1860,8 @@ static void ceph_do_invalidate_pages(struct inode *inode)
u32 orig_gen;
int check = 0;
+ ceph_fscache_invalidate(inode, false);
+
mutex_lock(&ci->i_truncate_mutex);
if (ceph_inode_is_shutdown(inode)) {
@@ -1868,7 +1886,6 @@ static void ceph_do_invalidate_pages(struct inode *inode)
orig_gen = ci->i_rdcache_gen;
spin_unlock(&ci->i_ceph_lock);
- ceph_fscache_invalidate(inode);
if (invalidate_inode_pages2(inode->i_mapping) < 0) {
pr_err("invalidate_inode_pages2 %llx.%llx failed\n",
ceph_vinop(inode));
@@ -1937,6 +1954,7 @@ retry:
ci->i_truncate_pending, to);
spin_unlock(&ci->i_ceph_lock);
+ ceph_fscache_resize(inode, to);
truncate_pagecache(inode, to);
spin_lock(&ci->i_ceph_lock);
@@ -1960,7 +1978,7 @@ static void ceph_inode_work(struct work_struct *work)
{
struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
i_work);
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) {
dout("writeback %p\n", inode);
@@ -2174,6 +2192,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
&prealloc_cf);
inode->i_ctime = attr->ia_ctime;
+ inode_inc_iversion_raw(inode);
}
release &= issued;
@@ -2184,7 +2203,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
if (inode_dirty_flags)
__mark_inode_dirty(inode, inode_dirty_flags);
-
if (mask) {
req->r_inode = inode;
ihold(inode);
@@ -2242,6 +2260,36 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return err;
}
+int ceph_try_to_choose_auth_mds(struct inode *inode, int mask)
+{
+ int issued = ceph_caps_issued(ceph_inode(inode));
+
+ /*
+ * If any 'x' caps is issued we can just choose the auth MDS
+ * instead of the random replica MDSes. Because only when the
+ * Locker is in LOCK_EXEC state will the loner client could
+ * get the 'x' caps. And if we send the getattr requests to
+ * any replica MDS it must auth pin and tries to rdlock from
+ * the auth MDS, and then the auth MDS need to do the Locker
+ * state transition to LOCK_SYNC. And after that the lock state
+ * will change back.
+ *
+ * This cost much when doing the Locker state transition and
+ * usually will need to revoke caps from clients.
+ *
+ * And for the 'Xs' caps for getxattr we will also choose the
+ * auth MDS, because the MDS side code is buggy due to setxattr
+ * won't notify the replica MDSes when the values changed and
+ * the replica MDS will return the old values. Though we will
+ * fix it in MDS code, but this still makes sense for old ceph.
+ */
+ if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL))
+ || (mask & (CEPH_STAT_RSTAT | CEPH_STAT_CAP_XATTR)))
+ return USE_AUTH_MDS;
+ else
+ return USE_ANY_MDS;
+}
+
/*
* Verify that we have a lease on the given mask. If not,
* do a getattr against an mds.
@@ -2265,7 +2313,7 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1))
return 0;
- mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
+ mode = ceph_try_to_choose_auth_mds(inode, mask);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -2280,7 +2328,8 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
if (inline_version == 0) {
/* the reply is supposed to contain inline data */
err = -EINVAL;
- } else if (inline_version == CEPH_INLINE_NONE) {
+ } else if (inline_version == CEPH_INLINE_NONE ||
+ inline_version == 1) {
err = -ENODATA;
} else {
err = req->r_reply_info.targeti.inline_len;
@@ -2291,6 +2340,58 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
return err;
}
+int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
+ size_t size)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_request *req;
+ int mode = USE_AUTH_MDS;
+ int err;
+ char *xattr_value;
+ size_t xattr_value_len;
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode);
+ if (IS_ERR(req)) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ req->r_feature_needed = CEPHFS_FEATURE_OP_GETVXATTR;
+ req->r_path2 = kstrdup(name, GFP_NOFS);
+ if (!req->r_path2) {
+ err = -ENOMEM;
+ goto put;
+ }
+
+ ihold(inode);
+ req->r_inode = inode;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err < 0)
+ goto put;
+
+ xattr_value = req->r_reply_info.xattr_info.xattr_value;
+ xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
+
+ dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
+
+ err = (int)xattr_value_len;
+ if (size == 0)
+ goto put;
+
+ if (xattr_value_len > size) {
+ err = -ERANGE;
+ goto put;
+ }
+
+ memcpy(value, xattr_value, xattr_value_len);
+put:
+ ceph_mdsc_put_request(req);
+out:
+ dout("do_getvxattr result=%d\n", err);
+ return err;
+}
+
/*
* Check inode permissions. We verify we have a valid value for
@@ -2348,6 +2449,7 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct kstat *stat, u32 request_mask, unsigned int flags)
{
struct inode *inode = d_inode(path->dentry);
+ struct super_block *sb = inode->i_sb;
struct ceph_inode_info *ci = ceph_inode(inode);
u32 valid_mask = STATX_BASIC_STATS;
int err = 0;
@@ -2356,7 +2458,7 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path,
return -ESTALE;
/* Skip the getattr altogether if we're asked not to sync */
- if (!(flags & AT_STATX_DONT_SYNC)) {
+ if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) {
err = ceph_do_getattr(inode,
statx_to_caps(request_mask, inode->i_mode),
flags & AT_STATX_FORCE_SYNC);
@@ -2377,16 +2479,34 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path,
}
if (ceph_snap(inode) == CEPH_NOSNAP)
- stat->dev = inode->i_sb->s_dev;
+ stat->dev = sb->s_dev;
else
stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
if (S_ISDIR(inode->i_mode)) {
- if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
- RBYTES))
+ if (ceph_test_mount_opt(ceph_sb_to_client(sb), RBYTES)) {
stat->size = ci->i_rbytes;
- else
+ } else if (ceph_snap(inode) == CEPH_SNAPDIR) {
+ struct ceph_inode_info *pci;
+ struct ceph_snap_realm *realm;
+ struct inode *parent;
+
+ parent = ceph_lookup_inode(sb, ceph_ino(inode));
+ if (!parent)
+ return PTR_ERR(parent);
+
+ pci = ceph_inode(parent);
+ spin_lock(&pci->i_ceph_lock);
+ realm = pci->i_snap_realm;
+ if (realm)
+ stat->size = realm->num_snaps;
+ else
+ stat->size = 0;
+ spin_unlock(&pci->i_ceph_lock);
+ iput(parent);
+ } else {
stat->size = ci->i_files + ci->i_subdirs;
+ }
stat->blocks = 0;
stat->blksize = 65536;
/*
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index d1f154aec249..3e2843e86e27 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -111,10 +111,10 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
req->r_args.filelock_change.length = cpu_to_le64(length);
req->r_args.filelock_change.wait = wait;
- if (wait)
- req->r_wait_for_completion = ceph_lock_wait_for_completion;
-
- err = ceph_mdsc_do_request(mdsc, inode, req);
+ err = ceph_mdsc_submit_request(mdsc, inode, req);
+ if (!err)
+ err = ceph_mdsc_wait_request(mdsc, req, wait ?
+ ceph_lock_wait_for_completion : NULL);
if (!err && operation == CEPH_MDS_OP_GETFILELOCK) {
fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 250aad330a10..26a0a8b9975e 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -437,7 +437,7 @@ static int ceph_parse_deleg_inos(void **p, void *end,
ceph_decode_32_safe(p, end, sets, bad);
dout("got %u sets of delegated inodes\n", sets);
while (sets--) {
- u64 start, len, ino;
+ u64 start, len;
ceph_decode_64_safe(p, end, start, bad);
ceph_decode_64_safe(p, end, len, bad);
@@ -449,14 +449,14 @@ static int ceph_parse_deleg_inos(void **p, void *end,
continue;
}
while (len--) {
- int err = xa_insert(&s->s_delegated_inos, ino = start++,
+ int err = xa_insert(&s->s_delegated_inos, start++,
DELEGATED_INO_AVAILABLE,
GFP_KERNEL);
if (!err) {
dout("added delegated inode 0x%llx\n",
start - 1);
} else if (err == -EBUSY) {
- pr_warn("ceph: MDS delegated inode 0x%llx more than once.\n",
+ pr_warn("MDS delegated inode 0x%llx more than once.\n",
start - 1);
} else {
return err;
@@ -555,6 +555,28 @@ bad:
return -EIO;
}
+static int parse_reply_info_getvxattr(void **p, void *end,
+ struct ceph_mds_reply_info_parsed *info,
+ u64 features)
+{
+ u32 value_len;
+
+ ceph_decode_skip_8(p, end, bad); /* skip current version: 1 */
+ ceph_decode_skip_8(p, end, bad); /* skip first version: 1 */
+ ceph_decode_skip_32(p, end, bad); /* skip payload length */
+
+ ceph_decode_32_safe(p, end, value_len, bad);
+
+ if (value_len == end - *p) {
+ info->xattr_info.xattr_value = *p;
+ info->xattr_info.xattr_value_len = value_len;
+ *p = end;
+ return value_len;
+ }
+bad:
+ return -EIO;
+}
+
/*
* parse extra results
*/
@@ -570,6 +592,8 @@ static int parse_reply_info_extra(void **p, void *end,
return parse_reply_info_readdir(p, end, info, features);
else if (op == CEPH_MDS_OP_CREATE)
return parse_reply_info_create(p, end, info, features, s);
+ else if (op == CEPH_MDS_OP_GETVXATTR)
+ return parse_reply_info_getvxattr(p, end, info, features);
else
return -EIO;
}
@@ -631,6 +655,79 @@ static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
}
+/*
+ * In async unlink case the kclient won't wait for the first reply
+ * from MDS and just drop all the links and unhash the dentry and then
+ * succeeds immediately.
+ *
+ * For any new create/link/rename,etc requests followed by using the
+ * same file names we must wait for the first reply of the inflight
+ * unlink request, or the MDS possibly will fail these following
+ * requests with -EEXIST if the inflight async unlink request was
+ * delayed for some reasons.
+ *
+ * And the worst case is that for the none async openc request it will
+ * successfully open the file if the CDentry hasn't been unlinked yet,
+ * but later the previous delayed async unlink request will remove the
+ * CDenty. That means the just created file is possiblly deleted later
+ * by accident.
+ *
+ * We need to wait for the inflight async unlink requests to finish
+ * when creating new files/directories by using the same file names.
+ */
+int ceph_wait_on_conflict_unlink(struct dentry *dentry)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+ struct dentry *pdentry = dentry->d_parent;
+ struct dentry *udentry, *found = NULL;
+ struct ceph_dentry_info *di;
+ struct qstr dname;
+ u32 hash = dentry->d_name.hash;
+ int err;
+
+ dname.name = dentry->d_name.name;
+ dname.len = dentry->d_name.len;
+
+ rcu_read_lock();
+ hash_for_each_possible_rcu(fsc->async_unlink_conflict, di,
+ hnode, hash) {
+ udentry = di->dentry;
+
+ spin_lock(&udentry->d_lock);
+ if (udentry->d_name.hash != hash)
+ goto next;
+ if (unlikely(udentry->d_parent != pdentry))
+ goto next;
+ if (!hash_hashed(&di->hnode))
+ goto next;
+
+ if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
+ pr_warn("%s dentry %p:%pd async unlink bit is not set\n",
+ __func__, dentry, dentry);
+
+ if (!d_same_name(udentry, pdentry, &dname))
+ goto next;
+
+ spin_unlock(&udentry->d_lock);
+ found = dget(udentry);
+ break;
+next:
+ spin_unlock(&udentry->d_lock);
+ }
+ rcu_read_unlock();
+
+ if (likely(!found))
+ return 0;
+
+ dout("%s dentry %p:%pd conflict with old %p:%pd\n", __func__,
+ dentry, dentry, found, found);
+
+ err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT,
+ TASK_KILLABLE);
+ dput(found);
+ return err;
+}
+
/*
* sessions
@@ -1196,14 +1293,17 @@ static int encode_supported_features(void **p, void *end)
if (count > 0) {
size_t i;
size_t size = FEATURE_BYTES(count);
+ unsigned long bit;
if (WARN_ON_ONCE(*p + 4 + size > end))
return -ERANGE;
ceph_encode_32(p, size);
memset(*p, 0, size);
- for (i = 0; i < count; i++)
- ((unsigned char*)(*p))[i / 8] |= BIT(feature_bits[i] % 8);
+ for (i = 0; i < count; i++) {
+ bit = feature_bits[i];
+ ((unsigned char *)(*p))[bit / 8] |= BIT(bit % 8);
+ }
*p += size;
} else {
if (WARN_ON_ONCE(*p + 4 > end))
@@ -1540,7 +1640,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
p = session->s_caps.next;
while (p != &session->s_caps) {
cap = list_entry(p, struct ceph_cap, session_caps);
- inode = igrab(&cap->ci->vfs_inode);
+ inode = igrab(&cap->ci->netfs.inode);
if (!inode) {
p = p->next;
continue;
@@ -1598,7 +1698,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
int iputs;
dout("removing cap %p, ci is %p, inode is %p\n",
- cap, ci, &ci->vfs_inode);
+ cap, ci, &ci->netfs.inode);
spin_lock(&ci->i_ceph_lock);
iputs = ceph_purge_inode_cap(inode, cap, &invalidate);
spin_unlock(&ci->i_ceph_lock);
@@ -2178,7 +2278,8 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
order = get_order(size * num_entries);
while (order >= 0) {
rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL |
- __GFP_NOWARN,
+ __GFP_NOWARN |
+ __GFP_ZERO,
order);
if (rinfo->dir_entries)
break;
@@ -2217,6 +2318,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
INIT_LIST_HEAD(&req->r_unsafe_dir_item);
INIT_LIST_HEAD(&req->r_unsafe_target_item);
req->r_fmode = -1;
+ req->r_feature_needed = -1;
kref_init(&req->r_kref);
RB_CLEAR_NODE(&req->r_node);
INIT_LIST_HEAD(&req->r_wait);
@@ -2626,7 +2728,28 @@ static int __prepare_send_request(struct ceph_mds_session *session,
struct ceph_mds_client *mdsc = session->s_mdsc;
struct ceph_mds_request_head_old *rhead;
struct ceph_msg *msg;
- int flags = 0;
+ int flags = 0, max_retry;
+
+ /*
+ * The type of 'r_attempts' in kernel 'ceph_mds_request'
+ * is 'int', while in 'ceph_mds_request_head' the type of
+ * 'num_retry' is '__u8'. So in case the request retries
+ * exceeding 256 times, the MDS will receive a incorrect
+ * retry seq.
+ *
+ * In this case it's ususally a bug in MDS and continue
+ * retrying the request makes no sense.
+ *
+ * In future this could be fixed in ceph code, so avoid
+ * using the hardcode here.
+ */
+ max_retry = sizeof_field(struct ceph_mds_request_head, num_retry);
+ max_retry = 1 << (max_retry * BITS_PER_BYTE);
+ if (req->r_attempts >= max_retry) {
+ pr_warn_ratelimited("%s request tid %llu seq overflow\n",
+ __func__, req->r_tid);
+ return -EMULTIHOP;
+ }
req->r_attempts++;
if (req->r_inode) {
@@ -2638,7 +2761,7 @@ static int __prepare_send_request(struct ceph_mds_session *session,
else
req->r_sent_on_mseq = -1;
}
- dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
+ dout("%s %p tid %lld %s (attempt %d)\n", __func__, req,
req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
@@ -2794,6 +2917,16 @@ static void __do_request(struct ceph_mds_client *mdsc,
dout("do_request mds%d session %p state %s\n", mds, session,
ceph_session_state_name(session->s_state));
+
+ /*
+ * The old ceph will crash the MDSs when see unknown OPs
+ */
+ if (req->r_feature_needed > 0 &&
+ !test_bit(req->r_feature_needed, &session->s_features)) {
+ err = -EOPNOTSUPP;
+ goto out_session;
+ }
+
if (session->s_state != CEPH_MDS_SESSION_OPEN &&
session->s_state != CEPH_MDS_SESSION_HUNG) {
/*
@@ -2838,6 +2971,64 @@ static void __do_request(struct ceph_mds_client *mdsc,
if (req->r_request_started == 0) /* note request start time */
req->r_request_started = jiffies;
+ /*
+ * For async create we will choose the auth MDS of frag in parent
+ * directory to send the request and ususally this works fine, but
+ * if the migrated the dirtory to another MDS before it could handle
+ * it the request will be forwarded.
+ *
+ * And then the auth cap will be changed.
+ */
+ if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) && req->r_num_fwd) {
+ struct ceph_dentry_info *di = ceph_dentry(req->r_dentry);
+ struct ceph_inode_info *ci;
+ struct ceph_cap *cap;
+
+ /*
+ * The request maybe handled very fast and the new inode
+ * hasn't been linked to the dentry yet. We need to wait
+ * for the ceph_finish_async_create(), which shouldn't be
+ * stuck too long or fail in thoery, to finish when forwarding
+ * the request.
+ */
+ if (!d_inode(req->r_dentry)) {
+ err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT,
+ TASK_KILLABLE);
+ if (err) {
+ mutex_lock(&req->r_fill_mutex);
+ set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
+ mutex_unlock(&req->r_fill_mutex);
+ goto out_session;
+ }
+ }
+
+ ci = ceph_inode(d_inode(req->r_dentry));
+
+ spin_lock(&ci->i_ceph_lock);
+ cap = ci->i_auth_cap;
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE && mds != cap->mds) {
+ dout("do_request session changed for auth cap %d -> %d\n",
+ cap->session->s_mds, session->s_mds);
+
+ /* Remove the auth cap from old session */
+ spin_lock(&cap->session->s_cap_lock);
+ cap->session->s_nr_caps--;
+ list_del_init(&cap->session_caps);
+ spin_unlock(&cap->session->s_cap_lock);
+
+ /* Add the auth cap to the new session */
+ cap->mds = mds;
+ cap->session = session;
+ spin_lock(&session->s_cap_lock);
+ session->s_nr_caps++;
+ list_add_tail(&cap->session_caps, &session->s_caps);
+ spin_unlock(&session->s_cap_lock);
+
+ change_auth_cap_ses(ci, session);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ }
+
err = __send_request(session, req, false);
out_session:
@@ -2946,15 +3137,16 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
return err;
}
-static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
- struct ceph_mds_request *req)
+int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req,
+ ceph_mds_request_wait_callback_t wait_func)
{
int err;
/* wait */
dout("do_request waiting\n");
- if (!req->r_timeout && req->r_wait_for_completion) {
- err = req->r_wait_for_completion(mdsc, req);
+ if (wait_func) {
+ err = wait_func(mdsc, req);
} else {
long timeleft = wait_for_completion_killable_timeout(
&req->r_completion,
@@ -3011,7 +3203,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
/* issue */
err = ceph_mdsc_submit_request(mdsc, dir, req);
if (!err)
- err = ceph_mdsc_wait_request(mdsc, req);
+ err = ceph_mdsc_wait_request(mdsc, req, NULL);
dout("do_request %p done, result %d\n", req, err);
return err;
}
@@ -3097,35 +3289,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
result = le32_to_cpu(head->result);
- /*
- * Handle an ESTALE
- * if we're not talking to the authority, send to them
- * if the authority has changed while we weren't looking,
- * send to new authority
- * Otherwise we just have to return an ESTALE
- */
- if (result == -ESTALE) {
- dout("got ESTALE on request %llu\n", req->r_tid);
- req->r_resend_mds = -1;
- if (req->r_direct_mode != USE_AUTH_MDS) {
- dout("not using auth, setting for that now\n");
- req->r_direct_mode = USE_AUTH_MDS;
- __do_request(mdsc, req);
- mutex_unlock(&mdsc->mutex);
- goto out;
- } else {
- int mds = __choose_mds(mdsc, req, NULL);
- if (mds >= 0 && mds != req->r_session->s_mds) {
- dout("but auth changed, so resending\n");
- __do_request(mdsc, req);
- mutex_unlock(&mdsc->mutex);
- goto out;
- }
- }
- dout("have to return ESTALE on request %llu\n", req->r_tid);
- }
-
-
if (head->safe) {
set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags);
__unregister_request(mdsc, req);
@@ -3268,6 +3431,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
int err = -EINVAL;
void *p = msg->front.iov_base;
void *end = p + msg->front.iov_len;
+ bool aborted = false;
ceph_decode_need(&p, end, 2*sizeof(u32), bad);
next_mds = ceph_decode_32(&p);
@@ -3276,16 +3440,41 @@ static void handle_forward(struct ceph_mds_client *mdsc,
mutex_lock(&mdsc->mutex);
req = lookup_get_request(mdsc, tid);
if (!req) {
+ mutex_unlock(&mdsc->mutex);
dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
- goto out; /* dup reply? */
+ return; /* dup reply? */
}
if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
dout("forward tid %llu aborted, unregistering\n", tid);
__unregister_request(mdsc, req);
} else if (fwd_seq <= req->r_num_fwd) {
- dout("forward tid %llu to mds%d - old seq %d <= %d\n",
- tid, next_mds, req->r_num_fwd, fwd_seq);
+ /*
+ * The type of 'num_fwd' in ceph 'MClientRequestForward'
+ * is 'int32_t', while in 'ceph_mds_request_head' the
+ * type is '__u8'. So in case the request bounces between
+ * MDSes exceeding 256 times, the client will get stuck.
+ *
+ * In this case it's ususally a bug in MDS and continue
+ * bouncing the request makes no sense.
+ *
+ * In future this could be fixed in ceph code, so avoid
+ * using the hardcode here.
+ */
+ int max = sizeof_field(struct ceph_mds_request_head, num_fwd);
+ max = 1 << (max * BITS_PER_BYTE);
+ if (req->r_num_fwd >= max) {
+ mutex_lock(&req->r_fill_mutex);
+ req->r_err = -EMULTIHOP;
+ set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
+ mutex_unlock(&req->r_fill_mutex);
+ aborted = true;
+ pr_warn_ratelimited("forward tid %llu seq overflow\n",
+ tid);
+ } else {
+ dout("forward tid %llu to mds%d - old seq %d <= %d\n",
+ tid, next_mds, req->r_num_fwd, fwd_seq);
+ }
} else {
/* resend. forward race not possible; mds would drop */
dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
@@ -3297,9 +3486,12 @@ static void handle_forward(struct ceph_mds_client *mdsc,
put_request_session(req);
__do_request(mdsc, req);
}
- ceph_mdsc_put_request(req);
-out:
mutex_unlock(&mdsc->mutex);
+
+ /* kick calling process */
+ if (aborted)
+ complete_request(mdsc, req);
+ ceph_mdsc_put_request(req);
return;
bad:
@@ -3378,13 +3570,17 @@ static void handle_session(struct ceph_mds_session *session,
}
if (msg_version >= 5) {
- u32 flags;
- /* version >= 4, struct_v, struct_cv, len, metric_spec */
- ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 2, bad);
+ u32 flags, len;
+
+ /* version >= 4 */
+ ceph_decode_skip_16(&p, end, bad); /* struct_v, struct_cv */
+ ceph_decode_32_safe(&p, end, len, bad); /* len */
+ ceph_decode_skip_n(&p, end, len, bad); /* metric_spec */
+
/* version >= 5, flags */
- ceph_decode_32_safe(&p, end, flags, bad);
+ ceph_decode_32_safe(&p, end, flags, bad);
if (flags & CEPH_SESSION_BLOCKLISTED) {
- pr_warn("mds%d session blocklisted\n", session->s_mds);
+ pr_warn("mds%d session blocklisted\n", session->s_mds);
blocklisted = true;
}
}
@@ -3413,11 +3609,26 @@ static void handle_session(struct ceph_mds_session *session,
case CEPH_SESSION_OPEN:
if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
pr_info("mds%d reconnect success\n", session->s_mds);
- session->s_state = CEPH_MDS_SESSION_OPEN;
- session->s_features = features;
- renewed_caps(mdsc, session, 0);
- if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &session->s_features))
- metric_schedule_delayed(&mdsc->metric);
+
+ if (session->s_state == CEPH_MDS_SESSION_OPEN) {
+ pr_notice("mds%d is already opened\n", session->s_mds);
+ } else {
+ session->s_state = CEPH_MDS_SESSION_OPEN;
+ session->s_features = features;
+ renewed_caps(mdsc, session, 0);
+ if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT,
+ &session->s_features))
+ metric_schedule_delayed(&mdsc->metric);
+ }
+
+ /*
+ * The connection maybe broken and the session in client
+ * side has been reinitialized, need to update the seq
+ * anyway.
+ */
+ if (!session->s_seq && seq)
+ session->s_seq = seq;
+
wake = 1;
if (mdsc->stopping)
__close_session(mdsc, session);
@@ -3683,7 +3894,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
struct ceph_pagelist *pagelist = recon_state->pagelist;
struct dentry *dentry;
char *path;
- int pathlen, err;
+ int pathlen = 0, err;
u64 pathbase;
u64 snap_follows;
@@ -3703,7 +3914,6 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
}
} else {
path = NULL;
- pathlen = 0;
pathbase = 0;
}
@@ -4400,12 +4610,6 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
memcpy((void *)(lease + 1) + 4,
dentry->d_name.name, dentry->d_name.len);
spin_unlock(&dentry->d_lock);
- /*
- * if this is a preemptive lease RELEASE, no need to
- * flush request stream, since the actual request will
- * soon follow.
- */
- msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
ceph_con_send(&session->s_con, msg);
}
@@ -4438,8 +4642,6 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
bool check_session_state(struct ceph_mds_session *s)
{
- struct ceph_fs_client *fsc = s->s_mdsc->fsc;
-
switch (s->s_state) {
case CEPH_MDS_SESSION_OPEN:
if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
@@ -4448,10 +4650,6 @@ bool check_session_state(struct ceph_mds_session *s)
}
break;
case CEPH_MDS_SESSION_CLOSING:
- /* Should never reach this when not force unmounting */
- WARN_ON_ONCE(s->s_ttl &&
- READ_ONCE(fsc->mount_state) != CEPH_MOUNT_SHUTDOWN);
- fallthrough;
case CEPH_MDS_SESSION_NEW:
case CEPH_MDS_SESSION_RESTARTING:
case CEPH_MDS_SESSION_CLOSED:
@@ -4706,15 +4904,17 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
}
/*
- * wait for all write mds requests to flush.
+ * flush the mdlog and wait for all write mds requests to flush.
*/
-static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
+static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
+ u64 want_tid)
{
struct ceph_mds_request *req = NULL, *nextreq;
+ struct ceph_mds_session *last_session = NULL;
struct rb_node *n;
mutex_lock(&mdsc->mutex);
- dout("wait_unsafe_requests want %lld\n", want_tid);
+ dout("%s want %lld\n", __func__, want_tid);
restart:
req = __get_oldest_req(mdsc);
while (req && req->r_tid <= want_tid) {
@@ -4726,14 +4926,32 @@ restart:
nextreq = NULL;
if (req->r_op != CEPH_MDS_OP_SETFILELOCK &&
(req->r_op & CEPH_MDS_OP_WRITE)) {
+ struct ceph_mds_session *s = req->r_session;
+
+ if (!s) {
+ req = nextreq;
+ continue;
+ }
+
/* write op */
ceph_mdsc_get_request(req);
if (nextreq)
ceph_mdsc_get_request(nextreq);
+ s = ceph_get_mds_session(s);
mutex_unlock(&mdsc->mutex);
- dout("wait_unsafe_requests wait on %llu (want %llu)\n",
+
+ /* send flush mdlog request to MDS */
+ if (last_session != s) {
+ send_flush_mdlog(s);
+ ceph_put_mds_session(last_session);
+ last_session = s;
+ } else {
+ ceph_put_mds_session(s);
+ }
+ dout("%s wait on %llu (want %llu)\n", __func__,
req->r_tid, want_tid);
wait_for_completion(&req->r_safe_completion);
+
mutex_lock(&mdsc->mutex);
ceph_mdsc_put_request(req);
if (!nextreq)
@@ -4748,7 +4966,8 @@ restart:
req = nextreq;
}
mutex_unlock(&mdsc->mutex);
- dout("wait_unsafe_requests done\n");
+ ceph_put_mds_session(last_session);
+ dout("%s done\n", __func__);
}
void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
@@ -4777,7 +4996,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
dout("sync want tid %lld flush_seq %lld\n",
want_tid, want_flush);
- wait_unsafe_requests(mdsc, want_tid);
+ flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
wait_caps_flush(mdsc, want_flush);
}
@@ -4842,7 +5061,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
mutex_unlock(&mdsc->mutex);
ceph_cleanup_snapid_map(mdsc);
- ceph_cleanup_empty_realms(mdsc);
+ ceph_cleanup_global_and_empty_realms(mdsc);
cancel_work_sync(&mdsc->cap_reclaim_work);
cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 97c7f7bfa55f..0598faa50e2e 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -29,14 +29,13 @@ enum ceph_feature_type {
CEPHFS_FEATURE_MULTI_RECONNECT,
CEPHFS_FEATURE_DELEG_INO,
CEPHFS_FEATURE_METRIC_COLLECT,
+ CEPHFS_FEATURE_ALTERNATE_NAME,
+ CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
+ CEPHFS_FEATURE_OP_GETVXATTR,
- CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
+ CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR,
};
-/*
- * This will always have the highest feature bit value
- * as the last element of the array.
- */
#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
0, 1, 2, 3, 4, 5, 6, 7, \
CEPHFS_FEATURE_MIMIC, \
@@ -45,10 +44,9 @@ enum ceph_feature_type {
CEPHFS_FEATURE_MULTI_RECONNECT, \
CEPHFS_FEATURE_DELEG_INO, \
CEPHFS_FEATURE_METRIC_COLLECT, \
- \
- CEPHFS_FEATURE_MAX, \
+ CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \
+ CEPHFS_FEATURE_OP_GETVXATTR, \
}
-#define CEPHFS_FEATURES_CLIENT_REQUIRED {}
/*
* Some lock dependencies:
@@ -100,6 +98,11 @@ struct ceph_mds_reply_dir_entry {
loff_t offset;
};
+struct ceph_mds_reply_xattr {
+ char *xattr_value;
+ size_t xattr_value_len;
+};
+
/*
* parsed info about an mds reply, including information about
* either: 1) the target inode and/or its parent directory and dentry,
@@ -115,6 +118,7 @@ struct ceph_mds_reply_info_parsed {
char *dname;
u32 dname_len;
struct ceph_mds_reply_lease *dlease;
+ struct ceph_mds_reply_xattr xattr_info;
/* extra */
union {
@@ -274,8 +278,8 @@ struct ceph_mds_request {
union ceph_mds_request_args r_args;
int r_fmode; /* file mode, if expecting cap */
- const struct cred *r_cred;
int r_request_release_offset;
+ const struct cred *r_cred;
struct timespec64 r_stamp;
/* for choosing which mds to send this request to */
@@ -296,12 +300,11 @@ struct ceph_mds_request {
struct ceph_msg *r_reply;
struct ceph_mds_reply_info_parsed r_reply_info;
int r_err;
-
+ u32 r_readdir_offset;
struct page *r_locked_page;
int r_dir_caps;
int r_num_caps;
- u32 r_readdir_offset;
unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */
unsigned long r_started; /* start time to measure timeout against */
@@ -329,13 +332,14 @@ struct ceph_mds_request {
struct completion r_completion;
struct completion r_safe_completion;
ceph_mds_request_callback_t r_callback;
- ceph_mds_request_wait_callback_t r_wait_for_completion;
struct list_head r_unsafe_item; /* per-session unsafe list item */
long long r_dir_release_cnt;
long long r_dir_ordered_cnt;
int r_readdir_cache_idx;
+ int r_feature_needed;
+
struct ceph_cap_reservation r_caps_reservation;
};
@@ -507,6 +511,9 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
struct inode *dir,
struct ceph_mds_request *req);
+int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req,
+ ceph_mds_request_wait_callback_t wait_func);
extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
struct inode *dir,
struct ceph_mds_request *req);
@@ -572,9 +579,10 @@ static inline int ceph_wait_on_async_create(struct inode *inode)
struct ceph_inode_info *ci = ceph_inode(inode);
return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT,
- TASK_INTERRUPTIBLE);
+ TASK_KILLABLE);
}
+extern int ceph_wait_on_conflict_unlink(struct dentry *dentry);
extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);
extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino);
#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 30387733765d..3fbabc98e1f7 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -29,7 +29,7 @@ static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy)
return -1;
/* pick */
- n = prandom_u32() % n;
+ n = prandom_u32_max(n);
for (j = 0, i = 0; i < m->possible_max_rank; i++) {
if (CEPH_MDS_IS_READY(i, ignore_laggy))
j++;
@@ -352,12 +352,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
__decode_and_drop_type(p, end, u8, bad_ext);
}
if (mdsmap_ev >= 8) {
- u32 name_len;
/* enabled */
ceph_decode_8_safe(p, end, m->m_enabled, bad_ext);
- ceph_decode_32_safe(p, end, name_len, bad_ext);
- ceph_decode_need(p, end, name_len, bad_ext);
- *p += name_len;
+ /* fs_name */
+ ceph_decode_skip_string(p, end, bad_ext);
}
/* damaged */
if (mdsmap_ev >= 9) {
@@ -370,6 +368,22 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
} else {
m->m_damaged = false;
}
+ if (mdsmap_ev >= 17) {
+ /* balancer */
+ ceph_decode_skip_string(p, end, bad_ext);
+ /* standby_count_wanted */
+ ceph_decode_skip_32(p, end, bad_ext);
+ /* old_max_mds */
+ ceph_decode_skip_32(p, end, bad_ext);
+ /* min_compat_client */
+ ceph_decode_skip_8(p, end, bad_ext);
+ /* required_client_features */
+ ceph_decode_skip_set(p, end, 64, bad_ext);
+ ceph_decode_64_safe(p, end, m->m_max_xattr_size, bad_ext);
+ } else {
+ /* This forces the usage of the (sync) SETXATTR Op */
+ m->m_max_xattr_size = 0;
+ }
bad_ext:
dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
!!m->m_enabled, !!m->m_damaged, m->m_num_laggy);
diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c
index c57699d8408d..c47347d2e84e 100644
--- a/fs/ceph/metric.c
+++ b/fs/ceph/metric.c
@@ -8,6 +8,12 @@
#include "metric.h"
#include "mds_client.h"
+static void ktime_to_ceph_timespec(struct ceph_timespec *ts, ktime_t val)
+{
+ struct timespec64 t = ktime_to_timespec64(val);
+ ceph_encode_timespec64(ts, &t);
+}
+
static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
struct ceph_mds_session *s)
{
@@ -26,7 +32,6 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
u64 nr_caps = atomic64_read(&m->total_caps);
u32 header_len = sizeof(struct ceph_metric_header);
struct ceph_msg *msg;
- struct timespec64 ts;
s64 sum;
s32 items = 0;
s32 len;
@@ -59,37 +64,40 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
/* encode the read latency metric */
read = (struct ceph_metric_read_latency *)(cap + 1);
read->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
- read->header.ver = 1;
+ read->header.ver = 2;
read->header.compat = 1;
read->header.data_len = cpu_to_le32(sizeof(*read) - header_len);
sum = m->metric[METRIC_READ].latency_sum;
- jiffies_to_timespec64(sum, &ts);
- read->sec = cpu_to_le32(ts.tv_sec);
- read->nsec = cpu_to_le32(ts.tv_nsec);
+ ktime_to_ceph_timespec(&read->lat, sum);
+ ktime_to_ceph_timespec(&read->avg, m->metric[METRIC_READ].latency_avg);
+ read->sq_sum = cpu_to_le64(m->metric[METRIC_READ].latency_sq_sum);
+ read->count = cpu_to_le64(m->metric[METRIC_READ].total);
items++;
/* encode the write latency metric */
write = (struct ceph_metric_write_latency *)(read + 1);
write->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
- write->header.ver = 1;
+ write->header.ver = 2;
write->header.compat = 1;
write->header.data_len = cpu_to_le32(sizeof(*write) - header_len);
sum = m->metric[METRIC_WRITE].latency_sum;
- jiffies_to_timespec64(sum, &ts);
- write->sec = cpu_to_le32(ts.tv_sec);
- write->nsec = cpu_to_le32(ts.tv_nsec);
+ ktime_to_ceph_timespec(&write->lat, sum);
+ ktime_to_ceph_timespec(&write->avg, m->metric[METRIC_WRITE].latency_avg);
+ write->sq_sum = cpu_to_le64(m->metric[METRIC_WRITE].latency_sq_sum);
+ write->count = cpu_to_le64(m->metric[METRIC_WRITE].total);
items++;
/* encode the metadata latency metric */
meta = (struct ceph_metric_metadata_latency *)(write + 1);
meta->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
- meta->header.ver = 1;
+ meta->header.ver = 2;
meta->header.compat = 1;
meta->header.data_len = cpu_to_le32(sizeof(*meta) - header_len);
sum = m->metric[METRIC_METADATA].latency_sum;
- jiffies_to_timespec64(sum, &ts);
- meta->sec = cpu_to_le32(ts.tv_sec);
- meta->nsec = cpu_to_le32(ts.tv_nsec);
+ ktime_to_ceph_timespec(&meta->lat, sum);
+ ktime_to_ceph_timespec(&meta->avg, m->metric[METRIC_METADATA].latency_avg);
+ meta->sq_sum = cpu_to_le64(m->metric[METRIC_METADATA].latency_sq_sum);
+ meta->count = cpu_to_le64(m->metric[METRIC_METADATA].total);
items++;
/* encode the dentry lease metric */
@@ -160,8 +168,6 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
msg->hdr.version = cpu_to_le16(1);
msg->hdr.compat_version = cpu_to_le16(1);
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
- dout("client%llu send metrics to mds%d\n",
- ceph_client_gid(mdsc->fsc->client), s->s_mds);
ceph_con_send(&s->s_con, msg);
return true;
@@ -252,6 +258,7 @@ int ceph_metric_init(struct ceph_client_metric *m)
metric->size_max = 0;
metric->total = 0;
metric->latency_sum = 0;
+ metric->latency_avg = 0;
metric->latency_sq_sum = 0;
metric->latency_min = KTIME_MAX;
metric->latency_max = 0;
@@ -309,20 +316,19 @@ void ceph_metric_destroy(struct ceph_client_metric *m)
max = new; \
}
-static inline void __update_stdev(ktime_t total, ktime_t lsum,
- ktime_t *sq_sump, ktime_t lat)
+static inline void __update_mean_and_stdev(ktime_t total, ktime_t *lavg,
+ ktime_t *sq_sump, ktime_t lat)
{
- ktime_t avg, sq;
-
- if (unlikely(total == 1))
- return;
-
- /* the sq is (lat - old_avg) * (lat - new_avg) */
- avg = DIV64_U64_ROUND_CLOSEST((lsum - lat), (total - 1));
- sq = lat - avg;
- avg = DIV64_U64_ROUND_CLOSEST(lsum, total);
- sq = sq * (lat - avg);
- *sq_sump += sq;
+ ktime_t avg;
+
+ if (unlikely(total == 1)) {
+ *lavg = lat;
+ } else {
+ /* the sq is (lat - old_avg) * (lat - new_avg) */
+ avg = *lavg + div64_s64(lat - *lavg, total);
+ *sq_sump += (lat - *lavg)*(lat - avg);
+ *lavg = avg;
+ }
}
void ceph_update_metrics(struct ceph_metric *m,
@@ -341,6 +347,7 @@ void ceph_update_metrics(struct ceph_metric *m,
METRIC_UPDATE_MIN_MAX(m->size_min, m->size_max, size);
m->latency_sum += lat;
METRIC_UPDATE_MIN_MAX(m->latency_min, m->latency_max, lat);
- __update_stdev(total, m->latency_sum, &m->latency_sq_sum, lat);
+ __update_mean_and_stdev(total, &m->latency_avg, &m->latency_sq_sum,
+ lat);
spin_unlock(&m->lock);
}
diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h
index bb45608181e7..0d0c44bd3332 100644
--- a/fs/ceph/metric.h
+++ b/fs/ceph/metric.h
@@ -2,7 +2,7 @@
#ifndef _FS_CEPH_MDS_METRIC_H
#define _FS_CEPH_MDS_METRIC_H
-#include <linux/types.h>
+#include <linux/ceph/types.h>
#include <linux/percpu_counter.h>
#include <linux/ktime.h>
@@ -19,27 +19,39 @@ enum ceph_metric_type {
CLIENT_METRIC_TYPE_OPENED_INODES,
CLIENT_METRIC_TYPE_READ_IO_SIZES,
CLIENT_METRIC_TYPE_WRITE_IO_SIZES,
-
- CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_WRITE_IO_SIZES,
+ CLIENT_METRIC_TYPE_AVG_READ_LATENCY,
+ CLIENT_METRIC_TYPE_STDEV_READ_LATENCY,
+ CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY,
+ CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY,
+ CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY,
+ CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY,
+
+ CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY,
};
/*
* This will always have the highest metric bit value
* as the last element of the array.
*/
-#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \
- CLIENT_METRIC_TYPE_CAP_INFO, \
- CLIENT_METRIC_TYPE_READ_LATENCY, \
- CLIENT_METRIC_TYPE_WRITE_LATENCY, \
- CLIENT_METRIC_TYPE_METADATA_LATENCY, \
- CLIENT_METRIC_TYPE_DENTRY_LEASE, \
- CLIENT_METRIC_TYPE_OPENED_FILES, \
- CLIENT_METRIC_TYPE_PINNED_ICAPS, \
- CLIENT_METRIC_TYPE_OPENED_INODES, \
- CLIENT_METRIC_TYPE_READ_IO_SIZES, \
- CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \
- \
- CLIENT_METRIC_TYPE_MAX, \
+#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \
+ CLIENT_METRIC_TYPE_CAP_INFO, \
+ CLIENT_METRIC_TYPE_READ_LATENCY, \
+ CLIENT_METRIC_TYPE_WRITE_LATENCY, \
+ CLIENT_METRIC_TYPE_METADATA_LATENCY, \
+ CLIENT_METRIC_TYPE_DENTRY_LEASE, \
+ CLIENT_METRIC_TYPE_OPENED_FILES, \
+ CLIENT_METRIC_TYPE_PINNED_ICAPS, \
+ CLIENT_METRIC_TYPE_OPENED_INODES, \
+ CLIENT_METRIC_TYPE_READ_IO_SIZES, \
+ CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \
+ CLIENT_METRIC_TYPE_AVG_READ_LATENCY, \
+ CLIENT_METRIC_TYPE_STDEV_READ_LATENCY, \
+ CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY, \
+ CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, \
+ CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, \
+ CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, \
+ \
+ CLIENT_METRIC_TYPE_MAX, \
}
struct ceph_metric_header {
@@ -60,22 +72,28 @@ struct ceph_metric_cap {
/* metric read latency header */
struct ceph_metric_read_latency {
struct ceph_metric_header header;
- __le32 sec;
- __le32 nsec;
+ struct ceph_timespec lat;
+ struct ceph_timespec avg;
+ __le64 sq_sum;
+ __le64 count;
} __packed;
/* metric write latency header */
struct ceph_metric_write_latency {
struct ceph_metric_header header;
- __le32 sec;
- __le32 nsec;
+ struct ceph_timespec lat;
+ struct ceph_timespec avg;
+ __le64 sq_sum;
+ __le64 count;
} __packed;
/* metric metadata latency header */
struct ceph_metric_metadata_latency {
struct ceph_metric_header header;
- __le32 sec;
- __le32 nsec;
+ struct ceph_timespec lat;
+ struct ceph_timespec avg;
+ __le64 sq_sum;
+ __le64 count;
} __packed;
/* metric dentry lease header */
@@ -140,6 +158,7 @@ struct ceph_metric {
u64 size_min;
u64 size_max;
ktime_t latency_sum;
+ ktime_t latency_avg;
ktime_t latency_sq_sum;
ktime_t latency_min;
ktime_t latency_max;
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index 620c691af40e..64592adfe48f 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -30,6 +30,9 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode)
/* if root is the real CephFS root, we don't have quota realms */
if (root && ceph_ino(root) == CEPH_INO_ROOT)
return false;
+ /* MDS stray dirs have no quota realms */
+ if (ceph_vino_is_reserved(ceph_inode(inode)->i_vino))
+ return false;
/* otherwise, we can't know for sure */
return true;
}
@@ -192,9 +195,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc)
/*
* This function walks through the snaprealm for an inode and returns the
- * ceph_snap_realm for the first snaprealm that has quotas set (either max_files
- * or max_bytes). If the root is reached, return the root ceph_snap_realm
- * instead.
+ * ceph_snap_realm for the first snaprealm that has quotas set (max_files,
+ * max_bytes, or any, depending on the 'which_quota' argument). If the root is
+ * reached, return the root ceph_snap_realm instead.
*
* Note that the caller is responsible for calling ceph_put_snap_realm() on the
* returned realm.
@@ -206,7 +209,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc)
* will be restarted.
*/
static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
- struct inode *inode, bool retry)
+ struct inode *inode,
+ enum quota_get_realm which_quota,
+ bool retry)
{
struct ceph_inode_info *ci = NULL;
struct ceph_snap_realm *realm, *next;
@@ -245,7 +250,7 @@ restart:
}
ci = ceph_inode(in);
- has_quota = __ceph_has_any_quota(ci);
+ has_quota = __ceph_has_quota(ci, which_quota);
iput(in);
next = realm->parent;
@@ -276,8 +281,8 @@ restart:
* dropped and we can then restart the whole operation.
*/
down_read(&mdsc->snap_rwsem);
- old_realm = get_quota_realm(mdsc, old, true);
- new_realm = get_quota_realm(mdsc, new, false);
+ old_realm = get_quota_realm(mdsc, old, QUOTA_GET_ANY, true);
+ new_realm = get_quota_realm(mdsc, new, QUOTA_GET_ANY, false);
if (PTR_ERR(new_realm) == -EAGAIN) {
up_read(&mdsc->snap_rwsem);
if (old_realm)
@@ -480,7 +485,8 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf)
bool is_updated = false;
down_read(&mdsc->snap_rwsem);
- realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), true);
+ realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root),
+ QUOTA_GET_MAX_BYTES, true);
up_read(&mdsc->snap_rwsem);
if (!realm)
return false;
@@ -494,10 +500,24 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf)
if (ci->i_max_bytes) {
total = ci->i_max_bytes >> CEPH_BLOCK_SHIFT;
used = ci->i_rbytes >> CEPH_BLOCK_SHIFT;
+ /* For quota size less than 4MB, use 4KB block size */
+ if (!total) {
+ total = ci->i_max_bytes >> CEPH_4K_BLOCK_SHIFT;
+ used = ci->i_rbytes >> CEPH_4K_BLOCK_SHIFT;
+ buf->f_frsize = 1 << CEPH_4K_BLOCK_SHIFT;
+ }
/* It is possible for a quota to be exceeded.
* Report 'zero' in that case
*/
free = total > used ? total - used : 0;
+ /* For quota size less than 4KB, report the
+ * total=used=4KB,free=0 when quota is full
+ * and total=free=4KB, used=0 otherwise */
+ if (!total) {
+ total = 1;
+ free = ci->i_max_bytes > ci->i_rbytes ? 1 : 0;
+ buf->f_frsize = 1 << CEPH_4K_BLOCK_SHIFT;
+ }
}
spin_unlock(&ci->i_ceph_lock);
if (total) {
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index b41e6724c591..864cdaa0d2bd 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -121,18 +121,23 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
if (!realm)
return ERR_PTR(-ENOMEM);
- atomic_set(&realm->nref, 1); /* for caller */
+ /* Do not release the global dummy snaprealm until unmouting */
+ if (ino == CEPH_INO_GLOBAL_SNAPREALM)
+ atomic_set(&realm->nref, 2);
+ else
+ atomic_set(&realm->nref, 1);
realm->ino = ino;
INIT_LIST_HEAD(&realm->children);
INIT_LIST_HEAD(&realm->child_item);
INIT_LIST_HEAD(&realm->empty_item);
INIT_LIST_HEAD(&realm->dirty_item);
+ INIT_LIST_HEAD(&realm->rebuild_item);
INIT_LIST_HEAD(&realm->inodes_with_caps);
spin_lock_init(&realm->inodes_with_caps_lock);
__insert_snap_realm(&mdsc->snap_realms, realm);
mdsc->num_snap_realms++;
- dout("create_snap_realm %llx %p\n", realm->ino, realm);
+ dout("%s %llx %p\n", __func__, realm->ino, realm);
return realm;
}
@@ -156,7 +161,7 @@ static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
else if (ino > r->ino)
n = n->rb_right;
else {
- dout("lookup_snap_realm %llx %p\n", r->ino, r);
+ dout("%s %llx %p\n", __func__, r->ino, r);
return r;
}
}
@@ -184,7 +189,7 @@ static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
{
lockdep_assert_held_write(&mdsc->snap_rwsem);
- dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
+ dout("%s %p %llx\n", __func__, realm, realm->ino);
rb_erase(&realm->node, &mdsc->snap_realms);
mdsc->num_snap_realms--;
@@ -260,9 +265,14 @@ static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
spin_unlock(&mdsc->snap_empty_lock);
}
-void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
+void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc)
{
+ struct ceph_snap_realm *global_realm;
+
down_write(&mdsc->snap_rwsem);
+ global_realm = __lookup_snap_realm(mdsc, CEPH_INO_GLOBAL_SNAPREALM);
+ if (global_realm)
+ ceph_put_snap_realm(mdsc, global_realm);
__cleanup_empty_realms(mdsc);
up_write(&mdsc->snap_rwsem);
}
@@ -292,9 +302,8 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
if (IS_ERR(parent))
return PTR_ERR(parent);
}
- dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
- realm->ino, realm, realm->parent_ino, realm->parent,
- parentino, parent);
+ dout("%s %llx %p: %llx %p -> %llx %p\n", __func__, realm->ino,
+ realm, realm->parent_ino, realm->parent, parentino, parent);
if (realm->parent) {
list_del_init(&realm->child_item);
ceph_put_snap_realm(mdsc, realm->parent);
@@ -320,7 +329,8 @@ static int cmpu64_rev(const void *a, const void *b)
* build the snap context for a given realm.
*/
static int build_snap_context(struct ceph_snap_realm *realm,
- struct list_head* dirty_realms)
+ struct list_head *realm_queue,
+ struct list_head *dirty_realms)
{
struct ceph_snap_realm *parent = realm->parent;
struct ceph_snap_context *snapc;
@@ -334,9 +344,9 @@ static int build_snap_context(struct ceph_snap_realm *realm,
*/
if (parent) {
if (!parent->cached_context) {
- err = build_snap_context(parent, dirty_realms);
- if (err)
- goto fail;
+ /* add to the queue head */
+ list_add(&parent->rebuild_item, realm_queue);
+ return 1;
}
num += parent->cached_context->num_snaps;
}
@@ -349,9 +359,8 @@ static int build_snap_context(struct ceph_snap_realm *realm,
realm->cached_context->seq == realm->seq &&
(!parent ||
realm->cached_context->seq >= parent->cached_context->seq)) {
- dout("build_snap_context %llx %p: %p seq %lld (%u snaps)"
- " (unchanged)\n",
- realm->ino, realm, realm->cached_context,
+ dout("%s %llx %p: %p seq %lld (%u snaps) (unchanged)\n",
+ __func__, realm->ino, realm, realm->cached_context,
realm->cached_context->seq,
(unsigned int)realm->cached_context->num_snaps);
return 0;
@@ -390,9 +399,8 @@ static int build_snap_context(struct ceph_snap_realm *realm,
sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
snapc->num_snaps = num;
- dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n",
- realm->ino, realm, snapc, snapc->seq,
- (unsigned int) snapc->num_snaps);
+ dout("%s %llx %p: %p seq %lld (%u snaps)\n", __func__, realm->ino,
+ realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps);
ceph_put_snap_context(realm->cached_context);
realm->cached_context = snapc;
@@ -409,8 +417,7 @@ fail:
ceph_put_snap_context(realm->cached_context);
realm->cached_context = NULL;
}
- pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
- realm, err);
+ pr_err("%s %llx %p fail %d\n", __func__, realm->ino, realm, err);
return err;
}
@@ -420,13 +427,50 @@ fail:
static void rebuild_snap_realms(struct ceph_snap_realm *realm,
struct list_head *dirty_realms)
{
- struct ceph_snap_realm *child;
+ LIST_HEAD(realm_queue);
+ int last = 0;
+ bool skip = false;
+
+ list_add_tail(&realm->rebuild_item, &realm_queue);
+
+ while (!list_empty(&realm_queue)) {
+ struct ceph_snap_realm *_realm, *child;
+
+ _realm = list_first_entry(&realm_queue,
+ struct ceph_snap_realm,
+ rebuild_item);
+
+ /*
+ * If the last building failed dues to memory
+ * issue, just empty the realm_queue and return
+ * to avoid infinite loop.
+ */
+ if (last < 0) {
+ list_del_init(&_realm->rebuild_item);
+ continue;
+ }
+
+ last = build_snap_context(_realm, &realm_queue, dirty_realms);
+ dout("%s %llx %p, %s\n", __func__, _realm->ino, _realm,
+ last > 0 ? "is deferred" : !last ? "succeeded" : "failed");
+
+ /* is any child in the list ? */
+ list_for_each_entry(child, &_realm->children, child_item) {
+ if (!list_empty(&child->rebuild_item)) {
+ skip = true;
+ break;
+ }
+ }
- dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
- build_snap_context(realm, dirty_realms);
+ if (!skip) {
+ list_for_each_entry(child, &_realm->children, child_item)
+ list_add_tail(&child->rebuild_item, &realm_queue);
+ }
- list_for_each_entry(child, &realm->children, child_item)
- rebuild_snap_realms(child, dirty_realms);
+ /* last == 1 means need to build parent first */
+ if (last <= 0)
+ list_del_init(&_realm->rebuild_item);
+ }
}
@@ -474,23 +518,15 @@ static bool has_new_snaps(struct ceph_snap_context *o,
* Caller must hold snap_rwsem for read (i.e., the realm topology won't
* change).
*/
-static void ceph_queue_cap_snap(struct ceph_inode_info *ci)
+static void ceph_queue_cap_snap(struct ceph_inode_info *ci,
+ struct ceph_cap_snap **pcapsnap)
{
- struct inode *inode = &ci->vfs_inode;
- struct ceph_cap_snap *capsnap;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_snap_context *old_snapc, *new_snapc;
+ struct ceph_cap_snap *capsnap = *pcapsnap;
struct ceph_buffer *old_blob = NULL;
int used, dirty;
- capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
- if (!capsnap) {
- pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
- return;
- }
- capsnap->cap_flush.is_capsnap = true;
- INIT_LIST_HEAD(&capsnap->cap_flush.i_list);
- INIT_LIST_HEAD(&capsnap->cap_flush.g_list);
-
spin_lock(&ci->i_ceph_lock);
used = __ceph_caps_used(ci);
dirty = __ceph_caps_dirty(ci);
@@ -511,12 +547,14 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci)
as no new writes are allowed to start when pending, so any
writes in progress now were started before the previous
cap_snap. lucky us. */
- dout("queue_cap_snap %p already pending\n", inode);
+ dout("%s %p %llx.%llx already pending\n",
+ __func__, inode, ceph_vinop(inode));
goto update_snapc;
}
if (ci->i_wrbuffer_ref_head == 0 &&
!(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) {
- dout("queue_cap_snap %p nothing dirty|writing\n", inode);
+ dout("%s %p %llx.%llx nothing dirty|writing\n",
+ __func__, inode, ceph_vinop(inode));
goto update_snapc;
}
@@ -536,20 +574,17 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci)
} else {
if (!(used & CEPH_CAP_FILE_WR) &&
ci->i_wrbuffer_ref_head == 0) {
- dout("queue_cap_snap %p "
- "no new_snap|dirty_page|writing\n", inode);
+ dout("%s %p %llx.%llx no new_snap|dirty_page|writing\n",
+ __func__, inode, ceph_vinop(inode));
goto update_snapc;
}
}
- dout("queue_cap_snap %p cap_snap %p queuing under %p %s %s\n",
- inode, capsnap, old_snapc, ceph_cap_string(dirty),
- capsnap->need_flush ? "" : "no_flush");
+ dout("%s %p %llx.%llx cap_snap %p queuing under %p %s %s\n",
+ __func__, inode, ceph_vinop(inode), capsnap, old_snapc,
+ ceph_cap_string(dirty), capsnap->need_flush ? "" : "no_flush");
ihold(inode);
- refcount_set(&capsnap->nref, 1);
- INIT_LIST_HEAD(&capsnap->ci_item);
-
capsnap->follows = old_snapc->seq;
capsnap->issued = __ceph_caps_issued(ci, NULL);
capsnap->dirty = dirty;
@@ -579,31 +614,30 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci)
list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
if (used & CEPH_CAP_FILE_WR) {
- dout("queue_cap_snap %p cap_snap %p snapc %p"
- " seq %llu used WR, now pending\n", inode,
+ dout("%s %p %llx.%llx cap_snap %p snapc %p seq %llu used WR,"
+ " now pending\n", __func__, inode, ceph_vinop(inode),
capsnap, old_snapc, old_snapc->seq);
capsnap->writing = 1;
} else {
/* note mtime, size NOW. */
__ceph_finish_cap_snap(ci, capsnap);
}
- capsnap = NULL;
+ *pcapsnap = NULL;
old_snapc = NULL;
update_snapc:
- if (ci->i_wrbuffer_ref_head == 0 &&
- ci->i_wr_ref == 0 &&
- ci->i_dirty_caps == 0 &&
- ci->i_flushing_caps == 0) {
- ci->i_head_snapc = NULL;
- } else {
+ if (ci->i_wrbuffer_ref_head == 0 &&
+ ci->i_wr_ref == 0 &&
+ ci->i_dirty_caps == 0 &&
+ ci->i_flushing_caps == 0) {
+ ci->i_head_snapc = NULL;
+ } else {
ci->i_head_snapc = ceph_get_snap_context(new_snapc);
dout(" new snapc is %p\n", new_snapc);
}
spin_unlock(&ci->i_ceph_lock);
ceph_buffer_put(old_blob);
- kfree(capsnap);
ceph_put_snap_context(old_snapc);
}
@@ -618,7 +652,7 @@ update_snapc:
int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
BUG_ON(capsnap->writing);
@@ -632,27 +666,28 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
capsnap->truncate_size = ci->i_truncate_size;
capsnap->truncate_seq = ci->i_truncate_seq;
if (capsnap->dirty_pages) {
- dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
- "still has %d dirty pages\n", inode, capsnap,
- capsnap->context, capsnap->context->seq,
- ceph_cap_string(capsnap->dirty), capsnap->size,
- capsnap->dirty_pages);
+ dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu "
+ "still has %d dirty pages\n", __func__, inode,
+ ceph_vinop(inode), capsnap, capsnap->context,
+ capsnap->context->seq, ceph_cap_string(capsnap->dirty),
+ capsnap->size, capsnap->dirty_pages);
return 0;
}
/* Fb cap still in use, delay it */
if (ci->i_wb_ref) {
- dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
- "used WRBUFFER, delaying\n", inode, capsnap,
- capsnap->context, capsnap->context->seq,
- ceph_cap_string(capsnap->dirty), capsnap->size);
+ dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu "
+ "used WRBUFFER, delaying\n", __func__, inode,
+ ceph_vinop(inode), capsnap, capsnap->context,
+ capsnap->context->seq, ceph_cap_string(capsnap->dirty),
+ capsnap->size);
capsnap->writing = 1;
return 0;
}
ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
- dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
- inode, capsnap, capsnap->context,
+ dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu\n",
+ __func__, inode, ceph_vinop(inode), capsnap, capsnap->context,
capsnap->context->seq, ceph_cap_string(capsnap->dirty),
capsnap->size);
@@ -671,24 +706,47 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
{
struct ceph_inode_info *ci;
struct inode *lastinode = NULL;
+ struct ceph_cap_snap *capsnap = NULL;
- dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino);
+ dout("%s %p %llx inode\n", __func__, realm, realm->ino);
spin_lock(&realm->inodes_with_caps_lock);
list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) {
- struct inode *inode = igrab(&ci->vfs_inode);
+ struct inode *inode = igrab(&ci->netfs.inode);
if (!inode)
continue;
spin_unlock(&realm->inodes_with_caps_lock);
iput(lastinode);
lastinode = inode;
- ceph_queue_cap_snap(ci);
+
+ /*
+ * Allocate the capsnap memory outside of ceph_queue_cap_snap()
+ * to reduce very possible but unnecessary frequently memory
+ * allocate/free in this loop.
+ */
+ if (!capsnap) {
+ capsnap = kmem_cache_zalloc(ceph_cap_snap_cachep, GFP_NOFS);
+ if (!capsnap) {
+ pr_err("ENOMEM allocating ceph_cap_snap on %p\n",
+ inode);
+ return;
+ }
+ }
+ capsnap->cap_flush.is_capsnap = true;
+ refcount_set(&capsnap->nref, 1);
+ INIT_LIST_HEAD(&capsnap->cap_flush.i_list);
+ INIT_LIST_HEAD(&capsnap->cap_flush.g_list);
+ INIT_LIST_HEAD(&capsnap->ci_item);
+
+ ceph_queue_cap_snap(ci, &capsnap);
spin_lock(&realm->inodes_with_caps_lock);
}
spin_unlock(&realm->inodes_with_caps_lock);
iput(lastinode);
- dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
+ if (capsnap)
+ kmem_cache_free(ceph_cap_snap_cachep, capsnap);
+ dout("%s %p %llx done\n", __func__, realm, realm->ino);
}
/*
@@ -707,14 +765,16 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
__le64 *prior_parent_snaps; /* encoded */
struct ceph_snap_realm *realm = NULL;
struct ceph_snap_realm *first_realm = NULL;
- int invalidate = 0;
+ struct ceph_snap_realm *realm_to_rebuild = NULL;
+ int rebuild_snapcs;
int err = -ENOMEM;
LIST_HEAD(dirty_realms);
lockdep_assert_held_write(&mdsc->snap_rwsem);
- dout("update_snap_trace deletion=%d\n", deletion);
+ dout("%s deletion=%d\n", __func__, deletion);
more:
+ rebuild_snapcs = 0;
ceph_decode_need(&p, e, sizeof(*ri), bad);
ri = p;
p += sizeof(*ri);
@@ -738,10 +798,10 @@ more:
err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
if (err < 0)
goto fail;
- invalidate += err;
+ rebuild_snapcs += err;
if (le64_to_cpu(ri->seq) > realm->seq) {
- dout("update_snap_trace updating %llx %p %lld -> %lld\n",
+ dout("%s updating %llx %p %lld -> %lld\n", __func__,
realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
/* update realm parameters, snap lists */
realm->seq = le64_to_cpu(ri->seq);
@@ -763,22 +823,30 @@ more:
if (realm->seq > mdsc->last_snap_seq)
mdsc->last_snap_seq = realm->seq;
- invalidate = 1;
+ rebuild_snapcs = 1;
} else if (!realm->cached_context) {
- dout("update_snap_trace %llx %p seq %lld new\n",
+ dout("%s %llx %p seq %lld new\n", __func__,
realm->ino, realm, realm->seq);
- invalidate = 1;
+ rebuild_snapcs = 1;
} else {
- dout("update_snap_trace %llx %p seq %lld unchanged\n",
+ dout("%s %llx %p seq %lld unchanged\n", __func__,
realm->ino, realm, realm->seq);
}
- dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
- realm, invalidate, p, e);
+ dout("done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino,
+ realm, rebuild_snapcs, p, e);
+
+ /*
+ * this will always track the uppest parent realm from which
+ * we need to rebuild the snapshot contexts _downward_ in
+ * hierarchy.
+ */
+ if (rebuild_snapcs)
+ realm_to_rebuild = realm;
- /* invalidate when we reach the _end_ (root) of the trace */
- if (invalidate && p >= e)
- rebuild_snap_realms(realm, &dirty_realms);
+ /* rebuild_snapcs when we reach the _end_ (root) of the trace */
+ if (realm_to_rebuild && p >= e)
+ rebuild_snap_realms(realm_to_rebuild, &dirty_realms);
if (!first_realm)
first_realm = realm;
@@ -814,7 +882,7 @@ fail:
ceph_put_snap_realm(mdsc, realm);
if (first_realm)
ceph_put_snap_realm(mdsc, first_realm);
- pr_err("update_snap_trace error %d\n", err);
+ pr_err("%s error %d\n", __func__, err);
return err;
}
@@ -831,12 +899,12 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
struct inode *inode;
struct ceph_mds_session *session = NULL;
- dout("flush_snaps\n");
+ dout("%s\n", __func__);
spin_lock(&mdsc->snap_flush_lock);
while (!list_empty(&mdsc->snap_flush_list)) {
ci = list_first_entry(&mdsc->snap_flush_list,
struct ceph_inode_info, i_snap_flush_item);
- inode = &ci->vfs_inode;
+ inode = &ci->netfs.inode;
ihold(inode);
spin_unlock(&mdsc->snap_flush_lock);
ceph_flush_snaps(ci, &session);
@@ -846,7 +914,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
spin_unlock(&mdsc->snap_flush_lock);
ceph_put_mds_session(session);
- dout("flush_snaps done\n");
+ dout("%s done\n", __func__);
}
/**
@@ -928,8 +996,8 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
trace_len = le32_to_cpu(h->trace_len);
p += sizeof(*h);
- dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
- ceph_snap_op_name(op), split, trace_len);
+ dout("%s from mds%d op %s split %llx tracelen %d\n", __func__,
+ mds, ceph_snap_op_name(op), split, trace_len);
mutex_lock(&session->s_mutex);
inc_session_sequence(session);
@@ -989,13 +1057,13 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
*/
if (ci->i_snap_realm->created >
le64_to_cpu(ri->created)) {
- dout(" leaving %p in newer realm %llx %p\n",
- inode, ci->i_snap_realm->ino,
+ dout(" leaving %p %llx.%llx in newer realm %llx %p\n",
+ inode, ceph_vinop(inode), ci->i_snap_realm->ino,
ci->i_snap_realm);
goto skip_inode;
}
- dout(" will move %p to split realm %llx %p\n",
- inode, realm->ino, realm);
+ dout(" will move %p %llx.%llx to split realm %llx %p\n",
+ inode, ceph_vinop(inode), realm->ino, realm);
ceph_get_snap_realm(mdsc, realm);
ceph_change_snap_realm(inode, realm);
@@ -1038,7 +1106,7 @@ skip_inode:
return;
bad:
- pr_err("corrupt snap message from mds%d\n", mds);
+ pr_err("%s corrupt snap message from mds%d\n", __func__, mds);
ceph_msg_dump(msg);
out:
if (locked_rwsem)
@@ -1071,7 +1139,8 @@ struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
}
spin_unlock(&mdsc->snapid_map_lock);
if (exist) {
- dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
+ dout("%s found snapid map %llx -> %x\n", __func__,
+ exist->snap, exist->dev);
return exist;
}
@@ -1115,11 +1184,13 @@ struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
if (exist) {
free_anon_bdev(sm->dev);
kfree(sm);
- dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
+ dout("%s found snapid map %llx -> %x\n", __func__,
+ exist->snap, exist->dev);
return exist;
}
- dout("create snapid map %llx -> %x\n", sm->snap, sm->dev);
+ dout("%s create snapid map %llx -> %x\n", __func__,
+ sm->snap, sm->dev);
return sm;
}
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index 573bb9556fb5..e36e8948e728 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -60,6 +60,7 @@ const char *ceph_mds_op_name(int op)
case CEPH_MDS_OP_LOOKUPINO: return "lookupino";
case CEPH_MDS_OP_LOOKUPNAME: return "lookupname";
case CEPH_MDS_OP_GETATTR: return "getattr";
+ case CEPH_MDS_OP_GETVXATTR: return "getvxattr";
case CEPH_MDS_OP_SETXATTR: return "setxattr";
case CEPH_MDS_OP_SETATTR: return "setattr";
case CEPH_MDS_OP_RMXATTR: return "rmxattr";
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index bab61232dc5a..3fc48b43cab0 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -27,6 +27,8 @@
#include <linux/ceph/auth.h>
#include <linux/ceph/debugfs.h>
+#include <uapi/linux/magic.h>
+
static DEFINE_SPINLOCK(ceph_fsc_lock);
static LIST_HEAD(ceph_fsc_list);
@@ -70,15 +72,9 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_type = CEPH_SUPER_MAGIC; /* ?? */
/*
- * express utilization in terms of large blocks to avoid
+ * Express utilization in terms of large blocks to avoid
* overflow on 32-bit machines.
- *
- * NOTE: for the time being, we make bsize == frsize to humor
- * not-yet-ancient versions of glibc that are broken.
- * Someday, we will probably want to report a real block
- * size... whatever that may mean for a network file system!
*/
- buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
/*
@@ -93,6 +89,14 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
}
+ /*
+ * NOTE: for the time being, we make bsize == frsize to humor
+ * not-yet-ancient versions of glibc that are broken.
+ * Someday, we will probably want to report a real block
+ * size... whatever that may mean for a network file system!
+ */
+ buf->f_bsize = buf->f_frsize;
+
buf->f_files = le64_to_cpu(st.num_objects);
buf->f_ffree = -1;
buf->f_namelen = NAME_MAX;
@@ -146,6 +150,7 @@ enum {
Opt_mds_namespace,
Opt_recover_session,
Opt_source,
+ Opt_mon_addr,
/* string args above */
Opt_dirstat,
Opt_rbytes,
@@ -159,6 +164,7 @@ enum {
Opt_quotadf,
Opt_copyfrom,
Opt_wsync,
+ Opt_pagecache,
};
enum ceph_recover_session_mode {
@@ -197,8 +203,10 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = {
fsparam_u32 ("rsize", Opt_rsize),
fsparam_string ("snapdirname", Opt_snapdirname),
fsparam_string ("source", Opt_source),
+ fsparam_string ("mon_addr", Opt_mon_addr),
fsparam_u32 ("wsize", Opt_wsize),
fsparam_flag_no ("wsync", Opt_wsync),
+ fsparam_flag_no ("pagecache", Opt_pagecache),
{}
};
@@ -228,9 +236,92 @@ static void canonicalize_path(char *path)
}
/*
- * Parse the source parameter. Distinguish the server list from the path.
+ * Check if the mds namespace in ceph_mount_options matches
+ * the passed in namespace string. First time match (when
+ * ->mds_namespace is NULL) is treated specially, since
+ * ->mds_namespace needs to be initialized by the caller.
+ */
+static int namespace_equals(struct ceph_mount_options *fsopt,
+ const char *namespace, size_t len)
+{
+ return !(fsopt->mds_namespace &&
+ (strlen(fsopt->mds_namespace) != len ||
+ strncmp(fsopt->mds_namespace, namespace, len)));
+}
+
+static int ceph_parse_old_source(const char *dev_name, const char *dev_name_end,
+ struct fs_context *fc)
+{
+ int r;
+ struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+ struct ceph_mount_options *fsopt = pctx->opts;
+
+ if (*dev_name_end != ':')
+ return invalfc(fc, "separator ':' missing in source");
+
+ r = ceph_parse_mon_ips(dev_name, dev_name_end - dev_name,
+ pctx->copts, fc->log.log, ',');
+ if (r)
+ return r;
+
+ fsopt->new_dev_syntax = false;
+ return 0;
+}
+
+static int ceph_parse_new_source(const char *dev_name, const char *dev_name_end,
+ struct fs_context *fc)
+{
+ size_t len;
+ struct ceph_fsid fsid;
+ struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+ struct ceph_mount_options *fsopt = pctx->opts;
+ char *fsid_start, *fs_name_start;
+
+ if (*dev_name_end != '=') {
+ dout("separator '=' missing in source");
+ return -EINVAL;
+ }
+
+ fsid_start = strchr(dev_name, '@');
+ if (!fsid_start)
+ return invalfc(fc, "missing cluster fsid");
+ ++fsid_start; /* start of cluster fsid */
+
+ fs_name_start = strchr(fsid_start, '.');
+ if (!fs_name_start)
+ return invalfc(fc, "missing file system name");
+
+ if (ceph_parse_fsid(fsid_start, &fsid))
+ return invalfc(fc, "Invalid FSID");
+
+ ++fs_name_start; /* start of file system name */
+ len = dev_name_end - fs_name_start;
+
+ if (!namespace_equals(fsopt, fs_name_start, len))
+ return invalfc(fc, "Mismatching mds_namespace");
+ kfree(fsopt->mds_namespace);
+ fsopt->mds_namespace = kstrndup(fs_name_start, len, GFP_KERNEL);
+ if (!fsopt->mds_namespace)
+ return -ENOMEM;
+ dout("file system (mds namespace) '%s'\n", fsopt->mds_namespace);
+
+ fsopt->new_dev_syntax = true;
+ return 0;
+}
+
+/*
+ * Parse the source parameter for new device format. Distinguish the device
+ * spec from the path. Try parsing new device format and fallback to old
+ * format if needed.
+ *
+ * New device syntax will looks like:
+ * <device_spec>=/<path>
+ * where
+ * <device_spec> is name@fsid.fsname
+ * <path> is optional, but if present must begin with '/'
+ * (monitor addresses are passed via mount option)
*
- * The source will look like:
+ * Old device syntax is:
* <server_spec>[,<server_spec>...]:[<path>]
* where
* <server_spec> is <ip>[:<port>]
@@ -263,24 +354,44 @@ static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc)
dev_name_end = dev_name + strlen(dev_name);
}
- dev_name_end--; /* back up to ':' separator */
- if (dev_name_end < dev_name || *dev_name_end != ':')
- return invalfc(fc, "No path or : separator in source");
+ dev_name_end--; /* back up to separator */
+ if (dev_name_end < dev_name)
+ return invalfc(fc, "Path missing in source");
dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
if (fsopt->server_path)
dout("server path '%s'\n", fsopt->server_path);
- ret = ceph_parse_mon_ips(param->string, dev_name_end - dev_name,
- pctx->copts, fc->log.log);
- if (ret)
- return ret;
+ dout("trying new device syntax");
+ ret = ceph_parse_new_source(dev_name, dev_name_end, fc);
+ if (ret) {
+ if (ret != -EINVAL)
+ return ret;
+ dout("trying old device syntax");
+ ret = ceph_parse_old_source(dev_name, dev_name_end, fc);
+ if (ret)
+ return ret;
+ }
fc->source = param->string;
param->string = NULL;
return 0;
}
+static int ceph_parse_mon_addr(struct fs_parameter *param,
+ struct fs_context *fc)
+{
+ struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+ struct ceph_mount_options *fsopt = pctx->opts;
+
+ kfree(fsopt->mon_addr);
+ fsopt->mon_addr = param->string;
+ param->string = NULL;
+
+ return ceph_parse_mon_ips(fsopt->mon_addr, strlen(fsopt->mon_addr),
+ pctx->copts, fc->log.log, '/');
+}
+
static int ceph_parse_mount_param(struct fs_context *fc,
struct fs_parameter *param)
{
@@ -306,6 +417,8 @@ static int ceph_parse_mount_param(struct fs_context *fc,
param->string = NULL;
break;
case Opt_mds_namespace:
+ if (!namespace_equals(fsopt, param->string, strlen(param->string)))
+ return invalfc(fc, "Mismatching mds_namespace");
kfree(fsopt->mds_namespace);
fsopt->mds_namespace = param->string;
param->string = NULL;
@@ -323,6 +436,8 @@ static int ceph_parse_mount_param(struct fs_context *fc,
if (fc->source)
return invalfc(fc, "Multiple sources specified");
return ceph_parse_source(param, fc);
+ case Opt_mon_addr:
+ return ceph_parse_mon_addr(param, fc);
case Opt_wsize:
if (result.uint_32 < PAGE_SIZE ||
result.uint_32 > CEPH_MAX_WRITE_SIZE)
@@ -455,6 +570,12 @@ static int ceph_parse_mount_param(struct fs_context *fc,
else
fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS;
break;
+ case Opt_pagecache:
+ if (result.negated)
+ fsopt->flags |= CEPH_MOUNT_OPT_NOPAGECACHE;
+ else
+ fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE;
+ break;
default:
BUG();
}
@@ -474,6 +595,7 @@ static void destroy_mount_options(struct ceph_mount_options *args)
kfree(args->mds_namespace);
kfree(args->server_path);
kfree(args->fscache_uniq);
+ kfree(args->mon_addr);
kfree(args);
}
@@ -517,6 +639,10 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
if (ret)
return ret;
+ ret = strcmp_null(fsopt1->mon_addr, fsopt2->mon_addr);
+ if (ret)
+ return ret;
+
return ceph_compare_options(new_opt, fsc->client);
}
@@ -572,15 +698,22 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
if ((fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM) == 0)
seq_puts(m, ",copyfrom");
- if (fsopt->mds_namespace)
+ /* dump mds_namespace when old device syntax is in use */
+ if (fsopt->mds_namespace && !fsopt->new_dev_syntax)
seq_show_option(m, "mds_namespace", fsopt->mds_namespace);
+ if (fsopt->mon_addr)
+ seq_printf(m, ",mon_addr=%s", fsopt->mon_addr);
+
if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER)
seq_show_option(m, "recover_session", "clean");
if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS))
seq_puts(m, ",wsync");
+ if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
+ seq_puts(m, ",nopagecache");
+
if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
seq_printf(m, ",wsize=%u", fsopt->wsize);
if (fsopt->rsize != CEPH_MAX_READ_SIZE)
@@ -671,6 +804,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
fsc->have_copy_from2 = true;
atomic_long_set(&fsc->writeback_count, 0);
+ fsc->write_congested = false;
err = -ENOMEM;
/*
@@ -684,6 +818,9 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
if (!fsc->cap_wq)
goto fail_inode_wq;
+ hash_init(fsc->async_unlink_conflict);
+ spin_lock_init(&fsc->async_unlink_conflict_lock);
+
spin_lock(&ceph_fsc_lock);
list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list);
spin_unlock(&ceph_fsc_lock);
@@ -733,6 +870,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
*/
struct kmem_cache *ceph_inode_cachep;
struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_cap_snap_cachep;
struct kmem_cache *ceph_cap_flush_cachep;
struct kmem_cache *ceph_dentry_cachep;
struct kmem_cache *ceph_file_cachep;
@@ -743,7 +881,7 @@ mempool_t *ceph_wb_pagevec_pool;
static void ceph_inode_init_once(void *foo)
{
struct ceph_inode_info *ci = foo;
- inode_init_once(&ci->vfs_inode);
+ inode_init_once(&ci->netfs.inode);
}
static int __init init_caches(void)
@@ -761,6 +899,9 @@ static int __init init_caches(void)
ceph_cap_cachep = KMEM_CACHE(ceph_cap, SLAB_MEM_SPREAD);
if (!ceph_cap_cachep)
goto bad_cap;
+ ceph_cap_snap_cachep = KMEM_CACHE(ceph_cap_snap, SLAB_MEM_SPREAD);
+ if (!ceph_cap_snap_cachep)
+ goto bad_cap_snap;
ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
if (!ceph_cap_flush_cachep)
@@ -787,16 +928,10 @@ static int __init init_caches(void)
if (!ceph_wb_pagevec_pool)
goto bad_pagevec_pool;
- error = ceph_fscache_register();
- if (error)
- goto bad_fscache;
-
return 0;
-bad_fscache:
- kmem_cache_destroy(ceph_mds_request_cachep);
bad_pagevec_pool:
- mempool_destroy(ceph_wb_pagevec_pool);
+ kmem_cache_destroy(ceph_mds_request_cachep);
bad_mds_req:
kmem_cache_destroy(ceph_dir_file_cachep);
bad_dir_file:
@@ -806,6 +941,8 @@ bad_file:
bad_dentry:
kmem_cache_destroy(ceph_cap_flush_cachep);
bad_cap_flush:
+ kmem_cache_destroy(ceph_cap_snap_cachep);
+bad_cap_snap:
kmem_cache_destroy(ceph_cap_cachep);
bad_cap:
kmem_cache_destroy(ceph_inode_cachep);
@@ -822,14 +959,13 @@ static void destroy_caches(void)
kmem_cache_destroy(ceph_inode_cachep);
kmem_cache_destroy(ceph_cap_cachep);
+ kmem_cache_destroy(ceph_cap_snap_cachep);
kmem_cache_destroy(ceph_cap_flush_cachep);
kmem_cache_destroy(ceph_dentry_cachep);
kmem_cache_destroy(ceph_file_cachep);
kmem_cache_destroy(ceph_dir_file_cachep);
kmem_cache_destroy(ceph_mds_request_cachep);
mempool_destroy(ceph_wb_pagevec_pool);
-
- ceph_fscache_unregister();
}
static void __ceph_umount_begin(struct ceph_fs_client *fsc)
@@ -988,6 +1124,7 @@ static int ceph_set_super(struct super_block *s, struct fs_context *fc)
s->s_time_gran = 1;
s->s_time_min = 0;
s->s_time_max = U32_MAX;
+ s->s_flags |= SB_NODIRATIME | SB_NOATIME;
ret = set_anon_super_fc(s, fc);
if (ret != 0)
@@ -1060,6 +1197,7 @@ static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc)
static int ceph_get_tree(struct fs_context *fc)
{
struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+ struct ceph_mount_options *fsopt = pctx->opts;
struct super_block *sb;
struct ceph_fs_client *fsc;
struct dentry *res;
@@ -1071,6 +1209,8 @@ static int ceph_get_tree(struct fs_context *fc)
if (!fc->source)
return invalfc(fc, "No source");
+ if (fsopt->new_dev_syntax && !fsopt->mon_addr)
+ return invalfc(fc, "No monitor address");
/* create client (which we may/may not use) */
fsc = create_fs_client(pctx->opts, pctx->copts);
@@ -1156,6 +1296,13 @@ static int ceph_reconfigure_fc(struct fs_context *fc)
else
ceph_clear_mount_opt(fsc, ASYNC_DIROPS);
+ if (strcmp_null(fsc->mount_options->mon_addr, fsopt->mon_addr)) {
+ kfree(fsc->mount_options->mon_addr);
+ fsc->mount_options->mon_addr = fsopt->mon_addr;
+ fsopt->mon_addr = NULL;
+ pr_notice("ceph: monitor addresses recorded, but not used for reconnection");
+ }
+
sync_filesystem(fc->root->d_sb);
return 0;
}
@@ -1333,6 +1480,14 @@ bool disable_send_metrics = false;
module_param_cb(disable_send_metrics, &param_ops_metrics, &disable_send_metrics, 0644);
MODULE_PARM_DESC(disable_send_metrics, "Enable sending perf metrics to ceph cluster (default: on)");
+/* for both v1 and v2 syntax */
+static bool mount_support = true;
+static const struct kernel_param_ops param_ops_mount_syntax = {
+ .get = param_get_bool,
+};
+module_param_cb(mount_syntax_v1, &param_ops_mount_syntax, &mount_support, 0444);
+module_param_cb(mount_syntax_v2, &param_ops_mount_syntax, &mount_support, 0444);
+
module_init(init_ceph);
module_exit(exit_ceph);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index ac331aa07cfa..40630e6f691c 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -17,21 +17,17 @@
#include <linux/posix_acl.h>
#include <linux/refcount.h>
#include <linux/security.h>
-
-#include <linux/ceph/libceph.h>
-
-#ifdef CONFIG_CEPH_FSCACHE
-#define FSCACHE_USE_NEW_IO_API
+#include <linux/netfs.h>
#include <linux/fscache.h>
-#endif
+#include <linux/hashtable.h>
-/* f_type in struct statfs */
-#define CEPH_SUPER_MAGIC 0x00c36400
+#include <linux/ceph/libceph.h>
/* large granularity for statfs utilization stats to facilitate
* large volume sizes on 32-bit machines. */
#define CEPH_BLOCK_SHIFT 22 /* 4 MB */
#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
+#define CEPH_4K_BLOCK_SHIFT 12 /* 4 KB */
#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blocklisted */
#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
@@ -45,6 +41,7 @@
#define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */
#define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */
#define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */
+#define CEPH_MOUNT_OPT_NOPAGECACHE (1<<16) /* bypass pagecache altogether */
#define CEPH_MOUNT_OPT_DEFAULT \
(CEPH_MOUNT_OPT_DCACHE | \
@@ -89,6 +86,8 @@ struct ceph_mount_options {
unsigned int max_readdir; /* max readdir result (entries) */
unsigned int max_readdir_bytes; /* max readdir result (bytes) */
+ bool new_dev_syntax;
+
/*
* everything above this point can be memcmp'd; everything below
* is handled in compare_mount_options()
@@ -98,8 +97,11 @@ struct ceph_mount_options {
char *mds_namespace; /* default NULL */
char *server_path; /* default NULL (means "/") */
char *fscache_uniq; /* default NULL */
+ char *mon_addr;
};
+#define CEPH_ASYNC_CREATE_CONFLICT_BITS 8
+
struct ceph_fs_client {
struct super_block *sb;
@@ -120,10 +122,14 @@ struct ceph_fs_client {
struct ceph_mds_client *mdsc;
atomic_long_t writeback_count;
+ bool write_congested;
struct workqueue_struct *inode_wq;
struct workqueue_struct *cap_wq;
+ DECLARE_HASHTABLE(async_unlink_conflict, CEPH_ASYNC_CREATE_CONFLICT_BITS);
+ spinlock_t async_unlink_conflict_lock;
+
#ifdef CONFIG_DEBUG_FS
struct dentry *debugfs_dentry_lru, *debugfs_caps;
struct dentry *debugfs_congestion_kb;
@@ -135,7 +141,7 @@ struct ceph_fs_client {
#endif
#ifdef CONFIG_CEPH_FSCACHE
- struct fscache_cookie *fscache;
+ struct fscache_volume *fscache;
#endif
};
@@ -229,7 +235,7 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
if (refcount_dec_and_test(&capsnap->nref)) {
if (capsnap->xattr_blob)
ceph_buffer_put(capsnap->xattr_blob);
- kfree(capsnap);
+ kmem_cache_free(ceph_cap_snap_cachep, capsnap);
}
}
@@ -280,7 +286,8 @@ struct ceph_dentry_info {
struct dentry *dentry;
struct ceph_mds_session *lease_session;
struct list_head lease_list;
- unsigned flags;
+ struct hlist_node hnode;
+ unsigned long flags;
int lease_shared_gen;
u32 lease_gen;
u32 lease_seq;
@@ -289,10 +296,14 @@ struct ceph_dentry_info {
u64 offset;
};
-#define CEPH_DENTRY_REFERENCED 1
-#define CEPH_DENTRY_LEASE_LIST 2
-#define CEPH_DENTRY_SHRINK_LIST 4
-#define CEPH_DENTRY_PRIMARY_LINK 8
+#define CEPH_DENTRY_REFERENCED (1 << 0)
+#define CEPH_DENTRY_LEASE_LIST (1 << 1)
+#define CEPH_DENTRY_SHRINK_LIST (1 << 2)
+#define CEPH_DENTRY_PRIMARY_LINK (1 << 3)
+#define CEPH_DENTRY_ASYNC_UNLINK_BIT (4)
+#define CEPH_DENTRY_ASYNC_UNLINK (1 << CEPH_DENTRY_ASYNC_UNLINK_BIT)
+#define CEPH_DENTRY_ASYNC_CREATE_BIT (5)
+#define CEPH_DENTRY_ASYNC_CREATE (1 << CEPH_DENTRY_ASYNC_CREATE_BIT)
struct ceph_inode_xattrs_info {
/*
@@ -316,6 +327,7 @@ struct ceph_inode_xattrs_info {
* Ceph inode.
*/
struct ceph_inode_info {
+ struct netfs_inode netfs; /* Netfslib context and vfs inode */
struct ceph_vino i_vino; /* ceph ino + snap */
spinlock_t i_ceph_lock;
@@ -426,17 +438,12 @@ struct ceph_inode_info {
struct work_struct i_work;
unsigned long i_work_mask;
-
-#ifdef CONFIG_CEPH_FSCACHE
- struct fscache_cookie *fscache;
-#endif
- struct inode vfs_inode; /* at end */
};
static inline struct ceph_inode_info *
ceph_inode(const struct inode *inode)
{
- return container_of(inode, struct ceph_inode_info, vfs_inode);
+ return container_of(inode, struct ceph_inode_info, netfs.inode);
}
static inline struct ceph_fs_client *
@@ -535,19 +542,23 @@ static inline int ceph_ino_compare(struct inode *inode, void *data)
*
* These come from src/mds/mdstypes.h in the ceph sources.
*/
-#define CEPH_MAX_MDS 0x100
-#define CEPH_NUM_STRAY 10
+#define CEPH_MAX_MDS 0x100
+#define CEPH_NUM_STRAY 10
#define CEPH_MDS_INO_MDSDIR_OFFSET (1 * CEPH_MAX_MDS)
+#define CEPH_MDS_INO_LOG_OFFSET (2 * CEPH_MAX_MDS)
#define CEPH_INO_SYSTEM_BASE ((6*CEPH_MAX_MDS) + (CEPH_MAX_MDS * CEPH_NUM_STRAY))
static inline bool ceph_vino_is_reserved(const struct ceph_vino vino)
{
- if (vino.ino < CEPH_INO_SYSTEM_BASE &&
- vino.ino >= CEPH_MDS_INO_MDSDIR_OFFSET) {
- WARN_RATELIMIT(1, "Attempt to access reserved inode number 0x%llx", vino.ino);
- return true;
- }
- return false;
+ if (vino.ino >= CEPH_INO_SYSTEM_BASE ||
+ vino.ino < CEPH_MDS_INO_MDSDIR_OFFSET)
+ return false;
+
+ /* Don't warn on mdsdirs */
+ WARN_RATELIMIT(vino.ino >= CEPH_MDS_INO_LOG_OFFSET,
+ "Attempt to access reserved inode number 0x%llx",
+ vino.ino);
+ return true;
}
static inline struct inode *ceph_find_inode(struct super_block *sb,
@@ -758,6 +769,8 @@ extern void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
extern void ceph_reservation_status(struct ceph_fs_client *client,
int *total, int *avail, int *used,
int *reserved, int *min);
+extern void change_auth_cap_ses(struct ceph_inode_info *ci,
+ struct ceph_mds_session *session);
@@ -878,6 +891,8 @@ struct ceph_snap_realm {
struct list_head dirty_item; /* if realm needs new context */
+ struct list_head rebuild_item; /* rebuild snap realms _downward_ in hierarchy */
+
/* the current set of snaps for this realm */
struct ceph_snap_context *cached_context;
@@ -933,7 +948,7 @@ extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
struct ceph_msg *msg);
extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap);
-extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
+extern void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc);
extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc,
u64 snap);
@@ -1016,6 +1031,7 @@ static inline void ceph_queue_flush_snaps(struct inode *inode)
ceph_queue_inode_work(inode, CEPH_I_WORK_FLUSH_SNAPS);
}
+extern int ceph_try_to_choose_auth_mds(struct inode *inode, int mask);
extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
int mask, bool force);
static inline int ceph_do_getattr(struct inode *inode, int mask, bool force)
@@ -1043,6 +1059,7 @@ static inline bool ceph_inode_is_shutdown(struct inode *inode)
/* xattr.c */
int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
+int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size);
ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
@@ -1207,12 +1224,21 @@ extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
/* addr.c */
extern const struct address_space_operations ceph_aops;
+extern const struct netfs_request_ops ceph_netfs_ops;
extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
-extern int ceph_uninline_data(struct file *filp, struct page *locked_page);
+extern int ceph_uninline_data(struct file *file);
extern int ceph_pool_perm_check(struct inode *inode, int need);
extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate);
+static inline bool ceph_has_inline_data(struct ceph_inode_info *ci)
+{
+ if (ci->i_inline_version == CEPH_INLINE_NONE ||
+ ci->i_inline_version == 1) /* initial version, no data */
+ return false;
+ return true;
+}
+
/* file.c */
extern const struct file_operations ceph_file_fops;
@@ -1270,9 +1296,29 @@ extern void ceph_fs_debugfs_init(struct ceph_fs_client *client);
extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
/* quota.c */
-static inline bool __ceph_has_any_quota(struct ceph_inode_info *ci)
+
+enum quota_get_realm {
+ QUOTA_GET_MAX_FILES,
+ QUOTA_GET_MAX_BYTES,
+ QUOTA_GET_ANY
+};
+
+static inline bool __ceph_has_quota(struct ceph_inode_info *ci,
+ enum quota_get_realm which)
{
- return ci->i_max_files || ci->i_max_bytes;
+ bool has_quota = false;
+
+ switch (which) {
+ case QUOTA_GET_MAX_BYTES:
+ has_quota = !!ci->i_max_bytes;
+ break;
+ case QUOTA_GET_MAX_FILES:
+ has_quota = !!ci->i_max_files;
+ break;
+ default:
+ has_quota = !!(ci->i_max_files || ci->i_max_bytes);
+ }
+ return has_quota;
}
extern void ceph_adjust_quota_realms_count(struct inode *inode, bool inc);
@@ -1281,13 +1327,13 @@ static inline void __ceph_update_quota(struct ceph_inode_info *ci,
u64 max_bytes, u64 max_files)
{
bool had_quota, has_quota;
- had_quota = __ceph_has_any_quota(ci);
+ had_quota = __ceph_has_quota(ci, QUOTA_GET_ANY);
ci->i_max_bytes = max_bytes;
ci->i_max_files = max_files;
- has_quota = __ceph_has_any_quota(ci);
+ has_quota = __ceph_has_quota(ci, QUOTA_GET_ANY);
if (had_quota != has_quota)
- ceph_adjust_quota_realms_count(&ci->vfs_inode, has_quota);
+ ceph_adjust_quota_realms_count(&ci->netfs.inode, has_quota);
}
extern void ceph_handle_quota(struct ceph_mds_client *mdsc,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index fcf7dfdecf96..f31350cda960 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -57,7 +57,7 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
size_t size)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_string *pool_ns;
s64 pool = ci->i_layout.pool_id;
@@ -69,7 +69,7 @@ static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
- dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
+ dout("ceph_vxattrcb_layout %p\n", &ci->netfs.inode);
down_read(&osdc->lock);
pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
if (pool_name) {
@@ -161,7 +161,7 @@ static ssize_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
char *val, size_t size)
{
ssize_t ret;
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
s64 pool = ci->i_layout.pool_id;
const char *pool_name;
@@ -313,7 +313,7 @@ static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val,
static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci,
char *val, size_t size)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
return ceph_fmt_xattr(val, size, "%pU", &fsc->client->fsid);
}
@@ -321,7 +321,7 @@ static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci,
static ssize_t ceph_vxattrcb_client_id(struct ceph_inode_info *ci,
char *val, size_t size)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
return ceph_fmt_xattr(val, size, "client%lld",
ceph_client_gid(fsc->client));
@@ -366,6 +366,14 @@ static ssize_t ceph_vxattrcb_auth_mds(struct ceph_inode_info *ci,
}
#define XATTR_RSTAT_FIELD(_type, _name) \
XATTR_NAME_CEPH(_type, _name, VXATTR_FLAG_RSTAT)
+#define XATTR_RSTAT_FIELD_UPDATABLE(_type, _name) \
+ { \
+ .name = CEPH_XATTR_NAME(_type, _name), \
+ .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
+ .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
+ .exists_cb = NULL, \
+ .flags = VXATTR_FLAG_RSTAT, \
+ }
#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
{ \
.name = CEPH_XATTR_NAME2(_type, _name, _field), \
@@ -404,7 +412,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
XATTR_RSTAT_FIELD(dir, rsubdirs),
XATTR_RSTAT_FIELD(dir, rsnaps),
XATTR_RSTAT_FIELD(dir, rbytes),
- XATTR_RSTAT_FIELD(dir, rctime),
+ XATTR_RSTAT_FIELD_UPDATABLE(dir, rctime),
{
.name = "ceph.dir.pin",
.name_size = sizeof("ceph.dir.pin"),
@@ -621,7 +629,7 @@ static int __set_xattr(struct ceph_inode_info *ci,
}
dout("__set_xattr_val added %llx.%llx xattr %p %.*s=%.*s\n",
- ceph_vinop(&ci->vfs_inode), xattr, name_len, name, val_len, val);
+ ceph_vinop(&ci->netfs.inode), xattr, name_len, name, val_len, val);
return 0;
}
@@ -863,7 +871,7 @@ struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci)
struct ceph_buffer *old_blob = NULL;
void *dest;
- dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
+ dout("__build_xattrs_blob %p\n", &ci->netfs.inode);
if (ci->i_xattrs.dirty) {
int need = __get_required_blob_size(ci, 0, 0);
@@ -923,10 +931,13 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_inode_xattr *xattr;
- struct ceph_vxattr *vxattr = NULL;
+ struct ceph_vxattr *vxattr;
int req_mask;
ssize_t err;
+ if (strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
+ goto handle_non_vxattrs;
+
/* let's see if a virtual xattr was requested */
vxattr = ceph_match_vxattr(inode, name);
if (vxattr) {
@@ -945,8 +956,14 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
err = -ERANGE;
}
return err;
+ } else {
+ err = ceph_do_getvxattr(inode, name, value, size);
+ /* this would happen with a new client and old server combo */
+ if (err == -EOPNOTSUPP)
+ err = -ENODATA;
+ return err;
}
-
+handle_non_vxattrs:
req_mask = __get_request_mask(inode);
spin_lock(&ci->i_ceph_lock);
@@ -1069,7 +1086,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
flags |= CEPH_XATTR_REMOVE;
}
- dout("setxattr value=%.*s\n", (int)size, value);
+ dout("setxattr value size: %zu\n", size);
/* do request */
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
@@ -1167,8 +1184,14 @@ int __ceph_setxattr(struct inode *inode, const char *name,
spin_lock(&ci->i_ceph_lock);
retry:
issued = __ceph_caps_issued(ci, NULL);
- if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
+ required_blob_size = __get_required_blob_size(ci, name_len, val_len);
+ if ((ci->i_xattrs.version == 0) || !(issued & CEPH_CAP_XATTR_EXCL) ||
+ (required_blob_size > mdsc->mdsmap->m_max_xattr_size)) {
+ dout("%s do sync setxattr: version: %llu size: %d max: %llu\n",
+ __func__, ci->i_xattrs.version, required_blob_size,
+ mdsc->mdsmap->m_max_xattr_size);
goto do_sync;
+ }
if (!lock_snap_rwsem && !ci->i_head_snapc) {
lock_snap_rwsem = true;
@@ -1184,8 +1207,6 @@ retry:
ceph_cap_string(issued));
__build_xattrs(inode);
- required_blob_size = __get_required_blob_size(ci, name_len, val_len);
-
if (!ci->i_xattrs.prealloc_blob ||
required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
struct ceph_buffer *blob;