Diffstat (limited to 'fs/ceph')
-rw-r--r-- | fs/ceph/addr.c       | 619
-rw-r--r-- | fs/ceph/cache.c      | 220
-rw-r--r-- | fs/ceph/cache.h      | 103
-rw-r--r-- | fs/ceph/caps.c       | 330
-rw-r--r-- | fs/ceph/debugfs.c    |   5
-rw-r--r-- | fs/ceph/dir.c        |  96
-rw-r--r-- | fs/ceph/export.c     |   3
-rw-r--r-- | fs/ceph/file.c       | 285
-rw-r--r-- | fs/ceph/inode.c      | 178
-rw-r--r-- | fs/ceph/locks.c      |   8
-rw-r--r-- | fs/ceph/mds_client.c | 379
-rw-r--r-- | fs/ceph/mds_client.h |  34
-rw-r--r-- | fs/ceph/mdsmap.c     |  24
-rw-r--r-- | fs/ceph/metric.c     |  65
-rw-r--r-- | fs/ceph/metric.h     |  63
-rw-r--r-- | fs/ceph/quota.c      |  36
-rw-r--r-- | fs/ceph/snap.c       | 271
-rw-r--r-- | fs/ceph/strings.c    |   1
-rw-r--r-- | fs/ceph/super.c      | 209
-rw-r--r-- | fs/ceph/super.h      | 118
-rw-r--r-- | fs/ceph/xattr.c      |  49
21 files changed, 1964 insertions, 1132 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e53c8541f5b2..dcf701b05cc1 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -4,8 +4,8 @@ #include <linux/backing-dev.h> #include <linux/fs.h> #include <linux/mm.h> +#include <linux/swap.h> #include <linux/pagemap.h> -#include <linux/writeback.h> /* generic_writepages */ #include <linux/slab.h> #include <linux/pagevec.h> #include <linux/task_io_accounting_ops.h> @@ -63,7 +63,7 @@ (CONGESTION_ON_THRESH(congestion_kb) >> 2)) static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, - struct folio *folio, void **_fsdata); + struct folio **foliop, void **_fsdata); static inline struct ceph_snap_context *page_snap_context(struct page *page) { @@ -76,18 +76,17 @@ static inline struct ceph_snap_context *page_snap_context(struct page *page) * Dirty a page. Optimistically adjust accounting, on the assumption * that we won't race with invalidate. If we do, readjust. */ -static int ceph_set_page_dirty(struct page *page) +static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) { - struct address_space *mapping = page->mapping; struct inode *inode; struct ceph_inode_info *ci; struct ceph_snap_context *snapc; - if (PageDirty(page)) { - dout("%p set_page_dirty %p idx %lu -- already dirty\n", - mapping->host, page, page->index); - BUG_ON(!PagePrivate(page)); - return 0; + if (folio_test_dirty(folio)) { + dout("%p dirty_folio %p idx %lu -- already dirty\n", + mapping->host, folio, folio->index); + VM_BUG_ON_FOLIO(!folio_test_private(folio), folio); + return false; } inode = mapping->host; @@ -111,75 +110,81 @@ static int ceph_set_page_dirty(struct page *page) if (ci->i_wrbuffer_ref == 0) ihold(inode); ++ci->i_wrbuffer_ref; - dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d " + dout("%p dirty_folio %p idx %lu head %d/%d -> %d/%d " "snapc %p seq %lld (%d snaps)\n", - mapping->host, page, page->index, + mapping->host, folio, folio->index, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1, ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, snapc, snapc->seq, snapc->num_snaps); spin_unlock(&ci->i_ceph_lock); /* - * Reference snap context in page->private. Also set - * PagePrivate so that we get invalidatepage callback. + * Reference snap context in folio->private. Also set + * PagePrivate so that we get invalidate_folio callback. */ - BUG_ON(PagePrivate(page)); - attach_page_private(page, snapc); + VM_WARN_ON_FOLIO(folio->private, folio); + folio_attach_private(folio, snapc); - return __set_page_dirty_nobuffers(page); + return ceph_fscache_dirty_folio(mapping, folio); } /* - * If we are truncating the full page (i.e. offset == 0), adjust the - * dirty page counters appropriately. Only called if there is private - * data on the page. + * If we are truncating the full folio (i.e. offset == 0), adjust the + * dirty folio counters appropriately. Only called if there is private + * data on the folio. 
*/ -static void ceph_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) +static void ceph_invalidate_folio(struct folio *folio, size_t offset, + size_t length) { struct inode *inode; struct ceph_inode_info *ci; struct ceph_snap_context *snapc; - wait_on_page_fscache(page); - - inode = page->mapping->host; + inode = folio->mapping->host; ci = ceph_inode(inode); - if (offset != 0 || length != thp_size(page)) { - dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n", - inode, page, page->index, offset, length); + if (offset != 0 || length != folio_size(folio)) { + dout("%p invalidate_folio idx %lu partial dirty page %zu~%zu\n", + inode, folio->index, offset, length); return; } - WARN_ON(!PageLocked(page)); - if (!PagePrivate(page)) - return; + WARN_ON(!folio_test_locked(folio)); + if (folio_test_private(folio)) { + dout("%p invalidate_folio idx %lu full dirty page\n", + inode, folio->index); - dout("%p invalidatepage %p idx %lu full dirty page\n", - inode, page, page->index); + snapc = folio_detach_private(folio); + ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + ceph_put_snap_context(snapc); + } - snapc = detach_page_private(page); - ceph_put_wrbuffer_cap_refs(ci, 1, snapc); - ceph_put_snap_context(snapc); + folio_wait_fscache(folio); } -static int ceph_releasepage(struct page *page, gfp_t gfp) +static bool ceph_release_folio(struct folio *folio, gfp_t gfp) { - dout("%p releasepage %p idx %lu (%sdirty)\n", page->mapping->host, - page, page->index, PageDirty(page) ? "" : "not "); + struct inode *inode = folio->mapping->host; - if (PageFsCache(page)) { - if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS)) - return 0; - wait_on_page_fscache(page); + dout("%llx:%llx release_folio idx %lu (%sdirty)\n", + ceph_vinop(inode), + folio->index, folio_test_dirty(folio) ? 
"" : "not "); + + if (folio_test_private(folio)) + return false; + + if (folio_test_fscache(folio)) { + if (current_is_kswapd() || !(gfp & __GFP_FS)) + return false; + folio_wait_fscache(folio); } - return !PagePrivate(page); + ceph_fscache_note_page_release(inode); + return true; } -static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq) +static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq) { - struct inode *inode = rreq->mapping->host; + struct inode *inode = rreq->inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_layout *lo = &ci->i_layout; u32 blockoff; @@ -194,9 +199,9 @@ static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq) rreq->len = roundup(rreq->len, lo->stripe_unit); } -static bool ceph_netfs_clamp_length(struct netfs_read_subrequest *subreq) +static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq) { - struct inode *inode = subreq->rreq->mapping->host; + struct inode *inode = subreq->rreq->inode; struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 objno, objoff; @@ -213,7 +218,7 @@ static void finish_netfs_read(struct ceph_osd_request *req) { struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode); struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); - struct netfs_read_subrequest *subreq = req->r_priv; + struct netfs_io_subrequest *subreq = req->r_priv; int num_pages; int err = req->r_result; @@ -232,17 +237,72 @@ static void finish_netfs_read(struct ceph_osd_request *req) if (err >= 0 && err < subreq->len) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); - netfs_subreq_terminated(subreq, err, true); + netfs_subreq_terminated(subreq, err, false); num_pages = calc_pages_for(osd_data->alignment, osd_data->length); ceph_put_page_vector(osd_data->pages, num_pages, false); iput(req->r_inode); } -static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq) +static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) { - struct netfs_read_request *rreq = subreq->rreq; - struct inode *inode = rreq->mapping->host; + struct netfs_io_request *rreq = subreq->rreq; + struct inode *inode = rreq->inode; + struct ceph_mds_reply_info_parsed *rinfo; + struct ceph_mds_reply_info_in *iinfo; + struct ceph_mds_request *req; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_inode_info *ci = ceph_inode(inode); + struct iov_iter iter; + ssize_t err = 0; + size_t len; + int mode; + + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); + + if (subreq->start >= inode->i_size) + goto out; + + /* We need to fetch the inline data. 
*/ + mode = ceph_try_to_choose_auth_mds(inode, CEPH_STAT_CAP_INLINE_DATA); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + req->r_ino1 = ci->i_vino; + req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA); + req->r_num_caps = 2; + + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err < 0) + goto out; + + rinfo = &req->r_reply_info; + iinfo = &rinfo->targeti; + if (iinfo->inline_version == CEPH_INLINE_NONE) { + /* The data got uninlined */ + ceph_mdsc_put_request(req); + return false; + } + + len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len); + iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len); + err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter); + if (err == 0) + err = -EFAULT; + + ceph_mdsc_put_request(req); +out: + netfs_subreq_terminated(subreq, err, false); + return true; +} + +static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) +{ + struct netfs_io_request *rreq = subreq->rreq; + struct inode *inode = rreq->inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_request *req; @@ -253,6 +313,9 @@ static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq) int err = 0; u64 len = subreq->len; + if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq)) + return; + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len, 0, 1, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica, @@ -265,7 +328,7 @@ static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq) dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len); iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len); - err = iov_iter_get_pages_alloc(&iter, &pages, len, &page_off); + err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off); if (err < 0) { dout("%s: iov_ter_get_pages_alloc returned %d\n", __func__, err); goto out; @@ -274,6 +337,7 @@ static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq) /* should always give us a page-aligned read */ WARN_ON_ONCE(page_off); len = err; + err = 0; osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); req->r_callback = finish_netfs_read; @@ -281,9 +345,7 @@ static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq) req->r_inode = inode; ihold(inode); - err = ceph_osdc_start_request(req->r_osdc, req, false); - if (err) - iput(inode); + ceph_osdc_start_request(req->r_osdc, req); out: ceph_osdc_put_request(req); if (err) @@ -291,92 +353,95 @@ out: dout("%s: result %d\n", __func__, err); } -static void ceph_init_rreq(struct netfs_read_request *rreq, struct file *file) +static int ceph_init_request(struct netfs_io_request *rreq, struct file *file) { + struct inode *inode = rreq->inode; + int got = 0, want = CEPH_CAP_FILE_CACHE; + int ret = 0; + + if (rreq->origin != NETFS_READAHEAD) + return 0; + + if (file) { + struct ceph_rw_context *rw_ctx; + struct ceph_file_info *fi = file->private_data; + + rw_ctx = ceph_find_rw_context(fi); + if (rw_ctx) + return 0; + } + + /* + * readahead callers do not necessarily hold Fcb caps + * (e.g. fadvise, madvise). 
+ */ + ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got); + if (ret < 0) { + dout("start_read %p, error getting cap\n", inode); + return ret; + } + + if (!(got & want)) { + dout("start_read %p, no cache cap\n", inode); + return -EACCES; + } + if (ret == 0) + return -EACCES; + + rreq->netfs_priv = (void *)(uintptr_t)got; + return 0; } -static void ceph_readahead_cleanup(struct address_space *mapping, void *priv) +static void ceph_netfs_free_request(struct netfs_io_request *rreq) { - struct inode *inode = mapping->host; - struct ceph_inode_info *ci = ceph_inode(inode); - int got = (uintptr_t)priv; + struct ceph_inode_info *ci = ceph_inode(rreq->inode); + int got = (uintptr_t)rreq->netfs_priv; if (got) ceph_put_cap_refs(ci, got); } -static const struct netfs_read_request_ops ceph_netfs_read_ops = { - .init_rreq = ceph_init_rreq, - .is_cache_enabled = ceph_is_cache_enabled, +const struct netfs_request_ops ceph_netfs_ops = { + .init_request = ceph_init_request, + .free_request = ceph_netfs_free_request, .begin_cache_operation = ceph_begin_cache_operation, - .issue_op = ceph_netfs_issue_op, + .issue_read = ceph_netfs_issue_read, .expand_readahead = ceph_netfs_expand_readahead, .clamp_length = ceph_netfs_clamp_length, .check_write_begin = ceph_netfs_check_write_begin, - .cleanup = ceph_readahead_cleanup, }; -/* read a single page, without unlocking it. */ -static int ceph_readpage(struct file *file, struct page *subpage) +#ifdef CONFIG_CEPH_FSCACHE +static void ceph_set_page_fscache(struct page *page) { - struct folio *folio = page_folio(subpage); - struct inode *inode = file_inode(file); - struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vino vino = ceph_vino(inode); - size_t len = folio_size(folio); - u64 off = folio_file_pos(folio); - - if (ci->i_inline_version != CEPH_INLINE_NONE) { - /* - * Uptodate inline data should have been added - * into page cache while getting Fcr caps. - */ - if (off == 0) { - folio_unlock(folio); - return -EINVAL; - } - zero_user_segment(&folio->page, 0, folio_size(folio)); - folio_mark_uptodate(folio); - folio_unlock(folio); - return 0; - } - - dout("readpage ino %llx.%llx file %p off %llu len %zu folio %p index %lu\n", - vino.ino, vino.snap, file, off, len, folio, folio_index(folio)); - - return netfs_readpage(file, folio, &ceph_netfs_read_ops, NULL); + set_page_fscache(page); } -static void ceph_readahead(struct readahead_control *ractl) +static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async) { - struct inode *inode = file_inode(ractl->file); - struct ceph_file_info *fi = ractl->file->private_data; - struct ceph_rw_context *rw_ctx; - int got = 0; - int ret = 0; + struct inode *inode = priv; - if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE) - return; + if (IS_ERR_VALUE(error) && error != -ENOBUFS) + ceph_fscache_invalidate(inode, false); +} - rw_ctx = ceph_find_rw_context(fi); - if (!rw_ctx) { - /* - * readahead callers do not necessarily hold Fcb caps - * (e.g. fadvise, madvise). 
- */ - int want = CEPH_CAP_FILE_CACHE; +static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct fscache_cookie *cookie = ceph_fscache_cookie(ci); - ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got); - if (ret < 0) - dout("start_read %p, error getting cap\n", inode); - else if (!(got & want)) - dout("start_read %p, no cache cap\n", inode); + fscache_write_to_cache(cookie, inode->i_mapping, off, len, i_size_read(inode), + ceph_fscache_write_terminated, inode, caching); +} +#else +static inline void ceph_set_page_fscache(struct page *page) +{ +} - if (ret <= 0) - return; - } - netfs_readahead(ractl, &ceph_netfs_read_ops, (void *)(uintptr_t)got); +static inline void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching) +{ } +#endif /* CONFIG_CEPH_FSCACHE */ struct ceph_writeback_ctl { @@ -483,6 +548,7 @@ static u64 get_writepages_data_length(struct inode *inode, */ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) { + struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); @@ -493,6 +559,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) struct ceph_writeback_ctl ceph_wbc; struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_osd_request *req; + bool caching = ceph_is_cache_enabled(inode); dout("writepage %p idx %lu\n", page, page->index); @@ -516,8 +583,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) /* is this a partial page at end of file? */ if (page_off >= ceph_wbc.i_size) { - dout("%p page eof %llu\n", page, ceph_wbc.i_size); - page->mapping->a_ops->invalidatepage(page, 0, thp_size(page)); + dout("folio at %lu beyond eof %llu\n", folio->index, + ceph_wbc.i_size); + folio_invalidate(folio, 0, folio_size(folio)); return 0; } @@ -529,28 +597,30 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) if (atomic_long_inc_return(&fsc->writeback_count) > CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) - set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); + fsc->write_congested = true; - set_page_writeback(page); req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, ceph_wbc.truncate_seq, ceph_wbc.truncate_size, true); if (IS_ERR(req)) { redirty_page_for_writepage(wbc, page); - end_page_writeback(page); return PTR_ERR(req); } + set_page_writeback(page); + if (caching) + ceph_set_page_fscache(page); + ceph_fscache_write_to_cache(inode, page_off, len, caching); + /* it may be a short write due to an object boundary */ WARN_ON_ONCE(len > thp_size(page)); osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false); dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len); req->r_mtime = inode->i_mtime; - err = ceph_osdc_start_request(osdc, req, true); - if (!err) - err = ceph_osdc_wait_request(osdc, req); + ceph_osdc_start_request(osdc, req); + err = ceph_osdc_wait_request(osdc, req); ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, err); @@ -588,7 +658,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) if (atomic_long_dec_return(&fsc->writeback_count) < CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) 
- clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); + fsc->write_congested = false; return err; } @@ -599,6 +669,13 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) struct inode *inode = page->mapping->host; BUG_ON(!inode); ihold(inode); + + if (wbc->sync_mode == WB_SYNC_NONE && + ceph_inode_to_client(inode)->write_congested) + return AOP_WRITEPAGE_ACTIVATE; + + wait_on_page_fscache(page); + err = writepage_nounlock(page, wbc); if (err == -ERESTARTSYS) { /* direct memory reclaimer was killed by SIGKILL. return 0 @@ -652,8 +729,11 @@ static void writepages_finish(struct ceph_osd_request *req) /* clean all pages */ for (i = 0; i < req->r_num_ops; i++) { - if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) + if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) { + pr_warn("%s incorrect op %d req %p index %d tid %llu\n", + __func__, req->r_ops[i].op, req, i, req->r_tid); break; + } osd_data = osd_req_op_extent_osd_data(req, i); BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); @@ -669,8 +749,7 @@ static void writepages_finish(struct ceph_osd_request *req) if (atomic_long_dec_return(&fsc->writeback_count) < CONGESTION_OFF_THRESH( fsc->mount_options->congestion_kb)) - clear_bdi_congested(inode_to_bdi(inode), - BLK_RW_ASYNC); + fsc->write_congested = false; ceph_put_snap_context(detach_page_private(page)); end_page_writeback(page); @@ -720,6 +799,11 @@ static int ceph_writepages_start(struct address_space *mapping, struct ceph_writeback_ctl ceph_wbc; bool should_loop, range_whole = false; bool done = false; + bool caching = ceph_is_cache_enabled(inode); + + if (wbc->sync_mode == WB_SYNC_NONE && + fsc->write_congested) + return 0; dout("writepages_start %p (mode=%s)\n", inode, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : @@ -828,14 +912,16 @@ get_more_pages: continue; } if (page_offset(page) >= ceph_wbc.i_size) { - dout("%p page eof %llu\n", - page, ceph_wbc.i_size); + struct folio *folio = page_folio(page); + + dout("folio at %lu beyond eof %llu\n", + folio->index, ceph_wbc.i_size); if ((ceph_wbc.size_stable || - page_offset(page) >= i_size_read(inode)) && - clear_page_dirty_for_io(page)) - mapping->a_ops->invalidatepage(page, - 0, thp_size(page)); - unlock_page(page); + folio_pos(folio) >= i_size_read(inode)) && + folio_clear_dirty_for_io(folio)) + folio_invalidate(folio, 0, + folio_size(folio)); + folio_unlock(folio); continue; } if (strip_unit_end && (page->index > strip_unit_end)) { @@ -843,7 +929,7 @@ get_more_pages: unlock_page(page); break; } - if (PageWriteback(page)) { + if (PageWriteback(page) || PageFsCache(page)) { if (wbc->sync_mode == WB_SYNC_NONE) { dout("%p under writeback\n", page); unlock_page(page); @@ -851,6 +937,7 @@ get_more_pages: } dout("waiting on writeback %p\n", page); wait_on_page_writeback(page); + wait_on_page_fscache(page); } if (!clear_page_dirty_for_io(page)) { @@ -914,11 +1001,8 @@ get_more_pages: if (atomic_long_inc_return(&fsc->writeback_count) > CONGESTION_ON_THRESH( - fsc->mount_options->congestion_kb)) { - set_bdi_congested(inode_to_bdi(inode), - BLK_RW_ASYNC); - } - + fsc->mount_options->congestion_kb)) + fsc->write_congested = true; pages[locked_pages++] = page; pvec.pages[i] = NULL; @@ -983,9 +1067,19 @@ new_request: op_idx = 0; for (i = 0; i < locked_pages; i++) { u64 cur_offset = page_offset(pages[i]); + /* + * Discontinuity in page range? Ceph can handle that by just passing + * multiple extents in the write op. 
+ */ if (offset + len != cur_offset) { + /* If it's full, stop here */ if (op_idx + 1 == req->r_num_ops) break; + + /* Kick off an fscache write with what we have so far. */ + ceph_fscache_write_to_cache(inode, offset, len, caching); + + /* Start a new extent */ osd_req_op_extent_dup_last(req, op_idx, cur_offset - offset); dout("writepages got pages at %llu~%llu\n", @@ -996,14 +1090,17 @@ new_request: osd_req_op_extent_update(req, op_idx, len); len = 0; - offset = cur_offset; + offset = cur_offset; data_pages = pages + i; op_idx++; } set_page_writeback(pages[i]); + if (caching) + ceph_set_page_fscache(pages[i]); len += thp_size(page); } + ceph_fscache_write_to_cache(inode, offset, len, caching); if (ceph_wbc.size_stable) { len = min(len, ceph_wbc.i_size - offset); @@ -1051,8 +1148,7 @@ new_request: } req->r_mtime = inode->i_mtime; - rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); - BUG_ON(rc); + ceph_osdc_start_request(&fsc->client->osdc, req); req = NULL; wbc->nr_to_write -= i; @@ -1188,18 +1284,19 @@ ceph_find_incompatible(struct page *page) } static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, - struct folio *folio, void **_fsdata) + struct folio **foliop, void **_fsdata) { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc; - snapc = ceph_find_incompatible(folio_page(folio, 0)); + snapc = ceph_find_incompatible(folio_page(*foliop, 0)); if (snapc) { int r; - folio_unlock(folio); - folio_put(folio); + folio_unlock(*foliop); + folio_put(*foliop); + *foliop = NULL; if (IS_ERR(snapc)) return PTR_ERR(snapc); @@ -1217,59 +1314,22 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned * clean, or already dirty within the same snap context. */ static int ceph_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned aop_flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct folio *folio = NULL; - pgoff_t index = pos >> PAGE_SHIFT; int r; - /* - * Uninlining should have already been done and everything updated, EXCEPT - * for inline_version sent to the MDS. - */ - if (ci->i_inline_version != CEPH_INLINE_NONE) { - unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; - if (aop_flags & AOP_FLAG_NOFS) - fgp_flags |= FGP_NOFS; - folio = __filemap_get_folio(mapping, index, fgp_flags, - mapping_gfp_mask(mapping)); - if (!folio) - return -ENOMEM; + r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, &folio, NULL); + if (r < 0) + return r; - /* - * The inline_version on a new inode is set to 1. If that's the - * case, then the folio is brand new and isn't yet Uptodate. 
- */ - r = 0; - if (index == 0 && ci->i_inline_version != 1) { - if (!folio_test_uptodate(folio)) { - WARN_ONCE(1, "ceph: write_begin called on still-inlined inode (inline_version %llu)!\n", - ci->i_inline_version); - r = -EINVAL; - } - goto out; - } - zero_user_segment(&folio->page, 0, folio_size(folio)); - folio_mark_uptodate(folio); - goto out; - } - - r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &folio, NULL, - &ceph_netfs_read_ops, NULL); -out: - if (r == 0) - folio_wait_fscache(folio); - if (r < 0) { - if (folio) - folio_put(folio); - } else { - WARN_ON_ONCE(!folio_test_locked(folio)); - *pagep = &folio->page; - } - return r; + folio_wait_fscache(folio); + WARN_ON_ONCE(!folio_test_locked(folio)); + *pagep = &folio->page; + return 0; } /* @@ -1313,15 +1373,15 @@ out: } const struct address_space_operations ceph_aops = { - .readpage = ceph_readpage, - .readahead = ceph_readahead, + .read_folio = netfs_read_folio, + .readahead = netfs_readahead, .writepage = ceph_writepage, .writepages = ceph_writepages_start, .write_begin = ceph_write_begin, .write_end = ceph_write_end, - .set_page_dirty = ceph_set_page_dirty, - .invalidatepage = ceph_invalidatepage, - .releasepage = ceph_releasepage, + .dirty_folio = ceph_dirty_folio, + .invalidate_folio = ceph_invalidate_folio, + .release_folio = ceph_release_folio, .direct_IO = noop_direct_IO, }; @@ -1372,7 +1432,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) inode, off, ceph_cap_string(got)); if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || - ci->i_inline_version == CEPH_INLINE_NONE) { + !ceph_has_inline_data(ci)) { CEPH_DEFINE_RW_CONTEXT(rw_ctx, got); ceph_add_rw_context(fi, &rw_ctx); ret = filemap_fault(vmf); @@ -1455,19 +1515,6 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); ceph_block_sigs(&oldset); - if (ci->i_inline_version != CEPH_INLINE_NONE) { - struct page *locked_page = NULL; - if (off == 0) { - lock_page(page); - locked_page = page; - } - err = ceph_uninline_data(vma->vm_file, locked_page); - if (locked_page) - unlock_page(locked_page); - if (err < 0) - goto out_free; - } - if (off + thp_size(page) <= size) len = thp_size(page); else @@ -1524,11 +1571,9 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) ceph_put_snap_context(snapc); } while (err == 0); - if (ret == VM_FAULT_LOCKED || - ci->i_inline_version != CEPH_INLINE_NONE) { + if (ret == VM_FAULT_LOCKED) { int dirty; spin_lock(&ci->i_ceph_lock); - ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); spin_unlock(&ci->i_ceph_lock); @@ -1592,16 +1637,18 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, } } -int ceph_uninline_data(struct file *filp, struct page *locked_page) +int ceph_uninline_data(struct file *file) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_osd_request *req; - struct page *page = NULL; - u64 len, inline_version; + struct ceph_osd_request *req = NULL; + struct ceph_cap_flush *prealloc_cf; + struct folio *folio = NULL; + u64 inline_version = CEPH_INLINE_NONE; + struct page *pages[1]; int err = 0; - bool from_pagecache = false; + u64 len; spin_lock(&ci->i_ceph_lock); inline_version = ci->i_inline_version; @@ -1610,64 +1657,43 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) dout("uninline_data %p %llx.%llx 
inline_version %llu\n", inode, ceph_vinop(inode), inline_version); - if (inline_version == 1 || /* initial version, no data */ - inline_version == CEPH_INLINE_NONE) - goto out; + if (inline_version == CEPH_INLINE_NONE) + return 0; - if (locked_page) { - page = locked_page; - WARN_ON(!PageUptodate(page)); - } else if (ceph_caps_issued(ci) & - (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) { - page = find_get_page(inode->i_mapping, 0); - if (page) { - if (PageUptodate(page)) { - from_pagecache = true; - lock_page(page); - } else { - put_page(page); - page = NULL; - } - } - } + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; - if (page) { - len = i_size_read(inode); - if (len > PAGE_SIZE) - len = PAGE_SIZE; - } else { - page = __page_cache_alloc(GFP_NOFS); - if (!page) { - err = -ENOMEM; - goto out; - } - err = __ceph_do_getattr(inode, page, - CEPH_STAT_CAP_INLINE_DATA, true); - if (err < 0) { - /* no inline data */ - if (err == -ENODATA) - err = 0; - goto out; - } - len = err; + if (inline_version == 1) /* initial version, no data */ + goto out_uninline; + + folio = read_mapping_folio(inode->i_mapping, 0, file); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); + goto out; } + folio_lock(folio); + + len = i_size_read(inode); + if (len > folio_size(folio)) + len = folio_size(folio); + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), 0, &len, 0, 1, CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE, NULL, 0, 0, false); if (IS_ERR(req)) { err = PTR_ERR(req); - goto out; + goto out_unlock; } req->r_mtime = inode->i_mtime; - err = ceph_osdc_start_request(&fsc->client->osdc, req, false); - if (!err) - err = ceph_osdc_wait_request(&fsc->client->osdc, req); + ceph_osdc_start_request(&fsc->client->osdc, req); + err = ceph_osdc_wait_request(&fsc->client->osdc, req); ceph_osdc_put_request(req); if (err < 0) - goto out; + goto out_unlock; req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), 0, &len, 1, 3, @@ -1676,10 +1702,11 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) ci->i_truncate_size, false); if (IS_ERR(req)) { err = PTR_ERR(req); - goto out; + goto out_unlock; } - osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false); + pages[0] = folio_page(folio, 0); + osd_req_op_extent_osd_data_pages(req, 1, pages, len, 0, false, false); { __le64 xattr_buf = cpu_to_le64(inline_version); @@ -1689,7 +1716,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) CEPH_OSD_CMPXATTR_OP_GT, CEPH_OSD_CMPXATTR_MODE_U64); if (err) - goto out_put; + goto out_put_req; } { @@ -1700,30 +1727,41 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) "inline_version", xattr_buf, xattr_len, 0, 0); if (err) - goto out_put; + goto out_put_req; } req->r_mtime = inode->i_mtime; - err = ceph_osdc_start_request(&fsc->client->osdc, req, false); - if (!err) - err = ceph_osdc_wait_request(&fsc->client->osdc, req); + ceph_osdc_start_request(&fsc->client->osdc, req); + err = ceph_osdc_wait_request(&fsc->client->osdc, req); ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, err); -out_put: +out_uninline: + if (!err) { + int dirty; + + /* Set to CAP_INLINE_NONE and dirty the caps */ + down_read(&fsc->mdsc->snap_rwsem); + spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); + spin_unlock(&ci->i_ceph_lock); + up_read(&fsc->mdsc->snap_rwsem); + if (dirty) + 
__mark_inode_dirty(inode, dirty); + } +out_put_req: ceph_osdc_put_request(req); if (err == -ECANCELED) err = 0; -out: - if (page && page != locked_page) { - if (from_pagecache) { - unlock_page(page); - put_page(page); - } else - __free_pages(page, 0); +out_unlock: + if (folio) { + folio_unlock(folio); + folio_put(folio); } - +out: + ceph_free_cap_flush(prealloc_cf); dout("uninline_data %p %llx.%llx inline_version %llu = %d\n", inode, ceph_vinop(inode), inline_version, err); return err; @@ -1738,9 +1776,8 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma) { struct address_space *mapping = file->f_mapping; - if (!mapping->a_ops->readpage) + if (!mapping->a_ops->read_folio) return -ENOEXEC; - file_accessed(file); vma->vm_ops = &ceph_vmops; return 0; } @@ -1753,7 +1790,7 @@ enum { static int __ceph_pool_perm_get(struct ceph_inode_info *ci, s64 pool, struct ceph_string *pool_ns) { - struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->netfs.inode); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_osd_request *rd_req = NULL, *wr_req = NULL; struct rb_node **p, *parent; @@ -1866,15 +1903,13 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE, 0, false, true); - err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); + ceph_osdc_start_request(&fsc->client->osdc, rd_req); - wr_req->r_mtime = ci->vfs_inode.i_mtime; - err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); + wr_req->r_mtime = ci->netfs.inode.i_mtime; + ceph_osdc_start_request(&fsc->client->osdc, wr_req); - if (!err) - err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req); - if (!err2) - err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req); + err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req); + err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req); if (err >= 0 || err == -ENOENT) have |= POOL_READ; diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 457afda5498a..177d8e8d73fe 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -12,199 +12,99 @@ #include "super.h" #include "cache.h" -struct fscache_netfs ceph_cache_netfs = { - .name = "ceph", - .version = 0, -}; - -static DEFINE_MUTEX(ceph_fscache_lock); -static LIST_HEAD(ceph_fscache_list); - -struct ceph_fscache_entry { - struct list_head list; - struct fscache_cookie *fscache; - size_t uniq_len; - /* The following members must be last */ - struct ceph_fsid fsid; - char uniquifier[]; -}; - -static const struct fscache_cookie_def ceph_fscache_fsid_object_def = { - .name = "CEPH.fsid", - .type = FSCACHE_COOKIE_TYPE_INDEX, -}; - -int __init ceph_fscache_register(void) -{ - return fscache_register_netfs(&ceph_cache_netfs); -} - -void ceph_fscache_unregister(void) -{ - fscache_unregister_netfs(&ceph_cache_netfs); -} - -int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc) +void ceph_fscache_register_inode_cookie(struct inode *inode) { - const struct ceph_fsid *fsid = &fsc->client->fsid; - const char *fscache_uniq = fsc->mount_options->fscache_uniq; - size_t uniq_len = fscache_uniq ? 
strlen(fscache_uniq) : 0; - struct ceph_fscache_entry *ent; - int err = 0; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - mutex_lock(&ceph_fscache_lock); - list_for_each_entry(ent, &ceph_fscache_list, list) { - if (memcmp(&ent->fsid, fsid, sizeof(*fsid))) - continue; - if (ent->uniq_len != uniq_len) - continue; - if (uniq_len && memcmp(ent->uniquifier, fscache_uniq, uniq_len)) - continue; - - errorfc(fc, "fscache cookie already registered for fsid %pU, use fsc=<uniquifier> option", - fsid); - err = -EBUSY; - goto out_unlock; - } + /* No caching for filesystem? */ + if (!fsc->fscache) + return; - ent = kzalloc(sizeof(*ent) + uniq_len, GFP_KERNEL); - if (!ent) { - err = -ENOMEM; - goto out_unlock; - } + /* Regular files only */ + if (!S_ISREG(inode->i_mode)) + return; - memcpy(&ent->fsid, fsid, sizeof(*fsid)); - if (uniq_len > 0) { - memcpy(&ent->uniquifier, fscache_uniq, uniq_len); - ent->uniq_len = uniq_len; - } + /* Only new inodes! */ + if (!(inode->i_state & I_NEW)) + return; - fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index, - &ceph_fscache_fsid_object_def, - &ent->fsid, sizeof(ent->fsid) + uniq_len, - NULL, 0, - fsc, 0, true); + WARN_ON_ONCE(ci->netfs.cache); - if (fsc->fscache) { - ent->fscache = fsc->fscache; - list_add_tail(&ent->list, &ceph_fscache_list); - } else { - kfree(ent); - errorfc(fc, "unable to register fscache cookie for fsid %pU", - fsid); - /* all other fs ignore this error */ - } -out_unlock: - mutex_unlock(&ceph_fscache_lock); - return err; + ci->netfs.cache = + fscache_acquire_cookie(fsc->fscache, 0, + &ci->i_vino, sizeof(ci->i_vino), + &ci->i_version, sizeof(ci->i_version), + i_size_read(inode)); } -static enum fscache_checkaux ceph_fscache_inode_check_aux( - void *cookie_netfs_data, const void *data, uint16_t dlen, - loff_t object_size) +void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info *ci) { - struct ceph_inode_info* ci = cookie_netfs_data; - struct inode* inode = &ci->vfs_inode; - - if (dlen != sizeof(ci->i_version) || - i_size_read(inode) != object_size) - return FSCACHE_CHECKAUX_OBSOLETE; + fscache_relinquish_cookie(ceph_fscache_cookie(ci), false); +} - if (*(u64 *)data != ci->i_version) - return FSCACHE_CHECKAUX_OBSOLETE; +void ceph_fscache_use_cookie(struct inode *inode, bool will_modify) +{ + struct ceph_inode_info *ci = ceph_inode(inode); - dout("ceph inode 0x%p cached okay\n", ci); - return FSCACHE_CHECKAUX_OKAY; + fscache_use_cookie(ceph_fscache_cookie(ci), will_modify); } -static const struct fscache_cookie_def ceph_fscache_inode_object_def = { - .name = "CEPH.inode", - .type = FSCACHE_COOKIE_TYPE_DATAFILE, - .check_aux = ceph_fscache_inode_check_aux, -}; - -void ceph_fscache_register_inode_cookie(struct inode *inode) +void ceph_fscache_unuse_cookie(struct inode *inode, bool update) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - - /* No caching for filesystem */ - if (!fsc->fscache) - return; - /* Only cache for regular files that are read only */ - if (!S_ISREG(inode->i_mode)) - return; + if (update) { + loff_t i_size = i_size_read(inode); - inode_lock_nested(inode, I_MUTEX_CHILD); - if (!ci->fscache) { - ci->fscache = fscache_acquire_cookie(fsc->fscache, - &ceph_fscache_inode_object_def, - &ci->i_vino, sizeof(ci->i_vino), - &ci->i_version, sizeof(ci->i_version), - ci, i_size_read(inode), false); + fscache_unuse_cookie(ceph_fscache_cookie(ci), + &ci->i_version, &i_size); + } else { + 
fscache_unuse_cookie(ceph_fscache_cookie(ci), NULL, NULL); } - inode_unlock(inode); } -void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +void ceph_fscache_update(struct inode *inode) { - struct fscache_cookie* cookie; - - if ((cookie = ci->fscache) == NULL) - return; - - ci->fscache = NULL; + struct ceph_inode_info *ci = ceph_inode(inode); + loff_t i_size = i_size_read(inode); - fscache_relinquish_cookie(cookie, &ci->i_vino, false); + fscache_update_cookie(ceph_fscache_cookie(ci), &ci->i_version, &i_size); } -static bool ceph_fscache_can_enable(void *data) +void ceph_fscache_invalidate(struct inode *inode, bool dio_write) { - struct inode *inode = data; - return !inode_is_open_for_write(inode); + struct ceph_inode_info *ci = ceph_inode(inode); + + fscache_invalidate(ceph_fscache_cookie(ci), + &ci->i_version, i_size_read(inode), + dio_write ? FSCACHE_INVAL_DIO_WRITE : 0); } -void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp) +int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc) { - struct ceph_inode_info *ci = ceph_inode(inode); + const struct ceph_fsid *fsid = &fsc->client->fsid; + const char *fscache_uniq = fsc->mount_options->fscache_uniq; + size_t uniq_len = fscache_uniq ? strlen(fscache_uniq) : 0; + char *name; + int err = 0; - if (!fscache_cookie_valid(ci->fscache)) - return; + name = kasprintf(GFP_KERNEL, "ceph,%pU%s%s", fsid, uniq_len ? "," : "", + uniq_len ? fscache_uniq : ""); + if (!name) + return -ENOMEM; - if (inode_is_open_for_write(inode)) { - dout("fscache_file_set_cookie %p %p disabling cache\n", - inode, filp); - fscache_disable_cookie(ci->fscache, &ci->i_vino, false); - } else { - fscache_enable_cookie(ci->fscache, &ci->i_vino, i_size_read(inode), - ceph_fscache_can_enable, inode); - if (fscache_cookie_enabled(ci->fscache)) { - dout("fscache_file_set_cookie %p %p enabling cache\n", - inode, filp); - } + fsc->fscache = fscache_acquire_volume(name, NULL, NULL, 0); + if (IS_ERR_OR_NULL(fsc->fscache)) { + errorfc(fc, "Unable to register fscache cookie for %s", name); + err = fsc->fscache ? 
PTR_ERR(fsc->fscache) : -EOPNOTSUPP; + fsc->fscache = NULL; } + kfree(name); + return err; } void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) { - if (fscache_cookie_valid(fsc->fscache)) { - struct ceph_fscache_entry *ent; - bool found = false; - - mutex_lock(&ceph_fscache_lock); - list_for_each_entry(ent, &ceph_fscache_list, list) { - if (ent->fscache == fsc->fscache) { - list_del(&ent->list); - kfree(ent); - found = true; - break; - } - } - WARN_ON_ONCE(!found); - mutex_unlock(&ceph_fscache_lock); - - __fscache_relinquish_cookie(fsc->fscache, NULL, false); - } - fsc->fscache = NULL; + fscache_relinquish_volume(fsc->fscache, NULL, false); } diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index 058ea2a04376..dc502daac49a 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -12,61 +12,70 @@ #include <linux/netfs.h> #ifdef CONFIG_CEPH_FSCACHE - -extern struct fscache_netfs ceph_cache_netfs; - -int ceph_fscache_register(void); -void ceph_fscache_unregister(void); +#include <linux/fscache.h> int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc); void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc); void ceph_fscache_register_inode_cookie(struct inode *inode); void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci); -void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp); -void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci); -static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) +void ceph_fscache_use_cookie(struct inode *inode, bool will_modify); +void ceph_fscache_unuse_cookie(struct inode *inode, bool update); + +void ceph_fscache_update(struct inode *inode); +void ceph_fscache_invalidate(struct inode *inode, bool dio_write); + +static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci) { - ci->fscache = NULL; + return netfs_i_cookie(&ci->netfs); } -static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci) +static inline void ceph_fscache_resize(struct inode *inode, loff_t to) { - return ci->fscache; + struct ceph_inode_info *ci = ceph_inode(inode); + struct fscache_cookie *cookie = ceph_fscache_cookie(ci); + + if (cookie) { + ceph_fscache_use_cookie(inode, true); + fscache_resize_cookie(cookie, to); + ceph_fscache_unuse_cookie(inode, true); + } } -static inline void ceph_fscache_invalidate(struct inode *inode) +static inline void ceph_fscache_unpin_writeback(struct inode *inode, + struct writeback_control *wbc) { - fscache_invalidate(ceph_inode(inode)->fscache); + fscache_unpin_writeback(wbc, ceph_fscache_cookie(ceph_inode(inode))); } -static inline bool ceph_is_cache_enabled(struct inode *inode) +static inline int ceph_fscache_dirty_folio(struct address_space *mapping, + struct folio *folio) { - struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(inode)); + struct ceph_inode_info *ci = ceph_inode(mapping->host); - if (!cookie) - return false; - return fscache_cookie_enabled(cookie); + return fscache_dirty_folio(mapping, folio, ceph_fscache_cookie(ci)); } -static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq) +static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq) { struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(rreq->inode)); - return fscache_begin_read_operation(rreq, cookie); + return fscache_begin_read_operation(&rreq->cache_resources, cookie); } -#else -static inline int ceph_fscache_register(void) +static inline bool ceph_is_cache_enabled(struct inode *inode) 
{ - return 0; + return fscache_cookie_enabled(ceph_fscache_cookie(ceph_inode(inode))); } -static inline void ceph_fscache_unregister(void) +static inline void ceph_fscache_note_page_release(struct inode *inode) { -} + struct ceph_inode_info *ci = ceph_inode(inode); + fscache_note_page_release(ceph_fscache_cookie(ci)); +} +#else /* CONFIG_CEPH_FSCACHE */ static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc) { @@ -77,41 +86,63 @@ static inline void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) { } -static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) +static inline void ceph_fscache_register_inode_cookie(struct inode *inode) { } -static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci) +static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) { - return NULL; } -static inline void ceph_fscache_register_inode_cookie(struct inode *inode) +static inline void ceph_fscache_use_cookie(struct inode *inode, bool will_modify) { } -static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +static inline void ceph_fscache_unuse_cookie(struct inode *inode, bool update) { } -static inline void ceph_fscache_file_set_cookie(struct inode *inode, - struct file *filp) +static inline void ceph_fscache_update(struct inode *inode) { } -static inline void ceph_fscache_invalidate(struct inode *inode) +static inline void ceph_fscache_invalidate(struct inode *inode, bool dio_write) { } +static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci) +{ + return NULL; +} + +static inline void ceph_fscache_resize(struct inode *inode, loff_t to) +{ +} + +static inline void ceph_fscache_unpin_writeback(struct inode *inode, + struct writeback_control *wbc) +{ +} + +static inline int ceph_fscache_dirty_folio(struct address_space *mapping, + struct folio *folio) +{ + return filemap_dirty_folio(mapping, folio); +} + static inline bool ceph_is_cache_enabled(struct inode *inode) { return false; } -static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq) +static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq) { return -ENOBUFS; } -#endif -#endif /* _CEPH_CACHE_H */ +static inline void ceph_fscache_note_page_release(struct inode *inode) +{ +} +#endif /* CONFIG_CEPH_FSCACHE */ + +#endif diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index b9460b6fb76f..fb023f9fafcb 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -492,7 +492,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc, struct ceph_mount_options *opt = mdsc->fsc->mount_options; ci->i_hold_caps_max = round_jiffies(jiffies + opt->caps_wanted_delay_max * HZ); - dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode, + dout("__cap_set_timeouts %p %lu\n", &ci->netfs.inode, ci->i_hold_caps_max - jiffies); } @@ -507,7 +507,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc, static void __cap_delay_requeue(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { - dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode, + dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->netfs.inode, ci->i_ceph_flags, ci->i_hold_caps_max); if (!mdsc->stopping) { spin_lock(&mdsc->cap_delay_lock); @@ -531,7 +531,7 @@ no_change: static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { - dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode); + dout("__cap_delay_requeue_front %p\n", &ci->netfs.inode); 
spin_lock(&mdsc->cap_delay_lock); ci->i_ceph_flags |= CEPH_I_FLUSH; if (!list_empty(&ci->i_cap_delay_list)) @@ -548,7 +548,7 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, static void __cap_delay_cancel(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { - dout("__cap_delay_cancel %p\n", &ci->vfs_inode); + dout("__cap_delay_cancel %p\n", &ci->netfs.inode); if (list_empty(&ci->i_cap_delay_list)) return; spin_lock(&mdsc->cap_delay_lock); @@ -568,7 +568,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, * Each time we receive FILE_CACHE anew, we increment * i_rdcache_gen. */ - if (S_ISREG(ci->vfs_inode.i_mode) && + if (S_ISREG(ci->netfs.inode.i_mode) && (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) { ci->i_rdcache_gen++; @@ -583,14 +583,14 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) { if (issued & CEPH_CAP_FILE_SHARED) atomic_inc(&ci->i_shared_gen); - if (S_ISDIR(ci->vfs_inode.i_mode)) { - dout(" marking %p NOT complete\n", &ci->vfs_inode); + if (S_ISDIR(ci->netfs.inode.i_mode)) { + dout(" marking %p NOT complete\n", &ci->netfs.inode); __ceph_dir_clear_complete(ci); } } /* Wipe saved layout if we're losing DIR_CREATE caps */ - if (S_ISDIR(ci->vfs_inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) && + if (S_ISDIR(ci->netfs.inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) && !(issued & CEPH_CAP_DIR_CREATE)) { ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); @@ -602,8 +602,8 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, * @ci: inode to be moved * @session: new auth caps session */ -static void change_auth_cap_ses(struct ceph_inode_info *ci, - struct ceph_mds_session *session) +void change_auth_cap_ses(struct ceph_inode_info *ci, + struct ceph_mds_session *session) { lockdep_assert_held(&ci->i_ceph_lock); @@ -754,6 +754,7 @@ void ceph_add_cap(struct inode *inode, cap->issue_seq = seq; cap->mseq = mseq; cap->cap_gen = gen; + wake_up_all(&ci->i_cap_wq); } /* @@ -771,7 +772,7 @@ static int __cap_is_valid(struct ceph_cap *cap) if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) { dout("__cap_is_valid %p cap %p issued %s " - "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode, + "but STALE (gen %u vs %u)\n", &cap->ci->netfs.inode, cap, ceph_cap_string(cap->issued), cap->cap_gen, gen); return 0; } @@ -797,7 +798,7 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) if (!__cap_is_valid(cap)) continue; dout("__ceph_caps_issued %p cap %p issued %s\n", - &ci->vfs_inode, cap, ceph_cap_string(cap->issued)); + &ci->netfs.inode, cap, ceph_cap_string(cap->issued)); have |= cap->issued; if (implemented) *implemented |= cap->implemented; @@ -844,12 +845,12 @@ static void __touch_cap(struct ceph_cap *cap) spin_lock(&s->s_cap_lock); if (!s->s_cap_iterator) { - dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap, + dout("__touch_cap %p cap %p mds%d\n", &cap->ci->netfs.inode, cap, s->s_mds); list_move_tail(&cap->session_caps, &s->s_caps); } else { dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n", - &cap->ci->vfs_inode, cap, s->s_mds); + &cap->ci->netfs.inode, cap, s->s_mds); } spin_unlock(&s->s_cap_lock); } @@ -867,7 +868,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) if ((have & mask) == mask) 
{ dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s" - " (mask %s)\n", ceph_ino(&ci->vfs_inode), + " (mask %s)\n", ceph_ino(&ci->netfs.inode), ceph_cap_string(have), ceph_cap_string(mask)); return 1; @@ -879,7 +880,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) continue; if ((cap->issued & mask) == mask) { dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s" - " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap, + " (mask %s)\n", ceph_ino(&ci->netfs.inode), cap, ceph_cap_string(cap->issued), ceph_cap_string(mask)); if (touch) @@ -891,7 +892,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) have |= cap->issued; if ((have & mask) == mask) { dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s" - " (mask %s)\n", ceph_ino(&ci->vfs_inode), + " (mask %s)\n", ceph_ino(&ci->netfs.inode), ceph_cap_string(cap->issued), ceph_cap_string(mask)); if (touch) { @@ -919,7 +920,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask, int touch) { - struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb); int r; r = __ceph_caps_issued_mask(ci, mask, touch); @@ -950,7 +951,7 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci, int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; int ret; spin_lock(&ci->i_ceph_lock); @@ -969,8 +970,8 @@ int __ceph_caps_used(struct ceph_inode_info *ci) if (ci->i_rd_ref) used |= CEPH_CAP_FILE_RD; if (ci->i_rdcache_ref || - (S_ISREG(ci->vfs_inode.i_mode) && - ci->vfs_inode.i_data.nrpages)) + (S_ISREG(ci->netfs.inode.i_mode) && + ci->netfs.inode.i_data.nrpages)) used |= CEPH_CAP_FILE_CACHE; if (ci->i_wr_ref) used |= CEPH_CAP_FILE_WR; @@ -993,11 +994,11 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR); const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY); struct ceph_mount_options *opt = - ceph_inode_to_client(&ci->vfs_inode)->mount_options; + ceph_inode_to_client(&ci->netfs.inode)->mount_options; unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ; unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ; - if (S_ISDIR(ci->vfs_inode.i_mode)) { + if (S_ISDIR(ci->netfs.inode.i_mode)) { int want = 0; /* use used_cutoff here, to keep dir's wanted caps longer */ @@ -1050,7 +1051,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) int __ceph_caps_wanted(struct ceph_inode_info *ci) { int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci); - if (S_ISDIR(ci->vfs_inode.i_mode)) { + if (S_ISDIR(ci->netfs.inode.i_mode)) { /* we want EXCL if holding caps of dir ops */ if (w & CEPH_CAP_ANY_DIR_OPS) w |= CEPH_CAP_FILE_EXCL; @@ -1116,9 +1117,9 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) lockdep_assert_held(&ci->i_ceph_lock); - dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); + dout("__ceph_remove_cap %p from %p\n", cap, &ci->netfs.inode); - mdsc = ceph_inode_to_client(&ci->vfs_inode)->mdsc; + mdsc = ceph_inode_to_client(&ci->netfs.inode)->mdsc; /* remove from inode's cap rbtree, and clear auth cap */ rb_erase(&cap->ci_node, &ci->i_caps); @@ -1169,7 +1170,7 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) * keep i_snap_realm. 
*/ if (ci->i_wr_ref == 0 && ci->i_snap_realm) - ceph_change_snap_realm(&ci->vfs_inode, NULL); + ceph_change_snap_realm(&ci->netfs.inode, NULL); __cap_delay_cancel(mdsc, ci); } @@ -1188,11 +1189,11 @@ void ceph_remove_cap(struct ceph_cap *cap, bool queue_release) lockdep_assert_held(&ci->i_ceph_lock); - fsc = ceph_inode_to_client(&ci->vfs_inode); + fsc = ceph_inode_to_client(&ci->netfs.inode); WARN_ON_ONCE(ci->i_auth_cap == cap && !list_empty(&ci->i_dirty_item) && !fsc->blocklisted && - !ceph_inode_is_shutdown(&ci->vfs_inode)); + !ceph_inode_is_shutdown(&ci->netfs.inode)); __ceph_remove_cap(cap, queue_release); } @@ -1343,7 +1344,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, int flushing, u64 flush_tid, u64 oldest_flush_tid) { struct ceph_inode_info *ci = cap->ci; - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; int held, revoking; lockdep_assert_held(&ci->i_ceph_lock); @@ -1440,7 +1441,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci) { struct ceph_msg *msg; - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false); if (!msg) { @@ -1528,7 +1529,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_cap_snap *capsnap; u64 oldest_flush_tid = 0; @@ -1577,7 +1578,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, while (first_tid <= last_tid) { struct ceph_cap *cap = ci->i_auth_cap; - struct ceph_cap_flush *cf; + struct ceph_cap_flush *cf = NULL, *iter; int ret; if (!(cap && cap->session == session)) { @@ -1587,8 +1588,9 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, } ret = -ENOENT; - list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) { - if (cf->tid >= first_tid) { + list_for_each_entry(iter, &ci->i_cap_flush_list, i_list) { + if (iter->tid >= first_tid) { + cf = iter; ret = 0; break; } @@ -1621,7 +1623,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, void ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_mds_session *session = NULL; int mds; @@ -1681,8 +1683,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, struct ceph_cap_flush **pcf) { struct ceph_mds_client *mdsc = - ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; - struct inode *inode = &ci->vfs_inode; + ceph_sb_to_client(ci->netfs.inode.i_sb)->mdsc; + struct inode *inode = &ci->netfs.inode; int was = ci->i_dirty_caps; int dirty = 0; @@ -1695,7 +1697,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, return 0; } - dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode, + dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->netfs.inode, ceph_cap_string(mask), ceph_cap_string(was), ceph_cap_string(was | mask)); ci->i_dirty_caps |= mask; @@ -1711,7 +1713,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, ci->i_snap_realm->cached_context); } dout(" inode %p now dirty snapc %p auth cap %p\n", - &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); + &ci->netfs.inode, 
ci->i_head_snapc, ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); list_add(&ci->i_dirty_item, &session->s_cap_dirty); @@ -1856,7 +1858,7 @@ static int try_nonblocking_invalidate(struct inode *inode) u32 invalidating_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); - ceph_fscache_invalidate(inode); + ceph_fscache_invalidate(inode, false); invalidate_mapping_pages(&inode->i_data, 0, -1); spin_lock(&ci->i_ceph_lock); @@ -1874,7 +1876,7 @@ static int try_nonblocking_invalidate(struct inode *inode) bool __ceph_should_report_size(struct ceph_inode_info *ci) { - loff_t size = i_size_read(&ci->vfs_inode); + loff_t size = i_size_read(&ci->netfs.inode); /* mds will adjust max size according to the reported size */ if (ci->i_flushing_caps & CEPH_CAP_FILE_WR) return false; @@ -1899,7 +1901,7 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci) void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_session *session) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_cap *cap; u64 flush_tid, oldest_flush_tid; @@ -1910,11 +1912,19 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct rb_node *p; bool queue_invalidate = false; bool tried_invalidate = false; + bool queue_writeback = false; if (session) ceph_get_mds_session(session); spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { + /* Don't send messages until we get the async create reply */ + spin_unlock(&ci->i_ceph_lock); + ceph_put_mds_session(session); + return; + } + if (ci->i_ceph_flags & CEPH_I_FLUSH) flags |= CHECK_CAPS_FLUSH; retry: @@ -1969,14 +1979,15 @@ retry: } dout("check_caps %llx.%llx file_want %s used %s dirty %s flushing %s" - " issued %s revoking %s retain %s %s%s\n", ceph_vinop(inode), + " issued %s revoking %s retain %s %s%s%s\n", ceph_vinop(inode), ceph_cap_string(file_wanted), ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), ceph_cap_string(ci->i_flushing_caps), ceph_cap_string(issued), ceph_cap_string(revoking), ceph_cap_string(retain), (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", - (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : ""); + (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "", + (flags & CHECK_CAPS_NOINVAL) ? " NOINVAL" : ""); /* * If we no longer need to hold onto old our caps, and we may @@ -2055,10 +2066,27 @@ retry: } /* completed revocation? going down and there are no caps? */ - if (revoking && (revoking & cap_used) == 0) { - dout("completed revocation of %s\n", - ceph_cap_string(cap->implemented & ~cap->issued)); - goto ack; + if (revoking) { + if ((revoking & cap_used) == 0) { + dout("completed revocation of %s\n", + ceph_cap_string(cap->implemented & ~cap->issued)); + goto ack; + } + + /* + * If "i_wrbuffer_ref" was increased by mmap or a generic + * cache write just before ceph_check_caps() was called, + * revoking the Fb capability will fail this time. Then we + * must wait for the BDI's delayed work to flush the dirty + * pages and release the "i_wrbuffer_ref", which will cost + * at most 5 seconds. That means the MDS needs to wait at + * most 5 seconds to finish the Fb capability's revocation. + * + * Let's queue a writeback for it. + */ + if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref && + (revoking & CEPH_CAP_FILE_BUFFER)) + queue_writeback = true; } /* want more caps from mds?
*/ @@ -2128,6 +2156,8 @@ ack: spin_unlock(&ci->i_ceph_lock); ceph_put_mds_session(session); + if (queue_writeback) + ceph_queue_writeback(inode); if (queue_invalidate) ceph_queue_invalidate(inode); } @@ -2211,13 +2241,14 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid) } /* - * wait for any unsafe requests to complete. + * flush the mdlog and wait for any unsafe requests to complete. */ -static int unsafe_request_wait(struct inode *inode) +static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req1 = NULL, *req2 = NULL; + unsigned int max_sessions; int ret, err = 0; spin_lock(&ci->i_unsafe_lock); @@ -2236,36 +2267,46 @@ static int unsafe_request_wait(struct inode *inode) spin_unlock(&ci->i_unsafe_lock); /* + * The mdsc->max_sessions is unlikely to change very + * often, so here we retry by reallocating the sessions + * array memory rather than taking the mdsc->mutex lock. + */ +retry: + max_sessions = mdsc->max_sessions; + + /* * Trigger to flush the journal logs in all the relevant MDSes * manually, or in the worst case we must wait at most 5 seconds * to wait the journal logs to be flushed by the MDSes periodically. */ - if (req1 || req2) { + if ((req1 || req2) && likely(max_sessions)) { struct ceph_mds_session **sessions = NULL; struct ceph_mds_session *s; struct ceph_mds_request *req; - unsigned int max; int i; - /* - * The mdsc->max_sessions is unlikely to be changed - * mostly, here we will retry it by reallocating the - * sessions arrary memory to get rid of the mdsc->mutex - * lock. - */ -retry: - max = mdsc->max_sessions; - sessions = krealloc(sessions, max * sizeof(s), __GFP_ZERO); - if (!sessions) - return -ENOMEM; + sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL); + if (!sessions) { + err = -ENOMEM; + goto out; + } spin_lock(&ci->i_unsafe_lock); if (req1) { list_for_each_entry(req, &ci->i_unsafe_dirops, r_unsafe_dir_item) { s = req->r_session; - if (unlikely(s->s_mds >= max)) { + if (!s) + continue; + if (unlikely(s->s_mds >= max_sessions)) { spin_unlock(&ci->i_unsafe_lock); + for (i = 0; i < max_sessions; i++) { + s = sessions[i]; + if (s) + ceph_put_mds_session(s); + } + kfree(sessions); goto retry; } if (!sessions[s->s_mds]) { @@ -2278,8 +2319,16 @@ retry: list_for_each_entry(req, &ci->i_unsafe_iops, r_unsafe_target_item) { s = req->r_session; - if (unlikely(s->s_mds >= max)) { + if (!s) + continue; + if (unlikely(s->s_mds >= max_sessions)) { spin_unlock(&ci->i_unsafe_lock); + for (i = 0; i < max_sessions; i++) { + s = sessions[i]; + if (s) + ceph_put_mds_session(s); + } + kfree(sessions); goto retry; } if (!sessions[s->s_mds]) { @@ -2300,7 +2349,7 @@ retry: spin_unlock(&ci->i_ceph_lock); /* send flush mdlog request to MDSes */ - for (i = 0; i < max; i++) { + for (i = 0; i < max_sessions; i++) { s = sessions[i]; if (s) { send_flush_mdlog(s); @@ -2310,22 +2359,26 @@ retry: kfree(sessions); } - dout("unsafe_request_wait %p wait on tid %llu %llu\n", + dout("%s %p wait on tid %llu %llu\n", __func__, inode, req1 ? req1->r_tid : 0ULL, req2 ?
req2->r_tid : 0ULL); if (req1) { ret = !wait_for_completion_timeout(&req1->r_safe_completion, ceph_timeout_jiffies(req1->r_timeout)); if (ret) err = -EIO; - ceph_mdsc_put_request(req1); } if (req2) { ret = !wait_for_completion_timeout(&req2->r_safe_completion, ceph_timeout_jiffies(req2->r_timeout)); if (ret) err = -EIO; - ceph_mdsc_put_request(req2); } + +out: + if (req1) + ceph_mdsc_put_request(req1); + if (req2) + ceph_mdsc_put_request(req2); return err; } @@ -2350,7 +2403,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); - err = unsafe_request_wait(inode); + err = flush_mdlog_and_wait_inode_unsafe_requests(inode); /* * only wait on non-file metadata writeback (the mds @@ -2388,7 +2441,11 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync); dout("write_inode %p wait=%d\n", inode, wait); + ceph_fscache_unpin_writeback(inode, wbc); if (wait) { + err = ceph_wait_on_async_create(inode); + if (err) + return err; dirty = try_flush_caps(inode, &flush_tid); if (dirty) err = wait_event_interruptible(ci->i_cap_wq, @@ -2412,13 +2469,17 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; struct ceph_cap *cap; struct ceph_cap_flush *cf; int ret; u64 first_tid = 0; u64 last_snap_flush = 0; + /* Don't do anything until the create reply comes in */ + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) + return; + ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) { @@ -2501,7 +2562,7 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, cap = ci->i_auth_cap; if (!(cap && cap->session == session)) { pr_err("%p auth cap %p not mds%d ???\n", - &ci->vfs_inode, cap, session->s_mds); + &ci->netfs.inode, cap, session->s_mds); spin_unlock(&ci->i_ceph_lock); continue; } @@ -2551,7 +2612,7 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, cap = ci->i_auth_cap; if (!(cap && cap->session == session)) { pr_err("%p auth cap %p not mds%d ???\n", - &ci->vfs_inode, cap, session->s_mds); + &ci->netfs.inode, cap, session->s_mds); spin_unlock(&ci->i_ceph_lock); continue; } @@ -2571,7 +2632,7 @@ void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session, lockdep_assert_held(&ci->i_ceph_lock); - dout("%s %p flushing %s\n", __func__, &ci->vfs_inode, + dout("%s %p flushing %s\n", __func__, &ci->netfs.inode, ceph_cap_string(ci->i_flushing_caps)); if (!list_empty(&ci->i_cap_flush_list)) { @@ -2614,10 +2675,10 @@ void ceph_take_cap_refs(struct ceph_inode_info *ci, int got, } if (got & CEPH_CAP_FILE_BUFFER) { if (ci->i_wb_ref == 0) - ihold(&ci->vfs_inode); + ihold(&ci->netfs.inode); ci->i_wb_ref++; dout("%s %p wb %d -> %d (?)\n", __func__, - &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref); + &ci->netfs.inode, ci->i_wb_ref-1, ci->i_wb_ref); } } @@ -2699,13 +2760,17 @@ again: * on transition from wanted -> needed caps. This is needed * for WRBUFFER|WR -> WR to avoid a new WR sync write from * going before a prior buffered writeback happens. + * + * For RDCACHE|RD -> RD, there is no need to wait and we can + * just exclude the revoking caps and force a sync read.
*/ int not = want & ~(have & need); int revoking = implemented & ~have; + int exclude = revoking & not; dout("get_cap_refs %p have %s but not %s (revoking %s)\n", inode, ceph_cap_string(have), ceph_cap_string(not), ceph_cap_string(revoking)); - if ((revoking & not) == 0) { + if (!exclude || !(exclude & CEPH_CAP_FILE_BUFFER)) { if (!snap_rwsem_locked && !ci->i_head_snapc && (need & CEPH_CAP_FILE_WR)) { @@ -2727,7 +2792,7 @@ again: snap_rwsem_locked = true; } if ((have & want) == want) - *got = need | want; + *got = need | (want & ~exclude); else *got = need; ceph_take_cap_refs(ci, *got, true); @@ -2945,8 +3010,8 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got return ret; } - if (S_ISREG(ci->vfs_inode.i_mode) && - ci->i_inline_version != CEPH_INLINE_NONE && + if (S_ISREG(ci->netfs.inode.i_mode) && + ceph_has_inline_data(ci) && (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && i_size_read(inode) > 0) { struct page *page = @@ -3035,7 +3100,7 @@ enum put_cap_refs_mode { static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, enum put_cap_refs_mode mode) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; int last = 0, put = 0, flushsnaps = 0, wake = 0; bool check_flushsnaps = false; @@ -3143,11 +3208,10 @@ void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had) void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc) { - struct inode *inode = &ci->vfs_inode; - struct ceph_cap_snap *capsnap = NULL; + struct inode *inode = &ci->netfs.inode; + struct ceph_cap_snap *capsnap = NULL, *iter; int put = 0; bool last = false; - bool found = false; bool flush_snaps = false; bool complete_capsnap = false; @@ -3174,14 +3238,14 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, last ? " LAST" : ""); } else { - list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { - if (capsnap->context == snapc) { - found = true; + list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) { + if (iter->context == snapc) { + capsnap = iter; break; } } - if (!found) { + if (!capsnap) { /* * The capsnap should already be removed when removing * auth cap in the case of a forced unmount. 
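The get_cap_refs() hunk above implements the rule described in that comment: caps that are being revoked and are merely wanted (not needed) get excluded from the grant, except that a revoking Fb still forces the caller to wait. Below is a minimal userspace sketch (not part of the diff) of that mask arithmetic; the abbreviated cap bits are illustrative stand-ins for the real CEPH_CAP_* values.

#include <stdio.h>

#define CAP_FILE_RD     0x01    /* Fr */
#define CAP_FILE_CACHE  0x02    /* Fc */
#define CAP_FILE_BUFFER 0x04    /* Fb */

/* Returns the granted cap mask, or -1 if the caller must wait. */
static int grant_caps(int need, int want, int have, int implemented)
{
	int not = want & ~(have & need);     /* wanted but not needed   */
	int revoking = implemented & ~have;  /* being revoked right now */
	int exclude = revoking & not;        /* revoking caps to skip   */

	if (exclude && (exclude & CAP_FILE_BUFFER))
		return -1;  /* Fb is being revoked: wait for writeback */
	if ((have & want) == want)
		return need | (want & ~exclude);
	return need;
}

int main(void)
{
	/* RDCACHE|RD -> RD: Fc is revoking, so grant Fr and sync read. */
	printf("got=0x%x\n", grant_caps(CAP_FILE_RD,
					CAP_FILE_RD | CAP_FILE_CACHE,
					CAP_FILE_RD,
					CAP_FILE_RD | CAP_FILE_CACHE));
	return 0;
}

This prints got=0x1: the read proceeds immediately with Fr only, instead of blocking until the Fc revocation finishes.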
@@ -3375,8 +3439,7 @@ static void handle_cap_grant(struct inode *inode, if ((newcaps & CEPH_CAP_LINK_SHARED) && (extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) { set_nlink(inode, le32_to_cpu(grant->nlink)); - if (inode->i_nlink == 0 && - (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL))) + if (inode->i_nlink == 0) deleted_inode = true; } @@ -3492,6 +3555,9 @@ static void handle_cap_grant(struct inode *inode, check_caps = 1; /* check auth cap only */ else check_caps = 2; /* check all caps */ + /* If there are new caps, try to wake up the waiters */ + if (~cap->issued & newcaps) + wake = true; cap->issued = newcaps; cap->implemented |= newcaps; } else if (cap->issued == newcaps) { @@ -3521,24 +3587,23 @@ static void handle_cap_grant(struct inode *inode, fill_inline = true; } - if (ci->i_auth_cap == cap && - le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { - if (newcaps & ~extra_info->issued) - wake = true; + if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { + if (ci->i_auth_cap == cap) { + if (newcaps & ~extra_info->issued) + wake = true; - if (ci->i_requested_max_size > max_size || - !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) { - /* re-request max_size if necessary */ - ci->i_requested_max_size = 0; - wake = true; - } + if (ci->i_requested_max_size > max_size || + !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) { + /* re-request max_size if necessary */ + ci->i_requested_max_size = 0; + wake = true; + } - ceph_kick_flushing_inode_caps(session, ci); - spin_unlock(&ci->i_ceph_lock); + ceph_kick_flushing_inode_caps(session, ci); + } up_read(&session->s_mdsc->snap_rwsem); - } else { - spin_unlock(&ci->i_ceph_lock); } + spin_unlock(&ci->i_ceph_lock); if (fill_inline) ceph_fill_inline_data(inode, NULL, extra_info->inline_data, @@ -3641,7 +3706,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, session->s_mds, &list_first_entry(&session->s_cap_flushing, struct ceph_inode_info, - i_flushing_item)->vfs_inode); + i_flushing_item)->netfs.inode); } } mdsc->num_cap_flushing--; @@ -3732,8 +3797,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; u64 follows = le64_to_cpu(m->snap_follows); - struct ceph_cap_snap *capsnap; - bool flushed = false; + struct ceph_cap_snap *capsnap = NULL, *iter; bool wake_ci = false; bool wake_mdsc = false; @@ -3741,26 +3805,26 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, inode, ci, session->s_mds, follows); spin_lock(&ci->i_ceph_lock); - list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { - if (capsnap->follows == follows) { - if (capsnap->cap_flush.tid != flush_tid) { + list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) { + if (iter->follows == follows) { + if (iter->cap_flush.tid != flush_tid) { dout(" cap_snap %p follows %lld tid %lld !=" - " %lld\n", capsnap, follows, - flush_tid, capsnap->cap_flush.tid); + " %lld\n", iter, follows, + flush_tid, iter->cap_flush.tid); break; } - flushed = true; + capsnap = iter; break; } else { dout(" skipping cap_snap %p follows %lld\n", - capsnap, capsnap->follows); + iter, iter->follows); } } - if (flushed) + if (capsnap) ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc); spin_unlock(&ci->i_ceph_lock); - if (flushed) { + if (capsnap) { ceph_put_snap_context(capsnap->context); ceph_put_cap_snap(capsnap); if (wake_ci) @@ -3837,6 +3901,7 @@ static void handle_cap_export(struct inode *inode, struct
ceph_mds_caps *ex, dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n", inode, ci, mds, mseq, target); retry: + down_read(&mdsc->snap_rwsem); spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id)) @@ -3900,6 +3965,7 @@ retry: } spin_unlock(&ci->i_ceph_lock); + up_read(&mdsc->snap_rwsem); mutex_unlock(&session->s_mutex); /* open target session */ @@ -3925,6 +3991,7 @@ retry: out_unlock: spin_unlock(&ci->i_ceph_lock); + up_read(&mdsc->snap_rwsem); mutex_unlock(&session->s_mutex); if (tsession) { mutex_unlock(&tsession->s_mutex); @@ -4133,7 +4200,6 @@ void ceph_handle_caps(struct ceph_mds_session *session, /* lookup ino */ inode = ceph_find_inode(mdsc->fsc->sb, vino); - ci = ceph_inode(inode); dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, vino.snap, inode); @@ -4159,6 +4225,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, } goto flush_cap_releases; } + ci = ceph_inode(inode); /* these will work even if we don't have a cap yet */ switch (op) { @@ -4286,7 +4353,7 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) break; list_del_init(&ci->i_cap_delay_list); - inode = igrab(&ci->vfs_inode); + inode = igrab(&ci->netfs.inode); if (inode) { spin_unlock(&mdsc->cap_delay_lock); dout("check_delayed_caps on %p\n", inode); @@ -4314,10 +4381,11 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s) while (!list_empty(&s->s_cap_dirty)) { ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info, i_dirty_item); - inode = &ci->vfs_inode; + inode = &ci->netfs.inode; ihold(inode); dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode)); spin_unlock(&mdsc->cap_dirty_lock); + ceph_wait_on_async_create(inode); ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL); iput(inode); spin_lock(&mdsc->cap_dirty_lock); @@ -4348,9 +4416,9 @@ void __ceph_touch_fmode(struct ceph_inode_info *ci, void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) { - struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb); int bits = (fmode << 1) | 1; - bool is_opened = false; + bool already_opened = false; int i; if (count == 1) @@ -4358,19 +4426,19 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) spin_lock(&ci->i_ceph_lock); for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { - if (bits & (1 << i)) - ci->i_nr_by_mode[i] += count; - /* - * If any of the mode ref is larger than 1, + * If any of the mode ref is larger than 0, * that means it has been already opened by * others. Just skip checking the PIN ref. 
*/ - if (i && ci->i_nr_by_mode[i] > 1) - is_opened = true; + if (i && ci->i_nr_by_mode[i]) + already_opened = true; + + if (bits & (1 << i)) + ci->i_nr_by_mode[i] += count; } - if (!is_opened) + if (!already_opened) percpu_counter_inc(&mdsc->metric.opened_inodes); spin_unlock(&ci->i_ceph_lock); } @@ -4382,7 +4450,7 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) */ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count) { - struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb); int bits = (fmode << 1) | 1; bool is_closed = true; int i; @@ -4597,7 +4665,7 @@ int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invali lockdep_assert_held(&ci->i_ceph_lock); dout("removing cap %p, ci is %p, inode is %p\n", - cap, ci, &ci->vfs_inode); + cap, ci, &ci->netfs.inode); is_auth = (cap == ci->i_auth_cap); __ceph_remove_cap(cap, false); diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 3cf7c9c1085b..bec3c4549c07 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -175,7 +175,7 @@ static int metrics_latency_show(struct seq_file *s, void *p) struct ceph_fs_client *fsc = s->private; struct ceph_client_metric *cm = &fsc->mdsc->metric; struct ceph_metric *m; - s64 total, sum, avg, min, max, sq; + s64 total, avg, min, max, sq; int i; seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n"); @@ -185,8 +185,7 @@ static int metrics_latency_show(struct seq_file *s, void *p) m = &cm->metric[i]; spin_lock(&m->lock); total = m->total; - sum = m->latency_sum; - avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; + avg = m->latency_avg; min = m->latency_min; max = m->latency_max; sq = m->latency_sq_sum; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 133dbd9338e7..e7e2ebac330d 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -145,7 +145,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx, return ERR_PTR(-EAGAIN); } /* reading/filling the cache are serialized by - i_mutex, no need to use page lock */ + i_rwsem, no need to use page lock */ unlock_page(cache_ctl->page); cache_ctl->dentries = kmap(cache_ctl->page); } @@ -155,7 +155,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx, rcu_read_lock(); spin_lock(&parent->d_lock); /* check i_size again here, because empty directory can be - * marked as complete while not holding the i_mutex. */ + * marked as complete while not holding the i_rwsem. */ if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir)) dentry = cache_ctl->dentries[cache_ctl->index]; else @@ -478,8 +478,11 @@ more: 2 : (fpos_off(rde->offset) + 1); err = note_last_dentry(dfi, rde->name, rde->name_len, next_offset); - if (err) + if (err) { + ceph_mdsc_put_request(dfi->last_readdir); + dfi->last_readdir = NULL; return err; + } } else if (req->r_reply_info.dir_end) { dfi->next_offset = 2; /* keep last name */ @@ -520,6 +523,12 @@ more: if (!dir_emit(ctx, rde->name, rde->name_len, ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)), le32_to_cpu(rde->inode.in->mode) >> 12)) { + /* + * NOTE: There is no need to put 'dfi->last_readdir' here, + * because when dir_emit stops us it most likely doesn't + * have enough memory, etc. So the next readdir will + * continue from it.
+ */ dout("filldir stopping us...\n"); return 0; } @@ -671,7 +680,7 @@ struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req, struct dentry *dentry) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); - struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */ + struct inode *parent = d_inode(dentry->d_parent); /* we hold i_rwsem */ /* .snap dir? */ if (ceph_snap(parent) == CEPH_NOSNAP && @@ -847,6 +856,10 @@ static int ceph_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (ceph_snap(dir) != CEPH_NOSNAP) return -EROFS; + err = ceph_wait_on_conflict_unlink(dentry); + if (err) + return err; + if (ceph_quota_is_max_files_exceeded(dir)) { err = -EDQUOT; goto out; @@ -909,6 +922,10 @@ static int ceph_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (ceph_snap(dir) != CEPH_NOSNAP) return -EROFS; + err = ceph_wait_on_conflict_unlink(dentry); + if (err) + return err; + if (ceph_quota_is_max_files_exceeded(dir)) { err = -EDQUOT; goto out; @@ -959,9 +976,13 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_mds_request *req; struct ceph_acl_sec_ctx as_ctx = {}; - int err = -EROFS; + int err; int op; + err = ceph_wait_on_conflict_unlink(dentry); + if (err) + return err; + if (ceph_snap(dir) == CEPH_SNAPDIR) { /* mkdir .snap/foo is a MKSNAP */ op = CEPH_MDS_OP_MKSNAP; @@ -971,6 +992,7 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir, dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode); op = CEPH_MDS_OP_MKDIR; } else { + err = -EROFS; goto out; } @@ -1028,6 +1050,10 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, struct ceph_mds_request *req; int err; + err = ceph_wait_on_conflict_unlink(dentry); + if (err) + return err; + if (ceph_snap(dir) != CEPH_NOSNAP) return -EROFS; @@ -1062,9 +1088,27 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { + struct dentry *dentry = req->r_dentry; + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + struct ceph_dentry_info *di = ceph_dentry(dentry); int result = req->r_err ? 
req->r_err : le32_to_cpu(req->r_reply_info.head->result); + if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags)) + pr_warn("%s dentry %p:%pd async unlink bit is not set\n", + __func__, dentry, dentry); + + spin_lock(&fsc->async_unlink_conflict_lock); + hash_del_rcu(&di->hnode); + spin_unlock(&fsc->async_unlink_conflict_lock); + + spin_lock(&dentry->d_lock); + di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK; + wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT); + spin_unlock(&dentry->d_lock); + + synchronize_rcu(); + if (result == -EJUKEBOX) goto out; @@ -1072,7 +1116,7 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, if (result) { int pathlen = 0; u64 base = 0; - char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, + char *path = ceph_mdsc_build_path(dentry, &pathlen, &base, 0); /* mark error on parent + clear complete */ @@ -1080,13 +1124,13 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, ceph_dir_clear_complete(req->r_parent); /* drop the dentry -- we don't know its status */ - if (!d_unhashed(req->r_dentry)) - d_drop(req->r_dentry); + if (!d_unhashed(dentry)) + d_drop(dentry); /* mark inode itself for an error (since metadata is bogus) */ mapping_set_error(req->r_old_inode->i_mapping, result); - pr_warn("ceph: async unlink failure path=(%llx)%s result=%d!\n", + pr_warn("async unlink failure path=(%llx)%s result=%d!\n", base, IS_ERR(path) ? "<<bad>>" : path, result); ceph_mdsc_free_path(path, pathlen); } @@ -1171,6 +1215,8 @@ retry: if (try_async && op == CEPH_MDS_OP_UNLINK && (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) { + struct ceph_dentry_info *di = ceph_dentry(dentry); + dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir), dentry->d_name.len, dentry->d_name.name, ceph_cap_string(req->r_dir_caps)); @@ -1178,6 +1224,16 @@ retry: req->r_callback = ceph_async_unlink_cb; req->r_old_inode = d_inode(dentry); ihold(req->r_old_inode); + + spin_lock(&dentry->d_lock); + di->flags |= CEPH_DENTRY_ASYNC_UNLINK; + spin_unlock(&dentry->d_lock); + + spin_lock(&fsc->async_unlink_conflict_lock); + hash_add_rcu(fsc->async_unlink_conflict, &di->hnode, + dentry->d_name.hash); + spin_unlock(&fsc->async_unlink_conflict_lock); + err = ceph_mdsc_submit_request(mdsc, dir, req); if (!err) { /* @@ -1186,10 +1242,20 @@ retry: */ drop_nlink(inode); d_delete(dentry); - } else if (err == -EJUKEBOX) { - try_async = false; - ceph_mdsc_put_request(req); - goto retry; + } else { + spin_lock(&fsc->async_unlink_conflict_lock); + hash_del_rcu(&di->hnode); + spin_unlock(&fsc->async_unlink_conflict_lock); + + spin_lock(&dentry->d_lock); + di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK; + spin_unlock(&dentry->d_lock); + + if (err == -EJUKEBOX) { + try_async = false; + ceph_mdsc_put_request(req); + goto retry; + } } } else { set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); @@ -1228,6 +1294,10 @@ static int ceph_rename(struct user_namespace *mnt_userns, struct inode *old_dir, (!ceph_quota_is_same_realm(old_dir, new_dir))) return -EXDEV; + err = ceph_wait_on_conflict_unlink(new_dentry); + if (err) + return err; + dout("rename dir %p dentry %p to dir %p dentry %p\n", old_dir, old_dentry, new_dir, new_dentry); req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); diff --git a/fs/ceph/export.c b/fs/ceph/export.c index e0fa66ac8b9f..f780e4e0d062 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -181,6 +181,7 @@ struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino) static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) { struct inode *inode = 
__lookup_inode(sb, ino); + struct ceph_inode_info *ci = ceph_inode(inode); int err; if (IS_ERR(inode)) @@ -192,7 +193,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) return ERR_PTR(err); } /* -ESTALE if inode as been unlinked and no file is open */ - if ((inode->i_nlink == 0) && (atomic_read(&inode->i_count) == 1)) { + if ((inode->i_nlink == 0) && !__ceph_is_file_opened(ci)) { iput(inode); return ERR_PTR(-ESTALE); } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 02a0a0fd9ccd..04fd34557de8 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -95,12 +95,11 @@ static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize, size_t start; int idx = 0; - bytes = iov_iter_get_pages(iter, pages, maxsize - size, + bytes = iov_iter_get_pages2(iter, pages, maxsize - size, ITER_GET_BVECS_PAGES, &start); if (bytes < 0) return size ?: bytes; - iov_iter_advance(iter, bytes); size += bytes; for ( ; bytes; idx++, bvec_idx++) { @@ -204,7 +203,10 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, int fmode, bool isdir) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mount_options *opt = + ceph_inode_to_client(&ci->netfs.inode)->mount_options; struct ceph_file_info *fi; + int ret; dout("%s %p %p 0%o (%s)\n", __func__, inode, file, inode->i_mode, isdir ? "dir" : "regular"); @@ -225,6 +227,9 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, if (!fi) return -ENOMEM; + if (opt->flags & CEPH_MOUNT_OPT_NOPAGECACHE) + fi->flags |= CEPH_F_SYNC; + file->private_data = fi; } @@ -235,7 +240,21 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, INIT_LIST_HEAD(&fi->rw_contexts); fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen); + if ((file->f_mode & FMODE_WRITE) && ceph_has_inline_data(ci)) { + ret = ceph_uninline_data(file); + if (ret < 0) + goto error; + } + return 0; + +error: + ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE); + ceph_put_fmode(ci, fi->fmode, 1); + kmem_cache_free(ceph_file_cachep, fi); + /* wake up anyone waiting for caps on this inode */ + wake_up_all(&ci->i_cap_wq); + return ret; } /* @@ -248,8 +267,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) switch (inode->i_mode & S_IFMT) { case S_IFREG: - ceph_fscache_register_inode_cookie(inode); - ceph_fscache_file_set_cookie(inode, file); + ceph_fscache_use_cookie(inode, file->f_mode & FMODE_WRITE); fallthrough; case S_IFDIR: ret = ceph_init_file_info(inode, file, fmode, @@ -512,52 +530,67 @@ static void restore_deleg_ino(struct inode *dir, u64 ino) } } +static void wake_async_create_waiters(struct inode *inode, + struct ceph_mds_session *session) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { + ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; + wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); + } + ceph_kick_flushing_inode_caps(session, ci); + spin_unlock(&ci->i_ceph_lock); +} + static void ceph_async_create_cb(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { + struct dentry *dentry = req->r_dentry; + struct inode *dinode = d_inode(dentry); + struct inode *tinode = req->r_target_inode; int result = req->r_err ? 
req->r_err : le32_to_cpu(req->r_reply_info.head->result); + WARN_ON_ONCE(dinode && tinode && dinode != tinode); + + /* MDS changed -- caller must resubmit */ if (result == -EJUKEBOX) goto out; mapping_set_error(req->r_parent->i_mapping, result); if (result) { - struct dentry *dentry = req->r_dentry; - struct inode *inode = d_inode(dentry); int pathlen = 0; u64 base = 0; char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, &base, 0); + pr_warn("async create failure path=(%llx)%s result=%d!\n", + base, IS_ERR(path) ? "<<bad>>" : path, result); + ceph_mdsc_free_path(path, pathlen); + ceph_dir_clear_complete(req->r_parent); if (!d_unhashed(dentry)) d_drop(dentry); - ceph_inode_shutdown(inode); - - pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n", - base, IS_ERR(path) ? "<<bad>>" : path, result); - ceph_mdsc_free_path(path, pathlen); + if (dinode) { + mapping_set_error(dinode->i_mapping, result); + ceph_inode_shutdown(dinode); + wake_async_create_waiters(dinode, req->r_session); + } } - if (req->r_target_inode) { - struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); - u64 ino = ceph_vino(req->r_target_inode).ino; + if (tinode) { + u64 ino = ceph_vino(tinode).ino; if (req->r_deleg_ino != ino) pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n", __func__, req->r_err, req->r_deleg_ino, ino); - mapping_set_error(req->r_target_inode->i_mapping, result); - spin_lock(&ci->i_ceph_lock); - if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { - ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; - wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); - } - ceph_kick_flushing_inode_caps(req->r_session, ci); - spin_unlock(&ci->i_ceph_lock); + mapping_set_error(tinode->i_mapping, result); + wake_async_create_waiters(tinode, req->r_session); } else if (!result) { pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__, req->r_deleg_ino); @@ -577,8 +610,10 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, struct ceph_mds_reply_inode in = { }; struct ceph_mds_reply_info_in iinfo = { .in = &in }; struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_dentry_info *di = ceph_dentry(dentry); struct inode *inode; struct timespec64 now; + struct ceph_string *pool_ns; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_vino vino = { .ino = req->r_deleg_ino, .snap = CEPH_NOSNAP }; @@ -593,9 +628,15 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, iinfo.change_attr = 1; ceph_encode_timespec64(&iinfo.btime, &now); - iinfo.xattr_len = ARRAY_SIZE(xattr_buf); - iinfo.xattr_data = xattr_buf; - memset(iinfo.xattr_data, 0, iinfo.xattr_len); + if (req->r_pagelist) { + iinfo.xattr_len = req->r_pagelist->length; + iinfo.xattr_data = req->r_pagelist->mapped_tail; + } else { + /* fake it */ + iinfo.xattr_len = ARRAY_SIZE(xattr_buf); + iinfo.xattr_data = xattr_buf; + memset(iinfo.xattr_data, 0, iinfo.xattr_len); + } in.ino = cpu_to_le64(vino.ino); in.snapid = cpu_to_le64(CEPH_NOSNAP); @@ -605,17 +646,31 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino); in.cap.flags = CEPH_CAP_FLAG_AUTH; in.ctime = in.mtime = in.atime = iinfo.btime; - in.mode = cpu_to_le32((u32)mode); in.truncate_seq = cpu_to_le32(1); in.truncate_size = cpu_to_le64(-1ULL); in.xattr_version = cpu_to_le64(1); in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid())); - in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_mode & S_ISGID ? 
- dir->i_gid : current_fsgid())); + if (dir->i_mode & S_ISGID) { + in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_gid)); + + /* Directories always inherit the setgid bit. */ + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else { + in.gid = cpu_to_le32(from_kgid(&init_user_ns, current_fsgid())); + } + in.mode = cpu_to_le32((u32)mode); + in.nlink = cpu_to_le32(1); in.max_size = cpu_to_le64(lo->stripe_unit); ceph_file_layout_to_legacy(lo, &in.layout); + /* lo is private, so pool_ns can't change */ + pool_ns = rcu_dereference_raw(lo->pool_ns); + if (pool_ns) { + iinfo.pool_ns_len = pool_ns->len; + iinfo.pool_ns_data = pool_ns->str; + } down_read(&mdsc->snap_rwsem); ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session, @@ -654,6 +709,12 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, file->f_mode |= FMODE_CREATED; ret = finish_open(file, dentry, ceph_open); } + + spin_lock(&dentry->d_lock); + di->flags &= ~CEPH_DENTRY_ASYNC_CREATE; + wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT); + spin_unlock(&dentry->d_lock); + return ret; } @@ -680,6 +741,15 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, if (dentry->d_name.len > NAME_MAX) return -ENAMETOOLONG; + err = ceph_wait_on_conflict_unlink(dentry); + if (err) + return err; + /* + * Do not truncate the file, since atomic_open is called before the + * permission check. The caller will do the truncation afterward. + */ + flags &= ~O_TRUNC; + if (flags & O_CREAT) { if (ceph_quota_is_max_files_exceeded(dir)) return -EDQUOT; @@ -689,6 +759,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, err = ceph_security_init_secctx(dentry, mode, &as_ctx); if (err < 0) goto out_ctx; + /* Async create can't handle more than a page of xattrs */ + if (as_ctx.pagelist && + !list_is_singular(&as_ctx.pagelist->head)) + try_async = false; } else if (!d_in_lookup(dentry)) { /* If it's not being looked up, it's negative */ return -ENOENT; @@ -722,9 +796,16 @@ retry: (req->r_dir_caps = try_prep_async_create(dir, dentry, &lo, &req->r_deleg_ino))) { + struct ceph_dentry_info *di = ceph_dentry(dentry); + set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL); req->r_callback = ceph_async_create_cb; + + spin_lock(&dentry->d_lock); + di->flags |= CEPH_DENTRY_ASYNC_CREATE; + spin_unlock(&dentry->d_lock); + err = ceph_mdsc_submit_request(mdsc, dir, req); if (!err) { err = ceph_finish_async_create(dir, dentry, @@ -734,16 +815,16 @@ retry: restore_deleg_ino(dir, req->r_deleg_ino); ceph_mdsc_put_request(req); try_async = false; + ceph_put_string(rcu_dereference_raw(lo.pool_ns)); goto retry; } + ceph_put_string(rcu_dereference_raw(lo.pool_ns)); goto out_req; } } set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); - err = ceph_mdsc_do_request(mdsc, - (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, - req); + err = ceph_mdsc_do_request(mdsc, (flags & O_CREAT) ? 
dir : NULL, req); if (err == -ENOENT) { dentry = ceph_handle_snapdir(req, dentry); if (IS_ERR(dentry)) { @@ -810,6 +891,7 @@ int ceph_release(struct inode *inode, struct file *file) dout("release inode %p regular file %p\n", inode, file); WARN_ON(!list_empty(&fi->rw_contexts)); + ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE); ceph_put_fmode(ci, fi->fmode, 1); kmem_cache_free(ceph_file_cachep, fi); @@ -847,7 +929,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, ssize_t ret; u64 off = iocb->ki_pos; u64 len = iov_iter_count(to); - u64 i_size; + u64 i_size = i_size_read(inode); dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); @@ -898,9 +980,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off, false, false); - ret = ceph_osdc_start_request(osdc, req, false); - if (!ret) - ret = ceph_osdc_wait_request(osdc, req); + ceph_osdc_start_request(osdc, req); + ret = ceph_osdc_wait_request(osdc, req); ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency, @@ -1015,7 +1096,6 @@ static void ceph_aio_complete(struct inode *inode, } spin_lock(&ci->i_ceph_lock); - ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &aio_req->prealloc_cf); spin_unlock(&ci->i_ceph_lock); @@ -1164,7 +1244,7 @@ static void ceph_aio_retry_work(struct work_struct *work) req->r_inode = inode; req->r_priv = aio_req; - ret = ceph_osdc_start_request(req->r_osdc, req, false); + ceph_osdc_start_request(req->r_osdc, req); out: if (ret < 0) { req->r_result = ret; @@ -1196,7 +1276,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, size_t count = iov_iter_count(iter); loff_t pos = iocb->ki_pos; bool write = iov_iter_rw(iter) == WRITE; - bool should_dirty = !write && iter_is_iovec(iter); + bool should_dirty = !write && user_backed_iter(iter); if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; @@ -1206,7 +1286,11 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, snapc, snapc ? 
snapc->seq : 0); if (write) { - int ret2 = invalidate_inode_pages2_range(inode->i_mapping, + int ret2; + + ceph_fscache_invalidate(inode, true); + + ret2 = invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_SHIFT, (pos + count - 1) >> PAGE_SHIFT); if (ret2 < 0) @@ -1297,9 +1381,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, continue; } - ret = ceph_osdc_start_request(req->r_osdc, req, false); - if (!ret) - ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + ceph_osdc_start_request(req->r_osdc, req); + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); if (write) ceph_update_write_metrics(metric, req->r_start_latency, @@ -1362,8 +1445,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, r_private_item); list_del_init(&req->r_private_item); if (ret >= 0) - ret = ceph_osdc_start_request(req->r_osdc, - req, false); + ceph_osdc_start_request(req->r_osdc, req); if (ret < 0) { req->r_result = ret; ceph_aio_complete_req(req); @@ -1417,6 +1499,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, if (ret < 0) return ret; + ceph_fscache_invalidate(inode, false); ret = invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_SHIFT, (pos + count - 1) >> PAGE_SHIFT); @@ -1475,9 +1558,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, false, true); req->r_mtime = mtime; - ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); - if (!ret) - ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + ceph_osdc_start_request(&fsc->client->osdc, req); + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, ret); @@ -1524,7 +1606,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) struct ceph_inode_info *ci = ceph_inode(inode); bool direct_lock = iocb->ki_flags & IOCB_DIRECT; ssize_t ret; - int want, got = 0; + int want = 0, got = 0; int retry_op = 0, read = 0; again: @@ -1539,13 +1621,14 @@ again: else ceph_start_io_read(inode); + if (!(fi->flags & CEPH_F_SYNC) && !direct_lock) + want |= CEPH_CAP_FILE_CACHE; if (fi->fmode & CEPH_FILE_MODE_LAZY) - want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; - else - want = CEPH_CAP_FILE_CACHE; + want |= CEPH_CAP_FILE_LAZYIO; + ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got); if (ret < 0) { - if (iocb->ki_flags & IOCB_DIRECT) + if (direct_lock) ceph_end_io_direct(inode); else ceph_end_io_read(inode); @@ -1560,7 +1643,7 @@ again: inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, ceph_cap_string(got)); - if (ci->i_inline_version == CEPH_INLINE_NONE) { + if (!ceph_has_inline_data(ci)) { if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { ret = ceph_direct_read_write(iocb, to, NULL, NULL); @@ -1679,7 +1762,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_cap_flush *prealloc_cf; ssize_t count, written = 0; - int err, want, got; + int err, want = 0, got; bool direct_lock = false; u32 map_flags; u64 pool_flags; @@ -1746,18 +1829,12 @@ retry_snap: if (err) goto out; - if (ci->i_inline_version != CEPH_INLINE_NONE) { - err = ceph_uninline_data(file, NULL); - if (err < 0) - goto out; - } - dout("aio_write %p %llx.%llx %llu~%zd getting caps. 
i_size %llu\n", inode, ceph_vinop(inode), pos, count, i_size_read(inode)); + if (!(fi->flags & CEPH_F_SYNC) && !direct_lock) + want |= CEPH_CAP_FILE_BUFFER; if (fi->fmode & CEPH_FILE_MODE_LAZY) - want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; - else - want = CEPH_CAP_FILE_BUFFER; + want |= CEPH_CAP_FILE_LAZYIO; got = 0; err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got); if (err < 0) @@ -1813,7 +1890,7 @@ retry_snap: * are pending vmtruncate. So write and vmtruncate * can not run at the same time */ - written = generic_perform_write(file, from, pos); + written = generic_perform_write(iocb, from); if (likely(written >= 0)) iocb->ki_pos = pos + written; ceph_end_io_write(inode); @@ -1823,14 +1900,13 @@ retry_snap: int dirty; spin_lock(&ci->i_ceph_lock); - ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos)) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL); } dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", @@ -1870,57 +1946,15 @@ out_unlocked: */ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) { - struct inode *inode = file->f_mapping->host; - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - loff_t i_size; - loff_t ret; - - inode_lock(inode); - if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { + struct inode *inode = file_inode(file); + int ret; + ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); if (ret < 0) - goto out; - } - - i_size = i_size_read(inode); - switch (whence) { - case SEEK_END: - offset += i_size; - break; - case SEEK_CUR: - /* - * Here we special-case the lseek(fd, 0, SEEK_CUR) - * position-querying operation. Avoid rewriting the "same" - * f_pos value back to the file because a concurrent read(), - * write() or lseek() might have altered it - */ - if (offset == 0) { - ret = file->f_pos; - goto out; - } - offset += file->f_pos; - break; - case SEEK_DATA: - if (offset < 0 || offset >= i_size) { - ret = -ENXIO; - goto out; - } - break; - case SEEK_HOLE: - if (offset < 0 || offset >= i_size) { - ret = -ENXIO; - goto out; - } - offset = i_size; - break; + return ret; } - - ret = vfs_setpos(file, offset, max(i_size, fsc->max_file_size)); - -out: - inode_unlock(inode); - return ret; + return generic_file_llseek(file, offset, whence); } static inline void ceph_zero_partial_page( @@ -1989,12 +2023,10 @@ static int ceph_zero_partial_object(struct inode *inode, } req->r_mtime = inode->i_mtime; - ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); - if (!ret) { - ret = ceph_osdc_wait_request(&fsc->client->osdc, req); - if (ret == -ENOENT) - ret = 0; - } + ceph_osdc_start_request(&fsc->client->osdc, req); + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + if (ret == -ENOENT) + ret = 0; ceph_osdc_put_request(req); out: @@ -2077,12 +2109,6 @@ static long ceph_fallocate(struct file *file, int mode, goto unlock; } - if (ci->i_inline_version != CEPH_INLINE_NONE) { - ret = ceph_uninline_data(file, NULL); - if (ret < 0) - goto unlock; - } - size = i_size_read(inode); /* Are we punching a hole beyond EOF? 
*/ @@ -2101,12 +2127,12 @@ static long ceph_fallocate(struct file *file, int mode, goto unlock; filemap_invalidate_lock(inode->i_mapping); + ceph_fscache_invalidate(inode, false); ceph_zero_pagecache_range(inode, offset, length); ret = ceph_zero_objects(inode, offset, length); if (!ret) { spin_lock(&ci->i_ceph_lock); - ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); spin_unlock(&ci->i_ceph_lock); @@ -2302,7 +2328,7 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off if (IS_ERR(req)) ret = PTR_ERR(req); else { - ceph_osdc_start_request(osdc, req, false); + ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); ceph_update_copyfrom_metrics(&fsc->mdsc->metric, req->r_start_latency, @@ -2425,6 +2451,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, goto out_caps; /* Drop dst file cached pages */ + ceph_fscache_invalidate(dst_inode, false); ret = invalidate_inode_pages2_range(dst_inode->i_mapping, dst_off >> PAGE_SHIFT, (dst_off + len) >> PAGE_SHIFT); @@ -2494,11 +2521,11 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, /* Let the MDS know about dst file size change */ if (ceph_inode_set_size(dst_inode, dst_off) || ceph_quota_is_max_bytes_approaching(dst_inode, dst_off)) - ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY, NULL); + ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH, + NULL); } /* Mark Fw dirty */ spin_lock(&dst_ci->i_ceph_lock); - dst_ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf); spin_unlock(&dst_ci->i_ceph_lock); if (dirty) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index e3322fcb2e8d..4af5e55abc15 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -87,13 +87,13 @@ struct inode *ceph_get_snapdir(struct inode *parent) if (!S_ISDIR(parent->i_mode)) { pr_warn_once("bad snapdir parent type (mode=0%o)\n", parent->i_mode); - return ERR_PTR(-ENOTDIR); + goto err; } if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) { pr_warn_once("bad snapdir inode type (mode=0%o)\n", inode->i_mode); - return ERR_PTR(-ENOTDIR); + goto err; } inode->i_mode = parent->i_mode; @@ -113,6 +113,12 @@ struct inode *ceph_get_snapdir(struct inode *parent) } return inode; +err: + if ((inode->i_state & I_NEW)) + discard_new_inode(inode); + else + iput(inode); + return ERR_PTR(-ENOTDIR); } const struct inode_operations ceph_file_iops = { @@ -170,7 +176,7 @@ static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci, rb_insert_color(&frag->node, &ci->i_fragtree); dout("get_or_create_frag added %llx.%llx frag %x\n", - ceph_vinop(&ci->vfs_inode), f); + ceph_vinop(&ci->netfs.inode), f); return frag; } @@ -356,7 +362,7 @@ static int ceph_fill_fragtree(struct inode *inode, if (nsplits != ci->i_fragtree_nsplits) { update = true; } else if (nsplits) { - i = prandom_u32() % nsplits; + i = prandom_u32_max(nsplits); id = le32_to_cpu(fragtree->splits[i].frag); if (!__ceph_find_frag(ci, id)) update = true; @@ -447,11 +453,14 @@ struct inode *ceph_alloc_inode(struct super_block *sb) struct ceph_inode_info *ci; int i; - ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS); + ci = alloc_inode_sb(sb, ceph_inode_cachep, GFP_NOFS); if (!ci) return NULL; - dout("alloc_inode %p\n", &ci->vfs_inode); + dout("alloc_inode %p\n", &ci->netfs.inode); + + /* Set parameters for the netfs library */ + netfs_inode_init(&ci->netfs, &ceph_netfs_ops); 
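/*
 * Aside (an illustrative sketch, not part of this patch): the many
 * vfs_inode -> netfs.inode renames in this series work because struct
 * inode now sits inside the netfs_inode embedded first in
 * ceph_inode_info, so container_of() still recovers the ceph inode.
 * A simplified userspace model of that layout, with stand-in types
 * rather than the real kernel definitions:
 */
#include <stddef.h>
#include <stdio.h>

struct inode { unsigned long i_ino; };
struct netfs_inode { struct inode inode; const void *ops; };
struct ceph_inode_info {
	struct netfs_inode netfs; /* the VFS inode now lives in here */
	int i_wrbuffer_ref;
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static struct ceph_inode_info *ceph_inode(struct inode *inode)
{
	return container_of(inode, struct ceph_inode_info, netfs.inode);
}

int main(void)
{
	struct ceph_inode_info ci = { .netfs.inode.i_ino = 42 };
	struct inode *inode = &ci.netfs.inode; /* was &ci->vfs_inode */

	printf("ino=%lu same=%d\n", inode->i_ino, ceph_inode(inode) == &ci);
	return 0;
}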
spin_lock_init(&ci->i_ceph_lock); @@ -538,10 +547,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) INIT_WORK(&ci->i_work, ceph_inode_work); ci->i_work_mask = 0; memset(&ci->i_btime, '\0', sizeof(ci->i_btime)); - - ceph_fscache_inode_init(ci); - - return &ci->vfs_inode; + return &ci->netfs.inode; } void ceph_free_inode(struct inode *inode) @@ -564,13 +570,15 @@ void ceph_evict_inode(struct inode *inode) percpu_counter_dec(&mdsc->metric.total_inodes); truncate_inode_pages_final(&inode->i_data); + if (inode->i_state & I_PINNING_FSCACHE_WB) + ceph_fscache_unuse_cookie(inode, true); clear_inode(inode); ceph_fscache_unregister_inode_cookie(ci); __ceph_remove_caps(ci); - if (__ceph_has_any_quota(ci)) + if (__ceph_has_quota(ci, QUOTA_GET_ANY)) ceph_adjust_quota_realms_count(inode, false); /* @@ -634,6 +642,12 @@ int ceph_fill_file_size(struct inode *inode, int issued, } i_size_write(inode, size); inode->i_blocks = calc_inode_blocks(size); + /* + * If we're expanding, then we should be able to just update + * the existing cookie. + */ + if (size > isize) + ceph_fscache_update(inode); ci->i_reported_size = size; if (truncate_seq != ci->i_truncate_seq) { dout("truncate_seq %u -> %u\n", @@ -666,10 +680,6 @@ int ceph_fill_file_size(struct inode *inode, int issued, truncate_size); ci->i_truncate_size = truncate_size; } - - if (queue_trunc) - ceph_fscache_invalidate(inode); - return queue_trunc; } @@ -1039,7 +1049,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, iinfo->inline_version >= ci->i_inline_version) { int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; ci->i_inline_version = iinfo->inline_version; - if (ci->i_inline_version != CEPH_INLINE_NONE && + if (ceph_has_inline_data(ci) && (locked_page || (info_caps & cache_caps))) fill_inline = true; } @@ -1053,6 +1063,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, spin_unlock(&ci->i_ceph_lock); + ceph_fscache_register_inode_cookie(inode); + if (fill_inline) ceph_fill_inline_data(inode, locked_page, iinfo->inline_data, iinfo->inline_len); @@ -1195,7 +1207,7 @@ out_unlock: /* * splice a dentry to an inode. - * caller must hold directory i_mutex for this to be safe. + * caller must hold directory i_rwsem for this to be safe. */ static int splice_dentry(struct dentry **pdn, struct inode *in) { @@ -1454,10 +1466,12 @@ retry_lookup: } else if (have_lease) { if (d_unhashed(dn)) d_add(dn, NULL); + } + + if (!d_unhashed(dn) && have_lease) update_dentry_lease(dir, dn, rinfo->dlease, session, req->r_request_started); - } goto done; } @@ -1592,7 +1606,7 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn, return idx == 0 ? 
-ENOMEM : 0; } /* reading/filling the cache are serialized by - * i_mutex, no need to use page lock */ + * i_rwsem, no need to use page lock */ unlock_page(ctl->page); ctl->dentries = kmap(ctl->page); if (idx == 0) @@ -1814,11 +1828,13 @@ bool ceph_inode_set_size(struct inode *inode, loff_t size) spin_lock(&ci->i_ceph_lock); dout("set_size %p %llu -> %llu\n", inode, i_size_read(inode), size); i_size_write(inode, size); + ceph_fscache_update(inode); inode->i_blocks = calc_inode_blocks(size); ret = __ceph_should_report_size(ci); spin_unlock(&ci->i_ceph_lock); + return ret; } @@ -1844,6 +1860,8 @@ static void ceph_do_invalidate_pages(struct inode *inode) u32 orig_gen; int check = 0; + ceph_fscache_invalidate(inode, false); + mutex_lock(&ci->i_truncate_mutex); if (ceph_inode_is_shutdown(inode)) { @@ -1868,7 +1886,6 @@ static void ceph_do_invalidate_pages(struct inode *inode) orig_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); - ceph_fscache_invalidate(inode); if (invalidate_inode_pages2(inode->i_mapping) < 0) { pr_err("invalidate_inode_pages2 %llx.%llx failed\n", ceph_vinop(inode)); @@ -1937,6 +1954,7 @@ retry: ci->i_truncate_pending, to); spin_unlock(&ci->i_ceph_lock); + ceph_fscache_resize(inode, to); truncate_pagecache(inode, to); spin_lock(&ci->i_ceph_lock); @@ -1960,7 +1978,7 @@ static void ceph_inode_work(struct work_struct *work) { struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, i_work); - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) { dout("writeback %p\n", inode); @@ -2174,6 +2192,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, &prealloc_cf); inode->i_ctime = attr->ia_ctime; + inode_inc_iversion_raw(inode); } release &= issued; @@ -2184,7 +2203,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) if (inode_dirty_flags) __mark_inode_dirty(inode, inode_dirty_flags); - if (mask) { req->r_inode = inode; ihold(inode); @@ -2242,6 +2260,36 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, return err; } +int ceph_try_to_choose_auth_mds(struct inode *inode, int mask) +{ + int issued = ceph_caps_issued(ceph_inode(inode)); + + /* + * If any 'x' caps are issued we can just choose the auth MDS + * instead of a random replica MDS. Because only when the + * Locker is in the LOCK_EXEC state can the loner client get + * the 'x' caps. And if we send getattr requests to any replica + * MDS it must auth pin and try to rdlock from the auth MDS, + * and then the auth MDS needs to do the Locker state transition + * to LOCK_SYNC. After that the lock state will change back. + * + * This costs a lot when doing the Locker state transition and + * usually needs to revoke caps from clients. + * + * And for the 'Xs' caps for getxattr we will also choose the + * auth MDS, because the MDS-side code is buggy: setxattr won't + * notify the replica MDSes when the values change, so a replica + * MDS will return the old values. Though we will fix this in + * the MDS code, it still makes sense for old ceph. + */ + if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL)) + || (mask & (CEPH_STAT_RSTAT | CEPH_STAT_CAP_XATTR))) + return USE_AUTH_MDS; + else + return USE_ANY_MDS; +} + /* * Verify that we have a lease on the given mask. If not, * do a getattr against an mds.
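The ceph_try_to_choose_auth_mds() helper added above boils down to a small predicate. Here is an illustrative sketch (not part of the diff) of that decision, using placeholder mask values rather than the real CEPH_CAP_*/CEPH_STAT_* bits:

#include <stdio.h>

#define CAP_ANY_SHARED  0x0555  /* placeholder for CEPH_CAP_ANY_SHARED */
#define CAP_ANY_EXCL    0x0aaa  /* placeholder for CEPH_CAP_ANY_EXCL   */
#define STAT_RSTAT      0x1000  /* placeholder for CEPH_STAT_RSTAT     */
#define STAT_CAP_XATTR  0x2000  /* placeholder for CEPH_STAT_CAP_XATTR */

enum { USE_ANY_MDS, USE_AUTH_MDS };

static int choose_mds(int mask, int issued)
{
	/* Exclusive caps issued, or rstat/xattr wanted: pick the auth MDS. */
	if (((mask & CAP_ANY_SHARED) && (issued & CAP_ANY_EXCL)) ||
	    (mask & (STAT_RSTAT | STAT_CAP_XATTR)))
		return USE_AUTH_MDS;
	return USE_ANY_MDS;
}

int main(void)
{
	printf("%d\n", choose_mds(STAT_RSTAT, 0));     /* 1: auth MDS    */
	printf("%d\n", choose_mds(CAP_ANY_SHARED, 0)); /* 0: any replica */
	return 0;
}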
@@ -2265,7 +2313,7 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page, if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1)) return 0; - mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS; + mode = ceph_try_to_choose_auth_mds(inode, mask); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode); if (IS_ERR(req)) return PTR_ERR(req); @@ -2280,7 +2328,8 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page, if (inline_version == 0) { /* the reply is supposed to contain inline data */ err = -EINVAL; - } else if (inline_version == CEPH_INLINE_NONE) { + } else if (inline_version == CEPH_INLINE_NONE || + inline_version == 1) { err = -ENODATA; } else { err = req->r_reply_info.targeti.inline_len; @@ -2291,6 +2340,58 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page, return err; } +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, + size_t size) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int mode = USE_AUTH_MDS; + int err; + char *xattr_value; + size_t xattr_value_len; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode); + if (IS_ERR(req)) { + err = -ENOMEM; + goto out; + } + + req->r_feature_needed = CEPHFS_FEATURE_OP_GETVXATTR; + req->r_path2 = kstrdup(name, GFP_NOFS); + if (!req->r_path2) { + err = -ENOMEM; + goto put; + } + + ihold(inode); + req->r_inode = inode; + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err < 0) + goto put; + + xattr_value = req->r_reply_info.xattr_info.xattr_value; + xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len; + + dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size); + + err = (int)xattr_value_len; + if (size == 0) + goto put; + + if (xattr_value_len > size) { + err = -ERANGE; + goto put; + } + + memcpy(value, xattr_value, xattr_value_len); +put: + ceph_mdsc_put_request(req); +out: + dout("do_getvxattr result=%d\n", err); + return err; +} + /* * Check inode permissions. We verify we have a valid value for @@ -2348,6 +2449,7 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); + struct super_block *sb = inode->i_sb; struct ceph_inode_info *ci = ceph_inode(inode); u32 valid_mask = STATX_BASIC_STATS; int err = 0; @@ -2356,7 +2458,7 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, return -ESTALE; /* Skip the getattr altogether if we're asked not to sync */ - if (!(flags & AT_STATX_DONT_SYNC)) { + if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) { err = ceph_do_getattr(inode, statx_to_caps(request_mask, inode->i_mode), flags & AT_STATX_FORCE_SYNC); @@ -2377,16 +2479,34 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, } if (ceph_snap(inode) == CEPH_NOSNAP) - stat->dev = inode->i_sb->s_dev; + stat->dev = sb->s_dev; else stat->dev = ci->i_snapid_map ? 
ci->i_snapid_map->dev : 0; if (S_ISDIR(inode->i_mode)) { - if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), - RBYTES)) + if (ceph_test_mount_opt(ceph_sb_to_client(sb), RBYTES)) { stat->size = ci->i_rbytes; - else + } else if (ceph_snap(inode) == CEPH_SNAPDIR) { + struct ceph_inode_info *pci; + struct ceph_snap_realm *realm; + struct inode *parent; + + parent = ceph_lookup_inode(sb, ceph_ino(inode)); + if (IS_ERR(parent)) + return PTR_ERR(parent); + + pci = ceph_inode(parent); + spin_lock(&pci->i_ceph_lock); + realm = pci->i_snap_realm; + if (realm) + stat->size = realm->num_snaps; + else + stat->size = 0; + spin_unlock(&pci->i_ceph_lock); + iput(parent); + } else { stat->size = ci->i_files + ci->i_subdirs; + } stat->blocks = 0; stat->blksize = 65536; /* diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index d1f154aec249..3e2843e86e27 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -111,10 +111,10 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, req->r_args.filelock_change.length = cpu_to_le64(length); req->r_args.filelock_change.wait = wait; - if (wait) - req->r_wait_for_completion = ceph_lock_wait_for_completion; - - err = ceph_mdsc_do_request(mdsc, inode, req); + err = ceph_mdsc_submit_request(mdsc, inode, req); + if (!err) + err = ceph_mdsc_wait_request(mdsc, req, wait ? + ceph_lock_wait_for_completion : NULL); if (!err && operation == CEPH_MDS_OP_GETFILELOCK) { fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid); if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 250aad330a10..26a0a8b9975e 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -437,7 +437,7 @@ static int ceph_parse_deleg_inos(void **p, void *end, ceph_decode_32_safe(p, end, sets, bad); dout("got %u sets of delegated inodes\n", sets); while (sets--) { - u64 start, len, ino; + u64 start, len; ceph_decode_64_safe(p, end, start, bad); ceph_decode_64_safe(p, end, len, bad); @@ -449,14 +449,14 @@ static int ceph_parse_deleg_inos(void **p, void *end, continue; } while (len--) { - int err = xa_insert(&s->s_delegated_inos, ino = start++, + int err = xa_insert(&s->s_delegated_inos, start++, DELEGATED_INO_AVAILABLE, GFP_KERNEL); + if (!err) { dout("added delegated inode 0x%llx\n", start - 1); } else if (err == -EBUSY) { - pr_warn("ceph: MDS delegated inode 0x%llx more than once.\n", + pr_warn("MDS delegated inode 0x%llx more than once.\n", start - 1); } else { return err; @@ -555,6 +555,28 @@ bad: return -EIO; } +static int parse_reply_info_getvxattr(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + u64 features) +{ + u32 value_len; + + ceph_decode_skip_8(p, end, bad); /* skip current version: 1 */ + ceph_decode_skip_8(p, end, bad); /* skip first version: 1 */ + ceph_decode_skip_32(p, end, bad); /* skip payload length */ + + ceph_decode_32_safe(p, end, value_len, bad); + + if (value_len == end - *p) { + info->xattr_info.xattr_value = *p; + info->xattr_info.xattr_value_len = value_len; + *p = end; + return value_len; + } +bad: + return -EIO; +} + /* * parse extra results */ @@ -570,6 +592,8 @@ static int parse_reply_info_extra(void **p, void *end, return parse_reply_info_readdir(p, end, info, features); else if (op == CEPH_MDS_OP_CREATE) return parse_reply_info_create(p, end, info, features, s); + else if (op == CEPH_MDS_OP_GETVXATTR) + return parse_reply_info_getvxattr(p, end, info, features); else return -EIO; } @@ -631,6 +655,79 @@ static void destroy_reply_info(struct 
ceph_mds_reply_info_parsed *info) free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size)); } +/* + * In the async unlink case the kclient won't wait for the first reply + * from the MDS and will just drop all the links, unhash the dentry and + * then succeed immediately. + * + * For any new create/link/rename, etc. requests followed by using the + * same file names we must wait for the first reply of the inflight + * unlink request, or the MDS possibly will fail the following + * requests with -EEXIST if the inflight async unlink request was + * delayed for some reason. + * + * And the worst case is that for the non-async openc request it will + * successfully open the file if the CDentry hasn't been unlinked yet, + * but later the previously delayed async unlink request will remove + * the CDentry. That means the just-created file is possibly deleted + * later by accident. + * + * We need to wait for the inflight async unlink requests to finish + * when creating new files/directories by using the same file names. + */ +int ceph_wait_on_conflict_unlink(struct dentry *dentry) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + struct dentry *pdentry = dentry->d_parent; + struct dentry *udentry, *found = NULL; + struct ceph_dentry_info *di; + struct qstr dname; + u32 hash = dentry->d_name.hash; + int err; + + dname.name = dentry->d_name.name; + dname.len = dentry->d_name.len; + + rcu_read_lock(); + hash_for_each_possible_rcu(fsc->async_unlink_conflict, di, + hnode, hash) { + udentry = di->dentry; + + spin_lock(&udentry->d_lock); + if (udentry->d_name.hash != hash) + goto next; + if (unlikely(udentry->d_parent != pdentry)) + goto next; + if (!hash_hashed(&di->hnode)) + goto next; + + if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags)) + pr_warn("%s dentry %p:%pd async unlink bit is not set\n", + __func__, dentry, dentry); + + if (!d_same_name(udentry, pdentry, &dname)) + goto next; + + spin_unlock(&udentry->d_lock); + found = dget(udentry); + break; +next: + spin_unlock(&udentry->d_lock); + } + rcu_read_unlock(); + + if (likely(!found)) + return 0; + + dout("%s dentry %p:%pd conflict with old %p:%pd\n", __func__, + dentry, dentry, found, found); + + err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT, + TASK_KILLABLE); + dput(found); + return err; +} + /* * sessions @@ -1196,14 +1293,17 @@ static int encode_supported_features(void **p, void *end) if (count > 0) { size_t i; size_t size = FEATURE_BYTES(count); + unsigned long bit; if (WARN_ON_ONCE(*p + 4 + size > end)) return -ERANGE; ceph_encode_32(p, size); memset(*p, 0, size); - for (i = 0; i < count; i++) - ((unsigned char*)(*p))[i / 8] |= BIT(feature_bits[i] % 8); + for (i = 0; i < count; i++) { + bit = feature_bits[i]; + ((unsigned char *)(*p))[bit / 8] |= BIT(bit % 8); + } *p += size; } else { if (WARN_ON_ONCE(*p + 4 > end)) @@ -1540,7 +1640,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session, p = session->s_caps.next; while (p != &session->s_caps) { cap = list_entry(p, struct ceph_cap, session_caps); - inode = igrab(&cap->ci->vfs_inode); + inode = igrab(&cap->ci->netfs.inode); if (!inode) { p = p->next; continue; } @@ -1598,7 +1698,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, int iputs; dout("removing cap %p, ci is %p, inode is %p\n", - cap, ci, &ci->vfs_inode); + cap, ci, &ci->netfs.inode); spin_lock(&ci->i_ceph_lock); iputs = ceph_purge_inode_cap(inode, cap, &invalidate); spin_unlock(&ci->i_ceph_lock); @@ -2178,7 +2278,8 @@ int 
ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, order = get_order(size * num_entries); while (order >= 0) { rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL | - __GFP_NOWARN, + __GFP_NOWARN | + __GFP_ZERO, order); if (rinfo->dir_entries) break; @@ -2217,6 +2318,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) INIT_LIST_HEAD(&req->r_unsafe_dir_item); INIT_LIST_HEAD(&req->r_unsafe_target_item); req->r_fmode = -1; + req->r_feature_needed = -1; kref_init(&req->r_kref); RB_CLEAR_NODE(&req->r_node); INIT_LIST_HEAD(&req->r_wait); @@ -2626,7 +2728,28 @@ static int __prepare_send_request(struct ceph_mds_session *session, struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_mds_request_head_old *rhead; struct ceph_msg *msg; - int flags = 0; + int flags = 0, max_retry; + + /* + * The type of 'r_attempts' in kernel 'ceph_mds_request' + * is 'int', while in 'ceph_mds_request_head' the type of + * 'num_retry' is '__u8'. So in case the request retries + * exceed 256 times, the MDS will receive an incorrect + * retry seq. + * + * In this case it's usually a bug in the MDS and continuing + * to retry the request makes no sense. + * + * In the future this could be fixed in the ceph code, so + * avoid hardcoding the limit here. + */ + max_retry = sizeof_field(struct ceph_mds_request_head, num_retry); + max_retry = 1 << (max_retry * BITS_PER_BYTE); + if (req->r_attempts >= max_retry) { + pr_warn_ratelimited("%s request tid %llu seq overflow\n", + __func__, req->r_tid); + return -EMULTIHOP; + } req->r_attempts++; if (req->r_inode) { @@ -2638,7 +2761,7 @@ static int __prepare_send_request(struct ceph_mds_session *session, else req->r_sent_on_mseq = -1; } - dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, + dout("%s %p tid %lld %s (attempt %d)\n", __func__, req, req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { @@ -2794,6 +2917,16 @@ static void __do_request(struct ceph_mds_client *mdsc, dout("do_request mds%d session %p state %s\n", mds, session, ceph_session_state_name(session->s_state)); + + /* + * The old ceph will crash the MDSs when seeing unknown OPs + */ + if (req->r_feature_needed > 0 && + !test_bit(req->r_feature_needed, &session->s_features)) { + err = -EOPNOTSUPP; + goto out_session; + } + if (session->s_state != CEPH_MDS_SESSION_OPEN && session->s_state != CEPH_MDS_SESSION_HUNG) { /* @@ -2838,6 +2971,64 @@ static void __do_request(struct ceph_mds_client *mdsc, if (req->r_request_started == 0) /* note request start time */ req->r_request_started = jiffies; + /* + * For async create we will choose the auth MDS of the frag in the + * parent directory to send the request and usually this works fine, + * but if the MDS migrated the directory to another MDS before it + * could handle it, the request will be forwarded. + * + * And then the auth cap will be changed. + */ + if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) && req->r_num_fwd) { + struct ceph_dentry_info *di = ceph_dentry(req->r_dentry); + struct ceph_inode_info *ci; + struct ceph_cap *cap; + + /* + * The request may be handled very fast and the new inode + * hasn't been linked to the dentry yet. We need to wait + * for the ceph_finish_async_create(), which shouldn't get + * stuck too long or fail in theory, to finish when forwarding + * the request. 
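An aside on the retry-limit check added to __prepare_send_request earlier in this hunk: the 256-attempt ceiling falls directly out of the width of the on-wire field. A standalone sketch of the arithmetic (userspace C with a stand-in struct; the kernel uses sizeof_field() and BITS_PER_BYTE):

    #include <stdio.h>

    struct wire_head {
            unsigned char num_retry;   /* stands in for the __u8 field */
    };

    int main(void)
    {
            /* same computation as the kernel check: field bytes * 8 */
            int bits = sizeof(((struct wire_head *)0)->num_retry) * 8;
            printf("attempts representable: %d\n", 1 << bits);  /* 256 */

            unsigned char seq = 255;
            seq++;                                   /* attempt 256 wraps... */
            printf("...and encodes as %u\n", seq);   /* 0 */
            return 0;
    }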
+ */ + if (!d_inode(req->r_dentry)) { + err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT, + TASK_KILLABLE); + if (err) { + mutex_lock(&req->r_fill_mutex); + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); + mutex_unlock(&req->r_fill_mutex); + goto out_session; + } + } + + ci = ceph_inode(d_inode(req->r_dentry)); + + spin_lock(&ci->i_ceph_lock); + cap = ci->i_auth_cap; + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE && mds != cap->mds) { + dout("do_request session changed for auth cap %d -> %d\n", + cap->session->s_mds, session->s_mds); + + /* Remove the auth cap from old session */ + spin_lock(&cap->session->s_cap_lock); + cap->session->s_nr_caps--; + list_del_init(&cap->session_caps); + spin_unlock(&cap->session->s_cap_lock); + + /* Add the auth cap to the new session */ + cap->mds = mds; + cap->session = session; + spin_lock(&session->s_cap_lock); + session->s_nr_caps++; + list_add_tail(&cap->session_caps, &session->s_caps); + spin_unlock(&session->s_cap_lock); + + change_auth_cap_ses(ci, session); + } + spin_unlock(&ci->i_ceph_lock); + } + err = __send_request(session, req, false); out_session: @@ -2946,15 +3137,16 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, return err; } -static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, - struct ceph_mds_request *req) +int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + ceph_mds_request_wait_callback_t wait_func) { int err; /* wait */ dout("do_request waiting\n"); - if (!req->r_timeout && req->r_wait_for_completion) { - err = req->r_wait_for_completion(mdsc, req); + if (wait_func) { + err = wait_func(mdsc, req); } else { long timeleft = wait_for_completion_killable_timeout( &req->r_completion, @@ -3011,7 +3203,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, /* issue */ err = ceph_mdsc_submit_request(mdsc, dir, req); if (!err) - err = ceph_mdsc_wait_request(mdsc, req); + err = ceph_mdsc_wait_request(mdsc, req, NULL); dout("do_request %p done, result %d\n", req, err); return err; } @@ -3097,35 +3289,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) result = le32_to_cpu(head->result); - /* - * Handle an ESTALE - * if we're not talking to the authority, send to them - * if the authority has changed while we weren't looking, - * send to new authority - * Otherwise we just have to return an ESTALE - */ - if (result == -ESTALE) { - dout("got ESTALE on request %llu\n", req->r_tid); - req->r_resend_mds = -1; - if (req->r_direct_mode != USE_AUTH_MDS) { - dout("not using auth, setting for that now\n"); - req->r_direct_mode = USE_AUTH_MDS; - __do_request(mdsc, req); - mutex_unlock(&mdsc->mutex); - goto out; - } else { - int mds = __choose_mds(mdsc, req, NULL); - if (mds >= 0 && mds != req->r_session->s_mds) { - dout("but auth changed, so resending\n"); - __do_request(mdsc, req); - mutex_unlock(&mdsc->mutex); - goto out; - } - } - dout("have to return ESTALE on request %llu\n", req->r_tid); - } - - if (head->safe) { set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags); __unregister_request(mdsc, req); @@ -3268,6 +3431,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, int err = -EINVAL; void *p = msg->front.iov_base; void *end = p + msg->front.iov_len; + bool aborted = false; ceph_decode_need(&p, end, 2*sizeof(u32), bad); next_mds = ceph_decode_32(&p); @@ -3276,16 +3440,41 @@ static void handle_forward(struct ceph_mds_client *mdsc, mutex_lock(&mdsc->mutex); req = lookup_get_request(mdsc, tid); if (!req) { + 
mutex_unlock(&mdsc->mutex); dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); - goto out; /* dup reply? */ + return; /* dup reply? */ } if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { dout("forward tid %llu aborted, unregistering\n", tid); __unregister_request(mdsc, req); } else if (fwd_seq <= req->r_num_fwd) { - dout("forward tid %llu to mds%d - old seq %d <= %d\n", - tid, next_mds, req->r_num_fwd, fwd_seq); + /* + * The type of 'num_fwd' in ceph 'MClientRequestForward' + * is 'int32_t', while in 'ceph_mds_request_head' the + * type is '__u8'. So in case the request bounces between + * MDSes more than 256 times, the client will get stuck. + * + * In this case it's usually a bug in the MDS and continuing + * to bounce the request makes no sense. + * + * In the future this could be fixed in the ceph code, so + * avoid hardcoding the limit here. + */ + int max = sizeof_field(struct ceph_mds_request_head, num_fwd); + max = 1 << (max * BITS_PER_BYTE); + if (req->r_num_fwd >= max) { + mutex_lock(&req->r_fill_mutex); + req->r_err = -EMULTIHOP; + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); + mutex_unlock(&req->r_fill_mutex); + aborted = true; + pr_warn_ratelimited("forward tid %llu seq overflow\n", + tid); + } else { + dout("forward tid %llu to mds%d - old seq %d <= %d\n", + tid, next_mds, req->r_num_fwd, fwd_seq); + } } else { /* resend. forward race not possible; mds would drop */ dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); @@ -3297,9 +3486,12 @@ static void handle_forward(struct ceph_mds_client *mdsc, put_request_session(req); __do_request(mdsc, req); } - ceph_mdsc_put_request(req); -out: mutex_unlock(&mdsc->mutex); + + /* kick calling process */ + if (aborted) + complete_request(mdsc, req); + ceph_mdsc_put_request(req); return; bad: @@ -3378,13 +3570,17 @@ static void handle_session(struct ceph_mds_session *session, } if (msg_version >= 5) { - u32 flags; - /* version >= 4, struct_v, struct_cv, len, metric_spec */ - ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 2, bad); + u32 flags, len; + + /* version >= 4 */ + ceph_decode_skip_16(&p, end, bad); /* struct_v, struct_cv */ + ceph_decode_32_safe(&p, end, len, bad); /* len */ + ceph_decode_skip_n(&p, end, len, bad); /* metric_spec */ + + /* version >= 5, flags */ - ceph_decode_32_safe(&p, end, flags, bad); + ceph_decode_32_safe(&p, end, flags, bad); if (flags & CEPH_SESSION_BLOCKLISTED) { - pr_warn("mds%d session blocklisted\n", session->s_mds); + pr_warn("mds%d session blocklisted\n", session->s_mds); blocklisted = true; } } @@ -3413,11 +3609,26 @@ static void handle_session(struct ceph_mds_session *session, case CEPH_SESSION_OPEN: if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) pr_info("mds%d reconnect success\n", session->s_mds); - session->s_state = CEPH_MDS_SESSION_OPEN; - session->s_features = features; - renewed_caps(mdsc, session, 0); - if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &session->s_features)) - metric_schedule_delayed(&mdsc->metric); + + if (session->s_state == CEPH_MDS_SESSION_OPEN) { + pr_notice("mds%d is already opened\n", session->s_mds); + } else { + session->s_state = CEPH_MDS_SESSION_OPEN; + session->s_features = features; + renewed_caps(mdsc, session, 0); + if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, + &session->s_features)) + metric_schedule_delayed(&mdsc->metric); + } + + /* + * The connection may be broken and the session on the + * client side may have been reinitialized; we need to + * update the seq anyway. 
+ */ + if (!session->s_seq && seq) + session->s_seq = seq; + wake = 1; if (mdsc->stopping) __close_session(mdsc, session); @@ -3683,7 +3894,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_pagelist *pagelist = recon_state->pagelist; struct dentry *dentry; char *path; - int pathlen, err; + int pathlen = 0, err; u64 pathbase; u64 snap_follows; @@ -3703,7 +3914,6 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, } } else { path = NULL; - pathlen = 0; pathbase = 0; } @@ -4400,12 +4610,6 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dentry->d_name.len); spin_unlock(&dentry->d_lock); - /* - * if this is a preemptive lease RELEASE, no need to - * flush request stream, since the actual request will - * soon follow. - */ - msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE); ceph_con_send(&session->s_con, msg); } @@ -4438,8 +4642,6 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc) bool check_session_state(struct ceph_mds_session *s) { - struct ceph_fs_client *fsc = s->s_mdsc->fsc; - switch (s->s_state) { case CEPH_MDS_SESSION_OPEN: if (s->s_ttl && time_after(jiffies, s->s_ttl)) { @@ -4448,10 +4650,6 @@ bool check_session_state(struct ceph_mds_session *s) } break; case CEPH_MDS_SESSION_CLOSING: - /* Should never reach this when not force unmounting */ - WARN_ON_ONCE(s->s_ttl && - READ_ONCE(fsc->mount_state) != CEPH_MOUNT_SHUTDOWN); - fallthrough; case CEPH_MDS_SESSION_NEW: case CEPH_MDS_SESSION_RESTARTING: case CEPH_MDS_SESSION_CLOSED: @@ -4706,15 +4904,17 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) } /* - * wait for all write mds requests to flush. + * flush the mdlog and wait for all write mds requests to flush. 
*/ -static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid) +static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc, + u64 want_tid) { struct ceph_mds_request *req = NULL, *nextreq; + struct ceph_mds_session *last_session = NULL; struct rb_node *n; mutex_lock(&mdsc->mutex); - dout("wait_unsafe_requests want %lld\n", want_tid); + dout("%s want %lld\n", __func__, want_tid); restart: req = __get_oldest_req(mdsc); while (req && req->r_tid <= want_tid) { @@ -4726,14 +4926,32 @@ restart: nextreq = NULL; if (req->r_op != CEPH_MDS_OP_SETFILELOCK && (req->r_op & CEPH_MDS_OP_WRITE)) { + struct ceph_mds_session *s = req->r_session; + + if (!s) { + req = nextreq; + continue; + } + /* write op */ ceph_mdsc_get_request(req); if (nextreq) ceph_mdsc_get_request(nextreq); + s = ceph_get_mds_session(s); mutex_unlock(&mdsc->mutex); - dout("wait_unsafe_requests wait on %llu (want %llu)\n", + + /* send flush mdlog request to MDS */ + if (last_session != s) { + send_flush_mdlog(s); + ceph_put_mds_session(last_session); + last_session = s; + } else { + ceph_put_mds_session(s); + } + dout("%s wait on %llu (want %llu)\n", __func__, req->r_tid, want_tid); wait_for_completion(&req->r_safe_completion); + mutex_lock(&mdsc->mutex); ceph_mdsc_put_request(req); if (!nextreq) @@ -4748,7 +4966,8 @@ restart: req = nextreq; } mutex_unlock(&mdsc->mutex); - dout("wait_unsafe_requests done\n"); + ceph_put_mds_session(last_session); + dout("%s done\n", __func__); } void ceph_mdsc_sync(struct ceph_mds_client *mdsc) @@ -4777,7 +4996,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); - wait_unsafe_requests(mdsc, want_tid); + flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid); wait_caps_flush(mdsc, want_flush); } @@ -4842,7 +5061,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) mutex_unlock(&mdsc->mutex); ceph_cleanup_snapid_map(mdsc); - ceph_cleanup_empty_realms(mdsc); + ceph_cleanup_global_and_empty_realms(mdsc); cancel_work_sync(&mdsc->cap_reclaim_work); cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 97c7f7bfa55f..0598faa50e2e 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -29,14 +29,13 @@ enum ceph_feature_type { CEPHFS_FEATURE_MULTI_RECONNECT, CEPHFS_FEATURE_DELEG_INO, CEPHFS_FEATURE_METRIC_COLLECT, + CEPHFS_FEATURE_ALTERNATE_NAME, + CEPHFS_FEATURE_NOTIFY_SESSION_STATE, + CEPHFS_FEATURE_OP_GETVXATTR, - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT, + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR, }; -/* - * This will always have the highest feature bit value - * as the last element of the array. 
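These feature bits are exactly what encode_supported_features() (fixed a few hunks up in mds_client.c) packs into a byte array for the session-open message. A standalone sketch with made-up bit numbers shows why the byte index must come from the bit value rather than the loop counter:

    #include <stdio.h>

    #define BIT(n) (1U << (n))

    int main(void)
    {
            /* made-up feature numbers; note the gap between 3 and 17 */
            unsigned long feature_bits[] = { 0, 1, 3, 17 };
            unsigned char map[3] = { 0 };

            for (int i = 0; i < 4; i++) {
                    unsigned long bit = feature_bits[i];
                    map[bit / 8] |= BIT(bit % 8);  /* byte chosen by bit value */
            }
            /* the old loop used map[i / 8]: here it would have put bit 17
             * into byte 0 (as bit 1) instead of into byte 2 */
            for (int i = 0; i < 3; i++)
                    printf("byte %d: 0x%02x\n", i, map[i]);
            return 0;
    }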
- */ #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ 0, 1, 2, 3, 4, 5, 6, 7, \ CEPHFS_FEATURE_MIMIC, \ @@ -45,10 +44,9 @@ enum ceph_feature_type { CEPHFS_FEATURE_MULTI_RECONNECT, \ CEPHFS_FEATURE_DELEG_INO, \ CEPHFS_FEATURE_METRIC_COLLECT, \ - \ - CEPHFS_FEATURE_MAX, \ + CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \ + CEPHFS_FEATURE_OP_GETVXATTR, \ } -#define CEPHFS_FEATURES_CLIENT_REQUIRED {} /* * Some lock dependencies: @@ -100,6 +98,11 @@ struct ceph_mds_reply_dir_entry { loff_t offset; }; +struct ceph_mds_reply_xattr { + char *xattr_value; + size_t xattr_value_len; +}; + /* * parsed info about an mds reply, including information about * either: 1) the target inode and/or its parent directory and dentry, @@ -115,6 +118,7 @@ struct ceph_mds_reply_info_parsed { char *dname; u32 dname_len; struct ceph_mds_reply_lease *dlease; + struct ceph_mds_reply_xattr xattr_info; /* extra */ union { @@ -274,8 +278,8 @@ struct ceph_mds_request { union ceph_mds_request_args r_args; int r_fmode; /* file mode, if expecting cap */ - const struct cred *r_cred; int r_request_release_offset; + const struct cred *r_cred; struct timespec64 r_stamp; /* for choosing which mds to send this request to */ @@ -296,12 +300,11 @@ struct ceph_mds_request { struct ceph_msg *r_reply; struct ceph_mds_reply_info_parsed r_reply_info; int r_err; - + u32 r_readdir_offset; struct page *r_locked_page; int r_dir_caps; int r_num_caps; - u32 r_readdir_offset; unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */ unsigned long r_started; /* start time to measure timeout against */ @@ -329,13 +332,14 @@ struct ceph_mds_request { struct completion r_completion; struct completion r_safe_completion; ceph_mds_request_callback_t r_callback; - ceph_mds_request_wait_callback_t r_wait_for_completion; struct list_head r_unsafe_item; /* per-session unsafe list item */ long long r_dir_release_cnt; long long r_dir_ordered_cnt; int r_readdir_cache_idx; + int r_feature_needed; + struct ceph_cap_reservation r_caps_reservation; }; @@ -507,6 +511,9 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req); +int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + ceph_mds_request_wait_callback_t wait_func); extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req); @@ -572,9 +579,10 @@ static inline int ceph_wait_on_async_create(struct inode *inode) struct ceph_inode_info *ci = ceph_inode(inode); return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT, - TASK_INTERRUPTIBLE); + TASK_KILLABLE); } +extern int ceph_wait_on_conflict_unlink(struct dentry *dentry); extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session); extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino); #endif diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 30387733765d..3fbabc98e1f7 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -29,7 +29,7 @@ static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy) return -1; /* pick */ - n = prandom_u32() % n; + n = prandom_u32_max(n); for (j = 0, i = 0; i < m->possible_max_rank; i++) { if (CEPH_MDS_IS_READY(i, ignore_laggy)) j++; @@ -352,12 +352,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2) __decode_and_drop_type(p, end, u8, bad_ext); } if (mdsmap_ev >= 8) { - u32 name_len; /* enabled */ ceph_decode_8_safe(p, end, 
m->m_enabled, bad_ext); - ceph_decode_32_safe(p, end, name_len, bad_ext); - ceph_decode_need(p, end, name_len, bad_ext); - *p += name_len; + /* fs_name */ + ceph_decode_skip_string(p, end, bad_ext); } /* damaged */ if (mdsmap_ev >= 9) { @@ -370,6 +368,22 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2) } else { m->m_damaged = false; } + if (mdsmap_ev >= 17) { + /* balancer */ + ceph_decode_skip_string(p, end, bad_ext); + /* standby_count_wanted */ + ceph_decode_skip_32(p, end, bad_ext); + /* old_max_mds */ + ceph_decode_skip_32(p, end, bad_ext); + /* min_compat_client */ + ceph_decode_skip_8(p, end, bad_ext); + /* required_client_features */ + ceph_decode_skip_set(p, end, 64, bad_ext); + ceph_decode_64_safe(p, end, m->m_max_xattr_size, bad_ext); + } else { + /* This forces the usage of the (sync) SETXATTR Op */ + m->m_max_xattr_size = 0; + } bad_ext: dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n", !!m->m_enabled, !!m->m_damaged, m->m_num_laggy); diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index c57699d8408d..c47347d2e84e 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -8,6 +8,12 @@ #include "metric.h" #include "mds_client.h" +static void ktime_to_ceph_timespec(struct ceph_timespec *ts, ktime_t val) +{ + struct timespec64 t = ktime_to_timespec64(val); + ceph_encode_timespec64(ts, &t); +} + static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, struct ceph_mds_session *s) { @@ -26,7 +32,6 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, u64 nr_caps = atomic64_read(&m->total_caps); u32 header_len = sizeof(struct ceph_metric_header); struct ceph_msg *msg; - struct timespec64 ts; s64 sum; s32 items = 0; s32 len; @@ -59,37 +64,40 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, /* encode the read latency metric */ read = (struct ceph_metric_read_latency *)(cap + 1); read->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY); - read->header.ver = 1; + read->header.ver = 2; read->header.compat = 1; read->header.data_len = cpu_to_le32(sizeof(*read) - header_len); sum = m->metric[METRIC_READ].latency_sum; - jiffies_to_timespec64(sum, &ts); - read->sec = cpu_to_le32(ts.tv_sec); - read->nsec = cpu_to_le32(ts.tv_nsec); + ktime_to_ceph_timespec(&read->lat, sum); + ktime_to_ceph_timespec(&read->avg, m->metric[METRIC_READ].latency_avg); + read->sq_sum = cpu_to_le64(m->metric[METRIC_READ].latency_sq_sum); + read->count = cpu_to_le64(m->metric[METRIC_READ].total); items++; /* encode the write latency metric */ write = (struct ceph_metric_write_latency *)(read + 1); write->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY); - write->header.ver = 1; + write->header.ver = 2; write->header.compat = 1; write->header.data_len = cpu_to_le32(sizeof(*write) - header_len); sum = m->metric[METRIC_WRITE].latency_sum; - jiffies_to_timespec64(sum, &ts); - write->sec = cpu_to_le32(ts.tv_sec); - write->nsec = cpu_to_le32(ts.tv_nsec); + ktime_to_ceph_timespec(&write->lat, sum); + ktime_to_ceph_timespec(&write->avg, m->metric[METRIC_WRITE].latency_avg); + write->sq_sum = cpu_to_le64(m->metric[METRIC_WRITE].latency_sq_sum); + write->count = cpu_to_le64(m->metric[METRIC_WRITE].total); items++; /* encode the metadata latency metric */ meta = (struct ceph_metric_metadata_latency *)(write + 1); meta->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY); - meta->header.ver = 1; + meta->header.ver = 2; meta->header.compat = 1; meta->header.data_len = cpu_to_le32(sizeof(*meta) - header_len); sum = 
m->metric[METRIC_METADATA].latency_sum; - jiffies_to_timespec64(sum, &ts); - meta->sec = cpu_to_le32(ts.tv_sec); - meta->nsec = cpu_to_le32(ts.tv_nsec); + ktime_to_ceph_timespec(&meta->lat, sum); + ktime_to_ceph_timespec(&meta->avg, m->metric[METRIC_METADATA].latency_avg); + meta->sq_sum = cpu_to_le64(m->metric[METRIC_METADATA].latency_sq_sum); + meta->count = cpu_to_le64(m->metric[METRIC_METADATA].total); items++; /* encode the dentry lease metric */ @@ -160,8 +168,6 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, msg->hdr.version = cpu_to_le16(1); msg->hdr.compat_version = cpu_to_le16(1); msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - dout("client%llu send metrics to mds%d\n", - ceph_client_gid(mdsc->fsc->client), s->s_mds); ceph_con_send(&s->s_con, msg); return true; @@ -252,6 +258,7 @@ int ceph_metric_init(struct ceph_client_metric *m) metric->size_max = 0; metric->total = 0; metric->latency_sum = 0; + metric->latency_avg = 0; metric->latency_sq_sum = 0; metric->latency_min = KTIME_MAX; metric->latency_max = 0; @@ -309,20 +316,19 @@ void ceph_metric_destroy(struct ceph_client_metric *m) max = new; \ } -static inline void __update_stdev(ktime_t total, ktime_t lsum, - ktime_t *sq_sump, ktime_t lat) +static inline void __update_mean_and_stdev(ktime_t total, ktime_t *lavg, + ktime_t *sq_sump, ktime_t lat) { - ktime_t avg, sq; - - if (unlikely(total == 1)) - return; - - /* the sq is (lat - old_avg) * (lat - new_avg) */ - avg = DIV64_U64_ROUND_CLOSEST((lsum - lat), (total - 1)); - sq = lat - avg; - avg = DIV64_U64_ROUND_CLOSEST(lsum, total); - sq = sq * (lat - avg); - *sq_sump += sq; + ktime_t avg; + + if (unlikely(total == 1)) { + *lavg = lat; + } else { + /* the sq is (lat - old_avg) * (lat - new_avg) */ + avg = *lavg + div64_s64(lat - *lavg, total); + *sq_sump += (lat - *lavg)*(lat - avg); + *lavg = avg; + } } void ceph_update_metrics(struct ceph_metric *m, @@ -341,6 +347,7 @@ void ceph_update_metrics(struct ceph_metric *m, METRIC_UPDATE_MIN_MAX(m->size_min, m->size_max, size); m->latency_sum += lat; METRIC_UPDATE_MIN_MAX(m->latency_min, m->latency_max, lat); - __update_stdev(total, m->latency_sum, &m->latency_sq_sum, lat); + __update_mean_and_stdev(total, &m->latency_avg, &m->latency_sq_sum, + lat); spin_unlock(&m->lock); } diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index bb45608181e7..0d0c44bd3332 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -2,7 +2,7 @@ #ifndef _FS_CEPH_MDS_METRIC_H #define _FS_CEPH_MDS_METRIC_H -#include <linux/types.h> +#include <linux/ceph/types.h> #include <linux/percpu_counter.h> #include <linux/ktime.h> @@ -19,27 +19,39 @@ enum ceph_metric_type { CLIENT_METRIC_TYPE_OPENED_INODES, CLIENT_METRIC_TYPE_READ_IO_SIZES, CLIENT_METRIC_TYPE_WRITE_IO_SIZES, - - CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_WRITE_IO_SIZES, + CLIENT_METRIC_TYPE_AVG_READ_LATENCY, + CLIENT_METRIC_TYPE_STDEV_READ_LATENCY, + CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY, + CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, + CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, + CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, + + CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, }; /* * This will always have the highest metric bit value * as the last element of the array. 
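Back in metric.c, __update_mean_and_stdev() above is Welford's online algorithm. A standalone sketch of the same recurrence (plain C, doubles instead of ktime_t), including how the standard deviation is recovered from sq_sum afterwards:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
            double samples[] = { 3.0, 5.0, 4.0, 7.0 };
            double avg = 0.0, sq_sum = 0.0;
            int n = 0;

            for (int i = 0; i < 4; i++) {
                    double lat = samples[i], old_avg = avg;

                    n++;
                    if (n == 1) {
                            avg = lat;  /* first sample seeds the mean */
                    } else {
                            avg = old_avg + (lat - old_avg) / n;
                            /* the sq term is (lat - old_avg) * (lat - new_avg) */
                            sq_sum += (lat - old_avg) * (lat - avg);
                    }
            }
            /* prints mean=4.750000 stdev=1.707825 */
            printf("mean=%f stdev=%f\n", avg, sqrt(sq_sum / (n - 1)));
            return 0;
    }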
*/ -#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \ - CLIENT_METRIC_TYPE_CAP_INFO, \ - CLIENT_METRIC_TYPE_READ_LATENCY, \ - CLIENT_METRIC_TYPE_WRITE_LATENCY, \ - CLIENT_METRIC_TYPE_METADATA_LATENCY, \ - CLIENT_METRIC_TYPE_DENTRY_LEASE, \ - CLIENT_METRIC_TYPE_OPENED_FILES, \ - CLIENT_METRIC_TYPE_PINNED_ICAPS, \ - CLIENT_METRIC_TYPE_OPENED_INODES, \ - CLIENT_METRIC_TYPE_READ_IO_SIZES, \ - CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \ - \ - CLIENT_METRIC_TYPE_MAX, \ +#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \ + CLIENT_METRIC_TYPE_CAP_INFO, \ + CLIENT_METRIC_TYPE_READ_LATENCY, \ + CLIENT_METRIC_TYPE_WRITE_LATENCY, \ + CLIENT_METRIC_TYPE_METADATA_LATENCY, \ + CLIENT_METRIC_TYPE_DENTRY_LEASE, \ + CLIENT_METRIC_TYPE_OPENED_FILES, \ + CLIENT_METRIC_TYPE_PINNED_ICAPS, \ + CLIENT_METRIC_TYPE_OPENED_INODES, \ + CLIENT_METRIC_TYPE_READ_IO_SIZES, \ + CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \ + CLIENT_METRIC_TYPE_AVG_READ_LATENCY, \ + CLIENT_METRIC_TYPE_STDEV_READ_LATENCY, \ + CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY, \ + CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, \ + CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, \ + CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, \ + \ + CLIENT_METRIC_TYPE_MAX, \ } struct ceph_metric_header { @@ -60,22 +72,28 @@ struct ceph_metric_cap { /* metric read latency header */ struct ceph_metric_read_latency { struct ceph_metric_header header; - __le32 sec; - __le32 nsec; + struct ceph_timespec lat; + struct ceph_timespec avg; + __le64 sq_sum; + __le64 count; } __packed; /* metric write latency header */ struct ceph_metric_write_latency { struct ceph_metric_header header; - __le32 sec; - __le32 nsec; + struct ceph_timespec lat; + struct ceph_timespec avg; + __le64 sq_sum; + __le64 count; } __packed; /* metric metadata latency header */ struct ceph_metric_metadata_latency { struct ceph_metric_header header; - __le32 sec; - __le32 nsec; + struct ceph_timespec lat; + struct ceph_timespec avg; + __le64 sq_sum; + __le64 count; } __packed; /* metric dentry lease header */ @@ -140,6 +158,7 @@ struct ceph_metric { u64 size_min; u64 size_max; ktime_t latency_sum; + ktime_t latency_avg; ktime_t latency_sq_sum; ktime_t latency_min; ktime_t latency_max; diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 620c691af40e..64592adfe48f 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -30,6 +30,9 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode) /* if root is the real CephFS root, we don't have quota realms */ if (root && ceph_ino(root) == CEPH_INO_ROOT) return false; + /* MDS stray dirs have no quota realms */ + if (ceph_vino_is_reserved(ceph_inode(inode)->i_vino)) + return false; /* otherwise, we can't know for sure */ return true; } @@ -192,9 +195,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc) /* * This function walks through the snaprealm for an inode and returns the - * ceph_snap_realm for the first snaprealm that has quotas set (either max_files - * or max_bytes). If the root is reached, return the root ceph_snap_realm - * instead. + * ceph_snap_realm for the first snaprealm that has quotas set (max_files, + * max_bytes, or any, depending on the 'which_quota' argument). If the root is + * reached, return the root ceph_snap_realm instead. * * Note that the caller is responsible for calling ceph_put_snap_realm() on the * returned realm. @@ -206,7 +209,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc) * will be restarted. 
*/ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, - struct inode *inode, bool retry) + struct inode *inode, + enum quota_get_realm which_quota, + bool retry) { struct ceph_inode_info *ci = NULL; struct ceph_snap_realm *realm, *next; @@ -245,7 +250,7 @@ restart: } ci = ceph_inode(in); - has_quota = __ceph_has_any_quota(ci); + has_quota = __ceph_has_quota(ci, which_quota); iput(in); next = realm->parent; @@ -276,8 +281,8 @@ restart: * dropped and we can then restart the whole operation. */ down_read(&mdsc->snap_rwsem); - old_realm = get_quota_realm(mdsc, old, true); - new_realm = get_quota_realm(mdsc, new, false); + old_realm = get_quota_realm(mdsc, old, QUOTA_GET_ANY, true); + new_realm = get_quota_realm(mdsc, new, QUOTA_GET_ANY, false); if (PTR_ERR(new_realm) == -EAGAIN) { up_read(&mdsc->snap_rwsem); if (old_realm) @@ -480,7 +485,8 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) bool is_updated = false; down_read(&mdsc->snap_rwsem); - realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), true); + realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), + QUOTA_GET_MAX_BYTES, true); up_read(&mdsc->snap_rwsem); if (!realm) return false; @@ -494,10 +500,24 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) if (ci->i_max_bytes) { total = ci->i_max_bytes >> CEPH_BLOCK_SHIFT; used = ci->i_rbytes >> CEPH_BLOCK_SHIFT; + /* For quota size less than 4MB, use 4KB block size */ + if (!total) { + total = ci->i_max_bytes >> CEPH_4K_BLOCK_SHIFT; + used = ci->i_rbytes >> CEPH_4K_BLOCK_SHIFT; + buf->f_frsize = 1 << CEPH_4K_BLOCK_SHIFT; + } /* It is possible for a quota to be exceeded. * Report 'zero' in that case */ free = total > used ? total - used : 0; + /* For quota size less than 4KB, report + * total=used=4KB, free=0 when the quota is + * full, and total=free=4KB, used=0 otherwise */ + if (!total) { + total = 1; + free = ci->i_max_bytes > ci->i_rbytes ? 
1 : 0; + buf->f_frsize = 1 << CEPH_4K_BLOCK_SHIFT; + } } spin_unlock(&ci->i_ceph_lock); if (total) { diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index b41e6724c591..864cdaa0d2bd 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -121,18 +121,23 @@ static struct ceph_snap_realm *ceph_create_snap_realm( if (!realm) return ERR_PTR(-ENOMEM); - atomic_set(&realm->nref, 1); /* for caller */ + /* Do not release the global dummy snaprealm until unmounting */ + if (ino == CEPH_INO_GLOBAL_SNAPREALM) + atomic_set(&realm->nref, 2); + else + atomic_set(&realm->nref, 1); realm->ino = ino; INIT_LIST_HEAD(&realm->children); INIT_LIST_HEAD(&realm->child_item); INIT_LIST_HEAD(&realm->empty_item); INIT_LIST_HEAD(&realm->dirty_item); + INIT_LIST_HEAD(&realm->rebuild_item); INIT_LIST_HEAD(&realm->inodes_with_caps); spin_lock_init(&realm->inodes_with_caps_lock); __insert_snap_realm(&mdsc->snap_realms, realm); mdsc->num_snap_realms++; - dout("create_snap_realm %llx %p\n", realm->ino, realm); + dout("%s %llx %p\n", __func__, realm->ino, realm); return realm; } @@ -156,7 +161,7 @@ static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc, else if (ino > r->ino) n = n->rb_right; else { - dout("lookup_snap_realm %llx %p\n", r->ino, r); + dout("%s %llx %p\n", __func__, r->ino, r); return r; } } @@ -184,7 +189,7 @@ static void __destroy_snap_realm(struct ceph_mds_client *mdsc, { lockdep_assert_held_write(&mdsc->snap_rwsem); - dout("__destroy_snap_realm %p %llx\n", realm, realm->ino); + dout("%s %p %llx\n", __func__, realm, realm->ino); rb_erase(&realm->node, &mdsc->snap_realms); mdsc->num_snap_realms--; @@ -260,9 +265,14 @@ static void __cleanup_empty_realms(struct ceph_mds_client *mdsc) spin_unlock(&mdsc->snap_empty_lock); } -void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc) +void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc) { + struct ceph_snap_realm *global_realm; + down_write(&mdsc->snap_rwsem); + global_realm = __lookup_snap_realm(mdsc, CEPH_INO_GLOBAL_SNAPREALM); + if (global_realm) + ceph_put_snap_realm(mdsc, global_realm); __cleanup_empty_realms(mdsc); up_write(&mdsc->snap_rwsem); } @@ -292,9 +302,8 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc, if (IS_ERR(parent)) return PTR_ERR(parent); } - dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n", - realm->ino, realm, realm->parent_ino, realm->parent, - parentino, parent); + dout("%s %llx %p: %llx %p -> %llx %p\n", __func__, realm->ino, + realm, realm->parent_ino, realm->parent, parentino, parent); if (realm->parent) { list_del_init(&realm->child_item); ceph_put_snap_realm(mdsc, realm->parent); @@ -320,7 +329,8 @@ static int cmpu64_rev(const void *a, const void *b) * build the snap context for a given realm. 
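Returning to the statfs quota fallbacks in fs/ceph/quota.c above: they keep the reported numbers meaningful when the quota is smaller than one block. A worked standalone sketch, assuming CEPH_BLOCK_SHIFT is 22 and CEPH_4K_BLOCK_SHIFT is 12 as their names suggest, for a 1 MiB quota with 256 KiB used:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long max_bytes = 1ULL << 20;    /* 1 MiB quota */
            unsigned long long rbytes    = 256ULL << 10;  /* 256 KiB used */

            unsigned long long total  = max_bytes >> 22;  /* 4 MiB blocks: 0 */
            unsigned long long used   = rbytes >> 22;
            unsigned long long frsize = 1ULL << 22;

            if (!total) {                 /* fall back to 4 KiB blocks */
                    total  = max_bytes >> 12;   /* 256 */
                    used   = rbytes >> 12;      /* 64 */
                    frsize = 1ULL << 12;
            }
            /* a quota can be exceeded; clamp free at zero */
            unsigned long long free = total > used ? total - used : 0;

            printf("f_blocks=%llu f_bfree=%llu f_frsize=%llu\n",
                   total, free, frsize);  /* 256 192 4096 */
            return 0;
    }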
*/ static int build_snap_context(struct ceph_snap_realm *realm, - struct list_head* dirty_realms) + struct list_head *realm_queue, + struct list_head *dirty_realms) { struct ceph_snap_realm *parent = realm->parent; struct ceph_snap_context *snapc; @@ -334,9 +344,9 @@ static int build_snap_context(struct ceph_snap_realm *realm, */ if (parent) { if (!parent->cached_context) { - err = build_snap_context(parent, dirty_realms); - if (err) - goto fail; + /* add to the queue head */ + list_add(&parent->rebuild_item, realm_queue); + return 1; } num += parent->cached_context->num_snaps; } @@ -349,9 +359,8 @@ static int build_snap_context(struct ceph_snap_realm *realm, realm->cached_context->seq == realm->seq && (!parent || realm->cached_context->seq >= parent->cached_context->seq)) { - dout("build_snap_context %llx %p: %p seq %lld (%u snaps)" - " (unchanged)\n", - realm->ino, realm, realm->cached_context, + dout("%s %llx %p: %p seq %lld (%u snaps) (unchanged)\n", + __func__, realm->ino, realm, realm->cached_context, realm->cached_context->seq, (unsigned int)realm->cached_context->num_snaps); return 0; @@ -390,9 +399,8 @@ static int build_snap_context(struct ceph_snap_realm *realm, sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL); snapc->num_snaps = num; - dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n", - realm->ino, realm, snapc, snapc->seq, - (unsigned int) snapc->num_snaps); + dout("%s %llx %p: %p seq %lld (%u snaps)\n", __func__, realm->ino, + realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps); ceph_put_snap_context(realm->cached_context); realm->cached_context = snapc; @@ -409,8 +417,7 @@ fail: ceph_put_snap_context(realm->cached_context); realm->cached_context = NULL; } - pr_err("build_snap_context %llx %p fail %d\n", realm->ino, - realm, err); + pr_err("%s %llx %p fail %d\n", __func__, realm->ino, realm, err); return err; } @@ -420,13 +427,50 @@ fail: static void rebuild_snap_realms(struct ceph_snap_realm *realm, struct list_head *dirty_realms) { - struct ceph_snap_realm *child; + LIST_HEAD(realm_queue); + int last = 0; + bool skip = false; + + list_add_tail(&realm->rebuild_item, &realm_queue); + + while (!list_empty(&realm_queue)) { + struct ceph_snap_realm *_realm, *child; + + _realm = list_first_entry(&realm_queue, + struct ceph_snap_realm, + rebuild_item); + + /* + * If the last build failed due to a memory + * issue, just empty the realm_queue and return + * to avoid an infinite loop. + */ + if (last < 0) { + list_del_init(&_realm->rebuild_item); + continue; + } + + last = build_snap_context(_realm, &realm_queue, dirty_realms); + dout("%s %llx %p, %s\n", __func__, _realm->ino, _realm, + last > 0 ? "is deferred" : !last ? "succeeded" : "failed"); + + /* is any child in the list? */ + list_for_each_entry(child, &_realm->children, child_item) { + if (!list_empty(&child->rebuild_item)) { + skip = true; + break; + } + } - dout("rebuild_snap_realms %llx %p\n", realm->ino, realm); - build_snap_context(realm, dirty_realms); + if (!skip) { + list_for_each_entry(child, &_realm->children, child_item) + list_add_tail(&child->rebuild_item, &realm_queue); + } - list_for_each_entry(child, &realm->children, child_item) - rebuild_snap_realms(child, dirty_realms); + /* last == 1 means we need to build the parent first */ + if (last <= 0) + list_del_init(&_realm->rebuild_item); + } } @@ -474,23 +518,15 @@ static bool has_new_snaps(struct ceph_snap_context *o, * Caller must hold snap_rwsem for read (i.e., the realm topology won't * change). 
*/ -static void ceph_queue_cap_snap(struct ceph_inode_info *ci) +static void ceph_queue_cap_snap(struct ceph_inode_info *ci, + struct ceph_cap_snap **pcapsnap) { - struct inode *inode = &ci->vfs_inode; - struct ceph_cap_snap *capsnap; + struct inode *inode = &ci->netfs.inode; struct ceph_snap_context *old_snapc, *new_snapc; + struct ceph_cap_snap *capsnap = *pcapsnap; struct ceph_buffer *old_blob = NULL; int used, dirty; - capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); - if (!capsnap) { - pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode); - return; - } - capsnap->cap_flush.is_capsnap = true; - INIT_LIST_HEAD(&capsnap->cap_flush.i_list); - INIT_LIST_HEAD(&capsnap->cap_flush.g_list); - spin_lock(&ci->i_ceph_lock); used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); @@ -511,12 +547,14 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci) as no new writes are allowed to start when pending, so any writes in progress now were started before the previous cap_snap. lucky us. */ - dout("queue_cap_snap %p already pending\n", inode); + dout("%s %p %llx.%llx already pending\n", + __func__, inode, ceph_vinop(inode)); goto update_snapc; } if (ci->i_wrbuffer_ref_head == 0 && !(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) { - dout("queue_cap_snap %p nothing dirty|writing\n", inode); + dout("%s %p %llx.%llx nothing dirty|writing\n", + __func__, inode, ceph_vinop(inode)); goto update_snapc; } @@ -536,20 +574,17 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci) } else { if (!(used & CEPH_CAP_FILE_WR) && ci->i_wrbuffer_ref_head == 0) { - dout("queue_cap_snap %p " - "no new_snap|dirty_page|writing\n", inode); + dout("%s %p %llx.%llx no new_snap|dirty_page|writing\n", + __func__, inode, ceph_vinop(inode)); goto update_snapc; } } - dout("queue_cap_snap %p cap_snap %p queuing under %p %s %s\n", - inode, capsnap, old_snapc, ceph_cap_string(dirty), - capsnap->need_flush ? "" : "no_flush"); + dout("%s %p %llx.%llx cap_snap %p queuing under %p %s %s\n", + __func__, inode, ceph_vinop(inode), capsnap, old_snapc, + ceph_cap_string(dirty), capsnap->need_flush ? "" : "no_flush"); ihold(inode); - refcount_set(&capsnap->nref, 1); - INIT_LIST_HEAD(&capsnap->ci_item); - capsnap->follows = old_snapc->seq; capsnap->issued = __ceph_caps_issued(ci, NULL); capsnap->dirty = dirty; @@ -579,31 +614,30 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci) list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); if (used & CEPH_CAP_FILE_WR) { - dout("queue_cap_snap %p cap_snap %p snapc %p" - " seq %llu used WR, now pending\n", inode, + dout("%s %p %llx.%llx cap_snap %p snapc %p seq %llu used WR," + " now pending\n", __func__, inode, ceph_vinop(inode), capsnap, old_snapc, old_snapc->seq); capsnap->writing = 1; } else { /* note mtime, size NOW. 
*/ __ceph_finish_cap_snap(ci, capsnap); } - capsnap = NULL; + *pcapsnap = NULL; old_snapc = NULL; update_snapc: - if (ci->i_wrbuffer_ref_head == 0 && - ci->i_wr_ref == 0 && - ci->i_dirty_caps == 0 && - ci->i_flushing_caps == 0) { - ci->i_head_snapc = NULL; - } else { + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ci->i_head_snapc = NULL; + } else { ci->i_head_snapc = ceph_get_snap_context(new_snapc); dout(" new snapc is %p\n", new_snapc); } spin_unlock(&ci->i_ceph_lock); ceph_buffer_put(old_blob); - kfree(capsnap); ceph_put_snap_context(old_snapc); } @@ -618,7 +652,7 @@ update_snapc: int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); BUG_ON(capsnap->writing); @@ -632,27 +666,28 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, capsnap->truncate_size = ci->i_truncate_size; capsnap->truncate_seq = ci->i_truncate_seq; if (capsnap->dirty_pages) { - dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu " - "still has %d dirty pages\n", inode, capsnap, - capsnap->context, capsnap->context->seq, - ceph_cap_string(capsnap->dirty), capsnap->size, - capsnap->dirty_pages); + dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu " - "still has %d dirty pages\n", __func__, inode, + "still has %d dirty pages\n", __func__, inode, + ceph_vinop(inode), capsnap, capsnap->context, + capsnap->context->seq, ceph_cap_string(capsnap->dirty), + capsnap->size, capsnap->dirty_pages); return 0; } /* Fb cap still in use, delay it */ if (ci->i_wb_ref) { - dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu " - "used WRBUFFER, delaying\n", inode, capsnap, - capsnap->context, capsnap->context->seq, - ceph_cap_string(capsnap->dirty), capsnap->size); + dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu " + "used WRBUFFER, delaying\n", __func__, inode, + ceph_vinop(inode), capsnap, capsnap->context, + capsnap->context->seq, ceph_cap_string(capsnap->dirty), + capsnap->size); capsnap->writing = 1; return 0; } ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; - dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n", - inode, capsnap, capsnap->context, + dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu\n", + __func__, inode, ceph_vinop(inode), capsnap, capsnap->context, capsnap->context->seq, ceph_cap_string(capsnap->dirty), capsnap->size); @@ -671,24 +706,47 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) { struct ceph_inode_info *ci; struct inode *lastinode = NULL; + struct ceph_cap_snap *capsnap = NULL; - dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino); + dout("%s %p %llx inode\n", __func__, realm, realm->ino); spin_lock(&realm->inodes_with_caps_lock); list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) { - struct inode *inode = igrab(&ci->vfs_inode); + struct inode *inode = igrab(&ci->netfs.inode); if (!inode) continue; spin_unlock(&realm->inodes_with_caps_lock); iput(lastinode); lastinode = inode; - ceph_queue_cap_snap(ci); + + /* + * Allocate the capsnap memory outside of ceph_queue_cap_snap() + * to avoid the very likely but unnecessary frequent memory + * allocation/freeing in this loop. 
+ */ + if (!capsnap) { + capsnap = kmem_cache_zalloc(ceph_cap_snap_cachep, GFP_NOFS); + if (!capsnap) { + pr_err("ENOMEM allocating ceph_cap_snap on %p\n", + inode); + return; + } + } + capsnap->cap_flush.is_capsnap = true; + refcount_set(&capsnap->nref, 1); + INIT_LIST_HEAD(&capsnap->cap_flush.i_list); + INIT_LIST_HEAD(&capsnap->cap_flush.g_list); + INIT_LIST_HEAD(&capsnap->ci_item); + + ceph_queue_cap_snap(ci, &capsnap); spin_lock(&realm->inodes_with_caps_lock); } spin_unlock(&realm->inodes_with_caps_lock); iput(lastinode); - dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); + if (capsnap) + kmem_cache_free(ceph_cap_snap_cachep, capsnap); + dout("%s %p %llx done\n", __func__, realm, realm->ino); } /* @@ -707,14 +765,16 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, __le64 *prior_parent_snaps; /* encoded */ struct ceph_snap_realm *realm = NULL; struct ceph_snap_realm *first_realm = NULL; - int invalidate = 0; + struct ceph_snap_realm *realm_to_rebuild = NULL; + int rebuild_snapcs; int err = -ENOMEM; LIST_HEAD(dirty_realms); lockdep_assert_held_write(&mdsc->snap_rwsem); - dout("update_snap_trace deletion=%d\n", deletion); + dout("%s deletion=%d\n", __func__, deletion); more: + rebuild_snapcs = 0; ceph_decode_need(&p, e, sizeof(*ri), bad); ri = p; p += sizeof(*ri); @@ -738,10 +798,10 @@ more: err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent)); if (err < 0) goto fail; - invalidate += err; + rebuild_snapcs += err; if (le64_to_cpu(ri->seq) > realm->seq) { - dout("update_snap_trace updating %llx %p %lld -> %lld\n", + dout("%s updating %llx %p %lld -> %lld\n", __func__, realm->ino, realm, realm->seq, le64_to_cpu(ri->seq)); /* update realm parameters, snap lists */ realm->seq = le64_to_cpu(ri->seq); @@ -763,22 +823,30 @@ more: if (realm->seq > mdsc->last_snap_seq) mdsc->last_snap_seq = realm->seq; - invalidate = 1; + rebuild_snapcs = 1; } else if (!realm->cached_context) { - dout("update_snap_trace %llx %p seq %lld new\n", + dout("%s %llx %p seq %lld new\n", __func__, realm->ino, realm, realm->seq); - invalidate = 1; + rebuild_snapcs = 1; } else { - dout("update_snap_trace %llx %p seq %lld unchanged\n", + dout("%s %llx %p seq %lld unchanged\n", __func__, realm->ino, realm, realm->seq); } - dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, - realm, invalidate, p, e); + dout("done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino, + realm, rebuild_snapcs, p, e); + + /* + * this will always track the uppermost parent realm from which + * we need to rebuild the snapshot contexts _downward_ in the + * hierarchy. 
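The rebuild itself (rebuild_snap_realms(), earlier in this file) deliberately trades recursion for an explicit work queue so a deep realm hierarchy cannot overflow the kernel stack: a realm whose parent still lacks a snap context is deferred and the parent is pushed to the queue head. A standalone sketch of that parent-first deferral with a toy realm type:

    #include <stdio.h>

    struct realm {
            const char *name;
            struct realm *parent;
            int built;              /* stands in for cached_context */
    };

    int main(void)
    {
            struct realm root = { "root", NULL,  0 };
            struct realm mid  = { "mid",  &root, 0 };
            struct realm leaf = { "leaf", &mid,  0 };

            struct realm *q[8] = { &leaf };  /* work queue, head at q[0] */
            int n = 1;

            while (n) {
                    struct realm *r = q[0];

                    if (r->parent && !r->parent->built) {
                            /* defer: push the parent to the queue head */
                            for (int i = n; i > 0; i--)
                                    q[i] = q[i - 1];
                            q[0] = r->parent;
                            n++;
                            continue;
                    }
                    printf("building %s\n", r->name);  /* root, mid, leaf */
                    r->built = 1;
                    for (int i = 1; i < n; i++)        /* pop the head */
                            q[i - 1] = q[i];
                    n--;
            }
            return 0;
    }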
+ */ + if (rebuild_snapcs) + realm_to_rebuild = realm; - /* invalidate when we reach the _end_ (root) of the trace */ - if (invalidate && p >= e) - rebuild_snap_realms(realm, &dirty_realms); + /* rebuild_snapcs when we reach the _end_ (root) of the trace */ + if (realm_to_rebuild && p >= e) + rebuild_snap_realms(realm_to_rebuild, &dirty_realms); if (!first_realm) first_realm = realm; @@ -814,7 +882,7 @@ fail: ceph_put_snap_realm(mdsc, realm); if (first_realm) ceph_put_snap_realm(mdsc, first_realm); - pr_err("update_snap_trace error %d\n", err); + pr_err("%s error %d\n", __func__, err); return err; } @@ -831,12 +899,12 @@ static void flush_snaps(struct ceph_mds_client *mdsc) struct inode *inode; struct ceph_mds_session *session = NULL; - dout("flush_snaps\n"); + dout("%s\n", __func__); spin_lock(&mdsc->snap_flush_lock); while (!list_empty(&mdsc->snap_flush_list)) { ci = list_first_entry(&mdsc->snap_flush_list, struct ceph_inode_info, i_snap_flush_item); - inode = &ci->vfs_inode; + inode = &ci->netfs.inode; ihold(inode); spin_unlock(&mdsc->snap_flush_lock); ceph_flush_snaps(ci, &session); @@ -846,7 +914,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc) spin_unlock(&mdsc->snap_flush_lock); ceph_put_mds_session(session); - dout("flush_snaps done\n"); + dout("%s done\n", __func__); } /** @@ -928,8 +996,8 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, trace_len = le32_to_cpu(h->trace_len); p += sizeof(*h); - dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds, - ceph_snap_op_name(op), split, trace_len); + dout("%s from mds%d op %s split %llx tracelen %d\n", __func__, + mds, ceph_snap_op_name(op), split, trace_len); mutex_lock(&session->s_mutex); inc_session_sequence(session); @@ -989,13 +1057,13 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, */ if (ci->i_snap_realm->created > le64_to_cpu(ri->created)) { - dout(" leaving %p in newer realm %llx %p\n", - inode, ci->i_snap_realm->ino, + dout(" leaving %p %llx.%llx in newer realm %llx %p\n", + inode, ceph_vinop(inode), ci->i_snap_realm->ino, ci->i_snap_realm); goto skip_inode; } - dout(" will move %p to split realm %llx %p\n", - inode, realm->ino, realm); + dout(" will move %p %llx.%llx to split realm %llx %p\n", + inode, ceph_vinop(inode), realm->ino, realm); ceph_get_snap_realm(mdsc, realm); ceph_change_snap_realm(inode, realm); @@ -1038,7 +1106,7 @@ skip_inode: return; bad: - pr_err("corrupt snap message from mds%d\n", mds); + pr_err("%s corrupt snap message from mds%d\n", __func__, mds); ceph_msg_dump(msg); out: if (locked_rwsem) @@ -1071,7 +1139,8 @@ struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc, } spin_unlock(&mdsc->snapid_map_lock); if (exist) { - dout("found snapid map %llx -> %x\n", exist->snap, exist->dev); + dout("%s found snapid map %llx -> %x\n", __func__, + exist->snap, exist->dev); return exist; } @@ -1115,11 +1184,13 @@ struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc, if (exist) { free_anon_bdev(sm->dev); kfree(sm); - dout("found snapid map %llx -> %x\n", exist->snap, exist->dev); + dout("%s found snapid map %llx -> %x\n", __func__, + exist->snap, exist->dev); return exist; } - dout("create snapid map %llx -> %x\n", sm->snap, sm->dev); + dout("%s create snapid map %llx -> %x\n", __func__, + sm->snap, sm->dev); return sm; } diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index 573bb9556fb5..e36e8948e728 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -60,6 +60,7 @@ const char *ceph_mds_op_name(int op) case CEPH_MDS_OP_LOOKUPINO: 
return "lookupino"; case CEPH_MDS_OP_LOOKUPNAME: return "lookupname"; case CEPH_MDS_OP_GETATTR: return "getattr"; + case CEPH_MDS_OP_GETVXATTR: return "getvxattr"; case CEPH_MDS_OP_SETXATTR: return "setxattr"; case CEPH_MDS_OP_SETATTR: return "setattr"; case CEPH_MDS_OP_RMXATTR: return "rmxattr"; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index bab61232dc5a..3fc48b43cab0 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -27,6 +27,8 @@ #include <linux/ceph/auth.h> #include <linux/ceph/debugfs.h> +#include <uapi/linux/magic.h> + static DEFINE_SPINLOCK(ceph_fsc_lock); static LIST_HEAD(ceph_fsc_list); @@ -70,15 +72,9 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = CEPH_SUPER_MAGIC; /* ?? */ /* - * express utilization in terms of large blocks to avoid + * Express utilization in terms of large blocks to avoid * overflow on 32-bit machines. - * - * NOTE: for the time being, we make bsize == frsize to humor - * not-yet-ancient versions of glibc that are broken. - * Someday, we will probably want to report a real block - * size... whatever that may mean for a network file system! */ - buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; buf->f_frsize = 1 << CEPH_BLOCK_SHIFT; /* @@ -93,6 +89,14 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); } + /* + * NOTE: for the time being, we make bsize == frsize to humor + * not-yet-ancient versions of glibc that are broken. + * Someday, we will probably want to report a real block + * size... whatever that may mean for a network file system! + */ + buf->f_bsize = buf->f_frsize; + buf->f_files = le64_to_cpu(st.num_objects); buf->f_ffree = -1; buf->f_namelen = NAME_MAX; @@ -146,6 +150,7 @@ enum { Opt_mds_namespace, Opt_recover_session, Opt_source, + Opt_mon_addr, /* string args above */ Opt_dirstat, Opt_rbytes, @@ -159,6 +164,7 @@ enum { Opt_quotadf, Opt_copyfrom, Opt_wsync, + Opt_pagecache, }; enum ceph_recover_session_mode { @@ -197,8 +203,10 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = { fsparam_u32 ("rsize", Opt_rsize), fsparam_string ("snapdirname", Opt_snapdirname), fsparam_string ("source", Opt_source), + fsparam_string ("mon_addr", Opt_mon_addr), fsparam_u32 ("wsize", Opt_wsize), fsparam_flag_no ("wsync", Opt_wsync), + fsparam_flag_no ("pagecache", Opt_pagecache), {} }; @@ -228,9 +236,92 @@ static void canonicalize_path(char *path) } /* - * Parse the source parameter. Distinguish the server list from the path. + * Check if the mds namespace in ceph_mount_options matches + * the passed in namespace string. First time match (when + * ->mds_namespace is NULL) is treated specially, since + * ->mds_namespace needs to be initialized by the caller. 
+ */
+static int namespace_equals(struct ceph_mount_options *fsopt,
+			    const char *namespace, size_t len)
+{
+	return !(fsopt->mds_namespace &&
+		 (strlen(fsopt->mds_namespace) != len ||
+		  strncmp(fsopt->mds_namespace, namespace, len)));
+}
+
+static int ceph_parse_old_source(const char *dev_name, const char *dev_name_end,
+				 struct fs_context *fc)
+{
+	int r;
+	struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+	struct ceph_mount_options *fsopt = pctx->opts;
+
+	if (*dev_name_end != ':')
+		return invalfc(fc, "separator ':' missing in source");
+
+	r = ceph_parse_mon_ips(dev_name, dev_name_end - dev_name,
+			       pctx->copts, fc->log.log, ',');
+	if (r)
+		return r;
+
+	fsopt->new_dev_syntax = false;
+	return 0;
+}
+
+static int ceph_parse_new_source(const char *dev_name, const char *dev_name_end,
+				 struct fs_context *fc)
+{
+	size_t len;
+	struct ceph_fsid fsid;
+	struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+	struct ceph_mount_options *fsopt = pctx->opts;
+	char *fsid_start, *fs_name_start;
+
+	if (*dev_name_end != '=') {
+		dout("separator '=' missing in source\n");
+		return -EINVAL;
+	}
+
+	fsid_start = strchr(dev_name, '@');
+	if (!fsid_start)
+		return invalfc(fc, "missing cluster fsid");
+	++fsid_start; /* start of cluster fsid */
+
+	fs_name_start = strchr(fsid_start, '.');
+	if (!fs_name_start)
+		return invalfc(fc, "missing file system name");
+
+	if (ceph_parse_fsid(fsid_start, &fsid))
+		return invalfc(fc, "invalid cluster fsid");
+
+	++fs_name_start; /* start of file system name */
+	len = dev_name_end - fs_name_start;
+
+	if (!namespace_equals(fsopt, fs_name_start, len))
+		return invalfc(fc, "Mismatching mds_namespace");
+	kfree(fsopt->mds_namespace);
+	fsopt->mds_namespace = kstrndup(fs_name_start, len, GFP_KERNEL);
+	if (!fsopt->mds_namespace)
+		return -ENOMEM;
+	dout("file system (mds namespace) '%s'\n", fsopt->mds_namespace);
+
+	fsopt->new_dev_syntax = true;
+	return 0;
+}
+
+/*
+ * Parse the source parameter. Distinguish the device spec from the
+ * path. Try parsing the new device format first and fall back to the
+ * old format if needed.
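+ *
+ * [Editor's illustration of the two accepted source strings, values
+ * invented for this note:
+ *	new: "user@3fe93764-1dd2-11b2-a105-24fd52f5f12f.myfs=/dir"
+ *	old: "192.168.1.100:6789,192.168.1.101:6789:/dir"]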
+ *
+ * New device syntax looks like:
+ *     <device_spec>=/<path>
+ * where
+ *     <device_spec> is name@fsid.fsname
+ *     <path> is optional, but if present must begin with '/'
+ * (monitor addresses are passed via mount option)
 *
- * The source will look like:
+ * Old device syntax is:
 *     <server_spec>[,<server_spec>...]:[<path>]
 * where
 *     <server_spec> is <ip>[:<port>]
@@ -263,24 +354,44 @@ static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc)
		dev_name_end = dev_name + strlen(dev_name);
	}

-	dev_name_end--;		/* back up to ':' separator */
-	if (dev_name_end < dev_name || *dev_name_end != ':')
-		return invalfc(fc, "No path or : separator in source");
+	dev_name_end--;		/* back up to separator */
+	if (dev_name_end < dev_name)
+		return invalfc(fc, "Path missing in source");

	dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
	if (fsopt->server_path)
		dout("server path '%s'\n", fsopt->server_path);

-	ret = ceph_parse_mon_ips(param->string, dev_name_end - dev_name,
-				 pctx->copts, fc->log.log);
-	if (ret)
-		return ret;
+	dout("trying new device syntax\n");
+	ret = ceph_parse_new_source(dev_name, dev_name_end, fc);
+	if (ret) {
+		if (ret != -EINVAL)
+			return ret;
+		dout("trying old device syntax\n");
+		ret = ceph_parse_old_source(dev_name, dev_name_end, fc);
+		if (ret)
+			return ret;
+	}

	fc->source = param->string;
	param->string = NULL;
	return 0;
}

+static int ceph_parse_mon_addr(struct fs_parameter *param,
+			       struct fs_context *fc)
+{
+	struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+	struct ceph_mount_options *fsopt = pctx->opts;
+
+	kfree(fsopt->mon_addr);
+	fsopt->mon_addr = param->string;
+	param->string = NULL;
+
+	return ceph_parse_mon_ips(fsopt->mon_addr, strlen(fsopt->mon_addr),
+				  pctx->copts, fc->log.log, '/');
+}
+
 static int ceph_parse_mount_param(struct fs_context *fc,
				   struct fs_parameter *param)
 {
@@ -306,6 +417,8 @@ static int ceph_parse_mount_param(struct fs_context *fc,
		param->string = NULL;
		break;
	case Opt_mds_namespace:
+		if (!namespace_equals(fsopt, param->string, strlen(param->string)))
+			return invalfc(fc, "Mismatching mds_namespace");
		kfree(fsopt->mds_namespace);
		fsopt->mds_namespace = param->string;
		param->string = NULL;
@@ -323,6 +436,8 @@ static int ceph_parse_mount_param(struct fs_context *fc,
		if (fc->source)
			return invalfc(fc, "Multiple sources specified");
		return ceph_parse_source(param, fc);
+	case Opt_mon_addr:
+		return ceph_parse_mon_addr(param, fc);
	case Opt_wsize:
		if (result.uint_32 < PAGE_SIZE ||
		    result.uint_32 > CEPH_MAX_WRITE_SIZE)
@@ -455,6 +570,12 @@ static int ceph_parse_mount_param(struct fs_context *fc,
		else
			fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS;
		break;
+	case Opt_pagecache:
+		if (result.negated)
+			fsopt->flags |= CEPH_MOUNT_OPT_NOPAGECACHE;
+		else
+			fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE;
+		break;
	default:
		BUG();
	}
@@ -474,6 +595,7 @@ static void destroy_mount_options(struct ceph_mount_options *args)
	kfree(args->mds_namespace);
	kfree(args->server_path);
	kfree(args->fscache_uniq);
+	kfree(args->mon_addr);
	kfree(args);
}

@@ -517,6 +639,10 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
	if (ret)
		return ret;

+	ret = strcmp_null(fsopt1->mon_addr, fsopt2->mon_addr);
+	if (ret)
+		return ret;
+
	return ceph_compare_options(new_opt, fsc->client);
}

@@ -572,15 +698,22 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
	if ((fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM) == 0)
		seq_puts(m, ",copyfrom");

-	if (fsopt->mds_namespace)
+	/* dump mds_namespace when old device
syntax is in use */ + if (fsopt->mds_namespace && !fsopt->new_dev_syntax) seq_show_option(m, "mds_namespace", fsopt->mds_namespace); + if (fsopt->mon_addr) + seq_printf(m, ",mon_addr=%s", fsopt->mon_addr); + if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER) seq_show_option(m, "recover_session", "clean"); if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)) seq_puts(m, ",wsync"); + if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE) + seq_puts(m, ",nopagecache"); + if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) seq_printf(m, ",wsize=%u", fsopt->wsize); if (fsopt->rsize != CEPH_MAX_READ_SIZE) @@ -671,6 +804,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, fsc->have_copy_from2 = true; atomic_long_set(&fsc->writeback_count, 0); + fsc->write_congested = false; err = -ENOMEM; /* @@ -684,6 +818,9 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, if (!fsc->cap_wq) goto fail_inode_wq; + hash_init(fsc->async_unlink_conflict); + spin_lock_init(&fsc->async_unlink_conflict_lock); + spin_lock(&ceph_fsc_lock); list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list); spin_unlock(&ceph_fsc_lock); @@ -733,6 +870,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) */ struct kmem_cache *ceph_inode_cachep; struct kmem_cache *ceph_cap_cachep; +struct kmem_cache *ceph_cap_snap_cachep; struct kmem_cache *ceph_cap_flush_cachep; struct kmem_cache *ceph_dentry_cachep; struct kmem_cache *ceph_file_cachep; @@ -743,7 +881,7 @@ mempool_t *ceph_wb_pagevec_pool; static void ceph_inode_init_once(void *foo) { struct ceph_inode_info *ci = foo; - inode_init_once(&ci->vfs_inode); + inode_init_once(&ci->netfs.inode); } static int __init init_caches(void) @@ -761,6 +899,9 @@ static int __init init_caches(void) ceph_cap_cachep = KMEM_CACHE(ceph_cap, SLAB_MEM_SPREAD); if (!ceph_cap_cachep) goto bad_cap; + ceph_cap_snap_cachep = KMEM_CACHE(ceph_cap_snap, SLAB_MEM_SPREAD); + if (!ceph_cap_snap_cachep) + goto bad_cap_snap; ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); if (!ceph_cap_flush_cachep) @@ -787,16 +928,10 @@ static int __init init_caches(void) if (!ceph_wb_pagevec_pool) goto bad_pagevec_pool; - error = ceph_fscache_register(); - if (error) - goto bad_fscache; - return 0; -bad_fscache: - kmem_cache_destroy(ceph_mds_request_cachep); bad_pagevec_pool: - mempool_destroy(ceph_wb_pagevec_pool); + kmem_cache_destroy(ceph_mds_request_cachep); bad_mds_req: kmem_cache_destroy(ceph_dir_file_cachep); bad_dir_file: @@ -806,6 +941,8 @@ bad_file: bad_dentry: kmem_cache_destroy(ceph_cap_flush_cachep); bad_cap_flush: + kmem_cache_destroy(ceph_cap_snap_cachep); +bad_cap_snap: kmem_cache_destroy(ceph_cap_cachep); bad_cap: kmem_cache_destroy(ceph_inode_cachep); @@ -822,14 +959,13 @@ static void destroy_caches(void) kmem_cache_destroy(ceph_inode_cachep); kmem_cache_destroy(ceph_cap_cachep); + kmem_cache_destroy(ceph_cap_snap_cachep); kmem_cache_destroy(ceph_cap_flush_cachep); kmem_cache_destroy(ceph_dentry_cachep); kmem_cache_destroy(ceph_file_cachep); kmem_cache_destroy(ceph_dir_file_cachep); kmem_cache_destroy(ceph_mds_request_cachep); mempool_destroy(ceph_wb_pagevec_pool); - - ceph_fscache_unregister(); } static void __ceph_umount_begin(struct ceph_fs_client *fsc) @@ -988,6 +1124,7 @@ static int ceph_set_super(struct super_block *s, struct fs_context *fc) s->s_time_gran = 1; s->s_time_min = 0; s->s_time_max = U32_MAX; + s->s_flags |= SB_NODIRATIME | SB_NOATIME; ret = set_anon_super_fc(s, fc); if (ret != 0) @@ -1060,6 +1197,7 @@ static int 
ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc) static int ceph_get_tree(struct fs_context *fc) { struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; struct super_block *sb; struct ceph_fs_client *fsc; struct dentry *res; @@ -1071,6 +1209,8 @@ static int ceph_get_tree(struct fs_context *fc) if (!fc->source) return invalfc(fc, "No source"); + if (fsopt->new_dev_syntax && !fsopt->mon_addr) + return invalfc(fc, "No monitor address"); /* create client (which we may/may not use) */ fsc = create_fs_client(pctx->opts, pctx->copts); @@ -1156,6 +1296,13 @@ static int ceph_reconfigure_fc(struct fs_context *fc) else ceph_clear_mount_opt(fsc, ASYNC_DIROPS); + if (strcmp_null(fsc->mount_options->mon_addr, fsopt->mon_addr)) { + kfree(fsc->mount_options->mon_addr); + fsc->mount_options->mon_addr = fsopt->mon_addr; + fsopt->mon_addr = NULL; + pr_notice("ceph: monitor addresses recorded, but not used for reconnection"); + } + sync_filesystem(fc->root->d_sb); return 0; } @@ -1333,6 +1480,14 @@ bool disable_send_metrics = false; module_param_cb(disable_send_metrics, ¶m_ops_metrics, &disable_send_metrics, 0644); MODULE_PARM_DESC(disable_send_metrics, "Enable sending perf metrics to ceph cluster (default: on)"); +/* for both v1 and v2 syntax */ +static bool mount_support = true; +static const struct kernel_param_ops param_ops_mount_syntax = { + .get = param_get_bool, +}; +module_param_cb(mount_syntax_v1, ¶m_ops_mount_syntax, &mount_support, 0444); +module_param_cb(mount_syntax_v2, ¶m_ops_mount_syntax, &mount_support, 0444); + module_init(init_ceph); module_exit(exit_ceph); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index ac331aa07cfa..40630e6f691c 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -17,21 +17,17 @@ #include <linux/posix_acl.h> #include <linux/refcount.h> #include <linux/security.h> - -#include <linux/ceph/libceph.h> - -#ifdef CONFIG_CEPH_FSCACHE -#define FSCACHE_USE_NEW_IO_API +#include <linux/netfs.h> #include <linux/fscache.h> -#endif +#include <linux/hashtable.h> -/* f_type in struct statfs */ -#define CEPH_SUPER_MAGIC 0x00c36400 +#include <linux/ceph/libceph.h> /* large granularity for statfs utilization stats to facilitate * large volume sizes on 32-bit machines. 
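 *
 * [Editor's note, illustrative arithmetic: the MDS reports sizes in KiB,
 * so ceph_statfs() earlier in this diff converts them with
 * kb >> (CEPH_BLOCK_SHIFT - 10); e.g. 4194304 KiB >> 12 == 1024
 * four-MiB blocks, i.e. 4 GiB.]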
 */
#define CEPH_BLOCK_SHIFT	22  /* 4 MB */
#define CEPH_BLOCK		(1 << CEPH_BLOCK_SHIFT)
+#define CEPH_4K_BLOCK_SHIFT	12  /* 4 KB */

#define CEPH_MOUNT_OPT_CLEANRECOVER	(1<<1) /* auto reconnect (clean mode) after blocklisted */
#define CEPH_MOUNT_OPT_DIRSTAT		(1<<4) /* `cat dirname` for stats */
@@ -45,6 +41,7 @@
#define CEPH_MOUNT_OPT_NOQUOTADF	(1<<13) /* no root dir quota in statfs */
#define CEPH_MOUNT_OPT_NOCOPYFROM	(1<<14) /* don't use RADOS 'copy-from' op */
#define CEPH_MOUNT_OPT_ASYNC_DIROPS	(1<<15) /* allow async directory ops */
+#define CEPH_MOUNT_OPT_NOPAGECACHE	(1<<16) /* bypass pagecache altogether */

#define CEPH_MOUNT_OPT_DEFAULT			\
	(CEPH_MOUNT_OPT_DCACHE |		\
@@ -89,6 +86,8 @@ struct ceph_mount_options {
	unsigned int max_readdir;       /* max readdir result (entries) */
	unsigned int max_readdir_bytes; /* max readdir result (bytes) */

+	bool new_dev_syntax;
+
	/*
	 * everything above this point can be memcmp'd; everything below
	 * is handled in compare_mount_options()
@@ -98,8 +97,11 @@ struct ceph_mount_options {
	char *mds_namespace;  /* default NULL */
	char *server_path;    /* default NULL (means "/") */
	char *fscache_uniq;   /* default NULL */
+	char *mon_addr;
};

+#define CEPH_ASYNC_CREATE_CONFLICT_BITS 8
+
struct ceph_fs_client {
	struct super_block *sb;

@@ -120,10 +122,14 @@ struct ceph_fs_client {
	struct ceph_mds_client *mdsc;

	atomic_long_t writeback_count;
+	bool write_congested;

	struct workqueue_struct *inode_wq;
	struct workqueue_struct *cap_wq;

+	DECLARE_HASHTABLE(async_unlink_conflict, CEPH_ASYNC_CREATE_CONFLICT_BITS);
+	spinlock_t async_unlink_conflict_lock;
+
#ifdef CONFIG_DEBUG_FS
	struct dentry *debugfs_dentry_lru, *debugfs_caps;
	struct dentry *debugfs_congestion_kb;
@@ -135,7 +141,7 @@ struct ceph_fs_client {
#endif

#ifdef CONFIG_CEPH_FSCACHE
-	struct fscache_cookie *fscache;
+	struct fscache_volume *fscache;
#endif
};

@@ -229,7 +235,7 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
	if (refcount_dec_and_test(&capsnap->nref)) {
		if (capsnap->xattr_blob)
			ceph_buffer_put(capsnap->xattr_blob);
-		kfree(capsnap);
+		kmem_cache_free(ceph_cap_snap_cachep, capsnap);
	}
}

@@ -280,7 +286,8 @@ struct ceph_dentry_info {
	struct dentry *dentry;
	struct ceph_mds_session *lease_session;
	struct list_head lease_list;
-	unsigned flags;
+	struct hlist_node hnode;
+	unsigned long flags;
	int lease_shared_gen;
	u32 lease_gen;
	u32 lease_seq;
@@ -289,10 +296,14 @@ struct ceph_dentry_info {
	u64 offset;
};

-#define CEPH_DENTRY_REFERENCED		1
-#define CEPH_DENTRY_LEASE_LIST		2
-#define CEPH_DENTRY_SHRINK_LIST		4
-#define CEPH_DENTRY_PRIMARY_LINK	8
+#define CEPH_DENTRY_REFERENCED		(1 << 0)
+#define CEPH_DENTRY_LEASE_LIST		(1 << 1)
+#define CEPH_DENTRY_SHRINK_LIST		(1 << 2)
+#define CEPH_DENTRY_PRIMARY_LINK	(1 << 3)
+#define CEPH_DENTRY_ASYNC_UNLINK_BIT	(4)
+#define CEPH_DENTRY_ASYNC_UNLINK	(1 << CEPH_DENTRY_ASYNC_UNLINK_BIT)
+#define CEPH_DENTRY_ASYNC_CREATE_BIT	(5)
+#define CEPH_DENTRY_ASYNC_CREATE	(1 << CEPH_DENTRY_ASYNC_CREATE_BIT)

struct ceph_inode_xattrs_info {
	/*
@@ -316,6 +327,7 @@ struct ceph_inode_xattrs_info {
 * Ceph inode.
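 *
 * [Editor's note: with the netfs_inode embedded as the first member of
 * the struct below, the VFS inode lives at ci->netfs.inode and
 * ceph_inode() recovers the container via
 * container_of(inode, struct ceph_inode_info, netfs.inode).]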
*/ struct ceph_inode_info { + struct netfs_inode netfs; /* Netfslib context and vfs inode */ struct ceph_vino i_vino; /* ceph ino + snap */ spinlock_t i_ceph_lock; @@ -426,17 +438,12 @@ struct ceph_inode_info { struct work_struct i_work; unsigned long i_work_mask; - -#ifdef CONFIG_CEPH_FSCACHE - struct fscache_cookie *fscache; -#endif - struct inode vfs_inode; /* at end */ }; static inline struct ceph_inode_info * ceph_inode(const struct inode *inode) { - return container_of(inode, struct ceph_inode_info, vfs_inode); + return container_of(inode, struct ceph_inode_info, netfs.inode); } static inline struct ceph_fs_client * @@ -535,19 +542,23 @@ static inline int ceph_ino_compare(struct inode *inode, void *data) * * These come from src/mds/mdstypes.h in the ceph sources. */ -#define CEPH_MAX_MDS 0x100 -#define CEPH_NUM_STRAY 10 +#define CEPH_MAX_MDS 0x100 +#define CEPH_NUM_STRAY 10 #define CEPH_MDS_INO_MDSDIR_OFFSET (1 * CEPH_MAX_MDS) +#define CEPH_MDS_INO_LOG_OFFSET (2 * CEPH_MAX_MDS) #define CEPH_INO_SYSTEM_BASE ((6*CEPH_MAX_MDS) + (CEPH_MAX_MDS * CEPH_NUM_STRAY)) static inline bool ceph_vino_is_reserved(const struct ceph_vino vino) { - if (vino.ino < CEPH_INO_SYSTEM_BASE && - vino.ino >= CEPH_MDS_INO_MDSDIR_OFFSET) { - WARN_RATELIMIT(1, "Attempt to access reserved inode number 0x%llx", vino.ino); - return true; - } - return false; + if (vino.ino >= CEPH_INO_SYSTEM_BASE || + vino.ino < CEPH_MDS_INO_MDSDIR_OFFSET) + return false; + + /* Don't warn on mdsdirs */ + WARN_RATELIMIT(vino.ino >= CEPH_MDS_INO_LOG_OFFSET, + "Attempt to access reserved inode number 0x%llx", + vino.ino); + return true; } static inline struct inode *ceph_find_inode(struct super_block *sb, @@ -758,6 +769,8 @@ extern void ceph_unreserve_caps(struct ceph_mds_client *mdsc, extern void ceph_reservation_status(struct ceph_fs_client *client, int *total, int *avail, int *used, int *reserved, int *min); +extern void change_auth_cap_ses(struct ceph_inode_info *ci, + struct ceph_mds_session *session); @@ -878,6 +891,8 @@ struct ceph_snap_realm { struct list_head dirty_item; /* if realm needs new context */ + struct list_head rebuild_item; /* rebuild snap realms _downward_ in hierarchy */ + /* the current set of snaps for this realm */ struct ceph_snap_context *cached_context; @@ -933,7 +948,7 @@ extern void ceph_handle_snap(struct ceph_mds_client *mdsc, struct ceph_msg *msg); extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap); -extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); +extern void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc); extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc, u64 snap); @@ -1016,6 +1031,7 @@ static inline void ceph_queue_flush_snaps(struct inode *inode) ceph_queue_inode_work(inode, CEPH_I_WORK_FLUSH_SNAPS); } +extern int ceph_try_to_choose_auth_mds(struct inode *inode, int mask); extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page, int mask, bool force); static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) @@ -1043,6 +1059,7 @@ static inline bool ceph_inode_is_shutdown(struct inode *inode) /* xattr.c */ int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int); +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size); ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); extern struct ceph_buffer 
*__ceph_build_xattrs_blob(struct ceph_inode_info *ci); @@ -1207,12 +1224,21 @@ extern void __ceph_touch_fmode(struct ceph_inode_info *ci, /* addr.c */ extern const struct address_space_operations ceph_aops; +extern const struct netfs_request_ops ceph_netfs_ops; extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); -extern int ceph_uninline_data(struct file *filp, struct page *locked_page); +extern int ceph_uninline_data(struct file *file); extern int ceph_pool_perm_check(struct inode *inode, int need); extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate); +static inline bool ceph_has_inline_data(struct ceph_inode_info *ci) +{ + if (ci->i_inline_version == CEPH_INLINE_NONE || + ci->i_inline_version == 1) /* initial version, no data */ + return false; + return true; +} + /* file.c */ extern const struct file_operations ceph_file_fops; @@ -1270,9 +1296,29 @@ extern void ceph_fs_debugfs_init(struct ceph_fs_client *client); extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); /* quota.c */ -static inline bool __ceph_has_any_quota(struct ceph_inode_info *ci) + +enum quota_get_realm { + QUOTA_GET_MAX_FILES, + QUOTA_GET_MAX_BYTES, + QUOTA_GET_ANY +}; + +static inline bool __ceph_has_quota(struct ceph_inode_info *ci, + enum quota_get_realm which) { - return ci->i_max_files || ci->i_max_bytes; + bool has_quota = false; + + switch (which) { + case QUOTA_GET_MAX_BYTES: + has_quota = !!ci->i_max_bytes; + break; + case QUOTA_GET_MAX_FILES: + has_quota = !!ci->i_max_files; + break; + default: + has_quota = !!(ci->i_max_files || ci->i_max_bytes); + } + return has_quota; } extern void ceph_adjust_quota_realms_count(struct inode *inode, bool inc); @@ -1281,13 +1327,13 @@ static inline void __ceph_update_quota(struct ceph_inode_info *ci, u64 max_bytes, u64 max_files) { bool had_quota, has_quota; - had_quota = __ceph_has_any_quota(ci); + had_quota = __ceph_has_quota(ci, QUOTA_GET_ANY); ci->i_max_bytes = max_bytes; ci->i_max_files = max_files; - has_quota = __ceph_has_any_quota(ci); + has_quota = __ceph_has_quota(ci, QUOTA_GET_ANY); if (had_quota != has_quota) - ceph_adjust_quota_realms_count(&ci->vfs_inode, has_quota); + ceph_adjust_quota_realms_count(&ci->netfs.inode, has_quota); } extern void ceph_handle_quota(struct ceph_mds_client *mdsc, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index fcf7dfdecf96..f31350cda960 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -57,7 +57,7 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci) static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, size_t size) { - struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb); struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_string *pool_ns; s64 pool = ci->i_layout.pool_id; @@ -69,7 +69,7 @@ static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, pool_ns = ceph_try_get_string(ci->i_layout.pool_ns); - dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); + dout("ceph_vxattrcb_layout %p\n", &ci->netfs.inode); down_read(&osdc->lock); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); if (pool_name) { @@ -161,7 +161,7 @@ static ssize_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, char *val, size_t size) { ssize_t ret; - struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_fs_client *fsc = 
ceph_sb_to_client(ci->netfs.inode.i_sb); struct ceph_osd_client *osdc = &fsc->client->osdc; s64 pool = ci->i_layout.pool_id; const char *pool_name; @@ -313,7 +313,7 @@ static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val, static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci, char *val, size_t size) { - struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb); return ceph_fmt_xattr(val, size, "%pU", &fsc->client->fsid); } @@ -321,7 +321,7 @@ static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci, static ssize_t ceph_vxattrcb_client_id(struct ceph_inode_info *ci, char *val, size_t size) { - struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb); return ceph_fmt_xattr(val, size, "client%lld", ceph_client_gid(fsc->client)); @@ -366,6 +366,14 @@ static ssize_t ceph_vxattrcb_auth_mds(struct ceph_inode_info *ci, } #define XATTR_RSTAT_FIELD(_type, _name) \ XATTR_NAME_CEPH(_type, _name, VXATTR_FLAG_RSTAT) +#define XATTR_RSTAT_FIELD_UPDATABLE(_type, _name) \ + { \ + .name = CEPH_XATTR_NAME(_type, _name), \ + .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ + .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ + .exists_cb = NULL, \ + .flags = VXATTR_FLAG_RSTAT, \ + } #define XATTR_LAYOUT_FIELD(_type, _name, _field) \ { \ .name = CEPH_XATTR_NAME2(_type, _name, _field), \ @@ -404,7 +412,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { XATTR_RSTAT_FIELD(dir, rsubdirs), XATTR_RSTAT_FIELD(dir, rsnaps), XATTR_RSTAT_FIELD(dir, rbytes), - XATTR_RSTAT_FIELD(dir, rctime), + XATTR_RSTAT_FIELD_UPDATABLE(dir, rctime), { .name = "ceph.dir.pin", .name_size = sizeof("ceph.dir.pin"), @@ -621,7 +629,7 @@ static int __set_xattr(struct ceph_inode_info *ci, } dout("__set_xattr_val added %llx.%llx xattr %p %.*s=%.*s\n", - ceph_vinop(&ci->vfs_inode), xattr, name_len, name, val_len, val); + ceph_vinop(&ci->netfs.inode), xattr, name_len, name, val_len, val); return 0; } @@ -863,7 +871,7 @@ struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci) struct ceph_buffer *old_blob = NULL; void *dest; - dout("__build_xattrs_blob %p\n", &ci->vfs_inode); + dout("__build_xattrs_blob %p\n", &ci->netfs.inode); if (ci->i_xattrs.dirty) { int need = __get_required_blob_size(ci, 0, 0); @@ -923,10 +931,13 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_xattr *xattr; - struct ceph_vxattr *vxattr = NULL; + struct ceph_vxattr *vxattr; int req_mask; ssize_t err; + if (strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) + goto handle_non_vxattrs; + /* let's see if a virtual xattr was requested */ vxattr = ceph_match_vxattr(inode, name); if (vxattr) { @@ -945,8 +956,14 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, err = -ERANGE; } return err; + } else { + err = ceph_do_getvxattr(inode, name, value, size); + /* this would happen with a new client and old server combo */ + if (err == -EOPNOTSUPP) + err = -ENODATA; + return err; } - +handle_non_vxattrs: req_mask = __get_request_mask(inode); spin_lock(&ci->i_ceph_lock); @@ -1069,7 +1086,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, flags |= CEPH_XATTR_REMOVE; } - dout("setxattr value=%.*s\n", (int)size, value); + dout("setxattr value size: %zu\n", size); /* do request */ req = 
ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
@@ -1167,8 +1184,14 @@ int __ceph_setxattr(struct inode *inode, const char *name,
	spin_lock(&ci->i_ceph_lock);
retry:
	issued = __ceph_caps_issued(ci, NULL);
-	if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
+	required_blob_size = __get_required_blob_size(ci, name_len, val_len);
+	if ((ci->i_xattrs.version == 0) || !(issued & CEPH_CAP_XATTR_EXCL) ||
+	    (required_blob_size > mdsc->mdsmap->m_max_xattr_size)) {
+		dout("%s do sync setxattr: version: %llu size: %d max: %llu\n",
+		     __func__, ci->i_xattrs.version, required_blob_size,
+		     mdsc->mdsmap->m_max_xattr_size);
		goto do_sync;
+	}

	if (!lock_snap_rwsem && !ci->i_head_snapc) {
		lock_snap_rwsem = true;
@@ -1184,8 +1207,6 @@ retry:
	     ceph_cap_string(issued));
	__build_xattrs(inode);

-	required_blob_size = __get_required_blob_size(ci, name_len, val_len);
-
	if (!ci->i_xattrs.prealloc_blob ||
	    required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
		struct ceph_buffer *blob;
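Editor's note on the final hunk: __ceph_setxattr() now computes the
required blob size up front and falls back to a synchronous MDS request
whenever the xattr version is unset, exclusive xattr caps are missing,
or the blob would exceed the mdsmap-advertised maximum. A condensed
sketch of that decision, reusing only names visible in the hunk above
(a paraphrase for illustration, not the literal patch):

	/* Local update is safe only with a valid xattr version, exclusive
	 * xattr caps, and a blob that fits the MDS-advertised maximum. */
	bool can_update_locally = ci->i_xattrs.version != 0 &&
				  (issued & CEPH_CAP_XATTR_EXCL) &&
				  required_blob_size <= mdsc->mdsmap->m_max_xattr_size;
	if (!can_update_locally)
		goto do_sync;	/* send the setxattr synchronously to the MDS */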