Diffstat (limited to '')
-rw-r--r--  fs/ceph/Kconfig          3
-rw-r--r--  fs/ceph/Makefile         2
-rw-r--r--  fs/ceph/acl.c           13
-rw-r--r--  fs/ceph/addr.c        1208
-rw-r--r--  fs/ceph/cache.c        336
-rw-r--r--  fs/ceph/cache.h        134
-rw-r--r--  fs/ceph/caps.c        1828
-rw-r--r--  fs/ceph/debugfs.c      197
-rw-r--r--  fs/ceph/dir.c          357
-rw-r--r--  fs/ceph/export.c        47
-rw-r--r--  fs/ceph/file.c         993
-rw-r--r--  fs/ceph/inode.c        589
-rw-r--r--  fs/ceph/io.c             2
-rw-r--r--  fs/ceph/ioctl.c          2
-rw-r--r--  fs/ceph/locks.c         61
-rw-r--r--  fs/ceph/mds_client.c  1647
-rw-r--r--  fs/ceph/mds_client.h    96
-rw-r--r--  fs/ceph/mdsmap.c        75
-rw-r--r--  fs/ceph/metric.c       353
-rw-r--r--  fs/ceph/metric.h       244
-rw-r--r--  fs/ceph/quota.c         69
-rw-r--r--  fs/ceph/snap.c         413
-rw-r--r--  fs/ceph/strings.c        2
-rw-r--r--  fs/ceph/super.c        346
-rw-r--r--  fs/ceph/super.h        415
-rw-r--r--  fs/ceph/xattr.c        175
26 files changed, 6383 insertions, 3224 deletions
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index cf235f6eacf9..94df854147d3 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -6,6 +6,7 @@ config CEPH_FS
select LIBCRC32C
select CRYPTO_AES
select CRYPTO
+ select NETFS_SUPPORT
default n
help
Choose Y or M here to include support for mounting the
@@ -13,7 +14,7 @@ config CEPH_FS
scalable file system designed to provide high performance,
reliable access to petabytes of storage.
- More information at http://ceph.newdream.net/.
+ More information at https://ceph.io/.
If unsure, say N.
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 0a0823d378db..50c635dc7f71 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -8,7 +8,7 @@ obj-$(CONFIG_CEPH_FS) += ceph.o
ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
export.o caps.o snap.o xattr.o quota.o io.o \
mds_client.o mdsmap.o strings.o ceph_frag.o \
- debugfs.o util.o
+ debugfs.o util.o metric.o
ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 26be6520d3fb..f4fc8e0b847c 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -22,14 +22,14 @@ static inline void ceph_set_cached_acl(struct inode *inode,
struct ceph_inode_info *ci = ceph_inode(inode);
spin_lock(&ci->i_ceph_lock);
- if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
+ if (__ceph_caps_issued_mask_metric(ci, CEPH_CAP_XATTR_SHARED, 0))
set_cached_acl(inode, type, acl);
else
forget_cached_acl(inode, type);
spin_unlock(&ci->i_ceph_lock);
}
-struct posix_acl *ceph_get_acl(struct inode *inode, int type)
+struct posix_acl *ceph_get_acl(struct inode *inode, int type, bool rcu)
{
int size;
unsigned int retry_cnt = 0;
@@ -37,6 +37,9 @@ struct posix_acl *ceph_get_acl(struct inode *inode, int type)
char *value = NULL;
struct posix_acl *acl;
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
switch (type) {
case ACL_TYPE_ACCESS:
name = XATTR_NAME_POSIX_ACL_ACCESS;
@@ -82,7 +85,8 @@ retry:
return acl;
}
-int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+int ceph_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ struct posix_acl *acl, int type)
{
int ret = 0, size = 0;
const char *name = NULL;
@@ -100,7 +104,8 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
case ACL_TYPE_ACCESS:
name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
- ret = posix_acl_update_mode(inode, &new_mode, &acl);
+ ret = posix_acl_update_mode(&init_user_ns, inode,
+ &new_mode, &acl);
if (ret)
goto out;
}
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 7ab616601141..dcf701b05cc1 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -4,17 +4,20 @@
#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/swap.h>
#include <linux/pagemap.h>
-#include <linux/writeback.h> /* generic_writepages */
#include <linux/slab.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/signal.h>
#include <linux/iversion.h>
+#include <linux/ktime.h>
+#include <linux/netfs.h>
#include "super.h"
#include "mds_client.h"
#include "cache.h"
+#include "metric.h"
#include <linux/ceph/osd_client.h>
#include <linux/ceph/striper.h>
@@ -59,6 +62,9 @@
(CONGESTION_ON_THRESH(congestion_kb) - \
(CONGESTION_ON_THRESH(congestion_kb) >> 2))
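Note: the OFF threshold is deliberately lower than the ON threshold, OFF = ON - (ON >> 2), i.e. 75% of ON, which gives the congestion flag some hysteresis. Worked arithmetic with a hypothetical ON threshold of 1024 pages: OFF = 1024 - 256 = 768, so writeback must drain a quarter of the threshold before the flag clears.
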
+static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
+ struct folio **foliop, void **_fsdata);
+
static inline struct ceph_snap_context *page_snap_context(struct page *page)
{
if (PagePrivate(page))
@@ -70,22 +76,17 @@ static inline struct ceph_snap_context *page_snap_context(struct page *page)
* Dirty a page. Optimistically adjust accounting, on the assumption
* that we won't race with invalidate. If we do, readjust.
*/
-static int ceph_set_page_dirty(struct page *page)
+static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- struct address_space *mapping = page->mapping;
struct inode *inode;
struct ceph_inode_info *ci;
struct ceph_snap_context *snapc;
- int ret;
-
- if (unlikely(!mapping))
- return !TestSetPageDirty(page);
- if (PageDirty(page)) {
- dout("%p set_page_dirty %p idx %lu -- already dirty\n",
- mapping->host, page, page->index);
- BUG_ON(!PagePrivate(page));
- return 0;
+ if (folio_test_dirty(folio)) {
+ dout("%p dirty_folio %p idx %lu -- already dirty\n",
+ mapping->host, folio, folio->index);
+ VM_BUG_ON_FOLIO(!folio_test_private(folio), folio);
+ return false;
}
inode = mapping->host;
@@ -109,369 +110,338 @@ static int ceph_set_page_dirty(struct page *page)
if (ci->i_wrbuffer_ref == 0)
ihold(inode);
++ci->i_wrbuffer_ref;
- dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
+ dout("%p dirty_folio %p idx %lu head %d/%d -> %d/%d "
"snapc %p seq %lld (%d snaps)\n",
- mapping->host, page, page->index,
+ mapping->host, folio, folio->index,
ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
snapc, snapc->seq, snapc->num_snaps);
spin_unlock(&ci->i_ceph_lock);
/*
- * Reference snap context in page->private. Also set
- * PagePrivate so that we get invalidatepage callback.
+ * Reference snap context in folio->private. Also set
+ * PagePrivate so that we get invalidate_folio callback.
*/
- BUG_ON(PagePrivate(page));
- page->private = (unsigned long)snapc;
- SetPagePrivate(page);
+ VM_WARN_ON_FOLIO(folio->private, folio);
+ folio_attach_private(folio, snapc);
- ret = __set_page_dirty_nobuffers(page);
- WARN_ON(!PageLocked(page));
- WARN_ON(!page->mapping);
-
- return ret;
+ return ceph_fscache_dirty_folio(mapping, folio);
}
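
The folio_attach_private() call above hands the snap-context reference over to the folio; every path in this patch that tears dirty state back down pairs it with a detach. A summary of the pairing, collected from the hunks below:

	/* snapc reference lifetime across this patch:
	 *   ceph_dirty_folio():      folio_attach_private(folio, snapc)
	 *   ceph_invalidate_folio(): snapc = folio_detach_private(folio)
	 *   writepage_nounlock():    oldest = detach_page_private(page)
	 *   writepages_finish():     ceph_put_snap_context(detach_page_private(page))
	 */
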
/*
- * If we are truncating the full page (i.e. offset == 0), adjust the
- * dirty page counters appropriately. Only called if there is private
- * data on the page.
+ * If we are truncating the full folio (i.e. offset == 0), adjust the
+ * dirty folio counters appropriately. Only called if there is private
+ * data on the folio.
*/
-static void ceph_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void ceph_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
struct inode *inode;
struct ceph_inode_info *ci;
- struct ceph_snap_context *snapc = page_snap_context(page);
+ struct ceph_snap_context *snapc;
- inode = page->mapping->host;
+ inode = folio->mapping->host;
ci = ceph_inode(inode);
- if (offset != 0 || length != PAGE_SIZE) {
- dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n",
- inode, page, page->index, offset, length);
+ if (offset != 0 || length != folio_size(folio)) {
+ dout("%p invalidate_folio idx %lu partial dirty page %zu~%zu\n",
+ inode, folio->index, offset, length);
return;
}
- ceph_invalidate_fscache_page(inode, page);
+ WARN_ON(!folio_test_locked(folio));
+ if (folio_test_private(folio)) {
+ dout("%p invalidate_folio idx %lu full dirty page\n",
+ inode, folio->index);
- WARN_ON(!PageLocked(page));
- if (!PagePrivate(page))
- return;
+ snapc = folio_detach_private(folio);
+ ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+ ceph_put_snap_context(snapc);
+ }
+
+ folio_wait_fscache(folio);
+}
- ClearPageChecked(page);
+static bool ceph_release_folio(struct folio *folio, gfp_t gfp)
+{
+ struct inode *inode = folio->mapping->host;
- dout("%p invalidatepage %p idx %lu full dirty page\n",
- inode, page, page->index);
+ dout("%llx:%llx release_folio idx %lu (%sdirty)\n",
+ ceph_vinop(inode),
+ folio->index, folio_test_dirty(folio) ? "" : "not ");
- ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
- ceph_put_snap_context(snapc);
- page->private = 0;
- ClearPagePrivate(page);
+ if (folio_test_private(folio))
+ return false;
+
+ if (folio_test_fscache(folio)) {
+ if (current_is_kswapd() || !(gfp & __GFP_FS))
+ return false;
+ folio_wait_fscache(folio);
+ }
+ ceph_fscache_note_page_release(inode);
+ return true;
}
-static int ceph_releasepage(struct page *page, gfp_t g)
+static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
{
- dout("%p releasepage %p idx %lu (%sdirty)\n", page->mapping->host,
- page, page->index, PageDirty(page) ? "" : "not ");
+ struct inode *inode = rreq->inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_file_layout *lo = &ci->i_layout;
+ u32 blockoff;
+ u64 blockno;
- /* Can we release the page from the cache? */
- if (!ceph_release_fscache_page(page, g))
- return 0;
+ /* Expand the start downward */
+ blockno = div_u64_rem(rreq->start, lo->stripe_unit, &blockoff);
+ rreq->start = blockno * lo->stripe_unit;
+ rreq->len += blockoff;
- return !PagePrivate(page);
+ /* Now, round up the length to the next block */
+ rreq->len = roundup(rreq->len, lo->stripe_unit);
}
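
The expansion above snaps the request to RADOS stripe-unit boundaries; ceph_netfs_clamp_length() below then caps each subrequest at the object boundary and the rsize mount option. A worked example with hypothetical values (stripe_unit = 4 MiB, a 1 MiB request starting at 5 MiB):

	/* div_u64_rem(5M, 4M) -> blockno = 1, blockoff = 1M
	 * start = 1 * 4M   = 4M        (expanded downward by 1M)
	 * len   = 1M + 1M  = 2M, then roundup(2M, 4M) = 4M
	 * => the expanded read covers the whole 4M..8M stripe unit
	 */
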
-/*
- * read a single page, without unlocking it.
- */
-static int ceph_do_readpage(struct file *filp, struct page *page)
+static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq)
{
- struct inode *inode = file_inode(filp);
- struct ceph_inode_info *ci = ceph_inode(inode);
+ struct inode *inode = subreq->rreq->inode;
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- int err = 0;
- u64 off = page_offset(page);
- u64 len = PAGE_SIZE;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ u64 objno, objoff;
+ u32 xlen;
+
+ /* Truncate the extent at the end of the current block */
+ ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len,
+ &objno, &objoff, &xlen);
+ subreq->len = min(xlen, fsc->mount_options->rsize);
+ return true;
+}
- if (off >= i_size_read(inode)) {
- zero_user_segment(page, 0, PAGE_SIZE);
- SetPageUptodate(page);
- return 0;
- }
+static void finish_netfs_read(struct ceph_osd_request *req)
+{
+ struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode);
+ struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
+ struct netfs_io_subrequest *subreq = req->r_priv;
+ int num_pages;
+ int err = req->r_result;
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- /*
- * Uptodate inline data should have been added
- * into page cache while getting Fcr caps.
- */
- if (off == 0)
- return -EINVAL;
- zero_user_segment(page, 0, PAGE_SIZE);
- SetPageUptodate(page);
- return 0;
- }
+ ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency,
+ req->r_end_latency, osd_data->length, err);
- err = ceph_readpage_from_fscache(inode, page);
- if (err == 0)
- return -EINPROGRESS;
-
- dout("readpage inode %p file %p page %p index %lu\n",
- inode, filp, page, page->index);
- err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
- &ci->i_layout, off, &len,
- ci->i_truncate_seq, ci->i_truncate_size,
- &page, 1, 0);
+ dout("%s: result %d subreq->len=%zu i_size=%lld\n", __func__, req->r_result,
+ subreq->len, i_size_read(req->r_inode));
+
+ /* no object means success but no data */
if (err == -ENOENT)
err = 0;
- if (err < 0) {
- SetPageError(page);
- ceph_fscache_readpage_cancel(inode, page);
- if (err == -EBLACKLISTED)
- fsc->blacklisted = true;
- goto out;
- }
- if (err < PAGE_SIZE)
- /* zero fill remainder of page */
- zero_user_segment(page, err, PAGE_SIZE);
- else
- flush_dcache_page(page);
+ else if (err == -EBLOCKLISTED)
+ fsc->blocklisted = true;
- SetPageUptodate(page);
- ceph_readpage_to_fscache(inode, page);
+ if (err >= 0 && err < subreq->len)
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
-out:
- return err < 0 ? err : 0;
-}
+ netfs_subreq_terminated(subreq, err, false);
-static int ceph_readpage(struct file *filp, struct page *page)
-{
- int r = ceph_do_readpage(filp, page);
- if (r != -EINPROGRESS)
- unlock_page(page);
- else
- r = 0;
- return r;
+ num_pages = calc_pages_for(osd_data->alignment, osd_data->length);
+ ceph_put_page_vector(osd_data->pages, num_pages, false);
+ iput(req->r_inode);
}
-/*
- * Finish an async read(ahead) op.
- */
-static void finish_read(struct ceph_osd_request *req)
+static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
{
- struct inode *inode = req->r_inode;
- struct ceph_osd_data *osd_data;
- int rc = req->r_result <= 0 ? req->r_result : 0;
- int bytes = req->r_result >= 0 ? req->r_result : 0;
- int num_pages;
- int i;
+ struct netfs_io_request *rreq = subreq->rreq;
+ struct inode *inode = rreq->inode;
+ struct ceph_mds_reply_info_parsed *rinfo;
+ struct ceph_mds_reply_info_in *iinfo;
+ struct ceph_mds_request *req;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct iov_iter iter;
+ ssize_t err = 0;
+ size_t len;
+ int mode;
- dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
- if (rc == -EBLACKLISTED)
- ceph_inode_to_client(inode)->blacklisted = true;
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
- /* unlock all pages, zeroing any data we didn't read */
- osd_data = osd_req_op_extent_osd_data(req, 0);
- BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
- num_pages = calc_pages_for((u64)osd_data->alignment,
- (u64)osd_data->length);
- for (i = 0; i < num_pages; i++) {
- struct page *page = osd_data->pages[i];
-
- if (rc < 0 && rc != -ENOENT) {
- ceph_fscache_readpage_cancel(inode, page);
- goto unlock;
- }
- if (bytes < (int)PAGE_SIZE) {
- /* zero (remainder of) page */
- int s = bytes < 0 ? 0 : bytes;
- zero_user_segment(page, s, PAGE_SIZE);
- }
- dout("finish_read %p uptodate %p idx %lu\n", inode, page,
- page->index);
- flush_dcache_page(page);
- SetPageUptodate(page);
- ceph_readpage_to_fscache(inode, page);
-unlock:
- unlock_page(page);
- put_page(page);
- bytes -= PAGE_SIZE;
+ if (subreq->start >= inode->i_size)
+ goto out;
+
+ /* We need to fetch the inline data. */
+ mode = ceph_try_to_choose_auth_mds(inode, CEPH_STAT_CAP_INLINE_DATA);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+ req->r_ino1 = ci->i_vino;
+ req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA);
+ req->r_num_caps = 2;
+
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err < 0)
+ goto out;
+
+ rinfo = &req->r_reply_info;
+ iinfo = &rinfo->targeti;
+ if (iinfo->inline_version == CEPH_INLINE_NONE) {
+ /* The data got uninlined */
+ ceph_mdsc_put_request(req);
+ return false;
}
- kfree(osd_data->pages);
+
+ len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len);
+ iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
+ err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter);
+ if (err == 0)
+ err = -EFAULT;
+
+ ceph_mdsc_put_request(req);
+out:
+ netfs_subreq_terminated(subreq, err, false);
+ return true;
}
-/*
- * start an async read(ahead) operation. return nr_pages we submitted
- * a read for on success, or negative error code.
- */
-static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx,
- struct list_head *page_list, int max)
+static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
{
- struct ceph_osd_client *osdc =
- &ceph_inode_to_client(inode)->client->osdc;
+ struct netfs_io_request *rreq = subreq->rreq;
+ struct inode *inode = rreq->inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct page *page = lru_to_page(page_list);
- struct ceph_vino vino;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_osd_request *req;
- u64 off;
- u64 len;
- int i;
+ struct ceph_vino vino = ceph_vino(inode);
+ struct iov_iter iter;
struct page **pages;
- pgoff_t next_index;
- int nr_pages = 0;
- int got = 0;
- int ret = 0;
-
- if (!rw_ctx) {
- /* caller of readpages does not hold buffer and read caps
- * (fadvise, madvise and readahead cases) */
- int want = CEPH_CAP_FILE_CACHE;
- ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want,
- true, &got);
- if (ret < 0) {
- dout("start_read %p, error getting cap\n", inode);
- } else if (!(got & want)) {
- dout("start_read %p, no cache cap\n", inode);
- ret = 0;
- }
- if (ret <= 0) {
- if (got)
- ceph_put_cap_refs(ci, got);
- while (!list_empty(page_list)) {
- page = lru_to_page(page_list);
- list_del(&page->lru);
- put_page(page);
- }
- return ret;
- }
- }
+ size_t page_off;
+ int err = 0;
+ u64 len = subreq->len;
- off = (u64) page_offset(page);
+ if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq))
+ return;
- /* count pages */
- next_index = page->index;
- list_for_each_entry_reverse(page, page_list, lru) {
- if (page->index != next_index)
- break;
- nr_pages++;
- next_index++;
- if (max && nr_pages == max)
- break;
- }
- len = nr_pages << PAGE_SHIFT;
- dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
- off, len);
- vino = ceph_vino(inode);
- req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len,
- 0, 1, CEPH_OSD_OP_READ,
- CEPH_OSD_FLAG_READ, NULL,
- ci->i_truncate_seq, ci->i_truncate_size,
- false);
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len,
+ 0, 1, CEPH_OSD_OP_READ,
+ CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
+ NULL, ci->i_truncate_seq, ci->i_truncate_size, false);
if (IS_ERR(req)) {
- ret = PTR_ERR(req);
+ err = PTR_ERR(req);
+ req = NULL;
goto out;
}
- /* build page vector */
- nr_pages = calc_pages_for(0, len);
- pages = kmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
- if (!pages) {
- ret = -ENOMEM;
- goto out_put;
- }
- for (i = 0; i < nr_pages; ++i) {
- page = list_entry(page_list->prev, struct page, lru);
- BUG_ON(PageLocked(page));
- list_del(&page->lru);
-
- dout("start_read %p adding %p idx %lu\n", inode, page,
- page->index);
- if (add_to_page_cache_lru(page, &inode->i_data, page->index,
- GFP_KERNEL)) {
- ceph_fscache_uncache_page(inode, page);
- put_page(page);
- dout("start_read %p add_to_page_cache failed %p\n",
- inode, page);
- nr_pages = i;
- if (nr_pages > 0) {
- len = nr_pages << PAGE_SHIFT;
- osd_req_op_extent_update(req, 0, len);
- break;
- }
- goto out_pages;
- }
- pages[i] = page;
+ dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len);
+ iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
+ err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off);
+ if (err < 0) {
+ dout("%s: iov_ter_get_pages_alloc returned %d\n", __func__, err);
+ goto out;
}
+
+ /* should always give us a page-aligned read */
+ WARN_ON_ONCE(page_off);
+ len = err;
+ err = 0;
+
osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
- req->r_callback = finish_read;
+ req->r_callback = finish_netfs_read;
+ req->r_priv = subreq;
req->r_inode = inode;
+ ihold(inode);
- dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
- ret = ceph_osdc_start_request(osdc, req, false);
- if (ret < 0)
- goto out_pages;
+ ceph_osdc_start_request(req->r_osdc, req);
+out:
ceph_osdc_put_request(req);
+ if (err)
+ netfs_subreq_terminated(subreq, err, false);
+ dout("%s: result %d\n", __func__, err);
+}
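
Both the inline and the OSD read paths end in the same netfs contract: exactly one netfs_subreq_terminated() call per subrequest, carrying either the byte count or a negative errno, with NETFS_SREQ_CLEAR_TAIL set first when the read came up short so netfs zeroes the rest of the window. A minimal sketch of that pattern for a hypothetical synchronous backend (my_backend_read() is invented for illustration):

	static void example_issue_read(struct netfs_io_subrequest *subreq)
	{
		/* hypothetical blocking read into the subrequest window */
		ssize_t ret = my_backend_read(subreq->rreq->inode,
					      subreq->start, subreq->len);

		if (ret >= 0 && ret < subreq->len)
			__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
		netfs_subreq_terminated(subreq, ret, false);
	}
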
- /* After adding locked pages to page cache, the inode holds cache cap.
- * So we can drop our cap refs. */
- if (got)
- ceph_put_cap_refs(ci, got);
+static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)
+{
+ struct inode *inode = rreq->inode;
+ int got = 0, want = CEPH_CAP_FILE_CACHE;
+ int ret = 0;
- return nr_pages;
+ if (rreq->origin != NETFS_READAHEAD)
+ return 0;
-out_pages:
- for (i = 0; i < nr_pages; ++i) {
- ceph_fscache_readpage_cancel(inode, pages[i]);
- unlock_page(pages[i]);
+ if (file) {
+ struct ceph_rw_context *rw_ctx;
+ struct ceph_file_info *fi = file->private_data;
+
+ rw_ctx = ceph_find_rw_context(fi);
+ if (rw_ctx)
+ return 0;
}
- ceph_put_page_vector(pages, nr_pages, false);
-out_put:
- ceph_osdc_put_request(req);
-out:
+
+ /*
+ * readahead callers do not necessarily hold Fcb caps
+ * (e.g. fadvise, madvise).
+ */
+ ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got);
+ if (ret < 0) {
+ dout("start_read %p, error getting cap\n", inode);
+ return ret;
+ }
+
+ if (!(got & want)) {
+ dout("start_read %p, no cache cap\n", inode);
+ return -EACCES;
+ }
+ if (ret == 0)
+ return -EACCES;
+
+ rreq->netfs_priv = (void *)(uintptr_t)got;
+ return 0;
+}
+
+static void ceph_netfs_free_request(struct netfs_io_request *rreq)
+{
+ struct ceph_inode_info *ci = ceph_inode(rreq->inode);
+ int got = (uintptr_t)rreq->netfs_priv;
+
if (got)
ceph_put_cap_refs(ci, got);
- return ret;
}
+const struct netfs_request_ops ceph_netfs_ops = {
+ .init_request = ceph_init_request,
+ .free_request = ceph_netfs_free_request,
+ .begin_cache_operation = ceph_begin_cache_operation,
+ .issue_read = ceph_netfs_issue_read,
+ .expand_readahead = ceph_netfs_expand_readahead,
+ .clamp_length = ceph_netfs_clamp_length,
+ .check_write_begin = ceph_netfs_check_write_begin,
+};
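
This ops table only takes effect once an inode's netfs context points at it; that hookup lives outside this hunk, in the inode allocation path in inode.c. Roughly (a sketch assuming the 5.19-era two-argument netfs_inode_init()):

	netfs_inode_init(&ci->netfs, &ceph_netfs_ops);
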
-/*
- * Read multiple pages. Leave pages we don't read + unlock in page_list;
- * the caller (VM) cleans them up.
- */
-static int ceph_readpages(struct file *file, struct address_space *mapping,
- struct list_head *page_list, unsigned nr_pages)
+#ifdef CONFIG_CEPH_FSCACHE
+static void ceph_set_page_fscache(struct page *page)
{
- struct inode *inode = file_inode(file);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_file_info *fi = file->private_data;
- struct ceph_rw_context *rw_ctx;
- int rc = 0;
- int max = 0;
+ set_page_fscache(page);
+}
- if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE)
- return -EINVAL;
+static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async)
+{
+ struct inode *inode = priv;
- rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list,
- &nr_pages);
+ if (IS_ERR_VALUE(error) && error != -ENOBUFS)
+ ceph_fscache_invalidate(inode, false);
+}
- if (rc == 0)
- goto out;
+static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct fscache_cookie *cookie = ceph_fscache_cookie(ci);
- rw_ctx = ceph_find_rw_context(fi);
- max = fsc->mount_options->rsize >> PAGE_SHIFT;
- dout("readpages %p file %p ctx %p nr_pages %d max %d\n",
- inode, file, rw_ctx, nr_pages, max);
- while (!list_empty(page_list)) {
- rc = start_read(inode, rw_ctx, page_list, max);
- if (rc < 0)
- goto out;
- }
-out:
- ceph_fscache_readpages_cancel(inode, page_list);
+ fscache_write_to_cache(cookie, inode->i_mapping, off, len, i_size_read(inode),
+ ceph_fscache_write_terminated, inode, caching);
+}
+#else
+static inline void ceph_set_page_fscache(struct page *page)
+{
+}
- dout("readpages %p file %p ret %d\n", inode, file, rc);
- return rc;
+static inline void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching)
+{
}
+#endif /* CONFIG_CEPH_FSCACHE */
struct ceph_writeback_ctl
{
@@ -565,8 +535,8 @@ static u64 get_writepages_data_length(struct inode *inode,
spin_unlock(&ci->i_ceph_lock);
WARN_ON(!found);
}
- if (end > page_offset(page) + PAGE_SIZE)
- end = page_offset(page) + PAGE_SIZE;
+ if (end > page_offset(page) + thp_size(page))
+ end = page_offset(page) + thp_size(page);
return end > start ? end - start : 0;
}
@@ -578,20 +548,21 @@ static u64 get_writepages_data_length(struct inode *inode,
*/
static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
{
- struct inode *inode;
- struct ceph_inode_info *ci;
- struct ceph_fs_client *fsc;
+ struct folio *folio = page_folio(page);
+ struct inode *inode = page->mapping->host;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_snap_context *snapc, *oldest;
loff_t page_off = page_offset(page);
- int err, len = PAGE_SIZE;
+ int err;
+ loff_t len = thp_size(page);
struct ceph_writeback_ctl ceph_wbc;
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
+ struct ceph_osd_request *req;
+ bool caching = ceph_is_cache_enabled(inode);
dout("writepage %p idx %lu\n", page, page->index);
- inode = page->mapping->host;
- ci = ceph_inode(inode);
- fsc = ceph_inode_to_client(inode);
-
/* verify this is a writeable snap context */
snapc = page_snap_context(page);
if (!snapc) {
@@ -612,27 +583,52 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
/* is this a partial page at end of file? */
if (page_off >= ceph_wbc.i_size) {
- dout("%p page eof %llu\n", page, ceph_wbc.i_size);
- page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
+ dout("folio at %lu beyond eof %llu\n", folio->index,
+ ceph_wbc.i_size);
+ folio_invalidate(folio, 0, folio_size(folio));
return 0;
}
if (ceph_wbc.i_size < page_off + len)
len = ceph_wbc.i_size - page_off;
- dout("writepage %p page %p index %lu on %llu~%u snapc %p seq %lld\n",
+ dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n",
inode, page, page->index, page_off, len, snapc, snapc->seq);
if (atomic_long_inc_return(&fsc->writeback_count) >
CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
- set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
+ fsc->write_congested = true;
+
+ req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1,
+ CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc,
+ ceph_wbc.truncate_seq, ceph_wbc.truncate_size,
+ true);
+ if (IS_ERR(req)) {
+ redirty_page_for_writepage(wbc, page);
+ return PTR_ERR(req);
+ }
set_page_writeback(page);
- err = ceph_osdc_writepages(&fsc->client->osdc, ceph_vino(inode),
- &ci->i_layout, snapc, page_off, len,
- ceph_wbc.truncate_seq,
- ceph_wbc.truncate_size,
- &inode->i_mtime, &page, 1);
+ if (caching)
+ ceph_set_page_fscache(page);
+ ceph_fscache_write_to_cache(inode, page_off, len, caching);
+
+ /* it may be a short write due to an object boundary */
+ WARN_ON_ONCE(len > thp_size(page));
+ osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
+ dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len);
+
+ req->r_mtime = inode->i_mtime;
+ ceph_osdc_start_request(osdc, req);
+ err = ceph_osdc_wait_request(osdc, req);
+
+ ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
+ req->r_end_latency, len, err);
+
+ ceph_osdc_put_request(req);
+ if (err == 0)
+ err = len;
+
if (err < 0) {
struct writeback_control tmp_wbc;
if (!wbc)
@@ -644,8 +640,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
end_page_writeback(page);
return err;
}
- if (err == -EBLACKLISTED)
- fsc->blacklisted = true;
+ if (err == -EBLOCKLISTED)
+ fsc->blocklisted = true;
dout("writepage setting page/mapping error %d %p\n",
err, page);
mapping_set_error(&inode->i_data, err);
@@ -654,15 +650,15 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
dout("writepage cleaned page %p\n", page);
err = 0; /* vfs expects us to return 0 */
}
- page->private = 0;
- ClearPagePrivate(page);
+ oldest = detach_page_private(page);
+ WARN_ON_ONCE(oldest != snapc);
end_page_writeback(page);
ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
ceph_put_snap_context(snapc); /* page's reference */
if (atomic_long_dec_return(&fsc->writeback_count) <
CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
- clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
+ fsc->write_congested = false;
return err;
}
@@ -673,6 +669,13 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
struct inode *inode = page->mapping->host;
BUG_ON(!inode);
ihold(inode);
+
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ ceph_inode_to_client(inode)->write_congested)
+ return AOP_WRITEPAGE_ACTIVATE;
+
+ wait_on_page_fscache(page);
+
err = writepage_nounlock(page, wbc);
if (err == -ERESTARTSYS) {
/* direct memory reclaimer was killed by SIGKILL. return 0
@@ -702,14 +705,15 @@ static void writepages_finish(struct ceph_osd_request *req)
struct ceph_snap_context *snapc = req->r_snapc;
struct address_space *mapping = inode->i_mapping;
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ unsigned int len = 0;
bool remove_page;
dout("writepages_finish %p rc %d\n", inode, rc);
if (rc < 0) {
mapping_set_error(mapping, rc);
ceph_set_error_write(ci);
- if (rc == -EBLACKLISTED)
- fsc->blacklisted = true;
+ if (rc == -EBLOCKLISTED)
+ fsc->blocklisted = true;
} else {
ceph_clear_error_write(ci);
}
@@ -725,11 +729,15 @@ static void writepages_finish(struct ceph_osd_request *req)
/* clean all pages */
for (i = 0; i < req->r_num_ops; i++) {
- if (req->r_ops[i].op != CEPH_OSD_OP_WRITE)
+ if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) {
+ pr_warn("%s incorrect op %d req %p index %d tid %llu\n",
+ __func__, req->r_ops[i].op, req, i, req->r_tid);
break;
+ }
osd_data = osd_req_op_extent_osd_data(req, i);
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+ len += osd_data->length;
num_pages = calc_pages_for((u64)osd_data->alignment,
(u64)osd_data->length);
total_pages += num_pages;
@@ -741,14 +749,11 @@ static void writepages_finish(struct ceph_osd_request *req)
if (atomic_long_dec_return(&fsc->writeback_count) <
CONGESTION_OFF_THRESH(
fsc->mount_options->congestion_kb))
- clear_bdi_congested(inode_to_bdi(inode),
- BLK_RW_ASYNC);
+ fsc->write_congested = false;
- ceph_put_snap_context(page_snap_context(page));
- page->private = 0;
- ClearPagePrivate(page);
- dout("unlocking %p\n", page);
+ ceph_put_snap_context(detach_page_private(page));
end_page_writeback(page);
+ dout("unlocking %p\n", page);
if (remove_page)
generic_error_remove_page(inode->i_mapping,
@@ -762,12 +767,14 @@ static void writepages_finish(struct ceph_osd_request *req)
release_pages(osd_data->pages, num_pages);
}
+ ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
+ req->r_end_latency, len, rc);
+
ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);
osd_data = osd_req_op_extent_osd_data(req, 0);
if (osd_data->pages_from_pool)
- mempool_free(osd_data->pages,
- ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
+ mempool_free(osd_data->pages, ceph_wb_pagevec_pool);
else
kfree(osd_data->pages);
ceph_osdc_put_request(req);
@@ -792,12 +799,17 @@ static int ceph_writepages_start(struct address_space *mapping,
struct ceph_writeback_ctl ceph_wbc;
bool should_loop, range_whole = false;
bool done = false;
+ bool caching = ceph_is_cache_enabled(inode);
+
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ fsc->write_congested)
+ return 0;
dout("writepages_start %p (mode=%s)\n", inode,
wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
- if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+ if (ceph_inode_is_shutdown(inode)) {
if (ci->i_wrbuffer_ref > 0) {
pr_warn_ratelimited(
"writepage_start %p %lld forced umount\n",
@@ -859,17 +871,16 @@ retry:
int num_ops = 0, op_idx;
unsigned i, pvec_pages, max_pages, locked_pages = 0;
struct page **pages = NULL, **data_pages;
- mempool_t *pool = NULL; /* Becomes non-null if mempool used */
struct page *page;
pgoff_t strip_unit_end = 0;
u64 offset = 0, len = 0;
+ bool from_pool = false;
max_pages = wsize >> PAGE_SHIFT;
get_more_pages:
- pvec_pages = pagevec_lookup_range_nr_tag(&pvec, mapping, &index,
- end, PAGECACHE_TAG_DIRTY,
- max_pages - locked_pages);
+ pvec_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
+ end, PAGECACHE_TAG_DIRTY);
dout("pagevec_lookup_range_tag got %d\n", pvec_pages);
if (!pvec_pages && !locked_pages)
break;
@@ -901,14 +912,16 @@ get_more_pages:
continue;
}
if (page_offset(page) >= ceph_wbc.i_size) {
- dout("%p page eof %llu\n",
- page, ceph_wbc.i_size);
+ struct folio *folio = page_folio(page);
+
+ dout("folio at %lu beyond eof %llu\n",
+ folio->index, ceph_wbc.i_size);
if ((ceph_wbc.size_stable ||
- page_offset(page) >= i_size_read(inode)) &&
- clear_page_dirty_for_io(page))
- mapping->a_ops->invalidatepage(page,
- 0, PAGE_SIZE);
- unlock_page(page);
+ folio_pos(folio) >= i_size_read(inode)) &&
+ folio_clear_dirty_for_io(folio))
+ folio_invalidate(folio, 0,
+ folio_size(folio));
+ folio_unlock(folio);
continue;
}
if (strip_unit_end && (page->index > strip_unit_end)) {
@@ -916,7 +929,7 @@ get_more_pages:
unlock_page(page);
break;
}
- if (PageWriteback(page)) {
+ if (PageWriteback(page) || PageFsCache(page)) {
if (wbc->sync_mode == WB_SYNC_NONE) {
dout("%p under writeback\n", page);
unlock_page(page);
@@ -924,6 +937,7 @@ get_more_pages:
}
dout("waiting on writeback %p\n", page);
wait_on_page_writeback(page);
+ wait_on_page_fscache(page);
}
if (!clear_page_dirty_for_io(page)) {
@@ -961,16 +975,16 @@ get_more_pages:
sizeof(*pages),
GFP_NOFS);
if (!pages) {
- pool = fsc->wb_pagevec_pool;
- pages = mempool_alloc(pool, GFP_NOFS);
+ from_pool = true;
+ pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);
BUG_ON(!pages);
}
len = 0;
} else if (page->index !=
(offset + len) >> PAGE_SHIFT) {
- if (num_ops >= (pool ? CEPH_OSD_SLAB_OPS :
- CEPH_OSD_MAX_OPS)) {
+ if (num_ops >= (from_pool ? CEPH_OSD_SLAB_OPS :
+ CEPH_OSD_MAX_OPS)) {
redirty_page_for_writepage(wbc, page);
unlock_page(page);
break;
@@ -987,16 +1001,13 @@ get_more_pages:
if (atomic_long_inc_return(&fsc->writeback_count) >
CONGESTION_ON_THRESH(
- fsc->mount_options->congestion_kb)) {
- set_bdi_congested(inode_to_bdi(inode),
- BLK_RW_ASYNC);
- }
-
+ fsc->mount_options->congestion_kb))
+ fsc->write_congested = true;
pages[locked_pages++] = page;
pvec.pages[i] = NULL;
- len += PAGE_SIZE;
+ len += thp_size(page);
}
/* did we get anything? */
@@ -1045,7 +1056,7 @@ new_request:
BUG_ON(IS_ERR(req));
}
BUG_ON(len < page_offset(pages[locked_pages - 1]) +
- PAGE_SIZE - offset);
+ thp_size(page) - offset);
req->r_callback = writepages_finish;
req->r_inode = inode;
@@ -1056,27 +1067,40 @@ new_request:
op_idx = 0;
for (i = 0; i < locked_pages; i++) {
u64 cur_offset = page_offset(pages[i]);
+ /*
+ * Discontinuity in page range? Ceph can handle that by just passing
+ * multiple extents in the write op.
+ */
if (offset + len != cur_offset) {
+ /* If it's full, stop here */
if (op_idx + 1 == req->r_num_ops)
break;
+
+ /* Kick off an fscache write with what we have so far. */
+ ceph_fscache_write_to_cache(inode, offset, len, caching);
+
+ /* Start a new extent */
osd_req_op_extent_dup_last(req, op_idx,
cur_offset - offset);
dout("writepages got pages at %llu~%llu\n",
offset, len);
osd_req_op_extent_osd_data_pages(req, op_idx,
data_pages, len, 0,
- !!pool, false);
+ from_pool, false);
osd_req_op_extent_update(req, op_idx, len);
len = 0;
- offset = cur_offset;
+ offset = cur_offset;
data_pages = pages + i;
op_idx++;
}
set_page_writeback(pages[i]);
- len += PAGE_SIZE;
+ if (caching)
+ ceph_set_page_fscache(pages[i]);
+ len += thp_size(page);
}
+ ceph_fscache_write_to_cache(inode, offset, len, caching);
if (ceph_wbc.size_stable) {
len = min(len, ceph_wbc.i_size - offset);
@@ -1084,7 +1108,7 @@ new_request:
/* writepages_finish() clears writeback pages
* according to the data length, so make sure
* data length covers all locked pages */
- u64 min_len = len + 1 - PAGE_SIZE;
+ u64 min_len = len + 1 - thp_size(page);
len = get_writepages_data_length(inode, pages[i - 1],
offset);
len = max(len, min_len);
@@ -1092,12 +1116,12 @@ new_request:
dout("writepages got pages at %llu~%llu\n", offset, len);
osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len,
- 0, !!pool, false);
+ 0, from_pool, false);
osd_req_op_extent_update(req, op_idx, len);
BUG_ON(op_idx + 1 != req->r_num_ops);
- pool = NULL;
+ from_pool = false;
if (i < locked_pages) {
BUG_ON(num_ops <= req->r_num_ops);
num_ops -= req->r_num_ops;
@@ -1108,8 +1132,8 @@ new_request:
pages = kmalloc_array(locked_pages, sizeof(*pages),
GFP_NOFS);
if (!pages) {
- pool = fsc->wb_pagevec_pool;
- pages = mempool_alloc(pool, GFP_NOFS);
+ from_pool = true;
+ pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);
BUG_ON(!pages);
}
memcpy(pages, data_pages + i,
@@ -1124,8 +1148,7 @@ new_request:
}
req->r_mtime = inode->i_mtime;
- rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
- BUG_ON(rc);
+ ceph_osdc_start_request(&fsc->client->osdc, req);
req = NULL;
wbc->nr_to_write -= i;
@@ -1204,110 +1227,86 @@ static int context_is_writeable_or_written(struct inode *inode,
return ret;
}
-/*
- * We are only allowed to write into/dirty the page if the page is
- * clean, or already dirty within the same snap context.
+/**
+ * ceph_find_incompatible - find an incompatible context and return it
+ * @page: page being dirtied
*
- * called with page locked.
- * return success with page locked,
- * or any failure (incl -EAGAIN) with page unlocked.
+ * We are only allowed to write into/dirty a page if the page is
+ * clean, or already dirty within the same snap context. Returns a
+ * conflicting context if there is one, NULL if there isn't, or a
+ * negative error code on other errors.
+ *
+ * Must be called with page lock held.
*/
-static int ceph_update_writeable_page(struct file *file,
- loff_t pos, unsigned len,
- struct page *page)
+static struct ceph_snap_context *
+ceph_find_incompatible(struct page *page)
{
- struct inode *inode = file_inode(file);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct inode *inode = page->mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
- loff_t page_off = pos & PAGE_MASK;
- int pos_in_page = pos & ~PAGE_MASK;
- int end_in_page = pos_in_page + len;
- loff_t i_size;
- int r;
- struct ceph_snap_context *snapc, *oldest;
- if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
- dout(" page %p forced umount\n", page);
- unlock_page(page);
- return -EIO;
+ if (ceph_inode_is_shutdown(inode)) {
+ dout(" page %p %llx:%llx is shutdown\n", page,
+ ceph_vinop(inode));
+ return ERR_PTR(-ESTALE);
}
-retry_locked:
- /* writepages currently holds page lock, but if we change that later, */
- wait_on_page_writeback(page);
+ for (;;) {
+ struct ceph_snap_context *snapc, *oldest;
+
+ wait_on_page_writeback(page);
+
+ snapc = page_snap_context(page);
+ if (!snapc || snapc == ci->i_head_snapc)
+ break;
- snapc = page_snap_context(page);
- if (snapc && snapc != ci->i_head_snapc) {
/*
* this page is already dirty in another (older) snap
* context! is it writeable now?
*/
oldest = get_oldest_context(inode, NULL, NULL);
if (snapc->seq > oldest->seq) {
+ /* not writeable -- return it for the caller to deal with */
ceph_put_snap_context(oldest);
- dout(" page %p snapc %p not current or oldest\n",
- page, snapc);
- /*
- * queue for writeback, and wait for snapc to
- * be writeable or written
- */
- snapc = ceph_get_snap_context(snapc);
- unlock_page(page);
- ceph_queue_writeback(inode);
- r = wait_event_killable(ci->i_cap_wq,
- context_is_writeable_or_written(inode, snapc));
- ceph_put_snap_context(snapc);
- if (r == -ERESTARTSYS)
- return r;
- return -EAGAIN;
+ dout(" page %p snapc %p not current or oldest\n", page, snapc);
+ return ceph_get_snap_context(snapc);
}
ceph_put_snap_context(oldest);
/* yay, writeable, do it now (without dropping page lock) */
- dout(" page %p snapc %p not current, but oldest\n",
- page, snapc);
- if (!clear_page_dirty_for_io(page))
- goto retry_locked;
- r = writepage_nounlock(page, NULL);
- if (r < 0)
- goto fail_unlock;
- goto retry_locked;
- }
-
- if (PageUptodate(page)) {
- dout(" page %p already uptodate\n", page);
- return 0;
+ dout(" page %p snapc %p not current, but oldest\n", page, snapc);
+ if (clear_page_dirty_for_io(page)) {
+ int r = writepage_nounlock(page, NULL);
+ if (r < 0)
+ return ERR_PTR(r);
+ }
}
+ return NULL;
+}
- /* full page? */
- if (pos_in_page == 0 && len == PAGE_SIZE)
- return 0;
-
- /* past end of file? */
- i_size = i_size_read(inode);
-
- if (page_off >= i_size ||
- (pos_in_page == 0 && (pos+len) >= i_size &&
- end_in_page - pos_in_page != PAGE_SIZE)) {
- dout(" zeroing %p 0 - %d and %d - %d\n",
- page, pos_in_page, end_in_page, (int)PAGE_SIZE);
- zero_user_segments(page,
- 0, pos_in_page,
- end_in_page, PAGE_SIZE);
- return 0;
- }
+static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
+ struct folio **foliop, void **_fsdata)
+{
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_snap_context *snapc;
- /* we need to read it. */
- r = ceph_do_readpage(file, page);
- if (r < 0) {
- if (r == -EINPROGRESS)
- return -EAGAIN;
- goto fail_unlock;
+ snapc = ceph_find_incompatible(folio_page(*foliop, 0));
+ if (snapc) {
+ int r;
+
+ folio_unlock(*foliop);
+ folio_put(*foliop);
+ *foliop = NULL;
+ if (IS_ERR(snapc))
+ return PTR_ERR(snapc);
+
+ ceph_queue_writeback(inode);
+ r = wait_event_killable(ci->i_cap_wq,
+ context_is_writeable_or_written(inode, snapc));
+ ceph_put_snap_context(snapc);
+ return r == 0 ? -EAGAIN : r;
}
- goto retry_locked;
-fail_unlock:
- unlock_page(page);
- return r;
+ return 0;
}
/*
@@ -1315,31 +1314,22 @@ fail_unlock:
* clean, or already dirty within the same snap context.
*/
static int ceph_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
struct inode *inode = file_inode(file);
- struct page *page;
- pgoff_t index = pos >> PAGE_SHIFT;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct folio *folio = NULL;
int r;
- do {
- /* get a page */
- page = grab_cache_page_write_begin(mapping, index, 0);
- if (!page)
- return -ENOMEM;
-
- dout("write_begin file %p inode %p page %p %d~%d\n", file,
- inode, page, (int)pos, (int)len);
+ r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, &folio, NULL);
+ if (r < 0)
+ return r;
- r = ceph_update_writeable_page(file, pos, len, page);
- if (r < 0)
- put_page(page);
- else
- *pagep = page;
- } while (r == -EAGAIN);
-
- return r;
+ folio_wait_fscache(folio);
+ WARN_ON_ONCE(!folio_test_locked(folio));
+ *pagep = &folio->page;
+ return 0;
}
/*
@@ -1348,32 +1338,33 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
*/
static int ceph_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct page *subpage, void *fsdata)
{
+ struct folio *folio = page_folio(subpage);
struct inode *inode = file_inode(file);
bool check_cap = false;
- dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
- inode, page, (int)pos, (int)copied, (int)len);
+ dout("write_end file %p inode %p folio %p %d~%d (%d)\n", file,
+ inode, folio, (int)pos, (int)copied, (int)len);
- /* zero the stale part of the page if we did a short copy */
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
+ /* just return that nothing was copied on a short copy */
if (copied < len) {
copied = 0;
goto out;
}
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
}
/* did file size increase? */
if (pos+copied > i_size_read(inode))
check_cap = ceph_inode_set_size(inode, pos+copied);
- set_page_dirty(page);
+ folio_mark_dirty(folio);
out:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
if (check_cap)
ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
@@ -1381,28 +1372,17 @@ out:
return copied;
}
-/*
- * we set .direct_IO to indicate direct io is supported, but since we
- * intercept O_DIRECT reads and writes early, this function should
- * never get called.
- */
-static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter)
-{
- WARN_ON(1);
- return -EINVAL;
-}
-
const struct address_space_operations ceph_aops = {
- .readpage = ceph_readpage,
- .readpages = ceph_readpages,
+ .read_folio = netfs_read_folio,
+ .readahead = netfs_readahead,
.writepage = ceph_writepage,
.writepages = ceph_writepages_start,
.write_begin = ceph_write_begin,
.write_end = ceph_write_end,
- .set_page_dirty = ceph_set_page_dirty,
- .invalidatepage = ceph_invalidatepage,
- .releasepage = ceph_releasepage,
- .direct_IO = ceph_direct_io,
+ .dirty_folio = ceph_dirty_folio,
+ .invalidate_folio = ceph_invalidate_folio,
+ .release_folio = ceph_release_folio,
+ .direct_IO = noop_direct_IO,
};
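
Switching .direct_IO from the old WARN-and-fail ceph_direct_io() to the kernel's noop_direct_IO keeps O_DIRECT opens working, since the generic open path only checks that the method exists; ceph still intercepts direct reads and writes earlier, in its own read/write iterators, exactly as the deleted comment said. Paraphrased check this satisfies (an assumption about the generic open path, not shown here):

	/* if ((flags & O_DIRECT) && !mapping->a_ops->direct_IO)
	 *	return -EINVAL;
	 */
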
static void ceph_block_sigs(sigset_t *oldset)
@@ -1426,44 +1406,42 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
struct inode *inode = file_inode(vma->vm_file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_file_info *fi = vma->vm_file->private_data;
- struct page *pinned_page = NULL;
- loff_t off = vmf->pgoff << PAGE_SHIFT;
+ loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT;
int want, got, err;
sigset_t oldset;
vm_fault_t ret = VM_FAULT_SIGBUS;
+ if (ceph_inode_is_shutdown(inode))
+ return ret;
+
ceph_block_sigs(&oldset);
- dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n",
- inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE);
+ dout("filemap_fault %p %llx.%llx %llu trying to get caps\n",
+ inode, ceph_vinop(inode), off);
if (fi->fmode & CEPH_FILE_MODE_LAZY)
want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
else
want = CEPH_CAP_FILE_CACHE;
got = 0;
- err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1,
- &got, &pinned_page);
+ err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, &got);
if (err < 0)
goto out_restore;
- dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
- inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got));
+ dout("filemap_fault %p %llu got cap refs on %s\n",
+ inode, off, ceph_cap_string(got));
if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
- ci->i_inline_version == CEPH_INLINE_NONE) {
+ !ceph_has_inline_data(ci)) {
CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
ceph_add_rw_context(fi, &rw_ctx);
ret = filemap_fault(vmf);
ceph_del_rw_context(fi, &rw_ctx);
- dout("filemap_fault %p %llu~%zd drop cap refs %s ret %x\n",
- inode, off, (size_t)PAGE_SIZE,
- ceph_cap_string(got), ret);
+ dout("filemap_fault %p %llu drop cap refs %s ret %x\n",
+ inode, off, ceph_cap_string(got), ret);
} else
err = -EAGAIN;
- if (pinned_page)
- put_page(pinned_page);
ceph_put_cap_refs(ci, got);
if (err != -EAGAIN)
@@ -1475,9 +1453,11 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
ret = VM_FAULT_SIGBUS;
} else {
struct address_space *mapping = inode->i_mapping;
- struct page *page = find_or_create_page(mapping, 0,
- mapping_gfp_constraint(mapping,
- ~__GFP_FS));
+ struct page *page;
+
+ filemap_invalidate_lock_shared(mapping);
+ page = find_or_create_page(mapping, 0,
+ mapping_gfp_constraint(mapping, ~__GFP_FS));
if (!page) {
ret = VM_FAULT_OOM;
goto out_inline;
@@ -1498,8 +1478,9 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
vmf->page = page;
ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
out_inline:
- dout("filemap_fault %p %llu~%zd read inline data ret %x\n",
- inode, off, (size_t)PAGE_SIZE, ret);
+ filemap_invalidate_unlock_shared(mapping);
+ dout("filemap_fault %p %llu read inline data ret %x\n",
+ inode, off, ret);
}
out_restore:
ceph_restore_sigs(&oldset);
@@ -1509,9 +1490,6 @@ out_restore:
return ret;
}
-/*
- * Reuse write_begin here for simplicity.
- */
static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
@@ -1527,6 +1505,9 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
sigset_t oldset;
vm_fault_t ret = VM_FAULT_SIGBUS;
+ if (ceph_inode_is_shutdown(inode))
+ return ret;
+
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
return VM_FAULT_OOM;
@@ -1534,23 +1515,10 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
ceph_block_sigs(&oldset);
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- struct page *locked_page = NULL;
- if (off == 0) {
- lock_page(page);
- locked_page = page;
- }
- err = ceph_uninline_data(vma->vm_file, locked_page);
- if (locked_page)
- unlock_page(locked_page);
- if (err < 0)
- goto out_free;
- }
-
- if (off + PAGE_SIZE <= size)
- len = PAGE_SIZE;
+ if (off + thp_size(page) <= size)
+ len = thp_size(page);
else
- len = size & ~PAGE_MASK;
+ len = offset_in_thp(page, size);
dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n",
inode, ceph_vinop(inode), off, len, size);
@@ -1560,8 +1528,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
want = CEPH_CAP_FILE_BUFFER;
got = 0;
- err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len,
- &got, NULL);
+ err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, &got);
if (err < 0)
goto out_free;
@@ -1573,27 +1540,40 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
inode_inc_iversion_raw(inode);
do {
+ struct ceph_snap_context *snapc;
+
lock_page(page);
- if ((off > size) || (page->mapping != inode->i_mapping)) {
+ if (page_mkwrite_check_truncate(page, inode) < 0) {
unlock_page(page);
ret = VM_FAULT_NOPAGE;
break;
}
- err = ceph_update_writeable_page(vma->vm_file, off, len, page);
- if (err >= 0) {
+ snapc = ceph_find_incompatible(page);
+ if (!snapc) {
/* success. we'll keep the page locked. */
set_page_dirty(page);
ret = VM_FAULT_LOCKED;
+ break;
}
- } while (err == -EAGAIN);
- if (ret == VM_FAULT_LOCKED ||
- ci->i_inline_version != CEPH_INLINE_NONE) {
+ unlock_page(page);
+
+ if (IS_ERR(snapc)) {
+ ret = VM_FAULT_SIGBUS;
+ break;
+ }
+
+ ceph_queue_writeback(inode);
+ err = wait_event_killable(ci->i_cap_wq,
+ context_is_writeable_or_written(inode, snapc));
+ ceph_put_snap_context(snapc);
+ } while (err == 0);
+
+ if (ret == VM_FAULT_LOCKED) {
int dirty;
spin_lock(&ci->i_ceph_lock);
- ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
@@ -1603,7 +1583,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %x\n",
inode, off, len, ceph_cap_string(got), ret);
- ceph_put_cap_refs(ci, got);
+ ceph_put_cap_refs_async(ci, got);
out_free:
ceph_restore_sigs(&oldset);
sb_end_pagefault(inode->i_sb);
@@ -1657,16 +1637,18 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
}
}
-int ceph_uninline_data(struct file *filp, struct page *locked_page)
+int ceph_uninline_data(struct file *file)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_osd_request *req;
- struct page *page = NULL;
- u64 len, inline_version;
+ struct ceph_osd_request *req = NULL;
+ struct ceph_cap_flush *prealloc_cf;
+ struct folio *folio = NULL;
+ u64 inline_version = CEPH_INLINE_NONE;
+ struct page *pages[1];
int err = 0;
- bool from_pagecache = false;
+ u64 len;
spin_lock(&ci->i_ceph_lock);
inline_version = ci->i_inline_version;
@@ -1675,64 +1657,43 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
dout("uninline_data %p %llx.%llx inline_version %llu\n",
inode, ceph_vinop(inode), inline_version);
- if (inline_version == 1 || /* initial version, no data */
- inline_version == CEPH_INLINE_NONE)
- goto out;
+ if (inline_version == CEPH_INLINE_NONE)
+ return 0;
- if (locked_page) {
- page = locked_page;
- WARN_ON(!PageUptodate(page));
- } else if (ceph_caps_issued(ci) &
- (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) {
- page = find_get_page(inode->i_mapping, 0);
- if (page) {
- if (PageUptodate(page)) {
- from_pagecache = true;
- lock_page(page);
- } else {
- put_page(page);
- page = NULL;
- }
- }
- }
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return -ENOMEM;
- if (page) {
- len = i_size_read(inode);
- if (len > PAGE_SIZE)
- len = PAGE_SIZE;
- } else {
- page = __page_cache_alloc(GFP_NOFS);
- if (!page) {
- err = -ENOMEM;
- goto out;
- }
- err = __ceph_do_getattr(inode, page,
- CEPH_STAT_CAP_INLINE_DATA, true);
- if (err < 0) {
- /* no inline data */
- if (err == -ENODATA)
- err = 0;
- goto out;
- }
- len = err;
+ if (inline_version == 1) /* initial version, no data */
+ goto out_uninline;
+
+ folio = read_mapping_folio(inode->i_mapping, 0, file);
+ if (IS_ERR(folio)) {
+ err = PTR_ERR(folio);
+ goto out;
}
+ folio_lock(folio);
+
+ len = i_size_read(inode);
+ if (len > folio_size(folio))
+ len = folio_size(folio);
+
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
ceph_vino(inode), 0, &len, 0, 1,
CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE,
NULL, 0, 0, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
- goto out;
+ goto out_unlock;
}
req->r_mtime = inode->i_mtime;
- err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
- if (!err)
- err = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ ceph_osdc_start_request(&fsc->client->osdc, req);
+ err = ceph_osdc_wait_request(&fsc->client->osdc, req);
ceph_osdc_put_request(req);
if (err < 0)
- goto out;
+ goto out_unlock;
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
ceph_vino(inode), 0, &len, 1, 3,
@@ -1741,10 +1702,11 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
ci->i_truncate_size, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
- goto out;
+ goto out_unlock;
}
- osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false);
+ pages[0] = folio_page(folio, 0);
+ osd_req_op_extent_osd_data_pages(req, 1, pages, len, 0, false, false);
{
__le64 xattr_buf = cpu_to_le64(inline_version);
@@ -1754,7 +1716,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
CEPH_OSD_CMPXATTR_OP_GT,
CEPH_OSD_CMPXATTR_MODE_U64);
if (err)
- goto out_put;
+ goto out_put_req;
}
{
@@ -1765,26 +1727,41 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
"inline_version",
xattr_buf, xattr_len, 0, 0);
if (err)
- goto out_put;
+ goto out_put_req;
}
req->r_mtime = inode->i_mtime;
- err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
- if (!err)
- err = ceph_osdc_wait_request(&fsc->client->osdc, req);
-out_put:
+ ceph_osdc_start_request(&fsc->client->osdc, req);
+ err = ceph_osdc_wait_request(&fsc->client->osdc, req);
+
+ ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
+ req->r_end_latency, len, err);
+
+out_uninline:
+ if (!err) {
+ int dirty;
+
+ /* Set to CAP_INLINE_NONE and dirty the caps */
+ down_read(&fsc->mdsc->snap_rwsem);
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_inline_version = CEPH_INLINE_NONE;
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf);
+ spin_unlock(&ci->i_ceph_lock);
+ up_read(&fsc->mdsc->snap_rwsem);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+ }
+out_put_req:
ceph_osdc_put_request(req);
if (err == -ECANCELED)
err = 0;
-out:
- if (page && page != locked_page) {
- if (from_pagecache) {
- unlock_page(page);
- put_page(page);
- } else
- __free_pages(page, 0);
+out_unlock:
+ if (folio) {
+ folio_unlock(folio);
+ folio_put(folio);
}
-
+out:
+ ceph_free_cap_flush(prealloc_cf);
dout("uninline_data %p %llx.%llx inline_version %llu = %d\n",
inode, ceph_vinop(inode), inline_version, err);
return err;
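
A note on the racy path above: the data-writing OSD request is guarded by a cmpxattr op (CEPH_OSD_CMPXATTR_OP_GT against the object's "inline_version" xattr), so when two clients race to uninline the same inode only one guarded write lands. The loser's request fails with -ECANCELED, which the code just above deliberately maps to success, since by then the data has already been uninlined by the winner.
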
@@ -1799,9 +1776,8 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma)
{
struct address_space *mapping = file->f_mapping;
- if (!mapping->a_ops->readpage)
+ if (!mapping->a_ops->read_folio)
return -ENOEXEC;
- file_accessed(file);
vma->vm_ops = &ceph_vmops;
return 0;
}
@@ -1814,7 +1790,7 @@ enum {
static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
s64 pool, struct ceph_string *pool_ns)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->netfs.inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_osd_request *rd_req = NULL, *wr_req = NULL;
struct rb_node **p, *parent;
@@ -1927,29 +1903,27 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
0, false, true);
- err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
+ ceph_osdc_start_request(&fsc->client->osdc, rd_req);
- wr_req->r_mtime = ci->vfs_inode.i_mtime;
- err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
+ wr_req->r_mtime = ci->netfs.inode.i_mtime;
+ ceph_osdc_start_request(&fsc->client->osdc, wr_req);
- if (!err)
- err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
- if (!err2)
- err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req);
+ err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
+ err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req);
if (err >= 0 || err == -ENOENT)
have |= POOL_READ;
else if (err != -EPERM) {
- if (err == -EBLACKLISTED)
- fsc->blacklisted = true;
+ if (err == -EBLOCKLISTED)
+ fsc->blocklisted = true;
goto out_unlock;
}
if (err2 == 0 || err2 == -EEXIST)
have |= POOL_WRITE;
else if (err2 != -EPERM) {
- if (err2 == -EBLACKLISTED)
- fsc->blacklisted = true;
+ if (err2 == -EBLOCKLISTED)
+ fsc->blocklisted = true;
err = err2;
goto out_unlock;
}
@@ -1994,6 +1968,10 @@ int ceph_pool_perm_check(struct inode *inode, int need)
s64 pool;
int ret, flags;
+ /* Only need to do this for regular files */
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+
if (ci->i_vino.snap != CEPH_NOSNAP) {
/*
* Pool permission check needs to write to the first object.
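
Note: the hunk above only short-circuits the check for non-regular files; the probe strategy itself is unchanged: issue a dummy read and a dummy create against the pool's first object, then translate the two results into permission bits. A compilable userspace sketch of that translation (probe_to_perm is a hypothetical name, not the kernel function):

#include <errno.h>
#include <stdio.h>

#define POOL_READ  1
#define POOL_WRITE 2

/* Translate the result of a probe read (err) and a probe create (err2)
 * into permission bits, mirroring the checks in __ceph_pool_perm_get(). */
static int probe_to_perm(int err, int err2, int *have)
{
	*have = 0;

	/* a successful read, or ENOENT (no such object), proves read access */
	if (err >= 0 || err == -ENOENT)
		*have |= POOL_READ;
	else if (err != -EPERM)
		return err;	/* transport or blocklist error: give up */

	/* a successful create, or EEXIST, proves write access */
	if (err2 == 0 || err2 == -EEXIST)
		*have |= POOL_WRITE;
	else if (err2 != -EPERM)
		return err2;

	return 0;
}

int main(void)
{
	int have;
	int ret = probe_to_perm(-ENOENT, -EEXIST, &have);

	printf("ret=%d have=0x%x (expect 0 and 0x3)\n", ret, have);
	return 0;
}
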
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 270b769607a2..177d8e8d73fe 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -12,341 +12,99 @@
#include "super.h"
#include "cache.h"
-struct ceph_aux_inode {
- u64 version;
- u64 mtime_sec;
- u64 mtime_nsec;
-};
-
-struct fscache_netfs ceph_cache_netfs = {
- .name = "ceph",
- .version = 0,
-};
-
-static DEFINE_MUTEX(ceph_fscache_lock);
-static LIST_HEAD(ceph_fscache_list);
-
-struct ceph_fscache_entry {
- struct list_head list;
- struct fscache_cookie *fscache;
- size_t uniq_len;
- /* The following members must be last */
- struct ceph_fsid fsid;
- char uniquifier[0];
-};
-
-static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
- .name = "CEPH.fsid",
- .type = FSCACHE_COOKIE_TYPE_INDEX,
-};
-
-int __init ceph_fscache_register(void)
-{
- return fscache_register_netfs(&ceph_cache_netfs);
-}
-
-void ceph_fscache_unregister(void)
-{
- fscache_unregister_netfs(&ceph_cache_netfs);
-}
-
-int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc)
-{
- const struct ceph_fsid *fsid = &fsc->client->fsid;
- const char *fscache_uniq = fsc->mount_options->fscache_uniq;
- size_t uniq_len = fscache_uniq ? strlen(fscache_uniq) : 0;
- struct ceph_fscache_entry *ent;
- int err = 0;
-
- mutex_lock(&ceph_fscache_lock);
- list_for_each_entry(ent, &ceph_fscache_list, list) {
- if (memcmp(&ent->fsid, fsid, sizeof(*fsid)))
- continue;
- if (ent->uniq_len != uniq_len)
- continue;
- if (uniq_len && memcmp(ent->uniquifier, fscache_uniq, uniq_len))
- continue;
-
- errorfc(fc, "fscache cookie already registered for fsid %pU, use fsc=<uniquifier> option",
- fsid);
- err = -EBUSY;
- goto out_unlock;
- }
-
- ent = kzalloc(sizeof(*ent) + uniq_len, GFP_KERNEL);
- if (!ent) {
- err = -ENOMEM;
- goto out_unlock;
- }
-
- memcpy(&ent->fsid, fsid, sizeof(*fsid));
- if (uniq_len > 0) {
- memcpy(&ent->uniquifier, fscache_uniq, uniq_len);
- ent->uniq_len = uniq_len;
- }
-
- fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
- &ceph_fscache_fsid_object_def,
- &ent->fsid, sizeof(ent->fsid) + uniq_len,
- NULL, 0,
- fsc, 0, true);
-
- if (fsc->fscache) {
- ent->fscache = fsc->fscache;
- list_add_tail(&ent->list, &ceph_fscache_list);
- } else {
- kfree(ent);
- errorfc(fc, "unable to register fscache cookie for fsid %pU",
- fsid);
- /* all other fs ignore this error */
- }
-out_unlock:
- mutex_unlock(&ceph_fscache_lock);
- return err;
-}
-
-static enum fscache_checkaux ceph_fscache_inode_check_aux(
- void *cookie_netfs_data, const void *data, uint16_t dlen,
- loff_t object_size)
-{
- struct ceph_aux_inode aux;
- struct ceph_inode_info* ci = cookie_netfs_data;
- struct inode* inode = &ci->vfs_inode;
-
- if (dlen != sizeof(aux) ||
- i_size_read(inode) != object_size)
- return FSCACHE_CHECKAUX_OBSOLETE;
-
- memset(&aux, 0, sizeof(aux));
- aux.version = ci->i_version;
- aux.mtime_sec = inode->i_mtime.tv_sec;
- aux.mtime_nsec = inode->i_mtime.tv_nsec;
-
- if (memcmp(data, &aux, sizeof(aux)) != 0)
- return FSCACHE_CHECKAUX_OBSOLETE;
-
- dout("ceph inode 0x%p cached okay\n", ci);
- return FSCACHE_CHECKAUX_OKAY;
-}
-
-static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
- .name = "CEPH.inode",
- .type = FSCACHE_COOKIE_TYPE_DATAFILE,
- .check_aux = ceph_fscache_inode_check_aux,
-};
-
void ceph_fscache_register_inode_cookie(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_aux_inode aux;
- /* No caching for filesystem */
+ /* No caching for filesystem? */
if (!fsc->fscache)
return;
- /* Only cache for regular files that are read only */
+ /* Regular files only */
if (!S_ISREG(inode->i_mode))
return;
- inode_lock_nested(inode, I_MUTEX_CHILD);
- if (!ci->fscache) {
- memset(&aux, 0, sizeof(aux));
- aux.version = ci->i_version;
- aux.mtime_sec = inode->i_mtime.tv_sec;
- aux.mtime_nsec = inode->i_mtime.tv_nsec;
- ci->fscache = fscache_acquire_cookie(fsc->fscache,
- &ceph_fscache_inode_object_def,
- &ci->i_vino, sizeof(ci->i_vino),
- &aux, sizeof(aux),
- ci, i_size_read(inode), false);
- }
- inode_unlock(inode);
-}
-
-void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
-{
- struct fscache_cookie* cookie;
-
- if ((cookie = ci->fscache) == NULL)
+ /* Only new inodes! */
+ if (!(inode->i_state & I_NEW))
return;
- ci->fscache = NULL;
+ WARN_ON_ONCE(ci->netfs.cache);
- fscache_uncache_all_inode_pages(cookie, &ci->vfs_inode);
- fscache_relinquish_cookie(cookie, &ci->i_vino, false);
+ ci->netfs.cache =
+ fscache_acquire_cookie(fsc->fscache, 0,
+ &ci->i_vino, sizeof(ci->i_vino),
+ &ci->i_version, sizeof(ci->i_version),
+ i_size_read(inode));
}
-static bool ceph_fscache_can_enable(void *data)
+void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info *ci)
{
- struct inode *inode = data;
- return !inode_is_open_for_write(inode);
+ fscache_relinquish_cookie(ceph_fscache_cookie(ci), false);
}
-void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp)
+void ceph_fscache_use_cookie(struct inode *inode, bool will_modify)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- if (!fscache_cookie_valid(ci->fscache))
- return;
-
- if (inode_is_open_for_write(inode)) {
- dout("fscache_file_set_cookie %p %p disabling cache\n",
- inode, filp);
- fscache_disable_cookie(ci->fscache, &ci->i_vino, false);
- fscache_uncache_all_inode_pages(ci->fscache, inode);
- } else {
- fscache_enable_cookie(ci->fscache, &ci->i_vino, i_size_read(inode),
- ceph_fscache_can_enable, inode);
- if (fscache_cookie_enabled(ci->fscache)) {
- dout("fscache_file_set_cookie %p %p enabling cache\n",
- inode, filp);
- }
- }
-}
-
-static void ceph_readpage_from_fscache_complete(struct page *page, void *data, int error)
-{
- if (!error)
- SetPageUptodate(page);
-
- unlock_page(page);
-}
-
-static inline bool cache_valid(struct ceph_inode_info *ci)
-{
- return ci->i_fscache_gen == ci->i_rdcache_gen;
-}
-
-
-/* Atempt to read from the fscache,
- *
- * This function is called from the readpage_nounlock context. DO NOT attempt to
- * unlock the page here (or in the callback).
- */
-int ceph_readpage_from_fscache(struct inode *inode, struct page *page)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- int ret;
-
- if (!cache_valid(ci))
- return -ENOBUFS;
-
- ret = fscache_read_or_alloc_page(ci->fscache, page,
- ceph_readpage_from_fscache_complete, NULL,
- GFP_KERNEL);
-
- switch (ret) {
- case 0: /* Page found */
- dout("page read submitted\n");
- return 0;
- case -ENOBUFS: /* Pages were not found, and can't be */
- case -ENODATA: /* Pages were not found */
- dout("page/inode not in cache\n");
- return ret;
- default:
- dout("%s: unknown error ret = %i\n", __func__, ret);
- return ret;
- }
+ fscache_use_cookie(ceph_fscache_cookie(ci), will_modify);
}
-int ceph_readpages_from_fscache(struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages)
+void ceph_fscache_unuse_cookie(struct inode *inode, bool update)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- int ret;
- if (!cache_valid(ci))
- return -ENOBUFS;
+ if (update) {
+ loff_t i_size = i_size_read(inode);
- ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages,
- ceph_readpage_from_fscache_complete,
- NULL, mapping_gfp_mask(mapping));
-
- switch (ret) {
- case 0: /* All pages found */
- dout("all-page read submitted\n");
- return 0;
- case -ENOBUFS: /* Some pages were not found, and can't be */
- case -ENODATA: /* some pages were not found */
- dout("page/inode not in cache\n");
- return ret;
- default:
- dout("%s: unknown error ret = %i\n", __func__, ret);
- return ret;
+ fscache_unuse_cookie(ceph_fscache_cookie(ci),
+ &ci->i_version, &i_size);
+ } else {
+ fscache_unuse_cookie(ceph_fscache_cookie(ci), NULL, NULL);
}
}
-void ceph_readpage_to_fscache(struct inode *inode, struct page *page)
+void ceph_fscache_update(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- int ret;
-
- if (!PageFsCache(page))
- return;
-
- if (!cache_valid(ci))
- return;
+ loff_t i_size = i_size_read(inode);
- ret = fscache_write_page(ci->fscache, page, i_size_read(inode),
- GFP_KERNEL);
- if (ret)
- fscache_uncache_page(ci->fscache, page);
+ fscache_update_cookie(ceph_fscache_cookie(ci), &ci->i_version, &i_size);
}
-void ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
+void ceph_fscache_invalidate(struct inode *inode, bool dio_write)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- if (!PageFsCache(page))
- return;
-
- fscache_wait_on_page_write(ci->fscache, page);
- fscache_uncache_page(ci->fscache, page);
+ fscache_invalidate(ceph_fscache_cookie(ci),
+ &ci->i_version, i_size_read(inode),
+ dio_write ? FSCACHE_INVAL_DIO_WRITE : 0);
}
-void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
+int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc)
{
- if (fscache_cookie_valid(fsc->fscache)) {
- struct ceph_fscache_entry *ent;
- bool found = false;
+ const struct ceph_fsid *fsid = &fsc->client->fsid;
+ const char *fscache_uniq = fsc->mount_options->fscache_uniq;
+ size_t uniq_len = fscache_uniq ? strlen(fscache_uniq) : 0;
+ char *name;
+ int err = 0;
- mutex_lock(&ceph_fscache_lock);
- list_for_each_entry(ent, &ceph_fscache_list, list) {
- if (ent->fscache == fsc->fscache) {
- list_del(&ent->list);
- kfree(ent);
- found = true;
- break;
- }
- }
- WARN_ON_ONCE(!found);
- mutex_unlock(&ceph_fscache_lock);
+ name = kasprintf(GFP_KERNEL, "ceph,%pU%s%s", fsid, uniq_len ? "," : "",
+ uniq_len ? fscache_uniq : "");
+ if (!name)
+ return -ENOMEM;
- __fscache_relinquish_cookie(fsc->fscache, NULL, false);
+ fsc->fscache = fscache_acquire_volume(name, NULL, NULL, 0);
+ if (IS_ERR_OR_NULL(fsc->fscache)) {
+ errorfc(fc, "Unable to register fscache cookie for %s", name);
+ err = fsc->fscache ? PTR_ERR(fsc->fscache) : -EOPNOTSUPP;
+ fsc->fscache = NULL;
}
- fsc->fscache = NULL;
+ kfree(name);
+ return err;
}
-/*
- * caller should hold CEPH_CAP_FILE_{RD,CACHE}
- */
-void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci)
+void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
{
- if (cache_valid(ci))
- return;
-
- /* resue i_truncate_mutex. There should be no pending
- * truncate while the caller holds CEPH_CAP_FILE_RD */
- mutex_lock(&ci->i_truncate_mutex);
- if (!cache_valid(ci)) {
- if (fscache_check_consistency(ci->fscache, &ci->i_vino))
- fscache_invalidate(ci->fscache);
- spin_lock(&ci->i_ceph_lock);
- ci->i_fscache_gen = ci->i_rdcache_gen;
- spin_unlock(&ci->i_ceph_lock);
- }
- mutex_unlock(&ci->i_truncate_mutex);
+ fscache_relinquish_volume(fsc->fscache, NULL, false);
}
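
Note: ceph_fscache_register_fs() now derives a volume key of the form "ceph,<fsid>", with an optional ",<uniquifier>" suffix when the fsc= mount option is given. A userspace sketch of the same name construction (build_volume_name is hypothetical; the kernel uses kasprintf):

#define _GNU_SOURCE	/* asprintf() */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *build_volume_name(const char *fsid, const char *uniq)
{
	size_t uniq_len = uniq ? strlen(uniq) : 0;
	char *name;

	/* "ceph,<fsid>" plus an optional ",<uniquifier>" suffix */
	if (asprintf(&name, "ceph,%s%s%s", fsid,
		     uniq_len ? "," : "", uniq_len ? uniq : "") < 0)
		return NULL;
	return name;
}

int main(void)
{
	char *plain = build_volume_name("0123-fsid", NULL);
	char *uniq  = build_volume_name("0123-fsid", "clusterA");

	if (!plain || !uniq)
		return 1;
	printf("%s\n%s\n", plain, uniq);
	free(plain);
	free(uniq);
	return 0;
}
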
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index 89dbdd1eb14a..dc502daac49a 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -9,85 +9,73 @@
#ifndef _CEPH_CACHE_H
#define _CEPH_CACHE_H
-#ifdef CONFIG_CEPH_FSCACHE
-
-extern struct fscache_netfs ceph_cache_netfs;
+#include <linux/netfs.h>
-int ceph_fscache_register(void);
-void ceph_fscache_unregister(void);
+#ifdef CONFIG_CEPH_FSCACHE
+#include <linux/fscache.h>
int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc);
void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc);
void ceph_fscache_register_inode_cookie(struct inode *inode);
void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci);
-void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp);
-void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci);
-int ceph_readpage_from_fscache(struct inode *inode, struct page *page);
-int ceph_readpages_from_fscache(struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages);
-void ceph_readpage_to_fscache(struct inode *inode, struct page *page);
-void ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
+void ceph_fscache_use_cookie(struct inode *inode, bool will_modify);
+void ceph_fscache_unuse_cookie(struct inode *inode, bool update);
-static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
-{
- ci->fscache = NULL;
- ci->i_fscache_gen = 0;
-}
+void ceph_fscache_update(struct inode *inode);
+void ceph_fscache_invalidate(struct inode *inode, bool dio_write);
-static inline void ceph_fscache_invalidate(struct inode *inode)
+static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci)
{
- fscache_invalidate(ceph_inode(inode)->fscache);
+ return netfs_i_cookie(&ci->netfs);
}
-static inline void ceph_fscache_uncache_page(struct inode *inode,
- struct page *page)
+static inline void ceph_fscache_resize(struct inode *inode, loff_t to)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- return fscache_uncache_page(ci->fscache, page);
-}
+ struct fscache_cookie *cookie = ceph_fscache_cookie(ci);
-static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
-{
- struct inode* inode = page->mapping->host;
- struct ceph_inode_info *ci = ceph_inode(inode);
- return fscache_maybe_release_page(ci->fscache, page, gfp);
+ if (cookie) {
+ ceph_fscache_use_cookie(inode, true);
+ fscache_resize_cookie(cookie, to);
+ ceph_fscache_unuse_cookie(inode, true);
+ }
}
-static inline void ceph_fscache_readpage_cancel(struct inode *inode,
- struct page *page)
+static inline void ceph_fscache_unpin_writeback(struct inode *inode,
+ struct writeback_control *wbc)
{
- struct ceph_inode_info *ci = ceph_inode(inode);
- if (fscache_cookie_valid(ci->fscache) && PageFsCache(page))
- __fscache_uncache_page(ci->fscache, page);
+ fscache_unpin_writeback(wbc, ceph_fscache_cookie(ceph_inode(inode)));
}
-static inline void ceph_fscache_readpages_cancel(struct inode *inode,
- struct list_head *pages)
+static inline int ceph_fscache_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
- struct ceph_inode_info *ci = ceph_inode(inode);
- return fscache_readpages_cancel(ci->fscache, pages);
+ struct ceph_inode_info *ci = ceph_inode(mapping->host);
+
+ return fscache_dirty_folio(mapping, folio, ceph_fscache_cookie(ci));
}
-static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci)
+static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq)
{
- ci->i_fscache_gen = ci->i_rdcache_gen - 1;
-}
+ struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(rreq->inode));
-#else
+ return fscache_begin_read_operation(&rreq->cache_resources, cookie);
+}
-static inline int ceph_fscache_register(void)
+static inline bool ceph_is_cache_enabled(struct inode *inode)
{
- return 0;
+ return fscache_cookie_enabled(ceph_fscache_cookie(ceph_inode(inode)));
}
-static inline void ceph_fscache_unregister(void)
+static inline void ceph_fscache_note_page_release(struct inode *inode)
{
-}
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ fscache_note_page_release(ceph_fscache_cookie(ci));
+}
+#else /* CONFIG_CEPH_FSCACHE */
static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc,
struct fs_context *fc)
{
@@ -98,10 +86,6 @@ static inline void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
{
}
-static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
-{
-}
-
static inline void ceph_fscache_register_inode_cookie(struct inode *inode)
{
}
@@ -110,67 +94,55 @@ static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info*
{
}
-static inline void ceph_fscache_file_set_cookie(struct inode *inode,
- struct file *filp)
-{
-}
-
-static inline void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci)
+static inline void ceph_fscache_use_cookie(struct inode *inode, bool will_modify)
{
}
-static inline void ceph_fscache_uncache_page(struct inode *inode,
- struct page *pages)
+static inline void ceph_fscache_unuse_cookie(struct inode *inode, bool update)
{
}
-static inline int ceph_readpage_from_fscache(struct inode* inode,
- struct page *page)
+static inline void ceph_fscache_update(struct inode *inode)
{
- return -ENOBUFS;
}
-static inline int ceph_readpages_from_fscache(struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages)
+static inline void ceph_fscache_invalidate(struct inode *inode, bool dio_write)
{
- return -ENOBUFS;
}
-static inline void ceph_readpage_to_fscache(struct inode *inode,
- struct page *page)
+static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci)
{
+ return NULL;
}
-static inline void ceph_fscache_invalidate(struct inode *inode)
+static inline void ceph_fscache_resize(struct inode *inode, loff_t to)
{
}
-static inline void ceph_invalidate_fscache_page(struct inode *inode,
- struct page *page)
+static inline void ceph_fscache_unpin_writeback(struct inode *inode,
+ struct writeback_control *wbc)
{
}
-static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
+static inline int ceph_fscache_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
- return 1;
+ return filemap_dirty_folio(mapping, folio);
}
-static inline void ceph_fscache_readpage_cancel(struct inode *inode,
- struct page *page)
+static inline bool ceph_is_cache_enabled(struct inode *inode)
{
+ return false;
}
-static inline void ceph_fscache_readpages_cancel(struct inode *inode,
- struct list_head *pages)
+static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq)
{
+ return -ENOBUFS;
}
-static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci)
+static inline void ceph_fscache_note_page_release(struct inode *inode)
{
}
-
-#endif
+#endif /* CONFIG_CEPH_FSCACHE */
#endif
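
Note: the reworked header keeps the usual kernel idiom for optional subsystems: real prototypes under CONFIG_CEPH_FSCACHE, inert inline stubs otherwise, so call sites never need #ifdefs of their own. A generic, compilable sketch of the pattern (CONFIG_FEATURE_X and the function names are placeholders):

#include <stdbool.h>
#include <stdio.h>

#ifdef CONFIG_FEATURE_X
void feature_x_invalidate(void *obj);	/* real version, defined elsewhere */
bool feature_x_enabled(void *obj);
#else
static inline void feature_x_invalidate(void *obj) { (void)obj; }
static inline bool feature_x_enabled(void *obj) { (void)obj; return false; }
#endif

int main(void)
{
	/* Call sites compile identically whether the feature is built in. */
	feature_x_invalidate(NULL);
	printf("enabled: %d\n", feature_x_enabled(NULL));
	return 0;
}
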
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 28ae0c134700..fb023f9fafcb 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -490,13 +490,10 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
struct ceph_mount_options *opt = mdsc->fsc->mount_options;
-
- ci->i_hold_caps_min = round_jiffies(jiffies +
- opt->caps_wanted_delay_min * HZ);
ci->i_hold_caps_max = round_jiffies(jiffies +
opt->caps_wanted_delay_max * HZ);
- dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
- ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
+ dout("__cap_set_timeouts %p %lu\n", &ci->netfs.inode,
+ ci->i_hold_caps_max - jiffies);
}
/*
@@ -508,10 +505,9 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
* -> we take mdsc->cap_delay_lock
*/
static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
- struct ceph_inode_info *ci,
- bool set_timeout)
+ struct ceph_inode_info *ci)
{
- dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
+ dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->netfs.inode,
ci->i_ceph_flags, ci->i_hold_caps_max);
if (!mdsc->stopping) {
spin_lock(&mdsc->cap_delay_lock);
@@ -520,8 +516,7 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
goto no_change;
list_del_init(&ci->i_cap_delay_list);
}
- if (set_timeout)
- __cap_set_timeouts(mdsc, ci);
+ __cap_set_timeouts(mdsc, ci);
list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
no_change:
spin_unlock(&mdsc->cap_delay_lock);
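
Note: __cap_delay_requeue() keeps mdsc->cap_delay_list ordered by insertion time, so dropping an entry and re-adding it at the tail with a freshly computed timeout keeps the earliest deadline at the head. A toy, array-backed userspace model of that requeue (illustrative only):

#include <stdio.h>
#include <time.h>

#define NENT 4

struct ent { int id; time_t deadline; };

static struct ent q[NENT];
static int qlen;

/* Drop 'id' if already queued, then append it with a fresh deadline;
 * insertion order doubles as deadline order. */
static void requeue(int id, int delay_s)
{
	int i, j;

	for (i = 0; i < qlen; i++)
		if (q[i].id == id)
			break;
	if (i < qlen) {
		for (j = i; j + 1 < qlen; j++)
			q[j] = q[j + 1];
		qlen--;
	}
	q[qlen].id = id;
	q[qlen].deadline = time(NULL) + delay_s;
	qlen++;
}

int main(void)
{
	requeue(1, 5);
	requeue(2, 5);
	requeue(1, 5);	/* id 1 moves behind id 2, deadline refreshed */
	for (int i = 0; i < qlen; i++)
		printf("id %d due in %lds\n", q[i].id,
		       (long)(q[i].deadline - time(NULL)));
	return 0;
}
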
@@ -536,7 +531,7 @@ no_change:
static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
+ dout("__cap_delay_requeue_front %p\n", &ci->netfs.inode);
spin_lock(&mdsc->cap_delay_lock);
ci->i_ceph_flags |= CEPH_I_FLUSH;
if (!list_empty(&ci->i_cap_delay_list))
@@ -553,7 +548,7 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
+ dout("__cap_delay_cancel %p\n", &ci->netfs.inode);
if (list_empty(&ci->i_cap_delay_list))
return;
spin_lock(&mdsc->cap_delay_lock);
@@ -561,19 +556,20 @@ static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
spin_unlock(&mdsc->cap_delay_lock);
}
-/*
- * Common issue checks for add_cap, handle_cap_grant.
- */
+/* Common issue checks for add_cap, handle_cap_grant. */
static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
unsigned issued)
{
unsigned had = __ceph_caps_issued(ci, NULL);
+ lockdep_assert_held(&ci->i_ceph_lock);
+
/*
* Each time we receive FILE_CACHE anew, we increment
* i_rdcache_gen.
*/
- if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
+ if (S_ISREG(ci->netfs.inode.i_mode) &&
+ (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
(had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {
ci->i_rdcache_gen++;
}
@@ -587,11 +583,39 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) {
if (issued & CEPH_CAP_FILE_SHARED)
atomic_inc(&ci->i_shared_gen);
- if (S_ISDIR(ci->vfs_inode.i_mode)) {
- dout(" marking %p NOT complete\n", &ci->vfs_inode);
+ if (S_ISDIR(ci->netfs.inode.i_mode)) {
+ dout(" marking %p NOT complete\n", &ci->netfs.inode);
__ceph_dir_clear_complete(ci);
}
}
+
+ /* Wipe saved layout if we're losing DIR_CREATE caps */
+ if (S_ISDIR(ci->netfs.inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) &&
+ !(issued & CEPH_CAP_DIR_CREATE)) {
+ ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
+ memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
+ }
+}
+
+/**
+ * change_auth_cap_ses - move inode to appropriate lists when auth caps change
+ * @ci: inode to be moved
+ * @session: new auth caps session
+ */
+void change_auth_cap_ses(struct ceph_inode_info *ci,
+ struct ceph_mds_session *session)
+{
+ lockdep_assert_held(&ci->i_ceph_lock);
+
+ if (list_empty(&ci->i_dirty_item) && list_empty(&ci->i_flushing_item))
+ return;
+
+ spin_lock(&session->s_mdsc->cap_dirty_lock);
+ if (!list_empty(&ci->i_dirty_item))
+ list_move(&ci->i_dirty_item, &session->s_cap_dirty);
+ if (!list_empty(&ci->i_flushing_item))
+ list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
+ spin_unlock(&session->s_mdsc->cap_dirty_lock);
}
/*
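
Note: change_auth_cap_ses() relies on the list moves happening under the single cap_dirty_lock, so the inode is never observable on neither session's list. A self-contained sketch of the same move-under-lock idea using a toy circular list (not the kernel's list API):

#include <pthread.h>
#include <stdio.h>

struct node { struct node *prev, *next; };

static pthread_mutex_t dirty_lock = PTHREAD_MUTEX_INITIALIZER;

static void list_init(struct node *h)
{
	h->prev = h->next = h;
}

/* Unlink n and relink it at the tail of head's list, in one motion. */
static void list_move_tail_toy(struct node *n, struct node *head)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}

int main(void)
{
	struct node old_ses, new_ses, inode;

	list_init(&old_ses);
	list_init(&new_ses);
	list_init(&inode);
	list_move_tail_toy(&inode, &old_ses);	/* dirty on old session */

	pthread_mutex_lock(&dirty_lock);	/* one lock covers both lists */
	list_move_tail_toy(&inode, &new_ses);	/* auth cap moved sessions */
	pthread_mutex_unlock(&dirty_lock);

	printf("on new session list: %d\n", new_ses.next == &inode);
	return 0;
}
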
@@ -605,7 +629,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
*/
void ceph_add_cap(struct inode *inode,
struct ceph_mds_session *session, u64 cap_id,
- int fmode, unsigned issued, unsigned wanted,
+ unsigned issued, unsigned wanted,
unsigned seq, unsigned mseq, u64 realmino, int flags,
struct ceph_cap **new_cap)
{
@@ -621,16 +645,7 @@ void ceph_add_cap(struct inode *inode,
dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
session->s_mds, cap_id, ceph_cap_string(issued), seq);
- /*
- * If we are opening the file, include file mode wanted bits
- * in wanted.
- */
- if (fmode >= 0)
- wanted |= ceph_caps_for_mode(fmode);
-
- spin_lock(&session->s_gen_ttl_lock);
- gen = session->s_cap_gen;
- spin_unlock(&session->s_gen_ttl_lock);
+ gen = atomic_read(&session->s_cap_gen);
cap = __get_cap_for_mds(ci, mds);
if (!cap) {
@@ -651,6 +666,7 @@ void ceph_add_cap(struct inode *inode,
spin_lock(&session->s_cap_lock);
list_add_tail(&cap->session_caps, &session->s_caps);
session->s_nr_caps++;
+ atomic64_inc(&mdsc->metric.total_caps);
spin_unlock(&session->s_cap_lock);
} else {
spin_lock(&session->s_cap_lock);
@@ -687,29 +703,12 @@ void ceph_add_cap(struct inode *inode,
*/
struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
realmino);
- if (realm) {
- struct ceph_snap_realm *oldrealm = ci->i_snap_realm;
- if (oldrealm) {
- spin_lock(&oldrealm->inodes_with_caps_lock);
- list_del_init(&ci->i_snap_realm_item);
- spin_unlock(&oldrealm->inodes_with_caps_lock);
- }
-
- spin_lock(&realm->inodes_with_caps_lock);
- list_add(&ci->i_snap_realm_item,
- &realm->inodes_with_caps);
- ci->i_snap_realm = realm;
- if (realm->ino == ci->i_vino.ino)
- realm->inode = inode;
- spin_unlock(&realm->inodes_with_caps_lock);
-
- if (oldrealm)
- ceph_put_snap_realm(mdsc, oldrealm);
- } else {
- pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
- realmino);
- WARN_ON(!realm);
- }
+ if (realm)
+ ceph_change_snap_realm(inode, realm);
+ else
+ WARN(1, "%s: couldn't find snap realm 0x%llx (ino 0x%llx oldrealm 0x%llx)\n",
+ __func__, realmino, ci->i_vino.ino,
+ ci->i_snap_realm ? ci->i_snap_realm->ino : 0);
}
__check_cap_issue(ci, cap, issued);
@@ -725,12 +724,15 @@ void ceph_add_cap(struct inode *inode,
dout(" issued %s, mds wanted %s, actual %s, queueing\n",
ceph_cap_string(issued), ceph_cap_string(wanted),
ceph_cap_string(actual_wanted));
- __cap_delay_requeue(mdsc, ci, true);
+ __cap_delay_requeue(mdsc, ci);
}
if (flags & CEPH_CAP_FLAG_AUTH) {
if (!ci->i_auth_cap ||
ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) {
+ if (ci->i_auth_cap &&
+ ci->i_auth_cap->session != cap->session)
+ change_auth_cap_ses(ci, cap->session);
ci->i_auth_cap = cap;
cap->mds_wanted = wanted;
}
@@ -752,9 +754,7 @@ void ceph_add_cap(struct inode *inode,
cap->issue_seq = seq;
cap->mseq = mseq;
cap->cap_gen = gen;
-
- if (fmode >= 0)
- __ceph_get_fmode(ci, fmode);
+ wake_up_all(&ci->i_cap_wq);
}
/*
@@ -767,14 +767,12 @@ static int __cap_is_valid(struct ceph_cap *cap)
unsigned long ttl;
u32 gen;
- spin_lock(&cap->session->s_gen_ttl_lock);
- gen = cap->session->s_cap_gen;
+ gen = atomic_read(&cap->session->s_cap_gen);
ttl = cap->session->s_cap_ttl;
- spin_unlock(&cap->session->s_gen_ttl_lock);
if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
dout("__cap_is_valid %p cap %p issued %s "
- "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
+ "but STALE (gen %u vs %u)\n", &cap->ci->netfs.inode,
cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
return 0;
}
@@ -800,7 +798,7 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
if (!__cap_is_valid(cap))
continue;
dout("__ceph_caps_issued %p cap %p issued %s\n",
- &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
+ &ci->netfs.inode, cap, ceph_cap_string(cap->issued));
have |= cap->issued;
if (implemented)
*implemented |= cap->implemented;
@@ -847,12 +845,12 @@ static void __touch_cap(struct ceph_cap *cap)
spin_lock(&s->s_cap_lock);
if (!s->s_cap_iterator) {
- dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
+ dout("__touch_cap %p cap %p mds%d\n", &cap->ci->netfs.inode, cap,
s->s_mds);
list_move_tail(&cap->session_caps, &s->s_caps);
} else {
dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
- &cap->ci->vfs_inode, cap, s->s_mds);
+ &cap->ci->netfs.inode, cap, s->s_mds);
}
spin_unlock(&s->s_cap_lock);
}
@@ -869,8 +867,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
int have = ci->i_snap_caps;
if ((have & mask) == mask) {
- dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s"
- " (mask %s)\n", ci->vfs_inode.i_ino,
+ dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
+ " (mask %s)\n", ceph_ino(&ci->netfs.inode),
ceph_cap_string(have),
ceph_cap_string(mask));
return 1;
@@ -881,8 +879,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
if (!__cap_is_valid(cap))
continue;
if ((cap->issued & mask) == mask) {
- dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s"
- " (mask %s)\n", ci->vfs_inode.i_ino, cap,
+ dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
+ " (mask %s)\n", ceph_ino(&ci->netfs.inode), cap,
ceph_cap_string(cap->issued),
ceph_cap_string(mask));
if (touch)
@@ -893,8 +891,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
/* does a combination of caps satisfy mask? */
have |= cap->issued;
if ((have & mask) == mask) {
- dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s"
- " (mask %s)\n", ci->vfs_inode.i_ino,
+ dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
+ " (mask %s)\n", ceph_ino(&ci->netfs.inode),
ceph_cap_string(cap->issued),
ceph_cap_string(mask));
if (touch) {
@@ -919,6 +917,20 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
return 0;
}
+int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask,
+ int touch)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
+ int r;
+
+ r = __ceph_caps_issued_mask(ci, mask, touch);
+ if (r)
+ ceph_update_cap_hit(&fsc->mdsc->metric);
+ else
+ ceph_update_cap_mis(&fsc->mdsc->metric);
+ return r;
+}
+
/*
* Return true if mask caps are currently being revoked by an MDS.
*/
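
Note: the new _metric variant is a thin wrapper: it returns exactly what the raw check returns, and additionally bumps a hit or miss counter. A userspace sketch of the idiom (cache_lookup and the counters are hypothetical):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_long hits, misses;

/* Pretend lookup: report whether all bits in mask are locally cached. */
static bool cache_lookup(unsigned cached, unsigned mask)
{
	return (cached & mask) == mask;
}

/* Wrapper: same result as the raw lookup, plus metric accounting. */
static bool cache_lookup_metric(unsigned cached, unsigned mask)
{
	bool r = cache_lookup(cached, mask);

	if (r)
		atomic_fetch_add(&hits, 1);
	else
		atomic_fetch_add(&misses, 1);
	return r;
}

int main(void)
{
	cache_lookup_metric(0x5, 0x1);	/* hit */
	cache_lookup_metric(0x5, 0x2);	/* miss */
	printf("hits=%ld misses=%ld\n",
	       atomic_load(&hits), atomic_load(&misses));
	return 0;
}
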
@@ -939,7 +951,7 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
int ret;
spin_lock(&ci->i_ceph_lock);
@@ -958,29 +970,97 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
if (ci->i_rd_ref)
used |= CEPH_CAP_FILE_RD;
if (ci->i_rdcache_ref ||
- (!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */
- ci->vfs_inode.i_data.nrpages))
+ (S_ISREG(ci->netfs.inode.i_mode) &&
+ ci->netfs.inode.i_data.nrpages))
used |= CEPH_CAP_FILE_CACHE;
if (ci->i_wr_ref)
used |= CEPH_CAP_FILE_WR;
if (ci->i_wb_ref || ci->i_wrbuffer_ref)
used |= CEPH_CAP_FILE_BUFFER;
+ if (ci->i_fx_ref)
+ used |= CEPH_CAP_FILE_EXCL;
return used;
}
+#define FMODE_WAIT_BIAS 1000
+
/*
* wanted, by virtue of open file modes
*/
int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
{
- int i, bits = 0;
- for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
- if (ci->i_nr_by_mode[i])
- bits |= 1 << i;
+ const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN);
+ const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD);
+ const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR);
+ const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY);
+ struct ceph_mount_options *opt =
+ ceph_inode_to_client(&ci->netfs.inode)->mount_options;
+ unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ;
+ unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ;
+
+ if (S_ISDIR(ci->netfs.inode.i_mode)) {
+ int want = 0;
+
+		/* use used_cutoff here to keep dir's wanted caps longer */
+ if (ci->i_nr_by_mode[RD_SHIFT] > 0 ||
+ time_after(ci->i_last_rd, used_cutoff))
+ want |= CEPH_CAP_ANY_SHARED;
+
+ if (ci->i_nr_by_mode[WR_SHIFT] > 0 ||
+ time_after(ci->i_last_wr, used_cutoff)) {
+ want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
+ if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
+ want |= CEPH_CAP_ANY_DIR_OPS;
+ }
+
+ if (want || ci->i_nr_by_mode[PIN_SHIFT] > 0)
+ want |= CEPH_CAP_PIN;
+
+ return want;
+ } else {
+ int bits = 0;
+
+ if (ci->i_nr_by_mode[RD_SHIFT] > 0) {
+ if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS ||
+ time_after(ci->i_last_rd, used_cutoff))
+ bits |= 1 << RD_SHIFT;
+ } else if (time_after(ci->i_last_rd, idle_cutoff)) {
+ bits |= 1 << RD_SHIFT;
+ }
+
+ if (ci->i_nr_by_mode[WR_SHIFT] > 0) {
+ if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS ||
+ time_after(ci->i_last_wr, used_cutoff))
+ bits |= 1 << WR_SHIFT;
+ } else if (time_after(ci->i_last_wr, idle_cutoff)) {
+ bits |= 1 << WR_SHIFT;
+ }
+
+ /* check lazyio only when read/write is wanted */
+ if ((bits & (CEPH_FILE_MODE_RDWR << 1)) &&
+ ci->i_nr_by_mode[LAZY_SHIFT] > 0)
+ bits |= 1 << LAZY_SHIFT;
+
+ return bits ? ceph_caps_for_mode(bits >> 1) : 0;
}
- if (bits == 0)
- return 0;
- return ceph_caps_for_mode(bits >> 1);
+}
+
+/*
+ * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
+ */
+int __ceph_caps_wanted(struct ceph_inode_info *ci)
+{
+ int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
+ if (S_ISDIR(ci->netfs.inode.i_mode)) {
+ /* we want EXCL if holding caps of dir ops */
+ if (w & CEPH_CAP_ANY_DIR_OPS)
+ w |= CEPH_CAP_FILE_EXCL;
+ } else {
+ /* we want EXCL if dirty data */
+ if (w & CEPH_CAP_FILE_BUFFER)
+ w |= CEPH_CAP_FILE_EXCL;
+ }
+ return w;
}
/*
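
Note: the per-mode bookkeeping above uses ffs() to turn each CEPH_FILE_MODE_* bit into an index into i_nr_by_mode[], then rebuilds a wanted-bits mask from the non-zero counters. A simplified, compilable model of that bit math (the mode values and want_bits are illustrative, not the kernel constants):

#include <stdio.h>
#include <strings.h>	/* ffs() */

#define MODE_PIN  1
#define MODE_RD   2
#define MODE_WR   4
#define MODE_LAZY 8

/* One counter slot per mode bit: ffs() maps 1,2,4,8 -> 1,2,3,4. */
static int nr_by_mode[8];

static void mode_get(int mode)
{
	nr_by_mode[ffs(mode)]++;
}

/* Rebuild the wanted mask from whichever counters are non-zero. */
static int want_bits(void)
{
	int bits = 0;

	if (nr_by_mode[ffs(MODE_RD)] > 0)
		bits |= MODE_RD;
	if (nr_by_mode[ffs(MODE_WR)] > 0)
		bits |= MODE_WR;
	if (bits && nr_by_mode[ffs(MODE_LAZY)] > 0)
		bits |= MODE_LAZY;	/* lazyio only with rd/wr, as above */
	return bits;
}

int main(void)
{
	mode_get(MODE_RD);
	mode_get(MODE_RD);
	mode_get(MODE_WR);
	printf("wanted: 0x%x\n", want_bits());
	return 0;
}
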
@@ -1004,14 +1084,6 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
return mds_wanted;
}
-/*
- * called under i_ceph_lock
- */
-static int __ceph_is_single_caps(struct ceph_inode_info *ci)
-{
- return rb_first(&ci->i_caps) == rb_last(&ci->i_caps);
-}
-
int ceph_is_any_caps(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1024,20 +1096,6 @@ int ceph_is_any_caps(struct inode *inode)
return ret;
}
-static void drop_inode_snap_realm(struct ceph_inode_info *ci)
-{
- struct ceph_snap_realm *realm = ci->i_snap_realm;
- spin_lock(&realm->inodes_with_caps_lock);
- list_del_init(&ci->i_snap_realm_item);
- ci->i_snap_realm_counter++;
- ci->i_snap_realm = NULL;
- if (realm->ino == ci->i_vino.ino)
- realm->inode = NULL;
- spin_unlock(&realm->inodes_with_caps_lock);
- ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc,
- realm);
-}
-
/*
* Remove a cap. Take steps to deal with a racing iterate_session_caps.
*
@@ -1048,11 +1106,20 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
{
struct ceph_mds_session *session = cap->session;
struct ceph_inode_info *ci = cap->ci;
- struct ceph_mds_client *mdsc =
- ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+ struct ceph_mds_client *mdsc;
int removed = 0;
- dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
+	/* 'ci' being NULL means the removal has already occurred */
+ if (!ci) {
+ dout("%s: cap inode is NULL\n", __func__);
+ return;
+ }
+
+ lockdep_assert_held(&ci->i_ceph_lock);
+
+ dout("__ceph_remove_cap %p from %p\n", cap, &ci->netfs.inode);
+
+ mdsc = ceph_inode_to_client(&ci->netfs.inode)->mdsc;
/* remove from inode's cap rbtree, and clear auth cap */
rb_erase(&cap->ci_node, &ci->i_caps);
@@ -1068,6 +1135,7 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
} else {
list_del_init(&cap->session_caps);
session->s_nr_caps--;
+ atomic64_dec(&mdsc->metric.total_caps);
cap->session = NULL;
removed = 1;
}
@@ -1079,7 +1147,8 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
* s_cap_gen while session is in the reconnect state.
*/
if (queue_release &&
- (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) {
+ (!session->s_cap_reconnect ||
+ cap->cap_gen == atomic_read(&session->s_cap_gen))) {
cap->queue_release = 1;
if (removed) {
__ceph_queue_cap_release(session, cap);
@@ -1101,12 +1170,34 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
* keep i_snap_realm.
*/
if (ci->i_wr_ref == 0 && ci->i_snap_realm)
- drop_inode_snap_realm(ci);
+ ceph_change_snap_realm(&ci->netfs.inode, NULL);
__cap_delay_cancel(mdsc, ci);
}
}
+void ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
+{
+ struct ceph_inode_info *ci = cap->ci;
+ struct ceph_fs_client *fsc;
+
+	/* 'ci' being NULL means the removal has already occurred */
+ if (!ci) {
+ dout("%s: cap inode is NULL\n", __func__);
+ return;
+ }
+
+ lockdep_assert_held(&ci->i_ceph_lock);
+
+ fsc = ceph_inode_to_client(&ci->netfs.inode);
+ WARN_ON_ONCE(ci->i_auth_cap == cap &&
+ !list_empty(&ci->i_dirty_item) &&
+ !fsc->blocklisted &&
+ !ceph_inode_is_shutdown(&ci->netfs.inode));
+
+ __ceph_remove_cap(cap, queue_release);
+}
+
struct cap_msg_args {
struct ceph_mds_session *session;
u64 ino, cid, follows;
@@ -1114,6 +1205,7 @@ struct cap_msg_args {
u64 xattr_version;
u64 change_attr;
struct ceph_buffer *xattr_buf;
+ struct ceph_buffer *old_xattr_buf;
struct timespec64 atime, mtime, ctime, btime;
int op, caps, wanted, dirty;
u32 seq, issue_seq, mseq, time_warp_seq;
@@ -1122,39 +1214,31 @@ struct cap_msg_args {
kgid_t gid;
umode_t mode;
bool inline_data;
+ bool wake;
};
/*
- * Build and send a cap message to the given MDS.
- *
- * Caller should be holding s_mutex.
+ * cap struct size + flock buffer size + inline version + inline data size +
+ * osd_epoch_barrier + oldest_flush_tid
*/
-static int send_cap_msg(struct cap_msg_args *arg)
+#define CAP_MSG_SIZE (sizeof(struct ceph_mds_caps) + \
+ 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4)
+
+/* Marshal up the cap msg to the MDS */
+static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
{
struct ceph_mds_caps *fc;
- struct ceph_msg *msg;
void *p;
- size_t extra_len;
struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
- dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
- " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
- " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op),
- arg->cid, arg->ino, ceph_cap_string(arg->caps),
- ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty),
- arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid,
- arg->mseq, arg->follows, arg->size, arg->max_size,
- arg->xattr_version,
+ dout("%s %s %llx %llx caps %s wanted %s dirty %s seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu xattr_ver %llu xattr_len %d\n",
+ __func__, ceph_cap_op_name(arg->op), arg->cid, arg->ino,
+ ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted),
+ ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq,
+ arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows,
+ arg->size, arg->max_size, arg->xattr_version,
arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
- /* flock buffer size + inline version + inline data size +
- * osd_epoch_barrier + oldest_flush_tid */
- extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4;
- msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len,
- GFP_NOFS, false);
- if (!msg)
- return -ENOMEM;
-
msg->hdr.version = cpu_to_le16(10);
msg->hdr.tid = cpu_to_le64(arg->flush_tid);
@@ -1226,9 +1310,6 @@ static int send_cap_msg(struct cap_msg_args *arg)
/* Advisory flags (version 10) */
ceph_encode_32(&p, arg->flags);
-
- ceph_con_send(&arg->session->s_con, msg);
- return 0;
}
/*
@@ -1245,124 +1326,98 @@ void __ceph_remove_caps(struct ceph_inode_info *ci)
while (p) {
struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
p = rb_next(p);
- __ceph_remove_cap(cap, true);
+ ceph_remove_cap(cap, true);
}
spin_unlock(&ci->i_ceph_lock);
}
/*
- * Send a cap msg on the given inode. Update our caps state, then
- * drop i_ceph_lock and send the message.
+ * Prepare to send a cap message to an MDS. Update the cap state, and populate
+ * the arg struct with the parameters that will need to be sent. This should
+ * be done under the i_ceph_lock to guard against changes to cap state.
*
* Make note of max_size reported/requested from mds, revoked caps
* that have now been implemented.
- *
- * Return non-zero if delayed release, or we experienced an error
- * such that the caller should requeue + retry later.
- *
- * called with i_ceph_lock, then drops it.
- * caller should hold snap_rwsem (read), s_mutex.
*/
-static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
- int op, int flags, int used, int want, int retain,
- int flushing, u64 flush_tid, u64 oldest_flush_tid)
- __releases(cap->ci->i_ceph_lock)
+static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
+ int op, int flags, int used, int want, int retain,
+ int flushing, u64 flush_tid, u64 oldest_flush_tid)
{
struct ceph_inode_info *ci = cap->ci;
- struct inode *inode = &ci->vfs_inode;
- struct ceph_buffer *old_blob = NULL;
- struct cap_msg_args arg;
+ struct inode *inode = &ci->netfs.inode;
int held, revoking;
- int wake = 0;
- int delayed = 0;
- int ret;
+
+ lockdep_assert_held(&ci->i_ceph_lock);
held = cap->issued | cap->implemented;
revoking = cap->implemented & ~cap->issued;
retain &= ~revoking;
- dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
- inode, cap, cap->session,
+ dout("%s %p cap %p session %p %s -> %s (revoking %s)\n",
+ __func__, inode, cap, cap->session,
ceph_cap_string(held), ceph_cap_string(held & retain),
ceph_cap_string(revoking));
BUG_ON((retain & CEPH_CAP_PIN) == 0);
- arg.session = cap->session;
-
- /* don't release wanted unless we've waited a bit. */
- if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
- time_before(jiffies, ci->i_hold_caps_min)) {
- dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
- ceph_cap_string(cap->issued),
- ceph_cap_string(cap->issued & retain),
- ceph_cap_string(cap->mds_wanted),
- ceph_cap_string(want));
- want |= cap->mds_wanted;
- retain |= cap->issued;
- delayed = 1;
- }
- ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
- if (want & ~cap->mds_wanted) {
- /* user space may open/close single file frequently.
- * This avoids droping mds_wanted immediately after
- * requesting new mds_wanted.
- */
- __cap_set_timeouts(mdsc, ci);
- }
+ ci->i_ceph_flags &= ~CEPH_I_FLUSH;
cap->issued &= retain; /* drop bits we don't want */
- if (cap->implemented & ~cap->issued) {
- /*
- * Wake up any waiters on wanted -> needed transition.
- * This is due to the weird transition from buffered
- * to sync IO... we need to flush dirty pages _before_
- * allowing sync writes to avoid reordering.
- */
- wake = 1;
- }
+ /*
+ * Wake up any waiters on wanted -> needed transition. This is due to
+ * the weird transition from buffered to sync IO... we need to flush
+ * dirty pages _before_ allowing sync writes to avoid reordering.
+ */
+ arg->wake = cap->implemented & ~cap->issued;
cap->implemented &= cap->issued | used;
cap->mds_wanted = want;
- arg.ino = ceph_vino(inode).ino;
- arg.cid = cap->cap_id;
- arg.follows = flushing ? ci->i_head_snapc->seq : 0;
- arg.flush_tid = flush_tid;
- arg.oldest_flush_tid = oldest_flush_tid;
-
- arg.size = inode->i_size;
- ci->i_reported_size = arg.size;
- arg.max_size = ci->i_wanted_max_size;
- ci->i_requested_max_size = arg.max_size;
+ arg->session = cap->session;
+ arg->ino = ceph_vino(inode).ino;
+ arg->cid = cap->cap_id;
+ arg->follows = flushing ? ci->i_head_snapc->seq : 0;
+ arg->flush_tid = flush_tid;
+ arg->oldest_flush_tid = oldest_flush_tid;
+
+ arg->size = i_size_read(inode);
+ ci->i_reported_size = arg->size;
+ arg->max_size = ci->i_wanted_max_size;
+ if (cap == ci->i_auth_cap) {
+ if (want & CEPH_CAP_ANY_FILE_WR)
+ ci->i_requested_max_size = arg->max_size;
+ else
+ ci->i_requested_max_size = 0;
+ }
if (flushing & CEPH_CAP_XATTR_EXCL) {
- old_blob = __ceph_build_xattrs_blob(ci);
- arg.xattr_version = ci->i_xattrs.version;
- arg.xattr_buf = ci->i_xattrs.blob;
+ arg->old_xattr_buf = __ceph_build_xattrs_blob(ci);
+ arg->xattr_version = ci->i_xattrs.version;
+ arg->xattr_buf = ci->i_xattrs.blob;
} else {
- arg.xattr_buf = NULL;
+ arg->xattr_buf = NULL;
+ arg->old_xattr_buf = NULL;
}
- arg.mtime = inode->i_mtime;
- arg.atime = inode->i_atime;
- arg.ctime = inode->i_ctime;
- arg.btime = ci->i_btime;
- arg.change_attr = inode_peek_iversion_raw(inode);
+ arg->mtime = inode->i_mtime;
+ arg->atime = inode->i_atime;
+ arg->ctime = inode->i_ctime;
+ arg->btime = ci->i_btime;
+ arg->change_attr = inode_peek_iversion_raw(inode);
- arg.op = op;
- arg.caps = cap->implemented;
- arg.wanted = want;
- arg.dirty = flushing;
+ arg->op = op;
+ arg->caps = cap->implemented;
+ arg->wanted = want;
+ arg->dirty = flushing;
- arg.seq = cap->seq;
- arg.issue_seq = cap->issue_seq;
- arg.mseq = cap->mseq;
- arg.time_warp_seq = ci->i_time_warp_seq;
+ arg->seq = cap->seq;
+ arg->issue_seq = cap->issue_seq;
+ arg->mseq = cap->mseq;
+ arg->time_warp_seq = ci->i_time_warp_seq;
- arg.uid = inode->i_uid;
- arg.gid = inode->i_gid;
- arg.mode = inode->i_mode;
+ arg->uid = inode->i_uid;
+ arg->gid = inode->i_gid;
+ arg->mode = inode->i_mode;
- arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
+ arg->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) &&
!list_empty(&ci->i_cap_snaps)) {
struct ceph_cap_snap *capsnap;
@@ -1375,22 +1430,35 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
}
}
}
- arg.flags = flags;
-
- spin_unlock(&ci->i_ceph_lock);
+ arg->flags = flags;
+}
- ceph_buffer_put(old_blob);
+/*
+ * Send a cap msg on the given inode.
+ *
+ * Caller should hold snap_rwsem (read), s_mutex.
+ */
+static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
+{
+ struct ceph_msg *msg;
+ struct inode *inode = &ci->netfs.inode;
- ret = send_cap_msg(&arg);
- if (ret < 0) {
- dout("error sending cap msg, must requeue %p\n", inode);
- delayed = 1;
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
+ if (!msg) {
+ pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n",
+ ceph_vinop(inode), ceph_cap_string(arg->dirty),
+ arg->flush_tid);
+ spin_lock(&ci->i_ceph_lock);
+ __cap_delay_requeue(arg->session->s_mdsc, ci);
+ spin_unlock(&ci->i_ceph_lock);
+ return;
}
- if (wake)
+ encode_cap_msg(msg, arg);
+ ceph_con_send(&arg->session->s_con, msg);
+ ceph_buffer_put(arg->old_xattr_buf);
+ if (arg->wake)
wake_up_all(&ci->i_cap_wq);
-
- return delayed;
}
static inline int __send_flush_snap(struct inode *inode,
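
Note: the refactor above splits the old send_cap_msg() into a prepare step done under i_ceph_lock, a fallible allocation, and an encode step that cannot fail because CAP_MSG_SIZE bounds the buffer up front. A userspace sketch of that allocate-then-encode-then-send shape (all names illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MSG_SIZE 128	/* fixed upper bound, like CAP_MSG_SIZE */

struct msg { size_t len; unsigned char buf[MSG_SIZE]; };

static struct msg *msg_new(void)
{
	return calloc(1, sizeof(struct msg));
}

/* Encoding cannot fail: the buffer is known to be large enough. */
static void encode_msg(struct msg *m, const char *payload)
{
	m->len = strnlen(payload, MSG_SIZE);
	memcpy(m->buf, payload, m->len);
}

int main(void)
{
	struct msg *m = msg_new();

	if (!m) {
		/* allocation is the only failure: requeue, don't error out */
		fprintf(stderr, "no memory, requeuing\n");
		return 1;
	}
	encode_msg(m, "cap update");
	printf("sending %zu bytes\n", m->len);
	free(m);
	return 0;
}
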
@@ -1399,6 +1467,11 @@ static inline int __send_flush_snap(struct inode *inode,
u32 mseq, u64 oldest_flush_tid)
{
struct cap_msg_args arg;
+ struct ceph_msg *msg;
+
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
+ if (!msg)
+ return -ENOMEM;
arg.session = session;
arg.ino = ceph_vino(inode).ino;
@@ -1411,6 +1484,7 @@ static inline int __send_flush_snap(struct inode *inode,
arg.max_size = 0;
arg.xattr_version = capsnap->xattr_version;
arg.xattr_buf = capsnap->xattr_blob;
+ arg.old_xattr_buf = NULL;
arg.atime = capsnap->atime;
arg.mtime = capsnap->mtime;
@@ -1434,8 +1508,11 @@ static inline int __send_flush_snap(struct inode *inode,
arg.inline_data = capsnap->inline_data;
arg.flags = 0;
+ arg.wake = false;
- return send_cap_msg(&arg);
+ encode_cap_msg(msg, &arg);
+ ceph_con_send(&arg.session->s_con, msg);
+ return 0;
}
/*
@@ -1445,14 +1522,14 @@ static inline int __send_flush_snap(struct inode *inode,
* asynchronously back to the MDS once sync writes complete and dirty
* data is written out.
*
- * Called under i_ceph_lock. Takes s_mutex as needed.
+ * Called under i_ceph_lock.
*/
static void __ceph_flush_snaps(struct ceph_inode_info *ci,
struct ceph_mds_session *session)
__releases(ci->i_ceph_lock)
__acquires(ci->i_ceph_lock)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = session->s_mdsc;
struct ceph_cap_snap *capsnap;
u64 oldest_flush_tid = 0;
@@ -1501,7 +1578,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
while (first_tid <= last_tid) {
struct ceph_cap *cap = ci->i_auth_cap;
- struct ceph_cap_flush *cf;
+ struct ceph_cap_flush *cf = NULL, *iter;
int ret;
if (!(cap && cap->session == session)) {
@@ -1511,8 +1588,9 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
}
ret = -ENOENT;
- list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
- if (cf->tid >= first_tid) {
+ list_for_each_entry(iter, &ci->i_cap_flush_list, i_list) {
+ if (iter->tid >= first_tid) {
+ cf = iter;
ret = 0;
break;
}
@@ -1545,7 +1623,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
void ceph_flush_snaps(struct ceph_inode_info *ci,
struct ceph_mds_session **psession)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_mds_session *session = NULL;
int mds;
@@ -1567,7 +1645,6 @@ retry:
mds = ci->i_auth_cap->session->s_mds;
if (session && session->s_mds != mds) {
dout(" oops, wrong session %p mutex\n", session);
- mutex_unlock(&session->s_mutex);
ceph_put_mds_session(session);
session = NULL;
}
@@ -1576,10 +1653,6 @@ retry:
mutex_lock(&mdsc->mutex);
session = __ceph_lookup_mds_session(mdsc, mds);
mutex_unlock(&mdsc->mutex);
- if (session) {
- dout(" inverting session/ino locks on %p\n", session);
- mutex_lock(&session->s_mutex);
- }
goto retry;
}
@@ -1591,12 +1664,10 @@ retry:
out:
spin_unlock(&ci->i_ceph_lock);
- if (psession) {
+ if (psession)
*psession = session;
- } else if (session) {
- mutex_unlock(&session->s_mutex);
+ else
ceph_put_mds_session(session);
- }
/* we flushed them all; remove this inode from the queue */
spin_lock(&mdsc->snap_flush_lock);
list_del_init(&ci->i_snap_flush_item);
@@ -1612,11 +1683,13 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
struct ceph_cap_flush **pcf)
{
struct ceph_mds_client *mdsc =
- ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
- struct inode *inode = &ci->vfs_inode;
+ ceph_sb_to_client(ci->netfs.inode.i_sb)->mdsc;
+ struct inode *inode = &ci->netfs.inode;
int was = ci->i_dirty_caps;
int dirty = 0;
+ lockdep_assert_held(&ci->i_ceph_lock);
+
if (!ci->i_auth_cap) {
pr_warn("__mark_dirty_caps %p %llx mask %s, "
"but no auth cap (session was closed?)\n",
@@ -1624,11 +1697,13 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
return 0;
}
- dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
+ dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->netfs.inode,
ceph_cap_string(mask), ceph_cap_string(was),
ceph_cap_string(was | mask));
ci->i_dirty_caps |= mask;
if (was == 0) {
+ struct ceph_mds_session *session = ci->i_auth_cap->session;
+
WARN_ON_ONCE(ci->i_prealloc_cap_flush);
swap(ci->i_prealloc_cap_flush, *pcf);
@@ -1638,10 +1713,10 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
ci->i_snap_realm->cached_context);
}
dout(" inode %p now dirty snapc %p auth cap %p\n",
- &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
+ &ci->netfs.inode, ci->i_head_snapc, ci->i_auth_cap);
BUG_ON(!list_empty(&ci->i_dirty_item));
spin_lock(&mdsc->cap_dirty_lock);
- list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
+ list_add(&ci->i_dirty_item, &session->s_cap_dirty);
spin_unlock(&mdsc->cap_dirty_lock);
if (ci->i_flushing_caps == 0) {
ihold(inode);
@@ -1654,13 +1729,20 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
(mask & CEPH_CAP_FILE_BUFFER))
dirty |= I_DIRTY_DATASYNC;
- __cap_delay_requeue(mdsc, ci, true);
+ __cap_delay_requeue(mdsc, ci);
return dirty;
}
struct ceph_cap_flush *ceph_alloc_cap_flush(void)
{
- return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
+ struct ceph_cap_flush *cf;
+
+ cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
+ if (!cf)
+ return NULL;
+
+ cf->is_capsnap = false;
+ return cf;
}
void ceph_free_cap_flush(struct ceph_cap_flush *cf)
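
Note: ceph_alloc_cap_flush() now establishes the is_capsnap invariant inside the allocator rather than at each call site. A small sketch of the wrapper (toy struct, illustrative):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct cap_flush { long tid; bool is_capsnap; };

static struct cap_flush *alloc_cap_flush(void)
{
	struct cap_flush *cf = malloc(sizeof(*cf));

	if (!cf)
		return NULL;
	cf->is_capsnap = false;	/* invariant set at allocation time */
	return cf;
}

int main(void)
{
	struct cap_flush *cf = alloc_cap_flush();

	if (cf)
		printf("is_capsnap starts %d\n", cf->is_capsnap);
	free(cf);
	return 0;
}
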
@@ -1684,30 +1766,33 @@ static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
* Remove cap_flush from the mdsc's or inode's flushing cap list.
* Return true if caller needs to wake up flush waiters.
*/
-static bool __finish_cap_flush(struct ceph_mds_client *mdsc,
- struct ceph_inode_info *ci,
- struct ceph_cap_flush *cf)
+static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc,
+ struct ceph_cap_flush *cf)
{
struct ceph_cap_flush *prev;
bool wake = cf->wake;
- if (mdsc) {
- /* are there older pending cap flushes? */
- if (wake && cf->g_list.prev != &mdsc->cap_flush_list) {
- prev = list_prev_entry(cf, g_list);
- prev->wake = true;
- wake = false;
- }
- list_del(&cf->g_list);
- } else if (ci) {
- if (wake && cf->i_list.prev != &ci->i_cap_flush_list) {
- prev = list_prev_entry(cf, i_list);
- prev->wake = true;
- wake = false;
- }
- list_del(&cf->i_list);
- } else {
- BUG_ON(1);
+
+ if (wake && cf->g_list.prev != &mdsc->cap_flush_list) {
+ prev = list_prev_entry(cf, g_list);
+ prev->wake = true;
+ wake = false;
}
+ list_del_init(&cf->g_list);
+ return wake;
+}
+
+static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci,
+ struct ceph_cap_flush *cf)
+{
+ struct ceph_cap_flush *prev;
+ bool wake = cf->wake;
+
+ if (wake && cf->i_list.prev != &ci->i_cap_flush_list) {
+ prev = list_prev_entry(cf, i_list);
+ prev->wake = true;
+ wake = false;
+ }
+ list_del_init(&cf->i_list);
return wake;
}
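
Note: both detach helpers share the wake-propagation idiom: if an entry with wake duty is removed while older flushes are still pending, the duty is handed to the previous entry rather than waking waiters early. A compilable toy model:

#include <stdbool.h>
#include <stdio.h>

struct flush { bool wake; };

/* Detach entry i (older flushes sit at lower indices); returns true
 * only when it is safe to wake waiters now. */
static bool detach(struct flush *arr, int i)
{
	bool wake = arr[i].wake;

	if (wake && i > 0) {		/* an older flush is still pending */
		arr[i - 1].wake = true;	/* hand the wake duty backwards */
		wake = false;
	}
	return wake;
}

int main(void)
{
	struct flush arr[3] = { {false}, {false}, {true} };

	printf("%d\n", detach(arr, 2));	/* 0: duty passed to arr[1] */
	printf("%d\n", detach(arr, 1));	/* 0: duty passed to arr[0] */
	printf("%d\n", detach(arr, 0));	/* 1: oldest flush done, wake */
	return 0;
}
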
@@ -1726,6 +1811,7 @@ static u64 __mark_caps_flushing(struct inode *inode,
struct ceph_cap_flush *cf = NULL;
int flushing;
+ lockdep_assert_held(&ci->i_ceph_lock);
BUG_ON(ci->i_dirty_caps == 0);
BUG_ON(list_empty(&ci->i_dirty_item));
BUG_ON(!ci->i_prealloc_cap_flush);
@@ -1765,11 +1851,14 @@ static u64 __mark_caps_flushing(struct inode *inode,
* try to invalidate mapping pages without blocking.
*/
static int try_nonblocking_invalidate(struct inode *inode)
+ __releases(ci->i_ceph_lock)
+ __acquires(ci->i_ceph_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
u32 invalidating_gen = ci->i_rdcache_gen;
spin_unlock(&ci->i_ceph_lock);
+ ceph_fscache_invalidate(inode, false);
invalidate_mapping_pages(&inode->i_data, 0, -1);
spin_lock(&ci->i_ceph_lock);
@@ -1787,7 +1876,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
bool __ceph_should_report_size(struct ceph_inode_info *ci)
{
- loff_t size = ci->vfs_inode.i_size;
+ loff_t size = i_size_read(&ci->netfs.inode);
/* mds will adjust max size according to the reported size */
if (ci->i_flushing_caps & CEPH_CAP_FILE_WR)
return false;
@@ -1805,8 +1894,6 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
* versus held caps. Release, flush, ack revoked caps to mds as
* appropriate.
*
- * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
- * cap release further.
* CHECK_CAPS_AUTHONLY - we should only check the auth cap
* CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
* further delay.
@@ -1814,45 +1901,51 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
- struct ceph_mds_client *mdsc = fsc->mdsc;
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_cap *cap;
u64 flush_tid, oldest_flush_tid;
int file_wanted, used, cap_used;
- int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
int issued, implemented, want, retain, revoking, flushing = 0;
int mds = -1; /* keep track of how far we've gone through i_caps list
to avoid an infinite loop on retry */
struct rb_node *p;
- int delayed = 0, sent = 0;
- bool no_delay = flags & CHECK_CAPS_NODELAY;
bool queue_invalidate = false;
bool tried_invalidate = false;
+ bool queue_writeback = false;
- /* if we are unmounting, flush any unused caps immediately. */
- if (mdsc->stopping)
- no_delay = true;
+ if (session)
+ ceph_get_mds_session(session);
spin_lock(&ci->i_ceph_lock);
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
+ /* Don't send messages until we get async create reply */
+ spin_unlock(&ci->i_ceph_lock);
+ ceph_put_mds_session(session);
+ return;
+ }
if (ci->i_ceph_flags & CEPH_I_FLUSH)
flags |= CHECK_CAPS_FLUSH;
-
- if (!(flags & CHECK_CAPS_AUTHONLY) ||
- (ci->i_auth_cap && __ceph_is_single_caps(ci)))
- __cap_delay_cancel(mdsc, ci);
-
- goto retry_locked;
retry:
- spin_lock(&ci->i_ceph_lock);
-retry_locked:
+ /* Caps wanted by virtue of active open files. */
file_wanted = __ceph_caps_file_wanted(ci);
+
+ /* Caps which have active references against them */
used = __ceph_caps_used(ci);
+
+ /*
+ * "issued" represents the current caps that the MDS wants us to have.
+ * "implemented" is the set that we have been granted, and includes the
+ * ones that have not yet been returned to the MDS (the "revoking" set,
+ * usually because they have outstanding references).
+ */
issued = __ceph_caps_issued(ci, &implemented);
revoking = implemented & ~issued;
want = file_wanted;
+
+ /* The ones we currently want to retain (may be adjusted below) */
retain = file_wanted | used | CEPH_CAP_PIN;
if (!mdsc->stopping && inode->i_nlink > 0) {
if (file_wanted) {
@@ -1866,10 +1959,11 @@ retry_locked:
* revoking the shared cap on every create/unlink
* operation.
*/
- if (IS_RDONLY(inode))
+ if (IS_RDONLY(inode)) {
want = CEPH_CAP_ANY_SHARED;
- else
- want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
+ } else {
+ want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
+ }
retain |= want;
} else {
@@ -1884,40 +1978,44 @@ retry_locked:
}
}
- dout("check_caps %p file_want %s used %s dirty %s flushing %s"
- " issued %s revoking %s retain %s %s%s%s\n", inode,
+ dout("check_caps %llx.%llx file_want %s used %s dirty %s flushing %s"
+ " issued %s revoking %s retain %s %s%s%s\n", ceph_vinop(inode),
ceph_cap_string(file_wanted),
ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
ceph_cap_string(ci->i_flushing_caps),
ceph_cap_string(issued), ceph_cap_string(revoking),
ceph_cap_string(retain),
(flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
- (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
- (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
+ (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "",
+ (flags & CHECK_CAPS_NOINVAL) ? " NOINVAL" : "");
/*
* If we no longer need to hold onto old our caps, and we may
* have cached pages, but don't want them, then try to invalidate.
* If we fail, it's because pages are locked.... try again later.
*/
- if ((!no_delay || mdsc->stopping) &&
- !S_ISDIR(inode->i_mode) && /* ignore readdir cache */
+ if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) &&
+ S_ISREG(inode->i_mode) &&
!(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */
inode->i_data.nrpages && /* have cached pages */
(revoking & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */
!tried_invalidate) {
- dout("check_caps trying to invalidate on %p\n", inode);
+ dout("check_caps trying to invalidate on %llx.%llx\n",
+ ceph_vinop(inode));
if (try_nonblocking_invalidate(inode) < 0) {
dout("check_caps queuing invalidate\n");
queue_invalidate = true;
ci->i_rdcache_revoking = ci->i_rdcache_gen;
}
tried_invalidate = true;
- goto retry_locked;
+ goto retry;
}
for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ int mflags = 0;
+ struct cap_msg_args arg;
+
cap = rb_entry(p, struct ceph_cap, ci_node);
/* avoid looping forever */
@@ -1925,8 +2023,10 @@ retry_locked:
((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
continue;
- /* NOTE: no side-effects allowed, until we take s_mutex */
-
+ /*
+ * If we have an auth cap, we don't need to consider any
+ * overlapping caps as used.
+ */
cap_used = used;
if (ci->i_auth_cap && cap != ci->i_auth_cap)
cap_used &= ~ci->i_auth_cap->issued;
@@ -1966,55 +2066,44 @@ retry_locked:
}
/* completed revocation? going down and there are no caps? */
- if (revoking && (revoking & cap_used) == 0) {
- dout("completed revocation of %s\n",
- ceph_cap_string(cap->implemented & ~cap->issued));
- goto ack;
+ if (revoking) {
+ if ((revoking & cap_used) == 0) {
+ dout("completed revocation of %s\n",
+ ceph_cap_string(cap->implemented & ~cap->issued));
+ goto ack;
+ }
+
+ /*
+	 * If the "i_wrbuffer_ref" was increased by mmap or a generic
+	 * cache write just before ceph_check_caps() was called, then
+	 * revoking the Fb capability will fail this time around. We
+	 * must then wait for the BDI's delayed work to flush the dirty
+	 * pages and release the "i_wrbuffer_ref", which can take up to
+	 * 5 seconds. That means the MDS may have to wait up to 5
+	 * seconds for the Fb capability revocation to finish.
+ * Let's queue a writeback for it.
+ */
+ if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref &&
+ (revoking & CEPH_CAP_FILE_BUFFER))
+ queue_writeback = true;
}
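
The queue_writeback flag set above is only consumed once i_ceph_lock has
been dropped, at the bottom of ceph_check_caps(); a minimal sketch of that
tail end:

	spin_unlock(&ci->i_ceph_lock);
	if (queue_writeback)
		ceph_queue_writeback(inode);	/* flush the Fb-dirty pages now
						 * rather than waiting ~5s for
						 * the BDI's delayed work */
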
/* want more caps from mds? */
- if (want & ~(cap->mds_wanted | cap->issued))
- goto ack;
+ if (want & ~cap->mds_wanted) {
+ if (want & ~(cap->mds_wanted | cap->issued))
+ goto ack;
+ if (!__cap_is_valid(cap))
+ goto ack;
+ }
/* things we might delay */
if ((cap->issued & ~retain) == 0)
continue; /* nope, all good */
- if (no_delay)
- goto ack;
-
- /* delay? */
- if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
- time_before(jiffies, ci->i_hold_caps_max)) {
- dout(" delaying issued %s -> %s, wanted %s -> %s\n",
- ceph_cap_string(cap->issued),
- ceph_cap_string(cap->issued & retain),
- ceph_cap_string(cap->mds_wanted),
- ceph_cap_string(want));
- delayed++;
- continue;
- }
-
ack:
- if (session && session != cap->session) {
- dout("oops, wrong session %p mutex\n", session);
- mutex_unlock(&session->s_mutex);
- session = NULL;
- }
- if (!session) {
- session = cap->session;
- if (mutex_trylock(&session->s_mutex) == 0) {
- dout("inverting session/ino locks on %p\n",
- session);
- spin_unlock(&ci->i_ceph_lock);
- if (took_snap_rwsem) {
- up_read(&mdsc->snap_rwsem);
- took_snap_rwsem = 0;
- }
- mutex_lock(&session->s_mutex);
- goto retry;
- }
- }
+ ceph_put_mds_session(session);
+ session = ceph_get_mds_session(cap->session);
/* kick flushing and flush snaps before sending normal
* cap message */
@@ -2026,26 +2115,16 @@ ack:
if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
__ceph_flush_snaps(ci, session);
- goto retry_locked;
- }
-
- /* take snap_rwsem after session mutex */
- if (!took_snap_rwsem) {
- if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
- dout("inverting snap/in locks on %p\n",
- inode);
- spin_unlock(&ci->i_ceph_lock);
- down_read(&mdsc->snap_rwsem);
- took_snap_rwsem = 1;
- goto retry;
- }
- took_snap_rwsem = 1;
+ goto retry;
}
if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
flushing = ci->i_dirty_caps;
flush_tid = __mark_caps_flushing(inode, session, false,
&oldest_flush_tid);
+ if (flags & CHECK_CAPS_FLUSH &&
+ list_empty(&session->s_cap_dirty))
+ mflags |= CEPH_CLIENT_CAPS_SYNC;
} else {
flushing = 0;
flush_tid = 0;
@@ -2055,28 +2134,32 @@ ack:
}
mds = cap->mds; /* remember mds, so we don't repeat */
- sent++;
- /* __send_cap drops i_ceph_lock */
- delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0,
- cap_used, want, retain, flushing,
- flush_tid, oldest_flush_tid);
+ __prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, mflags, cap_used,
+ want, retain, flushing, flush_tid, oldest_flush_tid);
+
+ spin_unlock(&ci->i_ceph_lock);
+ __send_cap(&arg, ci);
+ spin_lock(&ci->i_ceph_lock);
+
goto retry; /* retake i_ceph_lock and restart our cap scan. */
}
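
The pattern above is the heart of this rework: __prep_cap() snapshots
everything the cap message needs while i_ceph_lock is held, and
__send_cap() then builds and sends the message with no spinlock held, so
it may safely block or allocate. Distilled to a sketch:

	struct cap_msg_args arg;

	spin_lock(&ci->i_ceph_lock);
	__prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, mflags, cap_used,
		   want, retain, flushing, flush_tid, oldest_flush_tid);
	spin_unlock(&ci->i_ceph_lock);
	__send_cap(&arg, ci);		/* no locks held; may block */
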
- /* Reschedule delayed caps release if we delayed anything */
- if (delayed)
- __cap_delay_requeue(mdsc, ci, false);
+ /* periodically re-calculate caps wanted by open files */
+ if (__ceph_is_any_real_caps(ci) &&
+ list_empty(&ci->i_cap_delay_list) &&
+ (file_wanted & ~CEPH_CAP_PIN) &&
+ !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+ __cap_delay_requeue(mdsc, ci);
+ }
spin_unlock(&ci->i_ceph_lock);
+ ceph_put_mds_session(session);
+ if (queue_writeback)
+ ceph_queue_writeback(inode);
if (queue_invalidate)
ceph_queue_invalidate(inode);
-
- if (session)
- mutex_unlock(&session->s_mutex);
- if (took_snap_rwsem)
- up_read(&mdsc->snap_rwsem);
}
/*
@@ -2086,26 +2169,17 @@ static int try_flush_caps(struct inode *inode, u64 *ptid)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_session *session = NULL;
int flushing = 0;
u64 flush_tid = 0, oldest_flush_tid = 0;
-retry:
spin_lock(&ci->i_ceph_lock);
retry_locked:
if (ci->i_dirty_caps && ci->i_auth_cap) {
struct ceph_cap *cap = ci->i_auth_cap;
- int delayed;
+ struct cap_msg_args arg;
+ struct ceph_mds_session *session = cap->session;
- if (session != cap->session) {
- spin_unlock(&ci->i_ceph_lock);
- if (session)
- mutex_unlock(&session->s_mutex);
- session = cap->session;
- mutex_lock(&session->s_mutex);
- goto retry;
- }
- if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) {
+ if (session->s_state < CEPH_MDS_SESSION_OPEN) {
spin_unlock(&ci->i_ceph_lock);
goto out;
}
@@ -2123,19 +2197,13 @@ retry_locked:
flush_tid = __mark_caps_flushing(inode, session, true,
&oldest_flush_tid);
- /* __send_cap drops i_ceph_lock */
- delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
- CEPH_CLIENT_CAPS_SYNC,
- __ceph_caps_used(ci),
- __ceph_caps_wanted(ci),
- (cap->issued | cap->implemented),
- flushing, flush_tid, oldest_flush_tid);
+ __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC,
+ __ceph_caps_used(ci), __ceph_caps_wanted(ci),
+ (cap->issued | cap->implemented),
+ flushing, flush_tid, oldest_flush_tid);
+ spin_unlock(&ci->i_ceph_lock);
- if (delayed) {
- spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci, true);
- spin_unlock(&ci->i_ceph_lock);
- }
+ __send_cap(&arg, ci);
} else {
if (!list_empty(&ci->i_cap_flush_list)) {
struct ceph_cap_flush *cf =
@@ -2148,9 +2216,6 @@ retry_locked:
spin_unlock(&ci->i_ceph_lock);
}
out:
- if (session)
- mutex_unlock(&session->s_mutex);
-
*ptid = flush_tid;
return flushing;
}
@@ -2176,12 +2241,14 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
}
/*
- * wait for any unsafe requests to complete.
+ * flush the mdlog and wait for any unsafe requests to complete.
*/
-static int unsafe_request_wait(struct inode *inode)
+static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req1 = NULL, *req2 = NULL;
+ unsigned int max_sessions;
int ret, err = 0;
spin_lock(&ci->i_unsafe_lock);
@@ -2199,28 +2266,124 @@ static int unsafe_request_wait(struct inode *inode)
}
spin_unlock(&ci->i_unsafe_lock);
- dout("unsafe_request_wait %p wait on tid %llu %llu\n",
+ /*
+ * mdsc->max_sessions is unlikely to change, but if it does
+ * we retry by reallocating the sessions array rather than
+ * serializing on the mdsc->mutex lock.
+ */
+retry:
+ max_sessions = mdsc->max_sessions;
+
+ /*
+ * Manually trigger a journal log flush in all the relevant MDSes;
+ * otherwise, in the worst case, we would have to wait up to 5
+ * seconds for the MDSes to flush their journal logs periodically.
+ */
+ if ((req1 || req2) && likely(max_sessions)) {
+ struct ceph_mds_session **sessions = NULL;
+ struct ceph_mds_session *s;
+ struct ceph_mds_request *req;
+ int i;
+
+ sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL);
+ if (!sessions) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ spin_lock(&ci->i_unsafe_lock);
+ if (req1) {
+ list_for_each_entry(req, &ci->i_unsafe_dirops,
+ r_unsafe_dir_item) {
+ s = req->r_session;
+ if (!s)
+ continue;
+ if (unlikely(s->s_mds >= max_sessions)) {
+ spin_unlock(&ci->i_unsafe_lock);
+ for (i = 0; i < max_sessions; i++) {
+ s = sessions[i];
+ if (s)
+ ceph_put_mds_session(s);
+ }
+ kfree(sessions);
+ goto retry;
+ }
+ if (!sessions[s->s_mds]) {
+ s = ceph_get_mds_session(s);
+ sessions[s->s_mds] = s;
+ }
+ }
+ }
+ if (req2) {
+ list_for_each_entry(req, &ci->i_unsafe_iops,
+ r_unsafe_target_item) {
+ s = req->r_session;
+ if (!s)
+ continue;
+ if (unlikely(s->s_mds >= max_sessions)) {
+ spin_unlock(&ci->i_unsafe_lock);
+ for (i = 0; i < max_sessions; i++) {
+ s = sessions[i];
+ if (s)
+ ceph_put_mds_session(s);
+ }
+ kfree(sessions);
+ goto retry;
+ }
+ if (!sessions[s->s_mds]) {
+ s = ceph_get_mds_session(s);
+ sessions[s->s_mds] = s;
+ }
+ }
+ }
+ spin_unlock(&ci->i_unsafe_lock);
+
+ /* the auth MDS */
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_auth_cap) {
+ s = ci->i_auth_cap->session;
+ if (!sessions[s->s_mds])
+ sessions[s->s_mds] = ceph_get_mds_session(s);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
+ /* send flush mdlog request to MDSes */
+ for (i = 0; i < max_sessions; i++) {
+ s = sessions[i];
+ if (s) {
+ send_flush_mdlog(s);
+ ceph_put_mds_session(s);
+ }
+ }
+ kfree(sessions);
+ }
+
+ dout("%s %p wait on tid %llu %llu\n", __func__,
inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
if (req1) {
ret = !wait_for_completion_timeout(&req1->r_safe_completion,
ceph_timeout_jiffies(req1->r_timeout));
if (ret)
err = -EIO;
- ceph_mdsc_put_request(req1);
}
if (req2) {
ret = !wait_for_completion_timeout(&req2->r_safe_completion,
ceph_timeout_jiffies(req2->r_timeout));
if (ret)
err = -EIO;
- ceph_mdsc_put_request(req2);
}
+
+out:
+ if (req1)
+ ceph_mdsc_put_request(req1);
+ if (req2)
+ ceph_mdsc_put_request(req2);
return err;
}
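
The max_sessions handling above is an optimistic snapshot-and-revalidate
pattern: take an unlocked snapshot, size the array from it, and retry if
the snapshot turns out to be stale, instead of pinning the value with
mdsc->mutex. A hypothetical helper showing just that pattern (the real
code revalidates s->s_mds against the snapshot while walking the
requests):

	static struct ceph_mds_session **snapshot_sessions(
			struct ceph_mds_client *mdsc, unsigned int *max)
	{
		struct ceph_mds_session **arr;

	retry:
		*max = mdsc->max_sessions;	/* unlocked snapshot */
		arr = kcalloc(*max, sizeof(*arr), GFP_KERNEL);
		if (!arr)
			return NULL;
		if (READ_ONCE(mdsc->max_sessions) > *max) {
			kfree(arr);		/* grew behind our back */
			goto retry;
		}
		return arr;
	}
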
int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct ceph_file_info *fi = file->private_data;
struct inode *inode = file->f_mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
u64 flush_tid;
@@ -2233,10 +2396,14 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
if (datasync)
goto out;
+ ret = ceph_wait_on_async_create(inode);
+ if (ret)
+ goto out;
+
dirty = try_flush_caps(inode, &flush_tid);
dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
- err = unsafe_request_wait(inode);
+ err = flush_mdlog_and_wait_inode_unsafe_requests(inode);
/*
* only wait on non-file metadata writeback (the mds
@@ -2251,14 +2418,9 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
if (err < 0)
ret = err;
- if (errseq_check(&ci->i_meta_err, READ_ONCE(fi->meta_err))) {
- spin_lock(&file->f_lock);
- err = errseq_check_and_advance(&ci->i_meta_err,
- &fi->meta_err);
- spin_unlock(&file->f_lock);
- if (err < 0)
- ret = err;
- }
+ err = file_check_and_advance_wb_err(file);
+ if (err < 0)
+ ret = err;
out:
dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
return ret;
@@ -2279,7 +2441,11 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync);
dout("write_inode %p wait=%d\n", inode, wait);
+ ceph_fscache_unpin_writeback(inode, wbc);
if (wait) {
+ err = ceph_wait_on_async_create(inode);
+ if (err)
+ return err;
dirty = try_flush_caps(inode, &flush_tid);
if (dirty)
err = wait_event_interruptible(ci->i_cap_wq,
@@ -2303,17 +2469,21 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
__releases(ci->i_ceph_lock)
__acquires(ci->i_ceph_lock)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_cap *cap;
struct ceph_cap_flush *cf;
int ret;
u64 first_tid = 0;
u64 last_snap_flush = 0;
+ /* Don't do anything until create reply comes in */
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE)
+ return;
+
ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) {
- if (!cf->caps) {
+ if (cf->is_capsnap) {
last_snap_flush = cf->tid;
break;
}
@@ -2332,25 +2502,20 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
first_tid = cf->tid + 1;
- if (cf->caps) {
+ if (!cf->is_capsnap) {
+ struct cap_msg_args arg;
+
dout("kick_flushing_caps %p cap %p tid %llu %s\n",
inode, cap, cf->tid, ceph_cap_string(cf->caps));
- ci->i_ceph_flags |= CEPH_I_NODELAY;
-
- ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+ __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH,
(cf->tid < last_snap_flush ?
CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0),
__ceph_caps_used(ci),
__ceph_caps_wanted(ci),
(cap->issued | cap->implemented),
cf->caps, cf->tid, oldest_flush_tid);
- if (ret) {
- pr_err("kick_flushing_caps: error sending "
- "cap flush, ino (%llx.%llx) "
- "tid %llu flushing %s\n",
- ceph_vinop(inode), cf->tid,
- ceph_cap_string(cf->caps));
- }
+ spin_unlock(&ci->i_ceph_lock);
+ __send_cap(&arg, ci);
} else {
struct ceph_cap_snap *capsnap =
container_of(cf, struct ceph_cap_snap,
@@ -2397,7 +2562,7 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
cap = ci->i_auth_cap;
if (!(cap && cap->session == session)) {
pr_err("%p auth cap %p not mds%d ???\n",
- &ci->vfs_inode, cap, session->s_mds);
+ &ci->netfs.inode, cap, session->s_mds);
spin_unlock(&ci->i_ceph_lock);
continue;
}
@@ -2434,6 +2599,8 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_cap *cap;
u64 oldest_flush_tid;
+ lockdep_assert_held(&session->s_mutex);
+
dout("kick_flushing_caps mds%d\n", session->s_mds);
spin_lock(&mdsc->cap_dirty_lock);
@@ -2445,7 +2612,7 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
cap = ci->i_auth_cap;
if (!(cap && cap->session == session)) {
pr_err("%p auth cap %p not mds%d ???\n",
- &ci->vfs_inode, cap, session->s_mds);
+ &ci->netfs.inode, cap, session->s_mds);
spin_unlock(&ci->i_ceph_lock);
continue;
}
@@ -2457,16 +2624,15 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
}
}
-static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session,
- struct inode *inode)
- __releases(ci->i_ceph_lock)
+void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
+ struct ceph_inode_info *ci)
{
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_cap *cap;
+ struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_cap *cap = ci->i_auth_cap;
+
+ lockdep_assert_held(&ci->i_ceph_lock);
- cap = ci->i_auth_cap;
- dout("kick_flushing_inode_caps %p flushing %s\n", inode,
+ dout("%s %p flushing %s\n", __func__, &ci->netfs.inode,
ceph_cap_string(ci->i_flushing_caps));
if (!list_empty(&ci->i_cap_flush_list)) {
@@ -2478,9 +2644,6 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
spin_unlock(&mdsc->cap_dirty_lock);
__kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
- spin_unlock(&ci->i_ceph_lock);
- } else {
- spin_unlock(&ci->i_ceph_lock);
}
}
@@ -2488,18 +2651,20 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
/*
* Take references to capabilities we hold, so that we don't release
* them to the MDS prematurely.
- *
- * Protected by i_ceph_lock.
*/
-static void __take_cap_refs(struct ceph_inode_info *ci, int got,
+void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
bool snap_rwsem_locked)
{
+ lockdep_assert_held(&ci->i_ceph_lock);
+
if (got & CEPH_CAP_PIN)
ci->i_pin_ref++;
if (got & CEPH_CAP_FILE_RD)
ci->i_rd_ref++;
if (got & CEPH_CAP_FILE_CACHE)
ci->i_rdcache_ref++;
+ if (got & CEPH_CAP_FILE_EXCL)
+ ci->i_fx_ref++;
if (got & CEPH_CAP_FILE_WR) {
if (ci->i_wr_ref == 0 && !ci->i_head_snapc) {
BUG_ON(!snap_rwsem_locked);
@@ -2510,10 +2675,10 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got,
}
if (got & CEPH_CAP_FILE_BUFFER) {
if (ci->i_wb_ref == 0)
- ihold(&ci->vfs_inode);
+ ihold(&ci->netfs.inode);
ci->i_wb_ref++;
- dout("__take_cap_refs %p wb %d -> %d (?)\n",
- &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref);
+ dout("%s %p wb %d -> %d (?)\n", __func__,
+ &ci->netfs.inode, ci->i_wb_ref-1, ci->i_wb_ref);
}
}
@@ -2524,14 +2689,16 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got,
* Note that caller is responsible for ensuring max_size increases are
* requested from the MDS.
*
- * Returns 0 if caps were not able to be acquired (yet), a 1 if they were,
- * or a negative error code.
- *
- * FIXME: how does a 0 return differ from -EAGAIN?
+ * Returns 0 if caps were not able to be acquired (yet), 1 if they were,
+ * or a negative error code. There are 3 special error codes:
+ * -EAGAIN: need to sleep but non-blocking is specified
+ * -EFBIG: ask caller to call check_max_size() and try again.
+ * -EUCLEAN: ask caller to call ceph_renew_caps() and try again.
*/
enum {
- NON_BLOCKING = 1,
- CHECK_FILELOCK = 2,
+ /* first 8 bits are reserved for CEPH_FILE_MODE_FOO */
+ NON_BLOCKING = (1 << 8),
+ CHECK_FILELOCK = (1 << 9),
};
static int try_get_cap_refs(struct inode *inode, int need, int want,
@@ -2541,7 +2708,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
int ret = 0;
int have, implemented;
- int file_wanted;
bool snap_rwsem_locked = false;
dout("get_cap_refs %p need %s want %s\n", inode,
@@ -2557,15 +2723,6 @@ again:
goto out_unlock;
}
- /* make sure file is actually open */
- file_wanted = __ceph_caps_file_wanted(ci);
- if ((file_wanted & need) != need) {
- dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
- ceph_cap_string(need), ceph_cap_string(file_wanted));
- ret = -EBADF;
- goto out_unlock;
- }
-
/* finish pending truncate */
while (ci->i_truncate_pending) {
spin_unlock(&ci->i_ceph_lock);
@@ -2584,7 +2741,7 @@ again:
dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
inode, endoff, ci->i_max_size);
if (endoff > ci->i_requested_max_size)
- ret = -EAGAIN;
+ ret = ci->i_auth_cap ? -EFBIG : -EUCLEAN;
goto out_unlock;
}
/*
@@ -2603,13 +2760,17 @@ again:
* on transition from wanted -> needed caps. This is needed
* for WRBUFFER|WR -> WR to avoid a new WR sync write from
* going before a prior buffered writeback happens.
+ *
+	 * For RDCACHE|RD -> RD, there is no need to wait and we can
+	 * just exclude the revoking caps and force a sync read.
*/
int not = want & ~(have & need);
int revoking = implemented & ~have;
+ int exclude = revoking & not;
dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
inode, ceph_cap_string(have), ceph_cap_string(not),
ceph_cap_string(revoking));
- if ((revoking & not) == 0) {
+ if (!exclude || !(exclude & CEPH_CAP_FILE_BUFFER)) {
if (!snap_rwsem_locked &&
!ci->i_head_snapc &&
(need & CEPH_CAP_FILE_WR)) {
@@ -2630,55 +2791,60 @@ again:
}
snap_rwsem_locked = true;
}
- *got = need | (have & want);
- if ((need & CEPH_CAP_FILE_RD) &&
- !(*got & CEPH_CAP_FILE_CACHE))
- ceph_disable_fscache_readpage(ci);
- __take_cap_refs(ci, *got, true);
+ if ((have & want) == want)
+ *got = need | (want & ~exclude);
+ else
+ *got = need;
+ ceph_take_cap_refs(ci, *got, true);
ret = 1;
}
} else {
int session_readonly = false;
- if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) {
+ int mds_wanted;
+ if (ci->i_auth_cap &&
+ (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) {
struct ceph_mds_session *s = ci->i_auth_cap->session;
spin_lock(&s->s_cap_lock);
session_readonly = s->s_readonly;
spin_unlock(&s->s_cap_lock);
}
if (session_readonly) {
- dout("get_cap_refs %p needed %s but mds%d readonly\n",
+ dout("get_cap_refs %p need %s but mds%d readonly\n",
inode, ceph_cap_string(need), ci->i_auth_cap->mds);
ret = -EROFS;
goto out_unlock;
}
- if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
- int mds_wanted;
- if (READ_ONCE(mdsc->fsc->mount_state) ==
- CEPH_MOUNT_SHUTDOWN) {
- dout("get_cap_refs %p forced umount\n", inode);
- ret = -EIO;
- goto out_unlock;
- }
- mds_wanted = __ceph_caps_mds_wanted(ci, false);
- if (need & ~(mds_wanted & need)) {
- dout("get_cap_refs %p caps were dropped"
- " (session killed?)\n", inode);
- ret = -ESTALE;
- goto out_unlock;
- }
- if (!(file_wanted & ~mds_wanted))
- ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
+ if (ceph_inode_is_shutdown(inode)) {
+ dout("get_cap_refs %p inode is shutdown\n", inode);
+ ret = -ESTALE;
+ goto out_unlock;
+ }
+ mds_wanted = __ceph_caps_mds_wanted(ci, false);
+ if (need & ~mds_wanted) {
+ dout("get_cap_refs %p need %s > mds_wanted %s\n",
+ inode, ceph_cap_string(need),
+ ceph_cap_string(mds_wanted));
+ ret = -EUCLEAN;
+ goto out_unlock;
}
- dout("get_cap_refs %p have %s needed %s\n", inode,
+ dout("get_cap_refs %p have %s need %s\n", inode,
ceph_cap_string(have), ceph_cap_string(need));
}
out_unlock:
+
+ __ceph_touch_fmode(ci, mdsc, flags);
+
spin_unlock(&ci->i_ceph_lock);
if (snap_rwsem_locked)
up_read(&mdsc->snap_rwsem);
+ if (!ret)
+ ceph_update_cap_mis(&mdsc->metric);
+ else if (ret == 1)
+ ceph_update_cap_hit(&mdsc->metric);
+
dout("get_cap_refs %p ret %d got %s\n", inode,
ret, ceph_cap_string(*got));
return ret;
@@ -2712,20 +2878,40 @@ static void check_max_size(struct inode *inode, loff_t endoff)
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
}
+static inline int get_used_fmode(int caps)
+{
+ int fmode = 0;
+ if (caps & CEPH_CAP_FILE_RD)
+ fmode |= CEPH_FILE_MODE_RD;
+ if (caps & CEPH_CAP_FILE_WR)
+ fmode |= CEPH_FILE_MODE_WR;
+ return fmode;
+}
+
int ceph_try_get_caps(struct inode *inode, int need, int want,
bool nonblock, int *got)
{
- int ret;
+ int ret, flags;
BUG_ON(need & ~CEPH_CAP_FILE_RD);
- BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
- ret = ceph_pool_perm_check(inode, need);
- if (ret < 0)
- return ret;
+ BUG_ON(want & ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO |
+ CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
+ CEPH_CAP_ANY_DIR_OPS));
+ if (need) {
+ ret = ceph_pool_perm_check(inode, need);
+ if (ret < 0)
+ return ret;
+ }
- ret = try_get_cap_refs(inode, need, want, 0,
- (nonblock ? NON_BLOCKING : 0), got);
- return ret == -EAGAIN ? 0 : ret;
+ flags = get_used_fmode(need | want);
+ if (nonblock)
+ flags |= NON_BLOCKING;
+
+ ret = try_get_cap_refs(inode, need, want, 0, flags, got);
+ /* three special error codes */
+ if (ret == -EAGAIN || ret == -EFBIG || ret == -EUCLEAN)
+ ret = 0;
+ return ret;
}
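
A hypothetical non-blocking caller, e.g. on a readahead-style path, would
use it roughly like this:

	int got = 0;
	int ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD,
				    CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO,
				    true /* nonblock */, &got);
	if (ret > 0) {
		/* ... serve the read from cache ... */
		ceph_put_cap_refs(ceph_inode(inode), got);
	}
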
/*
@@ -2733,8 +2919,7 @@ int ceph_try_get_caps(struct inode *inode, int need, int want,
* due to a small max_size, make sure we check_max_size (and possibly
* ask the mds) so we don't get hung up indefinitely.
*/
-int ceph_get_caps(struct file *filp, int need, int want,
- loff_t endoff, int *got, struct page **pinned_page)
+int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got)
{
struct ceph_file_info *fi = filp->private_data;
struct inode *inode = file_inode(filp);
@@ -2750,22 +2935,22 @@ int ceph_get_caps(struct file *filp, int need, int want,
fi->filp_gen != READ_ONCE(fsc->filp_gen))
return -EBADF;
- while (true) {
- if (endoff > 0)
- check_max_size(inode, endoff);
+ flags = get_used_fmode(need | want);
- flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0;
+ while (true) {
+ flags &= CEPH_FILE_MODE_MASK;
+ if (atomic_read(&fi->num_locks))
+ flags |= CHECK_FILELOCK;
_got = 0;
ret = try_get_cap_refs(inode, need, want, endoff,
flags, &_got);
- if (ret == -EAGAIN)
- continue;
+ WARN_ON_ONCE(ret == -EAGAIN);
if (!ret) {
struct ceph_mds_client *mdsc = fsc->mdsc;
struct cap_wait cw;
DEFINE_WAIT_FUNC(wait, woken_wake_function);
- cw.ino = inode->i_ino;
+ cw.ino = ceph_ino(inode);
cw.tgid = current->tgid;
cw.need = need;
cw.want = want;
@@ -2774,6 +2959,8 @@ int ceph_get_caps(struct file *filp, int need, int want,
list_add(&cw.list, &mdsc->cap_wait_list);
spin_unlock(&mdsc->caps_list_lock);
+			/* make sure the used fmode doesn't time out */
+ ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS);
add_wait_queue(&ci->i_cap_wq, &wait);
flags |= NON_BLOCKING;
@@ -2787,6 +2974,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
}
remove_wait_queue(&ci->i_cap_wq, &wait);
+ ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS);
spin_lock(&mdsc->caps_list_lock);
list_del(&cw.list);
@@ -2804,26 +2992,36 @@ int ceph_get_caps(struct file *filp, int need, int want,
}
if (ret < 0) {
- if (ret == -ESTALE) {
+ if (ret == -EFBIG || ret == -EUCLEAN) {
+ int ret2 = ceph_wait_on_async_create(inode);
+ if (ret2 < 0)
+ return ret2;
+ }
+ if (ret == -EFBIG) {
+ check_max_size(inode, endoff);
+ continue;
+ }
+ if (ret == -EUCLEAN) {
/* session was killed, try renew caps */
- ret = ceph_renew_caps(inode);
+ ret = ceph_renew_caps(inode, flags);
if (ret == 0)
continue;
}
return ret;
}
- if (ci->i_inline_version != CEPH_INLINE_NONE &&
+ if (S_ISREG(ci->netfs.inode.i_mode) &&
+ ceph_has_inline_data(ci) &&
(_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
i_size_read(inode) > 0) {
struct page *page =
find_get_page(inode->i_mapping, 0);
if (page) {
- if (PageUptodate(page)) {
- *pinned_page = page;
- break;
- }
+ bool uptodate = PageUptodate(page);
+
put_page(page);
+ if (uptodate)
+ break;
}
/*
* drop cap refs first because getattr while
@@ -2845,10 +3043,6 @@ int ceph_get_caps(struct file *filp, int need, int want,
}
break;
}
-
- if ((_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE))
- ceph_fscache_revalidate_cookie(ci);
-
*got = _got;
return 0;
}
@@ -2860,7 +3054,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
{
spin_lock(&ci->i_ceph_lock);
- __take_cap_refs(ci, caps, false);
+ ceph_take_cap_refs(ci, caps, false);
spin_unlock(&ci->i_ceph_lock);
}
@@ -2888,6 +3082,12 @@ static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
return 0;
}
+enum put_cap_refs_mode {
+ PUT_CAP_REFS_SYNC = 0,
+ PUT_CAP_REFS_NO_CHECK,
+ PUT_CAP_REFS_ASYNC,
+};
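
These modes map onto the three thin wrappers defined after
__ceph_put_cap_refs(). The ASYNC variant is for contexts where calling
straight into ceph_check_caps() could block or deadlock (e.g. message
dispatch); such a caller just does, for instance:

	ceph_put_cap_refs_async(ci, CEPH_CAP_FILE_WR);

and lets the queued work run the cap check later.
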
+
/*
* Release cap refs.
*
@@ -2897,10 +3097,12 @@ static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
* If we are releasing a WR cap (from a sync write), finalize any affected
* cap_snap, and wake up any waiters.
*/
-void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
+static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
+ enum put_cap_refs_mode mode)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
int last = 0, put = 0, flushsnaps = 0, wake = 0;
+ bool check_flushsnaps = false;
spin_lock(&ci->i_ceph_lock);
if (had & CEPH_CAP_PIN)
@@ -2911,29 +3113,23 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
if (had & CEPH_CAP_FILE_CACHE)
if (--ci->i_rdcache_ref == 0)
last++;
+ if (had & CEPH_CAP_FILE_EXCL)
+ if (--ci->i_fx_ref == 0)
+ last++;
if (had & CEPH_CAP_FILE_BUFFER) {
if (--ci->i_wb_ref == 0) {
last++;
+ /* put the ref held by ceph_take_cap_refs() */
put++;
+ check_flushsnaps = true;
}
dout("put_cap_refs %p wb %d -> %d (?)\n",
inode, ci->i_wb_ref+1, ci->i_wb_ref);
}
- if (had & CEPH_CAP_FILE_WR)
+ if (had & CEPH_CAP_FILE_WR) {
if (--ci->i_wr_ref == 0) {
last++;
- if (__ceph_have_pending_cap_snap(ci)) {
- struct ceph_cap_snap *capsnap =
- list_last_entry(&ci->i_cap_snaps,
- struct ceph_cap_snap,
- ci_item);
- capsnap->writing = 0;
- if (ceph_try_drop_cap_snap(ci, capsnap))
- put++;
- else if (__ceph_finish_cap_snap(ci, capsnap))
- flushsnaps = 1;
- wake = 1;
- }
+ check_flushsnaps = true;
if (ci->i_wrbuffer_ref_head == 0 &&
ci->i_dirty_caps == 0 &&
ci->i_flushing_caps == 0) {
@@ -2943,23 +3139,65 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
}
/* see comment in __ceph_remove_cap() */
if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm)
- drop_inode_snap_realm(ci);
+ ceph_change_snap_realm(inode, NULL);
}
+ }
+ if (check_flushsnaps && __ceph_have_pending_cap_snap(ci)) {
+ struct ceph_cap_snap *capsnap =
+ list_last_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap,
+ ci_item);
+
+ capsnap->writing = 0;
+ if (ceph_try_drop_cap_snap(ci, capsnap))
+ /* put the ref held by ceph_queue_cap_snap() */
+ put++;
+ else if (__ceph_finish_cap_snap(ci, capsnap))
+ flushsnaps = 1;
+ wake = 1;
+ }
spin_unlock(&ci->i_ceph_lock);
dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
last ? " last" : "", put ? " put" : "");
- if (last && !flushsnaps)
- ceph_check_caps(ci, 0, NULL);
- else if (flushsnaps)
- ceph_flush_snaps(ci, NULL);
+ switch (mode) {
+ case PUT_CAP_REFS_SYNC:
+ if (last)
+ ceph_check_caps(ci, 0, NULL);
+ else if (flushsnaps)
+ ceph_flush_snaps(ci, NULL);
+ break;
+ case PUT_CAP_REFS_ASYNC:
+ if (last)
+ ceph_queue_check_caps(inode);
+ else if (flushsnaps)
+ ceph_queue_flush_snaps(inode);
+ break;
+ default:
+ break;
+ }
if (wake)
wake_up_all(&ci->i_cap_wq);
while (put-- > 0)
iput(inode);
}
+void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
+{
+ __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_SYNC);
+}
+
+void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had)
+{
+ __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_ASYNC);
+}
+
+void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had)
+{
+ __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_NO_CHECK);
+}
+
/*
* Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
* context. Adjust per-snap dirty page accounting as appropriate.
@@ -2970,11 +3208,10 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc)
{
- struct inode *inode = &ci->vfs_inode;
- struct ceph_cap_snap *capsnap = NULL;
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_cap_snap *capsnap = NULL, *iter;
int put = 0;
bool last = false;
- bool found = false;
bool flush_snaps = false;
bool complete_capsnap = false;
@@ -3001,13 +3238,22 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
last ? " LAST" : "");
} else {
- list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- if (capsnap->context == snapc) {
- found = true;
+ list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
+ if (iter->context == snapc) {
+ capsnap = iter;
break;
}
}
- BUG_ON(!found);
+
+ if (!capsnap) {
+ /*
+			 * The capsnap should already have been removed when the
+			 * auth cap was removed in the case of a forced unmount.
+ */
+ WARN_ON_ONCE(ci->i_auth_cap);
+ goto unlock;
+ }
+
capsnap->dirty_pages -= nr;
if (capsnap->dirty_pages == 0) {
complete_capsnap = true;
@@ -3029,18 +3275,18 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
complete_capsnap ? " (complete capsnap)" : "");
}
+unlock:
spin_unlock(&ci->i_ceph_lock);
if (last) {
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+ ceph_check_caps(ci, 0, NULL);
} else if (flush_snaps) {
ceph_flush_snaps(ci, NULL);
}
if (complete_capsnap)
wake_up_all(&ci->i_cap_wq);
while (put-- > 0) {
- /* avoid calling iput_final() in osd dispatch threads */
- ceph_async_iput(inode);
+ iput(inode);
}
}
@@ -3114,7 +3360,7 @@ static void handle_cap_grant(struct inode *inode,
u64 size = le64_to_cpu(grant->size);
u64 max_size = le64_to_cpu(grant->max_size);
unsigned char check_caps = 0;
- bool was_stale = cap->cap_gen < session->s_cap_gen;
+ bool was_stale = cap->cap_gen < atomic_read(&session->s_cap_gen);
bool wake = false;
bool writeback = false;
bool queue_trunc = false;
@@ -3125,7 +3371,7 @@ static void handle_cap_grant(struct inode *inode,
dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
inode, cap, session->s_mds, seq, ceph_cap_string(newcaps));
dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
- inode->i_size);
+ i_size_read(inode));
/*
@@ -3133,7 +3379,7 @@ static void handle_cap_grant(struct inode *inode,
* try to invalidate (once). (If there are dirty buffers, we
* will invalidate _after_ writeback.)
*/
- if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
+ if (S_ISREG(inode->i_mode) && /* don't invalidate readdir cache */
((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
(newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
!(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
@@ -3166,7 +3412,7 @@ static void handle_cap_grant(struct inode *inode,
}
/* side effects now are allowed */
- cap->cap_gen = session->s_cap_gen;
+ cap->cap_gen = atomic_read(&session->s_cap_gen);
cap->seq = seq;
__check_cap_issue(ci, cap, newcaps);
@@ -3175,7 +3421,13 @@ static void handle_cap_grant(struct inode *inode,
if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
(extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) {
- inode->i_mode = le32_to_cpu(grant->mode);
+ umode_t mode = le32_to_cpu(grant->mode);
+
+ if (inode_wrong_type(inode, mode))
+ pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n",
+ ceph_vinop(inode), inode->i_mode, mode);
+ else
+ inode->i_mode = mode;
inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
ci->i_btime = extra_info->btime;
@@ -3187,8 +3439,7 @@ static void handle_cap_grant(struct inode *inode,
if ((newcaps & CEPH_CAP_LINK_SHARED) &&
(extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) {
set_nlink(inode, le32_to_cpu(grant->nlink));
- if (inode->i_nlink == 0 &&
- (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
+ if (inode->i_nlink == 0)
deleted_inode = true;
}
@@ -3258,10 +3509,6 @@ static void handle_cap_grant(struct inode *inode,
ci->i_requested_max_size = 0;
}
wake = true;
- } else if (ci->i_wanted_max_size > ci->i_max_size &&
- ci->i_wanted_max_size > ci->i_requested_max_size) {
- /* CEPH_CAP_OP_IMPORT */
- wake = true;
}
}
@@ -3297,16 +3544,20 @@ static void handle_cap_grant(struct inode *inode,
ceph_cap_string(cap->issued),
ceph_cap_string(newcaps),
ceph_cap_string(revoking));
- if (revoking & used & CEPH_CAP_FILE_BUFFER)
+ if (S_ISREG(inode->i_mode) &&
+ (revoking & used & CEPH_CAP_FILE_BUFFER))
writeback = true; /* initiate writeback; will delay ack */
- else if (revoking == CEPH_CAP_FILE_CACHE &&
- (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
- queue_invalidate)
+ else if (queue_invalidate &&
+ revoking == CEPH_CAP_FILE_CACHE &&
+ (newcaps & CEPH_CAP_FILE_LAZYIO) == 0)
; /* do nothing yet, invalidation will be queued */
else if (cap == ci->i_auth_cap)
check_caps = 1; /* check auth cap only */
else
check_caps = 2; /* check all caps */
+		/* If there are new caps, try to wake up the waiters */
+ if (~cap->issued & newcaps)
+ wake = true;
cap->issued = newcaps;
cap->implemented |= newcaps;
} else if (cap->issued == newcaps) {
@@ -3337,13 +3588,22 @@ static void handle_cap_grant(struct inode *inode,
}
if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
- if (newcaps & ~extra_info->issued)
- wake = true;
- kick_flushing_inode_caps(session->s_mdsc, session, inode);
+ if (ci->i_auth_cap == cap) {
+ if (newcaps & ~extra_info->issued)
+ wake = true;
+
+ if (ci->i_requested_max_size > max_size ||
+ !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) {
+ /* re-request max_size if necessary */
+ ci->i_requested_max_size = 0;
+ wake = true;
+ }
+
+ ceph_kick_flushing_inode_caps(session, ci);
+ }
up_read(&session->s_mdsc->snap_rwsem);
- } else {
- spin_unlock(&ci->i_ceph_lock);
}
+ spin_unlock(&ci->i_ceph_lock);
if (fill_inline)
ceph_fill_inline_data(inode, NULL, extra_info->inline_data,
@@ -3366,13 +3626,12 @@ static void handle_cap_grant(struct inode *inode,
if (wake)
wake_up_all(&ci->i_cap_wq);
+ mutex_unlock(&session->s_mutex);
if (check_caps == 1)
- ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL,
session);
else if (check_caps == 2)
- ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
- else
- mutex_unlock(&session->s_mutex);
+ ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session);
}
/*
@@ -3397,15 +3656,26 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
bool wake_mdsc = false;
list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
+ /* Is this the one that was flushed? */
if (cf->tid == flush_tid)
cleaned = cf->caps;
- if (cf->caps == 0) /* capsnap */
+
+ /* Is this a capsnap? */
+ if (cf->is_capsnap)
continue;
+
if (cf->tid <= flush_tid) {
- if (__finish_cap_flush(NULL, ci, cf))
- wake_ci = true;
+ /*
+ * An earlier or current tid. The FLUSH_ACK should
+ * represent a superset of this flush's caps.
+ */
+ wake_ci |= __detach_cap_flush_from_ci(ci, cf);
list_add_tail(&cf->i_list, &to_remove);
} else {
+ /*
+ * This is a later one. Any caps in it are still dirty
+ * so don't count them as cleaned.
+ */
cleaned &= ~cf->caps;
if (!cleaned)
break;
@@ -3425,10 +3695,8 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
spin_lock(&mdsc->cap_dirty_lock);
- list_for_each_entry(cf, &to_remove, i_list) {
- if (__finish_cap_flush(mdsc, NULL, cf))
- wake_mdsc = true;
- }
+ list_for_each_entry(cf, &to_remove, i_list)
+ wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc, cf);
if (ci->i_flushing_caps == 0) {
if (list_empty(&ci->i_cap_flush_list)) {
@@ -3438,7 +3706,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
session->s_mds,
&list_first_entry(&session->s_cap_flushing,
struct ceph_inode_info,
- i_flushing_item)->vfs_inode);
+ i_flushing_item)->netfs.inode);
}
}
mdsc->num_cap_flushing--;
@@ -3466,8 +3734,9 @@ out:
while (!list_empty(&to_remove)) {
cf = list_first_entry(&to_remove,
struct ceph_cap_flush, i_list);
- list_del(&cf->i_list);
- ceph_free_cap_flush(cf);
+ list_del_init(&cf->i_list);
+ if (!cf->is_capsnap)
+ ceph_free_cap_flush(cf);
}
if (wake_ci)
@@ -3478,6 +3747,43 @@ out:
iput(inode);
}
+void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap,
+ bool *wake_ci, bool *wake_mdsc)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ bool ret;
+
+ lockdep_assert_held(&ci->i_ceph_lock);
+
+ dout("removing capsnap %p, inode %p ci %p\n", capsnap, inode, ci);
+
+ list_del_init(&capsnap->ci_item);
+ ret = __detach_cap_flush_from_ci(ci, &capsnap->cap_flush);
+ if (wake_ci)
+ *wake_ci = ret;
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ if (list_empty(&ci->i_cap_flush_list))
+ list_del_init(&ci->i_flushing_item);
+
+ ret = __detach_cap_flush_from_mdsc(mdsc, &capsnap->cap_flush);
+ if (wake_mdsc)
+ *wake_mdsc = ret;
+ spin_unlock(&mdsc->cap_dirty_lock);
+}
+
+void ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap,
+ bool *wake_ci, bool *wake_mdsc)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ lockdep_assert_held(&ci->i_ceph_lock);
+
+ WARN_ON_ONCE(capsnap->dirty_pages || capsnap->writing);
+ __ceph_remove_capsnap(inode, capsnap, wake_ci, wake_mdsc);
+}
+
/*
* Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
* throw away our cap_snap.
@@ -3491,8 +3797,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
u64 follows = le64_to_cpu(m->snap_follows);
- struct ceph_cap_snap *capsnap;
- bool flushed = false;
+ struct ceph_cap_snap *capsnap = NULL, *iter;
bool wake_ci = false;
bool wake_mdsc = false;
@@ -3500,41 +3805,26 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
inode, ci, session->s_mds, follows);
spin_lock(&ci->i_ceph_lock);
- list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- if (capsnap->follows == follows) {
- if (capsnap->cap_flush.tid != flush_tid) {
+ list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
+ if (iter->follows == follows) {
+ if (iter->cap_flush.tid != flush_tid) {
dout(" cap_snap %p follows %lld tid %lld !="
- " %lld\n", capsnap, follows,
- flush_tid, capsnap->cap_flush.tid);
+ " %lld\n", iter, follows,
+ flush_tid, iter->cap_flush.tid);
break;
}
- flushed = true;
+ capsnap = iter;
break;
} else {
dout(" skipping cap_snap %p follows %lld\n",
- capsnap, capsnap->follows);
+ iter, iter->follows);
}
}
- if (flushed) {
- WARN_ON(capsnap->dirty_pages || capsnap->writing);
- dout(" removing %p cap_snap %p follows %lld\n",
- inode, capsnap, follows);
- list_del(&capsnap->ci_item);
- if (__finish_cap_flush(NULL, ci, &capsnap->cap_flush))
- wake_ci = true;
-
- spin_lock(&mdsc->cap_dirty_lock);
-
- if (list_empty(&ci->i_cap_flush_list))
- list_del_init(&ci->i_flushing_item);
-
- if (__finish_cap_flush(mdsc, NULL, &capsnap->cap_flush))
- wake_mdsc = true;
-
- spin_unlock(&mdsc->cap_dirty_lock);
- }
+ if (capsnap)
+ ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc);
spin_unlock(&ci->i_ceph_lock);
- if (flushed) {
+
+ if (capsnap) {
ceph_put_snap_context(capsnap->context);
ceph_put_cap_snap(capsnap);
if (wake_ci)
@@ -3550,10 +3840,9 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
*
 * caller holds s_mutex.
*/
-static void handle_cap_trunc(struct inode *inode,
+static bool handle_cap_trunc(struct inode *inode,
struct ceph_mds_caps *trunc,
struct ceph_mds_session *session)
- __releases(ci->i_ceph_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
int mds = session->s_mds;
@@ -3564,7 +3853,9 @@ static void handle_cap_trunc(struct inode *inode,
int implemented = 0;
int dirty = __ceph_caps_dirty(ci);
int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
- int queue_trunc = 0;
+ bool queue_trunc = false;
+
+ lockdep_assert_held(&ci->i_ceph_lock);
issued |= implemented | dirty;
@@ -3572,10 +3863,7 @@ static void handle_cap_trunc(struct inode *inode,
inode, mds, seq, truncate_size, truncate_seq);
queue_trunc = ceph_fill_file_size(inode, issued,
truncate_seq, truncate_size, size);
- spin_unlock(&ci->i_ceph_lock);
-
- if (queue_trunc)
- ceph_queue_vmtruncate(inode);
+ return queue_trunc;
}
/*
@@ -3613,15 +3901,14 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n",
inode, ci, mds, mseq, target);
retry:
+ down_read(&mdsc->snap_rwsem);
spin_lock(&ci->i_ceph_lock);
cap = __get_cap_for_mds(ci, mds);
if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id))
goto out_unlock;
if (target < 0) {
- if (cap->mds_wanted | cap->issued)
- ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
- __ceph_remove_cap(cap, false);
+ ceph_remove_cap(cap, false);
goto out_unlock;
}
@@ -3651,24 +3938,18 @@ retry:
tcap->issue_seq = t_seq - 1;
tcap->issued |= issued;
tcap->implemented |= issued;
- if (cap == ci->i_auth_cap)
+ if (cap == ci->i_auth_cap) {
ci->i_auth_cap = tcap;
-
- if (!list_empty(&ci->i_cap_flush_list) &&
- ci->i_auth_cap == tcap) {
- spin_lock(&mdsc->cap_dirty_lock);
- list_move_tail(&ci->i_flushing_item,
- &tcap->session->s_cap_flushing);
- spin_unlock(&mdsc->cap_dirty_lock);
+ change_auth_cap_ses(ci, tcap->session);
}
}
- __ceph_remove_cap(cap, false);
+ ceph_remove_cap(cap, false);
goto out_unlock;
} else if (tsession) {
		/* add placeholder for the export target */
int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
tcap = new_cap;
- ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
+ ceph_add_cap(inode, tsession, t_cap_id, issued, 0,
t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
if (!list_empty(&ci->i_cap_flush_list) &&
@@ -3679,11 +3960,12 @@ retry:
spin_unlock(&mdsc->cap_dirty_lock);
}
- __ceph_remove_cap(cap, false);
+ ceph_remove_cap(cap, false);
goto out_unlock;
}
spin_unlock(&ci->i_ceph_lock);
+ up_read(&mdsc->snap_rwsem);
mutex_unlock(&session->s_mutex);
/* open target session */
@@ -3703,11 +3985,13 @@ retry:
WARN_ON(1);
tsession = NULL;
target = -1;
+ mutex_lock(&session->s_mutex);
}
goto retry;
out_unlock:
spin_unlock(&ci->i_ceph_lock);
+ up_read(&mdsc->snap_rwsem);
mutex_unlock(&session->s_mutex);
if (tsession) {
mutex_unlock(&tsession->s_mutex);
@@ -3727,7 +4011,6 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
struct ceph_mds_cap_peer *ph,
struct ceph_mds_session *session,
struct ceph_cap **target_cap, int *old_issued)
- __acquires(ci->i_ceph_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_cap *cap, *ocap, *new_cap = NULL;
@@ -3752,14 +4035,13 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n",
inode, ci, mds, mseq, peer);
-
retry:
- spin_lock(&ci->i_ceph_lock);
cap = __get_cap_for_mds(ci, mds);
if (!cap) {
if (!new_cap) {
spin_unlock(&ci->i_ceph_lock);
new_cap = ceph_get_cap(mdsc, NULL);
+ spin_lock(&ci->i_ceph_lock);
goto retry;
}
cap = new_cap;
@@ -3773,7 +4055,7 @@ retry:
__ceph_caps_issued(ci, &issued);
issued |= __ceph_caps_dirty(ci);
- ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq,
+ ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq,
realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
@@ -3791,12 +4073,9 @@ retry:
ocap->mseq, mds, le32_to_cpu(ph->seq),
le32_to_cpu(ph->mseq));
}
- __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
+ ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
}
- /* make sure we re-request max_size, if necessary */
- ci->i_requested_max_size = 0;
-
*old_issued = issued;
*target_cap = cap;
}
@@ -3825,6 +4104,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
size_t snaptrace_len;
void *p, *end;
struct cap_extra_info extra_info = {};
+ bool queue_trunc;
dout("handle_caps from mds%d\n", session->s_mds);
@@ -3881,15 +4161,13 @@ void ceph_handle_caps(struct ceph_mds_session *session,
}
if (msg_version >= 8) {
- u64 flush_tid;
- u32 caller_uid, caller_gid;
u32 pool_ns_len;
/* version >= 6 */
- ceph_decode_64_safe(&p, end, flush_tid, bad);
+ ceph_decode_skip_64(&p, end, bad); // flush_tid
/* version >= 7 */
- ceph_decode_32_safe(&p, end, caller_uid, bad);
- ceph_decode_32_safe(&p, end, caller_gid, bad);
+ ceph_decode_skip_32(&p, end, bad); // caller_uid
+ ceph_decode_skip_32(&p, end, bad); // caller_gid
/* version >= 8 */
ceph_decode_32_safe(&p, end, pool_ns_len, bad);
if (pool_ns_len > 0) {
@@ -3912,9 +4190,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
}
if (msg_version >= 11) {
- u32 flags;
/* version >= 10 */
- ceph_decode_32_safe(&p, end, flags, bad);
+ ceph_decode_skip_32(&p, end, bad); // flags
/* version >= 11 */
extra_info.dirstat_valid = true;
ceph_decode_64_safe(&p, end, extra_info.nfiles, bad);
@@ -3923,12 +4200,11 @@ void ceph_handle_caps(struct ceph_mds_session *session,
/* lookup ino */
inode = ceph_find_inode(mdsc->fsc->sb, vino);
- ci = ceph_inode(inode);
dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
vino.snap, inode);
mutex_lock(&session->s_mutex);
- session->s_seq++;
+ inc_session_sequence(session);
dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
(unsigned)seq);
@@ -3947,8 +4223,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
__ceph_queue_cap_release(session, cap);
spin_unlock(&session->s_cap_lock);
}
- goto done;
+ goto flush_cap_releases;
}
+ ci = ceph_inode(inode);
/* these will work even if we don't have a cap yet */
switch (op) {
@@ -3972,6 +4249,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
} else {
down_read(&mdsc->snap_rwsem);
}
+ spin_lock(&ci->i_ceph_lock);
handle_cap_import(mdsc, inode, h, peer, session,
&cap, &extra_info.issued);
handle_cap_grant(inode, session, cap,
@@ -4008,7 +4286,10 @@ void ceph_handle_caps(struct ceph_mds_session *session,
break;
case CEPH_CAP_OP_TRUNC:
- handle_cap_trunc(inode, h, session);
+ queue_trunc = handle_cap_trunc(inode, h, session);
+ spin_unlock(&ci->i_ceph_lock);
+ if (queue_trunc)
+ ceph_queue_vmtruncate(inode);
break;
default:
@@ -4020,9 +4301,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
done:
mutex_unlock(&session->s_mutex);
done_unlocked:
+ iput(inode);
+out:
ceph_put_string(extra_info.pool_ns);
- /* avoid calling iput_final() in mds dispatch threads */
- ceph_async_iput(inode);
return;
flush_cap_releases:
@@ -4037,62 +4318,75 @@ flush_cap_releases:
bad:
pr_err("ceph_handle_caps: corrupt message\n");
ceph_msg_dump(msg);
- return;
+ goto out;
}
/*
* Delayed work handler to process end of delayed cap release LRU list.
+ *
+ * If new caps are added to the list while processing it, they won't get
+ * processed in this run. In that case, ci->i_hold_caps_max will be
+ * returned so that the work can be rescheduled accordingly.
*/
-void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
+unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
{
struct inode *inode;
struct ceph_inode_info *ci;
- int flags = CHECK_CAPS_NODELAY;
+ struct ceph_mount_options *opt = mdsc->fsc->mount_options;
+ unsigned long delay_max = opt->caps_wanted_delay_max * HZ;
+ unsigned long loop_start = jiffies;
+ unsigned long delay = 0;
dout("check_delayed_caps\n");
- while (1) {
- spin_lock(&mdsc->cap_delay_lock);
- if (list_empty(&mdsc->cap_delay_list))
- break;
+ spin_lock(&mdsc->cap_delay_lock);
+ while (!list_empty(&mdsc->cap_delay_list)) {
ci = list_first_entry(&mdsc->cap_delay_list,
struct ceph_inode_info,
i_cap_delay_list);
+ if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) {
+ dout("%s caps added recently. Exiting loop", __func__);
+ delay = ci->i_hold_caps_max;
+ break;
+ }
if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
time_before(jiffies, ci->i_hold_caps_max))
break;
list_del_init(&ci->i_cap_delay_list);
- inode = igrab(&ci->vfs_inode);
- spin_unlock(&mdsc->cap_delay_lock);
-
+ inode = igrab(&ci->netfs.inode);
if (inode) {
+ spin_unlock(&mdsc->cap_delay_lock);
dout("check_delayed_caps on %p\n", inode);
- ceph_check_caps(ci, flags, NULL);
- /* avoid calling iput_final() in tick thread */
- ceph_async_iput(inode);
+ ceph_check_caps(ci, 0, NULL);
+ iput(inode);
+ spin_lock(&mdsc->cap_delay_lock);
}
}
spin_unlock(&mdsc->cap_delay_lock);
+
+ return delay;
}
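
The caller in mds_client.c is expected to use the returned value to rearm
its delayed work; a sketch, ignoring the clamping and the absolute-vs-
relative jiffies conversion the real scheduling helper performs:

	unsigned long delay = ceph_check_delayed_caps(mdsc);

	/* fall back to the default 5 second tick if nothing was deferred */
	schedule_delayed_work(&mdsc->delayed_work,
			      round_jiffies_relative(delay ? delay : 5 * HZ));
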
/*
* Flush all dirty caps to the mds
*/
-void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
+static void flush_dirty_session_caps(struct ceph_mds_session *s)
{
+ struct ceph_mds_client *mdsc = s->s_mdsc;
struct ceph_inode_info *ci;
struct inode *inode;
dout("flush_dirty_caps\n");
spin_lock(&mdsc->cap_dirty_lock);
- while (!list_empty(&mdsc->cap_dirty)) {
- ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info,
+ while (!list_empty(&s->s_cap_dirty)) {
+ ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info,
i_dirty_item);
- inode = &ci->vfs_inode;
+ inode = &ci->netfs.inode;
ihold(inode);
- dout("flush_dirty_caps %p\n", inode);
+ dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode));
spin_unlock(&mdsc->cap_dirty_lock);
- ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL);
+ ceph_wait_on_async_create(inode);
+ ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
iput(inode);
spin_lock(&mdsc->cap_dirty_lock);
}
@@ -4100,14 +4394,53 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
dout("flush_dirty_caps done\n");
}
-void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
+void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
{
- int i;
+ ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true);
+}
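
ceph_mdsc_iterate_sessions() is assumed here to be the mds_client.c
helper that takes a reference on each live session and invokes the
callback on it, i.e. something of this rough shape:

	void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
					void (*cb)(struct ceph_mds_session *),
					bool check_state);

so dirty caps are now flushed per session (s->s_cap_dirty) instead of off
one global mdsc->cap_dirty list.
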
+
+void __ceph_touch_fmode(struct ceph_inode_info *ci,
+ struct ceph_mds_client *mdsc, int fmode)
+{
+ unsigned long now = jiffies;
+ if (fmode & CEPH_FILE_MODE_RD)
+ ci->i_last_rd = now;
+ if (fmode & CEPH_FILE_MODE_WR)
+ ci->i_last_wr = now;
+ /* queue periodic check */
+ if (fmode &&
+ __ceph_is_any_real_caps(ci) &&
+ list_empty(&ci->i_cap_delay_list))
+ __cap_delay_requeue(mdsc, ci);
+}
+
+void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
+{
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb);
int bits = (fmode << 1) | 1;
+ bool already_opened = false;
+ int i;
+
+ if (count == 1)
+ atomic64_inc(&mdsc->metric.opened_files);
+
+ spin_lock(&ci->i_ceph_lock);
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
+ /*
+		 * If any of the mode refs is larger than 0, that
+		 * means the file has already been opened by others.
+		 * Just skip checking the PIN ref.
+ */
+ if (i && ci->i_nr_by_mode[i])
+ already_opened = true;
+
if (bits & (1 << i))
- ci->i_nr_by_mode[i]++;
+ ci->i_nr_by_mode[i] += count;
}
+
+ if (!already_opened)
+ percpu_counter_inc(&mdsc->metric.opened_inodes);
+ spin_unlock(&ci->i_ceph_lock);
}
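
The new count parameter lets a caller take or drop several references of
the same mode at once. Its main user is the cap-wait path shown earlier,
which pins the fmode with a large bias while sleeping so the mode cannot
time out underneath it:

	ceph_get_fmode(ci, flags & CEPH_FILE_MODE_MASK, FMODE_WAIT_BIAS);
	/* ... sleep waiting for caps ... */
	ceph_put_fmode(ci, flags & CEPH_FILE_MODE_MASK, FMODE_WAIT_BIAS);
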
/*
@@ -4115,26 +4448,35 @@ void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
* we may need to release capabilities to the MDS (or schedule
* their delayed release).
*/
-void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
+void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
- int i, last = 0;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb);
int bits = (fmode << 1) | 1;
+ bool is_closed = true;
+ int i;
+
+ if (count == 1)
+ atomic64_dec(&mdsc->metric.opened_files);
+
spin_lock(&ci->i_ceph_lock);
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
if (bits & (1 << i)) {
- BUG_ON(ci->i_nr_by_mode[i] == 0);
- if (--ci->i_nr_by_mode[i] == 0)
- last++;
+ BUG_ON(ci->i_nr_by_mode[i] < count);
+ ci->i_nr_by_mode[i] -= count;
}
+
+ /*
+		 * If any of the mode refs is not 0 after the
+		 * decrement, that means the file is still open
+		 * by others. Just skip checking the PIN ref.
+ */
+ if (i && ci->i_nr_by_mode[i])
+ is_closed = false;
}
- dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n",
- &ci->vfs_inode, fmode,
- ci->i_nr_by_mode[0], ci->i_nr_by_mode[1],
- ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]);
- spin_unlock(&ci->i_ceph_lock);
- if (last && ci->i_vino.snap == CEPH_NOSNAP)
- ceph_check_caps(ci, 0, NULL);
+ if (is_closed)
+ percpu_counter_dec(&mdsc->metric.opened_inodes);
+ spin_unlock(&ci->i_ceph_lock);
}
/*
@@ -4152,7 +4494,6 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
if (inode->i_nlink == 1) {
drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
- ci->i_ceph_flags |= CEPH_I_NODELAY;
if (__ceph_caps_dirty(ci)) {
struct ceph_mds_client *mdsc =
ceph_inode_to_client(inode)->mdsc;
@@ -4208,8 +4549,6 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
if (force || (cap->issued & drop)) {
if (cap->issued & drop) {
int wanted = __ceph_caps_wanted(ci);
- if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
- wanted |= cap->mds_wanted;
dout("encode_inode_release %p cap %p "
"%s -> %s, wanted %s -> %s\n", inode, cap,
ceph_cap_string(cap->issued),
@@ -4220,6 +4559,9 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
cap->issued &= ~drop;
cap->implemented &= ~drop;
cap->mds_wanted = wanted;
+ if (cap == ci->i_auth_cap &&
+ !(wanted & CEPH_CAP_ANY_FILE_WR))
+ ci->i_requested_max_size = 0;
} else {
dout("encode_inode_release %p cap %p %s"
" (force)\n", inode, cap,
@@ -4287,3 +4629,119 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
spin_unlock(&dentry->d_lock);
return ret;
}
+
+static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_cap_snap *capsnap;
+ int capsnap_release = 0;
+
+ lockdep_assert_held(&ci->i_ceph_lock);
+
+ dout("removing capsnaps, ci is %p, inode is %p\n", ci, inode);
+
+ while (!list_empty(&ci->i_cap_snaps)) {
+ capsnap = list_first_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap, ci_item);
+ __ceph_remove_capsnap(inode, capsnap, NULL, NULL);
+ ceph_put_snap_context(capsnap->context);
+ ceph_put_cap_snap(capsnap);
+ capsnap_release++;
+ }
+ wake_up_all(&ci->i_cap_wq);
+ wake_up_all(&mdsc->cap_flushing_wq);
+ return capsnap_release;
+}
+
+int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate)
+{
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ bool is_auth;
+ bool dirty_dropped = false;
+ int iputs = 0;
+
+ lockdep_assert_held(&ci->i_ceph_lock);
+
+ dout("removing cap %p, ci is %p, inode is %p\n",
+ cap, ci, &ci->netfs.inode);
+
+ is_auth = (cap == ci->i_auth_cap);
+ __ceph_remove_cap(cap, false);
+ if (is_auth) {
+ struct ceph_cap_flush *cf;
+
+ if (ceph_inode_is_shutdown(inode)) {
+ if (inode->i_data.nrpages > 0)
+ *invalidate = true;
+ if (ci->i_wrbuffer_ref > 0)
+ mapping_set_error(&inode->i_data, -EIO);
+ }
+
+ spin_lock(&mdsc->cap_dirty_lock);
+
+ /* trash all of the cap flushes for this inode */
+ while (!list_empty(&ci->i_cap_flush_list)) {
+ cf = list_first_entry(&ci->i_cap_flush_list,
+ struct ceph_cap_flush, i_list);
+ list_del_init(&cf->g_list);
+ list_del_init(&cf->i_list);
+ if (!cf->is_capsnap)
+ ceph_free_cap_flush(cf);
+ }
+
+ if (!list_empty(&ci->i_dirty_item)) {
+ pr_warn_ratelimited(
+ " dropping dirty %s state for %p %lld\n",
+ ceph_cap_string(ci->i_dirty_caps),
+ inode, ceph_ino(inode));
+ ci->i_dirty_caps = 0;
+ list_del_init(&ci->i_dirty_item);
+ dirty_dropped = true;
+ }
+ if (!list_empty(&ci->i_flushing_item)) {
+ pr_warn_ratelimited(
+ " dropping dirty+flushing %s state for %p %lld\n",
+ ceph_cap_string(ci->i_flushing_caps),
+ inode, ceph_ino(inode));
+ ci->i_flushing_caps = 0;
+ list_del_init(&ci->i_flushing_item);
+ mdsc->num_cap_flushing--;
+ dirty_dropped = true;
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+ if (dirty_dropped) {
+ mapping_set_error(inode->i_mapping, -EIO);
+
+ if (ci->i_wrbuffer_ref_head == 0 &&
+ ci->i_wr_ref == 0 &&
+ ci->i_dirty_caps == 0 &&
+ ci->i_flushing_caps == 0) {
+ ceph_put_snap_context(ci->i_head_snapc);
+ ci->i_head_snapc = NULL;
+ }
+ }
+
+ if (atomic_read(&ci->i_filelock_ref) > 0) {
+ /* make further file lock syscalls return -EIO */
+ ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK;
+ pr_warn_ratelimited(" dropping file locks for %p %lld\n",
+ inode, ceph_ino(inode));
+ }
+
+ if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
+ cf = ci->i_prealloc_cap_flush;
+ ci->i_prealloc_cap_flush = NULL;
+ if (!cf->is_capsnap)
+ ceph_free_cap_flush(cf);
+ }
+
+ if (!list_empty(&ci->i_cap_snaps))
+ iputs = remove_capsnaps(mdsc, inode);
+ }
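+ /* dirty cap state pinned the inode; have the caller drop that ref too */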
+ if (dirty_dropped)
+ ++iputs;
+ return iputs;
+}
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index fb7cabd98e7b..bec3c4549c07 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -7,6 +7,8 @@
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
+#include <linux/math64.h>
+#include <linux/ktime.h>
#include <linux/ceph/libceph.h>
#include <linux/ceph/mon_client.h>
@@ -18,6 +20,7 @@
#ifdef CONFIG_DEBUG_FS
#include "mds_client.h"
+#include "metric.h"
static int mdsmap_show(struct seq_file *s, void *p)
{
@@ -124,11 +127,133 @@ static int mdsc_show(struct seq_file *s, void *p)
return 0;
}
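+/*
+ * sq is the running sum of squared latency deviations (in ns^2): dividing
+ * by (total - 1) and taking the integer square root yields the sample
+ * standard deviation, converted to microseconds for display.
+ */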
+#define CEPH_LAT_METRIC_SHOW(name, total, avg, min, max, sq) { \
+ s64 _total, _avg, _min, _max, _sq, _st; \
+ _avg = ktime_to_us(avg); \
+ _min = ktime_to_us(min == KTIME_MAX ? 0 : min); \
+ _max = ktime_to_us(max); \
+ _total = total - 1; \
+ _sq = _total > 0 ? DIV64_U64_ROUND_CLOSEST(sq, _total) : 0; \
+ _st = int_sqrt64(_sq); \
+ _st = ktime_to_us(_st); \
+ seq_printf(s, "%-14s%-12lld%-16lld%-16lld%-16lld%lld\n", \
+ name, total, _avg, _min, _max, _st); \
+}
+
+#define CEPH_SZ_METRIC_SHOW(name, total, avg, min, max, sum) { \
+ u64 _min = min == U64_MAX ? 0 : min; \
+ seq_printf(s, "%-14s%-12lld%-16llu%-16llu%-16llu%llu\n", \
+ name, total, avg, _min, max, sum); \
+}
+
+static int metrics_file_show(struct seq_file *s, void *p)
+{
+ struct ceph_fs_client *fsc = s->private;
+ struct ceph_client_metric *m = &fsc->mdsc->metric;
+
+ seq_printf(s, "item total\n");
+ seq_printf(s, "------------------------------------------\n");
+ seq_printf(s, "%-35s%lld\n", "total inodes",
+ percpu_counter_sum(&m->total_inodes));
+ seq_printf(s, "%-35s%lld\n", "opened files",
+ atomic64_read(&m->opened_files));
+ seq_printf(s, "%-35s%lld\n", "pinned i_caps",
+ atomic64_read(&m->total_caps));
+ seq_printf(s, "%-35s%lld\n", "opened inodes",
+ percpu_counter_sum(&m->opened_inodes));
+ return 0;
+}
+
+static const char * const metric_str[] = {
+ "read",
+ "write",
+ "metadata",
+ "copyfrom"
+};
+static int metrics_latency_show(struct seq_file *s, void *p)
+{
+ struct ceph_fs_client *fsc = s->private;
+ struct ceph_client_metric *cm = &fsc->mdsc->metric;
+ struct ceph_metric *m;
+ s64 total, avg, min, max, sq;
+ int i;
+
+ seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n");
+ seq_printf(s, "-----------------------------------------------------------------------------------\n");
+
+ for (i = 0; i < METRIC_MAX; i++) {
+ m = &cm->metric[i];
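+ /* take a consistent snapshot of the counters under the metric lock */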
+ spin_lock(&m->lock);
+ total = m->total;
+ avg = m->latency_avg;
+ min = m->latency_min;
+ max = m->latency_max;
+ sq = m->latency_sq_sum;
+ spin_unlock(&m->lock);
+ CEPH_LAT_METRIC_SHOW(metric_str[i], total, avg, min, max, sq);
+ }
+
+ return 0;
+}
+
+static int metrics_size_show(struct seq_file *s, void *p)
+{
+ struct ceph_fs_client *fsc = s->private;
+ struct ceph_client_metric *cm = &fsc->mdsc->metric;
+ struct ceph_metric *m;
+ s64 total;
+ u64 sum, avg, min, max;
+ int i;
+
+ seq_printf(s, "item total avg_sz(bytes) min_sz(bytes) max_sz(bytes) total_sz(bytes)\n");
+ seq_printf(s, "----------------------------------------------------------------------------------------\n");
+
+ for (i = 0; i < METRIC_MAX; i++) {
+ /* skip 'metadata' as it doesn't use the size metric */
+ if (i == METRIC_METADATA)
+ continue;
+ m = &cm->metric[i];
+ spin_lock(&m->lock);
+ total = m->total;
+ sum = m->size_sum;
+ avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0;
+ min = m->size_min;
+ max = m->size_max;
+ spin_unlock(&m->lock);
+ CEPH_SZ_METRIC_SHOW(metric_str[i], total, avg, min, max, sum);
+ }
+
+ return 0;
+}
+
+static int metrics_caps_show(struct seq_file *s, void *p)
+{
+ struct ceph_fs_client *fsc = s->private;
+ struct ceph_client_metric *m = &fsc->mdsc->metric;
+ int nr_caps = 0;
+
+ seq_printf(s, "item total miss hit\n");
+ seq_printf(s, "-------------------------------------------------\n");
+
+ seq_printf(s, "%-14s%-16lld%-16lld%lld\n", "d_lease",
+ atomic64_read(&m->total_dentries),
+ percpu_counter_sum(&m->d_lease_mis),
+ percpu_counter_sum(&m->d_lease_hit));
+
+ nr_caps = atomic64_read(&m->total_caps);
+ seq_printf(s, "%-14s%-16d%-16lld%lld\n", "caps", nr_caps,
+ percpu_counter_sum(&m->i_caps_mis),
+ percpu_counter_sum(&m->i_caps_hit));
+
+ return 0;
+}
+
static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
{
struct seq_file *s = p;
- seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino,
+ seq_printf(s, "0x%-17llx%-3d%-17s%-17s\n", ceph_ino(inode),
+ cap->session->s_mds,
ceph_cap_string(cap->issued),
ceph_cap_string(cap->implemented));
return 0;
@@ -148,8 +273,8 @@ static int caps_show(struct seq_file *s, void *p)
"reserved\t%d\n"
"min\t\t%d\n\n",
total, avail, used, reserved, min);
- seq_printf(s, "ino issued implemented\n");
- seq_printf(s, "-----------------------------------------------\n");
+ seq_printf(s, "ino mds issued implemented\n");
+ seq_printf(s, "--------------------------------------------------\n");
mutex_lock(&mdsc->mutex);
for (i = 0; i < mdsc->max_sessions; i++) {
@@ -173,7 +298,7 @@ static int caps_show(struct seq_file *s, void *p)
spin_lock(&mdsc->caps_list_lock);
list_for_each_entry(cw, &mdsc->cap_wait_list, list) {
- seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino,
+ seq_printf(s, "%-13d0x%-17llx%-17s%-17s\n", cw->tgid, cw->ino,
ceph_cap_string(cw->need),
ceph_cap_string(cw->want));
}
@@ -188,7 +313,7 @@ static int mds_sessions_show(struct seq_file *s, void *ptr)
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_auth_client *ac = fsc->client->monc.auth;
struct ceph_options *opt = fsc->client->options;
- int mds = -1;
+ int mds;
mutex_lock(&mdsc->mutex);
@@ -218,10 +343,28 @@ static int mds_sessions_show(struct seq_file *s, void *ptr)
return 0;
}
-CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
-CEPH_DEFINE_SHOW_FUNC(mdsc_show)
-CEPH_DEFINE_SHOW_FUNC(caps_show)
-CEPH_DEFINE_SHOW_FUNC(mds_sessions_show)
+static int status_show(struct seq_file *s, void *p)
+{
+ struct ceph_fs_client *fsc = s->private;
+ struct ceph_entity_inst *inst = &fsc->client->msgr.inst;
+ struct ceph_entity_addr *client_addr = ceph_client_addr(fsc->client);
+
+ seq_printf(s, "instance: %s.%lld %s/%u\n", ENTITY_NAME(inst->name),
+ ceph_pr_addr(client_addr), le32_to_cpu(client_addr->nonce));
+ seq_printf(s, "blocklisted: %s\n", fsc->blocklisted ? "true" : "false");
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(mdsmap);
+DEFINE_SHOW_ATTRIBUTE(mdsc);
+DEFINE_SHOW_ATTRIBUTE(caps);
+DEFINE_SHOW_ATTRIBUTE(mds_sessions);
+DEFINE_SHOW_ATTRIBUTE(status);
+DEFINE_SHOW_ATTRIBUTE(metrics_file);
+DEFINE_SHOW_ATTRIBUTE(metrics_latency);
+DEFINE_SHOW_ATTRIBUTE(metrics_size);
+DEFINE_SHOW_ATTRIBUTE(metrics_caps);
/*
@@ -255,7 +398,9 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
debugfs_remove(fsc->debugfs_mdsmap);
debugfs_remove(fsc->debugfs_mds_sessions);
debugfs_remove(fsc->debugfs_caps);
+ debugfs_remove(fsc->debugfs_status);
debugfs_remove(fsc->debugfs_mdsc);
+ debugfs_remove_recursive(fsc->debugfs_metrics_dir);
}
void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
@@ -271,7 +416,7 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
&congestion_kb_fops);
snprintf(name, sizeof(name), "../../bdi/%s",
- dev_name(fsc->sb->s_bdi->dev));
+ bdi_dev_name(fsc->sb->s_bdi));
fsc->debugfs_bdi =
debugfs_create_symlink("bdi",
fsc->client->debugfs_dir,
@@ -281,25 +426,43 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
0400,
fsc->client->debugfs_dir,
fsc,
- &mdsmap_show_fops);
+ &mdsmap_fops);
fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions",
0400,
fsc->client->debugfs_dir,
fsc,
- &mds_sessions_show_fops);
+ &mds_sessions_fops);
fsc->debugfs_mdsc = debugfs_create_file("mdsc",
0400,
fsc->client->debugfs_dir,
fsc,
- &mdsc_show_fops);
+ &mdsc_fops);
fsc->debugfs_caps = debugfs_create_file("caps",
- 0400,
- fsc->client->debugfs_dir,
- fsc,
- &caps_show_fops);
+ 0400,
+ fsc->client->debugfs_dir,
+ fsc,
+ &caps_fops);
+
+ fsc->debugfs_status = debugfs_create_file("status",
+ 0400,
+ fsc->client->debugfs_dir,
+ fsc,
+ &status_fops);
+
+ fsc->debugfs_metrics_dir = debugfs_create_dir("metrics",
+ fsc->client->debugfs_dir);
+
+ debugfs_create_file("file", 0400, fsc->debugfs_metrics_dir, fsc,
+ &metrics_file_fops);
+ debugfs_create_file("latency", 0400, fsc->debugfs_metrics_dir, fsc,
+ &metrics_latency_fops);
+ debugfs_create_file("size", 0400, fsc->debugfs_metrics_dir, fsc,
+ &metrics_size_fops);
+ debugfs_create_file("caps", 0400, fsc->debugfs_metrics_dir, fsc,
+ &metrics_caps_fops);
}
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index d0cd0aba5843..e7e2ebac330d 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -38,6 +38,7 @@ static int __dir_lease_try_check(const struct dentry *dentry);
static int ceph_d_init(struct dentry *dentry)
{
struct ceph_dentry_info *di;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dentry->d_sb);
di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL);
if (!di)
@@ -48,6 +49,9 @@ static int ceph_d_init(struct dentry *dentry)
di->time = jiffies;
dentry->d_fsdata = di;
INIT_LIST_HEAD(&di->lease_list);
+
+ atomic64_inc(&mdsc->metric.total_dentries);
+
return 0;
}
@@ -141,7 +145,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
return ERR_PTR(-EAGAIN);
}
/* reading/filling the cache are serialized by
- i_mutex, no need to use page lock */
+ i_rwsem, no need to use page lock */
unlock_page(cache_ctl->page);
cache_ctl->dentries = kmap(cache_ctl->page);
}
@@ -151,7 +155,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
rcu_read_lock();
spin_lock(&parent->d_lock);
/* check i_size again here, because empty directory can be
- * marked as complete while not holding the i_mutex. */
+ * marked as complete while not holding the i_rwsem. */
if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir))
dentry = cache_ctl->dentries[cache_ctl->index];
else
@@ -254,9 +258,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
dentry, dentry, d_inode(dentry));
ctx->pos = di->offset;
if (!dir_emit(ctx, dentry->d_name.name,
- dentry->d_name.len,
- ceph_translate_ino(dentry->d_sb,
- d_inode(dentry)->i_ino),
+ dentry->d_name.len, ceph_present_inode(d_inode(dentry)),
d_inode(dentry)->i_mode >> 12)) {
dput(dentry);
err = 0;
@@ -319,30 +321,37 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
/* always start with . and .. */
if (ctx->pos == 0) {
dout("readdir off 0 -> '.'\n");
- if (!dir_emit(ctx, ".", 1,
- ceph_translate_ino(inode->i_sb, inode->i_ino),
+ if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode),
inode->i_mode >> 12))
return 0;
ctx->pos = 1;
}
if (ctx->pos == 1) {
- ino_t ino = parent_ino(file->f_path.dentry);
+ u64 ino;
+ struct dentry *dentry = file->f_path.dentry;
+
+ spin_lock(&dentry->d_lock);
+ ino = ceph_present_inode(dentry->d_parent->d_inode);
+ spin_unlock(&dentry->d_lock);
+
dout("readdir off 1 -> '..'\n");
- if (!dir_emit(ctx, "..", 2,
- ceph_translate_ino(inode->i_sb, ino),
- inode->i_mode >> 12))
+ if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12))
return 0;
ctx->pos = 2;
}
- /* can we use the dcache? */
spin_lock(&ci->i_ceph_lock);
+ /* Request the Fx cap. If we already have Fx, we don't need to
+ * release the Fs cap for a later create/unlink. */
+ __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_WR);
+ /* can we use the dcache? */
if (ceph_test_mount_opt(fsc, DCACHE) &&
!ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
ceph_snap(inode) != CEPH_SNAPDIR &&
__ceph_dir_is_complete_ordered(ci) &&
- __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
+ __ceph_caps_issued_mask_metric(ci, CEPH_CAP_FILE_SHARED, 1)) {
int shared_gen = atomic_read(&ci->i_shared_gen);
+
spin_unlock(&ci->i_ceph_lock);
err = __dcache_readdir(file, ctx, shared_gen);
if (err != -EAGAIN)
@@ -469,8 +478,11 @@ more:
2 : (fpos_off(rde->offset) + 1);
err = note_last_dentry(dfi, rde->name, rde->name_len,
next_offset);
- if (err)
+ if (err) {
+ ceph_mdsc_put_request(dfi->last_readdir);
+ dfi->last_readdir = NULL;
return err;
+ }
} else if (req->r_reply_info.dir_end) {
dfi->next_offset = 2;
/* keep last name */
@@ -498,9 +510,6 @@ more:
}
for (; i < rinfo->dir_nr; i++) {
struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
- struct ceph_vino vino;
- ino_t ino;
- u32 ftype;
BUG_ON(rde->offset < ctx->pos);
@@ -510,13 +519,16 @@ more:
rde->name_len, rde->name, &rde->inode.in);
BUG_ON(!rde->inode.in);
- ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
- vino.ino = le64_to_cpu(rde->inode.in->ino);
- vino.snap = le64_to_cpu(rde->inode.in->snapid);
- ino = ceph_vino_to_ino(vino);
if (!dir_emit(ctx, rde->name, rde->name_len,
- ceph_translate_ino(inode->i_sb, ino), ftype)) {
+ ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
+ le32_to_cpu(rde->inode.in->mode) >> 12)) {
+ /*
+ * NOTE: there is no need to put 'dfi->last_readdir'
+ * here: when dir_emit() stops us it has most likely
+ * just run out of buffer space, so the next readdir
+ * call will continue from it.
+ */
dout("filldir stopping us...\n");
return 0;
}
@@ -628,10 +640,12 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
switch (whence) {
case SEEK_CUR:
offset += file->f_pos;
+ break;
case SEEK_SET:
break;
case SEEK_END:
retval = -EOPNOTSUPP;
+ goto out;
default:
goto out;
}
@@ -662,25 +676,25 @@ out:
/*
* Handle lookups for the hidden .snap directory.
*/
-int ceph_handle_snapdir(struct ceph_mds_request *req,
- struct dentry *dentry, int err)
+struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
+ struct dentry *dentry)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
- struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */
+ struct inode *parent = d_inode(dentry->d_parent); /* we hold i_rwsem */
/* .snap dir? */
- if (err == -ENOENT &&
- ceph_snap(parent) == CEPH_NOSNAP &&
- strcmp(dentry->d_name.name,
- fsc->mount_options->snapdir_name) == 0) {
+ if (ceph_snap(parent) == CEPH_NOSNAP &&
+ strcmp(dentry->d_name.name, fsc->mount_options->snapdir_name) == 0) {
+ struct dentry *res;
struct inode *inode = ceph_get_snapdir(parent);
- dout("ENOENT on snapdir %p '%pd', linking to snapdir %p\n",
- dentry, dentry, inode);
- BUG_ON(!d_unhashed(dentry));
- d_add(dentry, inode);
- err = 0;
+
+ res = d_splice_alias(inode, dentry);
+ dout("ENOENT on snapdir %p '%pd', linking to snapdir %p. Spliced dentry %p\n",
+ dentry, dentry, inode, res);
+ if (res)
+ dentry = res;
}
- return err;
+ return dentry;
}
/*
@@ -734,7 +748,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_mds_request *req;
int op;
int mask;
@@ -752,14 +766,15 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
struct ceph_dentry_info *di = ceph_dentry(dentry);
spin_lock(&ci->i_ceph_lock);
- dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
+ dout(" dir %p flags are 0x%lx\n", dir, ci->i_ceph_flags);
if (strncmp(dentry->d_name.name,
fsc->mount_options->snapdir_name,
dentry->d_name.len) &&
!is_root_ceph_dentry(dir, dentry) &&
ceph_test_mount_opt(fsc, DCACHE) &&
__ceph_dir_is_complete(ci) &&
- (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
+ __ceph_caps_issued_mask_metric(ci, CEPH_CAP_FILE_SHARED, 1)) {
+ __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD);
spin_unlock(&ci->i_ceph_lock);
dout(" dir %p complete, -ENOENT\n", dir);
d_add(dentry, NULL);
@@ -782,10 +797,21 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
mask |= CEPH_CAP_XATTR_SHARED;
req->r_args.getattr.mask = cpu_to_le32(mask);
+ ihold(dir);
req->r_parent = dir;
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
err = ceph_mdsc_do_request(mdsc, NULL, req);
- err = ceph_handle_snapdir(req, dentry, err);
+ if (err == -ENOENT) {
+ struct dentry *res;
+
+ res = ceph_handle_snapdir(req, dentry);
+ if (IS_ERR(res)) {
+ err = PTR_ERR(res);
+ } else {
+ dentry = res;
+ err = 0;
+ }
+ }
dentry = ceph_finish_lookup(req, dentry, err);
ceph_mdsc_put_request(req); /* will dput(dentry) */
dout("lookup result=%p\n", dentry);
@@ -819,11 +845,10 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
return PTR_ERR(result);
}
-static int ceph_mknod(struct inode *dir, struct dentry *dentry,
- umode_t mode, dev_t rdev)
+static int ceph_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t rdev)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
int err;
@@ -831,6 +856,10 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
if (ceph_snap(dir) != CEPH_NOSNAP)
return -EROFS;
+ err = ceph_wait_on_conflict_unlink(dentry);
+ if (err)
+ return err;
+
if (ceph_quota_is_max_files_exceeded(dir)) {
err = -EDQUOT;
goto out;
@@ -853,6 +882,7 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
req->r_parent = dir;
+ ihold(dir);
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_args.mknod.mode = cpu_to_le32(mode);
req->r_args.mknod.rdev = cpu_to_le32(rdev);
@@ -875,17 +905,16 @@ out:
return err;
}
-static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- bool excl)
+static int ceph_create(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl)
{
- return ceph_mknod(dir, dentry, mode, 0);
+ return ceph_mknod(mnt_userns, dir, dentry, mode, 0);
}
-static int ceph_symlink(struct inode *dir, struct dentry *dentry,
- const char *dest)
+static int ceph_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, const char *dest)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
int err;
@@ -893,6 +922,10 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
if (ceph_snap(dir) != CEPH_NOSNAP)
return -EROFS;
+ err = ceph_wait_on_conflict_unlink(dentry);
+ if (err)
+ return err;
+
if (ceph_quota_is_max_files_exceeded(dir)) {
err = -EDQUOT;
goto out;
@@ -915,11 +948,17 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
goto out;
}
req->r_parent = dir;
+ ihold(dir);
+
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ if (as_ctx.pagelist) {
+ req->r_pagelist = as_ctx.pagelist;
+ as_ctx.pagelist = NULL;
+ }
err = ceph_mdsc_do_request(mdsc, dir, req);
if (!err && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
@@ -931,15 +970,19 @@ out:
return err;
}
-static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
- int err = -EROFS;
+ int err;
int op;
+ err = ceph_wait_on_conflict_unlink(dentry);
+ if (err)
+ return err;
+
if (ceph_snap(dir) == CEPH_SNAPDIR) {
/* mkdir .snap/foo is a MKSNAP */
op = CEPH_MDS_OP_MKSNAP;
@@ -949,6 +992,7 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
op = CEPH_MDS_OP_MKDIR;
} else {
+ err = -EROFS;
goto out;
}
@@ -975,6 +1019,7 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
req->r_parent = dir;
+ ihold(dir);
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_args.mkdir.mode = cpu_to_le32(mode);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
@@ -1001,11 +1046,14 @@ out:
static int ceph_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_mds_request *req;
int err;
+ err = ceph_wait_on_conflict_unlink(dentry);
+ if (err)
+ return err;
+
if (ceph_snap(dir) != CEPH_NOSNAP)
return -EROFS;
@@ -1020,6 +1068,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
req->r_num_caps = 2;
req->r_old_dentry = dget(old_dentry);
req->r_parent = dir;
+ ihold(dir);
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
@@ -1036,6 +1085,96 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
return err;
}
+static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ struct dentry *dentry = req->r_dentry;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ int result = req->r_err ? req->r_err :
+ le32_to_cpu(req->r_reply_info.head->result);
+
+ if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
+ pr_warn("%s dentry %p:%pd async unlink bit is not set\n",
+ __func__, dentry, dentry);
+
+ spin_lock(&fsc->async_unlink_conflict_lock);
+ hash_del_rcu(&di->hnode);
+ spin_unlock(&fsc->async_unlink_conflict_lock);
+
+ spin_lock(&dentry->d_lock);
+ di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
+ wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT);
+ spin_unlock(&dentry->d_lock);
+
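+ /* wait for any racing RCU walkers of the conflict hash to finish */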
+ synchronize_rcu();
+
+ if (result == -EJUKEBOX)
+ goto out;
+
+ /* If op failed, mark everyone involved for errors */
+ if (result) {
+ int pathlen = 0;
+ u64 base = 0;
+ char *path = ceph_mdsc_build_path(dentry, &pathlen,
+ &base, 0);
+
+ /* mark error on parent + clear complete */
+ mapping_set_error(req->r_parent->i_mapping, result);
+ ceph_dir_clear_complete(req->r_parent);
+
+ /* drop the dentry -- we don't know its status */
+ if (!d_unhashed(dentry))
+ d_drop(dentry);
+
+ /* mark inode itself for an error (since metadata is bogus) */
+ mapping_set_error(req->r_old_inode->i_mapping, result);
+
+ pr_warn("async unlink failure path=(%llx)%s result=%d!\n",
+ base, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path(path, pathlen);
+ }
+out:
+ iput(req->r_old_inode);
+ ceph_mdsc_release_dir_caps(req);
+}
+
+static int get_caps_for_async_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_dentry_info *di;
+ int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_UNLINK;
+
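+ /* opportunistically take Fx and DIR_UNLINK if they are already issued */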
+ spin_lock(&ci->i_ceph_lock);
+ if ((__ceph_caps_issued(ci, NULL) & want) == want) {
+ ceph_take_cap_refs(ci, want, false);
+ got = want;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
+ /* If we didn't get anything, return 0 */
+ if (!got)
+ return 0;
+
+ spin_lock(&dentry->d_lock);
+ di = ceph_dentry(dentry);
+ /*
+ * - We are holding Fx, which implies Fs caps.
+ * - Async unlink is only supported for the primary linkage.
+ */
+ if (atomic_read(&ci->i_shared_gen) != di->lease_shared_gen ||
+ !(di->flags & CEPH_DENTRY_PRIMARY_LINK))
+ want = 0;
+ spin_unlock(&dentry->d_lock);
+
+ /* Do we still want what we've got? */
+ if (want == got)
+ return got;
+
+ ceph_put_cap_refs(ci, got);
+ return 0;
+}
+
/*
* rmdir and unlink differ only in the metadata op code
*/
@@ -1045,6 +1184,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = d_inode(dentry);
struct ceph_mds_request *req;
+ bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS);
int err = -EROFS;
int op;
@@ -1059,6 +1199,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
} else
goto out;
+retry:
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req)) {
err = PTR_ERR(req);
@@ -1067,24 +1208,72 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
req->r_parent = dir;
- set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ ihold(dir);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
req->r_inode_drop = ceph_drop_caps_for_unlink(inode);
- err = ceph_mdsc_do_request(mdsc, dir, req);
- if (!err && !req->r_reply_info.head->is_dentry)
- d_delete(dentry);
+
+ if (try_async && op == CEPH_MDS_OP_UNLINK &&
+ (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+
+ dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir),
+ dentry->d_name.len, dentry->d_name.name,
+ ceph_cap_string(req->r_dir_caps));
+ set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
+ req->r_callback = ceph_async_unlink_cb;
+ req->r_old_inode = d_inode(dentry);
+ ihold(req->r_old_inode);
+
+ spin_lock(&dentry->d_lock);
+ di->flags |= CEPH_DENTRY_ASYNC_UNLINK;
+ spin_unlock(&dentry->d_lock);
+
+ spin_lock(&fsc->async_unlink_conflict_lock);
+ hash_add_rcu(fsc->async_unlink_conflict, &di->hnode,
+ dentry->d_name.hash);
+ spin_unlock(&fsc->async_unlink_conflict_lock);
+
+ err = ceph_mdsc_submit_request(mdsc, dir, req);
+ if (!err) {
+ /*
+ * We have enough caps, so we assume that the unlink
+ * will succeed. Fix up the target inode and dcache.
+ */
+ drop_nlink(inode);
+ d_delete(dentry);
+ } else {
+ spin_lock(&fsc->async_unlink_conflict_lock);
+ hash_del_rcu(&di->hnode);
+ spin_unlock(&fsc->async_unlink_conflict_lock);
+
+ spin_lock(&dentry->d_lock);
+ di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
+ spin_unlock(&dentry->d_lock);
+
+ if (err == -EJUKEBOX) {
+ try_async = false;
+ ceph_mdsc_put_request(req);
+ goto retry;
+ }
+ }
+ } else {
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ err = ceph_mdsc_do_request(mdsc, dir, req);
+ if (!err && !req->r_reply_info.head->is_dentry)
+ d_delete(dentry);
+ }
+
ceph_mdsc_put_request(req);
out:
return err;
}
-static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
+static int ceph_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+ struct dentry *old_dentry, struct inode *new_dir,
+ struct dentry *new_dentry, unsigned int flags)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old_dir->i_sb);
struct ceph_mds_request *req;
int op = CEPH_MDS_OP_RENAME;
int err;
@@ -1105,6 +1294,10 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
(!ceph_quota_is_same_realm(old_dir, new_dir)))
return -EXDEV;
+ err = ceph_wait_on_conflict_unlink(new_dentry);
+ if (err)
+ return err;
+
dout("rename dir %p dentry %p to dir %p dentry %p\n",
old_dir, old_dentry, new_dir, new_dentry);
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
@@ -1116,6 +1309,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
req->r_old_dentry = dget(old_dentry);
req->r_old_dentry_dir = old_dir;
req->r_parent = new_dir;
+ ihold(new_dir);
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
@@ -1411,6 +1605,7 @@ void ceph_invalidate_dentry_lease(struct dentry *dentry)
spin_lock(&dentry->d_lock);
di->time = jiffies;
di->lease_shared_gen = 0;
+ di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
__dentry_lease_unlist(di);
spin_unlock(&dentry->d_lock);
}
@@ -1431,10 +1626,8 @@ static bool __dentry_lease_is_valid(struct ceph_dentry_info *di)
u32 gen;
unsigned long ttl;
- spin_lock(&session->s_gen_ttl_lock);
- gen = session->s_cap_gen;
+ gen = atomic_read(&session->s_cap_gen);
ttl = session->s_cap_ttl;
- spin_unlock(&session->s_gen_ttl_lock);
if (di->lease_gen == gen &&
time_before(jiffies, ttl) &&
@@ -1520,7 +1713,8 @@ static int __dir_lease_try_check(const struct dentry *dentry)
/*
* Check if directory-wide content lease/cap is valid.
*/
-static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
+static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry,
+ struct ceph_mds_client *mdsc)
{
struct ceph_inode_info *ci = ceph_inode(dir);
int valid;
@@ -1528,7 +1722,10 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
spin_lock(&ci->i_ceph_lock);
valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
- shared_gen = atomic_read(&ci->i_shared_gen);
+ if (valid) {
+ __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD);
+ shared_gen = atomic_read(&ci->i_shared_gen);
+ }
spin_unlock(&ci->i_ceph_lock);
if (valid) {
struct ceph_dentry_info *di;
@@ -1554,6 +1751,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
int valid = 0;
struct dentry *parent;
struct inode *dir, *inode;
+ struct ceph_mds_client *mdsc;
if (flags & LOOKUP_RCU) {
parent = READ_ONCE(dentry->d_parent);
@@ -1570,6 +1768,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
dout("d_revalidate %p '%pd' inode %p offset 0x%llx\n", dentry,
dentry, inode, ceph_dentry(dentry)->offset);
+ mdsc = ceph_sb_to_client(dir->i_sb)->mdsc;
+
/* always trust cached snapped dentries, snapdir dentry */
if (ceph_snap(dir) != CEPH_NOSNAP) {
dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
@@ -1581,7 +1781,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
valid = dentry_lease_is_valid(dentry, flags);
if (valid == -ECHILD)
return valid;
- if (valid || dir_lease_is_valid(dir, dentry)) {
+ if (valid || dir_lease_is_valid(dir, dentry, mdsc)) {
if (inode)
valid = ceph_is_any_caps(inode);
else
@@ -1590,8 +1790,6 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
}
if (!valid) {
- struct ceph_mds_client *mdsc =
- ceph_sb_to_client(dir->i_sb)->mdsc;
struct ceph_mds_request *req;
int op, err;
u32 mask;
@@ -1599,6 +1797,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
if (flags & LOOKUP_RCU)
return -ECHILD;
+ percpu_counter_inc(&mdsc->metric.d_lease_mis);
+
op = ceph_snap(dir) == CEPH_SNAPDIR ?
CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
@@ -1606,6 +1806,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
req->r_parent = dir;
+ ihold(dir);
mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
if (ceph_security_xattr_wanted(dir))
@@ -1622,7 +1823,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
case -ENOENT:
if (d_really_is_negative(dentry))
valid = 1;
- /* Fallthrough */
+ fallthrough;
default:
break;
}
@@ -1630,6 +1831,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
dout("d_revalidate %p lookup result=%d\n",
dentry, err);
}
+ } else {
+ percpu_counter_inc(&mdsc->metric.d_lease_hit);
}
dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
@@ -1672,16 +1875,18 @@ static int ceph_d_delete(const struct dentry *dentry)
static void ceph_d_release(struct dentry *dentry)
{
struct ceph_dentry_info *di = ceph_dentry(dentry);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
dout("d_release %p\n", dentry);
+ atomic64_dec(&fsc->mdsc->metric.total_dentries);
+
spin_lock(&dentry->d_lock);
__dentry_lease_unlist(di);
dentry->d_fsdata = NULL;
spin_unlock(&dentry->d_lock);
- if (di->lease_session)
- ceph_put_mds_session(di->lease_session);
+ ceph_put_mds_session(di->lease_session);
kmem_cache_free(ceph_dentry_cachep, di);
}
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index b6bfa94332c3..f780e4e0d062 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -129,6 +129,10 @@ static struct inode *__lookup_inode(struct super_block *sb, u64 ino)
vino.ino = ino;
vino.snap = CEPH_NOSNAP;
+
+ if (ceph_vino_is_reserved(vino))
+ return ERR_PTR(-ESTALE);
+
inode = ceph_find_inode(sb, vino);
if (!inode) {
struct ceph_mds_request *req;
@@ -153,6 +157,11 @@ static struct inode *__lookup_inode(struct super_block *sb, u64 ino)
ceph_mdsc_put_request(req);
if (!inode)
return err < 0 ? ERR_PTR(err) : ERR_PTR(-ESTALE);
+ } else {
+ if (ceph_inode_is_shutdown(inode)) {
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
}
return inode;
}
@@ -172,9 +181,19 @@ struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino)
static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
{
struct inode *inode = __lookup_inode(sb, ino);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int err;
+
if (IS_ERR(inode))
return ERR_CAST(inode);
- if (inode->i_nlink == 0) {
+ /* We need LINK caps to reliably check i_nlink */
+ err = ceph_do_getattr(inode, CEPH_CAP_LINK_SHARED, false);
+ if (err) {
+ iput(inode);
+ return ERR_PTR(err);
+ }
+ /* -ESTALE if the inode has been unlinked and no file is open */
+ if ((inode->i_nlink == 0) && !__ceph_is_file_opened(ci)) {
iput(inode);
return ERR_PTR(-ESTALE);
}
@@ -205,9 +224,18 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb,
vino.ino = sfh->ino;
vino.snap = sfh->snapid;
}
+
+ if (ceph_vino_is_reserved(vino))
+ return ERR_PTR(-ESTALE);
+
inode = ceph_find_inode(sb, vino);
- if (inode)
+ if (inode) {
+ if (ceph_inode_is_shutdown(inode)) {
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
return d_obtain_alias(inode);
+ }
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
USE_ANY_MDS);
@@ -241,9 +269,10 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb,
ihold(inode);
} else {
/* mds does not support lookup snapped inode */
- err = -EOPNOTSUPP;
- inode = NULL;
+ inode = ERR_PTR(-EOPNOTSUPP);
}
+ } else {
+ inode = ERR_PTR(-ESTALE);
}
ceph_mdsc_put_request(req);
@@ -254,8 +283,8 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb,
dout("snapfh_to_dentry %llx.%llx parent %llx hash %x err=%d",
vino.ino, vino.snap, sfh->parent_ino, sfh->hash, err);
}
- if (!inode)
- return ERR_PTR(-ESTALE);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
/* see comments in ceph_get_parent() */
return unlinked ? d_obtain_root(inode) : d_obtain_alias(inode);
}
@@ -315,6 +344,11 @@ static struct dentry *__get_parent(struct super_block *sb,
req->r_num_caps = 1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err) {
+ ceph_mdsc_put_request(req);
+ return ERR_PTR(err);
+ }
+
inode = req->r_target_inode;
if (inode)
ihold(inode);
@@ -519,6 +553,7 @@ static int ceph_get_name(struct dentry *parent, char *name,
ihold(inode);
req->r_ino2 = ceph_vino(d_inode(parent));
req->r_parent = d_inode(parent);
+ ihold(req->r_parent);
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_num_caps = 2;
err = ceph_mdsc_do_request(mdsc, NULL, req);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 7e0190b1f821..04fd34557de8 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -11,11 +11,13 @@
#include <linux/writeback.h>
#include <linux/falloc.h>
#include <linux/iversion.h>
+#include <linux/ktime.h>
#include "super.h"
#include "mds_client.h"
#include "cache.h"
#include "io.h"
+#include "metric.h"
static __le32 ceph_flags_sys2wire(u32 flags)
{
@@ -93,12 +95,11 @@ static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize,
size_t start;
int idx = 0;
- bytes = iov_iter_get_pages(iter, pages, maxsize - size,
+ bytes = iov_iter_get_pages2(iter, pages, maxsize - size,
ITER_GET_BVECS_PAGES, &start);
if (bytes < 0)
return size ?: bytes;
- iov_iter_advance(iter, bytes);
size += bytes;
for ( ; bytes; idx++, bvec_idx++) {
@@ -180,8 +181,7 @@ static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty)
static struct ceph_mds_request *
prepare_open_request(struct super_block *sb, int flags, int create_mode)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
struct ceph_mds_request *req;
int want_auth = USE_ANY_MDS;
int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
@@ -203,7 +203,10 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
int fmode, bool isdir)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mount_options *opt =
+ ceph_inode_to_client(&ci->netfs.inode)->mount_options;
struct ceph_file_info *fi;
+ int ret;
dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
inode->i_mode, isdir ? "dir" : "regular");
@@ -212,10 +215,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
if (isdir) {
struct ceph_dir_file_info *dfi =
kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
- if (!dfi) {
- ceph_put_fmode(ci, fmode); /* clean up */
+ if (!dfi)
return -ENOMEM;
- }
file->private_data = dfi;
fi = &dfi->file_info;
@@ -223,21 +224,37 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
dfi->readdir_cache_idx = -1;
} else {
fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
- if (!fi) {
- ceph_put_fmode(ci, fmode); /* clean up */
+ if (!fi)
return -ENOMEM;
- }
+
+ if (opt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
+ fi->flags |= CEPH_F_SYNC;
file->private_data = fi;
}
+ ceph_get_fmode(ci, fmode, 1);
fi->fmode = fmode;
+
spin_lock_init(&fi->rw_contexts_lock);
INIT_LIST_HEAD(&fi->rw_contexts);
- fi->meta_err = errseq_sample(&ci->i_meta_err);
fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen);
+ if ((file->f_mode & FMODE_WRITE) && ceph_has_inline_data(ci)) {
+ ret = ceph_uninline_data(file);
+ if (ret < 0)
+ goto error;
+ }
+
return 0;
+
+error:
+ ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE);
+ ceph_put_fmode(ci, fi->fmode, 1);
+ kmem_cache_free(ceph_file_cachep, fi);
+ /* wake up anyone waiting for caps on this inode */
+ wake_up_all(&ci->i_cap_wq);
+ return ret;
}
/*
@@ -250,20 +267,16 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
- ceph_fscache_register_inode_cookie(inode);
- ceph_fscache_file_set_cookie(inode, file);
- /* fall through */
+ ceph_fscache_use_cookie(inode, file->f_mode & FMODE_WRITE);
+ fallthrough;
case S_IFDIR:
ret = ceph_init_file_info(inode, file, fmode,
S_ISDIR(inode->i_mode));
- if (ret)
- return ret;
break;
case S_IFLNK:
dout("init_file %p %p 0%o (symlink)\n", inode, file,
inode->i_mode);
- ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
break;
default:
@@ -273,7 +286,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
* we need to drop the open ref now, since we don't
* have .release set to ceph_release.
*/
- ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
BUG_ON(inode->i_fop->release == ceph_release);
/* call the proper open fop */
@@ -285,14 +297,15 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
/*
* try renew caps after session gets killed.
*/
-int ceph_renew_caps(struct inode *inode)
+int ceph_renew_caps(struct inode *inode, int fmode)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req;
int err, flags, wanted;
spin_lock(&ci->i_ceph_lock);
+ __ceph_touch_fmode(ci, mdsc, fmode);
wanted = __ceph_caps_file_wanted(ci);
if (__ceph_is_any_real_caps(ci) &&
(!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
@@ -326,7 +339,6 @@ int ceph_renew_caps(struct inode *inode)
req->r_inode = inode;
ihold(inode);
req->r_num_caps = 1;
- req->r_fmode = -1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
@@ -372,9 +384,6 @@ int ceph_open(struct inode *inode, struct file *file)
/* trivially open snapdir */
if (ceph_snap(inode) == CEPH_SNAPDIR) {
- spin_lock(&ci->i_ceph_lock);
- __ceph_get_fmode(ci, fmode);
- spin_unlock(&ci->i_ceph_lock);
return ceph_init_file(inode, file, fmode);
}
@@ -392,7 +401,7 @@ int ceph_open(struct inode *inode, struct file *file)
dout("open %p fmode %d want %s issued %s using existing\n",
inode, fmode, ceph_cap_string(wanted),
ceph_cap_string(issued));
- __ceph_get_fmode(ci, fmode);
+ __ceph_touch_fmode(ci, mdsc, fmode);
spin_unlock(&ci->i_ceph_lock);
/* adjust wanted? */
@@ -404,7 +413,7 @@ int ceph_open(struct inode *inode, struct file *file)
return ceph_init_file(inode, file, fmode);
} else if (ceph_snap(inode) != CEPH_NOSNAP &&
(ci->i_snap_caps & wanted) == wanted) {
- __ceph_get_fmode(ci, fmode);
+ __ceph_touch_fmode(ci, mdsc, fmode);
spin_unlock(&ci->i_ceph_lock);
return ceph_init_file(inode, file, fmode);
}
@@ -430,6 +439,284 @@ out:
return err;
}
+/* Clone the layout from a synchronous create, if the dir now has Dc caps */
+static void
+cache_file_layout(struct inode *dst, struct inode *src)
+{
+ struct ceph_inode_info *cdst = ceph_inode(dst);
+ struct ceph_inode_info *csrc = ceph_inode(src);
+
+ spin_lock(&cdst->i_ceph_lock);
+ if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) &&
+ !ceph_file_layout_is_valid(&cdst->i_cached_layout)) {
+ memcpy(&cdst->i_cached_layout, &csrc->i_layout,
+ sizeof(cdst->i_cached_layout));
+ rcu_assign_pointer(cdst->i_cached_layout.pool_ns,
+ ceph_try_get_string(csrc->i_layout.pool_ns));
+ }
+ spin_unlock(&cdst->i_ceph_lock);
+}
+
+/*
+ * Try to set up an async create. We need caps, a file layout, an inode number,
+ * and either a lease on the dentry or complete dir info. If any of those
+ * criteria are not satisfied, return 0 and the caller can go
+ * synchronous.
+ */
+static int try_prep_async_create(struct inode *dir, struct dentry *dentry,
+ struct ceph_file_layout *lo, u64 *pino)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE;
+ u64 ino;
+
+ spin_lock(&ci->i_ceph_lock);
+ /* No auth cap means no chance for Dc caps */
+ if (!ci->i_auth_cap)
+ goto no_async;
+
+ /* Any delegated inos? */
+ if (xa_empty(&ci->i_auth_cap->session->s_delegated_inos))
+ goto no_async;
+
+ if (!ceph_file_layout_is_valid(&ci->i_cached_layout))
+ goto no_async;
+
+ if ((__ceph_caps_issued(ci, NULL) & want) != want)
+ goto no_async;
+
+ if (d_in_lookup(dentry)) {
+ if (!__ceph_dir_is_complete(ci))
+ goto no_async;
+ spin_lock(&dentry->d_lock);
+ di->lease_shared_gen = atomic_read(&ci->i_shared_gen);
+ spin_unlock(&dentry->d_lock);
+ } else if (atomic_read(&ci->i_shared_gen) !=
+ READ_ONCE(di->lease_shared_gen)) {
+ goto no_async;
+ }
+
+ ino = ceph_get_deleg_ino(ci->i_auth_cap->session);
+ if (!ino)
+ goto no_async;
+
+ *pino = ino;
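+ /* take the cap refs now; ceph_mdsc_release_dir_caps() drops them later */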
+ ceph_take_cap_refs(ci, want, false);
+ memcpy(lo, &ci->i_cached_layout, sizeof(*lo));
+ rcu_assign_pointer(lo->pool_ns,
+ ceph_try_get_string(ci->i_cached_layout.pool_ns));
+ got = want;
+no_async:
+ spin_unlock(&ci->i_ceph_lock);
+ return got;
+}
+
+static void restore_deleg_ino(struct inode *dir, u64 ino)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_mds_session *s = NULL;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_auth_cap)
+ s = ceph_get_mds_session(ci->i_auth_cap->session);
+ spin_unlock(&ci->i_ceph_lock);
+ if (s) {
+ int err = ceph_restore_deleg_ino(s, ino);
+ if (err)
+ pr_warn("ceph: unable to restore delegated ino 0x%llx to session: %d\n",
+ ino, err);
+ ceph_put_mds_session(s);
+ }
+}
+
+static void wake_async_create_waiters(struct inode *inode,
+ struct ceph_mds_session *session)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
+ ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE;
+ wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT);
+ }
+ ceph_kick_flushing_inode_caps(session, ci);
+ spin_unlock(&ci->i_ceph_lock);
+}
+
+static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ struct dentry *dentry = req->r_dentry;
+ struct inode *dinode = d_inode(dentry);
+ struct inode *tinode = req->r_target_inode;
+ int result = req->r_err ? req->r_err :
+ le32_to_cpu(req->r_reply_info.head->result);
+
+ WARN_ON_ONCE(dinode && tinode && dinode != tinode);
+
+ /* MDS changed -- caller must resubmit */
+ if (result == -EJUKEBOX)
+ goto out;
+
+ mapping_set_error(req->r_parent->i_mapping, result);
+
+ if (result) {
+ int pathlen = 0;
+ u64 base = 0;
+ char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+ &base, 0);
+
+ pr_warn("async create failure path=(%llx)%s result=%d!\n",
+ base, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path(path, pathlen);
+
+ ceph_dir_clear_complete(req->r_parent);
+ if (!d_unhashed(dentry))
+ d_drop(dentry);
+
+ if (dinode) {
+ mapping_set_error(dinode->i_mapping, result);
+ ceph_inode_shutdown(dinode);
+ wake_async_create_waiters(dinode, req->r_session);
+ }
+ }
+
+ if (tinode) {
+ u64 ino = ceph_vino(tinode).ino;
+
+ if (req->r_deleg_ino != ino)
+ pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n",
+ __func__, req->r_err, req->r_deleg_ino, ino);
+
+ mapping_set_error(tinode->i_mapping, result);
+ wake_async_create_waiters(tinode, req->r_session);
+ } else if (!result) {
+ pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__,
+ req->r_deleg_ino);
+ }
+out:
+ ceph_mdsc_release_dir_caps(req);
+}
+
+static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
+ struct file *file, umode_t mode,
+ struct ceph_mds_request *req,
+ struct ceph_acl_sec_ctx *as_ctx,
+ struct ceph_file_layout *lo)
+{
+ int ret;
+ char xattr_buf[4];
+ struct ceph_mds_reply_inode in = { };
+ struct ceph_mds_reply_info_in iinfo = { .in = &in };
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ struct inode *inode;
+ struct timespec64 now;
+ struct ceph_string *pool_ns;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
+ struct ceph_vino vino = { .ino = req->r_deleg_ino,
+ .snap = CEPH_NOSNAP };
+
+ ktime_get_real_ts64(&now);
+
+ inode = ceph_get_inode(dentry->d_sb, vino);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ iinfo.inline_version = CEPH_INLINE_NONE;
+ iinfo.change_attr = 1;
+ ceph_encode_timespec64(&iinfo.btime, &now);
+
+ if (req->r_pagelist) {
+ iinfo.xattr_len = req->r_pagelist->length;
+ iinfo.xattr_data = req->r_pagelist->mapped_tail;
+ } else {
+ /* fake it */
+ iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
+ iinfo.xattr_data = xattr_buf;
+ memset(iinfo.xattr_data, 0, iinfo.xattr_len);
+ }
+
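+ /* fabricate the reply the MDS would send so the inode can be set up now */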
+ in.ino = cpu_to_le64(vino.ino);
+ in.snapid = cpu_to_le64(CEPH_NOSNAP);
+ in.version = cpu_to_le64(1); // ???
+ in.cap.caps = in.cap.wanted = cpu_to_le32(CEPH_CAP_ALL_FILE);
+ in.cap.cap_id = cpu_to_le64(1);
+ in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino);
+ in.cap.flags = CEPH_CAP_FLAG_AUTH;
+ in.ctime = in.mtime = in.atime = iinfo.btime;
+ in.truncate_seq = cpu_to_le32(1);
+ in.truncate_size = cpu_to_le64(-1ULL);
+ in.xattr_version = cpu_to_le64(1);
+ in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid()));
+ if (dir->i_mode & S_ISGID) {
+ in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_gid));
+
+ /* Directories always inherit the setgid bit. */
+ if (S_ISDIR(mode))
+ mode |= S_ISGID;
+ } else {
+ in.gid = cpu_to_le32(from_kgid(&init_user_ns, current_fsgid()));
+ }
+ in.mode = cpu_to_le32((u32)mode);
+
+ in.nlink = cpu_to_le32(1);
+ in.max_size = cpu_to_le64(lo->stripe_unit);
+
+ ceph_file_layout_to_legacy(lo, &in.layout);
+ /* lo is private, so pool_ns can't change */
+ pool_ns = rcu_dereference_raw(lo->pool_ns);
+ if (pool_ns) {
+ iinfo.pool_ns_len = pool_ns->len;
+ iinfo.pool_ns_data = pool_ns->str;
+ }
+
+ down_read(&mdsc->snap_rwsem);
+ ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session,
+ req->r_fmode, NULL);
+ up_read(&mdsc->snap_rwsem);
+ if (ret) {
+ dout("%s failed to fill inode: %d\n", __func__, ret);
+ ceph_dir_clear_complete(dir);
+ if (!d_unhashed(dentry))
+ d_drop(dentry);
+ if (inode->i_state & I_NEW)
+ discard_new_inode(inode);
+ } else {
+ struct dentry *dn;
+
+ dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__,
+ vino.ino, ceph_ino(dir), dentry->d_name.name);
+ ceph_dir_clear_ordered(dir);
+ ceph_init_inode_acls(inode, as_ctx);
+ if (inode->i_state & I_NEW) {
+ /*
+ * If it's not I_NEW, then someone created this before
+ * we got here. Assume the server is aware of it at
+ * that point and don't worry about setting
+ * CEPH_I_ASYNC_CREATE.
+ */
+ ceph_inode(inode)->i_ceph_flags = CEPH_I_ASYNC_CREATE;
+ unlock_new_inode(inode);
+ }
+ if (d_in_lookup(dentry) || d_really_is_negative(dentry)) {
+ if (!d_unhashed(dentry))
+ d_drop(dentry);
+ dn = d_splice_alias(inode, dentry);
+ WARN_ON_ONCE(dn && dn != dentry);
+ }
+ file->f_mode |= FMODE_CREATED;
+ ret = finish_open(file, dentry, ceph_open);
+ }
+
+ spin_lock(&dentry->d_lock);
+ di->flags &= ~CEPH_DENTRY_ASYNC_CREATE;
+ wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT);
+ spin_unlock(&dentry->d_lock);
+
+ return ret;
+}
/*
* Do a lookup + open with a single request. If we get a non-existent
@@ -443,6 +730,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct ceph_mds_request *req;
struct dentry *dn;
struct ceph_acl_sec_ctx as_ctx = {};
+ bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS);
int mask;
int err;
@@ -453,6 +741,15 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
if (dentry->d_name.len > NAME_MAX)
return -ENAMETOOLONG;
+ err = ceph_wait_on_conflict_unlink(dentry);
+ if (err)
+ return err;
+ /*
+ * Do not truncate the file, since atomic_open is called before the
+ * permission check. The caller will do the truncation afterward.
+ */
+ flags &= ~O_TRUNC;
+
if (flags & O_CREAT) {
if (ceph_quota_is_max_files_exceeded(dir))
return -EDQUOT;
@@ -462,11 +759,15 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
err = ceph_security_init_secctx(dentry, mode, &as_ctx);
if (err < 0)
goto out_ctx;
+ /* Async create can't handle more than a page of xattrs */
+ if (as_ctx.pagelist &&
+ !list_is_singular(&as_ctx.pagelist->head))
+ try_async = false;
} else if (!d_in_lookup(dentry)) {
/* If it's not being looked up, it's negative */
return -ENOENT;
}
-
+retry:
/* do the open */
req = prepare_open_request(dir->i_sb, flags, mode);
if (IS_ERR(req)) {
@@ -475,30 +776,65 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
}
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
+ mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
+ if (ceph_security_xattr_wanted(dir))
+ mask |= CEPH_CAP_XATTR_SHARED;
+ req->r_args.open.mask = cpu_to_le32(mask);
+ req->r_parent = dir;
+ ihold(dir);
+
if (flags & O_CREAT) {
+ struct ceph_file_layout lo;
+
req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
if (as_ctx.pagelist) {
req->r_pagelist = as_ctx.pagelist;
as_ctx.pagelist = NULL;
}
+ if (try_async &&
+ (req->r_dir_caps =
+ try_prep_async_create(dir, dentry, &lo,
+ &req->r_deleg_ino))) {
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+
+ set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
+ req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL);
+ req->r_callback = ceph_async_create_cb;
+
+ spin_lock(&dentry->d_lock);
+ di->flags |= CEPH_DENTRY_ASYNC_CREATE;
+ spin_unlock(&dentry->d_lock);
+
+ err = ceph_mdsc_submit_request(mdsc, dir, req);
+ if (!err) {
+ err = ceph_finish_async_create(dir, dentry,
+ file, mode, req,
+ &as_ctx, &lo);
+ } else if (err == -EJUKEBOX) {
+ restore_deleg_ino(dir, req->r_deleg_ino);
+ ceph_mdsc_put_request(req);
+ try_async = false;
+ ceph_put_string(rcu_dereference_raw(lo.pool_ns));
+ goto retry;
+ }
+ ceph_put_string(rcu_dereference_raw(lo.pool_ns));
+ goto out_req;
+ }
}
- mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
- if (ceph_security_xattr_wanted(dir))
- mask |= CEPH_CAP_XATTR_SHARED;
- req->r_args.open.mask = cpu_to_le32(mask);
-
- req->r_parent = dir;
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
- err = ceph_mdsc_do_request(mdsc,
- (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
- req);
- err = ceph_handle_snapdir(req, dentry, err);
- if (err)
- goto out_req;
+ err = ceph_mdsc_do_request(mdsc, (flags & O_CREAT) ? dir : NULL, req);
+ if (err == -ENOENT) {
+ dentry = ceph_handle_snapdir(req, dentry);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto out_req;
+ }
+ err = 0;
+ }
- if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
+ if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
if (d_in_lookup(dentry)) {
@@ -518,14 +854,15 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
} else {
dout("atomic_open finish_open on dn %p\n", dn);
if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
- ceph_init_inode_acls(d_inode(dentry), &as_ctx);
+ struct inode *newino = d_inode(dentry);
+
+ cache_file_layout(dir, newino);
+ ceph_init_inode_acls(newino, &as_ctx);
file->f_mode |= FMODE_CREATED;
}
err = finish_open(file, dentry, ceph_open);
}
out_req:
- if (!req->r_err && req->r_target_inode)
- ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
ceph_mdsc_put_request(req);
out_ctx:
ceph_release_acl_sec_ctx(&as_ctx);
@@ -542,7 +879,7 @@ int ceph_release(struct inode *inode, struct file *file)
dout("release inode %p dir file %p\n", inode, file);
WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
- ceph_put_fmode(ci, dfi->file_info.fmode);
+ ceph_put_fmode(ci, dfi->file_info.fmode, 1);
if (dfi->last_readdir)
ceph_mdsc_put_request(dfi->last_readdir);
@@ -554,7 +891,9 @@ int ceph_release(struct inode *inode, struct file *file)
dout("release inode %p regular file %p\n", inode, file);
WARN_ON(!list_empty(&fi->rw_contexts));
- ceph_put_fmode(ci, fi->fmode);
+ ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE);
+ ceph_put_fmode(ci, fi->fmode, 1);
+
kmem_cache_free(ceph_file_cachep, fi);
}
@@ -590,6 +929,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
ssize_t ret;
u64 off = iocb->ki_pos;
u64 len = iov_iter_count(to);
+ u64 i_size = i_size_read(inode);
dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
(file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
@@ -613,8 +953,9 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
struct page **pages;
int num_pages;
size_t page_off;
- u64 i_size;
bool more;
+ int idx;
+ size_t left;
req = ceph_osdc_new_request(osdc, &ci->i_layout,
ci->i_vino, off, &len, 0, 1,
@@ -628,36 +969,25 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
more = len < iov_iter_count(to);
- if (unlikely(iov_iter_is_pipe(to))) {
- ret = iov_iter_get_pages_alloc(to, &pages, len,
- &page_off);
- if (ret <= 0) {
- ceph_osdc_put_request(req);
- ret = -ENOMEM;
- break;
- }
- num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
- if (ret < len) {
- len = ret;
- osd_req_op_extent_update(req, 0, len);
- more = false;
- }
- } else {
- num_pages = calc_pages_for(off, len);
- page_off = off & ~PAGE_MASK;
- pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
- if (IS_ERR(pages)) {
- ceph_osdc_put_request(req);
- ret = PTR_ERR(pages);
- break;
- }
+ num_pages = calc_pages_for(off, len);
+ page_off = off & ~PAGE_MASK;
+ pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+ if (IS_ERR(pages)) {
+ ceph_osdc_put_request(req);
+ ret = PTR_ERR(pages);
+ break;
}
osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
false, false);
- ret = ceph_osdc_start_request(osdc, req, false);
- if (!ret)
- ret = ceph_osdc_wait_request(osdc, req);
+ ceph_osdc_start_request(osdc, req);
+ ret = ceph_osdc_wait_request(osdc, req);
+
+ ceph_update_read_metrics(&fsc->mdsc->metric,
+ req->r_start_latency,
+ req->r_end_latency,
+ len, ret);
+
ceph_osdc_put_request(req);
i_size = i_size_read(inode);
@@ -675,36 +1005,27 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
ret += zlen;
}
- if (unlikely(iov_iter_is_pipe(to))) {
- if (ret > 0) {
- iov_iter_advance(to, ret);
- off += ret;
- } else {
- iov_iter_advance(to, 0);
- }
- ceph_put_page_vector(pages, num_pages, false);
- } else {
- int idx = 0;
- size_t left = ret > 0 ? ret : 0;
- while (left > 0) {
- size_t len, copied;
- page_off = off & ~PAGE_MASK;
- len = min_t(size_t, left, PAGE_SIZE - page_off);
- copied = copy_page_to_iter(pages[idx++],
- page_off, len, to);
- off += copied;
- left -= copied;
- if (copied < len) {
- ret = -EFAULT;
- break;
- }
+ idx = 0;
+ left = ret > 0 ? ret : 0;
+ while (left > 0) {
+ size_t len, copied;
+ page_off = off & ~PAGE_MASK;
+ len = min_t(size_t, left, PAGE_SIZE - page_off);
+ SetPageUptodate(pages[idx]);
+ copied = copy_page_to_iter(pages[idx++],
+ page_off, len, to);
+ off += copied;
+ left -= copied;
+ if (copied < len) {
+ ret = -EFAULT;
+ break;
}
- ceph_release_page_vector(pages, num_pages);
}
+ ceph_release_page_vector(pages, num_pages);
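With the pipe special case removed above, every sync read now lands in a freshly allocated page vector and is copied out one page-sized chunk at a time. A minimal standalone sketch of the chunking arithmetic (PAGE_SIZE and the offsets here are illustrative, not the kernel's):

	#include <stdio.h>

	#define PAGE_SIZE 4096UL
	#define PAGE_MASK (~(PAGE_SIZE - 1))

	/* Walk a byte range the way the copy-out loop does: each
	 * iteration copies at most the remainder of the current page. */
	int main(void)
	{
		unsigned long off = 4090, left = 9000; /* straddles pages */

		while (left > 0) {
			unsigned long page_off = off & ~PAGE_MASK;
			unsigned long len = left < PAGE_SIZE - page_off ?
					    left : PAGE_SIZE - page_off;
			printf("copy %lu bytes at page offset %lu\n",
			       len, page_off);
			off += len;
			left -= len;
		}
		return 0;
	}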
if (ret < 0) {
- if (ret == -EBLACKLISTED)
- fsc->blacklisted = true;
+ if (ret == -EBLOCKLISTED)
+ fsc->blocklisted = true;
break;
}
@@ -713,11 +1034,14 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
}
if (off > iocb->ki_pos) {
- if (ret >= 0 &&
- iov_iter_count(to) > 0 && off >= i_size_read(inode))
+ if (off >= i_size) {
*retry_op = CHECK_EOF;
- ret = off - iocb->ki_pos;
- iocb->ki_pos = off;
+ ret = i_size - iocb->ki_pos;
+ iocb->ki_pos = i_size;
+ } else {
+ ret = off - iocb->ki_pos;
+ iocb->ki_pos = off;
+ }
}
dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
@@ -772,7 +1096,6 @@ static void ceph_aio_complete(struct inode *inode,
}
spin_lock(&ci->i_ceph_lock);
- ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&aio_req->prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
@@ -784,7 +1107,7 @@ static void ceph_aio_complete(struct inode *inode,
ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR :
CEPH_CAP_FILE_RD));
- aio_req->iocb->ki_complete(aio_req->iocb, ret, 0);
+ aio_req->iocb->ki_complete(aio_req->iocb, ret);
ceph_free_cap_flush(aio_req->prealloc_cf);
kfree(aio_req);
@@ -796,12 +1119,13 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
struct inode *inode = req->r_inode;
struct ceph_aio_request *aio_req = req->r_priv;
struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
+ struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric;
+ unsigned int len = osd_data->bvec_pos.iter.bi_size;
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
BUG_ON(!osd_data->num_bvecs);
- dout("ceph_aio_complete_req %p rc %d bytes %u\n",
- inode, rc, osd_data->bvec_pos.iter.bi_size);
+ dout("ceph_aio_complete_req %p rc %d bytes %u\n", inode, rc, len);
if (rc == -EOLDSNAPC) {
struct ceph_aio_work *aio_work;
@@ -819,9 +1143,9 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
} else if (!aio_req->write) {
if (rc == -ENOENT)
rc = 0;
- if (rc >= 0 && osd_data->bvec_pos.iter.bi_size > rc) {
+ if (rc >= 0 && len > rc) {
struct iov_iter i;
- int zlen = osd_data->bvec_pos.iter.bi_size - rc;
+ int zlen = len - rc;
/*
* If read is satisfied by single OSD request,
@@ -838,13 +1162,22 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
}
iov_iter_bvec(&i, READ, osd_data->bvec_pos.bvecs,
- osd_data->num_bvecs,
- osd_data->bvec_pos.iter.bi_size);
+ osd_data->num_bvecs, len);
iov_iter_advance(&i, rc);
iov_iter_zero(zlen, &i);
}
}
+ /* r_start_latency == 0 means the request was not submitted */
+ if (req->r_start_latency) {
+ if (aio_req->write)
+ ceph_update_write_metrics(metric, req->r_start_latency,
+ req->r_end_latency, len, rc);
+ else
+ ceph_update_read_metrics(metric, req->r_start_latency,
+ req->r_end_latency, len, rc);
+ }
+
put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs,
aio_req->should_dirty);
ceph_osdc_put_request(req);
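The new metrics hooks account latency per OSD request, and the r_start_latency guard above keeps never-submitted requests out of the statistics. A simplified model of that bookkeeping (the struct and names are assumptions, not ceph's actual ceph_client_metric layout):

	/* Hypothetical sketch: start/end are monotonic timestamps; a
	 * zero start means the request never hit the wire, so there is
	 * nothing to account. */
	struct io_metric {
		unsigned long long total_ns;
		unsigned long long count;
	};

	static void update_metric(struct io_metric *m,
				  unsigned long long start_ns,
				  unsigned long long end_ns)
	{
		if (!start_ns)		/* request was not submitted */
			return;
		m->total_ns += end_ns - start_ns;
		m->count++;
	}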
@@ -911,7 +1244,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
req->r_inode = inode;
req->r_priv = aio_req;
- ret = ceph_osdc_start_request(req->r_osdc, req, false);
+ ceph_osdc_start_request(req->r_osdc, req);
out:
if (ret < 0) {
req->r_result = ret;
@@ -931,6 +1264,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_client_metric *metric = &fsc->mdsc->metric;
struct ceph_vino vino;
struct ceph_osd_request *req;
struct bio_vec *bvecs;
@@ -942,7 +1276,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
size_t count = iov_iter_count(iter);
loff_t pos = iocb->ki_pos;
bool write = iov_iter_rw(iter) == WRITE;
- bool should_dirty = !write && iter_is_iovec(iter);
+ bool should_dirty = !write && user_backed_iter(iter);
if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS;
@@ -952,7 +1286,11 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
snapc, snapc ? snapc->seq : 0);
if (write) {
- int ret2 = invalidate_inode_pages2_range(inode->i_mapping,
+ int ret2;
+
+ ceph_fscache_invalidate(inode, true);
+
+ ret2 = invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_SHIFT,
(pos + count - 1) >> PAGE_SHIFT);
if (ret2 < 0)
@@ -1043,9 +1381,15 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
continue;
}
- ret = ceph_osdc_start_request(req->r_osdc, req, false);
- if (!ret)
- ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ ceph_osdc_start_request(req->r_osdc, req);
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+
+ if (write)
+ ceph_update_write_metrics(metric, req->r_start_latency,
+ req->r_end_latency, len, ret);
+ else
+ ceph_update_read_metrics(metric, req->r_start_latency,
+ req->r_end_latency, len, ret);
size = i_size_read(inode);
if (!write) {
@@ -1101,8 +1445,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
r_private_item);
list_del_init(&req->r_private_item);
if (ret >= 0)
- ret = ceph_osdc_start_request(req->r_osdc,
- req, false);
+ ceph_osdc_start_request(req->r_osdc, req);
if (ret < 0) {
req->r_result = ret;
ceph_aio_complete_req(req);
@@ -1156,6 +1499,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
if (ret < 0)
return ret;
+ ceph_fscache_invalidate(inode, false);
ret = invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_SHIFT,
(pos + count - 1) >> PAGE_SHIFT);
@@ -1214,10 +1558,11 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
false, true);
req->r_mtime = mtime;
- ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
- if (!ret)
- ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ ceph_osdc_start_request(&fsc->client->osdc, req);
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
+ req->r_end_latency, len, ret);
out:
ceph_osdc_put_request(req);
if (ret != 0) {
@@ -1259,28 +1604,31 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
size_t len = iov_iter_count(to);
struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct page *pinned_page = NULL;
+ bool direct_lock = iocb->ki_flags & IOCB_DIRECT;
ssize_t ret;
- int want, got = 0;
+ int want = 0, got = 0;
int retry_op = 0, read = 0;
again:
dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
- if (iocb->ki_flags & IOCB_DIRECT)
+ if (ceph_inode_is_shutdown(inode))
+ return -ESTALE;
+
+ if (direct_lock)
ceph_start_io_direct(inode);
else
ceph_start_io_read(inode);
+ if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
+ want |= CEPH_CAP_FILE_CACHE;
if (fi->fmode & CEPH_FILE_MODE_LAZY)
- want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
- else
- want = CEPH_CAP_FILE_CACHE;
- ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1,
- &got, &pinned_page);
+ want |= CEPH_CAP_FILE_LAZYIO;
+
+ ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got);
if (ret < 0) {
- if (iocb->ki_flags & IOCB_DIRECT)
+ if (direct_lock)
ceph_end_io_direct(inode);
else
ceph_end_io_read(inode);
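The hunk above builds the cap "want" mask incrementally instead of choosing between two fixed values: Fc is wanted only for cacheable (non-sync, non-direct) reads, and LAZYIO is OR'd on top for lazy-mode files. A toy restatement with hypothetical flag values (the real CEPH_CAP_* constants differ):

	#define WANT_CACHE  0x1	/* Fc: page-cache reads allowed */
	#define WANT_LAZYIO 0x2	/* relaxed consistency */

	static int build_read_want(int sync_flag, int direct, int lazy)
	{
		int want = 0;

		if (!sync_flag && !direct)
			want |= WANT_CACHE;
		if (lazy)
			want |= WANT_LAZYIO;
		return want;
	}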
@@ -1295,7 +1643,7 @@ again:
inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
ceph_cap_string(got));
- if (ci->i_inline_version == CEPH_INLINE_NONE) {
+ if (!ceph_has_inline_data(ci)) {
if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
ret = ceph_direct_read_write(iocb, to,
NULL, NULL);
@@ -1319,13 +1667,9 @@ again:
dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
- if (pinned_page) {
- put_page(pinned_page);
- pinned_page = NULL;
- }
ceph_put_cap_refs(ci, got);
- if (iocb->ki_flags & IOCB_DIRECT)
+ if (direct_lock)
ceph_end_io_direct(inode);
else
ceph_end_io_read(inode);
@@ -1415,13 +1759,19 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_cap_flush *prealloc_cf;
ssize_t count, written = 0;
- int err, want, got;
+ int err, want = 0, got;
bool direct_lock = false;
+ u32 map_flags;
+ u64 pool_flags;
loff_t pos;
loff_t limit = max(i_size_read(inode), fsc->max_file_size);
+ if (ceph_inode_is_shutdown(inode))
+ return -ESTALE;
+
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
@@ -1465,40 +1815,37 @@ retry_snap:
goto out;
}
- err = file_remove_privs(file);
- if (err)
- goto out;
-
- err = file_update_time(file);
- if (err)
+ down_read(&osdc->lock);
+ map_flags = osdc->osdmap->flags;
+ pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id);
+ up_read(&osdc->lock);
+ if ((map_flags & CEPH_OSDMAP_FULL) ||
+ (pool_flags & CEPH_POOL_FLAG_FULL)) {
+ err = -ENOSPC;
goto out;
-
- inode_inc_iversion_raw(inode);
-
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- err = ceph_uninline_data(file, NULL);
- if (err < 0)
- goto out;
}
- /* FIXME: not complete since it doesn't account for being at quota */
- if (ceph_osdmap_flag(&fsc->client->osdc, CEPH_OSDMAP_FULL)) {
- err = -ENOSPC;
+ err = file_remove_privs(file);
+ if (err)
goto out;
- }
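Rather than testing ceph_osdmap_flag() per check, the write path now snapshots both the osdmap flags and the per-pool flags once under osdc->lock and tests the snapshot afterwards, so the two reads are consistent with each other. A userspace sketch of the same pattern (types, names and bit values are made up; the lock is assumed initialized elsewhere):

	#include <pthread.h>

	struct osdmap_view {
		pthread_rwlock_t lock;
		unsigned map_flags;
		unsigned pool_flags;
	};

	#define MAP_FULL  0x1
	#define POOL_FULL 0x2

	/* Sample cluster/pool fullness once under the map lock, then
	 * test the snapshot outside the lock. */
	static int write_allowed(struct osdmap_view *v)
	{
		unsigned mf, pf;

		pthread_rwlock_rdlock(&v->lock);
		mf = v->map_flags;
		pf = v->pool_flags;
		pthread_rwlock_unlock(&v->lock);

		return !(mf & MAP_FULL) && !(pf & POOL_FULL);
	}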
dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
inode, ceph_vinop(inode), pos, count, i_size_read(inode));
+ if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
+ want |= CEPH_CAP_FILE_BUFFER;
if (fi->fmode & CEPH_FILE_MODE_LAZY)
- want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
- else
- want = CEPH_CAP_FILE_BUFFER;
+ want |= CEPH_CAP_FILE_LAZYIO;
got = 0;
- err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count,
- &got, NULL);
+ err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got);
if (err < 0)
goto out;
+ err = file_update_time(file);
+ if (err)
+ goto out_caps;
+
+ inode_inc_iversion_raw(inode);
+
dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n",
inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
@@ -1543,7 +1890,7 @@ retry_snap:
* are pending vmtruncate. So write and vmtruncate
 * cannot run at the same time
*/
- written = generic_perform_write(file, from, pos);
+ written = generic_perform_write(iocb, from);
if (likely(written >= 0))
iocb->ki_pos = pos + written;
ceph_end_io_write(inode);
@@ -1553,14 +1900,13 @@ retry_snap:
int dirty;
spin_lock(&ci->i_ceph_lock);
- ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
- ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
+ ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
}
dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
@@ -1575,12 +1921,15 @@ retry_snap:
}
if (written >= 0) {
- if (ceph_osdmap_flag(&fsc->client->osdc, CEPH_OSDMAP_NEARFULL))
+ if ((map_flags & CEPH_OSDMAP_NEARFULL) ||
+ (pool_flags & CEPH_POOL_FLAG_NEARFULL))
iocb->ki_flags |= IOCB_DSYNC;
written = generic_write_sync(iocb, written);
}
goto out_unlocked;
+out_caps:
+ ceph_put_cap_refs(ci, got);
out:
if (direct_lock)
ceph_end_io_direct(inode);
@@ -1597,57 +1946,15 @@ out_unlocked:
*/
static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
{
- struct inode *inode = file->f_mapping->host;
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- loff_t i_size;
- loff_t ret;
-
- inode_lock(inode);
-
if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
+ struct inode *inode = file_inode(file);
+ int ret;
+
ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
if (ret < 0)
- goto out;
- }
-
- i_size = i_size_read(inode);
- switch (whence) {
- case SEEK_END:
- offset += i_size;
- break;
- case SEEK_CUR:
- /*
- * Here we special-case the lseek(fd, 0, SEEK_CUR)
- * position-querying operation. Avoid rewriting the "same"
- * f_pos value back to the file because a concurrent read(),
- * write() or lseek() might have altered it
- */
- if (offset == 0) {
- ret = file->f_pos;
- goto out;
- }
- offset += file->f_pos;
- break;
- case SEEK_DATA:
- if (offset < 0 || offset >= i_size) {
- ret = -ENXIO;
- goto out;
- }
- break;
- case SEEK_HOLE:
- if (offset < 0 || offset >= i_size) {
- ret = -ENXIO;
- goto out;
- }
- offset = i_size;
- break;
+ return ret;
}
-
- ret = vfs_setpos(file, offset, max(i_size, fsc->max_file_size));
-
-out:
- inode_unlock(inode);
- return ret;
+ return generic_file_llseek(file, offset, whence);
}
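After the rewrite, ceph_llseek only refreshes the file size from the MDS for SEEK_END, SEEK_DATA and SEEK_HOLE — the three whence values whose result depends on i_size — and delegates everything else to generic_file_llseek(). From userspace the semantics look like this (the path is hypothetical):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/tmp/example", O_RDONLY); /* hypothetical */
		if (fd < 0)
			return 1;

		/* Both of these depend on an up-to-date size, which is
		 * why the ceph hook does a getattr first. */
		off_t end  = lseek(fd, 0, SEEK_END);
		off_t data = lseek(fd, 0, SEEK_DATA);

		printf("size=%lld first-data=%lld\n",
		       (long long)end, (long long)data);
		close(fd);
		return 0;
	}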
static inline void ceph_zero_partial_page(
@@ -1716,12 +2023,10 @@ static int ceph_zero_partial_object(struct inode *inode,
}
req->r_mtime = inode->i_mtime;
- ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
- if (!ret) {
- ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
- if (ret == -ENOENT)
- ret = 0;
- }
+ ceph_osdc_start_request(&fsc->client->osdc, req);
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ if (ret == -ENOENT)
+ ret = 0;
ceph_osdc_put_request(req);
out:
@@ -1804,12 +2109,6 @@ static long ceph_fallocate(struct file *file, int mode,
goto unlock;
}
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- ret = ceph_uninline_data(file, NULL);
- if (ret < 0)
- goto unlock;
- }
-
size = i_size_read(inode);
/* Are we punching a hole beyond EOF? */
@@ -1823,22 +2122,24 @@ static long ceph_fallocate(struct file *file, int mode,
else
want = CEPH_CAP_FILE_BUFFER;
- ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got, NULL);
+ ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got);
if (ret < 0)
goto unlock;
+ filemap_invalidate_lock(inode->i_mapping);
+ ceph_fscache_invalidate(inode, false);
ceph_zero_pagecache_range(inode, offset, length);
ret = ceph_zero_objects(inode, offset, length);
if (!ret) {
spin_lock(&ci->i_ceph_lock);
- ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
}
+ filemap_invalidate_unlock(inode->i_mapping);
ceph_put_cap_refs(ci, got);
unlock:
@@ -1861,7 +2162,7 @@ static int get_rd_wr_caps(struct file *src_filp, int *src_got,
retry_caps:
ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
- dst_endoff, dst_got, NULL);
+ dst_endoff, dst_got);
if (ret < 0)
return ret;
@@ -1883,7 +2184,7 @@ retry_caps:
return ret;
}
ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD,
- CEPH_CAP_FILE_SHARED, -1, src_got, NULL);
+ CEPH_CAP_FILE_SHARED, -1, src_got);
if (ret < 0)
return ret;
/*... drop src_ci caps too, and retry */
@@ -1936,6 +2237,127 @@ static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
return 0;
}
+static struct ceph_osd_request *
+ceph_alloc_copyfrom_request(struct ceph_osd_client *osdc,
+ u64 src_snapid,
+ struct ceph_object_id *src_oid,
+ struct ceph_object_locator *src_oloc,
+ struct ceph_object_id *dst_oid,
+ struct ceph_object_locator *dst_oloc,
+ u32 truncate_seq, u64 truncate_size)
+{
+ struct ceph_osd_request *req;
+ int ret;
+ u32 src_fadvise_flags =
+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE;
+ u32 dst_fadvise_flags =
+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
+
+ req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
+ if (!req)
+ return ERR_PTR(-ENOMEM);
+
+ req->r_flags = CEPH_OSD_FLAG_WRITE;
+
+ ceph_oloc_copy(&req->r_t.base_oloc, dst_oloc);
+ ceph_oid_copy(&req->r_t.base_oid, dst_oid);
+
+ ret = osd_req_op_copy_from_init(req, src_snapid, 0,
+ src_oid, src_oloc,
+ src_fadvise_flags,
+ dst_fadvise_flags,
+ truncate_seq,
+ truncate_size,
+ CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
+ if (ret)
+ goto out;
+
+ ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
+ if (ret)
+ goto out;
+
+ return req;
+
+out:
+ ceph_osdc_put_request(req);
+ return ERR_PTR(ret);
+}
+
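ceph_alloc_copyfrom_request() folds its two failure modes (request allocation and op setup) into a single ERR_PTR-encoded return, so the caller needs only one IS_ERR() test. A minimal userspace rendition of that kernel idiom:

	#include <errno.h>
	#include <stdlib.h>

	/* Encode small negative errnos in the pointer itself so that
	 * allocation and setup errors share one return value. */
	#define MAX_ERRNO 4095

	static inline void *ERR_PTR(long error) { return (void *)error; }
	static inline long PTR_ERR(const void *p) { return (long)p; }
	static inline int IS_ERR(const void *p)
	{
		return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
	}

	static void *alloc_request(void)
	{
		void *req = malloc(64);

		if (!req)
			return ERR_PTR(-ENOMEM);
		return req;
	}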
+static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off,
+ struct ceph_inode_info *dst_ci, u64 *dst_off,
+ struct ceph_fs_client *fsc,
+ size_t len, unsigned int flags)
+{
+ struct ceph_object_locator src_oloc, dst_oloc;
+ struct ceph_object_id src_oid, dst_oid;
+ struct ceph_osd_client *osdc;
+ struct ceph_osd_request *req;
+ size_t bytes = 0;
+ u64 src_objnum, src_objoff, dst_objnum, dst_objoff;
+ u32 src_objlen, dst_objlen;
+ u32 object_size = src_ci->i_layout.object_size;
+ int ret;
+
+ src_oloc.pool = src_ci->i_layout.pool_id;
+ src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
+ dst_oloc.pool = dst_ci->i_layout.pool_id;
+ dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
+ osdc = &fsc->client->osdc;
+
+ while (len >= object_size) {
+ ceph_calc_file_object_mapping(&src_ci->i_layout, *src_off,
+ object_size, &src_objnum,
+ &src_objoff, &src_objlen);
+ ceph_calc_file_object_mapping(&dst_ci->i_layout, *dst_off,
+ object_size, &dst_objnum,
+ &dst_objoff, &dst_objlen);
+ ceph_oid_init(&src_oid);
+ ceph_oid_printf(&src_oid, "%llx.%08llx",
+ src_ci->i_vino.ino, src_objnum);
+ ceph_oid_init(&dst_oid);
+ ceph_oid_printf(&dst_oid, "%llx.%08llx",
+ dst_ci->i_vino.ino, dst_objnum);
+ /* Do an object remote copy */
+ req = ceph_alloc_copyfrom_request(osdc, src_ci->i_vino.snap,
+ &src_oid, &src_oloc,
+ &dst_oid, &dst_oloc,
+ dst_ci->i_truncate_seq,
+ dst_ci->i_truncate_size);
+ if (IS_ERR(req))
+ ret = PTR_ERR(req);
+ else {
+ ceph_osdc_start_request(osdc, req);
+ ret = ceph_osdc_wait_request(osdc, req);
+ ceph_update_copyfrom_metrics(&fsc->mdsc->metric,
+ req->r_start_latency,
+ req->r_end_latency,
+ object_size, ret);
+ ceph_osdc_put_request(req);
+ }
+ if (ret) {
+ if (ret == -EOPNOTSUPP) {
+ fsc->have_copy_from2 = false;
+ pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
+ }
+ dout("ceph_osdc_copy_from returned %d\n", ret);
+ if (!bytes)
+ bytes = ret;
+ goto out;
+ }
+ len -= object_size;
+ bytes += object_size;
+ *src_off += object_size;
+ *dst_off += object_size;
+ }
+
+out:
+ ceph_oloc_destroy(&src_oloc);
+ ceph_oloc_destroy(&dst_oloc);
+ return bytes;
+}
+
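ceph_do_objects_copy() offloads only whole objects: it walks src/dst one object_size stride at a time and returns the byte count it managed, leaving any unaligned head or tail for do_splice_direct(). For a simple layout (stripe_unit == object_size, stripe_count == 1) the mapping it relies on reduces to plain division — a hypothetical simplification of ceph_calc_file_object_mapping():

	#include <stdio.h>

	int main(void)
	{
		unsigned long long object_size = 4ULL << 20;	/* 4 MiB */
		unsigned long long off = 13ULL << 20;		/* 13 MiB */

		unsigned long long objnum = off / object_size;	/* -> 3 */
		unsigned long long objoff = off % object_size;	/* -> 1 MiB */

		printf("object %llu, offset %llu\n", objnum, objoff);
		return 0;
	}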
static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
struct file *dst_file, loff_t dst_off,
size_t len, unsigned int flags)
@@ -1946,14 +2368,11 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
struct ceph_inode_info *dst_ci = ceph_inode(dst_inode);
struct ceph_cap_flush *prealloc_cf;
struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode);
- struct ceph_object_locator src_oloc, dst_oloc;
- struct ceph_object_id src_oid, dst_oid;
- loff_t endoff = 0, size;
- ssize_t ret = -EIO;
+ loff_t size;
+ ssize_t ret = -EIO, bytes;
u64 src_objnum, dst_objnum, src_objoff, dst_objoff;
- u32 src_objlen, dst_objlen, object_size;
+ u32 src_objlen, dst_objlen;
int src_got = 0, dst_got = 0, err, dirty;
- bool do_final_copy = false;
if (src_inode->i_sb != dst_inode->i_sb) {
struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode);
@@ -2031,22 +2450,15 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
if (ret < 0)
goto out_caps;
- size = i_size_read(dst_inode);
- endoff = dst_off + len;
-
/* Drop dst file cached pages */
+ ceph_fscache_invalidate(dst_inode, false);
ret = invalidate_inode_pages2_range(dst_inode->i_mapping,
dst_off >> PAGE_SHIFT,
- endoff >> PAGE_SHIFT);
+ (dst_off + len) >> PAGE_SHIFT);
if (ret < 0) {
dout("Failed to invalidate inode pages (%zd)\n", ret);
ret = 0; /* XXX */
}
- src_oloc.pool = src_ci->i_layout.pool_id;
- src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
- dst_oloc.pool = dst_ci->i_layout.pool_id;
- dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
-
ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
src_ci->i_layout.object_size,
&src_objnum, &src_objoff, &src_objlen);
@@ -2065,6 +2477,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
* starting at the src_off
*/
if (src_objoff) {
+ dout("Initial partial copy of %u bytes\n", src_objlen);
+
/*
* we need to temporarily drop all caps as we'll be calling
* {read,write}_iter, which will get caps again.
@@ -2072,8 +2486,9 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
ret = do_splice_direct(src_file, &src_off, dst_file,
&dst_off, src_objlen, flags);
- if (ret < 0) {
- dout("do_splice_direct returned %d\n", err);
+ /* Abort on short copies or on error */
+ if (ret < src_objlen) {
+ dout("Failed partial copy (%zd)\n", ret);
goto out;
}
len -= ret;
@@ -2086,69 +2501,31 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
if (err < 0)
goto out_caps;
}
- object_size = src_ci->i_layout.object_size;
- while (len >= object_size) {
- ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
- object_size, &src_objnum,
- &src_objoff, &src_objlen);
- ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off,
- object_size, &dst_objnum,
- &dst_objoff, &dst_objlen);
- ceph_oid_init(&src_oid);
- ceph_oid_printf(&src_oid, "%llx.%08llx",
- src_ci->i_vino.ino, src_objnum);
- ceph_oid_init(&dst_oid);
- ceph_oid_printf(&dst_oid, "%llx.%08llx",
- dst_ci->i_vino.ino, dst_objnum);
- /* Do an object remote copy */
- err = ceph_osdc_copy_from(
- &src_fsc->client->osdc,
- src_ci->i_vino.snap, 0,
- &src_oid, &src_oloc,
- CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
- CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
- &dst_oid, &dst_oloc,
- CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
- CEPH_OSD_OP_FLAG_FADVISE_DONTNEED,
- dst_ci->i_truncate_seq, dst_ci->i_truncate_size,
- CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
- if (err) {
- if (err == -EOPNOTSUPP) {
- src_fsc->have_copy_from2 = false;
- pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
- }
- dout("ceph_osdc_copy_from returned %d\n", err);
- if (!ret)
- ret = err;
- goto out_caps;
- }
- len -= object_size;
- src_off += object_size;
- dst_off += object_size;
- ret += object_size;
- }
- if (len)
- /* We still need one final local copy */
- do_final_copy = true;
+ size = i_size_read(dst_inode);
+ bytes = ceph_do_objects_copy(src_ci, &src_off, dst_ci, &dst_off,
+ src_fsc, len, flags);
+ if (bytes <= 0) {
+ if (!ret)
+ ret = bytes;
+ goto out_caps;
+ }
+ dout("Copied %zu bytes out of %zu\n", bytes, len);
+ len -= bytes;
+ ret += bytes;
file_update_time(dst_file);
inode_inc_iversion_raw(dst_inode);
- if (endoff > size) {
- int caps_flags = 0;
-
+ if (dst_off > size) {
/* Let the MDS know about dst file size change */
- if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff))
- caps_flags |= CHECK_CAPS_NODELAY;
- if (ceph_inode_set_size(dst_inode, endoff))
- caps_flags |= CHECK_CAPS_AUTHONLY;
- if (caps_flags)
- ceph_check_caps(dst_ci, caps_flags, NULL);
+ if (ceph_inode_set_size(dst_inode, dst_off) ||
+ ceph_quota_is_max_bytes_approaching(dst_inode, dst_off))
+ ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH,
+ NULL);
}
/* Mark Fw dirty */
spin_lock(&dst_ci->i_ceph_lock);
- dst_ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf);
spin_unlock(&dst_ci->i_ceph_lock);
if (dirty)
@@ -2157,15 +2534,18 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
out_caps:
put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
- if (do_final_copy) {
- err = do_splice_direct(src_file, &src_off, dst_file,
- &dst_off, len, flags);
- if (err < 0) {
- dout("do_splice_direct returned %d\n", err);
- goto out;
- }
- len -= err;
- ret += err;
+ /*
+ * Do the final manual copy if we still have some bytes left, unless
+ * there were errors in remote object copies (len >= object_size).
+ */
+ if (len && (len < src_ci->i_layout.object_size)) {
+ dout("Final partial copy of %zu bytes\n", len);
+ bytes = do_splice_direct(src_file, &src_off, dst_file,
+ &dst_off, len, flags);
+ if (bytes > 0)
+ ret += bytes;
+ else
+ dout("Failed partial copy (%zd)\n", bytes);
}
out:
@@ -2198,6 +2578,7 @@ const struct file_operations ceph_file_fops = {
.mmap = ceph_mmap,
.fsync = ceph_fsync,
.lock = ceph_lock,
+ .setlease = simple_nosetlease,
.flock = ceph_flock,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index d01710a16a4a..4af5e55abc15 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -41,26 +41,31 @@ static void ceph_inode_work(struct work_struct *work);
*/
static int ceph_set_ino_cb(struct inode *inode, void *data)
{
- ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
- inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+
+ ci->i_vino = *(struct ceph_vino *)data;
+ inode->i_ino = ceph_vino_to_ino_t(ci->i_vino);
inode_set_iversion_raw(inode, 0);
+ percpu_counter_inc(&mdsc->metric.total_inodes);
+
return 0;
}
struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
{
struct inode *inode;
- ino_t t = ceph_vino_to_ino(vino);
- inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
+ if (ceph_vino_is_reserved(vino))
+ return ERR_PTR(-EREMOTEIO);
+
+ inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
+ ceph_set_ino_cb, &vino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (inode->i_state & I_NEW)
- dout("get_inode created new inode %p %llx.%llx ino %llx\n",
- inode, ceph_vinop(inode), (u64)inode->i_ino);
- dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
- vino.snap, inode);
+ dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode),
+ ceph_vinop(inode), inode, !!(inode->i_state & I_NEW));
return inode;
}
@@ -76,21 +81,44 @@ struct inode *ceph_get_snapdir(struct inode *parent)
struct inode *inode = ceph_get_inode(parent->i_sb, vino);
struct ceph_inode_info *ci = ceph_inode(inode);
- BUG_ON(!S_ISDIR(parent->i_mode));
if (IS_ERR(inode))
return inode;
+
+ if (!S_ISDIR(parent->i_mode)) {
+ pr_warn_once("bad snapdir parent type (mode=0%o)\n",
+ parent->i_mode);
+ goto err;
+ }
+
+ if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) {
+ pr_warn_once("bad snapdir inode type (mode=0%o)\n",
+ inode->i_mode);
+ goto err;
+ }
+
inode->i_mode = parent->i_mode;
inode->i_uid = parent->i_uid;
inode->i_gid = parent->i_gid;
- inode->i_op = &ceph_snapdir_iops;
- inode->i_fop = &ceph_snapdir_fops;
- ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
+ inode->i_mtime = parent->i_mtime;
+ inode->i_ctime = parent->i_ctime;
+ inode->i_atime = parent->i_atime;
ci->i_rbytes = 0;
+ ci->i_btime = ceph_inode(parent)->i_btime;
- if (inode->i_state & I_NEW)
+ if (inode->i_state & I_NEW) {
+ inode->i_op = &ceph_snapdir_iops;
+ inode->i_fop = &ceph_snapdir_fops;
+ ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
unlock_new_inode(inode);
+ }
return inode;
+err:
+ if ((inode->i_state & I_NEW))
+ discard_new_inode(inode);
+ else
+ iput(inode);
+ return ERR_PTR(-ENOTDIR);
}
const struct inode_operations ceph_file_iops = {
@@ -148,7 +176,7 @@ static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
rb_insert_color(&frag->node, &ci->i_fragtree);
dout("get_or_create_frag added %llx.%llx frag %x\n",
- ceph_vinop(&ci->vfs_inode), f);
+ ceph_vinop(&ci->netfs.inode), f);
return frag;
}
@@ -334,7 +362,7 @@ static int ceph_fill_fragtree(struct inode *inode,
if (nsplits != ci->i_fragtree_nsplits) {
update = true;
} else if (nsplits) {
- i = prandom_u32() % nsplits;
+ i = prandom_u32_max(nsplits);
id = le32_to_cpu(fragtree->splits[i].frag);
if (!__ceph_find_frag(ci, id))
update = true;
@@ -425,11 +453,14 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
struct ceph_inode_info *ci;
int i;
- ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
+ ci = alloc_inode_sb(sb, ceph_inode_cachep, GFP_NOFS);
if (!ci)
return NULL;
- dout("alloc_inode %p\n", &ci->vfs_inode);
+ dout("alloc_inode %p\n", &ci->netfs.inode);
+
+ /* Set parameters for the netfs library */
+ netfs_inode_init(&ci->netfs, &ceph_netfs_ops);
spin_lock_init(&ci->i_ceph_lock);
@@ -447,6 +478,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_max_files = 0;
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
+ memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
ci->i_fragtree = RB_ROOT;
@@ -471,13 +503,13 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_prealloc_cap_flush = NULL;
INIT_LIST_HEAD(&ci->i_cap_flush_list);
init_waitqueue_head(&ci->i_cap_wq);
- ci->i_hold_caps_min = 0;
ci->i_hold_caps_max = 0;
INIT_LIST_HEAD(&ci->i_cap_delay_list);
INIT_LIST_HEAD(&ci->i_cap_snaps);
ci->i_head_snapc = NULL;
ci->i_snap_caps = 0;
+ ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ;
for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
ci->i_nr_by_mode[i] = 0;
@@ -496,6 +528,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_rdcache_ref = 0;
ci->i_wr_ref = 0;
ci->i_wb_ref = 0;
+ ci->i_fx_ref = 0;
ci->i_wrbuffer_ref = 0;
ci->i_wrbuffer_ref_head = 0;
atomic_set(&ci->i_filelock_ref, 0);
@@ -514,12 +547,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_WORK(&ci->i_work, ceph_inode_work);
ci->i_work_mask = 0;
memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
-
- ceph_fscache_inode_init(ci);
-
- ci->i_meta_err = 0;
-
- return &ci->vfs_inode;
+ return &ci->netfs.inode;
}
void ceph_free_inode(struct inode *inode)
@@ -533,19 +561,24 @@ void ceph_free_inode(struct inode *inode)
void ceph_evict_inode(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_inode_frag *frag;
struct rb_node *n;
dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
+ percpu_counter_dec(&mdsc->metric.total_inodes);
+
truncate_inode_pages_final(&inode->i_data);
+ if (inode->i_state & I_PINNING_FSCACHE_WB)
+ ceph_fscache_unuse_cookie(inode, true);
clear_inode(inode);
ceph_fscache_unregister_inode_cookie(ci);
__ceph_remove_caps(ci);
- if (__ceph_has_any_quota(ci))
+ if (__ceph_has_quota(ci, QUOTA_GET_ANY))
ceph_adjust_quota_realms_count(inode, false);
/*
@@ -553,19 +586,10 @@ void ceph_evict_inode(struct inode *inode)
* caps in i_snap_caps.
*/
if (ci->i_snap_realm) {
- struct ceph_mds_client *mdsc =
- ceph_inode_to_client(inode)->mdsc;
if (ceph_snap(inode) == CEPH_NOSNAP) {
- struct ceph_snap_realm *realm = ci->i_snap_realm;
dout(" dropping residual ref to snap realm %p\n",
- realm);
- spin_lock(&realm->inodes_with_caps_lock);
- list_del_init(&ci->i_snap_realm_item);
- ci->i_snap_realm = NULL;
- if (realm->ino == ci->i_vino.ino)
- realm->inode = NULL;
- spin_unlock(&realm->inodes_with_caps_lock);
- ceph_put_snap_realm(mdsc, realm);
+ ci->i_snap_realm);
+ ceph_change_snap_realm(inode, NULL);
} else {
ceph_put_snapid_map(mdsc, ci->i_snapid_map);
ci->i_snap_realm = NULL;
@@ -586,6 +610,7 @@ void ceph_evict_inode(struct inode *inode)
ceph_buffer_put(ci->i_xattrs.prealloc_blob);
ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
+ ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
}
static inline blkcnt_t calc_inode_blocks(u64 size)
@@ -606,16 +631,23 @@ int ceph_fill_file_size(struct inode *inode, int issued,
{
struct ceph_inode_info *ci = ceph_inode(inode);
int queue_trunc = 0;
+ loff_t isize = i_size_read(inode);
if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
- (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
- dout("size %lld -> %llu\n", inode->i_size, size);
+ (truncate_seq == ci->i_truncate_seq && size > isize)) {
+ dout("size %lld -> %llu\n", isize, size);
if (size > 0 && S_ISDIR(inode->i_mode)) {
pr_err("fill_file_size non-zero size for directory\n");
size = 0;
}
i_size_write(inode, size);
inode->i_blocks = calc_inode_blocks(size);
+ /*
+ * If we're expanding, then we should be able to just update
+ * the existing cookie.
+ */
+ if (size > isize)
+ ceph_fscache_update(inode);
ci->i_reported_size = size;
if (truncate_seq != ci->i_truncate_seq) {
dout("truncate_seq %u -> %u\n",
@@ -636,7 +668,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
if ((issued & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_BUFFER)) ||
mapping_mapped(inode->i_mapping) ||
- __ceph_caps_file_wanted(ci)) {
+ __ceph_is_file_opened(ci)) {
ci->i_truncate_pending++;
queue_trunc = 1;
}
@@ -648,10 +680,6 @@ int ceph_fill_file_size(struct inode *inode, int issued,
truncate_size);
ci->i_truncate_size = truncate_size;
}
-
- if (queue_trunc)
- ceph_fscache_invalidate(inode);
-
return queue_trunc;
}
@@ -727,13 +755,13 @@ void ceph_fill_file_time(struct inode *inode, int issued,
* Populate an inode based on info from mds. May be called on new or
* existing inodes.
*/
-static int fill_inode(struct inode *inode, struct page *locked_page,
- struct ceph_mds_reply_info_in *iinfo,
- struct ceph_mds_reply_dirfrag *dirinfo,
- struct ceph_mds_session *session, int cap_fmode,
- struct ceph_cap_reservation *caps_reservation)
+int ceph_fill_inode(struct inode *inode, struct page *locked_page,
+ struct ceph_mds_reply_info_in *iinfo,
+ struct ceph_mds_reply_dirfrag *dirinfo,
+ struct ceph_mds_session *session, int cap_fmode,
+ struct ceph_cap_reservation *caps_reservation)
{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_mds_reply_inode *info = iinfo->in;
struct ceph_inode_info *ci = ceph_inode(inode);
int issued, new_issued, info_caps;
@@ -747,11 +775,34 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
bool queue_trunc = false;
bool new_version = false;
bool fill_inline = false;
+ umode_t mode = le32_to_cpu(info->mode);
+ dev_t rdev = le32_to_cpu(info->rdev);
+
+ lockdep_assert_held(&mdsc->snap_rwsem);
- dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
+ dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__,
inode, ceph_vinop(inode), le64_to_cpu(info->version),
ci->i_version);
+ /* Once I_NEW is cleared, we can't change type or dev numbers */
+ if (inode->i_state & I_NEW) {
+ inode->i_mode = mode;
+ } else {
+ if (inode_wrong_type(inode, mode)) {
+ pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n",
+ ceph_vinop(inode), inode->i_mode, mode);
+ return -ESTALE;
+ }
+
+ if ((S_ISCHR(mode) || S_ISBLK(mode)) && inode->i_rdev != rdev) {
+ pr_warn_once("dev inode rdev changed! (ino %llx.%llx is %u:%u, mds says %u:%u)\n",
+ ceph_vinop(inode), MAJOR(inode->i_rdev),
+ MINOR(inode->i_rdev), MAJOR(rdev),
+ MINOR(rdev));
+ return -ESTALE;
+ }
+ }
+
info_caps = le32_to_cpu(info->cap.caps);
/* prealloc new cap struct */
@@ -769,7 +820,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if (iinfo->xattr_len > 4) {
xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
if (!xattr_blob)
- pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
+ pr_err("%s ENOMEM xattr blob %d bytes\n", __func__,
iinfo->xattr_len);
}
@@ -805,8 +856,6 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
issued |= __ceph_caps_dirty(ci);
new_issued = ~issued & info_caps;
- /* update inode */
- inode->i_rdev = le32_to_cpu(info->rdev);
/* directories have fl_stripe_unit set to zero */
if (le32_to_cpu(info->layout.fl_stripe_unit))
inode->i_blkbits =
@@ -818,7 +867,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
(issued & CEPH_CAP_AUTH_EXCL) == 0) {
- inode->i_mode = le32_to_cpu(info->mode);
+ inode->i_mode = mode;
inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid));
inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));
dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
@@ -884,6 +933,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
ci->i_rfiles = le64_to_cpu(info->rfiles);
ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
ci->i_dir_pin = iinfo->dir_pin;
+ ci->i_rsnaps = iinfo->rsnaps;
ceph_decode_timespec64(&ci->i_rctime, &info->rctime);
}
}
@@ -916,7 +966,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
case S_IFCHR:
case S_IFSOCK:
inode->i_blkbits = PAGE_SHIFT;
- init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ init_special_inode(inode, inode->i_mode, rdev);
inode->i_op = &ceph_file_iops;
break;
case S_IFREG:
@@ -932,8 +982,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
spin_unlock(&ci->i_ceph_lock);
if (symlen != i_size_read(inode)) {
- pr_err("fill_inode %llx.%llx BAD symlink "
- "size %lld\n", ceph_vinop(inode),
+ pr_err("%s %llx.%llx BAD symlink "
+ "size %lld\n", __func__,
+ ceph_vinop(inode),
i_size_read(inode));
i_size_write(inode, symlen);
inode->i_blocks = calc_inode_blocks(symlen);
@@ -957,7 +1008,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
inode->i_fop = &ceph_dir_fops;
break;
default:
- pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
+ pr_err("%s %llx.%llx BAD mode 0%o\n", __func__,
ceph_vinop(inode), inode->i_mode);
}
@@ -966,7 +1017,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if (ceph_snap(inode) == CEPH_NOSNAP) {
ceph_add_cap(inode, session,
le64_to_cpu(info->cap.cap_id),
- cap_fmode, info_caps,
+ info_caps,
le32_to_cpu(info->cap.wanted),
le32_to_cpu(info->cap.seq),
le32_to_cpu(info->cap.mseq),
@@ -991,26 +1042,29 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
dout(" %p got snap_caps %s\n", inode,
ceph_cap_string(info_caps));
ci->i_snap_caps |= info_caps;
- if (cap_fmode >= 0)
- __ceph_get_fmode(ci, cap_fmode);
}
- } else if (cap_fmode >= 0) {
- pr_warn("mds issued no caps on %llx.%llx\n",
- ceph_vinop(inode));
- __ceph_get_fmode(ci, cap_fmode);
}
if (iinfo->inline_version > 0 &&
iinfo->inline_version >= ci->i_inline_version) {
int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
ci->i_inline_version = iinfo->inline_version;
- if (ci->i_inline_version != CEPH_INLINE_NONE &&
+ if (ceph_has_inline_data(ci) &&
(locked_page || (info_caps & cache_caps)))
fill_inline = true;
}
+ if (cap_fmode >= 0) {
+ if (!info_caps)
+ pr_warn("mds issued no caps on %llx.%llx\n",
+ ceph_vinop(inode));
+ __ceph_touch_fmode(ci, mdsc, cap_fmode);
+ }
+
spin_unlock(&ci->i_ceph_lock);
+ ceph_fscache_register_inode_cookie(inode);
+
if (fill_inline)
ceph_fill_inline_data(inode, locked_page,
iinfo->inline_data, iinfo->inline_len);
@@ -1050,6 +1104,7 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
struct ceph_mds_session **old_lease_session)
{
struct ceph_dentry_info *di = ceph_dentry(dentry);
+ unsigned mask = le16_to_cpu(lease->mask);
long unsigned duration = le32_to_cpu(lease->duration_ms);
long unsigned ttl = from_time + (duration * HZ) / 1000;
long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
@@ -1061,13 +1116,18 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
if (ceph_snap(dir) != CEPH_NOSNAP)
return;
+ if (mask & CEPH_LEASE_PRIMARY_LINK)
+ di->flags |= CEPH_DENTRY_PRIMARY_LINK;
+ else
+ di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
+
di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen);
- if (duration == 0) {
+ if (!(mask & CEPH_LEASE_VALID)) {
__ceph_dentry_dir_lease_touch(di);
return;
}
- if (di->lease_gen == session->s_cap_gen &&
+ if (di->lease_gen == atomic_read(&session->s_cap_gen) &&
time_before(ttl, di->time))
return; /* we already have a newer lease. */
@@ -1078,7 +1138,7 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
if (!di->lease_session)
di->lease_session = ceph_get_mds_session(session);
- di->lease_gen = session->s_cap_gen;
+ di->lease_gen = atomic_read(&session->s_cap_gen);
di->lease_seq = le32_to_cpu(lease->seq);
di->lease_renew_after = half_ttl;
di->lease_renew_from = 0;
@@ -1097,8 +1157,7 @@ static inline void update_dentry_lease(struct inode *dir, struct dentry *dentry,
__update_dentry_lease(dir, dentry, lease, session, from_time,
&old_lease_session);
spin_unlock(&dentry->d_lock);
- if (old_lease_session)
- ceph_put_mds_session(old_lease_session);
+ ceph_put_mds_session(old_lease_session);
}
/*
@@ -1143,13 +1202,12 @@ static void update_dentry_lease_careful(struct dentry *dentry,
from_time, &old_lease_session);
out_unlock:
spin_unlock(&dentry->d_lock);
- if (old_lease_session)
- ceph_put_mds_session(old_lease_session);
+ ceph_put_mds_session(old_lease_session);
}
/*
* splice a dentry to an inode.
- * caller must hold directory i_mutex for this to be safe.
+ * caller must hold directory i_rwsem for this to be safe.
*/
static int splice_dentry(struct dentry **pdn, struct inode *in)
{
@@ -1239,10 +1297,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
struct inode *dir = req->r_parent;
if (dir) {
- err = fill_inode(dir, NULL,
- &rinfo->diri, rinfo->dirfrag,
- session, -1,
- &req->r_caps_reservation);
+ err = ceph_fill_inode(dir, NULL, &rinfo->diri,
+ rinfo->dirfrag, session, -1,
+ &req->r_caps_reservation);
if (err < 0)
goto done;
} else {
@@ -1298,28 +1355,26 @@ retry_lookup:
}
if (rinfo->head->is_target) {
- tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
- tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+ /* Should be filled in by handle_reply */
+ BUG_ON(!req->r_target_inode);
- in = ceph_get_inode(sb, tvino);
- if (IS_ERR(in)) {
- err = PTR_ERR(in);
- goto done;
- }
-
- err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL,
- session,
+ in = req->r_target_inode;
+ err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti,
+ NULL, session,
(!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
+ !test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) &&
rinfo->head->result == 0) ? req->r_fmode : -1,
&req->r_caps_reservation);
if (err < 0) {
- pr_err("fill_inode badness %p %llx.%llx\n",
+ pr_err("ceph_fill_inode badness %p %llx.%llx\n",
in, ceph_vinop(in));
+ req->r_target_inode = NULL;
if (in->i_state & I_NEW)
discard_new_inode(in);
+ else
+ iput(in);
goto done;
}
- req->r_target_inode = in;
if (in->i_state & I_NEW)
unlock_new_inode(in);
}
@@ -1411,10 +1466,12 @@ retry_lookup:
} else if (have_lease) {
if (d_unhashed(dn))
d_add(dn, NULL);
+ }
+
+ if (!d_unhashed(dn) && have_lease)
update_dentry_lease(dir, dn,
rinfo->dlease, session,
req->r_request_started);
- }
goto done;
}
@@ -1500,10 +1557,11 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
dout("new_inode badness got %d\n", err);
continue;
}
- rc = fill_inode(in, NULL, &rde->inode, NULL, session,
- -1, &req->r_caps_reservation);
+ rc = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
+ -1, &req->r_caps_reservation);
if (rc < 0) {
- pr_err("fill_inode badness on %p got %d\n", in, rc);
+ pr_err("ceph_fill_inode badness on %p got %d\n",
+ in, rc);
err = rc;
if (in->i_state & I_NEW) {
ihold(in);
@@ -1513,8 +1571,7 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
unlock_new_inode(in);
}
- /* avoid calling iput_final() in mds dispatch threads */
- ceph_async_iput(in);
+ iput(in);
}
return err;
@@ -1549,7 +1606,7 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
return idx == 0 ? -ENOMEM : 0;
}
/* reading/filling the cache are serialized by
- * i_mutex, no need to use page lock */
+ * i_rwsem, no need to use page lock */
unlock_page(ctl->page);
ctl->dentries = kmap(ctl->page);
if (idx == 0)
@@ -1578,8 +1635,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct dentry *dn;
struct inode *in;
int err = 0, skipped = 0, ret, i;
- struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
- u32 frag = le32_to_cpu(rhead->args.readdir.frag);
+ u32 frag = le32_to_cpu(req->r_args.readdir.frag);
u32 last_hash = 0;
u32 fpos_offset;
struct ceph_readdir_cache_control cache_ctl = {};
@@ -1596,7 +1652,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
} else if (rinfo->offset_hash) {
/* mds understands offset_hash */
WARN_ON_ONCE(req->r_readdir_offset != 2);
- last_hash = le32_to_cpu(rhead->args.readdir.offset_hash);
+ last_hash = le32_to_cpu(req->r_args.readdir.offset_hash);
}
}
@@ -1707,18 +1763,16 @@ retry_lookup:
}
}
- ret = fill_inode(in, NULL, &rde->inode, NULL, session,
- -1, &req->r_caps_reservation);
+ ret = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
+ -1, &req->r_caps_reservation);
if (ret < 0) {
- pr_err("fill_inode badness on %p\n", in);
+ pr_err("ceph_fill_inode badness on %p\n", in);
if (d_really_is_negative(dn)) {
- /* avoid calling iput_final() in mds
- * dispatch threads */
if (in->i_state & I_NEW) {
ihold(in);
discard_new_inode(in);
}
- ceph_async_iput(in);
+ iput(in);
}
d_drop(dn);
err = ret;
@@ -1731,7 +1785,7 @@ retry_lookup:
if (ceph_security_xattr_deadlock(in)) {
dout(" skip splicing dn %p to inode %p"
" (security xattr deadlock)\n", dn, in);
- ceph_async_iput(in);
+ iput(in);
skipped++;
goto next_item;
}
@@ -1772,89 +1826,29 @@ bool ceph_inode_set_size(struct inode *inode, loff_t size)
bool ret;
spin_lock(&ci->i_ceph_lock);
- dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
+ dout("set_size %p %llu -> %llu\n", inode, i_size_read(inode), size);
i_size_write(inode, size);
+ ceph_fscache_update(inode);
inode->i_blocks = calc_inode_blocks(size);
ret = __ceph_should_report_size(ci);
spin_unlock(&ci->i_ceph_lock);
- return ret;
-}
-/*
- * Put reference to inode, but avoid calling iput_final() in current thread.
- * iput_final() may wait for readahead pages. The wait can cause deadlock in
- * some contexts.
- */
-void ceph_async_iput(struct inode *inode)
-{
- if (!inode)
- return;
- for (;;) {
- if (atomic_add_unless(&inode->i_count, -1, 1))
- break;
- if (queue_work(ceph_inode_to_client(inode)->inode_wq,
- &ceph_inode(inode)->i_work))
- break;
- /* queue work failed, i_count must be at least 2 */
- }
-}
-
-/*
- * Write back inode data in a worker thread. (This can't be done
- * in the message handler context.)
- */
-void ceph_queue_writeback(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- set_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask);
-
- ihold(inode);
- if (queue_work(ceph_inode_to_client(inode)->inode_wq,
- &ci->i_work)) {
- dout("ceph_queue_writeback %p\n", inode);
- } else {
- dout("ceph_queue_writeback %p already queued, mask=%lx\n",
- inode, ci->i_work_mask);
- iput(inode);
- }
-}
-
-/*
- * queue an async invalidation
- */
-void ceph_queue_invalidate(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- set_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask);
-
- ihold(inode);
- if (queue_work(ceph_inode_to_client(inode)->inode_wq,
- &ceph_inode(inode)->i_work)) {
- dout("ceph_queue_invalidate %p\n", inode);
- } else {
- dout("ceph_queue_invalidate %p already queued, mask=%lx\n",
- inode, ci->i_work_mask);
- iput(inode);
- }
+ return ret;
}
-/*
- * Queue an async vmtruncate. If we fail to queue work, we will handle
- * the truncation the next time we call __ceph_do_pending_vmtruncate.
- */
-void ceph_queue_vmtruncate(struct inode *inode)
+void ceph_queue_inode_work(struct inode *inode, int work_bit)
{
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
- set_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask);
+ set_bit(work_bit, &ci->i_work_mask);
ihold(inode);
- if (queue_work(ceph_inode_to_client(inode)->inode_wq,
- &ci->i_work)) {
- dout("ceph_queue_vmtruncate %p\n", inode);
+ if (queue_work(fsc->inode_wq, &ci->i_work)) {
+ dout("queue_inode_work %p, mask=%lx\n", inode, ci->i_work_mask);
} else {
- dout("ceph_queue_vmtruncate %p already queued, mask=%lx\n",
+ dout("queue_inode_work %p already queued, mask=%lx\n",
inode, ci->i_work_mask);
iput(inode);
}
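The three bespoke queueing helpers deleted above collapse into ceph_queue_inode_work(), which takes the work bit as an argument. Presumably the old entry points survive as thin wrappers along these lines (a sketch; the real patch would define them elsewhere, e.g. as inlines in super.h):

	static inline void ceph_queue_writeback(struct inode *inode)
	{
		ceph_queue_inode_work(inode, CEPH_I_WORK_WRITEBACK);
	}

	static inline void ceph_queue_invalidate(struct inode *inode)
	{
		ceph_queue_inode_work(inode, CEPH_I_WORK_INVALIDATE_PAGES);
	}

	static inline void ceph_queue_vmtruncate(struct inode *inode)
	{
		ceph_queue_inode_work(inode, CEPH_I_WORK_VMTRUNCATE);
	}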
@@ -1863,15 +1857,16 @@ void ceph_queue_vmtruncate(struct inode *inode)
static void ceph_do_invalidate_pages(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
u32 orig_gen;
int check = 0;
+ ceph_fscache_invalidate(inode, false);
+
mutex_lock(&ci->i_truncate_mutex);
- if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
- pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n",
- inode, ceph_ino(inode));
+ if (ceph_inode_is_shutdown(inode)) {
+ pr_warn_ratelimited("%s: inode %llx.%llx is shut down\n",
+ __func__, ceph_vinop(inode));
mapping_set_error(inode->i_mapping, -EIO);
truncate_pagecache(inode, 0);
mutex_unlock(&ci->i_truncate_mutex);
@@ -1892,7 +1887,8 @@ static void ceph_do_invalidate_pages(struct inode *inode)
spin_unlock(&ci->i_ceph_lock);
if (invalidate_inode_pages2(inode->i_mapping) < 0) {
- pr_err("invalidate_pages %p fails\n", inode);
+ pr_err("invalidate_inode_pages2 %llx.%llx failed\n",
+ ceph_vinop(inode));
}
spin_lock(&ci->i_ceph_lock);
@@ -1958,6 +1954,7 @@ retry:
ci->i_truncate_pending, to);
spin_unlock(&ci->i_ceph_lock);
+ ceph_fscache_resize(inode, to);
truncate_pagecache(inode, to);
spin_lock(&ci->i_ceph_lock);
@@ -1972,7 +1969,7 @@ retry:
mutex_unlock(&ci->i_truncate_mutex);
if (wrbuffer_refs == 0)
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+ ceph_check_caps(ci, 0, NULL);
wake_up_all(&ci->i_cap_wq);
}
@@ -1981,7 +1978,7 @@ static void ceph_inode_work(struct work_struct *work)
{
struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
i_work);
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) {
dout("writeback %p\n", inode);
@@ -1993,6 +1990,12 @@ static void ceph_inode_work(struct work_struct *work)
if (test_and_clear_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask))
__ceph_do_pending_vmtruncate(inode);
+ if (test_and_clear_bit(CEPH_I_WORK_CHECK_CAPS, &ci->i_work_mask))
+ ceph_check_caps(ci, 0, NULL);
+
+ if (test_and_clear_bit(CEPH_I_WORK_FLUSH_SNAPS, &ci->i_work_mask))
+ ceph_flush_snaps(ci, NULL);
+
iput(inode);
}
@@ -2115,20 +2118,21 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
}
}
if (ia_valid & ATTR_SIZE) {
- dout("setattr %p size %lld -> %lld\n", inode,
- inode->i_size, attr->ia_size);
- if ((issued & CEPH_CAP_FILE_EXCL) &&
- attr->ia_size > inode->i_size) {
- i_size_write(inode, attr->ia_size);
- inode->i_blocks = calc_inode_blocks(attr->ia_size);
- ci->i_reported_size = attr->ia_size;
- dirtied |= CEPH_CAP_FILE_EXCL;
- ia_valid |= ATTR_MTIME;
+ loff_t isize = i_size_read(inode);
+
+ dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size);
+ if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) {
+ if (attr->ia_size > isize) {
+ i_size_write(inode, attr->ia_size);
+ inode->i_blocks = calc_inode_blocks(attr->ia_size);
+ ci->i_reported_size = attr->ia_size;
+ dirtied |= CEPH_CAP_FILE_EXCL;
+ ia_valid |= ATTR_MTIME;
+ }
} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
- attr->ia_size != inode->i_size) {
+ attr->ia_size != isize) {
req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
- req->r_args.setattr.old_size =
- cpu_to_le64(inode->i_size);
+ req->r_args.setattr.old_size = cpu_to_le64(isize);
mask |= CEPH_SETATTR_SIZE;
release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
@@ -2188,6 +2192,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
&prealloc_cf);
inode->i_ctime = attr->ia_ctime;
+ inode_inc_iversion_raw(inode);
}
release &= issued;
@@ -2198,7 +2203,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
if (inode_dirty_flags)
__mark_inode_dirty(inode, inode_dirty_flags);
-
if (mask) {
req->r_inode = inode;
ihold(inode);
@@ -2223,7 +2227,8 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
/*
* setattr
*/
-int ceph_setattr(struct dentry *dentry, struct iattr *attr)
+int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
@@ -2232,12 +2237,15 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
- err = setattr_prepare(dentry, attr);
+ if (ceph_inode_is_shutdown(inode))
+ return -ESTALE;
+
+ err = setattr_prepare(&init_user_ns, dentry, attr);
if (err != 0)
return err;
if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size > max(inode->i_size, fsc->max_file_size))
+ attr->ia_size > max(i_size_read(inode), fsc->max_file_size))
return -EFBIG;
if ((attr->ia_valid & ATTR_SIZE) &&
@@ -2247,11 +2255,41 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
err = __ceph_setattr(inode, attr);
if (err >= 0 && (attr->ia_valid & ATTR_MODE))
- err = posix_acl_chmod(inode, attr->ia_mode);
+ err = posix_acl_chmod(&init_user_ns, inode, attr->ia_mode);
return err;
}
+int ceph_try_to_choose_auth_mds(struct inode *inode, int mask)
+{
+ int issued = ceph_caps_issued(ceph_inode(inode));
+
+ /*
+ * If any 'x' caps are issued we can just choose the auth MDS
+ * instead of a random replica MDS. Only when the Locker is in
+ * the LOCK_EXEC state can the loner client get the 'x' caps,
+ * and if we send a getattr request to any replica MDS it must
+ * auth pin and try to rdlock from the auth MDS, which then has
+ * to transition its Locker state to LOCK_SYNC, and change it
+ * back again afterwards.
+ *
+ * These Locker state transitions are expensive and usually
+ * require revoking caps from clients.
+ *
+ * For the 'Xs' caps needed by getxattr we also choose the auth
+ * MDS: the MDS side code is buggy in that setxattr won't notify
+ * the replica MDSes when values change, so a replica MDS would
+ * return stale values. Though this will be fixed in the MDS
+ * code, choosing the auth MDS still makes sense for old ceph.
+ */
+ if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL))
+ || (mask & (CEPH_STAT_RSTAT | CEPH_STAT_CAP_XATTR)))
+ return USE_AUTH_MDS;
+ else
+ return USE_ANY_MDS;
+}
+
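The selection rule boils down to: go to the auth MDS whenever a replica could serve stale shared state, i.e. when a shared cap is requested while any exclusive cap is issued, or when rstat/xattr data is wanted. A toy restatement with hypothetical bit values (not the real CEPH_CAP_*/CEPH_STAT_* constants):

	#define ANY_SHARED 0x1
	#define ANY_EXCL   0x2
	#define STAT_RSTAT 0x4
	#define STAT_XATTR 0x8

	static int choose_mds(int mask, int issued)
	{
		if (((mask & ANY_SHARED) && (issued & ANY_EXCL)) ||
		    (mask & (STAT_RSTAT | STAT_XATTR)))
			return 1;	/* USE_AUTH_MDS */
		return 0;		/* USE_ANY_MDS */
	}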
/*
* Verify that we have a lease on the given mask. If not,
* do a getattr against an mds.
@@ -2272,10 +2310,10 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
dout("do_getattr inode %p mask %s mode 0%o\n",
inode, ceph_cap_string(mask), inode->i_mode);
- if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
- return 0;
+ if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1))
+ return 0;
- mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
+ mode = ceph_try_to_choose_auth_mds(inode, mask);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -2290,7 +2328,8 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
if (inline_version == 0) {
/* the reply is supposed to contain inline data */
err = -EINVAL;
- } else if (inline_version == CEPH_INLINE_NONE) {
+ } else if (inline_version == CEPH_INLINE_NONE ||
+ inline_version == 1) {
err = -ENODATA;
} else {
err = req->r_reply_info.targeti.inline_len;
@@ -2301,12 +2340,65 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
return err;
}
+int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
+ size_t size)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_request *req;
+ int mode = USE_AUTH_MDS;
+ int err;
+ char *xattr_value;
+ size_t xattr_value_len;
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode);
+ if (IS_ERR(req)) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ req->r_feature_needed = CEPHFS_FEATURE_OP_GETVXATTR;
+ req->r_path2 = kstrdup(name, GFP_NOFS);
+ if (!req->r_path2) {
+ err = -ENOMEM;
+ goto put;
+ }
+
+ ihold(inode);
+ req->r_inode = inode;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err < 0)
+ goto put;
+
+ xattr_value = req->r_reply_info.xattr_info.xattr_value;
+ xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
+
+ dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
+
+ err = (int)xattr_value_len;
+ if (size == 0)
+ goto put;
+
+ if (xattr_value_len > size) {
+ err = -ERANGE;
+ goto put;
+ }
+
+ memcpy(value, xattr_value, xattr_value_len);
+put:
+ ceph_mdsc_put_request(req);
+out:
+ dout("do_getvxattr result=%d\n", err);
+ return err;
+}
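The return convention above follows the usual xattr sizing protocol: a zero-size call returns the value length, and a too-small buffer yields -ERANGE. A hedged caller sketch (the helper name is hypothetical):

/* Hypothetical: probe the length, then fetch the value. */
static ssize_t example_fetch_vxattr(struct inode *inode, const char *name,
				    char **out)
{
	ssize_t len = ceph_do_getvxattr(inode, name, NULL, 0);
	char *buf;

	if (len < 0)
		return len;
	buf = kmalloc(len, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;
	/* value may have changed in between; -ERANGE is still possible */
	len = ceph_do_getvxattr(inode, name, buf, len);
	if (len < 0) {
		kfree(buf);
		return len;
	}
	*out = buf;
	return len;
}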
+
/*
* Check inode permissions. We verify we have a valid value for
* the AUTH cap, then call the generic handler.
*/
-int ceph_permission(struct inode *inode, int mask)
+int ceph_permission(struct user_namespace *mnt_userns, struct inode *inode,
+ int mask)
{
int err;
@@ -2316,20 +2408,28 @@ int ceph_permission(struct inode *inode, int mask)
err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false);
if (!err)
- err = generic_permission(inode, mask);
+ err = generic_permission(&init_user_ns, inode, mask);
return err;
}
/* Craft a mask of needed caps given a set of requested statx attrs. */
-static int statx_to_caps(u32 want)
+static int statx_to_caps(u32 want, umode_t mode)
{
int mask = 0;
if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME))
mask |= CEPH_CAP_AUTH_SHARED;
- if (want & (STATX_NLINK|STATX_CTIME))
- mask |= CEPH_CAP_LINK_SHARED;
+ if (want & (STATX_NLINK|STATX_CTIME)) {
+ /*
+ * The link count for directories depends on inode->i_subdirs,
+ * and that is only updated when Fs caps are held.
+ */
+ if (S_ISDIR(mode))
+ mask |= CEPH_CAP_FILE_SHARED;
+ else
+ mask |= CEPH_CAP_LINK_SHARED;
+ }
if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE|
STATX_BLOCKS))
@@ -2345,24 +2445,29 @@ static int statx_to_caps(u32 want)
* Get all the attributes. If we have sufficient caps for the requested attrs,
* then we can avoid talking to the MDS at all.
*/
-int ceph_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int flags)
+int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path,
+ struct kstat *stat, u32 request_mask, unsigned int flags)
{
struct inode *inode = d_inode(path->dentry);
+ struct super_block *sb = inode->i_sb;
struct ceph_inode_info *ci = ceph_inode(inode);
u32 valid_mask = STATX_BASIC_STATS;
int err = 0;
+ if (ceph_inode_is_shutdown(inode))
+ return -ESTALE;
+
/* Skip the getattr altogether if we're asked not to sync */
- if (!(flags & AT_STATX_DONT_SYNC)) {
- err = ceph_do_getattr(inode, statx_to_caps(request_mask),
- flags & AT_STATX_FORCE_SYNC);
+ if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) {
+ err = ceph_do_getattr(inode,
+ statx_to_caps(request_mask, inode->i_mode),
+ flags & AT_STATX_FORCE_SYNC);
if (err)
return err;
}
- generic_fillattr(inode, stat);
- stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
+ generic_fillattr(&init_user_ns, inode, stat);
+ stat->ino = ceph_present_inode(inode);
/*
* btime on newly-allocated inodes is 0, so if this is still set to
@@ -2374,16 +2479,34 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
}
if (ceph_snap(inode) == CEPH_NOSNAP)
- stat->dev = inode->i_sb->s_dev;
+ stat->dev = sb->s_dev;
else
stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
if (S_ISDIR(inode->i_mode)) {
- if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
- RBYTES))
+ if (ceph_test_mount_opt(ceph_sb_to_client(sb), RBYTES)) {
stat->size = ci->i_rbytes;
- else
+ } else if (ceph_snap(inode) == CEPH_SNAPDIR) {
+ struct ceph_inode_info *pci;
+ struct ceph_snap_realm *realm;
+ struct inode *parent;
+
+ parent = ceph_lookup_inode(sb, ceph_ino(inode));
+ if (!parent)
+ return PTR_ERR(parent);
+
+ pci = ceph_inode(parent);
+ spin_lock(&pci->i_ceph_lock);
+ realm = pci->i_snap_realm;
+ if (realm)
+ stat->size = realm->num_snaps;
+ else
+ stat->size = 0;
+ spin_unlock(&pci->i_ceph_lock);
+ iput(parent);
+ } else {
stat->size = ci->i_files + ci->i_subdirs;
+ }
stat->blocks = 0;
stat->blksize = 65536;
/*
@@ -2399,3 +2522,27 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
stat->result_mask = request_mask & valid_mask;
return err;
}
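With the SNAPDIR branch above, a stat on a snapshot directory reports the realm's snapshot count as its size (unless the rbytes mount option is set). A small userspace check, assuming a CephFS mount at /mnt/cephfs with the default ".snap" snapdir name:

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	struct stat st;

	if (stat("/mnt/cephfs/dir/.snap", &st) == 0)
		printf("snapshots in realm: %lld\n", (long long)st.st_size);
	return 0;
}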
+
+void ceph_inode_shutdown(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct rb_node *p;
+ int iputs = 0;
+ bool invalidate = false;
+
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_ceph_flags |= CEPH_I_SHUTDOWN;
+ p = rb_first(&ci->i_caps);
+ while (p) {
+ struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
+
+ p = rb_next(p);
+ iputs += ceph_purge_inode_cap(inode, cap, &invalidate);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (invalidate)
+ ceph_queue_invalidate(inode);
+ while (iputs--)
+ iput(inode);
+}
diff --git a/fs/ceph/io.c b/fs/ceph/io.c
index 97602ea92ff4..c456509b31c3 100644
--- a/fs/ceph/io.c
+++ b/fs/ceph/io.c
@@ -118,7 +118,7 @@ static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode)
}
/**
- * ceph_end_io_direct - declare the file is being used for direct i/o
+ * ceph_start_io_direct - declare the file is being used for direct i/o
* @inode: file inode
*
* Declare that a direct I/O operation is about to start, and ensure
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index c90f03beb15d..6e061bf62ad4 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -243,11 +243,13 @@ static long ceph_ioctl_lazyio(struct file *file)
struct ceph_file_info *fi = file->private_data;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
spin_lock(&ci->i_ceph_lock);
fi->fmode |= CEPH_FILE_MODE_LAZY;
ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++;
+ __ceph_touch_fmode(ci, mdsc, fi->fmode);
spin_unlock(&ci->i_ceph_lock);
dout("ioctl_layzio: file %p marked lazy\n", file);
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 544e9e85b120..3e2843e86e27 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -57,13 +57,13 @@ static const struct file_lock_operations ceph_fl_lock_ops = {
.fl_release_private = ceph_fl_release_lock,
};
-/**
+/*
* Implement fcntl and flock locking functions.
*/
static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
int cmd, u8 wait, struct file_lock *fl)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_mds_request *req;
int err;
u64 length = 0;
@@ -111,10 +111,10 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
req->r_args.filelock_change.length = cpu_to_le64(length);
req->r_args.filelock_change.wait = wait;
- if (wait)
- req->r_wait_for_completion = ceph_lock_wait_for_completion;
-
- err = ceph_mdsc_do_request(mdsc, inode, req);
+ err = ceph_mdsc_submit_request(mdsc, inode, req);
+ if (!err)
+ err = ceph_mdsc_wait_request(mdsc, req, wait ?
+ ceph_lock_wait_for_completion : NULL);
if (!err && operation == CEPH_MDS_OP_GETFILELOCK) {
fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
@@ -210,7 +210,22 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
return 0;
}
-/**
+static int try_unlock_file(struct file *file, struct file_lock *fl)
+{
+ int err;
+ unsigned int orig_flags = fl->fl_flags;
+ fl->fl_flags |= FL_EXISTS;
+ err = locks_lock_file_wait(file, fl);
+ fl->fl_flags = orig_flags;
+ if (err == -ENOENT) {
+ if (!(orig_flags & FL_EXISTS))
+ err = 0;
+ return err;
+ }
+ return 1;
+}
+
+/*
* Attempt to set an fcntl lock.
* For now, this just goes away to the server. Later it may be more awesome.
*/
@@ -225,9 +240,9 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_POSIX))
return -ENOLCK;
- /* No mandatory locks */
- if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
- return -ENOLCK;
+
+ if (ceph_inode_is_shutdown(inode))
+ return -ESTALE;
dout("ceph_lock, fl_owner: %p\n", fl->fl_owner);
@@ -255,9 +270,15 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
else
lock_cmd = CEPH_LOCK_UNLOCK;
+ if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type) {
+ err = try_unlock_file(file, fl);
+ if (err <= 0)
+ return err;
+ }
+
err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
if (!err) {
- if (op == CEPH_MDS_OP_SETFILELOCK) {
+ if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->fl_type) {
dout("mds locked, locking locally\n");
err = posix_lock_file(file, fl, NULL);
if (err) {
@@ -284,9 +305,9 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
- /* No mandatory locks */
- if (fl->fl_type & LOCK_MAND)
- return -EOPNOTSUPP;
+
+ if (ceph_inode_is_shutdown(inode))
+ return -ESTALE;
dout("ceph_flock, fl_file: %p\n", fl->fl_file);
@@ -311,9 +332,15 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
else
lock_cmd = CEPH_LOCK_UNLOCK;
+ if (F_UNLCK == fl->fl_type) {
+ err = try_unlock_file(file, fl);
+ if (err <= 0)
+ return err;
+ }
+
err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
inode, lock_cmd, wait, fl);
- if (!err) {
+ if (!err && F_UNLCK != fl->fl_type) {
err = locks_lock_file_wait(file, fl);
if (err) {
ceph_lock_message(CEPH_LOCK_FLOCK,
@@ -381,7 +408,7 @@ static int lock_to_ceph_filelock(struct file_lock *lock,
return err;
}
-/**
+/*
* Encode the flock and fcntl locks for the given inode into the ceph_filelock
* array. Must be called with inode->i_lock already held.
* If we encounter more of a specific lock type than expected, return -ENOSPC.
@@ -431,7 +458,7 @@ fail:
return err;
}
-/**
+/*
* Copy the encoded flock and fcntl locks into the pagelist.
* Format is: #fcntl locks, sequential fcntl locks, #flock locks,
* sequential flock locks.
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index bbbbddf71326..26a0a8b9975e 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -10,6 +10,8 @@
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
#include <linux/bits.h>
+#include <linux/ktime.h>
+#include <linux/bitmap.h>
#include "super.h"
#include "mds_client.h"
@@ -175,6 +177,13 @@ static int parse_reply_info_in(void **p, void *end,
memset(&info->snap_btime, 0, sizeof(info->snap_btime));
}
+ /* snapshot count, remains zero for v<=3 */
+ if (struct_v >= 4) {
+ ceph_decode_64_safe(p, end, info->rsnaps, bad);
+ } else {
+ info->rsnaps = 0;
+ }
+
*p = end;
} else {
if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
@@ -213,7 +222,7 @@ static int parse_reply_info_in(void **p, void *end,
}
info->dir_pin = -ENODATA;
- /* info->snap_btime remains zero */
+ /* info->snap_btime and info->rsnaps remain zero */
}
return 0;
bad:
@@ -415,21 +424,124 @@ bad:
return -EIO;
}
+
+#if BITS_PER_LONG == 64
+
+#define DELEGATED_INO_AVAILABLE xa_mk_value(1)
+
+static int ceph_parse_deleg_inos(void **p, void *end,
+ struct ceph_mds_session *s)
+{
+ u32 sets;
+
+ ceph_decode_32_safe(p, end, sets, bad);
+ dout("got %u sets of delegated inodes\n", sets);
+ while (sets--) {
+ u64 start, len;
+
+ ceph_decode_64_safe(p, end, start, bad);
+ ceph_decode_64_safe(p, end, len, bad);
+
+ /* Don't accept a delegation of system inodes */
+ if (start < CEPH_INO_SYSTEM_BASE) {
+ pr_warn_ratelimited("ceph: ignoring reserved inode range delegation (start=0x%llx len=0x%llx)\n",
+ start, len);
+ continue;
+ }
+ while (len--) {
+ int err = xa_insert(&s->s_delegated_inos, start++,
+ DELEGATED_INO_AVAILABLE,
+ GFP_KERNEL);
+ if (!err) {
+ dout("added delegated inode 0x%llx\n",
+ start - 1);
+ } else if (err == -EBUSY) {
+ pr_warn("MDS delegated inode 0x%llx more than once.\n",
+ start - 1);
+ } else {
+ return err;
+ }
+ }
+ }
+ return 0;
+bad:
+ return -EIO;
+}
+
+u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
+{
+ unsigned long ino;
+ void *val;
+
+ xa_for_each(&s->s_delegated_inos, ino, val) {
+ val = xa_erase(&s->s_delegated_inos, ino);
+ if (val == DELEGATED_INO_AVAILABLE)
+ return ino;
+ }
+ return 0;
+}
+
+int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
+{
+ return xa_insert(&s->s_delegated_inos, ino, DELEGATED_INO_AVAILABLE,
+ GFP_KERNEL);
+}
+#else /* BITS_PER_LONG == 64 */
+/*
+ * FIXME: xarrays can't handle 64-bit indexes on a 32-bit arch. For now, just
+ * ignore delegated_inos on 32 bit arch. Maybe eventually add xarrays for top
+ * and bottom words?
+ */
+static int ceph_parse_deleg_inos(void **p, void *end,
+ struct ceph_mds_session *s)
+{
+ u32 sets;
+
+ ceph_decode_32_safe(p, end, sets, bad);
+ if (sets)
+ ceph_decode_skip_n(p, end, sets * 2 * sizeof(__le64), bad);
+ return 0;
+bad:
+ return -EIO;
+}
+
+u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
+{
+ return 0;
+}
+
+int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
+{
+ return 0;
+}
+#endif /* BITS_PER_LONG == 64 */
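For context, this pool is what async creates are expected to draw from; a hedged sketch of the consume/restore pattern (the surrounding function is hypothetical, r_deleg_ino is the real request field used later in this patch):

/* Hypothetical async-create path using the delegated-ino pool. */
static int example_async_create(struct ceph_mds_session *session,
				struct ceph_mds_request *req)
{
	u64 ino = ceph_get_deleg_ino(session);	/* 0 if none delegated */
	int err;

	if (!ino)
		return -EJUKEBOX;	/* fall back to a sync create */
	req->r_deleg_ino = ino;
	err = 0;			/* ... submit the async request ... */
	if (err)
		ceph_restore_deleg_ino(session, ino);
	return err;
}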
+
/*
* parse create results
*/
static int parse_reply_info_create(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
- u64 features)
+ u64 features, struct ceph_mds_session *s)
{
+ int ret;
+
if (features == (u64)-1 ||
(features & CEPH_FEATURE_REPLY_CREATE_INODE)) {
- /* Malformed reply? */
if (*p == end) {
+ /* Malformed reply? */
info->has_create_ino = false;
- } else {
+ } else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) {
info->has_create_ino = true;
+ /* struct_v, struct_compat, and len */
+ ceph_decode_skip_n(p, end, 2 + sizeof(u32), bad);
ceph_decode_64_safe(p, end, info->ino, bad);
+ ret = ceph_parse_deleg_inos(p, end, s);
+ if (ret)
+ return ret;
+ } else {
+ /* legacy */
+ ceph_decode_64_safe(p, end, info->ino, bad);
+ info->has_create_ino = true;
}
} else {
if (*p != end)
@@ -443,12 +555,34 @@ bad:
return -EIO;
}
+static int parse_reply_info_getvxattr(void **p, void *end,
+ struct ceph_mds_reply_info_parsed *info,
+ u64 features)
+{
+ u32 value_len;
+
+ ceph_decode_skip_8(p, end, bad); /* skip current version: 1 */
+ ceph_decode_skip_8(p, end, bad); /* skip first version: 1 */
+ ceph_decode_skip_32(p, end, bad); /* skip payload length */
+
+ ceph_decode_32_safe(p, end, value_len, bad);
+
+ if (value_len == end - *p) {
+ info->xattr_info.xattr_value = *p;
+ info->xattr_info.xattr_value_len = value_len;
+ *p = end;
+ return value_len;
+ }
+bad:
+ return -EIO;
+}
+
/*
* parse extra results
*/
static int parse_reply_info_extra(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
- u64 features)
+ u64 features, struct ceph_mds_session *s)
{
u32 op = le32_to_cpu(info->head->op);
@@ -457,7 +591,9 @@ static int parse_reply_info_extra(void **p, void *end,
else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
return parse_reply_info_readdir(p, end, info, features);
else if (op == CEPH_MDS_OP_CREATE)
- return parse_reply_info_create(p, end, info, features);
+ return parse_reply_info_create(p, end, info, features, s);
+ else if (op == CEPH_MDS_OP_GETVXATTR)
+ return parse_reply_info_getvxattr(p, end, info, features);
else
return -EIO;
}
@@ -465,7 +601,7 @@ static int parse_reply_info_extra(void **p, void *end,
/*
* parse entire mds reply
*/
-static int parse_reply_info(struct ceph_msg *msg,
+static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
struct ceph_mds_reply_info_parsed *info,
u64 features)
{
@@ -490,7 +626,7 @@ static int parse_reply_info(struct ceph_msg *msg,
ceph_decode_32_safe(&p, end, len, bad);
if (len > 0) {
ceph_decode_need(&p, end, len, bad);
- err = parse_reply_info_extra(&p, p+len, info, features);
+ err = parse_reply_info_extra(&p, p+len, info, features, s);
if (err < 0)
goto out_bad;
}
@@ -519,6 +655,79 @@ static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
}
+/*
+ * In the async unlink case the kclient won't wait for the first reply
+ * from the MDS; it drops all the links, unhashes the dentry and then
+ * succeeds immediately.
+ *
+ * For any new create/link/rename, etc. requests that reuse the same
+ * file names, we must wait for the first reply of the inflight unlink
+ * request, or the MDS may fail these following requests with -EEXIST
+ * if the inflight async unlink request was delayed for some reason.
+ *
+ * The worst case is that a non-async openc request will successfully
+ * open the file if the CDentry hasn't been unlinked yet, but the
+ * previously delayed async unlink request will later remove the
+ * CDentry. That means the just-created file could be deleted later
+ * by accident.
+ *
+ * We need to wait for the inflight async unlink requests to finish
+ * when creating new files/directories with the same file names.
+ */
+int ceph_wait_on_conflict_unlink(struct dentry *dentry)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+ struct dentry *pdentry = dentry->d_parent;
+ struct dentry *udentry, *found = NULL;
+ struct ceph_dentry_info *di;
+ struct qstr dname;
+ u32 hash = dentry->d_name.hash;
+ int err;
+
+ dname.name = dentry->d_name.name;
+ dname.len = dentry->d_name.len;
+
+ rcu_read_lock();
+ hash_for_each_possible_rcu(fsc->async_unlink_conflict, di,
+ hnode, hash) {
+ udentry = di->dentry;
+
+ spin_lock(&udentry->d_lock);
+ if (udentry->d_name.hash != hash)
+ goto next;
+ if (unlikely(udentry->d_parent != pdentry))
+ goto next;
+ if (!hash_hashed(&di->hnode))
+ goto next;
+
+ if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
+ pr_warn("%s dentry %p:%pd async unlink bit is not set\n",
+ __func__, dentry, dentry);
+
+ if (!d_same_name(udentry, pdentry, &dname))
+ goto next;
+
+ spin_unlock(&udentry->d_lock);
+ found = dget(udentry);
+ break;
+next:
+ spin_unlock(&udentry->d_lock);
+ }
+ rcu_read_unlock();
+
+ if (likely(!found))
+ return 0;
+
+ dout("%s dentry %p:%pd conflict with old %p:%pd\n", __func__,
+ dentry, dentry, found, found);
+
+ err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT,
+ TASK_KILLABLE);
+ dput(found);
+ return err;
+}
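Create-side paths are expected to gate on this helper before submitting; a hedged sketch (the surrounding function is hypothetical):

/* Hypothetical create-path usage. */
static int example_create(struct inode *dir, struct dentry *dentry)
{
	int err = ceph_wait_on_conflict_unlink(dentry);

	if (err)
		return err;	/* killed while waiting */
	/* safe to issue the create/link/rename request now */
	return 0;
}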
+
/*
* sessions
@@ -541,23 +750,21 @@ const char *ceph_session_state_name(int s)
struct ceph_mds_session *ceph_get_mds_session(struct ceph_mds_session *s)
{
- if (refcount_inc_not_zero(&s->s_ref)) {
- dout("mdsc get_session %p %d -> %d\n", s,
- refcount_read(&s->s_ref)-1, refcount_read(&s->s_ref));
+ if (refcount_inc_not_zero(&s->s_ref))
return s;
- } else {
- dout("mdsc get_session %p 0 -- FAIL\n", s);
- return NULL;
- }
+ return NULL;
}
void ceph_put_mds_session(struct ceph_mds_session *s)
{
- dout("mdsc put_session %p %d -> %d\n", s,
- refcount_read(&s->s_ref), refcount_read(&s->s_ref)-1);
+ if (IS_ERR_OR_NULL(s))
+ return;
+
if (refcount_dec_and_test(&s->s_ref)) {
if (s->s_auth.authorizer)
ceph_auth_destroy_authorizer(s->s_auth.authorizer);
+ WARN_ON(mutex_is_locked(&s->s_mutex));
+ xa_destroy(&s->s_delegated_inos);
kfree(s);
}
}
@@ -627,30 +834,23 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
s->s_mdsc = mdsc;
s->s_mds = mds;
s->s_state = CEPH_MDS_SESSION_NEW;
- s->s_ttl = 0;
- s->s_seq = 0;
mutex_init(&s->s_mutex);
ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr);
- spin_lock_init(&s->s_gen_ttl_lock);
- s->s_cap_gen = 1;
+ atomic_set(&s->s_cap_gen, 1);
s->s_cap_ttl = jiffies - 1;
spin_lock_init(&s->s_cap_lock);
- s->s_renew_requested = 0;
- s->s_renew_seq = 0;
INIT_LIST_HEAD(&s->s_caps);
- s->s_nr_caps = 0;
refcount_set(&s->s_ref, 1);
INIT_LIST_HEAD(&s->s_waiting);
INIT_LIST_HEAD(&s->s_unsafe);
- s->s_num_cap_releases = 0;
- s->s_cap_reconnect = 0;
- s->s_cap_iterator = NULL;
+ xa_init(&s->s_delegated_inos);
INIT_LIST_HEAD(&s->s_cap_releases);
INIT_WORK(&s->s_cap_release_work, ceph_cap_release_work);
+ INIT_LIST_HEAD(&s->s_cap_dirty);
INIT_LIST_HEAD(&s->s_cap_flushing);
mdsc->sessions[mds] = s;
@@ -694,11 +894,39 @@ static void put_request_session(struct ceph_mds_request *req)
}
}
+void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
+ void (*cb)(struct ceph_mds_session *),
+ bool check_state)
+{
+ int mds;
+
+ mutex_lock(&mdsc->mutex);
+ for (mds = 0; mds < mdsc->max_sessions; ++mds) {
+ struct ceph_mds_session *s;
+
+ s = __ceph_lookup_mds_session(mdsc, mds);
+ if (!s)
+ continue;
+
+ if (check_state && !check_session_state(s)) {
+ ceph_put_mds_session(s);
+ continue;
+ }
+
+ mutex_unlock(&mdsc->mutex);
+ cb(s);
+ ceph_put_mds_session(s);
+ mutex_lock(&mdsc->mutex);
+ }
+ mutex_unlock(&mdsc->mutex);
+}
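A hedged usage sketch; the callback name is hypothetical, and check_state=true skips sessions that fail check_session_state():

/* Hypothetical per-session callback. */
static void example_session_cb(struct ceph_mds_session *s)
{
	dout("visiting mds%d state %s\n", s->s_mds,
	     ceph_session_state_name(s->s_state));
}

/* Visit every healthy session; lookup and refcounting are handled. */
ceph_mdsc_iterate_sessions(mdsc, example_session_cb, true);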
+
void ceph_mdsc_release_request(struct kref *kref)
{
struct ceph_mds_request *req = container_of(kref,
struct ceph_mds_request,
r_kref);
+ ceph_mdsc_release_dir_caps_no_check(req);
destroy_reply_info(&req->r_reply_info);
if (req->r_request)
ceph_msg_put(req->r_request);
@@ -706,14 +934,13 @@ void ceph_mdsc_release_request(struct kref *kref)
ceph_msg_put(req->r_reply);
if (req->r_inode) {
ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
- /* avoid calling iput_final() in mds dispatch threads */
- ceph_async_iput(req->r_inode);
+ iput(req->r_inode);
}
if (req->r_parent) {
ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
- ceph_async_iput(req->r_parent);
+ iput(req->r_parent);
}
- ceph_async_iput(req->r_target_inode);
+ iput(req->r_target_inode);
if (req->r_dentry)
dput(req->r_dentry);
if (req->r_old_dentry)
@@ -727,16 +954,17 @@ void ceph_mdsc_release_request(struct kref *kref)
*/
ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN);
- ceph_async_iput(req->r_old_dentry_dir);
+ iput(req->r_old_dentry_dir);
}
kfree(req->r_path1);
kfree(req->r_path2);
+ put_cred(req->r_cred);
if (req->r_pagelist)
ceph_pagelist_release(req->r_pagelist);
put_request_session(req);
ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
WARN_ON_ONCE(!list_empty(&req->r_wait));
- kfree(req);
+ kmem_cache_free(ceph_mds_request_cachep, req);
}
DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)
@@ -786,15 +1014,19 @@ static void __register_request(struct ceph_mds_client *mdsc,
ceph_mdsc_get_request(req);
insert_request(&mdsc->request_tree, req);
- req->r_uid = current_fsuid();
- req->r_gid = current_fsgid();
+ req->r_cred = get_current_cred();
if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK)
mdsc->oldest_tid = req->r_tid;
if (dir) {
+ struct ceph_inode_info *ci = ceph_inode(dir);
+
ihold(dir);
req->r_unsafe_dir = dir;
+ spin_lock(&ci->i_unsafe_lock);
+ list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
+ spin_unlock(&ci->i_unsafe_lock);
}
}
@@ -822,8 +1054,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
erase_request(&mdsc->request_tree, req);
- if (req->r_unsafe_dir &&
- test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
+ if (req->r_unsafe_dir) {
struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
spin_lock(&ci->i_unsafe_lock);
list_del_init(&req->r_unsafe_dir_item);
@@ -838,8 +1069,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
}
if (req->r_unsafe_dir) {
- /* avoid calling iput_final() in mds dispatch threads */
- ceph_async_iput(req->r_unsafe_dir);
+ iput(req->r_unsafe_dir);
req->r_unsafe_dir = NULL;
}
@@ -993,8 +1223,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
frag.frag, mds);
if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
CEPH_MDS_STATE_ACTIVE) {
- if (mode == USE_ANY_MDS &&
- !ceph_mdsmap_is_laggy(mdsc->mdsmap,
+ if (!ceph_mdsmap_is_laggy(mdsc->mdsmap,
mds))
goto out;
}
@@ -1011,7 +1240,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
if (!cap) {
spin_unlock(&ci->i_ceph_lock);
- ceph_async_iput(inode);
+ iput(inode);
goto random;
}
mds = cap->session->s_mds;
@@ -1020,9 +1249,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
cap == ci->i_auth_cap ? "auth " : "", cap);
spin_unlock(&ci->i_ceph_lock);
out:
- /* avoid calling iput_final() while holding mdsc->mutex or
- * in mds dispatch threads */
- ceph_async_iput(inode);
+ iput(inode);
return mds;
random:
@@ -1038,7 +1265,7 @@ random:
/*
* session messages
*/
-static struct ceph_msg *create_session_msg(u32 op, u64 seq)
+struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq)
{
struct ceph_msg *msg;
struct ceph_mds_session_head *h;
@@ -1046,7 +1273,8 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
false);
if (!msg) {
- pr_err("create_session_msg ENOMEM creating msg\n");
+ pr_err("ENOMEM creating session %s msg\n",
+ ceph_session_op_name(op));
return NULL;
}
h = msg->front.iov_base;
@@ -1058,24 +1286,75 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
static const unsigned char feature_bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED;
#define FEATURE_BYTES(c) (DIV_ROUND_UP((size_t)feature_bits[c - 1] + 1, 64) * 8)
-static void encode_supported_features(void **p, void *end)
+static int encode_supported_features(void **p, void *end)
{
static const size_t count = ARRAY_SIZE(feature_bits);
if (count > 0) {
size_t i;
size_t size = FEATURE_BYTES(count);
+ unsigned long bit;
+
+ if (WARN_ON_ONCE(*p + 4 + size > end))
+ return -ERANGE;
- BUG_ON(*p + 4 + size > end);
+ ceph_encode_32(p, size);
+ memset(*p, 0, size);
+ for (i = 0; i < count; i++) {
+ bit = feature_bits[i];
+ ((unsigned char *)(*p))[bit / 8] |= BIT(bit % 8);
+ }
+ *p += size;
+ } else {
+ if (WARN_ON_ONCE(*p + 4 > end))
+ return -ERANGE;
+
+ ceph_encode_32(p, 0);
+ }
+
+ return 0;
+}
+
+static const unsigned char metric_bits[] = CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED;
+#define METRIC_BYTES(cnt) (DIV_ROUND_UP((size_t)metric_bits[cnt - 1] + 1, 64) * 8)
+static int encode_metric_spec(void **p, void *end)
+{
+ static const size_t count = ARRAY_SIZE(metric_bits);
+
+ /* header */
+ if (WARN_ON_ONCE(*p + 2 > end))
+ return -ERANGE;
+
+ ceph_encode_8(p, 1); /* version */
+ ceph_encode_8(p, 1); /* compat */
+
+ if (count > 0) {
+ size_t i;
+ size_t size = METRIC_BYTES(count);
+
+ if (WARN_ON_ONCE(*p + 4 + 4 + size > end))
+ return -ERANGE;
+
+ /* metric spec info length */
+ ceph_encode_32(p, 4 + size);
+
+ /* metric spec */
ceph_encode_32(p, size);
memset(*p, 0, size);
for (i = 0; i < count; i++)
- ((unsigned char*)(*p))[i / 8] |= BIT(feature_bits[i] % 8);
+ ((unsigned char *)(*p))[i / 8] |= BIT(metric_bits[i] % 8);
*p += size;
} else {
- BUG_ON(*p + 4 > end);
+ if (WARN_ON_ONCE(*p + 4 + 4 > end))
+ return -ERANGE;
+
+ /* metric spec info length */
+ ceph_encode_32(p, 4);
+ /* metric spec */
ceph_encode_32(p, 0);
}
+
+ return 0;
}
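For reference, encode_supported_features() above packs feature bit N into byte N/8, bit N%8 of the array. A standalone worked example (userspace, not kernel code), assuming example bits 0, 1, 7 and 9:

#include <stdio.h>

int main(void)
{
	unsigned char bits[] = { 0, 1, 7, 9 };	/* example feature bits */
	unsigned char map[2] = { 0, 0 };
	unsigned int i;

	for (i = 0; i < sizeof(bits); i++)
		map[bits[i] / 8] |= 1u << (bits[i] % 8);

	printf("%02x %02x\n", map[0], map[1]);	/* prints "83 02" */
	return 0;
}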
/*
@@ -1086,13 +1365,14 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
{
struct ceph_msg *msg;
struct ceph_mds_session_head *h;
- int i = -1;
+ int i;
int extra_bytes = 0;
int metadata_key_count = 0;
struct ceph_options *opt = mdsc->fsc->client->options;
struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
size_t size, count;
void *p, *end;
+ int ret;
const char* metadata[][2] = {
{"hostname", mdsc->nodename},
@@ -1117,12 +1397,19 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
size = FEATURE_BYTES(count);
extra_bytes += 4 + size;
+ /* metric spec */
+ size = 0;
+ count = ARRAY_SIZE(metric_bits);
+ if (count > 0)
+ size = METRIC_BYTES(count);
+ extra_bytes += 2 + 4 + 4 + size;
+
/* Allocate the message */
msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
GFP_NOFS, false);
if (!msg) {
- pr_err("create_session_msg ENOMEM creating msg\n");
- return NULL;
+ pr_err("ENOMEM creating session open msg\n");
+ return ERR_PTR(-ENOMEM);
}
p = msg->front.iov_base;
end = p + msg->front.iov_len;
@@ -1135,9 +1422,9 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
* Serialize client metadata into waiting buffer space, using
* the format that userspace expects for map<string, string>
*
- * ClientSession messages with metadata are v3
+ * ClientSession messages with metadata are v4
*/
- msg->hdr.version = cpu_to_le16(3);
+ msg->hdr.version = cpu_to_le16(4);
msg->hdr.compat_version = cpu_to_le16(1);
/* The write pointer, following the session_head structure */
@@ -1159,7 +1446,20 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
p += val_len;
}
- encode_supported_features(&p, end);
+ ret = encode_supported_features(&p, end);
+ if (ret) {
+ pr_err("encode_supported_features failed!\n");
+ ceph_msg_put(msg);
+ return ERR_PTR(ret);
+ }
+
+ ret = encode_metric_spec(&p, end);
+ if (ret) {
+ pr_err("encode_metric_spec failed!\n");
+ ceph_msg_put(msg);
+ return ERR_PTR(ret);
+ }
+
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
@@ -1187,8 +1487,8 @@ static int __open_session(struct ceph_mds_client *mdsc,
/* send connect message */
msg = create_session_open_msg(mdsc, session->s_seq);
- if (!msg)
- return -ENOMEM;
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
ceph_con_send(&session->s_con, msg);
return 0;
}
@@ -1202,6 +1502,7 @@ static struct ceph_mds_session *
__open_export_target_session(struct ceph_mds_client *mdsc, int target)
{
struct ceph_mds_session *session;
+ int ret;
session = __ceph_lookup_mds_session(mdsc, target);
if (!session) {
@@ -1210,8 +1511,11 @@ __open_export_target_session(struct ceph_mds_client *mdsc, int target)
return session;
}
if (session->s_state == CEPH_MDS_SESSION_NEW ||
- session->s_state == CEPH_MDS_SESSION_CLOSING)
- __open_session(mdsc, session);
+ session->s_state == CEPH_MDS_SESSION_CLOSING) {
+ ret = __open_session(mdsc, session);
+ if (ret)
+ return ERR_PTR(ret);
+ }
return session;
}
@@ -1246,8 +1550,7 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
for (i = 0; i < mi->num_export_targets; i++) {
ts = __open_export_target_session(mdsc, mi->export_targets[i]);
- if (!IS_ERR(ts))
- ceph_put_mds_session(ts);
+ ceph_put_mds_session(ts);
}
}
@@ -1290,7 +1593,6 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
{
struct ceph_mds_request *req;
struct rb_node *p;
- struct ceph_inode_info *ci;
dout("cleanup_session_requests mds%d\n", session->s_mds);
mutex_lock(&mdsc->mutex);
@@ -1299,16 +1601,10 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
struct ceph_mds_request, r_unsafe_item);
pr_warn_ratelimited(" dropping unsafe request %llu\n",
req->r_tid);
- if (req->r_target_inode) {
- /* dropping unsafe change of inode's attributes */
- ci = ceph_inode(req->r_target_inode);
- errseq_set(&ci->i_meta_err, -EIO);
- }
- if (req->r_unsafe_dir) {
- /* dropping unsafe directory operation */
- ci = ceph_inode(req->r_unsafe_dir);
- errseq_set(&ci->i_meta_err, -EIO);
- }
+ if (req->r_target_inode)
+ mapping_set_error(req->r_target_inode->i_mapping, -EIO);
+ if (req->r_unsafe_dir)
+ mapping_set_error(req->r_unsafe_dir->i_mapping, -EIO);
__unregister_request(mdsc, req);
}
/* zero r_attempts, so kick_requests() will re-send requests */
@@ -1344,7 +1640,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
p = session->s_caps.next;
while (p != &session->s_caps) {
cap = list_entry(p, struct ceph_cap, session_caps);
- inode = igrab(&cap->ci->vfs_inode);
+ inode = igrab(&cap->ci->netfs.inode);
if (!inode) {
p = p->next;
continue;
@@ -1353,9 +1649,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
spin_unlock(&session->s_cap_lock);
if (last_inode) {
- /* avoid calling iput_final() while holding
- * s_mutex or in mds dispatch threads */
- ceph_async_iput(last_inode);
+ iput(last_inode);
last_inode = NULL;
}
if (old_cap) {
@@ -1375,6 +1669,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
cap->session = NULL;
list_del_init(&cap->session_caps);
session->s_nr_caps--;
+ atomic64_dec(&session->s_mdsc->metric.total_caps);
if (cap->queue_release)
__ceph_queue_cap_release(session, cap);
else
@@ -1388,7 +1683,7 @@ out:
session->s_cap_iterator = NULL;
spin_unlock(&session->s_cap_lock);
- ceph_async_iput(last_inode);
+ iput(last_inode);
if (old_cap)
ceph_put_cap(session->s_mdsc, old_cap);
@@ -1398,98 +1693,20 @@ out:
static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
void *arg)
{
- struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg;
struct ceph_inode_info *ci = ceph_inode(inode);
- LIST_HEAD(to_remove);
- bool dirty_dropped = false;
bool invalidate = false;
+ int iputs;
dout("removing cap %p, ci is %p, inode is %p\n",
- cap, ci, &ci->vfs_inode);
+ cap, ci, &ci->netfs.inode);
spin_lock(&ci->i_ceph_lock);
- if (cap->mds_wanted | cap->issued)
- ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
- __ceph_remove_cap(cap, false);
- if (!ci->i_auth_cap) {
- struct ceph_cap_flush *cf;
- struct ceph_mds_client *mdsc = fsc->mdsc;
-
- if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
- if (inode->i_data.nrpages > 0)
- invalidate = true;
- if (ci->i_wrbuffer_ref > 0)
- mapping_set_error(&inode->i_data, -EIO);
- }
-
- while (!list_empty(&ci->i_cap_flush_list)) {
- cf = list_first_entry(&ci->i_cap_flush_list,
- struct ceph_cap_flush, i_list);
- list_move(&cf->i_list, &to_remove);
- }
-
- spin_lock(&mdsc->cap_dirty_lock);
-
- list_for_each_entry(cf, &to_remove, i_list)
- list_del(&cf->g_list);
-
- if (!list_empty(&ci->i_dirty_item)) {
- pr_warn_ratelimited(
- " dropping dirty %s state for %p %lld\n",
- ceph_cap_string(ci->i_dirty_caps),
- inode, ceph_ino(inode));
- ci->i_dirty_caps = 0;
- list_del_init(&ci->i_dirty_item);
- dirty_dropped = true;
- }
- if (!list_empty(&ci->i_flushing_item)) {
- pr_warn_ratelimited(
- " dropping dirty+flushing %s state for %p %lld\n",
- ceph_cap_string(ci->i_flushing_caps),
- inode, ceph_ino(inode));
- ci->i_flushing_caps = 0;
- list_del_init(&ci->i_flushing_item);
- mdsc->num_cap_flushing--;
- dirty_dropped = true;
- }
- spin_unlock(&mdsc->cap_dirty_lock);
-
- if (dirty_dropped) {
- errseq_set(&ci->i_meta_err, -EIO);
-
- if (ci->i_wrbuffer_ref_head == 0 &&
- ci->i_wr_ref == 0 &&
- ci->i_dirty_caps == 0 &&
- ci->i_flushing_caps == 0) {
- ceph_put_snap_context(ci->i_head_snapc);
- ci->i_head_snapc = NULL;
- }
- }
-
- if (atomic_read(&ci->i_filelock_ref) > 0) {
- /* make further file lock syscall return -EIO */
- ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK;
- pr_warn_ratelimited(" dropping file locks for %p %lld\n",
- inode, ceph_ino(inode));
- }
-
- if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
- list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove);
- ci->i_prealloc_cap_flush = NULL;
- }
- }
+ iputs = ceph_purge_inode_cap(inode, cap, &invalidate);
spin_unlock(&ci->i_ceph_lock);
- while (!list_empty(&to_remove)) {
- struct ceph_cap_flush *cf;
- cf = list_first_entry(&to_remove,
- struct ceph_cap_flush, i_list);
- list_del(&cf->i_list);
- ceph_free_cap_flush(cf);
- }
wake_up_all(&ci->i_cap_wq);
if (invalidate)
ceph_queue_invalidate(inode);
- if (dirty_dropped)
+ while (iputs--)
iput(inode);
return 0;
}
@@ -1530,8 +1747,7 @@ static void remove_session_caps(struct ceph_mds_session *session)
spin_unlock(&session->s_cap_lock);
inode = ceph_find_inode(sb, vino);
- /* avoid calling iput_final() while holding s_mutex */
- ceph_async_iput(inode);
+ iput(inode);
spin_lock(&session->s_cap_lock);
}
@@ -1570,13 +1786,10 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
ci->i_requested_max_size = 0;
spin_unlock(&ci->i_ceph_lock);
} else if (ev == RENEWCAPS) {
- if (cap->cap_gen < cap->session->s_cap_gen) {
+ if (cap->cap_gen < atomic_read(&cap->session->s_cap_gen)) {
/* mds did not re-issue stale cap */
spin_lock(&ci->i_ceph_lock);
cap->issued = cap->implemented = CEPH_CAP_PIN;
- /* make sure mds knows what we want */
- if (__ceph_caps_file_wanted(ci) & ~cap->mds_wanted)
- ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
spin_unlock(&ci->i_ceph_lock);
}
} else if (ev == FORCE_RO) {
@@ -1620,8 +1833,8 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
ceph_mds_state_name(state));
- msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
- ++session->s_renew_seq);
+ msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
+ ++session->s_renew_seq);
if (!msg)
return -ENOMEM;
ceph_con_send(&session->s_con, msg);
@@ -1635,7 +1848,7 @@ static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
session->s_mds, ceph_session_state_name(session->s_state), seq);
- msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
+ msg = ceph_create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
if (!msg)
return -ENOMEM;
ceph_con_send(&session->s_con, msg);
@@ -1680,15 +1893,15 @@ static void renewed_caps(struct ceph_mds_client *mdsc,
/*
* send a session close request
*/
-static int request_close_session(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
+static int request_close_session(struct ceph_mds_session *session)
{
struct ceph_msg *msg;
dout("request_close_session mds%d state %s seq %lld\n",
session->s_mds, ceph_session_state_name(session->s_state),
session->s_seq);
- msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
+ msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_CLOSE,
+ session->s_seq);
if (!msg)
return -ENOMEM;
ceph_con_send(&session->s_con, msg);
@@ -1704,7 +1917,7 @@ static int __close_session(struct ceph_mds_client *mdsc,
if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
return 0;
session->s_state = CEPH_MDS_SESSION_CLOSING;
- return request_close_session(mdsc, session);
+ return request_close_session(session);
}
static bool drop_negative_children(struct dentry *dentry)
@@ -1772,7 +1985,8 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
}
/* The inode has cached pages, but it's no longer used.
* we can safely drop it */
- if (wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
+ if (S_ISREG(inode->i_mode) &&
+ wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
!(oissued & CEPH_CAP_FILE_CACHE)) {
used = 0;
oissued = 0;
@@ -1782,7 +1996,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
if (oissued) {
/* we aren't the only cap.. just remove us */
- __ceph_remove_cap(cap, true);
+ ceph_remove_cap(cap, true);
(*remaining)--;
} else {
struct dentry *dentry;
@@ -2064,7 +2278,8 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
order = get_order(size * num_entries);
while (order >= 0) {
rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL |
- __GFP_NOWARN,
+ __GFP_NOWARN |
+ __GFP_ZERO,
order);
if (rinfo->dir_entries)
break;
@@ -2089,18 +2304,21 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
{
- struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
+ struct ceph_mds_request *req;
+ req = kmem_cache_zalloc(ceph_mds_request_cachep, GFP_NOFS);
if (!req)
return ERR_PTR(-ENOMEM);
mutex_init(&req->r_fill_mutex);
req->r_mdsc = mdsc;
req->r_started = jiffies;
+ req->r_start_latency = ktime_get();
req->r_resend_mds = -1;
INIT_LIST_HEAD(&req->r_unsafe_dir_item);
INIT_LIST_HEAD(&req->r_unsafe_target_item);
req->r_fmode = -1;
+ req->r_feature_needed = -1;
kref_init(&req->r_kref);
RB_CLEAR_NODE(&req->r_node);
INIT_LIST_HEAD(&req->r_wait);
@@ -2299,15 +2517,33 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
return r;
}
+static void encode_timestamp_and_gids(void **p,
+ const struct ceph_mds_request *req)
+{
+ struct ceph_timespec ts;
+ int i;
+
+ ceph_encode_timespec64(&ts, &req->r_stamp);
+ ceph_encode_copy(p, &ts, sizeof(ts));
+
+ /* gid_list */
+ ceph_encode_32(p, req->r_cred->group_info->ngroups);
+ for (i = 0; i < req->r_cred->group_info->ngroups; i++)
+ ceph_encode_64(p, from_kgid(&init_user_ns,
+ req->r_cred->group_info->gid[i]));
+}
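For reference, the trailer written above is a ceph_timespec followed by a 32-bit group count and that many 64-bit gids; a hedged sketch of a matching decoder (not part of this patch):

/* Hypothetical decoder for the timestamp-and-gids trailer. */
static int example_decode_trailer(void **p, void *end)
{
	struct ceph_timespec ts;
	u32 ngroups, i;
	u64 gid;

	ceph_decode_copy_safe(p, end, &ts, sizeof(ts), bad);
	ceph_decode_32_safe(p, end, ngroups, bad);
	for (i = 0; i < ngroups; i++)
		ceph_decode_64_safe(p, end, gid, bad);	/* gid unused here */
	return 0;
bad:
	return -EIO;
}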
+
/*
* called under mdsc->mutex
*/
-static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
+static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
struct ceph_mds_request *req,
- int mds, bool drop_cap_releases)
+ bool drop_cap_releases)
{
+ int mds = session->s_mds;
+ struct ceph_mds_client *mdsc = session->s_mdsc;
struct ceph_msg *msg;
- struct ceph_mds_request_head *head;
+ struct ceph_mds_request_head_old *head;
const char *path1 = NULL;
const char *path2 = NULL;
u64 ino1 = 0, ino2 = 0;
@@ -2317,6 +2553,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
u16 releases;
void *p, *end;
int ret;
+ bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
ret = set_request_path_attr(req->r_inode, req->r_dentry,
req->r_parent, req->r_path1, req->r_ino1.ino,
@@ -2338,14 +2575,16 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
goto out_free1;
}
- len = sizeof(*head) +
- pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
+ len = legacy ? sizeof(*head) : sizeof(struct ceph_mds_request_head);
+ len += pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
sizeof(struct ceph_timespec);
+ len += sizeof(u32) + (sizeof(u64) * req->r_cred->group_info->ngroups);
/* calculate (max) length for cap releases */
len += sizeof(struct ceph_mds_request_release) *
(!!req->r_inode_drop + !!req->r_dentry_drop +
!!req->r_old_inode_drop + !!req->r_old_dentry_drop);
+
if (req->r_dentry_drop)
len += pathlen1;
if (req->r_old_dentry_drop)
@@ -2357,18 +2596,34 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
goto out_free2;
}
- msg->hdr.version = cpu_to_le16(2);
msg->hdr.tid = cpu_to_le64(req->r_tid);
- head = msg->front.iov_base;
- p = msg->front.iov_base + sizeof(*head);
+ /*
+ * The old ceph_mds_request_head didn't contain a version field, and
+ * one was added when we moved the message version from 3->4.
+ */
+ if (legacy) {
+ msg->hdr.version = cpu_to_le16(3);
+ head = msg->front.iov_base;
+ p = msg->front.iov_base + sizeof(*head);
+ } else {
+ struct ceph_mds_request_head *new_head = msg->front.iov_base;
+
+ msg->hdr.version = cpu_to_le16(4);
+ new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
+ head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
+ p = msg->front.iov_base + sizeof(*new_head);
+ }
+
end = msg->front.iov_base + msg->front.iov_len;
head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
head->op = cpu_to_le32(req->r_op);
- head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid));
- head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid));
- head->ino = 0;
+ head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
+ req->r_cred->fsuid));
+ head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
+ req->r_cred->fsgid));
+ head->ino = cpu_to_le64(req->r_deleg_ino);
head->args = req->r_args;
ceph_encode_filepath(&p, end, ino1, path1);
@@ -2382,7 +2637,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
if (req->r_inode_drop)
releases += ceph_encode_inode_release(&p,
req->r_inode ? req->r_inode : d_inode(req->r_dentry),
- mds, req->r_inode_drop, req->r_inode_unless, 0);
+ mds, req->r_inode_drop, req->r_inode_unless,
+ req->r_op == CEPH_MDS_OP_READDIR);
if (req->r_dentry_drop)
releases += ceph_encode_dentry_release(&p, req->r_dentry,
req->r_parent, mds, req->r_dentry_drop,
@@ -2404,14 +2660,14 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
head->num_releases = cpu_to_le16(releases);
- /* time stamp */
- {
- struct ceph_timespec ts;
- ceph_encode_timespec64(&ts, &req->r_stamp);
- ceph_encode_copy(&p, &ts, sizeof(ts));
+ encode_timestamp_and_gids(&p, req);
+
+ if (WARN_ON_ONCE(p > end)) {
+ ceph_msg_put(msg);
+ msg = ERR_PTR(-ERANGE);
+ goto out_free2;
}
- BUG_ON(p > end);
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
@@ -2442,21 +2698,58 @@ out:
static void complete_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
+ req->r_end_latency = ktime_get();
+
if (req->r_callback)
req->r_callback(mdsc, req);
complete_all(&req->r_completion);
}
+static struct ceph_mds_request_head_old *
+find_old_request_head(void *p, u64 features)
+{
+ bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
+ struct ceph_mds_request_head *new_head;
+
+ if (legacy)
+ return (struct ceph_mds_request_head_old *)p;
+ new_head = (struct ceph_mds_request_head *)p;
+ return (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
+}
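The cast works because the new request head is the old one with a version field prepended, so the legacy layout begins at oldest_client_tid. An abridged illustration of the assumed layout:

/*
 * Assumed layout (abridged):
 *
 *   struct ceph_mds_request_head {
 *           __le16 version;            <- added for message v4
 *           __le64 oldest_client_tid;  <- old head starts here
 *           ...                           (remaining fields match _old)
 *   };
 *
 * For legacy peers, p already points at the old head; for new peers
 * the old head lives at &new_head->oldest_client_tid.
 */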
+
/*
* called under mdsc->mutex
*/
-static int __prepare_send_request(struct ceph_mds_client *mdsc,
+static int __prepare_send_request(struct ceph_mds_session *session,
struct ceph_mds_request *req,
- int mds, bool drop_cap_releases)
+ bool drop_cap_releases)
{
- struct ceph_mds_request_head *rhead;
+ int mds = session->s_mds;
+ struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_mds_request_head_old *rhead;
struct ceph_msg *msg;
- int flags = 0;
+ int flags = 0, max_retry;
+
+ /*
+ * The type of 'r_attempts' in the kernel's 'ceph_mds_request'
+ * is 'int', while in 'ceph_mds_request_head' the type of
+ * 'num_retry' is '__u8'. So if the request retries more than
+ * 256 times, the MDS will receive an incorrect retry seq.
+ *
+ * In that case it's usually a bug in the MDS, and continuing
+ * to retry the request makes no sense.
+ *
+ * This could eventually be fixed in the ceph code, so derive
+ * the limit from the field width (1 << 8 == 256) instead of
+ * hardcoding it here.
+ */
+ max_retry = sizeof_field(struct ceph_mds_request_head, num_retry);
+ max_retry = 1 << (max_retry * BITS_PER_BYTE);
+ if (req->r_attempts >= max_retry) {
+ pr_warn_ratelimited("%s request tid %llu seq overflow\n",
+ __func__, req->r_tid);
+ return -EMULTIHOP;
+ }
req->r_attempts++;
if (req->r_inode) {
@@ -2468,11 +2761,12 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
else
req->r_sent_on_mseq = -1;
}
- dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
+ dout("%s %p tid %lld %s (attempt %d)\n", __func__, req,
req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
void *p;
+
/*
* Replay. Do not regenerate message (and rebuild
* paths, etc.); just use the original message.
@@ -2480,7 +2774,8 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
* d_move mangles the src name.
*/
msg = req->r_request;
- rhead = msg->front.iov_base;
+ rhead = find_old_request_head(msg->front.iov_base,
+ session->s_con.peer_features);
flags = le32_to_cpu(rhead->flags);
flags |= CEPH_MDS_FLAG_REPLAY;
@@ -2494,13 +2789,8 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
/* remove cap/dentry releases from message */
rhead->num_releases = 0;
- /* time stamp */
p = msg->front.iov_base + req->r_request_release_offset;
- {
- struct ceph_timespec ts;
- ceph_encode_timespec64(&ts, &req->r_stamp);
- ceph_encode_copy(&p, &ts, sizeof(ts));
- }
+ encode_timestamp_and_gids(&p, req);
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
@@ -2511,23 +2801,25 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
ceph_msg_put(req->r_request);
req->r_request = NULL;
}
- msg = create_request_message(mdsc, req, mds, drop_cap_releases);
+ msg = create_request_message(session, req, drop_cap_releases);
if (IS_ERR(msg)) {
req->r_err = PTR_ERR(msg);
return PTR_ERR(msg);
}
req->r_request = msg;
- rhead = msg->front.iov_base;
+ rhead = find_old_request_head(msg->front.iov_base,
+ session->s_con.peer_features);
rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
flags |= CEPH_MDS_FLAG_REPLAY;
+ if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags))
+ flags |= CEPH_MDS_FLAG_ASYNC;
if (req->r_parent)
flags |= CEPH_MDS_FLAG_WANT_DENTRY;
rhead->flags = cpu_to_le32(flags);
rhead->num_fwd = req->r_num_fwd;
rhead->num_retry = req->r_attempts - 1;
- rhead->ino = 0;
dout(" r_parent = %p\n", req->r_parent);
return 0;
@@ -2536,15 +2828,13 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
/*
* called under mdsc->mutex
*/
-static int __send_request(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session,
+static int __send_request(struct ceph_mds_session *session,
struct ceph_mds_request *req,
bool drop_cap_releases)
{
int err;
- err = __prepare_send_request(mdsc, req, session->s_mds,
- drop_cap_releases);
+ err = __prepare_send_request(session, req, drop_cap_releases);
if (!err) {
ceph_msg_get(req->r_request);
ceph_con_send(&session->s_con, req->r_request);
@@ -2573,7 +2863,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
if (req->r_timeout &&
time_after_eq(jiffies, req->r_started + req->r_timeout)) {
dout("do_request timed out\n");
- err = -EIO;
+ err = -ETIMEDOUT;
goto finish;
}
if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
@@ -2605,6 +2895,10 @@ static void __do_request(struct ceph_mds_client *mdsc,
mds = __choose_mds(mdsc, req, &random);
if (mds < 0 ||
ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
+ if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) {
+ err = -EJUKEBOX;
+ goto finish;
+ }
dout("do_request no mds or not active, waiting for map\n");
list_add(&req->r_wait, &mdsc->waiting_for_map);
return;
@@ -2623,15 +2917,46 @@ static void __do_request(struct ceph_mds_client *mdsc,
dout("do_request mds%d session %p state %s\n", mds, session,
ceph_session_state_name(session->s_state));
+
+ /*
+ * Old versions of ceph will crash the MDS when they see unknown OPs.
+ */
+ if (req->r_feature_needed > 0 &&
+ !test_bit(req->r_feature_needed, &session->s_features)) {
+ err = -EOPNOTSUPP;
+ goto out_session;
+ }
+
if (session->s_state != CEPH_MDS_SESSION_OPEN &&
session->s_state != CEPH_MDS_SESSION_HUNG) {
+ /*
+ * We cannot queue async requests since the caps and delegated
+ * inodes are bound to the session. Just return -EJUKEBOX and
+ * let the caller retry a sync request in that case.
+ */
+ if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) {
+ err = -EJUKEBOX;
+ goto out_session;
+ }
+
+ /*
+ * If the session has been REJECTED, then return a hard error,
+ * unless it's a CLEANRECOVER mount, in which case we'll queue
+ * it to the mdsc queue.
+ */
if (session->s_state == CEPH_MDS_SESSION_REJECTED) {
- err = -EACCES;
+ if (ceph_test_mount_opt(mdsc->fsc, CLEANRECOVER))
+ list_add(&req->r_wait, &mdsc->waiting_for_map);
+ else
+ err = -EACCES;
goto out_session;
}
+
if (session->s_state == CEPH_MDS_SESSION_NEW ||
session->s_state == CEPH_MDS_SESSION_CLOSING) {
- __open_session(mdsc, session);
+ err = __open_session(mdsc, session);
+ if (err)
+ goto out_session;
/* retry the same mds later */
if (random)
req->r_resend_mds = mds;
@@ -2646,7 +2971,65 @@ static void __do_request(struct ceph_mds_client *mdsc,
if (req->r_request_started == 0) /* note request start time */
req->r_request_started = jiffies;
- err = __send_request(mdsc, session, req, false);
+ /*
+ * For async create we choose the auth MDS of the frag in the
+ * parent directory to send the request, and usually this works
+ * fine, but if the directory is migrated to another MDS before
+ * that MDS could handle it, the request will be forwarded.
+ *
+ * And then the auth cap will be changed.
+ */
+ if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) && req->r_num_fwd) {
+ struct ceph_dentry_info *di = ceph_dentry(req->r_dentry);
+ struct ceph_inode_info *ci;
+ struct ceph_cap *cap;
+
+ /*
+ * The request may be handled very quickly, before the new
+ * inode has been linked to the dentry. When forwarding the
+ * request we need to wait for ceph_finish_async_create(),
+ * which in theory shouldn't get stuck for long or fail.
+ */
+ if (!d_inode(req->r_dentry)) {
+ err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT,
+ TASK_KILLABLE);
+ if (err) {
+ mutex_lock(&req->r_fill_mutex);
+ set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
+ mutex_unlock(&req->r_fill_mutex);
+ goto out_session;
+ }
+ }
+
+ ci = ceph_inode(d_inode(req->r_dentry));
+
+ spin_lock(&ci->i_ceph_lock);
+ cap = ci->i_auth_cap;
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE && mds != cap->mds) {
+ dout("do_request session changed for auth cap %d -> %d\n",
+ cap->session->s_mds, session->s_mds);
+
+ /* Remove the auth cap from old session */
+ spin_lock(&cap->session->s_cap_lock);
+ cap->session->s_nr_caps--;
+ list_del_init(&cap->session_caps);
+ spin_unlock(&cap->session->s_cap_lock);
+
+ /* Add the auth cap to the new session */
+ cap->mds = mds;
+ cap->session = session;
+ spin_lock(&session->s_cap_lock);
+ session->s_nr_caps++;
+ list_add_tail(&cap->session_caps, &session->s_caps);
+ spin_unlock(&session->s_cap_lock);
+
+ change_auth_cap_ses(ci, session);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ }
+
+ err = __send_request(session, req, false);
out_session:
ceph_put_mds_session(session);
@@ -2709,19 +3092,42 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
struct ceph_mds_request *req)
{
- int err;
+ int err = 0;
/* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
if (req->r_inode)
ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
if (req->r_parent) {
- ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
- ihold(req->r_parent);
+ struct ceph_inode_info *ci = ceph_inode(req->r_parent);
+ int fmode = (req->r_op & CEPH_MDS_OP_WRITE) ?
+ CEPH_FILE_MODE_WR : CEPH_FILE_MODE_RD;
+ spin_lock(&ci->i_ceph_lock);
+ ceph_take_cap_refs(ci, CEPH_CAP_PIN, false);
+ __ceph_touch_fmode(ci, mdsc, fmode);
+ spin_unlock(&ci->i_ceph_lock);
}
if (req->r_old_dentry_dir)
ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN);
+ if (req->r_inode) {
+ err = ceph_wait_on_async_create(req->r_inode);
+ if (err) {
+ dout("%s: wait for async create returned: %d\n",
+ __func__, err);
+ return err;
+ }
+ }
+
+ if (!err && req->r_old_inode) {
+ err = ceph_wait_on_async_create(req->r_old_inode);
+ if (err) {
+ dout("%s: wait for async create returned: %d\n",
+ __func__, err);
+ return err;
+ }
+ }
+
dout("submit_request on %p for inode %p\n", req, dir);
mutex_lock(&mdsc->mutex);
__register_request(mdsc, req, dir);
@@ -2731,15 +3137,16 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
return err;
}
-static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
- struct ceph_mds_request *req)
+int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req,
+ ceph_mds_request_wait_callback_t wait_func)
{
int err;
/* wait */
dout("do_request waiting\n");
- if (!req->r_timeout && req->r_wait_for_completion) {
- err = req->r_wait_for_completion(mdsc, req);
+ if (wait_func) {
+ err = wait_func(mdsc, req);
} else {
long timeleft = wait_for_completion_killable_timeout(
&req->r_completion,
@@ -2747,7 +3154,7 @@ static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
if (timeleft > 0)
err = 0;
else if (!timeleft)
- err = -EIO; /* timed out */
+ err = -ETIMEDOUT; /* timed out */
else
err = timeleft; /* killed */
}
@@ -2796,7 +3203,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
/* issue */
err = ceph_mdsc_submit_request(mdsc, dir, req);
if (!err)
- err = ceph_mdsc_wait_request(mdsc, req);
+ err = ceph_mdsc_wait_request(mdsc, req, NULL);
dout("do_request %p done, result %d\n", req, err);
return err;
}
@@ -2882,35 +3289,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
result = le32_to_cpu(head->result);
- /*
- * Handle an ESTALE
- * if we're not talking to the authority, send to them
- * if the authority has changed while we weren't looking,
- * send to new authority
- * Otherwise we just have to return an ESTALE
- */
- if (result == -ESTALE) {
- dout("got ESTALE on request %llu\n", req->r_tid);
- req->r_resend_mds = -1;
- if (req->r_direct_mode != USE_AUTH_MDS) {
- dout("not using auth, setting for that now\n");
- req->r_direct_mode = USE_AUTH_MDS;
- __do_request(mdsc, req);
- mutex_unlock(&mdsc->mutex);
- goto out;
- } else {
- int mds = __choose_mds(mdsc, req, NULL);
- if (mds >= 0 && mds != req->r_session->s_mds) {
- dout("but auth changed, so resending\n");
- __do_request(mdsc, req);
- mutex_unlock(&mdsc->mutex);
- goto out;
- }
- }
- dout("have to return ESTALE on request %llu\n", req->r_tid);
- }
-
-
if (head->safe) {
set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags);
__unregister_request(mdsc, req);
@@ -2935,24 +3313,33 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
} else {
set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags);
list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
- if (req->r_unsafe_dir) {
- struct ceph_inode_info *ci =
- ceph_inode(req->r_unsafe_dir);
- spin_lock(&ci->i_unsafe_lock);
- list_add_tail(&req->r_unsafe_dir_item,
- &ci->i_unsafe_dirops);
- spin_unlock(&ci->i_unsafe_lock);
- }
}
dout("handle_reply tid %lld result %d\n", tid, result);
rinfo = &req->r_reply_info;
if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features))
- err = parse_reply_info(msg, rinfo, (u64)-1);
+ err = parse_reply_info(session, msg, rinfo, (u64)-1);
else
- err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
+ err = parse_reply_info(session, msg, rinfo, session->s_con.peer_features);
mutex_unlock(&mdsc->mutex);
+ /* Must find target inode outside of mutexes to avoid deadlocks */
+ if ((err >= 0) && rinfo->head->is_target) {
+ struct inode *in;
+ struct ceph_vino tvino = {
+ .ino = le64_to_cpu(rinfo->targeti.in->ino),
+ .snap = le64_to_cpu(rinfo->targeti.in->snapid)
+ };
+
+ in = ceph_get_inode(mdsc->fsc->sb, tvino);
+ if (IS_ERR(in)) {
+ err = PTR_ERR(in);
+ mutex_lock(&session->s_mutex);
+ goto out_err;
+ }
+ req->r_target_inode = in;
+ }
+
mutex_lock(&session->s_mutex);
if (err < 0) {
pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
@@ -3020,6 +3407,9 @@ out_err:
/* kick calling process */
complete_request(mdsc, req);
+
+ ceph_update_metadata_metrics(&mdsc->metric, req->r_start_latency,
+ req->r_end_latency, err);
out:
ceph_mdsc_put_request(req);
return;
@@ -3041,6 +3431,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
int err = -EINVAL;
void *p = msg->front.iov_base;
void *end = p + msg->front.iov_len;
+ bool aborted = false;
ceph_decode_need(&p, end, 2*sizeof(u32), bad);
next_mds = ceph_decode_32(&p);
@@ -3049,16 +3440,41 @@ static void handle_forward(struct ceph_mds_client *mdsc,
mutex_lock(&mdsc->mutex);
req = lookup_get_request(mdsc, tid);
if (!req) {
+ mutex_unlock(&mdsc->mutex);
dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
- goto out; /* dup reply? */
+ return; /* dup reply? */
}
if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
dout("forward tid %llu aborted, unregistering\n", tid);
__unregister_request(mdsc, req);
} else if (fwd_seq <= req->r_num_fwd) {
- dout("forward tid %llu to mds%d - old seq %d <= %d\n",
- tid, next_mds, req->r_num_fwd, fwd_seq);
+ /*
+ * The type of 'num_fwd' in ceph 'MClientRequestForward'
+ * is 'int32_t', while in 'ceph_mds_request_head' the
+ * type is '__u8'. So if the request bounces between
+ * MDSes more than 256 times, the client will get stuck.
+ *
+ * In this case it's usually a bug in the MDS, and
+ * continuing to bounce the request makes no sense.
+ *
+ * In the future this could be fixed in the ceph code,
+ * so avoid hardcoding the limit here.
+ */
+ int max = sizeof_field(struct ceph_mds_request_head, num_fwd);
+ max = 1 << (max * BITS_PER_BYTE);
+ if (req->r_num_fwd >= max) {
+ mutex_lock(&req->r_fill_mutex);
+ req->r_err = -EMULTIHOP;
+ set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
+ mutex_unlock(&req->r_fill_mutex);
+ aborted = true;
+ pr_warn_ratelimited("forward tid %llu seq overflow\n",
+ tid);
+ } else {
+ dout("forward tid %llu to mds%d - old seq %d <= %d\n",
+ tid, next_mds, req->r_num_fwd, fwd_seq);
+ }
} else {
/* resend. forward race not possible; mds would drop */
dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
@@ -3070,9 +3486,12 @@ static void handle_forward(struct ceph_mds_client *mdsc,
put_request_session(req);
__do_request(mdsc, req);
}
- ceph_mdsc_put_request(req);
-out:
mutex_unlock(&mdsc->mutex);
+
+ /* kick calling process */
+ if (aborted)
+ complete_request(mdsc, req);
+ ceph_mdsc_put_request(req);
return;
bad:
@@ -3080,7 +3499,7 @@ bad:
}
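To make the overflow guard above concrete, here is a standalone sketch of how the 256-forward ceiling falls out of the __u8 field width; the one-field struct is a stand-in for the real ceph_mds_request_head, and the two macros mirror the kernel's definitions:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define BITS_PER_BYTE 8
#define sizeof_field(t, f) (sizeof(((t *)0)->f))

struct ceph_mds_request_head_model {	/* only the field that matters here */
	uint8_t num_fwd;
};

int main(void)
{
	int max = sizeof_field(struct ceph_mds_request_head_model, num_fwd);

	max = 1 << (max * BITS_PER_BYTE);	/* 1 << 8 == 256 */
	printf("forward ceiling: %d\n", max);
	return 0;
}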
static int __decode_session_metadata(void **p, void *end,
- bool *blacklisted)
+ bool *blocklisted)
{
/* map<string,string> */
u32 n;
@@ -3094,8 +3513,12 @@ static int __decode_session_metadata(void **p, void *end,
*p += len;
ceph_decode_32_safe(p, end, len, bad);
ceph_decode_need(p, end, len, bad);
+ /*
+ * Match "blocklisted (blacklisted)" from newer MDSes,
+ * or "blacklisted" from older MDSes.
+ */
if (err_str && strnstr(*p, "blacklisted", len))
- *blacklisted = true;
+ *blocklisted = true;
*p += len;
}
return 0;
@@ -3116,10 +3539,9 @@ static void handle_session(struct ceph_mds_session *session,
void *end = p + msg->front.iov_len;
struct ceph_mds_session_head *h;
u32 op;
- u64 seq;
- unsigned long features = 0;
+ u64 seq, features = 0;
int wake = 0;
- bool blacklisted = false;
+ bool blocklisted = false;
/* decode */
ceph_decode_need(&p, end, sizeof(*h), bad);
@@ -3131,14 +3553,36 @@ static void handle_session(struct ceph_mds_session *session,
if (msg_version >= 3) {
u32 len;
- /* version >= 2, metadata */
- if (__decode_session_metadata(&p, end, &blacklisted) < 0)
+ /* version >= 2 and < 5, decode metadata, skip otherwise
+ * as it's handled via flags.
+ */
+ if (msg_version >= 5)
+ ceph_decode_skip_map(&p, end, string, string, bad);
+ else if (__decode_session_metadata(&p, end, &blocklisted) < 0)
goto bad;
+
/* version >= 3, feature bits */
ceph_decode_32_safe(&p, end, len, bad);
- ceph_decode_need(&p, end, len, bad);
- memcpy(&features, p, min_t(size_t, len, sizeof(features)));
- p += len;
+ if (len) {
+ ceph_decode_64_safe(&p, end, features, bad);
+ p += len - sizeof(features);
+ }
+ }
+
+ if (msg_version >= 5) {
+ u32 flags, len;
+
+ /* version >= 4 */
+ ceph_decode_skip_16(&p, end, bad); /* struct_v, struct_cv */
+ ceph_decode_32_safe(&p, end, len, bad); /* len */
+ ceph_decode_skip_n(&p, end, len, bad); /* metric_spec */
+
+ /* version >= 5, flags */
+ ceph_decode_32_safe(&p, end, flags, bad);
+ if (flags & CEPH_SESSION_BLOCKLISTED) {
+ pr_warn("mds%d session blocklisted\n", session->s_mds);
+ blocklisted = true;
+ }
}
mutex_lock(&mdsc->mutex);
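For reference, a userspace sketch of the v5 trailer decode above: skip the two version bytes, skip the length-prefixed metric_spec, then read the flags word. The flag's bit position is an assumption here, standing in for CEPH_SESSION_BLOCKLISTED:

#include <stdio.h>
#include <stdint.h>

#define SESSION_BLOCKLISTED_MODEL (1 << 0)	/* assumed bit position */

static uint32_t get_le32(const uint8_t *p)
{
	return (uint32_t)p[0] | (uint32_t)p[1] << 8 |
	       (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;
}

int main(void)
{
	/* struct_v, struct_cv, metric_spec len = 0, flags = 0x1 */
	uint8_t buf[] = { 1, 1, 0, 0, 0, 0, 1, 0, 0, 0 };
	const uint8_t *p = buf;

	p += 2;				/* skip struct_v, struct_cv */
	p += 4 + get_le32(p);		/* skip len-prefixed metric_spec */
	if (get_le32(p) & SESSION_BLOCKLISTED_MODEL)
		printf("session blocklisted\n");
	return 0;
}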
@@ -3165,9 +3609,26 @@ static void handle_session(struct ceph_mds_session *session,
case CEPH_SESSION_OPEN:
if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
pr_info("mds%d reconnect success\n", session->s_mds);
- session->s_state = CEPH_MDS_SESSION_OPEN;
- session->s_features = features;
- renewed_caps(mdsc, session, 0);
+
+ if (session->s_state == CEPH_MDS_SESSION_OPEN) {
+ pr_notice("mds%d is already opened\n", session->s_mds);
+ } else {
+ session->s_state = CEPH_MDS_SESSION_OPEN;
+ session->s_features = features;
+ renewed_caps(mdsc, session, 0);
+ if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT,
+ &session->s_features))
+ metric_schedule_delayed(&mdsc->metric);
+ }
+
+ /*
+ * The connection may be broken and the session on the
+ * client side may have been reinitialized, so update
+ * the seq anyway.
+ */
+ if (!session->s_seq && seq)
+ session->s_seq = seq;
+
wake = 1;
if (mdsc->stopping)
__close_session(mdsc, session);
@@ -3191,10 +3652,8 @@ static void handle_session(struct ceph_mds_session *session,
case CEPH_SESSION_STALE:
pr_info("mds%d caps went stale, renewing\n",
session->s_mds);
- spin_lock(&session->s_gen_ttl_lock);
- session->s_cap_gen++;
+ atomic_inc(&session->s_cap_gen);
session->s_cap_ttl = jiffies - 1;
- spin_unlock(&session->s_gen_ttl_lock);
send_renew_caps(mdsc, session);
break;
@@ -3220,8 +3679,8 @@ static void handle_session(struct ceph_mds_session *session,
session->s_state = CEPH_MDS_SESSION_REJECTED;
cleanup_session_requests(mdsc, session);
remove_session_caps(session);
- if (blacklisted)
- mdsc->fsc->blacklisted = true;
+ if (blocklisted)
+ mdsc->fsc->blocklisted = true;
wake = 2; /* for good measure */
break;
@@ -3249,6 +3708,29 @@ bad:
return;
}
+void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req)
+{
+ int dcaps;
+
+ dcaps = xchg(&req->r_dir_caps, 0);
+ if (dcaps) {
+ dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
+ ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps);
+ }
+}
+
+void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req)
+{
+ int dcaps;
+
+ dcaps = xchg(&req->r_dir_caps, 0);
+ if (dcaps) {
+ dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
+ ceph_put_cap_refs_no_check_caps(ceph_inode(req->r_parent),
+ dcaps);
+ }
+}
+
/*
* called under session->mutex.
*/
@@ -3262,7 +3744,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
mutex_lock(&mdsc->mutex);
list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item)
- __send_request(mdsc, session, req, true);
+ __send_request(session, req, true);
/*
* also re-send old requests when MDS enters reconnect stage. So that MDS
@@ -3276,9 +3758,14 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
continue;
if (req->r_attempts == 0)
continue; /* only old requests */
- if (req->r_session &&
- req->r_session->s_mds == session->s_mds)
- __send_request(mdsc, session, req, true);
+ if (!req->r_session)
+ continue;
+ if (req->r_session->s_mds != session->s_mds)
+ continue;
+
+ ceph_mdsc_release_dir_caps_no_check(req);
+
+ __send_request(session, req, true);
}
mutex_unlock(&mdsc->mutex);
}
@@ -3359,10 +3846,43 @@ fail_msg:
return err;
}
+static struct dentry* d_find_primary(struct inode *inode)
+{
+ struct dentry *alias, *dn = NULL;
+
+ if (hlist_empty(&inode->i_dentry))
+ return NULL;
+
+ spin_lock(&inode->i_lock);
+ if (hlist_empty(&inode->i_dentry))
+ goto out_unlock;
+
+ if (S_ISDIR(inode->i_mode)) {
+ alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
+ if (!IS_ROOT(alias))
+ dn = dget(alias);
+ goto out_unlock;
+ }
+
+ hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
+ spin_lock(&alias->d_lock);
+ if (!d_unhashed(alias) &&
+ (ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) {
+ dn = dget_dlock(alias);
+ }
+ spin_unlock(&alias->d_lock);
+ if (dn)
+ break;
+ }
+out_unlock:
+ spin_unlock(&inode->i_lock);
+ return dn;
+}
+
/*
* Encode information about a cap for a reconnect with the MDS.
*/
-static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
+static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
void *arg)
{
union {
@@ -3372,36 +3892,63 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
struct ceph_inode_info *ci = cap->ci;
struct ceph_reconnect_state *recon_state = arg;
struct ceph_pagelist *pagelist = recon_state->pagelist;
- int err;
+ struct dentry *dentry;
+ char *path;
+ int pathlen = 0, err;
+ u64 pathbase;
u64 snap_follows;
dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
inode, ceph_vinop(inode), cap, cap->cap_id,
ceph_cap_string(cap->issued));
+ dentry = d_find_primary(inode);
+ if (dentry) {
+ /* set pathbase to parent dir when msg_version >= 2 */
+ path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase,
+ recon_state->msg_version >= 2);
+ dput(dentry);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ goto out_err;
+ }
+ } else {
+ path = NULL;
+ pathbase = 0;
+ }
+
spin_lock(&ci->i_ceph_lock);
cap->seq = 0; /* reset cap seq */
cap->issue_seq = 0; /* and issue_seq */
cap->mseq = 0; /* and migrate_seq */
- cap->cap_gen = cap->session->s_cap_gen;
+ cap->cap_gen = atomic_read(&cap->session->s_cap_gen);
+
+ /* These are lost when the session goes away */
+ if (S_ISDIR(inode->i_mode)) {
+ if (cap->issued & CEPH_CAP_DIR_CREATE) {
+ ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
+ memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
+ }
+ cap->issued &= ~CEPH_CAP_ANY_DIR_OPS;
+ }
if (recon_state->msg_version >= 2) {
rec.v2.cap_id = cpu_to_le64(cap->cap_id);
rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
rec.v2.issued = cpu_to_le32(cap->issued);
rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
- rec.v2.pathbase = 0;
+ rec.v2.pathbase = cpu_to_le64(pathbase);
rec.v2.flock_len = (__force __le32)
((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
} else {
rec.v1.cap_id = cpu_to_le64(cap->cap_id);
rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
rec.v1.issued = cpu_to_le32(cap->issued);
- rec.v1.size = cpu_to_le64(inode->i_size);
+ rec.v1.size = cpu_to_le64(i_size_read(inode));
ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
- rec.v1.pathbase = 0;
+ rec.v1.pathbase = cpu_to_le64(pathbase);
}
if (list_empty(&ci->i_cap_snaps)) {
@@ -3463,7 +4010,7 @@ encode_again:
sizeof(struct ceph_filelock);
rec.v2.flock_len = cpu_to_le32(struct_len);
- struct_len += sizeof(u32) + sizeof(rec.v2);
+ struct_len += sizeof(u32) + pathlen + sizeof(rec.v2);
if (struct_v >= 2)
struct_len += sizeof(u64); /* snap_follows */
@@ -3487,7 +4034,7 @@ encode_again:
ceph_pagelist_encode_8(pagelist, 1);
ceph_pagelist_encode_32(pagelist, struct_len);
}
- ceph_pagelist_encode_string(pagelist, NULL, 0);
+ ceph_pagelist_encode_string(pagelist, path, pathlen);
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
ceph_locks_to_pagelist(flocks, pagelist,
num_fcntl_locks, num_flock_locks);
@@ -3496,39 +4043,20 @@ encode_again:
out_freeflocks:
kfree(flocks);
} else {
- u64 pathbase = 0;
- int pathlen = 0;
- char *path = NULL;
- struct dentry *dentry;
-
- dentry = d_find_alias(inode);
- if (dentry) {
- path = ceph_mdsc_build_path(dentry,
- &pathlen, &pathbase, 0);
- dput(dentry);
- if (IS_ERR(path)) {
- err = PTR_ERR(path);
- goto out_err;
- }
- rec.v1.pathbase = cpu_to_le64(pathbase);
- }
-
err = ceph_pagelist_reserve(pagelist,
sizeof(u64) + sizeof(u32) +
pathlen + sizeof(rec.v1));
- if (err) {
- goto out_freepath;
- }
+ if (err)
+ goto out_err;
ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
ceph_pagelist_encode_string(pagelist, path, pathlen);
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
-out_freepath:
- ceph_mdsc_free_path(path, pathlen);
}
out_err:
- if (err >= 0)
+ ceph_mdsc_free_path(path, pathlen);
+ if (!err)
recon_state->nr_caps++;
return err;
}
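A toy sketch of the sizing arithmetic above for the v2+ record: the encoded cap is a length-prefixed path string followed by the fixed record, plus 8 bytes of snap_follows when struct_v >= 2. Field widths below are illustrative, not the actual ceph wire struct:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct rec_v2_model {			/* stand-in for rec.v2 */
	uint64_t cap_id;
	uint32_t wanted, issued;
	uint64_t snaprealm, pathbase;
	uint32_t flock_len;
} __attribute__((packed));

int main(void)
{
	const char *path = "dir/file";
	uint32_t pathlen = strlen(path);
	int struct_v = 2;

	/* length-prefixed path string + fixed record, as in the code above */
	uint32_t struct_len = sizeof(uint32_t) + pathlen +
			      sizeof(struct rec_v2_model);
	if (struct_v >= 2)
		struct_len += sizeof(uint64_t);	/* snap_follows */

	printf("encoded cap record: %u bytes\n", struct_len);
	return 0;
}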
@@ -3602,8 +4130,6 @@ fail:
* recovering MDS might have.
*
* This is a relatively heavyweight operation, but it's rare.
- *
- * called with mdsc->mutex held.
*/
static void send_mds_reconnect(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
@@ -3626,6 +4152,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
if (!reply)
goto fail_nomsg;
+ xa_destroy(&session->s_delegated_inos);
+
mutex_lock(&session->s_mutex);
session->s_state = CEPH_MDS_SESSION_RECONNECTING;
session->s_seq = 0;
@@ -3633,9 +4161,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
dout("session %p state %s\n", session,
ceph_session_state_name(session->s_state));
- spin_lock(&session->s_gen_ttl_lock);
- session->s_cap_gen++;
- spin_unlock(&session->s_gen_ttl_lock);
+ atomic_inc(&session->s_cap_gen);
spin_lock(&session->s_cap_lock);
/* don't know if session is readonly */
@@ -3681,7 +4207,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
recon_state.msg_version = 2;
}
/* traverse this session's caps */
- err = ceph_iterate_session_caps(session, encode_caps_cb, &recon_state);
+ err = ceph_iterate_session_caps(session, reconnect_caps_cb, &recon_state);
spin_lock(&session->s_cap_lock);
session->s_cap_reconnect = 0;
@@ -3782,13 +4308,21 @@ static void check_new_map(struct ceph_mds_client *mdsc,
struct ceph_mdsmap *newmap,
struct ceph_mdsmap *oldmap)
{
- int i;
+ int i, j, err;
int oldstate, newstate;
struct ceph_mds_session *s;
+ unsigned long targets[DIV_ROUND_UP(CEPH_MAX_MDS, sizeof(unsigned long))] = {0};
dout("check_new_map new %u old %u\n",
newmap->m_epoch, oldmap->m_epoch);
+ if (newmap->m_info) {
+ for (i = 0; i < newmap->possible_max_rank; i++) {
+ for (j = 0; j < newmap->m_info[i].num_export_targets; j++)
+ set_bit(newmap->m_info[i].export_targets[j], targets);
+ }
+ }
+
for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) {
if (!mdsc->sessions[i])
continue;
@@ -3842,6 +4376,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
newstate >= CEPH_MDS_STATE_RECONNECT) {
mutex_unlock(&mdsc->mutex);
+ clear_bit(i, targets);
send_mds_reconnect(mdsc, s);
mutex_lock(&mdsc->mutex);
}
@@ -3855,11 +4390,60 @@ static void check_new_map(struct ceph_mds_client *mdsc,
oldstate != CEPH_MDS_STATE_STARTING)
pr_info("mds%d recovery completed\n", s->s_mds);
kick_requests(mdsc, i);
+ mutex_unlock(&mdsc->mutex);
+ mutex_lock(&s->s_mutex);
+ mutex_lock(&mdsc->mutex);
ceph_kick_flushing_caps(mdsc, s);
+ mutex_unlock(&s->s_mutex);
wake_up_session_caps(s, RECONNECT);
}
}
+ /*
+ * Only open and reconnect sessions that don't exist yet.
+ */
+ for (i = 0; i < newmap->possible_max_rank; i++) {
+ /*
+ * If the import MDS crashed just after the
+ * EImportStart journal was flushed, then when a
+ * standby MDS takes over and replays the
+ * EImportStart journal, the new MDS daemon will
+ * wait for the client to reconnect, but the
+ * client may never have registered/opened the
+ * session yet.
+ *
+ * Try to reconnect to that MDS daemon if its
+ * rank number is in the export targets array and
+ * it is in the up:reconnect state.
+ */
+ newstate = ceph_mdsmap_get_state(newmap, i);
+ if (!test_bit(i, targets) || newstate != CEPH_MDS_STATE_RECONNECT)
+ continue;
+
+ /*
+ * In rare cases the session may already have been
+ * registered and opened by requests that chose
+ * random MDSes during the mdsc->mutex unlock/lock
+ * gap below. But the related MDS daemon will just
+ * queue those requests and keep waiting for the
+ * client's reconnection request in the up:reconnect
+ * state.
+ */
+ s = __ceph_lookup_mds_session(mdsc, i);
+ if (likely(!s)) {
+ s = __open_export_target_session(mdsc, i);
+ if (IS_ERR(s)) {
+ err = PTR_ERR(s);
+ pr_err("failed to open export target session, err %d\n",
+ err);
+ continue;
+ }
+ }
+ dout("send reconnect to export target mds.%d\n", i);
+ mutex_unlock(&mdsc->mutex);
+ send_mds_reconnect(mdsc, s);
+ ceph_put_mds_session(s);
+ mutex_lock(&mdsc->mutex);
+ }
+
for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) {
s = mdsc->sessions[i];
if (!s)
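The export-target bookkeeping at the top of check_new_map() is a plain bitmap over MDS ranks. A self-contained model of that pattern, where the _MODEL names are stand-ins for the kernel macros and CEPH_MAX_MDS:

#include <stdio.h>

#define MAX_MDS_MODEL 256			/* stands in for CEPH_MAX_MDS */
#define BITS_PER_LONG_MODEL (8 * sizeof(unsigned long))
#define DIV_ROUND_UP_MODEL(n, d) (((n) + (d) - 1) / (d))

static void set_bit_model(int nr, unsigned long *map)
{
	map[nr / BITS_PER_LONG_MODEL] |= 1UL << (nr % BITS_PER_LONG_MODEL);
}

static int test_bit_model(int nr, const unsigned long *map)
{
	return !!(map[nr / BITS_PER_LONG_MODEL] &
		  (1UL << (nr % BITS_PER_LONG_MODEL)));
}

int main(void)
{
	unsigned long targets[DIV_ROUND_UP_MODEL(MAX_MDS_MODEL,
						 BITS_PER_LONG_MODEL)] = {0};

	set_bit_model(3, targets);		/* mds.3 is an export target */
	printf("mds.3 target? %d, mds.4 target? %d\n",
	       test_bit_model(3, targets), test_bit_model(4, targets));
	return 0;
}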
@@ -3928,7 +4512,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
dname.len, dname.name);
mutex_lock(&session->s_mutex);
- session->s_seq++;
+ inc_session_sequence(session);
if (!inode) {
dout("handle_lease no inode %llx\n", vino.ino);
@@ -3962,7 +4546,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
case CEPH_MDS_LEASE_RENEW:
if (di->lease_session == session &&
- di->lease_gen == session->s_cap_gen &&
+ di->lease_gen == atomic_read(&session->s_cap_gen) &&
di->lease_renew_from &&
di->lease_renew_after == 0) {
unsigned long duration =
@@ -3990,8 +4574,7 @@ release:
out:
mutex_unlock(&session->s_mutex);
- /* avoid calling iput_final() in mds dispatch threads */
- ceph_async_iput(inode);
+ iput(inode);
return;
bad:
@@ -4027,35 +4610,17 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
memcpy((void *)(lease + 1) + 4,
dentry->d_name.name, dentry->d_name.len);
spin_unlock(&dentry->d_lock);
- /*
- * if this is a preemptive lease RELEASE, no need to
- * flush request stream, since the actual request will
- * soon follow.
- */
- msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
ceph_con_send(&session->s_con, msg);
}
/*
- * lock unlock sessions, to wait ongoing session activities
+ * lock unlock the session, to wait for ongoing session activities
*/
-static void lock_unlock_sessions(struct ceph_mds_client *mdsc)
+static void lock_unlock_session(struct ceph_mds_session *s)
{
- int i;
-
- mutex_lock(&mdsc->mutex);
- for (i = 0; i < mdsc->max_sessions; i++) {
- struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
- if (!s)
- continue;
- mutex_unlock(&mdsc->mutex);
- mutex_lock(&s->s_mutex);
- mutex_unlock(&s->s_mutex);
- ceph_put_mds_session(s);
- mutex_lock(&mdsc->mutex);
- }
- mutex_unlock(&mdsc->mutex);
+ mutex_lock(&s->s_mutex);
+ mutex_unlock(&s->s_mutex);
}
static void maybe_recover_session(struct ceph_mds_client *mdsc)
@@ -4068,38 +4633,84 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED)
return;
- if (!READ_ONCE(fsc->blacklisted))
- return;
-
- if (fsc->last_auto_reconnect &&
- time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30))
+ if (!READ_ONCE(fsc->blocklisted))
return;
- pr_info("auto reconnect after blacklisted\n");
- fsc->last_auto_reconnect = jiffies;
+ pr_info("auto reconnect after blocklisted\n");
ceph_force_reconnect(fsc->sb);
}
+bool check_session_state(struct ceph_mds_session *s)
+{
+ switch (s->s_state) {
+ case CEPH_MDS_SESSION_OPEN:
+ if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
+ s->s_state = CEPH_MDS_SESSION_HUNG;
+ pr_info("mds%d hung\n", s->s_mds);
+ }
+ break;
+ case CEPH_MDS_SESSION_CLOSING:
+ case CEPH_MDS_SESSION_NEW:
+ case CEPH_MDS_SESSION_RESTARTING:
+ case CEPH_MDS_SESSION_CLOSED:
+ case CEPH_MDS_SESSION_REJECTED:
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * If the sequence is incremented while we're waiting on a REQUEST_CLOSE reply,
+ * then we need to retransmit that request.
+ */
+void inc_session_sequence(struct ceph_mds_session *s)
+{
+ lockdep_assert_held(&s->s_mutex);
+
+ s->s_seq++;
+
+ if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
+ int ret;
+
+ dout("resending session close request for mds%d\n", s->s_mds);
+ ret = request_close_session(s);
+ if (ret < 0)
+ pr_err("unable to close session to mds%d: %d\n",
+ s->s_mds, ret);
+ }
+}
+
/*
- * delayed work -- periodically trim expired leases, renew caps with mds
+ * delayed work -- periodically trim expired leases, renew caps with mds. If
+ * the @delay parameter is set to 0 or if it's more than 5 secs, the default
+ * workqueue delay value of 5 secs will be used.
*/
-static void schedule_delayed(struct ceph_mds_client *mdsc)
+static void schedule_delayed(struct ceph_mds_client *mdsc, unsigned long delay)
{
- int delay = 5;
- unsigned hz = round_jiffies_relative(HZ * delay);
- schedule_delayed_work(&mdsc->delayed_work, hz);
+ unsigned long max_delay = HZ * 5;
+
+ /* 5 secs default delay */
+ if (!delay || (delay > max_delay))
+ delay = max_delay;
+ schedule_delayed_work(&mdsc->delayed_work,
+ round_jiffies_relative(delay));
}
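The clamping above is easy to model standalone: any delay outside (0, 5*HZ] collapses to the 5-second default. The HZ value below is illustrative, since the real tick rate is config-dependent:

#include <stdio.h>

#define HZ_MODEL 250	/* illustrative tick rate */

/* Model of the clamping above: 0 or anything beyond 5s becomes 5s */
static unsigned long clamp_delay(unsigned long delay)
{
	unsigned long max_delay = HZ_MODEL * 5;

	if (!delay || delay > max_delay)
		delay = max_delay;
	return delay;
}

int main(void)
{
	printf("%lu %lu %lu\n", clamp_delay(0),
	       clamp_delay(HZ_MODEL), clamp_delay(HZ_MODEL * 60));
	return 0;
}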
static void delayed_work(struct work_struct *work)
{
- int i;
struct ceph_mds_client *mdsc =
container_of(work, struct ceph_mds_client, delayed_work.work);
+ unsigned long delay;
int renew_interval;
int renew_caps;
+ int i;
dout("mdsc delayed_work\n");
+ if (mdsc->stopping)
+ return;
+
mutex_lock(&mdsc->mutex);
renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
renew_caps = time_after_eq(jiffies, HZ*renew_interval +
@@ -4111,23 +4722,8 @@ static void delayed_work(struct work_struct *work)
struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
if (!s)
continue;
- if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
- dout("resending session close request for mds%d\n",
- s->s_mds);
- request_close_session(mdsc, s);
- ceph_put_mds_session(s);
- continue;
- }
- if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
- if (s->s_state == CEPH_MDS_SESSION_OPEN) {
- s->s_state = CEPH_MDS_SESSION_HUNG;
- pr_info("mds%d hung\n", s->s_mds);
- }
- }
- if (s->s_state == CEPH_MDS_SESSION_NEW ||
- s->s_state == CEPH_MDS_SESSION_RESTARTING ||
- s->s_state == CEPH_MDS_SESSION_REJECTED) {
- /* this mds is failed or recovering, just wait */
+
+ if (!check_session_state(s)) {
ceph_put_mds_session(s);
continue;
}
@@ -4148,7 +4744,7 @@ static void delayed_work(struct work_struct *work)
}
mutex_unlock(&mdsc->mutex);
- ceph_check_delayed_caps(mdsc);
+ delay = ceph_check_delayed_caps(mdsc);
ceph_queue_cap_reclaim_work(mdsc);
@@ -4156,13 +4752,14 @@ static void delayed_work(struct work_struct *work)
maybe_recover_session(mdsc);
- schedule_delayed(mdsc);
+ schedule_delayed(mdsc, delay);
}
int ceph_mdsc_init(struct ceph_fs_client *fsc)
{
struct ceph_mds_client *mdsc;
+ int err;
mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
if (!mdsc)
@@ -4171,29 +4768,19 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
mutex_init(&mdsc->mutex);
mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
if (!mdsc->mdsmap) {
- kfree(mdsc);
- return -ENOMEM;
+ err = -ENOMEM;
+ goto err_mdsc;
}
- fsc->mdsc = mdsc;
init_completion(&mdsc->safe_umount_waiters);
init_waitqueue_head(&mdsc->session_close_wq);
INIT_LIST_HEAD(&mdsc->waiting_for_map);
- mdsc->sessions = NULL;
- atomic_set(&mdsc->num_sessions, 0);
- mdsc->max_sessions = 0;
- mdsc->stopping = 0;
- atomic64_set(&mdsc->quotarealms_count, 0);
mdsc->quotarealms_inodes = RB_ROOT;
mutex_init(&mdsc->quotarealms_inodes_mutex);
- mdsc->last_snap_seq = 0;
init_rwsem(&mdsc->snap_rwsem);
mdsc->snap_realms = RB_ROOT;
INIT_LIST_HEAD(&mdsc->snap_empty);
- mdsc->num_snap_realms = 0;
spin_lock_init(&mdsc->snap_empty_lock);
- mdsc->last_tid = 0;
- mdsc->oldest_tid = 0;
mdsc->request_tree = RB_ROOT;
INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
mdsc->last_renew_caps = jiffies;
@@ -4204,13 +4791,13 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
spin_lock_init(&mdsc->snap_flush_lock);
mdsc->last_cap_flush_tid = 1;
INIT_LIST_HEAD(&mdsc->cap_flush_list);
- INIT_LIST_HEAD(&mdsc->cap_dirty);
INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
- mdsc->num_cap_flushing = 0;
spin_lock_init(&mdsc->cap_dirty_lock);
init_waitqueue_head(&mdsc->cap_flushing_wq);
INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
- atomic_set(&mdsc->cap_reclaim_pending, 0);
+ err = ceph_metric_init(&mdsc->metric);
+ if (err)
+ goto err_mdsmap;
spin_lock_init(&mdsc->dentry_list_lock);
INIT_LIST_HEAD(&mdsc->dentry_leases);
@@ -4228,7 +4815,15 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
strscpy(mdsc->nodename, utsname()->nodename,
sizeof(mdsc->nodename));
+
+ fsc->mdsc = mdsc;
return 0;
+
+err_mdsmap:
+ kfree(mdsc->mdsmap);
+err_mdsc:
+ kfree(mdsc);
+ return err;
}
/*
@@ -4261,6 +4856,30 @@ static void wait_requests(struct ceph_mds_client *mdsc)
dout("wait_requests done\n");
}
+void send_flush_mdlog(struct ceph_mds_session *s)
+{
+ struct ceph_msg *msg;
+
+ /*
+ * Pre-luminous MDS crashes when it sees an unknown session request
+ */
+ if (!CEPH_HAVE_FEATURE(s->s_con.peer_features, SERVER_LUMINOUS))
+ return;
+
+ mutex_lock(&s->s_mutex);
+ dout("request mdlog flush to mds%d (%s)s seq %lld\n", s->s_mds,
+ ceph_session_state_name(s->s_state), s->s_seq);
+ msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG,
+ s->s_seq);
+ if (!msg) {
+ pr_err("failed to request mdlog flush to mds%d (%s) seq %lld\n",
+ s->s_mds, ceph_session_state_name(s->s_state), s->s_seq);
+ } else {
+ ceph_con_send(&s->s_con, msg);
+ }
+ mutex_unlock(&s->s_mutex);
+}
+
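send_flush_mdlog() is typically driven through ceph_mdsc_iterate_sessions() (see ceph_mdsc_pre_umount() below). A simplified model of that iteration pattern, with the locking and session refcounting omitted:

#include <stdio.h>

struct session_model {
	int mds;
	int state_ok;	/* models check_session_state() */
};

/* Call @cb on every session, optionally skipping bad-state sessions */
static void iterate_sessions(struct session_model *s, int n,
			     void (*cb)(struct session_model *),
			     int check_state)
{
	int i;

	for (i = 0; i < n; i++) {
		if (check_state && !s[i].state_ok)
			continue;
		cb(&s[i]);
	}
}

static void flush_mdlog_cb(struct session_model *s)
{
	printf("flush mdlog to mds%d\n", s->mds);
}

int main(void)
{
	struct session_model s[] = { { 0, 1 }, { 1, 0 }, { 2, 1 } };

	iterate_sessions(s, 3, flush_mdlog_cb, 1);	/* mds0 and mds2 */
	return 0;
}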
/*
* called before mount is ro, and before dentries are torn down.
* (hmm, does this still race with new lookups?)
@@ -4270,7 +4889,8 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
dout("pre_umount\n");
mdsc->stopping = 1;
- lock_unlock_sessions(mdsc);
+ ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true);
+ ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
ceph_flush_dirty_caps(mdsc);
wait_requests(mdsc);
@@ -4284,15 +4904,17 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
}
/*
- * wait for all write mds requests to flush.
+ * flush the mdlog and wait for all write mds requests to flush.
*/
-static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
+static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
+ u64 want_tid)
{
struct ceph_mds_request *req = NULL, *nextreq;
+ struct ceph_mds_session *last_session = NULL;
struct rb_node *n;
mutex_lock(&mdsc->mutex);
- dout("wait_unsafe_requests want %lld\n", want_tid);
+ dout("%s want %lld\n", __func__, want_tid);
restart:
req = __get_oldest_req(mdsc);
while (req && req->r_tid <= want_tid) {
@@ -4304,14 +4926,32 @@ restart:
nextreq = NULL;
if (req->r_op != CEPH_MDS_OP_SETFILELOCK &&
(req->r_op & CEPH_MDS_OP_WRITE)) {
+ struct ceph_mds_session *s = req->r_session;
+
+ if (!s) {
+ req = nextreq;
+ continue;
+ }
+
/* write op */
ceph_mdsc_get_request(req);
if (nextreq)
ceph_mdsc_get_request(nextreq);
+ s = ceph_get_mds_session(s);
mutex_unlock(&mdsc->mutex);
- dout("wait_unsafe_requests wait on %llu (want %llu)\n",
+
+ /* send flush mdlog request to MDS */
+ if (last_session != s) {
+ send_flush_mdlog(s);
+ ceph_put_mds_session(last_session);
+ last_session = s;
+ } else {
+ ceph_put_mds_session(s);
+ }
+ dout("%s wait on %llu (want %llu)\n", __func__,
req->r_tid, want_tid);
wait_for_completion(&req->r_safe_completion);
+
mutex_lock(&mdsc->mutex);
ceph_mdsc_put_request(req);
if (!nextreq)
@@ -4326,14 +4966,15 @@ restart:
req = nextreq;
}
mutex_unlock(&mdsc->mutex);
- dout("wait_unsafe_requests done\n");
+ ceph_put_mds_session(last_session);
+ dout("%s done\n", __func__);
}
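The last_session bookkeeping above sends one mdlog flush per run of consecutive requests owned by the same session, rather than one per request. A compact model of that deduplication over a tid-ordered list:

#include <stdio.h>

struct req_model {
	long long tid;
	int session;	/* owning mds session id */
};

int main(void)
{
	/* requests ordered by tid, as in the rbtree walk above */
	struct req_model reqs[] = {
		{ 1, 0 }, { 2, 0 }, { 3, 1 }, { 4, 1 }, { 5, 0 },
	};
	int last_session = -1;
	int i;

	for (i = 0; i < 5; i++) {
		if (reqs[i].session != last_session) {
			printf("flush mdlog to mds%d before waiting on tid %lld\n",
			       reqs[i].session, reqs[i].tid);
			last_session = reqs[i].session;
		}
	}
	return 0;
}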
void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
{
u64 want_tid, want_flush;
- if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
+ if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN)
return;
dout("sync\n");
@@ -4355,7 +4996,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
dout("sync want tid %lld flush_seq %lld\n",
want_tid, want_flush);
- wait_unsafe_requests(mdsc, want_tid);
+ flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
wait_caps_flush(mdsc, want_flush);
}
@@ -4420,7 +5061,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
mutex_unlock(&mdsc->mutex);
ceph_cleanup_snapid_map(mdsc);
- ceph_cleanup_empty_realms(mdsc);
+ ceph_cleanup_global_and_empty_realms(mdsc);
cancel_work_sync(&mdsc->cap_reclaim_work);
cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
@@ -4465,7 +5106,16 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
{
dout("stop\n");
- cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
+ /*
+ * Make sure the delayed work has stopped before
+ * releasing the resources.
+ *
+ * cancel_delayed_work_sync() only guarantees that
+ * the work finishes executing, but the delayed work
+ * can re-arm itself after that.
+ */
+ flush_delayed_work(&mdsc->delayed_work);
+
if (mdsc->mdsmap)
ceph_mdsmap_destroy(mdsc->mdsmap);
kfree(mdsc->sessions);
@@ -4486,6 +5136,8 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
ceph_mdsc_stop(mdsc);
+ ceph_metric_destroy(&mdsc->metric);
+
fsc->mdsc = NULL;
kfree(mdsc);
dout("mdsc_destroy %p done\n", mdsc);
@@ -4498,10 +5150,8 @@ void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
void *p = msg->front.iov_base;
void *end = p + msg->front.iov_len;
u32 epoch;
- u32 map_len;
u32 num_fs;
u32 mount_fscid = (u32)-1;
- u8 struct_v, struct_cv;
int err = -EINVAL;
ceph_decode_need(&p, end, sizeof(u32), bad);
@@ -4509,24 +5159,17 @@ void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
dout("handle_fsmap epoch %u\n", epoch);
- ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
- struct_v = ceph_decode_8(&p);
- struct_cv = ceph_decode_8(&p);
- map_len = ceph_decode_32(&p);
-
- ceph_decode_need(&p, end, sizeof(u32) * 3, bad);
- p += sizeof(u32) * 2; /* skip epoch and legacy_client_fscid */
+ /* struct_v, struct_cv, map_len, epoch, legacy_client_fscid */
+ ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 3, bad);
- num_fs = ceph_decode_32(&p);
+ ceph_decode_32_safe(&p, end, num_fs, bad);
while (num_fs-- > 0) {
void *info_p, *info_end;
u32 info_len;
- u8 info_v, info_cv;
u32 fscid, namelen;
ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
- info_v = ceph_decode_8(&p);
- info_cv = ceph_decode_8(&p);
+ p += 2; // info_v, info_cv
info_len = ceph_decode_32(&p);
ceph_decode_need(&p, end, info_len, bad);
info_p = p;
@@ -4559,7 +5202,8 @@ void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
return;
bad:
- pr_err("error decoding fsmap\n");
+ pr_err("error decoding fsmap %d. Shutting down mount.\n", err);
+ ceph_umount_begin(mdsc->fsc->sb);
err_out:
mutex_lock(&mdsc->mutex);
mdsc->mdsmap_err = err;
@@ -4597,7 +5241,7 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
return;
}
- newmap = ceph_mdsmap_decode(&p, end);
+ newmap = ceph_mdsmap_decode(&p, end, ceph_msgr2(mdsc->fsc->client));
if (IS_ERR(newmap)) {
err = PTR_ERR(newmap);
goto bad_unlock;
@@ -4620,17 +5264,18 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
mdsc->mdsmap->m_epoch);
mutex_unlock(&mdsc->mutex);
- schedule_delayed(mdsc);
+ schedule_delayed(mdsc, 0);
return;
bad_unlock:
mutex_unlock(&mdsc->mutex);
bad:
- pr_err("error decoding mdsmap %d\n", err);
+ pr_err("error decoding mdsmap %d. Shutting down mount.\n", err);
+ ceph_umount_begin(mdsc->fsc->sb);
return;
}
-static struct ceph_connection *con_get(struct ceph_connection *con)
+static struct ceph_connection *mds_get_con(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
@@ -4639,7 +5284,7 @@ static struct ceph_connection *con_get(struct ceph_connection *con)
return NULL;
}
-static void con_put(struct ceph_connection *con)
+static void mds_put_con(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
@@ -4650,7 +5295,7 @@ static void con_put(struct ceph_connection *con)
* if the client is unresponsive for long enough, the mds will kill
* the session entirely.
*/
-static void peer_reset(struct ceph_connection *con)
+static void mds_peer_reset(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
@@ -4659,7 +5304,7 @@ static void peer_reset(struct ceph_connection *con)
send_mds_reconnect(mdsc, s);
}
-static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
@@ -4717,35 +5362,24 @@ out:
* Note: returned pointer is the address of a structure that's
* managed separately. Caller must *not* attempt to free it.
*/
-static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
- int *proto, int force_new)
+static struct ceph_auth_handshake *
+mds_get_authorizer(struct ceph_connection *con, int *proto, int force_new)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
struct ceph_auth_handshake *auth = &s->s_auth;
+ int ret;
- if (force_new && auth->authorizer) {
- ceph_auth_destroy_authorizer(auth->authorizer);
- auth->authorizer = NULL;
- }
- if (!auth->authorizer) {
- int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
- auth);
- if (ret)
- return ERR_PTR(ret);
- } else {
- int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
- auth);
- if (ret)
- return ERR_PTR(ret);
- }
- *proto = ac->protocol;
+ ret = __ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_MDS,
+ force_new, proto, NULL, NULL);
+ if (ret)
+ return ERR_PTR(ret);
return auth;
}
-static int add_authorizer_challenge(struct ceph_connection *con,
+static int mds_add_authorizer_challenge(struct ceph_connection *con,
void *challenge_buf, int challenge_buf_len)
{
struct ceph_mds_session *s = con->private;
@@ -4756,16 +5390,19 @@ static int add_authorizer_challenge(struct ceph_connection *con,
challenge_buf, challenge_buf_len);
}
-static int verify_authorizer_reply(struct ceph_connection *con)
+static int mds_verify_authorizer_reply(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &s->s_auth;
- return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer);
+ return ceph_auth_verify_authorizer_reply(ac, auth->authorizer,
+ auth->authorizer_reply_buf, auth->authorizer_reply_buf_len,
+ NULL, NULL, NULL, NULL);
}
-static int invalidate_authorizer(struct ceph_connection *con)
+static int mds_invalidate_authorizer(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
@@ -4776,6 +5413,80 @@ static int invalidate_authorizer(struct ceph_connection *con)
return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
}
+static int mds_get_auth_request(struct ceph_connection *con,
+ void *buf, int *buf_len,
+ void **authorizer, int *authorizer_len)
+{
+ struct ceph_mds_session *s = con->private;
+ struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &s->s_auth;
+ int ret;
+
+ ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_MDS,
+ buf, buf_len);
+ if (ret)
+ return ret;
+
+ *authorizer = auth->authorizer_buf;
+ *authorizer_len = auth->authorizer_buf_len;
+ return 0;
+}
+
+static int mds_handle_auth_reply_more(struct ceph_connection *con,
+ void *reply, int reply_len,
+ void *buf, int *buf_len,
+ void **authorizer, int *authorizer_len)
+{
+ struct ceph_mds_session *s = con->private;
+ struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &s->s_auth;
+ int ret;
+
+ ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len,
+ buf, buf_len);
+ if (ret)
+ return ret;
+
+ *authorizer = auth->authorizer_buf;
+ *authorizer_len = auth->authorizer_buf_len;
+ return 0;
+}
+
+static int mds_handle_auth_done(struct ceph_connection *con,
+ u64 global_id, void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{
+ struct ceph_mds_session *s = con->private;
+ struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &s->s_auth;
+
+ return ceph_auth_handle_svc_reply_done(ac, auth, reply, reply_len,
+ session_key, session_key_len,
+ con_secret, con_secret_len);
+}
+
+static int mds_handle_auth_bad_method(struct ceph_connection *con,
+ int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt)
+{
+ struct ceph_mds_session *s = con->private;
+ struct ceph_mon_client *monc = &s->s_mdsc->fsc->client->monc;
+ int ret;
+
+ if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_MDS,
+ used_proto, result,
+ allowed_protos, proto_cnt,
+ allowed_modes, mode_cnt)) {
+ ret = ceph_monc_validate_auth(monc);
+ if (ret)
+ return ret;
+ }
+
+ return -EACCES;
+}
+
static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,
struct ceph_msg_header *hdr, int *skip)
{
@@ -4814,17 +5525,21 @@ static int mds_check_message_signature(struct ceph_msg *msg)
}
static const struct ceph_connection_operations mds_con_ops = {
- .get = con_get,
- .put = con_put,
- .dispatch = dispatch,
- .get_authorizer = get_authorizer,
- .add_authorizer_challenge = add_authorizer_challenge,
- .verify_authorizer_reply = verify_authorizer_reply,
- .invalidate_authorizer = invalidate_authorizer,
- .peer_reset = peer_reset,
+ .get = mds_get_con,
+ .put = mds_put_con,
.alloc_msg = mds_alloc_msg,
+ .dispatch = mds_dispatch,
+ .peer_reset = mds_peer_reset,
+ .get_authorizer = mds_get_authorizer,
+ .add_authorizer_challenge = mds_add_authorizer_challenge,
+ .verify_authorizer_reply = mds_verify_authorizer_reply,
+ .invalidate_authorizer = mds_invalidate_authorizer,
.sign_message = mds_sign_message,
.check_message_signature = mds_check_message_signature,
+ .get_auth_request = mds_get_auth_request,
+ .handle_auth_reply_more = mds_handle_auth_reply_more,
+ .handle_auth_done = mds_handle_auth_done,
+ .handle_auth_bad_method = mds_handle_auth_bad_method,
};
/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 27a7446e10d3..0598faa50e2e 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -10,12 +10,16 @@
#include <linux/spinlock.h>
#include <linux/refcount.h>
#include <linux/utsname.h>
+#include <linux/ktime.h>
#include <linux/ceph/types.h>
#include <linux/ceph/messenger.h>
#include <linux/ceph/mdsmap.h>
#include <linux/ceph/auth.h>
+#include "metric.h"
+#include "super.h"
+
/* The first 8 bits are reserved for old ceph releases */
enum ceph_feature_type {
CEPHFS_FEATURE_MIMIC = 8,
@@ -23,24 +27,26 @@ enum ceph_feature_type {
CEPHFS_FEATURE_RECLAIM_CLIENT,
CEPHFS_FEATURE_LAZY_CAP_WANTED,
CEPHFS_FEATURE_MULTI_RECONNECT,
+ CEPHFS_FEATURE_DELEG_INO,
+ CEPHFS_FEATURE_METRIC_COLLECT,
+ CEPHFS_FEATURE_ALTERNATE_NAME,
+ CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
+ CEPHFS_FEATURE_OP_GETVXATTR,
- CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_MULTI_RECONNECT,
+ CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR,
};
-/*
- * This will always have the highest feature bit value
- * as the last element of the array.
- */
#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
0, 1, 2, 3, 4, 5, 6, 7, \
CEPHFS_FEATURE_MIMIC, \
CEPHFS_FEATURE_REPLY_ENCODING, \
CEPHFS_FEATURE_LAZY_CAP_WANTED, \
CEPHFS_FEATURE_MULTI_RECONNECT, \
- \
- CEPHFS_FEATURE_MAX, \
+ CEPHFS_FEATURE_DELEG_INO, \
+ CEPHFS_FEATURE_METRIC_COLLECT, \
+ CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \
+ CEPHFS_FEATURE_OP_GETVXATTR, \
}
-#define CEPHFS_FEATURES_CLIENT_REQUIRED {}
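The feature list above travels as a bit mask in the session message (cf. the feature-bits decode in handle_session()). A quick sketch of packing such bit numbers into a mask; the subset of bits is illustrative only:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* bit numbers in the spirit of CEPHFS_FEATURES_CLIENT_SUPPORTED */
	int features[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12 };
	uint64_t mask = 0;
	size_t i;

	for (i = 0; i < sizeof(features) / sizeof(features[0]); i++)
		mask |= UINT64_C(1) << features[i];
	printf("feature mask: 0x%llx\n", (unsigned long long)mask);
	return 0;
}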
/*
* Some lock dependencies:
@@ -80,6 +86,7 @@ struct ceph_mds_reply_info_in {
s32 dir_pin;
struct ceph_timespec btime;
struct ceph_timespec snap_btime;
+ u64 rsnaps;
u64 change_attr;
};
@@ -91,6 +98,11 @@ struct ceph_mds_reply_dir_entry {
loff_t offset;
};
+struct ceph_mds_reply_xattr {
+ char *xattr_value;
+ size_t xattr_value_len;
+};
+
/*
* parsed info about an mds reply, including information about
* either: 1) the target inode and/or its parent directory and dentry,
@@ -106,6 +118,7 @@ struct ceph_mds_reply_info_parsed {
char *dname;
u32 dname_len;
struct ceph_mds_reply_lease *dlease;
+ struct ceph_mds_reply_xattr xattr_info;
/* extra */
union {
@@ -177,10 +190,8 @@ struct ceph_mds_session {
struct ceph_auth_handshake s_auth;
- /* protected by s_gen_ttl_lock */
- spinlock_t s_gen_ttl_lock;
- u32 s_cap_gen; /* inc each time we get mds stale msg */
- unsigned long s_cap_ttl; /* when session caps expire */
+ atomic_t s_cap_gen; /* inc each time we get mds stale msg */
+ unsigned long s_cap_ttl; /* when session caps expire. protected by s_mutex */
/* protected by s_cap_lock */
spinlock_t s_cap_lock;
@@ -194,13 +205,18 @@ struct ceph_mds_session {
struct list_head s_cap_releases; /* waiting cap_release messages */
struct work_struct s_cap_release_work;
- /* protected by mutex */
+ /* See ceph_inode_info->i_dirty_item. */
+ struct list_head s_cap_dirty; /* inodes w/ dirty caps */
+
+ /* See ceph_inode_info->i_flushing_item. */
struct list_head s_cap_flushing; /* inodes w/ flushing caps */
+
unsigned long s_renew_requested; /* last time we sent a renew req */
u64 s_renew_seq;
struct list_head s_waiting; /* waiting requests */
struct list_head s_unsafe; /* unsafe requests */
+ struct xarray s_delegated_inos;
};
/*
@@ -255,14 +271,15 @@ struct ceph_mds_request {
#define CEPH_MDS_R_GOT_RESULT (5) /* got a result */
#define CEPH_MDS_R_DID_PREPOPULATE (6) /* prepopulated readdir */
#define CEPH_MDS_R_PARENT_LOCKED (7) /* is r_parent->i_rwsem wlocked? */
+#define CEPH_MDS_R_ASYNC (8) /* async request */
unsigned long r_req_flags;
struct mutex r_fill_mutex;
union ceph_mds_request_args r_args;
int r_fmode; /* file mode, if expecting cap */
- kuid_t r_uid;
- kgid_t r_gid;
+ int r_request_release_offset;
+ const struct cred *r_cred;
struct timespec64 r_stamp;
/* for choosing which mds to send this request to */
@@ -280,14 +297,19 @@ struct ceph_mds_request {
int r_old_inode_drop, r_old_inode_unless;
struct ceph_msg *r_request; /* original request */
- int r_request_release_offset;
struct ceph_msg *r_reply;
struct ceph_mds_reply_info_parsed r_reply_info;
- struct page *r_locked_page;
int r_err;
+ u32 r_readdir_offset;
+
+ struct page *r_locked_page;
+ int r_dir_caps;
+ int r_num_caps;
unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */
unsigned long r_started; /* start time to measure timeout against */
+ unsigned long r_start_latency; /* start time to measure latency */
+ unsigned long r_end_latency; /* finish time to measure latency */
unsigned long r_request_started; /* start time for mds request only,
used to measure lease durations */
@@ -304,21 +326,21 @@ struct ceph_mds_request {
int r_num_fwd; /* number of forward attempts */
int r_resend_mds; /* mds to resend to next, if any*/
u32 r_sent_on_mseq; /* cap mseq request was sent at*/
+ u64 r_deleg_ino;
struct list_head r_wait;
struct completion r_completion;
struct completion r_safe_completion;
ceph_mds_request_callback_t r_callback;
- ceph_mds_request_wait_callback_t r_wait_for_completion;
struct list_head r_unsafe_item; /* per-session unsafe list item */
long long r_dir_release_cnt;
long long r_dir_ordered_cnt;
int r_readdir_cache_idx;
- u32 r_readdir_offset;
+
+ int r_feature_needed;
struct ceph_cap_reservation r_caps_reservation;
- int r_num_caps;
};
struct ceph_pool_perm {
@@ -352,7 +374,7 @@ struct ceph_quotarealm_inode {
struct cap_wait {
struct list_head list;
- unsigned long ino;
+ u64 ino;
pid_t tgid;
int need;
int want;
@@ -373,7 +395,7 @@ struct ceph_mds_client {
struct ceph_mds_session **sessions; /* NULL for mds if no session */
atomic_t num_sessions;
- int max_sessions; /* len of s_mds_sessions */
+ int max_sessions; /* len of sessions array */
int stopping; /* true if shutting down */
atomic64_t quotarealms_count; /* # realms with quota */
@@ -411,7 +433,6 @@ struct ceph_mds_client {
u64 last_cap_flush_tid;
struct list_head cap_flush_list;
- struct list_head cap_dirty; /* inodes with dirty caps */
struct list_head cap_dirty_migrating; /* ...that are migration... */
int num_cap_flushing; /* # caps we are flushing */
spinlock_t cap_dirty_lock; /* protects above items */
@@ -446,6 +467,8 @@ struct ceph_mds_client {
struct list_head dentry_leases; /* fifo list */
struct list_head dentry_dir_leases; /* lru list */
+ struct ceph_client_metric metric;
+
spinlock_t snapid_map_lock;
struct rb_root snapid_map_tree;
struct list_head snapid_map_lru;
@@ -458,6 +481,9 @@ struct ceph_mds_client {
extern const char *ceph_mds_op_name(int op);
+extern bool check_session_state(struct ceph_mds_session *s);
+void inc_session_sequence(struct ceph_mds_session *s);
+
extern struct ceph_mds_session *
__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
@@ -485,9 +511,14 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
struct inode *dir,
struct ceph_mds_request *req);
+int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req,
+ ceph_mds_request_wait_callback_t wait_func);
extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
struct inode *dir,
struct ceph_mds_request *req);
+extern void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req);
+extern void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req);
static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
{
kref_get(&req->r_kref);
@@ -498,6 +529,11 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
kref_put(&req->r_kref, ceph_mdsc_release_request);
}
+extern void send_flush_mdlog(struct ceph_mds_session *s);
+extern void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
+ void (*cb)(struct ceph_mds_session *),
+ bool check_state);
+extern struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq);
extern void __ceph_queue_cap_release(struct ceph_mds_session *session,
struct ceph_cap *cap);
extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
@@ -512,7 +548,7 @@ extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
static inline void ceph_mdsc_free_path(char *path, int len)
{
- if (path)
+ if (!IS_ERR_OR_NULL(path))
__putname(path - (PATH_MAX - 1 - len));
}
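The pointer arithmetic above undoes the way ceph_mdsc_build_path() fills a PATH_MAX name buffer from the tail and returns a pointer into it. A userspace sketch of that back-to-front construction and the matching free, with the buffer size shrunk for the demo:

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#define PATH_MAX_MODEL 64

/* Build "a/b/c" from its components back-to-front in a fixed buffer and
 * return a pointer to the first character, as ceph_mdsc_build_path() does. */
static char *build_path(const char *comps[], int n, int *plen)
{
	char *buf = malloc(PATH_MAX_MODEL);
	int pos = PATH_MAX_MODEL - 1;
	int i;

	buf[pos] = '\0';
	for (i = n - 1; i >= 0; i--) {
		int len = strlen(comps[i]);

		pos -= len;
		memcpy(buf + pos, comps[i], len);
		if (i)
			buf[--pos] = '/';
	}
	*plen = PATH_MAX_MODEL - 1 - pos;
	return buf + pos;
}

int main(void)
{
	const char *comps[] = { "a", "b", "c" };
	int len;
	char *path = build_path(comps, 3, &len);

	printf("%s (%d)\n", path, len);
	/* the mirror of ceph_mdsc_free_path(): rewind to the allocation */
	free(path - (PATH_MAX_MODEL - 1 - len));
	return 0;
}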
@@ -537,4 +573,16 @@ extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
int max_caps);
+
+static inline int ceph_wait_on_async_create(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT,
+ TASK_KILLABLE);
+}
+
+extern int ceph_wait_on_conflict_unlink(struct dentry *dentry);
+extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);
+extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino);
#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 889627817e52..3fbabc98e1f7 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -29,7 +29,7 @@ static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy)
return -1;
/* pick */
- n = prandom_u32() % n;
+ n = prandom_u32_max(n);
for (j = 0, i = 0; i < m->possible_max_rank; i++) {
if (CEPH_MDS_IS_READY(i, ignore_laggy))
j++;
@@ -114,14 +114,15 @@ bad:
* Ignore any fields we don't care about (there are quite a few of
* them).
*/
-struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
+struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
{
struct ceph_mdsmap *m;
const void *start = *p;
int i, j, n;
int err;
- u8 mdsmap_v, mdsmap_cv;
+ u8 mdsmap_v;
u16 mdsmap_ev;
+ u32 target;
m = kzalloc(sizeof(*m), GFP_NOFS);
if (!m)
@@ -129,7 +130,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
ceph_decode_need(p, end, 1 + 1, bad);
mdsmap_v = ceph_decode_8(p);
- mdsmap_cv = ceph_decode_8(p);
+ *p += sizeof(u8); /* mdsmap_cv */
if (mdsmap_v >= 4) {
u32 mdsmap_len;
ceph_decode_32_safe(p, end, mdsmap_len, bad);
@@ -174,7 +175,6 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
u64 global_id;
u32 namelen;
s32 mds, inc, state;
- u64 state_seq;
u8 info_v;
void *info_end = NULL;
struct ceph_entity_addr addr;
@@ -189,9 +189,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
info_v= ceph_decode_8(p);
if (info_v >= 4) {
u32 info_len;
- u8 info_cv;
ceph_decode_need(p, end, 1 + sizeof(u32), bad);
- info_cv = ceph_decode_8(p);
+ *p += sizeof(u8); /* info_cv */
info_len = ceph_decode_32(p);
info_end = *p + info_len;
if (info_end > end)
@@ -203,18 +202,19 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
namelen = ceph_decode_32(p); /* skip mds name */
*p += namelen;
- ceph_decode_need(p, end,
- 4*sizeof(u32) + sizeof(u64) +
- sizeof(addr) + sizeof(struct ceph_timespec),
- bad);
- mds = ceph_decode_32(p);
- inc = ceph_decode_32(p);
- state = ceph_decode_32(p);
- state_seq = ceph_decode_64(p);
- err = ceph_decode_entity_addr(p, end, &addr);
+ ceph_decode_32_safe(p, end, mds, bad);
+ ceph_decode_32_safe(p, end, inc, bad);
+ ceph_decode_32_safe(p, end, state, bad);
+ *p += sizeof(u64); /* state_seq */
+ if (info_v >= 8)
+ err = ceph_decode_entity_addrvec(p, end, msgr2, &addr);
+ else
+ err = ceph_decode_entity_addr(p, end, &addr);
if (err)
goto corrupt;
- ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
+
+ ceph_decode_copy_safe(p, end, &laggy_since, sizeof(laggy_since),
+ bad);
laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0;
*p += sizeof(u32);
ceph_decode_32_safe(p, end, namelen, bad);
@@ -245,8 +245,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
}
if (state <= 0) {
- pr_warn("mdsmap_decode got incorrect state(%s)\n",
- ceph_mds_state_name(state));
+ dout("mdsmap_decode got incorrect state(%s)\n",
+ ceph_mds_state_name(state));
continue;
}
@@ -261,9 +261,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
sizeof(u32), GFP_NOFS);
if (!info->export_targets)
goto nomem;
- for (j = 0; j < num_export_targets; j++)
- info->export_targets[j] =
- ceph_decode_32(&pexport_targets);
+ for (j = 0; j < num_export_targets; j++) {
+ target = ceph_decode_32(&pexport_targets);
+ info->export_targets[j] = target;
+ }
} else {
info->export_targets = NULL;
}
@@ -351,12 +352,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
__decode_and_drop_type(p, end, u8, bad_ext);
}
if (mdsmap_ev >= 8) {
- u32 name_len;
/* enabled */
ceph_decode_8_safe(p, end, m->m_enabled, bad_ext);
- ceph_decode_32_safe(p, end, name_len, bad_ext);
- ceph_decode_need(p, end, name_len, bad_ext);
- *p += name_len;
+ /* fs_name */
+ ceph_decode_skip_string(p, end, bad_ext);
}
/* damaged */
if (mdsmap_ev >= 9) {
@@ -369,6 +368,22 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
} else {
m->m_damaged = false;
}
+ if (mdsmap_ev >= 17) {
+ /* balancer */
+ ceph_decode_skip_string(p, end, bad_ext);
+ /* standby_count_wanted */
+ ceph_decode_skip_32(p, end, bad_ext);
+ /* old_max_mds */
+ ceph_decode_skip_32(p, end, bad_ext);
+ /* min_compat_client */
+ ceph_decode_skip_8(p, end, bad_ext);
+ /* required_client_features */
+ ceph_decode_skip_set(p, end, 64, bad_ext);
+ ceph_decode_64_safe(p, end, m->m_max_xattr_size, bad_ext);
+ } else {
+ /* This forces the usage of the (sync) SETXATTR Op */
+ m->m_max_xattr_size = 0;
+ }
bad_ext:
dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
!!m->m_enabled, !!m->m_damaged, m->m_num_laggy);
@@ -395,9 +410,11 @@ void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
{
int i;
- for (i = 0; i < m->possible_max_rank; i++)
- kfree(m->m_info[i].export_targets);
- kfree(m->m_info);
+ if (m->m_info) {
+ for (i = 0; i < m->possible_max_rank; i++)
+ kfree(m->m_info[i].export_targets);
+ kfree(m->m_info);
+ }
kfree(m->m_data_pg_pools);
kfree(m);
}
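Much of the mdsmap decoder above just bounds-checks and steps over versioned fields it doesn't care about (mdsmap_cv, fs_name, balancer, and so on). A minimal userspace model of one such skip helper, in the spirit of ceph_decode_skip_string():

#include <stdio.h>
#include <stdint.h>

static uint32_t get_le32(const uint8_t *p)
{
	return (uint32_t)p[0] | (uint32_t)p[1] << 8 |
	       (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;
}

/* Bounds-check a length-prefixed string and step over it, failing the
 * way the "bad_ext" label does on truncated input. */
static int skip_string(const uint8_t **p, const uint8_t *end)
{
	uint32_t len;

	if (end - *p < 4)
		return -1;
	len = get_le32(*p);
	*p += 4;
	if ((uint32_t)(end - *p) < len)
		return -1;
	*p += len;
	return 0;
}

int main(void)
{
	/* len=3, "abc", then a trailing byte we decode afterwards */
	uint8_t buf[] = { 3, 0, 0, 0, 'a', 'b', 'c', 0x7f };
	const uint8_t *p = buf, *end = buf + sizeof(buf);

	if (skip_string(&p, end) == 0)
		printf("next byte after skipped string: 0x%x\n", *p);
	return 0;
}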
diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c
new file mode 100644
index 000000000000..c47347d2e84e
--- /dev/null
+++ b/fs/ceph/metric.c
@@ -0,0 +1,353 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/types.h>
+#include <linux/percpu_counter.h>
+#include <linux/math64.h>
+
+#include "metric.h"
+#include "mds_client.h"
+
+static void ktime_to_ceph_timespec(struct ceph_timespec *ts, ktime_t val)
+{
+ struct timespec64 t = ktime_to_timespec64(val);
+ ceph_encode_timespec64(ts, &t);
+}
+
+static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *s)
+{
+ struct ceph_metric_head *head;
+ struct ceph_metric_cap *cap;
+ struct ceph_metric_read_latency *read;
+ struct ceph_metric_write_latency *write;
+ struct ceph_metric_metadata_latency *meta;
+ struct ceph_metric_dlease *dlease;
+ struct ceph_opened_files *files;
+ struct ceph_pinned_icaps *icaps;
+ struct ceph_opened_inodes *inodes;
+ struct ceph_read_io_size *rsize;
+ struct ceph_write_io_size *wsize;
+ struct ceph_client_metric *m = &mdsc->metric;
+ u64 nr_caps = atomic64_read(&m->total_caps);
+ u32 header_len = sizeof(struct ceph_metric_header);
+ struct ceph_msg *msg;
+ s64 sum;
+ s32 items = 0;
+ s32 len;
+
+ len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write)
+ + sizeof(*meta) + sizeof(*dlease) + sizeof(*files)
+ + sizeof(*icaps) + sizeof(*inodes) + sizeof(*rsize)
+ + sizeof(*wsize);
+
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
+ if (!msg) {
+ pr_err("send metrics to mds%d, failed to allocate message\n",
+ s->s_mds);
+ return false;
+ }
+
+ head = msg->front.iov_base;
+
+ /* encode the cap metric */
+ cap = (struct ceph_metric_cap *)(head + 1);
+ cap->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO);
+ cap->header.ver = 1;
+ cap->header.compat = 1;
+ cap->header.data_len = cpu_to_le32(sizeof(*cap) - header_len);
+ cap->hit = cpu_to_le64(percpu_counter_sum(&m->i_caps_hit));
+ cap->mis = cpu_to_le64(percpu_counter_sum(&m->i_caps_mis));
+ cap->total = cpu_to_le64(nr_caps);
+ items++;
+
+ /* encode the read latency metric */
+ read = (struct ceph_metric_read_latency *)(cap + 1);
+ read->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
+ read->header.ver = 2;
+ read->header.compat = 1;
+ read->header.data_len = cpu_to_le32(sizeof(*read) - header_len);
+ sum = m->metric[METRIC_READ].latency_sum;
+ ktime_to_ceph_timespec(&read->lat, sum);
+ ktime_to_ceph_timespec(&read->avg, m->metric[METRIC_READ].latency_avg);
+ read->sq_sum = cpu_to_le64(m->metric[METRIC_READ].latency_sq_sum);
+ read->count = cpu_to_le64(m->metric[METRIC_READ].total);
+ items++;
+
+ /* encode the write latency metric */
+ write = (struct ceph_metric_write_latency *)(read + 1);
+ write->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
+ write->header.ver = 2;
+ write->header.compat = 1;
+ write->header.data_len = cpu_to_le32(sizeof(*write) - header_len);
+ sum = m->metric[METRIC_WRITE].latency_sum;
+ ktime_to_ceph_timespec(&write->lat, sum);
+ ktime_to_ceph_timespec(&write->avg, m->metric[METRIC_WRITE].latency_avg);
+ write->sq_sum = cpu_to_le64(m->metric[METRIC_WRITE].latency_sq_sum);
+ write->count = cpu_to_le64(m->metric[METRIC_WRITE].total);
+ items++;
+
+ /* encode the metadata latency metric */
+ meta = (struct ceph_metric_metadata_latency *)(write + 1);
+ meta->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
+ meta->header.ver = 2;
+ meta->header.compat = 1;
+ meta->header.data_len = cpu_to_le32(sizeof(*meta) - header_len);
+ sum = m->metric[METRIC_METADATA].latency_sum;
+ ktime_to_ceph_timespec(&meta->lat, sum);
+ ktime_to_ceph_timespec(&meta->avg, m->metric[METRIC_METADATA].latency_avg);
+ meta->sq_sum = cpu_to_le64(m->metric[METRIC_METADATA].latency_sq_sum);
+ meta->count = cpu_to_le64(m->metric[METRIC_METADATA].total);
+ items++;
+
+ /* encode the dentry lease metric */
+ dlease = (struct ceph_metric_dlease *)(meta + 1);
+ dlease->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE);
+ dlease->header.ver = 1;
+ dlease->header.compat = 1;
+ dlease->header.data_len = cpu_to_le32(sizeof(*dlease) - header_len);
+ dlease->hit = cpu_to_le64(percpu_counter_sum(&m->d_lease_hit));
+ dlease->mis = cpu_to_le64(percpu_counter_sum(&m->d_lease_mis));
+ dlease->total = cpu_to_le64(atomic64_read(&m->total_dentries));
+ items++;
+
+ sum = percpu_counter_sum(&m->total_inodes);
+
+ /* encode the opened files metric */
+ files = (struct ceph_opened_files *)(dlease + 1);
+ files->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_FILES);
+ files->header.ver = 1;
+ files->header.compat = 1;
+ files->header.data_len = cpu_to_le32(sizeof(*files) - header_len);
+ files->opened_files = cpu_to_le64(atomic64_read(&m->opened_files));
+ files->total = cpu_to_le64(sum);
+ items++;
+
+ /* encode the pinned icaps metric */
+ icaps = (struct ceph_pinned_icaps *)(files + 1);
+ icaps->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_PINNED_ICAPS);
+ icaps->header.ver = 1;
+ icaps->header.compat = 1;
+ icaps->header.data_len = cpu_to_le32(sizeof(*icaps) - header_len);
+ icaps->pinned_icaps = cpu_to_le64(nr_caps);
+ icaps->total = cpu_to_le64(sum);
+ items++;
+
+ /* encode the opened inodes metric */
+ inodes = (struct ceph_opened_inodes *)(icaps + 1);
+ inodes->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_INODES);
+ inodes->header.ver = 1;
+ inodes->header.compat = 1;
+ inodes->header.data_len = cpu_to_le32(sizeof(*inodes) - header_len);
+ inodes->opened_inodes = cpu_to_le64(percpu_counter_sum(&m->opened_inodes));
+ inodes->total = cpu_to_le64(sum);
+ items++;
+
+ /* encode the read io size metric */
+ rsize = (struct ceph_read_io_size *)(inodes + 1);
+ rsize->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_IO_SIZES);
+ rsize->header.ver = 1;
+ rsize->header.compat = 1;
+ rsize->header.data_len = cpu_to_le32(sizeof(*rsize) - header_len);
+ rsize->total_ops = cpu_to_le64(m->metric[METRIC_READ].total);
+ rsize->total_size = cpu_to_le64(m->metric[METRIC_READ].size_sum);
+ items++;
+
+ /* encode the write io size metric */
+ wsize = (struct ceph_write_io_size *)(rsize + 1);
+ wsize->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_IO_SIZES);
+ wsize->header.ver = 1;
+ wsize->header.compat = 1;
+ wsize->header.data_len = cpu_to_le32(sizeof(*wsize) - header_len);
+ wsize->total_ops = cpu_to_le64(m->metric[METRIC_WRITE].total);
+ wsize->total_size = cpu_to_le64(m->metric[METRIC_WRITE].size_sum);
+ items++;
+
+ put_unaligned_le32(items, &head->num);
+ msg->front.iov_len = len;
+ msg->hdr.version = cpu_to_le16(1);
+ msg->hdr.compat_version = cpu_to_le16(1);
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+ ceph_con_send(&s->s_con, msg);
+
+ return true;
+}
+
+static void metric_get_session(struct ceph_mds_client *mdsc)
+{
+ struct ceph_mds_session *s;
+ int i;
+
+ mutex_lock(&mdsc->mutex);
+ for (i = 0; i < mdsc->max_sessions; i++) {
+ s = __ceph_lookup_mds_session(mdsc, i);
+ if (!s)
+ continue;
+
+ /*
+ * Skip the session if the MDS doesn't support metric collection;
+ * otherwise the MDS will close the session's socket connection
+ * directly when it gets this message.
+ */
+ if (check_session_state(s) &&
+ test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &s->s_features)) {
+ mdsc->metric.session = s;
+ break;
+ }
+
+ ceph_put_mds_session(s);
+ }
+ mutex_unlock(&mdsc->mutex);
+}
+
+static void metric_delayed_work(struct work_struct *work)
+{
+ struct ceph_client_metric *m =
+ container_of(work, struct ceph_client_metric, delayed_work.work);
+ struct ceph_mds_client *mdsc =
+ container_of(m, struct ceph_mds_client, metric);
+
+ if (mdsc->stopping)
+ return;
+
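+ /* drop any dead cached session and try to pick a metric-capable one */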
+ if (!m->session || !check_session_state(m->session)) {
+ if (m->session) {
+ ceph_put_mds_session(m->session);
+ m->session = NULL;
+ }
+ metric_get_session(mdsc);
+ }
+ if (m->session) {
+ ceph_mdsc_send_metrics(mdsc, m->session);
+ metric_schedule_delayed(m);
+ }
+}
+
+int ceph_metric_init(struct ceph_client_metric *m)
+{
+ struct ceph_metric *metric;
+ int ret, i;
+
+ if (!m)
+ return -EINVAL;
+
+ atomic64_set(&m->total_dentries, 0);
+ ret = percpu_counter_init(&m->d_lease_hit, 0, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ ret = percpu_counter_init(&m->d_lease_mis, 0, GFP_KERNEL);
+ if (ret)
+ goto err_d_lease_mis;
+
+ atomic64_set(&m->total_caps, 0);
+ ret = percpu_counter_init(&m->i_caps_hit, 0, GFP_KERNEL);
+ if (ret)
+ goto err_i_caps_hit;
+
+ ret = percpu_counter_init(&m->i_caps_mis, 0, GFP_KERNEL);
+ if (ret)
+ goto err_i_caps_mis;
+
+ for (i = 0; i < METRIC_MAX; i++) {
+ metric = &m->metric[i];
+ spin_lock_init(&metric->lock);
+ metric->size_sum = 0;
+ metric->size_min = U64_MAX;
+ metric->size_max = 0;
+ metric->total = 0;
+ metric->latency_sum = 0;
+ metric->latency_avg = 0;
+ metric->latency_sq_sum = 0;
+ metric->latency_min = KTIME_MAX;
+ metric->latency_max = 0;
+ }
+
+ atomic64_set(&m->opened_files, 0);
+ ret = percpu_counter_init(&m->opened_inodes, 0, GFP_KERNEL);
+ if (ret)
+ goto err_opened_inodes;
+ ret = percpu_counter_init(&m->total_inodes, 0, GFP_KERNEL);
+ if (ret)
+ goto err_total_inodes;
+
+ m->session = NULL;
+ INIT_DELAYED_WORK(&m->delayed_work, metric_delayed_work);
+
+ return 0;
+
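+/* unwind the percpu counters in reverse order of initialization */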
+err_total_inodes:
+ percpu_counter_destroy(&m->opened_inodes);
+err_opened_inodes:
+ percpu_counter_destroy(&m->i_caps_mis);
+err_i_caps_mis:
+ percpu_counter_destroy(&m->i_caps_hit);
+err_i_caps_hit:
+ percpu_counter_destroy(&m->d_lease_mis);
+err_d_lease_mis:
+ percpu_counter_destroy(&m->d_lease_hit);
+
+ return ret;
+}
+
+void ceph_metric_destroy(struct ceph_client_metric *m)
+{
+ if (!m)
+ return;
+
+ cancel_delayed_work_sync(&m->delayed_work);
+
+ percpu_counter_destroy(&m->total_inodes);
+ percpu_counter_destroy(&m->opened_inodes);
+ percpu_counter_destroy(&m->i_caps_mis);
+ percpu_counter_destroy(&m->i_caps_hit);
+ percpu_counter_destroy(&m->d_lease_mis);
+ percpu_counter_destroy(&m->d_lease_hit);
+
+ ceph_put_mds_session(m->session);
+}
+
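+/* update running min/max; unlikely() since new samples rarely extend the range */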
+#define METRIC_UPDATE_MIN_MAX(min, max, new) \
+{ \
+ if (unlikely(new < min)) \
+ min = new; \
+ if (unlikely(new > max)) \
+ max = new; \
+}
+
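+/*
+ * Welford-style online update of the running mean and of the sum of
+ * squared differences from the mean; stdev is derived from sq_sum later.
+ */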
+static inline void __update_mean_and_stdev(ktime_t total, ktime_t *lavg,
+ ktime_t *sq_sump, ktime_t lat)
+{
+ ktime_t avg;
+
+ if (unlikely(total == 1)) {
+ *lavg = lat;
+ } else {
+ /* the sq is (lat - old_avg) * (lat - new_avg) */
+ avg = *lavg + div64_s64(lat - *lavg, total);
+ *sq_sump += (lat - *lavg)*(lat - avg);
+ *lavg = avg;
+ }
+}
+
+void ceph_update_metrics(struct ceph_metric *m,
+ ktime_t r_start, ktime_t r_end,
+ unsigned int size, int rc)
+{
+ ktime_t lat = ktime_sub(r_end, r_start);
+ ktime_t total;
+
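+ /* skip samples for failed requests, except -ENOENT and -ETIMEDOUT */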
+ if (unlikely(rc < 0 && rc != -ENOENT && rc != -ETIMEDOUT))
+ return;
+
+ spin_lock(&m->lock);
+ total = ++m->total;
+ m->size_sum += size;
+ METRIC_UPDATE_MIN_MAX(m->size_min, m->size_max, size);
+ m->latency_sum += lat;
+ METRIC_UPDATE_MIN_MAX(m->latency_min, m->latency_max, lat);
+ __update_mean_and_stdev(total, &m->latency_avg, &m->latency_sq_sum,
+ lat);
+ spin_unlock(&m->lock);
+}
diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h
new file mode 100644
index 000000000000..0d0c44bd3332
--- /dev/null
+++ b/fs/ceph/metric.h
@@ -0,0 +1,244 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_MDS_METRIC_H
+#define _FS_CEPH_MDS_METRIC_H
+
+#include <linux/ceph/types.h>
+#include <linux/percpu_counter.h>
+#include <linux/ktime.h>
+
+extern bool disable_send_metrics;
+
+enum ceph_metric_type {
+ CLIENT_METRIC_TYPE_CAP_INFO,
+ CLIENT_METRIC_TYPE_READ_LATENCY,
+ CLIENT_METRIC_TYPE_WRITE_LATENCY,
+ CLIENT_METRIC_TYPE_METADATA_LATENCY,
+ CLIENT_METRIC_TYPE_DENTRY_LEASE,
+ CLIENT_METRIC_TYPE_OPENED_FILES,
+ CLIENT_METRIC_TYPE_PINNED_ICAPS,
+ CLIENT_METRIC_TYPE_OPENED_INODES,
+ CLIENT_METRIC_TYPE_READ_IO_SIZES,
+ CLIENT_METRIC_TYPE_WRITE_IO_SIZES,
+ CLIENT_METRIC_TYPE_AVG_READ_LATENCY,
+ CLIENT_METRIC_TYPE_STDEV_READ_LATENCY,
+ CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY,
+ CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY,
+ CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY,
+ CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY,
+
+ CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY,
+};
+
+/*
+ * This will always have the highest metric type value
+ * as the last element of the array.
+ */
+#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \
+ CLIENT_METRIC_TYPE_CAP_INFO, \
+ CLIENT_METRIC_TYPE_READ_LATENCY, \
+ CLIENT_METRIC_TYPE_WRITE_LATENCY, \
+ CLIENT_METRIC_TYPE_METADATA_LATENCY, \
+ CLIENT_METRIC_TYPE_DENTRY_LEASE, \
+ CLIENT_METRIC_TYPE_OPENED_FILES, \
+ CLIENT_METRIC_TYPE_PINNED_ICAPS, \
+ CLIENT_METRIC_TYPE_OPENED_INODES, \
+ CLIENT_METRIC_TYPE_READ_IO_SIZES, \
+ CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \
+ CLIENT_METRIC_TYPE_AVG_READ_LATENCY, \
+ CLIENT_METRIC_TYPE_STDEV_READ_LATENCY, \
+ CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY, \
+ CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, \
+ CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, \
+ CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, \
+ \
+ CLIENT_METRIC_TYPE_MAX, \
+}
+
+struct ceph_metric_header {
+ __le32 type; /* ceph metric type */
+ __u8 ver;
+ __u8 compat;
+ __le32 data_len; /* length of the data payload that follows the header */
+} __packed;
+
+/* metric caps header */
+struct ceph_metric_cap {
+ struct ceph_metric_header header;
+ __le64 hit;
+ __le64 mis;
+ __le64 total;
+} __packed;
+
+/* metric read latency header */
+struct ceph_metric_read_latency {
+ struct ceph_metric_header header;
+ struct ceph_timespec lat;
+ struct ceph_timespec avg;
+ __le64 sq_sum;
+ __le64 count;
+} __packed;
+
+/* metric write latency header */
+struct ceph_metric_write_latency {
+ struct ceph_metric_header header;
+ struct ceph_timespec lat;
+ struct ceph_timespec avg;
+ __le64 sq_sum;
+ __le64 count;
+} __packed;
+
+/* metric metadata latency header */
+struct ceph_metric_metadata_latency {
+ struct ceph_metric_header header;
+ struct ceph_timespec lat;
+ struct ceph_timespec avg;
+ __le64 sq_sum;
+ __le64 count;
+} __packed;
+
+/* metric dentry lease header */
+struct ceph_metric_dlease {
+ struct ceph_metric_header header;
+ __le64 hit;
+ __le64 mis;
+ __le64 total;
+} __packed;
+
+/* metric opened files header */
+struct ceph_opened_files {
+ struct ceph_metric_header header;
+ __le64 opened_files;
+ __le64 total;
+} __packed;
+
+/* metric pinned i_caps header */
+struct ceph_pinned_icaps {
+ struct ceph_metric_header header;
+ __le64 pinned_icaps;
+ __le64 total;
+} __packed;
+
+/* metric opened inodes header */
+struct ceph_opened_inodes {
+ struct ceph_metric_header header;
+ __le64 opened_inodes;
+ __le64 total;
+} __packed;
+
+/* metric read io size header */
+struct ceph_read_io_size {
+ struct ceph_metric_header header;
+ __le64 total_ops;
+ __le64 total_size;
+} __packed;
+
+/* metric write io size header */
+struct ceph_write_io_size {
+ struct ceph_metric_header header;
+ __le64 total_ops;
+ __le64 total_size;
+} __packed;
+
+struct ceph_metric_head {
+ __le32 num; /* the number of metrics that will be sent */
+} __packed;
+
+enum metric_type {
+ METRIC_READ,
+ METRIC_WRITE,
+ METRIC_METADATA,
+ METRIC_COPYFROM,
+ METRIC_MAX
+};
+
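+/* per-I/O-type running statistics, protected by ->lock */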
+struct ceph_metric {
+ spinlock_t lock;
+ u64 total;
+ u64 size_sum;
+ u64 size_min;
+ u64 size_max;
+ ktime_t latency_sum;
+ ktime_t latency_avg;
+ ktime_t latency_sq_sum;
+ ktime_t latency_min;
+ ktime_t latency_max;
+};
+
+/* This is the global metrics structure */
+struct ceph_client_metric {
+ atomic64_t total_dentries;
+ struct percpu_counter d_lease_hit;
+ struct percpu_counter d_lease_mis;
+
+ atomic64_t total_caps;
+ struct percpu_counter i_caps_hit;
+ struct percpu_counter i_caps_mis;
+
+ struct ceph_metric metric[METRIC_MAX];
+
+ /* The total number of directories and files that are opened */
+ atomic64_t opened_files;
+
+ /* The total number of inodes that have opened files or directories */
+ struct percpu_counter opened_inodes;
+ struct percpu_counter total_inodes;
+
+ struct ceph_mds_session *session;
+ struct delayed_work delayed_work; /* delayed work */
+};
+
+static inline void metric_schedule_delayed(struct ceph_client_metric *m)
+{
+ if (disable_send_metrics)
+ return;
+
+ /* per second */
+ schedule_delayed_work(&m->delayed_work, round_jiffies_relative(HZ));
+}
+
+extern int ceph_metric_init(struct ceph_client_metric *m);
+extern void ceph_metric_destroy(struct ceph_client_metric *m);
+
+static inline void ceph_update_cap_hit(struct ceph_client_metric *m)
+{
+ percpu_counter_inc(&m->i_caps_hit);
+}
+
+static inline void ceph_update_cap_mis(struct ceph_client_metric *m)
+{
+ percpu_counter_inc(&m->i_caps_mis);
+}
+
+extern void ceph_update_metrics(struct ceph_metric *m,
+ ktime_t r_start, ktime_t r_end,
+ unsigned int size, int rc);
+
+static inline void ceph_update_read_metrics(struct ceph_client_metric *m,
+ ktime_t r_start, ktime_t r_end,
+ unsigned int size, int rc)
+{
+ ceph_update_metrics(&m->metric[METRIC_READ],
+ r_start, r_end, size, rc);
+}
+
+static inline void ceph_update_write_metrics(struct ceph_client_metric *m,
+ ktime_t r_start, ktime_t r_end,
+ unsigned int size, int rc)
+{
+ ceph_update_metrics(&m->metric[METRIC_WRITE],
+ r_start, r_end, size, rc);
+}
+
+static inline void ceph_update_metadata_metrics(struct ceph_client_metric *m,
+ ktime_t r_start, ktime_t r_end,
+ int rc)
+{
+ ceph_update_metrics(&m->metric[METRIC_METADATA],
+ r_start, r_end, 0, rc);
+}
+
+static inline void ceph_update_copyfrom_metrics(struct ceph_client_metric *m,
+ ktime_t r_start, ktime_t r_end,
+ unsigned int size, int rc)
+{
+ ceph_update_metrics(&m->metric[METRIC_COPYFROM],
+ r_start, r_end, size, rc);
+}
+#endif /* _FS_CEPH_MDS_METRIC_H */
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index de56dee60540..64592adfe48f 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -12,7 +12,7 @@
void ceph_adjust_quota_realms_count(struct inode *inode, bool inc)
{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
if (inc)
atomic64_inc(&mdsc->quotarealms_count);
else
@@ -21,14 +21,17 @@ void ceph_adjust_quota_realms_count(struct inode *inode, bool inc)
static inline bool ceph_has_realms_with_quotas(struct inode *inode)
{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
- struct super_block *sb = mdsc->fsc->sb;
+ struct super_block *sb = inode->i_sb;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
+ struct inode *root = d_inode(sb->s_root);
if (atomic64_read(&mdsc->quotarealms_count) > 0)
return true;
/* if root is the real CephFS root, we don't have quota realms */
- if (sb->s_root->d_inode &&
- (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT))
+ if (root && ceph_ino(root) == CEPH_INO_ROOT)
+ return false;
+ /* MDS stray dirs have no quota realms */
+ if (ceph_vino_is_reserved(ceph_inode(inode)->i_vino))
return false;
/* otherwise, we can't know for sure */
return true;
@@ -53,7 +56,7 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
/* increment msg sequence number */
mutex_lock(&session->s_mutex);
- session->s_seq++;
+ inc_session_sequence(session);
mutex_unlock(&session->s_mutex);
/* lookup inode */
@@ -74,8 +77,7 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
le64_to_cpu(h->max_files));
spin_unlock(&ci->i_ceph_lock);
- /* avoid calling iput_final() in dispatch thread */
- ceph_async_iput(inode);
+ iput(inode);
}
static struct ceph_quotarealm_inode *
@@ -159,8 +161,8 @@ static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc,
}
if (IS_ERR(in)) {
- pr_warn("Can't lookup inode %llx (err: %ld)\n",
- realm->ino, PTR_ERR(in));
+ dout("Can't lookup inode %llx (err: %ld)\n",
+ realm->ino, PTR_ERR(in));
qri->timeout = jiffies + msecs_to_jiffies(60 * 1000); /* XXX */
} else {
qri->timeout = 0;
@@ -193,9 +195,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc)
/*
* This function walks through the snaprealm for an inode and returns the
- * ceph_snap_realm for the first snaprealm that has quotas set (either max_files
- * or max_bytes). If the root is reached, return the root ceph_snap_realm
- * instead.
+ * ceph_snap_realm for the first snaprealm that has quotas set (max_files,
+ * max_bytes, or any, depending on the 'which_quota' argument). If the root is
+ * reached, return the root ceph_snap_realm instead.
*
* Note that the caller is responsible for calling ceph_put_snap_realm() on the
* returned realm.
@@ -207,7 +209,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc)
* will be restarted.
*/
static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
- struct inode *inode, bool retry)
+ struct inode *inode,
+ enum quota_get_realm which_quota,
+ bool retry)
{
struct ceph_inode_info *ci = NULL;
struct ceph_snap_realm *realm, *next;
@@ -246,9 +250,8 @@ restart:
}
ci = ceph_inode(in);
- has_quota = __ceph_has_any_quota(ci);
- /* avoid calling iput_final() while holding mdsc->snap_rwsem */
- ceph_async_iput(in);
+ has_quota = __ceph_has_quota(ci, which_quota);
+ iput(in);
next = realm->parent;
if (has_quota || !next)
@@ -266,7 +269,7 @@ restart:
bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(old)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old->i_sb);
struct ceph_snap_realm *old_realm, *new_realm;
bool is_same;
@@ -278,8 +281,8 @@ restart:
* dropped and we can then restart the whole operation.
*/
down_read(&mdsc->snap_rwsem);
- old_realm = get_quota_realm(mdsc, old, true);
- new_realm = get_quota_realm(mdsc, new, false);
+ old_realm = get_quota_realm(mdsc, old, QUOTA_GET_ANY, true);
+ new_realm = get_quota_realm(mdsc, new, QUOTA_GET_ANY, false);
if (PTR_ERR(new_realm) == -EAGAIN) {
up_read(&mdsc->snap_rwsem);
if (old_realm)
@@ -313,7 +316,7 @@ enum quota_check_op {
static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
loff_t delta)
{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_inode_info *ci;
struct ceph_snap_realm *realm, *next;
struct inode *in;
@@ -361,8 +364,6 @@ restart:
spin_unlock(&ci->i_ceph_lock);
switch (op) {
case QUOTA_CHECK_MAX_FILES_OP:
- exceeded = (max && (rvalue >= max));
- break;
case QUOTA_CHECK_MAX_BYTES_OP:
exceeded = (max && (rvalue + delta > max));
break;
@@ -385,8 +386,7 @@ restart:
pr_warn("Invalid quota check op (%d)\n", op);
exceeded = true; /* Just break the loop */
}
- /* avoid calling iput_final() while holding mdsc->snap_rwsem */
- ceph_async_iput(in);
+ iput(in);
next = realm->parent;
if (exceeded || !next)
@@ -417,7 +417,7 @@ bool ceph_quota_is_max_files_exceeded(struct inode *inode)
WARN_ON(!S_ISDIR(inode->i_mode));
- return check_quota_exceeded(inode, QUOTA_CHECK_MAX_FILES_OP, 0);
+ return check_quota_exceeded(inode, QUOTA_CHECK_MAX_FILES_OP, 1);
}
/*
@@ -485,7 +485,8 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf)
bool is_updated = false;
down_read(&mdsc->snap_rwsem);
- realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), true);
+ realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root),
+ QUOTA_GET_MAX_BYTES, true);
up_read(&mdsc->snap_rwsem);
if (!realm)
return false;
@@ -499,10 +500,24 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf)
if (ci->i_max_bytes) {
total = ci->i_max_bytes >> CEPH_BLOCK_SHIFT;
used = ci->i_rbytes >> CEPH_BLOCK_SHIFT;
+ /* For quota size less than 4MB, use 4KB block size */
+ if (!total) {
+ total = ci->i_max_bytes >> CEPH_4K_BLOCK_SHIFT;
+ used = ci->i_rbytes >> CEPH_4K_BLOCK_SHIFT;
+ buf->f_frsize = 1 << CEPH_4K_BLOCK_SHIFT;
+ }
/* It is possible for a quota to be exceeded.
* Report 'zero' in that case
*/
free = total > used ? total - used : 0;
+ /* For quotas smaller than 4KB, report
+ * total=used=4KB, free=0 when the quota is full
+ * and total=free=4KB, used=0 otherwise */
+ if (!total) {
+ total = 1;
+ free = ci->i_max_bytes > ci->i_rbytes ? 1 : 0;
+ buf->f_frsize = 1 << CEPH_4K_BLOCK_SHIFT;
+ }
}
spin_unlock(&ci->i_ceph_lock);
if (total) {
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index ccfcc66aaf44..864cdaa0d2bd 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -60,24 +60,26 @@
/*
* increase ref count for the realm
*
- * caller must hold snap_rwsem for write.
+ * caller must hold snap_rwsem.
*/
void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm)
{
- dout("get_realm %p %d -> %d\n", realm,
- atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
+ lockdep_assert_held(&mdsc->snap_rwsem);
+
/*
- * since we _only_ increment realm refs or empty the empty
- * list with snap_rwsem held, adjusting the empty list here is
- * safe. we do need to protect against concurrent empty list
- * additions, however.
+ * The 0->1 and 1->0 transitions must take the snap_empty_lock
+ * atomically with the refcount change. Go ahead and bump the
+ * nref here, unless it's 0, in which case we take the spinlock
+ * and then do the increment and remove it from the list.
*/
- if (atomic_inc_return(&realm->nref) == 1) {
- spin_lock(&mdsc->snap_empty_lock);
+ if (atomic_inc_not_zero(&realm->nref))
+ return;
+
+ spin_lock(&mdsc->snap_empty_lock);
+ if (atomic_inc_return(&realm->nref) == 1)
list_del_init(&realm->empty_item);
- spin_unlock(&mdsc->snap_empty_lock);
- }
+ spin_unlock(&mdsc->snap_empty_lock);
}
static void __insert_snap_realm(struct rb_root *root,
@@ -113,29 +115,36 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
{
struct ceph_snap_realm *realm;
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
+
realm = kzalloc(sizeof(*realm), GFP_NOFS);
if (!realm)
return ERR_PTR(-ENOMEM);
- atomic_set(&realm->nref, 1); /* for caller */
+ /* Do not release the global dummy snaprealm until unmounting */
+ if (ino == CEPH_INO_GLOBAL_SNAPREALM)
+ atomic_set(&realm->nref, 2);
+ else
+ atomic_set(&realm->nref, 1);
realm->ino = ino;
INIT_LIST_HEAD(&realm->children);
INIT_LIST_HEAD(&realm->child_item);
INIT_LIST_HEAD(&realm->empty_item);
INIT_LIST_HEAD(&realm->dirty_item);
+ INIT_LIST_HEAD(&realm->rebuild_item);
INIT_LIST_HEAD(&realm->inodes_with_caps);
spin_lock_init(&realm->inodes_with_caps_lock);
__insert_snap_realm(&mdsc->snap_realms, realm);
mdsc->num_snap_realms++;
- dout("create_snap_realm %llx %p\n", realm->ino, realm);
+ dout("%s %llx %p\n", __func__, realm->ino, realm);
return realm;
}
/*
* lookup the realm rooted at @ino.
*
- * caller must hold snap_rwsem for write.
+ * caller must hold snap_rwsem.
*/
static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
u64 ino)
@@ -143,6 +152,8 @@ static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
struct rb_node *n = mdsc->snap_realms.rb_node;
struct ceph_snap_realm *r;
+ lockdep_assert_held(&mdsc->snap_rwsem);
+
while (n) {
r = rb_entry(n, struct ceph_snap_realm, node);
if (ino < r->ino)
@@ -150,7 +161,7 @@ static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
else if (ino > r->ino)
n = n->rb_right;
else {
- dout("lookup_snap_realm %llx %p\n", r->ino, r);
+ dout("%s %llx %p\n", __func__, r->ino, r);
return r;
}
}
@@ -176,7 +187,9 @@ static void __put_snap_realm(struct ceph_mds_client *mdsc,
static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm)
{
- dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
+
+ dout("%s %p %llx\n", __func__, realm, realm->ino);
rb_erase(&realm->node, &mdsc->snap_realms);
mdsc->num_snap_realms--;
@@ -198,28 +211,30 @@ static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
static void __put_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm)
{
- dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
- atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
+
+ /*
+ * We do not require the snap_empty_lock here, as any caller that
+ * increments the value must hold the snap_rwsem.
+ */
if (atomic_dec_and_test(&realm->nref))
__destroy_snap_realm(mdsc, realm);
}
/*
- * caller needn't hold any locks
+ * See comments in ceph_get_snap_realm. Caller needn't hold any locks.
*/
void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm)
{
- dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
- atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
- if (!atomic_dec_and_test(&realm->nref))
+ if (!atomic_dec_and_lock(&realm->nref, &mdsc->snap_empty_lock))
return;
if (down_write_trylock(&mdsc->snap_rwsem)) {
+ spin_unlock(&mdsc->snap_empty_lock);
__destroy_snap_realm(mdsc, realm);
up_write(&mdsc->snap_rwsem);
} else {
- spin_lock(&mdsc->snap_empty_lock);
list_add(&realm->empty_item, &mdsc->snap_empty);
spin_unlock(&mdsc->snap_empty_lock);
}
@@ -236,6 +251,8 @@ static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
{
struct ceph_snap_realm *realm;
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
+
spin_lock(&mdsc->snap_empty_lock);
while (!list_empty(&mdsc->snap_empty)) {
realm = list_first_entry(&mdsc->snap_empty,
@@ -248,9 +265,14 @@ static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
spin_unlock(&mdsc->snap_empty_lock);
}
-void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
+void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc)
{
+ struct ceph_snap_realm *global_realm;
+
down_write(&mdsc->snap_rwsem);
+ global_realm = __lookup_snap_realm(mdsc, CEPH_INO_GLOBAL_SNAPREALM);
+ if (global_realm)
+ ceph_put_snap_realm(mdsc, global_realm);
__cleanup_empty_realms(mdsc);
up_write(&mdsc->snap_rwsem);
}
@@ -269,6 +291,8 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
{
struct ceph_snap_realm *parent;
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
+
if (realm->parent_ino == parentino)
return 0;
@@ -278,9 +302,8 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
if (IS_ERR(parent))
return PTR_ERR(parent);
}
- dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
- realm->ino, realm, realm->parent_ino, realm->parent,
- parentino, parent);
+ dout("%s %llx %p: %llx %p -> %llx %p\n", __func__, realm->ino,
+ realm, realm->parent_ino, realm->parent, parentino, parent);
if (realm->parent) {
list_del_init(&realm->child_item);
ceph_put_snap_realm(mdsc, realm->parent);
@@ -306,7 +329,8 @@ static int cmpu64_rev(const void *a, const void *b)
* build the snap context for a given realm.
*/
static int build_snap_context(struct ceph_snap_realm *realm,
- struct list_head* dirty_realms)
+ struct list_head *realm_queue,
+ struct list_head *dirty_realms)
{
struct ceph_snap_realm *parent = realm->parent;
struct ceph_snap_context *snapc;
@@ -320,9 +344,9 @@ static int build_snap_context(struct ceph_snap_realm *realm,
*/
if (parent) {
if (!parent->cached_context) {
- err = build_snap_context(parent, dirty_realms);
- if (err)
- goto fail;
+ /* add to the queue head */
+ list_add(&parent->rebuild_item, realm_queue);
+ return 1;
}
num += parent->cached_context->num_snaps;
}
@@ -335,9 +359,8 @@ static int build_snap_context(struct ceph_snap_realm *realm,
realm->cached_context->seq == realm->seq &&
(!parent ||
realm->cached_context->seq >= parent->cached_context->seq)) {
- dout("build_snap_context %llx %p: %p seq %lld (%u snaps)"
- " (unchanged)\n",
- realm->ino, realm, realm->cached_context,
+ dout("%s %llx %p: %p seq %lld (%u snaps) (unchanged)\n",
+ __func__, realm->ino, realm, realm->cached_context,
realm->cached_context->seq,
(unsigned int)realm->cached_context->num_snaps);
return 0;
@@ -376,9 +399,8 @@ static int build_snap_context(struct ceph_snap_realm *realm,
sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
snapc->num_snaps = num;
- dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n",
- realm->ino, realm, snapc, snapc->seq,
- (unsigned int) snapc->num_snaps);
+ dout("%s %llx %p: %p seq %lld (%u snaps)\n", __func__, realm->ino,
+ realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps);
ceph_put_snap_context(realm->cached_context);
realm->cached_context = snapc;
@@ -395,8 +417,7 @@ fail:
ceph_put_snap_context(realm->cached_context);
realm->cached_context = NULL;
}
- pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
- realm, err);
+ pr_err("%s %llx %p fail %d\n", __func__, realm->ino, realm, err);
return err;
}
@@ -406,13 +427,50 @@ fail:
static void rebuild_snap_realms(struct ceph_snap_realm *realm,
struct list_head *dirty_realms)
{
- struct ceph_snap_realm *child;
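+ /* walk the realm tree iteratively via a queue instead of recursing */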
+ LIST_HEAD(realm_queue);
+ int last = 0;
+ bool skip = false;
+
+ list_add_tail(&realm->rebuild_item, &realm_queue);
+
+ while (!list_empty(&realm_queue)) {
+ struct ceph_snap_realm *_realm, *child;
+
+ _realm = list_first_entry(&realm_queue,
+ struct ceph_snap_realm,
+ rebuild_item);
+
+ /*
+ * If the last build failed due to a memory
+ * issue, just drain the realm_queue and
+ * return to avoid an infinite loop.
+ */
+ if (last < 0) {
+ list_del_init(&_realm->rebuild_item);
+ continue;
+ }
+
+ last = build_snap_context(_realm, &realm_queue, dirty_realms);
+ dout("%s %llx %p, %s\n", __func__, _realm->ino, _realm,
+ last > 0 ? "is deferred" : !last ? "succeeded" : "failed");
- dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
- build_snap_context(realm, dirty_realms);
+ /* is any child already queued for rebuild? */
+ list_for_each_entry(child, &_realm->children, child_item) {
+ if (!list_empty(&child->rebuild_item)) {
+ skip = true;
+ break;
+ }
+ }
- list_for_each_entry(child, &realm->children, child_item)
- rebuild_snap_realms(child, dirty_realms);
+ if (!skip) {
+ list_for_each_entry(child, &_realm->children, child_item)
+ list_add_tail(&child->rebuild_item, &realm_queue);
+ }
+
+ /* last == 1 means need to build parent first */
+ if (last <= 0)
+ list_del_init(&_realm->rebuild_item);
+ }
}
@@ -460,20 +518,15 @@ static bool has_new_snaps(struct ceph_snap_context *o,
* Caller must hold snap_rwsem for read (i.e., the realm topology won't
* change).
*/
-void ceph_queue_cap_snap(struct ceph_inode_info *ci)
+static void ceph_queue_cap_snap(struct ceph_inode_info *ci,
+ struct ceph_cap_snap **pcapsnap)
{
- struct inode *inode = &ci->vfs_inode;
- struct ceph_cap_snap *capsnap;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_snap_context *old_snapc, *new_snapc;
+ struct ceph_cap_snap *capsnap = *pcapsnap;
struct ceph_buffer *old_blob = NULL;
int used, dirty;
- capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
- if (!capsnap) {
- pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
- return;
- }
-
spin_lock(&ci->i_ceph_lock);
used = __ceph_caps_used(ci);
dirty = __ceph_caps_dirty(ci);
@@ -494,12 +547,14 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
as no new writes are allowed to start when pending, so any
writes in progress now were started before the previous
cap_snap. lucky us. */
- dout("queue_cap_snap %p already pending\n", inode);
+ dout("%s %p %llx.%llx already pending\n",
+ __func__, inode, ceph_vinop(inode));
goto update_snapc;
}
if (ci->i_wrbuffer_ref_head == 0 &&
!(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) {
- dout("queue_cap_snap %p nothing dirty|writing\n", inode);
+ dout("%s %p %llx.%llx nothing dirty|writing\n",
+ __func__, inode, ceph_vinop(inode));
goto update_snapc;
}
@@ -519,20 +574,17 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
} else {
if (!(used & CEPH_CAP_FILE_WR) &&
ci->i_wrbuffer_ref_head == 0) {
- dout("queue_cap_snap %p "
- "no new_snap|dirty_page|writing\n", inode);
+ dout("%s %p %llx.%llx no new_snap|dirty_page|writing\n",
+ __func__, inode, ceph_vinop(inode));
goto update_snapc;
}
}
- dout("queue_cap_snap %p cap_snap %p queuing under %p %s %s\n",
- inode, capsnap, old_snapc, ceph_cap_string(dirty),
- capsnap->need_flush ? "" : "no_flush");
+ dout("%s %p %llx.%llx cap_snap %p queuing under %p %s %s\n",
+ __func__, inode, ceph_vinop(inode), capsnap, old_snapc,
+ ceph_cap_string(dirty), capsnap->need_flush ? "" : "no_flush");
ihold(inode);
- refcount_set(&capsnap->nref, 1);
- INIT_LIST_HEAD(&capsnap->ci_item);
-
capsnap->follows = old_snapc->seq;
capsnap->issued = __ceph_caps_issued(ci, NULL);
capsnap->dirty = dirty;
@@ -562,31 +614,30 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
if (used & CEPH_CAP_FILE_WR) {
- dout("queue_cap_snap %p cap_snap %p snapc %p"
- " seq %llu used WR, now pending\n", inode,
+ dout("%s %p %llx.%llx cap_snap %p snapc %p seq %llu used WR,"
+ " now pending\n", __func__, inode, ceph_vinop(inode),
capsnap, old_snapc, old_snapc->seq);
capsnap->writing = 1;
} else {
/* note mtime, size NOW. */
__ceph_finish_cap_snap(ci, capsnap);
}
- capsnap = NULL;
+ *pcapsnap = NULL;
old_snapc = NULL;
update_snapc:
- if (ci->i_wrbuffer_ref_head == 0 &&
- ci->i_wr_ref == 0 &&
- ci->i_dirty_caps == 0 &&
- ci->i_flushing_caps == 0) {
- ci->i_head_snapc = NULL;
- } else {
+ if (ci->i_wrbuffer_ref_head == 0 &&
+ ci->i_wr_ref == 0 &&
+ ci->i_dirty_caps == 0 &&
+ ci->i_flushing_caps == 0) {
+ ci->i_head_snapc = NULL;
+ } else {
ci->i_head_snapc = ceph_get_snap_context(new_snapc);
dout(" new snapc is %p\n", new_snapc);
}
spin_unlock(&ci->i_ceph_lock);
ceph_buffer_put(old_blob);
- kfree(capsnap);
ceph_put_snap_context(old_snapc);
}
@@ -601,11 +652,11 @@ update_snapc:
int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap)
{
- struct inode *inode = &ci->vfs_inode;
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
BUG_ON(capsnap->writing);
- capsnap->size = inode->i_size;
+ capsnap->size = i_size_read(inode);
capsnap->mtime = inode->i_mtime;
capsnap->atime = inode->i_atime;
capsnap->ctime = inode->i_ctime;
@@ -615,17 +666,28 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
capsnap->truncate_size = ci->i_truncate_size;
capsnap->truncate_seq = ci->i_truncate_seq;
if (capsnap->dirty_pages) {
- dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
- "still has %d dirty pages\n", inode, capsnap,
- capsnap->context, capsnap->context->seq,
- ceph_cap_string(capsnap->dirty), capsnap->size,
- capsnap->dirty_pages);
+ dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu "
+ "still has %d dirty pages\n", __func__, inode,
+ ceph_vinop(inode), capsnap, capsnap->context,
+ capsnap->context->seq, ceph_cap_string(capsnap->dirty),
+ capsnap->size, capsnap->dirty_pages);
+ return 0;
+ }
+
+ /* Fb cap still in use, delay it */
+ if (ci->i_wb_ref) {
+ dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu "
+ "used WRBUFFER, delaying\n", __func__, inode,
+ ceph_vinop(inode), capsnap, capsnap->context,
+ capsnap->context->seq, ceph_cap_string(capsnap->dirty),
+ capsnap->size);
+ capsnap->writing = 1;
return 0;
}
ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
- dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
- inode, capsnap, capsnap->context,
+ dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu\n",
+ __func__, inode, ceph_vinop(inode), capsnap, capsnap->context,
capsnap->context->seq, ceph_cap_string(capsnap->dirty),
capsnap->size);
@@ -644,26 +706,47 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
{
struct ceph_inode_info *ci;
struct inode *lastinode = NULL;
+ struct ceph_cap_snap *capsnap = NULL;
- dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino);
+ dout("%s %p %llx inode\n", __func__, realm, realm->ino);
spin_lock(&realm->inodes_with_caps_lock);
list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) {
- struct inode *inode = igrab(&ci->vfs_inode);
+ struct inode *inode = igrab(&ci->netfs.inode);
if (!inode)
continue;
spin_unlock(&realm->inodes_with_caps_lock);
- /* avoid calling iput_final() while holding
- * mdsc->snap_rwsem or in mds dispatch threads */
- ceph_async_iput(lastinode);
+ iput(lastinode);
lastinode = inode;
- ceph_queue_cap_snap(ci);
+
+ /*
+ * Allocate the capsnap memory outside of ceph_queue_cap_snap()
+ * to avoid frequent and most likely unnecessary memory
+ * allocate/free cycles in this loop.
+ */
+ if (!capsnap) {
+ capsnap = kmem_cache_zalloc(ceph_cap_snap_cachep, GFP_NOFS);
+ if (!capsnap) {
+ pr_err("ENOMEM allocating ceph_cap_snap on %p\n",
+ inode);
+ return;
+ }
+ }
+ capsnap->cap_flush.is_capsnap = true;
+ refcount_set(&capsnap->nref, 1);
+ INIT_LIST_HEAD(&capsnap->cap_flush.i_list);
+ INIT_LIST_HEAD(&capsnap->cap_flush.g_list);
+ INIT_LIST_HEAD(&capsnap->ci_item);
+
+ ceph_queue_cap_snap(ci, &capsnap);
spin_lock(&realm->inodes_with_caps_lock);
}
spin_unlock(&realm->inodes_with_caps_lock);
- ceph_async_iput(lastinode);
+ iput(lastinode);
- dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
+ if (capsnap)
+ kmem_cache_free(ceph_cap_snap_cachep, capsnap);
+ dout("%s %p %llx done\n", __func__, realm, realm->ino);
}
/*
@@ -682,12 +765,16 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
__le64 *prior_parent_snaps; /* encoded */
struct ceph_snap_realm *realm = NULL;
struct ceph_snap_realm *first_realm = NULL;
- int invalidate = 0;
+ struct ceph_snap_realm *realm_to_rebuild = NULL;
+ int rebuild_snapcs;
int err = -ENOMEM;
LIST_HEAD(dirty_realms);
- dout("update_snap_trace deletion=%d\n", deletion);
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
+
+ dout("%s deletion=%d\n", __func__, deletion);
more:
+ rebuild_snapcs = 0;
ceph_decode_need(&p, e, sizeof(*ri), bad);
ri = p;
p += sizeof(*ri);
@@ -711,10 +798,10 @@ more:
err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
if (err < 0)
goto fail;
- invalidate += err;
+ rebuild_snapcs += err;
if (le64_to_cpu(ri->seq) > realm->seq) {
- dout("update_snap_trace updating %llx %p %lld -> %lld\n",
+ dout("%s updating %llx %p %lld -> %lld\n", __func__,
realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
/* update realm parameters, snap lists */
realm->seq = le64_to_cpu(ri->seq);
@@ -736,22 +823,30 @@ more:
if (realm->seq > mdsc->last_snap_seq)
mdsc->last_snap_seq = realm->seq;
- invalidate = 1;
+ rebuild_snapcs = 1;
} else if (!realm->cached_context) {
- dout("update_snap_trace %llx %p seq %lld new\n",
+ dout("%s %llx %p seq %lld new\n", __func__,
realm->ino, realm, realm->seq);
- invalidate = 1;
+ rebuild_snapcs = 1;
} else {
- dout("update_snap_trace %llx %p seq %lld unchanged\n",
+ dout("%s %llx %p seq %lld unchanged\n", __func__,
realm->ino, realm, realm->seq);
}
- dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
- realm, invalidate, p, e);
+ dout("done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino,
+ realm, rebuild_snapcs, p, e);
+
+ /*
+ * this will always track the uppermost parent realm from which
+ * we need to rebuild the snapshot contexts _downward_ in the
+ * hierarchy.
+ */
+ if (rebuild_snapcs)
+ realm_to_rebuild = realm;
- /* invalidate when we reach the _end_ (root) of the trace */
- if (invalidate && p >= e)
- rebuild_snap_realms(realm, &dirty_realms);
+ /* rebuild_snapcs when we reach the _end_ (root) of the trace */
+ if (realm_to_rebuild && p >= e)
+ rebuild_snap_realms(realm_to_rebuild, &dirty_realms);
if (!first_realm)
first_realm = realm;
@@ -781,13 +876,13 @@ more:
return 0;
bad:
- err = -EINVAL;
+ err = -EIO;
fail:
if (realm && !IS_ERR(realm))
ceph_put_snap_realm(mdsc, realm);
if (first_realm)
ceph_put_snap_realm(mdsc, first_realm);
- pr_err("update_snap_trace error %d\n", err);
+ pr_err("%s error %d\n", __func__, err);
return err;
}
@@ -804,29 +899,61 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
struct inode *inode;
struct ceph_mds_session *session = NULL;
- dout("flush_snaps\n");
+ dout("%s\n", __func__);
spin_lock(&mdsc->snap_flush_lock);
while (!list_empty(&mdsc->snap_flush_list)) {
ci = list_first_entry(&mdsc->snap_flush_list,
struct ceph_inode_info, i_snap_flush_item);
- inode = &ci->vfs_inode;
+ inode = &ci->netfs.inode;
ihold(inode);
spin_unlock(&mdsc->snap_flush_lock);
ceph_flush_snaps(ci, &session);
- /* avoid calling iput_final() while holding
- * session->s_mutex or in mds dispatch threads */
- ceph_async_iput(inode);
+ iput(inode);
spin_lock(&mdsc->snap_flush_lock);
}
spin_unlock(&mdsc->snap_flush_lock);
- if (session) {
- mutex_unlock(&session->s_mutex);
- ceph_put_mds_session(session);
- }
- dout("flush_snaps done\n");
+ ceph_put_mds_session(session);
+ dout("%s done\n", __func__);
}
+/**
+ * ceph_change_snap_realm - change the snap_realm for an inode
+ * @inode: inode to move to new snap realm
+ * @realm: new realm to move inode into (may be NULL)
+ *
+ * Detach an inode from its old snaprealm (if any) and attach it to
+ * the new snaprealm (if any). The old snap realm reference held by
+ * the inode is put. If realm is non-NULL, then the caller's reference
+ * to it is taken over by the inode.
+ */
+void ceph_change_snap_realm(struct inode *inode, struct ceph_snap_realm *realm)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_snap_realm *oldrealm = ci->i_snap_realm;
+
+ lockdep_assert_held(&ci->i_ceph_lock);
+
+ if (oldrealm) {
+ spin_lock(&oldrealm->inodes_with_caps_lock);
+ list_del_init(&ci->i_snap_realm_item);
+ if (oldrealm->ino == ci->i_vino.ino)
+ oldrealm->inode = NULL;
+ spin_unlock(&oldrealm->inodes_with_caps_lock);
+ ceph_put_snap_realm(mdsc, oldrealm);
+ }
+
+ ci->i_snap_realm = realm;
+
+ if (realm) {
+ spin_lock(&realm->inodes_with_caps_lock);
+ list_add(&ci->i_snap_realm_item, &realm->inodes_with_caps);
+ if (realm->ino == ci->i_vino.ino)
+ realm->inode = inode;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ }
+}
/*
* Handle a snap notification from the MDS.
@@ -869,11 +996,11 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
trace_len = le32_to_cpu(h->trace_len);
p += sizeof(*h);
- dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
- ceph_snap_op_name(op), split, trace_len);
+ dout("%s from mds%d op %s split %llx tracelen %d\n", __func__,
+ mds, ceph_snap_op_name(op), split, trace_len);
mutex_lock(&session->s_mutex);
- session->s_seq++;
+ inc_session_sequence(session);
mutex_unlock(&session->s_mutex);
down_write(&mdsc->snap_rwsem);
@@ -913,7 +1040,6 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
};
struct inode *inode = ceph_find_inode(sb, vino);
struct ceph_inode_info *ci;
- struct ceph_snap_realm *oldrealm;
if (!inode)
continue;
@@ -931,42 +1057,23 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
*/
if (ci->i_snap_realm->created >
le64_to_cpu(ri->created)) {
- dout(" leaving %p in newer realm %llx %p\n",
- inode, ci->i_snap_realm->ino,
+ dout(" leaving %p %llx.%llx in newer realm %llx %p\n",
+ inode, ceph_vinop(inode), ci->i_snap_realm->ino,
ci->i_snap_realm);
goto skip_inode;
}
- dout(" will move %p to split realm %llx %p\n",
- inode, realm->ino, realm);
- /*
- * Move the inode to the new realm
- */
- oldrealm = ci->i_snap_realm;
- spin_lock(&oldrealm->inodes_with_caps_lock);
- list_del_init(&ci->i_snap_realm_item);
- spin_unlock(&oldrealm->inodes_with_caps_lock);
-
- spin_lock(&realm->inodes_with_caps_lock);
- list_add(&ci->i_snap_realm_item,
- &realm->inodes_with_caps);
- ci->i_snap_realm = realm;
- if (realm->ino == ci->i_vino.ino)
- realm->inode = inode;
- spin_unlock(&realm->inodes_with_caps_lock);
-
- spin_unlock(&ci->i_ceph_lock);
+ dout(" will move %p %llx.%llx to split realm %llx %p\n",
+ inode, ceph_vinop(inode), realm->ino, realm);
ceph_get_snap_realm(mdsc, realm);
- ceph_put_snap_realm(mdsc, oldrealm);
-
- /* avoid calling iput_final() while holding
- * mdsc->snap_rwsem or mds in dispatch threads */
- ceph_async_iput(inode);
+ ceph_change_snap_realm(inode, realm);
+ spin_unlock(&ci->i_ceph_lock);
+ iput(inode);
continue;
skip_inode:
spin_unlock(&ci->i_ceph_lock);
- ceph_async_iput(inode);
+ iput(inode);
}
/* we may have taken some of the old realm's children. */
@@ -999,7 +1106,7 @@ skip_inode:
return;
bad:
- pr_err("corrupt snap message from mds%d\n", mds);
+ pr_err("%s corrupt snap message from mds%d\n", __func__, mds);
ceph_msg_dump(msg);
out:
if (locked_rwsem)
@@ -1032,7 +1139,8 @@ struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
}
spin_unlock(&mdsc->snapid_map_lock);
if (exist) {
- dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
+ dout("%s found snapid map %llx -> %x\n", __func__,
+ exist->snap, exist->dev);
return exist;
}
@@ -1076,11 +1184,13 @@ struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
if (exist) {
free_anon_bdev(sm->dev);
kfree(sm);
- dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
+ dout("%s found snapid map %llx -> %x\n", __func__,
+ exist->snap, exist->dev);
return exist;
}
- dout("create snapid map %llx -> %x\n", sm->snap, sm->dev);
+ dout("%s create snapid map %llx -> %x\n", __func__,
+ sm->snap, sm->dev);
return sm;
}
@@ -1155,5 +1265,6 @@ void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc)
pr_err("snapid map %llx -> %x still in use\n",
sm->snap, sm->dev);
}
+ kfree(sm);
}
}
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index 4a79f3632260..e36e8948e728 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -46,6 +46,7 @@ const char *ceph_session_op_name(int op)
case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack";
case CEPH_SESSION_FORCE_RO: return "force_ro";
case CEPH_SESSION_REJECT: return "reject";
+ case CEPH_SESSION_REQUEST_FLUSH_MDLOG: return "flush_mdlog";
}
return "???";
}
@@ -59,6 +60,7 @@ const char *ceph_mds_op_name(int op)
case CEPH_MDS_OP_LOOKUPINO: return "lookupino";
case CEPH_MDS_OP_LOOKUPNAME: return "lookupname";
case CEPH_MDS_OP_GETATTR: return "getattr";
+ case CEPH_MDS_OP_GETVXATTR: return "getvxattr";
case CEPH_MDS_OP_SETXATTR: return "setxattr";
case CEPH_MDS_OP_SETATTR: return "setattr";
case CEPH_MDS_OP_RMXATTR: return "rmxattr";
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index c7f150686a53..3fc48b43cab0 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -27,6 +27,11 @@
#include <linux/ceph/auth.h>
#include <linux/ceph/debugfs.h>
+#include <uapi/linux/magic.h>
+
+static DEFINE_SPINLOCK(ceph_fsc_lock);
+static LIST_HEAD(ceph_fsc_list);
+
/*
* Ceph superblock operations
*
@@ -49,8 +54,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
struct ceph_fs_client *fsc = ceph_inode_to_client(d_inode(dentry));
struct ceph_mon_client *monc = &fsc->client->monc;
struct ceph_statfs st;
- u64 fsid;
- int err;
+ int i, err;
u64 data_pool;
if (fsc->mdsc->mdsmap->m_num_data_pg_pools == 1) {
@@ -68,15 +72,9 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_type = CEPH_SUPER_MAGIC; /* ?? */
/*
- * express utilization in terms of large blocks to avoid
+ * Express utilization in terms of large blocks to avoid
* overflow on 32-bit machines.
- *
- * NOTE: for the time being, we make bsize == frsize to humor
- * not-yet-ancient versions of glibc that are broken.
- * Someday, we will probably want to report a real block
- * size... whatever that may mean for a network file system!
*/
- buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
/*
@@ -91,18 +89,27 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
}
+ /*
+ * NOTE: for the time being, we make bsize == frsize to humor
+ * not-yet-ancient versions of glibc that are broken.
+ * Someday, we will probably want to report a real block
+ * size... whatever that may mean for a network file system!
+ */
+ buf->f_bsize = buf->f_frsize;
+
buf->f_files = le64_to_cpu(st.num_objects);
buf->f_ffree = -1;
buf->f_namelen = NAME_MAX;
/* Must convert the fsid, for consistent values across arches */
+ buf->f_fsid.val[0] = 0;
mutex_lock(&monc->mutex);
- fsid = le64_to_cpu(*(__le64 *)(&monc->monmap->fsid)) ^
- le64_to_cpu(*((__le64 *)&monc->monmap->fsid + 1));
+ for (i = 0 ; i < sizeof(monc->monmap->fsid) / sizeof(__le32) ; ++i)
+ buf->f_fsid.val[0] ^= le32_to_cpu(((__le32 *)&monc->monmap->fsid)[i]);
mutex_unlock(&monc->mutex);
- buf->f_fsid.val[0] = fsid & 0xffffffff;
- buf->f_fsid.val[1] = fsid >> 32;
+ /* fold the fs_cluster_id into the upper bits */
+ buf->f_fsid.val[1] = monc->fs_cluster_id;
return 0;
}
@@ -143,6 +150,7 @@ enum {
Opt_mds_namespace,
Opt_recover_session,
Opt_source,
+ Opt_mon_addr,
/* string args above */
Opt_dirstat,
Opt_rbytes,
@@ -155,6 +163,8 @@ enum {
Opt_acl,
Opt_quotadf,
Opt_copyfrom,
+ Opt_wsync,
+ Opt_pagecache,
};
enum ceph_recover_session_mode {
@@ -193,7 +203,10 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = {
fsparam_u32 ("rsize", Opt_rsize),
fsparam_string ("snapdirname", Opt_snapdirname),
fsparam_string ("source", Opt_source),
+ fsparam_string ("mon_addr", Opt_mon_addr),
fsparam_u32 ("wsize", Opt_wsize),
+ fsparam_flag_no ("wsync", Opt_wsync),
+ fsparam_flag_no ("pagecache", Opt_pagecache),
{}
};
@@ -223,9 +236,92 @@ static void canonicalize_path(char *path)
}
/*
- * Parse the source parameter. Distinguish the server list from the path.
+ * Check if the mds namespace in ceph_mount_options matches
+ * the passed in namespace string. First time match (when
+ * ->mds_namespace is NULL) is treated specially, since
+ * ->mds_namespace needs to be initialized by the caller.
+ */
+static int namespace_equals(struct ceph_mount_options *fsopt,
+ const char *namespace, size_t len)
+{
+ return !(fsopt->mds_namespace &&
+ (strlen(fsopt->mds_namespace) != len ||
+ strncmp(fsopt->mds_namespace, namespace, len)));
+}
+
+static int ceph_parse_old_source(const char *dev_name, const char *dev_name_end,
+ struct fs_context *fc)
+{
+ int r;
+ struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+ struct ceph_mount_options *fsopt = pctx->opts;
+
+ if (*dev_name_end != ':')
+ return invalfc(fc, "separator ':' missing in source");
+
+ r = ceph_parse_mon_ips(dev_name, dev_name_end - dev_name,
+ pctx->copts, fc->log.log, ',');
+ if (r)
+ return r;
+
+ fsopt->new_dev_syntax = false;
+ return 0;
+}
+
+static int ceph_parse_new_source(const char *dev_name, const char *dev_name_end,
+ struct fs_context *fc)
+{
+ size_t len;
+ struct ceph_fsid fsid;
+ struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+ struct ceph_mount_options *fsopt = pctx->opts;
+ char *fsid_start, *fs_name_start;
+
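+ /* the device spec has the form <name>@<fsid>.<fsname> */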
+ if (*dev_name_end != '=') {
+ dout("separator '=' missing in source");
+ return -EINVAL;
+ }
+
+ fsid_start = strchr(dev_name, '@');
+ if (!fsid_start)
+ return invalfc(fc, "missing cluster fsid");
+ ++fsid_start; /* start of cluster fsid */
+
+ fs_name_start = strchr(fsid_start, '.');
+ if (!fs_name_start)
+ return invalfc(fc, "missing file system name");
+
+ if (ceph_parse_fsid(fsid_start, &fsid))
+ return invalfc(fc, "Invalid FSID");
+
+ ++fs_name_start; /* start of file system name */
+ len = dev_name_end - fs_name_start;
+
+ if (!namespace_equals(fsopt, fs_name_start, len))
+ return invalfc(fc, "Mismatching mds_namespace");
+ kfree(fsopt->mds_namespace);
+ fsopt->mds_namespace = kstrndup(fs_name_start, len, GFP_KERNEL);
+ if (!fsopt->mds_namespace)
+ return -ENOMEM;
+ dout("file system (mds namespace) '%s'\n", fsopt->mds_namespace);
+
+ fsopt->new_dev_syntax = true;
+ return 0;
+}
+
+/*
+ * Parse the source parameter. Distinguish the device spec from the path.
+ * Try parsing the new device format first and fall back to the old
+ * format if needed.
*
- * The source will look like:
+ * The new device syntax looks like:
+ * <device_spec>=/<path>
+ * where
+ * <device_spec> is name@fsid.fsname
+ * <path> is optional, but if present must begin with '/'
+ * (monitor addresses are passed via mount option)
+ *
+ * Old device syntax is:
* <server_spec>[,<server_spec>...]:[<path>]
* where
* <server_spec> is <ip>[:<port>]
@@ -258,24 +354,44 @@ static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc)
dev_name_end = dev_name + strlen(dev_name);
}
- dev_name_end--; /* back up to ':' separator */
- if (dev_name_end < dev_name || *dev_name_end != ':')
- return invalfc(fc, "No path or : separator in source");
+ dev_name_end--; /* back up to separator */
+ if (dev_name_end < dev_name)
+ return invalfc(fc, "Path missing in source");
dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
if (fsopt->server_path)
dout("server path '%s'\n", fsopt->server_path);
- ret = ceph_parse_mon_ips(param->string, dev_name_end - dev_name,
- pctx->copts, fc->log.log);
- if (ret)
- return ret;
+ dout("trying new device syntax");
+ ret = ceph_parse_new_source(dev_name, dev_name_end, fc);
+ if (ret) {
+ if (ret != -EINVAL)
+ return ret;
+ dout("trying old device syntax");
+ ret = ceph_parse_old_source(dev_name, dev_name_end, fc);
+ if (ret)
+ return ret;
+ }
fc->source = param->string;
param->string = NULL;
return 0;
}
+static int ceph_parse_mon_addr(struct fs_parameter *param,
+ struct fs_context *fc)
+{
+ struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+ struct ceph_mount_options *fsopt = pctx->opts;
+
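+ /* addresses are '/'-separated here, since ',' already delimits mount options */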
+ kfree(fsopt->mon_addr);
+ fsopt->mon_addr = param->string;
+ param->string = NULL;
+
+ return ceph_parse_mon_ips(fsopt->mon_addr, strlen(fsopt->mon_addr),
+ pctx->copts, fc->log.log, '/');
+}
+
static int ceph_parse_mount_param(struct fs_context *fc,
struct fs_parameter *param)
{
@@ -301,6 +417,8 @@ static int ceph_parse_mount_param(struct fs_context *fc,
param->string = NULL;
break;
case Opt_mds_namespace:
+ if (!namespace_equals(fsopt, param->string, strlen(param->string)))
+ return invalfc(fc, "Mismatching mds_namespace");
kfree(fsopt->mds_namespace);
fsopt->mds_namespace = param->string;
param->string = NULL;
@@ -318,6 +436,8 @@ static int ceph_parse_mount_param(struct fs_context *fc,
if (fc->source)
return invalfc(fc, "Multiple sources specified");
return ceph_parse_source(param, fc);
+ case Opt_mon_addr:
+ return ceph_parse_mon_addr(param, fc);
case Opt_wsize:
if (result.uint_32 < PAGE_SIZE ||
result.uint_32 > CEPH_MAX_WRITE_SIZE)
@@ -444,6 +564,18 @@ static int ceph_parse_mount_param(struct fs_context *fc,
fc->sb_flags &= ~SB_POSIXACL;
}
break;
+ case Opt_wsync:
+ if (!result.negated)
+ fsopt->flags &= ~CEPH_MOUNT_OPT_ASYNC_DIROPS;
+ else
+ fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS;
+ break;
+ case Opt_pagecache:
+ if (result.negated)
+ fsopt->flags |= CEPH_MOUNT_OPT_NOPAGECACHE;
+ else
+ fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE;
+ break;
default:
BUG();
}
@@ -463,6 +595,7 @@ static void destroy_mount_options(struct ceph_mount_options *args)
kfree(args->mds_namespace);
kfree(args->server_path);
kfree(args->fscache_uniq);
+ kfree(args->mon_addr);
kfree(args);
}
@@ -506,6 +639,10 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
if (ret)
return ret;
+ ret = strcmp_null(fsopt1->mon_addr, fsopt2->mon_addr);
+ if (ret)
+ return ret;
+
return ceph_compare_options(new_opt, fsc->client);
}
@@ -561,12 +698,22 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
if ((fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM) == 0)
seq_puts(m, ",copyfrom");
- if (fsopt->mds_namespace)
+ /* dump mds_namespace when old device syntax is in use */
+ if (fsopt->mds_namespace && !fsopt->new_dev_syntax)
seq_show_option(m, "mds_namespace", fsopt->mds_namespace);
+ if (fsopt->mon_addr)
+ seq_printf(m, ",mon_addr=%s", fsopt->mon_addr);
+
if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER)
seq_show_option(m, "recover_session", "clean");
+ if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS))
+ seq_puts(m, ",wsync");
+
+ if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
+ seq_puts(m, ",nopagecache");
+
if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
seq_printf(m, ",wsize=%u", fsopt->wsize);
if (fsopt->rsize != CEPH_MAX_READ_SIZE)
@@ -623,8 +770,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
struct ceph_options *opt)
{
struct ceph_fs_client *fsc;
- int page_count;
- size_t size;
int err;
fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
@@ -659,6 +804,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
fsc->have_copy_from2 = true;
atomic_long_set(&fsc->writeback_count, 0);
+ fsc->write_congested = false;
err = -ENOMEM;
/*
@@ -672,18 +818,15 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
if (!fsc->cap_wq)
goto fail_inode_wq;
- /* set up mempools */
- err = -ENOMEM;
- page_count = fsc->mount_options->wsize >> PAGE_SHIFT;
- size = sizeof (struct page *) * (page_count ? page_count : 1);
- fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size);
- if (!fsc->wb_pagevec_pool)
- goto fail_cap_wq;
+ hash_init(fsc->async_unlink_conflict);
+ spin_lock_init(&fsc->async_unlink_conflict_lock);
+
+ spin_lock(&ceph_fsc_lock);
+ list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list);
+ spin_unlock(&ceph_fsc_lock);
return fsc;
-fail_cap_wq:
- destroy_workqueue(fsc->cap_wq);
fail_inode_wq:
destroy_workqueue(fsc->inode_wq);
fail_client:
@@ -706,12 +849,14 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
{
dout("destroy_fs_client %p\n", fsc);
+ spin_lock(&ceph_fsc_lock);
+ list_del(&fsc->metric_wakeup);
+ spin_unlock(&ceph_fsc_lock);
+
ceph_mdsc_destroy(fsc);
destroy_workqueue(fsc->inode_wq);
destroy_workqueue(fsc->cap_wq);
- mempool_destroy(fsc->wb_pagevec_pool);
-
destroy_mount_options(fsc->mount_options);
ceph_destroy_client(fsc->client);
@@ -725,15 +870,18 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
*/
struct kmem_cache *ceph_inode_cachep;
struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_cap_snap_cachep;
struct kmem_cache *ceph_cap_flush_cachep;
struct kmem_cache *ceph_dentry_cachep;
struct kmem_cache *ceph_file_cachep;
struct kmem_cache *ceph_dir_file_cachep;
+struct kmem_cache *ceph_mds_request_cachep;
+mempool_t *ceph_wb_pagevec_pool;
static void ceph_inode_init_once(void *foo)
{
struct ceph_inode_info *ci = foo;
- inode_init_once(&ci->vfs_inode);
+ inode_init_once(&ci->netfs.inode);
}
static int __init init_caches(void)
@@ -751,6 +899,9 @@ static int __init init_caches(void)
ceph_cap_cachep = KMEM_CACHE(ceph_cap, SLAB_MEM_SPREAD);
if (!ceph_cap_cachep)
goto bad_cap;
+ ceph_cap_snap_cachep = KMEM_CACHE(ceph_cap_snap, SLAB_MEM_SPREAD);
+ if (!ceph_cap_snap_cachep)
+ goto bad_cap_snap;
ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
if (!ceph_cap_flush_cachep)
@@ -769,13 +920,19 @@ static int __init init_caches(void)
if (!ceph_dir_file_cachep)
goto bad_dir_file;
- error = ceph_fscache_register();
- if (error)
- goto bad_fscache;
+ ceph_mds_request_cachep = KMEM_CACHE(ceph_mds_request, SLAB_MEM_SPREAD);
+ if (!ceph_mds_request_cachep)
+ goto bad_mds_req;
+
+ ceph_wb_pagevec_pool = mempool_create_kmalloc_pool(10, (CEPH_MAX_WRITE_SIZE >> PAGE_SHIFT) * sizeof(struct page *));
+ if (!ceph_wb_pagevec_pool)
+ goto bad_pagevec_pool;
return 0;
-bad_fscache:
+bad_pagevec_pool:
+ kmem_cache_destroy(ceph_mds_request_cachep);
+bad_mds_req:
kmem_cache_destroy(ceph_dir_file_cachep);
bad_dir_file:
kmem_cache_destroy(ceph_file_cachep);
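
With this change the per-superblock wb_pagevec_pool (sized from the mount's wsize) is replaced by one module-wide pool sized in bytes for a page-pointer array covering the largest possible write. Writeback callers draw from it roughly like this (a sketch; the real call sites are in addr.c):

/* sketch: allocating a page-pointer array for one writeback batch */
struct page **pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);

/* ... fill the array and submit the OSD write ... */

mempool_free(pages, ceph_wb_pagevec_pool);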
@@ -784,6 +941,8 @@ bad_file:
bad_dentry:
kmem_cache_destroy(ceph_cap_flush_cachep);
bad_cap_flush:
+ kmem_cache_destroy(ceph_cap_snap_cachep);
+bad_cap_snap:
kmem_cache_destroy(ceph_cap_cachep);
bad_cap:
kmem_cache_destroy(ceph_inode_cachep);
@@ -800,19 +959,27 @@ static void destroy_caches(void)
kmem_cache_destroy(ceph_inode_cachep);
kmem_cache_destroy(ceph_cap_cachep);
+ kmem_cache_destroy(ceph_cap_snap_cachep);
kmem_cache_destroy(ceph_cap_flush_cachep);
kmem_cache_destroy(ceph_dentry_cachep);
kmem_cache_destroy(ceph_file_cachep);
kmem_cache_destroy(ceph_dir_file_cachep);
+ kmem_cache_destroy(ceph_mds_request_cachep);
+ mempool_destroy(ceph_wb_pagevec_pool);
+}
- ceph_fscache_unregister();
+static void __ceph_umount_begin(struct ceph_fs_client *fsc)
+{
+ ceph_osdc_abort_requests(&fsc->client->osdc, -EIO);
+ ceph_mdsc_force_umount(fsc->mdsc);
+ fsc->filp_gen++; // invalidate open files
}
/*
- * ceph_umount_begin - initiate forced umount. Tear down down the
+ * ceph_umount_begin - initiate forced umount. Tear down the
* mount, skipping steps that may hang while waiting for server(s).
*/
-static void ceph_umount_begin(struct super_block *sb)
+void ceph_umount_begin(struct super_block *sb)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
@@ -820,9 +987,7 @@ static void ceph_umount_begin(struct super_block *sb)
if (!fsc)
return;
fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
- ceph_osdc_abort_requests(&fsc->client->osdc, -EIO);
- ceph_mdsc_force_umount(fsc->mdsc);
- fsc->filp_gen++; // invalidate open files
+ __ceph_umount_begin(fsc);
}
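
Splitting the teardown into __ceph_umount_begin() lets ceph_force_reconnect() (below) reuse it without flipping mount_state to SHUTDOWN. The superblock path is still reached through ->umount_begin, i.e. a forced unmount from userspace; illustratively (not part of the patch):

#include <sys/mount.h>

/* userspace side: MNT_FORCE is what ends up invoking ->umount_begin() */
int force_umount(const char *mnt)
{
	return umount2(mnt, MNT_FORCE);
}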
static const struct super_operations ceph_super_ops = {
@@ -959,6 +1124,7 @@ static int ceph_set_super(struct super_block *s, struct fs_context *fc)
s->s_time_gran = 1;
s->s_time_min = 0;
s->s_time_max = U32_MAX;
+ s->s_flags |= SB_NODIRATIME | SB_NOATIME;
ret = set_anon_super_fc(s, fc);
if (ret != 0)
@@ -974,16 +1140,16 @@ static int ceph_compare_super(struct super_block *sb, struct fs_context *fc)
struct ceph_fs_client *new = fc->s_fs_info;
struct ceph_mount_options *fsopt = new->mount_options;
struct ceph_options *opt = new->client->options;
- struct ceph_fs_client *other = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
dout("ceph_compare_super %p\n", sb);
- if (compare_mount_options(fsopt, opt, other)) {
+ if (compare_mount_options(fsopt, opt, fsc)) {
dout("monitor(s)/mount options don't match\n");
return 0;
}
if ((opt->flags & CEPH_OPT_FSID) &&
- ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
+ ceph_fsid_compare(&opt->fsid, &fsc->client->fsid)) {
dout("fsid doesn't match\n");
return 0;
}
@@ -991,6 +1157,17 @@ static int ceph_compare_super(struct super_block *sb, struct fs_context *fc)
dout("flags differ\n");
return 0;
}
+
+ if (fsc->blocklisted && !ceph_test_mount_opt(fsc, CLEANRECOVER)) {
+ dout("client is blocklisted (and CLEANRECOVER is not set)\n");
+ return 0;
+ }
+
+ if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
+ dout("client has been forcibly unmounted\n");
+ return 0;
+ }
+
return 1;
}
@@ -1020,6 +1197,7 @@ static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc)
static int ceph_get_tree(struct fs_context *fc)
{
struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+ struct ceph_mount_options *fsopt = pctx->opts;
struct super_block *sb;
struct ceph_fs_client *fsc;
struct dentry *res;
@@ -1031,6 +1209,8 @@ static int ceph_get_tree(struct fs_context *fc)
if (!fc->source)
return invalfc(fc, "No source");
+ if (fsopt->new_dev_syntax && !fsopt->mon_addr)
+ return invalfc(fc, "No monitor address");
/* create client (which we may/may not use) */
fsc = create_fs_client(pctx->opts, pctx->copts);
@@ -1107,6 +1287,22 @@ static void ceph_free_fc(struct fs_context *fc)
static int ceph_reconfigure_fc(struct fs_context *fc)
{
+ struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+ struct ceph_mount_options *fsopt = pctx->opts;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(fc->root->d_sb);
+
+ if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
+ ceph_set_mount_opt(fsc, ASYNC_DIROPS);
+ else
+ ceph_clear_mount_opt(fsc, ASYNC_DIROPS);
+
+ if (strcmp_null(fsc->mount_options->mon_addr, fsopt->mon_addr)) {
+ kfree(fsc->mount_options->mon_addr);
+ fsc->mount_options->mon_addr = fsopt->mon_addr;
+ fsopt->mon_addr = NULL;
+ pr_notice("ceph: monitor addresses recorded, but not used for reconnection");
+ }
+
sync_filesystem(fc->root->d_sb);
return 0;
}
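
Reconfiguration arrives via remount; only the wsync toggle actually takes effect here, while a changed mon_addr is recorded but ignored, as the pr_notice says. An illustrative userspace trigger (hypothetical mountpoint):

#include <sys/mount.h>

/* remount disabling async dirops; the data string uses the option
 * spelling parsed above */
int remount_wsync(const char *mnt)
{
	return mount(NULL, mnt, NULL, MS_REMOUNT, "wsync");
}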
@@ -1172,14 +1368,13 @@ nomem:
static void ceph_kill_sb(struct super_block *s)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(s);
- dev_t dev = s->s_dev;
dout("kill_sb %p\n", s);
ceph_mdsc_pre_umount(fsc->mdsc);
flush_fs_workqueues(fsc);
- generic_shutdown_super(s);
+ kill_anon_super(s);
fsc->client->extra_mon_dispatch = NULL;
ceph_fs_debugfs_cleanup(fsc);
@@ -1187,7 +1382,6 @@ static void ceph_kill_sb(struct super_block *s)
ceph_fscache_unregister_fs(fsc);
destroy_fs_client(fsc);
- free_anon_bdev(dev);
}
static struct file_system_type ceph_fs_type = {
@@ -1204,19 +1398,20 @@ int ceph_force_reconnect(struct super_block *sb)
struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
int err = 0;
- ceph_umount_begin(sb);
+ fsc->mount_state = CEPH_MOUNT_RECOVER;
+ __ceph_umount_begin(fsc);
/* Make sure all page caches get invalidated.
* see remove_session_caps_cb() */
flush_workqueue(fsc->inode_wq);
- /* In case that we were blacklisted. This also reset
+ /* In case that we were blocklisted. This also resets
* all mon/osd connections */
ceph_reset_client_addr(fsc->client);
ceph_osdc_clear_abort_err(&fsc->client->osdc);
- fsc->blacklisted = false;
+ fsc->blocklisted = false;
fsc->mount_state = CEPH_MOUNT_MOUNTED;
if (sb->s_root) {
@@ -1254,6 +1449,45 @@ static void __exit exit_ceph(void)
destroy_caches();
}
+static int param_set_metrics(const char *val, const struct kernel_param *kp)
+{
+ struct ceph_fs_client *fsc;
+ int ret;
+
+ ret = param_set_bool(val, kp);
+ if (ret) {
+ pr_err("Failed to parse sending metrics switch value '%s'\n",
+ val);
+ return ret;
+ } else if (!disable_send_metrics) {
+ // wake up all the mds clients
+ spin_lock(&ceph_fsc_lock);
+ list_for_each_entry(fsc, &ceph_fsc_list, metric_wakeup) {
+ metric_schedule_delayed(&fsc->mdsc->metric);
+ }
+ spin_unlock(&ceph_fsc_lock);
+ }
+
+ return 0;
+}
+
+static const struct kernel_param_ops param_ops_metrics = {
+ .set = param_set_metrics,
+ .get = param_get_bool,
+};
+
+bool disable_send_metrics = false;
+module_param_cb(disable_send_metrics, &param_ops_metrics, &disable_send_metrics, 0644);
+MODULE_PARM_DESC(disable_send_metrics, "Enable sending perf metrics to ceph cluster (default: on)");
+
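
Flipping the switch back on takes effect immediately: the setter walks ceph_fsc_list and re-arms each client's delayed metrics work. The scheduling side presumably gates on the same flag; a sketch under that assumption (the real logic, and the exact member names, live in metric.c/metric.h):

static void metric_schedule_delayed_sketch(struct ceph_client_metric *m)
{
	/* sketch only: the 'delayed_work' member name is an assumption */
	if (disable_send_metrics)
		return;
	schedule_delayed_work(&m->delayed_work,
			      round_jiffies_relative(HZ));
}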
+/* for both v1 and v2 syntax */
+static bool mount_support = true;
+static const struct kernel_param_ops param_ops_mount_syntax = {
+ .get = param_get_bool,
+};
+module_param_cb(mount_syntax_v1, &param_ops_mount_syntax, &mount_support, 0444);
+module_param_cb(mount_syntax_v2, &param_ops_mount_syntax, &mount_support, 0444);
+
module_init(init_ceph);
module_exit(exit_ceph);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 037cdfb2ad4f..40630e6f691c 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -17,22 +17,19 @@
#include <linux/posix_acl.h>
#include <linux/refcount.h>
#include <linux/security.h>
-
-#include <linux/ceph/libceph.h>
-
-#ifdef CONFIG_CEPH_FSCACHE
+#include <linux/netfs.h>
#include <linux/fscache.h>
-#endif
+#include <linux/hashtable.h>
-/* f_type in struct statfs */
-#define CEPH_SUPER_MAGIC 0x00c36400
+#include <linux/ceph/libceph.h>
/* large granularity for statfs utilization stats to facilitate
* large volume sizes on 32-bit machines. */
#define CEPH_BLOCK_SHIFT 22 /* 4 MB */
#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
+#define CEPH_4K_BLOCK_SHIFT 12 /* 4 KB */
-#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blacklisted */
+#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reconnect (clean mode) after blocklisted */
#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
@@ -43,13 +40,18 @@
#define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */
#define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */
#define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */
+#define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */
+#define CEPH_MOUNT_OPT_NOPAGECACHE (1<<16) /* bypass pagecache altogether */
#define CEPH_MOUNT_OPT_DEFAULT \
(CEPH_MOUNT_OPT_DCACHE | \
- CEPH_MOUNT_OPT_NOCOPYFROM)
+ CEPH_MOUNT_OPT_NOCOPYFROM | \
+ CEPH_MOUNT_OPT_ASYNC_DIROPS)
#define ceph_set_mount_opt(fsc, opt) \
- (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
+ (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt
+#define ceph_clear_mount_opt(fsc, opt) \
+ (fsc)->mount_options->flags &= ~CEPH_MOUNT_OPT_##opt
#define ceph_test_mount_opt(fsc, opt) \
(!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
@@ -84,6 +86,8 @@ struct ceph_mount_options {
unsigned int max_readdir; /* max readdir result (entries) */
unsigned int max_readdir_bytes; /* max readdir result (bytes) */
+ bool new_dev_syntax;
+
/*
* everything above this point can be memcmp'd; everything below
* is handled in compare_mount_options()
@@ -93,18 +97,22 @@ struct ceph_mount_options {
char *mds_namespace; /* default NULL */
char *server_path; /* default NULL (means "/") */
char *fscache_uniq; /* default NULL */
+ char *mon_addr;
};
+#define CEPH_ASYNC_CREATE_CONFLICT_BITS 8
+
struct ceph_fs_client {
struct super_block *sb;
+ struct list_head metric_wakeup;
+
struct ceph_mount_options *mount_options;
struct ceph_client *client;
- unsigned long mount_state;
+ int mount_state;
- unsigned long last_auto_reconnect;
- bool blacklisted;
+ bool blocklisted;
bool have_copy_from2;
@@ -113,23 +121,27 @@ struct ceph_fs_client {
struct ceph_mds_client *mdsc;
- /* writeback */
- mempool_t *wb_pagevec_pool;
atomic_long_t writeback_count;
+ bool write_congested;
struct workqueue_struct *inode_wq;
struct workqueue_struct *cap_wq;
+ DECLARE_HASHTABLE(async_unlink_conflict, CEPH_ASYNC_CREATE_CONFLICT_BITS);
+ spinlock_t async_unlink_conflict_lock;
+
#ifdef CONFIG_DEBUG_FS
struct dentry *debugfs_dentry_lru, *debugfs_caps;
struct dentry *debugfs_congestion_kb;
struct dentry *debugfs_bdi;
struct dentry *debugfs_mdsc, *debugfs_mdsmap;
+ struct dentry *debugfs_status;
struct dentry *debugfs_mds_sessions;
+ struct dentry *debugfs_metrics_dir;
#endif
#ifdef CONFIG_CEPH_FSCACHE
- struct fscache_cookie *fscache;
+ struct fscache_volume *fscache;
#endif
};
@@ -156,7 +168,8 @@ struct ceph_cap {
int issued; /* latest, from the mds */
int implemented; /* implemented superset of
issued (for revocation) */
- int mds, mds_wanted;
+ int mds; /* mds index for this cap */
+ int mds_wanted; /* caps wanted from this mds */
};
/* caps to release */
struct {
@@ -170,14 +183,15 @@ struct ceph_cap {
struct list_head caps_item;
};
-#define CHECK_CAPS_NODELAY 1 /* do not delay any further */
-#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
-#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
+#define CHECK_CAPS_AUTHONLY 1 /* only check auth cap */
+#define CHECK_CAPS_FLUSH 2 /* flush any dirty caps */
+#define CHECK_CAPS_NOINVAL 4 /* don't invalidate pagecache */
struct ceph_cap_flush {
u64 tid;
- int caps; /* 0 means capsnap */
+ int caps;
bool wake; /* wake up flush waiters when finish ? */
+ bool is_capsnap; /* true means capsnap */
struct list_head g_list; // global
struct list_head i_list; // per inode
};
@@ -221,7 +235,7 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
if (refcount_dec_and_test(&capsnap->nref)) {
if (capsnap->xattr_blob)
ceph_buffer_put(capsnap->xattr_blob);
- kfree(capsnap);
+ kmem_cache_free(ceph_cap_snap_cachep, capsnap);
}
}
@@ -272,7 +286,8 @@ struct ceph_dentry_info {
struct dentry *dentry;
struct ceph_mds_session *lease_session;
struct list_head lease_list;
- unsigned flags;
+ struct hlist_node hnode;
+ unsigned long flags;
int lease_shared_gen;
u32 lease_gen;
u32 lease_seq;
@@ -281,9 +296,14 @@ struct ceph_dentry_info {
u64 offset;
};
-#define CEPH_DENTRY_REFERENCED 1
-#define CEPH_DENTRY_LEASE_LIST 2
-#define CEPH_DENTRY_SHRINK_LIST 4
+#define CEPH_DENTRY_REFERENCED (1 << 0)
+#define CEPH_DENTRY_LEASE_LIST (1 << 1)
+#define CEPH_DENTRY_SHRINK_LIST (1 << 2)
+#define CEPH_DENTRY_PRIMARY_LINK (1 << 3)
+#define CEPH_DENTRY_ASYNC_UNLINK_BIT (4)
+#define CEPH_DENTRY_ASYNC_UNLINK (1 << CEPH_DENTRY_ASYNC_UNLINK_BIT)
+#define CEPH_DENTRY_ASYNC_CREATE_BIT (5)
+#define CEPH_DENTRY_ASYNC_CREATE (1 << CEPH_DENTRY_ASYNC_CREATE_BIT)
struct ceph_inode_xattrs_info {
/*
@@ -307,6 +327,7 @@ struct ceph_inode_xattrs_info {
* Ceph inode.
*/
struct ceph_inode_info {
+ struct netfs_inode netfs; /* Netfslib context and vfs inode */
struct ceph_vino i_vino; /* ceph ino + snap */
spinlock_t i_ceph_lock;
@@ -315,18 +336,19 @@ struct ceph_inode_info {
u64 i_inline_version;
u32 i_time_warp_seq;
- unsigned i_ceph_flags;
+ unsigned long i_ceph_flags;
atomic64_t i_release_count;
atomic64_t i_ordered_count;
atomic64_t i_complete_seq[2];
struct ceph_dir_layout i_dir_layout;
struct ceph_file_layout i_layout;
+ struct ceph_file_layout i_cached_layout; // for async creates
char *i_symlink;
/* for dirs */
struct timespec64 i_rctime;
- u64 i_rbytes, i_rfiles, i_rsubdirs;
+ u64 i_rbytes, i_rfiles, i_rsubdirs, i_rsnaps;
u64 i_files, i_subdirs;
/* quotas */
@@ -345,14 +367,31 @@ struct ceph_inode_info {
struct rb_root i_caps; /* cap list */
struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
- struct list_head i_dirty_item, i_flushing_item;
+
+ /*
+ * Link to the auth cap's session's s_cap_dirty list. s_cap_dirty
+ * is protected by the mdsc->cap_dirty_lock, but each individual item
+ * is also protected by the inode's i_ceph_lock. Walking s_cap_dirty
+ * requires the mdsc->cap_dirty_lock. List presence for an item can
+ * be tested under the i_ceph_lock. Changing anything requires both.
+ */
+ struct list_head i_dirty_item;
+
+ /*
+ * Link to session's s_cap_flushing list. Protected in a similar
+ * fashion to i_dirty_item, but also by the s_mutex for changes. The
+ * s_cap_flushing list can be walked while holding either the s_mutex
+ * or mdsc->cap_dirty_lock. List presence can also be checked while
+ * holding the i_ceph_lock for this inode.
+ */
+ struct list_head i_flushing_item;
+
/* we need to track cap writeback on a per-cap-bit basis, to allow
* overlapping, pipelined cap flushes to the mds. we can probably
* reduce the tid to 8 bits if we're concerned about inode size. */
struct ceph_cap_flush *i_prealloc_cap_flush;
struct list_head i_cap_flush_list;
wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
- unsigned long i_hold_caps_min; /* jiffies */
unsigned long i_hold_caps_max; /* jiffies */
struct list_head i_cap_delay_list; /* for delayed cap release to mds */
struct ceph_cap_reservation i_cap_migration_resv;
@@ -361,6 +400,8 @@ struct ceph_inode_info {
dirty|flushing caps */
unsigned i_snap_caps; /* cap bits for snapped files */
+ unsigned long i_last_rd;
+ unsigned long i_last_wr;
int i_nr_by_mode[CEPH_FILE_MODE_BITS]; /* open file counts */
struct mutex i_truncate_mutex;
@@ -375,7 +416,7 @@ struct ceph_inode_info {
/* held references to caps */
int i_pin_ref;
- int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref;
+ int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref, i_fx_ref;
int i_wrbuffer_ref, i_wrbuffer_ref_head;
atomic_t i_filelock_ref;
atomic_t i_shared_gen; /* increment each time we get FILE_SHARED */
@@ -390,7 +431,6 @@ struct ceph_inode_info {
struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */
};
- int i_snap_realm_counter; /* snap realm (if caps) */
struct list_head i_snap_realm_item;
struct list_head i_snap_flush_item;
struct timespec64 i_btime;
@@ -398,20 +438,12 @@ struct ceph_inode_info {
struct work_struct i_work;
unsigned long i_work_mask;
-
-#ifdef CONFIG_CEPH_FSCACHE
- struct fscache_cookie *fscache;
- u32 i_fscache_gen;
-#endif
- errseq_t i_meta_err;
-
- struct inode vfs_inode; /* at end */
};
static inline struct ceph_inode_info *
ceph_inode(const struct inode *inode)
{
- return container_of(inode, struct ceph_inode_info, vfs_inode);
+ return container_of(inode, struct ceph_inode_info, netfs.inode);
}
static inline struct ceph_fs_client *
@@ -426,21 +458,19 @@ ceph_sb_to_client(const struct super_block *sb)
return (struct ceph_fs_client *)sb->s_fs_info;
}
+static inline struct ceph_mds_client *
+ceph_sb_to_mdsc(const struct super_block *sb)
+{
+ return (struct ceph_mds_client *)ceph_sb_to_client(sb)->mdsc;
+}
+
static inline struct ceph_vino
ceph_vino(const struct inode *inode)
{
return ceph_inode(inode)->i_vino;
}
-/*
- * ino_t is <64 bits on many architectures, blech.
- *
- * i_ino (kernel inode) st_ino (userspace)
- * i386 32 32
- * x86_64+ino32 64 32
- * x86_64 64 64
- */
-static inline u32 ceph_ino_to_ino32(__u64 vino)
+static inline u32 ceph_ino_to_ino32(u64 vino)
{
u32 ino = vino & 0xffffffff;
ino ^= vino >> 32;
@@ -450,34 +480,17 @@ static inline u32 ceph_ino_to_ino32(__u64 vino)
}
/*
- * kernel i_ino value
+ * Inode numbers in cephfs are 64 bits, but inode->i_ino is 32-bits on
+ * some arches. We generally do not use this value inside the ceph driver, but
+ * we do want to set it to something, so that generic vfs code has an
+ * appropriate value for tracepoints and the like.
*/
-static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
+static inline ino_t ceph_vino_to_ino_t(struct ceph_vino vino)
{
-#if BITS_PER_LONG == 32
- return ceph_ino_to_ino32(vino.ino);
-#else
+ if (sizeof(ino_t) == sizeof(u32))
+ return ceph_ino_to_ino32(vino.ino);
return (ino_t)vino.ino;
-#endif
-}
-
-/*
- * user-visible ino (stat, filldir)
- */
-#if BITS_PER_LONG == 32
-static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
-{
- return ino;
}
-#else
-static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
-{
- if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
- ino = ceph_ino_to_ino32(ino);
- return ino;
-}
-#endif
-
/* for printf-style formatting */
#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
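
ceph_ino_to_ino32() folds the 64-bit inode number down by XORing the high word into the low word. A worked example:

/* illustration of the xor-fold above */
u32 ino32 = ceph_ino_to_ino32(0x100000003ULL);
/* low 0x00000003 ^ high 0x00000001 == 0x00000002 */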
@@ -486,11 +499,34 @@ static inline u64 ceph_ino(struct inode *inode)
{
return ceph_inode(inode)->i_vino.ino;
}
+
static inline u64 ceph_snap(struct inode *inode)
{
return ceph_inode(inode)->i_vino.snap;
}
+/**
+ * ceph_present_ino - format an inode number for presentation to userland
+ * @sb: superblock where the inode lives
+ * @ino: inode number to (possibly) convert
+ *
+ * If the user mounted with the ino32 option, then the 64-bit value needs
+ * to be converted to something that can fit inside 32 bits. Note that
+ * internal kernel code never uses this value, so this is entirely for
+ * userland consumption.
+ */
+static inline u64 ceph_present_ino(struct super_block *sb, u64 ino)
+{
+ if (unlikely(ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)))
+ return ceph_ino_to_ino32(ino);
+ return ino;
+}
+
+static inline u64 ceph_present_inode(struct inode *inode)
+{
+ return ceph_present_ino(inode->i_sb, ceph_ino(inode));
+}
+
static inline int ceph_ino_compare(struct inode *inode, void *data)
{
struct ceph_vino *pvino = (struct ceph_vino *)data;
@@ -499,11 +535,44 @@ static inline int ceph_ino_compare(struct inode *inode, void *data)
ci->i_vino.snap == pvino->snap;
}
+/*
+ * The MDS reserves a set of inodes for its own usage. These should never
+ * be accessible by clients, and so the MDS has no reason to ever hand these
+ * out. The range is CEPH_MDS_INO_MDSDIR_OFFSET..CEPH_INO_SYSTEM_BASE.
+ *
+ * These come from src/mds/mdstypes.h in the ceph sources.
+ */
+#define CEPH_MAX_MDS 0x100
+#define CEPH_NUM_STRAY 10
+#define CEPH_MDS_INO_MDSDIR_OFFSET (1 * CEPH_MAX_MDS)
+#define CEPH_MDS_INO_LOG_OFFSET (2 * CEPH_MAX_MDS)
+#define CEPH_INO_SYSTEM_BASE ((6*CEPH_MAX_MDS) + (CEPH_MAX_MDS * CEPH_NUM_STRAY))
+
+static inline bool ceph_vino_is_reserved(const struct ceph_vino vino)
+{
+ if (vino.ino >= CEPH_INO_SYSTEM_BASE ||
+ vino.ino < CEPH_MDS_INO_MDSDIR_OFFSET)
+ return false;
+
+ /* Don't warn on mdsdirs */
+ WARN_RATELIMIT(vino.ino >= CEPH_MDS_INO_LOG_OFFSET,
+ "Attempt to access reserved inode number 0x%llx",
+ vino.ino);
+ return true;
+}
+
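
Plugging in the constants above gives the concrete reserved range:

/* worked values:
 *   CEPH_MDS_INO_MDSDIR_OFFSET = 1 * 0x100           = 0x100
 *   CEPH_MDS_INO_LOG_OFFSET    = 2 * 0x100           = 0x200
 *   CEPH_INO_SYSTEM_BASE       = 6*0x100 + 10*0x100  = 0x1000
 * so vino.ino in [0x100, 0x1000) is treated as reserved, and only
 * values at or above 0x200 (past the per-MDS mdsdirs) warn */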
static inline struct inode *ceph_find_inode(struct super_block *sb,
struct ceph_vino vino)
{
- ino_t t = ceph_vino_to_ino(vino);
- return ilookup5(sb, t, ceph_ino_compare, &vino);
+ if (ceph_vino_is_reserved(vino))
+ return NULL;
+
+ /*
+ * NB: The hashval will be run through the fs/inode.c hash function
+ * anyway, so there is no need to squash the inode number down to
+ * 32-bits first. Just use low-order bits on arches with 32-bit long.
+ */
+ return ilookup5(sb, (unsigned long)vino.ino, ceph_ino_compare, &vino);
}
@@ -511,25 +580,28 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
* Ceph inode.
*/
#define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */
-#define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */
#define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */
#define CEPH_I_POOL_PERM (1 << 3) /* pool rd/wr bits are valid */
#define CEPH_I_POOL_RD (1 << 4) /* can read from pool */
#define CEPH_I_POOL_WR (1 << 5) /* can write to pool */
#define CEPH_I_SEC_INITED (1 << 6) /* security initialized */
-#define CEPH_I_CAP_DROPPED (1 << 7) /* caps were forcibly dropped */
-#define CEPH_I_KICK_FLUSH (1 << 8) /* kick flushing caps */
-#define CEPH_I_FLUSH_SNAPS (1 << 9) /* need flush snapss */
-#define CEPH_I_ERROR_WRITE (1 << 10) /* have seen write errors */
-#define CEPH_I_ERROR_FILELOCK (1 << 11) /* have seen file lock errors */
-#define CEPH_I_ODIRECT (1 << 12) /* inode in direct I/O mode */
+#define CEPH_I_KICK_FLUSH (1 << 7) /* kick flushing caps */
+#define CEPH_I_FLUSH_SNAPS (1 << 8) /* need flush snaps */
+#define CEPH_I_ERROR_WRITE (1 << 9) /* have seen write errors */
+#define CEPH_I_ERROR_FILELOCK (1 << 10) /* have seen file lock errors */
+#define CEPH_I_ODIRECT (1 << 11) /* inode in direct I/O mode */
+#define CEPH_ASYNC_CREATE_BIT (12) /* async create in flight for this */
+#define CEPH_I_ASYNC_CREATE (1 << CEPH_ASYNC_CREATE_BIT)
+#define CEPH_I_SHUTDOWN (1 << 13) /* inode is no longer usable */
/*
* Masks of ceph inode work.
*/
-#define CEPH_I_WORK_WRITEBACK 0 /* writeback */
-#define CEPH_I_WORK_INVALIDATE_PAGES 1 /* invalidate pages */
-#define CEPH_I_WORK_VMTRUNCATE 2 /* vmtruncate */
+#define CEPH_I_WORK_WRITEBACK 0
+#define CEPH_I_WORK_INVALIDATE_PAGES 1
+#define CEPH_I_WORK_VMTRUNCATE 2
+#define CEPH_I_WORK_CHECK_CAPS 3
+#define CEPH_I_WORK_FLUSH_SNAPS 4
/*
* We set the ERROR_WRITE bit when we start seeing write errors on an inode
@@ -638,6 +710,8 @@ static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented);
extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t);
+extern int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask,
+ int t);
extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
struct ceph_cap *cap);
@@ -650,12 +724,12 @@ static inline int ceph_caps_issued(struct ceph_inode_info *ci)
return issued;
}
-static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
- int touch)
+static inline int ceph_caps_issued_mask_metric(struct ceph_inode_info *ci,
+ int mask, int touch)
{
int r;
spin_lock(&ci->i_ceph_lock);
- r = __ceph_caps_issued_mask(ci, mask, touch);
+ r = __ceph_caps_issued_mask_metric(ci, mask, touch);
spin_unlock(&ci->i_ceph_lock);
return r;
}
@@ -674,18 +748,12 @@ extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
extern int __ceph_caps_used(struct ceph_inode_info *ci);
-extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
-
-/*
- * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
- */
-static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
+static inline bool __ceph_is_file_opened(struct ceph_inode_info *ci)
{
- int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
- if (w & CEPH_CAP_FILE_BUFFER)
- w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */
- return w;
+ return ci->i_nr_by_mode[0];
}
+extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
+extern int __ceph_caps_wanted(struct ceph_inode_info *ci);
/* what the mds thinks we want */
extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check);
@@ -701,6 +769,8 @@ extern void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
extern void ceph_reservation_status(struct ceph_fs_client *client,
int *total, int *avail, int *used,
int *reserved, int *min);
+extern void change_auth_cap_ses(struct ceph_inode_info *ci,
+ struct ceph_mds_session *session);
@@ -717,7 +787,6 @@ struct ceph_file_info {
spinlock_t rw_contexts_lock;
struct list_head rw_contexts;
- errseq_t meta_err;
u32 filp_gen;
atomic_t num_locks;
};
@@ -822,6 +891,8 @@ struct ceph_snap_realm {
struct list_head dirty_item; /* if realm needs new context */
+ struct list_head rebuild_item; /* rebuild snap realms _downward_ in hierarchy */
+
/* the current set of snaps for this realm */
struct ceph_snap_context *cached_context;
@@ -871,13 +942,13 @@ extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
extern int ceph_update_snap_trace(struct ceph_mds_client *m,
void *p, void *e, bool deletion,
struct ceph_snap_realm **realm_ret);
+void ceph_change_snap_realm(struct inode *inode, struct ceph_snap_realm *realm);
extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg);
-extern void ceph_queue_cap_snap(struct ceph_inode_info *ci);
extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap);
-extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
+extern void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc);
extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc,
u64 snap);
@@ -885,6 +956,7 @@ extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
struct ceph_snapid_map *sm);
extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc);
extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc);
+void ceph_umount_begin(struct super_block *sb);
/*
@@ -899,6 +971,9 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
}
/* inode.c */
+struct ceph_mds_reply_info_in;
+struct ceph_mds_reply_dirfrag;
+
extern const struct inode_operations ceph_file_iops;
extern struct inode *ceph_alloc_inode(struct super_block *sb);
@@ -914,6 +989,11 @@ extern void ceph_fill_file_time(struct inode *inode, int issued,
u64 time_warp_seq, struct timespec64 *ctime,
struct timespec64 *mtime,
struct timespec64 *atime);
+extern int ceph_fill_inode(struct inode *inode, struct page *locked_page,
+ struct ceph_mds_reply_info_in *iinfo,
+ struct ceph_mds_reply_dirfrag *dirinfo,
+ struct ceph_mds_session *session, int cap_fmode,
+ struct ceph_cap_reservation *caps_reservation);
extern int ceph_fill_trace(struct super_block *sb,
struct ceph_mds_request *req);
extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
@@ -923,25 +1003,63 @@ extern int ceph_inode_holds_cap(struct inode *inode, int mask);
extern bool ceph_inode_set_size(struct inode *inode, loff_t size);
extern void __ceph_do_pending_vmtruncate(struct inode *inode);
-extern void ceph_queue_vmtruncate(struct inode *inode);
-extern void ceph_queue_invalidate(struct inode *inode);
-extern void ceph_queue_writeback(struct inode *inode);
-extern void ceph_async_iput(struct inode *inode);
+void ceph_queue_inode_work(struct inode *inode, int work_bit);
+
+static inline void ceph_queue_vmtruncate(struct inode *inode)
+{
+ ceph_queue_inode_work(inode, CEPH_I_WORK_VMTRUNCATE);
+}
+
+static inline void ceph_queue_invalidate(struct inode *inode)
+{
+ ceph_queue_inode_work(inode, CEPH_I_WORK_INVALIDATE_PAGES);
+}
+
+static inline void ceph_queue_writeback(struct inode *inode)
+{
+ ceph_queue_inode_work(inode, CEPH_I_WORK_WRITEBACK);
+}
+
+static inline void ceph_queue_check_caps(struct inode *inode)
+{
+ ceph_queue_inode_work(inode, CEPH_I_WORK_CHECK_CAPS);
+}
+
+static inline void ceph_queue_flush_snaps(struct inode *inode)
+{
+ ceph_queue_inode_work(inode, CEPH_I_WORK_FLUSH_SNAPS);
+}
+
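
All five wrappers funnel into ceph_queue_inode_work(), which marks the pending bit and kicks the per-fs workqueue while keeping the inode pinned. A sketch of the expected shape (the actual body is in inode.c):

void ceph_queue_inode_work(struct inode *inode, int work_bit)
{
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_inode_info *ci = ceph_inode(inode);

	set_bit(work_bit, &ci->i_work_mask);

	ihold(inode);	/* keep the inode pinned while work is pending */
	if (!queue_work(fsc->inode_wq, &ci->i_work)) {
		/* work was already queued; drop the extra reference */
		iput(inode);
	}
}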
+extern int ceph_try_to_choose_auth_mds(struct inode *inode, int mask);
extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
int mask, bool force);
static inline int ceph_do_getattr(struct inode *inode, int mask, bool force)
{
return __ceph_do_getattr(inode, NULL, mask, force);
}
-extern int ceph_permission(struct inode *inode, int mask);
+extern int ceph_permission(struct user_namespace *mnt_userns,
+ struct inode *inode, int mask);
extern int __ceph_setattr(struct inode *inode, struct iattr *attr);
-extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
-extern int ceph_getattr(const struct path *path, struct kstat *stat,
+extern int ceph_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *attr);
+extern int ceph_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags);
+void ceph_inode_shutdown(struct inode *inode);
+
+static inline bool ceph_inode_is_shutdown(struct inode *inode)
+{
+ unsigned long flags = READ_ONCE(ceph_inode(inode)->i_ceph_flags);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ int state = READ_ONCE(fsc->mount_state);
+
+ return (flags & CEPH_I_SHUTDOWN) || state >= CEPH_MOUNT_SHUTDOWN;
+}
/* xattr.c */
int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
+int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size);
ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
@@ -997,8 +1115,9 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx);
/* acl.c */
#ifdef CONFIG_CEPH_FS_POSIX_ACL
-struct posix_acl *ceph_get_acl(struct inode *, int);
-int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+struct posix_acl *ceph_get_acl(struct inode *, int, bool);
+int ceph_set_acl(struct user_namespace *mnt_userns,
+ struct inode *inode, struct posix_acl *acl, int type);
int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
struct ceph_acl_sec_ctx *as_ctx);
void ceph_init_inode_acls(struct inode *inode,
@@ -1042,10 +1161,11 @@ extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx);
extern void ceph_add_cap(struct inode *inode,
struct ceph_mds_session *session, u64 cap_id,
- int fmode, unsigned issued, unsigned wanted,
+ unsigned issued, unsigned wanted,
unsigned cap, unsigned seq, u64 realmino, int flags,
struct ceph_cap **new_cap);
extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
+extern void ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
extern void __ceph_remove_caps(struct ceph_inode_info *ci);
extern void ceph_put_cap(struct ceph_mds_client *mdsc,
struct ceph_cap *cap);
@@ -1058,18 +1178,31 @@ extern void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
+void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
+ struct ceph_inode_info *ci);
extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
int mds);
+extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps,
+ bool snap_rwsem_locked);
extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
+extern void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had);
+extern void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci,
+ int had);
extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc);
+extern void __ceph_remove_capsnap(struct inode *inode,
+ struct ceph_cap_snap *capsnap,
+ bool *wake_ci, bool *wake_mdsc);
+extern void ceph_remove_capsnap(struct inode *inode,
+ struct ceph_cap_snap *capsnap,
+ bool *wake_ci, bool *wake_mdsc);
extern void ceph_flush_snaps(struct ceph_inode_info *ci,
struct ceph_mds_session **psession);
extern bool __ceph_should_report_size(struct ceph_inode_info *ci);
extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session);
-extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
+extern unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
extern int ceph_drop_caps_for_unlink(struct inode *inode);
extern int ceph_encode_inode_release(void **p, struct inode *inode,
@@ -1079,25 +1212,37 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
int mds, int drop, int unless);
extern int ceph_get_caps(struct file *filp, int need, int want,
- loff_t endoff, int *got, struct page **pinned_page);
+ loff_t endoff, int *got);
extern int ceph_try_get_caps(struct inode *inode,
int need, int want, bool nonblock, int *got);
/* for counting open files by mode */
-extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
-extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
+extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count);
+extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count);
+extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
+ struct ceph_mds_client *mdsc, int fmode);
/* addr.c */
extern const struct address_space_operations ceph_aops;
+extern const struct netfs_request_ops ceph_netfs_ops;
extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
-extern int ceph_uninline_data(struct file *filp, struct page *locked_page);
+extern int ceph_uninline_data(struct file *file);
extern int ceph_pool_perm_check(struct inode *inode, int need);
extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
+int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate);
+
+static inline bool ceph_has_inline_data(struct ceph_inode_info *ci)
+{
+ if (ci->i_inline_version == CEPH_INLINE_NONE ||
+ ci->i_inline_version == 1) /* initial version, no data */
+ return false;
+ return true;
+}
/* file.c */
extern const struct file_operations ceph_file_fops;
-extern int ceph_renew_caps(struct inode *inode);
+extern int ceph_renew_caps(struct inode *inode, int fmode);
extern int ceph_open(struct inode *inode, struct file *file);
extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct file *file, unsigned flags, umode_t mode);
@@ -1114,8 +1259,8 @@ extern const struct dentry_operations ceph_dentry_ops;
extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order);
extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
-extern int ceph_handle_snapdir(struct ceph_mds_request *req,
- struct dentry *dentry, int err);
+extern struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
+ struct dentry *dentry);
extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
struct dentry *dentry, int err);
@@ -1151,9 +1296,29 @@ extern void ceph_fs_debugfs_init(struct ceph_fs_client *client);
extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
/* quota.c */
-static inline bool __ceph_has_any_quota(struct ceph_inode_info *ci)
+
+enum quota_get_realm {
+ QUOTA_GET_MAX_FILES,
+ QUOTA_GET_MAX_BYTES,
+ QUOTA_GET_ANY
+};
+
+static inline bool __ceph_has_quota(struct ceph_inode_info *ci,
+ enum quota_get_realm which)
{
- return ci->i_max_files || ci->i_max_bytes;
+ bool has_quota = false;
+
+ switch (which) {
+ case QUOTA_GET_MAX_BYTES:
+ has_quota = !!ci->i_max_bytes;
+ break;
+ case QUOTA_GET_MAX_FILES:
+ has_quota = !!ci->i_max_files;
+ break;
+ default:
+ has_quota = !!(ci->i_max_files || ci->i_max_bytes);
+ }
+ return has_quota;
}
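
The enum lets callers ask about one limit at a time, e.g. (illustrative only; the helper name below is hypothetical, real users live in quota.c):

if (__ceph_has_quota(ci, QUOTA_GET_MAX_BYTES))
	report_quota_as_fs_size(ci);	/* hypothetical caller */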
extern void ceph_adjust_quota_realms_count(struct inode *inode, bool inc);
@@ -1162,13 +1327,13 @@ static inline void __ceph_update_quota(struct ceph_inode_info *ci,
u64 max_bytes, u64 max_files)
{
bool had_quota, has_quota;
- had_quota = __ceph_has_any_quota(ci);
+ had_quota = __ceph_has_quota(ci, QUOTA_GET_ANY);
ci->i_max_bytes = max_bytes;
ci->i_max_files = max_files;
- has_quota = __ceph_has_any_quota(ci);
+ has_quota = __ceph_has_quota(ci, QUOTA_GET_ANY);
if (had_quota != has_quota)
- ceph_adjust_quota_realms_count(&ci->vfs_inode, has_quota);
+ ceph_adjust_quota_realms_count(&ci->netfs.inode, has_quota);
}
extern void ceph_handle_quota(struct ceph_mds_client *mdsc,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 7b8a070a782d..f31350cda960 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -42,6 +42,7 @@ struct ceph_vxattr {
#define VXATTR_FLAG_READONLY (1<<0)
#define VXATTR_FLAG_HIDDEN (1<<1)
#define VXATTR_FLAG_RSTAT (1<<2)
+#define VXATTR_FLAG_DIRSTAT (1<<3)
/* layouts */
@@ -56,7 +57,7 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
size_t size)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_string *pool_ns;
s64 pool = ci->i_layout.pool_id;
@@ -68,7 +69,7 @@ static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
- dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
+ dout("ceph_vxattrcb_layout %p\n", &ci->netfs.inode);
down_read(&osdc->lock);
pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
if (pool_name) {
@@ -116,7 +117,8 @@ static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
* NULL terminates however, so call it on a temporary buffer and then memcpy
* the result into place.
*/
-static int ceph_fmt_xattr(char *val, size_t size, const char *fmt, ...)
+static __printf(3, 4)
+int ceph_fmt_xattr(char *val, size_t size, const char *fmt, ...)
{
int ret;
va_list args;
@@ -159,7 +161,7 @@ static ssize_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
char *val, size_t size)
{
ssize_t ret;
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
s64 pool = ci->i_layout.pool_id;
const char *pool_name;
@@ -231,6 +233,12 @@ static ssize_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val,
return ceph_fmt_xattr(val, size, "%lld", ci->i_rsubdirs);
}
+static ssize_t ceph_vxattrcb_dir_rsnaps(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return ceph_fmt_xattr(val, size, "%lld", ci->i_rsnaps);
+}
+
static ssize_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val,
size_t size)
{
@@ -302,6 +310,48 @@ static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val,
ci->i_snap_btime.tv_nsec);
}
+static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
+
+ return ceph_fmt_xattr(val, size, "%pU", &fsc->client->fsid);
+}
+
+static ssize_t ceph_vxattrcb_client_id(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
+
+ return ceph_fmt_xattr(val, size, "client%lld",
+ ceph_client_gid(fsc->client));
+}
+
+static ssize_t ceph_vxattrcb_caps(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ int issued;
+
+ spin_lock(&ci->i_ceph_lock);
+ issued = __ceph_caps_issued(ci, NULL);
+ spin_unlock(&ci->i_ceph_lock);
+
+ return ceph_fmt_xattr(val, size, "%s/0x%x",
+ ceph_cap_string(issued), issued);
+}
+
+static ssize_t ceph_vxattrcb_auth_mds(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ int ret;
+
+ spin_lock(&ci->i_ceph_lock);
+ ret = ceph_fmt_xattr(val, size, "%d",
+ ci->i_auth_cap ? ci->i_auth_cap->session->s_mds : -1);
+ spin_unlock(&ci->i_ceph_lock);
+ return ret;
+}
+
#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) \
XATTR_CEPH_PREFIX #_type "." #_name "." #_name2
@@ -316,6 +366,14 @@ static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val,
}
#define XATTR_RSTAT_FIELD(_type, _name) \
XATTR_NAME_CEPH(_type, _name, VXATTR_FLAG_RSTAT)
+#define XATTR_RSTAT_FIELD_UPDATABLE(_type, _name) \
+ { \
+ .name = CEPH_XATTR_NAME(_type, _name), \
+ .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
+ .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
+ .exists_cb = NULL, \
+ .flags = VXATTR_FLAG_RSTAT, \
+ }
#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
{ \
.name = CEPH_XATTR_NAME2(_type, _name, _field), \
@@ -346,14 +404,15 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
XATTR_LAYOUT_FIELD(dir, layout, object_size),
XATTR_LAYOUT_FIELD(dir, layout, pool),
XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
- XATTR_NAME_CEPH(dir, entries, 0),
- XATTR_NAME_CEPH(dir, files, 0),
- XATTR_NAME_CEPH(dir, subdirs, 0),
+ XATTR_NAME_CEPH(dir, entries, VXATTR_FLAG_DIRSTAT),
+ XATTR_NAME_CEPH(dir, files, VXATTR_FLAG_DIRSTAT),
+ XATTR_NAME_CEPH(dir, subdirs, VXATTR_FLAG_DIRSTAT),
XATTR_RSTAT_FIELD(dir, rentries),
XATTR_RSTAT_FIELD(dir, rfiles),
XATTR_RSTAT_FIELD(dir, rsubdirs),
+ XATTR_RSTAT_FIELD(dir, rsnaps),
XATTR_RSTAT_FIELD(dir, rbytes),
- XATTR_RSTAT_FIELD(dir, rctime),
+ XATTR_RSTAT_FIELD_UPDATABLE(dir, rctime),
{
.name = "ceph.dir.pin",
.name_size = sizeof("ceph.dir.pin"),
@@ -377,6 +436,13 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
.exists_cb = ceph_vxattrcb_snap_btime_exists,
.flags = VXATTR_FLAG_READONLY,
},
+ {
+ .name = "ceph.caps",
+ .name_size = sizeof("ceph.caps"),
+ .getxattr_cb = ceph_vxattrcb_caps,
+ .exists_cb = NULL,
+ .flags = VXATTR_FLAG_HIDDEN,
+ },
{ .name = NULL, 0 } /* Required table terminator */
};
@@ -402,6 +468,38 @@ static struct ceph_vxattr ceph_file_vxattrs[] = {
.exists_cb = ceph_vxattrcb_snap_btime_exists,
.flags = VXATTR_FLAG_READONLY,
},
+ {
+ .name = "ceph.caps",
+ .name_size = sizeof("ceph.caps"),
+ .getxattr_cb = ceph_vxattrcb_caps,
+ .exists_cb = NULL,
+ .flags = VXATTR_FLAG_HIDDEN,
+ },
+ { .name = NULL, 0 } /* Required table terminator */
+};
+
+static struct ceph_vxattr ceph_common_vxattrs[] = {
+ {
+ .name = "ceph.cluster_fsid",
+ .name_size = sizeof("ceph.cluster_fsid"),
+ .getxattr_cb = ceph_vxattrcb_cluster_fsid,
+ .exists_cb = NULL,
+ .flags = VXATTR_FLAG_READONLY,
+ },
+ {
+ .name = "ceph.client_id",
+ .name_size = sizeof("ceph.client_id"),
+ .getxattr_cb = ceph_vxattrcb_client_id,
+ .exists_cb = NULL,
+ .flags = VXATTR_FLAG_READONLY,
+ },
+ {
+ .name = "ceph.auth_mds",
+ .name_size = sizeof("ceph.auth_mds"),
+ .getxattr_cb = ceph_vxattrcb_auth_mds,
+ .exists_cb = NULL,
+ .flags = VXATTR_FLAG_READONLY,
+ },
{ .name = NULL, 0 } /* Required table terminator */
};
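
The new ceph_common_vxattrs entries apply to every inode and are matched after the per-dir/per-file tables in ceph_match_vxattr() below. From userspace they read like ordinary xattrs; an illustrative consumer (hypothetical mountpoint path):

#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
	char buf[64];
	ssize_t n = getxattr("/mnt/cephfs", "ceph.cluster_fsid",
			     buf, sizeof(buf) - 1);

	if (n < 0)
		return 1;
	buf[n] = '\0';
	printf("cluster fsid: %s\n", buf);
	return 0;
}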
@@ -427,6 +525,13 @@ static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode,
}
}
+ vxattr = ceph_common_vxattrs;
+ while (vxattr->name) {
+ if (!strcmp(vxattr->name, name))
+ return vxattr;
+ vxattr++;
+ }
+
return NULL;
}
@@ -497,10 +602,10 @@ static int __set_xattr(struct ceph_inode_info *ci,
kfree(*newxattr);
*newxattr = NULL;
if (xattr->should_free_val)
- kfree((void *)xattr->val);
+ kfree(xattr->val);
if (update_xattr) {
- kfree((void *)name);
+ kfree(name);
name = xattr->name;
}
ci->i_xattrs.names_size -= xattr->name_len;
@@ -524,7 +629,7 @@ static int __set_xattr(struct ceph_inode_info *ci,
}
dout("__set_xattr_val added %llx.%llx xattr %p %.*s=%.*s\n",
- ceph_vinop(&ci->vfs_inode), xattr, name_len, name, val_len, val);
+ ceph_vinop(&ci->netfs.inode), xattr, name_len, name, val_len, val);
return 0;
}
@@ -566,9 +671,9 @@ static void __free_xattr(struct ceph_inode_xattr *xattr)
BUG_ON(!xattr);
if (xattr->should_free_name)
- kfree((void *)xattr->name);
+ kfree(xattr->name);
if (xattr->should_free_val)
- kfree((void *)xattr->val);
+ kfree(xattr->val);
kfree(xattr);
}
@@ -582,9 +687,9 @@ static int __remove_xattr(struct ceph_inode_info *ci,
rb_erase(&xattr->node, &ci->i_xattrs.index);
if (xattr->should_free_name)
- kfree((void *)xattr->name);
+ kfree(xattr->name);
if (xattr->should_free_val)
- kfree((void *)xattr->val);
+ kfree(xattr->val);
ci->i_xattrs.names_size -= xattr->name_len;
ci->i_xattrs.vals_size -= xattr->val_len;
@@ -766,7 +871,7 @@ struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci)
struct ceph_buffer *old_blob = NULL;
void *dest;
- dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
+ dout("__build_xattrs_blob %p\n", &ci->netfs.inode);
if (ci->i_xattrs.dirty) {
int need = __get_required_blob_size(ci, 0, 0);
@@ -826,16 +931,21 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_inode_xattr *xattr;
- struct ceph_vxattr *vxattr = NULL;
+ struct ceph_vxattr *vxattr;
int req_mask;
ssize_t err;
+ if (strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
+ goto handle_non_vxattrs;
+
/* let's see if a virtual xattr was requested */
vxattr = ceph_match_vxattr(inode, name);
if (vxattr) {
int mask = 0;
if (vxattr->flags & VXATTR_FLAG_RSTAT)
mask |= CEPH_STAT_RSTAT;
+ if (vxattr->flags & VXATTR_FLAG_DIRSTAT)
+ mask |= CEPH_CAP_FILE_SHARED;
err = ceph_do_getattr(inode, mask, true);
if (err)
return err;
@@ -846,8 +956,14 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
err = -ERANGE;
}
return err;
+ } else {
+ err = ceph_do_getvxattr(inode, name, value, size);
+ /* this would happen with a new client and old server combo */
+ if (err == -EOPNOTSUPP)
+ err = -ENODATA;
+ return err;
}
-
+handle_non_vxattrs:
req_mask = __get_request_mask(inode);
spin_lock(&ci->i_ceph_lock);
@@ -856,7 +972,7 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
if (ci->i_xattrs.version == 0 ||
!((req_mask & CEPH_CAP_XATTR_SHARED) ||
- __ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1))) {
+ __ceph_caps_issued_mask_metric(ci, CEPH_CAP_XATTR_SHARED, 1))) {
spin_unlock(&ci->i_ceph_lock);
/* security module gets xattr while filling trace */
@@ -914,7 +1030,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
ci->i_xattrs.version, ci->i_xattrs.index_version);
if (ci->i_xattrs.version == 0 ||
- !__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1)) {
+ !__ceph_caps_issued_mask_metric(ci, CEPH_CAP_XATTR_SHARED, 1)) {
spin_unlock(&ci->i_ceph_lock);
err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true);
if (err)
@@ -949,6 +1065,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_pagelist *pagelist = NULL;
int op = CEPH_MDS_OP_SETXATTR;
int err;
@@ -969,7 +1086,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
flags |= CEPH_XATTR_REMOVE;
}
- dout("setxattr value=%.*s\n", (int)size, value);
+ dout("setxattr value size: %zu\n", size);
/* do request */
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
@@ -987,6 +1104,8 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
if (op == CEPH_MDS_OP_SETXATTR) {
req->r_args.setxattr.flags = cpu_to_le32(flags);
+ req->r_args.setxattr.osdmap_epoch =
+ cpu_to_le32(osdc->osdmap->epoch);
req->r_pagelist = pagelist;
pagelist = NULL;
}
@@ -1065,8 +1184,14 @@ int __ceph_setxattr(struct inode *inode, const char *name,
spin_lock(&ci->i_ceph_lock);
retry:
issued = __ceph_caps_issued(ci, NULL);
- if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
+ required_blob_size = __get_required_blob_size(ci, name_len, val_len);
+ if ((ci->i_xattrs.version == 0) || !(issued & CEPH_CAP_XATTR_EXCL) ||
+ (required_blob_size > mdsc->mdsmap->m_max_xattr_size)) {
+ dout("%s do sync setxattr: version: %llu size: %d max: %llu\n",
+ __func__, ci->i_xattrs.version, required_blob_size,
+ mdsc->mdsmap->m_max_xattr_size);
goto do_sync;
+ }
if (!lock_snap_rwsem && !ci->i_head_snapc) {
lock_snap_rwsem = true;
@@ -1082,8 +1207,6 @@ retry:
ceph_cap_string(issued));
__build_xattrs(inode);
- required_blob_size = __get_required_blob_size(ci, name_len, val_len);
-
if (!ci->i_xattrs.prealloc_blob ||
required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
struct ceph_buffer *blob;
@@ -1162,6 +1285,7 @@ static int ceph_get_xattr_handler(const struct xattr_handler *handler,
}
static int ceph_set_xattr_handler(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
struct dentry *unused, struct inode *inode,
const char *name, const void *value,
size_t size, int flags)
@@ -1208,7 +1332,7 @@ int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
int err;
err = security_dentry_init_security(dentry, mode, &dentry->d_name,
- &as_ctx->sec_ctx,
+ &name, &as_ctx->sec_ctx,
&as_ctx->sec_ctxlen);
if (err < 0) {
WARN_ON_ONCE(err != -EOPNOTSUPP);
@@ -1232,7 +1356,6 @@ int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
* It only supports single security module and only selinux has
* dentry_init_security hook.
*/
- name = XATTR_NAME_SELINUX;
name_len = strlen(name);
err = ceph_pagelist_reserve(pagelist,
4 * 2 + name_len + as_ctx->sec_ctxlen);