Diffstat (limited to 'fs/ceph/inode.c')
 fs/ceph/inode.c | 589 ++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 368 insertions(+), 221 deletions(-)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index d01710a16a4a..4af5e55abc15 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -41,26 +41,31 @@ static void ceph_inode_work(struct work_struct *work);
*/
static int ceph_set_ino_cb(struct inode *inode, void *data)
{
- ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
- inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+
+ ci->i_vino = *(struct ceph_vino *)data;
+ inode->i_ino = ceph_vino_to_ino_t(ci->i_vino);
inode_set_iversion_raw(inode, 0);
+ percpu_counter_inc(&mdsc->metric.total_inodes);
+
return 0;
}
struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
{
struct inode *inode;
- ino_t t = ceph_vino_to_ino(vino);
- inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
+ if (ceph_vino_is_reserved(vino))
+ return ERR_PTR(-EREMOTEIO);
+
+ inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
+ ceph_set_ino_cb, &vino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (inode->i_state & I_NEW)
- dout("get_inode created new inode %p %llx.%llx ino %llx\n",
- inode, ceph_vinop(inode), (u64)inode->i_ino);
- dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
- vino.snap, inode);
+ dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode),
+ ceph_vinop(inode), inode, !!(inode->i_state & I_NEW));
return inode;
}
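For reference: ceph_set_ino_cb() above is the "set" callback of iget5_locked(), which runs exactly once for a freshly allocated inode, before I_NEW is cleared and the inode is published; that is what makes it a safe place to bump the total_inodes metric. A minimal sketch of the test/set pairing, assuming a hypothetical filesystem keyed on a 64-bit object id (the myfs_* names are illustrative, not from this patch):

    static int myfs_ino_test(struct inode *inode, void *data)
    {
            /* does this hashed inode match the key being looked up? */
            return MYFS_I(inode)->object_id == *(u64 *)data;
    }

    static int myfs_ino_set(struct inode *inode, void *data)
    {
            /* initialize the key; runs before the inode is published */
            MYFS_I(inode)->object_id = *(u64 *)data;
            return 0;
    }

    inode = iget5_locked(sb, hashval, myfs_ino_test, myfs_ino_set, &key);
    if (!inode)
            return ERR_PTR(-ENOMEM);            /* allocation failed */
    if (inode->i_state & I_NEW) {
            /* ... finish initializing ... */
            unlock_new_inode(inode);
    }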
@@ -76,21 +81,44 @@ struct inode *ceph_get_snapdir(struct inode *parent)
struct inode *inode = ceph_get_inode(parent->i_sb, vino);
struct ceph_inode_info *ci = ceph_inode(inode);
- BUG_ON(!S_ISDIR(parent->i_mode));
if (IS_ERR(inode))
return inode;
+
+ if (!S_ISDIR(parent->i_mode)) {
+ pr_warn_once("bad snapdir parent type (mode=0%o)\n",
+ parent->i_mode);
+ goto err;
+ }
+
+ if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) {
+ pr_warn_once("bad snapdir inode type (mode=0%o)\n",
+ inode->i_mode);
+ goto err;
+ }
+
inode->i_mode = parent->i_mode;
inode->i_uid = parent->i_uid;
inode->i_gid = parent->i_gid;
- inode->i_op = &ceph_snapdir_iops;
- inode->i_fop = &ceph_snapdir_fops;
- ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
+ inode->i_mtime = parent->i_mtime;
+ inode->i_ctime = parent->i_ctime;
+ inode->i_atime = parent->i_atime;
ci->i_rbytes = 0;
+ ci->i_btime = ceph_inode(parent)->i_btime;
- if (inode->i_state & I_NEW)
+ if (inode->i_state & I_NEW) {
+ inode->i_op = &ceph_snapdir_iops;
+ inode->i_fop = &ceph_snapdir_fops;
+ ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
unlock_new_inode(inode);
+ }
return inode;
+err:
+ if ((inode->i_state & I_NEW))
+ discard_new_inode(inode);
+ else
+ iput(inode);
+ return ERR_PTR(-ENOTDIR);
}
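The new error path follows the standard VFS rule for partially set up inodes: while I_NEW is still set the inode was never published as live, so it must go through discard_new_inode() (which clears I_NEW, wakes any waiters, and drops the reference); an inode that has already cleared I_NEW just takes a plain iput(). In sketch form:

    if (inode->i_state & I_NEW)
            discard_new_inode(inode);   /* never instantiated: wake waiters, drop ref */
    else
            iput(inode);                /* live inode: normal refcount drop */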
const struct inode_operations ceph_file_iops = {
@@ -148,7 +176,7 @@ static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
rb_insert_color(&frag->node, &ci->i_fragtree);
dout("get_or_create_frag added %llx.%llx frag %x\n",
- ceph_vinop(&ci->vfs_inode), f);
+ ceph_vinop(&ci->netfs.inode), f);
return frag;
}
@@ -334,7 +362,7 @@ static int ceph_fill_fragtree(struct inode *inode,
if (nsplits != ci->i_fragtree_nsplits) {
update = true;
} else if (nsplits) {
- i = prandom_u32() % nsplits;
+ i = prandom_u32_max(nsplits);
id = le32_to_cpu(fragtree->splits[i].frag);
if (!__ceph_find_frag(ci, id))
update = true;
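prandom_u32() % nsplits costs a division; prandom_u32_max() gets the same job done with a multiply and shift, scaling the 32-bit random value into [0, nsplits). The kernel's inline helper at the time was, roughly:

    static inline u32 prandom_u32_max(u32 ep_ro)
    {
            /* map a uniform u32 onto [0, ep_ro) without a division */
            return (u32)(((u64)prandom_u32() * ep_ro) >> 32);
    }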
@@ -425,11 +453,14 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
struct ceph_inode_info *ci;
int i;
- ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
+ ci = alloc_inode_sb(sb, ceph_inode_cachep, GFP_NOFS);
if (!ci)
return NULL;
- dout("alloc_inode %p\n", &ci->vfs_inode);
+ dout("alloc_inode %p\n", &ci->netfs.inode);
+
+ /* Set parameters for the netfs library */
+ netfs_inode_init(&ci->netfs, &ceph_netfs_ops);
spin_lock_init(&ci->i_ceph_lock);
@@ -447,6 +478,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_max_files = 0;
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
+ memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
ci->i_fragtree = RB_ROOT;
@@ -471,13 +503,13 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_prealloc_cap_flush = NULL;
INIT_LIST_HEAD(&ci->i_cap_flush_list);
init_waitqueue_head(&ci->i_cap_wq);
- ci->i_hold_caps_min = 0;
ci->i_hold_caps_max = 0;
INIT_LIST_HEAD(&ci->i_cap_delay_list);
INIT_LIST_HEAD(&ci->i_cap_snaps);
ci->i_head_snapc = NULL;
ci->i_snap_caps = 0;
+ ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ;
for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
ci->i_nr_by_mode[i] = 0;
@@ -496,6 +528,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_rdcache_ref = 0;
ci->i_wr_ref = 0;
ci->i_wb_ref = 0;
+ ci->i_fx_ref = 0;
ci->i_wrbuffer_ref = 0;
ci->i_wrbuffer_ref_head = 0;
atomic_set(&ci->i_filelock_ref, 0);
@@ -514,12 +547,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_WORK(&ci->i_work, ceph_inode_work);
ci->i_work_mask = 0;
memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
-
- ceph_fscache_inode_init(ci);
-
- ci->i_meta_err = 0;
-
- return &ci->vfs_inode;
+ return &ci->netfs.inode;
}
void ceph_free_inode(struct inode *inode)
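The ci->vfs_inode to ci->netfs.inode rename comes from the netfs library conversion: struct ceph_inode_info now embeds a struct netfs_inode whose first member is the VFS struct inode, so container_of() still recovers the ceph inode from a bare inode pointer. alloc_inode_sb() likewise replaces the raw kmem_cache_alloc() so the allocation is charged to the superblock for memcg accounting. A sketch of the layout, matching the upstream super.h helper:

    struct ceph_inode_info {
            struct netfs_inode netfs;       /* netfs_inode.inode must stay first */
            struct ceph_vino i_vino;
            /* ... */
    };

    static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
    {
            return container_of(inode, struct ceph_inode_info, netfs.inode);
    }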
@@ -533,19 +561,24 @@ void ceph_free_inode(struct inode *inode)
void ceph_evict_inode(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_inode_frag *frag;
struct rb_node *n;
dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
+ percpu_counter_dec(&mdsc->metric.total_inodes);
+
truncate_inode_pages_final(&inode->i_data);
+ if (inode->i_state & I_PINNING_FSCACHE_WB)
+ ceph_fscache_unuse_cookie(inode, true);
clear_inode(inode);
ceph_fscache_unregister_inode_cookie(ci);
__ceph_remove_caps(ci);
- if (__ceph_has_any_quota(ci))
+ if (__ceph_has_quota(ci, QUOTA_GET_ANY))
ceph_adjust_quota_realms_count(inode, false);
/*
@@ -553,19 +586,10 @@ void ceph_evict_inode(struct inode *inode)
* caps in i_snap_caps.
*/
if (ci->i_snap_realm) {
- struct ceph_mds_client *mdsc =
- ceph_inode_to_client(inode)->mdsc;
if (ceph_snap(inode) == CEPH_NOSNAP) {
- struct ceph_snap_realm *realm = ci->i_snap_realm;
dout(" dropping residual ref to snap realm %p\n",
- realm);
- spin_lock(&realm->inodes_with_caps_lock);
- list_del_init(&ci->i_snap_realm_item);
- ci->i_snap_realm = NULL;
- if (realm->ino == ci->i_vino.ino)
- realm->inode = NULL;
- spin_unlock(&realm->inodes_with_caps_lock);
- ceph_put_snap_realm(mdsc, realm);
+ ci->i_snap_realm);
+ ceph_change_snap_realm(inode, NULL);
} else {
ceph_put_snapid_map(mdsc, ci->i_snapid_map);
ci->i_snap_realm = NULL;
@@ -586,6 +610,7 @@ void ceph_evict_inode(struct inode *inode)
ceph_buffer_put(ci->i_xattrs.prealloc_blob);
ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
+ ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
}
static inline blkcnt_t calc_inode_blocks(u64 size)
@@ -606,16 +631,23 @@ int ceph_fill_file_size(struct inode *inode, int issued,
{
struct ceph_inode_info *ci = ceph_inode(inode);
int queue_trunc = 0;
+ loff_t isize = i_size_read(inode);
if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
- (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
- dout("size %lld -> %llu\n", inode->i_size, size);
+ (truncate_seq == ci->i_truncate_seq && size > isize)) {
+ dout("size %lld -> %llu\n", isize, size);
if (size > 0 && S_ISDIR(inode->i_mode)) {
pr_err("fill_file_size non-zero size for directory\n");
size = 0;
}
i_size_write(inode, size);
inode->i_blocks = calc_inode_blocks(size);
+ /*
+ * If we're expanding, then we should be able to just update
+ * the existing cookie.
+ */
+ if (size > isize)
+ ceph_fscache_update(inode);
ci->i_reported_size = size;
if (truncate_seq != ci->i_truncate_seq) {
dout("truncate_seq %u -> %u\n",
@@ -636,7 +668,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
if ((issued & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_BUFFER)) ||
mapping_mapped(inode->i_mapping) ||
- __ceph_caps_file_wanted(ci)) {
+ __ceph_is_file_opened(ci)) {
ci->i_truncate_pending++;
queue_trunc = 1;
}
@@ -648,10 +680,6 @@ int ceph_fill_file_size(struct inode *inode, int issued,
truncate_size);
ci->i_truncate_size = truncate_size;
}
-
- if (queue_trunc)
- ceph_fscache_invalidate(inode);
-
return queue_trunc;
}
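ceph_seq_cmp() used above is the usual wrap-safe sequence comparison (the same idiom as time_before()): subtract as signed 32-bit and test the sign, so the ordering stays correct even after the counter wraps past 2^32. Approximately:

    static inline int ceph_seq_cmp(u32 a, u32 b)
    {
            return (__s32)a - (__s32)b;     /* > 0 iff a is newer, wrap-safe */
    }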
@@ -727,13 +755,13 @@ void ceph_fill_file_time(struct inode *inode, int issued,
* Populate an inode based on info from mds. May be called on new or
* existing inodes.
*/
-static int fill_inode(struct inode *inode, struct page *locked_page,
- struct ceph_mds_reply_info_in *iinfo,
- struct ceph_mds_reply_dirfrag *dirinfo,
- struct ceph_mds_session *session, int cap_fmode,
- struct ceph_cap_reservation *caps_reservation)
+int ceph_fill_inode(struct inode *inode, struct page *locked_page,
+ struct ceph_mds_reply_info_in *iinfo,
+ struct ceph_mds_reply_dirfrag *dirinfo,
+ struct ceph_mds_session *session, int cap_fmode,
+ struct ceph_cap_reservation *caps_reservation)
{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_mds_reply_inode *info = iinfo->in;
struct ceph_inode_info *ci = ceph_inode(inode);
int issued, new_issued, info_caps;
@@ -747,11 +775,34 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
bool queue_trunc = false;
bool new_version = false;
bool fill_inline = false;
+ umode_t mode = le32_to_cpu(info->mode);
+ dev_t rdev = le32_to_cpu(info->rdev);
+
+ lockdep_assert_held(&mdsc->snap_rwsem);
- dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
+ dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__,
inode, ceph_vinop(inode), le64_to_cpu(info->version),
ci->i_version);
+ /* Once I_NEW is cleared, we can't change type or dev numbers */
+ if (inode->i_state & I_NEW) {
+ inode->i_mode = mode;
+ } else {
+ if (inode_wrong_type(inode, mode)) {
+ pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n",
+ ceph_vinop(inode), inode->i_mode, mode);
+ return -ESTALE;
+ }
+
+ if ((S_ISCHR(mode) || S_ISBLK(mode)) && inode->i_rdev != rdev) {
+ pr_warn_once("dev inode rdev changed! (ino %llx.%llx is %u:%u, mds says %u:%u)\n",
+ ceph_vinop(inode), MAJOR(inode->i_rdev),
+ MINOR(inode->i_rdev), MAJOR(rdev),
+ MINOR(rdev));
+ return -ESTALE;
+ }
+ }
+
info_caps = le32_to_cpu(info->cap.caps);
/* prealloc new cap struct */
@@ -769,7 +820,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if (iinfo->xattr_len > 4) {
xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
if (!xattr_blob)
- pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
+ pr_err("%s ENOMEM xattr blob %d bytes\n", __func__,
iinfo->xattr_len);
}
@@ -805,8 +856,6 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
issued |= __ceph_caps_dirty(ci);
new_issued = ~issued & info_caps;
- /* update inode */
- inode->i_rdev = le32_to_cpu(info->rdev);
/* directories have fl_stripe_unit set to zero */
if (le32_to_cpu(info->layout.fl_stripe_unit))
inode->i_blkbits =
@@ -818,7 +867,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
(issued & CEPH_CAP_AUTH_EXCL) == 0) {
- inode->i_mode = le32_to_cpu(info->mode);
+ inode->i_mode = mode;
inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid));
inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));
dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
@@ -884,6 +933,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
ci->i_rfiles = le64_to_cpu(info->rfiles);
ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
ci->i_dir_pin = iinfo->dir_pin;
+ ci->i_rsnaps = iinfo->rsnaps;
ceph_decode_timespec64(&ci->i_rctime, &info->rctime);
}
}
@@ -916,7 +966,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
case S_IFCHR:
case S_IFSOCK:
inode->i_blkbits = PAGE_SHIFT;
- init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ init_special_inode(inode, inode->i_mode, rdev);
inode->i_op = &ceph_file_iops;
break;
case S_IFREG:
@@ -932,8 +982,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
spin_unlock(&ci->i_ceph_lock);
if (symlen != i_size_read(inode)) {
- pr_err("fill_inode %llx.%llx BAD symlink "
- "size %lld\n", ceph_vinop(inode),
+ pr_err("%s %llx.%llx BAD symlink "
+ "size %lld\n", __func__,
+ ceph_vinop(inode),
i_size_read(inode));
i_size_write(inode, symlen);
inode->i_blocks = calc_inode_blocks(symlen);
@@ -957,7 +1008,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
inode->i_fop = &ceph_dir_fops;
break;
default:
- pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
+ pr_err("%s %llx.%llx BAD mode 0%o\n", __func__,
ceph_vinop(inode), inode->i_mode);
}
@@ -966,7 +1017,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if (ceph_snap(inode) == CEPH_NOSNAP) {
ceph_add_cap(inode, session,
le64_to_cpu(info->cap.cap_id),
- cap_fmode, info_caps,
+ info_caps,
le32_to_cpu(info->cap.wanted),
le32_to_cpu(info->cap.seq),
le32_to_cpu(info->cap.mseq),
@@ -991,26 +1042,29 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
dout(" %p got snap_caps %s\n", inode,
ceph_cap_string(info_caps));
ci->i_snap_caps |= info_caps;
- if (cap_fmode >= 0)
- __ceph_get_fmode(ci, cap_fmode);
}
- } else if (cap_fmode >= 0) {
- pr_warn("mds issued no caps on %llx.%llx\n",
- ceph_vinop(inode));
- __ceph_get_fmode(ci, cap_fmode);
}
if (iinfo->inline_version > 0 &&
iinfo->inline_version >= ci->i_inline_version) {
int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
ci->i_inline_version = iinfo->inline_version;
- if (ci->i_inline_version != CEPH_INLINE_NONE &&
+ if (ceph_has_inline_data(ci) &&
(locked_page || (info_caps & cache_caps)))
fill_inline = true;
}
+ if (cap_fmode >= 0) {
+ if (!info_caps)
+ pr_warn("mds issued no caps on %llx.%llx\n",
+ ceph_vinop(inode));
+ __ceph_touch_fmode(ci, mdsc, cap_fmode);
+ }
+
spin_unlock(&ci->i_ceph_lock);
+ ceph_fscache_register_inode_cookie(inode);
+
if (fill_inline)
ceph_fill_inline_data(inode, locked_page,
iinfo->inline_data, iinfo->inline_len);
@@ -1050,6 +1104,7 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
struct ceph_mds_session **old_lease_session)
{
struct ceph_dentry_info *di = ceph_dentry(dentry);
+ unsigned mask = le16_to_cpu(lease->mask);
long unsigned duration = le32_to_cpu(lease->duration_ms);
long unsigned ttl = from_time + (duration * HZ) / 1000;
long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
@@ -1061,13 +1116,18 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
if (ceph_snap(dir) != CEPH_NOSNAP)
return;
+ if (mask & CEPH_LEASE_PRIMARY_LINK)
+ di->flags |= CEPH_DENTRY_PRIMARY_LINK;
+ else
+ di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
+
di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen);
- if (duration == 0) {
+ if (!(mask & CEPH_LEASE_VALID)) {
__ceph_dentry_dir_lease_touch(di);
return;
}
- if (di->lease_gen == session->s_cap_gen &&
+ if (di->lease_gen == atomic_read(&session->s_cap_gen) &&
time_before(ttl, di->time))
return; /* we already have a newer lease. */
@@ -1078,7 +1138,7 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
if (!di->lease_session)
di->lease_session = ceph_get_mds_session(session);
- di->lease_gen = session->s_cap_gen;
+ di->lease_gen = atomic_read(&session->s_cap_gen);
di->lease_seq = le32_to_cpu(lease->seq);
di->lease_renew_after = half_ttl;
di->lease_renew_from = 0;
@@ -1097,8 +1157,7 @@ static inline void update_dentry_lease(struct inode *dir, struct dentry *dentry,
__update_dentry_lease(dir, dentry, lease, session, from_time,
&old_lease_session);
spin_unlock(&dentry->d_lock);
- if (old_lease_session)
- ceph_put_mds_session(old_lease_session);
+ ceph_put_mds_session(old_lease_session);
}
/*
@@ -1143,13 +1202,12 @@ static void update_dentry_lease_careful(struct dentry *dentry,
from_time, &old_lease_session);
out_unlock:
spin_unlock(&dentry->d_lock);
- if (old_lease_session)
- ceph_put_mds_session(old_lease_session);
+ ceph_put_mds_session(old_lease_session);
}
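Both call sites can drop their NULL checks because ceph_put_mds_session() was made NULL-tolerant (the kfree() convention). A sketch of the guard, assuming the upstream helper:

    void ceph_put_mds_session(struct ceph_mds_session *s)
    {
            if (IS_ERR_OR_NULL(s))
                    return;                 /* callers no longer need "if (s)" */
            /* ... drop the session reference ... */
    }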
/*
* splice a dentry to an inode.
- * caller must hold directory i_mutex for this to be safe.
+ * caller must hold directory i_rwsem for this to be safe.
*/
static int splice_dentry(struct dentry **pdn, struct inode *in)
{
@@ -1239,10 +1297,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
struct inode *dir = req->r_parent;
if (dir) {
- err = fill_inode(dir, NULL,
- &rinfo->diri, rinfo->dirfrag,
- session, -1,
- &req->r_caps_reservation);
+ err = ceph_fill_inode(dir, NULL, &rinfo->diri,
+ rinfo->dirfrag, session, -1,
+ &req->r_caps_reservation);
if (err < 0)
goto done;
} else {
@@ -1298,28 +1355,26 @@ retry_lookup:
}
if (rinfo->head->is_target) {
- tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
- tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+ /* Should be filled in by handle_reply */
+ BUG_ON(!req->r_target_inode);
- in = ceph_get_inode(sb, tvino);
- if (IS_ERR(in)) {
- err = PTR_ERR(in);
- goto done;
- }
-
- err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL,
- session,
+ in = req->r_target_inode;
+ err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti,
+ NULL, session,
(!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
+ !test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) &&
rinfo->head->result == 0) ? req->r_fmode : -1,
&req->r_caps_reservation);
if (err < 0) {
- pr_err("fill_inode badness %p %llx.%llx\n",
+ pr_err("ceph_fill_inode badness %p %llx.%llx\n",
in, ceph_vinop(in));
+ req->r_target_inode = NULL;
if (in->i_state & I_NEW)
discard_new_inode(in);
+ else
+ iput(in);
goto done;
}
- req->r_target_inode = in;
if (in->i_state & I_NEW)
unlock_new_inode(in);
}
@@ -1411,10 +1466,12 @@ retry_lookup:
} else if (have_lease) {
if (d_unhashed(dn))
d_add(dn, NULL);
+ }
+
+ if (!d_unhashed(dn) && have_lease)
update_dentry_lease(dir, dn,
rinfo->dlease, session,
req->r_request_started);
- }
goto done;
}
@@ -1500,10 +1557,11 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
dout("new_inode badness got %d\n", err);
continue;
}
- rc = fill_inode(in, NULL, &rde->inode, NULL, session,
- -1, &req->r_caps_reservation);
+ rc = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
+ -1, &req->r_caps_reservation);
if (rc < 0) {
- pr_err("fill_inode badness on %p got %d\n", in, rc);
+ pr_err("ceph_fill_inode badness on %p got %d\n",
+ in, rc);
err = rc;
if (in->i_state & I_NEW) {
ihold(in);
@@ -1513,8 +1571,7 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
unlock_new_inode(in);
}
- /* avoid calling iput_final() in mds dispatch threads */
- ceph_async_iput(in);
+ iput(in);
}
return err;
@@ -1549,7 +1606,7 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
return idx == 0 ? -ENOMEM : 0;
}
/* reading/filling the cache are serialized by
- * i_mutex, no need to use page lock */
+ * i_rwsem, no need to use page lock */
unlock_page(ctl->page);
ctl->dentries = kmap(ctl->page);
if (idx == 0)
@@ -1578,8 +1635,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct dentry *dn;
struct inode *in;
int err = 0, skipped = 0, ret, i;
- struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
- u32 frag = le32_to_cpu(rhead->args.readdir.frag);
+ u32 frag = le32_to_cpu(req->r_args.readdir.frag);
u32 last_hash = 0;
u32 fpos_offset;
struct ceph_readdir_cache_control cache_ctl = {};
@@ -1596,7 +1652,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
} else if (rinfo->offset_hash) {
/* mds understands offset_hash */
WARN_ON_ONCE(req->r_readdir_offset != 2);
- last_hash = le32_to_cpu(rhead->args.readdir.offset_hash);
+ last_hash = le32_to_cpu(req->r_args.readdir.offset_hash);
}
}
@@ -1707,18 +1763,16 @@ retry_lookup:
}
}
- ret = fill_inode(in, NULL, &rde->inode, NULL, session,
- -1, &req->r_caps_reservation);
+ ret = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
+ -1, &req->r_caps_reservation);
if (ret < 0) {
- pr_err("fill_inode badness on %p\n", in);
+ pr_err("ceph_fill_inode badness on %p\n", in);
if (d_really_is_negative(dn)) {
- /* avoid calling iput_final() in mds
- * dispatch threads */
if (in->i_state & I_NEW) {
ihold(in);
discard_new_inode(in);
}
- ceph_async_iput(in);
+ iput(in);
}
d_drop(dn);
err = ret;
@@ -1731,7 +1785,7 @@ retry_lookup:
if (ceph_security_xattr_deadlock(in)) {
dout(" skip splicing dn %p to inode %p"
" (security xattr deadlock)\n", dn, in);
- ceph_async_iput(in);
+ iput(in);
skipped++;
goto next_item;
}
@@ -1772,89 +1826,29 @@ bool ceph_inode_set_size(struct inode *inode, loff_t size)
bool ret;
spin_lock(&ci->i_ceph_lock);
- dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
+ dout("set_size %p %llu -> %llu\n", inode, i_size_read(inode), size);
i_size_write(inode, size);
+ ceph_fscache_update(inode);
inode->i_blocks = calc_inode_blocks(size);
ret = __ceph_should_report_size(ci);
spin_unlock(&ci->i_ceph_lock);
- return ret;
-}
-/*
- * Put reference to inode, but avoid calling iput_final() in current thread.
- * iput_final() may wait for readahead pages. The wait can cause deadlock in
- * some contexts.
- */
-void ceph_async_iput(struct inode *inode)
-{
- if (!inode)
- return;
- for (;;) {
- if (atomic_add_unless(&inode->i_count, -1, 1))
- break;
- if (queue_work(ceph_inode_to_client(inode)->inode_wq,
- &ceph_inode(inode)->i_work))
- break;
- /* queue work failed, i_count must be at least 2 */
- }
-}
-
-/*
- * Write back inode data in a worker thread. (This can't be done
- * in the message handler context.)
- */
-void ceph_queue_writeback(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- set_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask);
-
- ihold(inode);
- if (queue_work(ceph_inode_to_client(inode)->inode_wq,
- &ci->i_work)) {
- dout("ceph_queue_writeback %p\n", inode);
- } else {
- dout("ceph_queue_writeback %p already queued, mask=%lx\n",
- inode, ci->i_work_mask);
- iput(inode);
- }
-}
-
-/*
- * queue an async invalidation
- */
-void ceph_queue_invalidate(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- set_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask);
-
- ihold(inode);
- if (queue_work(ceph_inode_to_client(inode)->inode_wq,
- &ceph_inode(inode)->i_work)) {
- dout("ceph_queue_invalidate %p\n", inode);
- } else {
- dout("ceph_queue_invalidate %p already queued, mask=%lx\n",
- inode, ci->i_work_mask);
- iput(inode);
- }
+ return ret;
}
-/*
- * Queue an async vmtruncate. If we fail to queue work, we will handle
- * the truncation the next time we call __ceph_do_pending_vmtruncate.
- */
-void ceph_queue_vmtruncate(struct inode *inode)
+void ceph_queue_inode_work(struct inode *inode, int work_bit)
{
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
- set_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask);
+ set_bit(work_bit, &ci->i_work_mask);
ihold(inode);
- if (queue_work(ceph_inode_to_client(inode)->inode_wq,
- &ci->i_work)) {
- dout("ceph_queue_vmtruncate %p\n", inode);
+ if (queue_work(fsc->inode_wq, &ci->i_work)) {
+ dout("queue_inode_work %p, mask=%lx\n", inode, ci->i_work_mask);
} else {
- dout("ceph_queue_vmtruncate %p already queued, mask=%lx\n",
+ dout("queue_inode_work %p already queued, mask=%lx\n",
inode, ci->i_work_mask);
iput(inode);
}
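The three near-identical queue helpers collapse into ceph_queue_inode_work(); the old entry points survive as thin wrappers that only name the work bit (in the upstream tree these live in super.h):

    static inline void ceph_queue_vmtruncate(struct inode *inode)
    {
            ceph_queue_inode_work(inode, CEPH_I_WORK_VMTRUNCATE);
    }

    static inline void ceph_queue_invalidate(struct inode *inode)
    {
            ceph_queue_inode_work(inode, CEPH_I_WORK_INVALIDATE_PAGES);
    }

    static inline void ceph_queue_writeback(struct inode *inode)
    {
            ceph_queue_inode_work(inode, CEPH_I_WORK_WRITEBACK);
    }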
@@ -1863,15 +1857,16 @@ void ceph_queue_vmtruncate(struct inode *inode)
static void ceph_do_invalidate_pages(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
u32 orig_gen;
int check = 0;
+ ceph_fscache_invalidate(inode, false);
+
mutex_lock(&ci->i_truncate_mutex);
- if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
- pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n",
- inode, ceph_ino(inode));
+ if (ceph_inode_is_shutdown(inode)) {
+ pr_warn_ratelimited("%s: inode %llx.%llx is shut down\n",
+ __func__, ceph_vinop(inode));
mapping_set_error(inode->i_mapping, -EIO);
truncate_pagecache(inode, 0);
mutex_unlock(&ci->i_truncate_mutex);
@@ -1892,7 +1887,8 @@ static void ceph_do_invalidate_pages(struct inode *inode)
spin_unlock(&ci->i_ceph_lock);
if (invalidate_inode_pages2(inode->i_mapping) < 0) {
- pr_err("invalidate_pages %p fails\n", inode);
+ pr_err("invalidate_inode_pages2 %llx.%llx failed\n",
+ ceph_vinop(inode));
}
spin_lock(&ci->i_ceph_lock);
@@ -1958,6 +1954,7 @@ retry:
ci->i_truncate_pending, to);
spin_unlock(&ci->i_ceph_lock);
+ ceph_fscache_resize(inode, to);
truncate_pagecache(inode, to);
spin_lock(&ci->i_ceph_lock);
@@ -1972,7 +1969,7 @@ retry:
mutex_unlock(&ci->i_truncate_mutex);
if (wrbuffer_refs == 0)
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+ ceph_check_caps(ci, 0, NULL);
wake_up_all(&ci->i_cap_wq);
}
@@ -1981,7 +1978,7 @@ static void ceph_inode_work(struct work_struct *work)
{
struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
i_work);
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) {
dout("writeback %p\n", inode);
@@ -1993,6 +1990,12 @@ static void ceph_inode_work(struct work_struct *work)
if (test_and_clear_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask))
__ceph_do_pending_vmtruncate(inode);
+ if (test_and_clear_bit(CEPH_I_WORK_CHECK_CAPS, &ci->i_work_mask))
+ ceph_check_caps(ci, 0, NULL);
+
+ if (test_and_clear_bit(CEPH_I_WORK_FLUSH_SNAPS, &ci->i_work_mask))
+ ceph_flush_snaps(ci, NULL);
+
iput(inode);
}
@@ -2115,20 +2118,21 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
}
}
if (ia_valid & ATTR_SIZE) {
- dout("setattr %p size %lld -> %lld\n", inode,
- inode->i_size, attr->ia_size);
- if ((issued & CEPH_CAP_FILE_EXCL) &&
- attr->ia_size > inode->i_size) {
- i_size_write(inode, attr->ia_size);
- inode->i_blocks = calc_inode_blocks(attr->ia_size);
- ci->i_reported_size = attr->ia_size;
- dirtied |= CEPH_CAP_FILE_EXCL;
- ia_valid |= ATTR_MTIME;
+ loff_t isize = i_size_read(inode);
+
+ dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size);
+ if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) {
+ if (attr->ia_size > isize) {
+ i_size_write(inode, attr->ia_size);
+ inode->i_blocks = calc_inode_blocks(attr->ia_size);
+ ci->i_reported_size = attr->ia_size;
+ dirtied |= CEPH_CAP_FILE_EXCL;
+ ia_valid |= ATTR_MTIME;
+ }
} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
- attr->ia_size != inode->i_size) {
+ attr->ia_size != isize) {
req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
- req->r_args.setattr.old_size =
- cpu_to_le64(inode->i_size);
+ req->r_args.setattr.old_size = cpu_to_le64(isize);
mask |= CEPH_SETATTR_SIZE;
release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
@@ -2188,6 +2192,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
&prealloc_cf);
inode->i_ctime = attr->ia_ctime;
+ inode_inc_iversion_raw(inode);
}
release &= issued;
@@ -2198,7 +2203,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
if (inode_dirty_flags)
__mark_inode_dirty(inode, inode_dirty_flags);
-
if (mask) {
req->r_inode = inode;
ihold(inode);
@@ -2223,7 +2227,8 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
/*
* setattr
*/
-int ceph_setattr(struct dentry *dentry, struct iattr *attr)
+int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
@@ -2232,12 +2237,15 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
- err = setattr_prepare(dentry, attr);
+ if (ceph_inode_is_shutdown(inode))
+ return -ESTALE;
+
+ err = setattr_prepare(&init_user_ns, dentry, attr);
if (err != 0)
return err;
if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size > max(inode->i_size, fsc->max_file_size))
+ attr->ia_size > max(i_size_read(inode), fsc->max_file_size))
return -EFBIG;
if ((attr->ia_valid & ATTR_SIZE) &&
@@ -2247,11 +2255,41 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
err = __ceph_setattr(inode, attr);
if (err >= 0 && (attr->ia_valid & ATTR_MODE))
- err = posix_acl_chmod(inode, attr->ia_mode);
+ err = posix_acl_chmod(&init_user_ns, inode, attr->ia_mode);
return err;
}
+int ceph_try_to_choose_auth_mds(struct inode *inode, int mask)
+{
+ int issued = ceph_caps_issued(ceph_inode(inode));
+
+ /*
+ * If any 'x' caps are issued we can just choose the auth MDS
+ * instead of a random replica MDS: only when the Locker is in
+ * the LOCK_EXEC state can the loner client get the 'x' caps.
+ * If we instead send the getattr request to a replica MDS, it
+ * must auth-pin and try to rdlock from the auth MDS, which then
+ * has to transition the Locker state to LOCK_SYNC, after which
+ * the lock state changes back again.
+ *
+ * These Locker state transitions are expensive and usually
+ * require revoking caps from clients.
+ *
+ * For the 'Xs' caps needed by getxattr we also choose the auth
+ * MDS: the MDS-side code is buggy in that setxattr does not
+ * notify the replica MDSes when values change, so a replica may
+ * return stale values. Though this will be fixed in the MDS
+ * code, preferring the auth MDS still helps with old ceph.
+ */
+ if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL))
+ || (mask & (CEPH_STAT_RSTAT | CEPH_STAT_CAP_XATTR)))
+ return USE_AUTH_MDS;
+ else
+ return USE_ANY_MDS;
+}
+
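In practice: a stat that only needs size-ish attributes may still go to a replica, while anything touching rstats or xattrs is pinned to the auth MDS. Illustrative calls (mask values as used elsewhere in this file):

    mode = ceph_try_to_choose_auth_mds(inode, CEPH_STAT_CAP_SIZE); /* USE_ANY_MDS unless 'x' caps are issued */
    mode = ceph_try_to_choose_auth_mds(inode, CEPH_STAT_RSTAT);    /* always USE_AUTH_MDS */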
/*
* Verify that we have a lease on the given mask. If not,
* do a getattr against an mds.
@@ -2272,10 +2310,10 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
dout("do_getattr inode %p mask %s mode 0%o\n",
inode, ceph_cap_string(mask), inode->i_mode);
- if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
- return 0;
+ if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1))
+ return 0;
- mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
+ mode = ceph_try_to_choose_auth_mds(inode, mask);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -2290,7 +2328,8 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
if (inline_version == 0) {
/* the reply is supposed to contain inline data */
err = -EINVAL;
- } else if (inline_version == CEPH_INLINE_NONE) {
+ } else if (inline_version == CEPH_INLINE_NONE ||
+ inline_version == 1) {
err = -ENODATA;
} else {
err = req->r_reply_info.targeti.inline_len;
@@ -2301,12 +2340,65 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
return err;
}
+int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
+ size_t size)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_request *req;
+ int mode = USE_AUTH_MDS;
+ int err;
+ char *xattr_value;
+ size_t xattr_value_len;
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode);
+ if (IS_ERR(req)) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ req->r_feature_needed = CEPHFS_FEATURE_OP_GETVXATTR;
+ req->r_path2 = kstrdup(name, GFP_NOFS);
+ if (!req->r_path2) {
+ err = -ENOMEM;
+ goto put;
+ }
+
+ ihold(inode);
+ req->r_inode = inode;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err < 0)
+ goto put;
+
+ xattr_value = req->r_reply_info.xattr_info.xattr_value;
+ xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
+
+ dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
+
+ err = (int)xattr_value_len;
+ if (size == 0)
+ goto put;
+
+ if (xattr_value_len > size) {
+ err = -ERANGE;
+ goto put;
+ }
+
+ memcpy(value, xattr_value, xattr_value_len);
+put:
+ ceph_mdsc_put_request(req);
+out:
+ dout("do_getvxattr result=%d\n", err);
+ return err;
+}
+
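A hypothetical caller of the new MDS op, fetching a virtual xattr (the xattr name here is only an example); note that passing size == 0 just probes for the value length, matching getxattr(2) semantics:

    char buf[64];
    int len;

    len = ceph_do_getvxattr(inode, "ceph.dir.pin", buf, sizeof(buf));
    if (len < 0)
            return len;     /* -ERANGE if buf is too small, etc. */
    /* buf now holds len bytes of the value (not NUL-terminated) */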
/*
* Check inode permissions. We verify we have a valid value for
* the AUTH cap, then call the generic handler.
*/
-int ceph_permission(struct inode *inode, int mask)
+int ceph_permission(struct user_namespace *mnt_userns, struct inode *inode,
+ int mask)
{
int err;
@@ -2316,20 +2408,28 @@ int ceph_permission(struct inode *inode, int mask)
err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false);
if (!err)
- err = generic_permission(inode, mask);
+ err = generic_permission(&init_user_ns, inode, mask);
return err;
}
/* Craft a mask of needed caps given a set of requested statx attrs. */
-static int statx_to_caps(u32 want)
+static int statx_to_caps(u32 want, umode_t mode)
{
int mask = 0;
if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME))
mask |= CEPH_CAP_AUTH_SHARED;
- if (want & (STATX_NLINK|STATX_CTIME))
- mask |= CEPH_CAP_LINK_SHARED;
+ if (want & (STATX_NLINK|STATX_CTIME)) {
+ /*
+ * The link count for directories depends on inode->i_subdirs,
+ * and that is only updated when Fs caps are held.
+ */
+ if (S_ISDIR(mode))
+ mask |= CEPH_CAP_FILE_SHARED;
+ else
+ mask |= CEPH_CAP_LINK_SHARED;
+ }
if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE|
STATX_BLOCKS))
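Seen from user space, this mapping means a statx() that only asks for attributes whose caps are already held can be answered without an MDS round trip. A small illustration (user-space code, not part of the patch):

    #include <fcntl.h>
    #include <sys/stat.h>

    struct statx stx;

    /* btime needs CEPH_CAP_AUTH_SHARED; nlink on a directory now
     * needs CEPH_CAP_FILE_SHARED, since i_subdirs feeds st_nlink */
    if (statx(AT_FDCWD, "somefile", AT_STATX_SYNC_AS_STAT,
              STATX_BTIME | STATX_NLINK, &stx) == 0) {
            /* fields are valid per the bits set in stx.stx_mask */
    }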
@@ -2345,24 +2445,29 @@ static int statx_to_caps(u32 want)
* Get all the attributes. If we have sufficient caps for the requested attrs,
* then we can avoid talking to the MDS at all.
*/
-int ceph_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int flags)
+int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path,
+ struct kstat *stat, u32 request_mask, unsigned int flags)
{
struct inode *inode = d_inode(path->dentry);
+ struct super_block *sb = inode->i_sb;
struct ceph_inode_info *ci = ceph_inode(inode);
u32 valid_mask = STATX_BASIC_STATS;
int err = 0;
+ if (ceph_inode_is_shutdown(inode))
+ return -ESTALE;
+
/* Skip the getattr altogether if we're asked not to sync */
- if (!(flags & AT_STATX_DONT_SYNC)) {
- err = ceph_do_getattr(inode, statx_to_caps(request_mask),
- flags & AT_STATX_FORCE_SYNC);
+ if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) {
+ err = ceph_do_getattr(inode,
+ statx_to_caps(request_mask, inode->i_mode),
+ flags & AT_STATX_FORCE_SYNC);
if (err)
return err;
}
- generic_fillattr(inode, stat);
- stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
+ generic_fillattr(&init_user_ns, inode, stat);
+ stat->ino = ceph_present_inode(inode);
/*
* btime on newly-allocated inodes is 0, so if this is still set to
@@ -2374,16 +2479,34 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
}
if (ceph_snap(inode) == CEPH_NOSNAP)
- stat->dev = inode->i_sb->s_dev;
+ stat->dev = sb->s_dev;
else
stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
if (S_ISDIR(inode->i_mode)) {
- if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
- RBYTES))
+ if (ceph_test_mount_opt(ceph_sb_to_client(sb), RBYTES)) {
stat->size = ci->i_rbytes;
- else
+ } else if (ceph_snap(inode) == CEPH_SNAPDIR) {
+ struct ceph_inode_info *pci;
+ struct ceph_snap_realm *realm;
+ struct inode *parent;
+
+ parent = ceph_lookup_inode(sb, ceph_ino(inode));
+ if (IS_ERR(parent))
+ return PTR_ERR(parent);
+
+ pci = ceph_inode(parent);
+ spin_lock(&pci->i_ceph_lock);
+ realm = pci->i_snap_realm;
+ if (realm)
+ stat->size = realm->num_snaps;
+ else
+ stat->size = 0;
+ spin_unlock(&pci->i_ceph_lock);
+ iput(parent);
+ } else {
stat->size = ci->i_files + ci->i_subdirs;
+ }
stat->blocks = 0;
stat->blksize = 65536;
/*
@@ -2399,3 +2522,27 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
stat->result_mask = request_mask & valid_mask;
return err;
}
+
+void ceph_inode_shutdown(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct rb_node *p;
+ int iputs = 0;
+ bool invalidate = false;
+
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_ceph_flags |= CEPH_I_SHUTDOWN;
+ p = rb_first(&ci->i_caps);
+ while (p) {
+ struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
+
+ p = rb_next(p);
+ iputs += ceph_purge_inode_cap(inode, cap, &invalidate);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (invalidate)
+ ceph_queue_invalidate(inode);
+ while (iputs--)
+ iput(inode);
+}
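The CEPH_I_SHUTDOWN flag set here is what the new ceph_inode_is_shutdown() checks early in the setattr/getattr/invalidate paths above. A sketch of that predicate, assuming the matching super.h helper:

    static inline bool ceph_inode_is_shutdown(struct inode *inode)
    {
            unsigned long flags = READ_ONCE(ceph_inode(inode)->i_ceph_flags);
            struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
            int state = READ_ONCE(fsc->mount_state);

            /* this inode was purged, or the whole mount was forced down */
            return (flags & CEPH_I_SHUTDOWN) || state >= CEPH_MOUNT_SHUTDOWN;
    }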