/* * Copyright (c) 2014 Christoph Hellwig. */ #include #include #include #include #include #include "pnfs.h" #include "netns.h" #include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_PNFS struct nfs4_layout { struct list_head lo_perstate; struct nfs4_layout_stateid *lo_state; struct nfsd4_layout_seg lo_seg; }; static struct kmem_cache *nfs4_layout_cache; static struct kmem_cache *nfs4_layout_stateid_cache; static struct nfsd4_callback_ops nfsd4_cb_layout_ops; static const struct lock_manager_operations nfsd4_layouts_lm_ops; const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = { [LAYOUT_BLOCK_VOLUME] = &bl_layout_ops, }; /* pNFS device ID to export fsid mapping */ #define DEVID_HASH_BITS 8 #define DEVID_HASH_SIZE (1 << DEVID_HASH_BITS) #define DEVID_HASH_MASK (DEVID_HASH_SIZE - 1) static u64 nfsd_devid_seq = 1; static struct list_head nfsd_devid_hash[DEVID_HASH_SIZE]; static DEFINE_SPINLOCK(nfsd_devid_lock); static inline u32 devid_hashfn(u64 idx) { return jhash_2words(idx, idx >> 32, 0) & DEVID_HASH_MASK; } static void nfsd4_alloc_devid_map(const struct svc_fh *fhp) { const struct knfsd_fh *fh = &fhp->fh_handle; size_t fsid_len = key_len(fh->fh_fsid_type); struct nfsd4_deviceid_map *map, *old; int i; map = kzalloc(sizeof(*map) + fsid_len, GFP_KERNEL); if (!map) return; map->fsid_type = fh->fh_fsid_type; memcpy(&map->fsid, fh->fh_fsid, fsid_len); spin_lock(&nfsd_devid_lock); if (fhp->fh_export->ex_devid_map) goto out_unlock; for (i = 0; i < DEVID_HASH_SIZE; i++) { list_for_each_entry(old, &nfsd_devid_hash[i], hash) { if (old->fsid_type != fh->fh_fsid_type) continue; if (memcmp(old->fsid, fh->fh_fsid, key_len(old->fsid_type))) continue; fhp->fh_export->ex_devid_map = old; goto out_unlock; } } map->idx = nfsd_devid_seq++; list_add_tail_rcu(&map->hash, &nfsd_devid_hash[devid_hashfn(map->idx)]); fhp->fh_export->ex_devid_map = map; map = NULL; out_unlock: spin_unlock(&nfsd_devid_lock); kfree(map); } struct nfsd4_deviceid_map * nfsd4_find_devid_map(int idx) { struct nfsd4_deviceid_map *map, *ret = NULL; rcu_read_lock(); list_for_each_entry_rcu(map, &nfsd_devid_hash[devid_hashfn(idx)], hash) if (map->idx == idx) ret = map; rcu_read_unlock(); return ret; } int nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp, u32 device_generation) { if (!fhp->fh_export->ex_devid_map) { nfsd4_alloc_devid_map(fhp); if (!fhp->fh_export->ex_devid_map) return -ENOMEM; } id->fsid_idx = fhp->fh_export->ex_devid_map->idx; id->generation = device_generation; id->pad = 0; return 0; } void nfsd4_setup_layout_type(struct svc_export *exp) { struct super_block *sb = exp->ex_path.mnt->mnt_sb; if (!(exp->ex_flags & NFSEXP_PNFS)) return; if (sb->s_export_op->get_uuid && sb->s_export_op->map_blocks && sb->s_export_op->commit_blocks) exp->ex_layout_type = LAYOUT_BLOCK_VOLUME; } static void nfsd4_free_layout_stateid(struct nfs4_stid *stid) { struct nfs4_layout_stateid *ls = layoutstateid(stid); struct nfs4_client *clp = ls->ls_stid.sc_client; struct nfs4_file *fp = ls->ls_stid.sc_file; trace_layoutstate_free(&ls->ls_stid.sc_stateid); spin_lock(&clp->cl_lock); list_del_init(&ls->ls_perclnt); spin_unlock(&clp->cl_lock); spin_lock(&fp->fi_lock); list_del_init(&ls->ls_perfile); spin_unlock(&fp->fi_lock); vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls); fput(ls->ls_file); if (ls->ls_recalled) atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls); kmem_cache_free(nfs4_layout_stateid_cache, ls); } static int nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) { struct file_lock *fl; int status; fl = locks_alloc_lock(); if (!fl) return -ENOMEM; locks_init_lock(fl); fl->fl_lmops = &nfsd4_layouts_lm_ops; fl->fl_flags = FL_LAYOUT; fl->fl_type = F_RDLCK; fl->fl_end = OFFSET_MAX; fl->fl_owner = ls; fl->fl_pid = current->tgid; fl->fl_file = ls->ls_file; status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL); if (status) { locks_free_lock(fl); return status; } BUG_ON(fl != NULL); return 0; } static struct nfs4_layout_stateid * nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, struct nfs4_stid *parent, u32 layout_type) { struct nfs4_client *clp = cstate->clp; struct nfs4_file *fp = parent->sc_file; struct nfs4_layout_stateid *ls; struct nfs4_stid *stp; stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache); if (!stp) return NULL; stp->sc_free = nfsd4_free_layout_stateid; get_nfs4_file(fp); stp->sc_file = fp; ls = layoutstateid(stp); INIT_LIST_HEAD(&ls->ls_perclnt); INIT_LIST_HEAD(&ls->ls_perfile); spin_lock_init(&ls->ls_lock); INIT_LIST_HEAD(&ls->ls_layouts); ls->ls_layout_type = layout_type; nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops, NFSPROC4_CLNT_CB_LAYOUT); if (parent->sc_type == NFS4_DELEG_STID) ls->ls_file = get_file(fp->fi_deleg_file); else ls->ls_file = find_any_file(fp); BUG_ON(!ls->ls_file); if (nfsd4_layout_setlease(ls)) { put_nfs4_file(fp); kmem_cache_free(nfs4_layout_stateid_cache, ls); return NULL; } spin_lock(&clp->cl_lock); stp->sc_type = NFS4_LAYOUT_STID; list_add(&ls->ls_perclnt, &clp->cl_lo_states); spin_unlock(&clp->cl_lock); spin_lock(&fp->fi_lock); list_add(&ls->ls_perfile, &fp->fi_lo_states); spin_unlock(&fp->fi_lock); trace_layoutstate_alloc(&ls->ls_stid.sc_stateid); return ls; } __be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stateid_t *stateid, bool create, u32 layout_type, struct nfs4_layout_stateid **lsp) { struct nfs4_layout_stateid *ls; struct nfs4_stid *stid; unsigned char typemask = NFS4_LAYOUT_STID; __be32 status; if (create) typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID); status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid, net_generic(SVC_NET(rqstp), nfsd_net_id)); if (status) goto out; if (!fh_match(&cstate->current_fh.fh_handle, &stid->sc_file->fi_fhandle)) { status = nfserr_bad_stateid; goto out_put_stid; } if (stid->sc_type != NFS4_LAYOUT_STID) { ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type); nfs4_put_stid(stid); status = nfserr_jukebox; if (!ls) goto out; } else { ls = container_of(stid, struct nfs4_layout_stateid, ls_stid); status = nfserr_bad_stateid; if (stateid->si_generation > stid->sc_stateid.si_generation) goto out_put_stid; if (layout_type != ls->ls_layout_type) goto out_put_stid; } *lsp = ls; return 0; out_put_stid: nfs4_put_stid(stid); out: return status; } static void nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls) { spin_lock(&ls->ls_lock); if (ls->ls_recalled) goto out_unlock; ls->ls_recalled = true; atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls); if (list_empty(&ls->ls_layouts)) goto out_unlock; trace_layout_recall(&ls->ls_stid.sc_stateid); atomic_inc(&ls->ls_stid.sc_count); update_stateid(&ls->ls_stid.sc_stateid); memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t)); nfsd4_run_cb(&ls->ls_recall); out_unlock: spin_unlock(&ls->ls_lock); } static inline u64 layout_end(struct nfsd4_layout_seg *seg) { u64 end = seg->offset + seg->length; return end >= seg->offset ? end : NFS4_MAX_UINT64; } static void layout_update_len(struct nfsd4_layout_seg *lo, u64 end) { if (end == NFS4_MAX_UINT64) lo->length = NFS4_MAX_UINT64; else lo->length = end - lo->offset; } static bool layouts_overlapping(struct nfs4_layout *lo, struct nfsd4_layout_seg *s) { if (s->iomode != IOMODE_ANY && s->iomode != lo->lo_seg.iomode) return false; if (layout_end(&lo->lo_seg) <= s->offset) return false; if (layout_end(s) <= lo->lo_seg.offset) return false; return true; } static bool layouts_try_merge(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *new) { if (lo->iomode != new->iomode) return false; if (layout_end(new) < lo->offset) return false; if (layout_end(lo) < new->offset) return false; lo->offset = min(lo->offset, new->offset); layout_update_len(lo, max(layout_end(lo), layout_end(new))); return true; } static __be32 nfsd4_recall_conflict(struct nfs4_layout_stateid *ls) { struct nfs4_file *fp = ls->ls_stid.sc_file; struct nfs4_layout_stateid *l, *n; __be32 nfserr = nfs_ok; assert_spin_locked(&fp->fi_lock); list_for_each_entry_safe(l, n, &fp->fi_lo_states, ls_perfile) { if (l != ls) { nfsd4_recall_file_layout(l); nfserr = nfserr_recallconflict; } } return nfserr; } __be32 nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls) { struct nfsd4_layout_seg *seg = &lgp->lg_seg; struct nfs4_file *fp = ls->ls_stid.sc_file; struct nfs4_layout *lp, *new = NULL; __be32 nfserr; spin_lock(&fp->fi_lock); nfserr = nfsd4_recall_conflict(ls); if (nfserr) goto out; spin_lock(&ls->ls_lock); list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) { if (layouts_try_merge(&lp->lo_seg, seg)) goto done; } spin_unlock(&ls->ls_lock); spin_unlock(&fp->fi_lock); new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL); if (!new) return nfserr_jukebox; memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg)); new->lo_state = ls; spin_lock(&fp->fi_lock); nfserr = nfsd4_recall_conflict(ls); if (nfserr) goto out; spin_lock(&ls->ls_lock); list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) { if (layouts_try_merge(&lp->lo_seg, seg)) goto done; } atomic_inc(&ls->ls_stid.sc_count); list_add_tail(&new->lo_perstate, &ls->ls_layouts); new = NULL; done: update_stateid(&ls->ls_stid.sc_stateid); memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t)); spin_unlock(&ls->ls_lock); out: spin_unlock(&fp->fi_lock); if (new) kmem_cache_free(nfs4_layout_cache, new); return nfserr; } static void nfsd4_free_layouts(struct list_head *reaplist) { while (!list_empty(reaplist)) { struct nfs4_layout *lp = list_first_entry(reaplist, struct nfs4_layout, lo_perstate); list_del(&lp->lo_perstate); nfs4_put_stid(&lp->lo_state->ls_stid); kmem_cache_free(nfs4_layout_cache, lp); } } static void nfsd4_return_file_layout(struct nfs4_layout *lp, struct nfsd4_layout_seg *seg, struct list_head *reaplist) { struct nfsd4_layout_seg *lo = &lp->lo_seg; u64 end = layout_end(lo); if (seg->offset <= lo->offset) { if (layout_end(seg) >= end) { list_move_tail(&lp->lo_perstate, reaplist); return; } lo->offset = layout_end(seg); } else { /* retain the whole layout segment on a split. */ if (layout_end(seg) < end) { dprintk("%s: split not supported\n", __func__); return; } end = seg->offset; } layout_update_len(lo, end); } __be32 nfsd4_return_file_layouts(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_layoutreturn *lrp) { struct nfs4_layout_stateid *ls; struct nfs4_layout *lp, *n; LIST_HEAD(reaplist); __be32 nfserr; int found = 0; nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lrp->lr_sid, false, lrp->lr_layout_type, &ls); if (nfserr) { trace_layout_return_lookup_fail(&lrp->lr_sid); return nfserr; } spin_lock(&ls->ls_lock); list_for_each_entry_safe(lp, n, &ls->ls_layouts, lo_perstate) { if (layouts_overlapping(lp, &lrp->lr_seg)) { nfsd4_return_file_layout(lp, &lrp->lr_seg, &reaplist); found++; } } if (!list_empty(&ls->ls_layouts)) { if (found) { update_stateid(&ls->ls_stid.sc_stateid); memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t)); } lrp->lrs_present = 1; } else { trace_layoutstate_unhash(&ls->ls_stid.sc_stateid); nfs4_unhash_stid(&ls->ls_stid); lrp->lrs_present = 0; } spin_unlock(&ls->ls_lock); nfs4_put_stid(&ls->ls_stid); nfsd4_free_layouts(&reaplist); return nfs_ok; } __be32 nfsd4_return_client_layouts(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_layoutreturn *lrp) { struct nfs4_layout_stateid *ls, *n; struct nfs4_client *clp = cstate->clp; struct nfs4_layout *lp, *t; LIST_HEAD(reaplist); lrp->lrs_present = 0; spin_lock(&clp->cl_lock); list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) { if (ls->ls_layout_type != lrp->lr_layout_type) continue; if (lrp->lr_return_type == RETURN_FSID && !fh_fsid_match(&ls->ls_stid.sc_file->fi_fhandle, &cstate->current_fh.fh_handle)) continue; spin_lock(&ls->ls_lock); list_for_each_entry_safe(lp, t, &ls->ls_layouts, lo_perstate) { if (lrp->lr_seg.iomode == IOMODE_ANY || lrp->lr_seg.iomode == lp->lo_seg.iomode) list_move_tail(&lp->lo_perstate, &reaplist); } spin_unlock(&ls->ls_lock); } spin_unlock(&clp->cl_lock); nfsd4_free_layouts(&reaplist); return 0; } static void nfsd4_return_all_layouts(struct nfs4_layout_stateid *ls, struct list_head *reaplist) { spin_lock(&ls->ls_lock); list_splice_init(&ls->ls_layouts, reaplist); spin_unlock(&ls->ls_lock); } void nfsd4_return_all_client_layouts(struct nfs4_client *clp) { struct nfs4_layout_stateid *ls, *n; LIST_HEAD(reaplist); spin_lock(&clp->cl_lock); list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) nfsd4_return_all_layouts(ls, &reaplist); spin_unlock(&clp->cl_lock); nfsd4_free_layouts(&reaplist); } void nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp) { struct nfs4_layout_stateid *ls, *n; LIST_HEAD(reaplist); spin_lock(&fp->fi_lock); list_for_each_entry_safe(ls, n, &fp->fi_lo_states, ls_perfile) { if (ls->ls_stid.sc_client == clp) nfsd4_return_all_layouts(ls, &reaplist); } spin_unlock(&fp->fi_lock); nfsd4_free_layouts(&reaplist); } static void nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) { struct nfs4_client *clp = ls->ls_stid.sc_client; char addr_str[INET6_ADDRSTRLEN]; static char *envp[] = { "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL }; char *argv[8]; int error; rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str)); trace_layout_recall_fail(&ls->ls_stid.sc_stateid); printk(KERN_WARNING "nfsd: client %s failed to respond to layout recall. " " Fencing..\n", addr_str); argv[0] = "/sbin/nfsd-recall-failed"; argv[1] = addr_str; argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id; argv[3] = NULL; error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); if (error) { printk(KERN_ERR "nfsd: fence failed for client %s: %d!\n", addr_str, error); } } static int nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) { struct nfs4_layout_stateid *ls = container_of(cb, struct nfs4_layout_stateid, ls_recall); LIST_HEAD(reaplist); switch (task->tk_status) { case 0: return 1; case -NFS4ERR_NOMATCHING_LAYOUT: trace_layout_recall_done(&ls->ls_stid.sc_stateid); task->tk_status = 0; return 1; case -NFS4ERR_DELAY: /* Poll the client until it's done with the layout */ /* FIXME: cap number of retries. * The pnfs standard states that we need to only expire * the client after at-least "lease time" .eg lease-time * 2 * when failing to communicate a recall */ rpc_delay(task, HZ/100); /* 10 mili-seconds */ return 0; default: /* * Unknown error or non-responding client, we'll need to fence. */ nfsd4_cb_layout_fail(ls); return -1; } } static void nfsd4_cb_layout_release(struct nfsd4_callback *cb) { struct nfs4_layout_stateid *ls = container_of(cb, struct nfs4_layout_stateid, ls_recall); LIST_HEAD(reaplist); trace_layout_recall_release(&ls->ls_stid.sc_stateid); nfsd4_return_all_layouts(ls, &reaplist); nfsd4_free_layouts(&reaplist); nfs4_put_stid(&ls->ls_stid); } static struct nfsd4_callback_ops nfsd4_cb_layout_ops = { .done = nfsd4_cb_layout_done, .release = nfsd4_cb_layout_release, }; static bool nfsd4_layout_lm_break(struct file_lock *fl) { /* * We don't want the locks code to timeout the lease for us; * we'll remove it ourself if a layout isn't returned * in time: */ fl->fl_break_time = 0; nfsd4_recall_file_layout(fl->fl_owner); return false; } static int nfsd4_layout_lm_change(struct file_lock *onlist, int arg, struct list_head *dispose) { BUG_ON(!(arg & F_UNLCK)); return lease_modify(onlist, arg, dispose); } static const struct lock_manager_operations nfsd4_layouts_lm_ops = { .lm_break = nfsd4_layout_lm_break, .lm_change = nfsd4_layout_lm_change, }; int nfsd4_init_pnfs(void) { int i; for (i = 0; i < DEVID_HASH_SIZE; i++) INIT_LIST_HEAD(&nfsd_devid_hash[i]); nfs4_layout_cache = kmem_cache_create("nfs4_layout", sizeof(struct nfs4_layout), 0, 0, NULL); if (!nfs4_layout_cache) return -ENOMEM; nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid", sizeof(struct nfs4_layout_stateid), 0, 0, NULL); if (!nfs4_layout_stateid_cache) { kmem_cache_destroy(nfs4_layout_cache); return -ENOMEM; } return 0; } void nfsd4_exit_pnfs(void) { int i; kmem_cache_destroy(nfs4_layout_cache); kmem_cache_destroy(nfs4_layout_stateid_cache); for (i = 0; i < DEVID_HASH_SIZE; i++) { struct nfsd4_deviceid_map *map, *n; list_for_each_entry_safe(map, n, &nfsd_devid_hash[i], hash) kfree(map); } }