diff options
Diffstat (limited to 'drivers/staging/lustre/lustre/llite')
29 files changed, 0 insertions, 22669 deletions
diff --git a/drivers/staging/lustre/lustre/llite/Makefile b/drivers/staging/lustre/lustre/llite/Makefile deleted file mode 100644 index 519fd747e3ad..000000000000 --- a/drivers/staging/lustre/lustre/llite/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/include -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/lustre/include - -obj-$(CONFIG_LUSTRE_FS) += lustre.o -lustre-y := dcache.o dir.o file.o llite_lib.o llite_nfs.o \ - rw.o rw26.o namei.o symlink.o llite_mmap.o range_lock.o \ - xattr.o xattr_cache.o xattr_security.o \ - super25.o statahead.o glimpse.o lcommon_cl.o lcommon_misc.o \ - vvp_dev.o vvp_page.o vvp_lock.o vvp_io.o vvp_object.o \ - lproc_llite.o diff --git a/drivers/staging/lustre/lustre/llite/dcache.c b/drivers/staging/lustre/lustre/llite/dcache.c deleted file mode 100644 index 11b82c639bfe..000000000000 --- a/drivers/staging/lustre/lustre/llite/dcache.c +++ /dev/null @@ -1,300 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. 
- * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#include <linux/fs.h> -#include <linux/sched.h> -#include <linux/quotaops.h> - -#define DEBUG_SUBSYSTEM S_LLITE - -#include <obd_support.h> -#include <uapi/linux/lustre/lustre_idl.h> -#include <lustre_dlm.h> - -#include "llite_internal.h" - -static void free_dentry_data(struct rcu_head *head) -{ - struct ll_dentry_data *lld; - - lld = container_of(head, struct ll_dentry_data, lld_rcu_head); - kfree(lld); -} - -/* should NOT be called with the dcache lock, see fs/dcache.c */ -static void ll_release(struct dentry *de) -{ - struct ll_dentry_data *lld; - - LASSERT(de); - lld = ll_d2d(de); - if (lld->lld_it) { - ll_intent_release(lld->lld_it); - kfree(lld->lld_it); - } - - de->d_fsdata = NULL; - call_rcu(&lld->lld_rcu_head, free_dentry_data); -} - -/* Compare if two dentries are the same. Don't match if the existing dentry - * is marked invalid. Returns 1 if different, 0 if the same. - * - * This avoids a race where ll_lookup_it() instantiates a dentry, but we get - * an AST before calling d_revalidate_it(). The dentry still exists (marked - * INVALID) so d_lookup() matches it, but we have no lock on it (so - * lock_match() fails) and we spin around real_lookup(). - * - * This race doesn't apply to lookups in d_alloc_parallel(), and for - * those we want to ensure that only one dentry with a given name is - * in ll_lookup_nd() at a time. So allow invalid dentries to match - * while d_in_lookup(). We will be called again when the lookup - * completes, and can give a different answer then. 
- */ -static int ll_dcompare(const struct dentry *dentry, - unsigned int len, const char *str, - const struct qstr *name) -{ - if (len != name->len) - return 1; - - if (memcmp(str, name->name, len)) - return 1; - - CDEBUG(D_DENTRY, "found name %.*s(%p) flags %#x refc %d\n", - name->len, name->name, dentry, dentry->d_flags, - d_count(dentry)); - - /* mountpoint is always valid */ - if (d_mountpoint(dentry)) - return 0; - - /* ensure exclusion against parallel lookup of the same name */ - if (d_in_lookup((struct dentry *)dentry)) - return 0; - - if (d_lustre_invalid(dentry)) - return 1; - - return 0; -} - -/** - * Called when last reference to a dentry is dropped and dcache wants to know - * whether or not it should cache it: - * - return 1 to delete the dentry immediately - * - return 0 to cache the dentry - * Should NOT be called with the dcache lock, see fs/dcache.c - */ -static int ll_ddelete(const struct dentry *de) -{ - LASSERT(de); - - CDEBUG(D_DENTRY, "%s dentry %pd (%p, parent %p, inode %p) %s%s\n", - d_lustre_invalid(de) ? "deleting" : "keeping", - de, de, de->d_parent, d_inode(de), - d_unhashed(de) ? "" : "hashed,", - list_empty(&de->d_subdirs) ? "" : "subdirs"); - - /* kernel >= 2.6.38 last refcount is decreased after this function. 
*/ - LASSERT(d_count(de) == 1); - - if (d_lustre_invalid(de)) - return 1; - return 0; -} - -static int ll_d_init(struct dentry *de) -{ - struct ll_dentry_data *lld = kzalloc(sizeof(*lld), GFP_KERNEL); - - if (unlikely(!lld)) - return -ENOMEM; - lld->lld_invalid = 1; - de->d_fsdata = lld; - return 0; -} - -void ll_intent_drop_lock(struct lookup_intent *it) -{ - if (it->it_op && it->it_lock_mode) { - struct lustre_handle handle; - - handle.cookie = it->it_lock_handle; - - CDEBUG(D_DLMTRACE, - "releasing lock with cookie %#llx from it %p\n", - handle.cookie, it); - ldlm_lock_decref(&handle, it->it_lock_mode); - - /* bug 494: intent_release may be called multiple times, from - * this thread and we don't want to double-decref this lock - */ - it->it_lock_mode = 0; - if (it->it_remote_lock_mode != 0) { - handle.cookie = it->it_remote_lock_handle; - - CDEBUG(D_DLMTRACE, - "releasing remote lock with cookie%#llx from it %p\n", - handle.cookie, it); - ldlm_lock_decref(&handle, - it->it_remote_lock_mode); - it->it_remote_lock_mode = 0; - } - } -} - -void ll_intent_release(struct lookup_intent *it) -{ - CDEBUG(D_INFO, "intent %p released\n", it); - ll_intent_drop_lock(it); - /* We are still holding extra reference on a request, need to free it */ - if (it_disposition(it, DISP_ENQ_OPEN_REF)) - ptlrpc_req_finished(it->it_request); /* ll_file_open */ - - if (it_disposition(it, DISP_ENQ_CREATE_REF)) /* create rec */ - ptlrpc_req_finished(it->it_request); - - it->it_disposition = 0; - it->it_request = NULL; -} - -void ll_invalidate_aliases(struct inode *inode) -{ - struct dentry *dentry; - - CDEBUG(D_INODE, "marking dentries for ino " DFID "(%p) invalid\n", - PFID(ll_inode2fid(inode)), inode); - - spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { - CDEBUG(D_DENTRY, - "dentry in drop %pd (%p) parent %p inode %p flags %d\n", - dentry, dentry, dentry->d_parent, - d_inode(dentry), dentry->d_flags); - - d_lustre_invalidate(dentry, 0); - } - 
spin_unlock(&inode->i_lock); -} - -int ll_revalidate_it_finish(struct ptlrpc_request *request, - struct lookup_intent *it, - struct inode *inode) -{ - int rc = 0; - - if (!request) - return 0; - - if (it_disposition(it, DISP_LOOKUP_NEG)) - return -ENOENT; - - rc = ll_prep_inode(&inode, request, NULL, it); - - return rc; -} - -void ll_lookup_finish_locks(struct lookup_intent *it, struct inode *inode) -{ - if (it->it_lock_mode && inode) { - struct ll_sb_info *sbi = ll_i2sbi(inode); - - CDEBUG(D_DLMTRACE, "setting l_data to inode " DFID "(%p)\n", - PFID(ll_inode2fid(inode)), inode); - ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL); - } - - /* drop lookup or getattr locks immediately */ - if (it->it_op == IT_LOOKUP || it->it_op == IT_GETATTR) { - /* on 2.6 there are situation when several lookups and - * revalidations may be requested during single operation. - * therefore, we don't release intent here -bzzz - */ - ll_intent_drop_lock(it); - } -} - -static int ll_revalidate_dentry(struct dentry *dentry, - unsigned int lookup_flags) -{ - struct inode *dir = d_inode(dentry->d_parent); - - /* If this is intermediate component path lookup and we were able to get - * to this dentry, then its lock has not been revoked and the - * path component is valid. - */ - if (lookup_flags & LOOKUP_PARENT) - return 1; - - /* Symlink - always valid as long as the dentry was found */ - if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) - return 1; - - /* - * VFS warns us that this is the second go around and previous - * operation failed (most likely open|creat), so this time - * we better talk to the server via the lookup path by name, - * not by fid. - */ - if (lookup_flags & LOOKUP_REVAL) - return 0; - - if (!dentry_may_statahead(dir, dentry)) - return 1; - - if (lookup_flags & LOOKUP_RCU) - return -ECHILD; - - ll_statahead(dir, &dentry, !d_inode(dentry)); - return 1; -} - -/* - * Always trust cached dentries. Update statahead window if necessary. 
- */ -static int ll_revalidate_nd(struct dentry *dentry, unsigned int flags) -{ - CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, flags=%u\n", - dentry, flags); - - return ll_revalidate_dentry(dentry, flags); -} - -const struct dentry_operations ll_d_ops = { - .d_init = ll_d_init, - .d_revalidate = ll_revalidate_nd, - .d_release = ll_release, - .d_delete = ll_ddelete, - .d_compare = ll_dcompare, -}; diff --git a/drivers/staging/lustre/lustre/llite/dir.c b/drivers/staging/lustre/lustre/llite/dir.c deleted file mode 100644 index d10d27268323..000000000000 --- a/drivers/staging/lustre/lustre/llite/dir.c +++ /dev/null @@ -1,1706 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/llite/dir.c - * - * Directory code for lustre client. 
- */ - -#include <linux/fs.h> -#include <linux/pagemap.h> -#include <linux/mm.h> -#include <linux/uaccess.h> -#include <linux/buffer_head.h> /* for wait_on_buffer */ -#include <linux/pagevec.h> -#include <linux/prefetch.h> - -#define DEBUG_SUBSYSTEM S_LLITE - -#include <obd_support.h> -#include <obd_class.h> -#include <uapi/linux/lustre/lustre_ioctl.h> -#include <lustre_lib.h> -#include <lustre_dlm.h> -#include <lustre_fid.h> -#include <lustre_kernelcomm.h> -#include <lustre_swab.h> - -#include "llite_internal.h" - -/* - * (new) readdir implementation overview. - * - * Original lustre readdir implementation cached exact copy of raw directory - * pages on the client. These pages were indexed in client page cache by - * logical offset in the directory file. This design, while very simple and - * intuitive had some inherent problems: - * - * . it implies that byte offset to the directory entry serves as a - * telldir(3)/seekdir(3) cookie, but that offset is not stable: in - * ext3/htree directory entries may move due to splits, and more - * importantly, - * - * . it is incompatible with the design of split directories for cmd3, - * that assumes that names are distributed across nodes based on their - * hash, and so readdir should be done in hash order. - * - * New readdir implementation does readdir in hash order, and uses hash of a - * file name as a telldir/seekdir cookie. This led to number of complications: - * - * . hash is not unique, so it cannot be used to index cached directory - * pages on the client (note, that it requires a whole pageful of hash - * collided entries to cause two pages to have identical hashes); - * - * . hash is not unique, so it cannot, strictly speaking, be used as an - * entry cookie. ext3/htree has the same problem and lustre implementation - * mimics their solution: seekdir(hash) positions directory at the first - * entry with the given hash. - * - * Client side. - * - * 0. 
caching - * - * Client caches directory pages using hash of the first entry as an index. As - * noted above hash is not unique, so this solution doesn't work as is: - * special processing is needed for "page hash chains" (i.e., sequences of - * pages filled with entries all having the same hash value). - * - * First, such chains have to be detected. To this end, server returns to the - * client the hash of the first entry on the page next to one returned. When - * client detects that this hash is the same as hash of the first entry on the - * returned page, page hash collision has to be handled. Pages in the - * hash chain, except first one, are termed "overflow pages". - * - * Solution to index uniqueness problem is to not cache overflow - * pages. Instead, when page hash collision is detected, all overflow pages - * from emerging chain are immediately requested from the server and placed in - * a special data structure (struct ll_dir_chain). This data structure is used - * by ll_readdir() to process entries from overflow pages. When readdir - * invocation finishes, overflow pages are discarded. If page hash collision - * chain weren't completely processed, next call to readdir will again detect - * page hash collision, again read overflow pages in, process next portion of - * entries and again discard the pages. This is not as wasteful as it looks, - * because, given reasonable hash, page hash collisions are extremely rare. - * - * 1. directory positioning - * - * When seekdir(hash) is called, original - * - * - * - * - * - * - * - * - * Server. - * - * identification of and access to overflow pages - * - * page format - * - * Page in MDS_READPAGE RPC is packed in LU_PAGE_SIZE, and each page contains - * a header lu_dirpage which describes the start/end hash, and whether this - * page is empty (contains no dir entry) or hash collide with next page. 
- * After client receives reply, several pages will be integrated into dir page - * in PAGE_SIZE (if PAGE_SIZE greater than LU_PAGE_SIZE), and the lu_dirpage - * for this integrated page will be adjusted. See lmv_adjust_dirpages(). - * - */ -struct page *ll_get_dir_page(struct inode *dir, struct md_op_data *op_data, - __u64 offset) -{ - struct md_callback cb_op; - struct page *page; - int rc; - - cb_op.md_blocking_ast = ll_md_blocking_ast; - rc = md_read_page(ll_i2mdexp(dir), op_data, &cb_op, offset, &page); - if (rc) - return ERR_PTR(rc); - - return page; -} - -void ll_release_page(struct inode *inode, struct page *page, bool remove) -{ - kunmap(page); - - /* - * Always remove the page for striped dir, because the page is - * built from temporarily in LMV layer - */ - if (inode && S_ISDIR(inode->i_mode) && - ll_i2info(inode)->lli_lsm_md) { - __free_page(page); - return; - } - - if (remove) { - lock_page(page); - if (likely(page->mapping)) - truncate_complete_page(page->mapping, page); - unlock_page(page); - } - put_page(page); -} - -/** - * return IF_* type for given lu_dirent entry. - * IF_* flag shld be converted to particular OS file type in - * platform llite module. 
- */ -static __u16 ll_dirent_type_get(struct lu_dirent *ent) -{ - __u16 type = 0; - struct luda_type *lt; - int len = 0; - - if (le32_to_cpu(ent->lde_attrs) & LUDA_TYPE) { - const unsigned int align = sizeof(struct luda_type) - 1; - - len = le16_to_cpu(ent->lde_namelen); - len = (len + align) & ~align; - lt = (void *)ent->lde_name + len; - type = IFTODT(le16_to_cpu(lt->lt_type)); - } - return type; -} - -int ll_dir_read(struct inode *inode, __u64 *ppos, struct md_op_data *op_data, - struct dir_context *ctx) -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); - __u64 pos = *ppos; - int is_api32 = ll_need_32bit_api(sbi); - int is_hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH; - struct page *page; - bool done = false; - int rc = 0; - - page = ll_get_dir_page(inode, op_data, pos); - - while (rc == 0 && !done) { - struct lu_dirpage *dp; - struct lu_dirent *ent; - __u64 hash; - __u64 next; - - if (IS_ERR(page)) { - rc = PTR_ERR(page); - break; - } - - hash = MDS_DIR_END_OFF; - dp = page_address(page); - for (ent = lu_dirent_start(dp); ent && !done; - ent = lu_dirent_next(ent)) { - __u16 type; - int namelen; - struct lu_fid fid; - __u64 lhash; - __u64 ino; - - hash = le64_to_cpu(ent->lde_hash); - if (hash < pos) - /* - * Skip until we find target hash - * value. - */ - continue; - - namelen = le16_to_cpu(ent->lde_namelen); - if (namelen == 0) - /* - * Skip dummy record. - */ - continue; - - if (is_api32 && is_hash64) - lhash = hash >> 32; - else - lhash = hash; - fid_le_to_cpu(&fid, &ent->lde_fid); - ino = cl_fid_build_ino(&fid, is_api32); - type = ll_dirent_type_get(ent); - ctx->pos = lhash; - /* For 'll_nfs_get_name_filldir()', it will try - * to access the 'ent' through its 'lde_name', - * so the parameter 'name' for 'ctx->actor()' - * must be part of the 'ent'. 
- */ - done = !dir_emit(ctx, ent->lde_name, - namelen, ino, type); - } - - if (done) { - pos = hash; - ll_release_page(inode, page, false); - break; - } - - next = le64_to_cpu(dp->ldp_hash_end); - pos = next; - if (pos == MDS_DIR_END_OFF) { - /* - * End of directory reached. - */ - done = 1; - ll_release_page(inode, page, false); - } else { - /* - * Normal case: continue to the next - * page. - */ - ll_release_page(inode, page, - le32_to_cpu(dp->ldp_flags) & - LDF_COLLIDE); - next = pos; - page = ll_get_dir_page(inode, op_data, pos); - } - } - - ctx->pos = pos; - return rc; -} - -static int ll_readdir(struct file *filp, struct dir_context *ctx) -{ - struct inode *inode = file_inode(filp); - struct ll_file_data *lfd = LUSTRE_FPRIVATE(filp); - struct ll_sb_info *sbi = ll_i2sbi(inode); - __u64 pos = lfd ? lfd->lfd_pos : 0; - int hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH; - int api32 = ll_need_32bit_api(sbi); - struct md_op_data *op_data; - int rc; - - CDEBUG(D_VFSTRACE, - "VFS Op:inode=" DFID "(%p) pos/size %lu/%llu 32bit_api %d\n", - PFID(ll_inode2fid(inode)), inode, (unsigned long)pos, - i_size_read(inode), api32); - - if (pos == MDS_DIR_END_OFF) { - /* - * end-of-file. - */ - rc = 0; - goto out; - } - - op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0, - LUSTRE_OPC_ANY, inode); - if (IS_ERR(op_data)) { - rc = PTR_ERR(op_data); - goto out; - } - - if (unlikely(op_data->op_mea1)) { - /* - * This is only needed for striped dir to fill .., - * see lmv_read_page - */ - if (file_dentry(filp)->d_parent && - file_dentry(filp)->d_parent->d_inode) { - __u64 ibits = MDS_INODELOCK_UPDATE; - struct inode *parent; - - parent = file_dentry(filp)->d_parent->d_inode; - if (ll_have_md_lock(parent, &ibits, LCK_MINMODE)) - op_data->op_fid3 = *ll_inode2fid(parent); - } - - /* - * If it can not find in cache, do lookup .. 
on the master - * object - */ - if (fid_is_zero(&op_data->op_fid3)) { - rc = ll_dir_get_parent_fid(inode, &op_data->op_fid3); - if (rc) { - ll_finish_md_op_data(op_data); - return rc; - } - } - } - op_data->op_max_pages = sbi->ll_md_brw_pages; - ctx->pos = pos; - rc = ll_dir_read(inode, &pos, op_data, ctx); - pos = ctx->pos; - if (lfd) - lfd->lfd_pos = pos; - - if (pos == MDS_DIR_END_OFF) { - if (api32) - pos = LL_DIR_END_OFF_32BIT; - else - pos = LL_DIR_END_OFF; - } else { - if (api32 && hash64) - pos >>= 32; - } - ctx->pos = pos; - ll_finish_md_op_data(op_data); -out: - if (!rc) - ll_stats_ops_tally(sbi, LPROC_LL_READDIR, 1); - - return rc; -} - -static int ll_send_mgc_param(struct obd_export *mgc, char *string) -{ - struct mgs_send_param *msp; - int rc = 0; - - msp = kzalloc(sizeof(*msp), GFP_NOFS); - if (!msp) - return -ENOMEM; - - strlcpy(msp->mgs_param, string, sizeof(msp->mgs_param)); - rc = obd_set_info_async(NULL, mgc, sizeof(KEY_SET_INFO), KEY_SET_INFO, - sizeof(struct mgs_send_param), msp, NULL); - if (rc) - CERROR("Failed to set parameter: %d\n", rc); - kfree(msp); - - return rc; -} - -/** - * Create striped directory with specified stripe(@lump) - * - * param[in] parent the parent of the directory. - * param[in] lump the specified stripes. - * param[in] dirname the name of the directory. - * param[in] mode the specified mode of the directory. - * - * retval =0 if striped directory is being created successfully. - * <0 if the creation is failed. 
- */ -static int ll_dir_setdirstripe(struct inode *parent, struct lmv_user_md *lump, - const char *dirname, umode_t mode) -{ - struct ptlrpc_request *request = NULL; - struct md_op_data *op_data; - struct ll_sb_info *sbi = ll_i2sbi(parent); - struct inode *inode = NULL; - struct dentry dentry; - int err; - - if (unlikely(lump->lum_magic != LMV_USER_MAGIC)) - return -EINVAL; - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p) name %s stripe_offset %d, stripe_count: %u\n", - PFID(ll_inode2fid(parent)), parent, dirname, - (int)lump->lum_stripe_offset, lump->lum_stripe_count); - - if (lump->lum_stripe_count > 1 && - !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_DIR_STRIPE)) - return -EINVAL; - - if (lump->lum_magic != cpu_to_le32(LMV_USER_MAGIC)) - lustre_swab_lmv_user_md(lump); - - if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent))) - mode &= ~current_umask(); - mode = (mode & (0777 | S_ISVTX)) | S_IFDIR; - op_data = ll_prep_md_op_data(NULL, parent, NULL, dirname, - strlen(dirname), mode, LUSTRE_OPC_MKDIR, - lump); - if (IS_ERR(op_data)) { - err = PTR_ERR(op_data); - goto err_exit; - } - - op_data->op_cli_flags |= CLI_SET_MEA; - err = md_create(sbi->ll_md_exp, op_data, lump, sizeof(*lump), mode, - from_kuid(&init_user_ns, current_fsuid()), - from_kgid(&init_user_ns, current_fsgid()), - cfs_curproc_cap_pack(), 0, &request); - ll_finish_md_op_data(op_data); - - err = ll_prep_inode(&inode, request, parent->i_sb, NULL); - if (err) - goto err_exit; - - memset(&dentry, 0, sizeof(dentry)); - dentry.d_inode = inode; - - err = ll_init_security(&dentry, inode, parent); - iput(inode); - -err_exit: - ptlrpc_req_finished(request); - return err; -} - -int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, - int set_default) -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct md_op_data *op_data; - struct ptlrpc_request *req = NULL; - int rc = 0; - struct lustre_sb_info *lsi = s2lsi(inode->i_sb); - struct obd_device *mgc = lsi->lsi_mgc; - int 
lum_size; - - if (lump) { - /* - * This is coming from userspace, so should be in - * local endian. But the MDS would like it in little - * endian, so we swab it before we send it. - */ - switch (lump->lmm_magic) { - case LOV_USER_MAGIC_V1: { - if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1)) - lustre_swab_lov_user_md_v1(lump); - lum_size = sizeof(struct lov_user_md_v1); - break; - } - case LOV_USER_MAGIC_V3: { - if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3)) - lustre_swab_lov_user_md_v3( - (struct lov_user_md_v3 *)lump); - lum_size = sizeof(struct lov_user_md_v3); - break; - } - case LMV_USER_MAGIC: { - if (lump->lmm_magic != cpu_to_le32(LMV_USER_MAGIC)) - lustre_swab_lmv_user_md( - (struct lmv_user_md *)lump); - lum_size = sizeof(struct lmv_user_md); - break; - } - default: { - CDEBUG(D_IOCTL, - "bad userland LOV MAGIC: %#08x != %#08x nor %#08x\n", - lump->lmm_magic, LOV_USER_MAGIC_V1, - LOV_USER_MAGIC_V3); - return -EINVAL; - } - } - } else { - lum_size = sizeof(struct lov_user_md_v1); - } - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - /* swabbing is done in lov_setstripe() on server side */ - rc = md_setattr(sbi->ll_md_exp, op_data, lump, lum_size, &req); - ll_finish_md_op_data(op_data); - ptlrpc_req_finished(req); - if (rc) - return rc; - -#if OBD_OCD_VERSION(2, 13, 53, 0) > LUSTRE_VERSION_CODE - /* - * 2.9 server has stored filesystem default stripe in ROOT xattr, - * and it's stored into system config for backward compatibility. - * - * In the following we use the fact that LOV_USER_MAGIC_V1 and - * LOV_USER_MAGIC_V3 have the same initial fields so we do not - * need to make the distinction between the 2 versions - */ - if (set_default && mgc->u.cli.cl_mgc_mgsexp) { - char *param = NULL; - char *buf; - - param = kzalloc(MGS_PARAM_MAXLEN, GFP_NOFS); - if (!param) - return -ENOMEM; - - buf = param; - /* Get fsname and assume devname to be -MDT0000. 
*/ - ll_get_fsname(inode->i_sb, buf, MTI_NAME_MAXLEN); - strcat(buf, "-MDT0000.lov"); - buf += strlen(buf); - - /* Set root stripesize */ - sprintf(buf, ".stripesize=%u", - lump ? le32_to_cpu(lump->lmm_stripe_size) : 0); - rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param); - if (rc) - goto end; - - /* Set root stripecount */ - sprintf(buf, ".stripecount=%hd", - lump ? le16_to_cpu(lump->lmm_stripe_count) : 0); - rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param); - if (rc) - goto end; - - /* Set root stripeoffset */ - sprintf(buf, ".stripeoffset=%hd", - lump ? le16_to_cpu(lump->lmm_stripe_offset) : - (typeof(lump->lmm_stripe_offset))(-1)); - rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param); - -end: - kfree(param); - } -#endif - return rc; -} - -/** - * This function will be used to get default LOV/LMV/Default LMV - * @valid will be used to indicate which stripe it will retrieve - * OBD_MD_MEA LMV stripe EA - * OBD_MD_DEFAULT_MEA Default LMV stripe EA - * otherwise Default LOV EA. 
- * Each time, it can only retrieve 1 stripe EA - **/ -int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, - struct ptlrpc_request **request, u64 valid) -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct mdt_body *body; - struct lov_mds_md *lmm = NULL; - struct ptlrpc_request *req = NULL; - int rc, lmmsize; - struct md_op_data *op_data; - - rc = ll_get_max_mdsize(sbi, &lmmsize); - if (rc) - return rc; - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, - 0, lmmsize, LUSTRE_OPC_ANY, - NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA; - rc = md_getattr(sbi->ll_md_exp, op_data, &req); - ll_finish_md_op_data(op_data); - if (rc < 0) { - CDEBUG(D_INFO, "md_getattr failed on inode " DFID ": rc %d\n", - PFID(ll_inode2fid(inode)), rc); - goto out; - } - - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - - lmmsize = body->mbo_eadatasize; - - if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) || - lmmsize == 0) { - rc = -ENODATA; - goto out; - } - - lmm = req_capsule_server_sized_get(&req->rq_pill, - &RMF_MDT_MD, lmmsize); - LASSERT(lmm); - - /* - * This is coming from the MDS, so is probably in - * little endian. We convert it to host endian before - * passing it to userspace. 
- */ - /* We don't swab objects for directories */ - switch (le32_to_cpu(lmm->lmm_magic)) { - case LOV_MAGIC_V1: - if (cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) - lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm); - break; - case LOV_MAGIC_V3: - if (cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) - lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm); - break; - case LMV_MAGIC_V1: - if (cpu_to_le32(LMV_MAGIC) != LMV_MAGIC) - lustre_swab_lmv_mds_md((union lmv_mds_md *)lmm); - break; - case LMV_USER_MAGIC: - if (cpu_to_le32(LMV_USER_MAGIC) != LMV_USER_MAGIC) - lustre_swab_lmv_user_md((struct lmv_user_md *)lmm); - break; - default: - CERROR("unknown magic: %lX\n", (unsigned long)lmm->lmm_magic); - rc = -EPROTO; - } -out: - *plmm = lmm; - *plmm_size = lmmsize; - *request = req; - return rc; -} - -int ll_get_mdt_idx_by_fid(struct ll_sb_info *sbi, const struct lu_fid *fid) -{ - struct md_op_data *op_data; - int mdt_index, rc; - - op_data = kzalloc(sizeof(*op_data), GFP_NOFS); - if (!op_data) - return -ENOMEM; - - op_data->op_flags |= MF_GET_MDT_IDX; - op_data->op_fid1 = *fid; - rc = md_getattr(sbi->ll_md_exp, op_data, NULL); - mdt_index = op_data->op_mds; - kvfree(op_data); - if (rc < 0) - return rc; - - return mdt_index; -} - -/* - * Get MDT index for the inode. - */ -int ll_get_mdt_idx(struct inode *inode) -{ - return ll_get_mdt_idx_by_fid(ll_i2sbi(inode), ll_inode2fid(inode)); -} - -/** - * Generic handler to do any pre-copy work. - * - * It sends a first hsm_progress (with extent length == 0) to coordinator as a - * first information for it that real work has started. - * - * Moreover, for a ARCHIVE request, it will sample the file data version and - * store it in \a copy. - * - * \return 0 on success. - */ -static int ll_ioc_copy_start(struct super_block *sb, struct hsm_copy *copy) -{ - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct hsm_progress_kernel hpk; - int rc2, rc = 0; - - /* Forge a hsm_progress based on data from copy. 
*/ - hpk.hpk_fid = copy->hc_hai.hai_fid; - hpk.hpk_cookie = copy->hc_hai.hai_cookie; - hpk.hpk_extent.offset = copy->hc_hai.hai_extent.offset; - hpk.hpk_extent.length = 0; - hpk.hpk_flags = 0; - hpk.hpk_errval = 0; - hpk.hpk_data_version = 0; - - /* For archive request, we need to read the current file version. */ - if (copy->hc_hai.hai_action == HSMA_ARCHIVE) { - struct inode *inode; - __u64 data_version = 0; - - /* Get inode for this fid */ - inode = search_inode_for_lustre(sb, ©->hc_hai.hai_fid); - if (IS_ERR(inode)) { - hpk.hpk_flags |= HP_FLAG_RETRY; - /* hpk_errval is >= 0 */ - hpk.hpk_errval = -PTR_ERR(inode); - rc = PTR_ERR(inode); - goto progress; - } - - /* Read current file data version */ - rc = ll_data_version(inode, &data_version, LL_DV_RD_FLUSH); - iput(inode); - if (rc != 0) { - CDEBUG(D_HSM, - "Could not read file data version of " DFID " (rc = %d). Archive request (%#llx) could not be done.\n", - PFID(©->hc_hai.hai_fid), rc, - copy->hc_hai.hai_cookie); - hpk.hpk_flags |= HP_FLAG_RETRY; - /* hpk_errval must be >= 0 */ - hpk.hpk_errval = -rc; - goto progress; - } - - /* Store in the hsm_copy for later copytool use. - * Always modified even if no lsm. - */ - copy->hc_data_version = data_version; - } - -progress: - /* On error, the request should be considered as completed */ - if (hpk.hpk_errval > 0) - hpk.hpk_flags |= HP_FLAG_COMPLETED; - rc2 = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk), - &hpk, NULL); - - return rc ? rc : rc2; -} - -/** - * Generic handler to do any post-copy work. - * - * It will send the last hsm_progress update to coordinator to inform it - * that copy is finished and whether it was successful or not. - * - * Moreover, - * - for ARCHIVE request, it will sample the file data version and compare it - * with the version saved in ll_ioc_copy_start(). If they do not match, copy - * will be considered as failed. 
- * - for RESTORE request, it will sample the file data version and send it to - * coordinator which is useful if the file was imported as 'released'. - * - * \return 0 on success. - */ -static int ll_ioc_copy_end(struct super_block *sb, struct hsm_copy *copy) -{ - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct hsm_progress_kernel hpk; - int rc2, rc = 0; - - /* If you modify the logic here, also check llapi_hsm_copy_end(). */ - /* Take care: copy->hc_hai.hai_action, len, gid and data are not - * initialized if copy_end was called with copy == NULL. - */ - - /* Forge a hsm_progress based on data from copy. */ - hpk.hpk_fid = copy->hc_hai.hai_fid; - hpk.hpk_cookie = copy->hc_hai.hai_cookie; - hpk.hpk_extent = copy->hc_hai.hai_extent; - hpk.hpk_flags = copy->hc_flags | HP_FLAG_COMPLETED; - hpk.hpk_errval = copy->hc_errval; - hpk.hpk_data_version = 0; - - /* For archive request, we need to check the file data was not changed. - * - * For restore request, we need to send the file data version, this is - * useful when the file was created using hsm_import. - */ - if (((copy->hc_hai.hai_action == HSMA_ARCHIVE) || - (copy->hc_hai.hai_action == HSMA_RESTORE)) && - (copy->hc_errval == 0)) { - struct inode *inode; - __u64 data_version = 0; - - /* Get lsm for this fid */ - inode = search_inode_for_lustre(sb, ©->hc_hai.hai_fid); - if (IS_ERR(inode)) { - hpk.hpk_flags |= HP_FLAG_RETRY; - /* hpk_errval must be >= 0 */ - hpk.hpk_errval = -PTR_ERR(inode); - rc = PTR_ERR(inode); - goto progress; - } - - rc = ll_data_version(inode, &data_version, LL_DV_RD_FLUSH); - iput(inode); - if (rc) { - CDEBUG(D_HSM, - "Could not read file data version. Request could not be confirmed.\n"); - if (hpk.hpk_errval == 0) - hpk.hpk_errval = -rc; - goto progress; - } - - /* Store in the hsm_copy for later copytool use. - * Always modified even if no lsm. - */ - hpk.hpk_data_version = data_version; - - /* File could have been stripped during archiving, so we need - * to check anyway. 
- */ - if ((copy->hc_hai.hai_action == HSMA_ARCHIVE) && - (copy->hc_data_version != data_version)) { - CDEBUG(D_HSM, "File data version mismatched. File content was changed during archiving. " DFID ", start:%#llx current:%#llx\n", - PFID(©->hc_hai.hai_fid), - copy->hc_data_version, data_version); - /* File was changed, send error to cdt. Do not ask for - * retry because if a file is modified frequently, - * the cdt will loop on retried archive requests. - * The policy engine will ask for a new archive later - * when the file will not be modified for some tunable - * time - */ - hpk.hpk_flags &= ~HP_FLAG_RETRY; - rc = -EBUSY; - /* hpk_errval must be >= 0 */ - hpk.hpk_errval = -rc; - } - } - -progress: - rc2 = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk), - &hpk, NULL); - - return rc ? rc : rc2; -} - -static int copy_and_ioctl(int cmd, struct obd_export *exp, - const void __user *data, size_t size) -{ - void *copy; - int rc; - - copy = memdup_user(data, size); - if (IS_ERR(copy)) - return PTR_ERR(copy); - - rc = obd_iocontrol(cmd, exp, size, copy, NULL); - kfree(copy); - - return rc; -} - -static int quotactl_ioctl(struct ll_sb_info *sbi, struct if_quotactl *qctl) -{ - int cmd = qctl->qc_cmd; - int type = qctl->qc_type; - int id = qctl->qc_id; - int valid = qctl->qc_valid; - int rc = 0; - - switch (cmd) { - case Q_SETQUOTA: - case Q_SETINFO: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - break; - case Q_GETQUOTA: - if (((type == USRQUOTA && - !uid_eq(current_euid(), make_kuid(&init_user_ns, id))) || - (type == GRPQUOTA && - !in_egroup_p(make_kgid(&init_user_ns, id)))) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - break; - case Q_GETINFO: - break; - default: - CERROR("unsupported quotactl op: %#x\n", cmd); - return -ENOTTY; - } - - if (valid != QC_GENERAL) { - if (cmd == Q_GETINFO) - qctl->qc_cmd = Q_GETOINFO; - else if (cmd == Q_GETQUOTA) - qctl->qc_cmd = Q_GETOQUOTA; - else - return -EINVAL; - - switch (valid) { - case QC_MDTIDX: - rc = 
obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp, - sizeof(*qctl), qctl, NULL); - break; - case QC_OSTIDX: - rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_dt_exp, - sizeof(*qctl), qctl, NULL); - break; - case QC_UUID: - rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp, - sizeof(*qctl), qctl, NULL); - if (rc == -EAGAIN) - rc = obd_iocontrol(OBD_IOC_QUOTACTL, - sbi->ll_dt_exp, - sizeof(*qctl), qctl, NULL); - break; - default: - rc = -EINVAL; - break; - } - - if (rc) - return rc; - - qctl->qc_cmd = cmd; - } else { - struct obd_quotactl *oqctl; - - oqctl = kzalloc(sizeof(*oqctl), GFP_NOFS); - if (!oqctl) - return -ENOMEM; - - QCTL_COPY(oqctl, qctl); - rc = obd_quotactl(sbi->ll_md_exp, oqctl); - if (rc) { - kfree(oqctl); - return rc; - } - /* If QIF_SPACE is not set, client should collect the - * space usage from OSSs by itself - */ - if (cmd == Q_GETQUOTA && - !(oqctl->qc_dqblk.dqb_valid & QIF_SPACE) && - !oqctl->qc_dqblk.dqb_curspace) { - struct obd_quotactl *oqctl_tmp; - - oqctl_tmp = kzalloc(sizeof(*oqctl_tmp), GFP_NOFS); - if (!oqctl_tmp) { - rc = -ENOMEM; - goto out; - } - - oqctl_tmp->qc_cmd = Q_GETOQUOTA; - oqctl_tmp->qc_id = oqctl->qc_id; - oqctl_tmp->qc_type = oqctl->qc_type; - - /* collect space usage from OSTs */ - oqctl_tmp->qc_dqblk.dqb_curspace = 0; - rc = obd_quotactl(sbi->ll_dt_exp, oqctl_tmp); - if (!rc || rc == -EREMOTEIO) { - oqctl->qc_dqblk.dqb_curspace = - oqctl_tmp->qc_dqblk.dqb_curspace; - oqctl->qc_dqblk.dqb_valid |= QIF_SPACE; - } - - /* collect space & inode usage from MDTs */ - oqctl_tmp->qc_dqblk.dqb_curspace = 0; - oqctl_tmp->qc_dqblk.dqb_curinodes = 0; - rc = obd_quotactl(sbi->ll_md_exp, oqctl_tmp); - if (!rc || rc == -EREMOTEIO) { - oqctl->qc_dqblk.dqb_curspace += - oqctl_tmp->qc_dqblk.dqb_curspace; - oqctl->qc_dqblk.dqb_curinodes = - oqctl_tmp->qc_dqblk.dqb_curinodes; - oqctl->qc_dqblk.dqb_valid |= QIF_INODES; - } else { - oqctl->qc_dqblk.dqb_valid &= ~QIF_SPACE; - } - - kfree(oqctl_tmp); - } -out: - QCTL_COPY(qctl, oqctl); - 
kfree(oqctl); - } - - return rc; -} - -/* This function tries to get a single name component, - * to send to the server. No actual path traversal involved, - * so we limit to NAME_MAX - */ -static char *ll_getname(const char __user *filename) -{ - int ret = 0, len; - char *tmp; - - tmp = kzalloc(NAME_MAX + 1, GFP_KERNEL); - if (!tmp) - return ERR_PTR(-ENOMEM); - - len = strncpy_from_user(tmp, filename, NAME_MAX + 1); - if (len < 0) - ret = len; - else if (len == 0) - ret = -ENOENT; - else if (len > NAME_MAX && tmp[NAME_MAX] != 0) - ret = -ENAMETOOLONG; - - if (ret) { - kfree(tmp); - tmp = ERR_PTR(ret); - } - return tmp; -} - -#define ll_putname(filename) kfree(filename) - -static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - struct inode *inode = file_inode(file); - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct obd_ioctl_data *data; - int rc = 0; - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p), cmd=%#x\n", - PFID(ll_inode2fid(inode)), inode, cmd); - - /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ - if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ - return -ENOTTY; - - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); - switch (cmd) { - case FSFILT_IOC_GETFLAGS: - case FSFILT_IOC_SETFLAGS: - return ll_iocontrol(inode, file, cmd, arg); - case FSFILT_IOC_GETVERSION_OLD: - case FSFILT_IOC_GETVERSION: - return put_user(inode->i_generation, (int __user *)arg); - /* We need to special case any other ioctls we want to handle, - * to send them to the MDS/OST as appropriate and to properly - * network encode the arg field. 
- case FSFILT_IOC_SETVERSION_OLD: - case FSFILT_IOC_SETVERSION: - */ - case LL_IOC_GET_MDTIDX: { - int mdtidx; - - mdtidx = ll_get_mdt_idx(inode); - if (mdtidx < 0) - return mdtidx; - - if (put_user((int)mdtidx, (int __user *)arg)) - return -EFAULT; - - return 0; - } - case IOC_MDC_LOOKUP: { - int namelen, len = 0; - char *buf = NULL; - char *filename; - - rc = obd_ioctl_getdata(&buf, &len, (void __user *)arg); - if (rc) - return rc; - data = (void *)buf; - - filename = data->ioc_inlbuf1; - namelen = strlen(filename); - - if (namelen < 1) { - CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n"); - rc = -EINVAL; - goto out_free; - } - - rc = ll_get_fid_by_name(inode, filename, namelen, NULL, NULL); - if (rc < 0) { - CERROR("%s: lookup %.*s failed: rc = %d\n", - ll_get_fsname(inode->i_sb, NULL, 0), namelen, - filename, rc); - goto out_free; - } -out_free: - kvfree(buf); - return rc; - } - case LL_IOC_LMV_SETSTRIPE: { - struct lmv_user_md *lum; - char *buf = NULL; - char *filename; - int namelen = 0; - int lumlen = 0; - umode_t mode; - int len; - int rc; - - rc = obd_ioctl_getdata(&buf, &len, (void __user *)arg); - if (rc) - return rc; - - data = (void *)buf; - if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2 || - data->ioc_inllen1 == 0 || data->ioc_inllen2 == 0) { - rc = -EINVAL; - goto lmv_out_free; - } - - filename = data->ioc_inlbuf1; - namelen = data->ioc_inllen1; - - if (namelen < 1) { - CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n"); - rc = -EINVAL; - goto lmv_out_free; - } - lum = (struct lmv_user_md *)data->ioc_inlbuf2; - lumlen = data->ioc_inllen2; - - if (lum->lum_magic != LMV_USER_MAGIC || - lumlen != sizeof(*lum)) { - CERROR("%s: wrong lum magic %x or size %d: rc = %d\n", - filename, lum->lum_magic, lumlen, -EFAULT); - rc = -EINVAL; - goto lmv_out_free; - } - -#if OBD_OCD_VERSION(2, 9, 50, 0) > LUSTRE_VERSION_CODE - mode = data->ioc_type != 0 ? 
data->ioc_type : 0777; -#else - mode = data->ioc_type; -#endif - rc = ll_dir_setdirstripe(inode, lum, filename, mode); -lmv_out_free: - kvfree(buf); - return rc; - } - case LL_IOC_LMV_SET_DEFAULT_STRIPE: { - struct lmv_user_md __user *ulump; - struct lmv_user_md lum; - int rc; - - ulump = (struct lmv_user_md __user *)arg; - if (copy_from_user(&lum, ulump, sizeof(lum))) - return -EFAULT; - - if (lum.lum_magic != LMV_USER_MAGIC) - return -EINVAL; - - rc = ll_dir_setstripe(inode, (struct lov_user_md *)&lum, 0); - - return rc; - } - case LL_IOC_LOV_SETSTRIPE: { - struct lov_user_md_v3 lumv3; - struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3; - struct lov_user_md_v1 __user *lumv1p = (void __user *)arg; - struct lov_user_md_v3 __user *lumv3p = (void __user *)arg; - - int set_default = 0; - - LASSERT(sizeof(lumv3) == sizeof(*lumv3p)); - LASSERT(sizeof(lumv3.lmm_objects[0]) == - sizeof(lumv3p->lmm_objects[0])); - /* first try with v1 which is smaller than v3 */ - if (copy_from_user(lumv1, lumv1p, sizeof(*lumv1))) - return -EFAULT; - - if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) { - if (copy_from_user(&lumv3, lumv3p, sizeof(lumv3))) - return -EFAULT; - } - - if (is_root_inode(inode)) - set_default = 1; - - /* in v1 and v3 cases lumv1 points to data */ - rc = ll_dir_setstripe(inode, lumv1, set_default); - - return rc; - } - case LL_IOC_LMV_GETSTRIPE: { - struct lmv_user_md __user *ulmv; - struct lmv_user_md lum; - struct ptlrpc_request *request = NULL; - struct lmv_user_md *tmp = NULL; - union lmv_mds_md *lmm = NULL; - u64 valid = 0; - int max_stripe_count; - int stripe_count; - int mdt_index; - int lum_size; - int lmmsize; - int rc; - int i; - - ulmv = (struct lmv_user_md __user *)arg; - if (copy_from_user(&lum, ulmv, sizeof(*ulmv))) - return -EFAULT; - - max_stripe_count = lum.lum_stripe_count; - /* - * lum_magic will indicate which stripe the ioctl will like - * to get, LMV_MAGIC_V1 is for normal LMV stripe, LMV_USER_MAGIC - * is for default LMV stripe - */ 
- if (lum.lum_magic == LMV_MAGIC_V1) - valid |= OBD_MD_MEA; - else if (lum.lum_magic == LMV_USER_MAGIC) - valid |= OBD_MD_DEFAULT_MEA; - else - return -EINVAL; - - rc = ll_dir_getstripe(inode, (void **)&lmm, &lmmsize, &request, - valid); - if (rc) - goto finish_req; - - /* Get default LMV EA */ - if (lum.lum_magic == LMV_USER_MAGIC) { - if (lmmsize > sizeof(*ulmv)) { - rc = -EINVAL; - goto finish_req; - } - - if (copy_to_user(ulmv, lmm, lmmsize)) - rc = -EFAULT; - - goto finish_req; - } - - stripe_count = lmv_mds_md_stripe_count_get(lmm); - if (max_stripe_count < stripe_count) { - lum.lum_stripe_count = stripe_count; - if (copy_to_user(ulmv, &lum, sizeof(lum))) { - rc = -EFAULT; - goto finish_req; - } - rc = -E2BIG; - goto finish_req; - } - - lum_size = lmv_user_md_size(stripe_count, LMV_MAGIC_V1); - tmp = kzalloc(lum_size, GFP_NOFS); - if (!tmp) { - rc = -ENOMEM; - goto finish_req; - } - - mdt_index = ll_get_mdt_idx(inode); - if (mdt_index < 0) { - rc = -ENOMEM; - goto out_tmp; - } - tmp->lum_magic = LMV_MAGIC_V1; - tmp->lum_stripe_count = 0; - tmp->lum_stripe_offset = mdt_index; - for (i = 0; i < stripe_count; i++) { - struct lu_fid fid; - - fid_le_to_cpu(&fid, &lmm->lmv_md_v1.lmv_stripe_fids[i]); - mdt_index = ll_get_mdt_idx_by_fid(sbi, &fid); - if (mdt_index < 0) { - rc = mdt_index; - goto out_tmp; - } - tmp->lum_objects[i].lum_mds = mdt_index; - tmp->lum_objects[i].lum_fid = fid; - tmp->lum_stripe_count++; - } - - if (copy_to_user(ulmv, tmp, lum_size)) { - rc = -EFAULT; - goto out_tmp; - } -out_tmp: - kfree(tmp); -finish_req: - ptlrpc_req_finished(request); - return rc; - } - - case LL_IOC_LOV_SWAP_LAYOUTS: - return -EPERM; - case IOC_OBD_STATFS: - return ll_obd_statfs(inode, (void __user *)arg); - case LL_IOC_LOV_GETSTRIPE: - case LL_IOC_MDC_GETINFO: - case IOC_MDC_GETFILEINFO: - case IOC_MDC_GETFILESTRIPE: { - struct ptlrpc_request *request = NULL; - struct lov_user_md __user *lump; - struct lov_mds_md *lmm = NULL; - struct mdt_body *body; - char *filename = 
NULL; - int lmmsize; - - if (cmd == IOC_MDC_GETFILEINFO || - cmd == IOC_MDC_GETFILESTRIPE) { - filename = ll_getname((const char __user *)arg); - if (IS_ERR(filename)) - return PTR_ERR(filename); - - rc = ll_lov_getstripe_ea_info(inode, filename, &lmm, - &lmmsize, &request); - } else { - rc = ll_dir_getstripe(inode, (void **)&lmm, &lmmsize, - &request, 0); - } - - if (request) { - body = req_capsule_server_get(&request->rq_pill, - &RMF_MDT_BODY); - LASSERT(body); - } else { - goto out_req; - } - - if (rc < 0) { - if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO || - cmd == LL_IOC_MDC_GETINFO)) { - rc = 0; - goto skip_lmm; - } - - goto out_req; - } - - if (cmd == IOC_MDC_GETFILESTRIPE || - cmd == LL_IOC_LOV_GETSTRIPE) { - lump = (struct lov_user_md __user *)arg; - } else { - struct lov_user_mds_data __user *lmdp; - - lmdp = (struct lov_user_mds_data __user *)arg; - lump = &lmdp->lmd_lmm; - } - if (copy_to_user(lump, lmm, lmmsize)) { - if (copy_to_user(lump, lmm, sizeof(*lump))) { - rc = -EFAULT; - goto out_req; - } - rc = -EOVERFLOW; - } -skip_lmm: - if (cmd == IOC_MDC_GETFILEINFO || cmd == LL_IOC_MDC_GETINFO) { - struct lov_user_mds_data __user *lmdp; - lstat_t st = { 0 }; - - st.st_dev = inode->i_sb->s_dev; - st.st_mode = body->mbo_mode; - st.st_nlink = body->mbo_nlink; - st.st_uid = body->mbo_uid; - st.st_gid = body->mbo_gid; - st.st_rdev = body->mbo_rdev; - st.st_size = body->mbo_size; - st.st_blksize = PAGE_SIZE; - st.st_blocks = body->mbo_blocks; - st.st_atime = body->mbo_atime; - st.st_mtime = body->mbo_mtime; - st.st_ctime = body->mbo_ctime; - st.st_ino = cl_fid_build_ino(&body->mbo_fid1, - sbi->ll_flags & - LL_SBI_32BIT_API); - - lmdp = (struct lov_user_mds_data __user *)arg; - if (copy_to_user(&lmdp->lmd_st, &st, sizeof(st))) { - rc = -EFAULT; - goto out_req; - } - } - -out_req: - ptlrpc_req_finished(request); - if (filename) - ll_putname(filename); - return rc; - } - case OBD_IOC_QUOTACTL: { - struct if_quotactl *qctl; - - qctl = kzalloc(sizeof(*qctl), 
GFP_NOFS); - if (!qctl) - return -ENOMEM; - - if (copy_from_user(qctl, (void __user *)arg, sizeof(*qctl))) { - rc = -EFAULT; - goto out_quotactl; - } - - rc = quotactl_ioctl(sbi, qctl); - - if (rc == 0 && copy_to_user((void __user *)arg, qctl, - sizeof(*qctl))) - rc = -EFAULT; - -out_quotactl: - kfree(qctl); - return rc; - } - case OBD_IOC_GETDTNAME: - case OBD_IOC_GETMDNAME: - return ll_get_obd_name(inode, cmd, arg); - case LL_IOC_FLUSHCTX: - return ll_flush_ctx(inode); - case LL_IOC_GETOBDCOUNT: { - int count, vallen; - struct obd_export *exp; - - if (copy_from_user(&count, (int __user *)arg, sizeof(int))) - return -EFAULT; - - /* get ost count when count is zero, get mdt count otherwise */ - exp = count ? sbi->ll_md_exp : sbi->ll_dt_exp; - vallen = sizeof(count); - rc = obd_get_info(NULL, exp, sizeof(KEY_TGT_COUNT), - KEY_TGT_COUNT, &vallen, &count); - if (rc) { - CERROR("get target count failed: %d\n", rc); - return rc; - } - - if (copy_to_user((int __user *)arg, &count, sizeof(int))) - return -EFAULT; - - return 0; - } - case LL_IOC_PATH2FID: - if (copy_to_user((void __user *)arg, ll_inode2fid(inode), - sizeof(struct lu_fid))) - return -EFAULT; - return 0; - case LL_IOC_GET_CONNECT_FLAGS: { - return obd_iocontrol(cmd, sbi->ll_md_exp, 0, NULL, - (void __user *)arg); - } - case OBD_IOC_CHANGELOG_SEND: - case OBD_IOC_CHANGELOG_CLEAR: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void __user *)arg, - sizeof(struct ioc_changelog)); - return rc; - case OBD_IOC_FID2PATH: - return ll_fid2path(inode, (void __user *)arg); - case LL_IOC_GETPARENT: - return ll_getparent(file, (void __user *)arg); - case LL_IOC_FID2MDTIDX: { - struct obd_export *exp = ll_i2mdexp(inode); - struct lu_fid fid; - __u32 index; - - if (copy_from_user(&fid, (const struct lu_fid __user *)arg, - sizeof(fid))) - return -EFAULT; - - /* Call mdc_iocontrol */ - rc = obd_iocontrol(LL_IOC_FID2MDTIDX, exp, sizeof(fid), &fid, - &index); - if (rc) - return rc; 
- - return index; - } - case LL_IOC_HSM_REQUEST: { - struct hsm_user_request *hur; - ssize_t totalsize; - - hur = memdup_user((void __user *)arg, sizeof(*hur)); - if (IS_ERR(hur)) - return PTR_ERR(hur); - - /* Compute the whole struct size */ - totalsize = hur_len(hur); - kfree(hur); - if (totalsize < 0) - return -E2BIG; - - /* Final size will be more than double totalsize */ - if (totalsize >= MDS_MAXREQSIZE / 3) - return -E2BIG; - - hur = kzalloc(totalsize, GFP_NOFS); - if (!hur) - return -ENOMEM; - - /* Copy the whole struct */ - if (copy_from_user(hur, (void __user *)arg, totalsize)) { - kvfree(hur); - return -EFAULT; - } - - if (hur->hur_request.hr_action == HUA_RELEASE) { - const struct lu_fid *fid; - struct inode *f; - int i; - - for (i = 0; i < hur->hur_request.hr_itemcount; i++) { - fid = &hur->hur_user_item[i].hui_fid; - f = search_inode_for_lustre(inode->i_sb, fid); - if (IS_ERR(f)) { - rc = PTR_ERR(f); - break; - } - - rc = ll_hsm_release(f); - iput(f); - if (rc != 0) - break; - } - } else { - rc = obd_iocontrol(cmd, ll_i2mdexp(inode), totalsize, - hur, NULL); - } - - kvfree(hur); - - return rc; - } - case LL_IOC_HSM_PROGRESS: { - struct hsm_progress_kernel hpk; - struct hsm_progress hp; - - if (copy_from_user(&hp, (void __user *)arg, sizeof(hp))) - return -EFAULT; - - hpk.hpk_fid = hp.hp_fid; - hpk.hpk_cookie = hp.hp_cookie; - hpk.hpk_extent = hp.hp_extent; - hpk.hpk_flags = hp.hp_flags; - hpk.hpk_errval = hp.hp_errval; - hpk.hpk_data_version = 0; - - /* File may not exist in Lustre; all progress - * reported to Lustre root - */ - rc = obd_iocontrol(cmd, sbi->ll_md_exp, sizeof(hpk), &hpk, - NULL); - return rc; - } - case LL_IOC_HSM_CT_START: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void __user *)arg, - sizeof(struct lustre_kernelcomm)); - return rc; - - case LL_IOC_HSM_COPY_START: { - struct hsm_copy *copy; - int rc; - - copy = memdup_user((char __user *)arg, sizeof(*copy)); - if (IS_ERR(copy)) - 
return PTR_ERR(copy); - - rc = ll_ioc_copy_start(inode->i_sb, copy); - if (copy_to_user((char __user *)arg, copy, sizeof(*copy))) - rc = -EFAULT; - - kfree(copy); - return rc; - } - case LL_IOC_HSM_COPY_END: { - struct hsm_copy *copy; - int rc; - - copy = memdup_user((char __user *)arg, sizeof(*copy)); - if (IS_ERR(copy)) - return PTR_ERR(copy); - - rc = ll_ioc_copy_end(inode->i_sb, copy); - if (copy_to_user((char __user *)arg, copy, sizeof(*copy))) - rc = -EFAULT; - - kfree(copy); - return rc; - } - case LL_IOC_MIGRATE: { - char *buf = NULL; - const char *filename; - int namelen = 0; - int len; - int rc; - int mdtidx; - - rc = obd_ioctl_getdata(&buf, &len, (void __user *)arg); - if (rc < 0) - return rc; - - data = (struct obd_ioctl_data *)buf; - if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2 || - !data->ioc_inllen1 || !data->ioc_inllen2) { - rc = -EINVAL; - goto migrate_free; - } - - filename = data->ioc_inlbuf1; - namelen = data->ioc_inllen1; - if (namelen < 1 || namelen != strlen(filename) + 1) { - rc = -EINVAL; - goto migrate_free; - } - - if (data->ioc_inllen2 != sizeof(mdtidx)) { - rc = -EINVAL; - goto migrate_free; - } - mdtidx = *(int *)data->ioc_inlbuf2; - - rc = ll_migrate(inode, file, mdtidx, filename, namelen - 1); -migrate_free: - kvfree(buf); - - return rc; - } - - default: - return obd_iocontrol(cmd, sbi->ll_dt_exp, 0, NULL, - (void __user *)arg); - } -} - -static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin) -{ - struct inode *inode = file->f_mapping->host; - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - struct ll_sb_info *sbi = ll_i2sbi(inode); - int api32 = ll_need_32bit_api(sbi); - loff_t ret = -EINVAL; - - switch (origin) { - case SEEK_SET: - break; - case SEEK_CUR: - offset += file->f_pos; - break; - case SEEK_END: - if (offset > 0) - goto out; - if (api32) - offset += LL_DIR_END_OFF_32BIT; - else - offset += LL_DIR_END_OFF; - break; - default: - goto out; - } - - if (offset >= 0 && - ((api32 && offset <= 
LL_DIR_END_OFF_32BIT) || - (!api32 && offset <= LL_DIR_END_OFF))) { - if (offset != file->f_pos) { - if ((api32 && offset == LL_DIR_END_OFF_32BIT) || - (!api32 && offset == LL_DIR_END_OFF)) - fd->lfd_pos = MDS_DIR_END_OFF; - else if (api32 && sbi->ll_flags & LL_SBI_64BIT_HASH) - fd->lfd_pos = offset << 32; - else - fd->lfd_pos = offset; - file->f_pos = offset; - } - ret = offset; - } - goto out; - -out: - return ret; -} - -static int ll_dir_open(struct inode *inode, struct file *file) -{ - return ll_file_open(inode, file); -} - -static int ll_dir_release(struct inode *inode, struct file *file) -{ - return ll_file_release(inode, file); -} - -const struct file_operations ll_dir_operations = { - .llseek = ll_dir_seek, - .open = ll_dir_open, - .release = ll_dir_release, - .read = generic_read_dir, - .iterate_shared = ll_readdir, - .unlocked_ioctl = ll_dir_ioctl, - .fsync = ll_fsync, -}; diff --git a/drivers/staging/lustre/lustre/llite/file.c b/drivers/staging/lustre/lustre/llite/file.c deleted file mode 100644 index ca5faea13b7e..000000000000 --- a/drivers/staging/lustre/lustre/llite/file.c +++ /dev/null @@ -1,3600 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/llite/file.c - * - * Author: Peter Braam <braam@clusterfs.com> - * Author: Phil Schwan <phil@clusterfs.com> - * Author: Andreas Dilger <adilger@clusterfs.com> - */ - -#define DEBUG_SUBSYSTEM S_LLITE -#include <lustre_dlm.h> -#include <linux/pagemap.h> -#include <linux/file.h> -#include <linux/sched.h> -#include <linux/mount.h> -#include <uapi/linux/lustre/lustre_fiemap.h> -#include <uapi/linux/lustre/lustre_ioctl.h> -#include <lustre_swab.h> - -#include <cl_object.h> -#include "llite_internal.h" - -static int -ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg); - -static int ll_lease_close(struct obd_client_handle *och, struct inode *inode, - bool *lease_broken); - -static enum llioc_iter -ll_iocontrol_call(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg, int *rcp); - -static struct ll_file_data *ll_file_data_get(void) -{ - struct ll_file_data *fd; - - fd = kmem_cache_zalloc(ll_file_data_slab, GFP_NOFS); - if (!fd) - return NULL; - fd->fd_write_failed = false; - return fd; -} - -static void ll_file_data_put(struct ll_file_data *fd) -{ - if (fd) - kmem_cache_free(ll_file_data_slab, fd); -} - -/** - * Packs all the attributes into @op_data for the CLOSE rpc. 
- */ -static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data, - struct obd_client_handle *och) -{ - struct ll_inode_info *lli = ll_i2info(inode); - - ll_prep_md_op_data(op_data, inode, NULL, NULL, - 0, 0, LUSTRE_OPC_ANY, NULL); - - op_data->op_attr.ia_mode = inode->i_mode; - op_data->op_attr.ia_atime = inode->i_atime; - op_data->op_attr.ia_mtime = inode->i_mtime; - op_data->op_attr.ia_ctime = inode->i_ctime; - op_data->op_attr.ia_size = i_size_read(inode); - op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET | - ATTR_MTIME | ATTR_MTIME_SET | - ATTR_CTIME | ATTR_CTIME_SET; - op_data->op_attr_blocks = inode->i_blocks; - op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags); - op_data->op_handle = och->och_fh; - - /* - * For HSM: if inode data has been modified, pack it so that - * MDT can set data dirty flag in the archive. - */ - if (och->och_flags & FMODE_WRITE && - test_and_clear_bit(LLIF_DATA_MODIFIED, &lli->lli_flags)) - op_data->op_bias |= MDS_DATA_MODIFIED; -} - -/** - * Perform a close, possibly with a bias. - * The meaning of "data" depends on the value of "bias". - * - * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version. - * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to - * swap layouts with. 
- */ -static int ll_close_inode_openhandle(struct inode *inode, - struct obd_client_handle *och, - enum mds_op_bias bias, - void *data) -{ - const struct ll_inode_info *lli = ll_i2info(inode); - struct obd_export *md_exp = ll_i2mdexp(inode); - struct md_op_data *op_data; - struct ptlrpc_request *req = NULL; - int rc; - - if (!class_exp2obd(md_exp)) { - CERROR("%s: invalid MDC connection handle closing " DFID "\n", - ll_get_fsname(inode->i_sb, NULL, 0), - PFID(&lli->lli_fid)); - rc = 0; - goto out; - } - - op_data = kzalloc(sizeof(*op_data), GFP_NOFS); - /* - * We leak openhandle and request here on error, but not much to be - * done in OOM case since app won't retry close on error either. - */ - if (!op_data) { - rc = -ENOMEM; - goto out; - } - - ll_prepare_close(inode, op_data, och); - switch (bias) { - case MDS_CLOSE_LAYOUT_SWAP: - LASSERT(data); - op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP; - op_data->op_data_version = 0; - op_data->op_lease_handle = och->och_lease_handle; - op_data->op_fid2 = *ll_inode2fid(data); - break; - - case MDS_HSM_RELEASE: - LASSERT(data); - op_data->op_bias |= MDS_HSM_RELEASE; - op_data->op_data_version = *(__u64 *)data; - op_data->op_lease_handle = och->och_lease_handle; - op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS; - break; - - default: - LASSERT(!data); - break; - } - - rc = md_close(md_exp, op_data, och->och_mod, &req); - if (rc && rc != -EINTR) { - CERROR("%s: inode " DFID " mdc close failed: rc = %d\n", - md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc); - } - - if (op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP) && - !rc) { - struct mdt_body *body; - - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED)) - rc = -EBUSY; - } - - ll_finish_md_op_data(op_data); - -out: - md_clear_open_replay_data(md_exp, och); - och->och_fh.cookie = DEAD_HANDLE_MAGIC; - kfree(och); - - ptlrpc_req_finished(req); - return rc; -} - -int ll_md_real_close(struct inode 
*inode, fmode_t fmode) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct obd_client_handle **och_p; - struct obd_client_handle *och; - __u64 *och_usecount; - int rc = 0; - - if (fmode & FMODE_WRITE) { - och_p = &lli->lli_mds_write_och; - och_usecount = &lli->lli_open_fd_write_count; - } else if (fmode & FMODE_EXEC) { - och_p = &lli->lli_mds_exec_och; - och_usecount = &lli->lli_open_fd_exec_count; - } else { - LASSERT(fmode & FMODE_READ); - och_p = &lli->lli_mds_read_och; - och_usecount = &lli->lli_open_fd_read_count; - } - - mutex_lock(&lli->lli_och_mutex); - if (*och_usecount > 0) { - /* There are still users of this handle, so skip - * freeing it. - */ - mutex_unlock(&lli->lli_och_mutex); - return 0; - } - - och = *och_p; - *och_p = NULL; - mutex_unlock(&lli->lli_och_mutex); - - if (och) { - /* There might be a race and this handle may already - * be closed. - */ - rc = ll_close_inode_openhandle(inode, och, 0, NULL); - } - - return rc; -} - -static int ll_md_close(struct inode *inode, struct file *file) -{ - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - struct ll_inode_info *lli = ll_i2info(inode); - int lockmode; - __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK; - struct lustre_handle lockh; - union ldlm_policy_data policy = { - .l_inodebits = { MDS_INODELOCK_OPEN } - }; - int rc = 0; - - /* clear group lock, if present */ - if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) - ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid); - - if (fd->fd_lease_och) { - bool lease_broken; - - /* Usually the lease is not released when the - * application crashed, we need to release here. - */ - rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken); - CDEBUG(rc ? 
D_ERROR : D_INODE, - "Clean up lease " DFID " %d/%d\n", - PFID(&lli->lli_fid), rc, lease_broken); - - fd->fd_lease_och = NULL; - } - - if (fd->fd_och) { - rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL); - fd->fd_och = NULL; - goto out; - } - - /* Let's see if we have good enough OPEN lock on the file and if - * we can skip talking to MDS - */ - - mutex_lock(&lli->lli_och_mutex); - if (fd->fd_omode & FMODE_WRITE) { - lockmode = LCK_CW; - LASSERT(lli->lli_open_fd_write_count); - lli->lli_open_fd_write_count--; - } else if (fd->fd_omode & FMODE_EXEC) { - lockmode = LCK_PR; - LASSERT(lli->lli_open_fd_exec_count); - lli->lli_open_fd_exec_count--; - } else { - lockmode = LCK_CR; - LASSERT(lli->lli_open_fd_read_count); - lli->lli_open_fd_read_count--; - } - mutex_unlock(&lli->lli_och_mutex); - - if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode), - LDLM_IBITS, &policy, lockmode, &lockh)) - rc = ll_md_real_close(inode, fd->fd_omode); - -out: - LUSTRE_FPRIVATE(file) = NULL; - ll_file_data_put(fd); - - return rc; -} - -/* While this returns an error code, fput() the caller does not, so we need - * to make every effort to clean up all of our state here. Also, applications - * rarely check close errors and even if an error is returned they will not - * re-try the close call. - */ -int ll_file_release(struct inode *inode, struct file *file) -{ - struct ll_file_data *fd; - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ll_inode_info *lli = ll_i2info(inode); - int rc; - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p)\n", - PFID(ll_inode2fid(inode)), inode); - - if (!is_root_inode(inode)) - ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1); - fd = LUSTRE_FPRIVATE(file); - LASSERT(fd); - - /* The last ref on @file, maybe not be the owner pid of statahead, - * because parent and child process can share the same file handle. 
- */ - if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd) - ll_deauthorize_statahead(inode, fd); - - if (is_root_inode(inode)) { - LUSTRE_FPRIVATE(file) = NULL; - ll_file_data_put(fd); - return 0; - } - - if (!S_ISDIR(inode->i_mode)) { - if (lli->lli_clob) - lov_read_and_clear_async_rc(lli->lli_clob); - lli->lli_async_rc = 0; - } - - rc = ll_md_close(inode, file); - - if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val)) - libcfs_debug_dumplog(); - - return rc; -} - -static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize, - struct lookup_intent *itp) -{ - struct inode *inode = d_inode(de); - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct dentry *parent = de->d_parent; - const char *name = NULL; - struct md_op_data *op_data; - struct ptlrpc_request *req = NULL; - int len = 0, rc; - - LASSERT(parent); - LASSERT(itp->it_flags & MDS_OPEN_BY_FID); - - /* - * if server supports open-by-fid, or file name is invalid, don't pack - * name in open request - */ - if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) && - lu_name_is_valid_2(de->d_name.name, de->d_name.len)) { - name = de->d_name.name; - len = de->d_name.len; - } - - op_data = ll_prep_md_op_data(NULL, d_inode(parent), inode, name, len, - O_RDWR, LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - op_data->op_data = lmm; - op_data->op_data_size = lmmsize; - - rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req, - &ll_md_blocking_ast, 0); - ll_finish_md_op_data(op_data); - if (rc == -ESTALE) { - /* reason for keep own exit path - don`t flood log - * with messages with -ESTALE errors. - */ - if (!it_disposition(itp, DISP_OPEN_OPEN) || - it_open_error(DISP_OPEN_OPEN, itp)) - goto out; - ll_release_openhandle(inode, itp); - goto out; - } - - if (it_disposition(itp, DISP_LOOKUP_NEG)) { - rc = -ENOENT; - goto out; - } - - if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) { - rc = rc ? 
rc : it_open_error(DISP_OPEN_OPEN, itp); - CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc); - goto out; - } - - rc = ll_prep_inode(&inode, req, NULL, itp); - if (!rc && itp->it_lock_mode) - ll_set_lock_data(sbi->ll_md_exp, inode, itp, NULL); - -out: - ptlrpc_req_finished(req); - ll_intent_drop_lock(itp); - - /* - * We did open by fid, but by the time we got to the server, - * the object disappeared. If this is a create, we cannot really - * tell the userspace that the file it was trying to create - * does not exist. Instead let's return -ESTALE, and the VFS will - * retry the create with LOOKUP_REVAL that we are going to catch - * in ll_revalidate_dentry() and use lookup then. - */ - if (rc == -ENOENT && itp->it_op & IT_CREAT) - rc = -ESTALE; - - return rc; -} - -static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it, - struct obd_client_handle *och) -{ - struct mdt_body *body; - - body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY); - och->och_fh = body->mbo_handle; - och->och_fid = body->mbo_fid1; - och->och_lease_handle.cookie = it->it_lock_handle; - och->och_magic = OBD_CLIENT_HANDLE_MAGIC; - och->och_flags = it->it_flags; - - return md_set_open_replay_data(md_exp, och, it); -} - -static int ll_local_open(struct file *file, struct lookup_intent *it, - struct ll_file_data *fd, struct obd_client_handle *och) -{ - struct inode *inode = file_inode(file); - - LASSERT(!LUSTRE_FPRIVATE(file)); - - LASSERT(fd); - - if (och) { - int rc; - - rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och); - if (rc != 0) - return rc; - } - - LUSTRE_FPRIVATE(file) = fd; - ll_readahead_init(inode, &fd->fd_ras); - fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); - - /* ll_cl_context initialize */ - rwlock_init(&fd->fd_lock); - INIT_LIST_HEAD(&fd->fd_lccs); - - return 0; -} - -/* Open a file, and (for the very first open) create objects on the OSTs at - * this time. 
/*
 * Open a file, and (for the very first open) create objects on the OSTs at
 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until ll_lov_setstripe() ioctl is called.
 *
 * If we already have the stripe MD locally then we don't request it in
 * md_open(), by passing a lmm_size = 0.
 *
 * It is up to the application to ensure no other processes open this file
 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 * used. We might be able to avoid races of that sort by getting lli_open_sem
 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 *
 * Reuses an already-open MDS handle of the matching mode (read/write/exec)
 * when one exists, bumping its use count; otherwise performs an intent open
 * via ll_intent_file_open() and installs a fresh obd_client_handle. All
 * handle bookkeeping is serialized by lli->lli_och_mutex.
 */
int ll_file_open(struct inode *inode, struct file *file)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
					  .it_flags = file->f_flags };
	struct obd_client_handle **och_p = NULL;
	__u64 *och_usecount = NULL;
	struct ll_file_data *fd;
	int rc = 0;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p), flags %o\n",
	       PFID(ll_inode2fid(inode)), inode, file->f_flags);

	/* An intent may have been stashed here by the lookup path. */
	it = file->private_data; /* XXX: compat macro */
	file->private_data = NULL; /* prevent ll_local_open assertion */

	fd = ll_file_data_get();
	if (!fd) {
		rc = -ENOMEM;
		goto out_openerr;
	}

	fd->fd_file = file;
	if (S_ISDIR(inode->i_mode))
		ll_authorize_statahead(inode, fd);

	/* Root inode never talks to the MDS for open. */
	if (is_root_inode(inode)) {
		LUSTRE_FPRIVATE(file) = fd;
		return 0;
	}

	if (!it || !it->it_disposition) {
		/* Convert f_flags into access mode. We cannot use file->f_mode,
		 * because everything but O_ACCMODE mask was stripped from
		 * there
		 */
		if ((oit.it_flags + 1) & O_ACCMODE)
			oit.it_flags++;
		if (file->f_flags & O_TRUNC)
			oit.it_flags |= FMODE_WRITE;

		/* kernel only call f_op->open in dentry_open. filp_open calls
		 * dentry_open after call to open_namei that checks permissions.
		 * Only nfsd_open call dentry_open directly without checking
		 * permissions and because of that this code below is safe.
		 */
		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;

		/* We do not want O_EXCL here, presumably we opened the file
		 * already? XXX - NFS implications?
		 */
		oit.it_flags &= ~O_EXCL;

		/* bug20584, if "it_flags" contains O_CREAT, the file will be
		 * created if necessary, then "IT_CREAT" should be set to keep
		 * consistent with it
		 */
		if (oit.it_flags & O_CREAT)
			oit.it_op |= IT_CREAT;

		it = &oit;
	}

restart:
	/* Let's see if we have file open on MDS already. */
	if (it->it_flags & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (it->it_flags & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
	} else {
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;
	}

	mutex_lock(&lli->lli_och_mutex);
	if (*och_p) { /* Open handle is present */
		if (it_disposition(it, DISP_OPEN_OPEN)) {
			/* Well, there's extra open request that we do not need,
			 * let's close it somehow. This will decref request.
			 */
			rc = it_open_error(DISP_OPEN_OPEN, it);
			if (rc) {
				mutex_unlock(&lli->lli_och_mutex);
				goto out_openerr;
			}

			ll_release_openhandle(inode, it);
		}
		(*och_usecount)++;

		rc = ll_local_open(file, it, fd, NULL);
		if (rc) {
			(*och_usecount)--;
			mutex_unlock(&lli->lli_och_mutex);
			goto out_openerr;
		}
	} else {
		LASSERT(*och_usecount == 0);
		if (!it->it_disposition) {
			/* We cannot just request lock handle now, new ELC code
			 * means that one of other OPEN locks for this file
			 * could be cancelled, and since blocking ast handler
			 * would attempt to grab och_mutex as well, that would
			 * result in a deadlock
			 */
			mutex_unlock(&lli->lli_och_mutex);
			/*
			 * Normally called under two situations:
			 * 1. NFS export.
			 * 2. revalidate with IT_OPEN (revalidate doesn't
			 *    execute this intent any more).
			 *
			 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
			 *
			 * Always specify MDS_OPEN_BY_FID because we don't want
			 * to get file with different fid.
			 */
			it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
			rc = ll_intent_file_open(file->f_path.dentry,
						 NULL, 0, it);
			if (rc)
				goto out_openerr;

			/* Retry with the freshly executed intent. */
			goto restart;
		}
		*och_p = kzalloc(sizeof(struct obd_client_handle), GFP_NOFS);
		if (!*och_p) {
			rc = -ENOMEM;
			goto out_och_free;
		}

		(*och_usecount)++;

		/* md_intent_lock() didn't get a request ref if there was an
		 * open error, so don't do cleanup on the request here
		 * (bug 3430)
		 */
		/* XXX (green): Should not we bail out on any error here, not
		 * just open error?
		 */
		rc = it_open_error(DISP_OPEN_OPEN, it);
		if (rc)
			goto out_och_free;

		LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
			 "inode %p: disposition %x, status %d\n", inode,
			 it_disposition(it, ~0), it->it_status);

		rc = ll_local_open(file, it, fd, *och_p);
		if (rc)
			goto out_och_free;
	}
	mutex_unlock(&lli->lli_och_mutex);
	/* fd is now owned by the file; don't free it in the error path. */
	fd = NULL;

	/* Must do this outside lli_och_mutex lock to prevent deadlock where
	 * different kind of OPEN lock for this same inode gets cancelled
	 * by ldlm_cancel_lru
	 */
	if (!S_ISREG(inode->i_mode))
		goto out_och_free;

	cl_lov_delay_create_clear(&file->f_flags);
	goto out_och_free;

out_och_free:
	/* Shared success/error exit; rc selects which path runs. */
	if (rc) {
		if (och_p && *och_p) {
			kfree(*och_p);
			*och_p = NULL;
			(*och_usecount)--;
		}
		mutex_unlock(&lli->lli_och_mutex);

out_openerr:
		if (lli->lli_opendir_key == fd)
			ll_deauthorize_statahead(inode, fd);
		if (fd)
			ll_file_data_put(fd);
	} else {
		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
	}

	/* Drop the extra request reference taken for DISP_ENQ_OPEN_REF. */
	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
		ptlrpc_req_finished(it->it_request);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
	}

	return rc;
}
/**
 * Acquire a lease and open the file.
 *
 * @fmode must be exactly FMODE_READ or FMODE_WRITE. If @file is non-NULL,
 * this fd must be the sole opener of its mode so its MDS open handle can
 * be transferred to the lease (otherwise -EBUSY). Returns the new
 * obd_client_handle on success, or an ERR_PTR() on failure.
 */
static struct obd_client_handle *
ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
	      __u64 open_flags)
{
	struct lookup_intent it = { .it_op = IT_OPEN };
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	struct lustre_handle old_handle = { 0 };
	struct obd_client_handle *och = NULL;
	int rc;
	int rc2;

	if (fmode != FMODE_WRITE && fmode != FMODE_READ)
		return ERR_PTR(-EINVAL);

	if (file) {
		struct ll_inode_info *lli = ll_i2info(inode);
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
		struct obd_client_handle **och_p;
		__u64 *och_usecount;

		if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
			return ERR_PTR(-EPERM);

		/* Get the openhandle of the file */
		rc = -EBUSY;
		mutex_lock(&lli->lli_och_mutex);
		if (fd->fd_lease_och) {
			/* This fd already holds a lease. */
			mutex_unlock(&lli->lli_och_mutex);
			return ERR_PTR(rc);
		}

		if (!fd->fd_och) {
			if (file->f_mode & FMODE_WRITE) {
				LASSERT(lli->lli_mds_write_och);
				och_p = &lli->lli_mds_write_och;
				och_usecount = &lli->lli_open_fd_write_count;
			} else {
				LASSERT(lli->lli_mds_read_och);
				och_p = &lli->lli_mds_read_och;
				och_usecount = &lli->lli_open_fd_read_count;
			}
			/* Only steal the shared handle if we are its sole
			 * user; rc stays -EBUSY otherwise.
			 */
			if (*och_usecount == 1) {
				fd->fd_och = *och_p;
				*och_p = NULL;
				*och_usecount = 0;
				rc = 0;
			}
		}
		mutex_unlock(&lli->lli_och_mutex);
		if (rc < 0) /* more than 1 opener */
			return ERR_PTR(rc);

		LASSERT(fd->fd_och);
		old_handle = fd->fd_och->och_fh;
	}

	och = kzalloc(sizeof(*och), GFP_NOFS);
	if (!och)
		return ERR_PTR(-ENOMEM);

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data)) {
		rc = PTR_ERR(op_data);
		goto out;
	}

	/* To tell the MDT this openhandle is from the same owner */
	op_data->op_handle = old_handle;

	it.it_flags = fmode | open_flags;
	it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
	rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
			    &ll_md_blocking_lease_ast,
	/* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
	 * it can be cancelled which may mislead applications that the lease is
	 * broken;
	 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
	 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
	 * doesn't deal with openhandle, so normal openhandle will be leaked.
	 */
			    LDLM_FL_NO_LRU | LDLM_FL_EXCL);
	ll_finish_md_op_data(op_data);
	ptlrpc_req_finished(req);
	if (rc < 0)
		goto out_release_it;

	if (it_disposition(&it, DISP_LOOKUP_NEG)) {
		rc = -ENOENT;
		goto out_release_it;
	}

	rc = it_open_error(DISP_OPEN_OPEN, &it);
	if (rc)
		goto out_release_it;

	LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
	ll_och_fill(sbi->ll_md_exp, &it, och);

	if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */ {
		rc = -EOPNOTSUPP;
		goto out_close;
	}

	/* already got the lease; attach the lease lock to the inode */
	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
	if (it.it_lock_mode == 0 ||
	    it.it_lock_bits != MDS_INODELOCK_OPEN) {
		/* open lock must return for lease */
		CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
		       PFID(ll_inode2fid(inode)), it.it_lock_mode,
		       it.it_lock_bits);
		rc = -EPROTO;
		goto out_close;
	}

	ll_intent_release(&it);
	return och;

out_close:
	/* Cancel open lock */
	if (it.it_lock_mode != 0) {
		ldlm_lock_decref_and_cancel(&och->och_lease_handle,
					    it.it_lock_mode);
		it.it_lock_mode = 0;
		och->och_lease_handle.cookie = 0ULL;
	}
	rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
	if (rc2 < 0)
		CERROR("%s: error closing file " DFID ": %d\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&ll_i2info(inode)->lli_fid), rc2);
	och = NULL; /* och has been freed in ll_close_inode_openhandle() */
out_release_it:
	ll_intent_release(&it);
out:
	kfree(och);
	return ERR_PTR(rc);
}
- * - * \param[in] inode1 First inode to check - * \param[in] inode2 Second inode to check - * - * \retval 0 on success, layout swap can be performed between both inodes - * \retval negative error code if requirements are not met - */ -static int ll_check_swap_layouts_validity(struct inode *inode1, - struct inode *inode2) -{ - if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode)) - return -EINVAL; - - if (inode_permission(inode1, MAY_WRITE) || - inode_permission(inode2, MAY_WRITE)) - return -EPERM; - - if (inode1->i_sb != inode2->i_sb) - return -EXDEV; - - return 0; -} - -static int ll_swap_layouts_close(struct obd_client_handle *och, - struct inode *inode, struct inode *inode2) -{ - const struct lu_fid *fid1 = ll_inode2fid(inode); - const struct lu_fid *fid2; - int rc; - - CDEBUG(D_INODE, "%s: biased close of file " DFID "\n", - ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1)); - - rc = ll_check_swap_layouts_validity(inode, inode2); - if (rc < 0) - goto out_free_och; - - /* We now know that inode2 is a lustre inode */ - fid2 = ll_inode2fid(inode2); - - rc = lu_fid_cmp(fid1, fid2); - if (!rc) { - rc = -EINVAL; - goto out_free_och; - } - - /* - * Close the file and swap layouts between inode & inode2. - * NB: lease lock handle is released in mdc_close_layout_swap_pack() - * because we still need it to pack l_remote_handle to MDT. - */ - rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP, - inode2); - - och = NULL; /* freed in ll_close_inode_openhandle() */ - -out_free_och: - kfree(och); - return rc; -} - -/** - * Release lease and close the file. - * It will check if the lease has ever broken. 
- */ -static int ll_lease_close(struct obd_client_handle *och, struct inode *inode, - bool *lease_broken) -{ - struct ldlm_lock *lock; - bool cancelled = true; - - lock = ldlm_handle2lock(&och->och_lease_handle); - if (lock) { - lock_res_and_lock(lock); - cancelled = ldlm_is_cancel(lock); - unlock_res_and_lock(lock); - LDLM_LOCK_PUT(lock); - } - - CDEBUG(D_INODE, "lease for " DFID " broken? %d\n", - PFID(&ll_i2info(inode)->lli_fid), cancelled); - - if (!cancelled) - ldlm_cli_cancel(&och->och_lease_handle, 0); - if (lease_broken) - *lease_broken = cancelled; - - return ll_close_inode_openhandle(inode, och, 0, NULL); -} - -int ll_merge_attr(const struct lu_env *env, struct inode *inode) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct cl_object *obj = lli->lli_clob; - struct cl_attr *attr = vvp_env_thread_attr(env); - s64 atime; - s64 mtime; - s64 ctime; - int rc = 0; - - ll_inode_size_lock(inode); - - /* merge timestamps the most recently obtained from mds with - * timestamps obtained from osts - */ - LTIME_S(inode->i_atime) = lli->lli_atime; - LTIME_S(inode->i_mtime) = lli->lli_mtime; - LTIME_S(inode->i_ctime) = lli->lli_ctime; - - mtime = LTIME_S(inode->i_mtime); - atime = LTIME_S(inode->i_atime); - ctime = LTIME_S(inode->i_ctime); - - cl_object_attr_lock(obj); - rc = cl_object_attr_get(env, obj, attr); - cl_object_attr_unlock(obj); - - if (rc != 0) - goto out_size_unlock; - - if (atime < attr->cat_atime) - atime = attr->cat_atime; - - if (ctime < attr->cat_ctime) - ctime = attr->cat_ctime; - - if (mtime < attr->cat_mtime) - mtime = attr->cat_mtime; - - CDEBUG(D_VFSTRACE, DFID " updating i_size %llu\n", - PFID(&lli->lli_fid), attr->cat_size); - - i_size_write(inode, attr->cat_size); - - inode->i_blocks = attr->cat_blocks; - - LTIME_S(inode->i_mtime) = mtime; - LTIME_S(inode->i_atime) = atime; - LTIME_S(inode->i_ctime) = ctime; - -out_size_unlock: - ll_inode_size_unlock(inode); - - return rc; -} - -static bool file_is_noatime(const struct file *file) -{ 
- const struct vfsmount *mnt = file->f_path.mnt; - const struct inode *inode = file_inode(file); - - /* Adapted from file_accessed() and touch_atime().*/ - if (file->f_flags & O_NOATIME) - return true; - - if (inode->i_flags & S_NOATIME) - return true; - - if (IS_NOATIME(inode)) - return true; - - if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY)) - return true; - - if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) - return true; - - if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)) - return true; - - return false; -} - -static void ll_io_init(struct cl_io *io, const struct file *file, int write) -{ - struct inode *inode = file_inode(file); - - io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK; - if (write) { - io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND); - io->u.ci_wr.wr_sync = file->f_flags & O_SYNC || - file->f_flags & O_DIRECT || - IS_SYNC(inode); - } - io->ci_obj = ll_i2info(inode)->lli_clob; - io->ci_lockreq = CILR_MAYBE; - if (ll_file_nolock(file)) { - io->ci_lockreq = CILR_NEVER; - io->ci_no_srvlock = 1; - } else if (file->f_flags & O_APPEND) { - io->ci_lockreq = CILR_MANDATORY; - } - - io->ci_noatime = file_is_noatime(file); -} - -static ssize_t -ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, - struct file *file, enum cl_io_type iot, - loff_t *ppos, size_t count) -{ - struct ll_inode_info *lli = ll_i2info(file_inode(file)); - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - struct vvp_io *vio = vvp_env_io(env); - struct range_lock range; - struct cl_io *io; - ssize_t result = 0; - int rc = 0; - - CDEBUG(D_VFSTRACE, "file: %pD, type: %d ppos: %llu, count: %zu\n", - file, iot, *ppos, count); - -restart: - io = vvp_env_thread_io(env); - ll_io_init(io, file, iot == CIT_WRITE); - - if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) { - struct vvp_io *vio = vvp_env_io(env); - bool range_locked = false; - - if (file->f_flags & O_APPEND) - range_lock_init(&range, 0, LUSTRE_EOF); - else - 
range_lock_init(&range, *ppos, *ppos + count - 1); - - vio->vui_fd = LUSTRE_FPRIVATE(file); - vio->vui_iter = args->u.normal.via_iter; - vio->vui_iocb = args->u.normal.via_iocb; - /* - * Direct IO reads must also take range lock, - * or multiple reads will try to work on the same pages - * See LU-6227 for details. - */ - if (((iot == CIT_WRITE) || - (iot == CIT_READ && (file->f_flags & O_DIRECT))) && - !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { - CDEBUG(D_VFSTRACE, "Range lock [%llu, %llu]\n", - range.rl_node.in_extent.start, - range.rl_node.in_extent.end); - rc = range_lock(&lli->lli_write_tree, &range); - if (rc < 0) - goto out; - - range_locked = true; - } - ll_cl_add(file, env, io); - rc = cl_io_loop(env, io); - ll_cl_remove(file, env); - if (range_locked) { - CDEBUG(D_VFSTRACE, "Range unlock [%llu, %llu]\n", - range.rl_node.in_extent.start, - range.rl_node.in_extent.end); - range_unlock(&lli->lli_write_tree, &range); - } - } else { - /* cl_io_rw_init() handled IO */ - rc = io->ci_result; - } - - if (io->ci_nob > 0) { - result = io->ci_nob; - count -= io->ci_nob; - *ppos = io->u.ci_wr.wr.crw_pos; - - /* prepare IO restart */ - if (count > 0) - args->u.normal.via_iter = vio->vui_iter; - } -out: - cl_io_fini(env, io); - - if ((!rc || rc == -ENODATA) && count > 0 && io->ci_need_restart) { - CDEBUG(D_VFSTRACE, - "%s: restart %s from %lld, count:%zu, result: %zd\n", - file_dentry(file)->d_name.name, - iot == CIT_READ ? 
"read" : "write", - *ppos, count, result); - goto restart; - } - - if (iot == CIT_READ) { - if (result >= 0) - ll_stats_ops_tally(ll_i2sbi(file_inode(file)), - LPROC_LL_READ_BYTES, result); - } else if (iot == CIT_WRITE) { - if (result >= 0) { - ll_stats_ops_tally(ll_i2sbi(file_inode(file)), - LPROC_LL_WRITE_BYTES, result); - fd->fd_write_failed = false; - } else if (!result && !rc) { - rc = io->ci_result; - if (rc < 0) - fd->fd_write_failed = true; - else - fd->fd_write_failed = false; - } else if (rc != -ERESTARTSYS) { - fd->fd_write_failed = true; - } - } - CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result); - - return result > 0 ? result : rc; -} - -static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to) -{ - struct lu_env *env; - struct vvp_io_args *args; - ssize_t result; - u16 refcheck; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - args = ll_env_args(env); - args->u.normal.via_iter = to; - args->u.normal.via_iocb = iocb; - - result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ, - &iocb->ki_pos, iov_iter_count(to)); - cl_env_put(env, &refcheck); - return result; -} - -/* - * Write to a file (through the page cache). 
- */ -static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct lu_env *env; - struct vvp_io_args *args; - ssize_t result; - u16 refcheck; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - args = ll_env_args(env); - args->u.normal.via_iter = from; - args->u.normal.via_iocb = iocb; - - result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE, - &iocb->ki_pos, iov_iter_count(from)); - cl_env_put(env, &refcheck); - return result; -} - -int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry, - __u64 flags, struct lov_user_md *lum, - int lum_size) -{ - struct lookup_intent oit = { - .it_op = IT_OPEN, - .it_flags = flags | MDS_OPEN_BY_FID, - }; - int rc = 0; - - ll_inode_size_lock(inode); - rc = ll_intent_file_open(dentry, lum, lum_size, &oit); - if (rc < 0) - goto out_unlock; - - ll_release_openhandle(inode, &oit); - -out_unlock: - ll_inode_size_unlock(inode); - ll_intent_release(&oit); - return rc; -} - -int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, - struct lov_mds_md **lmmp, int *lmm_size, - struct ptlrpc_request **request) -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct mdt_body *body; - struct lov_mds_md *lmm = NULL; - struct ptlrpc_request *req = NULL; - struct md_op_data *op_data; - int rc, lmmsize; - - rc = ll_get_default_mdsize(sbi, &lmmsize); - if (rc) - return rc; - - op_data = ll_prep_md_op_data(NULL, inode, NULL, filename, - strlen(filename), lmmsize, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA; - rc = md_getattr_name(sbi->ll_md_exp, op_data, &req); - ll_finish_md_op_data(op_data); - if (rc < 0) { - CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n", - filename, rc); - goto out; - } - - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - - lmmsize = body->mbo_eadatasize; - - if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) || - 
lmmsize == 0) { - rc = -ENODATA; - goto out; - } - - lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize); - - if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) && - (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) { - rc = -EPROTO; - goto out; - } - - /* - * This is coming from the MDS, so is probably in - * little endian. We convert it to host endian before - * passing it to userspace. - */ - if (cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) { - int stripe_count; - - stripe_count = le16_to_cpu(lmm->lmm_stripe_count); - if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED) - stripe_count = 0; - - /* if function called for directory - we should - * avoid swab not existent lsm objects - */ - if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) { - lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm); - if (S_ISREG(body->mbo_mode)) - lustre_swab_lov_user_md_objects( - ((struct lov_user_md_v1 *)lmm)->lmm_objects, - stripe_count); - } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) { - lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm); - if (S_ISREG(body->mbo_mode)) - lustre_swab_lov_user_md_objects( - ((struct lov_user_md_v3 *)lmm)->lmm_objects, - stripe_count); - } - } - -out: - *lmmp = lmm; - *lmm_size = lmmsize; - *request = req; - return rc; -} - -static int ll_lov_setea(struct inode *inode, struct file *file, - unsigned long arg) -{ - __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE; - struct lov_user_md *lump; - int lum_size = sizeof(struct lov_user_md) + - sizeof(struct lov_user_ost_data); - int rc; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - lump = kzalloc(lum_size, GFP_NOFS); - if (!lump) - return -ENOMEM; - - if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) { - kvfree(lump); - return -EFAULT; - } - - rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lump, - lum_size); - cl_lov_delay_create_clear(&file->f_flags); - - kvfree(lump); - return rc; -} - -static int ll_file_getstripe(struct inode 
*inode, - struct lov_user_md __user *lum) -{ - struct lu_env *env; - u16 refcheck; - int rc; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum); - cl_env_put(env, &refcheck); - return rc; -} - -static int ll_lov_setstripe(struct inode *inode, struct file *file, - unsigned long arg) -{ - struct lov_user_md __user *lum = (struct lov_user_md __user *)arg; - struct lov_user_md *klum; - int lum_size, rc; - __u64 flags = FMODE_WRITE; - - rc = ll_copy_user_md(lum, &klum); - if (rc < 0) - return rc; - - lum_size = rc; - rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, klum, - lum_size); - cl_lov_delay_create_clear(&file->f_flags); - if (rc == 0) { - __u32 gen; - - put_user(0, &lum->lmm_stripe_count); - - ll_layout_refresh(inode, &gen); - rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg); - } - - kfree(klum); - return rc; -} - -static int -ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - struct ll_grouplock grouplock; - int rc; - - if (arg == 0) { - CWARN("group id for group lock must not be 0\n"); - return -EINVAL; - } - - if (ll_file_nolock(file)) - return -EOPNOTSUPP; - - spin_lock(&lli->lli_lock); - if (fd->fd_flags & LL_FILE_GROUP_LOCKED) { - CWARN("group lock already existed with gid %lu\n", - fd->fd_grouplock.lg_gid); - spin_unlock(&lli->lli_lock); - return -EINVAL; - } - LASSERT(!fd->fd_grouplock.lg_lock); - spin_unlock(&lli->lli_lock); - - rc = cl_get_grouplock(ll_i2info(inode)->lli_clob, - arg, (file->f_flags & O_NONBLOCK), &grouplock); - if (rc) - return rc; - - spin_lock(&lli->lli_lock); - if (fd->fd_flags & LL_FILE_GROUP_LOCKED) { - spin_unlock(&lli->lli_lock); - CERROR("another thread just won the race\n"); - cl_put_grouplock(&grouplock); - return -EINVAL; - } - - fd->fd_flags |= LL_FILE_GROUP_LOCKED; - 
fd->fd_grouplock = grouplock; - spin_unlock(&lli->lli_lock); - - CDEBUG(D_INFO, "group lock %lu obtained\n", arg); - return 0; -} - -static int ll_put_grouplock(struct inode *inode, struct file *file, - unsigned long arg) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - struct ll_grouplock grouplock; - - spin_lock(&lli->lli_lock); - if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) { - spin_unlock(&lli->lli_lock); - CWARN("no group lock held\n"); - return -EINVAL; - } - LASSERT(fd->fd_grouplock.lg_lock); - - if (fd->fd_grouplock.lg_gid != arg) { - CWARN("group lock %lu doesn't match current id %lu\n", - arg, fd->fd_grouplock.lg_gid); - spin_unlock(&lli->lli_lock); - return -EINVAL; - } - - grouplock = fd->fd_grouplock; - memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock)); - fd->fd_flags &= ~LL_FILE_GROUP_LOCKED; - spin_unlock(&lli->lli_lock); - - cl_put_grouplock(&grouplock); - CDEBUG(D_INFO, "group lock %lu released\n", arg); - return 0; -} - -/** - * Close inode open handle - * - * \param inode [in] inode in question - * \param it [in,out] intent which contains open info and result - * - * \retval 0 success - * \retval <0 failure - */ -int ll_release_openhandle(struct inode *inode, struct lookup_intent *it) -{ - struct obd_client_handle *och; - int rc; - - LASSERT(inode); - - /* Root ? Do nothing. */ - if (is_root_inode(inode)) - return 0; - - /* No open handle to close? 
Move away */ - if (!it_disposition(it, DISP_OPEN_OPEN)) - return 0; - - LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0); - - och = kzalloc(sizeof(*och), GFP_NOFS); - if (!och) { - rc = -ENOMEM; - goto out; - } - - ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och); - - rc = ll_close_inode_openhandle(inode, och, 0, NULL); -out: - /* this one is in place of ll_file_open */ - if (it_disposition(it, DISP_ENQ_OPEN_REF)) { - ptlrpc_req_finished(it->it_request); - it_clear_disposition(it, DISP_ENQ_OPEN_REF); - } - return rc; -} - -/** - * Get size for inode for which FIEMAP mapping is requested. - * Make the FIEMAP get_info call and returns the result. - * - * \param fiemap kernel buffer to hold extens - * \param num_bytes kernel buffer size - */ -static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap, - size_t num_bytes) -{ - struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, }; - struct lu_env *env; - u16 refcheck; - int rc = 0; - - /* Checks for fiemap flags */ - if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) { - fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT; - return -EBADR; - } - - /* Check for FIEMAP_FLAG_SYNC */ - if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) { - rc = filemap_fdatawrite(inode->i_mapping); - if (rc) - return rc; - } - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - if (i_size_read(inode) == 0) { - rc = ll_glimpse_size(inode); - if (rc) - goto out; - } - - fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; - obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE); - obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid); - - /* If filesize is 0, then there would be no objects for mapping */ - if (fmkey.lfik_oa.o_size == 0) { - fiemap->fm_mapped_extents = 0; - rc = 0; - goto out; - } - - memcpy(&fmkey.lfik_fiemap, fiemap, sizeof(*fiemap)); - - rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob, - &fmkey, fiemap, &num_bytes); -out: - cl_env_put(env, &refcheck); - return rc; -} - -int 
ll_fid2path(struct inode *inode, void __user *arg) -{ - struct obd_export *exp = ll_i2mdexp(inode); - const struct getinfo_fid2path __user *gfin = arg; - struct getinfo_fid2path *gfout; - u32 pathlen; - size_t outsize; - int rc; - - if (!capable(CAP_DAC_READ_SEARCH) && - !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH)) - return -EPERM; - - /* Only need to get the buflen */ - if (get_user(pathlen, &gfin->gf_pathlen)) - return -EFAULT; - - if (pathlen > PATH_MAX) - return -EINVAL; - - outsize = sizeof(*gfout) + pathlen; - - gfout = kzalloc(outsize, GFP_NOFS); - if (!gfout) - return -ENOMEM; - - if (copy_from_user(gfout, arg, sizeof(*gfout))) { - rc = -EFAULT; - goto gf_free; - } - - /* Call mdc_iocontrol */ - rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL); - if (rc != 0) - goto gf_free; - - if (copy_to_user(arg, gfout, outsize)) - rc = -EFAULT; - -gf_free: - kfree(gfout); - return rc; -} - -/* - * Read the data_version for inode. - * - * This value is computed using stripe object version on OST. - * Version is computed using server side locking. - * - * @param flags if do sync on the OST side; - * 0: no sync - * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs - * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs - */ -int ll_data_version(struct inode *inode, __u64 *data_version, int flags) -{ - struct cl_object *obj = ll_i2info(inode)->lli_clob; - struct lu_env *env; - struct cl_io *io; - u16 refcheck; - int result; - - /* If no file object initialized, we consider its version is 0. 
/*
 * Trigger a HSM release request for the provided inode.
 *
 * Takes a write lease so no other client can be using the file, captures
 * the current data_version (flushing dirty data) and merged attributes,
 * then closes the lease handle with MDS_HSM_RELEASE so the MDT can punch
 * the file's objects if the version still matches.
 */
int ll_hsm_release(struct inode *inode)
{
	struct lu_env *env;
	struct obd_client_handle *och = NULL;
	__u64 data_version = 0;
	int rc;
	u16 refcheck;

	CDEBUG(D_INODE, "%s: Releasing file " DFID ".\n",
	       ll_get_fsname(inode->i_sb, NULL, 0),
	       PFID(&ll_i2info(inode)->lli_fid));

	och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
	if (IS_ERR(och)) {
		rc = PTR_ERR(och);
		goto out;
	}

	/* Grab latest data_version and [am]time values */
	rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
	if (rc != 0)
		goto out;

	env = cl_env_get(&refcheck);
	if (IS_ERR(env)) {
		rc = PTR_ERR(env);
		goto out;
	}

	ll_merge_attr(env, inode);
	cl_env_put(env, &refcheck);

	/* Release the file.
	 * NB: lease lock handle is released in mdc_hsm_release_pack() because
	 * we still need it to pack l_remote_handle to MDT.
	 */
	rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
				       &data_version);
	/* och is consumed by ll_close_inode_openhandle(). */
	och = NULL;

out:
	/* och may be ERR_PTR() (open failed) or NULL (already closed). */
	if (och && !IS_ERR(och)) /* close the file */
		ll_lease_close(och, inode, NULL);

	return rc;
}

/* Scratch state for ll_swap_layouts(): the two inodes being swapped plus
 * their expected data versions (dv1/dv2) and whether each must be checked.
 */
struct ll_swap_stack {
	u64 dv1;
	u64 dv2;
	struct inode *inode1;
	struct inode *inode2;
	bool check_dv1;
	bool check_dv2;
};
*/ - goto free; - - if (rc < 0) { /* sequentialize it */ - swap(llss->inode1, llss->inode2); - swap(file1, file2); - swap(llss->dv1, llss->dv2); - swap(llss->check_dv1, llss->check_dv2); - } - - gid = lsl->sl_gid; - if (gid != 0) { /* application asks to flush dirty cache */ - rc = ll_get_grouplock(llss->inode1, file1, gid); - if (rc < 0) - goto free; - - rc = ll_get_grouplock(llss->inode2, file2, gid); - if (rc < 0) { - ll_put_grouplock(llss->inode1, file1, gid); - goto free; - } - } - - /* ultimate check, before swapping the layouts we check if - * dataversion has changed (if requested) - */ - if (llss->check_dv1) { - rc = ll_data_version(llss->inode1, &dv, 0); - if (rc) - goto putgl; - if (dv != llss->dv1) { - rc = -EAGAIN; - goto putgl; - } - } - - if (llss->check_dv2) { - rc = ll_data_version(llss->inode2, &dv, 0); - if (rc) - goto putgl; - if (dv != llss->dv2) { - rc = -EAGAIN; - goto putgl; - } - } - - /* struct md_op_data is used to send the swap args to the mdt - * only flags is missing, so we use struct mdc_swap_layouts - * through the md_op_data->op_data - */ - /* flags from user space have to be converted before they are send to - * server, no flag is sent today, they are only used on the client - */ - msl.msl_flags = 0; - rc = -ENOMEM; - op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0, - 0, LUSTRE_OPC_ANY, &msl); - if (IS_ERR(op_data)) { - rc = PTR_ERR(op_data); - goto free; - } - - rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1), - sizeof(*op_data), op_data, NULL); - ll_finish_md_op_data(op_data); - -putgl: - if (gid != 0) { - ll_put_grouplock(llss->inode2, file2, gid); - ll_put_grouplock(llss->inode1, file1, gid); - } - -free: - kfree(llss); - - return rc; -} - -int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss) -{ - struct md_op_data *op_data; - int rc; - - /* Detect out-of range masks */ - if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK) - return -EINVAL; - - /* Non-root 
users are forbidden to set or clear flags which are - * NOT defined in HSM_USER_MASK. - */ - if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - - /* Detect out-of range archive id */ - if ((hss->hss_valid & HSS_ARCHIVE_ID) && - (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE)) - return -EINVAL; - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, hss); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode), - sizeof(*op_data), op_data, NULL); - - ll_finish_md_op_data(op_data); - - return rc; -} - -static int ll_hsm_import(struct inode *inode, struct file *file, - struct hsm_user_import *hui) -{ - struct hsm_state_set *hss = NULL; - struct iattr *attr = NULL; - int rc; - - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - - /* set HSM flags */ - hss = kzalloc(sizeof(*hss), GFP_NOFS); - if (!hss) - return -ENOMEM; - - hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID; - hss->hss_archive_id = hui->hui_archive_id; - hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED; - rc = ll_hsm_state_set(inode, hss); - if (rc != 0) - goto free_hss; - - attr = kzalloc(sizeof(*attr), GFP_NOFS); - if (!attr) { - rc = -ENOMEM; - goto free_hss; - } - - attr->ia_mode = hui->hui_mode & 0777; - attr->ia_mode |= S_IFREG; - attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid); - attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid); - attr->ia_size = hui->hui_size; - attr->ia_mtime.tv_sec = hui->hui_mtime; - attr->ia_mtime.tv_nsec = hui->hui_mtime_ns; - attr->ia_atime.tv_sec = hui->hui_atime; - attr->ia_atime.tv_nsec = hui->hui_atime_ns; - - attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE | - ATTR_UID | ATTR_GID | - ATTR_MTIME | ATTR_MTIME_SET | - ATTR_ATIME | ATTR_ATIME_SET; - - inode_lock(inode); - - rc = ll_setattr_raw(file->f_path.dentry, attr, true); - if (rc == -ENODATA) - rc = 0; - - inode_unlock(inode); - - kfree(attr); -free_hss: - 
kfree(hss); - return rc; -} - -static inline long ll_lease_type_from_fmode(fmode_t fmode) -{ - return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) | - ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0); -} - -static long -ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - struct inode *inode = file_inode(file); - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - int flags, rc; - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p),cmd=%x\n", - PFID(ll_inode2fid(inode)), inode, cmd); - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); - - /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ - if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ - return -ENOTTY; - - switch (cmd) { - case LL_IOC_GETFLAGS: - /* Get the current value of the file flags */ - return put_user(fd->fd_flags, (int __user *)arg); - case LL_IOC_SETFLAGS: - case LL_IOC_CLRFLAGS: - /* Set or clear specific file flags */ - /* XXX This probably needs checks to ensure the flags are - * not abused, and to handle any flag side effects. 
- */ - if (get_user(flags, (int __user *)arg)) - return -EFAULT; - - if (cmd == LL_IOC_SETFLAGS) { - if ((flags & LL_FILE_IGNORE_LOCK) && - !(file->f_flags & O_DIRECT)) { - CERROR("%s: unable to disable locking on non-O_DIRECT file\n", - current->comm); - return -EINVAL; - } - - fd->fd_flags |= flags; - } else { - fd->fd_flags &= ~flags; - } - return 0; - case LL_IOC_LOV_SETSTRIPE: - return ll_lov_setstripe(inode, file, arg); - case LL_IOC_LOV_SETEA: - return ll_lov_setea(inode, file, arg); - case LL_IOC_LOV_SWAP_LAYOUTS: { - struct file *file2; - struct lustre_swap_layouts lsl; - - if (copy_from_user(&lsl, (char __user *)arg, - sizeof(struct lustre_swap_layouts))) - return -EFAULT; - - if ((file->f_flags & O_ACCMODE) == O_RDONLY) - return -EPERM; - - file2 = fget(lsl.sl_fd); - if (!file2) - return -EBADF; - - /* O_WRONLY or O_RDWR */ - if ((file2->f_flags & O_ACCMODE) == O_RDONLY) { - rc = -EPERM; - goto out; - } - - if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) { - struct obd_client_handle *och = NULL; - struct ll_inode_info *lli; - struct inode *inode2; - - if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE) { - rc = -EINVAL; - goto out; - } - - lli = ll_i2info(inode); - mutex_lock(&lli->lli_och_mutex); - if (fd->fd_lease_och) { - och = fd->fd_lease_och; - fd->fd_lease_och = NULL; - } - mutex_unlock(&lli->lli_och_mutex); - if (!och) { - rc = -ENOLCK; - goto out; - } - inode2 = file_inode(file2); - rc = ll_swap_layouts_close(och, inode, inode2); - } else { - rc = ll_swap_layouts(file, file2, &lsl); - } -out: - fput(file2); - return rc; - } - case LL_IOC_LOV_GETSTRIPE: - return ll_file_getstripe(inode, - (struct lov_user_md __user *)arg); - case FSFILT_IOC_GETFLAGS: - case FSFILT_IOC_SETFLAGS: - return ll_iocontrol(inode, file, cmd, arg); - case FSFILT_IOC_GETVERSION_OLD: - case FSFILT_IOC_GETVERSION: - return put_user(inode->i_generation, (int __user *)arg); - case LL_IOC_GROUP_LOCK: - return ll_get_grouplock(inode, file, arg); - case LL_IOC_GROUP_UNLOCK: - return 
ll_put_grouplock(inode, file, arg); - case IOC_OBD_STATFS: - return ll_obd_statfs(inode, (void __user *)arg); - - /* We need to special case any other ioctls we want to handle, - * to send them to the MDS/OST as appropriate and to properly - * network encode the arg field. - case FSFILT_IOC_SETVERSION_OLD: - case FSFILT_IOC_SETVERSION: - */ - case LL_IOC_FLUSHCTX: - return ll_flush_ctx(inode); - case LL_IOC_PATH2FID: { - if (copy_to_user((void __user *)arg, ll_inode2fid(inode), - sizeof(struct lu_fid))) - return -EFAULT; - - return 0; - } - case LL_IOC_GETPARENT: - return ll_getparent(file, (struct getparent __user *)arg); - case OBD_IOC_FID2PATH: - return ll_fid2path(inode, (void __user *)arg); - case LL_IOC_DATA_VERSION: { - struct ioc_data_version idv; - int rc; - - if (copy_from_user(&idv, (char __user *)arg, sizeof(idv))) - return -EFAULT; - - idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH; - rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags); - if (rc == 0 && copy_to_user((char __user *)arg, &idv, - sizeof(idv))) - return -EFAULT; - - return rc; - } - - case LL_IOC_GET_MDTIDX: { - int mdtidx; - - mdtidx = ll_get_mdt_idx(inode); - if (mdtidx < 0) - return mdtidx; - - if (put_user(mdtidx, (int __user *)arg)) - return -EFAULT; - - return 0; - } - case OBD_IOC_GETDTNAME: - case OBD_IOC_GETMDNAME: - return ll_get_obd_name(inode, cmd, arg); - case LL_IOC_HSM_STATE_GET: { - struct md_op_data *op_data; - struct hsm_user_state *hus; - int rc; - - hus = kzalloc(sizeof(*hus), GFP_NOFS); - if (!hus) - return -ENOMEM; - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, hus); - if (IS_ERR(op_data)) { - kfree(hus); - return PTR_ERR(op_data); - } - - rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data), - op_data, NULL); - - if (copy_to_user((void __user *)arg, hus, sizeof(*hus))) - rc = -EFAULT; - - ll_finish_md_op_data(op_data); - kfree(hus); - return rc; - } - case LL_IOC_HSM_STATE_SET: { - struct hsm_state_set *hss; - int 
rc; - - hss = memdup_user((char __user *)arg, sizeof(*hss)); - if (IS_ERR(hss)) - return PTR_ERR(hss); - - rc = ll_hsm_state_set(inode, hss); - - kfree(hss); - return rc; - } - case LL_IOC_HSM_ACTION: { - struct md_op_data *op_data; - struct hsm_current_action *hca; - int rc; - - hca = kzalloc(sizeof(*hca), GFP_NOFS); - if (!hca) - return -ENOMEM; - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, hca); - if (IS_ERR(op_data)) { - kfree(hca); - return PTR_ERR(op_data); - } - - rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data), - op_data, NULL); - - if (copy_to_user((char __user *)arg, hca, sizeof(*hca))) - rc = -EFAULT; - - ll_finish_md_op_data(op_data); - kfree(hca); - return rc; - } - case LL_IOC_SET_LEASE: { - struct ll_inode_info *lli = ll_i2info(inode); - struct obd_client_handle *och = NULL; - bool lease_broken; - fmode_t fmode; - - switch (arg) { - case LL_LEASE_WRLCK: - if (!(file->f_mode & FMODE_WRITE)) - return -EPERM; - fmode = FMODE_WRITE; - break; - case LL_LEASE_RDLCK: - if (!(file->f_mode & FMODE_READ)) - return -EPERM; - fmode = FMODE_READ; - break; - case LL_LEASE_UNLCK: - mutex_lock(&lli->lli_och_mutex); - if (fd->fd_lease_och) { - och = fd->fd_lease_och; - fd->fd_lease_och = NULL; - } - mutex_unlock(&lli->lli_och_mutex); - - if (!och) - return -ENOLCK; - - fmode = och->och_flags; - rc = ll_lease_close(och, inode, &lease_broken); - if (rc < 0) - return rc; - - if (lease_broken) - fmode = 0; - - return ll_lease_type_from_fmode(fmode); - default: - return -EINVAL; - } - - CDEBUG(D_INODE, "Set lease with mode %u\n", fmode); - - /* apply for lease */ - och = ll_lease_open(inode, file, fmode, 0); - if (IS_ERR(och)) - return PTR_ERR(och); - - rc = 0; - mutex_lock(&lli->lli_och_mutex); - if (!fd->fd_lease_och) { - fd->fd_lease_och = och; - och = NULL; - } - mutex_unlock(&lli->lli_och_mutex); - if (och) { - /* impossible now that only excl is supported for now */ - ll_lease_close(och, inode, &lease_broken); - rc = 
-EBUSY; - } - return rc; - } - case LL_IOC_GET_LEASE: { - struct ll_inode_info *lli = ll_i2info(inode); - struct ldlm_lock *lock = NULL; - fmode_t fmode = 0; - - mutex_lock(&lli->lli_och_mutex); - if (fd->fd_lease_och) { - struct obd_client_handle *och = fd->fd_lease_och; - - lock = ldlm_handle2lock(&och->och_lease_handle); - if (lock) { - lock_res_and_lock(lock); - if (!ldlm_is_cancel(lock)) - fmode = och->och_flags; - unlock_res_and_lock(lock); - LDLM_LOCK_PUT(lock); - } - } - mutex_unlock(&lli->lli_och_mutex); - return ll_lease_type_from_fmode(fmode); - } - case LL_IOC_HSM_IMPORT: { - struct hsm_user_import *hui; - - hui = memdup_user((void __user *)arg, sizeof(*hui)); - if (IS_ERR(hui)) - return PTR_ERR(hui); - - rc = ll_hsm_import(inode, file, hui); - - kfree(hui); - return rc; - } - default: { - int err; - - if (ll_iocontrol_call(inode, file, cmd, arg, &err) == - LLIOC_STOP) - return err; - - return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL, - (void __user *)arg); - } - } -} - -static loff_t ll_file_seek(struct file *file, loff_t offset, int origin) -{ - struct inode *inode = file_inode(file); - loff_t retval, eof = 0; - - retval = offset + ((origin == SEEK_END) ? i_size_read(inode) : - (origin == SEEK_CUR) ? 
file->f_pos : 0); - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p), to=%llu=%#llx(%d)\n", - PFID(ll_inode2fid(inode)), inode, retval, retval, origin); - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1); - - if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) { - retval = ll_glimpse_size(inode); - if (retval != 0) - return retval; - eof = i_size_read(inode); - } - - return generic_file_llseek_size(file, offset, origin, - ll_file_maxbytes(inode), eof); -} - -static int ll_flush(struct file *file, fl_owner_t id) -{ - struct inode *inode = file_inode(file); - struct ll_inode_info *lli = ll_i2info(inode); - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - int rc, err; - - LASSERT(!S_ISDIR(inode->i_mode)); - - /* catch async errors that were recorded back when async writeback - * failed for pages in this mapping. - */ - rc = lli->lli_async_rc; - lli->lli_async_rc = 0; - if (lli->lli_clob) { - err = lov_read_and_clear_async_rc(lli->lli_clob); - if (!rc) - rc = err; - } - - /* The application has been told about write failure already. - * Do not report failure again. - */ - if (fd->fd_write_failed) - return 0; - return rc ? -EIO : 0; -} - -/** - * Called to make sure a portion of file has been written out. - * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST. - * - * Return how many pages have been written. 
- */ -int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end, - enum cl_fsync_mode mode, int ignore_layout) -{ - struct lu_env *env; - struct cl_io *io; - struct cl_fsync_io *fio; - int result; - u16 refcheck; - - if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL && - mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL) - return -EINVAL; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - io = vvp_env_thread_io(env); - io->ci_obj = ll_i2info(inode)->lli_clob; - io->ci_ignore_layout = ignore_layout; - - /* initialize parameters for sync */ - fio = &io->u.ci_fsync; - fio->fi_start = start; - fio->fi_end = end; - fio->fi_fid = ll_inode2fid(inode); - fio->fi_mode = mode; - fio->fi_nr_written = 0; - - if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0) - result = cl_io_loop(env, io); - else - result = io->ci_result; - if (result == 0) - result = fio->fi_nr_written; - cl_io_fini(env, io); - cl_env_put(env, &refcheck); - - return result; -} - -int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync) -{ - struct inode *inode = file_inode(file); - struct ll_inode_info *lli = ll_i2info(inode); - struct ptlrpc_request *req; - int rc, err; - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p)\n", - PFID(ll_inode2fid(inode)), inode); - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1); - - rc = file_write_and_wait_range(file, start, end); - inode_lock(inode); - - /* catch async errors that were recorded back when async writeback - * failed for pages in this mapping. 
- */ - if (!S_ISDIR(inode->i_mode)) { - err = lli->lli_async_rc; - lli->lli_async_rc = 0; - if (rc == 0) - rc = err; - if (lli->lli_clob) { - err = lov_read_and_clear_async_rc(lli->lli_clob); - if (rc == 0) - rc = err; - } - } - - err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req); - if (!rc) - rc = err; - if (!err) - ptlrpc_req_finished(req); - - if (S_ISREG(inode->i_mode)) { - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - - err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0); - if (rc == 0 && err < 0) - rc = err; - if (rc < 0) - fd->fd_write_failed = true; - else - fd->fd_write_failed = false; - } - - inode_unlock(inode); - return rc; -} - -static int -ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock) -{ - struct inode *inode = file_inode(file); - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ldlm_enqueue_info einfo = { - .ei_type = LDLM_FLOCK, - .ei_cb_cp = ldlm_flock_completion_ast, - .ei_cbdata = file_lock, - }; - struct md_op_data *op_data; - struct lustre_handle lockh = {0}; - union ldlm_policy_data flock = { { 0 } }; - int fl_type = file_lock->fl_type; - __u64 flags = 0; - int rc; - int rc2 = 0; - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID " file_lock=%p\n", - PFID(ll_inode2fid(inode)), file_lock); - - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1); - - if (file_lock->fl_flags & FL_FLOCK) - LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK)); - else if (!(file_lock->fl_flags & FL_POSIX)) - return -EINVAL; - - flock.l_flock.owner = (unsigned long)file_lock->fl_owner; - flock.l_flock.pid = file_lock->fl_pid; - flock.l_flock.start = file_lock->fl_start; - flock.l_flock.end = file_lock->fl_end; - - /* Somewhat ugly workaround for svc lockd. - * lockd installs custom fl_lmops->lm_compare_owner that checks - * for the fl_owner to be the same (which it always is on local node - * I guess between lockd processes) and then compares pid. 
- * As such we assign pid to the owner field to make it all work, - * conflict with normal locks is unlikely since pid space and - * pointer space for current->files are not intersecting - */ - if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner) - flock.l_flock.owner = (unsigned long)file_lock->fl_pid; - - switch (fl_type) { - case F_RDLCK: - einfo.ei_mode = LCK_PR; - break; - case F_UNLCK: - /* An unlock request may or may not have any relation to - * existing locks so we may not be able to pass a lock handle - * via a normal ldlm_lock_cancel() request. The request may even - * unlock a byte range in the middle of an existing lock. In - * order to process an unlock request we need all of the same - * information that is given with a normal read or write record - * lock request. To avoid creating another ldlm unlock (cancel) - * message we'll treat a LCK_NL flock request as an unlock. - */ - einfo.ei_mode = LCK_NL; - break; - case F_WRLCK: - einfo.ei_mode = LCK_PW; - break; - default: - CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type); - return -ENOTSUPP; - } - - switch (cmd) { - case F_SETLKW: -#ifdef F_SETLKW64 - case F_SETLKW64: -#endif - flags = 0; - break; - case F_SETLK: -#ifdef F_SETLK64 - case F_SETLK64: -#endif - flags = LDLM_FL_BLOCK_NOWAIT; - break; - case F_GETLK: -#ifdef F_GETLK64 - case F_GETLK64: -#endif - flags = LDLM_FL_TEST_LOCK; - break; - default: - CERROR("unknown fcntl lock command: %d\n", cmd); - return -EINVAL; - } - - /* - * Save the old mode so that if the mode in the lock changes we - * can decrement the appropriate reader or writer refcount. 
- */ - file_lock->fl_type = einfo.ei_mode; - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - CDEBUG(D_DLMTRACE, "inode=" DFID ", pid=%u, flags=%#llx, mode=%u, start=%llu, end=%llu\n", - PFID(ll_inode2fid(inode)), flock.l_flock.pid, flags, - einfo.ei_mode, flock.l_flock.start, flock.l_flock.end); - - rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh, - flags); - - /* Restore the file lock type if not TEST lock. */ - if (!(flags & LDLM_FL_TEST_LOCK)) - file_lock->fl_type = fl_type; - - if ((rc == 0 || file_lock->fl_type == F_UNLCK) && - !(flags & LDLM_FL_TEST_LOCK)) - rc2 = locks_lock_file_wait(file, file_lock); - - if (rc2 && file_lock->fl_type != F_UNLCK) { - einfo.ei_mode = LCK_NL; - md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, - &lockh, flags); - rc = rc2; - } - - ll_finish_md_op_data(op_data); - - return rc; -} - -int ll_get_fid_by_name(struct inode *parent, const char *name, - int namelen, struct lu_fid *fid, - struct inode **inode) -{ - struct md_op_data *op_data = NULL; - struct ptlrpc_request *req; - struct mdt_body *body; - int rc; - - op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE; - rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req); - ll_finish_md_op_data(op_data); - if (rc < 0) - return rc; - - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - if (!body) { - rc = -EFAULT; - goto out_req; - } - if (fid) - *fid = body->mbo_fid1; - - if (inode) - rc = ll_prep_inode(inode, req, parent->i_sb, NULL); -out_req: - ptlrpc_req_finished(req); - return rc; -} - -int ll_migrate(struct inode *parent, struct file *file, int mdtidx, - const char *name, int namelen) -{ - struct ptlrpc_request *request = NULL; - struct obd_client_handle *och = NULL; - struct inode *child_inode 
= NULL; - struct dentry *dchild = NULL; - struct md_op_data *op_data; - struct mdt_body *body; - u64 data_version = 0; - struct qstr qstr; - int rc; - - CDEBUG(D_VFSTRACE, "migrate %s under " DFID " to MDT%d\n", - name, PFID(ll_inode2fid(parent)), mdtidx); - - op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, - 0, LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - /* Get child FID first */ - qstr.hash = full_name_hash(parent, name, namelen); - qstr.name = name; - qstr.len = namelen; - dchild = d_lookup(file_dentry(file), &qstr); - if (dchild) { - op_data->op_fid3 = *ll_inode2fid(dchild->d_inode); - if (dchild->d_inode) - child_inode = igrab(dchild->d_inode); - dput(dchild); - } - - if (!child_inode) { - rc = ll_get_fid_by_name(parent, name, namelen, - &op_data->op_fid3, &child_inode); - if (rc) - goto out_free; - } - - if (!child_inode) { - rc = -EINVAL; - goto out_free; - } - - inode_lock(child_inode); - op_data->op_fid3 = *ll_inode2fid(child_inode); - if (!fid_is_sane(&op_data->op_fid3)) { - CERROR("%s: migrate %s, but fid " DFID " is insane\n", - ll_get_fsname(parent->i_sb, NULL, 0), name, - PFID(&op_data->op_fid3)); - rc = -EINVAL; - goto out_unlock; - } - - rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3); - if (rc < 0) - goto out_unlock; - - if (rc == mdtidx) { - CDEBUG(D_INFO, "%s: " DFID " is already on MDT%d.\n", name, - PFID(&op_data->op_fid3), mdtidx); - rc = 0; - goto out_unlock; - } -again: - if (S_ISREG(child_inode->i_mode)) { - och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0); - if (IS_ERR(och)) { - rc = PTR_ERR(och); - och = NULL; - goto out_unlock; - } - - rc = ll_data_version(child_inode, &data_version, - LL_DV_WR_FLUSH); - if (rc) - goto out_close; - - op_data->op_handle = och->och_fh; - op_data->op_data = och->och_mod; - op_data->op_data_version = data_version; - op_data->op_lease_handle = och->och_lease_handle; - op_data->op_bias |= MDS_RENAME_MIGRATE; - } - - op_data->op_mds = 
mdtidx; - op_data->op_cli_flags = CLI_MIGRATE; - rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, - namelen, name, namelen, &request); - if (!rc) { - LASSERT(request); - ll_update_times(request, parent); - - body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY); - LASSERT(body); - - /* - * If the server does release layout lock, then we cleanup - * the client och here, otherwise release it in out_close: - */ - if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) { - obd_mod_put(och->och_mod); - md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp, - och); - och->och_fh.cookie = DEAD_HANDLE_MAGIC; - kfree(och); - och = NULL; - } - } - - if (request) { - ptlrpc_req_finished(request); - request = NULL; - } - - /* Try again if the file layout has changed. */ - if (rc == -EAGAIN && S_ISREG(child_inode->i_mode)) - goto again; - -out_close: - if (och) /* close the file */ - ll_lease_close(och, child_inode, NULL); - if (!rc) - clear_nlink(child_inode); -out_unlock: - inode_unlock(child_inode); - iput(child_inode); -out_free: - ll_finish_md_op_data(op_data); - return rc; -} - -static int -ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock) -{ - return -ENOSYS; -} - -/** - * test if some locks matching bits and l_req_mode are acquired - * - bits can be in different locks - * - if found clear the common lock bits in *bits - * - the bits not found, are kept in *bits - * \param inode [IN] - * \param bits [IN] searched lock bits [IN] - * \param l_req_mode [IN] searched lock mode - * \retval boolean, true iff all bits are found - */ -int ll_have_md_lock(struct inode *inode, __u64 *bits, - enum ldlm_mode l_req_mode) -{ - struct lustre_handle lockh; - union ldlm_policy_data policy; - enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ? 
- (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode; - struct lu_fid *fid; - __u64 flags; - int i; - - if (!inode) - return 0; - - fid = &ll_i2info(inode)->lli_fid; - CDEBUG(D_INFO, "trying to match res " DFID " mode %s\n", PFID(fid), - ldlm_lockname[mode]); - - flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK; - for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) { - policy.l_inodebits.bits = *bits & (1 << i); - if (policy.l_inodebits.bits == 0) - continue; - - if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, - &policy, mode, &lockh)) { - struct ldlm_lock *lock; - - lock = ldlm_handle2lock(&lockh); - if (lock) { - *bits &= - ~(lock->l_policy_data.l_inodebits.bits); - LDLM_LOCK_PUT(lock); - } else { - *bits &= ~policy.l_inodebits.bits; - } - } - } - return *bits == 0; -} - -enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits, - struct lustre_handle *lockh, __u64 flags, - enum ldlm_mode mode) -{ - union ldlm_policy_data policy = { .l_inodebits = { bits } }; - struct lu_fid *fid; - - fid = &ll_i2info(inode)->lli_fid; - CDEBUG(D_INFO, "trying to match res " DFID "\n", PFID(fid)); - - return md_lock_match(ll_i2mdexp(inode), flags | LDLM_FL_BLOCK_GRANTED, - fid, LDLM_IBITS, &policy, mode, lockh); -} - -static int ll_inode_revalidate_fini(struct inode *inode, int rc) -{ - /* Already unlinked. Just update nlink and return success */ - if (rc == -ENOENT) { - clear_nlink(inode); - /* If it is striped directory, and there is bad stripe - * Let's revalidate the dentry again, instead of returning - * error - */ - if (S_ISDIR(inode->i_mode) && ll_i2info(inode)->lli_lsm_md) - return 0; - - /* This path cannot be hit for regular files unless in - * case of obscure races, so no need to validate size. - */ - if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) - return 0; - } else if (rc != 0) { - CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? 
D_INFO : D_ERROR, - "%s: revalidate FID " DFID " error: rc = %d\n", - ll_get_fsname(inode->i_sb, NULL, 0), - PFID(ll_inode2fid(inode)), rc); - } - - return rc; -} - -static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits) -{ - struct inode *inode = d_inode(dentry); - struct ptlrpc_request *req = NULL; - struct obd_export *exp; - int rc = 0; - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p),name=%pd\n", - PFID(ll_inode2fid(inode)), inode, dentry); - - exp = ll_i2mdexp(inode); - - /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC. - * But under CMD case, it caused some lock issues, should be fixed - * with new CMD ibits lock. See bug 12718 - */ - if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) { - struct lookup_intent oit = { .it_op = IT_GETATTR }; - struct md_op_data *op_data; - - if (ibits == MDS_INODELOCK_LOOKUP) - oit.it_op = IT_LOOKUP; - - /* Call getattr by fid, so do not provide name at all. */ - op_data = ll_prep_md_op_data(NULL, inode, - inode, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - rc = md_intent_lock(exp, op_data, &oit, &req, - &ll_md_blocking_ast, 0); - ll_finish_md_op_data(op_data); - if (rc < 0) { - rc = ll_inode_revalidate_fini(inode, rc); - goto out; - } - - rc = ll_revalidate_it_finish(req, &oit, inode); - if (rc != 0) { - ll_intent_release(&oit); - goto out; - } - - /* Unlinked? Unhash dentry, so it is not picked up later by - * do_lookup() -> ll_revalidate_it(). We cannot use d_drop - * here to preserve get_cwd functionality on 2.6. 
- * Bug 10503 - */ - if (!d_inode(dentry)->i_nlink) { - spin_lock(&inode->i_lock); - d_lustre_invalidate(dentry, 0); - spin_unlock(&inode->i_lock); - } - - ll_lookup_finish_locks(&oit, inode); - } else if (!ll_have_md_lock(d_inode(dentry), &ibits, LCK_MINMODE)) { - struct ll_sb_info *sbi = ll_i2sbi(d_inode(dentry)); - u64 valid = OBD_MD_FLGETATTR; - struct md_op_data *op_data; - int ealen = 0; - - if (S_ISREG(inode->i_mode)) { - rc = ll_get_default_mdsize(sbi, &ealen); - if (rc) - return rc; - valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE; - } - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, - 0, ealen, LUSTRE_OPC_ANY, - NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - op_data->op_valid = valid; - rc = md_getattr(sbi->ll_md_exp, op_data, &req); - ll_finish_md_op_data(op_data); - if (rc) - return ll_inode_revalidate_fini(inode, rc); - - rc = ll_prep_inode(&inode, req, NULL, NULL); - } -out: - ptlrpc_req_finished(req); - return rc; -} - -static int ll_merge_md_attr(struct inode *inode) -{ - struct cl_attr attr = { 0 }; - int rc; - - LASSERT(ll_i2info(inode)->lli_lsm_md); - rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md, - &attr, ll_md_blocking_ast); - if (rc) - return rc; - - set_nlink(inode, attr.cat_nlink); - inode->i_blocks = attr.cat_blocks; - i_size_write(inode, attr.cat_size); - - ll_i2info(inode)->lli_atime = attr.cat_atime; - ll_i2info(inode)->lli_mtime = attr.cat_mtime; - ll_i2info(inode)->lli_ctime = attr.cat_ctime; - - return 0; -} - -static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits) -{ - struct inode *inode = d_inode(dentry); - int rc; - - rc = __ll_inode_revalidate(dentry, ibits); - if (rc != 0) - return rc; - - /* if object isn't regular file, don't validate size */ - if (!S_ISREG(inode->i_mode)) { - if (S_ISDIR(inode->i_mode) && - ll_i2info(inode)->lli_lsm_md) { - rc = ll_merge_md_attr(inode); - if (rc) - return rc; - } - - LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime; - 
LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime; - LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime; - } else { - struct ll_inode_info *lli = ll_i2info(inode); - - /* In case of restore, the MDT has the right size and has - * already send it back without granting the layout lock, - * inode is up-to-date so glimpse is useless. - * Also to glimpse we need the layout, in case of a running - * restore the MDT holds the layout lock so the glimpse will - * block up to the end of restore (getattr will block) - */ - if (!test_bit(LLIF_FILE_RESTORING, &lli->lli_flags)) - rc = ll_glimpse_size(inode); - } - return rc; -} - -int ll_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) -{ - struct inode *inode = d_inode(path->dentry); - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ll_inode_info *lli = ll_i2info(inode); - int res; - - res = ll_inode_revalidate(path->dentry, - MDS_INODELOCK_UPDATE | MDS_INODELOCK_LOOKUP); - ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1); - - if (res) - return res; - - OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30); - - stat->dev = inode->i_sb->s_dev; - if (ll_need_32bit_api(sbi)) - stat->ino = cl_fid_build_ino(&lli->lli_fid, 1); - else - stat->ino = inode->i_ino; - stat->mode = inode->i_mode; - stat->uid = inode->i_uid; - stat->gid = inode->i_gid; - stat->rdev = inode->i_rdev; - stat->atime = inode->i_atime; - stat->mtime = inode->i_mtime; - stat->ctime = inode->i_ctime; - stat->blksize = 1 << inode->i_blkbits; - - stat->nlink = inode->i_nlink; - stat->size = i_size_read(inode); - stat->blocks = inode->i_blocks; - - return 0; -} - -static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len) -{ - int rc; - size_t num_bytes; - struct fiemap *fiemap; - unsigned int extent_count = fieinfo->fi_extents_max; - - num_bytes = sizeof(*fiemap) + (extent_count * - sizeof(struct fiemap_extent)); - fiemap = kvzalloc(num_bytes, GFP_KERNEL); - if (!fiemap) - return 
-ENOMEM; - - fiemap->fm_flags = fieinfo->fi_flags; - fiemap->fm_extent_count = fieinfo->fi_extents_max; - fiemap->fm_start = start; - fiemap->fm_length = len; - - if (extent_count > 0 && - copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start, - sizeof(struct fiemap_extent))) { - rc = -EFAULT; - goto out; - } - - rc = ll_do_fiemap(inode, fiemap, num_bytes); - - fieinfo->fi_flags = fiemap->fm_flags; - fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents; - if (extent_count > 0 && - copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0], - fiemap->fm_mapped_extents * - sizeof(struct fiemap_extent))) { - rc = -EFAULT; - goto out; - } -out: - kvfree(fiemap); - return rc; -} - -struct posix_acl *ll_get_acl(struct inode *inode, int type) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct posix_acl *acl = NULL; - - spin_lock(&lli->lli_lock); - /* VFS' acl_permission_check->check_acl will release the refcount */ - acl = posix_acl_dup(lli->lli_posix_acl); - spin_unlock(&lli->lli_lock); - - return acl; -} - -int ll_inode_permission(struct inode *inode, int mask) -{ - struct ll_sb_info *sbi; - struct root_squash_info *squash; - const struct cred *old_cred = NULL; - struct cred *cred = NULL; - bool squash_id = false; - cfs_cap_t cap; - int rc = 0; - - if (mask & MAY_NOT_BLOCK) - return -ECHILD; - - /* as root inode are NOT getting validated in lookup operation, - * need to do it before permission check. 
- */ - - if (is_root_inode(inode)) { - rc = __ll_inode_revalidate(inode->i_sb->s_root, - MDS_INODELOCK_LOOKUP); - if (rc) - return rc; - } - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p), inode mode %x mask %o\n", - PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask); - - /* squash fsuid/fsgid if needed */ - sbi = ll_i2sbi(inode); - squash = &sbi->ll_squash; - if (unlikely(squash->rsi_uid && - uid_eq(current_fsuid(), GLOBAL_ROOT_UID) && - !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) { - squash_id = true; - } - - if (squash_id) { - CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n", - __kuid_val(current_fsuid()), __kgid_val(current_fsgid()), - squash->rsi_uid, squash->rsi_gid); - - /* - * update current process's credentials - * and FS capability - */ - cred = prepare_creds(); - if (!cred) - return -ENOMEM; - - cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid); - cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid); - for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) { - if ((1 << cap) & CFS_CAP_FS_MASK) - cap_lower(cred->cap_effective, cap); - } - old_cred = override_creds(cred); - } - - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1); - rc = generic_permission(inode, mask); - - /* restore current process's credentials and FS capability */ - if (squash_id) { - revert_creds(old_cred); - put_cred(cred); - } - - return rc; -} - -/* -o localflock - only provides locally consistent flock locks */ -const struct file_operations ll_file_operations = { - .read_iter = ll_file_read_iter, - .write_iter = ll_file_write_iter, - .unlocked_ioctl = ll_file_ioctl, - .open = ll_file_open, - .release = ll_file_release, - .mmap = ll_file_mmap, - .llseek = ll_file_seek, - .splice_read = generic_file_splice_read, - .fsync = ll_fsync, - .flush = ll_flush -}; - -const struct file_operations ll_file_operations_flock = { - .read_iter = ll_file_read_iter, - .write_iter = ll_file_write_iter, - .unlocked_ioctl = ll_file_ioctl, - .open = ll_file_open, - .release = 
ll_file_release, - .mmap = ll_file_mmap, - .llseek = ll_file_seek, - .splice_read = generic_file_splice_read, - .fsync = ll_fsync, - .flush = ll_flush, - .flock = ll_file_flock, - .lock = ll_file_flock -}; - -/* These are for -o noflock - to return ENOSYS on flock calls */ -const struct file_operations ll_file_operations_noflock = { - .read_iter = ll_file_read_iter, - .write_iter = ll_file_write_iter, - .unlocked_ioctl = ll_file_ioctl, - .open = ll_file_open, - .release = ll_file_release, - .mmap = ll_file_mmap, - .llseek = ll_file_seek, - .splice_read = generic_file_splice_read, - .fsync = ll_fsync, - .flush = ll_flush, - .flock = ll_file_noflock, - .lock = ll_file_noflock -}; - -const struct inode_operations ll_file_inode_operations = { - .setattr = ll_setattr, - .getattr = ll_getattr, - .permission = ll_inode_permission, - .listxattr = ll_listxattr, - .fiemap = ll_fiemap, - .get_acl = ll_get_acl, -}; - -/* dynamic ioctl number support routines */ -static struct llioc_ctl_data { - struct rw_semaphore ioc_sem; - struct list_head ioc_head; -} llioc = { - __RWSEM_INITIALIZER(llioc.ioc_sem), - LIST_HEAD_INIT(llioc.ioc_head) -}; - -struct llioc_data { - struct list_head iocd_list; - unsigned int iocd_size; - llioc_callback_t iocd_cb; - unsigned int iocd_count; - unsigned int iocd_cmd[0]; -}; - -void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd) -{ - unsigned int size; - struct llioc_data *in_data = NULL; - - if (!cb || !cmd || count > LLIOC_MAX_CMD || count < 0) - return NULL; - - size = sizeof(*in_data) + count * sizeof(unsigned int); - in_data = kzalloc(size, GFP_NOFS); - if (!in_data) - return NULL; - - in_data->iocd_size = size; - in_data->iocd_cb = cb; - in_data->iocd_count = count; - memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count); - - down_write(&llioc.ioc_sem); - list_add_tail(&in_data->iocd_list, &llioc.ioc_head); - up_write(&llioc.ioc_sem); - - return in_data; -} -EXPORT_SYMBOL(ll_iocontrol_register); - -void 
ll_iocontrol_unregister(void *magic) -{ - struct llioc_data *tmp; - - if (!magic) - return; - - down_write(&llioc.ioc_sem); - list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) { - if (tmp == magic) { - list_del(&tmp->iocd_list); - up_write(&llioc.ioc_sem); - - kfree(tmp); - return; - } - } - up_write(&llioc.ioc_sem); - - CWARN("didn't find iocontrol register block with magic: %p\n", magic); -} -EXPORT_SYMBOL(ll_iocontrol_unregister); - -static enum llioc_iter -ll_iocontrol_call(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg, int *rcp) -{ - enum llioc_iter ret = LLIOC_CONT; - struct llioc_data *data; - int rc = -EINVAL, i; - - down_read(&llioc.ioc_sem); - list_for_each_entry(data, &llioc.ioc_head, iocd_list) { - for (i = 0; i < data->iocd_count; i++) { - if (cmd != data->iocd_cmd[i]) - continue; - - ret = data->iocd_cb(inode, file, cmd, arg, data, &rc); - break; - } - - if (ret == LLIOC_STOP) - break; - } - up_read(&llioc.ioc_sem); - - if (rcp) - *rcp = rc; - return ret; -} - -int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct cl_object *obj = lli->lli_clob; - struct lu_env *env; - int rc; - u16 refcheck; - - if (!obj) - return 0; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - rc = cl_conf_set(env, obj, conf); - if (rc < 0) - goto out; - - if (conf->coc_opc == OBJECT_CONF_SET) { - struct ldlm_lock *lock = conf->coc_lock; - struct cl_layout cl = { - .cl_layout_gen = 0, - }; - - LASSERT(lock); - LASSERT(ldlm_has_layout(lock)); - - /* it can only be allowed to match after layout is - * applied to inode otherwise false layout would be - * seen. Applying layout should happen before dropping - * the intent lock. 
- */ - ldlm_lock_allow_match(lock); - - rc = cl_object_layout_get(env, obj, &cl); - if (rc < 0) - goto out; - - CDEBUG(D_VFSTRACE, DFID ": layout version change: %u -> %u\n", - PFID(&lli->lli_fid), ll_layout_version_get(lli), - cl.cl_layout_gen); - ll_layout_version_set(lli, cl.cl_layout_gen); - } -out: - cl_env_put(env, &refcheck); - return rc; -} - -/* Fetch layout from MDT with getxattr request, if it's not ready yet */ -static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock) - -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ptlrpc_request *req; - struct mdt_body *body; - void *lvbdata; - void *lmm; - int lmmsize; - int rc; - - CDEBUG(D_INODE, DFID " LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n", - PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock), - lock->l_lvb_data, lock->l_lvb_len); - - if (lock->l_lvb_data && ldlm_is_lvb_ready(lock)) - return 0; - - /* if layout lock was granted right away, the layout is returned - * within DLM_LVB of dlm reply; otherwise if the lock was ever - * blocked and then granted via completion ast, we have to fetch - * layout here. 
Please note that we can't use the LVB buffer in - * completion AST because it doesn't have a large enough buffer - */ - rc = ll_get_default_mdsize(sbi, &lmmsize); - if (rc == 0) - rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), - OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0, - lmmsize, 0, &req); - if (rc < 0) - return rc; - - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - if (!body) { - rc = -EPROTO; - goto out; - } - - lmmsize = body->mbo_eadatasize; - if (lmmsize == 0) /* empty layout */ { - rc = 0; - goto out; - } - - lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize); - if (!lmm) { - rc = -EFAULT; - goto out; - } - - lvbdata = kvzalloc(lmmsize, GFP_NOFS); - if (!lvbdata) { - rc = -ENOMEM; - goto out; - } - - memcpy(lvbdata, lmm, lmmsize); - lock_res_and_lock(lock); - if (lock->l_lvb_data) - kvfree(lock->l_lvb_data); - - lock->l_lvb_data = lvbdata; - lock->l_lvb_len = lmmsize; - unlock_res_and_lock(lock); - -out: - ptlrpc_req_finished(req); - return rc; -} - -/** - * Apply the layout to the inode. Layout lock is held and will be released - * in this function. - */ -static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode, - struct inode *inode) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ldlm_lock *lock; - struct cl_object_conf conf; - int rc = 0; - bool lvb_ready; - bool wait_layout = false; - - LASSERT(lustre_handle_is_used(lockh)); - - lock = ldlm_handle2lock(lockh); - LASSERT(lock); - LASSERT(ldlm_has_layout(lock)); - - LDLM_DEBUG(lock, "File " DFID "(%p) being reconfigured", - PFID(&lli->lli_fid), inode); - - /* in case this is a caching lock and reinstate with new inode */ - md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL); - - lock_res_and_lock(lock); - lvb_ready = ldlm_is_lvb_ready(lock); - unlock_res_and_lock(lock); - /* checking lvb_ready is racy but this is okay. 
The worst case is - * that multi processes may configure the file on the same time. - */ - if (lvb_ready) { - rc = 0; - goto out; - } - - rc = ll_layout_fetch(inode, lock); - if (rc < 0) - goto out; - - /* for layout lock, lmm is returned in lock's lvb. - * lvb_data is immutable if the lock is held so it's safe to access it - * without res lock. - * - * set layout to file. Unlikely this will fail as old layout was - * surely eliminated - */ - memset(&conf, 0, sizeof(conf)); - conf.coc_opc = OBJECT_CONF_SET; - conf.coc_inode = inode; - conf.coc_lock = lock; - conf.u.coc_layout.lb_buf = lock->l_lvb_data; - conf.u.coc_layout.lb_len = lock->l_lvb_len; - rc = ll_layout_conf(inode, &conf); - - /* refresh layout failed, need to wait */ - wait_layout = rc == -EBUSY; - -out: - LDLM_LOCK_PUT(lock); - ldlm_lock_decref(lockh, mode); - - /* wait for IO to complete if it's still being used. */ - if (wait_layout) { - CDEBUG(D_INODE, "%s: " DFID "(%p) wait for layout reconf\n", - ll_get_fsname(inode->i_sb, NULL, 0), - PFID(&lli->lli_fid), inode); - - memset(&conf, 0, sizeof(conf)); - conf.coc_opc = OBJECT_CONF_WAIT; - conf.coc_inode = inode; - rc = ll_layout_conf(inode, &conf); - if (rc == 0) - rc = -EAGAIN; - - CDEBUG(D_INODE, - "%s: file=" DFID " waiting layout return: %d.\n", - ll_get_fsname(inode->i_sb, NULL, 0), - PFID(&lli->lli_fid), rc); - } - return rc; -} - -static int ll_layout_refresh_locked(struct inode *inode) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct md_op_data *op_data; - struct lookup_intent it; - struct lustre_handle lockh; - enum ldlm_mode mode; - struct ldlm_enqueue_info einfo = { - .ei_type = LDLM_IBITS, - .ei_mode = LCK_CR, - .ei_cb_bl = &ll_md_blocking_ast, - .ei_cb_cp = &ldlm_completion_ast, - }; - int rc; - -again: - /* mostly layout lock is caching on the local side, so try to match - * it before grabbing layout lock mutex. 
- */ - mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0, - LCK_CR | LCK_CW | LCK_PR | LCK_PW); - if (mode != 0) { /* hit cached lock */ - rc = ll_layout_lock_set(&lockh, mode, inode); - if (rc == -EAGAIN) - goto again; - return rc; - } - - op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, - 0, 0, LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - /* have to enqueue one */ - memset(&it, 0, sizeof(it)); - it.it_op = IT_LAYOUT; - lockh.cookie = 0ULL; - - LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file " DFID "(%p)", - ll_get_fsname(inode->i_sb, NULL, 0), - PFID(&lli->lli_fid), inode); - - rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0); - ptlrpc_req_finished(it.it_request); - it.it_request = NULL; - - ll_finish_md_op_data(op_data); - - mode = it.it_lock_mode; - it.it_lock_mode = 0; - ll_intent_drop_lock(&it); - - if (rc == 0) { - /* set lock data in case this is a new lock */ - ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL); - rc = ll_layout_lock_set(&lockh, mode, inode); - if (rc == -EAGAIN) - goto again; - } - - return rc; -} - -/** - * This function checks if there exists a LAYOUT lock on the client side, - * or enqueues it if it doesn't have one in cache. - * - * This function will not hold layout lock so it may be revoked any time after - * this function returns. Any operations depend on layout should be redone - * in that case. - * - * This function should be called before lov_io_init() to get an uptodate - * layout version, the caller should save the version number and after IO - * is finished, this function should be called again to verify that layout - * is not changed during IO time. 
- */ -int ll_layout_refresh(struct inode *inode, __u32 *gen) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct ll_sb_info *sbi = ll_i2sbi(inode); - int rc; - - *gen = ll_layout_version_get(lli); - if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE) - return 0; - - /* sanity checks */ - LASSERT(fid_is_sane(ll_inode2fid(inode))); - LASSERT(S_ISREG(inode->i_mode)); - - /* take layout lock mutex to enqueue layout lock exclusively. */ - mutex_lock(&lli->lli_layout_mutex); - - rc = ll_layout_refresh_locked(inode); - if (rc < 0) - goto out; - - *gen = ll_layout_version_get(lli); -out: - mutex_unlock(&lli->lli_layout_mutex); - - return rc; -} - -/** - * This function send a restore request to the MDT - */ -int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length) -{ - struct hsm_user_request *hur; - int len, rc; - - len = sizeof(struct hsm_user_request) + - sizeof(struct hsm_user_item); - hur = kzalloc(len, GFP_NOFS); - if (!hur) - return -ENOMEM; - - hur->hur_request.hr_action = HUA_RESTORE; - hur->hur_request.hr_archive_id = 0; - hur->hur_request.hr_flags = 0; - memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid, - sizeof(hur->hur_user_item[0].hui_fid)); - hur->hur_user_item[0].hui_extent.offset = offset; - hur->hur_user_item[0].hui_extent.length = length; - hur->hur_request.hr_itemcount = 1; - rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp, - len, hur, NULL); - kfree(hur); - return rc; -} diff --git a/drivers/staging/lustre/lustre/llite/glimpse.c b/drivers/staging/lustre/lustre/llite/glimpse.c deleted file mode 100644 index 3075358f3f08..000000000000 --- a/drivers/staging/lustre/lustre/llite/glimpse.c +++ /dev/null @@ -1,206 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * glimpse code shared between vvp and liblustre (and other Lustre clients in - * the future). - * - * Author: Nikita Danilov <nikita.danilov@sun.com> - * Author: Oleg Drokin <oleg.drokin@sun.com> - */ - -#include <linux/libcfs/libcfs.h> -#include <obd_class.h> -#include <obd_support.h> -#include <obd.h> - -#include <lustre_dlm.h> -#include <lustre_mdc.h> -#include <linux/pagemap.h> -#include <linux/file.h> - -#include <cl_object.h> -#include "llite_internal.h" - -static const struct cl_lock_descr whole_file = { - .cld_start = 0, - .cld_end = CL_PAGE_EOF, - .cld_mode = CLM_READ -}; - -/* - * Check whether file has possible unwriten pages. 
- * - * \retval 1 file is mmap-ed or has dirty pages - * 0 otherwise - */ -blkcnt_t dirty_cnt(struct inode *inode) -{ - blkcnt_t cnt = 0; - struct vvp_object *vob = cl_inode2vvp(inode); - void *results[1]; - - if (inode->i_mapping) - cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->i_pages, - results, 0, 1, - PAGECACHE_TAG_DIRTY); - if (cnt == 0 && atomic_read(&vob->vob_mmap_cnt) > 0) - cnt = 1; - - return (cnt > 0) ? 1 : 0; -} - -int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, - struct inode *inode, struct cl_object *clob, int agl) -{ - const struct lu_fid *fid = lu_object_fid(&clob->co_lu); - struct cl_lock *lock = vvp_env_lock(env); - struct cl_lock_descr *descr = &lock->cll_descr; - int result = 0; - - CDEBUG(D_DLMTRACE, "Glimpsing inode " DFID "\n", PFID(fid)); - - /* NOTE: this looks like DLM lock request, but it may - * not be one. Due to CEF_ASYNC flag (translated - * to LDLM_FL_HAS_INTENT by osc), this is - * glimpse request, that won't revoke any - * conflicting DLM locks held. Instead, - * ll_glimpse_callback() will be called on each - * client holding a DLM lock against this file, - * and resulting size will be returned for each - * stripe. DLM lock on [0, EOF] is acquired only - * if there were no conflicting locks. If there - * were conflicting locks, enqueuing or waiting - * fails with -ENAVAIL, but valid inode - * attributes are returned anyway. - */ - *descr = whole_file; - descr->cld_obj = clob; - descr->cld_mode = CLM_READ; - descr->cld_enq_flags = CEF_ASYNC | CEF_MUST; - if (agl) - descr->cld_enq_flags |= CEF_AGL; - /* - * CEF_ASYNC is used because glimpse sub-locks cannot - * deadlock (because they never conflict with other - * locks) and, hence, can be enqueued out-of-order. - * - * CEF_MUST protects glimpse lock from conversion into - * a lockless mode. 
- */ - result = cl_lock_request(env, io, lock); - if (result < 0) - return result; - - if (!agl) { - ll_merge_attr(env, inode); - if (i_size_read(inode) > 0 && !inode->i_blocks) { - /* - * LU-417: Add dirty pages block count - * lest i_blocks reports 0, some "cp" or - * "tar" may think it's a completely - * sparse file and skip it. - */ - inode->i_blocks = dirty_cnt(inode); - } - } - - cl_lock_release(env, lock); - - return result; -} - -static int cl_io_get(struct inode *inode, struct lu_env **envout, - struct cl_io **ioout, u16 *refcheck) -{ - struct lu_env *env; - struct cl_io *io; - struct ll_inode_info *lli = ll_i2info(inode); - struct cl_object *clob = lli->lli_clob; - int result; - - if (S_ISREG(inode->i_mode)) { - env = cl_env_get(refcheck); - if (!IS_ERR(env)) { - io = vvp_env_thread_io(env); - io->ci_obj = clob; - *envout = env; - *ioout = io; - result = 1; - } else { - result = PTR_ERR(env); - } - } else { - result = 0; - } - return result; -} - -int cl_glimpse_size0(struct inode *inode, int agl) -{ - /* - * We don't need ast_flags argument to cl_glimpse_size(), because - * osc_lock_enqueue() takes care of the possible deadlock that said - * argument was introduced to avoid. - */ - /* - * XXX but note that ll_file_seek() passes LDLM_FL_BLOCK_NOWAIT to - * cl_glimpse_size(), which doesn't make sense: glimpse locks are not - * blocking anyway. - */ - struct lu_env *env = NULL; - struct cl_io *io = NULL; - int result; - u16 refcheck; - - result = cl_io_get(inode, &env, &io, &refcheck); - if (result > 0) { -again: - io->ci_verify_layout = 1; - result = cl_io_init(env, io, CIT_MISC, io->ci_obj); - if (result > 0) - /* - * nothing to do for this io. This currently happens - * when stripe sub-object's are not yet created. 
- */ - result = io->ci_result; - else if (result == 0) - result = cl_glimpse_lock(env, io, inode, io->ci_obj, - agl); - - OBD_FAIL_TIMEOUT(OBD_FAIL_GLIMPSE_DELAY, 2); - cl_io_fini(env, io); - if (unlikely(io->ci_need_restart)) - goto again; - cl_env_put(env, &refcheck); - } - return result; -} diff --git a/drivers/staging/lustre/lustre/llite/lcommon_cl.c b/drivers/staging/lustre/lustre/llite/lcommon_cl.c deleted file mode 100644 index df5c0c0ae703..000000000000 --- a/drivers/staging/lustre/lustre/llite/lcommon_cl.c +++ /dev/null @@ -1,293 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * cl code shared between vvp and liblustre (and other Lustre clients in the - * future). 
- * - * Author: Nikita Danilov <nikita.danilov@sun.com> - */ - -#define DEBUG_SUBSYSTEM S_LLITE - -#include <linux/libcfs/libcfs.h> -#include <linux/fs.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/quotaops.h> -#include <linux/highmem.h> -#include <linux/pagemap.h> -#include <linux/rbtree.h> - -#include <obd.h> -#include <obd_support.h> -#include <lustre_fid.h> -#include <lustre_dlm.h> -#include <lustre_mdc.h> -#include <cl_object.h> - -#include "llite_internal.h" - -/* - * ccc_ prefix stands for "Common Client Code". - */ - -/***************************************************************************** - * - * Vvp device and device type functions. - * - */ - -/** - * An `emergency' environment used by cl_inode_fini() when cl_env_get() - * fails. Access to this environment is serialized by cl_inode_fini_guard - * mutex. - */ -struct lu_env *cl_inode_fini_env; -u16 cl_inode_fini_refcheck; - -/** - * A mutex serializing calls to slp_inode_fini() under extreme memory - * pressure, when environments cannot be allocated. 
- */ -static DEFINE_MUTEX(cl_inode_fini_guard); - -int cl_setattr_ost(struct cl_object *obj, const struct iattr *attr, - unsigned int attr_flags) -{ - struct lu_env *env; - struct cl_io *io; - int result; - u16 refcheck; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - io = vvp_env_thread_io(env); - io->ci_obj = obj; - io->ci_verify_layout = 1; - - io->u.ci_setattr.sa_attr.lvb_atime = LTIME_S(attr->ia_atime); - io->u.ci_setattr.sa_attr.lvb_mtime = LTIME_S(attr->ia_mtime); - io->u.ci_setattr.sa_attr.lvb_ctime = LTIME_S(attr->ia_ctime); - io->u.ci_setattr.sa_attr.lvb_size = attr->ia_size; - io->u.ci_setattr.sa_attr_flags = attr_flags; - io->u.ci_setattr.sa_valid = attr->ia_valid; - io->u.ci_setattr.sa_parent_fid = lu_object_fid(&obj->co_lu); - -again: - if (cl_io_init(env, io, CIT_SETATTR, io->ci_obj) == 0) { - struct vvp_io *vio = vvp_env_io(env); - - if (attr->ia_valid & ATTR_FILE) - /* populate the file descriptor for ftruncate to honor - * group lock - see LU-787 - */ - vio->vui_fd = LUSTRE_FPRIVATE(attr->ia_file); - - result = cl_io_loop(env, io); - } else { - result = io->ci_result; - } - cl_io_fini(env, io); - if (unlikely(io->ci_need_restart)) - goto again; - - cl_env_put(env, &refcheck); - return result; -} - -/** - * Initialize or update CLIO structures for regular files when new - * meta-data arrives from the server. - * - * \param inode regular file inode - * \param md new file metadata from MDS - * - allocates cl_object if necessary, - * - updated layout, if object was already here. 
- */ -int cl_file_inode_init(struct inode *inode, struct lustre_md *md) -{ - struct lu_env *env; - struct ll_inode_info *lli; - struct cl_object *clob; - struct lu_site *site; - struct lu_fid *fid; - struct cl_object_conf conf = { - .coc_inode = inode, - .u = { - .coc_layout = md->layout, - } - }; - int result = 0; - u16 refcheck; - - LASSERT(md->body->mbo_valid & OBD_MD_FLID); - LASSERT(S_ISREG(inode->i_mode)); - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - site = ll_i2sbi(inode)->ll_site; - lli = ll_i2info(inode); - fid = &lli->lli_fid; - LASSERT(fid_is_sane(fid)); - - if (!lli->lli_clob) { - /* clob is slave of inode, empty lli_clob means for new inode, - * there is no clob in cache with the given fid, so it is - * unnecessary to perform lookup-alloc-lookup-insert, just - * alloc and insert directly. - */ - LASSERT(inode->i_state & I_NEW); - conf.coc_lu.loc_flags = LOC_F_NEW; - clob = cl_object_find(env, lu2cl_dev(site->ls_top_dev), - fid, &conf); - if (!IS_ERR(clob)) { - /* - * No locking is necessary, as new inode is - * locked by I_NEW bit. - */ - lli->lli_clob = clob; - lu_object_ref_add(&clob->co_lu, "inode", inode); - } else { - result = PTR_ERR(clob); - } - } else { - result = cl_conf_set(env, lli->lli_clob, &conf); - } - - cl_env_put(env, &refcheck); - - if (result != 0) - CERROR("Failure to initialize cl object " DFID ": %d\n", - PFID(fid), result); - return result; -} - -/** - * Wait for others drop their references of the object at first, then we drop - * the last one, which will lead to the object be destroyed immediately. - * Must be called after cl_object_kill() against this object. - * - * The reason we want to do this is: destroying top object will wait for sub - * objects being destroyed first, so we can't let bottom layer (e.g. from ASTs) - * to initiate top object destroying which may deadlock. See bz22520. 
- */ -static void cl_object_put_last(struct lu_env *env, struct cl_object *obj) -{ - struct lu_object_header *header = obj->co_lu.lo_header; - wait_queue_entry_t waiter; - - if (unlikely(atomic_read(&header->loh_ref) != 1)) { - struct lu_site *site = obj->co_lu.lo_dev->ld_site; - struct lu_site_bkt_data *bkt; - - bkt = lu_site_bkt_from_fid(site, &header->loh_fid); - - init_waitqueue_entry(&waiter, current); - add_wait_queue(&bkt->lsb_marche_funebre, &waiter); - - while (1) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (atomic_read(&header->loh_ref) == 1) - break; - schedule(); - } - - set_current_state(TASK_RUNNING); - remove_wait_queue(&bkt->lsb_marche_funebre, &waiter); - } - - cl_object_put(env, obj); -} - -void cl_inode_fini(struct inode *inode) -{ - struct lu_env *env; - struct ll_inode_info *lli = ll_i2info(inode); - struct cl_object *clob = lli->lli_clob; - u16 refcheck; - int emergency; - - if (clob) { - env = cl_env_get(&refcheck); - emergency = IS_ERR(env); - if (emergency) { - mutex_lock(&cl_inode_fini_guard); - LASSERT(cl_inode_fini_env); - env = cl_inode_fini_env; - } - /* - * cl_object cache is a slave to inode cache (which, in turn - * is a slave to dentry cache), don't keep cl_object in memory - * when its master is evicted. - */ - cl_object_kill(env, clob); - lu_object_ref_del(&clob->co_lu, "inode", inode); - cl_object_put_last(env, clob); - lli->lli_clob = NULL; - if (emergency) - mutex_unlock(&cl_inode_fini_guard); - else - cl_env_put(env, &refcheck); - } -} - -/** - * build inode number from passed @fid - */ -__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32) -{ - if (BITS_PER_LONG == 32 || api32) - return fid_flatten32(fid); - else - return fid_flatten(fid); -} - -/** - * build inode generation from passed @fid. If our FID overflows the 32-bit - * inode number then return a non-zero generation to distinguish them. 
- */ -__u32 cl_fid_build_gen(const struct lu_fid *fid) -{ - __u32 gen; - - if (fid_is_igif(fid)) { - gen = lu_igif_gen(fid); - return gen; - } - - gen = fid_flatten(fid) >> 32; - return gen; -} diff --git a/drivers/staging/lustre/lustre/llite/lcommon_misc.c b/drivers/staging/lustre/lustre/llite/lcommon_misc.c deleted file mode 100644 index a246b955306e..000000000000 --- a/drivers/staging/lustre/lustre/llite/lcommon_misc.c +++ /dev/null @@ -1,186 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * cl code shared between vvp and liblustre (and other Lustre clients in the - * future). - * - */ -#define DEBUG_SUBSYSTEM S_LLITE -#include <obd_class.h> -#include <obd_support.h> -#include <obd.h> -#include <cl_object.h> - -#include "llite_internal.h" - -/* Initialize the default and maximum LOV EA and cookie sizes. 
This allows - * us to make MDS RPCs with large enough reply buffers to hold the - * maximum-sized (= maximum striped) EA and cookie without having to - * calculate this (via a call into the LOV + OSCs) each time we make an RPC. - */ -int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp) -{ - u32 val_size, max_easize, def_easize; - int rc; - - val_size = sizeof(max_easize); - rc = obd_get_info(NULL, dt_exp, sizeof(KEY_MAX_EASIZE), KEY_MAX_EASIZE, - &val_size, &max_easize); - if (rc) - return rc; - - val_size = sizeof(def_easize); - rc = obd_get_info(NULL, dt_exp, sizeof(KEY_DEFAULT_EASIZE), - KEY_DEFAULT_EASIZE, &val_size, &def_easize); - if (rc) - return rc; - - /* - * default cookiesize is 0 because from 2.4 server doesn't send - * llog cookies to client. - */ - CDEBUG(D_HA, "updating def/max_easize: %d/%d\n", - def_easize, max_easize); - - rc = md_init_ea_size(md_exp, max_easize, def_easize); - return rc; -} - -/** - * This function is used as an upcall-callback hooked by liblustre and llite - * clients into obd_notify() listeners chain to handle notifications about - * change of import connect_flags. See llu_fsswop_mount() and - * lustre_common_fill_super(). 
- */ -int cl_ocd_update(struct obd_device *host, - struct obd_device *watched, - enum obd_notify_event ev, void *owner, void *data) -{ - struct lustre_client_ocd *lco; - struct client_obd *cli; - __u64 flags; - int result; - - if (!strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME) && - watched->obd_set_up && !watched->obd_stopping) { - cli = &watched->u.cli; - lco = owner; - flags = cli->cl_import->imp_connect_data.ocd_connect_flags; - CDEBUG(D_SUPER, "Changing connect_flags: %#llx -> %#llx\n", - lco->lco_flags, flags); - mutex_lock(&lco->lco_lock); - lco->lco_flags &= flags; - /* for each osc event update ea size */ - if (lco->lco_dt_exp) - cl_init_ea_size(lco->lco_md_exp, lco->lco_dt_exp); - - mutex_unlock(&lco->lco_lock); - result = 0; - } else { - CERROR("unexpected notification from %s %s (setup:%d,stopping:%d)!\n", - watched->obd_type->typ_name, - watched->obd_name, watched->obd_set_up, - watched->obd_stopping); - result = -EINVAL; - } - return result; -} - -#define GROUPLOCK_SCOPE "grouplock" - -int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock, - struct ll_grouplock *cg) -{ - struct lu_env *env; - struct cl_io *io; - struct cl_lock *lock; - struct cl_lock_descr *descr; - __u32 enqflags; - u16 refcheck; - int rc; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - io = vvp_env_thread_io(env); - io->ci_obj = obj; - - rc = cl_io_init(env, io, CIT_MISC, io->ci_obj); - if (rc != 0) { - cl_io_fini(env, io); - cl_env_put(env, &refcheck); - /* Does not make sense to take GL for released layout */ - if (rc > 0) - rc = -ENOTSUPP; - return rc; - } - - lock = vvp_env_lock(env); - descr = &lock->cll_descr; - descr->cld_obj = obj; - descr->cld_start = 0; - descr->cld_end = CL_PAGE_EOF; - descr->cld_gid = gid; - descr->cld_mode = CLM_GROUP; - - enqflags = CEF_MUST | (nonblock ? 
CEF_NONBLOCK : 0); - descr->cld_enq_flags = enqflags; - - rc = cl_lock_request(env, io, lock); - if (rc < 0) { - cl_io_fini(env, io); - cl_env_put(env, &refcheck); - return rc; - } - - cg->lg_env = env; - cg->lg_io = io; - cg->lg_lock = lock; - cg->lg_gid = gid; - - return 0; -} - -void cl_put_grouplock(struct ll_grouplock *cg) -{ - struct lu_env *env = cg->lg_env; - struct cl_io *io = cg->lg_io; - struct cl_lock *lock = cg->lg_lock; - - LASSERT(cg->lg_env); - LASSERT(cg->lg_gid); - - cl_lock_release(env, lock); - cl_io_fini(env, io); - cl_env_put(env, NULL); -} diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h deleted file mode 100644 index d46bcf71b273..000000000000 --- a/drivers/staging/lustre/lustre/llite/llite_internal.h +++ /dev/null @@ -1,1337 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- */ - -#ifndef LLITE_INTERNAL_H -#define LLITE_INTERNAL_H -#include <lustre_debug.h> -#include <uapi/linux/lustre/lustre_ver.h> -#include <lustre_disk.h> /* for s2sbi */ -#include <lustre_linkea.h> - -/* for struct cl_lock_descr and struct cl_io */ -#include <lustre_patchless_compat.h> -#include <lustre_compat.h> -#include <cl_object.h> -#include <lustre_lmv.h> -#include <lustre_mdc.h> -#include <lustre_intent.h> -#include <linux/compat.h> -#include <linux/namei.h> -#include <linux/xattr.h> -#include <linux/posix_acl_xattr.h> -#include "vvp_internal.h" -#include "range_lock.h" - -#ifndef FMODE_EXEC -#define FMODE_EXEC 0 -#endif - -#ifndef VM_FAULT_RETRY -#define VM_FAULT_RETRY 0 -#endif - -/** Only used on client-side for indicating the tail of dir hash/offset. */ -#define LL_DIR_END_OFF 0x7fffffffffffffffULL -#define LL_DIR_END_OFF_32BIT 0x7fffffffUL - -/* 4UL * 1024 * 1024 */ -#define LL_MAX_BLKSIZE_BITS 22 - -#define LL_IT2STR(it) ((it) ? ldlm_it2str((it)->it_op) : "0") -#define LUSTRE_FPRIVATE(file) ((file)->private_data) - -struct ll_dentry_data { - struct lookup_intent *lld_it; - unsigned int lld_sa_generation; - unsigned int lld_invalid:1; - unsigned int lld_nfs_dentry:1; - struct rcu_head lld_rcu_head; -}; - -#define ll_d2d(de) ((struct ll_dentry_data *)((de)->d_fsdata)) - -#define LLI_INODE_MAGIC 0x111d0de5 -#define LLI_INODE_DEAD 0xdeadd00d - -struct ll_getname_data { - struct dir_context ctx; - char *lgd_name; /* points to buffer with NAME_MAX+1 size */ - struct lu_fid lgd_fid; /* target fid we are looking for */ - int lgd_found; /* inode matched? */ -}; - -struct ll_grouplock { - struct lu_env *lg_env; - struct cl_io *lg_io; - struct cl_lock *lg_lock; - unsigned long lg_gid; -}; - -enum ll_file_flags { - /* File data is modified. 
*/ - LLIF_DATA_MODIFIED = 0, - /* File is being restored */ - LLIF_FILE_RESTORING = 1, - /* Xattr cache is attached to the file */ - LLIF_XATTR_CACHE = 2, -}; - -struct ll_inode_info { - __u32 lli_inode_magic; - - spinlock_t lli_lock; - unsigned long lli_flags; - struct posix_acl *lli_posix_acl; - - /* identifying fields for both metadata and data stacks. */ - struct lu_fid lli_fid; - /* master inode fid for stripe directory */ - struct lu_fid lli_pfid; - - /* We need all three because every inode may be opened in different - * modes - */ - struct obd_client_handle *lli_mds_read_och; - struct obd_client_handle *lli_mds_write_och; - struct obd_client_handle *lli_mds_exec_och; - __u64 lli_open_fd_read_count; - __u64 lli_open_fd_write_count; - __u64 lli_open_fd_exec_count; - /* Protects access to och pointers and their usage counters */ - struct mutex lli_och_mutex; - - struct inode lli_vfs_inode; - - /* the most recent timestamps obtained from mds */ - s64 lli_atime; - s64 lli_mtime; - s64 lli_ctime; - spinlock_t lli_agl_lock; - - /* Try to make the d::member and f::member are aligned. Before using - * these members, make clear whether it is directory or not. - */ - union { - /* for directory */ - struct { - /* serialize normal readdir and statahead-readdir. */ - struct mutex lli_readdir_mutex; - - /* metadata statahead */ - /* since parent-child threads can share the same @file - * struct, "opendir_key" is the token when dir close for - * case of parent exit before child -- it is me should - * cleanup the dir readahead. - */ - void *lli_opendir_key; - struct ll_statahead_info *lli_sai; - /* protect statahead stuff. */ - spinlock_t lli_sa_lock; - /* "opendir_pid" is the token when lookup/revalidate - * -- I am the owner of dir statahead. 
- */ - pid_t lli_opendir_pid; - /* stat will try to access statahead entries or start - * statahead if this flag is set, and this flag will be - * set upon dir open, and cleared when dir is closed, - * statahead hit ratio is too low, or start statahead - * thread failed. - */ - unsigned int lli_sa_enabled:1; - /* generation for statahead */ - unsigned int lli_sa_generation; - /* directory stripe information */ - struct lmv_stripe_md *lli_lsm_md; - /* default directory stripe offset. This is extracted - * from the "dmv" xattr in order to decide which MDT to - * create a subdirectory on. The MDS itself fetches - * "dmv" and gets the rest of the default layout itself - * (count, hash, etc). - */ - __u32 lli_def_stripe_offset; - }; - - /* for non-directory */ - struct { - struct mutex lli_size_mutex; - char *lli_symlink_name; - /* - * struct rw_semaphore { - * signed long count; // align d.d_def_acl - * spinlock_t wait_lock; // align d.d_sa_lock - * struct list_head wait_list; - * } - */ - struct rw_semaphore lli_trunc_sem; - struct range_lock_tree lli_write_tree; - - struct rw_semaphore lli_glimpse_sem; - unsigned long lli_glimpse_time; - struct list_head lli_agl_list; - __u64 lli_agl_index; - - /* for writepage() only to communicate to fsync */ - int lli_async_rc; - - /* - * whenever a process try to read/write the file, the - * jobid of the process will be saved here, and it'll - * be packed into the write PRC when flush later. - * - * so the read/write statistics for jobid will not be - * accurate if the file is shared by different jobs. - */ - char lli_jobid[LUSTRE_JOBID_SIZE]; - }; - }; - - /* XXX: For following frequent used members, although they maybe special - * used for non-directory object, it is some time-wasting to check - * whether the object is directory or not before using them. On the - * other hand, currently, sizeof(f) > sizeof(d), it cannot reduce - * the "ll_inode_info" size even if moving those members into u.f. - * So keep them out side. 
- * - * In the future, if more members are added only for directory, - * some of the following members can be moved into u.f. - */ - struct cl_object *lli_clob; - - /* mutex to request for layout lock exclusively. */ - struct mutex lli_layout_mutex; - /* Layout version, protected by lli_layout_lock */ - __u32 lli_layout_gen; - spinlock_t lli_layout_lock; - - struct rw_semaphore lli_xattrs_list_rwsem; - struct mutex lli_xattrs_enq_lock; - struct list_head lli_xattrs;/* ll_xattr_entry->xe_list */ -}; - -static inline __u32 ll_layout_version_get(struct ll_inode_info *lli) -{ - __u32 gen; - - spin_lock(&lli->lli_layout_lock); - gen = lli->lli_layout_gen; - spin_unlock(&lli->lli_layout_lock); - - return gen; -} - -static inline void ll_layout_version_set(struct ll_inode_info *lli, __u32 gen) -{ - spin_lock(&lli->lli_layout_lock); - lli->lli_layout_gen = gen; - spin_unlock(&lli->lli_layout_lock); -} - -int ll_xattr_cache_destroy(struct inode *inode); - -int ll_xattr_cache_get(struct inode *inode, const char *name, - char *buffer, size_t size, __u64 valid); - -int ll_init_security(struct dentry *dentry, struct inode *inode, - struct inode *dir); - -/* - * Locking to guarantee consistency of non-atomic updates to long long i_size, - * consistency between file size and KMS. - * - * Implemented by ->lli_size_mutex and ->lsm_lock, nested in that order. - */ - -void ll_inode_size_lock(struct inode *inode); -void ll_inode_size_unlock(struct inode *inode); - -/* FIXME: replace the name of this with LL_I to conform to kernel stuff */ -/* static inline struct ll_inode_info *LL_I(struct inode *inode) */ -static inline struct ll_inode_info *ll_i2info(struct inode *inode) -{ - return container_of(inode, struct ll_inode_info, lli_vfs_inode); -} - -/* default to about 64M of readahead on a given system. 
*/ -#define SBI_DEFAULT_READAHEAD_MAX (64UL << (20 - PAGE_SHIFT)) - -/* default to read-ahead full files smaller than 2MB on the second read */ -#define SBI_DEFAULT_READAHEAD_WHOLE_MAX (2UL << (20 - PAGE_SHIFT)) - -enum ra_stat { - RA_STAT_HIT = 0, - RA_STAT_MISS, - RA_STAT_DISTANT_READPAGE, - RA_STAT_MISS_IN_WINDOW, - RA_STAT_FAILED_GRAB_PAGE, - RA_STAT_FAILED_MATCH, - RA_STAT_DISCARDED, - RA_STAT_ZERO_LEN, - RA_STAT_ZERO_WINDOW, - RA_STAT_EOF, - RA_STAT_MAX_IN_FLIGHT, - RA_STAT_WRONG_GRAB_PAGE, - RA_STAT_FAILED_REACH_END, - _NR_RA_STAT, -}; - -struct ll_ra_info { - atomic_t ra_cur_pages; - unsigned long ra_max_pages; - unsigned long ra_max_pages_per_file; - unsigned long ra_max_read_ahead_whole_pages; -}; - -/* ra_io_arg will be filled in the beginning of ll_readahead with - * ras_lock, then the following ll_read_ahead_pages will read RA - * pages according to this arg, all the items in this structure are - * counted by page index. - */ -struct ra_io_arg { - unsigned long ria_start; /* start offset of read-ahead*/ - unsigned long ria_end; /* end offset of read-ahead*/ - unsigned long ria_reserved; /* reserved pages for read-ahead */ - unsigned long ria_end_min; /* minimum end to cover current read */ - bool ria_eof; /* reach end of file */ - /* If stride read pattern is detected, ria_stoff means where - * stride read is started. Note: for normal read-ahead, the - * value here is meaningless, and also it will not be accessed - */ - pgoff_t ria_stoff; - /* ria_length and ria_pages are the length and pages length in the - * stride I/O mode. 
And they will also be used to check whether - * it is stride I/O read-ahead in the read-ahead pages - */ - unsigned long ria_length; - unsigned long ria_pages; -}; - -/* LL_HIST_MAX=32 causes an overflow */ -#define LL_HIST_MAX 28 -#define LL_HIST_START 12 /* buckets start at 2^12 = 4k */ -#define LL_PROCESS_HIST_MAX 10 -struct per_process_info { - pid_t pid; - struct obd_histogram pp_r_hist; - struct obd_histogram pp_w_hist; -}; - -/* pp_extents[LL_PROCESS_HIST_MAX] will hold the combined process info */ -struct ll_rw_extents_info { - struct per_process_info pp_extents[LL_PROCESS_HIST_MAX + 1]; -}; - -#define LL_OFFSET_HIST_MAX 100 -struct ll_rw_process_info { - pid_t rw_pid; - int rw_op; - loff_t rw_range_start; - loff_t rw_range_end; - loff_t rw_last_file_pos; - loff_t rw_offset; - size_t rw_smallest_extent; - size_t rw_largest_extent; - struct ll_file_data *rw_last_file; -}; - -enum stats_track_type { - STATS_TRACK_ALL = 0, /* track all processes */ - STATS_TRACK_PID, /* track process with this pid */ - STATS_TRACK_PPID, /* track processes with this ppid */ - STATS_TRACK_GID, /* track processes with this gid */ - STATS_TRACK_LAST, -}; - -/* flags for sbi->ll_flags */ -#define LL_SBI_NOLCK 0x01 /* DLM locking disabled (directio-only) */ -#define LL_SBI_CHECKSUM 0x02 /* checksum each page as it's written */ -#define LL_SBI_FLOCK 0x04 -#define LL_SBI_USER_XATTR 0x08 /* support user xattr */ -#define LL_SBI_ACL 0x10 /* support ACL */ -/* LL_SBI_RMT_CLIENT 0x40 remote client */ -#define LL_SBI_MDS_CAPA 0x80 /* support mds capa, obsolete */ -#define LL_SBI_OSS_CAPA 0x100 /* support oss capa, obsolete */ -#define LL_SBI_LOCALFLOCK 0x200 /* Local flocks support by kernel */ -#define LL_SBI_LRU_RESIZE 0x400 /* lru resize support */ -#define LL_SBI_LAZYSTATFS 0x800 /* lazystatfs mount option */ -/* LL_SBI_SOM_PREVIEW 0x1000 SOM preview mount option, obsolete */ -#define LL_SBI_32BIT_API 0x2000 /* generate 32 bit inodes. 
*/ -#define LL_SBI_64BIT_HASH 0x4000 /* support 64-bits dir hash/offset */ -#define LL_SBI_AGL_ENABLED 0x8000 /* enable agl */ -#define LL_SBI_VERBOSE 0x10000 /* verbose mount/umount */ -#define LL_SBI_LAYOUT_LOCK 0x20000 /* layout lock support */ -#define LL_SBI_USER_FID2PATH 0x40000 /* allow fid2path by unprivileged users */ -#define LL_SBI_XATTR_CACHE 0x80000 /* support for xattr cache */ -#define LL_SBI_NOROOTSQUASH 0x100000 /* do not apply root squash */ -#define LL_SBI_ALWAYS_PING 0x200000 /* always ping even if server - * suppress_pings - */ - -#define LL_SBI_FLAGS { \ - "nolck", \ - "checksum", \ - "flock", \ - "user_xattr", \ - "acl", \ - "???", \ - "???", \ - "mds_capa", \ - "oss_capa", \ - "flock", \ - "lru_resize", \ - "lazy_statfs", \ - "som", \ - "32bit_api", \ - "64bit_hash", \ - "agl", \ - "verbose", \ - "layout", \ - "user_fid2path",\ - "xattr_cache", \ - "norootsquash", \ - "always_ping", \ -} - -/* - * This is embedded into llite super-blocks to keep track of connect - * flags (capabilities) supported by all imports given mount is - * connected to. - */ -struct lustre_client_ocd { - /* - * This is conjunction of connect_flags across all imports - * (LOVs) this mount is connected to. This field is updated by - * cl_ocd_update() under ->lco_lock. - */ - __u64 lco_flags; - struct mutex lco_lock; - struct obd_export *lco_md_exp; - struct obd_export *lco_dt_exp; -}; - -struct ll_sb_info { - /* this protects pglist and ra_info. 
It isn't safe to - * grab from interrupt contexts - */ - spinlock_t ll_lock; - spinlock_t ll_pp_extent_lock; /* pp_extent entry*/ - spinlock_t ll_process_lock; /* ll_rw_process_info */ - struct obd_uuid ll_sb_uuid; - struct obd_export *ll_md_exp; - struct obd_export *ll_dt_exp; - struct dentry *ll_debugfs_entry; - struct lu_fid ll_root_fid; /* root object fid */ - - int ll_flags; - unsigned int ll_umounting:1, - ll_xattr_cache_enabled:1, - ll_client_common_fill_super_succeeded:1; - - struct lustre_client_ocd ll_lco; - - struct lprocfs_stats *ll_stats; /* lprocfs stats counter */ - - /* - * Used to track "unstable" pages on a client, and maintain a - * LRU list of clean pages. An "unstable" page is defined as - * any page which is sent to a server as part of a bulk request, - * but is uncommitted to stable storage. - */ - struct cl_client_cache *ll_cache; - - struct lprocfs_stats *ll_ra_stats; - - struct ll_ra_info ll_ra_info; - unsigned int ll_namelen; - const struct file_operations *ll_fop; - - unsigned int ll_md_brw_pages; /* readdir pages per RPC */ - - struct lu_site *ll_site; - struct cl_device *ll_cl; - /* Statistics */ - struct ll_rw_extents_info ll_rw_extents_info; - int ll_extent_process_count; - struct ll_rw_process_info ll_rw_process_info[LL_PROCESS_HIST_MAX]; - unsigned int ll_offset_process_count; - struct ll_rw_process_info ll_rw_offset_info[LL_OFFSET_HIST_MAX]; - unsigned int ll_rw_offset_entry_count; - int ll_stats_track_id; - enum stats_track_type ll_stats_track_type; - int ll_rw_stats_on; - - /* metadata stat-ahead */ - unsigned int ll_sa_max; /* max statahead RPCs */ - atomic_t ll_sa_total; /* statahead thread started - * count - */ - atomic_t ll_sa_wrong; /* statahead thread stopped for - * low hit ratio - */ - atomic_t ll_sa_running; /* running statahead thread - * count - */ - atomic_t ll_agl_total; /* AGL thread started count */ - - dev_t ll_sdev_orig; /* save s_dev before assign for - * clustered nfs - */ - /* root squash */ - struct 
root_squash_info ll_squash; - struct path ll_mnt; - - __kernel_fsid_t ll_fsid; - struct kobject ll_kobj; /* sysfs object */ - struct super_block *ll_sb; /* struct super_block (for sysfs code)*/ - struct completion ll_kobj_unregister; -}; - -/* - * per file-descriptor read-ahead data. - */ -struct ll_readahead_state { - spinlock_t ras_lock; - /* - * index of the last page that read(2) needed and that wasn't in the - * cache. Used by ras_update() to detect seeks. - * - * XXX nikita: if access seeks into cached region, Lustre doesn't see - * this. - */ - unsigned long ras_last_readpage; - /* - * number of pages read after last read-ahead window reset. As window - * is reset on each seek, this is effectively a number of consecutive - * accesses. Maybe ->ras_accessed_in_window is better name. - * - * XXX nikita: window is also reset (by ras_update()) when Lustre - * believes that memory pressure evicts read-ahead pages. In that - * case, it probably doesn't make sense to expand window to - * PTLRPC_MAX_BRW_PAGES on the third access. - */ - unsigned long ras_consecutive_pages; - /* - * number of read requests after the last read-ahead window reset - * As window is reset on each seek, this is effectively the number - * on consecutive read request and is used to trigger read-ahead. - */ - unsigned long ras_consecutive_requests; - /* - * Parameters of current read-ahead window. Handled by - * ras_update(). On the initial access to the file or after a seek, - * window is reset to 0. After 3 consecutive accesses, window is - * expanded to PTLRPC_MAX_BRW_PAGES. Afterwards, window is enlarged by - * PTLRPC_MAX_BRW_PAGES chunks up to ->ra_max_pages. - */ - unsigned long ras_window_start, ras_window_len; - /* - * Optimal RPC size. It decides how many pages will be sent - * for each read-ahead. - */ - unsigned long ras_rpc_size; - /* - * Where next read-ahead should start at. This lies within read-ahead - * window. 
Read-ahead window is read in pieces rather than at once - * because: 1. lustre limits total number of pages under read-ahead by - * ->ra_max_pages (see ll_ra_count_get()), 2. client cannot read pages - * not covered by DLM lock. - */ - unsigned long ras_next_readahead; - /* - * Total number of ll_file_read requests issued, reads originating - * due to mmap are not counted in this total. This value is used to - * trigger full file read-ahead after multiple reads to a small file. - */ - unsigned long ras_requests; - /* - * Page index with respect to the current request, these value - * will not be accurate when dealing with reads issued via mmap. - */ - unsigned long ras_request_index; - /* - * The following 3 items are used for detecting the stride I/O - * mode. - * In stride I/O mode, - * ...............|-----data-----|****gap*****|--------|******|.... - * offset |-stride_pages-|-stride_gap-| - * ras_stride_offset = offset; - * ras_stride_length = stride_pages + stride_gap; - * ras_stride_pages = stride_pages; - * Note: all these three items are counted by pages. - */ - unsigned long ras_stride_length; - unsigned long ras_stride_pages; - pgoff_t ras_stride_offset; - /* - * number of consecutive stride request count, and it is similar as - * ras_consecutive_requests, but used for stride I/O mode. - * Note: only more than 2 consecutive stride request are detected, - * stride read-ahead will be enable - */ - unsigned long ras_consecutive_stride_requests; -}; - -extern struct kmem_cache *ll_file_data_slab; -struct lustre_handle; -struct ll_file_data { - struct ll_readahead_state fd_ras; - struct ll_grouplock fd_grouplock; - __u64 lfd_pos; - __u32 fd_flags; - fmode_t fd_omode; - /* openhandle if lease exists for this file. - * Borrow lli->lli_och_mutex to protect assignment - */ - struct obd_client_handle *fd_lease_och; - struct obd_client_handle *fd_och; - struct file *fd_file; - /* Indicate whether need to report failure when close. 
- * true: failure is known, not report again. - * false: unknown failure, should report. - */ - bool fd_write_failed; - rwlock_t fd_lock; /* protect lcc list */ - struct list_head fd_lccs; /* list of ll_cl_context */ -}; - -extern struct dentry *llite_root; -extern struct kset *llite_kset; - -static inline struct inode *ll_info2i(struct ll_inode_info *lli) -{ - return &lli->lli_vfs_inode; -} - -__u32 ll_i2suppgid(struct inode *i); -void ll_i2gids(__u32 *suppgids, struct inode *i1, struct inode *i2); - -static inline int ll_need_32bit_api(struct ll_sb_info *sbi) -{ -#if BITS_PER_LONG == 32 - return 1; -#elif defined(CONFIG_COMPAT) - return unlikely(in_compat_syscall() || - (sbi->ll_flags & LL_SBI_32BIT_API)); -#else - return unlikely(sbi->ll_flags & LL_SBI_32BIT_API); -#endif -} - -void ll_ras_enter(struct file *f); - -/* llite/lcommon_misc.c */ -int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp); -int cl_ocd_update(struct obd_device *host, - struct obd_device *watched, - enum obd_notify_event ev, void *owner, void *data); -int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock, - struct ll_grouplock *cg); -void cl_put_grouplock(struct ll_grouplock *cg); - -/* llite/lproc_llite.c */ -int ldebugfs_register_mountpoint(struct dentry *parent, - struct super_block *sb, char *osc, char *mdc); -void ldebugfs_unregister_mountpoint(struct ll_sb_info *sbi); -void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count); -void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars); -void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, - struct ll_file_data *file, loff_t pos, - size_t count, int rw); - -enum { - LPROC_LL_DIRTY_HITS, - LPROC_LL_DIRTY_MISSES, - LPROC_LL_READ_BYTES, - LPROC_LL_WRITE_BYTES, - LPROC_LL_BRW_READ, - LPROC_LL_BRW_WRITE, - LPROC_LL_IOCTL, - LPROC_LL_OPEN, - LPROC_LL_RELEASE, - LPROC_LL_MAP, - LPROC_LL_LLSEEK, - LPROC_LL_FSYNC, - LPROC_LL_READDIR, - LPROC_LL_SETATTR, - LPROC_LL_TRUNC, - 
LPROC_LL_FLOCK, - LPROC_LL_GETATTR, - LPROC_LL_CREATE, - LPROC_LL_LINK, - LPROC_LL_UNLINK, - LPROC_LL_SYMLINK, - LPROC_LL_MKDIR, - LPROC_LL_RMDIR, - LPROC_LL_MKNOD, - LPROC_LL_RENAME, - LPROC_LL_STAFS, - LPROC_LL_ALLOC_INODE, - LPROC_LL_SETXATTR, - LPROC_LL_GETXATTR, - LPROC_LL_GETXATTR_HITS, - LPROC_LL_LISTXATTR, - LPROC_LL_REMOVEXATTR, - LPROC_LL_INODE_PERM, - LPROC_LL_FILE_OPCODES -}; - -/* llite/dir.c */ -extern const struct file_operations ll_dir_operations; -extern const struct inode_operations ll_dir_inode_operations; -int ll_dir_read(struct inode *inode, __u64 *ppos, struct md_op_data *op_data, - struct dir_context *ctx); -int ll_get_mdt_idx(struct inode *inode); -int ll_get_mdt_idx_by_fid(struct ll_sb_info *sbi, const struct lu_fid *fid); -struct page *ll_get_dir_page(struct inode *dir, struct md_op_data *op_data, - __u64 offset); -void ll_release_page(struct inode *inode, struct page *page, bool remove); - -/* llite/namei.c */ -extern const struct inode_operations ll_special_inode_operations; - -struct inode *ll_iget(struct super_block *sb, ino_t hash, - struct lustre_md *lic); -int ll_test_inode_by_fid(struct inode *inode, void *opaque); -int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, - void *data, int flag); -struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de); -void ll_update_times(struct ptlrpc_request *request, struct inode *inode); - -/* llite/rw.c */ -int ll_writepage(struct page *page, struct writeback_control *wbc); -int ll_writepages(struct address_space *mapping, struct writeback_control *wbc); -int ll_readpage(struct file *file, struct page *page); -void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras); -int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io); -struct ll_cl_context *ll_cl_find(struct file *file); -void ll_cl_add(struct file *file, const struct lu_env *env, struct cl_io *io); -void ll_cl_remove(struct file *file, const struct lu_env *env); - 
-extern const struct address_space_operations ll_aops; - -/* llite/file.c */ -extern const struct file_operations ll_file_operations; -extern const struct file_operations ll_file_operations_flock; -extern const struct file_operations ll_file_operations_noflock; -extern const struct inode_operations ll_file_inode_operations; -int ll_have_md_lock(struct inode *inode, __u64 *bits, - enum ldlm_mode l_req_mode); -enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits, - struct lustre_handle *lockh, __u64 flags, - enum ldlm_mode mode); -int ll_file_open(struct inode *inode, struct file *file); -int ll_file_release(struct inode *inode, struct file *file); -int ll_release_openhandle(struct inode *inode, struct lookup_intent *it); -int ll_md_real_close(struct inode *inode, fmode_t fmode); -int ll_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags); -struct posix_acl *ll_get_acl(struct inode *inode, int type); -int ll_migrate(struct inode *parent, struct file *file, int mdtidx, - const char *name, int namelen); -int ll_get_fid_by_name(struct inode *parent, const char *name, - int namelen, struct lu_fid *fid, struct inode **inode); -int ll_inode_permission(struct inode *inode, int mask); - -int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry, - __u64 flags, struct lov_user_md *lum, - int lum_size); -int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, - struct lov_mds_md **lmm, int *lmm_size, - struct ptlrpc_request **request); -int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, - int set_default); -int ll_dir_getstripe(struct inode *inode, void **lmmp, int *lmm_size, - struct ptlrpc_request **request, u64 valid); -int ll_fsync(struct file *file, loff_t start, loff_t end, int data); -int ll_merge_attr(const struct lu_env *env, struct inode *inode); -int ll_fid2path(struct inode *inode, void __user *arg); -int ll_data_version(struct inode *inode, __u64 *data_version, int 
flags); -int ll_hsm_release(struct inode *inode); -int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss); - -/* llite/dcache.c */ - -extern const struct dentry_operations ll_d_ops; -void ll_intent_drop_lock(struct lookup_intent *it); -void ll_intent_release(struct lookup_intent *it); -void ll_invalidate_aliases(struct inode *inode); -void ll_lookup_finish_locks(struct lookup_intent *it, struct inode *inode); -int ll_revalidate_it_finish(struct ptlrpc_request *request, - struct lookup_intent *it, struct inode *inode); - -/* llite/llite_lib.c */ -extern struct super_operations lustre_super_operations; - -void ll_lli_init(struct ll_inode_info *lli); -int ll_fill_super(struct super_block *sb); -void ll_put_super(struct super_block *sb); -void ll_kill_super(struct super_block *sb); -struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock); -void ll_dir_clear_lsm_md(struct inode *inode); -void ll_clear_inode(struct inode *inode); -int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import); -int ll_setattr(struct dentry *de, struct iattr *attr); -int ll_statfs(struct dentry *de, struct kstatfs *sfs); -int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs, - __u64 max_age, __u32 flags); -int ll_update_inode(struct inode *inode, struct lustre_md *md); -int ll_read_inode2(struct inode *inode, void *opaque); -void ll_delete_inode(struct inode *inode); -int ll_iocontrol(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg); -int ll_flush_ctx(struct inode *inode); -void ll_umount_begin(struct super_block *sb); -int ll_remount_fs(struct super_block *sb, int *flags, char *data); -int ll_show_options(struct seq_file *seq, struct dentry *dentry); -void ll_dirty_page_discard_warn(struct page *page, int ioret); -int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req, - struct super_block *sb, struct lookup_intent *it); -int ll_obd_statfs(struct inode *inode, void __user *arg); -int 
ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize); -int ll_get_default_mdsize(struct ll_sb_info *sbi, int *default_mdsize); -int ll_set_default_mdsize(struct ll_sb_info *sbi, int default_mdsize); -int ll_process_config(struct lustre_cfg *lcfg); - -enum { - LUSTRE_OPC_MKDIR = 0, - LUSTRE_OPC_SYMLINK = 1, - LUSTRE_OPC_MKNOD = 2, - LUSTRE_OPC_CREATE = 3, - LUSTRE_OPC_ANY = 5, -}; - -struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, - struct inode *i1, struct inode *i2, - const char *name, size_t namelen, - u32 mode, __u32 opc, void *data); -void ll_finish_md_op_data(struct md_op_data *op_data); -int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg); -char *ll_get_fsname(struct super_block *sb, char *buf, int buflen); -void ll_compute_rootsquash_state(struct ll_sb_info *sbi); -void ll_open_cleanup(struct super_block *sb, struct ptlrpc_request *open_req); -ssize_t ll_copy_user_md(const struct lov_user_md __user *md, - struct lov_user_md **kbuf); - -/* Compute expected user md size when passing in a md from user space */ -static inline ssize_t ll_lov_user_md_size(const struct lov_user_md *lum) -{ - switch (lum->lmm_magic) { - case LOV_USER_MAGIC_V1: - return sizeof(struct lov_user_md_v1); - case LOV_USER_MAGIC_V3: - return sizeof(struct lov_user_md_v3); - case LOV_USER_MAGIC_SPECIFIC: - if (lum->lmm_stripe_count > LOV_MAX_STRIPE_COUNT) - return -EINVAL; - - return lov_user_md_size(lum->lmm_stripe_count, - LOV_USER_MAGIC_SPECIFIC); - } - return -EINVAL; -} - -/* llite/llite_nfs.c */ -extern const struct export_operations lustre_export_operations; -__u32 get_uuid2int(const char *name, int len); -void get_uuid2fsid(const char *name, int len, __kernel_fsid_t *fsid); -struct inode *search_inode_for_lustre(struct super_block *sb, - const struct lu_fid *fid); -int ll_dir_get_parent_fid(struct inode *dir, struct lu_fid *parent_fid); - -/* llite/symlink.c */ -extern const struct inode_operations ll_fast_symlink_inode_operations; - 
-/** - * IO arguments for various VFS I/O interfaces. - */ -struct vvp_io_args { - /** normal/splice */ - union { - struct { - struct kiocb *via_iocb; - struct iov_iter *via_iter; - } normal; - } u; -}; - -struct ll_cl_context { - struct list_head lcc_list; - void *lcc_cookie; - const struct lu_env *lcc_env; - struct cl_io *lcc_io; - struct cl_page *lcc_page; -}; - -struct ll_thread_info { - struct vvp_io_args lti_args; - struct ra_io_arg lti_ria; - struct ll_cl_context lti_io_ctx; -}; - -extern struct lu_context_key ll_thread_key; -static inline struct ll_thread_info *ll_env_info(const struct lu_env *env) -{ - struct ll_thread_info *lti; - - lti = lu_context_key_get(&env->le_ctx, &ll_thread_key); - LASSERT(lti); - return lti; -} - -static inline struct vvp_io_args *ll_env_args(const struct lu_env *env) -{ - return &ll_env_info(env)->lti_args; -} - -/* llite/llite_mmap.c */ - -int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last); -int ll_file_mmap(struct file *file, struct vm_area_struct *vma); -void policy_from_vma(union ldlm_policy_data *policy, struct vm_area_struct *vma, - unsigned long addr, size_t count); -struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr, - size_t count); - -static inline void ll_invalidate_page(struct page *vmpage) -{ - struct address_space *mapping = vmpage->mapping; - loff_t offset = vmpage->index << PAGE_SHIFT; - - LASSERT(PageLocked(vmpage)); - if (!mapping) - return; - - /* - * truncate_complete_page() calls - * a_ops->invalidatepage()->cl_page_delete()->vvp_page_delete(). 
- */ - ll_teardown_mmaps(mapping, offset, offset + PAGE_SIZE); - truncate_complete_page(mapping, vmpage); -} - -#define ll_s2sbi(sb) (s2lsi(sb)->lsi_llsbi) - -/* don't need an addref as the sb_info should be holding one */ -static inline struct obd_export *ll_s2dtexp(struct super_block *sb) -{ - return ll_s2sbi(sb)->ll_dt_exp; -} - -/* don't need an addref as the sb_info should be holding one */ -static inline struct obd_export *ll_s2mdexp(struct super_block *sb) -{ - return ll_s2sbi(sb)->ll_md_exp; -} - -static inline struct client_obd *sbi2mdc(struct ll_sb_info *sbi) -{ - struct obd_device *obd = sbi->ll_md_exp->exp_obd; - - if (!obd) - LBUG(); - return &obd->u.cli; -} - -/* FIXME: replace the name of this with LL_SB to conform to kernel stuff */ -static inline struct ll_sb_info *ll_i2sbi(struct inode *inode) -{ - return ll_s2sbi(inode->i_sb); -} - -static inline struct obd_export *ll_i2dtexp(struct inode *inode) -{ - return ll_s2dtexp(inode->i_sb); -} - -static inline struct obd_export *ll_i2mdexp(struct inode *inode) -{ - return ll_s2mdexp(inode->i_sb); -} - -static inline struct lu_fid *ll_inode2fid(struct inode *inode) -{ - struct lu_fid *fid; - - LASSERT(inode); - fid = &ll_i2info(inode)->lli_fid; - - return fid; -} - -static inline loff_t ll_file_maxbytes(struct inode *inode) -{ - struct cl_object *obj = ll_i2info(inode)->lli_clob; - - if (!obj) - return MAX_LFS_FILESIZE; - - return min_t(loff_t, cl_object_maxbytes(obj), MAX_LFS_FILESIZE); -} - -/* llite/xattr.c */ -extern const struct xattr_handler *ll_xattr_handlers[]; - -#define XATTR_USER_T 1 -#define XATTR_TRUSTED_T 2 -#define XATTR_SECURITY_T 3 -#define XATTR_ACL_ACCESS_T 4 -#define XATTR_ACL_DEFAULT_T 5 -#define XATTR_LUSTRE_T 6 -#define XATTR_OTHER_T 7 - -ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size); -int ll_xattr_list(struct inode *inode, const char *name, int type, - void *buffer, size_t size, __u64 valid); -const struct xattr_handler *get_xattr_type(const char *name); - 
-/** - * Common IO arguments for various VFS I/O interfaces. - */ -int cl_sb_init(struct super_block *sb); -int cl_sb_fini(struct super_block *sb); - -enum ras_update_flags { - LL_RAS_HIT = 0x1, - LL_RAS_MMAP = 0x2 -}; -void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len); -void ll_ra_stats_inc(struct inode *inode, enum ra_stat which); - -/* statahead.c */ -#define LL_SA_RPC_MIN 2 -#define LL_SA_RPC_DEF 32 -#define LL_SA_RPC_MAX 8192 - -#define LL_SA_CACHE_BIT 5 -#define LL_SA_CACHE_SIZE (1 << LL_SA_CACHE_BIT) -#define LL_SA_CACHE_MASK (LL_SA_CACHE_SIZE - 1) - -/* per inode struct, for dir only */ -struct ll_statahead_info { - struct dentry *sai_dentry; - atomic_t sai_refcount; /* when access this struct, hold - * refcount - */ - unsigned int sai_max; /* max ahead of lookup */ - __u64 sai_sent; /* stat requests sent count */ - __u64 sai_replied; /* stat requests which received - * reply - */ - __u64 sai_index; /* index of statahead entry */ - __u64 sai_index_wait; /* index of entry which is the - * caller is waiting for - */ - __u64 sai_hit; /* hit count */ - __u64 sai_miss; /* miss count: - * for "ls -al" case, it includes - * hidden dentry miss; - * for "ls -l" case, it does not - * include hidden dentry miss. - * "sai_miss_hidden" is used for - * the later case. 
- */ - unsigned int sai_consecutive_miss; /* consecutive miss */ - unsigned int sai_miss_hidden;/* "ls -al", but first dentry - * is not a hidden one - */ - unsigned int sai_skip_hidden;/* skipped hidden dentry count */ - unsigned int sai_ls_all:1, /* "ls -al", do stat-ahead for - * hidden entries - */ - sai_agl_valid:1,/* AGL is valid for the dir */ - sai_in_readpage:1;/* statahead in readdir() */ - wait_queue_head_t sai_waitq; /* stat-ahead wait queue */ - struct task_struct *sai_task; /* stat-ahead thread */ - struct task_struct *sai_agl_task; /* AGL thread */ - struct list_head sai_interim_entries; /* entries which got async - * stat reply, but not - * instantiated - */ - struct list_head sai_entries; /* completed entries */ - struct list_head sai_agls; /* AGLs to be sent */ - struct list_head sai_cache[LL_SA_CACHE_SIZE]; - spinlock_t sai_cache_lock[LL_SA_CACHE_SIZE]; - atomic_t sai_cache_count; /* entry count in cache */ -}; - -int ll_statahead(struct inode *dir, struct dentry **dentry, bool unplug); -void ll_authorize_statahead(struct inode *dir, void *key); -void ll_deauthorize_statahead(struct inode *dir, void *key); - -blkcnt_t dirty_cnt(struct inode *inode); - -int cl_glimpse_size0(struct inode *inode, int agl); -int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, - struct inode *inode, struct cl_object *clob, int agl); - -static inline int cl_glimpse_size(struct inode *inode) -{ - return cl_glimpse_size0(inode, 0); -} - -static inline int cl_agl(struct inode *inode) -{ - return cl_glimpse_size0(inode, 1); -} - -static inline int ll_glimpse_size(struct inode *inode) -{ - struct ll_inode_info *lli = ll_i2info(inode); - int rc; - - down_read(&lli->lli_glimpse_sem); - rc = cl_glimpse_size(inode); - lli->lli_glimpse_time = cfs_time_current(); - up_read(&lli->lli_glimpse_sem); - return rc; -} - -/* - * dentry may statahead when statahead is enabled and current process has opened - * parent directory, and this dentry hasn't accessed statahead cache 
before - */ -static inline bool -dentry_may_statahead(struct inode *dir, struct dentry *dentry) -{ - struct ll_inode_info *lli; - struct ll_dentry_data *ldd; - - if (ll_i2sbi(dir)->ll_sa_max == 0) - return false; - - lli = ll_i2info(dir); - - /* - * statahead is not allowed for this dir, there may be three causes: - * 1. dir is not opened. - * 2. statahead hit ratio is too low. - * 3. previous stat started statahead thread failed. - */ - if (!lli->lli_sa_enabled) - return false; - - /* not the same process, don't statahead */ - if (lli->lli_opendir_pid != current_pid()) - return false; - - /* - * When stating a dentry, kernel may trigger 'revalidate' or 'lookup' - * multiple times, eg. for 'getattr', 'getxattr' and etc. - * For patchless client, lookup intent is not accurate, which may - * misguide statahead. For example: - * The 'revalidate' call for 'getattr' and 'getxattr' of a dentry will - * have the same intent -- IT_GETATTR, while one dentry should access - * statahead cache once, otherwise statahead windows is messed up. - * The solution is as following: - * Assign 'lld_sa_generation' with 'lli_sa_generation' when a dentry - * IT_GETATTR for the first time, and subsequent IT_GETATTR will - * bypass interacting with statahead cache by checking - * 'lld_sa_generation == lli->lli_sa_generation'. - */ - ldd = ll_d2d(dentry); - if (ldd->lld_sa_generation == lli->lli_sa_generation) - return false; - - return true; -} - -/* llite ioctl register support routine */ -enum llioc_iter { - LLIOC_CONT = 0, - LLIOC_STOP -}; - -#define LLIOC_MAX_CMD 256 - -/* - * Rules to write a callback function: - * - * Parameters: - * @magic: Dynamic ioctl call routine will feed this value with the pointer - * returned to ll_iocontrol_register. Callback functions should use this - * data to check the potential collasion of ioctl cmd. If collasion is - * found, callback function should return LLIOC_CONT. - * @rcp: The result of ioctl command. 
- * - * Return values: - * If @magic matches the pointer returned by ll_iocontrol_data, the - * callback should return LLIOC_STOP; return LLIOC_STOP otherwise. - */ -typedef enum llioc_iter (*llioc_callback_t)(struct inode *inode, - struct file *file, unsigned int cmd, unsigned long arg, - void *magic, int *rcp); - -/* export functions */ -/* Register ioctl block dynamatically for a regular file. - * - * @cmd: the array of ioctl command set - * @count: number of commands in the @cmd - * @cb: callback function, it will be called if an ioctl command is found to - * belong to the command list @cmd. - * - * Return value: - * A magic pointer will be returned if success; - * otherwise, NULL will be returned. - */ -void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd); -void ll_iocontrol_unregister(void *magic); - -int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end, - enum cl_fsync_mode mode, int ignore_layout); - -/** direct write pages */ -struct ll_dio_pages { - /** page array to be written. we don't support - * partial pages except the last one. - */ - struct page **ldp_pages; - /* offset of each page */ - loff_t *ldp_offsets; - /** if ldp_offsets is NULL, it means a sequential - * pages to be written, then this is the file offset - * of the first page. - */ - loff_t ldp_start_offset; - /** how many bytes are to be written. */ - size_t ldp_size; - /** # of pages in the array. 
*/ - int ldp_nr; -}; - -ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io, - int rw, struct inode *inode, - struct ll_dio_pages *pv); - -static inline int ll_file_nolock(const struct file *file) -{ - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - struct inode *inode = file_inode(file); - - return ((fd->fd_flags & LL_FILE_IGNORE_LOCK) || - (ll_i2sbi(inode)->ll_flags & LL_SBI_NOLCK)); -} - -static inline void ll_set_lock_data(struct obd_export *exp, struct inode *inode, - struct lookup_intent *it, __u64 *bits) -{ - if (!it->it_lock_set) { - struct lustre_handle handle; - - /* If this inode is a remote object, it will get two - * separate locks in different namespaces, Master MDT, - * where the name entry is, will grant LOOKUP lock, - * remote MDT, where the object is, will grant - * UPDATE|PERM lock. The inode will be attached to both - * LOOKUP and PERM locks, so revoking either locks will - * case the dcache being cleared - */ - if (it->it_remote_lock_mode) { - handle.cookie = it->it_remote_lock_handle; - CDEBUG(D_DLMTRACE, "setting l_data to inode " DFID "%p for remote lock %#llx\n", - PFID(ll_inode2fid(inode)), inode, - handle.cookie); - md_set_lock_data(exp, &handle, inode, NULL); - } - - handle.cookie = it->it_lock_handle; - - CDEBUG(D_DLMTRACE, - "setting l_data to inode " DFID "%p for lock %#llx\n", - PFID(ll_inode2fid(inode)), inode, handle.cookie); - - md_set_lock_data(exp, &handle, inode, &it->it_lock_bits); - it->it_lock_set = 1; - } - - if (bits) - *bits = it->it_lock_bits; -} - -static inline int d_lustre_invalid(const struct dentry *dentry) -{ - return ll_d2d(dentry)->lld_invalid; -} - -/* - * Mark dentry INVALID, if dentry refcount is zero (this is normally case for - * ll_md_blocking_ast), unhash this dentry, and let dcache to reclaim it later; - * else dput() of the last refcount will unhash this dentry and kill it. 
- */ -static inline void d_lustre_invalidate(struct dentry *dentry, int nested) -{ - CDEBUG(D_DENTRY, - "invalidate dentry %pd (%p) parent %p inode %p refc %d\n", - dentry, dentry, - dentry->d_parent, d_inode(dentry), d_count(dentry)); - - spin_lock_nested(&dentry->d_lock, - nested ? DENTRY_D_LOCK_NESTED : DENTRY_D_LOCK_NORMAL); - ll_d2d(dentry)->lld_invalid = 1; - if (d_count(dentry) == 0) - __d_drop(dentry); - spin_unlock(&dentry->d_lock); -} - -static inline void d_lustre_revalidate(struct dentry *dentry) -{ - spin_lock(&dentry->d_lock); - LASSERT(ll_d2d(dentry)); - ll_d2d(dentry)->lld_invalid = 0; - spin_unlock(&dentry->d_lock); -} - -int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf); -int ll_layout_refresh(struct inode *inode, __u32 *gen); -int ll_layout_restore(struct inode *inode, loff_t start, __u64 length); - -int ll_xattr_init(void); -void ll_xattr_fini(void); - -int ll_page_sync_io(const struct lu_env *env, struct cl_io *io, - struct cl_page *page, enum cl_req_type crt); - -int ll_getparent(struct file *file, struct getparent __user *arg); - -/* lcommon_cl.c */ -int cl_setattr_ost(struct cl_object *obj, const struct iattr *attr, - unsigned int attr_flags); - -extern struct lu_env *cl_inode_fini_env; -extern u16 cl_inode_fini_refcheck; - -int cl_file_inode_init(struct inode *inode, struct lustre_md *md); -void cl_inode_fini(struct inode *inode); - -__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32); -__u32 cl_fid_build_gen(const struct lu_fid *fid); - -#endif /* LLITE_INTERNAL_H */ diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c deleted file mode 100644 index e7500c53fafc..000000000000 --- a/drivers/staging/lustre/lustre/llite/llite_lib.c +++ /dev/null @@ -1,2666 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * lustre/llite/llite_lib.c - * - * Lustre Light Super operations - */ - -#define DEBUG_SUBSYSTEM S_LLITE - -#include <linux/module.h> -#include <linux/statfs.h> -#include <linux/types.h> -#include <linux/mm.h> - -#include <uapi/linux/lustre/lustre_ioctl.h> -#include <lustre_ha.h> -#include <lustre_dlm.h> -#include <lprocfs_status.h> -#include <lustre_disk.h> -#include <uapi/linux/lustre/lustre_param.h> -#include <lustre_log.h> -#include <cl_object.h> -#include <obd_cksum.h> -#include "llite_internal.h" - -struct kmem_cache *ll_file_data_slab; -struct dentry *llite_root; -struct kset *llite_kset; - -#ifndef log2 -#define log2(n) ffz(~(n)) -#endif - -static struct ll_sb_info *ll_init_sbi(struct super_block *sb) -{ - struct ll_sb_info *sbi = NULL; - unsigned long pages; - unsigned long lru_page_max; - struct sysinfo si; - class_uuid_t uuid; - int i; - - sbi = kzalloc(sizeof(*sbi), GFP_NOFS); - if (!sbi) - return NULL; - - spin_lock_init(&sbi->ll_lock); - mutex_init(&sbi->ll_lco.lco_lock); - spin_lock_init(&sbi->ll_pp_extent_lock); - spin_lock_init(&sbi->ll_process_lock); - sbi->ll_rw_stats_on = 0; - - si_meminfo(&si); - pages = si.totalram - si.totalhigh; - lru_page_max = pages / 2; - - sbi->ll_cache = cl_cache_init(lru_page_max); - if (!sbi->ll_cache) { - kfree(sbi); - return NULL; - } - - sbi->ll_ra_info.ra_max_pages_per_file = min(pages / 32, - SBI_DEFAULT_READAHEAD_MAX); - sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file; - sbi->ll_ra_info.ra_max_read_ahead_whole_pages = - SBI_DEFAULT_READAHEAD_WHOLE_MAX; - - ll_generate_random_uuid(uuid); - class_uuid_unparse(uuid, &sbi->ll_sb_uuid); - CDEBUG(D_CONFIG, "generated uuid: %s\n", sbi->ll_sb_uuid.uuid); - - sbi->ll_flags |= LL_SBI_VERBOSE; - sbi->ll_flags |= LL_SBI_CHECKSUM; - - sbi->ll_flags |= LL_SBI_LRU_RESIZE; - sbi->ll_flags |= LL_SBI_LAZYSTATFS; - - for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) { - spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i]. 
- pp_r_hist.oh_lock); - spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i]. - pp_w_hist.oh_lock); - } - - /* metadata statahead is enabled by default */ - sbi->ll_sa_max = LL_SA_RPC_DEF; - atomic_set(&sbi->ll_sa_total, 0); - atomic_set(&sbi->ll_sa_wrong, 0); - atomic_set(&sbi->ll_sa_running, 0); - atomic_set(&sbi->ll_agl_total, 0); - sbi->ll_flags |= LL_SBI_AGL_ENABLED; - - /* root squash */ - sbi->ll_squash.rsi_uid = 0; - sbi->ll_squash.rsi_gid = 0; - INIT_LIST_HEAD(&sbi->ll_squash.rsi_nosquash_nids); - init_rwsem(&sbi->ll_squash.rsi_sem); - - sbi->ll_sb = sb; - - return sbi; -} - -static void ll_free_sbi(struct super_block *sb) -{ - struct ll_sb_info *sbi = ll_s2sbi(sb); - - if (sbi->ll_cache) { - if (!list_empty(&sbi->ll_squash.rsi_nosquash_nids)) - cfs_free_nidlist(&sbi->ll_squash.rsi_nosquash_nids); - cl_cache_decref(sbi->ll_cache); - sbi->ll_cache = NULL; - } - - kfree(sbi); -} - -static int client_common_fill_super(struct super_block *sb, char *md, char *dt) -{ - struct inode *root = NULL; - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct obd_device *obd; - struct obd_statfs *osfs = NULL; - struct ptlrpc_request *request = NULL; - struct obd_connect_data *data = NULL; - struct obd_uuid *uuid; - struct md_op_data *op_data; - struct lustre_md lmd; - u64 valid; - int size, err, checksum; - - obd = class_name2obd(md); - if (!obd) { - CERROR("MD %s: not setup or attached\n", md); - return -EINVAL; - } - - data = kzalloc(sizeof(*data), GFP_NOFS); - if (!data) - return -ENOMEM; - - osfs = kzalloc(sizeof(*osfs), GFP_NOFS); - if (!osfs) { - kfree(data); - return -ENOMEM; - } - - /* indicate the features supported by this client */ - data->ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH | - OBD_CONNECT_ATTRFID | - OBD_CONNECT_VERSION | OBD_CONNECT_BRW_SIZE | - OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | - OBD_CONNECT_AT | OBD_CONNECT_LOV_V3 | - OBD_CONNECT_VBR | OBD_CONNECT_FULL20 | - OBD_CONNECT_64BITHASH | - OBD_CONNECT_EINPROGRESS | - 
OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE | - OBD_CONNECT_LAYOUTLOCK | - OBD_CONNECT_PINGLESS | - OBD_CONNECT_MAX_EASIZE | - OBD_CONNECT_FLOCK_DEAD | - OBD_CONNECT_DISP_STRIPE | OBD_CONNECT_LFSCK | - OBD_CONNECT_OPEN_BY_FID | - OBD_CONNECT_DIR_STRIPE | - OBD_CONNECT_BULK_MBITS; - - if (sbi->ll_flags & LL_SBI_LRU_RESIZE) - data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; -#ifdef CONFIG_FS_POSIX_ACL - data->ocd_connect_flags |= OBD_CONNECT_ACL | OBD_CONNECT_UMASK; -#endif - - if (OBD_FAIL_CHECK(OBD_FAIL_MDC_LIGHTWEIGHT)) - /* flag mdc connection as lightweight, only used for test - * purpose, use with care - */ - data->ocd_connect_flags |= OBD_CONNECT_LIGHTWEIGHT; - - data->ocd_ibits_known = MDS_INODELOCK_FULL; - data->ocd_version = LUSTRE_VERSION_CODE; - - if (sb_rdonly(sb)) - data->ocd_connect_flags |= OBD_CONNECT_RDONLY; - if (sbi->ll_flags & LL_SBI_USER_XATTR) - data->ocd_connect_flags |= OBD_CONNECT_XATTR; - - if (sbi->ll_flags & LL_SBI_FLOCK) - sbi->ll_fop = &ll_file_operations_flock; - else if (sbi->ll_flags & LL_SBI_LOCALFLOCK) - sbi->ll_fop = &ll_file_operations; - else - sbi->ll_fop = &ll_file_operations_noflock; - - /* always ping even if server suppress_pings */ - if (sbi->ll_flags & LL_SBI_ALWAYS_PING) - data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS; - - data->ocd_brw_size = MD_MAX_BRW_SIZE; - - err = obd_connect(NULL, &sbi->ll_md_exp, obd, &sbi->ll_sb_uuid, - data, NULL); - if (err == -EBUSY) { - LCONSOLE_ERROR_MSG(0x14f, - "An MDT (md %s) is performing recovery, of which this client is not a part. 
Please wait for recovery to complete, abort, or time out.\n", - md); - goto out; - } - - if (err) { - CERROR("cannot connect to %s: rc = %d\n", md, err); - goto out; - } - - sbi->ll_md_exp->exp_connect_data = *data; - - err = obd_fid_init(sbi->ll_md_exp->exp_obd, sbi->ll_md_exp, - LUSTRE_SEQ_METADATA); - if (err) { - CERROR("%s: Can't init metadata layer FID infrastructure, rc = %d\n", - sbi->ll_md_exp->exp_obd->obd_name, err); - goto out_md; - } - - /* For mount, we only need fs info from MDT0, and also in DNE, it - * can make sure the client can be mounted as long as MDT0 is - * available - */ - err = obd_statfs(NULL, sbi->ll_md_exp, osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_FOR_MDT0); - if (err) - goto out_md_fid; - - /* This needs to be after statfs to ensure connect has finished. - * Note that "data" does NOT contain the valid connect reply. - * If connecting to a 1.8 server there will be no LMV device, so - * we can access the MDC export directly and exp_connect_flags will - * be non-zero, but if accessing an upgraded 2.1 server it will - * have the correct flags filled in. - * XXX: fill in the LMV exp_connect_flags from MDC(s). - */ - valid = exp_connect_flags(sbi->ll_md_exp) & CLIENT_CONNECT_MDT_REQD; - if (exp_connect_flags(sbi->ll_md_exp) != 0 && - valid != CLIENT_CONNECT_MDT_REQD) { - char *buf; - - buf = kzalloc(PAGE_SIZE, GFP_KERNEL); - if (!buf) { - err = -ENOMEM; - goto out_md_fid; - } - obd_connect_flags2str(buf, PAGE_SIZE, - valid ^ CLIENT_CONNECT_MDT_REQD, ","); - LCONSOLE_ERROR_MSG(0x170, - "Server %s does not support feature(s) needed for correct operation of this client (%s). 
Please upgrade server or downgrade client.\n", - sbi->ll_md_exp->exp_obd->obd_name, buf); - kfree(buf); - err = -EPROTO; - goto out_md_fid; - } - - size = sizeof(*data); - err = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_CONN_DATA), - KEY_CONN_DATA, &size, data); - if (err) { - CERROR("%s: Get connect data failed: rc = %d\n", - sbi->ll_md_exp->exp_obd->obd_name, err); - goto out_md_fid; - } - - LASSERT(osfs->os_bsize); - sb->s_blocksize = osfs->os_bsize; - sb->s_blocksize_bits = log2(osfs->os_bsize); - sb->s_magic = LL_SUPER_MAGIC; - sb->s_maxbytes = MAX_LFS_FILESIZE; - sbi->ll_namelen = osfs->os_namelen; - sbi->ll_mnt.mnt = current->fs->root.mnt; - - if ((sbi->ll_flags & LL_SBI_USER_XATTR) && - !(data->ocd_connect_flags & OBD_CONNECT_XATTR)) { - LCONSOLE_INFO("Disabling user_xattr feature because it is not supported on the server\n"); - sbi->ll_flags &= ~LL_SBI_USER_XATTR; - } - - if (data->ocd_connect_flags & OBD_CONNECT_ACL) { - sb->s_flags |= SB_POSIXACL; - sbi->ll_flags |= LL_SBI_ACL; - } else { - LCONSOLE_INFO("client wants to enable acl, but mdt not!\n"); - sb->s_flags &= ~SB_POSIXACL; - sbi->ll_flags &= ~LL_SBI_ACL; - } - - if (data->ocd_connect_flags & OBD_CONNECT_64BITHASH) - sbi->ll_flags |= LL_SBI_64BIT_HASH; - - if (data->ocd_connect_flags & OBD_CONNECT_BRW_SIZE) - sbi->ll_md_brw_pages = data->ocd_brw_size >> PAGE_SHIFT; - else - sbi->ll_md_brw_pages = 1; - - if (data->ocd_connect_flags & OBD_CONNECT_LAYOUTLOCK) - sbi->ll_flags |= LL_SBI_LAYOUT_LOCK; - - if (data->ocd_ibits_known & MDS_INODELOCK_XATTR) { - if (!(data->ocd_connect_flags & OBD_CONNECT_MAX_EASIZE)) { - LCONSOLE_INFO( - "%s: disabling xattr cache due to unknown maximum xattr size.\n", - dt); - } else { - sbi->ll_flags |= LL_SBI_XATTR_CACHE; - sbi->ll_xattr_cache_enabled = 1; - } - } - - obd = class_name2obd(dt); - if (!obd) { - CERROR("DT %s: not setup or attached\n", dt); - err = -ENODEV; - goto out_md_fid; - } - - data->ocd_connect_flags = OBD_CONNECT_GRANT | OBD_CONNECT_VERSION | - 
OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE | - OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | - OBD_CONNECT_SRVLOCK | OBD_CONNECT_TRUNCLOCK| - OBD_CONNECT_AT | OBD_CONNECT_OSS_CAPA | - OBD_CONNECT_VBR | OBD_CONNECT_FULL20 | - OBD_CONNECT_64BITHASH | OBD_CONNECT_MAXBYTES | - OBD_CONNECT_EINPROGRESS | - OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE | - OBD_CONNECT_LAYOUTLOCK | - OBD_CONNECT_PINGLESS | OBD_CONNECT_LFSCK | - OBD_CONNECT_BULK_MBITS; - - if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_CKSUM)) { - /* OBD_CONNECT_CKSUM should always be set, even if checksums are - * disabled by default, because it can still be enabled on the - * fly via /sys. As a consequence, we still need to come to an - * agreement on the supported algorithms at connect time - */ - data->ocd_connect_flags |= OBD_CONNECT_CKSUM; - - if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CKSUM_ADLER_ONLY)) - data->ocd_cksum_types = OBD_CKSUM_ADLER; - else - data->ocd_cksum_types = cksum_types_supported_client(); - } - - data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; - - /* always ping even if server suppress_pings */ - if (sbi->ll_flags & LL_SBI_ALWAYS_PING) - data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS; - - CDEBUG(D_RPCTRACE, - "ocd_connect_flags: %#llx ocd_version: %d ocd_grant: %d\n", - data->ocd_connect_flags, - data->ocd_version, data->ocd_grant); - - obd->obd_upcall.onu_owner = &sbi->ll_lco; - obd->obd_upcall.onu_upcall = cl_ocd_update; - - data->ocd_brw_size = DT_MAX_BRW_SIZE; - - err = obd_connect(NULL, &sbi->ll_dt_exp, obd, &sbi->ll_sb_uuid, data, - NULL); - if (err == -EBUSY) { - LCONSOLE_ERROR_MSG(0x150, - "An OST (dt %s) is performing recovery, of which this client is not a part. 
Please wait for recovery to complete, abort, or time out.\n", - dt); - goto out_md; - } else if (err) { - CERROR("%s: Cannot connect to %s: rc = %d\n", - sbi->ll_dt_exp->exp_obd->obd_name, dt, err); - goto out_md; - } - - sbi->ll_dt_exp->exp_connect_data = *data; - - err = obd_fid_init(sbi->ll_dt_exp->exp_obd, sbi->ll_dt_exp, - LUSTRE_SEQ_METADATA); - if (err) { - CERROR("%s: Can't init data layer FID infrastructure, rc = %d\n", - sbi->ll_dt_exp->exp_obd->obd_name, err); - goto out_dt; - } - - mutex_lock(&sbi->ll_lco.lco_lock); - sbi->ll_lco.lco_flags = data->ocd_connect_flags; - sbi->ll_lco.lco_md_exp = sbi->ll_md_exp; - sbi->ll_lco.lco_dt_exp = sbi->ll_dt_exp; - mutex_unlock(&sbi->ll_lco.lco_lock); - - fid_zero(&sbi->ll_root_fid); - err = md_getstatus(sbi->ll_md_exp, &sbi->ll_root_fid); - if (err) { - CERROR("cannot mds_connect: rc = %d\n", err); - goto out_lock_cn_cb; - } - if (!fid_is_sane(&sbi->ll_root_fid)) { - CERROR("%s: Invalid root fid " DFID " during mount\n", - sbi->ll_md_exp->exp_obd->obd_name, - PFID(&sbi->ll_root_fid)); - err = -EINVAL; - goto out_lock_cn_cb; - } - CDEBUG(D_SUPER, "rootfid " DFID "\n", PFID(&sbi->ll_root_fid)); - - sb->s_op = &lustre_super_operations; - sb->s_xattr = ll_xattr_handlers; -#if THREAD_SIZE >= 8192 /*b=17630*/ - sb->s_export_op = &lustre_export_operations; -#endif - - /* make root inode - * XXX: move this to after cbd setup? 
- */ - valid = OBD_MD_FLGETATTR | OBD_MD_FLBLOCKS | OBD_MD_FLMODEASIZE; - if (sbi->ll_flags & LL_SBI_ACL) - valid |= OBD_MD_FLACL; - - op_data = kzalloc(sizeof(*op_data), GFP_NOFS); - if (!op_data) { - err = -ENOMEM; - goto out_lock_cn_cb; - } - - op_data->op_fid1 = sbi->ll_root_fid; - op_data->op_mode = 0; - op_data->op_valid = valid; - - err = md_getattr(sbi->ll_md_exp, op_data, &request); - kfree(op_data); - if (err) { - CERROR("%s: md_getattr failed for root: rc = %d\n", - sbi->ll_md_exp->exp_obd->obd_name, err); - goto out_lock_cn_cb; - } - - err = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp, - sbi->ll_md_exp, &lmd); - if (err) { - CERROR("failed to understand root inode md: rc = %d\n", err); - ptlrpc_req_finished(request); - goto out_lock_cn_cb; - } - - LASSERT(fid_is_sane(&sbi->ll_root_fid)); - root = ll_iget(sb, cl_fid_build_ino(&sbi->ll_root_fid, - sbi->ll_flags & LL_SBI_32BIT_API), - &lmd); - md_free_lustre_md(sbi->ll_md_exp, &lmd); - ptlrpc_req_finished(request); - - if (IS_ERR(root)) { -#ifdef CONFIG_FS_POSIX_ACL - if (lmd.posix_acl) { - posix_acl_release(lmd.posix_acl); - lmd.posix_acl = NULL; - } -#endif - err = -EBADF; - CERROR("lustre_lite: bad iget4 for root\n"); - goto out_root; - } - - checksum = sbi->ll_flags & LL_SBI_CHECKSUM; - err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM), - KEY_CHECKSUM, sizeof(checksum), &checksum, - NULL); - if (err) { - CERROR("%s: Set checksum failed: rc = %d\n", - sbi->ll_dt_exp->exp_obd->obd_name, err); - goto out_root; - } - cl_sb_init(sb); - - err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CACHE_SET), - KEY_CACHE_SET, sizeof(*sbi->ll_cache), - sbi->ll_cache, NULL); - if (err) { - CERROR("%s: Set cache_set failed: rc = %d\n", - sbi->ll_dt_exp->exp_obd->obd_name, err); - goto out_root; - } - - sb->s_root = d_make_root(root); - if (!sb->s_root) { - CERROR("%s: can't make root dentry\n", - ll_get_fsname(sb, NULL, 0)); - err = -ENOMEM; - goto out_lock_cn_cb; - } - - 
sbi->ll_sdev_orig = sb->s_dev; - - /* We set sb->s_dev equal on all lustre clients in order to support - * NFS export clustering. NFSD requires that the FSID be the same - * on all clients. - */ - /* s_dev is also used in lt_compare() to compare two fs, but that is - * only a node-local comparison. - */ - uuid = obd_get_uuid(sbi->ll_md_exp); - if (uuid) { - sb->s_dev = get_uuid2int(uuid->uuid, strlen(uuid->uuid)); - get_uuid2fsid(uuid->uuid, strlen(uuid->uuid), &sbi->ll_fsid); - } - - kfree(data); - kfree(osfs); - - if (llite_root) { - err = ldebugfs_register_mountpoint(llite_root, sb, dt, md); - if (err < 0) { - CERROR("%s: could not register mount in debugfs: " - "rc = %d\n", ll_get_fsname(sb, NULL, 0), err); - err = 0; - } - } - - return err; -out_root: - iput(root); -out_lock_cn_cb: - obd_fid_fini(sbi->ll_dt_exp->exp_obd); -out_dt: - obd_disconnect(sbi->ll_dt_exp); - sbi->ll_dt_exp = NULL; -out_md_fid: - obd_fid_fini(sbi->ll_md_exp->exp_obd); -out_md: - obd_disconnect(sbi->ll_md_exp); - sbi->ll_md_exp = NULL; -out: - kfree(data); - kfree(osfs); - return err; -} - -int ll_get_max_mdsize(struct ll_sb_info *sbi, int *lmmsize) -{ - int size, rc; - - size = sizeof(*lmmsize); - rc = obd_get_info(NULL, sbi->ll_dt_exp, sizeof(KEY_MAX_EASIZE), - KEY_MAX_EASIZE, &size, lmmsize); - if (rc) { - CERROR("%s: cannot get max LOV EA size: rc = %d\n", - sbi->ll_dt_exp->exp_obd->obd_name, rc); - return rc; - } - - size = sizeof(int); - rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_MAX_EASIZE), - KEY_MAX_EASIZE, &size, lmmsize); - if (rc) - CERROR("Get max mdsize error rc %d\n", rc); - - return rc; -} - -/** - * Get the value of the default_easize parameter. 
- * - * \see client_obd::cl_default_mds_easize - * - * \param[in] sbi superblock info for this filesystem - * \param[out] lmmsize pointer to storage location for value - * - * \retval 0 on success - * \retval negative negated errno on failure - */ -int ll_get_default_mdsize(struct ll_sb_info *sbi, int *lmmsize) -{ - int size, rc; - - size = sizeof(int); - rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_DEFAULT_EASIZE), - KEY_DEFAULT_EASIZE, &size, lmmsize); - if (rc) - CERROR("Get default mdsize error rc %d\n", rc); - - return rc; -} - -/** - * Set the default_easize parameter to the given value. - * - * \see client_obd::cl_default_mds_easize - * - * \param[in] sbi superblock info for this filesystem - * \param[in] lmmsize the size to set - * - * \retval 0 on success - * \retval negative negated errno on failure - */ -int ll_set_default_mdsize(struct ll_sb_info *sbi, int lmmsize) -{ - if (lmmsize < sizeof(struct lov_mds_md) || - lmmsize > OBD_MAX_DEFAULT_EA_SIZE) - return -EINVAL; - - return obd_set_info_async(NULL, sbi->ll_md_exp, - sizeof(KEY_DEFAULT_EASIZE), - KEY_DEFAULT_EASIZE, - sizeof(int), &lmmsize, NULL); -} - -static void client_common_put_super(struct super_block *sb) -{ - struct ll_sb_info *sbi = ll_s2sbi(sb); - - cl_sb_fini(sb); - - obd_fid_fini(sbi->ll_dt_exp->exp_obd); - obd_disconnect(sbi->ll_dt_exp); - sbi->ll_dt_exp = NULL; - - ldebugfs_unregister_mountpoint(sbi); - - obd_fid_fini(sbi->ll_md_exp->exp_obd); - obd_disconnect(sbi->ll_md_exp); - sbi->ll_md_exp = NULL; -} - -void ll_kill_super(struct super_block *sb) -{ - struct ll_sb_info *sbi; - - /* not init sb ?*/ - if (!(sb->s_flags & SB_ACTIVE)) - return; - - sbi = ll_s2sbi(sb); - /* we need to restore s_dev from changed for clustered NFS before - * put_super because new kernels have cached s_dev and change sb->s_dev - * in put_super not affected real removing devices - */ - if (sbi) { - sb->s_dev = sbi->ll_sdev_orig; - sbi->ll_umounting = 1; - - /* wait running statahead threads to quit */ - 
while (atomic_read(&sbi->ll_sa_running) > 0) { - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC >> 3)); - } - } -} - -static inline int ll_set_opt(const char *opt, char *data, int fl) -{ - if (strncmp(opt, data, strlen(opt)) != 0) - return 0; - else - return fl; -} - -/* non-client-specific mount options are parsed in lmd_parse */ -static int ll_options(char *options, int *flags) -{ - int tmp; - char *s1 = options, *s2; - - if (!options) - return 0; - - CDEBUG(D_CONFIG, "Parsing opts %s\n", options); - - while (*s1) { - CDEBUG(D_SUPER, "next opt=%s\n", s1); - tmp = ll_set_opt("nolock", s1, LL_SBI_NOLCK); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("flock", s1, LL_SBI_FLOCK); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("localflock", s1, LL_SBI_LOCALFLOCK); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("noflock", s1, - LL_SBI_FLOCK | LL_SBI_LOCALFLOCK); - if (tmp) { - *flags &= ~tmp; - goto next; - } - tmp = ll_set_opt("user_xattr", s1, LL_SBI_USER_XATTR); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("nouser_xattr", s1, LL_SBI_USER_XATTR); - if (tmp) { - *flags &= ~tmp; - goto next; - } - tmp = ll_set_opt("context", s1, 1); - if (tmp) - goto next; - tmp = ll_set_opt("fscontext", s1, 1); - if (tmp) - goto next; - tmp = ll_set_opt("defcontext", s1, 1); - if (tmp) - goto next; - tmp = ll_set_opt("rootcontext", s1, 1); - if (tmp) - goto next; - tmp = ll_set_opt("user_fid2path", s1, LL_SBI_USER_FID2PATH); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("nouser_fid2path", s1, LL_SBI_USER_FID2PATH); - if (tmp) { - *flags &= ~tmp; - goto next; - } - - tmp = ll_set_opt("checksum", s1, LL_SBI_CHECKSUM); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("nochecksum", s1, LL_SBI_CHECKSUM); - if (tmp) { - *flags &= ~tmp; - goto next; - } - tmp = ll_set_opt("lruresize", s1, LL_SBI_LRU_RESIZE); - if (tmp) { - *flags |= 
tmp; - goto next; - } - tmp = ll_set_opt("nolruresize", s1, LL_SBI_LRU_RESIZE); - if (tmp) { - *flags &= ~tmp; - goto next; - } - tmp = ll_set_opt("lazystatfs", s1, LL_SBI_LAZYSTATFS); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("nolazystatfs", s1, LL_SBI_LAZYSTATFS); - if (tmp) { - *flags &= ~tmp; - goto next; - } - tmp = ll_set_opt("32bitapi", s1, LL_SBI_32BIT_API); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("verbose", s1, LL_SBI_VERBOSE); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("noverbose", s1, LL_SBI_VERBOSE); - if (tmp) { - *flags &= ~tmp; - goto next; - } - tmp = ll_set_opt("always_ping", s1, LL_SBI_ALWAYS_PING); - if (tmp) { - *flags |= tmp; - goto next; - } - LCONSOLE_ERROR_MSG(0x152, "Unknown option '%s', won't mount.\n", - s1); - return -EINVAL; - -next: - /* Find next opt */ - s2 = strchr(s1, ','); - if (!s2) - break; - s1 = s2 + 1; - } - return 0; -} - -void ll_lli_init(struct ll_inode_info *lli) -{ - lli->lli_inode_magic = LLI_INODE_MAGIC; - lli->lli_flags = 0; - spin_lock_init(&lli->lli_lock); - lli->lli_posix_acl = NULL; - /* Do not set lli_fid, it has been initialized already. 
*/ - fid_zero(&lli->lli_pfid); - lli->lli_mds_read_och = NULL; - lli->lli_mds_write_och = NULL; - lli->lli_mds_exec_och = NULL; - lli->lli_open_fd_read_count = 0; - lli->lli_open_fd_write_count = 0; - lli->lli_open_fd_exec_count = 0; - mutex_init(&lli->lli_och_mutex); - spin_lock_init(&lli->lli_agl_lock); - spin_lock_init(&lli->lli_layout_lock); - ll_layout_version_set(lli, CL_LAYOUT_GEN_NONE); - lli->lli_clob = NULL; - - init_rwsem(&lli->lli_xattrs_list_rwsem); - mutex_init(&lli->lli_xattrs_enq_lock); - - LASSERT(lli->lli_vfs_inode.i_mode != 0); - if (S_ISDIR(lli->lli_vfs_inode.i_mode)) { - mutex_init(&lli->lli_readdir_mutex); - lli->lli_opendir_key = NULL; - lli->lli_sai = NULL; - spin_lock_init(&lli->lli_sa_lock); - lli->lli_opendir_pid = 0; - lli->lli_sa_enabled = 0; - lli->lli_def_stripe_offset = -1; - } else { - mutex_init(&lli->lli_size_mutex); - lli->lli_symlink_name = NULL; - init_rwsem(&lli->lli_trunc_sem); - range_lock_tree_init(&lli->lli_write_tree); - init_rwsem(&lli->lli_glimpse_sem); - lli->lli_glimpse_time = 0; - INIT_LIST_HEAD(&lli->lli_agl_list); - lli->lli_agl_index = 0; - lli->lli_async_rc = 0; - } - mutex_init(&lli->lli_layout_mutex); -} - -int ll_fill_super(struct super_block *sb) -{ - struct lustre_profile *lprof = NULL; - struct lustre_sb_info *lsi = s2lsi(sb); - struct ll_sb_info *sbi; - char *dt = NULL, *md = NULL; - char *profilenm = get_profile_name(sb); - struct config_llog_instance *cfg; - int err; - static atomic_t ll_bdi_num = ATOMIC_INIT(0); - - CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb); - - err = ptlrpc_inc_ref(); - if (err) - return err; - - cfg = kzalloc(sizeof(*cfg), GFP_NOFS); - if (!cfg) { - err = -ENOMEM; - goto out_put; - } - - try_module_get(THIS_MODULE); - - /* client additional sb info */ - sbi = ll_init_sbi(sb); - lsi->lsi_llsbi = sbi; - if (!sbi) { - module_put(THIS_MODULE); - kfree(cfg); - err = -ENOMEM; - goto out_put; - } - - err = ll_options(lsi->lsi_lmd->lmd_opts, &sbi->ll_flags); - if (err) - goto out_free; - - 
err = super_setup_bdi_name(sb, "lustre-%d", - atomic_inc_return(&ll_bdi_num)); - if (err) - goto out_free; - - /* kernel >= 2.6.38 store dentry operations in sb->s_d_op. */ - sb->s_d_op = &ll_d_ops; - - /* Generate a string unique to this super, in case some joker tries - * to mount the same fs at two mount points. - * Use the address of the super itself. - */ - cfg->cfg_instance = sb; - cfg->cfg_uuid = lsi->lsi_llsbi->ll_sb_uuid; - cfg->cfg_callback = class_config_llog_handler; - /* set up client obds */ - err = lustre_process_log(sb, profilenm, cfg); - if (err < 0) - goto out_free; - - /* Profile set with LCFG_MOUNTOPT so we can find our mdc and osc obds */ - lprof = class_get_profile(profilenm); - if (!lprof) { - LCONSOLE_ERROR_MSG(0x156, - "The client profile '%s' could not be read from the MGS. Does that filesystem exist?\n", - profilenm); - err = -EINVAL; - goto out_free; - } - CDEBUG(D_CONFIG, "Found profile %s: mdc=%s osc=%s\n", profilenm, - lprof->lp_md, lprof->lp_dt); - - dt = kasprintf(GFP_NOFS, "%s-%p", lprof->lp_dt, cfg->cfg_instance); - if (!dt) { - err = -ENOMEM; - goto out_free; - } - - md = kasprintf(GFP_NOFS, "%s-%p", lprof->lp_md, cfg->cfg_instance); - if (!md) { - err = -ENOMEM; - goto out_free; - } - - /* connections, registrations, sb setup */ - err = client_common_fill_super(sb, md, dt); - if (!err) - sbi->ll_client_common_fill_super_succeeded = 1; - -out_free: - kfree(md); - kfree(dt); - if (lprof) - class_put_profile(lprof); - if (err) - ll_put_super(sb); - else if (sbi->ll_flags & LL_SBI_VERBOSE) - LCONSOLE_WARN("Mounted %s\n", profilenm); - - kfree(cfg); -out_put: - if (err) - ptlrpc_dec_ref(); - return err; -} /* ll_fill_super */ - -void ll_put_super(struct super_block *sb) -{ - struct config_llog_instance cfg, params_cfg; - struct obd_device *obd; - struct lustre_sb_info *lsi = s2lsi(sb); - struct ll_sb_info *sbi = ll_s2sbi(sb); - char *profilenm = get_profile_name(sb); - int next, force = 1, rc = 0; - long ccc_count; - - 
CDEBUG(D_VFSTRACE, "VFS Op: sb %p - %s\n", sb, profilenm); - - cfg.cfg_instance = sb; - lustre_end_log(sb, profilenm, &cfg); - - params_cfg.cfg_instance = sb; - lustre_end_log(sb, PARAMS_FILENAME, ¶ms_cfg); - - if (sbi->ll_md_exp) { - obd = class_exp2obd(sbi->ll_md_exp); - if (obd) - force = obd->obd_force; - } - - /* Wait for unstable pages to be committed to stable storage */ - if (!force) - rc = l_wait_event_abortable(sbi->ll_cache->ccc_unstable_waitq, - !atomic_long_read(&sbi->ll_cache->ccc_unstable_nr)); - - ccc_count = atomic_long_read(&sbi->ll_cache->ccc_unstable_nr); - if (!force && rc != -ERESTARTSYS) - LASSERTF(!ccc_count, "count: %li\n", ccc_count); - - /* We need to set force before the lov_disconnect in - * lustre_common_put_super, since l_d cleans up osc's as well. - */ - if (force) { - next = 0; - while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, - &next)) != NULL) { - obd->obd_force = force; - } - } - - if (sbi->ll_client_common_fill_super_succeeded) { - /* Only if client_common_fill_super succeeded */ - client_common_put_super(sb); - } - - next = 0; - while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next))) - class_manual_cleanup(obd); - - if (sbi->ll_flags & LL_SBI_VERBOSE) - LCONSOLE_WARN("Unmounted %s\n", profilenm ? 
profilenm : ""); - - if (profilenm) - class_del_profile(profilenm); - - ll_free_sbi(sb); - lsi->lsi_llsbi = NULL; - - lustre_common_put_super(sb); - - cl_env_cache_purge(~0); - - module_put(THIS_MODULE); - - ptlrpc_dec_ref(); -} /* client_put_super */ - -struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock) -{ - struct inode *inode = NULL; - - /* NOTE: we depend on atomic igrab() -bzzz */ - lock_res_and_lock(lock); - if (lock->l_resource->lr_lvb_inode) { - struct ll_inode_info *lli; - - lli = ll_i2info(lock->l_resource->lr_lvb_inode); - if (lli->lli_inode_magic == LLI_INODE_MAGIC) { - inode = igrab(lock->l_resource->lr_lvb_inode); - } else { - inode = lock->l_resource->lr_lvb_inode; - LDLM_DEBUG_LIMIT(inode->i_state & I_FREEING ? D_INFO : - D_WARNING, lock, - "lr_lvb_inode %p is bogus: magic %08x", - lock->l_resource->lr_lvb_inode, - lli->lli_inode_magic); - inode = NULL; - } - } - unlock_res_and_lock(lock); - return inode; -} - -void ll_dir_clear_lsm_md(struct inode *inode) -{ - struct ll_inode_info *lli = ll_i2info(inode); - - LASSERT(S_ISDIR(inode->i_mode)); - - if (lli->lli_lsm_md) { - lmv_free_memmd(lli->lli_lsm_md); - lli->lli_lsm_md = NULL; - } -} - -static struct inode *ll_iget_anon_dir(struct super_block *sb, - const struct lu_fid *fid, - struct lustre_md *md) -{ - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct mdt_body *body = md->body; - struct inode *inode; - ino_t ino; - - ino = cl_fid_build_ino(fid, sbi->ll_flags & LL_SBI_32BIT_API); - inode = iget_locked(sb, ino); - if (!inode) { - CERROR("%s: failed get simple inode " DFID ": rc = -ENOENT\n", - ll_get_fsname(sb, NULL, 0), PFID(fid)); - return ERR_PTR(-ENOENT); - } - - if (inode->i_state & I_NEW) { - struct ll_inode_info *lli = ll_i2info(inode); - struct lmv_stripe_md *lsm = md->lmv; - - inode->i_mode = (inode->i_mode & ~S_IFMT) | - (body->mbo_mode & S_IFMT); - LASSERTF(S_ISDIR(inode->i_mode), "Not slave inode " DFID "\n", - PFID(fid)); - - LTIME_S(inode->i_mtime) = 0; - 
LTIME_S(inode->i_atime) = 0; - LTIME_S(inode->i_ctime) = 0; - inode->i_rdev = 0; - - inode->i_op = &ll_dir_inode_operations; - inode->i_fop = &ll_dir_operations; - lli->lli_fid = *fid; - ll_lli_init(lli); - - LASSERT(lsm); - /* master object FID */ - lli->lli_pfid = body->mbo_fid1; - CDEBUG(D_INODE, "lli %p slave " DFID " master " DFID "\n", - lli, PFID(fid), PFID(&lli->lli_pfid)); - unlock_new_inode(inode); - } - - return inode; -} - -static int ll_init_lsm_md(struct inode *inode, struct lustre_md *md) -{ - struct lmv_stripe_md *lsm = md->lmv; - struct lu_fid *fid; - int i; - - LASSERT(lsm); - /* - * XXX sigh, this lsm_root initialization should be in - * LMV layer, but it needs ll_iget right now, so we - * put this here right now. - */ - for (i = 0; i < lsm->lsm_md_stripe_count; i++) { - fid = &lsm->lsm_md_oinfo[i].lmo_fid; - LASSERT(!lsm->lsm_md_oinfo[i].lmo_root); - /* Unfortunately ll_iget will call ll_update_inode, - * where the initialization of slave inode is slightly - * different, so it reset lsm_md to NULL to avoid - * initializing lsm for slave inode. 
- */ - /* For migrating inode, master stripe and master object will - * be same, so we only need assign this inode - */ - if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION && !i) - lsm->lsm_md_oinfo[i].lmo_root = inode; - else - lsm->lsm_md_oinfo[i].lmo_root = - ll_iget_anon_dir(inode->i_sb, fid, md); - if (IS_ERR(lsm->lsm_md_oinfo[i].lmo_root)) { - int rc = PTR_ERR(lsm->lsm_md_oinfo[i].lmo_root); - - lsm->lsm_md_oinfo[i].lmo_root = NULL; - return rc; - } - } - - return 0; -} - -static inline int lli_lsm_md_eq(const struct lmv_stripe_md *lsm_md1, - const struct lmv_stripe_md *lsm_md2) -{ - return lsm_md1->lsm_md_magic == lsm_md2->lsm_md_magic && - lsm_md1->lsm_md_stripe_count == lsm_md2->lsm_md_stripe_count && - lsm_md1->lsm_md_master_mdt_index == - lsm_md2->lsm_md_master_mdt_index && - lsm_md1->lsm_md_hash_type == lsm_md2->lsm_md_hash_type && - lsm_md1->lsm_md_layout_version == - lsm_md2->lsm_md_layout_version && - !strcmp(lsm_md1->lsm_md_pool_name, - lsm_md2->lsm_md_pool_name); -} - -static int ll_update_lsm_md(struct inode *inode, struct lustre_md *md) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct lmv_stripe_md *lsm = md->lmv; - int rc; - - LASSERT(S_ISDIR(inode->i_mode)); - CDEBUG(D_INODE, "update lsm %p of " DFID "\n", lli->lli_lsm_md, - PFID(ll_inode2fid(inode))); - - /* no striped information from request. 
*/ - if (!lsm) { - if (!lli->lli_lsm_md) { - return 0; - } else if (lli->lli_lsm_md->lsm_md_hash_type & - LMV_HASH_FLAG_MIGRATION) { - /* - * migration is done, the temporay MIGRATE layout has - * been removed - */ - CDEBUG(D_INODE, DFID " finish migration.\n", - PFID(ll_inode2fid(inode))); - lmv_free_memmd(lli->lli_lsm_md); - lli->lli_lsm_md = NULL; - return 0; - } - /* - * The lustre_md from req does not include stripeEA, - * see ll_md_setattr - */ - return 0; - } - - /* set the directory layout */ - if (!lli->lli_lsm_md) { - struct cl_attr *attr; - - rc = ll_init_lsm_md(inode, md); - if (rc) - return rc; - - /* - * set lsm_md to NULL, so the following free lustre_md - * will not free this lsm - */ - md->lmv = NULL; - lli->lli_lsm_md = lsm; - - attr = kzalloc(sizeof(*attr), GFP_NOFS); - if (!attr) - return -ENOMEM; - - /* validate the lsm */ - rc = md_merge_attr(ll_i2mdexp(inode), lsm, attr, - ll_md_blocking_ast); - if (rc) { - kfree(attr); - return rc; - } - - if (md->body->mbo_valid & OBD_MD_FLNLINK) - md->body->mbo_nlink = attr->cat_nlink; - if (md->body->mbo_valid & OBD_MD_FLSIZE) - md->body->mbo_size = attr->cat_size; - if (md->body->mbo_valid & OBD_MD_FLATIME) - md->body->mbo_atime = attr->cat_atime; - if (md->body->mbo_valid & OBD_MD_FLCTIME) - md->body->mbo_ctime = attr->cat_ctime; - if (md->body->mbo_valid & OBD_MD_FLMTIME) - md->body->mbo_mtime = attr->cat_mtime; - - kfree(attr); - - CDEBUG(D_INODE, "Set lsm %p magic %x to " DFID "\n", lsm, - lsm->lsm_md_magic, PFID(ll_inode2fid(inode))); - return 0; - } - - /* Compare the old and new stripe information */ - if (!lsm_md_eq(lli->lli_lsm_md, lsm)) { - struct lmv_stripe_md *old_lsm = lli->lli_lsm_md; - int idx; - - CERROR("%s: inode " DFID "(%p)'s lmv layout mismatch (%p)/(%p) magic:0x%x/0x%x stripe count: %d/%d master_mdt: %d/%d hash_type:0x%x/0x%x layout: 0x%x/0x%x pool:%s/%s\n", - ll_get_fsname(inode->i_sb, NULL, 0), PFID(&lli->lli_fid), - inode, lsm, old_lsm, - lsm->lsm_md_magic, old_lsm->lsm_md_magic, 
- lsm->lsm_md_stripe_count, - old_lsm->lsm_md_stripe_count, - lsm->lsm_md_master_mdt_index, - old_lsm->lsm_md_master_mdt_index, - lsm->lsm_md_hash_type, old_lsm->lsm_md_hash_type, - lsm->lsm_md_layout_version, - old_lsm->lsm_md_layout_version, - lsm->lsm_md_pool_name, - old_lsm->lsm_md_pool_name); - - for (idx = 0; idx < old_lsm->lsm_md_stripe_count; idx++) { - CERROR("%s: sub FIDs in old lsm idx %d, old: " DFID "\n", - ll_get_fsname(inode->i_sb, NULL, 0), idx, - PFID(&old_lsm->lsm_md_oinfo[idx].lmo_fid)); - } - - for (idx = 0; idx < lsm->lsm_md_stripe_count; idx++) { - CERROR("%s: sub FIDs in new lsm idx %d, new: " DFID "\n", - ll_get_fsname(inode->i_sb, NULL, 0), idx, - PFID(&lsm->lsm_md_oinfo[idx].lmo_fid)); - } - - return -EIO; - } - - return 0; -} - -void ll_clear_inode(struct inode *inode) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct ll_sb_info *sbi = ll_i2sbi(inode); - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p)\n", - PFID(ll_inode2fid(inode)), inode); - - if (S_ISDIR(inode->i_mode)) { - /* these should have been cleared in ll_file_release */ - LASSERT(!lli->lli_opendir_key); - LASSERT(!lli->lli_sai); - LASSERT(lli->lli_opendir_pid == 0); - } - - md_null_inode(sbi->ll_md_exp, ll_inode2fid(inode)); - - LASSERT(!lli->lli_open_fd_write_count); - LASSERT(!lli->lli_open_fd_read_count); - LASSERT(!lli->lli_open_fd_exec_count); - - if (lli->lli_mds_write_och) - ll_md_real_close(inode, FMODE_WRITE); - if (lli->lli_mds_exec_och) - ll_md_real_close(inode, FMODE_EXEC); - if (lli->lli_mds_read_och) - ll_md_real_close(inode, FMODE_READ); - - if (S_ISLNK(inode->i_mode)) { - kfree(lli->lli_symlink_name); - lli->lli_symlink_name = NULL; - } - - ll_xattr_cache_destroy(inode); - -#ifdef CONFIG_FS_POSIX_ACL - forget_all_cached_acls(inode); - if (lli->lli_posix_acl) { - posix_acl_release(lli->lli_posix_acl); - lli->lli_posix_acl = NULL; - } -#endif - lli->lli_inode_magic = LLI_INODE_DEAD; - - if (S_ISDIR(inode->i_mode)) - ll_dir_clear_lsm_md(inode); - if 
(S_ISREG(inode->i_mode) && !is_bad_inode(inode)) - LASSERT(list_empty(&lli->lli_agl_list)); - - /* - * XXX This has to be done before lsm is freed below, because - * cl_object still uses inode lsm. - */ - cl_inode_fini(inode); -} - -#define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) - -static int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data) -{ - struct lustre_md md; - struct inode *inode = d_inode(dentry); - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ptlrpc_request *request = NULL; - int rc, ia_valid; - - op_data = ll_prep_md_op_data(op_data, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - rc = md_setattr(sbi->ll_md_exp, op_data, NULL, 0, &request); - if (rc) { - ptlrpc_req_finished(request); - if (rc == -ENOENT) { - clear_nlink(inode); - /* Unlinked special device node? Or just a race? - * Pretend we did everything. - */ - if (!S_ISREG(inode->i_mode) && - !S_ISDIR(inode->i_mode)) { - ia_valid = op_data->op_attr.ia_valid; - op_data->op_attr.ia_valid &= ~TIMES_SET_FLAGS; - rc = simple_setattr(dentry, &op_data->op_attr); - op_data->op_attr.ia_valid = ia_valid; - } - } else if (rc != -EPERM && rc != -EACCES && rc != -ETXTBSY) { - CERROR("md_setattr fails: rc = %d\n", rc); - } - return rc; - } - - rc = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp, - sbi->ll_md_exp, &md); - if (rc) { - ptlrpc_req_finished(request); - return rc; - } - - ia_valid = op_data->op_attr.ia_valid; - /* inode size will be in cl_setattr_ost, can't do it now since dirty - * cache is not cleared yet. 
- */ - op_data->op_attr.ia_valid &= ~(TIMES_SET_FLAGS | ATTR_SIZE); - if (S_ISREG(inode->i_mode)) - inode_lock(inode); - rc = simple_setattr(dentry, &op_data->op_attr); - if (S_ISREG(inode->i_mode)) - inode_unlock(inode); - op_data->op_attr.ia_valid = ia_valid; - - rc = ll_update_inode(inode, &md); - ptlrpc_req_finished(request); - - return rc; -} - -/* If this inode has objects allocated to it (lsm != NULL), then the OST - * object(s) determine the file size and mtime. Otherwise, the MDS will - * keep these values until such a time that objects are allocated for it. - * We do the MDS operations first, as it is checking permissions for us. - * We don't to the MDS RPC if there is nothing that we want to store there, - * otherwise there is no harm in updating mtime/atime on the MDS if we are - * going to do an RPC anyways. - * - * If we are doing a truncate, we will send the mtime and ctime updates - * to the OST with the punch RPC, otherwise we do an explicit setattr RPC. - * I don't believe it is possible to get e.g. ATTR_MTIME_SET and ATTR_SIZE - * at the same time. - * - * In case of HSMimport, we only set attr on MDS. - */ -int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import) -{ - struct inode *inode = d_inode(dentry); - struct ll_inode_info *lli = ll_i2info(inode); - struct md_op_data *op_data = NULL; - int rc = 0; - - CDEBUG(D_VFSTRACE, "%s: setattr inode " DFID "(%p) from %llu to %llu, valid %x, hsm_import %d\n", - ll_get_fsname(inode->i_sb, NULL, 0), PFID(&lli->lli_fid), inode, - i_size_read(inode), attr->ia_size, attr->ia_valid, hsm_import); - - if (attr->ia_valid & ATTR_SIZE) { - /* Check new size against VFS/VM file size limit and rlimit */ - rc = inode_newsize_ok(inode, attr->ia_size); - if (rc) - return rc; - - /* The maximum Lustre file size is variable, based on the - * OST maximum object size and number of stripes. This - * needs another check in addition to the VFS check above. 
- */ - if (attr->ia_size > ll_file_maxbytes(inode)) { - CDEBUG(D_INODE, "file " DFID " too large %llu > %llu\n", - PFID(&lli->lli_fid), attr->ia_size, - ll_file_maxbytes(inode)); - return -EFBIG; - } - - attr->ia_valid |= ATTR_MTIME | ATTR_CTIME; - } - - /* POSIX: check before ATTR_*TIME_SET set (from setattr_prepare) */ - if (attr->ia_valid & TIMES_SET_FLAGS) { - if ((!uid_eq(current_fsuid(), inode->i_uid)) && - !capable(CAP_FOWNER)) - return -EPERM; - } - - /* We mark all of the fields "set" so MDS/OST does not re-set them */ - if (attr->ia_valid & ATTR_CTIME) { - attr->ia_ctime = current_time(inode); - attr->ia_valid |= ATTR_CTIME_SET; - } - if (!(attr->ia_valid & ATTR_ATIME_SET) && - (attr->ia_valid & ATTR_ATIME)) { - attr->ia_atime = current_time(inode); - attr->ia_valid |= ATTR_ATIME_SET; - } - if (!(attr->ia_valid & ATTR_MTIME_SET) && - (attr->ia_valid & ATTR_MTIME)) { - attr->ia_mtime = current_time(inode); - attr->ia_valid |= ATTR_MTIME_SET; - } - - if (attr->ia_valid & (ATTR_MTIME | ATTR_CTIME)) - CDEBUG(D_INODE, "setting mtime %lu, ctime %lu, now = %llu\n", - LTIME_S(attr->ia_mtime), LTIME_S(attr->ia_ctime), - (s64)ktime_get_real_seconds()); - - if (S_ISREG(inode->i_mode)) - inode_unlock(inode); - - /* - * We always do an MDS RPC, even if we're only changing the size; - * only the MDS knows whether truncate() should fail with -ETXTBUSY - */ - op_data = kzalloc(sizeof(*op_data), GFP_NOFS); - if (!op_data) { - rc = -ENOMEM; - goto out; - } - - if (!hsm_import && attr->ia_valid & ATTR_SIZE) { - /* - * If we are changing file size, file content is - * modified, flag it. 
- */ - attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE; - op_data->op_bias |= MDS_DATA_MODIFIED; - clear_bit(LLIF_DATA_MODIFIED, &lli->lli_flags); - } - - op_data->op_attr = *attr; - - rc = ll_md_setattr(dentry, op_data); - if (rc) - goto out; - - if (!S_ISREG(inode->i_mode) || hsm_import) { - rc = 0; - goto out; - } - - if (attr->ia_valid & (ATTR_SIZE | - ATTR_ATIME | ATTR_ATIME_SET | - ATTR_MTIME | ATTR_MTIME_SET)) { - /* For truncate and utimes sending attributes to OSTs, setting - * mtime/atime to the past will be performed under PW [0:EOF] - * extent lock (new_size:EOF for truncate). It may seem - * excessive to send mtime/atime updates to OSTs when not - * setting times to past, but it is necessary due to possible - * time de-synchronization between MDT inode and OST objects - */ - rc = cl_setattr_ost(ll_i2info(inode)->lli_clob, attr, 0); - } - - /* - * If the file was restored, it needs to set dirty flag. - * - * We've already sent MDS_DATA_MODIFIED flag in - * ll_md_setattr() for truncate. However, the MDT refuses to - * set the HS_DIRTY flag on released files, so we have to set - * it again if the file has been restored. Please check how - * LLIF_DATA_MODIFIED is set in vvp_io_setattr_fini(). - * - * Please notice that if the file is not released, the previous - * MDS_DATA_MODIFIED has taken effect and usually - * LLIF_DATA_MODIFIED is not set(see vvp_io_setattr_fini()). - * This way we can save an RPC for common open + trunc - * operation. - */ - if (test_and_clear_bit(LLIF_DATA_MODIFIED, &lli->lli_flags)) { - struct hsm_state_set hss = { - .hss_valid = HSS_SETMASK, - .hss_setmask = HS_DIRTY, - }; - int rc2; - - rc2 = ll_hsm_state_set(inode, &hss); - /* - * truncate and write can happen at the same time, so that - * the file can be set modified even though the file is not - * restored from released state, and ll_hsm_state_set() is - * not applicable for the file, and rc2 < 0 is normal in this - * case. 
- */ - if (rc2 < 0) - CDEBUG(D_INFO, DFID "HSM set dirty failed: rc2 = %d\n", - PFID(ll_inode2fid(inode)), rc2); - } - -out: - if (op_data) - ll_finish_md_op_data(op_data); - - if (S_ISREG(inode->i_mode)) { - inode_lock(inode); - if ((attr->ia_valid & ATTR_SIZE) && !hsm_import) - inode_dio_wait(inode); - } - - ll_stats_ops_tally(ll_i2sbi(inode), (attr->ia_valid & ATTR_SIZE) ? - LPROC_LL_TRUNC : LPROC_LL_SETATTR, 1); - - return rc; -} - -int ll_setattr(struct dentry *de, struct iattr *attr) -{ - int mode = d_inode(de)->i_mode; - - if ((attr->ia_valid & (ATTR_CTIME | ATTR_SIZE | ATTR_MODE)) == - (ATTR_CTIME | ATTR_SIZE | ATTR_MODE)) - attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE; - - if (((attr->ia_valid & (ATTR_MODE | ATTR_FORCE | ATTR_SIZE)) == - (ATTR_SIZE | ATTR_MODE)) && - (((mode & S_ISUID) && !(attr->ia_mode & S_ISUID)) || - (((mode & (S_ISGID | 0010)) == (S_ISGID | 0010)) && - !(attr->ia_mode & S_ISGID)))) - attr->ia_valid |= ATTR_FORCE; - - if ((attr->ia_valid & ATTR_MODE) && - (mode & S_ISUID) && - !(attr->ia_mode & S_ISUID) && - !(attr->ia_valid & ATTR_KILL_SUID)) - attr->ia_valid |= ATTR_KILL_SUID; - - if ((attr->ia_valid & ATTR_MODE) && - ((mode & (S_ISGID | 0010)) == (S_ISGID | 0010)) && - !(attr->ia_mode & S_ISGID) && - !(attr->ia_valid & ATTR_KILL_SGID)) - attr->ia_valid |= ATTR_KILL_SGID; - - return ll_setattr_raw(de, attr, false); -} - -int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs, - __u64 max_age, __u32 flags) -{ - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct obd_statfs obd_osfs; - int rc; - - rc = obd_statfs(NULL, sbi->ll_md_exp, osfs, max_age, flags); - if (rc) { - CERROR("md_statfs fails: rc = %d\n", rc); - return rc; - } - - osfs->os_type = sb->s_magic; - - CDEBUG(D_SUPER, "MDC blocks %llu/%llu objects %llu/%llu\n", - osfs->os_bavail, osfs->os_blocks, osfs->os_ffree, - osfs->os_files); - - if (sbi->ll_flags & LL_SBI_LAZYSTATFS) - flags |= OBD_STATFS_NODELAY; - - rc = obd_statfs_rqset(sbi->ll_dt_exp, &obd_osfs, max_age, 
flags); - if (rc) { - CERROR("obd_statfs fails: rc = %d\n", rc); - return rc; - } - - CDEBUG(D_SUPER, "OSC blocks %llu/%llu objects %llu/%llu\n", - obd_osfs.os_bavail, obd_osfs.os_blocks, obd_osfs.os_ffree, - obd_osfs.os_files); - - osfs->os_bsize = obd_osfs.os_bsize; - osfs->os_blocks = obd_osfs.os_blocks; - osfs->os_bfree = obd_osfs.os_bfree; - osfs->os_bavail = obd_osfs.os_bavail; - - /* If we don't have as many objects free on the OST as inodes - * on the MDS, we reduce the total number of inodes to - * compensate, so that the "inodes in use" number is correct. - */ - if (obd_osfs.os_ffree < osfs->os_ffree) { - osfs->os_files = (osfs->os_files - osfs->os_ffree) + - obd_osfs.os_ffree; - osfs->os_ffree = obd_osfs.os_ffree; - } - - return rc; -} - -int ll_statfs(struct dentry *de, struct kstatfs *sfs) -{ - struct super_block *sb = de->d_sb; - struct obd_statfs osfs; - int rc; - - CDEBUG(D_VFSTRACE, "VFS Op: at %llu jiffies\n", get_jiffies_64()); - ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_STAFS, 1); - - /* Some amount of caching on the client is allowed */ - rc = ll_statfs_internal(sb, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - 0); - if (rc) - return rc; - - statfs_unpack(sfs, &osfs); - - /* We need to downshift for all 32-bit kernels, because we can't - * tell if the kernel is being called via sys_statfs64() or not. - * Stop before overflowing f_bsize - in which case it is better - * to just risk EOVERFLOW if caller is using old sys_statfs(). 
- */ - if (sizeof(long) < 8) { - while (osfs.os_blocks > ~0UL && sfs->f_bsize < 0x40000000) { - sfs->f_bsize <<= 1; - - osfs.os_blocks >>= 1; - osfs.os_bfree >>= 1; - osfs.os_bavail >>= 1; - } - } - - sfs->f_blocks = osfs.os_blocks; - sfs->f_bfree = osfs.os_bfree; - sfs->f_bavail = osfs.os_bavail; - sfs->f_fsid = ll_s2sbi(sb)->ll_fsid; - return 0; -} - -void ll_inode_size_lock(struct inode *inode) -{ - struct ll_inode_info *lli; - - LASSERT(!S_ISDIR(inode->i_mode)); - - lli = ll_i2info(inode); - mutex_lock(&lli->lli_size_mutex); -} - -void ll_inode_size_unlock(struct inode *inode) -{ - struct ll_inode_info *lli; - - lli = ll_i2info(inode); - mutex_unlock(&lli->lli_size_mutex); -} - -int ll_update_inode(struct inode *inode, struct lustre_md *md) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct mdt_body *body = md->body; - struct ll_sb_info *sbi = ll_i2sbi(inode); - - if (body->mbo_valid & OBD_MD_FLEASIZE) - cl_file_inode_init(inode, md); - - if (S_ISDIR(inode->i_mode)) { - int rc; - - rc = ll_update_lsm_md(inode, md); - if (rc) - return rc; - } - -#ifdef CONFIG_FS_POSIX_ACL - if (body->mbo_valid & OBD_MD_FLACL) { - spin_lock(&lli->lli_lock); - if (lli->lli_posix_acl) - posix_acl_release(lli->lli_posix_acl); - lli->lli_posix_acl = md->posix_acl; - spin_unlock(&lli->lli_lock); - } -#endif - inode->i_ino = cl_fid_build_ino(&body->mbo_fid1, - sbi->ll_flags & LL_SBI_32BIT_API); - inode->i_generation = cl_fid_build_gen(&body->mbo_fid1); - - if (body->mbo_valid & OBD_MD_FLATIME) { - if (body->mbo_atime > LTIME_S(inode->i_atime)) - LTIME_S(inode->i_atime) = body->mbo_atime; - lli->lli_atime = body->mbo_atime; - } - if (body->mbo_valid & OBD_MD_FLMTIME) { - if (body->mbo_mtime > LTIME_S(inode->i_mtime)) { - CDEBUG(D_INODE, - "setting ino %lu mtime from %lu to %llu\n", - inode->i_ino, LTIME_S(inode->i_mtime), - body->mbo_mtime); - LTIME_S(inode->i_mtime) = body->mbo_mtime; - } - lli->lli_mtime = body->mbo_mtime; - } - if (body->mbo_valid & OBD_MD_FLCTIME) { - if 
(body->mbo_ctime > LTIME_S(inode->i_ctime)) - LTIME_S(inode->i_ctime) = body->mbo_ctime; - lli->lli_ctime = body->mbo_ctime; - } - if (body->mbo_valid & OBD_MD_FLMODE) - inode->i_mode = (inode->i_mode & S_IFMT) | - (body->mbo_mode & ~S_IFMT); - if (body->mbo_valid & OBD_MD_FLTYPE) - inode->i_mode = (inode->i_mode & ~S_IFMT) | - (body->mbo_mode & S_IFMT); - LASSERT(inode->i_mode != 0); - if (S_ISREG(inode->i_mode)) - inode->i_blkbits = min(PTLRPC_MAX_BRW_BITS + 1, - LL_MAX_BLKSIZE_BITS); - else - inode->i_blkbits = inode->i_sb->s_blocksize_bits; - if (body->mbo_valid & OBD_MD_FLUID) - inode->i_uid = make_kuid(&init_user_ns, body->mbo_uid); - if (body->mbo_valid & OBD_MD_FLGID) - inode->i_gid = make_kgid(&init_user_ns, body->mbo_gid); - if (body->mbo_valid & OBD_MD_FLFLAGS) - inode->i_flags = ll_ext_to_inode_flags(body->mbo_flags); - if (body->mbo_valid & OBD_MD_FLNLINK) - set_nlink(inode, body->mbo_nlink); - if (body->mbo_valid & OBD_MD_FLRDEV) - inode->i_rdev = old_decode_dev(body->mbo_rdev); - - if (body->mbo_valid & OBD_MD_FLID) { - /* FID shouldn't be changed! 
*/ - if (fid_is_sane(&lli->lli_fid)) { - LASSERTF(lu_fid_eq(&lli->lli_fid, &body->mbo_fid1), - "Trying to change FID " DFID " to the " DFID ", inode " DFID "(%p)\n", - PFID(&lli->lli_fid), PFID(&body->mbo_fid1), - PFID(ll_inode2fid(inode)), inode); - } else { - lli->lli_fid = body->mbo_fid1; - } - } - - LASSERT(fid_seq(&lli->lli_fid) != 0); - - if (body->mbo_valid & OBD_MD_FLSIZE) { - i_size_write(inode, body->mbo_size); - - CDEBUG(D_VFSTRACE, "inode=" DFID ", updating i_size %llu\n", - PFID(ll_inode2fid(inode)), - (unsigned long long)body->mbo_size); - - if (body->mbo_valid & OBD_MD_FLBLOCKS) - inode->i_blocks = body->mbo_blocks; - } - - if (body->mbo_valid & OBD_MD_TSTATE) { - if (body->mbo_t_state & MS_RESTORE) - set_bit(LLIF_FILE_RESTORING, &lli->lli_flags); - } - - return 0; -} - -int ll_read_inode2(struct inode *inode, void *opaque) -{ - struct lustre_md *md = opaque; - struct ll_inode_info *lli = ll_i2info(inode); - int rc; - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p)\n", - PFID(&lli->lli_fid), inode); - - /* Core attributes from the MDS first. This is a new inode, and - * the VFS doesn't zero times in the core inode so we have to do - * it ourselves. They will be overwritten by either MDS or OST - * attributes - we just need to make sure they aren't newer. 
- */ - LTIME_S(inode->i_mtime) = 0; - LTIME_S(inode->i_atime) = 0; - LTIME_S(inode->i_ctime) = 0; - inode->i_rdev = 0; - rc = ll_update_inode(inode, md); - if (rc) - return rc; - - /* OIDEBUG(inode); */ - - if (S_ISREG(inode->i_mode)) { - struct ll_sb_info *sbi = ll_i2sbi(inode); - - inode->i_op = &ll_file_inode_operations; - inode->i_fop = sbi->ll_fop; - inode->i_mapping->a_ops = (struct address_space_operations *)&ll_aops; - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &ll_dir_inode_operations; - inode->i_fop = &ll_dir_operations; - } else if (S_ISLNK(inode->i_mode)) { - inode->i_op = &ll_fast_symlink_inode_operations; - } else { - inode->i_op = &ll_special_inode_operations; - - init_special_inode(inode, inode->i_mode, - inode->i_rdev); - } - - return 0; -} - -void ll_delete_inode(struct inode *inode) -{ - struct ll_inode_info *lli = ll_i2info(inode); - - if (S_ISREG(inode->i_mode) && lli->lli_clob) - /* discard all dirty pages before truncating them, required by - * osc_extent implementation at LU-1030. 
- */ - cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, - CL_FSYNC_LOCAL, 1); - - truncate_inode_pages_final(&inode->i_data); - - LASSERTF(!inode->i_data.nrpages, - "inode=" DFID "(%p) nrpages=%lu, see http://jira.whamcloud.com/browse/LU-118\n", - PFID(ll_inode2fid(inode)), inode, inode->i_data.nrpages); - - ll_clear_inode(inode); - clear_inode(inode); -} - -int ll_iocontrol(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ptlrpc_request *req = NULL; - int rc, flags = 0; - - switch (cmd) { - case FSFILT_IOC_GETFLAGS: { - struct mdt_body *body; - struct md_op_data *op_data; - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, - 0, 0, LUSTRE_OPC_ANY, - NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - op_data->op_valid = OBD_MD_FLFLAGS; - rc = md_getattr(sbi->ll_md_exp, op_data, &req); - ll_finish_md_op_data(op_data); - if (rc) { - CERROR("%s: failure inode " DFID ": rc = %d\n", - sbi->ll_md_exp->exp_obd->obd_name, - PFID(ll_inode2fid(inode)), rc); - return -abs(rc); - } - - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - - flags = body->mbo_flags; - - ptlrpc_req_finished(req); - - return put_user(flags, (int __user *)arg); - } - case FSFILT_IOC_SETFLAGS: { - struct md_op_data *op_data; - struct cl_object *obj; - struct iattr *attr; - - if (get_user(flags, (int __user *)arg)) - return -EFAULT; - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - op_data->op_attr_flags = flags; - op_data->op_attr.ia_valid |= ATTR_ATTR_FLAG; - rc = md_setattr(sbi->ll_md_exp, op_data, NULL, 0, &req); - ll_finish_md_op_data(op_data); - ptlrpc_req_finished(req); - if (rc) - return rc; - - inode->i_flags = ll_ext_to_inode_flags(flags); - - obj = ll_i2info(inode)->lli_clob; - if (!obj) - return 0; - - attr = kzalloc(sizeof(*attr), GFP_NOFS); - if (!attr) - return -ENOMEM; - - attr->ia_valid 
= ATTR_ATTR_FLAG; - rc = cl_setattr_ost(obj, attr, flags); - kfree(attr); - return rc; - } - default: - return -ENOSYS; - } - - return 0; -} - -int ll_flush_ctx(struct inode *inode) -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); - - CDEBUG(D_SEC, "flush context for user %d\n", - from_kuid(&init_user_ns, current_uid())); - - obd_set_info_async(NULL, sbi->ll_md_exp, - sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX, - 0, NULL, NULL); - obd_set_info_async(NULL, sbi->ll_dt_exp, - sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX, - 0, NULL, NULL); - return 0; -} - -/* umount -f client means force down, don't save state */ -void ll_umount_begin(struct super_block *sb) -{ - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct obd_device *obd; - struct obd_ioctl_data *ioc_data; - int cnt = 0; - - CDEBUG(D_VFSTRACE, "VFS Op: superblock %p count %d active %d\n", sb, - sb->s_count, atomic_read(&sb->s_active)); - - obd = class_exp2obd(sbi->ll_md_exp); - if (!obd) { - CERROR("Invalid MDC connection handle %#llx\n", - sbi->ll_md_exp->exp_handle.h_cookie); - return; - } - obd->obd_force = 1; - - obd = class_exp2obd(sbi->ll_dt_exp); - if (!obd) { - CERROR("Invalid LOV connection handle %#llx\n", - sbi->ll_dt_exp->exp_handle.h_cookie); - return; - } - obd->obd_force = 1; - - ioc_data = kzalloc(sizeof(*ioc_data), GFP_NOFS); - if (ioc_data) { - obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_md_exp, - sizeof(*ioc_data), ioc_data, NULL); - - obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_dt_exp, - sizeof(*ioc_data), ioc_data, NULL); - - kfree(ioc_data); - } - - /* Really, we'd like to wait until there are no requests outstanding, - * and then continue. For now, we just periodically checking for vfs - * to decrement mnt_cnt and hope to finish it within 10sec. 
- */ - while (cnt < 10 && !may_umount(sbi->ll_mnt.mnt)) { - schedule_timeout_uninterruptible(HZ); - cnt++; - } - - schedule(); -} - -int ll_remount_fs(struct super_block *sb, int *flags, char *data) -{ - struct ll_sb_info *sbi = ll_s2sbi(sb); - char *profilenm = get_profile_name(sb); - int err; - __u32 read_only; - - if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { - read_only = *flags & SB_RDONLY; - err = obd_set_info_async(NULL, sbi->ll_md_exp, - sizeof(KEY_READ_ONLY), - KEY_READ_ONLY, sizeof(read_only), - &read_only, NULL); - if (err) { - LCONSOLE_WARN("Failed to remount %s %s (%d)\n", - profilenm, read_only ? - "read-only" : "read-write", err); - return err; - } - - if (read_only) - sb->s_flags |= SB_RDONLY; - else - sb->s_flags &= ~SB_RDONLY; - - if (sbi->ll_flags & LL_SBI_VERBOSE) - LCONSOLE_WARN("Remounted %s %s\n", profilenm, - read_only ? "read-only" : "read-write"); - } - return 0; -} - -/** - * Cleanup the open handle that is cached on MDT-side. - * - * For open case, the client side open handling thread may hit error - * after the MDT grant the open. Under such case, the client should - * send close RPC to the MDT as cleanup; otherwise, the open handle - * on the MDT will be leaked there until the client umount or evicted. - * - * In further, if someone unlinked the file, because the open handle - * holds the reference on such file/object, then it will block the - * subsequent threads that want to locate such object via FID. 
- * - * \param[in] sb super block for this file-system - * \param[in] open_req pointer to the original open request - */ -void ll_open_cleanup(struct super_block *sb, struct ptlrpc_request *open_req) -{ - struct mdt_body *body; - struct md_op_data *op_data; - struct ptlrpc_request *close_req = NULL; - struct obd_export *exp = ll_s2sbi(sb)->ll_md_exp; - - body = req_capsule_server_get(&open_req->rq_pill, &RMF_MDT_BODY); - op_data = kzalloc(sizeof(*op_data), GFP_NOFS); - if (!op_data) - return; - - op_data->op_fid1 = body->mbo_fid1; - op_data->op_handle = body->mbo_handle; - op_data->op_mod_time = get_seconds(); - md_close(exp, op_data, NULL, &close_req); - ptlrpc_req_finished(close_req); - ll_finish_md_op_data(op_data); -} - -int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req, - struct super_block *sb, struct lookup_intent *it) -{ - struct ll_sb_info *sbi = NULL; - struct lustre_md md = { NULL }; - int rc; - - LASSERT(*inode || sb); - sbi = sb ? ll_s2sbi(sb) : ll_i2sbi(*inode); - rc = md_get_lustre_md(sbi->ll_md_exp, req, sbi->ll_dt_exp, - sbi->ll_md_exp, &md); - if (rc) - goto cleanup; - - if (*inode) { - rc = ll_update_inode(*inode, &md); - if (rc) - goto out; - } else { - LASSERT(sb); - - /* - * At this point server returns to client's same fid as client - * generated for creating. So using ->fid1 is okay here. - */ - if (!fid_is_sane(&md.body->mbo_fid1)) { - CERROR("%s: Fid is insane " DFID "\n", - ll_get_fsname(sb, NULL, 0), - PFID(&md.body->mbo_fid1)); - rc = -EINVAL; - goto out; - } - - *inode = ll_iget(sb, cl_fid_build_ino(&md.body->mbo_fid1, - sbi->ll_flags & LL_SBI_32BIT_API), - &md); - if (IS_ERR(*inode)) { -#ifdef CONFIG_FS_POSIX_ACL - if (md.posix_acl) { - posix_acl_release(md.posix_acl); - md.posix_acl = NULL; - } -#endif - rc = PTR_ERR(*inode); - CERROR("new_inode -fatal: rc %d\n", rc); - goto out; - } - } - - /* Handling piggyback layout lock. - * Layout lock can be piggybacked by getattr and open request. 
- * The lsm can be applied to inode only if it comes with a layout lock - * otherwise correct layout may be overwritten, for example: - * 1. proc1: mdt returns a lsm but not granting layout - * 2. layout was changed by another client - * 3. proc2: refresh layout and layout lock granted - * 4. proc1: to apply a stale layout - */ - if (it && it->it_lock_mode != 0) { - struct lustre_handle lockh; - struct ldlm_lock *lock; - - lockh.cookie = it->it_lock_handle; - lock = ldlm_handle2lock(&lockh); - LASSERT(lock); - if (ldlm_has_layout(lock)) { - struct cl_object_conf conf; - - memset(&conf, 0, sizeof(conf)); - conf.coc_opc = OBJECT_CONF_SET; - conf.coc_inode = *inode; - conf.coc_lock = lock; - conf.u.coc_layout = md.layout; - (void)ll_layout_conf(*inode, &conf); - } - LDLM_LOCK_PUT(lock); - } - -out: - md_free_lustre_md(sbi->ll_md_exp, &md); -cleanup: - if (rc != 0 && it && it->it_op & IT_OPEN) - ll_open_cleanup(sb ? sb : (*inode)->i_sb, req); - - return rc; -} - -int ll_obd_statfs(struct inode *inode, void __user *arg) -{ - struct ll_sb_info *sbi = NULL; - struct obd_export *exp; - char *buf = NULL; - struct obd_ioctl_data *data = NULL; - __u32 type; - int len = 0, rc; - - if (!inode) { - rc = -EINVAL; - goto out_statfs; - } - - sbi = ll_i2sbi(inode); - if (!sbi) { - rc = -EINVAL; - goto out_statfs; - } - - rc = obd_ioctl_getdata(&buf, &len, arg); - if (rc) - goto out_statfs; - - data = (void *)buf; - if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2 || - !data->ioc_pbuf1 || !data->ioc_pbuf2) { - rc = -EINVAL; - goto out_statfs; - } - - if (data->ioc_inllen1 != sizeof(__u32) || - data->ioc_inllen2 != sizeof(__u32) || - data->ioc_plen1 != sizeof(struct obd_statfs) || - data->ioc_plen2 != sizeof(struct obd_uuid)) { - rc = -EINVAL; - goto out_statfs; - } - - memcpy(&type, data->ioc_inlbuf1, sizeof(__u32)); - if (type & LL_STATFS_LMV) { - exp = sbi->ll_md_exp; - } else if (type & LL_STATFS_LOV) { - exp = sbi->ll_dt_exp; - } else { - rc = -ENODEV; - goto out_statfs; - } - - rc = 
obd_iocontrol(IOC_OBD_STATFS, exp, len, buf, NULL); - if (rc) - goto out_statfs; -out_statfs: - kvfree(buf); - return rc; -} - -int ll_process_config(struct lustre_cfg *lcfg) -{ - char *ptr; - void *sb; - struct lprocfs_static_vars lvars; - unsigned long x; - int rc = 0; - - lprocfs_llite_init_vars(&lvars); - - /* The instance name contains the sb: lustre-client-aacfe000 */ - ptr = strrchr(lustre_cfg_string(lcfg, 0), '-'); - if (!ptr || !*(++ptr)) - return -EINVAL; - rc = kstrtoul(ptr, 16, &x); - if (rc != 0) - return -EINVAL; - sb = (void *)x; - /* This better be a real Lustre superblock! */ - LASSERT(s2lsi((struct super_block *)sb)->lsi_lmd->lmd_magic == - LMD_MAGIC); - - /* Note we have not called client_common_fill_super yet, so - * proc fns must be able to handle that! - */ - rc = class_process_proc_param(PARAM_LLITE, lvars.obd_vars, - lcfg, sb); - if (rc > 0) - rc = 0; - return rc; -} - -/* this function prepares md_op_data hint for passing ot down to MD stack. */ -struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, - struct inode *i1, struct inode *i2, - const char *name, size_t namelen, - u32 mode, __u32 opc, void *data) -{ - if (!name) { - /* Do not reuse namelen for something else. 
*/ - if (namelen) - return ERR_PTR(-EINVAL); - } else { - if (namelen > ll_i2sbi(i1)->ll_namelen) - return ERR_PTR(-ENAMETOOLONG); - - if (!lu_name_is_valid_2(name, namelen)) - return ERR_PTR(-EINVAL); - } - - if (!op_data) - op_data = kzalloc(sizeof(*op_data), GFP_NOFS); - - if (!op_data) - return ERR_PTR(-ENOMEM); - - ll_i2gids(op_data->op_suppgids, i1, i2); - op_data->op_fid1 = *ll_inode2fid(i1); - op_data->op_default_stripe_offset = -1; - if (S_ISDIR(i1->i_mode)) { - op_data->op_mea1 = ll_i2info(i1)->lli_lsm_md; - if (opc == LUSTRE_OPC_MKDIR) - op_data->op_default_stripe_offset = - ll_i2info(i1)->lli_def_stripe_offset; - } - - if (i2) { - op_data->op_fid2 = *ll_inode2fid(i2); - if (S_ISDIR(i2->i_mode)) - op_data->op_mea2 = ll_i2info(i2)->lli_lsm_md; - } else { - fid_zero(&op_data->op_fid2); - } - - if (ll_i2sbi(i1)->ll_flags & LL_SBI_64BIT_HASH) - op_data->op_cli_flags |= CLI_HASH64; - - if (ll_need_32bit_api(ll_i2sbi(i1))) - op_data->op_cli_flags |= CLI_API32; - - op_data->op_name = name; - op_data->op_namelen = namelen; - op_data->op_mode = mode; - op_data->op_mod_time = ktime_get_real_seconds(); - op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); - op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); - op_data->op_cap = cfs_curproc_cap_pack(); - if ((opc == LUSTRE_OPC_CREATE) && name && - filename_is_volatile(name, namelen, &op_data->op_mds)) - op_data->op_bias |= MDS_CREATE_VOLATILE; - else - op_data->op_mds = 0; - op_data->op_data = data; - - return op_data; -} - -void ll_finish_md_op_data(struct md_op_data *op_data) -{ - kfree(op_data); -} - -int ll_show_options(struct seq_file *seq, struct dentry *dentry) -{ - struct ll_sb_info *sbi; - - LASSERT(seq && dentry); - sbi = ll_s2sbi(dentry->d_sb); - - if (sbi->ll_flags & LL_SBI_NOLCK) - seq_puts(seq, ",nolock"); - - if (sbi->ll_flags & LL_SBI_FLOCK) - seq_puts(seq, ",flock"); - - if (sbi->ll_flags & LL_SBI_LOCALFLOCK) - seq_puts(seq, ",localflock"); - - if (sbi->ll_flags & 
LL_SBI_USER_XATTR) - seq_puts(seq, ",user_xattr"); - - if (sbi->ll_flags & LL_SBI_LAZYSTATFS) - seq_puts(seq, ",lazystatfs"); - - if (sbi->ll_flags & LL_SBI_USER_FID2PATH) - seq_puts(seq, ",user_fid2path"); - - if (sbi->ll_flags & LL_SBI_ALWAYS_PING) - seq_puts(seq, ",always_ping"); - - return 0; -} - -/** - * Get obd name by cmd, and copy out to user space - */ -int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg) -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct obd_device *obd; - - if (cmd == OBD_IOC_GETDTNAME) - obd = class_exp2obd(sbi->ll_dt_exp); - else if (cmd == OBD_IOC_GETMDNAME) - obd = class_exp2obd(sbi->ll_md_exp); - else - return -EINVAL; - - if (!obd) - return -ENOENT; - - if (copy_to_user((void __user *)arg, obd->obd_name, - strlen(obd->obd_name) + 1)) - return -EFAULT; - - return 0; -} - -/** - * Get lustre file system name by \a sbi. If \a buf is provided(non-NULL), the - * fsname will be returned in this buffer; otherwise, a static buffer will be - * used to store the fsname and returned to caller. - */ -char *ll_get_fsname(struct super_block *sb, char *buf, int buflen) -{ - static char fsname_static[MTI_NAME_MAXLEN]; - struct lustre_sb_info *lsi = s2lsi(sb); - char *ptr; - int len; - - if (!buf) { - /* this means the caller wants to use static buffer - * and it doesn't care about race. 
Usually this is - * in error reporting path - */ - buf = fsname_static; - buflen = sizeof(fsname_static); - } - - len = strlen(lsi->lsi_lmd->lmd_profile); - ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-'); - if (ptr && (strcmp(ptr, "-client") == 0)) - len -= 7; - - if (unlikely(len >= buflen)) - len = buflen - 1; - strncpy(buf, lsi->lsi_lmd->lmd_profile, len); - buf[len] = '\0'; - - return buf; -} - -void ll_dirty_page_discard_warn(struct page *page, int ioret) -{ - char *buf, *path = NULL; - struct dentry *dentry = NULL; - struct vvp_object *obj = cl_inode2vvp(page->mapping->host); - - /* this can be called inside spin lock so use GFP_ATOMIC. */ - buf = (char *)__get_free_page(GFP_ATOMIC); - if (buf) { - dentry = d_find_alias(page->mapping->host); - if (dentry) - path = dentry_path_raw(dentry, buf, PAGE_SIZE); - } - - CDEBUG(D_WARNING, - "%s: dirty page discard: %s/fid: " DFID "/%s may get corrupted (rc %d)\n", - ll_get_fsname(page->mapping->host->i_sb, NULL, 0), - s2lsi(page->mapping->host->i_sb)->lsi_lmd->lmd_dev, - PFID(&obj->vob_header.coh_lu.loh_fid), - (path && !IS_ERR(path)) ? 
path : "", ioret); - - if (dentry) - dput(dentry); - - if (buf) - free_page((unsigned long)buf); -} - -ssize_t ll_copy_user_md(const struct lov_user_md __user *md, - struct lov_user_md **kbuf) -{ - struct lov_user_md lum; - ssize_t lum_size; - - if (copy_from_user(&lum, md, sizeof(lum))) { - lum_size = -EFAULT; - goto no_kbuf; - } - - lum_size = ll_lov_user_md_size(&lum); - if (lum_size < 0) - goto no_kbuf; - - *kbuf = kzalloc(lum_size, GFP_NOFS); - if (!*kbuf) { - lum_size = -ENOMEM; - goto no_kbuf; - } - - if (copy_from_user(*kbuf, md, lum_size) != 0) { - kfree(*kbuf); - *kbuf = NULL; - lum_size = -EFAULT; - } -no_kbuf: - return lum_size; -} - -/* - * Compute llite root squash state after a change of root squash - * configuration setting or add/remove of a lnet nid - */ -void ll_compute_rootsquash_state(struct ll_sb_info *sbi) -{ - struct root_squash_info *squash = &sbi->ll_squash; - struct lnet_process_id id; - bool matched; - int i; - - /* Update norootsquash flag */ - down_write(&squash->rsi_sem); - if (list_empty(&squash->rsi_nosquash_nids)) { - sbi->ll_flags &= ~LL_SBI_NOROOTSQUASH; - } else { - /* - * Do not apply root squash as soon as one of our NIDs is - * in the nosquash_nids list - */ - matched = false; - i = 0; - - while (LNetGetId(i++, &id) != -ENOENT) { - if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) - continue; - if (cfs_match_nid(id.nid, &squash->rsi_nosquash_nids)) { - matched = true; - break; - } - } - if (matched) - sbi->ll_flags |= LL_SBI_NOROOTSQUASH; - else - sbi->ll_flags &= ~LL_SBI_NOROOTSQUASH; - } - up_write(&squash->rsi_sem); -} - -/** - * Parse linkea content to extract information about a given hardlink - * - * \param[in] ldata - Initialized linkea data - * \param[in] linkno - Link identifier - * \param[out] parent_fid - The entry's parent FID - * \param[in] size - Entry name destination buffer - * - * \retval 0 on success - * \retval Appropriate negative error code on failure - */ -static int ll_linkea_decode(struct linkea_data 
*ldata, unsigned int linkno, - struct lu_fid *parent_fid, struct lu_name *ln) -{ - unsigned int idx; - int rc; - - rc = linkea_init_with_rec(ldata); - if (rc < 0) - return rc; - - if (linkno >= ldata->ld_leh->leh_reccount) - /* beyond last link */ - return -ENODATA; - - linkea_first_entry(ldata); - for (idx = 0; ldata->ld_lee; idx++) { - linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen, ln, - parent_fid); - if (idx == linkno) - break; - - linkea_next_entry(ldata); - } - - if (idx < linkno) - return -ENODATA; - - return 0; -} - -/** - * Get parent FID and name of an identified link. Operation is performed for - * a given link number, letting the caller iterate over linkno to list one or - * all links of an entry. - * - * \param[in] file - File descriptor against which to perform the operation - * \param[in,out] arg - User-filled structure containing the linkno to operate - * on and the available size. It is eventually filled - * with the requested information or left untouched on - * error - * - * \retval - 0 on success - * \retval - Appropriate negative error code on failure - */ -int ll_getparent(struct file *file, struct getparent __user *arg) -{ - struct inode *inode = file_inode(file); - struct linkea_data *ldata; - struct lu_fid parent_fid; - struct lu_buf buf = { - .lb_buf = NULL, - .lb_len = 0 - }; - struct lu_name ln; - u32 name_size; - u32 linkno; - int rc; - - if (!capable(CAP_DAC_READ_SEARCH) && - !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH)) - return -EPERM; - - if (get_user(name_size, &arg->gp_name_size)) - return -EFAULT; - - if (get_user(linkno, &arg->gp_linkno)) - return -EFAULT; - - if (name_size > PATH_MAX) - return -EINVAL; - - ldata = kzalloc(sizeof(*ldata), GFP_NOFS); - if (!ldata) - return -ENOMEM; - - rc = linkea_data_new(ldata, &buf); - if (rc < 0) - goto ldata_free; - - rc = ll_xattr_list(inode, XATTR_NAME_LINK, XATTR_TRUSTED_T, buf.lb_buf, - buf.lb_len, OBD_MD_FLXATTR); - if (rc < 0) - goto lb_free; - - rc = 
ll_linkea_decode(ldata, linkno, &parent_fid, &ln); - if (rc < 0) - goto lb_free; - - if (ln.ln_namelen >= name_size) { - rc = -EOVERFLOW; - goto lb_free; - } - - if (copy_to_user(&arg->gp_fid, &parent_fid, sizeof(arg->gp_fid))) { - rc = -EFAULT; - goto lb_free; - } - - if (copy_to_user(&arg->gp_name, ln.ln_name, ln.ln_namelen)) { - rc = -EFAULT; - goto lb_free; - } - - if (put_user('\0', arg->gp_name + ln.ln_namelen)) { - rc = -EFAULT; - goto lb_free; - } - -lb_free: - kvfree(buf.lb_buf); -ldata_free: - kfree(ldata); - return rc; -} diff --git a/drivers/staging/lustre/lustre/llite/llite_mmap.c b/drivers/staging/lustre/lustre/llite/llite_mmap.c deleted file mode 100644 index 214b07554e62..000000000000 --- a/drivers/staging/lustre/lustre/llite/llite_mmap.c +++ /dev/null @@ -1,478 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- */ - -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/stat.h> -#include <linux/errno.h> -#include <linux/unistd.h> -#include <linux/uaccess.h> - -#include <linux/fs.h> -#include <linux/pagemap.h> - -#define DEBUG_SUBSYSTEM S_LLITE - -#include "llite_internal.h" - -static const struct vm_operations_struct ll_file_vm_ops; - -void policy_from_vma(union ldlm_policy_data *policy, - struct vm_area_struct *vma, unsigned long addr, - size_t count) -{ - policy->l_extent.start = ((addr - vma->vm_start) & PAGE_MASK) + - (vma->vm_pgoff << PAGE_SHIFT); - policy->l_extent.end = (policy->l_extent.start + count - 1) | - ~PAGE_MASK; -} - -struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr, - size_t count) -{ - struct vm_area_struct *vma, *ret = NULL; - - /* mmap_sem must have been held by caller. */ - LASSERT(!down_write_trylock(&mm->mmap_sem)); - - for (vma = find_vma(mm, addr); - vma && vma->vm_start < (addr + count); vma = vma->vm_next) { - if (vma->vm_ops && vma->vm_ops == &ll_file_vm_ops && - vma->vm_flags & VM_SHARED) { - ret = vma; - break; - } - } - return ret; -} - -/** - * API independent part for page fault initialization. - * \param vma - virtual memory area addressed to page fault - * \param env - corespondent lu_env to processing - * \param index - page index corespondent to fault. - * \parm ra_flags - vma readahead flags. - * - * \return error codes from cl_io_init. 
- */ -static struct cl_io * -ll_fault_io_init(struct lu_env *env, struct vm_area_struct *vma, - pgoff_t index, unsigned long *ra_flags) -{ - struct file *file = vma->vm_file; - struct inode *inode = file_inode(file); - struct cl_io *io; - struct cl_fault_io *fio; - int rc; - - if (ll_file_nolock(file)) - return ERR_PTR(-EOPNOTSUPP); - -restart: - io = vvp_env_thread_io(env); - io->ci_obj = ll_i2info(inode)->lli_clob; - LASSERT(io->ci_obj); - - fio = &io->u.ci_fault; - fio->ft_index = index; - fio->ft_executable = vma->vm_flags & VM_EXEC; - - /* - * disable VM_SEQ_READ and use VM_RAND_READ to make sure that - * the kernel will not read other pages not covered by ldlm in - * filemap_nopage. we do our readahead in ll_readpage. - */ - if (ra_flags) - *ra_flags = vma->vm_flags & (VM_RAND_READ | VM_SEQ_READ); - vma->vm_flags &= ~VM_SEQ_READ; - vma->vm_flags |= VM_RAND_READ; - - CDEBUG(D_MMAP, "vm_flags: %lx (%lu %d)\n", vma->vm_flags, - fio->ft_index, fio->ft_executable); - - rc = cl_io_init(env, io, CIT_FAULT, io->ci_obj); - if (rc == 0) { - struct vvp_io *vio = vvp_env_io(env); - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - - LASSERT(vio->vui_cl.cis_io == io); - - /* mmap lock must be MANDATORY it has to cache pages. 
*/ - io->ci_lockreq = CILR_MANDATORY; - vio->vui_fd = fd; - } else { - LASSERT(rc < 0); - cl_io_fini(env, io); - if (io->ci_need_restart) - goto restart; - - io = ERR_PTR(rc); - } - - return io; -} - -/* Sharing code of page_mkwrite method for rhel5 and rhel6 */ -static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage, - bool *retry) -{ - struct lu_env *env; - struct cl_io *io; - struct vvp_io *vio; - int result; - u16 refcheck; - sigset_t set; - struct inode *inode; - struct ll_inode_info *lli; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - io = ll_fault_io_init(env, vma, vmpage->index, NULL); - if (IS_ERR(io)) { - result = PTR_ERR(io); - goto out; - } - - result = io->ci_result; - if (result < 0) - goto out_io; - - io->u.ci_fault.ft_mkwrite = 1; - io->u.ci_fault.ft_writable = 1; - - vio = vvp_env_io(env); - vio->u.fault.ft_vma = vma; - vio->u.fault.ft_vmpage = vmpage; - - cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM), &set); - - inode = vvp_object_inode(io->ci_obj); - lli = ll_i2info(inode); - - result = cl_io_loop(env, io); - - cfs_restore_sigs(&set); - - if (result == 0) { - struct inode *inode = file_inode(vma->vm_file); - struct ll_inode_info *lli = ll_i2info(inode); - - lock_page(vmpage); - if (!vmpage->mapping) { - unlock_page(vmpage); - - /* page was truncated and lock was cancelled, return - * ENODATA so that VM_FAULT_NOPAGE will be returned - * to handle_mm_fault(). - */ - if (result == 0) - result = -ENODATA; - } else if (!PageDirty(vmpage)) { - /* race, the page has been cleaned by ptlrpcd after - * it was unlocked, it has to be added into dirty - * cache again otherwise this soon-to-dirty page won't - * consume any grants, even worse if this page is being - * transferred because it will break RPC checksum. 
- */ - unlock_page(vmpage); - - CDEBUG(D_MMAP, - "Race on page_mkwrite %p/%lu, page has been written out, retry.\n", - vmpage, vmpage->index); - - *retry = true; - result = -EAGAIN; - } - - if (!result) - set_bit(LLIF_DATA_MODIFIED, &lli->lli_flags); - } - -out_io: - cl_io_fini(env, io); -out: - cl_env_put(env, &refcheck); - CDEBUG(D_MMAP, "%s mkwrite with %d\n", current->comm, result); - LASSERT(ergo(result == 0, PageLocked(vmpage))); - - return result; -} - -static inline int to_fault_error(int result) -{ - switch (result) { - case 0: - result = VM_FAULT_LOCKED; - break; - case -EFAULT: - result = VM_FAULT_NOPAGE; - break; - case -ENOMEM: - result = VM_FAULT_OOM; - break; - default: - result = VM_FAULT_SIGBUS; - break; - } - return result; -} - -/** - * Lustre implementation of a vm_operations_struct::fault() method, called by - * VM to server page fault (both in kernel and user space). - * - * \param vma - is virtual area struct related to page fault - * \param vmf - structure which describe type and address where hit fault - * - * \return allocated and filled _locked_ page for address - * \retval VM_FAULT_ERROR on general error - * \retval NOPAGE_OOM not have memory for allocate new page - */ -static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct lu_env *env; - struct cl_io *io; - struct vvp_io *vio = NULL; - struct page *vmpage; - unsigned long ra_flags; - int result = 0; - int fault_ret = 0; - u16 refcheck; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - io = ll_fault_io_init(env, vma, vmf->pgoff, &ra_flags); - if (IS_ERR(io)) { - result = to_fault_error(PTR_ERR(io)); - goto out; - } - - result = io->ci_result; - if (result == 0) { - vio = vvp_env_io(env); - vio->u.fault.ft_vma = vma; - vio->u.fault.ft_vmpage = NULL; - vio->u.fault.ft_vmf = vmf; - vio->u.fault.ft_flags = 0; - vio->u.fault.ft_flags_valid = false; - - /* May call ll_readpage() */ - ll_cl_add(vma->vm_file, env, io); - - result = 
cl_io_loop(env, io); - - ll_cl_remove(vma->vm_file, env); - - /* ft_flags are only valid if we reached - * the call to filemap_fault - */ - if (vio->u.fault.ft_flags_valid) - fault_ret = vio->u.fault.ft_flags; - - vmpage = vio->u.fault.ft_vmpage; - if (result != 0 && vmpage) { - put_page(vmpage); - vmf->page = NULL; - } - } - cl_io_fini(env, io); - - vma->vm_flags |= ra_flags; - -out: - cl_env_put(env, &refcheck); - if (result != 0 && !(fault_ret & VM_FAULT_RETRY)) - fault_ret |= to_fault_error(result); - - CDEBUG(D_MMAP, "%s fault %d/%d\n", current->comm, fault_ret, result); - return fault_ret; -} - -static int ll_fault(struct vm_fault *vmf) -{ - int count = 0; - bool printed = false; - int result; - sigset_t set; - - /* Only SIGKILL and SIGTERM are allowed for fault/nopage/mkwrite - * so that it can be killed by admin but not cause segfault by - * other signals. - */ - cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM), &set); - -restart: - result = ll_fault0(vmf->vma, vmf); - LASSERT(!(result & VM_FAULT_LOCKED)); - if (result == 0) { - struct page *vmpage = vmf->page; - - /* check if this page has been truncated */ - lock_page(vmpage); - if (unlikely(!vmpage->mapping)) { /* unlucky */ - unlock_page(vmpage); - put_page(vmpage); - vmf->page = NULL; - - if (!printed && ++count > 16) { - CWARN("the page is under heavy contention, maybe your app(%s) needs revising :-)\n", - current->comm); - printed = true; - } - - goto restart; - } - - result = VM_FAULT_LOCKED; - } - cfs_restore_sigs(&set); - return result; -} - -static int ll_page_mkwrite(struct vm_fault *vmf) -{ - struct vm_area_struct *vma = vmf->vma; - int count = 0; - bool printed = false; - bool retry; - int result; - - file_update_time(vma->vm_file); - do { - retry = false; - result = ll_page_mkwrite0(vma, vmf->page, &retry); - - if (!printed && ++count > 16) { - const struct dentry *de = vma->vm_file->f_path.dentry; - - CWARN("app(%s): the page %lu of file " DFID " is under heavy contention\n", - 
current->comm, vmf->pgoff, - PFID(ll_inode2fid(de->d_inode))); - printed = true; - } - } while (retry); - - switch (result) { - case 0: - LASSERT(PageLocked(vmf->page)); - result = VM_FAULT_LOCKED; - break; - case -ENODATA: - case -EAGAIN: - case -EFAULT: - result = VM_FAULT_NOPAGE; - break; - case -ENOMEM: - result = VM_FAULT_OOM; - break; - default: - result = VM_FAULT_SIGBUS; - break; - } - - return result; -} - -/** - * To avoid cancel the locks covering mmapped region for lock cache pressure, - * we track the mapped vma count in vvp_object::vob_mmap_cnt. - */ -static void ll_vm_open(struct vm_area_struct *vma) -{ - struct inode *inode = file_inode(vma->vm_file); - struct vvp_object *vob = cl_inode2vvp(inode); - - LASSERT(atomic_read(&vob->vob_mmap_cnt) >= 0); - atomic_inc(&vob->vob_mmap_cnt); -} - -/** - * Dual to ll_vm_open(). - */ -static void ll_vm_close(struct vm_area_struct *vma) -{ - struct inode *inode = file_inode(vma->vm_file); - struct vvp_object *vob = cl_inode2vvp(inode); - - atomic_dec(&vob->vob_mmap_cnt); - LASSERT(atomic_read(&vob->vob_mmap_cnt) >= 0); -} - -/* XXX put nice comment here. 
talk about __free_pte -> dirty pages and - * nopage's reference passing to the pte - */ -int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last) -{ - int rc = -ENOENT; - - LASSERTF(last > first, "last %llu first %llu\n", last, first); - if (mapping_mapped(mapping)) { - rc = 0; - unmap_mapping_range(mapping, first + PAGE_SIZE - 1, - last - first + 1, 0); - } - - return rc; -} - -static const struct vm_operations_struct ll_file_vm_ops = { - .fault = ll_fault, - .page_mkwrite = ll_page_mkwrite, - .open = ll_vm_open, - .close = ll_vm_close, -}; - -int ll_file_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct inode *inode = file_inode(file); - int rc; - - if (ll_file_nolock(file)) - return -EOPNOTSUPP; - - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_MAP, 1); - rc = generic_file_mmap(file, vma); - if (rc == 0) { - vma->vm_ops = &ll_file_vm_ops; - vma->vm_ops->open(vma); - /* update the inode's size and mtime */ - rc = ll_glimpse_size(inode); - } - - return rc; -} diff --git a/drivers/staging/lustre/lustre/llite/llite_nfs.c b/drivers/staging/lustre/lustre/llite/llite_nfs.c deleted file mode 100644 index a6a1d80c711a..000000000000 --- a/drivers/staging/lustre/lustre/llite/llite_nfs.c +++ /dev/null @@ -1,375 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/lustre/llite/llite_nfs.c - * - * NFS export of Lustre Light File System - * - * Author: Yury Umanets <umka@clusterfs.com> - * Author: Huang Hua <huanghua@clusterfs.com> - */ - -#define DEBUG_SUBSYSTEM S_LLITE -#include "llite_internal.h" -#include <linux/exportfs.h> - -__u32 get_uuid2int(const char *name, int len) -{ - __u32 key0 = 0x12a3fe2d, key1 = 0x37abe8f9; - - while (len--) { - __u32 key = key1 + (key0 ^ (*name++ * 7152373)); - - if (key & 0x80000000) - key -= 0x7fffffff; - key1 = key0; - key0 = key; - } - return (key0 << 1); -} - -void get_uuid2fsid(const char *name, int len, __kernel_fsid_t *fsid) -{ - __u64 key = 0, key0 = 0x12a3fe2d, key1 = 0x37abe8f9; - - while (len--) { - key = key1 + (key0 ^ (*name++ * 7152373)); - if (key & 0x8000000000000000ULL) - key -= 0x7fffffffffffffffULL; - key1 = key0; - key0 = key; - } - - fsid->val[0] = key; - fsid->val[1] = key >> 32; -} - -struct inode *search_inode_for_lustre(struct super_block *sb, - const struct lu_fid *fid) -{ - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct ptlrpc_request *req = NULL; - struct inode *inode = NULL; - int eadatalen = 0; - unsigned long hash = cl_fid_build_ino(fid, - ll_need_32bit_api(sbi)); - struct md_op_data *op_data; - int rc; - - CDEBUG(D_INFO, "searching inode for:(%lu," DFID ")\n", hash, PFID(fid)); - - inode = ilookup5(sb, hash, ll_test_inode_by_fid, (void *)fid); - if (inode) - return inode; - - rc = ll_get_default_mdsize(sbi, &eadatalen); - if (rc) - return ERR_PTR(rc); - - /* 
Because inode is NULL, ll_prep_md_op_data can not - * be used here. So we allocate op_data ourselves - */ - op_data = kzalloc(sizeof(*op_data), GFP_NOFS); - if (!op_data) - return ERR_PTR(-ENOMEM); - - op_data->op_fid1 = *fid; - op_data->op_mode = eadatalen; - op_data->op_valid = OBD_MD_FLEASIZE; - - /* mds_fid2dentry ignores f_type */ - rc = md_getattr(sbi->ll_md_exp, op_data, &req); - kfree(op_data); - if (rc) { - CDEBUG(D_INFO, "can't get object attrs, fid " DFID ", rc %d\n", - PFID(fid), rc); - return ERR_PTR(rc); - } - rc = ll_prep_inode(&inode, req, sb, NULL); - ptlrpc_req_finished(req); - if (rc) - return ERR_PTR(rc); - - return inode; -} - -struct lustre_nfs_fid { - struct lu_fid lnf_child; - struct lu_fid lnf_parent; -}; - -static struct dentry * -ll_iget_for_nfs(struct super_block *sb, - struct lu_fid *fid, struct lu_fid *parent) -{ - struct inode *inode; - struct dentry *result; - - if (!fid_is_sane(fid)) - return ERR_PTR(-ESTALE); - - CDEBUG(D_INFO, "Get dentry for fid: " DFID "\n", PFID(fid)); - - inode = search_inode_for_lustre(sb, fid); - if (IS_ERR(inode)) - return ERR_CAST(inode); - - if (is_bad_inode(inode)) { - /* we didn't find the right inode.. */ - iput(inode); - return ERR_PTR(-ESTALE); - } - - result = d_obtain_alias(inode); - if (IS_ERR(result)) { - iput(inode); - return result; - } - - /** - * In case d_obtain_alias() found a disconnected dentry, always update - * lli_pfid to allow later operation (normally open) have parent fid, - * which may be used by MDS to create data. - */ - if (parent) { - struct ll_inode_info *lli = ll_i2info(inode); - - spin_lock(&lli->lli_lock); - lli->lli_pfid = *parent; - spin_unlock(&lli->lli_lock); - } - - /* N.B. 
d_obtain_alias() drops inode ref on error */ - result = d_obtain_alias(inode); - if (!IS_ERR(result)) { - /* - * Need to signal to the ll_intent_file_open that - * we came from NFS and so opencache needs to be - * enabled for this one - */ - ll_d2d(result)->lld_nfs_dentry = 1; - } - - return result; -} - -/** - * \a connectable - is nfsd will connect himself or this should be done - * at lustre - * - * The return value is file handle type: - * 1 -- contains child file handle; - * 2 -- contains child file handle and parent file handle; - * 255 -- error. - */ -static int ll_encode_fh(struct inode *inode, __u32 *fh, int *plen, - struct inode *parent) -{ - int fileid_len = sizeof(struct lustre_nfs_fid) / 4; - struct lustre_nfs_fid *nfs_fid = (void *)fh; - - CDEBUG(D_INFO, "%s: encoding for (" DFID ") maxlen=%d minlen=%d\n", - ll_get_fsname(inode->i_sb, NULL, 0), - PFID(ll_inode2fid(inode)), *plen, fileid_len); - - if (*plen < fileid_len) { - *plen = fileid_len; - return FILEID_INVALID; - } - - nfs_fid->lnf_child = *ll_inode2fid(inode); - if (parent) - nfs_fid->lnf_parent = *ll_inode2fid(parent); - else - fid_zero(&nfs_fid->lnf_parent); - *plen = fileid_len; - - return FILEID_LUSTRE; -} - -static int ll_nfs_get_name_filldir(struct dir_context *ctx, const char *name, - int namelen, loff_t hash, u64 ino, - unsigned int type) -{ - /* It is hack to access lde_fid for comparison with lgd_fid. - * So the input 'name' must be part of the 'lu_dirent'. 
- */ - struct lu_dirent *lde = container_of0(name, struct lu_dirent, lde_name); - struct ll_getname_data *lgd = - container_of(ctx, struct ll_getname_data, ctx); - struct lu_fid fid; - - fid_le_to_cpu(&fid, &lde->lde_fid); - if (lu_fid_eq(&fid, &lgd->lgd_fid)) { - memcpy(lgd->lgd_name, name, namelen); - lgd->lgd_name[namelen] = 0; - lgd->lgd_found = 1; - } - return lgd->lgd_found; -} - -static int ll_get_name(struct dentry *dentry, char *name, - struct dentry *child) -{ - struct inode *dir = d_inode(dentry); - int rc; - struct ll_getname_data lgd = { - .lgd_name = name, - .lgd_fid = ll_i2info(d_inode(child))->lli_fid, - .ctx.actor = ll_nfs_get_name_filldir, - }; - struct md_op_data *op_data; - __u64 pos = 0; - - if (!dir || !S_ISDIR(dir->i_mode)) { - rc = -ENOTDIR; - goto out; - } - - if (!dir->i_fop) { - rc = -EINVAL; - goto out; - } - - op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0, - LUSTRE_OPC_ANY, dir); - if (IS_ERR(op_data)) { - rc = PTR_ERR(op_data); - goto out; - } - - op_data->op_max_pages = ll_i2sbi(dir)->ll_md_brw_pages; - inode_lock(dir); - rc = ll_dir_read(dir, &pos, op_data, &lgd.ctx); - inode_unlock(dir); - ll_finish_md_op_data(op_data); - if (!rc && !lgd.lgd_found) - rc = -ENOENT; -out: - return rc; -} - -static struct dentry *ll_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) -{ - struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid; - - if (fh_type != FILEID_LUSTRE) - return ERR_PTR(-EPROTO); - - return ll_iget_for_nfs(sb, &nfs_fid->lnf_child, &nfs_fid->lnf_parent); -} - -static struct dentry *ll_fh_to_parent(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) -{ - struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid; - - if (fh_type != FILEID_LUSTRE) - return ERR_PTR(-EPROTO); - - return ll_iget_for_nfs(sb, &nfs_fid->lnf_parent, NULL); -} - -int ll_dir_get_parent_fid(struct inode *dir, struct lu_fid *parent_fid) -{ - struct ptlrpc_request *req = NULL; - struct ll_sb_info 
*sbi; - struct mdt_body *body; - static const char dotdot[] = ".."; - struct md_op_data *op_data; - int rc; - int lmmsize; - - LASSERT(dir && S_ISDIR(dir->i_mode)); - - sbi = ll_s2sbi(dir->i_sb); - - CDEBUG(D_INFO, "%s: getting parent for (" DFID ")\n", - ll_get_fsname(dir->i_sb, NULL, 0), - PFID(ll_inode2fid(dir))); - - rc = ll_get_default_mdsize(sbi, &lmmsize); - if (rc != 0) - return rc; - - op_data = ll_prep_md_op_data(NULL, dir, NULL, dotdot, - strlen(dotdot), lmmsize, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - rc = md_getattr_name(sbi->ll_md_exp, op_data, &req); - ll_finish_md_op_data(op_data); - if (rc) { - CERROR("%s: failure inode " DFID " get parent: rc = %d\n", - ll_get_fsname(dir->i_sb, NULL, 0), - PFID(ll_inode2fid(dir)), rc); - return rc; - } - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - /* - * LU-3952: MDT may lost the FID of its parent, we should not crash - * the NFS server, ll_iget_for_nfs() will handle the error. - */ - if (body->mbo_valid & OBD_MD_FLID) { - CDEBUG(D_INFO, "parent for " DFID " is " DFID "\n", - PFID(ll_inode2fid(dir)), PFID(&body->mbo_fid1)); - *parent_fid = body->mbo_fid1; - } - - ptlrpc_req_finished(req); - return 0; -} - -static struct dentry *ll_get_parent(struct dentry *dchild) -{ - struct lu_fid parent_fid = { 0 }; - struct dentry *dentry; - int rc; - - rc = ll_dir_get_parent_fid(dchild->d_inode, &parent_fid); - if (rc) - return ERR_PTR(rc); - - dentry = ll_iget_for_nfs(dchild->d_inode->i_sb, &parent_fid, NULL); - - return dentry; -} - -const struct export_operations lustre_export_operations = { - .get_parent = ll_get_parent, - .encode_fh = ll_encode_fh, - .get_name = ll_get_name, - .fh_to_dentry = ll_fh_to_dentry, - .fh_to_parent = ll_fh_to_parent, -}; diff --git a/drivers/staging/lustre/lustre/llite/lproc_llite.c b/drivers/staging/lustre/lustre/llite/lproc_llite.c deleted file mode 100644 index 644bea2f9d37..000000000000 --- 
a/drivers/staging/lustre/lustre/llite/lproc_llite.c +++ /dev/null @@ -1,1684 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- */ -#define DEBUG_SUBSYSTEM S_LLITE - -#include <lprocfs_status.h> -#include <linux/seq_file.h> -#include <obd_support.h> - -#include "llite_internal.h" -#include "vvp_internal.h" - -/* debugfs llite mount point registration */ -static const struct file_operations ll_rw_extents_stats_fops; -static const struct file_operations ll_rw_extents_stats_pp_fops; -static const struct file_operations ll_rw_offset_stats_fops; - -static ssize_t blocksize_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - struct obd_statfs osfs; - int rc; - - rc = ll_statfs_internal(sbi->ll_sb, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); - if (!rc) - return sprintf(buf, "%u\n", osfs.os_bsize); - - return rc; -} -LUSTRE_RO_ATTR(blocksize); - -static ssize_t kbytestotal_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - struct obd_statfs osfs; - int rc; - - rc = ll_statfs_internal(sbi->ll_sb, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); - if (!rc) { - __u32 blk_size = osfs.os_bsize >> 10; - __u64 result = osfs.os_blocks; - - while (blk_size >>= 1) - result <<= 1; - - rc = sprintf(buf, "%llu\n", result); - } - - return rc; -} -LUSTRE_RO_ATTR(kbytestotal); - -static ssize_t kbytesfree_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - struct obd_statfs osfs; - int rc; - - rc = ll_statfs_internal(sbi->ll_sb, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); - if (!rc) { - __u32 blk_size = osfs.os_bsize >> 10; - __u64 result = osfs.os_bfree; - - while (blk_size >>= 1) - result <<= 1; - - rc = sprintf(buf, "%llu\n", result); - } - - return rc; -} -LUSTRE_RO_ATTR(kbytesfree); - -static ssize_t kbytesavail_show(struct kobject 
*kobj, struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - struct obd_statfs osfs; - int rc; - - rc = ll_statfs_internal(sbi->ll_sb, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); - if (!rc) { - __u32 blk_size = osfs.os_bsize >> 10; - __u64 result = osfs.os_bavail; - - while (blk_size >>= 1) - result <<= 1; - - rc = sprintf(buf, "%llu\n", result); - } - - return rc; -} -LUSTRE_RO_ATTR(kbytesavail); - -static ssize_t filestotal_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - struct obd_statfs osfs; - int rc; - - rc = ll_statfs_internal(sbi->ll_sb, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); - if (!rc) - return sprintf(buf, "%llu\n", osfs.os_files); - - return rc; -} -LUSTRE_RO_ATTR(filestotal); - -static ssize_t filesfree_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - struct obd_statfs osfs; - int rc; - - rc = ll_statfs_internal(sbi->ll_sb, &osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), - OBD_STATFS_NODELAY); - if (!rc) - return sprintf(buf, "%llu\n", osfs.os_ffree); - - return rc; -} -LUSTRE_RO_ATTR(filesfree); - -static ssize_t client_type_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - return sprintf(buf, "local client\n"); -} -LUSTRE_RO_ATTR(client_type); - -static ssize_t fstype_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - - return sprintf(buf, "%s\n", sbi->ll_sb->s_type->name); -} -LUSTRE_RO_ATTR(fstype); - -static ssize_t uuid_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - - return sprintf(buf, "%s\n", 
sbi->ll_sb_uuid.uuid); -} -LUSTRE_RO_ATTR(uuid); - -static int ll_site_stats_seq_show(struct seq_file *m, void *v) -{ - struct super_block *sb = m->private; - - /* - * See description of statistical counters in struct cl_site, and - * struct lu_site. - */ - return cl_site_stats_print(lu2cl_site(ll_s2sbi(sb)->ll_site), m); -} - -LPROC_SEQ_FOPS_RO(ll_site_stats); - -static ssize_t max_read_ahead_mb_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - long pages_number; - int mult; - - spin_lock(&sbi->ll_lock); - pages_number = sbi->ll_ra_info.ra_max_pages; - spin_unlock(&sbi->ll_lock); - - mult = 1 << (20 - PAGE_SHIFT); - return lprocfs_read_frac_helper(buf, PAGE_SIZE, pages_number, mult); -} - -static ssize_t max_read_ahead_mb_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - int rc; - unsigned long pages_number; - - rc = kstrtoul(buffer, 10, &pages_number); - if (rc) - return rc; - - pages_number *= 1 << (20 - PAGE_SHIFT); /* MB -> pages */ - - if (pages_number > totalram_pages / 2) { - CERROR("can't set file readahead more than %lu MB\n", - totalram_pages >> (20 - PAGE_SHIFT + 1)); /*1/2 of RAM*/ - return -ERANGE; - } - - spin_lock(&sbi->ll_lock); - sbi->ll_ra_info.ra_max_pages = pages_number; - spin_unlock(&sbi->ll_lock); - - return count; -} -LUSTRE_RW_ATTR(max_read_ahead_mb); - -static ssize_t max_read_ahead_per_file_mb_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - long pages_number; - int mult; - - spin_lock(&sbi->ll_lock); - pages_number = sbi->ll_ra_info.ra_max_pages_per_file; - spin_unlock(&sbi->ll_lock); - - mult = 1 << (20 - PAGE_SHIFT); - return lprocfs_read_frac_helper(buf, PAGE_SIZE, pages_number, mult); -} - -static ssize_t 
max_read_ahead_per_file_mb_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - int rc; - unsigned long pages_number; - - rc = kstrtoul(buffer, 10, &pages_number); - if (rc) - return rc; - - if (pages_number > sbi->ll_ra_info.ra_max_pages) { - CERROR("can't set file readahead more than max_read_ahead_mb %lu MB\n", - sbi->ll_ra_info.ra_max_pages); - return -ERANGE; - } - - spin_lock(&sbi->ll_lock); - sbi->ll_ra_info.ra_max_pages_per_file = pages_number; - spin_unlock(&sbi->ll_lock); - - return count; -} -LUSTRE_RW_ATTR(max_read_ahead_per_file_mb); - -static ssize_t max_read_ahead_whole_mb_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - long pages_number; - int mult; - - spin_lock(&sbi->ll_lock); - pages_number = sbi->ll_ra_info.ra_max_read_ahead_whole_pages; - spin_unlock(&sbi->ll_lock); - - mult = 1 << (20 - PAGE_SHIFT); - return lprocfs_read_frac_helper(buf, PAGE_SIZE, pages_number, mult); -} - -static ssize_t max_read_ahead_whole_mb_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - int rc; - unsigned long pages_number; - - rc = kstrtoul(buffer, 10, &pages_number); - if (rc) - return rc; - - /* Cap this at the current max readahead window size, the readahead - * algorithm does this anyway so it's pointless to set it larger. 
- */ - if (pages_number > sbi->ll_ra_info.ra_max_pages_per_file) { - CERROR("can't set max_read_ahead_whole_mb more than max_read_ahead_per_file_mb: %lu\n", - sbi->ll_ra_info.ra_max_pages_per_file >> (20 - PAGE_SHIFT)); - return -ERANGE; - } - - spin_lock(&sbi->ll_lock); - sbi->ll_ra_info.ra_max_read_ahead_whole_pages = pages_number; - spin_unlock(&sbi->ll_lock); - - return count; -} -LUSTRE_RW_ATTR(max_read_ahead_whole_mb); - -static int ll_max_cached_mb_seq_show(struct seq_file *m, void *v) -{ - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct cl_client_cache *cache = sbi->ll_cache; - int shift = 20 - PAGE_SHIFT; - long max_cached_mb; - long unused_mb; - - max_cached_mb = cache->ccc_lru_max >> shift; - unused_mb = atomic_long_read(&cache->ccc_lru_left) >> shift; - seq_printf(m, - "users: %d\n" - "max_cached_mb: %ld\n" - "used_mb: %ld\n" - "unused_mb: %ld\n" - "reclaim_count: %u\n", - atomic_read(&cache->ccc_users), - max_cached_mb, - max_cached_mb - unused_mb, - unused_mb, - cache->ccc_lru_shrinkers); - return 0; -} - -static ssize_t ll_max_cached_mb_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) -{ - struct super_block *sb = ((struct seq_file *)file->private_data)->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct cl_client_cache *cache = sbi->ll_cache; - struct lu_env *env; - long diff = 0; - long nrpages = 0; - u16 refcheck; - long pages_number; - int mult; - long rc; - u64 val; - char kernbuf[128]; - - if (count >= sizeof(kernbuf)) - return -EINVAL; - - if (copy_from_user(kernbuf, buffer, count)) - return -EFAULT; - kernbuf[count] = 0; - - mult = 1 << (20 - PAGE_SHIFT); - buffer += lprocfs_find_named_value(kernbuf, "max_cached_mb:", &count) - - kernbuf; - rc = lprocfs_write_frac_u64_helper(buffer, count, &val, mult); - if (rc) - return rc; - - if (val > LONG_MAX) - return -ERANGE; - pages_number = (long)val; - - if (pages_number < 0 || pages_number > totalram_pages) { - 
CERROR("%s: can't set max cache more than %lu MB\n", - ll_get_fsname(sb, NULL, 0), - totalram_pages >> (20 - PAGE_SHIFT)); - return -ERANGE; - } - - spin_lock(&sbi->ll_lock); - diff = pages_number - cache->ccc_lru_max; - spin_unlock(&sbi->ll_lock); - - /* easy - add more LRU slots. */ - if (diff >= 0) { - atomic_long_add(diff, &cache->ccc_lru_left); - rc = 0; - goto out; - } - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return 0; - - diff = -diff; - while (diff > 0) { - long tmp; - - /* reduce LRU budget from free slots. */ - do { - long ov, nv; - - ov = atomic_long_read(&cache->ccc_lru_left); - if (ov == 0) - break; - - nv = ov > diff ? ov - diff : 0; - rc = atomic_long_cmpxchg(&cache->ccc_lru_left, ov, nv); - if (likely(ov == rc)) { - diff -= ov - nv; - nrpages += ov - nv; - break; - } - } while (1); - - if (diff <= 0) - break; - - if (!sbi->ll_dt_exp) { /* being initialized */ - rc = 0; - goto out; - } - - /* difficult - have to ask OSCs to drop LRU slots. */ - tmp = diff << 1; - rc = obd_set_info_async(env, sbi->ll_dt_exp, - sizeof(KEY_CACHE_LRU_SHRINK), - KEY_CACHE_LRU_SHRINK, - sizeof(tmp), &tmp, NULL); - if (rc < 0) - break; - } - cl_env_put(env, &refcheck); - -out: - if (rc >= 0) { - spin_lock(&sbi->ll_lock); - cache->ccc_lru_max = pages_number; - spin_unlock(&sbi->ll_lock); - rc = count; - } else { - atomic_long_add(nrpages, &cache->ccc_lru_left); - } - return rc; -} - -LPROC_SEQ_FOPS(ll_max_cached_mb); - -static ssize_t checksum_pages_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - - return sprintf(buf, "%u\n", (sbi->ll_flags & LL_SBI_CHECKSUM) ? 
1 : 0); -} - -static ssize_t checksum_pages_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - int rc; - unsigned long val; - - if (!sbi->ll_dt_exp) - /* Not set up yet */ - return -EAGAIN; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - if (val) - sbi->ll_flags |= LL_SBI_CHECKSUM; - else - sbi->ll_flags &= ~LL_SBI_CHECKSUM; - - rc = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM), - KEY_CHECKSUM, sizeof(val), &val, NULL); - if (rc) - CWARN("Failed to set OSC checksum flags: %d\n", rc); - - return count; -} -LUSTRE_RW_ATTR(checksum_pages); - -static ssize_t ll_rd_track_id(struct kobject *kobj, char *buf, - enum stats_track_type type) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - - if (sbi->ll_stats_track_type == type) - return sprintf(buf, "%d\n", sbi->ll_stats_track_id); - else if (sbi->ll_stats_track_type == STATS_TRACK_ALL) - return sprintf(buf, "0 (all)\n"); - else - return sprintf(buf, "untracked\n"); -} - -static ssize_t ll_wr_track_id(struct kobject *kobj, const char *buffer, - size_t count, - enum stats_track_type type) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - int rc; - unsigned long pid; - - rc = kstrtoul(buffer, 10, &pid); - if (rc) - return rc; - sbi->ll_stats_track_id = pid; - if (pid == 0) - sbi->ll_stats_track_type = STATS_TRACK_ALL; - else - sbi->ll_stats_track_type = type; - lprocfs_clear_stats(sbi->ll_stats); - return count; -} - -static ssize_t stats_track_pid_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - return ll_rd_track_id(kobj, buf, STATS_TRACK_PID); -} - -static ssize_t stats_track_pid_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - return ll_wr_track_id(kobj, buffer, count, STATS_TRACK_PID); -} -LUSTRE_RW_ATTR(stats_track_pid); - -static 
ssize_t stats_track_ppid_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - return ll_rd_track_id(kobj, buf, STATS_TRACK_PPID); -} - -static ssize_t stats_track_ppid_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - return ll_wr_track_id(kobj, buffer, count, STATS_TRACK_PPID); -} -LUSTRE_RW_ATTR(stats_track_ppid); - -static ssize_t stats_track_gid_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - return ll_rd_track_id(kobj, buf, STATS_TRACK_GID); -} - -static ssize_t stats_track_gid_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - return ll_wr_track_id(kobj, buffer, count, STATS_TRACK_GID); -} -LUSTRE_RW_ATTR(stats_track_gid); - -static ssize_t statahead_max_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - - return sprintf(buf, "%u\n", sbi->ll_sa_max); -} - -static ssize_t statahead_max_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - int rc; - unsigned long val; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - - if (val <= LL_SA_RPC_MAX) - sbi->ll_sa_max = val; - else - CERROR("Bad statahead_max value %lu. Valid values are in the range [0, %d]\n", - val, LL_SA_RPC_MAX); - - return count; -} -LUSTRE_RW_ATTR(statahead_max); - -static ssize_t statahead_agl_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - - return sprintf(buf, "%u\n", sbi->ll_flags & LL_SBI_AGL_ENABLED ? 
1 : 0); -} - -static ssize_t statahead_agl_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - int rc; - unsigned long val; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - - if (val) - sbi->ll_flags |= LL_SBI_AGL_ENABLED; - else - sbi->ll_flags &= ~LL_SBI_AGL_ENABLED; - - return count; -} -LUSTRE_RW_ATTR(statahead_agl); - -static int ll_statahead_stats_seq_show(struct seq_file *m, void *v) -{ - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); - - seq_printf(m, - "statahead total: %u\n" - "statahead wrong: %u\n" - "agl total: %u\n", - atomic_read(&sbi->ll_sa_total), - atomic_read(&sbi->ll_sa_wrong), - atomic_read(&sbi->ll_agl_total)); - return 0; -} - -LPROC_SEQ_FOPS_RO(ll_statahead_stats); - -static ssize_t lazystatfs_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - - return sprintf(buf, "%u\n", sbi->ll_flags & LL_SBI_LAZYSTATFS ? 1 : 0); -} - -static ssize_t lazystatfs_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - int rc; - unsigned long val; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - - if (val) - sbi->ll_flags |= LL_SBI_LAZYSTATFS; - else - sbi->ll_flags &= ~LL_SBI_LAZYSTATFS; - - return count; -} -LUSTRE_RW_ATTR(lazystatfs); - -static ssize_t max_easize_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - unsigned int ealen; - int rc; - - rc = ll_get_max_mdsize(sbi, &ealen); - if (rc) - return rc; - - return sprintf(buf, "%u\n", ealen); -} -LUSTRE_RO_ATTR(max_easize); - -/** - * Get default_easize. 
- * - * \see client_obd::cl_default_mds_easize - * - * \param[in] kobj kernel object for sysfs tree - * \param[in] attr attribute of this kernel object - * \param[in] buf buffer to write data into - * - * \retval positive \a count on success - * \retval negative negated errno on failure - */ -static ssize_t default_easize_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - unsigned int ealen; - int rc; - - rc = ll_get_default_mdsize(sbi, &ealen); - if (rc) - return rc; - - return sprintf(buf, "%u\n", ealen); -} - -/** - * Set default_easize. - * - * Range checking on the passed value is handled by - * ll_set_default_mdsize(). - * - * \see client_obd::cl_default_mds_easize - * - * \param[in] kobj kernel object for sysfs tree - * \param[in] attr attribute of this kernel object - * \param[in] buffer string passed from user space - * \param[in] count \a buffer length - * - * \retval positive \a count on success - * \retval negative negated errno on failure - */ -static ssize_t default_easize_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - unsigned long val; - int rc; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - - rc = ll_set_default_mdsize(sbi, val); - if (rc) - return rc; - - return count; -} -LUSTRE_RW_ATTR(default_easize); - -static int ll_sbi_flags_seq_show(struct seq_file *m, void *v) -{ - const char *str[] = LL_SBI_FLAGS; - struct super_block *sb = m->private; - int flags = ll_s2sbi(sb)->ll_flags; - int i = 0; - - while (flags != 0) { - if (ARRAY_SIZE(str) <= i) { - CERROR("%s: Revise array LL_SBI_FLAGS to match sbi flags please.\n", - ll_get_fsname(sb, NULL, 0)); - return -EINVAL; - } - - if (flags & 0x1) - seq_printf(m, "%s ", str[i]); - flags >>= 1; - ++i; - } - seq_puts(m, "\b\n"); - return 0; -} - 
-LPROC_SEQ_FOPS_RO(ll_sbi_flags); - -static ssize_t xattr_cache_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - - return sprintf(buf, "%u\n", sbi->ll_xattr_cache_enabled); -} - -static ssize_t xattr_cache_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - int rc; - unsigned long val; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - - if (val != 0 && val != 1) - return -ERANGE; - - if (val == 1 && !(sbi->ll_flags & LL_SBI_XATTR_CACHE)) - return -ENOTSUPP; - - sbi->ll_xattr_cache_enabled = val; - - return count; -} -LUSTRE_RW_ATTR(xattr_cache); - -static int ll_unstable_stats_seq_show(struct seq_file *m, void *v) -{ - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct cl_client_cache *cache = sbi->ll_cache; - long pages; - int mb; - - pages = atomic_long_read(&cache->ccc_unstable_nr); - mb = (pages * PAGE_SIZE) >> 20; - - seq_printf(m, - "unstable_check: %8d\n" - "unstable_pages: %12ld\n" - "unstable_mb: %8d\n", - cache->ccc_unstable_check, pages, mb); - - return 0; -} - -static ssize_t ll_unstable_stats_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) -{ - struct super_block *sb = ((struct seq_file *)file->private_data)->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); - char kernbuf[128]; - int val, rc; - - if (!count) - return 0; - if (count >= sizeof(kernbuf)) - return -EINVAL; - - if (copy_from_user(kernbuf, buffer, count)) - return -EFAULT; - kernbuf[count] = 0; - - buffer += lprocfs_find_named_value(kernbuf, "unstable_check:", &count) - - kernbuf; - rc = lprocfs_write_helper(buffer, count, &val); - if (rc < 0) - return rc; - - /* borrow lru lock to set the value */ - spin_lock(&sbi->ll_cache->ccc_lru_lock); - sbi->ll_cache->ccc_unstable_check = !!val; - 
spin_unlock(&sbi->ll_cache->ccc_lru_lock); - - return count; -} -LPROC_SEQ_FOPS(ll_unstable_stats); - -static int ll_root_squash_seq_show(struct seq_file *m, void *v) -{ - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct root_squash_info *squash = &sbi->ll_squash; - - seq_printf(m, "%u:%u\n", squash->rsi_uid, squash->rsi_gid); - return 0; -} - -static ssize_t ll_root_squash_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) -{ - struct seq_file *m = file->private_data; - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct root_squash_info *squash = &sbi->ll_squash; - - return lprocfs_wr_root_squash(buffer, count, squash, - ll_get_fsname(sb, NULL, 0)); -} -LPROC_SEQ_FOPS(ll_root_squash); - -static int ll_nosquash_nids_seq_show(struct seq_file *m, void *v) -{ - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct root_squash_info *squash = &sbi->ll_squash; - int len; - - down_read(&squash->rsi_sem); - if (!list_empty(&squash->rsi_nosquash_nids)) { - len = cfs_print_nidlist(m->buf + m->count, m->size - m->count, - &squash->rsi_nosquash_nids); - m->count += len; - seq_puts(m, "\n"); - } else { - seq_puts(m, "NONE\n"); - } - up_read(&squash->rsi_sem); - - return 0; -} - -static ssize_t ll_nosquash_nids_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) -{ - struct seq_file *m = file->private_data; - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct root_squash_info *squash = &sbi->ll_squash; - int rc; - - rc = lprocfs_wr_nosquash_nids(buffer, count, squash, - ll_get_fsname(sb, NULL, 0)); - if (rc < 0) - return rc; - - ll_compute_rootsquash_state(sbi); - - return rc; -} - -LPROC_SEQ_FOPS(ll_nosquash_nids); - -static struct lprocfs_vars lprocfs_llite_obd_vars[] = { - /* { "mntpt_path", ll_rd_path, 0, 0 }, */ - { "site", &ll_site_stats_fops, NULL, 0 }, - /* { 
"filegroups", lprocfs_rd_filegroups, 0, 0 }, */ - { "max_cached_mb", &ll_max_cached_mb_fops, NULL }, - { "statahead_stats", &ll_statahead_stats_fops, NULL, 0 }, - { "unstable_stats", &ll_unstable_stats_fops, NULL }, - { "sbi_flags", &ll_sbi_flags_fops, NULL, 0 }, - { .name = "root_squash", - .fops = &ll_root_squash_fops }, - { .name = "nosquash_nids", - .fops = &ll_nosquash_nids_fops }, - { NULL } -}; - -#define MAX_STRING_SIZE 128 - -static struct attribute *llite_attrs[] = { - &lustre_attr_blocksize.attr, - &lustre_attr_kbytestotal.attr, - &lustre_attr_kbytesfree.attr, - &lustre_attr_kbytesavail.attr, - &lustre_attr_filestotal.attr, - &lustre_attr_filesfree.attr, - &lustre_attr_client_type.attr, - &lustre_attr_fstype.attr, - &lustre_attr_uuid.attr, - &lustre_attr_max_read_ahead_mb.attr, - &lustre_attr_max_read_ahead_per_file_mb.attr, - &lustre_attr_max_read_ahead_whole_mb.attr, - &lustre_attr_checksum_pages.attr, - &lustre_attr_stats_track_pid.attr, - &lustre_attr_stats_track_ppid.attr, - &lustre_attr_stats_track_gid.attr, - &lustre_attr_statahead_max.attr, - &lustre_attr_statahead_agl.attr, - &lustre_attr_lazystatfs.attr, - &lustre_attr_max_easize.attr, - &lustre_attr_default_easize.attr, - &lustre_attr_xattr_cache.attr, - NULL, -}; - -static void llite_sb_release(struct kobject *kobj) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - complete(&sbi->ll_kobj_unregister); -} - -static struct kobj_type llite_ktype = { - .default_attrs = llite_attrs, - .sysfs_ops = &lustre_sysfs_ops, - .release = llite_sb_release, -}; - -static const struct llite_file_opcode { - __u32 opcode; - __u32 type; - const char *opname; -} llite_opcode_table[LPROC_LL_FILE_OPCODES] = { - /* file operation */ - { LPROC_LL_DIRTY_HITS, LPROCFS_TYPE_REGS, "dirty_pages_hits" }, - { LPROC_LL_DIRTY_MISSES, LPROCFS_TYPE_REGS, "dirty_pages_misses" }, - { LPROC_LL_READ_BYTES, LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_BYTES, - "read_bytes" }, - { LPROC_LL_WRITE_BYTES, 
LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_BYTES, - "write_bytes" }, - { LPROC_LL_BRW_READ, LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_PAGES, - "brw_read" }, - { LPROC_LL_BRW_WRITE, LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_PAGES, - "brw_write" }, - { LPROC_LL_IOCTL, LPROCFS_TYPE_REGS, "ioctl" }, - { LPROC_LL_OPEN, LPROCFS_TYPE_REGS, "open" }, - { LPROC_LL_RELEASE, LPROCFS_TYPE_REGS, "close" }, - { LPROC_LL_MAP, LPROCFS_TYPE_REGS, "mmap" }, - { LPROC_LL_LLSEEK, LPROCFS_TYPE_REGS, "seek" }, - { LPROC_LL_FSYNC, LPROCFS_TYPE_REGS, "fsync" }, - { LPROC_LL_READDIR, LPROCFS_TYPE_REGS, "readdir" }, - /* inode operation */ - { LPROC_LL_SETATTR, LPROCFS_TYPE_REGS, "setattr" }, - { LPROC_LL_TRUNC, LPROCFS_TYPE_REGS, "truncate" }, - { LPROC_LL_FLOCK, LPROCFS_TYPE_REGS, "flock" }, - { LPROC_LL_GETATTR, LPROCFS_TYPE_REGS, "getattr" }, - /* dir inode operation */ - { LPROC_LL_CREATE, LPROCFS_TYPE_REGS, "create" }, - { LPROC_LL_LINK, LPROCFS_TYPE_REGS, "link" }, - { LPROC_LL_UNLINK, LPROCFS_TYPE_REGS, "unlink" }, - { LPROC_LL_SYMLINK, LPROCFS_TYPE_REGS, "symlink" }, - { LPROC_LL_MKDIR, LPROCFS_TYPE_REGS, "mkdir" }, - { LPROC_LL_RMDIR, LPROCFS_TYPE_REGS, "rmdir" }, - { LPROC_LL_MKNOD, LPROCFS_TYPE_REGS, "mknod" }, - { LPROC_LL_RENAME, LPROCFS_TYPE_REGS, "rename" }, - /* special inode operation */ - { LPROC_LL_STAFS, LPROCFS_TYPE_REGS, "statfs" }, - { LPROC_LL_ALLOC_INODE, LPROCFS_TYPE_REGS, "alloc_inode" }, - { LPROC_LL_SETXATTR, LPROCFS_TYPE_REGS, "setxattr" }, - { LPROC_LL_GETXATTR, LPROCFS_TYPE_REGS, "getxattr" }, - { LPROC_LL_GETXATTR_HITS, LPROCFS_TYPE_REGS, "getxattr_hits" }, - { LPROC_LL_LISTXATTR, LPROCFS_TYPE_REGS, "listxattr" }, - { LPROC_LL_REMOVEXATTR, LPROCFS_TYPE_REGS, "removexattr" }, - { LPROC_LL_INODE_PERM, LPROCFS_TYPE_REGS, "inode_permission" }, -}; - -void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count) -{ - if (!sbi->ll_stats) - return; - if (sbi->ll_stats_track_type == STATS_TRACK_ALL) - lprocfs_counter_add(sbi->ll_stats, op, count); - else if 
(sbi->ll_stats_track_type == STATS_TRACK_PID && - sbi->ll_stats_track_id == current->pid) - lprocfs_counter_add(sbi->ll_stats, op, count); - else if (sbi->ll_stats_track_type == STATS_TRACK_PPID && - sbi->ll_stats_track_id == current->real_parent->pid) - lprocfs_counter_add(sbi->ll_stats, op, count); - else if (sbi->ll_stats_track_type == STATS_TRACK_GID && - sbi->ll_stats_track_id == - from_kgid(&init_user_ns, current_gid())) - lprocfs_counter_add(sbi->ll_stats, op, count); -} -EXPORT_SYMBOL(ll_stats_ops_tally); - -static const char *ra_stat_string[] = { - [RA_STAT_HIT] = "hits", - [RA_STAT_MISS] = "misses", - [RA_STAT_DISTANT_READPAGE] = "readpage not consecutive", - [RA_STAT_MISS_IN_WINDOW] = "miss inside window", - [RA_STAT_FAILED_GRAB_PAGE] = "failed grab_cache_page", - [RA_STAT_FAILED_MATCH] = "failed lock match", - [RA_STAT_DISCARDED] = "read but discarded", - [RA_STAT_ZERO_LEN] = "zero length file", - [RA_STAT_ZERO_WINDOW] = "zero size window", - [RA_STAT_EOF] = "read-ahead to EOF", - [RA_STAT_MAX_IN_FLIGHT] = "hit max r-a issue", - [RA_STAT_WRONG_GRAB_PAGE] = "wrong page from grab_cache_page", - [RA_STAT_FAILED_REACH_END] = "failed to reach end" -}; - -int ldebugfs_register_mountpoint(struct dentry *parent, - struct super_block *sb, char *osc, char *mdc) -{ - struct lustre_sb_info *lsi = s2lsi(sb); - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct obd_device *obd; - struct dentry *dir; - char name[MAX_STRING_SIZE + 1], *ptr; - int err, id, len, rc; - - name[MAX_STRING_SIZE] = '\0'; - - LASSERT(sbi); - LASSERT(mdc); - LASSERT(osc); - - /* Get fsname */ - len = strlen(lsi->lsi_lmd->lmd_profile); - ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-'); - if (ptr && (strcmp(ptr, "-client") == 0)) - len -= 7; - - /* Mount info */ - snprintf(name, MAX_STRING_SIZE, "%.*s-%p", len, - lsi->lsi_lmd->lmd_profile, sb); - - dir = ldebugfs_register(name, parent, NULL, NULL); - if (IS_ERR_OR_NULL(dir)) { - err = dir ? 
PTR_ERR(dir) : -ENOMEM; - sbi->ll_debugfs_entry = NULL; - return err; - } - sbi->ll_debugfs_entry = dir; - - rc = ldebugfs_seq_create(sbi->ll_debugfs_entry, "dump_page_cache", 0444, - &vvp_dump_pgcache_file_ops, sbi); - if (rc) - CWARN("Error adding the dump_page_cache file\n"); - - rc = ldebugfs_seq_create(sbi->ll_debugfs_entry, "extents_stats", 0644, - &ll_rw_extents_stats_fops, sbi); - if (rc) - CWARN("Error adding the extent_stats file\n"); - - rc = ldebugfs_seq_create(sbi->ll_debugfs_entry, - "extents_stats_per_process", - 0644, &ll_rw_extents_stats_pp_fops, sbi); - if (rc) - CWARN("Error adding the extents_stats_per_process file\n"); - - rc = ldebugfs_seq_create(sbi->ll_debugfs_entry, "offset_stats", 0644, - &ll_rw_offset_stats_fops, sbi); - if (rc) - CWARN("Error adding the offset_stats file\n"); - - /* File operations stats */ - sbi->ll_stats = lprocfs_alloc_stats(LPROC_LL_FILE_OPCODES, - LPROCFS_STATS_FLAG_NONE); - if (!sbi->ll_stats) { - err = -ENOMEM; - goto out; - } - /* do counter init */ - for (id = 0; id < LPROC_LL_FILE_OPCODES; id++) { - __u32 type = llite_opcode_table[id].type; - void *ptr = NULL; - - if (type & LPROCFS_TYPE_REGS) - ptr = "regs"; - else if (type & LPROCFS_TYPE_BYTES) - ptr = "bytes"; - else if (type & LPROCFS_TYPE_PAGES) - ptr = "pages"; - lprocfs_counter_init(sbi->ll_stats, - llite_opcode_table[id].opcode, - (type & LPROCFS_CNTR_AVGMINMAX), - llite_opcode_table[id].opname, ptr); - } - err = ldebugfs_register_stats(sbi->ll_debugfs_entry, "stats", - sbi->ll_stats); - if (err) - goto out; - - sbi->ll_ra_stats = lprocfs_alloc_stats(ARRAY_SIZE(ra_stat_string), - LPROCFS_STATS_FLAG_NONE); - if (!sbi->ll_ra_stats) { - err = -ENOMEM; - goto out; - } - - for (id = 0; id < ARRAY_SIZE(ra_stat_string); id++) - lprocfs_counter_init(sbi->ll_ra_stats, id, 0, - ra_stat_string[id], "pages"); - - err = ldebugfs_register_stats(sbi->ll_debugfs_entry, "read_ahead_stats", - sbi->ll_ra_stats); - if (err) - goto out; - - err = 
ldebugfs_add_vars(sbi->ll_debugfs_entry, - lprocfs_llite_obd_vars, sb); - if (err) - goto out; - - sbi->ll_kobj.kset = llite_kset; - init_completion(&sbi->ll_kobj_unregister); - err = kobject_init_and_add(&sbi->ll_kobj, &llite_ktype, NULL, - "%s", name); - if (err) - goto out; - - /* MDC info */ - obd = class_name2obd(mdc); - - err = sysfs_create_link(&sbi->ll_kobj, &obd->obd_kobj, - obd->obd_type->typ_name); - if (err) - goto out; - - /* OSC */ - obd = class_name2obd(osc); - - err = sysfs_create_link(&sbi->ll_kobj, &obd->obd_kobj, - obd->obd_type->typ_name); -out: - if (err) { - ldebugfs_remove(&sbi->ll_debugfs_entry); - lprocfs_free_stats(&sbi->ll_ra_stats); - lprocfs_free_stats(&sbi->ll_stats); - } - return err; -} - -void ldebugfs_unregister_mountpoint(struct ll_sb_info *sbi) -{ - if (sbi->ll_debugfs_entry) { - ldebugfs_remove(&sbi->ll_debugfs_entry); - kobject_put(&sbi->ll_kobj); - wait_for_completion(&sbi->ll_kobj_unregister); - lprocfs_free_stats(&sbi->ll_ra_stats); - lprocfs_free_stats(&sbi->ll_stats); - } -} - -#undef MAX_STRING_SIZE - -#define pct(a, b) (b ? a * 100 / b : 0) - -static void ll_display_extents_info(struct ll_rw_extents_info *io_extents, - struct seq_file *seq, int which) -{ - unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum; - unsigned long start, end, r, w; - char *unitp = "KMGTPEZY"; - int i, units = 10; - struct per_process_info *pp_info = &io_extents->pp_extents[which]; - - read_cum = 0; - write_cum = 0; - start = 0; - - for (i = 0; i < LL_HIST_MAX; i++) { - read_tot += pp_info->pp_r_hist.oh_buckets[i]; - write_tot += pp_info->pp_w_hist.oh_buckets[i]; - } - - for (i = 0; i < LL_HIST_MAX; i++) { - r = pp_info->pp_r_hist.oh_buckets[i]; - w = pp_info->pp_w_hist.oh_buckets[i]; - read_cum += r; - write_cum += w; - end = 1 << (i + LL_HIST_START - units); - seq_printf(seq, "%4lu%c - %4lu%c%c: %14lu %4lu %4lu | %14lu %4lu %4lu\n", - start, *unitp, end, *unitp, - (i == LL_HIST_MAX - 1) ? 
'+' : ' ', - r, pct(r, read_tot), pct(read_cum, read_tot), - w, pct(w, write_tot), pct(write_cum, write_tot)); - start = end; - if (start == 1024) { - start = 1; - units += 10; - unitp++; - } - if (read_cum == read_tot && write_cum == write_tot) - break; - } -} - -static int ll_rw_extents_stats_pp_seq_show(struct seq_file *seq, void *v) -{ - struct timespec64 now; - struct ll_sb_info *sbi = seq->private; - struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; - int k; - - ktime_get_real_ts64(&now); - - if (!sbi->ll_rw_stats_on) { - seq_printf(seq, "disabled\n" - "write anything in this file to activate, then 0 or \"[D/d]isabled\" to deactivate\n"); - return 0; - } - seq_printf(seq, "snapshot_time: %llu.%09lu (secs.usecs)\n", - (s64)now.tv_sec, (unsigned long)now.tv_nsec); - seq_printf(seq, "%15s %19s | %20s\n", " ", "read", "write"); - seq_printf(seq, "%13s %14s %4s %4s | %14s %4s %4s\n", - "extents", "calls", "%", "cum%", - "calls", "%", "cum%"); - spin_lock(&sbi->ll_pp_extent_lock); - for (k = 0; k < LL_PROCESS_HIST_MAX; k++) { - if (io_extents->pp_extents[k].pid != 0) { - seq_printf(seq, "\nPID: %d\n", - io_extents->pp_extents[k].pid); - ll_display_extents_info(io_extents, seq, k); - } - } - spin_unlock(&sbi->ll_pp_extent_lock); - return 0; -} - -static ssize_t ll_rw_extents_stats_pp_seq_write(struct file *file, - const char __user *buf, - size_t len, - loff_t *off) -{ - struct seq_file *seq = file->private_data; - struct ll_sb_info *sbi = seq->private; - struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; - int i; - int value = 1, rc = 0; - - if (len == 0) - return -EINVAL; - - rc = lprocfs_write_helper(buf, len, &value); - if (rc < 0 && len < 16) { - char kernbuf[16]; - - if (copy_from_user(kernbuf, buf, len)) - return -EFAULT; - kernbuf[len] = 0; - - if (kernbuf[len - 1] == '\n') - kernbuf[len - 1] = 0; - - if (strcmp(kernbuf, "disabled") == 0 || - strcmp(kernbuf, "Disabled") == 0) - value = 0; - } - - if (value == 0) - 
sbi->ll_rw_stats_on = 0; - else - sbi->ll_rw_stats_on = 1; - - spin_lock(&sbi->ll_pp_extent_lock); - for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { - io_extents->pp_extents[i].pid = 0; - lprocfs_oh_clear(&io_extents->pp_extents[i].pp_r_hist); - lprocfs_oh_clear(&io_extents->pp_extents[i].pp_w_hist); - } - spin_unlock(&sbi->ll_pp_extent_lock); - return len; -} - -LPROC_SEQ_FOPS(ll_rw_extents_stats_pp); - -static int ll_rw_extents_stats_seq_show(struct seq_file *seq, void *v) -{ - struct timespec64 now; - struct ll_sb_info *sbi = seq->private; - struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; - - ktime_get_real_ts64(&now); - - if (!sbi->ll_rw_stats_on) { - seq_printf(seq, "disabled\n" - "write anything in this file to activate, then 0 or \"[D/d]isabled\" to deactivate\n"); - return 0; - } - seq_printf(seq, "snapshot_time: %llu.%09lu (secs.usecs)\n", - (u64)now.tv_sec, (unsigned long)now.tv_nsec); - - seq_printf(seq, "%15s %19s | %20s\n", " ", "read", "write"); - seq_printf(seq, "%13s %14s %4s %4s | %14s %4s %4s\n", - "extents", "calls", "%", "cum%", - "calls", "%", "cum%"); - spin_lock(&sbi->ll_lock); - ll_display_extents_info(io_extents, seq, LL_PROCESS_HIST_MAX); - spin_unlock(&sbi->ll_lock); - - return 0; -} - -static ssize_t ll_rw_extents_stats_seq_write(struct file *file, - const char __user *buf, - size_t len, loff_t *off) -{ - struct seq_file *seq = file->private_data; - struct ll_sb_info *sbi = seq->private; - struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; - int i; - int value = 1, rc = 0; - - if (len == 0) - return -EINVAL; - - rc = lprocfs_write_helper(buf, len, &value); - if (rc < 0 && len < 16) { - char kernbuf[16]; - - if (copy_from_user(kernbuf, buf, len)) - return -EFAULT; - kernbuf[len] = 0; - - if (kernbuf[len - 1] == '\n') - kernbuf[len - 1] = 0; - - if (strcmp(kernbuf, "disabled") == 0 || - strcmp(kernbuf, "Disabled") == 0) - value = 0; - } - - if (value == 0) - sbi->ll_rw_stats_on = 0; - else - sbi->ll_rw_stats_on 
= 1; - - spin_lock(&sbi->ll_pp_extent_lock); - for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) { - io_extents->pp_extents[i].pid = 0; - lprocfs_oh_clear(&io_extents->pp_extents[i].pp_r_hist); - lprocfs_oh_clear(&io_extents->pp_extents[i].pp_w_hist); - } - spin_unlock(&sbi->ll_pp_extent_lock); - - return len; -} - -LPROC_SEQ_FOPS(ll_rw_extents_stats); - -void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, - struct ll_file_data *file, loff_t pos, - size_t count, int rw) -{ - int i, cur = -1; - struct ll_rw_process_info *process; - struct ll_rw_process_info *offset; - int *off_count = &sbi->ll_rw_offset_entry_count; - int *process_count = &sbi->ll_offset_process_count; - struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; - - if (!sbi->ll_rw_stats_on) - return; - process = sbi->ll_rw_process_info; - offset = sbi->ll_rw_offset_info; - - spin_lock(&sbi->ll_pp_extent_lock); - /* Extent statistics */ - for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { - if (io_extents->pp_extents[i].pid == pid) { - cur = i; - break; - } - } - - if (cur == -1) { - /* new process */ - sbi->ll_extent_process_count = - (sbi->ll_extent_process_count + 1) % LL_PROCESS_HIST_MAX; - cur = sbi->ll_extent_process_count; - io_extents->pp_extents[cur].pid = pid; - lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_r_hist); - lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_w_hist); - } - - for (i = 0; (count >= (1 << LL_HIST_START << i)) && - (i < (LL_HIST_MAX - 1)); i++) - ; - if (rw == 0) { - io_extents->pp_extents[cur].pp_r_hist.oh_buckets[i]++; - io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_r_hist.oh_buckets[i]++; - } else { - io_extents->pp_extents[cur].pp_w_hist.oh_buckets[i]++; - io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_w_hist.oh_buckets[i]++; - } - spin_unlock(&sbi->ll_pp_extent_lock); - - spin_lock(&sbi->ll_process_lock); - /* Offset statistics */ - for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { - if (process[i].rw_pid == pid) { - if (process[i].rw_last_file != file) { - 
process[i].rw_range_start = pos; - process[i].rw_last_file_pos = pos + count; - process[i].rw_smallest_extent = count; - process[i].rw_largest_extent = count; - process[i].rw_offset = 0; - process[i].rw_last_file = file; - spin_unlock(&sbi->ll_process_lock); - return; - } - if (process[i].rw_last_file_pos != pos) { - *off_count = - (*off_count + 1) % LL_OFFSET_HIST_MAX; - offset[*off_count].rw_op = process[i].rw_op; - offset[*off_count].rw_pid = pid; - offset[*off_count].rw_range_start = - process[i].rw_range_start; - offset[*off_count].rw_range_end = - process[i].rw_last_file_pos; - offset[*off_count].rw_smallest_extent = - process[i].rw_smallest_extent; - offset[*off_count].rw_largest_extent = - process[i].rw_largest_extent; - offset[*off_count].rw_offset = - process[i].rw_offset; - process[i].rw_op = rw; - process[i].rw_range_start = pos; - process[i].rw_smallest_extent = count; - process[i].rw_largest_extent = count; - process[i].rw_offset = pos - - process[i].rw_last_file_pos; - } - if (process[i].rw_smallest_extent > count) - process[i].rw_smallest_extent = count; - if (process[i].rw_largest_extent < count) - process[i].rw_largest_extent = count; - process[i].rw_last_file_pos = pos + count; - spin_unlock(&sbi->ll_process_lock); - return; - } - } - *process_count = (*process_count + 1) % LL_PROCESS_HIST_MAX; - process[*process_count].rw_pid = pid; - process[*process_count].rw_op = rw; - process[*process_count].rw_range_start = pos; - process[*process_count].rw_last_file_pos = pos + count; - process[*process_count].rw_smallest_extent = count; - process[*process_count].rw_largest_extent = count; - process[*process_count].rw_offset = 0; - process[*process_count].rw_last_file = file; - spin_unlock(&sbi->ll_process_lock); -} - -static int ll_rw_offset_stats_seq_show(struct seq_file *seq, void *v) -{ - struct timespec64 now; - struct ll_sb_info *sbi = seq->private; - struct ll_rw_process_info *offset = sbi->ll_rw_offset_info; - struct ll_rw_process_info *process = 
sbi->ll_rw_process_info; - int i; - - ktime_get_real_ts64(&now); - - if (!sbi->ll_rw_stats_on) { - seq_printf(seq, "disabled\n" - "write anything in this file to activate, then 0 or \"[D/d]isabled\" to deactivate\n"); - return 0; - } - spin_lock(&sbi->ll_process_lock); - - seq_printf(seq, "snapshot_time: %llu.%09lu (secs.usecs)\n", - (s64)now.tv_sec, (unsigned long)now.tv_nsec); - seq_printf(seq, "%3s %10s %14s %14s %17s %17s %14s\n", - "R/W", "PID", "RANGE START", "RANGE END", - "SMALLEST EXTENT", "LARGEST EXTENT", "OFFSET"); - /* We stored the discontiguous offsets here; print them first */ - for (i = 0; i < LL_OFFSET_HIST_MAX; i++) { - if (offset[i].rw_pid != 0) - seq_printf(seq, - "%3c %10d %14Lu %14Lu %17lu %17lu %14Lu", - offset[i].rw_op == READ ? 'R' : 'W', - offset[i].rw_pid, - offset[i].rw_range_start, - offset[i].rw_range_end, - (unsigned long)offset[i].rw_smallest_extent, - (unsigned long)offset[i].rw_largest_extent, - offset[i].rw_offset); - } - /* Then print the current offsets for each process */ - for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { - if (process[i].rw_pid != 0) - seq_printf(seq, - "%3c %10d %14Lu %14Lu %17lu %17lu %14Lu", - process[i].rw_op == READ ? 
'R' : 'W', - process[i].rw_pid, - process[i].rw_range_start, - process[i].rw_last_file_pos, - (unsigned long)process[i].rw_smallest_extent, - (unsigned long)process[i].rw_largest_extent, - process[i].rw_offset); - } - spin_unlock(&sbi->ll_process_lock); - - return 0; -} - -static ssize_t ll_rw_offset_stats_seq_write(struct file *file, - const char __user *buf, - size_t len, loff_t *off) -{ - struct seq_file *seq = file->private_data; - struct ll_sb_info *sbi = seq->private; - struct ll_rw_process_info *process_info = sbi->ll_rw_process_info; - struct ll_rw_process_info *offset_info = sbi->ll_rw_offset_info; - int value = 1, rc = 0; - - if (len == 0) - return -EINVAL; - - rc = lprocfs_write_helper(buf, len, &value); - - if (rc < 0 && len < 16) { - char kernbuf[16]; - - if (copy_from_user(kernbuf, buf, len)) - return -EFAULT; - kernbuf[len] = 0; - - if (kernbuf[len - 1] == '\n') - kernbuf[len - 1] = 0; - - if (strcmp(kernbuf, "disabled") == 0 || - strcmp(kernbuf, "Disabled") == 0) - value = 0; - } - - if (value == 0) - sbi->ll_rw_stats_on = 0; - else - sbi->ll_rw_stats_on = 1; - - spin_lock(&sbi->ll_process_lock); - sbi->ll_offset_process_count = 0; - sbi->ll_rw_offset_entry_count = 0; - memset(process_info, 0, sizeof(struct ll_rw_process_info) * - LL_PROCESS_HIST_MAX); - memset(offset_info, 0, sizeof(struct ll_rw_process_info) * - LL_OFFSET_HIST_MAX); - spin_unlock(&sbi->ll_process_lock); - - return len; -} - -LPROC_SEQ_FOPS(ll_rw_offset_stats); - -void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars) -{ - lvars->obd_vars = lprocfs_llite_obd_vars; -} diff --git a/drivers/staging/lustre/lustre/llite/namei.c b/drivers/staging/lustre/lustre/llite/namei.c deleted file mode 100644 index 6c9ec462eb41..000000000000 --- a/drivers/staging/lustre/lustre/llite/namei.c +++ /dev/null @@ -1,1202 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- */ - -#include <linux/fs.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/quotaops.h> -#include <linux/highmem.h> -#include <linux/pagemap.h> -#include <linux/security.h> - -#define DEBUG_SUBSYSTEM S_LLITE - -#include <obd_support.h> -#include <lustre_fid.h> -#include <lustre_dlm.h> -#include "llite_internal.h" - -static int ll_create_it(struct inode *dir, struct dentry *dentry, - struct lookup_intent *it); - -/* called from iget5_locked->find_inode() under inode_hash_lock spinlock */ -static int ll_test_inode(struct inode *inode, void *opaque) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct lustre_md *md = opaque; - - if (unlikely(!(md->body->mbo_valid & OBD_MD_FLID))) { - CERROR("MDS body missing FID\n"); - return 0; - } - - if (!lu_fid_eq(&lli->lli_fid, &md->body->mbo_fid1)) - return 0; - - return 1; -} - -static int ll_set_inode(struct inode *inode, void *opaque) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct mdt_body *body = ((struct lustre_md *)opaque)->body; - - if (unlikely(!(body->mbo_valid & OBD_MD_FLID))) { - CERROR("MDS body missing FID\n"); - return -EINVAL; - } - - lli->lli_fid = body->mbo_fid1; - if (unlikely(!(body->mbo_valid & OBD_MD_FLTYPE))) { - CERROR("Can not initialize inode " DFID - " without object type: valid = %#llx\n", - PFID(&lli->lli_fid), body->mbo_valid); - return -EINVAL; - } - - inode->i_mode = (inode->i_mode & ~S_IFMT) | (body->mbo_mode & S_IFMT); - if (unlikely(inode->i_mode == 0)) { - CERROR("Invalid inode " DFID " type\n", PFID(&lli->lli_fid)); - return -EINVAL; - } - - ll_lli_init(lli); - - return 0; -} - -/** - * Get an inode by inode number(@hash), which is already instantiated by - * the intent lookup). 
- */ -struct inode *ll_iget(struct super_block *sb, ino_t hash, - struct lustre_md *md) -{ - struct inode *inode; - int rc = 0; - - LASSERT(hash != 0); - inode = iget5_locked(sb, hash, ll_test_inode, ll_set_inode, md); - if (!inode) - return ERR_PTR(-ENOMEM); - - if (inode->i_state & I_NEW) { - rc = ll_read_inode2(inode, md); - if (!rc && S_ISREG(inode->i_mode) && - !ll_i2info(inode)->lli_clob) - rc = cl_file_inode_init(inode, md); - - if (rc) { - /* - * Let's clear directory lsm here, otherwise - * make_bad_inode() will reset the inode mode - * to regular, then ll_clear_inode will not - * be able to clear lsm_md - */ - if (S_ISDIR(inode->i_mode)) - ll_dir_clear_lsm_md(inode); - make_bad_inode(inode); - unlock_new_inode(inode); - iput(inode); - inode = ERR_PTR(rc); - } else { - unlock_new_inode(inode); - } - } else if (!(inode->i_state & (I_FREEING | I_CLEAR))) { - rc = ll_update_inode(inode, md); - CDEBUG(D_VFSTRACE, "got inode: " DFID "(%p): rc = %d\n", - PFID(&md->body->mbo_fid1), inode, rc); - if (rc) { - if (S_ISDIR(inode->i_mode)) - ll_dir_clear_lsm_md(inode); - iput(inode); - inode = ERR_PTR(rc); - } - } - return inode; -} - -static void ll_invalidate_negative_children(struct inode *dir) -{ - struct dentry *dentry, *tmp_subdir; - - spin_lock(&dir->i_lock); - hlist_for_each_entry(dentry, &dir->i_dentry, d_u.d_alias) { - spin_lock(&dentry->d_lock); - if (!list_empty(&dentry->d_subdirs)) { - struct dentry *child; - - list_for_each_entry_safe(child, tmp_subdir, - &dentry->d_subdirs, - d_child) { - if (d_really_is_negative(child)) - d_lustre_invalidate(child, 1); - } - } - spin_unlock(&dentry->d_lock); - } - spin_unlock(&dir->i_lock); -} - -int ll_test_inode_by_fid(struct inode *inode, void *opaque) -{ - return lu_fid_eq(&ll_i2info(inode)->lli_fid, opaque); -} - -int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, - void *data, int flag) -{ - struct lustre_handle lockh; - int rc; - - switch (flag) { - case LDLM_CB_BLOCKING: - 
ldlm_lock2handle(lock, &lockh); - rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); - if (rc < 0) { - CDEBUG(D_INODE, "ldlm_cli_cancel: rc = %d\n", rc); - return rc; - } - break; - case LDLM_CB_CANCELING: { - struct inode *inode = ll_inode_from_resource_lock(lock); - __u64 bits = lock->l_policy_data.l_inodebits.bits; - - /* Inode is set to lock->l_resource->lr_lvb_inode - * for mdc - bug 24555 - */ - LASSERT(!lock->l_ast_data); - - if (!inode) - break; - - /* Invalidate all dentries associated with this inode */ - LASSERT(ldlm_is_canceling(lock)); - - if (!fid_res_name_eq(ll_inode2fid(inode), - &lock->l_resource->lr_name)) { - LDLM_ERROR(lock, - "data mismatch with object " DFID "(%p)", - PFID(ll_inode2fid(inode)), inode); - LBUG(); - } - - if (bits & MDS_INODELOCK_XATTR) { - if (S_ISDIR(inode->i_mode)) - ll_i2info(inode)->lli_def_stripe_offset = -1; - ll_xattr_cache_destroy(inode); - bits &= ~MDS_INODELOCK_XATTR; - } - - /* For OPEN locks we differentiate between lock modes - * LCK_CR, LCK_CW, LCK_PR - bug 22891 - */ - if (bits & MDS_INODELOCK_OPEN) - ll_have_md_lock(inode, &bits, lock->l_req_mode); - - if (bits & MDS_INODELOCK_OPEN) { - fmode_t fmode; - - switch (lock->l_req_mode) { - case LCK_CW: - fmode = FMODE_WRITE; - break; - case LCK_PR: - fmode = FMODE_EXEC; - break; - case LCK_CR: - fmode = FMODE_READ; - break; - default: - LDLM_ERROR(lock, "bad lock mode for OPEN lock"); - LBUG(); - } - - ll_md_real_close(inode, fmode); - } - - if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE | - MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM)) - ll_have_md_lock(inode, &bits, LCK_MINMODE); - - if (bits & MDS_INODELOCK_LAYOUT) { - struct cl_object_conf conf = { - .coc_opc = OBJECT_CONF_INVALIDATE, - .coc_inode = inode, - }; - - rc = ll_layout_conf(inode, &conf); - if (rc < 0) - CDEBUG(D_INODE, "cannot invalidate layout of " - DFID ": rc = %d\n", - PFID(ll_inode2fid(inode)), rc); - } - - if (bits & MDS_INODELOCK_UPDATE) { - struct ll_inode_info *lli = ll_i2info(inode); - - 
spin_lock(&lli->lli_lock); - LTIME_S(inode->i_mtime) = 0; - LTIME_S(inode->i_atime) = 0; - LTIME_S(inode->i_ctime) = 0; - spin_unlock(&lli->lli_lock); - } - - if ((bits & MDS_INODELOCK_UPDATE) && S_ISDIR(inode->i_mode)) { - struct ll_inode_info *lli = ll_i2info(inode); - - CDEBUG(D_INODE, "invalidating inode " DFID " lli = %p, pfid = " DFID "\n", - PFID(ll_inode2fid(inode)), lli, - PFID(&lli->lli_pfid)); - - truncate_inode_pages(inode->i_mapping, 0); - - if (unlikely(!fid_is_zero(&lli->lli_pfid))) { - struct inode *master_inode = NULL; - unsigned long hash; - - /* - * This is slave inode, since all of the child - * dentry is connected on the master inode, so - * we have to invalidate the negative children - * on master inode - */ - CDEBUG(D_INODE, - "Invalidate s" DFID " m" DFID "\n", - PFID(ll_inode2fid(inode)), - PFID(&lli->lli_pfid)); - - hash = cl_fid_build_ino(&lli->lli_pfid, - ll_need_32bit_api(ll_i2sbi(inode))); - /* - * Do not lookup the inode with ilookup5, - * otherwise it will cause dead lock, - * - * 1. Client1 send chmod req to the MDT0, then - * on MDT0, it enqueues master and all of its - * slaves lock, (mdt_attr_set() -> - * mdt_lock_slaves()), after gets master and - * stripe0 lock, it will send the enqueue req - * (for stripe1) to MDT1, then MDT1 finds the - * lock has been granted to client2. Then MDT1 - * sends blocking ast to client2. - * - * 2. At the same time, client2 tries to unlink - * the striped dir (rm -rf striped_dir), and - * during lookup, it will hold the master inode - * of the striped directory, whose inode state - * is NEW, then tries to revalidate all of its - * slaves, (ll_prep_inode()->ll_iget()-> - * ll_read_inode2()-> ll_update_inode().). And - * it will be blocked on the server side because - * of 1. - * - * 3. Then the client get the blocking_ast req, - * cancel the lock, but being blocked if using - * ->ilookup5()), because master inode state is - * NEW. 
- */ - master_inode = ilookup5_nowait(inode->i_sb, - hash, - ll_test_inode_by_fid, - (void *)&lli->lli_pfid); - if (master_inode) { - ll_invalidate_negative_children(master_inode); - iput(master_inode); - } - } else { - ll_invalidate_negative_children(inode); - } - } - - if ((bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)) && - inode->i_sb->s_root && - !is_root_inode(inode)) - ll_invalidate_aliases(inode); - - iput(inode); - break; - } - default: - LBUG(); - } - - return 0; -} - -__u32 ll_i2suppgid(struct inode *i) -{ - if (in_group_p(i->i_gid)) - return (__u32)from_kgid(&init_user_ns, i->i_gid); - else - return (__u32)(-1); -} - -/* Pack the required supplementary groups into the supplied groups array. - * If we don't need to use the groups from the target inode(s) then we - * instead pack one or more groups from the user's supplementary group - * array in case it might be useful. Not needed if doing an MDS-side upcall. - */ -void ll_i2gids(__u32 *suppgids, struct inode *i1, struct inode *i2) -{ - LASSERT(i1); - - suppgids[0] = ll_i2suppgid(i1); - - if (i2) - suppgids[1] = ll_i2suppgid(i2); - else - suppgids[1] = -1; -} - -/* - * Try to reuse unhashed or invalidated dentries. - * This is very similar to d_exact_alias(), and any changes in one should be - * considered for inclusion in the other. The differences are that we don't - * need an unhashed alias, and we don't want d_compare to be used for - * comparison. - */ -static struct dentry *ll_find_alias(struct inode *inode, struct dentry *dentry) -{ - struct dentry *alias; - - if (hlist_empty(&inode->i_dentry)) - return NULL; - - spin_lock(&inode->i_lock); - hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { - LASSERT(alias != dentry); - /* - * Don't need alias->d_lock here, because aliases with - * d_parent == entry->d_parent are not subject to name or - * parent changes, because the parent inode i_mutex is held. 
- */ - - if (alias->d_parent != dentry->d_parent) - continue; - if (alias->d_name.hash != dentry->d_name.hash) - continue; - if (alias->d_name.len != dentry->d_name.len || - memcmp(alias->d_name.name, dentry->d_name.name, - dentry->d_name.len) != 0) - continue; - spin_lock(&alias->d_lock); - dget_dlock(alias); - spin_unlock(&alias->d_lock); - spin_unlock(&inode->i_lock); - return alias; - } - spin_unlock(&inode->i_lock); - - return NULL; -} - -/* - * Similar to d_splice_alias(), but lustre treats invalid alias - * similar to DCACHE_DISCONNECTED, and tries to use it anyway. - */ -struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de) -{ - if (inode && !S_ISDIR(inode->i_mode)) { - struct dentry *new = ll_find_alias(inode, de); - - if (new) { - d_move(new, de); - iput(inode); - CDEBUG(D_DENTRY, - "Reuse dentry %p inode %p refc %d flags %#x\n", - new, d_inode(new), d_count(new), new->d_flags); - return new; - } - d_add(de, inode); - } else { - struct dentry *new = d_splice_alias(inode, de); - - if (new) - de = new; - } - CDEBUG(D_DENTRY, "Add dentry %p inode %p refc %d flags %#x\n", - de, d_inode(de), d_count(de), de->d_flags); - return de; -} - -static int ll_lookup_it_finish(struct ptlrpc_request *request, - struct lookup_intent *it, - struct inode *parent, struct dentry **de) -{ - struct inode *inode = NULL; - __u64 bits = 0; - int rc = 0; - struct dentry *alias; - - /* NB 1 request reference will be taken away by ll_intent_lock() - * when I return - */ - CDEBUG(D_DENTRY, "it %p it_disposition %x\n", it, - it->it_disposition); - if (!it_disposition(it, DISP_LOOKUP_NEG)) { - rc = ll_prep_inode(&inode, request, (*de)->d_sb, it); - if (rc) - return rc; - - ll_set_lock_data(ll_i2sbi(parent)->ll_md_exp, inode, it, &bits); - - /* We used to query real size from OSTs here, but actually - * this is not needed. 
For stat() calls size would be updated - * from subsequent do_revalidate()->ll_inode_revalidate_it() in - * 2.4 and - * vfs_getattr_it->ll_getattr()->ll_inode_revalidate_it() in 2.6 - * Everybody else who needs correct file size would call - * ll_glimpse_size or some equivalent themselves anyway. - * Also see bug 7198. - */ - } - - alias = ll_splice_alias(inode, *de); - if (IS_ERR(alias)) { - rc = PTR_ERR(alias); - goto out; - } - *de = alias; - - if (!it_disposition(it, DISP_LOOKUP_NEG)) { - /* We have the "lookup" lock, so unhide dentry */ - if (bits & MDS_INODELOCK_LOOKUP) - d_lustre_revalidate(*de); - } else if (!it_disposition(it, DISP_OPEN_CREATE)) { - /* If file created on server, don't depend on parent UPDATE - * lock to unhide it. It is left hidden and next lookup can - * find it in ll_splice_alias. - */ - /* Check that parent has UPDATE lock. */ - struct lookup_intent parent_it = { - .it_op = IT_GETATTR, - .it_lock_handle = 0 }; - struct lu_fid fid = ll_i2info(parent)->lli_fid; - - /* If it is striped directory, get the real stripe parent */ - if (unlikely(ll_i2info(parent)->lli_lsm_md)) { - rc = md_get_fid_from_lsm(ll_i2mdexp(parent), - ll_i2info(parent)->lli_lsm_md, - (*de)->d_name.name, - (*de)->d_name.len, &fid); - if (rc) - return rc; - } - - if (md_revalidate_lock(ll_i2mdexp(parent), &parent_it, &fid, - NULL)) { - d_lustre_revalidate(*de); - ll_intent_release(&parent_it); - } - } - -out: - if (rc != 0 && it->it_op & IT_OPEN) - ll_open_cleanup((*de)->d_sb, request); - - return rc; -} - -static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, - struct lookup_intent *it) -{ - struct lookup_intent lookup_it = { .it_op = IT_LOOKUP }; - struct dentry *save = dentry, *retval; - struct ptlrpc_request *req = NULL; - struct md_op_data *op_data = NULL; - struct inode *inode; - __u32 opc; - int rc; - - if (dentry->d_name.len > ll_i2sbi(parent)->ll_namelen) - return ERR_PTR(-ENAMETOOLONG); - - CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir=" 
DFID "(%p),intent=%s\n", - dentry, PFID(ll_inode2fid(parent)), parent, LL_IT2STR(it)); - - if (d_mountpoint(dentry)) - CERROR("Tell Peter, lookup on mtpt, it %s\n", LL_IT2STR(it)); - - if (!it || it->it_op == IT_GETXATTR) - it = &lookup_it; - - if (it->it_op == IT_GETATTR && dentry_may_statahead(parent, dentry)) { - rc = ll_statahead(parent, &dentry, 0); - if (rc == 1) { - if (dentry == save) - retval = NULL; - else - retval = dentry; - goto out; - } - } - - if (it->it_op & IT_OPEN && it->it_flags & FMODE_WRITE && sb_rdonly(dentry->d_sb)) - return ERR_PTR(-EROFS); - - if (it->it_op & IT_CREAT) - opc = LUSTRE_OPC_CREATE; - else - opc = LUSTRE_OPC_ANY; - - op_data = ll_prep_md_op_data(NULL, parent, NULL, dentry->d_name.name, - dentry->d_name.len, 0, opc, NULL); - if (IS_ERR(op_data)) - return (void *)op_data; - - /* enforce umask if acl disabled or MDS doesn't support umask */ - if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent))) - it->it_create_mode &= ~current_umask(); - - rc = md_intent_lock(ll_i2mdexp(parent), op_data, it, &req, - &ll_md_blocking_ast, 0); - /* - * If the MDS allows the client to chgrp (CFS_SETGRP_PERM), but the - * client does not know which suppgid should be sent to the MDS, or - * some other(s) changed the target file's GID after this RPC sent - * to the MDS with the suppgid as the original GID, then we should - * try again with right suppgid. 
- */ - if (rc == -EACCES && it->it_op & IT_OPEN && - it_disposition(it, DISP_OPEN_DENY)) { - struct mdt_body *body; - - LASSERT(req); - - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - if (op_data->op_suppgids[0] == body->mbo_gid || - op_data->op_suppgids[1] == body->mbo_gid || - !in_group_p(make_kgid(&init_user_ns, body->mbo_gid))) { - retval = ERR_PTR(-EACCES); - goto out; - } - - fid_zero(&op_data->op_fid2); - op_data->op_suppgids[1] = body->mbo_gid; - ptlrpc_req_finished(req); - req = NULL; - ll_intent_release(it); - rc = md_intent_lock(ll_i2mdexp(parent), op_data, it, &req, - ll_md_blocking_ast, 0); - } - - if (rc < 0) { - retval = ERR_PTR(rc); - goto out; - } - - rc = ll_lookup_it_finish(req, it, parent, &dentry); - if (rc != 0) { - ll_intent_release(it); - retval = ERR_PTR(rc); - goto out; - } - - inode = d_inode(dentry); - if ((it->it_op & IT_OPEN) && inode && - !S_ISREG(inode->i_mode) && - !S_ISDIR(inode->i_mode)) { - ll_release_openhandle(inode, it); - } - ll_lookup_finish_locks(it, inode); - - if (dentry == save) - retval = NULL; - else - retval = dentry; -out: - if (op_data && !IS_ERR(op_data)) - ll_finish_md_op_data(op_data); - - ptlrpc_req_finished(req); - return retval; -} - -static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry, - unsigned int flags) -{ - struct lookup_intent *itp, it = { .it_op = IT_GETATTR }; - struct dentry *de; - - CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir=" DFID "(%p),flags=%u\n", - dentry, PFID(ll_inode2fid(parent)), parent, flags); - - /* Optimize away (CREATE && !OPEN). Let .create handle the race. - * but only if we have write permissions there, otherwise we need - * to proceed with lookup. 
LU-4185 - */ - if ((flags & LOOKUP_CREATE) && !(flags & LOOKUP_OPEN) && - (inode_permission(parent, MAY_WRITE | MAY_EXEC) == 0)) - return NULL; - - if (flags & (LOOKUP_PARENT | LOOKUP_OPEN | LOOKUP_CREATE)) - itp = NULL; - else - itp = ⁢ - de = ll_lookup_it(parent, dentry, itp); - - if (itp) - ll_intent_release(itp); - - return de; -} - -/* - * For cached negative dentry and new dentry, handle lookup/create/open - * together. - */ -static int ll_atomic_open(struct inode *dir, struct dentry *dentry, - struct file *file, unsigned int open_flags, - umode_t mode, int *opened) -{ - struct lookup_intent *it; - struct dentry *de; - int rc = 0; - - CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir=" DFID "(%p),file %p,open_flags %x,mode %x opened %d\n", - dentry, PFID(ll_inode2fid(dir)), dir, file, open_flags, mode, - *opened); - - /* Only negative dentries enter here */ - LASSERT(!d_inode(dentry)); - - if (!d_in_lookup(dentry)) { - /* A valid negative dentry that just passed revalidation, - * there's little point to try and open it server-side, - * even though there's a minuscle chance it might succeed. - * Either way it's a valid race to just return -ENOENT here. - */ - if (!(open_flags & O_CREAT)) - return -ENOENT; - - /* Otherwise we just unhash it to be rehashed afresh via - * lookup if necessary - */ - d_drop(dentry); - } - - it = kzalloc(sizeof(*it), GFP_NOFS); - if (!it) - return -ENOMEM; - - it->it_op = IT_OPEN; - if (open_flags & O_CREAT) - it->it_op |= IT_CREAT; - it->it_create_mode = (mode & S_IALLUGO) | S_IFREG; - it->it_flags = (open_flags & ~O_ACCMODE) | OPEN_FMODE(open_flags); - it->it_flags &= ~MDS_OPEN_FL_INTERNAL; - - /* Dentry added to dcache tree in ll_lookup_it */ - de = ll_lookup_it(dir, dentry, it); - if (IS_ERR(de)) - rc = PTR_ERR(de); - else if (de) - dentry = de; - - if (!rc) { - if (it_disposition(it, DISP_OPEN_CREATE)) { - /* Dentry instantiated in ll_create_it. */ - rc = ll_create_it(dir, dentry, it); - if (rc) { - /* We dget in ll_splice_alias. 
*/ - if (de) - dput(de); - goto out_release; - } - - *opened |= FILE_CREATED; - } - if (d_really_is_positive(dentry) && - it_disposition(it, DISP_OPEN_OPEN)) { - /* Open dentry. */ - if (S_ISFIFO(d_inode(dentry)->i_mode)) { - /* We cannot call open here as it might - * deadlock. This case is unreachable in - * practice because of OBD_CONNECT_NODEVOH. - */ - rc = finish_no_open(file, de); - } else { - file->private_data = it; - rc = finish_open(file, dentry, NULL, opened); - /* We dget in ll_splice_alias. finish_open takes - * care of dget for fd open. - */ - if (de) - dput(de); - } - } else { - rc = finish_no_open(file, de); - } - } - -out_release: - ll_intent_release(it); - kfree(it); - - return rc; -} - -/* We depend on "mode" being set with the proper file type/umask by now */ -static struct inode *ll_create_node(struct inode *dir, struct lookup_intent *it) -{ - struct inode *inode = NULL; - struct ptlrpc_request *request = NULL; - struct ll_sb_info *sbi = ll_i2sbi(dir); - int rc; - - LASSERT(it && it->it_disposition); - - LASSERT(it_disposition(it, DISP_ENQ_CREATE_REF)); - request = it->it_request; - it_clear_disposition(it, DISP_ENQ_CREATE_REF); - rc = ll_prep_inode(&inode, request, dir->i_sb, it); - if (rc) { - inode = ERR_PTR(rc); - goto out; - } - - LASSERT(hlist_empty(&inode->i_dentry)); - - /* We asked for a lock on the directory, but were granted a - * lock on the inode. Since we finally have an inode pointer, - * stuff it in the lock. - */ - CDEBUG(D_DLMTRACE, "setting l_ast_data to inode " DFID "(%p)\n", - PFID(ll_inode2fid(dir)), inode); - ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL); - out: - ptlrpc_req_finished(request); - return inode; -} - -/* - * By the time this is called, we already have created the directory cache - * entry for the new file, but it is so far negative - it has no inode. 
- * - * We defer creating the OBD object(s) until open, to keep the intent and - * non-intent code paths similar, and also because we do not have the MDS - * inode number before calling ll_create_node() (which is needed for LOV), - * so we would need to do yet another RPC to the MDS to store the LOV EA - * data on the MDS. If needed, we would pass the PACKED lmm as data and - * lmm_size in datalen (the MDS still has code which will handle that). - * - * If the create succeeds, we fill in the inode information - * with d_instantiate(). - */ -static int ll_create_it(struct inode *dir, struct dentry *dentry, - struct lookup_intent *it) -{ - struct inode *inode; - int rc = 0; - - CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir=" DFID "(%p), intent=%s\n", - dentry, PFID(ll_inode2fid(dir)), dir, LL_IT2STR(it)); - - rc = it_open_error(DISP_OPEN_CREATE, it); - if (rc) - return rc; - - inode = ll_create_node(dir, it); - if (IS_ERR(inode)) - return PTR_ERR(inode); - - d_instantiate(dentry, inode); - - return ll_init_security(dentry, inode, dir); -} - -void ll_update_times(struct ptlrpc_request *request, struct inode *inode) -{ - struct mdt_body *body = req_capsule_server_get(&request->rq_pill, - &RMF_MDT_BODY); - - LASSERT(body); - if (body->mbo_valid & OBD_MD_FLMTIME && - body->mbo_mtime > LTIME_S(inode->i_mtime)) { - CDEBUG(D_INODE, "setting fid " DFID " mtime from %lu to %llu\n", - PFID(ll_inode2fid(inode)), LTIME_S(inode->i_mtime), - body->mbo_mtime); - LTIME_S(inode->i_mtime) = body->mbo_mtime; - } - if (body->mbo_valid & OBD_MD_FLCTIME && - body->mbo_ctime > LTIME_S(inode->i_ctime)) - LTIME_S(inode->i_ctime) = body->mbo_ctime; -} - -static int ll_new_node(struct inode *dir, struct dentry *dentry, - const char *tgt, umode_t mode, int rdev, - __u32 opc) -{ - struct ptlrpc_request *request = NULL; - struct md_op_data *op_data; - struct inode *inode = NULL; - struct ll_sb_info *sbi = ll_i2sbi(dir); - int tgt_len = 0; - int err; - - if (unlikely(tgt)) - tgt_len = strlen(tgt) + 1; 
-again: - op_data = ll_prep_md_op_data(NULL, dir, NULL, - dentry->d_name.name, - dentry->d_name.len, - 0, opc, NULL); - if (IS_ERR(op_data)) { - err = PTR_ERR(op_data); - goto err_exit; - } - - err = md_create(sbi->ll_md_exp, op_data, tgt, tgt_len, mode, - from_kuid(&init_user_ns, current_fsuid()), - from_kgid(&init_user_ns, current_fsgid()), - cfs_curproc_cap_pack(), rdev, &request); - ll_finish_md_op_data(op_data); - if (err < 0 && err != -EREMOTE) - goto err_exit; - - /* - * If the client doesn't know where to create a subdirectory (or - * in case of a race that sends the RPC to the wrong MDS), the - * MDS will return -EREMOTE and the client will fetch the layout - * of the directory, then create the directory on the right MDT. - */ - if (unlikely(err == -EREMOTE)) { - struct ll_inode_info *lli = ll_i2info(dir); - struct lmv_user_md *lum; - int lumsize, err2; - - ptlrpc_req_finished(request); - request = NULL; - - err2 = ll_dir_getstripe(dir, (void **)&lum, &lumsize, &request, - OBD_MD_DEFAULT_MEA); - if (!err2) { - /* Update stripe_offset and retry */ - lli->lli_def_stripe_offset = lum->lum_stripe_offset; - } else if (err2 == -ENODATA && - lli->lli_def_stripe_offset != -1) { - /* - * If there are no default stripe EA on the MDT, but the - * client has default stripe, then it probably means - * default stripe EA has just been deleted. 
- */ - lli->lli_def_stripe_offset = -1; - } else { - goto err_exit; - } - - ptlrpc_req_finished(request); - request = NULL; - goto again; - } - - ll_update_times(request, dir); - - err = ll_prep_inode(&inode, request, dir->i_sb, NULL); - if (err) - goto err_exit; - - d_instantiate(dentry, inode); - - err = ll_init_security(dentry, inode, dir); -err_exit: - if (request) - ptlrpc_req_finished(request); - - return err; -} - -static int ll_mknod(struct inode *dir, struct dentry *dchild, - umode_t mode, dev_t rdev) -{ - int err; - - CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir=" DFID "(%p) mode %o dev %x\n", - dchild, PFID(ll_inode2fid(dir)), dir, mode, - old_encode_dev(rdev)); - - if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir))) - mode &= ~current_umask(); - - switch (mode & S_IFMT) { - case 0: - mode |= S_IFREG; - /* for mode = 0 case */ - /* fall through */ - case S_IFREG: - case S_IFCHR: - case S_IFBLK: - case S_IFIFO: - case S_IFSOCK: - err = ll_new_node(dir, dchild, NULL, mode, - old_encode_dev(rdev), - LUSTRE_OPC_MKNOD); - break; - case S_IFDIR: - err = -EPERM; - break; - default: - err = -EINVAL; - } - - if (!err) - ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKNOD, 1); - - return err; -} - -/* - * Plain create. Intent create is handled in atomic_open. 
- */ -static int ll_create_nd(struct inode *dir, struct dentry *dentry, - umode_t mode, bool want_excl) -{ - int rc; - - CDEBUG(D_VFSTRACE, - "VFS Op:name=%pd, dir=" DFID "(%p), flags=%u, excl=%d\n", - dentry, PFID(ll_inode2fid(dir)), dir, mode, want_excl); - - rc = ll_mknod(dir, dentry, mode, 0); - - ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_CREATE, 1); - - CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, unhashed %d\n", - dentry, d_unhashed(dentry)); - - return rc; -} - -static int ll_unlink(struct inode *dir, struct dentry *dchild) -{ - struct ptlrpc_request *request = NULL; - struct md_op_data *op_data; - int rc; - - CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p)\n", - dchild, dir->i_ino, dir->i_generation, dir); - - op_data = ll_prep_md_op_data(NULL, dir, NULL, - dchild->d_name.name, - dchild->d_name.len, - 0, LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - if (dchild->d_inode) - op_data->op_fid3 = *ll_inode2fid(dchild->d_inode); - - op_data->op_fid2 = op_data->op_fid3; - rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); - ll_finish_md_op_data(op_data); - if (rc) - goto out; - - ll_update_times(request, dir); - ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_UNLINK, 1); - - out: - ptlrpc_req_finished(request); - return rc; -} - -static int ll_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) -{ - int err; - - CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir" DFID "(%p)\n", - dentry, PFID(ll_inode2fid(dir)), dir); - - if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir))) - mode &= ~current_umask(); - mode = (mode & (0777 | S_ISVTX)) | S_IFDIR; - - err = ll_new_node(dir, dentry, NULL, mode, 0, LUSTRE_OPC_MKDIR); - if (!err) - ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKDIR, 1); - - return err; -} - -static int ll_rmdir(struct inode *dir, struct dentry *dchild) -{ - struct ptlrpc_request *request = NULL; - struct md_op_data *op_data; - int rc; - - CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir=" DFID "(%p)\n", - dchild, 
PFID(ll_inode2fid(dir)), dir); - - op_data = ll_prep_md_op_data(NULL, dir, NULL, - dchild->d_name.name, - dchild->d_name.len, - S_IFDIR, LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - if (dchild->d_inode) - op_data->op_fid3 = *ll_inode2fid(dchild->d_inode); - - op_data->op_fid2 = op_data->op_fid3; - rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); - ll_finish_md_op_data(op_data); - if (rc == 0) { - ll_update_times(request, dir); - ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, 1); - } - - ptlrpc_req_finished(request); - return rc; -} - -static int ll_symlink(struct inode *dir, struct dentry *dentry, - const char *oldname) -{ - int err; - - CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir=" DFID "(%p),target=%.*s\n", - dentry, PFID(ll_inode2fid(dir)), dir, 3000, oldname); - - err = ll_new_node(dir, dentry, oldname, S_IFLNK | 0777, - 0, LUSTRE_OPC_SYMLINK); - - if (!err) - ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_SYMLINK, 1); - - return err; -} - -static int ll_link(struct dentry *old_dentry, struct inode *dir, - struct dentry *new_dentry) -{ - struct inode *src = d_inode(old_dentry); - struct ll_sb_info *sbi = ll_i2sbi(dir); - struct ptlrpc_request *request = NULL; - struct md_op_data *op_data; - int err; - - CDEBUG(D_VFSTRACE, - "VFS Op: inode=" DFID "(%p), dir=" DFID "(%p), target=%pd\n", - PFID(ll_inode2fid(src)), src, PFID(ll_inode2fid(dir)), dir, - new_dentry); - - op_data = ll_prep_md_op_data(NULL, src, dir, new_dentry->d_name.name, - new_dentry->d_name.len, - 0, LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - err = md_link(sbi->ll_md_exp, op_data, &request); - ll_finish_md_op_data(op_data); - if (err) - goto out; - - ll_update_times(request, dir); - ll_stats_ops_tally(sbi, LPROC_LL_LINK, 1); -out: - ptlrpc_req_finished(request); - return err; -} - -static int ll_rename(struct inode *src, struct dentry *src_dchild, - struct inode *tgt, struct dentry *tgt_dchild, - unsigned int flags) -{ - 
struct ptlrpc_request *request = NULL; - struct ll_sb_info *sbi = ll_i2sbi(src); - struct md_op_data *op_data; - int err; - - if (flags) - return -EINVAL; - - CDEBUG(D_VFSTRACE, - "VFS Op:oldname=%pd, src_dir=" DFID "(%p), newname=%pd, tgt_dir=" DFID "(%p)\n", - src_dchild, PFID(ll_inode2fid(src)), src, - tgt_dchild, PFID(ll_inode2fid(tgt)), tgt); - - op_data = ll_prep_md_op_data(NULL, src, tgt, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - if (src_dchild->d_inode) - op_data->op_fid3 = *ll_inode2fid(src_dchild->d_inode); - if (tgt_dchild->d_inode) - op_data->op_fid4 = *ll_inode2fid(tgt_dchild->d_inode); - - err = md_rename(sbi->ll_md_exp, op_data, - src_dchild->d_name.name, - src_dchild->d_name.len, - tgt_dchild->d_name.name, - tgt_dchild->d_name.len, &request); - ll_finish_md_op_data(op_data); - if (!err) { - ll_update_times(request, src); - ll_update_times(request, tgt); - ll_stats_ops_tally(sbi, LPROC_LL_RENAME, 1); - } - - ptlrpc_req_finished(request); - if (!err) - d_move(src_dchild, tgt_dchild); - return err; -} - -const struct inode_operations ll_dir_inode_operations = { - .mknod = ll_mknod, - .atomic_open = ll_atomic_open, - .lookup = ll_lookup_nd, - .create = ll_create_nd, - /* We need all these non-raw things for NFSD, to not patch it. 
*/ - .unlink = ll_unlink, - .mkdir = ll_mkdir, - .rmdir = ll_rmdir, - .symlink = ll_symlink, - .link = ll_link, - .rename = ll_rename, - .setattr = ll_setattr, - .getattr = ll_getattr, - .permission = ll_inode_permission, - .listxattr = ll_listxattr, - .get_acl = ll_get_acl, -}; - -const struct inode_operations ll_special_inode_operations = { - .setattr = ll_setattr, - .getattr = ll_getattr, - .permission = ll_inode_permission, - .listxattr = ll_listxattr, - .get_acl = ll_get_acl, -}; diff --git a/drivers/staging/lustre/lustre/llite/range_lock.c b/drivers/staging/lustre/lustre/llite/range_lock.c deleted file mode 100644 index cc9565f6bfe2..000000000000 --- a/drivers/staging/lustre/lustre/llite/range_lock.c +++ /dev/null @@ -1,240 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Range lock is used to allow multiple threads writing a single shared - * file given each thread is writing to a non-overlapping portion of the - * file. - * - * Refer to the possible upstream kernel version of range lock by - * Jan Kara <jack@suse.cz>: https://lkml.org/lkml/2013/1/31/480 - * - * This file could later replaced by the upstream kernel version. 
- */ -/* - * Author: Prakash Surya <surya1@llnl.gov> - * Author: Bobi Jam <bobijam.xu@intel.com> - */ -#include "range_lock.h" -#include <uapi/linux/lustre/lustre_idl.h> - -/** - * Initialize a range lock tree - * - * \param tree [in] an empty range lock tree - * - * Pre: Caller should have allocated the range lock tree. - * Post: The range lock tree is ready to function. - */ -void range_lock_tree_init(struct range_lock_tree *tree) -{ - tree->rlt_root = NULL; - tree->rlt_sequence = 0; - spin_lock_init(&tree->rlt_lock); -} - -/** - * Initialize a range lock node - * - * \param lock [in] an empty range lock node - * \param start [in] start of the covering region - * \param end [in] end of the covering region - * - * Pre: Caller should have allocated the range lock node. - * Post: The range lock node is meant to cover [start, end] region - */ -int range_lock_init(struct range_lock *lock, __u64 start, __u64 end) -{ - int rc; - - memset(&lock->rl_node, 0, sizeof(lock->rl_node)); - if (end != LUSTRE_EOF) - end >>= PAGE_SHIFT; - rc = interval_set(&lock->rl_node, start >> PAGE_SHIFT, end); - if (rc) - return rc; - - INIT_LIST_HEAD(&lock->rl_next_lock); - lock->rl_task = NULL; - lock->rl_lock_count = 0; - lock->rl_blocking_ranges = 0; - lock->rl_sequence = 0; - return rc; -} - -static inline struct range_lock *next_lock(struct range_lock *lock) -{ - return list_entry(lock->rl_next_lock.next, typeof(*lock), rl_next_lock); -} - -/** - * Helper function of range_unlock() - * - * \param node [in] a range lock found overlapped during interval node - * search - * \param arg [in] the range lock to be tested - * - * \retval INTERVAL_ITER_CONT indicate to continue the search for next - * overlapping range node - * \retval INTERVAL_ITER_STOP indicate to stop the search - */ -static enum interval_iter range_unlock_cb(struct interval_node *node, void *arg) -{ - struct range_lock *lock = arg; - struct range_lock *overlap = node2rangelock(node); - struct range_lock *iter; - - 
list_for_each_entry(iter, &overlap->rl_next_lock, rl_next_lock) { - if (iter->rl_sequence > lock->rl_sequence) { - --iter->rl_blocking_ranges; - LASSERT(iter->rl_blocking_ranges > 0); - } - } - if (overlap->rl_sequence > lock->rl_sequence) { - --overlap->rl_blocking_ranges; - if (overlap->rl_blocking_ranges == 0) - wake_up_process(overlap->rl_task); - } - return INTERVAL_ITER_CONT; -} - -/** - * Unlock a range lock, wake up locks blocked by this lock. - * - * \param tree [in] range lock tree - * \param lock [in] range lock to be deleted - * - * If this lock has been granted, relase it; if not, just delete it from - * the tree or the same region lock list. Wake up those locks only blocked - * by this lock through range_unlock_cb(). - */ -void range_unlock(struct range_lock_tree *tree, struct range_lock *lock) -{ - spin_lock(&tree->rlt_lock); - if (!list_empty(&lock->rl_next_lock)) { - struct range_lock *next; - - if (interval_is_intree(&lock->rl_node)) { /* first lock */ - /* Insert the next same range lock into the tree */ - next = next_lock(lock); - next->rl_lock_count = lock->rl_lock_count - 1; - interval_erase(&lock->rl_node, &tree->rlt_root); - interval_insert(&next->rl_node, &tree->rlt_root); - } else { - /* find the first lock in tree */ - list_for_each_entry(next, &lock->rl_next_lock, - rl_next_lock) { - if (!interval_is_intree(&next->rl_node)) - continue; - - LASSERT(next->rl_lock_count > 0); - next->rl_lock_count--; - break; - } - } - list_del_init(&lock->rl_next_lock); - } else { - LASSERT(interval_is_intree(&lock->rl_node)); - interval_erase(&lock->rl_node, &tree->rlt_root); - } - - interval_search(tree->rlt_root, &lock->rl_node.in_extent, - range_unlock_cb, lock); - spin_unlock(&tree->rlt_lock); -} - -/** - * Helper function of range_lock() - * - * \param node [in] a range lock found overlapped during interval node - * search - * \param arg [in] the range lock to be tested - * - * \retval INTERVAL_ITER_CONT indicate to continue the search for next - * 
overlapping range node - * \retval INTERVAL_ITER_STOP indicate to stop the search - */ -static enum interval_iter range_lock_cb(struct interval_node *node, void *arg) -{ - struct range_lock *lock = arg; - struct range_lock *overlap = node2rangelock(node); - - lock->rl_blocking_ranges += overlap->rl_lock_count + 1; - return INTERVAL_ITER_CONT; -} - -/** - * Lock a region - * - * \param tree [in] range lock tree - * \param lock [in] range lock node containing the region span - * - * \retval 0 get the range lock - * \retval <0 error code while not getting the range lock - * - * If there exists overlapping range lock, the new lock will wait and - * retry, if later it find that it is not the chosen one to wake up, - * it wait again. - */ -int range_lock(struct range_lock_tree *tree, struct range_lock *lock) -{ - struct interval_node *node; - int rc = 0; - - spin_lock(&tree->rlt_lock); - /* - * We need to check for all conflicting intervals - * already in the tree. - */ - interval_search(tree->rlt_root, &lock->rl_node.in_extent, - range_lock_cb, lock); - /* - * Insert to the tree if I am unique, otherwise I've been linked to - * the rl_next_lock of another lock which has the same range as mine - * in range_lock_cb(). 
- */ - node = interval_insert(&lock->rl_node, &tree->rlt_root); - if (node) { - struct range_lock *tmp = node2rangelock(node); - - list_add_tail(&lock->rl_next_lock, &tmp->rl_next_lock); - tmp->rl_lock_count++; - } - lock->rl_sequence = ++tree->rlt_sequence; - - while (lock->rl_blocking_ranges > 0) { - lock->rl_task = current; - __set_current_state(TASK_INTERRUPTIBLE); - spin_unlock(&tree->rlt_lock); - schedule(); - - if (signal_pending(current)) { - range_unlock(tree, lock); - rc = -EINTR; - goto out; - } - spin_lock(&tree->rlt_lock); - } - spin_unlock(&tree->rlt_lock); -out: - return rc; -} diff --git a/drivers/staging/lustre/lustre/llite/range_lock.h b/drivers/staging/lustre/lustre/llite/range_lock.h deleted file mode 100644 index 38b2be4e378f..000000000000 --- a/drivers/staging/lustre/lustre/llite/range_lock.h +++ /dev/null @@ -1,83 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Range lock is used to allow multiple threads writing a single shared - * file given each thread is writing to a non-overlapping portion of the - * file. 
- * - * Refer to the possible upstream kernel version of range lock by - * Jan Kara <jack@suse.cz>: https://lkml.org/lkml/2013/1/31/480 - * - * This file could later replaced by the upstream kernel version. - */ -/* - * Author: Prakash Surya <surya1@llnl.gov> - * Author: Bobi Jam <bobijam.xu@intel.com> - */ -#ifndef _RANGE_LOCK_H -#define _RANGE_LOCK_H - -#include <linux/libcfs/libcfs.h> -#include <interval_tree.h> - -struct range_lock { - struct interval_node rl_node; - /** - * Process to enqueue this lock. - */ - struct task_struct *rl_task; - /** - * List of locks with the same range. - */ - struct list_head rl_next_lock; - /** - * Number of locks in the list rl_next_lock - */ - unsigned int rl_lock_count; - /** - * Number of ranges which are blocking acquisition of the lock - */ - unsigned int rl_blocking_ranges; - /** - * Sequence number of range lock. This number is used to get to know - * the order the locks are queued; this is required for range_cancel(). - */ - __u64 rl_sequence; -}; - -static inline struct range_lock *node2rangelock(const struct interval_node *n) -{ - return container_of(n, struct range_lock, rl_node); -} - -struct range_lock_tree { - struct interval_node *rlt_root; - spinlock_t rlt_lock; /* protect range lock tree */ - __u64 rlt_sequence; -}; - -void range_lock_tree_init(struct range_lock_tree *tree); -int range_lock_init(struct range_lock *lock, __u64 start, __u64 end); -int range_lock(struct range_lock_tree *tree, struct range_lock *lock); -void range_unlock(struct range_lock_tree *tree, struct range_lock *lock); -#endif diff --git a/drivers/staging/lustre/lustre/llite/rw.c b/drivers/staging/lustre/lustre/llite/rw.c deleted file mode 100644 index 3e008ce7275d..000000000000 --- a/drivers/staging/lustre/lustre/llite/rw.c +++ /dev/null @@ -1,1214 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/llite/rw.c - * - * Lustre Lite I/O page cache routines shared by different kernel revs - */ - -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/stat.h> -#include <linux/errno.h> -#include <linux/unistd.h> -#include <linux/writeback.h> -#include <linux/uaccess.h> - -#include <linux/fs.h> -#include <linux/pagemap.h> -/* current_is_kswapd() */ -#include <linux/swap.h> -#include <linux/bvec.h> - -#define DEBUG_SUBSYSTEM S_LLITE - -#include <obd_cksum.h> -#include "llite_internal.h" - -static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which); - -/** - * Get readahead pages from the filesystem readahead pool of the client for a - * thread. - * - * /param sbi superblock for filesystem readahead state ll_ra_info - * /param ria per-thread readahead state - * /param pages number of pages requested for readahead for the thread. 
- * - * WARNING: This algorithm is used to reduce contention on sbi->ll_lock. - * It should work well if the ra_max_pages is much greater than the single - * file's read-ahead window, and not too many threads contending for - * these readahead pages. - * - * TODO: There may be a 'global sync problem' if many threads are trying - * to get an ra budget that is larger than the remaining readahead pages - * and reach here at exactly the same time. They will compute /a ret to - * consume the remaining pages, but will fail at atomic_add_return() and - * get a zero ra window, although there is still ra space remaining. - Jay - */ -static unsigned long ll_ra_count_get(struct ll_sb_info *sbi, - struct ra_io_arg *ria, - unsigned long pages, unsigned long min) -{ - struct ll_ra_info *ra = &sbi->ll_ra_info; - long ret; - - /* If read-ahead pages left are less than 1M, do not do read-ahead, - * otherwise it will form small read RPC(< 1M), which hurt server - * performance a lot. - */ - ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages), pages); - if (ret < 0 || ret < min_t(long, PTLRPC_MAX_BRW_PAGES, pages)) { - ret = 0; - goto out; - } - - if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) { - atomic_sub(ret, &ra->ra_cur_pages); - ret = 0; - } - -out: - if (ret < min) { - /* override ra limit for maximum performance */ - atomic_add(min - ret, &ra->ra_cur_pages); - ret = min; - } - return ret; -} - -void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len) -{ - struct ll_ra_info *ra = &sbi->ll_ra_info; - - atomic_sub(len, &ra->ra_cur_pages); -} - -static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which) -{ - LASSERTF(which < _NR_RA_STAT, "which: %u\n", which); - lprocfs_counter_incr(sbi->ll_ra_stats, which); -} - -void ll_ra_stats_inc(struct inode *inode, enum ra_stat which) -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); - - ll_ra_stats_inc_sbi(sbi, which); -} - -#define RAS_CDEBUG(ras) \ - CDEBUG(D_READA, \ - "lrp %lu cr %lu cp 
%lu ws %lu wl %lu nra %lu rpc %lu " \ - "r %lu ri %lu csr %lu sf %lu sp %lu sl %lu\n", \ - ras->ras_last_readpage, ras->ras_consecutive_requests, \ - ras->ras_consecutive_pages, ras->ras_window_start, \ - ras->ras_window_len, ras->ras_next_readahead, \ - ras->ras_rpc_size, \ - ras->ras_requests, ras->ras_request_index, \ - ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \ - ras->ras_stride_pages, ras->ras_stride_length) - -static int index_in_window(unsigned long index, unsigned long point, - unsigned long before, unsigned long after) -{ - unsigned long start = point - before, end = point + after; - - if (start > point) - start = 0; - if (end < point) - end = ~0; - - return start <= index && index <= end; -} - -void ll_ras_enter(struct file *f) -{ - struct ll_file_data *fd = LUSTRE_FPRIVATE(f); - struct ll_readahead_state *ras = &fd->fd_ras; - - spin_lock(&ras->ras_lock); - ras->ras_requests++; - ras->ras_request_index = 0; - ras->ras_consecutive_requests++; - spin_unlock(&ras->ras_lock); -} - -/** - * Initiates read-ahead of a page with given index. - * - * \retval +ve: page was already uptodate so it will be skipped - * from being added; - * \retval -ve: page wasn't added to \a queue for error; - * \retval 0: page was added into \a queue for read ahead. 
- */ -static int ll_read_ahead_page(const struct lu_env *env, struct cl_io *io, - struct cl_page_list *queue, pgoff_t index) -{ - enum ra_stat which = _NR_RA_STAT; /* keep gcc happy */ - struct cl_object *clob = io->ci_obj; - struct inode *inode = vvp_object_inode(clob); - const char *msg = NULL; - struct cl_page *page; - struct vvp_page *vpg; - struct page *vmpage; - int rc = 0; - - vmpage = grab_cache_page_nowait(inode->i_mapping, index); - if (!vmpage) { - which = RA_STAT_FAILED_GRAB_PAGE; - msg = "g_c_p_n failed"; - rc = -EBUSY; - goto out; - } - - /* Check if vmpage was truncated or reclaimed */ - if (vmpage->mapping != inode->i_mapping) { - which = RA_STAT_WRONG_GRAB_PAGE; - msg = "g_c_p_n returned invalid page"; - rc = -EBUSY; - goto out; - } - - page = cl_page_find(env, clob, vmpage->index, vmpage, CPT_CACHEABLE); - if (IS_ERR(page)) { - which = RA_STAT_FAILED_GRAB_PAGE; - msg = "cl_page_find failed"; - rc = PTR_ERR(page); - goto out; - } - - lu_ref_add(&page->cp_reference, "ra", current); - cl_page_assume(env, io, page); - vpg = cl2vvp_page(cl_object_page_slice(clob, page)); - if (!vpg->vpg_defer_uptodate && !PageUptodate(vmpage)) { - vpg->vpg_defer_uptodate = 1; - vpg->vpg_ra_used = 0; - cl_page_list_add(queue, page); - } else { - /* skip completed pages */ - cl_page_unassume(env, io, page); - /* This page is already uptodate, returning a positive number - * to tell the callers about this - */ - rc = 1; - } - - lu_ref_del(&page->cp_reference, "ra", current); - cl_page_put(env, page); -out: - if (vmpage) { - if (rc) - unlock_page(vmpage); - put_page(vmpage); - } - if (msg) { - ll_ra_stats_inc(inode, which); - CDEBUG(D_READA, "%s\n", msg); - } - return rc; -} - -#define RIA_DEBUG(ria) \ - CDEBUG(D_READA, "rs %lu re %lu ro %lu rl %lu rp %lu\n", \ - ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\ - ria->ria_pages) - -static inline int stride_io_mode(struct ll_readahead_state *ras) -{ - return ras->ras_consecutive_stride_requests > 1; -} - -/* 
The function calculates how much pages will be read in - * [off, off + length], in such stride IO area, - * stride_offset = st_off, stride_length = st_len, - * stride_pages = st_pgs - * - * |------------------|*****|------------------|*****|------------|*****|.... - * st_off - * |--- st_pgs ---| - * |----- st_len -----| - * - * How many pages it should read in such pattern - * |-------------------------------------------------------------| - * off - * |<------ length ------->| - * - * = |<----->| + |-------------------------------------| + |---| - * start_left st_pgs * i end_left - */ -static unsigned long -stride_pg_count(pgoff_t st_off, unsigned long st_len, unsigned long st_pgs, - unsigned long off, unsigned long length) -{ - __u64 start = off > st_off ? off - st_off : 0; - __u64 end = off + length > st_off ? off + length - st_off : 0; - unsigned long start_left = 0; - unsigned long end_left = 0; - unsigned long pg_count; - - if (st_len == 0 || length == 0 || end == 0) - return length; - - start_left = do_div(start, st_len); - if (start_left < st_pgs) - start_left = st_pgs - start_left; - else - start_left = 0; - - end_left = do_div(end, st_len); - if (end_left > st_pgs) - end_left = st_pgs; - - CDEBUG(D_READA, "start %llu, end %llu start_left %lu end_left %lu\n", - start, end, start_left, end_left); - - if (start == end) - pg_count = end_left - (st_pgs - start_left); - else - pg_count = start_left + st_pgs * (end - start - 1) + end_left; - - CDEBUG(D_READA, - "st_off %lu, st_len %lu st_pgs %lu off %lu length %lu pgcount %lu\n", - st_off, st_len, st_pgs, off, length, pg_count); - - return pg_count; -} - -static int ria_page_count(struct ra_io_arg *ria) -{ - __u64 length = ria->ria_end >= ria->ria_start ? 
- ria->ria_end - ria->ria_start + 1 : 0; - - return stride_pg_count(ria->ria_stoff, ria->ria_length, - ria->ria_pages, ria->ria_start, - length); -} - -static unsigned long ras_align(struct ll_readahead_state *ras, - unsigned long index, - unsigned long *remainder) -{ - unsigned long rem = index % ras->ras_rpc_size; - - if (remainder) - *remainder = rem; - return index - rem; -} - -/*Check whether the index is in the defined ra-window */ -static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria) -{ - /* If ria_length == ria_pages, it means non-stride I/O mode, - * idx should always inside read-ahead window in this case - * For stride I/O mode, just check whether the idx is inside - * the ria_pages. - */ - return ria->ria_length == 0 || ria->ria_length == ria->ria_pages || - (idx >= ria->ria_stoff && (idx - ria->ria_stoff) % - ria->ria_length < ria->ria_pages); -} - -static unsigned long -ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io, - struct cl_page_list *queue, struct ll_readahead_state *ras, - struct ra_io_arg *ria) -{ - struct cl_read_ahead ra = { 0 }; - unsigned long ra_end = 0; - bool stride_ria; - pgoff_t page_idx; - int rc; - - LASSERT(ria); - RIA_DEBUG(ria); - - stride_ria = ria->ria_length > ria->ria_pages && ria->ria_pages > 0; - for (page_idx = ria->ria_start; - page_idx <= ria->ria_end && ria->ria_reserved > 0; page_idx++) { - if (ras_inside_ra_window(page_idx, ria)) { - if (!ra.cra_end || ra.cra_end < page_idx) { - unsigned long end; - - cl_read_ahead_release(env, &ra); - - rc = cl_io_read_ahead(env, io, page_idx, &ra); - if (rc < 0) - break; - - CDEBUG(D_READA, "idx: %lu, ra: %lu, rpc: %lu\n", - page_idx, ra.cra_end, ra.cra_rpc_size); - LASSERTF(ra.cra_end >= page_idx, - "object: %p, indcies %lu / %lu\n", - io->ci_obj, ra.cra_end, page_idx); - /* - * update read ahead RPC size. 
- * NB: it's racy but doesn't matter - */ - if (ras->ras_rpc_size > ra.cra_rpc_size && - ra.cra_rpc_size > 0) - ras->ras_rpc_size = ra.cra_rpc_size; - /* trim it to align with optimal RPC size */ - end = ras_align(ras, ria->ria_end + 1, NULL); - if (end > 0 && !ria->ria_eof) - ria->ria_end = end - 1; - if (ria->ria_end < ria->ria_end_min) - ria->ria_end = ria->ria_end_min; - if (ria->ria_end > ra.cra_end) - ria->ria_end = ra.cra_end; - } - - /* If the page is inside the read-ahead window */ - rc = ll_read_ahead_page(env, io, queue, page_idx); - if (rc < 0) - break; - - ra_end = page_idx; - if (!rc) - ria->ria_reserved--; - } else if (stride_ria) { - /* If it is not in the read-ahead window, and it is - * read-ahead mode, then check whether it should skip - * the stride gap - */ - pgoff_t offset; - /* FIXME: This assertion only is valid when it is for - * forward read-ahead, it will be fixed when backward - * read-ahead is implemented - */ - LASSERTF(page_idx >= ria->ria_stoff, - "Invalid page_idx %lu rs %lu re %lu ro %lu rl %lu rp %lu\n", - page_idx, - ria->ria_start, ria->ria_end, ria->ria_stoff, - ria->ria_length, ria->ria_pages); - offset = page_idx - ria->ria_stoff; - offset = offset % (ria->ria_length); - if (offset > ria->ria_pages) { - page_idx += ria->ria_length - offset; - CDEBUG(D_READA, "i %lu skip %lu\n", page_idx, - ria->ria_length - offset); - continue; - } - } - } - cl_read_ahead_release(env, &ra); - - return ra_end; -} - -static int ll_readahead(const struct lu_env *env, struct cl_io *io, - struct cl_page_list *queue, - struct ll_readahead_state *ras, bool hit) -{ - struct vvp_io *vio = vvp_env_io(env); - struct ll_thread_info *lti = ll_env_info(env); - struct cl_attr *attr = vvp_env_thread_attr(env); - unsigned long len, mlen = 0; - pgoff_t ra_end, start = 0, end = 0; - struct inode *inode; - struct ra_io_arg *ria = <i->lti_ria; - struct cl_object *clob; - int ret = 0; - __u64 kms; - - clob = io->ci_obj; - inode = vvp_object_inode(clob); - - 
memset(ria, 0, sizeof(*ria)); - - cl_object_attr_lock(clob); - ret = cl_object_attr_get(env, clob, attr); - cl_object_attr_unlock(clob); - - if (ret != 0) - return ret; - kms = attr->cat_kms; - if (kms == 0) { - ll_ra_stats_inc(inode, RA_STAT_ZERO_LEN); - return 0; - } - - spin_lock(&ras->ras_lock); - - /** - * Note: other thread might rollback the ras_next_readahead, - * if it can not get the full size of prepared pages, see the - * end of this function. For stride read ahead, it needs to - * make sure the offset is no less than ras_stride_offset, - * so that stride read ahead can work correctly. - */ - if (stride_io_mode(ras)) - start = max(ras->ras_next_readahead, ras->ras_stride_offset); - else - start = ras->ras_next_readahead; - - if (ras->ras_window_len > 0) - end = ras->ras_window_start + ras->ras_window_len - 1; - - /* Enlarge the RA window to encompass the full read */ - if (vio->vui_ra_valid && - end < vio->vui_ra_start + vio->vui_ra_count - 1) - end = vio->vui_ra_start + vio->vui_ra_count - 1; - - if (end) { - unsigned long end_index; - - /* Truncate RA window to end of file */ - end_index = (unsigned long)((kms - 1) >> PAGE_SHIFT); - if (end_index <= end) { - end = end_index; - ria->ria_eof = true; - } - - ras->ras_next_readahead = max(end, end + 1); - RAS_CDEBUG(ras); - } - ria->ria_start = start; - ria->ria_end = end; - /* If stride I/O mode is detected, get stride window*/ - if (stride_io_mode(ras)) { - ria->ria_stoff = ras->ras_stride_offset; - ria->ria_length = ras->ras_stride_length; - ria->ria_pages = ras->ras_stride_pages; - } - spin_unlock(&ras->ras_lock); - - if (end == 0) { - ll_ra_stats_inc(inode, RA_STAT_ZERO_WINDOW); - return 0; - } - len = ria_page_count(ria); - if (len == 0) { - ll_ra_stats_inc(inode, RA_STAT_ZERO_WINDOW); - return 0; - } - - CDEBUG(D_READA, DFID ": ria: %lu/%lu, bead: %lu/%lu, hit: %d\n", - PFID(lu_object_fid(&clob->co_lu)), - ria->ria_start, ria->ria_end, - vio->vui_ra_valid ? 
vio->vui_ra_start : 0, - vio->vui_ra_valid ? vio->vui_ra_count : 0, - hit); - - /* at least to extend the readahead window to cover current read */ - if (!hit && vio->vui_ra_valid && - vio->vui_ra_start + vio->vui_ra_count > ria->ria_start) { - unsigned long remainder; - - /* to the end of current read window. */ - mlen = vio->vui_ra_start + vio->vui_ra_count - ria->ria_start; - /* trim to RPC boundary */ - ras_align(ras, ria->ria_start, &remainder); - mlen = min(mlen, ras->ras_rpc_size - remainder); - ria->ria_end_min = ria->ria_start + mlen; - } - - ria->ria_reserved = ll_ra_count_get(ll_i2sbi(inode), ria, len, mlen); - if (ria->ria_reserved < len) - ll_ra_stats_inc(inode, RA_STAT_MAX_IN_FLIGHT); - - CDEBUG(D_READA, "reserved pages %lu/%lu/%lu, ra_cur %d, ra_max %lu\n", - ria->ria_reserved, len, mlen, - atomic_read(&ll_i2sbi(inode)->ll_ra_info.ra_cur_pages), - ll_i2sbi(inode)->ll_ra_info.ra_max_pages); - - ra_end = ll_read_ahead_pages(env, io, queue, ras, ria); - - if (ria->ria_reserved) - ll_ra_count_put(ll_i2sbi(inode), ria->ria_reserved); - - if (ra_end == end && ra_end == (kms >> PAGE_SHIFT)) - ll_ra_stats_inc(inode, RA_STAT_EOF); - - /* if we didn't get to the end of the region we reserved from - * the ras we need to go back and update the ras so that the - * next read-ahead tries from where we left off. 
we only do so - * if the region we failed to issue read-ahead on is still ahead - * of the app and behind the next index to start read-ahead from - */ - CDEBUG(D_READA, "ra_end = %lu end = %lu stride end = %lu pages = %d\n", - ra_end, end, ria->ria_end, ret); - - if (ra_end > 0 && ra_end != end) { - ll_ra_stats_inc(inode, RA_STAT_FAILED_REACH_END); - spin_lock(&ras->ras_lock); - if (ra_end <= ras->ras_next_readahead && - index_in_window(ra_end, ras->ras_window_start, 0, - ras->ras_window_len)) { - ras->ras_next_readahead = ra_end + 1; - RAS_CDEBUG(ras); - } - spin_unlock(&ras->ras_lock); - } - - return ret; -} - -static void ras_set_start(struct inode *inode, struct ll_readahead_state *ras, - unsigned long index) -{ - ras->ras_window_start = ras_align(ras, index, NULL); -} - -/* called with the ras_lock held or from places where it doesn't matter */ -static void ras_reset(struct inode *inode, struct ll_readahead_state *ras, - unsigned long index) -{ - ras->ras_last_readpage = index; - ras->ras_consecutive_requests = 0; - ras->ras_consecutive_pages = 0; - ras->ras_window_len = 0; - ras_set_start(inode, ras, index); - ras->ras_next_readahead = max(ras->ras_window_start, index + 1); - - RAS_CDEBUG(ras); -} - -/* called with the ras_lock held or from places where it doesn't matter */ -static void ras_stride_reset(struct ll_readahead_state *ras) -{ - ras->ras_consecutive_stride_requests = 0; - ras->ras_stride_length = 0; - ras->ras_stride_pages = 0; - RAS_CDEBUG(ras); -} - -void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras) -{ - spin_lock_init(&ras->ras_lock); - ras->ras_rpc_size = PTLRPC_MAX_BRW_PAGES; - ras_reset(inode, ras, 0); - ras->ras_requests = 0; -} - -/* - * Check whether the read request is in the stride window. - * If it is in the stride window, return 1, otherwise return 0. 
- */ -static int index_in_stride_window(struct ll_readahead_state *ras, - unsigned long index) -{ - unsigned long stride_gap; - - if (ras->ras_stride_length == 0 || ras->ras_stride_pages == 0 || - ras->ras_stride_pages == ras->ras_stride_length) - return 0; - - stride_gap = index - ras->ras_last_readpage - 1; - - /* If it is contiguous read */ - if (stride_gap == 0) - return ras->ras_consecutive_pages + 1 <= ras->ras_stride_pages; - - /* Otherwise check the stride by itself */ - return (ras->ras_stride_length - ras->ras_stride_pages) == stride_gap && - ras->ras_consecutive_pages == ras->ras_stride_pages; -} - -static void ras_update_stride_detector(struct ll_readahead_state *ras, - unsigned long index) -{ - unsigned long stride_gap = index - ras->ras_last_readpage - 1; - - if ((stride_gap != 0 || ras->ras_consecutive_stride_requests == 0) && - !stride_io_mode(ras)) { - ras->ras_stride_pages = ras->ras_consecutive_pages; - ras->ras_stride_length = ras->ras_consecutive_pages + - stride_gap; - } - LASSERT(ras->ras_request_index == 0); - LASSERT(ras->ras_consecutive_stride_requests == 0); - - if (index <= ras->ras_last_readpage) { - /*Reset stride window for forward read*/ - ras_stride_reset(ras); - return; - } - - ras->ras_stride_pages = ras->ras_consecutive_pages; - ras->ras_stride_length = stride_gap + ras->ras_consecutive_pages; - - RAS_CDEBUG(ras); -} - -/* Stride Read-ahead window will be increased inc_len according to - * stride I/O pattern - */ -static void ras_stride_increase_window(struct ll_readahead_state *ras, - struct ll_ra_info *ra, - unsigned long inc_len) -{ - unsigned long left, step, window_len; - unsigned long stride_len; - - LASSERT(ras->ras_stride_length > 0); - LASSERTF(ras->ras_window_start + ras->ras_window_len >= - ras->ras_stride_offset, - "window_start %lu, window_len %lu stride_offset %lu\n", - ras->ras_window_start, - ras->ras_window_len, ras->ras_stride_offset); - - stride_len = ras->ras_window_start + ras->ras_window_len - - 
ras->ras_stride_offset; - - left = stride_len % ras->ras_stride_length; - window_len = ras->ras_window_len - left; - - if (left < ras->ras_stride_pages) - left += inc_len; - else - left = ras->ras_stride_pages + inc_len; - - LASSERT(ras->ras_stride_pages != 0); - - step = left / ras->ras_stride_pages; - left %= ras->ras_stride_pages; - - window_len += step * ras->ras_stride_length + left; - - if (stride_pg_count(ras->ras_stride_offset, ras->ras_stride_length, - ras->ras_stride_pages, ras->ras_stride_offset, - window_len) <= ra->ra_max_pages_per_file) - ras->ras_window_len = window_len; - - RAS_CDEBUG(ras); -} - -static void ras_increase_window(struct inode *inode, - struct ll_readahead_state *ras, - struct ll_ra_info *ra) -{ - /* The stretch of ra-window should be aligned with max rpc_size - * but current clio architecture does not support retrieve such - * information from lower layer. FIXME later - */ - if (stride_io_mode(ras)) { - ras_stride_increase_window(ras, ra, ras->ras_rpc_size); - } else { - unsigned long wlen; - - wlen = min(ras->ras_window_len + ras->ras_rpc_size, - ra->ra_max_pages_per_file); - ras->ras_window_len = ras_align(ras, wlen, NULL); - } -} - -static void ras_update(struct ll_sb_info *sbi, struct inode *inode, - struct ll_readahead_state *ras, unsigned long index, - enum ras_update_flags flags) -{ - struct ll_ra_info *ra = &sbi->ll_ra_info; - int zero = 0, stride_detect = 0, ra_miss = 0; - bool hit = flags & LL_RAS_HIT; - - spin_lock(&ras->ras_lock); - - if (!hit) - CDEBUG(D_READA, DFID " pages at %lu miss.\n", - PFID(ll_inode2fid(inode)), index); - - ll_ra_stats_inc_sbi(sbi, hit ? RA_STAT_HIT : RA_STAT_MISS); - - /* reset the read-ahead window in two cases. First when the app seeks - * or reads to some other part of the file. Secondly if we get a - * read-ahead miss that we think we've previously issued. This can - * be a symptom of there being so many read-ahead pages that the VM is - * reclaiming it before we get to it. 
- */ - if (!index_in_window(index, ras->ras_last_readpage, 8, 8)) { - zero = 1; - ll_ra_stats_inc_sbi(sbi, RA_STAT_DISTANT_READPAGE); - } else if (!hit && ras->ras_window_len && - index < ras->ras_next_readahead && - index_in_window(index, ras->ras_window_start, 0, - ras->ras_window_len)) { - ra_miss = 1; - ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW); - } - - /* On the second access to a file smaller than the tunable - * ra_max_read_ahead_whole_pages trigger RA on all pages in the - * file up to ra_max_pages_per_file. This is simply a best effort - * and only occurs once per open file. Normal RA behavior is reverted - * to for subsequent IO. The mmap case does not increment - * ras_requests and thus can never trigger this behavior. - */ - if (ras->ras_requests >= 2 && !ras->ras_request_index) { - __u64 kms_pages; - - kms_pages = (i_size_read(inode) + PAGE_SIZE - 1) >> - PAGE_SHIFT; - - CDEBUG(D_READA, "kmsp %llu mwp %lu mp %lu\n", kms_pages, - ra->ra_max_read_ahead_whole_pages, - ra->ra_max_pages_per_file); - - if (kms_pages && - kms_pages <= ra->ra_max_read_ahead_whole_pages) { - ras->ras_window_start = 0; - ras->ras_next_readahead = index + 1; - ras->ras_window_len = min(ra->ra_max_pages_per_file, - ra->ra_max_read_ahead_whole_pages); - goto out_unlock; - } - } - if (zero) { - /* check whether it is in stride I/O mode*/ - if (!index_in_stride_window(ras, index)) { - if (ras->ras_consecutive_stride_requests == 0 && - ras->ras_request_index == 0) { - ras_update_stride_detector(ras, index); - ras->ras_consecutive_stride_requests++; - } else { - ras_stride_reset(ras); - } - ras_reset(inode, ras, index); - ras->ras_consecutive_pages++; - goto out_unlock; - } else { - ras->ras_consecutive_pages = 0; - ras->ras_consecutive_requests = 0; - if (++ras->ras_consecutive_stride_requests > 1) - stride_detect = 1; - RAS_CDEBUG(ras); - } - } else { - if (ra_miss) { - if (index_in_stride_window(ras, index) && - stride_io_mode(ras)) { - if (index != ras->ras_last_readpage + 
1) - ras->ras_consecutive_pages = 0; - ras_reset(inode, ras, index); - - /* If stride-RA hit cache miss, the stride - * detector will not be reset to avoid the - * overhead of redetecting read-ahead mode, - * but on the condition that the stride window - * is still intersect with normal sequential - * read-ahead window. - */ - if (ras->ras_window_start < - ras->ras_stride_offset) - ras_stride_reset(ras); - RAS_CDEBUG(ras); - } else { - /* Reset both stride window and normal RA - * window - */ - ras_reset(inode, ras, index); - ras->ras_consecutive_pages++; - ras_stride_reset(ras); - goto out_unlock; - } - } else if (stride_io_mode(ras)) { - /* If this is contiguous read but in stride I/O mode - * currently, check whether stride step still is valid, - * if invalid, it will reset the stride ra window - */ - if (!index_in_stride_window(ras, index)) { - /* Shrink stride read-ahead window to be zero */ - ras_stride_reset(ras); - ras->ras_window_len = 0; - ras->ras_next_readahead = index; - } - } - } - ras->ras_consecutive_pages++; - ras->ras_last_readpage = index; - ras_set_start(inode, ras, index); - - if (stride_io_mode(ras)) { - /* Since stride readahead is sensitive to the offset - * of read-ahead, so we use original offset here, - * instead of ras_window_start, which is RPC aligned - */ - ras->ras_next_readahead = max(index, ras->ras_next_readahead); - ras->ras_window_start = max(ras->ras_stride_offset, - ras->ras_window_start); - } else { - if (ras->ras_next_readahead < ras->ras_window_start) - ras->ras_next_readahead = ras->ras_window_start; - if (!hit) - ras->ras_next_readahead = index + 1; - } - RAS_CDEBUG(ras); - - /* Trigger RA in the mmap case where ras_consecutive_requests - * is not incremented and thus can't be used to trigger RA - */ - if (ras->ras_consecutive_pages >= 4 && flags & LL_RAS_MMAP) { - ras_increase_window(inode, ras, ra); - /* - * reset consecutive pages so that the readahead window can - * grow gradually. 
- */ - ras->ras_consecutive_pages = 0; - goto out_unlock; - } - - /* Initially reset the stride window offset to next_readahead*/ - if (ras->ras_consecutive_stride_requests == 2 && stride_detect) { - /** - * Once stride IO mode is detected, next_readahead should be - * reset to make sure next_readahead > stride offset - */ - ras->ras_next_readahead = max(index, ras->ras_next_readahead); - ras->ras_stride_offset = index; - ras->ras_window_start = max(index, ras->ras_window_start); - } - - /* The initial ras_window_len is set to the request size. To avoid - * uselessly reading and discarding pages for random IO the window is - * only increased once per consecutive request received. - */ - if ((ras->ras_consecutive_requests > 1 || stride_detect) && - !ras->ras_request_index) - ras_increase_window(inode, ras, ra); -out_unlock: - RAS_CDEBUG(ras); - ras->ras_request_index++; - spin_unlock(&ras->ras_lock); -} - -int ll_writepage(struct page *vmpage, struct writeback_control *wbc) -{ - struct inode *inode = vmpage->mapping->host; - struct ll_inode_info *lli = ll_i2info(inode); - struct lu_env *env; - struct cl_io *io; - struct cl_page *page; - struct cl_object *clob; - bool redirtied = false; - bool unlocked = false; - int result; - u16 refcheck; - - LASSERT(PageLocked(vmpage)); - LASSERT(!PageWriteback(vmpage)); - - LASSERT(ll_i2dtexp(inode)); - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) { - result = PTR_ERR(env); - goto out; - } - - clob = ll_i2info(inode)->lli_clob; - LASSERT(clob); - - io = vvp_env_thread_io(env); - io->ci_obj = clob; - io->ci_ignore_layout = 1; - result = cl_io_init(env, io, CIT_MISC, clob); - if (result == 0) { - page = cl_page_find(env, clob, vmpage->index, - vmpage, CPT_CACHEABLE); - if (!IS_ERR(page)) { - lu_ref_add(&page->cp_reference, "writepage", - current); - cl_page_assume(env, io, page); - result = cl_page_flush(env, io, page); - if (result != 0) { - /* - * Re-dirty page on error so it retries write, - * but not in case when IO has 
actually - * occurred and completed with an error. - */ - if (!PageError(vmpage)) { - redirty_page_for_writepage(wbc, vmpage); - result = 0; - redirtied = true; - } - } - cl_page_disown(env, io, page); - unlocked = true; - lu_ref_del(&page->cp_reference, - "writepage", current); - cl_page_put(env, page); - } else { - result = PTR_ERR(page); - } - } - cl_io_fini(env, io); - - if (redirtied && wbc->sync_mode == WB_SYNC_ALL) { - loff_t offset = cl_offset(clob, vmpage->index); - - /* Flush page failed because the extent is being written out. - * Wait for the write of extent to be finished to avoid - * breaking kernel which assumes ->writepage should mark - * PageWriteback or clean the page. - */ - result = cl_sync_file_range(inode, offset, - offset + PAGE_SIZE - 1, - CL_FSYNC_LOCAL, 1); - if (result > 0) { - /* actually we may have written more than one page. - * decreasing this page because the caller will count - * it. - */ - wbc->nr_to_write -= result - 1; - result = 0; - } - } - - cl_env_put(env, &refcheck); - goto out; - -out: - if (result < 0) { - if (!lli->lli_async_rc) - lli->lli_async_rc = result; - SetPageError(vmpage); - if (!unlocked) - unlock_page(vmpage); - } - return result; -} - -int ll_writepages(struct address_space *mapping, struct writeback_control *wbc) -{ - struct inode *inode = mapping->host; - struct ll_sb_info *sbi = ll_i2sbi(inode); - loff_t start; - loff_t end; - enum cl_fsync_mode mode; - int range_whole = 0; - int result; - int ignore_layout = 0; - - if (wbc->range_cyclic) { - start = mapping->writeback_index << PAGE_SHIFT; - end = OBD_OBJECT_EOF; - } else { - start = wbc->range_start; - end = wbc->range_end; - if (end == LLONG_MAX) { - end = OBD_OBJECT_EOF; - range_whole = start == 0; - } - } - - mode = CL_FSYNC_NONE; - if (wbc->sync_mode == WB_SYNC_ALL) - mode = CL_FSYNC_LOCAL; - - if (sbi->ll_umounting) - /* if the mountpoint is being umounted, all pages have to be - * evicted to avoid hitting LBUG when truncate_inode_pages() - * is 
called later on. - */ - ignore_layout = 1; - - if (!ll_i2info(inode)->lli_clob) - return 0; - - result = cl_sync_file_range(inode, start, end, mode, ignore_layout); - if (result > 0) { - wbc->nr_to_write -= result; - result = 0; - } - - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) { - if (end == OBD_OBJECT_EOF) - mapping->writeback_index = 0; - else - mapping->writeback_index = (end >> PAGE_SHIFT) + 1; - } - return result; -} - -struct ll_cl_context *ll_cl_find(struct file *file) -{ - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - struct ll_cl_context *lcc; - struct ll_cl_context *found = NULL; - - read_lock(&fd->fd_lock); - list_for_each_entry(lcc, &fd->fd_lccs, lcc_list) { - if (lcc->lcc_cookie == current) { - found = lcc; - break; - } - } - read_unlock(&fd->fd_lock); - - return found; -} - -void ll_cl_add(struct file *file, const struct lu_env *env, struct cl_io *io) -{ - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - struct ll_cl_context *lcc = &ll_env_info(env)->lti_io_ctx; - - memset(lcc, 0, sizeof(*lcc)); - INIT_LIST_HEAD(&lcc->lcc_list); - lcc->lcc_cookie = current; - lcc->lcc_env = env; - lcc->lcc_io = io; - - write_lock(&fd->fd_lock); - list_add(&lcc->lcc_list, &fd->fd_lccs); - write_unlock(&fd->fd_lock); -} - -void ll_cl_remove(struct file *file, const struct lu_env *env) -{ - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - struct ll_cl_context *lcc = &ll_env_info(env)->lti_io_ctx; - - write_lock(&fd->fd_lock); - list_del_init(&lcc->lcc_list); - write_unlock(&fd->fd_lock); -} - -static int ll_io_read_page(const struct lu_env *env, struct cl_io *io, - struct cl_page *page) -{ - struct inode *inode = vvp_object_inode(page->cp_obj); - struct ll_file_data *fd = vvp_env_io(env)->vui_fd; - struct ll_readahead_state *ras = &fd->fd_ras; - struct cl_2queue *queue = &io->ci_queue; - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct vvp_page *vpg; - bool uptodate; - int rc = 0; - - vpg = cl2vvp_page(cl_object_page_slice(page->cp_obj, 
page)); - uptodate = vpg->vpg_defer_uptodate; - - if (sbi->ll_ra_info.ra_max_pages_per_file > 0 && - sbi->ll_ra_info.ra_max_pages > 0) { - struct vvp_io *vio = vvp_env_io(env); - enum ras_update_flags flags = 0; - - if (uptodate) - flags |= LL_RAS_HIT; - if (!vio->vui_ra_valid) - flags |= LL_RAS_MMAP; - ras_update(sbi, inode, ras, vvp_index(vpg), flags); - } - - cl_2queue_init(queue); - if (uptodate) { - vpg->vpg_ra_used = 1; - cl_page_export(env, page, 1); - cl_page_disown(env, io, page); - } else { - cl_page_list_add(&queue->c2_qin, page); - } - - if (sbi->ll_ra_info.ra_max_pages_per_file > 0 && - sbi->ll_ra_info.ra_max_pages > 0) { - int rc2; - - rc2 = ll_readahead(env, io, &queue->c2_qin, ras, - uptodate); - CDEBUG(D_READA, DFID "%d pages read ahead at %lu\n", - PFID(ll_inode2fid(inode)), rc2, vvp_index(vpg)); - } - - if (queue->c2_qin.pl_nr > 0) - rc = cl_io_submit_rw(env, io, CRT_READ, queue); - - /* - * Unlock unsent pages in case of error. - */ - cl_page_list_disown(env, io, &queue->c2_qin); - cl_2queue_fini(env, queue); - - return rc; -} - -int ll_readpage(struct file *file, struct page *vmpage) -{ - struct cl_object *clob = ll_i2info(file_inode(file))->lli_clob; - struct ll_cl_context *lcc; - const struct lu_env *env; - struct cl_io *io; - struct cl_page *page; - int result; - - lcc = ll_cl_find(file); - if (!lcc) { - unlock_page(vmpage); - return -EIO; - } - - env = lcc->lcc_env; - io = lcc->lcc_io; - LASSERT(io->ci_state == CIS_IO_GOING); - page = cl_page_find(env, clob, vmpage->index, vmpage, CPT_CACHEABLE); - if (!IS_ERR(page)) { - LASSERT(page->cp_type == CPT_CACHEABLE); - if (likely(!PageUptodate(vmpage))) { - cl_page_assume(env, io, page); - result = ll_io_read_page(env, io, page); - } else { - /* Page from a non-object file. 
*/ - unlock_page(vmpage); - result = 0; - } - cl_page_put(env, page); - } else { - unlock_page(vmpage); - result = PTR_ERR(page); - } - return result; -} - -int ll_page_sync_io(const struct lu_env *env, struct cl_io *io, - struct cl_page *page, enum cl_req_type crt) -{ - struct cl_2queue *queue; - int result; - - LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); - - queue = &io->ci_queue; - cl_2queue_init_page(queue, page); - - result = cl_io_submit_sync(env, io, crt, queue, 0); - LASSERT(cl_page_is_owned(page, io)); - - if (crt == CRT_READ) - /* - * in CRT_WRITE case page is left locked even in case of - * error. - */ - cl_page_list_disown(env, io, &queue->c2_qin); - cl_2queue_fini(env, queue); - - return result; -} diff --git a/drivers/staging/lustre/lustre/llite/rw26.c b/drivers/staging/lustre/lustre/llite/rw26.c deleted file mode 100644 index 722e5ea1af5f..000000000000 --- a/drivers/staging/lustre/lustre/llite/rw26.c +++ /dev/null @@ -1,641 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. 
- * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/lustre/llite/rw26.c - * - * Lustre Lite I/O page cache routines for the 2.5/2.6 kernel version - */ - -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/stat.h> -#include <linux/errno.h> -#include <linux/unistd.h> -#include <linux/uaccess.h> - -#include <linux/migrate.h> -#include <linux/fs.h> -#include <linux/buffer_head.h> -#include <linux/mpage.h> -#include <linux/writeback.h> -#include <linux/pagemap.h> - -#define DEBUG_SUBSYSTEM S_LLITE - -#include "llite_internal.h" - -/** - * Implements Linux VM address_space::invalidatepage() method. This method is - * called when the page is truncate from a file, either as a result of - * explicit truncate, or when inode is removed from memory (as a result of - * final iput(), umount, or memory pressure induced icache shrinking). - * - * [0, offset] bytes of the page remain valid (this is for a case of not-page - * aligned truncate). Lustre leaves partially truncated page in the cache, - * relying on struct inode::i_size to limit further accesses. 
- */ -static void ll_invalidatepage(struct page *vmpage, unsigned int offset, - unsigned int length) -{ - struct inode *inode; - struct lu_env *env; - struct cl_page *page; - struct cl_object *obj; - - LASSERT(PageLocked(vmpage)); - LASSERT(!PageWriteback(vmpage)); - - /* - * It is safe to not check anything in invalidatepage/releasepage - * below because they are run with page locked and all our io is - * happening with locked page too - */ - if (offset == 0 && length == PAGE_SIZE) { - /* See the comment in ll_releasepage() */ - env = cl_env_percpu_get(); - LASSERT(!IS_ERR(env)); - inode = vmpage->mapping->host; - obj = ll_i2info(inode)->lli_clob; - if (obj) { - page = cl_vmpage_page(vmpage, obj); - if (page) { - cl_page_delete(env, page); - cl_page_put(env, page); - } - } else { - LASSERT(vmpage->private == 0); - } - cl_env_percpu_put(env); - } -} - -static int ll_releasepage(struct page *vmpage, gfp_t gfp_mask) -{ - struct lu_env *env; - struct cl_object *obj; - struct cl_page *page; - struct address_space *mapping; - int result = 0; - - LASSERT(PageLocked(vmpage)); - if (PageWriteback(vmpage) || PageDirty(vmpage)) - return 0; - - mapping = vmpage->mapping; - if (!mapping) - return 1; - - obj = ll_i2info(mapping->host)->lli_clob; - if (!obj) - return 1; - - /* 1 for caller, 1 for cl_page and 1 for page cache */ - if (page_count(vmpage) > 3) - return 0; - - page = cl_vmpage_page(vmpage, obj); - if (!page) - return 1; - - env = cl_env_percpu_get(); - LASSERT(!IS_ERR(env)); - - if (!cl_page_in_use(page)) { - result = 1; - cl_page_delete(env, page); - } - - /* To use percpu env array, the call path can not be rescheduled; - * otherwise percpu array will be messed if ll_releaspage() called - * again on the same CPU. - * - * If this page holds the last refc of cl_object, the following - * call path may cause reschedule: - * cl_page_put -> cl_page_free -> cl_object_put -> - * lu_object_put -> lu_object_free -> lov_delete_raid0. 
- * - * However, the kernel can't get rid of this inode until all pages have - * been cleaned up. Now that we hold page lock here, it's pretty safe - * that we won't get into object delete path. - */ - LASSERT(cl_object_refc(obj) > 1); - cl_page_put(env, page); - - cl_env_percpu_put(env); - return result; -} - -#define MAX_DIRECTIO_SIZE (2 * 1024 * 1024 * 1024UL) - -/* ll_free_user_pages - tear down page struct array - * @pages: array of page struct pointers underlying target buffer - */ -static void ll_free_user_pages(struct page **pages, int npages, int do_dirty) -{ - int i; - - for (i = 0; i < npages; i++) { - if (do_dirty) - set_page_dirty_lock(pages[i]); - put_page(pages[i]); - } - kvfree(pages); -} - -ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io, - int rw, struct inode *inode, - struct ll_dio_pages *pv) -{ - struct cl_page *clp; - struct cl_2queue *queue; - struct cl_object *obj = io->ci_obj; - int i; - ssize_t rc = 0; - loff_t file_offset = pv->ldp_start_offset; - size_t size = pv->ldp_size; - int page_count = pv->ldp_nr; - struct page **pages = pv->ldp_pages; - size_t page_size = cl_page_size(obj); - bool do_io; - int io_pages = 0; - - queue = &io->ci_queue; - cl_2queue_init(queue); - for (i = 0; i < page_count; i++) { - if (pv->ldp_offsets) - file_offset = pv->ldp_offsets[i]; - - LASSERT(!(file_offset & (page_size - 1))); - clp = cl_page_find(env, obj, cl_index(obj, file_offset), - pv->ldp_pages[i], CPT_TRANSIENT); - if (IS_ERR(clp)) { - rc = PTR_ERR(clp); - break; - } - - rc = cl_page_own(env, io, clp); - if (rc) { - LASSERT(clp->cp_state == CPS_FREEING); - cl_page_put(env, clp); - break; - } - - do_io = true; - - /* check the page type: if the page is a host page, then do - * write directly - */ - if (clp->cp_type == CPT_CACHEABLE) { - struct page *vmpage = cl_page_vmpage(clp); - struct page *src_page; - struct page *dst_page; - void *src; - void *dst; - - src_page = (rw == WRITE) ? pages[i] : vmpage; - dst_page = (rw == WRITE) ? 
vmpage : pages[i]; - - src = kmap_atomic(src_page); - dst = kmap_atomic(dst_page); - memcpy(dst, src, min(page_size, size)); - kunmap_atomic(dst); - kunmap_atomic(src); - - /* make sure page will be added to the transfer by - * cl_io_submit()->...->vvp_page_prep_write(). - */ - if (rw == WRITE) - set_page_dirty(vmpage); - - if (rw == READ) { - /* do not issue the page for read, since it - * may reread a ra page which has NOT uptodate - * bit set. - */ - cl_page_disown(env, io, clp); - do_io = false; - } - } - - if (likely(do_io)) { - /* - * Add a page to the incoming page list of 2-queue. - */ - cl_page_list_add(&queue->c2_qin, clp); - - /* - * Set page clip to tell transfer formation engine - * that page has to be sent even if it is beyond KMS. - */ - cl_page_clip(env, clp, 0, min(size, page_size)); - - ++io_pages; - } - - /* drop the reference count for cl_page_find */ - cl_page_put(env, clp); - size -= page_size; - file_offset += page_size; - } - - if (rc == 0 && io_pages) { - rc = cl_io_submit_sync(env, io, - rw == READ ? CRT_READ : CRT_WRITE, - queue, 0); - } - if (rc == 0) - rc = pv->ldp_size; - - cl_2queue_discard(env, io, queue); - cl_2queue_disown(env, io, queue); - cl_2queue_fini(env, queue); - return rc; -} -EXPORT_SYMBOL(ll_direct_rw_pages); - -static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io, - int rw, struct inode *inode, - struct address_space *mapping, - size_t size, loff_t file_offset, - struct page **pages, int page_count) -{ - struct ll_dio_pages pvec = { - .ldp_pages = pages, - .ldp_nr = page_count, - .ldp_size = size, - .ldp_offsets = NULL, - .ldp_start_offset = file_offset - }; - - return ll_direct_rw_pages(env, io, rw, inode, &pvec); -} - -/* This is the maximum size of a single O_DIRECT request, based on the - * kmalloc limit. We need to fit all of the brw_page structs, each one - * representing PAGE_SIZE worth of user data, into a single buffer, and - * then truncate this to be a full-sized RPC. 
For 4kB PAGE_SIZE this is - * up to 22MB for 128kB kmalloc and up to 682MB for 4MB kmalloc. - */ -#define MAX_DIO_SIZE ((KMALLOC_MAX_SIZE / sizeof(struct brw_page) * \ - PAGE_SIZE) & ~(DT_MAX_BRW_SIZE - 1)) -static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter) -{ - struct ll_cl_context *lcc; - const struct lu_env *env; - struct cl_io *io; - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - loff_t file_offset = iocb->ki_pos; - ssize_t count = iov_iter_count(iter); - ssize_t tot_bytes = 0, result = 0; - long size = MAX_DIO_SIZE; - - /* Check EOF by ourselves */ - if (iov_iter_rw(iter) == READ && file_offset >= i_size_read(inode)) - return 0; - - /* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? */ - if ((file_offset & ~PAGE_MASK) || (count & ~PAGE_MASK)) - return -EINVAL; - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p), size=%zd (max %lu), offset=%lld=%llx, pages %zd (max %lu)\n", - PFID(ll_inode2fid(inode)), inode, count, MAX_DIO_SIZE, - file_offset, file_offset, count >> PAGE_SHIFT, - MAX_DIO_SIZE >> PAGE_SHIFT); - - /* Check that all user buffers are aligned as well */ - if (iov_iter_alignment(iter) & ~PAGE_MASK) - return -EINVAL; - - lcc = ll_cl_find(file); - if (!lcc) - return -EIO; - - env = lcc->lcc_env; - LASSERT(!IS_ERR(env)); - io = lcc->lcc_io; - LASSERT(io); - - while (iov_iter_count(iter)) { - struct page **pages; - size_t offs; - - count = min_t(size_t, iov_iter_count(iter), size); - if (iov_iter_rw(iter) == READ) { - if (file_offset >= i_size_read(inode)) - break; - if (file_offset + count > i_size_read(inode)) - count = i_size_read(inode) - file_offset; - } - - result = iov_iter_get_pages_alloc(iter, &pages, count, &offs); - if (likely(result > 0)) { - int n = DIV_ROUND_UP(result + offs, PAGE_SIZE); - - result = ll_direct_IO_26_seg(env, io, iov_iter_rw(iter), - inode, file->f_mapping, - result, file_offset, pages, - n); - ll_free_user_pages(pages, n, iov_iter_rw(iter) == READ); - } - if 
(unlikely(result <= 0)) { - /* If we can't allocate a large enough buffer - * for the request, shrink it to a smaller - * PAGE_SIZE multiple and try again. - * We should always be able to kmalloc for a - * page worth of page pointers = 4MB on i386. - */ - if (result == -ENOMEM && - size > (PAGE_SIZE / sizeof(*pages)) * - PAGE_SIZE) { - size = ((((size / 2) - 1) | - ~PAGE_MASK) + 1) & - PAGE_MASK; - CDEBUG(D_VFSTRACE, "DIO size now %lu\n", - size); - continue; - } - - goto out; - } - iov_iter_advance(iter, result); - tot_bytes += result; - file_offset += result; - } -out: - if (tot_bytes > 0) { - struct vvp_io *vio = vvp_env_io(env); - - /* no commit async for direct IO */ - vio->u.write.vui_written += tot_bytes; - } - - return tot_bytes ? tot_bytes : result; -} - -/** - * Prepare partially written-to page for a write. - */ -static int ll_prepare_partial_page(const struct lu_env *env, struct cl_io *io, - struct cl_page *pg) -{ - struct cl_attr *attr = vvp_env_thread_attr(env); - struct cl_object *obj = io->ci_obj; - struct vvp_page *vpg = cl_object_page_slice(obj, pg); - loff_t offset = cl_offset(obj, vvp_index(vpg)); - int result; - - cl_object_attr_lock(obj); - result = cl_object_attr_get(env, obj, attr); - cl_object_attr_unlock(obj); - if (result == 0) { - /* - * If are writing to a new page, no need to read old data. - * The extent locking will have updated the KMS, and for our - * purposes here we can treat it like i_size. 
- */ - if (attr->cat_kms <= offset) { - char *kaddr = kmap_atomic(vpg->vpg_page); - - memset(kaddr, 0, cl_page_size(obj)); - kunmap_atomic(kaddr); - } else if (vpg->vpg_defer_uptodate) { - vpg->vpg_ra_used = 1; - } else { - result = ll_page_sync_io(env, io, pg, CRT_READ); - } - } - return result; -} - -static int ll_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned int len, unsigned int flags, - struct page **pagep, void **fsdata) -{ - struct ll_cl_context *lcc; - const struct lu_env *env = NULL; - struct cl_io *io; - struct cl_page *page = NULL; - struct cl_object *clob = ll_i2info(mapping->host)->lli_clob; - pgoff_t index = pos >> PAGE_SHIFT; - struct page *vmpage = NULL; - unsigned int from = pos & (PAGE_SIZE - 1); - unsigned int to = from + len; - int result = 0; - - CDEBUG(D_VFSTRACE, "Writing %lu of %d to %d bytes\n", index, from, len); - - lcc = ll_cl_find(file); - if (!lcc) { - io = NULL; - result = -EIO; - goto out; - } - - env = lcc->lcc_env; - io = lcc->lcc_io; - - /* To avoid deadlock, try to lock page first. */ - vmpage = grab_cache_page_nowait(mapping, index); - if (unlikely(!vmpage || PageDirty(vmpage) || PageWriteback(vmpage))) { - struct vvp_io *vio = vvp_env_io(env); - struct cl_page_list *plist = &vio->u.write.vui_queue; - - /* if the page is already in dirty cache, we have to commit - * the pages right now; otherwise, it may cause deadlock - * because it holds page lock of a dirty page and request for - * more grants. It's okay for the dirty page to be the first - * one in commit page list, though. 
- */ - if (vmpage && plist->pl_nr > 0) { - unlock_page(vmpage); - put_page(vmpage); - vmpage = NULL; - } - - /* commit pages and then wait for page lock */ - result = vvp_io_write_commit(env, io); - if (result < 0) - goto out; - - if (!vmpage) { - vmpage = grab_cache_page_write_begin(mapping, index, - flags); - if (!vmpage) { - result = -ENOMEM; - goto out; - } - } - } - - page = cl_page_find(env, clob, vmpage->index, vmpage, CPT_CACHEABLE); - if (IS_ERR(page)) { - result = PTR_ERR(page); - goto out; - } - - lcc->lcc_page = page; - lu_ref_add(&page->cp_reference, "cl_io", io); - - cl_page_assume(env, io, page); - if (!PageUptodate(vmpage)) { - /* - * We're completely overwriting an existing page, - * so _don't_ set it up to date until commit_write - */ - if (from == 0 && to == PAGE_SIZE) { - CL_PAGE_HEADER(D_PAGE, env, page, "full page write\n"); - POISON_PAGE(vmpage, 0x11); - } else { - /* TODO: can be optimized at OSC layer to check if it - * is a lockless IO. In that case, it's not necessary - * to read the data. 
- */ - result = ll_prepare_partial_page(env, io, page); - if (result == 0) - SetPageUptodate(vmpage); - } - } - if (result < 0) - cl_page_unassume(env, io, page); -out: - if (result < 0) { - if (vmpage) { - unlock_page(vmpage); - put_page(vmpage); - } - if (!IS_ERR_OR_NULL(page)) { - lu_ref_del(&page->cp_reference, "cl_io", io); - cl_page_put(env, page); - } - if (io) - io->ci_result = result; - } else { - *pagep = vmpage; - *fsdata = lcc; - } - return result; -} - -static int ll_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned int len, unsigned int copied, - struct page *vmpage, void *fsdata) -{ - struct ll_cl_context *lcc = fsdata; - const struct lu_env *env; - struct cl_io *io; - struct vvp_io *vio; - struct cl_page *page; - unsigned int from = pos & (PAGE_SIZE - 1); - bool unplug = false; - int result = 0; - - put_page(vmpage); - - env = lcc->lcc_env; - page = lcc->lcc_page; - io = lcc->lcc_io; - vio = vvp_env_io(env); - - LASSERT(cl_page_is_owned(page, io)); - if (copied > 0) { - struct cl_page_list *plist = &vio->u.write.vui_queue; - - lcc->lcc_page = NULL; /* page will be queued */ - - /* Add it into write queue */ - cl_page_list_add(plist, page); - if (plist->pl_nr == 1) /* first page */ - vio->u.write.vui_from = from; - else - LASSERT(from == 0); - vio->u.write.vui_to = from + copied; - - /* - * To address the deadlock in balance_dirty_pages() where - * this dirty page may be written back in the same thread. 
- */ - if (PageDirty(vmpage)) - unplug = true; - - /* We may have one full RPC, commit it soon */ - if (plist->pl_nr >= PTLRPC_MAX_BRW_PAGES) - unplug = true; - - CL_PAGE_DEBUG(D_VFSTRACE, env, page, - "queued page: %d.\n", plist->pl_nr); - } else { - cl_page_disown(env, io, page); - - lcc->lcc_page = NULL; - lu_ref_del(&page->cp_reference, "cl_io", io); - cl_page_put(env, page); - - /* page list is not contiguous now, commit it now */ - unplug = true; - } - - if (unplug || - file->f_flags & O_SYNC || IS_SYNC(file_inode(file))) - result = vvp_io_write_commit(env, io); - - if (result < 0) - io->ci_result = result; - return result >= 0 ? copied : result; -} - -#ifdef CONFIG_MIGRATION -static int ll_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page, - enum migrate_mode mode - ) -{ - /* Always fail page migration until we have a proper implementation */ - return -EIO; -} -#endif - -const struct address_space_operations ll_aops = { - .readpage = ll_readpage, - .direct_IO = ll_direct_IO_26, - .writepage = ll_writepage, - .writepages = ll_writepages, - .set_page_dirty = __set_page_dirty_nobuffers, - .write_begin = ll_write_begin, - .write_end = ll_write_end, - .invalidatepage = ll_invalidatepage, - .releasepage = (void *)ll_releasepage, -#ifdef CONFIG_MIGRATION - .migratepage = ll_migratepage, -#endif -}; diff --git a/drivers/staging/lustre/lustre/llite/statahead.c b/drivers/staging/lustre/lustre/llite/statahead.c deleted file mode 100644 index 155ce3cf6f60..000000000000 --- a/drivers/staging/lustre/lustre/llite/statahead.c +++ /dev/null @@ -1,1577 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#include <linux/fs.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/pagemap.h> - -#define DEBUG_SUBSYSTEM S_LLITE - -#include <obd_support.h> -#include <lustre_dlm.h> -#include "llite_internal.h" - -#define SA_OMITTED_ENTRY_MAX 8ULL - -enum se_stat { - /** negative values are for error cases */ - SA_ENTRY_INIT = 0, /** init entry */ - SA_ENTRY_SUCC = 1, /** stat succeed */ - SA_ENTRY_INVA = 2, /** invalid entry */ -}; - -/* - * sa_entry is not refcounted: statahead thread allocates it and do async stat, - * and in async stat callback ll_statahead_interpret() will add it into - * sai_interim_entries, later statahead thread will call sa_handle_callback() to - * instantiate entry and move it into sai_entries, and then only scanner process - * can access and free it. 
- */ -struct sa_entry { - /* link into sai_interim_entries or sai_entries */ - struct list_head se_list; - /* link into sai hash table locally */ - struct list_head se_hash; - /* entry index in the sai */ - __u64 se_index; - /* low layer ldlm lock handle */ - __u64 se_handle; - /* entry status */ - enum se_stat se_state; - /* entry size, contains name */ - int se_size; - /* pointer to async getattr enqueue info */ - struct md_enqueue_info *se_minfo; - /* pointer to the async getattr request */ - struct ptlrpc_request *se_req; - /* pointer to the target inode */ - struct inode *se_inode; - /* entry name */ - struct qstr se_qstr; - /* entry fid */ - struct lu_fid se_fid; -}; - -static unsigned int sai_generation; -static DEFINE_SPINLOCK(sai_generation_lock); - -/* sa_entry is ready to use */ -static inline int sa_ready(struct sa_entry *entry) -{ - smp_rmb(); - return (entry->se_state != SA_ENTRY_INIT); -} - -/* hash value to put in sai_cache */ -static inline int sa_hash(int val) -{ - return val & LL_SA_CACHE_MASK; -} - -/* hash entry into sai_cache */ -static inline void -sa_rehash(struct ll_statahead_info *sai, struct sa_entry *entry) -{ - int i = sa_hash(entry->se_qstr.hash); - - spin_lock(&sai->sai_cache_lock[i]); - list_add_tail(&entry->se_hash, &sai->sai_cache[i]); - spin_unlock(&sai->sai_cache_lock[i]); -} - -/* - * Remove entry from SA table. 
- */ -static inline void -sa_unhash(struct ll_statahead_info *sai, struct sa_entry *entry) -{ - int i = sa_hash(entry->se_qstr.hash); - - spin_lock(&sai->sai_cache_lock[i]); - list_del_init(&entry->se_hash); - spin_unlock(&sai->sai_cache_lock[i]); -} - -static inline int agl_should_run(struct ll_statahead_info *sai, - struct inode *inode) -{ - return (inode && S_ISREG(inode->i_mode) && sai->sai_agl_valid); -} - -/* statahead window is full */ -static inline int sa_sent_full(struct ll_statahead_info *sai) -{ - return atomic_read(&sai->sai_cache_count) >= sai->sai_max; -} - -/* got async stat replies */ -static inline int sa_has_callback(struct ll_statahead_info *sai) -{ - return !list_empty(&sai->sai_interim_entries); -} - -static inline int agl_list_empty(struct ll_statahead_info *sai) -{ - return list_empty(&sai->sai_agls); -} - -/** - * (1) hit ratio less than 80% - * or - * (2) consecutive miss more than 8 - * then means low hit. - */ -static inline int sa_low_hit(struct ll_statahead_info *sai) -{ - return ((sai->sai_hit > 7 && sai->sai_hit < 4 * sai->sai_miss) || - (sai->sai_consecutive_miss > 8)); -} - -/* - * if the given index is behind of statahead window more than - * SA_OMITTED_ENTRY_MAX, then it is old. 
- */ -static inline int is_omitted_entry(struct ll_statahead_info *sai, __u64 index) -{ - return ((__u64)sai->sai_max + index + SA_OMITTED_ENTRY_MAX < - sai->sai_index); -} - -/* allocate sa_entry and hash it to allow scanner process to find it */ -static struct sa_entry * -sa_alloc(struct dentry *parent, struct ll_statahead_info *sai, __u64 index, - const char *name, int len, const struct lu_fid *fid) -{ - struct ll_inode_info *lli; - struct sa_entry *entry; - int entry_size; - char *dname; - - entry_size = sizeof(struct sa_entry) + (len & ~3) + 4; - entry = kzalloc(entry_size, GFP_NOFS); - if (unlikely(!entry)) - return ERR_PTR(-ENOMEM); - - CDEBUG(D_READA, "alloc sa entry %.*s(%p) index %llu\n", - len, name, entry, index); - - entry->se_index = index; - entry->se_state = SA_ENTRY_INIT; - entry->se_size = entry_size; - dname = (char *)entry + sizeof(struct sa_entry); - memcpy(dname, name, len); - dname[len] = 0; - - entry->se_qstr.hash = full_name_hash(parent, name, len); - entry->se_qstr.len = len; - entry->se_qstr.name = dname; - entry->se_fid = *fid; - - lli = ll_i2info(sai->sai_dentry->d_inode); - spin_lock(&lli->lli_sa_lock); - INIT_LIST_HEAD(&entry->se_list); - sa_rehash(sai, entry); - spin_unlock(&lli->lli_sa_lock); - - atomic_inc(&sai->sai_cache_count); - - return entry; -} - -/* free sa_entry, which should have been unhashed and not in any list */ -static void sa_free(struct ll_statahead_info *sai, struct sa_entry *entry) -{ - CDEBUG(D_READA, "free sa entry %.*s(%p) index %llu\n", - entry->se_qstr.len, entry->se_qstr.name, entry, - entry->se_index); - - LASSERT(list_empty(&entry->se_list)); - LASSERT(list_empty(&entry->se_hash)); - - kfree(entry); - atomic_dec(&sai->sai_cache_count); -} - -/* - * find sa_entry by name, used by directory scanner, lock is not needed because - * only scanner can remove the entry from cache. 
- */ -static struct sa_entry * -sa_get(struct ll_statahead_info *sai, const struct qstr *qstr) -{ - struct sa_entry *entry; - int i = sa_hash(qstr->hash); - - list_for_each_entry(entry, &sai->sai_cache[i], se_hash) { - if (entry->se_qstr.hash == qstr->hash && - entry->se_qstr.len == qstr->len && - memcmp(entry->se_qstr.name, qstr->name, qstr->len) == 0) - return entry; - } - return NULL; -} - -/* unhash and unlink sa_entry, and then free it */ -static inline void -sa_kill(struct ll_statahead_info *sai, struct sa_entry *entry) -{ - struct ll_inode_info *lli = ll_i2info(sai->sai_dentry->d_inode); - - LASSERT(!list_empty(&entry->se_hash)); - LASSERT(!list_empty(&entry->se_list)); - LASSERT(sa_ready(entry)); - - sa_unhash(sai, entry); - - spin_lock(&lli->lli_sa_lock); - list_del_init(&entry->se_list); - spin_unlock(&lli->lli_sa_lock); - - if (entry->se_inode) - iput(entry->se_inode); - - sa_free(sai, entry); -} - -/* called by scanner after use, sa_entry will be killed */ -static void -sa_put(struct ll_statahead_info *sai, struct sa_entry *entry, struct ll_inode_info *lli) -{ - struct sa_entry *tmp, *next; - - if (entry && entry->se_state == SA_ENTRY_SUCC) { - struct ll_sb_info *sbi = ll_i2sbi(sai->sai_dentry->d_inode); - - sai->sai_hit++; - sai->sai_consecutive_miss = 0; - sai->sai_max = min(2 * sai->sai_max, sbi->ll_sa_max); - } else { - sai->sai_miss++; - sai->sai_consecutive_miss++; - } - - if (entry) - sa_kill(sai, entry); - - /* - * kill old completed entries, only scanner process does this, no need - * to lock - */ - list_for_each_entry_safe(tmp, next, &sai->sai_entries, se_list) { - if (!is_omitted_entry(sai, tmp->se_index)) - break; - sa_kill(sai, tmp); - } - - spin_lock(&lli->lli_sa_lock); - if (sai->sai_task) - wake_up_process(sai->sai_task); - spin_unlock(&lli->lli_sa_lock); - -} - -/* - * update state and sort add entry to sai_entries by index, return true if - * scanner is waiting on this entry. 
- */ -static bool -__sa_make_ready(struct ll_statahead_info *sai, struct sa_entry *entry, int ret) -{ - struct list_head *pos = &sai->sai_entries; - __u64 index = entry->se_index; - struct sa_entry *se; - - LASSERT(!sa_ready(entry)); - LASSERT(list_empty(&entry->se_list)); - - list_for_each_entry_reverse(se, &sai->sai_entries, se_list) { - if (se->se_index < entry->se_index) { - pos = &se->se_list; - break; - } - } - list_add(&entry->se_list, pos); - entry->se_state = ret < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC; - - return (index == sai->sai_index_wait); -} - -/* - * release resources used in async stat RPC, update entry state and wakeup if - * scanner process it waiting on this entry. - */ -static void -sa_make_ready(struct ll_statahead_info *sai, struct sa_entry *entry, int ret) -{ - struct ll_inode_info *lli = ll_i2info(sai->sai_dentry->d_inode); - struct md_enqueue_info *minfo = entry->se_minfo; - struct ptlrpc_request *req = entry->se_req; - bool wakeup; - - /* release resources used in RPC */ - if (minfo) { - entry->se_minfo = NULL; - ll_intent_release(&minfo->mi_it); - iput(minfo->mi_dir); - kfree(minfo); - } - - if (req) { - entry->se_req = NULL; - ptlrpc_req_finished(req); - } - - spin_lock(&lli->lli_sa_lock); - wakeup = __sa_make_ready(sai, entry, ret); - spin_unlock(&lli->lli_sa_lock); - - if (wakeup) - wake_up(&sai->sai_waitq); -} - -/* Insert inode into the list of sai_agls. 
*/ -static void ll_agl_add(struct ll_statahead_info *sai, - struct inode *inode, int index) -{ - struct ll_inode_info *child = ll_i2info(inode); - struct ll_inode_info *parent = ll_i2info(sai->sai_dentry->d_inode); - int added = 0; - - spin_lock(&child->lli_agl_lock); - if (child->lli_agl_index == 0) { - child->lli_agl_index = index; - spin_unlock(&child->lli_agl_lock); - - LASSERT(list_empty(&child->lli_agl_list)); - - igrab(inode); - spin_lock(&parent->lli_agl_lock); - if (list_empty(&sai->sai_agls)) - added = 1; - list_add_tail(&child->lli_agl_list, &sai->sai_agls); - spin_unlock(&parent->lli_agl_lock); - } else { - spin_unlock(&child->lli_agl_lock); - } - - if (added > 0) - wake_up_process(sai->sai_agl_task); -} - -/* allocate sai */ -static struct ll_statahead_info *ll_sai_alloc(struct dentry *dentry) -{ - struct ll_inode_info *lli = ll_i2info(dentry->d_inode); - struct ll_statahead_info *sai; - int i; - - sai = kzalloc(sizeof(*sai), GFP_NOFS); - if (!sai) - return NULL; - - sai->sai_dentry = dget(dentry); - atomic_set(&sai->sai_refcount, 1); - - sai->sai_max = LL_SA_RPC_MIN; - sai->sai_index = 1; - init_waitqueue_head(&sai->sai_waitq); - - INIT_LIST_HEAD(&sai->sai_interim_entries); - INIT_LIST_HEAD(&sai->sai_entries); - INIT_LIST_HEAD(&sai->sai_agls); - - for (i = 0; i < LL_SA_CACHE_SIZE; i++) { - INIT_LIST_HEAD(&sai->sai_cache[i]); - spin_lock_init(&sai->sai_cache_lock[i]); - } - atomic_set(&sai->sai_cache_count, 0); - - spin_lock(&sai_generation_lock); - lli->lli_sa_generation = ++sai_generation; - if (unlikely(!sai_generation)) - lli->lli_sa_generation = ++sai_generation; - spin_unlock(&sai_generation_lock); - - return sai; -} - -/* free sai */ -static inline void ll_sai_free(struct ll_statahead_info *sai) -{ - LASSERT(sai->sai_dentry); - dput(sai->sai_dentry); - kfree(sai); -} - -/* - * take refcount of sai if sai for @dir exists, which means statahead is on for - * this directory. 
- */ -static inline struct ll_statahead_info *ll_sai_get(struct inode *dir) -{ - struct ll_inode_info *lli = ll_i2info(dir); - struct ll_statahead_info *sai = NULL; - - spin_lock(&lli->lli_sa_lock); - sai = lli->lli_sai; - if (sai) - atomic_inc(&sai->sai_refcount); - spin_unlock(&lli->lli_sa_lock); - - return sai; -} - -/* - * put sai refcount after use, if refcount reaches zero, free sai and sa_entries - * attached to it. - */ -static void ll_sai_put(struct ll_statahead_info *sai) -{ - struct ll_inode_info *lli = ll_i2info(sai->sai_dentry->d_inode); - - if (atomic_dec_and_lock(&sai->sai_refcount, &lli->lli_sa_lock)) { - struct ll_sb_info *sbi = ll_i2sbi(sai->sai_dentry->d_inode); - struct sa_entry *entry, *next; - - lli->lli_sai = NULL; - spin_unlock(&lli->lli_sa_lock); - - LASSERT(sai->sai_task == NULL); - LASSERT(sai->sai_agl_task == NULL); - LASSERT(sai->sai_sent == sai->sai_replied); - LASSERT(!sa_has_callback(sai)); - - list_for_each_entry_safe(entry, next, &sai->sai_entries, - se_list) - sa_kill(sai, entry); - - LASSERT(atomic_read(&sai->sai_cache_count) == 0); - LASSERT(list_empty(&sai->sai_agls)); - - ll_sai_free(sai); - atomic_dec(&sbi->ll_sa_running); - } -} - -/* Do NOT forget to drop inode refcount when into sai_agls. */ -static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai) -{ - struct ll_inode_info *lli = ll_i2info(inode); - __u64 index = lli->lli_agl_index; - int rc; - - LASSERT(list_empty(&lli->lli_agl_list)); - - /* AGL maybe fall behind statahead with one entry */ - if (is_omitted_entry(sai, index + 1)) { - lli->lli_agl_index = 0; - iput(inode); - return; - } - - /* Someone is in glimpse (sync or async), do nothing. */ - rc = down_write_trylock(&lli->lli_glimpse_sem); - if (rc == 0) { - lli->lli_agl_index = 0; - iput(inode); - return; - } - - /* - * Someone triggered glimpse within 1 sec before. 
- * 1) The former glimpse succeeded with glimpse lock granted by OST, and - * if the lock is still cached on client, AGL needs to do nothing. If - * it is cancelled by other client, AGL maybe cannot obtain new lock - * for no glimpse callback triggered by AGL. - * 2) The former glimpse succeeded, but OST did not grant glimpse lock. - * Under such case, it is quite possible that the OST will not grant - * glimpse lock for AGL also. - * 3) The former glimpse failed, compared with other two cases, it is - * relative rare. AGL can ignore such case, and it will not muchly - * affect the performance. - */ - if (lli->lli_glimpse_time != 0 && - time_before(cfs_time_shift(-1), lli->lli_glimpse_time)) { - up_write(&lli->lli_glimpse_sem); - lli->lli_agl_index = 0; - iput(inode); - return; - } - - CDEBUG(D_READA, "Handling (init) async glimpse: inode = " - DFID ", idx = %llu\n", PFID(&lli->lli_fid), index); - - cl_agl(inode); - lli->lli_agl_index = 0; - lli->lli_glimpse_time = cfs_time_current(); - up_write(&lli->lli_glimpse_sem); - - CDEBUG(D_READA, "Handled (init) async glimpse: inode= " - DFID ", idx = %llu, rc = %d\n", - PFID(&lli->lli_fid), index, rc); - - iput(inode); -} - -/* - * prepare inode for sa entry, add it into agl list, now sa_entry is ready - * to be used by scanner process. 
- */ -static void sa_instantiate(struct ll_statahead_info *sai, - struct sa_entry *entry) -{ - struct inode *dir = sai->sai_dentry->d_inode; - struct inode *child; - struct md_enqueue_info *minfo; - struct lookup_intent *it; - struct ptlrpc_request *req; - struct mdt_body *body; - int rc = 0; - - LASSERT(entry->se_handle != 0); - - minfo = entry->se_minfo; - it = &minfo->mi_it; - req = entry->se_req; - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - if (!body) { - rc = -EFAULT; - goto out; - } - - child = entry->se_inode; - if (child) { - /* revalidate; unlinked and re-created with the same name */ - if (unlikely(!lu_fid_eq(&minfo->mi_data.op_fid2, &body->mbo_fid1))) { - entry->se_inode = NULL; - iput(child); - child = NULL; - } - } - - it->it_lock_handle = entry->se_handle; - rc = md_revalidate_lock(ll_i2mdexp(dir), it, ll_inode2fid(dir), NULL); - if (rc != 1) { - rc = -EAGAIN; - goto out; - } - - rc = ll_prep_inode(&child, req, dir->i_sb, it); - if (rc) - goto out; - - CDEBUG(D_READA, "%s: setting %.*s" DFID " l_data to inode %p\n", - ll_get_fsname(child->i_sb, NULL, 0), - entry->se_qstr.len, entry->se_qstr.name, - PFID(ll_inode2fid(child)), child); - ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL); - - entry->se_inode = child; - - if (agl_should_run(sai, child)) - ll_agl_add(sai, child, entry->se_index); - -out: - /* - * sa_make_ready() will drop ldlm ibits lock refcount by calling - * ll_intent_drop_lock() in spite of failures. Do not worry about - * calling ll_intent_drop_lock() more than once. 
- */ - sa_make_ready(sai, entry, rc); -} - -/* once there are async stat replies, instantiate sa_entry from replies */ -static void sa_handle_callback(struct ll_statahead_info *sai) -{ - struct ll_inode_info *lli; - - lli = ll_i2info(sai->sai_dentry->d_inode); - - while (sa_has_callback(sai)) { - struct sa_entry *entry; - - spin_lock(&lli->lli_sa_lock); - if (unlikely(!sa_has_callback(sai))) { - spin_unlock(&lli->lli_sa_lock); - break; - } - entry = list_entry(sai->sai_interim_entries.next, - struct sa_entry, se_list); - list_del_init(&entry->se_list); - spin_unlock(&lli->lli_sa_lock); - - sa_instantiate(sai, entry); - } -} - -/* - * callback for async stat, because this is called in ptlrpcd context, we only - * put sa_entry in sai_cb_entries list, and let sa_handle_callback() to really - * prepare inode and instantiate sa_entry later. - */ -static int ll_statahead_interpret(struct ptlrpc_request *req, - struct md_enqueue_info *minfo, int rc) -{ - struct lookup_intent *it = &minfo->mi_it; - struct inode *dir = minfo->mi_dir; - struct ll_inode_info *lli = ll_i2info(dir); - struct ll_statahead_info *sai = lli->lli_sai; - struct sa_entry *entry = (struct sa_entry *)minfo->mi_cbdata; - __u64 handle = 0; - - if (it_disposition(it, DISP_LOOKUP_NEG)) - rc = -ENOENT; - - /* - * because statahead thread will wait for all inflight RPC to finish, - * sai should be always valid, no need to refcount - */ - LASSERT(sai); - LASSERT(entry); - - CDEBUG(D_READA, "sa_entry %.*s rc %d\n", - entry->se_qstr.len, entry->se_qstr.name, rc); - - if (rc) { - ll_intent_release(it); - iput(dir); - kfree(minfo); - } else { - /* - * release ibits lock ASAP to avoid deadlock when statahead - * thread enqueues lock on parent in readdir and another - * process enqueues lock on child with parent lock held, eg. - * unlink. 
- */ - handle = it->it_lock_handle; - ll_intent_drop_lock(it); - } - - spin_lock(&lli->lli_sa_lock); - if (rc) { - if (__sa_make_ready(sai, entry, rc)) - wake_up(&sai->sai_waitq); - } else { - int first = 0; - entry->se_minfo = minfo; - entry->se_req = ptlrpc_request_addref(req); - /* - * Release the async ibits lock ASAP to avoid deadlock - * when statahead thread tries to enqueue lock on parent - * for readpage and other tries to enqueue lock on child - * with parent's lock held, for example: unlink. - */ - entry->se_handle = handle; - if (!sa_has_callback(sai)) - first = 1; - - list_add_tail(&entry->se_list, &sai->sai_interim_entries); - - if (first && sai->sai_task) - wake_up_process(sai->sai_task); - } - sai->sai_replied++; - - spin_unlock(&lli->lli_sa_lock); - - return rc; -} - -/* finish async stat RPC arguments */ -static void sa_fini_data(struct md_enqueue_info *minfo) -{ - iput(minfo->mi_dir); - kfree(minfo); -} - -/** - * prepare arguments for async stat RPC. - */ -static struct md_enqueue_info * -sa_prep_data(struct inode *dir, struct inode *child, struct sa_entry *entry) -{ - struct md_enqueue_info *minfo; - struct ldlm_enqueue_info *einfo; - struct md_op_data *op_data; - - minfo = kzalloc(sizeof(*minfo), GFP_NOFS); - if (!minfo) - return ERR_PTR(-ENOMEM); - - op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) { - kfree(minfo); - return (struct md_enqueue_info *)op_data; - } - - if (!child) - op_data->op_fid2 = entry->se_fid; - - minfo->mi_it.it_op = IT_GETATTR; - minfo->mi_dir = igrab(dir); - minfo->mi_cb = ll_statahead_interpret; - minfo->mi_cbdata = entry; - - einfo = &minfo->mi_einfo; - einfo->ei_type = LDLM_IBITS; - einfo->ei_mode = it_to_lock_mode(&minfo->mi_it); - einfo->ei_cb_bl = ll_md_blocking_ast; - einfo->ei_cb_cp = ldlm_completion_ast; - einfo->ei_cb_gl = NULL; - einfo->ei_cbdata = NULL; - - return minfo; -} - -/* async stat for file not found in dcache */ -static int 
sa_lookup(struct inode *dir, struct sa_entry *entry) -{ - struct md_enqueue_info *minfo; - int rc; - - minfo = sa_prep_data(dir, NULL, entry); - if (IS_ERR(minfo)) - return PTR_ERR(minfo); - - rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo); - if (rc) - sa_fini_data(minfo); - - return rc; -} - -/** - * async stat for file found in dcache, similar to .revalidate - * - * \retval 1 dentry valid, no RPC sent - * \retval 0 dentry invalid, will send async stat RPC - * \retval negative number upon error - */ -static int sa_revalidate(struct inode *dir, struct sa_entry *entry, - struct dentry *dentry) -{ - struct inode *inode = d_inode(dentry); - struct lookup_intent it = { .it_op = IT_GETATTR, - .it_lock_handle = 0 }; - struct md_enqueue_info *minfo; - int rc; - - if (unlikely(!inode)) - return 1; - - if (d_mountpoint(dentry)) - return 1; - - entry->se_inode = igrab(inode); - rc = md_revalidate_lock(ll_i2mdexp(dir), &it, ll_inode2fid(inode), - NULL); - if (rc == 1) { - entry->se_handle = it.it_lock_handle; - ll_intent_release(&it); - return 1; - } - - minfo = sa_prep_data(dir, inode, entry); - if (IS_ERR(minfo)) { - entry->se_inode = NULL; - iput(inode); - return PTR_ERR(minfo); - } - - rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo); - if (rc) { - entry->se_inode = NULL; - iput(inode); - sa_fini_data(minfo); - } - - return rc; -} - -/* async stat for file with @name */ -static void sa_statahead(struct dentry *parent, const char *name, int len, - const struct lu_fid *fid) -{ - struct inode *dir = d_inode(parent); - struct ll_inode_info *lli = ll_i2info(dir); - struct ll_statahead_info *sai = lli->lli_sai; - struct dentry *dentry = NULL; - struct sa_entry *entry; - int rc; - - entry = sa_alloc(parent, sai, sai->sai_index, name, len, fid); - if (IS_ERR(entry)) - return; - - dentry = d_lookup(parent, &entry->se_qstr); - if (!dentry) { - rc = sa_lookup(dir, entry); - } else { - rc = sa_revalidate(dir, entry, dentry); - if (rc == 1 && agl_should_run(sai, 
d_inode(dentry))) - ll_agl_add(sai, d_inode(dentry), entry->se_index); - } - - if (dentry) - dput(dentry); - - if (rc) - sa_make_ready(sai, entry, rc); - else - sai->sai_sent++; - - sai->sai_index++; -} - -/* async glimpse (agl) thread main function */ -static int ll_agl_thread(void *arg) -{ - struct dentry *parent = arg; - struct inode *dir = d_inode(parent); - struct ll_inode_info *plli = ll_i2info(dir); - struct ll_inode_info *clli; - /* We already own this reference, so it is safe to take it without a lock. */ - struct ll_statahead_info *sai = plli->lli_sai; - - CDEBUG(D_READA, "agl thread started: sai %p, parent %pd\n", - sai, parent); - - while (!kthread_should_stop()) { - - spin_lock(&plli->lli_agl_lock); - /* The statahead thread maybe help to process AGL entries, - * so check whether list empty again. - */ - if (!list_empty(&sai->sai_agls)) { - clli = list_entry(sai->sai_agls.next, - struct ll_inode_info, lli_agl_list); - list_del_init(&clli->lli_agl_list); - spin_unlock(&plli->lli_agl_lock); - ll_agl_trigger(&clli->lli_vfs_inode, sai); - } else { - spin_unlock(&plli->lli_agl_lock); - } - - set_current_state(TASK_IDLE); - if (list_empty(&sai->sai_agls) && - !kthread_should_stop()) - schedule(); - __set_current_state(TASK_RUNNING); - } - - spin_lock(&plli->lli_agl_lock); - sai->sai_agl_valid = 0; - while (!list_empty(&sai->sai_agls)) { - clli = list_entry(sai->sai_agls.next, - struct ll_inode_info, lli_agl_list); - list_del_init(&clli->lli_agl_list); - spin_unlock(&plli->lli_agl_lock); - clli->lli_agl_index = 0; - iput(&clli->lli_vfs_inode); - spin_lock(&plli->lli_agl_lock); - } - spin_unlock(&plli->lli_agl_lock); - CDEBUG(D_READA, "agl thread stopped: sai %p, parent %pd\n", - sai, parent); - ll_sai_put(sai); - return 0; -} - -/* start agl thread */ -static void ll_start_agl(struct dentry *parent, struct ll_statahead_info *sai) -{ - struct ll_inode_info *plli; - struct task_struct *task; - - CDEBUG(D_READA, "start agl thread: sai %p, parent %pd\n", - sai, 
parent); - - plli = ll_i2info(d_inode(parent)); - task = kthread_create(ll_agl_thread, parent, "ll_agl_%u", - plli->lli_opendir_pid); - if (IS_ERR(task)) { - CERROR("can't start ll_agl thread, rc: %ld\n", PTR_ERR(task)); - return; - } - - sai->sai_agl_task = task; - atomic_inc(&ll_i2sbi(d_inode(parent))->ll_agl_total); - spin_lock(&plli->lli_agl_lock); - sai->sai_agl_valid = 1; - spin_unlock(&plli->lli_agl_lock); - /* Get an extra reference that the thread holds */ - ll_sai_get(d_inode(parent)); - - wake_up_process(task); -} - -/* statahead thread main function */ -static int ll_statahead_thread(void *arg) -{ - struct dentry *parent = arg; - struct inode *dir = d_inode(parent); - struct ll_inode_info *lli = ll_i2info(dir); - struct ll_sb_info *sbi = ll_i2sbi(dir); - struct ll_statahead_info *sai = lli->lli_sai; - struct page *page = NULL; - __u64 pos = 0; - int first = 0; - int rc = 0; - struct md_op_data *op_data; - - CDEBUG(D_READA, "statahead thread starting: sai %p, parent %pd\n", - sai, parent); - - op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0, - LUSTRE_OPC_ANY, dir); - if (IS_ERR(op_data)) { - rc = PTR_ERR(op_data); - goto out; - } - - op_data->op_max_pages = ll_i2sbi(dir)->ll_md_brw_pages; - - while (pos != MDS_DIR_END_OFF && sai->sai_task) { - struct lu_dirpage *dp; - struct lu_dirent *ent; - - sai->sai_in_readpage = 1; - page = ll_get_dir_page(dir, op_data, pos); - sai->sai_in_readpage = 0; - if (IS_ERR(page)) { - rc = PTR_ERR(page); - CDEBUG(D_READA, "error reading dir " DFID " at %llu/%llu: opendir_pid = %u: rc = %d\n", - PFID(ll_inode2fid(dir)), pos, sai->sai_index, - lli->lli_opendir_pid, rc); - break; - } - - dp = page_address(page); - for (ent = lu_dirent_start(dp); - ent && sai->sai_task && !sa_low_hit(sai); - ent = lu_dirent_next(ent)) { - struct lu_fid fid; - __u64 hash; - int namelen; - char *name; - - hash = le64_to_cpu(ent->lde_hash); - if (unlikely(hash < pos)) - /* - * Skip until we find target hash value. 
- */ - continue; - - namelen = le16_to_cpu(ent->lde_namelen); - if (unlikely(namelen == 0)) - /* - * Skip dummy record. - */ - continue; - - name = ent->lde_name; - if (name[0] == '.') { - if (namelen == 1) { - /* - * skip "." - */ - continue; - } else if (name[1] == '.' && namelen == 2) { - /* - * skip ".." - */ - continue; - } else if (!sai->sai_ls_all) { - /* - * skip hidden files. - */ - sai->sai_skip_hidden++; - continue; - } - } - - /* - * don't stat-ahead first entry. - */ - if (unlikely(++first == 1)) - continue; - - fid_le_to_cpu(&fid, &ent->lde_fid); - - do { - sa_handle_callback(sai); - - spin_lock(&lli->lli_agl_lock); - while (sa_sent_full(sai) && - !agl_list_empty(sai)) { - struct ll_inode_info *clli; - - clli = list_entry(sai->sai_agls.next, - struct ll_inode_info, - lli_agl_list); - list_del_init(&clli->lli_agl_list); - spin_unlock(&lli->lli_agl_lock); - - ll_agl_trigger(&clli->lli_vfs_inode, - sai); - - spin_lock(&lli->lli_agl_lock); - } - spin_unlock(&lli->lli_agl_lock); - - set_current_state(TASK_IDLE); - if (sa_sent_full(sai) && - !sa_has_callback(sai) && - agl_list_empty(sai) && - sai->sai_task) - /* wait for spare statahead window */ - schedule(); - __set_current_state(TASK_RUNNING); - } while (sa_sent_full(sai) && sai->sai_task); - - sa_statahead(parent, name, namelen, &fid); - } - - pos = le64_to_cpu(dp->ldp_hash_end); - ll_release_page(dir, page, - le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); - - if (sa_low_hit(sai)) { - rc = -EFAULT; - atomic_inc(&sbi->ll_sa_wrong); - CDEBUG(D_READA, "Statahead for dir " DFID " hit ratio too low: hit/miss %llu/%llu, sent/replied %llu/%llu, stopping statahead thread: pid %d\n", - PFID(&lli->lli_fid), sai->sai_hit, - sai->sai_miss, sai->sai_sent, - sai->sai_replied, current_pid()); - break; - } - } - ll_finish_md_op_data(op_data); - - if (rc < 0) { - spin_lock(&lli->lli_sa_lock); - sai->sai_task = NULL; - lli->lli_sa_enabled = 0; - spin_unlock(&lli->lli_sa_lock); - } - - /* - * statahead is finished, but 
statahead entries need to be cached, wait - * for file release to stop me. - */ - while (sai->sai_task) { - sa_handle_callback(sai); - - set_current_state(TASK_IDLE); - if (!sa_has_callback(sai) && - sai->sai_task) - schedule(); - __set_current_state(TASK_RUNNING); - } -out: - if (sai->sai_agl_task) { - kthread_stop(sai->sai_agl_task); - - CDEBUG(D_READA, "stop agl thread: sai %p pid %u\n", - sai, (unsigned int)sai->sai_agl_task->pid); - sai->sai_agl_task = NULL; - } - /* - * wait for inflight statahead RPCs to finish, and then we can free sai - * safely because statahead RPC will access sai data - */ - while (sai->sai_sent != sai->sai_replied) { - /* in case we're not woken up, timeout wait */ - schedule_timeout_idle(HZ>>3); - } - - /* release resources held by statahead RPCs */ - sa_handle_callback(sai); - - CDEBUG(D_READA, "statahead thread stopped: sai %p, parent %pd\n", - sai, parent); - - spin_lock(&lli->lli_sa_lock); - sai->sai_task = NULL; - spin_unlock(&lli->lli_sa_lock); - - wake_up(&sai->sai_waitq); - ll_sai_put(sai); - - do_exit(rc); -} - -/* authorize opened dir handle @key to statahead */ -void ll_authorize_statahead(struct inode *dir, void *key) -{ - struct ll_inode_info *lli = ll_i2info(dir); - - spin_lock(&lli->lli_sa_lock); - if (!lli->lli_opendir_key && !lli->lli_sai) { - /* - * if lli_sai is not NULL, it means previous statahead is not - * finished yet, we'd better not start a new statahead for now. - */ - LASSERT(!lli->lli_opendir_pid); - lli->lli_opendir_key = key; - lli->lli_opendir_pid = current_pid(); - lli->lli_sa_enabled = 1; - } - spin_unlock(&lli->lli_sa_lock); -} - -/* - * deauthorize opened dir handle @key to statahead, but statahead thread may - * still be running, notify it to quit. 
- */ -void ll_deauthorize_statahead(struct inode *dir, void *key) -{ - struct ll_inode_info *lli = ll_i2info(dir); - struct ll_statahead_info *sai; - - LASSERT(lli->lli_opendir_key == key); - LASSERT(lli->lli_opendir_pid); - - CDEBUG(D_READA, "deauthorize statahead for " DFID "\n", - PFID(&lli->lli_fid)); - - spin_lock(&lli->lli_sa_lock); - lli->lli_opendir_key = NULL; - lli->lli_opendir_pid = 0; - lli->lli_sa_enabled = 0; - sai = lli->lli_sai; - if (sai && sai->sai_task) { - /* - * statahead thread may not quit yet because it needs to cache - * entries, now it's time to tell it to quit. - */ - wake_up_process(sai->sai_task); - sai->sai_task = NULL; - } - spin_unlock(&lli->lli_sa_lock); -} - -enum { - /** - * not first dirent, or is "." - */ - LS_NOT_FIRST_DE = 0, - /** - * the first non-hidden dirent - */ - LS_FIRST_DE, - /** - * the first hidden dirent, that is "." - */ - LS_FIRST_DOT_DE -}; - -/* file is first dirent under @dir */ -static int is_first_dirent(struct inode *dir, struct dentry *dentry) -{ - const struct qstr *target = &dentry->d_name; - struct md_op_data *op_data; - struct page *page; - __u64 pos = 0; - int dot_de; - int rc = LS_NOT_FIRST_DE; - - op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0, - LUSTRE_OPC_ANY, dir); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - /** - * FIXME choose the start offset of the readdir - */ - op_data->op_max_pages = ll_i2sbi(dir)->ll_md_brw_pages; - - page = ll_get_dir_page(dir, op_data, pos); - - while (1) { - struct lu_dirpage *dp; - struct lu_dirent *ent; - - if (IS_ERR(page)) { - struct ll_inode_info *lli = ll_i2info(dir); - - rc = PTR_ERR(page); - CERROR("%s: error reading dir " DFID " at %llu: opendir_pid = %u : rc = %d\n", - ll_get_fsname(dir->i_sb, NULL, 0), - PFID(ll_inode2fid(dir)), pos, - lli->lli_opendir_pid, rc); - break; - } - - dp = page_address(page); - for (ent = lu_dirent_start(dp); ent; - ent = lu_dirent_next(ent)) { - __u64 hash; - int namelen; - char *name; - - hash = 
le64_to_cpu(ent->lde_hash); - /* The ll_get_dir_page() can return any page containing - * the given hash which may be not the start hash. - */ - if (unlikely(hash < pos)) - continue; - - namelen = le16_to_cpu(ent->lde_namelen); - if (unlikely(namelen == 0)) - /* - * skip dummy record. - */ - continue; - - name = ent->lde_name; - if (name[0] == '.') { - if (namelen == 1) - /* - * skip "." - */ - continue; - else if (name[1] == '.' && namelen == 2) - /* - * skip ".." - */ - continue; - else - dot_de = 1; - } else { - dot_de = 0; - } - - if (dot_de && target->name[0] != '.') { - CDEBUG(D_READA, "%.*s skip hidden file %.*s\n", - target->len, target->name, - namelen, name); - continue; - } - - if (target->len != namelen || - memcmp(target->name, name, namelen) != 0) - rc = LS_NOT_FIRST_DE; - else if (!dot_de) - rc = LS_FIRST_DE; - else - rc = LS_FIRST_DOT_DE; - - ll_release_page(dir, page, false); - goto out; - } - pos = le64_to_cpu(dp->ldp_hash_end); - if (pos == MDS_DIR_END_OFF) { - /* - * End of directory reached. - */ - ll_release_page(dir, page, false); - goto out; - } else { - /* - * chain is exhausted - * Normal case: continue to the next page. 
- */ - ll_release_page(dir, page, - le32_to_cpu(dp->ldp_flags) & - LDF_COLLIDE); - page = ll_get_dir_page(dir, op_data, pos); - } - } -out: - ll_finish_md_op_data(op_data); - return rc; -} - -/** - * revalidate @dentryp from statahead cache - * - * \param[in] dir parent directory - * \param[in] sai sai structure - * \param[out] dentryp pointer to dentry which will be revalidated - * \param[in] unplug unplug statahead window only (normally for negative - * dentry) - * \retval 1 on success, dentry is saved in @dentryp - * \retval 0 if revalidation failed (no proper lock on client) - * \retval negative number upon error - */ -static int revalidate_statahead_dentry(struct inode *dir, - struct ll_statahead_info *sai, - struct dentry **dentryp, - bool unplug) -{ - struct ll_inode_info *lli = ll_i2info(dir); - struct sa_entry *entry = NULL; - struct ll_dentry_data *ldd; - int rc = 0; - - if ((*dentryp)->d_name.name[0] == '.') { - if (sai->sai_ls_all || - sai->sai_miss_hidden >= sai->sai_skip_hidden) { - /* - * Hidden dentry is the first one, or statahead - * thread does not skip so many hidden dentries - * before "sai_ls_all" enabled as below. - */ - } else { - if (!sai->sai_ls_all) - /* - * It maybe because hidden dentry is not - * the first one, "sai_ls_all" was not - * set, then "ls -al" missed. Enable - * "sai_ls_all" for such case. - */ - sai->sai_ls_all = 1; - - /* - * Such "getattr" has been skipped before - * "sai_ls_all" enabled as above. 
- */ - sai->sai_miss_hidden++; - return -EAGAIN; - } - } - - if (unplug) { - rc = 1; - goto out_unplug; - } - - entry = sa_get(sai, &(*dentryp)->d_name); - if (!entry) { - rc = -EAGAIN; - goto out_unplug; - } - - /* if statahead is busy in readdir, help it do post-work */ - if (!sa_ready(entry) && sai->sai_in_readpage) - sa_handle_callback(sai); - - if (!sa_ready(entry)) { - spin_lock(&lli->lli_sa_lock); - sai->sai_index_wait = entry->se_index; - spin_unlock(&lli->lli_sa_lock); - if (0 == wait_event_idle_timeout(sai->sai_waitq, - sa_ready(entry), 30 * HZ)) { - /* - * entry may not be ready, so it may be used by inflight - * statahead RPC, don't free it. - */ - entry = NULL; - rc = -EAGAIN; - goto out_unplug; - } - } - - if (entry->se_state == SA_ENTRY_SUCC && entry->se_inode) { - struct inode *inode = entry->se_inode; - struct lookup_intent it = { .it_op = IT_GETATTR, - .it_lock_handle = entry->se_handle }; - __u64 bits; - - rc = md_revalidate_lock(ll_i2mdexp(dir), &it, - ll_inode2fid(inode), &bits); - if (rc == 1) { - if (!(*dentryp)->d_inode) { - struct dentry *alias; - - alias = ll_splice_alias(inode, *dentryp); - if (IS_ERR(alias)) { - ll_intent_release(&it); - rc = PTR_ERR(alias); - goto out_unplug; - } - *dentryp = alias; - /** - * statahead prepared this inode, transfer inode - * refcount from sa_entry to dentry - */ - entry->se_inode = NULL; - } else if ((*dentryp)->d_inode != inode) { - /* revalidate, but inode is recreated */ - CDEBUG(D_READA, - "%s: stale dentry %pd inode " DFID ", statahead inode " DFID "\n", - ll_get_fsname((*dentryp)->d_inode->i_sb, - NULL, 0), - *dentryp, - PFID(ll_inode2fid((*dentryp)->d_inode)), - PFID(ll_inode2fid(inode))); - ll_intent_release(&it); - rc = -ESTALE; - goto out_unplug; - } - - if ((bits & MDS_INODELOCK_LOOKUP) && - d_lustre_invalid(*dentryp)) - d_lustre_revalidate(*dentryp); - ll_intent_release(&it); - } - } -out_unplug: - /* - * statahead cached sa_entry can be used only once, and will be killed - * right after 
use, so if lookup/revalidate accessed statahead cache, - * set dentry ldd_sa_generation to parent lli_sa_generation, later if we - * stat this file again, we know we've done statahead before, see - * dentry_may_statahead(). - */ - ldd = ll_d2d(*dentryp); - ldd->lld_sa_generation = lli->lli_sa_generation; - sa_put(sai, entry, lli); - return rc; -} - -/** - * start statahead thread - * - * \param[in] dir parent directory - * \param[in] dentry dentry that triggers statahead, normally the first - * dirent under @dir - * \retval -EAGAIN on success, because when this function is - * called, it's already in lookup call, so client should - * do it itself instead of waiting for statahead thread - * to do it asynchronously. - * \retval negative number upon error - */ -static int start_statahead_thread(struct inode *dir, struct dentry *dentry) -{ - struct ll_inode_info *lli = ll_i2info(dir); - struct ll_statahead_info *sai = NULL; - struct task_struct *task; - struct dentry *parent = dentry->d_parent; - int rc; - - /* I am the "lli_opendir_pid" owner, only me can set "lli_sai". */ - rc = is_first_dirent(dir, dentry); - if (rc == LS_NOT_FIRST_DE) { - /* It is not "ls -{a}l" operation, no need statahead for it. */ - rc = -EFAULT; - goto out; - } - - sai = ll_sai_alloc(parent); - if (!sai) { - rc = -ENOMEM; - goto out; - } - - sai->sai_ls_all = (rc == LS_FIRST_DOT_DE); - /* - * if current lli_opendir_key was deauthorized, or dir re-opened by - * another process, don't start statahead, otherwise the newly spawned - * statahead thread won't be notified to quit. 
- */ - spin_lock(&lli->lli_sa_lock); - if (unlikely(lli->lli_sai || lli->lli_opendir_key || - lli->lli_opendir_pid != current->pid)) { - spin_unlock(&lli->lli_sa_lock); - rc = -EPERM; - goto out; - } - lli->lli_sai = sai; - spin_unlock(&lli->lli_sa_lock); - - atomic_inc(&ll_i2sbi(parent->d_inode)->ll_sa_running); - - CDEBUG(D_READA, "start statahead thread: [pid %d] [parent %pd]\n", - current_pid(), parent); - - task = kthread_create(ll_statahead_thread, parent, "ll_sa_%u", - lli->lli_opendir_pid); - if (IS_ERR(task)) { - rc = PTR_ERR(task); - CERROR("can't start ll_sa thread, rc : %d\n", rc); - goto out; - } - - if (ll_i2sbi(parent->d_inode)->ll_flags & LL_SBI_AGL_ENABLED) - ll_start_agl(parent, sai); - - atomic_inc(&ll_i2sbi(parent->d_inode)->ll_sa_total); - sai->sai_task = task; - - wake_up_process(task); - - /* - * We don't stat-ahead for the first dirent since we are already in - * lookup. - */ - return -EAGAIN; - -out: - /* - * once we start statahead thread failed, disable statahead so - * that subsequent stat won't waste time to try it. - */ - spin_lock(&lli->lli_sa_lock); - lli->lli_sa_enabled = 0; - lli->lli_sai = NULL; - spin_unlock(&lli->lli_sa_lock); - if (sai) - ll_sai_free(sai); - return rc; -} - -/** - * statahead entry function, this is called when client getattr on a file, it - * will start statahead thread if this is the first dir entry, else revalidate - * dentry from statahead cache. 
- * - * \param[in] dir parent directory - * \param[out] dentryp dentry to getattr - * \param[in] unplug unplug statahead window only (normally for negative - * dentry) - * \retval 1 on success - * \retval 0 revalidation from statahead cache failed, caller needs - * to getattr from server directly - * \retval negative number on error, caller often ignores this and - * then getattr from server - */ -int ll_statahead(struct inode *dir, struct dentry **dentryp, bool unplug) -{ - struct ll_statahead_info *sai; - - sai = ll_sai_get(dir); - if (sai) { - int rc; - - rc = revalidate_statahead_dentry(dir, sai, dentryp, unplug); - CDEBUG(D_READA, "revalidate statahead %pd: %d.\n", - *dentryp, rc); - ll_sai_put(sai); - return rc; - } - return start_statahead_thread(dir, *dentryp); -} diff --git a/drivers/staging/lustre/lustre/llite/super25.c b/drivers/staging/lustre/lustre/llite/super25.c deleted file mode 100644 index 861e7a60f408..000000000000 --- a/drivers/staging/lustre/lustre/llite/super25.c +++ /dev/null @@ -1,185 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. 
- * Use is subject to license terms. - * - * Copyright (c) 2011, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#define DEBUG_SUBSYSTEM S_LLITE - -#include <linux/module.h> -#include <linux/types.h> -#include <lustre_ha.h> -#include <lustre_dlm.h> -#include <linux/init.h> -#include <linux/fs.h> -#include <lprocfs_status.h> -#include "llite_internal.h" - -static struct kmem_cache *ll_inode_cachep; - -static struct inode *ll_alloc_inode(struct super_block *sb) -{ - struct ll_inode_info *lli; - - ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_ALLOC_INODE, 1); - lli = kmem_cache_zalloc(ll_inode_cachep, GFP_NOFS); - if (!lli) - return NULL; - - inode_init_once(&lli->lli_vfs_inode); - return &lli->lli_vfs_inode; -} - -static void ll_inode_destroy_callback(struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - struct ll_inode_info *ptr = ll_i2info(inode); - - kmem_cache_free(ll_inode_cachep, ptr); -} - -static void ll_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, ll_inode_destroy_callback); -} - -/* exported operations */ -struct super_operations lustre_super_operations = { - .alloc_inode = ll_alloc_inode, - .destroy_inode = ll_destroy_inode, - .evict_inode = ll_delete_inode, - .put_super = ll_put_super, - .statfs = ll_statfs, - .umount_begin = ll_umount_begin, - .remount_fs = ll_remount_fs, - .show_options = ll_show_options, -}; -MODULE_ALIAS_FS("lustre"); - -static int __init lustre_init(void) -{ - int rc; - - BUILD_BUG_ON(sizeof(LUSTRE_VOLATILE_HDR) != - LUSTRE_VOLATILE_HDR_LEN + 1); - - /* print an address of _any_ initialized kernel symbol from this - * module, to allow debugging with gdb that doesn't support data - * symbols from modules. 
- */ - CDEBUG(D_INFO, "Lustre client module (%p).\n", - &lustre_super_operations); - - rc = -ENOMEM; - ll_inode_cachep = kmem_cache_create("lustre_inode_cache", - sizeof(struct ll_inode_info), 0, - SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, - NULL); - if (!ll_inode_cachep) - goto out_cache; - - ll_file_data_slab = kmem_cache_create("ll_file_data", - sizeof(struct ll_file_data), 0, - SLAB_HWCACHE_ALIGN, NULL); - if (!ll_file_data_slab) - goto out_cache; - - llite_root = debugfs_create_dir("llite", debugfs_lustre_root); - if (IS_ERR_OR_NULL(llite_root)) { - rc = llite_root ? PTR_ERR(llite_root) : -ENOMEM; - llite_root = NULL; - goto out_cache; - } - - llite_kset = kset_create_and_add("llite", NULL, lustre_kobj); - if (!llite_kset) { - rc = -ENOMEM; - goto out_debugfs; - } - - rc = vvp_global_init(); - if (rc != 0) - goto out_sysfs; - - cl_inode_fini_env = cl_env_alloc(&cl_inode_fini_refcheck, - LCT_REMEMBER | LCT_NOREF); - if (IS_ERR(cl_inode_fini_env)) { - rc = PTR_ERR(cl_inode_fini_env); - goto out_vvp; - } - - cl_inode_fini_env->le_ctx.lc_cookie = 0x4; - - rc = ll_xattr_init(); - if (rc != 0) - goto out_inode_fini_env; - - lustre_register_super_ops(THIS_MODULE, ll_fill_super, ll_kill_super); - lustre_register_client_process_config(ll_process_config); - - return 0; - -out_inode_fini_env: - cl_env_put(cl_inode_fini_env, &cl_inode_fini_refcheck); -out_vvp: - vvp_global_fini(); -out_sysfs: - kset_unregister(llite_kset); -out_debugfs: - debugfs_remove(llite_root); -out_cache: - kmem_cache_destroy(ll_inode_cachep); - kmem_cache_destroy(ll_file_data_slab); - return rc; -} - -static void __exit lustre_exit(void) -{ - lustre_register_super_ops(NULL, NULL, NULL); - lustre_register_client_process_config(NULL); - - debugfs_remove(llite_root); - kset_unregister(llite_kset); - - ll_xattr_fini(); - cl_env_put(cl_inode_fini_env, &cl_inode_fini_refcheck); - vvp_global_fini(); - - kmem_cache_destroy(ll_inode_cachep); - kmem_cache_destroy(ll_file_data_slab); -} - -MODULE_AUTHOR("OpenSFS, 
Inc. <http://www.lustre.org/>"); -MODULE_DESCRIPTION("Lustre Client File System"); -MODULE_VERSION(LUSTRE_VERSION_STRING); -MODULE_LICENSE("GPL"); - -module_init(lustre_init); -module_exit(lustre_exit); diff --git a/drivers/staging/lustre/lustre/llite/symlink.c b/drivers/staging/lustre/lustre/llite/symlink.c deleted file mode 100644 index 0690fdbf49f5..000000000000 --- a/drivers/staging/lustre/lustre/llite/symlink.c +++ /dev/null @@ -1,159 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- */ - -#include <linux/fs.h> -#include <linux/mm.h> -#include <linux/stat.h> -#define DEBUG_SUBSYSTEM S_LLITE - -#include "llite_internal.h" - -static int ll_readlink_internal(struct inode *inode, - struct ptlrpc_request **request, char **symname) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct ll_sb_info *sbi = ll_i2sbi(inode); - int rc, symlen = i_size_read(inode) + 1; - struct mdt_body *body; - struct md_op_data *op_data; - - *request = NULL; - - if (lli->lli_symlink_name) { - int print_limit = min_t(int, PAGE_SIZE - 128, symlen); - - *symname = lli->lli_symlink_name; - /* If the total CDEBUG() size is larger than a page, it - * will print a warning to the console, avoid this by - * printing just the last part of the symlink. - */ - CDEBUG(D_INODE, "using cached symlink %s%.*s, len = %d\n", - print_limit < symlen ? "..." : "", print_limit, - (*symname) + symlen - print_limit, symlen); - return 0; - } - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, symlen, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - op_data->op_valid = OBD_MD_LINKNAME; - rc = md_getattr(sbi->ll_md_exp, op_data, request); - ll_finish_md_op_data(op_data); - if (rc) { - if (rc != -ENOENT) - CERROR("%s: inode " DFID ": rc = %d\n", - ll_get_fsname(inode->i_sb, NULL, 0), - PFID(ll_inode2fid(inode)), rc); - goto failed; - } - - body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); - if ((body->mbo_valid & OBD_MD_LINKNAME) == 0) { - CERROR("OBD_MD_LINKNAME not set on reply\n"); - rc = -EPROTO; - goto failed; - } - - LASSERT(symlen != 0); - if (body->mbo_eadatasize != symlen) { - CERROR("%s: inode " DFID ": symlink length %d not expected %d\n", - ll_get_fsname(inode->i_sb, NULL, 0), - PFID(ll_inode2fid(inode)), body->mbo_eadatasize - 1, - symlen - 1); - rc = -EPROTO; - goto failed; - } - - *symname = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_MD); - if (!*symname || - strnlen(*symname, symlen) != symlen - 1) { - /* not 
full/NULL terminated */ - CERROR("inode %lu: symlink not NULL terminated string of length %d\n", - inode->i_ino, symlen - 1); - rc = -EPROTO; - goto failed; - } - - lli->lli_symlink_name = kzalloc(symlen, GFP_NOFS); - /* do not return an error if we cannot cache the symlink locally */ - if (lli->lli_symlink_name) { - memcpy(lli->lli_symlink_name, *symname, symlen); - *symname = lli->lli_symlink_name; - } - return 0; - -failed: - return rc; -} - -static void ll_put_link(void *p) -{ - ptlrpc_req_finished(p); -} - -static const char *ll_get_link(struct dentry *dentry, - struct inode *inode, - struct delayed_call *done) -{ - struct ptlrpc_request *request = NULL; - int rc; - char *symname = NULL; - - if (!dentry) - return ERR_PTR(-ECHILD); - - CDEBUG(D_VFSTRACE, "VFS Op\n"); - ll_inode_size_lock(inode); - rc = ll_readlink_internal(inode, &request, &symname); - ll_inode_size_unlock(inode); - if (rc) { - ptlrpc_req_finished(request); - return ERR_PTR(rc); - } - - /* symname may contain a pointer to the request message buffer, - * we delay request releasing then. - */ - set_delayed_call(done, ll_put_link, request); - return symname; -} - -const struct inode_operations ll_fast_symlink_inode_operations = { - .setattr = ll_setattr, - .get_link = ll_get_link, - .getattr = ll_getattr, - .permission = ll_inode_permission, - .listxattr = ll_listxattr, -}; diff --git a/drivers/staging/lustre/lustre/llite/vvp_dev.c b/drivers/staging/lustre/lustre/llite/vvp_dev.c deleted file mode 100644 index 987c03b058e6..000000000000 --- a/drivers/staging/lustre/lustre/llite/vvp_dev.c +++ /dev/null @@ -1,659 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * cl_device and cl_device_type implementation for VVP layer. - * - * Author: Nikita Danilov <nikita.danilov@sun.com> - * Author: Jinshan Xiong <jinshan.xiong@intel.com> - */ - -#define DEBUG_SUBSYSTEM S_LLITE - -#include <obd.h> -#include "llite_internal.h" -#include "vvp_internal.h" - -/***************************************************************************** - * - * Vvp device and device type functions. - * - */ - -/* - * vvp_ prefix stands for "Vfs Vm Posix". It corresponds to historical - * "llite_" (var. "ll_") prefix. 
- */ - -static struct kmem_cache *ll_thread_kmem; -struct kmem_cache *vvp_lock_kmem; -struct kmem_cache *vvp_object_kmem; -static struct kmem_cache *vvp_session_kmem; -static struct kmem_cache *vvp_thread_kmem; - -static struct lu_kmem_descr vvp_caches[] = { - { - .ckd_cache = &ll_thread_kmem, - .ckd_name = "ll_thread_kmem", - .ckd_size = sizeof(struct ll_thread_info), - }, - { - .ckd_cache = &vvp_lock_kmem, - .ckd_name = "vvp_lock_kmem", - .ckd_size = sizeof(struct vvp_lock), - }, - { - .ckd_cache = &vvp_object_kmem, - .ckd_name = "vvp_object_kmem", - .ckd_size = sizeof(struct vvp_object), - }, - { - .ckd_cache = &vvp_session_kmem, - .ckd_name = "vvp_session_kmem", - .ckd_size = sizeof(struct vvp_session) - }, - { - .ckd_cache = &vvp_thread_kmem, - .ckd_name = "vvp_thread_kmem", - .ckd_size = sizeof(struct vvp_thread_info), - }, - { - .ckd_cache = NULL - } -}; - -static void *ll_thread_key_init(const struct lu_context *ctx, - struct lu_context_key *key) -{ - struct vvp_thread_info *info; - - info = kmem_cache_zalloc(ll_thread_kmem, GFP_NOFS); - if (!info) - info = ERR_PTR(-ENOMEM); - return info; -} - -static void ll_thread_key_fini(const struct lu_context *ctx, - struct lu_context_key *key, void *data) -{ - struct vvp_thread_info *info = data; - - kmem_cache_free(ll_thread_kmem, info); -} - -struct lu_context_key ll_thread_key = { - .lct_tags = LCT_CL_THREAD, - .lct_init = ll_thread_key_init, - .lct_fini = ll_thread_key_fini -}; - -static void *vvp_session_key_init(const struct lu_context *ctx, - struct lu_context_key *key) -{ - struct vvp_session *session; - - session = kmem_cache_zalloc(vvp_session_kmem, GFP_NOFS); - if (!session) - session = ERR_PTR(-ENOMEM); - return session; -} - -static void vvp_session_key_fini(const struct lu_context *ctx, - struct lu_context_key *key, void *data) -{ - struct vvp_session *session = data; - - kmem_cache_free(vvp_session_kmem, session); -} - -struct lu_context_key vvp_session_key = { - .lct_tags = LCT_SESSION, - .lct_init = 
vvp_session_key_init, - .lct_fini = vvp_session_key_fini -}; - -static void *vvp_thread_key_init(const struct lu_context *ctx, - struct lu_context_key *key) -{ - struct vvp_thread_info *vti; - - vti = kmem_cache_zalloc(vvp_thread_kmem, GFP_NOFS); - if (!vti) - vti = ERR_PTR(-ENOMEM); - return vti; -} - -static void vvp_thread_key_fini(const struct lu_context *ctx, - struct lu_context_key *key, void *data) -{ - struct vvp_thread_info *vti = data; - - kmem_cache_free(vvp_thread_kmem, vti); -} - -struct lu_context_key vvp_thread_key = { - .lct_tags = LCT_CL_THREAD, - .lct_init = vvp_thread_key_init, - .lct_fini = vvp_thread_key_fini -}; - -/* type constructor/destructor: vvp_type_{init,fini,start,stop}(). */ -LU_TYPE_INIT_FINI(vvp, &vvp_thread_key, &ll_thread_key, &vvp_session_key); - -static const struct lu_device_operations vvp_lu_ops = { - .ldo_object_alloc = vvp_object_alloc -}; - -static struct lu_device *vvp_device_free(const struct lu_env *env, - struct lu_device *d) -{ - struct vvp_device *vdv = lu2vvp_dev(d); - struct cl_site *site = lu2cl_site(d->ld_site); - struct lu_device *next = cl2lu_dev(vdv->vdv_next); - - if (d->ld_site) { - cl_site_fini(site); - kfree(site); - } - cl_device_fini(lu2cl_dev(d)); - kfree(vdv); - return next; -} - -static struct lu_device *vvp_device_alloc(const struct lu_env *env, - struct lu_device_type *t, - struct lustre_cfg *cfg) -{ - struct vvp_device *vdv; - struct lu_device *lud; - struct cl_site *site; - int rc; - - vdv = kzalloc(sizeof(*vdv), GFP_NOFS); - if (!vdv) - return ERR_PTR(-ENOMEM); - - lud = &vdv->vdv_cl.cd_lu_dev; - cl_device_init(&vdv->vdv_cl, t); - vvp2lu_dev(vdv)->ld_ops = &vvp_lu_ops; - - site = kzalloc(sizeof(*site), GFP_NOFS); - if (site) { - rc = cl_site_init(site, &vdv->vdv_cl); - if (rc == 0) { - rc = lu_site_init_finish(&site->cs_lu); - } else { - LASSERT(!lud->ld_site); - CERROR("Cannot init lu_site, rc %d.\n", rc); - kfree(site); - } - } else { - rc = -ENOMEM; - } - if (rc != 0) { - vvp_device_free(env, 
lud); - lud = ERR_PTR(rc); - } - return lud; -} - -static int vvp_device_init(const struct lu_env *env, struct lu_device *d, - const char *name, struct lu_device *next) -{ - struct vvp_device *vdv; - int rc; - - vdv = lu2vvp_dev(d); - vdv->vdv_next = lu2cl_dev(next); - - LASSERT(d->ld_site && next->ld_type); - next->ld_site = d->ld_site; - rc = next->ld_type->ldt_ops->ldto_device_init(env, next, - next->ld_type->ldt_name, - NULL); - if (rc == 0) { - lu_device_get(next); - lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init); - } - return rc; -} - -static struct lu_device *vvp_device_fini(const struct lu_env *env, - struct lu_device *d) -{ - return cl2lu_dev(lu2vvp_dev(d)->vdv_next); -} - -static const struct lu_device_type_operations vvp_device_type_ops = { - .ldto_init = vvp_type_init, - .ldto_fini = vvp_type_fini, - - .ldto_start = vvp_type_start, - .ldto_stop = vvp_type_stop, - - .ldto_device_alloc = vvp_device_alloc, - .ldto_device_free = vvp_device_free, - .ldto_device_init = vvp_device_init, - .ldto_device_fini = vvp_device_fini, -}; - -struct lu_device_type vvp_device_type = { - .ldt_tags = LU_DEVICE_CL, - .ldt_name = LUSTRE_VVP_NAME, - .ldt_ops = &vvp_device_type_ops, - .ldt_ctx_tags = LCT_CL_THREAD -}; - -/** - * A mutex serializing calls to vvp_inode_fini() under extreme memory - * pressure, when environments cannot be allocated. - */ -int vvp_global_init(void) -{ - int rc; - - rc = lu_kmem_init(vvp_caches); - if (rc != 0) - return rc; - - rc = lu_device_type_init(&vvp_device_type); - if (rc != 0) - goto out_kmem; - - return 0; - -out_kmem: - lu_kmem_fini(vvp_caches); - - return rc; -} - -void vvp_global_fini(void) -{ - lu_device_type_fini(&vvp_device_type); - lu_kmem_fini(vvp_caches); -} - -/***************************************************************************** - * - * mirror obd-devices into cl devices. 
- * - */ - -int cl_sb_init(struct super_block *sb) -{ - struct ll_sb_info *sbi; - struct cl_device *cl; - struct lu_env *env; - int rc = 0; - u16 refcheck; - - sbi = ll_s2sbi(sb); - env = cl_env_get(&refcheck); - if (!IS_ERR(env)) { - cl = cl_type_setup(env, NULL, &vvp_device_type, - sbi->ll_dt_exp->exp_obd->obd_lu_dev); - if (!IS_ERR(cl)) { - sbi->ll_cl = cl; - sbi->ll_site = cl2lu_dev(cl)->ld_site; - } - cl_env_put(env, &refcheck); - } else { - rc = PTR_ERR(env); - } - return rc; -} - -int cl_sb_fini(struct super_block *sb) -{ - struct ll_sb_info *sbi; - struct lu_env *env; - struct cl_device *cld; - u16 refcheck; - int result; - - sbi = ll_s2sbi(sb); - env = cl_env_get(&refcheck); - if (!IS_ERR(env)) { - cld = sbi->ll_cl; - - if (cld) { - cl_stack_fini(env, cld); - sbi->ll_cl = NULL; - sbi->ll_site = NULL; - } - cl_env_put(env, &refcheck); - result = 0; - } else { - CERROR("Cannot cleanup cl-stack due to memory shortage.\n"); - result = PTR_ERR(env); - } - return result; -} - -/**************************************************************************** - * - * debugfs/lustre/llite/$MNT/dump_page_cache - * - ****************************************************************************/ - -/* - * To represent contents of a page cache as a byte stream, following - * information if encoded in 64bit offset: - * - * - file hash bucket in lu_site::ls_hash[] 28bits - * - * - how far file is from bucket head 4bits - * - * - page index 32bits - * - * First two data identify a file in the cache uniquely. 
- */ - -#define PGC_OBJ_SHIFT (32 + 4) -#define PGC_DEPTH_SHIFT (32) - -struct vvp_pgcache_id { - unsigned int vpi_bucket; - unsigned int vpi_depth; - u32 vpi_index; - - unsigned int vpi_curdep; - struct lu_object_header *vpi_obj; -}; - -static void vvp_pgcache_id_unpack(loff_t pos, struct vvp_pgcache_id *id) -{ - BUILD_BUG_ON(sizeof(pos) != sizeof(__u64)); - - id->vpi_index = pos & 0xffffffff; - id->vpi_depth = (pos >> PGC_DEPTH_SHIFT) & 0xf; - id->vpi_bucket = (unsigned long long)pos >> PGC_OBJ_SHIFT; -} - -static loff_t vvp_pgcache_id_pack(struct vvp_pgcache_id *id) -{ - return - ((__u64)id->vpi_index) | - ((__u64)id->vpi_depth << PGC_DEPTH_SHIFT) | - ((__u64)id->vpi_bucket << PGC_OBJ_SHIFT); -} - -static int vvp_pgcache_obj_get(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnode, void *data) -{ - struct vvp_pgcache_id *id = data; - struct lu_object_header *hdr = cfs_hash_object(hs, hnode); - - if (id->vpi_curdep-- > 0) - return 0; /* continue */ - - if (lu_object_is_dying(hdr)) - return 1; - - cfs_hash_get(hs, hnode); - id->vpi_obj = hdr; - return 1; -} - -static struct cl_object *vvp_pgcache_obj(const struct lu_env *env, - struct lu_device *dev, - struct vvp_pgcache_id *id) -{ - LASSERT(lu_device_is_cl(dev)); - - id->vpi_depth &= 0xf; - id->vpi_obj = NULL; - id->vpi_curdep = id->vpi_depth; - - cfs_hash_hlist_for_each(dev->ld_site->ls_obj_hash, id->vpi_bucket, - vvp_pgcache_obj_get, id); - if (id->vpi_obj) { - struct lu_object *lu_obj; - - lu_obj = lu_object_locate(id->vpi_obj, dev->ld_type); - if (lu_obj) { - lu_object_ref_add(lu_obj, "dump", current); - return lu2cl(lu_obj); - } - lu_object_put(env, lu_object_top(id->vpi_obj)); - - } else if (id->vpi_curdep > 0) { - id->vpi_depth = 0xf; - } - return NULL; -} - -static loff_t vvp_pgcache_find(const struct lu_env *env, - struct lu_device *dev, loff_t pos) -{ - struct cl_object *clob; - struct lu_site *site; - struct vvp_pgcache_id id; - - site = dev->ld_site; - vvp_pgcache_id_unpack(pos, 
&id); - - while (1) { - if (id.vpi_bucket >= CFS_HASH_NHLIST(site->ls_obj_hash)) - return ~0ULL; - clob = vvp_pgcache_obj(env, dev, &id); - if (clob) { - struct inode *inode = vvp_object_inode(clob); - struct page *vmpage; - int nr; - - nr = find_get_pages_contig(inode->i_mapping, - id.vpi_index, 1, &vmpage); - if (nr > 0) { - id.vpi_index = vmpage->index; - /* Cant support over 16T file */ - nr = !(vmpage->index > 0xffffffff); - put_page(vmpage); - } - - lu_object_ref_del(&clob->co_lu, "dump", current); - cl_object_put(env, clob); - if (nr > 0) - return vvp_pgcache_id_pack(&id); - } - /* to the next object. */ - ++id.vpi_depth; - id.vpi_depth &= 0xf; - if (id.vpi_depth == 0 && ++id.vpi_bucket == 0) - return ~0ULL; - id.vpi_index = 0; - } -} - -#define seq_page_flag(seq, page, flag, has_flags) do { \ - if (test_bit(PG_##flag, &(page)->flags)) { \ - seq_printf(seq, "%s"#flag, has_flags ? "|" : ""); \ - has_flags = 1; \ - } \ -} while (0) - -static void vvp_pgcache_page_show(const struct lu_env *env, - struct seq_file *seq, struct cl_page *page) -{ - struct vvp_page *vpg; - struct page *vmpage; - int has_flags; - - vpg = cl2vvp_page(cl_page_at(page, &vvp_device_type)); - vmpage = vpg->vpg_page; - seq_printf(seq, " %5i | %p %p %s %s %s | %p " DFID "(%p) %lu %u [", - 0 /* gen */, - vpg, page, - "none", - vpg->vpg_defer_uptodate ? "du" : "- ", - PageWriteback(vmpage) ? "wb" : "-", - vmpage, PFID(ll_inode2fid(vmpage->mapping->host)), - vmpage->mapping->host, vmpage->index, - page_count(vmpage)); - has_flags = 0; - seq_page_flag(seq, vmpage, locked, has_flags); - seq_page_flag(seq, vmpage, error, has_flags); - seq_page_flag(seq, vmpage, referenced, has_flags); - seq_page_flag(seq, vmpage, uptodate, has_flags); - seq_page_flag(seq, vmpage, dirty, has_flags); - seq_page_flag(seq, vmpage, writeback, has_flags); - seq_printf(seq, "%s]\n", has_flags ? 
"" : "-"); -} - -static int vvp_pgcache_show(struct seq_file *f, void *v) -{ - loff_t pos; - struct ll_sb_info *sbi; - struct cl_object *clob; - struct lu_env *env; - struct vvp_pgcache_id id; - u16 refcheck; - int result; - - env = cl_env_get(&refcheck); - if (!IS_ERR(env)) { - pos = *(loff_t *)v; - vvp_pgcache_id_unpack(pos, &id); - sbi = f->private; - clob = vvp_pgcache_obj(env, &sbi->ll_cl->cd_lu_dev, &id); - if (clob) { - struct inode *inode = vvp_object_inode(clob); - struct cl_page *page = NULL; - struct page *vmpage; - - result = find_get_pages_contig(inode->i_mapping, - id.vpi_index, 1, - &vmpage); - if (result > 0) { - lock_page(vmpage); - page = cl_vmpage_page(vmpage, clob); - unlock_page(vmpage); - put_page(vmpage); - } - - seq_printf(f, "%8x@" DFID ": ", id.vpi_index, - PFID(lu_object_fid(&clob->co_lu))); - if (page) { - vvp_pgcache_page_show(env, f, page); - cl_page_put(env, page); - } else { - seq_puts(f, "missing\n"); - } - lu_object_ref_del(&clob->co_lu, "dump", current); - cl_object_put(env, clob); - } else { - seq_printf(f, "%llx missing\n", pos); - } - cl_env_put(env, &refcheck); - result = 0; - } else { - result = PTR_ERR(env); - } - return result; -} - -static void *vvp_pgcache_start(struct seq_file *f, loff_t *pos) -{ - struct ll_sb_info *sbi; - struct lu_env *env; - u16 refcheck; - - sbi = f->private; - - env = cl_env_get(&refcheck); - if (!IS_ERR(env)) { - sbi = f->private; - if (sbi->ll_site->ls_obj_hash->hs_cur_bits > - 64 - PGC_OBJ_SHIFT) { - pos = ERR_PTR(-EFBIG); - } else { - *pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev, - *pos); - if (*pos == ~0ULL) - pos = NULL; - } - cl_env_put(env, &refcheck); - } - return pos; -} - -static void *vvp_pgcache_next(struct seq_file *f, void *v, loff_t *pos) -{ - struct ll_sb_info *sbi; - struct lu_env *env; - u16 refcheck; - - env = cl_env_get(&refcheck); - if (!IS_ERR(env)) { - sbi = f->private; - *pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev, *pos + 1); - if (*pos == ~0ULL) - pos = 
NULL; - cl_env_put(env, &refcheck); - } - return pos; -} - -static void vvp_pgcache_stop(struct seq_file *f, void *v) -{ - /* Nothing to do */ -} - -static const struct seq_operations vvp_pgcache_ops = { - .start = vvp_pgcache_start, - .next = vvp_pgcache_next, - .stop = vvp_pgcache_stop, - .show = vvp_pgcache_show -}; - -static int vvp_dump_pgcache_seq_open(struct inode *inode, struct file *filp) -{ - struct seq_file *seq; - int rc; - - rc = seq_open(filp, &vvp_pgcache_ops); - if (rc) - return rc; - - seq = filp->private_data; - seq->private = inode->i_private; - - return 0; -} - -const struct file_operations vvp_dump_pgcache_file_ops = { - .owner = THIS_MODULE, - .open = vvp_dump_pgcache_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; diff --git a/drivers/staging/lustre/lustre/llite/vvp_internal.h b/drivers/staging/lustre/lustre/llite/vvp_internal.h deleted file mode 100644 index 02ea5161d635..000000000000 --- a/drivers/staging/lustre/lustre/llite/vvp_internal.h +++ /dev/null @@ -1,321 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. 
- * Use is subject to license terms. - * - * Copyright (c) 2013, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Internal definitions for VVP layer. - * - * Author: Nikita Danilov <nikita.danilov@sun.com> - */ - -#ifndef VVP_INTERNAL_H -#define VVP_INTERNAL_H - -#include <uapi/linux/lustre/lustre_idl.h> -#include <cl_object.h> - -enum obd_notify_event; -struct inode; -struct lustre_md; -struct obd_device; -struct obd_export; -struct page; - -/** - * IO state private to IO state private to VVP layer. - */ -struct vvp_io { - /** super class */ - struct cl_io_slice vui_cl; - struct cl_io_lock_link vui_link; - /** - * I/O vector information to or from which read/write is going. - */ - struct iov_iter *vui_iter; - /** - * Total size for the left IO. - */ - size_t vui_tot_count; - - union { - struct vvp_fault_io { - /** - * Inode modification time that is checked across DLM - * lock request. - */ - time64_t ft_mtime; - struct vm_area_struct *ft_vma; - /** - * locked page returned from vvp_io - */ - struct page *ft_vmpage; - /** - * kernel fault info - */ - struct vm_fault *ft_vmf; - /** - * fault API used bitflags for return code. - */ - unsigned int ft_flags; - /** - * check that flags are from filemap_fault - */ - bool ft_flags_valid; - } fault; - struct { - struct cl_page_list vui_queue; - unsigned long vui_written; - int vui_from; - int vui_to; - } write; - } u; - - /** - * Layout version when this IO is initialized - */ - __u32 vui_layout_gen; - /** - * File descriptor against which IO is done. - */ - struct ll_file_data *vui_fd; - struct kiocb *vui_iocb; - - /* Readahead state. */ - pgoff_t vui_ra_start; - pgoff_t vui_ra_count; - /* Set when vui_ra_{start,count} have been initialized. 
*/ - bool vui_ra_valid; -}; - -extern struct lu_device_type vvp_device_type; - -extern struct lu_context_key vvp_session_key; -extern struct lu_context_key vvp_thread_key; - -extern struct kmem_cache *vvp_lock_kmem; -extern struct kmem_cache *vvp_object_kmem; - -struct vvp_thread_info { - struct cl_lock vti_lock; - struct cl_lock_descr vti_descr; - struct cl_io vti_io; - struct cl_attr vti_attr; -}; - -static inline struct vvp_thread_info *vvp_env_info(const struct lu_env *env) -{ - struct vvp_thread_info *vti; - - vti = lu_context_key_get(&env->le_ctx, &vvp_thread_key); - LASSERT(vti); - - return vti; -} - -static inline struct cl_lock *vvp_env_lock(const struct lu_env *env) -{ - struct cl_lock *lock = &vvp_env_info(env)->vti_lock; - - memset(lock, 0, sizeof(*lock)); - return lock; -} - -static inline struct cl_attr *vvp_env_thread_attr(const struct lu_env *env) -{ - struct cl_attr *attr = &vvp_env_info(env)->vti_attr; - - memset(attr, 0, sizeof(*attr)); - - return attr; -} - -static inline struct cl_io *vvp_env_thread_io(const struct lu_env *env) -{ - struct cl_io *io = &vvp_env_info(env)->vti_io; - - memset(io, 0, sizeof(*io)); - - return io; -} - -struct vvp_session { - struct vvp_io cs_ios; -}; - -static inline struct vvp_session *vvp_env_session(const struct lu_env *env) -{ - struct vvp_session *ses; - - ses = lu_context_key_get(env->le_ses, &vvp_session_key); - LASSERT(ses); - - return ses; -} - -static inline struct vvp_io *vvp_env_io(const struct lu_env *env) -{ - return &vvp_env_session(env)->cs_ios; -} - -/** - * ccc-private object state. - */ -struct vvp_object { - struct cl_object_header vob_header; - struct cl_object vob_cl; - struct inode *vob_inode; - - /** - * Number of transient pages. This is no longer protected by i_sem, - * and needs to be atomic. This is not actually used for anything, - * and can probably be removed. - */ - atomic_t vob_transient_pages; - - /** - * Number of outstanding mmaps on this file. 
- * - * \see ll_vm_open(), ll_vm_close(). - */ - atomic_t vob_mmap_cnt; - - /** - * various flags - * vob_discard_page_warned - * if pages belonging to this object are discarded when a client - * is evicted, some debug info will be printed, this flag will be set - * during processing the first discarded page, then avoid flooding - * debug message for lots of discarded pages. - * - * \see ll_dirty_page_discard_warn. - */ - unsigned int vob_discard_page_warned:1; -}; - -/** - * VVP-private page state. - */ -struct vvp_page { - struct cl_page_slice vpg_cl; - unsigned int vpg_defer_uptodate:1, - vpg_ra_used:1; - /** VM page */ - struct page *vpg_page; -}; - -static inline struct vvp_page *cl2vvp_page(const struct cl_page_slice *slice) -{ - return container_of(slice, struct vvp_page, vpg_cl); -} - -static inline pgoff_t vvp_index(struct vvp_page *vvp) -{ - return vvp->vpg_cl.cpl_index; -} - -struct vvp_device { - struct cl_device vdv_cl; - struct cl_device *vdv_next; -}; - -struct vvp_lock { - struct cl_lock_slice vlk_cl; -}; - -void *ccc_key_init(const struct lu_context *ctx, - struct lu_context_key *key); -void ccc_key_fini(const struct lu_context *ctx, - struct lu_context_key *key, void *data); - -void ccc_umount(const struct lu_env *env, struct cl_device *dev); - -static inline struct lu_device *vvp2lu_dev(struct vvp_device *vdv) -{ - return &vdv->vdv_cl.cd_lu_dev; -} - -static inline struct vvp_device *lu2vvp_dev(const struct lu_device *d) -{ - return container_of0(d, struct vvp_device, vdv_cl.cd_lu_dev); -} - -static inline struct vvp_device *cl2vvp_dev(const struct cl_device *d) -{ - return container_of0(d, struct vvp_device, vdv_cl); -} - -static inline struct vvp_object *cl2vvp(const struct cl_object *obj) -{ - return container_of0(obj, struct vvp_object, vob_cl); -} - -static inline struct vvp_object *lu2vvp(const struct lu_object *obj) -{ - return container_of0(obj, struct vvp_object, vob_cl.co_lu); -} - -static inline struct inode *vvp_object_inode(const 
struct cl_object *obj) -{ - return cl2vvp(obj)->vob_inode; -} - -int vvp_object_invariant(const struct cl_object *obj); -struct vvp_object *cl_inode2vvp(struct inode *inode); - -static inline struct page *cl2vm_page(const struct cl_page_slice *slice) -{ - return cl2vvp_page(slice)->vpg_page; -} - -static inline struct vvp_lock *cl2vvp_lock(const struct cl_lock_slice *slice) -{ - return container_of(slice, struct vvp_lock, vlk_cl); -} - -# define CLOBINVRNT(env, clob, expr) \ - ((void)sizeof(env), (void)sizeof(clob), (void)sizeof(!!(expr))) - -int vvp_io_init(const struct lu_env *env, struct cl_object *obj, - struct cl_io *io); -int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io); -int vvp_lock_init(const struct lu_env *env, struct cl_object *obj, - struct cl_lock *lock, const struct cl_io *io); -int vvp_page_init(const struct lu_env *env, struct cl_object *obj, - struct cl_page *page, pgoff_t index); -struct lu_object *vvp_object_alloc(const struct lu_env *env, - const struct lu_object_header *hdr, - struct lu_device *dev); - -int vvp_global_init(void); -void vvp_global_fini(void); - -extern const struct file_operations vvp_dump_pgcache_file_ops; - -#endif /* VVP_INTERNAL_H */ diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c deleted file mode 100644 index e7a4778e02e4..000000000000 --- a/drivers/staging/lustre/lustre/llite/vvp_io.c +++ /dev/null @@ -1,1374 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_io for VVP layer. - * - * Author: Nikita Danilov <nikita.danilov@sun.com> - * Author: Jinshan Xiong <jinshan.xiong@whamcloud.com> - */ - -#define DEBUG_SUBSYSTEM S_LLITE - -#include <obd.h> - -#include "llite_internal.h" -#include "vvp_internal.h" - -static struct vvp_io *cl2vvp_io(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct vvp_io *vio; - - vio = container_of(slice, struct vvp_io, vui_cl); - LASSERT(vio == vvp_env_io(env)); - - return vio; -} - -/** - * For swapping layout. The file's layout may have changed. - * To avoid populating pages to a wrong stripe, we have to verify the - * correctness of layout. It works because swapping layout processes - * have to acquire group lock. 
- */ -static bool can_populate_pages(const struct lu_env *env, struct cl_io *io, - struct inode *inode) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct vvp_io *vio = vvp_env_io(env); - bool rc = true; - - switch (io->ci_type) { - case CIT_READ: - case CIT_WRITE: - /* don't need lock here to check lli_layout_gen as we have held - * extent lock and GROUP lock has to hold to swap layout - */ - if (ll_layout_version_get(lli) != vio->vui_layout_gen || - OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_LOST_LAYOUT, 0)) { - io->ci_need_restart = 1; - /* this will cause a short read/write */ - io->ci_continue = 0; - rc = false; - } - case CIT_FAULT: - /* fault is okay because we've already had a page. */ - default: - break; - } - - return rc; -} - -static void vvp_object_size_lock(struct cl_object *obj) -{ - struct inode *inode = vvp_object_inode(obj); - - ll_inode_size_lock(inode); - cl_object_attr_lock(obj); -} - -static void vvp_object_size_unlock(struct cl_object *obj) -{ - struct inode *inode = vvp_object_inode(obj); - - cl_object_attr_unlock(obj); - ll_inode_size_unlock(inode); -} - -/** - * Helper function that if necessary adjusts file size (inode->i_size), when - * position at the offset \a pos is accessed. File size can be arbitrary stale - * on a Lustre client, but client at least knows KMS. If accessed area is - * inside [0, KMS], set file size to KMS, otherwise glimpse file size. - * - * Locking: cl_isize_lock is used to serialize changes to inode size and to - * protect consistency between inode size and cl_object - * attributes. cl_object_size_lock() protects consistency between cl_attr's of - * top-object and sub-objects. 
- */ -static int vvp_prep_size(const struct lu_env *env, struct cl_object *obj, - struct cl_io *io, loff_t start, size_t count, - int *exceed) -{ - struct cl_attr *attr = vvp_env_thread_attr(env); - struct inode *inode = vvp_object_inode(obj); - loff_t pos = start + count - 1; - loff_t kms; - int result; - - /* - * Consistency guarantees: following possibilities exist for the - * relation between region being accessed and real file size at this - * moment: - * - * (A): the region is completely inside of the file; - * - * (B-x): x bytes of region are inside of the file, the rest is - * outside; - * - * (C): the region is completely outside of the file. - * - * This classification is stable under DLM lock already acquired by - * the caller, because to change the class, other client has to take - * DLM lock conflicting with our lock. Also, any updates to ->i_size - * by other threads on this client are serialized by - * ll_inode_size_lock(). This guarantees that short reads are handled - * correctly in the face of concurrent writes and truncates. - */ - vvp_object_size_lock(obj); - result = cl_object_attr_get(env, obj, attr); - if (result == 0) { - kms = attr->cat_kms; - if (pos > kms) { - /* - * A glimpse is necessary to determine whether we - * return a short read (B) or some zeroes at the end - * of the buffer (C) - */ - vvp_object_size_unlock(obj); - result = cl_glimpse_lock(env, io, inode, obj, 0); - if (result == 0 && exceed) { - /* If objective page index exceed end-of-file - * page index, return directly. Do not expect - * kernel will check such case correctly. - * linux-2.6.18-128.1.1 miss to do that. - * --bug 17336 - */ - loff_t size = i_size_read(inode); - loff_t cur_index = start >> PAGE_SHIFT; - loff_t size_index = (size - 1) >> PAGE_SHIFT; - - if ((size == 0 && cur_index != 0) || - size_index < cur_index) - *exceed = 1; - } - return result; - } - /* - * region is within kms and, hence, within real file - * size (A). 
We need to increase i_size to cover the - * read region so that generic_file_read() will do its - * job, but that doesn't mean the kms size is - * _correct_, it is only the _minimum_ size. If - * someone does a stat they will get the correct size - * which will always be >= the kms value here. - * b=11081 - */ - if (i_size_read(inode) < kms) { - i_size_write(inode, kms); - CDEBUG(D_VFSTRACE, DFID " updating i_size %llu\n", - PFID(lu_object_fid(&obj->co_lu)), - (__u64)i_size_read(inode)); - } - } - - vvp_object_size_unlock(obj); - - return result; -} - -/***************************************************************************** - * - * io operations. - * - */ - -static int vvp_io_one_lock_index(const struct lu_env *env, struct cl_io *io, - __u32 enqflags, enum cl_lock_mode mode, - pgoff_t start, pgoff_t end) -{ - struct vvp_io *vio = vvp_env_io(env); - struct cl_lock_descr *descr = &vio->vui_link.cill_descr; - struct cl_object *obj = io->ci_obj; - - CLOBINVRNT(env, obj, vvp_object_invariant(obj)); - - CDEBUG(D_VFSTRACE, "lock: %d [%lu, %lu]\n", mode, start, end); - - memset(&vio->vui_link, 0, sizeof(vio->vui_link)); - - if (vio->vui_fd && (vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { - descr->cld_mode = CLM_GROUP; - descr->cld_gid = vio->vui_fd->fd_grouplock.lg_gid; - enqflags |= CEF_LOCK_MATCH; - } else { - descr->cld_mode = mode; - } - descr->cld_obj = obj; - descr->cld_start = start; - descr->cld_end = end; - descr->cld_enq_flags = enqflags; - - cl_io_lock_add(env, io, &vio->vui_link); - return 0; -} - -static int vvp_io_one_lock(const struct lu_env *env, struct cl_io *io, - __u32 enqflags, enum cl_lock_mode mode, - loff_t start, loff_t end) -{ - struct cl_object *obj = io->ci_obj; - - return vvp_io_one_lock_index(env, io, enqflags, mode, - cl_index(obj, start), cl_index(obj, end)); -} - -static int vvp_io_write_iter_init(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct vvp_io *vio = cl2vvp_io(env, ios); - - 
cl_page_list_init(&vio->u.write.vui_queue); - vio->u.write.vui_written = 0; - vio->u.write.vui_from = 0; - vio->u.write.vui_to = PAGE_SIZE; - - return 0; -} - -static void vvp_io_write_iter_fini(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct vvp_io *vio = cl2vvp_io(env, ios); - - LASSERT(vio->u.write.vui_queue.pl_nr == 0); -} - -static int vvp_io_fault_iter_init(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct vvp_io *vio = cl2vvp_io(env, ios); - struct inode *inode = vvp_object_inode(ios->cis_obj); - - LASSERT(inode == file_inode(vio->vui_fd->fd_file)); - vio->u.fault.ft_mtime = inode->i_mtime.tv_sec; - return 0; -} - -static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) -{ - struct cl_io *io = ios->cis_io; - struct cl_object *obj = io->ci_obj; - struct vvp_io *vio = cl2vvp_io(env, ios); - struct inode *inode = vvp_object_inode(obj); - - CLOBINVRNT(env, obj, vvp_object_invariant(obj)); - - CDEBUG(D_VFSTRACE, DFID - " ignore/verify layout %d/%d, layout version %d restore needed %d\n", - PFID(lu_object_fid(&obj->co_lu)), - io->ci_ignore_layout, io->ci_verify_layout, - vio->vui_layout_gen, io->ci_restore_needed); - - if (io->ci_restore_needed) { - int rc; - - /* file was detected release, we need to restore it - * before finishing the io - */ - rc = ll_layout_restore(inode, 0, OBD_OBJECT_EOF); - /* if restore registration failed, no restart, - * we will return -ENODATA - */ - /* The layout will change after restore, so we need to - * block on layout lock hold by the MDT - * as MDT will not send new layout in lvb (see LU-3124) - * we have to explicitly fetch it, all this will be done - * by ll_layout_refresh() - */ - if (rc == 0) { - io->ci_restore_needed = 0; - io->ci_need_restart = 1; - io->ci_verify_layout = 1; - } else { - io->ci_restore_needed = 1; - io->ci_need_restart = 0; - io->ci_verify_layout = 0; - io->ci_result = rc; - } - } - - if (!io->ci_ignore_layout && io->ci_verify_layout) { - 
__u32 gen = 0; - - /* check layout version */ - ll_layout_refresh(inode, &gen); - io->ci_need_restart = vio->vui_layout_gen != gen; - if (io->ci_need_restart) { - CDEBUG(D_VFSTRACE, - DFID " layout changed from %d to %d.\n", - PFID(lu_object_fid(&obj->co_lu)), - vio->vui_layout_gen, gen); - /* today successful restore is the only possible case */ - /* restore was done, clear restoring state */ - clear_bit(LLIF_FILE_RESTORING, - &ll_i2info(inode)->lli_flags); - } - } -} - -static void vvp_io_fault_fini(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct cl_io *io = ios->cis_io; - struct cl_page *page = io->u.ci_fault.ft_page; - - CLOBINVRNT(env, io->ci_obj, vvp_object_invariant(io->ci_obj)); - - if (page) { - lu_ref_del(&page->cp_reference, "fault", io); - cl_page_put(env, page); - io->u.ci_fault.ft_page = NULL; - } - vvp_io_fini(env, ios); -} - -static enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma) -{ - /* - * we only want to hold PW locks if the mmap() can generate - * writes back to the file and that only happens in shared - * writable vmas - */ - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) - return CLM_WRITE; - return CLM_READ; -} - -static int vvp_mmap_locks(const struct lu_env *env, - struct vvp_io *vio, struct cl_io *io) -{ - struct vvp_thread_info *cti = vvp_env_info(env); - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - struct cl_lock_descr *descr = &cti->vti_descr; - union ldlm_policy_data policy; - unsigned long addr; - ssize_t count; - int result = 0; - struct iov_iter i; - struct iovec iov; - - LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); - - if (!vio->vui_iter) /* nfs or loop back device write */ - return 0; - - /* No MM (e.g. NFS)? No vmas too. 
*/ - if (!mm) - return 0; - - iov_for_each(iov, i, *vio->vui_iter) { - addr = (unsigned long)iov.iov_base; - count = iov.iov_len; - if (count == 0) - continue; - - count += addr & (~PAGE_MASK); - addr &= PAGE_MASK; - - down_read(&mm->mmap_sem); - while ((vma = our_vma(mm, addr, count)) != NULL) { - struct inode *inode = file_inode(vma->vm_file); - int flags = CEF_MUST; - - if (ll_file_nolock(vma->vm_file)) { - /* - * For no lock case is not allowed for mmap - */ - result = -EINVAL; - break; - } - - /* - * XXX: Required lock mode can be weakened: CIT_WRITE - * io only ever reads user level buffer, and CIT_READ - * only writes on it. - */ - policy_from_vma(&policy, vma, addr, count); - descr->cld_mode = vvp_mode_from_vma(vma); - descr->cld_obj = ll_i2info(inode)->lli_clob; - descr->cld_start = cl_index(descr->cld_obj, - policy.l_extent.start); - descr->cld_end = cl_index(descr->cld_obj, - policy.l_extent.end); - descr->cld_enq_flags = flags; - result = cl_io_lock_alloc_add(env, io, descr); - - CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n", - descr->cld_mode, descr->cld_start, - descr->cld_end); - - if (result < 0) - break; - - if (vma->vm_end - addr >= count) - break; - - count -= vma->vm_end - addr; - addr = vma->vm_end; - } - up_read(&mm->mmap_sem); - if (result < 0) - break; - } - return result; -} - -static void vvp_io_advance(const struct lu_env *env, - const struct cl_io_slice *ios, - size_t nob) -{ - struct cl_object *obj = ios->cis_io->ci_obj; - struct vvp_io *vio = cl2vvp_io(env, ios); - - CLOBINVRNT(env, obj, vvp_object_invariant(obj)); - - vio->vui_tot_count -= nob; - iov_iter_reexpand(vio->vui_iter, vio->vui_tot_count); -} - -static void vvp_io_update_iov(const struct lu_env *env, - struct vvp_io *vio, struct cl_io *io) -{ - size_t size = io->u.ci_rw.crw_count; - - if (!vio->vui_iter) - return; - - iov_iter_truncate(vio->vui_iter, size); -} - -static int vvp_io_rw_lock(const struct lu_env *env, struct cl_io *io, - enum cl_lock_mode mode, loff_t start, 
loff_t end) -{ - struct vvp_io *vio = vvp_env_io(env); - int result; - int ast_flags = 0; - - LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); - - vvp_io_update_iov(env, vio, io); - - if (io->u.ci_rw.crw_nonblock) - ast_flags |= CEF_NONBLOCK; - result = vvp_mmap_locks(env, vio, io); - if (result == 0) - result = vvp_io_one_lock(env, io, ast_flags, mode, start, end); - return result; -} - -static int vvp_io_read_lock(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct cl_io *io = ios->cis_io; - struct cl_io_rw_common *rd = &io->u.ci_rd.rd; - int result; - - result = vvp_io_rw_lock(env, io, CLM_READ, rd->crw_pos, - rd->crw_pos + rd->crw_count - 1); - - return result; -} - -static int vvp_io_fault_lock(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct cl_io *io = ios->cis_io; - struct vvp_io *vio = cl2vvp_io(env, ios); - /* - * XXX LDLM_FL_CBPENDING - */ - return vvp_io_one_lock_index(env, - io, 0, - vvp_mode_from_vma(vio->u.fault.ft_vma), - io->u.ci_fault.ft_index, - io->u.ci_fault.ft_index); -} - -static int vvp_io_write_lock(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct cl_io *io = ios->cis_io; - loff_t start; - loff_t end; - - if (io->u.ci_wr.wr_append) { - start = 0; - end = OBD_OBJECT_EOF; - } else { - start = io->u.ci_wr.wr.crw_pos; - end = start + io->u.ci_wr.wr.crw_count - 1; - } - return vvp_io_rw_lock(env, io, CLM_WRITE, start, end); -} - -static int vvp_io_setattr_iter_init(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - return 0; -} - -/** - * Implementation of cl_io_operations::vio_lock() method for CIT_SETATTR io. - * - * Handles "lockless io" mode when extent locking is done by server. 
- */ -static int vvp_io_setattr_lock(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct cl_io *io = ios->cis_io; - __u64 new_size; - __u32 enqflags = 0; - - if (cl_io_is_trunc(io)) { - new_size = io->u.ci_setattr.sa_attr.lvb_size; - if (new_size == 0) - enqflags = CEF_DISCARD_DATA; - } else { - unsigned int valid = io->u.ci_setattr.sa_valid; - - if (!(valid & TIMES_SET_FLAGS)) - return 0; - - if ((!(valid & ATTR_MTIME) || - io->u.ci_setattr.sa_attr.lvb_mtime >= - io->u.ci_setattr.sa_attr.lvb_ctime) && - (!(valid & ATTR_ATIME) || - io->u.ci_setattr.sa_attr.lvb_atime >= - io->u.ci_setattr.sa_attr.lvb_ctime)) - return 0; - new_size = 0; - } - - return vvp_io_one_lock(env, io, enqflags, CLM_WRITE, - new_size, OBD_OBJECT_EOF); -} - -static int vvp_do_vmtruncate(struct inode *inode, size_t size) -{ - int result; - /* - * Only ll_inode_size_lock is taken at this level. - */ - ll_inode_size_lock(inode); - result = inode_newsize_ok(inode, size); - if (result < 0) { - ll_inode_size_unlock(inode); - return result; - } - truncate_setsize(inode, size); - ll_inode_size_unlock(inode); - return result; -} - -static int vvp_io_setattr_time(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct cl_io *io = ios->cis_io; - struct cl_object *obj = io->ci_obj; - struct cl_attr *attr = vvp_env_thread_attr(env); - int result; - unsigned valid = CAT_CTIME; - - cl_object_attr_lock(obj); - attr->cat_ctime = io->u.ci_setattr.sa_attr.lvb_ctime; - if (io->u.ci_setattr.sa_valid & ATTR_ATIME_SET) { - attr->cat_atime = io->u.ci_setattr.sa_attr.lvb_atime; - valid |= CAT_ATIME; - } - if (io->u.ci_setattr.sa_valid & ATTR_MTIME_SET) { - attr->cat_mtime = io->u.ci_setattr.sa_attr.lvb_mtime; - valid |= CAT_MTIME; - } - result = cl_object_attr_update(env, obj, attr, valid); - cl_object_attr_unlock(obj); - - return result; -} - -static int vvp_io_setattr_start(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct cl_io *io = ios->cis_io; - struct inode 
*inode = vvp_object_inode(io->ci_obj); - struct ll_inode_info *lli = ll_i2info(inode); - - if (cl_io_is_trunc(io)) { - down_write(&lli->lli_trunc_sem); - inode_lock(inode); - inode_dio_wait(inode); - } else { - inode_lock(inode); - } - - if (io->u.ci_setattr.sa_valid & TIMES_SET_FLAGS) - return vvp_io_setattr_time(env, ios); - - return 0; -} - -static void vvp_io_setattr_end(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct cl_io *io = ios->cis_io; - struct inode *inode = vvp_object_inode(io->ci_obj); - struct ll_inode_info *lli = ll_i2info(inode); - - if (cl_io_is_trunc(io)) { - /* Truncate in memory pages - they must be clean pages - * because osc has already notified to destroy osc_extents. - */ - vvp_do_vmtruncate(inode, io->u.ci_setattr.sa_attr.lvb_size); - inode_unlock(inode); - up_write(&lli->lli_trunc_sem); - } else { - inode_unlock(inode); - } -} - -static void vvp_io_setattr_fini(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - bool restore_needed = ios->cis_io->ci_restore_needed; - struct inode *inode = vvp_object_inode(ios->cis_obj); - - vvp_io_fini(env, ios); - - if (restore_needed && !ios->cis_io->ci_restore_needed) { - /* restore finished, set data modified flag for HSM */ - set_bit(LLIF_DATA_MODIFIED, &(ll_i2info(inode))->lli_flags); - } -} - -static int vvp_io_read_start(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct vvp_io *vio = cl2vvp_io(env, ios); - struct cl_io *io = ios->cis_io; - struct cl_object *obj = io->ci_obj; - struct inode *inode = vvp_object_inode(obj); - struct ll_inode_info *lli = ll_i2info(inode); - struct file *file = vio->vui_fd->fd_file; - - int result; - loff_t pos = io->u.ci_rd.rd.crw_pos; - long cnt = io->u.ci_rd.rd.crw_count; - long tot = vio->vui_tot_count; - int exceed = 0; - - CLOBINVRNT(env, obj, vvp_object_invariant(obj)); - - CDEBUG(D_VFSTRACE, "read: -> [%lli, %lli)\n", pos, pos + cnt); - - down_read(&lli->lli_trunc_sem); - - if (!can_populate_pages(env, io, 
inode)) - return 0; - - result = vvp_prep_size(env, obj, io, pos, tot, &exceed); - if (result != 0) - return result; - if (exceed != 0) - goto out; - - LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu, - "Read ino %lu, %lu bytes, offset %lld, size %llu\n", - inode->i_ino, cnt, pos, i_size_read(inode)); - - /* turn off the kernel's read-ahead */ - vio->vui_fd->fd_file->f_ra.ra_pages = 0; - - /* initialize read-ahead window once per syscall */ - if (!vio->vui_ra_valid) { - vio->vui_ra_valid = true; - vio->vui_ra_start = cl_index(obj, pos); - vio->vui_ra_count = cl_index(obj, tot + PAGE_SIZE - 1); - ll_ras_enter(file); - } - - /* BUG: 5972 */ - file_accessed(file); - LASSERT(vio->vui_iocb->ki_pos == pos); - result = generic_file_read_iter(vio->vui_iocb, vio->vui_iter); - -out: - if (result >= 0) { - if (result < cnt) - io->ci_continue = 0; - io->ci_nob += result; - ll_rw_stats_tally(ll_i2sbi(inode), current->pid, - vio->vui_fd, pos, result, READ); - result = 0; - } - return result; -} - -static int vvp_io_commit_sync(const struct lu_env *env, struct cl_io *io, - struct cl_page_list *plist, int from, int to) -{ - struct cl_2queue *queue = &io->ci_queue; - struct cl_page *page; - unsigned int bytes = 0; - int rc = 0; - - if (plist->pl_nr == 0) - return 0; - - if (from > 0 || to != PAGE_SIZE) { - page = cl_page_list_first(plist); - if (plist->pl_nr == 1) { - cl_page_clip(env, page, from, to); - } else { - if (from > 0) - cl_page_clip(env, page, from, PAGE_SIZE); - if (to != PAGE_SIZE) { - page = cl_page_list_last(plist); - cl_page_clip(env, page, 0, to); - } - } - } - - cl_2queue_init(queue); - cl_page_list_splice(plist, &queue->c2_qin); - rc = cl_io_submit_sync(env, io, CRT_WRITE, queue, 0); - - /* plist is not sorted any more */ - cl_page_list_splice(&queue->c2_qin, plist); - cl_page_list_splice(&queue->c2_qout, plist); - cl_2queue_fini(env, queue); - - if (rc == 0) { - /* calculate bytes */ - bytes = plist->pl_nr << PAGE_SHIFT; - bytes -= from + PAGE_SIZE - to; - - while 
(plist->pl_nr > 0) { - page = cl_page_list_first(plist); - cl_page_list_del(env, plist, page); - - cl_page_clip(env, page, 0, PAGE_SIZE); - - SetPageUptodate(cl_page_vmpage(page)); - cl_page_disown(env, io, page); - - /* held in ll_cl_init() */ - lu_ref_del(&page->cp_reference, "cl_io", io); - cl_page_put(env, page); - } - } - - return bytes > 0 ? bytes : rc; -} - -static void write_commit_callback(const struct lu_env *env, struct cl_io *io, - struct cl_page *page) -{ - struct page *vmpage = page->cp_vmpage; - - SetPageUptodate(vmpage); - set_page_dirty(vmpage); - - cl_page_disown(env, io, page); - - /* held in ll_cl_init() */ - lu_ref_del(&page->cp_reference, "cl_io", cl_io_top(io)); - cl_page_put(env, page); -} - -/* make sure the page list is contiguous */ -static bool page_list_sanity_check(struct cl_object *obj, - struct cl_page_list *plist) -{ - struct cl_page *page; - pgoff_t index = CL_PAGE_EOF; - - cl_page_list_for_each(page, plist) { - struct vvp_page *vpg = cl_object_page_slice(obj, page); - - if (index == CL_PAGE_EOF) { - index = vvp_index(vpg); - continue; - } - - ++index; - if (index == vvp_index(vpg)) - continue; - - return false; - } - return true; -} - -/* Return how many bytes have queued or written */ -int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io) -{ - struct cl_object *obj = io->ci_obj; - struct inode *inode = vvp_object_inode(obj); - struct vvp_io *vio = vvp_env_io(env); - struct cl_page_list *queue = &vio->u.write.vui_queue; - struct cl_page *page; - int rc = 0; - int bytes = 0; - unsigned int npages = vio->u.write.vui_queue.pl_nr; - - if (npages == 0) - return 0; - - CDEBUG(D_VFSTRACE, "commit async pages: %d, from %d, to %d\n", - npages, vio->u.write.vui_from, vio->u.write.vui_to); - - LASSERT(page_list_sanity_check(obj, queue)); - - /* submit IO with async write */ - rc = cl_io_commit_async(env, io, queue, - vio->u.write.vui_from, vio->u.write.vui_to, - write_commit_callback); - npages -= queue->pl_nr; /* already 
committed pages */ - if (npages > 0) { - /* calculate how many bytes were written */ - bytes = npages << PAGE_SHIFT; - - /* first page */ - bytes -= vio->u.write.vui_from; - if (queue->pl_nr == 0) /* last page */ - bytes -= PAGE_SIZE - vio->u.write.vui_to; - LASSERTF(bytes > 0, "bytes = %d, pages = %d\n", bytes, npages); - - vio->u.write.vui_written += bytes; - - CDEBUG(D_VFSTRACE, "Committed %d pages %d bytes, tot: %ld\n", - npages, bytes, vio->u.write.vui_written); - - /* the first page must have been written. */ - vio->u.write.vui_from = 0; - } - LASSERT(page_list_sanity_check(obj, queue)); - LASSERT(ergo(rc == 0, queue->pl_nr == 0)); - - /* out of quota, try sync write */ - if (rc == -EDQUOT && !cl_io_is_mkwrite(io)) { - rc = vvp_io_commit_sync(env, io, queue, - vio->u.write.vui_from, - vio->u.write.vui_to); - if (rc > 0) { - vio->u.write.vui_written += rc; - rc = 0; - } - } - - /* update inode size */ - ll_merge_attr(env, inode); - - /* Now the pages in queue were failed to commit, discard them - * unless they were dirtied before. 
- */ - while (queue->pl_nr > 0) { - page = cl_page_list_first(queue); - cl_page_list_del(env, queue, page); - - if (!PageDirty(cl_page_vmpage(page))) - cl_page_discard(env, io, page); - - cl_page_disown(env, io, page); - - /* held in ll_cl_init() */ - lu_ref_del(&page->cp_reference, "cl_io", io); - cl_page_put(env, page); - } - cl_page_list_fini(env, queue); - - return rc; -} - -static int vvp_io_write_start(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct vvp_io *vio = cl2vvp_io(env, ios); - struct cl_io *io = ios->cis_io; - struct cl_object *obj = io->ci_obj; - struct inode *inode = vvp_object_inode(obj); - struct ll_inode_info *lli = ll_i2info(inode); - ssize_t result = 0; - loff_t pos = io->u.ci_wr.wr.crw_pos; - size_t cnt = io->u.ci_wr.wr.crw_count; - - down_read(&lli->lli_trunc_sem); - - if (!can_populate_pages(env, io, inode)) - return 0; - - if (cl_io_is_append(io)) { - /* - * PARALLEL IO This has to be changed for parallel IO doing - * out-of-order writes. - */ - ll_merge_attr(env, inode); - pos = i_size_read(inode); - io->u.ci_wr.wr.crw_pos = pos; - vio->vui_iocb->ki_pos = pos; - } else { - LASSERT(vio->vui_iocb->ki_pos == pos); - } - - CDEBUG(D_VFSTRACE, "write: [%lli, %lli)\n", pos, pos + (long long)cnt); - - /* - * The maximum Lustre file size is variable, based on the OST maximum - * object size and number of stripes. This needs another check in - * addition to the VFS checks earlier. - */ - if (pos + cnt > ll_file_maxbytes(inode)) { - CDEBUG(D_INODE, - "%s: file " DFID " offset %llu > maxbytes %llu\n", - ll_get_fsname(inode->i_sb, NULL, 0), - PFID(ll_inode2fid(inode)), pos + cnt, - ll_file_maxbytes(inode)); - return -EFBIG; - } - - if (!vio->vui_iter) { - /* from a temp io in ll_cl_init(). */ - result = 0; - } else { - /* - * When using the locked AIO function (generic_file_aio_write()) - * testing has shown the inode mutex to be a limiting factor - * with multi-threaded single shared file performance. 
To get - * around this, we now use the lockless version. To maintain - * consistency, proper locking to protect against writes, - * trucates, etc. is handled in the higher layers of lustre. - */ - bool lock_node = !IS_NOSEC(inode); - - if (lock_node) - inode_lock(inode); - result = __generic_file_write_iter(vio->vui_iocb, - vio->vui_iter); - if (lock_node) - inode_unlock(inode); - - if (result > 0 || result == -EIOCBQUEUED) - result = generic_write_sync(vio->vui_iocb, result); - } - - if (result > 0) { - result = vvp_io_write_commit(env, io); - if (vio->u.write.vui_written > 0) { - result = vio->u.write.vui_written; - io->ci_nob += result; - - CDEBUG(D_VFSTRACE, "write: nob %zd, result: %zd\n", - io->ci_nob, result); - } - } - if (result > 0) { - set_bit(LLIF_DATA_MODIFIED, &(ll_i2info(inode))->lli_flags); - - if (result < cnt) - io->ci_continue = 0; - ll_rw_stats_tally(ll_i2sbi(inode), current->pid, - vio->vui_fd, pos, result, WRITE); - result = 0; - } - return result; -} - -static void vvp_io_rw_end(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct inode *inode = vvp_object_inode(ios->cis_obj); - struct ll_inode_info *lli = ll_i2info(inode); - - up_read(&lli->lli_trunc_sem); -} - -static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) -{ - struct vm_fault *vmf = cfio->ft_vmf; - - cfio->ft_flags = filemap_fault(vmf); - cfio->ft_flags_valid = 1; - - if (vmf->page) { - CDEBUG(D_PAGE, - "page %p map %p index %lu flags %lx count %u priv %0lx: got addr %p type NOPAGE\n", - vmf->page, vmf->page->mapping, vmf->page->index, - (long)vmf->page->flags, page_count(vmf->page), - page_private(vmf->page), (void *)vmf->address); - if (unlikely(!(cfio->ft_flags & VM_FAULT_LOCKED))) { - lock_page(vmf->page); - cfio->ft_flags |= VM_FAULT_LOCKED; - } - - cfio->ft_vmpage = vmf->page; - return 0; - } - - if (cfio->ft_flags & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) { - CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", (void *)vmf->address); - return -EFAULT; - } - - if 
(cfio->ft_flags & VM_FAULT_OOM) { - CDEBUG(D_PAGE, "got addr %p - OOM\n", (void *)vmf->address); - return -ENOMEM; - } - - if (cfio->ft_flags & VM_FAULT_RETRY) - return -EAGAIN; - - CERROR("Unknown error in page fault %d!\n", cfio->ft_flags); - return -EINVAL; -} - -static void mkwrite_commit_callback(const struct lu_env *env, struct cl_io *io, - struct cl_page *page) -{ - set_page_dirty(page->cp_vmpage); -} - -static int vvp_io_fault_start(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct vvp_io *vio = cl2vvp_io(env, ios); - struct cl_io *io = ios->cis_io; - struct cl_object *obj = io->ci_obj; - struct inode *inode = vvp_object_inode(obj); - struct ll_inode_info *lli = ll_i2info(inode); - struct cl_fault_io *fio = &io->u.ci_fault; - struct vvp_fault_io *cfio = &vio->u.fault; - loff_t offset; - int result = 0; - struct page *vmpage = NULL; - struct cl_page *page; - loff_t size; - pgoff_t last_index; - - down_read(&lli->lli_trunc_sem); - - /* offset of the last byte on the page */ - offset = cl_offset(obj, fio->ft_index + 1) - 1; - LASSERT(cl_index(obj, offset) == fio->ft_index); - result = vvp_prep_size(env, obj, io, 0, offset + 1, NULL); - if (result != 0) - return result; - - /* must return locked page */ - if (fio->ft_mkwrite) { - LASSERT(cfio->ft_vmpage); - lock_page(cfio->ft_vmpage); - } else { - result = vvp_io_kernel_fault(cfio); - if (result != 0) - return result; - } - - vmpage = cfio->ft_vmpage; - LASSERT(PageLocked(vmpage)); - - if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_FAULT_TRUNC_RACE)) - ll_invalidate_page(vmpage); - - size = i_size_read(inode); - /* Though we have already held a cl_lock upon this page, but - * it still can be truncated locally. - */ - if (unlikely((vmpage->mapping != inode->i_mapping) || - (page_offset(vmpage) > size))) { - CDEBUG(D_PAGE, "llite: fault and truncate race happened!\n"); - - /* return +1 to stop cl_io_loop() and ll_fault() will catch - * and retry. 
- */ - result = 1; - goto out; - } - - last_index = cl_index(obj, size - 1); - - if (fio->ft_mkwrite) { - /* - * Capture the size while holding the lli_trunc_sem from above - * we want to make sure that we complete the mkwrite action - * while holding this lock. We need to make sure that we are - * not past the end of the file. - */ - if (last_index < fio->ft_index) { - CDEBUG(D_PAGE, - "llite: mkwrite and truncate race happened: %p: 0x%lx 0x%lx\n", - vmpage->mapping, fio->ft_index, last_index); - /* - * We need to return if we are - * passed the end of the file. This will propagate - * up the call stack to ll_page_mkwrite where - * we will return VM_FAULT_NOPAGE. Any non-negative - * value returned here will be silently - * converted to 0. If the vmpage->mapping is null - * the error code would be converted back to ENODATA - * in ll_page_mkwrite0. Thus we return -ENODATA - * to handle both cases - */ - result = -ENODATA; - goto out; - } - } - - page = cl_page_find(env, obj, fio->ft_index, vmpage, CPT_CACHEABLE); - if (IS_ERR(page)) { - result = PTR_ERR(page); - goto out; - } - - /* if page is going to be written, we should add this page into cache - * earlier. - */ - if (fio->ft_mkwrite) { - wait_on_page_writeback(vmpage); - if (!PageDirty(vmpage)) { - struct cl_page_list *plist = &io->ci_queue.c2_qin; - struct vvp_page *vpg = cl_object_page_slice(obj, page); - int to = PAGE_SIZE; - - /* vvp_page_assume() calls wait_on_page_writeback(). */ - cl_page_assume(env, io, page); - - cl_page_list_init(plist); - cl_page_list_add(plist, page); - - /* size fixup */ - if (last_index == vvp_index(vpg)) - to = size & ~PAGE_MASK; - - /* Do not set Dirty bit here so that in case IO is - * started before the page is really made dirty, we - * still have chance to detect it. 
- */ - result = cl_io_commit_async(env, io, plist, 0, to, - mkwrite_commit_callback); - LASSERT(cl_page_is_owned(page, io)); - cl_page_list_fini(env, plist); - - vmpage = NULL; - if (result < 0) { - cl_page_discard(env, io, page); - cl_page_disown(env, io, page); - - cl_page_put(env, page); - - /* we're in big trouble, what can we do now? */ - if (result == -EDQUOT) - result = -ENOSPC; - goto out; - } else { - cl_page_disown(env, io, page); - } - } - } - - /* - * The ft_index is only used in the case of - * a mkwrite action. We need to check - * our assertions are correct, since - * we should have caught this above - */ - LASSERT(!fio->ft_mkwrite || fio->ft_index <= last_index); - if (fio->ft_index == last_index) - /* - * Last page is mapped partially. - */ - fio->ft_nob = size - cl_offset(obj, fio->ft_index); - else - fio->ft_nob = cl_page_size(obj); - - lu_ref_add(&page->cp_reference, "fault", io); - fio->ft_page = page; - -out: - /* return unlocked vmpage to avoid deadlocking */ - if (vmpage) - unlock_page(vmpage); - - cfio->ft_flags &= ~VM_FAULT_LOCKED; - - return result; -} - -static void vvp_io_fault_end(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct inode *inode = vvp_object_inode(ios->cis_obj); - struct ll_inode_info *lli = ll_i2info(inode); - - CLOBINVRNT(env, ios->cis_io->ci_obj, - vvp_object_invariant(ios->cis_io->ci_obj)); - up_read(&lli->lli_trunc_sem); -} - -static int vvp_io_fsync_start(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - /* we should mark TOWRITE bit to each dirty page in radix tree to - * verify pages have been written, but this is difficult because of - * race. 
- */ - return 0; -} - -static int vvp_io_read_ahead(const struct lu_env *env, - const struct cl_io_slice *ios, - pgoff_t start, struct cl_read_ahead *ra) -{ - int result = 0; - - if (ios->cis_io->ci_type == CIT_READ || - ios->cis_io->ci_type == CIT_FAULT) { - struct vvp_io *vio = cl2vvp_io(env, ios); - - if (unlikely(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { - ra->cra_end = CL_PAGE_EOF; - result = 1; /* no need to call down */ - } - } - - return result; -} - -static const struct cl_io_operations vvp_io_ops = { - .op = { - [CIT_READ] = { - .cio_fini = vvp_io_fini, - .cio_lock = vvp_io_read_lock, - .cio_start = vvp_io_read_start, - .cio_end = vvp_io_rw_end, - .cio_advance = vvp_io_advance, - }, - [CIT_WRITE] = { - .cio_fini = vvp_io_fini, - .cio_iter_init = vvp_io_write_iter_init, - .cio_iter_fini = vvp_io_write_iter_fini, - .cio_lock = vvp_io_write_lock, - .cio_start = vvp_io_write_start, - .cio_end = vvp_io_rw_end, - .cio_advance = vvp_io_advance, - }, - [CIT_SETATTR] = { - .cio_fini = vvp_io_setattr_fini, - .cio_iter_init = vvp_io_setattr_iter_init, - .cio_lock = vvp_io_setattr_lock, - .cio_start = vvp_io_setattr_start, - .cio_end = vvp_io_setattr_end - }, - [CIT_FAULT] = { - .cio_fini = vvp_io_fault_fini, - .cio_iter_init = vvp_io_fault_iter_init, - .cio_lock = vvp_io_fault_lock, - .cio_start = vvp_io_fault_start, - .cio_end = vvp_io_fault_end, - }, - [CIT_FSYNC] = { - .cio_start = vvp_io_fsync_start, - .cio_fini = vvp_io_fini - }, - [CIT_MISC] = { - .cio_fini = vvp_io_fini - } - }, - .cio_read_ahead = vvp_io_read_ahead, -}; - -int vvp_io_init(const struct lu_env *env, struct cl_object *obj, - struct cl_io *io) -{ - struct vvp_io *vio = vvp_env_io(env); - struct inode *inode = vvp_object_inode(obj); - int result; - - CLOBINVRNT(env, obj, vvp_object_invariant(obj)); - - CDEBUG(D_VFSTRACE, DFID - " ignore/verify layout %d/%d, layout version %d restore needed %d\n", - PFID(lu_object_fid(&obj->co_lu)), - io->ci_ignore_layout, io->ci_verify_layout, - 
vio->vui_layout_gen, io->ci_restore_needed); - - CL_IO_SLICE_CLEAN(vio, vui_cl); - cl_io_slice_add(io, &vio->vui_cl, obj, &vvp_io_ops); - vio->vui_ra_valid = false; - result = 0; - if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE) { - size_t count; - struct ll_inode_info *lli = ll_i2info(inode); - - count = io->u.ci_rw.crw_count; - /* "If nbyte is 0, read() will return 0 and have no other - * results." -- Single Unix Spec - */ - if (count == 0) - result = 1; - else - vio->vui_tot_count = count; - - /* for read/write, we store the jobid in the inode, and - * it'll be fetched by osc when building RPC. - * - * it's not accurate if the file is shared by different - * jobs. - */ - lustre_get_jobid(lli->lli_jobid); - } else if (io->ci_type == CIT_SETATTR) { - if (!cl_io_is_trunc(io)) - io->ci_lockreq = CILR_MANDATORY; - } - - /* Enqueue layout lock and get layout version. We need to do this - * even for operations requiring to open file, such as read and write, - * because it might not grant layout lock in IT_OPEN. - */ - if (result == 0 && !io->ci_ignore_layout) { - result = ll_layout_refresh(inode, &vio->vui_layout_gen); - if (result == -ENOENT) - /* If the inode on MDS has been removed, but the objects - * on OSTs haven't been destroyed (async unlink), layout - * fetch will return -ENOENT, we'd ignore this error - * and continue with dirty flush. LU-3230. - */ - result = 0; - if (result < 0) - CERROR("%s: refresh file layout " DFID " error %d.\n", - ll_get_fsname(inode->i_sb, NULL, 0), - PFID(lu_object_fid(&obj->co_lu)), result); - } - - return result; -} diff --git a/drivers/staging/lustre/lustre/llite/vvp_lock.c b/drivers/staging/lustre/lustre/llite/vvp_lock.c deleted file mode 100644 index 4b6c7143bd2c..000000000000 --- a/drivers/staging/lustre/lustre/llite/vvp_lock.c +++ /dev/null @@ -1,87 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2014, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_lock for VVP layer. - * - * Author: Nikita Danilov <nikita.danilov@sun.com> - */ - -#define DEBUG_SUBSYSTEM S_LLITE - -#include <obd_support.h> - -#include "vvp_internal.h" - -/***************************************************************************** - * - * Vvp lock functions. 
- * - */ - -static void vvp_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice) -{ - struct vvp_lock *vlk = cl2vvp_lock(slice); - - kmem_cache_free(vvp_lock_kmem, vlk); -} - -static int vvp_lock_enqueue(const struct lu_env *env, - const struct cl_lock_slice *slice, - struct cl_io *unused, struct cl_sync_io *anchor) -{ - CLOBINVRNT(env, slice->cls_obj, vvp_object_invariant(slice->cls_obj)); - - return 0; -} - -static const struct cl_lock_operations vvp_lock_ops = { - .clo_fini = vvp_lock_fini, - .clo_enqueue = vvp_lock_enqueue, -}; - -int vvp_lock_init(const struct lu_env *env, struct cl_object *obj, - struct cl_lock *lock, const struct cl_io *unused) -{ - struct vvp_lock *vlk; - int result; - - CLOBINVRNT(env, obj, vvp_object_invariant(obj)); - - vlk = kmem_cache_zalloc(vvp_lock_kmem, GFP_NOFS); - if (vlk) { - cl_lock_slice_add(lock, &vlk->vlk_cl, obj, &vvp_lock_ops); - result = 0; - } else { - result = -ENOMEM; - } - return result; -} diff --git a/drivers/staging/lustre/lustre/llite/vvp_object.c b/drivers/staging/lustre/lustre/llite/vvp_object.c deleted file mode 100644 index 05ad3b322a29..000000000000 --- a/drivers/staging/lustre/lustre/llite/vvp_object.c +++ /dev/null @@ -1,305 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * cl_object implementation for VVP layer. - * - * Author: Nikita Danilov <nikita.danilov@sun.com> - */ - -#define DEBUG_SUBSYSTEM S_LLITE - -#include <linux/libcfs/libcfs.h> - -#include <obd.h> - -#include "llite_internal.h" -#include "vvp_internal.h" - -/***************************************************************************** - * - * Object operations. - * - */ - -int vvp_object_invariant(const struct cl_object *obj) -{ - struct inode *inode = vvp_object_inode(obj); - struct ll_inode_info *lli = ll_i2info(inode); - - return (S_ISREG(inode->i_mode) || inode->i_mode == 0) && - lli->lli_clob == obj; -} - -static int vvp_object_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct lu_object *o) -{ - struct vvp_object *obj = lu2vvp(o); - struct inode *inode = obj->vob_inode; - struct ll_inode_info *lli; - - (*p)(env, cookie, "(%d %d) inode: %p ", - atomic_read(&obj->vob_transient_pages), - atomic_read(&obj->vob_mmap_cnt), inode); - if (inode) { - lli = ll_i2info(inode); - (*p)(env, cookie, "%lu/%u %o %u %d %p " DFID, - inode->i_ino, inode->i_generation, inode->i_mode, - inode->i_nlink, atomic_read(&inode->i_count), - lli->lli_clob, PFID(&lli->lli_fid)); - } - return 0; -} - -static int vvp_attr_get(const struct lu_env *env, struct cl_object *obj, - struct cl_attr *attr) -{ - struct inode *inode = vvp_object_inode(obj); - - /* - * lov overwrites most of these fields in - * lov_attr_get()->...lov_merge_lvb_kms(), except when inode - * attributes are newer. 
- */ - - attr->cat_size = i_size_read(inode); - attr->cat_mtime = inode->i_mtime.tv_sec; - attr->cat_atime = inode->i_atime.tv_sec; - attr->cat_ctime = inode->i_ctime.tv_sec; - attr->cat_blocks = inode->i_blocks; - attr->cat_uid = from_kuid(&init_user_ns, inode->i_uid); - attr->cat_gid = from_kgid(&init_user_ns, inode->i_gid); - /* KMS is not known by this layer */ - return 0; /* layers below have to fill in the rest */ -} - -static int vvp_attr_update(const struct lu_env *env, struct cl_object *obj, - const struct cl_attr *attr, unsigned int valid) -{ - struct inode *inode = vvp_object_inode(obj); - - if (valid & CAT_UID) - inode->i_uid = make_kuid(&init_user_ns, attr->cat_uid); - if (valid & CAT_GID) - inode->i_gid = make_kgid(&init_user_ns, attr->cat_gid); - if (valid & CAT_ATIME) - inode->i_atime.tv_sec = attr->cat_atime; - if (valid & CAT_MTIME) - inode->i_mtime.tv_sec = attr->cat_mtime; - if (valid & CAT_CTIME) - inode->i_ctime.tv_sec = attr->cat_ctime; - if (0 && valid & CAT_SIZE) - i_size_write(inode, attr->cat_size); - /* not currently necessary */ - if (0 && valid & (CAT_UID | CAT_GID | CAT_SIZE)) - mark_inode_dirty(inode); - return 0; -} - -static int vvp_conf_set(const struct lu_env *env, struct cl_object *obj, - const struct cl_object_conf *conf) -{ - struct ll_inode_info *lli = ll_i2info(conf->coc_inode); - - if (conf->coc_opc == OBJECT_CONF_INVALIDATE) { - CDEBUG(D_VFSTRACE, DFID ": losing layout lock\n", - PFID(&lli->lli_fid)); - - ll_layout_version_set(lli, CL_LAYOUT_GEN_NONE); - - /* Clean up page mmap for this inode. - * The reason for us to do this is that if the page has - * already been installed into memory space, the process - * can access it without interacting with lustre, so this - * page may be stale due to layout change, and the process - * will never be notified. - * This operation is expensive but mmap processes have to pay - * a price themselves. 
- */ - unmap_mapping_range(conf->coc_inode->i_mapping, - 0, OBD_OBJECT_EOF, 0); - } - - return 0; -} - -static int vvp_prune(const struct lu_env *env, struct cl_object *obj) -{ - struct inode *inode = vvp_object_inode(obj); - int rc; - - rc = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, CL_FSYNC_LOCAL, 1); - if (rc < 0) { - CDEBUG(D_VFSTRACE, DFID ": writeback failed: %d\n", - PFID(lu_object_fid(&obj->co_lu)), rc); - return rc; - } - - truncate_inode_pages(inode->i_mapping, 0); - return 0; -} - -static int vvp_object_glimpse(const struct lu_env *env, - const struct cl_object *obj, struct ost_lvb *lvb) -{ - struct inode *inode = vvp_object_inode(obj); - - lvb->lvb_mtime = LTIME_S(inode->i_mtime); - lvb->lvb_atime = LTIME_S(inode->i_atime); - lvb->lvb_ctime = LTIME_S(inode->i_ctime); - /* - * LU-417: Add dirty pages block count lest i_blocks reports 0, some - * "cp" or "tar" on remote node may think it's a completely sparse file - * and skip it. - */ - if (lvb->lvb_size > 0 && lvb->lvb_blocks == 0) - lvb->lvb_blocks = dirty_cnt(inode); - return 0; -} - -static void vvp_req_attr_set(const struct lu_env *env, struct cl_object *obj, - struct cl_req_attr *attr) -{ - u64 valid_flags = OBD_MD_FLTYPE; - struct inode *inode; - struct obdo *oa; - - oa = attr->cra_oa; - inode = vvp_object_inode(obj); - - if (attr->cra_type == CRT_WRITE) - valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME | - OBD_MD_FLUID | OBD_MD_FLGID; - obdo_from_inode(oa, inode, valid_flags & attr->cra_flags); - obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid); - if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_INVALID_PFID)) - oa->o_parent_oid++; - memcpy(attr->cra_jobid, ll_i2info(inode)->lli_jobid, LUSTRE_JOBID_SIZE); -} - -static const struct cl_object_operations vvp_ops = { - .coo_page_init = vvp_page_init, - .coo_lock_init = vvp_lock_init, - .coo_io_init = vvp_io_init, - .coo_attr_get = vvp_attr_get, - .coo_attr_update = vvp_attr_update, - .coo_conf_set = vvp_conf_set, - .coo_prune = vvp_prune, - .coo_glimpse = 
vvp_object_glimpse, - .coo_req_attr_set = vvp_req_attr_set -}; - -static int vvp_object_init0(const struct lu_env *env, - struct vvp_object *vob, - const struct cl_object_conf *conf) -{ - vob->vob_inode = conf->coc_inode; - atomic_set(&vob->vob_transient_pages, 0); - cl_object_page_init(&vob->vob_cl, sizeof(struct vvp_page)); - return 0; -} - -static int vvp_object_init(const struct lu_env *env, struct lu_object *obj, - const struct lu_object_conf *conf) -{ - struct vvp_device *dev = lu2vvp_dev(obj->lo_dev); - struct vvp_object *vob = lu2vvp(obj); - struct lu_object *below; - struct lu_device *under; - int result; - - under = &dev->vdv_next->cd_lu_dev; - below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under); - if (below) { - const struct cl_object_conf *cconf; - - cconf = lu2cl_conf(conf); - lu_object_add(obj, below); - result = vvp_object_init0(env, vob, cconf); - } else { - result = -ENOMEM; - } - - return result; -} - -static void vvp_object_free(const struct lu_env *env, struct lu_object *obj) -{ - struct vvp_object *vob = lu2vvp(obj); - - lu_object_fini(obj); - lu_object_header_fini(obj->lo_header); - kmem_cache_free(vvp_object_kmem, vob); -} - -static const struct lu_object_operations vvp_lu_obj_ops = { - .loo_object_init = vvp_object_init, - .loo_object_free = vvp_object_free, - .loo_object_print = vvp_object_print, -}; - -struct vvp_object *cl_inode2vvp(struct inode *inode) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct cl_object *obj = lli->lli_clob; - struct lu_object *lu; - - lu = lu_object_locate(obj->co_lu.lo_header, &vvp_device_type); - LASSERT(lu); - return lu2vvp(lu); -} - -struct lu_object *vvp_object_alloc(const struct lu_env *env, - const struct lu_object_header *unused, - struct lu_device *dev) -{ - struct vvp_object *vob; - struct lu_object *obj; - - vob = kmem_cache_zalloc(vvp_object_kmem, GFP_NOFS); - if (vob) { - struct cl_object_header *hdr; - - obj = &vob->vob_cl.co_lu; - hdr = &vob->vob_header; - 
cl_object_header_init(hdr); - hdr->coh_page_bufsize = cfs_size_round(sizeof(struct cl_page)); - - lu_object_init(obj, &hdr->coh_lu, dev); - lu_object_add_top(&hdr->coh_lu, obj); - - vob->vob_cl.co_ops = &vvp_ops; - obj->lo_ops = &vvp_lu_obj_ops; - } else { - obj = NULL; - } - return obj; -} diff --git a/drivers/staging/lustre/lustre/llite/vvp_page.c b/drivers/staging/lustre/lustre/llite/vvp_page.c deleted file mode 100644 index 6eb0565ddc22..000000000000 --- a/drivers/staging/lustre/lustre/llite/vvp_page.c +++ /dev/null @@ -1,523 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_page for VVP layer. 
- * - * Author: Nikita Danilov <nikita.danilov@sun.com> - * Author: Jinshan Xiong <jinshan.xiong@whamcloud.com> - */ - -#define DEBUG_SUBSYSTEM S_LLITE - -#include <linux/atomic.h> -#include <linux/bitops.h> -#include <linux/mm.h> -#include <linux/mutex.h> -#include <linux/page-flags.h> -#include <linux/pagemap.h> - -#include "llite_internal.h" -#include "vvp_internal.h" - -/***************************************************************************** - * - * Page operations. - * - */ - -static void vvp_page_fini_common(struct vvp_page *vpg) -{ - struct page *vmpage = vpg->vpg_page; - - LASSERT(vmpage); - put_page(vmpage); -} - -static void vvp_page_fini(const struct lu_env *env, - struct cl_page_slice *slice) -{ - struct vvp_page *vpg = cl2vvp_page(slice); - struct page *vmpage = vpg->vpg_page; - - /* - * vmpage->private was already cleared when page was moved into - * VPG_FREEING state. - */ - LASSERT((struct cl_page *)vmpage->private != slice->cpl_page); - vvp_page_fini_common(vpg); -} - -static int vvp_page_own(const struct lu_env *env, - const struct cl_page_slice *slice, struct cl_io *io, - int nonblock) -{ - struct vvp_page *vpg = cl2vvp_page(slice); - struct page *vmpage = vpg->vpg_page; - - LASSERT(vmpage); - if (nonblock) { - if (!trylock_page(vmpage)) - return -EAGAIN; - - if (unlikely(PageWriteback(vmpage))) { - unlock_page(vmpage); - return -EAGAIN; - } - - return 0; - } - - lock_page(vmpage); - wait_on_page_writeback(vmpage); - - return 0; -} - -static void vvp_page_assume(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *unused) -{ - struct page *vmpage = cl2vm_page(slice); - - LASSERT(vmpage); - LASSERT(PageLocked(vmpage)); - wait_on_page_writeback(vmpage); -} - -static void vvp_page_unassume(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *unused) -{ - struct page *vmpage = cl2vm_page(slice); - - LASSERT(vmpage); - LASSERT(PageLocked(vmpage)); -} - -static void vvp_page_disown(const struct 
lu_env *env, - const struct cl_page_slice *slice, struct cl_io *io) -{ - struct page *vmpage = cl2vm_page(slice); - - LASSERT(vmpage); - LASSERT(PageLocked(vmpage)); - - unlock_page(cl2vm_page(slice)); -} - -static void vvp_page_discard(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *unused) -{ - struct page *vmpage = cl2vm_page(slice); - struct vvp_page *vpg = cl2vvp_page(slice); - - LASSERT(vmpage); - LASSERT(PageLocked(vmpage)); - - if (vpg->vpg_defer_uptodate && !vpg->vpg_ra_used) - ll_ra_stats_inc(vmpage->mapping->host, RA_STAT_DISCARDED); - - ll_invalidate_page(vmpage); -} - -static void vvp_page_delete(const struct lu_env *env, - const struct cl_page_slice *slice) -{ - struct page *vmpage = cl2vm_page(slice); - struct inode *inode = vmpage->mapping->host; - struct cl_object *obj = slice->cpl_obj; - struct cl_page *page = slice->cpl_page; - int refc; - - LASSERT(PageLocked(vmpage)); - LASSERT((struct cl_page *)vmpage->private == page); - LASSERT(inode == vvp_object_inode(obj)); - - /* Drop the reference count held in vvp_page_init */ - refc = atomic_dec_return(&page->cp_ref); - LASSERTF(refc >= 1, "page = %p, refc = %d\n", page, refc); - - ClearPagePrivate(vmpage); - vmpage->private = 0; - /* - * Reference from vmpage to cl_page is removed, but the reference back - * is still here. It is removed later in vvp_page_fini(). - */ -} - -static void vvp_page_export(const struct lu_env *env, - const struct cl_page_slice *slice, - int uptodate) -{ - struct page *vmpage = cl2vm_page(slice); - - LASSERT(vmpage); - LASSERT(PageLocked(vmpage)); - if (uptodate) - SetPageUptodate(vmpage); - else - ClearPageUptodate(vmpage); -} - -static int vvp_page_is_vmlocked(const struct lu_env *env, - const struct cl_page_slice *slice) -{ - return PageLocked(cl2vm_page(slice)) ? 
-EBUSY : -ENODATA; -} - -static int vvp_page_prep_read(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *unused) -{ - /* Skip the page already marked as PG_uptodate. */ - return PageUptodate(cl2vm_page(slice)) ? -EALREADY : 0; -} - -static int vvp_page_prep_write(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *unused) -{ - struct page *vmpage = cl2vm_page(slice); - struct cl_page *pg = slice->cpl_page; - - LASSERT(PageLocked(vmpage)); - LASSERT(!PageDirty(vmpage)); - - /* ll_writepage path is not a sync write, so need to set page writeback - * flag - */ - if (!pg->cp_sync_io) - set_page_writeback(vmpage); - - return 0; -} - -/** - * Handles page transfer errors at VM level. - * - * This takes inode as a separate argument, because inode on which error is to - * be set can be different from \a vmpage inode in case of direct-io. - */ -static void vvp_vmpage_error(struct inode *inode, struct page *vmpage, - int ioret) -{ - struct vvp_object *obj = cl_inode2vvp(inode); - - if (ioret == 0) { - ClearPageError(vmpage); - obj->vob_discard_page_warned = 0; - } else { - SetPageError(vmpage); - mapping_set_error(inode->i_mapping, ioret); - - if ((ioret == -ESHUTDOWN || ioret == -EINTR) && - obj->vob_discard_page_warned == 0) { - obj->vob_discard_page_warned = 1; - ll_dirty_page_discard_warn(vmpage, ioret); - } - } -} - -static void vvp_page_completion_read(const struct lu_env *env, - const struct cl_page_slice *slice, - int ioret) -{ - struct vvp_page *vpg = cl2vvp_page(slice); - struct page *vmpage = vpg->vpg_page; - struct cl_page *page = slice->cpl_page; - struct inode *inode = vvp_object_inode(page->cp_obj); - - LASSERT(PageLocked(vmpage)); - CL_PAGE_HEADER(D_PAGE, env, page, "completing READ with %d\n", ioret); - - if (vpg->vpg_defer_uptodate) - ll_ra_count_put(ll_i2sbi(inode), 1); - - if (ioret == 0) { - if (!vpg->vpg_defer_uptodate) - cl_page_export(env, page, 1); - } else { - vpg->vpg_defer_uptodate = 0; - } - 
- if (!page->cp_sync_io) - unlock_page(vmpage); -} - -static void vvp_page_completion_write(const struct lu_env *env, - const struct cl_page_slice *slice, - int ioret) -{ - struct vvp_page *vpg = cl2vvp_page(slice); - struct cl_page *pg = slice->cpl_page; - struct page *vmpage = vpg->vpg_page; - - CL_PAGE_HEADER(D_PAGE, env, pg, "completing WRITE with %d\n", ioret); - - if (pg->cp_sync_io) { - LASSERT(PageLocked(vmpage)); - LASSERT(!PageWriteback(vmpage)); - } else { - LASSERT(PageWriteback(vmpage)); - /* - * Only mark the page error only when it's an async write - * because applications won't wait for IO to finish. - */ - vvp_vmpage_error(vvp_object_inode(pg->cp_obj), vmpage, ioret); - - end_page_writeback(vmpage); - } -} - -/** - * Implements cl_page_operations::cpo_make_ready() method. - * - * This is called to yank a page from the transfer cache and to send it out as - * a part of transfer. This function try-locks the page. If try-lock failed, - * page is owned by some concurrent IO, and should be skipped (this is bad, - * but hopefully rare situation, as it usually results in transfer being - * shorter than possible). - * - * \retval 0 success, page can be placed into transfer - * - * \retval -EAGAIN page is either used by concurrent IO has been - * truncated. Skip it. - */ -static int vvp_page_make_ready(const struct lu_env *env, - const struct cl_page_slice *slice) -{ - struct page *vmpage = cl2vm_page(slice); - struct cl_page *pg = slice->cpl_page; - int result = 0; - - lock_page(vmpage); - if (clear_page_dirty_for_io(vmpage)) { - LASSERT(pg->cp_state == CPS_CACHED); - /* This actually clears the dirty bit in the radix tree. */ - set_page_writeback(vmpage); - CL_PAGE_HEADER(D_PAGE, env, pg, "readied\n"); - } else if (pg->cp_state == CPS_PAGEOUT) { - /* is it possible for osc_flush_async_page() to already - * make it ready? 
- */ - result = -EALREADY; - } else { - CL_PAGE_DEBUG(D_ERROR, env, pg, "Unexpecting page state %d.\n", - pg->cp_state); - LBUG(); - } - unlock_page(vmpage); - return result; -} - -static int vvp_page_print(const struct lu_env *env, - const struct cl_page_slice *slice, - void *cookie, lu_printer_t printer) -{ - struct vvp_page *vpg = cl2vvp_page(slice); - struct page *vmpage = vpg->vpg_page; - - (*printer)(env, cookie, LUSTRE_VVP_NAME "-page@%p(%d:%d) vm@%p ", - vpg, vpg->vpg_defer_uptodate, vpg->vpg_ra_used, vmpage); - if (vmpage) { - (*printer)(env, cookie, "%lx %d:%d %lx %lu %slru", - (long)vmpage->flags, page_count(vmpage), - page_mapcount(vmpage), vmpage->private, - vmpage->index, - list_empty(&vmpage->lru) ? "not-" : ""); - } - - (*printer)(env, cookie, "\n"); - - return 0; -} - -static int vvp_page_fail(const struct lu_env *env, - const struct cl_page_slice *slice) -{ - /* - * Cached read? - */ - LBUG(); - - return 0; -} - -static const struct cl_page_operations vvp_page_ops = { - .cpo_own = vvp_page_own, - .cpo_assume = vvp_page_assume, - .cpo_unassume = vvp_page_unassume, - .cpo_disown = vvp_page_disown, - .cpo_discard = vvp_page_discard, - .cpo_delete = vvp_page_delete, - .cpo_export = vvp_page_export, - .cpo_is_vmlocked = vvp_page_is_vmlocked, - .cpo_fini = vvp_page_fini, - .cpo_print = vvp_page_print, - .io = { - [CRT_READ] = { - .cpo_prep = vvp_page_prep_read, - .cpo_completion = vvp_page_completion_read, - .cpo_make_ready = vvp_page_fail, - }, - [CRT_WRITE] = { - .cpo_prep = vvp_page_prep_write, - .cpo_completion = vvp_page_completion_write, - .cpo_make_ready = vvp_page_make_ready, - }, - }, -}; - -static int vvp_transient_page_prep(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *unused) -{ - /* transient page should always be sent. 
*/ - return 0; -} - -static int vvp_transient_page_own(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *unused, int nonblock) -{ - return 0; -} - -static void vvp_transient_page_assume(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *unused) -{ -} - -static void vvp_transient_page_unassume(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *unused) -{ -} - -static void vvp_transient_page_disown(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *unused) -{ -} - -static void vvp_transient_page_discard(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *unused) -{ - struct cl_page *page = slice->cpl_page; - - /* - * For transient pages, remove it from the radix tree. - */ - cl_page_delete(env, page); -} - -static int vvp_transient_page_is_vmlocked(const struct lu_env *env, - const struct cl_page_slice *slice) -{ - struct inode *inode = vvp_object_inode(slice->cpl_obj); - int locked; - - locked = !inode_trylock(inode); - if (!locked) - inode_unlock(inode); - return locked ? 
-EBUSY : -ENODATA; -} - -static void -vvp_transient_page_completion(const struct lu_env *env, - const struct cl_page_slice *slice, - int ioret) -{ -} - -static void vvp_transient_page_fini(const struct lu_env *env, - struct cl_page_slice *slice) -{ - struct vvp_page *vpg = cl2vvp_page(slice); - struct cl_page *clp = slice->cpl_page; - struct vvp_object *clobj = cl2vvp(clp->cp_obj); - - vvp_page_fini_common(vpg); - atomic_dec(&clobj->vob_transient_pages); -} - -static const struct cl_page_operations vvp_transient_page_ops = { - .cpo_own = vvp_transient_page_own, - .cpo_assume = vvp_transient_page_assume, - .cpo_unassume = vvp_transient_page_unassume, - .cpo_disown = vvp_transient_page_disown, - .cpo_discard = vvp_transient_page_discard, - .cpo_fini = vvp_transient_page_fini, - .cpo_is_vmlocked = vvp_transient_page_is_vmlocked, - .cpo_print = vvp_page_print, - .io = { - [CRT_READ] = { - .cpo_prep = vvp_transient_page_prep, - .cpo_completion = vvp_transient_page_completion, - }, - [CRT_WRITE] = { - .cpo_prep = vvp_transient_page_prep, - .cpo_completion = vvp_transient_page_completion, - } - } -}; - -int vvp_page_init(const struct lu_env *env, struct cl_object *obj, - struct cl_page *page, pgoff_t index) -{ - struct vvp_page *vpg = cl_object_page_slice(obj, page); - struct page *vmpage = page->cp_vmpage; - - CLOBINVRNT(env, obj, vvp_object_invariant(obj)); - - vpg->vpg_page = vmpage; - get_page(vmpage); - - if (page->cp_type == CPT_CACHEABLE) { - /* in cache, decref in vvp_page_delete */ - atomic_inc(&page->cp_ref); - SetPagePrivate(vmpage); - vmpage->private = (unsigned long)page; - cl_page_slice_add(page, &vpg->vpg_cl, obj, index, - &vvp_page_ops); - } else { - struct vvp_object *clobj = cl2vvp(obj); - - cl_page_slice_add(page, &vpg->vpg_cl, obj, index, - &vvp_transient_page_ops); - atomic_inc(&clobj->vob_transient_pages); - } - return 0; -} diff --git a/drivers/staging/lustre/lustre/llite/xattr.c b/drivers/staging/lustre/lustre/llite/xattr.c deleted file mode 100644 
index 2d78432963dc..000000000000 --- a/drivers/staging/lustre/lustre/llite/xattr.c +++ /dev/null @@ -1,638 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- */ - -#include <linux/fs.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/xattr.h> -#include <linux/selinux.h> - -#define DEBUG_SUBSYSTEM S_LLITE - -#include <obd_support.h> -#include <lustre_dlm.h> - -#include "llite_internal.h" - -const struct xattr_handler *get_xattr_type(const char *name) -{ - int i = 0; - - while (ll_xattr_handlers[i]) { - size_t len = strlen(ll_xattr_handlers[i]->prefix); - - if (!strncmp(ll_xattr_handlers[i]->prefix, name, len)) - return ll_xattr_handlers[i]; - i++; - } - return NULL; -} - -static int xattr_type_filter(struct ll_sb_info *sbi, - const struct xattr_handler *handler) -{ - /* No handler means XATTR_OTHER_T */ - if (!handler) - return -EOPNOTSUPP; - - if ((handler->flags == XATTR_ACL_ACCESS_T || - handler->flags == XATTR_ACL_DEFAULT_T) && - !(sbi->ll_flags & LL_SBI_ACL)) - return -EOPNOTSUPP; - - if (handler->flags == XATTR_USER_T && - !(sbi->ll_flags & LL_SBI_USER_XATTR)) - return -EOPNOTSUPP; - - if (handler->flags == XATTR_TRUSTED_T && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - - return 0; -} - -static int -ll_xattr_set_common(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, const void *value, size_t size, - int flags) -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ptlrpc_request *req = NULL; - const char *pv = value; - char *fullname; - __u64 valid; - int rc; - - if (flags == XATTR_REPLACE) { - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REMOVEXATTR, 1); - valid = OBD_MD_FLXATTRRM; - } else { - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_SETXATTR, 1); - valid = OBD_MD_FLXATTR; - } - - rc = xattr_type_filter(sbi, handler); - if (rc) - return rc; - - if ((handler->flags == XATTR_ACL_ACCESS_T || - handler->flags == XATTR_ACL_DEFAULT_T) && - !inode_owner_or_capable(inode)) - return -EPERM; - - /* b10667: ignore lustre special xattr for now */ - if (!strcmp(name, "hsm") || - ((handler->flags == XATTR_TRUSTED_T && !strcmp(name, "lov")) || - 
(handler->flags == XATTR_LUSTRE_T && !strcmp(name, "lov")))) - return 0; - - /* b15587: ignore security.capability xattr for now */ - if ((handler->flags == XATTR_SECURITY_T && - !strcmp(name, "capability"))) - return 0; - - /* LU-549: Disable security.selinux when selinux is disabled */ - if (handler->flags == XATTR_SECURITY_T && !selinux_is_enabled() && - strcmp(name, "selinux") == 0) - return -EOPNOTSUPP; - - /*FIXME: enable IMA when the conditions are ready */ - if (handler->flags == XATTR_SECURITY_T && - (!strcmp(name, "ima") || !strcmp(name, "evm"))) - return -EOPNOTSUPP; - - /* - * In user.* namespace, only regular files and directories can have - * extended attributes. - */ - if (handler->flags == XATTR_USER_T) { - if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) - return -EPERM; - } - - fullname = kasprintf(GFP_KERNEL, "%s%s\n", handler->prefix, name); - if (!fullname) - return -ENOMEM; - rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), - valid, fullname, pv, size, 0, flags, - ll_i2suppgid(inode), &req); - kfree(fullname); - if (rc) { - if (rc == -EOPNOTSUPP && handler->flags == XATTR_USER_T) { - LCONSOLE_INFO("Disabling user_xattr feature because it is not supported on the server\n"); - sbi->ll_flags &= ~LL_SBI_USER_XATTR; - } - return rc; - } - - ptlrpc_req_finished(req); - return 0; -} - -static int get_hsm_state(struct inode *inode, u32 *hus_states) -{ - struct md_op_data *op_data; - struct hsm_user_state *hus; - int rc; - - hus = kzalloc(sizeof(*hus), GFP_NOFS); - if (!hus) - return -ENOMEM; - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, hus); - if (!IS_ERR(op_data)) { - rc = obd_iocontrol(LL_IOC_HSM_STATE_GET, ll_i2mdexp(inode), - sizeof(*op_data), op_data, NULL); - if (!rc) - *hus_states = hus->hus_states; - else - CDEBUG(D_VFSTRACE, "obd_iocontrol failed. rc = %d\n", - rc); - - ll_finish_md_op_data(op_data); - } else { - rc = PTR_ERR(op_data); - CDEBUG(D_VFSTRACE, "Could not prepare the opdata. 
rc = %d\n", - rc); - } - kfree(hus); - return rc; -} - -static int ll_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, const void *value, size_t size, - int flags) -{ - LASSERT(inode); - LASSERT(name); - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p), xattr %s\n", - PFID(ll_inode2fid(inode)), inode, name); - - if (!strcmp(name, "lov")) { - struct lov_user_md *lump = (struct lov_user_md *)value; - int op_type = flags == XATTR_REPLACE ? LPROC_LL_REMOVEXATTR : - LPROC_LL_SETXATTR; - int rc = 0; - - ll_stats_ops_tally(ll_i2sbi(inode), op_type, 1); - - if (size != 0 && size < sizeof(struct lov_user_md)) - return -EINVAL; - - /* - * It is possible to set an xattr to a "" value of zero size. - * For this case we are going to treat it as a removal. - */ - if (!size && lump) - lump = NULL; - - /* Attributes that are saved via getxattr will always have - * the stripe_offset as 0. Instead, the MDS should be - * allowed to pick the starting OST index. b=17846 - */ - if (lump && lump->lmm_stripe_offset == 0) - lump->lmm_stripe_offset = -1; - - /* Avoid anyone directly setting the RELEASED flag. */ - if (lump && (lump->lmm_pattern & LOV_PATTERN_F_RELEASED)) { - /* Only if we have a released flag check if the file - * was indeed archived. 
- */ - u32 state = HS_NONE; - - rc = get_hsm_state(inode, &state); - if (rc) - return rc; - - if (!(state & HS_ARCHIVED)) { - CDEBUG(D_VFSTRACE, - "hus_states state = %x, pattern = %x\n", - state, lump->lmm_pattern); - /* - * Here the state is: real file is not - * archived but user is requesting to set - * the RELEASED flag so we mask off the - * released flag from the request - */ - lump->lmm_pattern ^= LOV_PATTERN_F_RELEASED; - } - } - - if (lump && S_ISREG(inode->i_mode)) { - __u64 it_flags = FMODE_WRITE; - int lum_size; - - lum_size = ll_lov_user_md_size(lump); - if (lum_size < 0 || size < lum_size) - return 0; /* b=10667: ignore error */ - - rc = ll_lov_setstripe_ea_info(inode, dentry, it_flags, - lump, lum_size); - /* b=10667: rc always be 0 here for now */ - rc = 0; - } else if (S_ISDIR(inode->i_mode)) { - rc = ll_dir_setstripe(inode, lump, 0); - } - - return rc; - - } else if (!strcmp(name, "lma") || !strcmp(name, "link")) { - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_SETXATTR, 1); - return 0; - } - - return ll_xattr_set_common(handler, dentry, inode, name, value, size, - flags); -} - -int -ll_xattr_list(struct inode *inode, const char *name, int type, void *buffer, - size_t size, __u64 valid) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ptlrpc_request *req = NULL; - struct mdt_body *body; - void *xdata; - int rc; - - if (sbi->ll_xattr_cache_enabled && type != XATTR_ACL_ACCESS_T && - (type != XATTR_SECURITY_T || strcmp(name, "security.selinux"))) { - rc = ll_xattr_cache_get(inode, name, buffer, size, valid); - if (rc == -EAGAIN) - goto getxattr_nocache; - if (rc < 0) - goto out_xattr; - - /* Add "system.posix_acl_access" to the list */ - if (lli->lli_posix_acl && valid & OBD_MD_FLXATTRLS) { - if (size == 0) { - rc += sizeof(XATTR_NAME_ACL_ACCESS); - } else if (size - rc >= sizeof(XATTR_NAME_ACL_ACCESS)) { - memcpy(buffer + rc, XATTR_NAME_ACL_ACCESS, - sizeof(XATTR_NAME_ACL_ACCESS)); - rc += 
sizeof(XATTR_NAME_ACL_ACCESS); - } else { - rc = -ERANGE; - goto out_xattr; - } - } - } else { -getxattr_nocache: - rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), - valid, name, NULL, 0, size, 0, &req); - if (rc < 0) - goto out_xattr; - - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - LASSERT(body); - - /* only detect the xattr size */ - if (size == 0) { - rc = body->mbo_eadatasize; - goto out; - } - - if (size < body->mbo_eadatasize) { - CERROR("server bug: replied size %u > %u\n", - body->mbo_eadatasize, (int)size); - rc = -ERANGE; - goto out; - } - - if (body->mbo_eadatasize == 0) { - rc = -ENODATA; - goto out; - } - - /* do not need swab xattr data */ - xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, - body->mbo_eadatasize); - if (!xdata) { - rc = -EFAULT; - goto out; - } - - memcpy(buffer, xdata, body->mbo_eadatasize); - rc = body->mbo_eadatasize; - } - -out_xattr: - if (rc == -EOPNOTSUPP && type == XATTR_USER_T) { - LCONSOLE_INFO( - "%s: disabling user_xattr feature because it is not supported on the server: rc = %d\n", - ll_get_fsname(inode->i_sb, NULL, 0), rc); - sbi->ll_flags &= ~LL_SBI_USER_XATTR; - } -out: - ptlrpc_req_finished(req); - return rc; -} - -static int ll_xattr_get_common(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, void *buffer, size_t size) -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); -#ifdef CONFIG_FS_POSIX_ACL - struct ll_inode_info *lli = ll_i2info(inode); -#endif - char *fullname; - int rc; - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p)\n", - PFID(ll_inode2fid(inode)), inode); - - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, 1); - - rc = xattr_type_filter(sbi, handler); - if (rc) - return rc; - - /* b15587: ignore security.capability xattr for now */ - if ((handler->flags == XATTR_SECURITY_T && !strcmp(name, "capability"))) - return -ENODATA; - - /* LU-549: Disable security.selinux when selinux is disabled */ - if 
(handler->flags == XATTR_SECURITY_T && !selinux_is_enabled() && - !strcmp(name, "selinux")) - return -EOPNOTSUPP; - -#ifdef CONFIG_FS_POSIX_ACL - /* posix acl is under protection of LOOKUP lock. when calling to this, - * we just have path resolution to the target inode, so we have great - * chance that cached ACL is uptodate. - */ - if (handler->flags == XATTR_ACL_ACCESS_T) { - struct posix_acl *acl; - - spin_lock(&lli->lli_lock); - acl = posix_acl_dup(lli->lli_posix_acl); - spin_unlock(&lli->lli_lock); - - if (!acl) - return -ENODATA; - - rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - return rc; - } - if (handler->flags == XATTR_ACL_DEFAULT_T && !S_ISDIR(inode->i_mode)) - return -ENODATA; -#endif - fullname = kasprintf(GFP_KERNEL, "%s%s\n", handler->prefix, name); - if (!fullname) - return -ENOMEM; - rc = ll_xattr_list(inode, fullname, handler->flags, buffer, size, - OBD_MD_FLXATTR); - kfree(fullname); - return rc; -} - -static ssize_t ll_getxattr_lov(struct inode *inode, void *buf, size_t buf_size) -{ - ssize_t rc; - - if (S_ISREG(inode->i_mode)) { - struct cl_object *obj = ll_i2info(inode)->lli_clob; - struct cl_layout cl = { - .cl_buf.lb_buf = buf, - .cl_buf.lb_len = buf_size, - }; - struct lu_env *env; - u16 refcheck; - - if (!obj) - return -ENODATA; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - rc = cl_object_layout_get(env, obj, &cl); - if (rc < 0) - goto out_env; - - if (!cl.cl_size) { - rc = -ENODATA; - goto out_env; - } - - rc = cl.cl_size; - - if (!buf_size) - goto out_env; - - LASSERT(buf && rc <= buf_size); - - /* - * Do not return layout gen for getxattr() since - * otherwise it would confuse tar --xattr by - * recognizing layout gen as stripe offset when the - * file is restored. See LU-2809. 
- */ - ((struct lov_mds_md *)buf)->lmm_layout_gen = 0; -out_env: - cl_env_put(env, &refcheck); - - return rc; - } else if (S_ISDIR(inode->i_mode)) { - struct ptlrpc_request *req = NULL; - struct lov_mds_md *lmm = NULL; - int lmm_size = 0; - - rc = ll_dir_getstripe(inode, (void **)&lmm, &lmm_size, - &req, 0); - if (rc < 0) - goto out_req; - - if (!buf_size) { - rc = lmm_size; - goto out_req; - } - - if (buf_size < lmm_size) { - rc = -ERANGE; - goto out_req; - } - - memcpy(buf, lmm, lmm_size); - rc = lmm_size; -out_req: - if (req) - ptlrpc_req_finished(req); - - return rc; - } else { - return -ENODATA; - } -} - -static int ll_xattr_get(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, void *buffer, size_t size) -{ - LASSERT(inode); - LASSERT(name); - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p), xattr %s\n", - PFID(ll_inode2fid(inode)), inode, name); - - if (!strcmp(name, "lov")) { - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, 1); - - return ll_getxattr_lov(inode, buffer, size); - } - - return ll_xattr_get_common(handler, dentry, inode, name, buffer, size); -} - -ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size) -{ - struct inode *inode = d_inode(dentry); - struct ll_sb_info *sbi = ll_i2sbi(inode); - char *xattr_name; - ssize_t rc, rc2; - size_t len, rem; - - LASSERT(inode); - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p)\n", - PFID(ll_inode2fid(inode)), inode); - - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LISTXATTR, 1); - - rc = ll_xattr_list(inode, NULL, XATTR_OTHER_T, buffer, size, - OBD_MD_FLXATTRLS); - if (rc < 0) - return rc; - /* - * If we're being called to get the size of the xattr list - * (buf_size == 0) then just assume that a lustre.lov xattr - * exists. 
- */ - if (!size) - return rc + sizeof(XATTR_LUSTRE_LOV); - - xattr_name = buffer; - rem = rc; - - while (rem > 0) { - len = strnlen(xattr_name, rem - 1) + 1; - rem -= len; - if (!xattr_type_filter(sbi, get_xattr_type(xattr_name))) { - /* Skip OK xattr type leave it in buffer */ - xattr_name += len; - continue; - } - - /* - * Move up remaining xattrs in buffer - * removing the xattr that is not OK - */ - memmove(xattr_name, xattr_name + len, rem); - rc -= len; - } - - rc2 = ll_getxattr_lov(inode, NULL, 0); - if (rc2 == -ENODATA) - return rc; - - if (rc2 < 0) - return rc2; - - if (size < rc + sizeof(XATTR_LUSTRE_LOV)) - return -ERANGE; - - memcpy(buffer + rc, XATTR_LUSTRE_LOV, sizeof(XATTR_LUSTRE_LOV)); - - return rc + sizeof(XATTR_LUSTRE_LOV); -} - -static const struct xattr_handler ll_user_xattr_handler = { - .prefix = XATTR_USER_PREFIX, - .flags = XATTR_USER_T, - .get = ll_xattr_get_common, - .set = ll_xattr_set_common, -}; - -static const struct xattr_handler ll_trusted_xattr_handler = { - .prefix = XATTR_TRUSTED_PREFIX, - .flags = XATTR_TRUSTED_T, - .get = ll_xattr_get, - .set = ll_xattr_set, -}; - -static const struct xattr_handler ll_security_xattr_handler = { - .prefix = XATTR_SECURITY_PREFIX, - .flags = XATTR_SECURITY_T, - .get = ll_xattr_get_common, - .set = ll_xattr_set_common, -}; - -static const struct xattr_handler ll_acl_access_xattr_handler = { - .prefix = XATTR_NAME_POSIX_ACL_ACCESS, - .flags = XATTR_ACL_ACCESS_T, - .get = ll_xattr_get_common, - .set = ll_xattr_set_common, -}; - -static const struct xattr_handler ll_acl_default_xattr_handler = { - .prefix = XATTR_NAME_POSIX_ACL_DEFAULT, - .flags = XATTR_ACL_DEFAULT_T, - .get = ll_xattr_get_common, - .set = ll_xattr_set_common, -}; - -static const struct xattr_handler ll_lustre_xattr_handler = { - .prefix = XATTR_LUSTRE_PREFIX, - .flags = XATTR_LUSTRE_T, - .get = ll_xattr_get, - .set = ll_xattr_set, -}; - -const struct xattr_handler *ll_xattr_handlers[] = { - &ll_user_xattr_handler, - 
&ll_trusted_xattr_handler, - &ll_security_xattr_handler, -#ifdef CONFIG_FS_POSIX_ACL - &ll_acl_access_xattr_handler, - &ll_acl_default_xattr_handler, -#endif - &ll_lustre_xattr_handler, - NULL, -}; diff --git a/drivers/staging/lustre/lustre/llite/xattr_cache.c b/drivers/staging/lustre/lustre/llite/xattr_cache.c deleted file mode 100644 index 4dc799d60a9f..000000000000 --- a/drivers/staging/lustre/lustre/llite/xattr_cache.c +++ /dev/null @@ -1,523 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright 2012 Xyratex Technology Limited - * - * Copyright (c) 2013, 2015, Intel Corporation. - * - * Author: Andrew Perepechko <Andrew_Perepechko@xyratex.com> - * - */ - -#define DEBUG_SUBSYSTEM S_LLITE - -#include <linux/fs.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <obd_support.h> -#include <lustre_dlm.h> -#include "llite_internal.h" - -/* If we ever have hundreds of extended attributes, we might want to consider - * using a hash or a tree structure instead of list for faster lookups. - */ -struct ll_xattr_entry { - struct list_head xe_list; /* protected with - * lli_xattrs_list_rwsem - */ - char *xe_name; /* xattr name, \0-terminated */ - char *xe_value; /* xattr value */ - unsigned int xe_namelen; /* strlen(xe_name) + 1 */ - unsigned int xe_vallen; /* xattr value length */ -}; - -static struct kmem_cache *xattr_kmem; -static struct lu_kmem_descr xattr_caches[] = { - { - .ckd_cache = &xattr_kmem, - .ckd_name = "xattr_kmem", - .ckd_size = sizeof(struct ll_xattr_entry) - }, - { - .ckd_cache = NULL - } -}; - -int ll_xattr_init(void) -{ - return lu_kmem_init(xattr_caches); -} - -void ll_xattr_fini(void) -{ - lu_kmem_fini(xattr_caches); -} - -/** - * Initializes xattr cache for an inode. - * - * This initializes the xattr list and marks cache presence. 
- */ -static void ll_xattr_cache_init(struct ll_inode_info *lli) -{ - INIT_LIST_HEAD(&lli->lli_xattrs); - set_bit(LLIF_XATTR_CACHE, &lli->lli_flags); -} - -/** - * This looks for a specific extended attribute. - * - * Find in @cache and return @xattr_name attribute in @xattr, - * for the NULL @xattr_name return the first cached @xattr. - * - * \retval 0 success - * \retval -ENODATA if not found - */ -static int ll_xattr_cache_find(struct list_head *cache, - const char *xattr_name, - struct ll_xattr_entry **xattr) -{ - struct ll_xattr_entry *entry; - - list_for_each_entry(entry, cache, xe_list) { - /* xattr_name == NULL means look for any entry */ - if (!xattr_name || strcmp(xattr_name, entry->xe_name) == 0) { - *xattr = entry; - CDEBUG(D_CACHE, "find: [%s]=%.*s\n", - entry->xe_name, entry->xe_vallen, - entry->xe_value); - return 0; - } - } - - return -ENODATA; -} - -/** - * This adds an xattr. - * - * Add @xattr_name attr with @xattr_val value and @xattr_val_len length, - * - * \retval 0 success - * \retval -ENOMEM if no memory could be allocated for the cached attr - * \retval -EPROTO if duplicate xattr is being added - */ -static int ll_xattr_cache_add(struct list_head *cache, - const char *xattr_name, - const char *xattr_val, - unsigned int xattr_val_len) -{ - struct ll_xattr_entry *xattr; - - if (ll_xattr_cache_find(cache, xattr_name, &xattr) == 0) { - CDEBUG(D_CACHE, "duplicate xattr: [%s]\n", xattr_name); - return -EPROTO; - } - - xattr = kmem_cache_zalloc(xattr_kmem, GFP_NOFS); - if (!xattr) { - CDEBUG(D_CACHE, "failed to allocate xattr\n"); - return -ENOMEM; - } - - xattr->xe_name = kstrdup(xattr_name, GFP_NOFS); - if (!xattr->xe_name) { - CDEBUG(D_CACHE, "failed to alloc xattr name %u\n", - xattr->xe_namelen); - goto err_name; - } - xattr->xe_value = kmemdup(xattr_val, xattr_val_len, GFP_NOFS); - if (!xattr->xe_value) - goto err_value; - - xattr->xe_vallen = xattr_val_len; - list_add(&xattr->xe_list, cache); - - CDEBUG(D_CACHE, "set: [%s]=%.*s\n", 
xattr_name, xattr_val_len, - xattr_val); - - return 0; -err_value: - kfree(xattr->xe_name); -err_name: - kmem_cache_free(xattr_kmem, xattr); - - return -ENOMEM; -} - -/** - * This removes an extended attribute from cache. - * - * Remove @xattr_name attribute from @cache. - * - * \retval 0 success - * \retval -ENODATA if @xattr_name is not cached - */ -static int ll_xattr_cache_del(struct list_head *cache, - const char *xattr_name) -{ - struct ll_xattr_entry *xattr; - - CDEBUG(D_CACHE, "del xattr: %s\n", xattr_name); - - if (ll_xattr_cache_find(cache, xattr_name, &xattr) == 0) { - list_del(&xattr->xe_list); - kfree(xattr->xe_name); - kfree(xattr->xe_value); - kmem_cache_free(xattr_kmem, xattr); - - return 0; - } - - return -ENODATA; -} - -/** - * This iterates cached extended attributes. - * - * Walk over cached attributes in @cache and - * fill in @xld_buffer or only calculate buffer - * size if @xld_buffer is NULL. - * - * \retval >= 0 buffer list size - * \retval -ENODATA if the list cannot fit @xld_size buffer - */ -static int ll_xattr_cache_list(struct list_head *cache, - char *xld_buffer, - int xld_size) -{ - struct ll_xattr_entry *xattr, *tmp; - int xld_tail = 0; - - list_for_each_entry_safe(xattr, tmp, cache, xe_list) { - CDEBUG(D_CACHE, "list: buffer=%p[%d] name=%s\n", - xld_buffer, xld_tail, xattr->xe_name); - - if (xld_buffer) { - xld_size -= xattr->xe_namelen; - if (xld_size < 0) - break; - memcpy(&xld_buffer[xld_tail], - xattr->xe_name, xattr->xe_namelen); - } - xld_tail += xattr->xe_namelen; - } - - if (xld_size < 0) - return -ERANGE; - - return xld_tail; -} - -/** - * Check if the xattr cache is initialized (filled). - * - * \retval 0 @cache is not initialized - * \retval 1 @cache is initialized - */ -static int ll_xattr_cache_valid(struct ll_inode_info *lli) -{ - return test_bit(LLIF_XATTR_CACHE, &lli->lli_flags); -} - -/** - * This finalizes the xattr cache. - * - * Free all xattr memory. @lli is the inode info pointer. 
- * - * \retval 0 no error occurred - */ -static int ll_xattr_cache_destroy_locked(struct ll_inode_info *lli) -{ - if (!ll_xattr_cache_valid(lli)) - return 0; - - while (ll_xattr_cache_del(&lli->lli_xattrs, NULL) == 0) - ; /* empty loop */ - - clear_bit(LLIF_XATTR_CACHE, &lli->lli_flags); - - return 0; -} - -int ll_xattr_cache_destroy(struct inode *inode) -{ - struct ll_inode_info *lli = ll_i2info(inode); - int rc; - - down_write(&lli->lli_xattrs_list_rwsem); - rc = ll_xattr_cache_destroy_locked(lli); - up_write(&lli->lli_xattrs_list_rwsem); - - return rc; -} - -/** - * Match or enqueue a PR lock. - * - * Find or request an LDLM lock with xattr data. - * Since LDLM does not provide API for atomic match_or_enqueue, - * the function handles it with a separate enq lock. - * If successful, the function exits with the list lock held. - * - * \retval 0 no error occurred - * \retval -ENOMEM not enough memory - */ -static int ll_xattr_find_get_lock(struct inode *inode, - struct lookup_intent *oit, - struct ptlrpc_request **req) -{ - enum ldlm_mode mode; - struct lustre_handle lockh = { 0 }; - struct md_op_data *op_data; - struct ll_inode_info *lli = ll_i2info(inode); - struct ldlm_enqueue_info einfo = { - .ei_type = LDLM_IBITS, - .ei_mode = it_to_lock_mode(oit), - .ei_cb_bl = &ll_md_blocking_ast, - .ei_cb_cp = &ldlm_completion_ast, - }; - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct obd_export *exp = sbi->ll_md_exp; - int rc; - - mutex_lock(&lli->lli_xattrs_enq_lock); - /* inode may have been shrunk and recreated, so data is gone, match lock - * only when data exists. - */ - if (ll_xattr_cache_valid(lli)) { - /* Try matching first. */ - mode = ll_take_md_lock(inode, MDS_INODELOCK_XATTR, &lockh, 0, - LCK_PR); - if (mode != 0) { - /* fake oit in mdc_revalidate_lock() manner */ - oit->it_lock_handle = lockh.cookie; - oit->it_lock_mode = mode; - goto out; - } - } - - /* Enqueue if the lock isn't cached locally. 
*/ - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) { - mutex_unlock(&lli->lli_xattrs_enq_lock); - return PTR_ERR(op_data); - } - - op_data->op_valid = OBD_MD_FLXATTR | OBD_MD_FLXATTRLS; - - rc = md_enqueue(exp, &einfo, NULL, oit, op_data, &lockh, 0); - ll_finish_md_op_data(op_data); - - if (rc < 0) { - CDEBUG(D_CACHE, - "md_intent_lock failed with %d for fid " DFID "\n", - rc, PFID(ll_inode2fid(inode))); - mutex_unlock(&lli->lli_xattrs_enq_lock); - return rc; - } - - *req = oit->it_request; -out: - down_write(&lli->lli_xattrs_list_rwsem); - mutex_unlock(&lli->lli_xattrs_enq_lock); - - return 0; -} - -/** - * Refill the xattr cache. - * - * Fetch and cache the whole of xattrs for @inode, acquiring - * a read or a write xattr lock depending on operation in @oit. - * Intent is dropped on exit unless the operation is setxattr. - * - * \retval 0 no error occurred - * \retval -EPROTO network protocol error - * \retval -ENOMEM not enough memory for the cache - */ -static int ll_xattr_cache_refill(struct inode *inode, struct lookup_intent *oit) -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ptlrpc_request *req = NULL; - const char *xdata, *xval, *xtail, *xvtail; - struct ll_inode_info *lli = ll_i2info(inode); - struct mdt_body *body; - __u32 *xsizes; - int rc, i; - - rc = ll_xattr_find_get_lock(inode, oit, &req); - if (rc) - goto out_no_unlock; - - /* Do we have the data at this point? */ - if (ll_xattr_cache_valid(lli)) { - ll_stats_ops_tally(sbi, LPROC_LL_GETXATTR_HITS, 1); - rc = 0; - goto out_maybe_drop; - } - - /* Matched but no cache? Cancelled on error by a parallel refill. 
*/ - if (unlikely(!req)) { - CDEBUG(D_CACHE, "cancelled by a parallel getxattr\n"); - rc = -EIO; - goto out_maybe_drop; - } - - if (oit->it_status < 0) { - CDEBUG(D_CACHE, - "getxattr intent returned %d for fid " DFID "\n", - oit->it_status, PFID(ll_inode2fid(inode))); - rc = oit->it_status; - /* xattr data is so large that we don't want to cache it */ - if (rc == -ERANGE) - rc = -EAGAIN; - goto out_destroy; - } - - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - if (!body) { - CERROR("no MDT BODY in the refill xattr reply\n"); - rc = -EPROTO; - goto out_destroy; - } - /* do not need swab xattr data */ - xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, - body->mbo_eadatasize); - xval = req_capsule_server_sized_get(&req->rq_pill, &RMF_EAVALS, - body->mbo_aclsize); - xsizes = req_capsule_server_sized_get(&req->rq_pill, &RMF_EAVALS_LENS, - body->mbo_max_mdsize * sizeof(__u32)); - if (!xdata || !xval || !xsizes) { - CERROR("wrong setxattr reply\n"); - rc = -EPROTO; - goto out_destroy; - } - - xtail = xdata + body->mbo_eadatasize; - xvtail = xval + body->mbo_aclsize; - - CDEBUG(D_CACHE, "caching: xdata=%p xtail=%p\n", xdata, xtail); - - ll_xattr_cache_init(lli); - - for (i = 0; i < body->mbo_max_mdsize; i++) { - CDEBUG(D_CACHE, "caching [%s]=%.*s\n", xdata, *xsizes, xval); - /* Perform consistency checks: attr names and vals in pill */ - if (!memchr(xdata, 0, xtail - xdata)) { - CERROR("xattr protocol violation (names are broken)\n"); - rc = -EPROTO; - } else if (xval + *xsizes > xvtail) { - CERROR("xattr protocol violation (vals are broken)\n"); - rc = -EPROTO; - } else if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_XATTR_ENOMEM)) { - rc = -ENOMEM; - } else if (!strcmp(xdata, XATTR_NAME_ACL_ACCESS)) { - /* Filter out ACL ACCESS since it's cached separately */ - CDEBUG(D_CACHE, "not caching %s\n", - XATTR_NAME_ACL_ACCESS); - rc = 0; - } else if (!strcmp(xdata, "security.selinux")) { - /* Filter out security.selinux, it is cached in slab */ - 
CDEBUG(D_CACHE, "not caching security.selinux\n"); - rc = 0; - } else { - rc = ll_xattr_cache_add(&lli->lli_xattrs, xdata, xval, - *xsizes); - } - if (rc < 0) { - ll_xattr_cache_destroy_locked(lli); - goto out_destroy; - } - xdata += strlen(xdata) + 1; - xval += *xsizes; - xsizes++; - } - - if (xdata != xtail || xval != xvtail) - CERROR("a hole in xattr data\n"); - - ll_set_lock_data(sbi->ll_md_exp, inode, oit, NULL); - - goto out_maybe_drop; -out_maybe_drop: - - ll_intent_drop_lock(oit); - - if (rc != 0) - up_write(&lli->lli_xattrs_list_rwsem); -out_no_unlock: - ptlrpc_req_finished(req); - - return rc; - -out_destroy: - up_write(&lli->lli_xattrs_list_rwsem); - - ldlm_lock_decref_and_cancel((struct lustre_handle *) - &oit->it_lock_handle, - oit->it_lock_mode); - - goto out_no_unlock; -} - -/** - * Get an xattr value or list xattrs using the write-through cache. - * - * Get the xattr value (@valid has OBD_MD_FLXATTR set) of @name or - * list xattr names (@valid has OBD_MD_FLXATTRLS set) for @inode. - * The resulting value/list is stored in @buffer if the former - * is not larger than @size. 
- * - * \retval 0 no error occurred - * \retval -EPROTO network protocol error - * \retval -ENOMEM not enough memory for the cache - * \retval -ERANGE the buffer is not large enough - * \retval -ENODATA no such attr or the list is empty - */ -int ll_xattr_cache_get(struct inode *inode, const char *name, char *buffer, - size_t size, __u64 valid) -{ - struct lookup_intent oit = { .it_op = IT_GETXATTR }; - struct ll_inode_info *lli = ll_i2info(inode); - int rc = 0; - - LASSERT(!!(valid & OBD_MD_FLXATTR) ^ !!(valid & OBD_MD_FLXATTRLS)); - - down_read(&lli->lli_xattrs_list_rwsem); - if (!ll_xattr_cache_valid(lli)) { - up_read(&lli->lli_xattrs_list_rwsem); - rc = ll_xattr_cache_refill(inode, &oit); - if (rc) - return rc; - downgrade_write(&lli->lli_xattrs_list_rwsem); - } else { - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR_HITS, 1); - } - - if (valid & OBD_MD_FLXATTR) { - struct ll_xattr_entry *xattr; - - rc = ll_xattr_cache_find(&lli->lli_xattrs, name, &xattr); - if (rc == 0) { - rc = xattr->xe_vallen; - /* zero size means we are only requested size in rc */ - if (size != 0) { - if (size >= xattr->xe_vallen) - memcpy(buffer, xattr->xe_value, - xattr->xe_vallen); - else - rc = -ERANGE; - } - } - } else if (valid & OBD_MD_FLXATTRLS) { - rc = ll_xattr_cache_list(&lli->lli_xattrs, - size ? buffer : NULL, size); - } - - goto out; -out: - up_read(&lli->lli_xattrs_list_rwsem); - - return rc; -} diff --git a/drivers/staging/lustre/lustre/llite/xattr_security.c b/drivers/staging/lustre/lustre/llite/xattr_security.c deleted file mode 100644 index 93ec07531ac7..000000000000 --- a/drivers/staging/lustre/lustre/llite/xattr_security.c +++ /dev/null @@ -1,96 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see http://www.gnu.org/licenses - * - * GPL HEADER END - */ - -/* - * Copyright (c) 2014 Bull SAS - * Author: Sebastien Buisson sebastien.buisson@bull.net - */ - -/* - * lustre/llite/xattr_security.c - * Handler for storing security labels as extended attributes. - */ - -#include <linux/types.h> -#include <linux/security.h> -#include <linux/selinux.h> -#include <linux/xattr.h> -#include "llite_internal.h" - -/** - * A helper function for ll_security_inode_init_security() - * that takes care of setting xattrs - * - * Get security context of @inode from @xattr_array, - * and put it in 'security.xxx' xattr of dentry - * stored in @fs_info. 
- * - * \retval 0 success - * \retval -ENOMEM if no memory could be allocated for xattr name - * \retval < 0 failure to set xattr - */ -static int -ll_initxattrs(struct inode *inode, const struct xattr *xattr_array, - void *fs_info) -{ - struct dentry *dentry = fs_info; - const struct xattr *xattr; - int err = 0; - - for (xattr = xattr_array; xattr->name; xattr++) { - char *full_name; - - full_name = kasprintf(GFP_KERNEL, "%s%s", - XATTR_SECURITY_PREFIX, xattr->name); - if (!full_name) { - err = -ENOMEM; - break; - } - - err = __vfs_setxattr(dentry, inode, full_name, xattr->value, - xattr->value_len, XATTR_CREATE); - kfree(full_name); - if (err < 0) - break; - } - return err; -} - -/** - * Initializes security context - * - * Get security context of @inode in @dir, - * and put it in 'security.xxx' xattr of @dentry. - * - * \retval 0 success, or SELinux is disabled - * \retval -ENOMEM if no memory could be allocated for xattr name - * \retval < 0 failure to get security context or set xattr - */ -int -ll_init_security(struct dentry *dentry, struct inode *inode, struct inode *dir) -{ - if (!selinux_is_enabled()) - return 0; - - return security_inode_init_security(inode, dir, NULL, - &ll_initxattrs, dentry); -} |