Diffstat (limited to 'drivers/staging/lustre/lustre/include')
-rw-r--r--  drivers/staging/lustre/lustre/include/cl_object.h | 3279
-rw-r--r--  drivers/staging/lustre/lustre/include/dt_object.h | 1498
-rw-r--r--  drivers/staging/lustre/lustre/include/interval_tree.h | 124
-rw-r--r--  drivers/staging/lustre/lustre/include/ioctl.h | 106
-rw-r--r--  drivers/staging/lustre/lustre/include/lclient.h | 437
-rw-r--r--  drivers/staging/lustre/lustre/include/linux/lprocfs_status.h | 58
-rw-r--r--  drivers/staging/lustre/lustre/include/linux/lustre_acl.h | 66
-rw-r--r--  drivers/staging/lustre/lustre/include/linux/lustre_common.h | 22
-rw-r--r--  drivers/staging/lustre/lustre/include/linux/lustre_compat25.h | 349
-rw-r--r--  drivers/staging/lustre/lustre/include/linux/lustre_debug.h | 47
-rw-r--r--  drivers/staging/lustre/lustre/include/linux/lustre_dlm.h | 46
-rw-r--r--  drivers/staging/lustre/lustre/include/linux/lustre_fsfilt.h | 181
-rw-r--r--  drivers/staging/lustre/lustre/include/linux/lustre_handles.h | 53
-rw-r--r--  drivers/staging/lustre/lustre/include/linux/lustre_intent.h | 62
-rw-r--r--  drivers/staging/lustre/lustre/include/linux/lustre_lib.h | 87
-rw-r--r--  drivers/staging/lustre/lustre/include/linux/lustre_lite.h | 100
-rw-r--r--  drivers/staging/lustre/lustre/include/linux/lustre_log.h | 57
-rw-r--r--  drivers/staging/lustre/lustre/include/linux/lustre_net.h | 50
-rw-r--r--  drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h | 83
-rw-r--r--  drivers/staging/lustre/lustre/include/linux/lustre_quota.h | 47
-rw-r--r--  drivers/staging/lustre/lustre/include/linux/lustre_user.h | 67
-rw-r--r--  drivers/staging/lustre/lustre/include/linux/lvfs.h | 134
-rw-r--r--  drivers/staging/lustre/lustre/include/linux/lvfs_linux.h | 66
-rw-r--r--  drivers/staging/lustre/lustre/include/linux/obd.h | 128
-rw-r--r--  drivers/staging/lustre/lustre/include/linux/obd_class.h | 58
-rw-r--r--  drivers/staging/lustre/lustre/include/linux/obd_support.h | 63
-rw-r--r--  drivers/staging/lustre/lustre/include/lprocfs_status.h | 1043
-rw-r--r--  drivers/staging/lustre/lustre/include/lu_object.h | 1346
-rw-r--r--  drivers/staging/lustre/lustre/include/lu_ref.h | 170
-rw-r--r--  drivers/staging/lustre/lustre/include/lu_target.h | 91
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre/libiam.h | 145
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre/liblustreapi.h | 43
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h | 121
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h | 2
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre/lustre_idl.h | 3653
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h | 95
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre/lustre_user.h | 1145
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre/lustreapi.h | 310
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_acl.h | 42
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_capa.h | 305
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_cfg.h | 299
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_debug.h | 76
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_disk.h | 543
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_dlm.h | 1671
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_eacl.h | 95
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_export.h | 389
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_fid.h | 762
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_fld.h | 202
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_fsfilt.h | 48
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_ha.h | 67
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_handles.h | 93
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_idmap.h | 104
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_import.h | 367
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_lib.h | 667
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_linkea.h | 57
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_lite.h | 147
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_log.h | 576
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_mdc.h | 176
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_mds.h | 81
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_mdt.h | 84
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_net.h | 3451
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_param.h | 121
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_quota.h | 239
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_req_layout.h | 334
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_sec.h | 1145
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_update.h | 189
-rw-r--r--  drivers/staging/lustre/lustre/include/lustre_ver.h | 24
-rw-r--r--  drivers/staging/lustre/lustre/include/lvfs.h | 57
-rw-r--r--  drivers/staging/lustre/lustre/include/md_object.h | 908
-rw-r--r--  drivers/staging/lustre/lustre/include/obd.h | 1677
-rw-r--r--  drivers/staging/lustre/lustre/include/obd_cache.h | 39
-rw-r--r--  drivers/staging/lustre/lustre/include/obd_cksum.h | 176
-rw-r--r--  drivers/staging/lustre/lustre/include/obd_class.h | 2281
-rw-r--r--  drivers/staging/lustre/lustre/include/obd_lov.h | 126
-rw-r--r--  drivers/staging/lustre/lustre/include/obd_ost.h | 96
-rw-r--r--  drivers/staging/lustre/lustre/include/obd_support.h | 851
76 files changed, 33997 insertions(+), 0 deletions(-)
diff --git a/drivers/staging/lustre/lustre/include/cl_object.h b/drivers/staging/lustre/lustre/include/cl_object.h
new file mode 100644
index 000000000000..4bb68801d3a9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/cl_object.h
@@ -0,0 +1,3279 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#ifndef _LUSTRE_CL_OBJECT_H
+#define _LUSTRE_CL_OBJECT_H
+
+/** \defgroup clio clio
+ *
+ * Client objects implement io operations and cache pages.
+ *
+ * Examples: lov and osc are implementations of the cl interface.
+ *
+ * Big Theory Statement.
+ *
+ * Layered objects.
+ *
+ * Client implementation is based on the following data-types:
+ *
+ * - cl_object
+ *
+ * - cl_page
+ *
+ * - cl_lock represents an extent lock on an object.
+ *
+ * - cl_io represents high-level i/o activity such as a whole read/write
+ *   system call, or write-out of pages from under the lock being
+ *   canceled. cl_io has sub-ios that can be stopped and resumed
+ *   independently, thus achieving a high degree of transfer
+ *   parallelism. A single cl_io can be advanced forward by
+ *   multiple threads (although in the most usual case of a
+ *   read/write system call it is associated with the single user
+ *   thread that issued the system call).
+ *
+ * - cl_req represents a collection of pages for a transfer. cl_req is
+ *   constructed by the req-forming engine that tries to saturate
+ *   the transport with large and continuous transfers.
+ *
+ * Terminology
+ *
+ * - to avoid confusion, a high-level I/O operation, like a read or write
+ *   system call, is referred to as "an io", whereas a low-level I/O
+ *   operation, like an RPC, is referred to as "a transfer";
+ *
+ * - "generic code" means generic (not file system specific) code in the
+ * hosting environment. "cl-code" means code (mostly in cl_*.c files) that
+ * is not layer specific.
+ *
+ * Locking.
+ *
+ * - i_mutex
+ * - PG_locked
+ * - cl_object_header::coh_page_guard
+ * - cl_object_header::coh_lock_guard
+ * - lu_site::ls_guard
+ *
+ * See the top comment in cl_object.c for the description of overall locking and
+ * reference-counting design.
+ *
+ * See comments below for the description of i/o, page, and dlm-locking
+ * design.
+ *
+ * @{
+ */
+
+/*
+ * super-class definitions.
+ */
+#include <lu_object.h>
+#include <lvfs.h>
+#include <linux/mutex.h>
+#include <linux/radix-tree.h>
+
+struct inode;
+
+struct cl_device;
+struct cl_device_operations;
+
+struct cl_object;
+struct cl_object_page_operations;
+struct cl_object_lock_operations;
+
+struct cl_page;
+struct cl_page_slice;
+struct cl_lock;
+struct cl_lock_slice;
+
+struct cl_lock_operations;
+struct cl_page_operations;
+
+struct cl_io;
+struct cl_io_slice;
+
+struct cl_req;
+struct cl_req_slice;
+
+/**
+ * Operations for each data device in the client stack.
+ *
+ * \see vvp_cl_ops, lov_cl_ops, lovsub_cl_ops, osc_cl_ops
+ */
+struct cl_device_operations {
+ /**
+ * Initialize cl_req. This method is called top-to-bottom on all
	 * devices in the stack to give them a chance to allocate layer-private
	 * data, and to attach it to the cl_req by calling
+ * cl_req_slice_add().
+ *
+ * \see osc_req_init(), lov_req_init(), lovsub_req_init()
+ * \see ccc_req_init()
+ */
+ int (*cdo_req_init)(const struct lu_env *env, struct cl_device *dev,
+ struct cl_req *req);
+};
+
+/**
+ * Device in the client stack.
+ *
+ * \see ccc_device, lov_device, lovsub_device, osc_device
+ */
+struct cl_device {
+ /** Super-class. */
+ struct lu_device cd_lu_dev;
+ /** Per-layer operation vector. */
+ const struct cl_device_operations *cd_ops;
+};
+
+/** \addtogroup cl_object cl_object
+ * @{ */
+/**
+ * "Data attributes" of cl_object. Data attributes can be updated
+ * independently for a sub-object, and the top-object's attributes are
+ * calculated from those of its sub-objects.
+ */
+struct cl_attr {
+ /** Object size, in bytes */
+ loff_t cat_size;
+ /**
+ * Known minimal size, in bytes.
+ *
+ * This is only valid when at least one DLM lock is held.
+ */
+ loff_t cat_kms;
+ /** Modification time. Measured in seconds since epoch. */
+ time_t cat_mtime;
+ /** Access time. Measured in seconds since epoch. */
+ time_t cat_atime;
+ /** Change time. Measured in seconds since epoch. */
+ time_t cat_ctime;
+ /**
+ * Blocks allocated to this cl_object on the server file system.
+ *
+ * \todo XXX An interface for block size is needed.
+ */
+ __u64 cat_blocks;
+ /**
+ * User identifier for quota purposes.
+ */
+ uid_t cat_uid;
+ /**
+ * Group identifier for quota purposes.
+ */
+ gid_t cat_gid;
+};
+
+/**
+ * Fields in cl_attr that are being set.
+ */
+enum cl_attr_valid {
+ CAT_SIZE = 1 << 0,
+ CAT_KMS = 1 << 1,
+ CAT_MTIME = 1 << 3,
+ CAT_ATIME = 1 << 4,
+ CAT_CTIME = 1 << 5,
+ CAT_BLOCKS = 1 << 6,
+ CAT_UID = 1 << 7,
+ CAT_GID = 1 << 8
+};
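+
+/*
+ * Usage sketch (assuming the cl_object_attr_set() entry point of this
+ * interface): a caller updating, say, size and mtime combines the
+ * corresponding bits into the \a valid mask:
+ *
+ *	attr->cat_size = new_size;
+ *	attr->cat_mtime = now;
+ *	cl_object_attr_set(env, obj, attr, CAT_SIZE | CAT_MTIME);
+ */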
+
+/**
+ * Sub-class of lu_object with methods common for objects on the client
+ * stacks.
+ *
+ * cl_object: represents a regular file system object, both a file and a
+ * stripe. cl_object is based on lu_object: it is identified by a fid,
+ * layered, cached, hashed, and lrued. An important distinction from the
+ * server side, where md_object and dt_object are used, is that cl_object
+ * "fans out" at the lov/sns level: depending on the file layout, a single
+ * file is represented as a set of "sub-objects" (stripes). At the
+ * implementation level, struct lov_object contains an array of cl_objects.
+ * Each sub-object is a full-fledged cl_object, having its own fid and
+ * living in the lru and hash table.
+ *
+ * This leads to the next important difference from the server side: on the
+ * client, it's quite usual to have objects with different sequences of
+ * layers. For example, a typical top-object is composed of the following
+ * layers:
+ *
+ * - vvp
+ * - lov
+ *
+ * whereas its sub-objects are composed of
+ *
+ * - lovsub
+ * - osc
+ *
+ * layers. Here "lovsub" is a mostly dummy layer, whose purpose is to keep
+ * track of the object-subobject relationship.
+ *
+ * Sub-objects are not cached independently: when the top-object is about to
+ * be discarded from memory, all its sub-objects are torn down and
+ * destroyed too.
+ *
+ * \see ccc_object, lov_object, lovsub_object, osc_object
+ */
+struct cl_object {
+ /** super class */
+ struct lu_object co_lu;
+ /** per-object-layer operations */
+ const struct cl_object_operations *co_ops;
+ /** offset of page slice in cl_page buffer */
+ int co_slice_off;
+};
+
+/**
+ * Description of the client object configuration. This is used for the
+ * creation of a new client object that is identified by more state than
+ * a fid alone.
+ */
+struct cl_object_conf {
+ /** Super-class. */
+ struct lu_object_conf coc_lu;
+ union {
+ /**
+ * Object layout. This is consumed by lov.
+ */
+ struct lustre_md *coc_md;
+ /**
+ * Description of particular stripe location in the
+ * cluster. This is consumed by osc.
+ */
+ struct lov_oinfo *coc_oinfo;
+ } u;
+ /**
+ * VFS inode. This is consumed by vvp.
+ */
+ struct inode *coc_inode;
+ /**
+ * Layout lock handle.
+ */
+ struct ldlm_lock *coc_lock;
+ /**
+ * Operation to handle layout, OBJECT_CONF_XYZ.
+ */
+ int coc_opc;
+};
+
+enum {
+ /** configure layout, set up a new stripe, must be called while
+ * holding layout lock. */
+ OBJECT_CONF_SET = 0,
+ /** invalidate the current stripe configuration due to losing
+ * layout lock. */
+ OBJECT_CONF_INVALIDATE = 1,
+ /** wait for old layout to go away so that new layout can be
+ * set up. */
+ OBJECT_CONF_WAIT = 2
+};
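+
+/*
+ * A sketch (assuming the cl_conf_set() entry point of this interface):
+ * invalidating the layout after the layout lock is lost could look like
+ *
+ *	struct cl_object_conf conf = {
+ *		.coc_opc = OBJECT_CONF_INVALIDATE,
+ *	};
+ *
+ *	cl_conf_set(env, obj, &conf);
+ */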
+
+/**
+ * Operations implemented for each cl object layer.
+ *
+ * \see vvp_ops, lov_ops, lovsub_ops, osc_ops
+ */
+struct cl_object_operations {
+ /**
+ * Initialize page slice for this layer. Called top-to-bottom through
	 * every object layer when a new cl_page is instantiated. A layer
	 * keeping private per-page data, or requiring its own page operations
	 * vector, should allocate that data here and attach it to the page
	 * by calling cl_page_slice_add(). \a vmpage is locked (in the VM
+ * sense). Optional.
+ *
	 * \retval 0 success.
	 *
	 * \retval -ve failure code.
+ */
+ int (*coo_page_init)(const struct lu_env *env, struct cl_object *obj,
+ struct cl_page *page, struct page *vmpage);
+ /**
+ * Initialize lock slice for this layer. Called top-to-bottom through
	 * every object layer when a new cl_lock is instantiated. A layer
	 * keeping private per-lock data, or requiring its own lock operations
	 * vector, should allocate that data here and attach it to the lock
	 * by calling cl_lock_slice_add(). Mandatory.
+ */
+ int (*coo_lock_init)(const struct lu_env *env,
+ struct cl_object *obj, struct cl_lock *lock,
+ const struct cl_io *io);
+ /**
+ * Initialize io state for a given layer.
+ *
	 * Called top-to-bottom once per io existence to initialize io
	 * state. If a layer wants to keep some state for this type of io, it
	 * has to embed struct cl_io_slice in lu_env::le_ses, and register the
	 * slice with cl_io_slice_add(). It is guaranteed that all threads
+ * participating in this io share the same session.
+ */
+ int (*coo_io_init)(const struct lu_env *env,
+ struct cl_object *obj, struct cl_io *io);
+ /**
+ * Fill portion of \a attr that this layer controls. This method is
+ * called top-to-bottom through all object layers.
+ *
+ * \pre cl_object_header::coh_attr_guard of the top-object is locked.
+ *
+ * \return 0: to continue
+ * \return +ve: to stop iterating through layers (but 0 is returned
+ * from enclosing cl_object_attr_get())
+ * \return -ve: to signal error
+ */
+ int (*coo_attr_get)(const struct lu_env *env, struct cl_object *obj,
+ struct cl_attr *attr);
+ /**
+ * Update attributes.
+ *
+ * \a valid is a bitmask composed from enum #cl_attr_valid, and
+ * indicating what attributes are to be set.
+ *
+ * \pre cl_object_header::coh_attr_guard of the top-object is locked.
+ *
+ * \return the same convention as for
+ * cl_object_operations::coo_attr_get() is used.
+ */
+ int (*coo_attr_set)(const struct lu_env *env, struct cl_object *obj,
+ const struct cl_attr *attr, unsigned valid);
+ /**
+ * Update object configuration. Called top-to-bottom to modify object
+ * configuration.
+ *
+ * XXX error conditions and handling.
+ */
+ int (*coo_conf_set)(const struct lu_env *env, struct cl_object *obj,
+ const struct cl_object_conf *conf);
+ /**
	 * Glimpse AST. Executed when a glimpse AST arrives for a lock on this
	 * object. Layers are supposed to fill in the parts of \a lvb that will
	 * be shipped to the glimpse originator as the glimpse result.
+ *
+ * \see ccc_object_glimpse(), lovsub_object_glimpse(),
+ * \see osc_object_glimpse()
+ */
+ int (*coo_glimpse)(const struct lu_env *env,
+ const struct cl_object *obj, struct ost_lvb *lvb);
+};
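+
+/*
+ * A minimal sketch of a per-layer operations vector for a hypothetical
+ * layer "foo" (only coo_lock_init is mandatory, see above):
+ *
+ *	static const struct cl_object_operations foo_ops = {
+ *		.coo_page_init = foo_page_init,
+ *		.coo_lock_init = foo_lock_init,
+ *		.coo_io_init   = foo_io_init,
+ *		.coo_attr_get  = foo_attr_get,
+ *	};
+ */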
+
+/**
+ * Extended header for client object.
+ */
+struct cl_object_header {
+ /** Standard lu_object_header. cl_object::co_lu::lo_header points
+ * here. */
+ struct lu_object_header coh_lu;
+ /** \name locks
+ * \todo XXX move locks below to the separate cache-lines, they are
+ * mostly useless otherwise.
+ */
+ /** @{ */
+ /** Lock protecting page tree. */
+ spinlock_t coh_page_guard;
+ /** Lock protecting lock list. */
+ spinlock_t coh_lock_guard;
+ /** @} locks */
+ /** Radix tree of cl_page's, cached for this object. */
+ struct radix_tree_root coh_tree;
+ /** # of pages in radix tree. */
+ unsigned long coh_pages;
+ /** List of cl_lock's granted for this object. */
+ struct list_head coh_locks;
+
+ /**
+ * Parent object. It is assumed that an object has a well-defined
+ * parent, but not a well-defined child (there may be multiple
+ * sub-objects, for the same top-object). cl_object_header::coh_parent
+ * field allows certain code to be written generically, without
+ * limiting possible cl_object layouts unduly.
+ */
+ struct cl_object_header *coh_parent;
+ /**
+ * Protects consistency between cl_attr of parent object and
+ * attributes of sub-objects, that the former is calculated ("merged")
+ * from.
+ *
+ * \todo XXX this can be read/write lock if needed.
+ */
+ spinlock_t coh_attr_guard;
+ /**
+ * Size of cl_page + page slices
+ */
+ unsigned short coh_page_bufsize;
+ /**
+ * Number of objects above this one: 0 for a top-object, 1 for its
+ * sub-object, etc.
+ */
+ unsigned char coh_nesting;
+};
+
+/**
+ * Helper macro: iterate over all layers of the object \a obj, assigning every
+ * layer top-to-bottom to \a slice.
+ */
+#define cl_object_for_each(slice, obj) \
+ list_for_each_entry((slice), \
+ &(obj)->co_lu.lo_header->loh_layers, \
+ co_lu.lo_linkage)
+/**
+ * Helper macro: iterate over all layers of the object \a obj, assigning every
+ * layer bottom-to-top to \a slice.
+ */
+#define cl_object_for_each_reverse(slice, obj) \
+ list_for_each_entry_reverse((slice), \
+ &(obj)->co_lu.lo_header->loh_layers, \
+ co_lu.lo_linkage)
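+
+/*
+ * Example (a sketch with a hypothetical helper): counting the layers of
+ * an object by walking them top-to-bottom:
+ *
+ *	static int cl_object_nr_layers(struct cl_object *obj)
+ *	{
+ *		struct cl_object *slice;
+ *		int nr = 0;
+ *
+ *		cl_object_for_each(slice, obj)
+ *			nr++;
+ *		return nr;
+ *	}
+ */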
+/** @} cl_object */
+
+#ifndef pgoff_t
+#define pgoff_t unsigned long
+#endif
+
+#define CL_PAGE_EOF ((pgoff_t)~0ull)
+
+/** \addtogroup cl_page cl_page
+ * @{ */
+
+/** \struct cl_page
+ * Layered client page.
+ *
+ * cl_page: represents a portion of a file, cached in the memory. All pages
+ * of the given file are of the same size, and are kept in the radix tree
+ * hanging off the cl_object. cl_page doesn't fan out, but as sub-objects
+ * of the top-level file object are first class cl_objects, they have their
+ * own radix trees of pages and hence a page is implemented as a sequence of
+ * struct cl_page's, linked into a doubly-linked list through
+ * cl_page::cp_parent and cl_page::cp_child pointers, each residing in the
+ * corresponding radix tree at the corresponding logical offset.
+ *
+ * cl_page is associated with a VM page of the hosting environment (struct
+ * page in the Linux kernel, for example). It is assumed that this
+ * association is implemented by one of the cl_page layers (the top layer in
+ * the current design) that
+ *
+ * - intercepts per-VM-page call-backs made by the environment (e.g.,
+ * memory pressure),
+ *
+ * - translates state (page flag bits) and locking between lustre and
+ * environment.
+ *
+ * The association between cl_page and struct page is immutable and
+ * established when cl_page is created.
+ *
+ * cl_page can be "owned" by a particular cl_io (see below), guaranteeing
+ * this io exclusive access to this page w.r.t. other io attempts and
+ * various events changing page state (such as transfer completion, or
+ * eviction of the page from memory). Note that in general a cl_io
+ * cannot be identified with a particular thread, and page ownership is not
+ * exactly equal to the current thread holding a lock on the page. The
+ * layer implementing the association between cl_page and struct page has
+ * to implement ownership on top of available synchronization mechanisms.
+ *
+ * While the Lustre client maintains the notion of page ownership by io,
+ * hosting MM/VM usually has its own page concurrency control
+ * mechanisms. For example, in Linux, page access is synchronized by the
+ * per-page PG_locked bit-lock, and generic kernel code (generic_file_*())
+ * takes care to acquire and release such locks as necessary around the
+ * calls to the file system methods (->readpage(), ->prepare_write(),
+ * ->commit_write(), etc.). This leads to the situation where there are two
+ * different ways to own a page in the client:
+ *
+ * - client code explicitly and voluntarily owns the page (cl_page_own());
+ *
+ * - the VM locks a page and then calls the client, which has to "assume"
+ *   ownership from the VM (cl_page_assume()).
+ *
+ * Dual methods to release ownership are cl_page_disown() and
+ * cl_page_unassume().
+ *
+ * cl_page is reference counted (cl_page::cp_ref). When reference counter
+ * drops to 0, the page is returned to the cache, unless it is in
+ * cl_page_state::CPS_FREEING state, in which case it is immediately
+ * destroyed.
+ *
+ * The general logic guaranteeing the absence of "existential races" for
+ * pages is the following:
+ *
+ * - there are fixed known ways for a thread to obtain a new reference
+ * to a page:
+ *
+ * - by doing a lookup in the cl_object radix tree, protected by the
+ * spin-lock;
+ *
+ * - by starting from VM-locked struct page and following some
+ * hosting environment method (e.g., following ->private pointer in
+ * the case of Linux kernel), see cl_vmpage_page();
+ *
+ * - when the page enters cl_page_state::CPS_FREEING state, all these
+ * ways are severed with the proper synchronization
+ * (cl_page_delete());
+ *
+ * - entry into cl_page_state::CPS_FREEING is serialized by the VM page
+ * lock;
+ *
+ * - no new references to the page in cl_page_state::CPS_FREEING state
+ * are allowed (checked in cl_page_get()).
+ *
+ * Together this guarantees that when the last reference to a
+ * cl_page_state::CPS_FREEING page is released, it is safe to destroy the
+ * page, as no new references to it can be acquired at that point, and
+ * none exist.
+ *
+ * cl_page is a state machine. States are enumerated in enum
+ * cl_page_state. Possible state transitions are enumerated in
+ * cl_page_state_set(). State transition process (i.e., actual changing of
+ * cl_page::cp_state field) is protected by the lock on the underlying VM
+ * page.
+ *
+ * Linux Kernel implementation.
+ *
+ * Binding between cl_page and struct page is implemented in the vvp
+ * layer. cl_page is attached to the ->private pointer of the struct page,
+ * together with the setting of
+ * PG_private bit in page->flags, and acquiring additional reference on the
+ * struct page (much like struct buffer_head, or any similar file system
+ * private data structures).
+ *
+ * PG_locked lock is used to implement both ownership and transfer
+ * synchronization, that is, page is VM-locked in CPS_{OWNED,PAGE{IN,OUT}}
+ * states. No additional references are acquired for the duration of the
+ * transfer.
+ *
+ * \warning *THIS IS NOT* the behavior expected by the Linux kernel, where
+ * write-out is "protected" by the special PG_writeback bit.
+ */
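+
+/*
+ * Ownership usage sketch (assuming the caller already holds references to
+ * \a io and \a page; cl_page_own() and cl_page_disown() are the voluntary
+ * ownership entry points mentioned above):
+ *
+ *	if (cl_page_own(env, io, page) == 0) {
+ *		... exclusive access: no other io owns the page and no
+ *		    transfer is going on against it ...
+ *		cl_page_disown(env, io, page);
+ *	}
+ */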
+
+/**
+ * States of cl_page. cl_page.c assumes particular order here.
+ *
+ * The page state machine is rather crude, as it doesn't recognize finer page
+ * states like "dirty" or "up to date". This is because such states are not
+ * always well defined for the whole stack (see, for example, the
+ * implementation of the read-ahead, that hides page up-to-dateness to track
+ * cache hits accurately). Such sub-states are maintained by the layers that
+ * are interested in them.
+ */
+enum cl_page_state {
+ /**
+ * Page is in the cache, un-owned. Page leaves cached state in the
+ * following cases:
+ *
+ * - [cl_page_state::CPS_OWNED] io comes across the page and
+ * owns it;
+ *
+ * - [cl_page_state::CPS_PAGEOUT] page is dirty, the
+ *	  req-formation engine decides that it wants to include this page
+ *	  in a cl_req being constructed, and yanks it from the cache;
+ *
+ *	- [cl_page_state::CPS_FREEING] VM callback is executed to
+ *	  evict the page from memory;
+ *
+ * \invariant cl_page::cp_owner == NULL && cl_page::cp_req == NULL
+ */
+ CPS_CACHED,
+ /**
+ * Page is exclusively owned by some cl_io. Page may end up in this
+ * state as a result of
+ *
+ * - io creating new page and immediately owning it;
+ *
+ * - [cl_page_state::CPS_CACHED] io finding existing cached page
+ * and owning it;
+ *
+ * - [cl_page_state::CPS_OWNED] io finding existing owned page
+ * and waiting for owner to release the page;
+ *
+ * Page leaves owned state in the following cases:
+ *
+ * - [cl_page_state::CPS_CACHED] io decides to leave the page in
+ * the cache, doing nothing;
+ *
+ * - [cl_page_state::CPS_PAGEIN] io starts read transfer for
+ * this page;
+ *
+ * - [cl_page_state::CPS_PAGEOUT] io starts immediate write
+ * transfer for this page;
+ *
+ * - [cl_page_state::CPS_FREEING] io decides to destroy this
+ * page (e.g., as part of truncate or extent lock cancellation).
+ *
+ * \invariant cl_page::cp_owner != NULL && cl_page::cp_req == NULL
+ */
+ CPS_OWNED,
+ /**
+ * Page is being written out, as a part of a transfer. This state is
+ * entered when req-formation logic decided that it wants this page to
	 * entered when the req-formation logic decides that it wants this page to
+ * this state is achieved, transfer completion handler (with either
+ * success or failure indication) is guaranteed to be executed against
+ * this page independently of any locks and any scheduling decisions
+ * made by the hosting environment (that effectively means that the
+ * page is never put into cl_page_state::CPS_PAGEOUT state "in
+ * advance". This property is mentioned, because it is important when
+ * reasoning about possible dead-locks in the system). The page can
+ * enter this state as a result of
+ *
+ * - [cl_page_state::CPS_OWNED] an io requesting an immediate
+ * write-out of this page, or
+ *
+ * - [cl_page_state::CPS_CACHED] req-forming engine deciding
+ * that it has enough dirty pages cached to issue a "good"
+ * transfer.
+ *
+ * The page leaves cl_page_state::CPS_PAGEOUT state when the transfer
+ * is completed---it is moved into cl_page_state::CPS_CACHED state.
+ *
+ * Underlying VM page is locked for the duration of transfer.
+ *
+ * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL
+ */
+ CPS_PAGEOUT,
+ /**
+ * Page is being read in, as a part of a transfer. This is quite
+ * similar to the cl_page_state::CPS_PAGEOUT state, except that
	 * read-in is always "immediate"---there is no such thing as a sudden
	 * construction of a read cl_req from cached, presumably not up-to-date,
	 * pages.
+ *
+ * Underlying VM page is locked for the duration of transfer.
+ *
+ * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL
+ */
+ CPS_PAGEIN,
+ /**
	 * Page is being destroyed. This state is entered when the client
	 * decides that the page has to be deleted from its host object, e.g.,
	 * as a part of truncate.
+ *
+ * Once this state is reached, there is no way to escape it.
+ *
+ * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req == NULL
+ */
+ CPS_FREEING,
+ CPS_NR
+};
+
+enum cl_page_type {
	/** Host page: the page comes from the host inode to which the
	 * cl_page belongs. */
	CPT_CACHEABLE = 1,

	/** Transient page: a transient cl_page is used to bind a cl_page
	 * to a vmpage that does not belong to the same object as the cl_page.
	 * It is used in DirectIO, lockless IO and liblustre. */
+ CPT_TRANSIENT,
+};
+
+/**
+ * Flags maintained for every cl_page.
+ */
+enum cl_page_flags {
+ /**
+ * Set when pagein completes. Used for debugging (read completes at
+ * most once for a page).
+ */
+ CPF_READ_COMPLETED = 1 << 0
+};
+
+/**
+ * Fields are protected by the lock on struct page, except for atomics and
+ * immutables.
+ *
+ * \invariant Data type invariants are in cl_page_invariant(). Basically:
+ * cl_page::cp_parent and cl_page::cp_child are a well-formed double-linked
+ * list, consistent with the parent/child pointers in the cl_page::cp_obj and
+ * cl_page::cp_owner (when set).
+ */
+struct cl_page {
+ /** Reference counter. */
+ atomic_t cp_ref;
+ /** An object this page is a part of. Immutable after creation. */
+ struct cl_object *cp_obj;
+ /** Logical page index within the object. Immutable after creation. */
+ pgoff_t cp_index;
+ /** List of slices. Immutable after creation. */
+ struct list_head cp_layers;
+ /** Parent page, NULL for top-level page. Immutable after creation. */
+ struct cl_page *cp_parent;
+ /** Lower-layer page. NULL for bottommost page. Immutable after
+ * creation. */
+ struct cl_page *cp_child;
+ /**
+ * Page state. This field is const to avoid accidental update, it is
+ * modified only internally within cl_page.c. Protected by a VM lock.
+ */
+ const enum cl_page_state cp_state;
+ /** Linkage of pages within group. Protected by cl_page::cp_mutex. */
+ struct list_head cp_batch;
+ /** Mutex serializing membership of a page in a batch. */
+ struct mutex cp_mutex;
+ /** Linkage of pages within cl_req. */
+ struct list_head cp_flight;
+ /** Transfer error. */
+ int cp_error;
+
+ /**
+ * Page type. Only CPT_TRANSIENT is used so far. Immutable after
+ * creation.
+ */
+ enum cl_page_type cp_type;
+
+ /**
+ * Owning IO in cl_page_state::CPS_OWNED state. Sub-page can be owned
+ * by sub-io. Protected by a VM lock.
+ */
+ struct cl_io *cp_owner;
+ /**
	 * Debug information: the task owning the page.
+ */
+ task_t *cp_task;
+ /**
+ * Owning IO request in cl_page_state::CPS_PAGEOUT and
+ * cl_page_state::CPS_PAGEIN states. This field is maintained only in
+ * the top-level pages. Protected by a VM lock.
+ */
+ struct cl_req *cp_req;
+ /** List of references to this page, for debugging. */
+ struct lu_ref cp_reference;
+ /** Link to an object, for debugging. */
+ struct lu_ref_link *cp_obj_ref;
+ /** Link to a queue, for debugging. */
+ struct lu_ref_link *cp_queue_ref;
+ /** Per-page flags from enum cl_page_flags. Protected by a VM lock. */
+ unsigned cp_flags;
+ /** Assigned if doing a sync_io */
+ struct cl_sync_io *cp_sync_io;
+};
+
+/**
+ * Per-layer part of cl_page.
+ *
+ * \see ccc_page, lov_page, osc_page
+ */
+struct cl_page_slice {
+ struct cl_page *cpl_page;
+ /**
+ * Object slice corresponding to this page slice. Immutable after
+ * creation.
+ */
+ struct cl_object *cpl_obj;
+ const struct cl_page_operations *cpl_ops;
+ /** Linkage into cl_page::cp_layers. Immutable after creation. */
+ struct list_head cpl_linkage;
+};
+
+/**
+ * Lock mode. For the client extent locks.
+ *
+ * \warning: cl_lock_mode_match() assumes particular ordering here.
+ * \ingroup cl_lock
+ */
+enum cl_lock_mode {
+ /**
+ * Mode of a lock that protects no data, and exists only as a
+ * placeholder. This is used for `glimpse' requests. A phantom lock
+ * might get promoted to real lock at some point.
+ */
+ CLM_PHANTOM,
+ CLM_READ,
+ CLM_WRITE,
+ CLM_GROUP
+};
+
+/**
+ * Requested transfer type.
+ * \ingroup cl_req
+ */
+enum cl_req_type {
+ CRT_READ,
+ CRT_WRITE,
+ CRT_NR
+};
+
+/**
+ * Per-layer page operations.
+ *
+ * Methods taking an \a io argument are for the activity happening in the
+ * context of given \a io. Page is assumed to be owned by that io, except for
+ * the obvious cases (like cl_page_operations::cpo_own()).
+ *
+ * \see vvp_page_ops, lov_page_ops, osc_page_ops
+ */
+struct cl_page_operations {
+ /**
+ * cl_page<->struct page methods. Only one layer in the stack has to
+ * implement these. Current code assumes that this functionality is
+ * provided by the topmost layer, see cl_page_disown0() as an example.
+ */
+
+ /**
+ * \return the underlying VM page. Optional.
+ */
+ struct page *(*cpo_vmpage)(const struct lu_env *env,
+ const struct cl_page_slice *slice);
+ /**
+ * Called when \a io acquires this page into the exclusive
	 * ownership. When this method returns, it is guaranteed that the page
	 * is not owned by another io, and no transfer is going on against
+ * it. Optional.
+ *
+ * \see cl_page_own()
+ * \see vvp_page_own(), lov_page_own()
+ */
+ int (*cpo_own)(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io, int nonblock);
	/** Called when ownership is yielded. Optional.
+ *
+ * \see cl_page_disown()
+ * \see vvp_page_disown()
+ */
+ void (*cpo_disown)(const struct lu_env *env,
+ const struct cl_page_slice *slice, struct cl_io *io);
+ /**
+ * Called for a page that is already "owned" by \a io from VM point of
+ * view. Optional.
+ *
+ * \see cl_page_assume()
+ * \see vvp_page_assume(), lov_page_assume()
+ */
+ void (*cpo_assume)(const struct lu_env *env,
+ const struct cl_page_slice *slice, struct cl_io *io);
+ /** Dual to cl_page_operations::cpo_assume(). Optional. Called
+ * bottom-to-top when IO releases a page without actually unlocking
+ * it.
+ *
+ * \see cl_page_unassume()
+ * \see vvp_page_unassume()
+ */
+ void (*cpo_unassume)(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io);
+ /**
	 * Announces, via \a uptodate, whether the page contains valid data.
+ *
+ * \see cl_page_export()
+ * \see vvp_page_export()
+ */
+ void (*cpo_export)(const struct lu_env *env,
+ const struct cl_page_slice *slice, int uptodate);
+ /**
+ * Unmaps page from the user space (if it is mapped).
+ *
+ * \see cl_page_unmap()
+ * \see vvp_page_unmap()
+ */
+ int (*cpo_unmap)(const struct lu_env *env,
+ const struct cl_page_slice *slice, struct cl_io *io);
+ /**
+ * Checks whether underlying VM page is locked (in the suitable
+ * sense). Used for assertions.
+ *
+ * \retval -EBUSY: page is protected by a lock of a given mode;
+ * \retval -ENODATA: page is not protected by a lock;
+ * \retval 0: this layer cannot decide. (Should never happen.)
+ */
+ int (*cpo_is_vmlocked)(const struct lu_env *env,
+ const struct cl_page_slice *slice);
+ /**
+ * Page destruction.
+ */
+
+ /**
+ * Called when page is truncated from the object. Optional.
+ *
+ * \see cl_page_discard()
+ * \see vvp_page_discard(), osc_page_discard()
+ */
+ void (*cpo_discard)(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io);
+ /**
	 * Called when the page is removed from the cache, and is about to be
+ * destroyed. Optional.
+ *
+ * \see cl_page_delete()
+ * \see vvp_page_delete(), osc_page_delete()
+ */
+ void (*cpo_delete)(const struct lu_env *env,
+ const struct cl_page_slice *slice);
+ /** Destructor. Frees resources and slice itself. */
+ void (*cpo_fini)(const struct lu_env *env,
+ struct cl_page_slice *slice);
+
+ /**
+ * Checks whether the page is protected by a cl_lock. This is a
+ * per-layer method, because certain layers have ways to check for the
+ * lock much more efficiently than through the generic locks scan, or
+ * implement locking mechanisms separate from cl_lock, e.g.,
+ * LL_FILE_GROUP_LOCKED in vvp. If \a pending is true, check for locks
+ * being canceled, or scheduled for cancellation as soon as the last
+ * user goes away, too.
+ *
+ * \retval -EBUSY: page is protected by a lock of a given mode;
+ * \retval -ENODATA: page is not protected by a lock;
+ * \retval 0: this layer cannot decide.
+ *
+ * \see cl_page_is_under_lock()
+ */
+ int (*cpo_is_under_lock)(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io);
+
+ /**
+ * Optional debugging helper. Prints given page slice.
+ *
+ * \see cl_page_print()
+ */
+ int (*cpo_print)(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ void *cookie, lu_printer_t p);
+ /**
+ * \name transfer
+ *
+ * Transfer methods. See comment on cl_req for a description of
+ * transfer formation and life-cycle.
+ *
+ * @{
+ */
+ /**
+ * Request type dependent vector of operations.
+ *
+ * Transfer operations depend on transfer mode (cl_req_type). To avoid
	 * passing the transfer mode to each and every one of these methods, and to
+ * avoid branching on request type inside of the methods, separate
+ * methods for cl_req_type:CRT_READ and cl_req_type:CRT_WRITE are
+ * provided. That is, method invocation usually looks like
+ *
	 *	slice->cpl_ops->io[req->crq_type].cpo_method(env, slice, ...);
+ */
+ struct {
+ /**
+ * Called when a page is submitted for a transfer as a part of
+ * cl_page_list.
+ *
+ * \return 0 : page is eligible for submission;
+ * \return -EALREADY : skip this page;
+ * \return -ve : error.
+ *
+ * \see cl_page_prep()
+ */
+ int (*cpo_prep)(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io);
+ /**
+ * Completion handler. This is guaranteed to be eventually
+ * fired after cl_page_operations::cpo_prep() or
+ * cl_page_operations::cpo_make_ready() call.
+ *
+ * This method can be called in a non-blocking context. It is
+ * guaranteed however, that the page involved and its object
+ * are pinned in memory (and, hence, calling cl_page_put() is
+ * safe).
+ *
+ * \see cl_page_completion()
+ */
+ void (*cpo_completion)(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ int ioret);
+ /**
+ * Called when cached page is about to be added to the
+ * cl_req as a part of req formation.
+ *
+ * \return 0 : proceed with this page;
+ * \return -EAGAIN : skip this page;
+ * \return -ve : error.
+ *
+ * \see cl_page_make_ready()
+ */
+ int (*cpo_make_ready)(const struct lu_env *env,
+ const struct cl_page_slice *slice);
+ /**
+ * Announce that this page is to be written out
		 * opportunistically, that is, the page is dirty, it is not
		 * necessary to start the write-out transfer right now, but
		 * eventually the page has to be written out.
+ *
+ * Main caller of this is the write path (see
+ * vvp_io_commit_write()), using this method to build a
+ * "transfer cache" from which large transfers are then
+ * constructed by the req-formation engine.
+ *
+ * \todo XXX it would make sense to add page-age tracking
+ * semantics here, and to oblige the req-formation engine to
		 * send the page out before it becomes too old.
+ *
+ * \see cl_page_cache_add()
+ */
+ int (*cpo_cache_add)(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io);
+ } io[CRT_NR];
+ /**
	 * Tell the transfer engine that only the [from, to] part of a page
	 * should be transmitted.
+ *
+ * This is used for immediate transfers.
+ *
+ * \todo XXX this is not very good interface. It would be much better
+ * if all transfer parameters were supplied as arguments to
+ * cl_io_operations::cio_submit() call, but it is not clear how to do
+ * this for page queues.
+ *
+ * \see cl_page_clip()
+ */
+ void (*cpo_clip)(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ int from, int to);
+ /**
+ * \pre the page was queued for transferring.
	 * \post the page is removed from the client's pending list, or -EBUSY
	 * is returned if it is already in transfer.
	 *
	 * This is one of the few page operations that is:
	 * 0. called from the top level;
	 * 1. called without the vmpage locked;
	 * 2. required to synchronize execution of its ->cpo_cancel() with
	 *    completion handlers at every layer. Osc uses the client obd lock
	 *    for this purpose. Since there is no vvp_page_cancel() and no
	 *    lov_page_cancel(), cpo_cancel() is de facto protected by the
	 *    client lock.
+ *
+ * \see osc_page_cancel().
+ */
+ int (*cpo_cancel)(const struct lu_env *env,
+ const struct cl_page_slice *slice);
+ /**
	 * Write out a page on behalf of the kernel. This is only called by
	 * ll_writepage() right now.
+ *
+ * \see cl_page_flush()
+ */
+ int (*cpo_flush)(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io);
+ /** @} transfer */
+};
+
+/**
+ * Helper macro, dumping detailed information about \a page into a log.
+ */
+#define CL_PAGE_DEBUG(mask, env, page, format, ...) \
+do { \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \
+ \
+ if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \
+ cl_page_print(env, &msgdata, lu_cdebug_printer, page); \
+ CDEBUG(mask, format , ## __VA_ARGS__); \
+ } \
+} while (0)
+
+/**
+ * Helper macro, dumping shorter information about \a page into a log.
+ */
+#define CL_PAGE_HEADER(mask, env, page, format, ...) \
+do { \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \
+ \
+ if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \
+ cl_page_header_print(env, &msgdata, lu_cdebug_printer, page); \
+ CDEBUG(mask, format , ## __VA_ARGS__); \
+ } \
+} while (0)
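+
+/*
+ * Usage sketch: dump a page when an unexpected condition is hit (D_ERROR
+ * is the usual libcfs debug mask for errors):
+ *
+ *	CL_PAGE_DEBUG(D_ERROR, env, page, "unexpected state: %d\n",
+ *		      page->cp_state);
+ */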
+
+static inline int __page_in_use(const struct cl_page *page, int refc)
+{
+ if (page->cp_type == CPT_CACHEABLE)
+ ++refc;
+ LASSERT(atomic_read(&page->cp_ref) > 0);
+ return (atomic_read(&page->cp_ref) > refc);
+}
+#define cl_page_in_use(pg) __page_in_use(pg, 1)
+#define cl_page_in_use_noref(pg) __page_in_use(pg, 0)
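+
+/*
+ * Sketch: cl_page_in_use(pg) reports whether anybody besides the cache
+ * (and the caller's own reference) still holds the page, e.g., in a
+ * hypothetical reclaim check:
+ *
+ *	if (!cl_page_in_use(page))
+ *		... the page can be returned to the cache ...
+ */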
+
+/** @} cl_page */
+
+/** \addtogroup cl_lock cl_lock
+ * @{ */
+/** \struct cl_lock
+ *
+ * Extent locking on the client.
+ *
+ * LAYERING
+ *
+ * The locking model of the new client code is built around
+ *
+ * struct cl_lock
+ *
+ * data-type representing an extent lock on a regular file. cl_lock is a
+ * layered object (much like cl_object and cl_page), it consists of a header
+ * (struct cl_lock) and a list of layers (struct cl_lock_slice), linked to
+ * cl_lock::cll_layers list through cl_lock_slice::cls_linkage.
+ *
+ * All locks for a given object are linked into cl_object_header::coh_locks
+ * list (protected by cl_object_header::coh_lock_guard spin-lock) through
+ * cl_lock::cll_linkage. Currently this list is not sorted in any way. We can
+ * sort it by starting lock offset, or use an altogether different data
+ * structure, such as a tree.
+ *
+ * Typical cl_lock consists of the two layers:
+ *
+ * - vvp_lock (vvp specific data), and
+ * - lov_lock (lov specific data).
+ *
+ * lov_lock contains an array of sub-locks. Each of these sub-locks is a
+ * normal cl_lock: it has a header (struct cl_lock) and a list of layers:
+ *
+ * - lovsub_lock, and
+ * - osc_lock
+ *
+ * Each sub-lock is associated with a cl_object (representing a stripe
+ * sub-object or the file with which the top-level cl_lock is associated),
+ * and is linked into that cl_object's coh_locks. In this respect cl_lock is
+ * similar to cl_object (which at the lov layer also fans out into multiple
+ * sub-objects), and
+ * is different from cl_page, that doesn't fan out (there is usually exactly
+ * one osc_page for every vvp_page). We shall call vvp-lov portion of the lock
+ * a "top-lock" and its lovsub-osc portion a "sub-lock".
+ *
+ * LIFE CYCLE
+ *
+ * cl_lock is reference counted. When reference counter drops to 0, lock is
+ * placed in the cache, except when lock is in CLS_FREEING state. CLS_FREEING
+ * lock is destroyed when last reference is released. Referencing between
+ * top-lock and its sub-locks is described in the lov documentation module.
+ *
+ * STATE MACHINE
+ *
+ * Also, cl_lock is a state machine. This requires some clarification. One of
+ * the goals of client IO re-write was to make IO path non-blocking, or at
+ * least to make it easier to make it non-blocking in the future. Here
+ * `non-blocking' means that when a system call (read, write, truncate)
+ * reaches a situation where it has to wait for a communication with the
+ * server, it should --instead of waiting-- remember its current state and
+ * switch to some other work. E.g,. instead of waiting for a lock enqueue,
+ * switch to some other work. E.g., instead of waiting for a lock enqueue,
+ * the client should proceed doing IO on the next stripe, etc. Obviously this
+ * is a rather radical redesign, and it is not planned to be fully implemented
+ * at this time; instead we are putting some infrastructure in place that
+ * would make it easier to do asynchronous non-blocking IO in the
+ * future. Specifically, where the old locking code goes to sleep (waiting for
+ * an enqueue, for example), the new code returns cl_lock_transition::CLO_WAIT.
+ * When the enqueue reply comes, its completion handler signals that the lock
+ * state-machine is ready to transit to the next state. There is some generic
+ * code in cl_lock.c that sleeps, waiting for these signals. As a result, for
+ * users of this cl_lock.c code, it looks like locking is done in the normal
+ * blocking fashion, and at the same time it is possible to switch to the
+ * non-blocking locking (simply by returning cl_lock_transition::CLO_WAIT
+ * from cl_lock.c
+ *
+ * For a description of state machine states and transitions see enum
+ * cl_lock_state.
+ *
+ * There are two ways to restrict a set of states which lock might move to:
+ *
+ * - placing a "hold" on a lock guarantees that lock will not be moved
+ * into cl_lock_state::CLS_FREEING state until hold is released. Hold
+ * can be only acquired on a lock that is not in
+ * cl_lock_state::CLS_FREEING. All holds on a lock are counted in
+ * cl_lock::cll_holds. Hold protects lock from cancellation and
+ * destruction. Requests to cancel and destroy a lock on hold will be
+ * recorded, but only honored when last hold on a lock is released;
+ *
+ * - placing a "user" on a lock guarantees that lock will not leave
+ * cl_lock_state::CLS_NEW, cl_lock_state::CLS_QUEUING,
+ * cl_lock_state::CLS_ENQUEUED and cl_lock_state::CLS_HELD set of
+ * states, once it enters this set. That is, if a user is added onto a
+ * lock in a state not from this set, it doesn't immediately enforce
+ *     lock in a state not from this set, it doesn't immediately force the
+ *     lock to move into this set, but once the lock enters this set it will
+ * cl_lock::cll_users.
+ *
+ * User is used to assure that lock is not canceled or destroyed while
+ * it is being enqueued, or actively used by some IO.
+ *
+ * Currently, a user always comes with a hold (cl_lock_invariant()
+ * checks that a number of holds is not less than a number of users).
+ *
+ * CONCURRENCY
+ *
+ * This is how lock state-machine operates. struct cl_lock contains a mutex
+ * cl_lock::cll_guard that protects struct fields.
+ *
+ * - mutex is taken, and cl_lock::cll_state is examined.
+ *
+ * - for every state there are possible target states where lock can move
+ * into. They are tried in order. Attempts to move into next state are
+ * done by _try() functions in cl_lock.c:cl_{enqueue,unlock,wait}_try().
+ *
+ * - if the transition can be performed immediately, state is changed,
+ * and mutex is released.
+ *
+ * - if the transition requires blocking, _try() function returns
+ * cl_lock_transition::CLO_WAIT. Caller unlocks mutex and goes to
+ * sleep, waiting for possibility of lock state change. It is woken
+ * up when some event occurs, that makes lock state change possible
+ * (e.g., the reception of the reply from the server), and repeats
+ * the loop.
+ *
+ * Top-lock and sub-lock have separate mutexes and the latter has to be taken
+ * first to avoid dead-lock.
+ *
+ * To see an example of interaction of all these issues, take a look at the
+ * lov_cl.c:lov_lock_enqueue() function. It is called as a part of
+ * cl_enqueue_try(), and tries to advance top-lock to ENQUEUED state, by
+ * advancing state-machines of its sub-locks (lov_lock_enqueue_one()). Note
+ * also, that it uses trylock to grab sub-lock mutex to avoid dead-lock. It
+ * also has to handle CEF_ASYNC enqueue, when sub-locks enqueues have to be
+ * done in parallel, rather than one after another (this is used for glimpse
+ * locks, that cannot dead-lock).
+ *
+ * INTERFACE AND USAGE
+ *
+ * struct cl_lock_operations provide a number of call-backs that are invoked
+ * when events of interest occurs. Layers can intercept and handle glimpse,
+ * blocking, cancel ASTs and a reception of the reply from the server.
+ *
+ * One important difference with the old client locking model is that new
+ * client has a representation for the top-lock, whereas in the old code only
+ * sub-locks existed as real data structures and file-level locks are
+ * represented by "request sets" that are created and destroyed on each and
+ * every lock creation.
+ *
+ * Top-locks are cached, and can be found in the cache by the system calls. It
+ * is possible that top-lock is in cache, but some of its sub-locks were
+ * canceled and destroyed. In that case top-lock has to be enqueued again
+ * before it can be used.
+ *
+ * Overall process of the locking during IO operation is as following:
+ *
+ * - once parameters for IO are setup in cl_io, cl_io_operations::cio_lock()
+ * is called on each layer. Responsibility of this method is to add locks,
+ * needed by a given layer into cl_io.ci_lockset.
+ *
+ * - once locks for all layers were collected, they are sorted to avoid
+ * dead-locks (cl_io_locks_sort()), and enqueued.
+ *
+ * - when all locks are acquired, IO is performed;
+ *
+ * - locks are released into cache.
+ *
+ * Striping introduces major additional complexity into locking. The
+ * fundamental problem is that it is generally unsafe to actively use (hold)
+ * two locks on the different OST servers at the same time, as this introduces
+ * inter-server dependency and can lead to cascading evictions.
+ *
+ * Basic solution is to sub-divide large read/write IOs into smaller pieces so
+ * that no multi-stripe locks are taken (note that this design abandons POSIX
+ * read/write semantics). Such pieces ideally can be executed concurrently. At
+ * the same time, certain types of IO cannot be sub-divided without
+ * sacrificing correctness. This includes:
+ *
+ * - O_APPEND write, where [0, EOF] lock has to be taken, to guarantee
+ * atomicity;
+ *
+ * - ftruncate(fd, offset), where [offset, EOF] lock has to be taken.
+ *
+ * Also, in the case of read(fd, buf, count) or write(fd, buf, count), where
+ * buf is a part of memory mapped Lustre file, a lock or locks protecting buf
+ * has to be held together with the usual lock on [offset, offset + count].
+ *
+ * As multi-stripe locks have to be allowed, it makes sense to cache them, so
+ * that, for example, a sequence of O_APPEND writes can proceed quickly
+ * without going down to the individual stripes to do lock matching. On the
+ * other hand, multi-stripe locks shouldn't be used by normal read/write
+ * calls. To achieve this, every layer can implement the ->clo_fits_into()
+ * method, which is called by the lock matching code (cl_lock_lookup()), and
+ * which can be used to selectively disable matching of certain locks for
+ * certain IOs. For example, the lov layer implements lov_lock_fits_into(),
+ * which allows multi-stripe
+ * locks to be matched only for truncates and O_APPEND writes.
+ *
+ * Interaction with DLM
+ *
+ * In the expected setup, cl_lock is ultimately backed up by a collection of
+ * DLM locks (struct ldlm_lock). Association between cl_lock and DLM lock is
+ * implemented in osc layer, that also matches DLM events (ASTs, cancellation,
+ * etc.) into cl_lock_operation calls. See struct osc_lock for a more detailed
+ * description of interaction with DLM.
+ */
+
+/**
+ * Lock description.
+ */
+struct cl_lock_descr {
+ /** Object this lock is granted for. */
+ struct cl_object *cld_obj;
+ /** Index of the first page protected by this lock. */
+ pgoff_t cld_start;
+ /** Index of the last page (inclusive) protected by this lock. */
+ pgoff_t cld_end;
+ /** Group ID, for group lock */
+ __u64 cld_gid;
+ /** Lock mode. */
+ enum cl_lock_mode cld_mode;
+ /**
+ * flags to enqueue lock. A combination of bit-flags from
+ * enum cl_enq_flags.
+ */
+ __u32 cld_enq_flags;
+};
+
+#define DDESCR "%s(%d):[%lu, %lu]"
+#define PDESCR(descr) \
+ cl_lock_mode_name((descr)->cld_mode), (descr)->cld_mode, \
+ (descr)->cld_start, (descr)->cld_end
+
+const char *cl_lock_mode_name(const enum cl_lock_mode mode);
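+
+/*
+ * Usage sketch: DDESCR and PDESCR() pair up as a format string and its
+ * arguments in a debug message:
+ *
+ *	CDEBUG(D_DLMTRACE, "matched lock: " DDESCR "\n",
+ *	       PDESCR(&lock->cll_descr));
+ */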
+
+/**
+ * Lock state-machine states.
+ *
+ * \htmlonly
+ * <pre>
+ *
+ * Possible state transitions:
+ *
+ * +------------------>NEW
+ * | |
+ * | | cl_enqueue_try()
+ * | |
+ * | cl_unuse_try() V
+ * | +--------------QUEUING (*)
+ * | | |
+ * | | | cl_enqueue_try()
+ * | | |
+ * | | cl_unuse_try() V
+ * sub-lock | +-------------ENQUEUED (*)
+ * canceled | | |
+ * | | | cl_wait_try()
+ * | | |
+ * | | (R)
+ * | | |
+ * | | V
+ * | | HELD<---------+
+ * | | | |
+ * | | | | cl_use_try()
+ * | | cl_unuse_try() | |
+ * | | | |
+ * | | V ---+
+ * | +------------>INTRANSIT (D) <--+
+ * | | |
+ * | cl_unuse_try() | | cached lock found
+ * | | | cl_use_try()
+ * | | |
+ * | V |
+ * +------------------CACHED---------+
+ * |
+ * (C)
+ * |
+ * V
+ * FREEING
+ *
+ * Legend:
+ *
+ * In states marked with (*) transition to the same state (i.e., a loop
+ * in the diagram) is possible.
+ *
+ * (R) is the point where Receive call-back is invoked: it allows layers
+ * to handle arrival of lock reply.
+ *
+ * (C) is the point where Cancellation call-back is invoked.
+ *
+ * (D) is the transit state which means the lock is changing.
+ *
+ * Transition to FREEING state is possible from any other state in the
+ * diagram in case of unrecoverable error.
+ * </pre>
+ * \endhtmlonly
+ *
+ * These states are for individual cl_lock object. Top-lock and its sub-locks
+ * can be in the different states. Another way to say this is that we have
+ * nested state-machines.
+ *
+ * Separate QUEUING and ENQUEUED states are needed to support non-blocking
+ * operation for locks with multiple sub-locks. Imagine lock on a file F, that
+ * intersects 3 stripes S0, S1, and S2. To enqueue F client has to send
+ * enqueue to S0, wait for its completion, then send enqueue for S1, wait for
+ * its completion and at last enqueue lock for S2, and wait for its
+ * completion. In that case, top-lock is in QUEUING state while S0, S1 are
+ * handled, and is in ENQUEUED state after enqueue to S2 has been sent (note
+ * that in this case, sub-locks move from state to state, and top-lock remains
+ * in the same state).
+ */
+enum cl_lock_state {
+ /**
+ * Lock that wasn't yet enqueued
+ */
+ CLS_NEW,
+ /**
+ * Enqueue is in progress, blocking for some intermediate interaction
+ * with the other side.
+ */
+ CLS_QUEUING,
+ /**
+ * Lock is fully enqueued, waiting for server to reply when it is
+ * granted.
+ */
+ CLS_ENQUEUED,
+ /**
+ * Lock granted, actively used by some IO.
+ */
+ CLS_HELD,
+ /**
	 * This state is used to mark that the lock is being used, or unused.
	 * We need this state because the lock may have several sublocks,
	 * so there is no atomic way to bring all sublocks into the CLS_HELD
	 * state on use, or all sublocks into CLS_CACHED on unuse.
+ * If a thread is referring to a lock, and it sees the lock is in this
+ * state, it must wait for the lock.
+ * See state diagram for details.
+ */
+ CLS_INTRANSIT,
+ /**
+ * Lock granted, not used.
+ */
+ CLS_CACHED,
+ /**
+ * Lock is being destroyed.
+ */
+ CLS_FREEING,
+ CLS_NR
+};
+
+enum cl_lock_flags {
+ /**
+ * lock has been cancelled. This flag is never cleared once set (by
+ * cl_lock_cancel0()).
+ */
+ CLF_CANCELLED = 1 << 0,
+ /** cancellation is pending for this lock. */
+ CLF_CANCELPEND = 1 << 1,
+ /** destruction is pending for this lock. */
+ CLF_DOOMED = 1 << 2,
+ /** from enqueue RPC reply upcall. */
	CLF_FROM_UPCALL = 1 << 3,
+};
+
+/**
+ * Lock closure.
+ *
+ * Lock closure is a collection of locks (both top-locks and sub-locks) that
+ * might be updated as a result of an operation on a certain lock (the lock
+ * this is a closure of).
+ *
+ * Closures are needed to guarantee dead-lock freedom in the presence of
+ *
+ * - nested state-machines (top-lock state-machine composed of sub-lock
+ * state-machines), and
+ *
+ * - shared sub-locks.
+ *
+ * Specifically, many operations, such as lock enqueue, wait, unlock,
+ * etc. start from a top-lock, and then operate on the sub-locks of this
+ * top-lock, holding the top-lock mutex. When a sub-lock state changes as a
+ * result of such an operation, this change has to be propagated to all
+ * top-locks that share this sub-lock. Obviously, no natural lock ordering
+ * (e.g., top-to-bottom or bottom-to-top) captures this scenario, so
+ * try-locking has to be used. Lock closure systematizes this try-and-repeat logic.
+ */
+struct cl_lock_closure {
+ /**
+ * Lock that is mutexed when closure construction is started. When the
+ * closure is in `wait' mode (cl_lock_closure::clc_wait), the mutex on
+ * origin is released before waiting.
+ */
+ struct cl_lock *clc_origin;
+ /**
+ * List of enclosed locks, so far. Locks are linked here through
+ * cl_lock::cll_inclosure.
+ */
+ struct list_head clc_list;
+ /**
+ * True iff closure is in a `wait' mode. This determines what
+ * cl_lock_enclosure() does when a lock L to be added to the closure
+ * is currently mutexed by some other thread.
+ *
+ * If cl_lock_closure::clc_wait is not set, then closure construction
+ * fails with CLO_REPEAT immediately.
+ *
+ * In wait mode, cl_lock_enclosure() waits until next attempt to build
+ * a closure might succeed. To this end it releases an origin mutex
+ * (cl_lock_closure::clc_origin), that has to be the only lock mutex
+ * owned by the current thread, and then waits on L mutex (by grabbing
+ * it and immediately releasing), before returning CLO_REPEAT to the
+ * caller.
+ */
+ int clc_wait;
+ /** Number of locks in the closure. */
+ int clc_nr;
+};
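+
+/*
+ * A sketch of the try-and-repeat pattern that closures enable; "origin" is
+ * assumed to be mutexed by the caller, do_per_lock_work() is a placeholder
+ * for the caller's work, and re-acquisition of the origin mutex between
+ * attempts is elided:
+ *
+ * \code
+ *	struct cl_lock_closure closure;
+ *	int rc;
+ *
+ *	cl_lock_closure_init(env, &closure, origin, 1);
+ *	do {
+ *		rc = cl_lock_closure_build(env, lock, &closure);
+ *		if (rc == 0) {
+ *			do_per_lock_work(env, &closure);
+ *			cl_lock_disclosure(env, &closure);
+ *		}
+ *	} while (rc == CLO_REPEAT);
+ *	cl_lock_closure_fini(&closure);
+ * \endcode
+ */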
+
+/**
+ * Layered client lock.
+ */
+struct cl_lock {
+ /** Reference counter. */
+ atomic_t cll_ref;
+ /** List of slices. Immutable after creation. */
+ struct list_head cll_layers;
+ /**
+ * Linkage into cl_lock::cll_descr::cld_obj::coh_locks list. Protected
+ * by cl_lock::cll_descr::cld_obj::coh_lock_guard.
+ */
+ struct list_head cll_linkage;
+ /**
+ * Parameters of this lock. Protected by
+ * cl_lock::cll_descr::cld_obj::coh_lock_guard nested within
+ * cl_lock::cll_guard. Modified only on lock creation and in
+ * cl_lock_modify().
+ */
+ struct cl_lock_descr cll_descr;
+ /** Protected by cl_lock::cll_guard. */
+ enum cl_lock_state cll_state;
+ /** signals state changes. */
+ wait_queue_head_t cll_wq;
+ /**
+ * Recursive lock, most fields in cl_lock{} are protected by this.
+ *
+ * Locking rules: this mutex is never held across network
+ * communication, except when lock is being canceled.
+ *
+ * Lock ordering: a mutex of a sub-lock is taken first, then a mutex
+ * on a top-lock. The other direction is implemented through a
+ * try-lock-repeat loop. Mutexes of unrelated locks can be taken only
+ * by try-locking.
+ *
+ * \see osc_lock_enqueue_wait(), lov_lock_cancel(), lov_sublock_wait().
+ */
+ struct mutex cll_guard;
+ task_t *cll_guarder;
+ int cll_depth;
+
+ /**
+ * the owner for INTRANSIT state
+ */
+ task_t *cll_intransit_owner;
+ int cll_error;
+ /**
+ * Number of holds on a lock. A hold prevents a lock from being
+ * canceled and destroyed. Protected by cl_lock::cll_guard.
+ *
+ * \see cl_lock_hold(), cl_lock_unhold(), cl_lock_release()
+ */
+ int cll_holds;
+ /**
+ * Number of lock users. Valid in cl_lock_state::CLS_HELD state
+ * only. Lock user pins lock in CLS_HELD state. Protected by
+ * cl_lock::cll_guard.
+ *
+ * \see cl_wait(), cl_unuse().
+ */
+ int cll_users;
+ /**
+ * Flag bit-mask. Values from enum cl_lock_flags. Updates are
+ * protected by cl_lock::cll_guard.
+ */
+ unsigned long cll_flags;
+ /**
+ * A linkage into a list of locks in a closure.
+ *
+ * \see cl_lock_closure
+ */
+ struct list_head cll_inclosure;
+ /**
+ * Conflicting lock at queuing time.
+ */
+ struct cl_lock *cll_conflict;
+ /**
+ * A list of references to this lock, for debugging.
+ */
+ struct lu_ref cll_reference;
+ /**
+ * A list of holds on this lock, for debugging.
+ */
+ struct lu_ref cll_holders;
+ /**
+ * A reference for cl_lock::cll_descr::cld_obj. For debugging.
+ */
+ struct lu_ref_link *cll_obj_ref;
+#ifdef CONFIG_LOCKDEP
+ /* "dep_map" name is assumed by lockdep.h macros. */
+ struct lockdep_map dep_map;
+#endif
+};
+
+/**
+ * Per-layer part of cl_lock
+ *
+ * \see ccc_lock, lov_lock, lovsub_lock, osc_lock
+ */
+struct cl_lock_slice {
+ struct cl_lock *cls_lock;
+ /** Object slice corresponding to this lock slice. Immutable after
+ * creation. */
+ struct cl_object *cls_obj;
+ const struct cl_lock_operations *cls_ops;
+ /** Linkage into cl_lock::cll_layers. Immutable after creation. */
+ struct list_head cls_linkage;
+};
+
+/**
+ * Possible (non-error) return values of ->clo_{enqueue,wait,unlock}().
+ *
+ * NOTE: lov_subresult() depends on ordering here.
+ */
+enum cl_lock_transition {
+ /** operation cannot be completed immediately. Wait for state change. */
+ CLO_WAIT = 1,
+ /** operation had to release lock mutex, restart. */
+ CLO_REPEAT = 2,
+ /** lower layer re-enqueued. */
+ CLO_REENQUEUED = 3,
+};
+
+/**
+ * Per-layer lock operations.
+ *
+ * \see vvp_lock_ops, lov_lock_ops, lovsub_lock_ops, osc_lock_ops
+ */
+struct cl_lock_operations {
+ /**
+ * \name statemachine
+ *
+ * State machine transitions. These 3 methods are called to transfer
+ * lock from one state to another, as described in the commentary
+ * above enum #cl_lock_state.
+ *
+ * \retval 0 this layer has nothing more to do before
+ * transition to the target state happens;
+ *
+ * \retval CLO_REPEAT method had to release and re-acquire cl_lock
+ * mutex, repeat invocation of transition method
+ * across all layers;
+ *
+ * \retval CLO_WAIT this layer cannot move to the target state
+ * immediately, as it has to wait for certain event
+ * (e.g., the communication with the server). It
+ * is guaranteed, that when the state transfer
+ * becomes possible, cl_lock::cll_wq wait-queue
+ * is signaled. Caller can wait for this event by
+ * calling cl_lock_state_wait();
+ *
+ * \retval -ve failure, abort state transition, move the lock
+ * into cl_lock_state::CLS_FREEING state, and set
+ * cl_lock::cll_error.
+ *
+ * Once all layers voted to agree to transition (by returning 0), lock
+ * is moved into corresponding target state. All state transition
+ * methods are optional.
+ */
+ /** @{ */
+ /**
+ * Attempts to enqueue the lock. Called top-to-bottom.
+ *
+ * \see ccc_lock_enqueue(), lov_lock_enqueue(), lovsub_lock_enqueue(),
+ * \see osc_lock_enqueue()
+ */
+ int (*clo_enqueue)(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ struct cl_io *io, __u32 enqflags);
+ /**
+ * Attempts to wait for enqueue result. Called top-to-bottom.
+ *
+ * \see ccc_lock_wait(), lov_lock_wait(), osc_lock_wait()
+ */
+ int (*clo_wait)(const struct lu_env *env,
+ const struct cl_lock_slice *slice);
+ /**
+ * Attempts to unlock the lock. Called bottom-to-top. In addition to
+ * usual return values of lock state-machine methods, this can return
+ * -ESTALE to indicate that lock cannot be returned to the cache, and
+ * has to be re-initialized.
+ * unuse is a one-shot operation, so it must NOT return CLO_WAIT.
+ *
+ * \see ccc_lock_unuse(), lov_lock_unuse(), osc_lock_unuse()
+ */
+ int (*clo_unuse)(const struct lu_env *env,
+ const struct cl_lock_slice *slice);
+ /**
+ * Notifies the layer that a cached lock has started being used.
+ *
+ * \pre lock->cll_state == CLS_CACHED
+ *
+ * \see lov_lock_use(), osc_lock_use()
+ */
+ int (*clo_use)(const struct lu_env *env,
+ const struct cl_lock_slice *slice);
+ /** @} statemachine */
+ /**
+ * A method invoked when the lock state is changed (as a result of a state
+ * transition). This is used, for example, to track when the state of
+ * a sub-lock changes, in order to propagate this change to the
+ * corresponding top-lock. Optional.
+ *
+ * \see lovsub_lock_state()
+ */
+ void (*clo_state)(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ enum cl_lock_state st);
+ /**
+ * Returns true iff the given lock is suitable for the given io, the idea
+ * being that there are certain "unsafe" locks, e.g., ones acquired
+ * for O_APPEND writes, that we don't want to re-use for a normal
+ * write, to avoid the danger of cascading evictions. Optional. Runs
+ * under cl_object_header::coh_lock_guard.
+ *
+ * XXX this should take more information about lock needed by
+ * io. Probably lock description or something similar.
+ *
+ * \see lov_fits_into()
+ */
+ int (*clo_fits_into)(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ const struct cl_lock_descr *need,
+ const struct cl_io *io);
+ /**
+ * \name ast
+ * Asynchronous System Traps. All of them are optional; all are
+ * executed bottom-to-top.
+ */
+ /** @{ */
+
+ /**
+ * Cancellation callback. Cancels a lock, either voluntarily or at
+ * the request of the server.
+ */
+ void (*clo_cancel)(const struct lu_env *env,
+ const struct cl_lock_slice *slice);
+ /**
+ * Lock weighting ast. Executed to estimate how precious this lock
+ * is. The sum of results across all layers is used to determine
+ * whether the lock is worth keeping in cache given present memory usage.
+ *
+ * \see osc_lock_weigh(), vvp_lock_weigh(), lovsub_lock_weigh().
+ */
+ unsigned long (*clo_weigh)(const struct lu_env *env,
+ const struct cl_lock_slice *slice);
+ /** @} ast */
+
+ /**
+ * \see lovsub_lock_closure()
+ */
+ int (*clo_closure)(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ struct cl_lock_closure *closure);
+ /**
+ * Executed bottom-to-top when lock description changes (e.g., as a
+ * result of server granting more generous lock than was requested).
+ *
+ * \see lovsub_lock_modify()
+ */
+ int (*clo_modify)(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ const struct cl_lock_descr *updated);
+ /**
+ * Notifies layers (bottom-to-top) that the lock is going to be
+ * destroyed. It is the responsibility of layers to prevent new
+ * references to this lock from being acquired once this method
+ * returns.
+ *
+ * This can be called multiple times due to races.
+ *
+ * \see cl_lock_delete()
+ * \see osc_lock_delete(), lovsub_lock_delete()
+ */
+ void (*clo_delete)(const struct lu_env *env,
+ const struct cl_lock_slice *slice);
+ /**
+ * Destructor. Frees resources and the slice.
+ *
+ * \see ccc_lock_fini(), lov_lock_fini(), lovsub_lock_fini(),
+ * \see osc_lock_fini()
+ */
+ void (*clo_fini)(const struct lu_env *env, struct cl_lock_slice *slice);
+ /**
+ * Optional debugging helper. Prints given lock slice.
+ */
+ int (*clo_print)(const struct lu_env *env,
+ void *cookie, lu_printer_t p,
+ const struct cl_lock_slice *slice);
+};
+
+#define CL_LOCK_DEBUG(mask, env, lock, format, ...) \
+do { \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \
+ \
+ if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \
+ cl_lock_print(env, &msgdata, lu_cdebug_printer, lock); \
+ CDEBUG(mask, format , ## __VA_ARGS__); \
+ } \
+} while (0)
+
+#define CL_LOCK_ASSERT(expr, env, lock) do { \
+ if (likely(expr)) \
+ break; \
+ \
+ CL_LOCK_DEBUG(D_ERROR, env, lock, "failed at %s.\n", #expr); \
+ LBUG(); \
+} while (0)
+
+/** @} cl_lock */
+
+/** \addtogroup cl_page_list cl_page_list
+ * Page list used to perform collective operations on a group of pages.
+ *
+ * Pages are added to the list one by one. cl_page_list acquires a reference
+ * for every page in it. Page list is used to perform collective operations on
+ * pages:
+ *
+ * - submit pages for an immediate transfer,
+ *
+ * - own pages on behalf of certain io (waiting for each page in turn),
+ *
+ * - discard pages.
+ *
+ * When the list is finalized, it releases references on all pages it still has.
+ *
+ * \todo XXX concurrency control.
+ *
+ * @{
+ */
+struct cl_page_list {
+ unsigned pl_nr;
+ struct list_head pl_pages;
+ task_t *pl_owner;
+};
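+
+/*
+ * A typical life cycle sketch ("page" and "env" come from the caller, and
+ * do_page_work() is a placeholder for the per-page work):
+ *
+ * \code
+ *	struct cl_page_list plist;
+ *	struct cl_page *iter;
+ *
+ *	cl_page_list_init(&plist);
+ *	cl_page_list_add(&plist, page);
+ *	cl_page_list_for_each(iter, &plist)
+ *		do_page_work(env, iter);
+ *	cl_page_list_fini(env, &plist);
+ * \endcode
+ */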
+
+/**
+ * A 2-queue of pages. A convenience data-type for the common use case: a
+ * 2-queue contains an incoming page list and an outgoing page list.
+ */
+struct cl_2queue {
+ struct cl_page_list c2_qin;
+ struct cl_page_list c2_qout;
+};
+
+/** @} cl_page_list */
+
+/** \addtogroup cl_io cl_io
+ * @{ */
+/** \struct cl_io
+ * I/O
+ *
+ * cl_io represents a high level I/O activity like
+ * read(2)/write(2)/truncate(2) system call, or cancellation of an extent
+ * lock.
+ *
+ * cl_io is a layered object, much like cl_{object,page,lock} but with one
+ * important distinction. We want to minimize the number of calls to the
+ * allocator in the fast path, e.g., in the case of read(2) when everything is
+ * cached: the client already owns the lock over the region being read, and
+ * data are cached due to read-ahead. To avoid allocation of cl_io layers in
+ * such situations, per-layer io state is stored in the session, associated
+ * with the io, see struct {vvp,lov,osc}_io for example. Session allocation is
+ * amortized by using free-lists, see cl_env_get().
+ *
+ * There is a small predefined number of possible io types, enumerated in enum
+ * cl_io_type.
+ *
+ * cl_io is a state machine that can be advanced concurrently by multiple
+ * threads. It is up to these threads to control the concurrency and,
+ * specifically, to detect when the io is done, so that its state can be
+ * safely released.
+ *
+ * For read/write io the overall execution plan is as follows:
+ *
+ * (0) initialize io state through all layers;
+ *
+ * (1) loop: prepare chunk of work to do
+ *
+ * (2) call all layers to collect locks they need to process current chunk
+ *
+ * (3) sort all locks to avoid dead-locks, and acquire them
+ *
+ * (4) process the chunk: call per-page methods
+ * (cl_io_operations::cio_read_page() for read,
+ * cl_io_operations::cio_prepare_write(),
+ * cl_io_operations::cio_commit_write() for write)
+ *
+ * (5) release locks
+ *
+ * (6) repeat loop.
+ *
+ * To implement the "parallel IO mode", the lov layer creates sub-io's (lazily,
+ * to address the allocation efficiency issues mentioned above), and returns a
+ * special error condition from the per-page method when the current sub-io
+ * has to block. This causes the io loop to be repeated, and lov switches to
+ * the next sub-io in its cl_io_operations::cio_iter_init() implementation.
+ */
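+
+/*
+ * A condensed sketch of how a read(2) might drive this plan, using the
+ * top-level helpers declared near the end of this header (cl_io_rw_init(),
+ * cl_io_loop(), cl_io_fini()); obtaining "env" and "io" from the session,
+ * and the declarations of "obj", "pos", "count" and "result" are elided:
+ *
+ * \code
+ *	io->ci_obj = obj;
+ *	if (cl_io_rw_init(env, io, CIT_READ, pos, count) == 0)
+ *		result = cl_io_loop(env, io);
+ *	cl_io_fini(env, io);
+ * \endcode
+ */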
+
+/** IO types */
+enum cl_io_type {
+ /** read system call */
+ CIT_READ,
+ /** write system call */
+ CIT_WRITE,
+ /** truncate, utime system calls */
+ CIT_SETATTR,
+ /**
+ * page fault handling
+ */
+ CIT_FAULT,
+ /**
+ * fsync system call handling,
+ * to write out a range of a file.
+ */
+ CIT_FSYNC,
+ /**
+ * Miscellaneous io. This is used for occasional io activity that
+ * doesn't fit into other types. Currently this is used for:
+ *
+ * - cancellation of an extent lock. This io exists as a context
+ * to write dirty pages from under the lock being canceled back
+ * to the server;
+ *
+ * - VM induced page write-out. An io context for writing page out
+ * for memory cleansing;
+ *
+ * - glimpse. An io context to acquire glimpse lock.
+ *
+ * - grouplock. An io context to acquire group lock.
+ *
+ * CIT_MISC io is used simply as a context in which locks and pages
+ * are manipulated. Such io has no internal "process", that is,
+ * cl_io_loop() is never called for it.
+ */
+ CIT_MISC,
+ CIT_OP_NR
+};
+
+/**
+ * States of cl_io state machine
+ */
+enum cl_io_state {
+ /** Not initialized. */
+ CIS_ZERO,
+ /** Initialized. */
+ CIS_INIT,
+ /** IO iteration started. */
+ CIS_IT_STARTED,
+ /** Locks taken. */
+ CIS_LOCKED,
+ /** Actual IO is in progress. */
+ CIS_IO_GOING,
+ /** IO for the current iteration finished. */
+ CIS_IO_FINISHED,
+ /** Locks released. */
+ CIS_UNLOCKED,
+ /** Iteration completed. */
+ CIS_IT_ENDED,
+ /** cl_io finalized. */
+ CIS_FINI
+};
+
+/**
+ * IO state private for a layer.
+ *
+ * This is usually embedded into layer session data, rather than allocated
+ * dynamically.
+ *
+ * \see vvp_io, lov_io, osc_io, ccc_io
+ */
+struct cl_io_slice {
+ struct cl_io *cis_io;
+ /** corresponding object slice. Immutable after creation. */
+ struct cl_object *cis_obj;
+ /** io operations. Immutable after creation. */
+ const struct cl_io_operations *cis_iop;
+ /**
+ * linkage into a list of all slices for a given cl_io, hanging off
+ * cl_io::ci_layers. Immutable after creation.
+ */
+ struct list_head cis_linkage;
+};
+
+/**
+ * Per-layer io operations.
+ * \see vvp_io_ops, lov_io_ops, lovsub_io_ops, osc_io_ops
+ */
+struct cl_io_operations {
+ /**
+ * Vector of io state transition methods for every io type.
+ *
+ * \see cl_page_operations::io
+ */
+ struct {
+ /**
+ * Prepare io iteration at a given layer.
+ *
+ * Called top-to-bottom at the beginning of each iteration of
+ * "io loop" (if it makes sense for this type of io). Here the
+ * layer selects what work it will do during this iteration.
+ *
+ * \see cl_io_operations::cio_iter_fini()
+ */
+ int (*cio_iter_init) (const struct lu_env *env,
+ const struct cl_io_slice *slice);
+ /**
+ * Finalize io iteration.
+ *
+ * Called bottom-to-top at the end of each iteration of "io
+ * loop". Here layers can decide whether IO has to be
+ * continued.
+ *
+ * \see cl_io_operations::cio_iter_init()
+ */
+ void (*cio_iter_fini) (const struct lu_env *env,
+ const struct cl_io_slice *slice);
+ /**
+ * Collect locks for the current iteration of io.
+ *
+ * Called top-to-bottom to collect all locks necessary for
+ * this iteration. This method shouldn't actually enqueue
+ * anything; instead it should post a lock through
+ * cl_io_lock_add(). Once all locks are collected, they are
+ * sorted and enqueued in the proper order.
+ */
+ int (*cio_lock) (const struct lu_env *env,
+ const struct cl_io_slice *slice);
+ /**
+ * Finalize unlocking.
+ *
+ * Called bottom-to-top to finish layer specific unlocking
+ * functionality, after generic code released all locks
+ * acquired by cl_io_operations::cio_lock().
+ */
+ void (*cio_unlock)(const struct lu_env *env,
+ const struct cl_io_slice *slice);
+ /**
+ * Start io iteration.
+ *
+ * Once all locks are acquired, called top-to-bottom to
+ * commence actual IO. In the current implementation,
+ * top-level vvp_io_{read,write}_start() does all the work
+ * synchronously by calling generic_file_*(), so other layers
+ * are called when everything is done.
+ */
+ int (*cio_start)(const struct lu_env *env,
+ const struct cl_io_slice *slice);
+ /**
+ * Called top-to-bottom at the end of the io loop. Here a layer
+ * might wait for an unfinished asynchronous io.
+ */
+ void (*cio_end) (const struct lu_env *env,
+ const struct cl_io_slice *slice);
+ /**
+ * Called bottom-to-top to notify layers that read/write IO
+ * iteration finished, with \a nob bytes transferred.
+ */
+ void (*cio_advance)(const struct lu_env *env,
+ const struct cl_io_slice *slice,
+ size_t nob);
+ /**
+ * Called once per io, bottom-to-top to release io resources.
+ */
+ void (*cio_fini) (const struct lu_env *env,
+ const struct cl_io_slice *slice);
+ } op[CIT_OP_NR];
+ struct {
+ /**
+ * Submit pages from \a queue->c2_qin for IO, and move
+ * successfully submitted pages into \a queue->c2_qout. Returns
+ * non-zero if it failed to submit even a single page. If
+ * submission failed after some pages were moved into \a
+ * queue->c2_qout, the completion callback is executed on them
+ * with a non-zero ioret.
+ */
+ int (*cio_submit)(const struct lu_env *env,
+ const struct cl_io_slice *slice,
+ enum cl_req_type crt,
+ struct cl_2queue *queue);
+ } req_op[CRT_NR];
+ /**
+ * Read missing page.
+ *
+ * Called by a top-level cl_io_operations::op[CIT_READ]::cio_start()
+ * method, when it hits not-up-to-date page in the range. Optional.
+ *
+ * \pre io->ci_type == CIT_READ
+ */
+ int (*cio_read_page)(const struct lu_env *env,
+ const struct cl_io_slice *slice,
+ const struct cl_page_slice *page);
+ /**
+ * Prepare write of a \a page. Called bottom-to-top by a top-level
+ * cl_io_operations::op[CIT_WRITE]::cio_start() to prepare the page
+ * to receive data from a user-level buffer.
+ *
+ * \pre io->ci_type == CIT_WRITE
+ *
+ * \see vvp_io_prepare_write(), lov_io_prepare_write(),
+ * osc_io_prepare_write().
+ */
+ int (*cio_prepare_write)(const struct lu_env *env,
+ const struct cl_io_slice *slice,
+ const struct cl_page_slice *page,
+ unsigned from, unsigned to);
+ /**
+ * Commit write of a \a page.
+ *
+ * \pre io->ci_type == CIT_WRITE
+ *
+ * \see vvp_io_commit_write(), lov_io_commit_write(),
+ * osc_io_commit_write().
+ */
+ int (*cio_commit_write)(const struct lu_env *env,
+ const struct cl_io_slice *slice,
+ const struct cl_page_slice *page,
+ unsigned from, unsigned to);
+ /**
+ * Optional debugging helper. Print given io slice.
+ */
+ int (*cio_print)(const struct lu_env *env, void *cookie,
+ lu_printer_t p, const struct cl_io_slice *slice);
+};
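+
+/*
+ * A sketch of how a hypothetical "foo" layer might describe its behaviour
+ * for the read path and attach it to an io with cl_io_slice_add() (declared
+ * later in this header); "fio" is the layer's io state for this io:
+ *
+ * \code
+ *	static const struct cl_io_operations foo_io_ops = {
+ *		.op = {
+ *			[CIT_READ] = {
+ *				.cio_start = foo_io_read_start,
+ *				.cio_fini  = foo_io_fini
+ *			}
+ *		}
+ *	};
+ *
+ *	cl_io_slice_add(io, &fio->fi_cl, obj, &foo_io_ops);
+ * \endcode
+ */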
+
+/**
+ * Flags to lock enqueue procedure.
+ * \ingroup cl_lock
+ */
+enum cl_enq_flags {
+ /**
+ * instruct the server not to block if a conflicting lock is found;
+ * instead, -EWOULDBLOCK is returned immediately.
+ */
+ CEF_NONBLOCK = 0x00000001,
+ /**
+ * take lock asynchronously (out of order), as it cannot
+ * deadlock. This is for LDLM_FL_HAS_INTENT locks used for glimpsing.
+ */
+ CEF_ASYNC = 0x00000002,
+ /**
+ * tell the server to instruct (through a flag in the blocking ast) the
+ * owner of the conflicting lock that it can drop dirty pages
+ * protected by this lock, without sending them to the server.
+ */
+ CEF_DISCARD_DATA = 0x00000004,
+ /**
+ * tell the sub layers that it must be a `real' lock. This is used for
+ * mmapped-buffer locks and glimpse locks that must never be converted
+ * into lockless mode.
+ *
+ * \see vvp_mmap_locks(), cl_glimpse_lock().
+ */
+ CEF_MUST = 0x00000008,
+ /**
+ * tell the sub layers to never request a `real' lock. This flag is
+ * not currently used.
+ *
+ * cl_io::ci_lockreq and CEF_{MUST,NEVER} flags specify lockless
+ * conversion policy: ci_lockreq describes generic information of lock
+ * requirement for this IO, especially for locks which belong to the
+ * object doing IO; however, lock itself may have precise requirements
+ * that are described by the enqueue flags.
+ */
+ CEF_NEVER = 0x00000010,
+ /**
+ * for async glimpse lock.
+ */
+ CEF_AGL = 0x00000020,
+ /**
+ * mask of enq_flags.
+ */
+ CEF_MASK = 0x0000003f,
+};
+
+/**
+ * Link between lock and io. Intermediate structure is needed, because the
+ * same lock can be part of multiple io's simultaneously.
+ */
+struct cl_io_lock_link {
+ /** linkage into one of cl_lockset lists. */
+ struct list_head cill_linkage;
+ struct cl_lock_descr cill_descr;
+ struct cl_lock *cill_lock;
+ /** optional destructor */
+ void (*cill_fini)(const struct lu_env *env,
+ struct cl_io_lock_link *link);
+};
+
+/**
+ * Lock-set represents a collection of locks that an io needs at a
+ * time. Generally speaking, the client tries to avoid holding multiple
+ * locks when possible, because
+ *
+ * - holding extent locks over multiple ost's introduces the danger of
+ * "cascading timeouts";
+ *
+ * - holding multiple locks over the same ost is still dead-lock prone,
+ * see comment in osc_lock_enqueue(),
+ *
+ * but there are certain situations where this is unavoidable:
+ *
+ * - O_APPEND writes have to take [0, EOF] lock for correctness;
+ *
+ * - truncate has to take [new-size, EOF] lock for correctness;
+ *
+ * - SNS has to take locks across full stripe for correctness;
+ *
+ * - in the case when a user level buffer, supplied to {read,write}(file0),
+ * is a part of a memory mapped lustre file, the client has to take dlm
+ * locks on file0 and on all files that back up the buffer (or on the
+ * part of the buffer that is being processed in the current chunk); in
+ * any case, there are situations where at least 2 locks are necessary.
+ *
+ * In such cases we at least try to take locks in the same consistent
+ * order. To this end, all locks are first collected, then sorted, and then
+ * enqueued.
+ */
+struct cl_lockset {
+ /** locks to be acquired. */
+ struct list_head cls_todo;
+ /** locks currently being processed. */
+ struct list_head cls_curr;
+ /** locks acquired. */
+ struct list_head cls_done;
+};
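+
+/*
+ * A sketch of a cio_lock() method posting its lock requirement with
+ * cl_io_lock_alloc_add() (declared later in this header); the "foo" layer
+ * is hypothetical, and CLM_READ/CL_PAGE_EOF are assumed from earlier parts
+ * of this header:
+ *
+ * \code
+ *	static int foo_io_lock(const struct lu_env *env,
+ *			       const struct cl_io_slice *ios)
+ *	{
+ *		struct cl_lock_descr descr;
+ *
+ *		memset(&descr, 0, sizeof(descr));
+ *		descr.cld_obj   = ios->cis_obj;
+ *		descr.cld_mode  = CLM_READ;
+ *		descr.cld_start = 0;
+ *		descr.cld_end   = CL_PAGE_EOF;
+ *		return cl_io_lock_alloc_add(env, ios->cis_io, &descr);
+ *	}
+ * \endcode
+ */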
+
+/**
+ * Lock requirements (demand) for IO. It should be cl_io_lock_req,
+ * but 'req' is always to be thought of as 'request' :-)
+ */
+enum cl_io_lock_dmd {
+ /** Always lock data (e.g., O_APPEND). */
+ CILR_MANDATORY = 0,
+ /** Layers are free to decide between local and global locking. */
+ CILR_MAYBE,
+ /** Never lock: there is no cache (e.g., liblustre). */
+ CILR_NEVER
+};
+
+enum cl_fsync_mode {
+ /** start writeback, do not wait for it to finish */
+ CL_FSYNC_NONE = 0,
+ /** start writeback and wait for it to finish */
+ CL_FSYNC_LOCAL = 1,
+ /** discard all dirty pages in a specific file range */
+ CL_FSYNC_DISCARD = 2,
+ /** start writeback and make sure the pages have reached storage
+ * before returning. An OST_SYNC RPC must be issued and finished */
+ CL_FSYNC_ALL = 3
+};
+
+struct cl_io_rw_common {
+ loff_t crw_pos;
+ size_t crw_count;
+ int crw_nonblock;
+};
+
+/**
+ * State for io.
+ *
+ * cl_io is shared by all threads participating in this IO (in the current
+ * implementation only one thread advances an IO, but the parallel IO design
+ * and concurrent copy_*_user() require multiple threads acting on the same
+ * IO). It is up to these threads to serialize their activities, including
+ * updates to mutable cl_io fields.
+ */
+struct cl_io {
+ /** type of this IO. Immutable after creation. */
+ enum cl_io_type ci_type;
+ /** current state of cl_io state machine. */
+ enum cl_io_state ci_state;
+ /** main object this io is against. Immutable after creation. */
+ struct cl_object *ci_obj;
+ /**
+ * Upper layer io of which this io is a part. Immutable after
+ * creation.
+ */
+ struct cl_io *ci_parent;
+ /** List of slices. Immutable after creation. */
+ struct list_head ci_layers;
+ /** list of locks (to be) acquired by this io. */
+ struct cl_lockset ci_lockset;
+ /** lock requirements, this is just a help info for sublayers. */
+ enum cl_io_lock_dmd ci_lockreq;
+ union {
+ struct cl_rd_io {
+ struct cl_io_rw_common rd;
+ } ci_rd;
+ struct cl_wr_io {
+ struct cl_io_rw_common wr;
+ int wr_append;
+ int wr_sync;
+ } ci_wr;
+ struct cl_io_rw_common ci_rw;
+ struct cl_setattr_io {
+ struct ost_lvb sa_attr;
+ unsigned int sa_valid;
+ struct obd_capa *sa_capa;
+ } ci_setattr;
+ struct cl_fault_io {
+ /** page index within file. */
+ pgoff_t ft_index;
+ /** number of valid bytes on the faulted page. */
+ int ft_nob;
+ /** writable page? for nopage() only */
+ int ft_writable;
+ /** page of an executable? */
+ int ft_executable;
+ /** page_mkwrite() */
+ int ft_mkwrite;
+ /** resulting page */
+ struct cl_page *ft_page;
+ } ci_fault;
+ struct cl_fsync_io {
+ loff_t fi_start;
+ loff_t fi_end;
+ struct obd_capa *fi_capa;
+ /** file system level fid */
+ struct lu_fid *fi_fid;
+ enum cl_fsync_mode fi_mode;
+ /* how many pages were written/discarded */
+ unsigned int fi_nr_written;
+ } ci_fsync;
+ } u;
+ struct cl_2queue ci_queue;
+ size_t ci_nob;
+ int ci_result;
+ unsigned int ci_continue:1,
+ /**
+ * This io holds a grouplock, to inform sublayers that they
+ * must not do lockless i/o.
+ */
+ ci_no_srvlock:1,
+ /**
+ * The whole IO needs to be restarted because the layout has been changed
+ */
+ ci_need_restart:1,
+ /**
+ * do not refresh the layout - the IO issuer knows that the layout won't
+ * change (page operations; a layout change causes all pages to be
+ * discarded), or it doesn't matter if it changes (sync).
+ */
+ ci_ignore_layout:1,
+ /**
+ * Check if the layout changed after the IO finishes. Mainly for the HSM
+ * requirement. If IO occurs on open files, it doesn't need to
+ * verify the layout because HSM won't release open files.
+ * Right now, only two operations need to verify the layout: glimpse
+ * and setattr.
+ */
+ ci_verify_layout:1;
+ /**
+ * Number of pages owned by this IO. For invariant checking.
+ */
+ unsigned ci_owned_nr;
+};
+
+/** @} cl_io */
+
+/** \addtogroup cl_req cl_req
+ * @{ */
+/** \struct cl_req
+ * Transfer.
+ *
+ * There are two possible modes of transfer initiation on the client:
+ *
+ * - immediate transfer: this is started when a high level io wants a page
+ * or a collection of pages to be transferred right away. Examples:
+ * read-ahead, synchronous read in the case of non-page aligned write,
+ * page write-out as a part of extent lock cancellation, page write-out
+ * as a part of memory cleansing. Immediate transfer can be both
+ * cl_req_type::CRT_READ and cl_req_type::CRT_WRITE;
+ *
+ * - opportunistic transfer (cl_req_type::CRT_WRITE only), that happens
+ * when io wants to transfer a page to the server some time later, when
+ * it can be done efficiently. Example: pages dirtied by the write(2)
+ * path.
+ *
+ * In any case, transfer takes place in the form of a cl_req, which is a
+ * representation for a network RPC.
+ *
+ * Pages queued for an opportunistic transfer are cached until it is decided
+ * that an efficient RPC can be composed of them. This decision is made by "a
+ * req-formation engine", currently implemented as a part of osc
+ * layer. Req-formation depends on many factors: the size of the resulting
+ * RPC, whether or not multi-object RPCs are supported by the server,
+ * max-rpc-in-flight limitations, size of the dirty cache, etc.
+ *
+ * For the immediate transfer the io submits a cl_page_list, which the
+ * req-formation engine slices into cl_req's, possibly adding cached pages
+ * to some of the resulting req's.
+ *
+ * Whenever a page from cl_page_list is added to a newly constructed req, its
+ * cl_page_operations::cpo_prep() layer methods are called. At that moment,
+ * page state is atomically changed from cl_page_state::CPS_OWNED to
+ * cl_page_state::CPS_PAGEOUT or cl_page_state::CPS_PAGEIN, cl_page::cp_owner
+ * is zeroed, and cl_page::cp_req is set to the
+ * req. cl_page_operations::cpo_prep() method at the particular layer might
+ * return -EALREADY to indicate that it does not need to submit this page
+ * at all. This is possible, for example, if a page submitted for read
+ * became up-to-date in the meantime, or, for write, if the page does not
+ * have its dirty bit set. \see cl_io_submit_rw()
+ *
+ * Whenever a cached page is added to a newly constructed req, its
+ * cl_page_operations::cpo_make_ready() layer methods are called. At that
+ * moment, page state is atomically changed from cl_page_state::CPS_CACHED to
+ * cl_page_state::CPS_PAGEOUT, and cl_page::cp_req is set to
+ * req. cl_page_operations::cpo_make_ready() method at the particular layer
+ * might return -EAGAIN to indicate that this page is not eligible for the
+ * transfer right now.
+ *
+ * FUTURE
+ *
+ * The plan is to divide transfers into "priority bands" (indicated when
+ * submitting cl_page_list, and when queuing a page for an opportunistic
+ * transfer) and to allow gluing of cached pages to immediate transfers only
+ * within a single band. This would make high priority transfers (like lock
+ * cancellation or memory pressure induced write-out) really high priority.
+ *
+ */
+
+/**
+ * Per-transfer attributes.
+ */
+struct cl_req_attr {
+ /** Generic attributes for the server consumption. */
+ struct obdo *cra_oa;
+ /** Capability. */
+ struct obd_capa *cra_capa;
+ /** Jobid */
+ char cra_jobid[JOBSTATS_JOBID_SIZE];
+};
+
+/**
+ * Transfer request operations definable at every layer.
+ *
+ * Concurrency: transfer formation engine synchronizes calls to all transfer
+ * methods.
+ */
+struct cl_req_operations {
+ /**
+ * Invoked top-to-bottom by cl_req_prep() when transfer formation is
+ * complete (all pages are added).
+ *
+ * \see osc_req_prep()
+ */
+ int (*cro_prep)(const struct lu_env *env,
+ const struct cl_req_slice *slice);
+ /**
+ * Called top-to-bottom to fill in \a oa fields. This is called twice
+ * with different flags, see bug 10150 and osc_build_req().
+ *
+ * \param obj an object from cl_req whose attributes are to be set in
+ * \a oa.
+ *
+ * \param oa struct obdo where attributes are placed
+ *
+ * \param flags \a oa fields to be filled.
+ */
+ void (*cro_attr_set)(const struct lu_env *env,
+ const struct cl_req_slice *slice,
+ const struct cl_object *obj,
+ struct cl_req_attr *attr, obd_valid flags);
+ /**
+ * Called top-to-bottom from cl_req_completion() to notify layers that
+ * transfer completed. Has to free all state allocated by
+ * cl_device_operations::cdo_req_init().
+ */
+ void (*cro_completion)(const struct lu_env *env,
+ const struct cl_req_slice *slice, int ioret);
+};
+
+/**
+ * A per-object state that a (potentially multi-object) transfer request keeps.
+ */
+struct cl_req_obj {
+ /** object itself */
+ struct cl_object *ro_obj;
+ /** reference to cl_req_obj::ro_obj. For debugging. */
+ struct lu_ref_link *ro_obj_ref;
+ /* something else? Number of pages for a given object? */
+};
+
+/**
+ * Transfer request.
+ *
+ * Transfer requests are not reference counted, because IO sub-system owns
+ * them exclusively and knows when to free them.
+ *
+ * Life cycle.
+ *
+ * cl_req is created by cl_req_alloc() that calls
+ * cl_device_operations::cdo_req_init() device methods to allocate per-req
+ * state in every layer.
+ *
+ * Then pages are added (cl_req_page_add()), req keeps track of all objects it
+ * contains pages for.
+ *
+ * Once all pages were collected, cl_page_operations::cpo_prep() method is
+ * called top-to-bottom. At that point layers can modify req, let it pass, or
+ * deny it completely. This is to support things like SNS that have transfer
+ * ordering requirements invisible to the individual req-formation engine.
+ *
+ * On transfer completion (or transfer timeout, or failure to initiate the
+ * transfer of an allocated req), cl_req_operations::cro_completion() method
+ * is called, after execution of cl_page_operations::cpo_completion() of all
+ * req's pages.
+ */
+struct cl_req {
+ enum cl_req_type crq_type;
+ /** A list of pages being transferred */
+ struct list_head crq_pages;
+ /** Number of pages in cl_req::crq_pages */
+ unsigned crq_nrpages;
+ /** An array of objects which pages are in ->crq_pages */
+ struct cl_req_obj *crq_o;
+ /** Number of elements in cl_req::crq_o[] */
+ unsigned crq_nrobjs;
+ struct list_head crq_layers;
+};
+
+/**
+ * Per-layer state for request.
+ */
+struct cl_req_slice {
+ struct cl_req *crs_req;
+ struct cl_device *crs_dev;
+ struct list_head crs_linkage;
+ const struct cl_req_operations *crs_ops;
+};
+
+/** @} cl_req */
+
+enum cache_stats_item {
+ /** how many cache lookups were performed */
+ CS_lookup = 0,
+ /** how many times cache lookup resulted in a hit */
+ CS_hit,
+ /** how many entities are in the cache right now */
+ CS_total,
+ /** how many entities in the cache are actively used (and cannot be
+ * evicted) right now */
+ CS_busy,
+ /** how many entities were created at all */
+ CS_create,
+ CS_NR
+};
+
+#define CS_NAMES { "lookup", "hit", "total", "busy", "create" }
+
+/**
+ * Stats for a generic cache (similar to inode, lu_object, etc. caches).
+ */
+struct cache_stats {
+ const char *cs_name;
+ atomic_t cs_stats[CS_NR];
+};
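+
+/*
+ * For example, a cache hit rate can be derived from these counters (a
+ * sketch; "cs" points to a populated struct cache_stats):
+ *
+ * \code
+ *	int lookups = atomic_read(&cs->cs_stats[CS_lookup]);
+ *	int hits    = atomic_read(&cs->cs_stats[CS_hit]);
+ *	int percent = lookups > 0 ? 100 * hits / lookups : 0;
+ * \endcode
+ */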
+
+/** These are not exported so far */
+void cache_stats_init (struct cache_stats *cs, const char *name);
+
+/**
+ * Client-side site. This represents a particular client stack. "Global"
+ * variables should (directly or indirectly) be added here to allow multiple
+ * clients to co-exist in a single address space.
+ */
+struct cl_site {
+ struct lu_site cs_lu;
+ /**
+ * Statistical counters. Atomics do not scale; something better, like
+ * per-cpu counters, is needed.
+ *
+ * These are exported as /proc/fs/lustre/llite/.../site
+ *
+ * When interpreting, keep in mind that both sub-locks (and sub-pages)
+ * and top-locks (and top-pages) are accounted here.
+ */
+ struct cache_stats cs_pages;
+ struct cache_stats cs_locks;
+ atomic_t cs_pages_state[CPS_NR];
+ atomic_t cs_locks_state[CLS_NR];
+};
+
+int cl_site_init (struct cl_site *s, struct cl_device *top);
+void cl_site_fini (struct cl_site *s);
+void cl_stack_fini(const struct lu_env *env, struct cl_device *cl);
+
+/**
+ * Output client site statistical counters into a buffer. Suitable for
+ * ll_rd_*()-style functions.
+ */
+int cl_site_stats_print(const struct cl_site *site, struct seq_file *m);
+
+/**
+ * \name helpers
+ *
+ * Type conversion and accessory functions.
+ */
+/** @{ */
+
+static inline struct cl_site *lu2cl_site(const struct lu_site *site)
+{
+ return container_of(site, struct cl_site, cs_lu);
+}
+
+static inline int lu_device_is_cl(const struct lu_device *d)
+{
+ return d->ld_type->ldt_tags & LU_DEVICE_CL;
+}
+
+static inline struct cl_device *lu2cl_dev(const struct lu_device *d)
+{
+ LASSERT(d == NULL || IS_ERR(d) || lu_device_is_cl(d));
+ return container_of0(d, struct cl_device, cd_lu_dev);
+}
+
+static inline struct lu_device *cl2lu_dev(struct cl_device *d)
+{
+ return &d->cd_lu_dev;
+}
+
+static inline struct cl_object *lu2cl(const struct lu_object *o)
+{
+ LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->lo_dev));
+ return container_of0(o, struct cl_object, co_lu);
+}
+
+static inline const struct cl_object_conf *
+lu2cl_conf(const struct lu_object_conf *conf)
+{
+ return container_of0(conf, struct cl_object_conf, coc_lu);
+}
+
+static inline struct cl_object *cl_object_next(const struct cl_object *obj)
+{
+ return obj ? lu2cl(lu_object_next(&obj->co_lu)) : NULL;
+}
+
+static inline struct cl_device *cl_object_device(const struct cl_object *o)
+{
+ LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->co_lu.lo_dev));
+ return container_of0(o->co_lu.lo_dev, struct cl_device, cd_lu_dev);
+}
+
+static inline struct cl_object_header *luh2coh(const struct lu_object_header *h)
+{
+ return container_of0(h, struct cl_object_header, coh_lu);
+}
+
+static inline struct cl_site *cl_object_site(const struct cl_object *obj)
+{
+ return lu2cl_site(obj->co_lu.lo_dev->ld_site);
+}
+
+static inline
+struct cl_object_header *cl_object_header(const struct cl_object *obj)
+{
+ return luh2coh(obj->co_lu.lo_header);
+}
+
+static inline int cl_device_init(struct cl_device *d, struct lu_device_type *t)
+{
+ return lu_device_init(&d->cd_lu_dev, t);
+}
+
+static inline void cl_device_fini(struct cl_device *d)
+{
+ lu_device_fini(&d->cd_lu_dev);
+}
+
+void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice,
+ struct cl_object *obj,
+ const struct cl_page_operations *ops);
+void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice,
+ struct cl_object *obj,
+ const struct cl_lock_operations *ops);
+void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice,
+ struct cl_object *obj, const struct cl_io_operations *ops);
+void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice,
+ struct cl_device *dev,
+ const struct cl_req_operations *ops);
+/** @} helpers */
+
+/** \defgroup cl_object cl_object
+ * @{ */
+struct cl_object *cl_object_top (struct cl_object *o);
+struct cl_object *cl_object_find(const struct lu_env *env, struct cl_device *cd,
+ const struct lu_fid *fid,
+ const struct cl_object_conf *c);
+
+int cl_object_header_init(struct cl_object_header *h);
+void cl_object_header_fini(struct cl_object_header *h);
+void cl_object_put (const struct lu_env *env, struct cl_object *o);
+void cl_object_get (struct cl_object *o);
+void cl_object_attr_lock (struct cl_object *o);
+void cl_object_attr_unlock(struct cl_object *o);
+int cl_object_attr_get (const struct lu_env *env, struct cl_object *obj,
+ struct cl_attr *attr);
+int cl_object_attr_set (const struct lu_env *env, struct cl_object *obj,
+ const struct cl_attr *attr, unsigned valid);
+int cl_object_glimpse (const struct lu_env *env, struct cl_object *obj,
+ struct ost_lvb *lvb);
+int cl_conf_set (const struct lu_env *env, struct cl_object *obj,
+ const struct cl_object_conf *conf);
+void cl_object_prune (const struct lu_env *env, struct cl_object *obj);
+void cl_object_kill (const struct lu_env *env, struct cl_object *obj);
+int cl_object_has_locks (struct cl_object *obj);
+
+/**
+ * Returns true, iff \a o0 and \a o1 are slices of the same object.
+ */
+static inline int cl_object_same(struct cl_object *o0, struct cl_object *o1)
+{
+ return cl_object_header(o0) == cl_object_header(o1);
+}
+
+static inline void cl_object_page_init(struct cl_object *clob, int size)
+{
+ clob->co_slice_off = cl_object_header(clob)->coh_page_bufsize;
+ cl_object_header(clob)->coh_page_bufsize += ALIGN(size, 8);
+}
+
+static inline void *cl_object_page_slice(struct cl_object *clob,
+ struct cl_page *page)
+{
+ return (void *)((char *)page + clob->co_slice_off);
+}
+
+/** @} cl_object */
+
+/** \defgroup cl_page cl_page
+ * @{ */
+enum {
+ CLP_GANG_OKAY = 0,
+ CLP_GANG_RESCHED,
+ CLP_GANG_AGAIN,
+ CLP_GANG_ABORT
+};
+
+/* callback of cl_page_gang_lookup() */
+typedef int (*cl_page_gang_cb_t) (const struct lu_env *, struct cl_io *,
+ struct cl_page *, void *);
+int cl_page_gang_lookup (const struct lu_env *env,
+ struct cl_object *obj,
+ struct cl_io *io,
+ pgoff_t start, pgoff_t end,
+ cl_page_gang_cb_t cb, void *cbdata);
+struct cl_page *cl_page_lookup (struct cl_object_header *hdr,
+ pgoff_t index);
+struct cl_page *cl_page_find (const struct lu_env *env,
+ struct cl_object *obj,
+ pgoff_t idx, struct page *vmpage,
+ enum cl_page_type type);
+struct cl_page *cl_page_find_sub (const struct lu_env *env,
+ struct cl_object *obj,
+ pgoff_t idx, struct page *vmpage,
+ struct cl_page *parent);
+void cl_page_get (struct cl_page *page);
+void cl_page_put (const struct lu_env *env,
+ struct cl_page *page);
+void cl_page_print (const struct lu_env *env, void *cookie,
+ lu_printer_t printer,
+ const struct cl_page *pg);
+void cl_page_header_print(const struct lu_env *env, void *cookie,
+ lu_printer_t printer,
+ const struct cl_page *pg);
+struct page *cl_page_vmpage (const struct lu_env *env,
+ struct cl_page *page);
+struct cl_page *cl_vmpage_page (struct page *vmpage, struct cl_object *obj);
+struct cl_page *cl_page_top (struct cl_page *page);
+
+const struct cl_page_slice *cl_page_at(const struct cl_page *page,
+ const struct lu_device_type *dtype);
+
+/**
+ * \name ownership
+ *
+ * Functions dealing with the ownership of page by io.
+ */
+/** @{ */
+
+int cl_page_own (const struct lu_env *env,
+ struct cl_io *io, struct cl_page *page);
+int cl_page_own_try (const struct lu_env *env,
+ struct cl_io *io, struct cl_page *page);
+void cl_page_assume (const struct lu_env *env,
+ struct cl_io *io, struct cl_page *page);
+void cl_page_unassume (const struct lu_env *env,
+ struct cl_io *io, struct cl_page *pg);
+void cl_page_disown (const struct lu_env *env,
+ struct cl_io *io, struct cl_page *page);
+int cl_page_is_owned (const struct cl_page *pg, const struct cl_io *io);
+
+/** @} ownership */
+
+/**
+ * \name transfer
+ *
+ * Functions dealing with the preparation of a page for a transfer, and
+ * tracking transfer state.
+ */
+/** @{ */
+int cl_page_prep (const struct lu_env *env, struct cl_io *io,
+ struct cl_page *pg, enum cl_req_type crt);
+void cl_page_completion (const struct lu_env *env,
+ struct cl_page *pg, enum cl_req_type crt, int ioret);
+int cl_page_make_ready (const struct lu_env *env, struct cl_page *pg,
+ enum cl_req_type crt);
+int cl_page_cache_add (const struct lu_env *env, struct cl_io *io,
+ struct cl_page *pg, enum cl_req_type crt);
+void cl_page_clip (const struct lu_env *env, struct cl_page *pg,
+ int from, int to);
+int cl_page_cancel (const struct lu_env *env, struct cl_page *page);
+int cl_page_flush (const struct lu_env *env, struct cl_io *io,
+ struct cl_page *pg);
+
+/** @} transfer */
+
+/**
+ * \name helper routines
+ * Functions to discard, delete and export a cl_page.
+ */
+/** @{ */
+void cl_page_discard (const struct lu_env *env, struct cl_io *io,
+ struct cl_page *pg);
+void cl_page_delete (const struct lu_env *env, struct cl_page *pg);
+int cl_page_unmap (const struct lu_env *env, struct cl_io *io,
+ struct cl_page *pg);
+int cl_page_is_vmlocked (const struct lu_env *env,
+ const struct cl_page *pg);
+void cl_page_export (const struct lu_env *env,
+ struct cl_page *pg, int uptodate);
+int cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *page);
+loff_t cl_offset (const struct cl_object *obj, pgoff_t idx);
+pgoff_t cl_index (const struct cl_object *obj, loff_t offset);
+int cl_page_size (const struct cl_object *obj);
+int cl_pages_prune (const struct lu_env *env, struct cl_object *obj);
+
+void cl_lock_print (const struct lu_env *env, void *cookie,
+ lu_printer_t printer, const struct cl_lock *lock);
+void cl_lock_descr_print(const struct lu_env *env, void *cookie,
+ lu_printer_t printer,
+ const struct cl_lock_descr *descr);
+/** @} helper */
+
+/** @} cl_page */
+
+/** \defgroup cl_lock cl_lock
+ * @{ */
+
+struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io,
+ const struct cl_lock_descr *need,
+ const char *scope, const void *source);
+struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io,
+ const struct cl_lock_descr *need,
+ const char *scope, const void *source);
+struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io,
+ const struct cl_lock_descr *need,
+ const char *scope, const void *source);
+struct cl_lock *cl_lock_at_pgoff(const struct lu_env *env,
+ struct cl_object *obj, pgoff_t index,
+ struct cl_lock *except, int pending,
+ int canceld);
+static inline struct cl_lock *cl_lock_at_page(const struct lu_env *env,
+ struct cl_object *obj,
+ struct cl_page *page,
+ struct cl_lock *except,
+ int pending, int canceld)
+{
+ LASSERT(cl_object_header(obj) == cl_object_header(page->cp_obj));
+ return cl_lock_at_pgoff(env, obj, page->cp_index, except,
+ pending, canceld);
+}
+
+const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock,
+ const struct lu_device_type *dtype);
+
+void cl_lock_get (struct cl_lock *lock);
+void cl_lock_get_trust (struct cl_lock *lock);
+void cl_lock_put (const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_hold_add (const struct lu_env *env, struct cl_lock *lock,
+ const char *scope, const void *source);
+void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock,
+ const char *scope, const void *source);
+void cl_lock_unhold (const struct lu_env *env, struct cl_lock *lock,
+ const char *scope, const void *source);
+void cl_lock_release (const struct lu_env *env, struct cl_lock *lock,
+ const char *scope, const void *source);
+void cl_lock_user_add (const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_user_del (const struct lu_env *env, struct cl_lock *lock);
+
+enum cl_lock_state cl_lock_intransit(const struct lu_env *env,
+ struct cl_lock *lock);
+void cl_lock_extransit(const struct lu_env *env, struct cl_lock *lock,
+ enum cl_lock_state state);
+int cl_lock_is_intransit(struct cl_lock *lock);
+
+int cl_lock_enqueue_wait(const struct lu_env *env, struct cl_lock *lock,
+ int keep_mutex);
+
+/** \name statemachine statemachine
+ * Interface to lock state machine consists of 3 parts:
+ *
+ * - "try" functions that attempt to effect a state transition. If state
+ * transition is not possible right now (e.g., if it has to wait for some
+ * asynchronous event to occur), these functions return
+ * cl_lock_transition::CLO_WAIT.
+ *
+ * - "non-try" functions that implement synchronous blocking interface on
+ * top of non-blocking "try" functions. These functions repeatedly call
+ * corresponding "try" versions, and if state transition is not possible
+ * immediately, wait for lock state change.
+ *
+ * - methods from cl_lock_operations, called by "try" functions. Lock can
+ * be advanced to the target state only when all layers voted that they
+ * are ready for this transition. "Try" functions call methods under lock
+ * mutex. If a layer had to release a mutex, it re-acquires it and returns
+ * cl_lock_transition::CLO_REPEAT, causing "try" function to call all
+ * layers again.
+ *
+ * TRY NON-TRY METHOD FINAL STATE
+ *
+ * cl_enqueue_try() cl_enqueue() cl_lock_operations::clo_enqueue() CLS_ENQUEUED
+ *
+ * cl_wait_try() cl_wait() cl_lock_operations::clo_wait() CLS_HELD
+ *
+ * cl_unuse_try() cl_unuse() cl_lock_operations::clo_unuse() CLS_CACHED
+ *
+ * cl_use_try() NONE cl_lock_operations::clo_use() CLS_HELD
+ *
+ * @{ */
+
+int cl_enqueue (const struct lu_env *env, struct cl_lock *lock,
+ struct cl_io *io, __u32 flags);
+int cl_wait (const struct lu_env *env, struct cl_lock *lock);
+void cl_unuse (const struct lu_env *env, struct cl_lock *lock);
+int cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock,
+ struct cl_io *io, __u32 flags);
+int cl_unuse_try (const struct lu_env *env, struct cl_lock *lock);
+int cl_wait_try (const struct lu_env *env, struct cl_lock *lock);
+int cl_use_try (const struct lu_env *env, struct cl_lock *lock, int atomic);
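+
+/*
+ * The "non-try" functions are, in essence, loops around their "try"
+ * counterparts; a simplified sketch of the enqueue case (lock mutex
+ * management is elided, and CLO_REPEAT is handled inside the "try"
+ * function itself):
+ *
+ * \code
+ *	while (1) {
+ *		result = cl_enqueue_try(env, lock, io, flags);
+ *		if (result == CLO_WAIT) {
+ *			result = cl_lock_state_wait(env, lock);
+ *			if (result == 0)
+ *				continue;
+ *		}
+ *		break;
+ *	}
+ * \endcode
+ */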
+
+/** @} statemachine */
+
+void cl_lock_signal (const struct lu_env *env, struct cl_lock *lock);
+int cl_lock_state_wait (const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_state_set (const struct lu_env *env, struct cl_lock *lock,
+ enum cl_lock_state state);
+int cl_queue_match (const struct list_head *queue,
+ const struct cl_lock_descr *need);
+
+void cl_lock_mutex_get (const struct lu_env *env, struct cl_lock *lock);
+int cl_lock_mutex_try (const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_mutex_put (const struct lu_env *env, struct cl_lock *lock);
+int cl_lock_is_mutexed (struct cl_lock *lock);
+int cl_lock_nr_mutexed (const struct lu_env *env);
+int cl_lock_discard_pages(const struct lu_env *env, struct cl_lock *lock);
+int cl_lock_ext_match (const struct cl_lock_descr *has,
+ const struct cl_lock_descr *need);
+int cl_lock_descr_match(const struct cl_lock_descr *has,
+ const struct cl_lock_descr *need);
+int cl_lock_mode_match (enum cl_lock_mode has, enum cl_lock_mode need);
+int cl_lock_modify (const struct lu_env *env, struct cl_lock *lock,
+ const struct cl_lock_descr *desc);
+
+void cl_lock_closure_init (const struct lu_env *env,
+ struct cl_lock_closure *closure,
+ struct cl_lock *origin, int wait);
+void cl_lock_closure_fini (struct cl_lock_closure *closure);
+int cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock,
+ struct cl_lock_closure *closure);
+void cl_lock_disclosure (const struct lu_env *env,
+ struct cl_lock_closure *closure);
+int cl_lock_enclosure (const struct lu_env *env, struct cl_lock *lock,
+ struct cl_lock_closure *closure);
+
+void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_error (const struct lu_env *env, struct cl_lock *lock, int error);
+void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int wait);
+
+unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock);
+
+/** @} cl_lock */
+
+/** \defgroup cl_io cl_io
+ * @{ */
+
+int cl_io_init (const struct lu_env *env, struct cl_io *io,
+ enum cl_io_type iot, struct cl_object *obj);
+int cl_io_sub_init (const struct lu_env *env, struct cl_io *io,
+ enum cl_io_type iot, struct cl_object *obj);
+int cl_io_rw_init (const struct lu_env *env, struct cl_io *io,
+ enum cl_io_type iot, loff_t pos, size_t count);
+int cl_io_loop (const struct lu_env *env, struct cl_io *io);
+
+void cl_io_fini (const struct lu_env *env, struct cl_io *io);
+int cl_io_iter_init (const struct lu_env *env, struct cl_io *io);
+void cl_io_iter_fini (const struct lu_env *env, struct cl_io *io);
+int cl_io_lock (const struct lu_env *env, struct cl_io *io);
+void cl_io_unlock (const struct lu_env *env, struct cl_io *io);
+int cl_io_start (const struct lu_env *env, struct cl_io *io);
+void cl_io_end (const struct lu_env *env, struct cl_io *io);
+int cl_io_lock_add (const struct lu_env *env, struct cl_io *io,
+ struct cl_io_lock_link *link);
+int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io,
+ struct cl_lock_descr *descr);
+int cl_io_read_page (const struct lu_env *env, struct cl_io *io,
+ struct cl_page *page);
+int cl_io_prepare_write(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *page, unsigned from, unsigned to);
+int cl_io_commit_write (const struct lu_env *env, struct cl_io *io,
+ struct cl_page *page, unsigned from, unsigned to);
+int cl_io_submit_rw (const struct lu_env *env, struct cl_io *io,
+ enum cl_req_type iot, struct cl_2queue *queue);
+int cl_io_submit_sync (const struct lu_env *env, struct cl_io *io,
+ enum cl_req_type iot, struct cl_2queue *queue,
+ long timeout);
+void cl_io_rw_advance (const struct lu_env *env, struct cl_io *io,
+ size_t nob);
+int cl_io_cancel (const struct lu_env *env, struct cl_io *io,
+ struct cl_page_list *queue);
+int cl_io_is_going (const struct lu_env *env);
+
+/**
+ * True, iff \a io is an O_APPEND write(2).
+ */
+static inline int cl_io_is_append(const struct cl_io *io)
+{
+ return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_append;
+}
+
+static inline int cl_io_is_sync_write(const struct cl_io *io)
+{
+ return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_sync;
+}
+
+static inline int cl_io_is_mkwrite(const struct cl_io *io)
+{
+ return io->ci_type == CIT_FAULT && io->u.ci_fault.ft_mkwrite;
+}
+
+/**
+ * True, iff \a io is a truncate(2).
+ */
+static inline int cl_io_is_trunc(const struct cl_io *io)
+{
+ return io->ci_type == CIT_SETATTR &&
+ (io->u.ci_setattr.sa_valid & ATTR_SIZE);
+}
+
+struct cl_io *cl_io_top(struct cl_io *io);
+
+void cl_io_print(const struct lu_env *env, void *cookie,
+ lu_printer_t printer, const struct cl_io *io);
+
+#define CL_IO_SLICE_CLEAN(foo_io, base) \
+do { \
+ typeof(foo_io) __foo_io = (foo_io); \
+ \
+ CLASSERT(offsetof(typeof(*__foo_io), base) == 0); \
+ memset(&__foo_io->base + 1, 0, \
+ (sizeof *__foo_io) - sizeof __foo_io->base); \
+} while (0)
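+
+/*
+ * Usage sketch: a hypothetical "foo" layer whose io state embeds the
+ * generic slice as its first member can reset everything past that member
+ * at the start of each iteration (foo_env_io() is a placeholder accessor
+ * for the per-thread state):
+ *
+ * \code
+ *	struct foo_io {
+ *		struct cl_io_slice fi_cl;
+ *		int		   fi_state;
+ *	};
+ *
+ *	struct foo_io *fio = foo_env_io(env);
+ *
+ *	CL_IO_SLICE_CLEAN(fio, fi_cl);
+ * \endcode
+ */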
+
+/** @} cl_io */
+
+/** \defgroup cl_page_list cl_page_list
+ * @{ */
+
+/**
+ * Last page in the page list.
+ */
+static inline struct cl_page *cl_page_list_last(struct cl_page_list *plist)
+{
+ LASSERT(plist->pl_nr > 0);
+ return list_entry(plist->pl_pages.prev, struct cl_page, cp_batch);
+}
+
+/**
+ * Iterate over pages in a page list.
+ */
+#define cl_page_list_for_each(page, list) \
+ list_for_each_entry((page), &(list)->pl_pages, cp_batch)
+
+/**
+ * Iterate over pages in a page list, taking possible removals into account.
+ */
+#define cl_page_list_for_each_safe(page, temp, list) \
+ list_for_each_entry_safe((page), (temp), &(list)->pl_pages, cp_batch)
+
+void cl_page_list_init (struct cl_page_list *plist);
+void cl_page_list_add (struct cl_page_list *plist, struct cl_page *page);
+void cl_page_list_move (struct cl_page_list *dst, struct cl_page_list *src,
+ struct cl_page *page);
+void cl_page_list_splice (struct cl_page_list *list,
+ struct cl_page_list *head);
+void cl_page_list_del (const struct lu_env *env,
+ struct cl_page_list *plist, struct cl_page *page);
+void cl_page_list_disown (const struct lu_env *env,
+ struct cl_io *io, struct cl_page_list *plist);
+int cl_page_list_own (const struct lu_env *env,
+ struct cl_io *io, struct cl_page_list *plist);
+void cl_page_list_assume (const struct lu_env *env,
+ struct cl_io *io, struct cl_page_list *plist);
+void cl_page_list_discard(const struct lu_env *env,
+ struct cl_io *io, struct cl_page_list *plist);
+int cl_page_list_unmap (const struct lu_env *env,
+ struct cl_io *io, struct cl_page_list *plist);
+void cl_page_list_fini (const struct lu_env *env, struct cl_page_list *plist);
+
+void cl_2queue_init (struct cl_2queue *queue);
+void cl_2queue_add (struct cl_2queue *queue, struct cl_page *page);
+void cl_2queue_disown (const struct lu_env *env,
+ struct cl_io *io, struct cl_2queue *queue);
+void cl_2queue_assume (const struct lu_env *env,
+ struct cl_io *io, struct cl_2queue *queue);
+void cl_2queue_discard (const struct lu_env *env,
+ struct cl_io *io, struct cl_2queue *queue);
+void cl_2queue_fini (const struct lu_env *env, struct cl_2queue *queue);
+void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page);
+
+/** @} cl_page_list */
+
+/** \defgroup cl_req cl_req
+ * @{ */
+struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page,
+ enum cl_req_type crt, int nr_objects);
+
+void cl_req_page_add (const struct lu_env *env, struct cl_req *req,
+ struct cl_page *page);
+void cl_req_page_done (const struct lu_env *env, struct cl_page *page);
+int cl_req_prep (const struct lu_env *env, struct cl_req *req);
+void cl_req_attr_set (const struct lu_env *env, struct cl_req *req,
+ struct cl_req_attr *attr, obd_valid flags);
+void cl_req_completion(const struct lu_env *env, struct cl_req *req, int ioret);
+
+/** \defgroup cl_sync_io cl_sync_io
+ * @{ */
+
+/**
+ * Anchor for synchronous transfer. This is allocated on the stack by the
+ * thread doing a synchronous transfer, and a pointer to this structure is
+ * set up in every page submitted for transfer. The transfer completion
+ * routine updates the anchor and wakes up the waiting thread when the
+ * transfer is complete.
+ */
+struct cl_sync_io {
+ /** number of pages yet to be transferred. */
+ atomic_t csi_sync_nr;
+ /** error code. */
+ int csi_sync_rc;
+ /** barrier for destroying this structure */
+ atomic_t csi_barrier;
+ /** completion to be signaled when transfer is complete. */
+ wait_queue_head_t csi_waitq;
+};
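+
+/*
+ * cl_io_submit_sync() (declared earlier in this header) is built around this
+ * pattern: initialize the anchor for the number of pages, submit every page
+ * with a pointer to the anchor, then wait. A sketch, with the submission
+ * step reduced to a placeholder submit_pages():
+ *
+ * \code
+ *	struct cl_sync_io anchor;
+ *	int rc;
+ *
+ *	cl_sync_io_init(&anchor, queue->pl_nr);
+ *	submit_pages(env, io, queue, &anchor);
+ *	rc = cl_sync_io_wait(env, io, queue, &anchor, timeout);
+ * \endcode
+ */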
+
+void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages);
+int cl_sync_io_wait(const struct lu_env *env, struct cl_io *io,
+ struct cl_page_list *queue, struct cl_sync_io *anchor,
+ long timeout);
+void cl_sync_io_note(struct cl_sync_io *anchor, int ioret);
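+
+/*
+ * Typical synchronous-transfer pattern (a sketch, not a definitive
+ * sequence; the submit step is an assumed helper and error handling is
+ * trimmed): initialize the anchor for the queued pages, submit them,
+ * then wait until every completion has called cl_sync_io_note().
+ *
+ *	struct cl_sync_io anchor;
+ *
+ *	cl_sync_io_init(&anchor, queue->pl_nr);
+ *	rc = submit_queue(env, io, queue);	// assumed submit helper
+ *	if (rc == 0)
+ *		rc = cl_sync_io_wait(env, io, queue, &anchor, timeout);
+ */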
+
+/** @} cl_sync_io */
+
+/** @} cl_req */
+
+/** \defgroup cl_env cl_env
+ *
+ * lu_env handling for a client.
+ *
+ * lu_env is an environment within which lustre code executes. Its major part
+ * is lu_context---a fast memory allocation mechanism that is used to conserve
+ * precious kernel stack space. Originally lu_env was designed for a server,
+ * where
+ *
+ * - there is a (mostly) fixed number of threads, and
+ *
+ * - call chains have no non-lustre portions inserted between lustre code.
+ *
+ * On a client both of these assumptions fail, because every user thread can
+ * potentially execute lustre code as part of a system call, and lustre calls
+ * into VFS or MM that call back into lustre.
+ *
+ * To deal with that, cl_env wrapper functions implement the following
+ * optimizations:
+ *
+ * - allocation and destruction of environments are amortized by caching
+ *   no longer used environments instead of destroying them;
+ *
+ * - there is a notion of "current" environment, attached to the kernel
+ *   data structure representing the current thread. Top-level lustre code
+ *   allocates an environment and makes it current, then calls into
+ *   non-lustre code, which in turn calls back into lustre. The low-level
+ *   lustre code thus called can fetch the environment created by the
+ *   top-level code and reuse it, avoiding an additional environment
+ *   allocation.
+ *   Right now, three interfaces can attach a cl_env to the running thread:
+ *   - cl_env_get
+ *   - cl_env_implant
+ *   - cl_env_reexit (cl_env_reenter must have been called beforehand)
+ *
+ * \see lu_env, lu_context, lu_context_key
+ * @{ */
+
+struct cl_env_nest {
+ int cen_refcheck;
+ void *cen_cookie;
+};
+
+struct lu_env *cl_env_peek (int *refcheck);
+struct lu_env *cl_env_get (int *refcheck);
+struct lu_env *cl_env_alloc (int *refcheck, __u32 tags);
+struct lu_env *cl_env_nested_get (struct cl_env_nest *nest);
+void cl_env_put (struct lu_env *env, int *refcheck);
+void cl_env_nested_put (struct cl_env_nest *nest, struct lu_env *env);
+void *cl_env_reenter (void);
+void cl_env_reexit (void *cookie);
+void cl_env_implant (struct lu_env *env, int *refcheck);
+void cl_env_unplant (struct lu_env *env, int *refcheck);
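+
+/*
+ * Typical client pattern (a sketch): acquire the "current" environment,
+ * use it, and release it; the refcheck cookie pairs cl_env_get() with
+ * cl_env_put().
+ *
+ *	int refcheck;
+ *	struct lu_env *env;
+ *
+ *	env = cl_env_get(&refcheck);
+ *	if (IS_ERR(env))
+ *		return PTR_ERR(env);
+ *	// ... call cl_* interfaces with env ...
+ *	cl_env_put(env, &refcheck);
+ */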
+
+/** @} cl_env */
+
+/*
+ * Misc
+ */
+void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr);
+void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb);
+
+struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site,
+ struct lu_device_type *ldt,
+ struct lu_device *next);
+/** @} clio */
+
+int cl_global_init(void);
+void cl_global_fini(void);
+
+#endif /* _LINUX_CL_OBJECT_H */
diff --git a/drivers/staging/lustre/lustre/include/dt_object.h b/drivers/staging/lustre/lustre/include/dt_object.h
new file mode 100644
index 000000000000..e116bb21b529
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/dt_object.h
@@ -0,0 +1,1498 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LUSTRE_DT_OBJECT_H
+#define __LUSTRE_DT_OBJECT_H
+
+/** \defgroup dt dt
+ * Sub-class of lu_object with methods common for "data" objects in OST stack.
+ *
+ * Data objects behave like regular files: you can read/write them, and get
+ * and set their attributes. An implementation of the dt interface is
+ * supposed to provide some form of garbage collection, normally one based
+ * on reference counting (nlink).
+ *
+ * Example: osd (lustre/osd) is an implementation of the dt interface.
+ * @{
+ */
+
+
+/*
+ * super-class definitions.
+ */
+#include <lu_object.h>
+
+#include <linux/libcfs/libcfs.h>
+
+struct seq_file;
+struct proc_dir_entry;
+struct lustre_cfg;
+
+struct thandle;
+struct dt_device;
+struct dt_object;
+struct dt_index_features;
+struct niobuf_local;
+struct niobuf_remote;
+struct ldlm_enqueue_info;
+
+typedef enum {
+ MNTOPT_USERXATTR = 0x00000001,
+ MNTOPT_ACL = 0x00000002,
+} mntopt_t;
+
+struct dt_device_param {
+ unsigned ddp_max_name_len;
+ unsigned ddp_max_nlink;
+ unsigned ddp_block_shift;
+ mntopt_t ddp_mntopts;
+ unsigned ddp_max_ea_size;
+ void *ddp_mnt; /* XXX: old code can retrieve mnt -bzzz */
+ int ddp_mount_type;
+ unsigned long long ddp_maxbytes;
+ /* percentage of available space to reserve for grant error margin */
+ int ddp_grant_reserved;
+ /* per-inode space consumption */
+ short ddp_inodespace;
+ /* per-fragment grant overhead to be used by client for grant
+ * calculation */
+ int ddp_grant_frag;
+};
+
+/**
+ * Per-transaction commit callback function
+ */
+struct dt_txn_commit_cb;
+typedef void (*dt_cb_t)(struct lu_env *env, struct thandle *th,
+ struct dt_txn_commit_cb *cb, int err);
+/**
+ * Special per-transaction callback for cases when just commit callback
+ * is needed and per-device callback are not convenient to use
+ */
+#define TRANS_COMMIT_CB_MAGIC 0xa0a00a0a
+#define MAX_COMMIT_CB_STR_LEN 32
+
+struct dt_txn_commit_cb {
+ struct list_head dcb_linkage;
+ dt_cb_t dcb_func;
+ __u32 dcb_magic;
+ char dcb_name[MAX_COMMIT_CB_STR_LEN];
+};
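+
+/*
+ * Sketch of registering a commit callback (illustrative; the callback
+ * body, its name and the allocation of "dcb" are assumptions).
+ * dt_trans_cb_add() below fills in dcb_magic before passing the
+ * callback to the device.
+ *
+ *	static void my_commit_cb(struct lu_env *env, struct thandle *th,
+ *				 struct dt_txn_commit_cb *dcb, int err)
+ *	{
+ *		// the transaction is on disk; "err" is the commit result
+ *	}
+ *
+ *	dcb->dcb_func = my_commit_cb;
+ *	strlcpy(dcb->dcb_name, "my_cb", sizeof(dcb->dcb_name));
+ *	rc = dt_trans_cb_add(th, dcb);
+ */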
+
+/**
+ * Operations on dt device.
+ */
+struct dt_device_operations {
+ /**
+ * Return device-wide statistics.
+ */
+ int (*dt_statfs)(const struct lu_env *env,
+ struct dt_device *dev, struct obd_statfs *osfs);
+ /**
+	 * Create transaction.
+ */
+ struct thandle *(*dt_trans_create)(const struct lu_env *env,
+ struct dt_device *dev);
+ /**
+	 * Start previously created transaction \a th.
+ */
+ int (*dt_trans_start)(const struct lu_env *env,
+ struct dt_device *dev, struct thandle *th);
+ /**
+ * Finish previously started transaction.
+ */
+ int (*dt_trans_stop)(const struct lu_env *env,
+ struct thandle *th);
+ /**
+ * Add commit callback to the transaction.
+ */
+ int (*dt_trans_cb_add)(struct thandle *th,
+ struct dt_txn_commit_cb *dcb);
+ /**
+ * Return fid of root index object.
+ */
+ int (*dt_root_get)(const struct lu_env *env,
+ struct dt_device *dev, struct lu_fid *f);
+ /**
+ * Return device configuration data.
+ */
+ void (*dt_conf_get)(const struct lu_env *env,
+ const struct dt_device *dev,
+ struct dt_device_param *param);
+ /**
+ * handling device state, mostly for tests
+ */
+ int (*dt_sync)(const struct lu_env *env, struct dt_device *dev);
+ int (*dt_ro)(const struct lu_env *env, struct dt_device *dev);
+ /**
+ * Start a transaction commit asynchronously
+ *
+ * \param env environment
+ * \param dev dt_device to start commit on
+ *
+ * \return 0 success, negative value if error
+ */
+ int (*dt_commit_async)(const struct lu_env *env,
+ struct dt_device *dev);
+ /**
+ * Initialize capability context.
+ */
+ int (*dt_init_capa_ctxt)(const struct lu_env *env,
+ struct dt_device *dev,
+ int mode, unsigned long timeout,
+ __u32 alg, struct lustre_capa_key *keys);
+};
+
+struct dt_index_features {
+ /** required feature flags from enum dt_index_flags */
+ __u32 dif_flags;
+ /** minimal required key size */
+ size_t dif_keysize_min;
+ /** maximal required key size, 0 if no limit */
+ size_t dif_keysize_max;
+ /** minimal required record size */
+ size_t dif_recsize_min;
+ /** maximal required record size, 0 if no limit */
+ size_t dif_recsize_max;
+ /** pointer size for record */
+ size_t dif_ptrsize;
+};
+
+enum dt_index_flags {
+ /** index supports variable sized keys */
+ DT_IND_VARKEY = 1 << 0,
+ /** index supports variable sized records */
+ DT_IND_VARREC = 1 << 1,
+ /** index can be modified */
+ DT_IND_UPDATE = 1 << 2,
+ /** index supports records with non-unique (duplicate) keys */
+ DT_IND_NONUNQ = 1 << 3,
+ /**
+	 * index supports fixed-size keys sorted in natural numerical order
+	 * and is able to return the left-side value if no exact match is found
+ */
+ DT_IND_RANGE = 1 << 4,
+};
+
+/**
+ * Features, required from index to support file system directories (mapping
+ * names to fids).
+ */
+extern const struct dt_index_features dt_directory_features;
+extern const struct dt_index_features dt_otable_features;
+extern const struct dt_index_features dt_lfsck_features;
+
+/* index features supported by the accounting objects */
+extern const struct dt_index_features dt_acct_features;
+
+/* index features supported by the quota global indexes */
+extern const struct dt_index_features dt_quota_glb_features;
+
+/* index features supported by the quota slave indexes */
+extern const struct dt_index_features dt_quota_slv_features;
+
+/**
+ * This is a general purpose dt allocation hint.
+ * Currently it contains only the parent object;
+ * it may carry additional allocation hints in the future.
+ */
+struct dt_allocation_hint {
+ struct dt_object *dah_parent;
+ __u32 dah_mode;
+};
+
+/**
+ * object type specifier.
+ */
+
+enum dt_format_type {
+ DFT_REGULAR,
+ DFT_DIR,
+ /** for mknod */
+ DFT_NODE,
+ /** for special index */
+ DFT_INDEX,
+ /** for symbolic link */
+ DFT_SYM,
+};
+
+/**
+ * object format specifier.
+ */
+struct dt_object_format {
+ /** type for dt object */
+ enum dt_format_type dof_type;
+ union {
+ struct dof_regular {
+ int striped;
+ } dof_reg;
+ struct dof_dir {
+ } dof_dir;
+ struct dof_node {
+ } dof_node;
+ /**
+		 * a special index needs features as a parameter to create
+		 * the special index
+ */
+ struct dof_index {
+ const struct dt_index_features *di_feat;
+ } dof_idx;
+ } u;
+};
+
+enum dt_format_type dt_mode_to_dft(__u32 mode);
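+
+/*
+ * Example (a sketch under the definitions above): filling a
+ * dt_object_format to create an index with directory features;
+ * dt_directory_features is declared earlier in this header, the rest
+ * is illustrative.
+ *
+ *	struct dt_object_format dof;
+ *
+ *	dof.dof_type = DFT_INDEX;
+ *	dof.u.dof_idx.di_feat = &dt_directory_features;
+ */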
+
+typedef __u64 dt_obj_version_t;
+
+/**
+ * Per-dt-object operations.
+ */
+struct dt_object_operations {
+ void (*do_read_lock)(const struct lu_env *env,
+ struct dt_object *dt, unsigned role);
+ void (*do_write_lock)(const struct lu_env *env,
+ struct dt_object *dt, unsigned role);
+ void (*do_read_unlock)(const struct lu_env *env,
+ struct dt_object *dt);
+ void (*do_write_unlock)(const struct lu_env *env,
+ struct dt_object *dt);
+ int (*do_write_locked)(const struct lu_env *env,
+ struct dt_object *dt);
+ /**
+	 * Note: the following ->do_{x,}attr_{set,get}() operations are very
+ * similar to ->moo_{x,}attr_{set,get}() operations in struct
+ * md_object_operations (see md_object.h). These operations are not in
+ * lu_object_operations, because ->do_{x,}attr_set() versions take
+ * transaction handle as an argument (this transaction is started by
+ * caller). We might factor ->do_{x,}attr_get() into
+ * lu_object_operations, but that would break existing symmetry.
+ */
+
+ /**
+ * Return standard attributes.
+ *
+ * precondition: lu_object_exists(&dt->do_lu);
+ */
+ int (*do_attr_get)(const struct lu_env *env,
+ struct dt_object *dt, struct lu_attr *attr,
+ struct lustre_capa *capa);
+ /**
+ * Set standard attributes.
+ *
+ * precondition: dt_object_exists(dt);
+ */
+ int (*do_declare_attr_set)(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct lu_attr *attr,
+ struct thandle *handle);
+ int (*do_attr_set)(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct lu_attr *attr,
+ struct thandle *handle,
+ struct lustre_capa *capa);
+ /**
+ * Return a value of an extended attribute.
+ *
+ * precondition: dt_object_exists(dt);
+ */
+ int (*do_xattr_get)(const struct lu_env *env, struct dt_object *dt,
+ struct lu_buf *buf, const char *name,
+ struct lustre_capa *capa);
+ /**
+ * Set value of an extended attribute.
+ *
+ * \a fl - flags from enum lu_xattr_flags
+ *
+ * precondition: dt_object_exists(dt);
+ */
+ int (*do_declare_xattr_set)(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct lu_buf *buf,
+ const char *name, int fl,
+ struct thandle *handle);
+ int (*do_xattr_set)(const struct lu_env *env,
+ struct dt_object *dt, const struct lu_buf *buf,
+ const char *name, int fl, struct thandle *handle,
+ struct lustre_capa *capa);
+ /**
+ * Delete existing extended attribute.
+ *
+ * precondition: dt_object_exists(dt);
+ */
+ int (*do_declare_xattr_del)(const struct lu_env *env,
+ struct dt_object *dt,
+ const char *name, struct thandle *handle);
+ int (*do_xattr_del)(const struct lu_env *env,
+ struct dt_object *dt,
+ const char *name, struct thandle *handle,
+ struct lustre_capa *capa);
+ /**
+ * Place list of existing extended attributes into \a buf (which has
+ * length len).
+ *
+ * precondition: dt_object_exists(dt);
+ */
+ int (*do_xattr_list)(const struct lu_env *env,
+ struct dt_object *dt, struct lu_buf *buf,
+ struct lustre_capa *capa);
+ /**
+ * Init allocation hint using parent object and child mode.
+	 * (1) The \a parent might be NULL if this is a partial creation for
+	 *     a remote object.
+ * (2) The type of child is in \a child_mode.
+ * (3) The result hint is stored in \a ah;
+ */
+ void (*do_ah_init)(const struct lu_env *env,
+ struct dt_allocation_hint *ah,
+ struct dt_object *parent,
+ struct dt_object *child,
+ umode_t child_mode);
+ /**
+ * Create new object on this device.
+ *
+ * precondition: !dt_object_exists(dt);
+ * postcondition: ergo(result == 0, dt_object_exists(dt));
+ */
+ int (*do_declare_create)(const struct lu_env *env,
+ struct dt_object *dt,
+ struct lu_attr *attr,
+ struct dt_allocation_hint *hint,
+ struct dt_object_format *dof,
+ struct thandle *th);
+ int (*do_create)(const struct lu_env *env, struct dt_object *dt,
+ struct lu_attr *attr,
+ struct dt_allocation_hint *hint,
+ struct dt_object_format *dof,
+ struct thandle *th);
+
+	/**
+	 * Destroy object on this device.
+	 *
+	 * precondition: dt_object_exists(dt);
+	 * postcondition: ergo(result == 0, !dt_object_exists(dt));
+	 */
+ int (*do_declare_destroy)(const struct lu_env *env,
+ struct dt_object *dt,
+ struct thandle *th);
+ int (*do_destroy)(const struct lu_env *env, struct dt_object *dt,
+ struct thandle *th);
+
+ /**
+ * Announce that this object is going to be used as an index. This
+	 * operation checks that the object supports indexing operations and
+	 * installs the appropriate dt_index_operations vector on success.
+ *
+ * Also probes for features. Operation is successful if all required
+ * features are supported.
+ */
+ int (*do_index_try)(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct dt_index_features *feat);
+ /**
+ * Add nlink of the object
+ * precondition: dt_object_exists(dt);
+ */
+ int (*do_declare_ref_add)(const struct lu_env *env,
+ struct dt_object *dt, struct thandle *th);
+ int (*do_ref_add)(const struct lu_env *env,
+ struct dt_object *dt, struct thandle *th);
+ /**
+ * Del nlink of the object
+ * precondition: dt_object_exists(dt);
+ */
+ int (*do_declare_ref_del)(const struct lu_env *env,
+ struct dt_object *dt, struct thandle *th);
+ int (*do_ref_del)(const struct lu_env *env,
+ struct dt_object *dt, struct thandle *th);
+
+ struct obd_capa *(*do_capa_get)(const struct lu_env *env,
+ struct dt_object *dt,
+ struct lustre_capa *old,
+ __u64 opc);
+ int (*do_object_sync)(const struct lu_env *, struct dt_object *);
+ /**
+	 * Get object info from the next level. Currently only used to get
+	 * the inode from osd. This is only used by quota; see b=16542.
+ * precondition: dt_object_exists(dt);
+ */
+ int (*do_data_get)(const struct lu_env *env, struct dt_object *dt,
+ void **data);
+
+ /**
+ * Lock object.
+ */
+ int (*do_object_lock)(const struct lu_env *env, struct dt_object *dt,
+ struct lustre_handle *lh,
+ struct ldlm_enqueue_info *einfo,
+ void *policy);
+};
+
+/**
+ * Per-dt-object operations on "file body".
+ */
+struct dt_body_operations {
+ /**
+ * precondition: dt_object_exists(dt);
+ */
+ ssize_t (*dbo_read)(const struct lu_env *env, struct dt_object *dt,
+ struct lu_buf *buf, loff_t *pos,
+ struct lustre_capa *capa);
+ /**
+ * precondition: dt_object_exists(dt);
+ */
+ ssize_t (*dbo_declare_write)(const struct lu_env *env,
+ struct dt_object *dt,
+ const loff_t size, loff_t pos,
+ struct thandle *handle);
+ ssize_t (*dbo_write)(const struct lu_env *env, struct dt_object *dt,
+ const struct lu_buf *buf, loff_t *pos,
+ struct thandle *handle, struct lustre_capa *capa,
+ int ignore_quota);
+ /*
+ * methods for zero-copy IO
+ */
+
+ /*
+ * precondition: dt_object_exists(dt);
+ * returns:
+ * < 0 - error code
+ * = 0 - illegal
+ * > 0 - number of local buffers prepared
+ */
+ int (*dbo_bufs_get)(const struct lu_env *env, struct dt_object *dt,
+ loff_t pos, ssize_t len, struct niobuf_local *lb,
+ int rw, struct lustre_capa *capa);
+ /*
+ * precondition: dt_object_exists(dt);
+ */
+ int (*dbo_bufs_put)(const struct lu_env *env, struct dt_object *dt,
+ struct niobuf_local *lb, int nr);
+ /*
+ * precondition: dt_object_exists(dt);
+ */
+ int (*dbo_write_prep)(const struct lu_env *env, struct dt_object *dt,
+ struct niobuf_local *lb, int nr);
+ /*
+ * precondition: dt_object_exists(dt);
+ */
+ int (*dbo_declare_write_commit)(const struct lu_env *env,
+ struct dt_object *dt,
+ struct niobuf_local *,
+ int, struct thandle *);
+ /*
+ * precondition: dt_object_exists(dt);
+ */
+ int (*dbo_write_commit)(const struct lu_env *env, struct dt_object *dt,
+ struct niobuf_local *, int, struct thandle *);
+ /*
+ * precondition: dt_object_exists(dt);
+ */
+ int (*dbo_read_prep)(const struct lu_env *env, struct dt_object *dt,
+ struct niobuf_local *lnb, int nr);
+ int (*dbo_fiemap_get)(const struct lu_env *env, struct dt_object *dt,
+ struct ll_user_fiemap *fm);
+ /**
+ * Punch object's content
+ * precondition: regular object, not index
+ */
+ int (*dbo_declare_punch)(const struct lu_env *, struct dt_object *,
+ __u64, __u64, struct thandle *th);
+ int (*dbo_punch)(const struct lu_env *env, struct dt_object *dt,
+ __u64 start, __u64 end, struct thandle *th,
+ struct lustre_capa *capa);
+};
+
+/**
+ * Incomplete type of index record.
+ */
+struct dt_rec;
+
+/**
+ * Incomplete type of index key.
+ */
+struct dt_key;
+
+/**
+ * Incomplete type of dt iterator.
+ */
+struct dt_it;
+
+/**
+ * Per-dt-object operations on object as index.
+ */
+struct dt_index_operations {
+ /**
+ * precondition: dt_object_exists(dt);
+ */
+ int (*dio_lookup)(const struct lu_env *env, struct dt_object *dt,
+ struct dt_rec *rec, const struct dt_key *key,
+ struct lustre_capa *capa);
+ /**
+ * precondition: dt_object_exists(dt);
+ */
+ int (*dio_declare_insert)(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct dt_rec *rec,
+ const struct dt_key *key,
+ struct thandle *handle);
+ int (*dio_insert)(const struct lu_env *env, struct dt_object *dt,
+ const struct dt_rec *rec, const struct dt_key *key,
+ struct thandle *handle, struct lustre_capa *capa,
+ int ignore_quota);
+ /**
+ * precondition: dt_object_exists(dt);
+ */
+ int (*dio_declare_delete)(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct dt_key *key,
+ struct thandle *handle);
+ int (*dio_delete)(const struct lu_env *env, struct dt_object *dt,
+ const struct dt_key *key, struct thandle *handle,
+ struct lustre_capa *capa);
+ /**
+ * Iterator interface
+ */
+ struct dt_it_ops {
+ /**
+ * Allocate and initialize new iterator.
+ *
+ * precondition: dt_object_exists(dt);
+ */
+ struct dt_it *(*init)(const struct lu_env *env,
+ struct dt_object *dt,
+ __u32 attr,
+ struct lustre_capa *capa);
+ void (*fini)(const struct lu_env *env,
+ struct dt_it *di);
+ int (*get)(const struct lu_env *env,
+ struct dt_it *di,
+ const struct dt_key *key);
+ void (*put)(const struct lu_env *env,
+ struct dt_it *di);
+ int (*next)(const struct lu_env *env,
+ struct dt_it *di);
+ struct dt_key *(*key)(const struct lu_env *env,
+ const struct dt_it *di);
+ int (*key_size)(const struct lu_env *env,
+ const struct dt_it *di);
+ int (*rec)(const struct lu_env *env,
+ const struct dt_it *di,
+ struct dt_rec *rec,
+ __u32 attr);
+ __u64 (*store)(const struct lu_env *env,
+ const struct dt_it *di);
+ int (*load)(const struct lu_env *env,
+ const struct dt_it *di, __u64 hash);
+ int (*key_rec)(const struct lu_env *env,
+ const struct dt_it *di, void* key_rec);
+ } dio_it;
+};
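+
+/*
+ * Iterator usage sketch (illustrative; return-code conventions are
+ * simplified and error handling is trimmed): walk all records of an
+ * index object "obj" that already passed ->do_index_try().
+ *
+ *	const struct dt_it_ops *iops = &obj->do_index_ops->dio_it;
+ *	struct dt_it *it;
+ *	int rc;
+ *
+ *	it = iops->init(env, obj, 0, NULL);
+ *	rc = iops->load(env, it, 0);		// position at the start
+ *	while (rc == 0) {
+ *		struct dt_key *key = iops->key(env, it);
+ *		// consume key and/or ->rec() here
+ *		rc = iops->next(env, it);	// > 0 means end of index
+ *	}
+ *	iops->fini(env, it);
+ */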
+
+enum dt_otable_it_valid {
+ DOIV_ERROR_HANDLE = 0x0001,
+};
+
+enum dt_otable_it_flags {
+ /* Exit when fail. */
+ DOIF_FAILOUT = 0x0001,
+
+ /* Reset iteration position to the device beginning. */
+ DOIF_RESET = 0x0002,
+
+	/* An upper-layer component is using the iteration. */
+ DOIF_OUTUSED = 0x0004,
+};
+
+/* otable based iteration needs to use the common DT iteration APIs.
+ * To initialize the iteration, dio_it::init() must be called first.
+ * Here is how the otable based iteration should prepare the arguments
+ * for dt_it_ops::init().
+ *
+ * For otable based iteration, the 32-bit 'attr' for dt_it_ops::init()
+ * is composed of two parts: the low 16 bits are the valid bits and the
+ * high 16 bits are the flag bits. */
+#define DT_OTABLE_IT_FLAGS_SHIFT 16
+#define DT_OTABLE_IT_FLAGS_MASK 0xffff0000
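+
+/*
+ * Sketch: composing the 32-bit 'attr' for an otable-based
+ * dt_it_ops::init() call from the valid and flag bits described above
+ * (the exact combination chosen here is illustrative).
+ *
+ *	__u32 attr = DOIV_ERROR_HANDLE |
+ *		     (DOIF_RESET << DT_OTABLE_IT_FLAGS_SHIFT);
+ */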
+
+struct dt_device {
+ struct lu_device dd_lu_dev;
+ const struct dt_device_operations *dd_ops;
+
+ /**
+ * List of dt_txn_callback (see below). This is not protected in any
+ * way, because callbacks are supposed to be added/deleted only during
+	 * single-threaded start-up/shut-down procedures.
+ */
+ struct list_head dd_txn_callbacks;
+};
+
+int dt_device_init(struct dt_device *dev, struct lu_device_type *t);
+void dt_device_fini(struct dt_device *dev);
+
+static inline int lu_device_is_dt(const struct lu_device *d)
+{
+ return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_DT);
+}
+
+static inline struct dt_device *lu2dt_dev(struct lu_device *l)
+{
+ LASSERT(lu_device_is_dt(l));
+ return container_of0(l, struct dt_device, dd_lu_dev);
+}
+
+struct dt_object {
+ struct lu_object do_lu;
+ const struct dt_object_operations *do_ops;
+ const struct dt_body_operations *do_body_ops;
+ const struct dt_index_operations *do_index_ops;
+};
+
+/*
+ * In-core representation of per-device local object OID storage
+ */
+struct local_oid_storage {
+ /* all initialized llog systems on this node linked by this */
+ struct list_head los_list;
+
+ /* how many handle's reference this los has */
+ atomic_t los_refcount;
+ struct dt_device *los_dev;
+ struct dt_object *los_obj;
+
+ /* data used to generate new fids */
+ struct mutex los_id_lock;
+ __u64 los_seq;
+ __u32 los_last_oid;
+};
+
+static inline struct dt_object *lu2dt(struct lu_object *l)
+{
+ LASSERT(l == NULL || IS_ERR(l) || lu_device_is_dt(l->lo_dev));
+ return container_of0(l, struct dt_object, do_lu);
+}
+
+int dt_object_init(struct dt_object *obj,
+ struct lu_object_header *h, struct lu_device *d);
+
+void dt_object_fini(struct dt_object *obj);
+
+static inline int dt_object_exists(const struct dt_object *dt)
+{
+ return lu_object_exists(&dt->do_lu);
+}
+
+static inline int dt_object_remote(const struct dt_object *dt)
+{
+ return lu_object_remote(&dt->do_lu);
+}
+
+static inline struct dt_object *lu2dt_obj(struct lu_object *o)
+{
+ LASSERT(ergo(o != NULL, lu_device_is_dt(o->lo_dev)));
+ return container_of0(o, struct dt_object, do_lu);
+}
+
+/**
+ * This is the general purpose transaction handle.
+ * 1. Transaction Life Cycle
+ * This transaction handle is allocated upon starting a new transaction,
+ * and deallocated after this transaction is committed.
+ * 2. Transaction Nesting
+ *    We do _NOT_ support nested transactions. So, every thread should have
+ *    at most one active transaction, and a transaction only belongs to one
+ *    thread. Because of this, the transaction handle needs no reference count.
+ * 3. Transaction & dt_object locking
+ * dt_object locks should be taken inside transaction.
+ * 4. Transaction & RPC
+ * No RPC request should be issued inside transaction.
+ */
+struct thandle {
+ /** the dt device on which the transactions are executed */
+ struct dt_device *th_dev;
+
+ /** context for this transaction, tag is LCT_TX_HANDLE */
+ struct lu_context th_ctx;
+
+ /** additional tags (layers can add in declare) */
+ __u32 th_tags;
+
+ /** the last operation result in this transaction.
+ * this value is used in recovery */
+ __s32 th_result;
+
+ /** whether we need sync commit */
+ unsigned int th_sync:1;
+
+	/* local transaction, no need to inform other layers */
+ unsigned int th_local:1;
+
+	/* In DNE, one transaction can be disassembled into
+	 * updates on several different MDTs, and these updates
+	 * will be attached to th_remote_update_list per target.
+	 * Only a single thread accesses the list, so no lock is needed.
+	 */
+ struct list_head th_remote_update_list;
+ struct update_request *th_current_request;
+};
+
+/**
+ * Transaction call-backs.
+ *
+ * These are invoked by osd (or underlying transaction engine) when
+ * transaction changes state.
+ *
+ * Call-backs are used by upper layers to modify transaction parameters and to
+ * perform some actions for each transaction state transition. A typical
+ * example is mdt registering a call-back to write into the last-received file
+ * before each transaction commit.
+ */
+struct dt_txn_callback {
+ int (*dtc_txn_start)(const struct lu_env *env,
+ struct thandle *txn, void *cookie);
+ int (*dtc_txn_stop)(const struct lu_env *env,
+ struct thandle *txn, void *cookie);
+ void (*dtc_txn_commit)(struct thandle *txn, void *cookie);
+ void *dtc_cookie;
+ __u32 dtc_tag;
+ struct list_head dtc_linkage;
+};
+
+void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb);
+void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb);
+
+int dt_txn_hook_start(const struct lu_env *env,
+ struct dt_device *dev, struct thandle *txn);
+int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn);
+void dt_txn_hook_commit(struct thandle *txn);
+
+int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj);
+
+/**
+ * Callback function used for parsing path.
+ * \see llo_store_resolve
+ */
+typedef int (*dt_entry_func_t)(const struct lu_env *env,
+ const char *name,
+ void *pvt);
+
+#define DT_MAX_PATH 1024
+
+int dt_path_parser(const struct lu_env *env,
+ char *local, dt_entry_func_t entry_func,
+ void *data);
+
+struct dt_object *
+dt_store_resolve(const struct lu_env *env, struct dt_device *dt,
+ const char *path, struct lu_fid *fid);
+
+struct dt_object *dt_store_open(const struct lu_env *env,
+ struct dt_device *dt,
+ const char *dirname,
+ const char *filename,
+ struct lu_fid *fid);
+
+struct dt_object *dt_find_or_create(const struct lu_env *env,
+ struct dt_device *dt,
+ const struct lu_fid *fid,
+ struct dt_object_format *dof,
+ struct lu_attr *attr);
+
+struct dt_object *dt_locate_at(const struct lu_env *env,
+ struct dt_device *dev,
+ const struct lu_fid *fid,
+ struct lu_device *top_dev);
+static inline struct dt_object *
+dt_locate(const struct lu_env *env, struct dt_device *dev,
+ const struct lu_fid *fid)
+{
+ return dt_locate_at(env, dev, fid, dev->dd_lu_dev.ld_site->ls_top_dev);
+}
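+
+/*
+ * Usage sketch (illustrative; "dev", "fid" and the attr buffer are
+ * assumptions): resolve a FID to a dt_object, check on-disk existence,
+ * then drop the reference via the embedded lu_object.
+ *
+ *	struct dt_object *obj = dt_locate(env, dev, fid);
+ *
+ *	if (IS_ERR(obj))
+ *		return PTR_ERR(obj);
+ *	if (dt_object_exists(obj))
+ *		rc = dt_attr_get(env, obj, &attr, NULL);
+ *	lu_object_put(env, &obj->do_lu);
+ */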
+
+
+int local_oid_storage_init(const struct lu_env *env, struct dt_device *dev,
+ const struct lu_fid *first_fid,
+ struct local_oid_storage **los);
+void local_oid_storage_fini(const struct lu_env *env,
+ struct local_oid_storage *los);
+int local_object_fid_generate(const struct lu_env *env,
+ struct local_oid_storage *los,
+ struct lu_fid *fid);
+int local_object_declare_create(const struct lu_env *env,
+ struct local_oid_storage *los,
+ struct dt_object *o,
+ struct lu_attr *attr,
+ struct dt_object_format *dof,
+ struct thandle *th);
+int local_object_create(const struct lu_env *env,
+ struct local_oid_storage *los,
+ struct dt_object *o,
+ struct lu_attr *attr, struct dt_object_format *dof,
+ struct thandle *th);
+struct dt_object *local_file_find_or_create(const struct lu_env *env,
+ struct local_oid_storage *los,
+ struct dt_object *parent,
+ const char *name, __u32 mode);
+struct dt_object *local_file_find_or_create_with_fid(const struct lu_env *env,
+ struct dt_device *dt,
+ const struct lu_fid *fid,
+ struct dt_object *parent,
+ const char *name,
+ __u32 mode);
+struct dt_object *
+local_index_find_or_create(const struct lu_env *env,
+ struct local_oid_storage *los,
+ struct dt_object *parent,
+ const char *name, __u32 mode,
+ const struct dt_index_features *ft);
+struct dt_object *
+local_index_find_or_create_with_fid(const struct lu_env *env,
+ struct dt_device *dt,
+ const struct lu_fid *fid,
+ struct dt_object *parent,
+ const char *name, __u32 mode,
+ const struct dt_index_features *ft);
+int local_object_unlink(const struct lu_env *env, struct dt_device *dt,
+ struct dt_object *parent, const char *name);
+
+static inline int dt_object_lock(const struct lu_env *env,
+ struct dt_object *o, struct lustre_handle *lh,
+ struct ldlm_enqueue_info *einfo,
+ void *policy)
+{
+ LASSERT(o);
+ LASSERT(o->do_ops);
+ LASSERT(o->do_ops->do_object_lock);
+ return o->do_ops->do_object_lock(env, o, lh, einfo, policy);
+}
+
+int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir,
+ const char *name, struct lu_fid *fid);
+
+static inline int dt_object_sync(const struct lu_env *env,
+ struct dt_object *o)
+{
+ LASSERT(o);
+ LASSERT(o->do_ops);
+ LASSERT(o->do_ops->do_object_sync);
+ return o->do_ops->do_object_sync(env, o);
+}
+
+int dt_declare_version_set(const struct lu_env *env, struct dt_object *o,
+ struct thandle *th);
+void dt_version_set(const struct lu_env *env, struct dt_object *o,
+ dt_obj_version_t version, struct thandle *th);
+dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o);
+
+
+int dt_read(const struct lu_env *env, struct dt_object *dt,
+ struct lu_buf *buf, loff_t *pos);
+int dt_record_read(const struct lu_env *env, struct dt_object *dt,
+ struct lu_buf *buf, loff_t *pos);
+int dt_record_write(const struct lu_env *env, struct dt_object *dt,
+ const struct lu_buf *buf, loff_t *pos, struct thandle *th);
+typedef int (*dt_index_page_build_t)(const struct lu_env *env,
+ union lu_page *lp, int nob,
+ const struct dt_it_ops *iops,
+ struct dt_it *it, __u32 attr, void *arg);
+int dt_index_walk(const struct lu_env *env, struct dt_object *obj,
+ const struct lu_rdpg *rdpg, dt_index_page_build_t filler,
+ void *arg);
+int dt_index_read(const struct lu_env *env, struct dt_device *dev,
+ struct idx_info *ii, const struct lu_rdpg *rdpg);
+
+static inline struct thandle *dt_trans_create(const struct lu_env *env,
+ struct dt_device *d)
+{
+ LASSERT(d->dd_ops->dt_trans_create);
+ return d->dd_ops->dt_trans_create(env, d);
+}
+
+static inline int dt_trans_start(const struct lu_env *env,
+ struct dt_device *d, struct thandle *th)
+{
+ LASSERT(d->dd_ops->dt_trans_start);
+ return d->dd_ops->dt_trans_start(env, d, th);
+}
+
+/* for this transaction hooks shouldn't be called */
+static inline int dt_trans_start_local(const struct lu_env *env,
+ struct dt_device *d, struct thandle *th)
+{
+ LASSERT(d->dd_ops->dt_trans_start);
+ th->th_local = 1;
+ return d->dd_ops->dt_trans_start(env, d, th);
+}
+
+static inline int dt_trans_stop(const struct lu_env *env,
+ struct dt_device *d, struct thandle *th)
+{
+ LASSERT(d->dd_ops->dt_trans_stop);
+ return d->dd_ops->dt_trans_stop(env, th);
+}
+
+static inline int dt_trans_cb_add(struct thandle *th,
+ struct dt_txn_commit_cb *dcb)
+{
+ LASSERT(th->th_dev->dd_ops->dt_trans_cb_add);
+ dcb->dcb_magic = TRANS_COMMIT_CB_MAGIC;
+ return th->th_dev->dd_ops->dt_trans_cb_add(th, dcb);
+}
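+
+/*
+ * Typical transaction cycle (a sketch; "dev", "obj" and "attr" are
+ * assumptions and error handling is trimmed): create the handle,
+ * declare every intended change, start, apply the change, stop.
+ *
+ *	struct thandle *th = dt_trans_create(env, dev);
+ *
+ *	if (IS_ERR(th))
+ *		return PTR_ERR(th);
+ *	rc = dt_declare_attr_set(env, obj, &attr, th);
+ *	if (rc == 0)
+ *		rc = dt_trans_start(env, dev, th);
+ *	if (rc == 0)
+ *		rc = dt_attr_set(env, obj, &attr, th, NULL);
+ *	dt_trans_stop(env, dev, th);
+ */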
+/** @} dt */
+
+
+static inline int dt_declare_record_write(const struct lu_env *env,
+ struct dt_object *dt,
+ int size, loff_t pos,
+ struct thandle *th)
+{
+ int rc;
+
+ LASSERTF(dt != NULL, "dt is NULL when we want to write record\n");
+ LASSERT(th != NULL);
+ LASSERT(dt->do_body_ops);
+ LASSERT(dt->do_body_ops->dbo_declare_write);
+ rc = dt->do_body_ops->dbo_declare_write(env, dt, size, pos, th);
+ return rc;
+}
+
+static inline int dt_declare_create(const struct lu_env *env,
+ struct dt_object *dt,
+ struct lu_attr *attr,
+ struct dt_allocation_hint *hint,
+ struct dt_object_format *dof,
+ struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_declare_create);
+ return dt->do_ops->do_declare_create(env, dt, attr, hint, dof, th);
+}
+
+static inline int dt_create(const struct lu_env *env,
+ struct dt_object *dt,
+ struct lu_attr *attr,
+ struct dt_allocation_hint *hint,
+ struct dt_object_format *dof,
+ struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_create);
+ return dt->do_ops->do_create(env, dt, attr, hint, dof, th);
+}
+
+static inline int dt_declare_destroy(const struct lu_env *env,
+ struct dt_object *dt,
+ struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_declare_destroy);
+ return dt->do_ops->do_declare_destroy(env, dt, th);
+}
+
+static inline int dt_destroy(const struct lu_env *env,
+ struct dt_object *dt,
+ struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_destroy);
+ return dt->do_ops->do_destroy(env, dt, th);
+}
+
+static inline void dt_read_lock(const struct lu_env *env,
+ struct dt_object *dt,
+ unsigned role)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_read_lock);
+ dt->do_ops->do_read_lock(env, dt, role);
+}
+
+static inline void dt_write_lock(const struct lu_env *env,
+ struct dt_object *dt,
+ unsigned role)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_write_lock);
+ dt->do_ops->do_write_lock(env, dt, role);
+}
+
+static inline void dt_read_unlock(const struct lu_env *env,
+ struct dt_object *dt)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_read_unlock);
+ dt->do_ops->do_read_unlock(env, dt);
+}
+
+static inline void dt_write_unlock(const struct lu_env *env,
+ struct dt_object *dt)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_write_unlock);
+ dt->do_ops->do_write_unlock(env, dt);
+}
+
+static inline int dt_write_locked(const struct lu_env *env,
+ struct dt_object *dt)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_write_locked);
+ return dt->do_ops->do_write_locked(env, dt);
+}
+
+static inline int dt_attr_get(const struct lu_env *env, struct dt_object *dt,
+ struct lu_attr *la, void *arg)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_attr_get);
+ return dt->do_ops->do_attr_get(env, dt, la, arg);
+}
+
+static inline int dt_declare_attr_set(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct lu_attr *la,
+ struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_declare_attr_set);
+ return dt->do_ops->do_declare_attr_set(env, dt, la, th);
+}
+
+static inline int dt_attr_set(const struct lu_env *env, struct dt_object *dt,
+ const struct lu_attr *la, struct thandle *th,
+ struct lustre_capa *capa)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_attr_set);
+ return dt->do_ops->do_attr_set(env, dt, la, th, capa);
+}
+
+static inline int dt_declare_ref_add(const struct lu_env *env,
+ struct dt_object *dt, struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_declare_ref_add);
+ return dt->do_ops->do_declare_ref_add(env, dt, th);
+}
+
+static inline int dt_ref_add(const struct lu_env *env,
+ struct dt_object *dt, struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_ref_add);
+ return dt->do_ops->do_ref_add(env, dt, th);
+}
+
+static inline int dt_declare_ref_del(const struct lu_env *env,
+ struct dt_object *dt, struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_declare_ref_del);
+ return dt->do_ops->do_declare_ref_del(env, dt, th);
+}
+
+static inline int dt_ref_del(const struct lu_env *env,
+ struct dt_object *dt, struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_ref_del);
+ return dt->do_ops->do_ref_del(env, dt, th);
+}
+
+static inline struct obd_capa *dt_capa_get(const struct lu_env *env,
+ struct dt_object *dt,
+ struct lustre_capa *old, __u64 opc)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+	LASSERT(dt->do_ops->do_capa_get);
+ return dt->do_ops->do_capa_get(env, dt, old, opc);
+}
+
+static inline int dt_bufs_get(const struct lu_env *env, struct dt_object *d,
+ struct niobuf_remote *rnb,
+ struct niobuf_local *lnb, int rw,
+ struct lustre_capa *capa)
+{
+ LASSERT(d);
+ LASSERT(d->do_body_ops);
+ LASSERT(d->do_body_ops->dbo_bufs_get);
+ return d->do_body_ops->dbo_bufs_get(env, d, rnb->offset,
+ rnb->len, lnb, rw, capa);
+}
+
+static inline int dt_bufs_put(const struct lu_env *env, struct dt_object *d,
+ struct niobuf_local *lnb, int n)
+{
+ LASSERT(d);
+ LASSERT(d->do_body_ops);
+ LASSERT(d->do_body_ops->dbo_bufs_put);
+ return d->do_body_ops->dbo_bufs_put(env, d, lnb, n);
+}
+
+static inline int dt_write_prep(const struct lu_env *env, struct dt_object *d,
+ struct niobuf_local *lnb, int n)
+{
+ LASSERT(d);
+ LASSERT(d->do_body_ops);
+ LASSERT(d->do_body_ops->dbo_write_prep);
+ return d->do_body_ops->dbo_write_prep(env, d, lnb, n);
+}
+
+static inline int dt_declare_write_commit(const struct lu_env *env,
+ struct dt_object *d,
+ struct niobuf_local *lnb,
+ int n, struct thandle *th)
+{
+ LASSERTF(d != NULL, "dt is NULL when we want to declare write\n");
+ LASSERT(th != NULL);
+ return d->do_body_ops->dbo_declare_write_commit(env, d, lnb, n, th);
+}
+
+
+static inline int dt_write_commit(const struct lu_env *env,
+ struct dt_object *d, struct niobuf_local *lnb,
+ int n, struct thandle *th)
+{
+ LASSERT(d);
+ LASSERT(d->do_body_ops);
+ LASSERT(d->do_body_ops->dbo_write_commit);
+ return d->do_body_ops->dbo_write_commit(env, d, lnb, n, th);
+}
+
+static inline int dt_read_prep(const struct lu_env *env, struct dt_object *d,
+ struct niobuf_local *lnb, int n)
+{
+ LASSERT(d);
+ LASSERT(d->do_body_ops);
+ LASSERT(d->do_body_ops->dbo_read_prep);
+ return d->do_body_ops->dbo_read_prep(env, d, lnb, n);
+}
+
+static inline int dt_declare_punch(const struct lu_env *env,
+ struct dt_object *dt, __u64 start,
+ __u64 end, struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_body_ops);
+ LASSERT(dt->do_body_ops->dbo_declare_punch);
+ return dt->do_body_ops->dbo_declare_punch(env, dt, start, end, th);
+}
+
+static inline int dt_punch(const struct lu_env *env, struct dt_object *dt,
+ __u64 start, __u64 end, struct thandle *th,
+ struct lustre_capa *capa)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_body_ops);
+ LASSERT(dt->do_body_ops->dbo_punch);
+ return dt->do_body_ops->dbo_punch(env, dt, start, end, th, capa);
+}
+
+static inline int dt_fiemap_get(const struct lu_env *env, struct dt_object *d,
+ struct ll_user_fiemap *fm)
+{
+ LASSERT(d);
+ if (d->do_body_ops == NULL)
+ return -EPROTO;
+ if (d->do_body_ops->dbo_fiemap_get == NULL)
+ return -EOPNOTSUPP;
+ return d->do_body_ops->dbo_fiemap_get(env, d, fm);
+}
+
+static inline int dt_statfs(const struct lu_env *env, struct dt_device *dev,
+ struct obd_statfs *osfs)
+{
+ LASSERT(dev);
+ LASSERT(dev->dd_ops);
+ LASSERT(dev->dd_ops->dt_statfs);
+ return dev->dd_ops->dt_statfs(env, dev, osfs);
+}
+
+static inline int dt_root_get(const struct lu_env *env, struct dt_device *dev,
+ struct lu_fid *f)
+{
+ LASSERT(dev);
+ LASSERT(dev->dd_ops);
+ LASSERT(dev->dd_ops->dt_root_get);
+ return dev->dd_ops->dt_root_get(env, dev, f);
+}
+
+static inline void dt_conf_get(const struct lu_env *env,
+ const struct dt_device *dev,
+ struct dt_device_param *param)
+{
+ LASSERT(dev);
+ LASSERT(dev->dd_ops);
+ LASSERT(dev->dd_ops->dt_conf_get);
+ return dev->dd_ops->dt_conf_get(env, dev, param);
+}
+
+static inline int dt_sync(const struct lu_env *env, struct dt_device *dev)
+{
+ LASSERT(dev);
+ LASSERT(dev->dd_ops);
+ LASSERT(dev->dd_ops->dt_sync);
+ return dev->dd_ops->dt_sync(env, dev);
+}
+
+static inline int dt_ro(const struct lu_env *env, struct dt_device *dev)
+{
+ LASSERT(dev);
+ LASSERT(dev->dd_ops);
+ LASSERT(dev->dd_ops->dt_ro);
+ return dev->dd_ops->dt_ro(env, dev);
+}
+
+static inline int dt_declare_insert(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct dt_rec *rec,
+ const struct dt_key *key,
+ struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_index_ops);
+ LASSERT(dt->do_index_ops->dio_declare_insert);
+ return dt->do_index_ops->dio_declare_insert(env, dt, rec, key, th);
+}
+
+static inline int dt_insert(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct dt_rec *rec,
+ const struct dt_key *key,
+ struct thandle *th,
+ struct lustre_capa *capa,
+ int noquota)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_index_ops);
+ LASSERT(dt->do_index_ops->dio_insert);
+ return dt->do_index_ops->dio_insert(env, dt, rec, key, th,
+ capa, noquota);
+}
+
+static inline int dt_declare_xattr_del(const struct lu_env *env,
+ struct dt_object *dt,
+ const char *name,
+ struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_declare_xattr_del);
+ return dt->do_ops->do_declare_xattr_del(env, dt, name, th);
+}
+
+static inline int dt_xattr_del(const struct lu_env *env,
+ struct dt_object *dt, const char *name,
+ struct thandle *th,
+ struct lustre_capa *capa)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_xattr_del);
+ return dt->do_ops->do_xattr_del(env, dt, name, th, capa);
+}
+
+static inline int dt_declare_xattr_set(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct lu_buf *buf,
+ const char *name, int fl,
+ struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_declare_xattr_set);
+ return dt->do_ops->do_declare_xattr_set(env, dt, buf, name, fl, th);
+}
+
+static inline int dt_xattr_set(const struct lu_env *env,
+ struct dt_object *dt, const struct lu_buf *buf,
+ const char *name, int fl, struct thandle *th,
+ struct lustre_capa *capa)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_xattr_set);
+ return dt->do_ops->do_xattr_set(env, dt, buf, name, fl, th, capa);
+}
+
+static inline int dt_xattr_get(const struct lu_env *env,
+ struct dt_object *dt, struct lu_buf *buf,
+ const char *name, struct lustre_capa *capa)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_xattr_get);
+ return dt->do_ops->do_xattr_get(env, dt, buf, name, capa);
+}
+
+static inline int dt_xattr_list(const struct lu_env *env,
+ struct dt_object *dt, struct lu_buf *buf,
+ struct lustre_capa *capa)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_xattr_list);
+ return dt->do_ops->do_xattr_list(env, dt, buf, capa);
+}
+
+static inline int dt_declare_delete(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct dt_key *key,
+ struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_index_ops);
+ LASSERT(dt->do_index_ops->dio_declare_delete);
+ return dt->do_index_ops->dio_declare_delete(env, dt, key, th);
+}
+
+static inline int dt_delete(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct dt_key *key,
+ struct thandle *th,
+ struct lustre_capa *capa)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_index_ops);
+ LASSERT(dt->do_index_ops->dio_delete);
+ return dt->do_index_ops->dio_delete(env, dt, key, th, capa);
+}
+
+static inline int dt_commit_async(const struct lu_env *env,
+ struct dt_device *dev)
+{
+ LASSERT(dev);
+ LASSERT(dev->dd_ops);
+ LASSERT(dev->dd_ops->dt_commit_async);
+ return dev->dd_ops->dt_commit_async(env, dev);
+}
+
+static inline int dt_init_capa_ctxt(const struct lu_env *env,
+ struct dt_device *dev,
+ int mode, unsigned long timeout,
+ __u32 alg, struct lustre_capa_key *keys)
+{
+ LASSERT(dev);
+ LASSERT(dev->dd_ops);
+ LASSERT(dev->dd_ops->dt_init_capa_ctxt);
+ return dev->dd_ops->dt_init_capa_ctxt(env, dev, mode,
+ timeout, alg, keys);
+}
+
+static inline int dt_lookup(const struct lu_env *env,
+ struct dt_object *dt,
+ struct dt_rec *rec,
+ const struct dt_key *key,
+ struct lustre_capa *capa)
+{
+ int ret;
+
+ LASSERT(dt);
+ LASSERT(dt->do_index_ops);
+ LASSERT(dt->do_index_ops->dio_lookup);
+
+ ret = dt->do_index_ops->dio_lookup(env, dt, rec, key, capa);
+ if (ret > 0)
+ ret = 0;
+ else if (ret == 0)
+ ret = -ENOENT;
+ return ret;
+}
+
+#define LU221_BAD_TIME (0x80000000U + 24 * 3600)
+
+struct dt_find_hint {
+ struct lu_fid *dfh_fid;
+ struct dt_device *dfh_dt;
+ struct dt_object *dfh_o;
+};
+
+struct dt_thread_info {
+ char dti_buf[DT_MAX_PATH];
+ struct dt_find_hint dti_dfh;
+ struct lu_attr dti_attr;
+ struct lu_fid dti_fid;
+ struct dt_object_format dti_dof;
+ struct lustre_mdt_attrs dti_lma;
+ struct lu_buf dti_lb;
+ loff_t dti_off;
+};
+
+extern struct lu_context_key dt_key;
+
+static inline struct dt_thread_info *dt_info(const struct lu_env *env)
+{
+ struct dt_thread_info *dti;
+
+ dti = lu_context_key_get(&env->le_ctx, &dt_key);
+ LASSERT(dti);
+ return dti;
+}
+
+int dt_global_init(void);
+void dt_global_fini(void);
+
+# ifdef LPROCFS
+int lprocfs_dt_rd_blksize(char *page, char **start, off_t off,
+ int count, int *eof, void *data);
+int lprocfs_dt_rd_kbytestotal(char *page, char **start, off_t off,
+ int count, int *eof, void *data);
+int lprocfs_dt_rd_kbytesfree(char *page, char **start, off_t off,
+ int count, int *eof, void *data);
+int lprocfs_dt_rd_kbytesavail(char *page, char **start, off_t off,
+ int count, int *eof, void *data);
+int lprocfs_dt_rd_filestotal(char *page, char **start, off_t off,
+ int count, int *eof, void *data);
+int lprocfs_dt_rd_filesfree(char *page, char **start, off_t off,
+ int count, int *eof, void *data);
+# endif /* LPROCFS */
+
+#endif /* __LUSTRE_DT_OBJECT_H */
diff --git a/drivers/staging/lustre/lustre/include/interval_tree.h b/drivers/staging/lustre/lustre/include/interval_tree.h
new file mode 100644
index 000000000000..dfdb8aa4e035
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/interval_tree.h
@@ -0,0 +1,124 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/interval_tree.h
+ *
+ * Author: Huang Wei <huangwei@clusterfs.com>
+ * Author: Jay Xiong <jinshan.xiong@sun.com>
+ */
+
+#ifndef _INTERVAL_H__
+#define _INTERVAL_H__
+
+#include <linux/libcfs/libcfs.h> /* LASSERT. */
+
+struct interval_node {
+ struct interval_node *in_left;
+ struct interval_node *in_right;
+ struct interval_node *in_parent;
+ unsigned in_color:1,
+		 in_intree:1, /** set if the node is in the tree */
+ in_res1:30;
+ __u8 in_res2[4]; /** tags, 8-bytes aligned */
+ __u64 in_max_high;
+ struct interval_node_extent {
+ __u64 start;
+ __u64 end;
+ } in_extent;
+};
+
+enum interval_iter {
+ INTERVAL_ITER_CONT = 1,
+ INTERVAL_ITER_STOP = 2
+};
+
+static inline int interval_is_intree(struct interval_node *node)
+{
+ return node->in_intree == 1;
+}
+
+static inline __u64 interval_low(struct interval_node *node)
+{
+ return node->in_extent.start;
+}
+
+static inline __u64 interval_high(struct interval_node *node)
+{
+ return node->in_extent.end;
+}
+
+static inline void interval_set(struct interval_node *node,
+ __u64 start, __u64 end)
+{
+ LASSERT(start <= end);
+ node->in_extent.start = start;
+ node->in_extent.end = end;
+ node->in_max_high = end;
+}
+
+/* Rules to write an interval callback.
+ * - the callback returns INTERVAL_ITER_STOP when it thinks the iteration
+ * should be stopped. It will then cause the iteration function to return
+ * immediately with return value INTERVAL_ITER_STOP.
+ * - callbacks for interval_iterate and interval_iterate_reverse: every
+ *   node in the tree will be passed as @node before the callback is called.
+ * - callback for interval_search: only overlapping nodes will be passed as
+ *   @node before the callback is called.
+ */
+typedef enum interval_iter (*interval_callback_t)(struct interval_node *node,
+ void *args);
+
+struct interval_node *interval_insert(struct interval_node *node,
+ struct interval_node **root);
+void interval_erase(struct interval_node *node, struct interval_node **root);
+
+/* Search the extents in the tree and call @func for each overlapped
+ * extents. */
+enum interval_iter interval_search(struct interval_node *root,
+ struct interval_node_extent *ex,
+ interval_callback_t func, void *data);
+
+/* Iterate every node in the tree - by reverse order or regular order. */
+enum interval_iter interval_iterate(struct interval_node *root,
+ interval_callback_t func, void *data);
+enum interval_iter interval_iterate_reverse(struct interval_node *root,
+				    interval_callback_t func, void *data);
+
+void interval_expand(struct interval_node *root,
+ struct interval_node_extent *ext,
+ struct interval_node_extent *limiter);
+int interval_is_overlapped(struct interval_node *root,
+ struct interval_node_extent *ex);
+struct interval_node *interval_find(struct interval_node *root,
+ struct interval_node_extent *ex);
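+
+/*
+ * Usage sketch (illustrative): insert one interval and search for the
+ * first overlap. The callback follows the rules described above and
+ * stops the iteration at the first overlapping node.
+ *
+ *	static enum interval_iter first_overlap_cb(struct interval_node *n,
+ *						   void *args)
+ *	{
+ *		*(struct interval_node **)args = n;
+ *		return INTERVAL_ITER_STOP;
+ *	}
+ *
+ *	struct interval_node *root = NULL, *found = NULL;
+ *	struct interval_node node = { 0 };
+ *	struct interval_node_extent ex = { .start = 0, .end = 4095 };
+ *
+ *	interval_set(&node, 0, 1023);
+ *	interval_insert(&node, &root);
+ *	interval_search(root, &ex, first_overlap_cb, &found);
+ */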
+#endif
diff --git a/drivers/staging/lustre/lustre/include/ioctl.h b/drivers/staging/lustre/lustre/include/ioctl.h
new file mode 100644
index 000000000000..227c261b2ae9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/ioctl.h
@@ -0,0 +1,106 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _IOWR
+
+/* On i386 and x86_64, _ASM_I386_IOCTL_H is defined by the kernel's ioctl.h,
+ * and on newer kernels this header is shared as _ASM_GENERIC_IOCTL_H.
+ *
+ * We can avoid any problems with the kernel header being included again by
+ * defining _ASM_I386_IOCTL_H here so that a later occurrence of <asm/ioctl.h>
+ * does not include the kernel's ioctl.h after this one. b=14746 */
+#define _ASM_I386_IOCTL_H
+#define _ASM_GENERIC_IOCTL_H
+
+/* ioctl command encoding: 32 bits total, command in lower 16 bits,
+ * size of the parameter structure in the lower 14 bits of the
+ * upper 16 bits.
+ * Encoding the size of the parameter structure in the ioctl request
+ * is useful for catching programs compiled with old versions and to
+ * avoid overwriting user space outside the user buffer area.
+ * The highest 2 bits are reserved for indicating the ``access mode''.
+ * NOTE: This limits the max parameter size to 16kB - 1!
+ */
+
+/*
+ * The following is for compatibility across the various Linux
+ * platforms. The i386 ioctl numbering scheme doesn't really enforce
+ * a type field. De facto, however, the top 8 bits of the lower 16
+ * bits are indeed used as a type field, so we might just as well make
+ * this explicit here. Please be sure to use the decoding macros
+ * below from now on.
+ */
+#define _IOC_NRBITS 8
+#define _IOC_TYPEBITS 8
+#define _IOC_SIZEBITS 14
+#define _IOC_DIRBITS 2
+
+#define _IOC_NRMASK ((1 << _IOC_NRBITS)-1)
+#define _IOC_TYPEMASK ((1 << _IOC_TYPEBITS)-1)
+#define _IOC_SIZEMASK ((1 << _IOC_SIZEBITS)-1)
+#define _IOC_DIRMASK ((1 << _IOC_DIRBITS)-1)
+
+#define _IOC_NRSHIFT 0
+#define _IOC_TYPESHIFT (_IOC_NRSHIFT+_IOC_NRBITS)
+#define _IOC_SIZESHIFT (_IOC_TYPESHIFT+_IOC_TYPEBITS)
+#define _IOC_DIRSHIFT (_IOC_SIZESHIFT+_IOC_SIZEBITS)
+
+/*
+ * Direction bits.
+ */
+#define _IOC_NONE 0U
+#define _IOC_WRITE 1U
+#define _IOC_READ 2U
+
+#define _IOC(dir,type,nr,size) (((dir) << _IOC_DIRSHIFT) | ((type) << _IOC_TYPESHIFT) | ((nr) << _IOC_NRSHIFT) | ((size) << _IOC_SIZESHIFT))
+
+/* used to create numbers */
+#define _IO(type,nr) _IOC(_IOC_NONE,(type),(nr),0)
+#define _IOR(type,nr,size) _IOC(_IOC_READ,(type),(nr),sizeof(size))
+#define _IOW(type,nr,size) _IOC(_IOC_WRITE,(type),(nr),sizeof(size))
+#define _IOWR(type,nr,size) _IOC(_IOC_READ|_IOC_WRITE,(type),(nr),sizeof(size))
+
+/* used to decode ioctl numbers.. */
+#define _IOC_DIR(nr) (((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK)
+#define _IOC_TYPE(nr) (((nr) >> _IOC_TYPESHIFT) & _IOC_TYPEMASK)
+#define _IOC_NR(nr) (((nr) >> _IOC_NRSHIFT) & _IOC_NRMASK)
+#define _IOC_SIZE(nr) (((nr) >> _IOC_SIZESHIFT) & _IOC_SIZEMASK)
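+
+/*
+ * Example (sketch): defining and decoding an ioctl number with the
+ * macros above; the 'f' type letter and the argument struct are
+ * illustrative only.
+ *
+ *	struct my_ioctl_arg { __u32 flags; };
+ *	#define MY_IOC_GET	_IOR('f', 1, struct my_ioctl_arg)
+ *
+ *	// _IOC_TYPE(MY_IOC_GET) == 'f', _IOC_NR(MY_IOC_GET) == 1,
+ *	// _IOC_SIZE(MY_IOC_GET) == sizeof(struct my_ioctl_arg)
+ */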
+
+/* ...and for the drivers/sound files... */
+
+#define IOC_IN (_IOC_WRITE << _IOC_DIRSHIFT)
+#define IOC_OUT (_IOC_READ << _IOC_DIRSHIFT)
+#define IOC_INOUT ((_IOC_WRITE|_IOC_READ) << _IOC_DIRSHIFT)
+#define IOCSIZE_MASK (_IOC_SIZEMASK << _IOC_SIZESHIFT)
+#define IOCSIZE_SHIFT (_IOC_SIZESHIFT)
+
+#endif /* _IOWR */
diff --git a/drivers/staging/lustre/lustre/include/lclient.h b/drivers/staging/lustre/lustre/include/lclient.h
new file mode 100644
index 000000000000..9d4011f2908b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lclient.h
@@ -0,0 +1,437 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Definitions shared between vvp and liblustre, and other clients in the
+ * future.
+ *
+ * Author: Oleg Drokin <oleg.drokin@sun.com>
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#ifndef LCLIENT_H
+#define LCLIENT_H
+
+blkcnt_t dirty_cnt(struct inode *inode);
+
+int cl_glimpse_size0(struct inode *inode, int agl);
+int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io,
+ struct inode *inode, struct cl_object *clob, int agl);
+
+static inline int cl_glimpse_size(struct inode *inode)
+{
+ return cl_glimpse_size0(inode, 0);
+}
+
+static inline int cl_agl(struct inode *inode)
+{
+ return cl_glimpse_size0(inode, 1);
+}
+
+/**
+ * Locking policy for setattr.
+ */
+enum ccc_setattr_lock_type {
+ /** Locking is done by server */
+ SETATTR_NOLOCK,
+ /** Extent lock is enqueued */
+ SETATTR_EXTENT_LOCK,
+ /** Existing local extent lock is used */
+ SETATTR_MATCH_LOCK
+};
+
+
+/**
+ * IO state private to vvp or slp layers.
+ */
+struct ccc_io {
+ /** super class */
+ struct cl_io_slice cui_cl;
+ struct cl_io_lock_link cui_link;
+ /**
+ * I/O vector describing the user buffers that this read/write
+ * transfers data to or from.
+ */
+ struct iovec *cui_iov;
+ unsigned long cui_nrsegs;
+ /**
+ * Number of iovec segments remaining in this IO.
+ */
+ unsigned long cui_tot_nrsegs;
+ /**
+ * Original length of an iovec that was partially truncated.
+ */
+ size_t cui_iov_olen;
+ /**
+ * Total number of bytes remaining in this IO.
+ */
+ size_t cui_tot_count;
+
+ union {
+ struct {
+ enum ccc_setattr_lock_type cui_local_lock;
+ } setattr;
+ } u;
+ /**
+ * True iff io is processing glimpse right now.
+ */
+ int cui_glimpse;
+ /**
+ * Layout version when this IO is initialized
+ */
+ __u32 cui_layout_gen;
+ /**
+ * File descriptor against which IO is done.
+ */
+ struct ll_file_data *cui_fd;
+ struct kiocb *cui_iocb;
+};
+
+/**
+ * True if \a io is a normal io, false for others (sendfile, splice*).
+ * Must be implemented in arch-specific code.
+ */
+int cl_is_normalio(const struct lu_env *env, const struct cl_io *io);
+
+extern struct lu_context_key ccc_key;
+extern struct lu_context_key ccc_session_key;
+
+struct ccc_thread_info {
+ struct cl_lock_descr cti_descr;
+ struct cl_io cti_io;
+ struct cl_attr cti_attr;
+};
+
+static inline struct ccc_thread_info *ccc_env_info(const struct lu_env *env)
+{
+ struct ccc_thread_info *info;
+
+ info = lu_context_key_get(&env->le_ctx, &ccc_key);
+ LASSERT(info != NULL);
+ return info;
+}
+
+static inline struct cl_attr *ccc_env_thread_attr(const struct lu_env *env)
+{
+ struct cl_attr *attr = &ccc_env_info(env)->cti_attr;
+ memset(attr, 0, sizeof(*attr));
+ return attr;
+}
+
+static inline struct cl_io *ccc_env_thread_io(const struct lu_env *env)
+{
+ struct cl_io *io = &ccc_env_info(env)->cti_io;
+ memset(io, 0, sizeof(*io));
+ return io;
+}
+
+struct ccc_session {
+ struct ccc_io cs_ios;
+};
+
+static inline struct ccc_session *ccc_env_session(const struct lu_env *env)
+{
+ struct ccc_session *ses;
+
+ ses = lu_context_key_get(env->le_ses, &ccc_session_key);
+ LASSERT(ses != NULL);
+ return ses;
+}
+
+static inline struct ccc_io *ccc_env_io(const struct lu_env *env)
+{
+ return &ccc_env_session(env)->cs_ios;
+}
+
+/**
+ * ccc-private object state.
+ */
+struct ccc_object {
+ struct cl_object_header cob_header;
+ struct cl_object cob_cl;
+ struct inode *cob_inode;
+
+ /**
+ * A list of dirty pages pending IO in the cache. Used by
+ * SOM. Protected by ll_inode_info::lli_lock.
+ *
+ * \see ccc_page::cpg_pending_linkage
+ */
+ struct list_head cob_pending_list;
+
+ /**
+ * Access to this counter is protected by inode->i_sem. Now that
+ * the lifetime of transient pages must be covered by the inode
+ * semaphore, we do not need to hold any additional lock.
+ */
+ int cob_transient_pages;
+ /**
+ * Number of outstanding mmaps on this file.
+ *
+ * \see ll_vm_open(), ll_vm_close().
+ */
+ atomic_t cob_mmap_cnt;
+
+ /**
+ * Various flags.
+ * cob_discard_page_warned:
+ * if pages belonging to this object are discarded when a client
+ * is evicted, some debug info is printed. This flag is set while
+ * processing the first discarded page, so that the debug log is
+ * not flooded with a message for every subsequent discarded page.
+ *
+ * \see ll_dirty_page_discard_warn.
+ */
+ unsigned int cob_discard_page_warned:1;
+};
+
+/**
+ * ccc-private page state.
+ */
+struct ccc_page {
+ struct cl_page_slice cpg_cl;
+ int cpg_defer_uptodate;
+ int cpg_ra_used;
+ int cpg_write_queued;
+ /**
+ * Non-empty iff this page is already counted in
+ * ccc_object::cob_pending_list. Protected by
+ * ccc_object::cob_pending_guard. This list is only used as a flag,
+ * that is, never iterated through, only checked for list_empty(), but
+ * having a list is useful for debugging.
+ */
+ struct list_head cpg_pending_linkage;
+ /** VM page */
+ struct page *cpg_page;
+};
+
+static inline struct ccc_page *cl2ccc_page(const struct cl_page_slice *slice)
+{
+ return container_of(slice, struct ccc_page, cpg_cl);
+}
+
+struct cl_page *ccc_vmpage_page_transient(struct page *vmpage);
+
+struct ccc_device {
+ struct cl_device cdv_cl;
+ struct super_block *cdv_sb;
+ struct cl_device *cdv_next;
+};
+
+struct ccc_lock {
+ struct cl_lock_slice clk_cl;
+};
+
+struct ccc_req {
+ struct cl_req_slice crq_cl;
+};
+
+void *ccc_key_init (const struct lu_context *ctx,
+ struct lu_context_key *key);
+void ccc_key_fini (const struct lu_context *ctx,
+ struct lu_context_key *key, void *data);
+void *ccc_session_key_init(const struct lu_context *ctx,
+ struct lu_context_key *key);
+void ccc_session_key_fini(const struct lu_context *ctx,
+ struct lu_context_key *key, void *data);
+
+int ccc_device_init (const struct lu_env *env,
+ struct lu_device *d,
+ const char *name, struct lu_device *next);
+struct lu_device *ccc_device_fini (const struct lu_env *env,
+ struct lu_device *d);
+struct lu_device *ccc_device_alloc(const struct lu_env *env,
+ struct lu_device_type *t,
+ struct lustre_cfg *cfg,
+ const struct lu_device_operations *luops,
+ const struct cl_device_operations *clops);
+struct lu_device *ccc_device_free (const struct lu_env *env,
+ struct lu_device *d);
+struct lu_object *ccc_object_alloc(const struct lu_env *env,
+ const struct lu_object_header *hdr,
+ struct lu_device *dev,
+ const struct cl_object_operations *clops,
+ const struct lu_object_operations *luops);
+
+int ccc_req_init(const struct lu_env *env, struct cl_device *dev,
+ struct cl_req *req);
+void ccc_umount(const struct lu_env *env, struct cl_device *dev);
+int ccc_global_init(struct lu_device_type *device_type);
+void ccc_global_fini(struct lu_device_type *device_type);
+int ccc_object_init0(const struct lu_env *env, struct ccc_object *vob,
+ const struct cl_object_conf *conf);
+int ccc_object_init(const struct lu_env *env, struct lu_object *obj,
+ const struct lu_object_conf *conf);
+void ccc_object_free(const struct lu_env *env, struct lu_object *obj);
+int ccc_lock_init(const struct lu_env *env, struct cl_object *obj,
+ struct cl_lock *lock, const struct cl_io *io,
+ const struct cl_lock_operations *lkops);
+int ccc_attr_set(const struct lu_env *env, struct cl_object *obj,
+ const struct cl_attr *attr, unsigned valid);
+int ccc_object_glimpse(const struct lu_env *env,
+ const struct cl_object *obj, struct ost_lvb *lvb);
+int ccc_conf_set(const struct lu_env *env, struct cl_object *obj,
+ const struct cl_object_conf *conf);
+struct page *ccc_page_vmpage(const struct lu_env *env,
+ const struct cl_page_slice *slice);
+int ccc_page_is_under_lock(const struct lu_env *env,
+ const struct cl_page_slice *slice, struct cl_io *io);
+int ccc_fail(const struct lu_env *env, const struct cl_page_slice *slice);
+void ccc_transient_page_verify(const struct cl_page *page);
+int ccc_transient_page_own(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io, int nonblock);
+void ccc_transient_page_assume(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io);
+void ccc_transient_page_unassume(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io);
+void ccc_transient_page_disown(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io);
+void ccc_transient_page_discard(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io);
+int ccc_transient_page_prep(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io);
+void ccc_lock_delete(const struct lu_env *env,
+ const struct cl_lock_slice *slice);
+void ccc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice);
+int ccc_lock_enqueue(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ struct cl_io *io, __u32 enqflags);
+int ccc_lock_unuse(const struct lu_env *env, const struct cl_lock_slice *slice);
+int ccc_lock_wait(const struct lu_env *env, const struct cl_lock_slice *slice);
+int ccc_lock_fits_into(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ const struct cl_lock_descr *need,
+ const struct cl_io *io);
+void ccc_lock_state(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ enum cl_lock_state state);
+
+void ccc_io_fini(const struct lu_env *env, const struct cl_io_slice *ios);
+int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io,
+ __u32 enqflags, enum cl_lock_mode mode,
+ pgoff_t start, pgoff_t end);
+int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io,
+ __u32 enqflags, enum cl_lock_mode mode,
+ loff_t start, loff_t end);
+void ccc_io_end(const struct lu_env *env, const struct cl_io_slice *ios);
+void ccc_io_advance(const struct lu_env *env, const struct cl_io_slice *ios,
+ size_t nob);
+void ccc_io_update_iov(const struct lu_env *env, struct ccc_io *cio,
+ struct cl_io *io);
+int ccc_prep_size(const struct lu_env *env, struct cl_object *obj,
+ struct cl_io *io, loff_t start, size_t count, int *exceed);
+void ccc_req_completion(const struct lu_env *env,
+ const struct cl_req_slice *slice, int ioret);
+void ccc_req_attr_set(const struct lu_env *env,
+ const struct cl_req_slice *slice,
+ const struct cl_object *obj,
+ struct cl_req_attr *oa, obd_valid flags);
+
+struct lu_device *ccc2lu_dev (struct ccc_device *vdv);
+struct lu_object *ccc2lu (struct ccc_object *vob);
+struct ccc_device *lu2ccc_dev (const struct lu_device *d);
+struct ccc_device *cl2ccc_dev (const struct cl_device *d);
+struct ccc_object *lu2ccc (const struct lu_object *obj);
+struct ccc_object *cl2ccc (const struct cl_object *obj);
+struct ccc_lock *cl2ccc_lock (const struct cl_lock_slice *slice);
+struct ccc_io *cl2ccc_io (const struct lu_env *env,
+ const struct cl_io_slice *slice);
+struct ccc_req *cl2ccc_req (const struct cl_req_slice *slice);
+struct page *cl2vm_page (const struct cl_page_slice *slice);
+struct inode *ccc_object_inode(const struct cl_object *obj);
+struct ccc_object *cl_inode2ccc (struct inode *inode);
+
+int cl_setattr_ost(struct inode *inode, const struct iattr *attr,
+ struct obd_capa *capa);
+
+struct cl_page *ccc_vmpage_page_transient(struct page *vmpage);
+int ccc_object_invariant(const struct cl_object *obj);
+int cl_file_inode_init(struct inode *inode, struct lustre_md *md);
+void cl_inode_fini(struct inode *inode);
+int cl_local_size(struct inode *inode);
+
+__u16 ll_dirent_type_get(struct lu_dirent *ent);
+__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32);
+__u32 cl_fid_build_gen(const struct lu_fid *fid);
+
+# define CLOBINVRNT(env, clob, expr) \
+ ((void)sizeof(env), (void)sizeof(clob), (void)sizeof !!(expr))
+
+int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp);
+int cl_ocd_update(struct obd_device *host,
+ struct obd_device *watched,
+ enum obd_notify_event ev, void *owner, void *data);
+
+struct ccc_grouplock {
+ struct lu_env *cg_env;
+ struct cl_io *cg_io;
+ struct cl_lock *cg_lock;
+ unsigned long cg_gid;
+};
+
+int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock,
+ struct ccc_grouplock *cg);
+void cl_put_grouplock(struct ccc_grouplock *cg);
+
+/**
+ * New interfaces to get and put lov_stripe_md from the lov layer. This
+ * violates layering, because lov_stripe_md is supposed to be private data
+ * of the lov layer.
+ *
+ * NB: If you find you have to use these interfaces for your new code, please
+ * think about it again. These interfaces may be removed in the future for
+ * better layering.
+ */
+struct lov_stripe_md *lov_lsm_get(struct cl_object *clobj);
+void lov_lsm_put(struct cl_object *clobj, struct lov_stripe_md *lsm);
+int lov_read_and_clear_async_rc(struct cl_object *clob);
+
+struct lov_stripe_md *ccc_inode_lsm_get(struct inode *inode);
+void ccc_inode_lsm_put(struct inode *inode, struct lov_stripe_md *lsm);
+
+/**
+ * Data structure managing a client's cached clean pages. An LRU of
+ * pages is maintained, along with other statistics.
+ */
+struct cl_client_cache {
+ atomic_t ccc_users; /* # of users (OSCs) of this data */
+ struct list_head ccc_lru; /* LRU list of cached clean pages */
+ spinlock_t ccc_lru_lock; /* lock for list */
+ atomic_t ccc_lru_left; /* # of LRU entries available */
+ unsigned long ccc_lru_max; /* Max # of LRU entries possible */
+ unsigned int ccc_lru_shrinkers; /* # of threads reclaiming */
+};
+
+#endif /* LCLIENT_H */
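
As an illustration of the per-thread context pattern above, a hedged sketch: the caller borrows the environment's preallocated cl_attr instead of allocating one per call. cl_env_get()/cl_env_put() and the cl_object_attr_*() helpers are declared in cl_object.h; the function itself is hypothetical.

    static int my_fetch_size(struct cl_object *obj, loff_t *size)
    {
            struct lu_env *env;
            struct cl_attr *attr;
            int refcheck;
            int rc;

            env = cl_env_get(&refcheck);
            if (IS_ERR(env))
                    return PTR_ERR(env);

            attr = ccc_env_thread_attr(env); /* pre-zeroed, thread-local */
            cl_object_attr_lock(obj);
            rc = cl_object_attr_get(env, obj, attr);
            cl_object_attr_unlock(obj);
            if (rc == 0)
                    *size = attr->cat_size;

            cl_env_put(env, &refcheck);
            return rc;
    }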
diff --git a/drivers/staging/lustre/lustre/include/linux/lprocfs_status.h b/drivers/staging/lustre/lustre/include/linux/lprocfs_status.h
new file mode 100644
index 000000000000..586692272d78
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lprocfs_status.h
@@ -0,0 +1,58 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lprocfs_status.h
+ *
+ * Top level header file for LProc SNMP
+ *
+ * Author: Hariharan Thantry thantry@users.sourceforge.net
+ */
+#ifndef _LINUX_LPROCFS_SNMP_H
+#define _LINUX_LPROCFS_SNMP_H
+
+#ifndef _LPROCFS_SNMP_H
+#error Do not #include this file directly. #include <lprocfs_status.h> instead
+#endif
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/version.h>
+#include <linux/smp.h>
+#include <linux/rwsem.h>
+#include <linux/libcfs/libcfs.h>
+#include <linux/statfs.h>
+
+
+#endif /* _LINUX_LPROCFS_SNMP_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_acl.h b/drivers/staging/lustre/lustre/include/linux/lustre_acl.h
new file mode 100644
index 000000000000..ff4fc4ff2894
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_acl.h
@@ -0,0 +1,66 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lustre/include/lustre_acl.h
+ *
+ * MDS data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_LINUX_ACL_H
+#define _LUSTRE_LINUX_ACL_H
+
+#ifndef _LUSTRE_ACL_H
+#error Do not #include this file directly. #include <lustre_acl.h> instead
+#endif
+
+# include <linux/fs.h>
+# include <linux/dcache.h>
+# ifdef CONFIG_FS_POSIX_ACL
+# include <linux/posix_acl_xattr.h>
+# define LUSTRE_POSIX_ACL_MAX_ENTRIES 32
+# define LUSTRE_POSIX_ACL_MAX_SIZE \
+ (sizeof(posix_acl_xattr_header) + \
+ LUSTRE_POSIX_ACL_MAX_ENTRIES * sizeof(posix_acl_xattr_entry))
+# endif /* CONFIG_FS_POSIX_ACL */
+# include <linux/lustre_intent.h>
+# include <linux/xattr.h> /* XATTR_{REPLACE,CREATE} */
+
+#ifndef LUSTRE_POSIX_ACL_MAX_SIZE
+# define LUSTRE_POSIX_ACL_MAX_SIZE 0
+#endif
+
+#endif /* _LUSTRE_LINUX_ACL_H */
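
To make the bound concrete: with the usual generic layout (a 4-byte posix_acl_xattr_header and 8-byte posix_acl_xattr_entry; those per-field sizes are an assumption here, not a promise of this header), 32 entries give 4 + 32 * 8 = 260 bytes. A hedged compile-time check of that expectation:

    #ifdef CONFIG_FS_POSIX_ACL
    #include <linux/bug.h>

    static inline void my_acl_bound_check(void)    /* illustrative only */
    {
            /* the 260-byte figure assumes the generic xattr layout above */
            BUILD_BUG_ON(LUSTRE_POSIX_ACL_MAX_SIZE != 260);
    }
    #endif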
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_common.h b/drivers/staging/lustre/lustre/include/linux/lustre_common.h
new file mode 100644
index 000000000000..d1783a33d8ca
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_common.h
@@ -0,0 +1,22 @@
+#ifndef LUSTRE_COMMON_H
+#define LUSTRE_COMMON_H
+
+#include <linux/sched.h>
+
+static inline int cfs_cleanup_group_info(void)
+{
+ struct group_info *ginfo;
+
+ ginfo = groups_alloc(0);
+ if (!ginfo)
+ return -ENOMEM;
+
+ set_current_groups(ginfo);
+ put_group_info(ginfo);
+
+ return 0;
+}
+
+#define ll_inode_blksize(a) (1<<(a)->i_blkbits)
+
+#endif
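
A brief usage sketch for cfs_cleanup_group_info(): a hypothetical kernel service thread drops whatever supplementary groups it inherited before acting on behalf of remote clients.

    static int my_service_thread_setup(void)    /* hypothetical */
    {
            /* installs an empty supplementary group set, or -ENOMEM */
            return cfs_cleanup_group_info();
    }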
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h b/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h
new file mode 100644
index 000000000000..dff04688945b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h
@@ -0,0 +1,349 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_COMPAT25_H
+#define _LINUX_COMPAT25_H
+
+#include <linux/fs_struct.h>
+#include <linux/namei.h>
+#include <linux/libcfs/linux/portals_compat25.h>
+
+#include <linux/lustre_patchless_compat.h>
+
+# define LOCK_FS_STRUCT(fs) spin_lock(&(fs)->lock)
+# define UNLOCK_FS_STRUCT(fs) spin_unlock(&(fs)->lock)
+
+static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt,
+ struct dentry *dentry)
+{
+ struct path path;
+ struct path old_pwd;
+
+ path.mnt = mnt;
+ path.dentry = dentry;
+ LOCK_FS_STRUCT(fs);
+ old_pwd = fs->pwd;
+ path_get(&path);
+ fs->pwd = path;
+ UNLOCK_FS_STRUCT(fs);
+
+ if (old_pwd.dentry)
+ path_put(&old_pwd);
+}
+
+
+/*
+ * set ATTR_BLOCKS to a high value to avoid any risk of collision with other
+ * ATTR_* attributes (see bug 13828)
+ */
+#define ATTR_BLOCKS (1 << 27)
+
+#define current_ngroups current_cred()->group_info->ngroups
+#define current_groups current_cred()->group_info->small_block
+
+/*
+ * OBD needs a working random driver; thus, all of our
+ * initialization routines must be called after device
+ * driver initialization.
+ */
+#ifndef MODULE
+#undef module_init
+#define module_init(a) late_initcall(a)
+#endif
+
+
+#define LTIME_S(time) (time.tv_sec)
+
+#define ll_permission(inode,mask,nd) inode_permission(inode,mask)
+
+# define ll_generic_permission(inode, mask, flags, check_acl) \
+ generic_permission(inode, mask)
+
+#define ll_blkdev_put(a, b) blkdev_put(a, b)
+
+#define ll_dentry_open(a,b,c) dentry_open(a,b,c)
+
+#define ll_vfs_symlink(dir, dentry, mnt, path, mode) \
+ vfs_symlink(dir, dentry, path)
+
+
+#define ll_generic_file_llseek_size(file, offset, origin, maxbytes, eof) \
+ generic_file_llseek_size(file, offset, origin, maxbytes, eof)
+
+/* inode_dio_wait(i) use as-is for write lock */
+# define inode_dio_write_done(i) do {} while (0) /* for write unlock */
+# define inode_dio_read(i) atomic_inc(&(i)->i_dio_count)
+/* inode_dio_done(i) use as-is for read unlock */
+
+#define TREE_READ_LOCK_IRQ(mapping) spin_lock_irq(&(mapping)->tree_lock)
+#define TREE_READ_UNLOCK_IRQ(mapping) spin_unlock_irq(&(mapping)->tree_lock)
+
+static inline
+int ll_unregister_blkdev(unsigned int dev, const char *name)
+{
+ unregister_blkdev(dev, name);
+ return 0;
+}
+
+#define ll_invalidate_bdev(a,b) invalidate_bdev((a))
+
+#ifndef FS_HAS_FIEMAP
+#define FS_HAS_FIEMAP (0)
+#endif
+
+
+
+/* add a lustre compatible layer for crypto API */
+#include <linux/crypto.h>
+#define ll_crypto_hash crypto_hash
+#define ll_crypto_cipher crypto_blkcipher
+#define ll_crypto_alloc_hash(name, type, mask) crypto_alloc_hash(name, type, mask)
+#define ll_crypto_hash_setkey(tfm, key, keylen) crypto_hash_setkey(tfm, key, keylen)
+#define ll_crypto_hash_init(desc) crypto_hash_init(desc)
+#define ll_crypto_hash_update(desc, sl, bytes) crypto_hash_update(desc, sl, bytes)
+#define ll_crypto_hash_final(desc, out) crypto_hash_final(desc, out)
+#define ll_crypto_blkcipher_setkey(tfm, key, keylen) \
+ crypto_blkcipher_setkey(tfm, key, keylen)
+#define ll_crypto_blkcipher_set_iv(tfm, src, len) \
+ crypto_blkcipher_set_iv(tfm, src, len)
+#define ll_crypto_blkcipher_get_iv(tfm, dst, len) \
+ crypto_blkcipher_get_iv(tfm, dst, len)
+#define ll_crypto_blkcipher_encrypt(desc, dst, src, bytes) \
+ crypto_blkcipher_encrypt(desc, dst, src, bytes)
+#define ll_crypto_blkcipher_decrypt(desc, dst, src, bytes) \
+ crypto_blkcipher_decrypt(desc, dst, src, bytes)
+#define ll_crypto_blkcipher_encrypt_iv(desc, dst, src, bytes) \
+ crypto_blkcipher_encrypt_iv(desc, dst, src, bytes)
+#define ll_crypto_blkcipher_decrypt_iv(desc, dst, src, bytes) \
+ crypto_blkcipher_decrypt_iv(desc, dst, src, bytes)
+
+static inline
+struct ll_crypto_cipher *ll_crypto_alloc_blkcipher(const char *name,
+ u32 type, u32 mask)
+{
+ struct ll_crypto_cipher *rtn = crypto_alloc_blkcipher(name, type, mask);
+
+ return (rtn == NULL ? ERR_PTR(-ENOMEM) : rtn);
+}
+
+static inline int ll_crypto_hmac(struct ll_crypto_hash *tfm,
+ u8 *key, unsigned int *keylen,
+ struct scatterlist *sg,
+ unsigned int size, u8 *result)
+{
+ struct hash_desc desc;
+ int rv;
+ desc.tfm = tfm;
+ desc.flags = 0;
+ rv = crypto_hash_setkey(desc.tfm, key, *keylen);
+ if (rv) {
+ CERROR("failed to hash setkey: %d\n", rv);
+ return rv;
+ }
+ return crypto_hash_digest(&desc, sg, size, result);
+}
+static inline
+unsigned int ll_crypto_tfm_alg_max_keysize(struct crypto_blkcipher *tfm)
+{
+ return crypto_blkcipher_tfm(tfm)->__crt_alg->cra_blkcipher.max_keysize;
+}
+static inline
+unsigned int ll_crypto_tfm_alg_min_keysize(struct crypto_blkcipher *tfm)
+{
+ return crypto_blkcipher_tfm(tfm)->__crt_alg->cra_blkcipher.min_keysize;
+}
+
+#define ll_crypto_hash_blocksize(tfm) crypto_hash_blocksize(tfm)
+#define ll_crypto_hash_digestsize(tfm) crypto_hash_digestsize(tfm)
+#define ll_crypto_blkcipher_ivsize(tfm) crypto_blkcipher_ivsize(tfm)
+#define ll_crypto_blkcipher_blocksize(tfm) crypto_blkcipher_blocksize(tfm)
+#define ll_crypto_free_hash(tfm) crypto_free_hash(tfm)
+#define ll_crypto_free_blkcipher(tfm) crypto_free_blkcipher(tfm)
+
+#define ll_vfs_rmdir(dir,entry,mnt) vfs_rmdir(dir,entry)
+#define ll_vfs_mkdir(inode,dir,mnt,mode) vfs_mkdir(inode,dir,mode)
+#define ll_vfs_link(old,mnt,dir,new,mnt1) vfs_link(old,dir,new)
+#define ll_vfs_unlink(inode,entry,mnt) vfs_unlink(inode,entry)
+#define ll_vfs_mknod(dir,entry,mnt,mode,dev) vfs_mknod(dir,entry,mode,dev)
+#define ll_security_inode_unlink(dir,entry,mnt) security_inode_unlink(dir,entry)
+#define ll_vfs_rename(old,old_dir,mnt,new,new_dir,mnt1) \
+ vfs_rename(old,old_dir,new,new_dir)
+
+#ifdef for_each_possible_cpu
+#define cfs_for_each_possible_cpu(cpu) for_each_possible_cpu(cpu)
+#elif defined(for_each_cpu)
+#define cfs_for_each_possible_cpu(cpu) for_each_cpu(cpu)
+#endif
+
+#define cfs_bio_io_error(a,b) bio_io_error((a))
+#define cfs_bio_endio(a,b,c) bio_endio((a),(c))
+
+#define cfs_fs_pwd(fs) ((fs)->pwd.dentry)
+#define cfs_fs_mnt(fs) ((fs)->pwd.mnt)
+#define cfs_path_put(nd) path_put(&(nd)->path)
+
+
+#ifndef SLAB_DESTROY_BY_RCU
+#define SLAB_DESTROY_BY_RCU 0
+#endif
+
+
+
+static inline int
+ll_quota_on(struct super_block *sb, int off, int ver, char *name, int remount)
+{
+ struct path path;
+ int rc;
+
+ if (!sb->s_qcop->quota_on)
+  return -ENOSYS;
+
+ rc = kern_path(name, LOOKUP_FOLLOW, &path);
+ if (rc)
+  return rc;
+
+ rc = sb->s_qcop->quota_on(sb, off, ver, &path);
+ path_put(&path);
+ return rc;
+}
+
+static inline int ll_quota_off(struct super_block *sb, int off, int remount)
+{
+ if (sb->s_qcop->quota_off)
+  return sb->s_qcop->quota_off(sb, off);
+ return -ENOSYS;
+}
+
+
+# define ll_vfs_dq_init dquot_initialize
+# define ll_vfs_dq_drop dquot_drop
+# define ll_vfs_dq_transfer dquot_transfer
+# define ll_vfs_dq_off(sb, remount) dquot_suspend(sb, -1)
+
+
+
+
+
+#define queue_max_phys_segments(rq) queue_max_segments(rq)
+#define queue_max_hw_segments(rq) queue_max_segments(rq)
+
+#define ll_kmap_atomic(a, b) kmap_atomic(a)
+#define ll_kunmap_atomic(a, b) kunmap_atomic(a)
+
+
+#define ll_d_hlist_node hlist_node
+#define ll_d_hlist_empty(list) hlist_empty(list)
+#define ll_d_hlist_entry(ptr, type, name) hlist_entry(ptr.first, type, name)
+#define ll_d_hlist_for_each(tmp, i_dentry) hlist_for_each(tmp, i_dentry)
+#define ll_d_hlist_for_each_entry(dentry, p, i_dentry, alias) \
+ p = NULL; hlist_for_each_entry(dentry, i_dentry, alias)
+
+
+#define bio_hw_segments(q, bio) 0
+
+
+#define ll_pagevec_init(pv, cold) do {} while (0)
+#define ll_pagevec_add(pv, pg) (0)
+#define ll_pagevec_lru_add_file(pv) do {} while (0)
+
+
+#ifndef QUOTA_OK
+# define QUOTA_OK 0
+#endif
+#ifndef NO_QUOTA
+# define NO_QUOTA (-EDQUOT)
+#endif
+
+#ifndef SEEK_DATA
+#define SEEK_DATA 3 /* seek to the next data */
+#endif
+#ifndef SEEK_HOLE
+#define SEEK_HOLE 4 /* seek to the next hole */
+#endif
+
+#ifndef FMODE_UNSIGNED_OFFSET
+#define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000)
+#endif
+
+#if !defined(_ASM_GENERIC_BITOPS_EXT2_NON_ATOMIC_H_) && !defined(ext2_set_bit)
+# define ext2_set_bit __test_and_set_bit_le
+# define ext2_clear_bit __test_and_clear_bit_le
+# define ext2_test_bit test_bit_le
+# define ext2_find_first_zero_bit find_first_zero_bit_le
+# define ext2_find_next_zero_bit find_next_zero_bit_le
+#endif
+
+#ifdef ATTR_TIMES_SET
+# define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
+#else
+# define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET)
+#endif
+
+
+
+/*
+ * After kernel 3.1, nameidata.intent.open.flags differs from Lustre's
+ * lookup_intent.it_flags: the lower bits of it_flags equal FMODE_xxx
+ * values, while the kernel does not translate the lower bits of
+ * nameidata.intent.open.flags to FMODE_xxx.
+ */
+#include <linux/version.h>
+static inline int ll_namei_to_lookup_intent_flag(int flag)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 1, 0)
+ flag = (flag & ~O_ACCMODE) | OPEN_FMODE(flag);
+#endif
+ return flag;
+}
+
+# define ll_mrf_ret void
+# define LL_MRF_RETURN(rc)
+
+#include <linux/fs.h>
+
+# define ll_umode_t umode_t
+
+#include <linux/dcache.h>
+
+# define ll_dirty_inode(inode, flag) (inode)->i_sb->s_op->dirty_inode((inode), flag)
+
+#endif /* _LINUX_COMPAT25_H */
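
As a usage sketch for the crypto compatibility layer above (the algorithm name, caller, and digest size are illustrative; sg_init_one() comes from <linux/scatterlist.h>):

    #include <linux/scatterlist.h>

    static int my_hmac_sha1(void *data, unsigned int len,
                            u8 *key, unsigned int keylen,
                            u8 *digest /* 20 bytes for SHA-1 */)
    {
            struct ll_crypto_hash *tfm;
            struct scatterlist sg;
            int rc;

            tfm = ll_crypto_alloc_hash("hmac(sha1)", 0, 0);
            if (IS_ERR(tfm))
                    return PTR_ERR(tfm);

            sg_init_one(&sg, data, len);
            rc = ll_crypto_hmac(tfm, key, &keylen, &sg, len, digest);
            ll_crypto_free_hash(tfm);
            return rc;
    }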
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_debug.h b/drivers/staging/lustre/lustre/include/linux/lustre_debug.h
new file mode 100644
index 000000000000..11deac7248ae
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_debug.h
@@ -0,0 +1,47 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_LUSTRE_DEBUG_H
+#define _LINUX_LUSTRE_DEBUG_H
+
+#ifndef _LUSTRE_DEBUG_H
+#error Do not #include this file directly. #include <lustre_debug.h> instead
+#endif
+
+#define LL_CDEBUG_PAGE(mask, page, fmt, arg...) \
+ CDEBUG(mask, "page %p map %p index %lu flags %lx count %u priv %0lx: "\
+ fmt, page, page->mapping, page->index, (long)page->flags, \
+ page_count(page), page_private(page), ## arg)
+
+#endif
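
Usage is like CDEBUG() with the page state prepended automatically; a short sketch (D_PAGE is a standard libcfs debug mask, the call site hypothetical):

    static inline void my_trace_page(struct page *page, int rc)
    {
            LL_CDEBUG_PAGE(D_PAGE, page, "write queued, rc = %d\n", rc);
    }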
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_dlm.h b/drivers/staging/lustre/lustre/include/linux/lustre_dlm.h
new file mode 100644
index 000000000000..207df03f6149
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_dlm.h
@@ -0,0 +1,46 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_LUSTRE_DLM_H__
+#define _LINUX_LUSTRE_DLM_H__
+
+#ifndef _LUSTRE_DLM_H__
+#error Do not #include this file directly. #include <lustre_dlm.h> instead
+#endif
+
+# include <linux/proc_fs.h>
+# include <asm/processor.h>
+# include <linux/bit_spinlock.h>
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_fsfilt.h b/drivers/staging/lustre/lustre/include/linux/lustre_fsfilt.h
new file mode 100644
index 000000000000..6c7260957383
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_fsfilt.h
@@ -0,0 +1,181 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_fsfilt.h
+ *
+ * Filesystem interface helper.
+ */
+
+#ifndef _LINUX_LUSTRE_FSFILT_H
+#define _LINUX_LUSTRE_FSFILT_H
+
+#ifndef _LUSTRE_FSFILT_H
+#error Do not #include this file directly. #include <lustre_fsfilt.h> instead
+#endif
+
+
+#include <obd.h>
+#include <obd_class.h>
+
+typedef void (*fsfilt_cb_t)(struct obd_device *obd, __u64 last_rcvd,
+ void *data, int error);
+
+struct fsfilt_operations {
+ struct list_head fs_list;
+ module_t *fs_owner;
+ char *fs_type;
+ char *(* fs_getlabel)(struct super_block *sb);
+ void *(* fs_start)(struct inode *inode, int op, void *desc_private,
+ int logs);
+ int (* fs_commit)(struct inode *inode, void *handle,int force_sync);
+ int (* fs_map_inode_pages)(struct inode *inode, struct page **page,
+ int pages, unsigned long *blocks,
+ int create, struct mutex *sem);
+ int (* fs_write_record)(struct file *, void *, int size, loff_t *,
+ int force_sync);
+ int (* fs_read_record)(struct file *, void *, int size, loff_t *);
+ int (* fs_setup)(struct super_block *sb);
+};
+
+extern int fsfilt_register_ops(struct fsfilt_operations *fs_ops);
+extern void fsfilt_unregister_ops(struct fsfilt_operations *fs_ops);
+extern struct fsfilt_operations *fsfilt_get_ops(const char *type);
+extern void fsfilt_put_ops(struct fsfilt_operations *fs_ops);
+
+static inline char *fsfilt_get_label(struct obd_device *obd,
+ struct super_block *sb)
+{
+ if (obd->obd_fsops->fs_getlabel == NULL)
+ return NULL;
+ if (obd->obd_fsops->fs_getlabel(sb)[0] == '\0')
+ return NULL;
+
+ return obd->obd_fsops->fs_getlabel(sb);
+}
+
+#define FSFILT_OP_UNLINK 1
+#define FSFILT_OP_CANCEL_UNLINK 10
+
+#define __fsfilt_check_slow(obd, start, msg) \
+do { \
+ if (cfs_time_before(jiffies, start + 15 * HZ)) \
+ break; \
+ else if (cfs_time_before(jiffies, start + 30 * HZ)) \
+ CDEBUG(D_VFSTRACE, "%s: slow %s %lus\n", obd->obd_name, \
+ msg, (jiffies-start) / HZ); \
+ else if (cfs_time_before(jiffies, start + DISK_TIMEOUT * HZ)) \
+ CWARN("%s: slow %s %lus\n", obd->obd_name, msg, \
+ (jiffies - start) / HZ); \
+ else \
+ CERROR("%s: slow %s %lus\n", obd->obd_name, msg, \
+ (jiffies - start) / HZ); \
+} while (0)
+
+#define fsfilt_check_slow(obd, start, msg) \
+do { \
+ __fsfilt_check_slow(obd, start, msg); \
+ start = jiffies; \
+} while (0)
+
+static inline void *fsfilt_start_log(struct obd_device *obd,
+ struct inode *inode, int op,
+ struct obd_trans_info *oti, int logs)
+{
+ unsigned long now = jiffies;
+ void *parent_handle = oti ? oti->oti_handle : NULL;
+ void *handle;
+
+ handle = obd->obd_fsops->fs_start(inode, op, parent_handle, logs);
+ CDEBUG(D_INFO, "started handle %p (%p)\n", handle, parent_handle);
+
+ if (oti != NULL) {
+ if (parent_handle == NULL) {
+ oti->oti_handle = handle;
+ } else if (handle != parent_handle) {
+ CERROR("mismatch: parent %p, handle %p, oti %p\n",
+ parent_handle, handle, oti);
+ LBUG();
+ }
+ }
+ fsfilt_check_slow(obd, now, "journal start");
+ return handle;
+}
+
+static inline int fsfilt_commit(struct obd_device *obd, struct inode *inode,
+ void *handle, int force_sync)
+{
+ unsigned long now = jiffies;
+ int rc = obd->obd_fsops->fs_commit(inode, handle, force_sync);
+ CDEBUG(D_INFO, "committing handle %p\n", handle);
+
+ fsfilt_check_slow(obd, now, "journal commit");
+
+ return rc;
+}
+
+static inline int fsfilt_map_inode_pages(struct obd_device *obd,
+ struct inode *inode,
+ struct page **page, int pages,
+ unsigned long *blocks,
+ int create, struct mutex *mutex)
+{
+ return obd->obd_fsops->fs_map_inode_pages(inode, page, pages, blocks,
+ create, mutex);
+}
+
+static inline int fsfilt_read_record(struct obd_device *obd, struct file *file,
+ void *buf, loff_t size, loff_t *offs)
+{
+ return obd->obd_fsops->fs_read_record(file, buf, size, offs);
+}
+
+static inline int fsfilt_write_record(struct obd_device *obd, struct file *file,
+ void *buf, loff_t size, loff_t *offs,
+ int force_sync)
+{
+ return obd->obd_fsops->fs_write_record(file, buf, size, offs,
+        force_sync);
+}
+
+static inline int fsfilt_setup(struct obd_device *obd, struct super_block *fs)
+{
+ if (obd->obd_fsops->fs_setup)
+ return obd->obd_fsops->fs_setup(fs);
+ return 0;
+}
+
+
+
+
+#endif
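
To show how the operation table is meant to be wired up, a minimal hedged sketch of a backend registering itself; every name and stub body below is hypothetical, and callbacks left unset stay NULL and must not be called.

    static void *my_fs_start(struct inode *inode, int op, void *private,
                             int logs)
    {
            return NULL;    /* this stub keeps no journal handle */
    }

    static int my_fs_commit(struct inode *inode, void *handle, int force_sync)
    {
            return 0;
    }

    static struct fsfilt_operations my_fsfilt_ops = {
            .fs_owner  = THIS_MODULE,
            .fs_type   = "myfs",
            .fs_start  = my_fs_start,
            .fs_commit = my_fs_commit,
    };

    static int __init my_fsfilt_init(void)
    {
            return fsfilt_register_ops(&my_fsfilt_ops);
    }

    static void __exit my_fsfilt_exit(void)
    {
            fsfilt_unregister_ops(&my_fsfilt_ops);
    }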
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_handles.h b/drivers/staging/lustre/lustre/include/linux/lustre_handles.h
new file mode 100644
index 000000000000..ecf184051252
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_handles.h
@@ -0,0 +1,53 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_LUSTRE_HANDLES_H_
+#define __LINUX_LUSTRE_HANDLES_H_
+
+#ifndef __LUSTRE_HANDLES_H_
+#error Do not #include this file directly. #include <lustre_handles.h> instead
+#endif
+
+#include <asm/types.h>
+#include <asm/atomic.h>
+#include <linux/list.h>
+#include <linux/version.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#include <linux/rcupdate.h> /* for rcu_head{} */
+typedef struct rcu_head cfs_rcu_head_t;
+
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_intent.h b/drivers/staging/lustre/lustre/include/linux/lustre_intent.h
new file mode 100644
index 000000000000..b10ddfa7df29
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_intent.h
@@ -0,0 +1,62 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef LUSTRE_INTENT_H
+#define LUSTRE_INTENT_H
+
+/* the IT_XXX intent values are defined in lustre/include/obd.h */
+struct lustre_intent_data {
+ int it_disposition;
+ int it_status;
+ __u64 it_lock_handle;
+ __u64 it_lock_bits;
+ int it_lock_mode;
+ int it_remote_lock_mode;
+ __u64 it_remote_lock_handle;
+ void *it_data;
+ unsigned int it_lock_set:1;
+};
+
+struct lookup_intent {
+ int it_op;
+ int it_flags;
+ int it_create_mode;
+ union {
+ struct lustre_intent_data lustre;
+ } d;
+};
+
+#endif
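
A hedged initialization sketch (IT_OPEN comes from obd.h as the comment above notes, FMODE_READ from the VFS; the helper itself is hypothetical):

    #include <linux/string.h>   /* memset */

    static inline void my_intent_init_open(struct lookup_intent *it)
    {
            memset(it, 0, sizeof(*it));
            it->it_op = IT_OPEN;
            it->it_flags = FMODE_READ;
            /* d.lustre.it_* fields are filled in later by the intent
             * machinery */
    }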
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_lib.h b/drivers/staging/lustre/lustre/include/linux/lustre_lib.h
new file mode 100644
index 000000000000..b2f755acadf6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_lib.h
@@ -0,0 +1,87 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_lib.h
+ *
+ * Basic Lustre library routines.
+ */
+
+#ifndef _LINUX_LUSTRE_LIB_H
+#define _LINUX_LUSTRE_LIB_H
+
+#ifndef _LUSTRE_LIB_H
+#error Do not #include this file directly. #include <lustre_lib.h> instead
+#endif
+
+# include <linux/rwsem.h>
+# include <linux/sched.h>
+# include <linux/signal.h>
+# include <linux/types.h>
+# include <linux/lustre_compat25.h>
+# include <linux/lustre_common.h>
+
+#ifndef LP_POISON
+#if BITS_PER_LONG > 32
+# define LI_POISON ((int)0x5a5a5a5a5a5a5a5a)
+# define LL_POISON ((long)0x5a5a5a5a5a5a5a5a)
+# define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a)
+#else
+# define LI_POISON ((int)0x5a5a5a5a)
+# define LL_POISON ((long)0x5a5a5a5a)
+# define LP_POISON ((void *)(long)0x5a5a5a5a)
+#endif
+#endif
+
+/* This macro is only for compatibility reasons with older Linux Lustre user
+ * tools. New ioctls should NOT use this macro as the ioctl "size". Instead
+ * the ioctl should get a "size" argument which is the actual data type used
+ * by the ioctl, to ensure the ioctl interface is versioned correctly. */
+#define OBD_IOC_DATA_TYPE long
+
+#define LUSTRE_FATAL_SIGS (sigmask(SIGKILL) | sigmask(SIGINT) | \
+ sigmask(SIGTERM) | sigmask(SIGQUIT) | \
+ sigmask(SIGALRM))
+
+/* initialize ost_lvb according to inode */
+static inline void inode_init_lvb(struct inode *inode, struct ost_lvb *lvb)
+{
+ lvb->lvb_size = i_size_read(inode);
+ lvb->lvb_blocks = inode->i_blocks;
+ lvb->lvb_mtime = LTIME_S(inode->i_mtime);
+ lvb->lvb_atime = LTIME_S(inode->i_atime);
+ lvb->lvb_ctime = LTIME_S(inode->i_ctime);
+}
+
+#endif /* _LINUX_LUSTRE_LIB_H */
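
A usage sketch for inode_init_lvb() (the caller is hypothetical): publish the current inode attributes into an ost_lvb before handing it to the lock or glimpse machinery.

    static inline void my_fill_lvb(struct inode *inode, struct ost_lvb *lvb)
    {
            inode_init_lvb(inode, lvb);
            CDEBUG(D_INODE, "lvb: size %llu, blocks %llu\n",
                   (unsigned long long)lvb->lvb_size,
                   (unsigned long long)lvb->lvb_blocks);
    }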
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_lite.h b/drivers/staging/lustre/lustre/include/linux/lustre_lite.h
new file mode 100644
index 000000000000..c95dff900b58
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_lite.h
@@ -0,0 +1,100 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_LL_H
+#define _LINUX_LL_H
+
+#ifndef _LL_H
+#error Do not #include this file directly. #include <lustre_lite.h> instead
+#endif
+
+
+#include <linux/version.h>
+
+#include <asm/statfs.h>
+
+#include <linux/fs.h>
+#include <linux/dcache.h>
+#include <linux/proc_fs.h>
+
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_ha.h>
+
+#include <linux/rbtree.h>
+#include <linux/lustre_compat25.h>
+#include <linux/lustre_common.h>
+#include <linux/pagemap.h>
+
+/* lprocfs.c */
+enum {
+ LPROC_LL_DIRTY_HITS = 0,
+ LPROC_LL_DIRTY_MISSES,
+ LPROC_LL_READ_BYTES,
+ LPROC_LL_WRITE_BYTES,
+ LPROC_LL_BRW_READ,
+ LPROC_LL_BRW_WRITE,
+ LPROC_LL_OSC_READ,
+ LPROC_LL_OSC_WRITE,
+ LPROC_LL_IOCTL,
+ LPROC_LL_OPEN,
+ LPROC_LL_RELEASE,
+ LPROC_LL_MAP,
+ LPROC_LL_LLSEEK,
+ LPROC_LL_FSYNC,
+ LPROC_LL_READDIR,
+ LPROC_LL_SETATTR,
+ LPROC_LL_TRUNC,
+ LPROC_LL_FLOCK,
+ LPROC_LL_GETATTR,
+ LPROC_LL_CREATE,
+ LPROC_LL_LINK,
+ LPROC_LL_UNLINK,
+ LPROC_LL_SYMLINK,
+ LPROC_LL_MKDIR,
+ LPROC_LL_RMDIR,
+ LPROC_LL_MKNOD,
+ LPROC_LL_RENAME,
+ LPROC_LL_STAFS,
+ LPROC_LL_ALLOC_INODE,
+ LPROC_LL_SETXATTR,
+ LPROC_LL_GETXATTR,
+ LPROC_LL_LISTXATTR,
+ LPROC_LL_REMOVEXATTR,
+ LPROC_LL_INODE_PERM,
+ LPROC_LL_FILE_OPCODES
+};
+
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_log.h b/drivers/staging/lustre/lustre/include/linux/lustre_log.h
new file mode 100644
index 000000000000..e9c8e56737d2
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_log.h
@@ -0,0 +1,57 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_log.h
+ *
+ * Generic infrastructure for managing a collection of logs.
+ * These logs are used for:
+ * - orphan recovery: OST adds record on create
+ * - mtime/size consistency: the OST adds a record on first write
+ * - open/unlinked objects: OST adds a record on destroy
+ *
+ * - mds unlink log: the MDS adds an entry upon delete
+ *
+ * - raid1 replication log between OST's
+ * - MDS replication logs
+ */
+
+#ifndef _LINUX_LUSTRE_LOG_H
+#define _LINUX_LUSTRE_LOG_H
+
+#ifndef _LUSTRE_LOG_H
+#error Do not #include this file directly. #include <lustre_log.h> instead
+#endif
+
+#define LUSTRE_LOG_SERVER
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_net.h b/drivers/staging/lustre/lustre/include/linux/lustre_net.h
new file mode 100644
index 000000000000..2d7c425d7012
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_net.h
@@ -0,0 +1,50 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_LUSTRE_NET_H
+#define _LINUX_LUSTRE_NET_H
+
+#ifndef _LUSTRE_NET_H
+#error Do not #include this file directly. #include <lustre_net.h> instead
+#endif
+
+#include <linux/version.h>
+#include <linux/workqueue.h>
+
+/* XXX Liang: should be moved to another header instead of here */
+#ifndef WITH_GROUP_INFO
+#define WITH_GROUP_INFO
+#endif
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
new file mode 100644
index 000000000000..a8e9c0c8ffd2
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
@@ -0,0 +1,83 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef LUSTRE_PATCHLESS_COMPAT_H
+#define LUSTRE_PATCHLESS_COMPAT_H
+
+#include <linux/fs.h>
+
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/hash.h>
+
+
+#define ll_delete_from_page_cache(page) delete_from_page_cache(page)
+
+static inline void
+truncate_complete_page(struct address_space *mapping, struct page *page)
+{
+ if (page->mapping != mapping)
+ return;
+
+ if (PagePrivate(page))
+ page->mapping->a_ops->invalidatepage(page, 0);
+
+ cancel_dirty_page(page, PAGE_SIZE);
+ ClearPageMappedToDisk(page);
+ ll_delete_from_page_cache(page);
+}
+
+#ifdef ATTR_OPEN
+# define ATTR_FROM_OPEN ATTR_OPEN
+#else
+# ifndef ATTR_FROM_OPEN
+# define ATTR_FROM_OPEN 0
+# endif
+#endif /* ATTR_OPEN */
+
+#ifndef ATTR_RAW
+#define ATTR_RAW 0
+#endif
+
+#ifndef ATTR_CTIME_SET
+/*
+ * set ATTR_CTIME_SET to a high value to avoid any risk of collision with other
+ * ATTR_* attributes (see bug 13828)
+ */
+#define ATTR_CTIME_SET (1 << 28)
+#endif
+
+#endif /* LUSTRE_PATCHLESS_COMPAT_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_quota.h b/drivers/staging/lustre/lustre/include/linux/lustre_quota.h
new file mode 100644
index 000000000000..421866b004cf
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_quota.h
@@ -0,0 +1,47 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_LUSTRE_QUOTA_H
+#define _LINUX_LUSTRE_QUOTA_H
+
+#ifndef _LUSTRE_QUOTA_H
+#error Do not #include this file directly. #include <lustre_quota.h> instead
+#endif
+
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <linux/quota.h>
+#include <linux/quotaops.h>
+
+#endif /* _LINUX_LUSTRE_QUOTA_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_user.h b/drivers/staging/lustre/lustre/include/linux/lustre_user.h
new file mode 100644
index 000000000000..ebaf92977f7f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_user.h
@@ -0,0 +1,67 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_user.h
+ *
+ * Lustre public user-space interface definitions.
+ */
+
+#ifndef _LINUX_LUSTRE_USER_H
+#define _LINUX_LUSTRE_USER_H
+
+#include <linux/version.h>
+#include <linux/quota.h>
+
+/*
+ * asm-x86_64/processor.h on some SLES 9 distros seems to use
+ * kernel-only typedefs. Fortunately, skipping it altogether is OK
+ * (for now).
+ */
+#define __ASM_X86_64_PROCESSOR_H
+
+#include <linux/string.h>
+
+#if defined(__x86_64__) || defined(__ia64__) || defined(__ppc64__) || \
+    defined(__craynv) || defined(__mips64__) || defined(__powerpc64__)
+typedef struct stat lstat_t;
+#define lstat_f lstat
+#else
+typedef struct stat64 lstat_t;
+#define lstat_f lstat64
+#endif
+#define HAVE_LOV_USER_MDS_DATA
+
+#endif /* _LINUX_LUSTRE_USER_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lvfs.h b/drivers/staging/lustre/lustre/include/linux/lvfs.h
new file mode 100644
index 000000000000..eb59ac7d5946
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lvfs.h
@@ -0,0 +1,134 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lvfs.h
+ *
+ * lustre VFS/process permission interface
+ */
+
+#ifndef __LINUX_LVFS_H__
+#define __LINUX_LVFS_H__
+
+#ifndef __LVFS_H__
+#error Do not #include this file directly. #include <lvfs.h> instead
+#endif
+
+#include <linux/lustre_compat25.h>
+#include <linux/lustre_common.h>
+#include <linux/lvfs_linux.h>
+
+#define LLOG_LVFS
+
+/* simple.c */
+
+struct lvfs_ucred {
+ __u32 luc_uid;
+ __u32 luc_gid;
+ __u32 luc_fsuid;
+ __u32 luc_fsgid;
+ kernel_cap_t luc_cap;
+ __u32 luc_umask;
+ struct group_info *luc_ginfo;
+ struct md_identity *luc_identity;
+};
+
+struct lvfs_callback_ops {
+	struct dentry *(*l_fid2dentry)(__u64 id_ino, __u32 gen, __u64 gr,
+				       void *data);
+};
+
+#define OBD_RUN_CTXT_MAGIC 0xC0FFEEAA
+#define OBD_CTXT_DEBUG /* development-only debugging */
+struct lvfs_run_ctxt {
+ struct vfsmount *pwdmnt;
+ struct dentry *pwd;
+ mm_segment_t fs;
+ struct lvfs_ucred luc;
+ int ngroups;
+ struct lvfs_callback_ops cb_ops;
+ struct group_info *group_info;
+ struct dt_device *dt;
+#ifdef OBD_CTXT_DEBUG
+ __u32 magic;
+#endif
+};
+
+#ifdef OBD_CTXT_DEBUG
+#define OBD_SET_CTXT_MAGIC(ctxt) (ctxt)->magic = OBD_RUN_CTXT_MAGIC
+#else
+#define OBD_SET_CTXT_MAGIC(ctxt) do {} while (0)
+#endif
+
+
+int lustre_rename(struct dentry *dir, struct vfsmount *mnt, char *oldname,
+ char *newname);
+
+static inline void l_dput(struct dentry *de)
+{
+ if (!de || IS_ERR(de))
+ return;
+	/* shrink_dcache_parent(de); */
+ LASSERT(d_count(de) > 0);
+ dput(de);
+}
+
+/* We need to hold the inode semaphore over the dcache lookup itself, or we
+ * run the risk of entering the filesystem lookup path concurrently on SMP
+ * systems, and instantiating two inodes for the same entry. We still
+ * protect against concurrent addition/removal races with the DLM locking.
+ */
+static inline struct dentry *ll_lookup_one_len(const char *fid_name,
+ struct dentry *dparent,
+ int fid_namelen)
+{
+ struct dentry *dchild;
+
+ mutex_lock(&dparent->d_inode->i_mutex);
+ dchild = lookup_one_len(fid_name, dparent, fid_namelen);
+ mutex_unlock(&dparent->d_inode->i_mutex);
+
+ if (IS_ERR(dchild) || dchild->d_inode == NULL)
+ return dchild;
+
+ if (is_bad_inode(dchild->d_inode)) {
+ CERROR("bad inode returned %lu/%u\n",
+ dchild->d_inode->i_ino, dchild->d_inode->i_generation);
+ dput(dchild);
+ dchild = ERR_PTR(-ENOENT);
+ }
+ return dchild;
+}
+
+
+#endif /* __LINUX_LVFS_H__ */
diff --git a/drivers/staging/lustre/lustre/include/linux/lvfs_linux.h b/drivers/staging/lustre/lustre/include/linux/lvfs_linux.h
new file mode 100644
index 000000000000..140a60f1f0c9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lvfs_linux.h
@@ -0,0 +1,66 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LVFS_LINUX_H__
+#define __LVFS_LINUX_H__
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/sched.h>
+
+#include <lvfs.h>
+
+#define l_file file
+#define l_dentry dentry
+
+#define l_filp_open filp_open
+
+struct lvfs_run_ctxt;
+struct l_file *l_dentry_open(struct lvfs_run_ctxt *, struct l_dentry *,
+ int flags);
+
+struct l_linux_dirent {
+ struct list_head lld_list;
+ ino_t lld_ino;
+ unsigned long lld_off;
+ char lld_name[LL_FID_NAMELEN];
+};
+struct l_readdir_callback {
+ struct l_linux_dirent *lrc_dirent;
+ struct list_head *lrc_list;
+};
+
+#endif /* __LVFS_LINUX_H__ */
diff --git a/drivers/staging/lustre/lustre/include/linux/obd.h b/drivers/staging/lustre/lustre/include/linux/obd.h
new file mode 100644
index 000000000000..2c36c0d19d06
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/obd.h
@@ -0,0 +1,128 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_OBD_H
+#define __LINUX_OBD_H
+
+#ifndef __OBD_H
+#error Do not #include this file directly. #include <obd.h> instead
+#endif
+
+#include <obd_support.h>
+
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/sched.h>  /* for struct task_struct, for current.h */
+#include <linux/proc_fs.h>
+#include <linux/mount.h>
+#include <linux/lustre_intent.h>
+
+struct ll_iattr {
+ struct iattr iattr;
+ unsigned int ia_attr_flags;
+};
+
+#define CLIENT_OBD_LIST_LOCK_DEBUG 1
+
+typedef struct {
+ spinlock_t lock;
+
+ unsigned long time;
+ struct task_struct *task;
+ const char *func;
+ int line;
+} client_obd_lock_t;
+
+static inline void __client_obd_list_lock(client_obd_lock_t *lock,
+ const char *func, int line)
+{
+ unsigned long cur = jiffies;
+ while (1) {
+ if (spin_trylock(&lock->lock)) {
+ LASSERT(lock->task == NULL);
+ lock->task = current;
+ lock->func = func;
+ lock->line = line;
+ lock->time = jiffies;
+ break;
+ }
+
+ if ((jiffies - cur > 5 * HZ) &&
+ (jiffies - lock->time > 5 * HZ)) {
+ struct task_struct *task = lock->task;
+
+ if (task == NULL)
+ continue;
+
+ LCONSOLE_WARN("%s:%d: lock %p was acquired"
+ " by <%s:%d:%s:%d> for %lu seconds.\n",
+ current->comm, current->pid,
+ lock, task->comm, task->pid,
+ lock->func, lock->line,
+ (jiffies - lock->time) / HZ);
+ LCONSOLE_WARN("====== for process holding the "
+ "lock =====\n");
+ libcfs_debug_dumpstack(task);
+ LCONSOLE_WARN("====== for current process =====\n");
+ libcfs_debug_dumpstack(NULL);
+ LCONSOLE_WARN("====== end =======\n");
+ cfs_pause(1000 * HZ);
+ }
+ cpu_relax();
+ }
+}
+
+#define client_obd_list_lock(lock) \
+ __client_obd_list_lock(lock, __FUNCTION__, __LINE__)
+
+static inline void client_obd_list_unlock(client_obd_lock_t *lock)
+{
+ LASSERT(lock->task != NULL);
+ lock->task = NULL;
+ lock->time = jiffies;
+ spin_unlock(&lock->lock);
+}
+
+
+static inline void client_obd_list_lock_init(client_obd_lock_t *lock)
+{
+ spin_lock_init(&lock->lock);
+}
+
+static inline void client_obd_list_lock_done(client_obd_lock_t *lock)
+{}
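+
+/*
+ * Typical life cycle of such a lock (sketch; the variable is hypothetical):
+ *
+ *	client_obd_lock_t lock;
+ *
+ *	client_obd_list_lock_init(&lock);
+ *	client_obd_list_lock(&lock);
+ *	... touch the protected state ...
+ *	client_obd_list_unlock(&lock);
+ *	client_obd_list_lock_done(&lock);
+ */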
+
+#endif /* __LINUX_OBD_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/obd_class.h b/drivers/staging/lustre/lustre/include/linux/obd_class.h
new file mode 100644
index 000000000000..021ead6639fc
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/obd_class.h
@@ -0,0 +1,58 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_CLASS_OBD_H
+#define __LINUX_CLASS_OBD_H
+
+#ifndef __CLASS_OBD_H
+#error Do not #include this file directly. #include <obd_class.h> instead
+#endif
+
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/timer.h>
+
+/* obdo.c */
+void obdo_from_la(struct obdo *dst, struct lu_attr *la, __u64 valid);
+void la_from_obdo(struct lu_attr *la, struct obdo *src, obd_flag valid);
+void obdo_refresh_inode(struct inode *dst, struct obdo *src, obd_flag valid);
+void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid);
+#define ll_inode_flags(inode) (inode->i_flags)
+
+
+#endif /* __LINUX_CLASS_OBD_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/obd_support.h b/drivers/staging/lustre/lustre/include/linux/obd_support.h
new file mode 100644
index 000000000000..9166503408aa
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/obd_support.h
@@ -0,0 +1,63 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_OBD_SUPPORT
+#define _LINUX_OBD_SUPPORT
+
+#ifndef _OBD_SUPPORT
+#error Do not #include this file directly. #include <obd_support.h> instead
+#endif
+
+#ifdef CONFIG_X86
+#include <asm/cpufeature.h>
+#endif
+#include <asm/processor.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/swap.h>
+#include <linux/lustre_compat25.h>
+#include <linux/lustre_common.h>
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+
+#include <linux/types.h>
+#include <linux/blkdev.h>
+#include <lvfs.h>
+
+#endif /* _LINUX_OBD_SUPPORT */
diff --git a/drivers/staging/lustre/lustre/include/lprocfs_status.h b/drivers/staging/lustre/lustre/include/lprocfs_status.h
new file mode 100644
index 000000000000..55f182205d78
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lprocfs_status.h
@@ -0,0 +1,1043 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lprocfs_status.h
+ *
+ * Top level header file for LProc SNMP
+ *
+ * Author: Hariharan Thantry thantry@users.sourceforge.net
+ */
+#ifndef _LPROCFS_SNMP_H
+#define _LPROCFS_SNMP_H
+
+#include <linux/lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+#include <linux/libcfs/params_tree.h>
+
+struct lprocfs_vars {
+ const char *name;
+ struct file_operations *fops;
+ void *data;
+ /**
+ * /proc file mode.
+ */
+ umode_t proc_mode;
+};
+
+struct lprocfs_static_vars {
+ struct lprocfs_vars *module_vars;
+ struct lprocfs_vars *obd_vars;
+};
+
+/* if we find more consumers this could be generalized */
+#define OBD_HIST_MAX 32
+struct obd_histogram {
+ spinlock_t oh_lock;
+ unsigned long oh_buckets[OBD_HIST_MAX];
+};
+
+enum {
+ BRW_R_PAGES = 0,
+ BRW_W_PAGES,
+ BRW_R_RPC_HIST,
+ BRW_W_RPC_HIST,
+ BRW_R_IO_TIME,
+ BRW_W_IO_TIME,
+ BRW_R_DISCONT_PAGES,
+ BRW_W_DISCONT_PAGES,
+ BRW_R_DISCONT_BLOCKS,
+ BRW_W_DISCONT_BLOCKS,
+ BRW_R_DISK_IOSIZE,
+ BRW_W_DISK_IOSIZE,
+ BRW_R_DIO_FRAGS,
+ BRW_W_DIO_FRAGS,
+ BRW_LAST,
+};
+
+struct brw_stats {
+ struct obd_histogram hist[BRW_LAST];
+};
+
+enum {
+ RENAME_SAMEDIR_SIZE = 0,
+ RENAME_CROSSDIR_SRC_SIZE,
+ RENAME_CROSSDIR_TGT_SIZE,
+ RENAME_LAST,
+};
+
+struct rename_stats {
+ struct obd_histogram hist[RENAME_LAST];
+};
+
+/* An lprocfs counter can be configured using the enum bit masks below.
+ *
+ * LPROCFS_CNTR_EXTERNALLOCK indicates that an external lock already
+ * protects this counter from concurrent updates. If not specified,
+ * lprocfs uses an internal per-counter lock variable. External locks
+ * are not used to protect counter increments, but they do protect
+ * counter readout and resets.
+ *
+ * LPROCFS_CNTR_AVGMINMAX indicates a multi-valued counter, i.e. one
+ * whose samples can increment it by more than 1. When specified, the
+ * counter maintains min, max and sum in addition to a simple invocation
+ * count, which allows averages to be computed. If not specified, the
+ * counter is an increment-by-1 counter and min, max, sum, etc. are not
+ * maintained.
+ *
+ * LPROCFS_CNTR_STDDEV indicates that the counter should track the sum
+ * of squares (for multi-valued counter samples only). This allows
+ * external computation of the standard deviation, but involves a 64-bit
+ * multiply per counter increment.
+ *
+ * A short usage sketch follows the enum below.
+ */
+
+enum {
+ LPROCFS_CNTR_EXTERNALLOCK = 0x0001,
+ LPROCFS_CNTR_AVGMINMAX = 0x0002,
+ LPROCFS_CNTR_STDDEV = 0x0004,
+
+ /* counter data type */
+ LPROCFS_TYPE_REGS = 0x0100,
+ LPROCFS_TYPE_BYTES = 0x0200,
+ LPROCFS_TYPE_PAGES = 0x0400,
+ LPROCFS_TYPE_CYCLE = 0x0800,
+};
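+
+/*
+ * Usage sketch (the indices and names are hypothetical, not part of this
+ * header): one by-one counter plus one multi-valued byte counter with
+ * average/min/max tracking:
+ *
+ *	stats = lprocfs_alloc_stats(2, LPROCFS_STATS_FLAG_NONE);
+ *	lprocfs_counter_init(stats, 0, 0, "open", "reqs");
+ *	lprocfs_counter_init(stats, 1,
+ *			     LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_BYTES,
+ *			     "write_bytes", "bytes");
+ */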
+
+#define LC_MIN_INIT ((~(__u64)0) >> 1)
+
+struct lprocfs_counter_header {
+ unsigned int lc_config;
+ const char *lc_name; /* must be static */
+ const char *lc_units; /* must be static */
+};
+
+struct lprocfs_counter {
+ __s64 lc_count;
+ __s64 lc_min;
+ __s64 lc_max;
+ __s64 lc_sumsquare;
+	/*
+	 * Every counter has lc_array_sum[0]; lc_array_sum[1] exists only
+	 * for counters used in irq context, i.e. in stats allocated with
+	 * the LPROCFS_STATS_FLAG_IRQ_SAFE flag.
+	 */
+ __s64 lc_array_sum[1];
+};
+#define lc_sum lc_array_sum[0]
+#define lc_sum_irq lc_array_sum[1]
+
+struct lprocfs_percpu {
+#ifndef __GNUC__
+ __s64 pad;
+#endif
+ struct lprocfs_counter lp_cntr[0];
+};
+
+#define LPROCFS_GET_NUM_CPU 0x0001
+#define LPROCFS_GET_SMP_ID 0x0002
+
+enum lprocfs_stats_flags {
+ LPROCFS_STATS_FLAG_NONE = 0x0000, /* per cpu counter */
+ LPROCFS_STATS_FLAG_NOPERCPU = 0x0001, /* stats have no percpu
+ * area and need locking */
+ LPROCFS_STATS_FLAG_IRQ_SAFE = 0x0002, /* alloc need irq safe */
+};
+
+enum lprocfs_fields_flags {
+ LPROCFS_FIELDS_FLAGS_CONFIG = 0x0001,
+ LPROCFS_FIELDS_FLAGS_SUM = 0x0002,
+ LPROCFS_FIELDS_FLAGS_MIN = 0x0003,
+ LPROCFS_FIELDS_FLAGS_MAX = 0x0004,
+ LPROCFS_FIELDS_FLAGS_AVG = 0x0005,
+ LPROCFS_FIELDS_FLAGS_SUMSQUARE = 0x0006,
+ LPROCFS_FIELDS_FLAGS_COUNT = 0x0007,
+};
+
+struct lprocfs_stats {
+ /* # of counters */
+ unsigned short ls_num;
+ /* 1 + the biggest cpu # whose ls_percpu slot has been allocated */
+ unsigned short ls_biggest_alloc_num;
+ enum lprocfs_stats_flags ls_flags;
+ /* Lock used when there are no percpu stats areas; For percpu stats,
+ * it is used to protect ls_biggest_alloc_num change */
+ spinlock_t ls_lock;
+
+ /* has ls_num of counter headers */
+ struct lprocfs_counter_header *ls_cnt_header;
+ struct lprocfs_percpu *ls_percpu[0];
+};
+
+#define OPC_RANGE(seg) (seg ## _LAST_OPC - seg ## _FIRST_OPC)
+
+/* Pack all opcodes down into a single monotonically increasing index */
+static inline int opcode_offset(__u32 opc) {
+ if (opc < OST_LAST_OPC) {
+ /* OST opcode */
+ return (opc - OST_FIRST_OPC);
+ } else if (opc < MDS_LAST_OPC) {
+ /* MDS opcode */
+ return (opc - MDS_FIRST_OPC +
+ OPC_RANGE(OST));
+ } else if (opc < LDLM_LAST_OPC) {
+ /* LDLM Opcode */
+ return (opc - LDLM_FIRST_OPC +
+ OPC_RANGE(MDS) +
+ OPC_RANGE(OST));
+ } else if (opc < MGS_LAST_OPC) {
+ /* MGS Opcode */
+ return (opc - MGS_FIRST_OPC +
+ OPC_RANGE(LDLM) +
+ OPC_RANGE(MDS) +
+ OPC_RANGE(OST));
+ } else if (opc < OBD_LAST_OPC) {
+ /* OBD Ping */
+ return (opc - OBD_FIRST_OPC +
+ OPC_RANGE(MGS) +
+ OPC_RANGE(LDLM) +
+ OPC_RANGE(MDS) +
+ OPC_RANGE(OST));
+ } else if (opc < LLOG_LAST_OPC) {
+ /* LLOG Opcode */
+ return (opc - LLOG_FIRST_OPC +
+ OPC_RANGE(OBD) +
+ OPC_RANGE(MGS) +
+ OPC_RANGE(LDLM) +
+ OPC_RANGE(MDS) +
+ OPC_RANGE(OST));
+ } else if (opc < QUOTA_LAST_OPC) {
+ /* LQUOTA Opcode */
+ return (opc - QUOTA_FIRST_OPC +
+ OPC_RANGE(LLOG) +
+ OPC_RANGE(OBD) +
+ OPC_RANGE(MGS) +
+ OPC_RANGE(LDLM) +
+ OPC_RANGE(MDS) +
+ OPC_RANGE(OST));
+ } else if (opc < SEQ_LAST_OPC) {
+ /* SEQ opcode */
+ return (opc - SEQ_FIRST_OPC +
+ OPC_RANGE(QUOTA) +
+ OPC_RANGE(LLOG) +
+ OPC_RANGE(OBD) +
+ OPC_RANGE(MGS) +
+ OPC_RANGE(LDLM) +
+ OPC_RANGE(MDS) +
+ OPC_RANGE(OST));
+ } else if (opc < SEC_LAST_OPC) {
+ /* SEC opcode */
+ return (opc - SEC_FIRST_OPC +
+ OPC_RANGE(SEQ) +
+ OPC_RANGE(QUOTA) +
+ OPC_RANGE(LLOG) +
+ OPC_RANGE(OBD) +
+ OPC_RANGE(MGS) +
+ OPC_RANGE(LDLM) +
+ OPC_RANGE(MDS) +
+ OPC_RANGE(OST));
+ } else if (opc < FLD_LAST_OPC) {
+ /* FLD opcode */
+ return (opc - FLD_FIRST_OPC +
+ OPC_RANGE(SEC) +
+ OPC_RANGE(SEQ) +
+ OPC_RANGE(QUOTA) +
+ OPC_RANGE(LLOG) +
+ OPC_RANGE(OBD) +
+ OPC_RANGE(MGS) +
+ OPC_RANGE(LDLM) +
+ OPC_RANGE(MDS) +
+ OPC_RANGE(OST));
+ } else if (opc < UPDATE_LAST_OPC) {
+ /* update opcode */
+ return (opc - UPDATE_FIRST_OPC +
+ OPC_RANGE(FLD) +
+ OPC_RANGE(SEC) +
+ OPC_RANGE(SEQ) +
+ OPC_RANGE(QUOTA) +
+ OPC_RANGE(LLOG) +
+ OPC_RANGE(OBD) +
+ OPC_RANGE(MGS) +
+ OPC_RANGE(LDLM) +
+ OPC_RANGE(MDS) +
+ OPC_RANGE(OST));
+ } else {
+ /* Unknown Opcode */
+ return -1;
+ }
+}
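+
+/*
+ * For example, directly from the ordering above:
+ * opcode_offset(OST_FIRST_OPC) == 0, while
+ * opcode_offset(MDS_FIRST_OPC) == OPC_RANGE(OST), i.e. the MDS opcodes
+ * land immediately after the OST range in the packed index.
+ */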
+
+
+#define LUSTRE_MAX_OPCODES (OPC_RANGE(OST)   + \
+			    OPC_RANGE(MDS)   + \
+			    OPC_RANGE(LDLM)  + \
+			    OPC_RANGE(MGS)   + \
+			    OPC_RANGE(OBD)   + \
+			    OPC_RANGE(LLOG)  + \
+			    OPC_RANGE(QUOTA) + \
+			    OPC_RANGE(SEQ)   + \
+			    OPC_RANGE(SEC)   + \
+			    OPC_RANGE(FLD)   + \
+			    OPC_RANGE(UPDATE))
+
+#define EXTRA_MAX_OPCODES ((PTLRPC_LAST_CNTR - PTLRPC_FIRST_CNTR) + \
+ OPC_RANGE(EXTRA))
+
+enum {
+ PTLRPC_REQWAIT_CNTR = 0,
+ PTLRPC_REQQDEPTH_CNTR,
+ PTLRPC_REQACTIVE_CNTR,
+ PTLRPC_TIMEOUT,
+ PTLRPC_REQBUF_AVAIL_CNTR,
+ PTLRPC_LAST_CNTR
+};
+
+#define PTLRPC_FIRST_CNTR PTLRPC_REQWAIT_CNTR
+
+enum {
+ LDLM_GLIMPSE_ENQUEUE = 0,
+ LDLM_PLAIN_ENQUEUE,
+ LDLM_EXTENT_ENQUEUE,
+ LDLM_FLOCK_ENQUEUE,
+ LDLM_IBITS_ENQUEUE,
+ MDS_REINT_SETATTR,
+ MDS_REINT_CREATE,
+ MDS_REINT_LINK,
+ MDS_REINT_UNLINK,
+ MDS_REINT_RENAME,
+ MDS_REINT_OPEN,
+ MDS_REINT_SETXATTR,
+ BRW_READ_BYTES,
+ BRW_WRITE_BYTES,
+ EXTRA_LAST_OPC
+};
+
+#define EXTRA_FIRST_OPC LDLM_GLIMPSE_ENQUEUE
+/* class_obd.c */
+extern proc_dir_entry_t *proc_lustre_root;
+
+struct obd_device;
+struct obd_histogram;
+
+/* Days / hours / mins / seconds format */
+struct dhms {
+ int d,h,m,s;
+};
+static inline void s2dhms(struct dhms *ts, time_t secs)
+{
+ ts->d = secs / 86400;
+ secs = secs % 86400;
+ ts->h = secs / 3600;
+ secs = secs % 3600;
+ ts->m = secs / 60;
+ ts->s = secs % 60;
+}
+#define DHMS_FMT "%dd%dh%02dm%02ds"
+#define DHMS_VARS(x) (x)->d, (x)->h, (x)->m, (x)->s
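+
+/*
+ * Example (hypothetical): an age of 93784 seconds prints as "1d2h03m04s":
+ *
+ *	struct dhms ts;
+ *
+ *	s2dhms(&ts, 93784);
+ *	seq_printf(m, DHMS_FMT"\n", DHMS_VARS(&ts));
+ */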
+
+#define JOBSTATS_JOBID_VAR_MAX_LEN 20
+#define JOBSTATS_DISABLE "disable"
+#define JOBSTATS_PROCNAME_UID "procname_uid"
+
+typedef void (*cntr_init_callback)(struct lprocfs_stats *stats);
+
+struct obd_job_stats {
+ cfs_hash_t *ojs_hash;
+ struct list_head ojs_list;
+	rwlock_t		ojs_lock; /* protects ojs_list */
+ cntr_init_callback ojs_cntr_init_fn;
+ int ojs_cntr_num;
+ int ojs_cleanup_interval;
+ time_t ojs_last_cleanup;
+};
+
+#ifdef LPROCFS
+
+extern int lprocfs_stats_alloc_one(struct lprocfs_stats *stats,
+ unsigned int cpuid);
+/*
+ * \retval the smp id (for LPROCFS_GET_SMP_ID) or the number of cpus
+ *	   (for LPROCFS_GET_NUM_CPU)
+ * \retval negative on error (only possible when \a opc is
+ *	   LPROCFS_GET_SMP_ID)
+ */
+static inline int lprocfs_stats_lock(struct lprocfs_stats *stats, int opc,
+ unsigned long *flags)
+{
+ int rc = 0;
+
+ switch (opc) {
+ default:
+ LBUG();
+
+ case LPROCFS_GET_SMP_ID:
+ if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) {
+ if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+ spin_lock_irqsave(&stats->ls_lock, *flags);
+ else
+ spin_lock(&stats->ls_lock);
+ return 0;
+ } else {
+ unsigned int cpuid = get_cpu();
+
+ if (unlikely(stats->ls_percpu[cpuid] == NULL)) {
+ rc = lprocfs_stats_alloc_one(stats, cpuid);
+ if (rc < 0) {
+ put_cpu();
+ return rc;
+ }
+ }
+ return cpuid;
+ }
+
+ case LPROCFS_GET_NUM_CPU:
+ if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) {
+ if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+ spin_lock_irqsave(&stats->ls_lock, *flags);
+ else
+ spin_lock(&stats->ls_lock);
+ return 1;
+ } else {
+ return stats->ls_biggest_alloc_num;
+ }
+ }
+}
+
+static inline void lprocfs_stats_unlock(struct lprocfs_stats *stats, int opc,
+ unsigned long *flags)
+{
+ switch (opc) {
+ default:
+ LBUG();
+
+ case LPROCFS_GET_SMP_ID:
+ if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) {
+ if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) {
+ spin_unlock_irqrestore(&stats->ls_lock,
+ *flags);
+ } else {
+ spin_unlock(&stats->ls_lock);
+ }
+ } else {
+ put_cpu();
+ }
+ return;
+
+ case LPROCFS_GET_NUM_CPU:
+ if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) {
+ if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) {
+ spin_unlock_irqrestore(&stats->ls_lock,
+ *flags);
+ } else {
+ spin_unlock(&stats->ls_lock);
+ }
+ }
+ return;
+ }
+}
+
+static inline unsigned int
+lprocfs_stats_counter_size(struct lprocfs_stats *stats)
+{
+ unsigned int percpusize;
+
+ percpusize = offsetof(struct lprocfs_percpu, lp_cntr[stats->ls_num]);
+
+ /* irq safe stats need lc_array_sum[1] */
+ if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+ percpusize += stats->ls_num * sizeof(__s64);
+
+ if ((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0)
+ percpusize = L1_CACHE_ALIGN(percpusize);
+
+ return percpusize;
+}
+
+static inline struct lprocfs_counter *
+lprocfs_stats_counter_get(struct lprocfs_stats *stats, unsigned int cpuid,
+ int index)
+{
+ struct lprocfs_counter *cntr;
+
+ cntr = &stats->ls_percpu[cpuid]->lp_cntr[index];
+
+ if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+ cntr = (void *)cntr + index * sizeof(__s64);
+
+ return cntr;
+}
+
+/* Two optimized LPROCFS counter increment functions are provided:
+ * lprocfs_counter_incr(stats, idx) - optimized for by-one counters
+ * lprocfs_counter_add(stats, idx, amount) - use for multi-valued counters
+ * The counter data layout allows the config flag, counter lock and the
+ * count itself to reside within a single cache line. A usage sketch
+ * follows the declarations below.
+ */
+
+extern void lprocfs_counter_add(struct lprocfs_stats *stats, int idx,
+ long amount);
+extern void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx,
+ long amount);
+
+#define lprocfs_counter_incr(stats, idx) \
+ lprocfs_counter_add(stats, idx, 1)
+#define lprocfs_counter_decr(stats, idx) \
+ lprocfs_counter_sub(stats, idx, 1)
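+
+/*
+ * Usage sketch (the LPROC_FOO_* indices are hypothetical): count an
+ * event and accumulate a multi-valued sample in one pass:
+ *
+ *	lprocfs_counter_incr(stats, LPROC_FOO_COUNT);
+ *	lprocfs_counter_add(stats, LPROC_FOO_BYTES, nbytes);
+ */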
+
+extern __s64 lprocfs_read_helper(struct lprocfs_counter *lc,
+ struct lprocfs_counter_header *header,
+ enum lprocfs_stats_flags flags,
+ enum lprocfs_fields_flags field);
+static inline __u64 lprocfs_stats_collector(struct lprocfs_stats *stats,
+ int idx,
+ enum lprocfs_fields_flags field)
+{
+ int i;
+ unsigned int num_cpu;
+ unsigned long flags = 0;
+ __u64 ret = 0;
+
+ LASSERT(stats != NULL);
+
+ num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags);
+ for (i = 0; i < num_cpu; i++) {
+ if (stats->ls_percpu[i] == NULL)
+ continue;
+ ret += lprocfs_read_helper(
+ lprocfs_stats_counter_get(stats, i, idx),
+ &stats->ls_cnt_header[idx], stats->ls_flags,
+ field);
+ }
+ lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags);
+ return ret;
+}
+
+extern struct lprocfs_stats *
+lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags);
+extern void lprocfs_clear_stats(struct lprocfs_stats *stats);
+extern void lprocfs_free_stats(struct lprocfs_stats **stats);
+extern void lprocfs_init_ops_stats(int num_private_stats,
+ struct lprocfs_stats *stats);
+extern void lprocfs_init_mps_stats(int num_private_stats,
+ struct lprocfs_stats *stats);
+extern void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats);
+extern int lprocfs_alloc_obd_stats(struct obd_device *obddev,
+ unsigned int num_private_stats);
+extern int lprocfs_alloc_md_stats(struct obd_device *obddev,
+ unsigned int num_private_stats);
+extern void lprocfs_counter_init(struct lprocfs_stats *stats, int index,
+ unsigned conf, const char *name,
+ const char *units);
+extern void lprocfs_free_obd_stats(struct obd_device *obddev);
+extern void lprocfs_free_md_stats(struct obd_device *obddev);
+struct obd_export;
+struct nid_stat;
+extern int lprocfs_add_clear_entry(struct obd_device * obd,
+ proc_dir_entry_t *entry);
+extern int lprocfs_exp_setup(struct obd_export *exp,
+ lnet_nid_t *peer_nid, int *newnid);
+extern int lprocfs_exp_cleanup(struct obd_export *exp);
+extern proc_dir_entry_t *lprocfs_add_simple(struct proc_dir_entry *root,
+ char *name,
+ void *data,
+ struct file_operations *fops);
+extern struct proc_dir_entry *
+lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent,
+ const char *format, ...);
+extern void lprocfs_free_per_client_stats(struct obd_device *obd);
+extern int
+lprocfs_nid_stats_clear_write(struct file *file, const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_nid_stats_clear_read(struct seq_file *m, void *data);
+
+extern int lprocfs_register_stats(proc_dir_entry_t *root, const char *name,
+ struct lprocfs_stats *stats);
+
+/* lprocfs_status.c */
+extern int lprocfs_add_vars(proc_dir_entry_t *root,
+ struct lprocfs_vars *var,
+ void *data);
+
+extern proc_dir_entry_t *lprocfs_register(const char *name,
+ proc_dir_entry_t *parent,
+ struct lprocfs_vars *list,
+ void *data);
+
+extern void lprocfs_remove(proc_dir_entry_t **root);
+extern void lprocfs_remove_proc_entry(const char *name,
+ struct proc_dir_entry *parent);
+
+extern int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list);
+extern int lprocfs_obd_cleanup(struct obd_device *obd);
+
+extern int lprocfs_seq_create(proc_dir_entry_t *parent, const char *name,
+ umode_t mode,
+ const struct file_operations *seq_fops,
+ void *data);
+extern int lprocfs_obd_seq_create(struct obd_device *dev, const char *name,
+ umode_t mode,
+ const struct file_operations *seq_fops,
+ void *data);
+
+/* Generic callbacks */
+
+extern int lprocfs_rd_u64(struct seq_file *m, void *data);
+extern int lprocfs_rd_atomic(struct seq_file *m, void *data);
+extern int lprocfs_wr_atomic(struct file *file, const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_rd_uint(struct seq_file *m, void *data);
+extern int lprocfs_wr_uint(struct file *file, const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_rd_uuid(struct seq_file *m, void *data);
+extern int lprocfs_rd_name(struct seq_file *m, void *data);
+extern int lprocfs_rd_server_uuid(struct seq_file *m, void *data);
+extern int lprocfs_rd_conn_uuid(struct seq_file *m, void *data);
+extern int lprocfs_rd_import(struct seq_file *m, void *data);
+extern int lprocfs_rd_state(struct seq_file *m, void *data);
+extern int lprocfs_rd_connect_flags(struct seq_file *m, void *data);
+extern int lprocfs_rd_num_exports(struct seq_file *m, void *data);
+extern int lprocfs_rd_numrefs(struct seq_file *m, void *data);
+
+struct adaptive_timeout;
+extern int lprocfs_at_hist_helper(struct seq_file *m,
+ struct adaptive_timeout *at);
+extern int lprocfs_rd_timeouts(struct seq_file *m, void *data);
+extern int lprocfs_wr_timeouts(struct file *file, const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_wr_evict_client(struct file *file, const char *buffer,
+ size_t count, loff_t *off);
+extern int lprocfs_wr_ping(struct file *file, const char *buffer,
+ size_t count, loff_t *off);
+extern int lprocfs_wr_import(struct file *file, const char *buffer,
+ size_t count, loff_t *off);
+extern int lprocfs_rd_pinger_recov(struct seq_file *m, void *n);
+extern int lprocfs_wr_pinger_recov(struct file *file, const char *buffer,
+ size_t count, loff_t *off);
+
+/* Statfs helpers */
+extern int lprocfs_rd_blksize(struct seq_file *m, void *data);
+extern int lprocfs_rd_kbytestotal(struct seq_file *m, void *data);
+extern int lprocfs_rd_kbytesfree(struct seq_file *m, void *data);
+extern int lprocfs_rd_kbytesavail(struct seq_file *m, void *data);
+extern int lprocfs_rd_filestotal(struct seq_file *m, void *data);
+extern int lprocfs_rd_filesfree(struct seq_file *m, void *data);
+
+extern int lprocfs_write_helper(const char *buffer, unsigned long count,
+ int *val);
+extern int lprocfs_write_frac_helper(const char *buffer, unsigned long count,
+ int *val, int mult);
+extern int lprocfs_seq_read_frac_helper(struct seq_file *m, long val, int mult);
+extern int lprocfs_read_frac_helper(char *buffer, unsigned long count,
+ long val, int mult);
+extern int lprocfs_write_u64_helper(const char *buffer, unsigned long count,
+ __u64 *val);
+extern int lprocfs_write_frac_u64_helper(const char *buffer,
+ unsigned long count,
+ __u64 *val, int mult);
+char *lprocfs_find_named_value(const char *buffer, const char *name,
+ unsigned long *count);
+void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value);
+void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value);
+void lprocfs_oh_clear(struct obd_histogram *oh);
+unsigned long lprocfs_oh_sum(struct obd_histogram *oh);
+
+void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx,
+ struct lprocfs_counter *cnt);
+
+extern int lprocfs_single_release(cfs_inode_t *, struct file *);
+extern int lprocfs_seq_release(cfs_inode_t *, struct file *);
+
+/* You must use these macros when you want to refer to
+ * the import in a client obd_device for an lprocfs entry;
+ * see the sketch following LPROCFS_CLIMP_EXIT() below */
+#define LPROCFS_CLIMP_CHECK(obd) do { \
+ typecheck(struct obd_device *, obd); \
+ down_read(&(obd)->u.cli.cl_sem); \
+ if ((obd)->u.cli.cl_import == NULL) { \
+ up_read(&(obd)->u.cli.cl_sem); \
+ return -ENODEV; \
+ } \
+} while(0)
+#define LPROCFS_CLIMP_EXIT(obd) \
+ up_read(&(obd)->u.cli.cl_sem);
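+
+/*
+ * Typical pattern (sketch; the field being read is only illustrative):
+ *
+ *	LPROCFS_CLIMP_CHECK(obd);
+ *	seq_printf(m, "%s\n",
+ *		   obd->u.cli.cl_import->imp_obd->obd_name);
+ *	LPROCFS_CLIMP_EXIT(obd);
+ */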
+
+
+/* Write the name##_seq_show function and call LPROC_SEQ_FOPS_RO for a
+   read-only proc entry; otherwise, also define a name##_seq_write
+   function for a read-write proc entry and call LPROC_SEQ_FOPS instead.
+   Finally, call lprocfs_obd_seq_create(obd, filename, 0444, &name##_fops,
+   data); see the example following these macros. */
+#define __LPROC_SEQ_FOPS(name, custom_seq_write) \
+static int name##_single_open(cfs_inode_t *inode, struct file *file) \
+{ \
+ return single_open(file, name##_seq_show, PDE_DATA(inode)); \
+} \
+struct file_operations name##_fops = { \
+ .owner = THIS_MODULE, \
+ .open = name##_single_open, \
+ .read = seq_read, \
+ .write = custom_seq_write, \
+ .llseek = seq_lseek, \
+ .release = lprocfs_single_release, \
+}
+
+#define LPROC_SEQ_FOPS_RO(name) __LPROC_SEQ_FOPS(name, NULL)
+#define LPROC_SEQ_FOPS(name) __LPROC_SEQ_FOPS(name, name##_seq_write)
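+
+/*
+ * Illustrative read-only entry (the "foo" names are hypothetical):
+ *
+ *	static int foo_seq_show(struct seq_file *m, void *v)
+ *	{
+ *		return seq_printf(m, "%d\n", 42);
+ *	}
+ *	LPROC_SEQ_FOPS_RO(foo);
+ *
+ * followed by lprocfs_obd_seq_create(obd, "foo", 0444, &foo_fops, data).
+ */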
+
+#define LPROC_SEQ_FOPS_RO_TYPE(name, type) \
+ static int name##_##type##_seq_show(struct seq_file *m, void *v)\
+ { \
+ return lprocfs_rd_##type(m, m->private); \
+ } \
+ LPROC_SEQ_FOPS_RO(name##_##type)
+
+#define LPROC_SEQ_FOPS_RW_TYPE(name, type) \
+ static int name##_##type##_seq_show(struct seq_file *m, void *v)\
+ { \
+ return lprocfs_rd_##type(m, m->private); \
+ } \
+ static ssize_t name##_##type##_seq_write(struct file *file, \
+ const char *buffer, size_t count, loff_t *off) \
+ { \
+ struct seq_file *seq = file->private_data; \
+ return lprocfs_wr_##type(file, buffer, \
+ count, seq->private); \
+ } \
+ LPROC_SEQ_FOPS(name##_##type);
+
+#define LPROC_SEQ_FOPS_WR_ONLY(name, type) \
+ static ssize_t name##_##type##_write(struct file *file, \
+ const char *buffer, size_t count, loff_t *off) \
+ { \
+ return lprocfs_wr_##type(file, buffer, count, off); \
+ } \
+ static int name##_##type##_open(cfs_inode_t *inode, struct file *file) \
+ { \
+ return single_open(file, NULL, PDE_DATA(inode)); \
+ } \
+ struct file_operations name##_##type##_fops = { \
+ .open = name##_##type##_open, \
+ .write = name##_##type##_write, \
+ .release = lprocfs_single_release, \
+ };
+
+/* lprocfs_jobstats.c */
+int lprocfs_job_stats_log(struct obd_device *obd, char *jobid,
+ int event, long amount);
+void lprocfs_job_stats_fini(struct obd_device *obd);
+int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num,
+ cntr_init_callback fn);
+int lprocfs_rd_job_interval(struct seq_file *m, void *data);
+int lprocfs_wr_job_interval(struct file *file, const char *buffer,
+ unsigned long count, void *data);
+
+/* lproc_ptlrpc.c */
+struct ptlrpc_request;
+extern void target_print_req(void *seq_file, struct ptlrpc_request *req);
+
+/* lproc_status.c */
+int lprocfs_obd_rd_max_pages_per_rpc(struct seq_file *m, void *data);
+int lprocfs_obd_wr_max_pages_per_rpc(struct file *file, const char *buffer,
+ size_t count, loff_t *off);
+
+/* all quota proc functions */
+extern int lprocfs_quota_rd_bunit(char *page, char **start,
+ loff_t off, int count,
+ int *eof, void *data);
+extern int lprocfs_quota_wr_bunit(struct file *file, const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_quota_rd_btune(char *page, char **start,
+ loff_t off, int count,
+ int *eof, void *data);
+extern int lprocfs_quota_wr_btune(struct file *file, const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_quota_rd_iunit(char *page, char **start,
+ loff_t off, int count,
+ int *eof, void *data);
+extern int lprocfs_quota_wr_iunit(struct file *file, const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_quota_rd_itune(char *page, char **start,
+ loff_t off, int count,
+ int *eof, void *data);
+extern int lprocfs_quota_wr_itune(struct file *file, const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_quota_rd_type(char *page, char **start, loff_t off, int count,
+ int *eof, void *data);
+extern int lprocfs_quota_wr_type(struct file *file, const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_quota_rd_switch_seconds(char *page, char **start, loff_t off,
+ int count, int *eof, void *data);
+extern int lprocfs_quota_wr_switch_seconds(struct file *file,
+ const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_quota_rd_sync_blk(char *page, char **start, loff_t off,
+ int count, int *eof, void *data);
+extern int lprocfs_quota_wr_sync_blk(struct file *file, const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_quota_rd_switch_qs(char *page, char **start, loff_t off,
+ int count, int *eof, void *data);
+extern int lprocfs_quota_wr_switch_qs(struct file *file,
+ const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_quota_rd_boundary_factor(char *page, char **start, loff_t off,
+ int count, int *eof, void *data);
+extern int lprocfs_quota_wr_boundary_factor(struct file *file,
+ const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_quota_rd_least_bunit(char *page, char **start, loff_t off,
+ int count, int *eof, void *data);
+extern int lprocfs_quota_wr_least_bunit(struct file *file,
+ const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_quota_rd_least_iunit(char *page, char **start, loff_t off,
+ int count, int *eof, void *data);
+extern int lprocfs_quota_wr_least_iunit(struct file *file,
+ const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_quota_rd_qs_factor(char *page, char **start, loff_t off,
+ int count, int *eof, void *data);
+extern int lprocfs_quota_wr_qs_factor(struct file *file,
+ const char *buffer,
+ unsigned long count, void *data);
+
+
+
+#else
+/* LPROCFS is not defined */
+
+#define proc_lustre_root NULL
+
+static inline void lprocfs_counter_add(struct lprocfs_stats *stats,
+ int index, long amount)
+{ return; }
+static inline void lprocfs_counter_incr(struct lprocfs_stats *stats,
+ int index)
+{ return; }
+static inline void lprocfs_counter_sub(struct lprocfs_stats *stats,
+ int index, long amount)
+{ return; }
+static inline void lprocfs_counter_decr(struct lprocfs_stats *stats,
+ int index)
+{ return; }
+static inline void lprocfs_counter_init(struct lprocfs_stats *stats,
+ int index, unsigned conf,
+ const char *name, const char *units)
+{ return; }
+
+static inline __u64 lc_read_helper(struct lprocfs_counter *lc,
+ enum lprocfs_fields_flags field)
+{ return 0; }
+
+/* NB: we return !NULL to satisfy error checker */
+static inline struct lprocfs_stats *
+lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags)
+{ return (struct lprocfs_stats *)1; }
+static inline void lprocfs_clear_stats(struct lprocfs_stats *stats)
+{ return; }
+static inline void lprocfs_free_stats(struct lprocfs_stats **stats)
+{ return; }
+static inline int lprocfs_register_stats(proc_dir_entry_t *root,
+ const char *name,
+ struct lprocfs_stats *stats)
+{ return 0; }
+static inline void lprocfs_init_ops_stats(int num_private_stats,
+ struct lprocfs_stats *stats)
+{ return; }
+static inline void lprocfs_init_mps_stats(int num_private_stats,
+ struct lprocfs_stats *stats)
+{ return; }
+static inline void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats)
+{ return; }
+static inline int lprocfs_alloc_obd_stats(struct obd_device *obddev,
+ unsigned int num_private_stats)
+{ return 0; }
+static inline int lprocfs_alloc_md_stats(struct obd_device *obddev,
+ unsigned int num_private_stats)
+{ return 0; }
+static inline void lprocfs_free_obd_stats(struct obd_device *obddev)
+{ return; }
+static inline void lprocfs_free_md_stats(struct obd_device *obddev)
+{ return; }
+
+struct obd_export;
+static inline int lprocfs_add_clear_entry(struct obd_device *obd,
+					  proc_dir_entry_t *entry)
+{ return 0; }
+static inline int lprocfs_exp_setup(struct obd_export *exp,
+				    lnet_nid_t *peer_nid, int *newnid)
+{ return 0; }
+static inline int lprocfs_exp_cleanup(struct obd_export *exp)
+{ return 0; }
+static inline proc_dir_entry_t *
+lprocfs_add_simple(struct proc_dir_entry *root, char *name,
+ void *data, struct file_operations *fops)
+{ return NULL; }
+static inline struct proc_dir_entry *
+lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent,
+ const char *format, ...)
+{return NULL; }
+static inline void lprocfs_free_per_client_stats(struct obd_device *obd)
+{ return; }
+static inline
+int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{return count;}
+static inline
+int lprocfs_nid_stats_clear_read(struct seq_file *m, void *data)
+{ return 0; }
+
+static inline proc_dir_entry_t *
+lprocfs_register(const char *name, proc_dir_entry_t *parent,
+ struct lprocfs_vars *list, void *data)
+{ return NULL; }
+static inline int lprocfs_add_vars(proc_dir_entry_t *root,
+ struct lprocfs_vars *var,
+ void *data)
+{ return 0; }
+static inline void lprocfs_remove(proc_dir_entry_t **root)
+{ return; }
+static inline void lprocfs_remove_proc_entry(const char *name,
+ struct proc_dir_entry *parent)
+{ return; }
+static inline int lprocfs_obd_setup(struct obd_device *dev,
+ struct lprocfs_vars *list)
+{ return 0; }
+static inline int lprocfs_obd_cleanup(struct obd_device *dev)
+{ return 0; }
+static inline int lprocfs_rd_u64(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_uuid(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_name(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_server_uuid(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_conn_uuid(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_import(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_pinger_recov(struct seq_file *m, void *n)
+{ return 0; }
+static inline int lprocfs_rd_state(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_connect_flags(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_num_exports(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_numrefs(struct seq_file *m, void *data)
+{ return 0; }
+struct adaptive_timeout;
+static inline int lprocfs_at_hist_helper(struct seq_file *m,
+ struct adaptive_timeout *at)
+{ return 0; }
+static inline int lprocfs_rd_timeouts(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_wr_timeouts(struct file *file,
+ const char *buffer,
+ unsigned long count, void *data)
+{ return 0; }
+static inline int lprocfs_wr_evict_client(struct file *file, const char *buffer,
+ size_t count, loff_t *off)
+{ return 0; }
+static inline int lprocfs_wr_ping(struct file *file, const char *buffer,
+ size_t count, loff_t *off)
+{ return 0; }
+static inline int lprocfs_wr_import(struct file *file, const char *buffer,
+ size_t count, loff_t *off)
+{ return 0; }
+static inline int lprocfs_wr_pinger_recov(struct file *file, const char *buffer,
+ size_t count, loff_t *off)
+{ return 0; }
+
+/* Statfs helpers */
+static inline
+int lprocfs_rd_blksize(struct seq_file *m, void *data)
+{ return 0; }
+static inline
+int lprocfs_rd_kbytestotal(struct seq_file *m, void *data)
+{ return 0; }
+static inline
+int lprocfs_rd_kbytesfree(struct seq_file *m, void *data)
+{ return 0; }
+static inline
+int lprocfs_rd_kbytesavail(struct seq_file *m, void *data)
+{ return 0; }
+static inline
+int lprocfs_rd_filestotal(struct seq_file *m, void *data)
+{ return 0; }
+static inline
+int lprocfs_rd_filesfree(struct seq_file *m, void *data)
+{ return 0; }
+static inline
+void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value)
+{ return; }
+static inline
+void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value)
+{ return; }
+static inline
+void lprocfs_oh_clear(struct obd_histogram *oh)
+{ return; }
+static inline
+unsigned long lprocfs_oh_sum(struct obd_histogram *oh)
+{ return 0; }
+static inline
+void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx,
+ struct lprocfs_counter *cnt)
+{ return; }
+static inline
+__u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx,
+ enum lprocfs_fields_flags field)
+{ return (__u64)0; }
+
+#define LPROC_SEQ_FOPS_RO(name)
+#define LPROC_SEQ_FOPS(name)
+#define LPROC_SEQ_FOPS_RO_TYPE(name, type)
+#define LPROC_SEQ_FOPS_RW_TYPE(name, type)
+#define LPROC_SEQ_FOPS_WR_ONLY(name, type)
+
+/* lprocfs_jobstats.c */
+static inline
+int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, int event,
+ long amount)
+{ return 0; }
+static inline
+void lprocfs_job_stats_fini(struct obd_device *obd)
+{ return; }
+static inline
+int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num,
+ cntr_init_callback fn)
+{ return 0; }
+
+
+/* lproc_ptlrpc.c */
+#define target_print_req NULL
+
+#endif /* LPROCFS */
+
+#endif /* LPROCFS_SNMP_H */
diff --git a/drivers/staging/lustre/lustre/include/lu_object.h b/drivers/staging/lustre/lustre/include/lu_object.h
new file mode 100644
index 000000000000..d40ad81b4eb2
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lu_object.h
@@ -0,0 +1,1346 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LUSTRE_LU_OBJECT_H
+#define __LUSTRE_LU_OBJECT_H
+
+#include <stdarg.h>
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+#include <lu_ref.h>
+
+struct seq_file;
+struct proc_dir_entry;
+struct lustre_cfg;
+struct lprocfs_stats;
+
+/** \defgroup lu lu
+ * lu_* data-types represent server-side entities shared by data and meta-data
+ * stacks.
+ *
+ * Design goals:
+ *
+ * -# support for layering.
+ *
+ * A server-side object is split into layers, one per device in the
+ * corresponding device stack. An individual layer is represented by struct
+ * lu_object; the compound layered object by struct lu_object_header. Most
+ * interface functions take a lu_object as an argument and operate on the
+ * whole compound object. This decision was made for the following
+ * reasons:
+ *
+ * - it's envisaged that lu_object will be used much more often than
+ * lu_object_header;
+ *
+ * - we want lower (non-top) layers to be able to initiate operations
+ * on the whole object.
+ *
+ * Generic code supports layering more complex than simple stacking, e.g.,
+ * an object at some layer may "spawn" multiple sub-objects on the lower
+ * layer.
+ *
+ * -# fid-based identification.
+ *
+ * A compound object is uniquely identified by its fid. Objects are indexed
+ * by their fids (a hash table is used for the index).
+ *
+ * -# caching and life-cycle management.
+ *
+ * An object's lifetime is controlled by reference counting. When the
+ * reference count drops to 0, the object is returned to the cache. Cached
+ * objects still retain their identity (i.e., fid) and can be recovered
+ * from the cache.
+ *
+ * Objects are kept in a global LRU list, and the lu_site_purge() function
+ * can be used to reclaim a given number of unused objects from the tail
+ * of the LRU.
+ *
+ * -# avoiding recursion.
+ *
+ * Generic code tries to replace recursion through layers with iteration
+ * where possible. In addition, to reduce stack consumption, data are,
+ * where practical, allocated through the lu_context_key interface rather
+ * than on the stack.
+ * @{
+ */
+
+struct lu_site;
+struct lu_object;
+struct lu_device;
+struct lu_object_header;
+struct lu_context;
+struct lu_env;
+
+/**
+ * Operations common for data and meta-data devices.
+ */
+struct lu_device_operations {
+ /**
+ * Allocate object for the given device (without lower-layer
+ * parts). This is called by lu_object_operations::loo_object_init()
+	 * from the parent layer, and should set up at least lu_object::lo_dev
+ * and lu_object::lo_ops fields of resulting lu_object.
+ *
+ * Object creation protocol.
+ *
+ * Due to design goal of avoiding recursion, object creation (see
+ * lu_object_alloc()) is somewhat involved:
+ *
+ * - first, lu_device_operations::ldo_object_alloc() method of the
+ * top-level device in the stack is called. It should allocate top
+ * level object (including lu_object_header), but without any
+ * lower-layer sub-object(s).
+ *
+ * - then lu_object_alloc() sets fid in the header of newly created
+ * object.
+ *
+ * - then lu_object_operations::loo_object_init() is called. It has
+ * to allocate lower-layer object(s). To do this,
+ * lu_object_operations::loo_object_init() calls ldo_object_alloc()
+ * of the lower-layer device(s).
+ *
+ * - for all new objects allocated by
+ * lu_object_operations::loo_object_init() (and inserted into object
+ * stack), lu_object_operations::loo_object_init() is called again
+	 *   repeatedly, until no new objects are created (a rough sketch
+	 *   of this loop follows the structure below).
+ *
+ * \post ergo(!IS_ERR(result), result->lo_dev == d &&
+ * result->lo_ops != NULL);
+ */
+ struct lu_object *(*ldo_object_alloc)(const struct lu_env *env,
+ const struct lu_object_header *h,
+ struct lu_device *d);
+ /**
+	 * Process configuration specific to the device.
+ */
+ int (*ldo_process_config)(const struct lu_env *env,
+ struct lu_device *, struct lustre_cfg *);
+ int (*ldo_recovery_complete)(const struct lu_env *,
+ struct lu_device *);
+
+ /**
+	 * Initialize local objects for the device. This method is called
+	 * after the layer has been initialized (after the LCFG_SETUP stage)
+	 * and before it starts serving user requests.
+ */
+
+ int (*ldo_prepare)(const struct lu_env *,
+ struct lu_device *parent,
+ struct lu_device *dev);
+
+};
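+
+/*
+ * The object creation protocol described above, as a rough pseudo-code
+ * sketch (illustrative only, not the exact lu_object_alloc()
+ * implementation):
+ *
+ *	top = d->ld_ops->ldo_object_alloc(env, NULL, d);
+ *	top->lo_header->loh_fid = *fid;
+ *	do {
+ *		nr_new = 0;
+ *		for each layer "o" not yet initialized:
+ *			o->lo_ops->loo_object_init(env, o, conf), nr_new++;
+ *	} while (nr_new > 0);
+ */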
+
+/**
+ * For lu_object_conf flags
+ */
+typedef enum {
+	/* This is a new object to be allocated, or the file
+	 * corresponding to the object does not exist. */
+ LOC_F_NEW = 0x00000001,
+} loc_flags_t;
+
+/**
+ * Object configuration, describing the particulars of the object being
+ * created. On the server this is not used, as server objects are fully
+ * identified by fid. On the client, the configuration contains struct
+ * lustre_md.
+ */
+struct lu_object_conf {
+ /**
+ * Some hints for obj find and alloc.
+ */
+ loc_flags_t loc_flags;
+};
+
+/**
+ * Type of "printer" function used by lu_object_operations::loo_object_print()
+ * method.
+ *
+ * A printer function is needed to provide some flexibility in
+ * (semi-)debugging output; possible implementations include printk,
+ * CDEBUG and sysfs/seq_file.
+ */
+typedef int (*lu_printer_t)(const struct lu_env *env,
+ void *cookie, const char *format, ...)
+ __attribute__ ((format (printf, 3, 4)));
+
+/**
+ * Operations specific for particular lu_object.
+ */
+struct lu_object_operations {
+
+ /**
+ * Allocate lower-layer parts of the object by calling
+ * lu_device_operations::ldo_object_alloc() of the corresponding
+ * underlying device.
+ *
+ * This method is called once for each object inserted into the object
+ * stack. It is the responsibility of this method to insert the
+ * lower-layer object(s) it creates into the appropriate places of the
+ * object stack (a sketch follows this structure).
+ */
+ int (*loo_object_init)(const struct lu_env *env,
+ struct lu_object *o,
+ const struct lu_object_conf *conf);
+ /**
+ * Called (in top-to-bottom order) during object allocation after all
+ * layers were allocated and initialized. Can be used to perform
+ * initialization depending on lower layers.
+ */
+ int (*loo_object_start)(const struct lu_env *env,
+ struct lu_object *o);
+ /**
+ * Called before lu_object_operations::loo_object_free() to signal
+ * that object is being destroyed. Dual to
+ * lu_object_operations::loo_object_init().
+ */
+ void (*loo_object_delete)(const struct lu_env *env,
+ struct lu_object *o);
+ /**
+ * Dual to lu_device_operations::ldo_object_alloc(). Called when
+ * object is removed from memory.
+ */
+ void (*loo_object_free)(const struct lu_env *env,
+ struct lu_object *o);
+ /**
+ * Called when last active reference to the object is released (and
+ * object returns to the cache). This method is optional.
+ */
+ void (*loo_object_release)(const struct lu_env *env,
+ struct lu_object *o);
+ /**
+ * Optional debugging helper. Print given object.
+ */
+ int (*loo_object_print)(const struct lu_env *env, void *cookie,
+ lu_printer_t p, const struct lu_object *o);
+ /**
+ * Optional debugging method. Returns true iff the object is internally
+ * consistent.
+ */
+ int (*loo_object_invariant)(const struct lu_object *o);
+};
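+
+/*
+ * A hypothetical loo_object_init() sketch for the same imaginary "foo"
+ * layer, showing how a layer allocates its lower-layer sub-object. The
+ * foo_dev()/fd_child accessors are assumed helpers, not part of this
+ * header:
+ *
+ * \code
+ * static int foo_object_init(const struct lu_env *env, struct lu_object *o,
+ *                            const struct lu_object_conf *conf)
+ * {
+ *         struct lu_device *under = foo_dev(o->lo_dev)->fd_child;
+ *         struct lu_object *below;
+ *
+ *         below = under->ld_ops->ldo_object_alloc(env, o->lo_header, under);
+ *         if (below == NULL)
+ *                 return -ENOMEM;
+ *
+ *         // Insert the new sub-object after this layer in the stack;
+ *         // lu_object_alloc() will then call loo_object_init() on it too.
+ *         lu_object_add(o, below);
+ *         return 0;
+ * }
+ * \endcode
+ */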
+
+/**
+ * Type of lu_device.
+ */
+struct lu_device_type;
+
+/**
+ * Device: a layer in the server side abstraction stacking.
+ */
+struct lu_device {
+ /**
+ * reference count. This is incremented, in particular, on each object
+ * created at this layer.
+ *
+ * \todo XXX which means that atomic_t is probably too small.
+ */
+ atomic_t ld_ref;
+ /**
+ * Pointer to device type. Never modified once set.
+ */
+ struct lu_device_type *ld_type;
+ /**
+ * Operation vector for this device.
+ */
+ const struct lu_device_operations *ld_ops;
+ /**
+ * Stack this device belongs to.
+ */
+ struct lu_site *ld_site;
+ struct proc_dir_entry *ld_proc_entry;
+
+ /** \todo XXX: temporary back pointer into obd. */
+ struct obd_device *ld_obd;
+ /**
+ * A list of references to this device, for debugging.
+ */
+ struct lu_ref ld_reference;
+ /**
+ * Link the device to the site.
+ */
+ struct list_head ld_linkage;
+};
+
+struct lu_device_type_operations;
+
+/**
+ * Tag bits for device type. They are used to distinguish certain groups of
+ * device types.
+ */
+enum lu_device_tag {
+ /** this is meta-data device */
+ LU_DEVICE_MD = (1 << 0),
+ /** this is data device */
+ LU_DEVICE_DT = (1 << 1),
+ /** data device in the client stack */
+ LU_DEVICE_CL = (1 << 2)
+};
+
+/**
+ * Type of device.
+ */
+struct lu_device_type {
+ /**
+ * Tag bits. Taken from enum lu_device_tag. Never modified once set.
+ */
+ __u32 ldt_tags;
+ /**
+ * Name of this class. Unique system-wide. Never modified once set.
+ */
+ char *ldt_name;
+ /**
+ * Operations for this type.
+ */
+ const struct lu_device_type_operations *ldt_ops;
+ /**
+ * \todo XXX: temporary pointer to associated obd_type.
+ */
+ struct obd_type *ldt_obd_type;
+ /**
+ * \todo XXX: temporary: context tags used by obd_*() calls.
+ */
+ __u32 ldt_ctx_tags;
+ /**
+ * Number of existing device type instances.
+ */
+ unsigned ldt_device_nr;
+ /**
+ * Linkage into a global list of all device types.
+ *
+ * \see lu_device_types.
+ */
+ struct list_head ldt_linkage;
+};
+
+/**
+ * Operations on a device type.
+ */
+struct lu_device_type_operations {
+ /**
+ * Allocate new device.
+ */
+ struct lu_device *(*ldto_device_alloc)(const struct lu_env *env,
+ struct lu_device_type *t,
+ struct lustre_cfg *lcfg);
+ /**
+ * Free device. Dual to
+ * lu_device_type_operations::ldto_device_alloc(). Returns pointer to
+ * the next device in the stack.
+ */
+ struct lu_device *(*ldto_device_free)(const struct lu_env *,
+ struct lu_device *);
+
+ /**
+ * Initialize the devices after allocation
+ */
+ int (*ldto_device_init)(const struct lu_env *env,
+ struct lu_device *, const char *,
+ struct lu_device *);
+ /**
+ * Finalize device. Dual to
+ * lu_device_type_operations::ldto_device_init(). Returns pointer to
+ * the next device in the stack.
+ */
+ struct lu_device *(*ldto_device_fini)(const struct lu_env *env,
+ struct lu_device *);
+ /**
+ * Initialize device type. This is called on module load.
+ */
+ int (*ldto_init)(struct lu_device_type *t);
+ /**
+ * Finalize device type. Dual to
+ * lu_device_type_operations::ldto_init(). Called on module unload.
+ */
+ void (*ldto_fini)(struct lu_device_type *t);
+ /**
+ * Called when the first device is created.
+ */
+ void (*ldto_start)(struct lu_device_type *t);
+ /**
+ * Called when number of devices drops to 0.
+ */
+ void (*ldto_stop)(struct lu_device_type *t);
+};
+
+static inline int lu_device_is_md(const struct lu_device *d)
+{
+ return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_MD);
+}
+
+/**
+ * Flags for the object layers.
+ */
+enum lu_object_flags {
+ /**
+ * This flag is set if lu_object_operations::loo_object_init() has
+ * been called for this layer. Used by lu_object_alloc().
+ */
+ LU_OBJECT_ALLOCATED = (1 << 0)
+};
+
+/**
+ * Common object attributes.
+ */
+struct lu_attr {
+ /** size in bytes */
+ __u64 la_size;
+ /** modification time in seconds since Epoch */
+ obd_time la_mtime;
+ /** access time in seconds since Epoch */
+ obd_time la_atime;
+ /** change time in seconds since Epoch */
+ obd_time la_ctime;
+ /** 512-byte blocks allocated to object */
+ __u64 la_blocks;
+ /** permission bits and file type */
+ __u32 la_mode;
+ /** owner id */
+ __u32 la_uid;
+ /** group id */
+ __u32 la_gid;
+ /** object flags */
+ __u32 la_flags;
+ /** number of persistent references to this object */
+ __u32 la_nlink;
+ /** blkbits of the object */
+ __u32 la_blkbits;
+ /** blksize of the object */
+ __u32 la_blksize;
+ /** real device */
+ __u32 la_rdev;
+ /**
+ * valid bits
+ *
+ * \see enum la_valid
+ */
+ __u64 la_valid;
+};
+
+/** Bit-mask of valid attributes */
+enum la_valid {
+ LA_ATIME = 1 << 0,
+ LA_MTIME = 1 << 1,
+ LA_CTIME = 1 << 2,
+ LA_SIZE = 1 << 3,
+ LA_MODE = 1 << 4,
+ LA_UID = 1 << 5,
+ LA_GID = 1 << 6,
+ LA_BLOCKS = 1 << 7,
+ LA_TYPE = 1 << 8,
+ LA_FLAGS = 1 << 9,
+ LA_NLINK = 1 << 10,
+ LA_RDEV = 1 << 11,
+ LA_BLKSIZE = 1 << 12,
+ LA_KILL_SUID = 1 << 13,
+ LA_KILL_SGID = 1 << 14,
+};
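+
+/*
+ * Usage sketch (an assumption, not taken from this header): filling
+ * struct lu_attr for a setattr-like operation. Only fields whose bits
+ * are set in la_valid are meaningful to the consumer:
+ *
+ * \code
+ * struct lu_attr attr;
+ *
+ * memset(&attr, 0, sizeof(attr));
+ * attr.la_size  = 0;
+ * attr.la_mode  = S_IFREG | 0644;  // la_mode carries type and permissions
+ * attr.la_valid = LA_SIZE | LA_MODE | LA_TYPE;
+ * \endcode
+ */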
+
+/**
+ * Layer in the layered object.
+ */
+struct lu_object {
+ /**
+ * Header for this object.
+ */
+ struct lu_object_header *lo_header;
+ /**
+ * Device for this layer.
+ */
+ struct lu_device *lo_dev;
+ /**
+ * Operations for this object.
+ */
+ const struct lu_object_operations *lo_ops;
+ /**
+ * Linkage into list of all layers.
+ */
+ struct list_head lo_linkage;
+ /**
+ * Depth. Top level layer depth is 0.
+ */
+ int lo_depth;
+ /**
+ * Flags from enum lu_object_flags.
+ */
+ __u32 lo_flags;
+ /**
+ * Link to the device, for debugging.
+ */
+ struct lu_ref_link *lo_dev_ref;
+};
+
+enum lu_object_header_flags {
+ /**
+ * Don't keep this object in cache. Object will be destroyed as soon
+ * as last reference to it is released. This flag cannot be cleared
+ * once set.
+ */
+ LU_OBJECT_HEARD_BANSHEE = 0,
+ /**
+ * Marks that this object has already been taken out of the cache.
+ */
+ LU_OBJECT_UNHASHED = 1
+};
+
+enum lu_object_header_attr {
+ LOHA_EXISTS = 1 << 0,
+ LOHA_REMOTE = 1 << 1,
+ /**
+ * UNIX file type is stored in S_IFMT bits.
+ */
+ LOHA_FT_START = 001 << 12, /**< S_IFIFO */
+ LOHA_FT_END = 017 << 12, /**< S_IFMT */
+};
+
+/**
+ * "Compound" object, consisting of multiple layers.
+ *
+ * A compound object with a given fid is unique within a given lu_site.
+ *
+ * Note that an object does *not* necessarily correspond to a real object in
+ * persistent storage: an object is an anchor for locking and method calls,
+ * so it is created even for things like a not-yet-existing child made by
+ * mkdir or create calls. lu_object_exists() can be used to check whether an
+ * object is backed by a persistent storage entity.
+ */
+struct lu_object_header {
+ /**
+ * Object flags from enum lu_object_header_flags. Set and checked
+ * atomically.
+ */
+ unsigned long loh_flags;
+ /**
+ * Object reference count. Protected by lu_site::ls_guard.
+ */
+ atomic_t loh_ref;
+ /**
+ * Fid, uniquely identifying this object.
+ */
+ struct lu_fid loh_fid;
+ /**
+ * Common object attributes, cached for efficiency. From enum
+ * lu_object_header_attr.
+ */
+ __u32 loh_attr;
+ /**
+ * Linkage into per-site hash table. Protected by lu_site::ls_guard.
+ */
+ struct hlist_node loh_hash;
+ /**
+ * Linkage into per-site LRU list. Protected by lu_site::ls_guard.
+ */
+ struct list_head loh_lru;
+ /**
+ * Linkage into the list of layers. Never modified once set (except late
+ * in object destruction). No locking is necessary.
+ */
+ struct list_head loh_layers;
+ /**
+ * A list of references to this object, for debugging.
+ */
+ struct lu_ref loh_reference;
+};
+
+struct fld;
+
+struct lu_site_bkt_data {
+ /**
+ * number of busy objects in this bucket
+ */
+ long lsb_busy;
+ /**
+ * LRU list, updated on each access to object. Protected by
+ * bucket lock of lu_site::ls_obj_hash.
+ *
+ * "Cold" end of LRU is lu_site::ls_lru.next. Accessed object are
+ * moved to the lu_site::ls_lru.prev (this is due to the non-existence
+ * of list_for_each_entry_safe_reverse()).
+ */
+ struct list_head lsb_lru;
+ /**
+ * Wait-queue signaled when an object in this site is ultimately
+ * destroyed (lu_object_free()). It is used by lu_object_find() to wait
+ * before re-trying when an object in the process of destruction is
+ * found in the hash table.
+ *
+ * \see htable_lookup().
+ */
+ wait_queue_head_t lsb_marche_funebre;
+};
+
+enum {
+ LU_SS_CREATED = 0,
+ LU_SS_CACHE_HIT,
+ LU_SS_CACHE_MISS,
+ LU_SS_CACHE_RACE,
+ LU_SS_CACHE_DEATH_RACE,
+ LU_SS_LRU_PURGED,
+ LU_SS_LAST_STAT
+};
+
+/**
+ * lu_site is a "compartment" within which objects are unique, and LRU
+ * discipline is maintained.
+ *
+ * lu_site exists so that multiple layered stacks can co-exist in the same
+ * address space.
+ *
+ * lu_site has the same relation to lu_device as lu_object_header to
+ * lu_object.
+ */
+struct lu_site {
+ /**
+ * objects hash table
+ */
+ cfs_hash_t *ls_obj_hash;
+ /**
+ * index of bucket on hash table while purging
+ */
+ int ls_purge_start;
+ /**
+ * Top-level device for this stack.
+ */
+ struct lu_device *ls_top_dev;
+ /**
+ * Bottom-level device for this stack
+ */
+ struct lu_device *ls_bottom_dev;
+ /**
+ * Linkage into global list of sites.
+ */
+ struct list_head ls_linkage;
+ /**
+ * List of lu devices for this site, protected
+ * by ls_ld_lock.
+ */
+ struct list_head ls_ld_linkage;
+ spinlock_t ls_ld_lock;
+
+ /**
+ * lu_site stats
+ */
+ struct lprocfs_stats *ls_stats;
+ /**
+ * XXX: a hack! fld has to find md_site via site, remove when possible
+ */
+ struct seq_server_site *ld_seq_site;
+};
+
+static inline struct lu_site_bkt_data *
+lu_site_bkt_from_fid(struct lu_site *site, struct lu_fid *fid)
+{
+ cfs_hash_bd_t bd;
+
+ cfs_hash_bd_get(site->ls_obj_hash, fid, &bd);
+ return cfs_hash_bd_extra_get(site->ls_obj_hash, &bd);
+}
+
+/** \name ctors
+ * Constructors/destructors.
+ * @{
+ */
+
+int lu_site_init (struct lu_site *s, struct lu_device *d);
+void lu_site_fini (struct lu_site *s);
+int lu_site_init_finish (struct lu_site *s);
+void lu_stack_fini (const struct lu_env *env, struct lu_device *top);
+void lu_device_get (struct lu_device *d);
+void lu_device_put (struct lu_device *d);
+int lu_device_init (struct lu_device *d, struct lu_device_type *t);
+void lu_device_fini (struct lu_device *d);
+int lu_object_header_init(struct lu_object_header *h);
+void lu_object_header_fini(struct lu_object_header *h);
+int lu_object_init (struct lu_object *o,
+ struct lu_object_header *h, struct lu_device *d);
+void lu_object_fini (struct lu_object *o);
+void lu_object_add_top (struct lu_object_header *h, struct lu_object *o);
+void lu_object_add (struct lu_object *before, struct lu_object *o);
+
+void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d);
+void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d);
+
+/**
+ * Helpers to initialize and finalize device types.
+ */
+
+int lu_device_type_init(struct lu_device_type *ldt);
+void lu_device_type_fini(struct lu_device_type *ldt);
+void lu_types_stop(void);
+
+/** @} ctors */
+
+/** \name caching
+ * Caching and reference counting.
+ * @{
+ */
+
+/**
+ * Acquire an additional reference to the given object. To acquire the
+ * initial reference, use lu_object_find().
+ */
+static inline void lu_object_get(struct lu_object *o)
+{
+ LASSERT(atomic_read(&o->lo_header->loh_ref) > 0);
+ atomic_inc(&o->lo_header->loh_ref);
+}
+
+/**
+ * Return true if the object will not be cached after the last reference
+ * to it is released.
+ */
+static inline int lu_object_is_dying(const struct lu_object_header *h)
+{
+ return test_bit(LU_OBJECT_HEARD_BANSHEE, &h->loh_flags);
+}
+
+void lu_object_put(const struct lu_env *env, struct lu_object *o);
+void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o);
+void lu_object_unhash(const struct lu_env *env, struct lu_object *o);
+
+int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr);
+
+void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie,
+ lu_printer_t printer);
+struct lu_object *lu_object_find(const struct lu_env *env,
+ struct lu_device *dev, const struct lu_fid *f,
+ const struct lu_object_conf *conf);
+struct lu_object *lu_object_find_at(const struct lu_env *env,
+ struct lu_device *dev,
+ const struct lu_fid *f,
+ const struct lu_object_conf *conf);
+struct lu_object *lu_object_find_slice(const struct lu_env *env,
+ struct lu_device *dev,
+ const struct lu_fid *f,
+ const struct lu_object_conf *conf);
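+
+/*
+ * Typical lookup/use/release sequence (a sketch; "dev" and "fid" are
+ * assumed to be provided by the caller, and a NULL configuration is
+ * used for brevity):
+ *
+ * \code
+ * struct lu_object *o;
+ *
+ * o = lu_object_find(env, dev, fid, NULL);
+ * if (IS_ERR(o))
+ *         return PTR_ERR(o);
+ * // ... use the object ...
+ * lu_object_put(env, o);  // return the object to the cache
+ * \endcode
+ */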
+/** @} caching */
+
+/** \name helpers
+ * Helpers.
+ * @{
+ */
+
+/**
+ * First (topmost) sub-object of given compound object
+ */
+static inline struct lu_object *lu_object_top(struct lu_object_header *h)
+{
+ LASSERT(!list_empty(&h->loh_layers));
+ return container_of0(h->loh_layers.next, struct lu_object, lo_linkage);
+}
+
+/**
+ * Next sub-object in the layering
+ */
+static inline struct lu_object *lu_object_next(const struct lu_object *o)
+{
+ return container_of0(o->lo_linkage.next, struct lu_object, lo_linkage);
+}
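+
+/*
+ * Sketch: walking all layers of a compound object through its header
+ * \a h, equivalent to repeated lu_object_next() calls:
+ *
+ * \code
+ * struct lu_object *scan;
+ *
+ * list_for_each_entry(scan, &h->loh_layers, lo_linkage) {
+ *         // ... per-layer work on scan ...
+ * }
+ * \endcode
+ */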
+
+/**
+ * Pointer to the fid of this object.
+ */
+static inline const struct lu_fid *lu_object_fid(const struct lu_object *o)
+{
+ return &o->lo_header->loh_fid;
+}
+
+/**
+ * return device operations vector for this object
+ */
+static inline const struct lu_device_operations *
+lu_object_ops(const struct lu_object *o)
+{
+ return o->lo_dev->ld_ops;
+}
+
+/**
+ * Given a compound object, find its slice, corresponding to the device type
+ * \a dtype.
+ */
+struct lu_object *lu_object_locate(struct lu_object_header *h,
+ const struct lu_device_type *dtype);
+
+/**
+ * Printer function emitting messages through libcfs_debug_msg().
+ */
+int lu_cdebug_printer(const struct lu_env *env,
+ void *cookie, const char *format, ...);
+
+/**
+ * Print object description followed by a user-supplied message.
+ */
+#define LU_OBJECT_DEBUG(mask, env, object, format, ...) \
+do { \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \
+ \
+ if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \
+ lu_object_print(env, &msgdata, lu_cdebug_printer, object);\
+ CDEBUG(mask, format , ## __VA_ARGS__); \
+ } \
+} while (0)
+
+/**
+ * Print short object description followed by a user-supplied message.
+ */
+#define LU_OBJECT_HEADER(mask, env, object, format, ...) \
+do { \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \
+ \
+ if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \
+ lu_object_header_print(env, &msgdata, lu_cdebug_printer,\
+ (object)->lo_header); \
+ lu_cdebug_printer(env, &msgdata, "\n"); \
+ CDEBUG(mask, format , ## __VA_ARGS__); \
+ } \
+} while (0)
+
+void lu_object_print (const struct lu_env *env, void *cookie,
+ lu_printer_t printer, const struct lu_object *o);
+void lu_object_header_print(const struct lu_env *env, void *cookie,
+ lu_printer_t printer,
+ const struct lu_object_header *hdr);
+
+/**
+ * Check object consistency.
+ */
+int lu_object_invariant(const struct lu_object *o);
+
+
+/**
+ * Check whether the object exists, whether on local or remote storage.
+ * Note: LOHA_EXISTS is set as soon as someone has created the object;
+ * the object does not need to be committed to storage yet.
+ */
+#define lu_object_exists(o) ((o)->lo_header->loh_attr & LOHA_EXISTS)
+
+/**
+ * Check whether the object is on remote storage.
+ */
+#define lu_object_remote(o) unlikely((o)->lo_header->loh_attr & LOHA_REMOTE)
+
+static inline int lu_object_assert_exists(const struct lu_object *o)
+{
+ return lu_object_exists(o);
+}
+
+static inline int lu_object_assert_not_exists(const struct lu_object *o)
+{
+ return !lu_object_exists(o);
+}
+
+/**
+ * Attr of this object.
+ */
+static inline __u32 lu_object_attr(const struct lu_object *o)
+{
+ LASSERT(lu_object_exists(o) != 0);
+ return o->lo_header->loh_attr;
+}
+
+static inline struct lu_ref_link *lu_object_ref_add(struct lu_object *o,
+ const char *scope,
+ const void *source)
+{
+ return lu_ref_add(&o->lo_header->loh_reference, scope, source);
+}
+
+static inline void lu_object_ref_del(struct lu_object *o,
+ const char *scope, const void *source)
+{
+ lu_ref_del(&o->lo_header->loh_reference, scope, source);
+}
+
+static inline void lu_object_ref_del_at(struct lu_object *o,
+ struct lu_ref_link *link,
+ const char *scope, const void *source)
+{
+ lu_ref_del_at(&o->lo_header->loh_reference, link, scope, source);
+}
+
+/** input parameters, to be filled out by the mdt */
+struct lu_rdpg {
+ /** hash */
+ __u64 rp_hash;
+ /** count in bytes */
+ unsigned int rp_count;
+ /** number of pages */
+ unsigned int rp_npages;
+ /** requested attr */
+ __u32 rp_attrs;
+ /** pointers to pages */
+ struct page **rp_pages;
+};
+
+enum lu_xattr_flags {
+ LU_XATTR_REPLACE = (1 << 0),
+ LU_XATTR_CREATE = (1 << 1)
+};
+
+/** @} helpers */
+
+/** \name lu_context
+ * @{ */
+
+/** For lu_context health-checks */
+enum lu_context_state {
+ LCS_INITIALIZED = 1,
+ LCS_ENTERED,
+ LCS_LEFT,
+ LCS_FINALIZED
+};
+
+/**
+ * lu_context. Execution context for lu_object methods. Currently
+ * associated with a thread.
+ *
+ * All lu_object methods, except device and device type methods (called
+ * during system initialization and shutdown), are executed "within" some
+ * lu_context. This means that a pointer to some "current" lu_context is
+ * passed as an argument to all methods.
+ *
+ * All service ptlrpc threads create lu_context as part of their
+ * initialization. It is possible to create "stand-alone" context for other
+ * execution environments (like system calls).
+ *
+ * lu_object methods mainly use lu_context through lu_context_key interface
+ * that allows each layer to associate arbitrary pieces of data with each
+ * context (see pthread_key_create(3) for similar interface).
+ *
+ * On a client, lu_context is bound to a thread, see cl_env_get().
+ *
+ * \see lu_context_key
+ */
+struct lu_context {
+ /**
+ * lu_context is used on the client side too. Yet we don't want to
+ * allocate values of server-side keys for the client contexts and
+ * vice versa.
+ *
+ * To achieve this, a set of tags is introduced. Contexts and keys are
+ * marked with tags. Key values are created only for contexts whose set
+ * of tags has a non-empty intersection with that of the key. Tags are
+ * taken from enum lu_context_tag.
+ */
+ __u32 lc_tags;
+ enum lu_context_state lc_state;
+ /**
+ * Pointer to the home service thread. NULL for other execution
+ * contexts.
+ */
+ struct ptlrpc_thread *lc_thread;
+ /**
+ * Pointer to an array with key values. Internal implementation
+ * detail.
+ */
+ void **lc_value;
+ /**
+ * Linkage into a list of all remembered contexts. Only
+ * `non-transient' contexts, i.e., ones created for service threads
+ * are placed here.
+ */
+ struct list_head lc_remember;
+ /**
+ * Version counter used to skip calls to lu_context_refill() when no
+ * keys were registered.
+ */
+ unsigned lc_version;
+ /**
+ * Debugging cookie.
+ */
+ unsigned lc_cookie;
+};
+
+/**
+ * lu_context_key interface. Similar to pthread_key.
+ */
+
+enum lu_context_tag {
+ /**
+ * Thread on md server
+ */
+ LCT_MD_THREAD = 1 << 0,
+ /**
+ * Thread on dt server
+ */
+ LCT_DT_THREAD = 1 << 1,
+ /**
+ * Context for transaction handle
+ */
+ LCT_TX_HANDLE = 1 << 2,
+ /**
+ * Thread on client
+ */
+ LCT_CL_THREAD = 1 << 3,
+ /**
+ * A per-request session on a server, and a per-system-call session on
+ * a client.
+ */
+ LCT_SESSION = 1 << 4,
+ /**
+ * A per-request data on OSP device
+ */
+ LCT_OSP_THREAD = 1 << 5,
+ /**
+ * MGS device thread
+ */
+ LCT_MG_THREAD = 1 << 6,
+ /**
+ * Context for local operations
+ */
+ LCT_LOCAL = 1 << 7,
+ /**
+ * Set when at least one of the keys having values in this context has
+ * a non-NULL lu_context_key::lct_exit() method. This is used to
+ * optimize the lu_context_exit() call.
+ */
+ LCT_HAS_EXIT = 1 << 28,
+ /**
+ * Don't add references for modules creating key values in that context.
+ * This is only for contexts used internally by lu_object framework.
+ */
+ LCT_NOREF = 1 << 29,
+ /**
+ * Key is being prepared for retiring, don't create new values for it.
+ */
+ LCT_QUIESCENT = 1 << 30,
+ /**
+ * Context should be remembered.
+ */
+ LCT_REMEMBER = 1 << 31,
+ /**
+ * Contexts usable in cache shrinker thread.
+ */
+ LCT_SHRINKER = LCT_MD_THREAD|LCT_DT_THREAD|LCT_CL_THREAD|LCT_NOREF
+};
+
+/**
+ * Key. Represents per-context value slot.
+ *
+ * Keys are usually registered when the module owning the key is
+ * initialized, and de-registered when the module is unloaded. Once a key
+ * is registered, all new contexts with matching tags will get the key
+ * value. "Old" contexts, already initialized at the time of key
+ * registration, can be forced to get the key value by calling
+ * lu_context_refill().
+ *
+ * Every key value is counted in lu_context_key::lct_used and acquires a
+ * reference on the owning module. This means that all key values have to be
+ * destroyed before the module can be unloaded. This is usually achieved by
+ * stopping the threads started by the module, which created contexts in
+ * their entry functions. The situation is complicated by threads shared by
+ * multiple modules, like the ptlrpcd daemon on a client. To work around
+ * this problem, contexts created in such threads are `remembered' (see
+ * LCT_REMEMBER), i.e., added into a global list. When a module is preparing
+ * for unloading, it does the following:
+ *
+ * - marks its keys as `quiescent' (lu_context_tag::LCT_QUIESCENT)
+ * preventing new key values from being allocated in the new contexts,
+ * and
+ *
+ * - scans a list of remembered contexts, destroying values of module
+ * keys, thus releasing references to the module.
+ *
+ * This is done by lu_context_key_quiesce(). If the module is re-activated
+ * before the key has been de-registered, a lu_context_key_revive() call
+ * clears the `quiescent' marker.
+ *
+ * lu_context code doesn't provide any internal synchronization for these
+ * activities---it's assumed that startup (including threads start-up) and
+ * shutdown are serialized by some external means.
+ *
+ * \see lu_context
+ */
+struct lu_context_key {
+ /**
+ * Set of tags for which values of this key are to be instantiated.
+ */
+ __u32 lct_tags;
+ /**
+ * Value constructor. This is called when a new value is created for a
+ * context. Returns a pointer to the new value or an error pointer.
+ */
+ void *(*lct_init)(const struct lu_context *ctx,
+ struct lu_context_key *key);
+ /**
+ * Value destructor. Called when context with previously allocated
+ * value of this slot is destroyed. \a data is a value that was returned
+ * by a matching call to lu_context_key::lct_init().
+ */
+ void (*lct_fini)(const struct lu_context *ctx,
+ struct lu_context_key *key, void *data);
+ /**
+ * Optional method called on lu_context_exit() for all allocated
+ * keys. Can be used by debugging code checking that locks are
+ * released, etc.
+ */
+ void (*lct_exit)(const struct lu_context *ctx,
+ struct lu_context_key *key, void *data);
+ /**
+ * Internal implementation detail: index within lu_context::lc_value[]
+ * reserved for this key.
+ */
+ int lct_index;
+ /**
+ * Internal implementation detail: number of values created for this
+ * key.
+ */
+ atomic_t lct_used;
+ /**
+ * Internal implementation detail: module for this key.
+ */
+ module_t *lct_owner;
+ /**
+ * References to this key. For debugging.
+ */
+ struct lu_ref lct_reference;
+};
+
+#define LU_KEY_INIT(mod, type) \
+ static void* mod##_key_init(const struct lu_context *ctx, \
+ struct lu_context_key *key) \
+ { \
+ type *value; \
+ \
+ CLASSERT(PAGE_CACHE_SIZE >= sizeof (*value)); \
+ \
+ OBD_ALLOC_PTR(value); \
+ if (value == NULL) \
+ value = ERR_PTR(-ENOMEM); \
+ \
+ return value; \
+ } \
+ struct __##mod##__dummy_init {;} /* semicolon catcher */
+
+#define LU_KEY_FINI(mod, type) \
+ static void mod##_key_fini(const struct lu_context *ctx, \
+ struct lu_context_key *key, void* data) \
+ { \
+ type *info = data; \
+ \
+ OBD_FREE_PTR(info); \
+ } \
+ struct __##mod##__dummy_fini {;} /* semicolon catcher */
+
+#define LU_KEY_INIT_FINI(mod, type) \
+ LU_KEY_INIT(mod,type); \
+ LU_KEY_FINI(mod,type)
+
+#define LU_CONTEXT_KEY_DEFINE(mod, tags) \
+ struct lu_context_key mod##_thread_key = { \
+ .lct_tags = tags, \
+ .lct_init = mod##_key_init, \
+ .lct_fini = mod##_key_fini \
+ }
+
+#define LU_CONTEXT_KEY_INIT(key) \
+do { \
+ (key)->lct_owner = THIS_MODULE; \
+} while (0)
+
+int lu_context_key_register(struct lu_context_key *key);
+void lu_context_key_degister(struct lu_context_key *key);
+void *lu_context_key_get (const struct lu_context *ctx,
+ const struct lu_context_key *key);
+void lu_context_key_quiesce (struct lu_context_key *key);
+void lu_context_key_revive (struct lu_context_key *key);
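+
+/*
+ * Hedged example of defining and using a per-context value for an
+ * imaginary module "foo" with the macros above. struct foo_thread_info
+ * and its field are illustrative only:
+ *
+ * \code
+ * struct foo_thread_info {
+ *         struct lu_fid fti_fid;
+ * };
+ *
+ * LU_KEY_INIT_FINI(foo, struct foo_thread_info);
+ * LU_CONTEXT_KEY_DEFINE(foo, LCT_MD_THREAD);
+ *
+ * // At module load:
+ * //         LU_CONTEXT_KEY_INIT(&foo_thread_key);
+ * //         lu_context_key_register(&foo_thread_key);
+ *
+ * // In a context entered with a matching tag:
+ * struct foo_thread_info *info;
+ *
+ * info = lu_context_key_get(&env->le_ctx, &foo_thread_key);
+ * \endcode
+ */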
+
+
+/*
+ * LU_KEY_INIT_GENERIC() has to be a macro to correctly determine an
+ * owning module.
+ */
+
+#define LU_KEY_INIT_GENERIC(mod) \
+ static void mod##_key_init_generic(struct lu_context_key *k, ...) \
+ { \
+ struct lu_context_key *key = k; \
+ va_list args; \
+ \
+ va_start(args, k); \
+ do { \
+ LU_CONTEXT_KEY_INIT(key); \
+ key = va_arg(args, struct lu_context_key *); \
+ } while (key != NULL); \
+ va_end(args); \
+ }
+
+#define LU_TYPE_INIT(mod, ...) \
+ LU_KEY_INIT_GENERIC(mod) \
+ static int mod##_type_init(struct lu_device_type *t) \
+ { \
+ mod##_key_init_generic(__VA_ARGS__, NULL); \
+ return lu_context_key_register_many(__VA_ARGS__, NULL); \
+ } \
+ struct __##mod##_dummy_type_init {;}
+
+#define LU_TYPE_FINI(mod, ...) \
+ static void mod##_type_fini(struct lu_device_type *t) \
+ { \
+ lu_context_key_degister_many(__VA_ARGS__, NULL); \
+ } \
+ struct __##mod##_dummy_type_fini {;}
+
+#define LU_TYPE_START(mod, ...) \
+ static void mod##_type_start(struct lu_device_type *t) \
+ { \
+ lu_context_key_revive_many(__VA_ARGS__, NULL); \
+ } \
+ struct __##mod##_dummy_type_start {;}
+
+#define LU_TYPE_STOP(mod, ...) \
+ static void mod##_type_stop(struct lu_device_type *t) \
+ { \
+ lu_context_key_quiesce_many(__VA_ARGS__, NULL); \
+ } \
+ struct __##mod##_dummy_type_stop {;}
+
+
+
+#define LU_TYPE_INIT_FINI(mod, ...) \
+ LU_TYPE_INIT(mod, __VA_ARGS__); \
+ LU_TYPE_FINI(mod, __VA_ARGS__); \
+ LU_TYPE_START(mod, __VA_ARGS__); \
+ LU_TYPE_STOP(mod, __VA_ARGS__)
+
+int lu_context_init (struct lu_context *ctx, __u32 tags);
+void lu_context_fini (struct lu_context *ctx);
+void lu_context_enter (struct lu_context *ctx);
+void lu_context_exit (struct lu_context *ctx);
+int lu_context_refill(struct lu_context *ctx);
+
+/*
+ * Helper functions to operate on multiple keys. These are used by the default
+ * device type operations, defined by LU_TYPE_INIT_FINI().
+ */
+
+int lu_context_key_register_many(struct lu_context_key *k, ...);
+void lu_context_key_degister_many(struct lu_context_key *k, ...);
+void lu_context_key_revive_many (struct lu_context_key *k, ...);
+void lu_context_key_quiesce_many (struct lu_context_key *k, ...);
+
+/*
+ * update/clear ctx/ses tags.
+ */
+void lu_context_tags_update(__u32 tags);
+void lu_context_tags_clear(__u32 tags);
+void lu_session_tags_update(__u32 tags);
+void lu_session_tags_clear(__u32 tags);
+
+/**
+ * Environment.
+ */
+struct lu_env {
+ /**
+ * "Local" context, used to store data instead of stack.
+ */
+ struct lu_context le_ctx;
+ /**
+ * "Session" context for per-request data.
+ */
+ struct lu_context *le_ses;
+};
+
+int lu_env_init (struct lu_env *env, __u32 tags);
+void lu_env_fini (struct lu_env *env);
+int lu_env_refill(struct lu_env *env);
+int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, __u32 stags);
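+
+/*
+ * Sketch of stand-alone environment usage, e.g. from a system-call
+ * context (error handling abbreviated):
+ *
+ * \code
+ * struct lu_env env;
+ * int rc;
+ *
+ * rc = lu_env_init(&env, LCT_CL_THREAD);
+ * if (rc != 0)
+ *         return rc;
+ * // ... call lu_object methods with &env ...
+ * lu_env_fini(&env);
+ * \endcode
+ */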
+
+/** @} lu_context */
+
+/**
+ * Output site statistical counters into a buffer. Suitable for
+ * ll_rd_*()-style functions.
+ */
+int lu_site_stats_print(const struct lu_site *s, struct seq_file *m);
+
+/**
+ * Common name structure to be passed around for various name related methods.
+ */
+struct lu_name {
+ const char *ln_name;
+ int ln_namelen;
+};
+
+/**
+ * Common buffer structure to be passed around for various xattr_{s,g}et()
+ * methods.
+ */
+struct lu_buf {
+ void *lb_buf;
+ ssize_t lb_len;
+};
+
+#define DLUBUF "(%p %zu)"
+#define PLUBUF(buf) (buf)->lb_buf, (buf)->lb_len
+/**
+ * One-time initializers, called at obdclass module initialization, not
+ * exported.
+ */
+
+/**
+ * Initialization of global lu_* data.
+ */
+int lu_global_init(void);
+
+/**
+ * Dual to lu_global_init().
+ */
+void lu_global_fini(void);
+
+struct lu_kmem_descr {
+ struct kmem_cache **ckd_cache;
+ const char *ckd_name;
+ const size_t ckd_size;
+};
+
+int lu_kmem_init(struct lu_kmem_descr *caches);
+void lu_kmem_fini(struct lu_kmem_descr *caches);
+
+void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o,
+ const struct lu_fid *fid);
+struct lu_object *lu_object_anon(const struct lu_env *env,
+ struct lu_device *dev,
+ const struct lu_object_conf *conf);
+
+/** null buffer */
+extern struct lu_buf LU_BUF_NULL;
+
+void lu_buf_free(struct lu_buf *buf);
+void lu_buf_alloc(struct lu_buf *buf, int size);
+void lu_buf_realloc(struct lu_buf *buf, int size);
+
+int lu_buf_check_and_grow(struct lu_buf *buf, int len);
+struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, int len);
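+
+/*
+ * Sketch of lu_buf usage with the helpers above (the 4096-byte size is
+ * an arbitrary example):
+ *
+ * \code
+ * struct lu_buf buf = { NULL, 0 };
+ *
+ * lu_buf_alloc(&buf, 4096);
+ * if (buf.lb_buf == NULL)
+ *         return -ENOMEM;
+ * // ... fill up to buf.lb_len bytes at buf.lb_buf ...
+ * lu_buf_free(&buf);
+ * \endcode
+ */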
+
+/** @} lu */
+#endif /* __LUSTRE_LU_OBJECT_H */
diff --git a/drivers/staging/lustre/lustre/include/lu_ref.h b/drivers/staging/lustre/lustre/include/lu_ref.h
new file mode 100644
index 000000000000..624c19be1524
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lu_ref.h
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef __LUSTRE_LU_REF_H
+#define __LUSTRE_LU_REF_H
+
+#include <linux/list.h>
+
+/** \defgroup lu_ref lu_ref
+ *
+ * An interface to track references between objects. Mostly for debugging.
+ *
+ * Suppose there is a reference counted data-structure struct foo. To track
+ * who acquired references to instance of struct foo, add lu_ref field to it:
+ *
+ * \code
+ * struct foo {
+ * atomic_t foo_refcount;
+ * struct lu_ref foo_reference;
+ * ...
+ * };
+ * \endcode
+ *
+ * foo::foo_reference has to be initialized by calling
+ * lu_ref_init(). Typically there will be functions or macros to increment and
+ * decrement foo::foo_refcount, let's say they are foo_get(struct foo *foo)
+ * and foo_put(struct foo *foo), respectively.
+ *
+ * Whenever foo_get() is called to acquire a reference on a foo, lu_ref_add()
+ * has to be called to insert into foo::foo_reference a record, describing
+ * acquired reference. Dually, lu_ref_del() removes matching record. Typical
+ * usages are:
+ *
+ * \code
+ * struct bar *bar;
+ *
+ * // bar owns a reference to foo.
+ * bar->bar_foo = foo_get(foo);
+ * lu_ref_add(&foo->foo_reference, "bar", bar);
+ *
+ * ...
+ *
+ * // reference from bar to foo is released.
+ * lu_ref_del(&foo->foo_reference, "bar", bar);
+ * foo_put(bar->bar_foo);
+ *
+ *
+ * // current thread acquired a temporary reference to foo.
+ * foo_get(foo);
+ * lu_ref_add(&foo->reference, __FUNCTION__, current);
+ *
+ * ...
+ *
+ * // temporary reference is released.
+ * lu_ref_del(&foo->reference, __FUNCTION__, current);
+ * foo_put(foo);
+ * \endcode
+ *
+ * \e Et \e cetera. Often it makes sense to include lu_ref_add() and
+ * lu_ref_del() calls into foo_get() and foo_put(). When an instance of struct
+ * foo is destroyed, lu_ref_fini() has to be called that checks that no
+ * pending references remain. lu_ref_print() can be used to dump a list of
+ * pending references, while hunting down a leak.
+ *
+ * For objects to which a large number of references can be acquired,
+ * lu_ref_del() can become cpu consuming, as it has to scan the list of
+ * references. To work around this, remember result of lu_ref_add() (usually
+ * in the same place where pointer to struct foo is stored), and use
+ * lu_ref_del_at():
+ *
+ * \code
+ * // There is a large number of bar's for a single foo.
+ * bar->bar_foo = foo_get(foo);
+ * bar->bar_foo_ref = lu_ref_add(&foo->foo_reference, "bar", bar);
+ *
+ * ...
+ *
+ * // reference from bar to foo is released.
+ * lu_ref_del_at(&foo->foo_reference, bar->bar_foo_ref, "bar", bar);
+ * foo_put(bar->bar_foo);
+ * \endcode
+ *
+ * lu_ref interface degrades gracefully in case of memory shortages.
+ *
+ * @{
+ */
+
+
+struct lu_ref {};
+
+static inline void lu_ref_init(struct lu_ref *ref)
+{
+}
+
+static inline void lu_ref_fini(struct lu_ref *ref)
+{
+}
+
+static inline struct lu_ref_link *lu_ref_add(struct lu_ref *ref,
+ const char *scope,
+ const void *source)
+{
+ return NULL;
+}
+
+static inline struct lu_ref_link *lu_ref_add_atomic(struct lu_ref *ref,
+ const char *scope,
+ const void *source)
+{
+ return NULL;
+}
+
+static inline void lu_ref_del(struct lu_ref *ref, const char *scope,
+ const void *source)
+{
+}
+
+static inline void lu_ref_set_at(struct lu_ref *ref, struct lu_ref_link *link,
+ const char *scope, const void *source0,
+ const void *source1)
+{
+}
+
+static inline void lu_ref_del_at(struct lu_ref *ref, struct lu_ref_link *link,
+ const char *scope, const void *source)
+{
+}
+
+static inline int lu_ref_global_init(void)
+{
+ return 0;
+}
+
+static inline void lu_ref_global_fini(void)
+{
+}
+
+static inline void lu_ref_print(const struct lu_ref *ref)
+{
+}
+
+static inline void lu_ref_print_all(void)
+{
+}
+
+/** @} lu */
+
+#endif /* __LUSTRE_LU_REF_H */
diff --git a/drivers/staging/lustre/lustre/include/lu_target.h b/drivers/staging/lustre/lustre/include/lu_target.h
new file mode 100644
index 000000000000..8d48cf4e27ee
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lu_target.h
@@ -0,0 +1,91 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_LU_TARGET_H
+#define _LUSTRE_LU_TARGET_H
+
+#include <dt_object.h>
+#include <lustre_disk.h>
+
+struct lu_target {
+ struct obd_device *lut_obd;
+ struct dt_device *lut_bottom;
+ /** last_rcvd file */
+ struct dt_object *lut_last_rcvd;
+ /* transaction callbacks */
+ struct dt_txn_callback lut_txn_cb;
+ /** server data in last_rcvd file */
+ struct lr_server_data lut_lsd;
+ /** Server last transaction number */
+ __u64 lut_last_transno;
+ /** Lock protecting last transaction number */
+ spinlock_t lut_translock;
+ /** Lock protecting client bitmap */
+ spinlock_t lut_client_bitmap_lock;
+ /** Bitmap of known clients */
+ unsigned long *lut_client_bitmap;
+};
+
+typedef void (*tgt_cb_t)(struct lu_target *lut, __u64 transno,
+ void *data, int err);
+struct tgt_commit_cb {
+ tgt_cb_t tgt_cb_func;
+ void *tgt_cb_data;
+};
+
+void tgt_boot_epoch_update(struct lu_target *lut);
+int tgt_last_commit_cb_add(struct thandle *th, struct lu_target *lut,
+ struct obd_export *exp, __u64 transno);
+int tgt_new_client_cb_add(struct thandle *th, struct obd_export *exp);
+int tgt_init(const struct lu_env *env, struct lu_target *lut,
+ struct obd_device *obd, struct dt_device *dt);
+void tgt_fini(const struct lu_env *env, struct lu_target *lut);
+int tgt_client_alloc(struct obd_export *exp);
+void tgt_client_free(struct obd_export *exp);
+int tgt_client_del(const struct lu_env *env, struct obd_export *exp);
+int tgt_client_add(const struct lu_env *env, struct obd_export *exp, int);
+int tgt_client_new(const struct lu_env *env, struct obd_export *exp);
+int tgt_client_data_read(const struct lu_env *env, struct lu_target *tg,
+ struct lsd_client_data *lcd, loff_t *off, int index);
+int tgt_client_data_write(const struct lu_env *env, struct lu_target *tg,
+ struct lsd_client_data *lcd, loff_t *off, struct thandle *th);
+int tgt_server_data_read(const struct lu_env *env, struct lu_target *tg);
+int tgt_server_data_write(const struct lu_env *env, struct lu_target *tg,
+ struct thandle *th);
+int tgt_server_data_update(const struct lu_env *env, struct lu_target *tg, int sync);
+int tgt_truncate_last_rcvd(const struct lu_env *env, struct lu_target *tg, loff_t off);
+
+#endif /* __LUSTRE_LU_TARGET_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre/libiam.h b/drivers/staging/lustre/lustre/include/lustre/libiam.h
new file mode 100644
index 000000000000..e8e0b084a6bc
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/libiam.h
@@ -0,0 +1,145 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/libiam.h
+ *
+ * iam user level library
+ *
+ * Author: Wang Di <wangdi@clusterfs.com>
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+/*
+ * lustre/libiam.h
+ */
+
+#ifndef __IAM_ULIB_H__
+#define __IAM_ULIB_H__
+
+/** \defgroup libiam libiam
+ *
+ * @{
+ */
+
+
+#define DX_FMT_NAME_LEN 16
+
+enum iam_fmt_t {
+ FMT_LFIX,
+ FMT_LVAR
+};
+
+struct iam_uapi_info {
+ __u16 iui_keysize;
+ __u16 iui_recsize;
+ __u16 iui_ptrsize;
+ __u16 iui_height;
+ char iui_fmt_name[DX_FMT_NAME_LEN];
+};
+
+/*
+ * Create an iam file, but do NOT open it.
+ * Returns 0 on success, else -1.
+ */
+int iam_creat(char *filename, enum iam_fmt_t fmt,
+ int blocksize, int keysize, int recsize, int ptrsize);
+
+/*
+ * Open an iam file, but do NOT create it if the file does not exist.
+ * Use iam_creat() to create the file before calling iam_open().
+ * Returns a file descriptor (fd) on success, else -1.
+ */
+int iam_open(char *filename, struct iam_uapi_info *ua);
+
+/*
+ * Close a file opened by iam_open().
+ */
+int iam_close(int fd);
+
+/*
+ * Call iam_open() before using this function.
+ */
+int iam_insert(int fd, struct iam_uapi_info *ua,
+ int key_need_convert, char *keybuf,
+ int rec_need_convert, char *recbuf);
+
+/*
+ * Call iam_open() before using this function.
+ */
+int iam_lookup(int fd, struct iam_uapi_info *ua,
+ int key_need_convert, char *key_buf,
+ int *keysize, char *save_key,
+ int rec_need_convert, char *rec_buf,
+ int *recsize, char *save_rec);
+
+/*
+ * Call iam_open() before using this function.
+ */
+int iam_delete(int fd, struct iam_uapi_info *ua,
+ int key_need_convert, char *keybuf,
+ int rec_need_convert, char *recbuf);
+
+/*
+ * Call iam_open() before using this function.
+ */
+int iam_it_start(int fd, struct iam_uapi_info *ua,
+ int key_need_convert, char *key_buf,
+ int *keysize, char *save_key,
+ int rec_need_convert, char *rec_buf,
+ int *recsize, char *save_rec);
+
+/*
+ * Call iam_open() before using this function.
+ */
+int iam_it_next(int fd, struct iam_uapi_info *ua,
+ int key_need_convert, char *key_buf,
+ int *keysize, char *save_key,
+ int rec_need_convert, char *rec_buf,
+ int *recsize, char *save_rec);
+
+/*
+ * Call iam_open() before using this function.
+ */
+int iam_it_stop(int fd, struct iam_uapi_info *ua,
+ int key_need_convert, char *keybuf,
+ int rec_need_convert, char *recbuf);
+
+/*
+ * Change iam file mode.
+ */
+int iam_polymorph(char *filename, unsigned long mode);
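+
+/*
+ * Hedged usage sketch of the iam user-level API. The path and the
+ * blocksize/keysize/recsize/ptrsize values are illustrative, and error
+ * handling is abbreviated:
+ *
+ * \code
+ * struct iam_uapi_info ua;
+ * char keybuf[8], recbuf[8];
+ * int fd;
+ *
+ * if (iam_creat("/mnt/lustre/idx", FMT_LFIX, 4096, 8, 8, 4) != 0)
+ *         return -1;
+ * fd = iam_open("/mnt/lustre/idx", &ua);
+ * if (fd < 0)
+ *         return -1;
+ * iam_insert(fd, &ua, 0, keybuf, 0, recbuf);
+ * iam_close(fd);
+ * \endcode
+ */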
+
+/** @} libiam */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre/liblustreapi.h b/drivers/staging/lustre/lustre/include/lustre/liblustreapi.h
new file mode 100644
index 000000000000..707eb74fdf68
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/liblustreapi.h
@@ -0,0 +1,43 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/*
+ * NOTE: This file is DEPRECATED! Please include lustreapi.h directly
+ * instead of this file. This file will be removed from a future version
+ * of lustre!
+ */
+
+#ifndef _LIBLUSTREAPI_H_
+#define _LIBLUSTREAPI_H_
+
+#include <lustre/lustreapi.h>
+#warning "Including liblustreapi.h is deprecated. Include lustreapi.h directly."
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h b/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h
new file mode 100644
index 000000000000..ad253c6deadd
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h
@@ -0,0 +1,121 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/ll_fiemap.h
+ *
+ * FIEMAP data structures and flags. This header file will be used until
+ * fiemap.h is available in the upstream kernel.
+ *
+ * Author: Kalpak Shah <kalpak.shah@sun.com>
+ * Author: Andreas Dilger <adilger@sun.com>
+ */
+
+#ifndef _LUSTRE_FIEMAP_H
+#define _LUSTRE_FIEMAP_H
+
+
+
+struct ll_fiemap_extent {
+ __u64 fe_logical; /* logical offset in bytes for the start of
+ * the extent from the beginning of the file */
+ __u64 fe_physical; /* physical offset in bytes for the start
+ * of the extent from the beginning of the disk */
+ __u64 fe_length; /* length in bytes for this extent */
+ __u64 fe_reserved64[2];
+ __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */
+ __u32 fe_device; /* device number for this extent */
+ __u32 fe_reserved[2];
+};
+
+struct ll_user_fiemap {
+ __u64 fm_start; /* logical offset (inclusive) at
+ * which to start mapping (in) */
+ __u64 fm_length; /* logical length of mapping which
+ * userspace wants (in) */
+ __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */
+ __u32 fm_mapped_extents;/* number of extents that were mapped (out) */
+ __u32 fm_extent_count; /* size of fm_extents array (in) */
+ __u32 fm_reserved;
+ struct ll_fiemap_extent fm_extents[0]; /* array of mapped extents (out) */
+};
+
+#define FIEMAP_MAX_OFFSET (~0ULL)
+
+#define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */
+#define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */
+
+#define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. */
+#define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */
+#define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending.
+ * Sets EXTENT_UNKNOWN. */
+#define FIEMAP_EXTENT_ENCODED 0x00000008 /* Data can not be read
+ * while fs is unmounted */
+#define FIEMAP_EXTENT_DATA_ENCRYPTED 0x00000080 /* Data is encrypted by fs.
+ * Sets EXTENT_NO_DIRECT. */
+#define FIEMAP_EXTENT_NOT_ALIGNED 0x00000100 /* Extent offsets may not be
+ * block aligned. */
+#define FIEMAP_EXTENT_DATA_INLINE 0x00000200 /* Data mixed with metadata.
+ * Sets EXTENT_NOT_ALIGNED.*/
+#define FIEMAP_EXTENT_DATA_TAIL 0x00000400 /* Multiple files in block.
+ * Sets EXTENT_NOT_ALIGNED.*/
+#define FIEMAP_EXTENT_UNWRITTEN 0x00000800 /* Space allocated, but
+ * no data (i.e. zero). */
+#define FIEMAP_EXTENT_MERGED 0x00001000 /* File does not natively
+ * support extents. Result
+ * merged for efficiency. */
+
+
+static inline size_t fiemap_count_to_size(size_t extent_count)
+{
+ return (sizeof(struct ll_user_fiemap) + extent_count *
+ sizeof(struct ll_fiemap_extent));
+}
+
+static inline unsigned fiemap_size_to_count(size_t array_size)
+{
+ return ((array_size - sizeof(struct ll_user_fiemap)) /
+ sizeof(struct ll_fiemap_extent));
+}
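+
+/*
+ * Sketch: sizing a request buffer for a wanted number of extents with
+ * the helper above (allocation and the ioctl plumbing are the caller's
+ * responsibility; the count of 32 is an arbitrary example):
+ *
+ * \code
+ * size_t count = 32;
+ * struct ll_user_fiemap *fm = malloc(fiemap_count_to_size(count));
+ *
+ * fm->fm_start = 0;
+ * fm->fm_length = FIEMAP_MAX_OFFSET;
+ * fm->fm_extent_count = count;
+ * \endcode
+ */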
+
+#define FIEMAP_FLAG_DEVICE_ORDER 0x40000000 /* return device ordered mapping */
+
+#ifdef FIEMAP_FLAGS_COMPAT
+#undef FIEMAP_FLAGS_COMPAT
+#endif
+
+/* Lustre specific flags - use a high bit, don't conflict with upstream flag */
+#define FIEMAP_EXTENT_NO_DIRECT 0x40000000 /* Data mapping undefined */
+#define FIEMAP_EXTENT_NET 0x80000000 /* Data stored remotely.
+ * Sets NO_DIRECT flag */
+
+#endif /* _LUSTRE_FIEMAP_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h b/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h
new file mode 100644
index 000000000000..93a3d7db3010
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h
@@ -0,0 +1,2 @@
+#define BUILD_VERSION "v2_3_64_0-g6e62c21-CHANGED-3.9.0"
+#define LUSTRE_RELEASE 3.9.0_g6e62c21
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h b/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h
new file mode 100644
index 000000000000..8825460f12ac
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h
@@ -0,0 +1,3653 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/lustre_idl.h
+ *
+ * Lustre wire protocol definitions.
+ */
+
+/** \defgroup lustreidl lustreidl
+ *
+ * Lustre wire protocol definitions.
+ *
+ * ALL structs passing over the wire should be declared here. Structs
+ * that are used in interfaces with userspace should go in lustre_user.h.
+ *
+ * All structs being declared here should be built from simple fixed-size
+ * types (__u8, __u16, __u32, __u64) or be built from other types or
+ * structs also declared in this file. Similarly, all flags and magic
+ * values in those structs should also be declared here. This ensures
+ * that the Lustre wire protocol is not influenced by external dependencies.
+ *
+ * The only other acceptable items in this file are VERY SIMPLE accessor
+ * functions to avoid callers grubbing inside the structures, and the
+ * prototypes of the swabber functions for each struct. Nothing that
+ * depends on external functions or definitions should be in here.
+ *
+ * Structs must be properly aligned to put 64-bit values on an 8-byte
+ * boundary. Any structs being added here must also be added to
+ * utils/wirecheck.c and "make newwiretest" run to regenerate the
+ * utils/wiretest.c sources. This allows us to verify that wire structs
+ * have the proper alignment/size on all architectures.
+ *
+ * DO NOT CHANGE any of the structs, flags, values declared here and used
+ * in released Lustre versions. Some structs may have padding fields that
+ * can be used. Some structs might allow addition at the end (verify this
+ * in the code to ensure that new/old clients that see this larger struct
+ * do not fail, otherwise you need to implement protocol compatibility).
+ *
+ * We assume all nodes are either little-endian or big-endian, and we
+ * always send messages in the sender's native format. The receiver
+ * detects the message format by checking the 'magic' field of the message
+ * (see lustre_msg_swabbed() below).
+ *
+ * Each wire type has corresponding 'lustre_swab_xxxtypexxx()' routines,
+ * implemented either here, inline (trivial implementations) or in
+ * ptlrpc/pack_generic.c. These 'swabbers' convert the type from "other"
+ * endian, in-place in the message buffer.
+ *
+ * A swabber takes a single pointer argument. The caller must already have
+ * verified that the length of the message buffer >= sizeof (type).
+ *
+ * For variable length types, a second 'lustre_swab_v_xxxtypexxx()' routine
+ * may be defined that swabs just the variable part, after the caller has
+ * verified that the message buffer is large enough.
+ *
+ * @{
+ */
+
+#ifndef _LUSTRE_IDL_H_
+#define _LUSTRE_IDL_H_
+
+#if !defined(LASSERT) && !defined(LPU64)
+#include <linux/libcfs/libcfs.h> /* for LASSERT, LPUX64, etc */
+#endif
+
+/* Defn's shared with user-space. */
+#include <lustre/lustre_user.h>
+
+/*
+ * GENERAL STUFF
+ */
+/* FOO_REQUEST_PORTAL is for incoming requests on the FOO
+ * FOO_REPLY_PORTAL is for incoming replies on the FOO
+ * FOO_BULK_PORTAL is for incoming bulk on the FOO
+ */
+
+#define CONNMGR_REQUEST_PORTAL 1
+#define CONNMGR_REPLY_PORTAL 2
+//#define OSC_REQUEST_PORTAL 3
+#define OSC_REPLY_PORTAL 4
+//#define OSC_BULK_PORTAL 5
+#define OST_IO_PORTAL 6
+#define OST_CREATE_PORTAL 7
+#define OST_BULK_PORTAL 8
+//#define MDC_REQUEST_PORTAL 9
+#define MDC_REPLY_PORTAL 10
+//#define MDC_BULK_PORTAL 11
+#define MDS_REQUEST_PORTAL 12
+//#define MDS_REPLY_PORTAL 13
+#define MDS_BULK_PORTAL 14
+#define LDLM_CB_REQUEST_PORTAL 15
+#define LDLM_CB_REPLY_PORTAL 16
+#define LDLM_CANCEL_REQUEST_PORTAL 17
+#define LDLM_CANCEL_REPLY_PORTAL 18
+//#define PTLBD_REQUEST_PORTAL 19
+//#define PTLBD_REPLY_PORTAL 20
+//#define PTLBD_BULK_PORTAL 21
+#define MDS_SETATTR_PORTAL 22
+#define MDS_READPAGE_PORTAL 23
+#define MDS_MDS_PORTAL 24
+
+#define MGC_REPLY_PORTAL 25
+#define MGS_REQUEST_PORTAL 26
+#define MGS_REPLY_PORTAL 27
+#define OST_REQUEST_PORTAL 28
+#define FLD_REQUEST_PORTAL 29
+#define SEQ_METADATA_PORTAL 30
+#define SEQ_DATA_PORTAL 31
+#define SEQ_CONTROLLER_PORTAL 32
+#define MGS_BULK_PORTAL 33
+
+/* Portal 63 is reserved for the Cray Inc DVS - nic@cray.com, roe@cray.com, n8851@cray.com */
+
+/* packet types */
+#define PTL_RPC_MSG_REQUEST 4711
+#define PTL_RPC_MSG_ERR 4712
+#define PTL_RPC_MSG_REPLY 4713
+
+/* DON'T use swabbed values of MAGIC as magic! */
+#define LUSTRE_MSG_MAGIC_V1 0x0BD00BD0
+#define LUSTRE_MSG_MAGIC_V2 0x0BD00BD3
+
+#define LUSTRE_MSG_MAGIC_V1_SWABBED 0xD00BD00B
+#define LUSTRE_MSG_MAGIC_V2_SWABBED 0xD30BD00B
+
+#define LUSTRE_MSG_MAGIC LUSTRE_MSG_MAGIC_V2
+
+#define PTLRPC_MSG_VERSION 0x00000003
+#define LUSTRE_VERSION_MASK 0xffff0000
+#define LUSTRE_OBD_VERSION 0x00010000
+#define LUSTRE_MDS_VERSION 0x00020000
+#define LUSTRE_OST_VERSION 0x00030000
+#define LUSTRE_DLM_VERSION 0x00040000
+#define LUSTRE_LOG_VERSION 0x00050000
+#define LUSTRE_MGS_VERSION 0x00060000
+
+typedef __u32 mdsno_t;
+typedef __u64 seqno_t;
+typedef __u64 obd_id;
+typedef __u64 obd_seq;
+typedef __s64 obd_time;
+typedef __u64 obd_size;
+typedef __u64 obd_off;
+typedef __u64 obd_blocks;
+typedef __u64 obd_valid;
+typedef __u32 obd_blksize;
+typedef __u32 obd_mode;
+typedef __u32 obd_uid;
+typedef __u32 obd_gid;
+typedef __u32 obd_flag;
+typedef __u32 obd_count;
+
+/**
+ * Describes a range of sequence numbers; lsr_start is included in the
+ * range but lsr_end is not.
+ * The same structure is used in the fld module, where the lsr_index
+ * field holds the mdt id of the home mdt.
+ */
+struct lu_seq_range {
+ __u64 lsr_start;
+ __u64 lsr_end;
+ __u32 lsr_index;
+ __u32 lsr_flags;
+};
+
+#define LU_SEQ_RANGE_MDT 0x0
+#define LU_SEQ_RANGE_OST 0x1
+#define LU_SEQ_RANGE_ANY 0x3
+
+#define LU_SEQ_RANGE_MASK 0x3
+
+static inline unsigned fld_range_type(const struct lu_seq_range *range)
+{
+ return range->lsr_flags & LU_SEQ_RANGE_MASK;
+}
+
+static inline int fld_range_is_ost(const struct lu_seq_range *range)
+{
+ return fld_range_type(range) == LU_SEQ_RANGE_OST;
+}
+
+static inline int fld_range_is_mdt(const struct lu_seq_range *range)
+{
+ return fld_range_type(range) == LU_SEQ_RANGE_MDT;
+}
+
+/**
+ * The ANY range type is only used when the fld client sends an fld query
+ * request but does not know whether the seq is MDT or OST, so it sends
+ * the request with the ANY type, meaning that either seq type returned
+ * by the lookup is acceptable.
+ */
+static inline unsigned fld_range_is_any(const struct lu_seq_range *range)
+{
+ return fld_range_type(range) == LU_SEQ_RANGE_ANY;
+}
+
+static inline void fld_range_set_type(struct lu_seq_range *range,
+ unsigned flags)
+{
+ LASSERT(!(flags & ~LU_SEQ_RANGE_MASK));
+ range->lsr_flags |= flags;
+}
+
+static inline void fld_range_set_mdt(struct lu_seq_range *range)
+{
+ fld_range_set_type(range, LU_SEQ_RANGE_MDT);
+}
+
+static inline void fld_range_set_ost(struct lu_seq_range *range)
+{
+ fld_range_set_type(range, LU_SEQ_RANGE_OST);
+}
+
+static inline void fld_range_set_any(struct lu_seq_range *range)
+{
+ fld_range_set_type(range, LU_SEQ_RANGE_ANY);
+}
+
+/**
+ * returns the width of given range \a range
+ */
+static inline __u64 range_space(const struct lu_seq_range *range)
+{
+ return range->lsr_end - range->lsr_start;
+}
+
+/**
+ * initialize range to zero
+ */
+static inline void range_init(struct lu_seq_range *range)
+{
+ range->lsr_start = range->lsr_end = range->lsr_index = 0;
+}
+
+/**
+ * check if given seq id \a s is within given range \a range
+ */
+static inline int range_within(const struct lu_seq_range *range,
+ __u64 s)
+{
+ return s >= range->lsr_start && s < range->lsr_end;
+}
+
+static inline int range_is_sane(const struct lu_seq_range *range)
+{
+ return (range->lsr_end >= range->lsr_start);
+}
+
+static inline int range_is_zero(const struct lu_seq_range *range)
+{
+ return (range->lsr_start == 0 && range->lsr_end == 0);
+}
+
+static inline int range_is_exhausted(const struct lu_seq_range *range)
+{
+ return range_space(range) == 0;
+}
+
+/* return 0 if the two ranges have the same location */
+static inline int range_compare_loc(const struct lu_seq_range *r1,
+ const struct lu_seq_range *r2)
+{
+ return r1->lsr_index != r2->lsr_index ||
+ r1->lsr_flags != r2->lsr_flags;
+}
+
+#define DRANGE "[%#16.16"LPF64"x-%#16.16"LPF64"x):%x:%s"
+
+#define PRANGE(range) \
+ (range)->lsr_start, \
+ (range)->lsr_end, \
+ (range)->lsr_index, \
+ fld_range_is_mdt(range) ? "mdt" : "ost"
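+
+/*
+ * Usage sketch (illustrative only): ranges are half-open, so lsr_end
+ * itself is not a member of the range:
+ *
+ *	struct lu_seq_range range = { 0 };
+ *
+ *	range.lsr_start = 0x200000400ULL;	(FID_SEQ_NORMAL, see below)
+ *	range.lsr_end = 0x200000500ULL;
+ *	fld_range_set_mdt(&range);
+ *
+ *	range_within(&range, 0x200000400ULL)	== 1
+ *	range_within(&range, 0x200000500ULL)	== 0
+ *	range_space(&range)			== 0x100
+ */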
+
+/** \defgroup lu_fid lu_fid
+ * @{ */
+
+/**
+ * Flags for lustre_mdt_attrs::lma_compat and lustre_mdt_attrs::lma_incompat.
+ * Deprecated since HSM and SOM attributes are now stored in separate
+ * on-disk xattrs.
+ */
+enum lma_compat {
+ LMAC_HSM = 0x00000001,
+ LMAC_SOM = 0x00000002,
+};
+
+/**
+ * Masks for all features that should be supported by a Lustre version to
+ * access a specific file.
+ * This information is stored in lustre_mdt_attrs::lma_incompat.
+ */
+enum lma_incompat {
+ LMAI_RELEASED = 0x00000001, /* file is released */
+ LMAI_AGENT = 0x00000002, /* agent inode */
+ LMAI_REMOTE_PARENT = 0x00000004, /* the parent of the object
+ is on the remote MDT */
+};
+#define LMA_INCOMPAT_SUPP (LMAI_AGENT | LMAI_REMOTE_PARENT)
+
+extern void lustre_lma_swab(struct lustre_mdt_attrs *lma);
+extern void lustre_lma_init(struct lustre_mdt_attrs *lma,
+ const struct lu_fid *fid, __u32 incompat);
+/**
+ * SOM on-disk attributes stored in a separate xattr.
+ */
+struct som_attrs {
+ /** Bitfield for supported data in this structure. For future use. */
+ __u32 som_compat;
+
+ /** Incompat feature list. The supported feature mask is available
+ * in SOM_INCOMPAT_SUPP */
+ __u32 som_incompat;
+
+ /** IO epoch the SOM attributes belong to */
+ __u64 som_ioepoch;
+ /** total file size in objects */
+ __u64 som_size;
+ /** total fs blocks in objects */
+ __u64 som_blocks;
+ /** mds mount id the size is valid for */
+ __u64 som_mountid;
+};
+extern void lustre_som_swab(struct som_attrs *attrs);
+
+#define SOM_INCOMPAT_SUPP 0x0
+
+/**
+ * HSM on-disk attributes stored in a separate xattr.
+ */
+struct hsm_attrs {
+ /** Bitfield for supported data in this structure. For future use. */
+ __u32 hsm_compat;
+
+ /** HSM flags, see hsm_flags enum below */
+ __u32 hsm_flags;
+ /** backend archive id associated with the file */
+ __u64 hsm_arch_id;
+ /** version associated with the last archiving, if any */
+ __u64 hsm_arch_ver;
+};
+extern void lustre_hsm_swab(struct hsm_attrs *attrs);
+
+/**
+ * fid constants
+ */
+enum {
+ /** LASTID file has zero OID */
+ LUSTRE_FID_LASTID_OID = 0UL,
+ /** initial fid id value */
+ LUSTRE_FID_INIT_OID = 1UL
+};
+
+/** returns fid object sequence */
+static inline __u64 fid_seq(const struct lu_fid *fid)
+{
+ return fid->f_seq;
+}
+
+/** returns fid object id */
+static inline __u32 fid_oid(const struct lu_fid *fid)
+{
+ return fid->f_oid;
+}
+
+/** returns fid object version */
+static inline __u32 fid_ver(const struct lu_fid *fid)
+{
+ return fid->f_ver;
+}
+
+static inline void fid_zero(struct lu_fid *fid)
+{
+ memset(fid, 0, sizeof(*fid));
+}
+
+static inline obd_id fid_ver_oid(const struct lu_fid *fid)
+{
+ return ((__u64)fid_ver(fid) << 32 | fid_oid(fid));
+}
+
+/**
+ * Note that SEQ numbers below 12 correspond to ldiskfs inodes that are
+ * reserved in the IGIF namespace, so these SEQ numbers can be used for
+ * other purposes without risk of collision with existing inodes.
+ *
+ * Different FID Format
+ * http://arch.lustre.org/index.php?title=Interoperability_fids_zfs#NEW.0
+ */
+enum fid_seq {
+ FID_SEQ_OST_MDT0 = 0,
+ FID_SEQ_LLOG = 1, /* unnamed llogs */
+ FID_SEQ_ECHO = 2,
+ FID_SEQ_OST_MDT1 = 3,
+ FID_SEQ_OST_MAX = 9, /* Max MDT count before OST_on_FID */
+ FID_SEQ_LLOG_NAME = 10, /* named llogs */
+ FID_SEQ_RSVD = 11,
+ FID_SEQ_IGIF = 12,
+ FID_SEQ_IGIF_MAX = 0x0ffffffffULL,
+ FID_SEQ_IDIF = 0x100000000ULL,
+ FID_SEQ_IDIF_MAX = 0x1ffffffffULL,
+ /* Normal FID sequence starts from this value, i.e. 1<<33 */
+ FID_SEQ_START = 0x200000000ULL,
+ /* sequence for local pre-defined FIDs listed in local_oid */
+ FID_SEQ_LOCAL_FILE = 0x200000001ULL,
+ FID_SEQ_DOT_LUSTRE = 0x200000002ULL,
+ /* this sequence is used for FIDs of local named objects generated
+ * by the local_object_storage library */
+ FID_SEQ_LOCAL_NAME = 0x200000003ULL,
+ /* Because the current FLD only caches the fid sequence (not the
+ * oid) on the client side, if a FID needs to be exposed to
+ * clients, it must be ensured that all fids under one sequence
+ * are located on one MDT. */
+ FID_SEQ_SPECIAL = 0x200000004ULL,
+ FID_SEQ_QUOTA = 0x200000005ULL,
+ FID_SEQ_QUOTA_GLB = 0x200000006ULL,
+ FID_SEQ_ROOT = 0x200000007ULL, /* Located on MDT0 */
+ FID_SEQ_NORMAL = 0x200000400ULL,
+ FID_SEQ_LOV_DEFAULT = 0xffffffffffffffffULL
+};
+
+#define OBIF_OID_MAX_BITS 32
+#define OBIF_MAX_OID (1ULL << OBIF_OID_MAX_BITS)
+#define OBIF_OID_MASK ((1ULL << OBIF_OID_MAX_BITS) - 1)
+#define IDIF_OID_MAX_BITS 48
+#define IDIF_MAX_OID (1ULL << IDIF_OID_MAX_BITS)
+#define IDIF_OID_MASK ((1ULL << IDIF_OID_MAX_BITS) - 1)
+
+/** OID for FID_SEQ_SPECIAL */
+enum special_oid {
+ /* Big Filesystem Lock to serialize rename operations */
+ FID_OID_SPECIAL_BFL = 1UL,
+};
+
+/** OID for FID_SEQ_DOT_LUSTRE */
+enum dot_lustre_oid {
+ FID_OID_DOT_LUSTRE = 1UL,
+ FID_OID_DOT_LUSTRE_OBF = 2UL,
+};
+
+static inline int fid_seq_is_mdt0(obd_seq seq)
+{
+ return (seq == FID_SEQ_OST_MDT0);
+}
+
+static inline int fid_seq_is_mdt(const __u64 seq)
+{
+ return seq == FID_SEQ_OST_MDT0 || seq >= FID_SEQ_NORMAL;
+}
+
+static inline int fid_seq_is_echo(obd_seq seq)
+{
+ return (seq == FID_SEQ_ECHO);
+}
+
+static inline int fid_is_echo(const struct lu_fid *fid)
+{
+ return fid_seq_is_echo(fid_seq(fid));
+}
+
+static inline int fid_seq_is_llog(obd_seq seq)
+{
+ return (seq == FID_SEQ_LLOG);
+}
+
+static inline int fid_is_llog(const struct lu_fid *fid)
+{
+ /* file with OID == 0 is not llog but contains last oid */
+ return fid_seq_is_llog(fid_seq(fid)) && fid_oid(fid) > 0;
+}
+
+static inline int fid_seq_is_rsvd(const __u64 seq)
+{
+ return (seq > FID_SEQ_OST_MDT0 && seq <= FID_SEQ_RSVD);
+}
+
+static inline int fid_seq_is_special(const __u64 seq)
+{
+ return seq == FID_SEQ_SPECIAL;
+}
+
+static inline int fid_seq_is_local_file(const __u64 seq)
+{
+ return seq == FID_SEQ_LOCAL_FILE ||
+ seq == FID_SEQ_LOCAL_NAME;
+}
+
+static inline int fid_seq_is_root(const __u64 seq)
+{
+ return seq == FID_SEQ_ROOT;
+}
+
+static inline int fid_seq_is_dot(const __u64 seq)
+{
+ return seq == FID_SEQ_DOT_LUSTRE;
+}
+
+static inline int fid_seq_is_default(const __u64 seq)
+{
+ return seq == FID_SEQ_LOV_DEFAULT;
+}
+
+static inline int fid_is_mdt0(const struct lu_fid *fid)
+{
+ return fid_seq_is_mdt0(fid_seq(fid));
+}
+
+static inline void lu_root_fid(struct lu_fid *fid)
+{
+ fid->f_seq = FID_SEQ_ROOT;
+ fid->f_oid = 1;
+ fid->f_ver = 0;
+}
+
+/**
+ * Check if a fid is igif or not.
+ * \param fid the fid to be tested.
+ * \return true if the fid is an igif; otherwise false.
+ */
+static inline int fid_seq_is_igif(const __u64 seq)
+{
+ return seq >= FID_SEQ_IGIF && seq <= FID_SEQ_IGIF_MAX;
+}
+
+static inline int fid_is_igif(const struct lu_fid *fid)
+{
+ return fid_seq_is_igif(fid_seq(fid));
+}
+
+/**
+ * Check if a fid is idif or not.
+ * \param fid the fid to be tested.
+ * \return true if the fid is an idif; otherwise false.
+ */
+static inline int fid_seq_is_idif(const __u64 seq)
+{
+ return seq >= FID_SEQ_IDIF && seq <= FID_SEQ_IDIF_MAX;
+}
+
+static inline int fid_is_idif(const struct lu_fid *fid)
+{
+ return fid_seq_is_idif(fid_seq(fid));
+}
+
+static inline int fid_is_local_file(const struct lu_fid *fid)
+{
+ return fid_seq_is_local_file(fid_seq(fid));
+}
+
+static inline int fid_seq_is_norm(const __u64 seq)
+{
+ return (seq >= FID_SEQ_NORMAL);
+}
+
+static inline int fid_is_norm(const struct lu_fid *fid)
+{
+ return fid_seq_is_norm(fid_seq(fid));
+}
+
+/* convert an OST objid into an IDIF FID SEQ number */
+static inline obd_seq fid_idif_seq(obd_id id, __u32 ost_idx)
+{
+ return FID_SEQ_IDIF | (ost_idx << 16) | ((id >> 32) & 0xffff);
+}
+
+/* convert a packed IDIF FID into an OST objid */
+static inline obd_id fid_idif_id(obd_seq seq, __u32 oid, __u32 ver)
+{
+ return ((__u64)ver << 48) | ((seq & 0xffff) << 32) | oid;
+}
+
+/* extract ost index from IDIF FID */
+static inline __u32 fid_idif_ost_idx(const struct lu_fid *fid)
+{
+ LASSERT(fid_is_idif(fid));
+ return (fid_seq(fid) >> 16) & 0xffff;
+}
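+
+/*
+ * Worked example (illustrative only): an IDIF SEQ packs the OST index in
+ * bits 16..31 and the top 16 bits of the 48-bit objid in bits 0..15, so
+ * for objid 0x123456789abc on OST index 5:
+ *
+ *	fid_idif_seq(0x123456789abcULL, 5)
+ *		== FID_SEQ_IDIF | (5 << 16) | 0x1234
+ *		== 0x100051234ULL
+ *
+ * with f_oid holding the low 32 bits (0x56789abc), and fid_idif_id()
+ * recombining seq/oid/ver into the original 48-bit objid.
+ */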
+
+/* extract OST sequence (group) from a wire ost_id (id/seq) pair */
+static inline obd_seq ostid_seq(const struct ost_id *ostid)
+{
+ if (fid_seq_is_mdt0(ostid->oi.oi_seq))
+ return FID_SEQ_OST_MDT0;
+
+ if (fid_seq_is_default(ostid->oi.oi_seq))
+ return FID_SEQ_LOV_DEFAULT;
+
+ if (fid_is_idif(&ostid->oi_fid))
+ return FID_SEQ_OST_MDT0;
+
+ return fid_seq(&ostid->oi_fid);
+}
+
+/* extract OST objid from a wire ost_id (id/seq) pair */
+static inline obd_id ostid_id(const struct ost_id *ostid)
+{
+ if (fid_seq_is_mdt0(ostid_seq(ostid)))
+ return ostid->oi.oi_id & IDIF_OID_MASK;
+
+ if (fid_is_idif(&ostid->oi_fid))
+ return fid_idif_id(fid_seq(&ostid->oi_fid),
+ fid_oid(&ostid->oi_fid), 0);
+
+ return fid_oid(&ostid->oi_fid);
+}
+
+static inline void ostid_set_seq(struct ost_id *oi, __u64 seq)
+{
+ if (fid_seq_is_mdt0(seq) || fid_seq_is_default(seq)) {
+ oi->oi.oi_seq = seq;
+ } else {
+ oi->oi_fid.f_seq = seq;
+ /* Note: if f_oid and f_ver are both zero, we need to
+ * initialize f_oid to 1; otherwise ostid_seq will treat
+ * this as an old ostid (oi_seq == 0) */
+ if (oi->oi_fid.f_oid == 0 && oi->oi_fid.f_ver == 0)
+ oi->oi_fid.f_oid = LUSTRE_FID_INIT_OID;
+ }
+}
+
+static inline void ostid_set_seq_mdt0(struct ost_id *oi)
+{
+ ostid_set_seq(oi, FID_SEQ_OST_MDT0);
+}
+
+static inline void ostid_set_seq_echo(struct ost_id *oi)
+{
+ ostid_set_seq(oi, FID_SEQ_ECHO);
+}
+
+static inline void ostid_set_seq_llog(struct ost_id *oi)
+{
+ ostid_set_seq(oi, FID_SEQ_LLOG);
+}
+
+/**
+ * Note: we need to check oi_seq to decide where to set oi_id,
+ * so oi_seq should always be set ahead of oi_id.
+ */
+static inline void ostid_set_id(struct ost_id *oi, __u64 oid)
+{
+ if (fid_seq_is_mdt0(ostid_seq(oi))) {
+ if (oid >= IDIF_MAX_OID) {
+ CERROR("Bad "LPU64" to set "DOSTID"\n",
+ oid, POSTID(oi));
+ return;
+ }
+ oi->oi.oi_id = oid;
+ } else {
+ if (oid > OBIF_MAX_OID) {
+ CERROR("Bad "LPU64" to set "DOSTID"\n",
+ oid, POSTID(oi));
+ return;
+ }
+ oi->oi_fid.f_oid = oid;
+ }
+}
+
+static inline void ostid_inc_id(struct ost_id *oi)
+{
+ if (fid_seq_is_mdt0(ostid_seq(oi))) {
+ if (unlikely(ostid_id(oi) + 1 > IDIF_MAX_OID)) {
+ CERROR("Bad inc "DOSTID"\n", POSTID(oi));
+ return;
+ }
+ oi->oi.oi_id++;
+ } else {
+ oi->oi_fid.f_oid++;
+ }
+}
+
+static inline void ostid_dec_id(struct ost_id *oi)
+{
+ if (fid_seq_is_mdt0(ostid_seq(oi)))
+ oi->oi.oi_id--;
+ else
+ oi->oi_fid.f_oid--;
+}
+
+/**
+ * Unpack an OST object id/seq (group) into a FID. This is needed for
+ * converting all obdo, lmm, lsm, etc. 64-bit id/seq pairs into proper
+ * FIDs. Note that if an id/seq is already in FID/IDIF format it will
+ * be passed through unchanged. Only legacy OST objects in "group 0"
+ * will be mapped into the IDIF namespace so that they can fit into the
+ * struct lu_fid fields without loss. For reference see:
+ * http://arch.lustre.org/index.php?title=Interoperability_fids_zfs
+ */
+static inline int ostid_to_fid(struct lu_fid *fid, struct ost_id *ostid,
+ __u32 ost_idx)
+{
+ if (ost_idx > 0xffff) {
+ CERROR("bad ost_idx, "DOSTID" ost_idx:%u\n", POSTID(ostid),
+ ost_idx);
+ return -EBADF;
+ }
+
+ if (fid_seq_is_mdt0(ostid_seq(ostid))) {
+ /* This is a "legacy" (old 1.x/2.early) OST object in "group 0"
+ * that we map into the IDIF namespace. It allows up to 2^48
+ * objects per OST, as this is the object namespace that has
+ * been in production for years. This can handle create rates
+ * of 1M objects/s/OST for 9 years, or combinations thereof. */
+ if (ostid_id(ostid) >= IDIF_MAX_OID) {
+ CERROR("bad MDT0 id, "DOSTID" ost_idx:%u\n",
+ POSTID(ostid), ost_idx);
+ return -EBADF;
+ }
+ fid->f_seq = fid_idif_seq(ostid_id(ostid), ost_idx);
+ /* truncate to 32 bits by assignment */
+ fid->f_oid = ostid_id(ostid);
+ /* in theory, not currently used */
+ fid->f_ver = ostid_id(ostid) >> 48;
+ } else /* if (fid_seq_is_idif(seq) || fid_seq_is_norm(seq)) */ {
+ /* This is either an IDIF object, which identifies objects across
+ * all OSTs, or a regular FID. The IDIF namespace maps legacy
+ * OST objects into the FID namespace. In both cases, we just
+ * pass the FID through, no conversion needed. */
+ if (ostid->oi_fid.f_ver != 0) {
+ CERROR("bad MDT0 id, "DOSTID" ost_idx:%u\n",
+ POSTID(ostid), ost_idx);
+ return -EBADF;
+ }
+ *fid = ostid->oi_fid;
+ }
+
+ return 0;
+}
+
+/* pack any OST FID into an ostid (id/seq) for the wire/disk */
+static inline int fid_to_ostid(const struct lu_fid *fid, struct ost_id *ostid)
+{
+ if (unlikely(fid_seq_is_igif(fid->f_seq))) {
+ CERROR("bad IGIF, "DFID"\n", PFID(fid));
+ return -EBADF;
+ }
+
+ if (fid_is_idif(fid)) {
+ ostid_set_seq_mdt0(ostid);
+ ostid_set_id(ostid, fid_idif_id(fid_seq(fid), fid_oid(fid),
+ fid_ver(fid)));
+ } else {
+ ostid->oi_fid = *fid;
+ }
+
+ return 0;
+}
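+
+/*
+ * Round-trip sketch (illustrative only): converting a legacy "group 0"
+ * objid to a FID and back is lossless because the objid fits into the
+ * IDIF namespace:
+ *
+ *	struct ost_id oi;
+ *	struct lu_fid fid;
+ *
+ *	ostid_set_seq_mdt0(&oi);
+ *	ostid_set_id(&oi, 0x12345);
+ *	ostid_to_fid(&fid, &oi, 0);	(fid is now the IDIF FID
+ *					 [0x100000000:0x12345:0x0])
+ *	fid_to_ostid(&fid, &oi);	(oi holds seq 0 / id 0x12345 again)
+ */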
+
+/* Check whether the fid is for LAST_ID */
+static inline int fid_is_last_id(const struct lu_fid *fid)
+{
+ return (fid_oid(fid) == 0);
+}
+
+/**
+ * Get inode number from an igif.
+ * \param fid an igif to get inode number from.
+ * \return inode number for the igif.
+ */
+static inline ino_t lu_igif_ino(const struct lu_fid *fid)
+{
+ return fid_seq(fid);
+}
+
+extern void lustre_swab_ost_id(struct ost_id *oid);
+
+/**
+ * Get inode generation from an igif.
+ * \param fid an igif to get inode generation from.
+ * \return inode generation for the igif.
+ */
+static inline __u32 lu_igif_gen(const struct lu_fid *fid)
+{
+ return fid_oid(fid);
+}
+
+/**
+ * Build igif from the inode number/generation.
+ */
+static inline void lu_igif_build(struct lu_fid *fid, __u32 ino, __u32 gen)
+{
+ fid->f_seq = ino;
+ fid->f_oid = gen;
+ fid->f_ver = 0;
+}
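+
+/*
+ * Sketch (illustrative only): an IGIF stores the inode number in f_seq
+ * and the generation in f_oid, so for ino 0x2f5 / gen 7:
+ *
+ *	struct lu_fid fid;
+ *
+ *	lu_igif_build(&fid, 0x2f5, 7);
+ *	lu_igif_ino(&fid)	== 0x2f5
+ *	lu_igif_gen(&fid)	== 7
+ *
+ * and fid_is_igif() holds as long as the ino is >= FID_SEQ_IGIF (12).
+ */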
+
+/*
+ * Fids are transmitted across the network (in the sender's byte order),
+ * and stored on disk in big-endian order.
+ */
+static inline void fid_cpu_to_le(struct lu_fid *dst, const struct lu_fid *src)
+{
+ /* check that all fields are converted */
+ CLASSERT(sizeof *src ==
+ sizeof fid_seq(src) +
+ sizeof fid_oid(src) + sizeof fid_ver(src));
+ dst->f_seq = cpu_to_le64(fid_seq(src));
+ dst->f_oid = cpu_to_le32(fid_oid(src));
+ dst->f_ver = cpu_to_le32(fid_ver(src));
+}
+
+static inline void fid_le_to_cpu(struct lu_fid *dst, const struct lu_fid *src)
+{
+ /* check that all fields are converted */
+ CLASSERT(sizeof *src ==
+ sizeof fid_seq(src) +
+ sizeof fid_oid(src) + sizeof fid_ver(src));
+ dst->f_seq = le64_to_cpu(fid_seq(src));
+ dst->f_oid = le32_to_cpu(fid_oid(src));
+ dst->f_ver = le32_to_cpu(fid_ver(src));
+}
+
+static inline void fid_cpu_to_be(struct lu_fid *dst, const struct lu_fid *src)
+{
+ /* check that all fields are converted */
+ CLASSERT(sizeof *src ==
+ sizeof fid_seq(src) +
+ sizeof fid_oid(src) + sizeof fid_ver(src));
+ dst->f_seq = cpu_to_be64(fid_seq(src));
+ dst->f_oid = cpu_to_be32(fid_oid(src));
+ dst->f_ver = cpu_to_be32(fid_ver(src));
+}
+
+static inline void fid_be_to_cpu(struct lu_fid *dst, const struct lu_fid *src)
+{
+ /* check that all fields are converted */
+ CLASSERT(sizeof *src ==
+ sizeof fid_seq(src) +
+ sizeof fid_oid(src) + sizeof fid_ver(src));
+ dst->f_seq = be64_to_cpu(fid_seq(src));
+ dst->f_oid = be32_to_cpu(fid_oid(src));
+ dst->f_ver = be32_to_cpu(fid_ver(src));
+}
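+
+/*
+ * Usage sketch (illustrative only): a FID headed for disk is converted
+ * to big-endian first and converted back after being read:
+ *
+ *	struct lu_fid disk_fid;
+ *
+ *	fid_cpu_to_be(&disk_fid, fid);	(before writing to disk)
+ *	fid_be_to_cpu(fid, &disk_fid);	(after reading it back)
+ *
+ * fid_cpu_to_le()/fid_le_to_cpu() follow the same pattern for
+ * little-endian consumers.
+ */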
+
+static inline int fid_is_sane(const struct lu_fid *fid)
+{
+ return fid != NULL &&
+ ((fid_seq(fid) >= FID_SEQ_START && fid_ver(fid) == 0) ||
+ fid_is_igif(fid) || fid_is_idif(fid) ||
+ fid_seq_is_rsvd(fid_seq(fid)));
+}
+
+static inline int fid_is_zero(const struct lu_fid *fid)
+{
+ return fid_seq(fid) == 0 && fid_oid(fid) == 0;
+}
+
+extern void lustre_swab_lu_fid(struct lu_fid *fid);
+extern void lustre_swab_lu_seq_range(struct lu_seq_range *range);
+
+static inline int lu_fid_eq(const struct lu_fid *f0, const struct lu_fid *f1)
+{
+ /* Check that there is no alignment padding. */
+ CLASSERT(sizeof *f0 ==
+ sizeof f0->f_seq + sizeof f0->f_oid + sizeof f0->f_ver);
+ return memcmp(f0, f1, sizeof *f0) == 0;
+}
+
+#define __diff_normalize(val0, val1) \
+({ \
+ typeof(val0) __val0 = (val0); \
+ typeof(val1) __val1 = (val1); \
+ \
+ (__val0 == __val1 ? 0 : __val0 > __val1 ? +1 : -1); \
+})
+
+static inline int lu_fid_cmp(const struct lu_fid *f0,
+ const struct lu_fid *f1)
+{
+ return
+ __diff_normalize(fid_seq(f0), fid_seq(f1)) ?:
+ __diff_normalize(fid_oid(f0), fid_oid(f1)) ?:
+ __diff_normalize(fid_ver(f0), fid_ver(f1));
+}
+
+static inline void ostid_cpu_to_le(struct ost_id *src_oi,
+ struct ost_id *dst_oi)
+{
+ if (fid_seq_is_mdt0(ostid_seq(src_oi))) {
+ dst_oi->oi.oi_id = cpu_to_le64(src_oi->oi.oi_id);
+ dst_oi->oi.oi_seq = cpu_to_le64(src_oi->oi.oi_seq);
+ } else {
+ fid_cpu_to_le(&dst_oi->oi_fid, &src_oi->oi_fid);
+ }
+}
+
+static inline void ostid_le_to_cpu(struct ost_id *src_oi,
+ struct ost_id *dst_oi)
+{
+ if (fid_seq_is_mdt0(ostid_seq(src_oi))) {
+ dst_oi->oi.oi_id = le64_to_cpu(src_oi->oi.oi_id);
+ dst_oi->oi.oi_seq = le64_to_cpu(src_oi->oi.oi_seq);
+ } else {
+ fid_le_to_cpu(&dst_oi->oi_fid, &src_oi->oi_fid);
+ }
+}
+
+/** @} lu_fid */
+
+/** \defgroup lu_dir lu_dir
+ * @{ */
+
+/**
+ * Enumeration of possible directory entry attributes.
+ *
+ * Attributes follow directory entry header in the order they appear in this
+ * enumeration.
+ */
+enum lu_dirent_attrs {
+ LUDA_FID = 0x0001,
+ LUDA_TYPE = 0x0002,
+ LUDA_64BITHASH = 0x0004,
+
+ /* The following attrs are used by the MDT internally only
+ * and are not visible to the client */
+
+ /* Verify the dirent consistency */
+ LUDA_VERIFY = 0x8000,
+ /* Only check, but do not repair, the dirent inconsistency */
+ LUDA_VERIFY_DRYRUN = 0x4000,
+ /* The dirent has been repaired, or is to be repaired (dryrun). */
+ LUDA_REPAIR = 0x2000,
+ /* The system is upgraded; the dirent has been or is to be repaired
+ * (dryrun). */
+ LUDA_UPGRADE = 0x1000,
+ /* Ignore this record and go directly to the next one. */
+ LUDA_IGNORE = 0x0800,
+};
+
+#define LU_DIRENT_ATTRS_MASK 0xf800
+
+/**
+ * Layout of readdir pages, as transmitted on wire.
+ */
+struct lu_dirent {
+ /** valid if LUDA_FID is set. */
+ struct lu_fid lde_fid;
+ /** a unique entry identifier: a hash or an offset. */
+ __u64 lde_hash;
+ /** total record length, including all attributes. */
+ __u16 lde_reclen;
+ /** name length */
+ __u16 lde_namelen;
+ /** optional variable-size attributes following this entry,
+ * taken from enum lu_dirent_attrs.
+ */
+ __u32 lde_attrs;
+ /** name is followed by the attributes indicated in ->lde_attrs, in
+ * their natural order. After the last attribute, padding bytes are
+ * added to make ->lde_reclen a multiple of 8.
+ */
+ char lde_name[0];
+};
+
+/*
+ * Definitions of optional directory entry attributes formats.
+ *
+ * Individual attributes do not have their length encoded in a generic way.
+ * It is assumed that the consumer of an attribute knows its format. This
+ * means that it is impossible to skip over an unknown attribute, except by
+ * skipping over all remaining attributes (by using ->lde_reclen), which is
+ * not too constraining, because new server versions will append new
+ * attributes at the end of an entry.
+ */
+
+/**
+ * Fid directory attribute: a fid of an object referenced by the entry. This
+ * attribute will almost always be requested by the client and supplied by
+ * the server.
+ *
+ * Aligned to 8 bytes.
+ */
+/* To have compatibility with 1.8, let's keep the fid in the lu_dirent struct. */
+
+/**
+ * File type.
+ *
+ * Aligned to 2 bytes.
+ */
+struct luda_type {
+ __u16 lt_type;
+};
+
+struct lu_dirpage {
+ __u64 ldp_hash_start;
+ __u64 ldp_hash_end;
+ __u32 ldp_flags;
+ __u32 ldp_pad0;
+ struct lu_dirent ldp_entries[0];
+};
+
+enum lu_dirpage_flags {
+ /**
+ * dirpage contains no entry.
+ */
+ LDF_EMPTY = 1 << 0,
+ /**
+ * last entry's lde_hash equals ldp_hash_end.
+ */
+ LDF_COLLIDE = 1 << 1
+};
+
+static inline struct lu_dirent *lu_dirent_start(struct lu_dirpage *dp)
+{
+ if (le32_to_cpu(dp->ldp_flags) & LDF_EMPTY)
+ return NULL;
+ else
+ return dp->ldp_entries;
+}
+
+static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent)
+{
+ struct lu_dirent *next;
+
+ if (le16_to_cpu(ent->lde_reclen) != 0)
+ next = ((void *)ent) + le16_to_cpu(ent->lde_reclen);
+ else
+ next = NULL;
+
+ return next;
+}
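+
+/*
+ * Iteration sketch (illustrative only): walking every entry of a single
+ * dirpage received from the server, where process_entry() stands in for
+ * a hypothetical per-entry callback:
+ *
+ *	struct lu_dirent *ent;
+ *
+ *	for (ent = lu_dirent_start(dp); ent != NULL;
+ *	     ent = lu_dirent_next(ent))
+ *		process_entry(ent);
+ */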
+
+static inline int lu_dirent_calc_size(int namelen, __u16 attr)
+{
+ int size;
+
+ if (attr & LUDA_TYPE) {
+ const unsigned align = sizeof(struct luda_type) - 1;
+ size = (sizeof(struct lu_dirent) + namelen + align) & ~align;
+ size += sizeof(struct luda_type);
+ } else
+ size = sizeof(struct lu_dirent) + namelen;
+
+ return (size + 7) & ~7;
+}
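+
+/*
+ * Worked example (illustrative only): with LUDA_TYPE set, a 5-character
+ * name on a build where sizeof(struct lu_dirent) == 32 gives
+ *
+ *	32 + 5 = 37, rounded up to the 2-byte luda_type boundary = 38,
+ *	plus sizeof(struct luda_type) = 40, already a multiple of 8,
+ *
+ * so lu_dirent_calc_size(5, LUDA_TYPE) == 40.
+ */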
+
+static inline int lu_dirent_size(struct lu_dirent *ent)
+{
+ if (le16_to_cpu(ent->lde_reclen) == 0) {
+ return lu_dirent_calc_size(le16_to_cpu(ent->lde_namelen),
+ le32_to_cpu(ent->lde_attrs));
+ }
+ return le16_to_cpu(ent->lde_reclen);
+}
+
+#define MDS_DIR_END_OFF 0xfffffffffffffffeULL
+
+/**
+ * MDS_READPAGE page size
+ *
+ * This is the directory page size packed in MDS_READPAGE RPC.
+ * It's different from PAGE_CACHE_SIZE because the client needs to
+ * access the struct lu_dirpage header packed at the beginning of
+ * the "page"; without this there isn't any way to know where the
+ * lu_dirpage header is if client and server PAGE_CACHE_SIZE differ.
+ */
+#define LU_PAGE_SHIFT 12
+#define LU_PAGE_SIZE (1UL << LU_PAGE_SHIFT)
+#define LU_PAGE_MASK (~(LU_PAGE_SIZE - 1))
+
+#define LU_PAGE_COUNT (1 << (PAGE_CACHE_SHIFT - LU_PAGE_SHIFT))
+
+/** @} lu_dir */
+
+struct lustre_handle {
+ __u64 cookie;
+};
+#define DEAD_HANDLE_MAGIC 0xdeadbeefcafebabeULL
+
+static inline int lustre_handle_is_used(struct lustre_handle *lh)
+{
+ return lh->cookie != 0ull;
+}
+
+static inline int lustre_handle_equal(const struct lustre_handle *lh1,
+ const struct lustre_handle *lh2)
+{
+ return lh1->cookie == lh2->cookie;
+}
+
+static inline void lustre_handle_copy(struct lustre_handle *tgt,
+ struct lustre_handle *src)
+{
+ tgt->cookie = src->cookie;
+}
+
+/* flags for lm_flags */
+#define MSGHDR_AT_SUPPORT 0x1
+#define MSGHDR_CKSUM_INCOMPAT18 0x2
+
+#define lustre_msg lustre_msg_v2
+/* we depend on this structure to be 8-byte aligned */
+/* this type is only endian-adjusted in lustre_unpack_msg() */
+struct lustre_msg_v2 {
+ __u32 lm_bufcount;
+ __u32 lm_secflvr;
+ __u32 lm_magic;
+ __u32 lm_repsize;
+ __u32 lm_cksum;
+ __u32 lm_flags;
+ __u32 lm_padding_2;
+ __u32 lm_padding_3;
+ __u32 lm_buflens[0];
+};
+
+/* without gss, ptlrpc_body is put in the first buffer. */
+#define PTLRPC_NUM_VERSIONS 4
+#define JOBSTATS_JOBID_SIZE 32 /* 32-byte string */
+struct ptlrpc_body_v3 {
+ struct lustre_handle pb_handle;
+ __u32 pb_type;
+ __u32 pb_version;
+ __u32 pb_opc;
+ __u32 pb_status;
+ __u64 pb_last_xid;
+ __u64 pb_last_seen;
+ __u64 pb_last_committed;
+ __u64 pb_transno;
+ __u32 pb_flags;
+ __u32 pb_op_flags;
+ __u32 pb_conn_cnt;
+ __u32 pb_timeout; /* for req, the deadline, for rep, the service est */
+ __u32 pb_service_time; /* for rep, actual service time */
+ __u32 pb_limit;
+ __u64 pb_slv;
+ /* VBR: pre-versions */
+ __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS];
+ /* padding for future needs */
+ __u64 pb_padding[4];
+ char pb_jobid[JOBSTATS_JOBID_SIZE];
+};
+#define ptlrpc_body ptlrpc_body_v3
+
+struct ptlrpc_body_v2 {
+ struct lustre_handle pb_handle;
+ __u32 pb_type;
+ __u32 pb_version;
+ __u32 pb_opc;
+ __u32 pb_status;
+ __u64 pb_last_xid;
+ __u64 pb_last_seen;
+ __u64 pb_last_committed;
+ __u64 pb_transno;
+ __u32 pb_flags;
+ __u32 pb_op_flags;
+ __u32 pb_conn_cnt;
+ __u32 pb_timeout; /* for req, the deadline, for rep, the service est */
+ __u32 pb_service_time; /* for rep, actual service time, also used for
+ net_latency of req */
+ __u32 pb_limit;
+ __u64 pb_slv;
+ /* VBR: pre-versions */
+ __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS];
+ /* padding for future needs */
+ __u64 pb_padding[4];
+};
+
+extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
+
+/* message body offset for lustre_msg_v2 */
+/* ptlrpc body offset in all request/reply messages */
+#define MSG_PTLRPC_BODY_OFF 0
+
+/* normal request/reply message record offset */
+#define REQ_REC_OFF 1
+#define REPLY_REC_OFF 1
+
+/* ldlm request message body offset */
+#define DLM_LOCKREQ_OFF 1 /* lockreq offset */
+#define DLM_REQ_REC_OFF 2 /* normal dlm request record offset */
+
+/* ldlm intent lock message body offset */
+#define DLM_INTENT_IT_OFF 2 /* intent lock it offset */
+#define DLM_INTENT_REC_OFF 3 /* intent lock record offset */
+
+/* ldlm reply message body offset */
+#define DLM_LOCKREPLY_OFF 1 /* lockrep offset */
+#define DLM_REPLY_REC_OFF 2 /* reply record offset */
+
+/** only use in req->rq_{req,rep}_swab_mask */
+#define MSG_PTLRPC_HEADER_OFF 31
+
+/* Flags that are operation-specific go in the top 16 bits. */
+#define MSG_OP_FLAG_MASK 0xffff0000
+#define MSG_OP_FLAG_SHIFT 16
+
+/* Flags that apply to all requests are in the bottom 16 bits */
+#define MSG_GEN_FLAG_MASK 0x0000ffff
+#define MSG_LAST_REPLAY 0x0001
+#define MSG_RESENT 0x0002
+#define MSG_REPLAY 0x0004
+/* #define MSG_AT_SUPPORT 0x0008
+ * This was used in early prototypes of adaptive timeouts, and while there
+ * shouldn't be any users of that code, there also isn't a need for using
+ * this bit. Defer usage until at least 1.10 to avoid potential conflicts. */
+#define MSG_DELAY_REPLAY 0x0010
+#define MSG_VERSION_REPLAY 0x0020
+#define MSG_REQ_REPLAY_DONE 0x0040
+#define MSG_LOCK_REPLAY_DONE 0x0080
+
+/*
+ * Flags for all connect opcodes (MDS_CONNECT, OST_CONNECT)
+ */
+
+#define MSG_CONNECT_RECOVERING 0x00000001
+#define MSG_CONNECT_RECONNECT 0x00000002
+#define MSG_CONNECT_REPLAYABLE 0x00000004
+//#define MSG_CONNECT_PEER 0x8
+#define MSG_CONNECT_LIBCLIENT 0x00000010
+#define MSG_CONNECT_INITIAL 0x00000020
+#define MSG_CONNECT_ASYNC 0x00000040
+#define MSG_CONNECT_NEXT_VER 0x00000080 /* use next version of lustre_msg */
+#define MSG_CONNECT_TRANSNO 0x00000100 /* report transno */
+
+/* Connect flags */
+#define OBD_CONNECT_RDONLY 0x1ULL /*client has read-only access*/
+#define OBD_CONNECT_INDEX 0x2ULL /*connect specific LOV idx */
+#define OBD_CONNECT_MDS 0x4ULL /*connect from MDT to OST */
+#define OBD_CONNECT_GRANT 0x8ULL /*OSC gets grant at connect */
+#define OBD_CONNECT_SRVLOCK 0x10ULL /*server takes locks for cli */
+#define OBD_CONNECT_VERSION 0x20ULL /*Lustre versions in ocd */
+#define OBD_CONNECT_REQPORTAL 0x40ULL /*Separate non-IO req portal */
+#define OBD_CONNECT_ACL 0x80ULL /*access control lists */
+#define OBD_CONNECT_XATTR 0x100ULL /*client use extended attr */
+#define OBD_CONNECT_CROW 0x200ULL /*MDS+OST create obj on write*/
+#define OBD_CONNECT_TRUNCLOCK 0x400ULL /*locks on server for punch */
+#define OBD_CONNECT_TRANSNO 0x800ULL /*replay sends init transno */
+#define OBD_CONNECT_IBITS 0x1000ULL /*support for inodebits locks*/
+#define OBD_CONNECT_JOIN 0x2000ULL /*files can be concatenated.
+ *We do not support JOIN FILE
+ *anymore; reserve this flag
+ *just to prevent the bit
+ *from being reused.*/
+#define OBD_CONNECT_ATTRFID 0x4000ULL /*Server can GetAttr By Fid*/
+#define OBD_CONNECT_NODEVOH 0x8000ULL /*No open hndl on specl nodes*/
+#define OBD_CONNECT_RMT_CLIENT 0x10000ULL /*Remote client */
+#define OBD_CONNECT_RMT_CLIENT_FORCE 0x20000ULL /*Remote client by force */
+#define OBD_CONNECT_BRW_SIZE 0x40000ULL /*Max bytes per rpc */
+#define OBD_CONNECT_QUOTA64 0x80000ULL /*Not used since 2.4 */
+#define OBD_CONNECT_MDS_CAPA 0x100000ULL /*MDS capability */
+#define OBD_CONNECT_OSS_CAPA 0x200000ULL /*OSS capability */
+#define OBD_CONNECT_CANCELSET 0x400000ULL /*Early batched cancels. */
+#define OBD_CONNECT_SOM 0x800000ULL /*Size on MDS */
+#define OBD_CONNECT_AT 0x1000000ULL /*client uses AT */
+#define OBD_CONNECT_LRU_RESIZE 0x2000000ULL /*LRU resize feature. */
+#define OBD_CONNECT_MDS_MDS 0x4000000ULL /*MDS-MDS connection */
+#define OBD_CONNECT_REAL 0x8000000ULL /*real connection */
+#define OBD_CONNECT_CHANGE_QS 0x10000000ULL /*Not used since 2.4 */
+#define OBD_CONNECT_CKSUM 0x20000000ULL /*support several cksum algos*/
+#define OBD_CONNECT_FID 0x40000000ULL /*FID is supported by server */
+#define OBD_CONNECT_VBR 0x80000000ULL /*version based recovery */
+#define OBD_CONNECT_LOV_V3 0x100000000ULL /*client supports LOV v3 EA */
+#define OBD_CONNECT_GRANT_SHRINK 0x200000000ULL /* support grant shrink */
+#define OBD_CONNECT_SKIP_ORPHAN 0x400000000ULL /* don't reuse orphan objids */
+#define OBD_CONNECT_MAX_EASIZE 0x800000000ULL /* preserved for large EA */
+#define OBD_CONNECT_FULL20 0x1000000000ULL /* it is 2.0 client */
+#define OBD_CONNECT_LAYOUTLOCK 0x2000000000ULL /* client uses layout lock */
+#define OBD_CONNECT_64BITHASH 0x4000000000ULL /* client supports 64-bits
+ * directory hash */
+#define OBD_CONNECT_MAXBYTES 0x8000000000ULL /* max stripe size */
+#define OBD_CONNECT_IMP_RECOV 0x10000000000ULL /* imp recovery support */
+#define OBD_CONNECT_JOBSTATS 0x20000000000ULL /* jobid in ptlrpc_body */
+#define OBD_CONNECT_UMASK 0x40000000000ULL /* create uses client umask */
+#define OBD_CONNECT_EINPROGRESS 0x80000000000ULL /* client handles -EINPROGRESS
+ * RPC error properly */
+#define OBD_CONNECT_GRANT_PARAM 0x100000000000ULL/* extra grant params used for
+ * finer space reservation */
+#define OBD_CONNECT_FLOCK_OWNER 0x200000000000ULL /* for the fixed 1.8
+ * policy and 2.x server */
+#define OBD_CONNECT_LVB_TYPE 0x400000000000ULL /* variable type of LVB */
+#define OBD_CONNECT_NANOSEC_TIME 0x800000000000ULL /* nanosecond timestamps */
+#define OBD_CONNECT_LIGHTWEIGHT 0x1000000000000ULL/* lightweight connection */
+#define OBD_CONNECT_SHORTIO 0x2000000000000ULL/* short io */
+#define OBD_CONNECT_PINGLESS 0x4000000000000ULL/* pings not required */
+/* XXX README XXX:
+ * Please DO NOT add flag values here before first ensuring that this same
+ * flag value is not in use on some other branch. Please clear any such
+ * changes with senior engineers before starting to use a new flag. Then,
+ * submit a small patch against EVERY branch that ONLY adds the new flag,
+ * updates obd_connect_names[] for lprocfs_rd_connect_flags(), adds the
+ * flag to check_obd_connect_data(), and updates wiretests accordingly, so it
+ * can be approved and landed easily to reserve the flag for future use. */
+
+/* The MNE_SWAB flag is overloading the MDS_MDS bit only for the MGS
+ * connection. It is a temporary bug fix for Imperative Recovery interop
+ * between 2.2 and 2.3 x86/ppc nodes, and can be removed when interop for
+ * 2.2 clients/servers is no longer needed. LU-1252/LU-1644. */
+#define OBD_CONNECT_MNE_SWAB OBD_CONNECT_MDS_MDS
+
+#define OCD_HAS_FLAG(ocd, flg) \
+ (!!((ocd)->ocd_connect_flags & OBD_CONNECT_##flg))
+
+#define LRU_RESIZE_CONNECT_FLAG OBD_CONNECT_LRU_RESIZE
+
+#define MDT_CONNECT_SUPPORTED (OBD_CONNECT_RDONLY | OBD_CONNECT_VERSION | \
+ OBD_CONNECT_ACL | OBD_CONNECT_XATTR | \
+ OBD_CONNECT_IBITS | \
+ OBD_CONNECT_NODEVOH | OBD_CONNECT_ATTRFID | \
+ OBD_CONNECT_CANCELSET | OBD_CONNECT_AT | \
+ OBD_CONNECT_RMT_CLIENT | \
+ OBD_CONNECT_RMT_CLIENT_FORCE | \
+ OBD_CONNECT_BRW_SIZE | OBD_CONNECT_MDS_CAPA | \
+ OBD_CONNECT_OSS_CAPA | OBD_CONNECT_MDS_MDS | \
+ OBD_CONNECT_FID | LRU_RESIZE_CONNECT_FLAG | \
+ OBD_CONNECT_VBR | OBD_CONNECT_LOV_V3 | \
+ OBD_CONNECT_SOM | OBD_CONNECT_FULL20 | \
+ OBD_CONNECT_64BITHASH | OBD_CONNECT_JOBSTATS | \
+ OBD_CONNECT_EINPROGRESS | \
+ OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_UMASK | \
+ OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LAYOUTLOCK |\
+ OBD_CONNECT_PINGLESS)
+#define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \
+ OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \
+ OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \
+ OBD_CONNECT_BRW_SIZE | OBD_CONNECT_OSS_CAPA | \
+ OBD_CONNECT_CANCELSET | OBD_CONNECT_AT | \
+ LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_CKSUM | \
+ OBD_CONNECT_RMT_CLIENT | \
+ OBD_CONNECT_RMT_CLIENT_FORCE | OBD_CONNECT_VBR | \
+ OBD_CONNECT_MDS | OBD_CONNECT_SKIP_ORPHAN | \
+ OBD_CONNECT_GRANT_SHRINK | OBD_CONNECT_FULL20 | \
+ OBD_CONNECT_64BITHASH | OBD_CONNECT_MAXBYTES | \
+ OBD_CONNECT_MAX_EASIZE | \
+ OBD_CONNECT_EINPROGRESS | \
+ OBD_CONNECT_JOBSTATS | \
+ OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_LVB_TYPE|\
+ OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_FID | \
+ OBD_CONNECT_PINGLESS)
+#define ECHO_CONNECT_SUPPORTED (0)
+#define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT | \
+ OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | \
+ OBD_CONNECT_MNE_SWAB | OBD_CONNECT_PINGLESS)
+
+/* Features required for this version of the client to work with server */
+#define CLIENT_CONNECT_MDT_REQD (OBD_CONNECT_IBITS | OBD_CONNECT_FID | \
+ OBD_CONNECT_FULL20)
+
+#define OBD_OCD_VERSION(major,minor,patch,fix) (((major)<<24) + ((minor)<<16) +\
+ ((patch)<<8) + (fix))
+#define OBD_OCD_VERSION_MAJOR(version) ((int)((version)>>24)&255)
+#define OBD_OCD_VERSION_MINOR(version) ((int)((version)>>16)&255)
+#define OBD_OCD_VERSION_PATCH(version) ((int)((version)>>8)&255)
+#define OBD_OCD_VERSION_FIX(version) ((int)(version)&255)
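+
+/*
+ * Worked example (illustrative only): Lustre version 2.4.0.0 encodes as
+ *
+ *	OBD_OCD_VERSION(2, 4, 0, 0) == 0x02040000
+ *
+ * and decodes with OBD_OCD_VERSION_MAJOR(0x02040000) == 2,
+ * OBD_OCD_VERSION_MINOR(0x02040000) == 4, and so on.
+ */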
+
+/* This structure is used for both request and reply.
+ *
+ * If we eventually have separate connect data for different types, which we
+ * almost certainly will, then perhaps we stick a union in here. */
+struct obd_connect_data_v1 {
+ __u64 ocd_connect_flags; /* OBD_CONNECT_* per above */
+ __u32 ocd_version; /* lustre release version number */
+ __u32 ocd_grant; /* initial cache grant amount (bytes) */
+ __u32 ocd_index; /* LOV index to connect to */
+ __u32 ocd_brw_size; /* Maximum BRW size in bytes, must be 2^n */
+ __u64 ocd_ibits_known; /* inode bits this client understands */
+ __u8 ocd_blocksize; /* log2 of the backend filesystem blocksize */
+ __u8 ocd_inodespace; /* log2 of the per-inode space consumption */
+ __u16 ocd_grant_extent; /* per-extent grant overhead, in 1K blocks */
+ __u32 ocd_unused; /* also fix lustre_swab_connect */
+ __u64 ocd_transno; /* first transno from client to be replayed */
+ __u32 ocd_group; /* MDS group on OST */
+ __u32 ocd_cksum_types; /* supported checksum algorithms */
+ __u32 ocd_max_easize; /* How big LOV EA can be on MDS */
+ __u32 ocd_instance; /* also fix lustre_swab_connect */
+ __u64 ocd_maxbytes; /* Maximum stripe size in bytes */
+};
+
+struct obd_connect_data {
+ __u64 ocd_connect_flags; /* OBD_CONNECT_* per above */
+ __u32 ocd_version; /* lustre release version number */
+ __u32 ocd_grant; /* initial cache grant amount (bytes) */
+ __u32 ocd_index; /* LOV index to connect to */
+ __u32 ocd_brw_size; /* Maximum BRW size in bytes */
+ __u64 ocd_ibits_known; /* inode bits this client understands */
+ __u8 ocd_blocksize; /* log2 of the backend filesystem blocksize */
+ __u8 ocd_inodespace; /* log2 of the per-inode space consumption */
+ __u16 ocd_grant_extent; /* per-extent grant overhead, in 1K blocks */
+ __u32 ocd_unused; /* also fix lustre_swab_connect */
+ __u64 ocd_transno; /* first transno from client to be replayed */
+ __u32 ocd_group; /* MDS group on OST */
+ __u32 ocd_cksum_types; /* supported checksum algorithms */
+ __u32 ocd_max_easize; /* How big LOV EA can be on MDS */
+ __u32 ocd_instance; /* instance # of this target */
+ __u64 ocd_maxbytes; /* Maximum stripe size in bytes */
+ /* Fields after ocd_maxbytes are only accessible by the receiver
+ * if the corresponding flag in ocd_connect_flags is set. Accessing
+ * any field after ocd_maxbytes on the receiver without a valid flag
+ * may result in out-of-bound memory access and kernel oops. */
+ __u64 padding1; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 padding2; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 padding3; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 padding4; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 padding5; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 padding6; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 padding7; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 padding8; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 padding9; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 paddingA; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 paddingB; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 paddingC; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 paddingD; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 paddingE; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 paddingF; /* added 2.1.0. also fix lustre_swab_connect */
+};
+/* XXX README XXX:
+ * Please DO NOT use any fields here before first ensuring that this same
+ * field is not in use on some other branch. Please clear any such changes
+ * with senior engineers before starting to use a new field. Then, submit
+ * a small patch against EVERY branch that ONLY adds the new field along with
+ * the matching OBD_CONNECT flag, so that can be approved and landed easily to
+ * reserve the flag for future use. */
+
+extern void lustre_swab_connect(struct obd_connect_data *ocd);
+
+/*
+ * Supported checksum algorithms. Up to 32 checksum types are supported.
+ * (32-bit mask stored in obd_connect_data::ocd_cksum_types)
+ * Please update DECLARE_CKSUM_NAME/OBD_CKSUM_ALL in obd.h when adding a new
+ * algorithm and also the OBD_FL_CKSUM* flags.
+ */
+typedef enum {
+ OBD_CKSUM_CRC32 = 0x00000001,
+ OBD_CKSUM_ADLER = 0x00000002,
+ OBD_CKSUM_CRC32C = 0x00000004,
+} cksum_type_t;
+
+/*
+ * OST requests: OBDO & OBD request records
+ */
+
+/* opcodes */
+typedef enum {
+ OST_REPLY = 0, /* reply ? */
+ OST_GETATTR = 1,
+ OST_SETATTR = 2,
+ OST_READ = 3,
+ OST_WRITE = 4,
+ OST_CREATE = 5,
+ OST_DESTROY = 6,
+ OST_GET_INFO = 7,
+ OST_CONNECT = 8,
+ OST_DISCONNECT = 9,
+ OST_PUNCH = 10,
+ OST_OPEN = 11,
+ OST_CLOSE = 12,
+ OST_STATFS = 13,
+ OST_SYNC = 16,
+ OST_SET_INFO = 17,
+ OST_QUOTACHECK = 18,
+ OST_QUOTACTL = 19,
+ OST_QUOTA_ADJUST_QUNIT = 20, /* not used since 2.4 */
+ OST_LAST_OPC
+} ost_cmd_t;
+#define OST_FIRST_OPC OST_REPLY
+
+enum obdo_flags {
+ OBD_FL_INLINEDATA = 0x00000001,
+ OBD_FL_OBDMDEXISTS = 0x00000002,
+ OBD_FL_DELORPHAN = 0x00000004, /* if set in o_flags delete orphans */
+ OBD_FL_NORPC = 0x00000008, /* set in o_flags do in OSC not OST */
+ OBD_FL_IDONLY = 0x00000010, /* set in o_flags only adjust obj id*/
+ OBD_FL_RECREATE_OBJS = 0x00000020, /* recreate missing obj */
+ OBD_FL_DEBUG_CHECK = 0x00000040, /* echo client/server debug check */
+ OBD_FL_NO_USRQUOTA = 0x00000100, /* the object's owner is over quota */
+ OBD_FL_NO_GRPQUOTA = 0x00000200, /* the object's group is over quota */
+ OBD_FL_CREATE_CROW = 0x00000400, /* object should be created on write */
+ OBD_FL_SRVLOCK = 0x00000800, /* delegate DLM locking to server */
+ OBD_FL_CKSUM_CRC32 = 0x00001000, /* CRC32 checksum type */
+ OBD_FL_CKSUM_ADLER = 0x00002000, /* ADLER checksum type */
+ OBD_FL_CKSUM_CRC32C = 0x00004000, /* CRC32C checksum type */
+ OBD_FL_CKSUM_RSVD2 = 0x00008000, /* for future cksum types */
+ OBD_FL_CKSUM_RSVD3 = 0x00010000, /* for future cksum types */
+ OBD_FL_SHRINK_GRANT = 0x00020000, /* object shrink the grant */
+ OBD_FL_MMAP = 0x00040000, /* object is mmapped on the client.
+ * XXX: obsoleted - reserved for old
+ * clients prior to 2.2 */
+ OBD_FL_RECOV_RESEND = 0x00080000, /* recoverable resent */
+ OBD_FL_NOSPC_BLK = 0x00100000, /* no more block space on OST */
+
+ /* Note that while these checksum values are currently separate bits,
+ * in 2.x we can actually allow all values from 1-31 if we wanted. */
+ OBD_FL_CKSUM_ALL = OBD_FL_CKSUM_CRC32 | OBD_FL_CKSUM_ADLER |
+ OBD_FL_CKSUM_CRC32C,
+
+ /* mask for local-only flag, which won't be sent over network */
+ OBD_FL_LOCAL_MASK = 0xF0000000,
+};
+
+#define LOV_MAGIC_V1 0x0BD10BD0
+#define LOV_MAGIC LOV_MAGIC_V1
+#define LOV_MAGIC_JOIN_V1 0x0BD20BD0
+#define LOV_MAGIC_V3 0x0BD30BD0
+
+/*
+ * magic for fully defined striping
+ * the idea is that we should have different magics for striping "hints"
+ * (struct lov_user_md_v[13]) and defined ready-to-use striping (struct
+ * lov_mds_md_v[13]). at the moment the magics are used in the wire
+ * protocol, so we can't just change them without a long preparation
+ * period, but we still need a mechanism to allow LOD to differentiate
+ * hint versus ready striping. so, at the moment we use a trick: MDT
+ * knows what to expect from a request depending on the case (replay
+ * uses ready striping, a non-replay req uses hints), so MDT replaces
+ * the magic with the appropriate one and LOD can easily understand
+ * what's inside -bzzz
+ */
+#define LOV_MAGIC_V1_DEF 0x0CD10BD0
+#define LOV_MAGIC_V3_DEF 0x0CD30BD0
+
+#define LOV_PATTERN_RAID0 0x001 /* stripes are used round-robin */
+#define LOV_PATTERN_RAID1 0x002 /* stripes are mirrors of each other */
+#define LOV_PATTERN_FIRST 0x100 /* first stripe is not in round-robin */
+#define LOV_PATTERN_CMOBD 0x200
+
+#define lov_ost_data lov_ost_data_v1
+struct lov_ost_data_v1 { /* per-stripe data structure (little-endian)*/
+ struct ost_id l_ost_oi; /* OST object ID */
+ __u32 l_ost_gen; /* generation of this l_ost_idx */
+ __u32 l_ost_idx; /* OST index in LOV (lov_tgt_desc->tgts) */
+};
+
+#define lov_mds_md lov_mds_md_v1
+struct lov_mds_md_v1 { /* LOV EA mds/wire data (little-endian) */
+ __u32 lmm_magic; /* magic number = LOV_MAGIC_V1 */
+ __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+ struct ost_id lmm_oi; /* LOV object ID */
+ __u32 lmm_stripe_size; /* size of stripe in bytes */
+ /* lmm_stripe_count used to be __u32 */
+ __u16 lmm_stripe_count; /* num stripes in use for this object */
+ __u16 lmm_layout_gen; /* layout generation number */
+ struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+};
+
+/**
+ * Sigh, because pre-2.4 uses
+ * struct lov_mds_md_v1 {
+ * ........
+ * __u64 lmm_object_id;
+ * __u64 lmm_object_seq;
+ * ......
+ * }
+ * to identify the LOV(MDT) object, and lmm_object_seq will
+ * be a normal FID sequence, which makes it hard to fold this
+ * conversion into ostid_to_fid(), so we do the lmm_oi/fid
+ * conversion separately.
+ *
+ * We can tell the lmm_oi apart as follows:
+ * 1.8: lmm_object_id = {inode}, lmm_object_gr = 0
+ * 2.1: lmm_object_id = {oid < 128k}, lmm_object_seq = FID_SEQ_NORMAL
+ * 2.4: lmm_oi.f_seq = FID_SEQ_NORMAL, lmm_oi.f_oid = {oid < 128k},
+ * lmm_oi.f_ver = 0
+ *
+ * But currently lmm_oi/lsm_oi does not have any "real" usages,
+ * except for printing some information, and the user can always
+ * get the real FID from the LMA; besides, this multiple-case check
+ * might make the swab code more complicated. So we will keep using
+ * id/seq for lmm_oi.
+ */
+
+static inline void fid_to_lmm_oi(const struct lu_fid *fid,
+ struct ost_id *oi)
+{
+ oi->oi.oi_id = fid_oid(fid);
+ oi->oi.oi_seq = fid_seq(fid);
+}
+
+static inline void lmm_oi_set_seq(struct ost_id *oi, __u64 seq)
+{
+ oi->oi.oi_seq = seq;
+}
+
+static inline __u64 lmm_oi_id(struct ost_id *oi)
+{
+ return oi->oi.oi_id;
+}
+
+static inline __u64 lmm_oi_seq(struct ost_id *oi)
+{
+ return oi->oi.oi_seq;
+}
+
+static inline void lmm_oi_le_to_cpu(struct ost_id *dst_oi,
+ struct ost_id *src_oi)
+{
+ dst_oi->oi.oi_id = le64_to_cpu(src_oi->oi.oi_id);
+ dst_oi->oi.oi_seq = le64_to_cpu(src_oi->oi.oi_seq);
+}
+
+static inline void lmm_oi_cpu_to_le(struct ost_id *dst_oi,
+ struct ost_id *src_oi)
+{
+ dst_oi->oi.oi_id = cpu_to_le64(src_oi->oi.oi_id);
+ dst_oi->oi.oi_seq = cpu_to_le64(src_oi->oi.oi_seq);
+}
+
+/* extern void lustre_swab_lov_mds_md(struct lov_mds_md *llm); */
+
+#define MAX_MD_SIZE (sizeof(struct lov_mds_md) + 4 * sizeof(struct lov_ost_data))
+#define MIN_MD_SIZE (sizeof(struct lov_mds_md) + 1 * sizeof(struct lov_ost_data))
+
+#define XATTR_NAME_ACL_ACCESS "system.posix_acl_access"
+#define XATTR_NAME_ACL_DEFAULT "system.posix_acl_default"
+#define XATTR_USER_PREFIX "user."
+#define XATTR_TRUSTED_PREFIX "trusted."
+#define XATTR_SECURITY_PREFIX "security."
+#define XATTR_LUSTRE_PREFIX "lustre."
+
+#define XATTR_NAME_LOV "trusted.lov"
+#define XATTR_NAME_LMA "trusted.lma"
+#define XATTR_NAME_LMV "trusted.lmv"
+#define XATTR_NAME_LINK "trusted.link"
+#define XATTR_NAME_FID "trusted.fid"
+#define XATTR_NAME_VERSION "trusted.version"
+#define XATTR_NAME_SOM "trusted.som"
+#define XATTR_NAME_HSM "trusted.hsm"
+#define XATTR_NAME_LFSCK_NAMESPACE "trusted.lfsck_namespace"
+
+struct lov_mds_md_v3 { /* LOV EA mds/wire data (little-endian) */
+ __u32 lmm_magic; /* magic number = LOV_MAGIC_V3 */
+ __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+ struct ost_id lmm_oi; /* LOV object ID */
+ __u32 lmm_stripe_size; /* size of stripe in bytes */
+ /* lmm_stripe_count used to be __u32 */
+ __u16 lmm_stripe_count; /* num stripes in use for this object */
+ __u16 lmm_layout_gen; /* layout generation number */
+ char lmm_pool_name[LOV_MAXPOOLNAME]; /* must be 32bit aligned */
+ struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+};
+
+#define OBD_MD_FLID (0x00000001ULL) /* object ID */
+#define OBD_MD_FLATIME (0x00000002ULL) /* access time */
+#define OBD_MD_FLMTIME (0x00000004ULL) /* data modification time */
+#define OBD_MD_FLCTIME (0x00000008ULL) /* change time */
+#define OBD_MD_FLSIZE (0x00000010ULL) /* size */
+#define OBD_MD_FLBLOCKS (0x00000020ULL) /* allocated blocks count */
+#define OBD_MD_FLBLKSZ (0x00000040ULL) /* block size */
+#define OBD_MD_FLMODE (0x00000080ULL) /* access bits (mode & ~S_IFMT) */
+#define OBD_MD_FLTYPE (0x00000100ULL) /* object type (mode & S_IFMT) */
+#define OBD_MD_FLUID (0x00000200ULL) /* user ID */
+#define OBD_MD_FLGID (0x00000400ULL) /* group ID */
+#define OBD_MD_FLFLAGS (0x00000800ULL) /* flags word */
+#define OBD_MD_FLNLINK (0x00002000ULL) /* link count */
+#define OBD_MD_FLGENER (0x00004000ULL) /* generation number */
+/*#define OBD_MD_FLINLINE (0x00008000ULL) inline data. used until 1.6.5 */
+#define OBD_MD_FLRDEV (0x00010000ULL) /* device number */
+#define OBD_MD_FLEASIZE (0x00020000ULL) /* extended attribute data */
+#define OBD_MD_LINKNAME (0x00040000ULL) /* symbolic link target */
+#define OBD_MD_FLHANDLE (0x00080000ULL) /* file/lock handle */
+#define OBD_MD_FLCKSUM (0x00100000ULL) /* bulk data checksum */
+#define OBD_MD_FLQOS (0x00200000ULL) /* quality of service stats */
+/*#define OBD_MD_FLOSCOPQ (0x00400000ULL) osc opaque data, never used */
+#define OBD_MD_FLCOOKIE (0x00800000ULL) /* log cancellation cookie */
+#define OBD_MD_FLGROUP (0x01000000ULL) /* group */
+#define OBD_MD_FLFID (0x02000000ULL) /* ->ost write inline fid */
+#define OBD_MD_FLEPOCH (0x04000000ULL) /* ->ost write with ioepoch */
+ /* ->mds if epoch opens or closes */
+#define OBD_MD_FLGRANT (0x08000000ULL) /* ost preallocation space grant */
+#define OBD_MD_FLDIREA (0x10000000ULL) /* dir's extended attribute data */
+#define OBD_MD_FLUSRQUOTA (0x20000000ULL) /* over quota flags sent from ost */
+#define OBD_MD_FLGRPQUOTA (0x40000000ULL) /* over quota flags sent from ost */
+#define OBD_MD_FLMODEASIZE (0x80000000ULL) /* EA size will be changed */
+
+#define OBD_MD_MDS (0x0000000100000000ULL) /* where an inode lives on */
+#define OBD_MD_REINT (0x0000000200000000ULL) /* reintegrate oa */
+#define OBD_MD_MEA (0x0000000400000000ULL) /* CMD split EA */
+
+/* OBD_MD_MDTIDX is used to get the MDT index, but it has never been used
+ * over the wire, and it has been obsolete since 2.3 */
+/* #define OBD_MD_MDTIDX (0x0000000800000000ULL) */
+
+#define OBD_MD_FLXATTR (0x0000001000000000ULL) /* xattr */
+#define OBD_MD_FLXATTRLS (0x0000002000000000ULL) /* xattr list */
+#define OBD_MD_FLXATTRRM (0x0000004000000000ULL) /* xattr remove */
+#define OBD_MD_FLACL (0x0000008000000000ULL) /* ACL */
+#define OBD_MD_FLRMTPERM (0x0000010000000000ULL) /* remote permission */
+#define OBD_MD_FLMDSCAPA (0x0000020000000000ULL) /* MDS capability */
+#define OBD_MD_FLOSSCAPA (0x0000040000000000ULL) /* OSS capability */
+#define OBD_MD_FLCKSPLIT (0x0000080000000000ULL) /* Check split on server */
+#define OBD_MD_FLCROSSREF (0x0000100000000000ULL) /* Cross-ref case */
+#define OBD_MD_FLGETATTRLOCK (0x0000200000000000ULL) /* Get IOEpoch attributes
+ * under lock */
+#define OBD_MD_FLOBJCOUNT (0x0000400000000000ULL) /* for multiple destroy */
+
+#define OBD_MD_FLRMTLSETFACL (0x0001000000000000ULL) /* lfs lsetfacl case */
+#define OBD_MD_FLRMTLGETFACL (0x0002000000000000ULL) /* lfs lgetfacl case */
+#define OBD_MD_FLRMTRSETFACL (0x0004000000000000ULL) /* lfs rsetfacl case */
+#define OBD_MD_FLRMTRGETFACL (0x0008000000000000ULL) /* lfs rgetfacl case */
+
+#define OBD_MD_FLDATAVERSION (0x0010000000000000ULL) /* iversion sum */
+
+#define OBD_MD_FLGETATTR (OBD_MD_FLID | OBD_MD_FLATIME | OBD_MD_FLMTIME | \
+ OBD_MD_FLCTIME | OBD_MD_FLSIZE | OBD_MD_FLBLKSZ | \
+ OBD_MD_FLMODE | OBD_MD_FLTYPE | OBD_MD_FLUID | \
+ OBD_MD_FLGID | OBD_MD_FLFLAGS | OBD_MD_FLNLINK | \
+ OBD_MD_FLGENER | OBD_MD_FLRDEV | OBD_MD_FLGROUP)
+
+/* don't forget obdo_fid which is way down at the bottom so it can
+ * come after the definition of llog_cookie */
+
+enum hss_valid {
+ HSS_SETMASK = 0x01,
+ HSS_CLEARMASK = 0x02,
+ HSS_ARCHIVE_ID = 0x04,
+};
+
+struct hsm_state_set {
+ __u32 hss_valid;
+ __u32 hss_archive_id;
+ __u64 hss_setmask;
+ __u64 hss_clearmask;
+};
+
+extern void lustre_swab_hsm_user_state(struct hsm_user_state *hus);
+extern void lustre_swab_hsm_state_set(struct hsm_state_set *hss);
+
+extern void lustre_swab_obd_statfs (struct obd_statfs *os);
+
+/* ost_body.data values for OST_BRW */
+
+#define OBD_BRW_READ 0x01
+#define OBD_BRW_WRITE 0x02
+#define OBD_BRW_RWMASK (OBD_BRW_READ | OBD_BRW_WRITE)
+#define OBD_BRW_SYNC 0x08 /* this page is a part of synchronous
+ * transfer and is not accounted in
+ * the grant. */
+#define OBD_BRW_CHECK 0x10
+#define OBD_BRW_FROM_GRANT 0x20 /* the osc manages this under llite */
+#define OBD_BRW_GRANTED 0x40 /* the ost manages this */
+#define OBD_BRW_NOCACHE 0x80 /* this page is a part of non-cached IO */
+#define OBD_BRW_NOQUOTA 0x100
+#define OBD_BRW_SRVLOCK 0x200 /* Client holds no lock over this page */
+#define OBD_BRW_ASYNC 0x400 /* Server may delay commit to disk */
+#define OBD_BRW_MEMALLOC 0x800 /* Client runs in the "kswapd" context */
+#define OBD_BRW_OVER_USRQUOTA 0x1000 /* Running out of user quota */
+#define OBD_BRW_OVER_GRPQUOTA 0x2000 /* Running out of group quota */
+
+#define OBD_OBJECT_EOF 0xffffffffffffffffULL
+
+#define OST_MIN_PRECREATE 32
+#define OST_MAX_PRECREATE 20000
+
+struct obd_ioobj {
+ struct ost_id ioo_oid; /* object ID, if multi-obj BRW */
+ __u32 ioo_max_brw; /* low 16 bits were o_mode before 2.4,
+ * now (PTLRPC_BULK_OPS_COUNT - 1) in
+ * high 16 bits in 2.4 and later */
+ __u32 ioo_bufcnt; /* number of niobufs for this object */
+};
+
+#define IOOBJ_MAX_BRW_BITS 16
+#define IOOBJ_TYPE_MASK ((1U << IOOBJ_MAX_BRW_BITS) - 1)
+#define ioobj_max_brw_get(ioo) (((ioo)->ioo_max_brw >> IOOBJ_MAX_BRW_BITS) + 1)
+#define ioobj_max_brw_set(ioo, num) \
+do { (ioo)->ioo_max_brw = ((num) - 1) << IOOBJ_MAX_BRW_BITS; } while (0)
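+
+/*
+ * Worked example (illustrative only): the BRW count is stored biased by
+ * one in the high 16 bits of ioo_max_brw, so for 4 BRW RPCs in flight:
+ *
+ *	ioobj_max_brw_set(ioo, 4);	(ioo->ioo_max_brw == 3 << 16)
+ *	ioobj_max_brw_get(ioo)		== (0x30000 >> 16) + 1 == 4
+ */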
+
+extern void lustre_swab_obd_ioobj (struct obd_ioobj *ioo);
+
+/* multiple of 8 bytes => can array */
+struct niobuf_remote {
+ __u64 offset;
+ __u32 len;
+ __u32 flags;
+};
+
+extern void lustre_swab_niobuf_remote (struct niobuf_remote *nbr);
+
+/* lock value block communicated between the filter and llite */
+
+/* OST_LVB_ERR_INIT is needed because the return code in rc is
+ * negative, i.e. because ((MASK + rc) & MASK) != MASK. */
+#define OST_LVB_ERR_INIT 0xffbadbad80000000ULL
+#define OST_LVB_ERR_MASK 0xffbadbad00000000ULL
+#define OST_LVB_IS_ERR(blocks) \
+ ((blocks & OST_LVB_ERR_MASK) == OST_LVB_ERR_MASK)
+#define OST_LVB_SET_ERR(blocks, rc) \
+ do { blocks = OST_LVB_ERR_INIT + rc; } while (0)
+#define OST_LVB_GET_ERR(blocks) (int)(blocks - OST_LVB_ERR_INIT)
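+
+/*
+ * Worked example (illustrative only): encoding rc = -5 (-EIO) into a
+ * blocks value and decoding it again:
+ *
+ *	OST_LVB_SET_ERR(lvb->lvb_blocks, -5);
+ *	OST_LVB_IS_ERR(lvb->lvb_blocks)		(true: the high bits
+ *						 still match the mask)
+ *	OST_LVB_GET_ERR(lvb->lvb_blocks)	== -5
+ */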
+
+struct ost_lvb_v1 {
+ __u64 lvb_size;
+ obd_time lvb_mtime;
+ obd_time lvb_atime;
+ obd_time lvb_ctime;
+ __u64 lvb_blocks;
+};
+
+extern void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb);
+
+struct ost_lvb {
+ __u64 lvb_size;
+ obd_time lvb_mtime;
+ obd_time lvb_atime;
+ obd_time lvb_ctime;
+ __u64 lvb_blocks;
+ __u32 lvb_mtime_ns;
+ __u32 lvb_atime_ns;
+ __u32 lvb_ctime_ns;
+ __u32 lvb_padding;
+};
+
+extern void lustre_swab_ost_lvb(struct ost_lvb *lvb);
+
+/*
+ * lquota data structures
+ */
+
+#ifndef QUOTABLOCK_BITS
+#define QUOTABLOCK_BITS 10
+#endif
+
+#ifndef QUOTABLOCK_SIZE
+#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
+#endif
+
+#ifndef toqb
+#define toqb(x) (((x) + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS)
+#endif
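+
+/*
+ * Worked example (illustrative only): toqb() converts a byte count into
+ * 1KiB quota blocks, rounding up:
+ *
+ *	toqb(0) == 0, toqb(1) == 1, toqb(4096) == 4, toqb(4097) == 5
+ */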
+
+/* The lquota_id structure is a union of all the possible identifier types
+ * that can be used with quota; this includes:
+ * - 64-bit user ID
+ * - 64-bit group ID
+ * - a FID which can be used for per-directory quota in the future */
+union lquota_id {
+ struct lu_fid qid_fid; /* FID for per-directory quota */
+ __u64 qid_uid; /* user identifier */
+ __u64 qid_gid; /* group identifier */
+};
+
+/* quotactl management */
+struct obd_quotactl {
+ __u32 qc_cmd;
+ __u32 qc_type; /* see Q_* flag below */
+ __u32 qc_id;
+ __u32 qc_stat;
+ struct obd_dqinfo qc_dqinfo;
+ struct obd_dqblk qc_dqblk;
+};
+
+extern void lustre_swab_obd_quotactl(struct obd_quotactl *q);
+
+#define Q_QUOTACHECK 0x800100 /* deprecated as of 2.4 */
+#define Q_INITQUOTA 0x800101 /* deprecated as of 2.4 */
+#define Q_GETOINFO 0x800102 /* get obd quota info */
+#define Q_GETOQUOTA 0x800103 /* get obd quotas */
+#define Q_FINVALIDATE 0x800104 /* deprecated as of 2.4 */
+
+#define Q_COPY(out, in, member) (out)->member = (in)->member
+
+#define QCTL_COPY(out, in) \
+do { \
+ Q_COPY(out, in, qc_cmd); \
+ Q_COPY(out, in, qc_type); \
+ Q_COPY(out, in, qc_id); \
+ Q_COPY(out, in, qc_stat); \
+ Q_COPY(out, in, qc_dqinfo); \
+ Q_COPY(out, in, qc_dqblk); \
+} while (0)
+
+/* Body of quota request used for quota acquire/release RPCs between quota
+ * master (aka QMT) and slaves (aka QSD). */
+struct quota_body {
+ struct lu_fid qb_fid; /* FID of global index packing the pool ID
+ * and type (data or metadata) as well as
+ * the quota type (user or group). */
+ union lquota_id qb_id; /* uid or gid or directory FID */
+ __u32 qb_flags; /* see below */
+ __u32 qb_padding;
+ __u64 qb_count; /* acquire/release count (kbytes/inodes) */
+ __u64 qb_usage; /* current slave usage (kbytes/inodes) */
+ __u64 qb_slv_ver; /* slave index file version */
+ struct lustre_handle qb_lockh; /* per-ID lock handle */
+ struct lustre_handle qb_glb_lockh; /* global lock handle */
+ __u64 qb_padding1[4];
+};
+
+/* When quota_body is used in the reply to the quota global intent
+ * lock (IT_QUOTA_CONN), qb_fid contains the slave index file FID. */
+#define qb_slv_fid qb_fid
+/* qb_usage is the current qunit (in kbytes/inodes) when quota_body is used in
+ * quota reply */
+#define qb_qunit qb_usage
+
+#define QUOTA_DQACQ_FL_ACQ 0x1 /* acquire quota */
+#define QUOTA_DQACQ_FL_PREACQ 0x2 /* pre-acquire */
+#define QUOTA_DQACQ_FL_REL 0x4 /* release quota */
+#define QUOTA_DQACQ_FL_REPORT 0x8 /* report usage */
+
+extern void lustre_swab_quota_body(struct quota_body *b);
+
+/* Quota types currently supported */
+enum {
+ LQUOTA_TYPE_USR = 0x00, /* maps to USRQUOTA */
+ LQUOTA_TYPE_GRP = 0x01, /* maps to GRPQUOTA */
+ LQUOTA_TYPE_MAX
+};
+
+/* There are 2 different resource types on which a quota limit can be enforced:
+ * - inodes on the MDTs
+ * - blocks on the OSTs */
+enum {
+ LQUOTA_RES_MD = 0x01, /* skip 0 to avoid null oid in FID */
+ LQUOTA_RES_DT = 0x02,
+ LQUOTA_LAST_RES,
+ LQUOTA_FIRST_RES = LQUOTA_RES_MD
+};
+#define LQUOTA_NR_RES (LQUOTA_LAST_RES - LQUOTA_FIRST_RES + 1)
+
+/*
+ * Space accounting support
+ * Format of an accounting record, providing disk usage information for a given
+ * user or group
+ */
+struct lquota_acct_rec { /* 16 bytes */
+ __u64 bspace; /* current space in use */
+ __u64 ispace; /* current # inodes in use */
+};
+
+/*
+ * Global quota index support
+ * Format of a global record, providing global quota settings for a given quota
+ * identifier
+ */
+struct lquota_glb_rec { /* 32 bytes */
+ __u64 qbr_hardlimit; /* quota hard limit, in #inodes or kbytes */
+ __u64 qbr_softlimit; /* quota soft limit, in #inodes or kbytes */
+ __u64 qbr_time; /* grace time, in seconds */
+ __u64 qbr_granted; /* how much is granted to slaves, in #inodes or
+ * kbytes */
+};
+
+/*
+ * Slave index support
+ * Format of a slave record, recording how much space is granted to a given
+ * slave
+ */
+struct lquota_slv_rec { /* 8 bytes */
+ __u64 qsr_granted; /* space granted to the slave for the key=ID,
+ * in #inodes or kbytes */
+};
+
+/* Data structures associated with the quota locks */
+
+/* Glimpse descriptor used for the index & per-ID quota locks */
+struct ldlm_gl_lquota_desc {
+ union lquota_id gl_id; /* quota ID subject to the glimpse */
+ __u64 gl_flags; /* see LQUOTA_FL* below */
+ __u64 gl_ver; /* new index version */
+ __u64 gl_hardlimit; /* new hardlimit or qunit value */
+ __u64 gl_softlimit; /* new softlimit */
+ __u64 gl_time;
+ __u64 gl_pad2;
+};
+#define gl_qunit gl_hardlimit /* current qunit value used when
+ * glimpsing per-ID quota locks */
+
+/* quota glimpse flags */
+#define LQUOTA_FL_EDQUOT 0x1 /* user/group out of quota space on QMT */
+
+/* LVB used with quota (global and per-ID) locks */
+struct lquota_lvb {
+ __u64 lvb_flags; /* see LQUOTA_FL* above */
+ __u64 lvb_id_may_rel; /* space that might be released later */
+ __u64 lvb_id_rel; /* space released by the slave for this ID */
+ __u64 lvb_id_qunit; /* current qunit value */
+ __u64 lvb_pad1;
+};
+
+extern void lustre_swab_lquota_lvb(struct lquota_lvb *lvb);
+
+/* LVB used with global quota lock */
+#define lvb_glb_ver lvb_id_may_rel /* current version of the global index */
+
+/* op codes */
+typedef enum {
+ QUOTA_DQACQ = 601,
+ QUOTA_DQREL = 602,
+ QUOTA_LAST_OPC
+} quota_cmd_t;
+#define QUOTA_FIRST_OPC QUOTA_DQACQ
+
+/*
+ * MDS REQ RECORDS
+ */
+
+/* opcodes */
+typedef enum {
+ MDS_GETATTR = 33,
+ MDS_GETATTR_NAME = 34,
+ MDS_CLOSE = 35,
+ MDS_REINT = 36,
+ MDS_READPAGE = 37,
+ MDS_CONNECT = 38,
+ MDS_DISCONNECT = 39,
+ MDS_GETSTATUS = 40,
+ MDS_STATFS = 41,
+ MDS_PIN = 42,
+ MDS_UNPIN = 43,
+ MDS_SYNC = 44,
+ MDS_DONE_WRITING = 45,
+ MDS_SET_INFO = 46,
+ MDS_QUOTACHECK = 47,
+ MDS_QUOTACTL = 48,
+ MDS_GETXATTR = 49,
+ MDS_SETXATTR = 50, /* obsolete, now it's MDS_REINT op */
+ MDS_WRITEPAGE = 51,
+ MDS_IS_SUBDIR = 52,
+ MDS_GET_INFO = 53,
+ MDS_HSM_STATE_GET = 54,
+ MDS_HSM_STATE_SET = 55,
+ MDS_HSM_ACTION = 56,
+ MDS_HSM_PROGRESS = 57,
+ MDS_HSM_REQUEST = 58,
+ MDS_HSM_CT_REGISTER = 59,
+ MDS_HSM_CT_UNREGISTER = 60,
+ MDS_SWAP_LAYOUTS = 61,
+ MDS_LAST_OPC
+} mds_cmd_t;
+
+#define MDS_FIRST_OPC MDS_GETATTR
+
+
+/* opcodes for object update */
+typedef enum {
+ UPDATE_OBJ = 1000,
+ UPDATE_LAST_OPC
+} update_cmd_t;
+
+#define UPDATE_FIRST_OPC UPDATE_OBJ
+
+/*
+ * Do not exceed 63
+ */
+
+typedef enum {
+ REINT_SETATTR = 1,
+ REINT_CREATE = 2,
+ REINT_LINK = 3,
+ REINT_UNLINK = 4,
+ REINT_RENAME = 5,
+ REINT_OPEN = 6,
+ REINT_SETXATTR = 7,
+ REINT_RMENTRY = 8,
+/*	REINT_WRITE	= 9, */
+ REINT_MAX
+} mds_reint_t, mdt_reint_t;
+
+extern void lustre_swab_generic_32s (__u32 *val);
+
+/* the disposition of the intent outlines what was executed */
+#define DISP_IT_EXECD 0x00000001
+#define DISP_LOOKUP_EXECD 0x00000002
+#define DISP_LOOKUP_NEG 0x00000004
+#define DISP_LOOKUP_POS 0x00000008
+#define DISP_OPEN_CREATE 0x00000010
+#define DISP_OPEN_OPEN 0x00000020
+#define DISP_ENQ_COMPLETE 0x00400000
+#define DISP_ENQ_OPEN_REF 0x00800000
+#define DISP_ENQ_CREATE_REF 0x01000000
+#define DISP_OPEN_LOCK 0x02000000
+
+/* INODE LOCK PARTS */
+#define MDS_INODELOCK_LOOKUP 0x000001 /* dentry, mode, owner, group */
+#define MDS_INODELOCK_UPDATE 0x000002 /* size, links, timestamps */
+#define MDS_INODELOCK_OPEN 0x000004 /* For opened files */
+#define MDS_INODELOCK_LAYOUT 0x000008 /* for layout */
+#define MDS_INODELOCK_PERM 0x000010 /* for permission */
+
+#define MDS_INODELOCK_MAXSHIFT 4
+/* This FULL lock is useful to take on unlink sort of operations */
+#define MDS_INODELOCK_FULL ((1<<(MDS_INODELOCK_MAXSHIFT+1))-1)
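+/* With MDS_INODELOCK_MAXSHIFT == 4, MDS_INODELOCK_FULL evaluates to
+ * (1 << 5) - 1 == 0x1f, i.e. LOOKUP | UPDATE | OPEN | LAYOUT | PERM. */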
+
+extern void lustre_swab_ll_fid (struct ll_fid *fid);
+
+/* NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2],
+ * but was moved into name[1] along with the OID to avoid consuming the
+ * name[2,3] fields that need to be used for the quota id (also a FID). */
+enum {
+ LUSTRE_RES_ID_SEQ_OFF = 0,
+ LUSTRE_RES_ID_VER_OID_OFF = 1,
+ LUSTRE_RES_ID_WAS_VER_OFF = 2, /* see note above */
+ LUSTRE_RES_ID_QUOTA_SEQ_OFF = 2,
+ LUSTRE_RES_ID_QUOTA_VER_OID_OFF = 3,
+ LUSTRE_RES_ID_HSH_OFF = 3
+};
+
+#define MDS_STATUS_CONN 1
+#define MDS_STATUS_LOV 2
+
+/* mdt_thread_info.mti_flags. */
+enum md_op_flags {
+	/* This flag indicates that Size-on-MDS attributes have changed. */
+	MF_SOM_CHANGE	= (1 << 0),
+	/* Flags indicating that an epoch opens or closes. */
+ MF_EPOCH_OPEN = (1 << 1),
+ MF_EPOCH_CLOSE = (1 << 2),
+ MF_MDC_CANCEL_FID1 = (1 << 3),
+ MF_MDC_CANCEL_FID2 = (1 << 4),
+ MF_MDC_CANCEL_FID3 = (1 << 5),
+ MF_MDC_CANCEL_FID4 = (1 << 6),
+ /* There is a pending attribute update. */
+ MF_SOM_AU = (1 << 7),
+ /* Cancel OST locks while getattr OST attributes. */
+ MF_GETATTR_LOCK = (1 << 8),
+ MF_GET_MDT_IDX = (1 << 9),
+};
+
+#define MF_SOM_LOCAL_FLAGS (MF_SOM_CHANGE | MF_EPOCH_OPEN | MF_EPOCH_CLOSE)
+
+#define LUSTRE_BFLAG_UNCOMMITTED_WRITES 0x1
+
+/* these should be identical to their EXT4_*_FL counterparts, they are
+ * redefined here only to avoid dragging in fs/ext4/ext4.h */
+#define LUSTRE_SYNC_FL 0x00000008 /* Synchronous updates */
+#define LUSTRE_IMMUTABLE_FL 0x00000010 /* Immutable file */
+#define LUSTRE_APPEND_FL 0x00000020 /* writes to file may only append */
+#define LUSTRE_NOATIME_FL 0x00000080 /* do not update atime */
+#define LUSTRE_DIRSYNC_FL 0x00010000 /* dirsync behaviour (dir only) */
+
+/* Convert wire LUSTRE_*_FL to corresponding client local VFS S_* values
+ * for the client inode i_flags. The LUSTRE_*_FL are the Lustre wire
+ * protocol equivalents of LDISKFS_*_FL values stored on disk, while
+ * the S_* flags are kernel-internal values that change between kernel
+ * versions. These flags are set/cleared via FSFILT_IOC_{GET,SET}_FLAGS.
+ * See b=16526 for a full history. */
+static inline int ll_ext_to_inode_flags(int flags)
+{
+ return (((flags & LUSTRE_SYNC_FL) ? S_SYNC : 0) |
+ ((flags & LUSTRE_NOATIME_FL) ? S_NOATIME : 0) |
+ ((flags & LUSTRE_APPEND_FL) ? S_APPEND : 0) |
+#if defined(S_DIRSYNC)
+ ((flags & LUSTRE_DIRSYNC_FL) ? S_DIRSYNC : 0) |
+#endif
+ ((flags & LUSTRE_IMMUTABLE_FL) ? S_IMMUTABLE : 0));
+}
+
+static inline int ll_inode_to_ext_flags(int iflags)
+{
+ return (((iflags & S_SYNC) ? LUSTRE_SYNC_FL : 0) |
+ ((iflags & S_NOATIME) ? LUSTRE_NOATIME_FL : 0) |
+ ((iflags & S_APPEND) ? LUSTRE_APPEND_FL : 0) |
+#if defined(S_DIRSYNC)
+ ((iflags & S_DIRSYNC) ? LUSTRE_DIRSYNC_FL : 0) |
+#endif
+ ((iflags & S_IMMUTABLE) ? LUSTRE_IMMUTABLE_FL : 0));
+}
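+
+/*
+ * Illustrative sketch: the two helpers above are inverses for the five
+ * wire flags, e.g. (assuming S_DIRSYNC is defined):
+ *
+ *	int vfs = ll_ext_to_inode_flags(LUSTRE_APPEND_FL | LUSTRE_SYNC_FL);
+ *	// vfs == (S_APPEND | S_SYNC)
+ *	// ll_inode_to_ext_flags(vfs) == (LUSTRE_APPEND_FL | LUSTRE_SYNC_FL)
+ */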
+
+struct mdt_body {
+ struct lu_fid fid1;
+ struct lu_fid fid2;
+ struct lustre_handle handle;
+ __u64 valid;
+ __u64 size; /* Offset, in the case of MDS_READPAGE */
+ obd_time mtime;
+ obd_time atime;
+ obd_time ctime;
+ __u64 blocks; /* XID, in the case of MDS_READPAGE */
+ __u64 ioepoch;
+ __u64 unused1; /* was "ino" until 2.4.0 */
+ __u32 fsuid;
+ __u32 fsgid;
+ __u32 capability;
+ __u32 mode;
+ __u32 uid;
+ __u32 gid;
+ __u32 flags; /* from vfs for pin/unpin, LUSTRE_BFLAG close */
+ __u32 rdev;
+ __u32 nlink; /* #bytes to read in the case of MDS_READPAGE */
+ __u32 unused2; /* was "generation" until 2.4.0 */
+ __u32 suppgid;
+ __u32 eadatasize;
+ __u32 aclsize;
+ __u32 max_mdsize;
+ __u32 max_cookiesize;
+ __u32 uid_h; /* high 32-bits of uid, for FUID */
+ __u32 gid_h; /* high 32-bits of gid, for FUID */
+ __u32 padding_5; /* also fix lustre_swab_mdt_body */
+ __u64 padding_6;
+ __u64 padding_7;
+ __u64 padding_8;
+ __u64 padding_9;
+ __u64 padding_10;
+}; /* 216 */
+
+extern void lustre_swab_mdt_body (struct mdt_body *b);
+
+struct mdt_ioepoch {
+ struct lustre_handle handle;
+ __u64 ioepoch;
+ __u32 flags;
+ __u32 padding;
+};
+
+extern void lustre_swab_mdt_ioepoch (struct mdt_ioepoch *b);
+
+/* permissions for md_perm.mp_perm */
+enum {
+ CFS_SETUID_PERM = 0x01,
+ CFS_SETGID_PERM = 0x02,
+ CFS_SETGRP_PERM = 0x04,
+ CFS_RMTACL_PERM = 0x08,
+ CFS_RMTOWN_PERM = 0x10
+};
+
+/* Inode access permission for a remote user; the inode info is omitted
+ * because the client already knows it. */
+struct mdt_remote_perm {
+ __u32 rp_uid;
+ __u32 rp_gid;
+ __u32 rp_fsuid;
+ __u32 rp_fsuid_h;
+ __u32 rp_fsgid;
+ __u32 rp_fsgid_h;
+ __u32 rp_access_perm; /* MAY_READ/WRITE/EXEC */
+ __u32 rp_padding;
+};
+
+extern void lustre_swab_mdt_remote_perm(struct mdt_remote_perm *p);
+
+struct mdt_rec_setattr {
+ __u32 sa_opcode;
+ __u32 sa_cap;
+ __u32 sa_fsuid;
+ __u32 sa_fsuid_h;
+ __u32 sa_fsgid;
+ __u32 sa_fsgid_h;
+ __u32 sa_suppgid;
+ __u32 sa_suppgid_h;
+ __u32 sa_padding_1;
+ __u32 sa_padding_1_h;
+ struct lu_fid sa_fid;
+ __u64 sa_valid;
+ __u32 sa_uid;
+ __u32 sa_gid;
+ __u64 sa_size;
+ __u64 sa_blocks;
+ obd_time sa_mtime;
+ obd_time sa_atime;
+ obd_time sa_ctime;
+ __u32 sa_attr_flags;
+ __u32 sa_mode;
+ __u32 sa_bias; /* some operation flags */
+ __u32 sa_padding_3;
+ __u32 sa_padding_4;
+ __u32 sa_padding_5;
+};
+
+extern void lustre_swab_mdt_rec_setattr (struct mdt_rec_setattr *sa);
+
+/*
+ * Attribute flags used in mdt_rec_setattr::sa_valid.
+ * The kernel's #defines for ATTR_* should not be used over the network
+ * since the client and MDS may run different kernels (see bug 13828)
+ * Therefore, we should only use MDS_ATTR_* attributes for sa_valid.
+ */
+#define MDS_ATTR_MODE 0x1ULL /* = 1 */
+#define MDS_ATTR_UID 0x2ULL /* = 2 */
+#define MDS_ATTR_GID 0x4ULL /* = 4 */
+#define MDS_ATTR_SIZE 0x8ULL /* = 8 */
+#define MDS_ATTR_ATIME 0x10ULL /* = 16 */
+#define MDS_ATTR_MTIME 0x20ULL /* = 32 */
+#define MDS_ATTR_CTIME 0x40ULL /* = 64 */
+#define MDS_ATTR_ATIME_SET 0x80ULL /* = 128 */
+#define MDS_ATTR_MTIME_SET 0x100ULL /* = 256 */
+#define MDS_ATTR_FORCE 0x200ULL /* = 512, not a change, but force the change */
+#define MDS_ATTR_ATTR_FLAG 0x400ULL /* = 1024 */
+#define MDS_ATTR_KILL_SUID 0x800ULL /* = 2048 */
+#define MDS_ATTR_KILL_SGID 0x1000ULL /* = 4096 */
+#define MDS_ATTR_CTIME_SET 0x2000ULL /* = 8192 */
+#define MDS_ATTR_FROM_OPEN 0x4000ULL /* = 16384, called from open path, i.e. O_TRUNC */
+#define MDS_ATTR_BLOCKS 0x8000ULL /* = 32768 */
+
+#ifndef FMODE_READ
+#define FMODE_READ 00000001
+#define FMODE_WRITE 00000002
+#endif
+
+#define MDS_FMODE_CLOSED 00000000
+#define MDS_FMODE_EXEC 00000004
+/* IO Epoch is opened on a closed file. */
+#define MDS_FMODE_EPOCH 01000000
+/* IO Epoch is opened on a file truncate. */
+#define MDS_FMODE_TRUNC 02000000
+/* Size-on-MDS Attribute Update is pending. */
+#define MDS_FMODE_SOM 04000000
+
+#define MDS_OPEN_CREATED 00000010
+#define MDS_OPEN_CROSS 00000020
+
+#define MDS_OPEN_CREAT 00000100
+#define MDS_OPEN_EXCL 00000200
+#define MDS_OPEN_TRUNC 00001000
+#define MDS_OPEN_APPEND 00002000
+#define MDS_OPEN_SYNC 00010000
+#define MDS_OPEN_DIRECTORY 00200000
+
+#define MDS_OPEN_BY_FID 040000000 /* open_by_fid for known object */
+#define MDS_OPEN_DELAY_CREATE 0100000000 /* delay initial object create */
+#define MDS_OPEN_OWNEROVERRIDE 0200000000 /* NFSD rw-reopen ro file for owner */
+#define MDS_OPEN_JOIN_FILE 0400000000 /* open for join file.
+					  * JOIN FILE is no longer
+					  * supported; this flag is
+					  * reserved only to prevent
+					  * the bit from being reused. */
+
+#define MDS_OPEN_LOCK 04000000000 /* This open requires open lock */
+#define MDS_OPEN_HAS_EA 010000000000 /* specify object create pattern */
+#define MDS_OPEN_HAS_OBJS 020000000000 /* Just set the EA; the objects already exist */
+#define MDS_OPEN_NORESTORE 0100000000000ULL /* Do not restore file at open */
+#define MDS_OPEN_NEWSTRIPE 0200000000000ULL /* New stripe needed (restripe or
+ * hsm restore) */
+#define MDS_OPEN_VOLATILE 0400000000000ULL /* File is volatile, i.e.
+						created then unlinked */
+
+/* permission to create a non-directory file */
+#define MAY_CREATE (1 << 7)
+/* permission to create a directory */
+#define MAY_LINK (1 << 8)
+/* permission to delete from the directory */
+#define MAY_UNLINK (1 << 9)
+/* source's permission for rename */
+#define MAY_RENAME_SRC (1 << 10)
+/* target's permission for rename */
+#define MAY_RENAME_TAR (1 << 11)
+/* partial (parent's) VTX permission check */
+#define MAY_VTX_PART (1 << 12)
+/* full VTX permission check */
+#define MAY_VTX_FULL (1 << 13)
+/* lfs rgetfacl permission check */
+#define MAY_RGETFACL (1 << 14)
+
+enum {
+ MDS_CHECK_SPLIT = 1 << 0,
+ MDS_CROSS_REF = 1 << 1,
+ MDS_VTX_BYPASS = 1 << 2,
+ MDS_PERM_BYPASS = 1 << 3,
+ MDS_SOM = 1 << 4,
+ MDS_QUOTA_IGNORE = 1 << 5,
+ MDS_CLOSE_CLEANUP = 1 << 6,
+ MDS_KEEP_ORPHAN = 1 << 7,
+ MDS_RECOV_OPEN = 1 << 8,
+ MDS_DATA_MODIFIED = 1 << 9,
+ MDS_CREATE_VOLATILE = 1 << 10,
+ MDS_OWNEROVERRIDE = 1 << 11,
+};
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_create {
+ __u32 cr_opcode;
+ __u32 cr_cap;
+ __u32 cr_fsuid;
+ __u32 cr_fsuid_h;
+ __u32 cr_fsgid;
+ __u32 cr_fsgid_h;
+ __u32 cr_suppgid1;
+ __u32 cr_suppgid1_h;
+ __u32 cr_suppgid2;
+ __u32 cr_suppgid2_h;
+ struct lu_fid cr_fid1;
+ struct lu_fid cr_fid2;
+ struct lustre_handle cr_old_handle; /* handle in case of open replay */
+ obd_time cr_time;
+ __u64 cr_rdev;
+ __u64 cr_ioepoch;
+ __u64 cr_padding_1; /* rr_blocks */
+ __u32 cr_mode;
+ __u32 cr_bias;
+	/* the set/get_mrc_cr_flags() helpers must be used to access the
+	 * 64-bit cr_flags [cr_flags_l, cr_flags_h]; this was done to
+	 * extend the cr_flags size without breaking 1.8 compatibility */
+ __u32 cr_flags_l; /* for use with open, low 32 bits */
+ __u32 cr_flags_h; /* for use with open, high 32 bits */
+ __u32 cr_umask; /* umask for create */
+ __u32 cr_padding_4; /* rr_padding_4 */
+};
+
+static inline void set_mrc_cr_flags(struct mdt_rec_create *mrc, __u64 flags)
+{
+	mrc->cr_flags_l = (__u32)(flags & 0xFFFFFFFFULL);
+ mrc->cr_flags_h = (__u32)(flags >> 32);
+}
+
+static inline __u64 get_mrc_cr_flags(struct mdt_rec_create *mrc)
+{
+ return ((__u64)(mrc->cr_flags_l) | ((__u64)mrc->cr_flags_h << 32));
+}
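+
+/*
+ * Worked example (illustrative only): a 64-bit open flag value is split
+ * across the two 32-bit fields and reassembled losslessly.
+ *
+ *	struct mdt_rec_create mrc;
+ *
+ *	set_mrc_cr_flags(&mrc, 0x123456789abcdef0ULL);
+ *	// mrc.cr_flags_l == 0x9abcdef0, mrc.cr_flags_h == 0x12345678
+ *	// get_mrc_cr_flags(&mrc) == 0x123456789abcdef0ULL
+ */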
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_link {
+ __u32 lk_opcode;
+ __u32 lk_cap;
+ __u32 lk_fsuid;
+ __u32 lk_fsuid_h;
+ __u32 lk_fsgid;
+ __u32 lk_fsgid_h;
+ __u32 lk_suppgid1;
+ __u32 lk_suppgid1_h;
+ __u32 lk_suppgid2;
+ __u32 lk_suppgid2_h;
+ struct lu_fid lk_fid1;
+ struct lu_fid lk_fid2;
+ obd_time lk_time;
+ __u64 lk_padding_1; /* rr_atime */
+ __u64 lk_padding_2; /* rr_ctime */
+ __u64 lk_padding_3; /* rr_size */
+ __u64 lk_padding_4; /* rr_blocks */
+ __u32 lk_bias;
+ __u32 lk_padding_5; /* rr_mode */
+ __u32 lk_padding_6; /* rr_flags */
+ __u32 lk_padding_7; /* rr_padding_2 */
+ __u32 lk_padding_8; /* rr_padding_3 */
+ __u32 lk_padding_9; /* rr_padding_4 */
+};
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_unlink {
+ __u32 ul_opcode;
+ __u32 ul_cap;
+ __u32 ul_fsuid;
+ __u32 ul_fsuid_h;
+ __u32 ul_fsgid;
+ __u32 ul_fsgid_h;
+ __u32 ul_suppgid1;
+ __u32 ul_suppgid1_h;
+ __u32 ul_suppgid2;
+ __u32 ul_suppgid2_h;
+ struct lu_fid ul_fid1;
+ struct lu_fid ul_fid2;
+ obd_time ul_time;
+ __u64 ul_padding_2; /* rr_atime */
+ __u64 ul_padding_3; /* rr_ctime */
+ __u64 ul_padding_4; /* rr_size */
+ __u64 ul_padding_5; /* rr_blocks */
+ __u32 ul_bias;
+ __u32 ul_mode;
+ __u32 ul_padding_6; /* rr_flags */
+ __u32 ul_padding_7; /* rr_padding_2 */
+ __u32 ul_padding_8; /* rr_padding_3 */
+ __u32 ul_padding_9; /* rr_padding_4 */
+};
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_rename {
+ __u32 rn_opcode;
+ __u32 rn_cap;
+ __u32 rn_fsuid;
+ __u32 rn_fsuid_h;
+ __u32 rn_fsgid;
+ __u32 rn_fsgid_h;
+ __u32 rn_suppgid1;
+ __u32 rn_suppgid1_h;
+ __u32 rn_suppgid2;
+ __u32 rn_suppgid2_h;
+ struct lu_fid rn_fid1;
+ struct lu_fid rn_fid2;
+ obd_time rn_time;
+ __u64 rn_padding_1; /* rr_atime */
+ __u64 rn_padding_2; /* rr_ctime */
+ __u64 rn_padding_3; /* rr_size */
+ __u64 rn_padding_4; /* rr_blocks */
+ __u32 rn_bias; /* some operation flags */
+ __u32 rn_mode; /* cross-ref rename has mode */
+ __u32 rn_padding_5; /* rr_flags */
+ __u32 rn_padding_6; /* rr_padding_2 */
+ __u32 rn_padding_7; /* rr_padding_3 */
+ __u32 rn_padding_8; /* rr_padding_4 */
+};
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_setxattr {
+ __u32 sx_opcode;
+ __u32 sx_cap;
+ __u32 sx_fsuid;
+ __u32 sx_fsuid_h;
+ __u32 sx_fsgid;
+ __u32 sx_fsgid_h;
+ __u32 sx_suppgid1;
+ __u32 sx_suppgid1_h;
+ __u32 sx_suppgid2;
+ __u32 sx_suppgid2_h;
+ struct lu_fid sx_fid;
+ __u64 sx_padding_1; /* These three are rr_fid2 */
+ __u32 sx_padding_2;
+ __u32 sx_padding_3;
+ __u64 sx_valid;
+ obd_time sx_time;
+ __u64 sx_padding_5; /* rr_ctime */
+ __u64 sx_padding_6; /* rr_size */
+ __u64 sx_padding_7; /* rr_blocks */
+ __u32 sx_size;
+ __u32 sx_flags;
+ __u32 sx_padding_8; /* rr_flags */
+ __u32 sx_padding_9; /* rr_padding_2 */
+ __u32 sx_padding_10; /* rr_padding_3 */
+ __u32 sx_padding_11; /* rr_padding_4 */
+};
+
+/*
+ * mdt_rec_reint is the template for all mdt_reint_xxx structures.
+ * Do NOT change the size of any member, otherwise the values
+ * will be broken by lustre_swab_mdt_rec_reint().
+ *
+ * If you add new members in other mdt_reint_xxx structures and need to use
+ * the rr_padding_x fields, then update lustre_swab_mdt_rec_reint() as well.
+ */
+struct mdt_rec_reint {
+ __u32 rr_opcode;
+ __u32 rr_cap;
+ __u32 rr_fsuid;
+ __u32 rr_fsuid_h;
+ __u32 rr_fsgid;
+ __u32 rr_fsgid_h;
+ __u32 rr_suppgid1;
+ __u32 rr_suppgid1_h;
+ __u32 rr_suppgid2;
+ __u32 rr_suppgid2_h;
+ struct lu_fid rr_fid1;
+ struct lu_fid rr_fid2;
+ obd_time rr_mtime;
+ obd_time rr_atime;
+ obd_time rr_ctime;
+ __u64 rr_size;
+ __u64 rr_blocks;
+ __u32 rr_bias;
+ __u32 rr_mode;
+ __u32 rr_flags;
+ __u32 rr_flags_h;
+ __u32 rr_umask;
+ __u32 rr_padding_4; /* also fix lustre_swab_mdt_rec_reint */
+};
+
+extern void lustre_swab_mdt_rec_reint(struct mdt_rec_reint *rr);
+
+struct lmv_desc {
+ __u32 ld_tgt_count; /* how many MDS's */
+ __u32 ld_active_tgt_count; /* how many active */
+ __u32 ld_default_stripe_count; /* how many objects are used */
+ __u32 ld_pattern; /* default MEA_MAGIC_* */
+ __u64 ld_default_hash_size;
+ __u64 ld_padding_1; /* also fix lustre_swab_lmv_desc */
+ __u32 ld_padding_2; /* also fix lustre_swab_lmv_desc */
+	__u32 ld_qos_maxage;		/* in seconds */
+ __u32 ld_padding_3; /* also fix lustre_swab_lmv_desc */
+ __u32 ld_padding_4; /* also fix lustre_swab_lmv_desc */
+ struct obd_uuid ld_uuid;
+};
+
+extern void lustre_swab_lmv_desc (struct lmv_desc *ld);
+
+/* TODO: lmv_stripe_md should contain mds capabilities for all slave fids */
+struct lmv_stripe_md {
+ __u32 mea_magic;
+ __u32 mea_count;
+ __u32 mea_master;
+ __u32 mea_padding;
+ char mea_pool_name[LOV_MAXPOOLNAME];
+ struct lu_fid mea_ids[0];
+};
+
+extern void lustre_swab_lmv_stripe_md(struct lmv_stripe_md *mea);
+
+/* lmv structures */
+#define MEA_MAGIC_LAST_CHAR 0xb2221ca1
+#define MEA_MAGIC_ALL_CHARS 0xb222a11c
+#define MEA_MAGIC_HASH_SEGMENT 0xb222a11b
+
+#define MAX_HASH_SIZE_32 0x7fffffffUL
+#define MAX_HASH_SIZE 0x7fffffffffffffffULL
+#define MAX_HASH_HIGHEST_BIT 0x1000000000000000ULL
+
+enum fld_rpc_opc {
+ FLD_QUERY = 900,
+ FLD_LAST_OPC,
+ FLD_FIRST_OPC = FLD_QUERY
+};
+
+enum seq_rpc_opc {
+ SEQ_QUERY = 700,
+ SEQ_LAST_OPC,
+ SEQ_FIRST_OPC = SEQ_QUERY
+};
+
+enum seq_op {
+ SEQ_ALLOC_SUPER = 0,
+ SEQ_ALLOC_META = 1
+};
+
+/*
+ * LOV data structures
+ */
+
+#define LOV_MAX_UUID_BUFFER_SIZE 8192
+/* The size of the buffer the lov/mdc reserves for the
+ * array of UUIDs returned by the MDS. With the current
+ * protocol, this will limit the max number of OSTs per LOV */
+
+#define LOV_DESC_MAGIC 0xB0CCDE5C
+
+/* LOV settings descriptor (should only contain static info) */
+struct lov_desc {
+ __u32 ld_tgt_count; /* how many OBD's */
+ __u32 ld_active_tgt_count; /* how many active */
+ __u32 ld_default_stripe_count; /* how many objects are used */
+ __u32 ld_pattern; /* default PATTERN_RAID0 */
+ __u64 ld_default_stripe_size; /* in bytes */
+ __u64 ld_default_stripe_offset; /* in bytes */
+ __u32 ld_padding_0; /* unused */
+	__u32 ld_qos_maxage;		/* in seconds */
+ __u32 ld_padding_1; /* also fix lustre_swab_lov_desc */
+ __u32 ld_padding_2; /* also fix lustre_swab_lov_desc */
+ struct obd_uuid ld_uuid;
+};
+
+#define ld_magic ld_active_tgt_count /* for swabbing from llogs */
+
+extern void lustre_swab_lov_desc (struct lov_desc *ld);
+
+/*
+ * LDLM requests:
+ */
+/* opcodes -- MUST be distinct from OST/MDS opcodes */
+typedef enum {
+ LDLM_ENQUEUE = 101,
+ LDLM_CONVERT = 102,
+ LDLM_CANCEL = 103,
+ LDLM_BL_CALLBACK = 104,
+ LDLM_CP_CALLBACK = 105,
+ LDLM_GL_CALLBACK = 106,
+ LDLM_SET_INFO = 107,
+ LDLM_LAST_OPC
+} ldlm_cmd_t;
+#define LDLM_FIRST_OPC LDLM_ENQUEUE
+
+#define RES_NAME_SIZE 4
+struct ldlm_res_id {
+ __u64 name[RES_NAME_SIZE];
+};
+
+extern void lustre_swab_ldlm_res_id (struct ldlm_res_id *id);
+
+static inline int ldlm_res_eq(const struct ldlm_res_id *res0,
+ const struct ldlm_res_id *res1)
+{
+ return !memcmp(res0, res1, sizeof(*res0));
+}
+
+/* lock types */
+typedef enum {
+ LCK_MINMODE = 0,
+ LCK_EX = 1,
+ LCK_PW = 2,
+ LCK_PR = 4,
+ LCK_CW = 8,
+ LCK_CR = 16,
+ LCK_NL = 32,
+ LCK_GROUP = 64,
+ LCK_COS = 128,
+ LCK_MAXMODE
+} ldlm_mode_t;
+
+#define LCK_MODE_NUM 8
+
+typedef enum {
+ LDLM_PLAIN = 10,
+ LDLM_EXTENT = 11,
+ LDLM_FLOCK = 12,
+ LDLM_IBITS = 13,
+ LDLM_MAX_TYPE
+} ldlm_type_t;
+
+#define LDLM_MIN_TYPE LDLM_PLAIN
+
+struct ldlm_extent {
+ __u64 start;
+ __u64 end;
+ __u64 gid;
+};
+
+static inline int ldlm_extent_overlap(struct ldlm_extent *ex1,
+ struct ldlm_extent *ex2)
+{
+ return (ex1->start <= ex2->end) && (ex2->start <= ex1->end);
+}
+
+/* check if @ex1 contains @ex2 */
+static inline int ldlm_extent_contain(struct ldlm_extent *ex1,
+ struct ldlm_extent *ex2)
+{
+ return (ex1->start <= ex2->start) && (ex1->end >= ex2->end);
+}
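+
+/*
+ * Worked example: for ex1 = [0, 4095] and ex2 = [1024, 2047], both
+ * ldlm_extent_overlap(&ex1, &ex2) and ldlm_extent_contain(&ex1, &ex2)
+ * are true; for ex3 = [4096, 8191], ldlm_extent_overlap(&ex1, &ex3) is
+ * false because ex3->start (4096) > ex1->end (4095).
+ */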
+
+struct ldlm_inodebits {
+ __u64 bits;
+};
+
+struct ldlm_flock_wire {
+ __u64 lfw_start;
+ __u64 lfw_end;
+ __u64 lfw_owner;
+ __u32 lfw_padding;
+ __u32 lfw_pid;
+};
+
+/* it's important that the fields of the ldlm_extent structure match
+ * the first fields of the ldlm_flock structure because there is only
+ * one ldlm_swab routine to process the ldlm_policy_data_t union. if
+ * this ever changes we will need to swab the union differently based
+ * on the resource type. */
+
+typedef union {
+ struct ldlm_extent l_extent;
+ struct ldlm_flock_wire l_flock;
+ struct ldlm_inodebits l_inodebits;
+} ldlm_wire_policy_data_t;
+
+extern void lustre_swab_ldlm_policy_data (ldlm_wire_policy_data_t *d);
+
+union ldlm_gl_desc {
+ struct ldlm_gl_lquota_desc lquota_desc;
+};
+
+extern void lustre_swab_gl_desc(union ldlm_gl_desc *);
+
+struct ldlm_intent {
+ __u64 opc;
+};
+
+extern void lustre_swab_ldlm_intent (struct ldlm_intent *i);
+
+struct ldlm_resource_desc {
+ ldlm_type_t lr_type;
+ __u32 lr_padding; /* also fix lustre_swab_ldlm_resource_desc */
+ struct ldlm_res_id lr_name;
+};
+
+extern void lustre_swab_ldlm_resource_desc (struct ldlm_resource_desc *r);
+
+struct ldlm_lock_desc {
+ struct ldlm_resource_desc l_resource;
+ ldlm_mode_t l_req_mode;
+ ldlm_mode_t l_granted_mode;
+ ldlm_wire_policy_data_t l_policy_data;
+};
+
+extern void lustre_swab_ldlm_lock_desc (struct ldlm_lock_desc *l);
+
+#define LDLM_LOCKREQ_HANDLES 2
+#define LDLM_ENQUEUE_CANCEL_OFF 1
+
+struct ldlm_request {
+ __u32 lock_flags;
+ __u32 lock_count;
+ struct ldlm_lock_desc lock_desc;
+ struct lustre_handle lock_handle[LDLM_LOCKREQ_HANDLES];
+};
+
+extern void lustre_swab_ldlm_request (struct ldlm_request *rq);
+
+/* If LDLM_ENQUEUE, 1 slot is already occupied, 1 is available.
+ * Otherwise, 2 are available. */
+#define ldlm_request_bufsize(count, type) \
+({ \
+ int _avail = LDLM_LOCKREQ_HANDLES; \
+ _avail -= (type == LDLM_ENQUEUE ? LDLM_ENQUEUE_CANCEL_OFF : 0); \
+ sizeof(struct ldlm_request) + \
+ (count > _avail ? count - _avail : 0) * \
+ sizeof(struct lustre_handle); \
+})
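+
+/*
+ * Worked example: for an LDLM_ENQUEUE carrying 3 cancel handles, one of
+ * the two embedded slots is already occupied, so
+ * ldlm_request_bufsize(3, LDLM_ENQUEUE) == sizeof(struct ldlm_request) +
+ * 2 * sizeof(struct lustre_handle); for other opcodes both slots are
+ * free and only one extra handle is appended.
+ */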
+
+struct ldlm_reply {
+ __u32 lock_flags;
+ __u32 lock_padding; /* also fix lustre_swab_ldlm_reply */
+ struct ldlm_lock_desc lock_desc;
+ struct lustre_handle lock_handle;
+ __u64 lock_policy_res1;
+ __u64 lock_policy_res2;
+};
+
+extern void lustre_swab_ldlm_reply (struct ldlm_reply *r);
+
+#define ldlm_flags_to_wire(flags) ((__u32)(flags))
+#define ldlm_flags_from_wire(flags) ((__u64)(flags))
+
+/*
+ * Opcodes for mountconf (mgs and mgc)
+ */
+typedef enum {
+ MGS_CONNECT = 250,
+ MGS_DISCONNECT,
+ MGS_EXCEPTION, /* node died, etc. */
+ MGS_TARGET_REG, /* whenever target starts up */
+ MGS_TARGET_DEL,
+ MGS_SET_INFO,
+ MGS_CONFIG_READ,
+ MGS_LAST_OPC
+} mgs_cmd_t;
+#define MGS_FIRST_OPC MGS_CONNECT
+
+#define MGS_PARAM_MAXLEN 1024
+#define KEY_SET_INFO "set_info"
+
+struct mgs_send_param {
+ char mgs_param[MGS_PARAM_MAXLEN];
+};
+
+/* We pass this info to the MGS so it can write config logs */
+#define MTI_NAME_MAXLEN 64
+#define MTI_PARAM_MAXLEN 4096
+#define MTI_NIDS_MAX 32
+struct mgs_target_info {
+ __u32 mti_lustre_ver;
+ __u32 mti_stripe_index;
+ __u32 mti_config_ver;
+ __u32 mti_flags;
+ __u32 mti_nid_count;
+ __u32 mti_instance; /* Running instance of target */
+ char mti_fsname[MTI_NAME_MAXLEN];
+ char mti_svname[MTI_NAME_MAXLEN];
+ char mti_uuid[sizeof(struct obd_uuid)];
+ __u64 mti_nids[MTI_NIDS_MAX]; /* host nids (lnet_nid_t)*/
+ char mti_params[MTI_PARAM_MAXLEN];
+};
+extern void lustre_swab_mgs_target_info(struct mgs_target_info *oinfo);
+
+struct mgs_nidtbl_entry {
+ __u64 mne_version; /* table version of this entry */
+ __u32 mne_instance; /* target instance # */
+ __u32 mne_index; /* target index */
+	__u32	   mne_length;    /* length of this entry, in bytes */
+	__u8	    mne_type;	  /* target type LDD_F_SV_TYPE_OST/MDT */
+	__u8	    mne_nid_type;      /* type of NID (must be zero); reserved for IPv6 */
+	__u8	    mne_nid_size;      /* size of each NID, in bytes */
+ __u8 mne_nid_count; /* # of NIDs in buffer */
+ union {
+ lnet_nid_t nids[0]; /* variable size buffer for NIDs. */
+ } u;
+};
+extern void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *oinfo);
+
+struct mgs_config_body {
+ char mcb_name[MTI_NAME_MAXLEN]; /* logname */
+ __u64 mcb_offset; /* next index of config log to request */
+ __u16 mcb_type; /* type of log: CONFIG_T_[CONFIG|RECOVER] */
+ __u8 mcb_reserved;
+ __u8 mcb_bits; /* bits unit size of config log */
+ __u32 mcb_units; /* # of units for bulk transfer */
+};
+extern void lustre_swab_mgs_config_body(struct mgs_config_body *body);
+
+struct mgs_config_res {
+ __u64 mcr_offset; /* index of last config log */
+ __u64 mcr_size; /* size of the log */
+};
+extern void lustre_swab_mgs_config_res(struct mgs_config_res *body);
+
+/* Config marker flags (in config log) */
+#define CM_START 0x01
+#define CM_END 0x02
+#define CM_SKIP 0x04
+#define CM_UPGRADE146 0x08
+#define CM_EXCLUDE 0x10
+#define CM_START_SKIP (CM_START | CM_SKIP)
+
+struct cfg_marker {
+ __u32 cm_step; /* aka config version */
+ __u32 cm_flags;
+ __u32 cm_vers; /* lustre release version number */
+ __u32 cm_padding; /* 64 bit align */
+	obd_time cm_createtime; /* when this record was first created */
+	obd_time cm_canceltime; /* when this record is no longer valid */
+ char cm_tgtname[MTI_NAME_MAXLEN];
+ char cm_comment[MTI_NAME_MAXLEN];
+};
+
+extern void lustre_swab_cfg_marker(struct cfg_marker *marker,
+ int swab, int size);
+
+/*
+ * Opcodes for multiple servers.
+ */
+
+typedef enum {
+ OBD_PING = 400,
+ OBD_LOG_CANCEL,
+ OBD_QC_CALLBACK,
+ OBD_IDX_READ,
+ OBD_LAST_OPC
+} obd_cmd_t;
+#define OBD_FIRST_OPC OBD_PING
+
+/* catalog of log objects */
+
+/** Identifier for a single log object */
+struct llog_logid {
+ struct ost_id lgl_oi;
+ __u32 lgl_ogen;
+} __attribute__((packed));
+
+/** Records written to the CATALOGS list */
+#define CATLIST "CATALOGS"
+struct llog_catid {
+ struct llog_logid lci_logid;
+ __u32 lci_padding1;
+ __u32 lci_padding2;
+ __u32 lci_padding3;
+} __attribute__((packed));
+
+/* Log data record types - there is no specific reason that these need to
+ * be related to the RPC opcodes, but no reason not to (may be handy later?)
+ */
+#define LLOG_OP_MAGIC 0x10600000
+#define LLOG_OP_MASK 0xfff00000
+
+typedef enum {
+ LLOG_PAD_MAGIC = LLOG_OP_MAGIC | 0x00000,
+ OST_SZ_REC = LLOG_OP_MAGIC | 0x00f00,
+ /* OST_RAID1_REC = LLOG_OP_MAGIC | 0x01000, never used */
+ MDS_UNLINK_REC = LLOG_OP_MAGIC | 0x10000 | (MDS_REINT << 8) |
+ REINT_UNLINK, /* obsolete after 2.5.0 */
+ MDS_UNLINK64_REC = LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) |
+ REINT_UNLINK,
+ /* MDS_SETATTR_REC = LLOG_OP_MAGIC | 0x12401, obsolete 1.8.0 */
+ MDS_SETATTR64_REC = LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) |
+ REINT_SETATTR,
+ OBD_CFG_REC = LLOG_OP_MAGIC | 0x20000,
+ /* PTL_CFG_REC = LLOG_OP_MAGIC | 0x30000, obsolete 1.4.0 */
+ LLOG_GEN_REC = LLOG_OP_MAGIC | 0x40000,
+ /* LLOG_JOIN_REC = LLOG_OP_MAGIC | 0x50000, obsolete 1.8.0 */
+ CHANGELOG_REC = LLOG_OP_MAGIC | 0x60000,
+ CHANGELOG_USER_REC = LLOG_OP_MAGIC | 0x70000,
+ LLOG_HDR_MAGIC = LLOG_OP_MAGIC | 0x45539,
+ LLOG_LOGID_MAGIC = LLOG_OP_MAGIC | 0x4553b,
+} llog_op_type;
+
+#define LLOG_REC_HDR_NEEDS_SWABBING(r) \
+ (((r)->lrh_type & __swab32(LLOG_OP_MASK)) == __swab32(LLOG_OP_MAGIC))
+
+/** Log record header - stored in little endian order.
+ * Each record must start with this struct, end with a llog_rec_tail,
+ * and be a multiple of 256 bits in size.
+ */
+struct llog_rec_hdr {
+ __u32 lrh_len;
+ __u32 lrh_index;
+ __u32 lrh_type;
+ __u32 lrh_id;
+};
+
+struct llog_rec_tail {
+ __u32 lrt_len;
+ __u32 lrt_index;
+};
+
+/* Data follows just after the header */
+#define REC_DATA(ptr) \
+ ((void *)((char *)ptr + sizeof(struct llog_rec_hdr)))
+
+#define REC_DATA_LEN(rec) \
+ (rec->lrh_len - sizeof(struct llog_rec_hdr) - \
+ sizeof(struct llog_rec_tail))
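+
+/* Worked example: with a 16-byte llog_rec_hdr and an 8-byte
+ * llog_rec_tail, a record with lrh_len == 64 carries
+ * REC_DATA_LEN(rec) == 64 - 16 - 8 == 40 bytes of payload. */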
+
+struct llog_logid_rec {
+ struct llog_rec_hdr lid_hdr;
+ struct llog_logid lid_id;
+ __u32 lid_padding1;
+ __u64 lid_padding2;
+ __u64 lid_padding3;
+ struct llog_rec_tail lid_tail;
+} __attribute__((packed));
+
+struct llog_unlink_rec {
+ struct llog_rec_hdr lur_hdr;
+ obd_id lur_oid;
+ obd_count lur_oseq;
+ obd_count lur_count;
+ struct llog_rec_tail lur_tail;
+} __attribute__((packed));
+
+struct llog_unlink64_rec {
+ struct llog_rec_hdr lur_hdr;
+ struct lu_fid lur_fid;
+ obd_count lur_count; /* to destroy the lost precreated */
+ __u32 lur_padding1;
+ __u64 lur_padding2;
+ __u64 lur_padding3;
+ struct llog_rec_tail lur_tail;
+} __attribute__((packed));
+
+struct llog_setattr64_rec {
+ struct llog_rec_hdr lsr_hdr;
+ struct ost_id lsr_oi;
+ __u32 lsr_uid;
+ __u32 lsr_uid_h;
+ __u32 lsr_gid;
+ __u32 lsr_gid_h;
+ __u64 lsr_padding;
+ struct llog_rec_tail lsr_tail;
+} __attribute__((packed));
+
+struct llog_size_change_rec {
+ struct llog_rec_hdr lsc_hdr;
+ struct ll_fid lsc_fid;
+ __u32 lsc_ioepoch;
+ __u32 lsc_padding1;
+ __u64 lsc_padding2;
+ __u64 lsc_padding3;
+ struct llog_rec_tail lsc_tail;
+} __attribute__((packed));
+
+#define CHANGELOG_MAGIC 0xca103000
+
+/** \a changelog_rec_type's that can't be masked */
+#define CHANGELOG_MINMASK (1 << CL_MARK)
+/** bits covering all \a changelog_rec_type's */
+#define CHANGELOG_ALLMASK 0xFFFFFFFF
+/** default \a changelog_rec_type mask */
+#define CHANGELOG_DEFMASK (CHANGELOG_ALLMASK & ~(1 << CL_ATIME | 1 << CL_CLOSE))
+
+/* changelog llog name, needed by client replicators */
+#define CHANGELOG_CATALOG "changelog_catalog"
+
+struct changelog_setinfo {
+ __u64 cs_recno;
+ __u32 cs_id;
+} __attribute__((packed));
+
+/** changelog record */
+struct llog_changelog_rec {
+ struct llog_rec_hdr cr_hdr;
+ struct changelog_rec cr;
+	struct llog_rec_tail  cr_tail; /**< for_sizeof_only */
+} __attribute__((packed));
+
+struct llog_changelog_ext_rec {
+ struct llog_rec_hdr cr_hdr;
+ struct changelog_ext_rec cr;
+	struct llog_rec_tail  cr_tail; /**< for_sizeof_only */
+} __attribute__((packed));
+
+#define CHANGELOG_USER_PREFIX "cl"
+
+struct llog_changelog_user_rec {
+ struct llog_rec_hdr cur_hdr;
+ __u32 cur_id;
+ __u32 cur_padding;
+ __u64 cur_endrec;
+ struct llog_rec_tail cur_tail;
+} __attribute__((packed));
+
+/* Old llog gen for compatibility */
+struct llog_gen {
+ __u64 mnt_cnt;
+ __u64 conn_cnt;
+} __attribute__((packed));
+
+struct llog_gen_rec {
+ struct llog_rec_hdr lgr_hdr;
+ struct llog_gen lgr_gen;
+ __u64 padding1;
+ __u64 padding2;
+ __u64 padding3;
+ struct llog_rec_tail lgr_tail;
+};
+
+/* On-disk header structure of each log object, stored in little endian order */
+#define LLOG_CHUNK_SIZE 8192
+#define LLOG_HEADER_SIZE (96)
+#define LLOG_BITMAP_BYTES (LLOG_CHUNK_SIZE - LLOG_HEADER_SIZE)
+
+#define LLOG_MIN_REC_SIZE (24) /* round(llog_rec_hdr + llog_rec_tail) */
+
+/* flags for the logs */
+enum llog_flag {
+ LLOG_F_ZAP_WHEN_EMPTY = 0x1,
+ LLOG_F_IS_CAT = 0x2,
+ LLOG_F_IS_PLAIN = 0x4,
+};
+
+struct llog_log_hdr {
+ struct llog_rec_hdr llh_hdr;
+ obd_time llh_timestamp;
+ __u32 llh_count;
+ __u32 llh_bitmap_offset;
+ __u32 llh_size;
+ __u32 llh_flags;
+ __u32 llh_cat_idx;
+ /* for a catalog the first plain slot is next to it */
+ struct obd_uuid llh_tgtuuid;
+ __u32 llh_reserved[LLOG_HEADER_SIZE/sizeof(__u32) - 23];
+ __u32 llh_bitmap[LLOG_BITMAP_BYTES/sizeof(__u32)];
+ struct llog_rec_tail llh_tail;
+} __attribute__((packed));
+
+#define LLOG_BITMAP_SIZE(llh) (__u32)((llh->llh_hdr.lrh_len - \
+ llh->llh_bitmap_offset - \
+ sizeof(llh->llh_tail)) * 8)
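+
+/*
+ * Worked example (assuming the fixed fields before llh_bitmap occupy
+ * 88 bytes, i.e. LLOG_HEADER_SIZE minus the 8-byte tail): for a default
+ * header with lrh_len == LLOG_CHUNK_SIZE (8192),
+ * LLOG_BITMAP_SIZE(llh) == (8192 - 88 - 8) * 8 == LLOG_BITMAP_BYTES * 8
+ * == 64768, so one log chunk can track up to 64768 records.
+ */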
+
+/** log cookies are used to reference a specific log file and a record therein */
+struct llog_cookie {
+ struct llog_logid lgc_lgl;
+ __u32 lgc_subsys;
+ __u32 lgc_index;
+ __u32 lgc_padding;
+} __attribute__((packed));
+
+/** llog protocol */
+enum llogd_rpc_ops {
+ LLOG_ORIGIN_HANDLE_CREATE = 501,
+ LLOG_ORIGIN_HANDLE_NEXT_BLOCK = 502,
+ LLOG_ORIGIN_HANDLE_READ_HEADER = 503,
+ LLOG_ORIGIN_HANDLE_WRITE_REC = 504,
+ LLOG_ORIGIN_HANDLE_CLOSE = 505,
+ LLOG_ORIGIN_CONNECT = 506,
+ LLOG_CATINFO = 507, /* deprecated */
+ LLOG_ORIGIN_HANDLE_PREV_BLOCK = 508,
+ LLOG_ORIGIN_HANDLE_DESTROY = 509, /* for destroy llog object*/
+ LLOG_LAST_OPC,
+ LLOG_FIRST_OPC = LLOG_ORIGIN_HANDLE_CREATE
+};
+
+struct llogd_body {
+ struct llog_logid lgd_logid;
+ __u32 lgd_ctxt_idx;
+ __u32 lgd_llh_flags;
+ __u32 lgd_index;
+ __u32 lgd_saved_index;
+ __u32 lgd_len;
+ __u64 lgd_cur_offset;
+} __attribute__((packed));
+
+struct llogd_conn_body {
+ struct llog_gen lgdc_gen;
+ struct llog_logid lgdc_logid;
+ __u32 lgdc_ctxt_idx;
+} __attribute__((packed));
+
+/* Note: 64-bit types are 64-bit aligned in structure */
+struct obdo {
+ obd_valid o_valid; /* hot fields in this obdo */
+ struct ost_id o_oi;
+ obd_id o_parent_seq;
+ obd_size o_size; /* o_size-o_blocks == ost_lvb */
+ obd_time o_mtime;
+ obd_time o_atime;
+ obd_time o_ctime;
+ obd_blocks o_blocks; /* brw: cli sent cached bytes */
+ obd_size o_grant;
+
+ /* 32-bit fields start here: keep an even number of them via padding */
+ obd_blksize o_blksize; /* optimal IO blocksize */
+ obd_mode o_mode; /* brw: cli sent cache remain */
+ obd_uid o_uid;
+ obd_gid o_gid;
+ obd_flag o_flags;
+ obd_count o_nlink; /* brw: checksum */
+ obd_count o_parent_oid;
+ obd_count o_misc; /* brw: o_dropped */
+
+ __u64 o_ioepoch; /* epoch in ost writes */
+ __u32 o_stripe_idx; /* holds stripe idx */
+ __u32 o_parent_ver;
+ struct lustre_handle o_handle; /* brw: lock handle to prolong
+ * locks */
+ struct llog_cookie o_lcookie; /* destroy: unlink cookie from
+ * MDS */
+ __u32 o_uid_h;
+ __u32 o_gid_h;
+
+ __u64 o_data_version; /* getattr: sum of iversion for
+ * each stripe.
+ * brw: grant space consumed on
+ * the client for the write */
+ __u64 o_padding_4;
+ __u64 o_padding_5;
+ __u64 o_padding_6;
+};
+
+#define o_dirty o_blocks
+#define o_undirty o_mode
+#define o_dropped o_misc
+#define o_cksum o_nlink
+#define o_grant_used o_data_version
+
+static inline void lustre_set_wire_obdo(struct obd_connect_data *ocd,
+ struct obdo *wobdo, struct obdo *lobdo)
+{
+ memcpy(wobdo, lobdo, sizeof(*lobdo));
+ wobdo->o_flags &= ~OBD_FL_LOCAL_MASK;
+ if (ocd == NULL)
+ return;
+
+ if (unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) &&
+ fid_seq_is_echo(ostid_seq(&lobdo->o_oi))) {
+		/* Currently OBD_FL_OSTID will only be used when a 2.4 echo
+		 * client communicates with a pre-2.4 server */
+ wobdo->o_oi.oi.oi_id = fid_oid(&lobdo->o_oi.oi_fid);
+ wobdo->o_oi.oi.oi_seq = fid_seq(&lobdo->o_oi.oi_fid);
+ }
+}
+
+static inline void lustre_get_wire_obdo(struct obd_connect_data *ocd,
+ struct obdo *lobdo, struct obdo *wobdo)
+{
+ obd_flag local_flags = 0;
+
+ if (lobdo->o_valid & OBD_MD_FLFLAGS)
+ local_flags = lobdo->o_flags & OBD_FL_LOCAL_MASK;
+
+ LASSERT(!(wobdo->o_flags & OBD_FL_LOCAL_MASK));
+
+ memcpy(lobdo, wobdo, sizeof(*lobdo));
+ if (local_flags != 0) {
+ lobdo->o_valid |= OBD_MD_FLFLAGS;
+ lobdo->o_flags &= ~OBD_FL_LOCAL_MASK;
+ lobdo->o_flags |= local_flags;
+ }
+ if (ocd == NULL)
+ return;
+
+ if (unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) &&
+ fid_seq_is_echo(wobdo->o_oi.oi.oi_seq)) {
+ /* see above */
+ lobdo->o_oi.oi_fid.f_seq = wobdo->o_oi.oi.oi_seq;
+ lobdo->o_oi.oi_fid.f_oid = wobdo->o_oi.oi.oi_id;
+ lobdo->o_oi.oi_fid.f_ver = 0;
+ }
+}
+
+extern void lustre_swab_obdo (struct obdo *o);
+
+/* request structure for OST's */
+struct ost_body {
+ struct obdo oa;
+};
+
+/* Key for FIEMAP to be used in get_info calls */
+struct ll_fiemap_info_key {
+ char name[8];
+ struct obdo oa;
+ struct ll_user_fiemap fiemap;
+};
+
+extern void lustre_swab_ost_body (struct ost_body *b);
+extern void lustre_swab_ost_last_id(obd_id *id);
+extern void lustre_swab_fiemap(struct ll_user_fiemap *fiemap);
+
+extern void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum);
+extern void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum);
+extern void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod,
+ int stripe_count);
+extern void lustre_swab_lov_mds_md(struct lov_mds_md *lmm);
+
+/* llog_swab.c */
+extern void lustre_swab_llogd_body (struct llogd_body *d);
+extern void lustre_swab_llog_hdr (struct llog_log_hdr *h);
+extern void lustre_swab_llogd_conn_body (struct llogd_conn_body *d);
+extern void lustre_swab_llog_rec(struct llog_rec_hdr *rec);
+extern void lustre_swab_llog_id(struct llog_logid *lid);
+
+struct lustre_cfg;
+extern void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg);
+
+/* Functions for dumping PTLRPC fields */
+void dump_rniobuf(struct niobuf_remote *rnb);
+void dump_ioo(struct obd_ioobj *nb);
+void dump_obdo(struct obdo *oa);
+void dump_ost_body(struct ost_body *ob);
+void dump_rcs(__u32 *rc);
+
+#define IDX_INFO_MAGIC 0x3D37CC37
+
+/* Index file transfer through the network. The server serializes the index into
+ * a byte stream which is sent to the client via a bulk transfer */
+struct idx_info {
+ __u32 ii_magic;
+
+ /* reply: see idx_info_flags below */
+ __u32 ii_flags;
+
+ /* request & reply: number of lu_idxpage (to be) transferred */
+ __u16 ii_count;
+ __u16 ii_pad0;
+
+ /* request: requested attributes passed down to the iterator API */
+ __u32 ii_attrs;
+
+ /* request & reply: index file identifier (FID) */
+ struct lu_fid ii_fid;
+
+ /* reply: version of the index file before starting to walk the index.
+ * Please note that the version can be modified at any time during the
+ * transfer */
+ __u64 ii_version;
+
+	/* request: hash to start with;
+	 * reply: hash of the first entry of the first lu_idxpage and hash
+	 *	  of the entry to read next, if any */
+ __u64 ii_hash_start;
+ __u64 ii_hash_end;
+
+	/* reply: size of keys in lu_idxpages, or the minimal key size if
+	 * II_FL_VARKEY is set */
+ __u16 ii_keysize;
+
+	/* reply: size of records in lu_idxpages, or the minimal record size
+	 * if II_FL_VARREC is set */
+ __u16 ii_recsize;
+
+ __u32 ii_pad1;
+ __u64 ii_pad2;
+ __u64 ii_pad3;
+};
+extern void lustre_swab_idx_info(struct idx_info *ii);
+
+#define II_END_OFF MDS_DIR_END_OFF /* all entries have been read */
+
+/* List of flags used in idx_info::ii_flags */
+enum idx_info_flags {
+ II_FL_NOHASH = 1 << 0, /* client doesn't care about hash value */
+ II_FL_VARKEY = 1 << 1, /* keys can be of variable size */
+ II_FL_VARREC = 1 << 2, /* records can be of variable size */
+ II_FL_NONUNQ = 1 << 3, /* index supports non-unique keys */
+};
+
+#define LIP_MAGIC 0x8A6D6B6C
+
+/* 4KB (= LU_PAGE_SIZE) container gathering key/record pairs */
+struct lu_idxpage {
+ /* 16-byte header */
+ __u32 lip_magic;
+ __u16 lip_flags;
+ __u16 lip_nr; /* number of entries in the container */
+ __u64 lip_pad0; /* additional padding for future use */
+
+	/* key/record pairs are stored in the remaining 4080 bytes.
+	 * Depending upon the flags in idx_info::ii_flags, each key/record
+	 * pair might be preceded by:
+	 * - a hash value
+	 * - the key size (if II_FL_VARKEY is set)
+	 * - the record size (if II_FL_VARREC is set)
+	 *
+	 * For the time being, only fixed-size keys & records are supported. */
+ char lip_entries[0];
+};
+extern void lustre_swab_lip_header(struct lu_idxpage *lip);
+
+#define LIP_HDR_SIZE (offsetof(struct lu_idxpage, lip_entries))
+
+/* Gather all possible type associated with a 4KB container */
+union lu_page {
+ struct lu_dirpage lp_dir; /* for MDS_READPAGE */
+ struct lu_idxpage lp_idx; /* for OBD_IDX_READ */
+ char lp_array[LU_PAGE_SIZE];
+};
+
+/* security opcodes */
+typedef enum {
+ SEC_CTX_INIT = 801,
+ SEC_CTX_INIT_CONT = 802,
+ SEC_CTX_FINI = 803,
+ SEC_LAST_OPC,
+ SEC_FIRST_OPC = SEC_CTX_INIT
+} sec_cmd_t;
+
+/*
+ * capa related definitions
+ */
+#define CAPA_HMAC_MAX_LEN 64
+#define CAPA_HMAC_KEY_MAX_LEN 56
+
+/* NB take care when changing the sequence of elements in this struct,
+ * because the offset info is used in find_capa() */
+struct lustre_capa {
+ struct lu_fid lc_fid; /** fid */
+ __u64 lc_opc; /** operations allowed */
+ __u64 lc_uid; /** file owner */
+ __u64 lc_gid; /** file group */
+ __u32 lc_flags; /** HMAC algorithm & flags */
+ __u32 lc_keyid; /** key# used for the capability */
+ __u32 lc_timeout; /** capa timeout value (sec) */
+ __u32 lc_expiry; /** expiry time (sec) */
+ __u8 lc_hmac[CAPA_HMAC_MAX_LEN]; /** HMAC */
+} __attribute__((packed));
+
+extern void lustre_swab_lustre_capa(struct lustre_capa *c);
+
+/** lustre_capa::lc_opc */
+enum {
+ CAPA_OPC_BODY_WRITE = 1<<0, /**< write object data */
+ CAPA_OPC_BODY_READ = 1<<1, /**< read object data */
+ CAPA_OPC_INDEX_LOOKUP = 1<<2, /**< lookup object fid */
+ CAPA_OPC_INDEX_INSERT = 1<<3, /**< insert object fid */
+ CAPA_OPC_INDEX_DELETE = 1<<4, /**< delete object fid */
+ CAPA_OPC_OSS_WRITE = 1<<5, /**< write oss object data */
+ CAPA_OPC_OSS_READ = 1<<6, /**< read oss object data */
+ CAPA_OPC_OSS_TRUNC = 1<<7, /**< truncate oss object */
+ CAPA_OPC_OSS_DESTROY = 1<<8, /**< destroy oss object */
+ CAPA_OPC_META_WRITE = 1<<9, /**< write object meta data */
+ CAPA_OPC_META_READ = 1<<10, /**< read object meta data */
+};
+
+#define CAPA_OPC_OSS_RW (CAPA_OPC_OSS_READ | CAPA_OPC_OSS_WRITE)
+#define CAPA_OPC_MDS_ONLY \
+ (CAPA_OPC_BODY_WRITE | CAPA_OPC_BODY_READ | CAPA_OPC_INDEX_LOOKUP | \
+ CAPA_OPC_INDEX_INSERT | CAPA_OPC_INDEX_DELETE)
+#define CAPA_OPC_OSS_ONLY \
+ (CAPA_OPC_OSS_WRITE | CAPA_OPC_OSS_READ | CAPA_OPC_OSS_TRUNC | \
+ CAPA_OPC_OSS_DESTROY)
+#define CAPA_OPC_MDS_DEFAULT ~CAPA_OPC_OSS_ONLY
+#define CAPA_OPC_OSS_DEFAULT ~(CAPA_OPC_MDS_ONLY | CAPA_OPC_OSS_ONLY)
+
+/* An MDS capability covers object capabilities for body r/w operations
+ * (dir readpage/sendpage), index lookup/insert/delete and metadata r/w,
+ * while an OSS capability only covers object capabilities for OSS data
+ * (file content) r/w/truncate operations.
+ */
+static inline int capa_for_mds(struct lustre_capa *c)
+{
+ return (c->lc_opc & CAPA_OPC_INDEX_LOOKUP) != 0;
+}
+
+static inline int capa_for_oss(struct lustre_capa *c)
+{
+ return (c->lc_opc & CAPA_OPC_INDEX_LOOKUP) == 0;
+}
+
+/* lustre_capa::lc_hmac_alg */
+enum {
+ CAPA_HMAC_ALG_SHA1 = 1, /**< sha1 algorithm */
+ CAPA_HMAC_ALG_MAX,
+};
+
+#define CAPA_FL_MASK 0x00ffffff
+#define CAPA_HMAC_ALG_MASK 0xff000000
+
+struct lustre_capa_key {
+ __u64 lk_seq; /**< mds# */
+ __u32 lk_keyid; /**< key# */
+ __u32 lk_padding;
+ __u8 lk_key[CAPA_HMAC_KEY_MAX_LEN]; /**< key */
+} __attribute__((packed));
+
+extern void lustre_swab_lustre_capa_key(struct lustre_capa_key *k);
+
+/** The link ea holds 1 \a link_ea_entry for each hardlink */
+#define LINK_EA_MAGIC 0x11EAF1DFUL
+struct link_ea_header {
+ __u32 leh_magic;
+ __u32 leh_reccount;
+ __u64 leh_len; /* total size */
+ /* future use */
+ __u32 padding1;
+ __u32 padding2;
+};
+
+/** Hardlink data is name and parent fid.
+ * Stored in this crazy struct for maximum packing and endian-neutrality
+ */
+struct link_ea_entry {
+ /** __u16 stored big-endian, unaligned */
+ unsigned char lee_reclen[2];
+ unsigned char lee_parent_fid[sizeof(struct lu_fid)];
+ char lee_name[0];
+}__attribute__((packed));
+
+/** fid2path request/reply structure */
+struct getinfo_fid2path {
+ struct lu_fid gf_fid;
+ __u64 gf_recno;
+ __u32 gf_linkno;
+ __u32 gf_pathlen;
+ char gf_path[0];
+} __attribute__((packed));
+
+void lustre_swab_fid2path (struct getinfo_fid2path *gf);
+
+enum {
+ LAYOUT_INTENT_ACCESS = 0,
+ LAYOUT_INTENT_READ = 1,
+ LAYOUT_INTENT_WRITE = 2,
+ LAYOUT_INTENT_GLIMPSE = 3,
+ LAYOUT_INTENT_TRUNC = 4,
+ LAYOUT_INTENT_RELEASE = 5,
+ LAYOUT_INTENT_RESTORE = 6
+};
+
+/* enqueue layout lock with intent */
+struct layout_intent {
+ __u32 li_opc; /* intent operation for enqueue, read, write etc */
+ __u32 li_flags;
+ __u64 li_start;
+ __u64 li_end;
+};
+
+void lustre_swab_layout_intent(struct layout_intent *li);
+
+/**
+ * On the wire version of hsm_progress structure.
+ *
+ * Contains the userspace hsm_progress and some internal fields.
+ */
+struct hsm_progress_kernel {
+ /* Field taken from struct hsm_progress */
+ lustre_fid hpk_fid;
+ __u64 hpk_cookie;
+ struct hsm_extent hpk_extent;
+ __u16 hpk_flags;
+ __u16 hpk_errval; /* positive val */
+ __u32 hpk_padding1;
+ /* Additional fields */
+ __u64 hpk_data_version;
+ __u64 hpk_padding2;
+} __attribute__((packed));
+
+extern void lustre_swab_hsm_user_state(struct hsm_user_state *hus);
+extern void lustre_swab_hsm_current_action(struct hsm_current_action *action);
+extern void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk);
+extern void lustre_swab_hsm_user_state(struct hsm_user_state *hus);
+extern void lustre_swab_hsm_user_item(struct hsm_user_item *hui);
+extern void lustre_swab_hsm_request(struct hsm_request *hr);
+
+/**
+ * These are the object update opcodes under UPDATE_OBJ, currently used
+ * by cross-ref operations between MDTs.
+ *
+ * During the cross-ref operation, the Master MDT, which the client sends
+ * the request to, will disassemble the operation into object updates; the
+ * OSP will then send these updates to the remote MDT to be executed.
+ *
+ * Update request format
+ * magic: UPDATE_BUFFER_MAGIC_V1
+ * Count: How many updates in the req.
+ * bufs[0] : following are packets of object.
+ * update[0]:
+ * type: object_update_op, the op code of update
+ * fid: The object fid of the update.
+ * lens/bufs: other parameters of the update.
+ * update[1]:
+ * type: object_update_op, the op code of update
+ * fid: The object fid of the update.
+ * lens/bufs: other parameters of the update.
+ * ..........
+ * update[7]: type: object_update_op, the op code of update
+ * fid: The object fid of the update.
+ * lens/bufs: other parameters of the update.
+ * Currently a maximum of 8 updates per object update request.
+ *
+ *******************************************************************
+ * update reply format:
+ *
+ * ur_version: UPDATE_REPLY_V1
+ * ur_count: The count of the reply, which is usually equal
+ * to the number of updates in the request.
+ * ur_lens: The reply lengths of each object update.
+ *
+ * replies: 1st update reply [4bytes_ret: other body]
+ * 2nd update reply [4bytes_ret: other body]
+ * .....
+ * nth update reply [4bytes_ret: other body]
+ *
+ * For each update reply, the format is:
+ * result (4 bytes) : other body
+ */
+
+#define UPDATE_MAX_OPS 10
+#define UPDATE_BUFFER_MAGIC_V1 0xBDDE0001
+#define UPDATE_BUFFER_MAGIC UPDATE_BUFFER_MAGIC_V1
+#define UPDATE_BUF_COUNT 8
+enum object_update_op {
+ OBJ_CREATE = 1,
+ OBJ_DESTROY = 2,
+ OBJ_REF_ADD = 3,
+ OBJ_REF_DEL = 4,
+ OBJ_ATTR_SET = 5,
+ OBJ_ATTR_GET = 6,
+ OBJ_XATTR_SET = 7,
+ OBJ_XATTR_GET = 8,
+ OBJ_INDEX_LOOKUP = 9,
+ OBJ_INDEX_INSERT = 10,
+ OBJ_INDEX_DELETE = 11,
+ OBJ_LAST
+};
+
+struct update {
+ __u32 u_type;
+ __u32 u_batchid;
+ struct lu_fid u_fid;
+ __u32 u_lens[UPDATE_BUF_COUNT];
+ __u32 u_bufs[0];
+};
+
+struct update_buf {
+ __u32 ub_magic;
+ __u32 ub_count;
+ __u32 ub_bufs[0];
+};
+
+#define UPDATE_REPLY_V1 0x00BD0001
+struct update_reply {
+ __u32 ur_version;
+ __u32 ur_count;
+ __u32 ur_lens[0];
+};
+
+void lustre_swab_update_buf(struct update_buf *ub);
+void lustre_swab_update_reply_buf(struct update_reply *ur);
+
+/** layout swap request structure
+ * fid1 and fid2 are in mdt_body
+ */
+struct mdc_swap_layouts {
+ __u64 msl_flags;
+} __packed;
+
+void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl);
+
+#endif
+/** @} lustreidl */
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h b/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h
new file mode 100644
index 000000000000..1c87a61a7fc1
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h
@@ -0,0 +1,95 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License version 2 for more details. A copy is
+ * included in the COPYING file that accompanied this code.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * lustre/include/lustre/lustre_lfsck_user.h
+ *
+ * Lustre LFSCK userspace interfaces.
+ *
+ * Author: Fan Yong <yong.fan@whamcloud.com>
+ */
+
+#ifndef _LUSTRE_LFSCK_USER_H
+# define _LUSTRE_LFSCK_USER_H
+
+enum lfsck_param_flags {
+ /* Reset LFSCK iterator position to the device beginning. */
+ LPF_RESET = 0x0001,
+
+	/* Exit on failure. */
+ LPF_FAILOUT = 0x0002,
+
+ /* Dryrun mode, only check without modification */
+ LPF_DRYRUN = 0x0004,
+};
+
+enum lfsck_type {
+ /* For MDT-OST consistency check/repair. */
+ LT_LAYOUT = 0x0001,
+
+ /* For MDT-MDT consistency check/repair. */
+ LT_DNE = 0x0002,
+
+ /* For FID-in-dirent and linkEA consistency check/repair. */
+ LT_NAMESPACE = 0x0004,
+};
+
+#define LFSCK_VERSION_V1 1
+#define LFSCK_VERSION_V2 2
+
+#define LFSCK_TYPES_ALL ((__u16)(~0))
+#define LFSCK_TYPES_DEF ((__u16)0)
+#define LFSCK_TYPES_SUPPORTED LT_NAMESPACE
+
+#define LFSCK_SPEED_NO_LIMIT 0
+#define LFSCK_SPEED_LIMIT_DEF LFSCK_SPEED_NO_LIMIT
+
+enum lfsck_start_valid {
+ LSV_SPEED_LIMIT = 0x00000001,
+ LSV_ERROR_HANDLE = 0x00000002,
+ LSV_DRYRUN = 0x00000004,
+};
+
+/* Arguments for starting lfsck. */
+struct lfsck_start {
+ /* Which arguments are valid, see 'enum lfsck_start_valid'. */
+ __u32 ls_valid;
+
+ /* How many items can be scanned at most per second. */
+ __u32 ls_speed_limit;
+
+ /* For compatibility between user space tools and kernel service. */
+ __u16 ls_version;
+
+	/* Which LFSCK components are to be (or have been) started. */
+ __u16 ls_active;
+
+ /* Flags for the LFSCK, see 'enum lfsck_param_flags'. */
+ __u16 ls_flags;
+
+	/* For 64-bit alignment. */
+ __u16 ls_padding;
+};
+
+#endif /* _LUSTRE_LFSCK_USER_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_user.h b/drivers/staging/lustre/lustre/include/lustre/lustre_user.h
new file mode 100644
index 000000000000..7e9f57507f04
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/lustre_user.h
@@ -0,0 +1,1145 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/lustre_user.h
+ *
+ * Lustre public user-space interface definitions.
+ */
+
+#ifndef _LUSTRE_USER_H
+#define _LUSTRE_USER_H
+
+/** \defgroup lustreuser lustreuser
+ *
+ * @{
+ */
+
+#include <lustre/ll_fiemap.h>
+#include <linux/lustre_user.h>
+
+/* for statfs() */
+#define LL_SUPER_MAGIC 0x0BD00BD0
+
+#ifndef FSFILT_IOC_GETFLAGS
+#define FSFILT_IOC_GETFLAGS _IOR('f', 1, long)
+#define FSFILT_IOC_SETFLAGS _IOW('f', 2, long)
+#define FSFILT_IOC_GETVERSION _IOR('f', 3, long)
+#define FSFILT_IOC_SETVERSION _IOW('f', 4, long)
+#define FSFILT_IOC_GETVERSION_OLD _IOR('v', 1, long)
+#define FSFILT_IOC_SETVERSION_OLD _IOW('v', 2, long)
+#define FSFILT_IOC_FIEMAP _IOWR('f', 11, struct ll_user_fiemap)
+#endif
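+
+/*
+ * Illustrative userspace sketch (hypothetical caller; fd is assumed to be
+ * an open file descriptor on a Lustre file): reading the attribute flags
+ * via the ioctl defined above.
+ *
+ *	long flags;
+ *
+ *	if (ioctl(fd, FSFILT_IOC_GETFLAGS, &flags) == 0)
+ *		printf("flags: %#lx\n", flags);
+ */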
+
+/* FIEMAP flags supported by Lustre */
+#define LUSTRE_FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_DEVICE_ORDER)
+
+enum obd_statfs_state {
+ OS_STATE_DEGRADED = 0x00000001, /**< RAID degraded/rebuilding */
+ OS_STATE_READONLY = 0x00000002, /**< filesystem is read-only */
+ OS_STATE_RDONLY_1 = 0x00000004, /**< obsolete 1.6, was EROFS=30 */
+ OS_STATE_RDONLY_2 = 0x00000008, /**< obsolete 1.6, was EROFS=30 */
+ OS_STATE_RDONLY_3 = 0x00000010, /**< obsolete 1.6, was EROFS=30 */
+};
+
+struct obd_statfs {
+ __u64 os_type;
+ __u64 os_blocks;
+ __u64 os_bfree;
+ __u64 os_bavail;
+ __u64 os_files;
+ __u64 os_ffree;
+ __u8 os_fsid[40];
+ __u32 os_bsize;
+ __u32 os_namelen;
+ __u64 os_maxbytes;
+ __u32 os_state; /**< obd_statfs_state OS_STATE_* flag */
+ __u32 os_fprecreated; /* objs available now to the caller */
+ /* used in QoS code to find preferred
+ * OSTs */
+ __u32 os_spare2;
+ __u32 os_spare3;
+ __u32 os_spare4;
+ __u32 os_spare5;
+ __u32 os_spare6;
+ __u32 os_spare7;
+ __u32 os_spare8;
+ __u32 os_spare9;
+};
+
+/**
+ * File IDentifier.
+ *
+ * FID is a cluster-wide unique identifier of a file or an object (stripe).
+ * FIDs are never reused.
+ **/
+struct lu_fid {
+ /**
+ * FID sequence. Sequence is a unit of migration: all files (objects)
+ * with FIDs from a given sequence are stored on the same server.
+ * Lustre should support 2^64 objects, so even if each sequence
+ * has only a single object we can still enumerate 2^64 objects.
+ **/
+ __u64 f_seq;
+ /* FID number within sequence. */
+ __u32 f_oid;
+ /**
+ * FID version, used to distinguish different versions (in the sense
+ * of snapshots, etc.) of the same file system object. Not currently
+ * used.
+ **/
+ __u32 f_ver;
+};
+
+struct filter_fid {
+ struct lu_fid ff_parent; /* ff_parent.f_ver == file stripe number */
+};
+
+/* keep this one for compatibility */
+struct filter_fid_old {
+ struct lu_fid ff_parent;
+ __u64 ff_objid;
+ __u64 ff_seq;
+};
+
+/* Userspace should treat lu_fid as opaque, and only use the following methods
+ * to print or parse them. Other functions (e.g. compare, swab) could be moved
+ * here from lustre_idl.h if needed. */
+typedef struct lu_fid lustre_fid;
+
+/**
+ * The following struct holds object attributes that will be kept in the
+ * inode's EA. Introduced in the 2.0 release (see b15993 for details).
+ * Added to all objects since Lustre 2.4, as it contains the self FID.
+ */
+struct lustre_mdt_attrs {
+ /**
+ * Bitfield for supported data in this structure. From enum lma_compat.
+ * lma_self_fid and lma_flags are always available.
+ */
+ __u32 lma_compat;
+ /**
+ * Per-file incompat feature list. Lustre version should support all
+ * flags set in this field. The supported feature mask is available in
+ * LMA_INCOMPAT_SUPP.
+ */
+ __u32 lma_incompat;
+ /** FID of this inode */
+ struct lu_fid lma_self_fid;
+};
+
+/**
+ * Prior to 2.4, the LMA structure also included SOM attributes, which have
+ * since been moved to a dedicated xattr.
+ * lma_flags was also removed, since the lma_compat/incompat fields made it
+ * redundant.
+ */
+#define LMA_OLD_SIZE (sizeof(struct lustre_mdt_attrs) + 5 * sizeof(__u64))
+
+/**
+ * OST object IDentifier.
+ */
+struct ost_id {
+ union {
+ struct ostid {
+ __u64 oi_id;
+ __u64 oi_seq;
+ } oi;
+ struct lu_fid oi_fid;
+ };
+};
+
+#define DOSTID LPX64":"LPU64
+#define POSTID(oi) ostid_seq(oi), ostid_id(oi)
+
+/*
+ * The ioctl naming rules:
+ * LL_* - works on the currently opened filehandle instead of parent dir
+ * *_OBD_* - gets data for both OSC or MDC (LOV, LMV indirectly)
+ * *_MDC_* - gets/sets data related to MDC
+ * *_LOV_* - gets/sets data related to OSC/LOV
+ * *FILE* - called on parent dir and passes in a filename
+ * *STRIPE* - set/get lov_user_md
+ * *INFO - set/get lov_user_mds_data
+ */
+/* see <lustre_lib.h> for ioctl numbers 101-150 */
+#define LL_IOC_GETFLAGS _IOR ('f', 151, long)
+#define LL_IOC_SETFLAGS _IOW ('f', 152, long)
+#define LL_IOC_CLRFLAGS _IOW ('f', 153, long)
+/* LL_IOC_LOV_SETSTRIPE: See also OBD_IOC_LOV_SETSTRIPE */
+#define LL_IOC_LOV_SETSTRIPE _IOW ('f', 154, long)
+/* LL_IOC_LOV_GETSTRIPE: See also OBD_IOC_LOV_GETSTRIPE */
+#define LL_IOC_LOV_GETSTRIPE _IOW ('f', 155, long)
+/* LL_IOC_LOV_SETEA: See also OBD_IOC_LOV_SETEA */
+#define LL_IOC_LOV_SETEA _IOW ('f', 156, long)
+#define LL_IOC_RECREATE_OBJ _IOW ('f', 157, long)
+#define LL_IOC_RECREATE_FID _IOW ('f', 157, struct lu_fid)
+#define LL_IOC_GROUP_LOCK _IOW ('f', 158, long)
+#define LL_IOC_GROUP_UNLOCK _IOW ('f', 159, long)
+/* LL_IOC_QUOTACHECK: See also OBD_IOC_QUOTACHECK */
+#define LL_IOC_QUOTACHECK _IOW ('f', 160, int)
+/* LL_IOC_POLL_QUOTACHECK: See also OBD_IOC_POLL_QUOTACHECK */
+#define LL_IOC_POLL_QUOTACHECK _IOR ('f', 161, struct if_quotacheck *)
+/* LL_IOC_QUOTACTL: See also OBD_IOC_QUOTACTL */
+#define LL_IOC_QUOTACTL _IOWR('f', 162, struct if_quotactl)
+#define IOC_OBD_STATFS _IOWR('f', 164, struct obd_statfs *)
+#define IOC_LOV_GETINFO _IOWR('f', 165, struct lov_user_mds_data *)
+#define LL_IOC_FLUSHCTX _IOW ('f', 166, long)
+#define LL_IOC_RMTACL _IOW ('f', 167, long)
+#define LL_IOC_GETOBDCOUNT _IOR ('f', 168, long)
+#define LL_IOC_LLOOP_ATTACH _IOWR('f', 169, long)
+#define LL_IOC_LLOOP_DETACH _IOWR('f', 170, long)
+#define LL_IOC_LLOOP_INFO _IOWR('f', 171, struct lu_fid)
+#define LL_IOC_LLOOP_DETACH_BYDEV _IOWR('f', 172, long)
+#define LL_IOC_PATH2FID _IOR ('f', 173, long)
+#define LL_IOC_GET_CONNECT_FLAGS _IOWR('f', 174, __u64 *)
+#define LL_IOC_GET_MDTIDX _IOR ('f', 175, int)
+
+/* see <lustre_lib.h> for ioctl numbers 177-210 */
+
+#define LL_IOC_HSM_STATE_GET _IOR('f', 211, struct hsm_user_state)
+#define LL_IOC_HSM_STATE_SET _IOW('f', 212, struct hsm_state_set)
+#define LL_IOC_HSM_CT_START _IOW('f', 213, struct lustre_kernelcomm)
+#define LL_IOC_HSM_COPY_START _IOW('f', 214, struct hsm_copy *)
+#define LL_IOC_HSM_COPY_END _IOW('f', 215, struct hsm_copy *)
+#define LL_IOC_HSM_PROGRESS _IOW('f', 216, struct hsm_user_request)
+#define LL_IOC_HSM_REQUEST _IOW('f', 217, struct hsm_user_request)
+#define LL_IOC_DATA_VERSION _IOR('f', 218, struct ioc_data_version)
+#define LL_IOC_LOV_SWAP_LAYOUTS _IOW('f', 219, \
+ struct lustre_swap_layouts)
+#define LL_IOC_HSM_ACTION _IOR('f', 220, \
+ struct hsm_current_action)
+/* see <lustre_lib.h> for ioctl numbers 221-232 */
+
+#define LL_IOC_LMV_SETSTRIPE _IOWR('f', 240, struct lmv_user_md)
+#define LL_IOC_LMV_GETSTRIPE _IOWR('f', 241, struct lmv_user_md)
+#define LL_IOC_REMOVE_ENTRY _IOWR('f', 242, __u64)
+
+#define LL_STATFS_LMV 1
+#define LL_STATFS_LOV 2
+#define LL_STATFS_NODELAY 4
+
+#define IOC_MDC_TYPE 'i'
+#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *)
+#define IOC_MDC_GETFILESTRIPE _IOWR(IOC_MDC_TYPE, 21, struct lov_user_md *)
+#define IOC_MDC_GETFILEINFO _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data *)
+#define LL_IOC_MDC_GETINFO _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data *)
+
+/* Keep these for backward compatibility. */
+#define LL_IOC_OBD_STATFS IOC_OBD_STATFS
+#define IOC_MDC_GETSTRIPE IOC_MDC_GETFILESTRIPE
+
+
+#define MAX_OBD_NAME 128 /* If this changes, a NEW ioctl must be added */
+
+/* Hopefully O_LOV_DELAY_CREATE does not conflict with standard O_xxx flags.
+ * It was previously defined as 0100000000, which conflicts with FMODE_NONOTIFY
+ * (added in kernel 2.6.36), so we redefined it as 020000000. To remain
+ * compatible with statically linked binaries built against the old value,
+ * we finally define it as (020000000 | 0100000000).
+ */
+#define O_LOV_DELAY_CREATE 0120000000
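+
+/*
+ * Illustrative sketch (not part of the ABI): delaying object creation so a
+ * layout can be set explicitly before the first write. "path" and "lum" are
+ * hypothetical; lum would be a struct lov_user_md filled in by the caller:
+ *
+ *	int fd = open(path, O_CREAT | O_WRONLY | O_LOV_DELAY_CREATE, 0644);
+ *
+ *	if (fd >= 0 && ioctl(fd, LL_IOC_LOV_SETSTRIPE, &lum) < 0)
+ *		perror("LL_IOC_LOV_SETSTRIPE");
+ */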
+
+#define LL_FILE_IGNORE_LOCK 0x00000001
+#define LL_FILE_GROUP_LOCKED 0x00000002
+#define LL_FILE_READAHEA 0x00000004
+#define LL_FILE_LOCKED_DIRECTIO 0x00000008 /* client-side locks with dio */
+#define LL_FILE_LOCKLESS_IO 0x00000010 /* server-side locks with cio */
+#define LL_FILE_RMTACL 0x00000020
+
+#define LOV_USER_MAGIC_V1 0x0BD10BD0
+#define LOV_USER_MAGIC LOV_USER_MAGIC_V1
+#define LOV_USER_MAGIC_JOIN_V1 0x0BD20BD0
+#define LOV_USER_MAGIC_V3 0x0BD30BD0
+
+#define LMV_MAGIC_V1 0x0CD10CD0 /*normal stripe lmv magic */
+#define LMV_USER_MAGIC 0x0CD20CD0 /*default lmv magic*/
+
+#define LOV_PATTERN_RAID0 0x001
+#define LOV_PATTERN_RAID1 0x002
+#define LOV_PATTERN_FIRST 0x100
+
+#define LOV_MAXPOOLNAME 16
+#define LOV_POOLNAMEF "%.16s"
+
+#define LOV_MIN_STRIPE_BITS 16 /* maximum PAGE_SIZE (ia64), power of 2 */
+#define LOV_MIN_STRIPE_SIZE (1 << LOV_MIN_STRIPE_BITS)
+#define LOV_MAX_STRIPE_COUNT_OLD 160
+/* This calculation is crafted so that an input of 4096 results in 160,
+ * which in turn equals the old maximal stripe count.
+ * XXX: In fact this is too simplified for now; what it also needs is an
+ * ea_type argument to clearly know how much space each stripe consumes.
+ *
+ * The limit of 12 pages is somewhat arbitrary, but is a reasonably large
+ * allocation that is sufficient for the current generation of systems.
+ *
+ * (max buffer size - lov+rpc header) / sizeof(struct lov_ost_data_v1) */
+#define LOV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */
+#define LOV_ALL_STRIPES 0xffff /* only valid for directories */
+#define LOV_V1_INSANE_STRIPE_COUNT 65532 /* maximum stripe count bz13933 */
+
+#define lov_user_ost_data lov_user_ost_data_v1
+struct lov_user_ost_data_v1 { /* per-stripe data structure */
+ struct ost_id l_ost_oi; /* OST object ID */
+ __u32 l_ost_gen; /* generation of this OST index */
+ __u32 l_ost_idx; /* OST index in LOV */
+} __attribute__((packed));
+
+#define lov_user_md lov_user_md_v1
+struct lov_user_md_v1 { /* LOV EA user data (host-endian) */
+ __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V1 */
+ __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+ struct ost_id lmm_oi; /* LOV object ID */
+ __u32 lmm_stripe_size; /* size of stripe in bytes */
+ __u16 lmm_stripe_count; /* num stripes in use for this object */
+ union {
+ __u16 lmm_stripe_offset; /* starting stripe offset in
+ * lmm_objects, use when writing */
+ __u16 lmm_layout_gen; /* layout generation number
+ * used when reading */
+ };
+ struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+} __attribute__((packed, __may_alias__));
+
+struct lov_user_md_v3 { /* LOV EA user data (host-endian) */
+ __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V3 */
+ __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+ struct ost_id lmm_oi; /* LOV object ID */
+ __u32 lmm_stripe_size; /* size of stripe in bytes */
+ __u16 lmm_stripe_count; /* num stripes in use for this object */
+ union {
+ __u16 lmm_stripe_offset; /* starting stripe offset in
+ * lmm_objects, use when writing */
+ __u16 lmm_layout_gen; /* layout generation number
+ * used when reading */
+ };
+ char lmm_pool_name[LOV_MAXPOOLNAME]; /* pool name */
+ struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+} __attribute__((packed));
+
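+/*
+ * Illustrative sketch (not part of this header): fetching the striping EA
+ * through LL_IOC_LOV_GETSTRIPE. "fd" is assumed to be open on a Lustre file;
+ * the buffer must be large enough for the maximum stripe count:
+ *
+ *	struct lov_user_md_v1 *lum;
+ *	size_t len = sizeof(*lum) +
+ *		     LOV_MAX_STRIPE_COUNT * sizeof(struct lov_user_ost_data_v1);
+ *
+ *	lum = malloc(len);
+ *	lum->lmm_magic = LOV_USER_MAGIC_V1;
+ *	lum->lmm_stripe_count = LOV_MAX_STRIPE_COUNT;
+ *	if (ioctl(fd, LL_IOC_LOV_GETSTRIPE, lum) < 0)
+ *		perror("LL_IOC_LOV_GETSTRIPE");
+ */
+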
+/* Compile with -D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to
+ * use this. It is unsafe to #define those values in this header as it
+ * is possible the application has already #included <sys/stat.h>. */
+#ifdef HAVE_LOV_USER_MDS_DATA
+#define lov_user_mds_data lov_user_mds_data_v1
+struct lov_user_mds_data_v1 {
+ lstat_t lmd_st; /* MDS stat struct */
+ struct lov_user_md_v1 lmd_lmm; /* LOV EA V1 user data */
+} __attribute__((packed));
+
+struct lov_user_mds_data_v3 {
+ lstat_t lmd_st; /* MDS stat struct */
+ struct lov_user_md_v3 lmd_lmm; /* LOV EA V3 user data */
+} __attribute__((packed));
+#endif
+
+/* keep this to be the same size as lov_user_ost_data_v1 */
+struct lmv_user_mds_data {
+ struct lu_fid lum_fid;
+ __u32 lum_padding;
+ __u32 lum_mds;
+};
+
+/* lum_type */
+enum {
+ LMV_STRIPE_TYPE = 0,
+ LMV_DEFAULT_TYPE = 1,
+};
+
+#define lmv_user_md lmv_user_md_v1
+struct lmv_user_md_v1 {
+ __u32 lum_magic; /* must be the first field */
+ __u32 lum_stripe_count; /* dirstripe count */
+ __u32 lum_stripe_offset; /* MDT idx for default dirstripe */
+ __u32 lum_hash_type; /* Dir stripe policy */
+ __u32 lum_type; /* LMV type: default or normal */
+ __u32 lum_padding1;
+ __u32 lum_padding2;
+ __u32 lum_padding3;
+ char lum_pool_name[LOV_MAXPOOLNAME];
+ struct lmv_user_mds_data lum_objects[0];
+};
+
+static inline int lmv_user_md_size(int stripes, int lmm_magic)
+{
+ return sizeof(struct lmv_user_md) +
+ stripes * sizeof(struct lmv_user_mds_data);
+}
+
+extern void lustre_swab_lmv_user_md(struct lmv_user_md *lum);
+
+struct ll_recreate_obj {
+ __u64 lrc_id;
+ __u32 lrc_ost_idx;
+};
+
+struct ll_fid {
+ __u64 id; /* holds object id */
+ __u32 generation; /* holds object generation */
+ __u32 f_type; /* holds object type or stripe idx when passing it to
+ * OST for saving into EA. */
+};
+
+#define UUID_MAX 40
+struct obd_uuid {
+ char uuid[UUID_MAX];
+};
+
+static inline int obd_uuid_equals(const struct obd_uuid *u1,
+ const struct obd_uuid *u2)
+{
+ return strcmp((char *)u1->uuid, (char *)u2->uuid) == 0;
+}
+
+static inline int obd_uuid_empty(struct obd_uuid *uuid)
+{
+ return uuid->uuid[0] == '\0';
+}
+
+static inline void obd_str2uuid(struct obd_uuid *uuid, const char *tmp)
+{
+ strncpy((char *)uuid->uuid, tmp, sizeof(*uuid));
+ uuid->uuid[sizeof(*uuid) - 1] = '\0';
+}
+
+/* For printf's only, make sure uuid is terminated */
+static inline char *obd_uuid2str(struct obd_uuid *uuid)
+{
+ if (uuid->uuid[sizeof(*uuid) - 1] != '\0') {
+ /* Obviously not safe, but for printfs, no real harm done...
+ we're always null-terminated, even in a race. */
+ static char temp[sizeof(*uuid)];
+ memcpy(temp, uuid->uuid, sizeof(*uuid) - 1);
+ temp[sizeof(*uuid) - 1] = '\0';
+ return temp;
+ }
+ return (char *)(uuid->uuid);
+}
+
+/* Extract fsname from uuid (or target name) of a target
+ e.g. (myfs-OST0007_UUID -> myfs)
+ see also deuuidify. */
+static inline void obd_uuid2fsname(char *buf, char *uuid, int buflen)
+{
+ char *p;
+
+ strncpy(buf, uuid, buflen - 1);
+ buf[buflen - 1] = '\0';
+ p = strrchr(buf, '-');
+ if (p)
+ *p = '\0';
+}
+
+/* printf display format
+ e.g. printf("file FID is "DFID"\n", PFID(fid)); */
+#define DFID_NOBRACE LPX64":0x%x:0x%x"
+#define DFID "["DFID_NOBRACE"]"
+#define PFID(fid) \
+ (fid)->f_seq, \
+ (fid)->f_oid, \
+ (fid)->f_ver
+
+/* scanf input parse format -- strip '[' first.
+ e.g. sscanf(fidstr, SFID, RFID(&fid)); */
+/* #define SFID "0x"LPX64i":0x"LPSZX":0x"LPSZX""
+liblustreapi.c:2893: warning: format '%lx' expects type 'long unsigned int *', but argument 4 has type 'unsigned int *'
+liblustreapi.c:2893: warning: format '%lx' expects type 'long unsigned int *', but argument 5 has type 'unsigned int *'
+*/
+#define SFID "0x"LPX64i":0x%x:0x%x"
+#define RFID(fid) \
+ &((fid)->f_seq), \
+ &((fid)->f_oid), \
+ &((fid)->f_ver)
+
+
+/********* Quotas **********/
+
+/* these must be explicitly translated into linux Q_* in ll_dir_ioctl */
+#define LUSTRE_Q_QUOTAON 0x800002 /* turn quotas on */
+#define LUSTRE_Q_QUOTAOFF 0x800003 /* turn quotas off */
+#define LUSTRE_Q_GETINFO 0x800005 /* get information about quota files */
+#define LUSTRE_Q_SETINFO 0x800006 /* set information about quota files */
+#define LUSTRE_Q_GETQUOTA 0x800007 /* get user quota structure */
+#define LUSTRE_Q_SETQUOTA 0x800008 /* set user quota structure */
+/* lustre-specific control commands */
+#define LUSTRE_Q_INVALIDATE 0x80000b /* invalidate quota data */
+#define LUSTRE_Q_FINVALIDATE 0x80000c /* invalidate filter quota data */
+
+#define UGQUOTA 2 /* set both USRQUOTA and GRPQUOTA */
+
+struct if_quotacheck {
+ char obd_type[16];
+ struct obd_uuid obd_uuid;
+};
+
+#define IDENTITY_DOWNCALL_MAGIC 0x6d6dd629
+
+/* permission */
+#define N_PERMS_MAX 64
+
+struct perm_downcall_data {
+ __u64 pdd_nid;
+ __u32 pdd_perm;
+ __u32 pdd_padding;
+};
+
+struct identity_downcall_data {
+ __u32 idd_magic;
+ __u32 idd_err;
+ __u32 idd_uid;
+ __u32 idd_gid;
+ __u32 idd_nperms;
+ __u32 idd_ngroups;
+ struct perm_downcall_data idd_perms[N_PERMS_MAX];
+ __u32 idd_groups[0];
+};
+
+/* for non-mapped uid/gid */
+#define NOBODY_UID 99
+#define NOBODY_GID 99
+
+#define INVALID_ID (-1)
+
+enum {
+ RMT_LSETFACL = 1,
+ RMT_LGETFACL = 2,
+ RMT_RSETFACL = 3,
+ RMT_RGETFACL = 4
+};
+
+#ifdef NEED_QUOTA_DEFS
+#ifndef QIF_BLIMITS
+#define QIF_BLIMITS 1
+#define QIF_SPACE 2
+#define QIF_ILIMITS 4
+#define QIF_INODES 8
+#define QIF_BTIME 16
+#define QIF_ITIME 32
+#define QIF_LIMITS (QIF_BLIMITS | QIF_ILIMITS)
+#define QIF_USAGE (QIF_SPACE | QIF_INODES)
+#define QIF_TIMES (QIF_BTIME | QIF_ITIME)
+#define QIF_ALL (QIF_LIMITS | QIF_USAGE | QIF_TIMES)
+#endif
+
+#endif /* NEED_QUOTA_DEFS */
+
+/* lustre volatile file support
+ * file name header: ".^L^S^T^R:VOLATILE"
+ */
+#define LUSTRE_VOLATILE_HDR ".\x0c\x13\x14\x12:VOLATILE"
+#define LUSTRE_VOLATILE_HDR_LEN 14
+/* hdr + MDT index */
+#define LUSTRE_VOLATILE_IDX LUSTRE_VOLATILE_HDR":%.4X:"
+
+typedef enum lustre_quota_version {
+ LUSTRE_QUOTA_V2 = 1
+} lustre_quota_version_t;
+
+/* XXX: same as if_dqinfo struct in kernel */
+struct obd_dqinfo {
+ __u64 dqi_bgrace;
+ __u64 dqi_igrace;
+ __u32 dqi_flags;
+ __u32 dqi_valid;
+};
+
+/* XXX: same as if_dqblk struct in kernel, plus one padding */
+struct obd_dqblk {
+ __u64 dqb_bhardlimit;
+ __u64 dqb_bsoftlimit;
+ __u64 dqb_curspace;
+ __u64 dqb_ihardlimit;
+ __u64 dqb_isoftlimit;
+ __u64 dqb_curinodes;
+ __u64 dqb_btime;
+ __u64 dqb_itime;
+ __u32 dqb_valid;
+ __u32 dqb_padding;
+};
+
+enum {
+ QC_GENERAL = 0,
+ QC_MDTIDX = 1,
+ QC_OSTIDX = 2,
+ QC_UUID = 3
+};
+
+struct if_quotactl {
+ __u32 qc_cmd;
+ __u32 qc_type;
+ __u32 qc_id;
+ __u32 qc_stat;
+ __u32 qc_valid;
+ __u32 qc_idx;
+ struct obd_dqinfo qc_dqinfo;
+ struct obd_dqblk qc_dqblk;
+ char obd_type[16];
+ struct obd_uuid obd_uuid;
+};
+
+/* swap layout flags */
+#define SWAP_LAYOUTS_CHECK_DV1 (1 << 0)
+#define SWAP_LAYOUTS_CHECK_DV2 (1 << 1)
+#define SWAP_LAYOUTS_KEEP_MTIME (1 << 2)
+#define SWAP_LAYOUTS_KEEP_ATIME (1 << 3)
+struct lustre_swap_layouts {
+ __u64 sl_flags;
+ __u32 sl_fd;
+ __u32 sl_gid;
+ __u64 sl_dv1;
+ __u64 sl_dv2;
+};
+
+
+/********* Changelogs **********/
+/** Changelog record types */
+enum changelog_rec_type {
+ CL_MARK = 0,
+ CL_CREATE = 1, /* namespace */
+ CL_MKDIR = 2, /* namespace */
+ CL_HARDLINK = 3, /* namespace */
+ CL_SOFTLINK = 4, /* namespace */
+ CL_MKNOD = 5, /* namespace */
+ CL_UNLINK = 6, /* namespace */
+ CL_RMDIR = 7, /* namespace */
+ CL_RENAME = 8, /* namespace */
+ CL_EXT = 9, /* namespace extended record (2nd half of rename) */
+ CL_OPEN = 10, /* not currently used */
+ CL_CLOSE = 11, /* may be written to log only with mtime change */
+ CL_LAYOUT = 12, /* file layout/striping modified */
+ CL_TRUNC = 13,
+ CL_SETATTR = 14,
+ CL_XATTR = 15,
+ CL_HSM = 16, /* HSM specific events, see flags */
+ CL_MTIME = 17, /* Precedence: setattr > mtime > ctime > atime */
+ CL_CTIME = 18,
+ CL_ATIME = 19,
+ CL_LAST
+};
+
+static inline const char *changelog_type2str(int type) {
+ static const char *changelog_str[] = {
+ "MARK", "CREAT", "MKDIR", "HLINK", "SLINK", "MKNOD", "UNLNK",
+ "RMDIR", "RENME", "RNMTO", "OPEN", "CLOSE", "LYOUT", "TRUNC",
+ "SATTR", "XATTR", "HSM", "MTIME", "CTIME", "ATIME",
+ };
+
+ if (type >= 0 && type < CL_LAST)
+ return changelog_str[type];
+ return NULL;
+}
+
+/* per-record flags */
+#define CLF_VERSION 0x1000
+#define CLF_EXT_VERSION 0x2000
+#define CLF_FLAGSHIFT 12
+#define CLF_FLAGMASK ((1U << CLF_FLAGSHIFT) - 1)
+#define CLF_VERMASK (~CLF_FLAGMASK)
+/* Anything under the flagmask may be per-type (if desired) */
+/* Flags for unlink */
+#define CLF_UNLINK_LAST 0x0001 /* Unlink of last hardlink */
+#define CLF_UNLINK_HSM_EXISTS 0x0002 /* File has something in HSM */
+ /* HSM cleaning needed */
+/* Flags for rename */
+#define CLF_RENAME_LAST 0x0001 /* rename unlink last hardlink of target */
+
+/* Flags for HSM */
+/* 12b used (from high weight to low weight):
+ * 2b for flags
+ * 3b for event
+ * 7b for error code
+ */
+#define CLF_HSM_ERR_L 0 /* HSM return code, 7 bits */
+#define CLF_HSM_ERR_H 6
+#define CLF_HSM_EVENT_L 7 /* HSM event, 3 bits, see enum hsm_event */
+#define CLF_HSM_EVENT_H 9
+#define CLF_HSM_FLAG_L 10 /* HSM flags, 2 bits, 1 used, 1 spare */
+#define CLF_HSM_FLAG_H 11
+#define CLF_HSM_SPARE_L 12 /* 4 spare bits */
+#define CLF_HSM_SPARE_H 15
+#define CLF_HSM_LAST 15
+
+/* Remove bits higher than _h, then extract the value
+ * between _h and _l by shifting lower weight to bit 0. */
+#define CLF_GET_BITS(_b, _h, _l) (((_b << (CLF_HSM_LAST - _h)) & 0xFFFF) \
+ >> (CLF_HSM_LAST - _h + _l))
+
+#define CLF_HSM_SUCCESS 0x00
+#define CLF_HSM_MAXERROR 0x7E
+#define CLF_HSM_ERROVERFLOW 0x7F
+
+#define CLF_HSM_DIRTY 1 /* file is dirty after HSM request end */
+
+/* 3 bits field => 8 values allowed */
+enum hsm_event {
+ HE_ARCHIVE = 0,
+ HE_RESTORE = 1,
+ HE_CANCEL = 2,
+ HE_RELEASE = 3,
+ HE_REMOVE = 4,
+ HE_STATE = 5,
+ HE_SPARE1 = 6,
+ HE_SPARE2 = 7,
+};
+
+static inline enum hsm_event hsm_get_cl_event(__u16 flags)
+{
+ return CLF_GET_BITS(flags, CLF_HSM_EVENT_H, CLF_HSM_EVENT_L);
+}
+
+static inline void hsm_set_cl_event(int *flags, enum hsm_event he)
+{
+ *flags |= (he << CLF_HSM_EVENT_L);
+}
+
+static inline __u16 hsm_get_cl_flags(int flags)
+{
+ return CLF_GET_BITS(flags, CLF_HSM_FLAG_H, CLF_HSM_FLAG_L);
+}
+
+static inline void hsm_set_cl_flags(int *flags, int bits)
+{
+ *flags |= (bits << CLF_HSM_FLAG_L);
+}
+
+static inline int hsm_get_cl_error(int flags)
+{
+ return CLF_GET_BITS(flags, CLF_HSM_ERR_H, CLF_HSM_ERR_L);
+}
+
+static inline void hsm_set_cl_error(int *flags, int error)
+{
+ *flags |= (error << CLF_HSM_ERR_L);
+}
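+
+/*
+ * Illustrative sketch: decoding the HSM sub-fields packed into a CL_HSM
+ * changelog record's flags with the accessors above ("rec" is a
+ * hypothetical struct changelog_rec pointer):
+ *
+ *	enum hsm_event ev = hsm_get_cl_event(rec->cr_flags);
+ *	int err = hsm_get_cl_error(rec->cr_flags);
+ *	__u16 hflags = hsm_get_cl_flags(rec->cr_flags);
+ */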
+
+#define CR_MAXSIZE cfs_size_round(2*NAME_MAX + 1 + sizeof(struct changelog_rec))
+
+struct changelog_rec {
+ __u16 cr_namelen;
+ __u16 cr_flags; /**< (flags&CLF_FLAGMASK)|CLF_VERSION */
+ __u32 cr_type; /**< \a changelog_rec_type */
+ __u64 cr_index; /**< changelog record number */
+ __u64 cr_prev; /**< last index for this target fid */
+ __u64 cr_time;
+ union {
+ lustre_fid cr_tfid; /**< target fid */
+ __u32 cr_markerflags; /**< CL_MARK flags */
+ };
+ lustre_fid cr_pfid; /**< parent fid */
+ char cr_name[0]; /**< last element */
+} __attribute__((packed));
+
+/* changelog_ext_rec is 2*sizeof(lu_fid) bigger than changelog_rec. To save
+ * space, only rename uses changelog_ext_rec, while others use changelog_rec
+ * to store records.
+ */
+struct changelog_ext_rec {
+ __u16 cr_namelen;
+ __u16 cr_flags; /**< (flags & CLF_FLAGMASK) |
+ CLF_EXT_VERSION */
+ __u32 cr_type; /**< \a changelog_rec_type */
+ __u64 cr_index; /**< changelog record number */
+ __u64 cr_prev; /**< last index for this target fid */
+ __u64 cr_time;
+ union {
+ lustre_fid cr_tfid; /**< target fid */
+ __u32 cr_markerflags; /**< CL_MARK flags */
+ };
+ lustre_fid cr_pfid; /**< target parent fid */
+ lustre_fid cr_sfid; /**< source fid, or zero */
+ lustre_fid cr_spfid; /**< source parent fid, or zero */
+ char cr_name[0]; /**< last element */
+} __attribute__((packed));
+
+#define CHANGELOG_REC_EXTENDED(rec) \
+ (((rec)->cr_flags & CLF_VERMASK) == CLF_EXT_VERSION)
+
+static inline int changelog_rec_size(struct changelog_rec *rec)
+{
+	return CHANGELOG_REC_EXTENDED(rec) ? sizeof(struct changelog_ext_rec) :
+					     sizeof(*rec);
+}
+
+static inline char *changelog_rec_name(struct changelog_rec *rec)
+{
+ return CHANGELOG_REC_EXTENDED(rec) ?
+	       ((struct changelog_ext_rec *)rec)->cr_name : rec->cr_name;
+}
+
+static inline int changelog_rec_snamelen(struct changelog_ext_rec *rec)
+{
+ return rec->cr_namelen - strlen(rec->cr_name) - 1;
+}
+
+static inline char *changelog_rec_sname(struct changelog_ext_rec *rec)
+{
+ return rec->cr_name + strlen(rec->cr_name) + 1;
+}
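+
+/*
+ * Illustrative sketch: in an extended record the two names share the
+ * cr_name buffer, separated by a NUL, and the helpers above split them
+ * ("ext" is a hypothetical struct changelog_ext_rec pointer):
+ *
+ *	printf("name=%s sname=%s\n", ext->cr_name, changelog_rec_sname(ext));
+ */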
+
+struct ioc_changelog {
+ __u64 icc_recno;
+ __u32 icc_mdtindex;
+ __u32 icc_id;
+ __u32 icc_flags;
+};
+
+enum changelog_message_type {
+ CL_RECORD = 10, /* message is a changelog_rec */
+ CL_EOF = 11, /* at end of current changelog */
+};
+
+/********* Misc **********/
+
+struct ioc_data_version {
+ __u64 idv_version;
+ __u64 idv_flags; /* See LL_DV_xxx */
+};
+#define LL_DV_NOFLUSH 0x01 /* Do not take READ EXTENT LOCK before sampling
+ version. Dirty caches are left unchanged. */
+
+#ifndef offsetof
+# define offsetof(typ,memb) ((unsigned long)((char *)&(((typ *)0)->memb)))
+#endif
+
+#define dot_lustre_name ".lustre"
+
+
+/********* HSM **********/
+
+/** HSM per-file state
+ * See HSM_FLAGS below.
+ */
+enum hsm_states {
+ HS_EXISTS = 0x00000001,
+ HS_DIRTY = 0x00000002,
+ HS_RELEASED = 0x00000004,
+ HS_ARCHIVED = 0x00000008,
+ HS_NORELEASE = 0x00000010,
+ HS_NOARCHIVE = 0x00000020,
+ HS_LOST = 0x00000040,
+};
+
+/* HSM user-setable flags. */
+#define HSM_USER_MASK (HS_NORELEASE | HS_NOARCHIVE | HS_DIRTY)
+
+/* Other HSM flags. */
+#define HSM_STATUS_MASK (HS_EXISTS | HS_LOST | HS_RELEASED | HS_ARCHIVED)
+
+/*
+ * All HSM-related possible flags that could be applied to a file.
+ * This should be kept in sync with hsm_states.
+ */
+#define HSM_FLAGS_MASK (HSM_USER_MASK | HSM_STATUS_MASK)
+
+/**
+ * HSM request progress state
+ */
+enum hsm_progress_states {
+ HPS_WAITING = 1,
+ HPS_RUNNING = 2,
+ HPS_DONE = 3,
+};
+#define HPS_NONE 0
+
+static inline char *hsm_progress_state2name(enum hsm_progress_states s)
+{
+ switch (s) {
+ case HPS_WAITING: return "waiting";
+ case HPS_RUNNING: return "running";
+ case HPS_DONE: return "done";
+ default: return "unknown";
+ }
+}
+
+struct hsm_extent {
+ __u64 offset;
+ __u64 length;
+} __attribute__((packed));
+
+/**
+ * Current HSM states of a Lustre file.
+ *
+ * This structure's main purpose is to be sent to user space. It describes
+ * the current HSM flags and the in-progress action.
+ */
+struct hsm_user_state {
+ /** Current HSM states, from enum hsm_states. */
+ __u32 hus_states;
+ __u32 hus_archive_id;
+ /** The current undergoing action, if there is one */
+ __u32 hus_in_progress_state;
+ __u32 hus_in_progress_action;
+ struct hsm_extent hus_in_progress_location;
+ char hus_extended_info[];
+};
+
+struct hsm_state_set_ioc {
+ struct lu_fid hssi_fid;
+ __u64 hssi_setmask;
+ __u64 hssi_clearmask;
+};
+
+/*
+ * This structure describes the current in-progress action for a file.
+ * It is returned to user space and sent over the wire.
+ */
+struct hsm_current_action {
+ /** The current undergoing action, if there is one */
+ /* state is one of hsm_progress_states */
+ __u32 hca_state;
+ /* action is one of hsm_user_action */
+ __u32 hca_action;
+ struct hsm_extent hca_location;
+};
+
+/***** HSM user requests ******/
+/* User-generated (lfs/ioctl) request types */
+enum hsm_user_action {
+ HUA_NONE = 1, /* no action (noop) */
+ HUA_ARCHIVE = 10, /* copy to hsm */
+ HUA_RESTORE = 11, /* prestage */
+ HUA_RELEASE = 12, /* drop ost objects */
+ HUA_REMOVE = 13, /* remove from archive */
+ HUA_CANCEL = 14 /* cancel a request */
+};
+
+static inline char *hsm_user_action2name(enum hsm_user_action a)
+{
+ switch (a) {
+ case HUA_NONE: return "NOOP";
+ case HUA_ARCHIVE: return "ARCHIVE";
+ case HUA_RESTORE: return "RESTORE";
+ case HUA_RELEASE: return "RELEASE";
+ case HUA_REMOVE: return "REMOVE";
+ case HUA_CANCEL: return "CANCEL";
+ default: return "UNKNOWN";
+ }
+}
+
+/*
+ * List of hr_flags (bit field)
+ */
+#define HSM_FORCE_ACTION 0x0001
+/* used by CT, cannot be set by user */
+#define HSM_GHOST_COPY 0x0002
+
+/**
+ * Contains the fixed part of struct hsm_user_request.
+ */
+struct hsm_request {
+ __u32 hr_action; /* enum hsm_user_action */
+ __u32 hr_archive_id; /* archive id, used only with HUA_ARCHIVE */
+ __u64 hr_flags; /* request flags */
+ __u32 hr_itemcount; /* item count in hur_user_item vector */
+ __u32 hr_data_len;
+};
+
+struct hsm_user_item {
+ lustre_fid hui_fid;
+ struct hsm_extent hui_extent;
+} __attribute__((packed));
+
+struct hsm_user_request {
+ struct hsm_request hur_request;
+ struct hsm_user_item hur_user_item[0];
+ /* extra data blob at end of struct (after all
+ * hur_user_items), only use helpers to access it
+ */
+} __attribute__((packed));
+
+/** Return pointer to data field in a hsm user request */
+static inline void *hur_data(struct hsm_user_request *hur)
+{
+ return &(hur->hur_user_item[hur->hur_request.hr_itemcount]);
+}
+
+/** Compute the current length of the provided hsm_user_request. */
+static inline int hur_len(struct hsm_user_request *hur)
+{
+ return offsetof(struct hsm_user_request,
+ hur_user_item[hur->hur_request.hr_itemcount]) +
+ hur->hur_request.hr_data_len;
+}
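+
+/*
+ * Illustrative sketch: sizing and filling a one-item archive request by
+ * hand (real callers should prefer llapi_hsm_user_request_alloc() from
+ * lustreapi.h; "fid" is a hypothetical lustre_fid):
+ *
+ *	struct hsm_user_request *hur;
+ *
+ *	hur = calloc(1, offsetof(struct hsm_user_request, hur_user_item[1]));
+ *	hur->hur_request.hr_action = HUA_ARCHIVE;
+ *	hur->hur_request.hr_archive_id = 1;
+ *	hur->hur_request.hr_itemcount = 1;
+ *	hur->hur_user_item[0].hui_fid = fid;
+ *	hur->hur_user_item[0].hui_extent.length = -1ULL;
+ */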
+
+/****** HSM RPCs to copytool *****/
+/* Message types the copytool may receive */
+enum hsm_message_type {
+ HMT_ACTION_LIST = 100, /* message is a hsm_action_list */
+};
+
+/* Actions the copytool may be instructed to take for a given action_item */
+enum hsm_copytool_action {
+ HSMA_NONE = 10, /* no action */
+ HSMA_ARCHIVE = 20, /* arbitrary offset */
+ HSMA_RESTORE = 21,
+ HSMA_REMOVE = 22,
+ HSMA_CANCEL = 23
+};
+
+static inline char *hsm_copytool_action2name(enum hsm_copytool_action a)
+{
+ switch (a) {
+ case HSMA_NONE: return "NOOP";
+ case HSMA_ARCHIVE: return "ARCHIVE";
+ case HSMA_RESTORE: return "RESTORE";
+ case HSMA_REMOVE: return "REMOVE";
+ case HSMA_CANCEL: return "CANCEL";
+ default: return "UNKNOWN";
+ }
+}
+
+/* Copytool item action description */
+struct hsm_action_item {
+ __u32 hai_len; /* valid size of this struct */
+ __u32 hai_action; /* hsm_copytool_action, but use known size */
+	lustre_fid hai_fid; /* Lustre FID to operate on */
+ lustre_fid hai_dfid; /* fid used for data access */
+ struct hsm_extent hai_extent; /* byte range to operate on */
+ __u64 hai_cookie; /* action cookie from coordinator */
+ __u64 hai_gid; /* grouplock id */
+ char hai_data[0]; /* variable length */
+} __attribute__((packed));
+
+/*
+ * Helper function which prints, in hex, the first bytes of the
+ * hai opaque data field.
+ * \param hai [IN] record to print
+ * \param buffer [OUT] output buffer
+ * \param len [IN] max buffer len
+ * \retval buffer
+ */
+static inline char *hai_dump_data_field(struct hsm_action_item *hai,
+ char *buffer, int len)
+{
+ int i, sz, data_len;
+ char *ptr;
+
+ ptr = buffer;
+ sz = len;
+ data_len = hai->hai_len - sizeof(*hai);
+	for (i = 0; (i < data_len) && (sz > 0); i++) {
+ int cnt;
+
+ cnt = snprintf(ptr, sz, "%.2X",
+ (unsigned char)hai->hai_data[i]);
+ ptr += cnt;
+ sz -= cnt;
+ }
+ *ptr = '\0';
+ return buffer;
+}
+
+/* Copytool action list */
+#define HAL_VERSION 1
+#define HAL_MAXSIZE LNET_MTU /* bytes, used in userspace only */
+struct hsm_action_list {
+ __u32 hal_version;
+ __u32 hal_count; /* number of hai's to follow */
+ __u64 hal_compound_id; /* returned by coordinator */
+ __u64 hal_flags;
+ __u32 hal_archive_id; /* which archive backend */
+ __u32 padding1;
+ char hal_fsname[0]; /* null-terminated */
+ /* struct hsm_action_item[hal_count] follows, aligned on 8-byte
+ boundaries. See hai_zero */
+} __attribute__((packed));
+
+#ifndef HAVE_CFS_SIZE_ROUND
+static inline int cfs_size_round(int val)
+{
+ return (val + 7) & (~0x7);
+}
+#define HAVE_CFS_SIZE_ROUND
+#endif
+
+/* Return pointer to first hai in action list */
+static inline struct hsm_action_item *hai_zero(struct hsm_action_list *hal)
+{
+	return (struct hsm_action_item *)(hal->hal_fsname +
+					  cfs_size_round(strlen(hal->hal_fsname)));
+}
+/* Return pointer to next hai */
+static inline struct hsm_action_item *hai_next(struct hsm_action_item *hai)
+{
+ return (struct hsm_action_item *)((char *)hai +
+ cfs_size_round(hai->hai_len));
+}
+
+/* Return size of an hsm_action_list */
+static inline int hal_size(struct hsm_action_list *hal)
+{
+ int i, sz;
+ struct hsm_action_item *hai;
+
+ sz = sizeof(*hal) + cfs_size_round(strlen(hal->hal_fsname));
+ hai = hai_zero(hal);
+ for (i = 0 ; i < hal->hal_count ; i++) {
+ sz += cfs_size_round(hai->hai_len);
+ hai = hai_next(hai);
+ }
+	return sz;
+}
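+
+/*
+ * Illustrative sketch: walking all action items in a received list with
+ * the iterators above ("hal" is assumed to be a valid hsm_action_list):
+ *
+ *	struct hsm_action_item *hai = hai_zero(hal);
+ *	int i;
+ *
+ *	for (i = 0; i < hal->hal_count; i++, hai = hai_next(hai))
+ *		printf("%s on "DFID"\n",
+ *		       hsm_copytool_action2name(hai->hai_action),
+ *		       PFID(&hai->hai_fid));
+ */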
+
+/* Copytool progress reporting */
+#define HP_FLAG_COMPLETED 0x01
+#define HP_FLAG_RETRY 0x02
+
+struct hsm_progress {
+ lustre_fid hp_fid;
+ __u64 hp_cookie;
+ struct hsm_extent hp_extent;
+ __u16 hp_flags;
+ __u16 hp_errval; /* positive val */
+ __u32 padding;
+};
+
+/**
+ * Used by the copytool during any HSM request it handles.
+ * This structure is initialized by llapi_hsm_copy_start(),
+ * which is a helper over the ioctl() interface.
+ * It stores Lustre-internal data, for internal use only.
+ */
+struct hsm_copy {
+ __u64 hc_data_version;
+ __u16 hc_flags;
+ __u16 hc_errval; /* positive val */
+ __u32 padding;
+ struct hsm_action_item hc_hai;
+};
+
+/** @} lustreuser */
+
+#endif /* _LUSTRE_USER_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustreapi.h b/drivers/staging/lustre/lustre/include/lustre/lustreapi.h
new file mode 100644
index 000000000000..63da66506639
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/lustreapi.h
@@ -0,0 +1,310 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Whamcloud, Inc.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTREAPI_H_
+#define _LUSTREAPI_H_
+
+/** \defgroup llapi llapi
+ *
+ * @{
+ */
+
+#include <lustre/lustre_user.h>
+
+typedef void (*llapi_cb_t)(char *obd_type_name, char *obd_name, char *obd_uuid, void *args);
+
+/* lustreapi message severity level */
+enum llapi_message_level {
+ LLAPI_MSG_OFF = 0,
+ LLAPI_MSG_FATAL = 1,
+ LLAPI_MSG_ERROR = 2,
+ LLAPI_MSG_WARN = 3,
+ LLAPI_MSG_NORMAL = 4,
+ LLAPI_MSG_INFO = 5,
+ LLAPI_MSG_DEBUG = 6,
+ LLAPI_MSG_MAX
+};
+
+/* the bottom three bits reserved for llapi_message_level */
+#define LLAPI_MSG_MASK 0x00000007
+#define LLAPI_MSG_NO_ERRNO 0x00000010
+
+extern void llapi_msg_set_level(int level);
+extern void llapi_error(int level, int rc, char *fmt, ...);
+#define llapi_err_noerrno(level, fmt, a...) \
+ llapi_error((level) | LLAPI_MSG_NO_ERRNO, 0, fmt, ## a)
+extern void llapi_printf(int level, char *fmt, ...);
+extern int llapi_file_create(const char *name, unsigned long long stripe_size,
+ int stripe_offset, int stripe_count,
+ int stripe_pattern);
+extern int llapi_file_open(const char *name, int flags, int mode,
+ unsigned long long stripe_size, int stripe_offset,
+ int stripe_count, int stripe_pattern);
+extern int llapi_file_create_pool(const char *name,
+ unsigned long long stripe_size,
+ int stripe_offset, int stripe_count,
+ int stripe_pattern, char *pool_name);
+extern int llapi_file_open_pool(const char *name, int flags, int mode,
+ unsigned long long stripe_size,
+ int stripe_offset, int stripe_count,
+ int stripe_pattern, char *pool_name);
+extern int llapi_poollist(const char *name);
+extern int llapi_get_poollist(const char *name, char **poollist, int list_size,
+ char *buffer, int buffer_size);
+extern int llapi_get_poolmembers(const char *poolname, char **members,
+ int list_size, char *buffer, int buffer_size);
+extern int llapi_file_get_stripe(const char *path, struct lov_user_md *lum);
+#define HAVE_LLAPI_FILE_LOOKUP
+extern int llapi_file_lookup(int dirfd, const char *name);
+
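+/*
+ * Illustrative sketch: creating a file with an explicit layout (the path
+ * and values below are examples only; llapi_file_create() returns 0 or a
+ * negative errno):
+ *
+ *	int rc = llapi_file_create("/mnt/lustre/file", 1 << 20, -1, 4,
+ *				   LOV_PATTERN_RAID0);
+ *
+ *	if (rc < 0)
+ *		fprintf(stderr, "create: %s\n", strerror(-rc));
+ */
+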
+#define VERBOSE_COUNT 0x1
+#define VERBOSE_SIZE 0x2
+#define VERBOSE_OFFSET 0x4
+#define VERBOSE_POOL 0x8
+#define VERBOSE_DETAIL 0x10
+#define VERBOSE_OBJID 0x20
+#define VERBOSE_GENERATION 0x40
+#define VERBOSE_MDTINDEX 0x80
+#define VERBOSE_ALL (VERBOSE_COUNT | VERBOSE_SIZE | VERBOSE_OFFSET | \
+ VERBOSE_POOL | VERBOSE_OBJID | VERBOSE_GENERATION)
+
+struct find_param {
+ unsigned int maxdepth;
+ time_t atime;
+ time_t mtime;
+ time_t ctime;
+ int asign; /* cannot be bitfields due to using pointers to */
+ int csign; /* access them during argument parsing. */
+ int msign;
+ int type;
+ int size_sign:2, /* these need to be signed values */
+ stripesize_sign:2,
+ stripecount_sign:2;
+ unsigned long long size;
+ unsigned long long size_units;
+ uid_t uid;
+ gid_t gid;
+
+ unsigned long zeroend:1,
+ recursive:1,
+ exclude_pattern:1,
+ exclude_type:1,
+ exclude_obd:1,
+ exclude_mdt:1,
+ exclude_gid:1,
+ exclude_uid:1,
+ check_gid:1, /* group ID */
+ check_uid:1, /* user ID */
+ check_pool:1, /* LOV pool name */
+ check_size:1, /* file size */
+ exclude_pool:1,
+ exclude_size:1,
+ exclude_atime:1,
+ exclude_mtime:1,
+ exclude_ctime:1,
+ get_lmv:1, /* get MDT list from LMV */
+ raw:1, /* do not fill in defaults */
+ check_stripesize:1, /* LOV stripe size */
+ exclude_stripesize:1,
+ check_stripecount:1, /* LOV stripe count */
+ exclude_stripecount:1;
+
+ int verbose;
+ int quiet;
+
+ /* regular expression */
+ char *pattern;
+
+ char *print_fmt;
+
+ struct obd_uuid *obduuid;
+ int num_obds;
+ int num_alloc_obds;
+ int obdindex;
+ int *obdindexes;
+
+ struct obd_uuid *mdtuuid;
+ int num_mdts;
+ int num_alloc_mdts;
+ int mdtindex;
+ int *mdtindexes;
+ int file_mdtindex;
+
+ int lumlen;
+ struct lov_user_mds_data *lmd;
+
+ char poolname[LOV_MAXPOOLNAME + 1];
+
+ int fp_lmv_count;
+ struct lmv_user_md *fp_lmv_md;
+
+ unsigned long long stripesize;
+ unsigned long long stripesize_units;
+ unsigned long long stripecount;
+
+ /* In-process parameters. */
+ unsigned long got_uuids:1,
+ obds_printed:1,
+ have_fileinfo:1; /* file attrs and LOV xattr */
+ unsigned int depth;
+ dev_t st_dev;
+};
+
+extern int llapi_ostlist(char *path, struct find_param *param);
+extern int llapi_uuid_match(char *real_uuid, char *search_uuid);
+extern int llapi_getstripe(char *path, struct find_param *param);
+extern int llapi_find(char *path, struct find_param *param);
+
+extern int llapi_file_fget_mdtidx(int fd, int *mdtidx);
+extern int llapi_dir_create_pool(const char *name, int flags, int stripe_offset,
+ int stripe_count, int stripe_pattern,
+ char *poolname);
+int llapi_direntry_remove(char *dname);
+extern int llapi_obd_statfs(char *path, __u32 type, __u32 index,
+ struct obd_statfs *stat_buf,
+ struct obd_uuid *uuid_buf);
+extern int llapi_ping(char *obd_type, char *obd_name);
+extern int llapi_target_check(int num_types, char **obd_types, char *dir);
+extern int llapi_file_get_lov_uuid(const char *path, struct obd_uuid *lov_uuid);
+extern int llapi_file_get_lmv_uuid(const char *path, struct obd_uuid *lmv_uuid);
+extern int llapi_file_fget_lov_uuid(int fd, struct obd_uuid *lov_uuid);
+extern int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count);
+extern int llapi_lmv_get_uuids(int fd, struct obd_uuid *uuidp, int *mdt_count);
+extern int llapi_is_lustre_mnttype(const char *type);
+extern int llapi_search_ost(char *fsname, char *poolname, char *ostname);
+extern int llapi_get_obd_count(char *mnt, int *count, int is_mdt);
+extern int parse_size(char *optarg, unsigned long long *size,
+ unsigned long long *size_units, int bytes_spec);
+extern int llapi_search_mounts(const char *pathname, int index,
+ char *mntdir, char *fsname);
+extern int llapi_search_fsname(const char *pathname, char *fsname);
+extern int llapi_getname(const char *path, char *buf, size_t size);
+
+extern void llapi_ping_target(char *obd_type, char *obd_name,
+ char *obd_uuid, void *args);
+
+extern int llapi_search_rootpath(char *pathname, const char *fsname);
+
+struct mntent;
+#define HAVE_LLAPI_IS_LUSTRE_MNT
+extern int llapi_is_lustre_mnt(struct mntent *mnt);
+extern int llapi_quotachown(char *path, int flag);
+extern int llapi_quotacheck(char *mnt, int check_type);
+extern int llapi_poll_quotacheck(char *mnt, struct if_quotacheck *qchk);
+extern int llapi_quotactl(char *mnt, struct if_quotactl *qctl);
+extern int llapi_target_iterate(int type_num, char **obd_type, void *args,
+ llapi_cb_t cb);
+extern int llapi_get_connect_flags(const char *mnt, __u64 *flags);
+extern int llapi_lsetfacl(int argc, char *argv[]);
+extern int llapi_lgetfacl(int argc, char *argv[]);
+extern int llapi_rsetfacl(int argc, char *argv[]);
+extern int llapi_rgetfacl(int argc, char *argv[]);
+extern int llapi_cp(int argc, char *argv[]);
+extern int llapi_ls(int argc, char *argv[]);
+extern int llapi_fid2path(const char *device, const char *fidstr, char *path,
+ int pathlen, long long *recno, int *linkno);
+extern int llapi_path2fid(const char *path, lustre_fid *fid);
+extern int llapi_fd2fid(const int fd, lustre_fid *fid);
+
+extern int llapi_get_version(char *buffer, int buffer_size, char **version);
+extern int llapi_get_data_version(int fd, __u64 *data_version, __u64 flags);
+extern int llapi_hsm_state_get(const char *path, struct hsm_user_state *hus);
+extern int llapi_hsm_state_set(const char *path, __u64 setmask, __u64 clearmask,
+ __u32 archive_id);
+
+extern int llapi_create_volatile_idx(char *directory, int idx, int mode);
+static inline int llapi_create_volatile(char *directory, int mode)
+{
+ return llapi_create_volatile_idx(directory, -1, mode);
+}
+
+
+extern int llapi_fswap_layouts(const int fd1, const int fd2,
+ __u64 dv1, __u64 dv2, __u64 flags);
+extern int llapi_swap_layouts(const char *path1, const char *path2,
+ __u64 dv1, __u64 dv2, __u64 flags);
+
+/* Changelog interface. priv is private state, managed internally
+ by these functions */
+#define CHANGELOG_FLAG_FOLLOW 0x01 /* Not yet implemented */
+#define CHANGELOG_FLAG_BLOCK 0x02 /* Blocking IO makes sense in case of
+ slow user parsing of the records, but it also prevents us from cleaning
+ up if the records are not consumed. */
+
+/* Records received are now in extended format. Though most of them are still
+ * written to disk in changelog_rec format (to save space and time), they are
+ * converted to extended format in the Lustre API to ease changelog analysis. */
+#define HAVE_CHANGELOG_EXTEND_REC 1
+
+extern int llapi_changelog_start(void **priv, int flags, const char *mdtname,
+ long long startrec);
+extern int llapi_changelog_fini(void **priv);
+extern int llapi_changelog_recv(void *priv, struct changelog_ext_rec **rech);
+extern int llapi_changelog_free(struct changelog_ext_rec **rech);
+/* Allow records up to endrec to be destroyed; requires registered id. */
+extern int llapi_changelog_clear(const char *mdtname, const char *idstr,
+ long long endrec);
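+
+/*
+ * Illustrative sketch of a consumer loop (assumptions: "mdtname" names an
+ * MDT such as "fsname-MDT0000" and a changelog user has been registered):
+ *
+ *	void *ctx;
+ *	struct changelog_ext_rec *rec;
+ *
+ *	if (llapi_changelog_start(&ctx, CHANGELOG_FLAG_BLOCK, mdtname, 0) == 0) {
+ *		while (llapi_changelog_recv(ctx, &rec) == 0) {
+ *			printf("%llu %s\n", (unsigned long long)rec->cr_index,
+ *			       changelog_type2str(rec->cr_type));
+ *			llapi_changelog_free(&rec);
+ *		}
+ *		llapi_changelog_fini(&ctx);
+ *	}
+ */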
+
+/* HSM copytool interface.
+ * priv is private state, managed internally by these functions
+ */
+struct hsm_copytool_private;
+extern int llapi_hsm_copytool_start(struct hsm_copytool_private **priv,
+ char *fsname, int flags,
+ int archive_count, int *archives);
+extern int llapi_hsm_copytool_fini(struct hsm_copytool_private **priv);
+extern int llapi_hsm_copytool_recv(struct hsm_copytool_private *priv,
+ struct hsm_action_list **hal, int *msgsize);
+extern int llapi_hsm_copytool_free(struct hsm_action_list **hal);
+extern int llapi_hsm_copy_start(char *mnt, struct hsm_copy *copy,
+ const struct hsm_action_item *hai);
+extern int llapi_hsm_copy_end(char *mnt, struct hsm_copy *copy,
+ const struct hsm_progress *hp);
+extern int llapi_hsm_progress(char *mnt, struct hsm_progress *hp);
+extern int llapi_hsm_import(const char *dst, int archive, struct stat *st,
+ unsigned long long stripe_size, int stripe_offset,
+ int stripe_count, int stripe_pattern,
+ char *pool_name, lustre_fid *newfid);
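+
+/*
+ * Illustrative sketch of the copy lifecycle (assumptions: "mnt" is the
+ * client mount point and "hai" came from a received hsm_action_list):
+ *
+ *	struct hsm_copy copy;
+ *	struct hsm_progress hp;
+ *
+ *	llapi_hsm_copy_start(mnt, &copy, hai);
+ *	... move the data, optionally reporting with
+ *	    llapi_hsm_progress(mnt, &hp) ...
+ *	llapi_hsm_copy_end(mnt, &copy, &hp);
+ */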
+
+/* HSM user interface */
+extern struct hsm_user_request *llapi_hsm_user_request_alloc(int itemcount,
+ int data_len);
+extern int llapi_hsm_request(char *mnt, struct hsm_user_request *request);
+extern int llapi_hsm_current_action(const char *path,
+ struct hsm_current_action *hca);
+/** @} llapi */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_acl.h b/drivers/staging/lustre/lustre/include/lustre_acl.h
new file mode 100644
index 000000000000..5cfb87b180c3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_acl.h
@@ -0,0 +1,42 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_acl.h
+ */
+
+#ifndef _LUSTRE_ACL_H
+#define _LUSTRE_ACL_H
+
+#include <linux/lustre_acl.h>
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_capa.h b/drivers/staging/lustre/lustre/include/lustre_capa.h
new file mode 100644
index 000000000000..d77bffc0b59d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_capa.h
@@ -0,0 +1,305 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_capa.h
+ *
+ * Author: Lai Siyao <lsy@clusterfs.com>
+ */
+
+#ifndef __LINUX_CAPA_H_
+#define __LINUX_CAPA_H_
+
+/** \defgroup capa capa
+ *
+ * @{
+ */
+
+/*
+ * capability
+ */
+#include <linux/crypto.h>
+#include <lustre/lustre_idl.h>
+
+#define CAPA_TIMEOUT 1800 /* sec, == 30 min */
+#define CAPA_KEY_TIMEOUT (24 * 60 * 60) /* sec, == 1 day */
+
+struct capa_hmac_alg {
+ const char *ha_name;
+ int ha_len;
+ int ha_keylen;
+};
+
+#define DEF_CAPA_HMAC_ALG(name, type, len, keylen) \
+[CAPA_HMAC_ALG_ ## type] = { \
+ .ha_name = name, \
+ .ha_len = len, \
+ .ha_keylen = keylen, \
+}
+
+struct client_capa {
+ struct inode *inode;
+ struct list_head lli_list; /* link to lli_oss_capas */
+};
+
+struct target_capa {
+ struct hlist_node c_hash; /* link to capa hash */
+};
+
+struct obd_capa {
+ struct list_head c_list; /* link to capa_list */
+
+ struct lustre_capa c_capa; /* capa */
+ atomic_t c_refc; /* ref count */
+ cfs_time_t c_expiry; /* jiffies */
+ spinlock_t c_lock; /* protect capa content */
+ int c_site;
+
+ union {
+ struct client_capa cli;
+ struct target_capa tgt;
+ } u;
+};
+
+enum {
+ CAPA_SITE_CLIENT = 0,
+ CAPA_SITE_SERVER,
+ CAPA_SITE_MAX
+};
+
+static inline struct lu_fid *capa_fid(struct lustre_capa *capa)
+{
+ return &capa->lc_fid;
+}
+
+static inline __u64 capa_opc(struct lustre_capa *capa)
+{
+ return capa->lc_opc;
+}
+
+static inline __u64 capa_uid(struct lustre_capa *capa)
+{
+ return capa->lc_uid;
+}
+
+static inline __u64 capa_gid(struct lustre_capa *capa)
+{
+ return capa->lc_gid;
+}
+
+static inline __u32 capa_flags(struct lustre_capa *capa)
+{
+ return capa->lc_flags & 0xffffff;
+}
+
+static inline __u32 capa_alg(struct lustre_capa *capa)
+{
+ return (capa->lc_flags >> 24);
+}
+
+static inline __u32 capa_keyid(struct lustre_capa *capa)
+{
+ return capa->lc_keyid;
+}
+
+static inline __u64 capa_key_seq(struct lustre_capa_key *key)
+{
+ return key->lk_seq;
+}
+
+static inline __u32 capa_key_keyid(struct lustre_capa_key *key)
+{
+ return key->lk_keyid;
+}
+
+static inline __u32 capa_timeout(struct lustre_capa *capa)
+{
+ return capa->lc_timeout;
+}
+
+static inline __u32 capa_expiry(struct lustre_capa *capa)
+{
+ return capa->lc_expiry;
+}
+
+void _debug_capa(struct lustre_capa *, struct libcfs_debug_msg_data *,
+		 const char *fmt, ...);
+#define DEBUG_CAPA(level, capa, fmt, args...) \
+do { \
+ if (((level) & D_CANTMASK) != 0 || \
+ ((libcfs_debug & (level)) != 0 && \
+ (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) { \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \
+ _debug_capa((capa), &msgdata, fmt, ##args); \
+ } \
+} while (0)
+
+#define DEBUG_CAPA_KEY(level, k, fmt, args...) \
+do { \
+CDEBUG(level, fmt " capability key@%p seq "LPU64" keyid %u\n", \
+ ##args, k, capa_key_seq(k), capa_key_keyid(k)); \
+} while (0)
+
+typedef int (*renew_capa_cb_t)(struct obd_capa *, struct lustre_capa *);
+
+/* obdclass/capa.c */
+extern struct list_head capa_list[];
+extern spinlock_t capa_lock;
+extern int capa_count[];
+extern struct kmem_cache *capa_cachep;
+
+struct hlist_head *init_capa_hash(void);
+void cleanup_capa_hash(struct hlist_head *hash);
+
+struct obd_capa *capa_add(struct hlist_head *hash,
+ struct lustre_capa *capa);
+struct obd_capa *capa_lookup(struct hlist_head *hash,
+ struct lustre_capa *capa, int alive);
+
+int capa_hmac(__u8 *hmac, struct lustre_capa *capa, __u8 *key);
+int capa_encrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen);
+int capa_decrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen);
+void capa_cpy(void *dst, struct obd_capa *ocapa);
+static inline struct obd_capa *alloc_capa(int site)
+{
+ struct obd_capa *ocapa;
+
+ if (unlikely(site != CAPA_SITE_CLIENT && site != CAPA_SITE_SERVER))
+ return ERR_PTR(-EINVAL);
+
+ OBD_SLAB_ALLOC_PTR(ocapa, capa_cachep);
+ if (unlikely(!ocapa))
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&ocapa->c_list);
+ atomic_set(&ocapa->c_refc, 1);
+ spin_lock_init(&ocapa->c_lock);
+ ocapa->c_site = site;
+ if (ocapa->c_site == CAPA_SITE_CLIENT)
+ INIT_LIST_HEAD(&ocapa->u.cli.lli_list);
+ else
+ INIT_HLIST_NODE(&ocapa->u.tgt.c_hash);
+
+ return ocapa;
+}
+
+static inline struct obd_capa *capa_get(struct obd_capa *ocapa)
+{
+ if (!ocapa)
+ return NULL;
+
+ atomic_inc(&ocapa->c_refc);
+ return ocapa;
+}
+
+static inline void capa_put(struct obd_capa *ocapa)
+{
+ if (!ocapa)
+ return;
+
+ if (atomic_read(&ocapa->c_refc) == 0) {
+ DEBUG_CAPA(D_ERROR, &ocapa->c_capa, "refc is 0 for");
+ LBUG();
+ }
+
+ if (atomic_dec_and_test(&ocapa->c_refc)) {
+ LASSERT(list_empty(&ocapa->c_list));
+ if (ocapa->c_site == CAPA_SITE_CLIENT) {
+ LASSERT(list_empty(&ocapa->u.cli.lli_list));
+ } else {
+ struct hlist_node *hnode;
+
+ hnode = &ocapa->u.tgt.c_hash;
+ LASSERT(!hnode->next && !hnode->pprev);
+ }
+ OBD_SLAB_FREE(ocapa, capa_cachep, sizeof(*ocapa));
+ }
+}
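+
+/*
+ * Illustrative sketch of the reference counting above (error handling
+ * elided; the extra get/put pair is purely demonstrative):
+ *
+ *	struct obd_capa *ocapa = alloc_capa(CAPA_SITE_CLIENT);
+ *
+ *	if (!IS_ERR(ocapa)) {
+ *		capa_get(ocapa);   -- take an extra reference
+ *		capa_put(ocapa);   -- drop it
+ *		capa_put(ocapa);   -- drop the initial reference, freeing it
+ *	}
+ */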
+
+static inline int open_flags_to_accmode(int flags)
+{
+ int mode = flags;
+
+ if ((mode + 1) & O_ACCMODE)
+ mode++;
+ if (mode & O_TRUNC)
+ mode |= 2;
+
+ return mode;
+}
+
+static inline __u64 capa_open_opc(int mode)
+{
+ return mode & FMODE_WRITE ? CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_READ;
+}
+
+static inline void set_capa_expiry(struct obd_capa *ocapa)
+{
+ cfs_time_t expiry = cfs_time_sub((cfs_time_t)ocapa->c_capa.lc_expiry,
+ cfs_time_current_sec());
+ ocapa->c_expiry = cfs_time_add(cfs_time_current(),
+ cfs_time_seconds(expiry));
+}
+
+static inline int capa_is_expired_sec(struct lustre_capa *capa)
+{
+ return (capa->lc_expiry - cfs_time_current_sec() <= 0);
+}
+
+static inline int capa_is_expired(struct obd_capa *ocapa)
+{
+ return cfs_time_beforeq(ocapa->c_expiry, cfs_time_current());
+}
+
+static inline int capa_opc_supported(struct lustre_capa *capa, __u64 opc)
+{
+ return (capa_opc(capa) & opc) == opc;
+}
+
+struct filter_capa_key {
+ struct list_head k_list;
+ struct lustre_capa_key k_key;
+};
+
+enum {
+ LC_ID_NONE = 0,
+ LC_ID_PLAIN = 1,
+ LC_ID_CONVERT = 2
+};
+
+#define BYPASS_CAPA (struct lustre_capa *)ERR_PTR(-ENOENT)
+
+/** @} capa */
+
+#endif /* __LINUX_CAPA_H_ */
diff --git a/drivers/staging/lustre/lustre/include/lustre_cfg.h b/drivers/staging/lustre/lustre/include/lustre_cfg.h
new file mode 100644
index 000000000000..f12429f38215
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_cfg.h
@@ -0,0 +1,299 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_CFG_H
+#define _LUSTRE_CFG_H
+
+/** \defgroup cfg cfg
+ *
+ * @{
+ */
+
+/*
+ * 1cf6
+ * lcfG
+ */
+#define LUSTRE_CFG_VERSION 0x1cf60001
+#define LUSTRE_CFG_MAX_BUFCOUNT 8
+
+#define LCFG_HDR_SIZE(count) \
+ cfs_size_round(offsetof (struct lustre_cfg, lcfg_buflens[(count)]))
+
+/** If the LCFG_REQUIRED bit is set in a configuration command,
+ * then the client is required to understand this parameter
+ * in order to mount the filesystem. If it does not understand
+ * a REQUIRED command the client mount will fail. */
+#define LCFG_REQUIRED 0x0001000
+
+enum lcfg_command_type {
+ LCFG_ATTACH = 0x00cf001, /**< create a new obd instance */
+ LCFG_DETACH = 0x00cf002, /**< destroy obd instance */
+ LCFG_SETUP = 0x00cf003, /**< call type-specific setup */
+ LCFG_CLEANUP = 0x00cf004, /**< call type-specific cleanup */
+ LCFG_ADD_UUID = 0x00cf005, /**< add a nid to a niduuid */
+ LCFG_DEL_UUID = 0x00cf006, /**< remove a nid from a niduuid */
+ LCFG_MOUNTOPT = 0x00cf007, /**< create a profile (mdc, osc) */
+ LCFG_DEL_MOUNTOPT = 0x00cf008, /**< destroy a profile */
+ LCFG_SET_TIMEOUT = 0x00cf009, /**< set obd_timeout */
+ LCFG_SET_UPCALL = 0x00cf00a, /**< deprecated */
+ LCFG_ADD_CONN = 0x00cf00b, /**< add a failover niduuid to an obd */
+ LCFG_DEL_CONN = 0x00cf00c, /**< remove a failover niduuid */
+ LCFG_LOV_ADD_OBD = 0x00cf00d, /**< add an osc to a lov */
+ LCFG_LOV_DEL_OBD = 0x00cf00e, /**< remove an osc from a lov */
+ LCFG_PARAM = 0x00cf00f, /**< set a proc parameter */
+ LCFG_MARKER = 0x00cf010, /**< metadata about next cfg rec */
+ LCFG_LOG_START = 0x00ce011, /**< mgc only, process a cfg log */
+ LCFG_LOG_END = 0x00ce012, /**< stop processing updates */
+ LCFG_LOV_ADD_INA = 0x00ce013, /**< like LOV_ADD_OBD, inactive */
+ LCFG_ADD_MDC = 0x00cf014, /**< add an mdc to a lmv */
+ LCFG_DEL_MDC = 0x00cf015, /**< remove an mdc from a lmv */
+ LCFG_SPTLRPC_CONF = 0x00ce016, /**< security */
+ LCFG_POOL_NEW = 0x00ce020, /**< create an ost pool name */
+ LCFG_POOL_ADD = 0x00ce021, /**< add an ost to a pool */
+ LCFG_POOL_REM = 0x00ce022, /**< remove an ost from a pool */
+ LCFG_POOL_DEL = 0x00ce023, /**< destroy an ost pool name */
+ LCFG_SET_LDLM_TIMEOUT = 0x00ce030, /**< set ldlm_timeout */
+	LCFG_PRE_CLEANUP	= 0x00cf031, /**< call type-specific
+					      * pre-cleanup */
+};
+
+struct lustre_cfg_bufs {
+ void *lcfg_buf[LUSTRE_CFG_MAX_BUFCOUNT];
+ __u32 lcfg_buflen[LUSTRE_CFG_MAX_BUFCOUNT];
+ __u32 lcfg_bufcount;
+};
+
+struct lustre_cfg {
+ __u32 lcfg_version;
+ __u32 lcfg_command;
+
+ __u32 lcfg_num;
+ __u32 lcfg_flags;
+ __u64 lcfg_nid;
+ __u32 lcfg_nal; /* not used any more */
+
+ __u32 lcfg_bufcount;
+ __u32 lcfg_buflens[0];
+};
+
+enum cfg_record_type {
+ PORTALS_CFG_TYPE = 1,
+ LUSTRE_CFG_TYPE = 123,
+};
+
+#define LUSTRE_CFG_BUFLEN(lcfg, idx) \
+ ((lcfg)->lcfg_bufcount <= (idx) \
+ ? 0 \
+ : (lcfg)->lcfg_buflens[(idx)])
+
+static inline void lustre_cfg_bufs_set(struct lustre_cfg_bufs *bufs,
+ __u32 index,
+ void *buf,
+ __u32 buflen)
+{
+ if (index >= LUSTRE_CFG_MAX_BUFCOUNT)
+ return;
+ if (bufs == NULL)
+ return;
+
+ if (bufs->lcfg_bufcount <= index)
+ bufs->lcfg_bufcount = index + 1;
+
+ bufs->lcfg_buf[index] = buf;
+ bufs->lcfg_buflen[index] = buflen;
+}
+
+static inline void lustre_cfg_bufs_set_string(struct lustre_cfg_bufs *bufs,
+ __u32 index,
+ char *str)
+{
+ lustre_cfg_bufs_set(bufs, index, str, str ? strlen(str) + 1 : 0);
+}
+
+static inline void lustre_cfg_bufs_reset(struct lustre_cfg_bufs *bufs, char *name)
+{
+ memset((bufs), 0, sizeof(*bufs));
+ if (name)
+ lustre_cfg_bufs_set_string(bufs, 0, name);
+}
+
+static inline void *lustre_cfg_buf(struct lustre_cfg *lcfg, int index)
+{
+ int i;
+ int offset;
+ int bufcount;
+ LASSERT(lcfg != NULL);
+ LASSERT(index >= 0);
+
+ bufcount = lcfg->lcfg_bufcount;
+ if (index >= bufcount)
+ return NULL;
+
+ offset = LCFG_HDR_SIZE(lcfg->lcfg_bufcount);
+ for (i = 0; i < index; i++)
+ offset += cfs_size_round(lcfg->lcfg_buflens[i]);
+ return (char *)lcfg + offset;
+}
+
+static inline void lustre_cfg_bufs_init(struct lustre_cfg_bufs *bufs,
+ struct lustre_cfg *lcfg)
+{
+ int i;
+ bufs->lcfg_bufcount = lcfg->lcfg_bufcount;
+ for (i = 0; i < bufs->lcfg_bufcount; i++) {
+ bufs->lcfg_buflen[i] = lcfg->lcfg_buflens[i];
+ bufs->lcfg_buf[i] = lustre_cfg_buf(lcfg, i);
+ }
+}
+
+static inline char *lustre_cfg_string(struct lustre_cfg *lcfg, int index)
+{
+ char *s;
+
+ if (lcfg->lcfg_buflens[index] == 0)
+ return NULL;
+
+ s = lustre_cfg_buf(lcfg, index);
+ if (s == NULL)
+ return NULL;
+
+ /*
+ * make sure the string is NUL-terminated, even if this kills a char
+ * of data. Try to use the padding first, though.
+ */
+ if (s[lcfg->lcfg_buflens[index] - 1] != '\0') {
+ int last = min((int)lcfg->lcfg_buflens[index],
+ cfs_size_round(lcfg->lcfg_buflens[index]) - 1);
+ char lost = s[last];
+ s[last] = '\0';
+ if (lost != '\0') {
+ CWARN("Truncated buf %d to '%s' (lost '%c'...)\n",
+ index, s, lost);
+ }
+ }
+ return s;
+}
+
+static inline int lustre_cfg_len(__u32 bufcount, __u32 *buflens)
+{
+ int i;
+ int len;
+ ENTRY;
+
+ len = LCFG_HDR_SIZE(bufcount);
+ for (i = 0; i < bufcount; i++)
+ len += cfs_size_round(buflens[i]);
+
+ RETURN(cfs_size_round(len));
+}
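+
+/*
+ * A worked example of the record layout computed above, assuming that
+ * cfs_size_round() rounds up to an 8-byte boundary (illustrative only):
+ *
+ *	bufcount = 2, buflens = { 14, 6 }
+ *	len = LCFG_HDR_SIZE(2)		(header plus two __u32 buflens -> 40)
+ *	    + cfs_size_round(14)	(-> 16)
+ *	    + cfs_size_round(6)		(->  8)
+ *	    = 64 bytes
+ *
+ * Every buffer therefore starts on an 8-byte boundary right after the
+ * header, which is exactly the walk lustre_cfg_buf() performs.
+ */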
+
+
+#include <obd_support.h>
+
+static inline struct lustre_cfg *lustre_cfg_new(int cmd,
+ struct lustre_cfg_bufs *bufs)
+{
+ struct lustre_cfg *lcfg;
+ char *ptr;
+ int i;
+
+ ENTRY;
+
+ OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount,
+ bufs->lcfg_buflen));
+ if (!lcfg)
+ RETURN(ERR_PTR(-ENOMEM));
+
+ lcfg->lcfg_version = LUSTRE_CFG_VERSION;
+ lcfg->lcfg_command = cmd;
+ lcfg->lcfg_bufcount = bufs->lcfg_bufcount;
+
+ ptr = (char *)lcfg + LCFG_HDR_SIZE(lcfg->lcfg_bufcount);
+ for (i = 0; i < lcfg->lcfg_bufcount; i++) {
+ lcfg->lcfg_buflens[i] = bufs->lcfg_buflen[i];
+ LOGL((char *)bufs->lcfg_buf[i], bufs->lcfg_buflen[i], ptr);
+ }
+ RETURN(lcfg);
+}
+
+static inline void lustre_cfg_free(struct lustre_cfg *lcfg)
+{
+ int len;
+
+ len = lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens);
+
+ OBD_FREE(lcfg, len);
+ EXIT;
+ return;
+}
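+
+/*
+ * Typical allocate/use/free cycle for the helpers above (an illustrative
+ * sketch; the device name and parameter string are made up, and error
+ * handling is elided):
+ *
+ *	struct lustre_cfg_bufs bufs;
+ *	struct lustre_cfg *lcfg;
+ *
+ *	lustre_cfg_bufs_reset(&bufs, "lustre-MDT0000");
+ *	lustre_cfg_bufs_set_string(&bufs, 1, "some.param=value");
+ *	lcfg = lustre_cfg_new(LCFG_PARAM, &bufs);
+ *	if (!IS_ERR(lcfg)) {
+ *		... process the record ...
+ *		lustre_cfg_free(lcfg);
+ *	}
+ *
+ * Buffer 0 conventionally holds the obd name (that is what
+ * lustre_cfg_bufs_reset() sets); the remaining buffers carry
+ * command-specific payload.
+ */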
+
+static inline int lustre_cfg_sanity_check(void *buf, int len)
+{
+ struct lustre_cfg *lcfg = (struct lustre_cfg *)buf;
+ ENTRY;
+ if (!lcfg)
+ RETURN(-EINVAL);
+
+ /* check that the first bits of the struct are valid */
+ if (len < LCFG_HDR_SIZE(0))
+ RETURN(-EINVAL);
+
+ if (lcfg->lcfg_version != LUSTRE_CFG_VERSION)
+ RETURN(-EINVAL);
+
+ if (lcfg->lcfg_bufcount >= LUSTRE_CFG_MAX_BUFCOUNT)
+ RETURN(-EINVAL);
+
+ /* check that the buflens are valid */
+ if (len < LCFG_HDR_SIZE(lcfg->lcfg_bufcount))
+ RETURN(-EINVAL);
+
+ /* make sure all the pointers point inside the data */
+ if (len < lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens))
+ RETURN(-EINVAL);
+
+ RETURN(0);
+}
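+
+/*
+ * A receiver should validate an incoming buffer before trusting anything
+ * past the header, e.g. (sketch only, error handling elided):
+ *
+ *	if (lustre_cfg_sanity_check(buf, len) == 0) {
+ *		struct lustre_cfg *lcfg = buf;
+ *		char *arg = lustre_cfg_string(lcfg, 1);
+ *		...
+ *	}
+ */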
+
+#include <lustre/lustre_user.h>
+
+#ifndef INVALID_UID
+#define INVALID_UID (-1)
+#endif
+
+/** @} cfg */
+
+#endif /* _LUSTRE_CFG_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_debug.h b/drivers/staging/lustre/lustre/include/lustre_debug.h
new file mode 100644
index 000000000000..3d9e4462af43
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_debug.h
@@ -0,0 +1,76 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_DEBUG_H
+#define _LUSTRE_DEBUG_H
+
+/** \defgroup debug debug
+ *
+ * @{
+ */
+
+#include <lustre_net.h>
+#include <obd.h>
+
+#include <linux/lustre_debug.h>
+
+#define ASSERT_MAX_SIZE_MB 60000ULL
+#define ASSERT_PAGE_INDEX(index, OP) \
+do { if (index > ASSERT_MAX_SIZE_MB << (20 - PAGE_CACHE_SHIFT)) { \
+ CERROR("bad page index %lu > %llu\n", index, \
+ ASSERT_MAX_SIZE_MB << (20 - PAGE_CACHE_SHIFT)); \
+ libcfs_debug = ~0UL; \
+ OP; \
+}} while (0)
+
+#define ASSERT_FILE_OFFSET(offset, OP) \
+do { if (offset > ASSERT_MAX_SIZE_MB << 20) { \
+ CERROR("bad file offset %llu > %llu\n", offset, \
+ ASSERT_MAX_SIZE_MB << 20); \
+ libcfs_debug = ~0UL; \
+ OP; \
+}} while (0)
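+
+/*
+ * Both macros guard I/O paths against wildly out-of-range values; on
+ * failure they enable full debugging and then execute OP, e.g.
+ * (illustrative only):
+ *
+ *	ASSERT_PAGE_INDEX(page->index, return -EINVAL);
+ *	ASSERT_FILE_OFFSET(offset, RETURN(-EINVAL));
+ */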
+
+/* lib/debug.c */
+void dump_lniobuf(struct niobuf_local *lnb);
+int dump_req(struct ptlrpc_request *req);
+void dump_lsm(int level, struct lov_stripe_md *lsm);
+int block_debug_setup(void *addr, int len, __u64 off, __u64 id);
+int block_debug_check(char *who, void *addr, int len, __u64 off, __u64 id);
+
+/** @} debug */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_disk.h b/drivers/staging/lustre/lustre/include/lustre_disk.h
new file mode 100644
index 000000000000..8db6086ea4ea
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_disk.h
@@ -0,0 +1,543 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_disk.h
+ *
+ * Lustre disk format definitions.
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+#ifndef _LUSTRE_DISK_H
+#define _LUSTRE_DISK_H
+
+/** \defgroup disk disk
+ *
+ * @{
+ */
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/types.h>
+
+/****************** on-disk files *********************/
+
+#define MDT_LOGS_DIR "LOGS" /* COMPAT_146 */
+#define MOUNT_CONFIGS_DIR "CONFIGS"
+#define CONFIGS_FILE "mountdata"
+/** Persistent mount data are stored on the disk in this file. */
+#define MOUNT_DATA_FILE MOUNT_CONFIGS_DIR"/"CONFIGS_FILE
+#define LAST_RCVD "last_rcvd"
+#define LOV_OBJID "lov_objid"
+#define LOV_OBJSEQ "lov_objseq"
+#define HEALTH_CHECK "health_check"
+#define CAPA_KEYS "capa_keys"
+#define CHANGELOG_USERS "changelog_users"
+#define MGS_NIDTBL_DIR "NIDTBL_VERSIONS"
+#define QMT_DIR "quota_master"
+#define QSD_DIR "quota_slave"
+
+/****************** persistent mount data *********************/
+
+#define LDD_F_SV_TYPE_MDT 0x0001
+#define LDD_F_SV_TYPE_OST 0x0002
+#define LDD_F_SV_TYPE_MGS 0x0004
+#define LDD_F_SV_TYPE_MASK (LDD_F_SV_TYPE_MDT | \
+ LDD_F_SV_TYPE_OST | \
+ LDD_F_SV_TYPE_MGS)
+#define LDD_F_SV_ALL 0x0008
+/** need an index assignment */
+#define LDD_F_NEED_INDEX 0x0010
+/** never registered */
+#define LDD_F_VIRGIN 0x0020
+/** update the config logs for this server */
+#define LDD_F_UPDATE 0x0040
+/** rewrite the LDD */
+#define LDD_F_REWRITE_LDD 0x0080
+/** regenerate config logs for this fs or server */
+#define LDD_F_WRITECONF 0x0100
+/** COMPAT_14 */
+#define LDD_F_UPGRADE14 0x0200
+/** process as lctl conf_param */
+#define LDD_F_PARAM 0x0400
+/** all nodes are specified as service nodes */
+#define LDD_F_NO_PRIMNODE 0x1000
+/** IR enable flag */
+#define LDD_F_IR_CAPABLE 0x2000
+/** the MGS refused to register the target. */
+#define LDD_F_ERROR 0x4000
+
+/* opc for target register */
+#define LDD_F_OPC_REG 0x10000000
+#define LDD_F_OPC_UNREG 0x20000000
+#define LDD_F_OPC_READY 0x40000000
+#define LDD_F_OPC_MASK 0xf0000000
+
+#define LDD_F_ONDISK_MASK (LDD_F_SV_TYPE_MASK)
+
+#define LDD_F_MASK 0xFFFF
+
+enum ldd_mount_type {
+ LDD_MT_EXT3 = 0,
+ LDD_MT_LDISKFS,
+ LDD_MT_SMFS,
+ LDD_MT_REISERFS,
+ LDD_MT_LDISKFS2,
+ LDD_MT_ZFS,
+ LDD_MT_LAST
+};
+
+static inline char *mt_str(enum ldd_mount_type mt)
+{
+ static char *mount_type_string[] = {
+ "ext3",
+ "ldiskfs",
+ "smfs",
+ "reiserfs",
+ "ldiskfs2",
+ "zfs",
+ };
+ return mount_type_string[mt];
+}
+
+static inline char *mt_type(enum ldd_mount_type mt)
+{
+ static char *mount_type_string[] = {
+ "osd-ldiskfs",
+ "osd-ldiskfs",
+ "osd-smfs",
+ "osd-reiserfs",
+ "osd-ldiskfs",
+ "osd-zfs",
+ };
+ return mount_type_string[mt];
+}
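+
+/*
+ * For example, mt_str(LDD_MT_ZFS) returns "zfs" while mt_type(LDD_MT_ZFS)
+ * returns "osd-zfs", the obd type name of the matching OSD. Note that
+ * neither helper range-checks \a mt, so callers must pass a value below
+ * LDD_MT_LAST.
+ */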
+
+#define LDD_INCOMPAT_SUPP 0
+#define LDD_ROCOMPAT_SUPP 0
+
+#define LDD_MAGIC 0x1dd00001
+
+/* On-disk configuration file. In host-endian order. */
+struct lustre_disk_data {
+ __u32 ldd_magic;
+ __u32 ldd_feature_compat; /* compatible feature flags */
+ __u32 ldd_feature_rocompat;/* read-only compatible feature flags */
+ __u32 ldd_feature_incompat;/* incompatible feature flags */
+
+ __u32 ldd_config_ver; /* config rewrite count - not used */
+ __u32 ldd_flags; /* LDD_SV_TYPE */
+ __u32 ldd_svindex; /* server index (0001), must match
+ svname */
+ __u32 ldd_mount_type; /* target fs type LDD_MT_* */
+ char ldd_fsname[64]; /* filesystem this server is part of,
+ MTI_NAME_MAXLEN */
+ char ldd_svname[64]; /* this server's name (lustre-mdt0001)*/
+ __u8 ldd_uuid[40]; /* server UUID (COMPAT_146) */
+
+/*200*/ char ldd_userdata[1024 - 200]; /* arbitrary user string */
+/*1024*/__u8 ldd_padding[4096 - 1024];
+/*4096*/char ldd_mount_opts[4096]; /* target fs mount opts */
+/*8192*/char ldd_params[4096]; /* key=value pairs */
+};
+
+
+#define IS_MDT(data) ((data)->lsi_flags & LDD_F_SV_TYPE_MDT)
+#define IS_OST(data) ((data)->lsi_flags & LDD_F_SV_TYPE_OST)
+#define IS_MGS(data) ((data)->lsi_flags & LDD_F_SV_TYPE_MGS)
+#define IS_SERVER(data) ((data)->lsi_flags & (LDD_F_SV_TYPE_MGS | \
+ LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_OST))
+#define MT_STR(data) mt_str((data)->ldd_mount_type)
+
+/* Make the mdt/ost server obd name based on the filesystem name */
+static inline int server_make_name(__u32 flags, __u16 index, char *fs,
+ char *name)
+{
+ if (flags & (LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_OST)) {
+ if (!(flags & LDD_F_SV_ALL))
+ sprintf(name, "%.8s%c%s%04x", fs,
+ (flags & LDD_F_VIRGIN) ? ':' :
+ ((flags & LDD_F_WRITECONF) ? '=' : '-'),
+ (flags & LDD_F_SV_TYPE_MDT) ? "MDT" : "OST",
+ index);
+ } else if (flags & LDD_F_SV_TYPE_MGS) {
+ sprintf(name, "MGS");
+ } else {
+ CERROR("unknown server type %#x\n", flags);
+ return 1;
+ }
+ return 0;
+}
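+
+/*
+ * For example, with fsname "lustre" this produces "lustre-MDT0000" for a
+ * registered MDT at index 0, "lustre:OST0001" for a never-registered OST
+ * (LDD_F_VIRGIN) at index 1, "lustre=OST0001" while the config logs are
+ * being rewritten (LDD_F_WRITECONF), and simply "MGS" for an MGS.
+ */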
+
+/****************** mount command *********************/
+
+/* The lmd is only used internally by Lustre; mount simply passes
+ everything as string options */
+
+#define LMD_MAGIC 0xbdacbd03
+
+/* gleaned from the mount command - no persistent info here */
+struct lustre_mount_data {
+ __u32 lmd_magic;
+ __u32 lmd_flags; /* lustre mount flags */
+ int lmd_mgs_failnodes; /* mgs failover node count */
+ int lmd_exclude_count;
+ int lmd_recovery_time_soft;
+ int lmd_recovery_time_hard;
+ char *lmd_dev; /* device name */
+ char *lmd_profile; /* client only */
+ char *lmd_mgssec; /* sptlrpc flavor to mgs */
+ char *lmd_opts; /* lustre mount options (as opposed to
+ _device_ mount options) */
+ char *lmd_params; /* lustre params */
+ __u32 *lmd_exclude; /* array of OSTs to ignore */
+ char *lmd_mgs; /* MGS nid */
+ char *lmd_osd_type; /* OSD type */
+};
+
+#define LMD_FLG_SERVER 0x0001 /* Mounting a server */
+#define LMD_FLG_CLIENT 0x0002 /* Mounting a client */
+#define LMD_FLG_ABORT_RECOV 0x0008 /* Abort recovery */
+#define LMD_FLG_NOSVC 0x0010 /* Only start MGS/MGC for servers,
+ no other services */
+#define LMD_FLG_NOMGS 0x0020 /* Only start target for servers, reusing
+ existing MGS services */
+#define LMD_FLG_WRITECONF 0x0040 /* Rewrite config log */
+#define LMD_FLG_NOIR 0x0080 /* NO imperative recovery */
+#define LMD_FLG_NOSCRUB 0x0100 /* Do not trigger scrub automatically */
+#define LMD_FLG_MGS 0x0200 /* Also start MGS along with server */
+#define LMD_FLG_IAM 0x0400 /* IAM dir */
+#define LMD_FLG_NO_PRIMNODE 0x0800 /* all nodes are service nodes */
+#define LMD_FLG_VIRGIN 0x1000 /* the service registers first time */
+#define LMD_FLG_UPDATE 0x2000 /* update parameters */
+
+#define lmd_is_client(x) ((x)->lmd_flags & LMD_FLG_CLIENT)
+
+
+/****************** last_rcvd file *********************/
+
+/** version recovery epoch */
+#define LR_EPOCH_BITS 32
+#define lr_epoch(a) ((a) >> LR_EPOCH_BITS)
+#define LR_EXPIRE_INTERVALS 16 /**< number of intervals to track transno */
+#define ENOENT_VERSION 1 /**< 'virtual' version of a non-existent object */
+
+#define LR_SERVER_SIZE 512
+#define LR_CLIENT_START 8192
+#define LR_CLIENT_SIZE 128
+#if LR_CLIENT_START < LR_SERVER_SIZE
+#error "Can't have LR_CLIENT_START < LR_SERVER_SIZE"
+#endif
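+
+/*
+ * Layout of the last_rcvd file implied by the constants above (a sketch):
+ * the lr_server_data block occupies [0, LR_SERVER_SIZE) and the slot for
+ * client index i starts at
+ *
+ *	loff_t off = LR_CLIENT_START + (loff_t)i * LR_CLIENT_SIZE;
+ *
+ * so client slots can never overlap the server data area, which is what
+ * the #error above enforces.
+ */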
+
+/*
+ * This limit is arbitrary (131072 clients on x86), but it is convenient to use
+ * 2^n * PAGE_CACHE_SIZE * 8 for the number of bits that fit an order-n allocation.
+ * If we need more than 131072 clients (order-2 allocation on x86) then this
+ * should become an array of single-page pointers that are allocated on demand.
+ */
+#if (128 * 1024UL) > (PAGE_CACHE_SIZE * 8)
+#define LR_MAX_CLIENTS (128 * 1024UL)
+#else
+#define LR_MAX_CLIENTS (PAGE_CACHE_SIZE * 8)
+#endif
+
+/** COMPAT_146: this is an OST (temporary) */
+#define OBD_COMPAT_OST 0x00000002
+/** COMPAT_146: this is an MDT (temporary) */
+#define OBD_COMPAT_MDT 0x00000004
+/** 2.0 server, interop flag to show server version is changed */
+#define OBD_COMPAT_20 0x00000008
+
+/** MDS handles LOV_OBJID file */
+#define OBD_ROCOMPAT_LOVOBJID 0x00000001
+
+/** OST handles group subdirs */
+#define OBD_INCOMPAT_GROUPS 0x00000001
+/** this is an OST */
+#define OBD_INCOMPAT_OST 0x00000002
+/** this is an MDT */
+#define OBD_INCOMPAT_MDT 0x00000004
+/** common last_rvcd format */
+#define OBD_INCOMPAT_COMMON_LR 0x00000008
+/** FID is enabled */
+#define OBD_INCOMPAT_FID 0x00000010
+/** Size-on-MDS is enabled */
+#define OBD_INCOMPAT_SOM 0x00000020
+/** filesystem using iam format to store directory entries */
+#define OBD_INCOMPAT_IAM_DIR 0x00000040
+/** LMA attribute contains per-inode incompatible flags */
+#define OBD_INCOMPAT_LMA 0x00000080
+/** lmm_stripe_count has been shrunk from __u32 to __u16 and the remaining 16
+ * bits are now used to store a generation. Once we start changing the layout
+ * and bumping the generation, old versions expecting a 32-bit lmm_stripe_count
+ * will be confused by interpreting stripe_count | gen << 16 as the actual
+ * stripe count */
+#define OBD_INCOMPAT_LMM_VER 0x00000100
+/** multiple OI files for MDT */
+#define OBD_INCOMPAT_MULTI_OI 0x00000200
+
+/* Data stored per server at the head of the last_rcvd file, in little-endian
+ order. This should be common to filter_internal.h, lustre_mds.h */
+struct lr_server_data {
+ __u8 lsd_uuid[40]; /* server UUID */
+ __u64 lsd_last_transno; /* last completed transaction ID */
+ __u64 lsd_compat14; /* reserved - compat with old last_rcvd */
+ __u64 lsd_mount_count; /* incarnation number */
+ __u32 lsd_feature_compat; /* compatible feature flags */
+ __u32 lsd_feature_rocompat;/* read-only compatible feature flags */
+ __u32 lsd_feature_incompat;/* incompatible feature flags */
+ __u32 lsd_server_size; /* size of server data area */
+ __u32 lsd_client_start; /* start of per-client data area */
+ __u16 lsd_client_size; /* size of per-client data area */
+ __u16 lsd_subdir_count; /* number of subdirectories for objects */
+ __u64 lsd_catalog_oid; /* recovery catalog object id */
+ __u32 lsd_catalog_ogen; /* recovery catalog inode generation */
+ __u8 lsd_peeruuid[40]; /* UUID of MDS associated with this OST */
+ __u32 lsd_osd_index; /* index number of OST in LOV */
+ __u32 lsd_padding1; /* was lsd_mdt_index, unused in 2.4.0 */
+ __u32 lsd_start_epoch; /* VBR: start epoch from last boot */
+ /** transaction values since lsd_trans_table_time */
+ __u64 lsd_trans_table[LR_EXPIRE_INTERVALS];
+ /** start point of transno table below */
+ __u32 lsd_trans_table_time; /* time of first slot in table above */
+ __u32 lsd_expire_intervals; /* LR_EXPIRE_INTERVALS */
+ __u8 lsd_padding[LR_SERVER_SIZE - 288];
+};
+
+/* Data stored per client in the last_rcvd file, in little-endian order. */
+struct lsd_client_data {
+ __u8 lcd_uuid[40]; /* client UUID */
+ __u64 lcd_last_transno; /* last completed transaction ID */
+ __u64 lcd_last_xid; /* xid for the last transaction */
+ __u32 lcd_last_result; /* result from last RPC */
+ __u32 lcd_last_data; /* per-op data (disposition for open &c.) */
+ /* for MDS_CLOSE requests */
+ __u64 lcd_last_close_transno; /* last completed transaction ID */
+ __u64 lcd_last_close_xid; /* xid for the last transaction */
+ __u32 lcd_last_close_result; /* result from last RPC */
+ __u32 lcd_last_close_data; /* per-op data */
+ /* VBR: last versions */
+ __u64 lcd_pre_versions[4];
+ __u32 lcd_last_epoch;
+ /** orphan handling for delayed exports relies on this */
+ __u32 lcd_first_epoch;
+ __u8 lcd_padding[LR_CLIENT_SIZE - 128];
+};
+
+/* bug20354: the lcd_uuid for client exports may be wrong */
+static inline void check_lcd(char *obd_name, int index,
+ struct lsd_client_data *lcd)
+{
+ int length = sizeof(lcd->lcd_uuid);
+ if (strnlen((char*)lcd->lcd_uuid, length) == length) {
+ lcd->lcd_uuid[length - 1] = '\0';
+
+ LCONSOLE_ERROR("the client UUID (%s) on %s for exports"
+ "stored in last_rcvd(index = %d) is bad!\n",
+ lcd->lcd_uuid, obd_name, index);
+ }
+}
+
+/* last_rcvd handling */
+static inline void lsd_le_to_cpu(struct lr_server_data *buf,
+ struct lr_server_data *lsd)
+{
+ int i;
+ memcpy(lsd->lsd_uuid, buf->lsd_uuid, sizeof(lsd->lsd_uuid));
+ lsd->lsd_last_transno = le64_to_cpu(buf->lsd_last_transno);
+ lsd->lsd_compat14 = le64_to_cpu(buf->lsd_compat14);
+ lsd->lsd_mount_count = le64_to_cpu(buf->lsd_mount_count);
+ lsd->lsd_feature_compat = le32_to_cpu(buf->lsd_feature_compat);
+ lsd->lsd_feature_rocompat = le32_to_cpu(buf->lsd_feature_rocompat);
+ lsd->lsd_feature_incompat = le32_to_cpu(buf->lsd_feature_incompat);
+ lsd->lsd_server_size = le32_to_cpu(buf->lsd_server_size);
+ lsd->lsd_client_start = le32_to_cpu(buf->lsd_client_start);
+ lsd->lsd_client_size = le16_to_cpu(buf->lsd_client_size);
+ lsd->lsd_subdir_count = le16_to_cpu(buf->lsd_subdir_count);
+ lsd->lsd_catalog_oid = le64_to_cpu(buf->lsd_catalog_oid);
+ lsd->lsd_catalog_ogen = le32_to_cpu(buf->lsd_catalog_ogen);
+ memcpy(lsd->lsd_peeruuid, buf->lsd_peeruuid, sizeof(lsd->lsd_peeruuid));
+ lsd->lsd_osd_index = le32_to_cpu(buf->lsd_osd_index);
+ lsd->lsd_padding1 = le32_to_cpu(buf->lsd_padding1);
+ lsd->lsd_start_epoch = le32_to_cpu(buf->lsd_start_epoch);
+ for (i = 0; i < LR_EXPIRE_INTERVALS; i++)
+ lsd->lsd_trans_table[i] = le64_to_cpu(buf->lsd_trans_table[i]);
+ lsd->lsd_trans_table_time = le32_to_cpu(buf->lsd_trans_table_time);
+ lsd->lsd_expire_intervals = le32_to_cpu(buf->lsd_expire_intervals);
+}
+
+static inline void lsd_cpu_to_le(struct lr_server_data *lsd,
+ struct lr_server_data *buf)
+{
+ int i;
+ memcpy(buf->lsd_uuid, lsd->lsd_uuid, sizeof(buf->lsd_uuid));
+ buf->lsd_last_transno = cpu_to_le64(lsd->lsd_last_transno);
+ buf->lsd_compat14 = cpu_to_le64(lsd->lsd_compat14);
+ buf->lsd_mount_count = cpu_to_le64(lsd->lsd_mount_count);
+ buf->lsd_feature_compat = cpu_to_le32(lsd->lsd_feature_compat);
+ buf->lsd_feature_rocompat = cpu_to_le32(lsd->lsd_feature_rocompat);
+ buf->lsd_feature_incompat = cpu_to_le32(lsd->lsd_feature_incompat);
+ buf->lsd_server_size = cpu_to_le32(lsd->lsd_server_size);
+ buf->lsd_client_start = cpu_to_le32(lsd->lsd_client_start);
+ buf->lsd_client_size = cpu_to_le16(lsd->lsd_client_size);
+ buf->lsd_subdir_count = cpu_to_le16(lsd->lsd_subdir_count);
+ buf->lsd_catalog_oid = cpu_to_le64(lsd->lsd_catalog_oid);
+ buf->lsd_catalog_ogen = cpu_to_le32(lsd->lsd_catalog_ogen);
+ memcpy(buf->lsd_peeruuid, lsd->lsd_peeruuid, sizeof(buf->lsd_peeruuid));
+ buf->lsd_osd_index = cpu_to_le32(lsd->lsd_osd_index);
+ buf->lsd_padding1 = cpu_to_le32(lsd->lsd_padding1);
+ buf->lsd_start_epoch = cpu_to_le32(lsd->lsd_start_epoch);
+ for (i = 0; i < LR_EXPIRE_INTERVALS; i++)
+ buf->lsd_trans_table[i] = cpu_to_le64(lsd->lsd_trans_table[i]);
+ buf->lsd_trans_table_time = cpu_to_le32(lsd->lsd_trans_table_time);
+ buf->lsd_expire_intervals = cpu_to_le32(lsd->lsd_expire_intervals);
+}
+
+static inline void lcd_le_to_cpu(struct lsd_client_data *buf,
+ struct lsd_client_data *lcd)
+{
+ memcpy(lcd->lcd_uuid, buf->lcd_uuid, sizeof(lcd->lcd_uuid));
+ lcd->lcd_last_transno = le64_to_cpu(buf->lcd_last_transno);
+ lcd->lcd_last_xid = le64_to_cpu(buf->lcd_last_xid);
+ lcd->lcd_last_result = le32_to_cpu(buf->lcd_last_result);
+ lcd->lcd_last_data = le32_to_cpu(buf->lcd_last_data);
+ lcd->lcd_last_close_transno = le64_to_cpu(buf->lcd_last_close_transno);
+ lcd->lcd_last_close_xid = le64_to_cpu(buf->lcd_last_close_xid);
+ lcd->lcd_last_close_result = le32_to_cpu(buf->lcd_last_close_result);
+ lcd->lcd_last_close_data = le32_to_cpu(buf->lcd_last_close_data);
+ lcd->lcd_pre_versions[0] = le64_to_cpu(buf->lcd_pre_versions[0]);
+ lcd->lcd_pre_versions[1] = le64_to_cpu(buf->lcd_pre_versions[1]);
+ lcd->lcd_pre_versions[2] = le64_to_cpu(buf->lcd_pre_versions[2]);
+ lcd->lcd_pre_versions[3] = le64_to_cpu(buf->lcd_pre_versions[3]);
+ lcd->lcd_last_epoch = le32_to_cpu(buf->lcd_last_epoch);
+ lcd->lcd_first_epoch = le32_to_cpu(buf->lcd_first_epoch);
+}
+
+static inline void lcd_cpu_to_le(struct lsd_client_data *lcd,
+ struct lsd_client_data *buf)
+{
+ memcpy(buf->lcd_uuid, lcd->lcd_uuid, sizeof(lcd->lcd_uuid));
+ buf->lcd_last_transno = cpu_to_le64(lcd->lcd_last_transno);
+ buf->lcd_last_xid = cpu_to_le64(lcd->lcd_last_xid);
+ buf->lcd_last_result = cpu_to_le32(lcd->lcd_last_result);
+ buf->lcd_last_data = cpu_to_le32(lcd->lcd_last_data);
+ buf->lcd_last_close_transno = cpu_to_le64(lcd->lcd_last_close_transno);
+ buf->lcd_last_close_xid = cpu_to_le64(lcd->lcd_last_close_xid);
+ buf->lcd_last_close_result = cpu_to_le32(lcd->lcd_last_close_result);
+ buf->lcd_last_close_data = cpu_to_le32(lcd->lcd_last_close_data);
+ buf->lcd_pre_versions[0] = cpu_to_le64(lcd->lcd_pre_versions[0]);
+ buf->lcd_pre_versions[1] = cpu_to_le64(lcd->lcd_pre_versions[1]);
+ buf->lcd_pre_versions[2] = cpu_to_le64(lcd->lcd_pre_versions[2]);
+ buf->lcd_pre_versions[3] = cpu_to_le64(lcd->lcd_pre_versions[3]);
+ buf->lcd_last_epoch = cpu_to_le32(lcd->lcd_last_epoch);
+ buf->lcd_first_epoch = cpu_to_le32(lcd->lcd_first_epoch);
+}
+
+static inline __u64 lcd_last_transno(struct lsd_client_data *lcd)
+{
+ return (lcd->lcd_last_transno > lcd->lcd_last_close_transno ?
+ lcd->lcd_last_transno : lcd->lcd_last_close_transno);
+}
+
+static inline __u64 lcd_last_xid(struct lsd_client_data *lcd)
+{
+ return (lcd->lcd_last_xid > lcd->lcd_last_close_xid ?
+ lcd->lcd_last_xid : lcd->lcd_last_close_xid);
+}
+
+/****************** superblock additional info *********************/
+
+struct ll_sb_info;
+
+struct lustre_sb_info {
+ int lsi_flags;
+ struct obd_device *lsi_mgc; /* mgc obd */
+ struct lustre_mount_data *lsi_lmd; /* mount command info */
+ struct ll_sb_info *lsi_llsbi; /* add'l client sbi info */
+ struct dt_device *lsi_dt_dev; /* dt device to access disk fs*/
+ struct vfsmount *lsi_srv_mnt; /* the one server mount */
+ atomic_t lsi_mounts; /* references to the srv_mnt */
+ char lsi_svname[MTI_NAME_MAXLEN];
+ char lsi_osd_obdname[64];
+ char lsi_osd_uuid[64];
+ struct obd_export *lsi_osd_exp;
+ char lsi_osd_type[16];
+ char lsi_fstype[16];
+ struct backing_dev_info lsi_bdi; /* each client mountpoint needs
+ its own backing_dev_info */
+};
+
+#define LSI_UMOUNT_FAILOVER 0x00200000
+#define LSI_BDI_INITIALIZED 0x00400000
+
+#define s2lsi(sb) ((struct lustre_sb_info *)((sb)->s_fs_info))
+#define s2lsi_nocast(sb) ((sb)->s_fs_info)
+
+#define get_profile_name(sb) (s2lsi(sb)->lsi_lmd->lmd_profile)
+#define get_mount_flags(sb) (s2lsi(sb)->lsi_lmd->lmd_flags)
+#define get_mntdev_name(sb) (s2lsi(sb)->lsi_lmd->lmd_dev)
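+
+/*
+ * e.g. (an illustrative sketch): given a struct super_block *sb of a
+ * Lustre mount, the per-mount state is reached as
+ *
+ *	struct lustre_sb_info *lsi = s2lsi(sb);
+ *	char *profile = get_profile_name(sb);	(NULL checks elided)
+ */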
+
+
+/****************** mount lookup info *********************/
+
+struct lustre_mount_info {
+ char *lmi_name;
+ struct super_block *lmi_sb;
+ struct vfsmount *lmi_mnt;
+ struct list_head lmi_list_chain;
+};
+
+/****************** prototypes *********************/
+
+/* obd_mount.c */
+int server_name2fsname(const char *svname, char *fsname, const char **endptr);
+int server_name2index(const char *svname, __u32 *idx, const char **endptr);
+int server_name2svname(const char *label, char *svname, const char **endptr,
+ size_t svsize);
+
+int lustre_put_lsi(struct super_block *sb);
+int lustre_start_simple(char *obdname, char *type, char *uuid,
+ char *s1, char *s2, char *s3, char *s4);
+int lustre_start_mgc(struct super_block *sb);
+void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb,
+ struct vfsmount *mnt));
+void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb));
+int lustre_common_put_super(struct super_block *sb);
+
+
+int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type);
+
+/** @} disk */
+
+#endif /* _LUSTRE_DISK_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_dlm.h b/drivers/staging/lustre/lustre/include/lustre_dlm.h
new file mode 100644
index 000000000000..317f928fc151
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_dlm.h
@@ -0,0 +1,1671 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/** \defgroup LDLM Lustre Distributed Lock Manager
+ *
+ * Lustre DLM is based on VAX DLM.
+ * Its two main roles are:
+ * - To provide locking that assures consistency of data on all Lustre nodes.
+ * - To allow clients to cache state protected by a lock by holding the
+ * lock until a conflicting lock is requested or it is expired by the LRU.
+ *
+ * @{
+ */
+
+#ifndef _LUSTRE_DLM_H__
+#define _LUSTRE_DLM_H__
+
+#include <linux/lustre_dlm.h>
+
+#include <lustre_lib.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_handles.h>
+#include <interval_tree.h> /* for interval_node{}, ldlm_extent */
+#include <lu_ref.h>
+
+struct obd_ops;
+struct obd_device;
+
+#define OBD_LDLM_DEVICENAME "ldlm"
+
+#define LDLM_DEFAULT_LRU_SIZE (100 * num_online_cpus())
+#define LDLM_DEFAULT_MAX_ALIVE (cfs_time_seconds(36000))
+#define LDLM_CTIME_AGE_LIMIT (10)
+#define LDLM_DEFAULT_PARALLEL_AST_LIMIT 1024
+
+/**
+ * LDLM non-error return states
+ */
+typedef enum {
+ ELDLM_OK = 0,
+
+ ELDLM_LOCK_CHANGED = 300,
+ ELDLM_LOCK_ABORTED = 301,
+ ELDLM_LOCK_REPLACED = 302,
+ ELDLM_NO_LOCK_DATA = 303,
+ ELDLM_LOCK_WOULDBLOCK = 304,
+
+ ELDLM_NAMESPACE_EXISTS = 400,
+ ELDLM_BAD_NAMESPACE = 401
+} ldlm_error_t;
+
+/**
+ * LDLM namespace type.
+ * The "client" type is actually an indication that this is a narrow local view
+ * into the complete namespace on the server. Such namespaces cannot make any
+ * decisions about lack of conflicts or do any autonomous lock granting without
+ * first speaking to a server.
+ */
+typedef enum {
+ LDLM_NAMESPACE_SERVER = 1 << 0,
+ LDLM_NAMESPACE_CLIENT = 1 << 1
+} ldlm_side_t;
+
+/**
+ * Declaration of flags sent through the wire.
+ **/
+#define LDLM_FL_LOCK_CHANGED 0x000001 /* extent, mode, or resource changed */
+
+/**
+ * If the server returns one of these flags, then the lock was put on that list.
+ * If the client sends one of these flags (during recovery ONLY!), it wants the
+ * lock added to the specified list, no questions asked.
+ */
+#define LDLM_FL_BLOCK_GRANTED 0x000002
+#define LDLM_FL_BLOCK_CONV 0x000004
+#define LDLM_FL_BLOCK_WAIT 0x000008
+
+/* Used to be LDLM_FL_CBPENDING 0x000010 moved to non-wire flags */
+
+#define LDLM_FL_AST_SENT 0x000020 /* blocking or cancel packet was
+ * queued for sending. */
+/* Used to be LDLM_FL_WAIT_NOREPROC 0x000040 moved to non-wire flags */
+/* Used to be LDLM_FL_CANCEL 0x000080 moved to non-wire flags */
+
+/**
+ * Lock is being replayed. This could probably be implied by the fact that one
+ * of BLOCK_{GRANTED,CONV,WAIT} is set, but that is pretty dangerous.
+ */
+#define LDLM_FL_REPLAY 0x000100
+
+#define LDLM_FL_INTENT_ONLY 0x000200 /* Don't grant lock, just do intent. */
+
+/* Used to be LDLM_FL_LOCAL_ONLY 0x000400 moved to non-wire flags */
+/* Used to be LDLM_FL_FAILED 0x000800 moved to non-wire flags */
+
+#define LDLM_FL_HAS_INTENT 0x001000 /* lock request has intent */
+
+/* Used to be LDLM_FL_CANCELING 0x002000 moved to non-wire flags */
+/* Used to be LDLM_FL_LOCAL 0x004000 moved to non-wire flags */
+
+#define LDLM_FL_DISCARD_DATA 0x010000 /* discard (no writeback) on cancel */
+
+#define LDLM_FL_NO_TIMEOUT 0x020000 /* Blocked by group lock - wait
+ * indefinitely */
+
+/** file & record locking */
+#define LDLM_FL_BLOCK_NOWAIT 0x040000 /* Server told not to wait if blocked.
+ * For AGL, OST will not send glimpse
+ * callback. */
+#define LDLM_FL_TEST_LOCK 0x080000 /* return blocking lock */
+
+/* Used to be LDLM_FL_LVB_READY 0x100000 moved to non-wire flags */
+/* Used to be LDLM_FL_KMS_IGNORE 0x200000 moved to non-wire flags */
+/* Used to be LDLM_FL_NO_LRU 0x400000 moved to non-wire flags */
+
+/* Immediately cancel such locks when they block some other locks. Send
+ * cancel notification to the original lock holder, but expect no reply. This
+ * is for clients (like liblustre) that cannot be expected to respond reliably
+ * to blocking ASTs. */
+#define LDLM_FL_CANCEL_ON_BLOCK 0x800000
+
+/* Flags inherited from the parent lock when doing intents. */
+#define LDLM_INHERIT_FLAGS (LDLM_FL_CANCEL_ON_BLOCK)
+
+/* Used to be LDLM_FL_CP_REQD 0x1000000 moved to non-wire flags */
+/* Used to be LDLM_FL_CLEANED 0x2000000 moved to non-wire flags */
+/* Used to be LDLM_FL_ATOMIC_CB 0x4000000 moved to non-wire flags */
+/* Used to be LDLM_FL_BL_AST 0x10000000 moved to non-wire flags */
+/* Used to be LDLM_FL_BL_DONE 0x20000000 moved to non-wire flags */
+
+/* measure lock contention and return -EUSERS if locking contention is high */
+#define LDLM_FL_DENY_ON_CONTENTION 0x40000000
+
+/* These are flags that are mapped into the flags and ASTs of blocking locks */
+#define LDLM_AST_DISCARD_DATA 0x80000000 /* Add FL_DISCARD to blocking ASTs */
+
+/* Flags sent in AST lock_flags to be mapped into the receiving lock. */
+#define LDLM_AST_FLAGS (LDLM_FL_DISCARD_DATA)
+
+/*
+ * --------------------------------------------------------------------------
+ * NOTE! Starting from this point, LDLM_FL_* flags with values above
+ * 0x80000000 will not be sent over the wire.
+ * --------------------------------------------------------------------------
+ */
+
+/**
+ * Declaration of flags not sent through the wire.
+ **/
+
+/**
+ * Used to mark a lock as a target for -EINTR during cp_ast sleep
+ * emulation and the race with an upcoming bl_ast.
+ */
+#define LDLM_FL_FAIL_LOC 0x100000000ULL
+
+/**
+ * Used while processing the unused list to know that we have already
+ * handled this lock and decided to skip it.
+ */
+#define LDLM_FL_SKIPPED 0x200000000ULL
+/* this lock is being destroyed */
+#define LDLM_FL_CBPENDING 0x400000000ULL
+/* not a real flag, not saved in lock */
+#define LDLM_FL_WAIT_NOREPROC 0x800000000ULL
+/* cancellation callback already run */
+#define LDLM_FL_CANCEL 0x1000000000ULL
+#define LDLM_FL_LOCAL_ONLY 0x2000000000ULL
+/* don't run the cancel callback under ldlm_cli_cancel_unused */
+#define LDLM_FL_FAILED 0x4000000000ULL
+/* lock cancel has already been sent */
+#define LDLM_FL_CANCELING 0x8000000000ULL
+/* local lock (ie, no srv/cli split) */
+#define LDLM_FL_LOCAL 0x10000000000ULL
+/* XXX FIXME: This is being added to b_size as a low-risk fix to the fact that
+ * the LVB filling happens _after_ the lock has been granted, so another thread
+ * can match it before the LVB has been updated. As a dirty hack, we set
+ * LDLM_FL_LVB_READY only after we've finished filling in the LVB.
+ * This is only needed on LOV/OSC now, where the LVB is actually used and
+ * callers must set it in input flags.
+ *
+ * The proper fix is to do the granting inside of the completion AST, which can
+ * be replaced with a LVB-aware wrapping function for OSC locks. That change is
+ * pretty high-risk, though, and would need a lot more testing. */
+#define LDLM_FL_LVB_READY 0x20000000000ULL
+/* A lock contributes to the known minimum size (KMS) calculation until it has
+ * finished the part of its cancelation that performs write back on its dirty
+ * pages. It can remain on the granted list during this whole time. Threads
+ * racing to update the KMS after performing their writeback need to know to
+ * exclude each other's locks from the calculation as they walk the granted
+ * list. */
+#define LDLM_FL_KMS_IGNORE 0x40000000000ULL
+/* completion AST to be executed */
+#define LDLM_FL_CP_REQD 0x80000000000ULL
+/* cleanup_resource has already handled the lock */
+#define LDLM_FL_CLEANED 0x100000000000ULL
+/* optimization hint: LDLM can run the blocking callback from the current
+ * context, without involving a separate thread, to decrease the
+ * context-switch rate */
+#define LDLM_FL_ATOMIC_CB 0x200000000000ULL
+
+/* It may happen that a client initiates two operations, e.g. unlink and
+ * mkdir, such that the server sends a blocking AST for conflicting
+ * locks to this client for the first operation, whereas the second
+ * operation has canceled this lock and is waiting for rpc_lock which is
+ * taken by the first operation. LDLM_FL_BL_AST is set by
+ * ldlm_callback_handler() in the lock to prevent the Early Lock Cancel
+ * (ELC) code from cancelling it.
+ *
+ * LDLM_FL_BL_DONE is to be set by ldlm_cancel_callback() when lock
+ * cache is dropped to let ldlm_callback_handler() return EINVAL to the
+ * server. It is used when ELC RPC is already prepared and is waiting
+ * for rpc_lock, too late to send a separate CANCEL RPC. */
+#define LDLM_FL_BL_AST 0x400000000000ULL
+#define LDLM_FL_BL_DONE 0x800000000000ULL
+/* Don't put lock into the LRU list, so that it is not canceled due to aging.
+ * Used by MGC locks, they are cancelled only at unmount or by callback. */
+#define LDLM_FL_NO_LRU 0x1000000000000ULL
+
+/**
+ * The blocking callback is overloaded to perform two functions. These flags
+ * indicate which operation should be performed.
+ */
+#define LDLM_CB_BLOCKING 1
+#define LDLM_CB_CANCELING 2
+
+/**
+ * \name Lock Compatibility Matrix.
+ *
+ * A lock has both a type (extent, flock, inode bits, or plain) and a mode.
+ * Lock types are described in their respective implementation files:
+ * ldlm_{extent,flock,inodebits,plain}.c.
+ *
+ * There are six lock modes along with a compatibility matrix to indicate if
+ * two locks are compatible.
+ *
+ * - EX: Exclusive mode. Before a new file is created, MDS requests EX lock
+ * on the parent.
+ * - PW: Protective Write (normal write) mode. When a client requests a write
+ * lock from an OST, a lock with PW mode will be issued.
+ * - PR: Protective Read (normal read) mode. When a client requests a read from
+ * an OST, a lock with PR mode will be issued. Also, if the client opens a
+ * file for execution, it is granted a lock with PR mode.
+ * - CW: Concurrent Write mode. The type of lock that the MDS grants if a client
+ * requests a write lock during a file open operation.
+ * - CR: Concurrent Read mode. When a client performs a path lookup, MDS grants
+ * an inodebit lock with the CR mode on the intermediate path component.
+ * - NL: Null mode.
+ *
+ * <PRE>
+ * NL CR CW PR PW EX
+ * NL 1 1 1 1 1 1
+ * CR 1 1 1 1 1 0
+ * CW 1 1 1 0 0 0
+ * PR 1 1 0 1 0 0
+ * PW 1 1 0 0 0 0
+ * EX 1 0 0 0 0 0
+ * </PRE>
+ */
+/** @{ */
+#define LCK_COMPAT_EX LCK_NL
+#define LCK_COMPAT_PW (LCK_COMPAT_EX | LCK_CR)
+#define LCK_COMPAT_PR (LCK_COMPAT_PW | LCK_PR)
+#define LCK_COMPAT_CW (LCK_COMPAT_PW | LCK_CW)
+#define LCK_COMPAT_CR (LCK_COMPAT_CW | LCK_PR | LCK_PW)
+#define LCK_COMPAT_NL (LCK_COMPAT_CR | LCK_EX | LCK_GROUP)
+#define LCK_COMPAT_GROUP (LCK_GROUP | LCK_NL)
+#define LCK_COMPAT_COS (LCK_COS)
+/** @} Lock Compatibility Matrix */
+
+extern ldlm_mode_t lck_compat_array[];
+
+static inline void lockmode_verify(ldlm_mode_t mode)
+{
+ LASSERT(mode > LCK_MINMODE && mode < LCK_MAXMODE);
+}
+
+static inline int lockmode_compat(ldlm_mode_t exist_mode, ldlm_mode_t new_mode)
+{
+ return (lck_compat_array[exist_mode] & new_mode);
+}
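+
+/*
+ * Reading the matrix above through lockmode_compat(), e.g. (illustrative):
+ *
+ *	lockmode_compat(LCK_PR, LCK_PR) != 0	two readers may coexist
+ *	lockmode_compat(LCK_PR, LCK_PW) == 0	a writer conflicts with readers
+ *	lockmode_compat(LCK_NL, LCK_EX) != 0	NL is compatible with everything
+ */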
+
+/*
+ *
+ * cluster name spaces
+ *
+ */
+
+#define DLM_OST_NAMESPACE 1
+#define DLM_MDS_NAMESPACE 2
+
+/* XXX
+ - do we just separate this by security domains and use a prefix for
+ multiple namespaces in the same domain?
+ -
+*/
+
+/**
+ * Locking rules for LDLM:
+ *
+ * lr_lock
+ *
+ * lr_lock
+ * waiting_locks_spinlock
+ *
+ * lr_lock
+ * led_lock
+ *
+ * lr_lock
+ * ns_lock
+ *
+ * lr_lvb_mutex
+ * lr_lock
+ *
+ */
+
+struct ldlm_pool;
+struct ldlm_lock;
+struct ldlm_resource;
+struct ldlm_namespace;
+
+/**
+ * Operations on LDLM pools.
+ * LDLM pool is a pool of locks in the namespace without any implicitly
+ * specified limits.
+ * Locks in the pool are organized in LRU.
+ * Local memory pressure or server instructions (e.g. memory pressure on the
+ * server) can trigger freeing of locks from the pool.
+ */
+struct ldlm_pool_ops {
+ /** Recalculate pool \a pl usage */
+ int (*po_recalc)(struct ldlm_pool *pl);
+ /** Cancel at least \a nr locks from pool \a pl */
+ int (*po_shrink)(struct ldlm_pool *pl, int nr,
+ unsigned int gfp_mask);
+ int (*po_setup)(struct ldlm_pool *pl, int limit);
+};
+
+/** One second for the pools thread check interval. Each pool has its own period. */
+#define LDLM_POOLS_THREAD_PERIOD (1)
+
+/** ~6% margin for modest pools. See ldlm_pool.c for details. */
+#define LDLM_POOLS_MODEST_MARGIN_SHIFT (4)
+
+/** Default recalc period for server side pools in sec. */
+#define LDLM_POOL_SRV_DEF_RECALC_PERIOD (1)
+
+/** Default recalc period for client side pools in sec. */
+#define LDLM_POOL_CLI_DEF_RECALC_PERIOD (10)
+
+/**
+ * LDLM pool structure to track granted locks.
+ * For purposes of determining when to release locks on e.g. memory pressure.
+ * This feature is commonly referred to as lru_resize.
+ */
+struct ldlm_pool {
+ /** Pool proc directory. */
+ proc_dir_entry_t *pl_proc_dir;
+ /** Pool name, must be long enough to hold compound proc entry name. */
+ char pl_name[100];
+ /** Lock for protecting SLV/CLV updates. */
+ spinlock_t pl_lock;
+ /** Number of allowed locks in the pool, on both client and server side. */
+ atomic_t pl_limit;
+ /** Number of granted locks in the pool. */
+ atomic_t pl_granted;
+ /** Grant rate per T. */
+ atomic_t pl_grant_rate;
+ /** Cancel rate per T. */
+ atomic_t pl_cancel_rate;
+ /** Server lock volume (SLV). Protected by pl_lock. */
+ __u64 pl_server_lock_volume;
+ /** Current biggest client lock volume. Protected by pl_lock. */
+ __u64 pl_client_lock_volume;
+ /** Lock volume factor. SLV on the client is calculated as follows:
+ * server_slv * lock_volume_factor. */
+ atomic_t pl_lock_volume_factor;
+ /** Time when last SLV from server was obtained. */
+ time_t pl_recalc_time;
+ /** Recalculation period for pool. */
+ time_t pl_recalc_period;
+ /** Recalculation and shrink operations. */
+ struct ldlm_pool_ops *pl_ops;
+ /** Number of planned locks for next period. */
+ int pl_grant_plan;
+ /** Pool statistics. */
+ struct lprocfs_stats *pl_stats;
+};
+
+typedef int (*ldlm_res_policy)(struct ldlm_namespace *, struct ldlm_lock **,
+ void *req_cookie, ldlm_mode_t mode, __u64 flags,
+ void *data);
+
+typedef int (*ldlm_cancel_for_recovery)(struct ldlm_lock *lock);
+
+/**
+ * LVB operations.
+ * LVB is Lock Value Block. This is a special opaque (to LDLM) value that could
+ * be associated with an LDLM lock and transferred from client to server and
+ * back.
+ *
+ * Currently LVBs are used by:
+ * - OSC-OST code to maintain current object size/times
+ * - layout lock code to return the layout when the layout lock is granted
+ */
+struct ldlm_valblock_ops {
+ int (*lvbo_init)(struct ldlm_resource *res);
+ int (*lvbo_update)(struct ldlm_resource *res,
+ struct ptlrpc_request *r,
+ int increase);
+ int (*lvbo_free)(struct ldlm_resource *res);
+ /* Return the size of the LVB data so an appropriately sized RPC
+ * buffer can be reserved */
+ int (*lvbo_size)(struct ldlm_lock *lock);
+ /* Called to fill LVB data into the RPC buffer @buf */
+ int (*lvbo_fill)(struct ldlm_lock *lock, void *buf, int buflen);
+};
+
+/**
+ * LDLM pools related, type of lock pool in the namespace.
+ * Greedy means release cached locks aggressively
+ */
+typedef enum {
+ LDLM_NAMESPACE_GREEDY = 1 << 0,
+ LDLM_NAMESPACE_MODEST = 1 << 1
+} ldlm_appetite_t;
+
+/**
+ * Default values for the "max_nolock_size", "contention_time" and
+ * "contended_locks" namespace tunables.
+ */
+#define NS_DEFAULT_MAX_NOLOCK_BYTES 0
+#define NS_DEFAULT_CONTENTION_SECONDS 2
+#define NS_DEFAULT_CONTENDED_LOCKS 32
+
+struct ldlm_ns_bucket {
+ /** back pointer to namespace */
+ struct ldlm_namespace *nsb_namespace;
+ /**
+ * Estimated lock callback time. Used by adaptive timeout code to
+ * avoid spurious client evictions due to unresponsiveness when in
+ * fact the network or overall system load is at fault
+ */
+ struct adaptive_timeout nsb_at_estimate;
+};
+
+enum {
+ /** LDLM namespace lock stats */
+ LDLM_NSS_LOCKS = 0,
+ LDLM_NSS_LAST
+};
+
+typedef enum {
+ /** invalid type */
+ LDLM_NS_TYPE_UNKNOWN = 0,
+ /** mdc namespace */
+ LDLM_NS_TYPE_MDC,
+ /** mds namespace */
+ LDLM_NS_TYPE_MDT,
+ /** osc namespace */
+ LDLM_NS_TYPE_OSC,
+ /** ost namespace */
+ LDLM_NS_TYPE_OST,
+ /** mgc namespace */
+ LDLM_NS_TYPE_MGC,
+ /** mgs namespace */
+ LDLM_NS_TYPE_MGT,
+} ldlm_ns_type_t;
+
+/**
+ * LDLM Namespace.
+ *
+ * Namespace serves to contain locks related to a particular service.
+ * There are two kinds of namespaces:
+ * - Server namespace has knowledge of all locks and is therefore authoritative
+ * to make decisions like what locks could be granted and what conflicts
+ * exist during new lock enqueue.
+ * - Client namespace only has limited knowledge about locks in the namespace,
+ * only seeing locks held by the client.
+ *
+ * Every Lustre service has one server namespace present on the server serving
+ * that service. Every client connected to the service has a client namespace
+ * for it.
+ * Every lock obtained by a client in that namespace is actually represented by
+ * two in-memory locks, one on the server and one on the client. The locks are
+ * linked by a special cookie by which one node can tell the other which lock
+ * it actually means during communications. Such locks are called remote locks.
+ * Locks held only by the server, without any reference to a client, are called
+ * local locks.
+ */
+struct ldlm_namespace {
+ /** Backward link to OBD, required for LDLM pool to store new SLV. */
+ struct obd_device *ns_obd;
+
+ /** Flag indicating if namespace is on client instead of server */
+ ldlm_side_t ns_client;
+
+ /** Resource hash table for namespace. */
+ cfs_hash_t *ns_rs_hash;
+
+ /** serialize */
+ spinlock_t ns_lock;
+
+ /** big refcount (by bucket) */
+ atomic_t ns_bref;
+
+ /**
+ * Namespace connect flags supported by server (may be changed via
+ * /proc, LRU resize may be disabled/enabled).
+ */
+ __u64 ns_connect_flags;
+
+ /** Client side original connect flags supported by server. */
+ __u64 ns_orig_connect_flags;
+
+ /* namespace proc dir entry */
+ struct proc_dir_entry *ns_proc_dir_entry;
+
+ /**
+ * Position in global namespace list linking all namespaces on
+ * the node.
+ */
+ struct list_head ns_list_chain;
+
+ /**
+ * List of unused locks for this namespace. This list is also called
+ * LRU lock list.
+ * Unused locks are locks with zero reader/writer reference counts.
+ * This list is only used on clients for lock caching purposes.
+ * When we want to release some locks voluntarily or if server wants
+ * us to release some locks due to e.g. memory pressure, we take locks
+ * to release from the head of this list.
+ * Locks are linked via l_lru field in \see struct ldlm_lock.
+ */
+ struct list_head ns_unused_list;
+ /** Number of locks in the LRU list above */
+ int ns_nr_unused;
+
+ /**
+ * Maximum number of locks permitted in the LRU. If 0, means locks
+ * are managed by pools and there is no preset limit, rather it is all
+ * controlled by available memory on this client and on server.
+ */
+ unsigned int ns_max_unused;
+ /** Maximum allowed age (last used time) for locks in the LRU */
+ unsigned int ns_max_age;
+ /**
+ * Server only: number of times we evicted clients due to lack of reply
+ * to ASTs.
+ */
+ unsigned int ns_timeouts;
+ /**
+ * Number of seconds since the file change time after which the
+ * MDT will return an UPDATE lock along with a LOOKUP lock.
+ * This allows the client to start caching negative dentries
+ * for a directory and may save an RPC for a later stat.
+ */
+ unsigned int ns_ctime_age_limit;
+
+ /**
+ * Used to rate-limit ldlm_namespace_dump calls.
+ * \see ldlm_namespace_dump. Increased by 10 seconds every time
+ * it is called.
+ */
+ cfs_time_t ns_next_dump;
+
+ /** "policy" function that does actual lock conflict determination */
+ ldlm_res_policy ns_policy;
+
+ /**
+ * LVB operations for this namespace.
+ * \see struct ldlm_valblock_ops
+ */
+ struct ldlm_valblock_ops *ns_lvbo;
+
+ /**
+ * Used by filter code to store pointer to OBD of the service.
+ * Should be dropped in favor of \a ns_obd
+ */
+ void *ns_lvbp;
+
+ /**
+ * Wait queue used by __ldlm_namespace_free. Gets woken up every time
+ * a resource is removed.
+ */
+ wait_queue_head_t ns_waitq;
+ /** LDLM pool structure for this namespace */
+ struct ldlm_pool ns_pool;
+ /** Definition of how eagerly unused locks will be released from LRU */
+ ldlm_appetite_t ns_appetite;
+
+ /**
+ * If more than \a ns_contended_locks are found, the resource is
+ * considered to be contended. Lock enqueues might specify that no
+ * contended locks should be granted
+ */
+ unsigned ns_contended_locks;
+
+ /**
+ * The resources in this namespace remember contended state during
+ * \a ns_contention_time, in seconds.
+ */
+ unsigned ns_contention_time;
+
+ /**
+ * Limit size of contended extent locks, in bytes.
+ * If an extent lock is requested for more than this many bytes and the
+ * caller instructs us not to grant contended locks, we disregard
+ * such a request.
+ */
+ unsigned ns_max_nolock_size;
+
+ /** Limit of parallel AST RPC count. */
+ unsigned ns_max_parallel_ast;
+
+ /** Callback to cancel locks before replaying them during recovery. */
+ ldlm_cancel_for_recovery ns_cancel_for_recovery;
+
+ /** LDLM lock stats */
+ struct lprocfs_stats *ns_stats;
+
+ /**
+ * Flag to indicate namespace is being freed. Used to determine if
+ * recalculation of LDLM pool statistics should be skipped.
+ */
+ unsigned ns_stopping:1;
+};
+
+/**
+ * Returns 1 if namespace \a ns is a client namespace.
+ */
+static inline int ns_is_client(struct ldlm_namespace *ns)
+{
+ LASSERT(ns != NULL);
+ LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT |
+ LDLM_NAMESPACE_SERVER)));
+ LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT ||
+ ns->ns_client == LDLM_NAMESPACE_SERVER);
+ return ns->ns_client == LDLM_NAMESPACE_CLIENT;
+}
+
+/**
+ * Returns 1 if namespace \a ns is a server namespace.
+ */
+static inline int ns_is_server(struct ldlm_namespace *ns)
+{
+ LASSERT(ns != NULL);
+ LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT |
+ LDLM_NAMESPACE_SERVER)));
+ LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT ||
+ ns->ns_client == LDLM_NAMESPACE_SERVER);
+ return ns->ns_client == LDLM_NAMESPACE_SERVER;
+}
+
+/**
+ * Returns 1 if namespace \a ns supports early lock cancel (ELC).
+ */
+static inline int ns_connect_cancelset(struct ldlm_namespace *ns)
+{
+ LASSERT(ns != NULL);
+ return !!(ns->ns_connect_flags & OBD_CONNECT_CANCELSET);
+}
+
+/**
+ * Returns 1 if this namespace supports lru_resize.
+ */
+static inline int ns_connect_lru_resize(struct ldlm_namespace *ns)
+{
+ LASSERT(ns != NULL);
+ return !!(ns->ns_connect_flags & OBD_CONNECT_LRU_RESIZE);
+}
+
+static inline void ns_register_cancel(struct ldlm_namespace *ns,
+ ldlm_cancel_for_recovery arg)
+{
+ LASSERT(ns != NULL);
+ ns->ns_cancel_for_recovery = arg;
+}
+
+struct ldlm_lock;
+
+/** Type for blocking callback function of a lock. */
+typedef int (*ldlm_blocking_callback)(struct ldlm_lock *lock,
+ struct ldlm_lock_desc *new, void *data,
+ int flag);
+/** Type for completion callback function of a lock. */
+typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, __u64 flags,
+ void *data);
+/** Type for glimpse callback function of a lock. */
+typedef int (*ldlm_glimpse_callback)(struct ldlm_lock *lock, void *data);
+/** Type for weight callback function of a lock. */
+typedef unsigned long (*ldlm_weigh_callback)(struct ldlm_lock *lock);
+
+/** Work list for sending GL ASTs to multiple locks. */
+struct ldlm_glimpse_work {
+ struct ldlm_lock *gl_lock; /* lock to glimpse */
+ struct list_head gl_list; /* linkage to other gl work structs */
+ __u32 gl_flags;/* see LDLM_GL_WORK_* below */
+ union ldlm_gl_desc *gl_desc; /* glimpse descriptor to be packed in
+ * glimpse callback request */
+};
+
+/** The ldlm_glimpse_work is allocated on the stack and should not be freed. */
+#define LDLM_GL_WORK_NOFREE 0x1
+
+/** Interval node data for each LDLM_EXTENT lock. */
+struct ldlm_interval {
+ struct interval_node li_node; /* node for tree management */
+ struct list_head li_group; /* the locks which share the same
+ * policy (the policy group) */
+};
+#define to_ldlm_interval(n) container_of(n, struct ldlm_interval, li_node)
+
+/**
+ * Interval tree for extent locks.
+ * The interval tree must be accessed under the resource lock.
+ * Interval trees are used for granted extent locks to speed up conflict
+ * lookups. See ldlm/interval_tree.c for more details.
+ */
+struct ldlm_interval_tree {
+ /** Tree size. */
+ int lit_size;
+ ldlm_mode_t lit_mode; /* lock mode */
+ struct interval_node *lit_root; /* actual ldlm_interval */
+};
+
+/** Whether to track references to exports by LDLM locks. */
+#define LUSTRE_TRACKS_LOCK_EXP_REFS (0)
+
+/** Cancel flags. */
+typedef enum {
+ LCF_ASYNC = 0x1, /* Cancel locks asynchronously. */
+ LCF_LOCAL = 0x2, /* Cancel locks locally, not notifying the server */
+ LCF_BL_AST = 0x4, /* Cancel locks marked as LDLM_FL_BL_AST
+ * in the same RPC */
+} ldlm_cancel_flags_t;
+
+struct ldlm_flock {
+ __u64 start;
+ __u64 end;
+ __u64 owner;
+ __u64 blocking_owner;
+ struct obd_export *blocking_export;
+ /* Protected by the hash lock */
+ __u32 blocking_refs;
+ __u32 pid;
+};
+
+typedef union {
+ struct ldlm_extent l_extent;
+ struct ldlm_flock l_flock;
+ struct ldlm_inodebits l_inodebits;
+} ldlm_policy_data_t;
+
+void ldlm_convert_policy_to_wire(ldlm_type_t type,
+ const ldlm_policy_data_t *lpolicy,
+ ldlm_wire_policy_data_t *wpolicy);
+void ldlm_convert_policy_to_local(struct obd_export *exp, ldlm_type_t type,
+ const ldlm_wire_policy_data_t *wpolicy,
+ ldlm_policy_data_t *lpolicy);
+
+enum lvb_type {
+ LVB_T_NONE = 0,
+ LVB_T_OST = 1,
+ LVB_T_LQUOTA = 2,
+ LVB_T_LAYOUT = 3,
+};
+
+/**
+ * LDLM lock structure
+ *
+ * Represents a single LDLM lock and its state in memory. Each lock is
+ * associated with a single ldlm_resource, the object which is being
+ * locked. There may be multiple ldlm_locks on a single resource,
+ * depending on the lock type and whether the locks are conflicting or
+ * not.
+ */
+struct ldlm_lock {
+ /**
+ * Local lock handle.
+ * When the remote side wants to tell us about a lock, it addresses
+ * it by this opaque handle. The handle does not hold a
+ * reference on the ldlm_lock, so it can be safely passed to
+ * other threads or nodes. When the lock needs to be accessed
+ * from the handle, it is looked up again in the lock table, and
+ * may no longer exist.
+ *
+ * Must be first in the structure.
+ */
+ struct portals_handle l_handle;
+ /**
+ * Lock reference count.
+ * This is how many users have pointers to the actual structure, so that
+ * we do not accidentally free a lock structure that is in use.
+ */
+ atomic_t l_refc;
+ /**
+ * Internal spinlock protects l_resource. We should hold this lock
+ * first before taking res_lock.
+ */
+ spinlock_t l_lock;
+ /**
+ * Pointer to actual resource this lock is in.
+ * ldlm_lock_change_resource() can change this.
+ */
+ struct ldlm_resource *l_resource;
+ /**
+ * List item for client side LRU list.
+ * Protected by ns_lock in struct ldlm_namespace.
+ */
+ struct list_head l_lru;
+ /**
+ * Linkage to resource's lock queues according to current lock state.
+ * (could be granted, waiting or converting)
+ * Protected by lr_lock in struct ldlm_resource.
+ */
+ struct list_head l_res_link;
+ /**
+ * Tree node for ldlm_extent.
+ */
+ struct ldlm_interval *l_tree_node;
+ /**
+ * Per export hash of locks.
+ * Protected by per-bucket exp->exp_lock_hash locks.
+ */
+ struct hlist_node l_exp_hash;
+ /**
+ * Per export hash of flock locks.
+ * Protected by per-bucket exp->exp_flock_hash locks.
+ */
+ struct hlist_node l_exp_flock_hash;
+ /**
+ * Requested mode.
+ * Protected by lr_lock.
+ */
+ ldlm_mode_t l_req_mode;
+ /**
+ * Granted mode, also protected by lr_lock.
+ */
+ ldlm_mode_t l_granted_mode;
+ /** Lock completion handler pointer. Called when lock is granted. */
+ ldlm_completion_callback l_completion_ast;
+ /**
+ * Lock blocking AST handler pointer.
+ * It plays two roles:
+ * - as a notification of an attempt to queue a conflicting lock (once)
+ * - as a notification when the lock is being cancelled.
+ *
+ * As such it's typically called twice: once for the initial conflict
+ * and then once more when the last user went away and the lock is
+ * cancelled (could happen recursively).
+ */
+ ldlm_blocking_callback l_blocking_ast;
+ /**
+ * Lock glimpse handler.
+ * The glimpse handler is used by the server to obtain LVB updates
+ * from a client.
+ */
+ ldlm_glimpse_callback l_glimpse_ast;
+
+ /** XXX apparently unused "weight" handler. To be removed? */
+ ldlm_weigh_callback l_weigh_ast;
+
+ /**
+ * Lock export.
+ * This is a pointer to actual client export for locks that were granted
+ * to clients. Used server-side.
+ */
+ struct obd_export *l_export;
+ /**
+ * Lock connection export.
+ * Pointer to server export on a client.
+ */
+ struct obd_export *l_conn_export;
+
+ /**
+ * Remote lock handle.
+ * If the lock is remote, this is the handle of the other side's lock
+ * (l_handle).
+ */
+ struct lustre_handle l_remote_handle;
+
+ /**
+ * Representation of private data specific for a lock type.
+ * Examples are: extent range for extent lock or bitmask for ibits locks
+ */
+ ldlm_policy_data_t l_policy_data;
+
+ /**
+ * Lock state flags.
+ * For example, whether we have received any blocking requests for this lock.
+ * Protected by lr_lock.
+ */
+ __u64 l_flags;
+ /**
+ * Lock r/w usage counters.
+ * Protected by lr_lock.
+ */
+ __u32 l_readers;
+ __u32 l_writers;
+ /**
+ * If the lock is granted, a process sleeps on this waitq to learn when
+ * it's no longer in use. If the lock is not granted, a process sleeps
+ * on this waitq to learn when it becomes granted.
+ */
+ wait_queue_head_t l_waitq;
+
+ /**
+ * In seconds. Updated on any activity related to the lock, e.g.
+ * enqueuing the lock or sending a blocking AST.
+ */
+ cfs_time_t l_last_activity;
+
+ /**
+ * Time the lock was last used, e.g. matched by ldlm_lock_match().
+ * In jiffies; convert to wallclock time if needed.
+ */
+ cfs_time_t l_last_used;
+
+ /** Originally requested extent for the extent lock. */
+ struct ldlm_extent l_req_extent;
+
+ unsigned int l_failed:1,
+ /**
+ * Set for locks that were removed from class hash table and will be
+ * destroyed when last reference to them is released. Set by
+ * ldlm_lock_destroy_internal().
+ *
+ * Protected by lock and resource locks.
+ */
+ l_destroyed:1,
+ /*
+ * it's set in lock_res_and_lock() and unset in unlock_res_and_lock().
+ *
+ * NB: compared with check_res_locked(), checking this bit is cheaper.
+ * Also, spin_is_locked() is deprecated in kernel code; one reason is
+ * that it works only on SMP, so users need extra macros like
+ * LASSERT_SPIN_LOCKED for uniprocessor kernels.
+ */
+ l_res_locked:1,
+ /*
+ * It's set once we call ldlm_add_waiting_lock_res_locked()
+ * to start the lock-timeout timer and it will never be reset.
+ *
+ * Protected by lock_res_and_lock().
+ */
+ l_waited:1,
+ /** Flag whether this is a server namespace lock. */
+ l_ns_srv:1;
+
+ /*
+ * Client-side-only members.
+ */
+
+ enum lvb_type l_lvb_type;
+
+ /**
+ * Temporary storage for an LVB received during an enqueue operation.
+ */
+ __u32 l_lvb_len;
+ void *l_lvb_data;
+
+ /** Private storage for lock user. Opaque to LDLM. */
+ void *l_ast_data;
+
+ /*
+ * Server-side-only members.
+ */
+
+ /**
+ * Connection cookie for the client originating the operation.
+ * Used by Commit on Share (COS) code. Currently only used for
+ * inodebits locks on MDS.
+ */
+ __u64 l_client_cookie;
+
+ /**
+ * List item for locks waiting for cancellation from clients.
+ * Such a lock is first linked into waiting_locks_list (protected by
+ * waiting_locks_spinlock); if the lock times out, it is moved to
+ * expired_lock_thread.elt_expired_locks for further processing.
+ * Protected by elt_lock.
+ */
+ struct list_head l_pending_chain;
+
+ /**
+ * Set when lock is sent a blocking AST. Time in seconds when timeout
+ * is reached and client holding this lock could be evicted.
+ * This timeout could be further extended by e.g. certain IO activity
+ * under this lock.
+ * \see ost_rw_prolong_locks
+ */
+ cfs_time_t l_callback_timeout;
+
+ /** Local PID of process which created this lock. */
+ __u32 l_pid;
+
+ /**
+ * Number of times blocking AST was sent for this lock.
+ * This is for debugging. Valid values are 0 and 1; if there is an
+ * attempt to send a blocking AST more than once, an assertion is
+ * hit. \see ldlm_work_bl_ast_lock
+ */
+ int l_bl_ast_run;
+ /** List item ldlm_add_ast_work_item() for case of blocking ASTs. */
+ struct list_head l_bl_ast;
+ /** List item ldlm_add_ast_work_item() for case of completion ASTs. */
+ struct list_head l_cp_ast;
+ /** For ldlm_add_ast_work_item() for "revoke" AST used in COS. */
+ struct list_head l_rk_ast;
+
+ /**
+ * Pointer to a conflicting lock that caused blocking AST to be sent
+ * for this lock
+ */
+ struct ldlm_lock *l_blocking_lock;
+
+ /**
+ * Protected by lr_lock, linkages to "skip lists".
+ * For more explanations of skip lists see ldlm/ldlm_inodebits.c
+ */
+ struct list_head l_sl_mode;
+ struct list_head l_sl_policy;
+
+ /** Reference tracking structure to debug leaked locks. */
+ struct lu_ref l_reference;
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+ /* Debugging stuff for bug 20498, for tracking export references. */
+ /** number of export references taken */
+ int l_exp_refs_nr;
+ /** link all locks referencing one export */
+ struct list_head l_exp_refs_link;
+ /** referenced export object */
+ struct obd_export *l_exp_refs_target;
+#endif
+ /**
+ * export blocking dlm lock list, protected by
+ * l_export->exp_bl_list_lock.
+ * Lock order of waiting_locks_spinlock, exp_bl_list_lock and res lock
+ * is: res lock -> exp_bl_list_lock -> waiting_locks_spinlock.
+ */
+ struct list_head l_exp_list;
+};
+
+/**
+ * LDLM resource description.
+ * Basically, resource is a representation for a single object.
+ * An object has a name which is currently four 64-bit integers. The LDLM
+ * user is responsible for creating a mapping between the objects it wants
+ * protected and resource names.
+ *
+ * A resource can only hold locks of a single lock type, though there may be
+ * multiple ldlm_locks on a single resource, depending on the lock type and
+ * whether the locks are conflicting or not.
+ */
+struct ldlm_resource {
+ struct ldlm_ns_bucket *lr_ns_bucket;
+
+ /**
+ * List item for list in namespace hash.
+ * protected by ns_lock
+ */
+ struct hlist_node lr_hash;
+
+ /** Spinlock to protect locks under this resource. */
+ spinlock_t lr_lock;
+
+ /**
+ * protected by lr_lock
+ * @{ */
+ /** List of locks in granted state */
+ struct list_head lr_granted;
+ /** List of locks waiting to change their granted mode (converted) */
+ struct list_head lr_converting;
+ /**
+ * List of locks that could not be granted due to conflicts and
+ * that are waiting for conflicts to go away */
+ struct list_head lr_waiting;
+ /** @} */
+
+ /* XXX No longer needed? Remove ASAP */
+ ldlm_mode_t lr_most_restr;
+
+ /** Type of locks this resource can hold. Only one type per resource. */
+ ldlm_type_t lr_type; /* LDLM_{PLAIN,EXTENT,FLOCK,IBITS} */
+
+ /** Resource name */
+ struct ldlm_res_id lr_name;
+ /** Reference count for this resource */
+ atomic_t lr_refcount;
+
+ /**
+ * Interval trees (only for extent locks) for all modes of this resource
+ */
+ struct ldlm_interval_tree lr_itree[LCK_MODE_NUM];
+
+ /**
+ * Server-side-only lock value block elements.
+ * To serialize lvbo_init.
+ */
+ struct mutex lr_lvb_mutex;
+ int lr_lvb_len;
+ /** protected by lr_lock */
+ void *lr_lvb_data;
+
+ /** When the resource was considered contended. */
+ cfs_time_t lr_contention_time;
+ /** List of references to this resource. For debugging. */
+ struct lu_ref lr_reference;
+
+ struct inode *lr_lvb_inode;
+};
+
+static inline bool ldlm_has_layout(struct ldlm_lock *lock)
+{
+ return lock->l_resource->lr_type == LDLM_IBITS &&
+ lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_LAYOUT;
+}
+
+static inline char *
+ldlm_ns_name(struct ldlm_namespace *ns)
+{
+ return ns->ns_rs_hash->hs_name;
+}
+
+static inline struct ldlm_namespace *
+ldlm_res_to_ns(struct ldlm_resource *res)
+{
+ return res->lr_ns_bucket->nsb_namespace;
+}
+
+static inline struct ldlm_namespace *
+ldlm_lock_to_ns(struct ldlm_lock *lock)
+{
+ return ldlm_res_to_ns(lock->l_resource);
+}
+
+static inline char *
+ldlm_lock_to_ns_name(struct ldlm_lock *lock)
+{
+ return ldlm_ns_name(ldlm_lock_to_ns(lock));
+}
+
+static inline struct adaptive_timeout *
+ldlm_lock_to_ns_at(struct ldlm_lock *lock)
+{
+ return &lock->l_resource->lr_ns_bucket->nsb_at_estimate;
+}
+
+static inline int ldlm_lvbo_init(struct ldlm_resource *res)
+{
+ struct ldlm_namespace *ns = ldlm_res_to_ns(res);
+
+ if (ns->ns_lvbo != NULL && ns->ns_lvbo->lvbo_init != NULL)
+ return ns->ns_lvbo->lvbo_init(res);
+
+ return 0;
+}
+
+static inline int ldlm_lvbo_size(struct ldlm_lock *lock)
+{
+ struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+ if (ns->ns_lvbo != NULL && ns->ns_lvbo->lvbo_size != NULL)
+ return ns->ns_lvbo->lvbo_size(lock);
+
+ return 0;
+}
+
+static inline int ldlm_lvbo_fill(struct ldlm_lock *lock, void *buf, int len)
+{
+ struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+ if (ns->ns_lvbo != NULL) {
+ LASSERT(ns->ns_lvbo->lvbo_fill != NULL);
+ return ns->ns_lvbo->lvbo_fill(lock, buf, len);
+ }
+ return 0;
+}
+
+struct ldlm_ast_work {
+ struct ldlm_lock *w_lock;
+ int w_blocking;
+ struct ldlm_lock_desc w_desc;
+ struct list_head w_list;
+ int w_flags;
+ void *w_data;
+ int w_datalen;
+};
+
+/**
+ * Common ldlm_enqueue parameters
+ */
+struct ldlm_enqueue_info {
+ __u32 ei_type; /**< Type of the lock being enqueued. */
+ __u32 ei_mode; /**< Mode of the lock being enqueued. */
+ void *ei_cb_bl; /**< Blocking lock callback. */
+ void *ei_cb_cp; /**< Lock completion callback. */
+ void *ei_cb_gl; /**< Lock glimpse callback. */
+ void *ei_cb_wg; /**< Lock weigh callback. */
+ void *ei_cbdata; /**< Data to be passed into callbacks. */
+};
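+
+/*
+ * A minimal sketch (illustrative only) of how a caller might fill this
+ * structure before passing it to ldlm_cli_enqueue() below; the
+ * my_blocking_ast and my_cb_data names are hypothetical placeholders:
+ *
+ *    struct ldlm_enqueue_info einfo = {
+ *        .ei_type   = LDLM_IBITS,
+ *        .ei_mode   = LCK_CR,
+ *        .ei_cb_bl  = my_blocking_ast,
+ *        .ei_cb_cp  = ldlm_completion_ast,
+ *        .ei_cbdata = my_cb_data,
+ *    };
+ */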
+
+extern struct obd_ops ldlm_obd_ops;
+
+extern char *ldlm_lockname[];
+extern char *ldlm_typename[];
+extern char *ldlm_it2str(int it);
+
+/**
+ * Just a fancy CDEBUG call with log level preset to LDLM_DEBUG.
+ * For the cases where we do not have an actual lock to print along
+ * with an LDLM-related debugging message.
+ */
+#define LDLM_DEBUG_NOLOCK(format, a...) \
+ CDEBUG(D_DLMTRACE, "### " format "\n" , ##a)
+
+/**
+ * Support function for lock information printing into debug logs.
+ * \see LDLM_DEBUG
+ */
+#define ldlm_lock_debug(msgdata, mask, cdls, lock, fmt, a...) do { \
+ CFS_CHECK_STACK(msgdata, mask, cdls); \
+ \
+ if (((mask) & D_CANTMASK) != 0 || \
+ ((libcfs_debug & (mask)) != 0 && \
+ (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \
+ _ldlm_lock_debug(lock, msgdata, fmt, ##a); \
+} while (0)
+
+void _ldlm_lock_debug(struct ldlm_lock *lock,
+ struct libcfs_debug_msg_data *data,
+ const char *fmt, ...)
+ __attribute__ ((format (printf, 3, 4)));
+
+/**
+ * Rate-limited version of lock printing function.
+ */
+#define LDLM_DEBUG_LIMIT(mask, lock, fmt, a...) do { \
+ static cfs_debug_limit_state_t _ldlm_cdls; \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, &_ldlm_cdls); \
+ ldlm_lock_debug(&msgdata, mask, &_ldlm_cdls, lock, "### " fmt , ##a);\
+} while (0)
+
+#define LDLM_ERROR(lock, fmt, a...) LDLM_DEBUG_LIMIT(D_ERROR, lock, fmt, ## a)
+#define LDLM_WARN(lock, fmt, a...) LDLM_DEBUG_LIMIT(D_WARNING, lock, fmt, ## a)
+
+/** Non-rate-limited lock printing function for debugging purposes. */
+#define LDLM_DEBUG(lock, fmt, a...) do { \
+ if (likely(lock != NULL)) { \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_DLMTRACE, NULL); \
+ ldlm_lock_debug(&msgdata, D_DLMTRACE, NULL, lock, \
+ "### " fmt , ##a); \
+ } else { \
+ LDLM_DEBUG_NOLOCK("no dlm lock: " fmt, ##a); \
+ } \
+} while (0)
+
+typedef int (*ldlm_processing_policy)(struct ldlm_lock *lock, __u64 *flags,
+ int first_enq, ldlm_error_t *err,
+ struct list_head *work_list);
+
+/**
+ * Return values for lock iterators.
+ * Also used during deciding of lock grants and cancellations.
+ */
+#define LDLM_ITER_CONTINUE 1 /* keep iterating */
+#define LDLM_ITER_STOP 2 /* stop iterating */
+
+typedef int (*ldlm_iterator_t)(struct ldlm_lock *, void *);
+typedef int (*ldlm_res_iterator_t)(struct ldlm_resource *, void *);
+
+/** \defgroup ldlm_iterator Lock iterators
+ *
+ * LDLM provides a way to iterate through every lock on a resource or in a
+ * namespace, or through every resource in a namespace.
+ * @{ */
+int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter,
+ void *closure);
+void ldlm_namespace_foreach(struct ldlm_namespace *ns, ldlm_iterator_t iter,
+ void *closure);
+int ldlm_resource_iterate(struct ldlm_namespace *, const struct ldlm_res_id *,
+ ldlm_iterator_t iter, void *data);
+/** @} ldlm_iterator */
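+
+/*
+ * A short illustrative iterator callback (not part of this header),
+ * showing how the LDLM_ITER_* return values drive iteration; the
+ * "want" counter is an assumed caller-supplied limit:
+ *
+ *    static int count_locks_cb(struct ldlm_lock *lock, void *closure)
+ *    {
+ *        int *want = closure;
+ *
+ *        if (--(*want) <= 0)
+ *            return LDLM_ITER_STOP;
+ *        return LDLM_ITER_CONTINUE;
+ *    }
+ *
+ * driven by, e.g., ldlm_resource_foreach(res, count_locks_cb, &n).
+ */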
+
+int ldlm_replay_locks(struct obd_import *imp);
+
+/* ldlm_flock.c */
+int ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data);
+
+/* ldlm_extent.c */
+__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms);
+
+struct ldlm_callback_suite {
+ ldlm_completion_callback lcs_completion;
+ ldlm_blocking_callback lcs_blocking;
+ ldlm_glimpse_callback lcs_glimpse;
+ ldlm_weigh_callback lcs_weigh;
+};
+
+/* ldlm_lockd.c */
+int ldlm_del_waiting_lock(struct ldlm_lock *lock);
+int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout);
+int ldlm_get_ref(void);
+void ldlm_put_ref(void);
+int ldlm_init_export(struct obd_export *exp);
+void ldlm_destroy_export(struct obd_export *exp);
+struct ldlm_lock *ldlm_request_lock(struct ptlrpc_request *req);
+
+/* ldlm_lock.c */
+void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg);
+void ldlm_lock2handle(const struct ldlm_lock *lock,
+ struct lustre_handle *lockh);
+struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *, __u64 flags);
+void ldlm_cancel_callback(struct ldlm_lock *);
+int ldlm_lock_remove_from_lru(struct ldlm_lock *);
+int ldlm_lock_set_data(struct lustre_handle *, void *);
+
+/**
+ * Obtain a lock reference by its handle.
+ */
+static inline struct ldlm_lock *ldlm_handle2lock(const struct lustre_handle *h)
+{
+ return __ldlm_handle2lock(h, 0);
+}
+
+#define LDLM_LOCK_REF_DEL(lock) \
+ lu_ref_del(&lock->l_reference, "handle", current)
+
+static inline struct ldlm_lock *
+ldlm_handle2lock_long(const struct lustre_handle *h, __u64 flags)
+{
+ struct ldlm_lock *lock;
+
+ lock = __ldlm_handle2lock(h, flags);
+ if (lock != NULL)
+ LDLM_LOCK_REF_DEL(lock);
+ return lock;
+}
+
+/**
+ * Update Lock Value Block Operations (LVBO) on a resource taking into account
+ * data from request \a r
+ */
+static inline int ldlm_res_lvbo_update(struct ldlm_resource *res,
+ struct ptlrpc_request *r, int increase)
+{
+ if (ldlm_res_to_ns(res)->ns_lvbo &&
+ ldlm_res_to_ns(res)->ns_lvbo->lvbo_update) {
+ return ldlm_res_to_ns(res)->ns_lvbo->lvbo_update(res, r,
+ increase);
+ }
+ return 0;
+}
+
+int ldlm_error2errno(ldlm_error_t error);
+ldlm_error_t ldlm_errno2error(int err_no); /* don't call it `errno': this
+ * confuses user-space. */
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+void ldlm_dump_export_locks(struct obd_export *exp);
+#endif
+
+/**
+ * Release a temporary lock reference obtained by ldlm_handle2lock() or
+ * __ldlm_handle2lock().
+ */
+#define LDLM_LOCK_PUT(lock) \
+do { \
+ LDLM_LOCK_REF_DEL(lock); \
+ /*LDLM_DEBUG((lock), "put");*/ \
+ ldlm_lock_put(lock); \
+} while (0)
+
+/**
+ * Release a lock reference obtained by some other means (see
+ * LDLM_LOCK_PUT()).
+ */
+#define LDLM_LOCK_RELEASE(lock) \
+do { \
+ /*LDLM_DEBUG((lock), "put");*/ \
+ ldlm_lock_put(lock); \
+} while (0)
+
+#define LDLM_LOCK_GET(lock) \
+({ \
+ ldlm_lock_get(lock); \
+ /*LDLM_DEBUG((lock), "get");*/ \
+ lock; \
+})
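+
+/*
+ * Usage sketch for the handle primitives above: a handle does not pin
+ * the lock, so the lookup may fail and the temporary reference must be
+ * dropped afterwards.
+ *
+ *    struct ldlm_lock *lock = ldlm_handle2lock(lockh);
+ *
+ *    if (lock == NULL)
+ *        return;                 // the lock no longer exists
+ *    LDLM_DEBUG(lock, "found by handle");
+ *    LDLM_LOCK_PUT(lock);        // drop the handle reference
+ */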
+
+#define ldlm_lock_list_put(head, member, count) \
+({ \
+ struct ldlm_lock *_lock, *_next; \
+ int c = count; \
+ list_for_each_entry_safe(_lock, _next, head, member) { \
+ if (c-- == 0) \
+ break; \
+ list_del_init(&_lock->member); \
+ LDLM_LOCK_RELEASE(_lock); \
+ } \
+ LASSERT(c <= 0); \
+})
+
+struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock);
+void ldlm_lock_put(struct ldlm_lock *lock);
+void ldlm_lock_destroy(struct ldlm_lock *lock);
+void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc);
+void ldlm_lock_addref(struct lustre_handle *lockh, __u32 mode);
+int ldlm_lock_addref_try(struct lustre_handle *lockh, __u32 mode);
+void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode);
+void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode);
+void ldlm_lock_fail_match_locked(struct ldlm_lock *lock);
+void ldlm_lock_fail_match(struct ldlm_lock *lock);
+void ldlm_lock_allow_match(struct ldlm_lock *lock);
+void ldlm_lock_allow_match_locked(struct ldlm_lock *lock);
+ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags,
+ const struct ldlm_res_id *, ldlm_type_t type,
+ ldlm_policy_data_t *, ldlm_mode_t mode,
+ struct lustre_handle *, int unref);
+ldlm_mode_t ldlm_revalidate_lock_handle(struct lustre_handle *lockh,
+ __u64 *bits);
+struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
+ __u32 *flags);
+void ldlm_lock_downgrade(struct ldlm_lock *lock, int new_mode);
+void ldlm_lock_cancel(struct ldlm_lock *lock);
+void ldlm_reprocess_all(struct ldlm_resource *res);
+void ldlm_reprocess_all_ns(struct ldlm_namespace *ns);
+void ldlm_lock_dump_handle(int level, struct lustre_handle *);
+void ldlm_unlink_lock_skiplist(struct ldlm_lock *req);
+
+/* resource.c */
+struct ldlm_namespace *
+ldlm_namespace_new(struct obd_device *obd, char *name,
+ ldlm_side_t client, ldlm_appetite_t apt,
+ ldlm_ns_type_t ns_type);
+int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags);
+void ldlm_namespace_free(struct ldlm_namespace *ns,
+ struct obd_import *imp, int force);
+void ldlm_namespace_register(struct ldlm_namespace *ns, ldlm_side_t client);
+void ldlm_namespace_unregister(struct ldlm_namespace *ns, ldlm_side_t client);
+void ldlm_namespace_move_locked(struct ldlm_namespace *ns, ldlm_side_t client);
+struct ldlm_namespace *ldlm_namespace_first_locked(ldlm_side_t client);
+void ldlm_namespace_get(struct ldlm_namespace *ns);
+void ldlm_namespace_put(struct ldlm_namespace *ns);
+int ldlm_proc_setup(void);
+#ifdef LPROCFS
+void ldlm_proc_cleanup(void);
+#else
+static inline void ldlm_proc_cleanup(void) {}
+#endif
+
+/* resource.c - internal */
+struct ldlm_resource *ldlm_resource_get(struct ldlm_namespace *ns,
+ struct ldlm_resource *parent,
+ const struct ldlm_res_id *,
+ ldlm_type_t type, int create);
+struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res);
+int ldlm_resource_putref(struct ldlm_resource *res);
+void ldlm_resource_add_lock(struct ldlm_resource *res,
+ struct list_head *head,
+ struct ldlm_lock *lock);
+void ldlm_resource_unlink_lock(struct ldlm_lock *lock);
+void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc);
+void ldlm_dump_all_namespaces(ldlm_side_t client, int level);
+void ldlm_namespace_dump(int level, struct ldlm_namespace *);
+void ldlm_resource_dump(int level, struct ldlm_resource *);
+int ldlm_lock_change_resource(struct ldlm_namespace *, struct ldlm_lock *,
+ const struct ldlm_res_id *);
+
+#define LDLM_RESOURCE_ADDREF(res) do { \
+ lu_ref_add_atomic(&(res)->lr_reference, __FUNCTION__, current); \
+} while (0)
+
+#define LDLM_RESOURCE_DELREF(res) do { \
+ lu_ref_del(&(res)->lr_reference, __FUNCTION__, current); \
+} while (0)
+
+/* ldlm_request.c */
+int ldlm_expired_completion_wait(void *data);
+/** \defgroup ldlm_local_ast Default AST handlers for local locks
+ * These AST handlers are typically used for server-side local locks and are
+ * also used by client-side lock handlers to perform minimum-level base
+ * processing.
+ * @{ */
+int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock);
+int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+ void *data, int flag);
+int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp);
+int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data);
+int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data);
+/** @} ldlm_local_ast */
+
+/** \defgroup ldlm_cli_api API to operate on locks from actual LDLM users.
+ * These are typically used by client and server (*_local versions)
+ * to obtain and release locks.
+ * @{ */
+int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp,
+ struct ldlm_enqueue_info *einfo,
+ const struct ldlm_res_id *res_id,
+ ldlm_policy_data_t const *policy, __u64 *flags,
+ void *lvb, __u32 lvb_len, enum lvb_type lvb_type,
+ struct lustre_handle *lockh, int async);
+int ldlm_prep_enqueue_req(struct obd_export *exp,
+ struct ptlrpc_request *req,
+ struct list_head *cancels,
+ int count);
+int ldlm_prep_elc_req(struct obd_export *exp,
+ struct ptlrpc_request *req,
+ int version, int opc, int canceloff,
+ struct list_head *cancels, int count);
+
+struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len);
+int ldlm_handle_enqueue0(struct ldlm_namespace *ns, struct ptlrpc_request *req,
+ const struct ldlm_request *dlm_req,
+ const struct ldlm_callback_suite *cbs);
+int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
+ ldlm_type_t type, __u8 with_policy, ldlm_mode_t mode,
+ __u64 *flags, void *lvb, __u32 lvb_len,
+ struct lustre_handle *lockh, int rc);
+int ldlm_cli_enqueue_local(struct ldlm_namespace *ns,
+ const struct ldlm_res_id *res_id,
+ ldlm_type_t type, ldlm_policy_data_t *policy,
+ ldlm_mode_t mode, __u64 *flags,
+ ldlm_blocking_callback blocking,
+ ldlm_completion_callback completion,
+ ldlm_glimpse_callback glimpse,
+ void *data, __u32 lvb_len, enum lvb_type lvb_type,
+ const __u64 *client_cookie,
+ struct lustre_handle *lockh);
+int ldlm_server_ast(struct lustre_handle *lockh, struct ldlm_lock_desc *new,
+ void *data, __u32 data_len);
+int ldlm_cli_convert(struct lustre_handle *, int new_mode, __u32 *flags);
+int ldlm_cli_update_pool(struct ptlrpc_request *req);
+int ldlm_cli_cancel(struct lustre_handle *lockh,
+ ldlm_cancel_flags_t cancel_flags);
+int ldlm_cli_cancel_unused(struct ldlm_namespace *, const struct ldlm_res_id *,
+ ldlm_cancel_flags_t flags, void *opaque);
+int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
+ const struct ldlm_res_id *res_id,
+ ldlm_policy_data_t *policy,
+ ldlm_mode_t mode,
+ ldlm_cancel_flags_t flags,
+ void *opaque);
+int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *head,
+ int count, ldlm_cancel_flags_t flags);
+int ldlm_cancel_resource_local(struct ldlm_resource *res,
+ struct list_head *cancels,
+ ldlm_policy_data_t *policy,
+ ldlm_mode_t mode, int lock_flags,
+ ldlm_cancel_flags_t cancel_flags, void *opaque);
+int ldlm_cli_cancel_list_local(struct list_head *cancels, int count,
+ ldlm_cancel_flags_t flags);
+int ldlm_cli_cancel_list(struct list_head *head, int count,
+ struct ptlrpc_request *req, ldlm_cancel_flags_t flags);
+/** @} ldlm_cli_api */
+
+/* mds/handler.c */
+/* This has to be here because recursive inclusion sucks. */
+int intent_disposition(struct ldlm_reply *rep, int flag);
+void intent_set_disposition(struct ldlm_reply *rep, int flag);
+
+
+/* ioctls for trying requests */
+#define IOC_LDLM_TYPE 'f'
+#define IOC_LDLM_MIN_NR 40
+
+#define IOC_LDLM_TEST _IOWR('f', 40, long)
+#define IOC_LDLM_DUMP _IOWR('f', 41, long)
+#define IOC_LDLM_REGRESS_START _IOWR('f', 42, long)
+#define IOC_LDLM_REGRESS_STOP _IOWR('f', 43, long)
+#define IOC_LDLM_MAX_NR 43
+
+/**
+ * "Modes" of acquiring lock_res, necessary to tell lockdep that taking more
+ * than one lock_res is dead-lock safe.
+ */
+enum lock_res_type {
+ LRT_NORMAL,
+ LRT_NEW
+};
+
+/** Lock resource. */
+static inline void lock_res(struct ldlm_resource *res)
+{
+ spin_lock(&res->lr_lock);
+}
+
+/** Lock resource, with a way to tell lockdep that this nesting is safe. */
+static inline void lock_res_nested(struct ldlm_resource *res,
+ enum lock_res_type mode)
+{
+ spin_lock_nested(&res->lr_lock, mode);
+}
+
+/** Unlock resource. */
+static inline void unlock_res(struct ldlm_resource *res)
+{
+ spin_unlock(&res->lr_lock);
+}
+
+/** Check if resource is already locked, assert if not. */
+static inline void check_res_locked(struct ldlm_resource *res)
+{
+ LASSERT(spin_is_locked(&res->lr_lock));
+}
+
+struct ldlm_resource * lock_res_and_lock(struct ldlm_lock *lock);
+void unlock_res_and_lock(struct ldlm_lock *lock);
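+
+/*
+ * Typical resource locking pattern, shown only as a sketch: helpers
+ * that require lr_lock to be held can assert it with
+ * check_res_locked().
+ *
+ *    lock_res(res);
+ *    check_res_locked(res);
+ *    // ... walk res->lr_granted under the lock ...
+ *    unlock_res(res);
+ *
+ * When a second resource must be taken while one is already held, it
+ * is locked with lock_res_nested(res2, LRT_NEW) so lockdep does not
+ * report a false deadlock.
+ */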
+
+/* ldlm_pool.c */
+/** \defgroup ldlm_pools Various LDLM pool related functions
+ * These are not used outside of LDLM.
+ * @{
+ */
+void ldlm_pools_recalc(ldlm_side_t client);
+int ldlm_pools_init(void);
+void ldlm_pools_fini(void);
+
+int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns,
+ int idx, ldlm_side_t client);
+int ldlm_pool_shrink(struct ldlm_pool *pl, int nr,
+ unsigned int gfp_mask);
+void ldlm_pool_fini(struct ldlm_pool *pl);
+int ldlm_pool_setup(struct ldlm_pool *pl, int limit);
+int ldlm_pool_recalc(struct ldlm_pool *pl);
+__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl);
+__u64 ldlm_pool_get_slv(struct ldlm_pool *pl);
+__u64 ldlm_pool_get_clv(struct ldlm_pool *pl);
+__u32 ldlm_pool_get_limit(struct ldlm_pool *pl);
+void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv);
+void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv);
+void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit);
+void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock);
+void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock);
+/** @} */
+
+#endif
+/** @} LDLM */
diff --git a/drivers/staging/lustre/lustre/include/lustre_eacl.h b/drivers/staging/lustre/lustre/include/lustre_eacl.h
new file mode 100644
index 000000000000..b94f76a3301b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_eacl.h
@@ -0,0 +1,95 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lustre/include/lustre_eacl.h
+ *
+ * MDS data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_EACL_H
+#define _LUSTRE_EACL_H
+
+/** \defgroup eacl eacl
+ *
+ * @{
+ */
+
+#ifdef CONFIG_FS_POSIX_ACL
+
+#include <linux/posix_acl_xattr.h>
+
+typedef struct {
+ __u16 e_tag;
+ __u16 e_perm;
+ __u32 e_id;
+ __u32 e_stat;
+} ext_acl_xattr_entry;
+
+typedef struct {
+ __u32 a_count;
+ ext_acl_xattr_entry a_entries[0];
+} ext_acl_xattr_header;
+
+#define CFS_ACL_XATTR_SIZE(count, prefix) \
+ (sizeof(prefix ## _header) + (count) * sizeof(prefix ## _entry))
+
+#define CFS_ACL_XATTR_COUNT(size, prefix) \
+ (((size) - sizeof(prefix ## _header)) / sizeof(prefix ## _entry))
+
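+/*
+ * Sizing sketch for the macros above (illustrative, not a real caller):
+ * a buffer for "count" extended entries would be allocated as
+ *
+ *    int size = CFS_ACL_XATTR_SIZE(count, ext_acl_xattr);
+ *    ext_acl_xattr_header *header = kzalloc(size, GFP_NOFS);
+ *
+ * and CFS_ACL_XATTR_COUNT(size, ext_acl_xattr) recovers the entry count
+ * from a buffer size.
+ */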
+
+extern ext_acl_xattr_header *
+lustre_posix_acl_xattr_2ext(posix_acl_xattr_header *header, int size);
+extern int
+lustre_posix_acl_xattr_filter(posix_acl_xattr_header *header, int size,
+ posix_acl_xattr_header **out);
+extern void
+lustre_posix_acl_xattr_free(posix_acl_xattr_header *header, int size);
+extern void
+lustre_ext_acl_xattr_free(ext_acl_xattr_header *header);
+extern int
+lustre_acl_xattr_merge2posix(posix_acl_xattr_header *posix_header, int size,
+ ext_acl_xattr_header *ext_header,
+ posix_acl_xattr_header **out);
+extern ext_acl_xattr_header *
+lustre_acl_xattr_merge2ext(posix_acl_xattr_header *posix_header, int size,
+ ext_acl_xattr_header *ext_header);
+
+#endif /* CONFIG_FS_POSIX_ACL */
+
+/** @} eacl */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_export.h b/drivers/staging/lustre/lustre/include/lustre_export.h
new file mode 100644
index 000000000000..d61c020a4643
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_export.h
@@ -0,0 +1,389 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/** \defgroup obd_export PortalRPC export definitions
+ *
+ * @{
+ */
+
+#ifndef __EXPORT_H
+#define __EXPORT_H
+
+/** \defgroup export export
+ *
+ * @{
+ */
+
+#include <lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_dlm.h>
+
+struct mds_client_data;
+struct mdt_client_data;
+struct mds_idmap_table;
+struct mdt_idmap_table;
+
+/**
+ * Target-specific export data
+ */
+struct tg_export_data {
+ /** Protects ted_lcd below */
+ struct mutex ted_lcd_lock;
+ /** Per-client data for each export */
+ struct lsd_client_data *ted_lcd;
+ /** Offset of record in last_rcvd file */
+ loff_t ted_lr_off;
+ /** Client index in last_rcvd file */
+ int ted_lr_idx;
+};
+
+/**
+ * MDT-specific export data
+ */
+struct mdt_export_data {
+ struct tg_export_data med_ted;
+ /** List of all files opened by client on this MDT */
+ struct list_head med_open_head;
+ spinlock_t med_open_lock; /* med_open_head, mfd_list */
+ /** Bitmask of all ibit locks this MDT understands */
+ __u64 med_ibits_known;
+ struct mutex med_idmap_mutex;
+ struct lustre_idmap_table *med_idmap;
+};
+
+struct ec_export_data { /* echo client */
+ struct list_head eced_locks;
+};
+
+/* In-memory access to client data from OST struct */
+/** Filter (OSS-side) specific export data */
+struct filter_export_data {
+ struct tg_export_data fed_ted;
+ spinlock_t fed_lock; /**< protects fed_mod_list */
+ long fed_dirty; /* in bytes */
+ long fed_grant; /* in bytes */
+ struct list_head fed_mod_list; /* files being modified */
+ int fed_mod_count; /* items in fed_mod_list */
+ long fed_pending; /* bytes just being written */
+ __u32 fed_group;
+ __u8 fed_pagesize; /* log2 of client page size */
+};
+
+struct mgs_export_data {
+ struct list_head med_clients; /* mgc fs client via this exp */
+ spinlock_t med_lock; /* protect med_clients */
+};
+
+/**
+ * per-NID statistics structure.
+ * It tracks access patterns to this export on a per-client-NID basis
+ */
+struct nid_stat {
+ lnet_nid_t nid;
+ struct hlist_node nid_hash;
+ struct list_head nid_list;
+ struct obd_device *nid_obd;
+ struct proc_dir_entry *nid_proc;
+ struct lprocfs_stats *nid_stats;
+ struct lprocfs_stats *nid_ldlm_stats;
+ atomic_t nid_exp_ref_count; /* for obd_nid_stats_hash
+ exp_nid_stats */
+};
+
+#define nidstat_getref(nidstat) \
+do { \
+ atomic_inc(&(nidstat)->nid_exp_ref_count); \
+} while (0)
+
+#define nidstat_putref(nidstat) \
+do { \
+ atomic_dec(&(nidstat)->nid_exp_ref_count); \
+ LASSERTF(atomic_read(&(nidstat)->nid_exp_ref_count) >= 0, \
+ "stat %p nid_exp_ref_count < 0\n", nidstat); \
+} while (0)
+
+enum obd_option {
+ OBD_OPT_FORCE = 0x0001,
+ OBD_OPT_FAILOVER = 0x0002,
+ OBD_OPT_ABORT_RECOV = 0x0004,
+};
+
+/**
+ * Export structure. Represents the target side of a connection in portals.
+ * Also used in Lustre to connect between layers on the same node when
+ * there is no network connection in between.
+ * For every connected client there is an export structure on the server
+ * attached to the same obd device.
+ */
+struct obd_export {
+ /**
+ * Export handle; its ID is provided to the client on connect.
+ * Subsequent client RPCs contain this handle ID to identify
+ * which export they are talking to.
+ */
+ struct portals_handle exp_handle;
+ atomic_t exp_refcount;
+ /**
+ * Set of counters below is to track where export references are
+ * kept. The exp_rpc_count is used for reconnect handling also,
+ * the cb_count and locks_count are for debug purposes only for now.
+ * The sum of them should be less than exp_refcount by 3
+ */
+ atomic_t exp_rpc_count; /* RPC references */
+ atomic_t exp_cb_count; /* Commit callback references */
+ /** Number of queued replay requests to be processed */
+ atomic_t exp_replay_count;
+ atomic_t exp_locks_count; /**< Lock references */
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+ struct list_head exp_locks_list;
+ spinlock_t exp_locks_list_guard;
+#endif
+ /** UUID of client connected to this export */
+ struct obd_uuid exp_client_uuid;
+ /** To link all exports on an obd device */
+ struct list_head exp_obd_chain;
+ struct hlist_node exp_uuid_hash; /**< UUID-export hash */
+ struct hlist_node exp_nid_hash; /**< NID-export hash */
+ /**
+ * All exports eligible for ping evictor are linked into a list
+ * through this field in "most time since last request on this export"
+ * order.
+ * Protected by obd_dev_lock.
+ */
+ struct list_head exp_obd_chain_timed;
+ /** Obd device of this export */
+ struct obd_device *exp_obd;
+ /**
+ * "reverse" import to send requests (e.g. from ldlm) back to client
+ * exp_lock protect its change
+ */
+ struct obd_import *exp_imp_reverse;
+ struct nid_stat *exp_nid_stats;
+ struct lprocfs_stats *exp_md_stats;
+ /** Active connection */
+ struct ptlrpc_connection *exp_connection;
+ /** Connection count value from last successful reconnect RPC */
+ __u32 exp_conn_cnt;
+ /** Hash list of all ldlm locks granted on this export */
+ cfs_hash_t *exp_lock_hash;
+ /**
+ * Hash list for POSIX lock deadlock detection, added with
+ * ldlm_lock::l_exp_flock_hash.
+ */
+ cfs_hash_t *exp_flock_hash;
+ struct list_head exp_outstanding_replies;
+ struct list_head exp_uncommitted_replies;
+ spinlock_t exp_uncommitted_replies_lock;
+ /** Last committed transno for this export */
+ __u64 exp_last_committed;
+ /** When the last request was received */
+ cfs_time_t exp_last_request_time;
+ /** On replay all requests waiting for replay are linked here */
+ struct list_head exp_req_replay_queue;
+ /**
+ * protects exp_flags, exp_outstanding_replies and the change
+ * of exp_imp_reverse
+ */
+ spinlock_t exp_lock;
+ /** Compatibility flags for this export are embedded into
+ * exp_connect_data */
+ struct obd_connect_data exp_connect_data;
+ enum obd_option exp_flags;
+ unsigned long exp_failed:1,
+ exp_in_recovery:1,
+ exp_disconnected:1,
+ exp_connecting:1,
+ /** VBR: export missed recovery */
+ exp_delayed:1,
+ /** VBR: failed version checking */
+ exp_vbr_failed:1,
+ exp_req_replay_needed:1,
+ exp_lock_replay_needed:1,
+ exp_need_sync:1,
+ exp_flvr_changed:1,
+ exp_flvr_adapt:1,
+ exp_libclient:1, /* liblustre client? */
+ /* client timed out and tried to reconnect,
+ * but couldn't because of active RPCs */
+ exp_abort_active_req:1,
+ /* whether to swap nidtbl entries for 2.2 clients.
+ * Only used by the MGS to fix LU-1644. */
+ exp_need_mne_swab:1;
+ /* also protected by exp_lock */
+ enum lustre_sec_part exp_sp_peer;
+ struct sptlrpc_flavor exp_flvr; /* current */
+ struct sptlrpc_flavor exp_flvr_old[2]; /* about-to-expire */
+ cfs_time_t exp_flvr_expire[2]; /* seconds */
+
+ /** protects exp_hp_rpcs */
+ spinlock_t exp_rpc_lock;
+ struct list_head exp_hp_rpcs; /* (potential) HP RPCs */
+
+ /** blocking dlm lock list, protected by exp_bl_list_lock */
+ struct list_head exp_bl_list;
+ spinlock_t exp_bl_list_lock;
+
+ /** Target specific data */
+ union {
+ struct tg_export_data eu_target_data;
+ struct mdt_export_data eu_mdt_data;
+ struct filter_export_data eu_filter_data;
+ struct ec_export_data eu_ec_data;
+ struct mgs_export_data eu_mgs_data;
+ } u;
+};
+
+#define exp_target_data u.eu_target_data
+#define exp_mdt_data u.eu_mdt_data
+#define exp_filter_data u.eu_filter_data
+#define exp_ec_data u.eu_ec_data
+
+static inline __u64 *exp_connect_flags_ptr(struct obd_export *exp)
+{
+ return &exp->exp_connect_data.ocd_connect_flags;
+}
+
+static inline __u64 exp_connect_flags(struct obd_export *exp)
+{
+ return *exp_connect_flags_ptr(exp);
+}
+
+static inline int exp_max_brw_size(struct obd_export *exp)
+{
+ LASSERT(exp != NULL);
+ if (exp_connect_flags(exp) & OBD_CONNECT_BRW_SIZE)
+ return exp->exp_connect_data.ocd_brw_size;
+
+ return ONE_MB_BRW_SIZE;
+}
+
+static inline int exp_connect_multibulk(struct obd_export *exp)
+{
+ return exp_max_brw_size(exp) > ONE_MB_BRW_SIZE;
+}
+
+static inline int exp_expired(struct obd_export *exp, cfs_duration_t age)
+{
+ LASSERT(exp->exp_delayed);
+ return cfs_time_before(cfs_time_add(exp->exp_last_request_time, age),
+ cfs_time_current_sec());
+}
+
+static inline int exp_connect_cancelset(struct obd_export *exp)
+{
+ LASSERT(exp != NULL);
+ return !!(exp_connect_flags(exp) & OBD_CONNECT_CANCELSET);
+}
+
+static inline int exp_connect_lru_resize(struct obd_export *exp)
+{
+ LASSERT(exp != NULL);
+ return !!(exp_connect_flags(exp) & OBD_CONNECT_LRU_RESIZE);
+}
+
+static inline int exp_connect_rmtclient(struct obd_export *exp)
+{
+ LASSERT(exp != NULL);
+ return !!(exp_connect_flags(exp) & OBD_CONNECT_RMT_CLIENT);
+}
+
+static inline int client_is_remote(struct obd_export *exp)
+{
+ struct obd_import *imp = class_exp2cliimp(exp);
+
+ return !!(imp->imp_connect_data.ocd_connect_flags &
+ OBD_CONNECT_RMT_CLIENT);
+}
+
+static inline int exp_connect_vbr(struct obd_export *exp)
+{
+ LASSERT(exp != NULL);
+ LASSERT(exp->exp_connection);
+ return !!(exp_connect_flags(exp) & OBD_CONNECT_VBR);
+}
+
+static inline int exp_connect_som(struct obd_export *exp)
+{
+ LASSERT(exp != NULL);
+ return !!(exp_connect_flags(exp) & OBD_CONNECT_SOM);
+}
+
+static inline int exp_connect_umask(struct obd_export *exp)
+{
+ return !!(exp_connect_flags(exp) & OBD_CONNECT_UMASK);
+}
+
+static inline int imp_connect_lru_resize(struct obd_import *imp)
+{
+ struct obd_connect_data *ocd;
+
+ LASSERT(imp != NULL);
+ ocd = &imp->imp_connect_data;
+ return !!(ocd->ocd_connect_flags & OBD_CONNECT_LRU_RESIZE);
+}
+
+static inline int exp_connect_layout(struct obd_export *exp)
+{
+ return !!(exp_connect_flags(exp) & OBD_CONNECT_LAYOUTLOCK);
+}
+
+static inline bool exp_connect_lvb_type(struct obd_export *exp)
+{
+ LASSERT(exp != NULL);
+ return exp_connect_flags(exp) & OBD_CONNECT_LVB_TYPE;
+}
+
+static inline bool imp_connect_lvb_type(struct obd_import *imp)
+{
+ struct obd_connect_data *ocd;
+
+ LASSERT(imp != NULL);
+ ocd = &imp->imp_connect_data;
+ if (ocd->ocd_connect_flags & OBD_CONNECT_LVB_TYPE)
+ return true;
+ else
+ return false;
+}
+
+extern struct obd_export *class_conn2export(struct lustre_handle *conn);
+extern struct obd_device *class_conn2obd(struct lustre_handle *conn);
+
+/** @} export */
+
+#endif /* __EXPORT_H */
+/** @} obd_export */
diff --git a/drivers/staging/lustre/lustre/include/lustre_fid.h b/drivers/staging/lustre/lustre/include/lustre_fid.h
new file mode 100644
index 000000000000..7d20cba07287
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_fid.h
@@ -0,0 +1,762 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_fid.h
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#ifndef __LINUX_FID_H
+#define __LINUX_FID_H
+
+/** \defgroup fid fid
+ *
+ * @{
+ *
+ * http://wiki.lustre.org/index.php/Architecture_-_Interoperability_fids_zfs
+ * describes the FID namespace and interoperability requirements for FIDs.
+ * The important parts of that document are included here for reference.
+ *
+ * FID
+ * File IDentifier generated by client from range allocated by the SEQuence
+ * service and stored in struct lu_fid. The FID is composed of three parts:
+ * SEQuence, ObjectID, and VERsion. The SEQ component is a filesystem
+ * unique 64-bit integer, and only one client is ever assigned any SEQ value.
+ * The first 0x400 FID_SEQ_NORMAL [2^33, 2^33 + 0x400] values are reserved
+ * for system use. The OID component is a 32-bit value generated by the
+ * client on a per-SEQ basis to allow creating many unique FIDs without
+ * communication with the server. The VER component is a 32-bit value that
+ * distinguishes between different FID instantiations, such as snapshots or
+ * separate subtrees within the filesystem. FIDs with the same VER field
+ * are considered part of the same namespace.
+ *
+ * OLD filesystems are those upgraded from Lustre 1.x that predate FIDs, and
+ * MDTs use 32-bit ldiskfs internal inode/generation numbers (IGIFs), while
+ * OSTs use 64-bit Lustre object IDs and generation numbers.
+ *
+ * NEW filesystems are those formatted since the introduction of FIDs.
+ *
+ * IGIF
+ * Inode and Generation In FID, a surrogate FID used to globally identify
+ * an existing object on OLD formatted MDT file system. This would only be
+ * used on MDT0 in a DNE filesystem, because there cannot be more than one
+ * MDT in an OLD formatted filesystem. Belongs to sequence in [12, 2^32 - 1]
+ * range, where inode number is stored in SEQ, and inode generation is in OID.
+ * NOTE: This assumes no more than 2^32-1 inodes exist in the MDT filesystem,
+ * which is the maximum possible for an ldiskfs backend. It also assumes
+ * that the reserved ext3/ext4/ldiskfs inode numbers [0-11] are never visible
+ * to clients, which has always been true.
+ *
+ * IDIF
+ * object ID In FID, a surrogate FID used to globally identify an existing
+ * OST object on OLD formatted OST file system. Belongs to a sequence in
+ * [2^32, 2^33 - 1]. Sequence number is calculated as:
+ *
+ * 1 << 32 | (ost_index << 16) | ((objid >> 32) & 0xffff)
+ *
+ * that is, SEQ consists of 16-bit OST index, and higher 16 bits of object
+ * ID. The generation of unique SEQ values per OST allows the IDIF FIDs to
+ * be identified in the FLD correctly. The OID field is calculated as:
+ *
+ * objid & 0xffffffff
+ *
+ * that is, it consists of lower 32 bits of object ID. For objects within
+ * the IDIF range, object ID extraction will be:
+ *
+ * o_id = (fid->f_seq & 0x7fff) << 16 | fid->f_oid;
+ * o_seq = 0; // formerly group number
+ *
+ * NOTE: This assumes that no more than 2^48-1 objects have ever been created
+ * on any OST, and that no more than 65535 OSTs are in use. Both are very
+ * reasonable assumptions, i.e. an IDIF can uniquely map all objects assuming
+ * a maximum creation rate of 1M objects per second for a maximum of 9 years,
+ * or combinations thereof.
+ *
+ * OST_MDT0
+ * Surrogate FID used to identify an existing object on OLD formatted OST
+ * filesystem. Belongs to the reserved SEQuence 0, and is used prior to
+ * the introduction of FID-on-OST, at which point IDIF will be used to
+ * identify objects as residing on a specific OST.
+ *
+ * LLOG
+ * For Lustre Log objects the object sequence 1 is used. This is compatible
+ * with both OLD and NEW namespaces, as this SEQ number is in the
+ * ext3/ldiskfs reserved inode range and does not conflict with IGIF
+ * sequence numbers.
+ *
+ * ECHO
+ * For testing OST IO performance the object sequence 2 is used. This is
+ * compatible with both OLD and NEW namespaces, as this SEQ number is in
+ * the ext3/ldiskfs reserved inode range and does not conflict with IGIF
+ * sequence numbers.
+ *
+ * OST_MDT1 .. OST_MAX
+ * For testing with multiple MDTs the object sequences 3 through 9 are used,
+ * allowing direct mapping of MDTs 1 through 7 respectively, for a total
+ * of 8 MDTs including OST_MDT0. This matches the legacy CMD project "group"
+ * mappings. However, this SEQ range is only for testing prior to any
+ * production DNE release, as the objects in this range conflict across all
+ * OSTs, as the OST index is not part of the FID. For production DNE usage,
+ * OST objects created by MDT1+ will use FID_SEQ_NORMAL FIDs.
+ *
+ * DLM OST objid to IDIF mapping
+ * For compatibility with existing OLD OST network protocol structures, the
+ * FID must map onto the o_id and o_seq in a manner that ensures existing
+ * objects are identified consistently for IO, as well as onto the LDLM
+ * namespace to ensure that there is only a single resource name for any
+ * IDIF object in the DLM. The OLD OST object DLM resource mapping is:
+ *
+ * resource[] = {o_id, o_seq, 0, 0}; // o_seq == 0 for production releases
+ *
+ * The NEW OST object DLM resource mapping is the same for both MDT and OST:
+ *
+ * resource[] = {SEQ, OID, VER, HASH};
+ *
+ * NOTE: for mapping IDIF values to DLM resource names the o_id may be
+ * larger than the 2^33 reserved sequence numbers for IDIF, so it is possible
+ * for the o_id numbers to overlap FID SEQ numbers in the resource. However,
+ * in all production releases the OLD o_seq field is always zero, and all
+ * valid FID OID values are non-zero, so the lock resources will not collide.
+ * Even so, the MDT and OST resources are also in different LDLM namespaces.
+ */
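+
+/*
+ * Worked IDIF example (values are illustrative only): an object with
+ * objid 0x123456789 on OST index 5 is packed, per the formulas above, as
+ *
+ *    f_seq = (1ULL << 32) | (5 << 16) | ((0x123456789ULL >> 32) & 0xffff)
+ *          = 0x100050001
+ *    f_oid = 0x123456789 & 0xffffffff = 0x23456789
+ *    f_ver = 0
+ */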
+
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_req_layout.h>
+#include <lustre_mdt.h>
+#include <obd.h>
+
+
+struct lu_site;
+struct lu_context;
+
+/* Whole sequences space range and zero range definitions */
+extern const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE;
+extern const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE;
+extern const struct lu_fid LUSTRE_BFL_FID;
+extern const struct lu_fid LU_OBF_FID;
+extern const struct lu_fid LU_DOT_LUSTRE_FID;
+
+enum {
+ /*
+ * This is how many metadata FIDs may be allocated in one sequence (128k)
+ */
+ LUSTRE_METADATA_SEQ_MAX_WIDTH = 0x0000000000020000ULL,
+
+ /*
+ * This is how many data FIDs could be allocated in one sequence (4B - 1)
+ */
+ LUSTRE_DATA_SEQ_MAX_WIDTH = 0x00000000FFFFFFFFULL,
+
+ /*
+ * How many sequences to allocate to a client at once.
+ */
+ LUSTRE_SEQ_META_WIDTH = 0x0000000000000001ULL,
+
+ /*
+ * seq allocation pool size.
+ */
+ LUSTRE_SEQ_BATCH_WIDTH = LUSTRE_SEQ_META_WIDTH * 1000,
+
+ /*
+ * This is how many sequences may be in one super-sequence allocated to
+ * MDTs.
+ */
+ LUSTRE_SEQ_SUPER_WIDTH = ((1ULL << 30ULL) * LUSTRE_SEQ_META_WIDTH)
+};
+
+enum {
+ /** 2^6 FIDs for OI containers */
+ OSD_OI_FID_OID_BITS = 6,
+ /** reserve enough FIDs in case we want more in the future */
+ OSD_OI_FID_OID_BITS_MAX = 10,
+};
+
+/** special OID for local objects */
+enum local_oid {
+ /** \see fld_mod_init */
+ FLD_INDEX_OID = 3UL,
+ /** \see fid_mod_init */
+ FID_SEQ_CTL_OID = 4UL,
+ FID_SEQ_SRV_OID = 5UL,
+ /** \see mdd_mod_init */
+ MDD_ROOT_INDEX_OID = 6UL, /* deprecated in 2.4 */
+ MDD_ORPHAN_OID = 7UL, /* deprecated in 2.4 */
+ MDD_LOV_OBJ_OID = 8UL,
+ MDD_CAPA_KEYS_OID = 9UL,
+ /** \see mdt_mod_init */
+ LAST_RECV_OID = 11UL,
+ OSD_FS_ROOT_OID = 13UL,
+ ACCT_USER_OID = 15UL,
+ ACCT_GROUP_OID = 16UL,
+ LFSCK_BOOKMARK_OID = 17UL,
+ OTABLE_IT_OID = 18UL,
+ /* These two definitions are obsolete
+ * OFD_GROUP0_LAST_OID = 20UL,
+ * OFD_GROUP4K_LAST_OID = 20UL+4096,
+ */
+ OFD_LAST_GROUP_OID = 4117UL,
+ LLOG_CATALOGS_OID = 4118UL,
+ MGS_CONFIGS_OID = 4119UL,
+ OFD_HEALTH_CHECK_OID = 4120UL,
+ MDD_LOV_OBJ_OSEQ = 4121UL,
+ LFSCK_NAMESPACE_OID = 4122UL,
+ REMOTE_PARENT_DIR_OID = 4123UL,
+};
+
+static inline void lu_local_obj_fid(struct lu_fid *fid, __u32 oid)
+{
+ fid->f_seq = FID_SEQ_LOCAL_FILE;
+ fid->f_oid = oid;
+ fid->f_ver = 0;
+}
+
+static inline void lu_local_name_obj_fid(struct lu_fid *fid, __u32 oid)
+{
+ fid->f_seq = FID_SEQ_LOCAL_NAME;
+ fid->f_oid = oid;
+ fid->f_ver = 0;
+}
+
+/* For a new FS (>= 2.4), the root FID will be changed to
+ * [FID_SEQ_ROOT:1:0]; for an existing FS upgraded to 2.4,
+ * the root FID will still be an IGIF */
+static inline int fid_is_root(const struct lu_fid *fid)
+{
+ return unlikely((fid_seq(fid) == FID_SEQ_ROOT &&
+ fid_oid(fid) == 1));
+}
+
+static inline int fid_is_dot_lustre(const struct lu_fid *fid)
+{
+ return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE &&
+ fid_oid(fid) == FID_OID_DOT_LUSTRE);
+}
+
+static inline int fid_is_obf(const struct lu_fid *fid)
+{
+ return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE &&
+ fid_oid(fid) == FID_OID_DOT_LUSTRE_OBF);
+}
+
+static inline int fid_is_otable_it(const struct lu_fid *fid)
+{
+ return unlikely(fid_seq(fid) == FID_SEQ_LOCAL_FILE &&
+ fid_oid(fid) == OTABLE_IT_OID);
+}
+
+static inline int fid_is_acct(const struct lu_fid *fid)
+{
+ return fid_seq(fid) == FID_SEQ_LOCAL_FILE &&
+ (fid_oid(fid) == ACCT_USER_OID ||
+ fid_oid(fid) == ACCT_GROUP_OID);
+}
+
+static inline int fid_is_quota(const struct lu_fid *fid)
+{
+ return fid_seq(fid) == FID_SEQ_QUOTA ||
+ fid_seq(fid) == FID_SEQ_QUOTA_GLB;
+}
+
+static inline int fid_is_namespace_visible(const struct lu_fid *fid)
+{
+ const __u64 seq = fid_seq(fid);
+
+ /* Here, we cannot distinguish whether the normal FID is for an OST
+ * object or not. It is the caller's duty to check further if needed. */
+ return (!fid_is_last_id(fid) &&
+ (fid_seq_is_norm(seq) || fid_seq_is_igif(seq))) ||
+ fid_is_root(fid) || fid_is_dot_lustre(fid);
+}
+
+static inline int fid_seq_in_fldb(__u64 seq)
+{
+ return fid_seq_is_igif(seq) || fid_seq_is_norm(seq) ||
+ fid_seq_is_root(seq) || fid_seq_is_dot(seq);
+}
+
+static inline void lu_last_id_fid(struct lu_fid *fid, __u64 seq)
+{
+ if (fid_seq_is_mdt0(seq)) {
+ fid->f_seq = fid_idif_seq(0, 0);
+ } else {
+ LASSERTF(fid_seq_is_norm(seq) || fid_seq_is_echo(seq) ||
+ fid_seq_is_idif(seq), LPX64"\n", seq);
+ fid->f_seq = seq;
+ }
+ fid->f_oid = 0;
+ fid->f_ver = 0;
+}
+
+enum lu_mgr_type {
+ LUSTRE_SEQ_SERVER,
+ LUSTRE_SEQ_CONTROLLER
+};
+
+struct lu_server_seq;
+
+/* Client sequence manager interface. */
+struct lu_client_seq {
+ /* Sequence-controller export. */
+ struct obd_export *lcs_exp;
+ struct mutex lcs_mutex;
+
+ /*
+ * Range of sequences allowed for allocation. When lu_client_seq is used
+ * on clients, this contains the meta-sequence range; on servers it
+ * contains the super-sequence range.
+ */
+ struct lu_seq_range lcs_space;
+
+ /* Seq related proc */
+ proc_dir_entry_t *lcs_proc_dir;
+
+ /* This holds last allocated fid in last obtained seq */
+ struct lu_fid lcs_fid;
+
+ /* LUSTRE_SEQ_METADATA or LUSTRE_SEQ_DATA */
+ enum lu_cli_type lcs_type;
+
+ /*
+ * Service UUID, passed from the MDT and combined with the seq name to
+ * form a unique seq name for use with procfs.
+ */
+ char lcs_name[80];
+
+ /*
+ * Sequence width, that is how many objects may be allocated in one
+ * sequence. Default value for it is LUSTRE_SEQ_MAX_WIDTH.
+ */
+ __u64 lcs_width;
+
+ /* Seq server to talk to directly */
+ struct lu_server_seq *lcs_srv;
+
+ /* wait queue for fid allocation and update indicator */
+ wait_queue_head_t lcs_waitq;
+ int lcs_update;
+};
+
+/* server sequence manager interface */
+struct lu_server_seq {
+ /* Available sequences space */
+ struct lu_seq_range lss_space;
+
+ /* keeps highwater in lsr_end for seq allocation algorithm */
+ struct lu_seq_range lss_lowater_set;
+ struct lu_seq_range lss_hiwater_set;
+
+ /*
+ * Device for server side seq manager needs (saving sequences to backing
+ * store).
+ */
+ struct dt_device *lss_dev;
+
+ /* /seq file object device */
+ struct dt_object *lss_obj;
+
+ /* Seq related proc */
+ proc_dir_entry_t *lss_proc_dir;
+
+ /* LUSTRE_SEQ_SERVER or LUSTRE_SEQ_CONTROLLER */
+ enum lu_mgr_type lss_type;
+
+ /* Client interface to the request controller */
+ struct lu_client_seq *lss_cli;
+
+ /* Mutex for protecting allocation */
+ struct mutex lss_mutex;
+
+ /*
+ * Service UUID, passed from the MDT and combined with the seq name to
+ * form a unique seq name for use with procfs.
+ */
+ char lss_name[80];
+
+ /*
+ * Allocation chunks for super and meta sequences. Default values are
+ * LUSTRE_SEQ_SUPER_WIDTH and LUSTRE_SEQ_META_WIDTH.
+ */
+ __u64 lss_width;
+
+ /*
+ * minimum lss_alloc_set size that should be allocated from
+ * lss_space
+ */
+ __u64 lss_set_width;
+
+ /* sync is needed for update operation */
+ __u32 lss_need_sync;
+
+ /**
+ * Pointer to site object, required to access site fld.
+ */
+ struct seq_server_site *lss_site;
+};
+
+int seq_query(struct com_thread_info *info);
+int seq_handle(struct ptlrpc_request *req);
+
+/* Server methods */
+int seq_server_init(struct lu_server_seq *seq,
+ struct dt_device *dev,
+ const char *prefix,
+ enum lu_mgr_type type,
+ struct seq_server_site *ss,
+ const struct lu_env *env);
+
+void seq_server_fini(struct lu_server_seq *seq,
+ const struct lu_env *env);
+
+int seq_server_alloc_super(struct lu_server_seq *seq,
+ struct lu_seq_range *out,
+ const struct lu_env *env);
+
+int seq_server_alloc_meta(struct lu_server_seq *seq,
+ struct lu_seq_range *out,
+ const struct lu_env *env);
+
+int seq_server_set_cli(struct lu_server_seq *seq,
+ struct lu_client_seq *cli,
+ const struct lu_env *env);
+
+/* Client methods */
+int seq_client_init(struct lu_client_seq *seq,
+ struct obd_export *exp,
+ enum lu_cli_type type,
+ const char *prefix,
+ struct lu_server_seq *srv);
+
+void seq_client_fini(struct lu_client_seq *seq);
+
+void seq_client_flush(struct lu_client_seq *seq);
+
+int seq_client_alloc_fid(const struct lu_env *env, struct lu_client_seq *seq,
+ struct lu_fid *fid);
+int seq_client_get_seq(const struct lu_env *env, struct lu_client_seq *seq,
+ seqno_t *seqnr);
+int seq_site_fini(const struct lu_env *env, struct seq_server_site *ss);
+/* Fids common stuff */
+int fid_is_local(const struct lu_env *env,
+ struct lu_site *site, const struct lu_fid *fid);
+
+int client_fid_init(struct obd_device *obd, struct obd_export *exp,
+ enum lu_cli_type type);
+int client_fid_fini(struct obd_device *obd);
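+
+/*
+ * Allocation sketch, assuming an already-initialized client sequence
+ * manager: the next FID in the current sequence (moving to a fresh
+ * sequence when the width is exhausted) would be obtained with
+ *
+ *    struct lu_fid fid;
+ *    int rc = seq_client_alloc_fid(env, seq, &fid);
+ *
+ * where "env" and "seq" are supplied by the caller.
+ */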
+
+/* fid locking */
+
+struct ldlm_namespace;
+
+/*
+ * Build (DLM) resource name from FID.
+ *
+ * NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2],
+ * but it was moved into name[1] along with the OID to avoid consuming the
+ * remaining name[2,3] fields that need to be used for the quota identifier.
+ */
+static inline struct ldlm_res_id *
+fid_build_reg_res_name(const struct lu_fid *f,
+ struct ldlm_res_id *name)
+{
+ memset(name, 0, sizeof(*name));
+ name->name[LUSTRE_RES_ID_SEQ_OFF] = fid_seq(f);
+ name->name[LUSTRE_RES_ID_VER_OID_OFF] = fid_ver_oid(f);
+ return name;
+}
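For illustration, a minimal user-space sketch of the name[] layout that fid_build_reg_res_name() produces. It is not part of the patch; the FID values are hypothetical, and it assumes fid_ver_oid() packs f_ver into the high 32 bits and f_oid into the low 32 bits, which is what fid_extract_quota_resid() below implies:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t f_seq = 0x200000401ULL;   /* sample sequence number */
            uint32_t f_oid = 0x2a;             /* sample object id */
            uint32_t f_ver = 0;                /* sample version */
            uint64_t name[4] = { 0 };          /* ldlm_res_id::name */

            name[0] = f_seq;                           /* LUSTRE_RES_ID_SEQ_OFF */
            name[1] = ((uint64_t)f_ver << 32) | f_oid; /* LUSTRE_RES_ID_VER_OID_OFF */
            /* name[2] and name[3] stay free for the quota identifier */

            printf("res name: [%#llx, %#llx, 0, 0]\n",
                   (unsigned long long)name[0],
                   (unsigned long long)name[1]);
            return 0;
    }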
+
+/*
+ * Build (DLM) resource identifier from global quota FID and quota ID.
+ */
+static inline struct ldlm_res_id *
+fid_build_quota_resid(const struct lu_fid *glb_fid, union lquota_id *qid,
+ struct ldlm_res_id *res)
+{
+ fid_build_reg_res_name(glb_fid, res);
+ res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF] = fid_seq(&qid->qid_fid);
+ res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] = fid_ver_oid(&qid->qid_fid);
+ return res;
+}
+
+/*
+ * Extract global FID and quota ID from resource name
+ */
+static inline void fid_extract_quota_resid(struct ldlm_res_id *res,
+ struct lu_fid *glb_fid,
+ union lquota_id *qid)
+{
+ glb_fid->f_seq = res->name[LUSTRE_RES_ID_SEQ_OFF];
+ glb_fid->f_oid = (__u32)res->name[LUSTRE_RES_ID_VER_OID_OFF];
+ glb_fid->f_ver = (__u32)(res->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32);
+
+ qid->qid_fid.f_seq = res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF];
+ qid->qid_fid.f_oid = (__u32)res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF];
+ qid->qid_fid.f_ver =
+ (__u32)(res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] >> 32);
+}
+
+/*
+ * Return true if resource is for object identified by fid.
+ */
+static inline int fid_res_name_eq(const struct lu_fid *f,
+ const struct ldlm_res_id *name)
+{
+ return name->name[LUSTRE_RES_ID_SEQ_OFF] == fid_seq(f) &&
+ name->name[LUSTRE_RES_ID_VER_OID_OFF] == fid_ver_oid(f);
+}
+
+/* reverse function of fid_build_reg_res_name() */
+static inline void fid_build_from_res_name(struct lu_fid *f,
+ const struct ldlm_res_id *name)
+{
+ fid_zero(f);
+ f->f_seq = name->name[LUSTRE_RES_ID_SEQ_OFF];
+ f->f_oid = name->name[LUSTRE_RES_ID_VER_OID_OFF] & 0xffffffff;
+ f->f_ver = name->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32;
+ LASSERT(fid_res_name_eq(f, name));
+}
+
+static inline struct ldlm_res_id *
+fid_build_pdo_res_name(const struct lu_fid *f,
+ unsigned int hash,
+ struct ldlm_res_id *name)
+{
+ fid_build_reg_res_name(f, name);
+ name->name[LUSTRE_RES_ID_HSH_OFF] = hash;
+ return name;
+}
+
+/**
+ * Build DLM resource name from object id & seq. This will finally be
+ * removed once ost_id is replaced with FID in the data stack.
+ *
+ * Currently, a resid from an old client, where res[0] = object_id and
+ * res[1] = object_seq, is just the opposite of the metadata resid,
+ * where res[0] = fid->f_seq and res[1] = fid->f_oid.
+ * To unify resid identification, we reverse the data resid to match
+ * the metadata resid, i.e.:
+ *
+ * For a resid from an old client,
+ * res[0] = objid and res[1] = 0; the original order is kept for
+ * compatibility.
+ *
+ * For a new resid,
+ * res is built from the normal FID directly, i.e. res[0] = f_seq and
+ * res[1] = f_oid + f_ver.
+ */
+static inline void ostid_build_res_name(struct ost_id *oi,
+ struct ldlm_res_id *name)
+{
+ memset(name, 0, sizeof(*name));
+ if (fid_seq_is_mdt0(ostid_seq(oi))) {
+ name->name[LUSTRE_RES_ID_SEQ_OFF] = ostid_id(oi);
+ name->name[LUSTRE_RES_ID_VER_OID_OFF] = ostid_seq(oi);
+ } else {
+ fid_build_reg_res_name((struct lu_fid *)oi, name);
+ }
+}
+
+static inline void ostid_res_name_to_id(struct ost_id *oi,
+ struct ldlm_res_id *name)
+{
+ if (fid_seq_is_mdt0(name->name[LUSTRE_RES_ID_SEQ_OFF])) {
+ /* old resid */
+ ostid_set_seq(oi, name->name[LUSTRE_RES_ID_VER_OID_OFF]);
+ ostid_set_id(oi, name->name[LUSTRE_RES_ID_SEQ_OFF]);
+ } else {
+ /* new resid */
+ fid_build_from_res_name((struct lu_fid *)oi, name);
+ }
+}
+
+/**
+ * Return true if the resource is for the object identified by this id & group.
+ */
+static inline int ostid_res_name_eq(struct ost_id *oi,
+ struct ldlm_res_id *name)
+{
+ /* Note: this is just a shortcut to save some effort; the correct
+ * way would probably be to convert both into FIDs and compare */
+ if (fid_seq_is_mdt0(ostid_seq(oi))) {
+ return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_id(oi) &&
+ name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_seq(oi);
+ } else {
+ return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_seq(oi) &&
+ name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_id(oi);
+ }
+}
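A user-space sketch of the two resid layouts described above (not part of the patch; it assumes fid_seq_is_mdt0() is true exactly for a zero sequence, and the id/seq values are made up):

    #include <stdio.h>
    #include <stdint.h>

    static void build_res(uint64_t seq, uint64_t id, uint64_t name[4])
    {
            name[0] = name[1] = name[2] = name[3] = 0;
            if (seq == 0) {          /* old (MDT0) client: reversed order */
                    name[0] = id;    /* LUSTRE_RES_ID_SEQ_OFF */
                    name[1] = seq;   /* LUSTRE_RES_ID_VER_OID_OFF */
            } else {                 /* new client: FID order */
                    name[0] = seq;
                    name[1] = id;    /* f_oid (f_ver in the high 32 bits) */
            }
    }

    int main(void)
    {
            uint64_t name[4];

            build_res(0, 0x1234, name);             /* legacy object */
            printf("old: [%#llx, %#llx]\n",
                   (unsigned long long)name[0], (unsigned long long)name[1]);

            build_res(0x200000401ULL, 0x2a, name);  /* FID-based object */
            printf("new: [%#llx, %#llx]\n",
                   (unsigned long long)name[0], (unsigned long long)name[1]);
            return 0;
    }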
+
+/* The same as osc_build_res_name() */
+static inline void ost_fid_build_resid(const struct lu_fid *fid,
+ struct ldlm_res_id *resname)
+{
+ if (fid_is_mdt0(fid) || fid_is_idif(fid)) {
+ struct ost_id oi;
+ oi.oi.oi_id = 0; /* gcc 4.7.2 complains otherwise */
+ if (fid_to_ostid(fid, &oi) != 0)
+ return;
+ ostid_build_res_name(&oi, resname);
+ } else {
+ fid_build_reg_res_name(fid, resname);
+ }
+}
+
+static inline void ost_fid_from_resid(struct lu_fid *fid,
+ const struct ldlm_res_id *name)
+{
+ if (fid_seq_is_mdt0(name->name[LUSTRE_RES_ID_VER_OID_OFF])) {
+ /* old resid */
+ struct ost_id oi;
+ ostid_set_seq(&oi, name->name[LUSTRE_RES_ID_VER_OID_OFF]);
+ ostid_set_id(&oi, name->name[LUSTRE_RES_ID_SEQ_OFF]);
+ ostid_to_fid(fid, &oi, 0);
+ } else {
+ /* new resid */
+ fid_build_from_res_name(fid, name);
+ }
+}
+
+/**
+ * Flatten 128-bit FID values into a 64-bit value for use as an inode number.
+ * For non-IGIF FIDs this starts just over 2^32, and continues without
+ * conflict until 2^64, at which point we wrap the high 24 bits of the SEQ
+ * into the range where there may not be many OID values in use, to minimize
+ * the risk of conflict.
+ *
+ * Assuming LUSTRE_SEQ_MAX_WIDTH is less than (1 << 24), which is currently
+ * true, the time between re-used inode numbers is very long - 2^40 SEQ
+ * numbers, or about 2^40 client mounts, if clients create fewer than 2^24
+ * files per mount.
+ */
+static inline __u64 fid_flatten(const struct lu_fid *fid)
+{
+ __u64 ino;
+ __u64 seq;
+
+ if (fid_is_igif(fid)) {
+ ino = lu_igif_ino(fid);
+ RETURN(ino);
+ }
+
+ seq = fid_seq(fid);
+
+ ino = (seq << 24) + ((seq >> 24) & 0xffffff0000ULL) + fid_oid(fid);
+
+ RETURN(ino ? ino : fid_oid(fid));
+}
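A standalone sketch of the non-IGIF flattening arithmetic (not part of the patch; the FID is hypothetical and the expression simply mirrors fid_flatten() above):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t seq = 0x200000401ULL;  /* stand-in for fid_seq() */
            uint32_t oid = 1;               /* stand-in for fid_oid() */
            uint64_t ino;

            /* same expression as fid_flatten() uses for non-IGIF FIDs */
            ino = (seq << 24) + ((seq >> 24) & 0xffffff0000ULL) + oid;

            printf("flattened ino: %#llx\n",
                   (unsigned long long)(ino ? ino : oid));
            return 0;
    }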
+
+static inline __u32 fid_hash(const struct lu_fid *f, int bits)
+{
+ /* all objects with the same id and different versions will belong to
+ * the same collision list. */
+ return cfs_hash_long(fid_flatten(f), bits);
+}
+
+/**
+ * Map a FID to a 32-bit value for use as an inode number on 32-bit systems.
+ */
+static inline __u32 fid_flatten32(const struct lu_fid *fid)
+{
+ __u32 ino;
+ __u64 seq;
+
+ if (fid_is_igif(fid)) {
+ ino = lu_igif_ino(fid);
+ RETURN(ino);
+ }
+
+ seq = fid_seq(fid) - FID_SEQ_START;
+
+ /* Map the high bits of the OID into higher bits of the inode number so
+ * that inodes generated at about the same time have a reduced chance
+ * of collisions. This will give a period of 2^12 = 4096 unique clients
+ * (from SEQ) and up to min(LUSTRE_SEQ_MAX_WIDTH, 2^20) = 128k objects
+ * (from OID), or up to 128M inodes without collisions for new files. */
+ ino = ((seq & 0x000fffffULL) << 12) + ((seq >> 8) & 0xfffff000) +
+ (seq >> (64 - (40-8)) & 0xffffff00) +
+ (fid_oid(fid) & 0xff000fff) + ((fid_oid(fid) & 0x00fff000) << 8);
+
+ RETURN(ino ? ino : fid_oid(fid));
+}
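The same kind of sketch for the 32-bit variant, assuming FID_SEQ_START is 0x200000000ULL (its usual value in lustre_idl.h); the expression mirrors fid_flatten32() above:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            /* stand-in for fid_seq() - FID_SEQ_START */
            uint64_t seq = 0x200000401ULL - 0x200000000ULL;
            uint32_t oid = 1;               /* stand-in for fid_oid() */
            uint32_t ino;

            /* same expression as fid_flatten32() uses for non-IGIF FIDs */
            ino = ((seq & 0x000fffffULL) << 12) + ((seq >> 8) & 0xfffff000) +
                  ((seq >> 32) & 0xffffff00) +
                  (oid & 0xff000fff) + ((oid & 0x00fff000) << 8);

            printf("flattened 32-bit ino: %#x\n", ino ? ino : oid);
            return 0;
    }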
+
+static inline int lu_fid_diff(struct lu_fid *fid1, struct lu_fid *fid2)
+{
+ LASSERTF(fid_seq(fid1) == fid_seq(fid2), "fid1:"DFID", fid2:"DFID"\n",
+ PFID(fid1), PFID(fid2));
+
+ if (fid_is_idif(fid1) && fid_is_idif(fid2))
+ return fid_idif_id(fid1->f_seq, fid1->f_oid, fid1->f_ver) -
+ fid_idif_id(fid2->f_seq, fid2->f_oid, fid2->f_ver);
+
+ return fid_oid(fid1) - fid_oid(fid2);
+}
+
+#define LUSTRE_SEQ_SRV_NAME "seq_srv"
+#define LUSTRE_SEQ_CTL_NAME "seq_ctl"
+
+/* Range common stuff */
+static inline void range_cpu_to_le(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+ dst->lsr_start = cpu_to_le64(src->lsr_start);
+ dst->lsr_end = cpu_to_le64(src->lsr_end);
+ dst->lsr_index = cpu_to_le32(src->lsr_index);
+ dst->lsr_flags = cpu_to_le32(src->lsr_flags);
+}
+
+static inline void range_le_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+ dst->lsr_start = le64_to_cpu(src->lsr_start);
+ dst->lsr_end = le64_to_cpu(src->lsr_end);
+ dst->lsr_index = le32_to_cpu(src->lsr_index);
+ dst->lsr_flags = le32_to_cpu(src->lsr_flags);
+}
+
+static inline void range_cpu_to_be(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+ dst->lsr_start = cpu_to_be64(src->lsr_start);
+ dst->lsr_end = cpu_to_be64(src->lsr_end);
+ dst->lsr_index = cpu_to_be32(src->lsr_index);
+ dst->lsr_flags = cpu_to_be32(src->lsr_flags);
+}
+
+static inline void range_be_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+ dst->lsr_start = be64_to_cpu(src->lsr_start);
+ dst->lsr_end = be64_to_cpu(src->lsr_end);
+ dst->lsr_index = be32_to_cpu(src->lsr_index);
+ dst->lsr_flags = be32_to_cpu(src->lsr_flags);
+}
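A brief kernel-context sketch (a hypothetical caller, not quoted from the patch) of how these helpers are meant to be used when a lu_seq_range crosses a byte-order boundary, e.g. persisting the server's sequence space; 'seq' here stands for a struct lu_server_seq pointer:

    struct lu_seq_range disk_range;

    /* convert to a fixed on-disk byte order before writing ... */
    range_cpu_to_le(&disk_range, &seq->lss_space);
    /* ... write disk_range to the backing store behind lss_dev ... */

    /* ... and undo it after reading the record back */
    range_le_to_cpu(&seq->lss_space, &disk_range);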
+
+/** @} fid */
+
+#endif /* __LINUX_FID_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_fld.h b/drivers/staging/lustre/lustre/include/lustre_fld.h
new file mode 100644
index 000000000000..11e034a65b17
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_fld.h
@@ -0,0 +1,202 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_FLD_H
+#define __LINUX_FLD_H
+
+/** \defgroup fld fld
+ *
+ * @{
+ */
+
+#include <lustre/lustre_idl.h>
+#include <lustre_mdt.h>
+#include <dt_object.h>
+
+#include <linux/libcfs/libcfs.h>
+
+struct lu_client_fld;
+struct lu_server_fld;
+struct lu_fld_hash;
+struct fld_cache;
+
+extern const struct dt_index_features fld_index_features;
+extern const char fld_index_name[];
+
+/*
+ * FLD (Fid Location Database) interface.
+ */
+enum {
+ LUSTRE_CLI_FLD_HASH_DHT = 0,
+ LUSTRE_CLI_FLD_HASH_RRB
+};
+
+
+struct lu_fld_target {
+ struct list_head ft_chain;
+ struct obd_export *ft_exp;
+ struct lu_server_fld *ft_srv;
+ __u64 ft_idx;
+};
+
+struct lu_server_fld {
+ /**
+ * Fld dir proc entry. */
+ proc_dir_entry_t *lsf_proc_dir;
+
+ /**
+ * /fld file object device */
+ struct dt_object *lsf_obj;
+
+ /**
+ * Super sequence controller export, needed to forward fld
+ * lookup requests. */
+ struct obd_export *lsf_control_exp;
+
+ /**
+ * Client FLD cache. */
+ struct fld_cache *lsf_cache;
+
+ /**
+ * Protect index modifications */
+ struct mutex lsf_lock;
+
+ /**
+ * Fld service name in form "fld-srv-lustre-MDTXXX" */
+ char lsf_name[80];
+
+};
+
+struct lu_client_fld {
+ /**
+ * Client side proc entry. */
+ proc_dir_entry_t *lcf_proc_dir;
+
+ /**
+ * List of exports the client FLD knows about. */
+ struct list_head lcf_targets;
+
+ /**
+ * Current hash to be used to choose an export. */
+ struct lu_fld_hash *lcf_hash;
+
+ /**
+ * Exports count. */
+ int lcf_count;
+
+ /**
+ * Lock protecting exports list and fld_hash. */
+ spinlock_t lcf_lock;
+
+ /**
+ * Client FLD cache. */
+ struct fld_cache *lcf_cache;
+
+ /**
+ * Client fld proc entry name. */
+ char lcf_name[80];
+
+ const struct lu_context *lcf_ctx;
+
+ int lcf_flags;
+};
+
+/**
+ * Number of blocks to reserve for particular operations. Should be a
+ * function of ... something. Stub for now.
+ */
+enum {
+ /* one insert operation can involve two deletes and one insert */
+ FLD_TXN_INDEX_INSERT_CREDITS = 60,
+ FLD_TXN_INDEX_DELETE_CREDITS = 20,
+};
+
+int fld_query(struct com_thread_info *info);
+
+/* Server methods */
+int fld_server_init(const struct lu_env *env, struct lu_server_fld *fld,
+ struct dt_device *dt, const char *prefix, int mds_node_id,
+ int type);
+
+void fld_server_fini(const struct lu_env *env, struct lu_server_fld *fld);
+
+int fld_declare_server_create(const struct lu_env *env,
+ struct lu_server_fld *fld,
+ struct lu_seq_range *new,
+ struct thandle *th);
+
+int fld_server_create(const struct lu_env *env,
+ struct lu_server_fld *fld,
+ struct lu_seq_range *add_range,
+ struct thandle *th);
+
+int fld_insert_entry(const struct lu_env *env,
+ struct lu_server_fld *fld,
+ const struct lu_seq_range *range);
+
+int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld,
+ seqno_t seq, struct lu_seq_range *range);
+
+/* Client methods */
+int fld_client_init(struct lu_client_fld *fld,
+ const char *prefix, int hash);
+
+void fld_client_fini(struct lu_client_fld *fld);
+
+void fld_client_flush(struct lu_client_fld *fld);
+
+int fld_client_lookup(struct lu_client_fld *fld, seqno_t seq, mdsno_t *mds,
+ __u32 flags, const struct lu_env *env);
+
+int fld_client_create(struct lu_client_fld *fld,
+ struct lu_seq_range *range,
+ const struct lu_env *env);
+
+int fld_client_delete(struct lu_client_fld *fld,
+ seqno_t seq,
+ const struct lu_env *env);
+
+int fld_client_add_target(struct lu_client_fld *fld,
+ struct lu_fld_target *tar);
+
+int fld_client_del_target(struct lu_client_fld *fld,
+ __u64 idx);
+
+void fld_client_proc_fini(struct lu_client_fld *fld);
+
+/** @} fld */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_fsfilt.h b/drivers/staging/lustre/lustre/include/lustre_fsfilt.h
new file mode 100644
index 000000000000..9dcc332cb2f3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_fsfilt.h
@@ -0,0 +1,48 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_fsfilt.h
+ *
+ * Filesystem interface helper.
+ */
+
+#ifndef _LUSTRE_FSFILT_H
+#define _LUSTRE_FSFILT_H
+
+#include <linux/lustre_fsfilt.h>
+
+#define LU221_BAD_TIME (0x80000000U + 24 * 3600)
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_ha.h b/drivers/staging/lustre/lustre/include/lustre_ha.h
new file mode 100644
index 000000000000..105f6d61eef0
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_ha.h
@@ -0,0 +1,67 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_HA_H
+#define _LUSTRE_HA_H
+
+/** \defgroup ha ha
+ *
+ * @{
+ */
+
+struct obd_import;
+struct obd_export;
+struct obd_device;
+struct ptlrpc_request;
+
+
+int ptlrpc_replay(struct obd_import *imp);
+int ptlrpc_resend(struct obd_import *imp);
+void ptlrpc_free_committed(struct obd_import *imp);
+void ptlrpc_wake_delayed(struct obd_import *imp);
+int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async);
+int ptlrpc_set_import_active(struct obd_import *imp, int active);
+void ptlrpc_activate_import(struct obd_import *imp);
+void ptlrpc_deactivate_import(struct obd_import *imp);
+void ptlrpc_invalidate_import(struct obd_import *imp);
+void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt);
+int ptlrpc_check_suspend(void);
+void ptlrpc_activate_timeouts(struct obd_import *imp);
+void ptlrpc_deactivate_timeouts(struct obd_import *imp);
+
+/** @} ha */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_handles.h b/drivers/staging/lustre/lustre/include/lustre_handles.h
new file mode 100644
index 000000000000..fcd40f33426a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_handles.h
@@ -0,0 +1,93 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LUSTRE_HANDLES_H_
+#define __LUSTRE_HANDLES_H_
+
+/** \defgroup handles handles
+ *
+ * @{
+ */
+
+#include <linux/lustre_handles.h>
+
+#include <linux/libcfs/libcfs.h>
+
+
+struct portals_handle_ops {
+ void (*hop_addref)(void *object);
+ void (*hop_free)(void *object, int size);
+};
+
+/* These handles are most easily used by having them appear at the very top of
+ * whatever object you want to make handles for, e.g.:
+ *
+ * struct ldlm_lock {
+ * struct portals_handle handle;
+ * ...
+ * };
+ *
+ * Now you're able to assign the result of class_handle2object() directly
+ * to an ldlm_lock. If it's not at the top, you'll want to use container_of()
+ * to compute the start of the structure based on the handle field. */
+struct portals_handle {
+ struct list_head h_link;
+ __u64 h_cookie;
+ struct portals_handle_ops *h_ops;
+
+ /* newly added fields to handle the RCU issue. -jxiong */
+ cfs_rcu_head_t h_rcu;
+ spinlock_t h_lock;
+ unsigned int h_size:31;
+ unsigned int h_in:1;
+};
+#define RCU2HANDLE(rcu) container_of(rcu, struct portals_handle, h_rcu)
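A short sketch of the two lookup styles the comment above describes, using a hypothetical my_object type (not part of the patch): a direct cast works when the handle is the first member; container_of() is needed otherwise:

    struct my_object {
            int                     mo_data;
            struct portals_handle   mo_handle;  /* not the first member */
    };

    /* handle first (as in ldlm_lock): use the lookup result directly:
     *   struct ldlm_lock *lock = class_handle2object(cookie);
     * handle elsewhere: recover the enclosing object explicitly */
    static inline struct my_object *handle2my_object(struct portals_handle *h)
    {
            return container_of(h, struct my_object, mo_handle);
    }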
+
+/* handles.c */
+
+/* Add a handle to the hash table */
+void class_handle_hash(struct portals_handle *,
+ struct portals_handle_ops *ops);
+void class_handle_unhash(struct portals_handle *);
+void class_handle_hash_back(struct portals_handle *);
+void *class_handle2object(__u64 cookie);
+void class_handle_free_cb(cfs_rcu_head_t *);
+int class_handle_init(void);
+void class_handle_cleanup(void);
+
+/** @} handles */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_idmap.h b/drivers/staging/lustre/lustre/include/lustre_idmap.h
new file mode 100644
index 000000000000..084bdd6ab4db
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_idmap.h
@@ -0,0 +1,104 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lustre/include/lustre_idmap.h
+ *
+ * MDS data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_IDMAP_H
+#define _LUSTRE_IDMAP_H
+
+/** \defgroup idmap idmap
+ *
+ * @{
+ */
+
+#include <linux/libcfs/libcfs.h>
+
+#define CFS_NGROUPS_PER_BLOCK ((int)(PAGE_CACHE_SIZE / sizeof(gid_t)))
+
+#define CFS_GROUP_AT(gi, i) \
+ ((gi)->blocks[(i) / CFS_NGROUPS_PER_BLOCK][(i) % CFS_NGROUPS_PER_BLOCK])
+
+enum {
+ CFS_IC_NOTHING = 0, /* convert nothing */
+ CFS_IC_ALL = 1, /* convert all items */
+ CFS_IC_MAPPED = 2, /* convert mapped uid/gid */
+ CFS_IC_UNMAPPED = 3 /* convert unmapped uid/gid */
+};
+
+#define CFS_IDMAP_NOTFOUND (-1)
+
+#define CFS_IDMAP_HASHSIZE 32
+
+enum lustre_idmap_idx {
+ RMT_UIDMAP_IDX,
+ LCL_UIDMAP_IDX,
+ RMT_GIDMAP_IDX,
+ LCL_GIDMAP_IDX,
+ CFS_IDMAP_N_HASHES
+};
+
+struct lustre_idmap_table {
+ spinlock_t lit_lock;
+ struct list_head lit_idmaps[CFS_IDMAP_N_HASHES][CFS_IDMAP_HASHSIZE];
+};
+
+struct lu_ucred;
+
+extern void lustre_groups_from_list(group_info_t *ginfo, gid_t *glist);
+extern void lustre_groups_sort(group_info_t *group_info);
+extern int lustre_in_group_p(struct lu_ucred *mu, gid_t grp);
+
+extern int lustre_idmap_add(struct lustre_idmap_table *t,
+ uid_t ruid, uid_t luid,
+ gid_t rgid, gid_t lgid);
+extern int lustre_idmap_del(struct lustre_idmap_table *t,
+ uid_t ruid, uid_t luid,
+ gid_t rgid, gid_t lgid);
+extern int lustre_idmap_lookup_uid(struct lu_ucred *mu,
+ struct lustre_idmap_table *t,
+ int reverse, uid_t uid);
+extern int lustre_idmap_lookup_gid(struct lu_ucred *mu,
+ struct lustre_idmap_table *t,
+ int reverse, gid_t gid);
+extern struct lustre_idmap_table *lustre_idmap_init(void);
+extern void lustre_idmap_fini(struct lustre_idmap_table *t);
+
+/** @} idmap */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_import.h b/drivers/staging/lustre/lustre/include/lustre_import.h
new file mode 100644
index 000000000000..3a5dd6a94c08
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_import.h
@@ -0,0 +1,367 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/** \defgroup obd_import PtlRPC import definitions
+ * An import is the client-side representation of a remote OBD target.
+ *
+ * @{
+ */
+
+#ifndef __IMPORT_H
+#define __IMPORT_H
+
+/** \defgroup export export
+ *
+ * @{
+ */
+
+#include <lustre_handles.h>
+#include <lustre/lustre_idl.h>
+
+
+/**
+ * Adaptive Timeout stuff
+ *
+ * @{
+ */
+#define D_ADAPTTO D_OTHER
+#define AT_BINS 4 /* "bin" means "N seconds of history" */
+#define AT_FLG_NOHIST 0x1 /* use last reported value only */
+
+struct adaptive_timeout {
+ time_t at_binstart; /* bin start time */
+ unsigned int at_hist[AT_BINS]; /* timeout history bins */
+ unsigned int at_flags;
+ unsigned int at_current; /* current timeout value */
+ unsigned int at_worst_ever; /* worst-ever timeout value */
+ time_t at_worst_time; /* worst-ever timeout timestamp */
+ spinlock_t at_lock;
+};
+
+struct ptlrpc_at_array {
+ struct list_head *paa_reqs_array; /** array to hold requests */
+ __u32 paa_size; /** the size of array */
+ __u32 paa_count; /** the total count of reqs */
+ time_t paa_deadline; /** the earliest deadline of reqs */
+ __u32 *paa_reqs_count; /** the count of reqs in each entry */
+};
+
+#define IMP_AT_MAX_PORTALS 8
+struct imp_at {
+ int iat_portal[IMP_AT_MAX_PORTALS];
+ struct adaptive_timeout iat_net_latency;
+ struct adaptive_timeout iat_service_estimate[IMP_AT_MAX_PORTALS];
+};
+
+
+/** @} */
+
+/** Possible import states */
+enum lustre_imp_state {
+ LUSTRE_IMP_CLOSED = 1,
+ LUSTRE_IMP_NEW = 2,
+ LUSTRE_IMP_DISCON = 3,
+ LUSTRE_IMP_CONNECTING = 4,
+ LUSTRE_IMP_REPLAY = 5,
+ LUSTRE_IMP_REPLAY_LOCKS = 6,
+ LUSTRE_IMP_REPLAY_WAIT = 7,
+ LUSTRE_IMP_RECOVER = 8,
+ LUSTRE_IMP_FULL = 9,
+ LUSTRE_IMP_EVICTED = 10,
+};
+
+/** Returns the text string representation of numeric import state \a state */
+static inline char *ptlrpc_import_state_name(enum lustre_imp_state state)
+{
+ static char *import_state_names[] = {
+ "<UNKNOWN>", "CLOSED", "NEW", "DISCONN",
+ "CONNECTING", "REPLAY", "REPLAY_LOCKS", "REPLAY_WAIT",
+ "RECOVER", "FULL", "EVICTED",
+ };
+
+ LASSERT(state <= LUSTRE_IMP_EVICTED);
+ return import_state_names[state];
+}
+
+/**
+ * List of import event types
+ */
+enum obd_import_event {
+ IMP_EVENT_DISCON = 0x808001,
+ IMP_EVENT_INACTIVE = 0x808002,
+ IMP_EVENT_INVALIDATE = 0x808003,
+ IMP_EVENT_ACTIVE = 0x808004,
+ IMP_EVENT_OCD = 0x808005,
+ IMP_EVENT_DEACTIVATE = 0x808006,
+ IMP_EVENT_ACTIVATE = 0x808007,
+};
+
+/**
+ * Definition of import connection structure
+ */
+struct obd_import_conn {
+ /** Item for linking connections together */
+ struct list_head oic_item;
+ /** Pointer to actual PortalRPC connection */
+ struct ptlrpc_connection *oic_conn;
+ /** uuid of remote side */
+ struct obd_uuid oic_uuid;
+ /**
+ * Time (64 bit jiffies) of last connection attempt on this connection
+ */
+ __u64 oic_last_attempt;
+};
+
+/* state history */
+#define IMP_STATE_HIST_LEN 16
+struct import_state_hist {
+ enum lustre_imp_state ish_state;
+ time_t ish_time;
+};
+
+/**
+ * Definition of the PortalRPC import structure.
+ * An import represents the client-side view of a remote target.
+ */
+struct obd_import {
+ /** Local handle (== id) for this import. */
+ struct portals_handle imp_handle;
+ /** Reference counter */
+ atomic_t imp_refcount;
+ struct lustre_handle imp_dlm_handle; /* client's ldlm export */
+ /** Currently active connection */
+ struct ptlrpc_connection *imp_connection;
+ /** PortalRPC client structure for this import */
+ struct ptlrpc_client *imp_client;
+ /** List element for linking into pinger chain */
+ struct list_head imp_pinger_chain;
+ /** List element for linking into chain for destruction */
+ struct list_head imp_zombie_chain;
+
+ /**
+ * Lists of requests that are retained for replay, waiting for a reply,
+ * or waiting for recovery to complete, respectively.
+ * @{
+ */
+ struct list_head imp_replay_list;
+ struct list_head imp_sending_list;
+ struct list_head imp_delayed_list;
+ /** @} */
+
+ /** obd device for this import */
+ struct obd_device *imp_obd;
+
+ /**
+ * some security-related fields
+ * @{
+ */
+ struct ptlrpc_sec *imp_sec;
+ struct mutex imp_sec_mutex;
+ cfs_time_t imp_sec_expire;
+ /** @} */
+
+ /** Wait queue for those who need to wait for recovery completion */
+ wait_queue_head_t imp_recovery_waitq;
+
+ /** Number of requests currently in-flight */
+ atomic_t imp_inflight;
+ /** Number of requests currently unregistering */
+ atomic_t imp_unregistering;
+ /** Number of replay requests inflight */
+ atomic_t imp_replay_inflight;
+ /** Number of currently happening import invalidations */
+ atomic_t imp_inval_count;
+ /** Number of request timeouts */
+ atomic_t imp_timeouts;
+ /** Current import state */
+ enum lustre_imp_state imp_state;
+ /** History of import states */
+ struct import_state_hist imp_state_hist[IMP_STATE_HIST_LEN];
+ int imp_state_hist_idx;
+ /** Current import generation. Incremented on every reconnect */
+ int imp_generation;
+ /** Incremented every time we send reconnection request */
+ __u32 imp_conn_cnt;
+ /**
+ * \see ptlrpc_free_committed remembers imp_generation value here
+ * after a check to save on unnecessary replay list iterations
+ */
+ int imp_last_generation_checked;
+ /** Last transno we replayed */
+ __u64 imp_last_replay_transno;
+ /** Last transno committed on remote side */
+ __u64 imp_peer_committed_transno;
+ /**
+ * \see ptlrpc_free_committed remembers last_transno since its last
+ * check here and if last_transno did not change since last run of
+ * ptlrpc_free_committed and import generation is the same, we can
+ * skip looking for requests to remove from replay list as optimisation
+ */
+ __u64 imp_last_transno_checked;
+ /**
+ * Remote export handle. This is how remote side knows what export
+ * we are talking to. Filled from response to connect request
+ */
+ struct lustre_handle imp_remote_handle;
+ /** When to perform the next ping, in jiffies. */
+ cfs_time_t imp_next_ping;
+ /** When we last successfully connected, in 64-bit jiffies */
+ __u64 imp_last_success_conn;
+
+ /** List of all possible connection for import. */
+ struct list_head imp_conn_list;
+ /**
+ * Current connection. \a imp_connection is imp_conn_current->oic_conn
+ */
+ struct obd_import_conn *imp_conn_current;
+
+ /** Protects flags, level, generation, conn_cnt, *_list */
+ spinlock_t imp_lock;
+
+ /* flags */
+ unsigned long imp_no_timeout:1, /* timeouts are disabled */
+ imp_invalid:1, /* evicted */
+ /* administratively disabled */
+ imp_deactive:1,
+ /* try to recover the import */
+ imp_replayable:1,
+ /* don't run recovery (timeout instead) */
+ imp_dlm_fake:1,
+ /* use 1/2 timeout on MDS' OSCs */
+ imp_server_timeout:1,
+ /* VBR: imp in delayed recovery */
+ imp_delayed_recovery:1,
+ /* VBR: if a gap was found then no lock replays */
+ imp_no_lock_replay:1,
+ /* recovery by versions was failed */
+ imp_vbr_failed:1,
+ /* force an immediate ping */
+ imp_force_verify:1,
+ /* force a scheduled ping */
+ imp_force_next_verify:1,
+ /* pingable */
+ imp_pingable:1,
+ /* resend for replay */
+ imp_resend_replay:1,
+ /* disable normal recovery, for test only. */
+ imp_no_pinger_recover:1,
+ /* need IR MNE swab */
+ imp_need_mne_swab:1,
+ /* import must be reconnected instead of
+ * choosing a new connection */
+ imp_force_reconnect:1,
+ /* import has tried to connect with server */
+ imp_connect_tried:1;
+ __u32 imp_connect_op;
+ struct obd_connect_data imp_connect_data;
+ __u64 imp_connect_flags_orig;
+ int imp_connect_error;
+
+ __u32 imp_msg_magic;
+ __u32 imp_msghdr_flags; /* adjusted based on server capability */
+
+ struct ptlrpc_request_pool *imp_rq_pool; /* emergency request pool */
+
+ struct imp_at imp_at; /* adaptive timeout data */
+ time_t imp_last_reply_time; /* for health check */
+};
+
+typedef void (*obd_import_callback)(struct obd_import *imp, void *closure,
+ int event, void *event_arg, void *cb_data);
+
+/**
+ * Structure for import observer.
+ * It is possible to register an "observer" on an import; every time
+ * something happens to the import (like connect/evict/disconnect),
+ * the observer will get its callback called with the event type.
+ */
+struct obd_import_observer {
+ struct list_head oio_chain;
+ obd_import_callback oio_cb;
+ void *oio_cb_data;
+};
+
+void class_observe_import(struct obd_import *imp, obd_import_callback cb,
+ void *cb_data);
+void class_unobserve_import(struct obd_import *imp, obd_import_callback cb,
+ void *cb_data);
+void class_notify_import_observers(struct obd_import *imp, int event,
+ void *event_arg);
+
+/* import.c */
+static inline unsigned int at_est2timeout(unsigned int val)
+{
+ /* add an arbitrary minimum: 125% + 5 sec */
+ return (val + (val >> 2) + 5);
+}
+
+static inline unsigned int at_timeout2est(unsigned int val)
+{
+ /* restore estimate value from timeout: e=4/5(t-5) */
+ LASSERT(val);
+ return (max((val << 2) / 5, 5U) - 4);
+}
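A standalone sketch (not part of the patch) checking that the two helpers above round-trip: at_est2timeout() pads an estimate to 125% + 5s, and at_timeout2est() inverts that via e = 4/5 * (t - 5):

    #include <stdio.h>

    static unsigned int est2timeout(unsigned int val)
    {
            return val + (val >> 2) + 5;            /* 125% + 5 sec */
    }

    static unsigned int timeout2est(unsigned int val)
    {
            unsigned int e = (val << 2) / 5;

            return (e > 5U ? e : 5U) - 4;           /* e = 4/5(t - 5) */
    }

    int main(void)
    {
            unsigned int est = 100;
            unsigned int t = est2timeout(est);      /* 100 -> 130 */

            printf("est %us -> timeout %us -> est %us\n",
                   est, t, timeout2est(t));         /* back to 100 */
            return 0;
    }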
+
+static inline void at_reset(struct adaptive_timeout *at, int val) {
+ at->at_current = val;
+ at->at_worst_ever = val;
+ at->at_worst_time = cfs_time_current_sec();
+}
+static inline void at_init(struct adaptive_timeout *at, int val, int flags) {
+ memset(at, 0, sizeof(*at));
+ spin_lock_init(&at->at_lock);
+ at->at_flags = flags;
+ at_reset(at, val);
+}
+extern unsigned int at_min;
+static inline int at_get(struct adaptive_timeout *at) {
+ return (at->at_current > at_min) ? at->at_current : at_min;
+}
+int at_measured(struct adaptive_timeout *at, unsigned int val);
+int import_at_get_index(struct obd_import *imp, int portal);
+extern unsigned int at_max;
+#define AT_OFF (at_max == 0)
+
+/* genops.c */
+struct obd_export;
+extern struct obd_import *class_exp2cliimp(struct obd_export *);
+extern struct obd_import *class_conn2cliimp(struct lustre_handle *);
+
+/** @} import */
+
+#endif /* __IMPORT_H */
+
+/** @} obd_import */
diff --git a/drivers/staging/lustre/lustre/include/lustre_lib.h b/drivers/staging/lustre/lustre/include/lustre_lib.h
new file mode 100644
index 000000000000..bdfc5391c6d2
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_lib.h
@@ -0,0 +1,667 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_lib.h
+ *
+ * Basic Lustre library routines.
+ */
+
+#ifndef _LUSTRE_LIB_H
+#define _LUSTRE_LIB_H
+
+/** \defgroup lib lib
+ *
+ * @{
+ */
+
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_ver.h>
+#include <lustre_cfg.h>
+#include <linux/lustre_lib.h>
+
+/* target.c */
+struct ptlrpc_request;
+struct obd_export;
+struct lu_target;
+struct l_wait_info;
+#include <lustre_ha.h>
+#include <lustre_net.h>
+#include <lvfs.h>
+
+
+int target_pack_pool_reply(struct ptlrpc_request *req);
+int do_set_info_async(struct obd_import *imp,
+ int opcode, int version,
+ obd_count keylen, void *key,
+ obd_count vallen, void *val,
+ struct ptlrpc_request_set *set);
+
+#define OBD_RECOVERY_MAX_TIME (obd_timeout * 18) /* b13079 */
+#define OBD_MAX_IOCTL_BUFFER CONFIG_LUSTRE_OBD_MAX_IOCTL_BUFFER
+
+void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id);
+
+/* client.c */
+
+int client_sanobd_setup(struct obd_device *obddev, struct lustre_cfg* lcfg);
+struct client_obd *client_conn2cli(struct lustre_handle *conn);
+
+struct md_open_data;
+struct obd_client_handle {
+ struct lustre_handle och_fh;
+ struct lu_fid och_fid;
+ struct md_open_data *och_mod;
+ __u32 och_magic;
+ int och_flags;
+};
+#define OBD_CLIENT_HANDLE_MAGIC 0xd15ea5ed
+
+/* statfs_pack.c */
+void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs);
+void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs);
+
+/* l_lock.c */
+struct lustre_lock {
+ int l_depth;
+ task_t *l_owner;
+ struct semaphore l_sem;
+ spinlock_t l_spin;
+};
+
+void l_lock_init(struct lustre_lock *);
+void l_lock(struct lustre_lock *);
+void l_unlock(struct lustre_lock *);
+int l_has_lock(struct lustre_lock *);
+
+/*
+ * For md echo client
+ */
+enum md_echo_cmd {
+ ECHO_MD_CREATE = 1, /* Open/Create file on MDT */
+ ECHO_MD_MKDIR = 2, /* Mkdir on MDT */
+ ECHO_MD_DESTROY = 3, /* Unlink file on MDT */
+ ECHO_MD_RMDIR = 4, /* Rmdir on MDT */
+ ECHO_MD_LOOKUP = 5, /* Lookup on MDT */
+ ECHO_MD_GETATTR = 6, /* Getattr on MDT */
+ ECHO_MD_SETATTR = 7, /* Setattr on MDT */
+ ECHO_MD_ALLOC_FID = 8, /* Get FIDs from MDT */
+};
+
+/*
+ * OBD IOCTLS
+ */
+#define OBD_IOCTL_VERSION 0x00010004
+
+struct obd_ioctl_data {
+ __u32 ioc_len;
+ __u32 ioc_version;
+
+ union {
+ __u64 ioc_cookie;
+ __u64 ioc_u64_1;
+ };
+ union {
+ __u32 ioc_conn1;
+ __u32 ioc_u32_1;
+ };
+ union {
+ __u32 ioc_conn2;
+ __u32 ioc_u32_2;
+ };
+
+ struct obdo ioc_obdo1;
+ struct obdo ioc_obdo2;
+
+ obd_size ioc_count;
+ obd_off ioc_offset;
+ __u32 ioc_dev;
+ __u32 ioc_command;
+
+ __u64 ioc_nid;
+ __u32 ioc_nal;
+ __u32 ioc_type;
+
+ /* buffers the kernel will treat as user pointers */
+ __u32 ioc_plen1;
+ char *ioc_pbuf1;
+ __u32 ioc_plen2;
+ char *ioc_pbuf2;
+
+ /* inline buffers for various arguments */
+ __u32 ioc_inllen1;
+ char *ioc_inlbuf1;
+ __u32 ioc_inllen2;
+ char *ioc_inlbuf2;
+ __u32 ioc_inllen3;
+ char *ioc_inlbuf3;
+ __u32 ioc_inllen4;
+ char *ioc_inlbuf4;
+
+ char ioc_bulk[0];
+};
+
+struct obd_ioctl_hdr {
+ __u32 ioc_len;
+ __u32 ioc_version;
+};
+
+static inline int obd_ioctl_packlen(struct obd_ioctl_data *data)
+{
+ int len = cfs_size_round(sizeof(struct obd_ioctl_data));
+ len += cfs_size_round(data->ioc_inllen1);
+ len += cfs_size_round(data->ioc_inllen2);
+ len += cfs_size_round(data->ioc_inllen3);
+ len += cfs_size_round(data->ioc_inllen4);
+ return len;
+}
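A standalone sketch of the packed-length computation, assuming cfs_size_round() rounds up to the next multiple of 8 bytes (its usual definition); the buffer lengths are made up:

    #include <stdio.h>

    /* assumed to match cfs_size_round(): round up to a multiple of 8 */
    static int size_round(int val)
    {
            return (val + 7) & ~0x7;
    }

    int main(void)
    {
            int base = 1024;        /* stand-in for sizeof(struct obd_ioctl_data) */
            int len = size_round(base);

            len += size_round(13);  /* ioc_inllen1: 13 -> 16 */
            len += size_round(0);   /* ioc_inllen2:  0 ->  0 */
            len += size_round(100); /* ioc_inllen3: 100 -> 104 */
            len += size_round(1);   /* ioc_inllen4:  1 ->  8 */

            printf("packed ioctl length: %d\n", len);
            return 0;
    }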
+
+
+static inline int obd_ioctl_is_invalid(struct obd_ioctl_data *data)
+{
+ if (data->ioc_len > (1<<30)) {
+ CERROR("OBD ioctl: ioc_len larger than 1<<30\n");
+ return 1;
+ }
+ if (data->ioc_inllen1 > (1<<30)) {
+ CERROR("OBD ioctl: ioc_inllen1 larger than 1<<30\n");
+ return 1;
+ }
+ if (data->ioc_inllen2 > (1<<30)) {
+ CERROR("OBD ioctl: ioc_inllen2 larger than 1<<30\n");
+ return 1;
+ }
+ if (data->ioc_inllen3 > (1<<30)) {
+ CERROR("OBD ioctl: ioc_inllen3 larger than 1<<30\n");
+ return 1;
+ }
+ if (data->ioc_inllen4 > (1<<30)) {
+ CERROR("OBD ioctl: ioc_inllen4 larger than 1<<30\n");
+ return 1;
+ }
+ if (data->ioc_inlbuf1 && !data->ioc_inllen1) {
+ CERROR("OBD ioctl: inlbuf1 pointer but 0 length\n");
+ return 1;
+ }
+ if (data->ioc_inlbuf2 && !data->ioc_inllen2) {
+ CERROR("OBD ioctl: inlbuf2 pointer but 0 length\n");
+ return 1;
+ }
+ if (data->ioc_inlbuf3 && !data->ioc_inllen3) {
+ CERROR("OBD ioctl: inlbuf3 pointer but 0 length\n");
+ return 1;
+ }
+ if (data->ioc_inlbuf4 && !data->ioc_inllen4) {
+ CERROR("OBD ioctl: inlbuf4 pointer but 0 length\n");
+ return 1;
+ }
+ if (data->ioc_pbuf1 && !data->ioc_plen1) {
+ CERROR("OBD ioctl: pbuf1 pointer but 0 length\n");
+ return 1;
+ }
+ if (data->ioc_pbuf2 && !data->ioc_plen2) {
+ CERROR("OBD ioctl: pbuf2 pointer but 0 length\n");
+ return 1;
+ }
+ if (data->ioc_plen1 && !data->ioc_pbuf1) {
+ CERROR("OBD ioctl: plen1 set but NULL pointer\n");
+ return 1;
+ }
+ if (data->ioc_plen2 && !data->ioc_pbuf2) {
+ CERROR("OBD ioctl: plen2 set but NULL pointer\n");
+ return 1;
+ }
+ if (obd_ioctl_packlen(data) > data->ioc_len) {
+ CERROR("OBD ioctl: packlen exceeds ioc_len (%d > %d)\n",
+ obd_ioctl_packlen(data), data->ioc_len);
+ return 1;
+ }
+ return 0;
+}
+
+
+#include <obd_support.h>
+
+/* function defined in lustre/obdclass/<platform>/<platform>-module.c */
+int obd_ioctl_getdata(char **buf, int *len, void *arg);
+int obd_ioctl_popdata(void *arg, void *data, int len);
+
+static inline void obd_ioctl_freedata(char *buf, int len)
+{
+ ENTRY;
+
+ OBD_FREE_LARGE(buf, len);
+ EXIT;
+ return;
+}
+
+/*
+ * BSD ioctl description:
+ * #define IOC_V1 _IOR(g, n1, long)
+ * #define IOC_V2 _IOW(g, n2, long)
+ *
+ * ioctl(f, IOC_V1, arg);
+ * arg will be treated as a long value,
+ *
+ * ioctl(f, IOC_V2, arg)
+ * arg will be treated as a pointer, bsd will call
+ * copyin(buf, arg, sizeof(long))
+ *
+ * To make the BSD ioctl handle arguments correctly and simply,
+ * we change _IOR to _IOWR so BSD will copyin obd_ioctl_data
+ * for us. Does this change affect Linux? (XXX Liang)
+ */
+#define OBD_IOC_CREATE _IOWR('f', 101, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_DESTROY _IOW ('f', 104, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PREALLOCATE _IOWR('f', 105, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_SETATTR _IOW ('f', 107, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_GETATTR _IOWR ('f', 108, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_READ _IOWR('f', 109, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_WRITE _IOWR('f', 110, OBD_IOC_DATA_TYPE)
+
+
+#define OBD_IOC_STATFS _IOWR('f', 113, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_SYNC _IOW ('f', 114, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_READ2 _IOWR('f', 115, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_FORMAT _IOWR('f', 116, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PARTITION _IOWR('f', 117, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_COPY _IOWR('f', 120, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_MIGR _IOWR('f', 121, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PUNCH _IOWR('f', 122, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_MODULE_DEBUG _IOWR('f', 124, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_BRW_READ _IOWR('f', 125, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_BRW_WRITE _IOWR('f', 126, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_NAME2DEV _IOWR('f', 127, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_UUID2DEV _IOWR('f', 130, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_GETNAME _IOWR('f', 131, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_GETMDNAME _IOR('f', 131, char[MAX_OBD_NAME])
+#define OBD_IOC_GETDTNAME OBD_IOC_GETNAME
+
+#define OBD_IOC_LOV_GET_CONFIG _IOWR('f', 132, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_CLIENT_RECOVER _IOW ('f', 133, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PING_TARGET _IOW ('f', 136, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_DEC_FS_USE_COUNT _IO ('f', 139 )
+#define OBD_IOC_NO_TRANSNO _IOW ('f', 140, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_SET_READONLY _IOW ('f', 141, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_ABORT_RECOVERY _IOR ('f', 142, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_ROOT_SQUASH _IOWR('f', 143, OBD_IOC_DATA_TYPE)
+
+#define OBD_GET_VERSION _IOWR ('f', 144, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_GSS_SUPPORT _IOWR('f', 145, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_CLOSE_UUID _IOWR ('f', 147, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_CHANGELOG_SEND _IOW ('f', 148, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_GETDEVICE _IOWR ('f', 149, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_FID2PATH _IOWR ('f', 150, OBD_IOC_DATA_TYPE)
+/* see also <lustre/lustre_user.h> for ioctls 151-153 */
+/* OBD_IOC_LOV_SETSTRIPE: See also LL_IOC_LOV_SETSTRIPE */
+#define OBD_IOC_LOV_SETSTRIPE _IOW ('f', 154, OBD_IOC_DATA_TYPE)
+/* OBD_IOC_LOV_GETSTRIPE: See also LL_IOC_LOV_GETSTRIPE */
+#define OBD_IOC_LOV_GETSTRIPE _IOW ('f', 155, OBD_IOC_DATA_TYPE)
+/* OBD_IOC_LOV_SETEA: See also LL_IOC_LOV_SETEA */
+#define OBD_IOC_LOV_SETEA _IOW ('f', 156, OBD_IOC_DATA_TYPE)
+/* see <lustre/lustre_user.h> for ioctls 157-159 */
+/* OBD_IOC_QUOTACHECK: See also LL_IOC_QUOTACHECK */
+#define OBD_IOC_QUOTACHECK _IOW ('f', 160, int)
+/* OBD_IOC_POLL_QUOTACHECK: See also LL_IOC_POLL_QUOTACHECK */
+#define OBD_IOC_POLL_QUOTACHECK _IOR ('f', 161, struct if_quotacheck *)
+/* OBD_IOC_QUOTACTL: See also LL_IOC_QUOTACTL */
+#define OBD_IOC_QUOTACTL _IOWR('f', 162, struct if_quotactl)
+/* see also <lustre/lustre_user.h> for ioctls 163-176 */
+#define OBD_IOC_CHANGELOG_REG _IOW ('f', 177, struct obd_ioctl_data)
+#define OBD_IOC_CHANGELOG_DEREG _IOW ('f', 178, struct obd_ioctl_data)
+#define OBD_IOC_CHANGELOG_CLEAR _IOW ('f', 179, struct obd_ioctl_data)
+#define OBD_IOC_RECORD _IOWR('f', 180, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_ENDRECORD _IOWR('f', 181, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PARSE _IOWR('f', 182, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_DORECORD _IOWR('f', 183, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PROCESS_CFG _IOWR('f', 184, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_DUMP_LOG _IOWR('f', 185, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_CLEAR_LOG _IOWR('f', 186, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PARAM _IOW ('f', 187, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_POOL _IOWR('f', 188, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_REPLACE_NIDS _IOWR('f', 189, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_CATLOGLIST _IOWR('f', 190, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_INFO _IOWR('f', 191, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_PRINT _IOWR('f', 192, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_CANCEL _IOWR('f', 193, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_REMOVE _IOWR('f', 194, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_CHECK _IOWR('f', 195, OBD_IOC_DATA_TYPE)
+/* OBD_IOC_LLOG_CATINFO is deprecated */
+#define OBD_IOC_LLOG_CATINFO _IOWR('f', 196, OBD_IOC_DATA_TYPE)
+
+#define ECHO_IOC_GET_STRIPE _IOWR('f', 200, OBD_IOC_DATA_TYPE)
+#define ECHO_IOC_SET_STRIPE _IOWR('f', 201, OBD_IOC_DATA_TYPE)
+#define ECHO_IOC_ENQUEUE _IOWR('f', 202, OBD_IOC_DATA_TYPE)
+#define ECHO_IOC_CANCEL _IOWR('f', 203, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_GET_OBJ_VERSION _IOR('f', 210, OBD_IOC_DATA_TYPE)
+
+/* <lustre/lustre_user.h> defines ioctl number 218-219 */
+#define OBD_IOC_GET_MNTOPT _IOW('f', 220, mntopt_t)
+
+#define OBD_IOC_ECHO_MD _IOR('f', 221, struct obd_ioctl_data)
+#define OBD_IOC_ECHO_ALLOC_SEQ _IOWR('f', 222, struct obd_ioctl_data)
+
+#define OBD_IOC_START_LFSCK _IOWR('f', 230, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_STOP_LFSCK _IOW('f', 231, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PAUSE_LFSCK _IOW('f', 232, OBD_IOC_DATA_TYPE)
+
+/* XXX _IOWR('f', 250, long) has been defined in
+ * libcfs/include/libcfs/libcfs_private.h for debug, don't use it
+ */
+
+/* Until such time as we get_info the per-stripe maximum from the OST,
+ * we define this to be 2T - 4k, which is the ext3 maxbytes. */
+#define LUSTRE_STRIPE_MAXBYTES 0x1fffffff000ULL
+
+/* Special values for remove LOV EA from disk */
+#define LOVEA_DELETE_VALUES(size, count, offset) \
+ ((size) == 0 && (count) == 0 && (offset) == (typeof(offset))(-1))
+
+/* #define POISON_BULK 0 */
+
+/*
+ * l_wait_event is a flexible sleeping function, permitting simple caller
+ * configuration of interrupt and timeout sensitivity along with actions to
+ * be performed in the event of either exception.
+ *
+ * The first form of usage looks like this:
+ *
+ * struct l_wait_info lwi = LWI_TIMEOUT_INTR(timeout, timeout_handler,
+ * intr_handler, callback_data);
+ * rc = l_wait_event(waitq, condition, &lwi);
+ *
+ * l_wait_event() makes the current process wait on 'waitq' until 'condition'
+ * is TRUE or a "killable" signal (SIGTERM, SIGKILL, SIGINT) is pending. It
+ * returns 0 to signify 'condition' is TRUE, but if a signal wakes it before
+ * 'condition' becomes true, it optionally calls the specified 'intr_handler'
+ * if not NULL, and returns -EINTR.
+ *
+ * If a non-zero timeout is specified, signals are ignored until the timeout
+ * has expired. At this time, if 'timeout_handler' is not NULL it is called.
+ * If it returns FALSE l_wait_event() continues to wait as described above with
+ * signals enabled. Otherwise it returns -ETIMEDOUT.
+ *
+ * LWI_INTR(intr_handler, callback_data) is shorthand for
+ * LWI_TIMEOUT_INTR(0, NULL, intr_handler, callback_data)
+ *
+ * The second form of usage looks like this:
+ *
+ * struct l_wait_info lwi = LWI_TIMEOUT(timeout, timeout_handler);
+ * rc = l_wait_event(waitq, condition, &lwi);
+ *
+ * This form is the same as the first except that it COMPLETELY IGNORES
+ * SIGNALS. The caller must therefore beware that if 'timeout' is zero, or if
+ * 'timeout_handler' is not NULL and returns FALSE, then the ONLY thing that
+ * can unblock the current process is 'condition' becoming TRUE.
+ *
+ * Another form of usage is:
+ * struct l_wait_info lwi = LWI_TIMEOUT_INTERVAL(timeout, interval,
+ * timeout_handler);
+ * rc = l_wait_event(waitq, condition, &lwi);
+ * This is the same as the previous case, but the condition is checked once
+ * every 'interval' jiffies (if non-zero).
+ *
+ * Subtle synchronization point: this macro does *not* necessarily take the
+ * wait-queue spin-lock before returning and, hence, the following idiom is
+ * safe ONLY when the caller provides some external locking:
+ *
+ * Thread1 Thread2
+ *
+ * l_wait_event(&obj->wq, ....); (1)
+ *
+ * wake_up(&obj->wq): (2)
+ * spin_lock(&q->lock); (2.1)
+ * __wake_up_common(q, ...); (2.2)
+ * spin_unlock(&q->lock, flags); (2.3)
+ *
+ * OBD_FREE_PTR(obj); (3)
+ *
+ * As l_wait_event() may "short-cut" execution and return without taking the
+ * wait-queue spin-lock, some additional synchronization is necessary to
+ * guarantee that step (3) can begin only after (2.3) finishes.
+ *
+ * XXX nikita: some ptlrpc daemon threads have races of that sort.
+ *
+ */
+static inline int back_to_sleep(void *arg)
+{
+ return 0;
+}
+
+#define LWI_ON_SIGNAL_NOOP ((void (*)(void *))(-1))
+
+struct l_wait_info {
+ cfs_duration_t lwi_timeout;
+ cfs_duration_t lwi_interval;
+ int lwi_allow_intr;
+ int (*lwi_on_timeout)(void *);
+ void (*lwi_on_signal)(void *);
+ void *lwi_cb_data;
+};
+
+/* NB: LWI_TIMEOUT ignores signals completely */
+#define LWI_TIMEOUT(time, cb, data) \
+((struct l_wait_info) { \
+ .lwi_timeout = time, \
+ .lwi_on_timeout = cb, \
+ .lwi_cb_data = data, \
+ .lwi_interval = 0, \
+ .lwi_allow_intr = 0 \
+})
+
+#define LWI_TIMEOUT_INTERVAL(time, interval, cb, data) \
+((struct l_wait_info) { \
+ .lwi_timeout = time, \
+ .lwi_on_timeout = cb, \
+ .lwi_cb_data = data, \
+ .lwi_interval = interval, \
+ .lwi_allow_intr = 0 \
+})
+
+#define LWI_TIMEOUT_INTR(time, time_cb, sig_cb, data) \
+((struct l_wait_info) { \
+ .lwi_timeout = time, \
+ .lwi_on_timeout = time_cb, \
+ .lwi_on_signal = sig_cb, \
+ .lwi_cb_data = data, \
+ .lwi_interval = 0, \
+ .lwi_allow_intr = 0 \
+})
+
+#define LWI_TIMEOUT_INTR_ALL(time, time_cb, sig_cb, data) \
+((struct l_wait_info) { \
+ .lwi_timeout = time, \
+ .lwi_on_timeout = time_cb, \
+ .lwi_on_signal = sig_cb, \
+ .lwi_cb_data = data, \
+ .lwi_interval = 0, \
+ .lwi_allow_intr = 1 \
+})
+
+#define LWI_INTR(cb, data) LWI_TIMEOUT_INTR(0, NULL, cb, data)
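A kernel-context sketch of the first usage form (a hypothetical caller, not quoted from the patch; the 30s value is illustrative): wait for an import's in-flight requests to drain, giving up with -ETIMEDOUT once the timeout callback returns non-zero:

    static int drain_timeout_cb(void *data)
    {
            CERROR("requests still in flight after timeout\n");
            return 1;       /* non-zero: stop waiting, return -ETIMEDOUT */
    }

    static int wait_for_drain(struct obd_import *imp)
    {
            struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(30),
                                                 drain_timeout_cb, imp);

            return l_wait_event(imp->imp_recovery_waitq,
                                atomic_read(&imp->imp_inflight) == 0, &lwi);
    }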
+
+
+/*
+ * wait for @condition to become true, but no longer than timeout, specified
+ * by @info.
+ */
+#define __l_wait_event(wq, condition, info, ret, l_add_wait) \
+do { \
+ wait_queue_t __wait; \
+ cfs_duration_t __timeout = info->lwi_timeout; \
+ sigset_t __blocked; \
+ int __allow_intr = info->lwi_allow_intr; \
+ \
+ ret = 0; \
+ if (condition) \
+ break; \
+ \
+ init_waitqueue_entry_current(&__wait); \
+ l_add_wait(&wq, &__wait); \
+ \
+ /* Block all signals (just the non-fatal ones if no timeout). */ \
+ if (info->lwi_on_signal != NULL && (__timeout == 0 || __allow_intr)) \
+ __blocked = cfs_block_sigsinv(LUSTRE_FATAL_SIGS); \
+ else \
+ __blocked = cfs_block_sigsinv(0); \
+ \
+ for (;;) { \
+ unsigned __wstate; \
+ \
+ __wstate = info->lwi_on_signal != NULL && \
+ (__timeout == 0 || __allow_intr) ? \
+ TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE; \
+ \
+ set_current_state(TASK_INTERRUPTIBLE); \
+ \
+ if (condition) \
+ break; \
+ \
+ if (__timeout == 0) { \
+ waitq_wait(&__wait, __wstate); \
+ } else { \
+ cfs_duration_t interval = info->lwi_interval? \
+ min_t(cfs_duration_t, \
+ info->lwi_interval,__timeout):\
+ __timeout; \
+ cfs_duration_t remaining = waitq_timedwait(&__wait,\
+ __wstate, \
+ interval); \
+ __timeout = cfs_time_sub(__timeout, \
+ cfs_time_sub(interval, remaining));\
+ if (__timeout == 0) { \
+ if (info->lwi_on_timeout == NULL || \
+ info->lwi_on_timeout(info->lwi_cb_data)) { \
+ ret = -ETIMEDOUT; \
+ break; \
+ } \
+ /* Take signals after the timeout expires. */ \
+ if (info->lwi_on_signal != NULL) \
+ (void)cfs_block_sigsinv(LUSTRE_FATAL_SIGS);\
+ } \
+ } \
+ \
+ if (condition) \
+ break; \
+ if (cfs_signal_pending()) { \
+ if (info->lwi_on_signal != NULL && \
+ (__timeout == 0 || __allow_intr)) { \
+ if (info->lwi_on_signal != LWI_ON_SIGNAL_NOOP) \
+ info->lwi_on_signal(info->lwi_cb_data);\
+ ret = -EINTR; \
+ break; \
+ } \
+ /* We have to do this here because some signals */ \
+ /* are not blockable - ie from strace(1). */ \
+ /* In these cases we want to schedule_timeout() */ \
+ /* again, because we don't want that to return */ \
+ /* -EINTR when the RPC actually succeeded. */ \
+ /* the recalc_sigpending() below will deliver the */ \
+ /* signal properly. */ \
+ cfs_clear_sigpending(); \
+ } \
+ } \
+ \
+ cfs_restore_sigs(__blocked); \
+ \
+ set_current_state(TASK_RUNNING); \
+ remove_wait_queue(&wq, &__wait); \
+} while (0)
+
+
+
+#define l_wait_event(wq, condition, info) \
+({ \
+ int __ret; \
+ struct l_wait_info *__info = (info); \
+ \
+ __l_wait_event(wq, condition, __info, \
+ __ret, add_wait_queue); \
+ __ret; \
+})
+
+#define l_wait_event_exclusive(wq, condition, info) \
+({ \
+ int __ret; \
+ struct l_wait_info *__info = (info); \
+ \
+ __l_wait_event(wq, condition, __info, \
+ __ret, add_wait_queue_exclusive); \
+ __ret; \
+})
+
+#define l_wait_event_exclusive_head(wq, condition, info) \
+({ \
+ int __ret; \
+ struct l_wait_info *__info = (info); \
+ \
+ __l_wait_event(wq, condition, __info, \
+ __ret, add_wait_queue_exclusive_head); \
+ __ret; \
+})
+
+#define l_wait_condition(wq, condition) \
+({ \
+ struct l_wait_info lwi = { 0 }; \
+ l_wait_event(wq, condition, &lwi); \
+})
+
+#define l_wait_condition_exclusive(wq, condition) \
+({ \
+ struct l_wait_info lwi = { 0 }; \
+ l_wait_event_exclusive(wq, condition, &lwi); \
+})
+
+#define l_wait_condition_exclusive_head(wq, condition) \
+({ \
+ struct l_wait_info lwi = { 0 }; \
+ l_wait_event_exclusive_head(wq, condition, &lwi); \
+})
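+
+/*
+ * Usage sketch (illustrative only, not part of the API above): callers
+ * build an l_wait_info with one of the LWI_* initializers and hand it
+ * to l_wait_event().  The names my_waitq and my_cond are hypothetical.
+ *
+ *	struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+ *	int rc = l_wait_event(my_waitq, my_cond != 0, &lwi);
+ *
+ * Here rc is 0 once the condition becomes true, or -EINTR if a fatal
+ * signal arrives first; the timed initializers return -ETIMEDOUT when
+ * the timeout expires and the timeout callback (if any) allows it.
+ */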
+
+#define LIBLUSTRE_CLIENT (0)
+
+/** @} lib */
+
+#endif /* _LUSTRE_LIB_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_linkea.h b/drivers/staging/lustre/lustre/include/lustre_linkea.h
new file mode 100644
index 000000000000..5790be913bf6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_linkea.h
@@ -0,0 +1,57 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2013, Intel Corporation.
+ * Use is subject to license terms.
+ *
+ * Author: di wang <di.wang@intel.com>
+ */
+
+struct linkea_data {
+ /**
+ * Buffer to keep link EA body.
+ */
+ struct lu_buf *ld_buf;
+ /**
+	 * The matched header, entry and its length in the EA
+ */
+ struct link_ea_header *ld_leh;
+ struct link_ea_entry *ld_lee;
+ int ld_reclen;
+};
+
+int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf);
+int linkea_init(struct linkea_data *ldata);
+void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen,
+ struct lu_name *lname, struct lu_fid *pfid);
+int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname,
+ const struct lu_fid *pfid);
+void linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname);
+int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname,
+ const struct lu_fid *pfid);
+
+#define LINKEA_NEXT_ENTRY(ldata) \
+ (struct link_ea_entry *)((char *)ldata.ld_lee + ldata.ld_reclen)
+
+#define LINKEA_FIRST_ENTRY(ldata) \
+ (struct link_ea_entry *)(ldata.ld_leh + 1)
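+
+/*
+ * Iteration sketch (illustrative only): after linkea_init() has
+ * validated ldata.ld_leh, the entries can be walked with the accessors
+ * above; linkea_entry_unpack() yields the record length that
+ * LINKEA_NEXT_ENTRY() uses to advance.  The leh_reccount field is
+ * assumed from the link_ea_header definition in lustre_idl.h.
+ *
+ *	struct link_ea_entry *lee = LINKEA_FIRST_ENTRY(ldata);
+ *	int i;
+ *
+ *	for (i = 0; i < ldata.ld_leh->leh_reccount; i++) {
+ *		linkea_entry_unpack(lee, &ldata.ld_reclen, &lname, &pfid);
+ *		ldata.ld_lee = lee;
+ *		lee = LINKEA_NEXT_ENTRY(ldata);
+ *	}
+ */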
diff --git a/drivers/staging/lustre/lustre/include/lustre_lite.h b/drivers/staging/lustre/lustre/include/lustre_lite.h
new file mode 100644
index 000000000000..25f8bfaccef3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_lite.h
@@ -0,0 +1,147 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LL_H
+#define _LL_H
+
+/** \defgroup lite lite
+ *
+ * @{
+ */
+
+#include <linux/lustre_lite.h>
+
+#include <obd_class.h>
+#include <obd_ost.h>
+#include <lustre_net.h>
+#include <lustre_mds.h>
+#include <lustre_ha.h>
+
+/* 4UL * 1024 * 1024 */
+#define LL_MAX_BLKSIZE_BITS (22)
+#define LL_MAX_BLKSIZE (1UL<<LL_MAX_BLKSIZE_BITS)
+
+#include <lustre/lustre_user.h>
+
+
+struct lustre_rw_params {
+ int lrp_lock_mode;
+ ldlm_policy_data_t lrp_policy;
+ obd_flag lrp_brw_flags;
+ int lrp_ast_flags;
+};
+
+/*
+ * XXX nikita: this function lives in the header because it is used by both
+ * the llite kernel module and the liblustre library, and there is no (?)
+ * better place to put it.
+ */
+static inline void lustre_build_lock_params(int cmd, unsigned long open_flags,
+ __u64 connect_flags,
+ loff_t pos, ssize_t len,
+ struct lustre_rw_params *params)
+{
+ params->lrp_lock_mode = (cmd == OBD_BRW_READ) ? LCK_PR : LCK_PW;
+ params->lrp_brw_flags = 0;
+
+ params->lrp_policy.l_extent.start = pos;
+ params->lrp_policy.l_extent.end = pos + len - 1;
+ /*
+ * for now O_APPEND always takes local locks.
+ */
+ if (cmd == OBD_BRW_WRITE && (open_flags & O_APPEND)) {
+ params->lrp_policy.l_extent.start = 0;
+ params->lrp_policy.l_extent.end = OBD_OBJECT_EOF;
+ } else if (LIBLUSTRE_CLIENT && (connect_flags & OBD_CONNECT_SRVLOCK)) {
+ /*
+ * liblustre: OST-side locking for all non-O_APPEND
+ * reads/writes.
+ */
+ params->lrp_lock_mode = LCK_NL;
+ params->lrp_brw_flags = OBD_BRW_SRVLOCK;
+ } else {
+ /*
+ * nothing special for the kernel. In the future llite may use
+ * OST-side locks for small writes into highly contended
+ * files.
+ */
+ }
+ params->lrp_ast_flags = (open_flags & O_NONBLOCK) ?
+ LDLM_FL_BLOCK_NOWAIT : 0;
+}
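+
+/*
+ * Worked example (illustrative only): a plain 4096-byte write at
+ * offset 0 through the kernel client yields a PW extent lock covering
+ * exactly the written range:
+ *
+ *	struct lustre_rw_params p;
+ *
+ *	lustre_build_lock_params(OBD_BRW_WRITE, 0, 0, 0, 4096, &p);
+ *	// p.lrp_lock_mode       == LCK_PW
+ *	// p.lrp_policy.l_extent covers [0, 4095]
+ *	// p.lrp_brw_flags == 0, p.lrp_ast_flags == 0 (no O_NONBLOCK)
+ */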
+
+/*
+ * This is embedded into liblustre and llite super-blocks to keep track of
+ * connect flags (capabilities) supported by all imports given mount is
+ * connected to.
+ */
+struct lustre_client_ocd {
+ /*
+ * This is conjunction of connect_flags across all imports (LOVs) this
+ * mount is connected to. This field is updated by cl_ocd_update()
+ * under ->lco_lock.
+ */
+ __u64 lco_flags;
+ struct mutex lco_lock;
+ struct obd_export *lco_md_exp;
+ struct obd_export *lco_dt_exp;
+};
+
+/*
+ * Chain of hash overflow pages.
+ */
+struct ll_dir_chain {
+ /* XXX something. Later */
+};
+
+static inline void ll_dir_chain_init(struct ll_dir_chain *chain)
+{
+}
+
+static inline void ll_dir_chain_fini(struct ll_dir_chain *chain)
+{
+}
+
+static inline unsigned long hash_x_index(__u64 hash, int hash64)
+{
+ if (BITS_PER_LONG == 32 && hash64)
+ hash >>= 32;
+ return ~0UL - hash;
+}
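+
+/*
+ * Worked example (illustrative): hash_x_index() maps a hash to its
+ * complement, so on a 64-bit kernel hash_x_index(0, 0) == ~0UL and
+ * hash_x_index(1, 0) == ~0UL - 1; on a 32-bit kernel a 64-bit hash is
+ * first collapsed to its top 32 bits before being complemented.
+ */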
+
+/** @} lite */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_log.h b/drivers/staging/lustre/lustre/include/lustre_log.h
new file mode 100644
index 000000000000..714ab378e431
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_log.h
@@ -0,0 +1,576 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_log.h
+ *
+ * Generic infrastructure for managing a collection of logs.
+ * These logs are used for:
+ *
+ * - orphan recovery: OST adds record on create
+ * - mtime/size consistency: the OST adds a record on first write
+ * - open/unlinked objects: OST adds a record on destroy
+ *
+ * - mds unlink log: the MDS adds an entry upon delete
+ *
+ * - raid1 replication log between OST's
+ * - MDS replication logs
+ */
+
+#ifndef _LUSTRE_LOG_H
+#define _LUSTRE_LOG_H
+
+/** \defgroup log log
+ *
+ * @{
+ */
+
+#include <linux/lustre_log.h>
+
+#include <obd_class.h>
+#include <obd_ost.h>
+#include <lustre/lustre_idl.h>
+#include <dt_object.h>
+
+#define LOG_NAME_LIMIT(logname, name) \
+ snprintf(logname, sizeof(logname), "LOGS/%s", name)
+#define LLOG_EEMPTY 4711
+
+enum llog_open_param {
+ LLOG_OPEN_EXISTS = 0x0000,
+ LLOG_OPEN_NEW = 0x0001,
+};
+
+struct plain_handle_data {
+ struct list_head phd_entry;
+ struct llog_handle *phd_cat_handle;
+ struct llog_cookie phd_cookie; /* cookie of this log in its cat */
+};
+
+struct cat_handle_data {
+ struct list_head chd_head;
+ struct llog_handle *chd_current_log; /* currently open log */
+ struct llog_handle *chd_next_log; /* llog to be used next */
+};
+
+static inline void logid_to_fid(struct llog_logid *id, struct lu_fid *fid)
+{
+	/* For compatibility purposes we identify pre-OSD (~< 2.3.51 MDS)
+	 * logids by a non-zero ogen (inode generation) and convert them
+	 * into an IGIF */
+ if (id->lgl_ogen == 0) {
+ fid->f_seq = id->lgl_oi.oi.oi_seq;
+ fid->f_oid = id->lgl_oi.oi.oi_id;
+ fid->f_ver = 0;
+ } else {
+ lu_igif_build(fid, id->lgl_oi.oi.oi_id, id->lgl_ogen);
+ }
+}
+
+static inline void fid_to_logid(struct lu_fid *fid, struct llog_logid *id)
+{
+ id->lgl_oi.oi.oi_seq = fid->f_seq;
+ id->lgl_oi.oi.oi_id = fid->f_oid;
+ id->lgl_ogen = 0;
+}
+
+static inline void logid_set_id(struct llog_logid *log_id, __u64 id)
+{
+ log_id->lgl_oi.oi.oi_id = id;
+}
+
+static inline __u64 logid_id(struct llog_logid *log_id)
+{
+ return log_id->lgl_oi.oi.oi_id;
+}
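+
+/*
+ * Round-trip sketch (illustrative only): for OSD-era logs
+ * (lgl_ogen == 0) the two conversions above are inverses up to the
+ * version field:
+ *
+ *	struct llog_logid id;
+ *	struct lu_fid out;
+ *
+ *	fid_to_logid(&fid, &id);	// sets id.lgl_ogen = 0
+ *	logid_to_fid(&id, &out);	// out.f_seq == fid.f_seq,
+ *					// out.f_oid == fid.f_oid,
+ *					// out.f_ver == 0
+ */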
+
+struct llog_handle;
+
+/* llog.c - general API */
+int llog_init_handle(const struct lu_env *env, struct llog_handle *handle,
+ int flags, struct obd_uuid *uuid);
+int llog_copy_handler(const struct lu_env *env, struct llog_handle *llh,
+ struct llog_rec_hdr *rec, void *data);
+int llog_process(const struct lu_env *env, struct llog_handle *loghandle,
+ llog_cb_t cb, void *data, void *catdata);
+int llog_process_or_fork(const struct lu_env *env,
+ struct llog_handle *loghandle,
+ llog_cb_t cb, void *data, void *catdata, bool fork);
+int llog_reverse_process(const struct lu_env *env,
+ struct llog_handle *loghandle, llog_cb_t cb,
+ void *data, void *catdata);
+int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle,
+ int index);
+int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt,
+ struct llog_handle **lgh, struct llog_logid *logid,
+ char *name, enum llog_open_param open_param);
+int llog_close(const struct lu_env *env, struct llog_handle *cathandle);
+int llog_get_size(struct llog_handle *loghandle);
+
+/* llog_process flags */
+#define LLOG_FLAG_NODEAMON 0x0001
+
+/* llog_cat.c - catalog api */
+struct llog_process_data {
+ /**
+ * Any useful data needed while processing catalog. This is
+ * passed later to process callback.
+ */
+ void *lpd_data;
+ /**
+ * Catalog process callback function, called for each record
+ * in catalog.
+ */
+ llog_cb_t lpd_cb;
+ /**
+ * Start processing the catalog from startcat/startidx
+ */
+ int lpd_startcat;
+ int lpd_startidx;
+};
+
+struct llog_process_cat_data {
+ /**
+ * Temporary stored first_idx while scanning log.
+ */
+ int lpcd_first_idx;
+ /**
+ * Temporary stored last_idx while scanning log.
+ */
+ int lpcd_last_idx;
+};
+
+int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle);
+int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle,
+ struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+ void *buf, struct thandle *th);
+int llog_cat_declare_add_rec(const struct lu_env *env,
+ struct llog_handle *cathandle,
+ struct llog_rec_hdr *rec, struct thandle *th);
+int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle,
+ struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+ void *buf);
+int llog_cat_cancel_records(const struct lu_env *env,
+ struct llog_handle *cathandle, int count,
+ struct llog_cookie *cookies);
+int llog_cat_process_or_fork(const struct lu_env *env,
+ struct llog_handle *cat_llh, llog_cb_t cb,
+ void *data, int startcat, int startidx, bool fork);
+int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh,
+ llog_cb_t cb, void *data, int startcat, int startidx);
+int llog_cat_reverse_process(const struct lu_env *env,
+ struct llog_handle *cat_llh, llog_cb_t cb,
+ void *data);
+int llog_cat_init_and_process(const struct lu_env *env,
+ struct llog_handle *llh);
+
+/* llog_obd.c */
+int llog_setup(const struct lu_env *env, struct obd_device *obd,
+ struct obd_llog_group *olg, int index,
+ struct obd_device *disk_obd, struct llog_operations *op);
+int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt);
+int llog_cleanup(const struct lu_env *env, struct llog_ctxt *);
+int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags);
+int llog_obd_add(const struct lu_env *env, struct llog_ctxt *ctxt,
+ struct llog_rec_hdr *rec, struct lov_stripe_md *lsm,
+ struct llog_cookie *logcookies, int numcookies);
+int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt,
+ struct lov_stripe_md *lsm, int count,
+ struct llog_cookie *cookies, int flags);
+
+int obd_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+ struct obd_device *disk_obd, int *idx);
+
+int obd_llog_finish(struct obd_device *obd, int count);
+
+/* llog_ioctl.c */
+int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd,
+ struct obd_ioctl_data *data);
+
+/* llog_net.c */
+int llog_initiator_connect(struct llog_ctxt *ctxt);
+
+struct llog_operations {
+ int (*lop_destroy)(const struct lu_env *env,
+ struct llog_handle *handle);
+ int (*lop_next_block)(const struct lu_env *env, struct llog_handle *h,
+ int *curr_idx, int next_idx, __u64 *offset,
+ void *buf, int len);
+ int (*lop_prev_block)(const struct lu_env *env, struct llog_handle *h,
+ int prev_idx, void *buf, int len);
+ int (*lop_read_header)(const struct lu_env *env,
+ struct llog_handle *handle);
+ int (*lop_setup)(const struct lu_env *env, struct obd_device *obd,
+ struct obd_llog_group *olg, int ctxt_idx,
+ struct obd_device *disk_obd);
+ int (*lop_sync)(struct llog_ctxt *ctxt, struct obd_export *exp,
+ int flags);
+ int (*lop_cleanup)(const struct lu_env *env, struct llog_ctxt *ctxt);
+ int (*lop_cancel)(const struct lu_env *env, struct llog_ctxt *ctxt,
+ struct lov_stripe_md *lsm, int count,
+ struct llog_cookie *cookies, int flags);
+ int (*lop_connect)(struct llog_ctxt *ctxt, struct llog_logid *logid,
+ struct llog_gen *gen, struct obd_uuid *uuid);
+ /**
+	 * Any llog file must be opened first using llog_open().  An llog
+	 * can be opened by name, by logid, or with neither, in which case
+	 * a new logid will be generated.
+ */
+ int (*lop_open)(const struct lu_env *env, struct llog_handle *lgh,
+ struct llog_logid *logid, char *name,
+ enum llog_open_param);
+ /**
+	 * An opened llog may not exist yet; this must be checked where
+	 * needed using the llog_exist() call.
+ */
+ int (*lop_exist)(struct llog_handle *lgh);
+ /**
+	 * Close the llog file; llog_free_handle() is called implicitly.
+	 * Any opened llog must be closed by a llog_close() call.
+ */
+ int (*lop_close)(const struct lu_env *env, struct llog_handle *handle);
+ /**
+	 * Create a new llog file.  The llog must already be opened.
+	 * Must be used only for local llog operations.
+ */
+ int (*lop_declare_create)(const struct lu_env *env,
+ struct llog_handle *handle,
+ struct thandle *th);
+ int (*lop_create)(const struct lu_env *env, struct llog_handle *handle,
+ struct thandle *th);
+ /**
+	 * Write a new record to the llog.  This usually appends records,
+	 * but can rewrite existing records too.
+ */
+ int (*lop_declare_write_rec)(const struct lu_env *env,
+ struct llog_handle *lgh,
+ struct llog_rec_hdr *rec,
+ int idx, struct thandle *th);
+ int (*lop_write_rec)(const struct lu_env *env,
+ struct llog_handle *loghandle,
+ struct llog_rec_hdr *rec,
+ struct llog_cookie *cookie, int cookiecount,
+ void *buf, int idx, struct thandle *th);
+ /**
+	 * Add a new record to the llog catalog.  Does the same as
+	 * llog_write_rec() but through the llog catalog.
+ */
+ int (*lop_declare_add)(const struct lu_env *env,
+ struct llog_handle *lgh,
+ struct llog_rec_hdr *rec, struct thandle *th);
+ int (*lop_add)(const struct lu_env *env, struct llog_handle *lgh,
+ struct llog_rec_hdr *rec, struct llog_cookie *cookie,
+ void *buf, struct thandle *th);
+	/* Old llog_add version, still used by MDS-LOV-OSC; it will go
+	 * away with the LOD/OSP replacement */
+ int (*lop_obd_add)(const struct lu_env *env, struct llog_ctxt *ctxt,
+ struct llog_rec_hdr *rec, struct lov_stripe_md *lsm,
+ struct llog_cookie *logcookies, int numcookies);
+};
+
+/* In-memory descriptor for a log object or log catalog */
+struct llog_handle {
+ struct rw_semaphore lgh_lock;
+ spinlock_t lgh_hdr_lock; /* protect lgh_hdr data */
+ struct llog_logid lgh_id; /* id of this log */
+ struct llog_log_hdr *lgh_hdr;
+ struct file *lgh_file;
+ struct dt_object *lgh_obj;
+ int lgh_last_idx;
+ int lgh_cur_idx; /* used during llog_process */
+ __u64 lgh_cur_offset; /* used during llog_process */
+ struct llog_ctxt *lgh_ctxt;
+ union {
+ struct plain_handle_data phd;
+ struct cat_handle_data chd;
+ } u;
+ char *lgh_name;
+ void *private_data;
+ struct llog_operations *lgh_logops;
+ atomic_t lgh_refcount;
+};
+
+/* llog_lvfs.c */
+extern struct llog_operations llog_lvfs_ops;
+
+/* llog_osd.c */
+extern struct llog_operations llog_osd_ops;
+int llog_osd_get_cat_list(const struct lu_env *env, struct dt_device *d,
+ int idx, int count,
+ struct llog_catid *idarray);
+int llog_osd_put_cat_list(const struct lu_env *env, struct dt_device *d,
+ int idx, int count,
+ struct llog_catid *idarray);
+
+#define LLOG_CTXT_FLAG_UNINITIALIZED 0x00000001
+#define LLOG_CTXT_FLAG_STOP 0x00000002
+
+struct llog_ctxt {
+	int			 loc_idx; /* my index in the obd array of ctxts */
+ struct obd_device *loc_obd; /* points back to the containing obd*/
+ struct obd_llog_group *loc_olg; /* group containing that ctxt */
+ struct obd_export *loc_exp; /* parent "disk" export (e.g. MDS) */
+	struct obd_import	*loc_imp; /* to use in RPCs: can be a
+					   * backward-pointing import */
+ struct llog_operations *loc_logops;
+ struct llog_handle *loc_handle;
+ struct mutex loc_mutex; /* protect loc_imp */
+ atomic_t loc_refcount;
+ long loc_flags; /* flags, see above defines */
+ struct dt_object *loc_dir;
+};
+
+#define LLOG_PROC_BREAK 0x0001
+#define LLOG_DEL_RECORD 0x0002
+
+static inline int llog_obd2ops(struct llog_ctxt *ctxt,
+ struct llog_operations **lop)
+{
+ if (ctxt == NULL)
+ return -ENOTCONN;
+
+ *lop = ctxt->loc_logops;
+ if (*lop == NULL)
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+
+static inline int llog_handle2ops(struct llog_handle *loghandle,
+ struct llog_operations **lop)
+{
+ if (loghandle == NULL || loghandle->lgh_logops == NULL)
+ return -EINVAL;
+
+ *lop = loghandle->lgh_logops;
+ return 0;
+}
+
+static inline int llog_data_len(int len)
+{
+ return cfs_size_round(len);
+}
+
+static inline struct llog_ctxt *llog_ctxt_get(struct llog_ctxt *ctxt)
+{
+ atomic_inc(&ctxt->loc_refcount);
+ CDEBUG(D_INFO, "GETting ctxt %p : new refcount %d\n", ctxt,
+ atomic_read(&ctxt->loc_refcount));
+ return ctxt;
+}
+
+static inline void llog_ctxt_put(struct llog_ctxt *ctxt)
+{
+ if (ctxt == NULL)
+ return;
+ LASSERT_ATOMIC_GT_LT(&ctxt->loc_refcount, 0, LI_POISON);
+ CDEBUG(D_INFO, "PUTting ctxt %p : new refcount %d\n", ctxt,
+ atomic_read(&ctxt->loc_refcount) - 1);
+ __llog_ctxt_put(NULL, ctxt);
+}
+
+static inline void llog_group_init(struct obd_llog_group *olg, int group)
+{
+ init_waitqueue_head(&olg->olg_waitq);
+ spin_lock_init(&olg->olg_lock);
+ mutex_init(&olg->olg_cat_processing);
+ olg->olg_seq = group;
+}
+
+static inline int llog_group_set_ctxt(struct obd_llog_group *olg,
+ struct llog_ctxt *ctxt, int index)
+{
+ LASSERT(index >= 0 && index < LLOG_MAX_CTXTS);
+
+ spin_lock(&olg->olg_lock);
+ if (olg->olg_ctxts[index] != NULL) {
+ spin_unlock(&olg->olg_lock);
+ return -EEXIST;
+ }
+ olg->olg_ctxts[index] = ctxt;
+ spin_unlock(&olg->olg_lock);
+ return 0;
+}
+
+static inline struct llog_ctxt *llog_group_get_ctxt(struct obd_llog_group *olg,
+ int index)
+{
+ struct llog_ctxt *ctxt;
+
+ LASSERT(index >= 0 && index < LLOG_MAX_CTXTS);
+
+ spin_lock(&olg->olg_lock);
+ if (olg->olg_ctxts[index] == NULL)
+ ctxt = NULL;
+ else
+ ctxt = llog_ctxt_get(olg->olg_ctxts[index]);
+ spin_unlock(&olg->olg_lock);
+ return ctxt;
+}
+
+static inline void llog_group_clear_ctxt(struct obd_llog_group *olg, int index)
+{
+ LASSERT(index >= 0 && index < LLOG_MAX_CTXTS);
+ spin_lock(&olg->olg_lock);
+ olg->olg_ctxts[index] = NULL;
+ spin_unlock(&olg->olg_lock);
+}
+
+static inline struct llog_ctxt *llog_get_context(struct obd_device *obd,
+ int index)
+{
+ return llog_group_get_ctxt(&obd->obd_olg, index);
+}
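+
+/*
+ * Reference pattern sketch (illustrative only; the context index is
+ * just an example): llog_get_context() returns a referenced context
+ * that the caller must drop with llog_ctxt_put():
+ *
+ *	struct llog_ctxt *ctxt;
+ *
+ *	ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+ *	if (ctxt != NULL) {
+ *		// ... use ctxt ...
+ *		llog_ctxt_put(ctxt);
+ *	}
+ */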
+
+static inline int llog_group_ctxt_null(struct obd_llog_group *olg, int index)
+{
+ return (olg->olg_ctxts[index] == NULL);
+}
+
+static inline int llog_ctxt_null(struct obd_device *obd, int index)
+{
+ return (llog_group_ctxt_null(&obd->obd_olg, index));
+}
+
+static inline int llog_destroy(const struct lu_env *env,
+ struct llog_handle *handle)
+{
+ struct llog_operations *lop;
+ int rc;
+
+ ENTRY;
+
+ rc = llog_handle2ops(handle, &lop);
+ if (rc)
+ RETURN(rc);
+ if (lop->lop_destroy == NULL)
+ RETURN(-EOPNOTSUPP);
+
+ rc = lop->lop_destroy(env, handle);
+ RETURN(rc);
+}
+
+static inline int llog_next_block(const struct lu_env *env,
+ struct llog_handle *loghandle, int *cur_idx,
+ int next_idx, __u64 *cur_offset, void *buf,
+ int len)
+{
+ struct llog_operations *lop;
+ int rc;
+
+ ENTRY;
+
+ rc = llog_handle2ops(loghandle, &lop);
+ if (rc)
+ RETURN(rc);
+ if (lop->lop_next_block == NULL)
+ RETURN(-EOPNOTSUPP);
+
+ rc = lop->lop_next_block(env, loghandle, cur_idx, next_idx,
+ cur_offset, buf, len);
+ RETURN(rc);
+}
+
+static inline int llog_prev_block(const struct lu_env *env,
+ struct llog_handle *loghandle,
+ int prev_idx, void *buf, int len)
+{
+ struct llog_operations *lop;
+ int rc;
+
+ ENTRY;
+
+ rc = llog_handle2ops(loghandle, &lop);
+ if (rc)
+ RETURN(rc);
+ if (lop->lop_prev_block == NULL)
+ RETURN(-EOPNOTSUPP);
+
+ rc = lop->lop_prev_block(env, loghandle, prev_idx, buf, len);
+ RETURN(rc);
+}
+
+static inline int llog_connect(struct llog_ctxt *ctxt,
+ struct llog_logid *logid, struct llog_gen *gen,
+ struct obd_uuid *uuid)
+{
+ struct llog_operations *lop;
+ int rc;
+
+ ENTRY;
+
+ rc = llog_obd2ops(ctxt, &lop);
+ if (rc)
+ RETURN(rc);
+ if (lop->lop_connect == NULL)
+ RETURN(-EOPNOTSUPP);
+
+ rc = lop->lop_connect(ctxt, logid, gen, uuid);
+ RETURN(rc);
+}
+
+/* llog.c */
+int llog_exist(struct llog_handle *loghandle);
+int llog_declare_create(const struct lu_env *env,
+ struct llog_handle *loghandle, struct thandle *th);
+int llog_create(const struct lu_env *env, struct llog_handle *handle,
+ struct thandle *th);
+int llog_declare_write_rec(const struct lu_env *env,
+ struct llog_handle *handle,
+ struct llog_rec_hdr *rec, int idx,
+ struct thandle *th);
+int llog_write_rec(const struct lu_env *env, struct llog_handle *handle,
+ struct llog_rec_hdr *rec, struct llog_cookie *logcookies,
+ int numcookies, void *buf, int idx, struct thandle *th);
+int llog_add(const struct lu_env *env, struct llog_handle *lgh,
+ struct llog_rec_hdr *rec, struct llog_cookie *logcookies,
+ void *buf, struct thandle *th);
+int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh,
+ struct llog_rec_hdr *rec, struct thandle *th);
+int lustre_process_log(struct super_block *sb, char *logname,
+ struct config_llog_instance *cfg);
+int lustre_end_log(struct super_block *sb, char *logname,
+ struct config_llog_instance *cfg);
+int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt,
+ struct llog_handle **res, struct llog_logid *logid,
+ char *name);
+int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt,
+ struct llog_logid *logid, char *name);
+int llog_write(const struct lu_env *env, struct llog_handle *loghandle,
+ struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+ int cookiecount, void *buf, int idx);
+
+/** @} log */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_mdc.h b/drivers/staging/lustre/lustre/include/lustre_mdc.h
new file mode 100644
index 000000000000..fb1561a809b9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_mdc.h
@@ -0,0 +1,176 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_mdc.h
+ *
+ * MDS data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_MDC_H
+#define _LUSTRE_MDC_H
+
+/** \defgroup mdc mdc
+ *
+ * @{
+ */
+
+# include <linux/fs.h>
+# include <linux/dcache.h>
+# ifdef CONFIG_FS_POSIX_ACL
+# include <linux/posix_acl_xattr.h>
+# endif /* CONFIG_FS_POSIX_ACL */
+# include <linux/lustre_intent.h>
+#include <lustre_handles.h>
+#include <linux/libcfs/libcfs.h>
+#include <obd_class.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_lib.h>
+#include <lustre_dlm.h>
+#include <lustre_export.h>
+
+struct ptlrpc_client;
+struct obd_export;
+struct ptlrpc_request;
+struct obd_device;
+
+struct mdc_rpc_lock {
+ struct mutex rpcl_mutex;
+ struct lookup_intent *rpcl_it;
+ int rpcl_fakes;
+};
+
+#define MDC_FAKE_RPCL_IT ((void *)0x2c0012bfUL)
+
+static inline void mdc_init_rpc_lock(struct mdc_rpc_lock *lck)
+{
+ mutex_init(&lck->rpcl_mutex);
+ lck->rpcl_it = NULL;
+}
+
+static inline void mdc_get_rpc_lock(struct mdc_rpc_lock *lck,
+ struct lookup_intent *it)
+{
+ ENTRY;
+
+ if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP))
+ return;
+
+ /* This would normally block until the existing request finishes.
+ * If fail_loc is set it will block until the regular request is
+ * done, then set rpcl_it to MDC_FAKE_RPCL_IT. Once that is set
+ * it will only be cleared when all fake requests are finished.
+ * Only when all fake requests are finished can normal requests
+ * be sent, to ensure they are recoverable again. */
+ again:
+ mutex_lock(&lck->rpcl_mutex);
+
+ if (CFS_FAIL_CHECK_QUIET(OBD_FAIL_MDC_RPCS_SEM)) {
+ lck->rpcl_it = MDC_FAKE_RPCL_IT;
+ lck->rpcl_fakes++;
+ mutex_unlock(&lck->rpcl_mutex);
+ return;
+ }
+
+ /* This will only happen when the CFS_FAIL_CHECK() was
+ * just turned off but there are still requests in progress.
+ * Wait until they finish. It doesn't need to be efficient
+ * in this extremely rare case, just have low overhead in
+ * the common case when it isn't true. */
+ while (unlikely(lck->rpcl_it == MDC_FAKE_RPCL_IT)) {
+ mutex_unlock(&lck->rpcl_mutex);
+ schedule_timeout(cfs_time_seconds(1) / 4);
+ goto again;
+ }
+
+ LASSERT(lck->rpcl_it == NULL);
+ lck->rpcl_it = it;
+}
+
+static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck,
+ struct lookup_intent *it)
+{
+ if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP))
+ goto out;
+
+ if (lck->rpcl_it == MDC_FAKE_RPCL_IT) { /* OBD_FAIL_MDC_RPCS_SEM */
+ mutex_lock(&lck->rpcl_mutex);
+
+ LASSERTF(lck->rpcl_fakes > 0, "%d\n", lck->rpcl_fakes);
+ lck->rpcl_fakes--;
+
+ if (lck->rpcl_fakes == 0)
+ lck->rpcl_it = NULL;
+
+ } else {
+ LASSERTF(it == lck->rpcl_it, "%p != %p\n", it, lck->rpcl_it);
+ lck->rpcl_it = NULL;
+ }
+
+ mutex_unlock(&lck->rpcl_mutex);
+ out:
+ EXIT;
+}
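+
+/*
+ * Pairing sketch (illustrative only): metadata RPCs are serialized by
+ * bracketing them with the two helpers above, passing the same intent
+ * to both so the LASSERTF in mdc_put_rpc_lock() can verify the pairing
+ * (cl_rpc_lock is assumed to be the client_obd field this lock lives in):
+ *
+ *	mdc_get_rpc_lock(obd->u.cli.cl_rpc_lock, it);
+ *	rc = ptlrpc_queue_wait(req);
+ *	mdc_put_rpc_lock(obd->u.cli.cl_rpc_lock, it);
+ */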
+
+static inline void mdc_update_max_ea_from_body(struct obd_export *exp,
+ struct mdt_body *body)
+{
+ if (body->valid & OBD_MD_FLMODEASIZE) {
+ if (exp->exp_obd->u.cli.cl_max_mds_easize < body->max_mdsize)
+ exp->exp_obd->u.cli.cl_max_mds_easize =
+ body->max_mdsize;
+ if (exp->exp_obd->u.cli.cl_max_mds_cookiesize <
+ body->max_cookiesize)
+ exp->exp_obd->u.cli.cl_max_mds_cookiesize =
+ body->max_cookiesize;
+ }
+}
+
+
+struct mdc_cache_waiter {
+ struct list_head mcw_entry;
+ wait_queue_head_t mcw_waitq;
+};
+
+/* mdc/mdc_locks.c */
+int it_disposition(struct lookup_intent *it, int flag);
+void it_clear_disposition(struct lookup_intent *it, int flag);
+void it_set_disposition(struct lookup_intent *it, int flag);
+int it_open_error(int phase, struct lookup_intent *it);
+
+/** @} mdc */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_mds.h b/drivers/staging/lustre/lustre/include/lustre_mds.h
new file mode 100644
index 000000000000..b386f87471e3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_mds.h
@@ -0,0 +1,81 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_mds.h
+ *
+ * MDS data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_MDS_H
+#define _LUSTRE_MDS_H
+
+/** \defgroup mds mds
+ *
+ * @{
+ */
+
+#include <lustre_handles.h>
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_lib.h>
+#include <lustre_dlm.h>
+#include <lustre_export.h>
+
+struct mds_group_info {
+ struct obd_uuid *uuid;
+ int group;
+};
+
+struct mds_capa_info {
+ struct obd_uuid *uuid;
+ struct lustre_capa_key *capa;
+};
+
+#define MDD_OBD_NAME "mdd_obd"
+#define MDD_OBD_UUID "mdd_obd_uuid"
+
+static inline int md_should_create(__u64 flags)
+{
+ return !(flags & MDS_OPEN_DELAY_CREATE ||
+ !(flags & FMODE_WRITE));
+}
+
+/* these are local flags, used only on the client, private */
+#define M_CHECK_STALE 0200000000
+
+/** @} mds */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_mdt.h b/drivers/staging/lustre/lustre/include/lustre_mdt.h
new file mode 100644
index 000000000000..dba26a6cfa38
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_mdt.h
@@ -0,0 +1,84 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_MDT_H
+#define __LINUX_MDT_H
+
+/** \defgroup mdt mdt
+ *
+ * @{
+ */
+
+#include <lustre/lustre_idl.h>
+#include <lustre_req_layout.h>
+#include <md_object.h>
+#include <dt_object.h>
+#include <linux/libcfs/libcfs.h>
+
+/*
+ * Common thread info for mdt, seq and fld
+ */
+struct com_thread_info {
+ /*
+ * for req-layout interface.
+ */
+ struct req_capsule *cti_pill;
+};
+
+enum {
+ ESERIOUS = 0x0001000
+};
+
+static inline int err_serious(int rc)
+{
+ LASSERT(rc < 0);
+ LASSERT(-rc < ESERIOUS);
+ return -(-rc | ESERIOUS);
+}
+
+static inline int clear_serious(int rc)
+{
+ if (rc < 0)
+ rc = -(-rc & ~ESERIOUS);
+ return rc;
+}
+
+static inline int is_serious(int rc)
+{
+ return (rc < 0 && -rc & ESERIOUS);
+}
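+
+/*
+ * Worked example (illustrative): err_serious() tags a negative errno
+ * with the ESERIOUS bit and clear_serious() strips it again:
+ *
+ *	rc = err_serious(-ENOMEM);	// -(ENOMEM | ESERIOUS)
+ *	is_serious(rc);			// non-zero
+ *	rc = clear_serious(rc);		// back to -ENOMEM
+ *	is_serious(-ENOMEM);		// 0
+ */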
+
+/** @} mdt */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_net.h b/drivers/staging/lustre/lustre/include/lustre_net.h
new file mode 100644
index 000000000000..293dd90e5b6c
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_net.h
@@ -0,0 +1,3451 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/** \defgroup PtlRPC Portal RPC and networking module.
+ *
+ * PortalRPC is the layer used by the rest of the Lustre code to achieve
+ * network communications: establish connections with corresponding export
+ * and import states, listen for a service, send and receive RPCs.
+ * PortalRPC also includes the base recovery framework: packet resending and
+ * replaying, reconnections, and the pinger.
+ *
+ * PortalRPC utilizes LNet as its transport layer.
+ *
+ * @{
+ */
+
+
+#ifndef _LUSTRE_NET_H
+#define _LUSTRE_NET_H
+
+/** \defgroup net net
+ *
+ * @{
+ */
+
+#include <linux/lustre_net.h>
+
+#include <linux/libcfs/libcfs.h>
+// #include <obd.h>
+#include <linux/lnet/lnet.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_ha.h>
+#include <lustre_sec.h>
+#include <lustre_import.h>
+#include <lprocfs_status.h>
+#include <lu_object.h>
+#include <lustre_req_layout.h>
+
+#include <obd_support.h>
+#include <lustre_ver.h>
+
+/* MD flags we _always_ use */
+#define PTLRPC_MD_OPTIONS 0
+
+/**
+ * Max # of bulk operations in one request.
+ * In order for the client and server to properly negotiate the maximum
+ * possible transfer size, PTLRPC_BULK_OPS_COUNT must be a power-of-two
+ * value. The client is free to limit the actual RPC size for any bulk
+ * transfer via cl_max_pages_per_rpc to some non-power-of-two value. */
+#define PTLRPC_BULK_OPS_BITS 2
+#define PTLRPC_BULK_OPS_COUNT (1U << PTLRPC_BULK_OPS_BITS)
+/**
+ * PTLRPC_BULK_OPS_MASK is for the convenience of the client only, and
+ * should not be used on the server at all. Otherwise, it imposes a
+ * protocol limitation on the maximum RPC size that can be used by any
+ * RPC sent to that server in the future. Instead, the server should
+ * use the negotiated per-client ocd_brw_size to determine the bulk
+ * RPC count. */
+#define PTLRPC_BULK_OPS_MASK (~((__u64)PTLRPC_BULK_OPS_COUNT - 1))
+
+/**
+ * Define maxima for bulk I/O.
+ *
+ * A single PTLRPC BRW request is sent via up to PTLRPC_BULK_OPS_COUNT
+ * of LNET_MTU sized RDMA transfers. Clients and servers negotiate the
+ * currently supported maximum between peers at connect via ocd_brw_size.
+ */
+#define PTLRPC_MAX_BRW_BITS (LNET_MTU_BITS + PTLRPC_BULK_OPS_BITS)
+#define PTLRPC_MAX_BRW_SIZE (1 << PTLRPC_MAX_BRW_BITS)
+#define PTLRPC_MAX_BRW_PAGES (PTLRPC_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT)
+
+#define ONE_MB_BRW_SIZE (1 << LNET_MTU_BITS)
+#define MD_MAX_BRW_SIZE (1 << LNET_MTU_BITS)
+#define MD_MAX_BRW_PAGES (MD_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT)
+#define DT_MAX_BRW_SIZE PTLRPC_MAX_BRW_SIZE
+#define DT_MAX_BRW_PAGES (DT_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT)
+#define OFD_MAX_BRW_SIZE (1 << LNET_MTU_BITS)
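+
+/*
+ * Arithmetic sketch (illustrative, assuming LNET_MTU_BITS == 20, i.e.
+ * a 1 MB LNET MTU): PTLRPC_MAX_BRW_BITS = 20 + 2 = 22, so
+ * PTLRPC_MAX_BRW_SIZE is 4 MB and, with 4 KB pages,
+ * PTLRPC_MAX_BRW_PAGES is 1024.
+ */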
+
+/* When PAGE_SIZE is a constant, we can check our arithmetic here with cpp! */
+# if ((PTLRPC_MAX_BRW_PAGES & (PTLRPC_MAX_BRW_PAGES - 1)) != 0)
+# error "PTLRPC_MAX_BRW_PAGES isn't a power of two"
+# endif
+# if (PTLRPC_MAX_BRW_SIZE != (PTLRPC_MAX_BRW_PAGES * PAGE_CACHE_SIZE))
+# error "PTLRPC_MAX_BRW_SIZE isn't PTLRPC_MAX_BRW_PAGES * PAGE_CACHE_SIZE"
+# endif
+# if (PTLRPC_MAX_BRW_SIZE > LNET_MTU * PTLRPC_BULK_OPS_COUNT)
+# error "PTLRPC_MAX_BRW_SIZE too big"
+# endif
+# if (PTLRPC_MAX_BRW_PAGES > LNET_MAX_IOV * PTLRPC_BULK_OPS_COUNT)
+# error "PTLRPC_MAX_BRW_PAGES too big"
+# endif
+
+#define PTLRPC_NTHRS_INIT 2
+
+/**
+ * Buffer Constants
+ *
+ * Constants determine how memory is used to buffer incoming service requests.
+ *
+ * ?_NBUFS # buffers to allocate when growing the pool
+ * ?_BUFSIZE # bytes in a single request buffer
+ * ?_MAXREQSIZE  # maximum request size the service will receive
+ *
+ * When fewer than ?_NBUFS/2 buffers are posted for receive, another chunk
+ * of ?_NBUFS is added to the pool.
+ *
+ * Messages larger than ?_MAXREQSIZE are dropped. Request buffers are
+ * considered full when less than ?_MAXREQSIZE is left in them.
+ */
+/**
+ * Thread Constants
+ *
+ * Constants determine how threads are created for ptlrpc service.
+ *
+ * ?_NTHRS_INIT		# threads to create for each service partition at
+ *			initialization.  If it's a non-affinity service and
+ *			there is only one partition, it's the overall #
+ *			of threads for the service while initializing.
+ * ?_NTHRS_BASE		minimum # of threads to create for each ptlrpc
+ *			partition to keep the service healthy.  It's the
+ *			low-water mark of the per-partition thread
+ *			upper-limit.
+ * ?_THR_FACTOR		# of threads that can be added to the per-partition
+ *			upper-limit for each CPU core.  This factor is only
+ *			for reference; we might decrease it if the number
+ *			of cores per CPT is above a limit.
+ * ?_NTHRS_MAX		overall # of threads that can be created for a
+ *			service.  It's a soft limit: if the service runs
+ *			on a machine with hundreds of cores and tens of
+ *			CPU partitions, we need to guarantee each partition
+ *			has ?_NTHRS_BASE threads, so the total can be
+ *			?_NTHRS_BASE * number_of_cpts, which can exceed
+ *			?_NTHRS_MAX.
+ *
+ * Examples
+ *
+ * #define MDS_NTHRS_INIT 2
+ * #define MDS_NTHRS_BASE 64
+ * #define MDS_NTHRS_FACTOR 8
+ * #define MDS_NTHRS_MAX 1024
+ *
+ * Example 1):
+ * ---------------------------------------------------------------------
+ * Server(A) has 16 cores, user configured it to 4 partitions so each
+ * partition has 4 cores, then actual number of service threads on each
+ * partition is:
+ * MDS_NTHRS_BASE(64) + cores(4) * MDS_NTHRS_FACTOR(8) = 96
+ *
+ * Total number of threads for the service is:
+ * 96 * partitions(4) = 384
+ *
+ * Example 2):
+ * ---------------------------------------------------------------------
+ * Server(B) has 32 cores, user configured it to 4 partitions so each
+ * partition has 8 cores, then actual number of service threads on each
+ * partition is:
+ * MDS_NTHRS_BASE(64) + cores(8) * MDS_NTHRS_FACTOR(8) = 128
+ *
+ * Total number of threads for the service is:
+ * 128 * partitions(4) = 512
+ *
+ * Example 3):
+ * ---------------------------------------------------------------------
+ * Server(B) has 96 cores, user configured it to 8 partitions so each
+ * partition has 12 cores, then actual number of service threads on each
+ * partition is:
+ * MDS_NTHRS_BASE(64) + cores(12) * MDS_NTHRS_FACTOR(8) = 160
+ *
+ * Total number of threads for the service is:
+ * 160 * partitions(8) = 1280
+ *
+ * However, it's above the soft limit MDS_NTHRS_MAX, so we choose this number
+ * as upper limit of threads number for each partition:
+ * MDS_NTHRS_MAX(1024) / partitions(8) = 128
+ *
+ * Example 4):
+ * ---------------------------------------------------------------------
+ * Server(C) has a thousand cores and the user configured it to 32 partitions:
+ * MDS_NTHRS_BASE(64) * 32 = 2048
+ *
+ * which is already above the soft limit MDS_NTHRS_MAX(1024), but we still
+ * need to guarantee that each partition has at least MDS_NTHRS_BASE(64)
+ * threads to keep the service healthy, so the total number of threads will
+ * just be 2048.
+ *
+ * NB: we don't suggest choosing a server with that many cores, because the
+ * backend filesystem itself, the buffer cache, or the underlying network
+ * stack might have SMP scalability issues at that scale.
+ *
+ * If the user already has a fat machine with hundreds or thousands of cores,
+ * there are two configuration choices:
+ * a) create a CPU table from a subset of all CPUs and run Lustre on
+ *    top of this subset
+ * b) bind service threads to a few partitions; see the module parameters
+ *    of MDS and OSS for details
+ *
+ * NB: these calculations (and examples below) are simplified to help
+ * understanding, the real implementation is a little more complex,
+ * please see ptlrpc_server_nthreads_check() for details.
+ *
+ */
+
+ /*
+ * LDLM threads constants:
+ *
+ * Given 8 as factor and 24 as base threads number
+ *
+ * example 1)
+ * On 4-core machine we will have 24 + 8 * 4 = 56 threads.
+ *
+ * example 2)
+ * On 8-core machine with 2 partitions we will have 24 + 4 * 8 = 56
+ * threads for each partition and total threads number will be 112.
+ *
+ * example 3)
+ * On 64-core machine with 8 partitions we will need LDLM_NTHRS_BASE(24)
+ * threads for each partition to keep service healthy, so total threads
+ * number should be 24 * 8 = 192.
+ *
+ * So with these constants, the thread count will be at a similar level
+ * to old versions, unless the target machine has over a hundred cores
+ */
+#define LDLM_THR_FACTOR 8
+#define LDLM_NTHRS_INIT PTLRPC_NTHRS_INIT
+#define LDLM_NTHRS_BASE 24
+#define LDLM_NTHRS_MAX (num_online_cpus() == 1 ? 64 : 128)
+
+#define LDLM_BL_THREADS LDLM_NTHRS_AUTO_INIT
+#define LDLM_CLIENT_NBUFS 1
+#define LDLM_SERVER_NBUFS 64
+#define LDLM_BUFSIZE (8 * 1024)
+#define LDLM_MAXREQSIZE (5 * 1024)
+#define LDLM_MAXREPSIZE (1024)
+
+ /*
+ * MDS threads constants:
+ *
+ * Please see the examples in "Thread Constants"; the MDS thread count will
+ * be comparable to old versions, unless the server has many cores.
+ */
+#ifndef MDS_MAX_THREADS
+#define MDS_MAX_THREADS 1024
+#define MDS_MAX_OTHR_THREADS 256
+
+#else /* MDS_MAX_THREADS */
+#if MDS_MAX_THREADS < PTLRPC_NTHRS_INIT
+#undef MDS_MAX_THREADS
+#define MDS_MAX_THREADS PTLRPC_NTHRS_INIT
+#endif
+#define MDS_MAX_OTHR_THREADS max(PTLRPC_NTHRS_INIT, MDS_MAX_THREADS / 2)
+#endif
+
+/* default service */
+#define MDS_THR_FACTOR 8
+#define MDS_NTHRS_INIT PTLRPC_NTHRS_INIT
+#define MDS_NTHRS_MAX MDS_MAX_THREADS
+#define MDS_NTHRS_BASE min(64, MDS_NTHRS_MAX)
+
+/* read-page service */
+#define MDS_RDPG_THR_FACTOR 4
+#define MDS_RDPG_NTHRS_INIT PTLRPC_NTHRS_INIT
+#define MDS_RDPG_NTHRS_MAX MDS_MAX_OTHR_THREADS
+#define MDS_RDPG_NTHRS_BASE min(48, MDS_RDPG_NTHRS_MAX)
+
+/* these should be removed when we remove setattr service in the future */
+#define MDS_SETA_THR_FACTOR 4
+#define MDS_SETA_NTHRS_INIT PTLRPC_NTHRS_INIT
+#define MDS_SETA_NTHRS_MAX MDS_MAX_OTHR_THREADS
+#define MDS_SETA_NTHRS_BASE min(48, MDS_SETA_NTHRS_MAX)
+
+/* non-affinity threads */
+#define MDS_OTHR_NTHRS_INIT PTLRPC_NTHRS_INIT
+#define MDS_OTHR_NTHRS_MAX MDS_MAX_OTHR_THREADS
+
+#define MDS_NBUFS 64
+
+/**
+ * Assume file name length = FNAME_MAX = 256 (true for ext3).
+ * path name length = PATH_MAX = 4096
+ * LOV MD size max = EA_MAX = 24 * 2000
+ * (NB: 24 is size of lov_ost_data)
+ * LOV LOGCOOKIE size max = 32 * 2000
+ * (NB: 32 is size of llog_cookie)
+ * symlink: FNAME_MAX + PATH_MAX <- largest
+ * link: FNAME_MAX + PATH_MAX (mds_rec_link < mds_rec_create)
+ * rename: FNAME_MAX + FNAME_MAX
+ * open: FNAME_MAX + EA_MAX
+ *
+ * MDS_MAXREQSIZE ~= 4736 bytes =
+ * lustre_msg + ldlm_request + mdt_body + mds_rec_create + FNAME_MAX + PATH_MAX
+ * MDS_MAXREPSIZE ~= 8300 bytes = lustre_msg + llog_header
+ *
+ * Realistic size is about 512 bytes (20 character name + 128 char symlink),
+ * except in the open case where there are a large number of OSTs in a LOV.
+ */
+#define MDS_MAXREQSIZE (5 * 1024) /* >= 4736 */
+#define MDS_MAXREPSIZE (9 * 1024) /* >= 8300 */
+
+/**
+ * MDS incoming request with LOV EA
+ * 24 = sizeof(struct lov_ost_data), i.e: replay of opencreate
+ */
+#define MDS_LOV_MAXREQSIZE max(MDS_MAXREQSIZE, \
+ 362 + LOV_MAX_STRIPE_COUNT * 24)
+/**
+ * MDS outgoing reply with LOV EA
+ *
+ * NB: max reply size Lustre 2.4+ client can get from old MDS is:
+ * LOV_MAX_STRIPE_COUNT * (llog_cookie + lov_ost_data) + extra bytes
+ *
+ * but 2.4 or later MDS will never send reply with llog_cookie to any
+ * version client. This macro is defined for server side reply buffer size.
+ */
+#define MDS_LOV_MAXREPSIZE MDS_LOV_MAXREQSIZE
+
+/**
+ * This is the size of a maximum REINT_SETXATTR request:
+ *
+ * lustre_msg 56 (32 + 4 x 5 + 4)
+ * ptlrpc_body 184
+ * mdt_rec_setxattr 136
+ * lustre_capa 120
+ * name 256 (XATTR_NAME_MAX)
+ * value 65536 (XATTR_SIZE_MAX)
+ */
+#define MDS_EA_MAXREQSIZE 66288
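+/* i.e. 56 + 184 + 136 + 120 + 256 + 65536 = 66288 */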
+
+/**
+ * These are the maximum request and reply sizes (rounded up to 1 KB
+ * boundaries) for the "regular" MDS_REQUEST_PORTAL and MDS_REPLY_PORTAL.
+ */
+#define MDS_REG_MAXREQSIZE (((max(MDS_EA_MAXREQSIZE, \
+ MDS_LOV_MAXREQSIZE) + 1023) >> 10) << 10)
+#define MDS_REG_MAXREPSIZE MDS_REG_MAXREQSIZE
+
+/**
+ * The update request includes all of the updates from the create, which
+ * might include the linkea (4K maximum), together with other updates; we
+ * set it to 9K: lustre_msg + ptlrpc_body + UPDATE_BUF_SIZE (8K).
+ */
+#define MDS_OUT_MAXREQSIZE (9 * 1024)
+#define MDS_OUT_MAXREPSIZE MDS_MAXREPSIZE
+
+/** MDS_BUFSIZE = max_reqsize (w/o LOV EA) + max sptlrpc payload size */
+#define MDS_BUFSIZE max(MDS_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \
+ 8 * 1024)
+
+/**
+ * MDS_REG_BUFSIZE should at least be MDS_REG_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD.
+ * However, we need to allocate a much larger buffer for it because LNet
+ * requires each MD (rqbd) to have at least MDS_REG_MAXREQSIZE bytes left to
+ * avoid dropping a maximum-sized incoming request.  So if MDS_REG_BUFSIZE
+ * were only a little larger than MDS_REG_MAXREQSIZE, it could only fit one
+ * request even with about MDS_REG_MAXREQSIZE bytes left in the rqbd, and
+ * memory utilization would be very low.
+ *
+ * Meanwhile, the rqbd size can't be too large, because an rqbd can't be
+ * reused until all requests that fit in it have been processed and released,
+ * which means one long-blocked request can prevent the rqbd from being
+ * reused.  Now we set the request buffer size to 160 KB, so even if each
+ * rqbd is unlinked from LNet with 65 KB unused, buffer utilization will be
+ * about 59%.  Please check LU-2432 for details.
+ */
+#define MDS_REG_BUFSIZE max(MDS_REG_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \
+ 160 * 1024)
+
+/**
+ * MDS_OUT_BUFSIZE = max_out_reqsize + max sptlrpc payload (~1K), which is
+ * about 10K.  For the same reason as MDS_REG_BUFSIZE, we also give some
+ * extra bytes to each request buffer to improve the buffer utilization rate.
+ */
+#define MDS_OUT_BUFSIZE max(MDS_OUT_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \
+ 24 * 1024)
+
+/** FLD_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc */
+#define FLD_MAXREQSIZE (160)
+
+/** FLD_MAXREPSIZE == lustre_msg + ptlrpc_body */
+#define FLD_MAXREPSIZE (152)
+#define FLD_BUFSIZE (1 << 12)
+
+/**
+ * SEQ_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc + lu_range +
+ * __u32 padding */
+#define SEQ_MAXREQSIZE (160)
+
+/** SEQ_MAXREPSIZE == lustre_msg + ptlrpc_body + lu_range */
+#define SEQ_MAXREPSIZE (152)
+#define SEQ_BUFSIZE (1 << 12)
+
+/** MGS threads must be >= 3, see bug 22458 comment #28 */
+#define MGS_NTHRS_INIT (PTLRPC_NTHRS_INIT + 1)
+#define MGS_NTHRS_MAX 32
+
+#define MGS_NBUFS 64
+#define MGS_BUFSIZE (8 * 1024)
+#define MGS_MAXREQSIZE (7 * 1024)
+#define MGS_MAXREPSIZE (9 * 1024)
+
+ /*
+ * OSS threads constants:
+ *
+ * Given 8 as factor and 64 as base threads number
+ *
+ * example 1):
+ * On 8-core server configured to 2 partitions, we will have
+ * 64 + 8 * 4 = 96 threads for each partition, 192 total threads.
+ *
+ * example 2):
+ * On 32-core machine configured to 4 partitions, we will have
+ * 64 + 8 * 8 = 112 threads for each partition, so total threads number
+ * will be 112 * 4 = 448.
+ *
+ * example 3):
+ * On 64-core machine configured to 4 partitions, we will have
+ * 64 + 16 * 8 = 192 threads for each partition, so total threads number
+ * will be 192 * 4 = 768 which is above limit OSS_NTHRS_MAX(512), so we
+ * cut off the value to OSS_NTHRS_MAX(512) / 4 which is 128 threads
+ * for each partition.
+ *
+ * So we can see that with these constants, the thread count will be at a
+ * similar level to old versions, unless the server has many cores.
+ */
+ /* depress threads factor for VM with small memory size */
+#define OSS_THR_FACTOR min_t(int, 8, \
+ NUM_CACHEPAGES >> (28 - PAGE_CACHE_SHIFT))
+#define OSS_NTHRS_INIT (PTLRPC_NTHRS_INIT + 1)
+#define OSS_NTHRS_BASE 64
+#define OSS_NTHRS_MAX 512
+
+/* threads for handling "create" request */
+#define OSS_CR_THR_FACTOR 1
+#define OSS_CR_NTHRS_INIT PTLRPC_NTHRS_INIT
+#define OSS_CR_NTHRS_BASE 8
+#define OSS_CR_NTHRS_MAX 64
+
+/**
+ * OST_IO_MAXREQSIZE ~=
+ * lustre_msg + ptlrpc_body + obdo + obd_ioobj +
+ * DT_MAX_BRW_PAGES * niobuf_remote
+ *
+ * - single object with 16 pages is 512 bytes
+ * - OST_IO_MAXREQSIZE must be at least 1 page of cookies plus some spillover
+ * - Must be a multiple of 1024
+ * - actual size is about 18K
+ */
+#define _OST_MAXREQSIZE_SUM (sizeof(struct lustre_msg) + \
+ sizeof(struct ptlrpc_body) + \
+ sizeof(struct obdo) + \
+ sizeof(struct obd_ioobj) + \
+ sizeof(struct niobuf_remote) * DT_MAX_BRW_PAGES)
+/**
+ * FIEMAP request can be 4K+ for now
+ */
+#define OST_MAXREQSIZE (5 * 1024)
+#define OST_IO_MAXREQSIZE max_t(int, OST_MAXREQSIZE, \
+ (((_OST_MAXREQSIZE_SUM - 1) | (1024 - 1)) + 1))
+
+#define OST_MAXREPSIZE (9 * 1024)
+#define OST_IO_MAXREPSIZE OST_MAXREPSIZE
+
+#define OST_NBUFS 64
+/** OST_BUFSIZE = max_reqsize + max sptlrpc payload size */
+#define OST_BUFSIZE max_t(int, OST_MAXREQSIZE + 1024, 16 * 1024)
+/**
+ * OST_IO_MAXREQSIZE is 18K, so giving an extra 46K increases the buffer
+ * utilization rate of the request buffer; please check the comment of
+ * MDS_REG_BUFSIZE for details.
+ */
+#define OST_IO_BUFSIZE max_t(int, OST_IO_MAXREQSIZE + 1024, 64 * 1024)
+
+/* Macro to hide a typecast. */
+#define ptlrpc_req_async_args(req) ((void *)&req->rq_async_args)
+
+/**
+ * Structure to single define portal connection.
+ */
+struct ptlrpc_connection {
+ /** linkage for connections hash table */
+ struct hlist_node c_hash;
+ /** Our own lnet nid for this connection */
+ lnet_nid_t c_self;
+ /** Remote side nid for this connection */
+ lnet_process_id_t c_peer;
+ /** UUID of the other side */
+ struct obd_uuid c_remote_uuid;
+ /** reference counter for this connection */
+ atomic_t c_refcount;
+};
+
+/** Client definition for PortalRPC */
+struct ptlrpc_client {
+ /** What lnet portal does this client send messages to by default */
+ __u32 cli_request_portal;
+ /** What portal do we expect replies on */
+ __u32 cli_reply_portal;
+ /** Name of the client */
+ char *cli_name;
+};
+
+/** state flags of requests */
+/* XXX only ones left are those used by the bulk descs as well! */
+#define PTL_RPC_FL_INTR (1 << 0) /* reply wait was interrupted by user */
+#define PTL_RPC_FL_TIMEOUT (1 << 7) /* request timed out waiting for reply */
+
+#define REQ_MAX_ACK_LOCKS 8
+
+union ptlrpc_async_args {
+ /**
+ * Scratchpad for passing args to completion interpreter. Users
+ * cast to the struct of their choosing, and CLASSERT that this is
+ * big enough. For _tons_ of context, OBD_ALLOC a struct and store
+ * a pointer to it here. The pointer_arg ensures this struct is at
+ * least big enough for that.
+ */
+ void *pointer_arg[11];
+ __u64 space[7];
+};
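+
+/*
+ * Usage sketch (illustrative only; my_async_args is hypothetical): an
+ * RPC interpreter keeps its context in the request's async-args
+ * scratchpad via ptlrpc_req_async_args(), asserting at compile time
+ * that it fits:
+ *
+ *	struct my_async_args {
+ *		struct obd_export *aa_exp;
+ *		int                aa_flags;
+ *	};
+ *
+ *	CLASSERT(sizeof(struct my_async_args) <=
+ *		 sizeof(union ptlrpc_async_args));
+ *	struct my_async_args *aa = ptlrpc_req_async_args(req);
+ */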
+
+struct ptlrpc_request_set;
+typedef int (*set_interpreter_func)(struct ptlrpc_request_set *, void *, int);
+typedef int (*set_producer_func)(struct ptlrpc_request_set *, void *);
+
+/**
+ * Definition of request set structure.
+ * A request set is a list of requests (not necessarily to the same target)
+ * that, once populated with RPCs, can be sent in parallel.
+ * There are two kinds of request sets: general purpose, and those with a
+ * dedicated serving thread, e.g. the ptlrpcd set.
+ * For general purpose sets, once the set has started sending, it is
+ * impossible to add new requests to it.
+ * Provides a way to call "completion callbacks" when all requests in the
+ * set have returned.
+ */
+struct ptlrpc_request_set {
+ atomic_t set_refcount;
+ /** number of in queue requests */
+ atomic_t set_new_count;
+ /** number of uncompleted requests */
+ atomic_t set_remaining;
+ /** wait queue to wait on for request events */
+ wait_queue_head_t set_waitq;
+ wait_queue_head_t *set_wakeup_ptr;
+ /** List of requests in the set */
+ struct list_head set_requests;
+ /**
+ * List of completion callbacks to be called when the set is completed
+ * This is only used if \a set_interpret is NULL.
+ * Links struct ptlrpc_set_cbdata.
+ */
+ struct list_head set_cblist;
+ /** Completion callback, if only one. */
+ set_interpreter_func set_interpret;
+	/** opaque argument passed to completion \a set_interpret callback. */
+ void *set_arg;
+ /**
+ * Lock for \a set_new_requests manipulations
+ * locked so that any old caller can communicate requests to
+ * the set holder who can then fold them into the lock-free set
+ */
+ spinlock_t set_new_req_lock;
+ /** List of new yet unsent requests. Only used with ptlrpcd now. */
+ struct list_head set_new_requests;
+
+ /** rq_status of requests that have been freed already */
+ int set_rc;
+ /** Additional fields used by the flow control extension */
+ /** Maximum number of RPCs in flight */
+ int set_max_inflight;
+ /** Callback function used to generate RPCs */
+ set_producer_func set_producer;
+ /** opaque argument passed to the producer callback */
+ void *set_producer_arg;
+};
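+
+/*
+ * Minimal usage sketch for a general purpose set, assuming the standard set
+ * helpers exported by the ptlrpc layer (ptlrpc_prep_set(),
+ * ptlrpc_set_add_req(), ptlrpc_set_wait(), ptlrpc_set_destroy()):
+ *
+ *	set = ptlrpc_prep_set();
+ *	if (set == NULL)
+ *		return -ENOMEM;
+ *	// add one or more prepared requests, then send them in parallel
+ *	ptlrpc_set_add_req(set, req);
+ *	rc = ptlrpc_set_wait(set);
+ *	ptlrpc_set_destroy(set);
+ */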
+
+/**
+ * Description of a single ptlrpc_set callback
+ */
+struct ptlrpc_set_cbdata {
+ /** List linkage item */
+ struct list_head psc_item;
+ /** Pointer to interpreting function */
+ set_interpreter_func psc_interpret;
+ /** Opaque argument to pass to the callback */
+ void *psc_data;
+};
+
+struct ptlrpc_bulk_desc;
+struct ptlrpc_service_part;
+struct ptlrpc_service;
+
+/**
+ * ptlrpc callback & work item stuff
+ */
+struct ptlrpc_cb_id {
+ void (*cbid_fn)(lnet_event_t *ev); /* specific callback fn */
+ void *cbid_arg; /* additional arg */
+};
+
+/** Maximum number of locks to fit into reply state */
+#define RS_MAX_LOCKS 8
+#define RS_DEBUG 0
+
+/**
+ * Structure to define reply state on the server
+ * Reply state holds various reply message information. Also for "difficult"
+ * replies (rep-ack case) we store the state after sending reply and wait
+ * for the client to acknowledge the reception. In these cases locks could be
+ * added to the state for replay/failover consistency guarantees.
+ */
+struct ptlrpc_reply_state {
+ /** Callback description */
+ struct ptlrpc_cb_id rs_cb_id;
+ /** Linkage for list of all reply states in a system */
+ struct list_head rs_list;
+ /** Linkage for list of all reply states on same export */
+ struct list_head rs_exp_list;
+ /** Linkage for list of all reply states for same obd */
+ struct list_head rs_obd_list;
+#if RS_DEBUG
+ struct list_head rs_debug_list;
+#endif
+ /** A spinlock to protect the reply state flags */
+ spinlock_t rs_lock;
+ /** Reply state flags */
+ unsigned long rs_difficult:1; /* ACK/commit stuff */
+ unsigned long rs_no_ack:1; /* no ACK, even for
+ difficult requests */
+ unsigned long rs_scheduled:1; /* being handled? */
+ unsigned long rs_scheduled_ever:1;/* any schedule attempts? */
+ unsigned long rs_handled:1; /* been handled yet? */
+ unsigned long rs_on_net:1; /* reply_out_callback pending? */
+ unsigned long rs_prealloc:1; /* rs from prealloc list */
+ unsigned long rs_committed:1;/* the transaction was committed
+ and the rs was dispatched
+ by ptlrpc_commit_replies */
+ /** Size of the state */
+ int rs_size;
+ /** opcode */
+ __u32 rs_opc;
+ /** Transaction number */
+ __u64 rs_transno;
+ /** xid */
+ __u64 rs_xid;
+ struct obd_export *rs_export;
+ struct ptlrpc_service_part *rs_svcpt;
+ /** Lnet metadata handle for the reply */
+ lnet_handle_md_t rs_md_h;
+ atomic_t rs_refcount;
+
+ /** Context for the service thread */
+ struct ptlrpc_svc_ctx *rs_svc_ctx;
+ /** Reply buffer (actually sent to the client), encoded if needed */
+ struct lustre_msg *rs_repbuf; /* wrapper */
+ /** Size of the reply buffer */
+ int rs_repbuf_len; /* wrapper buf length */
+ /** Size of the reply message */
+ int rs_repdata_len; /* wrapper msg length */
+ /**
+ * Actual reply message. Its content is encrypted (if needed) to
+ * produce the reply buffer for actual sending. In the simple case
+ * of no network encryption we just set \a rs_repbuf to \a rs_msg
+ */
+ struct lustre_msg *rs_msg; /* reply message */
+
+ /** Number of locks awaiting client ACK */
+ int rs_nlocks;
+ /** Handles of locks awaiting client reply ACK */
+ struct lustre_handle rs_locks[RS_MAX_LOCKS];
+ /** Lock modes of locks in \a rs_locks */
+ ldlm_mode_t rs_modes[RS_MAX_LOCKS];
+};
+
+struct ptlrpc_thread;
+
+/** RPC stages */
+enum rq_phase {
+ RQ_PHASE_NEW = 0xebc0de00,
+ RQ_PHASE_RPC = 0xebc0de01,
+ RQ_PHASE_BULK = 0xebc0de02,
+ RQ_PHASE_INTERPRET = 0xebc0de03,
+ RQ_PHASE_COMPLETE = 0xebc0de04,
+ RQ_PHASE_UNREGISTERING = 0xebc0de05,
+ RQ_PHASE_UNDEFINED = 0xebc0de06
+};
+
+/** Type of request interpreter call-back */
+typedef int (*ptlrpc_interpterer_t)(const struct lu_env *env,
+ struct ptlrpc_request *req,
+ void *arg, int rc);
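+
+/*
+ * Sketch of a completion interpreter matching ptlrpc_interpterer_t; it is
+ * wired up through ptlrpc_request::rq_interpret_reply and invoked via
+ * ptlrpc_req_interpret() further below. The my_* names are hypothetical.
+ *
+ *	static int my_interpret(const struct lu_env *env,
+ *				struct ptlrpc_request *req, void *args, int rc)
+ *	{
+ *		struct my_async_args *aa = args;
+ *
+ *		if (rc == 0)
+ *			rc = my_handle_reply(req, aa);
+ *		return rc;
+ *	}
+ *
+ *	req->rq_interpret_reply = my_interpret;
+ */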
+
+/**
+ * Definition of request pool structure.
+ * The pool is used to store empty preallocated requests for the case
+ * when we would actually need to send something without performing
+ * any allocations (to avoid e.g. OOM).
+ */
+struct ptlrpc_request_pool {
+ /** Locks the list */
+ spinlock_t prp_lock;
+ /** list of ptlrpc_request structs */
+ struct list_head prp_req_list;
+ /** Maximum message size that would fit into a request from this pool */
+ int prp_rq_size;
+ /** Function to allocate more requests for this pool */
+ void (*prp_populate)(struct ptlrpc_request_pool *, int);
+};
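+
+/*
+ * Illustrative sketch, assuming the pool constructor exported by the ptlrpc
+ * layer, ptlrpc_init_rq_pool(); the populate callback is stored in
+ * \a prp_populate and called to refill the pool:
+ *
+ *	pool = ptlrpc_init_rq_pool(num_rq, msgsize, populate_fn);
+ */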
+
+struct lu_context;
+struct lu_env;
+
+struct ldlm_lock;
+
+/**
+ * \defgroup nrs Network Request Scheduler
+ * @{
+ */
+struct ptlrpc_nrs_policy;
+struct ptlrpc_nrs_resource;
+struct ptlrpc_nrs_request;
+
+/**
+ * NRS control operations.
+ *
+ * These are common for all policies.
+ */
+enum ptlrpc_nrs_ctl {
+ /**
+ * Not a valid opcode.
+ */
+ PTLRPC_NRS_CTL_INVALID,
+ /**
+ * Activate the policy.
+ */
+ PTLRPC_NRS_CTL_START,
+ /**
+ * Reserved for multiple primary policies, which may be a possibility
+ * in the future.
+ */
+ PTLRPC_NRS_CTL_STOP,
+ /**
+ * Policies can start using opcodes from this value and onwards for
+ * their own purposes; the assigned value itself is arbitrary.
+ */
+ PTLRPC_NRS_CTL_1ST_POL_SPEC = 0x20,
+};
+
+/**
+ * ORR policy operations
+ */
+enum nrs_ctl_orr {
+ NRS_CTL_ORR_RD_QUANTUM = PTLRPC_NRS_CTL_1ST_POL_SPEC,
+ NRS_CTL_ORR_WR_QUANTUM,
+ NRS_CTL_ORR_RD_OFF_TYPE,
+ NRS_CTL_ORR_WR_OFF_TYPE,
+ NRS_CTL_ORR_RD_SUPP_REQ,
+ NRS_CTL_ORR_WR_SUPP_REQ,
+};
+
+/**
+ * NRS policy operations.
+ *
+ * These determine the behaviour of a policy, and are called in response to
+ * NRS core events.
+ */
+struct ptlrpc_nrs_pol_ops {
+ /**
+ * Called during policy registration; this operation is optional.
+ *
+ * \param[in,out] policy The policy being initialized
+ */
+ int (*op_policy_init) (struct ptlrpc_nrs_policy *policy);
+ /**
+ * Called during policy unregistration; this operation is optional.
+ *
+ * \param[in,out] policy The policy being unregistered/finalized
+ */
+ void (*op_policy_fini) (struct ptlrpc_nrs_policy *policy);
+ /**
+ * Called when activating a policy via lprocfs; policies allocate and
+ * initialize their resources here; this operation is optional.
+ *
+ * \param[in,out] policy The policy being started
+ *
+ * \see nrs_policy_start_locked()
+ */
+ int (*op_policy_start) (struct ptlrpc_nrs_policy *policy);
+ /**
+ * Called when deactivating a policy via lprocfs; policies deallocate
+ * their resources here; this operation is optional
+ *
+ * \param[in,out] policy The policy being stopped
+ *
+ * \see nrs_policy_stop0()
+ */
+ void (*op_policy_stop) (struct ptlrpc_nrs_policy *policy);
+ /**
+ * Used for policy-specific operations; i.e. not generic ones like
+ * \e PTLRPC_NRS_CTL_START and \e PTLRPC_NRS_CTL_GET_INFO; analogous
+ * to an ioctl; this operation is optional.
+ *
+ * \param[in,out] policy The policy carrying out operation \a opc
+ * \param[in] opc The command operation being carried out
+ * \param[in,out] arg A generic buffer for communication between the
+ * user and the control operation
+ *
+ * \retval -ve error
+ * \retval 0 success
+ *
+ * \see ptlrpc_nrs_policy_control()
+ */
+ int (*op_policy_ctl) (struct ptlrpc_nrs_policy *policy,
+ enum ptlrpc_nrs_ctl opc, void *arg);
+
+ /**
+ * Called when obtaining references to the resources of the resource
+ * hierarchy for a request that has arrived for handling at the PTLRPC
+ * service. Policies should return -ve for requests they do not wish
+ * to handle. This operation is mandatory.
+ *
+ * \param[in,out] policy The policy we're getting resources for.
+ * \param[in,out] nrq The request we are getting resources for.
+ * \param[in] parent The parent resource of the resource being
+ * requested; set to NULL if none.
+ * \param[out] resp The resource is to be returned here; the
+ * fallback policy in an NRS head should
+ * \e always return a non-NULL pointer value.
+ * \param[in] moving_req When set, signifies that this is an attempt
+ * to obtain resources for a request being moved
+ * to the high-priority NRS head by
+ * ldlm_lock_reorder_req().
+ * This implies two things:
+ * 1. We are under obd_export::exp_rpc_lock and
+ * so should not sleep.
+ * 2. We should not perform non-idempotent
+ * operations, and may skip idempotent
+ * operations that were already carried out
+ * when resources were first taken for the
+ * request in ptlrpc_nrs_req_initialize().
+ *
+ * \retval 0, +ve The level of the returned resource in the resource
+ * hierarchy; currently only 0 (for a non-leaf resource)
+ * and 1 (for a leaf resource) are supported by the
+ * framework.
+ * \retval -ve error
+ *
+ * \see ptlrpc_nrs_req_initialize()
+ * \see ptlrpc_nrs_hpreq_add_nolock()
+ * \see ptlrpc_nrs_req_hp_move()
+ */
+ int (*op_res_get) (struct ptlrpc_nrs_policy *policy,
+ struct ptlrpc_nrs_request *nrq,
+ const struct ptlrpc_nrs_resource *parent,
+ struct ptlrpc_nrs_resource **resp,
+ bool moving_req);
+ /**
+ * Called when releasing references taken for resources in the resource
+ * hierarchy for the request; this operation is optional.
+ *
+ * \param[in,out] policy The policy the resource belongs to
+ * \param[in] res The resource to be freed
+ *
+ * \see ptlrpc_nrs_req_finalize()
+ * \see ptlrpc_nrs_hpreq_add_nolock()
+ * \see ptlrpc_nrs_req_hp_move()
+ */
+ void (*op_res_put) (struct ptlrpc_nrs_policy *policy,
+ const struct ptlrpc_nrs_resource *res);
+
+ /**
+ * Obtains a request for handling from the policy, and optionally
+ * removes the request from the policy; this operation is mandatory.
+ *
+ * \param[in,out] policy The policy to poll
+ * \param[in] peek When set, signifies that we just want to
+ * examine the request, and not handle it, so the
+ * request is not removed from the policy.
+ * \param[in] force When set, it will force a policy to return a
+ * request if it has one queued.
+ *
+ * \retval NULL No request available for handling
+ * \retval valid-pointer The request polled for handling
+ *
+ * \see ptlrpc_nrs_req_get_nolock()
+ */
+ struct ptlrpc_nrs_request *
+ (*op_req_get) (struct ptlrpc_nrs_policy *policy, bool peek,
+ bool force);
+ /**
+ * Called when attempting to add a request to a policy for later
+ * handling; this operation is mandatory.
+ *
+ * \param[in,out] policy The policy on which to enqueue \a nrq
+ * \param[in,out] nrq The request to enqueue
+ *
+ * \retval 0 success
+ * \retval != 0 error
+ *
+ * \see ptlrpc_nrs_req_add_nolock()
+ */
+ int (*op_req_enqueue) (struct ptlrpc_nrs_policy *policy,
+ struct ptlrpc_nrs_request *nrq);
+ /**
+ * Removes a request from the policy's set of pending requests. Normally
+ * called after a request has been polled successfully from the policy
+ * for handling; this operation is mandatory.
+ *
+ * \param[in,out] policy The policy the request \a nrq belongs to
+ * \param[in,out] nrq The request to dequeue
+ *
+ * \see ptlrpc_nrs_req_del_nolock()
+ */
+ void (*op_req_dequeue) (struct ptlrpc_nrs_policy *policy,
+ struct ptlrpc_nrs_request *nrq);
+ /**
+ * Called after the request has been handled. Could be used for
+ * job/resource control; this operation is optional.
+ *
+ * \param[in,out] policy The policy that is finishing handling of request
+ * \a nrq
+ * \param[in,out] nrq The request
+ *
+ * \pre spin_is_locked(&svcpt->scp_req_lock)
+ *
+ * \see ptlrpc_nrs_req_stop_nolock()
+ */
+ void (*op_req_stop) (struct ptlrpc_nrs_policy *policy,
+ struct ptlrpc_nrs_request *nrq);
+ /**
+ * Registers the policy's lprocfs interface with a PTLRPC service.
+ *
+ * \param[in] svc The service
+ *
+ * \retval 0 success
+ * \retval != 0 error
+ */
+ int (*op_lprocfs_init) (struct ptlrpc_service *svc);
+ /**
+ * Unregisters the policy's lprocfs interface from a PTLRPC service.
+ *
+ * In cases of failed policy registration in
+ * \e ptlrpc_nrs_policy_register(), this function may be called for a
+ * service which has not registered the policy successfully, so
+ * implementations of this method should make sure their operations are
+ * safe in such cases.
+ *
+ * \param[in] svc The service
+ */
+ void (*op_lprocfs_fini) (struct ptlrpc_service *svc);
+};
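+
+/*
+ * Illustrative sketch of an ops table for a simple policy; only op_res_get,
+ * op_req_get, op_req_enqueue and op_req_dequeue are mandatory, as noted in
+ * the comments above. The nrs_foo_* handlers are hypothetical.
+ *
+ *	static const struct ptlrpc_nrs_pol_ops nrs_foo_ops = {
+ *		.op_res_get	= nrs_foo_res_get,
+ *		.op_res_put	= nrs_foo_res_put,
+ *		.op_req_get	= nrs_foo_req_get,
+ *		.op_req_enqueue	= nrs_foo_req_add,
+ *		.op_req_dequeue	= nrs_foo_req_del,
+ *	};
+ */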
+
+/**
+ * Policy flags
+ */
+enum nrs_policy_flags {
+ /**
+ * Fallback policy, use this flag only on a single supported policy per
+ * service. The flag cannot be used on policies that use
+ * \e PTLRPC_NRS_FL_REG_EXTERN
+ */
+ PTLRPC_NRS_FL_FALLBACK = (1 << 0),
+ /**
+ * Start policy immediately after registering.
+ */
+ PTLRPC_NRS_FL_REG_START = (1 << 1),
+ /**
+ * This is a policy registering from a module other than the one the
+ * NRS core ships in (currently ptlrpc).
+ */
+ PTLRPC_NRS_FL_REG_EXTERN = (1 << 2),
+};
+
+/**
+ * NRS queue type.
+ *
+ * Denotes whether an NRS instance is for handling normal or high-priority
+ * RPCs, or whether an operation pertains to one or both of the NRS instances
+ * in a service.
+ */
+enum ptlrpc_nrs_queue_type {
+ PTLRPC_NRS_QUEUE_REG = (1 << 0),
+ PTLRPC_NRS_QUEUE_HP = (1 << 1),
+ PTLRPC_NRS_QUEUE_BOTH = (PTLRPC_NRS_QUEUE_REG | PTLRPC_NRS_QUEUE_HP)
+};
+
+/**
+ * NRS head
+ *
+ * A PTLRPC service has at least one NRS head instance for handling normal
+ * priority RPCs, and may optionally have a second NRS head instance for
+ * handling high-priority RPCs. Each NRS head maintains a list of available
+ * policies, of which one and only one policy is acting as the fallback policy,
+ * and optionally a different policy may be acting as the primary policy. For
+ * all RPCs handled by this NRS head instance, NRS core will first attempt to
+ * enqueue the RPC using the primary policy (if any). The fallback policy is
+ * used in the following cases:
+ * - when there was no primary policy in the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the request
+ * was initialized.
+ * - when the primary policy that was at the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the
+ * RPC was initialized, indicated that it did not wish, or for some other
+ * reason was not able, to handle the request, by returning a non-valid
+ * NRS resource reference.
+ * - when the primary policy that was at the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the
+ * RPC was initialized, fails later during the request enqueueing stage.
+ *
+ * \see nrs_resource_get_safe()
+ * \see nrs_request_enqueue()
+ */
+struct ptlrpc_nrs {
+ spinlock_t nrs_lock;
+ /** XXX Possibly replace svcpt->scp_req_lock with another lock here. */
+ /**
+ * List of registered policies
+ */
+ struct list_head nrs_policy_list;
+ /**
+ * List of policies with queued requests. Policies that have any
+ * outstanding requests are queued here, and this list is queried
+ * in a round-robin manner from NRS core when obtaining a request
+ * for handling. This ensures that requests from policies that at some
+ * point transition away from the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state are drained.
+ */
+ struct list_head nrs_policy_queued;
+ /**
+ * Service partition for this NRS head
+ */
+ struct ptlrpc_service_part *nrs_svcpt;
+ /**
+ * Primary policy, which is the preferred policy for handling RPCs
+ */
+ struct ptlrpc_nrs_policy *nrs_policy_primary;
+ /**
+ * Fallback policy, which is the backup policy for handling RPCs
+ */
+ struct ptlrpc_nrs_policy *nrs_policy_fallback;
+ /**
+ * This NRS head handles either HP or regular requests
+ */
+ enum ptlrpc_nrs_queue_type nrs_queue_type;
+ /**
+ * # queued requests from all policies in this NRS head
+ */
+ unsigned long nrs_req_queued;
+ /**
+ * # scheduled requests from all policies in this NRS head
+ */
+ unsigned long nrs_req_started;
+ /**
+ * # policies on this NRS
+ */
+ unsigned nrs_num_pols;
+ /**
+ * This NRS head is in progress of starting a policy
+ */
+ unsigned nrs_policy_starting:1;
+ /**
+ * In progress of shutting down the whole NRS head; used during
+ * unregistration
+ */
+ unsigned nrs_stopping:1;
+};
+
+#define NRS_POL_NAME_MAX 16
+
+struct ptlrpc_nrs_pol_desc;
+
+/**
+ * Service compatibility predicate; this determines whether a policy is adequate
+ * for handling RPCs of a particular PTLRPC service.
+ *
+ * XXX: This should give the same result during policy registration and
+ * unregistration, and for all partitions of a service; so the result should
+ * not depend on temporal service properties or other state that may change.
+ */
+typedef bool (*nrs_pol_desc_compat_t) (const struct ptlrpc_service *svc,
+ const struct ptlrpc_nrs_pol_desc *desc);
+
+struct ptlrpc_nrs_pol_conf {
+ /**
+ * Human-readable policy name
+ */
+ char nc_name[NRS_POL_NAME_MAX];
+ /**
+ * NRS operations for this policy
+ */
+ const struct ptlrpc_nrs_pol_ops *nc_ops;
+ /**
+ * Service compatibility predicate
+ */
+ nrs_pol_desc_compat_t nc_compat;
+ /**
+ * Set for policies that support a single ptlrpc service, i.e. ones that
+ * have \a nc_compat set to nrs_policy_compat_one(). The variable value
+ * depicts the name of the single service that such policies are
+ * compatible with.
+ */
+ const char *nc_compat_svc_name;
+ /**
+ * Owner module for this policy descriptor; policies registering from a
+ * module other than the one the NRS framework is held within
+ * (currently ptlrpc) should set this field to THIS_MODULE.
+ */
+ module_t *nc_owner;
+ /**
+ * Policy registration flags; a bitmask of \e nrs_policy_flags
+ */
+ unsigned nc_flags;
+};
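+
+/*
+ * Illustrative registration sketch for an external policy module, assuming
+ * a generic compatibility predicate such as nrs_policy_compat_all(); the
+ * "foo" names and the predicate choice are hypothetical here.
+ * ptlrpc_nrs_policy_register() is declared later in this header.
+ *
+ *	static struct ptlrpc_nrs_pol_conf nrs_conf_foo = {
+ *		.nc_name	= "foo",
+ *		.nc_ops		= &nrs_foo_ops,
+ *		.nc_compat	= nrs_policy_compat_all,
+ *		.nc_owner	= THIS_MODULE,
+ *		.nc_flags	= PTLRPC_NRS_FL_REG_EXTERN,
+ *	};
+ *
+ *	rc = ptlrpc_nrs_policy_register(&nrs_conf_foo);
+ */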
+
+/**
+ * NRS policy registering descriptor
+ *
+ * Used to hold a description of a policy that can be passed to NRS core in
+ * order to register the policy with NRS heads in different PTLRPC services.
+ */
+struct ptlrpc_nrs_pol_desc {
+ /**
+ * Human-readable policy name
+ */
+ char pd_name[NRS_POL_NAME_MAX];
+ /**
+ * Link into nrs_core::nrs_policies
+ */
+ struct list_head pd_list;
+ /**
+ * NRS operations for this policy
+ */
+ const struct ptlrpc_nrs_pol_ops *pd_ops;
+ /**
+ * Service compatibility predicate
+ */
+ nrs_pol_desc_compat_t pd_compat;
+ /**
+ * Set for policies that are compatible with only one PTLRPC service.
+ *
+ * \see ptlrpc_nrs_pol_conf::nc_compat_svc_name
+ */
+ const char *pd_compat_svc_name;
+ /**
+ * Owner module for this policy descriptor.
+ *
+ * We need to hold a reference to the module whenever we might make use
+ * of any of the module's contents, i.e.
+ * - If one or more instances of the policy are at a state where they
+ * might be handling a request, i.e.
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING as we will have to
+ * call into the policy's ptlrpc_nrs_pol_ops() handlers. A reference
+ * is taken on the module when
+ * \e ptlrpc_nrs_pol_desc::pd_refs becomes 1, and released when it
+ * becomes 0, so that we hold only one reference to the module maximum
+ * at any time.
+ *
+ * We do not need to hold a reference to the module, even though we
+ * might use code and data from the module, in the following cases:
+ * - During external policy registration, because this should happen in
+ * the module's init() function, in which case the module is safe from
+ * removal because a reference is being held on the module by the
+ * kernel, and kmod (and module-init-tools) will serialize any racing
+ * processes properly anyway.
+ * - During external policy unregistration, because this should happen
+ * in a module's exit() function, and any attempts to start a policy
+ * instance would need to take a reference on the module, and this is
+ * not possible once we have reached the point where the exit()
+ * handler is called.
+ * - During service registration and unregistration, as service setup
+ * and cleanup, and policy registration, unregistration and policy
+ * instance starting, are serialized by \e nrs_core::nrs_mutex, so
+ * as long as users adhere to the convention of registering policies
+ * in init() and unregistering them in module exit() functions, there
+ * should not be a race between these operations.
+ * - During any policy-specific lprocfs operations, because a reference
+ * is held by the kernel on a proc entry that has been entered by a
+ * syscall, so as long as proc entries are removed at unregistration
+ * time, unregistration and lprocfs operations will be properly
+ * serialized.
+ */
+ module_t *pd_owner;
+ /**
+ * Bitmask of \e nrs_policy_flags
+ */
+ unsigned pd_flags;
+ /**
+ * # of references on this descriptor
+ */
+ atomic_t pd_refs;
+};
+
+/**
+ * NRS policy state
+ *
+ * Policies transition from one state to the other during their lifetime
+ */
+enum ptlrpc_nrs_pol_state {
+ /**
+ * Not a valid policy state.
+ */
+ NRS_POL_STATE_INVALID,
+ /**
+ * Policies are at this state either at the start of their life, or
+ * transition here when the user selects a different policy to act
+ * as the primary one.
+ */
+ NRS_POL_STATE_STOPPED,
+ /**
+ * Policy is in the process of stopping
+ */
+ NRS_POL_STATE_STOPPING,
+ /**
+ * Policy is in the process of starting
+ */
+ NRS_POL_STATE_STARTING,
+ /**
+ * A policy is in this state in two cases:
+ * - it is the fallback policy, which is always in this state.
+ * - it has been activated by the user; i.e. it is the primary policy.
+ */
+ NRS_POL_STATE_STARTED,
+};
+
+/**
+ * NRS policy information
+ *
+ * Used for obtaining information for the status of a policy via lprocfs
+ */
+struct ptlrpc_nrs_pol_info {
+ /**
+ * Policy name
+ */
+ char pi_name[NRS_POL_NAME_MAX];
+ /**
+ * Current policy state
+ */
+ enum ptlrpc_nrs_pol_state pi_state;
+ /**
+ * # RPCs enqueued for later dispatching by the policy
+ */
+ long pi_req_queued;
+ /**
+ * # RPCs started for dispatch by the policy
+ */
+ long pi_req_started;
+ /**
+ * Is this a fallback policy?
+ */
+ unsigned pi_fallback:1;
+};
+
+/**
+ * NRS policy
+ *
+ * There is one instance of this for each policy in each NRS head of each
+ * PTLRPC service partition.
+ */
+struct ptlrpc_nrs_policy {
+ /**
+ * Linkage into the NRS head's list of policies,
+ * ptlrpc_nrs:nrs_policy_list
+ */
+ struct list_head pol_list;
+ /**
+ * Linkage into the NRS head's list of policies with enqueued
+ * requests ptlrpc_nrs:nrs_policy_queued
+ */
+ struct list_head pol_list_queued;
+ /**
+ * Current state of this policy
+ */
+ enum ptlrpc_nrs_pol_state pol_state;
+ /**
+ * Bitmask of nrs_policy_flags
+ */
+ unsigned pol_flags;
+ /**
+ * # RPCs enqueued for later dispatching by the policy
+ */
+ long pol_req_queued;
+ /**
+ * # RPCs started for dispatch by the policy
+ */
+ long pol_req_started;
+ /**
+ * Usage reference count taken on the policy instance
+ */
+ long pol_ref;
+ /**
+ * The NRS head this policy has been created at
+ */
+ struct ptlrpc_nrs *pol_nrs;
+ /**
+ * Private policy data; varies by policy type
+ */
+ void *pol_private;
+ /**
+ * Policy descriptor for this policy instance.
+ */
+ struct ptlrpc_nrs_pol_desc *pol_desc;
+};
+
+/**
+ * NRS resource
+ *
+ * Resources are embedded into two types of NRS entities:
+ * - Inside NRS policies, in the policy's private data in
+ * ptlrpc_nrs_policy::pol_private
+ * - In objects that act as prime-level scheduling entities in different NRS
+ * policies; e.g. on a policy that performs round robin or similar order
+ * scheduling across client NIDs, there would be one NRS resource per unique
+ * client NID. On a policy which performs round robin scheduling across
+ * backend filesystem objects, there would be one resource associated with
+ * each of the backend filesystem objects partaking in the scheduling
+ * performed by the policy.
+ *
+ * NRS resources share a parent-child relationship, in which resources embedded
+ * in policy instances are the parent entities, with all scheduling entities
+ * a policy schedules across being the children, thus forming a simple resource
+ * hierarchy. This hierarchy may be extended with one or more levels in the
+ * future if the ability to have more than one primary policy is added.
+ *
+ * Upon request initialization, references to the then active NRS policies are
+ * taken and used to later handle the dispatching of the request with one of
+ * these policies.
+ *
+ * \see nrs_resource_get_safe()
+ * \see ptlrpc_nrs_req_add()
+ */
+struct ptlrpc_nrs_resource {
+ /**
+ * This NRS resource's parent; is NULL for resources embedded in NRS
+ * policy instances; i.e. those are top-level ones.
+ */
+ struct ptlrpc_nrs_resource *res_parent;
+ /**
+ * The policy associated with this resource.
+ */
+ struct ptlrpc_nrs_policy *res_policy;
+};
+
+enum {
+ NRS_RES_FALLBACK,
+ NRS_RES_PRIMARY,
+ NRS_RES_MAX
+};
+
+/* \name fifo
+ *
+ * FIFO policy
+ *
+ * This policy is a logical wrapper around previous, non-NRS functionality.
+ * It dispatches RPCs in the same order as they arrive from the network. This
+ * policy is currently used as the fallback policy, and the only enabled policy
+ * on all NRS heads of all PTLRPC service partitions.
+ * @{
+ */
+
+/**
+ * Private data structure for the FIFO policy
+ */
+struct nrs_fifo_head {
+ /**
+ * Resource object for policy instance.
+ */
+ struct ptlrpc_nrs_resource fh_res;
+ /**
+ * List of queued requests.
+ */
+ struct list_head fh_list;
+ /**
+ * For debugging purposes.
+ */
+ __u64 fh_sequence;
+};
+
+struct nrs_fifo_req {
+ struct list_head fr_list;
+ __u64 fr_sequence;
+};
+
+/** @} fifo */
+
+/**
+ * \name CRR-N
+ *
+ * CRR-N, Client Round Robin over NIDs
+ * @{
+ */
+
+/**
+ * private data structure for CRR-N NRS
+ */
+struct nrs_crrn_net {
+ struct ptlrpc_nrs_resource cn_res;
+ cfs_binheap_t *cn_binheap;
+ cfs_hash_t *cn_cli_hash;
+ /**
+ * Used when a new scheduling round commences, in order to synchronize
+ * all clients with the new round number.
+ */
+ __u64 cn_round;
+ /**
+ * Determines the relevant ordering amongst request batches within a
+ * scheduling round.
+ */
+ __u64 cn_sequence;
+ /**
+ * Round Robin quantum; the maximum number of RPCs that each request
+ * batch for each client can have in a scheduling round.
+ */
+ __u16 cn_quantum;
+};
+
+/**
+ * Object representing a client in CRR-N, as identified by its NID
+ */
+struct nrs_crrn_client {
+ struct ptlrpc_nrs_resource cc_res;
+ struct hlist_node cc_hnode;
+ lnet_nid_t cc_nid;
+ /**
+ * The round number against which this client is currently scheduling
+ * requests.
+ */
+ __u64 cc_round;
+ /**
+ * The sequence number used for requests scheduled by this client during
+ * the current round.
+ */
+ __u64 cc_sequence;
+ atomic_t cc_ref;
+ /**
+ * Round Robin quantum; the maximum number of RPCs the client is allowed
+ * to schedule in a single batch of each round.
+ */
+ __u16 cc_quantum;
+ /**
+ * # of pending requests for this client, on all existing rounds
+ */
+ __u16 cc_active;
+};
+
+/**
+ * CRR-N NRS request definition
+ */
+struct nrs_crrn_req {
+ /**
+ * Round number for this request; shared with all other requests in the
+ * same batch.
+ */
+ __u64 cr_round;
+ /**
+ * Sequence number for this request; shared with all other requests in
+ * the same batch.
+ */
+ __u64 cr_sequence;
+};
+
+/**
+ * CRR-N policy operations.
+ */
+enum nrs_ctl_crr {
+ /**
+ * Read the RR quantum size of a CRR-N policy.
+ */
+ NRS_CTL_CRRN_RD_QUANTUM = PTLRPC_NRS_CTL_1ST_POL_SPEC,
+ /**
+ * Write the RR quantum size of a CRR-N policy.
+ */
+ NRS_CTL_CRRN_WR_QUANTUM,
+};
+
+/** @} CRR-N */
+
+/**
+ * \name ORR/TRR
+ *
+ * ORR/TRR (Object-based Round Robin/Target-based Round Robin) NRS policies
+ * @{
+ */
+
+/**
+ * Lower and upper byte offsets of a brw RPC
+ */
+struct nrs_orr_req_range {
+ __u64 or_start;
+ __u64 or_end;
+};
+
+/**
+ * RPC types supported by the ORR/TRR policies
+ */
+enum nrs_orr_supp {
+ NOS_OST_READ = (1 << 0),
+ NOS_OST_WRITE = (1 << 1),
+ NOS_OST_RW = (NOS_OST_READ | NOS_OST_WRITE),
+ /**
+ * Default value for policies.
+ */
+ NOS_DFLT = NOS_OST_READ
+};
+
+/**
+ * As unique keys for grouping RPCs together, we use the object's OST FID for
+ * the ORR policy, and the OST index for the TRR policy.
+ *
+ * XXX: We waste some space for TRR policy instances by using a union, but it
+ * allows us to consolidate some of the code between ORR and TRR, and these
+ * policies will probably eventually merge into one anyway.
+ */
+struct nrs_orr_key {
+ union {
+ /** object FID for ORR */
+ struct lu_fid ok_fid;
+ /** OST index for TRR */
+ __u32 ok_idx;
+ };
+};
+
+/**
+ * The largest base string for unique hash/slab object names is
+ * "nrs_orr_reg_", so 13 characters. We add 3 to this to be used for the CPT
+ * id number, so this _should_ be more than enough for the maximum number of
+ * CPTs on any system. If it does happen that this statement is incorrect,
+ * nrs_orr_genobjname() will inevitably yield a non-unique name and cause
+ * kmem_cache_create() to complain (on Linux), so the erroneous situation
+ * will hopefully not go unnoticed.
+ */
+#define NRS_ORR_OBJ_NAME_MAX (sizeof("nrs_orr_reg_") + 3)
+
+/**
+ * private data structure for ORR and TRR NRS
+ */
+struct nrs_orr_data {
+ struct ptlrpc_nrs_resource od_res;
+ cfs_binheap_t *od_binheap;
+ cfs_hash_t *od_obj_hash;
+ struct kmem_cache *od_cache;
+ /**
+ * Used when a new scheduling round commences, in order to synchronize
+ * all object or OST batches with the new round number.
+ */
+ __u64 od_round;
+ /**
+ * Determines the relevant ordering amongst request batches within a
+ * scheduling round.
+ */
+ __u64 od_sequence;
+ /**
+ * RPC types that are currently supported.
+ */
+ enum nrs_orr_supp od_supp;
+ /**
+ * Round Robin quantum; the maximum number of RPCs that each request
+ * batch for each object or OST can have in a scheduling round.
+ */
+ __u16 od_quantum;
+ /**
+ * Whether to use physical disk offsets or logical file offsets.
+ */
+ bool od_physical;
+ /**
+ * XXX: We need to provide a persistently allocated string to hold
+ * unique object names for this policy, since in the Linux versions
+ * currently supported by Lustre, kmem_cache_create() just sets a pointer
+ * to the name string provided. kstrdup() is used in the version of
+ * kmem_cache_create() in current Linux mainline, so we may be able to
+ * remove this in the future.
+ */
+ char od_objname[NRS_ORR_OBJ_NAME_MAX];
+};
+
+/**
+ * Represents a backend-fs object or OST in the ORR and TRR policies
+ * respectively
+ */
+struct nrs_orr_object {
+ struct ptlrpc_nrs_resource oo_res;
+ struct hlist_node oo_hnode;
+ /**
+ * The round number against which requests are being scheduled for this
+ * object or OST
+ */
+ __u64 oo_round;
+ /**
+ * The sequence number used for requests scheduled for this object or
+ * OST during the current round.
+ */
+ __u64 oo_sequence;
+ /**
+ * The key of the object or OST for which this structure instance is
+ * scheduling RPCs
+ */
+ struct nrs_orr_key oo_key;
+ atomic_t oo_ref;
+ /**
+ * Round Robin quantum; the maximum number of RPCs that are allowed to
+ * be scheduled for the object or OST in a single batch of each round.
+ */
+ __u16 oo_quantum;
+ /**
+ * # of pending requests for this object or OST, on all existing rounds
+ */
+ __u16 oo_active;
+};
+
+/**
+ * ORR/TRR NRS request definition
+ */
+struct nrs_orr_req {
+ /**
+ * The offset range this request covers
+ */
+ struct nrs_orr_req_range or_range;
+ /**
+ * Round number for this request; shared with all other requests in the
+ * same batch.
+ */
+ __u64 or_round;
+ /**
+ * Sequence number for this request; shared with all other requests in
+ * the same batch.
+ */
+ __u64 or_sequence;
+ /**
+ * For debugging purposes.
+ */
+ struct nrs_orr_key or_key;
+ /**
+ * An ORR policy instance has filled in request information while
+ * enqueueing the request on the service partition's regular NRS head.
+ */
+ unsigned int or_orr_set:1;
+ /**
+ * A TRR policy instance has filled in request information while
+ * enqueueing the request on the service partition's regular NRS head.
+ */
+ unsigned int or_trr_set:1;
+ /**
+ * Request offset ranges have been filled in with logical offset
+ * values.
+ */
+ unsigned int or_logical_set:1;
+ /**
+ * Request offset ranges have been filled in with physical offset
+ * values.
+ */
+ unsigned int or_physical_set:1;
+};
+
+/** @} ORR/TRR */
+
+/**
+ * NRS request
+ *
+ * Instances of this object exist embedded within ptlrpc_request; the main
+ * purpose of this object is to hold references to the request's resources
+ * for the lifetime of the request, and to hold properties that policies
+ * use for determining the request's scheduling priority.
+ */
+struct ptlrpc_nrs_request {
+ /**
+ * The request's resource hierarchy.
+ */
+ struct ptlrpc_nrs_resource *nr_res_ptrs[NRS_RES_MAX];
+ /**
+ * Index into ptlrpc_nrs_request::nr_res_ptrs of the resource of the
+ * policy that was used to enqueue the request.
+ *
+ * \see nrs_request_enqueue()
+ */
+ unsigned nr_res_idx;
+ unsigned nr_initialized:1;
+ unsigned nr_enqueued:1;
+ unsigned nr_started:1;
+ unsigned nr_finalized:1;
+ cfs_binheap_node_t nr_node;
+
+ /**
+ * Policy-specific fields, used for determining a request's scheduling
+ * priority, and other supporting functionality.
+ */
+ union {
+ /**
+ * Fields for the FIFO policy
+ */
+ struct nrs_fifo_req fifo;
+ /**
+ * CRR-N request definition
+ */
+ struct nrs_crrn_req crr;
+ /** ORR and TRR share the same request definition */
+ struct nrs_orr_req orr;
+ } nr_u;
+ /**
+ * Externally-registering policies may want to use this to allocate
+ * their own request properties.
+ */
+ void *ext;
+};
+
+/** @} nrs */
+
+/**
+ * Basic request prioritization operations structure.
+ * The whole idea is centered around locks and RPCs that might affect locks.
+ * When a lock is contended we try to give priority to RPCs that might lead
+ * to the fastest release of that lock.
+ * Currently implemented only for OSTs, in a way that gives priority, over
+ * other requests, to IO and truncate RPCs coming from a locked region where
+ * a lock is contended.
+ */
+struct ptlrpc_hpreq_ops {
+ /**
+ * Check if the lock handle of the given lock is the same as
+ * taken from the request.
+ */
+ int (*hpreq_lock_match)(struct ptlrpc_request *, struct ldlm_lock *);
+ /**
+ * Check if the request is a high priority one.
+ */
+ int (*hpreq_check)(struct ptlrpc_request *);
+ /**
+ * Called after the request has been handled.
+ */
+ void (*hpreq_fini)(struct ptlrpc_request *);
+};
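+
+/*
+ * Illustrative sketch: a service's so_hpreq_handler callback can mark a
+ * request as a high-priority candidate by pointing ptlrpc_request::rq_ops
+ * at a table like the one below; the foo_* handlers are hypothetical.
+ *
+ *	static struct ptlrpc_hpreq_ops foo_hpreq_ops = {
+ *		.hpreq_lock_match = foo_hpreq_lock_match,
+ *		.hpreq_check	  = foo_hpreq_check,
+ *		.hpreq_fini	  = foo_hpreq_fini,
+ *	};
+ *
+ *	req->rq_ops = &foo_hpreq_ops;
+ */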
+
+/**
+ * Represents a remote procedure call.
+ *
+ * This is a staple structure used by everybody wanting to send a request
+ * in Lustre.
+ */
+struct ptlrpc_request {
+ /* Request type: one of PTL_RPC_MSG_* */
+ int rq_type;
+ /** Result of request processing */
+ int rq_status;
+ /**
+ * Linkage item through which this request is included into
+ * sending/delayed lists on client and into rqbd list on server
+ */
+ struct list_head rq_list;
+ /**
+ * Server side list of incoming unserved requests sorted by arrival
+ * time. Traversed from time to time to notice about-to-expire
+ * requests and send back "early replies" to clients to let them
+ * know the server is alive and well, just too busy to service their
+ * requests in time
+ */
+ struct list_head rq_timed_list;
+ /** server-side history, used for debugging purposes. */
+ struct list_head rq_history_list;
+ /** server-side per-export list */
+ struct list_head rq_exp_list;
+ /** server-side hp handlers */
+ struct ptlrpc_hpreq_ops *rq_ops;
+
+ /** initial thread servicing this request */
+ struct ptlrpc_thread *rq_svc_thread;
+
+ /** history sequence # */
+ __u64 rq_history_seq;
+ /** \addtogroup nrs
+ * @{
+ */
+ /** stub for NRS request */
+ struct ptlrpc_nrs_request rq_nrq;
+ /** @} nrs */
+ /** the index of service's srv_at_array into which request is linked */
+ time_t rq_at_index;
+ /** Lock to protect request flags and some other important bits, like
+ * rq_list
+ */
+ spinlock_t rq_lock;
+ /** client-side flags are serialized by rq_lock */
+ unsigned int rq_intr:1, rq_replied:1, rq_err:1,
+ rq_timedout:1, rq_resend:1, rq_restart:1,
+ /**
+ * when ->rq_replay is set, request is kept by the client even
+ * after server commits corresponding transaction. This is
+ * used for operations that require sequence of multiple
+ * requests to be replayed. The only example currently is file
+ * open/close. When last request in such a sequence is
+ * committed, ->rq_replay is cleared on all requests in the
+ * sequence.
+ */
+ rq_replay:1,
+ rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1,
+ rq_no_delay:1, rq_net_err:1, rq_wait_ctx:1,
+ rq_early:1, rq_must_unlink:1,
+ rq_memalloc:1, /* req originated from "kswapd" */
+ /* server-side flags */
+ rq_packed_final:1, /* packed final reply */
+ rq_hp:1, /* high priority RPC */
+ rq_at_linked:1, /* link into service's srv_at_array */
+ rq_reply_truncate:1,
+ rq_committed:1,
+ /* whether the "rq_set" is a valid one */
+ rq_invalid_rqset:1,
+ rq_generation_set:1,
+ /* do not resend request on -EINPROGRESS */
+ rq_no_retry_einprogress:1,
+ /* allow the req to be sent if the import is in recovery
+ * status */
+ rq_allow_replay:1;
+
+ unsigned int rq_nr_resend;
+
+ enum rq_phase rq_phase; /* one of RQ_PHASE_* */
+ enum rq_phase rq_next_phase; /* one of RQ_PHASE_* to be used next */
+ atomic_t rq_refcount;/* client-side refcount for SENT race,
+ server-side refcount for multiple replies */
+
+ /** Portal to which this request would be sent */
+ short rq_request_portal; /* XXX FIXME bug 249 */
+ /** Portal where to wait for reply and where reply would be sent */
+ short rq_reply_portal; /* XXX FIXME bug 249 */
+
+ /**
+ * client-side:
+ * !rq_truncate : # reply bytes actually received,
+ * rq_truncate : required repbuf_len for resend
+ */
+ int rq_nob_received;
+ /** Request length */
+ int rq_reqlen;
+ /** Reply length */
+ int rq_replen;
+ /** Request message - what client sent */
+ struct lustre_msg *rq_reqmsg;
+ /** Reply message - server response */
+ struct lustre_msg *rq_repmsg;
+ /** Transaction number */
+ __u64 rq_transno;
+ /** xid */
+ __u64 rq_xid;
+ /**
+ * List item for the replay list. Not yet committed requests get linked
+ * there.
+ * Also see \a rq_replay comment above.
+ */
+ struct list_head rq_replay_list;
+
+ /**
+ * security and encryption data
+ * @{ */
+ struct ptlrpc_cli_ctx *rq_cli_ctx; /**< client's half ctx */
+ struct ptlrpc_svc_ctx *rq_svc_ctx; /**< server's half ctx */
+ struct list_head rq_ctx_chain; /**< link to waited ctx */
+
+ struct sptlrpc_flavor rq_flvr; /**< for client & server */
+ enum lustre_sec_part rq_sp_from;
+
+ /* client/server security flags */
+ unsigned int
+ rq_ctx_init:1, /* context initiation */
+ rq_ctx_fini:1, /* context destroy */
+ rq_bulk_read:1, /* request bulk read */
+ rq_bulk_write:1, /* request bulk write */
+ /* server authentication flags */
+ rq_auth_gss:1, /* authenticated by gss */
+ rq_auth_remote:1, /* authed as remote user */
+ rq_auth_usr_root:1, /* authed as root */
+ rq_auth_usr_mdt:1, /* authed as mdt */
+ rq_auth_usr_ost:1, /* authed as ost */
+ /* security tfm flags */
+ rq_pack_udesc:1,
+ rq_pack_bulk:1,
+ /* doesn't expect reply FIXME */
+ rq_no_reply:1,
+ rq_pill_init:1; /* pill initialized */
+
+ uid_t rq_auth_uid; /* authed uid */
+ uid_t rq_auth_mapped_uid; /* authed uid mapped to */
+
+ /* (server side), pointed directly into req buffer */
+ struct ptlrpc_user_desc *rq_user_desc;
+
+ /* various buffer pointers */
+ struct lustre_msg *rq_reqbuf; /* req wrapper */
+ char *rq_repbuf; /* rep buffer */
+ struct lustre_msg *rq_repdata; /* rep wrapper msg */
+ struct lustre_msg *rq_clrbuf; /* only in priv mode */
+ int rq_reqbuf_len; /* req wrapper buf len */
+ int rq_reqdata_len; /* req wrapper msg len */
+ int rq_repbuf_len; /* rep buffer len */
+ int rq_repdata_len; /* rep wrapper msg len */
+ int rq_clrbuf_len; /* only in priv mode */
+ int rq_clrdata_len; /* only in priv mode */
+
+ /** early replies go to offset 0, regular replies go after that */
+ unsigned int rq_reply_off;
+
+ /** @} */
+
+ /** Fields that help to see if request and reply were swabbed or not */
+ __u32 rq_req_swab_mask;
+ __u32 rq_rep_swab_mask;
+
+ /** What was import generation when this request was sent */
+ int rq_import_generation;
+ enum lustre_imp_state rq_send_state;
+
+ /** how many early replies (for stats) */
+ int rq_early_count;
+
+ /** client+server request */
+ lnet_handle_md_t rq_req_md_h;
+ struct ptlrpc_cb_id rq_req_cbid;
+ /** optional time limit for send attempts */
+ cfs_duration_t rq_delay_limit;
+ /** time request was first queued */
+ cfs_time_t rq_queued_time;
+
+ /* server-side... */
+ /** request arrival time */
+ struct timeval rq_arrival_time;
+ /** separated reply state */
+ struct ptlrpc_reply_state *rq_reply_state;
+ /** incoming request buffer */
+ struct ptlrpc_request_buffer_desc *rq_rqbd;
+
+ /** client-only incoming reply */
+ lnet_handle_md_t rq_reply_md_h;
+ wait_queue_head_t rq_reply_waitq;
+ struct ptlrpc_cb_id rq_reply_cbid;
+
+ /** our LNet NID */
+ lnet_nid_t rq_self;
+ /** Peer description (the other side) */
+ lnet_process_id_t rq_peer;
+ /** Server-side, export on which request was received */
+ struct obd_export *rq_export;
+ /** Client side, import where request is being sent */
+ struct obd_import *rq_import;
+
+ /** Replay callback, called after request is replayed at recovery */
+ void (*rq_replay_cb)(struct ptlrpc_request *);
+ /**
+ * Commit callback, called when request is committed and about to be
+ * freed.
+ */
+ void (*rq_commit_cb)(struct ptlrpc_request *);
+ /** Opaq data for replay and commit callbacks. */
+ void *rq_cb_data;
+
+ /** For bulk requests on client only: bulk descriptor */
+ struct ptlrpc_bulk_desc *rq_bulk;
+
+ /** client outgoing req */
+ /**
+ * when request/reply sent (secs), or time when request should be sent
+ */
+ time_t rq_sent;
+ /** time for request really sent out */
+ time_t rq_real_sent;
+
+ /** when request must finish. volatile
+ * so that servers' early reply updates to the deadline aren't
+ * kept in per-cpu cache */
+ volatile time_t rq_deadline;
+ /** when req reply unlink must finish. */
+ time_t rq_reply_deadline;
+ /** when req bulk unlink must finish. */
+ time_t rq_bulk_deadline;
+ /**
+ * service time estimate (secs)
+ * If the request is not served by this time, it is marked as timed out.
+ */
+ int rq_timeout;
+
+ /** Multi-rpc bits */
+ /** Per-request waitq introduced by bug 21938 for recovery waiting */
+ wait_queue_head_t rq_set_waitq;
+ /** Link item for request set lists */
+ struct list_head rq_set_chain;
+ /** Link back to the request set */
+ struct ptlrpc_request_set *rq_set;
+ /** Async completion handler, called when reply is received */
+ ptlrpc_interpterer_t rq_interpret_reply;
+ /** Async completion context */
+ union ptlrpc_async_args rq_async_args;
+
+ /** Pool if request is from preallocated list */
+ struct ptlrpc_request_pool *rq_pool;
+
+ struct lu_context rq_session;
+ struct lu_context rq_recov_session;
+
+ /** request format description */
+ struct req_capsule rq_pill;
+};
+
+/**
+ * Call the completion handler for the rpc, if any; return its status or
+ * the original rc if there was no handler defined for this request.
+ */
+static inline int ptlrpc_req_interpret(const struct lu_env *env,
+ struct ptlrpc_request *req, int rc)
+{
+ if (req->rq_interpret_reply != NULL) {
+ req->rq_status = req->rq_interpret_reply(env, req,
+ &req->rq_async_args,
+ rc);
+ return req->rq_status;
+ }
+ return rc;
+}
+
+/** \addtogroup nrs
+ * @{
+ */
+int ptlrpc_nrs_policy_register(struct ptlrpc_nrs_pol_conf *conf);
+int ptlrpc_nrs_policy_unregister(struct ptlrpc_nrs_pol_conf *conf);
+void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req);
+void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy,
+ struct ptlrpc_nrs_pol_info *info);
+
+/*
+ * Can the request be moved from the regular NRS head to the high-priority NRS
+ * head (of the same PTLRPC service partition), if any?
+ *
+ * For a reliable result, this should be checked under svcpt->scp_req_lock.
+ */
+static inline bool ptlrpc_nrs_req_can_move(struct ptlrpc_request *req)
+{
+ struct ptlrpc_nrs_request *nrq = &req->rq_nrq;
+
+ /**
+ * LU-898: Check ptlrpc_nrs_request::nr_enqueued to make sure the
+ * request has been enqueued first, and ptlrpc_nrs_request::nr_started
+ * to make sure it has not been scheduled yet (analogous to previous
+ * (non-NRS) checking of !list_empty(&ptlrpc_request::rq_list)).
+ */
+ return nrq->nr_enqueued && !nrq->nr_started && !req->rq_hp;
+}
+/** @} nrs */
+
+/**
+ * Returns 1 if request buffer at offset \a index was already swabbed
+ */
+static inline int lustre_req_swabbed(struct ptlrpc_request *req, int index)
+{
+ LASSERT(index < sizeof(req->rq_req_swab_mask) * 8);
+ return req->rq_req_swab_mask & (1 << index);
+}
+
+/**
+ * Returns 1 if request reply buffer at offset \a index was already swabbed
+ */
+static inline int lustre_rep_swabbed(struct ptlrpc_request *req, int index)
+{
+ LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8);
+ return req->rq_rep_swab_mask & (1 << index);
+}
+
+/**
+ * Returns 1 if request needs to be swabbed into local cpu byteorder
+ */
+static inline int ptlrpc_req_need_swab(struct ptlrpc_request *req)
+{
+ return lustre_req_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+}
+
+/**
+ * Returns 1 if request reply needs to be swabbed into local cpu byteorder
+ */
+static inline int ptlrpc_rep_need_swab(struct ptlrpc_request *req)
+{
+ return lustre_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+}
+
+/**
+ * Mark request buffer at offset \a index that it was already swabbed
+ */
+static inline void lustre_set_req_swabbed(struct ptlrpc_request *req, int index)
+{
+ LASSERT(index < sizeof(req->rq_req_swab_mask) * 8);
+ LASSERT((req->rq_req_swab_mask & (1 << index)) == 0);
+ req->rq_req_swab_mask |= 1 << index;
+}
+
+/**
+ * Mark request reply buffer at offset \a index that it was already swabbed
+ */
+static inline void lustre_set_rep_swabbed(struct ptlrpc_request *req, int index)
+{
+ LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8);
+ LASSERT((req->rq_rep_swab_mask & (1 << index)) == 0);
+ req->rq_rep_swab_mask |= 1 << index;
+}
+
+/**
+ * Convert numerical request phase value \a phase into text string description
+ */
+static inline const char *
+ptlrpc_phase2str(enum rq_phase phase)
+{
+ switch (phase) {
+ case RQ_PHASE_NEW:
+ return "New";
+ case RQ_PHASE_RPC:
+ return "Rpc";
+ case RQ_PHASE_BULK:
+ return "Bulk";
+ case RQ_PHASE_INTERPRET:
+ return "Interpret";
+ case RQ_PHASE_COMPLETE:
+ return "Complete";
+ case RQ_PHASE_UNREGISTERING:
+ return "Unregistering";
+ default:
+ return "?Phase?";
+ }
+}
+
+/**
+ * Convert numerical request phase of the request \a req into text string
+ * description
+ */
+static inline const char *
+ptlrpc_rqphase2str(struct ptlrpc_request *req)
+{
+ return ptlrpc_phase2str(req->rq_phase);
+}
+
+/**
+ * Debugging functions and helpers to print request structure into debug log
+ * @{
+ */
+/* Spare the preprocessor, spoil the bugs. */
+#define FLAG(field, str) (field ? str : "")
+
+/** Convert bit flags into a string */
+#define DEBUG_REQ_FLAGS(req) \
+ ptlrpc_rqphase2str(req), \
+ FLAG(req->rq_intr, "I"), FLAG(req->rq_replied, "R"), \
+ FLAG(req->rq_err, "E"), \
+ FLAG(req->rq_timedout, "X") /* eXpired */, FLAG(req->rq_resend, "S"), \
+ FLAG(req->rq_restart, "T"), FLAG(req->rq_replay, "P"), \
+ FLAG(req->rq_no_resend, "N"), \
+ FLAG(req->rq_waiting, "W"), \
+ FLAG(req->rq_wait_ctx, "C"), FLAG(req->rq_hp, "H"), \
+ FLAG(req->rq_committed, "M")
+
+#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s%s%s%s"
+
+void _debug_req(struct ptlrpc_request *req,
+ struct libcfs_debug_msg_data *data, const char *fmt, ...)
+ __attribute__ ((format (printf, 3, 4)));
+
+/**
+ * Helper that decides if we need to print the request according to current
+ * debug level settings
+ */
+#define debug_req(msgdata, mask, cdls, req, fmt, a...) \
+do { \
+ CFS_CHECK_STACK(msgdata, mask, cdls); \
+ \
+ if (((mask) & D_CANTMASK) != 0 || \
+ ((libcfs_debug & (mask)) != 0 && \
+ (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \
+ _debug_req((req), msgdata, fmt, ##a); \
+} while (0)
+
+/**
+ * This is the debug print function you need to use to print request structure
+ * content into the lustre debug log.
+ * For most callers (level is a constant) this is resolved at compile time */
+#define DEBUG_REQ(level, req, fmt, args...) \
+do { \
+ if ((level) & (D_ERROR | D_WARNING)) { \
+ static cfs_debug_limit_state_t cdls; \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \
+ debug_req(&msgdata, level, &cdls, req, "@@@ "fmt" ", ## args);\
+ } else { \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \
+ debug_req(&msgdata, level, NULL, req, "@@@ "fmt" ", ## args); \
+ } \
+} while (0)
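+
+/*
+ * Typical usage; the level is a constant, so the branch above is resolved
+ * at compile time:
+ *
+ *	DEBUG_REQ(D_ERROR, req, "unexpected reply size %d", size);
+ */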
+/** @} */
+
+/**
+ * Structure that defines a single page of a bulk transfer
+ */
+struct ptlrpc_bulk_page {
+ /** Linkage to list of pages in a bulk */
+ struct list_head bp_link;
+ /**
+ * Number of bytes in a page to transfer starting from \a bp_pageoffset
+ */
+ int bp_buflen;
+ /** offset within a page */
+ int bp_pageoffset;
+ /** The page itself */
+ struct page *bp_page;
+};
+
+#define BULK_GET_SOURCE 0
+#define BULK_PUT_SINK 1
+#define BULK_GET_SINK 2
+#define BULK_PUT_SOURCE 3
+
+/**
+ * Definition of bulk descriptor.
+ * Bulks are special "two phase" RPCs where the initial request message
+ * is sent first and is followed by a transfer (or receiving) of a large
+ * amount of data to be settled into pages referenced from the bulk descriptors.
+ * Bulk transfers (the actual data following the small requests) are done
+ * on separate LNet portals.
+ * In lustre we use bulk transfers for READ and WRITE transfers from/to OSTs.
+ * Another user is readpage for MDT.
+ */
+struct ptlrpc_bulk_desc {
+ /** completed with failure */
+ unsigned long bd_failure:1;
+ /** {put,get}{source,sink} */
+ unsigned long bd_type:2;
+ /** client side */
+ unsigned long bd_registered:1;
+ /** For serialization with callback */
+ spinlock_t bd_lock;
+ /** Import generation when request for this bulk was sent */
+ int bd_import_generation;
+ /** LNet portal for this bulk */
+ __u32 bd_portal;
+ /** Server side - export this bulk created for */
+ struct obd_export *bd_export;
+ /** Client side - import this bulk was sent on */
+ struct obd_import *bd_import;
+ /** Back pointer to the request */
+ struct ptlrpc_request *bd_req;
+ wait_queue_head_t bd_waitq; /* server side only WQ */
+ int bd_iov_count; /* # entries in bd_iov */
+ int bd_max_iov; /* allocated size of bd_iov */
+ int bd_nob; /* # bytes covered */
+ int bd_nob_transferred; /* # bytes GOT/PUT */
+
+ __u64 bd_last_xid;
+
+ struct ptlrpc_cb_id bd_cbid; /* network callback info */
+ lnet_nid_t bd_sender; /* stash event::sender */
+ int bd_md_count; /* # valid entries in bd_mds */
+ int bd_md_max_brw; /* max entries in bd_mds */
+ /** array of associated MDs */
+ lnet_handle_md_t bd_mds[PTLRPC_BULK_OPS_COUNT];
+
+ /*
+ * encrypt iov, size is either 0 or bd_iov_count.
+ */
+ lnet_kiov_t *bd_enc_iov;
+
+ lnet_kiov_t bd_iov[0];
+};
+
+enum {
+ SVC_STOPPED = 1 << 0,
+ SVC_STOPPING = 1 << 1,
+ SVC_STARTING = 1 << 2,
+ SVC_RUNNING = 1 << 3,
+ SVC_EVENT = 1 << 4,
+ SVC_SIGNAL = 1 << 5,
+};
+
+#define PTLRPC_THR_NAME_LEN 32
+/**
+ * Definition of server service thread structure
+ */
+struct ptlrpc_thread {
+ /**
+ * List of active threads in svc->srv_threads
+ */
+ struct list_head t_link;
+ /**
+ * thread-private data (preallocated memory)
+ */
+ void *t_data;
+ __u32 t_flags;
+ /**
+ * service thread index, from ptlrpc_start_threads
+ */
+ unsigned int t_id;
+ /**
+ * service thread pid
+ */
+ pid_t t_pid;
+ /**
+ * put watchdog in the structure per thread b=14840
+ */
+ struct lc_watchdog *t_watchdog;
+ /**
+ * the svc this thread belonged to b=18582
+ */
+ struct ptlrpc_service_part *t_svcpt;
+ wait_queue_head_t t_ctl_waitq;
+ struct lu_env *t_env;
+ char t_name[PTLRPC_THR_NAME_LEN];
+};
+
+static inline int thread_is_init(struct ptlrpc_thread *thread)
+{
+ return thread->t_flags == 0;
+}
+
+static inline int thread_is_stopped(struct ptlrpc_thread *thread)
+{
+ return !!(thread->t_flags & SVC_STOPPED);
+}
+
+static inline int thread_is_stopping(struct ptlrpc_thread *thread)
+{
+ return !!(thread->t_flags & SVC_STOPPING);
+}
+
+static inline int thread_is_starting(struct ptlrpc_thread *thread)
+{
+ return !!(thread->t_flags & SVC_STARTING);
+}
+
+static inline int thread_is_running(struct ptlrpc_thread *thread)
+{
+ return !!(thread->t_flags & SVC_RUNNING);
+}
+
+static inline int thread_is_event(struct ptlrpc_thread *thread)
+{
+ return !!(thread->t_flags & SVC_EVENT);
+}
+
+static inline int thread_is_signal(struct ptlrpc_thread *thread)
+{
+ return !!(thread->t_flags & SVC_SIGNAL);
+}
+
+static inline void thread_clear_flags(struct ptlrpc_thread *thread, __u32 flags)
+{
+ thread->t_flags &= ~flags;
+}
+
+static inline void thread_set_flags(struct ptlrpc_thread *thread, __u32 flags)
+{
+ thread->t_flags = flags;
+}
+
+static inline void thread_add_flags(struct ptlrpc_thread *thread, __u32 flags)
+{
+ thread->t_flags |= flags;
+}
+
+static inline int thread_test_and_clear_flags(struct ptlrpc_thread *thread,
+ __u32 flags)
+{
+ if (thread->t_flags & flags) {
+ thread->t_flags &= ~flags;
+ return 1;
+ }
+ return 0;
+}
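+
+/*
+ * Illustrative sketch of the flag helpers in a service thread's main loop;
+ * the loop body is schematic:
+ *
+ *	thread_add_flags(thread, SVC_RUNNING);
+ *	wake_up(&thread->t_ctl_waitq);
+ *	while (!thread_is_stopping(thread)) {
+ *		// handle incoming requests
+ *	}
+ *	thread_set_flags(thread, SVC_STOPPED);
+ */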
+
+/**
+ * Request buffer descriptor structure.
+ * This is a structure that contains one posted request buffer for service.
+ * Once data lands in a buffer, the event callback creates the actual request
+ * and wakes one of the service threads to process the new incoming request.
+ * More than one request can fit into the buffer.
+ */
+struct ptlrpc_request_buffer_desc {
+ /** Link item for rqbds on a service */
+ struct list_head rqbd_list;
+ /** History of requests for this buffer */
+ struct list_head rqbd_reqs;
+ /** Back pointer to service for which this buffer is registered */
+ struct ptlrpc_service_part *rqbd_svcpt;
+ /** LNet descriptor */
+ lnet_handle_md_t rqbd_md_h;
+ int rqbd_refcount;
+ /** The buffer itself */
+ char *rqbd_buffer;
+ struct ptlrpc_cb_id rqbd_cbid;
+ /**
+ * This "embedded" request structure is only used for the
+ * last request to fit into the buffer
+ */
+ struct ptlrpc_request rqbd_req;
+};
+
+typedef int (*svc_handler_t)(struct ptlrpc_request *req);
+
+struct ptlrpc_service_ops {
+ /**
+ * if non-NULL called during thread creation (ptlrpc_start_thread())
+ * to initialize service specific per-thread state.
+ */
+ int (*so_thr_init)(struct ptlrpc_thread *thr);
+ /**
+ * if non-NULL called during thread shutdown (ptlrpc_main()) to
+ * destruct state created by ->srv_init().
+ */
+ void (*so_thr_done)(struct ptlrpc_thread *thr);
+ /**
+ * Handler function for incoming requests for this service
+ */
+ int (*so_req_handler)(struct ptlrpc_request *req);
+ /**
+ * function to determine priority of the request, it's called
+ * on every new request
+ */
+ int (*so_hpreq_handler)(struct ptlrpc_request *);
+ /**
+ * service-specific print fn
+ */
+ void (*so_req_printer)(void *, struct ptlrpc_request *);
+};
+
+#ifndef __cfs_cacheline_aligned
+/* NB: put it here for reducing patch dependence */
+# define __cfs_cacheline_aligned
+#endif
+
+/**
+ * How many high priority requests to serve before serving one normal
+ * priority request
+ */
+#define PTLRPC_SVC_HP_RATIO 10
+
+/**
+ * Definition of PortalRPC service.
+ * The service listens on a particular portal (like a tcp port)
+ * and performs actions for a specific server, like IO service for OST
+ * or general metadata service for MDS.
+ */
+struct ptlrpc_service {
+ /** serialize /proc operations */
+ spinlock_t srv_lock;
+ /** most often accessed fields */
+ /** chain thru all services */
+ struct list_head srv_list;
+ /** service operations table */
+ struct ptlrpc_service_ops srv_ops;
+ /** only statically allocated strings here; we don't clean them */
+ char *srv_name;
+ /** only statically allocated strings here; we don't clean them */
+ char *srv_thread_name;
+ /** service thread list */
+ struct list_head srv_threads;
+ /** # of threads to create for each partition at initialization */
+ int srv_nthrs_cpt_init;
+ /** limit of threads number for each partition */
+ int srv_nthrs_cpt_limit;
+ /** Root of /proc dir tree for this service */
+ proc_dir_entry_t *srv_procroot;
+ /** Pointer to statistic data for this service */
+ struct lprocfs_stats *srv_stats;
+ /** # hp per lp reqs to handle */
+ int srv_hpreq_ratio;
+ /** biggest request to receive */
+ int srv_max_req_size;
+ /** biggest reply to send */
+ int srv_max_reply_size;
+ /** size of individual buffers */
+ int srv_buf_size;
+ /** # buffers to allocate in 1 group */
+ int srv_nbuf_per_group;
+ /** Local portal on which to receive requests */
+ __u32 srv_req_portal;
+ /** Portal on the client to send replies to */
+ __u32 srv_rep_portal;
+ /**
+ * Tags for lu_context associated with this thread, see struct
+ * lu_context.
+ */
+ __u32 srv_ctx_tags;
+ /** soft watchdog timeout multiplier */
+ int srv_watchdog_factor;
+ /** under unregister_service */
+ unsigned srv_is_stopping:1;
+
+ /** max # request buffers in history per partition */
+ int srv_hist_nrqbds_cpt_max;
+ /** number of CPTs this service is bound to */
+ int srv_ncpts;
+ /** array of CPTs this service is bound to */
+ __u32 *srv_cpts;
+ /** 2^srv_cpt_bits >= cfs_cpt_number(srv_cptable) */
+ int srv_cpt_bits;
+ /** CPT table this service is running over */
+ struct cfs_cpt_table *srv_cptable;
+ /**
+ * partition data for ptlrpc service
+ */
+ struct ptlrpc_service_part *srv_parts[0];
+};
+
+/**
+ * Definition of PortalRPC service partition data.
+ * Although a service has only one instance right now, it will soon have
+ * multiple instances (one per CPT).
+ *
+ * It has four locks:
+ * \a scp_lock
+ *	serializes operations on rqbds and requests waiting for preprocessing
+ * \a scp_req_lock
+ *	serializes operations on active requests sent to this portal
+ * \a scp_at_lock
+ *	serializes adaptive timeout state
+ * \a scp_rep_lock
+ *	serializes operations on the RS list (reply states)
+ *
+ * There is currently no use-case that takes two or more of these locks
+ * at the same time, so there is no lock ordering issue.
+ */
+struct ptlrpc_service_part {
+ /** back reference to owner */
+ struct ptlrpc_service *scp_service __cfs_cacheline_aligned;
+ /* CPT id, reserved */
+ int scp_cpt;
+ /** next thread id to assign; always increasing */
+ int scp_thr_nextid;
+ /** # of starting threads */
+ int scp_nthrs_starting;
+ /** # of stopping threads, reserved for shrinking threads */
+ int scp_nthrs_stopping;
+ /** # running threads */
+ int scp_nthrs_running;
+ /** service threads list */
+ struct list_head scp_threads;
+
+ /**
+ * serializes the following fields; protects the rqbd list and
+ * incoming requests waiting for preprocessing. Threads starting
+ * and stopping are also protected by this lock.
+ */
+ spinlock_t scp_lock __cfs_cacheline_aligned;
+ /** total # req buffer descs allocated */
+ int scp_nrqbds_total;
+ /** # posted request buffers for receiving */
+ int scp_nrqbds_posted;
+ /** rqbd allocation in progress */
+ int scp_rqbd_allocating;
+ /** # incoming reqs */
+ int scp_nreqs_incoming;
+ /** request buffers to be reposted */
+ struct list_head scp_rqbd_idle;
+ /** request buffers currently receiving requests */
+ struct list_head scp_rqbd_posted;
+ /** incoming reqs */
+ struct list_head scp_req_incoming;
+ /** timeout before re-posting reqs, in ticks */
+ cfs_duration_t scp_rqbd_timeout;
+ /**
+ * all threads sleep on this. This wait-queue is signalled when a new
+ * incoming request arrives and when a difficult reply has to be handled.
+ */
+ wait_queue_head_t scp_waitq;
+
+ /** request history */
+ struct list_head scp_hist_reqs;
+ /** request buffer history */
+ struct list_head scp_hist_rqbds;
+ /** # request buffers in history */
+ int scp_hist_nrqbds;
+ /** sequence number for request */
+ __u64 scp_hist_seq;
+ /** highest seq culled from history */
+ __u64 scp_hist_seq_culled;
+
+ /**
+ * serialize the following fields, used for processing requests
+ * sent to this portal
+ */
+ spinlock_t scp_req_lock __cfs_cacheline_aligned;
+ /** # reqs being served, i.e. in either of the NRS heads below */
+ int scp_nreqs_active;
+ /** # HPreqs being served */
+ int scp_nhreqs_active;
+ /** # hp requests handled */
+ int scp_hreq_count;
+
+ /** NRS head for regular requests */
+ struct ptlrpc_nrs scp_nrs_reg;
+ /** NRS head for HP requests; this is only valid for services that can
+ * handle HP requests */
+ struct ptlrpc_nrs *scp_nrs_hp;
+
+ /** AT stuff */
+ /** @{ */
+ /**
+ * serialize the following fields, used for changes on
+ * adaptive timeout
+ */
+ spinlock_t scp_at_lock __cfs_cacheline_aligned;
+ /** estimated rpc service time */
+ struct adaptive_timeout scp_at_estimate;
+ /** reqs waiting for replies */
+ struct ptlrpc_at_array scp_at_array;
+ /** early reply timer */
+ timer_list_t scp_at_timer;
+ /** debug */
+ cfs_time_t scp_at_checktime;
+ /** check early replies */
+ unsigned scp_at_check;
+ /** @} */
+
+ /**
+ * serialize the following fields, used for processing
+ * replies for this portal
+ */
+ spinlock_t scp_rep_lock __cfs_cacheline_aligned;
+ /** all the active replies */
+ struct list_head scp_rep_active;
+ /** List of free reply_states */
+ struct list_head scp_rep_idle;
+ /** wait-queue, signalled when reply states are added to scp_rep_idle */
+ wait_queue_head_t scp_rep_waitq;
+ /** # 'difficult' replies */
+ atomic_t scp_nreps_difficult;
+};
+
+#define ptlrpc_service_for_each_part(part, i, svc) \
+ for (i = 0; \
+ i < (svc)->srv_ncpts && \
+ (svc)->srv_parts != NULL && \
+ ((part) = (svc)->srv_parts[i]) != NULL; i++)
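+
+/**
+ * Usage sketch for the iterator above (illustrative only): walk every
+ * partition of a service, e.g. to sum a per-partition counter.
+ *
+ * \code
+ * struct ptlrpc_service_part *svcpt;
+ * int i, nreqs = 0;
+ *
+ * ptlrpc_service_for_each_part(svcpt, i, svc)
+ *	nreqs += svcpt->scp_nreqs_active;
+ * \endcode
+ */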
+
+/**
+ * Declaration of ptlrpcd control structure
+ */
+struct ptlrpcd_ctl {
+ /**
+ * Ptlrpc thread control flags (LIOD_START, LIOD_STOP, LIOD_FORCE)
+ */
+ unsigned long pc_flags;
+ /**
+ * Thread lock protecting structure fields.
+ */
+ spinlock_t pc_lock;
+ /**
+ * Start completion.
+ */
+ struct completion pc_starting;
+ /**
+ * Stop completion.
+ */
+ struct completion pc_finishing;
+ /**
+ * Thread requests set.
+ */
+ struct ptlrpc_request_set *pc_set;
+ /**
+ * Thread name used in cfs_daemonize()
+ */
+ char pc_name[16];
+ /**
+ * Environment for request interpreters to run in.
+ */
+ struct lu_env pc_env;
+ /**
+ * Index of ptlrpcd thread in the array.
+ */
+ int pc_index;
+ /**
+ * Number of the ptlrpcd's partners.
+ */
+ int pc_npartners;
+ /**
+ * Pointer to the array of partners' ptlrpcd_ctl structure.
+ */
+ struct ptlrpcd_ctl **pc_partners;
+ /**
+ * Record the partner index to be processed next.
+ */
+ int pc_cursor;
+};
+
+/* Bits for pc_flags */
+enum ptlrpcd_ctl_flags {
+ /**
+ * Ptlrpc thread start flag.
+ */
+ LIOD_START = 1 << 0,
+ /**
+ * Ptlrpc thread stop flag.
+ */
+ LIOD_STOP = 1 << 1,
+ /**
+ * Ptlrpc thread force flag (so far only forced stop is supported).
+ * When set together with LIOD_STOP, any in-flight RPCs handled by
+ * the thread are aborted.
+ */
+ LIOD_FORCE = 1 << 2,
+ /**
+ * This is a recovery ptlrpc thread.
+ */
+ LIOD_RECOVERY = 1 << 3,
+ /**
+ * The ptlrpcd is bound to some CPU core.
+ */
+ LIOD_BIND = 1 << 4,
+};
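+
+/**
+ * Since the flags above are mask values stored in pc_flags, they can be
+ * tested with plain bit operations; a sketch (illustrative only, with a
+ * hypothetical helper name):
+ *
+ * \code
+ * static inline int example_ptlrpcd_stopping(struct ptlrpcd_ctl *pc)
+ * {
+ *	return (pc->pc_flags & LIOD_STOP) != 0;
+ * }
+ * \endcode
+ */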
+
+/**
+ * \addtogroup nrs
+ * @{
+ *
+ * Service compatibility function; the policy is compatible with all services.
+ *
+ * \param[in] svc The service the policy is attempting to register with.
+ * \param[in] desc The policy descriptor
+ *
+ * \retval true The policy is compatible with the service
+ *
+ * \see ptlrpc_nrs_pol_desc::pd_compat()
+ */
+static inline bool nrs_policy_compat_all(const struct ptlrpc_service *svc,
+ const struct ptlrpc_nrs_pol_desc *desc)
+{
+ return true;
+}
+
+/**
+ * Service compatibility function; the policy is compatible only with a
+ * specific service, which is identified by its human-readable name at
+ * ptlrpc_service::srv_name.
+ *
+ * \param[in] svc The service the policy is attempting to register with.
+ * \param[in] desc The policy descriptor
+ *
+ * \retval false The policy is not compatible with the service
+ * \retval true The policy is compatible with the service
+ *
+ * \see ptlrpc_nrs_pol_desc::pd_compat()
+ */
+static inline bool nrs_policy_compat_one(const struct ptlrpc_service *svc,
+ const struct ptlrpc_nrs_pol_desc *desc)
+{
+ LASSERT(desc->pd_compat_svc_name != NULL);
+ return strcmp(svc->srv_name, desc->pd_compat_svc_name) == 0;
+}
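+
+/**
+ * Sketch (illustrative only): a policy would typically plug one of the
+ * compatibility helpers above into its registration descriptor; assuming
+ * an nc_compat-style field in the policy configuration structure, the
+ * assignment would look like:
+ *
+ * \code
+ * .nc_compat = nrs_policy_compat_all,
+ * \endcode
+ */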
+
+/** @} nrs */
+
+/* ptlrpc/events.c */
+extern lnet_handle_eq_t ptlrpc_eq_h;
+extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid,
+ lnet_process_id_t *peer, lnet_nid_t *self);
+/**
+ * These callbacks are invoked by LNet when something happened to
+ * underlying buffer
+ * @{
+ */
+extern void request_out_callback(lnet_event_t *ev);
+extern void reply_in_callback(lnet_event_t *ev);
+extern void client_bulk_callback(lnet_event_t *ev);
+extern void request_in_callback(lnet_event_t *ev);
+extern void reply_out_callback(lnet_event_t *ev);
+/** @} */
+
+/* ptlrpc/connection.c */
+struct ptlrpc_connection *ptlrpc_connection_get(lnet_process_id_t peer,
+ lnet_nid_t self,
+ struct obd_uuid *uuid);
+int ptlrpc_connection_put(struct ptlrpc_connection *c);
+struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *);
+int ptlrpc_connection_init(void);
+void ptlrpc_connection_fini(void);
+extern lnet_pid_t ptl_get_pid(void);
+
+/* ptlrpc/niobuf.c */
+/**
+ * Actual interfacing with LNet to put/get/register/unregister stuff
+ * @{
+ */
+
+int ptlrpc_register_bulk(struct ptlrpc_request *req);
+int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async);
+
+static inline int ptlrpc_client_bulk_active(struct ptlrpc_request *req)
+{
+ struct ptlrpc_bulk_desc *desc;
+ int rc;
+
+ LASSERT(req != NULL);
+ desc = req->rq_bulk;
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) &&
+ req->rq_bulk_deadline > cfs_time_current_sec())
+ return 1;
+
+ if (!desc)
+ return 0;
+
+ spin_lock(&desc->bd_lock);
+ rc = desc->bd_md_count;
+ spin_unlock(&desc->bd_lock);
+ return rc;
+}
+
+#define PTLRPC_REPLY_MAYBE_DIFFICULT 0x01
+#define PTLRPC_REPLY_EARLY 0x02
+int ptlrpc_send_reply(struct ptlrpc_request *req, int flags);
+int ptlrpc_reply(struct ptlrpc_request *req);
+int ptlrpc_send_error(struct ptlrpc_request *req, int difficult);
+int ptlrpc_error(struct ptlrpc_request *req);
+void ptlrpc_resend_req(struct ptlrpc_request *request);
+int ptlrpc_at_get_net_latency(struct ptlrpc_request *req);
+int ptl_send_rpc(struct ptlrpc_request *request, int noreply);
+int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd);
+/** @} */
+
+/* ptlrpc/client.c */
+/**
+ * Client-side portals API. Everything to send requests, receive replies,
+ * request queues, request management, etc.
+ * @{
+ */
+void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
+ struct ptlrpc_client *);
+void ptlrpc_cleanup_client(struct obd_import *imp);
+struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid);
+
+int ptlrpc_queue_wait(struct ptlrpc_request *req);
+int ptlrpc_replay_req(struct ptlrpc_request *req);
+int ptlrpc_unregister_reply(struct ptlrpc_request *req, int async);
+void ptlrpc_restart_req(struct ptlrpc_request *req);
+void ptlrpc_abort_inflight(struct obd_import *imp);
+void ptlrpc_cleanup_imp(struct obd_import *imp);
+void ptlrpc_abort_set(struct ptlrpc_request_set *set);
+
+struct ptlrpc_request_set *ptlrpc_prep_set(void);
+struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func,
+ void *arg);
+int ptlrpc_set_add_cb(struct ptlrpc_request_set *set,
+ set_interpreter_func fn, void *data);
+int ptlrpc_set_next_timeout(struct ptlrpc_request_set *);
+int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set);
+int ptlrpc_set_wait(struct ptlrpc_request_set *);
+int ptlrpc_expired_set(void *data);
+void ptlrpc_interrupted_set(void *data);
+void ptlrpc_mark_interrupted(struct ptlrpc_request *req);
+void ptlrpc_set_destroy(struct ptlrpc_request_set *);
+void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *);
+void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc,
+ struct ptlrpc_request *req);
+
+void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool);
+void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq);
+
+struct ptlrpc_request_pool *
+ptlrpc_init_rq_pool(int, int,
+ void (*populate_pool)(struct ptlrpc_request_pool *, int));
+
+void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req);
+struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp,
+ const struct req_format *format);
+struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp,
+ struct ptlrpc_request_pool *,
+ const struct req_format *format);
+void ptlrpc_request_free(struct ptlrpc_request *request);
+int ptlrpc_request_pack(struct ptlrpc_request *request,
+ __u32 version, int opcode);
+struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp,
+ const struct req_format *format,
+ __u32 version, int opcode);
+int ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
+ __u32 version, int opcode, char **bufs,
+ struct ptlrpc_cli_ctx *ctx);
+struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version,
+ int opcode, int count, __u32 *lengths,
+ char **bufs);
+struct ptlrpc_request *ptlrpc_prep_req_pool(struct obd_import *imp,
+ __u32 version, int opcode,
+ int count, __u32 *lengths, char **bufs,
+ struct ptlrpc_request_pool *pool);
+void ptlrpc_req_finished(struct ptlrpc_request *request);
+void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request);
+struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req);
+struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req,
+ unsigned npages, unsigned max_brw,
+ unsigned type, unsigned portal);
+void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk, int pin);
+static inline void ptlrpc_free_bulk_pin(struct ptlrpc_bulk_desc *bulk)
+{
+ __ptlrpc_free_bulk(bulk, 1);
+}
+static inline void ptlrpc_free_bulk_nopin(struct ptlrpc_bulk_desc *bulk)
+{
+ __ptlrpc_free_bulk(bulk, 0);
+}
+void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
+ struct page *page, int pageoffset, int len, int);
+static inline void ptlrpc_prep_bulk_page_pin(struct ptlrpc_bulk_desc *desc,
+ struct page *page, int pageoffset,
+ int len)
+{
+ __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 1);
+}
+
+static inline void ptlrpc_prep_bulk_page_nopin(struct ptlrpc_bulk_desc *desc,
+ struct page *page, int pageoffset,
+ int len)
+{
+ __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 0);
+}
+
+void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
+ struct obd_import *imp);
+__u64 ptlrpc_next_xid(void);
+__u64 ptlrpc_sample_next_xid(void);
+__u64 ptlrpc_req_xid(struct ptlrpc_request *request);
+
+/* Set of routines to run a function in ptlrpcd context */
+void *ptlrpcd_alloc_work(struct obd_import *imp,
+ int (*cb)(const struct lu_env *, void *), void *data);
+void ptlrpcd_destroy_work(void *handler);
+int ptlrpcd_queue_work(void *handler);
+
+/** @} */
+struct ptlrpc_service_buf_conf {
+ /* number of buffers to allocate when growing the pool */
+ unsigned int bc_nbufs;
+ /* buffer size to post */
+ unsigned int bc_buf_size;
+ /* portal to listen on for requests */
+ unsigned int bc_req_portal;
+ /* portal to send replies to */
+ unsigned int bc_rep_portal;
+ /* maximum request size to be accepted for this service */
+ unsigned int bc_req_max_size;
+ /* maximum reply size this service can ever send */
+ unsigned int bc_rep_max_size;
+};
+
+struct ptlrpc_service_thr_conf {
+ /* thread name should be 8 characters or fewer - 6 more will be appended */
+ char *tc_thr_name;
+ /* factor by which the thread count increases per CPU */
+ unsigned int tc_thr_factor;
+ /* # of service threads to start on each partition at initialization */
+ unsigned int tc_nthrs_init;
+ /*
+ * lower bound of the per-partition thread upper-limit while running;
+ * service availability may be impacted if the thread count drops
+ * below this value. It can be ZERO if the service doesn't require
+ * CPU affinity or there is only one partition.
+ */
+ unsigned int tc_nthrs_base;
+ /* "soft" limit for total threads number */
+ unsigned int tc_nthrs_max;
+ /* user-specified thread count; it is validated against the
+ * other members of this structure. */
+ unsigned int tc_nthrs_user;
+ /* set NUMA node affinity for service threads */
+ unsigned int tc_cpu_affinity;
+ /* Tags for lu_context associated with service thread */
+ __u32 tc_ctx_tags;
+};
+
+struct ptlrpc_service_cpt_conf {
+ struct cfs_cpt_table *cc_cptable;
+ /* string pattern to describe CPTs for a service */
+ char *cc_pattern;
+};
+
+struct ptlrpc_service_conf {
+ /* service name */
+ char *psc_name;
+ /* soft watchdog timeout multiplier, used to print stuck-service traces */
+ unsigned int psc_watchdog_factor;
+ /* buffer information */
+ struct ptlrpc_service_buf_conf psc_buf;
+ /* thread information */
+ struct ptlrpc_service_thr_conf psc_thr;
+ /* CPU partition information */
+ struct ptlrpc_service_cpt_conf psc_cpt;
+ /* function table */
+ struct ptlrpc_service_ops psc_ops;
+};
+
+/* ptlrpc/service.c */
+/**
+ * Server-side services API. Register/unregister service, request state
+ * management, service thread management
+ *
+ * @{
+ */
+void ptlrpc_save_lock(struct ptlrpc_request *req,
+ struct lustre_handle *lock, int mode, int no_ack);
+void ptlrpc_commit_replies(struct obd_export *exp);
+void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs);
+void ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs);
+int ptlrpc_hpreq_handler(struct ptlrpc_request *req);
+struct ptlrpc_service *ptlrpc_register_service(
+ struct ptlrpc_service_conf *conf,
+ struct proc_dir_entry *proc_entry);
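+
+/**
+ * Putting the service definitions together, a registration sketch
+ * (illustrative only; the name, thread counts and buffer sizes below are
+ * placeholders, not values used by any real service, and example_req_handler
+ * is the hypothetical handler sketched near ptlrpc_service_ops above):
+ *
+ * \code
+ * static struct ptlrpc_service_conf example_conf = {
+ *	.psc_name		= "example",
+ *	.psc_watchdog_factor	= 2,
+ *	.psc_buf = {
+ *		.bc_nbufs		= 64,
+ *		.bc_buf_size		= 4096,
+ *		.bc_req_max_size	= 4096,
+ *		.bc_rep_max_size	= 4096,
+ *		.bc_req_portal		= OST_REQUEST_PORTAL,
+ *		.bc_rep_portal		= OSC_REPLY_PORTAL,
+ *	},
+ *	.psc_thr = {
+ *		.tc_thr_name		= "ll_ex",
+ *		.tc_nthrs_init		= 2,
+ *		.tc_nthrs_max		= 8,
+ *	},
+ *	.psc_ops = {
+ *		.so_req_handler		= example_req_handler,
+ *	},
+ * };
+ *
+ * svc = ptlrpc_register_service(&example_conf, proc_entry);
+ * \endcode
+ */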
+void ptlrpc_stop_all_threads(struct ptlrpc_service *svc);
+
+int ptlrpc_start_threads(struct ptlrpc_service *svc);
+int ptlrpc_unregister_service(struct ptlrpc_service *service);
+int liblustre_check_services(void *arg);
+void ptlrpc_daemonize(char *name);
+int ptlrpc_service_health_check(struct ptlrpc_service *);
+void ptlrpc_server_drop_request(struct ptlrpc_request *req);
+void ptlrpc_request_change_export(struct ptlrpc_request *req,
+ struct obd_export *export);
+
+int ptlrpc_hr_init(void);
+void ptlrpc_hr_fini(void);
+
+/** @} */
+
+/* ptlrpc/import.c */
+/**
+ * Import API
+ * @{
+ */
+int ptlrpc_connect_import(struct obd_import *imp);
+int ptlrpc_init_import(struct obd_import *imp);
+int ptlrpc_disconnect_import(struct obd_import *imp, int noclose);
+int ptlrpc_import_recovery_state_machine(struct obd_import *imp);
+void deuuidify(char *uuid, const char *prefix, char **uuid_start,
+ int *uuid_len);
+
+/* ptlrpc/pack_generic.c */
+int ptlrpc_reconnect_import(struct obd_import *imp);
+/** @} */
+
+/**
+ * ptlrpc msg buffer and swab interface
+ *
+ * @{
+ */
+int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout,
+ int index);
+void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout,
+ int index);
+int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len);
+int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len);
+
+int lustre_msg_check_version(struct lustre_msg *msg, __u32 version);
+void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens,
+ char **bufs);
+int lustre_pack_request(struct ptlrpc_request *, __u32 magic, int count,
+ __u32 *lens, char **bufs);
+int lustre_pack_reply(struct ptlrpc_request *, int count, __u32 *lens,
+ char **bufs);
+int lustre_pack_reply_v2(struct ptlrpc_request *req, int count,
+ __u32 *lens, char **bufs, int flags);
+#define LPRFL_EARLY_REPLY 1
+int lustre_pack_reply_flags(struct ptlrpc_request *, int count, __u32 *lens,
+ char **bufs, int flags);
+int lustre_shrink_msg(struct lustre_msg *msg, int segment,
+ unsigned int newlen, int move_data);
+void lustre_free_reply_state(struct ptlrpc_reply_state *rs);
+int __lustre_unpack_msg(struct lustre_msg *m, int len);
+int lustre_msg_hdr_size(__u32 magic, int count);
+int lustre_msg_size(__u32 magic, int count, __u32 *lengths);
+int lustre_msg_size_v2(int count, __u32 *lengths);
+int lustre_packed_msg_size(struct lustre_msg *msg);
+int lustre_msg_early_size(void);
+void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size);
+void *lustre_msg_buf(struct lustre_msg *m, int n, int minlen);
+int lustre_msg_buflen(struct lustre_msg *m, int n);
+void lustre_msg_set_buflen(struct lustre_msg *m, int n, int len);
+int lustre_msg_bufcount(struct lustre_msg *m);
+char *lustre_msg_string(struct lustre_msg *m, int n, int max_len);
+__u32 lustre_msghdr_get_flags(struct lustre_msg *msg);
+void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags);
+__u32 lustre_msg_get_flags(struct lustre_msg *msg);
+void lustre_msg_add_flags(struct lustre_msg *msg, int flags);
+void lustre_msg_set_flags(struct lustre_msg *msg, int flags);
+void lustre_msg_clear_flags(struct lustre_msg *msg, int flags);
+__u32 lustre_msg_get_op_flags(struct lustre_msg *msg);
+void lustre_msg_add_op_flags(struct lustre_msg *msg, int flags);
+void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags);
+struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg);
+__u32 lustre_msg_get_type(struct lustre_msg *msg);
+__u32 lustre_msg_get_version(struct lustre_msg *msg);
+void lustre_msg_add_version(struct lustre_msg *msg, int version);
+__u32 lustre_msg_get_opc(struct lustre_msg *msg);
+__u64 lustre_msg_get_last_xid(struct lustre_msg *msg);
+__u64 lustre_msg_get_last_committed(struct lustre_msg *msg);
+__u64 *lustre_msg_get_versions(struct lustre_msg *msg);
+__u64 lustre_msg_get_transno(struct lustre_msg *msg);
+__u64 lustre_msg_get_slv(struct lustre_msg *msg);
+__u32 lustre_msg_get_limit(struct lustre_msg *msg);
+void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv);
+void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit);
+int lustre_msg_get_status(struct lustre_msg *msg);
+__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg);
+int lustre_msg_is_v1(struct lustre_msg *msg);
+__u32 lustre_msg_get_magic(struct lustre_msg *msg);
+__u32 lustre_msg_get_timeout(struct lustre_msg *msg);
+__u32 lustre_msg_get_service_time(struct lustre_msg *msg);
+char *lustre_msg_get_jobid(struct lustre_msg *msg);
+__u32 lustre_msg_get_cksum(struct lustre_msg *msg);
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+__u32 lustre_msg_calc_cksum(struct lustre_msg *msg, int compat18);
+#else
+# warning "remove checksum compatibility support for b1_8"
+__u32 lustre_msg_calc_cksum(struct lustre_msg *msg);
+#endif
+void lustre_msg_set_handle(struct lustre_msg *msg,struct lustre_handle *handle);
+void lustre_msg_set_type(struct lustre_msg *msg, __u32 type);
+void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc);
+void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid);
+void lustre_msg_set_last_committed(struct lustre_msg *msg,__u64 last_committed);
+void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions);
+void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno);
+void lustre_msg_set_status(struct lustre_msg *msg, __u32 status);
+void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt);
+void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *sizes);
+void ptlrpc_request_set_replen(struct ptlrpc_request *req);
+void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout);
+void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time);
+void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid);
+void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum);
+
+static inline void
+lustre_shrink_reply(struct ptlrpc_request *req, int segment,
+ unsigned int newlen, int move_data)
+{
+ LASSERT(req->rq_reply_state);
+ LASSERT(req->rq_repmsg);
+ req->rq_replen = lustre_shrink_msg(req->rq_repmsg, segment,
+ newlen, move_data);
+}
+/** @} */
+
+/** Change request phase of \a req to \a new_phase */
+static inline void
+ptlrpc_rqphase_move(struct ptlrpc_request *req, enum rq_phase new_phase)
+{
+ if (req->rq_phase == new_phase)
+ return;
+
+ if (new_phase == RQ_PHASE_UNREGISTERING) {
+ req->rq_next_phase = req->rq_phase;
+ if (req->rq_import)
+ atomic_inc(&req->rq_import->imp_unregistering);
+ }
+
+ if (req->rq_phase == RQ_PHASE_UNREGISTERING) {
+ if (req->rq_import)
+ atomic_dec(&req->rq_import->imp_unregistering);
+ }
+
+ DEBUG_REQ(D_INFO, req, "move req \"%s\" -> \"%s\"",
+ ptlrpc_rqphase2str(req), ptlrpc_phase2str(new_phase));
+
+ req->rq_phase = new_phase;
+}
+
+/**
+ * Returns true if request \a req got an early reply and its hard deadline
+ * has not been reached
+ */
+static inline int
+ptlrpc_client_early(struct ptlrpc_request *req)
+{
+ if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+ req->rq_reply_deadline > cfs_time_current_sec())
+ return 0;
+ return req->rq_early;
+}
+
+/**
+ * Returns true if we got a real reply from the server for this request
+ */
+static inline int
+ptlrpc_client_replied(struct ptlrpc_request *req)
+{
+ if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+ req->rq_reply_deadline > cfs_time_current_sec())
+ return 0;
+ return req->rq_replied;
+}
+
+/** Returns true if request \a req is in the process of receiving the server reply */
+static inline int
+ptlrpc_client_recv(struct ptlrpc_request *req)
+{
+ if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+ req->rq_reply_deadline > cfs_time_current_sec())
+ return 1;
+ return req->rq_receiving_reply;
+}
+
+static inline int
+ptlrpc_client_recv_or_unlink(struct ptlrpc_request *req)
+{
+ int rc;
+
+ spin_lock(&req->rq_lock);
+ if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+ req->rq_reply_deadline > cfs_time_current_sec()) {
+ spin_unlock(&req->rq_lock);
+ return 1;
+ }
+ rc = req->rq_receiving_reply || req->rq_must_unlink;
+ spin_unlock(&req->rq_lock);
+ return rc;
+}
+
+static inline void
+ptlrpc_client_wake_req(struct ptlrpc_request *req)
+{
+ if (req->rq_set == NULL)
+ wake_up(&req->rq_reply_waitq);
+ else
+ wake_up(&req->rq_set->set_waitq);
+}
+
+static inline void
+ptlrpc_rs_addref(struct ptlrpc_reply_state *rs)
+{
+ LASSERT(atomic_read(&rs->rs_refcount) > 0);
+ atomic_inc(&rs->rs_refcount);
+}
+
+static inline void
+ptlrpc_rs_decref(struct ptlrpc_reply_state *rs)
+{
+ LASSERT(atomic_read(&rs->rs_refcount) > 0);
+ if (atomic_dec_and_test(&rs->rs_refcount))
+ lustre_free_reply_state(rs);
+}
+
+/* Should only be called once per req */
+static inline void ptlrpc_req_drop_rs(struct ptlrpc_request *req)
+{
+ if (req->rq_reply_state == NULL)
+ return; /* shouldn't occur */
+ ptlrpc_rs_decref(req->rq_reply_state);
+ req->rq_reply_state = NULL;
+ req->rq_repmsg = NULL;
+}
+
+static inline __u32 lustre_request_magic(struct ptlrpc_request *req)
+{
+ return lustre_msg_get_magic(req->rq_reqmsg);
+}
+
+static inline int ptlrpc_req_get_repsize(struct ptlrpc_request *req)
+{
+ switch (req->rq_reqmsg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2:
+ return req->rq_reqmsg->lm_repsize;
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n",
+ req->rq_reqmsg->lm_magic);
+ return -EFAULT;
+ }
+}
+
+static inline int ptlrpc_send_limit_expired(struct ptlrpc_request *req)
+{
+ if (req->rq_delay_limit != 0 &&
+ cfs_time_before(cfs_time_add(req->rq_queued_time,
+ cfs_time_seconds(req->rq_delay_limit)),
+ cfs_time_current())) {
+ return 1;
+ }
+ return 0;
+}
+
+static inline int ptlrpc_no_resend(struct ptlrpc_request *req)
+{
+ if (!req->rq_no_resend && ptlrpc_send_limit_expired(req)) {
+ spin_lock(&req->rq_lock);
+ req->rq_no_resend = 1;
+ spin_unlock(&req->rq_lock);
+ }
+ return req->rq_no_resend;
+}
+
+static inline int
+ptlrpc_server_get_timeout(struct ptlrpc_service_part *svcpt)
+{
+ int at = AT_OFF ? 0 : at_get(&svcpt->scp_at_estimate);
+
+ return svcpt->scp_service->srv_watchdog_factor *
+ max_t(int, at, obd_timeout);
+}
+
+static inline struct ptlrpc_service *
+ptlrpc_req2svc(struct ptlrpc_request *req)
+{
+ LASSERT(req->rq_rqbd != NULL);
+ return req->rq_rqbd->rqbd_svcpt->scp_service;
+}
+
+/* ldlm/ldlm_lib.c */
+/**
+ * Target client logic
+ * @{
+ */
+int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg);
+int client_obd_cleanup(struct obd_device *obddev);
+int client_connect_import(const struct lu_env *env,
+ struct obd_export **exp, struct obd_device *obd,
+ struct obd_uuid *cluuid, struct obd_connect_data *,
+ void *localdata);
+int client_disconnect_export(struct obd_export *exp);
+int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid,
+ int priority);
+int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid);
+int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer,
+ struct obd_uuid *uuid);
+int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid);
+void client_destroy_import(struct obd_import *imp);
+/** @} */
+
+
+/* ptlrpc/pinger.c */
+/**
+ * Pinger API (client side only)
+ * @{
+ */
+enum timeout_event {
+ TIMEOUT_GRANT = 1
+};
+struct timeout_item;
+typedef int (*timeout_cb_t)(struct timeout_item *, void *);
+int ptlrpc_pinger_add_import(struct obd_import *imp);
+int ptlrpc_pinger_del_import(struct obd_import *imp);
+int ptlrpc_add_timeout_client(int time, enum timeout_event event,
+ timeout_cb_t cb, void *data,
+ struct list_head *obd_list);
+int ptlrpc_del_timeout_client(struct list_head *obd_list,
+ enum timeout_event event);
+struct ptlrpc_request * ptlrpc_prep_ping(struct obd_import *imp);
+int ptlrpc_obd_ping(struct obd_device *obd);
+cfs_time_t ptlrpc_suspend_wakeup_time(void);
+void ping_evictor_start(void);
+void ping_evictor_stop(void);
+int ptlrpc_check_and_wait_suspend(struct ptlrpc_request *req);
+void ptlrpc_pinger_ir_up(void);
+void ptlrpc_pinger_ir_down(void);
+/** @} */
+int ptlrpc_pinger_suppress_pings(void);
+
+/* ptlrpc daemon bind policy */
+typedef enum {
+ /* all ptlrpcd threads are in free mode */
+ PDB_POLICY_NONE = 1,
+ /* all ptlrpcd threads are in bound mode */
+ PDB_POLICY_FULL = 2,
+ /* <free1 bound1> <free2 bound2> ... <freeN boundN> */
+ PDB_POLICY_PAIR = 3,
+ /* <free1 bound1> <bound1 free2> ... <freeN boundN> <boundN free1>,
+ * meaning each ptlrpcd[X] has two partners: thread[X-1] and thread[X+1].
+ * If the kernel supports NUMA, ptlrpcd threads are bound and
+ * grouped by NUMA node */
+ PDB_POLICY_NEIGHBOR = 4,
+} pdb_policy_t;
+
+/* ptlrpc daemon load policy
+ * It is the caller's duty to specify which ptlrpcd queue an async RPC is
+ * pushed into, but the choice is not enforced; it is affected by
+ * "ptlrpcd_bind_policy". If that is "PDB_POLICY_FULL", the RPC will be
+ * processed by the selected ptlrpcd; otherwise the RPC may be processed
+ * by the selected ptlrpcd or its partner, whichever is scheduled first,
+ * to accelerate RPC processing. */
+typedef enum {
+ /* on the same CPU core as the caller */
+ PDL_POLICY_SAME = 1,
+ /* within the same CPU partition, but not the same core as the caller */
+ PDL_POLICY_LOCAL = 2,
+ /* round-robin on all CPU cores, but not the same core as the caller */
+ PDL_POLICY_ROUND = 3,
+ /* the specified CPU core is preferred, but not enforced */
+ PDL_POLICY_PREFERRED = 4,
+} pdl_policy_t;
+
+/* ptlrpc/ptlrpcd.c */
+void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force);
+void ptlrpcd_free(struct ptlrpcd_ctl *pc);
+void ptlrpcd_wake(struct ptlrpc_request *req);
+void ptlrpcd_add_req(struct ptlrpc_request *req, pdl_policy_t policy, int idx);
+void ptlrpcd_add_rqset(struct ptlrpc_request_set *set);
+int ptlrpcd_addref(void);
+void ptlrpcd_decref(void);
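+
+/**
+ * Typical client-side pattern, as a sketch (illustrative only): once a
+ * request is fully packed, it is handed to a ptlrpcd thread for
+ * asynchronous processing; the load policy selects the queue, and an
+ * index of -1 lets ptlrpcd pick the thread:
+ *
+ * \code
+ * ptlrpc_request_set_replen(req);
+ * ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+ * \endcode
+ */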
+
+/* ptlrpc/lproc_ptlrpc.c */
+/**
+ * procfs output related functions
+ * @{
+ */
+const char* ll_opcode2str(__u32 opcode);
+#ifdef LPROCFS
+void ptlrpc_lprocfs_register_obd(struct obd_device *obd);
+void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd);
+void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes);
+#else
+static inline void ptlrpc_lprocfs_register_obd(struct obd_device *obd) {}
+static inline void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) {}
+static inline void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) {}
+#endif
+/** @} */
+
+/* ptlrpc/llog_server.c */
+int llog_origin_handle_open(struct ptlrpc_request *req);
+int llog_origin_handle_destroy(struct ptlrpc_request *req);
+int llog_origin_handle_prev_block(struct ptlrpc_request *req);
+int llog_origin_handle_next_block(struct ptlrpc_request *req);
+int llog_origin_handle_read_header(struct ptlrpc_request *req);
+int llog_origin_handle_close(struct ptlrpc_request *req);
+int llog_origin_handle_cancel(struct ptlrpc_request *req);
+
+/* ptlrpc/llog_client.c */
+extern struct llog_operations llog_client_ops;
+
+/** @} net */
+
+#endif
+/** @} PtlRPC */
diff --git a/drivers/staging/lustre/lustre/include/lustre_param.h b/drivers/staging/lustre/lustre/include/lustre_param.h
new file mode 100644
index 000000000000..ed654684cb64
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_param.h
@@ -0,0 +1,121 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_param.h
+ *
+ * User-settable parameter keys
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+#ifndef _LUSTRE_PARAM_H
+#define _LUSTRE_PARAM_H
+
+/** \defgroup param param
+ *
+ * @{
+ */
+
+/* For interoperability */
+struct cfg_interop_param {
+ char *old_param;
+ char *new_param;
+};
+
+/* obd_config.c */
+int class_find_param(char *buf, char *key, char **valp);
+struct cfg_interop_param *class_find_old_param(const char *param,
+ struct cfg_interop_param *ptr);
+int class_get_next_param(char **params, char *copy);
+int class_match_param(char *buf, char *key, char **valp);
+int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh);
+int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh);
+int class_parse_net(char *buf, __u32 *net, char **endh);
+int class_match_nid(char *buf, char *key, lnet_nid_t nid);
+int class_match_net(char *buf, char *key, __u32 net);
+/* obd_mount.c */
+int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
+ char *s1, char *s2, char *s3, char *s4);
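+
+/* Usage sketch (illustrative only; "params", "val" and "nid" are placeholder
+ * variables): find the value of a key in a parameter string with
+ * class_find_param(), then parse a NID out of the value:
+ *
+ *	char *val;
+ *	lnet_nid_t nid;
+ *
+ *	if (class_find_param(params, PARAM_FAILNODE, &val) == 0)
+ *		rc = class_parse_nid(val, &nid, &val);
+ */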
+
+
+
+/****************** User-settable parameter keys *********************/
+/* e.g.
+ tunefs.lustre --param="failover.node=192.168.0.13@tcp0" /dev/sda
+ lctl conf_param testfs-OST0000 failover.node=3@elan,192.168.0.3@tcp0
+ ... testfs-MDT0000.lov.stripesize=4M
+ ... testfs-OST0000.ost.client_cache_seconds=15
+ ... testfs.sys.timeout=<secs>
+ ... testfs.llite.max_read_ahead_mb=16
+*/
+
+/* System global or special params not handled in obd's proc
+ * See mgs_write_log_sys()
+ */
+#define PARAM_TIMEOUT "timeout=" /* global */
+#define PARAM_LDLM_TIMEOUT "ldlm_timeout=" /* global */
+#define PARAM_AT_MIN "at_min=" /* global */
+#define PARAM_AT_MAX "at_max=" /* global */
+#define PARAM_AT_EXTRA "at_extra=" /* global */
+#define PARAM_AT_EARLY_MARGIN "at_early_margin=" /* global */
+#define PARAM_AT_HISTORY "at_history=" /* global */
+#define PARAM_JOBID_VAR "jobid_var=" /* global */
+#define PARAM_MGSNODE "mgsnode=" /* only at mounttime */
+#define PARAM_FAILNODE "failover.node=" /* add failover nid */
+#define PARAM_FAILMODE "failover.mode=" /* initial mount only */
+#define PARAM_ACTIVE "active=" /* activate/deactivate */
+#define PARAM_NETWORK "network=" /* bind on nid */
+#define PARAM_ID_UPCALL "identity_upcall=" /* identity upcall */
+
+/* Prefixes for parameters handled by obd's proc methods (XXX_process_config) */
+#define PARAM_OST "ost."
+#define PARAM_OSC "osc."
+#define PARAM_MDT "mdt."
+#define PARAM_MDD "mdd."
+#define PARAM_MDC "mdc."
+#define PARAM_LLITE "llite."
+#define PARAM_LOV "lov."
+#define PARAM_LOD "lod."
+#define PARAM_OSP "osp."
+#define PARAM_SYS "sys." /* global */
+#define PARAM_SRPC "srpc."
+#define PARAM_SRPC_FLVR "srpc.flavor."
+#define PARAM_SRPC_UDESC "srpc.udesc.cli2mdt"
+#define PARAM_SEC "security."
+#define PARAM_QUOTA "quota." /* global */
+
+/** @} param */
+
+#endif /* _LUSTRE_PARAM_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_quota.h b/drivers/staging/lustre/lustre/include/lustre_quota.h
new file mode 100644
index 000000000000..1c3041f50049
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_quota.h
@@ -0,0 +1,239 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LUSTRE_QUOTA_H
+#define _LUSTRE_QUOTA_H
+
+/** \defgroup quota quota
+ *
+ */
+
+#include <linux/lustre_quota.h>
+
+#include <dt_object.h>
+#include <lustre_fid.h>
+#include <lustre_dlm.h>
+
+#ifndef MAX_IQ_TIME
+#define MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */
+#endif
+
+#ifndef MAX_DQ_TIME
+#define MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */
+#endif
+
+struct lquota_id_info;
+struct lquota_trans;
+
+/* Gathers all quota record types in a union that can be used to read any
+ * record from disk. All fields of these records must be 64-bit aligned;
+ * otherwise the OSD layer may swab them incorrectly. */
+union lquota_rec {
+ struct lquota_glb_rec lqr_glb_rec;
+ struct lquota_slv_rec lqr_slv_rec;
+ struct lquota_acct_rec lqr_acct_rec;
+};
+
+/* Index features supported by the global index objects
+ * Only used for migration purposes and should be removed once on-disk migration
+ * is no longer needed */
+extern struct dt_index_features dt_quota_iusr_features;
+extern struct dt_index_features dt_quota_busr_features;
+extern struct dt_index_features dt_quota_igrp_features;
+extern struct dt_index_features dt_quota_bgrp_features;
+
+/* Name used in the configuration logs to identify the default metadata pool
+ * (composed of all the MDTs, with pool ID 0) and the default data pool (all
+ * the OSTs, with pool ID 0 too). */
+#define QUOTA_METAPOOL_NAME "mdt="
+#define QUOTA_DATAPOOL_NAME "ost="
+
+/*
+ * Quota Master Target support
+ */
+
+/* Request handlers for quota master operations.
+ * This is used by the MDT to pass quota/lock requests to the quota master
+ * target. This will no longer be needed once the QMT is a real target
+ * and no longer relies on the MDT service threads and namespace. */
+struct qmt_handlers {
+ /* Handle quotactl request from client. */
+ int (*qmth_quotactl)(const struct lu_env *, struct lu_device *,
+ struct obd_quotactl *);
+
+ /* Handle dqacq/dqrel request from slave. */
+ int (*qmth_dqacq)(const struct lu_env *, struct lu_device *,
+ struct ptlrpc_request *);
+
+ /* LDLM intent policy associated with quota locks */
+ int (*qmth_intent_policy)(const struct lu_env *, struct lu_device *,
+ struct ptlrpc_request *, struct ldlm_lock **,
+ int);
+
+ /* Initialize LVB of ldlm resource associated with quota objects */
+ int (*qmth_lvbo_init)(struct lu_device *, struct ldlm_resource *);
+
+ /* Update LVB of ldlm resource associated with quota objects */
+ int (*qmth_lvbo_update)(struct lu_device *, struct ldlm_resource *,
+ struct ptlrpc_request *, int);
+
+ /* Return size of LVB to be packed in ldlm message */
+ int (*qmth_lvbo_size)(struct lu_device *, struct ldlm_lock *);
+
+ /* Fill request buffer with lvb */
+ int (*qmth_lvbo_fill)(struct lu_device *, struct ldlm_lock *, void *,
+ int);
+
+ /* Free lvb associated with ldlm resource */
+ int (*qmth_lvbo_free)(struct lu_device *, struct ldlm_resource *);
+};
+
+/* actual handlers are defined in lustre/quota/qmt_handler.c */
+extern struct qmt_handlers qmt_hdls;
+
+/*
+ * Quota enforcement support on slaves
+ */
+
+struct qsd_instance;
+
+/* The quota slave feature is implemented in the form of a library.
+ * The API is the following:
+ *
+ * - qsd_init(): the user (mostly the OSD layer) should first allocate a qsd
+ * instance via qsd_init(). This creates all required structures
+ * to manage quota enforcement for this target and performs all
+ * low-level initialization which does not involve any lustre
+ * object. qsd_init() should typically be called when the OSD
+ * is being set up.
+ *
+ * - qsd_prepare(): This sets up on-disk objects associated with the quota slave
+ * feature and initiates the quota reintegration procedure if
+ * needed. qsd_prepare() should typically be called when
+ * ->ldo_prepare is invoked.
+ *
+ * - qsd_start(): a qsd instance should be started once recovery is completed
+ * (i.e. when ->ldo_recovery_complete is called). This is used
+ * to notify the qsd layer that quota should now be enforced
+ * again via the qsd_op_begin/end functions. The last step of the
+ * reintegration procedure (namely usage reconciliation) will be
+ * completed during start.
+ *
+ * - qsd_fini(): is used to release a qsd_instance structure allocated with
+ * qsd_init(). This releases all quota slave objects and frees the
+ * structures associated with the qsd_instance.
+ *
+ * - qsd_op_begin(): is used to enforce quota, it must be called in the
+ * declaration of each operation. qsd_op_end() should then be
+ * invoked later once all operations have been completed in
+ * order to release/adjust the quota space.
+ * Running qsd_op_begin() before qsd_start() isn't fatal and
+ * will return success.
+ * Once qsd_start() has been run, qsd_op_begin() will block
+ * until the reintegration procedure is completed.
+ *
+ * - qsd_op_end(): performs the post-operation quota processing. This must be
+ *		 called after the operation's transaction has stopped.
+ * While qsd_op_begin() must be invoked each time a new
+ * operation is declared, qsd_op_end() should be called only
+ * once for the whole transaction.
+ *
+ * - qsd_op_adjust(): triggers pre-acquire/release if necessary.
+ *
+ * Below are the function prototypes to be used by OSD layer to manage quota
+ * enforcement. Arguments are documented where each function is defined. */
+
+struct qsd_instance *qsd_init(const struct lu_env *, char *, struct dt_device *,
+ proc_dir_entry_t *);
+int qsd_prepare(const struct lu_env *, struct qsd_instance *);
+int qsd_start(const struct lu_env *, struct qsd_instance *);
+void qsd_fini(const struct lu_env *, struct qsd_instance *);
+int qsd_op_begin(const struct lu_env *, struct qsd_instance *,
+ struct lquota_trans *, struct lquota_id_info *, int *);
+void qsd_op_end(const struct lu_env *, struct qsd_instance *,
+ struct lquota_trans *);
+void qsd_op_adjust(const struct lu_env *, struct qsd_instance *,
+ union lquota_id *, int);
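+
+/* A typical OSD-side lifecycle, as a sketch (illustrative only; error
+ * handling is omitted, and "osd-example", dt_dev, trans and qi are
+ * placeholder names):
+ *
+ *	qsd = qsd_init(env, "osd-example", dt_dev, proc_entry);
+ *	rc = qsd_prepare(env, qsd);	(from ->ldo_prepare)
+ *	rc = qsd_start(env, qsd);	(once recovery is complete)
+ *
+ *	rc = qsd_op_begin(env, qsd, trans, qi, &flags);
+ *	(... declare and execute the operation, stop the transaction ...)
+ *	qsd_op_end(env, qsd, trans);
+ *
+ *	qsd_fini(env, qsd);		(at umount time)
+ */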
+/* This is exported for the ldiskfs quota migration only,
+ * see convert_quota_file() */
+int lquota_disk_write_glb(const struct lu_env *, struct dt_object *,
+ __u64, struct lquota_glb_rec *);
+
+/*
+ * Quota information attached to a transaction
+ */
+
+struct lquota_entry;
+
+struct lquota_id_info {
+ /* quota identifier */
+ union lquota_id lqi_id;
+
+ /* USRQUOTA or GRPQUOTA for now, could be expanded for
+ * directory quota or other types later. */
+ int lqi_type;
+
+ /* inodes or kbytes to be consumed or released; it can be
+ * negative when releasing space. */
+ long long lqi_space;
+
+ /* quota slave entry structure associated with this ID */
+ struct lquota_entry *lqi_qentry;
+
+ /* whether we are reporting blocks or inodes */
+ bool lqi_is_blk;
+};
+
+/* Since we enforce only inode quota in meta pool (MDTs), and block quota in
+ * data pool (OSTs), there are at most 4 quota ids enforced in a single
+ * transaction, namely a chown transaction:
+ * the original uid and gid, and the new uid and gid.
+ *
+ * This value might need to be revised when directory quota is added. */
+#define QUOTA_MAX_TRANSIDS 4
+
+/* all qids involved in a single transaction */
+struct lquota_trans {
+ unsigned short lqt_id_cnt;
+ struct lquota_id_info lqt_ids[QUOTA_MAX_TRANSIDS];
+};
+
+/* flags for quota local enforcement */
+#define QUOTA_FL_OVER_USRQUOTA 0x01
+#define QUOTA_FL_OVER_GRPQUOTA 0x02
+#define QUOTA_FL_SYNC 0x04
+
+#define IS_LQUOTA_RES(res) \
+ (res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA || \
+ res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA_GLB)
+
+/* helper function used by MDT & OFD to retrieve quota accounting information
+ * on slave */
+int lquotactl_slv(const struct lu_env *, struct dt_device *,
+ struct obd_quotactl *);
+/** @} quota */
+#endif /* _LUSTRE_QUOTA_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_req_layout.h b/drivers/staging/lustre/lustre/include/lustre_req_layout.h
new file mode 100644
index 000000000000..f4d3820865f1
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_req_layout.h
@@ -0,0 +1,334 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_req_layout.h
+ *
+ * Lustre Metadata Target (mdt) request handler
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#ifndef _LUSTRE_REQ_LAYOUT_H__
+#define _LUSTRE_REQ_LAYOUT_H__
+
+/** \defgroup req_layout req_layout
+ *
+ * @{
+ */
+
+struct req_msg_field;
+struct req_format;
+struct req_capsule;
+
+struct ptlrpc_request;
+
+enum req_location {
+ RCL_CLIENT,
+ RCL_SERVER,
+ RCL_NR
+};
+
+/* Maximal number of fields (buffers) in a request message. */
+#define REQ_MAX_FIELD_NR 9
+
+struct req_capsule {
+ struct ptlrpc_request *rc_req;
+ const struct req_format *rc_fmt;
+ enum req_location rc_loc;
+ __u32 rc_area[RCL_NR][REQ_MAX_FIELD_NR];
+};
+
+#if !defined(__REQ_LAYOUT_USER__)
+
+/* struct ptlrpc_request, lustre_msg* */
+#include <lustre_net.h>
+
+void req_capsule_init(struct req_capsule *pill, struct ptlrpc_request *req,
+ enum req_location location);
+void req_capsule_fini(struct req_capsule *pill);
+
+void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt);
+void req_capsule_client_dump(struct req_capsule *pill);
+void req_capsule_server_dump(struct req_capsule *pill);
+void req_capsule_init_area(struct req_capsule *pill);
+int req_capsule_filled_sizes(struct req_capsule *pill, enum req_location loc);
+int req_capsule_server_pack(struct req_capsule *pill);
+
+void *req_capsule_client_get(struct req_capsule *pill,
+ const struct req_msg_field *field);
+void *req_capsule_client_swab_get(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ void *swabber);
+void *req_capsule_client_sized_get(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ int len);
+void *req_capsule_server_get(struct req_capsule *pill,
+ const struct req_msg_field *field);
+void *req_capsule_server_sized_get(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ int len);
+void *req_capsule_server_swab_get(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ void *swabber);
+void *req_capsule_server_sized_swab_get(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ int len, void *swabber);
+const void *req_capsule_other_get(struct req_capsule *pill,
+ const struct req_msg_field *field);
+
+void req_capsule_set_size(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ enum req_location loc, int size);
+int req_capsule_get_size(const struct req_capsule *pill,
+ const struct req_msg_field *field,
+ enum req_location loc);
+int req_capsule_msg_size(struct req_capsule *pill, enum req_location loc);
+int req_capsule_fmt_size(__u32 magic, const struct req_format *fmt,
+ enum req_location loc);
+void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt);
+
+int req_capsule_has_field(const struct req_capsule *pill,
+ const struct req_msg_field *field,
+ enum req_location loc);
+int req_capsule_field_present(const struct req_capsule *pill,
+ const struct req_msg_field *field,
+ enum req_location loc);
+void req_capsule_shrink(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ unsigned int newlen,
+ enum req_location loc);
+int req_capsule_server_grow(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ unsigned int newlen);
+int req_layout_init(void);
+void req_layout_fini(void);
+
+/* __REQ_LAYOUT_USER__ */
+#endif
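+
+/*
+ * Typical request-handler pattern, as a sketch (illustrative only; the
+ * flow is a simplified example, rq_pill being the capsule embedded in
+ * struct ptlrpc_request, and RQF_MDS_GETATTR/RMF_MDT_BODY being the
+ * declarations found below):
+ *
+ *	req_capsule_set(&req->rq_pill, &RQF_MDS_GETATTR);
+ *	body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY);
+ *	if (body == NULL)
+ *		return -EFAULT;
+ *	rc = req_capsule_server_pack(&req->rq_pill);
+ */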
+
+extern struct req_format RQF_OBD_PING;
+extern struct req_format RQF_OBD_SET_INFO;
+extern struct req_format RQF_SEC_CTX;
+extern struct req_format RQF_OBD_IDX_READ;
+/* MGS req_format */
+extern struct req_format RQF_MGS_TARGET_REG;
+extern struct req_format RQF_MGS_SET_INFO;
+extern struct req_format RQF_MGS_CONFIG_READ;
+/* fid/fld req_format */
+extern struct req_format RQF_SEQ_QUERY;
+extern struct req_format RQF_FLD_QUERY;
+/* MDS req_format */
+extern struct req_format RQF_MDS_CONNECT;
+extern struct req_format RQF_MDS_DISCONNECT;
+extern struct req_format RQF_MDS_STATFS;
+extern struct req_format RQF_MDS_GETSTATUS;
+extern struct req_format RQF_MDS_SYNC;
+extern struct req_format RQF_MDS_GETXATTR;
+extern struct req_format RQF_MDS_GETATTR;
+extern struct req_format RQF_UPDATE_OBJ;
+
+/*
+ * This is format of direct (non-intent) MDS_GETATTR_NAME request.
+ */
+extern struct req_format RQF_MDS_GETATTR_NAME;
+extern struct req_format RQF_MDS_CLOSE;
+extern struct req_format RQF_MDS_PIN;
+extern struct req_format RQF_MDS_UNPIN;
+extern struct req_format RQF_MDS_CONNECT;
+extern struct req_format RQF_MDS_DISCONNECT;
+extern struct req_format RQF_MDS_GET_INFO;
+extern struct req_format RQF_MDS_READPAGE;
+extern struct req_format RQF_MDS_WRITEPAGE;
+extern struct req_format RQF_MDS_IS_SUBDIR;
+extern struct req_format RQF_MDS_DONE_WRITING;
+extern struct req_format RQF_MDS_REINT;
+extern struct req_format RQF_MDS_REINT_CREATE;
+extern struct req_format RQF_MDS_REINT_CREATE_RMT_ACL;
+extern struct req_format RQF_MDS_REINT_CREATE_SLAVE;
+extern struct req_format RQF_MDS_REINT_CREATE_SYM;
+extern struct req_format RQF_MDS_REINT_OPEN;
+extern struct req_format RQF_MDS_REINT_UNLINK;
+extern struct req_format RQF_MDS_REINT_LINK;
+extern struct req_format RQF_MDS_REINT_RENAME;
+extern struct req_format RQF_MDS_REINT_SETATTR;
+extern struct req_format RQF_MDS_REINT_SETXATTR;
+extern struct req_format RQF_MDS_QUOTACHECK;
+extern struct req_format RQF_MDS_QUOTACTL;
+extern struct req_format RQF_QC_CALLBACK;
+extern struct req_format RQF_QUOTA_DQACQ;
+extern struct req_format RQF_MDS_SWAP_LAYOUTS;
+/* MDS hsm formats */
+extern struct req_format RQF_MDS_HSM_STATE_GET;
+extern struct req_format RQF_MDS_HSM_STATE_SET;
+extern struct req_format RQF_MDS_HSM_ACTION;
+extern struct req_format RQF_MDS_HSM_PROGRESS;
+extern struct req_format RQF_MDS_HSM_CT_REGISTER;
+extern struct req_format RQF_MDS_HSM_CT_UNREGISTER;
+extern struct req_format RQF_MDS_HSM_REQUEST;
+/* OST req_format */
+extern struct req_format RQF_OST_CONNECT;
+extern struct req_format RQF_OST_DISCONNECT;
+extern struct req_format RQF_OST_QUOTACHECK;
+extern struct req_format RQF_OST_QUOTACTL;
+extern struct req_format RQF_OST_GETATTR;
+extern struct req_format RQF_OST_SETATTR;
+extern struct req_format RQF_OST_CREATE;
+extern struct req_format RQF_OST_PUNCH;
+extern struct req_format RQF_OST_SYNC;
+extern struct req_format RQF_OST_DESTROY;
+extern struct req_format RQF_OST_BRW_READ;
+extern struct req_format RQF_OST_BRW_WRITE;
+extern struct req_format RQF_OST_STATFS;
+extern struct req_format RQF_OST_SET_GRANT_INFO;
+extern struct req_format RQF_OST_GET_INFO_GENERIC;
+extern struct req_format RQF_OST_GET_INFO_LAST_ID;
+extern struct req_format RQF_OST_GET_INFO_LAST_FID;
+extern struct req_format RQF_OST_SET_INFO_LAST_FID;
+extern struct req_format RQF_OST_GET_INFO_FIEMAP;
+
+/* LDLM req_format */
+extern struct req_format RQF_LDLM_ENQUEUE;
+extern struct req_format RQF_LDLM_ENQUEUE_LVB;
+extern struct req_format RQF_LDLM_CONVERT;
+extern struct req_format RQF_LDLM_INTENT;
+extern struct req_format RQF_LDLM_INTENT_BASIC;
+extern struct req_format RQF_LDLM_INTENT_LAYOUT;
+extern struct req_format RQF_LDLM_INTENT_GETATTR;
+extern struct req_format RQF_LDLM_INTENT_OPEN;
+extern struct req_format RQF_LDLM_INTENT_CREATE;
+extern struct req_format RQF_LDLM_INTENT_UNLINK;
+extern struct req_format RQF_LDLM_INTENT_QUOTA;
+extern struct req_format RQF_LDLM_CANCEL;
+extern struct req_format RQF_LDLM_CALLBACK;
+extern struct req_format RQF_LDLM_CP_CALLBACK;
+extern struct req_format RQF_LDLM_BL_CALLBACK;
+extern struct req_format RQF_LDLM_GL_CALLBACK;
+extern struct req_format RQF_LDLM_GL_DESC_CALLBACK;
+/* LOG req_format */
+extern struct req_format RQF_LOG_CANCEL;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER;
+extern struct req_format RQF_LLOG_ORIGIN_CONNECT;
+
+extern struct req_msg_field RMF_GENERIC_DATA;
+extern struct req_msg_field RMF_PTLRPC_BODY;
+extern struct req_msg_field RMF_MDT_BODY;
+extern struct req_msg_field RMF_MDT_EPOCH;
+extern struct req_msg_field RMF_OBD_STATFS;
+extern struct req_msg_field RMF_NAME;
+extern struct req_msg_field RMF_SYMTGT;
+extern struct req_msg_field RMF_TGTUUID;
+extern struct req_msg_field RMF_CLUUID;
+extern struct req_msg_field RMF_SETINFO_VAL;
+extern struct req_msg_field RMF_SETINFO_KEY;
+extern struct req_msg_field RMF_GETINFO_VAL;
+extern struct req_msg_field RMF_GETINFO_VALLEN;
+extern struct req_msg_field RMF_GETINFO_KEY;
+extern struct req_msg_field RMF_IDX_INFO;
+
+/*
+ * connection handle received in MDS_CONNECT request.
+ */
+extern struct req_msg_field RMF_CONN;
+extern struct req_msg_field RMF_CONNECT_DATA;
+extern struct req_msg_field RMF_DLM_REQ;
+extern struct req_msg_field RMF_DLM_REP;
+extern struct req_msg_field RMF_DLM_LVB;
+extern struct req_msg_field RMF_DLM_GL_DESC;
+extern struct req_msg_field RMF_LDLM_INTENT;
+extern struct req_msg_field RMF_LAYOUT_INTENT;
+extern struct req_msg_field RMF_MDT_MD;
+extern struct req_msg_field RMF_REC_REINT;
+extern struct req_msg_field RMF_EADATA;
+extern struct req_msg_field RMF_ACL;
+extern struct req_msg_field RMF_LOGCOOKIES;
+extern struct req_msg_field RMF_CAPA1;
+extern struct req_msg_field RMF_CAPA2;
+extern struct req_msg_field RMF_OBD_QUOTACHECK;
+extern struct req_msg_field RMF_OBD_QUOTACTL;
+extern struct req_msg_field RMF_QUOTA_BODY;
+extern struct req_msg_field RMF_STRING;
+extern struct req_msg_field RMF_SWAP_LAYOUTS;
+extern struct req_msg_field RMF_MDS_HSM_PROGRESS;
+extern struct req_msg_field RMF_MDS_HSM_REQUEST;
+extern struct req_msg_field RMF_MDS_HSM_USER_ITEM;
+extern struct req_msg_field RMF_MDS_HSM_ARCHIVE;
+extern struct req_msg_field RMF_HSM_USER_STATE;
+extern struct req_msg_field RMF_HSM_STATE_SET;
+extern struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION;
+extern struct req_msg_field RMF_MDS_HSM_REQUEST;
+
+/* seq-mgr fields */
+extern struct req_msg_field RMF_SEQ_OPC;
+extern struct req_msg_field RMF_SEQ_RANGE;
+extern struct req_msg_field RMF_FID_SPACE;
+
+/* FLD fields */
+extern struct req_msg_field RMF_FLD_OPC;
+extern struct req_msg_field RMF_FLD_MDFLD;
+
+extern struct req_msg_field RMF_LLOGD_BODY;
+extern struct req_msg_field RMF_LLOG_LOG_HDR;
+extern struct req_msg_field RMF_LLOGD_CONN_BODY;
+
+extern struct req_msg_field RMF_MGS_TARGET_INFO;
+extern struct req_msg_field RMF_MGS_SEND_PARAM;
+
+extern struct req_msg_field RMF_OST_BODY;
+extern struct req_msg_field RMF_OBD_IOOBJ;
+extern struct req_msg_field RMF_OBD_ID;
+extern struct req_msg_field RMF_FID;
+extern struct req_msg_field RMF_NIOBUF_REMOTE;
+extern struct req_msg_field RMF_RCS;
+extern struct req_msg_field RMF_FIEMAP_KEY;
+extern struct req_msg_field RMF_FIEMAP_VAL;
+extern struct req_msg_field RMF_OST_ID;
+
+/* MGS config read message format */
+extern struct req_msg_field RMF_MGS_CONFIG_BODY;
+extern struct req_msg_field RMF_MGS_CONFIG_RES;
+
+/* generic uint32 */
+extern struct req_msg_field RMF_U32;
+
+/* OBJ update format */
+extern struct req_msg_field RMF_UPDATE;
+extern struct req_msg_field RMF_UPDATE_REPLY;
+/** @} req_layout */
+
+#endif /* _LUSTRE_REQ_LAYOUT_H__ */
diff --git a/drivers/staging/lustre/lustre/include/lustre_sec.h b/drivers/staging/lustre/lustre/include/lustre_sec.h
new file mode 100644
index 000000000000..9e0908e1c4d6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_sec.h
@@ -0,0 +1,1145 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_SEC_H_
+#define _LUSTRE_SEC_H_
+
+/** \defgroup sptlrpc sptlrpc
+ *
+ * @{
+ */
+
+/*
+ * forward declarations, to avoid including other headers
+ */
+struct obd_import;
+struct obd_export;
+struct ptlrpc_request;
+struct ptlrpc_reply_state;
+struct ptlrpc_bulk_desc;
+struct brw_page;
+/* Linux specific */
+struct key;
+struct seq_file;
+
+/*
+ * forward declaration
+ */
+struct ptlrpc_sec_policy;
+struct ptlrpc_sec_cops;
+struct ptlrpc_sec_sops;
+struct ptlrpc_sec;
+struct ptlrpc_svc_ctx;
+struct ptlrpc_cli_ctx;
+struct ptlrpc_ctx_ops;
+
+/**
+ * \addtogroup flavor flavor
+ *
+ * An RPC flavor is represented by a 32-bit integer. The high 12 bits are
+ * currently unused and must be set to 0 to allow future expansion.
+ * <pre>
+ * ------------------------------------------------------------------------
+ * | 4b (bulk svc) | 4b (bulk type) | 4b (svc) | 4b (mech) | 4b (policy) |
+ * ------------------------------------------------------------------------
+ * </pre>
+ *
+ * @{
+ */
+
+/*
+ * flavor constants
+ */
+enum sptlrpc_policy {
+ SPTLRPC_POLICY_NULL = 0,
+ SPTLRPC_POLICY_PLAIN = 1,
+ SPTLRPC_POLICY_GSS = 2,
+ SPTLRPC_POLICY_MAX,
+};
+
+enum sptlrpc_mech_null {
+ SPTLRPC_MECH_NULL = 0,
+ SPTLRPC_MECH_NULL_MAX,
+};
+
+enum sptlrpc_mech_plain {
+ SPTLRPC_MECH_PLAIN = 0,
+ SPTLRPC_MECH_PLAIN_MAX,
+};
+
+enum sptlrpc_mech_gss {
+ SPTLRPC_MECH_GSS_NULL = 0,
+ SPTLRPC_MECH_GSS_KRB5 = 1,
+ SPTLRPC_MECH_GSS_MAX,
+};
+
+enum sptlrpc_service_type {
+ SPTLRPC_SVC_NULL = 0, /**< no security */
+ SPTLRPC_SVC_AUTH = 1, /**< authentication only */
+ SPTLRPC_SVC_INTG = 2, /**< integrity */
+ SPTLRPC_SVC_PRIV = 3, /**< privacy */
+ SPTLRPC_SVC_MAX,
+};
+
+enum sptlrpc_bulk_type {
+ SPTLRPC_BULK_DEFAULT = 0, /**< follow rpc flavor */
+ SPTLRPC_BULK_HASH = 1, /**< hash integrity */
+ SPTLRPC_BULK_MAX,
+};
+
+enum sptlrpc_bulk_service {
+ SPTLRPC_BULK_SVC_NULL = 0, /**< no security */
+ SPTLRPC_BULK_SVC_AUTH = 1, /**< authentication only */
+ SPTLRPC_BULK_SVC_INTG = 2, /**< integrity */
+ SPTLRPC_BULK_SVC_PRIV = 3, /**< privacy */
+ SPTLRPC_BULK_SVC_MAX,
+};
+
+/*
+ * compose/extract macros
+ */
+#define FLVR_POLICY_OFFSET (0)
+#define FLVR_MECH_OFFSET (4)
+#define FLVR_SVC_OFFSET (8)
+#define FLVR_BULK_TYPE_OFFSET (12)
+#define FLVR_BULK_SVC_OFFSET (16)
+
+#define MAKE_FLVR(policy, mech, svc, btype, bsvc) \
+ (((__u32)(policy) << FLVR_POLICY_OFFSET) | \
+ ((__u32)(mech) << FLVR_MECH_OFFSET) | \
+ ((__u32)(svc) << FLVR_SVC_OFFSET) | \
+ ((__u32)(btype) << FLVR_BULK_TYPE_OFFSET) | \
+ ((__u32)(bsvc) << FLVR_BULK_SVC_OFFSET))
+
+/*
+ * extraction
+ */
+#define SPTLRPC_FLVR_POLICY(flavor) \
+ ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_MECH(flavor) \
+ ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_SVC(flavor) \
+ ((((__u32)(flavor)) >> FLVR_SVC_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_BULK_TYPE(flavor) \
+ ((((__u32)(flavor)) >> FLVR_BULK_TYPE_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_BULK_SVC(flavor) \
+ ((((__u32)(flavor)) >> FLVR_BULK_SVC_OFFSET) & 0xF)
+
+#define SPTLRPC_FLVR_BASE(flavor) \
+ ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xFFF)
+#define SPTLRPC_FLVR_BASE_SUB(flavor) \
+ ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xFF)
+
+/*
+ * gss subflavors
+ */
+#define MAKE_BASE_SUBFLVR(mech, svc) \
+ ((__u32)(mech) | \
+ ((__u32)(svc) << (FLVR_SVC_OFFSET - FLVR_MECH_OFFSET)))
+
+#define SPTLRPC_SUBFLVR_KRB5N \
+ MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_NULL)
+#define SPTLRPC_SUBFLVR_KRB5A \
+ MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_AUTH)
+#define SPTLRPC_SUBFLVR_KRB5I \
+ MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_INTG)
+#define SPTLRPC_SUBFLVR_KRB5P \
+ MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_PRIV)
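+
+/*
+ * e.g. SPTLRPC_SUBFLVR_KRB5I packs mech KRB5 (1) in the low nibble and
+ * svc INTG (2) shifted up by (FLVR_SVC_OFFSET - FLVR_MECH_OFFSET) == 4
+ * bits, giving the value 0x21.
+ */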
+
+/*
+ * "end user" flavors
+ */
+#define SPTLRPC_FLVR_NULL \
+ MAKE_FLVR(SPTLRPC_POLICY_NULL, \
+ SPTLRPC_MECH_NULL, \
+ SPTLRPC_SVC_NULL, \
+ SPTLRPC_BULK_DEFAULT, \
+ SPTLRPC_BULK_SVC_NULL)
+#define SPTLRPC_FLVR_PLAIN \
+ MAKE_FLVR(SPTLRPC_POLICY_PLAIN, \
+ SPTLRPC_MECH_PLAIN, \
+ SPTLRPC_SVC_NULL, \
+ SPTLRPC_BULK_HASH, \
+ SPTLRPC_BULK_SVC_INTG)
+#define SPTLRPC_FLVR_KRB5N \
+ MAKE_FLVR(SPTLRPC_POLICY_GSS, \
+ SPTLRPC_MECH_GSS_KRB5, \
+ SPTLRPC_SVC_NULL, \
+ SPTLRPC_BULK_DEFAULT, \
+ SPTLRPC_BULK_SVC_NULL)
+#define SPTLRPC_FLVR_KRB5A \
+ MAKE_FLVR(SPTLRPC_POLICY_GSS, \
+ SPTLRPC_MECH_GSS_KRB5, \
+ SPTLRPC_SVC_AUTH, \
+ SPTLRPC_BULK_DEFAULT, \
+ SPTLRPC_BULK_SVC_NULL)
+#define SPTLRPC_FLVR_KRB5I \
+ MAKE_FLVR(SPTLRPC_POLICY_GSS, \
+ SPTLRPC_MECH_GSS_KRB5, \
+ SPTLRPC_SVC_INTG, \
+ SPTLRPC_BULK_DEFAULT, \
+ SPTLRPC_BULK_SVC_INTG)
+#define SPTLRPC_FLVR_KRB5P \
+ MAKE_FLVR(SPTLRPC_POLICY_GSS, \
+ SPTLRPC_MECH_GSS_KRB5, \
+ SPTLRPC_SVC_PRIV, \
+ SPTLRPC_BULK_DEFAULT, \
+ SPTLRPC_BULK_SVC_PRIV)
+
+#define SPTLRPC_FLVR_DEFAULT SPTLRPC_FLVR_NULL
+
+#define SPTLRPC_FLVR_INVALID ((__u32) 0xFFFFFFFF)
+#define SPTLRPC_FLVR_ANY ((__u32) 0xFFF00000)
+
+/**
+ * extract the useful part from wire flavor
+ */
+#define WIRE_FLVR(wflvr) (((__u32) (wflvr)) & 0x000FFFFF)
+
+/** @} flavor */
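+
+/*
+ * Worked example, following directly from the macros above: krb5i packs
+ * policy GSS (2), mech KRB5 (1), svc INTG (2), default bulk type (0) and
+ * bulk svc INTG (2), so:
+ *
+ *	SPTLRPC_FLVR_KRB5I                        == 0x20212
+ *	SPTLRPC_FLVR_POLICY(SPTLRPC_FLVR_KRB5I)   == SPTLRPC_POLICY_GSS
+ *	SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5I)     == 0x212
+ *	SPTLRPC_FLVR_BASE_SUB(SPTLRPC_FLVR_KRB5I) == SPTLRPC_SUBFLVR_KRB5I
+ */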
+
+static inline void flvr_set_svc(__u32 *flvr, __u32 svc)
+{
+ LASSERT(svc < SPTLRPC_SVC_MAX);
+ *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr),
+ SPTLRPC_FLVR_MECH(*flvr),
+ svc,
+ SPTLRPC_FLVR_BULK_TYPE(*flvr),
+ SPTLRPC_FLVR_BULK_SVC(*flvr));
+}
+
+static inline void flvr_set_bulk_svc(__u32 *flvr, __u32 svc)
+{
+ LASSERT(svc < SPTLRPC_BULK_SVC_MAX);
+ *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr),
+ SPTLRPC_FLVR_MECH(*flvr),
+ SPTLRPC_FLVR_SVC(*flvr),
+ SPTLRPC_FLVR_BULK_TYPE(*flvr),
+ svc);
+}
+
+struct bulk_spec_hash {
+ __u8 hash_alg;
+};
+
+/**
+ * Full description of the flavors being used on a ptlrpc connection,
+ * including both the regular RPC and the bulk transfer parts.
+ */
+struct sptlrpc_flavor {
+ /**
+ * wire flavor, should be renamed to sf_wire.
+ */
+ __u32 sf_rpc;
+ /**
+ * general flags of PTLRPC_SEC_FL_*
+ */
+ __u32 sf_flags;
+ /**
+ * rpc flavor specification
+ */
+ union {
+ /* nothing for now */
+ } u_rpc;
+ /**
+ * bulk flavor specification
+ */
+ union {
+ struct bulk_spec_hash hash;
+ } u_bulk;
+};
+
+/**
+ * Identifies which part of Lustre generated the RPC. It is encoded into
+ * RPC requests and checked by the ptlrpc service.
+ */
+enum lustre_sec_part {
+ LUSTRE_SP_CLI = 0,
+ LUSTRE_SP_MDT,
+ LUSTRE_SP_OST,
+ LUSTRE_SP_MGC,
+ LUSTRE_SP_MGS,
+ LUSTRE_SP_ANY = 0xFF
+};
+
+const char *sptlrpc_part2name(enum lustre_sec_part sp);
+enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd);
+
+/**
+ * A rule specifies a flavor to be used by a ptlrpc connection between
+ * two Lustre parts.
+ */
+struct sptlrpc_rule {
+ __u32 sr_netid; /* LNET network ID */
+ __u8 sr_from; /* sec_part */
+ __u8 sr_to; /* sec_part */
+ __u16 sr_padding;
+ struct sptlrpc_flavor sr_flvr;
+};
+
+/**
+ * A set of rules in memory.
+ *
+ * Rules are generated and stored on MGS, and propagated to MDT, OST,
+ * and client when needed.
+ */
+struct sptlrpc_rule_set {
+ int srs_nslot;
+ int srs_nrule;
+ struct sptlrpc_rule *srs_rules;
+};
+
+int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr);
+int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr);
+
+static inline void sptlrpc_rule_set_init(struct sptlrpc_rule_set *set)
+{
+ memset(set, 0, sizeof(*set));
+}
+
+void sptlrpc_rule_set_free(struct sptlrpc_rule_set *set);
+int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *set);
+int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *set,
+ struct sptlrpc_rule *rule);
+int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset,
+ enum lustre_sec_part from,
+ enum lustre_sec_part to,
+ lnet_nid_t nid,
+ struct sptlrpc_flavor *sf);
+void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *set);
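+
+/*
+ * Typical rule-set lifecycle, as a sketch ("rule" and "nid" are
+ * placeholders; a rule would come from sptlrpc_parse_rule() or the MGS
+ * config log):
+ *
+ *	struct sptlrpc_rule_set rset;
+ *	struct sptlrpc_flavor flvr;
+ *
+ *	sptlrpc_rule_set_init(&rset);
+ *	sptlrpc_rule_set_merge(&rset, &rule);
+ *	sptlrpc_rule_set_choose(&rset, LUSTRE_SP_CLI, LUSTRE_SP_MDT,
+ *				nid, &flvr);
+ *	sptlrpc_rule_set_free(&rset);
+ */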
+
+int sptlrpc_process_config(struct lustre_cfg *lcfg);
+void sptlrpc_conf_log_start(const char *logname);
+void sptlrpc_conf_log_stop(const char *logname);
+void sptlrpc_conf_log_update_begin(const char *logname);
+void sptlrpc_conf_log_update_end(const char *logname);
+void sptlrpc_conf_client_adapt(struct obd_device *obd);
+int sptlrpc_conf_target_get_rules(struct obd_device *obd,
+ struct sptlrpc_rule_set *rset,
+ int initial);
+void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset,
+ enum lustre_sec_part from,
+ lnet_nid_t nid,
+ struct sptlrpc_flavor *flavor);
+
+/* The maximum length of the security payload. 1024 is enough for Kerberos 5
+ * and should be enough for future mechanisms, though this is not guaranteed.
+ * Only used by the pre-allocated request/reply pool.
+ */
+#define SPTLRPC_MAX_PAYLOAD (1024)
+
+struct vfs_cred {
+ uint32_t vc_uid;
+ uint32_t vc_gid;
+};
+
+struct ptlrpc_ctx_ops {
+ /**
+ * To determine whether it's suitable to use the \a ctx for \a vcred.
+ */
+ int (*match) (struct ptlrpc_cli_ctx *ctx,
+ struct vfs_cred *vcred);
+
+ /**
+ * To bring the \a ctx uptodate.
+ */
+ int (*refresh) (struct ptlrpc_cli_ctx *ctx);
+
+ /**
+ * Validate the \a ctx.
+ */
+ int (*validate) (struct ptlrpc_cli_ctx *ctx);
+
+ /**
+ * Force the \a ctx to die.
+ */
+ void (*die) (struct ptlrpc_cli_ctx *ctx,
+ int grace);
+ int (*display) (struct ptlrpc_cli_ctx *ctx,
+ char *buf, int bufsize);
+
+ /**
+ * Sign the request message using \a ctx.
+ *
+	 * \pre req->rq_reqmsg points to the request message.
+	 * \pre req->rq_reqlen is the request message length.
+	 * \post req->rq_reqbuf points to the request message with signature.
+	 * \post req->rq_reqdata_len is set to the final request message size.
+ *
+ * \see null_ctx_sign(), plain_ctx_sign(), gss_cli_ctx_sign().
+ */
+ int (*sign) (struct ptlrpc_cli_ctx *ctx,
+ struct ptlrpc_request *req);
+
+ /**
+ * Verify the reply message using \a ctx.
+ *
+	 * \pre req->rq_repdata points to the reply message with signature.
+	 * \pre req->rq_repdata_len is the total reply message length.
+	 * \post req->rq_repmsg points to the reply message without signature.
+	 * \post req->rq_replen is the reply message length.
+ *
+ * \see null_ctx_verify(), plain_ctx_verify(), gss_cli_ctx_verify().
+ */
+ int (*verify) (struct ptlrpc_cli_ctx *ctx,
+ struct ptlrpc_request *req);
+
+ /**
+ * Encrypt the request message using \a ctx.
+ *
+	 * \pre req->rq_reqmsg points to the request message in clear text.
+	 * \pre req->rq_reqlen is the request message length.
+	 * \post req->rq_reqbuf points to the request message.
+	 * \post req->rq_reqdata_len is set to the final request message size.
+ *
+ * \see gss_cli_ctx_seal().
+ */
+ int (*seal) (struct ptlrpc_cli_ctx *ctx,
+ struct ptlrpc_request *req);
+
+ /**
+ * Decrypt the reply message using \a ctx.
+ *
+	 * \pre req->rq_repdata points to the encrypted reply message.
+	 * \pre req->rq_repdata_len is the total cipher text length.
+	 * \post req->rq_repmsg points to the reply message in clear text.
+	 * \post req->rq_replen is the reply message length in clear text.
+ *
+ * \see gss_cli_ctx_unseal().
+ */
+ int (*unseal) (struct ptlrpc_cli_ctx *ctx,
+ struct ptlrpc_request *req);
+
+ /**
+ * Wrap bulk request data. This is called before wrapping RPC
+ * request message.
+ *
+	 * \pre the bulk buffer is described by desc->bd_iov and
+	 * desc->bd_iov_count. Note that for read it is just a buffer and no
+	 * data needs to be sent; for write it contains data in clear text.
+	 * \post when necessary, a ptlrpc_bulk_sec_desc has been properly
+	 * prepared (usually inside the RPC request message).
+	 * - encryption: the cipher text bulk buffer is described by
+	 *   desc->bd_enc_iov and desc->bd_iov_count (currently the iov
+	 *   count is assumed to remain the same).
+	 * - otherwise: the bulk buffer is still desc->bd_iov and
+	 *   desc->bd_iov_count.
+ *
+ * \return 0: success.
+ * \return -ev: error code.
+ *
+ * \see plain_cli_wrap_bulk(), gss_cli_ctx_wrap_bulk().
+ */
+ int (*wrap_bulk) (struct ptlrpc_cli_ctx *ctx,
+ struct ptlrpc_request *req,
+ struct ptlrpc_bulk_desc *desc);
+
+ /**
+	 * Unwrap bulk reply data. This is called after unwrapping the RPC
+	 * reply message.
+	 *
+	 * \pre the bulk buffer is described by desc->bd_iov/desc->bd_enc_iov
+	 * and desc->bd_iov_count, according to wrap_bulk().
+	 * \post the final bulk data in clear text is placed in the buffer
+	 * described by desc->bd_iov and desc->bd_iov_count.
+ * \return +ve nob of actual bulk data in clear text.
+ * \return -ve error code.
+ *
+ * \see plain_cli_unwrap_bulk(), gss_cli_ctx_unwrap_bulk().
+ */
+ int (*unwrap_bulk) (struct ptlrpc_cli_ctx *ctx,
+ struct ptlrpc_request *req,
+ struct ptlrpc_bulk_desc *desc);
+};
+
+#define PTLRPC_CTX_NEW_BIT (0) /* newly created */
+#define PTLRPC_CTX_UPTODATE_BIT (1) /* uptodate */
+#define PTLRPC_CTX_DEAD_BIT (2) /* mark expired gracefully */
+#define PTLRPC_CTX_ERROR_BIT (3) /* fatal error (refresh, etc.) */
+#define PTLRPC_CTX_CACHED_BIT (8) /* in ctx cache (hash etc.) */
+#define PTLRPC_CTX_ETERNAL_BIT (9) /* always valid */
+
+#define PTLRPC_CTX_NEW (1 << PTLRPC_CTX_NEW_BIT)
+#define PTLRPC_CTX_UPTODATE (1 << PTLRPC_CTX_UPTODATE_BIT)
+#define PTLRPC_CTX_DEAD (1 << PTLRPC_CTX_DEAD_BIT)
+#define PTLRPC_CTX_ERROR (1 << PTLRPC_CTX_ERROR_BIT)
+#define PTLRPC_CTX_CACHED (1 << PTLRPC_CTX_CACHED_BIT)
+#define PTLRPC_CTX_ETERNAL (1 << PTLRPC_CTX_ETERNAL_BIT)
+
+#define PTLRPC_CTX_STATUS_MASK (PTLRPC_CTX_UPTODATE | \
+				 PTLRPC_CTX_DEAD | \
+				 PTLRPC_CTX_ERROR)
+
+struct ptlrpc_cli_ctx {
+ struct hlist_node cc_cache; /* linked into ctx cache */
+ atomic_t cc_refcount;
+ struct ptlrpc_sec *cc_sec;
+ struct ptlrpc_ctx_ops *cc_ops;
+ cfs_time_t cc_expire; /* in seconds */
+ unsigned int cc_early_expire:1;
+ unsigned long cc_flags;
+ struct vfs_cred cc_vcred;
+ spinlock_t cc_lock;
+ struct list_head cc_req_list; /* waiting reqs linked here */
+ struct list_head cc_gc_chain; /* linked to gc chain */
+};
+
+/**
+ * client side policy operation vector.
+ */
+struct ptlrpc_sec_cops {
+ /**
+ * Given an \a imp, create and initialize a ptlrpc_sec structure.
+ * \param ctx service context:
+ * - regular import: \a ctx should be NULL;
+ * - reverse import: \a ctx is obtained from incoming request.
+ * \param flavor specify what flavor to use.
+ *
+ * When necessary, policy module is responsible for taking reference
+ * on the import.
+ *
+ * \see null_create_sec(), plain_create_sec(), gss_sec_create_kr().
+ */
+ struct ptlrpc_sec * (*create_sec) (struct obd_import *imp,
+ struct ptlrpc_svc_ctx *ctx,
+ struct sptlrpc_flavor *flavor);
+
+ /**
+	 * Destructor of ptlrpc_sec. When called, the refcount has dropped
+	 * to 0 and all contexts have been destroyed.
+ *
+ * \see null_destroy_sec(), plain_destroy_sec(), gss_sec_destroy_kr().
+ */
+ void (*destroy_sec) (struct ptlrpc_sec *sec);
+
+ /**
+	 * Notify that this ptlrpc_sec is going to die. Optionally, the
+	 * policy module sets sec->ps_dying and takes whatever actions are
+	 * necessary.
+ *
+ * \see plain_kill_sec(), gss_sec_kill().
+ */
+ void (*kill_sec) (struct ptlrpc_sec *sec);
+
+ /**
+ * Given \a vcred, lookup and/or create its context. The policy module
+ * is supposed to maintain its own context cache.
+	 * XXX currently \a create and \a remove_dead are always 1; perhaps
+	 * they should be removed completely.
+ *
+ * \see null_lookup_ctx(), plain_lookup_ctx(), gss_sec_lookup_ctx_kr().
+ */
+ struct ptlrpc_cli_ctx * (*lookup_ctx) (struct ptlrpc_sec *sec,
+ struct vfs_cred *vcred,
+ int create,
+ int remove_dead);
+
+ /**
+	 * Called when the reference count of \a ctx drops to 0. The policy
+	 * module is supposed to destroy this context, or do whatever else is
+	 * appropriate for its cache maintenance mechanism.
+ *
+	 * \param sync if zero, we need not wait for the context to be
+	 * destroyed completely.
+ *
+ * \see plain_release_ctx(), gss_sec_release_ctx_kr().
+ */
+ void (*release_ctx) (struct ptlrpc_sec *sec,
+ struct ptlrpc_cli_ctx *ctx,
+ int sync);
+
+ /**
+ * Flush the context cache.
+ *
+	 * \param uid flush only contexts of this user; -1 means all contexts.
+ * \param grace if zero, the PTLRPC_CTX_UPTODATE_BIT of affected
+ * contexts should be cleared immediately.
+ * \param force if zero, only idle contexts will be flushed.
+ *
+ * \see plain_flush_ctx_cache(), gss_sec_flush_ctx_cache_kr().
+ */
+ int (*flush_ctx_cache)
+ (struct ptlrpc_sec *sec,
+ uid_t uid,
+ int grace,
+ int force);
+
+ /**
+ * Called periodically by garbage collector to remove dead contexts
+ * from cache.
+ *
+ * \see gss_sec_gc_ctx_kr().
+ */
+ void (*gc_ctx) (struct ptlrpc_sec *sec);
+
+ /**
+	 * Given a context \a ctx, install a corresponding reverse service
+	 * context on the client side.
+	 * XXX currently it is only used by the GSS module; maybe we should
+	 * remove this from the general API.
+ */
+ int (*install_rctx)(struct obd_import *imp,
+ struct ptlrpc_sec *sec,
+ struct ptlrpc_cli_ctx *ctx);
+
+ /**
+ * To allocate request buffer for \a req.
+ *
+ * \pre req->rq_reqmsg == NULL.
+	 * \pre req->rq_reqbuf == NULL, or it was pre-allocated and we are
+	 * not supposed to free it.
+	 * \post on success, req->rq_reqmsg points to a buffer of size at
+	 * least \a lustre_msg_size.
+ *
+ * \see null_alloc_reqbuf(), plain_alloc_reqbuf(), gss_alloc_reqbuf().
+ */
+ int (*alloc_reqbuf)(struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req,
+ int lustre_msg_size);
+
+ /**
+ * To free request buffer for \a req.
+ *
+ * \pre req->rq_reqbuf != NULL.
+ *
+ * \see null_free_reqbuf(), plain_free_reqbuf(), gss_free_reqbuf().
+ */
+ void (*free_reqbuf) (struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req);
+
+ /**
+ * To allocate reply buffer for \a req.
+ *
+ * \pre req->rq_repbuf == NULL.
+	 * \post on success, req->rq_repbuf points to a buffer of size
+	 * req->rq_repbuf_len, which must be large enough to receive a reply
+	 * transformed from \a lustre_msg_size bytes of clear text.
+ *
+ * \see null_alloc_repbuf(), plain_alloc_repbuf(), gss_alloc_repbuf().
+ */
+ int (*alloc_repbuf)(struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req,
+ int lustre_msg_size);
+
+ /**
+ * To free reply buffer for \a req.
+ *
+ * \pre req->rq_repbuf != NULL.
+ * \post req->rq_repbuf == NULL.
+ * \post req->rq_repbuf_len == 0.
+ *
+ * \see null_free_repbuf(), plain_free_repbuf(), gss_free_repbuf().
+ */
+ void (*free_repbuf) (struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req);
+
+ /**
+	 * To expand the request buffer of \a req, so that the \a segment in
+	 * the request message pointed to by req->rq_reqmsg can accommodate
+	 * at least \a newsize bytes of data.
+ *
+ * \pre req->rq_reqmsg->lm_buflens[segment] < newsize.
+ *
+ * \see null_enlarge_reqbuf(), plain_enlarge_reqbuf(),
+ * gss_enlarge_reqbuf().
+ */
+ int (*enlarge_reqbuf)
+ (struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req,
+ int segment, int newsize);
+ /*
+ * misc
+ */
+ int (*display) (struct ptlrpc_sec *sec,
+ struct seq_file *seq);
+};
+
+/**
+ * server side policy operation vector.
+ */
+struct ptlrpc_sec_sops {
+ /**
+	 * Verify an incoming request.
+	 *
+	 * \pre the request message is pointed to by req->rq_reqbuf, its size
+	 * is req->rq_reqdata_len, and the message has been unpacked to
+	 * host byte order.
+	 *
+	 * \retval SECSVC_OK success; req->rq_reqmsg points to the request
+	 * message in clear text, its size is req->rq_reqlen; req->rq_svc_ctx
+	 * is set; req->rq_sp_from is decoded from the request.
+	 * \retval SECSVC_COMPLETE success; the request has been fully
+	 * processed, and the reply message has been prepared; req->rq_sp_from
+	 * is decoded from the request.
+ * \retval SECSVC_DROP failed, this request should be dropped.
+ *
+ * \see null_accept(), plain_accept(), gss_svc_accept_kr().
+ */
+ int (*accept) (struct ptlrpc_request *req);
+
+ /**
+ * Perform security transformation upon reply message.
+ *
+	 * \pre the reply message is pointed to by req->rq_reply_state->rs_msg
+	 * and its size is req->rq_replen.
+	 * \post req->rq_reply_state->rs_repdata_len is the final message size.
+	 * \post req->rq_reply_off is set.
+ *
+ * \see null_authorize(), plain_authorize(), gss_svc_authorize().
+ */
+ int (*authorize) (struct ptlrpc_request *req);
+
+ /**
+ * Invalidate server context \a ctx.
+ *
+ * \see gss_svc_invalidate_ctx().
+ */
+ void (*invalidate_ctx)
+ (struct ptlrpc_svc_ctx *ctx);
+
+ /**
+ * Allocate a ptlrpc_reply_state.
+ *
+ * \param msgsize size of the reply message in clear text.
+	 * \pre if req->rq_reply_state != NULL, it was pre-allocated and we
+	 * should simply use it; otherwise we are responsible for allocating
+	 * a new one.
+ * \post req->rq_reply_state != NULL;
+ * \post req->rq_reply_state->rs_msg != NULL;
+ *
+ * \see null_alloc_rs(), plain_alloc_rs(), gss_svc_alloc_rs().
+ */
+ int (*alloc_rs) (struct ptlrpc_request *req,
+ int msgsize);
+
+ /**
+ * Free a ptlrpc_reply_state.
+ */
+ void (*free_rs) (struct ptlrpc_reply_state *rs);
+
+ /**
+ * Release the server context \a ctx.
+ *
+ * \see gss_svc_free_ctx().
+ */
+ void (*free_ctx) (struct ptlrpc_svc_ctx *ctx);
+
+ /**
+ * Install a reverse context based on the server context \a ctx.
+ *
+ * \see gss_svc_install_rctx_kr().
+ */
+ int (*install_rctx)(struct obd_import *imp,
+ struct ptlrpc_svc_ctx *ctx);
+
+ /**
+ * Prepare buffer for incoming bulk write.
+ *
+	 * \pre desc->bd_iov and desc->bd_iov_count describe the buffer
+ * intended to receive the write.
+ *
+ * \see gss_svc_prep_bulk().
+ */
+ int (*prep_bulk) (struct ptlrpc_request *req,
+ struct ptlrpc_bulk_desc *desc);
+
+ /**
+ * Unwrap the bulk write data.
+ *
+ * \see plain_svc_unwrap_bulk(), gss_svc_unwrap_bulk().
+ */
+ int (*unwrap_bulk) (struct ptlrpc_request *req,
+ struct ptlrpc_bulk_desc *desc);
+
+ /**
+ * Wrap the bulk read data.
+ *
+ * \see plain_svc_wrap_bulk(), gss_svc_wrap_bulk().
+ */
+ int (*wrap_bulk) (struct ptlrpc_request *req,
+ struct ptlrpc_bulk_desc *desc);
+};
+
+struct ptlrpc_sec_policy {
+ module_t *sp_owner;
+ char *sp_name;
+ __u16 sp_policy; /* policy number */
+ struct ptlrpc_sec_cops *sp_cops; /* client ops */
+ struct ptlrpc_sec_sops *sp_sops; /* server ops */
+};
+
+#define PTLRPC_SEC_FL_REVERSE 0x0001 /* reverse sec */
+#define PTLRPC_SEC_FL_ROOTONLY 0x0002 /* treat everyone as root */
+#define PTLRPC_SEC_FL_UDESC 0x0004 /* ship udesc */
+#define PTLRPC_SEC_FL_BULK 0x0008 /* intensive bulk i/o expected */
+#define PTLRPC_SEC_FL_PAG 0x0010 /* PAG mode */
+
+/**
+ * The ptlrpc_sec represents the client-side ptlrpc security facilities;
+ * each obd_import (both regular and reverse) must be associated with
+ * a ptlrpc_sec.
+ *
+ * \see sptlrpc_import_sec_adapt().
+ */
+struct ptlrpc_sec {
+ struct ptlrpc_sec_policy *ps_policy;
+ atomic_t ps_refcount;
+	/** statistics only */
+ atomic_t ps_nctx;
+ /** unique identifier */
+ int ps_id;
+ struct sptlrpc_flavor ps_flvr;
+ enum lustre_sec_part ps_part;
+ /** after set, no more new context will be created */
+ unsigned int ps_dying:1;
+ /** owning import */
+ struct obd_import *ps_import;
+ spinlock_t ps_lock;
+
+ /*
+ * garbage collection
+ */
+ struct list_head ps_gc_list;
+ cfs_time_t ps_gc_interval; /* in seconds */
+ cfs_time_t ps_gc_next; /* in seconds */
+};
+
+static inline int sec_is_reverse(struct ptlrpc_sec *sec)
+{
+ return (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE);
+}
+
+static inline int sec_is_rootonly(struct ptlrpc_sec *sec)
+{
+ return (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_ROOTONLY);
+}
+
+struct ptlrpc_svc_ctx {
+ atomic_t sc_refcount;
+ struct ptlrpc_sec_policy *sc_policy;
+};
+
+/*
+ * user identity descriptor
+ */
+#define LUSTRE_MAX_GROUPS (128)
+
+struct ptlrpc_user_desc {
+ __u32 pud_uid;
+ __u32 pud_gid;
+ __u32 pud_fsuid;
+ __u32 pud_fsgid;
+ __u32 pud_cap;
+ __u32 pud_ngroups;
+ __u32 pud_groups[0];
+};
+
+/*
+ * bulk flavors
+ */
+enum sptlrpc_bulk_hash_alg {
+ BULK_HASH_ALG_NULL = 0,
+ BULK_HASH_ALG_ADLER32,
+ BULK_HASH_ALG_CRC32,
+ BULK_HASH_ALG_MD5,
+ BULK_HASH_ALG_SHA1,
+ BULK_HASH_ALG_SHA256,
+ BULK_HASH_ALG_SHA384,
+ BULK_HASH_ALG_SHA512,
+ BULK_HASH_ALG_MAX
+};
+
+const char *sptlrpc_get_hash_name(__u8 hash_alg);
+__u8 sptlrpc_get_hash_alg(const char *algname);
+
+enum {
+ BSD_FL_ERR = 1,
+};
+
+struct ptlrpc_bulk_sec_desc {
+ __u8 bsd_version; /* 0 */
+ __u8 bsd_type; /* SPTLRPC_BULK_XXX */
+ __u8 bsd_svc; /* SPTLRPC_BULK_SVC_XXXX */
+ __u8 bsd_flags; /* flags */
+ __u32 bsd_nob; /* nob of bulk data */
+ __u8 bsd_data[0]; /* policy-specific token */
+};
+
+/*
+ * lprocfs
+ */
+struct proc_dir_entry;
+extern struct proc_dir_entry *sptlrpc_proc_root;
+
+/*
+ * round size up to the next power of 2, for slab allocation.
+ * \a size must be sane (must not overflow after rounding up)
+ */
+static inline int size_roundup_power2(int size)
+{
+ size--;
+ size |= size >> 1;
+ size |= size >> 2;
+ size |= size >> 4;
+ size |= size >> 8;
+ size |= size >> 16;
+ size++;
+ return size;
+}
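+
+/*
+ * The shift-or cascade above propagates the highest set bit into every
+ * lower position, e.g. size_roundup_power2(1000): 999 -> 0x3ff -> 1024.
+ * An exact power of 2 is returned unchanged thanks to the initial size--.
+ */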
+
+/*
+ * internal support libraries
+ */
+void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg,
+ int segment, int newsize);
+
+/*
+ * security policies
+ */
+int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy);
+int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy);
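+
+/*
+ * A policy module fills in a ptlrpc_sec_policy and registers it at module
+ * load time; hypothetical sketch modelled on the null policy (the ops
+ * tables are placeholders):
+ *
+ *	static struct ptlrpc_sec_policy null_policy = {
+ *		.sp_owner  = THIS_MODULE,
+ *		.sp_name   = "sec.null",
+ *		.sp_policy = SPTLRPC_POLICY_NULL,
+ *		.sp_cops   = &null_sec_cops,
+ *		.sp_sops   = &null_sec_sops,
+ *	};
+ *
+ *	rc = sptlrpc_register_policy(&null_policy);
+ */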
+
+__u32 sptlrpc_name2flavor_base(const char *name);
+const char *sptlrpc_flavor2name_base(__u32 flvr);
+char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf,
+ char *buf, int bufsize);
+char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize);
+char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize);
+
+static inline
+struct ptlrpc_sec_policy *sptlrpc_policy_get(struct ptlrpc_sec_policy *policy)
+{
+ __module_get(policy->sp_owner);
+ return policy;
+}
+
+static inline
+void sptlrpc_policy_put(struct ptlrpc_sec_policy *policy)
+{
+ module_put(policy->sp_owner);
+}
+
+/*
+ * client credential
+ */
+static inline
+unsigned long cli_ctx_status(struct ptlrpc_cli_ctx *ctx)
+{
+ return (ctx->cc_flags & PTLRPC_CTX_STATUS_MASK);
+}
+
+static inline
+int cli_ctx_is_ready(struct ptlrpc_cli_ctx *ctx)
+{
+ return (cli_ctx_status(ctx) == PTLRPC_CTX_UPTODATE);
+}
+
+static inline
+int cli_ctx_is_refreshed(struct ptlrpc_cli_ctx *ctx)
+{
+ return (cli_ctx_status(ctx) != 0);
+}
+
+static inline
+int cli_ctx_is_uptodate(struct ptlrpc_cli_ctx *ctx)
+{
+ return ((ctx->cc_flags & PTLRPC_CTX_UPTODATE) != 0);
+}
+
+static inline
+int cli_ctx_is_error(struct ptlrpc_cli_ctx *ctx)
+{
+ return ((ctx->cc_flags & PTLRPC_CTX_ERROR) != 0);
+}
+
+static inline
+int cli_ctx_is_dead(struct ptlrpc_cli_ctx *ctx)
+{
+ return ((ctx->cc_flags & (PTLRPC_CTX_DEAD | PTLRPC_CTX_ERROR)) != 0);
+}
+
+static inline
+int cli_ctx_is_eternal(struct ptlrpc_cli_ctx *ctx)
+{
+ return ((ctx->cc_flags & PTLRPC_CTX_ETERNAL) != 0);
+}
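+
+/*
+ * Note that a context counts as "refreshed" once any status bit is set:
+ * either the refresh succeeded (UPTODATE) or the context died or hit an
+ * error; callers waiting for a refresh stop sleeping in all three cases.
+ */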
+
+/*
+ * sec get/put
+ */
+struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec);
+void sptlrpc_sec_put(struct ptlrpc_sec *sec);
+
+/*
+ * internal APIs used only by policy implementations
+ */
+int sptlrpc_get_next_secid(void);
+void sptlrpc_sec_destroy(struct ptlrpc_sec *sec);
+
+/*
+ * exported client context api
+ */
+struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx);
+void sptlrpc_cli_ctx_put(struct ptlrpc_cli_ctx *ctx, int sync);
+void sptlrpc_cli_ctx_expire(struct ptlrpc_cli_ctx *ctx);
+void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx);
+int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize);
+
+/*
+ * exported client context wrap/buffers
+ */
+int sptlrpc_cli_wrap_request(struct ptlrpc_request *req);
+int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req);
+int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize);
+void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req);
+int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize);
+void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req);
+int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req,
+ int segment, int newsize);
+int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req,
+ struct ptlrpc_request **req_ret);
+void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req);
+
+void sptlrpc_request_out_callback(struct ptlrpc_request *req);
+
+/*
+ * exported higher interface of import & request
+ */
+int sptlrpc_import_sec_adapt(struct obd_import *imp,
+ struct ptlrpc_svc_ctx *ctx,
+ struct sptlrpc_flavor *flvr);
+struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp);
+void sptlrpc_import_sec_put(struct obd_import *imp);
+
+int sptlrpc_import_check_ctx(struct obd_import *imp);
+void sptlrpc_import_flush_root_ctx(struct obd_import *imp);
+void sptlrpc_import_flush_my_ctx(struct obd_import *imp);
+void sptlrpc_import_flush_all_ctx(struct obd_import *imp);
+int sptlrpc_req_get_ctx(struct ptlrpc_request *req);
+void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync);
+int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout);
+int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req);
+void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode);
+
+int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule);
+
+/* gc */
+void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec);
+void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec);
+void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx);
+
+/* misc */
+const char *sec2target_str(struct ptlrpc_sec *sec);
+int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev);
+
+/*
+ * server side
+ */
+enum secsvc_accept_res {
+ SECSVC_OK = 0,
+ SECSVC_COMPLETE,
+ SECSVC_DROP,
+};
+
+int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req);
+int sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen);
+int sptlrpc_svc_wrap_reply(struct ptlrpc_request *req);
+void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs);
+void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req);
+void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req);
+void sptlrpc_svc_ctx_invalidate(struct ptlrpc_request *req);
+
+int sptlrpc_target_export_check(struct obd_export *exp,
+ struct ptlrpc_request *req);
+void sptlrpc_target_update_exp_flavor(struct obd_device *obd,
+ struct sptlrpc_rule_set *rset);
+
+/*
+ * reverse context
+ */
+int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp,
+ struct ptlrpc_svc_ctx *ctx);
+int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp,
+ struct ptlrpc_cli_ctx *ctx);
+
+/* bulk security api */
+int sptlrpc_enc_pool_add_user(void);
+int sptlrpc_enc_pool_del_user(void);
+int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc);
+void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc);
+
+int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req,
+ struct ptlrpc_bulk_desc *desc);
+int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req,
+ struct ptlrpc_bulk_desc *desc,
+ int nob);
+int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req,
+ struct ptlrpc_bulk_desc *desc);
+
+/* bulk helpers (internal use only by policies) */
+int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg,
+ void *buf, int buflen);
+
+int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed);
+
+/* user descriptor helpers */
+static inline int sptlrpc_user_desc_size(int ngroups)
+{
+ return sizeof(struct ptlrpc_user_desc) + ngroups * sizeof(__u32);
+}
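+
+/*
+ * e.g. with the layout above (six __u32 fields plus the group array),
+ * sptlrpc_user_desc_size(2) == 24 + 2 * sizeof(__u32) == 32 bytes.
+ */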
+
+int sptlrpc_current_user_desc_size(void);
+int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset);
+int sptlrpc_unpack_user_desc(struct lustre_msg *req, int offset, int swabbed);
+
+#define CFS_CAP_CHOWN_MASK (1 << CFS_CAP_CHOWN)
+#define CFS_CAP_SYS_RESOURCE_MASK (1 << CFS_CAP_SYS_RESOURCE)
+
+enum {
+ LUSTRE_SEC_NONE = 0,
+ LUSTRE_SEC_REMOTE = 1,
+ LUSTRE_SEC_SPECIFY = 2,
+ LUSTRE_SEC_ALL = 3
+};
+
+/** @} sptlrpc */
+
+#endif /* _LUSTRE_SEC_H_ */
diff --git a/drivers/staging/lustre/lustre/include/lustre_update.h b/drivers/staging/lustre/lustre/include/lustre_update.h
new file mode 100644
index 000000000000..84defce0f623
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_update.h
@@ -0,0 +1,189 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.htm
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2013, Intel Corporation.
+ */
+/*
+ * lustre/include/lustre_update.h
+ *
+ * Author: Di Wang <di.wang@intel.com>
+ */
+
+#ifndef _LUSTRE_UPDATE_H
+#define _LUSTRE_UPDATE_H
+
+#define UPDATE_BUFFER_SIZE 8192
+struct update_request {
+ struct dt_device *ur_dt;
+ struct list_head ur_list; /* attached itself to thandle */
+ int ur_flags;
+ int ur_rc; /* request result */
+ int ur_batchid; /* Current batch(trans) id */
+ struct update_buf *ur_buf; /* Holding the update req */
+};
+
+static inline unsigned long update_size(struct update *update)
+{
+ unsigned long size;
+ int i;
+
+ size = cfs_size_round(offsetof(struct update, u_bufs[0]));
+ for (i = 0; i < UPDATE_BUF_COUNT; i++)
+ size += cfs_size_round(update->u_lens[i]);
+
+ return size;
+}
+
+static inline void *update_param_buf(struct update *update, int index,
+ int *size)
+{
+ int i;
+ void *ptr;
+
+ if (index >= UPDATE_BUF_COUNT)
+ return NULL;
+
+ ptr = (char *)update + cfs_size_round(offsetof(struct update,
+ u_bufs[0]));
+ for (i = 0; i < index; i++) {
+ LASSERT(update->u_lens[i] > 0);
+ ptr += cfs_size_round(update->u_lens[i]);
+ }
+
+ if (size != NULL)
+ *size = update->u_lens[index];
+
+ return ptr;
+}
+
+static inline unsigned long update_buf_size(struct update_buf *buf)
+{
+ unsigned long size;
+ int i = 0;
+
+ size = cfs_size_round(offsetof(struct update_buf, ub_bufs[0]));
+ for (i = 0; i < buf->ub_count; i++) {
+ struct update *update;
+
+ update = (struct update *)((char *)buf + size);
+ size += update_size(update);
+ }
+ LASSERT(size <= UPDATE_BUFFER_SIZE);
+ return size;
+}
+
+static inline void *update_buf_get(struct update_buf *buf, int index, int *size)
+{
+ int count = buf->ub_count;
+ void *ptr;
+ int i = 0;
+
+ if (index >= count)
+ return NULL;
+
+ ptr = (char *)buf + cfs_size_round(offsetof(struct update_buf,
+ ub_bufs[0]));
+ for (i = 0; i < index; i++)
+ ptr += update_size((struct update *)ptr);
+
+ if (size != NULL)
+ *size = update_size((struct update *)ptr);
+
+ return ptr;
+}
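+
+/*
+ * Layout walked by the helpers above, as a sketch:
+ *
+ *	update_buf: | header (ub_count, ...) | update 0 | update 1 | ...
+ *	update:     | header (u_lens[], ...) | buf 0 | buf 1 | ...
+ *
+ * Every sub-buffer is padded to cfs_size_round(), so update_buf_get()
+ * reaches update \a index by stepping over \a index packed updates.
+ */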
+
+static inline void update_init_reply_buf(struct update_reply *reply, int count)
+{
+ reply->ur_version = UPDATE_REPLY_V1;
+ reply->ur_count = count;
+}
+
+static inline void *update_get_buf_internal(struct update_reply *reply,
+ int index, int *size)
+{
+ char *ptr;
+ int count = reply->ur_count;
+ int i;
+
+ if (index >= count)
+ return NULL;
+
+ ptr = (char *)reply + cfs_size_round(offsetof(struct update_reply,
+ ur_lens[count]));
+ for (i = 0; i < index; i++) {
+ LASSERT(reply->ur_lens[i] > 0);
+ ptr += cfs_size_round(reply->ur_lens[i]);
+ }
+
+ if (size != NULL)
+ *size = reply->ur_lens[index];
+
+ return ptr;
+}
+
+static inline void update_insert_reply(struct update_reply *reply, void *data,
+ int data_len, int index, int rc)
+{
+ char *ptr;
+
+ ptr = update_get_buf_internal(reply, index, NULL);
+ LASSERT(ptr != NULL);
+
+ *(int *)ptr = cpu_to_le32(rc);
+ ptr += sizeof(int);
+ if (data_len > 0) {
+ LASSERT(data != NULL);
+ memcpy(ptr, data, data_len);
+ }
+ reply->ur_lens[index] = data_len + sizeof(int);
+}
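+
+/*
+ * Each reply slot written above is laid out as | __le32 rc | data |, with
+ * ur_lens[index] == sizeof(int) + data_len; the helpers below unpack the
+ * data and the return code respectively.
+ */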
+
+static inline int update_get_reply_buf(struct update_reply *reply, void **buf,
+ int index)
+{
+ char *ptr;
+ int size = 0;
+ int result;
+
+	ptr = update_get_buf_internal(reply, index, &size);
+	/* validate the slot before dereferencing it */
+	LASSERT(ptr != NULL && size >= sizeof(int));
+	result = *(int *)ptr;
+
+	if (result < 0)
+		return result;
+
+	*buf = ptr + sizeof(int);
+	return size - sizeof(int);
+}
+
+static inline int update_get_reply_result(struct update_reply *reply,
+ void **buf, int index)
+{
+ void *ptr;
+ int size;
+
+ ptr = update_get_buf_internal(reply, index, &size);
+ LASSERT(ptr != NULL && size > sizeof(int));
+ return *(int *)ptr;
+}
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_ver.h b/drivers/staging/lustre/lustre/include/lustre_ver.h
new file mode 100644
index 000000000000..dc187b8f741f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_ver.h
@@ -0,0 +1,24 @@
+#ifndef _LUSTRE_VER_H_
+#define _LUSTRE_VER_H_
+/* This file is automatically generated from lustre/include/lustre_ver.h.in,
+ * based on parameters in lustre/autoconf/lustre-version.ac.
+ * Changes made directly to this file will be lost. */
+
+#define LUSTRE_MAJOR 2
+#define LUSTRE_MINOR 3
+#define LUSTRE_PATCH 64
+#define LUSTRE_FIX 0
+#define LUSTRE_VERSION_STRING "2.3.64"
+
+#define LUSTRE_VERSION_CODE OBD_OCD_VERSION(LUSTRE_MAJOR,LUSTRE_MINOR,LUSTRE_PATCH,LUSTRE_FIX)
+
+/* liblustre clients are only allowed to connect if their LUSTRE_FIX mismatches
+ * by no more than this amount (set in lustre/autoconf/lustre-version.ac). */
+#define LUSTRE_VERSION_ALLOWED_OFFSET OBD_OCD_VERSION(0, 0, 1, 32)
+
+/* If the Lustre version of a client differs from that of the servers it
+ * connects to by more than this amount, the client issues a warning.
+ * (set in lustre/autoconf/lustre-version.ac) */
+#define LUSTRE_VERSION_OFFSET_WARN OBD_OCD_VERSION(0, 4, 0, 0)
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lvfs.h b/drivers/staging/lustre/lustre/include/lvfs.h
new file mode 100644
index 000000000000..28f1a6b76f73
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lvfs.h
@@ -0,0 +1,57 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lvfs.h
+ *
+ * lustre VFS/process permission interface
+ */
+
+#ifndef __LVFS_H__
+#define __LVFS_H__
+
+#define LL_FID_NAMELEN (16 + 1 + 8 + 1)
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lvfs.h>
+
+#include <linux/libcfs/lucache.h>
+
+/* lvfs_common.c */
+struct dentry *lvfs_fid2dentry(struct lvfs_run_ctxt *, __u64, __u32, __u64, void *data);
+
+void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx,
+ struct lvfs_ucred *cred);
+void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx,
+ struct lvfs_ucred *cred);
+#endif
diff --git a/drivers/staging/lustre/lustre/include/md_object.h b/drivers/staging/lustre/lustre/include/md_object.h
new file mode 100644
index 000000000000..92d6420b21da
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/md_object.h
@@ -0,0 +1,908 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/md_object.h
+ *
+ * Extension of lu_object.h for metadata objects
+ */
+
+#ifndef _LUSTRE_MD_OBJECT_H
+#define _LUSTRE_MD_OBJECT_H
+
+/** \defgroup md md
+ * Sub-class of lu_object with methods common for "meta-data" objects in MDT
+ * stack.
+ *
+ * Meta-data objects implement namespace operations: you can link and unlink
+ * them, and treat them as directories.
+ *
+ * Examples: mdt, cmm, and mdd are implementations of the md interface.
+ * @{
+ */
+
+/*
+ * super-class definitions.
+ */
+#include <dt_object.h>
+
+struct md_device;
+struct md_device_operations;
+struct md_object;
+struct obd_export;
+
+enum {
+ UCRED_INVALID = -1,
+ UCRED_INIT = 0,
+ UCRED_OLD = 1,
+ UCRED_NEW = 2
+};
+
+enum {
+ MD_CAPAINFO_MAX = 5
+};
+
+/** There are at most 5 fids in one operation (see rename); NOTE: the last
+ * one is a temporary fid used for is_subdir(). */
+struct md_capainfo {
+ __u32 mc_auth;
+ __u32 mc_padding;
+ struct lu_fid mc_fid[MD_CAPAINFO_MAX];
+ struct lustre_capa *mc_capa[MD_CAPAINFO_MAX];
+};
+
+struct md_quota {
+ struct obd_export *mq_exp;
+};
+
+/**
+ * Implemented in mdd/mdd_handler.c.
+ *
+ * XXX should be moved into separate .h/.c together with all md security
+ * related definitions.
+ */
+struct md_capainfo *md_capainfo(const struct lu_env *env);
+struct md_quota *md_quota(const struct lu_env *env);
+
+/** metadata attributes */
+enum ma_valid {
+ MA_INODE = (1 << 0),
+ MA_LOV = (1 << 1),
+ MA_COOKIE = (1 << 2),
+ MA_FLAGS = (1 << 3),
+ MA_LMV = (1 << 4),
+ MA_ACL_DEF = (1 << 5),
+ MA_LOV_DEF = (1 << 6),
+ MA_LAY_GEN = (1 << 7),
+ MA_HSM = (1 << 8),
+ MA_SOM = (1 << 9),
+ MA_PFID = (1 << 10)
+};
+
+typedef enum {
+ MDL_MINMODE = 0,
+ MDL_EX = 1,
+ MDL_PW = 2,
+ MDL_PR = 4,
+ MDL_CW = 8,
+ MDL_CR = 16,
+ MDL_NL = 32,
+ MDL_GROUP = 64,
+ MDL_MAXMODE
+} mdl_mode_t;
+
+typedef enum {
+ MDT_NUL_LOCK = 0,
+ MDT_REG_LOCK = (1 << 0),
+ MDT_PDO_LOCK = (1 << 1)
+} mdl_type_t;
+
+/* In-memory structure for HSM attributes.
+ * For field descriptions see the on-disk structure hsm_attrs,
+ * which is defined in lustre_idl.h.
+ */
+struct md_hsm {
+ __u32 mh_compat;
+ __u32 mh_flags;
+ __u64 mh_arch_id;
+ __u64 mh_arch_ver;
+};
+
+#define IOEPOCH_INVAL 0
+
+/* In-memory structure for SOM attributes.
+ * For field descriptions see the on-disk structure som_attrs,
+ * which is defined in lustre_idl.h.
+ */
+struct md_som_data {
+ __u32 msd_compat;
+ __u32 msd_incompat;
+ __u64 msd_ioepoch;
+ __u64 msd_size;
+ __u64 msd_blocks;
+ __u64 msd_mountid;
+};
+
+struct md_attr {
+ __u64 ma_valid;
+ __u64 ma_need;
+ __u64 ma_attr_flags;
+ struct lu_attr ma_attr;
+ struct lu_fid ma_pfid;
+ struct md_hsm ma_hsm;
+ struct lov_mds_md *ma_lmm;
+ struct lmv_stripe_md *ma_lmv;
+ void *ma_acl;
+ struct llog_cookie *ma_cookie;
+ struct lustre_capa *ma_capa;
+ struct md_som_data *ma_som;
+ int ma_lmm_size;
+ int ma_lmv_size;
+ int ma_acl_size;
+ int ma_cookie_size;
+ __u16 ma_layout_gen;
+};
+
+/** Additional parameters for create */
+struct md_op_spec {
+ union {
+ /** symlink target */
+ const char *sp_symname;
+ /** parent FID for cross-ref mkdir */
+ const struct lu_fid *sp_pfid;
+ /** eadata for regular files */
+ struct md_spec_reg {
+ /** lov objs exist already */
+ const struct lu_fid *fid;
+ const void *eadata;
+ int eadatalen;
+ } sp_ea;
+ } u;
+
+ /** Create flag from client: such as MDS_OPEN_CREAT, and others. */
+ __u64 sp_cr_flags;
+
+	/** don't create lov objects or llog cookie - this is a replay */
+	unsigned int no_create:1,
+		     sp_cr_lookup:1, /* whether to do a lookup sanity check */
+ sp_rm_entry:1; /* only remove name entry */
+
+	/** Current lock mode of the parent dir where the create is performed. */
+ mdl_mode_t sp_cr_mode;
+
+ /** to create directory */
+ const struct dt_index_features *sp_feat;
+};
+
+/**
+ * Operations implemented for each md object (both directory and leaf).
+ */
+struct md_object_operations {
+ int (*moo_permission)(const struct lu_env *env,
+ struct md_object *pobj, struct md_object *cobj,
+ struct md_attr *attr, int mask);
+
+ int (*moo_attr_get)(const struct lu_env *env, struct md_object *obj,
+ struct md_attr *attr);
+
+ int (*moo_attr_set)(const struct lu_env *env, struct md_object *obj,
+ const struct md_attr *attr);
+
+ int (*moo_xattr_get)(const struct lu_env *env, struct md_object *obj,
+ struct lu_buf *buf, const char *name);
+
+ int (*moo_xattr_list)(const struct lu_env *env, struct md_object *obj,
+ struct lu_buf *buf);
+
+ int (*moo_xattr_set)(const struct lu_env *env, struct md_object *obj,
+ const struct lu_buf *buf, const char *name,
+ int fl);
+
+ int (*moo_xattr_del)(const struct lu_env *env, struct md_object *obj,
+ const char *name);
+
+ /** This method is used to swap the layouts between 2 objects */
+ int (*moo_swap_layouts)(const struct lu_env *env,
+ struct md_object *obj1, struct md_object *obj2,
+ __u64 flags);
+
+ /** \retval number of bytes actually read upon success */
+ int (*moo_readpage)(const struct lu_env *env, struct md_object *obj,
+ const struct lu_rdpg *rdpg);
+
+ int (*moo_readlink)(const struct lu_env *env, struct md_object *obj,
+ struct lu_buf *buf);
+ int (*moo_changelog)(const struct lu_env *env,
+ enum changelog_rec_type type, int flags,
+ struct md_object *obj);
+ /** part of cross-ref operation */
+ int (*moo_object_create)(const struct lu_env *env,
+ struct md_object *obj,
+ const struct md_op_spec *spec,
+ struct md_attr *ma);
+
+ int (*moo_ref_add)(const struct lu_env *env,
+ struct md_object *obj,
+ const struct md_attr *ma);
+
+ int (*moo_ref_del)(const struct lu_env *env,
+ struct md_object *obj,
+ struct md_attr *ma);
+
+ int (*moo_open)(const struct lu_env *env,
+ struct md_object *obj, int flag);
+
+ int (*moo_close)(const struct lu_env *env, struct md_object *obj,
+ struct md_attr *ma, int mode);
+
+ int (*moo_capa_get)(const struct lu_env *, struct md_object *,
+ struct lustre_capa *, int renewal);
+
+ int (*moo_object_sync)(const struct lu_env *, struct md_object *);
+
+ int (*moo_file_lock)(const struct lu_env *env, struct md_object *obj,
+ struct lov_mds_md *lmm, struct ldlm_extent *extent,
+ struct lustre_handle *lockh);
+ int (*moo_file_unlock)(const struct lu_env *env, struct md_object *obj,
+ struct lov_mds_md *lmm,
+ struct lustre_handle *lockh);
+ int (*moo_object_lock)(const struct lu_env *env, struct md_object *obj,
+ struct lustre_handle *lh,
+ struct ldlm_enqueue_info *einfo,
+ void *policy);
+};
+
+/**
+ * Operations implemented for each directory object.
+ */
+struct md_dir_operations {
+ int (*mdo_is_subdir) (const struct lu_env *env, struct md_object *obj,
+ const struct lu_fid *fid, struct lu_fid *sfid);
+
+ int (*mdo_lookup)(const struct lu_env *env, struct md_object *obj,
+ const struct lu_name *lname, struct lu_fid *fid,
+ struct md_op_spec *spec);
+
+ mdl_mode_t (*mdo_lock_mode)(const struct lu_env *env,
+ struct md_object *obj,
+ mdl_mode_t mode);
+
+ int (*mdo_create)(const struct lu_env *env, struct md_object *pobj,
+ const struct lu_name *lname, struct md_object *child,
+ struct md_op_spec *spec,
+ struct md_attr *ma);
+
+	/** This method is used to create the data object for this meta object */
+ int (*mdo_create_data)(const struct lu_env *env, struct md_object *p,
+ struct md_object *o,
+ const struct md_op_spec *spec,
+ struct md_attr *ma);
+
+ int (*mdo_rename)(const struct lu_env *env, struct md_object *spobj,
+ struct md_object *tpobj, const struct lu_fid *lf,
+ const struct lu_name *lsname, struct md_object *tobj,
+ const struct lu_name *ltname, struct md_attr *ma);
+
+ int (*mdo_link)(const struct lu_env *env, struct md_object *tgt_obj,
+ struct md_object *src_obj, const struct lu_name *lname,
+ struct md_attr *ma);
+
+ int (*mdo_unlink)(const struct lu_env *env, struct md_object *pobj,
+ struct md_object *cobj, const struct lu_name *lname,
+ struct md_attr *ma, int no_name);
+
+	/** This method is used to compare a requested layout to an existing
+	 * layout (struct lov_user_md_v1/3 vs struct lov_mds_md_v1/3) */
+ int (*mdo_lum_lmm_cmp)(const struct lu_env *env,
+ struct md_object *cobj,
+ const struct md_op_spec *spec,
+ struct md_attr *ma);
+
+ /** partial ops for cross-ref case */
+ int (*mdo_name_insert)(const struct lu_env *env,
+ struct md_object *obj,
+ const struct lu_name *lname,
+ const struct lu_fid *fid,
+ const struct md_attr *ma);
+
+ int (*mdo_name_remove)(const struct lu_env *env,
+ struct md_object *obj,
+ const struct lu_name *lname,
+ const struct md_attr *ma);
+
+ int (*mdo_rename_tgt)(const struct lu_env *env, struct md_object *pobj,
+ struct md_object *tobj, const struct lu_fid *fid,
+ const struct lu_name *lname, struct md_attr *ma);
+};
+
+struct md_device_operations {
+ /** meta-data device related handlers. */
+ int (*mdo_root_get)(const struct lu_env *env, struct md_device *m,
+ struct lu_fid *f);
+
+ int (*mdo_maxsize_get)(const struct lu_env *env, struct md_device *m,
+ int *md_size, int *cookie_size);
+
+ int (*mdo_statfs)(const struct lu_env *env, struct md_device *m,
+ struct obd_statfs *sfs);
+
+ int (*mdo_init_capa_ctxt)(const struct lu_env *env, struct md_device *m,
+ int mode, unsigned long timeout, __u32 alg,
+ struct lustre_capa_key *keys);
+
+ int (*mdo_update_capa_key)(const struct lu_env *env,
+ struct md_device *m,
+ struct lustre_capa_key *key);
+
+ int (*mdo_llog_ctxt_get)(const struct lu_env *env,
+ struct md_device *m, int idx, void **h);
+
+ int (*mdo_iocontrol)(const struct lu_env *env, struct md_device *m,
+ unsigned int cmd, int len, void *data);
+};
+
+enum md_upcall_event {
+	/** Sync the md layer */
+	MD_LOV_SYNC = (1 << 0),
+	/** Just for split; no transaction needed; used for replay */
+ MD_NO_TRANS = (1 << 1),
+ MD_LOV_CONFIG = (1 << 2),
+ /** Trigger quota recovery */
+ MD_LOV_QUOTA = (1 << 3)
+};
+
+struct md_upcall {
+	/** This lock protects the upcall against removal while it is in use:
+	 * the read lock is held while using the upcall, the write lock for
+	 * init/fini. */
+ struct rw_semaphore mu_upcall_sem;
+ /** device to call, upper layer normally */
+ struct md_device *mu_upcall_dev;
+ /** upcall function */
+ int (*mu_upcall)(const struct lu_env *env, struct md_device *md,
+ enum md_upcall_event ev, void *data);
+};
+
+struct md_device {
+ struct lu_device md_lu_dev;
+ const struct md_device_operations *md_ops;
+ struct md_upcall md_upcall;
+};
+
+static inline void md_upcall_init(struct md_device *m, void *upcl)
+{
+ init_rwsem(&m->md_upcall.mu_upcall_sem);
+ m->md_upcall.mu_upcall_dev = NULL;
+ m->md_upcall.mu_upcall = upcl;
+}
+
+static inline void md_upcall_dev_set(struct md_device *m, struct md_device *up)
+{
+ down_write(&m->md_upcall.mu_upcall_sem);
+ m->md_upcall.mu_upcall_dev = up;
+ up_write(&m->md_upcall.mu_upcall_sem);
+}
+
+static inline void md_upcall_fini(struct md_device *m)
+{
+ down_write(&m->md_upcall.mu_upcall_sem);
+ m->md_upcall.mu_upcall_dev = NULL;
+ m->md_upcall.mu_upcall = NULL;
+ up_write(&m->md_upcall.mu_upcall_sem);
+}
+
+static inline int md_do_upcall(const struct lu_env *env, struct md_device *m,
+ enum md_upcall_event ev, void *data)
+{
+ int rc = 0;
+
+	down_read(&m->md_upcall.mu_upcall_sem);
+ if (m->md_upcall.mu_upcall_dev != NULL &&
+ m->md_upcall.mu_upcall_dev->md_upcall.mu_upcall != NULL) {
+ rc = m->md_upcall.mu_upcall_dev->md_upcall.mu_upcall(env,
+ m->md_upcall.mu_upcall_dev,
+ ev, data);
+ }
+ up_read(&m->md_upcall.mu_upcall_sem);
+ return rc;
+}
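+
+/*
+ * Hypothetical wiring of the upcall chain, assuming a lower device "mdd"
+ * and an upper device "mdt" whose handler is mdt_upcall():
+ *
+ *	md_upcall_init(mdt, mdt_upcall);  (register handler on the upper dev)
+ *	md_upcall_init(mdd, NULL);        (lower dev: just init the semaphore)
+ *	md_upcall_dev_set(mdd, mdt);      (point the lower dev at the upper)
+ *	md_do_upcall(env, mdd, MD_LOV_SYNC, NULL);
+ */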
+
+struct md_object {
+ struct lu_object mo_lu;
+ const struct md_object_operations *mo_ops;
+ const struct md_dir_operations *mo_dir_ops;
+};
+
+/**
+ * seq-server site.
+ */
+struct seq_server_site {
+ struct lu_site *ss_lu;
+ /**
+ * mds number of this site.
+ */
+ mdsno_t ss_node_id;
+ /**
+ * Fid location database
+ */
+ struct lu_server_fld *ss_server_fld;
+ struct lu_client_fld *ss_client_fld;
+
+ /**
+ * Server Seq Manager
+ */
+ struct lu_server_seq *ss_server_seq;
+
+ /**
+ * Controller Seq Manager
+ */
+ struct lu_server_seq *ss_control_seq;
+ struct obd_export *ss_control_exp;
+
+ /**
+ * Client Seq Manager
+ */
+ struct lu_client_seq *ss_client_seq;
+};
+
+static inline struct md_device *lu2md_dev(const struct lu_device *d)
+{
+ LASSERT(IS_ERR(d) || lu_device_is_md(d));
+ return container_of0(d, struct md_device, md_lu_dev);
+}
+
+static inline struct lu_device *md2lu_dev(struct md_device *d)
+{
+ return &d->md_lu_dev;
+}
+
+static inline struct md_object *lu2md(const struct lu_object *o)
+{
+ LASSERT(o == NULL || IS_ERR(o) || lu_device_is_md(o->lo_dev));
+ return container_of0(o, struct md_object, mo_lu);
+}
+
+static inline struct md_object *md_object_next(const struct md_object *obj)
+{
+ return (obj ? lu2md(lu_object_next(&obj->mo_lu)) : NULL);
+}
+
+static inline struct md_device *md_obj2dev(const struct md_object *o)
+{
+ LASSERT(o == NULL || IS_ERR(o) || lu_device_is_md(o->mo_lu.lo_dev));
+ return container_of0(o->mo_lu.lo_dev, struct md_device, md_lu_dev);
+}
+
+static inline struct seq_server_site *lu_site2seq(const struct lu_site *s)
+{
+ return s->ld_seq_site;
+}
+
+static inline int md_device_init(struct md_device *md, struct lu_device_type *t)
+{
+ return lu_device_init(&md->md_lu_dev, t);
+}
+
+static inline void md_device_fini(struct md_device *md)
+{
+ lu_device_fini(&md->md_lu_dev);
+}
+
+static inline struct md_object *md_object_find_slice(const struct lu_env *env,
+ struct md_device *md,
+ const struct lu_fid *f)
+{
+ return lu2md(lu_object_find_slice(env, md2lu_dev(md), f, NULL));
+}
+
+/** md operations */
+static inline int mo_permission(const struct lu_env *env,
+ struct md_object *p,
+ struct md_object *c,
+ struct md_attr *at,
+ int mask)
+{
+ LASSERT(c->mo_ops->moo_permission);
+ return c->mo_ops->moo_permission(env, p, c, at, mask);
+}
+
+static inline int mo_attr_get(const struct lu_env *env,
+ struct md_object *m,
+ struct md_attr *at)
+{
+ LASSERT(m->mo_ops->moo_attr_get);
+ return m->mo_ops->moo_attr_get(env, m, at);
+}
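+
+/*
+ * Illustrative only (error handling and object release are elided): a
+ * server-side caller holding a FID can combine the lookup helper above
+ * with the mo_* wrappers, e.g.
+ *
+ *	struct md_object *o = md_object_find_slice(env, mdev, fid);
+ *
+ *	if (!IS_ERR(o))
+ *		rc = mo_attr_get(env, o, ma);
+ */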
+
+static inline int mo_readlink(const struct lu_env *env,
+ struct md_object *m,
+ struct lu_buf *buf)
+{
+ LASSERT(m->mo_ops->moo_readlink);
+ return m->mo_ops->moo_readlink(env, m, buf);
+}
+
+static inline int mo_changelog(const struct lu_env *env,
+ enum changelog_rec_type type,
+ int flags, struct md_object *m)
+{
+ LASSERT(m->mo_ops->moo_changelog);
+ return m->mo_ops->moo_changelog(env, type, flags, m);
+}
+
+static inline int mo_attr_set(const struct lu_env *env,
+ struct md_object *m,
+ const struct md_attr *at)
+{
+ LASSERT(m->mo_ops->moo_attr_set);
+ return m->mo_ops->moo_attr_set(env, m, at);
+}
+
+static inline int mo_xattr_get(const struct lu_env *env,
+ struct md_object *m,
+ struct lu_buf *buf,
+ const char *name)
+{
+ LASSERT(m->mo_ops->moo_xattr_get);
+ return m->mo_ops->moo_xattr_get(env, m, buf, name);
+}
+
+static inline int mo_xattr_del(const struct lu_env *env,
+ struct md_object *m,
+ const char *name)
+{
+ LASSERT(m->mo_ops->moo_xattr_del);
+ return m->mo_ops->moo_xattr_del(env, m, name);
+}
+
+static inline int mo_xattr_set(const struct lu_env *env,
+ struct md_object *m,
+ const struct lu_buf *buf,
+ const char *name,
+ int flags)
+{
+ LASSERT(m->mo_ops->moo_xattr_set);
+ return m->mo_ops->moo_xattr_set(env, m, buf, name, flags);
+}
+
+static inline int mo_xattr_list(const struct lu_env *env,
+ struct md_object *m,
+ struct lu_buf *buf)
+{
+ LASSERT(m->mo_ops->moo_xattr_list);
+ return m->mo_ops->moo_xattr_list(env, m, buf);
+}
+
+static inline int mo_swap_layouts(const struct lu_env *env,
+ struct md_object *o1,
+ struct md_object *o2, __u64 flags)
+{
+ LASSERT(o1->mo_ops->moo_swap_layouts);
+ LASSERT(o2->mo_ops->moo_swap_layouts);
+ if (o1->mo_ops->moo_swap_layouts != o2->mo_ops->moo_swap_layouts)
+ return -EPERM;
+ return o1->mo_ops->moo_swap_layouts(env, o1, o2, flags);
+}
+
+static inline int mo_open(const struct lu_env *env,
+ struct md_object *m,
+ int flags)
+{
+ LASSERT(m->mo_ops->moo_open);
+ return m->mo_ops->moo_open(env, m, flags);
+}
+
+static inline int mo_close(const struct lu_env *env,
+ struct md_object *m,
+ struct md_attr *ma,
+ int mode)
+{
+ LASSERT(m->mo_ops->moo_close);
+ return m->mo_ops->moo_close(env, m, ma, mode);
+}
+
+static inline int mo_readpage(const struct lu_env *env,
+ struct md_object *m,
+ const struct lu_rdpg *rdpg)
+{
+ LASSERT(m->mo_ops->moo_readpage);
+ return m->mo_ops->moo_readpage(env, m, rdpg);
+}
+
+static inline int mo_object_create(const struct lu_env *env,
+ struct md_object *m,
+ const struct md_op_spec *spc,
+ struct md_attr *at)
+{
+ LASSERT(m->mo_ops->moo_object_create);
+ return m->mo_ops->moo_object_create(env, m, spc, at);
+}
+
+static inline int mo_ref_add(const struct lu_env *env,
+ struct md_object *m,
+ const struct md_attr *ma)
+{
+ LASSERT(m->mo_ops->moo_ref_add);
+ return m->mo_ops->moo_ref_add(env, m, ma);
+}
+
+static inline int mo_ref_del(const struct lu_env *env,
+ struct md_object *m,
+ struct md_attr *ma)
+{
+ LASSERT(m->mo_ops->moo_ref_del);
+ return m->mo_ops->moo_ref_del(env, m, ma);
+}
+
+static inline int mo_capa_get(const struct lu_env *env,
+ struct md_object *m,
+ struct lustre_capa *c,
+ int renewal)
+{
+ LASSERT(m->mo_ops->moo_capa_get);
+ return m->mo_ops->moo_capa_get(env, m, c, renewal);
+}
+
+static inline int mo_object_sync(const struct lu_env *env, struct md_object *m)
+{
+ LASSERT(m->mo_ops->moo_object_sync);
+ return m->mo_ops->moo_object_sync(env, m);
+}
+
+static inline int mo_file_lock(const struct lu_env *env, struct md_object *m,
+ struct lov_mds_md *lmm,
+ struct ldlm_extent *extent,
+ struct lustre_handle *lockh)
+{
+ LASSERT(m->mo_ops->moo_file_lock);
+ return m->mo_ops->moo_file_lock(env, m, lmm, extent, lockh);
+}
+
+static inline int mo_file_unlock(const struct lu_env *env, struct md_object *m,
+ struct lov_mds_md *lmm,
+ struct lustre_handle *lockh)
+{
+ LASSERT(m->mo_ops->moo_file_unlock);
+ return m->mo_ops->moo_file_unlock(env, m, lmm, lockh);
+}
+
+static inline int mo_object_lock(const struct lu_env *env,
+ struct md_object *m,
+ struct lustre_handle *lh,
+ struct ldlm_enqueue_info *einfo,
+ void *policy)
+{
+ LASSERT(m->mo_ops->moo_object_lock);
+ return m->mo_ops->moo_object_lock(env, m, lh, einfo, policy);
+}
+
+static inline int mdo_lookup(const struct lu_env *env,
+ struct md_object *p,
+ const struct lu_name *lname,
+ struct lu_fid *f,
+ struct md_op_spec *spec)
+{
+ LASSERT(p->mo_dir_ops->mdo_lookup);
+ return p->mo_dir_ops->mdo_lookup(env, p, lname, f, spec);
+}
+
+static inline mdl_mode_t mdo_lock_mode(const struct lu_env *env,
+ struct md_object *mo,
+ mdl_mode_t lm)
+{
+ if (mo->mo_dir_ops->mdo_lock_mode == NULL)
+ return MDL_MINMODE;
+ return mo->mo_dir_ops->mdo_lock_mode(env, mo, lm);
+}
+
+static inline int mdo_create(const struct lu_env *env,
+ struct md_object *p,
+ const struct lu_name *lchild_name,
+ struct md_object *c,
+ struct md_op_spec *spc,
+ struct md_attr *at)
+{
+ LASSERT(p->mo_dir_ops->mdo_create);
+ return p->mo_dir_ops->mdo_create(env, p, lchild_name, c, spc, at);
+}
+
+static inline int mdo_create_data(const struct lu_env *env,
+ struct md_object *p,
+ struct md_object *c,
+ const struct md_op_spec *spec,
+ struct md_attr *ma)
+{
+ LASSERT(c->mo_dir_ops->mdo_create_data);
+ return c->mo_dir_ops->mdo_create_data(env, p, c, spec, ma);
+}
+
+static inline int mdo_rename(const struct lu_env *env,
+ struct md_object *sp,
+ struct md_object *tp,
+ const struct lu_fid *lf,
+ const struct lu_name *lsname,
+ struct md_object *t,
+ const struct lu_name *ltname,
+ struct md_attr *ma)
+{
+ LASSERT(tp->mo_dir_ops->mdo_rename);
+ return tp->mo_dir_ops->mdo_rename(env, sp, tp, lf, lsname, t, ltname,
+ ma);
+}
+
+static inline int mdo_is_subdir(const struct lu_env *env,
+ struct md_object *mo,
+ const struct lu_fid *fid,
+ struct lu_fid *sfid)
+{
+ LASSERT(mo->mo_dir_ops->mdo_is_subdir);
+ return mo->mo_dir_ops->mdo_is_subdir(env, mo, fid, sfid);
+}
+
+static inline int mdo_link(const struct lu_env *env,
+ struct md_object *p,
+ struct md_object *s,
+ const struct lu_name *lname,
+ struct md_attr *ma)
+{
+ LASSERT(s->mo_dir_ops->mdo_link);
+ return s->mo_dir_ops->mdo_link(env, p, s, lname, ma);
+}
+
+static inline int mdo_unlink(const struct lu_env *env,
+ struct md_object *p,
+ struct md_object *c,
+ const struct lu_name *lname,
+ struct md_attr *ma, int no_name)
+{
+ LASSERT(p->mo_dir_ops->mdo_unlink);
+ return p->mo_dir_ops->mdo_unlink(env, p, c, lname, ma, no_name);
+}
+
+static inline int mdo_lum_lmm_cmp(const struct lu_env *env,
+ struct md_object *c,
+ const struct md_op_spec *spec,
+ struct md_attr *ma)
+{
+ LASSERT(c->mo_dir_ops->mdo_lum_lmm_cmp);
+ return c->mo_dir_ops->mdo_lum_lmm_cmp(env, c, spec, ma);
+}
+
+static inline int mdo_name_insert(const struct lu_env *env,
+ struct md_object *p,
+ const struct lu_name *lname,
+ const struct lu_fid *f,
+ const struct md_attr *ma)
+{
+ LASSERT(p->mo_dir_ops->mdo_name_insert);
+ return p->mo_dir_ops->mdo_name_insert(env, p, lname, f, ma);
+}
+
+static inline int mdo_name_remove(const struct lu_env *env,
+ struct md_object *p,
+ const struct lu_name *lname,
+ const struct md_attr *ma)
+{
+ LASSERT(p->mo_dir_ops->mdo_name_remove);
+ return p->mo_dir_ops->mdo_name_remove(env, p, lname, ma);
+}
+
+static inline int mdo_rename_tgt(const struct lu_env *env,
+ struct md_object *p,
+ struct md_object *t,
+ const struct lu_fid *lf,
+ const struct lu_name *lname,
+ struct md_attr *ma)
+{
+ if (t) {
+ LASSERT(t->mo_dir_ops->mdo_rename_tgt);
+ return t->mo_dir_ops->mdo_rename_tgt(env, p, t, lf, lname, ma);
+ } else {
+ LASSERT(p->mo_dir_ops->mdo_rename_tgt);
+ return p->mo_dir_ops->mdo_rename_tgt(env, p, t, lf, lname, ma);
+ }
+}
+
+/**
+ * Used in MDD/OUT layer for object lock rule
+ **/
+enum mdd_object_role {
+ MOR_SRC_PARENT,
+ MOR_SRC_CHILD,
+ MOR_TGT_PARENT,
+ MOR_TGT_CHILD,
+ MOR_TGT_ORPHAN
+};
+
+struct dt_device;
+/**
+ * Structure to hold object information. This is used to create objects.
+ * \pre llod_dir exists
+ */
+struct lu_local_obj_desc {
+ const char *llod_dir;
+ const char *llod_name;
+ __u32 llod_oid;
+ int llod_is_index;
+ const struct dt_index_features *llod_feat;
+ struct list_head llod_linkage;
+};
+
+int lustre_buf2som(void *buf, int rc, struct md_som_data *msd);
+int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh);
+void lustre_hsm2buf(void *buf, struct md_hsm *mh);
+
+struct lu_ucred {
+ __u32 uc_valid;
+ __u32 uc_o_uid;
+ __u32 uc_o_gid;
+ __u32 uc_o_fsuid;
+ __u32 uc_o_fsgid;
+ __u32 uc_uid;
+ __u32 uc_gid;
+ __u32 uc_fsuid;
+ __u32 uc_fsgid;
+ __u32 uc_suppgids[2];
+ cfs_cap_t uc_cap;
+ __u32 uc_umask;
+ group_info_t *uc_ginfo;
+ struct md_identity *uc_identity;
+};
+
+struct lu_ucred *lu_ucred(const struct lu_env *env);
+
+struct lu_ucred *lu_ucred_check(const struct lu_env *env);
+
+struct lu_ucred *lu_ucred_assert(const struct lu_env *env);
+
+int lu_ucred_global_init(void);
+
+void lu_ucred_global_fini(void);
+
+#define md_cap_t(x) (x)
+
+#define MD_CAP_TO_MASK(x) (1 << (x))
+
+#define md_cap_raised(c, flag) (md_cap_t(c) & MD_CAP_TO_MASK(flag))
+
+/* capable() is copied from the Linux kernel! */
+static inline int md_capable(struct lu_ucred *uc, cfs_cap_t cap)
+{
+ if (md_cap_raised(uc->uc_cap, cap))
+ return 1;
+ return 0;
+}
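+
+/*
+ * Sketch of intended use (CFS_CAP_FOWNER is assumed to be provided by
+ * libcfs, mirroring the kernel's CAP_FOWNER; "la" is a hypothetical
+ * struct lu_attr): an ownership check that falls back to a capability:
+ *
+ *	if (uc->uc_fsuid != la->la_uid && !md_capable(uc, CFS_CAP_FOWNER))
+ *		return -EPERM;
+ */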
+
+/** @} md */
+#endif /* _LINUX_MD_OBJECT_H */
diff --git a/drivers/staging/lustre/lustre/include/obd.h b/drivers/staging/lustre/lustre/include/obd.h
new file mode 100644
index 000000000000..0a251fdfe167
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/obd.h
@@ -0,0 +1,1677 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __OBD_H
+#define __OBD_H
+
+#include <linux/obd.h>
+
+#define IOC_OSC_TYPE 'h'
+#define IOC_OSC_MIN_NR 20
+#define IOC_OSC_SET_ACTIVE _IOWR(IOC_OSC_TYPE, 21, struct obd_device *)
+#define IOC_OSC_MAX_NR 50
+
+#define IOC_MDC_TYPE 'i'
+#define IOC_MDC_MIN_NR 20
+#define IOC_MDC_MAX_NR 50
+
+#include <lustre/lustre_idl.h>
+#include <lu_ref.h>
+#include <lustre_lib.h>
+#include <lustre_export.h>
+#include <lustre_fld.h>
+#include <lustre_capa.h>
+
+#include <linux/libcfs/bitmap.h>
+
+#define MAX_OBD_DEVICES 8192
+
+struct osc_async_rc {
+ int ar_rc;
+ int ar_force_sync;
+ __u64 ar_min_xid;
+};
+
+struct lov_oinfo { /* per-stripe data structure */
+ struct ost_id loi_oi; /* object ID/Sequence on the target OST */
+ int loi_ost_idx; /* OST stripe index in lov_tgt_desc->tgts */
+ int loi_ost_gen; /* generation of this loi_ost_idx */
+
+ unsigned long loi_kms_valid:1;
+ __u64 loi_kms; /* known minimum size */
+ struct ost_lvb loi_lvb;
+ struct osc_async_rc loi_ar;
+};
+
+static inline void loi_kms_set(struct lov_oinfo *oinfo, __u64 kms)
+{
+ oinfo->loi_kms = kms;
+ oinfo->loi_kms_valid = 1;
+}
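+
+/*
+ * Illustrative use (the "lvb" source is an assumption): when a reply
+ * carries an authoritative object size, the client can record it as the
+ * known minimum size so that reads below it need no further size check:
+ *
+ *	loi_kms_set(loi, lvb->lvb_size);
+ */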
+
+static inline void loi_init(struct lov_oinfo *loi)
+{
+}
+
+struct lov_stripe_md {
+ atomic_t lsm_refc;
+ spinlock_t lsm_lock;
+ pid_t lsm_lock_owner; /* debugging */
+
+ /* maximum possible file size; it might change as OST status changes,
+ * e.g. disconnected, deactivated */
+ __u64 lsm_maxbytes;
+ struct {
+ /* Public members. */
+ struct ost_id lw_object_oi; /* lov object id/seq */
+
+ /* LOV-private members start here -- only for use in lov/. */
+ __u32 lw_magic;
+ __u32 lw_stripe_size; /* size of the stripe */
+ __u32 lw_pattern; /* striping pattern (RAID0, RAID1) */
+ __u16 lw_stripe_count; /* number of objects being striped over */
+ __u16 lw_layout_gen; /* generation of the layout */
+ char lw_pool_name[LOV_MAXPOOLNAME]; /* pool name */
+ } lsm_wire;
+
+ struct lov_oinfo *lsm_oinfo[0];
+};
+
+#define lsm_oi lsm_wire.lw_object_oi
+#define lsm_magic lsm_wire.lw_magic
+#define lsm_layout_gen lsm_wire.lw_layout_gen
+#define lsm_stripe_size lsm_wire.lw_stripe_size
+#define lsm_pattern lsm_wire.lw_pattern
+#define lsm_stripe_count lsm_wire.lw_stripe_count
+#define lsm_pool_name lsm_wire.lw_pool_name
+
+struct obd_info;
+
+typedef int (*obd_enqueue_update_f)(void *cookie, int rc);
+
+/* obd info for a particular level (lov, osc). */
+struct obd_info {
+ /* Lock policy. It keeps an extent which is specific to a particular
+ * OSC (e.g. lov_prep_enqueue_set initialises the extent of the policy,
+ * and osc_enqueue passes it into ldlm_lock_match & ldlm_cli_enqueue). */
+ ldlm_policy_data_t oi_policy;
+ /* Flags used to set request-specific flags:
+ - during lock handling, the flags obtained on the enqueue
+ request are set here;
+ - during statfs, the flags used to control delay/resend;
+ - during setattr, the flags used to distinguish the punch operation.
+ */
+ __u64 oi_flags;
+ /* Lock handle specific for every OSC lock. */
+ struct lustre_handle *oi_lockh;
+ /* lsm data specific for every OSC. */
+ struct lov_stripe_md *oi_md;
+ /* obdo data specific for every OSC, if needed at all. */
+ struct obdo *oi_oa;
+ /* statfs data specific for every OSC, if needed at all. */
+ struct obd_statfs *oi_osfs;
+ /* An update callback which is called to update some data at an upper
+ * level. E.g. it is used to update lsm->lsm_oinfo at every received
+ * request at the OSC level for enqueue requests. It is also possible to
+ * update some caller data from the LOV layer if needed. */
+ obd_enqueue_update_f oi_cb_up;
+ /* OSS capability; its type is obd_capa on the client to avoid a copy,
+ * whereas it is lustre_capa on the OSS. */
+ void *oi_capa;
+ /* transfer jobid from ost_sync() to filter_sync()... */
+ char *oi_jobid;
+};
+
+/* compare all relevant fields. */
+static inline int lov_stripe_md_cmp(struct lov_stripe_md *m1,
+ struct lov_stripe_md *m2)
+{
+ /*
+ * ->lsm_wire contains padding, but it should be zeroed out during
+ * allocation.
+ */
+ return memcmp(&m1->lsm_wire, &m2->lsm_wire, sizeof(m1->lsm_wire));
+}
+
+static inline int lov_lum_lsm_cmp(struct lov_user_md *lum,
+ struct lov_stripe_md *lsm)
+{
+ if (lsm->lsm_magic != lum->lmm_magic)
+ return 1;
+ if ((lsm->lsm_stripe_count != 0) && (lum->lmm_stripe_count != 0) &&
+ (lsm->lsm_stripe_count != lum->lmm_stripe_count))
+ return 2;
+ if ((lsm->lsm_stripe_size != 0) && (lum->lmm_stripe_size != 0) &&
+ (lsm->lsm_stripe_size != lum->lmm_stripe_size))
+ return 3;
+ if ((lsm->lsm_pattern != 0) && (lum->lmm_pattern != 0) &&
+ (lsm->lsm_pattern != lum->lmm_pattern))
+ return 4;
+ if ((lsm->lsm_magic == LOV_MAGIC_V3) &&
+ (strncmp(lsm->lsm_pool_name,
+ ((struct lov_user_md_v3 *)lum)->lmm_pool_name,
+ LOV_MAXPOOLNAME) != 0))
+ return 5;
+ return 0;
+}
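+
+/*
+ * The non-zero returns above only identify the first mismatching field
+ * (1 = magic, 2 = stripe count, 3 = stripe size, 4 = pattern,
+ * 5 = pool name); a caller would typically treat any of them alike:
+ *
+ *	if (lov_lum_lsm_cmp(lum, lsm) != 0)
+ *		return -EINVAL;
+ *
+ * (-EINVAL is an illustrative choice, not mandated by this header.)
+ */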
+
+static inline int lov_lum_swab_if_needed(struct lov_user_md_v3 *lumv3,
+ int *lmm_magic,
+ struct lov_user_md *lum)
+{
+ if (lum && copy_from_user(lumv3, lum, sizeof(struct lov_user_md_v1)))
+ return -EFAULT;
+
+ *lmm_magic = lumv3->lmm_magic;
+
+ if (*lmm_magic == __swab32(LOV_USER_MAGIC_V1)) {
+ lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lumv3);
+ *lmm_magic = LOV_USER_MAGIC_V1;
+ } else if (*lmm_magic == LOV_USER_MAGIC_V3) {
+ if (lum && copy_from_user(lumv3, lum, sizeof(*lumv3)))
+ return -EFAULT;
+ } else if (*lmm_magic == __swab32(LOV_USER_MAGIC_V3)) {
+ if (lum && copy_from_user(lumv3, lum, sizeof(*lumv3)))
+ return -EFAULT;
+ lustre_swab_lov_user_md_v3(lumv3);
+ *lmm_magic = LOV_USER_MAGIC_V3;
+ } else if (*lmm_magic != LOV_USER_MAGIC_V1) {
+ CDEBUG(D_IOCTL,
+ "bad userland LOV MAGIC: %#08x != %#08x nor %#08x\n",
+ *lmm_magic, LOV_USER_MAGIC_V1, LOV_USER_MAGIC_V3);
+ return -EINVAL;
+ }
+ return 0;
+}
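+
+/*
+ * A usage sketch (the ioctl context and the "ulum" user pointer are
+ * assumptions): copy and byte-swap a user-supplied layout in one step,
+ * then dispatch on the canonical magic:
+ *
+ *	struct lov_user_md_v3 lumv3;
+ *	int lmm_magic;
+ *	int rc;
+ *
+ *	rc = lov_lum_swab_if_needed(&lumv3, &lmm_magic, ulum);
+ *	if (rc == 0 && lmm_magic == LOV_USER_MAGIC_V3)
+ *		... lumv3 now holds the full, CPU-endian V3 layout ...
+ */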
+
+void lov_stripe_lock(struct lov_stripe_md *md);
+void lov_stripe_unlock(struct lov_stripe_md *md);
+
+struct obd_type {
+ struct list_head typ_chain;
+ struct obd_ops *typ_dt_ops;
+ struct md_ops *typ_md_ops;
+ proc_dir_entry_t *typ_procroot;
+ char *typ_name;
+ int typ_refcnt;
+ struct lu_device_type *typ_lu;
+ spinlock_t obd_type_lock;
+};
+
+struct brw_page {
+ obd_off off;
+ struct page *pg;
+ int count;
+ obd_flag flag;
+};
+
+/* Individual type definitions */
+
+struct ost_server_data;
+
+struct osd_properties {
+ size_t osd_max_ea_size;
+};
+
+#define OBT_MAGIC 0xBDDECEAE
+/* hold common fields for "target" device */
+struct obd_device_target {
+ __u32 obt_magic;
+ __u32 obt_instance;
+ struct super_block *obt_sb;
+ /** last_rcvd file */
+ struct file *obt_rcvd_filp;
+ __u64 obt_mount_count;
+ struct rw_semaphore obt_rwsem;
+ struct vfsmount *obt_vfsmnt;
+ struct file *obt_health_check_filp;
+ struct osd_properties obt_osd_properties;
+ struct obd_job_stats obt_jobstats;
+};
+
+/* llog contexts */
+enum llog_ctxt_id {
+ LLOG_CONFIG_ORIG_CTXT = 0,
+ LLOG_CONFIG_REPL_CTXT,
+ LLOG_MDS_OST_ORIG_CTXT,
+ LLOG_MDS_OST_REPL_CTXT,
+ LLOG_SIZE_ORIG_CTXT,
+ LLOG_SIZE_REPL_CTXT,
+ LLOG_RD1_ORIG_CTXT,
+ LLOG_RD1_REPL_CTXT,
+ LLOG_TEST_ORIG_CTXT,
+ LLOG_TEST_REPL_CTXT,
+ LLOG_LOVEA_ORIG_CTXT,
+ LLOG_LOVEA_REPL_CTXT,
+ LLOG_CHANGELOG_ORIG_CTXT, /**< changelog generation on mdd */
+ LLOG_CHANGELOG_REPL_CTXT, /**< changelog access on clients */
+ LLOG_CHANGELOG_USER_ORIG_CTXT, /**< for multiple changelog consumers */
+ LLOG_MAX_CTXTS
+};
+
+#define FILTER_SUBDIR_COUNT 32 /* set to zero for no subdirs */
+
+struct filter_subdirs {
+ struct dentry *dentry[FILTER_SUBDIR_COUNT];
+};
+
+struct filter_ext {
+ __u64 fe_start;
+ __u64 fe_end;
+};
+
+struct filter_obd {
+ /* NB this field MUST be first */
+ struct obd_device_target fo_obt;
+ const char *fo_fstype;
+
+ int fo_group_count;
+ struct dentry *fo_dentry_O;
+ struct dentry **fo_dentry_O_groups;
+ struct filter_subdirs *fo_dentry_O_sub;
+ struct mutex fo_init_lock; /* group initialization lock*/
+ int fo_committed_group;
+
+ spinlock_t fo_objidlock; /* protect fo_lastobjid */
+
+ unsigned long fo_destroys_in_progress;
+ struct mutex fo_create_locks[FILTER_SUBDIR_COUNT];
+
+ struct list_head fo_export_list;
+ int fo_subdir_count;
+
+ obd_size fo_tot_dirty; /* protected by obd_osfs_lock */
+ obd_size fo_tot_granted; /* all values in bytes */
+ obd_size fo_tot_pending;
+ int fo_tot_granted_clients;
+
+ obd_size fo_readcache_max_filesize;
+ spinlock_t fo_flags_lock;
+ unsigned int fo_read_cache:1, /**< enable read-only cache */
+ fo_writethrough_cache:1,/**< read cache writes */
+ fo_mds_ost_sync:1, /**< MDS-OST orphan recovery*/
+ fo_raid_degraded:1;/**< RAID device degraded */
+
+ struct obd_import *fo_mdc_imp;
+ struct obd_uuid fo_mdc_uuid;
+ struct lustre_handle fo_mdc_conn;
+ struct file **fo_last_objid_files;
+ __u64 *fo_last_objids; /* last created objid for groups,
+ * protected by fo_objidlock */
+
+ struct mutex fo_alloc_lock;
+
+ atomic_t fo_r_in_flight;
+ atomic_t fo_w_in_flight;
+
+ /*
+ * per-filter pool of kiobuf's allocated by filter_common_setup() and
+ * torn down by filter_cleanup().
+ *
+ * This pool contains kiobuf used by
+ * filter_{prep,commit}rw_{read,write}() and is shared by all OST
+ * threads.
+ *
+ * Locking: protected by internal lock of cfs_hash, pool can be
+ * found from this hash table by t_id of ptlrpc_thread.
+ */
+ struct cfs_hash *fo_iobuf_hash;
+
+ struct brw_stats fo_filter_stats;
+
+ int fo_fmd_max_num; /* per exp filter_mod_data */
+ int fo_fmd_max_age; /* jiffies to fmd expiry */
+ unsigned long fo_syncjournal:1, /* sync journal on writes */
+ fo_sync_lock_cancel:2;/* sync on lock cancel */
+
+ /* sptlrpc stuff */
+ rwlock_t fo_sptlrpc_lock;
+ struct sptlrpc_rule_set fo_sptlrpc_rset;
+
+ /* capability related */
+ unsigned int fo_fl_oss_capa;
+ struct list_head fo_capa_keys;
+ struct hlist_head *fo_capa_hash;
+ int fo_sec_level;
+};
+
+struct timeout_item {
+ enum timeout_event ti_event;
+ cfs_time_t ti_timeout;
+ timeout_cb_t ti_cb;
+ void *ti_cb_data;
+ struct list_head ti_obd_list;
+ struct list_head ti_chain;
+};
+
+#define OSC_MAX_RIF_DEFAULT 8
+#define MDS_OSC_MAX_RIF_DEFAULT 50
+#define OSC_MAX_RIF_MAX 256
+#define OSC_MAX_DIRTY_DEFAULT (OSC_MAX_RIF_DEFAULT * 4)
+#define OSC_MAX_DIRTY_MB_MAX 2048 /* arbitrary, but < MAX_LONG bytes */
+#define OSC_DEFAULT_RESENDS 10
+
+/* possible values for fo_sync_lock_cancel */
+enum {
+ NEVER_SYNC_ON_CANCEL = 0,
+ BLOCKING_SYNC_ON_CANCEL = 1,
+ ALWAYS_SYNC_ON_CANCEL = 2,
+ NUM_SYNC_ON_CANCEL_STATES
+};
+
+#define MDC_MAX_RIF_DEFAULT 8
+#define MDC_MAX_RIF_MAX 512
+
+struct mdc_rpc_lock;
+struct obd_import;
+struct client_obd {
+ struct rw_semaphore cl_sem;
+ struct obd_uuid cl_target_uuid;
+ struct obd_import *cl_import; /* ptlrpc connection state */
+ int cl_conn_count;
+ /* max_mds_easize is purely a performance thing so we don't have to
+ * call obd_size_diskmd() all the time. */
+ int cl_default_mds_easize;
+ int cl_max_mds_easize;
+ int cl_max_mds_cookiesize;
+
+ enum lustre_sec_part cl_sp_me;
+ enum lustre_sec_part cl_sp_to;
+ struct sptlrpc_flavor cl_flvr_mgc; /* fixed flavor of mgc->mgs */
+
+ /* the grant values are protected by loi_list_lock below */
+ long cl_dirty; /* all _dirty_ in bytes */
+ long cl_dirty_max; /* allowed w/o rpc */
+ long cl_dirty_transit; /* dirty synchronous */
+ long cl_avail_grant; /* bytes of credit for ost */
+ long cl_lost_grant; /* lost credits (trunc) */
+
+ /* since we allocate grant by blocks, we don't know how much grant will
+ * be used to add a page into the cache. As a solution, we reserve the
+ * maximum grant before trying to dirty a page and unreserve the rest.
+ * See osc_{reserve|unreserve}_grant for details. */
+ long cl_reserved_grant;
+ struct list_head cl_cache_waiters; /* waiting for cache/grant */
+ cfs_time_t cl_next_shrink_grant; /* jiffies */
+ struct list_head cl_grant_shrink_list; /* Timeout event list */
+ int cl_grant_shrink_interval; /* seconds */
+
+ /* A chunk is an optimal size used by osc_extent to determine
+ * the extent size. A chunk is max(PAGE_CACHE_SIZE, OST block size) */
+ int cl_chunkbits;
+ int cl_chunk;
+ int cl_extent_tax; /* extent overhead, by bytes */
+
+ /* keep track of objects that have lois that contain pages which
+ * have been queued for async brw. this lock also protects the
+ * lists of osc_client_pages that hang off of the loi */
+ /*
+ * ->cl_loi_list_lock protects consistency of
+ * ->cl_loi_{ready,read,write}_list. ->ap_make_ready() and
+ * ->ap_completion() call-backs are executed under this lock. As we
+ * cannot guarantee that these call-backs never block on all platforms
+ * (as a matter of fact they do block on Mac OS X), type of
+ * ->cl_loi_list_lock is platform dependent: it's a spin-lock on Linux
+ * and blocking mutex on Mac OS X. (Alternative is to make this lock
+ * blocking everywhere, but we don't want to slow down fast-path of
+ * our main platform.)
+ *
+ * Exact type of ->cl_loi_list_lock is defined in arch/obd.h together
+ * with client_obd_list_{un,}lock() and
+ * client_obd_list_lock_{init,done}() functions.
+ *
+ * NB by Jinshan: though the field names are still _loi_, the lists
+ * actually contain osc_object{}s.
+ */
+ client_obd_lock_t cl_loi_list_lock;
+ struct list_head cl_loi_ready_list;
+ struct list_head cl_loi_hp_ready_list;
+ struct list_head cl_loi_write_list;
+ struct list_head cl_loi_read_list;
+ int cl_r_in_flight;
+ int cl_w_in_flight;
+ /* just a sum of the loi/lop pending numbers to be exported by /proc */
+ atomic_t cl_pending_w_pages;
+ atomic_t cl_pending_r_pages;
+ __u32 cl_max_pages_per_rpc;
+ int cl_max_rpcs_in_flight;
+ struct obd_histogram cl_read_rpc_hist;
+ struct obd_histogram cl_write_rpc_hist;
+ struct obd_histogram cl_read_page_hist;
+ struct obd_histogram cl_write_page_hist;
+ struct obd_histogram cl_read_offset_hist;
+ struct obd_histogram cl_write_offset_hist;
+
+ /* lru for osc caching pages */
+ struct cl_client_cache *cl_cache;
+ struct list_head cl_lru_osc; /* member of cl_cache->ccc_lru */
+ atomic_t *cl_lru_left;
+ atomic_t cl_lru_busy;
+ atomic_t cl_lru_shrinkers;
+ atomic_t cl_lru_in_list;
+ struct list_head cl_lru_list; /* lru page list */
+ client_obd_lock_t cl_lru_list_lock; /* page list protector */
+
+ /* number of in flight destroy rpcs is limited to max_rpcs_in_flight */
+ atomic_t cl_destroy_in_flight;
+ wait_queue_head_t cl_destroy_waitq;
+
+ struct mdc_rpc_lock *cl_rpc_lock;
+ struct mdc_rpc_lock *cl_close_lock;
+
+ /* mgc datastruct */
+ struct semaphore cl_mgc_sem;
+ struct vfsmount *cl_mgc_vfsmnt;
+ struct dentry *cl_mgc_configs_dir;
+ atomic_t cl_mgc_refcount;
+ struct obd_export *cl_mgc_mgsexp;
+
+ /* checksumming for data sent over the network */
+ unsigned int cl_checksum:1; /* 0 = disabled, 1 = enabled */
+ /* supported checksum types that are worked out at connect time */
+ __u32 cl_supp_cksum_types;
+ /* checksum algorithm to be used */
+ cksum_type_t cl_cksum_type;
+
+ /* also protected by the poorly named _loi_list_lock lock above */
+ struct osc_async_rc cl_ar;
+
+ /* used by quotacheck when the servers are older than 2.4 */
+ int cl_qchk_stat; /* quotacheck stat of the peer */
+#define CL_NOT_QUOTACHECKED 1 /* client->cl_qchk_stat init value */
+#if LUSTRE_VERSION_CODE >= OBD_OCD_VERSION(2, 7, 50, 0)
+#warning "please consider removing quotacheck compatibility code"
+#endif
+
+ /* sequence manager */
+ struct lu_client_seq *cl_seq;
+
+ atomic_t cl_resends; /* resend count */
+
+ /* ptlrpc work for writeback in ptlrpcd context */
+ void *cl_writeback_work;
+ /* hash tables for osc_quota_info */
+ cfs_hash_t *cl_quota_hash[MAXQUOTAS];
+};
+#define obd2cli_tgt(obd) ((char *)(obd)->u.cli.cl_target_uuid.uuid)
+
+struct obd_id_info {
+ __u32 idx;
+ obd_id *data;
+};
+
+/* */
+
+struct echo_obd {
+ struct obd_device_target eo_obt;
+ struct obdo eo_oa;
+ spinlock_t eo_lock;
+ __u64 eo_lastino;
+ struct lustre_handle eo_nl_lock;
+ atomic_t eo_prep;
+};
+
+struct ost_obd {
+ struct ptlrpc_service *ost_service;
+ struct ptlrpc_service *ost_create_service;
+ struct ptlrpc_service *ost_io_service;
+ struct ptlrpc_service *ost_seq_service;
+ struct mutex ost_health_mutex;
+};
+
+struct echo_client_obd {
+ struct obd_export *ec_exp; /* the local connection to osc/lov */
+ spinlock_t ec_lock;
+ struct list_head ec_objects;
+ struct list_head ec_locks;
+ int ec_nstripes;
+ __u64 ec_unique;
+};
+
+struct lov_qos_oss {
+ struct obd_uuid lqo_uuid; /* ptlrpc's c_remote_uuid */
+ struct list_head lqo_oss_list; /* link to lov_qos */
+ __u64 lqo_bavail; /* total bytes avail on OSS */
+ __u64 lqo_penalty; /* current penalty */
+ __u64 lqo_penalty_per_obj;/* penalty decrease every obj*/
+ time_t lqo_used; /* last used time, seconds */
+ __u32 lqo_ost_count; /* number of osts on this oss */
+};
+
+struct ltd_qos {
+ struct lov_qos_oss *ltq_oss; /* oss info */
+ __u64 ltq_penalty; /* current penalty */
+ __u64 ltq_penalty_per_obj; /* penalty decrease every obj*/
+ __u64 ltq_weight; /* net weighting */
+ time_t ltq_used; /* last used time, seconds */
+ unsigned int ltq_usable:1; /* usable for striping */
+};
+
+/* Generic subset of OSTs */
+struct ost_pool {
+ __u32 *op_array; /* array of indices into
+ lov_obd->lov_tgts */
+ unsigned int op_count; /* number of OSTs in the array */
+ unsigned int op_size; /* allocated size of op_array */
+ struct rw_semaphore op_rw_sem; /* to protect ost_pool use */
+};
+
+/* Round-robin allocator data */
+struct lov_qos_rr {
+ __u32 lqr_start_idx; /* start index of new inode */
+ __u32 lqr_offset_idx; /* aliasing for start_idx */
+ int lqr_start_count; /* reseed counter */
+ struct ost_pool lqr_pool; /* round-robin optimized list */
+ unsigned long lqr_dirty:1; /* recalc round-robin list */
+};
+
+/* allow statfs data caching for 1 second */
+#define OBD_STATFS_CACHE_SECONDS 1
+
+struct lov_statfs_data {
+ struct obd_info lsd_oi;
+ struct obd_statfs lsd_statfs;
+};
+/* Stripe placement optimization */
+struct lov_qos {
+ struct list_head lq_oss_list; /* list of OSSs that targets use */
+ struct rw_semaphore lq_rw_sem;
+ __u32 lq_active_oss_count;
+ unsigned int lq_prio_free; /* priority for free space */
+ unsigned int lq_threshold_rr;/* priority for rr */
+ struct lov_qos_rr lq_rr; /* round robin qos data */
+ unsigned long lq_dirty:1, /* recalc qos data */
+ lq_same_space:1,/* the OSTs all have approx.
+ the same space avail */
+ lq_reset:1, /* zero current penalties */
+ lq_statfs_in_progress:1; /* statfs op in
+ progress */
+ /* qos statfs data */
+ struct lov_statfs_data *lq_statfs_data;
+ wait_queue_head_t lq_statfs_waitq; /* waitqueue to notify statfs
+ * requests completion */
+};
+
+struct lov_tgt_desc {
+ struct list_head ltd_kill;
+ struct obd_uuid ltd_uuid;
+ struct obd_device *ltd_obd;
+ struct obd_export *ltd_exp;
+ struct ltd_qos ltd_qos; /* qos info per target */
+ __u32 ltd_gen;
+ __u32 ltd_index; /* index in lov_obd->tgts */
+ unsigned long ltd_active:1,/* is this target up for requests */
+ ltd_activate:1,/* should target be activated */
+ ltd_reap:1; /* should this target be deleted */
+};
+
+/* Pool metadata */
+#define pool_tgt_size(_p) _p->pool_obds.op_size
+#define pool_tgt_count(_p) _p->pool_obds.op_count
+#define pool_tgt_array(_p) _p->pool_obds.op_array
+#define pool_tgt_rw_sem(_p) _p->pool_obds.op_rw_sem
+
+struct pool_desc {
+ char pool_name[LOV_MAXPOOLNAME + 1]; /* name of pool */
+ struct ost_pool pool_obds; /* pool members */
+ atomic_t pool_refcount; /* pool ref. counter */
+ struct lov_qos_rr pool_rr; /* round robin qos */
+ struct hlist_node pool_hash; /* access by poolname */
+ struct list_head pool_list; /* serial access */
+ proc_dir_entry_t *pool_proc_entry; /* file in /proc */
+ struct obd_device *pool_lobd; /* obd of the lov/lod to which
+ * this pool belongs */
+};
+
+struct lov_obd {
+ struct lov_desc desc;
+ struct lov_tgt_desc **lov_tgts; /* sparse array */
+ struct ost_pool lov_packed; /* all OSTs in a packed
+ array */
+ struct mutex lov_lock;
+ struct obd_connect_data lov_ocd;
+ atomic_t lov_refcount;
+ __u32 lov_tgt_count; /* how many OBD's */
+ __u32 lov_active_tgt_count; /* how many active */
+ __u32 lov_death_row;/* tgts scheduled to be deleted */
+ __u32 lov_tgt_size; /* size of tgts array */
+ int lov_connects;
+ int lov_pool_count;
+ cfs_hash_t *lov_pools_hash_body; /* used for key access */
+ struct list_head lov_pool_list; /* used for sequential access */
+ proc_dir_entry_t *lov_pool_proc_entry;
+ enum lustre_sec_part lov_sp_me;
+
+ /* Cached LRU pages from upper layer */
+ void *lov_cache;
+
+ struct rw_semaphore lov_notify_lock;
+};
+
+struct lmv_tgt_desc {
+ struct obd_uuid ltd_uuid;
+ struct obd_export *ltd_exp;
+ int ltd_idx;
+ struct mutex ltd_fid_mutex;
+ unsigned long ltd_active:1; /* target up for requests */
+};
+
+enum placement_policy {
+ PLACEMENT_CHAR_POLICY = 0,
+ PLACEMENT_NID_POLICY = 1,
+ PLACEMENT_INVAL_POLICY = 2,
+ PLACEMENT_MAX_POLICY
+};
+
+typedef enum placement_policy placement_policy_t;
+
+struct lmv_obd {
+ int refcount;
+ struct lu_client_fld lmv_fld;
+ spinlock_t lmv_lock;
+ placement_policy_t lmv_placement;
+ struct lmv_desc desc;
+ struct obd_uuid cluuid;
+ struct obd_export *exp;
+
+ struct mutex init_mutex;
+ int connected;
+ int max_easize;
+ int max_def_easize;
+ int max_cookiesize;
+ int server_timeout;
+
+ int tgts_size; /* size of tgts array */
+ struct lmv_tgt_desc **tgts;
+
+ struct obd_connect_data conn_data;
+};
+
+struct niobuf_local {
+ __u64 lnb_file_offset;
+ __u32 lnb_page_offset;
+ __u32 len;
+ __u32 flags;
+ struct page *page;
+ struct dentry *dentry;
+ int lnb_grant_used;
+ int rc;
+};
+
+#define LUSTRE_FLD_NAME "fld"
+#define LUSTRE_SEQ_NAME "seq"
+
+#define LUSTRE_MDD_NAME "mdd"
+#define LUSTRE_OSD_LDISKFS_NAME "osd-ldiskfs"
+#define LUSTRE_OSD_ZFS_NAME "osd-zfs"
+#define LUSTRE_VVP_NAME "vvp"
+#define LUSTRE_LMV_NAME "lmv"
+#define LUSTRE_SLP_NAME "slp"
+#define LUSTRE_LOD_NAME "lod"
+#define LUSTRE_OSP_NAME "osp"
+#define LUSTRE_LWP_NAME "lwp"
+
+/* obd device type names */
+/* FIXME all the references to LUSTRE_MDS_NAME should be swapped with LUSTRE_MDT_NAME */
+#define LUSTRE_MDS_NAME "mds"
+#define LUSTRE_MDT_NAME "mdt"
+#define LUSTRE_MDC_NAME "mdc"
+#define LUSTRE_OSS_NAME "ost" /* FIXME change name to oss */
+#define LUSTRE_OST_NAME "obdfilter" /* FIXME change name to ost */
+#define LUSTRE_OSC_NAME "osc"
+#define LUSTRE_LOV_NAME "lov"
+#define LUSTRE_MGS_NAME "mgs"
+#define LUSTRE_MGC_NAME "mgc"
+
+#define LUSTRE_ECHO_NAME "obdecho"
+#define LUSTRE_ECHO_CLIENT_NAME "echo_client"
+#define LUSTRE_QMT_NAME "qmt"
+
+/* Constant obd names (post-rename) */
+#define LUSTRE_MDS_OBDNAME "MDS"
+#define LUSTRE_OSS_OBDNAME "OSS"
+#define LUSTRE_MGS_OBDNAME "MGS"
+#define LUSTRE_MGC_OBDNAME "MGC"
+
+static inline int is_osp_on_mdt(char *name)
+{
+ char *ptr;
+
+ ptr = strrchr(name, '-');
+ if (ptr == NULL) {
+ CERROR("%s is not a obdname\n", name);
+ return 0;
+ }
+
+ /* 1.8 OSC/OSP name on MDT is fsname-OSTxxxx-osc */
+ if (strncmp(ptr + 1, "osc", 3) == 0)
+ return 1;
+
+ if (strncmp(ptr + 1, "MDT", 3) != 0)
+ return 0;
+
+ while (*(--ptr) != '-' && ptr != name);
+
+ if (ptr == name)
+ return 0;
+
+ if (strncmp(ptr + 1, LUSTRE_OSP_NAME, strlen(LUSTRE_OSP_NAME)) != 0 &&
+ strncmp(ptr + 1, LUSTRE_OSC_NAME, strlen(LUSTRE_OSC_NAME)) != 0)
+ return 0;
+
+ return 1;
+}
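+
+/*
+ * Examples of what the check above accepts (the names are illustrative):
+ *	"lustre-MDT0000-osp-MDT0001"	-> 1  (OSP device on an MDT)
+ *	"lustre-OST0000-osc-MDT0000"	-> 1  (OSC used on an MDT)
+ *	"lustre-OST0000-osc"		-> 1  (1.8-style name)
+ *	"lustre-MDT0000-mdc"		-> 0
+ */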
+
+/* Don't conflict with on-wire flags OBD_BRW_WRITE, etc */
+#define N_LOCAL_TEMP_PAGE 0x10000000
+
+struct obd_trans_info {
+ __u64 oti_transno;
+ __u64 oti_xid;
+ /* Only used on the server side for tracking acks. */
+ struct oti_req_ack_lock {
+ struct lustre_handle lock;
+ __u32 mode;
+ } oti_ack_locks[4];
+ void *oti_handle;
+ struct llog_cookie oti_onecookie;
+ struct llog_cookie *oti_logcookies;
+ int oti_numcookies;
+ /** synchronous write is needed */
+ unsigned long oti_sync_write:1;
+
+ /* initial thread handling transaction */
+ struct ptlrpc_thread *oti_thread;
+ __u32 oti_conn_cnt;
+ /** VBR: versions */
+ __u64 oti_pre_version;
+ /** JobID */
+ char *oti_jobid;
+
+ struct obd_uuid *oti_ost_uuid;
+};
+
+static inline void oti_init(struct obd_trans_info *oti,
+ struct ptlrpc_request *req)
+{
+ if (oti == NULL)
+ return;
+ memset(oti, 0, sizeof(*oti));
+
+ if (req == NULL)
+ return;
+
+ oti->oti_xid = req->rq_xid;
+ /** VBR: take versions from request */
+ if (req->rq_reqmsg != NULL &&
+ lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) {
+ __u64 *pre_version = lustre_msg_get_versions(req->rq_reqmsg);
+ oti->oti_pre_version = pre_version ? pre_version[0] : 0;
+ oti->oti_transno = lustre_msg_get_transno(req->rq_reqmsg);
+ }
+
+ /** called from mds_create_objects */
+ if (req->rq_repmsg != NULL)
+ oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg);
+ oti->oti_thread = req->rq_svc_thread;
+ if (req->rq_reqmsg != NULL)
+ oti->oti_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg);
+}
+
+static inline void oti_alloc_cookies(struct obd_trans_info *oti,
+ int num_cookies)
+{
+ if (!oti)
+ return;
+
+ if (num_cookies == 1)
+ oti->oti_logcookies = &oti->oti_onecookie;
+ else
+ OBD_ALLOC_LARGE(oti->oti_logcookies,
+ num_cookies * sizeof(oti->oti_onecookie));
+
+ oti->oti_numcookies = num_cookies;
+}
+
+static inline void oti_free_cookies(struct obd_trans_info *oti)
+{
+ if (!oti || !oti->oti_logcookies)
+ return;
+
+ if (oti->oti_logcookies == &oti->oti_onecookie)
+ LASSERT(oti->oti_numcookies == 1);
+ else
+ OBD_FREE_LARGE(oti->oti_logcookies,
+ oti->oti_numcookies * sizeof(oti->oti_onecookie));
+ oti->oti_logcookies = NULL;
+ oti->oti_numcookies = 0;
+}
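+
+/*
+ * The two helpers above are meant to be used as a pair; a sketch
+ * (illustrative, error handling elided):
+ *
+ *	struct obd_trans_info oti;
+ *
+ *	oti_init(&oti, req);
+ *	oti_alloc_cookies(&oti, stripe_count);
+ *	... log records, filling oti.oti_logcookies ...
+ *	oti_free_cookies(&oti);
+ *
+ * Note that for num_cookies == 1 no allocation happens (the embedded
+ * oti_onecookie is used); oti_free_cookies() handles both cases.
+ */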
+
+/*
+ * Events signalled through obd_notify() upcall-chain.
+ */
+enum obd_notify_event {
+ /* target added */
+ OBD_NOTIFY_CREATE,
+ /* Device connect start */
+ OBD_NOTIFY_CONNECT,
+ /* Device activated */
+ OBD_NOTIFY_ACTIVE,
+ /* Device deactivated */
+ OBD_NOTIFY_INACTIVE,
+ /* Device disconnected */
+ OBD_NOTIFY_DISCON,
+ /* Connect data for import were changed */
+ OBD_NOTIFY_OCD,
+ /* Sync request */
+ OBD_NOTIFY_SYNC_NONBLOCK,
+ OBD_NOTIFY_SYNC,
+ /* Configuration event */
+ OBD_NOTIFY_CONFIG,
+ /* Administratively deactivate/activate event */
+ OBD_NOTIFY_DEACTIVATE,
+ OBD_NOTIFY_ACTIVATE
+};
+
+/*
+ * Data structure used to pass obd_notify()-event to non-obd listeners (llite
+ * and liblustre being main examples).
+ */
+struct obd_notify_upcall {
+ int (*onu_upcall)(struct obd_device *host, struct obd_device *watched,
+ enum obd_notify_event ev, void *owner, void *data);
+ /* Opaque datum supplied by upper layer listener */
+ void *onu_owner;
+};
+
+struct target_recovery_data {
+ svc_handler_t trd_recovery_handler;
+ pid_t trd_processing_task;
+ struct completion trd_starting;
+ struct completion trd_finishing;
+};
+
+struct obd_llog_group {
+ int olg_seq;
+ struct llog_ctxt *olg_ctxts[LLOG_MAX_CTXTS];
+ wait_queue_head_t olg_waitq;
+ spinlock_t olg_lock;
+ struct mutex olg_cat_processing;
+};
+
+/* corresponds to one of the obds */
+#define OBD_DEVICE_MAGIC 0XAB5CD6EF
+#define OBD_DEV_BY_DEVNAME 0xffffd0de
+
+struct obd_device {
+ struct obd_type *obd_type;
+ __u32 obd_magic;
+
+ /* common and UUID name of this device */
+ char obd_name[MAX_OBD_NAME];
+ struct obd_uuid obd_uuid;
+
+ struct lu_device *obd_lu_dev;
+
+ int obd_minor;
+ /* bitfield modification is protected by obd_dev_lock */
+ unsigned long obd_attached:1, /* finished attach */
+ obd_set_up:1, /* finished setup */
+ obd_recovering:1, /* there are recoverable clients */
+ obd_abort_recovery:1,/* recovery expired */
+ obd_version_recov:1, /* obd uses version checking */
+ obd_replayable:1, /* recovery is enabled; inform clients */
+ obd_no_transno:1, /* no committed-transno notification */
+ obd_no_recov:1, /* fail instead of retry messages */
+ obd_stopping:1, /* started cleanup */
+ obd_starting:1, /* started setup */
+ obd_force:1, /* cleanup with > 0 obd refcount */
+ obd_fail:1, /* cleanup with failover */
+ obd_async_recov:1, /* allow asynchronous orphan cleanup */
+ obd_no_conn:1, /* deny new connections */
+ obd_inactive:1, /* device active/inactive
+ * (for /proc/status only!!) */
+ obd_no_ir:1, /* no imperative recovery. */
+ obd_process_conf:1; /* device is processing mgs config */
+ /* use a separate field as it is set in interrupt context, to avoid
+ * messing with the protection of the other bits using the _bh lock */
+ unsigned long obd_recovery_expired:1;
+ /* uuid-export hash body */
+ cfs_hash_t *obd_uuid_hash;
+ /* nid-export hash body */
+ cfs_hash_t *obd_nid_hash;
+ /* nid stats body */
+ cfs_hash_t *obd_nid_stats_hash;
+ struct list_head obd_nid_stats;
+ atomic_t obd_refcount;
+ wait_queue_head_t obd_refcount_waitq;
+ struct list_head obd_exports;
+ struct list_head obd_unlinked_exports;
+ struct list_head obd_delayed_exports;
+ int obd_num_exports;
+ spinlock_t obd_nid_lock;
+ struct ldlm_namespace *obd_namespace;
+ struct ptlrpc_client obd_ldlm_client; /* XXX OST/MDS only */
+ /* a spinlock is OK for what we do now, may need a semaphore later */
+ spinlock_t obd_dev_lock; /* protect OBD bitfield above */
+ struct mutex obd_dev_mutex;
+ __u64 obd_last_committed;
+ struct fsfilt_operations *obd_fsops;
+ spinlock_t obd_osfs_lock;
+ struct obd_statfs obd_osfs; /* locked by obd_osfs_lock */
+ __u64 obd_osfs_age;
+ struct lvfs_run_ctxt obd_lvfs_ctxt;
+ struct obd_llog_group obd_olg; /* default llog group */
+ struct obd_device *obd_observer;
+ struct rw_semaphore obd_observer_link_sem;
+ struct obd_notify_upcall obd_upcall;
+ struct obd_export *obd_self_export;
+ /* list of exports in LRU order, for ping evictor, with obd_dev_lock */
+ struct list_head obd_exports_timed;
+ time_t obd_eviction_timer; /* for ping evictor */
+
+ int obd_max_recoverable_clients;
+ atomic_t obd_connected_clients;
+ int obd_stale_clients;
+ int obd_delayed_clients;
+ /* this lock protects all recovery list_heads, timer and
+ * obd_next_recovery_transno value */
+ spinlock_t obd_recovery_task_lock;
+ __u64 obd_next_recovery_transno;
+ int obd_replayed_requests;
+ int obd_requests_queued_for_recovery;
+ wait_queue_head_t obd_next_transno_waitq;
+ /* protected by obd_recovery_task_lock */
+ timer_list_t obd_recovery_timer;
+ time_t obd_recovery_start; /* seconds */
+ time_t obd_recovery_end; /* seconds, for lprocfs_status */
+ int obd_recovery_time_hard;
+ int obd_recovery_timeout;
+ int obd_recovery_ir_factor;
+
+ /* new recovery stuff from CMD2 */
+ struct target_recovery_data obd_recovery_data;
+ int obd_replayed_locks;
+ atomic_t obd_req_replay_clients;
+ atomic_t obd_lock_replay_clients;
+ /* all lists are protected by obd_recovery_task_lock */
+ struct list_head obd_req_replay_queue;
+ struct list_head obd_lock_replay_queue;
+ struct list_head obd_final_req_queue;
+ int obd_recovery_stage;
+
+ union {
+ struct obd_device_target obt;
+ struct filter_obd filter;
+ struct client_obd cli;
+ struct ost_obd ost;
+ struct echo_client_obd echo_client;
+ struct echo_obd echo;
+ struct lov_obd lov;
+ struct lmv_obd lmv;
+ } u;
+ /* Fields used by LProcFS */
+ unsigned int obd_cntr_base;
+ struct lprocfs_stats *obd_stats;
+
+ unsigned int md_cntr_base;
+ struct lprocfs_stats *md_stats;
+
+ proc_dir_entry_t *obd_proc_entry;
+ void *obd_proc_private; /* type private PDEs */
+ proc_dir_entry_t *obd_proc_exports_entry;
+ proc_dir_entry_t *obd_svc_procroot;
+ struct lprocfs_stats *obd_svc_stats;
+ atomic_t obd_evict_inprogress;
+ wait_queue_head_t obd_evict_inprogress_waitq;
+ struct list_head obd_evict_list; /* protected with pet_lock */
+
+ /**
+ * Ldlm pool part. Save last calculated SLV and Limit.
+ */
+ rwlock_t obd_pool_lock;
+ int obd_pool_limit;
+ __u64 obd_pool_slv;
+
+ /**
+ * A list of outstanding class_incref()'s against this obd. For
+ * debugging.
+ */
+ struct lu_ref obd_reference;
+
+ int obd_conn_inprogress;
+};
+
+#define OBD_LLOG_FL_SENDNOW 0x0001
+#define OBD_LLOG_FL_EXIT 0x0002
+
+enum obd_cleanup_stage {
+/* Special case hack for MDS LOVs */
+ OBD_CLEANUP_EARLY,
+/* can be directly mapped to .ldto_device_fini() */
+ OBD_CLEANUP_EXPORTS,
+};
+
+/* get/set_info keys */
+#define KEY_ASYNC "async"
+#define KEY_BLOCKSIZE_BITS "blocksize_bits"
+#define KEY_BLOCKSIZE "blocksize"
+#define KEY_CAPA_KEY "capa_key"
+#define KEY_CHANGELOG_CLEAR "changelog_clear"
+#define KEY_FID2PATH "fid2path"
+#define KEY_CHECKSUM "checksum"
+#define KEY_CLEAR_FS "clear_fs"
+#define KEY_CONN_DATA "conn_data"
+#define KEY_EVICT_BY_NID "evict_by_nid"
+#define KEY_FIEMAP "fiemap"
+#define KEY_FLUSH_CTX "flush_ctx"
+#define KEY_GRANT_SHRINK "grant_shrink"
+#define KEY_HSM_COPYTOOL_SEND "hsm_send"
+#define KEY_INIT_RECOV_BACKUP "init_recov_bk"
+#define KEY_INIT_RECOV "initial_recov"
+#define KEY_INTERMDS "inter_mds"
+#define KEY_LAST_ID "last_id"
+#define KEY_LAST_FID "last_fid"
+#define KEY_LOCK_TO_STRIPE "lock_to_stripe"
+#define KEY_LOVDESC "lovdesc"
+#define KEY_LOV_IDX "lov_idx"
+#define KEY_MAX_EASIZE "max_easize"
+#define KEY_MDS_CONN "mds_conn"
+#define KEY_MGSSEC "mgssec"
+#define KEY_NEXT_ID "next_id"
+#define KEY_READ_ONLY "read-only"
+#define KEY_REGISTER_TARGET "register_target"
+#define KEY_SET_FS "set_fs"
+#define KEY_TGT_COUNT "tgt_count"
+/* KEY_SET_INFO in lustre_idl.h */
+#define KEY_SPTLRPC_CONF "sptlrpc_conf"
+#define KEY_CONNECT_FLAG "connect_flags"
+#define KEY_SYNC_LOCK_CANCEL "sync_lock_cancel"
+
+#define KEY_CACHE_SET "cache_set"
+#define KEY_CACHE_LRU_SHRINK "cache_lru_shrink"
+#define KEY_CHANGELOG_INDEX "changelog_index"
+
+struct lu_context;
+
+/* /!\ must be coherent with include/linux/namei.h on patched kernel */
+#define IT_OPEN (1 << 0)
+#define IT_CREAT (1 << 1)
+#define IT_READDIR (1 << 2)
+#define IT_GETATTR (1 << 3)
+#define IT_LOOKUP (1 << 4)
+#define IT_UNLINK (1 << 5)
+#define IT_TRUNC (1 << 6)
+#define IT_GETXATTR (1 << 7)
+#define IT_EXEC (1 << 8)
+#define IT_PIN (1 << 9)
+#define IT_LAYOUT (1 << 10)
+#define IT_QUOTA_DQACQ (1 << 11)
+#define IT_QUOTA_CONN (1 << 12)
+
+static inline int it_to_lock_mode(struct lookup_intent *it)
+{
+ /* CREAT needs to be tested before open (both could be set) */
+ if (it->it_op & IT_CREAT)
+ return LCK_CW;
+ else if (it->it_op & (IT_READDIR | IT_GETATTR | IT_OPEN | IT_LOOKUP |
+ IT_LAYOUT))
+ return LCK_CR;
+
+ LASSERTF(0, "Invalid it_op: %d\n", it->it_op);
+ return -EINVAL;
+}
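+
+/*
+ * For example, an open-with-create intent takes a CW lock, presumably
+ * because the create path modifies the parent directory, while read-only
+ * intents only need CR:
+ *
+ *	it.it_op = IT_OPEN | IT_CREAT;
+ *	mode = it_to_lock_mode(&it);	yields LCK_CW
+ *
+ *	it.it_op = IT_GETATTR;
+ *	mode = it_to_lock_mode(&it);	yields LCK_CR
+ */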
+
+struct md_op_data {
+ struct lu_fid op_fid1; /* operation fid1 (usually parent) */
+ struct lu_fid op_fid2; /* operation fid2 (usually child) */
+ struct lu_fid op_fid3; /* 2 extra fids to find conflicting */
+ struct lu_fid op_fid4; /* to the operation locks. */
+ mdsno_t op_mds; /* what mds server open will go to */
+ struct lustre_handle op_handle;
+ obd_time op_mod_time;
+ const char *op_name;
+ int op_namelen;
+ __u32 op_mode;
+ struct lmv_stripe_md *op_mea1;
+ struct lmv_stripe_md *op_mea2;
+ __u32 op_suppgids[2];
+ __u32 op_fsuid;
+ __u32 op_fsgid;
+ cfs_cap_t op_cap;
+ void *op_data;
+
+ /* iattr fields and blocks. */
+ struct iattr op_attr;
+ unsigned int op_attr_flags;
+ __u64 op_valid;
+ loff_t op_attr_blocks;
+
+ /* Size-on-MDS epoch and flags. */
+ __u64 op_ioepoch;
+ __u32 op_flags;
+
+ /* Capa fields */
+ struct obd_capa *op_capa1;
+ struct obd_capa *op_capa2;
+
+ /* Various operation flags. */
+ __u32 op_bias;
+
+ /* Operation type */
+ __u32 op_opc;
+
+ /* Used by readdir */
+ __u64 op_offset;
+
+ /* Used by readdir */
+ __u32 op_npages;
+
+ /* used to transfer info between the stacks of the MD client;
+ * see enum op_cli_flags */
+ __u32 op_cli_flags;
+};
+
+enum op_cli_flags {
+ CLI_SET_MEA = 1 << 0,
+ CLI_RM_ENTRY = 1 << 1,
+};
+
+struct md_enqueue_info;
+/* metadata stat-ahead */
+typedef int (*md_enqueue_cb_t)(struct ptlrpc_request *req,
+ struct md_enqueue_info *minfo,
+ int rc);
+
+/* seq client type */
+enum lu_cli_type {
+ LUSTRE_SEQ_METADATA = 1,
+ LUSTRE_SEQ_DATA
+};
+
+struct md_enqueue_info {
+ struct md_op_data mi_data;
+ struct lookup_intent mi_it;
+ struct lustre_handle mi_lockh;
+ struct inode *mi_dir;
+ md_enqueue_cb_t mi_cb;
+ __u64 mi_cbdata;
+ unsigned int mi_generation;
+};
+
+struct obd_ops {
+ module_t *o_owner;
+ int (*o_iocontrol)(unsigned int cmd, struct obd_export *exp, int len,
+ void *karg, void *uarg);
+ int (*o_get_info)(const struct lu_env *env, struct obd_export *,
+ __u32 keylen, void *key, __u32 *vallen, void *val,
+ struct lov_stripe_md *lsm);
+ int (*o_set_info_async)(const struct lu_env *, struct obd_export *,
+ __u32 keylen, void *key,
+ __u32 vallen, void *val,
+ struct ptlrpc_request_set *set);
+ int (*o_attach)(struct obd_device *dev, obd_count len, void *data);
+ int (*o_detach)(struct obd_device *dev);
+ int (*o_setup) (struct obd_device *dev, struct lustre_cfg *cfg);
+ int (*o_precleanup)(struct obd_device *dev,
+ enum obd_cleanup_stage cleanup_stage);
+ int (*o_cleanup)(struct obd_device *dev);
+ int (*o_process_config)(struct obd_device *dev, obd_count len,
+ void *data);
+ int (*o_postrecov)(struct obd_device *dev);
+ int (*o_add_conn)(struct obd_import *imp, struct obd_uuid *uuid,
+ int priority);
+ int (*o_del_conn)(struct obd_import *imp, struct obd_uuid *uuid);
+ /* connect to the target device with given connection
+ * data. @ocd->ocd_connect_flags is modified to reflect flags actually
+ * granted by the target, which are guaranteed to be a subset of flags
+ * asked for. If @ocd == NULL, use default parameters. */
+ int (*o_connect)(const struct lu_env *env,
+ struct obd_export **exp, struct obd_device *src,
+ struct obd_uuid *cluuid, struct obd_connect_data *ocd,
+ void *localdata);
+ int (*o_reconnect)(const struct lu_env *env,
+ struct obd_export *exp, struct obd_device *src,
+ struct obd_uuid *cluuid,
+ struct obd_connect_data *ocd,
+ void *localdata);
+ int (*o_disconnect)(struct obd_export *exp);
+
+ /* Initialize/finalize fids infrastructure. */
+ int (*o_fid_init)(struct obd_device *obd,
+ struct obd_export *exp, enum lu_cli_type type);
+ int (*o_fid_fini)(struct obd_device *obd);
+
+ /* Allocate new fid according to passed @hint. */
+ int (*o_fid_alloc)(struct obd_export *exp, struct lu_fid *fid,
+ struct md_op_data *op_data);
+
+ /*
+ * Object with @fid is getting deleted; we may want to do something
+ * about this.
+ */
+ int (*o_statfs)(const struct lu_env *, struct obd_export *exp,
+ struct obd_statfs *osfs, __u64 max_age, __u32 flags);
+ int (*o_statfs_async)(struct obd_export *exp, struct obd_info *oinfo,
+ __u64 max_age, struct ptlrpc_request_set *set);
+ int (*o_packmd)(struct obd_export *exp, struct lov_mds_md **disk_tgt,
+ struct lov_stripe_md *mem_src);
+ int (*o_unpackmd)(struct obd_export *exp,struct lov_stripe_md **mem_tgt,
+ struct lov_mds_md *disk_src, int disk_len);
+ int (*o_preallocate)(struct lustre_handle *, obd_count *req,
+ obd_id *ids);
+ /* FIXME: add fid capability support for create & destroy! */
+ int (*o_precreate)(struct obd_export *exp);
+ int (*o_create)(const struct lu_env *env, struct obd_export *exp,
+ struct obdo *oa, struct lov_stripe_md **ea,
+ struct obd_trans_info *oti);
+ int (*o_create_async)(struct obd_export *exp, struct obd_info *oinfo,
+ struct lov_stripe_md **ea,
+ struct obd_trans_info *oti);
+ int (*o_destroy)(const struct lu_env *env, struct obd_export *exp,
+ struct obdo *oa, struct lov_stripe_md *ea,
+ struct obd_trans_info *oti, struct obd_export *md_exp,
+ void *capa);
+ int (*o_setattr)(const struct lu_env *, struct obd_export *exp,
+ struct obd_info *oinfo, struct obd_trans_info *oti);
+ int (*o_setattr_async)(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_trans_info *oti,
+ struct ptlrpc_request_set *rqset);
+ int (*o_getattr)(const struct lu_env *env, struct obd_export *exp,
+ struct obd_info *oinfo);
+ int (*o_getattr_async)(struct obd_export *exp, struct obd_info *oinfo,
+ struct ptlrpc_request_set *set);
+ int (*o_brw)(int rw, struct obd_export *exp, struct obd_info *oinfo,
+ obd_count oa_bufs, struct brw_page *pgarr,
+ struct obd_trans_info *oti);
+ int (*o_merge_lvb)(struct obd_export *exp, struct lov_stripe_md *lsm,
+ struct ost_lvb *lvb, int kms_only);
+ int (*o_adjust_kms)(struct obd_export *exp, struct lov_stripe_md *lsm,
+ obd_off size, int shrink);
+ int (*o_punch)(const struct lu_env *, struct obd_export *exp,
+ struct obd_info *oinfo, struct obd_trans_info *oti,
+ struct ptlrpc_request_set *rqset);
+ int (*o_sync)(const struct lu_env *env, struct obd_export *exp,
+ struct obd_info *oinfo, obd_size start, obd_size end,
+ struct ptlrpc_request_set *set);
+ int (*o_migrate)(struct lustre_handle *conn, struct lov_stripe_md *dst,
+ struct lov_stripe_md *src, obd_size start,
+ obd_size end, struct obd_trans_info *oti);
+ int (*o_copy)(struct lustre_handle *dstconn, struct lov_stripe_md *dst,
+ struct lustre_handle *srconn, struct lov_stripe_md *src,
+ obd_size start, obd_size end, struct obd_trans_info *);
+ int (*o_iterate)(struct lustre_handle *conn,
+ int (*)(obd_id, obd_seq, void *),
+ obd_id *startid, obd_seq seq, void *data);
+ int (*o_preprw)(const struct lu_env *env, int cmd,
+ struct obd_export *exp, struct obdo *oa, int objcount,
+ struct obd_ioobj *obj, struct niobuf_remote *remote,
+ int *nr_pages, struct niobuf_local *local,
+ struct obd_trans_info *oti, struct lustre_capa *capa);
+ int (*o_commitrw)(const struct lu_env *env, int cmd,
+ struct obd_export *exp, struct obdo *oa,
+ int objcount, struct obd_ioobj *obj,
+ struct niobuf_remote *remote, int pages,
+ struct niobuf_local *local,
+ struct obd_trans_info *oti, int rc);
+ int (*o_enqueue)(struct obd_export *, struct obd_info *oinfo,
+ struct ldlm_enqueue_info *einfo,
+ struct ptlrpc_request_set *rqset);
+ int (*o_change_cbdata)(struct obd_export *, struct lov_stripe_md *,
+ ldlm_iterator_t it, void *data);
+ int (*o_find_cbdata)(struct obd_export *, struct lov_stripe_md *,
+ ldlm_iterator_t it, void *data);
+ int (*o_cancel)(struct obd_export *, struct lov_stripe_md *md,
+ __u32 mode, struct lustre_handle *);
+ int (*o_cancel_unused)(struct obd_export *, struct lov_stripe_md *,
+ ldlm_cancel_flags_t flags, void *opaque);
+ int (*o_init_export)(struct obd_export *exp);
+ int (*o_destroy_export)(struct obd_export *exp);
+ int (*o_extent_calc)(struct obd_export *, struct lov_stripe_md *,
+ int cmd, obd_off *);
+
+ /* llog related obd_methods */
+ int (*o_llog_init)(struct obd_device *obd, struct obd_llog_group *grp,
+ struct obd_device *disk_obd, int *idx);
+ int (*o_llog_finish)(struct obd_device *obd, int count);
+ int (*o_llog_connect)(struct obd_export *, struct llogd_conn_body *);
+
+ /* metadata-only methods */
+ int (*o_pin)(struct obd_export *, const struct lu_fid *fid,
+ struct obd_capa *, struct obd_client_handle *, int flag);
+ int (*o_unpin)(struct obd_export *, struct obd_client_handle *, int);
+
+ int (*o_import_event)(struct obd_device *, struct obd_import *,
+ enum obd_import_event);
+
+ int (*o_notify)(struct obd_device *obd, struct obd_device *watched,
+ enum obd_notify_event ev, void *data);
+
+ int (*o_health_check)(const struct lu_env *env, struct obd_device *);
+ struct obd_uuid *(*o_get_uuid) (struct obd_export *exp);
+
+ /* quota methods */
+ int (*o_quotacheck)(struct obd_device *, struct obd_export *,
+ struct obd_quotactl *);
+ int (*o_quotactl)(struct obd_device *, struct obd_export *,
+ struct obd_quotactl *);
+
+ int (*o_ping)(const struct lu_env *, struct obd_export *exp);
+
+ /* pools methods */
+ int (*o_pool_new)(struct obd_device *obd, char *poolname);
+ int (*o_pool_del)(struct obd_device *obd, char *poolname);
+ int (*o_pool_add)(struct obd_device *obd, char *poolname,
+ char *ostname);
+ int (*o_pool_rem)(struct obd_device *obd, char *poolname,
+ char *ostname);
+ void (*o_getref)(struct obd_device *obd);
+ void (*o_putref)(struct obd_device *obd);
+ /*
+ * NOTE: If adding ops, add another LPROCFS_OBD_OP_INIT() line
+ * to lprocfs_alloc_obd_stats() in obdclass/lprocfs_status.c.
+ * Also, add a wrapper function in include/linux/obd_class.h. */
+};
+
+enum {
+ LUSTRE_OPC_MKDIR = (1 << 0),
+ LUSTRE_OPC_SYMLINK = (1 << 1),
+ LUSTRE_OPC_MKNOD = (1 << 2),
+ LUSTRE_OPC_CREATE = (1 << 3),
+ LUSTRE_OPC_ANY = (1 << 4)
+};
+
+/* lmv structures */
+#define MEA_MAGIC_LAST_CHAR 0xb2221ca1
+#define MEA_MAGIC_ALL_CHARS 0xb222a11c
+#define MEA_MAGIC_HASH_SEGMENT 0xb222a11b
+
+#define MAX_HASH_SIZE_32 0x7fffffffUL
+#define MAX_HASH_SIZE 0x7fffffffffffffffULL
+#define MAX_HASH_HIGHEST_BIT 0x1000000000000000ULL
+
+struct lustre_md {
+ struct mdt_body *body;
+ struct lov_stripe_md *lsm;
+ struct lmv_stripe_md *mea;
+#ifdef CONFIG_FS_POSIX_ACL
+ struct posix_acl *posix_acl;
+#endif
+ struct mdt_remote_perm *remote_perm;
+ struct obd_capa *mds_capa;
+ struct obd_capa *oss_capa;
+};
+
+struct md_open_data {
+ struct obd_client_handle *mod_och;
+ struct ptlrpc_request *mod_open_req;
+ struct ptlrpc_request *mod_close_req;
+ atomic_t mod_refcount;
+};
+
+struct lookup_intent;
+
+struct md_ops {
+ int (*m_getstatus)(struct obd_export *, struct lu_fid *,
+ struct obd_capa **);
+ int (*m_null_inode)(struct obd_export *, const struct lu_fid *);
+ int (*m_find_cbdata)(struct obd_export *, const struct lu_fid *,
+ ldlm_iterator_t, void *);
+ int (*m_close)(struct obd_export *, struct md_op_data *,
+ struct md_open_data *, struct ptlrpc_request **);
+ int (*m_create)(struct obd_export *, struct md_op_data *,
+ const void *, int, int, __u32, __u32, cfs_cap_t,
+ __u64, struct ptlrpc_request **);
+ int (*m_done_writing)(struct obd_export *, struct md_op_data *,
+ struct md_open_data *);
+ int (*m_enqueue)(struct obd_export *, struct ldlm_enqueue_info *,
+ struct lookup_intent *, struct md_op_data *,
+ struct lustre_handle *, void *, int,
+ struct ptlrpc_request **, __u64);
+ int (*m_getattr)(struct obd_export *, struct md_op_data *,
+ struct ptlrpc_request **);
+ int (*m_getattr_name)(struct obd_export *, struct md_op_data *,
+ struct ptlrpc_request **);
+ int (*m_intent_lock)(struct obd_export *, struct md_op_data *,
+ void *, int, struct lookup_intent *, int,
+ struct ptlrpc_request **,
+ ldlm_blocking_callback, __u64);
+ int (*m_link)(struct obd_export *, struct md_op_data *,
+ struct ptlrpc_request **);
+ int (*m_rename)(struct obd_export *, struct md_op_data *,
+ const char *, int, const char *, int,
+ struct ptlrpc_request **);
+ int (*m_is_subdir)(struct obd_export *, const struct lu_fid *,
+ const struct lu_fid *,
+ struct ptlrpc_request **);
+ int (*m_setattr)(struct obd_export *, struct md_op_data *, void *,
+ int, void *, int, struct ptlrpc_request **,
+ struct md_open_data **mod);
+ int (*m_sync)(struct obd_export *, const struct lu_fid *,
+ struct obd_capa *, struct ptlrpc_request **);
+ int (*m_readpage)(struct obd_export *, struct md_op_data *,
+ struct page **, struct ptlrpc_request **);
+
+ int (*m_unlink)(struct obd_export *, struct md_op_data *,
+ struct ptlrpc_request **);
+
+ int (*m_setxattr)(struct obd_export *, const struct lu_fid *,
+ struct obd_capa *, obd_valid, const char *,
+ const char *, int, int, int, __u32,
+ struct ptlrpc_request **);
+
+ int (*m_getxattr)(struct obd_export *, const struct lu_fid *,
+ struct obd_capa *, obd_valid, const char *,
+ const char *, int, int, int,
+ struct ptlrpc_request **);
+
+ int (*m_init_ea_size)(struct obd_export *, int, int, int);
+
+ int (*m_get_lustre_md)(struct obd_export *, struct ptlrpc_request *,
+ struct obd_export *, struct obd_export *,
+ struct lustre_md *);
+
+ int (*m_free_lustre_md)(struct obd_export *, struct lustre_md *);
+
+ int (*m_set_open_replay_data)(struct obd_export *,
+ struct obd_client_handle *,
+ struct ptlrpc_request *);
+ int (*m_clear_open_replay_data)(struct obd_export *,
+ struct obd_client_handle *);
+ int (*m_set_lock_data)(struct obd_export *, __u64 *, void *, __u64 *);
+
+ ldlm_mode_t (*m_lock_match)(struct obd_export *, __u64,
+ const struct lu_fid *, ldlm_type_t,
+ ldlm_policy_data_t *, ldlm_mode_t,
+ struct lustre_handle *);
+
+ int (*m_cancel_unused)(struct obd_export *, const struct lu_fid *,
+ ldlm_policy_data_t *, ldlm_mode_t,
+ ldlm_cancel_flags_t flags, void *opaque);
+ int (*m_renew_capa)(struct obd_export *, struct obd_capa *oc,
+ renew_capa_cb_t cb);
+ int (*m_unpack_capa)(struct obd_export *, struct ptlrpc_request *,
+ const struct req_msg_field *, struct obd_capa **);
+
+ int (*m_get_remote_perm)(struct obd_export *, const struct lu_fid *,
+ struct obd_capa *, __u32,
+ struct ptlrpc_request **);
+
+ int (*m_intent_getattr_async)(struct obd_export *,
+ struct md_enqueue_info *,
+ struct ldlm_enqueue_info *);
+
+ int (*m_revalidate_lock)(struct obd_export *, struct lookup_intent *,
+ struct lu_fid *, __u64 *bits);
+
+ /*
+ * NOTE: If adding ops, add another LPROCFS_MD_OP_INIT() line to
+ * lprocfs_alloc_md_stats() in obdclass/lprocfs_status.c. Also, add a
+ * wrapper function in include/linux/obd_class.h.
+ */
+};
+
+struct lsm_operations {
+ void (*lsm_free)(struct lov_stripe_md *);
+ int (*lsm_destroy)(struct lov_stripe_md *, struct obdo *oa,
+ struct obd_export *md_exp);
+ void (*lsm_stripe_by_index)(struct lov_stripe_md *, int *, obd_off *,
+ obd_off *);
+ void (*lsm_stripe_by_offset)(struct lov_stripe_md *, int *, obd_off *,
+ obd_off *);
+ int (*lsm_lmm_verify)(struct lov_mds_md *lmm, int lmm_bytes,
+ __u16 *stripe_count);
+ int (*lsm_unpackmd)(struct lov_obd *lov, struct lov_stripe_md *lsm,
+ struct lov_mds_md *lmm);
+};
+
+extern const struct lsm_operations lsm_v1_ops;
+extern const struct lsm_operations lsm_v3_ops;
+static inline const struct lsm_operations *lsm_op_find(int magic)
+{
+ switch (magic) {
+ case LOV_MAGIC_V1:
+ return &lsm_v1_ops;
+ case LOV_MAGIC_V3:
+ return &lsm_v3_ops;
+ default:
+ CERROR("Cannot recognize lsm_magic %08x\n", magic);
+ return NULL;
+ }
+}
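+
+/*
+ * Illustrative sketch (not part of this patch): dispatching through the
+ * per-magic operations table returned by lsm_op_find(). The caller must
+ * handle a NULL return for unknown magics; the lsm_magic field name is
+ * assumed here for the sketch:
+ *
+ *	const struct lsm_operations *op = lsm_op_find(lsm->lsm_magic);
+ *
+ *	if (op != NULL)
+ *		op->lsm_free(lsm);
+ */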
+
+/* Requests for obd_extent_calc() */
+#define OBD_CALC_STRIPE_START 1
+#define OBD_CALC_STRIPE_END 2
+
+static inline struct lustre_capa *oinfo_capa(struct obd_info *oinfo)
+{
+ return oinfo->oi_capa;
+}
+
+static inline struct md_open_data *obd_mod_alloc(void)
+{
+ struct md_open_data *mod;
+ OBD_ALLOC_PTR(mod);
+ if (mod == NULL)
+ return NULL;
+ atomic_set(&mod->mod_refcount, 1);
+ return mod;
+}
+
+#define obd_mod_get(mod) atomic_inc(&(mod)->mod_refcount)
+#define obd_mod_put(mod) \
+({ \
+ if (atomic_dec_and_test(&(mod)->mod_refcount)) { \
+ if ((mod)->mod_open_req) \
+ ptlrpc_req_finished((mod)->mod_open_req); \
+ OBD_FREE_PTR(mod); \
+ } \
+})
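+
+/*
+ * Illustrative sketch (not part of this patch): the md_open_data
+ * reference-counting lifecycle. obd_mod_alloc() returns the object with
+ * one reference held; each additional user pairs obd_mod_get() with
+ * obd_mod_put(), and the final put finishes the saved open request and
+ * frees the object:
+ *
+ *	struct md_open_data *mod = obd_mod_alloc();
+ *
+ *	if (mod == NULL)
+ *		return -ENOMEM;
+ *	obd_mod_get(mod);	(second reference, e.g. for close replay)
+ *	...
+ *	obd_mod_put(mod);	(drop the extra reference)
+ *	obd_mod_put(mod);	(last put: frees mod)
+ */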
+
+void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid);
+void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent);
+
+/* return 1 if the client should resend the request */
+static inline int client_should_resend(int resend, struct client_obd *cli)
+{
+ return atomic_read(&cli->cl_resends) ?
+ atomic_read(&cli->cl_resends) > resend : 1;
+}
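+
+/*
+ * Illustrative sketch (not part of this patch): a typical resend loop.
+ * cl_resends == 0 means "retry forever"; otherwise it caps the number of
+ * attempts. Both 'resend' and send_request() below are hypothetical:
+ *
+ *	int resend = 0;
+ *
+ *	do {
+ *		rc = send_request(...);
+ *	} while (rc == -EAGAIN && client_should_resend(resend++, cli));
+ */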
+
+/**
+ * Return the device name for this device
+ *
+ * XXX: lu_device is declared before obd_device, but lu_device contains a
+ * pointer back to its obd_device, so this helper is defined here
+ * instead of in lu_object.h.
+ */
+static inline const char *lu_dev_name(const struct lu_device *lu_dev)
+{
+ return lu_dev->ld_obd->obd_name;
+}
+
+static inline bool filename_is_volatile(const char *name, int namelen, int *idx)
+{
+ const char *start;
+ char *end;
+
+ if (strncmp(name, LUSTRE_VOLATILE_HDR, LUSTRE_VOLATILE_HDR_LEN) != 0)
+ return false;
+
+ /* caller does not care about idx */
+ if (idx == NULL)
+ return true;
+
+ /* Volatile file: the MDT index can be encoded in the name.
+ * The name format is LUSTRE_VOLATILE_HDR:[idx]:
+ * If no MDT is specified, use the standard (hash) way. */
+ if (namelen < LUSTRE_VOLATILE_HDR_LEN + 2)
+ goto bad_format;
+ /* test for no MDT idx case */
+ if ((*(name + LUSTRE_VOLATILE_HDR_LEN) == ':') &&
+ (*(name + LUSTRE_VOLATILE_HDR_LEN + 1) == ':')) {
+ *idx = -1;
+ return true;
+ }
+ /* we have an idx, read it */
+ start = name + LUSTRE_VOLATILE_HDR_LEN + 1;
+ *idx = strtoul(start, &end, 0);
+ /* error cases:
+ * no digit, no trailing :, negative value
+ */
+ if (((*idx == 0) && (end == start)) ||
+ (*end != ':') || (*idx < 0))
+ goto bad_format;
+
+ return true;
+bad_format:
+ /* bad MDT idx format; we cannot return an error to the
+ * caller, so fall back to the hash algorithm */
+ CERROR("Bad volatile file name format: %s\n",
+ name + LUSTRE_VOLATILE_HDR_LEN);
+ return false;
+}
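+
+/*
+ * Illustrative examples (not part of this patch), with <HDR> standing in
+ * for the LUSTRE_VOLATILE_HDR prefix:
+ *
+ *	"<HDR>::name"	-> true,  *idx = -1 (no MDT index specified)
+ *	"<HDR>:12:name"	-> true,  *idx = 12 (strtoul base 0: "012" is octal)
+ *	"<HDR>:x:name"	-> false (bad index format, caller uses hash algo)
+ *	"other_name"	-> false (not a volatile file name)
+ */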
+
+static inline int cli_brw_size(struct obd_device *obd)
+{
+ LASSERT(obd != NULL);
+ return obd->u.cli.cl_max_pages_per_rpc << PAGE_CACHE_SHIFT;
+}
+
+#endif /* __OBD_H */
diff --git a/drivers/staging/lustre/lustre/include/obd_cache.h b/drivers/staging/lustre/lustre/include/obd_cache.h
new file mode 100644
index 000000000000..c8249fbb0d72
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/obd_cache.h
@@ -0,0 +1,39 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _OBD_CACHE_H__
+#define _OBD_CACHE_H__
+
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/obd_cksum.h b/drivers/staging/lustre/lustre/include/obd_cksum.h
new file mode 100644
index 000000000000..5f740f1743ca
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/obd_cksum.h
@@ -0,0 +1,176 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __OBD_CKSUM
+#define __OBD_CKSUM
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+
+static inline unsigned char cksum_obd2cfs(cksum_type_t cksum_type)
+{
+ switch (cksum_type) {
+ case OBD_CKSUM_CRC32:
+ return CFS_HASH_ALG_CRC32;
+ case OBD_CKSUM_ADLER:
+ return CFS_HASH_ALG_ADLER32;
+ case OBD_CKSUM_CRC32C:
+ return CFS_HASH_ALG_CRC32C;
+ default:
+ CERROR("Unknown checksum type (%x)!!!\n", cksum_type);
+ LBUG();
+ }
+ return 0;
+}
+
+/* The OBD_FL_CKSUM_* flags are packed into 5 bits of o_flags, since there
+ * can only be a single checksum type per RPC.
+ *
+ * The OBD_CHECKSUM_* type bits passed in ocd_cksum_types are a 32-bit bitmask
+ * since they need to represent the full range of checksum algorithms that
+ * both the client and server can understand.
+ *
+ * In case of unsupported types/flags we fall back to ADLER
+ * because that has been supported by all clients since 1.8.
+ *
+ * In case multiple algorithms are supported, the best one is used. */
+static inline obd_flag cksum_type_pack(cksum_type_t cksum_type)
+{
+ unsigned int performance = 0, tmp;
+ obd_flag flag = OBD_FL_CKSUM_ADLER;
+
+ if (cksum_type & OBD_CKSUM_CRC32) {
+ tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32));
+ if (tmp > performance) {
+ performance = tmp;
+ flag = OBD_FL_CKSUM_CRC32;
+ }
+ }
+ if (cksum_type & OBD_CKSUM_CRC32C) {
+ tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C));
+ if (tmp > performance) {
+ performance = tmp;
+ flag = OBD_FL_CKSUM_CRC32C;
+ }
+ }
+ if (cksum_type & OBD_CKSUM_ADLER) {
+ tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER));
+ if (tmp > performance) {
+ performance = tmp;
+ flag = OBD_FL_CKSUM_ADLER;
+ }
+ }
+ if (unlikely(cksum_type && !(cksum_type & (OBD_CKSUM_CRC32C |
+ OBD_CKSUM_CRC32 |
+ OBD_CKSUM_ADLER))))
+ CWARN("unknown cksum type %x\n", cksum_type);
+
+ return flag;
+}
+
+static inline cksum_type_t cksum_type_unpack(obd_flag o_flags)
+{
+ switch (o_flags & OBD_FL_CKSUM_ALL) {
+ case OBD_FL_CKSUM_CRC32C:
+ return OBD_CKSUM_CRC32C;
+ case OBD_FL_CKSUM_CRC32:
+ return OBD_CKSUM_CRC32;
+ default:
+ break;
+ }
+
+ return OBD_CKSUM_ADLER;
+}
+
+/* Return a bitmask of the checksum types supported on this system.
+ * ADLER is always included: it has been supported since 1.8, is the
+ * baseline, and does not depend on hardware.
+ * The client uses all locally available algorithms.
+ */
+static inline cksum_type_t cksum_types_supported_client(void)
+{
+ cksum_type_t ret = OBD_CKSUM_ADLER;
+
+ CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n",
+ cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)),
+ cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)),
+ cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)));
+
+ if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) > 0)
+ ret |= OBD_CKSUM_CRC32C;
+ if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) > 0)
+ ret |= OBD_CKSUM_CRC32;
+
+ return ret;
+}
+
+/* The server uses algorithms that perform at 50% or better of Adler's speed */
+static inline cksum_type_t cksum_types_supported_server(void)
+{
+ int base_speed;
+ cksum_type_t ret = OBD_CKSUM_ADLER;
+
+ CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n",
+ cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)),
+ cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)),
+ cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)));
+
+ base_speed = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)) / 2;
+
+ if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) >=
+ base_speed)
+ ret |= OBD_CKSUM_CRC32C;
+ if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) >=
+ base_speed)
+ ret |= OBD_CKSUM_CRC32;
+
+ return ret;
+}
+
+/* Select the best checksum algorithm among those supplied in the cksum_types
+ * input.
+ *
+ * Currently, calling cksum_type_pack() with a mask will return the fastest
+ * checksum type due to its benchmarking at libcfs module load.
+ * Caution is advised, however, since what is fastest on a single client may
+ * not be the fastest or most efficient algorithm on the server. */
+static inline cksum_type_t cksum_type_select(cksum_type_t cksum_types)
+{
+ return cksum_type_unpack(cksum_type_pack(cksum_types));
+}
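+
+/*
+ * Illustrative sketch (not part of this patch): checksum negotiation.
+ * A client would typically intersect the server-advertised mask from the
+ * connect data with its own local capabilities, then pick one type. The
+ * 'ocd' variable below is an assumed struct obd_connect_data pointer:
+ *
+ *	cksum_type_t common = ocd->ocd_cksum_types &
+ *			      cksum_types_supported_client();
+ *	cksum_type_t best = cksum_type_select(common);
+ */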
+
+/* Checksum algorithm names. Must be defined in the same order as the
+ * OBD_CKSUM_* flags. */
+#define DECLARE_CKSUM_NAME char *cksum_name[] = {"crc32", "adler", "crc32c"}
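+
+/*
+ * Illustrative sketch (not part of this patch): since cksum_name[] is
+ * ordered like the OBD_CKSUM_* flag bits, the name of a single selected
+ * type can be looked up from the index of its set bit, assuming those
+ * flags are consecutive single bits starting at bit 0:
+ *
+ *	DECLARE_CKSUM_NAME;
+ *	cksum_type_t type = cksum_type_select(supported);
+ *	const char *name = cksum_name[ffs(type) - 1];
+ */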
+
+#endif /* __OBD_CKSUM */
diff --git a/drivers/staging/lustre/lustre/include/obd_class.h b/drivers/staging/lustre/lustre/include/obd_class.h
new file mode 100644
index 000000000000..de5c5853647f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/obd_class.h
@@ -0,0 +1,2281 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#ifndef __CLASS_OBD_H
+#define __CLASS_OBD_H
+
+
+#include <obd_support.h>
+#include <lustre_import.h>
+#include <lustre_net.h>
+#include <obd.h>
+#include <lustre_lib.h>
+#include <lustre/lustre_idl.h>
+#include <lprocfs_status.h>
+
+#include <linux/obd_class.h>
+
+#define OBD_STATFS_NODELAY 0x0001 /* requests should be sent without delay
+ * and resent to avoid deadlocks */
+#define OBD_STATFS_FROM_CACHE 0x0002 /* the statfs callback should not update
+ * obd_osfs_age */
+#define OBD_STATFS_PTLRPCD 0x0004 /* requests will be sent via ptlrpcd
+ * instead of a specific set. This
+ * means that we cannot rely on the set
+ * interpret routine to be called.
+ * lov_statfs_fini() must thus be called
+ * by the request interpret routine */
+#define OBD_STATFS_FOR_MDT0 0x0008 /* The statfs is only for retrieving
+ * information from MDT0. */
+#define OBD_FL_PUNCH 0x00000001 /* To indicate it is punch operation */
+
+/* OBD Device Declarations */
+extern struct obd_device *obd_devs[MAX_OBD_DEVICES];
+extern rwlock_t obd_dev_lock;
+
+/* OBD Operations Declarations */
+extern struct obd_device *class_conn2obd(struct lustre_handle *);
+extern struct obd_device *class_exp2obd(struct obd_export *);
+extern int class_handle_ioctl(unsigned int cmd, unsigned long arg);
+extern int lustre_get_jobid(char *jobid);
+
+struct lu_device_type;
+
+/* genops.c */
+struct obd_export *class_conn2export(struct lustre_handle *);
+int class_register_type(struct obd_ops *, struct md_ops *,
+ struct lprocfs_vars *, const char *nm,
+ struct lu_device_type *ldt);
+int class_unregister_type(const char *nm);
+
+struct obd_device *class_newdev(const char *type_name, const char *name);
+void class_release_dev(struct obd_device *obd);
+
+int class_name2dev(const char *name);
+struct obd_device *class_name2obd(const char *name);
+int class_uuid2dev(struct obd_uuid *uuid);
+struct obd_device *class_uuid2obd(struct obd_uuid *uuid);
+void class_obd_list(void);
+struct obd_device *class_find_client_obd(struct obd_uuid *tgt_uuid,
+ const char *typ_name,
+ struct obd_uuid *grp_uuid);
+struct obd_device *class_devices_in_group(struct obd_uuid *grp_uuid,
+ int *next);
+struct obd_device *class_num2obd(int num);
+int get_devices_count(void);
+
+int class_notify_sptlrpc_conf(const char *fsname, int namelen);
+
+char *obd_export_nid2str(struct obd_export *exp);
+
+int obd_export_evict_by_nid(struct obd_device *obd, const char *nid);
+int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid);
+int obd_connect_flags2str(char *page, int count, __u64 flags, char *sep);
+
+int obd_zombie_impexp_init(void);
+void obd_zombie_impexp_stop(void);
+void obd_zombie_impexp_cull(void);
+void obd_zombie_barrier(void);
+void obd_exports_barrier(struct obd_device *obd);
+int kuc_len(int payload_len);
+struct kuc_hdr *kuc_ptr(void *p);
+int kuc_ispayload(void *p);
+void *kuc_alloc(int payload_len, int transport, int type);
+void kuc_free(void *p, int payload_len);
+
+struct llog_handle;
+struct llog_rec_hdr;
+typedef int (*llog_cb_t)(const struct lu_env *, struct llog_handle *,
+ struct llog_rec_hdr *, void *);
+/* obd_config.c */
+struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg,
+ const char *new_name);
+int class_process_config(struct lustre_cfg *lcfg);
+int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars,
+ struct lustre_cfg *lcfg, void *data);
+int class_attach(struct lustre_cfg *lcfg);
+int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg);
+int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg);
+int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg);
+struct obd_device *class_incref(struct obd_device *obd,
+ const char *scope, const void *source);
+void class_decref(struct obd_device *obd,
+ const char *scope, const void *source);
+void dump_exports(struct obd_device *obd, int locks);
+int class_config_llog_handler(const struct lu_env *env,
+ struct llog_handle *handle,
+ struct llog_rec_hdr *rec, void *data);
+int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg);
+int class_add_uuid(const char *uuid, __u64 nid);
+
+/* obdecho */
+#ifdef LPROCFS
+extern void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars);
+#else
+static inline void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars)
+{
+ memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+
+#define CFG_F_START 0x01 /* Set when we start updating from a log */
+#define CFG_F_MARKER 0x02 /* We are within a marker */
+#define CFG_F_SKIP 0x04 /* We should ignore this cfg command */
+#define CFG_F_COMPAT146 0x08 /* Allow old-style logs */
+#define CFG_F_EXCLUDE 0x10 /* OST exclusion list */
+
+/* Passed as data param to class_config_parse_llog */
+struct config_llog_instance {
+ char *cfg_obdname;
+ void *cfg_instance;
+ struct super_block *cfg_sb;
+ struct obd_uuid cfg_uuid;
+ llog_cb_t cfg_callback;
+ int cfg_last_idx; /* for partial llog processing */
+ int cfg_flags;
+};
+int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
+ char *name, struct config_llog_instance *cfg);
+int class_config_dump_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
+ char *name, struct config_llog_instance *cfg);
+
+enum {
+ CONFIG_T_CONFIG = 0,
+ CONFIG_T_SPTLRPC = 1,
+ CONFIG_T_RECOVER = 2,
+ CONFIG_T_MAX = 3
+};
+
+/* list of active configuration logs */
+struct config_llog_data {
+ struct ldlm_res_id cld_resid;
+ struct config_llog_instance cld_cfg;
+ struct list_head cld_list_chain;
+ atomic_t cld_refcount;
+ struct config_llog_data *cld_sptlrpc; /* dependent sptlrpc log */
+ struct config_llog_data *cld_recover; /* imperative recover log */
+ struct obd_export *cld_mgcexp;
+ struct mutex cld_lock;
+ int cld_type;
+ unsigned int cld_stopping:1, /* we were told to stop
+ * watching */
+ cld_lostlock:1; /* lock not requeued */
+ char cld_logname[0];
+};
+
+struct lustre_profile {
+ struct list_head lp_list;
+ char *lp_profile;
+ char *lp_dt;
+ char *lp_md;
+};
+
+struct lustre_profile *class_get_profile(const char *prof);
+void class_del_profile(const char *prof);
+void class_del_profiles(void);
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+
+void __class_export_add_lock_ref(struct obd_export *, struct ldlm_lock *);
+void __class_export_del_lock_ref(struct obd_export *, struct ldlm_lock *);
+extern void (*class_export_dump_hook)(struct obd_export *);
+
+#else
+
+#define __class_export_add_lock_ref(exp, lock) do {} while (0)
+#define __class_export_del_lock_ref(exp, lock) do {} while (0)
+
+#endif
+
+#define class_export_rpc_inc(exp) \
+({ \
+ atomic_inc(&(exp)->exp_rpc_count); \
+ CDEBUG(D_INFO, "RPC GETting export %p : new rpc_count %d\n", \
+ (exp), atomic_read(&(exp)->exp_rpc_count)); \
+})
+
+#define class_export_rpc_dec(exp) \
+({ \
+ LASSERT_ATOMIC_POS(&exp->exp_rpc_count); \
+ atomic_dec(&(exp)->exp_rpc_count); \
+ CDEBUG(D_INFO, "RPC PUTting export %p : new rpc_count %d\n", \
+ (exp), atomic_read(&(exp)->exp_rpc_count)); \
+})
+
+#define class_export_lock_get(exp, lock) \
+({ \
+ atomic_inc(&(exp)->exp_locks_count); \
+ __class_export_add_lock_ref(exp, lock); \
+ CDEBUG(D_INFO, "lock GETting export %p : new locks_count %d\n", \
+ (exp), atomic_read(&(exp)->exp_locks_count)); \
+ class_export_get(exp); \
+})
+
+#define class_export_lock_put(exp, lock) \
+({ \
+ LASSERT_ATOMIC_POS(&exp->exp_locks_count); \
+ atomic_dec(&(exp)->exp_locks_count); \
+ __class_export_del_lock_ref(exp, lock); \
+ CDEBUG(D_INFO, "lock PUTting export %p : new locks_count %d\n", \
+ (exp), atomic_read(&(exp)->exp_locks_count)); \
+ class_export_put(exp); \
+})
+
+#define class_export_cb_get(exp) \
+({ \
+ atomic_inc(&(exp)->exp_cb_count); \
+ CDEBUG(D_INFO, "callback GETting export %p : new cb_count %d\n",\
+ (exp), atomic_read(&(exp)->exp_cb_count)); \
+ class_export_get(exp); \
+})
+
+#define class_export_cb_put(exp) \
+({ \
+ LASSERT_ATOMIC_POS(&exp->exp_cb_count); \
+ atomic_dec(&(exp)->exp_cb_count); \
+ CDEBUG(D_INFO, "callback PUTting export %p : new cb_count %d\n",\
+ (exp), atomic_read(&(exp)->exp_cb_count)); \
+ class_export_put(exp); \
+})
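+
+/*
+ * Illustrative sketch (not part of this patch): the cb_get/cb_put pair
+ * keeps an export pinned across an asynchronous callback. The caller
+ * takes a reference when registering the callback and the callback (or
+ * the error path) drops it; register_commit_callback() below is a
+ * hypothetical helper:
+ *
+ *	class_export_cb_get(exp);
+ *	rc = register_commit_callback(req, exp);
+ *	if (rc != 0)
+ *		class_export_cb_put(exp);
+ */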
+
+/* genops.c */
+struct obd_export *class_export_get(struct obd_export *exp);
+void class_export_put(struct obd_export *exp);
+struct obd_export *class_new_export(struct obd_device *obddev,
+ struct obd_uuid *cluuid);
+void class_unlink_export(struct obd_export *exp);
+
+struct obd_import *class_import_get(struct obd_import *);
+void class_import_put(struct obd_import *);
+struct obd_import *class_new_import(struct obd_device *obd);
+void class_destroy_import(struct obd_import *exp);
+
+struct obd_type *class_search_type(const char *name);
+struct obd_type *class_get_type(const char *name);
+void class_put_type(struct obd_type *type);
+int class_connect(struct lustre_handle *conn, struct obd_device *obd,
+ struct obd_uuid *cluuid);
+int class_disconnect(struct obd_export *exp);
+void class_fail_export(struct obd_export *exp);
+int class_connected_export(struct obd_export *exp);
+void class_disconnect_exports(struct obd_device *obddev);
+int class_manual_cleanup(struct obd_device *obd);
+void class_disconnect_stale_exports(struct obd_device *,
+ int (*test_export)(struct obd_export *));
+static inline enum obd_option exp_flags_from_obd(struct obd_device *obd)
+{
+ return ((obd->obd_fail ? OBD_OPT_FAILOVER : 0) |
+ (obd->obd_force ? OBD_OPT_FORCE : 0) |
+ (obd->obd_abort_recovery ? OBD_OPT_ABORT_RECOV : 0) |
+ 0);
+}
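+
+/*
+ * Illustrative sketch (not part of this patch): translating device state
+ * into disconnect options for a teardown path; consumers of these flags
+ * would adjust their cleanup behaviour accordingly:
+ *
+ *	enum obd_option flags = exp_flags_from_obd(obd);
+ *
+ *	if (flags & OBD_OPT_FAILOVER)
+ *		... preserve state needed for recovery ...
+ */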
+
+
+void obdo_cpy_md(struct obdo *dst, struct obdo *src, obd_flag valid);
+void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj);
+void obdo_from_iattr(struct obdo *oa, struct iattr *attr,
+ unsigned int ia_valid);
+void iattr_from_obdo(struct iattr *attr, struct obdo *oa, obd_flag valid);
+void md_from_obdo(struct md_op_data *op_data, struct obdo *oa, obd_flag valid);
+void obdo_from_md(struct obdo *oa, struct md_op_data *op_data,
+ unsigned int valid);
+
+void obdo_cpu_to_le(struct obdo *dobdo, struct obdo *sobdo);
+void obdo_le_to_cpu(struct obdo *dobdo, struct obdo *sobdo);
+
+#define OBT(dev) (dev)->obd_type
+#define OBP(dev, op) (dev)->obd_type->typ_dt_ops->o_ ## op
+#define MDP(dev, op) (dev)->obd_type->typ_md_ops->m_ ## op
+#define CTXTP(ctxt, op) (ctxt)->loc_logops->lop_##op
+
+/* Ensure obd_setup: used for cleanup which must be called
+ * while the obd is stopping */
+#define OBD_CHECK_DEV(obd) \
+do { \
+ if (!(obd)) { \
+ CERROR("NULL device\n"); \
+ RETURN(-ENODEV); \
+ } \
+} while (0)
+
+/* ensure obd_setup and !obd_stopping */
+#define OBD_CHECK_DEV_ACTIVE(obd) \
+do { \
+ OBD_CHECK_DEV(obd); \
+ if (!(obd)->obd_set_up || (obd)->obd_stopping) { \
+ CERROR("Device %d not setup\n", \
+ (obd)->obd_minor); \
+ RETURN(-ENODEV); \
+ } \
+} while (0)
+
+
+#ifdef LPROCFS
+#define OBD_COUNTER_OFFSET(op) \
+ ((offsetof(struct obd_ops, o_ ## op) - \
+ offsetof(struct obd_ops, o_iocontrol)) \
+ / sizeof(((struct obd_ops *)(0))->o_iocontrol))
+
+#define OBD_COUNTER_INCREMENT(obdx, op) \
+ if ((obdx)->obd_stats != NULL) { \
+ unsigned int coffset; \
+ coffset = (unsigned int)((obdx)->obd_cntr_base) + \
+ OBD_COUNTER_OFFSET(op); \
+ LASSERT(coffset < (obdx)->obd_stats->ls_num); \
+ lprocfs_counter_incr((obdx)->obd_stats, coffset); \
+ }
+
+#define EXP_COUNTER_INCREMENT(export, op) \
+ if ((export)->exp_obd->obd_stats != NULL) { \
+ unsigned int coffset; \
+ coffset = (unsigned int)((export)->exp_obd->obd_cntr_base) + \
+ OBD_COUNTER_OFFSET(op); \
+ LASSERT(coffset < (export)->exp_obd->obd_stats->ls_num); \
+ lprocfs_counter_incr((export)->exp_obd->obd_stats, coffset); \
+ if ((export)->exp_nid_stats != NULL && \
+ (export)->exp_nid_stats->nid_stats != NULL) \
+ lprocfs_counter_incr( \
+ (export)->exp_nid_stats->nid_stats, coffset);\
+ }
+
+#define MD_COUNTER_OFFSET(op) \
+ ((offsetof(struct md_ops, m_ ## op) - \
+ offsetof(struct md_ops, m_getstatus)) \
+ / sizeof(((struct md_ops *)(0))->m_getstatus))
+
+#define MD_COUNTER_INCREMENT(obdx, op) \
+ if ((obdx)->md_stats != NULL) { \
+ unsigned int coffset; \
+ coffset = (unsigned int)((obdx)->md_cntr_base) + \
+ MD_COUNTER_OFFSET(op); \
+ LASSERT(coffset < (obdx)->md_stats->ls_num); \
+ lprocfs_counter_incr((obdx)->md_stats, coffset); \
+ }
+
+#define EXP_MD_COUNTER_INCREMENT(export, op) \
+ if ((export)->exp_obd->md_stats != NULL) { \
+ unsigned int coffset; \
+ coffset = (unsigned int)((export)->exp_obd->md_cntr_base) + \
+ MD_COUNTER_OFFSET(op); \
+ LASSERT(coffset < (export)->exp_obd->md_stats->ls_num); \
+ lprocfs_counter_incr((export)->exp_obd->md_stats, coffset); \
+ if ((export)->exp_md_stats != NULL) \
+ lprocfs_counter_incr( \
+ (export)->exp_md_stats, coffset); \
+ }
+
+#else
+#define OBD_COUNTER_OFFSET(op)
+#define OBD_COUNTER_INCREMENT(obd, op)
+#define EXP_COUNTER_INCREMENT(exp, op)
+#define MD_COUNTER_INCREMENT(obd, op)
+#define EXP_MD_COUNTER_INCREMENT(exp, op)
+#endif
+
+static inline int lprocfs_nid_ldlm_stats_init(struct nid_stat *tmp)
+{
+ /* Always add in ldlm_stats */
+ tmp->nid_ldlm_stats =
+ lprocfs_alloc_stats(LDLM_LAST_OPC - LDLM_FIRST_OPC,
+ LPROCFS_STATS_FLAG_NOPERCPU);
+ if (tmp->nid_ldlm_stats == NULL)
+ return -ENOMEM;
+
+ lprocfs_init_ldlm_stats(tmp->nid_ldlm_stats);
+
+ return lprocfs_register_stats(tmp->nid_proc, "ldlm_stats",
+ tmp->nid_ldlm_stats);
+}
+
+#define OBD_CHECK_MD_OP(obd, op, err) \
+do { \
+ if (!OBT(obd) || !MDP((obd), op)) { \
+ if (err) \
+ CERROR("md_" #op ": dev %s/%d no operation\n", \
+ obd->obd_name, obd->obd_minor); \
+ RETURN(err); \
+ } \
+} while (0)
+
+#define EXP_CHECK_MD_OP(exp, op) \
+do { \
+ if ((exp) == NULL) { \
+ CERROR("obd_" #op ": NULL export\n"); \
+ RETURN(-ENODEV); \
+ } \
+ if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) { \
+ CERROR("obd_" #op ": cleaned up obd\n"); \
+ RETURN(-EOPNOTSUPP); \
+ } \
+ if (!OBT((exp)->exp_obd) || !MDP((exp)->exp_obd, op)) { \
+ CERROR("obd_" #op ": dev %s/%d no operation\n", \
+ (exp)->exp_obd->obd_name, \
+ (exp)->exp_obd->obd_minor); \
+ RETURN(-EOPNOTSUPP); \
+ } \
+} while (0)
+
+
+#define OBD_CHECK_DT_OP(obd, op, err) \
+do { \
+ if (!OBT(obd) || !OBP((obd), op)) { \
+ if (err) \
+ CERROR("obd_" #op ": dev %d no operation\n", \
+ obd->obd_minor); \
+ RETURN(err); \
+ } \
+} while (0)
+
+#define EXP_CHECK_DT_OP(exp, op) \
+do { \
+ if ((exp) == NULL) { \
+ CERROR("obd_" #op ": NULL export\n"); \
+ RETURN(-ENODEV); \
+ } \
+ if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) { \
+ CERROR("obd_" #op ": cleaned up obd\n"); \
+ RETURN(-EOPNOTSUPP); \
+ } \
+ if (!OBT((exp)->exp_obd) || !OBP((exp)->exp_obd, op)) { \
+ CERROR("obd_" #op ": dev %d no operation\n", \
+ (exp)->exp_obd->obd_minor); \
+ RETURN(-EOPNOTSUPP); \
+ } \
+} while (0)
+
+#define CTXT_CHECK_OP(ctxt, op, err) \
+do { \
+ if (!OBT(ctxt->loc_obd) || !CTXTP((ctxt), op)) { \
+ if (err) \
+ CERROR("lop_" #op ": dev %d no operation\n", \
+ ctxt->loc_obd->obd_minor); \
+ RETURN(err); \
+ } \
+} while (0)
+
+static inline int class_devno_max(void)
+{
+ return MAX_OBD_DEVICES;
+}
+
+static inline int obd_get_info(const struct lu_env *env,
+ struct obd_export *exp, __u32 keylen,
+ void *key, __u32 *vallen, void *val,
+ struct lov_stripe_md *lsm)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, get_info);
+ EXP_COUNTER_INCREMENT(exp, get_info);
+
+ rc = OBP(exp->exp_obd, get_info)(env, exp, keylen, key, vallen, val,
+ lsm);
+ RETURN(rc);
+}
+
+static inline int obd_set_info_async(const struct lu_env *env,
+ struct obd_export *exp, obd_count keylen,
+ void *key, obd_count vallen, void *val,
+ struct ptlrpc_request_set *set)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, set_info_async);
+ EXP_COUNTER_INCREMENT(exp, set_info_async);
+
+ rc = OBP(exp->exp_obd, set_info_async)(env, exp, keylen, key, vallen,
+ val, set);
+ RETURN(rc);
+}
+
+/*
+ * obd-lu integration.
+ *
+ * Functionality is being moved into new lu_device-based layering, but some
+ * pieces of configuration process are still based on obd devices.
+ *
+ * Specifically, lu_device_type_operations::ldto_device_alloc() methods fully
+ * subsume ->o_setup() methods of obd devices they replace. The same for
+ * lu_device_operations::ldo_process_config() and ->o_process_config(). As a
+ * result, obd_setup() and obd_process_config() branch and call exactly one
+ * of the two.
+ *
+ * Yet neither lu_device_type_operations::ldto_device_fini() nor
+ * lu_device_type_operations::ldto_device_free() fully implement the
+ * functionality of ->o_precleanup() and ->o_cleanup() they override. Hence,
+ * obd_precleanup() and obd_cleanup() call both lu_device and obd operations.
+ */
+
+#define DECLARE_LU_VARS(ldt, d) \
+ struct lu_device_type *ldt; \
+ struct lu_device *d
+
+static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg)
+{
+ int rc;
+ DECLARE_LU_VARS(ldt, d);
+ ENTRY;
+
+ ldt = obd->obd_type->typ_lu;
+ if (ldt != NULL) {
+ struct lu_context session_ctx;
+ struct lu_env env;
+ lu_context_init(&session_ctx, LCT_SESSION);
+ session_ctx.lc_thread = NULL;
+ lu_context_enter(&session_ctx);
+
+ rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+ if (rc == 0) {
+ env.le_ses = &session_ctx;
+ d = ldt->ldt_ops->ldto_device_alloc(&env, ldt, cfg);
+ lu_env_fini(&env);
+ if (!IS_ERR(d)) {
+ obd->obd_lu_dev = d;
+ d->ld_obd = obd;
+ rc = 0;
+ } else
+ rc = PTR_ERR(d);
+ }
+ lu_context_exit(&session_ctx);
+ lu_context_fini(&session_ctx);
+
+ } else {
+ OBD_CHECK_DT_OP(obd, setup, -EOPNOTSUPP);
+ OBD_COUNTER_INCREMENT(obd, setup);
+ rc = OBP(obd, setup)(obd, cfg);
+ }
+ RETURN(rc);
+}
+
+static inline int obd_precleanup(struct obd_device *obd,
+ enum obd_cleanup_stage cleanup_stage)
+{
+ int rc;
+ DECLARE_LU_VARS(ldt, d);
+ ENTRY;
+
+ OBD_CHECK_DEV(obd);
+ ldt = obd->obd_type->typ_lu;
+ d = obd->obd_lu_dev;
+ if (ldt != NULL && d != NULL) {
+ if (cleanup_stage == OBD_CLEANUP_EXPORTS) {
+ struct lu_env env;
+
+ rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+ if (rc == 0) {
+ ldt->ldt_ops->ldto_device_fini(&env, d);
+ lu_env_fini(&env);
+ }
+ }
+ }
+ OBD_CHECK_DT_OP(obd, precleanup, 0);
+ OBD_COUNTER_INCREMENT(obd, precleanup);
+
+ rc = OBP(obd, precleanup)(obd, cleanup_stage);
+ RETURN(rc);
+}
+
+static inline int obd_cleanup(struct obd_device *obd)
+{
+ int rc;
+ DECLARE_LU_VARS(ldt, d);
+ ENTRY;
+
+ OBD_CHECK_DEV(obd);
+
+ ldt = obd->obd_type->typ_lu;
+ d = obd->obd_lu_dev;
+ if (ldt != NULL && d != NULL) {
+ struct lu_env env;
+
+ rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+ if (rc == 0) {
+ ldt->ldt_ops->ldto_device_free(&env, d);
+ lu_env_fini(&env);
+ obd->obd_lu_dev = NULL;
+ }
+ }
+ OBD_CHECK_DT_OP(obd, cleanup, 0);
+ OBD_COUNTER_INCREMENT(obd, cleanup);
+
+ rc = OBP(obd, cleanup)(obd);
+ RETURN(rc);
+}
+
+static inline void obd_cleanup_client_import(struct obd_device *obd)
+{
+ ENTRY;
+
+ /* If we set up but never connected, the
+ * client import will not have been cleaned. */
+ down_write(&obd->u.cli.cl_sem);
+ if (obd->u.cli.cl_import) {
+ struct obd_import *imp;
+ imp = obd->u.cli.cl_import;
+ CDEBUG(D_CONFIG, "%s: client import never connected\n",
+ obd->obd_name);
+ ptlrpc_invalidate_import(imp);
+ if (imp->imp_rq_pool) {
+ ptlrpc_free_rq_pool(imp->imp_rq_pool);
+ imp->imp_rq_pool = NULL;
+ }
+ client_destroy_import(imp);
+ obd->u.cli.cl_import = NULL;
+ }
+ up_write(&obd->u.cli.cl_sem);
+
+ EXIT;
+}
+
+static inline int
+obd_process_config(struct obd_device *obd, int datalen, void *data)
+{
+ int rc;
+ DECLARE_LU_VARS(ldt, d);
+ ENTRY;
+
+ OBD_CHECK_DEV(obd);
+
+ obd->obd_process_conf = 1;
+ ldt = obd->obd_type->typ_lu;
+ d = obd->obd_lu_dev;
+ if (ldt != NULL && d != NULL) {
+ struct lu_env env;
+
+ rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+ if (rc == 0) {
+ rc = d->ld_ops->ldo_process_config(&env, d, data);
+ lu_env_fini(&env);
+ }
+ } else {
+ OBD_CHECK_DT_OP(obd, process_config, -EOPNOTSUPP);
+ rc = OBP(obd, process_config)(obd, datalen, data);
+ }
+ OBD_COUNTER_INCREMENT(obd, process_config);
+ obd->obd_process_conf = 0;
+
+ RETURN(rc);
+}
+
+/* Pack an in-memory MD struct for storage on disk.
+ * Returns +ve size of packed MD (0 for free), or -ve error.
+ *
+ * If @disk_tgt == NULL, MD size is returned (max size if @mem_src == NULL).
+ * If @*disk_tgt != NULL and @mem_src == NULL, @*disk_tgt will be freed.
+ * If @*disk_tgt == NULL, it will be allocated
+ */
+static inline int obd_packmd(struct obd_export *exp,
+ struct lov_mds_md **disk_tgt,
+ struct lov_stripe_md *mem_src)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, packmd);
+ EXP_COUNTER_INCREMENT(exp, packmd);
+
+ rc = OBP(exp->exp_obd, packmd)(exp, disk_tgt, mem_src);
+ RETURN(rc);
+}
+
+static inline int obd_size_diskmd(struct obd_export *exp,
+ struct lov_stripe_md *mem_src)
+{
+ return obd_packmd(exp, NULL, mem_src);
+}
+
+/* helper functions */
+static inline int obd_alloc_diskmd(struct obd_export *exp,
+ struct lov_mds_md **disk_tgt)
+{
+ LASSERT(disk_tgt);
+ LASSERT(*disk_tgt == NULL);
+ return obd_packmd(exp, disk_tgt, NULL);
+}
+
+static inline int obd_free_diskmd(struct obd_export *exp,
+ struct lov_mds_md **disk_tgt)
+{
+ LASSERT(disk_tgt);
+ LASSERT(*disk_tgt);
+ /*
+ * LU-2590: for the caller's convenience, *disk_tgt may be in host
+ * endianness; it needs swabbing to LE if necessary, though only
+ * the lov_mds_md header is needed to figure out how much memory
+ * must be freed.
+ */
+ if ((cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) &&
+ (((*disk_tgt)->lmm_magic == LOV_MAGIC_V1) ||
+ ((*disk_tgt)->lmm_magic == LOV_MAGIC_V3)))
+ lustre_swab_lov_mds_md(*disk_tgt);
+ return obd_packmd(exp, disk_tgt, NULL);
+}
+
+/* Unpack an MD struct from disk to in-memory format.
+ * Returns +ve size of unpacked MD (0 for free), or -ve error.
+ *
+ * If @mem_tgt == NULL, MD size is returned (max size if @disk_src == NULL).
+ * If @*mem_tgt != NULL and @disk_src == NULL, @*mem_tgt will be freed.
+ * If @*mem_tgt == NULL, it will be allocated
+ */
+static inline int obd_unpackmd(struct obd_export *exp,
+ struct lov_stripe_md **mem_tgt,
+ struct lov_mds_md *disk_src,
+ int disk_len)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, unpackmd);
+ EXP_COUNTER_INCREMENT(exp, unpackmd);
+
+ rc = OBP(exp->exp_obd, unpackmd)(exp, mem_tgt, disk_src, disk_len);
+ RETURN(rc);
+}
+
+/* helper functions */
+static inline int obd_alloc_memmd(struct obd_export *exp,
+ struct lov_stripe_md **mem_tgt)
+{
+ LASSERT(mem_tgt);
+ LASSERT(*mem_tgt == NULL);
+ return obd_unpackmd(exp, mem_tgt, NULL, 0);
+}
+
+static inline int obd_free_memmd(struct obd_export *exp,
+ struct lov_stripe_md **mem_tgt)
+{
+ int rc;
+
+ LASSERT(mem_tgt);
+ LASSERT(*mem_tgt);
+ rc = obd_unpackmd(exp, mem_tgt, NULL, 0);
+ *mem_tgt = NULL;
+ return rc;
+}
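+
+/*
+ * Illustrative sketch (not part of this patch): the usual in-memory MD
+ * lifecycle built from the helpers above, letting the unpackmd method
+ * size, allocate and free the stripe MD. 'lmm' and 'lmm_size' below are
+ * hypothetical locals holding the on-disk MD:
+ *
+ *	struct lov_stripe_md *lsm = NULL;
+ *	int rc;
+ *
+ *	rc = obd_unpackmd(exp, &lsm, lmm, lmm_size);   (allocates lsm)
+ *	if (rc < 0)
+ *		return rc;
+ *	... use lsm ...
+ *	obd_free_memmd(exp, &lsm);                     (frees and NULLs lsm)
+ */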
+
+static inline int obd_precreate(struct obd_export *exp)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, precreate);
+ OBD_COUNTER_INCREMENT(exp->exp_obd, precreate);
+
+ rc = OBP(exp->exp_obd, precreate)(exp);
+ RETURN(rc);
+}
+
+static inline int obd_create_async(struct obd_export *exp,
+ struct obd_info *oinfo,
+ struct lov_stripe_md **ea,
+ struct obd_trans_info *oti)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, create_async);
+ EXP_COUNTER_INCREMENT(exp, create_async);
+
+ rc = OBP(exp->exp_obd, create_async)(exp, oinfo, ea, oti);
+ RETURN(rc);
+}
+
+static inline int obd_create(const struct lu_env *env, struct obd_export *exp,
+ struct obdo *obdo, struct lov_stripe_md **ea,
+ struct obd_trans_info *oti)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, create);
+ EXP_COUNTER_INCREMENT(exp, create);
+
+ rc = OBP(exp->exp_obd, create)(env, exp, obdo, ea, oti);
+ RETURN(rc);
+}
+
+static inline int obd_destroy(const struct lu_env *env, struct obd_export *exp,
+ struct obdo *obdo, struct lov_stripe_md *ea,
+ struct obd_trans_info *oti,
+ struct obd_export *md_exp, void *capa)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, destroy);
+ EXP_COUNTER_INCREMENT(exp, destroy);
+
+ rc = OBP(exp->exp_obd, destroy)(env, exp, obdo, ea, oti, md_exp, capa);
+ RETURN(rc);
+}
+
+static inline int obd_getattr(const struct lu_env *env, struct obd_export *exp,
+ struct obd_info *oinfo)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, getattr);
+ EXP_COUNTER_INCREMENT(exp, getattr);
+
+ rc = OBP(exp->exp_obd, getattr)(env, exp, oinfo);
+ RETURN(rc);
+}
+
+static inline int obd_getattr_async(struct obd_export *exp,
+ struct obd_info *oinfo,
+ struct ptlrpc_request_set *set)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, getattr_async);
+ EXP_COUNTER_INCREMENT(exp, getattr_async);
+
+ rc = OBP(exp->exp_obd, getattr_async)(exp, oinfo, set);
+ RETURN(rc);
+}
+
+static inline int obd_setattr(const struct lu_env *env, struct obd_export *exp,
+ struct obd_info *oinfo,
+ struct obd_trans_info *oti)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, setattr);
+ EXP_COUNTER_INCREMENT(exp, setattr);
+
+ rc = OBP(exp->exp_obd, setattr)(env, exp, oinfo, oti);
+ RETURN(rc);
+}
+
+/* This performs all the request set init/wait/destroy actions. */
+static inline int obd_setattr_rqset(struct obd_export *exp,
+ struct obd_info *oinfo,
+ struct obd_trans_info *oti)
+{
+ struct ptlrpc_request_set *set = NULL;
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, setattr_async);
+ EXP_COUNTER_INCREMENT(exp, setattr_async);
+
+ set = ptlrpc_prep_set();
+ if (set == NULL)
+ RETURN(-ENOMEM);
+
+ rc = OBP(exp->exp_obd, setattr_async)(exp, oinfo, oti, set);
+ if (rc == 0)
+ rc = ptlrpc_set_wait(set);
+ ptlrpc_set_destroy(set);
+ RETURN(rc);
+}
+
+/* This adds all the requests into @set if @set != NULL, otherwise
+ all requests are sent asynchronously without waiting for a response. */
+static inline int obd_setattr_async(struct obd_export *exp,
+ struct obd_info *oinfo,
+ struct obd_trans_info *oti,
+ struct ptlrpc_request_set *set)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, setattr_async);
+ EXP_COUNTER_INCREMENT(exp, setattr_async);
+
+ rc = OBP(exp->exp_obd, setattr_async)(exp, oinfo, oti, set);
+ RETURN(rc);
+}
+
+static inline int obd_add_conn(struct obd_import *imp, struct obd_uuid *uuid,
+ int priority)
+{
+ struct obd_device *obd = imp->imp_obd;
+ int rc;
+ ENTRY;
+
+ OBD_CHECK_DEV_ACTIVE(obd);
+ OBD_CHECK_DT_OP(obd, add_conn, -EOPNOTSUPP);
+ OBD_COUNTER_INCREMENT(obd, add_conn);
+
+ rc = OBP(obd, add_conn)(imp, uuid, priority);
+ RETURN(rc);
+}
+
+static inline int obd_del_conn(struct obd_import *imp, struct obd_uuid *uuid)
+{
+ struct obd_device *obd = imp->imp_obd;
+ int rc;
+ ENTRY;
+
+ OBD_CHECK_DEV_ACTIVE(obd);
+ OBD_CHECK_DT_OP(obd, del_conn, -EOPNOTSUPP);
+ OBD_COUNTER_INCREMENT(obd, del_conn);
+
+ rc = OBP(obd, del_conn)(imp, uuid);
+ RETURN(rc);
+}
+
+static inline struct obd_uuid *obd_get_uuid(struct obd_export *exp)
+{
+ struct obd_uuid *uuid;
+ ENTRY;
+
+ OBD_CHECK_DT_OP(exp->exp_obd, get_uuid, NULL);
+ EXP_COUNTER_INCREMENT(exp, get_uuid);
+
+ uuid = OBP(exp->exp_obd, get_uuid)(exp);
+ RETURN(uuid);
+}
+
+/** Create a new \a exp on device \a obd for the uuid \a cluuid
+ * @param exp New export handle
+ * @param d Connect data, supported flags are set, flags also understood
+ * by obd are returned.
+ */
+static inline int obd_connect(const struct lu_env *env,
+ struct obd_export **exp,struct obd_device *obd,
+ struct obd_uuid *cluuid,
+ struct obd_connect_data *data,
+ void *localdata)
+{
+ int rc;
+ __u64 ocf = data ? data->ocd_connect_flags : 0; /* for post-condition
+ * check */
+ ENTRY;
+
+ OBD_CHECK_DEV_ACTIVE(obd);
+ OBD_CHECK_DT_OP(obd, connect, -EOPNOTSUPP);
+ OBD_COUNTER_INCREMENT(obd, connect);
+
+ rc = OBP(obd, connect)(env, exp, obd, cluuid, data, localdata);
+ /* check that only subset is granted */
+ LASSERT(ergo(data != NULL, (data->ocd_connect_flags & ocf) ==
+ data->ocd_connect_flags));
+ RETURN(rc);
+}
+
+static inline int obd_reconnect(const struct lu_env *env,
+ struct obd_export *exp,
+ struct obd_device *obd,
+ struct obd_uuid *cluuid,
+ struct obd_connect_data *d,
+ void *localdata)
+{
+ int rc;
+ __u64 ocf = d ? d->ocd_connect_flags : 0; /* for post-condition
+ * check */
+
+ ENTRY;
+
+ OBD_CHECK_DEV_ACTIVE(obd);
+ OBD_CHECK_DT_OP(obd, reconnect, 0);
+ OBD_COUNTER_INCREMENT(obd, reconnect);
+
+ rc = OBP(obd, reconnect)(env, exp, obd, cluuid, d, localdata);
+ /* check that only subset is granted */
+ LASSERT(ergo(d != NULL,
+ (d->ocd_connect_flags & ocf) == d->ocd_connect_flags));
+ RETURN(rc);
+}
+
+static inline int obd_disconnect(struct obd_export *exp)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, disconnect);
+ EXP_COUNTER_INCREMENT(exp, disconnect);
+
+ rc = OBP(exp->exp_obd, disconnect)(exp);
+ RETURN(rc);
+}
+
+static inline int obd_fid_init(struct obd_device *obd, struct obd_export *exp,
+ enum lu_cli_type type)
+{
+ int rc;
+ ENTRY;
+
+ OBD_CHECK_DT_OP(obd, fid_init, 0);
+ OBD_COUNTER_INCREMENT(obd, fid_init);
+
+ rc = OBP(obd, fid_init)(obd, exp, type);
+ RETURN(rc);
+}
+
+static inline int obd_fid_fini(struct obd_device *obd)
+{
+ int rc;
+ ENTRY;
+
+ OBD_CHECK_DT_OP(obd, fid_fini, 0);
+ OBD_COUNTER_INCREMENT(obd, fid_fini);
+
+ rc = OBP(obd, fid_fini)(obd);
+ RETURN(rc);
+}
+
+static inline int obd_fid_alloc(struct obd_export *exp,
+ struct lu_fid *fid,
+ struct md_op_data *op_data)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, fid_alloc);
+ EXP_COUNTER_INCREMENT(exp, fid_alloc);
+
+ rc = OBP(exp->exp_obd, fid_alloc)(exp, fid, op_data);
+ RETURN(rc);
+}
+
+static inline int obd_ping(const struct lu_env *env, struct obd_export *exp)
+{
+ int rc;
+ ENTRY;
+
+ OBD_CHECK_DT_OP(exp->exp_obd, ping, 0);
+ EXP_COUNTER_INCREMENT(exp, ping);
+
+ rc = OBP(exp->exp_obd, ping)(env, exp);
+ RETURN(rc);
+}
+
+static inline int obd_pool_new(struct obd_device *obd, char *poolname)
+{
+ int rc;
+ ENTRY;
+
+ OBD_CHECK_DT_OP(obd, pool_new, -EOPNOTSUPP);
+ OBD_COUNTER_INCREMENT(obd, pool_new);
+
+ rc = OBP(obd, pool_new)(obd, poolname);
+ RETURN(rc);
+}
+
+static inline int obd_pool_del(struct obd_device *obd, char *poolname)
+{
+ int rc;
+ ENTRY;
+
+ OBD_CHECK_DT_OP(obd, pool_del, -EOPNOTSUPP);
+ OBD_COUNTER_INCREMENT(obd, pool_del);
+
+ rc = OBP(obd, pool_del)(obd, poolname);
+ RETURN(rc);
+}
+
+static inline int obd_pool_add(struct obd_device *obd, char *poolname, char *ostname)
+{
+ int rc;
+ ENTRY;
+
+ OBD_CHECK_DT_OP(obd, pool_add, -EOPNOTSUPP);
+ OBD_COUNTER_INCREMENT(obd, pool_add);
+
+ rc = OBP(obd, pool_add)(obd, poolname, ostname);
+ RETURN(rc);
+}
+
+static inline int obd_pool_rem(struct obd_device *obd, char *poolname, char *ostname)
+{
+ int rc;
+ ENTRY;
+
+ OBD_CHECK_DT_OP(obd, pool_rem, -EOPNOTSUPP);
+ OBD_COUNTER_INCREMENT(obd, pool_rem);
+
+ rc = OBP(obd, pool_rem)(obd, poolname, ostname);
+ RETURN(rc);
+}
+
+static inline void obd_getref(struct obd_device *obd)
+{
+ ENTRY;
+ if (OBT(obd) && OBP(obd, getref)) {
+ OBD_COUNTER_INCREMENT(obd, getref);
+ OBP(obd, getref)(obd);
+ }
+ EXIT;
+}
+
+static inline void obd_putref(struct obd_device *obd)
+{
+ ENTRY;
+ if (OBT(obd) && OBP(obd, putref)) {
+ OBD_COUNTER_INCREMENT(obd, putref);
+ OBP(obd, putref)(obd);
+ }
+ EXIT;
+}
+
+static inline int obd_init_export(struct obd_export *exp)
+{
+ int rc = 0;
+
+ ENTRY;
+ if ((exp)->exp_obd != NULL && OBT((exp)->exp_obd) &&
+ OBP((exp)->exp_obd, init_export))
+ rc = OBP(exp->exp_obd, init_export)(exp);
+ RETURN(rc);
+}
+
+static inline int obd_destroy_export(struct obd_export *exp)
+{
+ ENTRY;
+ if ((exp)->exp_obd != NULL && OBT((exp)->exp_obd) &&
+ OBP((exp)->exp_obd, destroy_export))
+ OBP(exp->exp_obd, destroy_export)(exp);
+ RETURN(0);
+}
+
+static inline int obd_extent_calc(struct obd_export *exp,
+ struct lov_stripe_md *md,
+ int cmd, obd_off *offset)
+{
+ int rc;
+ ENTRY;
+ EXP_CHECK_DT_OP(exp, extent_calc);
+ rc = OBP(exp->exp_obd, extent_calc)(exp, md, cmd, offset);
+ RETURN(rc);
+}
+
+static inline struct dentry *
+obd_lvfs_fid2dentry(struct obd_export *exp, struct ost_id *oi, __u32 gen)
+{
+ struct lvfs_run_ctxt *ctxt = &exp->exp_obd->obd_lvfs_ctxt;
+ LASSERT(exp->exp_obd);
+
+ return ctxt->cb_ops.l_fid2dentry(ostid_id(oi), gen, ostid_seq(oi),
+ exp->exp_obd);
+}
+
+/* @max_age is the oldest time in jiffies that we accept using cached data.
+ * If the cache is older than @max_age we will get new data from the
+ * target. Use a value of "cfs_time_current() + HZ" to guarantee freshness. */
+static inline int obd_statfs_async(struct obd_export *exp,
+ struct obd_info *oinfo,
+ __u64 max_age,
+ struct ptlrpc_request_set *rqset)
+{
+ int rc = 0;
+ struct obd_device *obd;
+ ENTRY;
+
+ if (exp == NULL || exp->exp_obd == NULL)
+ RETURN(-EINVAL);
+
+ obd = exp->exp_obd;
+ OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP);
+ OBD_COUNTER_INCREMENT(obd, statfs);
+
+ CDEBUG(D_SUPER, "%s: osfs %p age "LPU64", max_age "LPU64"\n",
+ obd->obd_name, &obd->obd_osfs, obd->obd_osfs_age, max_age);
+ if (cfs_time_before_64(obd->obd_osfs_age, max_age)) {
+ rc = OBP(obd, statfs_async)(exp, oinfo, max_age, rqset);
+ } else {
+ CDEBUG(D_SUPER,"%s: use %p cache blocks "LPU64"/"LPU64
+ " objects "LPU64"/"LPU64"\n",
+ obd->obd_name, &obd->obd_osfs,
+ obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks,
+ obd->obd_osfs.os_ffree, obd->obd_osfs.os_files);
+ spin_lock(&obd->obd_osfs_lock);
+ memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs));
+ spin_unlock(&obd->obd_osfs_lock);
+ oinfo->oi_flags |= OBD_STATFS_FROM_CACHE;
+ if (oinfo->oi_cb_up)
+ oinfo->oi_cb_up(oinfo, 0);
+ }
+ RETURN(rc);
+}
+
+static inline int obd_statfs_rqset(struct obd_export *exp,
+ struct obd_statfs *osfs, __u64 max_age,
+ __u32 flags)
+{
+ struct ptlrpc_request_set *set = NULL;
+ struct obd_info oinfo = { { { 0 } } };
+ int rc = 0;
+ ENTRY;
+
+ set = ptlrpc_prep_set();
+ if (set == NULL)
+ RETURN(-ENOMEM);
+
+ oinfo.oi_osfs = osfs;
+ oinfo.oi_flags = flags;
+ rc = obd_statfs_async(exp, &oinfo, max_age, set);
+ if (rc == 0)
+ rc = ptlrpc_set_wait(set);
+ ptlrpc_set_destroy(set);
+ RETURN(rc);
+}
+
+/* @max_age is the oldest time in jiffies that we accept using cached data.
+ * If the cache is older than @max_age we will get new data from the
+ * target. Use a value of "cfs_time_current() + HZ" to guarantee freshness. */
+static inline int obd_statfs(const struct lu_env *env, struct obd_export *exp,
+ struct obd_statfs *osfs, __u64 max_age,
+ __u32 flags)
+{
+ int rc = 0;
+ struct obd_device *obd = exp->exp_obd;
+ ENTRY;
+
+ if (obd == NULL)
+ RETURN(-EINVAL);
+
+ OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP);
+ OBD_COUNTER_INCREMENT(obd, statfs);
+
+ CDEBUG(D_SUPER, "osfs "LPU64", max_age "LPU64"\n",
+ obd->obd_osfs_age, max_age);
+ if (cfs_time_before_64(obd->obd_osfs_age, max_age)) {
+ rc = OBP(obd, statfs)(env, exp, osfs, max_age, flags);
+ if (rc == 0) {
+ spin_lock(&obd->obd_osfs_lock);
+ memcpy(&obd->obd_osfs, osfs, sizeof(obd->obd_osfs));
+ obd->obd_osfs_age = cfs_time_current_64();
+ spin_unlock(&obd->obd_osfs_lock);
+ }
+ } else {
+ CDEBUG(D_SUPER, "%s: use %p cache blocks "LPU64"/"LPU64
+ " objects "LPU64"/"LPU64"\n",
+ obd->obd_name, &obd->obd_osfs,
+ obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks,
+ obd->obd_osfs.os_ffree, obd->obd_osfs.os_files);
+ spin_lock(&obd->obd_osfs_lock);
+ memcpy(osfs, &obd->obd_osfs, sizeof(*osfs));
+ spin_unlock(&obd->obd_osfs_lock);
+ }
+ RETURN(rc);
+}
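+
+/*
+ * Illustrative sketch (not part of this patch): a caller that tolerates
+ * slightly stale statfs data versus one that forces a fresh RPC, per the
+ * @max_age comment above:
+ *
+ *	(accept data cached within the last HZ jiffies)
+ *	rc = obd_statfs(env, exp, &osfs, cfs_time_current_64() - HZ, 0);
+ *
+ *	(guarantee freshness: the cache can never be "new enough")
+ *	rc = obd_statfs(env, exp, &osfs, cfs_time_current_64() + HZ, 0);
+ */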
+
+static inline int obd_sync_rqset(struct obd_export *exp, struct obd_info *oinfo,
+ obd_size start, obd_size end)
+{
+ struct ptlrpc_request_set *set = NULL;
+ int rc;
+ ENTRY;
+
+ OBD_CHECK_DT_OP(exp->exp_obd, sync, -EOPNOTSUPP);
+ EXP_COUNTER_INCREMENT(exp, sync);
+
+ set = ptlrpc_prep_set();
+ if (set == NULL)
+ RETURN(-ENOMEM);
+
+ rc = OBP(exp->exp_obd, sync)(NULL, exp, oinfo, start, end, set);
+ if (rc == 0)
+ rc = ptlrpc_set_wait(set);
+ ptlrpc_set_destroy(set);
+ RETURN(rc);
+}
+
+static inline int obd_sync(const struct lu_env *env, struct obd_export *exp,
+ struct obd_info *oinfo, obd_size start, obd_size end,
+ struct ptlrpc_request_set *set)
+{
+ int rc;
+ ENTRY;
+
+ OBD_CHECK_DT_OP(exp->exp_obd, sync, -EOPNOTSUPP);
+ EXP_COUNTER_INCREMENT(exp, sync);
+
+ rc = OBP(exp->exp_obd, sync)(env, exp, oinfo, start, end, set);
+ RETURN(rc);
+}
+
+static inline int obd_punch_rqset(struct obd_export *exp,
+ struct obd_info *oinfo,
+ struct obd_trans_info *oti)
+{
+ struct ptlrpc_request_set *set = NULL;
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, punch);
+ EXP_COUNTER_INCREMENT(exp, punch);
+
+ set = ptlrpc_prep_set();
+ if (set == NULL)
+ RETURN(-ENOMEM);
+
+ rc = OBP(exp->exp_obd, punch)(NULL, exp, oinfo, oti, set);
+ if (rc == 0)
+ rc = ptlrpc_set_wait(set);
+ ptlrpc_set_destroy(set);
+ RETURN(rc);
+}
+
+static inline int obd_punch(const struct lu_env *env, struct obd_export *exp,
+ struct obd_info *oinfo, struct obd_trans_info *oti,
+ struct ptlrpc_request_set *rqset)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, punch);
+ EXP_COUNTER_INCREMENT(exp, punch);
+
+ rc = OBP(exp->exp_obd, punch)(env, exp, oinfo, oti, rqset);
+ RETURN(rc);
+}
+
+static inline int obd_brw(int cmd, struct obd_export *exp,
+ struct obd_info *oinfo, obd_count oa_bufs,
+ struct brw_page *pg, struct obd_trans_info *oti)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, brw);
+ EXP_COUNTER_INCREMENT(exp, brw);
+
+ if (!(cmd & (OBD_BRW_RWMASK | OBD_BRW_CHECK))) {
+ CERROR("obd_brw: cmd must be OBD_BRW_READ, OBD_BRW_WRITE, "
+ "or OBD_BRW_CHECK\n");
+ LBUG();
+ }
+
+ rc = OBP(exp->exp_obd, brw)(cmd, exp, oinfo, oa_bufs, pg, oti);
+ RETURN(rc);
+}
+
+static inline int obd_preprw(const struct lu_env *env, int cmd,
+ struct obd_export *exp, struct obdo *oa,
+ int objcount, struct obd_ioobj *obj,
+ struct niobuf_remote *remote, int *pages,
+ struct niobuf_local *local,
+ struct obd_trans_info *oti,
+ struct lustre_capa *capa)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, preprw);
+ EXP_COUNTER_INCREMENT(exp, preprw);
+
+ rc = OBP(exp->exp_obd, preprw)(env, cmd, exp, oa, objcount, obj, remote,
+ pages, local, oti, capa);
+ RETURN(rc);
+}
+
+static inline int obd_commitrw(const struct lu_env *env, int cmd,
+ struct obd_export *exp, struct obdo *oa,
+ int objcount, struct obd_ioobj *obj,
+ struct niobuf_remote *rnb, int pages,
+ struct niobuf_local *local,
+ struct obd_trans_info *oti, int rc)
+{
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, commitrw);
+ EXP_COUNTER_INCREMENT(exp, commitrw);
+
+ rc = OBP(exp->exp_obd, commitrw)(env, cmd, exp, oa, objcount, obj,
+ rnb, pages, local, oti, rc);
+ RETURN(rc);
+}
+
+static inline int obd_merge_lvb(struct obd_export *exp,
+ struct lov_stripe_md *lsm,
+ struct ost_lvb *lvb, int kms_only)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, merge_lvb);
+ EXP_COUNTER_INCREMENT(exp, merge_lvb);
+
+ rc = OBP(exp->exp_obd, merge_lvb)(exp, lsm, lvb, kms_only);
+ RETURN(rc);
+}
+
+static inline int obd_adjust_kms(struct obd_export *exp,
+ struct lov_stripe_md *lsm, obd_off size,
+ int shrink)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, adjust_kms);
+ EXP_COUNTER_INCREMENT(exp, adjust_kms);
+
+ rc = OBP(exp->exp_obd, adjust_kms)(exp, lsm, size, shrink);
+ RETURN(rc);
+}
+
+static inline int obd_iocontrol(unsigned int cmd, struct obd_export *exp,
+ int len, void *karg, void *uarg)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, iocontrol);
+ EXP_COUNTER_INCREMENT(exp, iocontrol);
+
+ rc = OBP(exp->exp_obd, iocontrol)(cmd, exp, len, karg, uarg);
+ RETURN(rc);
+}
+
+static inline int obd_enqueue_rqset(struct obd_export *exp,
+ struct obd_info *oinfo,
+ struct ldlm_enqueue_info *einfo)
+{
+ struct ptlrpc_request_set *set = NULL;
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, enqueue);
+ EXP_COUNTER_INCREMENT(exp, enqueue);
+
+ set = ptlrpc_prep_set();
+ if (set == NULL)
+ RETURN(-ENOMEM);
+
+ rc = OBP(exp->exp_obd, enqueue)(exp, oinfo, einfo, set);
+ if (rc == 0)
+ rc = ptlrpc_set_wait(set);
+ ptlrpc_set_destroy(set);
+ RETURN(rc);
+}
+
+static inline int obd_enqueue(struct obd_export *exp,
+ struct obd_info *oinfo,
+ struct ldlm_enqueue_info *einfo,
+ struct ptlrpc_request_set *set)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, enqueue);
+ EXP_COUNTER_INCREMENT(exp, enqueue);
+
+ rc = OBP(exp->exp_obd, enqueue)(exp, oinfo, einfo, set);
+ RETURN(rc);
+}
+
+static inline int obd_change_cbdata(struct obd_export *exp,
+ struct lov_stripe_md *lsm,
+ ldlm_iterator_t it, void *data)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, change_cbdata);
+ EXP_COUNTER_INCREMENT(exp, change_cbdata);
+
+ rc = OBP(exp->exp_obd, change_cbdata)(exp, lsm, it, data);
+ RETURN(rc);
+}
+
+static inline int obd_find_cbdata(struct obd_export *exp,
+ struct lov_stripe_md *lsm,
+ ldlm_iterator_t it, void *data)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, find_cbdata);
+ EXP_COUNTER_INCREMENT(exp, find_cbdata);
+
+ rc = OBP(exp->exp_obd, find_cbdata)(exp, lsm, it, data);
+ RETURN(rc);
+}
+
+static inline int obd_cancel(struct obd_export *exp,
+ struct lov_stripe_md *ea, __u32 mode,
+ struct lustre_handle *lockh)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, cancel);
+ EXP_COUNTER_INCREMENT(exp, cancel);
+
+ rc = OBP(exp->exp_obd, cancel)(exp, ea, mode, lockh);
+ RETURN(rc);
+}
+
+static inline int obd_cancel_unused(struct obd_export *exp,
+ struct lov_stripe_md *ea,
+ ldlm_cancel_flags_t flags,
+ void *opaque)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, cancel_unused);
+ EXP_COUNTER_INCREMENT(exp, cancel_unused);
+
+ rc = OBP(exp->exp_obd, cancel_unused)(exp, ea, flags, opaque);
+ RETURN(rc);
+}
+
+static inline int obd_pin(struct obd_export *exp, const struct lu_fid *fid,
+ struct obd_capa *oc, struct obd_client_handle *handle,
+ int flag)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, pin);
+ EXP_COUNTER_INCREMENT(exp, pin);
+
+ rc = OBP(exp->exp_obd, pin)(exp, fid, oc, handle, flag);
+ RETURN(rc);
+}
+
+static inline int obd_unpin(struct obd_export *exp,
+ struct obd_client_handle *handle, int flag)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, unpin);
+ EXP_COUNTER_INCREMENT(exp, unpin);
+
+ rc = OBP(exp->exp_obd, unpin)(exp, handle, flag);
+ RETURN(rc);
+}
+
+
+static inline void obd_import_event(struct obd_device *obd,
+ struct obd_import *imp,
+ enum obd_import_event event)
+{
+ ENTRY;
+ if (!obd) {
+ CERROR("NULL device\n");
+ EXIT;
+ return;
+ }
+ if (obd->obd_set_up && OBP(obd, import_event)) {
+ OBD_COUNTER_INCREMENT(obd, import_event);
+ OBP(obd, import_event)(obd, imp, event);
+ }
+ EXIT;
+}
+
+static inline int obd_llog_connect(struct obd_export *exp,
+ struct llogd_conn_body *body)
+{
+ int rc;
+ ENTRY;
+
+ OBD_CHECK_DT_OP(exp->exp_obd, llog_connect, 0);
+ EXP_COUNTER_INCREMENT(exp, llog_connect);
+
+ rc = OBP(exp->exp_obd, llog_connect)(exp, body);
+ RETURN(rc);
+}
+
+static inline int obd_notify(struct obd_device *obd,
+ struct obd_device *watched,
+ enum obd_notify_event ev,
+ void *data)
+{
+ int rc;
+ ENTRY;
+ OBD_CHECK_DEV(obd);
+
+ /* the check for async_recov is a complete hack - I'm hereby
+ overloading the meaning to also mean "this was called from
+ mds_postsetup". I know that my mds is able to handle notifies
+ by this point, and it needs to get them to execute mds_postrecov. */
+ if (!obd->obd_set_up && !obd->obd_async_recov) {
+ CDEBUG(D_HA, "obd %s not set up\n", obd->obd_name);
+ RETURN(-EINVAL);
+ }
+
+ if (!OBP(obd, notify)) {
+ CDEBUG(D_HA, "obd %s has no notify handler\n", obd->obd_name);
+ RETURN(-ENOSYS);
+ }
+
+ OBD_COUNTER_INCREMENT(obd, notify);
+ rc = OBP(obd, notify)(obd, watched, ev, data);
+ RETURN(rc);
+}
+
+static inline int obd_notify_observer(struct obd_device *observer,
+ struct obd_device *observed,
+ enum obd_notify_event ev,
+ void *data)
+{
+ int rc1;
+ int rc2;
+
+ struct obd_notify_upcall *onu;
+
+ if (observer->obd_observer)
+ rc1 = obd_notify(observer->obd_observer, observed, ev, data);
+ else
+ rc1 = 0;
+ /*
+ * Also, call non-obd listener, if any
+ */
+ onu = &observer->obd_upcall;
+ if (onu->onu_upcall != NULL)
+ rc2 = onu->onu_upcall(observer, observed, ev,
+ onu->onu_owner, NULL);
+ else
+ rc2 = 0;
+
+ return rc1 ? rc1 : rc2;
+}
+
+static inline int obd_quotacheck(struct obd_export *exp,
+ struct obd_quotactl *oqctl)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, quotacheck);
+ EXP_COUNTER_INCREMENT(exp, quotacheck);
+
+ rc = OBP(exp->exp_obd, quotacheck)(exp->exp_obd, exp, oqctl);
+ RETURN(rc);
+}
+
+static inline int obd_quotactl(struct obd_export *exp,
+ struct obd_quotactl *oqctl)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_DT_OP(exp, quotactl);
+ EXP_COUNTER_INCREMENT(exp, quotactl);
+
+ rc = OBP(exp->exp_obd, quotactl)(exp->exp_obd, exp, oqctl);
+ RETURN(rc);
+}
+
+static inline int obd_health_check(const struct lu_env *env,
+ struct obd_device *obd)
+{
+ /* returns: 0 on healthy
+ * >0 on unhealthy + reason code/flag
+ * however the only supported reason == 1 right now;
+ * we will need to define better reasons
+ * or flags in the future.
+ * <0 on error
+ */
+ int rc;
+ ENTRY;
+
+ /* don't use EXP_CHECK_DT_OP, because NULL method is normal here */
+ if (obd == NULL || !OBT(obd)) {
+ CERROR("cleaned up obd\n");
+ RETURN(-EOPNOTSUPP);
+ }
+ if (!obd->obd_set_up || obd->obd_stopping)
+ RETURN(0);
+ if (!OBP(obd, health_check))
+ RETURN(0);
+
+ rc = OBP(obd, health_check)(env, obd);
+ RETURN(rc);
+}
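+
+/*
+ * Hedged usage sketch (hypothetical caller, compiled out): the return
+ * convention above maps naturally onto a reporting helper. Only rc > 0
+ * means "running but unhealthy"; rc == 0 also covers devices that are
+ * not set up or are stopping.
+ */
+#if 0
+static void example_report_health(const struct lu_env *env,
+ struct obd_device *obd)
+{
+ int rc = obd_health_check(env, obd);
+
+ if (rc > 0)
+ CERROR("%s: unhealthy, reason %d\n", obd->obd_name, rc);
+ else if (rc < 0)
+ CERROR("%s: health check failed: rc = %d\n", obd->obd_name, rc);
+ /* rc == 0: healthy (or not set up), nothing to report */
+}
+#endif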
+
+static inline int obd_register_observer(struct obd_device *obd,
+ struct obd_device *observer)
+{
+ ENTRY;
+ OBD_CHECK_DEV(obd);
+ down_write(&obd->obd_observer_link_sem);
+ if (obd->obd_observer && observer) {
+ up_write(&obd->obd_observer_link_sem);
+ RETURN(-EALREADY);
+ }
+ obd->obd_observer = observer;
+ up_write(&obd->obd_observer_link_sem);
+ RETURN(0);
+}
+
+static inline int obd_pin_observer(struct obd_device *obd,
+ struct obd_device **observer)
+{
+ ENTRY;
+ down_read(&obd->obd_observer_link_sem);
+ if (!obd->obd_observer) {
+ *observer = NULL;
+ up_read(&obd->obd_observer_link_sem);
+ RETURN(-ENOENT);
+ }
+ *observer = obd->obd_observer;
+ RETURN(0);
+}
+
+static inline int obd_unpin_observer(struct obd_device *obd)
+{
+ ENTRY;
+ up_read(&obd->obd_observer_link_sem);
+ RETURN(0);
+}
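+
+/*
+ * Hedged usage sketch (hypothetical caller, compiled out):
+ * obd_pin_observer() returns with obd_observer_link_sem held for read,
+ * and obd_unpin_observer() drops it, so the two must always be paired
+ * and the pinned pointer must not be used after the unpin.
+ */
+#if 0
+static void example_poke_observer(struct obd_device *obd,
+ enum obd_notify_event ev)
+{
+ struct obd_device *observer;
+
+ if (obd_pin_observer(obd, &observer) == 0) {
+ /* observer cannot be unregistered while we hold the pin */
+ obd_notify(observer, obd, ev, NULL);
+ obd_unpin_observer(obd);
+ }
+}
+#endif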
+
+#if 0
+static inline int obd_register_page_removal_cb(struct obd_export *exp,
+ obd_page_removal_cb_t cb,
+ obd_pin_extent_cb pin_cb)
+{
+ int rc;
+ ENTRY;
+
+ OBD_CHECK_DT_OP(exp->exp_obd, register_page_removal_cb, 0);
+ OBD_COUNTER_INCREMENT(exp->exp_obd, register_page_removal_cb);
+
+ rc = OBP(exp->exp_obd, register_page_removal_cb)(exp, cb, pin_cb);
+ RETURN(rc);
+}
+
+static inline int obd_unregister_page_removal_cb(struct obd_export *exp,
+ obd_page_removal_cb_t cb)
+{
+ int rc;
+ ENTRY;
+
+ OBD_CHECK_DT_OP(exp->exp_obd, unregister_page_removal_cb, 0);
+ OBD_COUNTER_INCREMENT(exp->exp_obd, unregister_page_removal_cb);
+
+ rc = OBP(exp->exp_obd, unregister_page_removal_cb)(exp, cb);
+ RETURN(rc);
+}
+
+static inline int obd_register_lock_cancel_cb(struct obd_export *exp,
+ obd_lock_cancel_cb cb)
+{
+ int rc;
+ ENTRY;
+
+ OBD_CHECK_DT_OP(exp->exp_obd, register_lock_cancel_cb, 0);
+ OBD_COUNTER_INCREMENT(exp->exp_obd, register_lock_cancel_cb);
+
+ rc = OBP(exp->exp_obd, register_lock_cancel_cb)(exp, cb);
+ RETURN(rc);
+}
+
+static inline int obd_unregister_lock_cancel_cb(struct obd_export *exp,
+ obd_lock_cancel_cb cb)
+{
+ int rc;
+ ENTRY;
+
+ OBD_CHECK_DT_OP(exp->exp_obd, unregister_lock_cancel_cb, 0);
+ OBD_COUNTER_INCREMENT(exp->exp_obd, unregister_lock_cancel_cb);
+
+ rc = OBP(exp->exp_obd, unregister_lock_cancel_cb)(exp, cb);
+ RETURN(rc);
+}
+#endif
+
+/* metadata helpers */
+static inline int md_getstatus(struct obd_export *exp,
+ struct lu_fid *fid, struct obd_capa **pc)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_MD_OP(exp, getstatus);
+ EXP_MD_COUNTER_INCREMENT(exp, getstatus);
+ rc = MDP(exp->exp_obd, getstatus)(exp, fid, pc);
+ RETURN(rc);
+}
+
+static inline int md_getattr(struct obd_export *exp, struct md_op_data *op_data,
+ struct ptlrpc_request **request)
+{
+ int rc;
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, getattr);
+ EXP_MD_COUNTER_INCREMENT(exp, getattr);
+ rc = MDP(exp->exp_obd, getattr)(exp, op_data, request);
+ RETURN(rc);
+}
+
+static inline int md_null_inode(struct obd_export *exp,
+ const struct lu_fid *fid)
+{
+ int rc;
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, null_inode);
+ EXP_MD_COUNTER_INCREMENT(exp, null_inode);
+ rc = MDP(exp->exp_obd, null_inode)(exp, fid);
+ RETURN(rc);
+}
+
+static inline int md_find_cbdata(struct obd_export *exp,
+ const struct lu_fid *fid,
+ ldlm_iterator_t it, void *data)
+{
+ int rc;
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, find_cbdata);
+ EXP_MD_COUNTER_INCREMENT(exp, find_cbdata);
+ rc = MDP(exp->exp_obd, find_cbdata)(exp, fid, it, data);
+ RETURN(rc);
+}
+
+static inline int md_close(struct obd_export *exp, struct md_op_data *op_data,
+ struct md_open_data *mod,
+ struct ptlrpc_request **request)
+{
+ int rc;
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, close);
+ EXP_MD_COUNTER_INCREMENT(exp, close);
+ rc = MDP(exp->exp_obd, close)(exp, op_data, mod, request);
+ RETURN(rc);
+}
+
+static inline int md_create(struct obd_export *exp, struct md_op_data *op_data,
+ const void *data, int datalen, int mode, __u32 uid,
+ __u32 gid, cfs_cap_t cap_effective, __u64 rdev,
+ struct ptlrpc_request **request)
+{
+ int rc;
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, create);
+ EXP_MD_COUNTER_INCREMENT(exp, create);
+ rc = MDP(exp->exp_obd, create)(exp, op_data, data, datalen, mode,
+ uid, gid, cap_effective, rdev, request);
+ RETURN(rc);
+}
+
+static inline int md_done_writing(struct obd_export *exp,
+ struct md_op_data *op_data,
+ struct md_open_data *mod)
+{
+ int rc;
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, done_writing);
+ EXP_MD_COUNTER_INCREMENT(exp, done_writing);
+ rc = MDP(exp->exp_obd, done_writing)(exp, op_data, mod);
+ RETURN(rc);
+}
+
+static inline int md_enqueue(struct obd_export *exp,
+ struct ldlm_enqueue_info *einfo,
+ struct lookup_intent *it,
+ struct md_op_data *op_data,
+ struct lustre_handle *lockh,
+ void *lmm, int lmmsize,
+ struct ptlrpc_request **req,
+ int extra_lock_flags)
+{
+ int rc;
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, enqueue);
+ EXP_MD_COUNTER_INCREMENT(exp, enqueue);
+ rc = MDP(exp->exp_obd, enqueue)(exp, einfo, it, op_data, lockh,
+ lmm, lmmsize, req, extra_lock_flags);
+ RETURN(rc);
+}
+
+static inline int md_getattr_name(struct obd_export *exp,
+ struct md_op_data *op_data,
+ struct ptlrpc_request **request)
+{
+ int rc;
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, getattr_name);
+ EXP_MD_COUNTER_INCREMENT(exp, getattr_name);
+ rc = MDP(exp->exp_obd, getattr_name)(exp, op_data, request);
+ RETURN(rc);
+}
+
+static inline int md_intent_lock(struct obd_export *exp,
+ struct md_op_data *op_data, void *lmm,
+ int lmmsize, struct lookup_intent *it,
+ int lookup_flags, struct ptlrpc_request **reqp,
+ ldlm_blocking_callback cb_blocking,
+ __u64 extra_lock_flags)
+{
+ int rc;
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, intent_lock);
+ EXP_MD_COUNTER_INCREMENT(exp, intent_lock);
+ rc = MDP(exp->exp_obd, intent_lock)(exp, op_data, lmm, lmmsize,
+ it, lookup_flags, reqp, cb_blocking,
+ extra_lock_flags);
+ RETURN(rc);
+}
+
+static inline int md_link(struct obd_export *exp, struct md_op_data *op_data,
+ struct ptlrpc_request **request)
+{
+ int rc;
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, link);
+ EXP_MD_COUNTER_INCREMENT(exp, link);
+ rc = MDP(exp->exp_obd, link)(exp, op_data, request);
+ RETURN(rc);
+}
+
+static inline int md_rename(struct obd_export *exp, struct md_op_data *op_data,
+ const char *old, int oldlen, const char *new,
+ int newlen, struct ptlrpc_request **request)
+{
+ int rc;
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, rename);
+ EXP_MD_COUNTER_INCREMENT(exp, rename);
+ rc = MDP(exp->exp_obd, rename)(exp, op_data, old, oldlen, new,
+ newlen, request);
+ RETURN(rc);
+}
+
+static inline int md_is_subdir(struct obd_export *exp,
+ const struct lu_fid *pfid,
+ const struct lu_fid *cfid,
+ struct ptlrpc_request **request)
+{
+ int rc;
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, is_subdir);
+ EXP_MD_COUNTER_INCREMENT(exp, is_subdir);
+ rc = MDP(exp->exp_obd, is_subdir)(exp, pfid, cfid, request);
+ RETURN(rc);
+}
+
+static inline int md_setattr(struct obd_export *exp, struct md_op_data *op_data,
+ void *ea, int ealen, void *ea2, int ea2len,
+ struct ptlrpc_request **request,
+ struct md_open_data **mod)
+{
+ int rc;
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, setattr);
+ EXP_MD_COUNTER_INCREMENT(exp, setattr);
+ rc = MDP(exp->exp_obd, setattr)(exp, op_data, ea, ealen,
+ ea2, ea2len, request, mod);
+ RETURN(rc);
+}
+
+static inline int md_sync(struct obd_export *exp, const struct lu_fid *fid,
+ struct obd_capa *oc, struct ptlrpc_request **request)
+{
+ int rc;
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, sync);
+ EXP_MD_COUNTER_INCREMENT(exp, sync);
+ rc = MDP(exp->exp_obd, sync)(exp, fid, oc, request);
+ RETURN(rc);
+}
+
+static inline int md_readpage(struct obd_export *exp, struct md_op_data *opdata,
+ struct page **pages,
+ struct ptlrpc_request **request)
+{
+ int rc;
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, readpage);
+ EXP_MD_COUNTER_INCREMENT(exp, readpage);
+ rc = MDP(exp->exp_obd, readpage)(exp, opdata, pages, request);
+ RETURN(rc);
+}
+
+static inline int md_unlink(struct obd_export *exp, struct md_op_data *op_data,
+ struct ptlrpc_request **request)
+{
+ int rc;
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, unlink);
+ EXP_MD_COUNTER_INCREMENT(exp, unlink);
+ rc = MDP(exp->exp_obd, unlink)(exp, op_data, request);
+ RETURN(rc);
+}
+
+static inline int md_get_lustre_md(struct obd_export *exp,
+ struct ptlrpc_request *req,
+ struct obd_export *dt_exp,
+ struct obd_export *md_exp,
+ struct lustre_md *md)
+{
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, get_lustre_md);
+ EXP_MD_COUNTER_INCREMENT(exp, get_lustre_md);
+ RETURN(MDP(exp->exp_obd, get_lustre_md)(exp, req, dt_exp, md_exp, md));
+}
+
+static inline int md_free_lustre_md(struct obd_export *exp,
+ struct lustre_md *md)
+{
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, free_lustre_md);
+ EXP_MD_COUNTER_INCREMENT(exp, free_lustre_md);
+ RETURN(MDP(exp->exp_obd, free_lustre_md)(exp, md));
+}
+
+static inline int md_setxattr(struct obd_export *exp,
+ const struct lu_fid *fid, struct obd_capa *oc,
+ obd_valid valid, const char *name,
+ const char *input, int input_size,
+ int output_size, int flags, __u32 suppgid,
+ struct ptlrpc_request **request)
+{
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, setxattr);
+ EXP_MD_COUNTER_INCREMENT(exp, setxattr);
+ RETURN(MDP(exp->exp_obd, setxattr)(exp, fid, oc, valid, name, input,
+ input_size, output_size, flags,
+ suppgid, request));
+}
+
+static inline int md_getxattr(struct obd_export *exp,
+ const struct lu_fid *fid, struct obd_capa *oc,
+ obd_valid valid, const char *name,
+ const char *input, int input_size,
+ int output_size, int flags,
+ struct ptlrpc_request **request)
+{
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, getxattr);
+ EXP_MD_COUNTER_INCREMENT(exp, getxattr);
+ RETURN(MDP(exp->exp_obd, getxattr)(exp, fid, oc, valid, name, input,
+ input_size, output_size, flags,
+ request));
+}
+
+static inline int md_set_open_replay_data(struct obd_export *exp,
+ struct obd_client_handle *och,
+ struct ptlrpc_request *open_req)
+{
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, set_open_replay_data);
+ EXP_MD_COUNTER_INCREMENT(exp, set_open_replay_data);
+ RETURN(MDP(exp->exp_obd, set_open_replay_data)(exp, och, open_req));
+}
+
+static inline int md_clear_open_replay_data(struct obd_export *exp,
+ struct obd_client_handle *och)
+{
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, clear_open_replay_data);
+ EXP_MD_COUNTER_INCREMENT(exp, clear_open_replay_data);
+ RETURN(MDP(exp->exp_obd, clear_open_replay_data)(exp, och));
+}
+
+static inline int md_set_lock_data(struct obd_export *exp,
+ __u64 *lockh, void *data, __u64 *bits)
+{
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, set_lock_data);
+ EXP_MD_COUNTER_INCREMENT(exp, set_lock_data);
+ RETURN(MDP(exp->exp_obd, set_lock_data)(exp, lockh, data, bits));
+}
+
+static inline int md_cancel_unused(struct obd_export *exp,
+ const struct lu_fid *fid,
+ ldlm_policy_data_t *policy,
+ ldlm_mode_t mode,
+ ldlm_cancel_flags_t flags,
+ void *opaque)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_MD_OP(exp, cancel_unused);
+ EXP_MD_COUNTER_INCREMENT(exp, cancel_unused);
+
+ rc = MDP(exp->exp_obd, cancel_unused)(exp, fid, policy, mode,
+ flags, opaque);
+ RETURN(rc);
+}
+
+static inline ldlm_mode_t md_lock_match(struct obd_export *exp, __u64 flags,
+ const struct lu_fid *fid,
+ ldlm_type_t type,
+ ldlm_policy_data_t *policy,
+ ldlm_mode_t mode,
+ struct lustre_handle *lockh)
+{
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, lock_match);
+ EXP_MD_COUNTER_INCREMENT(exp, lock_match);
+ RETURN(MDP(exp->exp_obd, lock_match)(exp, flags, fid, type,
+ policy, mode, lockh));
+}
+
+static inline int md_init_ea_size(struct obd_export *exp, int easize,
+ int def_asize, int cookiesize)
+{
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, init_ea_size);
+ EXP_MD_COUNTER_INCREMENT(exp, init_ea_size);
+ RETURN(MDP(exp->exp_obd, init_ea_size)(exp, easize, def_asize,
+ cookiesize));
+}
+
+static inline int md_get_remote_perm(struct obd_export *exp,
+ const struct lu_fid *fid,
+ struct obd_capa *oc, __u32 suppgid,
+ struct ptlrpc_request **request)
+{
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, get_remote_perm);
+ EXP_MD_COUNTER_INCREMENT(exp, get_remote_perm);
+ RETURN(MDP(exp->exp_obd, get_remote_perm)(exp, fid, oc, suppgid,
+ request));
+}
+
+static inline int md_renew_capa(struct obd_export *exp, struct obd_capa *ocapa,
+ renew_capa_cb_t cb)
+{
+ int rc;
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, renew_capa);
+ EXP_MD_COUNTER_INCREMENT(exp, renew_capa);
+ rc = MDP(exp->exp_obd, renew_capa)(exp, ocapa, cb);
+ RETURN(rc);
+}
+
+static inline int md_unpack_capa(struct obd_export *exp,
+ struct ptlrpc_request *req,
+ const struct req_msg_field *field,
+ struct obd_capa **oc)
+{
+ int rc;
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, unpack_capa);
+ EXP_MD_COUNTER_INCREMENT(exp, unpack_capa);
+ rc = MDP(exp->exp_obd, unpack_capa)(exp, req, field, oc);
+ RETURN(rc);
+}
+
+static inline int md_intent_getattr_async(struct obd_export *exp,
+ struct md_enqueue_info *minfo,
+ struct ldlm_enqueue_info *einfo)
+{
+ int rc;
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, intent_getattr_async);
+ EXP_MD_COUNTER_INCREMENT(exp, intent_getattr_async);
+ rc = MDP(exp->exp_obd, intent_getattr_async)(exp, minfo, einfo);
+ RETURN(rc);
+}
+
+static inline int md_revalidate_lock(struct obd_export *exp,
+ struct lookup_intent *it,
+ struct lu_fid *fid, __u64 *bits)
+{
+ int rc;
+ ENTRY;
+ EXP_CHECK_MD_OP(exp, revalidate_lock);
+ EXP_MD_COUNTER_INCREMENT(exp, revalidate_lock);
+ rc = MDP(exp->exp_obd, revalidate_lock)(exp, it, fid, bits);
+ RETURN(rc);
+}
+
+/* OBD Metadata Support */
+
+extern int obd_init_caches(void);
+extern void obd_cleanup_caches(void);
+
+/* support routines */
+extern struct kmem_cache *obdo_cachep;
+
+#define OBDO_ALLOC(ptr) \
+do { \
+ OBD_SLAB_ALLOC_PTR_GFP((ptr), obdo_cachep, __GFP_IO); \
+} while (0)
+
+#define OBDO_FREE(ptr) \
+do { \
+ OBD_SLAB_FREE_PTR((ptr), obdo_cachep); \
+} while (0)
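+
+/*
+ * Hedged usage sketch (compiled out): obdo objects always come from
+ * obdo_cachep through the wrappers above and go back through
+ * OBDO_FREE, which also poisons the freed pointer via the slab
+ * helpers.
+ */
+#if 0
+static int example_with_obdo(void)
+{
+ struct obdo *oa;
+
+ OBDO_ALLOC(oa);
+ if (oa == NULL)
+ return -ENOMEM;
+ /* ... fill in and use the obdo ... */
+ OBDO_FREE(oa);
+ return 0;
+}
+#endif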
+
+static inline void obdo2fid(struct obdo *oa, struct lu_fid *fid)
+{
+ /* something here */
+}
+
+static inline void fid2obdo(struct lu_fid *fid, struct obdo *oa)
+{
+ /* something here */
+}
+
+typedef int (*register_lwp_cb)(void *data);
+
+struct lwp_register_item {
+ struct obd_export **lri_exp;
+ register_lwp_cb lri_cb_func;
+ void *lri_cb_data;
+ struct list_head lri_list;
+ char lri_name[MTI_NAME_MAXLEN];
+};
+
+/* I'm as embarrassed about this as you are.
+ *
+ * <shaver> // XXX do not look into _superhack with remaining eye
+ * <shaver> // XXX if this were any uglier, I'd get my own show on MTV */
+extern int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c);
+
+/* obd_mount.c */
+
+/* sysctl.c */
+extern void obd_sysctl_init(void);
+extern void obd_sysctl_clean(void);
+
+/* uuid.c */
+typedef __u8 class_uuid_t[16];
+void class_uuid_unparse(class_uuid_t in, struct obd_uuid *out);
+
+/* lustre_peer.c */
+int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index);
+int class_add_uuid(const char *uuid, __u64 nid);
+int class_del_uuid(const char *uuid);
+int class_check_uuid(struct obd_uuid *uuid, __u64 nid);
+void class_init_uuidlist(void);
+void class_exit_uuidlist(void);
+
+/* mea.c */
+int mea_name2idx(struct lmv_stripe_md *mea, const char *name, int namelen);
+int raw_name2idx(int hashtype, int count, const char *name, int namelen);
+
+/* prng.c */
+#define ll_generate_random_uuid(uuid_out) cfs_get_random_bytes(uuid_out, sizeof(class_uuid_t))
+
+#endif /* __LINUX_OBD_CLASS_H */
diff --git a/drivers/staging/lustre/lustre/include/obd_lov.h b/drivers/staging/lustre/lustre/include/obd_lov.h
new file mode 100644
index 000000000000..d82f3341d0a8
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/obd_lov.h
@@ -0,0 +1,126 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _OBD_LOV_H__
+#define _OBD_LOV_H__
+
+#define LOV_DEFAULT_STRIPE_SIZE (1 << LNET_MTU_BITS)
+
+static inline int lov_stripe_md_size(__u16 stripes)
+{
+ return sizeof(struct lov_stripe_md) + stripes*sizeof(struct lov_oinfo*);
+}
+
+static inline __u32 lov_mds_md_size(__u16 stripes, __u32 lmm_magic)
+{
+ if (lmm_magic == LOV_MAGIC_V3)
+ return sizeof(struct lov_mds_md_v3) +
+ stripes * sizeof(struct lov_ost_data_v1);
+ else
+ return sizeof(struct lov_mds_md_v1) +
+ stripes * sizeof(struct lov_ost_data_v1);
+}
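+
+/* Worked example, assuming the usual v1 layout: with LOV_MAGIC_V1 and
+ * 4 stripes, lov_mds_md_size() returns sizeof(struct lov_mds_md_v1) +
+ * 4 * sizeof(struct lov_ost_data_v1); lov_stripe_md_size(4) is the
+ * in-memory analogue, appending 4 lov_oinfo pointers instead. */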
+
+struct lov_version_size {
+ __u32 lvs_magic;
+ size_t lvs_lmm_size;
+ size_t lvs_lod_size;
+};
+
+static inline __u32 lov_mds_md_stripecnt(int ea_size, __u32 lmm_magic)
+{
+ static const struct lov_version_size lmm_ver_size[] = {
+ { .lvs_magic = LOV_MAGIC_V3,
+ .lvs_lmm_size = sizeof(struct lov_mds_md_v3),
+ .lvs_lod_size = sizeof(struct lov_ost_data_v1) },
+ { .lvs_magic = LOV_MAGIC_V1,
+ .lvs_lmm_size = sizeof(struct lov_mds_md_v1),
+ .lvs_lod_size = sizeof(struct lov_ost_data_v1)} };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(lmm_ver_size); i++) {
+ if (lmm_magic == lmm_ver_size[i].lvs_magic) {
+ if (ea_size <= lmm_ver_size[i].lvs_lmm_size)
+ return 0;
+ return (ea_size - lmm_ver_size[i].lvs_lmm_size) /
+ lmm_ver_size[i].lvs_lod_size;
+ }
+ }
+
+ /* Invalid LOV magic, so no stripes could fit */
+ return 0;
+}
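+
+/* lov_mds_md_stripecnt() is the inverse of lov_mds_md_size(): given an
+ * EA buffer size it returns how many stripes fit, so for any stripe
+ * count n and supported magic m,
+ * lov_mds_md_stripecnt(lov_mds_md_size(n, m), m) == n. */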
+
+/* lov_do_div64(a, b) returns a % b, and a = a / b.
+ * The 32-bit code is LOV-specific due to knowing about stripe limits in
+ * order to reduce the divisor to a 32-bit number. If the divisor is
+ * already a 32-bit value the compiler handles this directly. */
+#if BITS_PER_LONG > 32
+# define lov_do_div64(n,base) ({ \
+ uint64_t __base = (base); \
+ uint64_t __rem; \
+ __rem = ((uint64_t)(n)) % __base; \
+ (n) = ((uint64_t)(n)) / __base; \
+ __rem; \
+ })
+#else
+# define lov_do_div64(n,base) ({ \
+ uint64_t __rem; \
+ if ((sizeof(base) > 4) && (((base) & 0xffffffff00000000ULL) != 0)) { \
+ int __remainder; \
+ LASSERTF(!((base) & (LOV_MIN_STRIPE_SIZE - 1)), "64 bit lov " \
+ "division %llu / %llu\n", (n), (uint64_t)(base)); \
+ __remainder = (n) & (LOV_MIN_STRIPE_SIZE - 1); \
+ (n) >>= LOV_MIN_STRIPE_BITS; \
+ __rem = do_div(n, (base) >> LOV_MIN_STRIPE_BITS); \
+ __rem <<= LOV_MIN_STRIPE_BITS; \
+ __rem += __remainder; \
+ } else { \
+ __rem = do_div(n, base); \
+ } \
+ __rem; \
+ })
+#endif
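+
+/*
+ * Hedged usage sketch (compiled out): lov_do_div64() divides in place
+ * and returns the remainder, the usual pattern for mapping a file
+ * offset to a (stripe number, in-stripe offset) pair.
+ */
+#if 0
+static void example_stripe_math(obd_off offset, __u64 stripe_size,
+ __u64 stripe_count)
+{
+ __u64 off = offset;
+ __u64 in_stripe, stripe_no;
+
+ in_stripe = lov_do_div64(off, stripe_size); /* off /= stripe_size */
+ stripe_no = lov_do_div64(off, stripe_count); /* off /= stripe_count */
+ /* stripe_no indexes the stripe, in_stripe is the offset within it */
+}
+#endif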
+
+#define IOC_LOV_TYPE 'g'
+#define IOC_LOV_MIN_NR 50
+#define IOC_LOV_SET_OSC_ACTIVE _IOWR('g', 50, long)
+#define IOC_LOV_MAX_NR 50
+
+#define QOS_DEFAULT_THRESHOLD 10 /* MB */
+#define QOS_DEFAULT_MAXAGE 5 /* Seconds */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/obd_ost.h b/drivers/staging/lustre/lustre/include/obd_ost.h
new file mode 100644
index 000000000000..af89843c312b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/obd_ost.h
@@ -0,0 +1,96 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/obd_ost.h
+ *
+ * Data structures for object storage targets and client: OST & OSC's
+ *
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_OST_H
+#define _LUSTRE_OST_H
+
+#include <obd_class.h>
+
+struct osc_brw_async_args {
+ struct obdo *aa_oa;
+ int aa_requested_nob;
+ int aa_nio_count;
+ obd_count aa_page_count;
+ int aa_resends;
+ struct brw_page **aa_ppga;
+ struct client_obd *aa_cli;
+ struct list_head aa_oaps;
+ struct list_head aa_exts;
+ struct obd_capa *aa_ocapa;
+ struct cl_req *aa_clerq;
+};
+
+#define osc_grant_args osc_brw_async_args
+struct osc_async_args {
+ struct obd_info *aa_oi;
+};
+
+struct osc_setattr_args {
+ struct obdo *sa_oa;
+ obd_enqueue_update_f sa_upcall;
+ void *sa_cookie;
+};
+
+struct osc_fsync_args {
+ struct obd_info *fa_oi;
+ obd_enqueue_update_f fa_upcall;
+ void *fa_cookie;
+};
+
+struct osc_enqueue_args {
+ struct obd_export *oa_exp;
+ __u64 *oa_flags;
+ obd_enqueue_update_f oa_upcall;
+ void *oa_cookie;
+ struct ost_lvb *oa_lvb;
+ struct lustre_handle *oa_lockh;
+ struct ldlm_enqueue_info *oa_ei;
+ unsigned int oa_agl:1;
+};
+
+#if 0
+int osc_extent_blocking_cb(struct ldlm_lock *lock,
+ struct ldlm_lock_desc *new, void *data,
+ int flag);
+#endif
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/obd_support.h b/drivers/staging/lustre/lustre/include/obd_support.h
new file mode 100644
index 000000000000..b5d40afc3599
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/obd_support.h
@@ -0,0 +1,851 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _OBD_SUPPORT
+#define _OBD_SUPPORT
+
+#include <linux/libcfs/libcfs.h>
+#include <lvfs.h>
+#include <lprocfs_status.h>
+
+#include <linux/obd_support.h>
+
+/* global variables */
+extern struct lprocfs_stats *obd_memory;
+enum {
+ OBD_MEMORY_STAT = 0,
+ OBD_MEMORY_PAGES_STAT = 1,
+ OBD_STATS_NUM,
+};
+
+extern unsigned int obd_debug_peer_on_timeout;
+extern unsigned int obd_dump_on_timeout;
+extern unsigned int obd_dump_on_eviction;
+/* obd_timeout should only be used for recovery, not for
+ networking / disk / timings affected by load (use Adaptive Timeouts) */
+extern unsigned int obd_timeout; /* seconds */
+extern unsigned int ldlm_timeout; /* seconds */
+extern unsigned int obd_timeout_set;
+extern unsigned int ldlm_timeout_set;
+extern unsigned int at_min;
+extern unsigned int at_max;
+extern unsigned int at_history;
+extern int at_early_margin;
+extern int at_extra;
+extern unsigned int obd_sync_filter;
+extern unsigned int obd_max_dirty_pages;
+extern atomic_t obd_dirty_pages;
+extern atomic_t obd_dirty_transit_pages;
+extern unsigned int obd_alloc_fail_rate;
+extern char obd_jobid_var[];
+
+/* lvfs.c */
+int obd_alloc_fail(const void *ptr, const char *name, const char *type,
+ size_t size, const char *file, int line);
+
+/* Some hash init argument constants */
+#define HASH_POOLS_BKT_BITS 3
+#define HASH_POOLS_CUR_BITS 3
+#define HASH_POOLS_MAX_BITS 7
+#define HASH_UUID_BKT_BITS 5
+#define HASH_UUID_CUR_BITS 7
+#define HASH_UUID_MAX_BITS 12
+#define HASH_NID_BKT_BITS 5
+#define HASH_NID_CUR_BITS 7
+#define HASH_NID_MAX_BITS 12
+#define HASH_NID_STATS_BKT_BITS 5
+#define HASH_NID_STATS_CUR_BITS 7
+#define HASH_NID_STATS_MAX_BITS 12
+#define HASH_LQE_BKT_BITS 5
+#define HASH_LQE_CUR_BITS 7
+#define HASH_LQE_MAX_BITS 12
+#define HASH_CONN_BKT_BITS 5
+#define HASH_CONN_CUR_BITS 5
+#define HASH_CONN_MAX_BITS 15
+#define HASH_EXP_LOCK_BKT_BITS 5
+#define HASH_EXP_LOCK_CUR_BITS 7
+#define HASH_EXP_LOCK_MAX_BITS 16
+#define HASH_CL_ENV_BKT_BITS 5
+#define HASH_CL_ENV_BITS 10
+#define HASH_JOB_STATS_BKT_BITS 5
+#define HASH_JOB_STATS_CUR_BITS 7
+#define HASH_JOB_STATS_MAX_BITS 12
+
+/* Timeout definitions */
+#define OBD_TIMEOUT_DEFAULT 100
+#define LDLM_TIMEOUT_DEFAULT 20
+#define MDS_LDLM_TIMEOUT_DEFAULT 6
+/* Time to wait for all clients to reconnect during recovery (hard limit) */
+#define OBD_RECOVERY_TIME_HARD (obd_timeout * 9)
+/* Time to wait for all clients to reconnect during recovery (soft limit) */
+/* Should be very conservative; must catch the first reconnect after reboot */
+#define OBD_RECOVERY_TIME_SOFT (obd_timeout * 3)
+/* Change recovery-small 26b time if you change this */
+#define PING_INTERVAL max(obd_timeout / 4, 1U)
+/* a bit more than maximal journal commit time in seconds */
+#define PING_INTERVAL_SHORT min(PING_INTERVAL, 7U)
+/* Client may skip 1 ping; we must wait at least 2.5 intervals. But for
+ * multiple failover targets the client only pings one server at a time,
+ * and pings can be lost on a loaded network. Since eviction has serious
+ * consequences, and there's no urgent need to evict a client just because
+ * it's idle, we should be very conservative here. */
+#define PING_EVICT_TIMEOUT (PING_INTERVAL * 6)
+#define DISK_TIMEOUT 50 /* Beyond this we warn about disk speed */
+#define CONNECTION_SWITCH_MIN 5U /* Connection switching rate limiter */
+/* Max connect interval for nonresponsive servers; ~50s to avoid building up
+ * connect requests in the LND queues, but within obd_timeout so we don't
+ * miss the recovery window */
+#define CONNECTION_SWITCH_MAX min(50U, max(CONNECTION_SWITCH_MIN, obd_timeout))
+#define CONNECTION_SWITCH_INC 5 /* Connection timeout backoff */
+/* In general this should be low to have quick detection of a system
+ running on a backup server. (If it's too low, import_select_connection
+ will increase the timeout anyhow.) */
+#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN, obd_timeout / 20)
+/* The max delay between connects is SWITCH_MAX + SWITCH_INC + INITIAL */
+#define RECONNECT_DELAY_MAX (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC + \
+ INITIAL_CONNECT_TIMEOUT)
+/* The min time a target should wait for clients to reconnect in recovery */
+#define OBD_RECOVERY_TIME_MIN (2*RECONNECT_DELAY_MAX)
+#define OBD_IR_FACTOR_MIN 1
+#define OBD_IR_FACTOR_MAX 10
+#define OBD_IR_FACTOR_DEFAULT (OBD_IR_FACTOR_MAX/2)
+/* default timeout for the MGS to become IR_FULL */
+#define OBD_IR_MGS_TIMEOUT (4*obd_timeout)
+#define LONG_UNLINK 300 /* Unlink should happen before now */
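+
+/* Worked example with the default obd_timeout of 100s:
+ * PING_INTERVAL = 25s, PING_EVICT_TIMEOUT = 150s,
+ * CONNECTION_SWITCH_MAX = 50s, INITIAL_CONNECT_TIMEOUT = 5s,
+ * RECONNECT_DELAY_MAX = 50 + 5 + 5 = 60s, and therefore
+ * OBD_RECOVERY_TIME_MIN = 120s. */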
+
+/**
+ * Grant shrink interval: if the client has been "idle" for longer than
+ * this interval, the ll_grant thread returns the granted space to the
+ * filter.
+ */
+#define GRANT_SHRINK_INTERVAL 1200 /* 20 minutes */
+
+#define OBD_FAIL_MDS 0x100
+#define OBD_FAIL_MDS_HANDLE_UNPACK 0x101
+#define OBD_FAIL_MDS_GETATTR_NET 0x102
+#define OBD_FAIL_MDS_GETATTR_PACK 0x103
+#define OBD_FAIL_MDS_READPAGE_NET 0x104
+#define OBD_FAIL_MDS_READPAGE_PACK 0x105
+#define OBD_FAIL_MDS_SENDPAGE 0x106
+#define OBD_FAIL_MDS_REINT_NET 0x107
+#define OBD_FAIL_MDS_REINT_UNPACK 0x108
+#define OBD_FAIL_MDS_REINT_SETATTR 0x109
+#define OBD_FAIL_MDS_REINT_SETATTR_WRITE 0x10a
+#define OBD_FAIL_MDS_REINT_CREATE 0x10b
+#define OBD_FAIL_MDS_REINT_CREATE_WRITE 0x10c
+#define OBD_FAIL_MDS_REINT_UNLINK 0x10d
+#define OBD_FAIL_MDS_REINT_UNLINK_WRITE 0x10e
+#define OBD_FAIL_MDS_REINT_LINK 0x10f
+#define OBD_FAIL_MDS_REINT_LINK_WRITE 0x110
+#define OBD_FAIL_MDS_REINT_RENAME 0x111
+#define OBD_FAIL_MDS_REINT_RENAME_WRITE 0x112
+#define OBD_FAIL_MDS_OPEN_NET 0x113
+#define OBD_FAIL_MDS_OPEN_PACK 0x114
+#define OBD_FAIL_MDS_CLOSE_NET 0x115
+#define OBD_FAIL_MDS_CLOSE_PACK 0x116
+#define OBD_FAIL_MDS_CONNECT_NET 0x117
+#define OBD_FAIL_MDS_CONNECT_PACK 0x118
+#define OBD_FAIL_MDS_REINT_NET_REP 0x119
+#define OBD_FAIL_MDS_DISCONNECT_NET 0x11a
+#define OBD_FAIL_MDS_GETSTATUS_NET 0x11b
+#define OBD_FAIL_MDS_GETSTATUS_PACK 0x11c
+#define OBD_FAIL_MDS_STATFS_PACK 0x11d
+#define OBD_FAIL_MDS_STATFS_NET 0x11e
+#define OBD_FAIL_MDS_GETATTR_NAME_NET 0x11f
+#define OBD_FAIL_MDS_PIN_NET 0x120
+#define OBD_FAIL_MDS_UNPIN_NET 0x121
+#define OBD_FAIL_MDS_ALL_REPLY_NET 0x122
+#define OBD_FAIL_MDS_ALL_REQUEST_NET 0x123
+#define OBD_FAIL_MDS_SYNC_NET 0x124
+#define OBD_FAIL_MDS_SYNC_PACK 0x125
+#define OBD_FAIL_MDS_DONE_WRITING_NET 0x126
+#define OBD_FAIL_MDS_DONE_WRITING_PACK 0x127
+#define OBD_FAIL_MDS_ALLOC_OBDO 0x128
+#define OBD_FAIL_MDS_PAUSE_OPEN 0x129
+#define OBD_FAIL_MDS_STATFS_LCW_SLEEP 0x12a
+#define OBD_FAIL_MDS_OPEN_CREATE 0x12b
+#define OBD_FAIL_MDS_OST_SETATTR 0x12c
+#define OBD_FAIL_MDS_QUOTACHECK_NET 0x12d
+#define OBD_FAIL_MDS_QUOTACTL_NET 0x12e
+#define OBD_FAIL_MDS_CLIENT_ADD 0x12f
+#define OBD_FAIL_MDS_GETXATTR_NET 0x130
+#define OBD_FAIL_MDS_GETXATTR_PACK 0x131
+#define OBD_FAIL_MDS_SETXATTR_NET 0x132
+#define OBD_FAIL_MDS_SETXATTR 0x133
+#define OBD_FAIL_MDS_SETXATTR_WRITE 0x134
+#define OBD_FAIL_MDS_FS_SETUP 0x135
+#define OBD_FAIL_MDS_RESEND 0x136
+#define OBD_FAIL_MDS_LLOG_CREATE_FAILED 0x137
+#define OBD_FAIL_MDS_LOV_SYNC_RACE 0x138
+#define OBD_FAIL_MDS_OSC_PRECREATE 0x139
+#define OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT 0x13a
+#define OBD_FAIL_MDS_CLOSE_NET_REP 0x13b
+#define OBD_FAIL_MDS_BLOCK_QUOTA_REQ 0x13c
+#define OBD_FAIL_MDS_DROP_QUOTA_REQ 0x13d
+#define OBD_FAIL_MDS_REMOVE_COMMON_EA 0x13e
+#define OBD_FAIL_MDS_ALLOW_COMMON_EA_SETTING 0x13f
+#define OBD_FAIL_MDS_FAIL_LOV_LOG_ADD 0x140
+#define OBD_FAIL_MDS_LOV_PREP_CREATE 0x141
+#define OBD_FAIL_MDS_REINT_DELAY 0x142
+#define OBD_FAIL_MDS_READLINK_EPROTO 0x143
+#define OBD_FAIL_MDS_OPEN_WAIT_CREATE 0x144
+#define OBD_FAIL_MDS_PDO_LOCK 0x145
+#define OBD_FAIL_MDS_PDO_LOCK2 0x146
+#define OBD_FAIL_MDS_OSC_CREATE_FAIL 0x147
+#define OBD_FAIL_MDS_NEGATIVE_POSITIVE 0x148
+#define OBD_FAIL_MDS_HSM_STATE_GET_NET 0x149
+#define OBD_FAIL_MDS_HSM_STATE_SET_NET 0x14a
+#define OBD_FAIL_MDS_HSM_PROGRESS_NET 0x14b
+#define OBD_FAIL_MDS_HSM_REQUEST_NET 0x14c
+#define OBD_FAIL_MDS_HSM_CT_REGISTER_NET 0x14d
+#define OBD_FAIL_MDS_HSM_CT_UNREGISTER_NET 0x14e
+#define OBD_FAIL_MDS_SWAP_LAYOUTS_NET 0x14f
+#define OBD_FAIL_MDS_HSM_ACTION_NET 0x150
+#define OBD_FAIL_MDS_CHANGELOG_INIT 0x151
+
+/* layout lock */
+#define OBD_FAIL_MDS_NO_LL_GETATTR 0x170
+#define OBD_FAIL_MDS_NO_LL_OPEN 0x171
+#define OBD_FAIL_MDS_LL_BLOCK 0x172
+
+/* CMD */
+#define OBD_FAIL_MDS_IS_SUBDIR_NET 0x180
+#define OBD_FAIL_MDS_IS_SUBDIR_PACK 0x181
+#define OBD_FAIL_MDS_SET_INFO_NET 0x182
+#define OBD_FAIL_MDS_WRITEPAGE_NET 0x183
+#define OBD_FAIL_MDS_WRITEPAGE_PACK 0x184
+#define OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS 0x185
+#define OBD_FAIL_MDS_GET_INFO_NET 0x186
+#define OBD_FAIL_MDS_DQACQ_NET 0x187
+
+/* OI scrub */
+#define OBD_FAIL_OSD_SCRUB_DELAY 0x190
+#define OBD_FAIL_OSD_SCRUB_CRASH 0x191
+#define OBD_FAIL_OSD_SCRUB_FATAL 0x192
+#define OBD_FAIL_OSD_FID_MAPPING 0x193
+#define OBD_FAIL_OSD_LMA_INCOMPAT 0x194
+
+#define OBD_FAIL_OST 0x200
+#define OBD_FAIL_OST_CONNECT_NET 0x201
+#define OBD_FAIL_OST_DISCONNECT_NET 0x202
+#define OBD_FAIL_OST_GET_INFO_NET 0x203
+#define OBD_FAIL_OST_CREATE_NET 0x204
+#define OBD_FAIL_OST_DESTROY_NET 0x205
+#define OBD_FAIL_OST_GETATTR_NET 0x206
+#define OBD_FAIL_OST_SETATTR_NET 0x207
+#define OBD_FAIL_OST_OPEN_NET 0x208
+#define OBD_FAIL_OST_CLOSE_NET 0x209
+#define OBD_FAIL_OST_BRW_NET 0x20a
+#define OBD_FAIL_OST_PUNCH_NET 0x20b
+#define OBD_FAIL_OST_STATFS_NET 0x20c
+#define OBD_FAIL_OST_HANDLE_UNPACK 0x20d
+#define OBD_FAIL_OST_BRW_WRITE_BULK 0x20e
+#define OBD_FAIL_OST_BRW_READ_BULK 0x20f
+#define OBD_FAIL_OST_SYNC_NET 0x210
+#define OBD_FAIL_OST_ALL_REPLY_NET 0x211
+#define OBD_FAIL_OST_ALL_REQUEST_NET 0x212
+#define OBD_FAIL_OST_LDLM_REPLY_NET 0x213
+#define OBD_FAIL_OST_BRW_PAUSE_BULK 0x214
+#define OBD_FAIL_OST_ENOSPC 0x215
+#define OBD_FAIL_OST_EROFS 0x216
+#define OBD_FAIL_OST_ENOENT 0x217
+#define OBD_FAIL_OST_QUOTACHECK_NET 0x218
+#define OBD_FAIL_OST_QUOTACTL_NET 0x219
+#define OBD_FAIL_OST_CHECKSUM_RECEIVE 0x21a
+#define OBD_FAIL_OST_CHECKSUM_SEND 0x21b
+#define OBD_FAIL_OST_BRW_SIZE 0x21c
+#define OBD_FAIL_OST_DROP_REQ 0x21d
+#define OBD_FAIL_OST_SETATTR_CREDITS 0x21e
+#define OBD_FAIL_OST_HOLD_WRITE_RPC 0x21f
+#define OBD_FAIL_OST_BRW_WRITE_BULK2 0x220
+#define OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221
+#define OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222
+#define OBD_FAIL_OST_PAUSE_CREATE 0x223
+#define OBD_FAIL_OST_BRW_PAUSE_PACK 0x224
+#define OBD_FAIL_OST_CONNECT_NET2 0x225
+#define OBD_FAIL_OST_NOMEM 0x226
+#define OBD_FAIL_OST_BRW_PAUSE_BULK2 0x227
+#define OBD_FAIL_OST_MAPBLK_ENOSPC 0x228
+#define OBD_FAIL_OST_ENOINO 0x229
+#define OBD_FAIL_OST_DQACQ_NET 0x230
+#define OBD_FAIL_OST_STATFS_EINPROGRESS 0x231
+
+#define OBD_FAIL_LDLM 0x300
+#define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301
+#define OBD_FAIL_LDLM_ENQUEUE_NET 0x302
+#define OBD_FAIL_LDLM_CONVERT_NET 0x303
+#define OBD_FAIL_LDLM_CANCEL_NET 0x304
+#define OBD_FAIL_LDLM_BL_CALLBACK_NET 0x305
+#define OBD_FAIL_LDLM_CP_CALLBACK_NET 0x306
+#define OBD_FAIL_LDLM_GL_CALLBACK_NET 0x307
+#define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308
+#define OBD_FAIL_LDLM_ENQUEUE_INTENT_ERR 0x309
+#define OBD_FAIL_LDLM_CREATE_RESOURCE 0x30a
+#define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b
+#define OBD_FAIL_LDLM_REPLY 0x30c
+#define OBD_FAIL_LDLM_RECOV_CLIENTS 0x30d
+#define OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT 0x30e
+#define OBD_FAIL_LDLM_GLIMPSE 0x30f
+#define OBD_FAIL_LDLM_CANCEL_RACE 0x310
+#define OBD_FAIL_LDLM_CANCEL_EVICT_RACE 0x311
+#define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312
+#define OBD_FAIL_LDLM_CLOSE_THREAD 0x313
+#define OBD_FAIL_LDLM_CANCEL_BL_CB_RACE 0x314
+#define OBD_FAIL_LDLM_CP_CB_WAIT 0x315
+#define OBD_FAIL_LDLM_OST_FAIL_RACE 0x316
+#define OBD_FAIL_LDLM_INTR_CP_AST 0x317
+#define OBD_FAIL_LDLM_CP_BL_RACE 0x318
+#define OBD_FAIL_LDLM_NEW_LOCK 0x319
+#define OBD_FAIL_LDLM_AGL_DELAY 0x31a
+#define OBD_FAIL_LDLM_AGL_NOLOCK 0x31b
+#define OBD_FAIL_LDLM_OST_LVB 0x31c
+
+/* LOCKLESS IO */
+#define OBD_FAIL_LDLM_SET_CONTENTION 0x385
+
+#define OBD_FAIL_OSC 0x400
+#define OBD_FAIL_OSC_BRW_READ_BULK 0x401
+#define OBD_FAIL_OSC_BRW_WRITE_BULK 0x402
+#define OBD_FAIL_OSC_LOCK_BL_AST 0x403
+#define OBD_FAIL_OSC_LOCK_CP_AST 0x404
+#define OBD_FAIL_OSC_MATCH 0x405
+#define OBD_FAIL_OSC_BRW_PREP_REQ 0x406
+#define OBD_FAIL_OSC_SHUTDOWN 0x407
+#define OBD_FAIL_OSC_CHECKSUM_RECEIVE 0x408
+#define OBD_FAIL_OSC_CHECKSUM_SEND 0x409
+#define OBD_FAIL_OSC_BRW_PREP_REQ2 0x40a
+#define OBD_FAIL_OSC_CONNECT_CKSUM 0x40b
+#define OBD_FAIL_OSC_CKSUM_ADLER_ONLY 0x40c
+#define OBD_FAIL_OSC_DIO_PAUSE 0x40d
+#define OBD_FAIL_OSC_OBJECT_CONTENTION 0x40e
+#define OBD_FAIL_OSC_CP_CANCEL_RACE 0x40f
+#define OBD_FAIL_OSC_CP_ENQ_RACE 0x410
+#define OBD_FAIL_OSC_NO_GRANT 0x411
+#define OBD_FAIL_OSC_DELAY_SETTIME 0x412
+
+#define OBD_FAIL_PTLRPC 0x500
+#define OBD_FAIL_PTLRPC_ACK 0x501
+#define OBD_FAIL_PTLRPC_RQBD 0x502
+#define OBD_FAIL_PTLRPC_BULK_GET_NET 0x503
+#define OBD_FAIL_PTLRPC_BULK_PUT_NET 0x504
+#define OBD_FAIL_PTLRPC_DROP_RPC 0x505
+#define OBD_FAIL_PTLRPC_DELAY_SEND 0x506
+#define OBD_FAIL_PTLRPC_DELAY_RECOV 0x507
+#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB 0x508
+#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
+#define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c
+#define OBD_FAIL_PTLRPC_IMP_DEACTIVE 0x50d
+#define OBD_FAIL_PTLRPC_DUMP_LOG 0x50e
+#define OBD_FAIL_PTLRPC_LONG_REPL_UNLINK 0x50f
+#define OBD_FAIL_PTLRPC_LONG_BULK_UNLINK 0x510
+#define OBD_FAIL_PTLRPC_HPREQ_TIMEOUT 0x511
+#define OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT 0x512
+#define OBD_FAIL_PTLRPC_DROP_REQ_OPC 0x513
+#define OBD_FAIL_PTLRPC_FINISH_REPLAY 0x514
+#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2 0x515
+#define OBD_FAIL_PTLRPC_DELAY_IMP_FULL 0x516
+#define OBD_FAIL_PTLRPC_CANCEL_RESEND 0x517
+
+#define OBD_FAIL_OBD_PING_NET 0x600
+#define OBD_FAIL_OBD_LOG_CANCEL_NET 0x601
+#define OBD_FAIL_OBD_LOGD_NET 0x602
+#define OBD_FAIL_OBD_QC_CALLBACK_NET 0x603
+#define OBD_FAIL_OBD_DQACQ 0x604
+#define OBD_FAIL_OBD_LLOG_SETUP 0x605
+#define OBD_FAIL_OBD_LOG_CANCEL_REP 0x606
+#define OBD_FAIL_OBD_IDX_READ_NET 0x607
+#define OBD_FAIL_OBD_IDX_READ_BREAK 0x608
+#define OBD_FAIL_OBD_NO_LRU 0x609
+
+#define OBD_FAIL_TGT_REPLY_NET 0x700
+#define OBD_FAIL_TGT_CONN_RACE 0x701
+#define OBD_FAIL_TGT_FORCE_RECONNECT 0x702
+#define OBD_FAIL_TGT_DELAY_CONNECT 0x703
+#define OBD_FAIL_TGT_DELAY_RECONNECT 0x704
+#define OBD_FAIL_TGT_DELAY_PRECREATE 0x705
+#define OBD_FAIL_TGT_TOOMANY_THREADS 0x706
+#define OBD_FAIL_TGT_REPLAY_DROP 0x707
+#define OBD_FAIL_TGT_FAKE_EXP 0x708
+#define OBD_FAIL_TGT_REPLAY_DELAY 0x709
+#define OBD_FAIL_TGT_LAST_REPLAY 0x710
+#define OBD_FAIL_TGT_CLIENT_ADD 0x711
+#define OBD_FAIL_TGT_RCVG_FLAG 0x712
+
+#define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800
+#define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801
+#define OBD_FAIL_MDC_OLD_EXT_FLAGS 0x802
+#define OBD_FAIL_MDC_GETATTR_ENQUEUE 0x803
+#define OBD_FAIL_MDC_RPCS_SEM 0x804
+#define OBD_FAIL_MDC_LIGHTWEIGHT 0x805
+
+#define OBD_FAIL_MGS 0x900
+#define OBD_FAIL_MGS_ALL_REQUEST_NET 0x901
+#define OBD_FAIL_MGS_ALL_REPLY_NET 0x902
+#define OBD_FAIL_MGC_PAUSE_PROCESS_LOG 0x903
+#define OBD_FAIL_MGS_PAUSE_REQ 0x904
+#define OBD_FAIL_MGS_PAUSE_TARGET_REG 0x905
+
+#define OBD_FAIL_QUOTA_DQACQ_NET 0xA01
+#define OBD_FAIL_QUOTA_EDQUOT 0xA02
+#define OBD_FAIL_QUOTA_DELAY_REINT 0xA03
+#define OBD_FAIL_QUOTA_RECOVERABLE_ERR 0xA04
+
+#define OBD_FAIL_LPROC_REMOVE 0xB00
+
+#define OBD_FAIL_GENERAL_ALLOC 0xC00
+
+#define OBD_FAIL_SEQ 0x1000
+#define OBD_FAIL_SEQ_QUERY_NET 0x1001
+#define OBD_FAIL_SEQ_EXHAUST 0x1002
+
+#define OBD_FAIL_FLD 0x1100
+#define OBD_FAIL_FLD_QUERY_NET 0x1101
+
+#define OBD_FAIL_SEC_CTX 0x1200
+#define OBD_FAIL_SEC_CTX_INIT_NET 0x1201
+#define OBD_FAIL_SEC_CTX_INIT_CONT_NET 0x1202
+#define OBD_FAIL_SEC_CTX_FINI_NET 0x1203
+#define OBD_FAIL_SEC_CTX_HDL_PAUSE 0x1204
+
+#define OBD_FAIL_LLOG 0x1300
+#define OBD_FAIL_LLOG_ORIGIN_CONNECT_NET 0x1301
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CREATE_NET 0x1302
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_DESTROY_NET 0x1303
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_READ_HEADER_NET 0x1304
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_NEXT_BLOCK_NET 0x1305
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_PREV_BLOCK_NET 0x1306
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_WRITE_REC_NET 0x1307
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CLOSE_NET 0x1308
+#define OBD_FAIL_LLOG_CATINFO_NET 0x1309
+#define OBD_FAIL_MDS_SYNC_CAPA_SL 0x1310
+#define OBD_FAIL_SEQ_ALLOC 0x1311
+
+#define OBD_FAIL_LLITE 0x1400
+#define OBD_FAIL_LLITE_FAULT_TRUNC_RACE 0x1401
+#define OBD_FAIL_LOCK_STATE_WAIT_INTR 0x1402
+#define OBD_FAIL_LOV_INIT 0x1403
+#define OBD_FAIL_GLIMPSE_DELAY 0x1404
+
+#define OBD_FAIL_FID_INDIR 0x1501
+#define OBD_FAIL_FID_INLMA 0x1502
+#define OBD_FAIL_FID_IGIF 0x1504
+#define OBD_FAIL_FID_LOOKUP 0x1505
+#define OBD_FAIL_FID_NOLMA 0x1506
+
+/* LFSCK */
+#define OBD_FAIL_LFSCK_DELAY1 0x1600
+#define OBD_FAIL_LFSCK_DELAY2 0x1601
+#define OBD_FAIL_LFSCK_DELAY3 0x1602
+#define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
+#define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
+#define OBD_FAIL_LFSCK_FATAL1 0x1608
+#define OBD_FAIL_LFSCK_FATAL2 0x1609
+#define OBD_FAIL_LFSCK_CRASH 0x160a
+#define OBD_FAIL_LFSCK_NO_AUTO 0x160b
+#define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
+
+/* UPDATE */
+#define OBD_FAIL_UPDATE_OBJ_NET 0x1700
+#define OBD_FAIL_UPDATE_OBJ_NET_REP 0x1701
+
+
+/* Assign references to moved code to reduce code changes */
+#define OBD_FAIL_PRECHECK(id) CFS_FAIL_PRECHECK(id)
+#define OBD_FAIL_CHECK(id) CFS_FAIL_CHECK(id)
+#define OBD_FAIL_CHECK_VALUE(id, value) CFS_FAIL_CHECK_VALUE(id, value)
+#define OBD_FAIL_CHECK_ORSET(id, value) CFS_FAIL_CHECK_ORSET(id, value)
+#define OBD_FAIL_CHECK_RESET(id, value) CFS_FAIL_CHECK_RESET(id, value)
+#define OBD_FAIL_RETURN(id, ret) CFS_FAIL_RETURN(id, ret)
+#define OBD_FAIL_TIMEOUT(id, secs) CFS_FAIL_TIMEOUT(id, secs)
+#define OBD_FAIL_TIMEOUT_MS(id, ms) CFS_FAIL_TIMEOUT_MS(id, ms)
+#define OBD_FAIL_TIMEOUT_ORSET(id, value, secs) CFS_FAIL_TIMEOUT_ORSET(id, value, secs)
+#define OBD_RACE(id) CFS_RACE(id)
+#define OBD_FAIL_ONCE CFS_FAIL_ONCE
+#define OBD_FAILED CFS_FAILED
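+
+/*
+ * Hedged usage sketch (compiled out): code checks fail points with
+ * OBD_FAIL_CHECK()/OBD_FAIL_TIMEOUT() against the ids above; the ids
+ * are armed at runtime through the libcfs fail_loc machinery
+ * (typically "lctl set_param fail_loc=<id>", though that tunable lives
+ * outside this header).
+ */
+#if 0
+static int example_fail_injection(void)
+{
+ /* drop the request while the fail point is armed */
+ if (OBD_FAIL_CHECK(OBD_FAIL_MDS_GETATTR_NET))
+ return -ENOTCONN;
+ /* or inject a 5 second stall */
+ OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_PAUSE_OPEN, 5);
+ return 0;
+}
+#endif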
+
+extern atomic_t libcfs_kmemory;
+
+#ifdef LPROCFS
+#define obd_memory_add(size) \
+ lprocfs_counter_add(obd_memory, OBD_MEMORY_STAT, (long)(size))
+#define obd_memory_sub(size) \
+ lprocfs_counter_sub(obd_memory, OBD_MEMORY_STAT, (long)(size))
+#define obd_memory_sum() \
+ lprocfs_stats_collector(obd_memory, OBD_MEMORY_STAT, \
+ LPROCFS_FIELDS_FLAGS_SUM)
+#define obd_pages_add(order) \
+ lprocfs_counter_add(obd_memory, OBD_MEMORY_PAGES_STAT, \
+ (long)(1 << (order)))
+#define obd_pages_sub(order) \
+ lprocfs_counter_sub(obd_memory, OBD_MEMORY_PAGES_STAT, \
+ (long)(1 << (order)))
+#define obd_pages_sum() \
+ lprocfs_stats_collector(obd_memory, OBD_MEMORY_PAGES_STAT, \
+ LPROCFS_FIELDS_FLAGS_SUM)
+
+extern void obd_update_maxusage(void);
+extern __u64 obd_memory_max(void);
+extern __u64 obd_pages_max(void);
+
+#else
+
+extern __u64 obd_alloc;
+extern __u64 obd_pages;
+
+extern __u64 obd_max_alloc;
+extern __u64 obd_max_pages;
+
+static inline void obd_memory_add(long size)
+{
+ obd_alloc += size;
+ if (obd_alloc > obd_max_alloc)
+ obd_max_alloc = obd_alloc;
+}
+
+static inline void obd_memory_sub(long size)
+{
+ obd_alloc -= size;
+}
+
+static inline void obd_pages_add(int order)
+{
+ obd_pages += 1 << order;
+ if (obd_pages > obd_max_pages)
+ obd_max_pages = obd_pages;
+}
+
+static inline void obd_pages_sub(int order)
+{
+ obd_pages -= 1 << order;
+}
+
+#define obd_memory_sum() (obd_alloc)
+#define obd_pages_sum() (obd_pages)
+
+#define obd_memory_max() (obd_max_alloc)
+#define obd_pages_max() (obd_max_pages)
+
+#endif
+
+#define OBD_DEBUG_MEMUSAGE (1)
+
+#if OBD_DEBUG_MEMUSAGE
+#define OBD_ALLOC_POST(ptr, size, name) \
+ obd_memory_add(size); \
+ CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n", \
+ (int)(size), ptr)
+
+#define OBD_FREE_PRE(ptr, size, name) \
+ LASSERT(ptr); \
+ obd_memory_sub(size); \
+ CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n", \
+ (int)(size), ptr); \
+ POISON(ptr, 0x5a, size)
+
+#else /* !OBD_DEBUG_MEMUSAGE */
+
+#define OBD_ALLOC_POST(ptr, size, name) ((void)0)
+#define OBD_FREE_PRE(ptr, size, name) ((void)0)
+
+#endif /* !OBD_DEBUG_MEMUSAGE */
+
+#define HAS_FAIL_ALLOC_FLAG OBD_FAIL_CHECK(OBD_FAIL_GENERAL_ALLOC)
+
+#define OBD_ALLOC_FAIL_BITS 24
+#define OBD_ALLOC_FAIL_MASK ((1 << OBD_ALLOC_FAIL_BITS) - 1)
+#define OBD_ALLOC_FAIL_MULT (OBD_ALLOC_FAIL_MASK / 100)
+
+#if defined(LUSTRE_UTILS) /* this version is for utils only */
+#define __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags) \
+do { \
+ (ptr) = (cptab) == NULL ? \
+ kmalloc(size, flags) : \
+ kmalloc_node(size, flags, cfs_cpt_spread_node(cptab, cpt)); \
+ if (unlikely((ptr) == NULL)) { \
+ CERROR("kmalloc of '" #ptr "' (%d bytes) failed at %s:%d\n", \
+ (int)(size), __FILE__, __LINE__); \
+ } else { \
+ memset(ptr, 0, size); \
+ CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %d at %p\n", \
+ (int)(size), ptr); \
+ } \
+} while (0)
+
+#else /* this version is for the kernel and liblustre */
+#define OBD_FREE_RTN0(ptr) \
+({ \
+ kfree(ptr); \
+ (ptr) = NULL; \
+ 0; \
+})
+
+#define __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags) \
+do { \
+ (ptr) = (cptab) == NULL ? \
+ kmalloc(size, flags | __GFP_ZERO) : \
+ kmalloc_node(size, flags | __GFP_ZERO, \
+ cfs_cpt_spread_node(cptab, cpt)); \
+ if (likely((ptr) != NULL && \
+ (!HAS_FAIL_ALLOC_FLAG || obd_alloc_fail_rate == 0 || \
+ !obd_alloc_fail(ptr, #ptr, "km", size, \
+ __FILE__, __LINE__) || \
+ OBD_FREE_RTN0(ptr)))){ \
+ OBD_ALLOC_POST(ptr, size, "kmalloced"); \
+ } \
+} while (0)
+#endif
+
+#define OBD_ALLOC_GFP(ptr, size, gfp_mask) \
+ __OBD_MALLOC_VERBOSE(ptr, NULL, 0, size, gfp_mask)
+
+#define OBD_ALLOC(ptr, size) OBD_ALLOC_GFP(ptr, size, __GFP_IO)
+#define OBD_ALLOC_WAIT(ptr, size) OBD_ALLOC_GFP(ptr, size, GFP_IOFS)
+#define OBD_ALLOC_PTR(ptr) OBD_ALLOC(ptr, sizeof *(ptr))
+#define OBD_ALLOC_PTR_WAIT(ptr) OBD_ALLOC_WAIT(ptr, sizeof *(ptr))
+
+#define OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, gfp_mask) \
+ __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, gfp_mask)
+
+#define OBD_CPT_ALLOC(ptr, cptab, cpt, size) \
+ OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, __GFP_IO)
+
+#define OBD_CPT_ALLOC_PTR(ptr, cptab, cpt) \
+ OBD_CPT_ALLOC(ptr, cptab, cpt, sizeof *(ptr))
+
+# define __OBD_VMALLOC_VERBOSE(ptr, cptab, cpt, size) \
+do { \
+ (ptr) = cptab == NULL ? \
+ vzalloc(size) : \
+ vzalloc_node(size, cfs_cpt_spread_node(cptab, cpt)); \
+ if (unlikely((ptr) == NULL)) { \
+ CERROR("vmalloc of '" #ptr "' (%d bytes) failed\n", \
+ (int)(size)); \
+ CERROR(LPU64" total bytes allocated by Lustre, %d by LNET\n", \
+ obd_memory_sum(), atomic_read(&libcfs_kmemory)); \
+ } else { \
+ OBD_ALLOC_POST(ptr, size, "vmalloced"); \
+ } \
+} while (0)
+
+# define OBD_VMALLOC(ptr, size) \
+ __OBD_VMALLOC_VERBOSE(ptr, NULL, 0, size)
+# define OBD_CPT_VMALLOC(ptr, cptab, cpt, size) \
+ __OBD_VMALLOC_VERBOSE(ptr, cptab, cpt, size)
+
+/* Allocations above this size are considered too big and could not be done
+ * atomically.
+ *
+ * Be very careful when changing this value, especially when decreasing it,
+ * since vmalloc in Linux doesn't perform well on multi-core systems; calling
+ * vmalloc in a critical path would hurt performance badly. See LU-66.
+ */
+#define OBD_ALLOC_BIG (4 * PAGE_CACHE_SIZE)
+
+#define OBD_ALLOC_LARGE(ptr, size) \
+do { \
+ if (size > OBD_ALLOC_BIG) \
+ OBD_VMALLOC(ptr, size); \
+ else \
+ OBD_ALLOC(ptr, size); \
+} while (0)
+
+#define OBD_CPT_ALLOC_LARGE(ptr, cptab, cpt, size) \
+do { \
+ if (size > OBD_ALLOC_BIG) \
+ OBD_CPT_VMALLOC(ptr, cptab, cpt, size); \
+ else \
+ OBD_CPT_ALLOC(ptr, cptab, cpt, size); \
+} while (0)
+
+#define OBD_FREE_LARGE(ptr, size) \
+do { \
+ if (size > OBD_ALLOC_BIG) \
+ OBD_VFREE(ptr, size); \
+ else \
+ OBD_FREE(ptr, size); \
+} while (0)
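+
+/*
+ * Hedged usage sketch (compiled out): the allocation and the free must
+ * agree on the size so both sides take the same branch across the
+ * OBD_ALLOC_BIG threshold (kmalloc below it, vmalloc above it).
+ */
+#if 0
+static int example_large_buffer(size_t bufsize)
+{
+ void *buf;
+
+ OBD_ALLOC_LARGE(buf, bufsize);
+ if (buf == NULL)
+ return -ENOMEM;
+ /* ... use buf ... */
+ OBD_FREE_LARGE(buf, bufsize); /* same size, same branch */
+ return 0;
+}
+#endif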
+
+#ifdef CONFIG_DEBUG_SLAB
+#define POISON(ptr, c, s) do {} while (0)
+#define POISON_PTR(ptr) ((void)0)
+#else
+#define POISON(ptr, c, s) memset(ptr, c, s)
+#define POISON_PTR(ptr) (ptr) = (void *)0xdeadbeef
+#endif
+
+#ifdef POISON_BULK
+#define POISON_PAGE(page, val) do { memset(kmap(page), val, PAGE_CACHE_SIZE); \
+ kunmap(page); } while (0)
+#else
+#define POISON_PAGE(page, val) do { } while (0)
+#endif
+
+#define OBD_FREE(ptr, size) \
+do { \
+ OBD_FREE_PRE(ptr, size, "kfreed"); \
+ kfree(ptr); \
+ POISON_PTR(ptr); \
+} while (0)
+
+#define OBD_FREE_RCU(ptr, size, handle) \
+do { \
+ struct portals_handle *__h = (handle); \
+ \
+ LASSERT(handle != NULL); \
+ __h->h_cookie = (unsigned long)(ptr); \
+ __h->h_size = (size); \
+ call_rcu(&__h->h_rcu, class_handle_free_cb); \
+ POISON_PTR(ptr); \
+} while (0)
+
+#define OBD_VFREE(ptr, size) \
+ do { \
+ OBD_FREE_PRE(ptr, size, "vfreed"); \
+ vfree(ptr); \
+ POISON_PTR(ptr); \
+ } while (0)
+
+/* We memset() the slab object to 0 when allocation succeeds, so DO NOT
+ * HAVE A CTOR THAT DOES ANYTHING. Its work would be cleared here. We'd
+ * love to assert on that, but slab.c keeps kmem_cache_s all to itself. */
+#define OBD_SLAB_FREE_RTN0(ptr, slab) \
+({ \
+ kmem_cache_free((slab), (ptr)); \
+ (ptr) = NULL; \
+ 0; \
+})
+
+#define __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, type) \
+do { \
+ LASSERT(ergo((type) != GFP_ATOMIC, !in_interrupt())); \
+ (ptr) = (cptab) == NULL ? \
+ kmem_cache_alloc(slab, type | __GFP_ZERO) : \
+ kmem_cache_alloc_node(slab, type | __GFP_ZERO, \
+ cfs_cpt_spread_node(cptab, cpt)); \
+ if (likely((ptr) != NULL && \
+ (!HAS_FAIL_ALLOC_FLAG || obd_alloc_fail_rate == 0 || \
+ !obd_alloc_fail(ptr, #ptr, "slab-", size, \
+ __FILE__, __LINE__) || \
+ OBD_SLAB_FREE_RTN0(ptr, slab)))) { \
+ OBD_ALLOC_POST(ptr, size, "slab-alloced"); \
+ } \
+} while (0)
+
+#define OBD_SLAB_ALLOC_GFP(ptr, slab, size, flags) \
+ __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, NULL, 0, size, flags)
+#define OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, flags) \
+ __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, flags)
+
+#define OBD_FREE_PTR(ptr) OBD_FREE(ptr, sizeof *(ptr))
+
+#define OBD_SLAB_FREE(ptr, slab, size) \
+do { \
+ OBD_FREE_PRE(ptr, size, "slab-freed"); \
+ kmem_cache_free(slab, ptr); \
+ POISON_PTR(ptr); \
+} while (0)
+
+#define OBD_SLAB_ALLOC(ptr, slab, size) \
+ OBD_SLAB_ALLOC_GFP(ptr, slab, size, __GFP_IO)
+
+#define OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, size) \
+ OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, __GFP_IO)
+
+#define OBD_SLAB_ALLOC_PTR(ptr, slab) \
+ OBD_SLAB_ALLOC(ptr, slab, sizeof *(ptr))
+
+#define OBD_SLAB_CPT_ALLOC_PTR(ptr, slab, cptab, cpt) \
+ OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, sizeof *(ptr))
+
+#define OBD_SLAB_ALLOC_PTR_GFP(ptr, slab, flags) \
+ OBD_SLAB_ALLOC_GFP(ptr, slab, sizeof *(ptr), flags)
+
+#define OBD_SLAB_CPT_ALLOC_PTR_GFP(ptr, slab, cptab, cpt, flags) \
+ OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, sizeof *(ptr), flags)
+
+#define OBD_SLAB_FREE_PTR(ptr, slab) \
+ OBD_SLAB_FREE((ptr), (slab), sizeof *(ptr))
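+
+/*
+ * Hedged usage sketch (compiled out, with a hypothetical cache): a
+ * module creates its kmem_cache once (as obd_init_caches() does for
+ * obdo_cachep) and then pairs OBD_SLAB_ALLOC_PTR with
+ * OBD_SLAB_FREE_PTR; the object arrives zeroed, per the ctor warning
+ * above.
+ */
+#if 0
+static struct kmem_cache *example_cachep; /* hypothetical cache */
+
+static int example_slab_user(void)
+{
+ struct obdo *oa;
+
+ OBD_SLAB_ALLOC_PTR(oa, example_cachep);
+ if (oa == NULL)
+ return -ENOMEM;
+ /* ... oa is zero-filled here ... */
+ OBD_SLAB_FREE_PTR(oa, example_cachep);
+ return 0;
+}
+#endif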
+
+#define KEY_IS(str) \
+ (keylen >= (sizeof(str)-1) && memcmp(key, str, (sizeof(str)-1)) == 0)
+
+/* Wrapper for contiguous page frame allocation */
+#define __OBD_PAGE_ALLOC_VERBOSE(ptr, cptab, cpt, gfp_mask) \
+do { \
+ (ptr) = (cptab) == NULL ? \
+ alloc_page(gfp_mask) : \
+ alloc_pages_node(cfs_cpt_spread_node(cptab, cpt), gfp_mask, 0);\
+ if (unlikely((ptr) == NULL)) { \
+ CERROR("alloc_pages of '" #ptr "' %d page(s) / "LPU64" bytes "\
+ "failed\n", (int)1, \
+ (__u64)(1 << PAGE_CACHE_SHIFT)); \
+ CERROR(LPU64" total bytes and "LPU64" total pages " \
+ "("LPU64" bytes) allocated by Lustre, " \
+ "%d total bytes by LNET\n", \
+ obd_memory_sum(), \
+ obd_pages_sum() << PAGE_CACHE_SHIFT, \
+ obd_pages_sum(), \
+ atomic_read(&libcfs_kmemory)); \
+ } else { \
+ obd_pages_add(0); \
+ CDEBUG(D_MALLOC, "alloc_pages '" #ptr "': %d page(s) / " \
+ LPU64" bytes at %p.\n", \
+ (int)1, \
+ (__u64)(1 << PAGE_CACHE_SHIFT), ptr); \
+ } \
+} while (0)
+
+#define OBD_PAGE_ALLOC(ptr, gfp_mask) \
+ __OBD_PAGE_ALLOC_VERBOSE(ptr, NULL, 0, gfp_mask)
+#define OBD_PAGE_CPT_ALLOC(ptr, cptab, cpt, gfp_mask) \
+ __OBD_PAGE_ALLOC_VERBOSE(ptr, cptab, cpt, gfp_mask)
+
+#define OBD_PAGE_FREE(ptr) \
+do { \
+ LASSERT(ptr); \
+ obd_pages_sub(0); \
+ CDEBUG(D_MALLOC, "free_pages '" #ptr "': %d page(s) / "LPU64" bytes " \
+ "at %p.\n", \
+ (int)1, (__u64)(1 << PAGE_CACHE_SHIFT), \
+ ptr); \
+ __free_page(ptr); \
+ (ptr) = (void *)0xdeadbeef; \
+} while (0)
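+
+/*
+ * Hedged usage sketch (compiled out): OBD_PAGE_ALLOC/OBD_PAGE_FREE
+ * account single (order-0) page frames in the obd_pages statistics;
+ * larger buffers should go through OBD_ALLOC_LARGE instead.
+ */
+#if 0
+static int example_page_user(void)
+{
+ struct page *pg;
+
+ OBD_PAGE_ALLOC(pg, __GFP_IO);
+ if (pg == NULL)
+ return -ENOMEM;
+ POISON_PAGE(pg, 0x5a); /* no-op unless POISON_BULK is defined */
+ OBD_PAGE_FREE(pg);
+ return 0;
+}
+#endif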
+
+#endif